Showing preview only (1,019K chars total). Download the full file or copy to clipboard to get everything.
Repository: conversationai/conversationai-models
Branch: main
Commit: d3a724c96e24
Files: 196
Total size: 953.6 KB
Directory structure:
gitextract__2536wl_/
├── .bazelrc
├── .gitignore
├── .travis.yml
├── CONTRIBUTING.md
├── LICENSE
├── README.md
├── annotator_models/
│ ├── README.md
│ ├── bin/
│ │ ├── cancel-job
│ │ ├── ls-jobs
│ │ ├── run
│ │ ├── run_local
│ │ └── stream-logs
│ ├── cpu_config.yaml
│ ├── requirements.txt
│ ├── results/
│ │ └── .gitignore
│ └── trainer/
│ ├── __init__.py
│ ├── dawid_skene.py
│ └── dawid_skene_test.py
├── attention-tutorial/
│ ├── Attention_Model_Tutorial.ipynb
│ ├── README.md
│ ├── checkpoints/
│ │ └── README.md
│ ├── data/
│ │ └── README.md
│ ├── process_figshare.py
│ ├── requirements.txt
│ └── visualize_attention.py
├── data_preparation/
│ ├── README.md
│ ├── config.ini
│ ├── preprocessing/
│ │ ├── __init__.py
│ │ ├── constants.py
│ │ ├── preprocessing.py
│ │ └── tfrecord_utils.py
│ ├── requirements.txt
│ ├── run_preprocessing_artificial_bias.py
│ ├── run_preprocessing_data_split.py
│ └── setup.py
├── experiments/
│ ├── .gitignore
│ ├── README.md
│ ├── WORKSPACE
│ ├── __init__.py
│ ├── requirements.txt
│ ├── setup.py
│ ├── testdata/
│ │ ├── BUILD
│ │ ├── cats_and_dogs.jsonl
│ │ ├── cats_and_dogs_onehot.vocab.txt
│ │ ├── cats_and_dogs_with_cat_opt_int_labels.jsonl
│ │ └── cats_and_dogs_with_partial_cat_int_labels.jsonl
│ ├── tf_trainer/
│ │ ├── __init__.py
│ │ ├── common/
│ │ │ ├── BUILD
│ │ │ ├── __init__.py
│ │ │ ├── base_model.py
│ │ │ ├── basic_gpu_config.yaml
│ │ │ ├── cnn_spec_parser.py
│ │ │ ├── cnn_spec_parser_test.py
│ │ │ ├── dataset_config.sh
│ │ │ ├── dataset_input.py
│ │ │ ├── episodic_tfrecord_input.py
│ │ │ ├── episodic_tfrecord_input_test.py
│ │ │ ├── model_trainer.py
│ │ │ ├── p100_config.yaml
│ │ │ ├── serving_input.py
│ │ │ ├── text_preprocessor.py
│ │ │ ├── text_preprocessor_test.py
│ │ │ ├── tfrecord_input.py
│ │ │ ├── tfrecord_input_test.py
│ │ │ ├── token_embedding_index.py
│ │ │ ├── token_embedding_index_test.py
│ │ │ ├── types.py
│ │ │ └── v100_config.yaml
│ │ ├── tf_char_cnn/
│ │ │ ├── __init__.py
│ │ │ ├── hparam_config.yaml
│ │ │ ├── hparam_config_civil_comments.yaml
│ │ │ ├── hparam_config_many_communities.yaml
│ │ │ ├── hparam_config_toxicity.yaml
│ │ │ ├── model.py
│ │ │ ├── run.deploy.sh
│ │ │ ├── run.hyperparameter.sh
│ │ │ ├── run.local.sh
│ │ │ ├── run.ml_engine.sh
│ │ │ └── run.py
│ │ ├── tf_cnn/
│ │ │ ├── __init__.py
│ │ │ ├── finetune.py
│ │ │ ├── finetune.sh
│ │ │ ├── hparam_config.yaml
│ │ │ ├── hparam_config_civil_comments.yaml
│ │ │ ├── hparam_config_many_communities.yaml
│ │ │ ├── hparam_config_many_communities_40_per_8_shot.yaml
│ │ │ ├── hparam_config_toxicity.yaml
│ │ │ ├── model.py
│ │ │ ├── run.deploy.sh
│ │ │ ├── run.hyperparameter.sh
│ │ │ ├── run.local.sh
│ │ │ ├── run.ml_engine.sh
│ │ │ └── run.py
│ │ ├── tf_gru_attention/
│ │ │ ├── __init__.py
│ │ │ ├── finetune.py
│ │ │ ├── finetune.sh
│ │ │ ├── hparam_config.yaml
│ │ │ ├── hparam_config_civil_comments.yaml
│ │ │ ├── hparam_config_many_communities.yaml
│ │ │ ├── hparam_config_many_communities_40_per_8_shot.yaml
│ │ │ ├── hparam_config_toxicity.yaml
│ │ │ ├── model.py
│ │ │ ├── run.deploy.sh
│ │ │ ├── run.hyperparameter.sh
│ │ │ ├── run.local.sh
│ │ │ ├── run.ml_engine.sh
│ │ │ └── run.py
│ │ ├── tf_hub_classifier/
│ │ │ ├── __init__.py
│ │ │ ├── finetune.py
│ │ │ ├── finetune.sh
│ │ │ ├── hparam_config.yaml
│ │ │ ├── hparam_config_civil_comments.yaml
│ │ │ ├── hparam_config_many_communities.yaml
│ │ │ ├── hparam_config_many_communities_40_per_8_shot.yaml
│ │ │ ├── hparam_config_toxicity.yaml
│ │ │ ├── model.py
│ │ │ ├── run.deploy.sh
│ │ │ ├── run.hyperparameter.sh
│ │ │ ├── run.local.sh
│ │ │ ├── run.ml_engine.sh
│ │ │ └── run.py
│ │ ├── tf_hub_tfjs/
│ │ │ ├── __init__.py
│ │ │ ├── model.py
│ │ │ ├── notebook/
│ │ │ │ ├── BiasEvaluation.ipynb
│ │ │ │ └── EvaluatingClassifier.ipynb
│ │ │ ├── run.local.sh
│ │ │ └── run.py
│ │ ├── tf_kona_prototypical_network/
│ │ │ └── proto.py
│ │ └── tf_word_label_embedding/
│ │ ├── __init__.py
│ │ ├── hparam_config.yaml
│ │ ├── model.py
│ │ ├── run.hyperparameter.sh
│ │ ├── run.local.sh
│ │ ├── run.ml_engine.sh
│ │ └── run.py
│ └── tools/
│ ├── bert_tfrecord_converter.py
│ ├── convert_csv_to_tfrecord.py
│ └── convert_jsonl_to_tfrecord.py
├── hierarchical_attention_research/
│ └── han_model/
│ ├── .gitignore
│ ├── HAN_model.py
│ ├── LICENSE
│ ├── README.md
│ ├── bn_lstm.py
│ ├── bn_lstm_test.py
│ ├── data_util.py
│ ├── model_components.py
│ ├── requirements.txt
│ ├── worker.py
│ ├── yelp.py
│ └── yelp_prepare.py
├── kaggle-classification/
│ ├── .gitignore
│ ├── README.md
│ ├── __init__.py
│ ├── bin/
│ │ ├── cancel-job
│ │ ├── ls-jobs
│ │ ├── run
│ │ ├── run_keras.sh
│ │ ├── run_keras_local.sh
│ │ ├── run_local
│ │ └── stream-logs
│ ├── config.yaml
│ ├── gpu_config.yaml
│ ├── hparam_config.yaml
│ ├── keras_hparam_config.yaml
│ ├── keras_trainer/
│ │ ├── __init__.py
│ │ ├── base_model.py
│ │ ├── cnn_with_attention.py
│ │ ├── custom_metrics.py
│ │ ├── model.py
│ │ ├── rnn.py
│ │ └── single_layer_cnn.py
│ ├── requirements.txt
│ ├── setup.py
│ └── trainer/
│ ├── __init__.py
│ ├── model.py
│ └── wikidata.py
├── model_evaluation/
│ ├── BiosBias Evaluation.ipynb
│ ├── Predict bias.ipynb
│ ├── README.md
│ ├── deploy_models.sh
│ ├── few_shot_learning_baseline_evaluation.ipynb
│ ├── input_fn_example.py
│ ├── jigsaw_evaluation_pipeline.ipynb
│ ├── requirements.txt
│ ├── score_bias_data.sh
│ ├── score_scrubbed_data.sh
│ ├── score_test_data.py
│ └── utils_export/
│ ├── __init__.py
│ ├── dataset.py
│ ├── dataset_test.py
│ ├── deploy_list_models.py
│ ├── utils_cloudml.py
│ ├── utils_cloudml_test.py
│ ├── utils_tfrecords.py
│ └── utils_tfrecords_test.py
└── travis_blase_test_support/
└── bazel_0.18.1-linux-x86_64.deb.sha256
================================================
FILE CONTENTS
================================================
================================================
FILE: .bazelrc
================================================
startup --host_jvm_args=-Xmx2500m
startup --host_jvm_args=-Xms2500m
startup --batch
test --ram_utilization_factor=10
build --verbose_failures
build --spawn_strategy=standalone --genrule_strategy=standalone
test --test_strategy=standalone
================================================
FILE: .gitignore
================================================
# Editor config.
.vscode/
# Python compiled files.
*.pyc
# Virtual Environment files.
.pyenv
.virtualenv
env
.venv
# mypy cache files for type-checking.
.mypy_cache
# Bazel
bazel-bin
bazel-experiments
bazel-genfiles
bazel-out
bazel-testlogs
================================================
FILE: .travis.yml
================================================
language: python
python:
- "3.5"
- "3.6"
dist: trusty
addons:
apt:
sources:
- ubuntu-toolchain-r-test
packages:
- wget
- pkg-config
before_install:
- wget https://github.com/bazelbuild/bazel/releases/download/0.18.1/bazel_0.18.1-linux-x86_64.deb
- sha256sum -c travis_blase_test_support/bazel_0.18.1-linux-x86_64.deb.sha256
- sudo dpkg -i bazel_0.18.1-linux-x86_64.deb
- cd experiments
install:
- pip install -r requirements.txt
script:
- bazel test --test_output=streamed ...
================================================
FILE: CONTRIBUTING.md
================================================
# How to contribute
We'd love to accept your patches and contributions to this project. There are
just a few small guidelines you need to follow.
## Contributor License Agreement
Contributions to this project must be accompanied by a Contributor License
Agreement. You (or your employer) retain the copyright to your contribution,
this simply gives us permission to use and redistribute your contributions as
part of the project. Head over to <https://cla.developers.google.com/> to see
your current agreements on file or to sign a new one.
You generally only need to submit a CLA once, so if you've already submitted one
(even if it was for a different project), you probably don't need to do it
again.
## Code reviews
All submissions, including submissions by project members, require review. We
use GitHub pull requests for this purpose. Consult [GitHub Help] for more
information on using pull requests.
[GitHub Help]: https://help.github.com/articles/about-pull-requests/
================================================
FILE: LICENSE
================================================
Apache License
Version 2.0, January 2004
http://www.apache.org/licenses/
TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
1. Definitions.
"License" shall mean the terms and conditions for use, reproduction,
and distribution as defined by Sections 1 through 9 of this document.
"Licensor" shall mean the copyright owner or entity authorized by
the copyright owner that is granting the License.
"Legal Entity" shall mean the union of the acting entity and all
other entities that control, are controlled by, or are under common
control with that entity. For the purposes of this definition,
"control" means (i) the power, direct or indirect, to cause the
direction or management of such entity, whether by contract or
otherwise, or (ii) ownership of fifty percent (50%) or more of the
outstanding shares, or (iii) beneficial ownership of such entity.
"You" (or "Your") shall mean an individual or Legal Entity
exercising permissions granted by this License.
"Source" form shall mean the preferred form for making modifications,
including but not limited to software source code, documentation
source, and configuration files.
"Object" form shall mean any form resulting from mechanical
transformation or translation of a Source form, including but
not limited to compiled object code, generated documentation,
and conversions to other media types.
"Work" shall mean the work of authorship, whether in Source or
Object form, made available under the License, as indicated by a
copyright notice that is included in or attached to the work
(an example is provided in the Appendix below).
"Derivative Works" shall mean any work, whether in Source or Object
form, that is based on (or derived from) the Work and for which the
editorial revisions, annotations, elaborations, or other modifications
represent, as a whole, an original work of authorship. For the purposes
of this License, Derivative Works shall not include works that remain
separable from, or merely link (or bind by name) to the interfaces of,
the Work and Derivative Works thereof.
"Contribution" shall mean any work of authorship, including
the original version of the Work and any modifications or additions
to that Work or Derivative Works thereof, that is intentionally
submitted to Licensor for inclusion in the Work by the copyright owner
or by an individual or Legal Entity authorized to submit on behalf of
the copyright owner. For the purposes of this definition, "submitted"
means any form of electronic, verbal, or written communication sent
to the Licensor or its representatives, including but not limited to
communication on electronic mailing lists, source code control systems,
and issue tracking systems that are managed by, or on behalf of, the
Licensor for the purpose of discussing and improving the Work, but
excluding communication that is conspicuously marked or otherwise
designated in writing by the copyright owner as "Not a Contribution."
"Contributor" shall mean Licensor and any individual or Legal Entity
on behalf of whom a Contribution has been received by Licensor and
subsequently incorporated within the Work.
2. Grant of Copyright License. Subject to the terms and conditions of
this License, each Contributor hereby grants to You a perpetual,
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
copyright license to reproduce, prepare Derivative Works of,
publicly display, publicly perform, sublicense, and distribute the
Work and such Derivative Works in Source or Object form.
3. Grant of Patent License. Subject to the terms and conditions of
this License, each Contributor hereby grants to You a perpetual,
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
(except as stated in this section) patent license to make, have made,
use, offer to sell, sell, import, and otherwise transfer the Work,
where such license applies only to those patent claims licensable
by such Contributor that are necessarily infringed by their
Contribution(s) alone or by combination of their Contribution(s)
with the Work to which such Contribution(s) was submitted. If You
institute patent litigation against any entity (including a
cross-claim or counterclaim in a lawsuit) alleging that the Work
or a Contribution incorporated within the Work constitutes direct
or contributory patent infringement, then any patent licenses
granted to You under this License for that Work shall terminate
as of the date such litigation is filed.
4. Redistribution. You may reproduce and distribute copies of the
Work or Derivative Works thereof in any medium, with or without
modifications, and in Source or Object form, provided that You
meet the following conditions:
(a) You must give any other recipients of the Work or
Derivative Works a copy of this License; and
(b) You must cause any modified files to carry prominent notices
stating that You changed the files; and
(c) You must retain, in the Source form of any Derivative Works
that You distribute, all copyright, patent, trademark, and
attribution notices from the Source form of the Work,
excluding those notices that do not pertain to any part of
the Derivative Works; and
(d) If the Work includes a "NOTICE" text file as part of its
distribution, then any Derivative Works that You distribute must
include a readable copy of the attribution notices contained
within such NOTICE file, excluding those notices that do not
pertain to any part of the Derivative Works, in at least one
of the following places: within a NOTICE text file distributed
as part of the Derivative Works; within the Source form or
documentation, if provided along with the Derivative Works; or,
within a display generated by the Derivative Works, if and
wherever such third-party notices normally appear. The contents
of the NOTICE file are for informational purposes only and
do not modify the License. You may add Your own attribution
notices within Derivative Works that You distribute, alongside
or as an addendum to the NOTICE text from the Work, provided
that such additional attribution notices cannot be construed
as modifying the License.
You may add Your own copyright statement to Your modifications and
may provide additional or different license terms and conditions
for use, reproduction, or distribution of Your modifications, or
for any such Derivative Works as a whole, provided Your use,
reproduction, and distribution of the Work otherwise complies with
the conditions stated in this License.
5. Submission of Contributions. Unless You explicitly state otherwise,
any Contribution intentionally submitted for inclusion in the Work
by You to the Licensor shall be under the terms and conditions of
this License, without any additional terms or conditions.
Notwithstanding the above, nothing herein shall supersede or modify
the terms of any separate license agreement you may have executed
with Licensor regarding such Contributions.
6. Trademarks. This License does not grant permission to use the trade
names, trademarks, service marks, or product names of the Licensor,
except as required for reasonable and customary use in describing the
origin of the Work and reproducing the content of the NOTICE file.
7. Disclaimer of Warranty. Unless required by applicable law or
agreed to in writing, Licensor provides the Work (and each
Contributor provides its Contributions) on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
implied, including, without limitation, any warranties or conditions
of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
PARTICULAR PURPOSE. You are solely responsible for determining the
appropriateness of using or redistributing the Work and assume any
risks associated with Your exercise of permissions under this License.
8. Limitation of Liability. In no event and under no legal theory,
whether in tort (including negligence), contract, or otherwise,
unless required by applicable law (such as deliberate and grossly
negligent acts) or agreed to in writing, shall any Contributor be
liable to You for damages, including any direct, indirect, special,
incidental, or consequential damages of any character arising as a
result of this License or out of the use or inability to use the
Work (including but not limited to damages for loss of goodwill,
work stoppage, computer failure or malfunction, or any and all
other commercial damages or losses), even if such Contributor
has been advised of the possibility of such damages.
9. Accepting Warranty or Additional Liability. While redistributing
the Work or Derivative Works thereof, You may choose to offer,
and charge a fee for, acceptance of support, warranty, indemnity,
or other liability obligations and/or rights consistent with this
License. However, in accepting such obligations, You may act only
on Your own behalf and on Your sole responsibility, not on behalf
of any other Contributor, and only if You agree to indemnify,
defend, and hold each Contributor harmless for any liability
incurred by, or claims asserted against, such Contributor by reason
of your accepting any such warranty or additional liability.
END OF TERMS AND CONDITIONS
APPENDIX: How to apply the Apache License to your work.
To apply the Apache License to your work, attach the following
boilerplate notice, with the fields enclosed by brackets "{}"
replaced with your own identifying information. (Don't include
the brackets!) The text should be enclosed in the appropriate
comment syntax for the file format. We also recommend that a
file or class name and description of purpose be included on the
same "printed page" as the copyright notice for easier
identification within third-party archives.
Copyright {yyyy} {name of copyright owner}
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
================================================
FILE: README.md
================================================
# ConversationAI Models
This repository contains example code to train machine learning models for text classification as part of the [Conversation AI](https://conversationai.github.io/) project.
# Outline of the codebase
* `experiments/` contains the ML training framework.
* `annotator_models/` contains a Dawid-Skene implementation for modelling rater quality to produce better annotations.
* `attention-tutorial/` contains an introductory ipython notebook for RNNs with attention, as presented at Devoxx talk ["Tensorflow, deep learning and modern RNN architectures, without a PhD by Martin Gorner"](https://www.youtube.com/watch?v=pzOzmxCR37I)
* `kaggle-classification/` contains early experiments with Keras and Estimator for training on [the Jigsaw Toxicity Kaggle competition](https://www.kaggle.com/c/jigsaw-toxic-comment-classification-challenge). Will be superseded by `experiments/` shortly.
* `model_evaluation/` contains utilities to use a model deployed on cloud MLE, and some notebooks to illustrate typical evaluation metrics.
## About this code
This repository contains example code to help experiment with models to improve conversations; it is not an official Google product.
================================================
FILE: annotator_models/README.md
================================================
# Modeling Annotators
This is an implementation of the [Dawid-Skene model](http://crowdsourcing-class.org/readings/downloads/ml/EM.pdf). Dawid-Skene is an unsupervised model that can be used to improve the quality of a crowdsourced dataset by learning annotator error rate and predicting the true item labels.
This code was adapted from an [implementation](https://github.com/dallascard/dawid_skene) by [dallascard](https://github.com/dallascard).
## To Run Locally
1. Setup a [virtualenv](https://virtualenvwrapper.readthedocs.io/en/latest/) for
the project (recommended, but technically optional).
Python 2:
```
python -m virtualenv env
```
Python 3:
```
python3 -m venv env
```
With either Python version, activate the virtual env:
```shell
source env/bin/activate
```
2. Install library dependencies:
```shell
pip install -r requirements.txt
```
3. Create training data. The training data must be a CSV that has fields for
the worker ID, item ID and label. You can specify the column names for these
fields as flags to the training script.
For example:
```
comment_id,worker_id,toxic
1519346288,43675129,0
1519346288,41122119,0
1519346288,38510102,0
1519346288,43650017,0
1519346288,28524232,0
...
```
4. Run a model on a given class (e.g. 'toxic' or 'obscene'). There are examples
of how to run the model locally and using ml-engine in [`bin/run_local`](bin/run_local) and
[`bin/run`](bin/run) respectively.
Note: to run in google cloud, you will need to be authenticated with
Google Cloud (you can run `gcloud auth application-default login` to do
this) and you must have access to the cloud bucket where the data is located
(you can test this by running `gcloud storage ls gs://kaggle-model-experiments/`).
5. The output is two files written to the `job-dir` directory specified in the run
script.
* `error_rates_{LABEL}_{N_ANNOTATIONS}.csv` - the error rates for each annotator
* `predictions_{LABEL}_{N_ANNOTATIONS}.csv` - the predicted labels for each item
================================================
FILE: annotator_models/bin/cancel-job
================================================
#!/bin/bash
# Cancels the ml-engine job whose id is passed as the first argument.
# The argument is quoted so it reaches gcloud as a single word.
gcloud ml-engine jobs cancel "$1"
================================================
FILE: annotator_models/bin/ls-jobs
================================================
#!/bin/bash
# Lists ml-engine jobs and keeps only the lines that mention the current user.
# The pattern is quoted (and separated with --) so it is always passed to grep
# as exactly one pattern argument.
gcloud ml-engine jobs list | grep -- "$USER"
================================================
FILE: annotator_models/bin/run
================================================
#!/bin/bash
#
# A script to run the Dawid-Skene annotator model remotely using ml-engine.
# One training job is submitted for each label listed in LABELS.
#
# To run with the default settings from the annotator_models directory just enter:
# './bin/run'
#
# Setup Steps:
# 1. Install the gcloud SDK
# 2. Authenticate with the GCP project you want to use, `gcloud config set project [my-project]`
# 3. Put the annotation data in Cloud Storage, `gcloud storage cp [DATA_FILE] gs://[BUCKET_NAME]/`
#

# Edit these!
BUCKET_NAME=annotator_models
CONFIG=cpu_config.yaml
MAX_ITER=50
TOLERANCE=1
PSEUDO_COUNT=1
declare -a LABELS=("obscene" "sexual_explicit" "threat" "flirtation" "identity_hate" "insult")

# Note: this must be compatible with cells that have GPUs. us-central1 works.
# See: https://cloud.google.com/ml-engine/docs/using-gpus
REGION=us-central1

# Flags: -h prints usage, -c <file> overrides the ml-engine config file.
while getopts :c:h opt; do
  case ${opt} in
    h)
      echo "Usage: run [-c config_filename.yaml]"
      echo "Flags: "
      echo -e " -c Specify a config file (e.g. use hparam_config to enable hyperparameter tuning)"
      exit 0;;
    c)
      echo "Using custom config ${OPTARG}"
      CONFIG=${OPTARG};;
    :)
      echo "Error: ${OPTARG} requires an argument."
      echo "Use 'run -h' for help."
      exit 1;;
    \?)
      echo "Invalid flag. Use 'run -h' for help."
      exit 1;;
  esac
done

for label in "${LABELS[@]}"
do
  echo "Running on $label"
  DATA_PATH=gs://annotator_models/kaggle_annotation_data/dawid_skene_annotations_on_kaggle_combined_${label}.csv
  # The same CSV is passed as both the annotation data and the comment text.
  COMMENT_TEXT_PATH=$DATA_PATH
  JOB_NAME=${USER}_dawid_skene_kaggle_${label}
  DATE=$(date '+%Y%m%d_%H%M%S')
  DATE_DAY_ONLY=$(date '+%Y%m%d')
  OUTPUT_PATH=gs://${BUCKET_NAME}/models/${USER}/${DATE_DAY_ONLY}
  # Report the output location only once it has been computed; printing it
  # before the loop always showed an empty path.
  echo "Writing to $OUTPUT_PATH"

  gcloud ml-engine jobs submit training "${JOB_NAME}_${DATE}" \
    --job-dir="${OUTPUT_PATH}" \
    --runtime-version=1.4 \
    --config="${CONFIG}" \
    --module-name=trainer.dawid_skene \
    --package-path=trainer \
    --region="$REGION" \
    --verbosity=debug -- \
    --data-path="$DATA_PATH" \
    --comment-text-path="$COMMENT_TEXT_PATH" \
    --label="$label" \
    --max-iter="$MAX_ITER" \
    --tolerance="$TOLERANCE" \
    --worker-id-col='annotator_id' \
    --unit-id-col='comment_id' \
    --pseudo-count="$PSEUDO_COUNT"
done
================================================
FILE: annotator_models/bin/run_local
================================================
#!/bin/bash
# A script to run the Dawid-Skene annotator model locally, once per label.
BUCKET_NAME=annotator_models
declare -a arr=("obscene" "threat" "insult" "identity_hate" "toxic" "severe_toxic")

## now loop through the above array
for label in "${arr[@]}"
do
  # This must be a standalone assignment. With a trailing backslash it becomes
  # a prefix assignment of the gcloud command, and ${data_path} in the
  # arguments below is expanded (to an empty string) before it takes effect.
  data_path=gs://${BUCKET_NAME}/kaggle_annotation_data/dawid_skene_annotations_on_kaggle_test_${label}.csv

  gcloud ml-engine local train \
    --module-name=trainer.dawid_skene \
    --package-path=trainer -- \
    --data-path="${data_path}" \
    --comment-text-path="${data_path}" \
    --label="${label}" \
    --job-dir='results' \
    --worker-id-col='annotator_id' \
    --unit-id-col='comment_id' \
    --tolerance=50 \
    --n_examples=1000
done
================================================
FILE: annotator_models/bin/stream-logs
================================================
#!/bin/bash
# Streams the logs of the ml-engine job whose id is passed as the first argument.
# The argument is quoted so it reaches gcloud as a single word.
gcloud ml-engine jobs stream-logs "$1"
================================================
FILE: annotator_models/cpu_config.yaml
================================================
trainingInput:
scaleTier: CUSTOM
## Custom scaleTier needed for using > 1 GPU machines.
# scaleTier: CUSTOM
masterType: large_model
# workerType: complex_model_m_gpu
# parameterServerType: large_model
# workerCount: 9
# parameterServerCount: 3
================================================
FILE: annotator_models/requirements.txt
================================================
absl-py==0.1.12
astor==0.6.2
backports.weakref==1.0.post1
bleach==3.3.0
cachetools==2.0.1
certifi==2024.7.4
chardet==3.0.4
dill==0.2.7.1
enum34==1.1.6
funcsigs==1.0.2
future==0.18.3
futures==3.2.0
gapic-google-cloud-datastore-v1==0.15.3
gapic-google-cloud-error-reporting-v1beta1==0.15.3
gapic-google-cloud-logging-v2==0.91.3
gast==0.2.0
google-api-core==1.1.0
google-auth==1.4.1
google-auth-oauthlib==0.2.0
google-cloud==0.32.0
google-cloud-bigquery==0.31.0
google-cloud-bigquery-datatransfer==0.1.1
google-cloud-bigtable==0.28.1
google-cloud-container==0.1.1
google-cloud-core==0.28.1
google-cloud-datastore==1.4.0
google-cloud-dns==0.28.0
google-cloud-error-reporting==0.28.0
google-cloud-firestore==0.28.0
google-cloud-language==1.0.1
google-cloud-logging==1.4.0
google-cloud-monitoring==0.28.1
google-cloud-pubsub==0.30.1
google-cloud-resource-manager==0.28.1
google-cloud-runtimeconfig==0.28.1
google-cloud-spanner==0.29.0
google-cloud-speech==0.30.0
google-cloud-storage==1.6.0
google-cloud-trace==0.17.0
google-cloud-translate==1.3.1
google-cloud-videointelligence==1.0.1
google-cloud-vision==0.29.0
google-gax==0.15.16
google-resumable-media==0.3.1
googleapis-common-protos==1.5.3
grpc-google-iam-v1==0.11.4
grpcio==1.53.2
html5lib==0.999999999
httplib2==0.19.0
idna==3.7
Markdown==2.6.11
mock==2.0.0
numpy==1.22.0
oauth2client==3.0.0
oauthlib==2.0.7
pandas==0.22.0
pandas-gbq==0.3.1
pbr==4.0.0
ply==3.8
proto-google-cloud-datastore-v1==0.90.4
proto-google-cloud-error-reporting-v1beta1==0.15.3
proto-google-cloud-logging-v2==0.91.3
protobuf==3.18.3
psutil==5.6.6
pyasn1==0.4.2
pyasn1-modules==0.2.1
python-dateutil==2.7.2
pytz==2018.3
requests==2.32.0
requests-oauthlib==0.8.0
rsa==4.7
six==1.11.0
tensorboard==1.12.0
tensorflow==2.12.1
termcolor==1.1.0
urllib3==1.26.18
Werkzeug==3.0.3
================================================
FILE: annotator_models/results/.gitignore
================================================
*
!.gitignore
================================================
FILE: annotator_models/trainer/__init__.py
================================================
================================================
FILE: annotator_models/trainer/dawid_skene.py
================================================
"""Description: Given unreliable ratings of item classes by multiple raters, determine the most likely true class for each item, class marginals, and individual error rates for each rater, using Expectation Maximization.
References:
Dawid and Skene (1979). Maximum Likelihood Estimation of Observer
Error-Rates Using the EM Algorithm. Journal of the Royal Statistical Society.
Series C (Applied Statistics), Vol. 28, No. 1, pp. 20-28.
"""
import argparse
import logging
import math
import sys
import time
import numpy as np
import pandas as pd
from scipy import stats
import tensorflow as tf
# Module-level holder for parsed flags; None at import time.
# NOTE(review): presumably assigned by an argparse-based entry point that is
# outside this view -- confirm.
FLAGS = None
# Print numpy arrays with two digits of precision.
np.set_printoptions(precision=2)
def run(items,
        raters,
        classes,
        counts,
        label,
        psuedo_count,
        tol=1,
        max_iter=25,
        init='average'):
  """Run the Dawid-Skene EM estimator on rating counts.

  Args:
    items: not used by this function; kept for interface compatibility.
    raters: not used by this function; kept for interface compatibility.
    classes: not used by this function; kept for interface compatibility.
    counts: array of rating counts of size [items x raters x classes].
    label: not used by this function; kept for interface compatibility.
    psuedo_count: smoothing count forwarded unchanged to m_step.
    tol: tolerance required for convergence of EM. Iteration stops once both
      the summed absolute change of the class marginals and the summed
      absolute change of the item class estimates are below this value.
    max_iter: maximum number of iterations of EM. At least one iteration is
      always performed.
    init: not used by this function; kept for interface compatibility.

  Returns:
    A tuple (class_marginals, error_rates, item_classes) holding the
    estimates produced by the last EM iteration.
  """
  iteration = 0
  converged = False
  old_class_marginals = None

  # item_classes is a matrix of estimates of true item classes of size
  # [items, classes]
  item_classes = initialize(counts)

  logging.info('Iter\tlog-likelihood\tdelta-CM\tdelta-Y_hat')

  while not converged:
    iteration += 1
    start_iter = time.time()

    # M-step - updated error rates and class marginals given new
    # distribution over true item classes
    old_item_classes = item_classes
    (class_marginals, error_rates) = m_step(counts, item_classes, psuedo_count)

    # E-step - calculate expected item classes given error rates and
    # class marginals
    item_classes = e_step_verbose(counts, class_marginals, error_rates)

    # check likelihood
    log_L = calc_likelihood(counts, class_marginals, error_rates)

    # calculate the number of seconds the last iteration took
    iter_time = time.time() - start_iter

    # check for convergence; there is nothing to compare against on the
    # first iteration
    if old_class_marginals is not None:
      class_marginals_diff = np.sum(
          np.abs(class_marginals - old_class_marginals))
      item_class_diff = np.sum(np.abs(item_classes - old_item_classes))

      logging.info('{0}\t{1:.1f}\t{2:.4f}\t\t{3:.2f}\t({4:3.2f} secs)'.format(
          iteration, log_L, class_marginals_diff, item_class_diff, iter_time))

      if class_marginals_diff < tol and item_class_diff < tol:
        converged = True
    else:
      logging.info('{0}\t{1:.1f}'.format(iteration, log_L))

    # Enforce the iteration cap on every pass. Using >= (rather than >) and
    # checking outside the branch above means at most max_iter iterations
    # run, instead of max_iter + 1.
    if iteration >= max_iter:
      converged = True

    # update current values
    old_class_marginals = class_marginals

  return class_marginals, error_rates, item_classes
def load_data(path, unit_id, worker_id, label):
  """Load the annotation CSV and keep only the columns the model needs.

  Args:
    path: location of the CSV file; it is opened through TensorFlow's gfile
      layer.
    unit_id: name of the column to keep as the item id.
    worker_id: name of the column to keep as the rater id.
    label: name of the column to keep as the rating.

  Returns:
    A pandas DataFrame with exactly the columns [unit_id, worker_id, label],
    in that order.

  Raises:
    KeyError: if one of the requested columns is not present in the CSV.
  """
  logging.info('Loading data from {0}'.format(path))

  # tf.gfile.Open only exists in TensorFlow 1.x; TensorFlow 2.x exposes the
  # same file abstraction as tf.io.gfile.GFile. Prefer the new name and fall
  # back to the legacy one so both runtimes work.
  if hasattr(tf, 'io') and hasattr(tf.io, 'gfile'):
    open_file = tf.io.gfile.GFile
  else:
    open_file = tf.gfile.Open

  with open_file(path, 'rb') as fileobj:
    df = pd.read_csv(fileobj, encoding='utf-8')

  # only keep necessary columns
  return df[[unit_id, worker_id, label]]
def initialize(counts):
  """Get initial estimates for the true item classes using counts.

  See equation 3.1 in Dawid-Skene (1979).

  Args:
    counts: counts of the number of times each response was given
      by each rater for each item: [items x raters x classes]. Note
      in the crowd rating example, counts will be a 0/1 matrix.

  Returns:
    item_classes: float matrix of estimates of true item classes:
      [items x classes]. Each row is the fraction of that item's ratings that
      fell in each class. An item with no ratings at all produces a row of
      NaN (0 / 0).

  Raises:
    ValueError: if counts is not 3-dimensional.
  """
  if np.ndim(counts) != 3:
    raise ValueError(
        'counts must have shape [items x raters x classes], got shape '
        '{0}'.format(np.shape(counts)))

  # sum over raters -> [items x classes]
  response_sums = np.sum(counts, axis=1)

  # total number of ratings per item, kept as a column so it broadcasts
  # across the class axis
  ratings_per_item = np.sum(response_sums, axis=1, keepdims=True, dtype=float)

  # for each item, take the fraction of ratings in each class; a single
  # vectorized division replaces the per-item Python loop
  return response_sums / ratings_per_item
def m_step(counts, item_classes, psuedo_count):
  """
  Vectorized M-step: maximum-likelihood estimates of the class marginals
  (p_j) and the per-rater error rates (pi_kjl) given the current soft
  assignment of items to classes.
  See equations 2.3 and 2.4 in Dawid-Skene (1979)
  Input:
    counts: how many times each rating was given by each rater for each
      item: [items x raters x classes]
    item_classes: current assignments of items to classes:
      [items x classes]
    psuedo_count: smoothing constant added to every (rater, true class,
      response) cell before normalising, i.e. we pretend each rater gave
      each response psuedo_count extra times for each true class.
  Returns:
    p_j: class marginals [classes]
    pi_kjl: error rates - the probability of rater k giving
      response l for an item in class j [raters, classes, classes]
  """
  n_items, _, _ = np.shape(counts)

  # class marginals: average soft assignment over all items
  class_marginals = np.sum(item_classes, axis=0) / float(n_items)

  # counts.T is [classes(l) x raters(k) x items]; multiplying by
  # item_classes gives the smoothed expected counts as [l x k x j]
  expected_counts = np.matmul(counts.T, item_classes) + psuedo_count
  # move the axes around so the array is [raters(k) x classes(j) x classes(l)]
  expected_counts = np.transpose(expected_counts, (1, 2, 0))

  # total over the response axis for every (rater, true class) pair
  response_totals = np.sum(expected_counts, axis=2, keepdims=True)
  # a total of zero would produce nan on division; dividing by 1 instead
  # leaves that whole row of error rates at 0
  response_totals[response_totals == 0] = 1

  error_rates = expected_counts / response_totals
  return (class_marginals, error_rates)
def m_step_verbose(counts, item_classes, psuedo_count):
  """
  Loop-based (non-vectorized) version of m_step, kept as a readable
  reference for debugging; the vectorized m_step is the faster one.
  Computes the class marginals (p_j) and the error rates (pi_kjl) by MLE
  from the current estimates of the true item classes.
  See equations 2.3 and 2.4 in Dawid-Skene (1979)
  Input:
    counts: how many times each rating was given by each rater for each
      item: [items x raters x classes]
    item_classes: current assignments of items to classes:
      [items x classes]
    psuedo_count: smoothing constant added to every (rater, true class,
      response) cell before normalising.
  Returns:
    p_j: class marginals [classes]
    pi_kjl: error rates - the probability of rater k giving
      response l for an item in class j [raters, classes, classes]
  """
  n_items, n_raters, n_classes = np.shape(counts)

  # class marginals: average soft assignment over all items
  class_marginals = np.sum(item_classes, 0) / float(n_items)

  error_rates = np.zeros([n_raters, n_classes, n_classes])
  for rater in range(n_raters):
    for true_class in range(n_classes):
      class_weights = item_classes[:, true_class]
      # smoothed expected number of times this rater gave each response
      # to items of this true class
      for response in range(n_classes):
        error_rates[rater, true_class, response] = (
            np.dot(class_weights, counts[:, rater, response]) + psuedo_count)
      # turn the expected counts into probabilities over responses
      row_total = np.sum(error_rates[rater, true_class, :])
      if row_total > 0:
        error_rates[rater, true_class, :] = (
            error_rates[rater, true_class, :] / float(row_total))
  return (class_marginals, error_rates)
def e_step(counts_tiled, class_marginals, error_rates):
  """
  Determine the probability of each item belonging to each class,
  given current ML estimates of the parameters from the M-step
  See equation 2.5 in Dawid-Skene (1979)
  Inputs:
    counts_tiled: A matrix of how many times each rating was given
      by each rater for each item, repeated for each class to make matrix
      multiplication faster. Size: [nItems, nRaters, nClasses, nClasses]
    class_marginals: probability of a random item belonging to each class.
      Size: [nClasses]
    error_rates: probability of rater k assigning a item in class j
      to class l. Size [nRaters, nClasses, nClasses]
  Returns:
    item_classes: Soft assignments of items to classes
      [items x classes]
  """
  # error_rates is [nRaters, nClasses, nClasses]; numpy broadcasts it
  # against the leading nItems axis of counts_tiled, so there is no need to
  # materialize nItems copies of it with np.tile.
  power = np.power(error_rates, counts_tiled)

  # Note, multiplying over axis 1 and then 2 is substantially faster than
  # the equivalent np.prod(power, axis=(1,3))
  item_classes = class_marginals * np.prod(np.prod(power, axis=1), axis=2)

  # normalize each item's row so its class probabilities sum to one; the
  # [nItems, 1] column of sums broadcasts across the class axis
  item_sum = np.sum(item_classes, axis=1, keepdims=True)
  item_classes = np.divide(item_classes, item_sum)
  return item_classes
def e_step_verbose(counts, class_marginals, error_rates):
  """
  Loop-based (non-vectorized) E-step. Despite the explicit loops it was
  measured to be faster than the vectorized e_step (16 seconds vs 25
  seconds respectively on 10k ratings).
  Computes, for every item, the probability of it belonging to each class
  given the current parameter estimates from the M-step.
  See equation 2.5 in Dawid-Skene (1979)
  Inputs:
    counts: how many times each rating was given by each rater for each
      item: [items x raters x classes]
    class_marginals: probability of a random item belonging to each class
    error_rates: probability of rater k assigning a item in class j
      to class l [raters, classes, classes]
  Returns:
    Soft assignments of items to classes [items x classes]
  """
  n_items, _, n_classes = np.shape(counts)
  posteriors = np.zeros([n_items, n_classes])
  for item in range(n_items):
    item_ratings = counts[item, :, :]
    for true_class in range(n_classes):
      # likelihood of the observed ratings if the item were in true_class
      ratings_likelihood = np.prod(
          np.power(error_rates[:, true_class, :], item_ratings))
      posteriors[item, true_class] = (
          class_marginals[true_class] * ratings_likelihood)
  # rescale every row so that the class probabilities of an item sum to one
  row_totals = np.sum(posteriors, axis=1, keepdims=True)
  return posteriors / row_totals
def calc_likelihood(counts, class_marginals, error_rates):
  """
  Calculate the log-likelihood given the current parameter estimates
  This should go up monotonically as EM proceeds
  See equation 2.7 in Dawid-Skene (1979)
  Inputs:
    counts: Array of how many times each response was received
      by each rater from each item: [items x raters x classes]
    class_marginals: probability of a random item belonging to each class
    error_rates: probability of rater k assigning a item in class j
      to class l [raters, classes, classes]
  Returns:
    Log-likelihood given current parameter estimates
  Raises:
    SystemExit: if the running log-likelihood becomes nan or infinite. The
      exit carries an error message, so the process terminates with a
      non-zero status instead of silently reporting success.
  """
  [nItems, nRaters, nClasses] = np.shape(counts)
  log_L = 0.0
  for i in range(nItems):
    # marginal likelihood of item i's ratings, summed over its possible
    # true classes
    item_likelihood = 0.0
    for j in range(nClasses):
      class_prior = class_marginals[j]
      item_class_likelihood = np.prod(
          np.power(error_rates[:, j, :], counts[i, :, :]))
      item_likelihood += class_prior * item_class_likelihood

    item_log_likelihood = np.log(item_likelihood)
    temp = log_L + item_log_likelihood
    if np.isnan(temp) or np.isinf(temp):
      message = ('Non-finite log-likelihood at item {0}: running total {1}, '
                 'item log-likelihood {2}, new total {3}').format(
                     i, log_L, item_log_likelihood, temp)
      logging.error(message)
      # sys.exit with a message prints it to stderr and exits with status 1
      sys.exit(message)
    log_L = temp
  return log_L
def random_initialization(counts):
  """
  Randomized counterpart of initialize(): every item is hard-assigned to
  a single class, drawn at random with probability proportional to the
  number of ratings the item received in each class.
  Input:
    counts: counts of the number of times each response was received
      by each rater from each item: [items x raters x classes]
  Returns:
    one-hot matrix of initial class estimates: [items x classes]
  """
  n_items, _, n_classes = np.shape(counts)
  # collapse the rater axis: votes_per_item[i, c] = ratings of item i in c
  votes_per_item = np.sum(counts, 1)
  one_hot = np.zeros([n_items, n_classes])
  class_ids = np.arange(n_classes)
  for row in range(n_items):
    votes = votes_per_item[row, :]
    # sampling distribution: share of the item's ratings in each class
    probabilities = votes / np.sum(votes, dtype=float)
    chosen_class = np.random.choice(class_ids, p=probabilities)
    one_hot[row, chosen_class] = 1
  return one_hot
def majority_voting(counts):
  """
  Alternative initialization: hard-assign every item to the class that
  received the most ratings across all raters.
  Input:
    counts: Counts of the number of times each response was received
      by each rater from each item: [items x raters x classes]
  Returns:
    one-hot matrix of initial class estimates: [items x classes]
  """
  n_items, _, n_classes = np.shape(counts)
  # collapse the rater axis: tallies[i, c] = ratings of item i in class c
  tallies = np.sum(counts, 1)
  winners = np.zeros([n_items, n_classes])
  for row, tally in enumerate(tallies):
    top_classes = np.flatnonzero(tally == np.max(tally))
    # ties are broken towards the lowest class index (could be randomized)
    winners[row, np.min(top_classes)] = 1
  return winners
def parse_item_classes(df, label, item_classes, index_to_unit_id_map,
                       index_to_y_map, unit_id, worker_id, comment_text_path):
  """
  Given the original data df, the predicted item_classes, and
  the data mappings, returns a DataFrame with one row per item and the
  fields:
    * {unit_id}: the original item ID
    * {label}_mean: the mean of the original ratings of the item
    * {label}_hat_{y}: one column per original class value y, holding the
        probability of the item belonging to that class as learned from
        the Dawid-Skene algorithm
    * {label}_hat_mean: the expected label, i.e. the class probabilities
        weighted by the original class values
    * _unit_index: the 0,1,...nItems index
  plus every column of the comment text file when comment_text_path is
  given.
  Input:
    df: the original annotations, one row per rating. It is not modified.
    label: name of the rating column in df
    item_classes: soft assignments of items to classes [items x classes]
    index_to_unit_id_map: item index -> original item ID
    index_to_y_map: class index -> original class value
    unit_id: name of the item ID column in df
    worker_id: name of the rater ID column in df (unused here)
    comment_text_path: optional path of a CSV keyed by unit_id that is
      joined onto the predictions; skipped when empty or None
  Returns:
    DataFrame of per-item predictions
  """
  LABEL_HAT = '{}_hat'.format(label)
  LABEL_MEAN = '{}_mean'.format(label)
  ROUND_DEC = 8
  n_items = len(item_classes)
  _, n_classes = np.shape(item_classes)

  df_predictions = pd.DataFrame()

  # Add columns for predictions for each class
  col_names = []
  y_values = []
  for k in range(n_classes):
    # y is the original value of the class. When we train, we re-map
    # all the classes to 0,1,....K. But our data has classes like
    # -2,-1,0,1,2. In that case, if k is 0, then y would be -2
    y = index_to_y_map[k]
    # Collect y in the same order as the prediction columns instead of
    # relying on the iteration order of index_to_y_map.
    y_values.append(y)
    col_name = '{0}_{1}'.format(LABEL_HAT, y)
    col_names.append(col_name)
    df_predictions[col_name] = [round(i[k], ROUND_DEC) for i in item_classes]

  # To get a prediction of the mean label, multiply our predictions with the
  # true y values.
  col_name = '{0}_hat_mean'.format(label)
  df_predictions[col_name] = np.dot(df_predictions[col_names].values, y_values)

  # Use the _unit_index to map to the original _unit_id
  df_predictions['_unit_index'] = list(range(n_items))
  df_predictions[unit_id] = [index_to_unit_id_map[i] for i in range(n_items)]

  # Calculate the y_mean from the original data and join on _unit_id.
  # The float conversion happens on a private frame so the caller's df keeps
  # its original dtype; only the mean column is rounded, never the join key.
  ratings = pd.DataFrame({
      unit_id: df[unit_id],
      label: df[label].astype(float)
  })
  mean_labels = ratings.groupby(unit_id, as_index=False)[label].mean()
  mean_labels = mean_labels.rename(columns={label: LABEL_MEAN})
  mean_labels[LABEL_MEAN] = mean_labels[LABEL_MEAN].round(ROUND_DEC)
  df_predictions = pd.merge(mean_labels, df_predictions, on=unit_id)

  # join with data that contains the item-level comment text
  if comment_text_path:
    with tf.gfile.Open(comment_text_path, 'r') as fileobj:
      logging.info(
          'Loading comment text data from {}'.format(comment_text_path))
      df_comments = pd.read_csv(fileobj)
    # drop duplicate comments
    df_comments = df_comments.drop_duplicates(subset=unit_id)
    df_predictions = df_predictions.merge(df_comments, on=unit_id)
  return df_predictions
def parse_error_rates(df, error_rates, index_to_worker_id_map, index_to_y_map,
                      unit_id, worker_id):
  """
  Given the original data DataFrame, the predicted error_rates and the
  mappings between the indexes and ids, returns a DataFrame with one row
  per worker and the fields:
    * _worker_index: the 0,1,...nRaters index
    * {worker_id}: the original worker ID
    * n_annotations: the number of rows of df rated by the worker
    * accuracy_rate_{y}: one column per original class value y, holding
        the probability the worker would choose class y when the true
        class is y (for accurate workers, these numbers are high).
  Input:
    df: the original annotations, one row per rating
    error_rates: error rates indexed as [rater index, true class index,
      response class index]
    index_to_worker_id_map: rater index -> original worker ID
    index_to_y_map: class index -> original class value
    unit_id: name of the item ID column in df
    worker_id: name of the rater ID column in df
  Returns:
    DataFrame of per-worker annotation counts and accuracy rates
  """
  df_error_rates = pd.DataFrame()

  # Materialize the dict views as lists before handing them to pandas.
  worker_indices = list(index_to_worker_id_map.keys())
  # add the integer _worker_index
  df_error_rates['_worker_index'] = worker_indices
  # add the original _worker_id
  df_error_rates[worker_id] = [index_to_worker_id_map[i] for i in worker_indices]

  # add annotation counts for each worker
  worker_counts = df.groupby(by=worker_id, as_index=False)[unit_id].count()
  worker_counts = worker_counts.rename(columns={unit_id: 'n_annotations'})
  df_error_rates = pd.merge(df_error_rates, worker_counts, on=worker_id)

  # Look the rates up through _worker_index rather than by row position, so
  # each worker gets their own rates whatever the row order of the frame is.
  rates = np.asarray(error_rates)
  row_to_rater = df_error_rates['_worker_index'].values.astype(int)

  # add the diagonal error rates, which are the per-class accuracy rates,
  # for each class k, we add a column for p(rater will pick k | item's true
  # class is k). y_label is the original y value in the data and y_index is
  # the integer we mapped it to, i.e. 0, 1, ..., |Y|
  for y_index, y_label in index_to_y_map.items():
    col_name = 'accuracy_rate_{0}'.format(y_label)
    df_error_rates[col_name] = rates[row_to_rater, y_index, y_index]
  return df_error_rates
def main(FLAGS):
  """
  Load the annotations, fit the Dawid-Skene model and write the per-item
  predictions and per-rater error rates out as CSV files.
  Input:
    FLAGS: object exposing the attributes data_path, comment_text_path,
      worker_id_col, unit_id_col, n_examples, label, job_dir, max_iter,
      pseudo_count and tolerance (see the argument parser of this module)
  Writes:
    predictions_{label}_{n}.csv and error_rates_{label}_{n}.csv under
    FLAGS.job_dir, where n is the number of annotations used. An empty
    job_dir means the current working directory.
  """
  logging.basicConfig(level=logging.INFO)

  # load data, each row is an annotation
  n_examples = FLAGS.n_examples
  label = FLAGS.label
  unit_id = FLAGS.unit_id_col
  worker_id = FLAGS.worker_id_col
  comment_text_path = FLAGS.comment_text_path
  df = load_data(FLAGS.data_path, unit_id, worker_id, label)[0:n_examples]
  logging.info('Running on {0} examples for label {1}'.format(len(df), label))

  # convert rater, item and label IDs to integers starting at 0
  #
  #   * worker_id_to_index_map: _worker_id -> index
  #   * index_to_worker_id_map: index -> worker
  #   * unit_id_to_index_map: _unit_id -> index
  #   * index_to_unit_id_map: index -> _unit_id
  #   * y_to_index_map: label -> index
  #   * index_to_y_map: index -> label
  worker_id_to_index_map = {
      w: i for (i, w) in enumerate(df[worker_id].unique())
  }
  index_to_worker_id_map = {i: w for (w, i) in worker_id_to_index_map.items()}
  unit_id_to_index_map = {w: i for (i, w) in enumerate(df[unit_id].unique())}
  index_to_unit_id_map = {i: w for (w, i) in unit_id_to_index_map.items()}
  y_to_index_map = {w: i for (i, w) in enumerate(df[label].unique())}
  index_to_y_map = {i: w for (w, i) in y_to_index_map.items()}

  # per-annotation rater, item and label indexes
  raters = [worker_id_to_index_map[w] for w in df[worker_id]]
  items = [unit_id_to_index_map[u] for u in df[unit_id]]
  y = [y_to_index_map[v] for v in df[label]]

  # the maps were built from the unique values, so their sizes are the
  # number of distinct classes, items and raters
  nClasses = len(y_to_index_map)
  nItems = len(unit_id_to_index_map)
  nRaters = len(worker_id_to_index_map)
  counts = np.zeros([nItems, nRaters, nClasses])

  # convert responses to counts
  for item_index, rater_index, y_index in zip(items, raters, y):
    counts[item_index, rater_index, y_index] += 1

  raters_unique = index_to_worker_id_map.keys()
  items_unique = index_to_unit_id_map.keys()
  classes_unique = index_to_y_map.keys()
  logging.info('num items: {0}'.format(len(items_unique)))
  logging.info('num raters: {0}'.format(len(raters_unique)))
  logging.info('num classes: {0}'.format(len(classes_unique)))

  # run EM
  start = time.time()
  class_marginals, error_rates, item_classes = run(
      items_unique,
      raters_unique,
      classes_unique,
      counts,
      label,
      FLAGS.pseudo_count,
      tol=FLAGS.tolerance,
      max_iter=FLAGS.max_iter)
  end = time.time()
  logging.info('training time: {0:.4f} seconds'.format(end - start))

  # join comment_text, old labels and new labels
  df_predictions = parse_item_classes(df, label, item_classes,
                                      index_to_unit_id_map, index_to_y_map,
                                      unit_id, worker_id, comment_text_path)

  # join rater error_rates
  df_error_rates = parse_error_rates(df, error_rates, index_to_worker_id_map,
                                     index_to_y_map, unit_id, worker_id)

  # write predictions and error_rates out as CSV
  n = len(df)
  # Build the output prefix by hand so it works for local and Cloud Storage
  # style paths alike: an empty job_dir must not turn into '/' (the
  # filesystem root) and a trailing '/' must not be doubled.
  job_dir = FLAGS.job_dir
  if job_dir and not job_dir.endswith('/'):
    job_dir = job_dir + '/'
  prediction_path = '{0}predictions_{1}_{2}.csv'.format(job_dir, label, n)
  error_rates_path = '{0}error_rates_{1}_{2}.csv'.format(job_dir, label, n)

  logging.info('Writing predictions to {}'.format(prediction_path))
  with tf.gfile.Open(prediction_path, 'w') as fileobj:
    df_predictions.to_csv(fileobj, index=False, encoding='utf-8')

  logging.info('Writing error rates to {}'.format(error_rates_path))
  with tf.gfile.Open(error_rates_path, 'w') as fileobj:
    df_error_rates.to_csv(fileobj, index=False, encoding='utf-8')
if __name__ == '__main__':
  # Command-line entry point: parse the flags and hand them to main().
  parser = argparse.ArgumentParser()
  parser.add_argument(
      '--data-path',
      help='The path to data to run on, local or in Cloud Storage.')
  parser.add_argument(
      '--comment-text-path',
      help='The path to comment text, local or in Cloud Storage.')
  parser.add_argument(
      '--worker-id-col', help='Column name of worker id.', default='_worker_id')
  parser.add_argument(
      '--unit-id-col', help='Column name of unit id.', default='_comment_id')
  # '--n_examples' is the historical spelling; '--n-examples' is accepted as
  # well so the flag matches the hyphenated style of every other option.
  parser.add_argument(
      '--n_examples',
      '--n-examples',
      dest='n_examples',
      help='The number of annotations to use.',
      default=10000000,
      type=int)
  parser.add_argument(
      '--label',
      help='The label to train on, e.g. "obscene" or "threat"',
      default='obscene')
  parser.add_argument(
      '--job-dir',
      type=str,
      default='',
      help='The directory where the job is staged.')
  parser.add_argument(
      '--max-iter',
      help='The max number of iteration to run.',
      type=int,
      default=25)
  parser.add_argument(
      '--pseudo-count',
      help='The pseudo count to smooth error rates.',
      type=float,
      default=1.0)
  # The tolerance is compared against sums of absolute float differences, so
  # it is parsed as a float; integer values such as "1" are still accepted.
  parser.add_argument(
      '--tolerance',
      help='Stop training when variables change less than this value.',
      type=float,
      default=1.0)

  FLAGS = parser.parse_args()
  print('FLAGS', FLAGS)
  main(FLAGS)
================================================
FILE: annotator_models/trainer/dawid_skene_test.py
================================================
"""Tests for dawid_skene."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import collections
import os
import pandas as pd
import tempfile
import unittest
import dawid_skene
class DawidSkeneTest(unittest.TestCase):
  """End-to-end test of dawid_skene.main on the example of the paper."""

  # The contents of Maximum Likelihood Estimation of Observer Error-Rates
  # Using the EM Algorithm Table 1.
  def setUp(self):
    # One row per patient; columns 11, 12 and 13 are the three ratings of
    # observer 1, columns 2-5 the single rating of observers 2-5.
    self.table_1 = pd.DataFrame.from_dict({
        'patient':
            range(1, 46),
        11: [
            1, 3, 1, 2, 2, 2, 1, 3, 2, 2, 4, 2, 1, 2, 1, 1, 1, 1, 2, 2, 2, 2, 2,
            2, 1, 1, 2, 1, 1, 1, 1, 3, 1, 2, 2, 4, 2, 2, 3, 1, 1, 1, 2, 1, 2
        ],
        12: [
            1, 3, 1, 2, 2, 2, 2, 3, 2, 3, 4, 2, 1, 2, 2, 1, 1, 1, 2, 2, 2, 2, 2,
            2, 1, 1, 3, 1, 1, 1, 1, 3, 1, 2, 2, 3, 2, 3, 3, 1, 1, 2, 3, 2, 2
        ],
        13: [
            1, 3, 2, 2, 2, 2, 2, 3, 2, 2, 4, 2, 1, 2, 1, 1, 1, 1, 2, 2, 2, 2, 2,
            1, 1, 1, 2, 1, 1, 2, 1, 3, 1, 2, 2, 3, 1, 2, 3, 1, 1, 1, 2, 1, 2
        ],
        2: [
            1, 4, 2, 3, 3, 3, 2, 3, 2, 2, 4, 3, 1, 3, 1, 2, 1, 1, 2, 1, 2, 2, 3,
            2, 1, 1, 2, 1, 1, 1, 1, 3, 1, 2, 3, 4, 2, 3, 3, 1, 1, 2, 2, 1, 2
        ],
        3: [
            1, 3, 1, 1, 2, 3, 1, 4, 2, 2, 4, 3, 1, 2, 1, 1, 1, 1, 2, 3, 2, 2, 2,
            2, 1, 1, 2, 1, 1, 1, 1, 2, 1, 2, 2, 3, 2, 2, 4, 1, 1, 1, 2, 1, 2
        ],
        4: [
            1, 3, 2, 2, 2, 2, 1, 3, 2, 2, 4, 4, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2,
            2, 1, 1, 2, 1, 1, 2, 1, 3, 1, 2, 3, 4, 3, 3, 3, 1, 1, 1, 2, 1, 2
        ],
        5: [
            1, 4, 2, 1, 2, 2, 1, 3, 3, 3, 4, 3, 1, 2, 1, 1, 1, 1, 1, 2, 2, 1, 2,
            2, 1, 1, 2, 1, 1, 1, 1, 3, 1, 2, 2, 3, 2, 3, 2, 1, 1, 1, 2, 1, 2
        ]
    })

  def test_paper_example(self):
    # Reshape the wide table into one row per (patient, observer, label).
    data = self.table_1.set_index('patient').stack().rename_axis(
        ['patient', 'observer']).to_frame('label').reset_index()
    data['observer'] = data['observer'].map({
        11: 1,
        12: 1,
        13: 1,
        2: 2,
        3: 3,
        4: 4,
        5: 5
    })

    Flags = collections.namedtuple(
        'Flags', 'n_examples label unit_id_col worker_id_col '
        'comment_text_path data_path pseudo_count tolerance max_iter job_dir')

    with tempfile.TemporaryDirectory() as tempdirname:
      # The input lives inside the temporary directory, so it is cleaned up
      # together with the outputs even when main() raises.
      data_path = os.path.join(tempdirname, 'ratings.csv')
      data.to_csv(data_path, header=True)
      flags = Flags(
          n_examples=350,
          label='label',
          unit_id_col='patient',
          worker_id_col='observer',
          comment_text_path=None,
          data_path=data_path,
          pseudo_count=1.0,
          tolerance=1,
          max_iter=25,
          job_dir=tempdirname)
      dawid_skene.main(flags)
      # 45 patients x 7 ratings = 315 annotations, hence the file names.
      predictions = pd.read_csv(
          os.path.join(tempdirname, 'predictions_label_315.csv'))
      error_rates = pd.read_csv(
          os.path.join(tempdirname, 'error_rates_label_315.csv'))

    # One prediction row per patient, one probability column per class.
    self.assertEqual(len(predictions), 45)
    self.assertEqual(sorted(predictions['patient'].tolist()),
                     list(range(1, 46)))
    hat_columns = ['label_hat_{}'.format(y) for y in (1, 2, 3, 4)]
    for column in hat_columns:
      self.assertIn(column, predictions.columns)
    self.assertIn('label_mean', predictions.columns)
    self.assertIn('label_hat_mean', predictions.columns)
    # Class probabilities are written with 8 decimals and must sum to one.
    row_sums = predictions[hat_columns].sum(axis=1)
    self.assertTrue(((row_sums - 1.0).abs() < 1e-6).all())

    # One error-rate row per observer, covering all 315 annotations.
    self.assertEqual(len(error_rates), 5)
    self.assertEqual(sorted(error_rates['observer'].tolist()), [1, 2, 3, 4, 5])
    self.assertEqual(int(error_rates['n_annotations'].sum()), 315)
    accuracy_columns = [
        c for c in error_rates.columns if c.startswith('accuracy_rate_')
    ]
    self.assertEqual(len(accuracy_columns), 4)
    accuracies = error_rates[accuracy_columns]
    self.assertTrue(((accuracies >= 0.0) & (accuracies <= 1.0)).all().all())
# Run the tests of this module when the file is executed directly.
if __name__ == '__main__':
  unittest.main()
================================================
FILE: attention-tutorial/Attention_Model_Tutorial.ipynb
================================================
{
"cells": [
{
"cell_type": "markdown",
"metadata": {
"colab_type": "text",
"id": "szO16q_1vXOT"
},
"source": [
"# Attention Based Classification Tutorial\n",
"\n",
"**Recommended time: 30 minutes**\n",
"\n",
"**Contributors: nthain, martin-gorner**\n",
"\n",
"\n",
"This tutorial provides an introduction to building text classification models in tensorflow that use attention to provide insight into how classification decisions are being made. We will build our tensorflow graph following the Embed - Encode - Attend - Predict paradigm introduced by Matthew Honnibal. For more information about this approach, you can refer to:\n",
"\n",
"Slides: https://goo.gl/BYT7au\n",
"\n",
"Video: https://youtu.be/pzOzmxCR37I\n",
"\n",
"\n",
"Figure 1 below provides a representation of the full tensorflow graph we will build in this tutorial. The green squares represent RNN cells and the blue trapezoids represent neural networks for computing attention weights which will be discussed in more detail below. We will implement each piece of this model graph in a separate function. The whole model will then simply be calling all of these functions in turn. \n",
"\n",
"\n",
"\n",
"\n",
"This tutorial was created in collaboration with the Tensorflow without a PhD series. To check out more episodes, tutorials, and codelabs from this series, please visit: \n",
"\n",
"https://github.com/GoogleCloudPlatform/tensorflow-without-a-phd\n",
"\n",
"\n",
"\n",
"\n"
]
},
{
"cell_type": "markdown",
"metadata": {
"colab_type": "text",
"id": "tROhMjW49Dsr"
},
"source": [
"### Imports"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"colab": {
"autoexec": {
"startup": false,
"wait_interval": 0
}
},
"colab_type": "code",
"id": "vSgQlcQqbWyb"
},
"outputs": [],
"source": [
"%load_ext autoreload\n",
"%autoreload 2\n",
"\n",
"from __future__ import absolute_import\n",
"from __future__ import division\n",
"from __future__ import print_function\n",
"\n",
"\n",
"import pandas as pd\n",
"import tensorflow as tf\n",
"import numpy as np\n",
"import time\n",
"import os\n",
"from sklearn import metrics\n",
"from visualize_attention import attentionDisplay\n",
"from process_figshare import download_figshare, process_figshare\n",
"\n",
"tf.set_random_seed(1234)"
]
},
{
"cell_type": "markdown",
"metadata": {
"colab_type": "text",
"id": "KKwX66FG9G-L"
},
"source": [
"## Load & Explore Data"
]
},
{
"cell_type": "markdown",
"metadata": {
"colab_type": "text",
"id": "4YFtwZsD4J7r"
},
"source": [
"Let's begin by downloading the data from [Figshare](https://figshare.com/articles/Wikipedia_Talk_Labels_Toxicity/4563973) and cleaning and splitting it for use in training."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"download_figshare()\n",
"process_figshare()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"We then load these splits as pandas dataframes."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"colab": {
"autoexec": {
"startup": false,
"wait_interval": 0
}
},
"colab_type": "code",
"id": "aIy4ggIxbWyg"
},
"outputs": [],
"source": [
"SPLITS = ['train', 'dev', 'test']\n",
"\n",
"wiki = {}\n",
"for split in SPLITS:\n",
" wiki[split] = pd.read_csv('data/wiki_%s.csv' % split)"
]
},
{
"cell_type": "markdown",
"metadata": {
"colab_type": "text",
"id": "_eZEM1wd5FiA"
},
"source": [
"We display the top few rows of the dataframe to see what we're dealing with. The key columns are 'comment' which contains the text of a comment from a Wikipedia talk page and 'toxicity' which contains the fraction of annotators who found this comment to be toxic. More information about the other fields and how this data was collected can be found on [this wiki](https://meta.wikimedia.org/wiki/Research:Detox/Data_Release) and [research paper](https://arxiv.org/abs/1610.08914).\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"colab": {
"autoexec": {
"startup": false,
"wait_interval": 0
},
"height": 195,
"output_extras": [
{
"item_id": 1
}
]
},
"colab_type": "code",
"executionInfo": {
"elapsed": 334,
"status": "ok",
"timestamp": 1519755503377,
"user": {
"displayName": "Nithum Thain",
"photoUrl": "//lh4.googleusercontent.com/-o8q7BcjxLpg/AAAAAAAAAAI/AAAAAAAAABQ/-zA_Kee6FY0/s50-c-k-no/photo.jpg",
"userId": "105288052437331023238"
},
"user_tz": 210
},
"id": "6sj_aimNbWyn",
"outputId": "36fccb7e-60a3-4d1c-bbfa-03483ff49f84"
},
"outputs": [],
"source": [
"wiki['train'].head()"
]
},
{
"cell_type": "markdown",
"metadata": {
"colab_type": "text",
"id": "p0cz2kA_9JxK"
},
"source": [
"### Hyperparameters"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Hyperparameters are used to specify various aspects of our model's architecture. In practice, these are often critical to model performance and are carefully tuned using some type of [hyperparameter search](https://en.wikipedia.org/wiki/Hyperparameter_optimization). For this tutorial, we will choose a reasonable set of hyperparameters and treat them as fixed."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"colab": {
"autoexec": {
"startup": false,
"wait_interval": 0
}
},
"colab_type": "code",
"id": "JSvJ3wwwbWys"
},
"outputs": [],
"source": [
"hparams = {'max_document_length': 60,\n",
" 'embedding_size': 50,\n",
" 'rnn_cell_size': 128,\n",
" 'batch_size': 256,\n",
" 'attention_size': 32,\n",
" 'attention_depth': 2}"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"colab": {
"autoexec": {
"startup": false,
"wait_interval": 0
}
},
"colab_type": "code",
"id": "owTqZg2ebWyv"
},
"outputs": [],
"source": [
"MAX_LABEL = 2\n",
"WORDS_FEATURE = 'words'\n",
"NUM_STEPS = 300"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Step 0: Text Preprocessing"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Before we can build a neural network on comment strings, we first have to complete a number of preprocessing steps. In particular, it is important that we \"tokenize\" the string, splitting it into an array of tokens. In our case, each token will be a word in our sentence and they will be separated by spaces and punctuation. Many alternative tokenizers exist, some of which use characters as tokens, and others which include punctuation, emojis, or even cleverly handle misspellings. \n",
"\n",
"Once we've tokenized the sentences, each word will be replaced with an integer representative. This will make the embedding (Step 1) much easier. \n",
"\n",
"Happily the tensorflow function [VocabularyProcessor](http://tflearn.org/data_utils/#vocabulary-processor) takes care of both the tokenization and integer mapping. We only have to give it the max_document_length argument which will determine the length of the output arrays. If sentences are shorter than this length, they will be padded and if they are longer, they will be trimmed. The VocabularyProcessor is then trained on the training set to build the initial vocabulary and map the words to integers."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"colab": {
"autoexec": {
"startup": false,
"wait_interval": 0
}
},
"colab_type": "code",
"id": "9kcrgebgbWzB"
},
"outputs": [],
"source": [
"# Initialize the vocabulary processor\n",
"vocab_processor = tf.contrib.learn.preprocessing.VocabularyProcessor(hparams['max_document_length'])\n",
"\n",
"def process_inputs(vocab_processor, df, train_label = 'train', test_label = 'test'):\n",
" \n",
" # For simplicity, we call our features x and our outputs y\n",
" x_train = df['train'].comment\n",
" y_train = df['train'].is_toxic\n",
" x_test = df['test'].comment\n",
" y_test = df['test'].is_toxic\n",
"\n",
" # Train the vocab_processor from the training set\n",
" x_train = vocab_processor.fit_transform(x_train)\n",
" # Transform our test set with the vocabulary processor\n",
" x_test = vocab_processor.transform(x_test)\n",
"\n",
" # We need these to be np.arrays instead of generators\n",
" x_train = np.array(list(x_train))\n",
" x_test = np.array(list(x_test))\n",
" y_train = np.array(y_train).astype(int)\n",
" y_test = np.array(y_test).astype(int)\n",
"\n",
" n_words = len(vocab_processor.vocabulary_)\n",
" print('Total words: %d' % n_words)\n",
"\n",
" # Return the transformed data and the number of words\n",
" return x_train, y_train, x_test, y_test, n_words\n",
"\n",
"x_train, y_train, x_test, y_test, n_words = process_inputs(vocab_processor, wiki)"
]
},
{
"cell_type": "markdown",
"metadata": {
"colab_type": "text",
"id": "1KtFmLmp9M0t"
},
"source": [
"### Step 1: Embed"
]
},
{
"cell_type": "markdown",
"metadata": {
"colab_type": "text",
"id": "AjtQe9eT9v4v"
},
"source": [
"Neural networks at their core are a composition of operators from linear algebra and non-linear activation functions. In order to perform these computations on our input sentences, we must first embed them as a vector of numbers. There are two main approaches to perform this embedding:\n",
"\n",
"\n",
"1. **Pre-trained:** It is often beneficial to initialize our embedding matrix using pre-trained embeddings like [Word2Vec](https://en.wikipedia.org/wiki/Word2vec) or [GloVe](https://nlp.stanford.edu/projects/glove/). These embeddings are trained on a huge corpus of text with a general purpose problem so that they incorporate syntactic and semantic properties of the words being embedded and are amenable to transfer learning on new problems. Once initialized, you can optionally train them further for your specific problem by allowing the embedding matrix in the graph to be a trainable variable in our tensorflow graph. \n",
"2. **Random:** Alternatively, embeddings can be \"trained from scratch\" by initializing the embedding matrix randomly and then training it like any other parameter in the tensorflow graph.\n",
"\n"
]
},
{
"cell_type": "markdown",
"metadata": {
"colab_type": "text",
"id": "rCubiw6eUVQm"
},
"source": [
"In this notebook, we will be using a random initialization. To perform this embedding we use the embed_sequence function from the layers package. This will take our input features, which are the arrays of integers we produced in Step 0, and will randomly initialize a matrix to embed them into. The parameters of this matrix will then be trained with the rest of the graph."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"colab": {
"autoexec": {
"startup": false,
"wait_interval": 0
}
},
"colab_type": "code",
"id": "UG1UXX4L_KQk"
},
"outputs": [],
"source": [
"def embed(features):\n",
" word_vectors = tf.contrib.layers.embed_sequence(\n",
" features[WORDS_FEATURE], \n",
" vocab_size=n_words, \n",
" embed_dim=hparams['embedding_size'])\n",
" \n",
" return word_vectors"
]
},
{
"cell_type": "markdown",
"metadata": {
"colab_type": "text",
"id": "nBp5uc-tSee2"
},
"source": [
"### Step 2: Encode"
]
},
{
"cell_type": "markdown",
"metadata": {
"colab_type": "text",
"id": "9vjxtIroTBUq"
},
"source": [
"A [recurrent neural network](https://en.wikipedia.org/wiki/Recurrent_neural_network) is a deep learning architecture that is useful for encoding sequential information like sentences. They are built around a single cell which contains one of several standard neural network architectures (e.g. simple [RNN](https://en.wikipedia.org/wiki/Recurrent_neural_network), [GRU](https://en.wikipedia.org/wiki/Gated_recurrent_unit), or [LSTM](https://en.wikipedia.org/wiki/Long_short-term_memory)). We will not focus on the details of the architectures, but at each point in time the cell takes in two inputs and produces two outputs. The inputs are the input token for that step in the sequence and some state from the previous steps in the sequence. The outputs produced are the encoded vectors for the current sequence step and a state to pass on to the next step of the sequence. \n",
"\n",
"Figure 2 shows what this looks like for an unrolled RNN. Each cell (represented by a green square) has two input arrows and two output arrows. Note that all of the green squares represent the same cell and share parameters. One major advantage of this cell replication is that, at inference time, it allows us to deal with arbitrary length input and not be restricted by the input sizes of our training set.\n",
"\n",
""
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"For our model, we will use a bi-directional RNN. This is simply the concatenation of two RNNs, one which processes the sequence from left to right (the \"forward\" RNN) and one which processes it from right to left (the \"backward\" RNN). By using both directions, we get a stronger encoding as each word can be encoded using the context of its neighbors on both sides rather than just a single side. For our cells, we use [gated recurrent units (GRUs)](https://en.wikipedia.org/wiki/Gated_recurrent_unit). Figure 3 gives a visual representation of this.\n",
"\n",
""
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"colab": {
"autoexec": {
"startup": false,
"wait_interval": 0
}
},
"colab_type": "code",
"id": "DBDS9LjdUZbV"
},
"outputs": [],
"source": [
"def encode(word_vectors):\n",
" # Create a Gated Recurrent Unit cell with hidden size of RNN_SIZE.\n",
" # Since the forward and backward RNNs will have different parameters, we instantiate two seperate GRUS.\n",
" rnn_fw_cell = tf.contrib.rnn.GRUCell(hparams['rnn_cell_size'])\n",
" rnn_bw_cell = tf.contrib.rnn.GRUCell(hparams['rnn_cell_size'])\n",
" \n",
" # Create an unrolled Bi-Directional Recurrent Neural Networks to length of\n",
" # max_document_length and passes word_list as inputs for each unit.\n",
" outputs, _ = tf.nn.bidirectional_dynamic_rnn(rnn_fw_cell, \n",
" rnn_bw_cell, \n",
" word_vectors, \n",
" dtype=tf.float32, \n",
" time_major=False)\n",
" \n",
" return outputs"
]
},
{
"cell_type": "markdown",
"metadata": {
"colab_type": "text",
"id": "V8hbwTb7dXLV"
},
"source": [
"### Step 3: Attend"
]
},
{
"cell_type": "markdown",
"metadata": {
"colab_type": "text",
"id": "PMKkWgSwdZSq"
},
"source": [
"There are a number of ways to use the encoded states of a recurrent neural network for prediction. One traditional approach is to simply use the final encoded state of the network, as seen in Figure 2. However, this could lose some useful information encoded in the previous steps of the sequence. In order to keep that information, one could instead use an average of the encoded states outputted by the RNN. There is no reason to believe, though, that all of the encoded states of the RNN are equally valuable. Thus, we arrive at the idea of using a weighted sum of these encoded states to make our prediction.\n",
"\n",
"We will call the weights of this weighted sum \"attention weights\" as we will see below that they correspond to how important our model thinks each token of the sequence is in making a prediction decision. We compute these attention weights simply by building a small fully connected neural network on top of each encoded state. This network will have a single unit final layer which will correspond to the attention weight we will assign. As for RNNs, the parameters of this network will be the same for each step of the sequence, allowing us to accommodate variable length inputs. Figure 4 shows us what the graph would look like if we applied attention to a uni-directional RNN.\n",
"\n",
""
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Again, as our model uses a bi-directional RNN, we first concatenate the hidden states from each RNN before computing the attention weights and applying the weighted sum. Figure 5 below visualizes this step. \n",
"\n",
""
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"colab": {
"autoexec": {
"startup": false,
"wait_interval": 0
}
},
"colab_type": "code",
"id": "3a9fkmUOdeHh"
},
"outputs": [],
"source": [
"def attend(inputs, attention_size, attention_depth):\n",
" \n",
" inputs = tf.concat(inputs, axis = 2)\n",
" \n",
" inputs_shape = inputs.shape\n",
" sequence_length = inputs_shape[1].value\n",
" final_layer_size = inputs_shape[2].value\n",
" \n",
" x = tf.reshape(inputs, [-1, final_layer_size])\n",
" for _ in range(attention_depth-1):\n",
" x = tf.layers.dense(x, attention_size, activation = tf.nn.relu)\n",
" x = tf.layers.dense(x, 1, activation = None)\n",
" logits = tf.reshape(x, [-1, sequence_length, 1])\n",
" alphas = tf.nn.softmax(logits, dim = 1)\n",
" \n",
" output = tf.reduce_sum(inputs * alphas, 1)\n",
"\n",
" return output, alphas"
]
},
{
"cell_type": "markdown",
"metadata": {
"colab_type": "text",
"id": "bqtYZzWeoz55"
},
"source": [
"### Step 4: Predict"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"To generate a class prediction about whether a comment is toxic or not, the final part of our tensorflow graph takes the weighted average of hidden states generated in the attention step and uses a fully connected layer with a softmax activation function to generate probability scores for each of our prediction classes. While training, the model will use the cross-entropy loss function to train its parameters. \n",
"\n",
"As we will use the [estimator framework](https://www.tensorflow.org/get_started/custom_estimators) to train our model, we write an estimator_spec function to specify how our model is trained and what values to return during the prediction stage. We also specify the evaluation metrics of accuracy and auc, which we will use to evaluate our model in Step 7."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"colab": {
"autoexec": {
"startup": false,
"wait_interval": 0
}
},
"colab_type": "code",
"id": "L6_Wo4ixbWzI"
},
"outputs": [],
"source": [
"def estimator_spec_for_softmax_classification(\n",
" logits, labels, mode, alphas):\n",
" \"\"\"Returns EstimatorSpec instance for softmax classification.\"\"\"\n",
" predicted_classes = tf.argmax(logits, 1)\n",
" if mode == tf.estimator.ModeKeys.PREDICT:\n",
" return tf.estimator.EstimatorSpec(\n",
" mode=mode,\n",
" predictions={\n",
" 'class': predicted_classes,\n",
" 'prob': tf.nn.softmax(logits),\n",
" 'attention': alphas\n",
" })\n",
"\n",
" onehot_labels = tf.one_hot(labels, MAX_LABEL, 1, 0)\n",
" loss = tf.losses.softmax_cross_entropy(\n",
" onehot_labels=onehot_labels, logits=logits)\n",
" if mode == tf.estimator.ModeKeys.TRAIN:\n",
" optimizer = tf.train.AdamOptimizer(learning_rate=0.01)\n",
" train_op = optimizer.minimize(loss, \n",
" global_step=tf.train.get_global_step())\n",
" return tf.estimator.EstimatorSpec(mode, \n",
" loss=loss, \n",
" train_op=train_op)\n",
"\n",
" eval_metric_ops = {\n",
" 'accuracy': tf.metrics.accuracy(\n",
" labels=labels, predictions=predicted_classes),\n",
" 'auc': tf.metrics.auc(\n",
" labels=labels, predictions=predicted_classes), \n",
" }\n",
" return tf.estimator.EstimatorSpec(\n",
" mode=mode, loss=loss, eval_metric_ops=eval_metric_ops)\n"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"The predict component of our graph then just takes the output of our attention step, i.e. the weighted average of the bi-RNN hidden layers, and adds one more fully connected layer to compute the logits. These logits are fed into our estimator_spec which uses a softmax to get the final class probabilities and a [softmax_cross_entropy](https://www.tensorflow.org/api_docs/python/tf/losses/softmax_cross_entropy) to build a loss function."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"def predict(encoding, labels, mode, alphas):\n",
" logits = tf.layers.dense(encoding, MAX_LABEL, activation=None)\n",
" return estimator_spec_for_softmax_classification(\n",
" logits=logits, labels=labels, mode=mode, alphas=alphas)"
]
},
{
"cell_type": "markdown",
"metadata": {
"colab_type": "text",
"id": "0URRXudn9Qlg"
},
"source": [
"### Step 5: Complete Model Architecture"
]
},
{
"cell_type": "markdown",
"metadata": {
"colab": {
"autoexec": {
"startup": false,
"wait_interval": 0
}
},
"colab_type": "code",
"collapsed": true,
"id": "cdb9C4jNbCBj"
},
"source": [
"We are now ready to put it all together. As you can see from the bi_rnn_model function below, once you have the components for embed, encode, attend, and predict, putting the whole graph together is extremely simple!"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"colab": {
"autoexec": {
"startup": false,
"wait_interval": 0
}
},
"colab_type": "code",
"id": "FcxSFa5vbWzR"
},
"outputs": [],
"source": [
"def bi_rnn_model(features, labels, mode):\n",
" \"\"\"RNN model to predict from sequence of words to a class.\"\"\"\n",
"\n",
" word_vectors = embed(features)\n",
" outputs = encode(word_vectors)\n",
" encoding, alphas = attend(outputs, \n",
" hparams['attention_size'], \n",
" hparams['attention_depth'])\n",
"\n",
" return predict(encoding, labels, mode, alphas)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
""
]
},
{
"cell_type": "markdown",
"metadata": {
"colab_type": "text",
"id": "9jZqVeWx9TVT"
},
"source": [
"### Step 6: Train Model"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"We will use the estimator framework to train our model. To define our classifier, we just provide it with the complete model graph (i.e. the bi_rnn_model function) and a directory where the models will be saved."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"colab": {
"autoexec": {
"startup": false,
"wait_interval": 0
}
},
"colab_type": "code",
"id": "HFDYpImJbWzT"
},
"outputs": [],
"source": [
"current_time = str(int(time.time()))\n",
"model_dir = os.path.join('checkpoints', current_time)\n",
"classifier = tf.estimator.Estimator(model_fn=bi_rnn_model, \n",
" model_dir=model_dir)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"The estimator framework also requires us to define an input function. This will take the input data and provide it during model training in batches. We will use the provided numpy_input_fn, which takes numpy arrays as features and labels. We also specify the batch size and whether we want to shuffle the data between epochs."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"colab": {
"autoexec": {
"startup": false,
"wait_interval": 0
},
"height": 34,
"output_extras": [
{
"item_id": 1
}
]
},
"colab_type": "code",
"executionInfo": {
"elapsed": 153379,
"status": "ok",
"timestamp": 1519758352944,
"user": {
"displayName": "Nithum Thain",
"photoUrl": "//lh4.googleusercontent.com/-o8q7BcjxLpg/AAAAAAAAAAI/AAAAAAAAABQ/-zA_Kee6FY0/s50-c-k-no/photo.jpg",
"userId": "105288052437331023238"
},
"user_tz": 210
},
"id": "gXJdQHe-bWzX",
"outputId": "353cbe80-0e36-4832-ed8e-5e6d31087ca1"
},
"outputs": [],
"source": [
"# Train.\n",
"train_input_fn = tf.estimator.inputs.numpy_input_fn(\n",
" x={WORDS_FEATURE: x_train},\n",
" y=y_train,\n",
" batch_size=hparams['batch_size'],\n",
" num_epochs=None,\n",
" shuffle=True)\n"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Now, it's finally time to train our model! With estimator, this is as easy as calling the train function and specifying how long we'd like to train for."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"classifier.train(input_fn=train_input_fn, \n",
" steps=NUM_STEPS)"
]
},
{
"cell_type": "markdown",
"metadata": {
"colab_type": "text",
"id": "wJQI2zW19V8j"
},
"source": [
"### Step 7: Predict and Evaluate Model"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"To evaluate the function, we will use it to predict the values of examples from our test set. Again, we define a numpy_input_fn, for the test data in this case, and then have the classifier run predictions on this input function."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"colab": {
"autoexec": {
"startup": false,
"wait_interval": 0
}
},
"colab_type": "code",
"id": "4E5poMgPbWza"
},
"outputs": [],
"source": [
"# Predict.\n",
"test_input_fn = tf.estimator.inputs.numpy_input_fn(\n",
" x={WORDS_FEATURE: x_test},\n",
" y=y_test,\n",
" num_epochs=1,\n",
" shuffle=False)\n",
"\n",
"predictions = classifier.predict(input_fn=test_input_fn)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"These predictions are returned to us as a generator. The code below gives an example of how we can extract the class and attention weights for each prediction."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"colab": {
"autoexec": {
"startup": false,
"wait_interval": 0
}
},
"colab_type": "code",
"id": "oTL7trjX00Zp"
},
"outputs": [],
"source": [
"y_predicted = []\n",
"alphas_predicted = []\n",
"for p in predictions:\n",
" y_predicted.append(p['class'])\n",
" alphas_predicted.append(p['attention'])"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"To evaluate our model, we can use the evaluate function provided by estimator to get the [accuracy](https://en.wikipedia.org/wiki/Evaluation_of_binary_classifiers) and [ROC-AUC](https://en.wikipedia.org/wiki/Receiver_operating_characteristic) scores as we defined them in our estimator_spec."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"colab": {
"autoexec": {
"startup": false,
"wait_interval": 0
},
"height": 34,
"output_extras": [
{
"item_id": 1
}
]
},
"colab_type": "code",
"executionInfo": {
"elapsed": 17936,
"status": "ok",
"timestamp": 1519758410784,
"user": {
"displayName": "Nithum Thain",
"photoUrl": "//lh4.googleusercontent.com/-o8q7BcjxLpg/AAAAAAAAAAI/AAAAAAAAABQ/-zA_Kee6FY0/s50-c-k-no/photo.jpg",
"userId": "105288052437331023238"
},
"user_tz": 210
},
"id": "jpgentt6bWzf",
"outputId": "ae6de3cc-9eb5-469a-e04e-958a784e9dee"
},
"outputs": [],
"source": [
"scores = classifier.evaluate(input_fn=test_input_fn)\n",
"print('Accuracy: {0:f}'.format(scores['accuracy']))\n",
"print('AUC: {0:f}'.format(scores['auc']))"
]
},
{
"cell_type": "markdown",
"metadata": {
"colab_type": "text",
"id": "lOmmwP6UV8h7"
},
"source": [
"### Step 8: Display Attention"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Now that we have a trained attention based toxicity model, let's use it to visualize how our model makes its classification decisions. We use the helpful attentionDisplay class from the visualize_attention package. Given any sentence, this class uses our trained classifier to determine whether the sentence is toxic and also returns a representation of the attention weights. In the arrays below, the more red a word is, the more weight the classifier puts on that encoded word. Try it out on some sentences of your own and see what patterns you can find!\n",
"\n",
"Note: If you are viewing this on Github, the colors in the cells won't display properly. We recommend viewing it locally or with [nbviewer](https://nbviewer.jupyter.org/) to see the correct rendering of the attention weights."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"display = attentionDisplay(vocab_processor, classifier)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"colab": {
"autoexec": {
"startup": false,
"wait_interval": 0
},
"height": 95,
"output_extras": [
{
"item_id": 1
},
{
"item_id": 2
}
]
},
"colab_type": "code",
"executionInfo": {
"elapsed": 1096,
"status": "ok",
"timestamp": 1519758417492,
"user": {
"displayName": "Nithum Thain",
"photoUrl": "//lh4.googleusercontent.com/-o8q7BcjxLpg/AAAAAAAAAAI/AAAAAAAAABQ/-zA_Kee6FY0/s50-c-k-no/photo.jpg",
"userId": "105288052437331023238"
},
"user_tz": 210
},
"id": "xSpv2plUV4mN",
"outputId": "952a6fc6-bac4-46ab-c354-c54e5d288d75"
},
"outputs": [],
"source": [
"display.display_prediction_attention(\"Fuck off, you idiot.\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"colab": {
"autoexec": {
"startup": false,
"wait_interval": 0
},
"height": 95,
"output_extras": [
{
"item_id": 1
},
{
"item_id": 2
}
]
},
"colab_type": "code",
"executionInfo": {
"elapsed": 1024,
"status": "ok",
"timestamp": 1519758419192,
"user": {
"displayName": "Nithum Thain",
"photoUrl": "//lh4.googleusercontent.com/-o8q7BcjxLpg/AAAAAAAAAAI/AAAAAAAAABQ/-zA_Kee6FY0/s50-c-k-no/photo.jpg",
"userId": "105288052437331023238"
},
"user_tz": 210
},
"id": "m9bsno-UV4o0",
"outputId": "beb38261-3e4e-4348-e62f-d23bac629268"
},
"outputs": [],
"source": [
"display.display_prediction_attention(\"Thanks for your help editing this.\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"colab": {
"autoexec": {
"startup": false,
"wait_interval": 0
},
"height": 95,
"output_extras": [
{
"item_id": 1
},
{
"item_id": 2
}
]
},
"colab_type": "code",
"executionInfo": {
"elapsed": 1223,
"status": "ok",
"timestamp": 1519758421016,
"user": {
"displayName": "Nithum Thain",
"photoUrl": "//lh4.googleusercontent.com/-o8q7BcjxLpg/AAAAAAAAAAI/AAAAAAAAABQ/-zA_Kee6FY0/s50-c-k-no/photo.jpg",
"userId": "105288052437331023238"
},
"user_tz": 210
},
"id": "nB4G8rriV4wt",
"outputId": "2b540ca1-a03d-475a-a54a-6c22558e0be3"
},
"outputs": [],
"source": [
"display.display_prediction_attention(\"You're such an asshole. But thanks anyway.\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"colab": {
"autoexec": {
"startup": false,
"wait_interval": 0
},
"height": 95,
"output_extras": [
{
"item_id": 1
},
{
"item_id": 2
}
]
},
"colab_type": "code",
"executionInfo": {
"elapsed": 1067,
"status": "ok",
"timestamp": 1519758422814,
"user": {
"displayName": "Nithum Thain",
"photoUrl": "//lh4.googleusercontent.com/-o8q7BcjxLpg/AAAAAAAAAAI/AAAAAAAAABQ/-zA_Kee6FY0/s50-c-k-no/photo.jpg",
"userId": "105288052437331023238"
},
"user_tz": 210
},
"id": "2L3TNl-NV4zV",
"outputId": "d58ba84a-c30f-4ddb-ecb5-3fc36a850bd5"
},
"outputs": [],
"source": [
"display.display_prediction_attention(\"I'm going to shoot you!\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"colab": {
"autoexec": {
"startup": false,
"wait_interval": 0
},
"height": 95,
"output_extras": [
{
"item_id": 1
},
{
"item_id": 2
}
]
},
"colab_type": "code",
"executionInfo": {
"elapsed": 1383,
"status": "ok",
"timestamp": 1519758424819,
"user": {
"displayName": "Nithum Thain",
"photoUrl": "//lh4.googleusercontent.com/-o8q7BcjxLpg/AAAAAAAAAAI/AAAAAAAAABQ/-zA_Kee6FY0/s50-c-k-no/photo.jpg",
"userId": "105288052437331023238"
},
"user_tz": 210
},
"id": "r5BKahjfV41o",
"outputId": "05b91277-4d0a-4627-8cb9-c2275a799927"
},
"outputs": [],
"source": [
"display.display_prediction_attention(\"Oh shoot. Well alright.\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"colab": {
"autoexec": {
"startup": false,
"wait_interval": 0
},
"height": 95,
"output_extras": [
{
"item_id": 1
},
{
"item_id": 2
}
]
},
"colab_type": "code",
"executionInfo": {
"elapsed": 1154,
"status": "ok",
"timestamp": 1519758426592,
"user": {
"displayName": "Nithum Thain",
"photoUrl": "//lh4.googleusercontent.com/-o8q7BcjxLpg/AAAAAAAAAAI/AAAAAAAAABQ/-zA_Kee6FY0/s50-c-k-no/photo.jpg",
"userId": "105288052437331023238"
},
"user_tz": 210
},
"id": "8GicGWbCV4uz",
"outputId": "f02500eb-35a9-466a-a759-8b83fb05feb3"
},
"outputs": [],
"source": [
"display.display_prediction_attention(\"First of all who the fuck died and made you the god.\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"colab": {
"autoexec": {
"startup": false,
"wait_interval": 0
},
"height": 95,
"output_extras": [
{
"item_id": 1
},
{
"item_id": 2
}
]
},
"colab_type": "code",
"executionInfo": {
"elapsed": 1061,
"status": "ok",
"timestamp": 1519758428491,
"user": {
"displayName": "Nithum Thain",
"photoUrl": "//lh4.googleusercontent.com/-o8q7BcjxLpg/AAAAAAAAAAI/AAAAAAAAABQ/-zA_Kee6FY0/s50-c-k-no/photo.jpg",
"userId": "105288052437331023238"
},
"user_tz": 210
},
"id": "kWIR-ivlWi18",
"outputId": "fb25ede3-e321-4abb-e358-3a0be35266fa"
},
"outputs": [],
"source": [
"display.display_prediction_attention(\"Gosh darn it!\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"colab": {
"autoexec": {
"startup": false,
"wait_interval": 0
},
"height": 95,
"output_extras": [
{
"item_id": 1
},
{
"item_id": 2
}
]
},
"colab_type": "code",
"executionInfo": {
"elapsed": 1400,
"status": "ok",
"timestamp": 1519758433415,
"user": {
"displayName": "Nithum Thain",
"photoUrl": "//lh4.googleusercontent.com/-o8q7BcjxLpg/AAAAAAAAAAI/AAAAAAAAABQ/-zA_Kee6FY0/s50-c-k-no/photo.jpg",
"userId": "105288052437331023238"
},
"user_tz": 210
},
"id": "MJhqEbl8WlJm",
"outputId": "acf96708-f04a-4493-a650-70ff8f6aa2a7"
},
"outputs": [],
"source": [
"display.display_prediction_attention(\"God damn it!\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"colab": {
"autoexec": {
"startup": false,
"wait_interval": 0
},
"height": 95,
"output_extras": [
{
"item_id": 1
},
{
"item_id": 2
}
]
},
"colab_type": "code",
"executionInfo": {
"elapsed": 1400,
"status": "ok",
"timestamp": 1519758437722,
"user": {
"displayName": "Nithum Thain",
"photoUrl": "//lh4.googleusercontent.com/-o8q7BcjxLpg/AAAAAAAAAAI/AAAAAAAAABQ/-zA_Kee6FY0/s50-c-k-no/photo.jpg",
"userId": "105288052437331023238"
},
"user_tz": 210
},
"id": "BDWSuL3kZCT1",
"outputId": "795856d9-ab5d-48aa-ceb2-46a654eec60b"
},
"outputs": [],
"source": [
"display.display_prediction_attention(\"You're not that smart are you?\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"colab": {
"collapsed_sections": [],
"default_view": {},
"last_runtime": {
"build_target": "//learning/brain/python/client:colab_notebook",
"kind": "private"
},
"name": "Attention Model Codelab.ipynb",
"provenance": [
{
"file_id": "1TEez0zxlE23RyPtPVEUaL6zhim-r8gMj",
"timestamp": 1518199421351
},
{
"file_id": "0By5BN4UDRuWSSHJuR2t2YVIzZjQ",
"timestamp": 1509645017645
}
],
"version": "0.3.2",
"views": {}
},
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.5"
}
},
"nbformat": 4,
"nbformat_minor": 1
}
================================================
FILE: attention-tutorial/README.md
================================================
# Attention Based Classification Tutorial
**Recommended time: 30 minutes**
**Contributors: nthain, martin-gorner**
This tutorial provides an introduction to building text classification models in Tensorflow that use attention to provide insight into how classification decisions are being made. We will build our Tensorflow graph following the Embed - Encode - Attend - Predict paradigm introduced by Matthew Honnibal. For more information about this approach, you can refer to:
Slides: https://goo.gl/BYT7au
Video: https://youtu.be/pzOzmxCR37I
Figure 1 below provides a representation of the full Tensorflow graph we will build in this tutorial.

This tutorial was created in collaboration with the Tensorflow without a PhD series. To check out more episodes, tutorials, and codelabs from this series, please visit:
https://github.com/GoogleCloudPlatform/tensorflow-without-a-phd
## To Run Locally
1. Set up a [virtualenv](https://virtualenvwrapper.readthedocs.io/en/latest/) for
the project (recommended, but technically optional).
Python 3:
```shell
python3 -m venv env
```
To enter your virtual env:
```shell
source env/bin/activate
```
2. Install library dependencies:
```shell
pip install -r requirements.txt
```
================================================
FILE: attention-tutorial/checkpoints/README.md
================================================
This directory stores model checkpoints during training.
================================================
FILE: attention-tutorial/data/README.md
================================================
A directory to hold our toxicity data.
================================================
FILE: attention-tutorial/process_figshare.py
================================================
"""Cleans and splits the toxicity data from Figshare:
https://figshare.com/articles/Wikipedia_Talk_Labels_Toxicity/4563973
------------------------------------------------------------------------
Copyright 2018, Google Inc.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
"""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import pandas as pd
import os
import re
from urllib.request import urlretrieve
# Directory used as the default argument for both the download location of the
# Figshare tsvs and the output location of the processed csv splits.
DEFAULT_DATA_DIR = 'data/'
# Common URL prefix of the Figshare download links; a per-file numeric suffix
# is appended to it in the mapping below.
FIGSHARE_PATH = 'https://ndownloader.figshare.com/files/'
# Maps each local tsv file name to the URL it is downloaded from.
FIGSHARE_URL_MAPPING = {
    'toxicity_annotations.tsv': FIGSHARE_PATH + '7394539',
    'toxicity_annotated_comments.tsv': FIGSHARE_PATH + '7394542'
}
def download_figshare(download_data_dir=DEFAULT_DATA_DIR):
  """Downloads the toxicity data from Figshare.

  Every file listed in FIGSHARE_URL_MAPPING that is not already present in
  `download_data_dir` is fetched; files that already exist are left untouched.

  Args:
    * download_data_dir (string): if provided, the directory where the
        Figshare tsvs should be stored
  """
  # exist_ok avoids the check-then-create race of os.path.exists + makedirs.
  os.makedirs(download_data_dir, exist_ok=True)

  already_exist = True
  for file_name, url in FIGSHARE_URL_MAPPING.items():
    target_path = os.path.join(download_data_dir, file_name)
    if os.path.isfile(target_path):
      continue

    already_exist = False
    print('Downloading %s...' % file_name, end='')
    # Download under a temporary name and rename only on success, so that an
    # interrupted download never leaves a truncated tsv that a later run would
    # mistake for a complete file.
    partial_path = target_path + '.part'
    try:
      urlretrieve(url, partial_path)
      os.replace(partial_path, target_path)
    finally:
      if os.path.exists(partial_path):
        os.remove(partial_path)
    print('Done!')

  if already_exist:
    print('Figshare data already exists.')
def process_figshare(input_data_dir=DEFAULT_DATA_DIR,
                     output_data_dir=DEFAULT_DATA_DIR):
  """Cleans and splits the toxicity data from Figshare.

  Reads 'toxicity_annotated_comments.tsv' and 'toxicity_annotations.tsv' from
  `input_data_dir`, averages the per-annotation 'toxicity' value for each
  'rev_id', joins that mean onto the comments, replaces the NEWLINE_TOKEN and
  TAB_TOKEN placeholders with spaces, adds a boolean 'is_toxic' column (mean
  toxicity > 0.5) and writes one 'wiki_<split>.csv' file per value of the
  'split' column ('train', 'test', 'dev') into `output_data_dir`.

  Nothing is done when all three output files already exist.

  Args:
    * input_data_dir (string): if provided, the directory where the
        Figshare tsvs are stored
    * output_data_dir (string): if provided, the directory where the
        output splits should be written
  """
  splits = ['train', 'test', 'dev']
  output_paths = [
      os.path.join(output_data_dir, 'wiki_%s.csv' % split) for split in splits
  ]
  if all(os.path.isfile(path) for path in output_paths):
    print('Processed files already exist.')
    return

  print('Processing files...', end='')

  # rev_id is read as a string so that it is used purely as a join key.
  toxicity_annotated_comments = pd.read_csv(
      os.path.join(input_data_dir, 'toxicity_annotated_comments.tsv'),
      sep='\t',
      dtype={'rev_id': 'str'})
  toxicity_annotations = pd.read_csv(
      os.path.join(input_data_dir, 'toxicity_annotations.tsv'),
      sep='\t',
      dtype={'rev_id': 'str'})

  # One row per comment holding the mean of its annotations' toxicity values.
  annotations_gped = toxicity_annotations.groupby(
      'rev_id', as_index=False).agg({'toxicity': 'mean'})
  all_data = pd.merge(
      annotations_gped, toxicity_annotated_comments, on='rev_id')

  all_data['comment'] = all_data['comment'].apply(
      lambda x: re.sub('NEWLINE_TOKEN|TAB_TOKEN', ' ', x))
  all_data['is_toxic'] = all_data['toxicity'] > 0.5

  # The output directory may differ from the input one and not exist yet. An
  # empty string means the current directory, which needs no creation.
  if output_data_dir:
    os.makedirs(output_data_dir, exist_ok=True)

  # Split into train, test and dev according to the dataset's 'split' column.
  for split, output_path in zip(splits, output_paths):
    all_data[all_data['split'] == split].to_csv(output_path, index=False)

  print('Done!')
if __name__ == '__main__':
  # Imported here because it is only needed when this file is run as a script.
  import argparse

  # Both flags default to DEFAULT_DATA_DIR, so running the script without
  # arguments behaves exactly like a bare process_figshare() call.
  parser = argparse.ArgumentParser(
      description='Cleans and splits the toxicity data from Figshare.')
  parser.add_argument(
      '--input_data_dir',
      default=DEFAULT_DATA_DIR,
      help='Directory where the Figshare tsvs are stored.')
  parser.add_argument(
      '--output_data_dir',
      default=DEFAULT_DATA_DIR,
      help='Directory where the output splits should be written.')
  args = parser.parse_args()

  process_figshare(input_data_dir=args.input_data_dir,
                   output_data_dir=args.output_data_dir)
================================================
FILE: attention-tutorial/requirements.txt
================================================
absl-py==0.1.9
appnope==0.1.0
bleach==3.3.0
certifi==2024.7.4
chardet==3.0.4
comet-ml==1.0.8
decorator==4.2.1
entrypoints==0.2.3
enum34==1.1.6
futures==3.1.1
h5py==2.7.1
html5lib==0.999999999
idna==3.7
ipykernel==4.8.2
ipython==8.10.0
ipython-genutils==0.2.0
ipywidgets==7.1.2
jedi==0.11.1
Jinja2==3.1.4
jsonschema==2.6.0
jupyter==1.0.0
jupyter-client==5.2.3
jupyter-console==5.2.0
jupyter-core==4.11.2
kaggle==1.0.5
Keras==2.13.1
Markdown==2.6.11
MarkupSafe==1.0
mistune==2.0.3
nbconvert==6.5.1
nbformat==4.4.0
nltk==3.9
notebook==6.4.12
numpy==1.22.0
pandas==0.22.0
pandocfilters==1.4.2
parso==0.1.1
pexpect==4.4.0
pickleshare==0.7.4
Pillow==10.3.0
prompt-toolkit==1.0.15
protobuf==3.18.3
ptyprocess==0.5.2
Pygments==2.15.0
python-dateutil==2.6.1
pytz==2017.3
PyYAML==5.4
pyzmq==17.0.0
qtconsole==4.3.1
requests==2.32.2
scikit-learn==0.19.1
scipy==1.10.0
Send2Trash==1.5.0
simplegeneric==0.8.1
six==1.11.0
sklearn==0.0
tensorflow==2.12.1
tensorflow-tensorboard==1.5.0
terminado==0.8.1
testpath==0.3.1
tflearn==0.3.2
tornado==6.4.1
traitlets==4.3.2
urllib3==1.26.18
wcwidth==0.1.7
webencodings==0.5.1
websocket-client==0.47.0
Werkzeug==3.0.6
widgetsnbextension==3.1.4
wurlitzer==1.0.1
================================================
FILE: attention-tutorial/visualize_attention.py
================================================
"""A class to help visualize attention weights.
------------------------------------------------------------------------
Copyright 2018, Google Inc.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
"""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import pandas as pd
import tensorflow as tf
import numpy as np
# Use the fully qualified option name: the bare 'max_columns' pattern is only
# accepted while it matches exactly one pandas option, and pandas versions that
# define additional '*.max_columns' options reject it as ambiguous.
pd.set_option('display.max_columns', 100)
# Tokenizer applied to raw sentences before they are displayed.
# NOTE(review): presumably the same tokenizer the vocabulary processor uses by
# default, so displayed tokens line up with model inputs; confirm.
tokenizer = tf.contrib.learn.preprocessing.tokenizer
# Name of the feature key holding the word ids.
WORDS_FEATURE = 'words'
# Document length (in tokens) used by attentionDisplay when padding or
# truncating a sentence for display.
MAX_DOCUMENT_LENGTH = 60
class wordVal(object):
  """Pairs a word with a value; converting the pair to a string yields the word."""

  def __init__(self, word, val):
    """Stores `word` and `val` unchanged as attributes of the same names."""
    self.word, self.val = word, val

  def __str__(self):
    """Returns only the word, so the value is not shown when rendered as text."""
    word = self.word
    return word
class attentionDisplay(object):
  """A class to visualize attention weights produced by a classifier on a given string."""

  def __init__(self, vocab_processor, classifier, words_feature='words'):
    """
    Args:
      * vocab_processor: a trained vocabulary processor from
          tf.contrib.learn.preprocessing.VocabularyProcessor
      * classifier: the classifier of class Estimator produced in
          Attention_Model_Codelab.ipynb
      * words_feature (string): if provided, the key for the comments in the
          feed dictionary expected by the classifier
    """
    self.vocab_processor = vocab_processor
    self.classifier = classifier
    self.words_feature = words_feature

  def _rgb_to_hex(self, rgb):
    """Formats a sequence of three ints (r, g, b) as a '#rrggbb' string."""
    return '#%02x%02x%02x' % tuple(rgb)

  def _color_wordvals(self, s):
    """Returns the CSS background-color rule for an object with a `val` weight.

    A weight of 0 maps to white and a weight of 1 to pure red. The weight is
    clamped to [0, 1] so that an out-of-range value cannot yield a malformed
    color string.
    """
    # `val` may be a plain number or a one-element array; take its first
    # element as a float in either case.
    weight = float(np.asarray(s.val).reshape(-1)[0])
    weight = min(max(weight, 0.0), 1.0)
    r = 255 - int(weight * 255)
    color = self._rgb_to_hex((255, r, r))
    return 'background-color: %s' % color

  def _predict_sentence(self, input_string):
    """Runs the classifier on one sentence.

    Returns:
      A pair of lists (predicted classes, attention weights), each with one
      entry per prediction yielded by the classifier.
    """
    x_test = self.vocab_processor.transform([input_string])
    x_test = np.array(list(x_test))

    test_input_fn = tf.estimator.inputs.numpy_input_fn(
        x={self.words_feature: x_test}, num_epochs=1, shuffle=False)

    predictions = self.classifier.predict(input_fn=test_input_fn)

    y_predicted = []
    alphas_predicted = []
    for p in predictions:
      y_predicted.append(p['class'])
      alphas_predicted.append(p['attention'])
    return y_predicted, alphas_predicted

  def _document_length(self):
    """Returns the number of tokens a sentence is padded or truncated to.

    Prefers the vocabulary processor's own `max_document_length` so that the
    displayed tokens follow the processor's configuration; falls back to
    MAX_DOCUMENT_LENGTH when the processor does not expose that attribute.
    NOTE(review): assumes the attribute, when present, is the length the
    processor pads its output to; confirm against VocabularyProcessor.
    """
    return getattr(self.vocab_processor, 'max_document_length',
                   MAX_DOCUMENT_LENGTH)

  def _resize_and_tokenize(self, input_string):
    """Tokenizes the sentence and pads with '' or truncates to a fixed length."""
    max_length = self._document_length()
    tokenized_sentence = list(tokenizer([input_string]))[0][:max_length]
    padding = [''] * (max_length - len(tokenized_sentence))
    return tokenized_sentence + padding

  def display_prediction_attention(self, input_string):
    """Visualizes the attention weights of the initialized classifier on the given string.

    Prints 'Toxic' or 'Not toxic' depending on the predicted class and returns
    a styled one-row table in which each token's background is colored by its
    attention weight.
    """
    pred, attn = self._predict_sentence(input_string)
    if pred[0]:
      print('Toxic')
    else:
      print('Not toxic')
    tokenized_string = self._resize_and_tokenize(input_string)
    # zip stops at the shorter of the token list and the weight list.
    wordvals = [wordVal(w, v) for w, v in zip(tokenized_string, attn[0])]
    word_df = pd.DataFrame(wordvals).transpose()
    return word_df.style.applymap(self._color_wordvals)
================================================
FILE: data_preparation/README.md
================================================
# Dataset preparation
This directory contains some steps to prepare our data before training our ML models. In particular, we want to:
* Shuffle the data and split it into train, eval and test datasets.
* Create an artificial bias (female vs male) for our embedding experiments. This is done by modifying the toxicity rate for examples labeled as 'male'.
## Environment Setup
### Python Dependencies
Install library dependencies (it is optional, but recommended to install these
in a [Virtual Environment](https://docs.python.org/3/tutorial/venv.html)):
```shell
# The python2 way to create and use virtual environment
# (optional, but recommended):
virtualenv .pyenv
source .pyenv/bin/activate
# Install dependencies
pip install -r requirements.txt
jupyter notebook
# ... do stuff ...
# Exit your virtual environment.
deactivate
```
### Execution flow
#### Splits the data locally
We recommend using a small dataset 'train_small.tfrecord'.
```shell
NOW=$(date +%Y%m%d%H%M%S)
JOB_NAME=data-preparation-$NOW
python run_preprocessing_data_split.py \
--job_dir 'local_data' \
--input_data_path 'local_data/train_small.tfrecord' \
--output_folder 'local_data/train_eval_test/'
```
#### Splits the data on the cloud
```shell
NOW=$(date +%Y%m%d%H%M%S)
JOB_NAME=data-preparation-$NOW
python run_preprocessing_data_split.py \
--job_name $JOB_NAME \
--job_dir gs://kaggle-model-experiments/dataflow/$JOB_NAME \
--input_data_path 'gs://kaggle-model-experiments/resources/civil_comments_data/train.tfrecord' \
--output_folder 'gs://kaggle-model-experiments/resources/civil_comments_data/train_eval_test' \
--cloud
```
#### Creates the artificial_bias locally
```shell
NOW=$(date +%Y%m%d%H%M%S)
JOB_NAME=data-preparation-$NOW
python run_preprocessing_artificial_bias.py \
--job_dir 'local_data' \
--input_data_path 'local_data/train_eval_test/train*.tfrecord' \
--output_folder 'local_data/artificial_bias'
```
#### Creates the artificial_bias on the cloud
```shell
NOW=$(date +%Y%m%d%H%M%S)
JOB_NAME=data-preparation-$NOW
python run_preprocessing_artificial_bias.py \
--job_name $JOB_NAME \
--job_dir gs://kaggle-model-experiments/dataflow/$JOB_NAME \
--input_data_path 'gs://kaggle-model-experiments/resources/civil_comments_data/train_eval_test/train*.tfrecord' \
--output_folder gs://kaggle-model-experiments/resources/civil_comments_data/artificial_bias/${USER}/${NOW} \
--cloud
```
================================================
FILE: data_preparation/config.ini
================================================
[CLOUD]
project = wikidetox
runner = DataflowRunner
max_num_workers = 50
defaultWorkerLogLevel = INFO
log_level = ERROR
zone = us-east1-b
[LOCAL]
project = wikidetox
runner = DirectRunner
defaultWorkerLogLevel=INFO
log_level = ERROR
================================================
FILE: data_preparation/preprocessing/__init__.py
================================================
================================================
FILE: data_preparation/preprocessing/constants.py
================================================
"""Constant values used by the preprocessing pipelines."""
# File-name prefixes for the train / eval / test outputs of the data split.
TRAIN_DATA_PREFIX = 'train'
EVAL_DATA_PREFIX = 'eval'
TEST_DATA_PREFIX = 'test'
# File-name prefix for the output of the artificial-bias pipeline.
TRAIN_ARTIFICIAL_BIAS_PREFIX = 'train_artificial_bias'
================================================
FILE: data_preparation/preprocessing/preprocessing.py
================================================
"""Preprocessing steps of the data preparation."""
import os
import random
import apache_beam as beam
import tensorflow as tf
from tensorflow_transform import coders
import constants
import tfrecord_utils
def get_identity_list():
  """Returns a new list of the identity field names, grouped by category."""
  gender = ['male', 'female', 'transgender', 'other_gender']
  sexual_orientation = [
      'heterosexual', 'homosexual_gay_or_lesbian', 'bisexual',
      'other_sexual_orientation'
  ]
  religion = [
      'christian', 'jewish', 'muslim', 'hindu', 'buddhist', 'atheist',
      'other_religion'
  ]
  race = ['black', 'white', 'asian', 'latino', 'other_race_or_ethnicity']
  disability = [
      'physical_disability', 'intellectual_or_learning_disability',
      'psychiatric_or_mental_illness', 'other_disability'
  ]
  return gender + sexual_orientation + religion + race + disability
def get_civil_comments_spec(include_identity_terms=True):
  """Builds the feature spec of the civil_comments dataset.

  Args:
    include_identity_terms: When true, every name from get_identity_list() is
      added as a float feature whose default value is -1.0.

  Returns:
    A dictionary mapping feature names to scalar `tf.FixedLenFeature`s.
  """
  string_fields = ('comment_text', 'id')
  float_fields = ('toxicity', 'severe_toxicity', 'obscene', 'sexual_explicit',
                  'identity_attack', 'insult', 'threat')
  int_fields = ('toxicity_annotator_count', 'identity_annotator_count')

  spec = {}
  for dtype, names in ((tf.string, string_fields), (tf.float32, float_fields),
                       (tf.int64, int_fields)):
    for name in names:
      spec[name] = tf.FixedLenFeature([], dtype=dtype)

  if include_identity_terms:
    spec.update({
        identity: tf.FixedLenFeature([], dtype=tf.float32, default_value=-1.0)
        for identity in get_identity_list()
    })
  return spec
def split_data(examples, train_fraction, eval_fraction):
  """Randomly partitions `examples` into three outputs: train, eval, test.

  Each element independently draws a uniform random number: it goes to
  partition 0 with probability `train_fraction`, to partition 1 with
  probability `eval_fraction`, and to partition 2 otherwise.
  """

  def _assign_partition(unused_element, unused_num_partitions):
    draw = random.random()
    if draw < train_fraction:
      return 0
    return 1 if draw < train_fraction + eval_fraction else 2

  return examples | 'SplitData' >> beam.Partition(_assign_partition, 3)
@beam.ptransform_fn
def Shuffle(examples):  # pylint: disable=invalid-name
  """Reorders a PCollection by grouping its elements under random keys.

  Every element is paired with a random float key, the pairs are grouped by
  key, and the keys are dropped again so only the elements are emitted.
  """
  return (examples
          | 'PairWithRandom' >> beam.Map(lambda x: (random.random(), x))
          | 'GroupByRandom' >> beam.GroupByKey()
          # Index into the (key, values) pair rather than unpacking it in the
          # lambda signature: tuple parameters are a SyntaxError on Python 3.
          | 'DropRandom' >> beam.FlatMap(lambda kv: kv[1]))
def write_to_tf_records(examples, output_path):
  """Shuffles the examples, serializes them and writes '.tfrecord' files.

  The basename of `output_path` is appended to every step label so that this
  function can be applied several times within the same pipeline.
  """
  label_suffix = os.path.basename(output_path)
  encoder = tfrecord_utils.EncodeTFRecord(
      feature_spec=get_civil_comments_spec(),
      optional_field_names=get_identity_list())
  sink = beam.io.WriteToTFRecord(
      file_path_prefix=output_path, file_name_suffix='.tfrecord')

  shuffled = examples | ('Shuffle_' + label_suffix) >> Shuffle()
  serialized = shuffled | ('Serialize_' + label_suffix) >> beam.ParDo(encoder)
  _ = serialized | ('WriteToTF_' + label_suffix) >> sink
class OversampleExample(beam.DoFn):
  """Oversamples examples from a given class."""

  def __init__(self, rule_fn, oversample_rate):
    """Initialises the oversampler.

    Args:
      rule_fn: Function that takes an element and returns a truthy value when
        the element should be oversampled.
      oversample_rate: Positive integer; number of copies emitted for every
        element selected by `rule_fn`.

    Raises:
      ValueError: If `oversample_rate` is not a positive integer.
    """
    # Check the type first: comparing a non-numeric value with 0 raises
    # TypeError on Python 3 and would mask the intended ValueError.
    if not isinstance(oversample_rate, int) or oversample_rate <= 0:
      raise ValueError('oversample_rate should be a positive integer.')
    self._rule_fn = rule_fn
    self._oversample_rate = oversample_rate

  def process(self, element):
    """Yields `element` `oversample_rate` times if selected, else once."""
    copies = self._oversample_rate if self._rule_fn(element) else 1
    for _ in range(copies):
      yield element
def _select_male_toxic_example(example,
threshold_identity=0.5,
threshold_toxic=0.5):
is_toxic = example['toxicity'] >= threshold_toxic
if 'male' in example:
is_male = example['male'] >= threshold_identity
else:
is_male = False
return is_toxic and is_male
def run_data_split(p, input_data_path, train_fraction, eval_fraction,
                   output_folder):
  """Reads TF Records and writes shuffled train/eval/test splits.

  Args:
    p: Beam pipeline for constructing PCollections and applying PTransforms.
    input_data_path: File pattern of the input TF Records.
    train_fraction: Fraction of the data to be allocated to the training set.
    eval_fraction: Fraction of the data to be allocated to the eval set. The
      remainder goes to the test set.
    output_folder: Folder to save the train/eval/test datasets.

  Raises:
    ValueError:
      If train_fraction + eval_fraction >= 1.
      If output_folder already exists. This prevents the user from
        overwriting a previous split.
  """
  if train_fraction + eval_fraction >= 1.:
    raise ValueError('Train and eval fraction are incompatible.')
  if tf.gfile.Exists(output_folder):
    raise ValueError('Output directory should be empty.'
                     ' You should select a different path.')

  decoder = tfrecord_utils.DecodeTFRecord(
      feature_spec=get_civil_comments_spec(),
      optional_field_names=get_identity_list())
  raw_records = (
      p
      | 'ReadExamples' >>
      beam.io.tfrecordio.ReadFromTFRecord(file_pattern=input_data_path))
  examples = raw_records | 'DecodeTFRecord' >> beam.ParDo(decoder)

  partitions = split_data(examples, train_fraction, eval_fraction)
  # Partition indices 0, 1 and 2 are train, eval and test respectively.
  prefixes = (constants.TRAIN_DATA_PREFIX, constants.EVAL_DATA_PREFIX,
              constants.TEST_DATA_PREFIX)
  for index, prefix in enumerate(prefixes):
    write_to_tf_records(partitions[index], os.path.join(output_folder, prefix))
def run_artificial_bias(p, train_input_data_path, output_folder,
                        oversample_rate):
  """Creates an artificially biased dataset by oversampling.

  Examples selected by `_select_male_toxic_example` are emitted
  `oversample_rate` times; all other examples are emitted once.

  Args:
    p: Beam pipeline for constructing PCollections and applying PTransforms.
    train_input_data_path: Input TF Records, which is typically the training
      dataset. This artificial bias method should not be run on eval/test.
    output_folder: Folder in which the biased dataset is written.
    oversample_rate: How many times to oversample the targeted class.
  """
  decoder = tfrecord_utils.DecodeTFRecord(
      feature_spec=get_civil_comments_spec(),
      optional_field_names=get_identity_list())
  oversampler = OversampleExample(_select_male_toxic_example, oversample_rate)

  train_data = (
      p
      | 'ReadExamples' >>
      beam.io.tfrecordio.ReadFromTFRecord(file_pattern=train_input_data_path)
      | 'DecodeTFRecord' >> beam.ParDo(decoder))
  biased_train_data = train_data | 'CreateBias' >> beam.ParDo(oversampler)

  output_path = os.path.join(output_folder,
                             constants.TRAIN_ARTIFICIAL_BIAS_PREFIX)
  write_to_tf_records(biased_train_data, output_path)
================================================
FILE: data_preparation/preprocessing/tfrecord_utils.py
================================================
"""Utilities to decode and encode TF Records.
These utilities are wrappers around TF-Transform coders to handle the
specificities around optional fields.
"""
import apache_beam as beam
from tensorflow_transform import coders
class Schema(object):
  """Minimal dataset schema for tf-transform coders.

  Ideally dataset_schema from tensorflow_transform.tf_metadata would be used
  instead. However, it lacks support for the `FixedLenFeature` default value:
  _feature_from_feature_spec raises an exception.
  TODO(fprost): Submit internal bug here.
  """

  def __init__(self, spec):
    # The feature spec is stored untouched and handed back on request.
    self._spec = spec

  def as_feature_spec(self):
    """Returns the feature spec this schema was built with."""
    return self._spec
class DecodeTFRecord(beam.DoFn):
  """Decodes TF-Record examples and strips defaulted optional fields.

  Decoding relies on 'tensorflow_transform.coders.ExampleProtoCoder'. For an
  optional field (indicated by the 'default_value' argument of
  `FixedLenFeature`), the coder fills in the default value when the field is
  missing. This wrapper post-processes the decoded example and removes every
  optional field whose value indicates that the default was used.
  """

  def __init__(self,
               feature_spec,
               optional_field_names,
               rule_optional_fn=lambda x: x < 0):
    """Initialises a TF-Record decoder.

    Args:
      feature_spec: Dictionary from feature names to one of `FixedLenFeature`,
        `SparseFeature` or `VarLenFeature`. It contains all the features to
        parse (including optional ones).
      optional_field_names: list of optional fields.
      rule_optional_fn: function that takes the value of an optional field and
        returns True if the value is indicative of a default value (e.g.
        resulting from the default value of parsing FixedLenFeature). All
        optional_field_names share this single rule.
    """
    self._schema = Schema(feature_spec)
    self._coder = coders.ExampleProtoCoder(self._schema)
    self._optional_field_names = optional_field_names
    self._rule_optional_fn = rule_optional_fn

  def process(self, element):
    """Yields the decoded example without its defaulted optional fields."""
    decoded = self._coder.decode(element)
    defaulted = [
        name for name in self._optional_field_names
        if self._rule_optional_fn(decoded[name])
    ]
    for name in defaulted:
      del decoded[name]
    yield decoded
class EncodeTFRecord(beam.DoFn):
  """Encodes examples with ExampleProtoCoder, skipping absent optional fields."""

  def __init__(self, feature_spec, optional_field_names):
    """Initialises a TF-Record encoder.

    Args:
      feature_spec: Dictionary from feature names to one of `FixedLenFeature`,
        `SparseFeature` or `VarLenFeature`. It contains all the features to
        encode (including optional ones).
      optional_field_names: list of optional fields.
    """
    self._feature_spec = feature_spec
    self._optional_field_names = optional_field_names

  def process(self, element):
    """Yields `element` serialized with a spec tailored to its fields."""
    # Work on a copy: the spec depends on which optional fields this
    # particular element carries.
    element_spec = self._feature_spec.copy()
    absent = [
        name for name in self._optional_field_names if name not in element
    ]
    for name in absent:
      element_spec.pop(name)
    coder = coders.ExampleProtoCoder(Schema(element_spec))
    yield coder.encode(element)
================================================
FILE: data_preparation/requirements.txt
================================================
apache-beam[gcp]==2.2.0
configparser==3.5.0
tensorflow==2.12.1
tensorflow_transform==0.9
================================================
FILE: data_preparation/run_preprocessing_artificial_bias.py
================================================
"""Sets up and starts the Dataflow job for data preparation."""
import argparse
import logging
import os
import sys
import apache_beam as beam
import configparser
from preprocessing import preprocessing
def _parse_arguments(argv):
"""Parses command line arguments."""
parser = argparse.ArgumentParser(
description='Runs Preprocessing on Civil comments data.')
parser.add_argument(
'--cloud', action='store_true', help='Run preprocessing on the cloud.')
parser.add_argument('--job_name', required=False, help='Dataflow job name')
parser.add_argument(
'--job_dir',
required=True,
help='Directory in which to stage code and write temporary outputs')
parser.add_argument(
'--output_folder',
required=True,
help='Directory where to write train, eval and test data')
parser.add_argument('--input_data_path')
parser.add_argument(
'--oversample_rate',
required=False,
default=5,
type=int,
help='How many times to oversample the targeted class')
args = parser.parse_args(args=argv[1:])
return args
def _set_logging(log_level):
logging.getLogger().setLevel(getattr(logging, log_level.upper()))
def _parse_config(env, config_file_path):
"""Parses configuration file.
Args:
env: The environment in which the preprocessing job will be run.
config_file_path: Path to the configuration file to be parsed.
Returns:
A dictionary containing the parsed runtime config.
"""
config = configparser.ConfigParser()
config.read(config_file_path)
return dict(config.items(env))
def main():
  """Configures the pipeline and runs the artificial-bias job.

  Raises:
    ValueError: If --cloud is set without a --job_name.
  """
  args = _parse_arguments(sys.argv)
  section = 'CLOUD' if args.cloud else 'LOCAL'
  config = _parse_config(section, 'config.ini')

  options = {'project': str(config.get('project'))}
  if args.cloud:
    if not args.job_name:
      raise ValueError('Job name must be specified for cloud runs.')
    # Extra options that are only passed for cloud runs.
    options['job_name'] = args.job_name
    options['max_num_workers'] = int(config.get('max_num_workers'))
    options['setup_file'] = os.path.abspath(
        os.path.join(os.path.dirname(__file__), 'setup.py'))
    options['staging_location'] = os.path.join(args.job_dir, 'staging')
    options['temp_location'] = os.path.join(args.job_dir, 'tmp')
    options['zone'] = config.get('zone')

  pipeline_options = beam.pipeline.PipelineOptions(flags=[], **options)
  _set_logging(config.get('log_level'))

  runner = str(config.get('runner'))
  with beam.Pipeline(runner, options=pipeline_options) as pipeline:
    preprocessing.run_artificial_bias(
        pipeline,
        train_input_data_path=args.input_data_path,
        output_folder=args.output_folder,
        oversample_rate=args.oversample_rate)


if __name__ == '__main__':
  main()
================================================
FILE: data_preparation/run_preprocessing_data_split.py
================================================
"""Sets up and starts the Dataflow job for data preparation."""
import argparse
import logging
import os
import sys
import apache_beam as beam
import configparser
from preprocessing import preprocessing
def _parse_arguments(argv):
"""Parses command line arguments."""
parser = argparse.ArgumentParser(
description='Runs Preprocessing on Civil comments data.')
parser.add_argument(
'--cloud', action='store_true', help='Run preprocessing on the cloud.')
parser.add_argument('--job_name', required=False, help='Dataflow job name')
parser.add_argument(
'--job_dir',
required=True,
help='Directory in which to stage code and write temporary outputs')
parser.add_argument(
'--output_folder',
required=True,
help='Directory where to write train, eval and test data')
parser.add_argument('--input_data_path')
parser.add_argument(
'--train_fraction',
required=False,
default=0.7,
type=float,
help='The fraction of the data to allocate to the training dataset')
parser.add_argument(
'--eval_fraction',
required=False,
default=0.15,
type=float,
help='The fraction of the data to allocate to the eval dataset')
args = parser.parse_args(args=argv[1:])
return args
def _set_logging(log_level):
logging.getLogger().setLevel(getattr(logging, log_level.upper()))
def _parse_config(env, config_file_path):
"""Parses configuration file.
Args:
env: The environment in which the preprocessing job will be run.
config_file_path: Path to the configuration file to be parsed.
Returns:
A dictionary containing the parsed runtime config.
"""
config = configparser.ConfigParser()
config.read(config_file_path)
return dict(config.items(env))
def main():
  """Configures the pipeline and runs the data-split job.

  Raises:
    ValueError: If --cloud is set without a --job_name.
  """
  args = _parse_arguments(sys.argv)
  section = 'CLOUD' if args.cloud else 'LOCAL'
  config = _parse_config(section, 'config.ini')

  options = {'project': str(config.get('project'))}
  if args.cloud:
    if not args.job_name:
      raise ValueError('Job name must be specified for cloud runs.')
    # Extra options that are only passed for cloud runs.
    options['job_name'] = args.job_name
    options['max_num_workers'] = int(config.get('max_num_workers'))
    options['setup_file'] = os.path.abspath(
        os.path.join(os.path.dirname(__file__), 'setup.py'))
    options['staging_location'] = os.path.join(args.job_dir, 'staging')
    options['temp_location'] = os.path.join(args.job_dir, 'tmp')
    options['zone'] = config.get('zone')

  pipeline_options = beam.pipeline.PipelineOptions(flags=[], **options)
  _set_logging(config.get('log_level'))

  runner = str(config.get('runner'))
  with beam.Pipeline(runner, options=pipeline_options) as pipeline:
    preprocessing.run_data_split(
        pipeline,
        input_data_path=args.input_data_path,
        train_fraction=args.train_fraction,
        eval_fraction=args.eval_fraction,
        output_folder=args.output_folder)


if __name__ == '__main__':
  main()
================================================
FILE: data_preparation/setup.py
================================================
from setuptools import setup, find_packages
# Distribution name and version of this package.
NAME = 'jigsaw'
VERSION = '1.0'
# Dependencies installed together with the package.
REQUIRED_PACKAGES = ['tensorflow-transform==0.9.0']
# Packages are discovered automatically with find_packages().
setup(
name=NAME,
version=VERSION,
packages=find_packages(),
install_requires=REQUIRED_PACKAGES,
)
================================================
FILE: experiments/.gitignore
================================================
# Ignore local data, e.g. copies of embeddings
local_data
# Ignore local tmp files and directories
tmp
# Local config that holds cloud/comet.ml settings.
tf_trainer/convai_config.py
================================================
FILE: experiments/README.md
================================================
# Text Classification Framework
This directory contains an ML framework for text classification. We illustrate
it with toxic (and other attributes) comment classification.
The framework is structured as a series of common files and templates to quickly
construct models on top of the [Keras](https://keras.io/) or the [TensorFlow
Estimator API](https://www.tensorflow.org/programmers_guide/estimators).
The templates also demonstrate how these models can be trained using [Google ML
Engine](https://cloud.google.com/ml-engine/).
## Environment Setup
### Build Tools/Bazel Dependencies
Install [Bazel](https://docs.bazel.build/versions/master/install-os-x.html);
this is the build tool we use to run tests, etc.
### Python Dependencies
Install library dependencies (it is optional, but recommended to install these
in a [Virtual Environment](https://docs.python.org/3/tutorial/venv.html)):
```shell
# The python3 way to create and use virtual environment
# (optional, but recommended):
python3 -m venv .pyenv
source .pyenv/bin/activate
# Install dependencies
pip install -r requirements.txt
# ... do stuff ...
# Exit your virtual environment.
deactivate
```
### Cloud and ML Engine configuration
1. Install the [Google Cloud SDK](https://cloud.google.com/sdk/).
2. Log in:
```shell
gcloud auth login
```
You will be prompted to visit a page in the browser; follow the login instructions there.
Due to [some issues](https://stackoverflow.com/questions/44401088/using-training-tfrecords-that-are-stored-on-google-cloud), also run this command:
```shell
gcloud auth application-default login
```
Follow the instructions there as well.
3. Set the project:
```shell
gcloud config set project [PROJECT]
```
4. Verify that the above setup works:
```shell
gcloud ml-engine models list
```
You should see some existing models. Example output:
```shell
NAME DEFAULT_VERSION_NAME
kaggle_model v_20180627_173451
...
```
## Training an Existing Model
To train an existing model, execute either command:
* `./tf_trainer/MODEL_NAME/run.local.sh` to run training locally, or
* `./tf_trainer/MODEL_NAME/run.ml_engine.sh` to run training on [Google ML
Engine](https://cloud.google.com/ml-engine/).
These scripts assume that you have access to the resources on our cloud
projects. If you don't, you can still run the models locally, but will have to
modify the data paths in `run.local.sh`. At the moment, we only support reading
data in `tf.record` format. See
[`tools/convert_csv_to_tfrecord.py`](https://github.com/conversationai/conversationai-models/blob/master/experiments/tools/convert_csv_to_tfrecord.py)
for a simple CSV to `tf.record` converter.
## Running a hyper parameter tuning job
To run a hyper parameter tuning job on CMLE, execute the following command:
* `./tf_trainer/MODEL_NAME/run.hyperparameter.sh`.
The hyperparameter configuration (MODEL_NAME/hparam_config.yaml) describes the job configuration, the parameters to tune and their respective range.
You can monitor your progress in the CMLE UI.
## Deploying a trained model on CMLE
At the end of your training, the model will be saved as a .pb file. Note: this is currently broken for keras models. TODO(fprost): Update this.
You can then deploy this model on CMLE by executing the following command:
* `./tf_trainer/MODEL_NAME/run.deploy.sh`.
The model will be accessible as an API and available for [batch/online predictions](https://cloud.google.com/ml-engine/docs/tensorflow/batch-predict).
Further information can be found [here](https://cloud.google.com/ml-engine/docs/tensorflow/deploying-models) about deploying models on CMLE.
## Deploying several models on CMLE for a given training run
The argument `n_export` allows you to save several models during your training run (1 model every train_steps/n).
All of the .pb files will be saved in a subfolder of your MODEL_DIR.
There is a convenient utility in model_evaluation to help you to deploy all models on CMLE:
* `python utils_export/deploy_continous_model.py --parent_dir MODEL_DIR --model_name MODEL_NAME `
## Evaluate an Existing Model on New Data
See `model_evaluation/` for further information.
### Type Checking
Check the typings:
```shell
mypy --ignore-missing-imports -p tf_trainer
```
It's recommended you use mypy as an additional linter in your editor.
### Testing
Run all the tests and see the output streamed:
```shell
bazel test --test_output=streamed ...
```
You can also run tests individually, directly with python like so:
```shell
python -m tf_trainer.common.tfrecord_input_test
python -m tf_trainer.common.base_keras_model_test
```
### Building a New Model
TODO(jjtan)
================================================
FILE: experiments/WORKSPACE
================================================
# Bazel Workspace File.
================================================
FILE: experiments/__init__.py
================================================
================================================
FILE: experiments/requirements.txt
================================================
absl-py==0.7.0
astor==0.7.1
bert-tensorflow==1.0.1
bleach==3.3.0
certifi==2024.7.4
chardet==3.0.4
gast==0.2.2
gcsfs==0.2.3
grpcio==1.53.2
h5py==2.9.0
html5lib==1.0.1
idna==3.7
jsonlines==1.2.0
Markdown==3.0.1
mypy==0.670
nltk==3.9
numpy==1.22.0
pandas==0.24.1
protobuf==3.18.3
PyYAML==5.4
requests==2.32.2
scipy==1.10.0
sentencepiece==0.1.8
six==1.12.0
tensorboard==1.12.2
tensorflow==2.12.1
tensorflow-hub==0.2.0
termcolor==1.1.0
tf-sentencepiece==0.1.8
typed-ast==1.3.2
urllib3==1.26.19
websocket-client==0.54.0
Werkzeug==3.0.3
wurlitzer==1.0.2
================================================
FILE: experiments/setup.py
================================================
from setuptools import find_packages
from setuptools import setup
# Dependencies installed together with the tf_trainer package.
# NOTE(review): tensorflow-hub is pinned to 0.1.1 here but to 0.2.0 in
# requirements.txt -- confirm which version is intended.
REQUIRED_PACKAGES = [
'nltk>=3.3',
'typed_ast==1.3.2',
'tensorflow-hub==0.1.1',
'bert-tensorflow==1.0.1'
]
# Packages are discovered automatically with find_packages().
setup(
name='tf_trainer',
version='0.1',
install_requires=REQUIRED_PACKAGES,
packages=find_packages(),
include_package_data=True,
description='TF Estimator modelling framework.')
================================================
FILE: experiments/testdata/BUILD
================================================
# Makes the test data files referenceable from other packages, e.g. as
# "//testdata:cats_and_dogs_onehot.vocab.txt".
exports_files([
"cats_and_dogs_onehot.vocab.txt",
"cats_and_dogs_with_cat_opt_int_labels.jsonl",
"cats_and_dogs_with_partial_cat_int_labels.jsonl",
"cats_and_dogs.jsonl",
])
================================================
FILE: experiments/testdata/cats_and_dogs.jsonl
================================================
{ "text": "cats good", "bad": 0.0 }
{ "text": "cats bad", "bad": 1.0 }
{ "text": "dogs good", "bad": 0.0 }
{ "text": "dogs bad", "bad": 1.0 }
{ "text": "good cats", "bad": 0.0 }
{ "text": "dogs and cats", "bad": 0.0 }
{ "text": "not bad dogs and cats", "bad": 0.0 }
{ "text": "not bad dogs", "bad": 0.0 }
{ "text": "bad dogs and cats", "bad": 1.0 }
{ "text": "bad dogs and bad cats", "bad": 1.0 }
{ "text": "dogs and bad cats", "bad": 1.0 }
{ "text": "dogs and not bad cats", "bad": 0.0 }
{ "text": "dogs and cats bad", "bad": 1.0 }
{ "text": "dogs and cats good", "bad": 1.0 }
{ "text": "not dogs and bad cats", "bad": 1.0 }
{ "text": "not dogs and not cats", "bad": 0.0 }
================================================
FILE: experiments/testdata/cats_and_dogs_onehot.vocab.txt
================================================
dogs 1.0 0.0 0.0 0.0 0.0 0.0
cats 0.0 1.0 0.0 0.0 0.0 0.0
good 0.0 0.0 1.0 0.0 0.0 0.0
bad 0.0 0.0 0.0 1.0 0.0 0.0
and 0.0 0.0 0.0 0.0 1.0 0.0
not 0.0 0.0 0.0 0.0 0.0 1.0
================================================
FILE: experiments/testdata/cats_and_dogs_with_cat_opt_int_labels.jsonl
================================================
{ "text": "cats good", "bad": 0.0, "cat": 1 }
{ "text": "cats bad", "bad": 1.0, "cat": 1 }
{ "text": "dogs good", "bad": 0.0 }
{ "text": "dogs bad", "bad": 1.0 }
{ "text": "good cats", "bad": 0.0, "cat": 1 }
{ "text": "dogs and cats", "bad": 0.0, "cat": 1 }
{ "text": "not bad dogs and cats", "bad": 0.0, "cat": 1 }
{ "text": "not bad dogs", "bad": 0.0 }
{ "text": "bad dogs and cats", "bad": 1.0, "cat": 1 }
{ "text": "bad dogs and bad cats", "bad": 1.0, "cat": 1 }
{ "text": "dogs and bad cats", "bad": 1.0, "cat": 1 }
{ "text": "dogs and not bad cats", "bad": 0.0, "cat": 1 }
{ "text": "dogs and cats bad", "bad": 1.0, "cat": 1 }
{ "text": "dogs and cats good", "bad": 1.0, "cat": 1 }
{ "text": "not dogs and bad cats", "bad": 1.0, "cat": 1 }
{ "text": "not dogs and not cats", "bad": 0.0, "cat": 1 }
================================================
FILE: experiments/testdata/cats_and_dogs_with_partial_cat_int_labels.jsonl
================================================
{ "text": "cats good", "bad": 0.0, "cat": 1 }
{ "text": "cats bad", "bad": 1.0, "cat": 1 }
{ "text": "dogs good", "bad": 0.0, "cat": 0 }
{ "text": "dogs bad", "bad": 1.0, "cat": 0 }
{ "text": "good cats", "bad": 0.0, "cat": 1 }
{ "text": "dogs and cats", "bad": 0.0, "cat": 1 }
{ "text": "not bad dogs and cats", "bad": 0.0, "cat": 1 }
{ "text": "not bad dogs", "bad": 0.0, "cat": 0 }
{ "text": "bad dogs and cats", "bad": 1.0, "cat": 1 }
{ "text": "bad dogs and bad cats", "bad": 1.0, "cat": 1 }
{ "text": "dogs and bad cats", "bad": 1.0, "cat": 1 }
{ "text": "dogs and not bad cats", "bad": 0.0}
{ "text": "dogs and cats bad", "bad": 1.0 }
{ "text": "dogs and cats good", "bad": 1.0 }
{ "text": "not dogs and bad cats", "bad": 1.0 }
{ "text": "not dogs and not cats", "bad": 0.0 }
================================================
FILE: experiments/tf_trainer/__init__.py
================================================
================================================
FILE: experiments/tf_trainer/common/BUILD
================================================
# Library target for types.py; every other target in this file depends on it.
py_library(
name = "types",
srcs = [
"types.py",
],
)
# Library target for model_trainer.py.
py_library(
name = "model_trainer",
srcs = [
"model_trainer.py",
],
deps = [
":base_model",
":data_input",
":text_preprocessor",
":types",
],
)
# Library target for token_embedding_index.py.
py_library(
name = "token_embedding_index",
srcs = [
"token_embedding_index.py",
],
deps = [
":base_model",
":types",
],
)
# Test for token_embedding_index; reads the one-hot vocab file from //testdata.
py_test(
name = "token_embedding_index_test",
srcs = ["token_embedding_index_test.py"],
data = ["//testdata:cats_and_dogs_onehot.vocab.txt"],
deps = [
":token_embedding_index",
":types",
],
)
# Library target for text_preprocessor.py.
py_library(
name = "text_preprocessor",
srcs = [
"text_preprocessor.py",
],
deps = [
":base_model",
":token_embedding_index",
":types",
],
)
# Test for text_preprocessor; reads the one-hot vocab file from //testdata.
py_test(
name = "text_preprocessor_test",
srcs = ["text_preprocessor_test.py"],
data = [
"//testdata:cats_and_dogs_onehot.vocab.txt",
],
deps = [
":text_preprocessor",
":types",
],
)
# Library target for base_model.py.
py_library(
name = "base_model",
srcs = [
"base_model.py",
],
deps = [":types"],
)
# Library target for the dataset and TF Record input pipelines.
# ":base_model" is a library target rather than a source file, so it is
# listed under deps instead of srcs.
py_library(
    name = "data_input",
    srcs = [
        "dataset_input.py",
        "tfrecord_input.py",
    ],
    deps = [
        ":base_model",
        ":types",
    ],
)
# Test for tfrecord_input.py (part of the data_input library).
py_test(
name = "tfrecord_input_test",
srcs = ["tfrecord_input_test.py"],
deps = [
":data_input",
":types",
],
)
# Library target for cnn_spec_parser.py.
py_library(
name = "cnn_spec_parser",
srcs = ["cnn_spec_parser.py"],
deps = [":types"],
)
# Test for cnn_spec_parser.
py_test(
name = "cnn_spec_parser_test",
srcs = ["cnn_spec_parser_test.py"],
deps = [
":cnn_spec_parser",
":types",
],
)
# Library target for episodic_tfrecord_input.py.
py_library(
name = "episodic_tfrecord_input",
srcs = ["episodic_tfrecord_input.py"],
deps = [
":types",
":base_model",
":data_input",
],
)
================================================
FILE: experiments/tf_trainer/common/__init__.py
================================================
================================================
FILE: experiments/tf_trainer/common/base_model.py
================================================
# coding=utf-8
# Copyright 2018 The Conversation-AI.github.io Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Interface for Models."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import abc
import tensorflow as tf
from tf_trainer.common import types
from typing import Callable
# The TF Example key associated with input features that consist of a
# UTF-8 string, for models that use that as input.
TEXT_FEATURE_KEY = 'text'
# The TF Example key associated with a Tensor of int32s for models that
# use tokens from a vocabulary as input.
TOKENS_FEATURE_KEY = 'tokens'
# The TF Example key associated with examples in inference that consist of
# an int64 integer. It is a unique identifier of the TF Example and is passed
# along by the estimator and returned in the predictions (forward_features).
EXAMPLE_KEY = 'comment_key'
class BaseModel(abc.ABC):
  """Tentative common interface for model classes.

  The rest of the code does not rely on this interface yet, but every model
  should subclass it. Subclasses must implement `estimator`; `hparams` and
  `map` come with default implementations.
  """

  def map(self, f: Callable[[tf.estimator.Estimator], tf.estimator.Estimator]
         ) -> 'BaseModel':
    """Returns a model whose estimator is `f` applied to this model's estimator.

    This allows models to be extended, e.g. by adding preprocessing steps.
    The returned model reports the same hparams as this one.
    """
    wrapped = self

    class Model(BaseModel):
      """Delegates to the wrapped model, passing its estimator through `f`."""

      def estimator(_, model_dir):
        return f(wrapped.estimator(model_dir))

      def hparams(_):
        return wrapped.hparams()

    return Model()

  @abc.abstractmethod
  def estimator(self, model_dir: str) -> tf.estimator.Estimator:
    """Builds the estimator for this model, given its model directory."""

  def hparams(self) -> tf.contrib.training.HParams:
    """Returns the model's hyper-parameters; an empty HParams by default."""
    return tf.contrib.training.HParams()
================================================
FILE: experiments/tf_trainer/common/basic_gpu_config.yaml
================================================
# Training-job settings: Python 3.5 on the BASIC_GPU scale tier.
trainingInput:
pythonVersion: '3.5'
scaleTier: BASIC_GPU
================================================
FILE: experiments/tf_trainer/common/cnn_spec_parser.py
================================================
# coding=utf-8
# Copyright 2018 The Conversation-AI.github.io Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""CNN Specification Parser.
A simple parser for specifications of convolutional layers.
BNF defining the syntax to specify CNNs:
```
layers = layer | layer : layers
layer = filters
filters = filter | filter, filters
filter = (size / stride -> num_filters)
size, stride, num_filters = [0-9]+
```
Inspiration for the notation comes from: `num_filters` being the output
embedding size, and the other dimension of the computed CNN matrix will be
`input_size * size / stride`.
"""
import re
from typing import List
# Separator between consecutive layers: a colon with optional whitespace on
# either side.
layers_split_regexp = re.compile(r'\s*:\s*')
# Separator between the filters of one layer: a comma with optional whitespace
# on either side.
filters_split_regexp = re.compile(r'\s*,\s*')
# One filter of the form '(size / stride -> num_filters)'. Each of the three
# named groups captures a run of decimal digits.
filter_regexp = re.compile(r'\(\s*(?P<size>\d+)\s*/\s*(?P<stride>\d+)\s*'
r'\-\>\s*(?P<num_filters>\d+)\s*\)')
class FilterParseError(Exception):
  """Signals that a filter specification string could not be parsed."""
class Filter(object):
  """A single CNN filter.

  filter = '(size / stride -> num_filters)'

  After construction the parsed values are available as the integer
  attributes `size`, `stride` and `num_filters`.
  """

  def __init__(self, str: str) -> None:
    """Parses one filter specification.

    Args:
      str: Text of the form '(size / stride -> num_filters)'. Leading and
        trailing whitespace is ignored.

    Raises:
      FilterParseError: If the text is not exactly one filter specification.
    """
    # `fullmatch` is used instead of `match`: `match` only anchors at the
    # start, so '(2 / 2 -> 100) oops' would be accepted with the trailing
    # text silently dropped.
    m = filter_regexp.fullmatch(str.strip())
    if m is None:
      raise FilterParseError('Bad filter definition for: %s' % str)
    self.num_filters = int(m.group('num_filters'))  # type "int"
    self.size = int(m.group('size'))  # type "int"
    self.stride = int(m.group('stride'))  # type "int"

  def __str__(self) -> str:
    """Renders the filter back in canonical '(size / stride -> n)' form."""
    return ('(%d / %d -> %d)' % (self.size, self.stride, self.num_filters))
class ConcurrentFilters(object):
  """One CNN layer, made of several filters applied side by side.

  filters = filter, filters
  """

  def __init__(self, str: str) -> None:
    """Splits the layer text on commas and parses every piece as a Filter."""
    parsed = []
    for piece in filters_split_regexp.split(str):
      parsed.append(Filter(piece))
    self.filters = parsed

  def __str__(self) -> str:
    """Joins the string form of each filter with ', '."""
    rendered = (str(flt) for flt in self.filters)
    return ', '.join(rendered)
class SequentialLayers(object):
  """An ordered sequence of CNN layers.

  layers = filters : layers
  """

  def __init__(self, str: str) -> None:
    """Splits the text on colons and parses every piece as a layer."""
    parsed = []  # type: List[ConcurrentFilters]
    for piece in layers_split_regexp.split(str):
      parsed.append(ConcurrentFilters(piece))
    self.layers = parsed

  def __str__(self) -> str:
    """Joins the string form of each layer with ' : '."""
    rendered = (str(layer) for layer in self.layers)
    return ' : '.join(rendered)
================================================
FILE: experiments/tf_trainer/common/cnn_spec_parser_test.py
================================================
# coding=utf-8
# Copyright 2018 The Conversation-AI.github.io Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Tests for cnn_spec_parser."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import tensorflow as tf
from tf_trainer.common.cnn_spec_parser import SequentialLayers
from tf_trainer.common.cnn_spec_parser import ConcurrentFilters
from tf_trainer.common.cnn_spec_parser import Filter
class CnnSpecParserTest(tf.test.TestCase):
  """Checks that a layer specification parses and round-trips through str()."""

  def test_SequentialLayers(self):
    # Three layers; the first one holds two concurrent filters.
    spec_text = ' : '.join([
        '(2 / 2 -> 100), (3 / 2 -> 101)',
        '(6 / 2 -> 102)',
        '(3 / 1 -> 103)',
    ])
    parsed = SequentialLayers(spec_text)

    first_layer = parsed.layers[0]
    self.assertEqual(len(first_layer.filters), 2)

    first_filter = first_layer.filters[0]  # type: Filter
    self.assertEqual(first_filter.size, 2)
    self.assertEqual(first_filter.stride, 2)
    self.assertEqual(first_filter.num_filters, 100)

    # Rendering the parsed spec must reproduce the input text exactly.
    self.assertEqual(str(parsed), spec_text)
if __name__ == '__main__':
tf.test.main()
================================================
FILE: experiments/tf_trainer/common/dataset_config.sh
================================================
#!/bin/bash
# Selects dataset-specific settings from the positional arguments.
#
#   $1: dataset name -- civil_comments, toxicity, many_communities or
#       many_communities_40_per_8_shot.
#   $2: only for many_communities_40_per_8_shot -- optimistic or pessimistic,
#       choosing which training file is used.
#
# Sets train_path, valid_path, labels, label_dtypes and text_feature. The
# many_communities_40_per_8_shot dataset additionally sets train_steps,
# eval_steps and eval_period. An unrecognised argument prints an error to
# stderr and exits with status 1.

BASE_PATH="gs://conversationai-models"
GCS_RESOURCES="${BASE_PATH}/resources"
MODEL_PARENT_DIR="${BASE_PATH}/tf_trainer_runs"

if [ "$1" == "civil_comments" ]; then

  train_path="${GCS_RESOURCES}/civil_comments_data/train_eval_test/train-*.tfrecord"
  valid_path="${GCS_RESOURCES}/civil_comments_data/train_eval_test/eval-*.tfrecord"
  labels="toxicity"
  label_dtypes="float"
  text_feature="comment_text"

elif [ "$1" == "toxicity" ]; then

  train_path="${GCS_RESOURCES}/toxicity_data/toxicity_q42017_train.tfrecord"
  valid_path="${GCS_RESOURCES}/toxicity_data/toxicity_q42017_validate.tfrecord"
  labels="frac_neg"
  label_dtypes="float"
  text_feature="comment_text"

elif [ "$1" == "many_communities" ]; then

  train_path="${GCS_RESOURCES}/transfer_learning_data/many_communities/20181105_train.tfrecord"
  valid_path="${GCS_RESOURCES}/transfer_learning_data/many_communities/20181105_validate.tfrecord"
  labels="removed"
  # removed is a boolean variable cast as an int.
  # 1 means that the comment was removed and 0 means it was not.
  label_dtypes="int"
  text_feature="comment_text"

elif [ "$1" == "many_communities_40_per_8_shot" ]; then

  # NOTE(review): the ".." before "tfrecord" in some paths below is kept
  # verbatim -- confirm it matches the actual object names.
  if [ "$2" == "optimistic" ]; then
    train_path="${GCS_RESOURCES}/transfer_learning_data/many_communities_40_per_8_shot/augmented_train.tfrecord"
  elif [ "$2" == "pessimistic" ]; then
    train_path="${GCS_RESOURCES}/transfer_learning_data/many_communities_40_per_8_shot/original_train..tfrecord"
  else
    echo "Second positional arg must be one of optimistic, pessimistic." >&2
    exit 1
  fi

  valid_path="${GCS_RESOURCES}/transfer_learning_data/many_communities_40_per_8_shot/validation_query..tfrecord"
  # test_path = "${GCS_RESOURCES}/transfer_learning_data/many_communities_40_per_8_shot/test_query..tfrecord"
  labels="label"
  # NOTE(review): presumably the same meaning as `removed` above
  # (1 = comment was removed, 0 = it was not) -- confirm.
  label_dtypes="int"
  text_feature="text"
  # used for param tuning
  train_steps=3000
  eval_steps=250
  eval_period=200

else
  echo "First positional arg must be one of civil_comments, toxicity, many_communities, many_communities_40_per_8_shot." >&2
  exit 1
fi
================================================
FILE: experiments/tf_trainer/common/dataset_input.py
================================================
# coding=utf-8
# Copyright 2018 The Conversation-AI.github.io Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Abstract Base Class for DatasetInput."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import abc
from tf_trainer.common import types
class DatasetInput(abc.ABC):
  """Interface for objects that supply Estimator input functions.

  Implementations provide the input functions (called `input_fn` in the TF
  docs) used with a Tensorflow Estimator's train, evaluate, and predict
  methods.
  """

  @abc.abstractmethod
  def train_input_fn(self) -> types.EstimatorInput:
    """Input function for training; must be provided by subclasses."""

  @abc.abstractmethod
  def validate_input_fn(self) -> types.EstimatorInput:
    """Input function for validation; must be provided by subclasses."""
================================================
FILE: experiments/tf_trainer/common/episodic_tfrecord_input.py
================================================
# coding=utf-8
# Copyright 2018 The Conversation-AI.github.io Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""DatasetInput implementation for episodic data."""
import tensorflow as tf
from pathlib import Path
import collections
import os
import random
from tf_trainer.common import dataset_input
from tf_trainer.common import types
from typing import List, Dict, Tuple, Union
# Command-line flags. Each help string describes its own flag; previously all
# three carried the description of `train_path`.
tf.app.flags.DEFINE_string('train_path', None,
                           'Path to the training data TFRecord file.')
tf.app.flags.DEFINE_string('dev_path', None,
                           'Path to the dev data TFRecord file.')
# NOTE(review): declared as a string flag; the type is left unchanged so that
# existing command lines keep working.
tf.app.flags.DEFINE_string('episode_size', None, 'Size of an episode.')
# A piece of text: either a tensor or a plain Python string.
Text = Union[tf.Tensor, str]
# A label: either a tensor or a plain Python float.
Label = Union[tf.Tensor, float]
# A single example: its text, the domain it belongs to, and its label.
TextDomainLabel = collections.namedtuple('TextDomainLabel',
['text', 'domain', 'label'])
# One episode: parallel sequences of texts, domains and labels.
EpisodeData = collections.namedtuple('EpisodeData',
['texts', 'domains', 'labels'])
class EpisodicTFRecordInput(dataset_input.DatasetInput):
  """Produces episodic data, one episode per '*.tfrecord' file."""

  def __init__(self, train_dir, validate_dir) -> None:
    self.validate_dir = validate_dir
    self.train_dir = train_dir

  def train_input_fn(self) -> types.FeatureAndLabelTensors:
    """Returns the next (texts, domains, labels) element of the training data.

    All episodes under `train_dir` are loaded and shuffled, then exposed
    through a one-shot iterator that is also stored on
    `self.episode_batches_itr`.
    """
    texts, domains, labels = [], [], []
    for episode in self._get_randomized_episodes(self.train_dir):
      texts.append(episode.texts)
      domains.append(episode.domains)
      labels.append(episode.labels)

    dataset = tf.data.Dataset.from_tensor_slices((texts, domains, labels))
    self.episode_batches_itr = dataset.make_one_shot_iterator()
    return self.episode_batches_itr.get_next()

  def validate_input_fn(self) -> types.FeatureAndLabelTensors:
    """Not implemented yet: currently returns None."""
    return None

  def _get_randomized_episodes(self, directory: str) -> List[EpisodeData]:
    """Loads one episode per TFRecord file and returns them in random order.

    Every file matching '*.tfrecord' directly under `directory` is read into
    an EpisodeData; the resulting list is shuffled in place before being
    returned.
    """
    pattern = os.path.join(directory, '*.tfrecord')
    loaded = []
    for index, path in enumerate(tf.gfile.Glob(pattern)):
      tf.logging.info('PROCESSING FILE {}: {}'.format(index, path))
      episode = self._dataset_from_tfrecord_file(path)
      loaded.append(episode)

    tf.logging.info('Shuffling episodes')
    random.shuffle(loaded)
    return loaded

  def _dataset_from_tfrecord_file(self, tfrecord_file: str) -> EpisodeData:
    """Reads one TFRecord file into an EpisodeData with shuffled examples."""
    # The file stem doubles as the domain name.
    domain = Path(tfrecord_file).stem

    def _read_tf_example(record) -> TextDomainLabel:
      feature_spec = {
          'text': tf.FixedLenFeature([], tf.string),
          'label': tf.FixedLenFeature([], tf.int64)
      }
      parsed = tf.parse_single_example(
          record, feature_spec)  # type: Dict[str, types.Tensor]
      return TextDomainLabel(
          text=parsed['text'], domain=domain, label=parsed['label'])

    serialized = list(tf.python_io.tf_record_iterator(tfrecord_file))
    random.shuffle(serialized)

    texts, domains, labels = [], [], []
    for record in serialized:
      datapoint = _read_tf_example(record)
      texts.append(datapoint.text)
      domains.append(datapoint.domain)
      labels.append(datapoint.label)
    return EpisodeData(texts=texts, domains=domains, labels=labels)
================================================
FILE: experiments/tf_trainer/common/episodic_tfrecord_input_test.py
================================================
"""Tests for episodic_tfrecord_input."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import tensorflow as tf
from tf_trainer.common import episodic_tfrecord_input
class EpisodicTFRecordInputTest(tf.test.TestCase):
  """Fetches two episodic batches from the training input and logs them."""

  def test(self):
    train_dir = ('gs://kaggle-model-experiments/resources/'
                 'transfer_learning_data/many_communities_pruned_episodes')

    tf.logging.info('CREATE')
    # The validation directory is a placeholder; only training input is used.
    episodic_input = episodic_tfrecord_input.EpisodicTFRecordInput(
        train_dir, 'asdf')

    tf.logging.info('GET DATA')
    next_batch = episodic_input.train_input_fn()

    with tf.Session() as sess:
      tf.logging.info('FIRST BATCH')
      first = sess.run(next_batch)
      tf.logging.info(first)
      tf.logging.info('SECOND BATCH')
      second = sess.run(next_batch)
      print(second)
if __name__ == '__main__':
tf.logging.set_verbosity(tf.logging.INFO)
tf.test.main()
================================================
FILE: experiments/tf_trainer/common/model_trainer.py
================================================
# coding=utf-8
# Copyright 2018 The Conversation-AI.github.io Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""The Model Trainer class.
This provides an abstraction of Keras and TF.Estimator, and is intended for use
in text classification models (although it may generalize to other kinds of
problems).
"""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import json
import os
import os.path
import six
import tensorflow as tf
from tensorflow.python.platform import tf_logging as logging
from tensorflow.python.estimator import estimator as estimator_lib
from tensorflow.python.estimator import model_fn as model_fn_lib
from tensorflow.python.estimator.export.export_output import PredictOutput
from tensorflow.python.framework import ops
from tensorflow.python.framework import sparse_tensor as sparse_tensor_lib
from tensorflow.python.ops import clip_ops
from tensorflow.python.ops import sparse_ops
from tensorflow.python.training import optimizer as optimizer_lib
from tensorflow.python.lib.io import file_io
from tf_trainer.common import base_model
from tf_trainer.common import dataset_input as ds
FLAGS = tf.app.flags.FLAGS

# Command-line flags controlling training, evaluation and export.
tf.app.flags.DEFINE_string('model_dir', None,
                           "Directory for the Estimator's model directory.")
tf.app.flags.DEFINE_string('warm_start_from', None,
                           'Existing checkpoint from which to start training.')
tf.app.flags.DEFINE_bool('enable_profiling', False,
                         'Enable profiler hook in estimator.')
# Adjacent string literals are concatenated verbatim, so every fragment ends
# with a space to keep the sentences apart in the rendered help text.
tf.app.flags.DEFINE_integer(
    'n_export', -1, 'Number of models to export. '
    'If =-1, only the best checkpoint (wrt specified eval metric) is exported. '
    'If =1, only the last checkpoint is exported. '
    'If >1, we export `n_export` evenly-spaced checkpoints.')
tf.app.flags.DEFINE_string('key_name', 'comment_key',
                           'Name of a pass-thru integer id for batch scoring.')
tf.app.flags.DEFINE_integer('train_steps', 100000,
                            'The number of steps to train for.')
tf.app.flags.DEFINE_integer('eval_period', 1000,
                            'The number of steps per eval period.')
# This value is passed as the `steps` argument of the evaluation spec.
tf.app.flags.DEFINE_integer('eval_steps', None,
                            'Number of steps to eval for, default all.')

tf.app.flags.mark_flag_as_required('model_dir')
# Copied from:
# https://stackoverflow.com/questions/49846207/tensorflow-estimator-warm-start-from-and-model-dir
class InitHook(tf.train.SessionRunHook):
  """Warm-starts the model from the newest checkpoint in a directory.

  Args:
    checkpoint_dir: full path to dir containing the checkpoint
  """

  def __init__(self, checkpoint_dir):
    self.model_path = checkpoint_dir
    # Ensures the warm start happens at most once per hook instance.
    self.initialized = False

  def begin(self):
    """Warm-starts from the highest-numbered checkpoint, once.

    Raises:
      ValueError: If no 'model.ckpt-*.index' file exists in the directory.
    """
    if self.initialized:
      return

    pattern = os.path.join(self.model_path, 'model.ckpt-*.index')
    index_files = file_io.get_matching_files(pattern)
    if not index_files:
      # Report the pattern that was actually searched.
      raise ValueError('No checkpoint files found matching %s.' % pattern)

    # Drop only the trailing '.index'; a plain `replace` would also remove the
    # text if it happened to appear elsewhere in the path.
    suffix = '.index'
    all_checkpoints = [x[:-len(suffix)] for x in index_files]
    # Order by the numeric step embedded after the last '-'.
    all_checkpoints = sorted(
        all_checkpoints, key=lambda x: int(x.split('-')[-1]))
    checkpoint = all_checkpoints[-1]

    logging.info('Pre-trained model {0} found in {1} - warmstarting.'.format(
        checkpoint, self.model_path))
    tf.train.warm_start(checkpoint)
    self.initialized = True
# This function extends tf.contrib.estimator.forward_features.
# As the binary_head has a ClassificationOutput for serving_default,
# the check at the end of 'new_model_fn' fails in the initial fn.
def forward_features(estimator, keys, sparse_default_values=None):
  """Forward features to predictions dictionary.

  In some cases, user wants to see some of the features in estimators prediction
  output. As an example, consider a batch prediction service: The service simply
  runs inference on the users graph and returns the results. Keys are essential
  because there is no order guarantee on the outputs so they need to be rejoined
  to the inputs via keys or transclusion of the inputs in the outputs.

  Unlike `tf.contrib.estimator.forward_features`, whenever the wrapped model
  produces export outputs, this version replaces both the 'predict' and the
  'serving_default' export outputs with a `PredictOutput` that also carries
  the forwarded features.

  Example:

  ```python
    def input_fn():
      features, labels = ...
      features['unique_example_id'] = ...
      return features, labels

    estimator = tf.estimator.LinearClassifier(...)
    estimator = tf.contrib.estimator.forward_features(
        estimator, 'unique_example_id')
    estimator.train(...)
    assert 'unique_example_id' in estimator.predict(...)
  ```

  Args:
    estimator: A `tf.estimator.Estimator` object.
    keys: A `string`, a list/tuple of `string`s, or None. None forwards every
      feature.
    sparse_default_values: A dict of `str` keys mapping the name of the sparse
      features to be converted to dense, to the default value to use. Only
      sparse features indicated in the dictionary are converted to dense and the
      provided default value is used.

  Returns:
    A new `tf.estimator.Estimator` which forwards features to predictions.

  Raises:
    ValueError:
      * if `keys` is already part of `predictions`. We don't allow
        override.
      * if 'keys' does not exist in `features`.
    TypeError: if `keys` type is not one of `string` or list/tuple of `string`.
  """

  def verify_key_types(keys):  # pylint: disable=missing-docstring
    if keys is None:
      return keys
    if isinstance(keys, six.string_types):
      return [keys]
    if not isinstance(keys, (list, tuple)):
      raise TypeError('keys should be either a string or a list of strings. '
                      'Given: {}'.format(type(keys)))
    for key in keys:
      if not isinstance(key, six.string_types):
        raise TypeError('All items in the given keys list should be a string. '
                        'There exist an item with type: {}'.format(type(key)))
    return keys

  def get_keys(features):
    # With no explicit keys, every feature is forwarded.
    if keys is None:
      return features.keys()
    return keys

  def verify_keys_and_predictions(features, predictions):
    if not isinstance(predictions, dict):
      raise ValueError(
          'Predictions should be a dict to be able to forward features. '
          'Given: {}'.format(type(predictions)))
    for key in get_keys(features):
      if key not in features:
        raise ValueError(
            'keys should be exist in features. Key "{}" is not in features '
            'dict. features dict has following keys: {}. Please check '
            'arguments of forward_features.'.format(key, features.keys()))
      if key in predictions:
        raise ValueError(
            'Cannot forward feature key ({}). Since it does exist in '
            'predictions. Existing prediction keys: {}. Please check arguments '
            'of forward_features.'.format(key, predictions.keys()))

  keys = verify_key_types(keys)

  def new_model_fn(features, labels, mode, config):  # pylint: disable=missing-docstring
    spec = estimator.model_fn(features, labels, mode, config)
    predictions = spec.predictions
    if predictions is None:
      return spec
    verify_keys_and_predictions(features, predictions)
    # Materialise once: the same keys are used for predictions and exports.
    forwarded_keys = list(get_keys(features))
    for key in forwarded_keys:
      feature = sparse_tensor_lib.convert_to_tensor_or_sparse_tensor(
          features[key])
      if sparse_default_values and (key in sparse_default_values):
        if not isinstance(feature, sparse_tensor_lib.SparseTensor):
          raise ValueError(
              'Feature ({}) is expected to be a `SparseTensor`.'.format(key))
        feature = sparse_ops.sparse_tensor_to_dense(
            feature, default_value=sparse_default_values[key])
      if not isinstance(feature, ops.Tensor):
        raise ValueError(
            'Feature ({}) should be a Tensor. Please use `keys` '
            'argument of forward_features to filter unwanted features, or '
            'add key to argument `sparse_default_values`. '
            'Type of features[{}] is {}.'.format(key, key, type(feature)))
      predictions[key] = feature
    spec = spec._replace(predictions=predictions)
    if spec.export_outputs:  # CHANGES HERE
      outputs = spec.export_outputs['predict'].outputs
      # Forward every requested key. Relying on the loop variable left over
      # from the loop above would forward a single key only, and would fail
      # with a NameError when there are no keys at all.
      for key in forwarded_keys:
        outputs[key] = spec.predictions[key]
      spec.export_outputs['predict'] = tf.estimator.export.PredictOutput(
          outputs)
      spec.export_outputs[
          'serving_default'] = tf.estimator.export.PredictOutput(outputs)
    return spec

  return estimator_lib.Estimator(
      model_fn=new_model_fn,
      model_dir=estimator.model_dir,
      config=estimator.config)
class ModelTrainer(object):
"""Model Trainer."""
def __init__(self, dataset: ds.DatasetInput,
             model: base_model.BaseModel,
             warm_start_from: str = None) -> None:
  """Stores the collaborators and builds the estimator.

  Args:
    dataset: Provider of the train / validate input functions.
    model: Model whose `estimator` method builds the estimator.
    warm_start_from: Optional location used to warm-start training.
  """
  self._warm_start_from = warm_start_from
  self._model = model
  self._dataset = dataset
  # The estimator is rooted at the (possibly trial-scoped) model directory.
  self._estimator = model.estimator(self._model_dir())
def train_with_eval(self):
  """Train with periodic evaluation.

  Optional hooks: a profiler hook when `--enable_profiling` is set, and an
  InitHook when a warm-start location was given. Checkpoints are saved every
  `--eval_period` steps.
  """
  hooks = []
  if FLAGS.enable_profiling:
    profiler_dir = os.path.join(self._model_dir(), 'profiler')
    hooks.append(
        tf.train.ProfilerHook(save_steps=10, output_dir=profiler_dir))
  if self._warm_start_from:
    hooks.append(InitHook(checkpoint_dir=self._warm_start_from))

  train_spec = tf.estimator.TrainSpec(
      input_fn=self._dataset.train_input_fn,
      max_steps=FLAGS.train_steps,
      # No hooks at all is expressed as None rather than an empty list.
      hooks=hooks if hooks else None)
  eval_spec = tf.estimator.EvalSpec(
      input_fn=self._dataset.validate_input_fn,
      steps=FLAGS.eval_steps,
      throttle_secs=1)

  config = self._estimator.config.replace(
      save_checkpoints_steps=FLAGS.eval_period)
  self._estimator._config = config
  # Keep every checkpoint when more than the last one may be exported.
  if FLAGS.n_export == -1 or FLAGS.n_export > 1:
    self._estimator._config = self._estimator.config.replace(
        keep_checkpoint_max=None)

  tf.estimator.train_and_evaluate(self._estimator, train_spec, eval_spec)
def predict_on_dev(self, predict_keys=None):
  """Runs prediction on the validation input using the last checkpoint.

  Args:
    predict_keys: Passed through to the estimator's `predict`.

  Returns:
    Whatever the estimator's `predict` returns.
  """
  # n_export == 1 selects only the most recent checkpoint.
  exported, _ = self._get_list_checkpoint(1, self._model_dir(), None, None)
  latest_checkpoint = exported[0]
  return self._estimator.predict(
      self._dataset.validate_input_fn,
      predict_keys=predict_keys,
      checkpoint_path=latest_checkpoint)
def eval_dir(self):
  """Returns the evaluation directory reported by the estimator."""
  estimator = self._estimator
  return estimator.eval_dir()
def _model_dir(self):
"""Get Model Directory.
Used to scope logs to a given trial (when hyper param tuning) so that they
don't run over each other. When running locally it will just use the passed
in model_dir.
"""
return os.path.join(
FLAGS.model_dir,
json.loads(os.environ.get('TF_CONFIG', '{}')).get('task', {}).get(
'trial', ''))
def _add_estimator_key(self, estimator, example_key_name):
"""Adds a forward key to the model_fn of an estimator."""
estimator = forward_features(estimator, example_key_name)
return estimator
def _get_best_step_from_event_file(self,
event_file,
metrics_key,
is_first_metric_better_fn):
"""Find, in `event_file`, the step corresponding to the best metric.
Args:
event_file: The event file where to find the metrics.
metrics_key: The metric by which to determine the best checkpoint to save.
is_first_metric_better_fn: Comparison function to find best metric. Takes
in as arguments two numbers, returns true if first is better than
second. Default function says larger is better. Default value works for
AUC: higher is better.
Returns:
Best step (int).
"""
if not metrics_key:
return None
best_metric = None
best_step = None
for e in tf.train.summary_iterator(event_file):
for v in e.summary.value:
if v.tag == metrics_key:
metric = v.simple_value
if not best_step or is_first_metric_better_fn(metric, best_metric):
best_metric = metric
best_step = e.step
return best_step
def _get_best_checkpoint(self,
checkpoints,
metrics_key,
is_first_metric_better_fn):
"""Find the best checkpoint, according to `metrics_key`.
Args:
checkpoints: List of model checkpoints.
metrics_key: The metric by which to determine the best checkpoint to save.
is_first_metric_better_fn: Comparison function to find best metric. Takes
in as arguments two numbers, returns true if first is better than
second. Default function says larger is better. Default value works for
AUC: higher is better.
Returns:
Best checkpoint path.
"""
eval_event_dir = self._estimator.eval_dir()
event_files = file_io.list_directory(eval_event_dir)
if not event_files:
raise ValueError('No event files found in directory %s.' % eval_event_dir)
if len(event_files) > 1:
print('Multiple event files found in dir %s. Using last one.' % eval_event_dir)
event_file = os.path.join(eval_event_dir, event_files[-1])
# Use the best step to find the best checkpoint.
best_step = self._get_best_step_from_event_file(event_file, metrics_key,
is_first_metric_better_fn)
# If we couldn't find metrics_key in the event file, try again using loss.
if best_step is None:
print("Metrics key %s not found in metrics, using 'loss' as metric key." %
metrics_key)
metrics_key = "loss"
# Want the checkpoint with the lowest loss
is_first_metric_better_fn = lambda x, y: x < y
best_step = self._get_best_step_from_event_file(event_file, metrics_key,
is_first_metric_better_fn)
if best_step is None:
raise ValueError("Couldn't find 'loss' metric in event file %s." % event_file)
best_checkpoint_path = None
for checkpoint_path in checkpoints:
version = int(checkpoint_path.split('-')[-1])
if version == best_step:
best_checkpoint_path = checkpoint_path
if not best_checkpoint_path:
raise ValueError("Couldn't find checkpoint for best_step = %d." % best_step)
return best_checkpoint_path
def _get_list_checkpoint(self,
n_export,
model_dir,
metrics_key,
is_first_metric_better_fn):
"""Get the checkpoints that we want to export, as well as the ones to clean up.
Args:
n_export: Number of models to export.
model_dir: Directory containing the checkpoints.
metrics_key: The metric by which to determine the best checkpoint to save.
is_first_metric_better_fn: Comparison function to find best metric. Takes
in as arguments two numbers, returns true if first is better than
second. Default function says larger is better. Default value works for
AUC: higher is better.
Returns:
Tuple of:
List of checkpoint paths to export,
Set of checkpoint paths to delete.
If n_export==1, we take only the last checkpoint.
If n_export==-1, we take the best checkpoint, according to `metrics_key` and
`is_first_metric_better_fn`. The remaining checkpoints are deleted.
Otherwise, we consider the list of steps for each for which we have a
checkpoint. Then we choose n_export number of checkpoints such that their
steps are as equidistant as possible.
"""
all_checkpoints = file_io.get_matching_files(
os.path.join(model_dir, 'model.ckpt-*.index'))
if not all_checkpoints:
raise ValueError('No checkpoint files found matching model.ckpt-*.index.')
all_checkpoints = [x.replace('.index', '') for x in all_checkpoints]
all_checkpoints = sorted(all_checkpoints, key=lambda x: int(x.split('-')[-1]))
# Keep track of the checkpoints to export, and the ones to delete.
checkpoints_to_export = None
checkpoints_to_delete = None
if n_export == 1:
checkpoints_to_export = [all_checkpoints[-1]]
elif n_export == -1:
checkpoints_to_export = [self._get_best_checkpoint(all_checkpoints, metrics_key,
is_first_metric_better_fn)]
elif n_export > 1:
# We want to cover a distance of (len(checkpoints) - 1): for 3 points, we have a distance of 2.
# with a number of points of (n_export -1): because 1 point is set at the end.
step = float(len(all_checkpoints) - 1) / (n_export - 1)
if step <= 1: # Fewer checkpoints available than the desired number.
return all_checkpoints, None
checkpoints_to_export = [
all_checkpoints[int(i * step)] for i in range(n_export - 1)
]
checkpoints_to_export.append(all_checkpoints[-1])
if checkpoints_to_export:
checkpoints_to_delete = set(all_checkpoints) - set(checkpoints_to_export)
return checkpoints_to_export, checkpoints_to_delete
def export(self,
           serving_input_fn,
           example_key_name=None,
           metrics_key=None,
           is_first_metric_better_fn=lambda x, y: x > y,
           delete_unexported_checkpoints=True):
  """Export model as a .pb.

  Args:
    serving_input_fn: An input function for inference graph.
    example_key_name: Name of the example_key field (string).
      If None, no example_key will be used.
    metrics_key: The metric by which to determine the best checkpoint to save.
    is_first_metric_better_fn: Comparison function to find best metric. Takes
      in as arguments two numbers, returns true if first is better than
      second. Default function says larger is better. Default value works for
      AUC: higher is better.
    delete_unexported_checkpoints: Boolean flag indicating whether or not to
      delete the checkpoints that aren't exported. If False then all model
      checkpoints are retained.

  Raises:
    ValueError: If the best checkpoint is requested (`--n_export=-1`) without
      a comparison function.

  NOTE: if using a different metrics_key than AUC, make sure
  `is_first_metric_better_fn` is updated accordingly.

  Example keys are useful when doing batch predictions. Typically,
  the predictions are done by a cluster of machines and the order of
  the results is random. Here, we add a forward feature in the inference graph
  (https://www.tensorflow.org/api_docs/python/tf/contrib/estimator/forward_features)
  which will be used as an example unique identifier. In inference, the input
  example includes an example_key field that is passed along by the estimator
  and returned in the predictions.
  """
  n_export = FLAGS.n_export
  if n_export == -1:
    if not is_first_metric_better_fn:
      raise ValueError('Must provide valid `is_first_metric_better_fn` '
                       'when exporting best checkpoint.')
    if not metrics_key:
      print('No value provided for `metrics_key`. Using loss.')
      metrics_key = 'loss'
      is_first_metric_better_fn = lambda x, y: x < y

  # Optionally wrap the estimator so the example key is passed through.
  if example_key_name:
    estimator = self._add_estimator_key(self._estimator, example_key_name)
  else:
    estimator = self._estimator

  to_export, to_delete = self._get_list_checkpoint(
      n_export, self._model_dir(), metrics_key, is_first_metric_better_fn)

  # Delete the checkpoints we don't want.
  if delete_unexported_checkpoints and to_delete:
    for stale_checkpoint in to_delete:
      tf.train.remove_checkpoint(stale_checkpoint)

  # Export the desired checkpoints, each under a directory named by its step.
  for checkpoint_path in to_export or []:
    version = checkpoint_path.split('-')[-1]
    estimator.export_savedmodel(
        export_dir_base=os.path.join(self._model_dir(), version),
        serving_input_receiver_fn=serving_input_fn,
        checkpoint_path=checkpoint_path)
================================================
FILE: experiments/tf_trainer/common/p100_config.yaml
================================================
# Training-job settings: Python 3.5 on a CUSTOM scale tier with standard_p100
# master and worker machines (one worker) and one large_model parameter server.
trainingInput:
pythonVersion: '3.5'
scaleTier: CUSTOM
masterType: standard_p100
workerType: standard_p100
parameterServerType: large_model
workerCount: 1
parameterServerCount: 1
================================================
FILE: experiments/tf_trainer/common/serving_input.py
================================================
"""Serving functions for deployed model."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import tensorflow as tf
from tensorflow.python.ops import array_ops
# Handle to the global flag registry. Neither factory in this module reads it.
FLAGS = tf.app.flags.FLAGS
def create_text_serving_input_fn(text_feature_name, example_key_name):
  """Builds a serving input function that hands raw text to the model.

  Args:
    text_feature_name: key of the scalar string feature holding the text.
    example_key_name: key of the scalar int64 example identifier; it parses to
      -1 when an incoming example does not carry it.

  Returns:
    A zero-argument function that creates the serving graph inputs and returns
    a `ServingInputReceiver` fed by a batch of serialized `tf.Example` protos.
  """

  def serving_input_fn_tfrecords():
    # Batch of serialized tf.Example protos supplied at prediction time.
    receiver = tf.placeholder(
        dtype=tf.string, shape=[None], name="input_example_tensor")

    spec = {}
    spec[text_feature_name] = tf.FixedLenFeature([], dtype=tf.string)
    spec[example_key_name] = tf.FixedLenFeature(
        [], dtype=tf.int64, default_value=-1)

    parsed = tf.parse_example(receiver, spec)
    return tf.estimator.export.ServingInputReceiver(parsed, receiver)

  return serving_input_fn_tfrecords
def create_serving_input_fn(word_to_idx,
                            unknown_token,
                            text_feature_name,
                            example_key_name):
  """Builds a serving input function that maps pre-split words to token ids.

  Args:
    word_to_idx: dict mapping each vocabulary word to its integer id.
    unknown_token: id returned for words that are not in `word_to_idx`.
    text_feature_name: key of the variable-length string feature holding the
      words of one example.
    example_key_name: key of the scalar int64 example identifier; it parses to
      -1 when an incoming example does not carry it.

  Returns:
    A zero-argument function that creates the serving graph inputs and returns
    a `ServingInputReceiver` fed by a batch of serialized `tf.Example` protos.
    In the returned features the text entry is a dense int64 tensor of token
    ids, padded with 0.
  """

  def serving_input_fn_tfrecords():
    # Batch of serialized tf.Example protos supplied at prediction time.
    receiver = tf.placeholder(
        dtype=tf.string, shape=[None], name="input_example_tensor")

    spec = {}
    spec[text_feature_name] = tf.VarLenFeature(dtype=tf.string)
    spec[example_key_name] = tf.FixedLenFeature(
        [], dtype=tf.int64, default_value=-1)
    parsed = tf.parse_example(receiver, spec)

    # In-graph vocabulary lookup; out-of-vocabulary words get `unknown_token`.
    vocab_words = list(word_to_idx.keys())
    vocab_ids = list(word_to_idx.values())
    table_init = tf.contrib.lookup.KeyValueTensorInitializer(
        vocab_words, vocab_ids, key_dtype=tf.string, value_dtype=tf.int64)
    vocab_table = tf.contrib.lookup.HashTable(table_init, unknown_token)

    sparse_ids = vocab_table.lookup(parsed[text_feature_name])
    # Densify the ragged batch, filling the gaps with id 0.
    parsed[text_feature_name] = tf.sparse_tensor_to_dense(
        sparse_ids, default_value=0)

    return tf.estimator.export.ServingInputReceiver(parsed, receiver)

  return serving_input_fn_tfrecords
================================================
FILE: experiments/tf_trainer/common/text_preprocessor.py
================================================
# coding=utf-8
# Copyright 2018 The Conversation-AI.github.io Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Text Preprocessor."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import functools
from absl import flags
import numpy as np
import tensorflow as tf
from tf_trainer.common import base_model
from tf_trainer.common import types
from tf_trainer.common.token_embedding_index import LoadTokenIdxEmbeddings
from typing import Callable, Dict, List, Optional, Tuple
# Global flag registry. `is_embedding_trainable` is read by the model_fn built
# in TextPreprocessor.create_estimator_with_embedding to decide whether the
# embedding variable is trainable.
FLAGS = flags.FLAGS
tf.app.flags.DEFINE_bool('is_embedding_trainable', False,
'Enable fine tuning of embeddings.')
class TextPreprocessor(object):
  """Text Preprocessor TensorFlow Estimator Extension.

  Uses embedding indexes to create tensors that map tokens (provided by an
  abstract tokenizer function) to embeddings.

  Note: Due to the lack of text preprocessing functions in tensorflow, we expect
  that the text is already preprocessed (list of words) in inference. In
  training, due to the availability of tf.py_func, we can handle the
  preprocessing.
  """

  def __init__(self, embeddings_path: str) -> None:
    """Loads the vocabulary and the embedding matrix from `embeddings_path`."""
    self._word_to_idx, self._embeddings_matrix, self._unknown_token, self._embedding_size = \
        LoadTokenIdxEmbeddings(embeddings_path)  # type: Tuple[Dict[str, int], np.ndarray, int, int]

  def train_preprocess_fn(self,
                          tokenizer: Callable[[str], List[str]],
                          lowercase: Optional[bool] = True
                         ) -> Callable[[types.Tensor], types.Tensor]:
    """Builds the function that turns a text tensor into token ids.

    Args:
      tokenizer: Python function splitting a decoded string into words.
      lowercase: whether to lowercase every word before the vocabulary lookup.

    Returns:
      A function mapping a 0-D string Tensor to a 1-D int64 Tensor.
    """

    def _tokenize(text: bytes) -> np.ndarray:
      """Converts UTF-8 encoded text into an array of token ids.

      `tokenizer` and `lowercase` are taken from the enclosing call.

      Args:
        text: UTF-8 encoded text to tokenize (bytes).

      Returns:
        A 1-D int64 array with one vocabulary id per word; words missing from
        the vocabulary map to the unknown-token id.
      """
      words = tokenizer(text.decode('utf-8'))
      if lowercase:
        words = [w.lower() for w in words]
      return np.asarray(
          [self._word_to_idx.get(w, self._unknown_token) for w in words],
          dtype=np.int64)

    def _preprocess_fn(text: types.Tensor) -> types.Tensor:
      """Converts a text into a list of integers.

      Args:
        text: a 0-D string Tensor.

      Returns:
        A 1-D int64 Tensor.
      """
      words = tf.py_func(
          _tokenize, [text], tf.int64, stateful=False, name='PreprocessFn')
      return words

    return _preprocess_fn

  def add_embedding_to_model(self, model: base_model.BaseModel,
                             text_feature_name: str) -> base_model.BaseModel:
    """Returns a new BaseModel with an embedding layer prepended.

    Args:
      model: An existing BaseModel instance.
      text_feature_name: The name of the feature containing text.
    """
    return model.map(
        functools.partial(self.create_estimator_with_embedding,
                          text_feature_name))

  def create_estimator_with_embedding(
      self, text_feature_name: str,
      estimator: tf.estimator.Estimator) -> tf.estimator.Estimator:
    """Takes an existing estimator and prepends the embedding layers to it.

    Args:
      text_feature_name: The name of the feature containing the text.
      estimator: A predefined Estimator that expects embeddings.

    Returns:
      TF Estimator with embedding ops added.

    Note: We need to consider the case of large embeddings (see:
      https://stackoverflow.com/questions/48217599/
      how-to-initialize-embeddings-layer-within-estimator-api/48243086#48243086).
    """
    old_model_fn = estimator.model_fn
    old_config = estimator.config
    old_params = estimator.params

    def add_init_fn_to_estimatorSpec(estimator_spec, init_fn):
      """Add a new init_fn to the scaffold part of estimator spec."""

      def new_init_fn(scaffold, sess):
        init_fn(scaffold, sess)
        if estimator_spec.scaffold.init_fn:
          # `Scaffold.init_fn` is the user function already bound to its own
          # scaffold, so it takes the session only. Passing the scaffold as
          # well raised a TypeError.
          estimator_spec.scaffold.init_fn(sess)

      scaffold = tf.train.Scaffold(
          init_fn=new_init_fn, copy_from_scaffold=estimator_spec.scaffold)
      # EstimatorSpec is immutable, so rebuild it with the new scaffold and
      # every other field carried over unchanged.
      estimator_spec_with_scaffold = tf.estimator.EstimatorSpec(
          mode=estimator_spec.mode,
          predictions=estimator_spec.predictions,
          loss=estimator_spec.loss,
          train_op=estimator_spec.train_op,
          eval_metric_ops=estimator_spec.eval_metric_ops,
          export_outputs=estimator_spec.export_outputs,
          training_chief_hooks=estimator_spec.training_chief_hooks,
          training_hooks=estimator_spec.training_hooks,
          scaffold=scaffold,
          evaluation_hooks=estimator_spec.evaluation_hooks,
          prediction_hooks=estimator_spec.prediction_hooks)
      return estimator_spec_with_scaffold

    def new_model_fn(features, labels, mode, params, config):
      """model_fn used in defining the new TF Estimator"""

      embeddings, embedding_init_fn = self.word_embeddings(
          trainable=FLAGS.is_embedding_trainable)

      text_feature = features[text_feature_name]
      word_embeddings = tf.nn.embedding_lookup(embeddings, text_feature)
      # Only the embedded text is forwarded to the wrapped model_fn.
      new_features = {text_feature_name: word_embeddings}

      # Fix dimensions to make Keras model output match label dims.
      if mode != tf.estimator.ModeKeys.PREDICT:
        labels = {k: tf.expand_dims(v, -1) for k, v in labels.items()}

      # TODO: Modify when embeddings are part of the model.
      # `params` is accepted above but not forwarded to the wrapped model_fn.
      estimator_spec = old_model_fn(
          new_features, labels, mode=mode, config=config)
      estimator_spec_with_scaffold = add_init_fn_to_estimatorSpec(
          estimator_spec, embedding_init_fn)

      return estimator_spec_with_scaffold

    return tf.estimator.Estimator(
        new_model_fn, config=old_config, params=old_params)

  def word_to_idx(self) -> Dict[str, int]:
    """Returns the vocabulary mapping each word to its index."""
    return self._word_to_idx

  def unknown_token(self) -> int:
    """Returns the index used for words missing from the vocabulary."""
    return self._unknown_token

  def word_embeddings(self, trainable: bool) -> Tuple[tf.Variable, Callable]:
    """Creates the word embedding TF Variable and its initialization function.

    Args:
      trainable: whether the embedding variable is trainable.

    Returns:
      A tuple of the 'embeddings' variable, shaped like the loaded embedding
      matrix, and an `init_fn(scaffold, sess)` that runs the variable's
      initializer while feeding the loaded matrix as its initial value.
    """

    embeddings = tf.get_variable(
        'embeddings', self._embeddings_matrix.shape, trainable=trainable)

    def init_fn(scaffold, sess):
      sess.run(embeddings.initializer,
               {embeddings.initial_value: self._embeddings_matrix})

    return embeddings, init_fn
================================================
FILE: experiments/tf_trainer/common/text_preprocessor_test.py
================================================
# coding=utf-8
# Copyright 2018 The Conversation-AI.github.io Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Tests for text_preprocessor."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import tensorflow as tf
from tf_trainer.common import text_preprocessor
class TextPreprocessorTest(tf.test.TestCase):
  """Checks the word-to-id mapping produced by TextPreprocessor."""

  _VOCAB_PATH = 'testdata/cats_and_dogs_onehot.vocab.txt'
  _EXPECTED_IDS = [1, 3, 2, 4, 7, 6]

  def _ids_for(self, sentence, lowercase):
    """Runs `sentence` through a space-splitting preprocess fn; returns ids."""
    preprocessor = text_preprocessor.TextPreprocessor(self._VOCAB_PATH)
    with self.test_session():
      to_ids = preprocessor.train_preprocess_fn(
          tokenizer=lambda raw: raw.split(' '), lowercase=lowercase)
      return list(to_ids(sentence).eval())

  def test_Tokenize(self):
    # Words are looked up exactly as written when lowercasing is off.
    ids = self._ids_for('dogs good cats bad rabbits not', lowercase=False)
    self.assertEqual(ids, self._EXPECTED_IDS)

  def test_Lowercase(self):
    # Mixed-case input yields the same ids once lowercasing is on.
    ids = self._ids_for('Dogs GOOD Cats BAD rabbits not', lowercase=True)
    self.assertEqual(ids, self._EXPECTED_IDS)
# Run the test cases when this file is executed directly.
if __name__ == '__main__':
  tf.test.main()
================================================
FILE: experiments/tf_trainer/common/tfrecord_input.py
================================================
"""DatasetInput class based on TFRecord files."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import multiprocessing
import tensorflow as tf
from typing import Callable, List, Dict, Tuple
from tf_trainer.common import base_model
from tf_trainer.common import dataset_input
from tf_trainer.common import types
# Command-line flags read by the TFRecord input classes in this module.
tf.app.flags.DEFINE_string('train_path', None,
'Path to the training data TFRecord file.')
tf.app.flags.DEFINE_string('validate_path', None,
'Path to the validation data TFRecord file.')
tf.app.flags.DEFINE_string('labels', 'frac_neg',
'Comma separated list of label features.')
tf.app.flags.DEFINE_string(
'label_dtypes', None, 'Comma separated list of dtypes for labels. Each '
'dtype must be float or int. If not provided '
'assumes all labels are floats.')
tf.app.flags.DEFINE_string('text_feature', 'comment_text',
'Name of feature containing text input.')
tf.app.flags.DEFINE_boolean('round_labels', True,
'Round label features to 0 or 1 if true.')
tf.app.flags.DEFINE_integer('batch_size', 256,
'Batch sizes to use when reading.')
tf.app.flags.DEFINE_integer(
'num_prefetch', 5,
'An optimization parameter for the number of elements to prefetch. See: '
'https://www.tensorflow.org/api_docs/python/tf/data/Dataset#prefetch')
FLAGS = tf.app.flags.FLAGS
# Maps the dtype names accepted by --label_dtypes to TensorFlow dtypes.
DTYPE_MAPPING = {'float': tf.float32, 'int': tf.int64}
# Value a label parses to when it is absent from an example. It is negative
# because TFRecordInput._process_labels treats negative labels as missing.
DTYPE_DEFAULT = {'float': -1.0, 'int': -1}
class TFRecordInput(dataset_input.DatasetInput):
  """Simple no-preprocessing TFRecord based DatasetInput.

  Handles parsing of TF Examples.

  Regardless of which TF Example feature key is used, as specified by
  FLAGS.text_feature, the input text is stored in the output features under
  the key base_model.TEXT_FEATURE_KEY.
  """

  def __init__(self) -> None:
    """Reads the input configuration from FLAGS.

    Raises:
      ValueError: if --label_dtypes lists fewer dtypes than --labels lists
        labels, or pairs a label with a dtype name missing from DTYPE_MAPPING.
    """
    self._labels = FLAGS.labels.split(',')
    if FLAGS.label_dtypes:
      self._label_dtypes = FLAGS.label_dtypes.split(',')
    else:
      self._label_dtypes = ['float'] * len(self._labels)
    # Labels and dtypes are paired with zip(), which would silently drop the
    # labels that have no dtype and only fail later, while parsing, with an
    # unhelpful KeyError. Surplus dtypes are harmless and stay ignored.
    if len(self._label_dtypes) < len(self._labels):
      raise ValueError(
          'Got {} label dtypes ({}) for {} labels ({}); every label needs a '
          'dtype.'.format(
              len(self._label_dtypes), ','.join(self._label_dtypes),
              len(self._labels), ','.join(self._labels)))
    for label, dtype in zip(self._labels, self._label_dtypes):
      if dtype not in DTYPE_MAPPING:
        raise ValueError(
            'Unsupported dtype {!r} for label {!r}; expected one of {}.'.format(
                dtype, label, sorted(DTYPE_MAPPING)))
    self._batch_size = FLAGS.batch_size
    self._num_prefetch = FLAGS.num_prefetch
    self._text_feature = FLAGS.text_feature
    self._round_labels = FLAGS.round_labels

  def labels(self) -> List[str]:
    """List of the names of the label features."""
    return self._labels

  def text_feature(self) -> str:
    """Name of the feature containing the input text from examples."""
    return self._text_feature

  def train_input_fn(self) -> tf.data.TFRecordDataset:
    """input_fn for TF Estimators for training set.

    Automatically repeats over input data forever. We define epoch limits in the
    model trainer.
    """
    assert FLAGS.train_path
    return self._input_fn_from_file(FLAGS.train_path).repeat()

  def validate_input_fn(self) -> tf.data.TFRecordDataset:
    """input_fn for TF Estimators for validation set."""
    assert FLAGS.validate_path
    return self._input_fn_from_file(FLAGS.validate_path)

  def _keys_to_features(self):
    """Builds the parsing spec for the text feature and every label.

    The text feature is required. Each label is optional and parses to the
    negative default of its dtype when missing from an example.
    """
    keys_to_features = {}
    keys_to_features[self._text_feature] = tf.FixedLenFeature([], tf.string)
    for label, dtype in zip(self._labels, self._label_dtypes):
      keys_to_features[label] = tf.FixedLenFeature([], DTYPE_MAPPING[dtype],
                                                   DTYPE_DEFAULT[dtype])
    return keys_to_features

  def _input_fn_from_file(self, filepath: str) -> tf.data.TFRecordDataset:
    """Builds the parsed, batched and prefetched dataset for a file pattern."""
    filenames_dataset = tf.data.Dataset.list_files(filepath)
    dataset = tf.data.TFRecordDataset(
        filenames_dataset)  # type: tf.data.TFRecordDataset

    parsed_dataset = dataset.map(
        self._read_tf_example, num_parallel_calls=multiprocessing.cpu_count())
    return parsed_dataset.batch(self._batch_size).prefetch(self._num_prefetch)

  def _process_labels(self, features, parsed):
    """Applies rounding and computes weights tied to feature presence.

    For all of the expected labels, if the value is negative, this
    indicates a missing feature from the input. A corresponding
    feature, named after the label and suffixed by '_weight', is added
    with a value of 1.0 if the label is present and 0.0 if it is
    absent. The label value is rounded to the nearest integer (if
    enabled) and then mapped to zero if missing.

    Args:
      features: the input features read from a TF Example.
      parsed: the input labels read from a TF Example.

    Returns:
      A tuple of the features dict (with weights) and the labels dict.
    """

    # Shallow-copy the dict so the caller's `features` is not modified.
    new_features = {k: v for k, v in features.items()}
    labels = {}
    for label in self._labels:
      label_value = tf.cast(parsed[label], dtype=tf.float32)
      # Missing values are negative, find them and zero those features out.
      weight = tf.cast(tf.greater_equal(label_value, 0.0), dtype=tf.float32)
      if self._round_labels:
        label_value = tf.round(label_value)
      new_features[label + '_weight'] = weight
      labels[label] = tf.multiply(label_value, weight)
    return new_features, labels

  def _read_tf_example(
      self,
      record: tf.Tensor,
  ) -> types.FeatureAndLabelTensors:
    """Parses TF Example protobuf into a text feature and labels.

    The input TF Example has a text feature as a singleton list with the full
    comment as the single element.
    """
    parsed = tf.parse_single_example(
        record, self._keys_to_features())  # type: Dict[str, types.Tensor]

    features = {base_model.TEXT_FEATURE_KEY: parsed[self._text_feature]}
    return self._process_labels(features, parsed)
class TFRecordInputWithTokenizer(TFRecordInput):
  """TFRecord based DatasetInput.

  Handles parsing of TF Examples.

  When handling text input, this class will rewrite the text input feature,
  using the preprocessing fn. That is, the text feature will be rewritten
  as a new key in the output changing both the type and contents - from
  a string to a tensor of integers representing tokens of some kind.

  TODO: preserve the original string and write a new key.
  """

  def __init__(self,
               train_preprocess_fn: Callable[[types.Tensor], types.Tensor],
               max_seq_len: int = 30000) -> None:
    """Configures the input from FLAGS plus a tokenizing preprocess function.

    Args:
      train_preprocess_fn: function applied to the parsed text tensor; its
        result is stored as the tokens feature and its first dimension is
        used as the sequence length.
      max_seq_len: examples are kept only when their sequence length is
        strictly smaller than this value.
    """
    super().__init__()
    self._train_preprocess_fn = train_preprocess_fn
    self._max_seq_len = max_seq_len

  def _input_fn_from_file(self, filepath: str) -> tf.data.TFRecordDataset:
    """Builds the tokenized dataset, batched by sequence-length bucket."""
    filenames_dataset = tf.data.Dataset.list_files(filepath)
    dataset = tf.data.TFRecordDataset(
        filenames_dataset)  # type: tf.data.TFRecordDataset

    parsed_dataset = dataset.map(
        self._read_tf_example, num_parallel_calls=multiprocessing.cpu_count())

    # Drop examples that reach or exceed the maximum sequence length.
    parsed_dataset = parsed_dataset.filter(lambda x, _: tf.less(
        x['sequence_length'], self._max_seq_len))

    # Tokens are padded to the longest sequence of each batch; every other
    # feature and every label is a scalar.
    feature_shapes = {
        base_model.TOKENS_FEATURE_KEY: [None],
        'sequence_length': []
    }
    for label in self._labels:
      feature_shapes[label + '_weight'] = []
    padded_shapes = (
        feature_shapes,
        {label: [] for label in self._labels})  # type: Tuple[Dict, Dict]
    # Ten boundaries (20, 40, ..., 200) define eleven buckets, hence eleven
    # batch sizes.
    parsed_dataset = parsed_dataset.apply(
        tf.contrib.data.bucket_by_sequence_length(
            element_length_func=lambda x, _: x['sequence_length'],
            bucket_boundaries=[(i + 1) * 20 for i in range(10)],
            bucket_batch_sizes=[self._batch_size] * 11,
            padded_shapes=padded_shapes))
    batched_dataset = parsed_dataset.prefetch(self._num_prefetch)

    return batched_dataset

  def _read_tf_example(
      self,
      record: tf.Tensor,
  ) -> types.FeatureAndLabelTensors:
    """Parses TF Example protobuf into a tokens feature and labels.

    The input TF Example has a text feature as a singleton list with the full
    comment as the single element. The text is passed through the preprocess
    function, and the features carry the resulting tokens together with their
    count under 'sequence_length'.
    """
    parsed = tf.parse_single_example(
        record, self._keys_to_features())  # type: Dict[str, types.Tensor]

    text = parsed[self.text_feature()]
    tokens = self._train_preprocess_fn(text)
    features = {
        base_model.TOKENS_FEATURE_KEY: tokens,
        'sequence_length': tf.shape(tokens)[0],
    }
    return self._process_labels(features, parsed)
================================================
FILE: experiments/tf_trainer/common/tfrecord_input_test.py
================================================
# coding=utf-8
# Copyright 2018 The Conversation-AI.github.io Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Tests for tfrecord_input."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import numpy as np
import tensorflow as tf
from tf_trainer.common import base_model
from tf_trainer.common import tfrecord_input
from tf_trainer.common import types
# Global flag registry. The tests below assign to it directly to configure the
# input classes under test.
FLAGS = tf.app.flags.FLAGS
class TFRecordInputTest(tf.test.TestCase):
  """Tests parsing of a single serialized example by TFRecordInput."""

  def setUp(self):
    FLAGS.text_feature = 'comment'
    # Flags are shared by every test in the process. Reset the dtypes so a
    # value assigned by an earlier test cannot leak into tests that rely on
    # the default (all labels are floats).
    FLAGS.label_dtypes = None
    ex = tf.train.Example(
        features=tf.train.Features(
            feature={
                'label':
                    tf.train.Feature(
                        float_list=tf.train.FloatList(value=[0.8])),
                'ignored-label':
                    tf.train.Feature(
                        float_list=tf.train.FloatList(value=[0.125])),
                'int_label':
                    tf.train.Feature(int64_list=tf.train.Int64List(value=[0])),
                'comment':
                    tf.train.Feature(
                        bytes_list=tf.train.BytesList(
                            value=['Hi there Bob'.encode('utf-8')]))
            }))
    self.ex_tensor = tf.convert_to_tensor(
        ex.SerializeToString(), dtype=tf.string)

  def test_TFRecordInput_unrounded(self):
    FLAGS.round_labels = False
    FLAGS.labels = 'label'
    dataset_input = tfrecord_input.TFRecordInput()

    with self.test_session():
      features, labels = dataset_input._read_tf_example(self.ex_tensor)
      self.assertEqual(features[base_model.TEXT_FEATURE_KEY].eval(),
                       b'Hi there Bob')
      np.testing.assert_almost_equal(labels['label'].eval(), 0.8)
      np.testing.assert_almost_equal(features['label_weight'].eval(), 1.0)
      # Only the requested label and its weight may appear in the output.
      self.assertCountEqual(list(labels), ['label'])
      self.assertCountEqual(list(features), ['text', 'label_weight'])

  def test_TFRecordInput_default_values(self):
    # 'fake_label' is absent from the example: it must come back as 0.0 with
    # a weight of 0.0.
    FLAGS.labels = 'label,fake_label,int_label'
    FLAGS.label_dtypes = 'float,float,int'
    FLAGS.round_labels = False
    dataset_input = tfrecord_input.TFRecordInput()

    with self.test_session():
      features, labels = dataset_input._read_tf_example(self.ex_tensor)
      self.assertEqual(features[base_model.TEXT_FEATURE_KEY].eval(),
                       b'Hi there Bob')
      np.testing.assert_almost_equal(labels['label'].eval(), 0.8)
      np.testing.assert_almost_equal(labels['int_label'].eval(), 0.0)
      np.testing.assert_almost_equal(features['label_weight'].eval(), 1.0)
      np.testing.assert_almost_equal(labels['fake_label'].eval(), 0.0)
      np.testing.assert_almost_equal(features['fake_label_weight'].eval(), 0.0)

  def test_TFRecordInput_rounded(self):
    FLAGS.labels = 'label'
    FLAGS.round_labels = True
    dataset_input = tfrecord_input.TFRecordInput()

    with self.test_session():
      features, labels = dataset_input._read_tf_example(self.ex_tensor)
      self.assertEqual(features[base_model.TEXT_FEATURE_KEY].eval(),
                       b'Hi there Bob')
      # 0.8 rounds up to 1.0.
      np.testing.assert_almost_equal(labels['label'].eval(), 1.0)
      np.testing.assert_almost_equal(features['label_weight'].eval(), 1.0)
class TFRecordInputWithTokenizerTest(tf.test.TestCase):
  """Tests parsing of a single example by TFRecordInputWithTokenizer."""

  def setUp(self):
    FLAGS.text_feature = 'comment'
    # Flags are shared by every test in the process. Reset the dtypes so a
    # value assigned by an earlier test cannot leak into tests that rely on
    # the default (all labels are floats).
    FLAGS.label_dtypes = None
    ex = tf.train.Example(
        features=tf.train.Features(
            feature={
                'label':
                    tf.train.Feature(
                        float_list=tf.train.FloatList(value=[0.8])),
                'int_label':
                    tf.train.Feature(int64_list=tf.train.Int64List(value=[0])),
                'comment':
                    tf.train.Feature(
                        bytes_list=tf.train.BytesList(
                            value=['Hi there Bob'.encode('utf-8')]))
            }))
    self.ex_tensor = tf.convert_to_tensor(
        ex.SerializeToString(), dtype=tf.string)

    self.word_to_idx = {'Hi': 12, 'there': 13}
    self.unknown_token = 999

  def preprocessor(self, text):
    """Maps a string tensor to int64 token ids by splitting on spaces."""
    # The array dtype is forced to int64 so that it matches the declared
    # py_func output type on platforms whose default integer is 32-bit.
    return tf.py_func(
        lambda t: np.asarray(
            [
                self.word_to_idx.get(x, self.unknown_token)
                for x in t.decode('utf-8').split(' ')
            ],
            dtype=np.int64), [text], tf.int64)

  def test_TFRecordInputWithTokenizer_unrounded(self):
    # The 'fake_*' labels are absent from the example: they must come back as
    # 0.0 with a weight of 0.0.
    FLAGS.labels = 'label,fake_label,int_label,fake_int_label'
    FLAGS.label_dtypes = 'float,float,int,int'
    FLAGS.round_labels = False

    dataset_input = tfrecord_input.TFRecordInputWithTokenizer(
        train_preprocess_fn=self.preprocessor)

    with self.test_session():
      features, labels = dataset_input._read_tf_example(self.ex_tensor)

      self.assertEqual(
          list(features[base_model.TOKENS_FEATURE_KEY].eval()), [12, 13, 999])
      self.assertAlmostEqual(labels['label'].eval(), 0.8)
      self.assertAlmostEqual(labels['fake_label'].eval(), 0.0)
      self.assertAlmostEqual(labels['int_label'].eval(), 0.0)
      self.assertAlmostEqual(labels['fake_int_label'].eval(), 0.0)
      self.assertAlmostEqual(features['label_weight'].eval(), 1.0)
      self.assertAlmostEqual(features['fake_label_weight'].eval(), 0.0)
      self.assertAlmostEqual(features['int_label_weight'].eval(), 1.0)
      self.assertAlmostEqual(features['fake_int_label_weight'].eval(), 0.0)

  def test_TFRecordInputWithTokenizer_default_values(self):
    # No dtypes are given, so both labels are parsed as floats.
    FLAGS.labels = 'label,fake_label'
    FLAGS.round_labels = False
    dataset_input = tfrecord_input.TFRecordInputWithTokenizer(
        train_preprocess_fn=self.preprocessor)

    with self.test_session():
      features, labels = dataset_input._read_tf_example(self.ex_tensor)

      self.assertEqual(
          list(features[base_model.TOKENS_FEATURE_KEY].eval()), [12, 13, 999])
      self.assertAlmostEqual(labels['label'].eval(), 0.8)
      self.assertAlmostEqual(labels['fake_label'].eval(), 0.0)
      self.assertAlmostEqual(features['label_weight'].eval(), 1.0)
      self.assertAlmostEqual(features['fake_label_weight'].eval(), 0.0)

  def test_TFRecordInputWithTokenizer_rounded(self):
    FLAGS.labels = 'label'
    FLAGS.round_labels = True
    dataset_input = tfrecord_input.TFRecordInputWithTokenizer(
        train_preprocess_fn=self.preprocessor)

    with self.test_session():
      features, labels = dataset_input._read_tf_example(self.ex_tensor)

      self.assertEqual(
          list(features[base_model.TOKENS_FEATURE_KEY].eval()), [12, 13, 999])
      # 0.8 rounds up to 1.0.
      self.assertEqual(labels['label'].eval(), 1.0)
      self.assertEqual(features['label_weight'].eval(), 1.0)
# Run the test cases when this file is executed directly.
if __name__ == '__main__':
  tf.test.main()
================================================
FILE: experiments/tf_trainer/common/token_embedding_index.py
================================================
# coding=utf-8
# Copyright 2018 The Conversation-AI.github.io Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Working with Token Embeding Indexes."""
from typing import Tuple, Dict, Optional, List, Callable
import numpy as np
import functools
import tensorflow as tf
def LoadTokenIdxEmbeddings(embeddings_path: str) \
-> Tuple[Dict[str, int], np.ndarray, int, int]:
"""Generate word to idx mapping and word embeddings numpy array.
We have two levels of indirection (e.g. word to idx and then idx to
embedding) which could reduce embedding size if multiple words map to the
same idx; although this is not currently a real or useful use-case.
Args:
embeddings_path: Local, GCS, or HDFS path to embedding file. Each line
should be a word and its vector representation separated by a space.
Returns:
Tuple of:
A vocabulary dictionary (mapping words to their index)
A Numpy array of word embeddings with shape (vocab size, embedding size)
A unique unknown token index (greater than all other token indexes)
The size of the embeddings for words that is being used
"""
word_to_idx = {}
word_embeddings = []
if not tf.gfile.Exists(embeddings_path):
raise ValueEr
gitextract__2536wl_/
├── .bazelrc
├── .gitignore
├── .travis.yml
├── CONTRIBUTING.md
├── LICENSE
├── README.md
├── annotator_models/
│ ├── README.md
│ ├── bin/
│ │ ├── cancel-job
│ │ ├── ls-jobs
│ │ ├── run
│ │ ├── run_local
│ │ └── stream-logs
│ ├── cpu_config.yaml
│ ├── requirements.txt
│ ├── results/
│ │ └── .gitignore
│ └── trainer/
│ ├── __init__.py
│ ├── dawid_skene.py
│ └── dawid_skene_test.py
├── attention-tutorial/
│ ├── Attention_Model_Tutorial.ipynb
│ ├── README.md
│ ├── checkpoints/
│ │ └── README.md
│ ├── data/
│ │ └── README.md
│ ├── process_figshare.py
│ ├── requirements.txt
│ └── visualize_attention.py
├── data_preparation/
│ ├── README.md
│ ├── config.ini
│ ├── preprocessing/
│ │ ├── __init__.py
│ │ ├── constants.py
│ │ ├── preprocessing.py
│ │ └── tfrecord_utils.py
│ ├── requirements.txt
│ ├── run_preprocessing_artificial_bias.py
│ ├── run_preprocessing_data_split.py
│ └── setup.py
├── experiments/
│ ├── .gitignore
│ ├── README.md
│ ├── WORKSPACE
│ ├── __init__.py
│ ├── requirements.txt
│ ├── setup.py
│ ├── testdata/
│ │ ├── BUILD
│ │ ├── cats_and_dogs.jsonl
│ │ ├── cats_and_dogs_onehot.vocab.txt
│ │ ├── cats_and_dogs_with_cat_opt_int_labels.jsonl
│ │ └── cats_and_dogs_with_partial_cat_int_labels.jsonl
│ ├── tf_trainer/
│ │ ├── __init__.py
│ │ ├── common/
│ │ │ ├── BUILD
│ │ │ ├── __init__.py
│ │ │ ├── base_model.py
│ │ │ ├── basic_gpu_config.yaml
│ │ │ ├── cnn_spec_parser.py
│ │ │ ├── cnn_spec_parser_test.py
│ │ │ ├── dataset_config.sh
│ │ │ ├── dataset_input.py
│ │ │ ├── episodic_tfrecord_input.py
│ │ │ ├── episodic_tfrecord_input_test.py
│ │ │ ├── model_trainer.py
│ │ │ ├── p100_config.yaml
│ │ │ ├── serving_input.py
│ │ │ ├── text_preprocessor.py
│ │ │ ├── text_preprocessor_test.py
│ │ │ ├── tfrecord_input.py
│ │ │ ├── tfrecord_input_test.py
│ │ │ ├── token_embedding_index.py
│ │ │ ├── token_embedding_index_test.py
│ │ │ ├── types.py
│ │ │ └── v100_config.yaml
│ │ ├── tf_char_cnn/
│ │ │ ├── __init__.py
│ │ │ ├── hparam_config.yaml
│ │ │ ├── hparam_config_civil_comments.yaml
│ │ │ ├── hparam_config_many_communities.yaml
│ │ │ ├── hparam_config_toxicity.yaml
│ │ │ ├── model.py
│ │ │ ├── run.deploy.sh
│ │ │ ├── run.hyperparameter.sh
│ │ │ ├── run.local.sh
│ │ │ ├── run.ml_engine.sh
│ │ │ └── run.py
│ │ ├── tf_cnn/
│ │ │ ├── __init__.py
│ │ │ ├── finetune.py
│ │ │ ├── finetune.sh
│ │ │ ├── hparam_config.yaml
│ │ │ ├── hparam_config_civil_comments.yaml
│ │ │ ├── hparam_config_many_communities.yaml
│ │ │ ├── hparam_config_many_communities_40_per_8_shot.yaml
│ │ │ ├── hparam_config_toxicity.yaml
│ │ │ ├── model.py
│ │ │ ├── run.deploy.sh
│ │ │ ├── run.hyperparameter.sh
│ │ │ ├── run.local.sh
│ │ │ ├── run.ml_engine.sh
│ │ │ └── run.py
│ │ ├── tf_gru_attention/
│ │ │ ├── __init__.py
│ │ │ ├── finetune.py
│ │ │ ├── finetune.sh
│ │ │ ├── hparam_config.yaml
│ │ │ ├── hparam_config_civil_comments.yaml
│ │ │ ├── hparam_config_many_communities.yaml
│ │ │ ├── hparam_config_many_communities_40_per_8_shot.yaml
│ │ │ ├── hparam_config_toxicity.yaml
│ │ │ ├── model.py
│ │ │ ├── run.deploy.sh
│ │ │ ├── run.hyperparameter.sh
│ │ │ ├── run.local.sh
│ │ │ ├── run.ml_engine.sh
│ │ │ └── run.py
│ │ ├── tf_hub_classifier/
│ │ │ ├── __init__.py
│ │ │ ├── finetune.py
│ │ │ ├── finetune.sh
│ │ │ ├── hparam_config.yaml
│ │ │ ├── hparam_config_civil_comments.yaml
│ │ │ ├── hparam_config_many_communities.yaml
│ │ │ ├── hparam_config_many_communities_40_per_8_shot.yaml
│ │ │ ├── hparam_config_toxicity.yaml
│ │ │ ├── model.py
│ │ │ ├── run.deploy.sh
│ │ │ ├── run.hyperparameter.sh
│ │ │ ├── run.local.sh
│ │ │ ├── run.ml_engine.sh
│ │ │ └── run.py
│ │ ├── tf_hub_tfjs/
│ │ │ ├── __init__.py
│ │ │ ├── model.py
│ │ │ ├── notebook/
│ │ │ │ ├── BiasEvaluation.ipynb
│ │ │ │ └── EvaluatingClassifier.ipynb
│ │ │ ├── run.local.sh
│ │ │ └── run.py
│ │ ├── tf_kona_prototypical_network/
│ │ │ └── proto.py
│ │ └── tf_word_label_embedding/
│ │ ├── __init__.py
│ │ ├── hparam_config.yaml
│ │ ├── model.py
│ │ ├── run.hyperparameter.sh
│ │ ├── run.local.sh
│ │ ├── run.ml_engine.sh
│ │ └── run.py
│ └── tools/
│ ├── bert_tfrecord_converter.py
│ ├── convert_csv_to_tfrecord.py
│ └── convert_jsonl_to_tfrecord.py
├── hierarchical_attention_research/
│ └── han_model/
│ ├── .gitignore
│ ├── HAN_model.py
│ ├── LICENSE
│ ├── README.md
│ ├── bn_lstm.py
│ ├── bn_lstm_test.py
│ ├── data_util.py
│ ├── model_components.py
│ ├── requirements.txt
│ ├── worker.py
│ ├── yelp.py
│ └── yelp_prepare.py
├── kaggle-classification/
│ ├── .gitignore
│ ├── README.md
│ ├── __init__.py
│ ├── bin/
│ │ ├── cancel-job
│ │ ├── ls-jobs
│ │ ├── run
│ │ ├── run_keras.sh
│ │ ├── run_keras_local.sh
│ │ ├── run_local
│ │ └── stream-logs
│ ├── config.yaml
│ ├── gpu_config.yaml
│ ├── hparam_config.yaml
│ ├── keras_hparam_config.yaml
│ ├── keras_trainer/
│ │ ├── __init__.py
│ │ ├── base_model.py
│ │ ├── cnn_with_attention.py
│ │ ├── custom_metrics.py
│ │ ├── model.py
│ │ ├── rnn.py
│ │ └── single_layer_cnn.py
│ ├── requirements.txt
│ ├── setup.py
│ └── trainer/
│ ├── __init__.py
│ ├── model.py
│ └── wikidata.py
├── model_evaluation/
│ ├── BiosBias Evaluation.ipynb
│ ├── Predict bias.ipynb
│ ├── README.md
│ ├── deploy_models.sh
│ ├── few_shot_learning_baseline_evaluation.ipynb
│ ├── input_fn_example.py
│ ├── jigsaw_evaluation_pipeline.ipynb
│ ├── requirements.txt
│ ├── score_bias_data.sh
│ ├── score_scrubbed_data.sh
│ ├── score_test_data.py
│ └── utils_export/
│ ├── __init__.py
│ ├── dataset.py
│ ├── dataset_test.py
│ ├── deploy_list_models.py
│ ├── utils_cloudml.py
│ ├── utils_cloudml_test.py
│ ├── utils_tfrecords.py
│ └── utils_tfrecords_test.py
└── travis_blase_test_support/
└── bazel_0.18.1-linux-x86_64.deb.sha256
SYMBOL INDEX (359 symbols across 65 files)
FILE: annotator_models/trainer/dawid_skene.py
function run (line 25) | def run(items,
function load_data (line 99) | def load_data(path, unit_id, worker_id, label):
function initialize (line 110) | def initialize(counts):
function m_step (line 140) | def m_step(counts, item_classes, psuedo_count):
function m_step_verbose (line 186) | def m_step_verbose(counts, item_classes, psuedo_count):
function e_step (line 233) | def e_step(counts_tiled, class_marginals, error_rates):
function e_step_verbose (line 268) | def e_step_verbose(counts, class_marginals, error_rates):
function calc_likelihood (line 306) | def calc_likelihood(counts, class_marginals, error_rates):
function random_initialization (line 347) | def random_initialization(counts):
function majority_voting (line 376) | def majority_voting(counts):
function parse_item_classes (line 404) | def parse_item_classes(df, label, item_classes, index_to_unit_id_map,
function parse_error_rates (line 468) | def parse_error_rates(df, error_rates, index_to_worker_id_map, index_to_...
function main (line 510) | def main(FLAGS):
FILE: annotator_models/trainer/dawid_skene_test.py
class DawidSkeneTest (line 15) | class DawidSkeneTest(unittest.TestCase):
method setUp (line 19) | def setUp(self):
method test_paper_example (line 53) | def test_paper_example(self):
FILE: attention-tutorial/process_figshare.py
function download_figshare (line 40) | def download_figshare(download_data_dir=DEFAULT_DATA_DIR):
function process_figshare (line 65) | def process_figshare(input_data_dir=DEFAULT_DATA_DIR,
FILE: attention-tutorial/visualize_attention.py
class wordVal (line 35) | class wordVal(object):
method __init__ (line 38) | def __init__(self, word, val):
method __str__ (line 42) | def __str__(self):
class attentionDisplay (line 46) | class attentionDisplay(object):
method __init__ (line 49) | def __init__(self, vocab_processor, classifier, words_feature='words'):
method _rgb_to_hex (line 64) | def _rgb_to_hex(self, rgb):
method _color_wordvals (line 67) | def _color_wordvals(self, s):
method _predict_sentence (line 72) | def _predict_sentence(self, input_string):
method _resize_and_tokenize (line 87) | def _resize_and_tokenize(self, input_string):
method display_prediction_attention (line 94) | def display_prediction_attention(self, input_string):
FILE: data_preparation/preprocessing/preprocessing.py
function get_identity_list (line 14) | def get_identity_list():
function get_civil_comments_spec (line 26) | def get_civil_comments_spec(include_identity_terms=True):
function split_data (line 49) | def split_data(examples, train_fraction, eval_fraction):
function Shuffle (line 65) | def Shuffle(examples): # pylint: disable=invalid-name
function write_to_tf_records (line 72) | def write_to_tf_records(examples, output_path):
class OversampleExample (line 87) | class OversampleExample(beam.DoFn):
method __init__ (line 90) | def __init__(self, rule_fn, oversample_rate):
method process (line 96) | def process(self, element):
function _select_male_toxic_example (line 104) | def _select_male_toxic_example(example,
function run_data_split (line 115) | def run_data_split(p, input_data_path, train_fraction, eval_fraction,
function run_artificial_bias (line 163) | def run_artificial_bias(p, train_input_data_path, output_folder,
FILE: data_preparation/preprocessing/tfrecord_utils.py
class Schema (line 11) | class Schema(object):
method __init__ (line 20) | def __init__(self, spec):
method as_feature_spec (line 23) | def as_feature_spec(self):
class DecodeTFRecord (line 27) | class DecodeTFRecord(beam.DoFn):
method __init__ (line 38) | def __init__(self,
method process (line 59) | def process(self, element):
class EncodeTFRecord (line 67) | class EncodeTFRecord(beam.DoFn):
method __init__ (line 70) | def __init__(self, feature_spec, optional_field_names):
method process (line 82) | def process(self, element):
FILE: data_preparation/run_preprocessing_artificial_bias.py
function _parse_arguments (line 13) | def _parse_arguments(argv):
function _set_logging (line 39) | def _set_logging(log_level):
function _parse_config (line 43) | def _parse_config(env, config_file_path):
function main (line 58) | def main():
FILE: data_preparation/run_preprocessing_data_split.py
function _parse_arguments (line 13) | def _parse_arguments(argv):
function _set_logging (line 45) | def _set_logging(log_level):
function _parse_config (line 49) | def _parse_config(env, config_file_path):
function main (line 64) | def main():
FILE: experiments/tf_trainer/common/base_model.py
class BaseModel (line 41) | class BaseModel(abc.ABC):
method map (line 48) | def map(self, f: Callable[[tf.estimator.Estimator], tf.estimator.Estim...
method estimator (line 68) | def estimator(self, model_dir: str) -> tf.estimator.Estimator:
method hparams (line 71) | def hparams(self) -> tf.contrib.training.HParams:
FILE: experiments/tf_trainer/common/cnn_spec_parser.py
class FilterParseError (line 42) | class FilterParseError(Exception):
class Filter (line 46) | class Filter(object):
method __init__ (line 52) | def __init__(self, str: str) -> None:
method __str__ (line 60) | def __str__(self) -> str:
class ConcurrentFilters (line 64) | class ConcurrentFilters(object):
method __init__ (line 70) | def __init__(self, str: str) -> None:
method __str__ (line 74) | def __str__(self) -> str:
class SequentialLayers (line 78) | class SequentialLayers(object):
method __init__ (line 84) | def __init__(self, str: str) -> None:
method __str__ (line 89) | def __str__(self) -> str:
FILE: experiments/tf_trainer/common/cnn_spec_parser_test.py
class CnnSpecParserTest (line 28) | class CnnSpecParserTest(tf.test.TestCase):
method test_SequentialLayers (line 30) | def test_SequentialLayers(self):
FILE: experiments/tf_trainer/common/dataset_input.py
class DatasetInput (line 25) | class DatasetInput(abc.ABC):
method train_input_fn (line 33) | def train_input_fn(self) -> types.EstimatorInput:
method validate_input_fn (line 37) | def validate_input_fn(self) -> types.EstimatorInput:
FILE: experiments/tf_trainer/common/episodic_tfrecord_input.py
class EpisodicTFRecordInput (line 44) | class EpisodicTFRecordInput(dataset_input.DatasetInput):
method __init__ (line 47) | def __init__(self, train_dir, validate_dir) -> None:
method train_input_fn (line 51) | def train_input_fn(self) -> types.FeatureAndLabelTensors:
method validate_input_fn (line 61) | def validate_input_fn(self) -> types.FeatureAndLabelTensors:
method _get_randomized_episodes (line 64) | def _get_randomized_episodes(self, directory: str) -> List[EpisodeData]:
method _dataset_from_tfrecord_file (line 83) | def _dataset_from_tfrecord_file(self, tfrecord_file: str) -> EpisodeData:
FILE: experiments/tf_trainer/common/episodic_tfrecord_input_test.py
class EpisodicTFRecordInputTest (line 11) | class EpisodicTFRecordInputTest(tf.test.TestCase):
method test (line 13) | def test(self):
FILE: experiments/tf_trainer/common/model_trainer.py
class InitHook (line 73) | class InitHook(tf.train.SessionRunHook):
method __init__ (line 79) | def __init__(self, checkpoint_dir):
method begin (line 83) | def begin(self):
function forward_features (line 114) | def forward_features(estimator, keys, sparse_default_values=None):
class ModelTrainer (line 228) | class ModelTrainer(object):
method __init__ (line 231) | def __init__(self, dataset: ds.DatasetInput,
method train_with_eval (line 239) | def train_with_eval(self):
method predict_on_dev (line 276) | def predict_on_dev(self, predict_keys=None):
method eval_dir (line 283) | def eval_dir(self):
method _model_dir (line 286) | def _model_dir(self):
method _add_estimator_key (line 298) | def _add_estimator_key(self, estimator, example_key_name):
method _get_best_step_from_event_file (line 304) | def _get_best_step_from_event_file(self,
method _get_best_checkpoint (line 335) | def _get_best_checkpoint(self,
method _get_list_checkpoint (line 392) | def _get_list_checkpoint(self,
method export (line 456) | def export(self,
FILE: experiments/tf_trainer/common/serving_input.py
function create_text_serving_input_fn (line 12) | def create_text_serving_input_fn(text_feature_name, example_key_name):
function create_serving_input_fn (line 32) | def create_serving_input_fn(word_to_idx,
FILE: experiments/tf_trainer/common/text_preprocessor.py
class TextPreprocessor (line 37) | class TextPreprocessor(object):
method __init__ (line 49) | def __init__(self, embeddings_path: str) -> None:
method train_preprocess_fn (line 53) | def train_preprocess_fn(self,
method add_embedding_to_model (line 92) | def add_embedding_to_model(self, model: base_model.BaseModel,
method create_estimator_with_embedding (line 104) | def create_estimator_with_embedding(
method word_to_idx (line 173) | def word_to_idx(self) -> Dict[str, int]:
method unknown_token (line 176) | def unknown_token(self) -> int:
method word_embeddings (line 179) | def word_embeddings(self, trainable) -> tf.Variable:
FILE: experiments/tf_trainer/common/text_preprocessor_test.py
class TextPreprocessorTest (line 25) | class TextPreprocessorTest(tf.test.TestCase):
method test_Tokenize (line 27) | def test_Tokenize(self):
method test_Lowercase (line 36) | def test_Lowercase(self):
FILE: experiments/tf_trainer/common/tfrecord_input.py
class TFRecordInput (line 43) | class TFRecordInput(dataset_input.DatasetInput):
method __init__ (line 53) | def __init__(self) -> None:
method labels (line 64) | def labels(self) -> List[str]:
method text_feature (line 68) | def text_feature(self) -> str:
method train_input_fn (line 72) | def train_input_fn(self) -> tf.data.TFRecordDataset:
method validate_input_fn (line 81) | def validate_input_fn(self) -> tf.data.TFRecordDataset:
method _keys_to_features (line 86) | def _keys_to_features(self):
method _input_fn_from_file (line 94) | def _input_fn_from_file(self, filepath: str) -> tf.data.TFRecordDataset:
method _process_labels (line 102) | def _process_labels(self, features, parsed):
method _read_tf_example (line 132) | def _read_tf_example(
class TFRecordInputWithTokenizer (line 148) | class TFRecordInputWithTokenizer(TFRecordInput):
method __init__ (line 160) | def __init__(self,
method _input_fn_from_file (line 167) | def _input_fn_from_file(self, filepath: str) -> types.FeatureAndLabelT...
method _read_tf_example (line 197) | def _read_tf_example(
FILE: experiments/tf_trainer/common/tfrecord_input_test.py
class TFRecordInputTest (line 31) | class TFRecordInputTest(tf.test.TestCase):
method setUp (line 33) | def setUp(self):
method test_TFRecordInput_unrounded (line 54) | def test_TFRecordInput_unrounded(self):
method test_TFRecordInput_default_values (line 68) | def test_TFRecordInput_default_values(self):
method test_TFRecordInput_rounded (line 84) | def test_TFRecordInput_rounded(self):
class TFRecordInputWithTokenizerTest (line 97) | class TFRecordInputWithTokenizerTest(tf.test.TestCase):
method setUp (line 99) | def setUp(self):
method preprocessor (line 120) | def preprocessor(self, text):
method test_TFRecordInputWithTokenizer_unrounded (line 127) | def test_TFRecordInputWithTokenizer_unrounded(self):
method test_TFRecordInputWithTokenizer_default_values (line 147) | def test_TFRecordInputWithTokenizer_default_values(self):
method test_TFRecordInputWithTokenizer_rounded (line 162) | def test_TFRecordInputWithTokenizer_rounded(self):
FILE: experiments/tf_trainer/common/token_embedding_index.py
function LoadTokenIdxEmbeddings (line 22) | def LoadTokenIdxEmbeddings(embeddings_path: str) \
FILE: experiments/tf_trainer/common/token_embedding_index_test.py
class LoadTokenIdxEmbeddingsTest (line 26) | class LoadTokenIdxEmbeddingsTest(tf.test.TestCase):
method test_LoadTokenIdxEmbeddings (line 28) | def test_LoadTokenIdxEmbeddings(self):
FILE: experiments/tf_trainer/tf_char_cnn/model.py
class TFCharCNNModel (line 43) | class TFCharCNNModel(base_model.BaseModel):
method __init__ (line 49) | def __init__(self, target_labels: Set[str]) -> None:
method hparams (line 53) | def hparams():
method estimator (line 67) | def estimator(self, model_dir):
method _model_fn (line 74) | def _model_fn(self, features, labels, mode, params, config):
FILE: experiments/tf_trainer/tf_char_cnn/run.py
function main (line 18) | def main(argv):
FILE: experiments/tf_trainer/tf_cnn/finetune.py
function main (line 34) | def main(argv):
FILE: experiments/tf_trainer/tf_cnn/model.py
class TFCNNModel (line 40) | class TFCNNModel(base_model.BaseModel):
method __init__ (line 47) | def __init__(self, target_labels: Set[str]) -> None:
method hparams (line 51) | def hparams():
method estimator (line 64) | def estimator(self, model_dir):
method _model_fn (line 71) | def _model_fn(self, features, labels, mode, params, config):
FILE: experiments/tf_trainer/tf_cnn/run.py
function main (line 24) | def main(argv):
FILE: experiments/tf_trainer/tf_gru_attention/finetune.py
function main (line 34) | def main(argv):
FILE: experiments/tf_trainer/tf_gru_attention/model.py
function attend (line 33) | def attend(inputs, attention_size, attention_depth=1):
class TFRNNModel (line 51) | class TFRNNModel(base_model.BaseModel):
method __init__ (line 53) | def __init__(self, target_labels: Set[str]) -> None:
method hparams (line 57) | def hparams():
method estimator (line 68) | def estimator(self, model_dir):
method _model_fn (line 75) | def _model_fn(self, features, labels, mode, params, config):
FILE: experiments/tf_trainer/tf_gru_attention/run.py
function main (line 26) | def main(argv):
FILE: experiments/tf_trainer/tf_hub_classifier/finetune.py
function main (line 29) | def main(argv):
FILE: experiments/tf_trainer/tf_hub_classifier/model.py
class TFHubClassifierModel (line 35) | class TFHubClassifierModel(base_model.BaseModel):
method __init__ (line 37) | def __init__(self, target_labels: List[str]) -> None:
method hparams (line 41) | def hparams():
method estimator (line 49) | def estimator(self, model_dir):
method _model_fn (line 56) | def _model_fn(self, features, labels, mode, params, config):
FILE: experiments/tf_trainer/tf_hub_classifier/run.py
function main (line 18) | def main(argv):
FILE: experiments/tf_trainer/tf_hub_tfjs/model.py
class TFHubClassifierModel (line 35) | class TFHubClassifierModel(base_model.BaseModel):
method __init__ (line 37) | def __init__(self, target_labels: List[str]) -> None:
method hparams (line 41) | def hparams():
method estimator (line 49) | def estimator(self, model_dir):
method _model_fn (line 56) | def _model_fn(self, features, labels, mode, params, config):
FILE: experiments/tf_trainer/tf_hub_tfjs/run.py
class TFRecordWithSentencePiece (line 21) | class TFRecordWithSentencePiece(tfrecord_input.TFRecordInput):
method __init__ (line 24) | def __init__(self, spm_path):
method dense_ids (line 29) | def dense_ids(self, texts):
method pieces (line 34) | def pieces(self, feature_dict, label_dict):
method _input_fn_from_file (line 44) | def _input_fn_from_file(self, filepath: str):
function main (line 54) | def main(argv):
FILE: experiments/tf_trainer/tf_kona_prototypical_network/proto.py
function distance (line 37) | def distance(embeddings, prototype):
function neg_distance (line 41) | def neg_distance(embs, proto):
function calculate_logits (line 45) | def calculate_logits(embeddings, positive_prototype, negative_prototype):
function prepare_dataset (line 51) | def prepare_dataset(data):
function encoder (line 82) | def encoder(dense_config, output_types, output_shapes):
function train_operation (line 140) | def train_operation(negative_logits, positive_logits):
function predictions_and_metrics (line 154) | def predictions_and_metrics(negative_logits, positive_logits):
function main (line 173) | def main():
FILE: experiments/tf_trainer/tf_word_label_embedding/model.py
class TFWordLabelEmbeddingModel (line 26) | class TFWordLabelEmbeddingModel(base_model.BaseModel):
method __init__ (line 28) | def __init__(self, target_label: str) -> None:
method hparams (line 33) | def hparams():
method estimator (line 41) | def estimator(self, model_dir):
method _model_fn (line 48) | def _model_fn(self, features, labels, mode, params, config):
FILE: experiments/tf_trainer/tf_word_label_embedding/run.py
function main (line 25) | def main(argv):
FILE: experiments/tools/bert_tfrecord_converter.py
function create_int_feature (line 46) | def create_int_feature(values):
function create_tokenizer_from_hub_module (line 50) | def create_tokenizer_from_hub_module(url):
function convert_tfrecord_for_bert (line 63) | def convert_tfrecord_for_bert(filenames,
FILE: experiments/tools/convert_csv_to_tfrecord.py
function convert_csv_to_tfrecord (line 37) | def convert_csv_to_tfrecord(input_csv_path,
function main (line 63) | def main(argv):
FILE: experiments/tools/convert_jsonl_to_tfrecord.py
class MisingAllTextFieldsError (line 68) | class MisingAllTextFieldsError(Exception):
class FieldsCounter (line 72) | class FieldsCounter():
method __init__ (line 74) | def __init__(self):
method inc_field (line 77) | def inc_field(self, field_name: str):
function make_selected_output_row (line 83) | def make_selected_output_row(row, line, counters):
function itr_as_dict (line 109) | def itr_as_dict(input_jsonlines_path):
function itr_as_tfrecord (line 120) | def itr_as_tfrecord(input_jsonlines_path):
function convert_to_tfrecord (line 132) | def convert_to_tfrecord(input_jsonlines_path, output_tfrecord_path):
function main (line 138) | def main(argv):
FILE: hierarchical_attention_research/han_model/HAN_model.py
class HANClassifierModel (line 8) | class HANClassifierModel():
method __init__ (line 16) | def __init__(self,
method _init_embedding (line 104) | def _init_embedding(self, scope):
method _init_body (line 115) | def _init_body(self, scope):
method get_feed_data (line 175) | def get_feed_data(self, x, y=None, class_weights=None, is_training=True):
FILE: hierarchical_attention_research/han_model/bn_lstm.py
class LSTMCell (line 13) | class LSTMCell(RNNCell):
method __init__ (line 16) | def __init__(self, num_units):
method state_size (line 20) | def state_size(self):
method output_size (line 24) | def output_size(self):
method __call__ (line 27) | def __call__(self, x, state, scope=None):
class BNLSTMCell (line 55) | class BNLSTMCell(RNNCell):
method __init__ (line 58) | def __init__(self, num_units, training):
method state_size (line 63) | def state_size(self):
method output_size (line 67) | def output_size(self):
method __call__ (line 70) | def __call__(self, x, state, scope=None):
function orthogonal (line 101) | def orthogonal(shape):
function bn_lstm_identity_initializer (line 109) | def bn_lstm_identity_initializer(scale):
function orthogonal_initializer (line 125) | def orthogonal_initializer():
function batch_norm (line 133) | def batch_norm(x, name_scope, training, epsilon=1e-3, decay=0.999):
FILE: hierarchical_attention_research/han_model/data_util.py
function batch (line 4) | def batch(inputs):
FILE: hierarchical_attention_research/han_model/model_components.py
function bidirectional_rnn (line 10) | def bidirectional_rnn(cell_fw,
function task_specific_attention (line 55) | def task_specific_attention(inputs,
FILE: hierarchical_attention_research/han_model/worker.py
function HAN_model_1 (line 59) | def HAN_model_1(session, restore_only=False):
function decode (line 115) | def decode(ex):
function batch_iterator (line 126) | def batch_iterator(dataset, batch_size, max_epochs):
function ev (line 139) | def ev(session, model, dataset):
function evaluate (line 158) | def evaluate(dataset):
function train (line 169) | def train():
function main (line 227) | def main():
FILE: hierarchical_attention_research/han_model/yelp.py
function _read_dataset (line 22) | def _read_dataset(fn, review_max_sentences=30, sentence_max_length=30,
function read_trainset (line 46) | def read_trainset(epochs=1):
function read_devset (line 50) | def read_devset(epochs=1):
function read_vocab (line 54) | def read_vocab():
function read_labels (line 59) | def read_labels():
FILE: hierarchical_attention_research/han_model/yelp_prepare.py
function read_reviews (line 19) | def read_reviews():
function build_word_frequency_distribution (line 25) | def build_word_frequency_distribution():
function build_vocabulary (line 49) | def build_vocabulary(lower=3, n=50000):
function make_data (line 72) | def make_data(split_points=(0.8, 0.94)):
FILE: kaggle-classification/keras_trainer/base_model.py
class BaseModel (line 12) | class BaseModel(metaclass=ABCMeta):
method get_model (line 16) | def get_model(self) -> Model:
FILE: kaggle-classification/keras_trainer/cnn_with_attention.py
class CNNWithAttention (line 24) | class CNNWithAttention(base_model.BaseModel):
method __init__ (line 35) | def __init__(self, embeddings_matrix, hparams, labels):
method get_model (line 41) | def get_model(self):
FILE: kaggle-classification/keras_trainer/custom_metrics.py
function auc_roc (line 10) | def auc_roc(y_true, y_pred):
FILE: kaggle-classification/keras_trainer/model.py
class ModelRunner (line 59) | class ModelRunner():
method __init__ (line 62) | def __init__(self, job_dir, embeddings_path, log_path, hparams, labels):
method train (line 80) | def train(self, train):
method predict (line 116) | def predict(self, texts):
method score_metric (line 120) | def score_metric(self, data, metric_name, metric_fn):
method score_auc (line 144) | def score_auc(self, data):
method score_precision (line 148) | def score_precision(self, data):
method score_recall (line 153) | def score_recall(self, data):
method _prep_texts (line 158) | def _prep_texts(self, texts):
method _load_model (line 163) | def _load_model(self):
method _setup_tokenizer (line 173) | def _setup_tokenizer(self):
method _setup_embeddings_matrix (line 183) | def _setup_embeddings_matrix(self):
FILE: kaggle-classification/keras_trainer/rnn.py
class RNNModel (line 13) | class RNNModel(base_model.BaseModel):
method __init__ (line 22) | def __init__(self, embeddings_matrix, hparams, labels):
method get_model (line 28) | def get_model(self):
FILE: kaggle-classification/keras_trainer/single_layer_cnn.py
class SingleLayerCnn (line 22) | class SingleLayerCnn(base_model.BaseModel):
method __init__ (line 33) | def __init__(self, embeddings_matrix, hparams, labels):
method get_model (line 39) | def get_model(self) -> Model:
FILE: kaggle-classification/trainer/model.py
function estimator_spec_for_softmax_classification (line 60) | def estimator_spec_for_softmax_classification(logits, labels, mode,
function get_cnn_model (line 139) | def get_cnn_model(embedding_size, num_filters, dropout_keep_prob):
function bag_of_words_model (line 209) | def bag_of_words_model(features, labels, mode):
function main (line 237) | def main(FLAGS):
FILE: kaggle-classification/trainer/wikidata.py
function ngrams (line 14) | def ngrams(sentence, ngram_size):
class WikiData (line 25) | class WikiData:
method __init__ (line 27) | def __init__(self,
method _load_vocab_processor (line 94) | def _load_vocab_processor(self, path):
method _load_csv (line 98) | def _load_csv(self, path):
method _split (line 110) | def _split(self, data, train_percent, x_field, y_class, seed=None):
FILE: model_evaluation/input_fn_example.py
function create_input_fn_toxicity_performance (line 45) | def create_input_fn_toxicity_performance(tokenizer, model_input_comment_...
function create_input_fn_civil_performance (line 166) | def create_input_fn_civil_performance(tokenizer, model_input_comment_fie...
function create_input_fn_civil_bias (line 191) | def create_input_fn_civil_bias(tokenizer, model_input_comment_field):
function create_input_fn_artificial_bias (line 245) | def create_input_fn_artificial_bias(tokenizer, model_input_comment_field):
function create_input_fn_biasbios (line 303) | def create_input_fn_biasbios(tokenizer, model_input_comment_field, scrub...
function create_input_fn_artificial_bias (line 340) | def create_input_fn_artificial_bias(tokenizer, model_input_comment_field):
FILE: model_evaluation/score_test_data.py
function get_input_fn (line 51) | def get_input_fn(test_data, tokenizer, model_input_comment_field):
function tokenizer (line 63) | def tokenizer(text, lowercase=True):
function score_data (line 80) | def score_data(model_names,
FILE: model_evaluation/utils_export/dataset.py
class Model (line 35) | class Model(object):
method __init__ (line 44) | def __init__(self,
method feature_keys_spec (line 79) | def feature_keys_spec(self):
method example_key (line 82) | def example_key(self):
method model_names (line 85) | def model_names(self):
method prediction_keys (line 88) | def prediction_keys(self):
method project_name (line 91) | def project_name(self):
method set_job_ids_prediction (line 94) | def set_job_ids_prediction(self, job_ids):
method job_ids_prediction (line 97) | def job_ids_prediction(self):
class Dataset (line 101) | class Dataset(object):
method __init__ (line 118) | def __init__(self, input_fn, dataset_dir):
method show_data (line 130) | def show_data(self):
method check_input_fn (line 136) | def check_input_fn(self, input_fn):
method check_compatibility (line 153) | def check_compatibility(self, model):
method load_data (line 168) | def load_data(self, max_n_examples, **kwargs):
method get_path_input_tf (line 171) | def get_path_input_tf(self):
method get_path_prediction (line 177) | def get_path_prediction(self, model_name):
method convert_data_to_tf (line 183) | def convert_data_to_tf(self, feature_keys_spec, example_key, overwrite...
method call_prediction (line 212) | def call_prediction(self, model):
method collect_prediction (line 245) | def collect_prediction(self, model, class_names):
method wait_predictions (line 258) | def wait_predictions(self, model):
method add_model_prediction_to_data (line 269) | def add_model_prediction_to_data(self, model, recompute_predictions=Tr...
FILE: model_evaluation/utils_export/dataset_test.py
class TestCompatibleInputFn (line 32) | class TestCompatibleInputFn(unittest.TestCase):
method testCorrect (line 35) | def testCorrect(self):
method testWrongArgInputFn (line 48) | def testWrongArgInputFn(self):
method testInputFnWrongType (line 59) | def testInputFnWrongType(self):
method testWrongNumberOfLines (line 72) | def testWrongNumberOfLines(self):
class TestModelCompatibleWithInputFn (line 88) | class TestModelCompatibleWithInputFn(unittest.TestCase):
method testBadTypeFeatureKeys (line 91) | def testBadTypeFeatureKeys(self):
method testInputFnMissingFeatureKeys (line 101) | def testInputFnMissingFeatureKeys(self):
method testModelIsCompatibleWithDataset (line 121) | def testModelIsCompatibleWithDataset(self):
class TestEndPipeline (line 141) | class TestEndPipeline(unittest.TestCase):
method setUp (line 146) | def setUp(self):
method testComputePredictions (line 171) | def testComputePredictions(self):
method testLoadPredictions (line 177) | def testLoadPredictions(self):
FILE: model_evaluation/utils_export/deploy_list_models.py
function get_list_models_to_export (line 37) | def get_list_models_to_export(parent_model_dir):
function check_model_exists (line 46) | def check_model_exists(project_name, model_name):
function create_model (line 59) | def create_model(project_name, model_name):
function create_version (line 73) | def create_version(project_name, model_name, version_name, model_dir):
function check_version_deployed (line 96) | def check_version_deployed(operation_id):
function deploy_model_version (line 115) | def deploy_model_version(project_name, model_name, version_name, model_d...
function _get_version_name (line 133) | def _get_version_name(model_dir, go_up_3=True):
function deploy_all_models (line 147) | def deploy_all_models(list_model_dir, project_name, model_name):
FILE: model_evaluation/utils_export/utils_cloudml.py
function call_model_predictions_from_df (line 34) | def call_model_predictions_from_df(project_name,
function _call_batch_job (line 74) | def _call_batch_job(project_name,
function _make_batch_job_body (line 106) | def _make_batch_job_body(project_name,
function check_job_over (line 159) | def check_job_over(project_name, job_name):
function add_model_predictions_to_df (line 184) | def add_model_predictions_to_df(df, prediction_file, model_col_name,
FILE: model_evaluation/utils_export/utils_cloudml_test.py
class CallModelPredictionsFromDf (line 27) | class CallModelPredictionsFromDf(unittest.TestCase):
method test_correct (line 32) | def test_correct(self):
class CheckJobOver (line 36) | class CheckJobOver(unittest.TestCase):
method test_correct (line 40) | def test_correct(self):
class AddModelPredictionsToDf (line 44) | class AddModelPredictionsToDf(unittest.TestCase):
method setUp (line 47) | def setUp(self):
method test_missing_prediction_file (line 58) | def test_missing_prediction_file(self):
method test_empty_prediction_file (line 72) | def test_empty_prediction_file(self):
method test_missing_example_key (line 86) | def test_missing_example_key(self):
method test_missing_prediction_key (line 100) | def test_missing_prediction_key(self):
method test_correct (line 113) | def test_correct(self):
FILE: model_evaluation/utils_export/utils_tfrecords.py
function _bytes_feature (line 31) | def _bytes_feature(value):
function _int64_feature (line 35) | def _int64_feature(value):
function _bytes_list_feature (line 39) | def _bytes_list_feature(value_list):
class EncodingFeatureSpec (line 45) | class EncodingFeatureSpec(object):
function is_valid_spec (line 58) | def is_valid_spec(spec):
function encode_pandas_to_tfrecords (line 72) | def encode_pandas_to_tfrecords(df,
function decode_tf_records_to_pandas (line 119) | def decode_tf_records_to_pandas(decoding_features_spec,
FILE: model_evaluation/utils_export/utils_tfrecords_test.py
class TestEncodingAndDecoding (line 29) | class TestEncodingAndDecoding(unittest.TestCase):
method testCorrect (line 32) | def testCorrect(self):
class TestFeatureKeySpec (line 60) | class TestFeatureKeySpec(unittest.TestCase):
method test_not_a_dictionary (line 63) | def test_not_a_dictionary(self):
method test_not_in_possible (line 70) | def test_not_in_possible(self):
method test_valid (line 77) | def test_valid(self):
Condensed preview — 196 files, each showing path, character count, and a content snippet. Download the .json file or copy for the full structured content (1,047K chars).
[
{
"path": ".bazelrc",
"chars": 239,
"preview": "startup --host_jvm_args=-Xmx2500m\nstartup --host_jvm_args=-Xms2500m\nstartup --batch\ntest --ram_utilization_factor=10\n\nbu"
},
{
"path": ".gitignore",
"chars": 245,
"preview": "# Editor config.\n.vscode/\n\n# Python Compiles files.\n*.pyc\n\n# Virtual Environment files.\n.pyenv\n.virtualenv\nenv\n.venv\n\n# "
},
{
"path": ".travis.yml",
"chars": 527,
"preview": "language: python\n\npython:\n - \"3.5\"\n - \"3.6\"\n\ndist: trusty\n\naddons:\n apt:\n sources:\n - ubuntu-toolchain-r-test"
},
{
"path": "CONTRIBUTING.md",
"chars": 984,
"preview": "# How to contribute\n\nWe'd love to accept your patches and contributions to this project. There are\njust a few small guid"
},
{
"path": "LICENSE",
"chars": 11357,
"preview": " Apache License\n Version 2.0, January 2004\n "
},
{
"path": "README.md",
"chars": 1196,
"preview": "# ConversationAI Models\n\nThis repository contains example code to train machine learning models for text classificati"
},
{
"path": "annotator_models/README.md",
"chars": 2118,
"preview": "# Modeling Annotators\n\nThis is an implementation of the [Dawid-Skene model](http://crowdsourcing-class.org/readings/downl"
},
{
"path": "annotator_models/bin/cancel-job",
"chars": 45,
"preview": "#!/bin/bash\n\ngcloud ml-engine jobs cancel $1\n"
},
{
"path": "annotator_models/bin/ls-jobs",
"chars": 53,
"preview": "#!/bin/bash\n\ngcloud ml-engine jobs list | grep $USER\n"
},
{
"path": "annotator_models/bin/run",
"chars": 2267,
"preview": "#!/bin/bash\n\n#\n# A script to train the kaggle model remotely using ml-engine.\n#\n# To run with default hyperparameters fr"
},
{
"path": "annotator_models/bin/run_local",
"chars": 731,
"preview": "#!/bin/bash\n\n# A script to train the kaggle model locally.\n\nDATE=`date '+%Y%m%d_%H%M%S'`\nBUCKET_NAME=annotator_models\n\nd"
},
{
"path": "annotator_models/bin/stream-logs",
"chars": 50,
"preview": "#!/bin/bash\n\ngcloud ml-engine jobs stream-logs $1\n"
},
{
"path": "annotator_models/cpu_config.yaml",
"chars": 260,
"preview": "trainingInput:\n scaleTier: CUSTOM\n ## Custom scaleTier needed for using > 1 GPU machines.\n # scaleTier: CUSTOM\n mast"
},
{
"path": "annotator_models/requirements.txt",
"chars": 1797,
"preview": "absl-py==0.1.12\nastor==0.6.2\nbackports.weakref==1.0.post1\nbleach==3.3.0\ncachetools==2.0.1\ncertifi==2024.7.4\nchardet==3.0"
},
{
"path": "annotator_models/results/.gitignore",
"chars": 14,
"preview": "*\n!.gitignore\n"
},
{
"path": "annotator_models/trainer/__init__.py",
"chars": 0,
"preview": ""
},
{
"path": "annotator_models/trainer/dawid_skene.py",
"chars": 22663,
"preview": "\"\"\"Description: Given unreliable ratings of items classes by multiple raters, determine the most likely true class for e"
},
{
"path": "annotator_models/trainer/dawid_skene_test.py",
"chars": 3000,
"preview": "\"\"\"Tests for dawid_skene.\"\"\"\n\nfrom __future__ import absolute_import\nfrom __future__ import division\nfrom __future__ imp"
},
{
"path": "attention-tutorial/Attention_Model_Tutorial.ipynb",
"chars": 40025,
"preview": "{\n \"cells\": [\n {\n \"cell_type\": \"markdown\",\n \"metadata\": {\n \"colab_type\": \"text\",\n \"id\": \"szO16q_1vXOT\"\n },\n"
},
{
"path": "attention-tutorial/README.md",
"chars": 1340,
"preview": "# Attention Based Classification Tutorial\n\n**Recommended time: 30 minutes**\n\n**Contributors: nthain, martin-gorner**\n\n\nT"
},
{
"path": "attention-tutorial/checkpoints/README.md",
"chars": 57,
"preview": "This directory stores model checkpoints during training.\n"
},
{
"path": "attention-tutorial/data/README.md",
"chars": 38,
"preview": "A directory to hold our toxicity data."
},
{
"path": "attention-tutorial/process_figshare.py",
"chars": 3750,
"preview": "\"\"\"Cleans and splits the toxicity data from Figshare:\n\nhttps://figshare.com/articles/Wikipedia_Talk_Labels_Toxicity/4563"
},
{
"path": "attention-tutorial/requirements.txt",
"chars": 1186,
"preview": "absl-py==0.1.9\nappnope==0.1.0\nbleach==3.3.0\ncertifi==2024.7.4\nchardet==3.0.4\ncomet-ml==1.0.8\ndecorator==4.2.1\nentrypoint"
},
{
"path": "attention-tutorial/visualize_attention.py",
"chars": 3505,
"preview": "\"\"\"A class to help visualize attention weights.\n\n-----------------------------------------------------------------------"
},
{
"path": "data_preparation/README.md",
"chars": 2549,
"preview": "# Dataset preparation\n\nThis directory contains some steps to prepare our data before training our ML models. In particul"
},
{
"path": "data_preparation/config.ini",
"chars": 234,
"preview": "[CLOUD]\nproject = wikidetox\nrunner = DataflowRunner\nmax_num_workers = 50\ndefaultWorkerLogLevel = INFO\nlog_level = ERROR\n"
},
{
"path": "data_preparation/preprocessing/__init__.py",
"chars": 0,
"preview": ""
},
{
"path": "data_preparation/preprocessing/constants.py",
"chars": 181,
"preview": "\"\"\"Constants variables for preprocessing.\"\"\"\n\nTRAIN_DATA_PREFIX = 'train'\nEVAL_DATA_PREFIX = 'eval'\nTEST_DATA_PREFIX = '"
},
{
"path": "data_preparation/preprocessing/preprocessing.py",
"chars": 6835,
"preview": "\"\"\"Preprocessing steps of the data preparation.\"\"\"\n\nimport os\nimport random\n\nimport apache_beam as beam\nimport tensorflo"
},
{
"path": "data_preparation/preprocessing/tfrecord_utils.py",
"chars": 3291,
"preview": "\"\"\"Utilities to decode and encode TF Records.\n\nThese utilities are wrappers around TF-Transform coders to handle the\n "
},
{
"path": "data_preparation/requirements.txt",
"chars": 89,
"preview": "apache-beam[gcp]==2.2.0\nconfigparser==3.5.0\ntensorflow==2.12.1\ntensorflow_transform==0.9\n"
},
{
"path": "data_preparation/run_preprocessing_artificial_bias.py",
"chars": 2845,
"preview": "\"\"\"Sets up and starts the Dataflow job for data preparation.\"\"\"\n\nimport argparse\nimport logging\nimport os\nimport sys\n\nimp"
},
{
"path": "data_preparation/run_preprocessing_data_split.py",
"chars": 3069,
"preview": "\"\"\"Sets up and starts the Dataflow job for data preparation.\"\"\"\n\nimport argparse\nimport logging\nimport os\nimport sys\n\nimp"
},
{
"path": "data_preparation/setup.py",
"chars": 245,
"preview": "from setuptools import setup, find_packages\n\nNAME = 'jigsaw'\nVERSION = '1.0'\nREQUIRED_PACKAGES = ['tensorflow-transform="
},
{
"path": "experiments/.gitignore",
"chars": 182,
"preview": "# Ignore local data, e.g. copies of embeddings\nlocal_data\n\n# Ignore local tmp files and directories\ntmp\n\n# Local config "
},
{
"path": "experiments/README.md",
"chars": 4711,
"preview": "# Text Classification Framework\n\nThis directory contains an ML framework for text classification. We illustrate\nit with "
},
{
"path": "experiments/WORKSPACE",
"chars": 24,
"preview": "# Bazel Workspace File.\n"
},
{
"path": "experiments/__init__.py",
"chars": 0,
"preview": ""
},
{
"path": "experiments/requirements.txt",
"chars": 547,
"preview": "absl-py==0.7.0\nastor==0.7.1\nbert-tensorflow==1.0.1\nbleach==3.3.0\ncertifi==2024.7.4\nchardet==3.0.4\ngast==0.2.2\ngcsfs==0.2"
},
{
"path": "experiments/setup.py",
"chars": 394,
"preview": "from setuptools import find_packages\nfrom setuptools import setup\n\nREQUIRED_PACKAGES = [\n 'nltk>=3.3',\n 'typed_ast"
},
{
"path": "experiments/testdata/BUILD",
"chars": 182,
"preview": "exports_files([\n \"cats_and_dogs_onehot.vocab.txt\",\n \"cats_and_dogs_with_cat_opt_int_labels.jsonl\",\n \"cats_and_dogs_wi"
},
{
"path": "experiments/testdata/cats_and_dogs.jsonl",
"chars": 674,
"preview": "{ \"text\": \"cats good\", \"bad\": 0.0 }\n{ \"text\": \"cats bad\", \"bad\": 1.0 }\n{ \"text\": \"dogs good\", \"bad\": 0.0 }\n{ \"text\": \"do"
},
{
"path": "experiments/testdata/cats_and_dogs_onehot.vocab.txt",
"chars": 171,
"preview": "dogs 1.0 0.0 0.0 0.0 0.0 0.0\ncats 0.0 1.0 0.0 0.0 0.0 0.0\ngood 0.0 0.0 1.0 0.0 0.0 0.0\nbad 0.0 0.0 0.0 1.0 0.0 0.0\nand 0"
},
{
"path": "experiments/testdata/cats_and_dogs_with_cat_opt_int_labels.jsonl",
"chars": 807,
"preview": "{ \"text\": \"cats good\", \"bad\": 0.0, \"cat\": 1 }\n{ \"text\": \"cats bad\", \"bad\": 1.0, \"cat\": 1 }\n{ \"text\": \"dogs good\", \"bad\":"
},
{
"path": "experiments/testdata/cats_and_dogs_with_partial_cat_int_labels.jsonl",
"chars": 785,
"preview": "{ \"text\": \"cats good\", \"bad\": 0.0, \"cat\": 1 }\n{ \"text\": \"cats bad\", \"bad\": 1.0, \"cat\": 1 }\n{ \"text\": \"dogs good\", \"bad\":"
},
{
"path": "experiments/tf_trainer/__init__.py",
"chars": 0,
"preview": ""
},
{
"path": "experiments/tf_trainer/common/BUILD",
"chars": 1955,
"preview": "py_library(\n name = \"types\",\n srcs = [\n \"types.py\",\n ],\n)\n\npy_library(\n name = \"model_trainer\",\n s"
},
{
"path": "experiments/tf_trainer/common/__init__.py",
"chars": 0,
"preview": ""
},
{
"path": "experiments/tf_trainer/common/base_model.py",
"chars": 2206,
"preview": "# coding=utf-8\n# Copyright 2018 The Conversation-AI.github.io Authors.\n#\n# Licensed under the Apache License, Version 2."
},
{
"path": "experiments/tf_trainer/common/basic_gpu_config.yaml",
"chars": 60,
"preview": "trainingInput:\n pythonVersion: '3.5'\n scaleTier: BASIC_GPU"
},
{
"path": "experiments/tf_trainer/common/cnn_spec_parser.py",
"chars": 2691,
"preview": "# coding=utf-8\n# Copyright 2018 The Conversation-AI.github.io Authors.\n#\n# Licensed under the Apache License, Version 2."
},
{
"path": "experiments/tf_trainer/common/cnn_spec_parser_test.py",
"chars": 1529,
"preview": "# coding=utf-8\n# Copyright 2018 The Conversation-AI.github.io Authors.\n#\n# Licensed under the Apache License, Version 2."
},
{
"path": "experiments/tf_trainer/common/dataset_config.sh",
"chars": 2213,
"preview": "#!/bin/bash\n\nBASE_PATH=\"gs://conversationai-models\"\nGCS_RESOURCES=\"${BASE_PATH}/resources\"\nMODEL_PARENT_DIR=\"${BASE_PATH"
},
{
"path": "experiments/tf_trainer/common/dataset_input.py",
"chars": 1219,
"preview": "# coding=utf-8\n# Copyright 2018 The Conversation-AI.github.io Authors.\n#\n# Licensed under the Apache License, Version 2."
},
{
"path": "experiments/tf_trainer/common/episodic_tfrecord_input.py",
"chars": 3996,
"preview": "# coding=utf-8\n# Copyright 2018 The Conversation-AI.github.io Authors.\n#\n# Licensed under the Apache License, Version 2."
},
{
"path": "experiments/tf_trainer/common/episodic_tfrecord_input_test.py",
"chars": 882,
"preview": "\"\"\"Tests for episodic_tfrecord_input.\"\"\"\n\nfrom __future__ import absolute_import\nfrom __future__ import division\nfrom __"
},
{
"path": "experiments/tf_trainer/common/model_trainer.py",
"chars": 20530,
"preview": "# coding=utf-8\n# Copyright 2018 The Conversation-AI.github.io Authors.\n#\n# Licensed under the Apache License, Version 2."
},
{
"path": "experiments/tf_trainer/common/p100_config.yaml",
"chars": 191,
"preview": "trainingInput:\n pythonVersion: '3.5'\n scaleTier: CUSTOM\n masterType: standard_p100\n workerType: standard_p100\n para"
},
{
"path": "experiments/tf_trainer/common/serving_input.py",
"chars": 2141,
"preview": "\"\"\"Serving functions for deployed model.\"\"\"\n\nfrom __future__ import absolute_import\nfrom __future__ import division\nfrom"
},
{
"path": "experiments/tf_trainer/common/text_preprocessor.py",
"chars": 6875,
"preview": "# coding=utf-8\n# Copyright 2018 The Conversation-AI.github.io Authors.\n#\n# Licensed under the Apache License, Version 2."
},
{
"path": "experiments/tf_trainer/common/text_preprocessor_test.py",
"chars": 1767,
"preview": "# coding=utf-8\n# Copyright 2018 The Conversation-AI.github.io Authors.\n#\n# Licensed under the Apache License, Version 2."
},
{
"path": "experiments/tf_trainer/common/tfrecord_input.py",
"chars": 8274,
"preview": "\"\"\"DatasetInput class based on TFRecord files.\"\"\"\n\nfrom __future__ import absolute_import\nfrom __future__ import divisio"
},
{
"path": "experiments/tf_trainer/common/tfrecord_input_test.py",
"chars": 7121,
"preview": "# coding=utf-8\n# Copyright 2018 The Conversation-AI.github.io Authors.\n#\n# Licensed under the Apache License, Version 2."
},
{
"path": "experiments/tf_trainer/common/token_embedding_index.py",
"chars": 2706,
"preview": "# coding=utf-8\n# Copyright 2018 The Conversation-AI.github.io Authors.\n#\n# Licensed under the Apache License, Version 2."
},
{
"path": "experiments/tf_trainer/common/token_embedding_index_test.py",
"chars": 1553,
"preview": "# coding=utf-8\n# Copyright 2018 The Conversation-AI.github.io Authors.\n#\n# Licensed under the Apache License, Version 2."
},
{
"path": "experiments/tf_trainer/common/types.py",
"chars": 1205,
"preview": "# coding=utf-8\n# Copyright 2018 The Conversation-AI.github.io Authors.\n#\n# Licensed under the Apache License, Version 2."
},
{
"path": "experiments/tf_trainer/common/v100_config.yaml",
"chars": 86,
"preview": "trainingInput:\n pythonVersion: '3.5'\n scaleTier: CUSTOM\n masterType: standard_v100\n"
},
{
"path": "experiments/tf_trainer/tf_char_cnn/__init__.py",
"chars": 0,
"preview": ""
},
{
"path": "experiments/tf_trainer/tf_char_cnn/hparam_config.yaml",
"chars": 1365,
"preview": "trainingInput:\n pythonVersion: '3.5'\n # scaleTier: CUSTOM\n # masterType: standard\n # workerType: standard_gpu\n # pa"
},
{
"path": "experiments/tf_trainer/tf_char_cnn/hparam_config_civil_comments.yaml",
"chars": 1151,
"preview": "trainingInput:\n pythonVersion: '3.5'\n scaleTier: BASIC_GPU\n hyperparameters:\n goal: MAXIMIZE\n hyperparameterMet"
},
{
"path": "experiments/tf_trainer/tf_char_cnn/hparam_config_many_communities.yaml",
"chars": 1150,
"preview": "trainingInput:\n pythonVersion: '3.5'\n scaleTier: BASIC_GPU\n hyperparameters:\n goal: MAXIMIZE\n hyperparameterMet"
},
{
"path": "experiments/tf_trainer/tf_char_cnn/hparam_config_toxicity.yaml",
"chars": 1137,
"preview": "trainingInput:\n pythonVersion: '3.5'\n scaleTier: BASIC_GPU\n hyperparameters:\n goal: MAXIMIZE\n hyperparameterMet"
},
{
"path": "experiments/tf_trainer/tf_char_cnn/model.py",
"chars": 4657,
"preview": "\"\"\"Tensorflow Estimator Character CNN.\"\"\"\n\nfrom __future__ import absolute_import\nfrom __future__ import division\nfrom _"
},
{
"path": "experiments/tf_trainer/tf_char_cnn/run.deploy.sh",
"chars": 808,
"preview": "#!/bin/bash\n# Deploys a saved model on Cloud MLE.\n\nif [ \"$1\" == \"civil_comments\" ] || [ \"$1\" == \"toxicity\" ] || [ \"$1\" ="
},
{
"path": "experiments/tf_trainer/tf_char_cnn/run.hyperparameter.sh",
"chars": 969,
"preview": "#!/bin/bash\n\nsource \"tf_trainer/common/dataset_config.sh\"\nDATETIME=$(date '+%Y%m%d_%H%M%S')\nMODEL_NAME=\"tf_char_cnn\"\nMOD"
},
{
"path": "experiments/tf_trainer/tf_char_cnn/run.local.sh",
"chars": 257,
"preview": "#!/bin/bash\n\nsource \"tf_trainer/common/dataset_config.sh\"\n\npython -m tf_trainer.tf_char_cnn.run \\\n --train_path=$train_"
},
{
"path": "experiments/tf_trainer/tf_char_cnn/run.ml_engine.sh",
"chars": 967,
"preview": "#!/bin/bash\n\nsource \"tf_trainer/common/dataset_config.sh\"\nDATETIME=$(date '+%Y%m%d_%H%M%S')\nMODEL_NAME=\"tf_char_cnn\"\nMOD"
},
{
"path": "experiments/tf_trainer/tf_char_cnn/run.py",
"chars": 1065,
"preview": "\"\"\"Experiments with toxicity, civil_comments, many_communities datasets.\"\"\"\n\nfrom __future__ import absolute_import\nfrom"
},
{
"path": "experiments/tf_trainer/tf_cnn/__init__.py",
"chars": 0,
"preview": ""
},
{
"path": "experiments/tf_trainer/tf_cnn/finetune.py",
"chars": 2504,
"preview": "\"\"\"Experiments with many_communities dataset.\"\"\"\n\nfrom __future__ import absolute_import\nfrom __future__ import division"
},
{
"path": "experiments/tf_trainer/tf_cnn/finetune.sh",
"chars": 2732,
"preview": "#!/bin/bash\n\nBASE_PATH=\"gs://conversationai-models\"\nGCS_RESOURCES=\"${BASE_PATH}/resources\"\n\nwarm_start_from=\"gs://conver"
},
{
"path": "experiments/tf_trainer/tf_cnn/hparam_config.yaml",
"chars": 1365,
"preview": "trainingInput:\n pythonVersion: '3.5'\n # scaleTier: CUSTOM\n # masterType: standard\n # workerType: standard_gpu\n # pa"
},
{
"path": "experiments/tf_trainer/tf_cnn/hparam_config_civil_comments.yaml",
"chars": 1151,
"preview": "trainingInput:\n pythonVersion: '3.5'\n scaleTier: BASIC_GPU\n hyperparameters:\n goal: MAXIMIZE\n hyperparameterMet"
},
{
"path": "experiments/tf_trainer/tf_cnn/hparam_config_many_communities.yaml",
"chars": 1150,
"preview": "trainingInput:\n pythonVersion: '3.5'\n scaleTier: BASIC_GPU\n hyperparameters:\n goal: MAXIMIZE\n hyperparameterMet"
},
{
"path": "experiments/tf_trainer/tf_cnn/hparam_config_many_communities_40_per_8_shot.yaml",
"chars": 1133,
"preview": "trainingInput:\n pythonVersion: '3.5'\n scaleTier: BASIC_GPU\n hyperparameters:\n goal: MAXIMIZE\n hyperparameterMet"
},
{
"path": "experiments/tf_trainer/tf_cnn/hparam_config_toxicity.yaml",
"chars": 1137,
"preview": "trainingInput:\n pythonVersion: '3.5'\n scaleTier: BASIC_GPU\n hyperparameters:\n goal: MAXIMIZE\n hyperparameterMet"
},
{
"path": "experiments/tf_trainer/tf_cnn/model.py",
"chars": 3882,
"preview": "\"\"\"Tensorflow Estimator CNN.\"\"\"\n\nfrom __future__ import absolute_import\nfrom __future__ import division\nfrom __future__ "
},
{
"path": "experiments/tf_trainer/tf_cnn/run.deploy.sh",
"chars": 809,
"preview": "#!/bin/bash\n# Deploys a saved model on Cloud MLE.\n\nif [ \"$1\" == \"civil_comments\" ] || [ \"$1\" == \"toxicity\" ] || [ \"$1\" ="
},
{
"path": "experiments/tf_trainer/tf_cnn/run.hyperparameter.sh",
"chars": 1075,
"preview": "#!/bin/bash\n\nsource \"tf_trainer/common/dataset_config.sh\"\nDATETIME=$(date '+%Y%m%d_%H%M%S')\nMODEL_NAME=\"tf_cnn\"\nMODEL_NA"
},
{
"path": "experiments/tf_trainer/tf_cnn/run.local.sh",
"chars": 315,
"preview": "#!/bin/bash\n\nsource \"tf_trainer/common/dataset_config.sh\"\n\npython -m tf_trainer.tf_cnn.run \\\n --train_path=$train_path "
},
{
"path": "experiments/tf_trainer/tf_cnn/run.ml_engine.sh",
"chars": 3004,
"preview": "#!/bin/bash\n\nsource \"tf_trainer/common/dataset_config.sh\"\nDATETIME=$(date '+%Y%m%d_%H%M%S')\nMODEL_NAME=\"tf_cnn\"\nMODEL_NA"
},
{
"path": "experiments/tf_trainer/tf_cnn/run.py",
"chars": 1861,
"preview": "\"\"\"Experiments with toxicity, civil_comments, many_communities datasets.\"\"\"\n\nfrom __future__ import absolute_import\nfrom"
},
{
"path": "experiments/tf_trainer/tf_gru_attention/__init__.py",
"chars": 0,
"preview": ""
},
{
"path": "experiments/tf_trainer/tf_gru_attention/finetune.py",
"chars": 2534,
"preview": "\"\"\"Experiments with many_communities dataset.\"\"\"\n\nfrom __future__ import absolute_import\nfrom __future__ import division"
},
{
"path": "experiments/tf_trainer/tf_gru_attention/finetune.sh",
"chars": 2881,
"preview": "#!/bin/bash\n\nBASE_PATH=\"gs://conversationai-models\"\nGCS_RESOURCES=\"${BASE_PATH}/resources\"\n\nwarm_start_from=\"gs://conver"
},
{
"path": "experiments/tf_trainer/tf_gru_attention/hparam_config.yaml",
"chars": 1246,
"preview": "trainingInput:\n pythonVersion: '3.5'\n scaleTier: CUSTOM\n masterType: standard\n workerType: standard_gpu\n parameterS"
},
{
"path": "experiments/tf_trainer/tf_gru_attention/hparam_config_civil_comments.yaml",
"chars": 1079,
"preview": "trainingInput:\n pythonVersion: '3.5'\n scaleTier: BASIC_GPU\n hyperparameters:\n goal: MAXIMIZE\n hyperparameterMet"
},
{
"path": "experiments/tf_trainer/tf_gru_attention/hparam_config_many_communities.yaml",
"chars": 1077,
"preview": "trainingInput:\n pythonVersion: '3.5'\n scaleTier: BASIC_GPU\n hyperparameters:\n goal: MAXIMIZE\n hyperparameterMet"
},
{
"path": "experiments/tf_trainer/tf_gru_attention/hparam_config_many_communities_40_per_8_shot.yaml",
"chars": 1062,
"preview": "trainingInput:\n pythonVersion: '3.5'\n scaleTier: BASIC_GPU\n hyperparameters:\n goal: MAXIMIZE\n hyperparameterMet"
},
{
"path": "experiments/tf_trainer/tf_gru_attention/hparam_config_toxicity.yaml",
"chars": 1080,
"preview": "trainingInput:\n pythonVersion: '3.5'\n scaleTier: BASIC_GPU\n hyperparameters:\n goal: MAXIMIZE\n hyperparameterMet"
},
{
"path": "experiments/tf_trainer/tf_gru_attention/model.py",
"chars": 4054,
"preview": "\"\"\"Tensorflow Estimator implementation of RNN Model with Attention\"\"\"\n\nfrom __future__ import absolute_import\nfrom __fut"
},
{
"path": "experiments/tf_trainer/tf_gru_attention/run.deploy.sh",
"chars": 819,
"preview": "#!/bin/bash\n# Deploys a saved model on Cloud MLE.\n\nif [ \"$1\" == \"civil_comments\" ] || [ \"$1\" == \"toxicity\" ] || [ \"$1\" ="
},
{
"path": "experiments/tf_trainer/tf_gru_attention/run.hyperparameter.sh",
"chars": 1087,
"preview": "#!/bin/bash\n\nsource \"tf_trainer/common/dataset_config.sh\"\nDATETIME=$(date '+%Y%m%d_%H%M%S')\nMODEL_NAME=\"tf_gru_attention"
},
{
"path": "experiments/tf_trainer/tf_gru_attention/run.local.sh",
"chars": 857,
"preview": "#!/bin/bash\n\n# Note:\n# We currently use 2 different embeddings:\n# - glove.6B/glove.6B.300d.txt\n# - google-news/GoogleNew"
},
{
"path": "experiments/tf_trainer/tf_gru_attention/run.ml_engine.sh",
"chars": 3544,
"preview": "#!/bin/bash\n# This script runs one training job on Cloud MLE.\n\n# Note:\n# We currently use 2 different embeddings:\n# - gl"
},
{
"path": "experiments/tf_trainer/tf_gru_attention/run.py",
"chars": 1857,
"preview": "\"\"\"Experiments with Toxicity Dataset\"\"\"\n\nfrom __future__ import absolute_import\nfrom __future__ import division\nfrom __f"
},
{
"path": "experiments/tf_trainer/tf_hub_classifier/__init__.py",
"chars": 0,
"preview": ""
},
{
"path": "experiments/tf_trainer/tf_hub_classifier/finetune.py",
"chars": 1949,
"preview": "\"\"\"Experiments with many_communities dataset.\"\"\"\n\nfrom __future__ import absolute_import\nfrom __future__ import division"
},
{
"path": "experiments/tf_trainer/tf_hub_classifier/finetune.sh",
"chars": 2592,
"preview": "#!/bin/bash\n\nBASE_PATH=\"gs://conversationai-models\"\nGCS_RESOURCES=\"${BASE_PATH}/resources\"\n\nwarm_start_from=\"gs://conver"
},
{
"path": "experiments/tf_trainer/tf_hub_classifier/hparam_config.yaml",
"chars": 964,
"preview": "trainingInput:\n pythonVersion: '3.5'\n scaleTier: CUSTOM\n masterType: standard\n workerType: standard_gpu\n parameterS"
},
{
"path": "experiments/tf_trainer/tf_hub_classifier/hparam_config_civil_comments.yaml",
"chars": 830,
"preview": "trainingInput:\n pythonVersion: '3.5'\n scaleTier: BASIC_GPU\n hyperparameters:\n goal: MAXIMIZE\n hyperparameterMet"
},
{
"path": "experiments/tf_trainer/tf_hub_classifier/hparam_config_many_communities.yaml",
"chars": 829,
"preview": "trainingInput:\n pythonVersion: '3.5'\n scaleTier: BASIC_GPU\n hyperparameters:\n goal: MAXIMIZE\n hyperparameterMet"
},
{
"path": "experiments/tf_trainer/tf_hub_classifier/hparam_config_many_communities_40_per_8_shot.yaml",
"chars": 787,
"preview": "trainingInput:\n pythonVersion: '3.5'\n scaleTier: BASIC_GPU\n hyperparameters:\n goal: MAXIMIZE\n hyperparameterMet"
},
{
"path": "experiments/tf_trainer/tf_hub_classifier/hparam_config_toxicity.yaml",
"chars": 830,
"preview": "trainingInput:\n pythonVersion: '3.5'\n scaleTier: BASIC_GPU\n hyperparameters:\n goal: MAXIMIZE\n hyperparameterMet"
},
{
"path": "experiments/tf_trainer/tf_hub_classifier/model.py",
"chars": 3133,
"preview": "\"\"\"Tensorflow Estimator using TF Hub universal sentence encoder.\"\"\"\n\nfrom __future__ import absolute_import\nfrom __futur"
},
{
"path": "experiments/tf_trainer/tf_hub_classifier/run.deploy.sh",
"chars": 814,
"preview": "#!/bin/bash\n# Deploys a saved model on Cloud MLE.\n\nif [ \"$1\" == \"civil_comments\" ] || [ \"$1\" == \"toxicity\" ] || [ \"$1\" ="
},
{
"path": "experiments/tf_trainer/tf_hub_classifier/run.hyperparameter.sh",
"chars": 1126,
"preview": "#!/bin/bash\n\nsource \"tf_trainer/common/dataset_config.sh\"\nDATETIME=$(date '+%Y%m%d_%H%M%S')\nMODEL_NAME=\"tf_hub_classifie"
},
{
"path": "experiments/tf_trainer/tf_hub_classifier/run.local.sh",
"chars": 407,
"preview": "#!/bin/bash\n\nsource \"tf_trainer/common/dataset_config.sh\"\n\npython -m tf_trainer.tf_hub_classifier.run \\\n --train_path=$"
},
{
"path": "experiments/tf_trainer/tf_hub_classifier/run.ml_engine.sh",
"chars": 2741,
"preview": "#!/bin/bash\n# This script runs one training job on Cloud MLE.\n\nsource \"tf_trainer/common/dataset_config.sh\"\nDATETIME=$(d"
},
{
"path": "experiments/tf_trainer/tf_hub_classifier/run.py",
"chars": 1052,
"preview": "\"\"\"Experiments with Toxicity Dataset\"\"\"\n\nfrom __future__ import absolute_import\nfrom __future__ import division\nfrom __f"
},
{
"path": "experiments/tf_trainer/tf_hub_tfjs/__init__.py",
"chars": 0,
"preview": ""
},
{
"path": "experiments/tf_trainer/tf_hub_tfjs/model.py",
"chars": 2979,
"preview": "\"\"\"Tensorflow Estimator using TF Hub universal sentence encoder.\"\"\"\n\nfrom __future__ import absolute_import\nfrom __futur"
},
{
"path": "experiments/tf_trainer/tf_hub_tfjs/notebook/BiasEvaluation.ipynb",
"chars": 139726,
"preview": "{\n \"nbformat\": 4,\n \"nbformat_minor\": 0,\n \"metadata\": {\n \"colab\": {\n \"name\": \"BiasEvaluation.ipynb\",\n \"ve"
},
{
"path": "experiments/tf_trainer/tf_hub_tfjs/notebook/EvaluatingClassifier.ipynb",
"chars": 69156,
"preview": "{\n \"nbformat\": 4,\n \"nbformat_minor\": 0,\n \"metadata\": {\n \"colab\": {\n \"name\": \"EvaluatingClassifier.ipynb\",\n "
},
{
"path": "experiments/tf_trainer/tf_hub_tfjs/run.local.sh",
"chars": 318,
"preview": "#!/bin/bash\n\nsource \"tf_trainer/common/dataset_config.sh\"\n\npython -m tf_trainer.tf_hub_tfjs.run \\\n --train_path=$train_"
},
{
"path": "experiments/tf_trainer/tf_hub_tfjs/run.py",
"chars": 2723,
"preview": "\"\"\"Experiments with Toxicity Dataset\"\"\"\n\nfrom __future__ import absolute_import\nfrom __future__ import division\nfrom __f"
},
{
"path": "experiments/tf_trainer/tf_kona_prototypical_network/proto.py",
"chars": 12663,
"preview": "import numpy as np\nimport tensorflow as tf\nimport tensorflow_hub as hub\nimport pandas as pd\nimport sys\nimport datetime\ni"
},
{
"path": "experiments/tf_trainer/tf_word_label_embedding/__init__.py",
"chars": 0,
"preview": ""
},
{
"path": "experiments/tf_trainer/tf_word_label_embedding/hparam_config.yaml",
"chars": 907,
"preview": "trainingInput:\n ## BASIC_GPU uses single NVIDIA Tesla K80 GPU.\n pythonVersion: '3.5'\n scaleTier: BASIC_GPU\n hyperpar"
},
{
"path": "experiments/tf_trainer/tf_word_label_embedding/model.py",
"chars": 4039,
"preview": "\"\"\"Tensorflow Estimator implementation of Word Label Embeddings.\"\"\"\n\nfrom __future__ import absolute_import\nfrom __futur"
},
{
"path": "experiments/tf_trainer/tf_word_label_embedding/run.hyperparameter.sh",
"chars": 1028,
"preview": "#!/bin/bash\n\nsource \"tf_trainer/common/dataset_config.sh\"\nDATETIME=$(date '+%Y%m%d_%H%M%S')\nMODEL_NAME=\"tf_word_label_em"
},
{
"path": "experiments/tf_trainer/tf_word_label_embedding/run.local.sh",
"chars": 280,
"preview": "#!/bin/bash\n\nsource \"tf_trainer/common/dataset_config.sh\"\n\npython -m tf_trainer.tf_word_label_embedding.run \\\n --train_"
},
{
"path": "experiments/tf_trainer/tf_word_label_embedding/run.ml_engine.sh",
"chars": 1398,
"preview": "#!/bin/bash\n# This script runs one training job on Cloud MLE.\n\n# Note:\n# We currently use 2 different embeddings:\n# - gl"
},
{
"path": "experiments/tf_trainer/tf_word_label_embedding/run.py",
"chars": 1457,
"preview": "\"\"\"Experiments with Toxicity Dataset\"\"\"\n\nfrom __future__ import absolute_import\nfrom __future__ import division\nfrom __f"
},
{
"path": "experiments/tools/bert_tfrecord_converter.py",
"chars": 5208,
"preview": "# coding=utf-8\n# Copyright 2018 The Conversation-AI.github.io Authors.\n#\n# Licensed under the Apache License, Version 2."
},
{
"path": "experiments/tools/convert_csv_to_tfrecord.py",
"chars": 3017,
"preview": "# coding=utf-8\n# Copyright 2018 The Conversation-AI.github.io Authors.\n#\n# Licensed under the Apache License, Version 2."
},
{
"path": "experiments/tools/convert_jsonl_to_tfrecord.py",
"chars": 4812,
"preview": "# coding=utf-8\n# Copyright 2018 The Conversation-AI.github.io Authors.\n#\n# Licensed under the Apache License, Version 2."
},
{
"path": "hierarchical_attention_research/han_model/.gitignore",
"chars": 1045,
"preview": "# Byte-compiled / optimized / DLL files\n__pycache__/\n*.py[cod]\n*$py.class\n\n# C extensions\n*.so\n\n# Distribution / packagi"
},
{
"path": "hierarchical_attention_research/han_model/HAN_model.py",
"chars": 7750,
"preview": "import tensorflow as tf\nimport tensorflow.contrib.layers as layers\nimport numpy as np\nimport data_util\nfrom model_compon"
},
{
"path": "hierarchical_attention_research/han_model/LICENSE",
"chars": 1069,
"preview": "MIT License\n\nCopyright (c) 2017 Matvey Ezhov\n\nPermission is hereby granted, free of charge, to any person obtaining a co"
},
{
"path": "hierarchical_attention_research/han_model/README.md",
"chars": 871,
"preview": "# Deep Text Classifier\n\nImplementation of document classification model described in [Hierarchical Attention Networks fo"
},
{
"path": "hierarchical_attention_research/han_model/bn_lstm.py",
"chars": 4984,
"preview": "# borrowed from https://github.com/OlavHN/bnlstm, updated for r1.0\n\nimport math\nimport numpy as np\nimport tensorflow as "
},
{
"path": "hierarchical_attention_research/han_model/bn_lstm_test.py",
"chars": 2786,
"preview": "import time\nimport uuid\nimport os\nimport numpy as np\nimport tensorflow as tf\nfrom tensorflow.python.ops.rnn import dynam"
},
{
"path": "hierarchical_attention_research/han_model/data_util.py",
"chars": 731,
"preview": "import numpy as np\n\n\ndef batch(inputs):\n batch_size = len(inputs)\n\n document_sizes = np.array([len(doc) for doc in inp"
},
{
"path": "hierarchical_attention_research/han_model/model_components.py",
"chars": 3423,
"preview": "import tensorflow as tf\nimport tensorflow.contrib.layers as layers\n\ntry:\n from tensorflow.contrib.rnn import LSTMStateT"
},
{
"path": "hierarchical_attention_research/han_model/requirements.txt",
"chars": 310,
"preview": "cymem==1.31.2\ncytoolz==0.8.2\ndill==0.2.7.1\nen-core-web-sm==2.0.0\nmsgpack-numpy==0.4.1\nmsgpack-python==0.5.6\nmurmurhash=="
},
{
"path": "hierarchical_attention_research/han_model/worker.py",
"chars": 7021,
"preview": "#!/usr/bin/env python3\nimport argparse\nparser = argparse.ArgumentParser()\nparser.add_argument('--task', default='yelp', "
},
{
"path": "hierarchical_attention_research/han_model/yelp.py",
"chars": 1316,
"preview": "import os\nimport pickle\n\ntrain_dir = os.path.join(os.path.curdir, 'yelp')\ndata_dir = os.path.join(train_dir, 'data')\n\nfo"
},
{
"path": "hierarchical_attention_research/han_model/yelp_prepare.py",
"chars": 2332,
"preview": "import argparse\nparser = argparse.ArgumentParser()\nparser.add_argument('review_path')\nargs = parser.parse_args()\n\nimport"
},
{
"path": "kaggle-classification/.gitignore",
"chars": 367,
"preview": "# Directories to save model checkpoints\nruns/\nmodel/*\nsaved_models/*\n\n# Byte-compiled / optimized / DLL files\n__pycache_"
},
{
"path": "kaggle-classification/README.md",
"chars": 2519,
"preview": "# Toxic Comment Classification Kaggle Challenge\n\nThis directory is a place to play around with solutions for the\n[Toxic "
},
{
"path": "kaggle-classification/__init__.py",
"chars": 0,
"preview": ""
},
{
"path": "kaggle-classification/bin/cancel-job",
"chars": 45,
"preview": "#!/bin/bash\n\ngcloud ml-engine jobs cancel $1\n"
},
{
"path": "kaggle-classification/bin/ls-jobs",
"chars": 78,
"preview": "#!/bin/bash\n\nDATE=`date '+%Y-%m-%d'`\n\ngcloud ml-engine jobs list | grep $DATE\n"
},
{
"path": "kaggle-classification/bin/run",
"chars": 2126,
"preview": "#!/bin/bash\n\n#\n# A script to train the kaggle model remotely using ml-engine.\n#\n# To run with default hyperparameters fr"
},
{
"path": "kaggle-classification/bin/run_keras.sh",
"chars": 1666,
"preview": "#!/bin/bash\n\n#\n# A script to train the kaggle model remotely using ml-engine.\n#\n# Setup Steps:\n# 1. Install the gcloud S"
},
{
"path": "kaggle-classification/bin/run_keras_local.sh",
"chars": 897,
"preview": "#!/bin/bash\n\nDATE=`date '+%Y%m%d_%H%M%S'`\nOUTPUT_PATH=runs/${DATE}\nINPUT_PATH=local_data\nLOG_PATH=${OUTPUT_PATH}/logs/\nC"
},
{
"path": "kaggle-classification/bin/run_local",
"chars": 397,
"preview": "#!/bin/bash\n\n#\n# A script to train the kaggle model locally.\n# Assumes that train.csv and test.csv are downloaded into t"
},
{
"path": "kaggle-classification/bin/stream-logs",
"chars": 50,
"preview": "#!/bin/bash\n\ngcloud ml-engine jobs stream-logs $1\n"
},
{
"path": "kaggle-classification/config.yaml",
"chars": 322,
"preview": "trainingInput:\n ## BASIC_GPU uses single NVIDIA Tesla K80 GPU.\n scaleTier: BASIC_GPU\n ## Custom scaleTier needed for "
},
{
"path": "kaggle-classification/gpu_config.yaml",
"chars": 322,
"preview": "trainingInput:\n ## BASIC_GPU uses single NVIDIA Tesla K80 GPU.\n scaleTier: BASIC_GPU\n ## Custom scaleTier needed for "
},
{
"path": "kaggle-classification/hparam_config.yaml",
"chars": 1044,
"preview": "trainingInput:\n ## BASIC_GPU uses single NVIDIA Tesla K80 GPU.\n scaleTier: BASIC_GPU\n ## Custom scaleTier needed for "
},
{
"path": "kaggle-classification/keras_hparam_config.yaml",
"chars": 937,
"preview": "trainingInput:\n ## BASIC_GPU uses single NVIDIA Tesla K80 GPU.\n pythonVersion: '3.5'\n scaleTier: BASIC_GPU\n ## Custo"
},
{
"path": "kaggle-classification/keras_trainer/__init__.py",
"chars": 0,
"preview": ""
},
{
"path": "kaggle-classification/keras_trainer/base_model.py",
"chars": 460,
"preview": "\"\"\"Base model class used by the ModelRunner\"\"\"\n\nfrom __future__ import absolute_import\nfrom __future__ import division\nf"
},
{
"path": "kaggle-classification/keras_trainer/cnn_with_attention.py",
"chars": 2610,
"preview": "\"\"\"Model class for a single layer CNN\"\"\"\n\nfrom __future__ import absolute_import\nfrom __future__ import division\nfrom __"
},
{
"path": "kaggle-classification/keras_trainer/custom_metrics.py",
"chars": 792,
"preview": "\"\"\"Custom metrics used by Keras models.\"\"\"\n\nfrom __future__ import absolute_import\nfrom __future__ import division\nfrom "
},
{
"path": "kaggle-classification/keras_trainer/model.py",
"chars": 10210,
"preview": "\"\"\"Classifiers for the Toxic Comment Classification Kaggle challenge, https://www.kaggle.com/c/jigsaw-toxic-comment-clas"
},
{
"path": "kaggle-classification/keras_trainer/rnn.py",
"chars": 1668,
"preview": "\"\"\"RNN\"\"\"\n\nfrom __future__ import absolute_import\nfrom __future__ import division\nfrom __future__ import print_function\n"
},
{
"path": "kaggle-classification/keras_trainer/single_layer_cnn.py",
"chars": 2245,
"preview": "\"\"\"Model class for a single layer CNN\"\"\"\n\nfrom __future__ import absolute_import\nfrom __future__ import division\nfrom __"
},
{
"path": "kaggle-classification/requirements.txt",
"chars": 635,
"preview": "absl-py==0.1.9\nastor==0.6.2\nbleach==3.3.0\ncertifi==2024.7.4\nchardet==3.0.4\ncomet-ml==1.0.8\nenum34==1.1.6\nfutures==3.1.1\n"
},
{
"path": "kaggle-classification/setup.py",
"chars": 717,
"preview": "from setuptools import find_packages\nfrom setuptools import setup\n\nREQUIRED_PACKAGES = [\n 'tflearn>=0.3.2', 'Keras==2"
},
{
"path": "kaggle-classification/trainer/__init__.py",
"chars": 0,
"preview": ""
},
{
"path": "kaggle-classification/trainer/model.py",
"chars": 12926,
"preview": "\"\"\"Classifiers for the Toxic Comment Classification Kaggle challenge, https://www.kaggle.com/c/jigsaw-toxic-comment-clas"
},
{
"path": "kaggle-classification/trainer/wikidata.py",
"chars": 5153,
"preview": "\"\"\"Class to encapsulate training and test data.\"\"\"\n\nimport numpy as np\nimport pandas as pd\nimport tensorflow as tf\nimpor"
},
{
"path": "model_evaluation/BiosBias Evaluation.ipynb",
"chars": 131002,
"preview": "{\n \"cells\": [\n {\n \"cell_type\": \"markdown\",\n \"metadata\": {},\n \"source\": [\n \"### Imports\"\n ]\n },\n {\n \"cell"
},
{
"path": "model_evaluation/Predict bias.ipynb",
"chars": 12561,
"preview": "{\n \"cells\": [\n {\n \"cell_type\": \"code\",\n \"execution_count\": 54,\n \"metadata\": {},\n \"outputs\": [],\n \"source\": [\n"
},
{
"path": "model_evaluation/README.md",
"chars": 3604,
"preview": "# Evaluation Pipeline for Text classification models.\n\nThis directory contains utilities to use a model deployed on clou"
},
{
"path": "model_evaluation/deploy_models.sh",
"chars": 1109,
"preview": "#!/bin/bash\n\nMODEL_DIRS='gs://conversationai-models/tf_trainer_runs/fprost/tf_gru_attention_multiclass_biosbias_glove/20"
},
{
"path": "model_evaluation/few_shot_learning_baseline_evaluation.ipynb",
"chars": 44937,
"preview": "{\n \"cells\": [\n {\n \"cell_type\": \"code\",\n \"execution_count\": 1,\n \"metadata\": {},\n \"outputs\": [],\n \"source\": [\n "
},
{
"path": "model_evaluation/input_fn_example.py",
"chars": 13698,
"preview": "# Copyright 2016 The TensorFlow Authors. All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"L"
},
{
"path": "model_evaluation/jigsaw_evaluation_pipeline.ipynb",
"chars": 112416,
"preview": "{\n \"cells\": [\n {\n \"cell_type\": \"markdown\",\n \"metadata\": {\n \"colab_type\": \"text\",\n \"id\": \"-YibCLoSLRHp\"\n },\n"
},
{
"path": "model_evaluation/requirements.txt",
"chars": 243,
"preview": "google-api-python-client==1.7.3\nMarkdown==2.6.11\nnltk==3.9\nnumpy==1.22.0\npandas==0.22.0\nrequests==2.32.2\nseaborn==0.8.1\n"
},
{
"path": "model_evaluation/score_bias_data.sh",
"chars": 998,
"preview": "#!/bin/bash\n\nMODEL_NAMES='tf_trainer_tf_gru_attention_multiclass_biosbias_glove:v_20190315_113247,'\\\n'tf_trainer_tf_gru_"
},
{
"path": "model_evaluation/score_scrubbed_data.sh",
"chars": 703,
"preview": "#!/bin/bash\n\nMODEL_NAMES='tf_trainer_tf_gru_attention_multiclass_biosbias_glove:v_20190315_113045,'\\\n'tf_trainer_tf_gru_"
},
{
"path": "model_evaluation/score_test_data.py",
"chars": 5845,
"preview": "# Copyright 2016 The TensorFlow Authors. All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"L"
},
{
"path": "model_evaluation/utils_export/__init__.py",
"chars": 0,
"preview": ""
},
{
"path": "model_evaluation/utils_export/dataset.py",
"chars": 11325,
"preview": "# Copyright 2016 The TensorFlow Authors. All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"L"
},
{
"path": "model_evaluation/utils_export/dataset_test.py",
"chars": 5826,
"preview": "# Copyright 2016 The TensorFlow Authors. All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"L"
},
{
"path": "model_evaluation/utils_export/deploy_list_models.py",
"chars": 6836,
"preview": "# Copyright 2016 The TensorFlow Authors. All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"L"
},
{
"path": "model_evaluation/utils_export/utils_cloudml.py",
"chars": 9288,
"preview": "# Copyright 2016 The TensorFlow Authors. All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"L"
},
{
"path": "model_evaluation/utils_export/utils_cloudml_test.py",
"chars": 4111,
"preview": "# Copyright 2016 The TensorFlow Authors. All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"L"
},
{
"path": "model_evaluation/utils_export/utils_tfrecords.py",
"chars": 5938,
"preview": "# Copyright 2016 The TensorFlow Authors. All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"L"
},
{
"path": "model_evaluation/utils_export/utils_tfrecords_test.py",
"chars": 3045,
"preview": "# Copyright 2016 The TensorFlow Authors. All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"L"
},
{
"path": "travis_blase_test_support/bazel_0.18.1-linux-x86_64.deb.sha256",
"chars": 96,
"preview": "4c2cd0a71ab1b65753aeb757af36bd6ebde9da4e53183525a1e1849c2542fdda bazel_0.18.1-linux-x86_64.deb\n"
}
]
About this extraction
This page contains the full source code of the conversationai/conversationai-models GitHub repository, extracted and formatted as plain text for AI agents and large language models (LLMs). The extraction includes 196 files (953.6 KB), approximately 343.4k tokens, and a symbol index with 359 extracted functions, classes, methods, constants, and types. Use this with OpenClaw, Claude, ChatGPT, Cursor, Windsurf, or any other AI tool that accepts text input. You can copy the full output to your clipboard or download it as a .txt file.
Extracted by GitExtract — free GitHub repo to text converter for AI. Built by Nikandr Surkov.