Showing preview only (1,829K chars total). Download the full file or copy to clipboard to get everything.
Repository: mckinziebrandon/DeepChatModels
Branch: master
Commit: 4fef8a6ce00d
Files: 144
Total size: 1.7 MB
Directory structure:
gitextract_wj3qb2ee/
├── .gitattributes
├── .gitignore
├── .travis.yml
├── LICENSE.md
├── README.md
├── chatbot/
│ ├── __init__.py
│ ├── _models.py
│ ├── components/
│ │ ├── __init__.py
│ │ ├── base/
│ │ │ ├── __init__.py
│ │ │ └── _rnn.py
│ │ ├── bot_ops.py
│ │ ├── decoders.py
│ │ ├── embedder.py
│ │ ├── encoders.py
│ │ └── input_pipeline.py
│ ├── dynamic_models.py
│ ├── globals.py
│ └── legacy/
│ ├── __init__.py
│ ├── _decode.py
│ ├── _train.py
│ └── legacy_models.py
├── configs/
│ ├── example_attention.yml
│ ├── example_cornell.yml
│ ├── example_reddit.yml
│ ├── example_ubuntu.yml
│ ├── ubuntu_basic.yml
│ └── website_config.yml
├── data/
│ ├── __init__.py
│ ├── _dataset.py
│ ├── data_helper.py
│ ├── dataset_wrappers.py
│ ├── reddit_preprocessor.py
│ └── regex.py
├── main.py
├── notebooks/
│ ├── Analysis.ipynb
│ ├── DataVizUtils.ipynb
│ ├── README.md
│ ├── RedditPipelineAndVisualization.ipynb
│ ├── TensorFlow Notes.ipynb
│ ├── __init__.py
│ └── ubuntu_reformat.ipynb
├── requirements.txt
├── setup.py
├── tests/
│ ├── __init__.py
│ ├── test_config.py
│ ├── test_config.yml
│ ├── test_data/
│ │ ├── train_from.txt
│ │ ├── train_from.txt.ids121
│ │ ├── train_to.txt
│ │ ├── train_to.txt.ids121
│ │ ├── trainvoc121_seq15.tfrecords
│ │ ├── trainvoc121_seq20.tfrecords
│ │ ├── valid_from.txt
│ │ ├── valid_from.txt.ids121
│ │ ├── valid_to.txt
│ │ ├── valid_to.txt.ids121
│ │ ├── validvoc121_seq15.tfrecords
│ │ ├── validvoc121_seq20.tfrecords
│ │ └── vocab121.txt
│ ├── test_data.py
│ ├── test_dynamic_models.py
│ ├── test_legacy_models.py
│ └── utils.py
├── utils/
│ ├── __init__.py
│ ├── bot_freezer.py
│ └── io_utils.py
└── webpage/
├── __init__.py
├── app.yaml
├── config.py
├── deepchat/
│ ├── __init__.py
│ ├── main/
│ │ ├── __init__.py
│ │ ├── errors.py
│ │ ├── forms.py
│ │ └── views.py
│ ├── models.py
│ ├── static/
│ │ ├── assets/
│ │ │ ├── plots/
│ │ │ │ ├── accuracy.json
│ │ │ │ ├── configs.json
│ │ │ │ ├── training.json
│ │ │ │ └── validation.json
│ │ │ └── test_data/
│ │ │ ├── train_from.txt
│ │ │ ├── train_to.txt
│ │ │ ├── valid_from.txt
│ │ │ ├── valid_to.txt
│ │ │ └── vocab121.txt
│ │ ├── css/
│ │ │ ├── style_modifications.css
│ │ │ └── theme.css
│ │ ├── js/
│ │ │ ├── bootstrapify.js
│ │ │ ├── chat_processing.js
│ │ │ ├── jqBootstrapValidation.js
│ │ │ └── user_form.js
│ │ └── vendor/
│ │ ├── bootstrap-3.3.7-dist/
│ │ │ ├── css/
│ │ │ │ ├── bootstrap-theme.css
│ │ │ │ └── bootstrap.css
│ │ │ └── js/
│ │ │ ├── bootstrap.js
│ │ │ └── npm.js
│ │ ├── font-awesome/
│ │ │ ├── css/
│ │ │ │ └── font-awesome.css
│ │ │ ├── fonts/
│ │ │ │ └── FontAwesome.otf
│ │ │ ├── less/
│ │ │ │ ├── animated.less
│ │ │ │ ├── bordered-pulled.less
│ │ │ │ ├── core.less
│ │ │ │ ├── fixed-width.less
│ │ │ │ ├── font-awesome.less
│ │ │ │ ├── icons.less
│ │ │ │ ├── larger.less
│ │ │ │ ├── list.less
│ │ │ │ ├── mixins.less
│ │ │ │ ├── path.less
│ │ │ │ ├── rotated-flipped.less
│ │ │ │ ├── screen-reader.less
│ │ │ │ ├── stacked.less
│ │ │ │ └── variables.less
│ │ │ └── scss/
│ │ │ ├── _animated.scss
│ │ │ ├── _bordered-pulled.scss
│ │ │ ├── _core.scss
│ │ │ ├── _fixed-width.scss
│ │ │ ├── _icons.scss
│ │ │ ├── _larger.scss
│ │ │ ├── _list.scss
│ │ │ ├── _mixins.scss
│ │ │ ├── _path.scss
│ │ │ ├── _rotated-flipped.scss
│ │ │ ├── _screen-reader.scss
│ │ │ ├── _stacked.scss
│ │ │ ├── _variables.scss
│ │ │ └── font-awesome.scss
│ │ └── jquery/
│ │ └── jquery.easing.1.3.js
│ ├── templates/
│ │ ├── 404.html
│ │ ├── about.html
│ │ ├── admin/
│ │ │ └── index.html
│ │ ├── base.html
│ │ ├── index.html
│ │ ├── macros/
│ │ │ └── forms.html
│ │ └── plots.html
│ └── web_bot.py
├── manage.py
├── migrations/
│ ├── README
│ ├── alembic.ini
│ ├── env.py
│ ├── script.py.mako
│ └── versions/
│ └── 236b966ecd2f_.py
├── requirements.txt
├── runtime.txt
└── tests/
├── __init__.py
├── test_database.py
└── test_simple.py
================================================
FILE CONTENTS
================================================
================================================
FILE: .gitattributes
================================================
notebooks/* linguist-vendored
*.pb filter=lfs diff=lfs merge=lfs -text
**/*.pb filter=lfs diff=lfs merge=lfs -text
================================================
FILE: .gitignore
================================================
**/images/hidden/*
# Note: Any leading '**' need to be followed by a '/'.
# Seems like '**/thing_to_ignore_always[/*]' covers all bases.
**/.*.swp
**/.log
**/tmp*
**/.idea/*
**/.ipynb_checkpoints/*
**/__pycache__/*
**/*.pyc
**/*.h5/*
**/out/**
.cache/**
**/.cache/**
**/pretrained/**
**/notebooks/*.txt
.DS*
**/.DS*
# Muh training scripts.
overnight.sh
experimental.sh
tests/macros/**
compact.sh
# Ignore collection of useful tf files that once
# existed in tf repo but have since been removed.
# They will be missed.
**/reference/*
**/saved_train_data/**
/webpage/data_dev.db
/notebooks/reference/
/notebooks/.ipynb_checkpoints/
# TEMP:
**/frozen_models/*
**/ubuntu/*
**/data_dev.db
**/data_test.db
/webpage/deepchat/static/assets/frozen_models/
/configs/test_lstm.yml
/configs/cornell.yml
/ae.py
**/individual_tb_plots/*
================================================
FILE: .travis.yml
================================================
language: python
dist: trusty
sudo: True
python:
- "3.5"
- "pypy3"
before_install:
- sudo apt-get update
- sudo apt-get install python-matplotlib python3-matplotlib python-tk python3-tk libtcmalloc-minimal4
install:
# code below is taken from http://conda.pydata.org/docs/travis.html
# We do this conditionally because it saves us some downloading if the
# version is the same.
- if [[ "$TRAVIS_PYTHON_VERSION" == "2.7" ]]; then
wget https://repo.continuum.io/miniconda/Miniconda-latest-Linux-x86_64.sh -O miniconda.sh;
else
wget https://repo.continuum.io/miniconda/Miniconda3-latest-Linux-x86_64.sh -O miniconda.sh;
fi
- bash miniconda.sh -b -p $HOME/miniconda
- export PATH="$HOME/miniconda/bin:$PATH"
- hash -r
- conda config --set always_yes yes --set changeps1 no
- conda update -q conda
# Useful for debugging any issues with conda
- conda info -a
- conda create -q -n test-environment python=$TRAVIS_PYTHON_VERSION numpy scipy matplotlib pandas pytest h5py
- source activate test-environment
# - pip3 install six numpy wheel
# - pip3 install -U virtualenv
- pip install -r requirements.txt
# - pip install -U tensorflow
- conda install Pillow
- pip install tensorflow
script:
- pytest
================================================
FILE: LICENSE.md
================================================
MIT License
Copyright (c) 2017 Brandon McKinzie
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
================================================
FILE: README.md
================================================
# Conversation Models in Tensorflow
Notes to visitors:
* I've just shut down the website indefinitely. I ran out of my credits on Google Cloud four days ago, and have since been billed 30+ dollars which isn't something I can sustain. To run locally, assuming you satisfy all requirements in webpage/requirements.txt, just run `python3 manage.py runserver`. If you're unfamiliar with running flask this way, see the docs for [Flask-Script](https://flask-script.readthedocs.io/en/latest/). Sorry for any inconvenience!
* Please post any feedback or bug reports as an issue and I will respond within 24 hours.
* I haven't gotten around to providing scripts for downloading the datasets. Until then, I've uploaded most of the data [here on my MEGA account](https://mega.nz/#F!xrRTwSzY!by9K42n_I_oi5T_DKP-xTA). It is organized the same way I have it locally.
* Don't let the simple web bots fool you -- this project supports more advanced techniques than the single-layer encoder-decoder models on the website. To see the parameters that are immediately available/supported for tweaking, checkout chatbot/globals.py, which contains the default configuration dictionary. Any value that you don't specify will assume the default value from that file, which tends toward safe, conservative, simple values.
* Contributions are more than welcome. I do my best to follow PEP8 and I'd prefer contributions do the same.
* Please note that the bulk of this project was written with tensorflow version 1.0 (before tf.contrib.seq2seq existed) and 1.1, but updates have been made since version 1.2 that appeared to break the project. I have not been able to do tests regarding how version 1.4 is faring with the project, but I intend to do so soon.
## Table of Contents
* [Project Overview](#brief-overview-of-completed-work)
* [Datasets](#datasets)
* [Models](#models)
* [Website](#website)
* [Model Components](#model-components)
* [Input Pipeline](#the-input-pipeline)
* [Reference Material](#reference-material)
## Project Overview
As of May 9, 2017, the main packages of the project are as follows:
* __chatbot__: The conversation model classes, the structural components of the models (encoders, decoders, cells, etc.), and various operations for easy saving/loading/evaluation.
* __data__: The core Dataset class that handles all data formatting, file paths, and utilities for interacting with the data, as well as some preprocessing scripts and helper classes for cleaning data. The data itself (for space reasons) is not included in the repository. See the link to my MEGA account to download the data in the same format as on my local machine.
* __notebooks__: Jupyter notebooks showcasing data visualization examples, data preprocessing techniques, and conversation model exploration.
* __webpage__: Flask web application hosted on Google App Engine, where you can talk with a handful of chatbots and interact with plots. You can run it locally, after installing its requirements (mostly Flask packages), by running the following command within the webpage directory: `python3 manage.py runserver`
From a user/developer standpoint, this project offers a cleaner interface for tinkering with sequence-to-sequence models. The ideal result is a chatbot API with the readability of [Keras](https://keras.io/), but with a degree of flexibility closer to [TensorFlow](https://www.tensorflow.org/).
On the 'client' side, playing with model parameters and running them is as easy as making a configuration (yaml) file, opening a python interpreter, and issuing a handful of commands. The following snippet, for example, is all that is needed to start training on the cornell dataset (after downloading it of course) with your configuration:
```python
import data
import chatbot
from utils import io_utils
# Load config dictionary with the flexible parse_config() function,
# which can handle various inputs for building your config dictionary.
config = io_utils.parse_config(config_path='path_to/my_config.yml')
dataset = getattr(data, config['dataset'])(config['dataset_params'])
bot = getattr(chatbot, config['model'])(dataset, config)
bot.train()
```
This is just one way to interface with the project. For example, the user can also pass in parameters via command-line args, which will be merged with any config files they specify as well (precedence given to command-line args if conflict). You can also pass in the location of a previously saved chatbot to resume training it or start a conversation. See `main.py` for more details.
### Datasets
* [Ubuntu Dialogue Corpus](https://arxiv.org/pdf/1506.08909.pdf): pre-processing approach can be seen in the ubuntu\_reformat.ipynb in the notebooks folder. The intended use for the dataset is response ranking for multi-turn dialogues, but I've taken the rather simple approach of extracting utterance-pairs and interpreting them as single-sentence to single-response, which correspond with inputs for the encoder and decoder, respectively, in the models.
* [Cornell Movie-Dialogs](https://www.cs.cornell.edu/~cristian/Cornell_Movie-Dialogs_Corpus.html): I began with [this preprocessed](https://github.com/suriyadeepan/datasets/tree/master/seq2seq/cornell_movie_corpus) version of the Cornell corpus, and made minor modifications to reduce noise.
* [Reddit comments](https://www.reddit.com/r/datasets/comments/3bxlg7/i_have_every_publicly_available_reddit_comment/): Approx. 1.7 billion reddit comments. Currently working on preprocessing and reducing this massive dataset to suitable format for training conversation models. Will post processed dataset download links when complete!
### Models
* DynamicBot: uses a more object-oriented approach offered by custom classes in model_components.py. The result is faster online batch-concatenated embedding and a more natural approach to chatting. It makes use of the (fantastic) new python API in the TensorFlow 1.0 release, notably the dynamic_rnn. It also adheres to good variable scoping practice and common tensorflow conventions I've observed in the documentation and source code, which has nice side effects such as clean graph visualizations in TensorBoard.
* SimpleBot: Simplified bucketed model based on the more complicated 'ChatBot' model below. Although it is less flexible in customizing bucket partitions and uses a sparse softmax over the full vocabulary instead of sampling, it is far more transparent in its implementation. It makes minimal use of tf.contrib, as opposed to ChatBot, and more or less is implemented from "scratch," in the sense of primarily relying on the basic tensorflow methods. If you're new to TensorFlow, it may be useful to read through its implementation to get a feel for common conventions in tensorflow programming, as it was the result of me reading the source code of all methods in ChatBot and writing my own more compact interpretation.
* ChatBot: Extended version of the model described in [this TensorFlow tutorial](https://www.tensorflow.org/tutorials/seq2seq). Architecture characteristics: bucketed inputs, decoder uses an attention mechanism (see page 69 of my [notes](http://mckinziebrandon.me/assets/pdf/CondensedSummaries.pdf), and inputs are embedded with the simple functions provided in the tf.contrib library. Also employs a sampled softmax loss function to allow for larger vocabulary sizes (page 67 of [notes](http://mckinziebrandon.me/assets/pdf/CondensedSummaries.pdf)). Additional comments: due to the nature of bucketed models, it takes much longer to create the model compared to others. The main bottleneck appears to be the size of the largest bucket and how the gradient ops are created based on the bucket sizes.
### Website
The webpage directory showcases a simple and space-efficient way for deploying your TensorFlow models in a Flask application. The models are 'frozen' -- all components not needed for chatting (e.g. optimizers) are removed and all remaining variables are converted to constants. When the user clicks on a model name, a REST API for that model is created. When the user enters a sentence into the form, an (AJAX) POST request is issued, where the response is the chatbot's response sentence. For more details on the REST API, see [views.py](https://github.com/mckinziebrandon/DeepChatModels/blob/master/webpage/deepchat/main/views.py).
The Flask application follows best practices, such as using blueprints for instantiating applications, different databases depending on the application environment (e.g. development or production), and more.
## Model Components
Here I'll go into more detail on how the models are constructed and how they can be visualized. This section is a work in progress and not yet complete.
### The Input Pipeline
Instead of using the ```feed_dict``` argument to input data batches to the model, it is *substantially* faster to encode the input information and preprocessing techniques in the graph structure itself. This means we don't feed the model anything at training time. Rather the model uses a sequence of queues to access the data from files in google's protobuf format, decode the files into tensor sequences, dynamically batch and pad the sequences, and then feed these batches to the embedding decoder. All within the graph structure. Furthermore, this data processing is coordinated by multiple threads in parallel. We can use tensorboard (and best practices for variable scoping) to visualize this type of pipeline at a high level.
<img alt="input_pipeline" src="http://i.imgur.com/xrLqths.png" width="400" align="left">
<img alt="input_pipeline_expanded" src="http://i.imgur.com/xMWB7oL.png" width="400">
<br/>
<br/>
_(More descriptions coming soon!)_
## Reference Material
A lot of research has gone into these models, and I've been documenting my notes on the most "important" papers here in the last section of [my deep learning notes here](http://mckinziebrandon.me/assets/pdf/CondensedSummaries.pdf). The notes also include how I've tried translating the material from the papers into TensorFlow code. I'll be updating that as the ideas from more papers make their way into this project.
* Papers:
* [Sequence to Sequence Learning with Neural Networks. Sutskever et al., 2014.](https://papers.nips.cc/paper/5346-sequence-to-sequence-learning-with-neural-networks.pdf)
* [On Using Very Large Target Vocabulary for Neural Machine Translation. Jean et al., 2014.](https://arxiv.org/pdf/1412.2007.pdf)
* [Neural Machine Translation by Jointly Learning to Align and Translate. Bahdanau et al., 2016](https://arxiv.org/pdf/1409.0473.pdf)
* [Effective Approaches to Attention-based Neural Machine Translation. Luong et al., 2015](https://arxiv.org/pdf/1508.04025.pdf)
* Online Resources:
* [Metaflow blog](https://blog.metaflow.fr/): Incredibly helpful tensorflow (r1.0) tutorials.
* [Flask Mega-Tutorial](https://blog.miguelgrinberg.com/post/the-flask-mega-tutorial-part-i-hello-world): For the webpage parts of the project.
* [Code for "Massive Exploration of Neural Machine Translation Architectures"](https://github.com/google/seq2seq): Main inspiration for switching to yaml configs and pydoc.locate. Paper is great as well.
* [Tensorflow r1.0 API](https://www.tensorflow.org/api_docs/): (Of course). The new python API guides are great.
================================================
FILE: chatbot/__init__.py
================================================
from chatbot import globals
from chatbot.components.base._rnn import *
from chatbot.components.bot_ops import dynamic_sampled_softmax_loss
from chatbot.components.decoders import *
from chatbot.components.embedder import *
from chatbot.components.encoders import *
from chatbot.dynamic_models import DynamicBot
from chatbot.legacy.legacy_models import ChatBot, SimpleBot

# Bug fix: was ['Chatbot, SimpleBot', 'DynamicBot'] -- a misplaced quote
# merged 'ChatBot' and 'SimpleBot' into one (misspelled) string, so
# 'from chatbot import *' exported neither model class.
__all__ = ['ChatBot', 'SimpleBot', 'DynamicBot']
================================================
FILE: chatbot/_models.py
================================================
"""Abstract classes."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import copy
import os
import random
import shutil
import subprocess

import numpy as np
import tensorflow as tf
import yaml
from tensorflow.contrib.tensorboard.plugins import projector
from tensorflow.python.client import device_lib

from chatbot.components import *
from chatbot.globals import DEFAULT_FULL_CONFIG, OPTIMIZERS
from utils import io_utils
def gpu_found():
    """Returns True if tensorflow finds at least 1 GPU."""
    gpu_devices = [d for d in device_lib.list_local_devices()
                   if d.device_type == 'GPU']
    return len(gpu_devices) > 0
class Model(object):
    """Superclass of all subsequent model classes.

    Handles boilerplate shared by all bots: merging user params with
    defaults, checkpoint-directory management, session/GPU setup,
    saving/restoring, and freezing the graph for deployment.
    """

    def __init__(self, logger, dataset, params):
        """
        Args:
            logger: returned by getLogger & called by subclasses. Passed
                here so we know what object to use for info/warn/error.
            dataset: object that inherits from data.Dataset.
            params: (dict) user-specified params that override those in
                DEFAULT_FULL_CONFIG above.
        """
        self.log = logger
        # Stored under a plain string key so __getattr__ below can lazily
        # resolve config values as attributes.
        self.__dict__['__params'] = Model.fill_params(dataset, params)

        # Make particularly useful ckpt directories for website configurations.
        if 'website_config' in self.ckpt_dir:
            self.ckpt_dir = Model._build_hparam_path(
                ckpt_dir=self.ckpt_dir,
                num_layers=self.num_layers,
                max_seq_len=self.max_seq_len)
            # Bug fix: extra logging args are %-format parameters; the old
            # call had no placeholder, so the new path was never shown.
            self.log.info("New ckpt dir: %s", self.ckpt_dir)

        # Configure gpu options if we are using one.
        if gpu_found():
            self.log.info("GPU Found. Setting allow_growth to True.")
            gpu_config = tf.ConfigProto()
            gpu_config.gpu_options.allow_growth = True
            self.sess = tf.Session(config=gpu_config)
        else:
            self.log.warning("GPU not found. Not recommended for training.")
            self.sess = tf.Session()

        with self.graph.name_scope(tf.GraphKeys.SUMMARIES):
            self.global_step = tf.Variable(initial_value=0, trainable=False)
            self.learning_rate = tf.constant(self.learning_rate)

        # Create ckpt_dir if user hasn't already (if exists, has no effect).
        # os.makedirs is portable, unlike shelling out to 'mkdir -p'.
        os.makedirs(self.ckpt_dir, exist_ok=True)
        self.projector_config = projector.ProjectorConfig()

        # Good practice to set as None in constructor.
        self.loss = None
        self.file_writer = None
        self.merged = None
        self.train_op = None
        self.saver = None

    def compile(self):
        """Configure training process and initialize model. Inspired by Keras.

        Either restore model parameters or create fresh ones.
          - Checks if we can both (1) find a checkpoint state, and (2) a
            valid V1/V2 checkpoint path.
          - If we can't, then just re-initialize model with fresh params.
        """
        self.log.info("Checking for checkpoints . . .")
        checkpoint_state = tf.train.get_checkpoint_state(self.ckpt_dir)
        if not self.reset_model and checkpoint_state \
                and tf.train.checkpoint_exists(
                    checkpoint_state.model_checkpoint_path):
            print("Reading model parameters from",
                  checkpoint_state.model_checkpoint_path)
            self.file_writer = tf.summary.FileWriter(self.ckpt_dir)
            self.saver = tf.train.Saver(tf.global_variables())
            self.saver.restore(
                self.sess, checkpoint_state.model_checkpoint_path)
        else:
            print("Created model with fresh parameters:\n\t", self.ckpt_dir)
            # Recursively delete all files in output but keep directories
            # (portable stdlib replacement for 'find ... -exec rm').
            for root, _, files in os.walk(self.ckpt_dir):
                for fname in files:
                    os.remove(os.path.join(root, fname))
            self.file_writer = tf.summary.FileWriter(self.ckpt_dir)
            # Add operation for calling all variable initializers.
            init_op = tf.global_variables_initializer()
            # Construct saver (adds save/restore ops to all).
            self.saver = tf.train.Saver(tf.global_variables(), max_to_keep=3)
            # Add the fully-constructed graph to the event file.
            self.file_writer.add_graph(self.sess.graph)
            # Initialize all model variables.
            self.sess.run(init_op)

        # Store model config in ckpt dir for easy loading later.
        with open(os.path.join(self.ckpt_dir, 'config.yml'), 'w') as f:
            yaml.dump(getattr(self, "params"), f, default_flow_style=False)

    def save(self, summaries=None):
        """Save a checkpoint of all global variables, plus summaries if given.

        Args:
            summaries: merged summary instance returned by session.run.

        Raises:
            ValueError: if called before compile() has created a saver.
        """
        if self.saver is None:
            raise ValueError("Tried saving model before defining a saver.")
        ckpt_fname = os.path.join(
            self.ckpt_dir, "{}.ckpt".format(self.data_name))
        # Saves the state of all global variables in a ckpt file.
        self.saver.save(self.sess, ckpt_fname, global_step=self.global_step)
        if summaries is not None:
            self.file_writer.add_summary(
                summaries, self.global_step.eval(self.sess))
        else:
            self.log.info("Save called without summaries.")

    def close(self, save_current=True):
        """Call this when the training session is terminated.

        - Saves the current model/checkpoint state (if save_current).
        - Freezes the model into a protobuf file in self.ckpt_dir.
        - Closes context managers for file_writing and session.
        """
        # First save the checkpoint as usual.
        if save_current:
            self.save()
        # Freeze me, for I am infinite.
        self.freeze()
        # Be a responsible bot and close my file writer.
        self.file_writer.close()
        # Formally exit the session, farewell to all.
        self.sess.close()

    @property
    def graph(self):
        """The tf.Graph owned by this model's session."""
        return self.sess.graph

    @staticmethod
    def fill_params(dataset, params):
        """For now, essentially just returns (already parsed) params,
        but placed here in case I want to customize later (likely).
        """
        # Replace (string) specification of dataset with the actual instance.
        params['dataset'] = dataset
        params['dataset_params']['data_name'] = dataset.name
        if params['model_params']['ckpt_dir'] == 'out':
            params['model_params']['ckpt_dir'] += '/' + dataset.name
        # Define alias in case older models still use it.
        params['model_params']['is_chatting'] = \
            params['model_params']['decode']
        return params

    def freeze(self):
        """Freeze the model into a protobuf file in self.ckpt_dir.

        Useful for e.g. deploying model on website. No-op (with a warning)
        if no 'freezer' collection exists in the graph.
        """
        if not tf.get_collection('freezer'):
            self.log.warning('No freezer found. Not saving a frozen model.')
            return
        # Note: output_node_names is only used to tell tensorflow what it can
        # throw away in the frozen graph (e.g. training ops).
        # Bug fix: rstrip(':0') strips a *character set*, so tensor names
        # ending in '0' (e.g. 'logits0:0' -> 'logits') were mangled.
        # Splitting on ':' drops only the output index.
        output_node_names = ",".join(
            [t.name.split(':')[0] for t in tf.get_collection('freezer')])
        self.log.info('Output node names: %r', output_node_names)
        # Save a graph with only the bare necessities for chat sessions.
        output_graph_def = tf.graph_util.convert_variables_to_constants(
            self.sess, self.graph.as_graph_def(), output_node_names.split(','))
        output_fname = os.path.join(self.ckpt_dir, "frozen_model.pb")
        with tf.gfile.GFile(output_fname, 'wb') as f:
            f.write(output_graph_def.SerializeToString())
        print("%d ops in the final graph." % len(output_graph_def.node))
        # Keep the vocabulary next to the frozen model (portable copy).
        shutil.copy(self.dataset.paths['vocab'], self.ckpt_dir)

    def __getattr__(self, name):
        """Resolve unknown attributes from the stored params dictionary.

        Lookup order: the synthetic 'params' view, then top-level config
        keys, then any nested (dict) config section.

        Raises:
            AttributeError: if name is found nowhere in the config.
        """
        if name == 'params':
            camel_case = self.data_name.title().replace('_', '')
            replace_dict = {'dataset': "data." + camel_case}
            return {**self.__dict__['__params'], **replace_dict}
        elif name in DEFAULT_FULL_CONFIG:  # Requesting a top-level key.
            return self.__dict__['__params'][name]
        else:
            for k in DEFAULT_FULL_CONFIG.keys():
                if not isinstance(self.__dict__['__params'][k], dict):
                    continue
                if name in self.__dict__['__params'][k]:
                    return self.__dict__['__params'][k][name]
        raise AttributeError(name)

    @staticmethod
    def _build_hparam_path(ckpt_dir, **kwargs):
        """Returns relative path built from args for descriptive checkpointing.

        The new path becomes ckpt_dir appended with directories named by
        kwargs (in sorted key order):
          - If a given kwargs[key] is a string, that is set as the
            appended dir name.
          - Otherwise, it gets formatted, e.g. for key='learning_rate' it
            may become 'learning_rate_0_001'.

        Returns:
            ckpt_dir followed by sequentially appended directories,
            named by kwargs.
        """
        kwargs = copy.deepcopy(kwargs)
        new_ckpt_dir = ckpt_dir
        for key in sorted(kwargs):
            if not isinstance(kwargs[key], str):
                # Dots would break directory names, so e.g. 0.001 -> 0_001.
                dir_name = key + "_" + str(kwargs[key]).replace('.', '_')
            else:
                dir_name = kwargs[key]
            new_ckpt_dir = os.path.join(new_ckpt_dir, dir_name)
        return new_ckpt_dir
class BucketModel(Model):
"""Abstract class. Any classes that extend BucketModel just need to customize their
graph structure in __init__ and implement the step(...) function.
The real motivation for making this was to be able to use the true Model
abstract class for all classes in this directory, bucketed or not, r1.0 or r0.12.
"""
def __init__(self, logger, buckets, dataset, params):
self.buckets = buckets
super(BucketModel, self).__init__(
logger=logger,
dataset=dataset,
params=params)
def compile(self):
""" Configure training process. Name was inspired by Keras. <3 """
if self.losses is None:
raise ValueError("Tried compiling model before defining losses.")
print("Configuring training operations. This may take some time . . . ")
# Note: variables are trainable=True by default.
params = tf.trainable_variables()
# train_op will store the parameter (S)GD train_op.
self.apply_gradients = []
optimizer = OPTIMIZERS[self.optimizer](self.learning_rate)
for b in range(len(self.buckets)):
gradients = tf.gradients(self.losses[b], params)
# Gradient clipping is actually extremely simple, it basically just
# checks if L2Norm(gradients) > max_gradient, and if it is,
# it returns (gradients / L2Norm(gradients)) * max_grad.
clipped_gradients, _ = tf.clip_by_global_norm(
gradients, self.max_gradient)
self.apply_gradients.append(optimizer.apply_gradients(
zip(clipped_gradients, params),global_step=self.global_step))
super(BucketModel, self).compile()
def check_input_lengths(self, inputs, expected_lengths):
"""
Raises:
ValueError: if length of encoder_inputs, decoder_inputs, or
target_weights disagrees with bucket size for the specified bucket_id.
"""
for input, length in zip(inputs, expected_lengths):
if len(input) != length:
raise ValueError("Input length doesn't match bucket size:"
" %d != %d." % (len(input), length))
def get_batch(self, data, bucket_id):
"""Get a random batch of data from the specified bucket, prepare for step.
Args:
data: tuple of len(self.buckets). data[bucket_id] == [source_ids, target_ids]
bucket_id: integer, which bucket to get the batch for.
Returns:
The triple (encoder_inputs, decoder_inputs, target_weights) for
the constructed batch that has the proper format to call step(...) later.
"""
encoder_size, decoder_size = self.buckets[bucket_id]
encoder_inputs, decoder_inputs = [], []
# Get a random batch of encoder and decoder inputs from data,
# pad them if needed, reverse encoder inputs and add GO to decoder.
for _ in range(self.batch_size):
encoder_input, decoder_input = random.choice(data[bucket_id])
# BasicEncoder inputs are padded and then reversed.
encoder_pad = [io_utils.PAD_ID] * (encoder_size - len(encoder_input))
encoder_inputs.append(list(reversed(encoder_input + encoder_pad)))
# DynamicDecoder inputs get an extra "GO" symbol, and are padded then.
decoder_pad= [io_utils.PAD_ID] * (decoder_size - len(decoder_input) - 1)
decoder_inputs.append([io_utils.GO_ID] + decoder_input + decoder_pad)
# Define some small helper functions before we re-index & weight.
def inputs_to_unit(uid, inputs):
""" Return re-indexed version of inputs array. Description in params below.
:param uid: index identifier for input timestep/unit/node of interest.
:param inputs: single batch of data; inputs[i] is i'th sentence.
:return: re-indexed version of inputs as numpy array.
"""
return np.array([inputs[i][uid] for i in range(self.batch_size)], dtype=np.int32)
batch_encoder_inputs = [inputs_to_unit(i, encoder_inputs) for i in range(encoder_size)]
batch_decoder_inputs = [inputs_to_unit(i, decoder_inputs) for i in range(decoder_size)]
batch_weights = list(np.ones(shape=(decoder_size, self.batch_size), dtype=np.float32))
# Set weight for the final decoder unit to 0.0 for all batches.
for i in range(self.batch_size):
batch_weights[-1][i] = 0.0
# Also set any decoder-input-weights to 0 that have PAD
# as target decoder output.
for unit_id in range(decoder_size - 1):
ids_with_pad_target = [b for b in range(self.batch_size)
if decoder_inputs[b][unit_id+1] == io_utils.PAD_ID]
batch_weights[unit_id][ids_with_pad_target] = 0.0
return batch_encoder_inputs, batch_decoder_inputs, batch_weights
def train(self, dataset):
""" Train chatbot. """
from chatbot.legacy._train import train
train(self, dataset)
def decode(self):
    """Start an interactive chat session (user <-> bot), via the legacy
    decoding loop.

    Import is deferred to call time to avoid a circular import at module load.
    """
    from chatbot.legacy import _decode
    _decode.decode(self)
def step(self, encoder_inputs, decoder_inputs, target_weights, bucket_id, forward_only=False):
    """Run a single step of the model. Abstract; subclasses must override.

    Args:
        encoder_inputs: list of numpy int vectors to feed as encoder inputs.
        decoder_inputs: list of numpy int vectors to feed as decoder inputs.
        target_weights: list of numpy float vectors to feed as target weights.
        bucket_id: which bucket of the model to use.
        forward_only: if True, run inference only (no parameter updates).

    Raises:
        NotImplementedError: always; this base method is abstract.
    """
    # Fix: `raise NotImplemented` raises a TypeError at raise-time because
    # NotImplemented is a constant, not an exception class.
    raise NotImplementedError
================================================
FILE: chatbot/components/__init__.py
================================================
from chatbot.components.embedder import Embedder
from chatbot.components.input_pipeline import InputPipeline
from chatbot.components.encoders import BasicEncoder, BidirectionalEncoder
from chatbot.components.decoders import BasicDecoder, AttentionDecoder
# Public API of chatbot.components: the re-exported pipeline, embedding,
# encoder, and decoder building blocks.
__all__ = ["InputPipeline",
           "Embedder",
           "BasicEncoder",
           "BidirectionalEncoder",
           "BasicDecoder",
           "AttentionDecoder"]
================================================
FILE: chatbot/components/base/__init__.py
================================================
================================================
FILE: chatbot/components/base/_rnn.py
================================================
"""Collection of base RNN classes and custom RNNCells.
"""
import tensorflow as tf
from tensorflow.python.util import nest
from tensorflow.python.ops import rnn_cell_impl
from chatbot.components import bot_ops
from tensorflow.contrib.rnn import RNNCell
from tensorflow.contrib.rnn import GRUCell, MultiRNNCell, LSTMStateTuple
from tensorflow.python.layers import core as layers_core
# Required due to TensorFlow's unreliable naming across versions . . .
try:
# r1.1
from tensorflow.contrib.seq2seq import DynamicAttentionWrapper \
as AttentionWrapper
from tensorflow.contrib.seq2seq import DynamicAttentionWrapperState \
as AttentionWrapperState
except ImportError:
# master
from tensorflow.contrib.seq2seq import AttentionWrapper
from tensorflow.contrib.seq2seq import AttentionWrapperState
class Cell(RNNCell):
    """Simple wrapper class for any extensions I want to make to the
    encoder/decoder rnn cells. For now, just Dropout+GRU."""

    def __init__(self, state_size, num_layers, dropout_prob, base_cell):
        """Define the cell by composing/wrapping with tf.contrib.rnn functions.
        Args:
            state_size: number of units in the cell.
            num_layers: how many cells to include in the MultiRNNCell.
            dropout_prob: probability of a node being dropped.
            base_cell: (str) name of underling cell to use (e.g. 'GRUCell')
        """
        self._state_size = state_size
        self._num_layers = num_layers
        self._dropout_prob = dropout_prob
        self._base_cell = base_cell

        def single_cell():
            """Convert cell name (str) to class, and create it."""
            return getattr(tf.contrib.rnn, base_cell)(num_units=state_size)
        # Single layer is kept as-is; multiple layers are stacked.
        if num_layers == 1:
            self._cell = single_cell()
        else:
            self._cell = MultiRNNCell(
                [single_cell() for _ in range(num_layers)])

    @property
    def state_size(self):
        # Delegate to the wrapped cell (int, tuple, or LSTMStateTuple,
        # depending on base_cell and num_layers).
        return self._cell.state_size

    @property
    def shape(self):
        """Needed for shape_invariants arg for tf.while_loop."""
        if self._num_layers == 1:
            return self.single_layer_shape()
        else:
            # One shape entry per stacked layer.
            return tuple(self.single_layer_shape()
                         for _ in range(self._num_layers))

    def single_layer_shape(self):
        # LSTM states are (c, h) pairs; all other cells carry a single
        # [batch, state_size] tensor (batch dim left unspecified).
        if 'LSTM' in self._base_cell:
            return LSTMStateTuple(c=tf.TensorShape([None, self._state_size]),
                                  h=tf.TensorShape([None, self._state_size]))
        else:
            return tf.TensorShape([None, self._state_size])

    @property
    def output_size(self):
        return self._cell.output_size

    def __call__(self, inputs, state, scope=None):
        """Run this RNN cell on inputs, starting from the given state.
        Args:
            inputs: `2-D` tensor with shape `[batch_size x input_size]`.
            state: Either 2D Tensor or tuple of 2D tensors, determined by cases:
                - `self.state_size` is int: `2-D Tensor` with shape
                  `[batch_size x self.state_size]`.
                - `self.state_size` is tuple: tuple with shapes
                  `[batch_size x s] for s in self.state_size`.
            scope: VariableScope for the created subgraph;
                defaults to class name.
        Returns:
            A pair containing:
            - Output: 2D tensor with shape [batch_size x self.output_size].
            - New state: Either a single `2-D` tensor, or a tuple of tensors
              matching the arity and shapes of `state`.
        """
        output, new_state = self._cell(inputs, state, scope)
        # NOTE(review): tf.layers.dropout defaults to training=False, which
        # makes this an identity op unless a training flag is threaded
        # through -- confirm dropout is actually intended to fire here.
        output = tf.layers.dropout(output, rate=self._dropout_prob, name="dropout")
        return output, new_state
class RNN(object):
"""Base class for encoders/decoders. Has simple instance attributes and
an RNNCell object and getter.
"""
def __init__(self,
state_size,
embed_size,
dropout_prob,
num_layers,
base_cell="GRUCell",
state_wrapper=None):
"""
Args:
state_size: number of units in underlying rnn cell.
embed_size: dimension size of word-embedding space.
dropout_prob: probability of a node being dropped.
num_layers: how many cells to include in the MultiRNNCell.
base_cell: (str) name of underling cell to use (e.g. 'GRUCell')
state_wrapper: allow states to store their wrapper class. See the
wrapper method docstring below for more info.
"""
self.state_size = state_size
self.embed_size = embed_size
self.num_layers = num_layers
self.dropout_prob = dropout_prob
self.base_cell = base_cell
self._wrapper = state_wrapper
def get_cell(self, name):
"""Returns a cell instance, defined by its name scope."""
with tf.name_scope(name, "get_cell"):
return Cell(state_size=self.state_size,
num_layers=self.num_layers,
dropout_prob=self.dropout_prob,
base_cell=self.base_cell)
def wrapper(self, state):
"""Some RNN states are wrapped in namedtuples.
(TensorFlow decision, definitely not mine...).
This is here for derived classes to specify their wrapper state.
Some examples: LSTMStateTuple and AttentionWrapperState.
Args:
state: tensor state tuple, will be unpacked into the wrapper tuple.
"""
if self._wrapper is None:
return state
else:
return self._wrapper(*state)
def __call__(self, *args):
raise NotImplemented
class SimpleAttentionWrapper(RNNCell):
    """A simplified and tweaked version of TensorFlow's AttentionWrapper.
    It closely follows the implementation described by Luong et. al, 2015 in
    `Effective Approaches to Attention-based Neural Machine Translation`.
    """

    def __init__(self,
                 cell,
                 attention_mechanism,
                 initial_cell_state=None,
                 name=None):
        """Construct the wrapper.
        Main tweak is creating the attention_layer with a tanh activation
        (Luong's choice) as opposed to linear (TensorFlow's choice). Also,
        since I am sticking with Luong's approach, parameters that are in the
        constructor of TensorFlow's AttentionWrapper have been removed, and
        the corresponding values are set to how Luong's paper defined them.
        Args:
            cell: instance of the Cell class above.
            attention_mechanism: instance of tf AttentionMechanism.
            initial_cell_state: The initial state value to use for the cell when
                the user calls `zero_state()`.
            name: Name to use when creating ops.
        """
        super(SimpleAttentionWrapper, self).__init__(name=name)
        # Assume that 'cell' is an instance of the custom 'Cell' class above.
        self._base_cell = cell._base_cell
        self._num_layers = cell._num_layers
        self._state_size = cell._state_size
        # Attention size = last (feature) dim of the memory (encoder outputs).
        self._attention_size = attention_mechanism.values.get_shape()[-1].value
        # h_tilde = tanh(W [c, h]); no bias, per Luong.
        self._attention_layer = layers_core.Dense(self._attention_size,
                                                  activation=tf.nn.tanh,
                                                  name="attention_layer",
                                                  use_bias=False)
        self._cell = cell
        self._attention_mechanism = attention_mechanism
        with tf.name_scope(name, "AttentionWrapperInit"):
            if initial_cell_state is None:
                self._initial_cell_state = None
            else:
                final_state_tensor = nest.flatten(initial_cell_state)[-1]
                # Prefer the static batch size; fall back to the dynamic one.
                state_batch_size = (
                    final_state_tensor.shape[0].value
                    or tf.shape(final_state_tensor)[0])
                # NOTE(review): self._base_name is presumably inherited from
                # TF's Layer base via super().__init__ -- confirm it exists
                # on all supported TF versions.
                error_message = (
                    "Constructor AttentionWrapper %s: " % self._base_name +
                    "Non-matching batch sizes between the memory "
                    "(encoder output) and initial_cell_state.")
                # Runtime guard: memory batch must match initial-state batch.
                with tf.control_dependencies(
                    [tf.assert_equal(state_batch_size,
                                     self._attention_mechanism.batch_size,
                                     message=error_message)]):
                    self._initial_cell_state = nest.map_structure(
                        lambda s: tf.identity(s, name="check_initial_cell_state"),
                        initial_cell_state)

    def zero_state(self, batch_size, dtype):
        """Build the initial AttentionWrapperState for a batch.

        Uses the constructor-supplied initial_cell_state when given,
        otherwise the wrapped cell's zero state.
        """
        with tf.name_scope(type(self).__name__ + "ZeroState", values=[batch_size]):
            if self._initial_cell_state is not None:
                cell_state = self._initial_cell_state
            else:
                cell_state = self._cell.zero_state(batch_size, dtype)
            error_message = (
                "zero_state of AttentionWrapper %s: " % self._base_name +
                "Non-matching batch sizes between the memory "
                "(encoder output) and the requested batch size.")
            # Same batch-size guard as in the constructor, applied to the
            # requested batch size.
            with tf.control_dependencies(
                [tf.assert_equal(batch_size,
                                 self._attention_mechanism.batch_size,
                                 message=error_message)]):
                cell_state = nest.map_structure(
                    lambda s: tf.identity(s, name="checked_cell_state"),
                    cell_state)
            # Alignment history is not tracked by this wrapper.
            alignment_history = ()
            _zero_state_tensors = rnn_cell_impl._zero_state_tensors
            return AttentionWrapperState(
                cell_state=cell_state,
                time=tf.zeros([], dtype=tf.int32),
                attention=_zero_state_tensors(self._attention_size, batch_size,
                                              dtype),
                alignments=self._attention_mechanism.initial_alignments(
                    batch_size, dtype),
                alignment_history=alignment_history)

    def call(self, inputs, state):
        """First computes the cell state and output in the usual way,
        then works through the attention pipeline:
            h --> a --> c --> h_tilde
        using the naming/notation from Luong et. al, 2015.
        Args:
            inputs: `2-D` tensor with shape `[batch_size x input_size]`.
            state: An instance of `AttentionWrapperState` containing the
                tensors from the prev timestep.
        Returns:
            A tuple `(attention_or_cell_output, next_state)`, where:
            - `attention_or_cell_output` depending on `output_attention`.
            - `next_state` is an instance of `DynamicAttentionWrapperState`
              containing the state calculated at this time step.
        """
        # Concatenate the previous h_tilde with inputs (input-feeding).
        cell_inputs = tf.concat([inputs, state.attention], -1)
        # 1. (hidden) Compute the hidden state (cell_output).
        cell_output, next_cell_state = self._cell(cell_inputs,
                                                  state.cell_state)
        # 2. (align) Compute the normalized alignment scores. [B, L_enc].
        # where L_enc is the max seq len in the encoder outputs for the (B)atch.
        score = self._attention_mechanism(
            cell_output, previous_alignments=state.alignments)
        alignments = tf.nn.softmax(score)
        # Reshape from [B, L_enc] to [B, 1, L_enc]
        expanded_alignments = tf.expand_dims(alignments, 1)
        # (Possibly projected) encoder outputs: [B, L_enc, state_size]
        encoder_outputs = self._attention_mechanism.values
        # 3 (context) Take inner prod. [B, 1, state size].
        context = tf.matmul(expanded_alignments, encoder_outputs)
        context = tf.squeeze(context, [1])
        # 4 (h_tilde) Compute tanh(W [c, h]).
        attention = self._attention_layer(
            tf.concat([cell_output, context], -1))
        next_state = AttentionWrapperState(
            cell_state=next_cell_state,
            attention=attention,
            time=state.time + 1,
            alignments=alignments,
            alignment_history=())
        # Always emit the attention vector (h_tilde) as the step output.
        return attention, next_state

    @property
    def output_size(self):
        return self._attention_size

    @property
    def state_size(self):
        # Structured size description matching AttentionWrapperState.
        return AttentionWrapperState(
            cell_state=self._cell.state_size,
            attention=self._attention_size,
            time=tf.TensorShape([]),
            alignments=self._attention_mechanism.alignments_size,
            alignment_history=())

    @property
    def shape(self):
        # Shape invariants for tf.while_loop (see Cell.shape).
        return AttentionWrapperState(
            cell_state=self._cell.shape,
            attention=tf.TensorShape([None, self._attention_size]),
            time=tf.TensorShape(None),
            alignments=tf.TensorShape([None, None]),
            alignment_history=())
class BasicRNNCell(RNNCell):
    """Same as tf.contrib.rnn.BasicRNNCell, rewritten for clarity.
    For example, many TF implementations have leftover code debt from past
    versions, so I wanted to show what is actually going on, with the fluff
    removed. Also, I've removed generally accepted values from parameters/args
    in favor of just setting them.
    """

    def __init__(self, num_units, reuse=None):
        # num_units: dimensionality of both the state and the output.
        self._num_units = num_units
        # reuse: stored but not consulted in this class -- presumably for
        # interface parity with tf cells; confirm before relying on it.
        self._reuse = reuse

    @property
    def state_size(self):
        return self._num_units

    @property
    def output_size(self):
        return self._num_units

    def __call__(self, inputs, state, scope=None):
        """Most basic RNN. Define as:
            output = new_state = act(W * input + U * state + B).
        """
        # NOTE(review): bot_ops.linear_map's keyword is `biases`, not `bias`;
        # as written this call raises TypeError unless linear_map also
        # accepts the short name -- verify against linear_map's signature.
        output = tf.tanh(bot_ops.linear_map(
            args=[inputs, state],
            output_size=self._num_units,
            bias=True))
        return output, output
================================================
FILE: chatbot/components/bot_ops.py
================================================
"""Custom TF 'ops' as meant in the TensorFlow definition of ops."""
import numpy as np
import tensorflow as tf
from utils import io_utils
from tensorflow.python.util import nest
def dynamic_sampled_softmax_loss(labels, logits, output_projection, vocab_size,
                                 from_scratch=False, num_samples=512, name=None):
    """Sampled softmax loss that accepts 3D logits, unlike the official
    TensorFlow op, which supports rank <= 2 only. "Dynamic" because it can
    be applied across variable-length sequences whose length is unspecified
    ('None') at graph-construction time.

    Args:
        labels: 2D integer tensor of shape [batch_size, None] containing
            the word ID labels for each individual rnn state from logits.
        logits: 3D float tensor of shape [batch_size, None, state_size] as
            output by a DynamicDecoder instance.
        output_projection: (w, b) projection pair from the decoder.
        vocab_size: total number of output classes.
        from_scratch: (bool) if True, use the from-scratch implementation;
            otherwise map the stock sampled_softmax across timeslices
            (probably less efficient; both are currently under evaluation).
        num_samples: number of sampled classes for the softmax estimate.
        name: optional name for the created op scope.

    Returns:
        loss as a scalar Tensor, computed as the mean over all batches
        and sequences.
    """
    # Select the implementation, then forward all arguments unchanged.
    implementation = (_dynamic_sampled_from_scratch if from_scratch
                      else _dynamic_sampled_map)
    return implementation(labels, logits, output_projection, vocab_size,
                          num_samples=num_samples, name=name)
def _dynamic_sampled_map(labels, logits, output_projection, vocab_size,
                         num_samples=512, name=None):
    """Sampled softmax loss function able to accept 3D Tensors as input,
    as opposed to the official TensorFlow support for <= 2D. This is
    dynamic because it can be applied across variable-length sequences,
    which are unspecified at initialization with size 'None'.
    Args:
        labels: 2D integer tensor of shape [batch_size, None] containing
            the word ID labels for each individual rnn state from logits.
        logits: 3D float tensor of shape [batch_size, None, state_size] as
            ouput by a DynamicDecoder instance.
        output_projection: (w, b) pair mapping state space to vocab space.
        vocab_size: total number of output classes.
        num_samples: number of sampled classes for the softmax estimate.
        name: optional name for the created op scope.
    Returns:
        loss as a scalar Tensor, computed as the mean over all batches and sequences.
    """
    with tf.name_scope(name, "dynamic_sampled_softmax_loss", [labels, logits, output_projection]):
        seq_len = tf.shape(logits)[1]
        st_size = tf.shape(logits)[2]
        # NOTE(review): tf.reshape reinterprets the buffer; unlike
        # tf.transpose it does NOT swap the batch/time axes, so each
        # "timestep" slice below may mix elements across sequences --
        # confirm this is intended.
        time_major_outputs = tf.reshape(logits, [seq_len, -1, st_size])
        time_major_labels = tf.reshape(labels, [seq_len, -1])
        # Reshape is apparently faster (dynamic) than transpose.
        w_t = tf.reshape(output_projection[0], [vocab_size, -1])
        b = output_projection[1]

        def sampled_loss(elem):
            # elem: one timestep slice -- (logits_t [batch, state_size],
            # labels_t [batch]); returns its mean sampled-softmax loss.
            logits, lab = elem
            lab = tf.reshape(lab, [-1, 1])
            # TODO: Figure out how this accurately gets loss without requiring weights,
            # like sparse_softmax_cross_entropy requires.
            return tf.reduce_mean(
                tf.nn.sampled_softmax_loss(
                    weights=w_t,
                    biases=b,
                    labels=lab,
                    inputs=logits,
                    num_sampled=num_samples,
                    num_classes=vocab_size,
                    partition_strategy='div'))
        # Map the 2D sampled loss across timestep slices, then average.
        batch_losses = tf.map_fn(sampled_loss,
                                 (time_major_outputs, time_major_labels),
                                 dtype=tf.float32)
        loss = tf.reduce_mean(batch_losses)
        return loss
def _dynamic_sampled_from_scratch(labels, logits, output_projection, vocab_size,
                                  num_samples, name=None):
    """Note: I closely follow the notation from Tensorflow's Candidate Sampling reference.
    - Link: https://www.tensorflow.org/extras/candidate_sampling.pdf
    Args:
        output_projection: (tuple) returned by any DynamicDecoder.get_projections_tensors()
            - output_projection[0] == w tensor. [state_size, vocab_size]
            - output_projection[1] == b tensor. [vocab_size]
        labels: 2D Integer tensor. [batch_size, None]
        logits: 3D float Tensor [batch_size, None, state_size].
            - In this project, usually is the decoder batch output sequence (NOT projected).
        num_samples: number of classes out of vocab_size possible to use.
        vocab_size: total number of classes.
    Returns:
        Scalar loss: mean over timestep slices of the per-slice
        sampled-softmax cross entropy.
    """
    with tf.name_scope(name, "dynamic_sampled_from_scratch", [labels, logits, output_projection]):
        batch_size, seq_len, state_size = tf.unstack(tf.shape(logits))
        # NOTE(review): reshape is not a transpose; these "time-major"
        # slices may interleave batch elements -- confirm intended.
        time_major_outputs = tf.reshape(logits, [seq_len, batch_size, state_size])
        time_major_labels = tf.reshape(labels, [seq_len, batch_size])
        # weights: [vocab_size, state_size] -- rows indexable by class ID.
        weights = tf.transpose(output_projection[0])
        biases = output_projection[1]

        def sampled_loss_single_timestep(args):
            """
            Args: 2-tuple (because map_fn below)
                targets: 1D tensor (sighs loudly) of shape [batch_size]
                logits: 2D tensor (sighs intensify) of shape [batch_size, state_size].
            """
            logits, targets = args
            # NOTE(review): the list here lands in name_scope's default_name
            # positional slot, not values= -- confirm this is what was meant.
            with tf.name_scope("compute_sampled_logits", [weights, biases, logits, targets]):
                targets = tf.cast(targets, tf.int64)
                # Draw num_samples candidate (negative) classes from a
                # log-uniform prior over the vocabulary.
                sampled_values = tf.nn.log_uniform_candidate_sampler(
                    true_classes=tf.expand_dims(targets, -1),
                    num_true=1,
                    num_sampled=num_samples,
                    unique=True,
                    range_max=vocab_size)
                # S: sampled class IDs; Q_true/Q_samp: expected counts under
                # the sampler (gradients stopped -- not trainable).
                S, Q_true, Q_samp = (tf.stop_gradient(s) for s in sampled_values)
                # Get concatenated 1D tensor of shape [batch_size * None + num_samples],
                all_ids = tf.concat([targets, S], 0)
                # One gather for both target and sampled classes, then split.
                _W = tf.nn.embedding_lookup(weights, all_ids, partition_strategy='div')
                _b = tf.nn.embedding_lookup(biases, all_ids)
                W = {'targets': tf.slice(_W, begin=[0, 0], size=[batch_size, state_size]),
                     'samples': tf.slice(_W, begin=[batch_size, 0], size=[num_samples, state_size])}
                b = {'targets': tf.slice(_b, begin=[0], size=[batch_size]),
                     'samples': tf.slice(_b, begin=[batch_size], size=[num_samples])}
                # Logits, corrected by -log Q (standard sampled-softmax
                # subtraction of the sampling distribution).
                true_logits = tf.reduce_sum(tf.multiply(logits, W['targets']), 1)
                true_logits += b['targets'] - tf.log(Q_true)
                sampled_logits = tf.matmul(logits, W['samples'], transpose_b=True)
                sampled_logits += b['samples'] - tf.log(Q_samp)
                # NOTE(review): true_logits looks rank-1 ([batch]) while
                # sampled_logits is rank-2 ([batch, num_samples]); concat on
                # axis 1 would then fail -- confirm shapes at runtime.
                F = tf.concat([true_logits, sampled_logits], 1)
                # 1 where a sampled class equals the target, else 0.
                def fn(s_i): return tf.where(targets == s_i, tf.ones_like(targets), tf.zeros_like(targets))
                sample_labels = tf.transpose(tf.map_fn(fn, S))
                out_targets = tf.concat([tf.ones_like(true_logits, dtype=tf.int64), sample_labels], 1)
                return tf.losses.softmax_cross_entropy(out_targets, logits=F)
        return tf.reduce_mean(tf.map_fn(sampled_loss_single_timestep,
                                        (time_major_outputs, time_major_labels),
                                        dtype=tf.float32))
def cross_entropy_sequence_loss(logits, labels, weights):
    """Weighted sparse-softmax cross entropy, averaged over real tokens.

    A condensed version of the various tensorflow sequence-loss
    implementations, which all do the operations below in a roundabout way.
    It can be simpler because it assumes the inputs come from a
    chatbot.Model subclass: `weights` marks PAD positions with 0 and real
    tokens with 1, so dividing by its sum yields the mean per-token loss.
    """
    with tf.name_scope('cross_entropy_sequence_loss'):
        token_losses = tf.nn.sparse_softmax_cross_entropy_with_logits(
            logits=logits, labels=labels)
        # Cast the 0/1 PAD mask to float and zero-out PAD losses.
        mask = tf.to_float(weights)
        masked_losses = tf.multiply(token_losses, mask)
        return tf.reduce_sum(masked_losses) / tf.reduce_sum(mask)
def dot_prod(x, y):
    """Full inner product: sum over all elements of the elementwise
    product of `x` and `y`."""
    elementwise = tf.multiply(x, y)
    return tf.reduce_sum(elementwise)
def bahdanau_score(attention_dim, h_j, s_i):
    """Additive (Bahdanau-style) attention score helper.

    Args:
        attention_dim: first dimension of the projection variables.
        h_j: encoder hidden-state tensor; its leading static dim is used
            as state_size.
        s_i: decoder hidden-state tensor (currently unused in the score --
            see NOTE below).

    Returns:
        Scalar score tensor.
    """
    # Fix: the original called the nonexistent `tf.get_shape(h_j)`, which
    # raises AttributeError; static shapes are read from the tensor itself.
    state_size = h_j.get_shape()[0]
    h_proj = tf.get_variable('W_1',
                             [attention_dim, state_size],
                             dtype=tf.float32)
    s_proj = tf.get_variable('W_2',
                             [attention_dim, state_size],
                             dtype=tf.float32)
    v = tf.get_variable('v',
                        [attention_dim, state_size],
                        dtype=tf.float32)
    # NOTE(review): this adds the raw projection *variables* without ever
    # multiplying by h_j or s_i, so the score does not depend on the states.
    # The canonical Bahdanau form is v . tanh(W_1 h_j + W_2 s_i) --
    # confirm intent before using this helper.
    score = dot_prod(v, tf.tanh(h_proj + s_proj))
    return score
def luong_score(attention_dim, h_j, s_i):
    """Multiplicative (Luong-style) attention score helper.

    Args:
        attention_dim: first dimension of the projection variables.
        h_j: encoder hidden-state tensor (only its static leading dim is
            read here).
        s_i: decoder hidden-state tensor (only its static leading dim is
            read here).

    Returns:
        Scalar score tensor.
    """
    # Fix: the original called the nonexistent `tf.get_shape(...)`, which
    # raises AttributeError; static shapes are read from the tensors.
    h_proj = tf.get_variable('W_1',
                             [attention_dim, h_j.get_shape()[0]],
                             dtype=tf.float32)
    s_proj = tf.get_variable('W_2',
                             [attention_dim, s_i.get_shape()[0]],
                             dtype=tf.float32)
    # NOTE(review): as written this is a dot product of the two weight
    # variables only; the states h_j and s_i never enter the computation.
    # Luong's general score is h_j^T W s_i -- confirm intent.
    score = dot_prod(h_proj, s_proj)
    return score
def linear_map(args, output_size, biases=None, bias=None):
    """Linear map: sum_i(args[i] * W[i]), where W[i] is a variable.
    Basically, you pass in a bunch of vectors (ok you got me, 2D tensors because
    batch dimensions) that you want added together but need their dimensions
    to match. This function has you covered.
    Args:
        args: a 2D Tensor or a list of 2D, batch x n, Tensors.
        output_size: int, second dimension of W[i].
        biases: tensor of shape [output_size] added to all in batch, or True
            to create (and add) a zero-initialized bias variable.
            None/False disables the bias term.
        bias: alias for `biases`, honored when `biases` is None (kept for
            callers that use the short keyword, e.g. bias=True).
    Returns:
        A 2D Tensor with shape [batch x output_size] equal to
        sum_i(args[i] * W[i]), where W[i]s are newly created matrices.
    """
    if biases is None:
        biases = bias
    if not nest.is_sequence(args):
        args = [args]
    # Calculate the total size of arguments on dimension 1.
    # NOTE(review): tf.shape() yields a *dynamic* size, while tf.get_variable
    # below generally needs a static shape -- confirm this path is only
    # exercised with statically-shaped inputs.
    total_arg_size = 0
    shapes = [tf.shape(a)[1] for a in args]
    for shape in shapes:
        total_arg_size = tf.add(total_arg_size, shape)
    dtype = args[0].dtype
    # Now the computation.
    scope = tf.get_variable_scope()
    with tf.variable_scope(scope) as outer_scope:
        weights = tf.get_variable('weights',
                                  [total_arg_size, output_size],
                                  dtype=dtype)
        if len(args) == 1:
            res = tf.matmul(args[0], weights)
        else:
            res = tf.matmul(tf.concat(args, 1), weights)
        # Fix: the original evaluated `not biases`, which raises a TypeError
        # when `biases` is a Tensor (Tensor truthiness is undefined at graph
        # construction). Compare against None/False/True explicitly.
        if biases is None or biases is False:
            return res
        if biases is True:
            biases = tf.get_variable('biases',
                                     [output_size],
                                     dtype=dtype,
                                     initializer=tf.zeros_initializer())
        return tf.nn.bias_add(res, biases)
================================================
FILE: chatbot/components/decoders.py
================================================
import logging
import tensorflow as tf
import sys
# Required due to TensorFlow's unreliable naming across versions . . .
try:
# r1.1
from tensorflow.contrib.seq2seq import DynamicAttentionWrapperState \
as AttentionWrapperState
except ImportError:
# master
from tensorflow.contrib.seq2seq import AttentionWrapperState
from tensorflow.contrib.seq2seq import BahdanauAttention, LuongAttention
from tensorflow.contrib.rnn import LSTMStateTuple, LSTMCell
from chatbot.components.base._rnn import RNN, SimpleAttentionWrapper
from utils import io_utils
class Decoder(RNN):
    """Dynamic decoding (base) class that supports both training and inference without
    requiring superfluous helper objects. With simple boolean parameters,
    handles the decoder sub-graph construction dynamically in its entirety.
    """

    def __init__(self,
                 base_cell,
                 encoder_outputs,
                 state_size,
                 vocab_size,
                 embed_size,
                 dropout_prob,
                 num_layers,
                 temperature,
                 max_seq_len,
                 state_wrapper=None):
        """
        Args:
            base_cell: (str) name of RNNCell class for underlying cell.
            encoder_outputs: encoder output tensor; stored for subclasses
                (e.g. attention) that need access to the memory.
            state_size: number of units in underlying rnn cell.
            vocab_size: dimension of output space for projections.
            embed_size: dimension size of word-embedding space.
            dropout_prob: probability of a node being dropped.
            num_layers: how many cells to include in the MultiRNNCell.
            temperature: (float) determines randomness of outputs/responses.
                - Some notable values (to get some intuition):
                    - t -> 0: outputs approach simple argmax.
                    - t = 1: same as sampling from softmax distribution over
                      outputs, interpreting the softmax outputs as from a
                      multinomial (probability) distribution.
                    - t -> inf: outputs approach uniform random distribution.
            max_seq_len: hard cap on generated response length (inference).
            state_wrapper: allow states to store their wrapper class. See the
                wrapper method docstring below for more info.
        """
        self.encoder_outputs = encoder_outputs
        # LSTM states come as (c, h) namedtuples; record that wrapper
        # unless the caller chose one explicitly.
        if state_wrapper is None and base_cell == 'LSTMCell':
            state_wrapper = LSTMStateTuple
        super(Decoder, self).__init__(
            base_cell=base_cell,
            state_size=state_size,
            embed_size=embed_size,
            dropout_prob=dropout_prob,
            num_layers=num_layers,
            state_wrapper=state_wrapper)
        self.temperature = temperature
        self.vocab_size = vocab_size
        self.max_seq_len = max_seq_len
        # (w, b): affine map from state space to vocab space; shared by
        # apply_projection and the sampled-softmax loss.
        with tf.variable_scope('projection_tensors'):
            w = tf.get_variable(
                name="w",
                shape=[state_size, vocab_size],
                dtype=tf.float32,
                initializer=tf.contrib.layers.xavier_initializer())
            # NOTE(review): xavier init on a 1D bias is unusual (zeros is
            # the norm) -- confirm intended.
            b = tf.get_variable(
                name="b",
                shape=[vocab_size],
                dtype=tf.float32,
                initializer=tf.contrib.layers.xavier_initializer())
            self._projection = (w, b)

    def __call__(self,
                 inputs,
                 is_chatting,
                 loop_embedder,
                 cell,
                 initial_state=None):
        """Run the inputs on the decoder.
        If we are chatting, then conduct dynamic sampling, which is the process
        of generating a response given inputs == GO_ID.
        Args:
            inputs: Tensor with shape [batch_size, max_time, embed_size].
                For training, inputs are the 'to' sentence tokens (embedded).
                For chatting, first input is <GO> and thereafter, the input is
                the bot's previous output (looped around through embedding).
            is_chatting: (bool) Determines how we retrieve the outputs and the
                returned Tensor shape.
            loop_embedder: required if is_chatting==True.
                Embedder instance needed to feed decoder outputs
                as next inputs.
            cell: RNNCell (or wrapper) used to drive tf.nn.dynamic_rnn.
            initial_state: Tensor with shape [batch_size, state_size].
        Returns:
            outputs: if not chatting, tensor of shape
                [batch_size, max_time, vocab_size]. Otherwise, tensor of
                response IDs with shape [batch_size, max_time].
            state: if not is_chatting, tensor of shape
                [batch_size, state_size]. Otherwise, None.
        """
        # The template guarantees the same rnn variables are reused on
        # every call, including each iteration of the sampling loop below.
        self.rnn = tf.make_template('decoder_rnn',
                                    tf.nn.dynamic_rnn,
                                    cell=cell,
                                    dtype=tf.float32)
        outputs, state = self.rnn(inputs=inputs,
                                  initial_state=initial_state)
        if not is_chatting:
            # Training path: raw (unprojected) outputs and final state.
            return outputs, state
        if loop_embedder is None:
            raise ValueError(
                "Loop function required to feed outputs as inputs.")

        def body(response, state):
            """Input callable for tf.while_loop. See below."""
            tf.get_variable_scope().reuse_variables()
            # Embed the most recent token and run one decoder step on it.
            decoder_input = loop_embedder(tf.reshape(response[-1], (1, 1)),
                                          reuse=True)
            outputs, state = self.rnn(inputs=decoder_input,
                                      initial_state=state,
                                      sequence_length=[1])
            # Sample the next token ID and append it to the response so far.
            next_id = self.sample(self.apply_projection(outputs))
            response = tf.concat([response, tf.stack([next_id])], axis=0)
            return response, state

        def cond(response, s):
            """Input callable for tf.while_loop. See below."""
            # Keep looping until EOS is emitted or max_seq_len is reached.
            return tf.logical_and(
                tf.not_equal(response[-1], io_utils.EOS_ID),
                tf.less_equal(tf.size(response), self.max_seq_len))
        # Project to full output state during inference time.
        # Note: "outputs" at this point, at this exact line, is technically just
        # a single output: the bot's first response token.
        outputs = self.apply_projection(outputs)
        # Begin the process of building the list of output tokens.
        response = tf.stack([self.sample(outputs)])
        # Reshape is needed so the while_loop ahead knows the shape of response.
        # The comma after the 1 is intentional, it forces tf to believe us.
        response = tf.reshape(response, [1,], name='response')
        tf.get_variable_scope().reuse_variables()
        # ============== BEHOLD: The tensorflow while loop. ==================
        # This allows us to sample dynamically. It also makes me happy!
        # -- Repeat 'body' while the 'cond' returns true.
        # -- 'cond': callable returning a boolean scalar tensor.
        # -- 'body': callable returning a tuple of tensors of same
        #    arity as loop_vars.
        # -- 'loop_vars': tuple of tensors that is passed to 'cond' and 'body'.
        response, _ = tf.while_loop(
            cond, body, (response, state),
            shape_invariants=(tf.TensorShape([None]), cell.shape),
            back_prop=False)
        # =============== FAREWELL: The tensorflow while loop. =================
        # Restore a batch dimension: [1, response_len].
        outputs = tf.expand_dims(response, 0)
        return outputs, None

    def apply_projection(self, outputs, scope=None):
        """Defines & applies the affine transformation from state space
        to output space.
        Args:
            outputs: Tensor of shape [batch_size, max_time, state_size]
                returned by tf dynamic_rnn.
            scope: (optional) variable scope for any created here.
        Returns:
            Tensor of shape [batch_size, max_time, vocab_size] representing the
            projected outputs.
        """
        with tf.variable_scope(scope, "proj_scope", [outputs]):
            # Swap 1st and 2nd indices to match expected input of map_fn.
            # NOTE(review): reshape does not actually transpose the
            # batch/time axes -- confirm intended (see bot_ops for the
            # same pattern).
            seq_len = tf.shape(outputs)[1]
            st_size = tf.shape(outputs)[2]
            time_major_outputs = tf.reshape(outputs, [seq_len, -1, st_size])
            # Project batch at single timestep from state space to output space.
            def proj_op(batch):
                return tf.matmul(batch, self._projection[0]) + self._projection[1]
            # Get projected output states;
            # 3D Tensor with shape [batch_size, seq_len, ouput_size].
            projected_state = tf.map_fn(proj_op, time_major_outputs)
            return tf.reshape(projected_state, [-1, seq_len, self.vocab_size])

    def sample(self, projected_output):
        """Return integer ID tensor representing the sampled word.
        Args:
            projected_output: Tensor [1, 1, state_size], representing a single
                decoder timestep output.
        """
        # TODO: We really need a tf.control_dependencies check here (for rank).
        with tf.name_scope('decoder_sampler', values=[projected_output]):
            # Protect against extra size-1 dimensions; grab the 1D tensor
            # of size state_size.
            logits = tf.squeeze(projected_output)
            # Near-zero temperature degenerates to a plain argmax.
            if self.temperature < 0.02:
                return tf.argmax(logits, axis=0)
            # Convert logits to probability distribution.
            probabilities = tf.div(logits, self.temperature)
            # NOTE(review): this normalized softmax is never used below --
            # tf.multinomial consumes the temperature-scaled logits
            # directly -- so the next assignment looks like dead code.
            projected_output = tf.div(
                tf.exp(probabilities),
                tf.reduce_sum(tf.exp(probabilities), axis=-1))
            # Sample 1 time from the probability distribution.
            sample_ID = tf.squeeze(
                tf.multinomial(tf.expand_dims(probabilities, 0), 1))
        return sample_ID

    def get_projection_tensors(self):
        """Returns the tuple (w, b) that decoder uses for projecting.
        Required as argument to the sampled softmax loss.
        """
        return self._projection
class BasicDecoder(Decoder):
    """Minimal dynamic decoder: the base Decoder driven by a plain cell."""

    def __call__(self,
                 inputs,
                 initial_state=None,
                 is_chatting=False,
                 loop_embedder=None,
                 cell=None):
        """Delegate to Decoder.__call__ with a freshly built decoder cell.

        Note: the `cell` argument is accepted for interface parity with
        sibling decoders but is ignored -- a new cell named 'decoder_cell'
        is always constructed, exactly as the base class expects.
        """
        decoder_cell = self.get_cell('decoder_cell')
        return super(BasicDecoder, self).__call__(
            inputs=inputs,
            initial_state=initial_state,
            is_chatting=is_chatting,
            loop_embedder=loop_embedder,
            cell=decoder_cell)
class AttentionDecoder(Decoder):
    """Dynamic decoder that attends over the full sequence of encoder
    outputs. Using Bahdanau for now (may change).

    TODO: Luong's paper mentions that they only use the *top* layer of
    stacked LSTMs for attention-related computation. Since currently I'm
    only testing attention models with one-layer encoder/decoders, this
    isn't an issue. However, in a couple days I should revisit this.
    """

    def __init__(self,
                 encoder_outputs,
                 base_cell,
                 state_size,
                 vocab_size,
                 embed_size,
                 attention_mechanism='BahdanauAttention',
                 dropout_prob=1.0,
                 num_layers=1,
                 temperature=0.0,
                 max_seq_len=10):
        """Explicit constructor so that we can:
        - request states wrapped in AttentionWrapperState, and
        - select the attention mechanism by name (customization soon).
        """
        super(AttentionDecoder, self).__init__(
            encoder_outputs=encoder_outputs,
            base_cell=base_cell,
            state_size=state_size,
            vocab_size=vocab_size,
            embed_size=embed_size,
            dropout_prob=dropout_prob,
            num_layers=num_layers,
            temperature=temperature,
            max_seq_len=max_seq_len,
            state_wrapper=AttentionWrapperState)
        # Resolve the mechanism class by name from tf.contrib.seq2seq and
        # bind it to the encoder outputs (the attention "memory").
        mechanism_class = getattr(tf.contrib.seq2seq, attention_mechanism)
        self.attention_mechanism = mechanism_class(num_units=state_size,
                                                   memory=encoder_outputs)
        self.output_attention = True

    def __call__(self,
                 inputs,
                 initial_state=None,
                 is_chatting=False,
                 loop_embedder=None,
                 cell=None):
        """Same contract as Decoder.__call__; the only modification is that,
        when no cell is supplied, we pass in our own cell wrapped with the
        custom attention class (specified in base/_rnn.py). It is mostly the
        same as tensorflow's, but with minor tweaks so that it could easily
        hang out with the other components of the project.
        """
        if cell is not None:
            attn_cell = cell
        else:
            attn_cell = self.get_cell('attn_cell', initial_state)
        return super(AttentionDecoder, self).__call__(
            inputs=inputs,
            is_chatting=is_chatting,
            loop_embedder=loop_embedder,
            cell=attn_cell)

    def get_cell(self, name, initial_state):
        """Build the plain underlying cell, then wrap it for attention."""
        plain_cell = super(AttentionDecoder, self).get_cell(name)
        return SimpleAttentionWrapper(
            cell=plain_cell,
            attention_mechanism=self.attention_mechanism,
            initial_cell_state=initial_state)
================================================
FILE: chatbot/components/embedder.py
================================================
import tensorflow as tf
import logging
import numpy as np
from chatbot._models import Model
from utils import io_utils
import time
class Embedder:
"""Acts on tensors with integer elements, embedding them in a higher-dimensional
vector space. A single Embedder instance can embed both encoder and decoder by
associating them with distinct scopes. """
def __init__(self, vocab_size, embed_size, l1_reg=0.0):
    """
    Args:
        vocab_size: number of rows in the embedding matrix (token IDs).
        embed_size: dimensionality of the embedding vectors.
        l1_reg: scale for the L1 regularizer applied to embed tensors.
    """
    self.vocab_size = vocab_size
    self.embed_size = embed_size
    self.l1_reg = l1_reg
    # Maps variable-scope name -> scope object, one per distinct caller
    # scope (e.g. encoder vs decoder); read later by assign_visualizers.
    self._scopes = dict()
def __call__(self, inputs, reuse=None):
"""Embeds integers in inputs and returns the embedded inputs.
Args:
inputs: input tensor of shape [batch_size, max_time].
Returns:
Output tensor of shape [batch_size, max_time, embed_size]
"""
# Ensure inputs has expected rank of 2.
assert len(inputs.shape) == 2, \
"Expected inputs rank 2 but found rank %r" % len(inputs.shape)
scope = tf.get_variable_scope()
# Parse info from scope input needed for reliable reuse across model.
if scope is not None:
scope_name = scope if isinstance(scope, str) else scope.name
if scope_name not in self._scopes:
self._scopes[scope_name] = scope
else:
self._scopes['embedder_call'] = tf.variable_scope('embedder_call')
embed_tensor = tf.get_variable(
name="embed_tensor",
shape=[self.vocab_size, self.embed_size],
initializer=tf.contrib.layers.xavier_initializer(),
regularizer=tf.contrib.layers.l1_regularizer(self.l1_reg))
embedded_inputs = tf.nn.embedding_lookup(embed_tensor, inputs)
# Place any checks on inputs here before returning.
if not isinstance(embedded_inputs, tf.Tensor):
raise TypeError("Embedded inputs should be of type Tensor.")
if len(embedded_inputs.shape) != 3:
raise ValueError("Embedded sentence has incorrect shape.")
tf.summary.histogram(scope.name, embed_tensor)
return embedded_inputs
def assign_visualizers(self, writer, scope_names, metadata_path):
"""Setup the tensorboard embedding visualizer.
Args:
writer: instance of tf.summary.FileWriter
scope_names: list of
"""
assert writer is not None
if not isinstance(scope_names, list):
scope_names = [scope_names]
for scope_name in scope_names:
assert scope_name in self._scopes, \
"I don't have any embedding tensors for %s" % scope_name
config = tf.contrib.tensorboard.plugins.projector.ProjectorConfig()
emb = config.embeddings.add()
emb.tensor_name = scope_name.rstrip('/') + '/embed_tensor:0'
emb.metadata_path = metadata_path
tf.contrib.tensorboard.plugins.projector.visualize_embeddings(writer, config)
def get_scope_basename(self, scope):
"""
Args:
scope: tf.variable_scope.
"""
return scope.name.strip('/').split('/')[-1]
class AutoEncoder(Model):
    """[UNDER CONSTRUCTION]. AutoEncoder for unsupervised pretraining the
    word embeddings for dynamic models.
    """

    def __init__(self, dataset, params):
        """
        Args:
            dataset: any instance inheriting from data.DataSet.
            params: dictionary of hyperparameters (see chatbot.globals).
        """
        self.log = logging.getLogger('AutoEncoderLogger')
        # Superclass handles common bookkeeping (dirs, saving/loading).
        super(AutoEncoder, self).__init__(self.log, dataset, params)
        self.build_computation_graph(dataset)
        self.compile()

    def build_computation_graph(self, dataset):
        """Build the encode (embed + dense) and decode (projection) ops."""
        from chatbot.components.input_pipeline import InputPipeline
        # Organize input pipeline inside single node for clean visualization.
        self.pipeline = InputPipeline(
            file_paths=dataset.paths,
            batch_size=self.batch_size,
            is_chatting=self.is_chatting)
        self.encoder_inputs = self.pipeline.encoder_inputs
        with tf.variable_scope('autoencoder_encoder'):
            embed_tensor = tf.get_variable(
                name="embed_tensor",
                shape=[self.vocab_size, self.embed_size])
            _h = tf.nn.embedding_lookup(embed_tensor, self.encoder_inputs)
            h = tf.contrib.keras.layers.Dense(self.embed_size, activation='relu')(_h)
        with tf.variable_scope('autoencoder_decoder'):
            # Projection weights from state space back to vocab space.
            w = tf.get_variable(
                name="w",
                shape=[self.embed_size, self.vocab_size],
                dtype=tf.float32)
            b = tf.get_variable(
                name="b",
                shape=[self.vocab_size],
                dtype=tf.float32)
            # Swap 1st and 2nd indices to match expected input of map_fn.
            # NOTE(review): tf.reshape does not transpose axes; presumably
            # tf.transpose(h, [1, 0, 2]) was intended here — confirm.
            seq_len = tf.shape(h)[1]
            st_size = tf.shape(h)[2]
            time_major_outputs = tf.reshape(h, [seq_len, -1, st_size])

            # Project batch at single timestep from state space to output space.
            def proj_op(h_t):
                return tf.matmul(h_t, w) + b

            decoder_outputs = tf.map_fn(proj_op, time_major_outputs)
            decoder_outputs = tf.reshape(decoder_outputs,
                                         [-1, seq_len, self.vocab_size])
        self.outputs = tf.identity(decoder_outputs, name='outputs')
        # Tag inputs and outputs by name should we want to freeze the model.
        self.graph.add_to_collection('freezer', self.encoder_inputs)
        self.graph.add_to_collection('freezer', self.outputs)
        # Merge any summaries floating around in the aether into one object.
        self.merged = tf.summary.merge_all()

    def compile(self):
        """Define loss, train_op, and summaries (training mode only)."""
        if not self.is_chatting:
            with tf.variable_scope("evaluation") as scope:
                # Target is to reconstruct the next encoder input token.
                target_labels = self.encoder_inputs[:, 1:]
                # Zero weight on padding (id 0), one elsewhere.
                target_weights = tf.cast(target_labels > 0, target_labels.dtype)
                # Debug prints left in while under construction.
                print('\ntl\n', target_labels)
                print('\ntw\n', target_weights)
                preds = self.outputs[:, :-1, :]
                print('\npreds\n', preds)
                self.loss = tf.losses.sparse_softmax_cross_entropy(
                    labels=target_labels,
                    logits=preds,
                    weights=target_weights)
                print(self.loss)
                self.train_op = tf.contrib.layers.optimize_loss(
                    loss=self.loss, global_step=self.global_step,
                    learning_rate=self.learning_rate,
                    optimizer='Adam',
                    summaries=['gradients'])
                # Compute accuracy, ensuring we use fully projected outputs.
                _preds = tf.argmax(self.outputs[:, :-1, :], axis=2)
                correct_pred = tf.equal(
                    _preds,
                    target_labels)
                accuracy = tf.reduce_mean(tf.cast(correct_pred, tf.float32))
                tf.summary.scalar('accuracy', accuracy)
                tf.summary.scalar('loss_train', self.loss)
                self.merged = tf.summary.merge_all()
        super(AutoEncoder, self).compile()

    def step(self, forward_only=False):
        """Run one train step, or a forward-only (inference) step.

        Returns:
            Training: (summaries, loss, train_op result).
            Forward-only: the argmax token ids of the projected outputs.
        """
        if not forward_only:
            return self.sess.run([self.merged, self.loss, self.train_op])
        else:
            # NOTE(review): tf.argmax here adds a new op to the graph on
            # every call — consider building it once in the graph phase.
            return self.sess.run(fetches=tf.argmax(self.outputs[:, :-1, :], axis=2),
                                 feed_dict=self.pipeline.feed_dict)

    def train(self, close_when_done=True):
        """Train until interrupted, max_steps reached, or data runs out.

        Args:
            close_when_done: (bool) whether to close (and save) on exit.
        """
        coord = tf.train.Coordinator()
        threads = tf.train.start_queue_runners(sess=self.sess, coord=coord)
        try:
            avg_loss = avg_step_time = 0.0
            while not coord.should_stop():
                i_step = self.sess.run(self.global_step)
                start_time = time.time()
                summaries, step_loss, _ = self.step()
                # Running averages over the checkpoint interval.
                avg_step_time += (time.time() - start_time) / self.steps_per_ckpt
                avg_loss += step_loss / self.steps_per_ckpt
                # Print updates in desired intervals (steps_per_ckpt).
                if i_step % self.steps_per_ckpt == 0:
                    print('loss:', avg_loss)
                    self.save(summaries=summaries)
                    avg_loss = avg_step_time = 0.0
                if i_step >= self.max_steps:
                    print("Maximum step", i_step, "reached.")
                    raise SystemExit
        except (KeyboardInterrupt, SystemExit):
            print("Training halted. Cleaning up . . . ")
            coord.request_stop()
        except tf.errors.OutOfRangeError:
            print("OutOfRangeError. You have run out of data.")
            coord.request_stop()
        finally:
            coord.join(threads)
            if close_when_done:
                self.close()

    def __call__(self, sentence):
        """Feed a (str) sentence through the autoencoder; return words."""
        # Token ids, reversed (same input convention as DynamicBot).
        encoder_inputs = io_utils.sentence_to_token_ids(
            tf.compat.as_bytes(sentence), self.dataset.word_to_idx)
        encoder_inputs = np.array([encoder_inputs[::-1]])
        self.pipeline.feed_user_input(encoder_inputs)
        # Get output sentence from the chatbot.
        response = self.step(forward_only=True)
        return self.dataset.as_words(response[0])
================================================
FILE: chatbot/components/encoders.py
================================================
"""Classes for the dynamic encoders."""
import tensorflow as tf
from tensorflow.contrib.rnn import GRUCell
from tensorflow.contrib.rnn import LSTMStateTuple, LSTMCell
from chatbot.components.base._rnn import RNN
from tensorflow.python.layers import core as layers_core
class BasicEncoder(RNN):
    """Single-direction encoder: its cell is simply run through
    tf.nn.dynamic_rnn.
    """

    def __call__(self, inputs, initial_state=None):
        """Feed embedded inputs through the encoder cell.

        Args:
            inputs: Tensor with shape [batch_size, max_time, embed_size].
            initial_state: (optional) Tensor with shape
                [batch_size, state_size] used to initialize the cell.

        Returns:
            A 2-tuple (outputs, state):
                outputs: per-timestep cell outputs, with shape
                    [batch_size, max_time, state_size].
                state: the final encoder state, with shape
                    [batch_size, state_size].
        """
        enc_cell = self.get_cell("basic_enc_cell")
        outputs, final_state = tf.nn.dynamic_rnn(
            enc_cell,
            inputs,
            initial_state=initial_state,
            dtype=tf.float32)
        return outputs, final_state
class BidirectionalEncoder(RNN):
    """Encoder that concatenates two copies of its cell forward and backward and
    feeds into a bidirectional_dynamic_rnn.

    Outputs are concatenated before being returned. I may move this
    functionality to an intermediate class layer that handles shape-matching
    between encoder/decoder.
    """

    def __call__(self, inputs, initial_state=None):
        """Run the inputs on the encoder and return the output(s).

        Args:
            inputs: Tensor with shape [batch_size, max_time, embed_size].
            initial_state: unused; both directions start from the cell's
                default zero state.

        Returns:
            outputs: Tensor of shape [batch_size, max_time, state_size].
            state: The final encoder state; shape [batch_size, state_size].
        """
        cell_fw = self.get_cell("cell_fw")
        cell_bw = self.get_cell("cell_bw")
        outputs_tuple, final_state_tuple = tf.nn.bidirectional_dynamic_rnn(
            cell_fw=cell_fw,
            cell_bw=cell_bw,
            inputs=inputs,
            dtype=tf.float32)

        # Create fully connected layer to help get us back to
        # state size (from the dual state fw-bw).
        layer = layers_core.Dense(units=self.state_size, use_bias=False)

        def single_state(state):
            """Reshape bidirectional state (via fully connected layer)
            to state size.
            """
            if 'LSTM' in self.base_cell:
                # LSTM states are (c, h) pairs; bridge each part separately.
                bridged_state = LSTMStateTuple(
                    c=layer(state[0]),
                    h=layer(state[1]))
            else:
                bridged_state = layer(state)
            return bridged_state

        # Concatenate each of the tuples fw and bw dimensions.
        # Now we are dealing with the concatenated "states" with dimension:
        # [batch_size, max_time, state_size * 2].
        # NOTE: Convention of LSTMCell is that outputs only contain the
        # the hidden state (i.e. 'h' only, no 'c').
        outputs = tf.concat(outputs_tuple, -1)
        # NOTE(review): Dense broadcasts over leading dimensions, so
        # layer(outputs) would presumably be equivalent to (and cheaper
        # than) map_fn here — confirm before changing.
        outputs = tf.map_fn(layer, outputs)
        # Similarly, combine the tuple of final states, resulting in:
        # [batch_size, state_size * 2].
        final_state = tf.concat(final_state_tuple, -1)
        if self.num_layers == 1:
            final_state = single_state(final_state)
        else:
            # Multi-layer: bridge each layer's state independently.
            final_state = tuple([single_state(fs)
                                 for fs in tf.unstack(final_state)])
        return outputs, final_state
================================================
FILE: chatbot/components/input_pipeline.py
================================================
import logging
import tensorflow as tf
from utils import io_utils
from tensorflow.contrib.training import bucket_by_sequence_length
LENGTHS = {'encoder_sequence_length': tf.FixedLenFeature([], dtype=tf.int64),
'decoder_sequence_length': tf.FixedLenFeature([], dtype=tf.int64)}
SEQUENCES = {'encoder_sequence': tf.FixedLenSequenceFeature([], dtype=tf.int64),
'decoder_sequence': tf.FixedLenSequenceFeature([], dtype=tf.int64)}
class InputPipeline:
    """TensorFlow-only input pipeline with parallel enqueuing,
    dynamic bucketed-batching, and more.

    Overview of pipeline construction:
        1. Create ops for reading protobuf tfrecords line-by-line.
        2. Enqueue raw outputs, attach to threads, and parse sequences.
        3. Organize sequences into buckets of similar lengths, pad, and batch.
    """

    def __init__(self, file_paths, batch_size, capacity=None, is_chatting=False, scope=None):
        """
        Args:
            file_paths: (dict) returned by instance of Dataset via Dataset.paths.
            batch_size: number of examples returned by dequeue op.
            capacity: maximum number of examples allowed in the input queue at a time.
                Defaults to max(50 * batch_size, 1e4) when None.
            is_chatting: (bool) determines whether we're feeding user input or file inputs.
            scope: (optional) name scope under which pipeline ops are placed.
        """
        with tf.name_scope(scope, 'input_pipeline') as scope:
            # Bug fix: previously, a user-supplied capacity was silently
            # dropped and self.capacity was never assigned, causing an
            # AttributeError later in _assign_queue/_padded_bucket_batches.
            if capacity is None:
                capacity = max(50 * batch_size, int(1e4))
            self.capacity = capacity
            logging.info("Input capacity set to %d examples." % self.capacity)
            self.batch_size = batch_size
            self.paths = file_paths
            # Integer codes used by the tf.cond data switch below.
            self.control = {'train': 0, 'valid': 1}
            self.active_data = tf.convert_to_tensor(self.control['train'])
            self.is_chatting = is_chatting
            # Placeholder fed directly when chatting (single sentence).
            self._user_input = tf.placeholder(tf.int32, [1, None], name='user_input')
            self._feed_dict = None
            self._scope = scope
            if not is_chatting:
                # Create tensors that will store input batches at runtime.
                self._train_lengths, self.train_batches = self.build_pipeline('train')
                self._valid_lengths, self.valid_batches = self.build_pipeline('valid')

    def build_pipeline(self, name):
        """Creates a new input subgraph composed of the following components:
            - Reader queue that feeds protobuf data files.
            - RandomShuffleQueue assigned parallel-thread queuerunners.
            - Dynamic padded-bucketed-batching queue for organizing batches in a time and
              space-efficient manner.

        Args:
            name: filename prefix for data. See Dataset class for naming conventions.

        Returns:
            2-tuple (lengths, sequences):
                lengths: (dict) parsed context feature from protobuf file.
                    Supports keys in LENGTHS.
                sequences: (dict) parsed feature_list from protobuf file.
                    Supports keys in SEQUENCES.
        """
        with tf.variable_scope(name + '_pipeline'):
            proto_text = self._read_line(self.paths[name + '_tfrecords'])
            context_pair, sequence_pair = self._assign_queue(proto_text)
            # Bucket on combined encoder + decoder length.
            input_length = tf.add(context_pair['encoder_sequence_length'],
                                  context_pair['decoder_sequence_length'],
                                  name=name + 'length_add')
            return self._padded_bucket_batches(input_length, sequence_pair)

    @property
    def encoder_inputs(self):
        """Determines, via tensorflow control structures, which part of the pipeline to run
        and retrieve inputs to a Model encoder component. """
        if not self.is_chatting:
            return self._cond_input('encoder')
        else:
            return self._user_input

    @property
    def decoder_inputs(self):
        """Determines, via tensorflow control structures, which part of the pipeline to run
        and retrieve inputs to a Model decoder component. """
        if not self.is_chatting:
            return self._cond_input('decoder')
        else:
            # In a chat session, we just give the bot the go-ahead to respond!
            return tf.convert_to_tensor([[io_utils.GO_ID]])

    @property
    def user_input(self):
        """Placeholder for a single user sentence, shape [1, None]."""
        return self._user_input

    @property
    def feed_dict(self):
        """Feed dict mapping the user-input placeholder, set by feed_user_input."""
        return self._feed_dict

    def feed_user_input(self, user_input):
        """Called by Model instances upon receiving input from stdin."""
        self._feed_dict = {self._user_input.name: user_input}

    def toggle_active(self):
        """Simple callable that toggles active_data between training and validation."""
        def to_valid(): return tf.constant(self.control['valid'])
        def to_train(): return tf.constant(self.control['train'])
        self.active_data = tf.cond(tf.equal(self.active_data, self.control['train']),
                                   to_valid, to_train)

    def _cond_input(self, prefix):
        """Select the train or valid batch for `prefix` ('encoder'/'decoder')."""
        with tf.name_scope(self._scope):
            def train(): return self.train_batches[prefix + '_sequence']
            def valid(): return self.valid_batches[prefix + '_sequence']
            return tf.cond(tf.equal(self.active_data, self.control['train']),
                           train, valid, name=prefix + '_cond_input')

    def _read_line(self, file):
        """Create ops for extracting lines from files.

        Returns:
            Tensor that will contain the lines at runtime.
        """
        with tf.variable_scope('reader'):
            filename_queue = tf.train.string_input_producer([file])
            reader = tf.TFRecordReader(name='tfrecord_reader')
            _, next_raw = reader.read(filename_queue, name='read_records')
            return next_raw

    def _assign_queue(self, proto_text):
        """Enqueue serialized examples and parse them on dequeue.

        Args:
            proto_text: object to be enqueued and managed by parallel threads.

        Returns:
            2-tuple of dicts: (context features, sequence features).
        """
        with tf.variable_scope('shuffle_queue'):
            queue = tf.RandomShuffleQueue(
                capacity=self.capacity,
                min_after_dequeue=10 * self.batch_size,
                dtypes=tf.string, shapes=[()])
            enqueue_op = queue.enqueue(proto_text)
            example_dq = queue.dequeue()
            # Four parallel enqueuing threads.
            qr = tf.train.QueueRunner(queue, [enqueue_op] * 4)
            tf.train.add_queue_runner(qr)
            _sequence_lengths, _sequences = tf.parse_single_sequence_example(
                serialized=example_dq,
                context_features=LENGTHS,
                sequence_features=SEQUENCES)
            return _sequence_lengths, _sequences

    def _padded_bucket_batches(self, input_length, data):
        """Pad and batch `data` into buckets of similar total length."""
        with tf.variable_scope('bucket_batch'):
            lengths, sequences = bucket_by_sequence_length(
                input_length=tf.to_int32(input_length),
                tensors=data,
                batch_size=self.batch_size,
                bucket_boundaries=[8, 16, 32],
                capacity=self.capacity,
                dynamic_pad=True)
            return lengths, sequences
================================================
FILE: chatbot/dynamic_models.py
================================================
"""Sequence-to-sequence models with dynamic unrolling and faster embedding
techniques.
"""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import sys
import time
import logging
import numpy as np
import tensorflow as tf
from chatbot import components
from chatbot.components import bot_ops, Embedder, InputPipeline
from chatbot._models import Model
from utils import io_utils
from pydoc import locate
class DynamicBot(Model):
    """ General sequence-to-sequence model for conversations.

    Will eventually support beam search, and a wider variety of
    cell options. At present, supports multi-layer encoder/decoders,
    GRU/LSTM cells, attention, and dynamic unrolling (online decoding included).
    Additionally, will eventually support biologically inspired mechanisms for
    learning, such as hebbian-based update rules.
    """

    def __init__(self, dataset, params):
        """Build the model computation graph.

        Args:
            dataset: any instance inheriting from data.DataSet.
            params: dictionary of hyperparameters.
                For supported keys, see DEFAULT_FULL_CONFIG.
                (chatbot.globals.py)
        """
        self.log = logging.getLogger('DynamicBotLogger')
        # Let superclass handle common bookkeeping (saving/loading/dir paths).
        super(DynamicBot, self).__init__(self.log, dataset, params)
        # Build the model's structural components.
        self.build_computation_graph(dataset)
        # Configure training and evaluation.
        # Note: this is distinct from build_computation_graph for historical
        # reasons, and I plan on refactoring. Initially, I more or less followed
        # the feel of Keras for setting up models, but after incorporating the
        # YAML configuration files, this seems rather unnecessary.
        self.compile()

    def build_computation_graph(self, dataset):
        """Create the TensorFlow model graph. Note that this only builds the
        structural components, i.e. nothing related to training parameters,
        optimization, etc.

        The main components to be built (in order):
            1. InputPipeline
            2. Embedder
                - single object shared between encoder/decoder.
                - creates distict embeddings for distinct variable scopes.
            2. Encoder
            3. Decoder
        """
        # Grab the model classes (Constructors) specified by user in params.
        # locate() handles fully-qualified names; fall back to the
        # chatbot.components namespace for bare class names.
        encoder_class = locate(getattr(self, 'encoder.class')) \
                        or getattr(components, getattr(self, 'encoder.class'))
        decoder_class = locate(getattr(self, 'decoder.class')) \
                        or getattr(components, getattr(self, 'decoder.class'))
        assert encoder_class is not None, "Couldn't find requested %s." % \
                                          self.model_params['encoder.class']
        assert decoder_class is not None, "Couldn't find requested %s." % \
                                          self.model_params['decoder.class']
        # Organize input pipeline inside single node for clean visualization.
        self.pipeline = InputPipeline(
            file_paths=dataset.paths,
            batch_size=self.batch_size,
            is_chatting=self.is_chatting)
        # Grab the input feeds for encoder/decoder from the pipeline.
        encoder_inputs = self.pipeline.encoder_inputs
        self.decoder_inputs = self.pipeline.decoder_inputs
        # Create embedder object -- handles all of your embedding needs!
        # By passing scope to embedder calls, we can create distinct embeddings,
        # while storing inside the same embedder object.
        self.embedder = Embedder(
            self.vocab_size,
            self.embed_size,
            l1_reg=self.l1_reg)
        # Explicitly show required parameters for any subclass of
        # chatbot.components.base.RNN (e.g. encoders/decoders).
        # I do this for readability; you can easily tell below which additional
        # params are needed, e.g. for a decoder.
        rnn_params = {
            'state_size': self.state_size,
            'embed_size': self.embed_size,
            'num_layers': self.num_layers,
            'dropout_prob': self.dropout_prob,
            'base_cell': self.base_cell}
        with tf.variable_scope('encoder'):
            embedded_enc_inputs = self.embedder(encoder_inputs)
            # For now, encoders require just the RNN params when created.
            encoder = encoder_class(**rnn_params)
            # Apply embedded inputs to encoder for the final (context) state.
            encoder_outputs, encoder_state = encoder(embedded_enc_inputs)
        with tf.variable_scope("decoder"):
            embedded_dec_inputs = self.embedder(self.decoder_inputs)
            # Sneaky. Would be nice to have a "cleaner" way of doing this.
            if getattr(self, 'attention_mechanism', None) is not None:
                rnn_params['attention_mechanism'] = self.attention_mechanism
            self.decoder = decoder_class(
                encoder_outputs=encoder_outputs,
                vocab_size=self.vocab_size,
                max_seq_len=dataset.max_seq_len,
                temperature=self.temperature,
                **rnn_params)
            # For decoder outpus, we want the full sequence (output sentence),
            # not simply the last.
            decoder_outputs, decoder_state = self.decoder(
                embedded_dec_inputs,
                initial_state=encoder_state,
                is_chatting=self.is_chatting,
                loop_embedder=self.embedder)
        self.outputs = tf.identity(decoder_outputs, name='outputs')
        # Tag inputs and outputs by name should we want to freeze the model.
        tf.add_to_collection('freezer', encoder_inputs)
        tf.add_to_collection('freezer', self.outputs)
        # Merge any summaries floating around in the aether into one object.
        self.merged = tf.summary.merge_all()

    def compile(self):
        """ TODO: perhaps merge this into __init__?

        Originally, this function accepted training/evaluation specific
        parameters. However, since moving the configuration parameters to .yaml
        files and interfacing with the dictionary, no args are needed here,
        and thus would mainly be a hassle to have to call before training.
        Will decide how to refactor this later.
        """
        if not self.is_chatting:
            with tf.variable_scope("evaluation") as scope:
                # Loss - target is to predict (as output) next decoder input.
                # target_labels has shape [batch_size, dec_inp_seq_len - 1]
                target_labels = self.decoder_inputs[:, 1:]
                # Zero out the contribution of padding tokens (id 0).
                target_weights = tf.cast(target_labels > 0, target_labels.dtype)
                preds = self.decoder.apply_projection(self.outputs)
                # L1 penalty accumulated from the embedding regularizers.
                reg_losses = tf.get_collection(tf.GraphKeys.REGULARIZATION_LOSSES)
                l1 = tf.reduce_sum(tf.abs(reg_losses))
                if self.sampled_loss:
                    self.log.info("Training with dynamic sampled softmax loss.")
                    assert 0 < self.num_samples < self.vocab_size, \
                        "num_samples is %d but should be between 0 and %d" \
                        % (self.num_samples, self.vocab_size)
                    self.loss = bot_ops.dynamic_sampled_softmax_loss(
                        target_labels,
                        self.outputs[:, :-1, :],
                        self.decoder.get_projection_tensors(),
                        self.vocab_size,
                        num_samples=self.num_samples) + l1
                else:
                    self.loss = tf.losses.sparse_softmax_cross_entropy(
                        labels=target_labels,
                        logits=preds[:, :-1, :],
                        weights=target_weights) + l1
                    # New loss function I'm experimenting with below:
                    # I'm suspicious that it may do the same stuff
                    # under-the-hood as sparse_softmax_cross_entropy,
                    # but I'm doing speed tests/comparisons to make sure.
                    #self.loss = bot_ops.cross_entropy_sequence_loss(
                    #    labels=target_labels,
                    #    logits=preds[:, :-1, :],
                    #    weights=target_weights) + l1
                self.log.info("Optimizing with %s.", self.optimizer)
                self.train_op = tf.contrib.layers.optimize_loss(
                    loss=self.loss, global_step=self.global_step,
                    learning_rate=self.learning_rate,
                    optimizer=self.optimizer,
                    clip_gradients=self.max_gradient,
                    summaries=['gradients'])
                # Compute accuracy, ensuring we use fully projected outputs.
                correct_pred = tf.equal(tf.argmax(preds[:, :-1, :], axis=2),
                                        target_labels)
                accuracy = tf.reduce_mean(tf.cast(correct_pred, tf.float32))
                tf.summary.scalar('accuracy', accuracy)
                tf.summary.scalar('loss_train', self.loss)
                self.merged = tf.summary.merge_all()
                # Note: Important not to merge in the validation loss, since
                # we don't want to couple it with the training loss summary.
                self.valid_summ = tf.summary.scalar('loss_valid', self.loss)
        super(DynamicBot, self).compile()

    def step(self, forward_only=False):
        """Run one step of the model, which can mean 1 of the following:
            1. forward_only == False.
               - This means we are training.
               - We do a forward and a backward pass.
            2. self.is_chatting.
               - We are running a user's input sentence to generate a response.
               - We only do a forward pass to get the response (word IDs).
            3. Otherwise: inference (used for validation)
               - Do a forward pass, but also compute loss(es) and summaries.

        Args:
            forward_only: if True, don't perform backward pass
                (gradient updates).

        Returns:
            3-tuple: (summaries, step_loss, step_outputs).
            Qualifications/details for each of the 3 cases:
                1. If forward_only == False:
                   - This is a training step: 'summaries' are training summaries.
                   - step_outputs = None
                2. else if self.is_chatting:
                   - summaries = step_loss = None
                   - step_outputs == the bot response tokens
                3. else (validation):
                   - This is validation: 'summaries' are validation summaries.
                   - step_outputs == None (to reduce computational cost).
        """
        if not forward_only:
            fetches = [self.merged, self.loss, self.train_op]
            summaries, step_loss, _ = self.sess.run(fetches)
            return summaries, step_loss, None
        elif self.is_chatting:
            response = self.sess.run(
                fetches=self.outputs,
                feed_dict=self.pipeline.feed_dict)
            return None, None, response
        else:
            fetches = [self.valid_summ, self.loss]  # , self.outputs]
            summaries, step_loss = self.sess.run(fetches)
            return summaries, step_loss, None

    def train(self, dataset=None):
        """Train bot on inputs until user types CTRL-C or queues run out of data.

        Args:
            dataset: (DEPRECATED) any instance of the Dataset class.
                Will be removed soon.
        """
        def perplexity(loss):
            """Standard perplexity = exp(loss), guarded against overflow."""
            return np.exp(float(loss)) if loss < 300 else float("inf")
        if dataset is None:
            dataset = self.dataset
        coord = tf.train.Coordinator()
        threads = tf.train.start_queue_runners(sess=self.sess, coord=coord)
        # Tell embedder to coordinate with TensorBoard's "Embedddings" tab.
        # This allows us to view words in 3D-projected embedding space.
        self.embedder.assign_visualizers(
            self.file_writer,
            ['encoder', 'decoder'],
            dataset.paths['vocab'])
        # Note: Calling sleep allows sustained GPU utilization across training.
        # Without it, GPU has to wait for data to be enqueued more often.
        print('QUEUE RUNNERS RELEASED.', end=" ")
        for _ in range(3):
            print('.', end=" ");
            time.sleep(1);
            sys.stdout.flush()
        print('GO!')
        try:
            avg_loss = avg_step_time = 0.0
            while not coord.should_stop():
                i_step = self.sess.run(self.global_step)
                start_time = time.time()
                summaries, step_loss, _ = self.step()
                # Calculate running averages.
                avg_step_time += (time.time() - start_time) / self.steps_per_ckpt
                avg_loss += step_loss / self.steps_per_ckpt
                # Print updates in desired intervals (steps_per_ckpt).
                if i_step % self.steps_per_ckpt == 0:
                    # Display averged-training updates and save.
                    print("Step %d:" % i_step, end=" ")
                    print("step time = %.3f" % avg_step_time)
                    print("\ttraining loss = %.3f" % avg_loss, end="; ")
                    print("training perplexity = %.2f" % perplexity(avg_loss))
                    self.save(summaries=summaries)
                    # Toggle data switch and led the validation flow!
                    self.pipeline.toggle_active()
                    with self.graph.device('/cpu:0'):
                        summaries, eval_loss, _ = self.step(forward_only=True)
                        self.save(summaries=summaries)
                    self.pipeline.toggle_active()
                    print("\tValidation loss = %.3f" % eval_loss, end="; ")
                    print("val perplexity = %.2f" % perplexity(eval_loss))
                    # Reset the running averages and exit checkpoint.
                    avg_loss = avg_step_time = 0.0
                if i_step >= self.max_steps:
                    print("Maximum step", i_step, "reached.")
                    raise SystemExit
        except (KeyboardInterrupt, SystemExit):
            print("Training halted. Cleaning up . . . ")
            coord.request_stop()
        except tf.errors.OutOfRangeError:
            print("OutOfRangeError. You have run out of data.")
            coord.request_stop()
        finally:
            coord.join(threads)
            self.close(save_current=False, rebuild_for_chat=True)

    def decode(self):
        """Sets up and manages chat session between bot and user (stdin)."""
        # Make sure params are set to chat values, just in case the user
        # forgot to specify/doesn't know about such things.
        self._set_chat_params()
        # Decode from standard input.
        print("Type \"exit\" to exit.\n")
        sentence = io_utils.get_sentence()
        while sentence != 'exit':
            response = self(sentence)
            print("Robot:", response)
            sentence = io_utils.get_sentence()
        print("Farewell, human.")

    def __call__(self, sentence):
        """This is how we talk to the bot interactively.

        While decode(self) above sets up/manages the chat session,
        users can also use this directly to get responses from the bot,
        given an input sentence.

        For example, one could do:
            sentence = 'Hi, bot!'
            response = bot(sentence)
        for a single input-to-response with the bot.

        Args:
            sentence: (str) Input sentence from user.

        Returns:
            response string from bot.
        """
        # Convert input sentence to token-ids.
        # Note the input is reversed before being fed to the encoder.
        encoder_inputs = io_utils.sentence_to_token_ids(
            tf.compat.as_bytes(sentence), self.dataset.word_to_idx)
        encoder_inputs = np.array([encoder_inputs[::-1]])
        self.pipeline.feed_user_input(encoder_inputs)
        # Get output sentence from the chatbot.
        _, _, response = self.step(forward_only=True)
        # response has shape [1, response_length].
        # Its last element is the EOS_ID, which we don't show user.
        response = self.dataset.as_words(response[0][:-1])
        if 'UNK' in response:
            response = "I don't know."
        return response

    def chat(self):
        """Alias for decode."""
        self.decode()

    def respond(self, sentence):
        """Alias for __call__. (Suggestion)"""
        return self.__call__(sentence)

    def close(self, save_current=True, rebuild_for_chat=True):
        """Before closing, which will freeze our graph to a file,
        rebuild it so that it's ready for chatting when unfreezed,
        to make it easier for the user. Training can still be resumed
        with no issue since it doesn't load frozen models, just ckpts.
        """
        if rebuild_for_chat:
            # Preserve the learned learning-rate value across the rebuild.
            lr_val = self.learning_rate.eval(session=self.sess)
            tf.reset_default_graph()
            # Gross. Am ashamed:
            self.sess = tf.Session()
            with self.graph.name_scope(tf.GraphKeys.SUMMARIES):
                self.global_step = tf.Variable(initial_value=0, trainable=False)
                self.learning_rate = tf.constant(lr_val)
                self._set_chat_params()
                self.build_computation_graph(self.dataset)
                self.compile()
        super(DynamicBot, self).close(save_current=save_current)

    def _set_chat_params(self):
        """Set training-specific param values to chatting-specific values."""
        # TODO: use __setattr__ instead of this.
        self.__dict__['__params']['model_params']['decode'] = True
        self.__dict__['__params']['model_params']['is_chatting'] = True
        self.__dict__['__params']['model_params']['batch_size'] = 1
        self.__dict__['__params']['model_params']['reset_model'] = False
        self.__dict__['__params']['model_params']['dropout_prob'] = 0.0
        # Sanity check that the attribute lookups reflect the new values.
        assert self.is_chatting and self.decode and not self.reset_model
================================================
FILE: chatbot/globals.py
================================================
"""Place all default/global chatbot variables here."""
import tensorflow as tf
# Optimizer name (as used in config files) -> TensorFlow optimizer class.
OPTIMIZERS = {
    'Adagrad': tf.train.AdagradOptimizer,
    'Adam': tf.train.AdamOptimizer,
    'SGD': tf.train.GradientDescentOptimizer,
    'RMSProp': tf.train.RMSPropOptimizer,
}

# All allowed and/or used default configuration values, period.
DEFAULT_FULL_CONFIG = {
    "model": "DynamicBot",
    "dataset": "Cornell",
    # Hyperparameters consumed by the model classes (chatbot package).
    "model_params": {
        "base_cell": "GRUCell",
        "ckpt_dir": "out",      # Directory to store training checkpoints.
        "decode": False,
        "batch_size": 256,
        "dropout_prob": 0.2,    # Drop rate applied at encoder/decoders output.
        "decoder.class": "BasicDecoder",
        "encoder.class": "BasicEncoder",
        "embed_size": 128,
        "learning_rate": 0.002,
        "l1_reg": 1.0e-6,       # L1 regularization applied to word embeddings.
        "lr_decay": 0.98,
        "max_gradient": 5.0,
        "max_steps": int(1e6),  # Max number of training iterations.
        "num_layers": 1,        # Num layers for each of encoder, decoder.
        "num_samples": 512,     # IF sampled_loss is true, default sample size.
        "optimizer": "Adam",    # Options are those in OPTIMIZERS above.
        "reset_model": True,
        "sampled_loss": False,  # Whether to do sampled softmax.
        "state_size": 512,
        "steps_per_ckpt": 200,
        "temperature": 0.0,     # Response temp for chat sessions. (default argmax)
    },
    # Parameters consumed by the Dataset classes (data package).
    "dataset_params": {
        "data_dir": None,       # Require user to specify.
        "vocab_size": 40000,
        "max_seq_len": 10,      # Maximum length of sentence used to train bot.
        "optimize_params": True # Reduce vocab size if exceeds num unique words
    },
}
================================================
FILE: chatbot/legacy/__init__.py
================================================
================================================
FILE: chatbot/legacy/_decode.py
================================================
"""Used by legacy_models for decoding. Not needed by DynamicBot."""
import tensorflow as tf
import logging
import os
import sys
from utils import io_utils
from utils.io_utils import sentence_to_token_ids, get_vocab_dicts
import numpy as np
def decode(bot, dataset, teacher_mode=True):
    """Runs a chat session between the given chatbot and user.

    Args:
        bot: trained legacy (bucketed) chatbot; decodes one sentence at a time.
        dataset: dataset object supplying word_to_idx / idx_to_word maps.
        teacher_mode: if True, after each response the user is asked what the
            bot *should* have said, and the bot briefly trains on that
            feedback before retrying.
    """
    # We decode one sentence at a time.
    bot.batch_size = 1
    # Decode from standard input.
    print("Type \"exit\" to exit.")
    print("Write stuff after the \">\" below and I, your robot friend, will respond.")
    sentence = io_utils.get_sentence()
    while sentence:
        # Bug fix: check the exit sentinel *before* processing, so typing
        # 'exit' as the very first input also terminates the session
        # (previously it was chatted at, and only caught on later turns).
        if sentence == 'exit':
            print("Fine, bye :(")
            break
        # Convert input sentence to token-ids.
        token_ids = sentence_to_token_ids(
            tf.compat.as_bytes(sentence), dataset.word_to_idx)
        # Get output sentence from the chatbot.
        outputs = decode_inputs(token_ids, dataset.idx_to_word, bot)
        # Print the chatbot's response.
        print(outputs)
        if teacher_mode:
            print("What should I have said?")
            feedback = io_utils.get_sentence()
            # Bug fix: was dataset.inputs_to_word, which is not defined
            # anywhere; converting a sentence to ids needs the word->id map,
            # exactly as done for the user's input sentence above.
            feedback_ids = sentence_to_token_ids(
                tf.compat.as_bytes(feedback), dataset.word_to_idx)
            outputs = train_on_feedback(
                bot, token_ids, feedback_ids, dataset.idx_to_word)
            print("Okay. Let me try again:\n", outputs)
        # Wait for next input.
        sentence = io_utils.get_sentence()
def decode_inputs(inputs, idx_to_word, chatbot):
    """Run token ids through the chatbot and return its response string.

    Args:
        inputs: list of token ids for the user's sentence.
        idx_to_word: map from token id -> word.
        chatbot: legacy bucketed model with get_batch/step methods.
    """
    # Choose the smallest bucket that can hold the input.
    bucket_id = _assign_to_bucket(inputs, chatbot.buckets)
    # Wrap the single sentence as a 1-element batch for that bucket.
    batch_data = {bucket_id: [(inputs, [])]}
    encoder_in, decoder_in, weights = chatbot.get_batch(batch_data, bucket_id)
    # Forward pass only (True) to obtain the output logits.
    _, _, _, logits = chatbot.step(encoder_in, decoder_in, weights, bucket_id, True)
    # Convert raw logits into a printable chat response.
    return _logits_to_outputs(logits, chatbot.temperature, idx_to_word)
def train_on_feedback(chatbot, input_ids, feedback_ids, idx_to_outputs):
    """Briefly fine-tune the bot on (input, feedback), then decode again.

    Args:
        chatbot: legacy bucketed model.
        input_ids: token ids of the original user sentence.
        feedback_ids: token ids of the user's corrected response.
        idx_to_outputs: map from token id -> word, used for the retry decode.
    """
    bucket_id = _assign_to_bucket(feedback_ids, chatbot.buckets)
    batch_data = {bucket_id: [(input_ids, feedback_ids)]}
    enc_in, dec_in, weights = chatbot.get_batch(batch_data, bucket_id)
    # Jack up learning rate & make sure robot learned its lesson.
    chatbot.sess.run(chatbot.learning_rate.assign(0.7))
    for _ in range(10):
        # LEARN YOU FOOL, LEARN. :)
        chatbot.step(enc_in, dec_in, weights, bucket_id, False)
    return decode_inputs(input_ids, idx_to_outputs, chatbot)
def _logits_to_outputs(output_logits, temperature, idx_word):
    """Turn per-timestep logits into a capitalized response string.

    Args:
        output_logits: list over output timesteps of logit arrays.
        temperature: sampling temperature forwarded to _sample.
        idx_word: map from token id -> word.
    """
    token_ids = [_sample(logits, temperature) for logits in output_logits]
    # Truncate the response at the first end-of-sequence token, if present.
    if io_utils.EOS_ID in token_ids:
        token_ids = token_ids[:token_ids.index(io_utils.EOS_ID)]
    response = " ".join([tf.compat.as_str(idx_word[tok]) for tok in token_ids]) + "."
    # Capitalize the first character of the sentence.
    return response[0].upper() + response[1:]
def _sample(logits, temperature):
if temperature < 0.5:
return int(np.argmax(logits, axis=1))
logits = logits.flatten()
logits = logits / temperature
logits = np.exp(logits - np.max(logits))
logits = logits / np.sum(logits)
sampleID = np.argmax(np.random.multinomial(1, logits, 1))
while sampleID == io_utils.UNK_ID:
sampleID = np.argmax(np.random.multinomial(1, logits, 1))
return int(sampleID)
def _assign_to_bucket(token_ids, buckets):
"""Find bucket large enough for token_ids, else warning."""
bucket_id = len(buckets) - 1
for i, bucket in enumerate(buckets):
if bucket[0] >= len(token_ids):
bucket_id = i
break
else:
logging.warning("Sentence longer than truncated: %s", len(token_ids))
return bucket_id
================================================
FILE: chatbot/legacy/_train.py
================================================
"""Train seq2seq attention chatbot.
Note: Only used for legacy_models.
For (better) DynamicBot implementation, please see dynamic_models.py and, for saving/restoring ops,
the base class of all models in _models.py.
"""
import time
from utils import *
def train(bot, dataset):
    """Train chatbot on the given dataset until interrupted.

    Args:
        bot: instance of ChatBot or SimpleBot.
        dataset: dataset object whose token-id files feed io_utils.read_data.
    """
    # Get data as token-ids, split by bucket.
    train_set, dev_set = io_utils.read_data(dataset,
                                            bot.buckets)
    # train_buckets_scale[i] == cumulative fraction of samples in bucket i or below.
    train_buckets_scale = _get_data_distribution(train_set, bot.buckets)
    # Running averages accumulated over one checkpoint interval.
    step_count = 0
    interval_time, interval_loss = 0.0, 0.0
    previous_losses = []
    try:
        while True:
            # Sample a bucket with probability proportional to its size:
            # the first cumulative fraction exceeding rand picks the bucket.
            rand = np.random.random_sample()
            bucket_id = next(i for i, scale in enumerate(train_buckets_scale)
                             if scale > rand)
            # Time a single training step on a batch from that bucket.
            tick = time.time()
            summary, step_loss = run_train_step(bot, train_set, bucket_id, False)
            interval_time += (time.time() - tick) / bot.steps_per_ckpt
            interval_loss += step_loss / bot.steps_per_ckpt
            # Periodically save a checkpoint, print statistics, and run evals.
            if step_count % bot.steps_per_ckpt == 0:
                run_checkpoint(bot, interval_time, interval_loss,
                               previous_losses, dev_set)
                interval_time, interval_loss = 0.0, 0.0
            step_count += 1
    except (KeyboardInterrupt, SystemExit):
        print("Training halted. Cleaning up . . . ")
        # Store the model's graph in ckpt directory.
        bot.saver.export_meta_graph(bot.ckpt_dir + dataset.name + '.meta')
        bot.close()
        print("Done.")
def run_train_step(model, train_set, bucket_id, forward_only=False):
    """Draw a batch from the given bucket and run one model step.

    Returns:
        (summary, loss) from the step; summary is written to the train
        writer during backward (training) steps.
    """
    batch = model.get_batch(train_set, bucket_id)
    encoder_inputs, decoder_inputs, target_weights = batch
    summary, _, losses, _ = model.step(
        encoder_inputs, decoder_inputs, target_weights, bucket_id, forward_only)
    # Only log summaries for genuine training steps that produced one.
    if summary is not None and not forward_only:
        model.train_writer.add_summary(summary, model.global_step.eval(model.sess))
    return summary, losses
def run_checkpoint(model, step_time, loss, previous_losses, dev_set):
    """Print interval statistics, then evaluate and save on the dev set."""

    def _perplexity(value):
        # exp overflows for diverged losses; report inf instead.
        return np.exp(float(value)) if value < 300 else float("inf")

    # Print statistics for the previous epoch.
    print("\nglobal step:", model.global_step.eval(model.sess), end=" ")
    print("learning rate: %.4f" % model.learning_rate.eval(session=model.sess), end=" ")
    print("step time: %.2f" % step_time, end=" ")
    print("perplexity: %.2f" % _perplexity(loss))
    # Run evals on development set and print their perplexity.
    for bucket_id in range(len(model.buckets)):
        if not dev_set[bucket_id]:
            print(" eval: empty bucket %d" % (bucket_id))
            continue
        summary, eval_loss = run_train_step(model, dev_set, bucket_id,
                                            forward_only=True)
        model.save(summaries=summary)
        print(" eval: bucket %d perplexity %.2f"
              % (bucket_id, _perplexity(eval_loss)))
    sys.stdout.flush()
def _get_data_distribution(train_set, buckets):
# Get number of samples for each bucket (i.e. train_bucket_sizes[1] == num-trn-samples-in-bucket-1).
train_bucket_sizes = [len(train_set[b]) for b in range(len(buckets))]
# The total number training samples, excluding the ones too long for our bucket choices.
train_total_size = float(sum(train_bucket_sizes))
# Interpret as: train_buckets_scale[i] == [cumulative] fraction of samples in bucket i or below.
return [sum(train_bucket_sizes[:i + 1]) / train_total_size
for i in range(len(train_bucket_sizes))]
================================================
FILE: chatbot/legacy/legacy_models.py
================================================
"""Sequence-to-sequence models."""
# EDIT: Modified inheritance structure (see _models.py) so these *should* work again.
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import logging
import numpy as np
import tensorflow as tf
from tensorflow.contrib.legacy_seq2seq import embedding_attention_seq2seq
from tensorflow.contrib.legacy_seq2seq import model_with_buckets
#from tensorflow.contrib.rnn.python.ops import core_rnn
from tensorflow.contrib.rnn.python.ops import core_rnn_cell
from tensorflow.python.ops import embedding_ops
from chatbot._models import BucketModel
class ChatBot(BucketModel):
    """Sequence-to-sequence model with attention and for multiple buckets.

    The input-to-output path can be thought of (on a high level) as follows:
        1. Inputs: Batches of integer lists, where each integer is a
           word ID to a pre-defined vocabulary.
        2. Embedding: each input integer is mapped to an embedding vector.
           Each embedding vector is of length 'layer_size', an argument to __init__.
           The encoder and decoder have their own distinct embedding spaces.
        3. Encoding: The embedded batch vectors are fed to a multi-layer cell containing GRUs.
        4. Attention: At each timestep, the output of the multi-layer cell is saved, so that
           the decoder can access them in the manner specified in the paper on
           jointly learning to align and translate. (should give a link to paper...)
        5. Decoding: The decoder, the same type of embedded-multi-layer cell
           as the encoder, is initialized with the last output of the encoder,
           the "context". Thereafter, we either feed it a target sequence
           (when training) or we feed its previous output as its next input (chatting).
    """

    def __init__(self, buckets, dataset, params):
        """Construct the bucketed attention seq2seq graph.

        Args:
            buckets: list of (encoder_len, decoder_len) pairs; must have
                length exactly 1 (see error below).
            dataset: dataset object handed through to BucketModel.
            params: configuration dict handed through to BucketModel.

        Raises:
            ValueError: if more than one bucket is given.
        """
        logging.basicConfig(level=logging.INFO)
        logger = logging.getLogger('ChatBotLogger')
        super(ChatBot, self).__init__(
            logger=logger,
            buckets=buckets,
            dataset=dataset,
            params=params)

        if len(buckets) > 1:
            self.log.error("ChatBot requires len(buckets) be 1 since tensorflow's"
                           " model_with_buckets function is now deprecated and BROKEN. The only"
                           "workaround is ensuring len(buckets) == 1. ChatBot apologizes."
                           "ChatBot also wishes it didn't have to be this way. "
                           "ChatBot is jealous that DynamicBot does not have these issues.")
            raise ValueError("Not allowed to pass buckets with len(buckets) > 1.")

        # ==========================================================================================
        # Define basic components: cell(s) state, encoder, decoder.
        # ==========================================================================================

        #cell = tf.contrib.rnn.MultiRNNCell([tf.contrib.rnn.GRUCell(state_size)for _ in range(num_layers)])
        cell = tf.contrib.rnn.GRUCell(self.state_size)
        # One placeholder per timestep, sized by the largest bucket.
        self.encoder_inputs = ChatBot._get_placeholder_list("encoder", buckets[-1][0])
        self.decoder_inputs = ChatBot._get_placeholder_list("decoder", buckets[-1][1] + 1)
        self.target_weights = ChatBot._get_placeholder_list("weight", buckets[-1][1] + 1, tf.float32)
        # Targets are the decoder inputs shifted left by one timestep.
        target_outputs = [self.decoder_inputs[i + 1] for i in range(len(self.decoder_inputs) - 1)]

        # If specified, sample from subset of full vocabulary size during training.
        softmax_loss, output_proj = None, None
        if 0 < self.num_samples < self.vocab_size:
            softmax_loss, output_proj = ChatBot._sampled_loss(self.num_samples,
                                                              self.state_size,
                                                              self.vocab_size)

        # ==========================================================================================
        # Combine the components to construct desired model architecture.
        # ==========================================================================================

        # The seq2seq function: we use embedding for the input and attention.
        def seq2seq_f(encoder_inputs, decoder_inputs):
            # Note: the returned function uses separate embeddings for encoded/decoded sets.
            #           Maybe try implementing same embedding for both.
            # Question: the outputs are projected to vocab_size NO MATTER WHAT.
            #           i.e. if output_proj is None, it uses its own OutputProjectionWrapper instead
            #           --> How does this affect our model?? A bit misleading imo.
            #with tf.variable_scope(scope or "seq2seq2_f") as seq_scope:
            return embedding_attention_seq2seq(encoder_inputs, decoder_inputs, cell,
                                               num_encoder_symbols=self.vocab_size,
                                               num_decoder_symbols=self.vocab_size,
                                               embedding_size=self.state_size,
                                               output_projection=output_proj,
                                               feed_previous=self.is_chatting,
                                               dtype=tf.float32)

        # Note that self.outputs and self.losses are lists of length len(buckets).
        # This allows us to identify which outputs/losses to compute given a particular bucket.
        # Furthermore, \forall i < j, len(self.outputs[i])  < len(self.outputs[j]). (same for loss)
        self.outputs, self.losses = model_with_buckets(
            self.encoder_inputs, self.decoder_inputs,
            target_outputs, self.target_weights,
            buckets, seq2seq_f,
            softmax_loss_function=softmax_loss)

        # If decoding, append _projection to true output to the model.
        if self.is_chatting and output_proj is not None:
            self.outputs = ChatBot._get_projections(len(buckets), self.outputs, output_proj)

        # One scalar loss summary per bucket, keyed "loss{i}".
        with tf.variable_scope("summaries"):
            self.summaries = {}
            for i, loss in enumerate(self.losses):
                name = "loss{}".format(i)
                self.summaries[name] = tf.summary.scalar("loss{}".format(i), loss)

    def step(self, encoder_inputs, decoder_inputs, target_weights, bucket_id, forward_only=False):
        """Run a step of the model.

        Args:
            encoder_inputs: list of numpy int vectors to feed as encoder inputs.
            decoder_inputs: list of numpy int vectors to feed as decoder inputs.
            target_weights: list of numpy float vectors to feed as target weights.
            bucket_id: which bucket of the model to use.
            forward_only: if True, no gradient update is applied (used both
                for chatting and for validation during training).

        Returns:
            [summary, gradient_norms, loss, outputs]
        """
        encoder_size, decoder_size = self.buckets[bucket_id]
        super(ChatBot, self).check_input_lengths(
            [encoder_inputs, decoder_inputs, target_weights],
            [encoder_size, decoder_size, decoder_size])

        # Build the feed dict, keyed by placeholder name.
        input_feed = {}
        for l in range(encoder_size):
            input_feed[self.encoder_inputs[l].name] = encoder_inputs[l]
        for l in range(decoder_size):
            input_feed[self.decoder_inputs[l].name] = decoder_inputs[l]
            input_feed[self.target_weights[l].name] = target_weights[l]
        # The extra (decoder_size-th) decoder input is always fed zeros.
        input_feed[self.decoder_inputs[decoder_size].name] = np.zeros([self.batch_size],
                                                                      dtype=np.int32)

        if not forward_only:  # Not just for decoding; also for validating in training.
            fetches = [self.summaries["loss{}".format(bucket_id)],
                       self.apply_gradients[bucket_id],  # Update Op that does SGD.
                       self.losses[bucket_id]]           # Loss for this batch.
            outputs = self.sess.run(fetches=fetches, feed_dict=input_feed)
            return outputs[0], None, outputs[2], None  # Summary, no gradients, loss, outputs.
        else:
            fetches = [self.losses[bucket_id]]  # Loss for this batch.
            for l in range(decoder_size):       # Output logits.
                fetches.append(self.outputs[bucket_id][l])
            outputs = self.sess.run(fetches=fetches, feed_dict=input_feed)
            return None, None, outputs[0], outputs[1:]  # No summary, no gradients, loss, outputs.

    @staticmethod
    def _sampled_loss(num_samples, hidden_size, vocab_size):
        """Defines the sampled softmax loss op and the associated output _projection.

        Args:
            num_samples: (context: importance sampling) size of subset of outputs for softmax.
            hidden_size: number of units in the individual recurrent states.
            vocab_size: number of unique output words.

        Returns:
            sampled_loss, apply_projection
            - function: sampled_loss(labels, inputs)
            - apply_projection: transformation to full vocab space, applied to decoder output.
        """
        assert(0 < num_samples < vocab_size)

        # Define the standard affine-softmax transformation from hidden_size -> vocab_size.
        # True output (for a given bucket) := tf.matmul(decoder_out, w) + b
        w_t = tf.get_variable("proj_w", [vocab_size, hidden_size], dtype=tf.float32)
        w = tf.transpose(w_t)
        b = tf.get_variable("proj_b", [vocab_size], dtype=tf.float32)
        output_projection = (w, b)

        def sampled_loss(labels, inputs):
            # sampled_softmax_loss expects labels with shape [batch, 1].
            labels = tf.reshape(labels, [-1, 1])
            return tf.nn.sampled_softmax_loss(
                weights=w_t,
                biases=b,
                labels=labels,
                inputs=inputs,
                num_sampled=num_samples,
                num_classes=vocab_size)

        return sampled_loss, output_projection

    @staticmethod
    def _get_projections(num_buckets, unprojected_vals, projection_operator):
        """Apply _projection operator to unprojected_vals, a tuple of length num_buckets.

        :param num_buckets: the number of projections that will be applied.
        :param unprojected_vals: tuple of length num_buckets.
        :param projection_operator: (in the mathematical meaning) tuple of shape unprojected_vals.shape[-1].
        :return: tuple of length num_buckets, with entries the same shape as entries in unprojected_vals, except for the last dimension.
        """
        projected_vals = unprojected_vals
        for b in range(num_buckets):
            # projection_operator is (w, b): project each timestep's output.
            projected_vals[b] = [tf.matmul(output, projection_operator[0]) + projection_operator[1]
                                 for output in unprojected_vals[b]]
        return projected_vals

    @staticmethod
    def _get_placeholder_list(name, length, dtype=tf.int32):
        """Build a list of per-timestep placeholders.

        Args:
            name: prefix of name of each tf.placeholder list item, where i'th name is [name]i.
            length: number of items (tf.placeholders) in the returned list.
            dtype: element type of each placeholder (defaults to tf.int32).

        Returns:
            list of tensorflow placeholder of dtype=tf.int32 and unspecified shape.
        """
        return [tf.placeholder(dtype, shape=[None], name=name+str(i)) for i in range(length)]
class SimpleBot(BucketModel):
    """Primitive implementation from scratch, for learning purposes.

    1. Inputs: same as ChatBot.
    2. Embedding: same as ChatBot.
    3. BasicEncoder: Single GRUCell.
    4. DynamicDecoder: Single GRUCell.
    """

    def __init__(self, dataset, params):
        """Build a two-bucket encoder-decoder graph from scratch.

        Args:
            dataset: dataset object; its max_seq_len determines the buckets.
            params: configuration dict handed through to BucketModel.
        """
        # SimpleBot allows user to not worry about making their own buckets.
        # SimpleBot does that for you. SimpleBot cares.
        max_seq_len = dataset.max_seq_len
        buckets = [(max_seq_len // 2, max_seq_len // 2), (max_seq_len, max_seq_len)]
        logging.basicConfig(level=logging.INFO)
        logger = logging.getLogger('SimpleBotLogger')
        super(SimpleBot, self).__init__(
            logger=logger,
            buckets=buckets,
            dataset=dataset,
            params=params)

        # ==========================================================================================
        # Create placeholder lists for encoder/decoder sequences.
        # ==========================================================================================

        # One placeholder per timestep; decoder side gets one extra slot
        # (fed zeros in step()) for the shifted-targets construction.
        with tf.variable_scope("placeholders"):
            self.encoder_inputs = [tf.placeholder(tf.int32, shape=[None], name="encoder"+str(i))
                                   for i in range(self.max_seq_len)]
            self.decoder_inputs = [tf.placeholder(tf.int32, shape=[None], name="decoder"+str(i))
                                   for i in range(self.max_seq_len+1)]
            self.target_weights = [tf.placeholder(tf.float32, shape=[None], name="weight"+str(i))
                                   for i in range(self.max_seq_len+1)]

        # ==========================================================================================
        # Before bucketing, need to define the underlying model(x, y) -> outputs, state(s).
        # ==========================================================================================

        def seq2seq(encoder_inputs, decoder_inputs, scope=None):
            """Builds basic encoder-decoder model and returns list of (2D) output tensors."""
            with tf.variable_scope(scope or "seq2seq"):
                encoder_cell = tf.contrib.rnn.GRUCell(self.state_size)
                encoder_cell = tf.contrib.rnn.EmbeddingWrapper(encoder_cell, self.vocab_size, self.state_size)
                # BasicEncoder(raw_inputs) -> Embed(raw_inputs) -> [be an RNN] -> encoder state.
                _, encoder_state = tf.contrib.rnn.static_rnn(encoder_cell, encoder_inputs, dtype=tf.float32)
                with tf.variable_scope("decoder"):

                    def loop_function(x):
                        # Embed the previous output token to feed as next input.
                        with tf.variable_scope("loop_function"):
                            params = tf.get_variable("embed_tensor", [self.vocab_size, self.state_size])
                            return embedding_ops.embedding_lookup(params, tf.argmax(x, 1))

                    _decoder_cell = tf.contrib.rnn.GRUCell(self.state_size)
                    _decoder_cell = tf.contrib.rnn.EmbeddingWrapper(_decoder_cell, self.vocab_size, self.state_size)
                    # Dear TensorFlow: you should replace the 'reuse' param in
                    # OutputProjectionWrapper with 'scope' and just do scope.reuse in __init__.
                    # sincerely, programming conventions.
                    decoder_cell = tf.contrib.rnn.OutputProjectionWrapper(
                        _decoder_cell, self.vocab_size, reuse=tf.get_variable_scope().reuse)
                    decoder_outputs = []
                    prev = None
                    decoder_state = None
                    # NOTE(review): `prev` is never assigned after the loop starts,
                    # so the loop_function (feed-previous) branch below appears
                    # unreachable — confirm whether `prev = output` is missing.
                    for i, dec_inp in enumerate(decoder_inputs):
                        if self.is_chatting and prev is not None:
                            dec_inp = loop_function(tf.reshape(prev, [1, 1]))
                        if i == 0:
                            output, decoder_state = decoder_cell(dec_inp, encoder_state,
                                                                 scope=tf.get_variable_scope())
                        else:
                            tf.get_variable_scope().reuse_variables()
                            output, decoder_state = decoder_cell(dec_inp, decoder_state,
                                                                 scope=tf.get_variable_scope())
                        decoder_outputs.append(output)
            return decoder_outputs

        # ====================================================================================
        # Now we can build a simple bucketed seq2seq model.
        # ====================================================================================

        self.losses = []
        self.outputs = []
        # NOTE(review): decoder_inputs is listed twice here; possibly
        # target_weights was intended as the third operand — confirm.
        values = self.encoder_inputs + self.decoder_inputs + self.decoder_inputs
        with tf.name_scope("simple_bucket_model", values):
            for idx_b, bucket in enumerate(buckets):
                # Reminder: you should never explicitly set reuse=False. It's a no-no.
                with tf.variable_scope(tf.get_variable_scope(), reuse=True if idx_b > 0 else None)\
                        as bucket_scope:
                    # The outputs for this bucket are defined entirely by the seq2seq function.
                    self.outputs.append(seq2seq(
                        self.encoder_inputs[:bucket[0]],
                        self.decoder_inputs[:bucket[1]],
                        scope=bucket_scope))
                    # Target outputs are just the inputs time-shifted by 1.
                    target_outputs = [self.decoder_inputs[i + 1]
                                      for i in range(len(self.decoder_inputs) - 1)]
                    # Compute loss by comparing outputs and target outputs.
                    self.losses.append(SimpleBot._simple_loss(self.batch_size,
                                                              self.outputs[-1],
                                                              target_outputs[:bucket[1]],
                                                              self.target_weights[:bucket[1]]))

        # One scalar loss summary per bucket, keyed "loss{i}".
        with tf.variable_scope("summaries"):
            self.summaries = {}
            for i, loss in enumerate(self.losses):
                name = "loss{}".format(i)
                self.summaries[name] = tf.summary.scalar("loss{}".format(i), loss)

    @staticmethod
    def _simple_loss(batch_size, logits, targets, weights):
        """Compute weighted cross-entropy loss on softmax(logits)."""
        # Note: name_scope only affects names of ops,
        # while variable_scope affects both ops AND variables.
        with tf.name_scope("simple_loss", values=logits+targets+weights):
            log_perplexities = []
            for l, t, w in zip(logits, targets, weights):
                cross_entropy = tf.nn.sparse_softmax_cross_entropy_with_logits(labels=t, logits=l)
                log_perplexities.append(cross_entropy * w)
        # Reduce via elementwise-sum.
        log_perplexities = tf.add_n(log_perplexities)
        # Get weighted-averge by dividing by sum of the weights.
        log_perplexities /= tf.add_n(weights) + 1e-12
        return tf.reduce_sum(log_perplexities) / tf.cast(batch_size, tf.float32)

    def step(self, encoder_inputs, decoder_inputs, target_weights, bucket_id, forward_only=False):
        """Run a step of the model.

        Args:
            encoder_inputs: list of numpy int vectors to feed as encoder inputs.
            decoder_inputs: list of numpy int vectors to feed as decoder inputs.
            target_weights: list of numpy float vectors to feed as target weights.
            bucket_id: which bucket of the model to use.
            forward_only: if True, no gradient update is applied (used both
                for chatting and for validation during training).

        Returns:
            [summary, gradient_norms, loss, outputs]:
        """
        encoder_size, decoder_size = self.buckets[bucket_id]
        super(SimpleBot, self).check_input_lengths(
            [encoder_inputs, decoder_inputs, target_weights],
            [encoder_size, decoder_size, decoder_size])

        # Build the feed dict, keyed by placeholder name.
        input_feed = {}
        for l in range(encoder_size):
            input_feed[self.encoder_inputs[l].name] = encoder_inputs[l]
        for l in range(decoder_size):
            input_feed[self.decoder_inputs[l].name] = decoder_inputs[l]
            input_feed[self.target_weights[l].name] = target_weights[l]
        # The extra (decoder_size-th) decoder input is always fed zeros.
        input_feed[self.decoder_inputs[decoder_size].name] = np.zeros([self.batch_size], dtype=np.int32)

        # Fetches: the Operations/Tensors we want executed/evaluated during session.run(...).
        if not forward_only:  # Not just for decoding; also for validating in training.
            fetches = [self.summaries["loss{}".format(bucket_id)],
                       self.apply_gradients[bucket_id],  # Update Op that does SGD.
                       self.losses[bucket_id]]           # Loss for this batch.
            outputs = self.sess.run(fetches=fetches, feed_dict=input_feed)
            return outputs[0], None, outputs[2], None  # summaries,  No gradient norm, loss, no outputs.
        else:
            fetches = [self.losses[bucket_id]]  # Loss for this batch.
            for l in range(decoder_size):       # Output logits.
                fetches.append(self.outputs[bucket_id][l])
            outputs = self.sess.run(fetches=fetches, feed_dict=input_feed)
            return None, None, outputs[0], outputs[1:]  # No summary, No gradient norm, loss, outputs.
================================================
FILE: configs/example_attention.yml
================================================
model: DynamicBot
dataset: Cornell
model_params:
base_cell: LSTMCell
ckpt_dir: out/cornell
attention_mechanism: BahdanauAttention
decoder.class: AttentionDecoder
encoder.class: BasicEncoder
batch_size: 256
embed_size: 128
num_layers: 1
state_size: 512
steps_per_ckpt: 250
dataset_params:
data_dir: /home/brandon/Datasets/cornell # Change to your path!
vocab_size: 52000
max_seq_len: 10
================================================
FILE: configs/example_cornell.yml
================================================
model: DynamicBot
dataset: Cornell
model_params:
base_cell: LSTMCell
num_layers: 2
attention_mechanism: LuongAttention
decoder.class: AttentionDecoder
encoder.class: BidirectionalEncoder
ckpt_dir: out/cornell
dataset_params:
data_dir: /home/brandon/Datasets/cornell # The only truly 'mandatory' parameter.
vocab_size: 52000 # Approximately the true number of unique words in the dataset.
max_seq_len: 20
================================================
FILE: configs/example_reddit.yml
================================================
model: DynamicBot
dataset: Reddit
model_params:
base_cell: GRUCell
batch_size: 128
embed_size: 128
num_layers: 1
reset_model: true
steps_per_ckpt: 200
ckpt_dir: out/reddit/basicReddit
dataset_params:
data_dir: /home/brandon/Datasets/reddit
max_seq_len: 15
vocab_size: 80000 # HUGE dataset = huge vocabulary.
================================================
FILE: configs/example_ubuntu.yml
================================================
model: DynamicBot
dataset: Ubuntu
model_params:
base_cell: GRUCell
ckpt_dir: out/ubuntu
decoder.class: BasicDecoder
encoder.class: BasicEncoder
num_layers: 2
state_size: 512
dataset_params:
data_dir: /home/brandon/Datasets/ubuntu
vocab_size: 60000 # Should probably be higher. Ubuntu is noisy.
max_seq_len: 12 # Any longer, and output quality is a challenge.
================================================
FILE: configs/ubuntu_basic.yml
================================================
model: chatbot.DynamicBot
dataset: data.Ubuntu
model_params:
base_cell: GRUCell
ckpt_dir: out/ubuntu/basic
decode: False
batch_size: 128
decoder.class: BasicDecoder
encoder.class: BasicEncoder
embed_size: 128
learning_rate: 0.002
num_layers: 1
reset_model: True
state_size: 512
steps_per_ckpt: 100
dataset_params:
data_dir: /home/brandon/Datasets/ubuntu
vocab_size: 60000
max_seq_len: 12
optimize_params: true
================================================
FILE: configs/website_config.yml
================================================
# Experimenting with best model params for website.
model: DynamicBot
dataset: Reddit
model_params:
base_cell: LSTMCell
batch_size: 128
ckpt_dir: out/reddit/website_config
dropout_prob: 0.0
decoder.class: BasicDecoder
encoder.class: BasicEncoder
l1_reg: 0.0
learning_rate: 0.001
embed_size: 128
num_layers: 2
reset_model: False
state_size: 512
steps_per_ckpt: 500
dataset_params:
data_dir: /home/brandon/Datasets/reddit # The only truly 'mandatory' parameter.
vocab_size: 40000
max_seq_len: 15
================================================
FILE: data/__init__.py
================================================
from __future__ import absolute_import
from data import data_helper
from data import _dataset
from data import dataset_wrappers
from data.data_helper import DataHelper
from data._dataset import Dataset
from data.dataset_wrappers import Cornell, Ubuntu, Reddit, TestData
__all__ = ['Cornell', 'Reddit', 'Ubuntu', 'TestData']
================================================
FILE: data/_dataset.py
================================================
"""ABC for datasets. """
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import os
import logging
import numpy as np
import tensorflow as tf
from utils import io_utils
from abc import ABCMeta, abstractmethod, abstractproperty
from chatbot.globals import DEFAULT_FULL_CONFIG
DEFAULT_PARAMS = DEFAULT_FULL_CONFIG['dataset_params']
class DatasetABC(metaclass=ABCMeta):
    """Abstract interface that all dataset classes must implement.

    Note: uses stacked @property/@abstractmethod instead of the
    abc.abstractproperty decorator, which has been deprecated since
    Python 3.3; subclasses still override these as plain @property.
    """

    @abstractmethod
    def convert_to_tf_records(self, *args):
        """If not found in data dir, will create tfrecords data
        files from text files.
        """
        pass

    @abstractmethod
    def train_generator(self, batch_size):
        """Returns a generator function for batches of batch_size
        train data.
        """
        pass

    @abstractmethod
    def valid_generator(self, batch_size):
        """Returns a generator function for batches of batch_size
        validation data.
        """
        pass

    @property
    @abstractmethod
    def word_to_idx(self):
        """Return dictionary map from str -> int. """
        pass

    @property
    @abstractmethod
    def idx_to_word(self):
        """Return dictionary map from int -> str. """
        pass

    @property
    @abstractmethod
    def name(self):
        """Returns name of the dataset as a string."""
        pass

    @property
    @abstractmethod
    def max_seq_len(self):
        """Return the maximum allowed sentence length."""
        pass
class Dataset(DatasetABC):
    def __init__(self, dataset_params):
        """Implements the general subset of operations that all
        dataset subclasses can use.

        Args:
            dataset_params: dictionary of configuration parameters.
                See DEFAULT_FULL_CONFIG at top of file for supported keys.
        """
        # Stored via __dict__ directly; attribute reads like self.data_dir
        # presumably resolve through a __getattr__ over '__params' defined
        # elsewhere in this class — TODO confirm (not visible here).
        self.__dict__['__params'] = Dataset.fill_params(dataset_params)
        # We query io_utils to ensure all data files are organized properly,
        # and io_utils returns the paths to files of interest.
        id_paths, vocab_path, vocab_size = io_utils.prepare_data(
            data_dir=self.data_dir,
            vocab_size=self.vocab_size,
            optimize=dataset_params.get('optimize_params'),
            config_path=dataset_params.get('config_path'))
        # prepare_data may shrink the vocabulary (e.g. fewer unique words
        # than requested); keep our copy in sync with the real size.
        if vocab_size != self.vocab_size:
            self.log.info("Updating vocab size from %d to %d",
                          self.vocab_size, vocab_size)
            self.vocab_size = vocab_size
            # Also update the input dict, in case it is used later/elsewhere.
            dataset_params['vocab_size'] = self.vocab_size
        # Path map for data files; tfrecords entries are filled in by
        # convert_to_tf_records below.
        self.paths = dict()
        self.paths = {
            **id_paths,
            'vocab': vocab_path,
            'train_tfrecords': None,
            'valid_tfrecords': None}
        self._word_to_idx, self._idx_to_word = io_utils.get_vocab_dicts(
            vocab_path)
        # Create tfrecords file if not located in data_dir.
        self.convert_to_tf_records('train')
        self.convert_to_tf_records('valid')
    def convert_to_tf_records(self, prefix='train'):
        """If can't find tfrecords 'prefix' files, creates them.

        Reads the paired token-id text files for `prefix`, wraps each
        (encoder, decoder) line pair in a tf.train.SequenceExample, and
        writes them to a single .tfrecords file whose name encodes the
        vocab size and max sequence length. Records the resulting path in
        self.paths[prefix + '_tfrecords'].

        Args:
            prefix: 'train' or 'valid'. Determines which tfrecords to build.
        """
        from_path = self.paths['from_'+prefix]
        to_path = self.paths['to_'+prefix]
        # File name encodes vocab size and max_seq_len so differently
        # configured runs don't collide on the same tfrecords file.
        tfrecords_fname = (prefix
                           + 'voc%d_seq%d' % (self.vocab_size, self.max_seq_len)
                           + '.tfrecords')
        output_path = os.path.join(self.data_dir, tfrecords_fname)
        # Already built on a previous run: just record the path and return.
        if os.path.isfile(output_path):
            self.log.info('Using tfrecords file %s' % output_path)
            self.paths[prefix + '_tfrecords'] = output_path
            return

        def get_sequence_example(encoder_line, decoder_line):
            # Returns a SequenceExample for the pair, or None if either
            # sentence exceeds max_seq_len (pair is dropped entirely).
            space_needed = max(len(encoder_line.split()), len(decoder_line.split()))
            if space_needed > self.max_seq_len:
                return None
            example = tf.train.SequenceExample()
            encoder_list = [int(x) for x in encoder_line.split()]
            # Decoder sequence is bracketed by GO and EOS sentinel tokens.
            decoder_list = [io_utils.GO_ID] \
                           + [int(x) for x in decoder_line.split()] \
                           + [io_utils.EOS_ID]
            # Why tensorflow . . . why . . .
            example.context.feature['encoder_sequence_length'].int64_list.value.append(
                len(encoder_list))
            example.context.feature['decoder_sequence_length'].int64_list.value.append(
                len(decoder_list))
            encoder_sequence = example.feature_lists.feature_list['encoder_sequence']
            decoder_sequence = example.feature_lists.feature_list['decoder_sequence']
            for e in encoder_list:
                encoder_sequence.feature.add().int64_list.value.append(e)
            for d in decoder_list:
                decoder_sequence.feature.add().int64_list.value.append(d)
            return example

        # Walk both text files in lockstep, one line pair at a time, and
        # serialize each accepted pair into the tfrecords file.
        with tf.gfile.GFile(from_path, mode="r") as encoder_file:
            with tf.gfile.GFile(to_path, mode="r") as decoder_file:
                with tf.python_io.TFRecordWriter(output_path) as writer:
                    encoder_line = encoder_file.readline()
                    decoder_line = decoder_file.readline()
                    while encoder_line and decoder_line:
                        sequence_example = get_sequence_example(
                            encoder_line,
                            decoder_line)
                        if sequence_example is not None:
                            writer.write(sequence_example.SerializeToString())
                        encoder_line = encoder_file.readline()
                        decoder_line = decoder_file.readline()
        self.log.info("Converted text files %s and %s into tfrecords file %s" \
                      % (os.path.basename(from_path),
                         os.path.basename(to_path),
                         os.path.basename(output_path)))
        self.paths[prefix + '_tfrecords'] = output_path
def sentence_generator(self, prefix='from'):
"""Yields (as words) single sentences from training data,
for testing purposes.
"""
self.log.info("Generating sentences from %s", self.paths[prefix+'_train'])
with tf.gfile.GFile(self.paths[prefix+'_train'], mode="r") as f:
sentence = self.as_words(
list(map(int, f.readline().strip().lower().split())))
while sentence:
yield sentence
sentence = self.as_words(
list(map(int, f.readline().strip().lower().split())))
def pairs_generator(self, num_generate=None):
in_sentences = self.sentence_generator('from')
in_sentences = [s for s in in_sentences]
out_sentences = self.sentence_generator('to')
out_sentences = [s for s in out_sentences]
if num_generate is None:
num_generate = len(in_sentences)
count = 0
for in_sent, out_sent in zip(in_sentences, out_sentences):
yield in_sent, out_sent
count += 1
if count >= num_generate:
break
def train_generator(self, batch_size):
"""[Note: not needed by DynamicBot since InputPipeline]"""
return self._generator(
self.paths['from_train'],
self.paths['to_train'],
batch_size)
def valid_generator(self, batch_size):
"""[Note: not needed by DynamicBot since InputPipeline]"""
return self._generator(
self.paths['from_valid'],
self.paths['to_valid'],
batch_size)
    def _generator(self, from_path, to_path, batch_size):
        """(Used by BucketModels only). Generator that reads token-ID files
        line-by-line and yields padded batches of (encoder, decoder) arrays.

        Args:
            from_path: full path to file for encoder inputs.
            to_path: full path to file for decoder inputs.
            batch_size: number of samples to yield at once.

        Yields:
            Tuples (encoder_batch, decoder_batch) of int numpy arrays with
            shape [<=batch_size, max_sent_len_in_batch].
        """

        def longest_sentence(enc_list, dec_list):
            # Token count of the longest sentence across both lists.
            max_enc_len = max([len(s) for s in enc_list])
            max_dec_len = max([len(s) for s in dec_list])
            return max(max_enc_len, max_dec_len)

        def padded_batch(encoder_tokens, decoder_tokens):
            # Pad every sentence with PAD_ID to the batch max length.
            # Encoder inputs are reversed via [:, ::-1].
            max_sent_len = longest_sentence(encoder_tokens, decoder_tokens)
            encoder_batch = np.array(
                [s + [io_utils.PAD_ID] * (max_sent_len - len(s))
                 for s in encoder_tokens])[:, ::-1]
            decoder_batch = np.array(
                [s + [io_utils.PAD_ID] * (max_sent_len - len(s))
                 for s in decoder_tokens])
            return encoder_batch, decoder_batch

        encoder_tokens = []
        decoder_tokens = []
        with tf.gfile.GFile(from_path, mode="r") as source_file:
            with tf.gfile.GFile(to_path, mode="r") as target_file:
                source, target = source_file.readline(), target_file.readline()
                while source and target:
                    # Skip sentence pairs that are too long for specifications.
                    space_needed = max(len(source.split()), len(target.split()))
                    if space_needed > self.max_seq_len:
                        source, target = source_file.readline(), target_file.readline()
                        continue
                    # Reformat token strings to token lists.
                    # Note: GO_ID is prepended by the chat bot, since it
                    # determines whether or not it's responsible for responding.
                    encoder_tokens.append([int(x) for x in source.split()])
                    decoder_tokens.append(
                        [int(x) for x in target.split()] + [io_utils.EOS_ID])
                    # Have we collected batch_size number of sentences?
                    # If so, pad & yield.
                    assert len(encoder_tokens) == len(decoder_tokens)
                    if len(encoder_tokens) == batch_size:
                        yield padded_batch(encoder_tokens, decoder_tokens)
                        encoder_tokens = []
                        decoder_tokens = []
                    source, target = source_file.readline(), target_file.readline()
        # Don't forget to yield the 'leftovers'!
        assert len(encoder_tokens) == len(decoder_tokens)
        assert len(encoder_tokens) <= batch_size
        if len(encoder_tokens) > 0:
            yield padded_batch(encoder_tokens, decoder_tokens)
    @property
    def word_to_idx(self):
        """Return dictionary map from str -> int (word to vocab ID)."""
        return self._word_to_idx
    @property
    def idx_to_word(self):
        """Return dictionary map from int -> str (vocab ID to word)."""
        return self._idx_to_word
def as_words(self, sentence):
"""Convert list of integer tokens to a single sentence string."""
words = []
for token in sentence:
word = self.idx_to_word[token]
try:
word = tf.compat.as_str(word)
except UnicodeDecodeError:
logging.error("UnicodeDecodeError on (token, word): "
"(%r, %r)", token, word)
word = str(word)
words.append(word)
words = " ".join(words)
#words = " ".join([tf.compat.as_str(self.idx_to_word[i]) for i in sentence])
words = words.replace(' , ', ', ').replace(' .', '.').replace(' !', '!')
words = words.replace(" ' ", "'").replace(" ?", "?")
if len(words) < 2:
return words
return words[0].upper() + words[1:]
    @property
    def name(self):
        """Returns name of the dataset as a string (set by subclasses)."""
        return self._name
@property
def train_size(self):
raise NotImplemented
@property
def valid_size(self):
raise NotImplemented
    @property
    def max_seq_len(self):
        """Maximum allowed sentence length (in tokens) for this dataset."""
        return self._max_seq_len
    @staticmethod
    def fill_params(dataset_params):
        """Assigns default values from DEFAULT_PARAMS
        for keys not in dataset_params.

        Args:
            dataset_params: dict of user-supplied configuration values;
                must contain a 'data_dir' key.

        Returns:
            New dict: DEFAULT_PARAMS with dataset_params merged on top.

        Raises:
            ValueError: if 'data_dir' is missing from dataset_params.
        """
        if 'data_dir' not in dataset_params:
            raise ValueError('data directory not found in dataset_params.')
        return {**DEFAULT_PARAMS, **dataset_params}
def __getattr__(self, name):
if name not in self.__dict__['__params']:
raise AttributeError(name)
else:
return self.__dict__['__params'][name]
================================================
FILE: data/data_helper.py
================================================
"""Provides pre-processing functionality.
Abstracts paths and filenames so we don't have to think about them. Currently,
in use by Brandon, but will extend to general users in the future.
"""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import os
import re
import pdb
import sys
import json
import logging
import tempfile
from pprint import pprint
from subprocess import Popen, PIPE
import pandas as pd
import numpy as np
from pympler.asizeof import asizeof # for profiling memory usage
# Absolute path to this file.
_WORD_SPLIT = re.compile(r'([.,!?\"\':;)(])|\s')
HERE = os.path.dirname(os.path.realpath(__file__))
DATA_ROOTS = {
'brandon': '/home/brandon/Datasets/reddit',
'ivan': '/Users/ivan/Documents/sp_17/reddit_data',
'mitch': '/Users/Mitchell/Documents/Chatbot/RedditData',
'george': '/Users/George/Documents/ChatbotData/reddit'
}
# Maximum memory usage allowed (in GiB).
MAX_MEM = 2.0
def prompt(text, default="", required=False):
    """Ask the user for a value on stdin.

    Args:
        text: prompt message shown to the user.
        default: value returned when the user just presses ENTER.
        required: when True, keep re-prompting until input is non-empty.

    Returns:
        The user's input, or `default` when the input is empty.
    """
    print("%s (default=%r): " % (text, default), end="")
    response = input()
    attempts = 0
    while required and not response:
        attempts += 1
        # Escalating punctuation for repeat offenders.
        suffix = ':' if attempts <= 1 else ('!' * attempts)
        response = input("C'mon dude, be serious%s " % suffix)
    return response or default
class DataHelper:
    """Manages file locations and computing resource during preprocessing.
    This interacts directly with the user and double-checks their work; It makes it
    harder for you to screw up.
    """

    def __init__(self, log_level=logging.INFO):
        """ Establish some baseline data with the user.

        Interactively asks for a username (to pick a data root), the years to
        process, and a memory cap, then collects all raw data file paths.
        """
        # Log to a unique temp file so concurrent/repeated runs don't collide.
        self.logfile = tempfile.NamedTemporaryFile(
            mode='w', prefix='data_helper', delete=False)
        self.logfile.close()
        logging.basicConfig(filename=self.logfile.name, level=log_level)
        print("Using logfile:", self.logfile.name)
        self.file_counter = 0   # current file we're processing
        self._word_freq = None  # temporary: for parallelizing frequency dict
        print("Hi, I'm a DataHelper. For now, I help with the reddit dataset.")
        print("At any prompt, press ENTER if you want the default value.")
        # 1. Get user name. We can associate info with a given user as we go.
        user = prompt("Username", default="brandon").lower()
        if user not in DATA_ROOTS:
            print("I don't recognize you, %s." % user)
            self.data_root = prompt("Please give me the path to your data:",
                                    required=True)
        else:
            self.data_root = DATA_ROOTS[user]
        print("Hello, %s, I've set your data root to %s" % (user, self.data_root))
        # 2. Get absolute paths to all data filenames in self.file_paths.
        self.file_paths = []
        years = prompt("Years to process", default="2007,2008,2009")
        # Secretly supports passing a range too. Shhhh.
        if '-' in years:
            years = list(map(int, years.split('-')))
            years = list(range(years[0], years[1]+1))
            years = list(map(str, years))
        else:
            years = years.split(',')
        for y in years:
            # The path is: $ROOT/raw_data/$YEAR
            # Add the entirety of the directory to the file paths.
            base_path = os.path.join(self.data_root, 'raw_data', y)
            rel_paths = os.listdir(base_path)
            self.file_paths.extend([
                os.path.join(base_path, f) for f in rel_paths \
                if not f.endswith(".bz2")
            ])
        self._next_file_path = self.file_paths[0]
        print("These are the files I found:")
        pprint(self.file_paths)
        print()
        _max_mem = prompt("Maximum memory to use (in GiB)", "%.2f" % MAX_MEM)
        try:
            self.max_mem = float(_max_mem)
        except ValueError:
            # NOTE(review): on unparseable input self.max_mem is never set, so
            # a later safe_load() will raise AttributeError — confirm intended.
            print("C'mon dude, get it together!")

    def safe_load(self):
        """ Load data while keeping an eye on memory usage.

        Reads json files starting at self.file_counter, stopping early when
        the in-memory list exceeds self.max_mem GiB.

        Returns:
            Concatenated pandas DataFrame, or None when no files remain.
        """
        if self.file_counter >= len(self.file_paths):
            print("No more files to load!")
            return None
        # For in-place appending.
        # S.O.: https://stackoverflow.com/questions/20906474/
        list_ = []  # real descriptive :)
        for i in range(self.file_counter, len(self.file_paths)):
            # lines=True means "read as json-object-per-line."
            list_.append(pd.read_json(self.file_paths[i], lines=True))
            mem_usage = float(asizeof(list_)) / 1e9
            logging.info("Data list has size %.3f GiB", mem_usage)
            logging.info("Most recent file loaded: %s", self.file_paths[i])
            print("\rLoaded file", self.file_paths[i], end="")
            sys.stdout.flush()
            if mem_usage > self.max_mem:
                print("\nPast max capacity:", mem_usage,
                      "Leaving data collection early.")
                logging.warning('Terminated data loading after '
                                'reading %d files.', i + 1)
                logging.info('Files read into df: %r', self.file_paths[:i+1])
                break
        print()
        # If the user decides they want to continue loading later
        # (when memory frees up), we want the file_counter set so that it
        # starts on the next file.
        self.file_counter = i + 1
        # NOTE(review): when every remaining file was loaded, file_counter ==
        # len(file_paths) and this indexing raises IndexError — confirm.
        self._next_file_path = self.file_paths[self.file_counter]
        df = pd.concat(list_).reset_index()
        logging.info("Number of lines in raw data file: %r", len(df.index))
        logging.info("Column names from raw data file: %r", df.columns)
        logging.info("DataHelper.safe_load: df.head() = %r", df.head())
        return df

    def load_random(self, year=None):
        """Load a random data file and return as a DataFrame.

        Args:
            year: (int) If given, get a random file from this year.
        """
        files = self.file_paths
        if year is not None:
            files = list(filter(lambda f: str(year) in f, files))
        rand_index = np.random.randint(low=0, high=len(files))
        print('Returning data from file:\n', files[rand_index])
        return pd.read_json(files[rand_index], lines=True)

    def load_next(self):
        """Load the next unread data file as a DataFrame, advancing the
        internal counter. Returns None when all files have been read."""
        if self.next_file_path is None:
            logging.warning('Tried loading next file but no files remain.')
            return None
        df = pd.read_json(self.next_file_path, lines=True)
        self.file_counter += 1
        if self.file_counter < len(self.file_paths):
            self._next_file_path = self.file_paths[self.file_counter]
        else:
            self._next_file_path = None
        return df

    def set_word_freq(self, wf):
        """Hacky (temporary) fix related to multiprocessing.Pool complaints
        for the reddit preprocessing script.
        """
        self._word_freq = wf

    @property
    def word_freq(self):
        # Word-frequency mapping, set externally via set_word_freq.
        return self._word_freq

    @property
    def next_file_path(self):
        # Path of the file the next load_next() call will read (None if done).
        return self._next_file_path

    def get_year_from_path(self, path):
        """Extract the year from a path shaped like .../raw_data/<year>/<file>.

        Returns:
            The year as a string, or None when the second-to-last path
            component is not an integer.
        """
        year = path.strip('/').split('/')[-2]
        try:
            _ = int(year)
        except ValueError:
            logging.warning("Couldn't get year from file path. Your directory"
                            " structure is unexpected.")
            return None
        logging.info('Extracted year %s', year)
        return year

    def generate_files(self,
                       from_file_path,
                       to_file_path,
                       root_to_children,
                       comments_dict):
        """Generates two files, [from_file_path] and [to_file_path]
        of 1-1 comments.

        Each (parent, child) comment pair is written line-aligned across the
        two files; pairs with missing comment bodies are silently skipped.
        """
        from_file_path = os.path.join(self.data_root, from_file_path)
        to_file_path = os.path.join(self.data_root, to_file_path)
        print("Writing data files:\n", from_file_path, "\n", to_file_path)
        with open(from_file_path, 'w') as from_file:
            with open(to_file_path, 'w') as to_file:
                for root_ID, child_IDs in root_to_children.items():
                    for child_ID in child_IDs:
                        try:
                            from_file.write(comments_dict[root_ID].strip() + '\n')
                            to_file.write(comments_dict[child_ID].strip() + '\n')
                        except KeyError:
                            # Comment body not in dict (e.g. filtered out).
                            pass
        # Count output lines via `wc -l` (stdout comes back as bytes).
        (num_samples, stderr) = Popen(
            ['wc', '-l', from_file_path], stdout=PIPE).communicate()
        num_samples = int(num_samples.strip().split()[0])
        print("Final processed file has %d samples total." % num_samples)
        # First make sure user has copy of bash script we're about to use.
        # os.popen('cp %s %s' % (os.path.join(HERE, 'split_into_n.sh'), self.data_root))
        # Split data into 90% training and 10% validation.
        # os.popen('bash %s %d' % (os.path.join(self.data_root, 'split_into_n.sh'),
        #                          0.1 * num_samples))

    def df_generator(self):
        """ Generates df from single files at a time."""
        for i in range(len(self.file_paths)):
            df = pd.read_json(self.file_paths[i], lines=True)
            init_num_rows = len(df.index)
            logging.info("Number of lines in raw data file: %r" % init_num_rows)
            logging.info("Column names from raw data file: %r" % df.columns)
            yield df

    @staticmethod
    def random_rows_generator(num_rows_per_print, num_rows_total):
        """ Fun generator for viewing random comments (rows) in dataframes.

        Yields arrays of `num_rows_per_print` shuffled row indices.
        """
        num_iterations = num_rows_total // num_rows_per_print
        shuffled_indices = np.arange(num_rows_per_print * num_iterations)
        np.random.shuffle(shuffled_indices)
        for batch in shuffled_indices.reshape(num_iterations, num_rows_per_print):
            yield batch

    @staticmethod
    def word_tokenizer(sentences):
        """ Tokenizes sentence / list of sentences into word tokens."""
        # Minor optimization: pre-create the list and fill it.
        tokenized = [None for _ in range(len(sentences))]
        for i in range(len(sentences)):
            tokenized[i] = [
                w for w in _WORD_SPLIT.split(sentences[i].strip()) if w
            ]
        return tokenized

    @staticmethod
    def df_to_json(df, target_file=None, orient='records', lines=False, **kwargs):
        """Converts dataframe to json object in the intuitive way, i.e.
        each row is converted to a json object, where columns are properties. If
        target_file is not None, then each such object is saved as a line in the
        target_file. Helpful because pandas default args are NOT this behavior.

        Note: Setting lines=True can result in some problems when trying to reload
        the file. Setting lines=False, while makes an essentially unreadable (for humans)
        output file, it at least reproduces the saved dataframe upon loading via
        df_reloaded = pd.read_json(target_file)

        Args:
            df: Pandas DataFrame.
            orient:
            lines: whether or not to save rows on their own line or writing full file to
                single line.
            target_file: Where to save the json-converted df.
                If None, just return the json object.
            kwargs: any additional named params the user wishes to pass to df.to_json.
        """
        if target_file is None:
            return df.to_json(orient=orient, lines=lines, **kwargs)
        df.to_json(path_or_buf=target_file, orient=orient, lines=lines, **kwargs)
================================================
FILE: data/dataset_wrappers.py
================================================
"""Named data wrapper classes. No added functionality to dataset base class for now,
but preprocessing checks will be incorporated into each when it's time.
"""
import logging
import os
from data._dataset import Dataset
def check_data(abs_path, name):
    """All dataset wrappers call this as a quick sanity check.

    Verifies that the final component of `abs_path` matches the dataset
    `name`; otherwise interactively offers a corrected path.

    Returns:
        The (possibly corrected) data directory path.

    Raises:
        ValueError: if abs_path is None, or the user rejects the proposal.
    """
    if abs_path is None:
        raise ValueError('No data directory found in dataset_wrappers.check_data.'
                         'Either specify data_dir or use io_utils.parse_config.')
    if os.path.basename(abs_path) == name:
        return abs_path
    print("Data directory %s does not match dataset name %s." % (abs_path, name))
    propose_path = os.path.join(os.path.dirname(abs_path), name.lower())
    print("Would you like me to change data_dir to {}? [y/n] ".format(propose_path))
    if input() == 'y':
        return propose_path
    raise ValueError("Rejected path change. Terminating program.")
class Cornell(Dataset):
    """Movie dialogs."""

    def __init__(self, dataset_params):
        """Validate data_dir, then defer setup to the Dataset base class."""
        self._name = "cornell"
        self.log = logging.getLogger('CornellLogger')
        # check_data may interactively correct a mismatched data_dir.
        dataset_params['data_dir'] = check_data(
            dataset_params.get('data_dir'),
            self.name)
        super(Cornell, self).__init__(dataset_params)
class Ubuntu(Dataset):
    """Technical support chat logs from IRC."""

    def __init__(self, dataset_params):
        """Validate data_dir, then defer setup to the Dataset base class."""
        self._name = "ubuntu"
        self.log = logging.getLogger('UbuntuLogger')
        # check_data may interactively correct a mismatched data_dir.
        dataset_params['data_dir'] = check_data(
            dataset_params.get('data_dir'),
            self.name)
        super(Ubuntu, self).__init__(dataset_params)
class Reddit(Dataset):
    """Reddit comments from 2007-2015."""

    def __init__(self, dataset_params):
        """Validate data_dir, then defer setup to the Dataset base class."""
        self._name = "reddit"
        self.log = logging.getLogger('RedditLogger')
        # check_data may interactively correct a mismatched data_dir.
        dataset_params['data_dir'] = check_data(
            dataset_params.get('data_dir'),
            self.name)
        super(Reddit, self).__init__(dataset_params)
class TestData(Dataset):
    """Mock dataset with a handful of sentences."""

    def __init__(self, dataset_params):
        """Validate data_dir, then defer setup to the Dataset base class."""
        self.log = logging.getLogger('TestDataLogger')
        self._name = "test_data"
        # check_data may interactively correct a mismatched data_dir.
        dataset_params['data_dir'] = check_data(
            dataset_params.get('data_dir'),
            self.name)
        super(TestData, self).__init__(dataset_params)
================================================
FILE: data/reddit_preprocessor.py
================================================
"""Reddit data preprocessing."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import os
import time
from functools import wraps
from itertools import chain
from collections import Counter, defaultdict
from multiprocessing import Pool
import numpy as np
import pandas as pd
from data.data_helper import DataHelper
from data.regex import regex_replace, contractions
from nltk.corpus import wordnet
# Global helper object that helps abstract away locations of
# files & directories, and keeps an eye on memory usage.
if __name__ == '__main__':
data_helper = DataHelper()
else:
data_helper = None
# Max number of words in any saved sentence.
MAX_SEQ_LEN = 20
# Number of CPU cores available.
NUM_CORES = 2
# How many chunks we should split dataframes into at any given time.
NUM_PARTITIONS = 64
def timed_function(*expected_args):
    """Decorator factory: print how long the wrapped function takes to run.

    Args:
        expected_args: first element is the label printed with the timing.
    """
    def decorator(fn):
        @wraps(fn)
        def wrapper(*args, **kwargs):
            begin = time.time()
            result = fn(*args, **kwargs)
            elapsed = time.time() - begin
            print("Time to run %s: %.3f seconds." %
                  (expected_args[0], elapsed))
            return result
        return wrapper
    return decorator
@timed_function('parallel_map_df')
def parallel_map_df(fn, df):
    """ Based on great explanation from 'Pandas in Parallel' (racketracer.com).

    Splits df into NUM_PARTITIONS chunks, maps fn over them with a pool of
    NUM_CORES workers, and concatenates the results into one DataFrame.
    """
    chunks = np.array_split(df, NUM_PARTITIONS)
    pool = Pool(NUM_CORES)
    result = pd.concat(pool.map(fn, chunks))
    pool.close()
    pool.join()
    return result
@timed_function('parallel_map_list')
def parallel_map_list(fn, iterable):
    """ Based on great explanation from 'Pandas in Parallel' (racketracer.com).

    Splits `iterable` into NUM_PARTITIONS chunks, maps fn over them with a
    pool of NUM_CORES workers, and concatenates the mapped chunks.
    """
    chunks = np.array_split(iterable, NUM_PARTITIONS)
    pool = Pool(NUM_CORES)
    result = np.concatenate(pool.map(fn, chunks))
    pool.close()
    pool.join()
    return result
def sentence_score(sentences):
    """Score tokenized sentences by rarity of their non-dictionary words.

    Words with no wordnet synsets contribute 1 / (freq * length); downstream
    code keeps the lowest-scored sentences (low == good).
    """
    word_freq = data_helper.word_freq
    scores = []
    for sent in sentences:
        norm = len(sent) + 1e-20
        scores.append(sum(
            1.0 / ((word_freq[w] + 1e-20) * norm)
            for w in sent
            if not wordnet.synsets(w)))
    return scores
def root_comments(df):
    """ Builds a list determining which rows of df are root comments.

    A row is a root comment when its parent_id equals its link_id.

    Returns:
        list of bools, one per row of the data frame.
    """
    # itertuples yields namedtuples with column values as attributes.
    return [row.parent_id == row.link_id for row in df.itertuples()]
@timed_function('remove_extra_columns')
def remove_extra_columns(df):
    """Throw away columns we don't need and misc. style formatting."""
    # Tag each row with whether it is a root comment (see root_comments).
    df['root'] = root_comments(df)
    keep = ['author', 'body', 'link_id', 'parent_id', 'name', 'root', 'subreddit']
    df = df[keep]
    df.style.set_properties(subset=['body'], **{'width': '500px'})
    df.style.set_properties(**{'text-align': 'left'})
    df.head()
    return df
@timed_function('regex_replacements')
def regex_replacements(df):
    """Apply the cleanup regexes from data.regex to the 'body' column."""
    # Remove comments that are '[deleted]'.
    df = df.loc[df.body != '[deleted]'].reset_index(drop=True)
    df.style.set_properties(subset=['body'], **{'width': '800px'})
    # Make all comments lowercase to help reduce vocab size.
    df['body'] = df['body'].map(lambda s: s.strip().lower())
    # Run every (pattern -> replacement) rule in-place over the bodies.
    for pattern, replacement in regex_replace.items():
        df['body'].replace(
            {pattern: replacement},
            regex=True,
            inplace=True)
    return df
@timed_function('remove_large_comments')
def remove_large_comments(max_len, df):
    """Keep only rows whose body has fewer than max_len whitespace tokens."""
    # Could probably do a regex find on spaces to make this faster.
    word_counts = df['body'].map(lambda s: len(s.split()))
    return df[word_counts < max_len].reset_index(drop=True)
@timed_function('expand_contractions')
def expand_contractions(df):
    """ Replace all contractions with their expanded chat_form.
    Note: contractions is dict(contraction -> expanded form)
    """
    for pattern, expansion in contractions.items():
        df['body'].replace({pattern: expansion}, regex=True, inplace=True)
    return df
@timed_function('children_dict')
def children_dict(df):
    """Map each parent comment id to the list of its direct children.

    Root comments are skipped: their parent_id refers to a link, not to a
    comment, so it can never match another comment. Assumes df has a boolean
    'root' column (see remove_extra_columns).

    Returns:
        defaultdict(list): parent_id -> [child 'name' values].
    """
    children = defaultdict(list)
    for row in df.itertuples():
        if not row.root:
            children[row.parent_id].append(row.name)
    return children
def main():
    """Processes each file individually through the whole pipeline.
    This decision was made due to the very large (many larger than 5 Gb) files.
    I have scripts that combine the output files, and I plan on making those
    available soon (very basic).
    """
    current_file = data_helper.next_file_path
    df = data_helper.load_next()
    while df is not None:
        # Execute preprocessing steps on current_file's dataframe.
        df = remove_extra_columns(df)
        df = regex_replacements(df)
        df = remove_large_comments(max_len=MAX_SEQ_LEN, df=df)
        df = expand_contractions(df)
        # Tokenize bodies in parallel and build the word-frequency Counter
        # that sentence_score reads via data_helper.word_freq.
        sentences = parallel_map_list(fn=DataHelper.word_tokenizer, iterable=df.body.values)
        data_helper.set_word_freq(Counter(chain.from_iterable(sentences)))
        print('Bout to score!')
        df['score'] = parallel_map_list(fn=sentence_score, iterable=sentences)
        del sentences
        # Keep the desired percentage of lowest-scored sentences. (low == good)
        keep_best_percent = 0.8
        df = df.loc[df['score'] < df['score'].quantile(keep_best_percent)]
        print('Prepping for the grand finale.')
        # Lookup table: comment 'name' -> comment body, for generate_files.
        comments_dict = pd.Series(df.body.values, index=df.name).to_dict()
        root_to_children = children_dict(df)
        # Output files live under processed_data/<year>/<original basename>.
        file_basename = os.path.join('processed_data',
                                     data_helper.get_year_from_path(current_file),
                                     os.path.basename(current_file))
        data_helper.generate_files(
            from_file_path="{}_encoder.txt".format(file_basename),
            to_file_path="{}_decoder.txt".format(file_basename),
            root_to_children=root_to_children,
            comments_dict=comments_dict)
        # Prep for next loop.
        current_file = data_helper.next_file_path
        df = data_helper.load_next()


if __name__ == '__main__':
    main()
================================================
FILE: data/regex.py
================================================
# Regex pattern -> replacement, applied to comment bodies during
# preprocessing (consumed with pandas .replace(..., regex=True)).
regex_replace = {
    (r"https?:\/\/"
     r"(www\.)?"
     r"[^\s\.]+"
     r"\.\S{2,}"): "<link>",  # Raw link.
    r"\[[^\(\)]*\]\(.*\)": "<link>",  # Markdown link.
    r"\r?\n": " ",  # Newlines.
    r"\d+": "<number>",  # Digit runs -> placeholder token.
    r"\.{2,}": ".",  # Collapse ellipses to a single period.
    # NOTE(review): the trailing empty alternative in (>|*|) also matches the
    # empty string at every position; the replacement is "" so the result is
    # unchanged, but r"[>*]" was likely intended — confirm.
    r"(>|\*|)": "",
    r"[_-]+": " "  # Underscore/hyphen runs become spaces.
}
# Contraction -> expanded form. Applied as regex replacements over comment
# bodies (see expand_contractions in data/reddit_preprocessor.py).
contractions = {
    "sha'n't": "shall not",
    "I've": "I have",
    "who's": "who has",
    "you're": "you are",
    "can't've": "cannot have",
    "could've": "could have",
    "shan't": "shall not",
    "he'd've": "he would have",
    "hadn't've": "had not have",
    "couldn't've": "could not have",
    "y'all've": "you all have",
    "when've": "when have",
    "that'd've": "that would have",
    "it'll": "it shall",
    "oughtn't've": "ought not have",
    "you'll": "you shall",
    "shouldn't've": "should not have",
    "shouldn't": "should not",
    "we've": "we have",
    "who've": "who have",
    "why've": "why have",
    "needn't've": "need not have",
    "ma'am": "madam",
    "oughtn't": "ought not",
    "mustn't've": "must not have",
    "they'd've": "they would have",
    "isn't": "is not",
    "y'all're": "you all are",
    "so's": "so as",
    "he'd": "he had",
    "doesn't": "does not",
    "he's": "he has",
    "I'm": "I am",
    "mightn't've": "might not have",
    "hadn't": "had not",
    "needn't": "need not",
    "don't": "do not",
    "he'll've": "he shall have",
    "we'll've": "we will have",
    "what'll": "what shall",
    "that's": "that has",
    "it'd": "it had",
    "how's": "how has",
    "you've": "you have",
    "wouldn't": "would not",
    "he'll": "he shall",
    "we'd": "we had",
    "I'll": "I shall",
    "when's": "when has",
    "we'll": "we will",
    "couldn't": "could not",
    "you'll've": "you shall have",
    "will've": "will have",
    "there'd've": "there would have",
    "they'd": "they had",
    "I'd": "I had", "y'all": "you all",
    "won't've": "will not have",
    "aren't": "are not",
    "haven't": "have not",
    "mustn't": "must not",
    "what've": "what have",
    "it's": "it has",
    "she'll": "she shall",
    "wasn't": "was not",
    "they're": "they are",
    "that'd": "that would",
    "how'd'y": "how do you",
    "what's": "what has",
    "there'd": "there had",
    "to've": "to have",
    "I'll've": "I shall have",
    "y'all'd": "you all would",
    "would've": "would have",
    "how'll": "how will",
    "she'd": "she had", "what're": "what are",
    "wouldn't've": "would not have",
    "might've": "might have", "mayn't": "may not", "o'clock": "of the clock",
    "'cause": "because",
    "mightn't": "might not",
    "didn't": "did not",
    "they'll": "they shall",
    "there's": "there has",
    "we'd've": "we would have", "hasn't": "has not",
    "let's": "let us", "she's": "she has",
    "who'll": "who shall",
    "shan't've": "shall not have",
    "won't": "will not",
    "where've": "where have",
    "it'll've": "it shall have", "where's": "where has",
    "you'd've": "you would have",
    "weren't": "were not",
    "who'll've": "who shall have",
    "why's": "why has",
    "how'd": "how did",
    "we're": "we are",
    "she'd've": "she would have",
    "ain't": "am not",
    "y'all'd've": "you all would have",
    "I'd've": "I would have", "they've": "they have",
    "must've": "must have",
    "what'll've": "what shall have", "she'll've": "she shall have",
    "where'd": "where did",
    "should've": "should have",
    "you'd": "you had",
    "can't": "cannot",
    "it'd've": "it would have",
    "so've": "so have", "they'll've": "they shall have"}
================================================
FILE: main.py
================================================
#!/usr/bin/env python3
"""main.py: Train and/or chat with a bot. (work in progress).
Typical use cases:
1. Train a model specified by yaml config file, located at
path_to/my_config.yml, where paths are relative to project root:
./main.py --config path_to/my_config.yml
2. Train using mix of yaml config and cmd-line args, with
command-line args taking precedence over any values.
./main.py \
--config path_to/my_config.yml \
--model_params "{'batch_size': 32, 'optimizer': 'RMSProp'}"
3. Load a pretrained model that was saved in path_to/pretrained_dir,
which is assumed to be relative to the project root.
./main.py --pretrained_dir path_to/pretrained_dir
"""
from __future__ import print_function
from __future__ import absolute_import
from __future__ import division
import os
# Meaning of values:
# 1: INFO messages are not printed.
# 2: INFO, WARNING messages are not printed.
# I'm temporarily making the default '2' since the TF master
# branch (as of May 6) is spewing warnings that are clearly
# due to bugs on their side.
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'
import data
import chatbot
import logging
import tensorflow as tf
from pydoc import locate
from utils import io_utils
# =============================================================================
# FLAGS: Command line argument parser from TensorFlow.
# =============================================================================

flags = tf.app.flags

flags.DEFINE_string(
    flag_name="pretrained_dir",
    default_value=None,
    docstring="relative path to a pretrained model directory."
              "It is assumed that the model is one from this repository, and "
              " thus has certain files that are generated after any training"
              " session (TL;DR: any ckpt_dir you've trained previously).")

flags.DEFINE_string(
    flag_name="config",
    default_value=None,
    docstring="relative path to a valid yaml config file."
              " For example: configs/example_cornell.yml")

# Fix: `debug` is a boolean switch, but it was declared with DEFINE_string
# and a non-string default. With a string flag, passing --debug False makes
# FLAGS.debug the truthy string "False". DEFINE_boolean parses it correctly.
flags.DEFINE_boolean(
    flag_name="debug",
    default_value=False,
    docstring="If true, increases output verbosity (log levels).")

flags.DEFINE_string(
    flag_name="model",
    default_value="{}",
    docstring="Options: chatbot.{DynamicBot,Simplebot,ChatBot}.")

flags.DEFINE_string(
    flag_name="model_params",
    default_value="{}",
    docstring="Configuration dictionary, with supported keys specified by"
              " those in chatbot.globals.py.")

flags.DEFINE_string(
    flag_name="dataset",
    default_value="{}",
    docstring="Name (capitalized) of dataset to use."
              " Options: [data.]{Cornell,Ubuntu,Reddit}."
              " - Legend: [optional] {Pick,One,Of,These}.")

flags.DEFINE_string(
    flag_name="dataset_params",
    default_value="{}",
    docstring="Configuration dictionary, with supported keys specified by"
              " those in chatbot.globals.py.")

FLAGS = flags.FLAGS
def start_training(dataset, bot):
    """Kick off a training session for `bot` on `dataset`.

    Will expand this function later to aid interactivity/updates.
    """
    print("Training bot. CTRL-C to stop training.")
    bot.train(dataset)
def start_chatting(bot):
    """Talk to bot.

    Will re-add teacher mode soon. Old implementation in _decode.py."""
    print("Initiating chat session.")
    print("Your bot has a temperature of %.2f." % bot.temperature, end=" ")
    # Pick a quip matching how "creative" (high-temperature) the bot is.
    if bot.temperature < 0.1:
        quip = "Not very adventurous, are we?"
    elif bot.temperature < 0.7:
        quip = "This should be interesting . . . "
    else:
        quip = "Enjoy your gibberish!"
    print(quip)
    bot.chat()
def main(argv):
    """Entry point: build dataset and bot from FLAGS, then train or chat."""
    if FLAGS.debug:
        # Setting to '0': all tensorflow messages are logged.
        os.environ['TF_CPP_MIN_LOG_LEVEL'] = '0'
        logging.basicConfig(level=logging.INFO)

    # Extract the merged configs/dictionaries.
    config = io_utils.parse_config(flags=FLAGS)
    model_params = config['model_params']

    # Chatting with a freshly reset model makes no sense; favor chatting.
    if model_params['decode'] and model_params['reset_model']:
        print("Woops! You passed {decode: True, reset_model: True}."
              " You can't chat with a reset bot! I'll set reset to False.")
        model_params['reset_model'] = False

    # If loading from pretrained, double-check that certain values are correct.
    # (This is not something a user need worry about -- done automatically)
    if FLAGS.pretrained_dir is not None:
        assert model_params['decode'] and not model_params['reset_model']

    # Print out any non-default parameters given by user, so as to reassure
    # them that everything is set up properly.
    io_utils.print_non_defaults(config)

    print("Setting up %s dataset." % config['dataset'])
    dataset_class = locate(config['dataset']) or getattr(data, config['dataset'])
    dataset = dataset_class(config['dataset_params'])

    print("Creating", config['model'], ". . . ")
    bot_class = locate(config['model']) or getattr(chatbot, config['model'])
    bot = bot_class(dataset, config)

    if model_params['decode']:
        start_chatting(bot)
    else:
        start_training(dataset, bot)
if __name__ == "__main__":
    # Keep TensorFlow's own log output quiet by default; main() raises
    # verbosity (via TF_CPP_MIN_LOG_LEVEL / logging.basicConfig) when
    # --debug is passed.
    tf.logging.set_verbosity('ERROR')
    # tf.app.run() parses FLAGS from sys.argv and then calls main(argv).
    tf.app.run()
================================================
FILE: notebooks/Analysis.ipynb
================================================
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Analysis, Goals, and Predictions\n",
"\n",
"Here, I'd like to go over the loss functions being used to train the models and what we are aiming for, so that we can better understand the models' performance as training progresses. My approach will follow that of Ng (2015), as outlined in the textbook __\"Deep Learning\" by Goodfellow et al.__:\n",
"\n",
"* Determine your goals -- error metric(s) and target (re: desired) value(s). \n",
"* Establish a working end-to-end pipeline. \n",
"* Determine bottlenecks in performance, their sources, and whether they're due to overfitting/underfitting/software defect(s). \n",
"* Repeatedly make incremental changes such as gathering new data, adjusting hyperparams, or changing algorithms. "
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Selecting Hyperparameters"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### Manual HyperParameter Tuning\n",
"\n",
"Here are I'll just bullet the main ideas:\n",
"* The learning rate is perhaps the most important hyperparameter. The training error increases appx exponentially as the learning rate decreases below its optimal value. Above the optimal value, the training error basically shoots off to infinity (vertical wall). \n",
"* Next, the best performance usually comes from a large model that is regularized well, for example, by using dropout. \n",
"* Table showing typical hyperparameter relationships with model capacity. Remember that you can basically brute force your way to good performance by jacking up the model capacity and training set size. \n",
"\n",
"| Hyperparameter | Increases capacity when... | \n",
"| -------------- | -------------------------- |\n",
"| Num hidden units | increased | \n",
"| Learning rate | tuned optimally |\n",
"| Convolution kernel width | increased | \n",
"| Implicit zero padding | increased | \n",
"| Weight decay coefficient | decreased | \n",
"| Dropout rate | decreased | \n"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### Automatic HyperParameter Optimization\n",
"\n",
"__Grid Search__: This is what I'm doing right now. User selects a small finite set of values to explore. Grid search trains a model for every joint specification of hyperparameter values in the Cartesian product of possible values. The experiment with the best _validation error_ is chosen as the best. \n",
"\n",
"__Random Search (Better)__: \n",
"1. Define a marginal distribution for each hyperparameter, e.g. multinoulli for discrete hparams or uniform (log-scale) for positive real-valued hparams. For example, if we were interested in the range $[10^{-5}, 0.1]$ for the learning rate:\n",
"$$\n",
"\\begin{align}\n",
"\\texttt{logLearningRate} &\\sim Unif[-1, -5] \\\\\n",
"\\texttt{learningRate} &= 10^{logLearningRate}\n",
"\\end{align}\n",
"$$"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Debugging Strategies"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Determining whether or not a machine learning model is broken is hard. Here are some debugging tips:\n",
"* __Visualize the model in action__: Not just the quantitative stuff. How do the filters look? How is the chatbot responding?\n",
"* __Visualize the worst mistakes__: For example, our chatbot models output probabilities for the word tokens, and we either sample or argmax. One way to get an idea of what sentences our model does poorly on is to choose examples where the output probability max is *small*. In other words, if argmax(output) is much lower than usual, that says our model is rather unsure what is the best next word (think of the limiting case where it outputs 1/numOutputs for all possible tokens!). \n",
"* __Fit a tiny dataset__: Oooh, I like this one! Even small models can be guaranteed to be able to fit a sufficiently small dataset. Make sure you can write a program that can train on say, a handful of input-output sentences, and produce the output given any of the inputs with near perfect accuracy. \n",
"* __Monitor histograms of activations/gradients__: The preactivation can tell us if the units saturate, or how often they do. For tanh units, the average of the absolute value of the preactivations tells us how saturated the unit is. It is also useful to compare the parameter gradients with the parameters themselves. Ideally, we'd like the gradients over a minibatch to be about 1 percent of the magnitude of the parameter. "
]
},
{
"cell_type": "markdown",
"metadata": {
"collapsed": true
},
"source": [
"## Plotting the Hyperparameter-Search Results"
]
},
{
"cell_type": "code",
"execution_count": 25,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"import numpy as np\n",
"import pandas as pd\n",
"import matplotlib.pyplot as plt\n",
"import seaborn as sns\n",
"%matplotlib inline\n",
"plt.style.use('ggplot')\n",
"plt.rcParams['figure.figsize'] = 10, 8\n",
"BASE = '/home/brandon/Documents/seq2seq_projects/data/saved_train_data/'"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>embed_size</th>\n",
" <th>global_step</th>\n",
" <th>learning_rate</th>\n",
" <th>loss</th>\n",
" <th>state_size</th>\n",
" <th>vocab_size</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>56</td>\n",
" <td>2</td>\n",
" <td>0.189885</td>\n",
" <td>9.211146</td>\n",
" <td>380</td>\n",
" <td>10000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>56</td>\n",
" <td>203</td>\n",
" <td>0.189885</td>\n",
" <td>5.385020</td>\n",
" <td>380</td>\n",
" <td>10000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>56</td>\n",
" <td>404</td>\n",
" <td>0.189885</td>\n",
" <td>5.219425</td>\n",
" <td>380</td>\n",
" <td>10000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>56</td>\n",
" <td>605</td>\n",
" <td>0.189885</td>\n",
" <td>4.849638</td>\n",
" <td>380</td>\n",
" <td>10000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>56</td>\n",
" <td>806</td>\n",
" <td>0.189885</td>\n",
" <td>4.682628</td>\n",
" <td>380</td>\n",
" <td>10000</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" embed_size global_step learning_rate loss state_size vocab_size\n",
"0 56 2 0.189885 9.211146 380 10000\n",
"0 56 203 0.189885 5.385020 380 10000\n",
"0 56 404 0.189885 5.219425 380 10000\n",
"0 56 605 0.189885 4.849638 380 10000\n",
"0 56 806 0.189885 4.682628 380 10000"
]
},
"execution_count": 9,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"path = BASE + 'cornell_03_11.csv'\n",
"df = pd.read_csv(path, index_col=0)\n",
"df.head()"
]
},
{
"cell_type": "code",
"execution_count": 34,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"{32}\n",
"{128}\n",
"{0.5}\n"
]
}
],
"source": [
"embed_sizes = set(df['embed_size'])\n",
"state_sizes = set(df['state_size'])\n",
"learning_rates = set(df['learning_rate'])\n",
"print(embed_sizes)\n",
"print(state_sizes)\n",
"print(learning_rates)\n",
"\n",
"def get_split(df, col, vals):\n",
" return [(v, df[df[col]==v]) for v in vals]\n",
"\n",
"def split_df_and_plot(df, split_col, split_vals):\n",
" \"\"\"\n",
" Example usage:\n",
" split_df_and_plot(df, 'learning_rate', learning_rates)\n",
" \"\"\"\n",
" df_split = get_split(df, split_col, split_vals)\n",
" plt.figure(figsize=(8, 6))\n",
" for val, df_sp in df_split:\n",
" ax=plt.subplot()\n",
" plt.scatter(df_sp['global_step'], df_sp['loss'], label='%.3f' % val)\n",
"\n",
" plt.title(split_col + ' Comparisons', fontsize=20)\n",
" ax.set_xlabel('Global Step', fontsize=15)\n",
" ax.set_ylabel('Validation Loss', fontsize=15)\n",
" leg = ax.legend(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0., \n",
" title=split_col, prop={'size':15})\n",
" plt.setp(leg.get_title(),fontsize=20)\n",
" plt.tight_layout()\n",
" plt.savefig(split_col+'.pdf', bbox_extra_artists=(leg,), bbox_inches='tight')\n",
" plt.show()\n",
" \n",
"def inception_split(df, split_col_one, split_vals_one, split_col_two, split_vals_two):\n",
" \"\"\"ENHANCE\"\"\"\n",
" df_split_one = get_split(df, split_col_one, split_vals_one)\n",
" fig = plt.figure(figsize=(12, 10))\n",
" ctr = 1\n",
" for val_one, df_sp_one in df_split_one:\n",
" df_split_two = get_split(df_sp_one, split_col_two, split_vals_two)\n",
" ax=fig.add_subplot(3, 2, ctr)\n",
" for val_two, df_sp_two in df_split_two:\n",
" ax.scatter(df_sp_two['global_step'], df_sp_two['loss'], label=split_col_two + ': %.2f' % val_two)\n",
" ax.set_ylim([3., 10.])\n",
" plt.title(split_col_one + ' = %.2f' % val_one, fontsize=15)\n",
" ax.set_xlabel('Global Step', fontsize=12)\n",
" ax.set_ylabel('Validation Loss', fontsize=12)\n",
" if ctr in [2, 4, 6]:\n",
" leg = ax.legend(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0., \n",
" title=split_col_two, prop={'size':12})\n",
" plt.setp(leg.get_title(),fontsize=15)\n",
" ctr += 1\n",
" plt.tight_layout()\n",
" plt.savefig(split_col_one + \"_\" + split_col_two + '.pdf', bbox_extra_artists=(leg,), bbox_inches='tight')\n",
" plt.show()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### Single Plots Distinguishing One Variable"
]
},
{
"cell_type": "code",
"execution_count": 16,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"image/png": "iVBORw0KGgoAAAANSUhEUgAAAskAAAGoCAYAAAC0dXiPAAAABHNCSVQICAgIfAhkiAAAAAlwSFlz\nAAALEgAACxIB0t1+/AA
gitextract_wj3qb2ee/
├── .gitattributes
├── .gitignore
├── .travis.yml
├── LICENSE.md
├── README.md
├── chatbot/
│ ├── __init__.py
│ ├── _models.py
│ ├── components/
│ │ ├── __init__.py
│ │ ├── base/
│ │ │ ├── __init__.py
│ │ │ └── _rnn.py
│ │ ├── bot_ops.py
│ │ ├── decoders.py
│ │ ├── embedder.py
│ │ ├── encoders.py
│ │ └── input_pipeline.py
│ ├── dynamic_models.py
│ ├── globals.py
│ └── legacy/
│ ├── __init__.py
│ ├── _decode.py
│ ├── _train.py
│ └── legacy_models.py
├── configs/
│ ├── example_attention.yml
│ ├── example_cornell.yml
│ ├── example_reddit.yml
│ ├── example_ubuntu.yml
│ ├── ubuntu_basic.yml
│ └── website_config.yml
├── data/
│ ├── __init__.py
│ ├── _dataset.py
│ ├── data_helper.py
│ ├── dataset_wrappers.py
│ ├── reddit_preprocessor.py
│ └── regex.py
├── main.py
├── notebooks/
│ ├── Analysis.ipynb
│ ├── DataVizUtils.ipynb
│ ├── README.md
│ ├── RedditPipelineAndVisualization.ipynb
│ ├── TensorFlow Notes.ipynb
│ ├── __init__.py
│ └── ubuntu_reformat.ipynb
├── requirements.txt
├── setup.py
├── tests/
│ ├── __init__.py
│ ├── test_config.py
│ ├── test_config.yml
│ ├── test_data/
│ │ ├── train_from.txt
│ │ ├── train_from.txt.ids121
│ │ ├── train_to.txt
│ │ ├── train_to.txt.ids121
│ │ ├── trainvoc121_seq15.tfrecords
│ │ ├── trainvoc121_seq20.tfrecords
│ │ ├── valid_from.txt
│ │ ├── valid_from.txt.ids121
│ │ ├── valid_to.txt
│ │ ├── valid_to.txt.ids121
│ │ ├── validvoc121_seq15.tfrecords
│ │ ├── validvoc121_seq20.tfrecords
│ │ └── vocab121.txt
│ ├── test_data.py
│ ├── test_dynamic_models.py
│ ├── test_legacy_models.py
│ └── utils.py
├── utils/
│ ├── __init__.py
│ ├── bot_freezer.py
│ └── io_utils.py
└── webpage/
├── __init__.py
├── app.yaml
├── config.py
├── deepchat/
│ ├── __init__.py
│ ├── main/
│ │ ├── __init__.py
│ │ ├── errors.py
│ │ ├── forms.py
│ │ └── views.py
│ ├── models.py
│ ├── static/
│ │ ├── assets/
│ │ │ ├── plots/
│ │ │ │ ├── accuracy.json
│ │ │ │ ├── configs.json
│ │ │ │ ├── training.json
│ │ │ │ └── validation.json
│ │ │ └── test_data/
│ │ │ ├── train_from.txt
│ │ │ ├── train_to.txt
│ │ │ ├── valid_from.txt
│ │ │ ├── valid_to.txt
│ │ │ └── vocab121.txt
│ │ ├── css/
│ │ │ ├── style_modifications.css
│ │ │ └── theme.css
│ │ ├── js/
│ │ │ ├── bootstrapify.js
│ │ │ ├── chat_processing.js
│ │ │ ├── jqBootstrapValidation.js
│ │ │ └── user_form.js
│ │ └── vendor/
│ │ ├── bootstrap-3.3.7-dist/
│ │ │ ├── css/
│ │ │ │ ├── bootstrap-theme.css
│ │ │ │ └── bootstrap.css
│ │ │ └── js/
│ │ │ ├── bootstrap.js
│ │ │ └── npm.js
│ │ ├── font-awesome/
│ │ │ ├── css/
│ │ │ │ └── font-awesome.css
│ │ │ ├── fonts/
│ │ │ │ └── FontAwesome.otf
│ │ │ ├── less/
│ │ │ │ ├── animated.less
│ │ │ │ ├── bordered-pulled.less
│ │ │ │ ├── core.less
│ │ │ │ ├── fixed-width.less
│ │ │ │ ├── font-awesome.less
│ │ │ │ ├── icons.less
│ │ │ │ ├── larger.less
│ │ │ │ ├── list.less
│ │ │ │ ├── mixins.less
│ │ │ │ ├── path.less
│ │ │ │ ├── rotated-flipped.less
│ │ │ │ ├── screen-reader.less
│ │ │ │ ├── stacked.less
│ │ │ │ └── variables.less
│ │ │ └── scss/
│ │ │ ├── _animated.scss
│ │ │ ├── _bordered-pulled.scss
│ │ │ ├── _core.scss
│ │ │ ├── _fixed-width.scss
│ │ │ ├── _icons.scss
│ │ │ ├── _larger.scss
│ │ │ ├── _list.scss
│ │ │ ├── _mixins.scss
│ │ │ ├── _path.scss
│ │ │ ├── _rotated-flipped.scss
│ │ │ ├── _screen-reader.scss
│ │ │ ├── _stacked.scss
│ │ │ ├── _variables.scss
│ │ │ └── font-awesome.scss
│ │ └── jquery/
│ │ └── jquery.easing.1.3.js
│ ├── templates/
│ │ ├── 404.html
│ │ ├── about.html
│ │ ├── admin/
│ │ │ └── index.html
│ │ ├── base.html
│ │ ├── index.html
│ │ ├── macros/
│ │ │ └── forms.html
│ │ └── plots.html
│ └── web_bot.py
├── manage.py
├── migrations/
│ ├── README
│ ├── alembic.ini
│ ├── env.py
│ ├── script.py.mako
│ └── versions/
│ └── 236b966ecd2f_.py
├── requirements.txt
├── runtime.txt
└── tests/
├── __init__.py
├── test_database.py
└── test_simple.py
SYMBOL INDEX (335 symbols across 37 files)
FILE: chatbot/_models.py
function gpu_found (line 22) | def gpu_found():
class Model (line 28) | class Model(object):
method __init__ (line 32) | def __init__(self, logger, dataset, params):
method compile (line 77) | def compile(self):
method save (line 115) | def save(self, summaries=None):
method close (line 133) | def close(self, save_current=True):
method graph (line 150) | def graph(self):
method fill_params (line 154) | def fill_params(dataset, params):
method freeze (line 167) | def freeze(self):
method __getattr__ (line 193) | def __getattr__(self, name):
method _build_hparam_path (line 209) | def _build_hparam_path(ckpt_dir, **kwargs):
class BucketModel (line 233) | class BucketModel(Model):
method __init__ (line 240) | def __init__(self, logger, buckets, dataset, params):
method compile (line 247) | def compile(self):
method check_input_lengths (line 271) | def check_input_lengths(self, inputs, expected_lengths):
method get_batch (line 282) | def get_batch(self, data, bucket_id):
method train (line 333) | def train(self, dataset):
method decode (line 338) | def decode(self):
method step (line 343) | def step(self, encoder_inputs, decoder_inputs, target_weights, bucket_...
FILE: chatbot/components/base/_rnn.py
class Cell (line 25) | class Cell(RNNCell):
method __init__ (line 29) | def __init__(self, state_size, num_layers, dropout_prob, base_cell):
method state_size (line 55) | def state_size(self):
method shape (line 59) | def shape(self):
method single_layer_shape (line 67) | def single_layer_shape(self):
method output_size (line 75) | def output_size(self):
method __call__ (line 78) | def __call__(self, inputs, state, scope=None):
class RNN (line 102) | class RNN(object):
method __init__ (line 107) | def __init__(self,
method get_cell (line 131) | def get_cell(self, name):
method wrapper (line 139) | def wrapper(self, state):
method __call__ (line 154) | def __call__(self, *args):
class SimpleAttentionWrapper (line 158) | class SimpleAttentionWrapper(RNNCell):
method __init__ (line 165) | def __init__(self,
method zero_state (line 221) | def zero_state(self, batch_size, dtype):
method call (line 250) | def call(self, inputs, state):
method output_size (line 304) | def output_size(self):
method state_size (line 308) | def state_size(self):
method shape (line 317) | def shape(self):
class BasicRNNCell (line 326) | class BasicRNNCell(RNNCell):
method __init__ (line 335) | def __init__(self, num_units, reuse=None):
method state_size (line 340) | def state_size(self):
method output_size (line 344) | def output_size(self):
method __call__ (line 347) | def __call__(self, inputs, state, scope=None):
FILE: chatbot/components/bot_ops.py
function dynamic_sampled_softmax_loss (line 9) | def dynamic_sampled_softmax_loss(labels, logits, output_projection, voca...
function _dynamic_sampled_map (line 37) | def _dynamic_sampled_map(labels, logits, output_projection, vocab_size,
function _dynamic_sampled_from_scratch (line 82) | def _dynamic_sampled_from_scratch(labels, logits, output_projection, voc...
function cross_entropy_sequence_loss (line 148) | def cross_entropy_sequence_loss(logits, labels, weights):
function dot_prod (line 165) | def dot_prod(x, y):
function bahdanau_score (line 169) | def bahdanau_score(attention_dim, h_j, s_i):
function luong_score (line 184) | def luong_score(attention_dim, h_j, s_i):
function linear_map (line 195) | def linear_map(args, output_size, biases=None):
FILE: chatbot/components/decoders.py
class Decoder (line 20) | class Decoder(RNN):
method __init__ (line 26) | def __init__(self,
method __call__ (line 84) | def __call__(self,
method apply_projection (line 177) | def apply_projection(self, outputs, scope=None):
method sample (line 207) | def sample(self, projected_output):
method get_projection_tensors (line 234) | def get_projection_tensors(self):
class BasicDecoder (line 241) | class BasicDecoder(Decoder):
method __call__ (line 244) | def __call__(self,
class AttentionDecoder (line 259) | class AttentionDecoder(Decoder):
method __init__ (line 269) | def __init__(self,
method __call__ (line 302) | def __call__(self,
method get_cell (line 325) | def get_cell(self, name, initial_state):
FILE: chatbot/components/embedder.py
class Embedder (line 9) | class Embedder:
method __init__ (line 14) | def __init__(self, vocab_size, embed_size, l1_reg=0.0):
method __call__ (line 20) | def __call__(self, inputs, reuse=None):
method assign_visualizers (line 57) | def assign_visualizers(self, writer, scope_names, metadata_path):
method get_scope_basename (line 78) | def get_scope_basename(self, scope):
class AutoEncoder (line 86) | class AutoEncoder(Model):
method __init__ (line 91) | def __init__(self, dataset, params):
method build_computation_graph (line 98) | def build_computation_graph(self, dataset):
method compile (line 143) | def compile(self):
method step (line 178) | def step(self, forward_only=False):
method train (line 185) | def train(self, close_when_done=True):
method __call__ (line 219) | def __call__(self, sentence):
FILE: chatbot/components/encoders.py
class BasicEncoder (line 10) | class BasicEncoder(RNN):
method __call__ (line 15) | def __call__(self, inputs, initial_state=None):
class BidirectionalEncoder (line 37) | class BidirectionalEncoder(RNN):
method __call__ (line 46) | def __call__(self, inputs, initial_state=None):
FILE: chatbot/components/input_pipeline.py
class InputPipeline (line 12) | class InputPipeline:
method __init__ (line 22) | def __init__(self, file_paths, batch_size, capacity=None, is_chatting=...
method build_pipeline (line 48) | def build_pipeline(self, name):
method encoder_inputs (line 74) | def encoder_inputs(self):
method decoder_inputs (line 83) | def decoder_inputs(self):
method user_input (line 93) | def user_input(self):
method feed_dict (line 97) | def feed_dict(self):
method feed_user_input (line 100) | def feed_user_input(self, user_input):
method toggle_active (line 104) | def toggle_active(self):
method _cond_input (line 111) | def _cond_input(self, prefix):
method _read_line (line 118) | def _read_line(self, file):
method _assign_queue (line 130) | def _assign_queue(self, proto_text):
method _padded_bucket_batches (line 154) | def _padded_bucket_batches(self, input_length, data):
FILE: chatbot/dynamic_models.py
class DynamicBot (line 21) | class DynamicBot(Model):
method __init__ (line 32) | def __init__(self, dataset, params):
method build_computation_graph (line 54) | def build_computation_graph(self, dataset):
method compile (line 142) | def compile(self):
method step (line 210) | def step(self, forward_only=False):
method train (line 254) | def train(self, dataset=None):
method decode (line 333) | def decode(self):
method __call__ (line 347) | def __call__(self, sentence):
method chat (line 380) | def chat(self):
method respond (line 384) | def respond(self, sentence):
method close (line 388) | def close(self, save_current=True, rebuild_for_chat=True):
method _set_chat_params (line 408) | def _set_chat_params(self):
FILE: chatbot/legacy/_decode.py
function decode (line 13) | def decode(bot, dataset, teacher_mode=True):
function decode_inputs (line 43) | def decode_inputs(inputs, idx_to_word, chatbot):
function train_on_feedback (line 55) | def train_on_feedback(chatbot, input_ids, feedback_ids, idx_to_outputs):
function _logits_to_outputs (line 67) | def _logits_to_outputs(output_logits, temperature, idx_word):
function _sample (line 84) | def _sample(logits, temperature):
function _assign_to_bucket (line 97) | def _assign_to_bucket(token_ids, buckets):
FILE: chatbot/legacy/_train.py
function train (line 9) | def train(bot, dataset):
function run_train_step (line 51) | def run_train_step(model, train_set, bucket_id, forward_only=False):
function run_checkpoint (line 60) | def run_checkpoint(model, step_time, loss, previous_losses, dev_set):
function _get_data_distribution (line 80) | def _get_data_distribution(train_set, buckets):
FILE: chatbot/legacy/legacy_models.py
class ChatBot (line 18) | class ChatBot(BucketModel):
method __init__ (line 37) | def __init__(self, buckets, dataset, params):
method step (line 112) | def step(self, encoder_inputs, decoder_inputs, target_weights, bucket_...
method _sampled_loss (line 153) | def _sampled_loss(num_samples, hidden_size, vocab_size):
method _get_projections (line 187) | def _get_projections(num_buckets, unprojected_vals, projection_operator):
method _get_placeholder_list (line 202) | def _get_placeholder_list(name, length, dtype=tf.int32):
class SimpleBot (line 213) | class SimpleBot(BucketModel):
method __init__ (line 221) | def __init__(self, dataset, params):
method _simple_loss (line 324) | def _simple_loss(batch_size, logits, targets, weights):
method step (line 339) | def step(self, encoder_inputs, decoder_inputs, target_weights, bucket_...
FILE: data/_dataset.py
class DatasetABC (line 18) | class DatasetABC(metaclass=ABCMeta):
method convert_to_tf_records (line 21) | def convert_to_tf_records(self, *args):
method train_generator (line 28) | def train_generator(self, batch_size):
method valid_generator (line 35) | def valid_generator(self, batch_size):
method word_to_idx (line 42) | def word_to_idx(self):
method idx_to_word (line 47) | def idx_to_word(self):
method name (line 52) | def name(self):
method max_seq_len (line 57) | def max_seq_len(self):
class Dataset (line 62) | class Dataset(DatasetABC):
method __init__ (line 64) | def __init__(self, dataset_params):
method convert_to_tf_records (line 102) | def convert_to_tf_records(self, prefix='train'):
method sentence_generator (line 166) | def sentence_generator(self, prefix='from'):
method pairs_generator (line 179) | def pairs_generator(self, num_generate=None):
method train_generator (line 196) | def train_generator(self, batch_size):
method valid_generator (line 203) | def valid_generator(self, batch_size):
method _generator (line 210) | def _generator(self, from_path, to_path, batch_size):
method word_to_idx (line 272) | def word_to_idx(self):
method idx_to_word (line 277) | def idx_to_word(self):
method as_words (line 281) | def as_words(self, sentence):
method name (line 304) | def name(self):
method train_size (line 309) | def train_size(self):
method valid_size (line 313) | def valid_size(self):
method max_seq_len (line 317) | def max_seq_len(self):
method fill_params (line 321) | def fill_params(dataset_params):
method __getattr__ (line 328) | def __getattr__(self, name):
FILE: data/data_helper.py
function prompt (line 38) | def prompt(text, default="", required=False):
class DataHelper (line 49) | class DataHelper:
method __init__ (line 56) | def __init__(self, log_level=logging.INFO):
method safe_load (line 113) | def safe_load(self):
method load_random (line 153) | def load_random(self, year=None):
method load_next (line 168) | def load_next(self):
method set_word_freq (line 181) | def set_word_freq(self, wf):
method word_freq (line 188) | def word_freq(self):
method next_file_path (line 192) | def next_file_path(self):
method get_year_from_path (line 195) | def get_year_from_path(self, path):
method generate_files (line 206) | def generate_files(self,
method df_generator (line 240) | def df_generator(self):
method random_rows_generator (line 250) | def random_rows_generator(num_rows_per_print, num_rows_total):
method word_tokenizer (line 259) | def word_tokenizer(sentences):
method df_to_json (line 271) | def df_to_json(df, target_file=None, orient='records', lines=False, **...
FILE: data/dataset_wrappers.py
function check_data (line 11) | def check_data(abs_path, name):
class Cornell (line 30) | class Cornell(Dataset):
method __init__ (line 33) | def __init__(self, dataset_params):
class Ubuntu (line 42) | class Ubuntu(Dataset):
method __init__ (line 45) | def __init__(self, dataset_params):
class Reddit (line 54) | class Reddit(Dataset):
method __init__ (line 57) | def __init__(self, dataset_params):
class TestData (line 66) | class TestData(Dataset):
method __init__ (line 69) | def __init__(self, dataset_params):
FILE: data/reddit_preprocessor.py
function timed_function (line 35) | def timed_function(*expected_args):
function parallel_map_df (line 52) | def parallel_map_df(fn, df):
function parallel_map_list (line 64) | def parallel_map_list(fn, iterable):
function sentence_score (line 75) | def sentence_score(sentences):
function root_comments (line 86) | def root_comments(df):
function remove_extra_columns (line 101) | def remove_extra_columns(df):
function regex_replacements (line 112) | def regex_replacements(df):
function remove_large_comments (line 131) | def remove_large_comments(max_len, df):
function expand_contractions (line 138) | def expand_contractions(df):
function children_dict (line 149) | def children_dict(df):
function main (line 163) | def main():
FILE: main.py
function start_training (line 86) | def start_training(dataset, bot):
function start_chatting (line 95) | def start_chatting(bot):
function main (line 110) | def main(argv):
FILE: tests/test_config.py
class TestConfig (line 16) | class TestConfig(unittest.TestCase):
method setUp (line 19) | def setUp(self):
method test_merge_params (line 24) | def test_merge_params(self):
method test_optimize (line 64) | def test_optimize(self):
method test_update_config (line 91) | def test_update_config(self):
FILE: tests/test_data.py
class TestData (line 17) | class TestData(unittest.TestCase):
method setUp (line 20) | def setUp(self):
method test_basic (line 30) | def test_basic(self):
method test_cornell (line 87) | def test_cornell(self):
FILE: tests/test_dynamic_models.py
class TestDynamicModels (line 18) | class TestDynamicModels(unittest.TestCase):
method setUp (line 20) | def setUp(self):
method test_create_bot (line 23) | def test_create_bot(self):
method test_save_bot (line 29) | def test_save_bot(self):
method test_save_bot (line 34) | def test_save_bot(self):
method test_train (line 41) | def test_train(self):
method test_base_methods (line 51) | def test_base_methods(self):
method test_manual_freeze (line 61) | def test_manual_freeze(self):
method test_memorize (line 142) | def test_memorize(self):
method _quick_train (line 179) | def _quick_train(self, bot, num_iter=10):
FILE: tests/test_legacy_models.py
class TestLegacyModels (line 13) | class TestLegacyModels(unittest.TestCase):
method setUp (line 16) | def setUp(self):
method test_create (line 24) | def test_create(self):
method test_compile (line 37) | def test_compile(self):
FILE: tests/utils.py
function create_bot (line 37) | def create_bot(flags=TEST_FLAGS, return_dataset=False):
function update_config (line 63) | def update_config(config, **kwargs):
FILE: utils/bot_freezer.py
function load_graph (line 15) | def load_graph(frozen_model_dir):
function unfreeze_bot (line 43) | def unfreeze_bot(frozen_model_path):
function unfreeze_and_chat (line 60) | def unfreeze_and_chat(frozen_model_path):
function get_frozen_vocab (line 94) | def get_frozen_vocab(config):
class FrozenBot (line 103) | class FrozenBot:
method __init__ (line 105) | def __init__(self, frozen_model_dir, vocab_size):
method as_words (line 115) | def as_words(self, sentence):
method __call__ (line 118) | def __call__(self, sentence):
FILE: utils/io_utils.py
function save_hyper_params (line 60) | def save_hyper_params(hyper_params, fname):
function get_sentence (line 67) | def get_sentence(lower=True):
function update_config (line 79) | def update_config(config=None,
function get_yaml_config (line 129) | def get_yaml_config(path, save_path=True):
function load_pretrained_config (line 138) | def load_pretrained_config(pretrained_dir):
function print_non_defaults (line 162) | def print_non_defaults(config):
function flags_to_dict (line 185) | def flags_to_dict(flags):
function merge_dicts (line 228) | def merge_dicts(default_dict, preference_dict):
function parse_config (line 252) | def parse_config(flags=None, pretrained_dir=None, config_path=None):
function basic_tokenizer (line 309) | def basic_tokenizer(sentence):
function num_lines (line 317) | def num_lines(file_path):
function get_word_freqs (line 323) | def get_word_freqs(path, counter, norm_digits=True):
function create_vocabulary (line 349) | def create_vocabulary(vocab_path, from_path, to_path, max_vocab_size, no...
function get_vocab_dicts (line 386) | def get_vocab_dicts(vocabulary_path):
function sentence_to_token_ids (line 410) | def sentence_to_token_ids(sentence, vocabulary, normalize_digits=True):
function data_to_token_ids (line 434) | def data_to_token_ids(data_path, target_path, vocabulary_path, normalize...
function prepare_data (line 461) | def prepare_data(data_dir,
FILE: webpage/config.py
class Config (line 5) | class Config:
method init_app (line 25) | def init_app(app):
class DevelopmentConfig (line 29) | class DevelopmentConfig(Config):
class TestingConfig (line 35) | class TestingConfig(Config):
class ProductionConfig (line 41) | class ProductionConfig(Config):
FILE: webpage/deepchat/__init__.py
class ReverseProxied (line 29) | class ReverseProxied(object):
method __init__ (line 46) | def __init__(self, app):
method __call__ (line 49) | def __call__(self, environ, start_response):
function create_app (line 68) | def create_app(config_name):
FILE: webpage/deepchat/main/errors.py
function page_not_found (line 8) | def page_not_found(e):
function internal_server_error (line 13) | def internal_server_error(e):
FILE: webpage/deepchat/main/forms.py
function bad_chars (line 10) | def bad_chars(form, string_field):
class ChatForm (line 16) | class ChatForm(FlaskForm):
class UserForm (line 22) | class UserForm(FlaskForm):
class SentencePairForm (line 30) | class SentencePairForm(FlaskForm):
FILE: webpage/deepchat/main/views.py
function inject_enumerate (line 26) | def inject_enumerate():
function load_gloabal_data (line 31) | def load_gloabal_data():
function index (line 39) | def index():
function about (line 51) | def about():
function plots (line 56) | def plots():
function update_database (line 61) | def update_database(user_message, bot_response):
function get_database_model (line 87) | def get_database_model(class_name, filter=None, **kwargs):
class UserAPI (line 111) | class UserAPI(Resource):
method post (line 113) | def post(self):
class ChatAPI (line 120) | class ChatAPI(Resource):
method __init__ (line 126) | def __init__(self, data_name):
method post (line 140) | def post(self):
class RedditAPI (line 152) | class RedditAPI(ChatAPI):
method __init__ (line 153) | def __init__(self):
class CornellAPI (line 157) | class CornellAPI(ChatAPI):
method __init__ (line 158) | def __init__(self):
class UbuntuAPI (line 162) | class UbuntuAPI(ChatAPI):
method __init__ (line 163) | def __init__(self):
class AuthException (line 176) | class AuthException(HTTPException):
method __init__ (line 177) | def __init__(self, message):
class ModelView (line 183) | class ModelView(sqla.ModelView):
method is_accessible (line 184) | def is_accessible(self):
method inaccessible_callback (line 190) | def inaccessible_callback(self, name, **kwargs):
FILE: webpage/deepchat/models.py
class User (line 15) | class User(db.Model):
method __repr__ (line 45) | def __repr__(self):
class Chatbot (line 49) | class Chatbot(db.Model):
method __init__ (line 62) | def __init__(self, name, **bot_kwargs):
method __repr__ (line 73) | def __repr__(self):
class Conversation (line 77) | class Conversation(db.Model):
method __repr__ (line 84) | def __repr__(self):
class Turn (line 88) | class Turn(db.Model):
method __repr__ (line 94) | def __repr__(self):
FILE: webpage/deepchat/static/js/jqBootstrapValidation.js
function regexFromString (line 875) | function regexFromString(inputstring) {
function executeFunctionByName (line 885) | function executeFunctionByName(functionName, context /*, args*/) {
FILE: webpage/deepchat/static/vendor/bootstrap-3.3.7-dist/js/bootstrap.js
function transitionEnd (line 34) | function transitionEnd() {
function removeElement (line 126) | function removeElement() {
function Plugin (line 142) | function Plugin(option) {
function Plugin (line 251) | function Plugin(option) {
function Plugin (line 475) | function Plugin(option) {
function getTargetFromTrigger (line 695) | function getTargetFromTrigger($trigger) {
function Plugin (line 707) | function Plugin(option) {
function getParent (line 774) | function getParent($this) {
function clearMenus (line 787) | function clearMenus(e) {
function Plugin (line 880) | function Plugin(option) {
function Plugin (line 1208) | function Plugin(option, _relatedTarget) {
function complete (line 1574) | function complete() {
function Plugin (line 1750) | function Plugin(option) {
function Plugin (line 1859) | function Plugin(option) {
function ScrollSpy (line 1902) | function ScrollSpy(element, options) {
function Plugin (line 2022) | function Plugin(option) {
function next (line 2131) | function next() {
function Plugin (line 2177) | function Plugin(option) {
function Plugin (line 2334) | function Plugin(option) {
FILE: webpage/deepchat/web_bot.py
function basic_tokenizer (line 19) | def basic_tokenizer(sentence):
function sentence_to_token_ids (line 26) | def sentence_to_token_ids(sentence, vocabulary, normalize_digits=True):
function get_vocab_dicts (line 34) | def get_vocab_dicts(vocabulary_path):
function load_graph (line 47) | def load_graph(frozen_model_dir):
function unfreeze_bot (line 74) | def unfreeze_bot(frozen_model_path):
class FrozenBot (line 91) | class FrozenBot:
method __init__ (line 94) | def __init__(self, frozen_model_dir, is_testing=False):
method load_config (line 115) | def load_config(self, config_path):
method __getattr__ (line 121) | def __getattr__(self, name):
method get_frozen_vocab (line 134) | def get_frozen_vocab(self, config):
method as_words (line 142) | def as_words(self, sentence):
method __call__ (line 160) | def __call__(self, sentence):
method unfreeze (line 183) | def unfreeze(self):
method freeze (line 190) | def freeze(self):
FILE: webpage/manage.py
function make_shell_context (line 27) | def make_shell_context():
function test (line 44) | def test():
function deploy (line 58) | def deploy():
FILE: webpage/migrations/env.py
function run_migrations_offline (line 31) | def run_migrations_offline():
function run_migrations_online (line 50) | def run_migrations_online():
FILE: webpage/migrations/versions/236b966ecd2f_.py
function upgrade (line 19) | def upgrade():
function downgrade (line 61) | def downgrade():
FILE: webpage/tests/test_database.py
class TestDatabase (line 14) | class TestDatabase(unittest.TestCase):
method setUp (line 16) | def setUp(self):
method tearDown (line 23) | def tearDown(self):
method test_app_exists (line 29) | def test_app_exists(self):
FILE: webpage/tests/test_simple.py
class TestSimple (line 8) | class TestSimple(unittest.TestCase):
method setUp (line 11) | def setUp(self):
method tearDown (line 18) | def tearDown(self):
method test_app_exists (line 24) | def test_app_exists(self):
method test_app_is_testing (line 27) | def test_app_is_testing(self):
Condensed preview — 144 files, each showing its path, character count, and a content snippet. Download the .json file or copy it to your clipboard to get the full structured content (1,880K chars).
[
{
"path": ".gitattributes",
"chars": 116,
"preview": "notebooks/* linguist-vendored\n\n*.pb filter=lfs diff=lfs merge=lfs -text\n**/*.pb filter=lfs diff=lfs merge=lfs -text\n"
},
{
"path": ".gitignore",
"chars": 831,
"preview": "**/images/hidden/*\n\n# Note: Any leading '**' need to be followed by a '/'. \n# Seems like '**/thing_to_ignore_always[/*]'"
},
{
"path": ".travis.yml",
"chars": 1330,
"preview": "language: python\ndist: trusty\nsudo: True\npython:\n - \"3.5\"\n - \"pypy3\"\nbefore_install:\n - sudo apt-get update\n "
},
{
"path": "LICENSE.md",
"chars": 1073,
"preview": "MIT License\n\nCopyright (c) 2017 Brandon McKinzie\n\nPermission is hereby granted, free of charge, to any person obtaining "
},
{
"path": "README.md",
"chars": 11370,
"preview": "# Conversation Models in Tensorflow\n\nNotes to visitors:\n* I've just shut down the website indefinitely. I ran out of my "
},
{
"path": "chatbot/__init__.py",
"chars": 420,
"preview": "from chatbot import globals\nfrom chatbot.components.base._rnn import *\nfrom chatbot.components.bot_ops import dynamic_s"
},
{
"path": "chatbot/_models.py",
"chars": 15359,
"preview": "\"\"\"Abstract classes.\"\"\"\n\nfrom __future__ import absolute_import\nfrom __future__ import division\nfrom __future__ import p"
},
{
"path": "chatbot/components/__init__.py",
"chars": 426,
"preview": "from chatbot.components.embedder import Embedder\nfrom chatbot.components.input_pipeline import InputPipeline\nfrom chatbo"
},
{
"path": "chatbot/components/base/__init__.py",
"chars": 0,
"preview": ""
},
{
"path": "chatbot/components/base/_rnn.py",
"chars": 14094,
"preview": "\"\"\"Collection of base RNN classes and custom RNNCells.\n\"\"\"\n\nimport tensorflow as tf\nfrom tensorflow.python.util import n"
},
{
"path": "chatbot/components/bot_ops.py",
"chars": 10730,
"preview": "\"\"\"Custom TF 'ops' as meant in the TensorFlow definition of ops.\"\"\"\n\nimport numpy as np\nimport tensorflow as tf\nfrom uti"
},
{
"path": "chatbot/components/decoders.py",
"chars": 13563,
"preview": "import logging\nimport tensorflow as tf\nimport sys\n\n# Required due to TensorFlow's unreliable naming across versions . . "
},
{
"path": "chatbot/components/embedder.py",
"chars": 9211,
"preview": "import tensorflow as tf\nimport logging\nimport numpy as np\nfrom chatbot._models import Model\nfrom utils import io_utils\ni"
},
{
"path": "chatbot/components/encoders.py",
"chars": 3682,
"preview": "\"\"\"Classes for the dynamic encoders.\"\"\"\n\nimport tensorflow as tf\nfrom tensorflow.contrib.rnn import GRUCell\nfrom tensorf"
},
{
"path": "chatbot/components/input_pipeline.py",
"chars": 7108,
"preview": "import logging\nimport tensorflow as tf\nfrom utils import io_utils\nfrom tensorflow.contrib.training import bucket_by_sequ"
},
{
"path": "chatbot/dynamic_models.py",
"chars": 18429,
"preview": "\"\"\"Sequence-to-sequence models with dynamic unrolling and faster embedding \ntechniques.\n\"\"\"\n\nfrom __future__ import abso"
},
{
"path": "chatbot/globals.py",
"chars": 1742,
"preview": "\"\"\"Place all default/global chatbot variables here.\"\"\"\n\nimport tensorflow as tf\n\nOPTIMIZERS = {\n 'Adagrad': tf.train"
},
{
"path": "chatbot/legacy/__init__.py",
"chars": 0,
"preview": ""
},
{
"path": "chatbot/legacy/_decode.py",
"chars": 4173,
"preview": "\"\"\"Used by legacy_models for decoding. Not needed by DynamicBot.\"\"\"\n\nimport tensorflow as tf\nimport logging\nimport os\nim"
},
{
"path": "chatbot/legacy/_train.py",
"chars": 4048,
"preview": "\"\"\"Train seq2seq attention chatbot.\nNote: Only used for legacy_models.\nFor (better) DynamicBot implementation, please se"
},
{
"path": "chatbot/legacy/legacy_models.py",
"chars": 20872,
"preview": "\"\"\"Sequence-to-sequence models.\"\"\"\n\n# EDIT: Modified inheritance strucutre (see _models.py) so these *should* work again"
},
{
"path": "configs/example_attention.yml",
"chars": 413,
"preview": "model: DynamicBot\ndataset: Cornell\nmodel_params:\n base_cell: LSTMCell\n ckpt_dir: out/cornell\n attention_mechanism: Ba"
},
{
"path": "configs/example_cornell.yml",
"chars": 440,
"preview": "model: DynamicBot\ndataset: Cornell\nmodel_params:\n base_cell: LSTMCell\n num_layers: 2\n attention_mechanism: Luon"
},
{
"path": "configs/example_reddit.yml",
"chars": 330,
"preview": "model: DynamicBot\ndataset: Reddit\nmodel_params:\n base_cell: GRUCell\n batch_size: 128\n embed_size: 128\n num_layers: 1"
},
{
"path": "configs/example_ubuntu.yml",
"chars": 379,
"preview": "model: DynamicBot\ndataset: Ubuntu\nmodel_params:\n base_cell: GRUCell\n ckpt_dir: out/ubuntu\n decoder.class: BasicDecode"
},
{
"path": "configs/ubuntu_basic.yml",
"chars": 442,
"preview": "model: chatbot.DynamicBot\ndataset: data.Ubuntu\nmodel_params:\n base_cell: GRUCell\n ckpt_dir: out/ubuntu/basic\n decode:"
},
{
"path": "configs/website_config.yml",
"chars": 529,
"preview": "# Experimenting with best model params for website.\nmodel: DynamicBot\ndataset: Reddit\nmodel_params:\n base_cell: LSTMCel"
},
{
"path": "data/__init__.py",
"chars": 327,
"preview": "from __future__ import absolute_import\n\nfrom data import data_helper\nfrom data import _dataset\nfrom data import dataset_"
},
{
"path": "data/_dataset.py",
"chars": 12596,
"preview": "\"\"\"ABC for datasets. \"\"\"\n\nfrom __future__ import absolute_import\nfrom __future__ import division\nfrom __future__ import "
},
{
"path": "data/data_helper.py",
"chars": 11670,
"preview": "\"\"\"Provides pre-processing functionality.\n\nAbstracts paths and filenames so we don't have to think about them. Currently"
},
{
"path": "data/dataset_wrappers.py",
"chars": 2453,
"preview": "\"\"\"Named data wrapper classes. No added functionality to dataset base class for now,\nbut preprocessing checks will be in"
},
{
"path": "data/reddit_preprocessor.py",
"chars": 6818,
"preview": "\"\"\"Reddit data preprocessing.\"\"\"\n\nfrom __future__ import absolute_import\nfrom __future__ import division\nfrom __future_"
},
{
"path": "data/regex.py",
"chars": 3606,
"preview": "regex_replace = {\n (r\"https?:\\/\\/\"\n r\"(www\\.)?\"\n r\"[^\\s\\.]+\"\n r\"\\.\\S{2,}\"): \"<link>\", # Raw l"
},
{
"path": "main.py",
"chars": 5318,
"preview": "#!/usr/bin/env python3\n\n\"\"\"main.py: Train and/or chat with a bot. (work in progress).\n\nTypical use cases:\n 1. Train "
},
{
"path": "notebooks/Analysis.ipynb",
"chars": 484994,
"preview": "{\n \"cells\": [\n {\n \"cell_type\": \"markdown\",\n \"metadata\": {},\n \"source\": [\n \"# Analysis, Goals, and Predictions\\"
},
{
"path": "notebooks/DataVizUtils.ipynb",
"chars": 267648,
"preview": "{\n \"cells\": [\n {\n \"cell_type\": \"markdown\",\n \"metadata\": {},\n \"source\": [\n \"# Data Visualization Utilities\"\n "
},
{
"path": "notebooks/README.md",
"chars": 1540,
"preview": "## Data Processing\n\n### Overview of initial pre-processing\nOur initial method of processing the raw reddit data uses the"
},
{
"path": "notebooks/RedditPipelineAndVisualization.ipynb",
"chars": 68116,
"preview": "{\n \"cells\": [\n {\n \"cell_type\": \"markdown\",\n \"metadata\": {},\n \"source\": [\n \"# Reddit Preprocessing in Stages\\n\""
},
{
"path": "notebooks/TensorFlow Notes.ipynb",
"chars": 43004,
"preview": "{\n \"cells\": [\n {\n \"cell_type\": \"markdown\",\n \"metadata\": {\n \"heading_collapsed\": true\n },\n \"source\": [\n \"#"
},
{
"path": "notebooks/__init__.py",
"chars": 0,
"preview": ""
},
{
"path": "notebooks/ubuntu_reformat.ipynb",
"chars": 134475,
"preview": "{\n \"cells\": [\n {\n \"cell_type\": \"markdown\",\n \"metadata\": {\n \"collapsed\": true\n },\n \"source\": [\n \"# Reforma"
},
{
"path": "requirements.txt",
"chars": 107,
"preview": "Pympler==0.5\nsetuptools==35.0.1\nPyYAML==3.12\nnltk>=3.2.2\nnumpy>=1.11.0\npandas>=0.19.2\ntensorflow==1.2.0rc2\n"
},
{
"path": "setup.py",
"chars": 518,
"preview": "from setuptools import setup\n\nsetup(name='DeepChatModels',\n description='Conversation Models in TensorFlow',\n "
},
{
"path": "tests/__init__.py",
"chars": 0,
"preview": ""
},
{
"path": "tests/test_config.py",
"chars": 4303,
"preview": "\"\"\"Tests for various operations done on config (yaml) dictionaries in project.\"\"\"\n\nimport os\nimport pydoc\nimport yaml\nim"
},
{
"path": "tests/test_config.yml",
"chars": 388,
"preview": "dataset: TestData\ndataset_params:\n config_path: /home/brandon/Documents/DeepChatModels/tests/test_config.yml\n max_seq_"
},
{
"path": "tests/test_data/train_from.txt",
"chars": 579,
"preview": "One.\nTwo.\nThree.\nFour.\nFive.\nSix.\nSeven.\nEight.\nNine.\nHello.\nWhat is your name?\nDo you like ice cream?\nWho is the presid"
},
{
"path": "tests/test_data/train_from.txt.ids121",
"chars": 419,
"preview": "36 4\n23 4\n20 4\n27 4\n18 4\n58 4\n47 4\n51 4\n44 4\n46 4\n13 12 24 106 5\n11 6 42 54 35 5\n28 12 14 16 31 14 41 38 5\n28 12 24 17 1"
},
{
"path": "tests/test_data/train_to.txt",
"chars": 561,
"preview": "Two.\nThree.\nFour.\nFive.\nSix.\nSeven.\nEight.\nNine.\nTen.\nHi.\nI am Groot.\nI hate ice cream.\nSatan is the president of the Un"
},
{
"path": "tests/test_data/train_to.txt.ids121",
"chars": 429,
"preview": "23 4\n20 4\n27 4\n18 4\n58 4\n47 4\n51 4\n44 4\n113 4\n33 4\n8 115 104 4\n8 75 54 35 4\n118 12 14 16 31 14 41 38 4\n68 108 12 48 17 1"
},
{
"path": "tests/test_data/valid_from.txt",
"chars": 292,
"preview": "Are you stupid?\nThank you.\nWho is a good boy?\nAre cats better than dogs?\nAre you stupid?\nThank you.\nWho is a good boy?\nA"
},
{
"path": "tests/test_data/valid_from.txt.ids121",
"chars": 200,
"preview": "10 6 3 5\n62 6 4\n28 12 21 60 3 5\n10 67 120 3 102 5\n10 6 3 5\n62 6 4\n28 12 21 60 3 5\n10 67 120 3 102 5\n10 6 3 5\n62 6 4\n28 1"
},
{
"path": "tests/test_data/valid_to.txt",
"chars": 348,
"preview": "No, I am not stupid.\nYou are welcome.\nI am a good boy.\nYes, cats are better than dogs.\nNo, I am not stupid.\nYou are welc"
},
{
"path": "tests/test_data/valid_to.txt.ids121",
"chars": 264,
"preview": "39 7 8 115 34 3 4\n6 10 66 4\n8 115 21 60 3 4\n3 7 67 10 120 3 102 4\n39 7 8 115 34 3 4\n6 10 66 4\n8 115 21 60 3 4\n3 7 67 10 "
},
{
"path": "tests/test_data/vocab121.txt",
"chars": 613,
"preview": "_PAD\n_GO\n_EOS\n_UNK\n.\n?\nyou\n,\ni\n'\nare\nis\ndo\nwhat\nthe\nt\nfavorite\nfive\npresident\ns\nthree\nhave\nfour\nwho\nnice\nyour\ndon\nsay\na\n"
},
{
"path": "tests/test_data.py",
"chars": 4834,
"preview": "import logging\nimport pdb\nimport sys\nsys.path.append(\"..\")\nimport os\nimport unittest\nimport tensorflow as tf\nfrom pydoc "
},
{
"path": "tests/test_dynamic_models.py",
"chars": 6769,
"preview": "\"\"\"Trial runs on DynamicBot with the TestData Dataset.\"\"\"\n\nimport time\nimport logging\nimport unittest\n\nimport numpy as n"
},
{
"path": "tests/test_legacy_models.py",
"chars": 1680,
"preview": "import os\nimport tensorflow as tf\nimport unittest\nimport logging\n\nimport sys\nfrom utils import io_utils\nimport data\nimpo"
},
{
"path": "tests/utils.py",
"chars": 2349,
"preview": "\"\"\"Utility functions used by test modules.\"\"\"\n\nimport logging\nimport data\nimport chatbot\n\nimport os\nfrom pydoc import lo"
},
{
"path": "utils/__init__.py",
"chars": 57,
"preview": "from utils import io_utils\nfrom utils import bot_freezer\n"
},
{
"path": "utils/bot_freezer.py",
"chars": 4612,
"preview": "\"\"\"Utilities for freezing and unfreezing model graphs and variables on the fly.\"\"\"\n\nfrom __future__ import absolute_impo"
},
{
"path": "utils/io_utils.py",
"chars": 22690,
"preview": "\"\"\"Utilities for downloading data from various datasets, tokenizing, vocabularies.\"\"\"\nfrom __future__ import absolute_im"
},
{
"path": "webpage/__init__.py",
"chars": 0,
"preview": ""
},
{
"path": "webpage/app.yaml",
"chars": 323,
"preview": "runtime: python\nenv: flex\nthreadsafe: false\nentrypoint: gunicorn -b :$PORT manage:app\n\nresources:\n cpu: 2 # Number of "
},
{
"path": "webpage/config.py",
"chars": 1657,
"preview": "import os\nbasedir = os.path.abspath(os.path.dirname(__file__))\n\n\nclass Config:\n\n DEFAULT_THEME = 'lumen'\n # Boolea"
},
{
"path": "webpage/deepchat/__init__.py",
"chars": 3318,
"preview": "\"\"\"deepchat/__init__.py: Initialize session objects.\"\"\"\n\nimport os\nfrom flask import Flask\nfrom flask_wtf import CSRFPro"
},
{
"path": "webpage/deepchat/main/__init__.py",
"chars": 486,
"preview": "\"\"\"Package constructor file for creating blueprint(s).\"\"\"\n\nfrom flask import Blueprint\nfrom flask_cors import CORS\n\n# Bl"
},
{
"path": "webpage/deepchat/main/errors.py",
"chars": 327,
"preview": "\"\"\"Routes for error pages.\"\"\"\n\nfrom flask import render_template\nfrom . import main\n\n\n@main.app_errorhandler(404)\ndef pa"
},
{
"path": "webpage/deepchat/main/forms.py",
"chars": 1172,
"preview": "\"\"\"apps/forms.py: \"\"\"\n\nfrom flask_wtf import FlaskForm\nfrom wtforms import StringField, SubmitField, \\\n TextField, Te"
},
{
"path": "webpage/deepchat/main/views.py",
"chars": 6137,
"preview": "from datetime import datetime\nimport os\nimport yaml\nimport json\n\nfrom flask import make_response, flash\nfrom werkzeug.ex"
},
{
"path": "webpage/deepchat/models.py",
"chars": 3794,
"preview": "\"\"\"app/models.py: Tutorial IV - Databases.\n\ndatabase models: collection of classes whose purpose is to represent the\n "
},
{
"path": "webpage/deepchat/static/assets/plots/accuracy.json",
"chars": 9712,
"preview": "{\"id\": \"el10890140364661486200\", \"axes\": [{\"texts\": [{\"id\": \"el10890140364668459552\", \"v_baseline\": \"hanging\", \"position"
},
{
"path": "webpage/deepchat/static/assets/plots/configs.json",
"chars": 1039,
"preview": "{\"BidiGRU\": {\"dataset_params\": {\"config_path\": \"configs/cornellBasicBidi.yml\"}, \"model_params\": {\"encoder.class\": \"Bidir"
},
{
"path": "webpage/deepchat/static/assets/plots/training.json",
"chars": 9599,
"preview": "{\"id\": \"el10890140364660675304\", \"axes\": [{\"texts\": [{\"id\": \"el10890140364658041968\", \"v_baseline\": \"hanging\", \"position"
},
{
"path": "webpage/deepchat/static/assets/plots/validation.json",
"chars": 9624,
"preview": "{\"id\": \"el10890140364671015904\", \"axes\": [{\"texts\": [{\"id\": \"el10890140364662563784\", \"v_baseline\": \"hanging\", \"position"
},
{
"path": "webpage/deepchat/static/assets/test_data/train_from.txt",
"chars": 579,
"preview": "One.\nTwo.\nThree.\nFour.\nFive.\nSix.\nSeven.\nEight.\nNine.\nHello.\nWhat is your name?\nDo you like ice cream?\nWho is the presid"
},
{
"path": "webpage/deepchat/static/assets/test_data/train_to.txt",
"chars": 561,
"preview": "Two.\nThree.\nFour.\nFive.\nSix.\nSeven.\nEight.\nNine.\nTen.\nHi.\nI am Groot.\nI hate ice cream.\nSatan is the president of the Un"
},
{
"path": "webpage/deepchat/static/assets/test_data/valid_from.txt",
"chars": 292,
"preview": "Are you stupid?\nThank you.\nWho is a good boy?\nAre cats better than dogs?\nAre you stupid?\nThank you.\nWho is a good boy?\nA"
},
{
"path": "webpage/deepchat/static/assets/test_data/valid_to.txt",
"chars": 348,
"preview": "No, I am not stupid.\nYou are welcome.\nI am a good boy.\nYes, cats are better than dogs.\nNo, I am not stupid.\nYou are welc"
},
{
"path": "webpage/deepchat/static/assets/test_data/vocab121.txt",
"chars": 613,
"preview": "_PAD\n_GO\n_EOS\n_UNK\n.\n?\nyou\n,\ni\n'\ndo\nare\nis\nwhat\nthe\nfavorite\nt\nfive\npresident\nwho\na\nthree\nhave\ntwo\nsay\nyour\nnice\nfour\ndo"
},
{
"path": "webpage/deepchat/static/css/style_modifications.css",
"chars": 1327,
"preview": "/* Various style tweaks for chat boxes and bootstrap css. */\n\n\n.jumbotron {\n background-color: whitesmoke;\n}\n\n.jumbotro"
},
{
"path": "webpage/deepchat/static/css/theme.css",
"chars": 303,
"preview": "/* custom stuff for bootstrap-reference page */\nbody {\n padding-top: 70px;\n padding-bottom: 30px;\n}\n\n.theme-dropdown ."
},
{
"path": "webpage/deepchat/static/js/bootstrapify.js",
"chars": 601,
"preview": "/* Assign certain elements to bootstrap classes by default, so\n * less typing for me.\n */\n$(document).ready(function() {"
},
{
"path": "webpage/deepchat/static/js/chat_processing.js",
"chars": 2270,
"preview": "$(document).ready(function() {\n\n // Extract the user input from the field.\n let chatForm = $('.chat-form');\n le"
},
{
"path": "webpage/deepchat/static/js/jqBootstrapValidation.js",
"chars": 36196,
"preview": "/* jqBootstrapValidation\n * A plugin for automating validation on Twitter Bootstrap formatted forms.\n *\n * v1.3.6\n *\n * "
},
{
"path": "webpage/deepchat/static/js/user_form.js",
"chars": 606,
"preview": "$(document).ready(function() {\n\n $('#user-form-submit').on('click', function(e) {\n var userName = $('#user-nam"
},
{
"path": "webpage/deepchat/static/vendor/bootstrap-3.3.7-dist/css/bootstrap-theme.css",
"chars": 26132,
"preview": "/*!\n * Bootstrap v3.3.7 (http://getbootstrap.com)\n * Copyright 2011-2016 Twitter, Inc.\n * Licensed under MIT (https://gi"
},
{
"path": "webpage/deepchat/static/vendor/bootstrap-3.3.7-dist/css/bootstrap.css",
"chars": 146223,
"preview": "/*!\n * Bootstrap v3.3.7 (http://getbootstrap.com)\n * Copyright 2011-2016 Twitter, Inc.\n * Licensed under MIT (https://gi"
},
{
"path": "webpage/deepchat/static/vendor/bootstrap-3.3.7-dist/js/bootstrap.js",
"chars": 69707,
"preview": "/*!\n * Bootstrap v3.3.7 (http://getbootstrap.com)\n * Copyright 2011-2016 Twitter, Inc.\n * Licensed under the MIT license"
},
{
"path": "webpage/deepchat/static/vendor/bootstrap-3.3.7-dist/js/npm.js",
"chars": 484,
"preview": "// This file is autogenerated via the `commonjs` Grunt task. You can require() this file in a CommonJS environment.\nrequ"
},
{
"path": "webpage/deepchat/static/vendor/font-awesome/css/font-awesome.css",
"chars": 35134,
"preview": "/*!\n * Font Awesome 4.6.3 by @davegandy - http://fontawesome.io - @fontawesome\n * License - http://fontawesome.io/lice"
},
{
"path": "webpage/deepchat/static/vendor/font-awesome/less/animated.less",
"chars": 713,
"preview": "// Animated Icons\n// --------------------------\n\n.@{fa-css-prefix}-spin {\n -webkit-animation: fa-spin 2s infinite linea"
},
{
"path": "webpage/deepchat/static/vendor/font-awesome/less/bordered-pulled.less",
"chars": 585,
"preview": "// Bordered & Pulled\n// -------------------------\n\n.@{fa-css-prefix}-border {\n padding: .2em .25em .15em;\n border: sol"
},
{
"path": "webpage/deepchat/static/vendor/font-awesome/less/core.less",
"chars": 452,
"preview": "// Base Class Definition\n// -------------------------\n\n.@{fa-css-prefix} {\n display: inline-block;\n font: normal norma"
},
{
"path": "webpage/deepchat/static/vendor/font-awesome/less/fixed-width.less",
"chars": 119,
"preview": "// Fixed Width Icons\n// -------------------------\n.@{fa-css-prefix}-fw {\n width: (18em / 14);\n text-align: center;\n}\n"
},
{
"path": "webpage/deepchat/static/vendor/font-awesome/less/font-awesome.less",
"chars": 495,
"preview": "/*!\n * Font Awesome 4.6.3 by @davegandy - http://fontawesome.io - @fontawesome\n * License - http://fontawesome.io/lice"
},
{
"path": "webpage/deepchat/static/vendor/font-awesome/less/icons.less",
"chars": 46249,
"preview": "/* Font Awesome uses the Unicode Private Use Area (PUA) to ensure screen\n readers do not read off random characters th"
},
{
"path": "webpage/deepchat/static/vendor/font-awesome/less/larger.less",
"chars": 370,
"preview": "// Icon Sizes\n// -------------------------\n\n/* makes the font 33% larger relative to the icon container */\n.@{fa-css-pre"
},
{
"path": "webpage/deepchat/static/vendor/font-awesome/less/list.less",
"chars": 377,
"preview": "// List Icons\n// -------------------------\n\n.@{fa-css-prefix}-ul {\n padding-left: 0;\n margin-left: @fa-li-width;\n lis"
},
{
"path": "webpage/deepchat/static/vendor/font-awesome/less/mixins.less",
"chars": 1603,
"preview": "// Mixins\n// --------------------------\n\n.fa-icon() {\n display: inline-block;\n font: normal normal normal @fa-font-siz"
},
{
"path": "webpage/deepchat/static/vendor/font-awesome/less/path.less",
"chars": 771,
"preview": "/* FONT PATH\n * -------------------------- */\n\n@font-face {\n font-family: 'FontAwesome';\n src: url('@{fa-font-path}/fo"
},
{
"path": "webpage/deepchat/static/vendor/font-awesome/less/rotated-flipped.less",
"chars": 622,
"preview": "// Rotated & Flipped Icons\n// -------------------------\n\n.@{fa-css-prefix}-rotate-90 { .fa-icon-rotate(90deg, 1); }\n.@"
},
{
"path": "webpage/deepchat/static/vendor/font-awesome/less/screen-reader.less",
"chars": 118,
"preview": "// Screen Readers\n// -------------------------\n\n.sr-only { .sr-only(); }\n.sr-only-focusable { .sr-only-focusable(); }\n"
},
{
"path": "webpage/deepchat/static/vendor/font-awesome/less/stacked.less",
"chars": 476,
"preview": "// Stacked Icons\n// -------------------------\n\n.@{fa-css-prefix}-stack {\n position: relative;\n display: inline-block;\n"
},
{
"path": "webpage/deepchat/static/vendor/font-awesome/less/variables.less",
"chars": 20890,
"preview": "// Variables\n// --------------------------\n\n@fa-font-path: \"../fonts\";\n@fa-font-size-base: 14px;\n@fa-line-heigh"
},
{
"path": "webpage/deepchat/static/vendor/font-awesome/scss/_animated.scss",
"chars": 715,
"preview": "// Spinning Icons\n// --------------------------\n\n.#{$fa-css-prefix}-spin {\n -webkit-animation: fa-spin 2s infinite line"
},
{
"path": "webpage/deepchat/static/vendor/font-awesome/scss/_bordered-pulled.scss",
"chars": 592,
"preview": "// Bordered & Pulled\n// -------------------------\n\n.#{$fa-css-prefix}-border {\n padding: .2em .25em .15em;\n border: so"
},
{
"path": "webpage/deepchat/static/vendor/font-awesome/scss/_core.scss",
"chars": 459,
"preview": "// Base Class Definition\n// -------------------------\n\n.#{$fa-css-prefix} {\n display: inline-block;\n font: normal norm"
},
{
"path": "webpage/deepchat/static/vendor/font-awesome/scss/_fixed-width.scss",
"chars": 120,
"preview": "// Fixed Width Icons\n// -------------------------\n.#{$fa-css-prefix}-fw {\n width: (18em / 14);\n text-align: center;\n}\n"
},
{
"path": "webpage/deepchat/static/vendor/font-awesome/scss/_icons.scss",
"chars": 46979,
"preview": "/* Font Awesome uses the Unicode Private Use Area (PUA) to ensure screen\n readers do not read off random characters th"
},
{
"path": "webpage/deepchat/static/vendor/font-awesome/scss/_larger.scss",
"chars": 375,
"preview": "// Icon Sizes\n// -------------------------\n\n/* makes the font 33% larger relative to the icon container */\n.#{$fa-css-pr"
},
{
"path": "webpage/deepchat/static/vendor/font-awesome/scss/_list.scss",
"chars": 378,
"preview": "// List Icons\n// -------------------------\n\n.#{$fa-css-prefix}-ul {\n padding-left: 0;\n margin-left: $fa-li-width;\n li"
},
{
"path": "webpage/deepchat/static/vendor/font-awesome/scss/_mixins.scss",
"chars": 1637,
"preview": "// Mixins\n// --------------------------\n\n@mixin fa-icon() {\n display: inline-block;\n font: normal normal normal #{$fa-"
},
{
"path": "webpage/deepchat/static/vendor/font-awesome/scss/_path.scss",
"chars": 783,
"preview": "/* FONT PATH\n * -------------------------- */\n\n@font-face {\n font-family: 'FontAwesome';\n src: url('#{$fa-font-path}/f"
},
{
"path": "webpage/deepchat/static/vendor/font-awesome/scss/_rotated-flipped.scss",
"chars": 672,
"preview": "// Rotated & Flipped Icons\n// -------------------------\n\n.#{$fa-css-prefix}-rotate-90 { @include fa-icon-rotate(90deg, "
},
{
"path": "webpage/deepchat/static/vendor/font-awesome/scss/_screen-reader.scss",
"chars": 134,
"preview": "// Screen Readers\n// -------------------------\n\n.sr-only { @include sr-only(); }\n.sr-only-focusable { @include sr-only-f"
},
{
"path": "webpage/deepchat/static/vendor/font-awesome/scss/_stacked.scss",
"chars": 482,
"preview": "// Stacked Icons\n// -------------------------\n\n.#{$fa-css-prefix}-stack {\n position: relative;\n display: inline-block;"
},
{
"path": "webpage/deepchat/static/vendor/font-awesome/scss/_variables.scss",
"chars": 20971,
"preview": "// Variables\n// --------------------------\n\n$fa-font-path: \"../fonts\" !default;\n$fa-font-size-base: 14px !defau"
},
{
"path": "webpage/deepchat/static/vendor/font-awesome/scss/font-awesome.scss",
"chars": 430,
"preview": "/*!\n * Font Awesome 4.6.3 by @davegandy - http://fontawesome.io - @fontawesome\n * License - http://fontawesome.io/lice"
},
{
"path": "webpage/deepchat/static/vendor/jquery/jquery.easing.1.3.js",
"chars": 8095,
"preview": "/*\n * jQuery Easing v1.3 - http://gsgd.co.uk/sandbox/jquery/easing/\n *\n * Uses the built in easing capabilities added In"
},
{
"path": "webpage/deepchat/templates/404.html",
"chars": 519,
"preview": "{% extends \"base.html\" %}\n\n{% block title %}U Lost Yo{% endblock title %}\n\n{% block page_content %}\n\n <div class=\"jumbo"
},
{
"path": "webpage/deepchat/templates/about.html",
"chars": 1416,
"preview": "{% extends \"base.html\" %}\n{% set active_page = \"about\" %}\n\n{% block nav_session_user %}\n<ul class=\"nav navbar-nav navb"
},
{
"path": "webpage/deepchat/templates/admin/index.html",
"chars": 310,
"preview": "<!-- Customizing admin page. Note that we need to copy\nstuff from abstract_base.html since Jinja doesn't support\nmultipl"
},
{
"path": "webpage/deepchat/templates/base.html",
"chars": 5350,
"preview": "<!DOCTYPE html>\n<html lang=\"en\">\n<head>\n <!-- block head:\n - defines the head element for this html file and can be "
},
{
"path": "webpage/deepchat/templates/index.html",
"chars": 5597,
"preview": "{% extends \"base.html\" %}\n{% set active_page = \"index\" %}\n{% from \"macros/forms.html\" import render_chat_form, render_"
},
{
"path": "webpage/deepchat/templates/macros/forms.html",
"chars": 1899,
"preview": "{% macro with_errors(field) %}\n <!-- Wraps a form field with error handling. -->\n {% if field and field.errors %}\n "
},
{
"path": "webpage/deepchat/templates/plots.html",
"chars": 8048,
"preview": "\n{% extends \"base.html\" %}\n{% set active_page = \"plots\" %}\n\n{% block nav_session_user %}\n <ul class=\"nav navbar-nav n"
},
{
"path": "webpage/deepchat/web_bot.py",
"chars": 7079,
"preview": "\"\"\"Minimal subset of functions/methods from repo needed to run bot on Heroku.\nSee the main repository for better docs (a"
},
{
"path": "webpage/manage.py",
"chars": 1853,
"preview": "#!/usr/bin/env python3\n\n\"\"\"manage.py: Start up the web server and the application.\"\"\"\n\nimport os\nfrom deepchat import cr"
},
{
"path": "webpage/migrations/README",
"chars": 38,
"preview": "Generic single-database configuration."
},
{
"path": "webpage/migrations/alembic.ini",
"chars": 770,
"preview": "# A generic, single database configuration.\n\n[alembic]\n# template used to generate migration files\n# file_template = %%("
},
{
"path": "webpage/migrations/env.py",
"chars": 2883,
"preview": "from __future__ import with_statement\nfrom alembic import context\nfrom sqlalchemy import engine_from_config, pool\nfrom l"
},
{
"path": "webpage/migrations/script.py.mako",
"chars": 494,
"preview": "\"\"\"${message}\n\nRevision ID: ${up_revision}\nRevises: ${down_revision | comma,n}\nCreate Date: ${create_date}\n\n\"\"\"\nfrom ale"
},
{
"path": "webpage/migrations/versions/236b966ecd2f_.py",
"chars": 2667,
"preview": "\"\"\"empty message\n\nRevision ID: 236b966ecd2f\nRevises: \nCreate Date: 2017-05-03 14:27:37.853971\n\n\"\"\"\nfrom alembic import o"
},
{
"path": "webpage/requirements.txt",
"chars": 436,
"preview": "gunicorn==19.6.0\ntensorflow>=1.1.0\nalembic==0.9.1\nFlask==0.12.1\nFlask_Admin==1.5.0\nFlask_BasicAuth==0.2.0\nFlask_Cors==3."
},
{
"path": "webpage/runtime.txt",
"chars": 13,
"preview": "python-3.5.2\n"
},
{
"path": "webpage/tests/__init__.py",
"chars": 0,
"preview": ""
},
{
"path": "webpage/tests/test_database.py",
"chars": 741,
"preview": "\"\"\"Unit tests for the application.\"\"\"\n\nfrom flask import current_app\nfrom flask import request\nfrom deepchat import crea"
},
{
"path": "webpage/tests/test_simple.py",
"chars": 831,
"preview": "\"\"\"Unit tests for the application.\"\"\"\n\nimport unittest\nfrom flask import current_app\nfrom deepchat import create_app, db"
}
]
// ... and 5 more files (download for full content)
About this extraction
This page contains the full source code of the mckinziebrandon/DeepChatModels GitHub repository, extracted and formatted as plain text for AI agents and large language models (LLMs). The extraction includes 144 files (1.7 MB), approximately 825.2k tokens, and a symbol index with 335 extracted functions, classes, methods, constants, and types. Use this with OpenClaw, Claude, ChatGPT, Cursor, Windsurf, or any other AI tool that accepts text input. You can copy the full output to your clipboard or download it as a .txt file.
Extracted by GitExtract — a free GitHub repo-to-text converter for AI. Built by Nikandr Surkov.