Repository: matroid/dlwithtf Branch: master Commit: 2eb79b9bb845 Files: 22 Total size: 112.4 KB Directory structure: gitextract_33683wml/ ├── README.md ├── ch3/ │ ├── linear_regression_tf.py │ ├── linear_regression_tf_simple.py │ └── logistic_regression_tf.py ├── ch4/ │ ├── fcnet_classification_tf.py │ ├── fcnet_regression_tf.py │ ├── tox21_fcnet.py │ └── tox21_fcnet_dropout.py ├── ch5/ │ ├── fcnet_func.py │ ├── hidden_grid_search.py │ ├── simple_grid_search.py │ └── tox21_rf.py ├── ch6/ │ └── convolutional.py ├── ch7/ │ ├── ptb_word_lm.py │ ├── reader.py │ └── setup.sh ├── ch8/ │ ├── a3c.py │ ├── environment.py │ ├── tensorgraph.py │ └── tictactoe.py └── ch9/ ├── cifar10.py └── cifar10_multi_gpu_train.py ================================================ FILE CONTENTS ================================================ ================================================ FILE: README.md ================================================ # TensorFlow for Deep Learning Companion Code Referenced throughout the book.
![Book Cover](https://pbs.twimg.com/media/DXQHXOtVoAEO4T_.jpg:large)
## TensorFlow Versions The TensorFlow library has been evolving rapidly in the last couple years, and some of the code in this repo and the associated book no longer work with the latest versions of TensorFlow. We recommend using TensorFlow 1.6 for working through all exercises in this book. We are looking into creating a full `requirements.txt` file for all needed dependencies and hope to have that available for you soon. We also welcome any PRs that modify code to work with more recent TensorFlow versions. We are looking into these upgrades on our end as well. ================================================ FILE: ch3/linear_regression_tf.py ================================================ import numpy as np np.random.seed(456) import tensorflow as tf tf.set_random_seed(456) from matplotlib import rc rc('text', usetex=True) import matplotlib.pyplot as plt from scipy.stats import pearsonr from sklearn.metrics import mean_squared_error def pearson_r2_score(y, y_pred): """Computes Pearson R^2 (square of Pearson correlation).""" return pearsonr(y, y_pred)[0]**2 def rms_score(y_true, y_pred): """Computes RMS error.""" return np.sqrt(mean_squared_error(y_true, y_pred)) # Generate synthetic data N = 100 w_true = 5 b_true = 2 noise_scale = .1 x_np = np.random.rand(N, 1) noise = np.random.normal(scale=noise_scale, size=(N, 1)) # Convert shape of y_np to (N,) y_np = np.reshape(w_true * x_np + b_true + noise, (-1)) # Save image of the data distribution plt.scatter(x_np, y_np) plt.xlabel("x") plt.ylabel("y") plt.xlim(0, 1) plt.title("Toy Linear Regression Data, " r"$y = 5x + 2 + N(0, 1)$") plt.savefig("lr_data.png") # Generate tensorflow graph with tf.name_scope("placeholders"): x = tf.placeholder(tf.float32, (N, 1)) y = tf.placeholder(tf.float32, (N,)) with tf.name_scope("weights"): W = tf.Variable(tf.random_normal((1, 1))) b = tf.Variable(tf.random_normal((1,))) with tf.name_scope("prediction"): y_pred = tf.matmul(x, W) + b with tf.name_scope("loss"): l = tf.reduce_sum((y 
- tf.squeeze(y_pred))**2) with tf.name_scope("optim"): train_op = tf.train.AdamOptimizer(.001).minimize(l) with tf.name_scope("summaries"): tf.summary.scalar("loss", l) merged = tf.summary.merge_all() train_writer = tf.summary.FileWriter('/tmp/lr-train', tf.get_default_graph()) n_steps = 8000 with tf.Session() as sess: sess.run(tf.global_variables_initializer()) # Train model for i in range(n_steps): feed_dict = {x: x_np, y: y_np} _, summary, loss = sess.run([train_op, merged, l], feed_dict=feed_dict) print("step %d, loss: %f" % (i, loss)) train_writer.add_summary(summary, i) # Get weights w_final, b_final = sess.run([W, b]) # Make Predictions y_pred_np = sess.run(y_pred, feed_dict={x: x_np}) y_pred_np = np.reshape(y_pred_np, -1) r2 = pearson_r2_score(y_np, y_pred_np) print("Pearson R^2: %f" % r2) rms = rms_score(y_np, y_pred_np) print("RMS: %f" % rms) # Clear figure plt.clf() plt.xlabel("Y-true") plt.ylabel("Y-pred") plt.title("Predicted versus True values " r"(Pearson $R^2$: $0.994$)") plt.scatter(y_np, y_pred_np) plt.savefig("lr_pred.png") # Now draw with learned regression line plt.clf() plt.xlabel("x") plt.ylabel("y") plt.title("True Model versus Learned Model " r"(RMS: $1.027620$)") plt.xlim(0, 1) plt.scatter(x_np, y_np) x_left = 0 y_left = w_final[0]*x_left + b_final x_right = 1 y_right = w_final[0]*x_right + b_final plt.plot([x_left, x_right], [y_left, y_right], color='k') plt.savefig("lr_learned.png") ================================================ FILE: ch3/linear_regression_tf_simple.py ================================================ import tensorflow as tf d = 10 N = 100 x = tf.placeholder(tf.float32, (N, d)) y = tf.placeholder(tf.float32, (N,)) W = tf.Variable(tf.random_normal((d, 1))) b = tf.Variable(tf.random_normal((1,))) l = tf.reduce_sum((y - (tf.matmul(x, W) + b))**2) with tf.Session() as sess: tf.global_variables_initializer().run(session=sess) ================================================ FILE: ch3/logistic_regression_tf.py 
================================================ import numpy as np np.random.seed(456) import tensorflow as tf tf.set_random_seed(456) import matplotlib.pyplot as plt from sklearn.metrics import accuracy_score from scipy.special import logit # Generate synthetic data N = 100 # Zeros form a Gaussian centered at (-1, -1) x_zeros = np.random.multivariate_normal( mean=np.array((-1, -1)), cov=.1*np.eye(2), size=(N//2,)) y_zeros = np.zeros((N//2,)) # Ones form a Gaussian centered at (1, 1) x_ones = np.random.multivariate_normal( mean=np.array((1, 1)), cov=.1*np.eye(2), size=(N//2,)) y_ones = np.ones((N//2,)) x_np = np.vstack([x_zeros, x_ones]) y_np = np.concatenate([y_zeros, y_ones]) # Save image of the data distribution plt.xlabel(r"$x_1$") plt.ylabel(r"$x_2$") plt.title("Toy Logistic Regression Data") # Plot Zeros plt.scatter(x_zeros[:, 0], x_zeros[:, 1], color="blue") plt.scatter(x_ones[:, 0], x_ones[:, 1], color="red") plt.savefig("logistic_data.png") # Generate tensorflow graph with tf.name_scope("placeholders"): x = tf.placeholder(tf.float32, (N, 2)) y = tf.placeholder(tf.float32, (N,)) with tf.name_scope("weights"): W = tf.Variable(tf.random_normal((2, 1))) b = tf.Variable(tf.random_normal((1,))) with tf.name_scope("prediction"): y_logit = tf.squeeze(tf.matmul(x, W) + b) # the sigmoid gives the class probability of 1 y_one_prob = tf.sigmoid(y_logit) # Rounding P(y=1) will give the correct prediction. 
y_pred = tf.round(y_one_prob) with tf.name_scope("loss"): # Compute the cross-entropy term for each datapoint entropy = tf.nn.sigmoid_cross_entropy_with_logits(logits=y_logit, labels=y) # Sum all contributions l = tf.reduce_sum(entropy) with tf.name_scope("optim"): train_op = tf.train.AdamOptimizer(.01).minimize(l) with tf.name_scope("summaries"): tf.summary.scalar("loss", l) merged = tf.summary.merge_all() train_writer = tf.summary.FileWriter('/tmp/logistic-train', tf.get_default_graph()) n_steps = 1000 with tf.Session() as sess: sess.run(tf.global_variables_initializer()) # Train model for i in range(n_steps): feed_dict = {x: x_np, y: y_np} _, summary, loss = sess.run([train_op, merged, l], feed_dict=feed_dict) print("loss: %f" % loss) train_writer.add_summary(summary, i) # Get weights w_final, b_final = sess.run([W, b]) # Make Predictions y_pred_np = sess.run(y_pred, feed_dict={x: x_np}) score = accuracy_score(y_np, y_pred_np) print("Classification Accuracy: %f" % score) plt.clf() # Save image of the data distribution plt.xlabel(r"$x_1$") plt.ylabel(r"$x_2$") plt.title("Learned Model (Classification Accuracy: 1.00)") plt.xlim(-2, 2) plt.ylim(-2, 2) # Plot Zeros plt.scatter(x_zeros[:, 0], x_zeros[:, 1], color="blue") plt.scatter(x_ones[:, 0], x_ones[:, 1], color="red") x_left = -2 y_left = (1./w_final[1]) * (-b_final + logit(.5) - w_final[0]*x_left) x_right = 2 y_right = (1./w_final[1]) * (-b_final + logit(.5) - w_final[0]*x_right) plt.plot([x_left, x_right], [y_left, y_right], color='k') plt.savefig("logistic_pred.png") ================================================ FILE: ch4/fcnet_classification_tf.py ================================================ import numpy as np np.random.seed(456) import tensorflow as tf tf.set_random_seed(456) import matplotlib.pyplot as plt from sklearn.metrics import accuracy_score # Generate synthetic data N = 100 w_true = 5 b_true = 2 noise_scale = .1 # Zeros form a Gaussian centered at (-1, -1) x_zeros = 
np.random.multivariate_normal(
    # BUG FIX: Python 3 `/` is float division; np.zeros/np.ones and the
    # `size=` argument require integer shapes and raise TypeError on N/2.
    # Use floor division N//2 (matches ch3/logistic_regression_tf.py).
    mean=np.array((-1, -1)), cov=.1*np.eye(2), size=(N//2,))
y_zeros = np.zeros((N//2,))
# Ones form a Gaussian centered at (1, 1)
x_ones = np.random.multivariate_normal(
    mean=np.array((1, 1)), cov=.1*np.eye(2), size=(N//2,))
y_ones = np.ones((N//2,))

x_np = np.vstack([x_zeros, x_ones])
y_np = np.concatenate([y_zeros, y_ones])

# Save image of the data distribution
plt.xlabel("Dimension 1")
plt.ylabel("Dimension 2")
plt.title("FCNet Classification Data")
# Plot Zeros
plt.scatter(x_zeros[:, 0], x_zeros[:, 1], color="blue")
plt.scatter(x_ones[:, 0], x_ones[:, 1], color="red")
plt.savefig("fcnet_classification_data.png")

# Generate tensorflow graph
d = 2
n_hidden = 15
with tf.name_scope("placeholders"):
  x = tf.placeholder(tf.float32, (N, d))
  y = tf.placeholder(tf.float32, (N,))
with tf.name_scope("layer-1"):
  W = tf.Variable(tf.random_normal((d, n_hidden)))
  b = tf.Variable(tf.random_normal((n_hidden,)))
  x_1 = tf.nn.relu(tf.matmul(x, W) + b)
with tf.name_scope("output"):
  W = tf.Variable(tf.random_normal((n_hidden, 1)))
  b = tf.Variable(tf.random_normal((1,)))
  y_logit = tf.squeeze(tf.matmul(x_1, W) + b)
  # the sigmoid gives the class probability of 1
  y_one_prob = tf.sigmoid(y_logit)
  # Rounding P(y=1) will give the correct prediction.
y_pred = tf.round(y_one_prob) with tf.name_scope("loss"): # Compute the cross-entropy term for each datapoint entropy = tf.nn.sigmoid_cross_entropy_with_logits(logits=y_logit, labels=y) # Sum all contributions l = tf.reduce_sum(entropy) with tf.name_scope("optim"): train_op = tf.train.AdamOptimizer(.001).minimize(l) with tf.name_scope("summaries"): tf.summary.scalar("loss", l) merged = tf.summary.merge_all() train_writer = tf.summary.FileWriter('/tmp/fcnet-classification-train', tf.get_default_graph()) n_steps = 200 with tf.Session() as sess: sess.run(tf.global_variables_initializer()) # Train model for i in range(n_steps): feed_dict = {x: x_np, y: y_np} _, summary, loss = sess.run([train_op, merged, l], feed_dict=feed_dict) print("step %d, loss: %f" % (i, loss)) train_writer.add_summary(summary, i) # Make Predictions y_pred_np = sess.run(y_pred, feed_dict={x: x_np}) score = accuracy_score(y_np, y_pred_np) print("Classification Accuracy: %f" % score) ================================================ FILE: ch4/fcnet_regression_tf.py ================================================ import numpy as np np.random.seed(456) import tensorflow as tf tf.set_random_seed(456) import matplotlib.pyplot as plt from scipy.stats import pearsonr def pearson_r2_score(y, y_pred): """Computes Pearson R^2 (square of Pearson correlation).""" return pearsonr(y, y_pred)[0]**2 # Generate synthetic data d = 1 N = 50 w_true = 5 b_true = 2 noise_scale = .1 x_np = np.random.rand(N, d) noise = np.random.normal(scale=noise_scale, size=(N, d)) y_np = np.reshape(w_true * x_np + b_true + noise, (-1)) # Save image of the data distribution plt.scatter(x_np, y_np) plt.xlabel("X") plt.ylabel("y") plt.title("Raw Linear Regression Data") plt.savefig("fcnet_regression_data.png") # Generate tensorflow graph n_hidden = 15 with tf.name_scope("placeholders"): x = tf.placeholder(tf.float32, (N, d)) y = tf.placeholder(tf.float32, (N,)) with tf.name_scope("layer-1"): W = tf.Variable(tf.random_normal((d, 
n_hidden))) b = tf.Variable(tf.random_normal((n_hidden,))) x_1 = tf.nn.relu(tf.matmul(x, W) + b) with tf.name_scope("output"): W = tf.Variable(tf.random_normal((n_hidden, 1))) b = tf.Variable(tf.random_normal((1,))) y_pred = tf.transpose(tf.matmul(x_1, W) + b) with tf.name_scope("loss"): lvec = (y - y_pred)**2 l = tf.reduce_sum(lvec) with tf.name_scope("optim"): train_op = tf.train.AdamOptimizer(.001).minimize(l) with tf.name_scope("summaries"): tf.summary.scalar("loss", l) merged = tf.summary.merge_all() train_writer = tf.summary.FileWriter('/tmp/fcnet-regression-train', tf.get_default_graph()) n_steps = 1000 with tf.Session() as sess: sess.run(tf.global_variables_initializer()) # Train model for i in range(n_steps): feed_dict = {x: x_np, y: y_np} _, summary, loss, lossvec = sess.run([train_op, merged, l, lvec], feed_dict=feed_dict) print("step %d, loss: %f, loss-vec-size: %s" % (i, loss, lossvec.shape)) train_writer.add_summary(summary, i) # Make Predictions y_pred_np = sess.run(y_pred, feed_dict={x: x_np}) y_pred_np = np.reshape(y_pred_np, -1) r2 = pearson_r2_score(y_np, y_pred_np) print("Pearson R^2: %f" % r2) # Clear figure plt.clf() plt.xlabel("Y-true") plt.ylabel("Y-pred") plt.title("Predicted versus true values") plt.scatter(y_np, y_pred_np) plt.savefig("fcnet_regression_pred.png") # Now draw with learned regression line plt.clf() plt.xlabel("X") plt.ylabel("Y") plt.title("Predicted versus true values") plt.xlim(0, 1) plt.scatter(x_np, y_np) plt.scatter(x_np, y_pred_np) plt.savefig("fcnet_regression_learned.png") ================================================ FILE: ch4/tox21_fcnet.py ================================================ import numpy as np np.random.seed(456) import tensorflow as tf tf.set_random_seed(456) import matplotlib.pyplot as plt import deepchem as dc from sklearn.metrics import accuracy_score _, (train, valid, test), _ = dc.molnet.load_tox21() train_X, train_y, train_w = train.X, train.y, train.w valid_X, valid_y, valid_w = valid.X, 
valid.y, valid.w test_X, test_y, test_w = test.X, test.y, test.w # Remove extra tasks train_y = train_y[:, 0] valid_y = valid_y[:, 0] test_y = test_y[:, 0] train_w = train_w[:, 0] valid_w = valid_w[:, 0] test_w = test_w[:, 0] # Generate tensorflow graph d = 1024 n_hidden = 50 learning_rate = .001 n_epochs = 10 batch_size = 100 with tf.name_scope("placeholders"): x = tf.placeholder(tf.float32, (None, d)) y = tf.placeholder(tf.float32, (None,)) with tf.name_scope("hidden-layer"): W = tf.Variable(tf.random_normal((d, n_hidden))) b = tf.Variable(tf.random_normal((n_hidden,))) x_hidden = tf.nn.relu(tf.matmul(x, W) + b) with tf.name_scope("output"): W = tf.Variable(tf.random_normal((n_hidden, 1))) b = tf.Variable(tf.random_normal((1,))) y_logit = tf.matmul(x_hidden, W) + b # the sigmoid gives the class probability of 1 y_one_prob = tf.sigmoid(y_logit) # Rounding P(y=1) will give the correct prediction. y_pred = tf.round(y_one_prob) with tf.name_scope("loss"): # Compute the cross-entropy term for each datapoint y_expand = tf.expand_dims(y, 1) entropy = tf.nn.sigmoid_cross_entropy_with_logits(logits=y_logit, labels=y_expand) # Sum all contributions l = tf.reduce_sum(entropy) with tf.name_scope("optim"): train_op = tf.train.AdamOptimizer(learning_rate).minimize(l) with tf.name_scope("summaries"): tf.summary.scalar("loss", l) merged = tf.summary.merge_all() train_writer = tf.summary.FileWriter('/tmp/fcnet-tox21', tf.get_default_graph()) N = train_X.shape[0] with tf.Session() as sess: sess.run(tf.global_variables_initializer()) step = 0 for epoch in range(n_epochs): pos = 0 while pos < N: batch_X = train_X[pos:pos+batch_size] batch_y = train_y[pos:pos+batch_size] feed_dict = {x: batch_X, y: batch_y} _, summary, loss = sess.run([train_op, merged, l], feed_dict=feed_dict) print("epoch %d, step %d, loss: %f" % (epoch, step, loss)) train_writer.add_summary(summary, step) step += 1 pos += batch_size # Make Predictions valid_y_pred = sess.run(y_pred, feed_dict={x: valid_X}) score = 
accuracy_score(valid_y, valid_y_pred) print("Unweighted Classification Accuracy: %f" % score) weighted_score = accuracy_score(valid_y, valid_y_pred, sample_weight=valid_w) print("Weighted Classification Accuracy: %f" % weighted_score) ================================================ FILE: ch4/tox21_fcnet_dropout.py ================================================ import numpy as np np.random.seed(456) import tensorflow as tf tf.set_random_seed(456) import matplotlib.pyplot as plt import deepchem as dc from sklearn.metrics import accuracy_score _, (train, valid, test), _ = dc.molnet.load_tox21() train_X, train_y, train_w = train.X, train.y, train.w valid_X, valid_y, valid_w = valid.X, valid.y, valid.w test_X, test_y, test_w = test.X, test.y, test.w # Remove extra tasks train_y = train_y[:, 0] valid_y = valid_y[:, 0] test_y = test_y[:, 0] train_w = train_w[:, 0] valid_w = valid_w[:, 0] test_w = test_w[:, 0] # Generate tensorflow graph d = 1024 n_hidden = 50 learning_rate = .001 n_epochs = 10 batch_size = 100 dropout_prob = 1.0 with tf.name_scope("placeholders"): x = tf.placeholder(tf.float32, (None, d)) y = tf.placeholder(tf.float32, (None,)) keep_prob = tf.placeholder(tf.float32) with tf.name_scope("hidden-layer"): W = tf.Variable(tf.random_normal((d, n_hidden))) b = tf.Variable(tf.random_normal((n_hidden,))) x_hidden = tf.nn.relu(tf.matmul(x, W) + b) # Apply dropout x_hidden = tf.nn.dropout(x_hidden, keep_prob) with tf.name_scope("output"): W = tf.Variable(tf.random_normal((n_hidden, 1))) b = tf.Variable(tf.random_normal((1,))) y_logit = tf.matmul(x_hidden, W) + b # the sigmoid gives the class probability of 1 y_one_prob = tf.sigmoid(y_logit) # Rounding P(y=1) will give the correct prediction. 
y_pred = tf.round(y_one_prob) with tf.name_scope("loss"): # Compute the cross-entropy term for each datapoint y_expand = tf.expand_dims(y, 1) entropy = tf.nn.sigmoid_cross_entropy_with_logits(logits=y_logit, labels=y_expand) # Sum all contributions l = tf.reduce_sum(entropy) with tf.name_scope("optim"): train_op = tf.train.AdamOptimizer(learning_rate).minimize(l) with tf.name_scope("summaries"): tf.summary.scalar("loss", l) merged = tf.summary.merge_all() train_writer = tf.summary.FileWriter('/tmp/fcnet-tox21-dropout', tf.get_default_graph()) N = train_X.shape[0] with tf.Session() as sess: sess.run(tf.global_variables_initializer()) step = 0 for epoch in range(n_epochs): pos = 0 while pos < N: batch_X = train_X[pos:pos+batch_size] batch_y = train_y[pos:pos+batch_size] feed_dict = {x: batch_X, y: batch_y, keep_prob: dropout_prob} _, summary, loss = sess.run([train_op, merged, l], feed_dict=feed_dict) print("epoch %d, step %d, loss: %f" % (epoch, step, loss)) train_writer.add_summary(summary, step) step += 1 pos += batch_size # Make Predictions (set keep_prob to 1.0 for predictions) train_y_pred = sess.run(y_pred, feed_dict={x: train_X, keep_prob: 1.0}) valid_y_pred = sess.run(y_pred, feed_dict={x: valid_X, keep_prob: 1.0}) test_y_pred = sess.run(y_pred, feed_dict={x: test_X, keep_prob: 1.0}) train_weighted_score = accuracy_score(train_y, train_y_pred, sample_weight=train_w) print("Train Weighted Classification Accuracy: %f" % train_weighted_score) valid_weighted_score = accuracy_score(valid_y, valid_y_pred, sample_weight=valid_w) print("Valid Weighted Classification Accuracy: %f" % valid_weighted_score) test_weighted_score = accuracy_score(test_y, test_y_pred, sample_weight=test_w) print("Test Weighted Classification Accuracy: %f" % test_weighted_score) ================================================ FILE: ch5/fcnet_func.py ================================================ import numpy as np np.random.seed(456) import tensorflow as tf tf.set_random_seed(456) import 
matplotlib.pyplot as plt import deepchem as dc from sklearn.metrics import accuracy_score def eval_tox21_hyperparams(n_hidden=50, n_layers=1, learning_rate=.001, dropout_prob=0.5, n_epochs=45, batch_size=100, weight_positives=True): print("---------------------------------------------") print("Model hyperparameters") print("n_hidden = %d" % n_hidden) print("n_layers = %d" % n_layers) print("learning_rate = %f" % learning_rate) print("n_epochs = %d" % n_epochs) print("batch_size = %d" % batch_size) print("weight_positives = %s" % str(weight_positives)) print("dropout_prob = %f" % dropout_prob) print("---------------------------------------------") d = 1024 graph = tf.Graph() with graph.as_default(): _, (train, valid, test), _ = dc.molnet.load_tox21() train_X, train_y, train_w = train.X, train.y, train.w valid_X, valid_y, valid_w = valid.X, valid.y, valid.w test_X, test_y, test_w = test.X, test.y, test.w # Remove extra tasks train_y = train_y[:, 0] valid_y = valid_y[:, 0] test_y = test_y[:, 0] train_w = train_w[:, 0] valid_w = valid_w[:, 0] test_w = test_w[:, 0] # Generate tensorflow graph with tf.name_scope("placeholders"): x = tf.placeholder(tf.float32, (None, d)) y = tf.placeholder(tf.float32, (None,)) w = tf.placeholder(tf.float32, (None,)) keep_prob = tf.placeholder(tf.float32) for layer in range(n_layers): with tf.name_scope("layer-%d" % layer): W = tf.Variable(tf.random_normal((d, n_hidden))) b = tf.Variable(tf.random_normal((n_hidden,))) x_hidden = tf.nn.relu(tf.matmul(x, W) + b) # Apply dropout x_hidden = tf.nn.dropout(x_hidden, keep_prob) with tf.name_scope("output"): W = tf.Variable(tf.random_normal((n_hidden, 1))) b = tf.Variable(tf.random_normal((1,))) y_logit = tf.matmul(x_hidden, W) + b # the sigmoid gives the class probability of 1 y_one_prob = tf.sigmoid(y_logit) # Rounding P(y=1) will give the correct prediction. 
y_pred = tf.round(y_one_prob) with tf.name_scope("loss"): # Compute the cross-entropy term for each datapoint y_expand = tf.expand_dims(y, 1) entropy = tf.nn.sigmoid_cross_entropy_with_logits(logits=y_logit, labels=y_expand) # Multiply by weights if weight_positives: w_expand = tf.expand_dims(w, 1) entropy = w_expand * entropy # Sum all contributions l = tf.reduce_sum(entropy) with tf.name_scope("optim"): train_op = tf.train.AdamOptimizer(learning_rate).minimize(l) with tf.name_scope("summaries"): tf.summary.scalar("loss", l) merged = tf.summary.merge_all() hyperparam_str = "d-%d-hidden-%d-lr-%f-n_epochs-%d-batch_size-%d-weight_pos-%s" % ( d, n_hidden, learning_rate, n_epochs, batch_size, str(weight_positives)) train_writer = tf.summary.FileWriter('/tmp/fcnet-func-' + hyperparam_str, tf.get_default_graph()) N = train_X.shape[0] with tf.Session() as sess: sess.run(tf.global_variables_initializer()) step = 0 for epoch in range(n_epochs): pos = 0 while pos < N: batch_X = train_X[pos:pos+batch_size] batch_y = train_y[pos:pos+batch_size] batch_w = train_w[pos:pos+batch_size] feed_dict = {x: batch_X, y: batch_y, w: batch_w, keep_prob: dropout_prob} _, summary, loss = sess.run([train_op, merged, l], feed_dict=feed_dict) print("epoch %d, step %d, loss: %f" % (epoch, step, loss)) train_writer.add_summary(summary, step) step += 1 pos += batch_size # Make Predictions (set keep_prob to 1.0 for predictions) valid_y_pred = sess.run(y_pred, feed_dict={x: valid_X, keep_prob: 1.0}) weighted_score = accuracy_score(valid_y, valid_y_pred, sample_weight=valid_w) print("Valid Weighted Classification Accuracy: %f" % weighted_score) return weighted_score if __name__ == "__main__": score = eval_tox21_hyperparams() ================================================ FILE: ch5/hidden_grid_search.py ================================================ import numpy as np from fcnet_func import eval_tox21_hyperparams scores = {} n_reps = 3 hidden_sizes = [30, 60] epochs = [15, 30, 45] dropouts = [.5] 
num_layers = [1, 2] for rep in range(n_reps): for n_epochs in epochs: for hidden_size in hidden_sizes: for dropout in dropouts: for n_layers in num_layers: score = eval_tox21_hyperparams(n_hidden=hidden_size, n_epochs=n_epochs, dropout_prob=dropout, n_layers=n_layers) if (hidden_size, n_epochs, dropout, n_layers) not in scores: scores[(hidden_size, n_epochs, dropout, n_layers)] = [] scores[(hidden_size, n_epochs, dropout, n_layers)].append(score) print("All Scores") print(scores) avg_scores = {} for params, param_scores in scores.iteritems(): avg_scores[params] = np.mean(np.array(param_scores)) print("Scores Averaged over %d repetitions" % n_reps) print(avg_scores) ================================================ FILE: ch5/simple_grid_search.py ================================================ import numpy as np from fcnet_func import eval_tox21_hyperparams scores = {} n_reps = 3 hidden_sizes = [50] epochs = [10] dropouts = [.5, 1.0] num_layers = [1, 2] for rep in range(n_reps): for n_epochs in epochs: for hidden_size in hidden_sizes: for dropout in dropouts: for n_layers in num_layers: score = eval_tox21_hyperparams(n_hidden=hidden_size, n_epochs=n_epochs, dropout_prob=dropout, n_layers=n_layers) if (hidden_size, n_epochs, dropout, n_layers) not in scores: scores[(hidden_size, n_epochs, dropout, n_layers)] = [] scores[(hidden_size, n_epochs, dropout, n_layers)].append(score) print("All Scores") print(scores) avg_scores = {} for params, param_scores in scores.iteritems(): avg_scores[params] = np.mean(np.array(param_scores)) print("Scores Averaged over %d repetitions" % n_reps) print(avg_scores) ================================================ FILE: ch5/tox21_rf.py ================================================ import numpy as np np.random.seed(456) import matplotlib.pyplot as plt import deepchem as dc from sklearn.metrics import accuracy_score from sklearn.ensemble import RandomForestClassifier _, (train, valid, test), _ = dc.molnet.load_tox21() train_X, train_y, 
train_w = train.X, train.y, train.w valid_X, valid_y, valid_w = valid.X, valid.y, valid.w test_X, test_y, test_w = test.X, test.y, test.w # Remove extra tasks train_y = train_y[:, 0] valid_y = valid_y[:, 0] test_y = test_y[:, 0] train_w = train_w[:, 0] valid_w = valid_w[:, 0] test_w = test_w[:, 0] # Generate tensorflow graph sklearn_model = RandomForestClassifier( class_weight="balanced", n_estimators=50) print("About to fit model on train set.") sklearn_model.fit(train_X, train_y) train_y_pred = sklearn_model.predict(train_X) valid_y_pred = sklearn_model.predict(valid_X) test_y_pred = sklearn_model.predict(test_X) weighted_score = accuracy_score(train_y, train_y_pred, sample_weight=train_w) print("Weighted train Classification Accuracy: %f" % weighted_score) weighted_score = accuracy_score(valid_y, valid_y_pred, sample_weight=valid_w) print("Weighted valid Classification Accuracy: %f" % weighted_score) weighted_score = accuracy_score(test_y, test_y_pred, sample_weight=test_w) print("Weighted test Classification Accuracy: %f" % weighted_score) ================================================ FILE: ch6/convolutional.py ================================================ # Copyright 2015 The TensorFlow Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. # ============================================================================== """Simple, end-to-end, LeNet-5-like convolutional MNIST model example. 
""" from __future__ import absolute_import from __future__ import division from __future__ import print_function import argparse import gzip import os import sys import time import numpy from six.moves import urllib from six.moves import xrange # pylint: disable=redefined-builtin import tensorflow as tf SOURCE_URL = 'http://yann.lecun.com/exdb/mnist/' WORK_DIRECTORY = 'data' IMAGE_SIZE = 28 NUM_CHANNELS = 1 PIXEL_DEPTH = 255 NUM_LABELS = 10 VALIDATION_SIZE = 5000 # Size of the validation set. SEED = 66478 # Set to None for random seed. BATCH_SIZE = 64 NUM_EPOCHS = 10 EVAL_BATCH_SIZE = 64 EVAL_FREQUENCY = 100 # Number of steps between evaluations. def download(filename): """Download the data from Yann's website, unless it's already here.""" if not os.path.exists(WORK_DIRECTORY): os.makedirs(WORK_DIRECTORY) filepath = os.path.join(WORK_DIRECTORY, filename) if not os.path.exists(filepath): filepath, _ = urllib.request.urlretrieve(SOURCE_URL + filename, filepath) size = os.stat(filepath).st_size print('Successfully downloaded', filename, size, 'bytes.') return filepath def extract_data(filename, num_images): """Extract the images into a 4D tensor [image index, y, x, channels]. Values are rescaled from [0, 255] down to [-0.5, 0.5]. """ print('Extracting', filename) with gzip.open(filename) as bytestream: bytestream.read(16) buf = bytestream.read( IMAGE_SIZE * IMAGE_SIZE * num_images * NUM_CHANNELS) data = numpy.frombuffer(buf, dtype=numpy.uint8).astype( numpy.float32) # The original data consists of pixels ranging from 0-255. # Center the data to have mean zero, and unit range. data = (data - (255/2.0))/255 data = data.reshape(num_images, IMAGE_SIZE, IMAGE_SIZE, NUM_CHANNELS) return data def extract_labels(filename, num_images): """Extract the labels into a vector of int64 label IDs.""" print('Extracting', filename) with gzip.open(filename) as bytestream: # Discard header. bytestream.read(8) # Read bytes for labels. 
buf = bytestream.read(num_images) labels = numpy.frombuffer(buf, dtype=numpy.uint8).astype( numpy.int64) return labels def error_rate(predictions, labels): """Return the error rate based on dense predictions and sparse labels.""" return 100.0 - ( 100.0 * numpy.sum(numpy.argmax(predictions, 1) == labels) / predictions.shape[0]) # We will replicate the model structure for the training subgraph, as # well as the evaluation subgraphs, while sharing the trainable # parameters. def model(data, train=False): """The Model definition.""" # 2D convolution, with 'SAME' padding (i.e. the output feature map # has the same size as the input). Note that {strides} is a 4D array # whose shape matches the data layout: [image index, y, x, depth]. conv = tf.nn.conv2d(data, conv1_weights, strides=[1, 1, 1, 1], padding='SAME') # Bias and rectified linear non-linearity. relu = tf.nn.relu(tf.nn.bias_add(conv, conv1_biases)) # Max pooling. The kernel size spec {ksize} also follows the layout # of the data. Here we have a pooling window of 2, and a stride of # 2. pool = tf.nn.max_pool(relu, ksize=[1, 2, 2, 1], strides=[1, 2, 2, 1], padding='SAME') conv = tf.nn.conv2d(pool, conv2_weights, strides=[1, 1, 1, 1], padding='SAME') relu = tf.nn.relu(tf.nn.bias_add(conv, conv2_biases)) pool = tf.nn.max_pool(relu, ksize=[1, 2, 2, 1], strides=[1, 2, 2, 1], padding='SAME') # Reshape the feature map cuboid into a 2D matrix to feed it to the # fully connected layers. pool_shape = pool.get_shape().as_list() reshape = tf.reshape( pool, [pool_shape[0], pool_shape[1] * pool_shape[2] * pool_shape[3]]) # Fully connected layer. Note that the '+' operation automatically # broadcasts the biases. hidden = tf.nn.relu(tf.matmul(reshape, fc1_weights) + fc1_biases) # Add a 50% dropout during training only. Dropout also scales # activations such that no rescaling is needed at evaluation time. if train: hidden = tf.nn.dropout(hidden, 0.5, seed=SEED) return tf.matmul(hidden, fc2_weights) + fc2_biases # Get the data. 
train_data_filename = download('train-images-idx3-ubyte.gz') train_labels_filename = download('train-labels-idx1-ubyte.gz') test_data_filename = download('t10k-images-idx3-ubyte.gz') test_labels_filename = download('t10k-labels-idx1-ubyte.gz') # Extract it into numpy arrays. train_data = extract_data(train_data_filename, 60000) train_labels = extract_labels(train_labels_filename, 60000) test_data = extract_data(test_data_filename, 10000) test_labels = extract_labels(test_labels_filename, 10000) # Generate a validation set. validation_data = train_data[:VALIDATION_SIZE, ...] validation_labels = train_labels[:VALIDATION_SIZE] train_data = train_data[VALIDATION_SIZE:, ...] train_labels = train_labels[VALIDATION_SIZE:] num_epochs = NUM_EPOCHS train_size = train_labels.shape[0] # This is where training samples and labels are fed to the graph. # These placeholder nodes will be fed a batch of training data at each # training step using the {feed_dict} argument to the Run() call below. train_data_node = tf.placeholder( tf.float32, shape=(BATCH_SIZE, IMAGE_SIZE, IMAGE_SIZE, NUM_CHANNELS)) train_labels_node = tf.placeholder(tf.int64, shape=(BATCH_SIZE,)) eval_data = tf.placeholder( tf.float32, shape=(EVAL_BATCH_SIZE, IMAGE_SIZE, IMAGE_SIZE, NUM_CHANNELS)) # The variables below hold all the trainable weights. They are passed # an initial value which will be assigned when we call: # {tf.global_variables_initializer().run()} conv1_weights = tf.Variable( # 5x5 filter, depth 32. tf.truncated_normal([5, 5, NUM_CHANNELS, 32], stddev=0.1, seed=SEED, dtype=tf.float32)) conv1_biases = tf.Variable(tf.zeros([32], dtype=tf.float32)) conv2_weights = tf.Variable(tf.truncated_normal( [5, 5, 32, 64], stddev=0.1, seed=SEED, dtype=tf.float32)) conv2_biases = tf.Variable(tf.constant(0.1, shape=[64], dtype=tf.float32)) fc1_weights = tf.Variable( # fully connected, depth 512. 
tf.truncated_normal([IMAGE_SIZE // 4 * IMAGE_SIZE // 4 * 64, 512], stddev=0.1, seed=SEED, dtype=tf.float32)) fc1_biases = tf.Variable(tf.constant(0.1, shape=[512], dtype=tf.float32)) fc2_weights = tf.Variable(tf.truncated_normal([512, NUM_LABELS], stddev=0.1, seed=SEED, dtype=tf.float32)) fc2_biases = tf.Variable(tf.constant( 0.1, shape=[NUM_LABELS], dtype=tf.float32)) # Training computation: logits + cross-entropy loss. logits = model(train_data_node, True) loss = tf.reduce_mean(tf.nn.sparse_softmax_cross_entropy_with_logits( labels=train_labels_node, logits=logits)) # L2 regularization for the fully connected parameters. regularizers = (tf.nn.l2_loss(fc1_weights) + tf.nn.l2_loss(fc1_biases) + tf.nn.l2_loss(fc2_weights) + tf.nn.l2_loss(fc2_biases)) # Add the regularization term to the loss. loss += 5e-4 * regularizers # Optimizer: set up a variable that's incremented once per batch and # controls the learning rate decay. batch = tf.Variable(0, dtype=tf.float32) # Decay once per epoch, using an exponential schedule starting at 0.01. learning_rate = tf.train.exponential_decay( 0.01, # Base learning rate. batch * BATCH_SIZE, # Current index into the dataset. train_size, # Decay step. 0.95, # Decay rate. staircase=True) # Use simple momentum for the optimization. optimizer = tf.train.MomentumOptimizer(learning_rate, 0.9).minimize(loss, global_step=batch) # Predictions for the current training minibatch. train_prediction = tf.nn.softmax(logits) # Predictions for the test and validation, which we'll compute less # often. eval_prediction = tf.nn.softmax(model(eval_data)) # Small utility function to evaluate a dataset by feeding batches of # data to {eval_data} and pulling the results from {eval_predictions}. # Saves memory and enables this to run on smaller GPUs. 
def eval_in_batches(data, sess):
  """Get predictions for a dataset by running it in small batches.

  Runs the fixed-size `eval_prediction` op repeatedly over `data` and
  stitches the per-batch softmax outputs into one (size, NUM_LABELS) array.
  """
  size = data.shape[0]
  if size < EVAL_BATCH_SIZE:
    raise ValueError("batch size for evals larger than dataset: %d" % size)
  predictions = numpy.ndarray(shape=(size, NUM_LABELS), dtype=numpy.float32)
  for begin in xrange(0, size, EVAL_BATCH_SIZE):
    end = begin + EVAL_BATCH_SIZE
    if end <= size:
      predictions[begin:end, :] = sess.run(
          eval_prediction,
          feed_dict={eval_data: data[begin:end, ...]})
    else:
      # Final ragged batch: the eval placeholder has a fixed batch
      # dimension, so evaluate the last EVAL_BATCH_SIZE examples and keep
      # only the tail rows that were not already filled in.
      batch_predictions = sess.run(
          eval_prediction,
          feed_dict={eval_data: data[-EVAL_BATCH_SIZE:, ...]})
      predictions[begin:, :] = batch_predictions[begin - size:, :]
  return predictions

# Create a local session to run the training.
start_time = time.time()
with tf.Session() as sess:
  # Run all the initializers to prepare the trainable parameters.
  tf.global_variables_initializer().run()
  # Loop through training steps.
  for step in xrange(int(num_epochs * train_size) // BATCH_SIZE):
    # Compute the offset of the current minibatch in the data.
    # Note that we could use better randomization across epochs.
    offset = (step * BATCH_SIZE) % (train_size - BATCH_SIZE)
    batch_data = train_data[offset:(offset + BATCH_SIZE), ...]
    batch_labels = train_labels[offset:(offset + BATCH_SIZE)]
    # This dictionary maps the batch data (as a numpy array) to the
    # node in the graph it should be fed to.
    feed_dict = {train_data_node: batch_data,
                 train_labels_node: batch_labels}
    # Run the optimizer to update weights.
sess.run(optimizer, feed_dict=feed_dict) # print some extra information once reach the evaluation frequency if step % EVAL_FREQUENCY == 0: # fetch some extra nodes' data l, lr, predictions = sess.run([loss, learning_rate, train_prediction], feed_dict=feed_dict) elapsed_time = time.time() - start_time start_time = time.time() print('Step %d (epoch %.2f), %.1f ms' % (step, float(step) * BATCH_SIZE / train_size, 1000 * elapsed_time / EVAL_FREQUENCY)) print('Minibatch loss: %.3f, learning rate: %.6f' % (l, lr)) print('Minibatch error: %.1f%%' % error_rate(predictions, batch_labels)) print('Validation error: %.1f%%' % error_rate( eval_in_batches(validation_data, sess), validation_labels)) sys.stdout.flush() # Finally print the result! test_error = error_rate(eval_in_batches(test_data, sess), test_labels) print('Test error: %.1f%%' % test_error) ================================================ FILE: ch7/ptb_word_lm.py ================================================ # Copyright 2015 The TensorFlow Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. # ============================================================================== """Example / benchmark for building a PTB LSTM model. Trains the model described in: (Zaremba, et. al.) 
Recurrent Neural Network Regularization http://arxiv.org/abs/1409.2329 There are 3 supported model configurations: =========================================== | config | epochs | train | valid | test =========================================== | small | 13 | 37.99 | 121.39 | 115.91 | medium | 39 | 48.45 | 86.16 | 82.07 | large | 55 | 37.87 | 82.62 | 78.29 The exact results may vary depending on the random initialization. The hyperparameters used in the model: - init_scale - the initial scale of the weights - learning_rate - the initial value of the learning rate - max_grad_norm - the maximum permissible norm of the gradient - num_layers - the number of LSTM layers - num_steps - the number of unrolled steps of LSTM - hidden_size - the number of LSTM units - max_epoch - the number of epochs trained with the initial learning rate - max_max_epoch - the total number of epochs for training - keep_prob - the probability of keeping weights in the dropout layer - lr_decay - the decay of the learning rate for each epoch after "max_epoch" - batch_size - the batch size The data required for this example is in the data/ dir of the PTB dataset from Tomas Mikolov's webpage: $ wget http://www.fit.vutbr.cz/~imikolov/rnnlm/simple-examples.tgz $ tar xvf simple-examples.tgz To run: $ python ptb_word_lm.py """ from __future__ import absolute_import from __future__ import division from __future__ import print_function import inspect import time import numpy as np import tensorflow as tf import reader flags = tf.flags logging = tf.logging flags.DEFINE_string("save_path", None, "Model output directory.") FLAGS = flags.FLAGS class PTBInput(object): """The input data.""" def __init__(self, config, data, name=None): self.batch_size = batch_size = config.batch_size self.num_steps = num_steps = config.num_steps self.epoch_size = ((len(data) // batch_size) - 1) // num_steps self.input_data, self.targets = reader.ptb_producer( data, batch_size, num_steps, name=name) class PTBModel(object): 
"""The PTB model.""" def __init__(self, is_training, config, input_): self.input = input_ batch_size = input_.batch_size num_steps = input_.num_steps size = config.hidden_size vocab_size = config.vocab_size # Slightly better results can be obtained with forget gate biases # initialized to 1 but the hyperparameters of the model would # need to be different than reported in the paper. def lstm_cell(): # With the latest TensorFlow source code (as of Mar 27, 2017), # the BasicLSTMCell will need a reuse parameter which is # unfortunately not defined in TensorFlow 1.0. To maintain # backwards compatibility, we add an argument check here: if 'reuse' in inspect.getargspec( tf.contrib.rnn.BasicLSTMCell.__init__).args: return tf.contrib.rnn.BasicLSTMCell( size, forget_bias=0.0, state_is_tuple=True, reuse=tf.get_variable_scope().reuse) else: return tf.contrib.rnn.BasicLSTMCell( size, forget_bias=0.0, state_is_tuple=True) attn_cell = lstm_cell if is_training and config.keep_prob < 1: def attn_cell(): return tf.contrib.rnn.DropoutWrapper( lstm_cell(), output_keep_prob=config.keep_prob) cell = tf.contrib.rnn.MultiRNNCell( [attn_cell() for _ in range(config.num_layers)], state_is_tuple=True) self.initial_state = cell.zero_state(batch_size, tf.float32) with tf.device("/cpu:0"): embedding = tf.get_variable( "embedding", [vocab_size, size], dtype=tf.float32) inputs = tf.nn.embedding_lookup(embedding, input_.input_data) if is_training and config.keep_prob < 1: inputs = tf.nn.dropout(inputs, config.keep_prob) outputs = [] state = self.initial_state with tf.variable_scope("RNN"): for time_step in range(num_steps): if time_step > 0: tf.get_variable_scope().reuse_variables() (cell_output, state) = cell(inputs[:, time_step, :], state) outputs.append(cell_output) output = tf.reshape(tf.stack(axis=1, values=outputs), [-1, size]) softmax_w = tf.get_variable( "softmax_w", [size, vocab_size], dtype=tf.float32) softmax_b = tf.get_variable( "softmax_b", [vocab_size], dtype=tf.float32) logits = 
tf.matmul(output, softmax_w) + softmax_b # Reshape logits to be 3-D tensor for sequence loss logits = tf.reshape(logits, [batch_size, num_steps, vocab_size]) # use the contrib sequence loss and average over the batches loss = tf.contrib.seq2seq.sequence_loss( logits, input_.targets, tf.ones([batch_size, num_steps], dtype=tf.float32), average_across_timesteps=False, average_across_batch=True ) # update the cost variables self.cost = cost = tf.reduce_sum(loss) self.final_state = state if not is_training: return self.lr = tf.Variable(0.0, trainable=False) tvars = tf.trainable_variables() grads, _ = tf.clip_by_global_norm(tf.gradients(cost, tvars), config.max_grad_norm) optimizer = tf.train.GradientDescentOptimizer(self.lr) self.train_op = optimizer.apply_gradients( zip(grads, tvars), global_step=tf.contrib.framework.get_or_create_global_step()) self.new_lr = tf.placeholder( tf.float32, shape=[], name="new_learning_rate") self.lr_update = tf.assign(self.lr, self.new_lr) def assign_lr(self, session, lr_value): session.run(self.lr_update, feed_dict={self.new_lr: lr_value}) class SmallConfig(object): """Small config.""" init_scale = 0.1 learning_rate = 1.0 max_grad_norm = 5 num_layers = 2 num_steps = 20 hidden_size = 200 max_epoch = 4 max_max_epoch = 13 keep_prob = 1.0 lr_decay = 0.5 batch_size = 20 vocab_size = 10000 def run_epoch(session, model, eval_op=None, verbose=False): """Runs the model on the given data.""" start_time = time.time() costs = 0.0 iters = 0 state = session.run(model.initial_state) fetches = { "cost": model.cost, "final_state": model.final_state, } if eval_op is not None: fetches["eval_op"] = eval_op for step in range(model.input.epoch_size): feed_dict = {} for i, (c, h) in enumerate(model.initial_state): feed_dict[c] = state[i].c feed_dict[h] = state[i].h vals = session.run(fetches, feed_dict) cost = vals["cost"] state = vals["final_state"] costs += cost iters += model.input.num_steps if verbose and step % (model.input.epoch_size // 10) == 10: 
print("%.3f perplexity: %.3f speed: %.0f wps" % (step * 1.0 / model.input.epoch_size, np.exp(costs / iters), (iters * model.input.batch_size/(time.time() - start_time)))) return np.exp(costs / iters) raw_data = reader.ptb_raw_data("./simple-examples/data") train_data, valid_data, test_data, _ = raw_data config = SmallConfig() eval_config = SmallConfig() eval_config.batch_size = 1 eval_config.num_steps = 1 with tf.Graph().as_default(): initializer = tf.random_uniform_initializer(-config.init_scale, config.init_scale) with tf.name_scope("Train"): train_input = PTBInput(config=config, data=train_data, name="TrainInput") with tf.variable_scope("Model", reuse=None, initializer=initializer): m = PTBModel(is_training=True, config=config, input_=train_input) tf.summary.scalar("Training Loss", m.cost) tf.summary.scalar("Learning Rate", m.lr) with tf.name_scope("Valid"): valid_input = PTBInput(config=config, data=valid_data, name="ValidInput") with tf.variable_scope("Model", reuse=True, initializer=initializer): mvalid = PTBModel(is_training=False, config=config, input_=valid_input) tf.summary.scalar("Validation Loss", mvalid.cost) with tf.name_scope("Test"): test_input = PTBInput(config=eval_config, data=test_data, name="TestInput") with tf.variable_scope("Model", reuse=True, initializer=initializer): mtest = PTBModel(is_training=False, config=eval_config, input_=test_input) sv = tf.train.Supervisor() with sv.managed_session() as session: for i in range(config.max_max_epoch): lr_decay = config.lr_decay ** max(i + 1 - config.max_epoch, 0.0) m.assign_lr(session, config.learning_rate * lr_decay) print("Epoch: %d Learning rate: %.3f" % (i + 1, session.run(m.lr))) train_perplexity = run_epoch(session, m, eval_op=m.train_op, verbose=True) print("Epoch: %d Train Perplexity: %.3f" % (i + 1, train_perplexity)) valid_perplexity = run_epoch(session, mvalid) print("Epoch: %d Valid Perplexity: %.3f" % (i + 1, valid_perplexity)) test_perplexity = run_epoch(session, mtest) print("Test 
Perplexity: %.3f" % test_perplexity)

================================================ FILE: ch7/reader.py ================================================
# Copyright 2015 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Utilities for parsing PTB text files."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import collections
import os
import sys

import tensorflow as tf


def _read_words(filename):
  """Read `filename` and return its contents as a flat list of tokens."""
  # NOTE(review): the upstream TensorFlow PTB reader maps "\n" to the
  # "<eos>" token; here newlines are simply dropped -- confirm intentional.
  with tf.gfile.GFile(filename, "r") as f:
    if sys.version_info[0] >= 3:
      return f.read().replace("\n", "").split()
    else:
      # Python 2: file contents arrive as bytes and must be decoded first.
      return f.read().decode("utf-8").replace("\n", "").split()


def _build_vocab(filename):
  """Map each word in `filename` to an integer id by descending frequency."""
  data = _read_words(filename)

  # Sort by (-count, word) so ids are assigned most-frequent-first, with
  # alphabetical tie-breaking for determinism.
  counter = collections.Counter(data)
  count_pairs = sorted(counter.items(), key=lambda x: (-x[1], x[0]))

  words, _ = list(zip(*count_pairs))
  word_to_id = dict(zip(words, range(len(words))))

  return word_to_id


def _file_to_word_ids(filename, word_to_id):
  """Convert a file to a list of ids; out-of-vocabulary words are skipped."""
  data = _read_words(filename)
  return [word_to_id[word] for word in data if word in word_to_id]


def ptb_raw_data(data_path=None):
  """Load PTB raw data from data directory "data_path".

  Reads PTB text files, converts strings to integer ids,
  and performs mini-batching of the inputs.

  The PTB dataset comes from Tomas Mikolov's webpage:

  http://www.fit.vutbr.cz/~imikolov/rnnlm/simple-examples.tgz

  Args:
    data_path: string path to the directory where simple-examples.tgz has
      been extracted.

  Returns:
    tuple (train_data, valid_data, test_data, vocabulary)
    where each of the data objects can be passed to PTBIterator.
  """
  train_path = os.path.join(data_path, "ptb.train.txt")
  valid_path = os.path.join(data_path, "ptb.valid.txt")
  test_path = os.path.join(data_path, "ptb.test.txt")

  # The vocabulary is built from the training split only; valid/test words
  # not present in it are dropped by _file_to_word_ids.
  word_to_id = _build_vocab(train_path)
  train_data = _file_to_word_ids(train_path, word_to_id)
  valid_data = _file_to_word_ids(valid_path, word_to_id)
  test_data = _file_to_word_ids(test_path, word_to_id)
  vocabulary = len(word_to_id)
  return train_data, valid_data, test_data, vocabulary


def ptb_producer(raw_data, batch_size, num_steps, name=None):
  """Iterate on the raw PTB data.

  This chunks up raw_data into batches of examples and returns Tensors that
  are drawn from these batches.

  Args:
    raw_data: one of the raw data outputs from ptb_raw_data.
    batch_size: int, the batch size.
    num_steps: int, the number of unrolls.
    name: the name of this operation (optional).

  Returns:
    A pair of Tensors, each shaped [batch_size, num_steps]. The second element
    of the tuple is the same data time-shifted to the right by one.

  Raises:
    tf.errors.InvalidArgumentError: if batch_size or num_steps are too high.
  """
  with tf.name_scope(name, "PTBProducer", [raw_data, batch_size, num_steps]):
    raw_data = tf.convert_to_tensor(raw_data, name="raw_data", dtype=tf.int32)

    data_len = tf.size(raw_data)
    batch_len = data_len // batch_size
    # Reshape the token stream into batch_size parallel sequences,
    # truncating any remainder that does not fill a full row.
    data = tf.reshape(raw_data[0 : batch_size * batch_len],
                      [batch_size, batch_len])

    epoch_size = (batch_len - 1) // num_steps
    assertion = tf.assert_positive(
        epoch_size,
        message="epoch_size == 0, decrease batch_size or num_steps")
    with tf.control_dependencies([assertion]):
      epoch_size = tf.identity(epoch_size, name="epoch_size")

    # Queue yielding window index i = 0 .. epoch_size-1, in order.
    i = tf.train.range_input_producer(epoch_size, shuffle=False).dequeue()
    x = tf.strided_slice(data, [0, i * num_steps],
                         [batch_size, (i + 1) * num_steps])
    x.set_shape([batch_size, num_steps])
    # Targets are the inputs shifted one token to the right.
    y = tf.strided_slice(data, [0, i * num_steps + 1],
                         [batch_size, (i + 1) * num_steps + 1])
    y.set_shape([batch_size, num_steps])
    return x, y

================================================ FILE: ch7/setup.sh ================================================
wget http://www.fit.vutbr.cz/~imikolov/rnnlm/simple-examples.tgz
tar xvf simple-examples.tgz

================================================ FILE: ch8/a3c.py ================================================
"""Asynchronous Advantage Actor-Critic (A3C) algorithm for reinforcement learning."""
import numpy as np
import tensorflow as tf
import copy
import multiprocessing
import os
import re
import threading
from collections import Sequence
import pickle
import threading
import time
import numpy as np
import os
import six
import tensorflow as tf
import tempfile

from tensorgraph import TensorGraph
from tensorgraph import Layer
from tensorgraph import Dense
from tensorgraph import Squeeze
from tensorgraph import Flatten
from tensorgraph import BatchNorm
from tensorgraph import SoftMax
from tensorgraph import Input


class A3CLoss(Layer):
  """This layer computes the loss function for A3C."""

  def __init__(self, value_weight, entropy_weight, **kwargs):
    super(A3CLoss, self).__init__(**kwargs)
self.value_weight = value_weight
    self.entropy_weight = entropy_weight

  def create_tensor(self, **kwargs):
    # The five inputs, in order: discounted reward, one-hot action taken,
    # policy probabilities, value estimate, and advantage estimate.
    reward, action, prob, value, advantage = [
        layer.out_tensor for layer in self.in_layers
    ]
    # Add float32 epsilon so log(prob) stays finite for zero probabilities.
    prob = prob + np.finfo(np.float32).eps
    log_prob = tf.log(prob)
    # Policy-gradient term: maximize log-prob of the chosen action weighted
    # by the advantage (hence the leading minus, since this is a loss).
    policy_loss = -tf.reduce_mean(
        advantage * tf.reduce_sum(action * log_prob, axis=1))
    # Value term: squared error between the value estimate and the reward.
    value_loss = tf.reduce_mean(tf.square(reward - value))
    # Entropy bonus (subtracted from the loss) encourages exploration.
    entropy = -tf.reduce_mean(tf.reduce_sum(prob * log_prob, axis=1))
    self.out_tensor = policy_loss + self.value_weight * value_loss - self.entropy_weight * entropy
    return self.out_tensor


class A3C(object):
  """
  Implements the Asynchronous Advantage Actor-Critic (A3C) algorithm for
  reinforcement learning.

  The algorithm is described in Mnih et al, "Asynchronous Methods for Deep
  Reinforcement Learning" (https://arxiv.org/abs/1602.01783). This class
  requires the policy to output two quantities: a vector giving the
  probability of taking each action, and an estimate of the value function for
  the current state. It optimizes both outputs at once using a loss that is
  the sum of three terms:

  1. The policy loss, which seeks to maximize the discounted reward for each
     action.
  2. The value loss, which tries to make the value estimate match the actual
     discounted reward that was attained at each step.
  3. An entropy term to encourage exploration.

  This class only supports environments with discrete action spaces, not
  continuous ones. The "action" argument passed to the environment is an
  integer, giving the index of the action to perform.

  This class supports Generalized Advantage Estimation as described in
  Schulman et al., "High-Dimensional Continuous Control Using Generalized
  Advantage Estimation" (https://arxiv.org/abs/1506.02438). This is a method
  of trading off bias and variance in the advantage estimate, which can
  sometimes improve the rate of convergence. Use the advantage_lambda
  parameter to adjust the tradeoff.
""" def __init__(self, env, max_rollout_length=20, discount_factor=0.99, advantage_lambda=0.98, value_weight=1.0, entropy_weight=0.01, optimizer=None, model_dir=None): """Create an object for optimizing a policy. Parameters ---------- env: Environment the Environment to interact with max_rollout_length: int the maximum length of rollouts to generate discount_factor: float the discount factor to use when computing rewards advantage_lambda: float the parameter for trading bias vs. variance in Generalized Advantage Estimation value_weight: float a scale factor for the value loss term in the loss function entropy_weight: float a scale factor for the entropy term in the loss function optimizer: Optimizer the optimizer to use. If None, a default optimizer is used. model_dir: str the directory in which the model will be saved. If None, a temporary directory will be created. """ self._env = env self.max_rollout_length = max_rollout_length self.discount_factor = discount_factor self.advantage_lambda = advantage_lambda self.value_weight = value_weight self.entropy_weight = entropy_weight self._optimizer = None (self._graph, self._features, self._rewards, self._actions, self._action_prob, self._value, self._advantages) = self.build_graph( None, "global", model_dir) with self._graph._get_tf("Graph").as_default(): self._session = tf.Session() def build_graph(self, tf_graph, scope, model_dir): """Construct a TensorGraph containing the policy and loss calculations.""" state_shape = self._env.state_shape features = [] for s in state_shape: features.append(Input(shape=[None] + list(s), dtype=tf.float32)) d1 = Flatten(in_layers=features) d2 = Dense( in_layers=[d1], activation_fn=tf.nn.relu, normalizer_fn=tf.nn.l2_normalize, normalizer_params={"dim": 1}, out_channels=64) d3 = Dense( in_layers=[d2], activation_fn=tf.nn.relu, normalizer_fn=tf.nn.l2_normalize, normalizer_params={"dim": 1}, out_channels=32) d4 = Dense( in_layers=[d3], activation_fn=tf.nn.relu, 
normalizer_fn=tf.nn.l2_normalize, normalizer_params={"dim": 1}, out_channels=16) d4 = BatchNorm(in_layers=[d4]) d5 = Dense(in_layers=[d4], activation_fn=None, out_channels=9) value = Dense(in_layers=[d4], activation_fn=None, out_channels=1) value = Squeeze(squeeze_dims=1, in_layers=[value]) action_prob = SoftMax(in_layers=[d5]) rewards = Input(shape=(None,)) advantages = Input(shape=(None,)) actions = Input(shape=(None, self._env.n_actions)) loss = A3CLoss( self.value_weight, self.entropy_weight, in_layers=[rewards, actions, action_prob, value, advantages]) graph = TensorGraph( batch_size=self.max_rollout_length, graph=tf_graph, model_dir=model_dir) for f in features: graph._add_layer(f) graph.add_output(action_prob) graph.add_output(value) graph.set_loss(loss) graph.set_optimizer(self._optimizer) with graph._get_tf("Graph").as_default(): with tf.variable_scope(scope): graph.build() return graph, features, rewards, actions, action_prob, value, advantages def fit(self, total_steps, max_checkpoints_to_keep=5, checkpoint_interval=600, restore=False): """Train the policy. Parameters ---------- total_steps: int the total number of time steps to perform on the environment, across all rollouts on all threads max_checkpoints_to_keep: int the maximum number of checkpoint files to keep. When this number is reached, older files are deleted. checkpoint_interval: float the time interval at which to save checkpoints, measured in seconds restore: bool if True, restore the model from the most recent checkpoint and continue training from there. If False, retrain the model from scratch. 
""" with self._graph._get_tf("Graph").as_default(): step_count = [0] workers = [] threads = [] for i in range(multiprocessing.cpu_count()): workers.append(Worker(self, i)) self._session.run(tf.global_variables_initializer()) if restore: self.restore() for worker in workers: thread = threading.Thread( name=worker.scope, target=lambda: worker.run(step_count, total_steps)) threads.append(thread) thread.start() variables = tf.get_collection( tf.GraphKeys.GLOBAL_VARIABLES, scope="global") saver = tf.train.Saver(variables, max_to_keep=max_checkpoints_to_keep) checkpoint_index = 0 while True: threads = [t for t in threads if t.isAlive()] if len(threads) > 0: threads[0].join(checkpoint_interval) checkpoint_index += 1 saver.save( self._session, self._graph.save_file, global_step=checkpoint_index) if len(threads) == 0: break def predict(self, state): """Compute the policy's output predictions for a state. Parameters ---------- state: array the state of the environment for which to generate predictions Returns ------- the array of action probabilities, and the estimated value function """ with self._graph._get_tf("Graph").as_default(): feed_dict = self.create_feed_dict(state) tensors = [self._action_prob.out_tensor, self._value.out_tensor] results = self._session.run(tensors, feed_dict=feed_dict) return results[:2] def select_action(self, state, deterministic=False): """Select an action to perform based on the environment's state. Parameters ---------- state: array the state of the environment for which to select an action deterministic: bool if True, always return the best action (that is, the one with highest probability). If False, randomly select an action based on the computed probabilities. 
Returns ------- the index of the selected action """ with self._graph._get_tf("Graph").as_default(): feed_dict = self.create_feed_dict(state) tensors = [self._action_prob.out_tensor] results = self._session.run(tensors, feed_dict=feed_dict) probabilities = results[0] if deterministic: return probabilities.argmax() else: return np.random.choice( np.arange(self._env.n_actions), p=probabilities[0]) def restore(self): """Reload the model parameters from the most recent checkpoint file.""" last_checkpoint = tf.train.latest_checkpoint(self._graph.model_dir) if last_checkpoint is None: raise ValueError("No checkpoint found") with self._graph._get_tf("Graph").as_default(): variables = tf.get_collection( tf.GraphKeys.GLOBAL_VARIABLES, scope="global") saver = tf.train.Saver(variables) saver.restore(self._session, last_checkpoint) def create_feed_dict(self, state): """Create a feed dict for use by predict() or select_action().""" feed_dict = dict((f.out_tensor, np.expand_dims(s, axis=0)) for f, s in zip(self._features, state)) return feed_dict class Worker(object): """A Worker object is created for each training thread.""" def __init__(self, a3c, index): self.a3c = a3c self.index = index self.scope = "worker%d" % index self.env = copy.deepcopy(a3c._env) self.env.reset() (self.graph, self.features, self.rewards, self.actions, self.action_prob, self.value, self.advantages) = a3c.build_graph( a3c._graph._get_tf("Graph"), self.scope, None) with a3c._graph._get_tf("Graph").as_default(): local_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, self.scope) global_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, "global") gradients = tf.gradients(self.graph.loss.out_tensor, local_vars) grads_and_vars = list(zip(gradients, global_vars)) self.train_op = a3c._graph._get_tf("Optimizer").apply_gradients( grads_and_vars) self.update_local_variables = tf.group( * [tf.assign(v1, v2) for v1, v2 in zip(local_vars, global_vars)]) self.global_step = self.graph.get_global_step() 
def run(self, step_count, total_steps): with self.graph._get_tf("Graph").as_default(): while step_count[0] < total_steps: self.a3c._session.run(self.update_local_variables) states, actions, rewards, values = self.create_rollout() self.process_rollout(states, actions, rewards, values, step_count[0]) step_count[0] += len(actions) def create_rollout(self): """Generate a rollout.""" n_actions = self.env.n_actions session = self.a3c._session states = [] actions = [] rewards = [] values = [] # Generate the rollout. for i in range(self.a3c.max_rollout_length): if self.env.terminated: break state = self.env.state states.append(state) feed_dict = self.create_feed_dict(state) results = session.run( [self.action_prob.out_tensor, self.value.out_tensor], feed_dict=feed_dict) probabilities, value = results[:2] action = np.random.choice(np.arange(n_actions), p=probabilities[0]) actions.append(action) values.append(float(value)) rewards.append(self.env.step(action)) # Compute an estimate of the reward for the rest of the episode. if not self.env.terminated: feed_dict = self.create_feed_dict(self.env.state) final_value = self.a3c.discount_factor * float( session.run(self.value.out_tensor, feed_dict)) else: final_value = 0.0 values.append(final_value) if self.env.terminated: self.env.reset() return states, actions, np.array(rewards), np.array(values) def process_rollout(self, states, actions, rewards, values, step_count): """Train the network based on a rollout.""" # Compute the discounted rewards and advantages. if len(states) == 0: # Rollout creation sometimes fails in multithreaded environment. # Don't process if malformed print("Rollout creation failed. 
Skipping") return discounted_rewards = rewards.copy() discounted_rewards[-1] += values[-1] advantages = rewards - values[:-1] + self.a3c.discount_factor * np.array( values[1:]) for j in range(len(rewards) - 1, 0, -1): discounted_rewards[j-1] += self.a3c.discount_factor * discounted_rewards[j] advantages[j-1] += ( self.a3c.discount_factor * self.a3c.advantage_lambda * advantages[j]) # Convert the actions to one-hot. n_actions = self.env.n_actions actions_matrix = [] for action in actions: a = np.zeros(n_actions) a[action] = 1.0 actions_matrix.append(a) # Rearrange the states into the proper set of arrays. state_arrays = [[] for i in range(len(self.features))] for state in states: for j in range(len(state)): state_arrays[j].append(state[j]) # Build the feed dict and apply gradients. feed_dict = {} for f, s in zip(self.features, state_arrays): feed_dict[f.out_tensor] = s feed_dict[self.rewards.out_tensor] = discounted_rewards feed_dict[self.actions.out_tensor] = actions_matrix feed_dict[self.advantages.out_tensor] = advantages feed_dict[self.global_step] = step_count self.a3c._session.run(self.train_op, feed_dict=feed_dict) def create_feed_dict(self, state): """Create a feed dict for use during a rollout.""" feed_dict = dict((f.out_tensor, np.expand_dims(s, axis=0)) for f, s in zip(self.features, state)) return feed_dict ================================================ FILE: ch8/environment.py ================================================ import copy import random import shutil import numpy as np import tensorflow as tf import deepchem as dc import collections class Environment(object): """An environment in which an actor performs actions to accomplish a task. An environment has a current state, which is represented as either a single NumPy array, or optionally a list of NumPy arrays. When an action is taken, that causes the state to be updated. Exactly what is meant by an "action" is defined by each subclass. 
As far as this interface is concerned, it is simply an arbitrary object. The environment also computes a reward for each action, and reports when the task has been terminated (meaning that no more actions may be taken). """ def __init__(self, state_shape, n_actions, state_dtype=None): """Subclasses should call the superclass constructor in addition to doing their own initialization.""" self.state_shape = state_shape self.n_actions = n_actions if state_dtype is None: # Assume all arrays are float32. if isinstance(state_shape[0], collections.Sequence): self.state_dtype = [np.float32] * len(state_shape) else: self.state_dtype = np.float32 else: self.state_dtype = state_dtype class TicTacToeEnvironment(Environment): """ Play tictactoe against a randomly acting opponent """ X = np.array([1.0, 0.0]) O = np.array([0.0, 1.0]) EMPTY = np.array([0.0, 0.0]) ILLEGAL_MOVE_PENALTY = -3.0 LOSS_PENALTY = -3.0 NOT_LOSS = 0.1 DRAW_REWARD = 5.0 WIN_REWARD = 10.0 def __init__(self): super(TicTacToeEnvironment, self).__init__([(3, 3, 2)], 9) self.state = None self.terminated = None self.reset() def reset(self): self.terminated = False self.state = [np.zeros(shape=(3, 3, 2), dtype=np.float32)] # Randomize who goes first if random.randint(0, 1) == 1: move = self.get_O_move() self.state[0][move[0]][move[1]] = TicTacToeEnvironment.O def step(self, action): self.state = copy.deepcopy(self.state) row = action // 3 col = action % 3 # Illegal move -- the square is not empty if not np.all(self.state[0][row][col] == TicTacToeEnvironment.EMPTY): self.terminated = True return TicTacToeEnvironment.ILLEGAL_MOVE_PENALTY # Move X self.state[0][row][col] = TicTacToeEnvironment.X # Did X Win if self.check_winner(TicTacToeEnvironment.X): self.terminated = True return TicTacToeEnvironment.WIN_REWARD if self.game_over(): self.terminated = True return TicTacToeEnvironment.DRAW_REWARD move = self.get_O_move() self.state[0][move[0]][move[1]] = TicTacToeEnvironment.O # Did O Win if 
self.check_winner(TicTacToeEnvironment.O): self.terminated = True return TicTacToeEnvironment.LOSS_PENALTY if self.game_over(): self.terminated = True return TicTacToeEnvironment.DRAW_REWARD return TicTacToeEnvironment.NOT_LOSS def get_O_move(self): empty_squares = [] for row in range(3): for col in range(3): if np.all(self.state[0][row][col] == TicTacToeEnvironment.EMPTY): empty_squares.append((row, col)) return random.choice(empty_squares) def check_winner(self, player): for i in range(3): row = np.sum(self.state[0][i][:], axis=0) if np.all(row == player * 3): return True col = np.sum(self.state[0][:][i], axis=0) if np.all(col == player * 3): return True diag1 = self.state[0][0][0] + self.state[0][1][1] + self.state[0][2][2] if np.all(diag1 == player * 3): return True diag2 = self.state[0][0][2] + self.state[0][1][1] + self.state[0][2][0] if np.all(diag2 == player * 3): return True return False def game_over(self): for i in range(3): for j in range(3): if np.all(self.state[0][i][j] == TicTacToeEnvironment.EMPTY): return False return True def display(self): state = self.state[0] s = "" for row in range(3): for col in range(3): if np.all(state[row][col] == TicTacToeEnvironment.EMPTY): s += "_" if np.all(state[row][col] == TicTacToeEnvironment.X): s += "X" if np.all(state[row][col] == TicTacToeEnvironment.O): s += "O" s += "\n" return s ================================================ FILE: ch8/tensorgraph.py ================================================ """TensorGraph OOP Framework.""" import numpy as np import tensorflow as tf import copy import multiprocessing import os import re import threading from collections import Sequence import pickle import threading import time import numpy as np import os import six import tensorflow as tf import tempfile class TensorGraph(object): def __init__(self, batch_size=100, random_seed=None, graph=None, learning_rate=0.001, model_dir=None, **kwargs): """ Parameters ---------- batch_size: int default batch size for training 
    and evaluating
    graph: tensorflow.Graph
      the Graph in which to create Tensorflow objects. If None, a new Graph
      is created.
    learning_rate: float or LearningRateSchedule
      the learning rate to use for optimization
    kwargs
    """
    # Layer Management
    self.layers = dict()
    self.features = list()
    self.labels = list()
    self.outputs = list()
    self.task_weights = list()
    self.loss = None
    self.built = False
    self.optimizer = None
    self.learning_rate = learning_rate
    # Singular place to hold Tensor objects which don't serialize
    # See TensorGraph._get_tf() for more details on lazy construction
    self.tensor_objects = {
        "Graph": graph,
        #"train_op": None,
    }
    self.global_step = 0
    self.batch_size = batch_size
    self.random_seed = random_seed
    if model_dir is not None:
      if not os.path.exists(model_dir):
        os.makedirs(model_dir)
    else:
      # No directory given: checkpoints go to a throwaway temp dir.
      model_dir = tempfile.mkdtemp()
      self.model_dir_is_temp = True
    self.model_dir = model_dir
    self.save_file = "%s/%s" % (self.model_dir, "model")
    self.model_class = None

  def _add_layer(self, layer):
    """Register `layer` (and, recursively, its inputs) in self.layers."""
    # Auto-name anonymous layers so each gets a unique dict key.
    if layer.name is None:
      layer.name = "%s_%s" % (layer.__class__.__name__, len(self.layers) + 1)
    if layer.name in self.layers:
      return
    if isinstance(layer, Input):
      self.features.append(layer)
    self.layers[layer.name] = layer
    for in_layer in layer.in_layers:
      self._add_layer(in_layer)

  def topsort(self):
    """Return all layers ordered so every layer follows its inputs (DFS)."""

    def add_layers_to_list(layer, sorted_layers):
      if layer in sorted_layers:
        return
      for in_layer in layer.in_layers:
        add_layers_to_list(in_layer, sorted_layers)
      sorted_layers.append(layer)

    sorted_layers = []
    for l in self.features + self.labels + self.task_weights + self.outputs:
      add_layers_to_list(l, sorted_layers)
    add_layers_to_list(self.loss, sorted_layers)
    return sorted_layers

  def build(self):
    """Instantiate the TF tensors for every layer, in topological order."""
    # Idempotent: building twice would duplicate graph nodes.
    if self.built:
      return
    with self._get_tf("Graph").as_default():
      self._training_placeholder = tf.placeholder(dtype=tf.float32, shape=())
      if self.random_seed is not None:
        tf.set_random_seed(self.random_seed)
      for layer in self.topsort():
        with tf.name_scope(layer.name):
          layer.create_tensor(training=self._training_placeholder)
      self.session = tf.Session()
      self.built = True

  def set_loss(self, layer):
    """Register `layer` as the loss this graph minimizes."""
    self._add_layer(layer)
    self.loss = layer

  def add_output(self, layer):
    """Register `layer` as an output of the graph."""
    self._add_layer(layer)
    self.outputs.append(layer)

  def set_optimizer(self, optimizer):
    """Set the optimizer to use for fitting."""
    self.optimizer = optimizer

  def get_layer_variables(self, layer):
    """Get the list of trainable variables in a layer of the graph."""
    if not self.built:
      self.build()
    with self._get_tf("Graph").as_default():
      if layer.variable_scope == "":
        return []
      return tf.get_collection(
          tf.GraphKeys.TRAINABLE_VARIABLES, scope=layer.variable_scope)

  def get_global_step(self):
    """Return the (lazily created) global-step Variable."""
    return self._get_tf("GlobalStep")

  def _get_tf(self, obj):
    """Fetches underlying TensorFlow primitives.

    Primitives are constructed lazily and cached in self.tensor_objects
    (they do not serialize, so they cannot live as plain attributes).

    Parameters
    ----------
    obj: str
      If "Graph", returns tf.Graph instance. If "Optimizer", returns the
      optimizer. If "train_op", returns the train operation. If "GlobalStep"
      returns the global step.

    Returns
    -------
    TensorFlow Object
    """
    if obj in self.tensor_objects and self.tensor_objects[obj] is not None:
      return self.tensor_objects[obj]
    if obj == "Graph":
      self.tensor_objects["Graph"] = tf.Graph()
    elif obj == "Optimizer":
      self.tensor_objects["Optimizer"] = tf.train.AdamOptimizer(
          learning_rate=self.learning_rate,
          beta1=0.9,
          beta2=0.999,
          epsilon=1e-7)
    elif obj == "GlobalStep":
      with self._get_tf("Graph").as_default():
        self.tensor_objects["GlobalStep"] = tf.Variable(0, trainable=False)
    # Recurse once so the freshly cached object is returned via the cache.
    return self._get_tf(obj)

  def restore(self):
    """Reload the values of all variables from the most recent checkpoint file."""
    if not self.built:
      self.build()
    last_checkpoint = tf.train.latest_checkpoint(self.model_dir)
    if last_checkpoint is None:
      raise ValueError("No checkpoint found")
    with self._get_tf("Graph").as_default():
      saver = tf.train.Saver()
      saver.restore(self.session, last_checkpoint)

  def __del__(self):
    pass


class Layer(object):
  """Base class for all TensorGraph layers."""

  def __init__(self, in_layers=None, **kwargs):
    if "name" in kwargs:
      self.name = kwargs["name"]
    else:
      self.name = None
    if in_layers is None:
      in_layers = list()
    # Accept a single Layer as well as a list of Layers.
    if not isinstance(in_layers, Sequence):
      in_layers = [in_layers]
    self.in_layers = in_layers
    self.variable_scope = ""
    self.tb_input = None

  def create_tensor(self, in_layers=None, **kwargs):
    raise NotImplementedError("Subclasses must implement for themselves")

  def _get_input_tensors(self, in_layers):
    """Get the input tensors to this layer.

    Parameters
    ----------
    in_layers: list of Layers or tensors
      the inputs passed to create_tensor(). If None, this layer's inputs
      will be used instead.
    """
    if in_layers is None:
      in_layers = self.in_layers
    if not isinstance(in_layers, Sequence):
      in_layers = [in_layers]
    tensors = []
    for input in in_layers:
      tensors.append(tf.convert_to_tensor(input))
    return tensors


def _convert_layer_to_tensor(value, dtype=None, name=None, as_ref=False):
  # Lets tf.convert_to_tensor accept Layer objects directly by unwrapping
  # the layer's out_tensor.
  return tf.convert_to_tensor(value.out_tensor, dtype=dtype, name=name)


tf.register_tensor_conversion_function(Layer, _convert_layer_to_tensor)


class Dense(Layer):

  def __init__(
      self,
      out_channels,
      activation_fn=None,
      biases_initializer=tf.zeros_initializer,
      weights_initializer=tf.contrib.layers.variance_scaling_initializer,
      **kwargs):
    """Create a dense layer.

    The weight and bias initializers are specified by callable objects that
    construct and return a Tensorflow initializer when invoked with no
    arguments. This will typically be either the initializer class itself
    (if the constructor does not require arguments), or a TFWrapper (if it
    does).

    Parameters
    ----------
    out_channels: int
      the number of output values
    activation_fn: object
      the Tensorflow activation function to apply to the output
    biases_initializer: callable object
      the initializer for bias values. This may be None, in which case the
      layer will not include biases.
    weights_initializer: callable object
      the initializer for weight values
    """
    super(Dense, self).__init__(**kwargs)
    self.out_channels = out_channels
    self.out_tensor = None
    self.activation_fn = activation_fn
    self.biases_initializer = biases_initializer
    self.weights_initializer = weights_initializer

  def create_tensor(self, in_layers=None, **kwargs):
    """Build the fully-connected op for this layer's single input."""
    inputs = self._get_input_tensors(in_layers)
    if len(inputs) != 1:
      raise ValueError("Dense layer can only have one input")
    parent = inputs[0]
    if self.biases_initializer is None:
      biases_initializer = None
    else:
      # The stored initializer is a callable; invoke it to obtain the
      # actual TensorFlow initializer object.
      biases_initializer = self.biases_initializer()
    out_tensor = tf.contrib.layers.fully_connected(
        parent,
        num_outputs=self.out_channels,
        activation_fn=self.activation_fn,
        biases_initializer=biases_initializer,
        weights_initializer=self.weights_initializer(),
        reuse=False,
        trainable=True)
    self.out_tensor = out_tensor
    return out_tensor


class Squeeze(Layer):
  """Remove dimensions of size 1 (tf.squeeze) from the parent tensor."""

  def __init__(self, in_layers=None, squeeze_dims=None, **kwargs):
    self.squeeze_dims = squeeze_dims
    super(Squeeze, self).__init__(in_layers, **kwargs)

  def create_tensor(self, in_layers=None, **kwargs):
    inputs = self._get_input_tensors(in_layers)
    parent_tensor = inputs[0]
    out_tensor = tf.squeeze(parent_tensor, squeeze_dims=self.squeeze_dims)
    self.out_tensor = out_tensor
    return out_tensor


class BatchNorm(Layer):
  """Apply batch normalization (tf.layers.batch_normalization) to the parent."""

  def __init__(self, in_layers=None, **kwargs):
    super(BatchNorm, self).__init__(in_layers, **kwargs)

  def create_tensor(self, in_layers=None, **kwargs):
    inputs = self._get_input_tensors(in_layers)
    parent_tensor = inputs[0]
    out_tensor = tf.layers.batch_normalization(parent_tensor)
    self.out_tensor = out_tensor
    return out_tensor


class Flatten(Layer):
  """Flatten every dimension except the first"""

  def __init__(self, in_layers=None, **kwargs):
    super(Flatten, self).__init__(in_layers, **kwargs)

  def create_tensor(self, in_layers=None, **kwargs):
    inputs = self._get_input_tensors(in_layers)
    if len(inputs) != 1:
      raise ValueError("Only One Parent to Flatten")
    parent = inputs[0]
parent_shape = parent.get_shape() vector_size = 1 for i in range(1, len(parent_shape)): vector_size *= parent_shape[i].value parent_tensor = parent out_tensor = tf.reshape(parent_tensor, shape=(-1, vector_size)) self.out_tensor = out_tensor return out_tensor class SoftMax(Layer): def __init__(self, in_layers=None, **kwargs): super(SoftMax, self).__init__(in_layers, **kwargs) def create_tensor(self, in_layers=None, **kwargs): inputs = self._get_input_tensors(in_layers) if len(inputs) != 1: raise ValueError("Must only Softmax single parent") parent = inputs[0] out_tensor = tf.contrib.layers.softmax(parent) self.out_tensor = out_tensor return out_tensor class Input(Layer): def __init__(self, shape, dtype=tf.float32, **kwargs): self._shape = tuple(shape) self.dtype = dtype super(Input, self).__init__(**kwargs) def create_tensor(self, in_layers=None, **kwargs): if in_layers is None: in_layers = self.in_layers out_tensor = tf.placeholder(dtype=self.dtype, shape=self._shape) self.out_tensor = out_tensor return out_tensor ================================================ FILE: ch8/tictactoe.py ================================================ """Adapted from DeepChem Examples by Peter Eastman and Karl Leswing.""" import copy import random import shutil import numpy as np import tensorflow as tf import deepchem as dc from environment import TicTacToeEnvironment from a3c import A3C def eval_tic_tac_toe(value_weight, num_epoch_rounds=1, games=10**4, rollouts=10**5, advantage_lambda=0.98): """ Returns the average reward over 10k games after 100k rollouts Parameters ---------- value_weight: float Returns ------- avg_rewards """ env = TicTacToeEnvironment() model_dir = "/tmp/tictactoe" try: shutil.rmtree(model_dir) except: pass avg_rewards = [] for j in range(num_epoch_rounds): print("Epoch round: %d" % j) a3c_engine = A3C( env, entropy_weight=0.01, value_weight=value_weight, model_dir=model_dir, advantage_lambda=advantage_lambda) try: a3c_engine.restore() except: print("unable to 
restore") pass a3c_engine.fit(rollouts) rewards = [] for i in range(games): env.reset() reward = -float('inf') while not env.terminated: action = a3c_engine.select_action(env.state) reward = env.step(action) rewards.append(reward) print("Mean reward at round %d is %f" % (j+1, np.mean(rewards))) avg_rewards.append({(j + 1) * rollouts: np.mean(rewards)}) return avg_rewards def main(): value_weight = 6.0 score = eval_tic_tac_toe(value_weight=0.2, num_epoch_rounds=20, advantage_lambda=0., games=10**4, rollouts=5*10**4) print(score) if __name__ == "__main__": main() ================================================ FILE: ch9/cifar10.py ================================================ # Copyright 2015 The TensorFlow Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. # ============================================================================== """Builds the CIFAR-10 network. Summary of available functions: # Compute input images and labels for training. If you would like to run # evaluations, use inputs() instead. inputs, labels = distorted_inputs() # Compute inference on the model inputs to make a prediction. predictions = inference(inputs) # Compute the total loss of the prediction with respect to the labels. loss = loss(predictions, labels) # Create a graph to run one step of training with respect to the loss. 
 train_op = train(loss, global_step)
"""
# pylint: disable=missing-docstring
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import os
import re
import sys
import tarfile

from six.moves import urllib, xrange  # pylint: disable=redefined-builtin
import tensorflow as tf

import cifar10_input

FLAGS = tf.app.flags.FLAGS

# Basic model parameters.
tf.app.flags.DEFINE_integer('batch_size', 128,
                            """Number of images to process in a batch.""")
tf.app.flags.DEFINE_string('data_dir', '/tmp/cifar10_data',
                           """Path to the CIFAR-10 data directory.""")

# Process images of this size. Note that this differs from the original CIFAR
# image size of 32 x 32. If one alters this number, then the entire model
# architecture will change and any model would need to be retrained.
IMAGE_SIZE = 24

NUM_CLASSES = 10

# Global constants describing the CIFAR-10 data set.
NUM_EXAMPLES_PER_EPOCH_FOR_TRAIN = 50000
NUM_EXAMPLES_PER_EPOCH_FOR_EVAL = 10000

# Constants describing the training process.
MOVING_AVERAGE_DECAY = 0.9999     # The decay to use for the moving average.
NUM_EPOCHS_PER_DECAY = 350.0      # Epochs after which learning rate decays.
LEARNING_RATE_DECAY_FACTOR = 0.1  # Learning rate decay factor.
INITIAL_LEARNING_RATE = 0.1       # Initial learning rate.

# If a model is trained with multiple GPUs, prefix all Op names with tower_name
# to differentiate the operations. Note that this prefix is removed from the
# names of the summaries when visualizing a model.
TOWER_NAME = 'tower'

DATA_URL = 'http://www.cs.toronto.edu/~kriz/cifar-10-binary.tar.gz'


def distorted_inputs():
  """Construct distorted input for CIFAR training using the Reader ops.

  Returns:
    images: Images. 4D tensor of [batch_size, IMAGE_SIZE, IMAGE_SIZE, 3] size.
    labels: Labels. 1D tensor of [batch_size] size.
  Raises:
    ValueError: If no data_dir
  """
  if not FLAGS.data_dir:
    raise ValueError('Please supply a data_dir')
  data_dir = os.path.join(FLAGS.data_dir, 'cifar-10-batches-bin')
  images, labels = _distorted_inputs(data_dir=data_dir,
                                     batch_size=FLAGS.batch_size)
  return images, labels


def read_cifar10(filename_queue):
  """Reads and parses examples from CIFAR10 data files.

  Recommendation: if you want N-way read parallelism, call this function
  N times.  This will give you N independent Readers reading different
  files & positions within those files, which will give better mixing of
  examples.

  Args:
    filename_queue: A queue of strings with the filenames to read from.

  Returns:
    An object representing a single example, with the following fields:
      height: number of rows in the result (32)
      width: number of columns in the result (32)
      depth: number of color channels in the result (3)
      key: a scalar string Tensor describing the filename & record number
        for this example.
      label: an int32 Tensor with the label in the range 0..9.
      uint8image: a [height, width, depth] uint8 Tensor with the image data
  """

  class CIFAR10Record(object):
    pass

  result = CIFAR10Record()

  # Dimensions of the images in the CIFAR-10 dataset.
  # See http://www.cs.toronto.edu/~kriz/cifar.html for a description of the
  # input format.
  label_bytes = 1  # 2 for CIFAR-100
  result.height = 32
  result.width = 32
  result.depth = 3
  image_bytes = result.height * result.width * result.depth
  # Every record consists of a label followed by the image, with a
  # fixed number of bytes for each.
  record_bytes = label_bytes + image_bytes

  # Read a record, getting filenames from the filename_queue.  No
  # header or footer in the CIFAR-10 format, so we leave header_bytes
  # and footer_bytes at their default of 0.
  reader = tf.FixedLengthRecordReader(record_bytes=record_bytes)
  result.key, value = reader.read(filename_queue)

  # Convert from a string to a vector of uint8 that is record_bytes long.
  record_bytes = tf.decode_raw(value, tf.uint8)

  # The first bytes represent the label, which we convert from uint8->int32.
  result.label = tf.cast(
      tf.strided_slice(record_bytes, [0], [label_bytes]), tf.int32)

  # The remaining bytes after the label represent the image, which we reshape
  # from [depth * height * width] to [depth, height, width].
  depth_major = tf.reshape(
      tf.strided_slice(record_bytes, [label_bytes],
                       [label_bytes + image_bytes]),
      [result.depth, result.height, result.width])
  # Convert from [depth, height, width] to [height, width, depth].
  result.uint8image = tf.transpose(depth_major, [1, 2, 0])

  return result


def _distorted_inputs(data_dir, batch_size):
  """Construct distorted input for CIFAR training using the Reader ops.

  Args:
    data_dir: Path to the CIFAR-10 data directory.
    batch_size: Number of images per batch.

  Returns:
    images: Images. 4D tensor of [batch_size, IMAGE_SIZE, IMAGE_SIZE, 3] size.
    labels: Labels. 1D tensor of [batch_size] size.
  """
  filenames = [os.path.join(data_dir, 'data_batch_%d.bin' % i)
               for i in xrange(1, 6)]
  for f in filenames:
    if not tf.gfile.Exists(f):
      raise ValueError('Failed to find file: ' + f)

  # Create a queue that produces the filenames to read.
  filename_queue = tf.train.string_input_producer(filenames)

  # Read examples from files in the filename queue.
  read_input = read_cifar10(filename_queue)
  reshaped_image = tf.cast(read_input.uint8image, tf.float32)

  height = IMAGE_SIZE
  width = IMAGE_SIZE

  # Image processing for training the network. Note the many random
  # distortions applied to the image.

  # Randomly crop a [height, width] section of the image.
  distorted_image = tf.random_crop(reshaped_image, [height, width, 3])

  # Randomly flip the image horizontally.
  distorted_image = tf.image.random_flip_left_right(distorted_image)

  # Because these operations are not commutative, consider randomizing
  # the order their operation.
  # NOTE: since per_image_standardization zeros the mean and makes
  # the stddev unit, this likely has no effect see tensorflow#1458.
  distorted_image = tf.image.random_brightness(distorted_image,
                                               max_delta=63)
  distorted_image = tf.image.random_contrast(distorted_image,
                                             lower=0.2, upper=1.8)

  # Subtract off the mean and divide by the variance of the pixels.
  float_image = tf.image.per_image_standardization(distorted_image)

  # Set the shapes of tensors.
  float_image.set_shape([height, width, 3])
  read_input.label.set_shape([1])

  # Ensure that the random shuffling has good mixing properties.
  min_fraction_of_examples_in_queue = 0.4
  min_queue_examples = int(NUM_EXAMPLES_PER_EPOCH_FOR_TRAIN *
                           min_fraction_of_examples_in_queue)
  print ('Filling queue with %d CIFAR images before starting to train. '
         'This will take a few minutes.' % min_queue_examples)

  # Generate a batch of images and labels by building up a queue of examples.
  return _generate_image_and_label_batch(float_image, read_input.label,
                                         min_queue_examples, batch_size,
                                         shuffle=True)


def inputs(eval_data):
  """Construct input for CIFAR evaluation using the Reader ops.

  Args:
    eval_data: bool, indicating if one should use the train or eval data set.

  Returns:
    images: Images. 4D tensor of [batch_size, IMAGE_SIZE, IMAGE_SIZE, 3] size.
    labels: Labels. 1D tensor of [batch_size] size.

  Raises:
    ValueError: If no data_dir
  """
  if not FLAGS.data_dir:
    raise ValueError('Please supply a data_dir')
  data_dir = os.path.join(FLAGS.data_dir, 'cifar-10-batches-bin')
  images, labels = _inputs(eval_data=eval_data,
                           data_dir=data_dir,
                           batch_size=FLAGS.batch_size)
  return images, labels


def _inputs(eval_data, data_dir, batch_size):
  """Construct input for CIFAR evaluation using the Reader ops.

  Args:
    eval_data: bool, indicating if one should use the train or eval data set.
    data_dir: Path to the CIFAR-10 data directory.
    batch_size: Number of images per batch.

  Returns:
    images: Images. 4D tensor of [batch_size, IMAGE_SIZE, IMAGE_SIZE, 3] size.
    labels: Labels. 1D tensor of [batch_size] size.
  """
  if not eval_data:
    filenames = [os.path.join(data_dir, 'data_batch_%d.bin' % i)
                 for i in xrange(1, 6)]
    num_examples_per_epoch = NUM_EXAMPLES_PER_EPOCH_FOR_TRAIN
  else:
    filenames = [os.path.join(data_dir, 'test_batch.bin')]
    num_examples_per_epoch = NUM_EXAMPLES_PER_EPOCH_FOR_EVAL

  for f in filenames:
    if not tf.gfile.Exists(f):
      raise ValueError('Failed to find file: ' + f)

  # Create a queue that produces the filenames to read.
  filename_queue = tf.train.string_input_producer(filenames)

  # Read examples from files in the filename queue.
  read_input = read_cifar10(filename_queue)
  reshaped_image = tf.cast(read_input.uint8image, tf.float32)

  height = IMAGE_SIZE
  width = IMAGE_SIZE

  # Image processing for evaluation.
  # Crop the central [height, width] of the image.
  resized_image = tf.image.resize_image_with_crop_or_pad(reshaped_image,
                                                         height, width)

  # Subtract off the mean and divide by the variance of the pixels.
  float_image = tf.image.per_image_standardization(resized_image)

  # Set the shapes of tensors.
  float_image.set_shape([height, width, 3])
  read_input.label.set_shape([1])

  # Ensure that the random shuffling has good mixing properties.
  min_fraction_of_examples_in_queue = 0.4
  min_queue_examples = int(num_examples_per_epoch *
                           min_fraction_of_examples_in_queue)

  # Generate a batch of images and labels by building up a queue of examples.
  return _generate_image_and_label_batch(float_image, read_input.label,
                                         min_queue_examples, batch_size,
                                         shuffle=False)


def maybe_download_and_extract():
  """Download and extract the tarball from Alex's website."""
  dest_directory = FLAGS.data_dir
  if not os.path.exists(dest_directory):
    os.makedirs(dest_directory)
  filename = DATA_URL.split('/')[-1]
  filepath = os.path.join(dest_directory, filename)
  if not os.path.exists(filepath):
    def _progress(count, block_size, total_size):
      sys.stdout.write('\r>> Downloading %s %.1f%%' % (filename,
          float(count * block_size) / float(total_size) * 100.0))
      sys.stdout.flush()
    filepath, _ = urllib.request.urlretrieve(DATA_URL, filepath, _progress)
    print()
    statinfo = os.stat(filepath)
    print('Successfully downloaded', filename, statinfo.st_size, 'bytes.')
  extracted_dir_path = os.path.join(dest_directory, 'cifar-10-batches-bin')
  if not os.path.exists(extracted_dir_path):
    tarfile.open(filepath, 'r:gz').extractall(dest_directory)


def _generate_image_and_label_batch(image, label, min_queue_examples,
                                    batch_size, shuffle):
  """Construct a queued batch of images and labels.

  Args:
    image: 3-D Tensor of [height, width, 3] of type.float32.
    label: 1-D Tensor of type.int32
    min_queue_examples: int32, minimum number of samples to retain
      in the queue that provides of batches of examples.
    batch_size: Number of images per batch.
    shuffle: boolean indicating whether to use a shuffling queue.

  Returns:
    images: Images. 4D tensor of [batch_size, height, width, 3] size.
    labels: Labels. 1D tensor of [batch_size] size.
  """
  # Create a queue that shuffles the examples, and then
  # read 'batch_size' images + labels from the example queue.
  num_preprocess_threads = 16
  if shuffle:
    images, label_batch = tf.train.shuffle_batch(
        [image, label],
        batch_size=batch_size,
        num_threads=num_preprocess_threads,
        capacity=min_queue_examples + 3 * batch_size,
        min_after_dequeue=min_queue_examples)
  else:
    images, label_batch = tf.train.batch(
        [image, label],
        batch_size=batch_size,
        num_threads=num_preprocess_threads,
        capacity=min_queue_examples + 3 * batch_size)

  # Display the training images in the visualizer.
  tf.summary.image('images', images)

  return images, tf.reshape(label_batch, [batch_size])


================================================ FILE: ch9/cifar10_multi_gpu_train.py ================================================

# Copyright 2015 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================

"""A binary to train CIFAR-10 using multiple GPUs with synchronous updates.

Accuracy:
cifar10_multi_gpu_train.py achieves ~86% accuracy after 100K steps (256
epochs of data) as judged by cifar10_eval.py.

Speed: With batch_size 128.
System        | Step Time (sec/batch)  |     Accuracy
--------------------------------------------------------------------
1 Tesla K20m  | 0.35-0.60              | ~86% at 60K steps  (5 hours)
1 Tesla K40m  | 0.25-0.35              | ~86% at 100K steps (4 hours)
2 Tesla K20m  | 0.13-0.20              | ~84% at 30K steps  (2.5 hours)
3 Tesla K20m  | 0.13-0.18              | ~84% at 30K steps
4 Tesla K20m  | ~0.10                  | ~84% at 30K steps

Usage:
Please see the tutorial and website for how to download the CIFAR-10
data set, compile the program and train the model.

http://tensorflow.org/tutorials/deep_cnn/
"""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

from datetime import datetime
import os.path
import shutil
import re
import time

import numpy as np
from six.moves import xrange  # pylint: disable=redefined-builtin
import tensorflow as tf
import cifar10

FLAGS = tf.app.flags.FLAGS

tf.app.flags.DEFINE_string('train_dir', '/tmp/cifar10_train',
                           """Directory where to write event logs """
                           """and checkpoint.""")
tf.app.flags.DEFINE_integer('max_steps', 1000000,
                            """Number of batches to run.""")
tf.app.flags.DEFINE_integer('num_gpus', 1,
                            """How many GPUs to use.""")
tf.app.flags.DEFINE_boolean('log_device_placement', False,
                            """Whether to log device placement.""")


def _activation_summary(x):
  """Helper to create summaries for activations.

  Creates a summary that provides a histogram of activations.
  Creates a summary that measures the sparsity of activations.

  Args:
    x: Tensor
  Returns:
    nothing
  """
  # Remove 'tower_[0-9]/' from the name in case this is a multi-GPU training
  # session. This helps the clarity of presentation on tensorboard.
  tensor_name = re.sub('%s_[0-9]*/' % cifar10.TOWER_NAME, '', x.op.name)
  tf.summary.histogram(tensor_name + '/activations', x)
  tf.summary.scalar(tensor_name + '/sparsity',
                    tf.nn.zero_fraction(x))


def _variable_on_cpu(name, shape, initializer):
  """Helper to create a Variable stored on CPU memory.
  Args:
    name: name of the variable
    shape: list of ints
    initializer: initializer for Variable

  Returns:
    Variable Tensor
  """
  with tf.device('/cpu:0'):
    var = tf.get_variable(name, shape, initializer=initializer,
                          dtype=tf.float32)
  return var


def _variable_with_weight_decay(name, shape, stddev, wd):
  """Helper to create an initialized Variable with weight decay.

  Note that the Variable is initialized with a truncated normal
  distribution.  A weight decay is added only if one is specified.

  Args:
    name: name of the variable
    shape: list of ints
    stddev: standard deviation of a truncated Gaussian
    wd: add L2Loss weight decay multiplied by this float. If None, weight
        decay is not added for this Variable.

  Returns:
    Variable Tensor
  """
  var = _variable_on_cpu(
      name,
      shape,
      tf.truncated_normal_initializer(stddev=stddev, dtype=tf.float32))
  if wd is not None:
    # L2 penalty is collected into the 'losses' collection and summed into
    # the total loss later (see loss()).
    weight_decay = tf.multiply(tf.nn.l2_loss(var), wd, name='weight_loss')
    tf.add_to_collection('losses', weight_decay)
  return var


def inference(images):
  """Build the CIFAR-10 model.

  Args:
    images: Images returned from distorted_inputs() or inputs().

  Returns:
    Logits.
  """
  # We instantiate all variables using tf.get_variable() instead of
  # tf.Variable() in order to share variables across multiple GPU training runs.
  # If we only ran this model on a single GPU, we could simplify this function
  # by replacing all instances of tf.get_variable() with tf.Variable().
# # conv1 with tf.variable_scope('conv1') as scope: kernel = _variable_with_weight_decay('weights', shape=[5, 5, 3, 64], stddev=5e-2, wd=0.0) conv = tf.nn.conv2d(images, kernel, [1, 1, 1, 1], padding='SAME') biases = _variable_on_cpu('biases', [64], tf.constant_initializer(0.0)) pre_activation = tf.nn.bias_add(conv, biases) conv1 = tf.nn.relu(pre_activation, name=scope.name) _activation_summary(conv1) # pool1 pool1 = tf.nn.max_pool(conv1, ksize=[1, 3, 3, 1], strides=[1, 2, 2, 1], padding='SAME', name='pool1') # norm1 norm1 = tf.nn.lrn(pool1, 4, bias=1.0, alpha=0.001 / 9.0, beta=0.75, name='norm1') # conv2 with tf.variable_scope('conv2') as scope: kernel = _variable_with_weight_decay('weights', shape=[5, 5, 64, 64], stddev=5e-2, wd=0.0) conv = tf.nn.conv2d(norm1, kernel, [1, 1, 1, 1], padding='SAME') biases = _variable_on_cpu('biases', [64], tf.constant_initializer(0.1)) pre_activation = tf.nn.bias_add(conv, biases) conv2 = tf.nn.relu(pre_activation, name=scope.name) _activation_summary(conv2) # norm2 norm2 = tf.nn.lrn(conv2, 4, bias=1.0, alpha=0.001 / 9.0, beta=0.75, name='norm2') # pool2 pool2 = tf.nn.max_pool(norm2, ksize=[1, 3, 3, 1], strides=[1, 2, 2, 1], padding='SAME', name='pool2') # local3 with tf.variable_scope('local3') as scope: # Move everything into depth so we can perform a single matrix multiply. 
    reshape = tf.reshape(pool2, [FLAGS.batch_size, -1])
    dim = reshape.get_shape()[1].value
    weights = _variable_with_weight_decay('weights', shape=[dim, 384],
                                          stddev=0.04, wd=0.004)
    biases = _variable_on_cpu('biases', [384], tf.constant_initializer(0.1))
    local3 = tf.nn.relu(tf.matmul(reshape, weights) + biases, name=scope.name)
    _activation_summary(local3)

  # local4
  with tf.variable_scope('local4') as scope:
    weights = _variable_with_weight_decay('weights', shape=[384, 192],
                                          stddev=0.04, wd=0.004)
    biases = _variable_on_cpu('biases', [192], tf.constant_initializer(0.1))
    local4 = tf.nn.relu(tf.matmul(local3, weights) + biases, name=scope.name)
    _activation_summary(local4)

  # linear layer(WX + b),
  # We don't apply softmax here because
  # tf.nn.sparse_softmax_cross_entropy_with_logits accepts the unscaled logits
  # and performs the softmax internally for efficiency.
  with tf.variable_scope('softmax_linear') as scope:
    weights = _variable_with_weight_decay('weights',
                                          [192, cifar10.NUM_CLASSES],
                                          stddev=1/192.0, wd=0.0)
    biases = _variable_on_cpu('biases', [cifar10.NUM_CLASSES],
                              tf.constant_initializer(0.0))
    softmax_linear = tf.add(tf.matmul(local4, weights), biases,
                            name=scope.name)
    _activation_summary(softmax_linear)

  return softmax_linear


def loss(logits, labels):
  """Add L2Loss to all the trainable variables.

  Add summary for "Loss" and "Loss/avg".
  Args:
    logits: Logits from inference().
    labels: Labels from distorted_inputs or inputs(). 1-D tensor
            of shape [batch_size]

  Returns:
    Loss tensor of type float.
  """
  # Calculate the average cross entropy loss across the batch.
  labels = tf.cast(labels, tf.int64)
  cross_entropy = tf.nn.sparse_softmax_cross_entropy_with_logits(
      labels=labels, logits=logits, name='cross_entropy_per_example')
  cross_entropy_mean = tf.reduce_mean(cross_entropy, name='cross_entropy')
  tf.add_to_collection('losses', cross_entropy_mean)

  # The total loss is defined as the cross entropy loss plus all of the weight
  # decay terms (L2 loss).
  return tf.add_n(tf.get_collection('losses'), name='total_loss')


def tower_loss(scope, images, labels):
  """Calculate the total loss on a single tower running the CIFAR model.

  Args:
    scope: unique prefix string identifying the CIFAR tower, e.g. 'tower_0'
    images: Images. 4D tensor of shape [batch_size, height, width, 3].
    labels: Labels. 1D tensor of shape [batch_size].

  Returns:
    Tensor of shape [] containing the total loss for a batch of data
  """
  # Build inference Graph.
  logits = inference(images)

  # Build the portion of the Graph calculating the losses. Note that we will
  # assemble the total_loss using a custom function below.
  _ = loss(logits, labels)

  # Assemble all of the losses for the current tower only.
  losses = tf.get_collection('losses', scope)

  # Calculate the total loss for the current tower.
  total_loss = tf.add_n(losses, name='total_loss')

  # Attach a scalar summary to all individual losses and the total
  # loss; do the same for the averaged version of the losses.
  for l in losses + [total_loss]:
    # Remove 'tower_[0-9]/' from the name in case this is a multi-GPU
    # training session. This helps the clarity of presentation on
    # tensorboard.
    loss_name = re.sub('%s_[0-9]*/' % cifar10.TOWER_NAME, '', l.op.name)
    tf.summary.scalar(loss_name, l)

  return total_loss


def average_gradients(tower_grads):
  """Calculate the average gradient for each shared variable across all towers.

  Note that this function provides a synchronization point across all towers.

  Args:
    tower_grads: List of lists of (gradient, variable) tuples. The outer list
      is over individual gradients. The inner list is over the gradient
      calculation for each tower.

  Returns:
    List of pairs of (gradient, variable) where the gradient has been
    averaged across all towers.
  """
  average_grads = []
  for grad_and_vars in zip(*tower_grads):
    # Note that each grad_and_vars looks like the following:
    #   ((grad0_gpu0, var0_gpu0), ...
, (grad0_gpuN, var0_gpuN)) grads = [] for g, _ in grad_and_vars: # Add 0 dimension to the gradients to represent the tower. expanded_g = tf.expand_dims(g, 0) # Append on a 'tower' dimension which we will average over below. grads.append(expanded_g) # Average over the 'tower' dimension. grad = tf.concat(axis=0, values=grads) grad = tf.reduce_mean(grad, 0) # Keep in mind that the Variables are redundant because they are shared # across towers. So .. we will just return the first tower's pointer to # the Variable. v = grad_and_vars[0][1] grad_and_var = (grad, v) average_grads.append(grad_and_var) return average_grads def train(): """Train CIFAR-10 for a number of steps.""" with tf.Graph().as_default(), tf.device('/cpu:0'): # Create a variable to count the number of train() calls. This equals the # number of batches processed * FLAGS.num_gpus. global_step = tf.get_variable( 'global_step', [], initializer=tf.constant_initializer(0), trainable=False) # Calculate the learning rate schedule. num_batches_per_epoch = (cifar10.NUM_EXAMPLES_PER_EPOCH_FOR_TRAIN / FLAGS.batch_size) decay_steps = int(num_batches_per_epoch * cifar10.NUM_EPOCHS_PER_DECAY) # Decay the learning rate exponentially based on the number of steps. lr = tf.train.exponential_decay(cifar10.INITIAL_LEARNING_RATE, global_step, decay_steps, cifar10.LEARNING_RATE_DECAY_FACTOR, staircase=True) # Create an optimizer that performs gradient descent. opt = tf.train.GradientDescentOptimizer(lr) # Get images and labels for CIFAR-10. images, labels = cifar10.distorted_inputs() batch_queue = tf.contrib.slim.prefetch_queue.prefetch_queue( [images, labels], capacity=2 * FLAGS.num_gpus) # Calculate the gradients for each model tower. 
tower_grads = [] with tf.variable_scope(tf.get_variable_scope()): for i in xrange(FLAGS.num_gpus): with tf.device('/gpu:%d' % i): with tf.name_scope('%s_%d' % (cifar10.TOWER_NAME, i)) as scope: # Dequeues one batch for the GPU image_batch, label_batch = batch_queue.dequeue() # Calculate the loss for one tower of the CIFAR model. This function # constructs the entire CIFAR model but shares the variables across # all towers. loss = tower_loss(scope, image_batch, label_batch) # Reuse variables for the next tower. tf.get_variable_scope().reuse_variables() # Retain the summaries from the final tower. summaries = tf.get_collection(tf.GraphKeys.SUMMARIES, scope) # Calculate the gradients for the batch of data on this CIFAR tower. grads = opt.compute_gradients(loss) # Keep track of the gradients across all towers. tower_grads.append(grads) # We must calculate the mean of each gradient. Note that this is the # synchronization point across all towers. grads = average_gradients(tower_grads) # Add a summary to track the learning rate. summaries.append(tf.summary.scalar('learning_rate', lr)) # Add histograms for gradients. for grad, var in grads: if grad is not None: summaries.append(tf.summary.histogram(var.op.name + '/gradients', grad)) # Apply the gradients to adjust the shared variables. apply_gradient_op = opt.apply_gradients(grads, global_step=global_step) # Add histograms for trainable variables. for var in tf.trainable_variables(): summaries.append(tf.summary.histogram(var.op.name, var)) # Track the moving averages of all trainable variables. variable_averages = tf.train.ExponentialMovingAverage( cifar10.MOVING_AVERAGE_DECAY, global_step) variables_averages_op = variable_averages.apply(tf.trainable_variables()) # Group all updates to into a single train op. train_op = tf.group(apply_gradient_op, variables_averages_op) # Create a saver. saver = tf.train.Saver(tf.global_variables()) # Build the summary operation from the last tower summaries. 
summary_op = tf.summary.merge(summaries) # Build an initialization operation to run below. init = tf.global_variables_initializer() # Start running operations on the Graph. allow_soft_placement must be set to # True to build towers on GPU, as some of the ops do not have GPU # implementations. sess = tf.Session(config=tf.ConfigProto( allow_soft_placement=True, log_device_placement=FLAGS.log_device_placement)) sess.run(init) # Start the queue runners. tf.train.start_queue_runners(sess=sess) summary_writer = tf.summary.FileWriter(FLAGS.train_dir, sess.graph) for step in xrange(FLAGS.max_steps): start_time = time.time() _, loss_value = sess.run([train_op, loss]) duration = time.time() - start_time assert not np.isnan(loss_value), 'Model diverged with loss = NaN' if step % 10 == 0: num_examples_per_step = FLAGS.batch_size * FLAGS.num_gpus examples_per_sec = num_examples_per_step / duration sec_per_batch = duration / FLAGS.num_gpus format_str = ('%s: step %d, loss = %.2f (%.1f examples/sec; %.3f ' 'sec/batch)') print (format_str % (datetime.now(), step, loss_value, examples_per_sec, sec_per_batch)) if step % 100 == 0: summary_str = sess.run(summary_op) summary_writer.add_summary(summary_str, step) # Save the model checkpoint periodically. if step % 1000 == 0 or (step + 1) == FLAGS.max_steps: checkpoint_path = os.path.join(FLAGS.train_dir, 'model.ckpt') saver.save(sess, checkpoint_path, global_step=step) def main(argv=None): # pylint: disable=unused-argument cifar10.maybe_download_and_extract() if os.path.exists(FLAGS.train_dir): shutil.rmtree(FLAGS.train_dir) os.makedirs(FLAGS.train_dir) train() if __name__ == '__main__': main()