Repository: bjpcjp/scikit-and-tensorflow-workbooks Branch: master Commit: b0f3c6e08d6a Files: 32 Total size: 13.5 MB Directory structure: gitextract_w4nw2wrh/ ├── README.md ├── _config.yml ├── ch02 - cal housing analysis.html ├── ch02 - cal housing analysis.ipynb ├── ch03-classification.html ├── ch03-classification.ipynb ├── ch04-training-models.html ├── ch04-training-models.ipynb ├── ch05-support-vector-machines.html ├── ch05-support-vector-machines.ipynb ├── ch06-decision-trees.html ├── ch06-decision-trees.ipynb ├── ch07-ensemble-learning.html ├── ch07-ensemble-learning.ipynb ├── ch08-dimensionality-reduction.html ├── ch08-dimensionality-reduction.ipynb ├── ch09-tensorflow-setup.html ├── ch09-tensorflow-setup.ipynb ├── ch10-neural-nets.html ├── ch10-neural-nets.ipynb ├── ch11-DNN-training.html ├── ch11-DNN-training.ipynb ├── ch12-distributed-TF.ipynb ├── ch13-convolutional-NNs.html ├── ch13-convolutional-NNs.ipynb ├── ch14-Recurrent-NNs.html ├── ch14-Recurrent-NNs.ipynb ├── ch15-autoencoders.html ├── ch15-autoencoders.ipynb ├── ch16-reinforcement-learning.html ├── ch16-reinforcement-learning.ipynb └── index.md ================================================ FILE CONTENTS ================================================ ================================================ FILE: README.md ================================================  [ebook repo](https://github.com/ageron/handson-ml/blob/master/15_autoencoders.ipynb) # book chapters 1) Intro to Machine Learning 2) Example end-to-end Machine Learning project (California Housing dataset) 3) Basic Classification 4) Training Techniques 5) Support Vector Machines 6) Decision Trees 7) Ensemble Learning & Random Forests 8) Dimensionality Reduction 9) TensorFlow Installation & Checkout 10) TensorFlow & Neural Nets 11) TensorFlow Training 12) TensorFlow on Distributed Hardware 13) Convolutional Neural Nets 14) Recurrent Neural Nets 15) Autoencoders 16) Reinforcement Learning ================================================ 
FILE: _config.yml ================================================ theme: jekyll-theme-tactile ================================================ FILE: ch02 - cal housing analysis.html ================================================
import os
import tarfile
from six.moves import urllib
import pandas as pd
import numpy as np
# remote location of the compressed housing dataset (handson-ml GitHub repo)
DOWNLOAD_ROOT = "https://raw.githubusercontent.com/ageron/handson-ml/master/"
# local directory where the archive is downloaded/extracted
HOUSING_PATH = "datasets/housing"
HOUSING_URL = DOWNLOAD_ROOT + HOUSING_PATH + "/housing.tgz"
def fetch_housing_data(
        housing_url=HOUSING_URL,
        housing_path=HOUSING_PATH):
    """Download housing.tgz from housing_url and extract it into housing_path.

    Parameters
    ----------
    housing_url : str, URL of the .tgz archive to download.
    housing_path : str, local directory for the archive and extracted files
        (created if missing).

    Performs network and filesystem I/O; downloads on every call (no caching).
    """
    # exist_ok avoids the race between a separate isdir() check and makedirs()
    os.makedirs(housing_path, exist_ok=True)
    tgz_path = os.path.join(housing_path, "housing.tgz")
    # retrieve tarball to the local path
    urllib.request.urlretrieve(housing_url, tgz_path)
    # context manager guarantees the archive is closed even if extractall raises
    with tarfile.open(tgz_path) as housing_tgz:
        housing_tgz.extractall(path=housing_path)
def load_housing_data(
        housing_path=HOUSING_PATH):
    """Read housing.csv from *housing_path* and return it as a DataFrame."""
    csv_file = os.path.join(housing_path, "housing.csv")
    return pd.read_csv(csv_file)
# do it
#fetch_housing_data() -- already downloaded - static dataset
housing = load_housing_data()
housing.head()
# housing is a Pandas DataFrame.
# untouched datafile: 20640 records, 10 cols (9 float, 1 text)
housing.info()
# let's see if ocean_proximity can be lumped into categories:
housing['ocean_proximity'].value_counts()
# percentiles analysis of each feature
housing.describe()
# feature histograms
# (IPython/Jupyter magic: render plots inline in the notebook)
%matplotlib inline
import matplotlib.pyplot as plt
housing.hist(bins=50, figsize=(20,15))
# split dataset into training (80%) and test (20%) subsets
import numpy as np
def split_train_test(
        data, test_ratio):
    """Randomly partition *data* into (train, test) DataFrames.

    Relies on np.random.permutation, so the split changes between runs
    unless the global NumPy seed was fixed beforehand.
    """
    order = np.random.permutation(len(data))
    n_test = int(len(data) * test_ratio)
    test_idx = order[:n_test]
    train_idx = order[n_test:]
    return data.iloc[train_idx], data.iloc[test_idx]
# 80/20 random split (not yet stable across runs)
train_set, test_set = split_train_test(housing, 0.2)
print(len(train_set), "train +", len(test_set), "test")
# create method for ensuring consistent test sets across multiple runs
# (new test sets won't contain instances in previous training sets.)
# example method:
# compute hash of each instance
# keep only the last byte
# include instance in test set if value < 51 (20% of 256)
import hashlib
def test_set_check(
        identifier, test_ratio, hash):
    """Return True when *identifier* belongs in the test set.

    Hashes the id, keeps only the last digest byte, and compares it to
    test_ratio * 256 — so roughly test_ratio of all ids land in the test
    bucket.  NOTE: the *hash* parameter shadows the builtin of the same name.
    """
    last_byte = hash(np.int64(identifier)).digest()[-1]
    return last_byte < 256 * test_ratio
def split_train_test_by_id(
        data, test_ratio, id_column, hash=hashlib.md5):
    """Split *data* into (train, test) using a stable hash of *id_column*.

    Because membership depends only on each row's id, the split stays
    consistent across runs and as the dataset grows.
    """
    identifiers = data[id_column]
    is_test = identifiers.apply(
        lambda row_id: test_set_check(row_id, test_ratio, hash))
    return data.loc[~is_test], data.loc[is_test]
# housing dataset doesn't have ID attribute,
# so let's add an index to it.
housing_with_id = housing.reset_index()
train_set, test_set = split_train_test_by_id(
housing_with_id, 0.2, "index")
train_set.head()
test_set.head()
# a better index:
# let's use longitude & latitude to build stable identifier
# (row index breaks if the dataset is ever re-ordered or rows are deleted)
housing_with_id["id"] = housing["longitude"] * 1000 + housing["latitude"]
train_set, test_set = split_train_test_by_id(
housing_with_id, 0.2, "id")
train_set.head()
test_set.head()
# another option: scikit-learn splitters
from sklearn.model_selection import train_test_split
train_set, test_set = train_test_split(
housing, test_size=0.2, random_state=42)
test_set.head()
train_set.head()
# does sampling plan have a sampling bias?
# each strata in test dataset should mimic reality
housing['median_income'].hist(bins=5)
housing.describe()
# bucket median_income into categories 1..5 (everything >= 5 capped at 5.0)
housing["income_cat"] = np.ceil(housing["median_income"] / 1.5)
housing["income_cat"].where(housing["income_cat"] < 5, 5.0, inplace=True)
housing.describe()
# stratified 80/20 split so the test set mirrors the income-category mix
from sklearn.model_selection import StratifiedShuffleSplit
split = StratifiedShuffleSplit(
    n_splits=1, test_size=0.2, random_state=42)
for train_index, test_index in split.split(housing, housing["income_cat"]):
    strat_train_set = housing.loc[train_index]
    strat_test_set = housing.loc[test_index]
# review income category proportions
housing["income_cat"].value_counts() / len(housing)
# remove income_cat attribute (return dataset to original state)
# (loop variable renamed from `set`, which shadowed the builtin set())
for subset in (strat_train_set, strat_test_set):
    subset.drop(["income_cat"], axis=1, inplace=True)
housing = strat_train_set.copy()
# first: basic geographic distribution
housing.plot(kind="scatter", x="longitude", y="latitude", alpha=0.1)
# next: housing prices.
# color = price
# radius = population
# use predefined "jet" color map
housing.plot(
kind="scatter",
x="longitude",
y="latitude",
alpha=0.4,
#s=housing["population"].apply(lambda n: n/100),
s=housing["population"]/100,
label="population",
c="median_house_value",
cmap=plt.get_cmap("jet"),
colorbar=True,
)
plt.legend()
# next: look for correlations to median house value.
corr_matrix = housing.corr()
corr_matrix['median_house_value'].sort_values(ascending=False)
# another way of looking for correlations: scatter_matrix
# focus on top 3 factors from above
# NOTE(review): pandas.tools.plotting was removed in later pandas;
# modern code imports scatter_matrix from pandas.plotting — confirm version.
from pandas.tools.plotting import scatter_matrix
attributes = ["median_house_value", "median_income", "total_rooms",
"housing_median_age"]
scatter_matrix(housing[attributes], figsize=(12, 8))
# median house value to median income seems to be the most promising.
# let's zoom in.
housing.plot(
kind="scatter", x="median_income", y="median_house_value",
alpha=0.1)
# combine some attributes to create more useful ones
# then rebuild the correlation matrix.
housing["rooms_per_household"] = housing["total_rooms"]/housing["households"]
housing["bedrooms_per_room"] = housing["total_bedrooms"]/housing["total_rooms"]
housing["population_per_household"]=housing["population"]/housing["households"]
corr_matrix = housing.corr()
corr_matrix['median_house_value'].sort_values(ascending=False)
# *** NOTE: rooms_per_household corr (in book) show more improvement, ~0.199
# compared to our 0.146. Not sure of root cause yet. ***
# revert to clean copy of stratified training dataset
# separate predictors from labels
housing = strat_train_set.drop("median_house_value", axis=1)
housing_labels = strat_train_set["median_house_value"].copy()
# 'total bedrooms' has some missing values - fix
# can use DataFrame dropna(), drop(), fillna()
# use Scikit-Learn class to handle missing values
# NOTE(review): sklearn.preprocessing.Imputer was removed in scikit-learn 0.22;
# newer code uses sklearn.impute.SimpleImputer — confirm installed version.
from sklearn.preprocessing import Imputer
imputer = Imputer(strategy="median")
# drop ocean_proximity attribute, since it's non-numeric.
# then fit to training data.
housing_num = housing.drop("ocean_proximity", axis=1)
imputer.fit(housing_num)
# now what do we have? (the learned per-column medians)
imputer.statistics_
housing_num.median().values
# update training set by replacing missing values with learned medians
X = imputer.transform(housing_num)
pd.DataFrame(X, columns=housing_num.columns).info()
# convert ocean_proximity feature to numbers using LabelEncoder.
from sklearn.preprocessing import LabelEncoder
encoder = LabelEncoder()
housing_cat = housing['ocean_proximity']
housing_cat_encoded = encoder.fit_transform(housing_cat)
housing_cat_encoded
# how is 'ocean_proximity' mapped?
print(encoder.classes_)
# a better solution for categorical data: one-hot encoding
# (integer codes imply a spurious ordering; one-hot does not)
from sklearn.preprocessing import OneHotEncoder
encoder = OneHotEncoder()
# output = SciPy sparse matrix, better for memory usage
# if you need a dense NumPy array, call toarray()
housing_cat_1hot = encoder.fit_transform(housing_cat_encoded.reshape(-1,1))
housing_cat_1hot
# LabelBinarizer does the label-encode + one-hot steps in a single call
from sklearn.preprocessing import LabelBinarizer
encoder = LabelBinarizer()
housing_cat_1hot = encoder.fit_transform(housing_cat)
housing_cat_1hot
from sklearn.base import BaseEstimator, TransformerMixin

# column indices into the housing feature matrix (after the label is dropped)
rooms_ix, bedrooms_ix, population_ix, household_ix = 3, 4, 5, 6

class CombinedAttributesAdder(BaseEstimator, TransformerMixin):
    """Append engineered ratio features to a raw housing feature matrix.

    Always adds rooms_per_household and population_per_household;
    add_bedrooms_per_room additionally appends bedrooms_per_room.
    """
    def __init__(self, add_bedrooms_per_room=True):  # no *args or **kargs
        self.add_bedrooms_per_room = add_bedrooms_per_room

    def fit(self, X, y=None):
        # stateless transformer: nothing to learn
        return self

    def transform(self, X, y=None):
        rooms_per_household = X[:, rooms_ix] / X[:, household_ix]
        population_per_household = X[:, population_ix] / X[:, household_ix]
        if self.add_bedrooms_per_room:
            bedrooms_per_room = X[:, bedrooms_ix] / X[:, rooms_ix]
            return np.c_[X, rooms_per_household, population_per_household,
                         bedrooms_per_room]
        return np.c_[X, rooms_per_household, population_per_household]
# try the transformer without the optional bedrooms_per_room column
attr_adder = CombinedAttributesAdder(add_bedrooms_per_room=False)
housing_extra_attribs = attr_adder.transform(housing.values)
# "DataFrameSelector" is a custom transformer class.
# grabs the specified feature, drops the rest, converts the DF into a NumPy array.
from sklearn.base import BaseEstimator, TransformerMixin

class DataFrameSelector(BaseEstimator, TransformerMixin):
    """Select named DataFrame columns and return them as a NumPy array."""
    def __init__(self, attribute_names):
        self.attribute_names = attribute_names

    def fit(self, X, y=None):
        # stateless: nothing to learn
        return self

    def transform(self, X):
        selected = X[self.attribute_names]
        # .values converts the selected columns to a plain ndarray
        return selected.values
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.preprocessing import StandardScaler
# numeric vs categorical column names for the two sub-pipelines
num_attribs = list(housing_num)
cat_attribs = ['ocean_proximity']
# numeric branch: select cols -> impute medians -> add ratio features -> scale
num_pipeline = Pipeline([
('selector', DataFrameSelector(num_attribs)),
('imputer', Imputer(strategy="median")),
('attribs_adder', CombinedAttributesAdder()),
('std_scaler', StandardScaler()),
])
# categorical branch: select ocean_proximity -> one-hot encode
# NOTE(review): LabelBinarizer inside a Pipeline breaks on scikit-learn >= 0.19
# (its fit_transform takes a single argument) — confirm installed version.
cat_pipeline = Pipeline([
('selector', DataFrameSelector(cat_attribs)),
('label_binarizer', LabelBinarizer()),
])
# FeatureUnion concatenates the outputs of both branches column-wise
full_pipeline = FeatureUnion(transformer_list =[
('num_pipeline', num_pipeline),
('cat_pipeline', cat_pipeline)
])
# let's try it out:
housing_prepared = full_pipeline.fit_transform(housing)
housing_prepared
housing_prepared.shape
# let's start with a linear regression
from sklearn.linear_model import LinearRegression
lin_reg = LinearRegression()
lin_reg.fit(housing_prepared, housing_labels)
# first try. NOT very accurate.
some_data = housing.iloc[:5]
some_labels = housing_labels.iloc[:5]
some_data_prepared = full_pipeline.transform(some_data)
print ("predictions:\t", lin_reg.predict(some_data_prepared))
print ("labels:\t", list(some_labels))
# why? look at RMSE on whole training set.
from sklearn.metrics import mean_squared_error
housing_predictions = lin_reg.predict(housing_prepared)
lin_mse = mean_squared_error(housing_labels, housing_predictions)
lin_rmse = np.sqrt(lin_mse)
print ("typical prediction error:\t", lin_rmse)
# Hmmm. Not good. Underfit situation.
# Let's try a more powerful model, like a Decision Tree.
from sklearn.tree import DecisionTreeRegressor
tree_reg = DecisionTreeRegressor()
tree_reg.fit(housing_prepared, housing_labels)
# Zero error? No way... (evaluating on the training set hides overfitting)
housing_predictions = tree_reg.predict(housing_prepared)
tree_mse = mean_squared_error(housing_labels, housing_predictions)
tree_rmse = np.sqrt(tree_mse)
print ("typical prediction error:\t", tree_rmse)
# Use K-fold cross-validation
# Train & eval Decision Tree model against 10 splits of training dataset
# Returns 10 evaluation scores.
from sklearn.model_selection import cross_val_score
# scoring is a utility (higher = better), hence NEGATIVE mean squared error
scores = cross_val_score(
tree_reg,
housing_prepared,
housing_labels,
scoring="neg_mean_squared_error",
cv=10)
rmse_scores = np.sqrt(-scores)
def display_scores(scores):
    """Print the raw cross-validation scores plus their mean and std-dev."""
    for label, value in (
            ("Scores:", scores),
            ("Mean:", scores.mean()),
            ("Standard deviation:", scores.std())):
        print(label, value)
display_scores(rmse_scores)
# So, Decision Tree RMSE: mean ~71097, stdev 2165 (still sucks.)
# compare to earlier Linear Regression:
lin_scores = cross_val_score(
lin_reg,
housing_prepared,
housing_labels,
scoring="neg_mean_squared_error",
cv=10)
lin_rmse_scores = np.sqrt(-lin_scores)
display_scores(lin_rmse_scores)
# Yep, DT overfit is just about as bad. (RMSE mean 69052, stdev 2731)
# Let's try a RandomForest.
from sklearn.ensemble import RandomForestRegressor
forest_reg = RandomForestRegressor()
forest_reg.fit(housing_prepared, housing_labels)
forest_scores = cross_val_score(
forest_reg,
housing_prepared,
housing_labels,
scoring="neg_mean_squared_error",
cv=10)
forest_rmse_scores = np.sqrt(-forest_scores)
display_scores(forest_rmse_scores)
# OK, RandomForest is a little better.
# RMSE mean ~52495, stdev ~1569
# grid-search over RandomForest hyperparameters (5-fold CV per combination)
from sklearn.model_selection import GridSearchCV
param_grid = [
{'n_estimators': [3, 10, 30],
'max_features': [2, 4, 6, 8]},
{'bootstrap': [False], # bootstrap = True = default setting
'n_estimators': [3, 10],
'max_features': [2, 3, 4]},
]
forest_reg = RandomForestRegressor()
grid_search = GridSearchCV(
forest_reg,
param_grid,
cv=5,
scoring = 'neg_mean_squared_error')
grid_search.fit(housing_prepared, housing_labels)
# Best combination of parameters?
grid_search.best_params_
# Best estimator?
grid_search.best_estimator_
# Evaluation scores:
cvres = grid_search.cv_results_
# print the RMSE achieved by every tried hyperparameter combination
for mean_score, params in zip(cvres["mean_test_score"], cvres["params"]):
    print(np.sqrt(-mean_score), params)
# best solution:
# max_features = 6, n_estimators = 30 (RMSE ~49,960)
feature_importances = grid_search.best_estimator_.feature_importances_
feature_importances
# display feature "importance" scores next to their names:
extra_attribs = ["rooms_per_hhold", "pop_per_hhold", "bedrooms_per_room"]
cat_one_hot_attribs = list(encoder.classes_)
attributes = num_attribs + extra_attribs + cat_one_hot_attribs
sorted(zip(feature_importances, attributes), reverse=True)
# final evaluation on the held-out stratified test set
final_model = grid_search.best_estimator_
X_test = strat_test_set.drop("median_house_value", axis=1)
y_test = strat_test_set["median_house_value"].copy()
X_test_prepared = full_pipeline.transform(X_test)
final_predictions = final_model.predict(X_test_prepared)
final_mse = mean_squared_error(y_test, final_predictions)
final_rmse = np.sqrt(final_mse)
final_rmse
| \n", " | longitude | \n", "latitude | \n", "housing_median_age | \n", "total_rooms | \n", "total_bedrooms | \n", "population | \n", "households | \n", "median_income | \n", "median_house_value | \n", "ocean_proximity | \n", "
|---|---|---|---|---|---|---|---|---|---|---|
| 0 | \n", "-122.23 | \n", "37.88 | \n", "41.0 | \n", "880.0 | \n", "129.0 | \n", "322.0 | \n", "126.0 | \n", "8.3252 | \n", "452600.0 | \n", "NEAR BAY | \n", "
| 1 | \n", "-122.22 | \n", "37.86 | \n", "21.0 | \n", "7099.0 | \n", "1106.0 | \n", "2401.0 | \n", "1138.0 | \n", "8.3014 | \n", "358500.0 | \n", "NEAR BAY | \n", "
| 2 | \n", "-122.24 | \n", "37.85 | \n", "52.0 | \n", "1467.0 | \n", "190.0 | \n", "496.0 | \n", "177.0 | \n", "7.2574 | \n", "352100.0 | \n", "NEAR BAY | \n", "
| 3 | \n", "-122.25 | \n", "37.85 | \n", "52.0 | \n", "1274.0 | \n", "235.0 | \n", "558.0 | \n", "219.0 | \n", "5.6431 | \n", "341300.0 | \n", "NEAR BAY | \n", "
| 4 | \n", "-122.25 | \n", "37.85 | \n", "52.0 | \n", "1627.0 | \n", "280.0 | \n", "565.0 | \n", "259.0 | \n", "3.8462 | \n", "342200.0 | \n", "NEAR BAY | \n", "
| \n", " | longitude | \n", "latitude | \n", "housing_median_age | \n", "total_rooms | \n", "total_bedrooms | \n", "population | \n", "households | \n", "median_income | \n", "median_house_value | \n", "
|---|---|---|---|---|---|---|---|---|---|
| count | \n", "20640.000000 | \n", "20640.000000 | \n", "20640.000000 | \n", "20640.000000 | \n", "20433.000000 | \n", "20640.000000 | \n", "20640.000000 | \n", "20640.000000 | \n", "20640.000000 | \n", "
| mean | \n", "-119.569704 | \n", "35.631861 | \n", "28.639486 | \n", "2635.763081 | \n", "537.870553 | \n", "1425.476744 | \n", "499.539680 | \n", "3.870671 | \n", "206855.816909 | \n", "
| std | \n", "2.003532 | \n", "2.135952 | \n", "12.585558 | \n", "2181.615252 | \n", "421.385070 | \n", "1132.462122 | \n", "382.329753 | \n", "1.899822 | \n", "115395.615874 | \n", "
| min | \n", "-124.350000 | \n", "32.540000 | \n", "1.000000 | \n", "2.000000 | \n", "1.000000 | \n", "3.000000 | \n", "1.000000 | \n", "0.499900 | \n", "14999.000000 | \n", "
| 25% | \n", "-121.800000 | \n", "33.930000 | \n", "18.000000 | \n", "1447.750000 | \n", "296.000000 | \n", "787.000000 | \n", "280.000000 | \n", "2.563400 | \n", "119600.000000 | \n", "
| 50% | \n", "-118.490000 | \n", "34.260000 | \n", "29.000000 | \n", "2127.000000 | \n", "435.000000 | \n", "1166.000000 | \n", "409.000000 | \n", "3.534800 | \n", "179700.000000 | \n", "
| 75% | \n", "-118.010000 | \n", "37.710000 | \n", "37.000000 | \n", "3148.000000 | \n", "647.000000 | \n", "1725.000000 | \n", "605.000000 | \n", "4.743250 | \n", "264725.000000 | \n", "
| max | \n", "-114.310000 | \n", "41.950000 | \n", "52.000000 | \n", "39320.000000 | \n", "6445.000000 | \n", "35682.000000 | \n", "6082.000000 | \n", "15.000100 | \n", "500001.000000 | \n", "
| \n", " | index | \n", "longitude | \n", "latitude | \n", "housing_median_age | \n", "total_rooms | \n", "total_bedrooms | \n", "population | \n", "households | \n", "median_income | \n", "median_house_value | \n", "ocean_proximity | \n", "
|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | \n", "0 | \n", "-122.23 | \n", "37.88 | \n", "41.0 | \n", "880.0 | \n", "129.0 | \n", "322.0 | \n", "126.0 | \n", "8.3252 | \n", "452600.0 | \n", "NEAR BAY | \n", "
| 1 | \n", "1 | \n", "-122.22 | \n", "37.86 | \n", "21.0 | \n", "7099.0 | \n", "1106.0 | \n", "2401.0 | \n", "1138.0 | \n", "8.3014 | \n", "358500.0 | \n", "NEAR BAY | \n", "
| 2 | \n", "2 | \n", "-122.24 | \n", "37.85 | \n", "52.0 | \n", "1467.0 | \n", "190.0 | \n", "496.0 | \n", "177.0 | \n", "7.2574 | \n", "352100.0 | \n", "NEAR BAY | \n", "
| 3 | \n", "3 | \n", "-122.25 | \n", "37.85 | \n", "52.0 | \n", "1274.0 | \n", "235.0 | \n", "558.0 | \n", "219.0 | \n", "5.6431 | \n", "341300.0 | \n", "NEAR BAY | \n", "
| 6 | \n", "6 | \n", "-122.25 | \n", "37.84 | \n", "52.0 | \n", "2535.0 | \n", "489.0 | \n", "1094.0 | \n", "514.0 | \n", "3.6591 | \n", "299200.0 | \n", "NEAR BAY | \n", "
| \n", " | index | \n", "longitude | \n", "latitude | \n", "housing_median_age | \n", "total_rooms | \n", "total_bedrooms | \n", "population | \n", "households | \n", "median_income | \n", "median_house_value | \n", "ocean_proximity | \n", "
|---|---|---|---|---|---|---|---|---|---|---|---|
| 4 | \n", "4 | \n", "-122.25 | \n", "37.85 | \n", "52.0 | \n", "1627.0 | \n", "280.0 | \n", "565.0 | \n", "259.0 | \n", "3.8462 | \n", "342200.0 | \n", "NEAR BAY | \n", "
| 5 | \n", "5 | \n", "-122.25 | \n", "37.85 | \n", "52.0 | \n", "919.0 | \n", "213.0 | \n", "413.0 | \n", "193.0 | \n", "4.0368 | \n", "269700.0 | \n", "NEAR BAY | \n", "
| 11 | \n", "11 | \n", "-122.26 | \n", "37.85 | \n", "52.0 | \n", "3503.0 | \n", "752.0 | \n", "1504.0 | \n", "734.0 | \n", "3.2705 | \n", "241800.0 | \n", "NEAR BAY | \n", "
| 20 | \n", "20 | \n", "-122.27 | \n", "37.85 | \n", "40.0 | \n", "751.0 | \n", "184.0 | \n", "409.0 | \n", "166.0 | \n", "1.3578 | \n", "147500.0 | \n", "NEAR BAY | \n", "
| 23 | \n", "23 | \n", "-122.27 | \n", "37.84 | \n", "52.0 | \n", "1688.0 | \n", "337.0 | \n", "853.0 | \n", "325.0 | \n", "2.1806 | \n", "99700.0 | \n", "NEAR BAY | \n", "
| \n", " | index | \n", "longitude | \n", "latitude | \n", "housing_median_age | \n", "total_rooms | \n", "total_bedrooms | \n", "population | \n", "households | \n", "median_income | \n", "median_house_value | \n", "ocean_proximity | \n", "id | \n", "
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | \n", "0 | \n", "-122.23 | \n", "37.88 | \n", "41.0 | \n", "880.0 | \n", "129.0 | \n", "322.0 | \n", "126.0 | \n", "8.3252 | \n", "452600.0 | \n", "NEAR BAY | \n", "-122192.12 | \n", "
| 1 | \n", "1 | \n", "-122.22 | \n", "37.86 | \n", "21.0 | \n", "7099.0 | \n", "1106.0 | \n", "2401.0 | \n", "1138.0 | \n", "8.3014 | \n", "358500.0 | \n", "NEAR BAY | \n", "-122182.14 | \n", "
| 2 | \n", "2 | \n", "-122.24 | \n", "37.85 | \n", "52.0 | \n", "1467.0 | \n", "190.0 | \n", "496.0 | \n", "177.0 | \n", "7.2574 | \n", "352100.0 | \n", "NEAR BAY | \n", "-122202.15 | \n", "
| 3 | \n", "3 | \n", "-122.25 | \n", "37.85 | \n", "52.0 | \n", "1274.0 | \n", "235.0 | \n", "558.0 | \n", "219.0 | \n", "5.6431 | \n", "341300.0 | \n", "NEAR BAY | \n", "-122212.15 | \n", "
| 4 | \n", "4 | \n", "-122.25 | \n", "37.85 | \n", "52.0 | \n", "1627.0 | \n", "280.0 | \n", "565.0 | \n", "259.0 | \n", "3.8462 | \n", "342200.0 | \n", "NEAR BAY | \n", "-122212.15 | \n", "
| \n", " | index | \n", "longitude | \n", "latitude | \n", "housing_median_age | \n", "total_rooms | \n", "total_bedrooms | \n", "population | \n", "households | \n", "median_income | \n", "median_house_value | \n", "ocean_proximity | \n", "id | \n", "
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 8 | \n", "8 | \n", "-122.26 | \n", "37.84 | \n", "42.0 | \n", "2555.0 | \n", "665.0 | \n", "1206.0 | \n", "595.0 | \n", "2.0804 | \n", "226700.0 | \n", "NEAR BAY | \n", "-122222.16 | \n", "
| 10 | \n", "10 | \n", "-122.26 | \n", "37.85 | \n", "52.0 | \n", "2202.0 | \n", "434.0 | \n", "910.0 | \n", "402.0 | \n", "3.2031 | \n", "281500.0 | \n", "NEAR BAY | \n", "-122222.15 | \n", "
| 11 | \n", "11 | \n", "-122.26 | \n", "37.85 | \n", "52.0 | \n", "3503.0 | \n", "752.0 | \n", "1504.0 | \n", "734.0 | \n", "3.2705 | \n", "241800.0 | \n", "NEAR BAY | \n", "-122222.15 | \n", "
| 12 | \n", "12 | \n", "-122.26 | \n", "37.85 | \n", "52.0 | \n", "2491.0 | \n", "474.0 | \n", "1098.0 | \n", "468.0 | \n", "3.0750 | \n", "213500.0 | \n", "NEAR BAY | \n", "-122222.15 | \n", "
| 13 | \n", "13 | \n", "-122.26 | \n", "37.84 | \n", "52.0 | \n", "696.0 | \n", "191.0 | \n", "345.0 | \n", "174.0 | \n", "2.6736 | \n", "191300.0 | \n", "NEAR BAY | \n", "-122222.16 | \n", "
| \n", " | longitude | \n", "latitude | \n", "housing_median_age | \n", "total_rooms | \n", "total_bedrooms | \n", "population | \n", "households | \n", "median_income | \n", "median_house_value | \n", "ocean_proximity | \n", "
|---|---|---|---|---|---|---|---|---|---|---|
| 20046 | \n", "-119.01 | \n", "36.06 | \n", "25.0 | \n", "1505.0 | \n", "NaN | \n", "1392.0 | \n", "359.0 | \n", "1.6812 | \n", "47700.0 | \n", "INLAND | \n", "
| 3024 | \n", "-119.46 | \n", "35.14 | \n", "30.0 | \n", "2943.0 | \n", "NaN | \n", "1565.0 | \n", "584.0 | \n", "2.5313 | \n", "45800.0 | \n", "INLAND | \n", "
| 15663 | \n", "-122.44 | \n", "37.80 | \n", "52.0 | \n", "3830.0 | \n", "NaN | \n", "1310.0 | \n", "963.0 | \n", "3.4801 | \n", "500001.0 | \n", "NEAR BAY | \n", "
| 20484 | \n", "-118.72 | \n", "34.28 | \n", "17.0 | \n", "3051.0 | \n", "NaN | \n", "1705.0 | \n", "495.0 | \n", "5.7376 | \n", "218600.0 | \n", "<1H OCEAN | \n", "
| 9814 | \n", "-121.93 | \n", "36.62 | \n", "34.0 | \n", "2351.0 | \n", "NaN | \n", "1063.0 | \n", "428.0 | \n", "3.7250 | \n", "278000.0 | \n", "NEAR OCEAN | \n", "
| \n", " | longitude | \n", "latitude | \n", "housing_median_age | \n", "total_rooms | \n", "total_bedrooms | \n", "population | \n", "households | \n", "median_income | \n", "median_house_value | \n", "ocean_proximity | \n", "
|---|---|---|---|---|---|---|---|---|---|---|
| 14196 | \n", "-117.03 | \n", "32.71 | \n", "33.0 | \n", "3126.0 | \n", "627.0 | \n", "2300.0 | \n", "623.0 | \n", "3.2596 | \n", "103000.0 | \n", "NEAR OCEAN | \n", "
| 8267 | \n", "-118.16 | \n", "33.77 | \n", "49.0 | \n", "3382.0 | \n", "787.0 | \n", "1314.0 | \n", "756.0 | \n", "3.8125 | \n", "382100.0 | \n", "NEAR OCEAN | \n", "
| 17445 | \n", "-120.48 | \n", "34.66 | \n", "4.0 | \n", "1897.0 | \n", "331.0 | \n", "915.0 | \n", "336.0 | \n", "4.1563 | \n", "172600.0 | \n", "NEAR OCEAN | \n", "
| 14265 | \n", "-117.11 | \n", "32.69 | \n", "36.0 | \n", "1421.0 | \n", "367.0 | \n", "1418.0 | \n", "355.0 | \n", "1.9425 | \n", "93400.0 | \n", "NEAR OCEAN | \n", "
| 2271 | \n", "-119.80 | \n", "36.78 | \n", "43.0 | \n", "2382.0 | \n", "431.0 | \n", "874.0 | \n", "380.0 | \n", "3.5542 | \n", "96500.0 | \n", "INLAND | \n", "
| \n", " | longitude | \n", "latitude | \n", "housing_median_age | \n", "total_rooms | \n", "total_bedrooms | \n", "population | \n", "households | \n", "median_income | \n", "median_house_value | \n", "
|---|---|---|---|---|---|---|---|---|---|
| count | \n", "20640.000000 | \n", "20640.000000 | \n", "20640.000000 | \n", "20640.000000 | \n", "20433.000000 | \n", "20640.000000 | \n", "20640.000000 | \n", "20640.000000 | \n", "20640.000000 | \n", "
| mean | \n", "-119.569704 | \n", "35.631861 | \n", "28.639486 | \n", "2635.763081 | \n", "537.870553 | \n", "1425.476744 | \n", "499.539680 | \n", "3.870671 | \n", "206855.816909 | \n", "
| std | \n", "2.003532 | \n", "2.135952 | \n", "12.585558 | \n", "2181.615252 | \n", "421.385070 | \n", "1132.462122 | \n", "382.329753 | \n", "1.899822 | \n", "115395.615874 | \n", "
| min | \n", "-124.350000 | \n", "32.540000 | \n", "1.000000 | \n", "2.000000 | \n", "1.000000 | \n", "3.000000 | \n", "1.000000 | \n", "0.499900 | \n", "14999.000000 | \n", "
| 25% | \n", "-121.800000 | \n", "33.930000 | \n", "18.000000 | \n", "1447.750000 | \n", "296.000000 | \n", "787.000000 | \n", "280.000000 | \n", "2.563400 | \n", "119600.000000 | \n", "
| 50% | \n", "-118.490000 | \n", "34.260000 | \n", "29.000000 | \n", "2127.000000 | \n", "435.000000 | \n", "1166.000000 | \n", "409.000000 | \n", "3.534800 | \n", "179700.000000 | \n", "
| 75% | \n", "-118.010000 | \n", "37.710000 | \n", "37.000000 | \n", "3148.000000 | \n", "647.000000 | \n", "1725.000000 | \n", "605.000000 | \n", "4.743250 | \n", "264725.000000 | \n", "
| max | \n", "-114.310000 | \n", "41.950000 | \n", "52.000000 | \n", "39320.000000 | \n", "6445.000000 | \n", "35682.000000 | \n", "6082.000000 | \n", "15.000100 | \n", "500001.000000 | \n", "
| \n", " | longitude | \n", "latitude | \n", "housing_median_age | \n", "total_rooms | \n", "total_bedrooms | \n", "population | \n", "households | \n", "median_income | \n", "median_house_value | \n", "income_cat | \n", "
|---|---|---|---|---|---|---|---|---|---|---|
| count | \n", "20640.000000 | \n", "20640.000000 | \n", "20640.000000 | \n", "20640.000000 | \n", "20433.000000 | \n", "20640.000000 | \n", "20640.000000 | \n", "20640.000000 | \n", "20640.000000 | \n", "20640.000000 | \n", "
| mean | \n", "-119.569704 | \n", "35.631861 | \n", "28.639486 | \n", "2635.763081 | \n", "537.870553 | \n", "1425.476744 | \n", "499.539680 | \n", "3.870671 | \n", "206855.816909 | \n", "3.006686 | \n", "
| std | \n", "2.003532 | \n", "2.135952 | \n", "12.585558 | \n", "2181.615252 | \n", "421.385070 | \n", "1132.462122 | \n", "382.329753 | \n", "1.899822 | \n", "115395.615874 | \n", "1.054618 | \n", "
| min | \n", "-124.350000 | \n", "32.540000 | \n", "1.000000 | \n", "2.000000 | \n", "1.000000 | \n", "3.000000 | \n", "1.000000 | \n", "0.499900 | \n", "14999.000000 | \n", "1.000000 | \n", "
| 25% | \n", "-121.800000 | \n", "33.930000 | \n", "18.000000 | \n", "1447.750000 | \n", "296.000000 | \n", "787.000000 | \n", "280.000000 | \n", "2.563400 | \n", "119600.000000 | \n", "2.000000 | \n", "
| 50% | \n", "-118.490000 | \n", "34.260000 | \n", "29.000000 | \n", "2127.000000 | \n", "435.000000 | \n", "1166.000000 | \n", "409.000000 | \n", "3.534800 | \n", "179700.000000 | \n", "3.000000 | \n", "
| 75% | \n", "-118.010000 | \n", "37.710000 | \n", "37.000000 | \n", "3148.000000 | \n", "647.000000 | \n", "1725.000000 | \n", "605.000000 | \n", "4.743250 | \n", "264725.000000 | \n", "4.000000 | \n", "
| max | \n", "-114.310000 | \n", "41.950000 | \n", "52.000000 | \n", "39320.000000 | \n", "6445.000000 | \n", "35682.000000 | \n", "6082.000000 | \n", "15.000100 | \n", "500001.000000 | \n", "5.000000 | \n", "
# Alternative local file loader (due to mldata.org being down)
from scipy.io import loadmat
mnist_raw = loadmat("mnist-original.mat")
# reshape the .mat contents to mimic sklearn's fetch_mldata dict layout
mnist = {
"data": mnist_raw["data"].T,
"target": mnist_raw["label"][0],
"COL_NAMES": ["label", "data"],
"DESCR": "mldata.org dataset: mnist-original",
}
# 70K images, 28x28 pixels/image, each pixel = 0 (white) to 255 (black)
mnist # a dict object
# take a peek
X,y = mnist['data'], mnist['target']
X.shape, y.shape
# display example image
%matplotlib inline
import matplotlib
import matplotlib.pyplot as plt
some_digit = X[36000]
# each row is a flat 784-pixel vector; reshape to 28x28 for display
some_digit_image = some_digit.reshape(28, 28)
plt.imshow(
some_digit_image,
cmap = matplotlib.cm.binary,
interpolation="nearest")
plt.axis("off")
plt.show()
# looks like a "five". What's the corresponding label?
y[36000]
# dataset already split into training (1st 60K) & test (last 10K) images.
# shuffle training set for cross-validation quality
X_train, X_test, y_train, y_test = X[:60000], X[60000:], y[:60000], y[60000:]
import numpy as np
shuffle_index = np.random.permutation(60000)
X_train, y_train = X_train[shuffle_index], y_train[shuffle_index]
print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)
# Start by only trying to ID "five" digits.
y_train_5 = (y_train == 5) # create target vectors
y_test_5 = (y_test == 5)
print(y_train_5.shape, y_train_5)
print(y_test_5.shape, y_test_5)
# SGD classifier: good at handling large DBs
# also good at handling one-at-a-time learning
from sklearn.linear_model import SGDClassifier
sgd_clf = SGDClassifier(random_state=42)
sgd_clf.fit(X_train, y_train_5)
# did it correctly predict the "five" found above?
print(sgd_clf.predict([some_digit]))
# measure accuracy using K-fold (n=3) cross-validation scores
from sklearn.model_selection import cross_val_score
print(cross_val_score(
sgd_clf,
X_train,
y_train_5,
cv=3,
scoring="accuracy"))
# 90% accuracy = pretty easy when 90% of digits aren't fives to begin with ... :-|
# rolling your own cross-validation. Results should be similar-ish to above.
from sklearn.model_selection import StratifiedKFold
from sklearn.base import clone
# NOTE(review): newer scikit-learn raises if random_state is set without
# shuffle=True on StratifiedKFold — confirm installed version.
skfolds = StratifiedKFold(n_splits=3, random_state=42)
# manual stratified 3-fold CV: train a fresh clone per fold, print its accuracy
for train_index, test_index in skfolds.split(X_train, y_train_5):
    fold_clf = clone(sgd_clf)
    X_train_folds = X_train[train_index]
    y_train_folds = (y_train_5[train_index])
    X_test_fold = X_train[test_index]
    y_test_fold = (y_train_5[test_index])
    fold_clf.fit(X_train_folds, y_train_folds)
    predictions = fold_clf.predict(X_test_fold)
    # fraction of correct predictions on this fold
    print(sum(predictions == y_test_fold) / len(predictions))
# 95% accuracy sounds too good to be true. How about not-fives?
from sklearn.base import BaseEstimator

class Never5Classifier(BaseEstimator):
    """Trivial baseline: predicts "not a 5" (False) for every input row."""
    def fit(self, X, y=None):
        # nothing to learn
        pass

    def predict(self, X):
        # one False per input row
        return np.zeros((len(X), 1), dtype=bool)
never_5_clf = Never5Classifier()
print(cross_val_score(
never_5_clf,
X_train,
y_train_5,
cv=3,
scoring="accuracy"))
# only ~10% of images are "five", so ~90% of images are "not five".
# You SHOULD be right about 90% of the time. :-)
# Lesson Learned:
# Accuracy not a good metric for classifiers - esp those with skewed datasets.
# general idea: count #times instances of A are classified as B.
# first, need a set of predictions.
from sklearn.model_selection import cross_val_predict
# Generate cross-val'd predictions for each datapoint
y_train_pred = cross_val_predict(sgd_clf, X_train, y_train_5, cv=3)
# confusion matrix layout:
# ROWS = actual classes
# COLS = predicted classes
from sklearn.metrics import confusion_matrix
print(confusion_matrix(y_train_5, y_train_pred))

print(3841 / (3841+1515), 3841/(3841+1580))
# precision, recall, f1 metrics
# precision/recall tradeoff: increasing one reduces the other.
from sklearn.metrics import precision_score, recall_score, f1_score
print("precision:\n",precision_score(y_train_5, y_train_pred))
print("recall:\n",recall_score(y_train_5, y_train_pred))
# F1 score favors classifiers with similar precision & recall.
print("f1:\n",f1_score(y_train_5, y_train_pred))
# Scikit doesn't let you directly set threshold values (which drive the decision
# function for precision/recall.) But you can use the decision function itself.
y_scores = sgd_clf.decision_function([some_digit])
print(y_scores)
threshold = 0
y_some_digit_pred = (y_scores > threshold)
print(y_some_digit_pred)
# raising the threshold reduces recall...
threshold = 200000
y_some_digit_pred = (y_scores > threshold)
print(y_some_digit_pred)

# how to find the right threshold?
# start with getting decision scores instead of predictions.
y_scores = cross_val_predict(
sgd_clf,
X_train,
y_train_5,
cv=3,
method="decision_function")
# use results to build a precision/recall curve
from sklearn.metrics import precision_recall_curve
precisions, recalls, thresholds = precision_recall_curve(y_train_5, y_scores)
# plot the result
def plot_precision_recall_vs_threshold(precisions, recalls, thresholds):
    """Plot precision and recall as functions of the decision threshold."""
    # precision_recall_curve returns one more precision/recall value than
    # thresholds, so drop the last entry to align the arrays.
    plt.plot(thresholds, precisions[:-1], "b--", label="Precision")
    plt.plot(thresholds, recalls[:-1], "g-", label="Recall")
    plt.xlabel("Threshold")
    plt.legend(loc="upper left")
    plt.ylim([0, 1])

plot_precision_recall_vs_threshold(precisions, recalls, thresholds)
plt.show()
# plot precision vs recall to look for knee of the curve
def plot_precision_vs_recall(precisions, recalls):
    """Plot the precision/recall trade-off curve on the unit square."""
    plt.plot(recalls, precisions, "b-", linewidth=2)
    plt.xlabel("Recall", fontsize=16)
    plt.ylabel("Precision", fontsize=16)
    plt.axis([0, 1, 0, 1])

plt.figure(figsize=(10, 4))
plot_precision_vs_recall(precisions, recalls)
plt.show()

# assume you're targeting 90% precision:
# guesswork from precision-recall curve suggests setting threshold ~50000
y_train_pred_90 = (y_scores > 50000)
print(y_train_pred_90.shape, y_train_pred_90)
print("precision:\n", precision_score(y_train_5, y_train_pred_90))
print("recall:\n", recall_score(y_train_5, y_train_pred_90))
# ROC plots TRUE POSITIVE rate (TP = recall) vs FALSE POSITIVE rate. (FP = 1-specificity)
from sklearn.metrics import roc_curve
fpr, tpr, thresholds = roc_curve(y_train_5, y_scores)

def plot_roc_curve(fpr, tpr, label=None):
    """Draw an ROC curve plus the diagonal of a purely random classifier."""
    plt.plot(fpr, tpr, linewidth=2, label=label)
    plt.plot([0, 1], [0, 1], 'k--')  # random-guess baseline
    plt.axis([0, 1, 0, 1])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')

plot_roc_curve(fpr, tpr)
plt.show()

# tradeoff: higher recall (TP) => more false positives produced.
# dotted line = purely random classifier results.
# area under curve (AUC) metric:
#   perfect score = ROC AUC = 1.0
#   random score  = ROC AUC = 0.5
from sklearn.metrics import roc_auc_score
print(roc_auc_score(y_train_5, y_scores))
# train Random Forest classifier
# compare its ROC curve & AUC to SGD classifier
from sklearn.ensemble import RandomForestClassifier
forest_clf = RandomForestClassifier(random_state=42)
# Random Forest doesn't have decision_function(); use predict_proba() instead.
# returns array (row per instance, column per class)
y_probas_forest = cross_val_predict(
forest_clf,
X_train,
y_train_5,
cv=3,
method="predict_proba")
# To plot ROC curve, you need scores - not probabilities.
# use positive class probability as the score.
y_scores_forest = y_probas_forest[:, 1]
fpr_forest, tpr_forest, thresholds_forest = roc_curve(y_train_5,y_scores_forest)
# plot ROC curve
# (dotted blue line = the earlier SGD curve, overlaid for comparison)
plt.plot(fpr, tpr, "b:", label="SGD")
plot_roc_curve(fpr_forest, tpr_forest, "Random Forest")
plt.legend(loc="lower right")
plt.show()
# Random Forest curve looks much steeper (better). How's the ROC AUC score?
print(roc_auc_score(y_train_5, y_scores_forest))
# How's the precision & recall?
# (plain cross_val_predict returns hard class predictions here, not scores)
y_train_pred_forest = cross_val_predict(
forest_clf,
X_train,
y_train_5,
cv=3)
print(precision_score(y_train_5, y_train_pred_forest))
print(recall_score(y_train_5, y_train_pred_forest))
# some algorithms (RF, Bayes, ..) can handle multiple classes
# others (SVMs, linear, ...) cannot
# one-vs-all (OVA) strategy for 0-9 digit classication:
# 10 binary classifiers, one for each digit -- select class with highest score
# one-vs-one (OVO) strategy:
# train classifiers for every PAIR of digits -- N*(N-1)/2 classifiers needed!
# Scikit detects using binary classifier when multi-class problem is present,
# auto-selects OVA.
sgd_clf.fit(X_train, y_train)
print(sgd_clf.predict([some_digit])) # can SGD correctly predict the "five"?
# let's see 10 scores, one per class.
# highest score corresponds to "five".
some_digit_scores = sgd_clf.decision_function([some_digit])
print(some_digit_scores)
print(sgd_clf.classes_)
# to force Scikit to use OVO (in this case) or OVA: use corresponding classifier.
from sklearn.multiclass import OneVsOneClassifier
ovo_clf = OneVsOneClassifier(SGDClassifier(random_state=42))
ovo_clf.fit(X_train, y_train)
print("prediction:\n",ovo_clf.predict([some_digit]))
# same thing for Random Forest (RF can directly handle multiple classifications)
forest_clf.fit(X_train, y_train)
print("prediction via Random Forest:\n",forest_clf.predict([some_digit]))
print("probability via Random Forest:\n",forest_clf.predict_proba([some_digit]))
# let's check these classifiers via CV. SGD first.
print("CV score:\n",cross_val_score(
sgd_clf,
X_train,
y_train,
cv=3,
scoring="accuracy"))
# scaling the inputs should help improve the scores.
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
# cast to float64 before standardizing (pixel values arrive as integers)
X_train_scaled = scaler.fit_transform(X_train.astype(np.float64))
print("CV score, scaled inputs:\n",cross_val_score(
sgd_clf,
X_train_scaled,
y_train,
cv=3,
scoring="accuracy"))
# as earlier: a confusion matrix from the SGD classifier
y_train_pred = cross_val_predict(
sgd_clf,
X_train_scaled,
y_train,
cv=3)
conf_mx = confusion_matrix(y_train, y_train_pred)
print("confusion matrix:\n",conf_mx)
# image equivalent (bright diagonal = mostly correct predictions)
plt.matshow(conf_mx, cmap=plt.cm.gray)
plt.show()
# focus on errors.
# 1st: divide each value in confusion matrix by #images in corresponding class
# (compares error rates instead of #errors)
row_sums = conf_mx.sum(axis=1, keepdims=True)
norm_conf_mx = conf_mx / row_sums
# fill diagonals with zeroes to keep only the errors, and plot.
# brighter colors = more misclassifications
np.fill_diagonal(norm_conf_mx, 0)
plt.matshow(norm_conf_mx, cmap=plt.cm.gray)
plt.show()
# rows = actual classes
# cols = predicted classes
# 8s & 9s are a problem.
# more on analyzing individual errors
# EXTRA
def plot_digits(instances, images_per_row=10, **options):
    """Render a batch of flattened 28x28 digit images as one tiled figure."""
    size = 28
    images_per_row = min(len(instances), images_per_row)
    images = [instance.reshape(size, size) for instance in instances]
    # rows needed to hold every instance, then blank padding for the last row
    n_rows = (len(instances) - 1) // images_per_row + 1
    n_empty = n_rows * images_per_row - len(instances)
    images.append(np.zeros((size, size * n_empty)))
    row_images = [
        np.concatenate(images[r * images_per_row:(r + 1) * images_per_row], axis=1)
        for r in range(n_rows)
    ]
    image = np.concatenate(row_images, axis=0)
    plt.imshow(image, cmap=matplotlib.cm.binary, **options)
    plt.axis("off")

# 2x2 grid: actual 3s/5s vs predicted 3s/5s
cl_a, cl_b = 3, 5
X_aa = X_train[(y_train == cl_a) & (y_train_pred == cl_a)]
X_ab = X_train[(y_train == cl_a) & (y_train_pred == cl_b)]
X_ba = X_train[(y_train == cl_b) & (y_train_pred == cl_a)]
X_bb = X_train[(y_train == cl_b) & (y_train_pred == cl_b)]
plt.figure(figsize=(8, 8))
plt.subplot(221); plot_digits(X_aa[:25], images_per_row=5)
plt.subplot(222); plot_digits(X_ab[:25], images_per_row=5)
plt.subplot(223); plot_digits(X_ba[:25], images_per_row=5)
plt.subplot(224); plot_digits(X_bb[:25], images_per_row=5)
plt.show()
# shows difficulty in seeing difference between threes and fives.
# We used SGDclassifier, which is sensitive to image shifts/rotates.
# use case: returning multiple classes for each instance
# (example: multiple people's faces in one picture.)
# create y_multilabel array with 2 target labels for each digit image:
# first = large digit (7,8,9)?; second = odd (1,3,5,7,9)?
from sklearn.neighbors import KNeighborsClassifier
y_train_large = (y_train >= 7)
y_train_odd = (y_train % 2 == 1)
print("large nums?\n",y_train_large)
print("odd nums?\n",y_train_odd)
y_multilabel = np.c_[y_train_large, y_train_odd]
print("combined (multilabel)?\n",y_multilabel)
# KNeighbors classifier supports multilabeling
knn_clf = KNeighborsClassifier()
knn_clf.fit(X_train, y_multilabel)
# make example prediction using "some_digit" from above
# >= 7 = false (correct); odd digit = true (correct)
print("KNN prediction of some_digit: (>=7? odd?)\n",knn_clf.predict([some_digit]))
# another example: find avg F1 score across all labels
# NOTE(review): this passes y_train (the raw digit labels), not y_multilabel;
# presumably the multilabel targets were intended here -- verify.
y_train_knn_pred = cross_val_predict(knn_clf, X_train, y_train, cv=3)
print(f1_score(
y_train,
y_train_knn_pred,
average="macro")) # use "weighted" if more weight to be given to more common labels.
# generalization of multilabel, where each label can have multiple values.
# example: build image noise removal system
# start by adding noise to MNIST dataset
import numpy.random as rnd
noise = rnd.randint(0, 100, (len(X_train), 784))
X_train_mod = X_train + noise
noise = rnd.randint(0, 100, (len(X_test), 784))
X_test_mod = X_test + noise
# targets are the original clean images (multioutput regression-style labels)
y_train_mod = X_train
y_test_mod = X_test
some_index = 5500
def plot_digit(data):
    """Display a single flattened 784-pixel digit as a 28x28 grayscale image."""
    image = data.reshape(28, 28)
    plt.imshow(image, cmap=matplotlib.cm.binary, interpolation="nearest")
    plt.axis("off")

# train classifier, and clean up the image
knn_clf.fit(X_train_mod, y_train_mod)
clean_digit = knn_clf.predict([X_test_mod[some_index]])
plot_digit(clean_digit)

some_index = 5500
plt.subplot(121); plot_digit(X_test_mod[some_index])
plt.subplot(122); plot_digit(y_test_mod[some_index])
#save_fig("noisy_digit_example_plot")
plt.show()
# left: noisy image; right: cleaned up
# Linear model prediction: y_hat = theta^T . x, where theta^T is the parameter
# vector theta transposed (a row vector instead of a column vector).
# Training a model = finding the theta that minimizes an error function (e.g. MSE).

# generate some data: y = 4 + 3x + gaussian noise
import numpy as np
X = 2 * np.random.rand(100, 1)
y = 4 + 3 * X + np.random.randn(100, 1)
%matplotlib inline
import matplotlib.pyplot as plt
plt.scatter(X,y)
plt.show()
# find theta via the Normal Equation: theta = (X^T X)^-1 X^T y
# 1) use NumPy's matrix inverse function.
# 2) use dot method for matrix multiply.
X_b = np.c_[np.ones((100, 1)), X] # add x0 = 1 to each instance
theta_best = np.linalg.inv(X_b.T.dot(X_b)).dot(X_b.T).dot(y)
# results:
print(theta_best) # compare to generated data: y = 4 + 3x + noise
# make some predictions
X_new = np.array([[0],[1],[2]])
X_new_b = np.c_[np.ones((3, 1)), X_new] # add x0 = 1 to each instance
y_predict = X_new_b.dot(theta_best)
print(y_predict)
# then plot
plt.plot(X_new, y_predict, "r-")
plt.plot(X, y, "b.")
plt.axis([0, 2, 0, 15])
plt.show()
# Scikit equivalent
from sklearn.linear_model import LinearRegression
lin_reg = LinearRegression()
lin_reg.fit(X,y)
print("intercept & coefficient:\n", lin_reg.intercept_, lin_reg.coef_)
print("predictions:\n", lin_reg.predict(X_new))
# Gradient Descent - Batch
# (Batch: math includes full training set X.)
# need to find partial derivative (slope) of the cost function
# for each model parameter (theta).
theta_path_bgd = []
eta = 0.1           # learning rate
n_iterations = 1000
m = 100             # number of training instances
theta = np.random.randn(2, 1)  # random initialization

for iteration in range(n_iterations):
    # gradient of the MSE cost, computed over the whole training set
    gradients = 2 / m * X_b.T.dot(X_b.dot(theta) - y)
    theta = theta - eta * gradients
    theta_path_bgd.append(theta)  # record the path for plotting later
print(theta)
# Gradient Descent - Stochastic
# Stochastic: finds gradients based on random instances
# adv: better for huge datasets
# dis: much more erratic than batch GD
#   -- good for avoiding local minima
#   -- bad b/c may not find optimum sol'n
# simulated annealing helps. (gradually reduces learning rate)
theta_path_sgd = []
n_epochs, t0, t1 = 50, 5, 50  # learning schedule hyperparameters

def learning_schedule(t):
    """Decaying learning rate: starts at t0/t1 and shrinks as t grows."""
    return t0 / (t + t1)

theta = np.random.randn(2, 1)  # random initialization
for epoch in range(n_epochs):
    for i in range(m):
        # pick one training instance at random
        random_index = np.random.randint(m)
        xi = X_b[random_index:random_index + 1]
        yi = y[random_index:random_index + 1]
        gradients = 2 * xi.T.dot(xi.dot(theta) - yi)
        eta = learning_schedule(epoch * m + i)
        theta = theta - eta * gradients
        theta_path_sgd.append(theta)  # record the path for plotting later
print(theta)
# SGD Regression using Scikit:
from sklearn.linear_model import SGDRegressor
# NOTE(fix): SGDRegressor's `n_iter` parameter was renamed `max_iter`
# (deprecated in scikit-learn 0.19, removed in 0.21).
sgd_reg = SGDRegressor(max_iter=50, penalty=None, eta0=0.1)
sgd_reg.fit(X, y.ravel())  # ravel(): SGDRegressor expects a 1-d target array
print(sgd_reg.intercept_, sgd_reg.coef_)
# Gradient Descent - MiniBatch
# adv: performance boost via GPUs
theta_path_mgd = []
n_iterations = 50
minibatch_size = 20
import numpy.random as rnd
rnd.seed(42)
theta = rnd.randn(2, 1)  # random initialization
t0, t1 = 10, 1000

def learning_schedule(t):
    """Decaying learning rate driven by the global step counter."""
    return t0 / (t + t1)

t = 0
for epoch in range(n_iterations):
    # reshuffle the training set at the start of every epoch
    shuffled_indices = rnd.permutation(m)
    X_b_shuffled = X_b[shuffled_indices]
    y_shuffled = y[shuffled_indices]
    for i in range(0, m, minibatch_size):
        t += 1
        xi = X_b_shuffled[i:i + minibatch_size]
        yi = y_shuffled[i:i + minibatch_size]
        gradients = 2 * xi.T.dot(xi.dot(theta) - yi)
        eta = learning_schedule(t)
        theta = theta - eta * gradients
        theta_path_mgd.append(theta)  # record the path for plotting later
print(theta)
# overlay the three gradient-descent paths in (theta0, theta1) parameter space
theta_path_bgd = np.array(theta_path_bgd)
theta_path_sgd = np.array(theta_path_sgd)
theta_path_mgd = np.array(theta_path_mgd)
plt.figure(figsize=(10,4))
plt.plot(theta_path_sgd[:, 0], theta_path_sgd[:, 1], "r-s", linewidth=1, label="Stochastic")
plt.plot(theta_path_mgd[:, 0], theta_path_mgd[:, 1], "g-+", linewidth=1, label="Mini-batch")
plt.plot(theta_path_bgd[:, 0], theta_path_bgd[:, 1], "b-o", linewidth=1, label="Batch")
plt.legend(loc="upper right", fontsize=14)
plt.xlabel(r"$\theta_0$", fontsize=20)
plt.ylabel(r"$\theta_1$ ", fontsize=20, rotation=0)
plt.axis([2.5, 4.5, 2.3, 3.9])
#save_fig("gradient_descent_paths_plot")
plt.show()
# example quadratic equation + noise: y = 0.5*X^2 + X + 2 + noise
m = 100
X = 6 * np.random.rand(m, 1) - 3
y = 0.5 * X**2 + X + 2 + np.random.randn(m, 1)
plt.scatter(X,y)
plt.show()
# fit using Scikit
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import LinearRegression
# caution: PolynomialFeatures converts array of n features
# into array of (n+d)!/d!n! features -- combinatorial explosions possible :-)
poly_features = PolynomialFeatures(degree=2, include_bias=False)
print(poly_features)
# X_poly: original feature of X, plus its square.
X_poly = poly_features.fit_transform(X)
#print(X, X_poly)
print(X[0], X_poly[0])
# fit it: linear regression on the expanded (x, x^2) features
lin_reg = LinearRegression()
lin_reg.fit(X_poly, y)
print(lin_reg.intercept_, lin_reg.coef_)
# result estimate: 0.48x(1)^2 + 0.99x(2) + 2.06
# original: 0.50x(1)^2 + 1.00x(2) + 2.00 + gaussian noise
X_new = np.linspace(-3, 3, 100).reshape(100, 1)
X_new_poly = poly_features.transform(X_new)
y_new = lin_reg.predict(X_new_poly)
#testme = np.linspace(-3,3,20)
#print(testme, testme.reshape(20,1))
plt.plot(X, y, "b.")
plt.plot(X_new, y_new, "r-", linewidth=2, label="Predictions")
plt.xlabel("$x_1$", fontsize=14)
plt.ylabel("$y$", rotation=0, fontsize=14)
plt.legend(loc="upper left", fontsize=14)
plt.axis([-3, 3, 0, 10])
#save_fig("quadratic_predictions_plot")
plt.show()
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
# train model multiple times on various training subsets (of various sizes)
def plot_learning_curves(model, X, y):
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=10)
train_errors, val_errors = [], []
for m in range(1, len(X_train)):
model.fit(X_train[:m], y_train[:m])
y_train_predict = model.predict(X_train[:m])
y_val_predict = model.predict(X_val)
train_errors.append(mean_squared_error(y_train_predict, y_train[:m]))
val_errors.append(mean_squared_error(y_val_predict, y_val))
plt.plot(np.sqrt(train_errors), "r-+", linewidth=2, label="Training set")
plt.plot(np.sqrt(val_errors), "b-", linewidth=3, label="Validation set")
plt.legend(loc="upper right", fontsize=14)
plt.xlabel("Training set size", fontsize=14)
plt.ylabel("RMSE", fontsize=14)
lin_reg = LinearRegression()
plot_learning_curves(lin_reg, X, y)
plt.axis([0, 80, 0, 3])
#save_fig("underfitting_learning_curves_plot")
plt.show()
# repeat exercise for 10th-degree polynomial
from sklearn.pipeline import Pipeline
# NOTE(fix): Pipeline expects a *list* of (name, estimator) steps;
# the tuple-of-tuples form was deprecated and removed in scikit-learn.
polynomial_regression = Pipeline([
    ("poly_features", PolynomialFeatures(degree=10, include_bias=False)),
    ("sgd_reg", LinearRegression()),
])
plot_learning_curves(polynomial_regression, X, y)
plt.axis([0, 80, 0, 3])
plt.show()
# note: training error rate much lower than on Linear Regression
# note: training/validation gap closes to zero. good fit?
# Irreducible error: caused by noise in the data itself; no model can remove it.
# Rule of thumb: increasing model complexity increases variance and reduces bias (and vice versa).
# Ridge -- regularization term added to cost function.
# alpha param -- forces model weights to minimal values. higher alpha = "flatter" function (converge to mean)

# build dataset: y = 1 + 0.5x + gaussian noise, 20 points
import numpy.random as rnd
rnd.seed(42)
m = 20
X = 3 * rnd.rand(m, 1)
y = 1 + 0.5 * X + rnd.randn(m, 1) / 1.5
X_new = np.linspace(0, 3, 100).reshape(100, 1)
# plot it
plt.plot(X, y, "b.")
plt.xlabel("$x_1$", fontsize=18)
plt.ylabel("$y$", rotation=0, fontsize=18)
plt.axis([0, 3, 0, 4])
# apply Ridge regression (closed-form solution via Cholesky decomposition)
from sklearn.linear_model import Ridge
ridge_reg = Ridge(alpha=1, solver="cholesky")
ridge_reg.fit(X,y)
ridge_reg.predict([[0.0],[1.5],[2.0],[3.0]])
# Ridge using SGD: an L2 penalty on SGDRegressor is equivalent to Ridge.
sgd_reg = SGDRegressor(penalty="l2")
sgd_reg.fit(X,y.ravel())
# NOTE(fix): the original called ridge_reg.predict here (copy/paste error);
# the point of this cell is to show the SGD-based Ridge predictions.
sgd_reg.predict([[0.0],[1.5],[2.0],[3.0]])
# Lasso -- similar to Ridge, also adds regularization term
# uses L1 norm (instead of 1/2 square of L2 norm, as in Ridge.)
# -- tends to force least important features to zero.
from sklearn.linear_model import Lasso
lasso_reg = Lasso(alpha=0.1)
lasso_reg.fit(X,y)
lasso_reg.predict([[0.0],[1.5],[2.0],[3.0]])
# Elastic Net -- midddle ground.
# regularization = mix of Ridge & Lasso (mix ratio "r")
# l1_ratio=0.5 -> equal parts L1 and L2 penalty
from sklearn.linear_model import ElasticNet
elastic_net = ElasticNet(alpha=0.1, l1_ratio=0.5)
elastic_net.fit(X,y)
elastic_net.predict([[0.0],[1.5],[2.0],[3.0]])
# Early Stopping -- stop training when minimum validation error reached
# build dataset: quadratic with gaussian noise
rnd.seed(42)
m = 100
X = 6 * rnd.rand(m, 1) - 3
y = 2 + X + 0.5 * X**2 + rnd.randn(m, 1)
X_train, X_val, y_train, y_val = train_test_split(X[:50], y[:50].ravel(), test_size=0.5, random_state=10)
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
# NOTE(fix): Pipeline takes a *list* of steps (the tuple form was removed).
poly_scaler = Pipeline([
    ("poly_features", PolynomialFeatures(degree=90, include_bias=False)),
    ("std_scaler", StandardScaler()),
])
X_train_poly_scaled = poly_scaler.fit_transform(X_train)
X_val_poly_scaled = poly_scaler.transform(X_val)
# NOTE(fix): `n_iter` was renamed `max_iter` in scikit-learn. max_iter=1 plus
# warm_start=True means each .fit() call runs exactly one epoch, continuing
# from the previous weights -- which is what the loop below relies on.
sgd_reg = SGDRegressor(max_iter=1,
                       tol=None,  # disable sklearn's own early stopping -- TODO confirm on your sklearn version
                       penalty=None,
                       eta0=0.0005,
                       warm_start=True,
                       learning_rate="constant",
                       random_state=42)
n_epochs = 500
train_errors, val_errors = [], []
for epoch in range(n_epochs):
    sgd_reg.fit(X_train_poly_scaled, y_train)  # continues where it left off
    y_train_predict = sgd_reg.predict(X_train_poly_scaled)
    y_val_predict = sgd_reg.predict(X_val_poly_scaled)
    # NOTE(fix): mean_squared_error args put in (y_true, y_pred) order.
    train_errors.append(mean_squared_error(y_train, y_train_predict))
    val_errors.append(mean_squared_error(y_val, y_val_predict))

# locate the epoch with the lowest validation error
best_epoch = np.argmin(val_errors)
best_val_rmse = np.sqrt(val_errors[best_epoch])
plt.annotate('Best model',
             xy=(best_epoch, best_val_rmse),
             xytext=(best_epoch, best_val_rmse + 1),
             ha="center",
             arrowprops=dict(facecolor='black', shrink=0.05),
             fontsize=16,
             )
best_val_rmse -= 0.03  # just to make the graph look better
plt.plot([0, n_epochs], [best_val_rmse, best_val_rmse], "k:", linewidth=2)
plt.plot(np.sqrt(val_errors), "b-", linewidth=3, label="Validation set")
plt.plot(np.sqrt(train_errors), "r--", linewidth=2, label="Training set")
plt.legend(loc="upper right", fontsize=14)
plt.xlabel("Epoch", fontsize=14)
plt.ylabel("RMSE", fontsize=14)
#save_fig("early_stopping_plot")
plt.show()


#from sklearn import datasets
#iris = datasets.load_iris()
import numpy as np
from sklearn import datasets
iris = datasets.load_iris()
print(iris.keys())
X = iris["data"][:, 3:] # petal width
# NOTE(fix): np.int was deprecated (NumPy 1.20) and removed (1.24);
# use the builtin int instead.
y = (iris["target"] == 2).astype(int) # 1 if Iris-Virginica, else 0
# train a LR model
from sklearn.linear_model import LogisticRegression
log_reg=LogisticRegression()
log_reg.fit(X,y)
# predict probability of flowers with petal widths = 0-3cm
X_new = np.linspace(0, 3, 1000).reshape(-1, 1)
y_proba = log_reg.predict_proba(X_new)
print(y_proba)
# first petal width whose predicted P(virginica) reaches 0.5
decision_boundary = X_new[y_proba[:, 1] >= 0.5][0]
plt.plot(X_new, y_proba[:, 1], "g-", label="Iris-Virginica")
plt.plot(X_new, y_proba[:, 0], "b--", label="Not Iris-Virginica")
plt.text(decision_boundary+0.02, 0.15, "Decision boundary", fontsize=14, color="k", ha="center")
plt.xlabel("Petal width (cm)", fontsize=14)
plt.ylabel("Probability", fontsize=14)
plt.legend(loc="center left", fontsize=14)
plt.show()
# what's the prediction for petal length = 1.5 or 1.7cm?
print(log_reg.predict([[1.5], [1.7]]))
# Logistic Regression contour plot
# with multiple decision boundaries (not just 50%)
from sklearn.linear_model import LogisticRegression
X = iris["data"][:, (2, 3)] # petal length, petal width
# NOTE(fix): np.int was removed from NumPy (1.24); use the builtin int.
y = (iris["target"] == 2).astype(int)
log_reg = LogisticRegression(C=10**10)  # huge C => virtually no regularization
log_reg.fit(X, y)
x0, x1 = np.meshgrid(
    np.linspace(2.9, 7, 500).reshape(-1, 1),
    np.linspace(0.8, 2.7, 200).reshape(-1, 1),
)
# ravel(): return contiguous flattened array
X_new = np.c_[x0.ravel(), x1.ravel()]
y_proba = log_reg.predict_proba(X_new)
plt.figure(figsize=(10, 4))
plt.plot(X[y==0, 0], X[y==0, 1], "bs")
plt.plot(X[y==1, 0], X[y==1, 1], "g^")
zz = y_proba[:, 1].reshape(x0.shape)
contour = plt.contour(x0, x1, zz, cmap=plt.cm.brg)
# analytic 50% boundary: w0*x0 + w1*x1 + b = 0  =>  x1 = -(w0*x0 + b)/w1
left_right = np.array([2.9, 7])
boundary = -(log_reg.coef_[0][0] * left_right + log_reg.intercept_[0]) / log_reg.coef_[0][1]
plt.clabel(contour, inline=1, fontsize=12)
plt.plot(left_right, boundary, "k--", linewidth=3)
plt.text(3.5, 1.5, "Not Iris-Virginica", fontsize=14, color="b", ha="center")
plt.text(6.5, 2.3, "Iris-Virginica", fontsize=14, color="g", ha="center")
plt.xlabel("Petal length", fontsize=14)
plt.ylabel("Petal width", fontsize=14)
plt.axis([2.9, 7, 0.8, 2.7])
#save_fig("logistic_regression_contour_plot")
plt.show()




# use Softmax to classify iris flowers
X = iris["data"][:, (2, 3)] # petal length, width
y = iris["target"]
# Scikit LR can be switched to Softmax with "multinomial" setting.
# also defaults to L2 regularization (control with C parameter)
softmax_reg = LogisticRegression(multi_class="multinomial",solver="lbfgs", C=10)
softmax_reg.fit(X, y)
# predict iris 5cm long, 2cm wide:
softmax_reg.predict([[5, 2]])
softmax_reg.predict_proba([[5,2]])
# softmax contour plot: evaluate the model over a dense 2-D grid
x0, x1 = np.meshgrid(
np.linspace(0, 8, 500).reshape(-1, 1),
np.linspace(0, 3.5, 200).reshape(-1, 1),
)
X_new = np.c_[x0.ravel(), x1.ravel()]
y_proba = softmax_reg.predict_proba(X_new)
y_predict = softmax_reg.predict(X_new)
# zz1 = P(class 1) for the probability contours; zz = hard class regions
zz1 = y_proba[:, 1].reshape(x0.shape)
zz = y_predict.reshape(x0.shape)
plt.figure(figsize=(10, 4))
plt.plot(X[y==2, 0], X[y==2, 1], "g^", label="Iris-Virginica")
plt.plot(X[y==1, 0], X[y==1, 1], "bs", label="Iris-Versicolor")
plt.plot(X[y==0, 0], X[y==0, 1], "yo", label="Iris-Setosa")
from matplotlib.colors import ListedColormap
custom_cmap = ListedColormap(['#fafab0','#9898ff','#a0faa0'])
plt.contourf(x0, x1, zz, cmap=custom_cmap, linewidth=5)
contour = plt.contour(x0, x1, zz1, cmap=plt.cm.brg)
plt.clabel(contour, inline=1, fontsize=12)
plt.xlabel("Petal length", fontsize=14)
plt.ylabel("Petal width", fontsize=14)
plt.legend(loc="center left", fontsize=14)
plt.axis([0, 7, 0, 3.5])
#save_fig("softmax_regression_contour_plot")
plt.show()
%matplotlib inline
import matplotlib.pyplot as plt
# Large margin classification:
from sklearn.svm import SVC
from sklearn import datasets
iris = datasets.load_iris()
X = iris["data"][:, (2, 3)] # petal length, petal width
y = iris["target"]
# keep only two linearly separable classes for this demo
setosa_or_versicolor = (y == 0) | (y == 1)
X = X[setosa_or_versicolor]
y = y[setosa_or_versicolor]
# SVM Classifier model (C=inf => hard-margin classification)
svm_clf = SVC(kernel="linear", C=float("inf"))
svm_clf.fit(X, y)
# Bad models: three arbitrary hand-picked lines, for comparison below
import numpy as np
x0 = np.linspace(0, 5.5, 200)
pred_1 = 5*x0 - 20
pred_2 = x0 - 1.8
pred_3 = 0.1 * x0 + 0.5
def plot_svc_decision_boundary(svm_clf, xmin, xmax):
    """Draw a fitted linear SVM's decision boundary, margin gutters, and support vectors."""
    w = svm_clf.coef_[0]
    b = svm_clf.intercept_[0]
    # At the decision boundary, w0*x0 + w1*x1 + b = 0
    # => x1 = -w0/w1 * x0 - b/w1
    x0 = np.linspace(xmin, xmax, 200)
    decision_boundary = -w[0] / w[1] * x0 - b / w[1]
    margin = 1 / w[1]  # vertical half-width of the "street"
    gutter_up = decision_boundary + margin
    gutter_down = decision_boundary - margin
    # highlight the support vectors behind the lines
    svs = svm_clf.support_vectors_
    plt.scatter(svs[:, 0], svs[:, 1], s=180, facecolors='#FFAAAA')
    plt.plot(x0, decision_boundary, "k-", linewidth=2)
    plt.plot(x0, gutter_up, "k--", linewidth=2)
    plt.plot(x0, gutter_down, "k--", linewidth=2)
# left panel: the three arbitrary lines; right panel: the SVM's boundary
plt.figure(figsize=(12,2.7))
plt.subplot(121)
plt.plot(x0, pred_1, "g--", linewidth=2)
plt.plot(x0, pred_2, "m-", linewidth=2)
plt.plot(x0, pred_3, "r-", linewidth=2)
plt.plot(X[:, 0][y==1], X[:, 1][y==1], "bs", label="Iris-Versicolor")
plt.plot(X[:, 0][y==0], X[:, 1][y==0], "yo", label="Iris-Setosa")
plt.xlabel("Petal length", fontsize=14)
plt.ylabel("Petal width", fontsize=14)
plt.legend(loc="upper left", fontsize=14)
plt.axis([0, 5.5, 0, 2])
plt.subplot(122)
plot_svc_decision_boundary(svm_clf, 0, 5.5)
plt.plot(X[:, 0][y==1], X[:, 1][y==1], "bs")
plt.plot(X[:, 0][y==0], X[:, 1][y==0], "yo")
plt.xlabel("Petal length", fontsize=14)
plt.axis([0, 5.5, 0, 2])
plt.show()
# On left:
# dashed line = basically useless decision boundary.
# solid lines = OK for this dataset, but no margins. Probably will not work well on new instances.
# On right: SVM finds widest possible "street" between classes.
# sensitivity to feature scaling:
# tiny synthetic dataset with wildly different feature scales
Xs = np.array([[1, 50], [5, 20], [3, 80], [5, 60]]).astype(np.float64)
ys = np.array([0, 0, 1, 1])
svm_clf = SVC(kernel="linear", C=100)
svm_clf.fit(Xs, ys)
plt.figure(figsize=(12,3.2))
plt.subplot(121)
plt.plot(Xs[:, 0][ys==1], Xs[:, 1][ys==1], "bo")
plt.plot(Xs[:, 0][ys==0], Xs[:, 1][ys==0], "ms")
plot_svc_decision_boundary(svm_clf, 0, 6)
plt.xlabel("$x_0$", fontsize=20)
plt.ylabel("$x_1$ ", fontsize=20, rotation=0)
plt.title("Unscaled", fontsize=16)
plt.axis([0, 6, 0, 90])
# refit after standardizing both features and re-plot
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
X_scaled = scaler.fit_transform(Xs)
svm_clf.fit(X_scaled, ys)
plt.subplot(122)
plt.plot(X_scaled[:, 0][ys==1], X_scaled[:, 1][ys==1], "bo")
plt.plot(X_scaled[:, 0][ys==0], X_scaled[:, 1][ys==0], "ms")
plot_svc_decision_boundary(svm_clf, -2, 2)
plt.xlabel("$x_0$", fontsize=20)
plt.title("Scaled", fontsize=16)
plt.axis([-2, 2, -2, 2])
# SVMs are sensitive to feature scaling.
# Plot on right has much more robust feature boundary.
X_scaled
# "hard" margin classification:
# - all instances need to be "out of the street".
# - all instances need to be "on the right side of the street".
# problem: doable only if data is linearly separable
# problem: very sensitive to outliers
X_outliers = np.array([[3.4, 1.3], [3.2, 0.8]])
y_outliers = np.array([0, 0])
Xo1 = np.concatenate([X, X_outliers[:1]], axis=0)
yo1 = np.concatenate([y, y_outliers[:1]], axis=0)
Xo2 = np.concatenate([X, X_outliers[1:]], axis=0)
yo2 = np.concatenate([y, y_outliers[1:]], axis=0)
svm_clf2 = SVC(kernel="linear", C=10**9)#float("inf"))
svm_clf2.fit(Xo2, yo2)
plt.figure(figsize=(12,2.7))
plt.subplot(121)
plt.plot(Xo1[:, 0][yo1==1], Xo1[:, 1][yo1==1], "bs")
plt.plot(Xo1[:, 0][yo1==0], Xo1[:, 1][yo1==0], "yo")
plt.text(0.3, 1.0, "Impossible!", fontsize=20, color="red")
plt.xlabel("Petal length", fontsize=14)
plt.ylabel("Petal width", fontsize=14)
plt.annotate("Outlier",
xy=(X_outliers[0][0], X_outliers[0][1]),
xytext=(2.5, 1.7),
ha="center",
arrowprops=dict(facecolor='black', shrink=0.1),
fontsize=16,
)
plt.axis([0, 5.5, 0, 2])
plt.subplot(122)
plt.plot(Xo2[:, 0][yo2==1], Xo2[:, 1][yo2==1], "bs")
plt.plot(Xo2[:, 0][yo2==0], Xo2[:, 1][yo2==0], "yo")
plot_svc_decision_boundary(svm_clf2, 0, 5.5)
plt.xlabel("Petal length", fontsize=14)
plt.annotate("Outlier",
xy=(X_outliers[1][0], X_outliers[1][1]),
xytext=(3.2, 0.08),
ha="center",
arrowprops=dict(facecolor='black', shrink=0.1),
fontsize=16,
)
plt.axis([0, 5.5, 0, 2])
X_scaled
# soluton to "hard margins" problem:
# control hardness with C hyperparameter
from sklearn import datasets
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import LinearSVC
iris = datasets.load_iris()
X = iris["data"][:, (2, 3)] # petal length, petal width
y = (iris["target"] == 2).astype(np.float64) # Iris-Virginica
scaler = StandardScaler()
svm_clf1 = LinearSVC(C=100, loss="hinge")
svm_clf2 = LinearSVC(C=1, loss="hinge")
scaled_svm_clf1 = Pipeline((
("scaler", scaler),
("linear_svc", svm_clf1),
))
scaled_svm_clf2 = Pipeline((
("scaler", scaler),
("linear_svc", svm_clf2),
))
scaled_svm_clf1.fit(X, y)
scaled_svm_clf2.fit(X, y)
scaled_svm_clf2.predict([[5.5, 1.7]])
X_scaled
# Convert to unscaled parameters
b1 = svm_clf1.decision_function([-scaler.mean_ / scaler.scale_])
b2 = svm_clf2.decision_function([-scaler.mean_ / scaler.scale_])
w1 = svm_clf1.coef_[0] / scaler.scale_
w2 = svm_clf2.coef_[0] / scaler.scale_
svm_clf1.intercept_ = np.array([b1])
svm_clf2.intercept_ = np.array([b2])
svm_clf1.coef_ = np.array([w1])
svm_clf2.coef_ = np.array([w2])
# Find support vectors (LinearSVC does not do this automatically)
t = y * 2 - 1
support_vectors_idx1 = (t * (X.dot(w1) + b1) < 1).ravel()
support_vectors_idx2 = (t * (X.dot(w2) + b2) < 1).ravel()
svm_clf1.support_vectors_ = X[support_vectors_idx1]
svm_clf2.support_vectors_ = X[support_vectors_idx2]
plt.figure(figsize=(12,3.2))
plt.subplot(121)
plt.plot(X[:, 0][y==1], X[:, 1][y==1], "g^", label="Iris-Virginica")
plt.plot(X[:, 0][y==0], X[:, 1][y==0], "bs", label="Iris-Versicolor")
plot_svc_decision_boundary(svm_clf1, 4, 6)
plt.xlabel("Petal length", fontsize=14)
plt.ylabel("Petal width", fontsize=14)
plt.legend(loc="upper left", fontsize=14)
plt.title("$C = {}$".format(svm_clf1.C), fontsize=16)
plt.axis([4, 6, 0.8, 2.8])
plt.subplot(122)
plt.plot(X[:, 0][y==1], X[:, 1][y==1], "g^")
plt.plot(X[:, 0][y==0], X[:, 1][y==0], "bs")
plot_svc_decision_boundary(svm_clf2, 4, 6)
plt.xlabel("Petal length", fontsize=14)
plt.title("$C = {}$".format(svm_clf2.C), fontsize=16)
plt.axis([4, 6, 0.8, 2.8])
# some (most?) datasets are not linearly separable. simple example below.
X1D = np.linspace(-4, 4, 9).reshape(-1, 1)
X2D = np.c_[X1D, X1D**2] # adds 2nd, non-linear dimension.
y = np.array([0, 0, 1, 1, 1, 1, 1, 0, 0])
# left panel: 1-D data (not separable); right panel: same data with x^2 added
plt.figure(figsize=(10, 4))
plt.subplot(121)
plt.grid(True, which='both')
plt.axhline(y=0, color='k')
plt.plot(X1D[:, 0][y==0], np.zeros(4), "bs")
plt.plot(X1D[:, 0][y==1], np.zeros(5), "g^")
plt.gca().get_yaxis().set_ticks([])
plt.xlabel(r"$x_1$", fontsize=20)
plt.axis([-4.5, 4.5, -0.2, 0.2])
plt.subplot(122)
plt.grid(True, which='both')
plt.axhline(y=0, color='k')
plt.axvline(x=0, color='k')
plt.plot(X2D[:, 0][y==0], X2D[:, 1][y==0], "bs")
plt.plot(X2D[:, 0][y==1], X2D[:, 1][y==1], "g^")
plt.xlabel(r"$x_1$", fontsize=20)
plt.ylabel(r"$x_2$", fontsize=20, rotation=0)
plt.gca().get_yaxis().set_ticks([0, 4, 8, 12, 16])
plt.plot([-4.5, 4.5], [6.5, 6.5], "r--", linewidth=3)
plt.axis([-4.5, 4.5, -1, 17])
plt.subplots_adjust(right=1)
#save_fig("higher_dimensions_plot", tight_layout=False)
plt.show()
# result: adding 2nd dimension (on right) makes dataset linearly separable
# test on "moons" dataset
from sklearn.datasets import make_moons
X, y = make_moons(n_samples=100, noise=0.15, random_state=42)
def plot_dataset(X, y, axes):
    """Scatter the two classes of a 2-D dataset (blue squares / green
    triangles) on the current axes, with a grid and x1/x2 labels."""
    neg = (y == 0)
    pos = (y == 1)
    plt.plot(X[:, 0][neg], X[:, 1][neg], "bs")
    plt.plot(X[:, 0][pos], X[:, 1][pos], "g^")
    plt.axis(axes)
    plt.grid(True, which='both')
    plt.xlabel(r"$x_1$", fontsize=20)
    plt.ylabel(r"$x_2$", fontsize=20, rotation=0)
plot_dataset(X, y, [-1.5, 2.5, -1, 1.5])
plt.show()
# Fit a polynomial-feature SVM via a Scikit-Learn Pipeline. Contents:
# 1) PolynomialFeatures
# 2) StandardScaler
# 3) LinearSVC
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import PolynomialFeatures
# FIX: Pipeline steps must be a list of (name, estimator) pairs;
# passing a tuple of pairs is rejected by modern scikit-learn.
polynomial_svm_clf = Pipeline([
    ("poly_features", PolynomialFeatures(degree=3)),
    ("scaler", StandardScaler()),
    ("svm_clf", LinearSVC(C=10, loss="hinge")),
])
polynomial_svm_clf.fit(X, y)
def plot_predictions(clf, axes):
    """Shade the classifier's predicted classes and decision-function
    values over a 100x100 grid spanning axes=[x0min, x0max, x1min, x1max]."""
    grid_x0, grid_x1 = np.meshgrid(
        np.linspace(axes[0], axes[1], 100),
        np.linspace(axes[2], axes[3], 100),
    )
    points = np.c_[grid_x0.ravel(), grid_x1.ravel()]
    y_pred = clf.predict(points).reshape(grid_x0.shape)
    y_decision = clf.decision_function(points).reshape(grid_x0.shape)
    plt.contourf(grid_x0, grid_x1, y_pred, cmap=plt.cm.brg, alpha=0.2)
    plt.contourf(grid_x0, grid_x1, y_decision, cmap=plt.cm.brg, alpha=0.1)
plot_predictions(polynomial_svm_clf, [-1.5, 2.5, -1, 1.5])
plot_dataset(X, y, [-1.5, 2.5, -1, 1.5])
#save_fig("moons_polynomial_svc_plot")
plt.show()
from sklearn.svm import SVC
# train SVM classifier using 3rd-degree polynomial kernel
# FIX: Pipeline steps must be a list of (name, estimator) pairs;
# a tuple of pairs is rejected by modern scikit-learn.
poly_kernel_svm_clf = Pipeline([
    ("scaler", StandardScaler()),
    ("svm_clf", SVC(kernel="poly", degree=3, coef0=1, C=5)),
])
# train SVM classifier using 10th-degree polynomial kernel (for comparison)
poly100_kernel_svm_clf = Pipeline([
    ("scaler", StandardScaler()),
    ("svm_clf", SVC(kernel="poly", degree=10, coef0=100, C=5)),
])
poly_kernel_svm_clf.fit(X, y)
poly100_kernel_svm_clf.fit(X, y)
# Compare the two polynomial-kernel models side by side.
plt.figure(figsize=(11, 4))
plt.subplot(121)
plot_predictions(poly_kernel_svm_clf, [-1.5, 2.5, -1, 1.5])
plot_dataset(X, y, [-1.5, 2.5, -1, 1.5])
plt.title(r"$d=3, r=1, C=5$", fontsize=18)
plt.subplot(122)
plot_predictions(poly100_kernel_svm_clf, [-1.5, 2.5, -1, 1.5])
plot_dataset(X, y, [-1.5, 2.5, -1, 1.5])
plt.title(r"$d=10, r=100, C=5$", fontsize=18)
#save_fig("moons_kernelized_polynomial_svc_plot")
plt.show()
# left: 3rd-degree polynomial; right: 10th-degree polynomial.
# if overfitting, reduce polynomial degree. if underfitting, bump it up.
# "coef0": controls high- vs low-degree polynomial influence.
# define similarity function to be Gaussian Radial Basis Function (RBF)
# equals 0 (far away) to 1 (at landmark)
def gaussian_rbf(x, landmark, gamma):
    """Gaussian RBF similarity between each row of x and a landmark point.

    Returns exp(-gamma * ||x - landmark||^2) per row: 1.0 at the landmark,
    decaying toward 0 with distance.
    """
    squared_distance = np.linalg.norm(x - landmark, axis=1) ** 2
    return np.exp(-gamma * squared_distance)
# Map the 1-D dataset to two RBF similarity features (landmarks at -2 and 1)
# and show that the mapped dataset becomes linearly separable.
gamma = 0.3
x1s = np.linspace(-4.5, 4.5, 200).reshape(-1, 1)
# similarity curves for the two landmarks
x2s = gaussian_rbf(x1s, -2, gamma)
x3s = gaussian_rbf(x1s, 1, gamma)
# XK: each instance expressed as its two RBF similarity features
XK = np.c_[gaussian_rbf(X1D, -2, gamma), gaussian_rbf(X1D, 1, gamma)]
yk = np.array([0, 0, 1, 1, 1, 1, 1, 0, 0])
plt.figure(figsize=(11, 4))
plt.subplot(121)
plt.grid(True, which='both')
plt.axhline(y=0, color='k')
# red dots mark the two landmarks
plt.scatter(x=[-2, 1], y=[0, 0], s=150, alpha=0.5, c="red")
plt.plot(X1D[:, 0][yk==0], np.zeros(4), "bs")
plt.plot(X1D[:, 0][yk==1], np.zeros(5), "g^")
plt.plot(x1s, x2s, "g--")
plt.plot(x1s, x3s, "b:")
plt.gca().get_yaxis().set_ticks([0, 0.25, 0.5, 0.75, 1])
plt.xlabel(r"$x_1$", fontsize=20)
plt.ylabel(r"Similarity", fontsize=14)
plt.annotate(r'$\mathbf{x}$',
             xy=(X1D[3, 0], 0),
             xytext=(-0.5, 0.20),
             ha="center",
             arrowprops=dict(facecolor='black', shrink=0.1),
             fontsize=18,
             )
plt.text(-2, 0.9, "$x_2$", ha="center", fontsize=20)
plt.text(1, 0.9, "$x_3$", ha="center", fontsize=20)
plt.axis([-4.5, 4.5, -0.1, 1.1])
plt.subplot(122)
plt.grid(True, which='both')
plt.axhline(y=0, color='k')
plt.axvline(x=0, color='k')
plt.plot(XK[:, 0][yk==0], XK[:, 1][yk==0], "bs")
plt.plot(XK[:, 0][yk==1], XK[:, 1][yk==1], "g^")
plt.xlabel(r"$x_2$", fontsize=20)
plt.ylabel(r"$x_3$ ", fontsize=20, rotation=0)
plt.annotate(r'$\phi\left(\mathbf{x}\right)$',
             xy=(XK[3, 0], XK[3, 1]),
             xytext=(0.65, 0.50),
             ha="center",
             arrowprops=dict(facecolor='black', shrink=0.1),
             fontsize=18,
             )
# red dashed line: linear separator in the RBF feature space
plt.plot([-0.1, 1.1], [0.57, -0.1], "r--", linewidth=3)
plt.axis([-0.1, 1.1, -0.1, 1.1])
plt.subplots_adjust(right=1)
#save_fig("kernel_method_plot")
plt.show()
# print the two RBF features of the instance at X1D[3]
x1_example = X1D[3, 0]
for landmark in (-2, 1):
    k = gaussian_rbf(np.array([[x1_example]]), np.array([[landmark]]), gamma)
    print("Phi({}, {}) = {}".format(x1_example, landmark, k))
# SVM with a Gaussian RBF kernel.
# FIX: Pipeline steps must be a list of (name, estimator) pairs;
# a tuple of pairs is rejected by modern scikit-learn.
rbf_kernel_svm_clf = Pipeline([
    ("scaler", StandardScaler()),
    ("svm_clf", SVC(kernel="rbf", gamma=5, C=0.001)),
])
rbf_kernel_svm_clf.fit(X, y)
from sklearn.svm import SVC
# Train one model per (gamma, C) combination for comparison.
gamma1, gamma2 = 0.1, 5
C1, C2 = 0.001, 1000
hyperparams = (gamma1, C1), (gamma1, C2), (gamma2, C1), (gamma2, C2)
svm_clfs = []
for gamma, C in hyperparams:
    rbf_kernel_svm_clf = Pipeline([
        ("scaler", StandardScaler()),
        ("svm_clf", SVC(kernel="rbf", gamma=gamma, C=C)),
    ])
    rbf_kernel_svm_clf.fit(X, y)
    svm_clfs.append(rbf_kernel_svm_clf)
# 2x2 grid: one subplot per (gamma, C) combination.
plt.figure(figsize=(11, 7))
for i, svm_clf in enumerate(svm_clfs):
    plt.subplot(221 + i)
    plot_predictions(svm_clf, [-1.5, 2.5, -1, 1.5])
    plot_dataset(X, y, [-1.5, 2.5, -1, 1.5])
    gamma, C = hyperparams[i]
    plt.title(r"$\gamma = {}, C = {}$".format(gamma, C), fontsize=16)
#save_fig("moons_rbf_svc_plot")
plt.show()
# below: model trained with different values of gamma and C.
# GAMMA:
# bigger gamma = narrower bell curve, so each instance's area of influence = smaller.
# smaller gamma: bigger bell curve = smoother decision boundary.
# --- SVM regression on a noisy linear dataset ---
from sklearn.svm import LinearSVR
import numpy.random as rnd
rnd.seed(42)
m = 50
X = 2 * rnd.rand(m, 1)
y = (4 + 3 * X + rnd.randn(m, 1)).ravel()
# two epsilon-insensitive tubes of different widths
svm_reg1 = LinearSVR(epsilon=1.5)
svm_reg2 = LinearSVR(epsilon=0.5)
svm_reg1.fit(X, y)
svm_reg2.fit(X, y)
def find_support_vectors(svm_reg, X, y):
    """Return indices (as an argwhere array) of instances lying on or
    outside the regressor's epsilon-insensitive margin."""
    residuals = np.abs(y - svm_reg.predict(X))
    return np.argwhere(residuals >= svm_reg.epsilon)
# Attach off-margin indices so plot_svm_regression can highlight them.
svm_reg1.support_ = find_support_vectors(svm_reg1, X, y)
svm_reg2.support_ = find_support_vectors(svm_reg2, X, y)
# point used to draw the epsilon arrow annotation in the figure below
eps_x1 = 1
eps_y_pred = svm_reg1.predict([[eps_x1]])
def plot_svm_regression(svm_reg, X, y, axes):
    """Plot an SVR fit: prediction line, epsilon tube, training points,
    and (via the attached support_ indices) the off-margin instances."""
    x1s = np.linspace(axes[0], axes[1], 100).reshape(100, 1)
    y_pred = svm_reg.predict(x1s)
    plt.plot(x1s, y_pred, "k-", linewidth=2, label=r"$\hat{y}$")
    # dashed lines: the epsilon-insensitive tube around the prediction
    plt.plot(x1s, y_pred + svm_reg.epsilon, "k--")
    plt.plot(x1s, y_pred - svm_reg.epsilon, "k--")
    plt.scatter(X[svm_reg.support_], y[svm_reg.support_], s=180, facecolors='#FFAAAA')
    plt.plot(X, y, "bo")
    plt.xlabel(r"$x_1$", fontsize=18)
    plt.legend(loc="upper left", fontsize=18)
    plt.axis(axes)
# Compare the two epsilon values side by side.
plt.figure(figsize=(9, 4))
plt.subplot(121)
plot_svm_regression(svm_reg1, X, y, [0, 2, 3, 11])
plt.title(r"$\epsilon = {}$".format(svm_reg1.epsilon), fontsize=18)
plt.ylabel(r"$y$", fontsize=18, rotation=0)
#plt.plot([eps_x1, eps_x1], [eps_y_pred, eps_y_pred - svm_reg1.epsilon], "k-", linewidth=2)
# double-headed arrow showing the epsilon half-width of the tube
plt.annotate(
    '', xy=(eps_x1, eps_y_pred), xycoords='data',
    xytext=(eps_x1, eps_y_pred - svm_reg1.epsilon),
    textcoords='data', arrowprops={'arrowstyle': '<->', 'linewidth': 1.5}
)
plt.text(0.91, 5.6, r"$\epsilon$", fontsize=20)
plt.subplot(122)
plot_svm_regression(svm_reg2, X, y, [0, 2, 3, 11])
plt.title(r"$\epsilon = {}$".format(svm_reg2.epsilon), fontsize=18)
#save_fig("svm_regression_plot")
plt.show()
# Use kernel-ized SVM model to handle nonlinear regression jobs.
from sklearn.svm import SVR
# random quadratic training set.
rnd.seed(42)
m = 100
X = 2 * rnd.rand(m, 1) - 1
y = (0.2 + 0.1 * X + 0.5 * X**2 + rnd.randn(m, 1)/10).ravel()
# C controls regularization strength (large C = little regularization)
svm_poly_reg1 = SVR(kernel="poly", degree=2, C=100, epsilon=0.1)
svm_poly_reg2 = SVR(kernel="poly", degree=2, C=0.01, epsilon=0.1)
svm_poly_reg1.fit(X, y)
svm_poly_reg2.fit(X, y)
plt.figure(figsize=(9, 4))
plt.subplot(121)
plot_svm_regression(svm_poly_reg1, X, y, [-1, 1, 0, 1])
plt.title(r"$degree={}, C={}, \epsilon = {}$".format(svm_poly_reg1.degree, svm_poly_reg1.C, svm_poly_reg1.epsilon), fontsize=18)
plt.ylabel(r"$y$", fontsize=18, rotation=0)
plt.subplot(122)
plot_svm_regression(svm_poly_reg2, X, y, [-1, 1, 0, 1])
plt.title(r"$degree={}, C={}, \epsilon = {}$".format(svm_poly_reg2.degree, svm_poly_reg2.C, svm_poly_reg2.epsilon), fontsize=18)
#save_fig("svm_with_polynomial_kernel_plot")
plt.show()
# left: little regularization (large C)
# right: much more regularization (little C)
# reload iris (petal features, Virginica-vs-rest) for the 3D figure below
iris = datasets.load_iris()
X = iris["data"][:, (2, 3)] # petal length, petal width
y = (iris["target"] == 2).astype(np.float64) # Iris-Virginica
from mpl_toolkits.mplot3d import Axes3D
def plot_3D_decision_function(ax, w, b, x1_lim=[4, 6], x2_lim=[0.8, 2.8]):
    """Draw the linear SVM decision function h(x) = w.x + b as a 3D
    wireframe, with the decision boundary (h=0) and the margins (h=+/-1)
    drawn in the z=0 plane. Reads the module-level X and y arrays."""
    x1_in_bounds = (X[:, 0] > x1_lim[0]) & (X[:, 0] < x1_lim[1])
    X_crop = X[x1_in_bounds]
    y_crop = y[x1_in_bounds]
    x1s = np.linspace(x1_lim[0], x1_lim[1], 20)
    x2s = np.linspace(x2_lim[0], x2_lim[1], 20)
    x1, x2 = np.meshgrid(x1s, x2s)
    xs = np.c_[x1.ravel(), x2.ravel()]
    df = (xs.dot(w) + b).reshape(x1.shape)
    m = 1 / np.linalg.norm(w)  # margin half-width (not used below)
    # lines solved from w0*x1 + w1*x2 + b = 0 / +1 / -1
    boundary_x2s = -x1s*(w[0]/w[1])-b/w[1]
    margin_x2s_1 = -x1s*(w[0]/w[1])-(b-1)/w[1]
    margin_x2s_2 = -x1s*(w[0]/w[1])-(b+1)/w[1]
    # NOTE(review): plot_surface is given 1-D x1s and a scalar 0 here;
    # the upstream notebook passes the 2-D x1 and np.zeros_like(x1).
    # Confirm this renders as intended before reuse.
    ax.plot_surface(x1s, x2, 0, color="b", alpha=0.2, cstride=100, rstride=100)
    ax.plot(x1s, boundary_x2s, 0, "k-", linewidth=2, label=r"$h=0$")
    ax.plot(x1s, margin_x2s_1, 0, "k--", linewidth=2, label=r"$h=\pm 1$")
    ax.plot(x1s, margin_x2s_2, 0, "k--", linewidth=2)
    ax.plot(X_crop[:, 0][y_crop==1], X_crop[:, 1][y_crop==1], 0, "g^")
    ax.plot_wireframe(x1, x2, df, alpha=0.3, color="k")
    ax.plot(X_crop[:, 0][y_crop==0], X_crop[:, 1][y_crop==0], 0, "bs")
    ax.axis(x1_lim + x2_lim)
    ax.text(4.5, 2.5, 3.8, "Decision function $h$", fontsize=15)
    ax.set_xlabel(r"Petal length", fontsize=15)
    ax.set_ylabel(r"Petal width", fontsize=15)
    ax.set_zlabel(r"$h = \mathbf{w}^t \cdot \mathbf{x} + b$", fontsize=18)
    ax.legend(loc="upper left", fontsize=16)
fig = plt.figure(figsize=(11, 6))
ax1 = fig.add_subplot(111, projection='3d')
# use the weights/bias of the large-C classifier trained earlier
plot_3D_decision_function(ax1, w=svm_clf2.coef_[0], b=svm_clf2.intercept_[0])
#save_fig("iris_3D_plot")
plt.show()

import os
import numpy.random as rnd
# load iris dataset & train a DT classifier
from sklearn.datasets import load_iris
from sklearn.tree import DecisionTreeClassifier
iris = load_iris()
X = iris.data[:, 2:] # petal length and width
y = iris.target
# shallow tree (depth 2) so its structure is easy to visualize
tree_clf = DecisionTreeClassifier(max_depth=2)
tree_clf.fit(X, y)
# graph it into a .dot file
from sklearn.tree import export_graphviz
def image_path(fig_id):
    """Return the output path for a figure file.

    Currently a pass-through (writes into the working directory); an
    os.path-based location could be substituted here later.
    """
    return fig_id
# Export the fitted tree in Graphviz .dot format for visualization.
export_graphviz(
    tree_clf,
    out_file=image_path("iris_tree.dot"),
    feature_names=iris.feature_names[2:],
    class_names=iris.target_names,
    rounded=True,
    filled=True
)
# convert to PDF or PNG using command-line tool.
# (IPython shell magic — requires the Graphviz `dot` binary installed)
! dot -Tpng iris_tree.dot -o iris_tree.png

# Plot DT decision boundaries
# Depth=0: root node (petal length=2.45cm)
# Depth=1: right node splits @ 1.75cm
# Stops at max_depth = 2.
# Vertical dotted line shows boundary if max_depth set = 3.
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.colors import ListedColormap
def plot_decision_boundary(clf, X, y, axes=[0, 7.5, 0, 3], iris=True, legend=False, plot_training=True):
    """Shade the classifier's predicted regions over a 100x100 grid and
    optionally scatter the training points (iris labels/styling by default)."""
    x1s = np.linspace(axes[0], axes[1], 100)
    x2s = np.linspace(axes[2], axes[3], 100)
    x1, x2 = np.meshgrid(x1s, x2s)
    X_new = np.c_[x1.ravel(), x2.ravel()]
    y_pred = clf.predict(X_new).reshape(x1.shape)
    custom_cmap = ListedColormap(['#fafab0','#9898ff','#a0faa0'])
    plt.contourf(x1, x2, y_pred, alpha=0.3, cmap=custom_cmap, linewidth=10)
    if not iris:
        # darker contour lines for non-iris datasets
        custom_cmap2 = ListedColormap(['#7d7d58','#4c4c7f','#507d50'])
        plt.contour(x1, x2, y_pred, cmap=custom_cmap2, alpha=0.8)
    if plot_training:
        plt.plot(X[:, 0][y==0], X[:, 1][y==0], "yo", label="Iris-Setosa")
        plt.plot(X[:, 0][y==1], X[:, 1][y==1], "bs", label="Iris-Versicolor")
        plt.plot(X[:, 0][y==2], X[:, 1][y==2], "g^", label="Iris-Virginica")
        plt.axis(axes)
    if iris:
        plt.xlabel("Petal length", fontsize=14)
        plt.ylabel("Petal width", fontsize=14)
    else:
        plt.xlabel(r"$x_1$", fontsize=18)
        plt.ylabel(r"$x_2$", fontsize=18, rotation=0)
    if legend:
        plt.legend(loc="lower right", fontsize=14)
plt.figure(figsize=(8, 4))
plot_decision_boundary(tree_clf, X, y)
# hand-drawn split lines matching the fitted tree's thresholds
plt.plot([2.45, 2.45], [0, 3], "k-", linewidth=2)
plt.plot([2.45, 7.5], [1.75, 1.75], "k--", linewidth=2)
# dotted lines: where depth-2 splits would fall if max_depth were 3
plt.plot([4.95, 4.95], [0, 1.75], "k:", linewidth=2)
plt.plot([4.85, 4.85], [1.75, 3], "k:", linewidth=2)
plt.text(1.40, 1.0, "Depth=0", fontsize=15)
plt.text(3.2, 1.80, "Depth=1", fontsize=13)
plt.text(4.05, 0.5, "(Depth=2)", fontsize=11)
#save_fig("decision_tree_decision_boundaries_plot")
plt.show()
# Probability of instance 5cm long, 1.5cm wide belonging to any one of three nodes above:
print(tree_clf.predict_proba([[5, 1.5]]))
# Return class of highest probability (in this case, class #1.)
print(tree_clf.predict([[5, 1.5]]))

You can use the entropy impurity measure instead of Gini by setting the `criterion` hyperparameter to "entropy".

A dataset's entropy is 0 when it contains instances of only one class.
# Train two DTs on moons dataset.
# left: default params = no restrictions (case of overfitting)
# right: min_samples_leaf = 4. (better generalization)
from sklearn.datasets import make_moons
Xm, ym = make_moons(n_samples=100, noise=0.25, random_state=53)
deep_tree_clf1 = DecisionTreeClassifier(random_state=42)
deep_tree_clf2 = DecisionTreeClassifier(min_samples_leaf=4, random_state=42)
deep_tree_clf1.fit(Xm, ym)
deep_tree_clf2.fit(Xm, ym)
plt.figure(figsize=(11, 4))
plt.subplot(121)
plot_decision_boundary(deep_tree_clf1, Xm, ym, axes=[-1.5, 2.5, -1, 1.5], iris=False)
plt.title("No restrictions", fontsize=16)
plt.subplot(122)
plot_decision_boundary(deep_tree_clf2, Xm, ym, axes=[-1.5, 2.5, -1, 1.5], iris=False)
plt.title("min_samples_leaf = {}".format(deep_tree_clf2.min_samples_leaf), fontsize=14)
#save_fig("min_samples_leaf_plot")
plt.show()
from sklearn.tree import DecisionTreeRegressor
# Quadratic training set + noise
rnd.seed(42)
m = 200
X = rnd.rand(m, 1)
y = 4 * (X - 0.5) ** 2
y = y + rnd.randn(m, 1) / 10
# two regression trees of different depths for comparison
tree_reg1 = DecisionTreeRegressor(random_state=42, max_depth=2)
tree_reg2 = DecisionTreeRegressor(random_state=42, max_depth=3)
tree_reg1.fit(X, y)
tree_reg2.fit(X, y)
def plot_regression_predictions(tree_reg, X, y, axes=[0, 1, -0.2, 1], ylabel="$y$"):
    """Plot a regression tree's step-function predictions (red) over the
    training points (blue). Pass ylabel=None to omit the y-axis label."""
    x1 = np.linspace(axes[0], axes[1], 500).reshape(-1, 1)
    y_pred = tree_reg.predict(x1)
    plt.axis(axes)
    plt.xlabel("$x_1$", fontsize=18)
    if ylabel:
        plt.ylabel(ylabel, fontsize=18, rotation=0)
    plt.plot(X, y, "b.")
    plt.plot(x1, y_pred, "r.-", linewidth=2, label=r"$\hat{y}$")
# Left: depth-2 tree; right: depth-3 tree, with split thresholds drawn in.
plt.figure(figsize=(11, 4))
plt.subplot(121)
plot_regression_predictions(tree_reg1, X, y)
for split, style in ((0.1973, "k-"), (0.0917, "k--"), (0.7718, "k--")):
    plt.plot([split, split], [-0.2, 1], style, linewidth=2)
plt.text(0.21, 0.65, "Depth=0", fontsize=15)
plt.text(0.01, 0.2, "Depth=1", fontsize=13)
plt.text(0.65, 0.8, "Depth=1", fontsize=13)
plt.legend(loc="upper center", fontsize=18)
plt.title("max_depth=2", fontsize=14)
plt.subplot(122)
plot_regression_predictions(tree_reg2, X, y, ylabel=None)
for split, style in ((0.1973, "k-"), (0.0917, "k--"), (0.7718, "k--")):
    plt.plot([split, split], [-0.2, 1], style, linewidth=2)
# dotted lines: the additional depth-2 splits of the deeper tree
for split in (0.0458, 0.1298, 0.2873, 0.9040):
    plt.plot([split, split], [-0.2, 1], "k:", linewidth=1)
plt.text(0.3, 0.5, "Depth=2", fontsize=13)
plt.title("max_depth=3", fontsize=14)
plt.show()
# Predicted value for each region (red line) = avg target value of instances in that region.

# Regularization comparison: unrestricted tree vs min_samples_leaf=10.
tree_reg1 = DecisionTreeRegressor(random_state=42)
tree_reg2 = DecisionTreeRegressor(random_state=42, min_samples_leaf=10)
tree_reg1.fit(X, y)
tree_reg2.fit(X, y)
x1 = np.linspace(0, 1, 500).reshape(-1, 1)
y_pred1 = tree_reg1.predict(x1)
y_pred2 = tree_reg2.predict(x1)
plt.figure(figsize=(11, 4))
plt.subplot(121)
plt.plot(X, y, "b.")
plt.plot(x1, y_pred1, "r.-", linewidth=2, label=r"$\hat{y}$")
plt.axis([0, 1, -0.2, 1.1])
plt.xlabel("$x_1$", fontsize=18)
plt.ylabel("$y$", fontsize=18, rotation=0)
plt.legend(loc="upper center", fontsize=18)
plt.title("No restrictions", fontsize=14)
plt.subplot(122)
plt.plot(X, y, "b.")
plt.plot(x1, y_pred2, "r.-", linewidth=2, label=r"$\hat{y}$")
plt.axis([0, 1, -0.2, 1.1])
plt.xlabel("$x_1$", fontsize=18)
plt.title("min_samples_leaf={}".format(tree_reg2.min_samples_leaf), fontsize=14)
#save_fig("tree_regression_regularization_plot")
plt.show()
# left: no regularization (default params): overfitting
# right: more reasonable.
# Decision trees are sensitive to axis orientation: fit the same data
# before and after a 45-degree rotation.
rnd.seed(6)
Xs = rnd.rand(100, 2) - 0.5
ys = (Xs[:, 0] > 0).astype(np.float32) * 2
angle = np.pi / 4
rotation_matrix = np.array(
    [[np.cos(angle), -np.sin(angle)],
     [np.sin(angle), np.cos(angle)]])
Xsr = Xs.dot(rotation_matrix)
tree_clf_s = DecisionTreeClassifier(random_state=42)
tree_clf_s.fit(Xs, ys)
tree_clf_sr = DecisionTreeClassifier(random_state=42)
tree_clf_sr.fit(Xsr, ys)
plt.figure(figsize=(11, 4))
plt.subplot(121)
plot_decision_boundary(tree_clf_s, Xs, ys, axes=[-0.7, 0.7, -0.7, 0.7], iris=False)
plt.subplot(122)
plot_decision_boundary(tree_clf_sr, Xsr, ys, axes=[-0.7, 0.7, -0.7, 0.7], iris=False)
#save_fig("sensitivity_to_rotation_plot")
plt.show()
# left: std linearly separable dataset
# right: dataset rotated by 45degrees.
import numpy as np
import numpy.random as rnd
import matplotlib.pyplot as plt
# Simulate 10 series of 10,000 biased coin tosses (51% heads) and plot the
# running heads ratio of each series converging toward 0.51.
heads_proba = 0.51
coin_tosses = (rnd.rand(10000, 10) < heads_proba).astype(np.int32)
cumulative_heads_ratio = np.cumsum(
    coin_tosses, axis=0) / np.arange(1, 10001).reshape(-1, 1)
#cumulative_heads_ratio
plt.figure(figsize=(8,3.5))
plt.plot(cumulative_heads_ratio)
plt.plot([0, 10000], [0.51, 0.51], "k--", linewidth=2, label="51%")
plt.plot([0, 10000], [0.5, 0.5], "k-", label="50%")
plt.xlabel("Number of coin tosses")
plt.ylabel("Heads ratio")
plt.legend(loc="lower right")
plt.title("The law of large numbers:")
plt.axis([0, 10000, 0.42, 0.58])
#save_fig("law_of_large_numbers_plot")
plt.show()
# build a voting classifier in Scikit using three weaker classifiers
from sklearn.model_selection import train_test_split
from sklearn.datasets import make_moons
# use moons dataset
X, y = make_moons(
    n_samples=500,
    noise=0.30,
    random_state=42)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=42)
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
log_clf = LogisticRegression(random_state=42)
rnd_clf = RandomForestClassifier(random_state=42)
# probability=True so SVC exposes predict_proba (required for soft voting)
svm_clf = SVC(probability=True, random_state=42)
# voting classifier = logistic + random forest + SVC
voting_clf = VotingClassifier(
    estimators=[('lr', log_clf), ('rf', rnd_clf), ('svc', svm_clf)],
    voting='soft'
)
voting_clf.fit(X_train, y_train)
# let's see how each individual classifier did:
from sklearn.metrics import accuracy_score
for clf in (log_clf, rnd_clf, svm_clf, voting_clf):
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    print(clf.__class__.__name__, accuracy_score(y_test, y_pred))
# voting classifier did better than 3 individual ones!
If all classifiers can estimate class probabilities (i.e., they have a predict_proba() method), Scikit-Learn can predict the class with the highest probability averaged over all the individual classifiers (soft voting).
Soft voting is often better than hard voting because it gives more weight to highly confident votes. Replace voting="hard" with voting="soft" and ensure all classifiers can estimate class probabilities. (SVC cannot by default — set its probability hyperparameter to True.)
Setting probability=True tells SVC to use cross-validation to estimate class probabilities, which slows training but adds a predict_proba() method.
from sklearn.datasets import make_moons
from sklearn.ensemble import BaggingClassifier
from sklearn.metrics import accuracy_score
from sklearn.tree import DecisionTreeClassifier
# Train ensemble of 500 Decision Tree classifiers
# each using 100 training instances - randomly sampled from training set
# with replacement.
bag_clf = BaggingClassifier(
    DecisionTreeClassifier(random_state=42),
    n_estimators=500,
    max_samples=100,
    bootstrap=True, # set to False for pasting instead of bagging.
    n_jobs=-1,
    random_state=42)
bag_clf.fit(X_train, y_train)
y_pred = bag_clf.predict(X_test)
print(accuracy_score(y_test, y_pred))
# baseline: a single decision tree for comparison
tree_clf = DecisionTreeClassifier(random_state=42)
tree_clf.fit(X_train, y_train)
y_pred_tree = tree_clf.predict(X_test)
print(accuracy_score(y_test, y_pred_tree))
from matplotlib.colors import ListedColormap
def plot_decision_boundary(clf, X, y, axes=[-1.5, 2.5, -1, 1.5], alpha=0.5, contour=True):
    """Shade a binary classifier's predicted regions over a 100x100 grid
    and scatter the two classes (redefines the ch06 helper for moons data)."""
    x1s = np.linspace(axes[0], axes[1], 100)
    x2s = np.linspace(axes[2], axes[3], 100)
    x1, x2 = np.meshgrid(x1s, x2s)
    X_new = np.c_[x1.ravel(), x2.ravel()]
    y_pred = clf.predict(X_new).reshape(x1.shape)
    custom_cmap = ListedColormap(['#fafab0','#9898ff','#a0faa0'])
    plt.contourf(x1, x2, y_pred, alpha=0.3, cmap=custom_cmap, linewidth=10)
    if contour:
        custom_cmap2 = ListedColormap(['#7d7d58','#4c4c7f','#507d50'])
        plt.contour(x1, x2, y_pred, cmap=custom_cmap2, alpha=0.8)
    plt.plot(X[:, 0][y==0], X[:, 1][y==0], "yo", alpha=alpha)
    plt.plot(X[:, 0][y==1], X[:, 1][y==1], "bs", alpha=alpha)
    plt.axis(axes)
    plt.xlabel(r"$x_1$", fontsize=18)
    plt.ylabel(r"$x_2$", fontsize=18, rotation=0)
# Single tree vs bagged ensemble, side by side.
plt.figure(figsize=(11,4))
plt.subplot(121)
plot_decision_boundary(tree_clf, X, y)
plt.title("Decision Tree", fontsize=14)
plt.subplot(122)
plot_decision_boundary(bag_clf, X, y)
plt.title("Decision Trees with Bagging", fontsize=14)
#save_fig("decision_tree_without_and_with_bagging_plot")
plt.show()
# oob_score_: out-of-bag evaluation — estimates test-set accuracy without
# a separate validation set.
bag_clf = BaggingClassifier(
    DecisionTreeClassifier(),
    n_estimators=500,
    bootstrap=True,
    n_jobs=-1,
    oob_score=True
)
bag_clf.fit(X_train, y_train)
bag_clf.oob_score_
# did oob_score_ do a good job? compare against the real test accuracy
from sklearn.metrics import accuracy_score
y_pred = bag_clf.predict(X_test)
accuracy_score(y_test,y_pred)
# oob decision function for each training instance
bag_clf.oob_decision_function_
# Train an RF classifier with 500 trees limited to 16 max nodes each.
# splitter="random": tells each tree to pick the best split among
# a random subset of thresholds/features (bagging + random splits
# approximates a Random Forest).
bag_clf = BaggingClassifier(
    DecisionTreeClassifier(
        splitter="random",
        max_leaf_nodes=16,
        random_state=42),
    n_estimators=500,
    max_samples=1.0,
    bootstrap=True,
    n_jobs=-1,
    random_state=42)
bag_clf.fit(X_train, y_train)
y_pred = bag_clf.predict(X_test)
from sklearn.ensemble import RandomForestClassifier
rnd_clf = RandomForestClassifier(
    n_estimators=500,
    max_leaf_nodes=16,
    n_jobs=-1,
    random_state=42)
rnd_clf.fit(X_train, y_train)
y_pred_rf = rnd_clf.predict(X_test)
# almost identical predictions: fraction of agreement between the two
np.sum(y_pred == y_pred_rf) / len(y_pred)
# rank features by importance in iris
# #1: petal length: 44%
from sklearn.datasets import load_iris
iris = load_iris()
rnd_clf = RandomForestClassifier(
    n_estimators=500,
    n_jobs=-1,
    random_state=42)
rnd_clf.fit(iris["data"], iris["target"])
# feature_importances_ sums to 1 across features
for name, importance in zip(
        iris["feature_names"],
        rnd_clf.feature_importances_):
    print(name, "=", importance)
rnd_clf.feature_importances_
# Overlay the decision boundaries of 15 trees, each trained on a
# bootstrap sample, to visualize the variance that bagging averages out.
plt.figure(figsize=(6, 4))
for i in range(15):
    tree_clf = DecisionTreeClassifier(
        max_leaf_nodes=16,
        random_state=42+i)
    indices_with_replacement = rnd.randint(
        0,
        len(X_train),
        len(X_train))
    # NOTE(review): indices are drawn from range(len(X_train)) but used to
    # index the full X/y arrays — confirm X vs X_train is intentional.
    tree_clf.fit(
        X[indices_with_replacement],
        y[indices_with_replacement])
    plot_decision_boundary(
        tree_clf, X, y,
        axes=[-1.5, 2.5, -1, 1.5],
        alpha=0.02,
        contour=False)
plt.show()
# Plot decision boundaries of five predictors on moons dataset
# (manual AdaBoost-style boosting: upweight misclassified instances
# before training the next predictor).
m = len(X_train)
plt.figure(figsize=(11, 4))
for subplot, learning_rate in ((121, 1), (122, 0.5)):
    sample_weights = np.ones(m)
    for i in range(5):
        plt.subplot(subplot)
        svm_clf = SVC(
            kernel="rbf",
            C=0.05)
        svm_clf.fit(
            X_train, y_train,
            sample_weight=sample_weights)
        y_pred = svm_clf.predict(
            X_train)
        # boost the weights of the instances this predictor got wrong
        sample_weights[y_pred != y_train] *= (1 + learning_rate)
        plot_decision_boundary(
            svm_clf,
            X, y,
            alpha=0.2)
        plt.title("learning_rate = {}".format(learning_rate - 1),
                  fontsize=16)
plt.subplot(121)
plt.text(-0.7, -0.65, "1", fontsize=14)
plt.text(-0.6, -0.10, "2", fontsize=14)
plt.text(-0.5, 0.10, "3", fontsize=14)
plt.text(-0.4, 0.55, "4", fontsize=14)
plt.text(-0.3, 0.90, "5", fontsize=14)
#save_fig("boosting_plot")
plt.show()
# left: 1st clf gets many wrong, so 2nd clf gets boosted values.
# right: same sequence, but learning rate cut in half.
# train AdaBoost classifier on 200 decision stumps (DS)
# DS = decision tree with max_depth=1
from sklearn.ensemble import AdaBoostClassifier
ada_clf = AdaBoostClassifier(
    DecisionTreeClassifier(max_depth=1), n_estimators=200,
    algorithm="SAMME.R", learning_rate=0.5, random_state=42
)
ada_clf.fit(X_train, y_train)
plot_decision_boundary(ada_clf, X, y)
plt.show()
from sklearn.tree import DecisionTreeRegressor
# training set: a noisy quadratic function
rnd.seed(42)
X = rnd.rand(100, 1) - 0.5
y = 3*X[:, 0]**2 + 0.05 * rnd.randn(100)
# train Regressor
tree_reg1 = DecisionTreeRegressor(max_depth=2, random_state=42)
tree_reg1.fit(X, y)
# now train 2nd Regressor using errors (residuals) made by 1st one.
y2 = y - tree_reg1.predict(X)
tree_reg2 = DecisionTreeRegressor(max_depth=2, random_state=42)
tree_reg2.fit(X, y2)
# now train 3rd Regressor using errors made by 2nd one.
y3 = y2 - tree_reg2.predict(X)
tree_reg3 = DecisionTreeRegressor(max_depth=2, random_state=42)
tree_reg3.fit(X, y3)
X_new = np.array([[0.8]])
# now have ensemble w/ three trees; its prediction is the SUM of the
# individual trees' predictions (gradient boosting by hand).
y_pred = sum(tree.predict(X_new) for tree in (
    tree_reg1, tree_reg2, tree_reg3))
print(y_pred)
def plot_predictions(
        regressors, X, y, axes,
        label=None,
        style="r-",
        data_style="b.",
        data_label=None):
    """Plot the summed predictions of an additive ensemble of regressors
    (red line by default) over the training data."""
    x1 = np.linspace(axes[0], axes[1], 500)
    # ensemble prediction = sum of each regressor's prediction
    y_pred = sum(
        regressor.predict(x1.reshape(-1, 1)) for regressor in regressors)
    plt.plot(X[:, 0], y, data_style, label=data_label)
    plt.plot(x1, y_pred, style, linewidth=2, label=label)
    if label or data_label:
        plt.legend(loc="upper center", fontsize=16)
    plt.axis(axes)
# 3x2 grid: left column = each tree fit to the current residuals,
# right column = the growing ensemble's summed prediction.
plt.figure(figsize=(11,11))
plt.subplot(321)
plot_predictions([tree_reg1], X, y, axes=[-0.5, 0.5, -0.1, 0.8], label="$h_1(x_1)$", style="g-", data_label="Training set")
plt.ylabel("$y$", fontsize=16, rotation=0)
plt.title("Residuals and tree predictions", fontsize=16)
plt.subplot(322)
plot_predictions([tree_reg1], X, y, axes=[-0.5, 0.5, -0.1, 0.8], label="$h(x_1) = h_1(x_1)$", data_label="Training set")
plt.ylabel("$y$", fontsize=16, rotation=0)
plt.title("Ensemble predictions", fontsize=16)
plt.subplot(323)
plot_predictions([tree_reg2], X, y2, axes=[-0.5, 0.5, -0.5, 0.5], label="$h_2(x_1)$", style="g-", data_style="k+", data_label="Residuals")
plt.ylabel("$y - h_1(x_1)$", fontsize=16)
plt.subplot(324)
plot_predictions([tree_reg1, tree_reg2], X, y, axes=[-0.5, 0.5, -0.1, 0.8], label="$h(x_1) = h_1(x_1) + h_2(x_1)$")
plt.ylabel("$y$", fontsize=16, rotation=0)
plt.subplot(325)
plot_predictions([tree_reg3], X, y3, axes=[-0.5, 0.5, -0.5, 0.5], label="$h_3(x_1)$", style="g-", data_style="k+")
plt.ylabel("$y - h_1(x_1) - h_2(x_1)$", fontsize=16)
plt.xlabel("$x_1$", fontsize=16)
plt.subplot(326)
plot_predictions([tree_reg1, tree_reg2, tree_reg3], X, y, axes=[-0.5, 0.5, -0.1, 0.8], label="$h(x_1) = h_1(x_1) + h_2(x_1) + h_3(x_1)$")
plt.xlabel("$x_1$", fontsize=16)
plt.ylabel("$y$", fontsize=16, rotation=0)
#save_fig("gradient_boosting_plot")
plt.show()
# 1st row: ensemble = only one tree: predictions match 1st tree.
# 2nd row: new tree trained on residual errors of 1st tree.
# 3rd row: " "
# result: ensemble predictions get better as trees are added.
# two GBRT ensembles trained with low learning rate
from sklearn.ensemble import GradientBoostingRegressor
# few estimators (3)
gbrt = GradientBoostingRegressor(
    max_depth=2,
    n_estimators=3,
    learning_rate=0.1,
    random_state=42)
gbrt.fit(X, y)
# many estimators (200) for comparison
gbrt_slow = GradientBoostingRegressor(
    max_depth=2,
    n_estimators=200,
    learning_rate=0.1,
    random_state=42)
gbrt_slow.fit(X, y)
plt.figure(figsize=(11,4))
plt.subplot(121)
plot_predictions(
    [gbrt], X, y,
    axes=[-0.5, 0.5, -0.1, 0.8],
    label="Ensemble predictions")
plt.title("learning_rate={}, n_estimators={}".format(gbrt.learning_rate, gbrt.n_estimators), fontsize=14)
plt.subplot(122)
plot_predictions(
    [gbrt_slow], X, y,
    axes=[-0.5, 0.5, -0.1, 0.8])
plt.title("learning_rate={}, n_estimators={}".format(gbrt_slow.learning_rate, gbrt_slow.n_estimators), fontsize=14)
#save_fig("gbrt_learning_rate_plot")
plt.show()
# left: not enough trees (underfits)
# right: too many trees (overfits)
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
X_train, X_val, y_train, y_val = train_test_split(X, y)
# train GBRT regressor with 120 trees
gbrt = GradientBoostingRegressor(
    max_depth=2,
    n_estimators=120,
    learning_rate=0.1,
    random_state=42)
gbrt.fit(X_train, y_train)
# measure MSE validation error at each stage:
# staged_predict yields predictions after 1, 2, ..., 120 trees
errors = [mean_squared_error(y_val, y_pred) for y_pred in gbrt.staged_predict(X_val)]
errors
# train another GBRT ensemble using optimal #trees
# FIX: errors[i] is the validation MSE with i+1 trees (staged_predict yields
# after 1, 2, ... trees), so the optimal tree COUNT is argmin(errors) + 1 —
# the bare argmin was off by one.
best_n_estimators = int(np.argmin(errors)) + 1
min_error = errors[best_n_estimators - 1]
gbrt_best = GradientBoostingRegressor(
    max_depth=2,
    n_estimators=best_n_estimators,
    learning_rate=0.1,
    random_state=42)
gbrt_best.fit(X_train, y_train)
plt.figure(figsize=(11, 4))
plt.subplot(121)
plt.plot(errors, "b.-")
plt.plot([best_n_estimators, best_n_estimators], [0, min_error], "k--")
plt.plot([0, 120], [min_error, min_error], "k--")
plt.plot(best_n_estimators, min_error, "ko")
plt.text(best_n_estimators, min_error*1.2, "Minimum", ha="center", fontsize=14)
plt.axis([0, 120, 0, 0.01])
plt.xlabel("Number of trees")
plt.title("Validation error", fontsize=14)
plt.subplot(122)
plot_predictions([gbrt_best], X, y, axes=[-0.5, 0.5, -0.1, 0.8])
plt.title("Best model (55 trees)", fontsize=14)
#save_fig("early_stopping_gbrt_plot")
plt.show()
# Incremental training with warm_start=True: keep existing trees and
# add more on each fit() call.
gbrt = GradientBoostingRegressor(
    max_depth=2,
    n_estimators=1,
    learning_rate=0.1,
    random_state=42,
    warm_start=True)
min_val_error = float("inf")
error_going_up = 0
# 120 estimators.
# stop training when validation error doesn't improve for
# five consecutive iterations
for n_estimators in range(1, 120):
    gbrt.n_estimators = n_estimators
    gbrt.fit(X_train, y_train)
    y_pred = gbrt.predict(X_val)
    val_error = mean_squared_error(y_val, y_pred)
    if val_error < min_val_error:
        min_val_error = val_error
        error_going_up = 0
    else:
        error_going_up += 1
        if error_going_up == 5:
            break # early stopping
print(gbrt.n_estimators)
# todo: stacking implementation
1) Most points in a high-dimensional hypercube lie very close to a border.
2) Distances between random points are much greater on average, so high-dimensional datasets are at high risk of being very sparse.
import numpy as np
import numpy.random as rnd
# build a 3D dataset: points near a 2D plane embedded in 3D, plus noise
rnd.seed(4)
m = 60
w1, w2 = 0.1, 0.3
noise = 0.1
angles = rnd.rand(m) * 3 * np.pi / 2 - 0.5
X = np.empty((m, 3))
X[:, 0] = np.cos(angles) + np.sin(angles)/2 + noise * rnd.randn(m) / 2
X[:, 1] = np.sin(angles) * 0.7 + noise * rnd.randn(m) / 2
# third coordinate is (roughly) a linear combination of the first two
X[:, 2] = X[:, 0] * w1 + X[:, 1] * w2 + noise * rnd.randn(m)
# mean-normalize the data
X = X - X.mean(axis=0)
# apply PCA to reduce to 2D
from sklearn.decomposition import PCA
pca = PCA(n_components = 2)
X2D = pca.fit_transform(X)
# recover 3D points projected on 2D plane
X2D_inv = pca.inverse_transform(X2D)
# utility to draw 3D arrows
from matplotlib.patches import FancyArrowPatch
from mpl_toolkits.mplot3d import proj3d
class Arrow3D(FancyArrowPatch):
    """FancyArrowPatch whose endpoints are stored in 3D and projected to
    2D display coordinates at draw time."""
    def __init__(self, xs, ys, zs, *args, **kwargs):
        # start as a dummy 2D arrow; real endpoints are set in draw()
        FancyArrowPatch.__init__(self, (0,0), (0,0), *args, **kwargs)
        self._verts3d = xs, ys, zs
    def draw(self, renderer):
        xs3d, ys3d, zs3d = self._verts3d
        # project the stored 3D endpoints using the current view matrix
        xs, ys, zs = proj3d.proj_transform(xs3d, ys3d, zs3d, renderer.M)
        self.set_positions((xs[0],ys[0]),(xs[1],ys[1]))
        FancyArrowPatch.draw(self, renderer)
# express plane as function of x,y
# R = C^T C is the projection matrix onto the PCA plane; solving
# z = (R[0,2] x1 + R[1,2] x2) / (1 - R[2,2]) gives the plane surface.
axes = [-1.8, 1.8, -1.3, 1.3, -1.0, 1.0]
x1s = np.linspace(axes[0], axes[1], 10)
x2s = np.linspace(axes[2], axes[3], 10)
x1, x2 = np.meshgrid(x1s, x2s)
C = pca.components_
R = C.T.dot(C)
z = (R[0, 2] * x1 + R[1, 2] * x2) / (1 - R[2, 2])
# plot 3D dataset, plane & projections
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
fig = plt.figure(figsize=(10, 10))
ax = fig.add_subplot(111, projection='3d')
# Split points by whether they sit above or below the PCA plane so points
# behind the plane can be drawn semi-transparent.
X3D_above = X[X[:, 2] > X2D_inv[:, 2]]
X3D_below = X[X[:, 2] <= X2D_inv[:, 2]]
ax.plot(X3D_below[:, 0], X3D_below[:, 1], X3D_below[:, 2], "bo", alpha=0.5)
ax.plot_surface(x1, x2, z, alpha=0.2, color="k")
np.linalg.norm(C, axis=0)
# Draw the two principal-component vectors as 3D arrows from the origin.
ax.add_artist(Arrow3D([0, C[0, 0]],[0, C[0, 1]],[0, C[0, 2]], mutation_scale=15, lw=1, arrowstyle="-|>", color="k"))
ax.add_artist(Arrow3D([0, C[1, 0]],[0, C[1, 1]],[0, C[1, 2]], mutation_scale=15, lw=1, arrowstyle="-|>", color="k"))
ax.plot([0], [0], [0], "k.")
# Connect each original point to its projection on the plane.
for i in range(m):
if X[i, 2] > X2D_inv[i, 2]:
ax.plot([X[i][0], X2D_inv[i][0]], [X[i][1], X2D_inv[i][1]], [X[i][2], X2D_inv[i][2]], "k-")
else:
ax.plot([X[i][0], X2D_inv[i][0]], [X[i][1], X2D_inv[i][1]], [X[i][2], X2D_inv[i][2]], "k-", color="#505050")
ax.plot(X2D_inv[:, 0], X2D_inv[:, 1], X2D_inv[:, 2], "k+")
ax.plot(X2D_inv[:, 0], X2D_inv[:, 1], X2D_inv[:, 2], "k.")
ax.plot(X3D_above[:, 0], X3D_above[:, 1], X3D_above[:, 2], "bo")
ax.set_xlabel("$x_1$", fontsize=18)
ax.set_ylabel("$x_2$", fontsize=18)
ax.set_zlabel("$x_3$", fontsize=18)
ax.set_xlim(axes[0:2])
ax.set_ylim(axes[2:4])
ax.set_zlim(axes[4:6])
#save_fig("dataset_3d_plot")
plt.show()
# 2D projection equivalent:
# Same data seen in the plane's own coordinate system (z1, z2).
fig = plt.figure()
ax = fig.add_subplot(111, aspect='equal')
ax.plot(X2D[:, 0], X2D[:, 1], "k+")
ax.plot(X2D[:, 0], X2D[:, 1], "k.")
ax.plot([0], [0], "ko")
ax.arrow(0, 0, 0, 1, head_width=0.05, length_includes_head=True, head_length=0.1, fc='k', ec='k')
ax.arrow(0, 0, 1, 0, head_width=0.05, length_includes_head=True, head_length=0.1, fc='k', ec='k')
ax.set_xlabel("$z_1$", fontsize=18)
ax.set_ylabel("$z_2$", fontsize=18, rotation=0)
ax.axis([-1.5, 1.3, -1.2, 1.2])
ax.grid(True)
plt.show()
# Swiss roll visualization:
# t is the position along the roll (the manifold coordinate) and doubles as
# the color value in all the scatter plots below.
from sklearn.datasets import make_swiss_roll
X, t = make_swiss_roll(n_samples=1000, noise=0.2, random_state=42)
axes = [-11.5, 14, -2, 23, -12, 15]
fig = plt.figure(figsize=(8, 6))
ax = fig.add_subplot(111, projection='3d')
ax.scatter(X[:, 0], X[:, 1], X[:, 2], c=t, cmap=plt.cm.hot)
ax.view_init(10, -70)
ax.set_xlabel("$x_1$", fontsize=18)
ax.set_ylabel("$x_2$", fontsize=18)
ax.set_zlabel("$x_3$", fontsize=18)
ax.set_xlim(axes[0:2])
ax.set_ylim(axes[2:4])
ax.set_zlim(axes[4:6])
#save_fig("swiss_roll_plot")
plt.show()
# "squashed" swiss roll visualization:
# Left: naive projection onto (x1, x2) squashes the layers together.
# Right: plotting against the manifold coordinate t unrolls the roll.
plt.figure(figsize=(11, 4))
plt.subplot(121)
plt.scatter(X[:, 0], X[:, 1], c=t, cmap=plt.cm.hot)
plt.axis(axes[:4])
plt.xlabel("$x_1$", fontsize=18)
plt.ylabel("$x_2$", fontsize=18, rotation=0)
plt.grid(True)
plt.subplot(122)
plt.scatter(t, X[:, 1], c=t, cmap=plt.cm.hot)
plt.axis([4, 15, axes[2], axes[3]])
plt.xlabel("$z_1$", fontsize=18)
plt.grid(True)
#save_fig("squished_swiss_roll_plot")
plt.show()
from matplotlib import gridspec
axes = [-11.5, 14, -2, 23, -12, 15]
x2s = np.linspace(axes[2], axes[3], 10)
x3s = np.linspace(axes[4], axes[5], 10)
x2, x3 = np.meshgrid(x2s, x3s)
fig = plt.figure(figsize=(6, 5))
ax = plt.subplot(111, projection='3d')
# First decision boundary: a simple axis-aligned cut (x1 > 5), drawn as a
# vertical wireframe plane in 3D.
positive_class = X[:, 0] > 5
X_pos = X[positive_class]
X_neg = X[~positive_class]
ax.view_init(10, -70)
ax.plot(X_neg[:, 0], X_neg[:, 1], X_neg[:, 2], "y^")
ax.plot_wireframe(5, x2, x3, alpha=0.5)
ax.plot(X_pos[:, 0], X_pos[:, 1], X_pos[:, 2], "gs")
ax.set_xlabel("$x_1$", fontsize=18)
ax.set_ylabel("$x_2$", fontsize=18)
ax.set_zlabel("$x_3$", fontsize=18)
ax.set_xlim(axes[0:2])
ax.set_ylim(axes[2:4])
ax.set_zlim(axes[4:6])
#save_fig("manifold_decision_boundary_plot1")
plt.show()
# Same classes in the unrolled (t, x2) space: the boundary looks complicated.
fig = plt.figure(figsize=(5, 4))
ax = plt.subplot(111)
plt.plot(t[positive_class], X[positive_class, 1], "gs")
plt.plot(t[~positive_class], X[~positive_class, 1], "y^")
plt.axis([4, 15, axes[2], axes[3]])
plt.xlabel("$z_1$", fontsize=18)
plt.ylabel("$z_2$", fontsize=18, rotation=0)
plt.grid(True)
#save_fig("manifold_decision_boundary_plot2")
plt.show()
# Second decision boundary: defined in manifold coordinates (2*(t-4) > x2),
# which looks tangled in 3D...
fig = plt.figure(figsize=(6, 5))
ax = plt.subplot(111, projection='3d')
positive_class = 2 * (t[:] - 4) > X[:, 1]
X_pos = X[positive_class]
X_neg = X[~positive_class]
ax.view_init(10, -70)
ax.plot(X_neg[:, 0], X_neg[:, 1], X_neg[:, 2], "y^")
ax.plot(X_pos[:, 0], X_pos[:, 1], X_pos[:, 2], "gs")
ax.set_xlabel("$x_1$", fontsize=18)
ax.set_ylabel("$x_2$", fontsize=18)
ax.set_zlabel("$x_3$", fontsize=18)
ax.set_xlim(axes[0:2])
ax.set_ylim(axes[2:4])
ax.set_zlim(axes[4:6])
#save_fig("manifold_decision_boundary_plot3")
plt.show()
# ...but is a straight line in the unrolled space.
fig = plt.figure(figsize=(5, 4))
ax = plt.subplot(111)
plt.plot(t[positive_class], X[positive_class, 1], "gs")
plt.plot(t[~positive_class], X[~positive_class, 1], "y^")
plt.plot([4, 15], [0, 22], "b-", linewidth=2)
plt.axis([4, 15, axes[2], axes[3]])
plt.xlabel("$z_1$", fontsize=18)
plt.ylabel("$z_2$", fontsize=18, rotation=0)
plt.grid(True)
#save_fig("manifold_decision_boundary_plot4")
plt.show()
# Lesson learned (below):
# Unrolling a dataset to a lower dimension doesn't necessarily lead to
# a simpler representation.
# Build a stretched, rotated 2D Gaussian cloud and project it onto three
# candidate axes (u1 = max-variance direction, u2 = intermediate, u3 =
# orthogonal to u1) to show why PCA picks the max-variance axis.
angle = np.pi / 5
stretch = 5
m = 200
rnd.seed(3)
X = rnd.randn(m, 2) / 10
X = X.dot(np.array([[stretch, 0],[0, 1]])) # stretch
X = X.dot([[np.cos(angle), np.sin(angle)], [-np.sin(angle), np.cos(angle)]]) # rotate
u1 = np.array([np.cos(angle), np.sin(angle)])
u2 = np.array([np.cos(angle - 2 * np.pi/6), np.sin(angle - 2 * np.pi/6)])
u3 = np.array([np.cos(angle - np.pi/2), np.sin(angle - np.pi/2)])
# 1D projections of the data onto each unit vector.
X_proj1 = X.dot(u1.reshape(-1, 1))
X_proj2 = X.dot(u2.reshape(-1, 1))
X_proj3 = X.dot(u3.reshape(-1, 1))
plt.figure(figsize=(8,4))
plt.subplot2grid((3,2), (0, 0), rowspan=3)
plt.plot([-1.4, 1.4], [-1.4*u1[1]/u1[0], 1.4*u1[1]/u1[0]], "k-", linewidth=1)
plt.plot([-1.4, 1.4], [-1.4*u2[1]/u2[0], 1.4*u2[1]/u2[0]], "k--", linewidth=1)
plt.plot([-1.4, 1.4], [-1.4*u3[1]/u3[0], 1.4*u3[1]/u3[0]], "k:", linewidth=2)
plt.plot(X[:, 0], X[:, 1], "bo", alpha=0.5)
plt.axis([-1.4, 1.4, -1.4, 1.4])
plt.arrow(0, 0, u1[0], u1[1], head_width=0.1, linewidth=5, length_includes_head=True, head_length=0.1, fc='k', ec='k')
plt.arrow(0, 0, u3[0], u3[1], head_width=0.1, linewidth=5, length_includes_head=True, head_length=0.1, fc='k', ec='k')
plt.text(u1[0] + 0.1, u1[1] - 0.05, r"$\mathbf{c_1}$", fontsize=22)
plt.text(u3[0] + 0.1, u3[1], r"$\mathbf{c_2}$", fontsize=22)
plt.xlabel("$x_1$", fontsize=18)
plt.ylabel("$x_2$", fontsize=18, rotation=0)
plt.grid(True)
# Right column: the three 1D projections; u1 preserves the most variance.
plt.subplot2grid((3,2), (0, 1))
plt.plot([-2, 2], [0, 0], "k-", linewidth=1)
plt.plot(X_proj1[:, 0], np.zeros(m), "bo", alpha=0.3)
plt.gca().get_yaxis().set_ticks([])
plt.gca().get_xaxis().set_ticklabels([])
plt.axis([-2, 2, -1, 1])
plt.grid(True)
plt.subplot2grid((3,2), (1, 1))
plt.plot([-2, 2], [0, 0], "k--", linewidth=1)
plt.plot(X_proj2[:, 0], np.zeros(m), "bo", alpha=0.3)
plt.gca().get_yaxis().set_ticks([])
plt.gca().get_xaxis().set_ticklabels([])
plt.axis([-2, 2, -1, 1])
plt.grid(True)
plt.subplot2grid((3,2), (2, 1))
plt.plot([-2, 2], [0, 0], "k:", linewidth=2)
plt.plot(X_proj3[:, 0], np.zeros(m), "bo", alpha=0.3)
plt.gca().get_yaxis().set_ticks([])
plt.axis([-2, 2, -1, 1])
plt.xlabel("$z_1$", fontsize=18)
plt.grid(True)
#save_fig("pca_best_projection")
plt.show()
Each axis vector is called a principal component (PC).
PCs are found using Singular Value Decomposition (SVD), a matrix-factorization technique.
# use NumPy svd() to get principal components of training set,
# then extract 1st two PCs.
# NOTE: np.linalg.svd returns V already transposed (Vh), so the code's "V"
# holds the components as rows; V.T[:, 0] is therefore the first PC.
X_centered = X - X.mean(axis=0)
U,s,V = np.linalg.svd(X_centered)
c1, c2 = V.T[:,0], V.T[:,1]
print(c1,c2)
# project training set onto plane defined by 1st two PCs.
W2 = V.T[:, :2]
X2D = X_centered.dot(W2)
print(X2D)
# Same thing via scikit-learn (PCA centers the data itself).
from sklearn.decomposition import PCA
pca = PCA(n_components = 2)
X2D = pca.fit_transform(X)
print(pca.components_[0])
print(pca.components_.T[:,0])
# 95% of dataset variance explained by 1st axis.
print(pca.explained_variance_ratio_)
# find minimum d to preserve 95% of training set variance
pca = PCA()
pca.fit(X)
cumsum = np.cumsum(pca.explained_variance_ratio_)
d = np.argmax(cumsum >= 0.95) + 1
print(d)
#MNIST compression:
from sklearn.model_selection import train_test_split
from sklearn.datasets import fetch_mldata
#mnist = fetch_mldata('MNIST original')
# NOTE(review): mldata.org is dead and fetch_mldata was removed from modern
# scikit-learn; this loads the same data from a local .mat file instead.
mnist_path = "./mnist-original.mat"
from scipy.io import loadmat
mnist_raw = loadmat(mnist_path)
mnist = {
"data": mnist_raw["data"].T,
"target": mnist_raw["label"][0],
"COL_NAMES": ["label", "data"],
"DESCR": "mldata.org dataset: mnist-original",
}
X, y = mnist["data"], mnist["target"]
X_train, X_test, y_train, y_test = train_test_split(X, y)
X = X_train
pca = PCA()
pca.fit(X)
# Smallest d whose cumulative explained variance reaches 95%.
d = np.argmax(np.cumsum(pca.explained_variance_ratio_) >= 0.95) + 1
d
# Passing a float in (0, 1) as n_components asks PCA to pick d itself.
pca = PCA(n_components=0.95)
X_reduced = pca.fit_transform(X)
pca.n_components_
# did you hit your 95% minimum?
np.sum(pca.explained_variance_ratio_)
# use inverse_transform to decompress back to 784 dimensions
# (lossy: only the variance captured by the 154 components is recovered)
X_mnist = X_train
pca = PCA(n_components = 154)
X_mnist_reduced = pca.fit_transform(X_mnist)
X_mnist_recovered = pca.inverse_transform(X_mnist_reduced)
import matplotlib
import matplotlib.pyplot as plt
def plot_digits(instances, images_per_row=5, **options):
    """Render a batch of flattened 28x28 digit images as one tiled grid.

    instances: iterable of flat arrays of length 784; extra keyword options
    are forwarded to plt.imshow.
    """
    size = 28
    images_per_row = min(len(instances), images_per_row)
    images = [digit.reshape(size, size) for digit in instances]
    n_rows = (len(instances) - 1) // images_per_row + 1
    # Pad the last row with a single blank strip so every row has equal width.
    n_empty = n_rows * images_per_row - len(instances)
    images.append(np.zeros((size, size * n_empty)))
    row_images = [
        np.concatenate(images[r * images_per_row:(r + 1) * images_per_row], axis=1)
        for r in range(n_rows)
    ]
    grid = np.concatenate(row_images, axis=0)
    plt.imshow(grid, cmap=matplotlib.cm.binary, **options)
    plt.axis("off")
# Side-by-side: every 2100th digit, original vs. PCA-reconstructed.
plt.figure(figsize=(7, 4))
plt.subplot(121)
plot_digits(X_mnist[::2100])
plt.title("Original", fontsize=16)
plt.subplot(122)
plot_digits(X_mnist_recovered[::2100])
plt.title("Compressed", fontsize=16)
#save_fig("mnist_compression_plot")
plt.show()
# split MNIST into 100 minibatches using Numpy array_split()
# reduce MNIST down to 154 dimensions as before.
# note use of partial_fit() for each batch.
from sklearn.decomposition import IncrementalPCA
n_batches = 100
inc_pca = IncrementalPCA(n_components=154)
for X_batch in np.array_split(X_mnist, n_batches):
print(".", end="")
inc_pca.partial_fit(X_batch)
X_mnist_reduced_inc = inc_pca.transform(X_mnist)
# alternative: Numpy memmap class (use binary array on disk as if it was in memory)
# 'write' / 'readonly' are the long spellings of memmap modes 'w+' / 'r'.
filename = "my_mnist.data"
X_mm = np.memmap(
filename, dtype='float32', mode='write', shape=X_mnist.shape)
X_mm[:] = X_mnist
# del flushes the memmap's changes to disk before we reopen it read-only.
del X_mm
X_mm = np.memmap(filename, dtype='float32', mode='readonly', shape=X_mnist.shape)
batch_size = len(X_mnist) // n_batches
inc_pca = IncrementalPCA(n_components=154, batch_size=batch_size)
inc_pca.fit(X_mm)
# Randomized PCA: stochastic approximation of the first d principal
# components; much faster than full SVD when d << n_features.
rnd_pca = PCA(
    n_components=154,
    random_state=42,
    svd_solver="randomized")
X_reduced = rnd_pca.fit_transform(X_mnist)

import time

# Timing comparison of full, incremental, and randomized PCA.
# BUG FIX: the incremental and randomized estimators previously hard-coded
# n_components=154 inside this loop, so at n_components = 2 and 10 the three
# timings solved different problems; all three now use the loop variable.
for n_components in (2, 10, 154):
    print("n_components =", n_components)
    regular_pca = PCA(
        n_components=n_components)
    inc_pca = IncrementalPCA(
        n_components=n_components,
        batch_size=500)
    rnd_pca = PCA(
        n_components=n_components,
        random_state=42,
        svd_solver="randomized")
    for pca in (regular_pca, inc_pca, rnd_pca):
        t1 = time.time()
        pca.fit(X_mnist)
        t2 = time.time()
        print(pca.__class__.__name__, t2 - t1, "seconds")

# One standalone timing of randomized PCA at 154 components.
rnd_pca = PCA(n_components=154, svd_solver="randomized")
t1 = time.time()
X_reduced = rnd_pca.fit_transform(X_mnist)
t2 = time.time()
print(t2 - t1, "seconds")
# Below: Swiss roll reduced to 2D using 3 techniques:
# 1) linear kernel (equiv to PCA)
# 2) RBF kernel
# 3) sigmoid kernel (logistic)
from sklearn.decomposition import KernelPCA
X, t = make_swiss_roll(
n_samples=1000,
noise=0.2,
random_state=42)
lin_pca = KernelPCA(
n_components = 2,
kernel="linear",
fit_inverse_transform=True)
rbf_pca = KernelPCA(
n_components = 2,
kernel="rbf",
gamma=0.0433,
fit_inverse_transform=True)
sig_pca = KernelPCA(
n_components = 2,
kernel="sigmoid",
gamma=0.001,
coef0=1,
fit_inverse_transform=True)
# Binary labels for the grid search further below (position along the roll).
y = t > 6.9
plt.figure(figsize=(11, 4))
for subplot, pca, title in (
(131, lin_pca, "Linear kernel"),
(132, rbf_pca, "RBF kernel, $\gamma=0.04$"),
(133, sig_pca, "Sigmoid kernel, $\gamma=10^{-3}, r=1$")):
X_reduced = pca.fit_transform(X)
if subplot == 132:
X_reduced_rbf = X_reduced
plt.subplot(subplot)
#plt.plot(X_reduced[y, 0], X_reduced[y, 1], "gs")
#plt.plot(X_reduced[~y, 0], X_reduced[~y, 1], "y^")
plt.title(title, fontsize=14)
plt.scatter(X_reduced[:, 0], X_reduced[:, 1], c=t, cmap=plt.cm.hot)
plt.xlabel("$z_1$", fontsize=18)
if subplot == 131:
plt.ylabel("$z_2$", fontsize=18, rotation=0)
plt.grid(True)
#save_fig("kernel_pca_plot")
plt.show()
# Tune kPCA hyperparameters indirectly: grid-search the kernel/gamma that
# maximizes accuracy of a downstream classifier on the reduced data.
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
clf = Pipeline([
("kpca", KernelPCA(n_components=2)),
("log_reg", LogisticRegression())])
param_grid = [{
"kpca__gamma": np.linspace(0.03, 0.05, 10),
"kpca__kernel": ["rbf", "sigmoid"]}]
grid_search = GridSearchCV(clf, param_grid, cv=3)
grid_search.fit(X, y)
# best kernel & params?
print(grid_search.best_params_)
rbf_pca = KernelPCA(
n_components = 2,
kernel="rbf",
gamma=0.0433,
fit_inverse_transform=True) # perform reconstruction
X_reduced = rbf_pca.fit_transform(X)
X_preimage = rbf_pca.inverse_transform(X_reduced)
# return reconstruction pre-image error
from sklearn.metrics import mean_squared_error
mean_squared_error(X, X_preimage)
# Timing: randomized PCA vs. full PCA as n_samples grows (d and features fixed).
times_rpca = []
times_pca = []
sizes = [1000, 10000, 20000, 30000, 40000, 50000, 70000,
100000, 200000, 500000]
for n_samples in sizes:
X = rnd.randn(n_samples, 5)
pca = PCA(
n_components = 2,
random_state=42,
svd_solver="randomized")
t1 = time.time()
pca.fit(X)
t2 = time.time()
times_rpca.append(t2 - t1)
pca = PCA(n_components = 2)
t1 = time.time()
pca.fit(X)
t2 = time.time()
times_pca.append(t2 - t1)
plt.plot(sizes, times_rpca, "b-o", label="RPCA")
plt.plot(sizes, times_pca, "r-s", label="PCA")
plt.xlabel("n_samples")
plt.ylabel("Training time")
plt.legend(loc="upper left")
plt.title("PCA and Randomized PCA time complexity ")
plt.show()
# Use LLE to unroll a Swiss Roll.
from sklearn.manifold import LocallyLinearEmbedding
X, t = make_swiss_roll(
n_samples=1000,
noise=0.2,
random_state=41)
lle = LocallyLinearEmbedding(
n_neighbors=10,
n_components=2,
random_state=42)
X_reduced = lle.fit_transform(X)
plt.title("Unrolled swiss roll using LLE", fontsize=14)
plt.scatter(X_reduced[:, 0], X_reduced[:, 1], c=t, cmap=plt.cm.hot)
plt.xlabel("$z_1$", fontsize=18)
plt.ylabel("$z_2$", fontsize=18)
plt.axis([-0.065, 0.055, -0.1, 0.12])
plt.grid(True)
#save_fig("lle_unrolling_plot")
plt.show()
# Three more manifold-learning reductions of the same swiss roll.
from sklearn.manifold import MDS
mds = MDS(n_components=2, random_state=42)
X_reduced_mds = mds.fit_transform(X)
from sklearn.manifold import Isomap
isomap = Isomap(n_components=2)
X_reduced_isomap = isomap.fit_transform(X)
from sklearn.manifold import TSNE
tsne = TSNE(n_components=2)
X_reduced_tsne = tsne.fit_transform(X)
# LDA is supervised: it needs class labels, so it is fit on MNIST instead.
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
lda = LinearDiscriminantAnalysis(n_components=2)
X_mnist = mnist["data"]
y_mnist = mnist["target"]
lda.fit(X_mnist, y_mnist)
X_reduced_lda = lda.transform(X_mnist)
titles = ["MDS", "Isomap", "t-SNE"]
plt.figure(figsize=(11,4))
for subplot, title, X_reduced in zip((131, 132, 133), titles,
(X_reduced_mds, X_reduced_isomap, X_reduced_tsne)):
plt.subplot(subplot)
plt.title(title, fontsize=14)
plt.scatter(X_reduced[:, 0], X_reduced[:, 1], c=t, cmap=plt.cm.hot)
plt.xlabel("$z_1$", fontsize=18)
if subplot == 131:
plt.ylabel("$z_2$", fontsize=18, rotation=0)
plt.grid(True)
#save_fig("other_dim_reduction_plot")
plt.show()
$ cd
$ source env/bin/activate  # if using virtualenv
$ pip3 install --upgrade tensorflow  # or tensorflow-gpu for GPU support
$ python3 -c 'import tensorflow; print(tensorflow.__version__)'
# Verify the installed TensorFlow version from within the notebook.
!python3 -c 'import tensorflow; print(tensorflow.__version__)'
import numpy as np
# create your first graph
# NOTE: this whole chapter uses the TensorFlow 1.x graph/session API
# (tf.Session, tf.placeholder, ...), which was removed in TF 2.x.
import tensorflow as tf
x = tf.Variable(3, name="x")
y = tf.Variable(4, name="y")
f = x*x*y + y + 2
# run graph by opening a session
sess = tf.Session()
sess.run(x.initializer)
sess.run(y.initializer)
result = sess.run(f)
print(result)
sess.close()
# for repeated session "runs"
# (inside the with-block this session is the default, so .run()/.eval()
# need no explicit session argument, and close() is automatic)
with tf.Session() as sess:
x.initializer.run()
y.initializer.run()
result = f.eval()
print(result)
# use global_variables_initializer() to set up initialization
init = tf.global_variables_initializer()
with tf.Session() as sess:
init.run() # actually initialize all the variables
result = f.eval()
print(result)
# interactive sessions (from within Jupyter or Python shell)
# interactive sesions are auto-set as default sessions
sess = tf.InteractiveSession()
init.run()
result = f.eval()
print(result)
sess.close()
# any created node = added to default graph
x1 = tf.Variable(1)
x1.graph is tf.get_default_graph()
# handling multiple graphs
graph = tf.Graph()
with graph.as_default():
x2 = tf.Variable(2)
x2.graph is graph, x2.graph is tf.get_default_graph()
# TF finds node's dependencies & evaluates them first
w = tf.constant(3)
x = w + 2
y = x + 5
z = x * 3
# previous eval results = NOT reused. above code evals w & x twice.
with tf.Session() as sess:
print(y.eval())
print(z.eval())
# a more efficient evaluation call:
# (one graph traversal computes both y and z)
with tf.Session() as sess:
y_val, z_val = sess.run([y,z])
print(y_val)
print(z_val)
# (never used Numpy c_ operator before. Had to check it out.)
# np.c_ concatenates arrays column-wise.
a = np.ones((6,1))
b = np.zeros((6,4))
c = np.ones((6,2))
np.c_[a,b,c]
from sklearn.datasets import fetch_california_housing
import numpy as np
# Linear regression on California housing via the normal equation,
# solved three ways: TensorFlow, raw NumPy, and scikit-learn.
housing = fetch_california_housing()
m, n = housing.data.shape
# add bias feature, x0 = 1
housing_data_plus_bias = np.c_[np.ones((m, 1)), housing.data]
print(m,n,housing_data_plus_bias.shape)
tf.reset_default_graph()
X = tf.constant(
housing_data_plus_bias,
dtype=tf.float64, name="X")
print("X shape: ",X.shape)
# housing.target = 1D array. Reshape to col vector to compute theta.
# reshape() accepts -1 = "unspecified" for a dimension.
y = tf.constant(
housing.target.reshape(-1, 1),
dtype=tf.float64, name="y")
XT = tf.transpose(X)
print("XT shape: ",XT.shape)
# normal equation: theta = (XT * X)^-1 * XT * y
theta = tf.matmul(
tf.matmul(
tf.matrix_inverse(
tf.matmul(XT, X)),
XT),
y)
# TF doesn't immediately run the code. It creates nodes that will run with eval().
# TF will auto-run on GPU if available.
with tf.Session() as sess:
result = theta.eval()
print("theta: \n",result)
# compare to pure NumPy
X = housing_data_plus_bias
y = housing.target.reshape(-1, 1)
theta_numpy = np.linalg.inv(X.T.dot(X)).dot(X.T).dot(y)
print("theta: \n",theta_numpy)
# compare to Scikit
from sklearn.linear_model import LinearRegression
lin_reg = LinearRegression()
lin_reg.fit(
housing.data,
housing.target.reshape(-1, 1))
# stack intercept on top of coefficients to match the theta vectors above
print("theta: \n",np.r_[
lin_reg.intercept_.reshape(-1, 1),
lin_reg.coef_.T])
# normalize input features first - otherwise training = much slower.
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
scaled_housing_data = scaler.fit_transform(
housing.data)
scaled_housing_data_plus_bias = np.c_[
np.ones((m, 1)),
scaled_housing_data]
import pandas as pd
pd.DataFrame(scaled_housing_data_plus_bias).info()
print("mean (axis=0): \n",scaled_housing_data_plus_bias.mean(axis=0))
print("mean (axis=1): \n",scaled_housing_data_plus_bias.mean(axis=1))
print("mean (w/bias): \n",scaled_housing_data_plus_bias.mean())
print("data shape: \n",scaled_housing_data_plus_bias.shape)
# batch gradient step:
# theta(next) = theta - learning_rate * MSE(theta)
# Variant 1: gradients derived by hand (closed form for linear regression).
tf.reset_default_graph()
n_epochs = 1000
learning_rate = 0.01
X = tf.constant(
scaled_housing_data_plus_bias,
dtype=tf.float32, name="X")
y = tf.constant(
housing.target.reshape(-1, 1),
dtype=tf.float32, name="y")
theta = tf.Variable( # tf.random_uniform = generates random tensor
tf.random_uniform([n+1, 1], -1.0, 1.0, seed=42),
name="theta")
y_pred = tf.matmul(
X, theta, name="predictions")
error = y_pred - y
mse = tf.reduce_mean(tf.square(error), name="mse")
# analytic gradient of MSE w.r.t. theta: (2/m) * X^T * error
gradients = 2/m * tf.matmul(tf.transpose(X), error)
training_op = tf.assign(theta, theta - learning_rate * gradients)
init = tf.global_variables_initializer()
with tf.Session() as sess:
sess.run(init)
for epoch in range(n_epochs):
if epoch % 100 == 0: # do every 100th epoch:
print("Epoch", epoch, "MSE =", mse.eval())
sess.run(training_op)
best_theta = theta.eval()
print("Best theta: \n",best_theta)
# Variant 2: same model, gradients computed by TF autodiff.
tf.reset_default_graph()
n_epochs = 1000
learning_rate = 0.01
X = tf.constant(
scaled_housing_data_plus_bias,
dtype=tf.float32, name="X")
y = tf.constant(
housing.target.reshape(-1, 1),
dtype=tf.float32, name="y")
theta = tf.Variable(
tf.random_uniform([n + 1, 1], -1.0, 1.0,
seed=42), name="theta")
y_pred = tf.matmul(
X, theta, name="predictions")
error = y_pred - y
mse = tf.reduce_mean(tf.square(error), name="mse")
# AutoDiff to the rescue
# creates list of ops, one/variable, to find gradients per variable
gradients = tf.gradients(mse, [theta])[0]
#
training_op = tf.assign(theta, theta - learning_rate * gradients)
init = tf.global_variables_initializer()
with tf.Session() as sess:
sess.run(init)
for epoch in range(n_epochs):
if epoch % 100 == 0:
print("Epoch", epoch, "MSE =", mse.eval())
sess.run(training_op)
best_theta = theta.eval()
print("Best theta: \n", best_theta)
# Variant 3: let an optimizer build the update op.
tf.reset_default_graph()
n_epochs = 1000
learning_rate = 0.01
X = tf.constant(
scaled_housing_data_plus_bias,
dtype=tf.float32, name="X")
y = tf.constant(
housing.target.reshape(-1, 1),
dtype=tf.float32, name="y")
theta = tf.Variable(
tf.random_uniform([n + 1, 1], -1.0, 1.0, seed=42),
name="theta")
y_pred = tf.matmul(
X, theta, name="predictions")
error = y_pred - y
mse = tf.reduce_mean(tf.square(error), name="mse")
#####
optimizer = tf.train.GradientDescentOptimizer(learning_rate=learning_rate)
training_op = optimizer.minimize(mse)
#####
init = tf.global_variables_initializer()
with tf.Session() as sess:
sess.run(init)
for epoch in range(n_epochs):
if epoch % 100 == 0:
print("Epoch", epoch, "MSE =", mse.eval())
sess.run(training_op)
best_theta = theta.eval()
print("Best theta:\n", best_theta)
# Variant 4: same model with a momentum optimizer.
tf.reset_default_graph()
n_epochs = 1000
learning_rate = 0.01
X = tf.constant(
scaled_housing_data_plus_bias,
dtype=tf.float32, name="X")
y = tf.constant(
housing.target.reshape(-1, 1),
dtype=tf.float32, name="y")
theta = tf.Variable(
tf.random_uniform([n + 1, 1], -1.0, 1.0, seed=42),
name="theta")
y_pred = tf.matmul(X, theta, name="predictions")
error = y_pred - y
mse = tf.reduce_mean(tf.square(error), name="mse")
#####
optimizer = tf.train.MomentumOptimizer(
learning_rate=learning_rate,
momentum=0.25)
#####
training_op = optimizer.minimize(mse)
init = tf.global_variables_initializer()
with tf.Session() as sess:
sess.run(init)
for epoch in range(n_epochs):
sess.run(training_op)
best_theta = theta.eval()
print("Best theta:\n", best_theta)
# Placeholders: nodes whose value is supplied at run time via feed_dict.
# shape=(None, 3) means any number of rows, 3 columns.
A = tf.placeholder(tf.float32, shape=(None, 3))
B = A + 5
with tf.Session() as sess:
B_val_1 = B.eval(
feed_dict={A: [[1, 2, 3]]})
B_val_2 = B.eval(
feed_dict={A: [[4, 5, 6], [7, 8, 9]]})
print(B_val_1, "\n", B_val_2)
# definition phase: change X,y to placeholder nodes
tf.reset_default_graph()
n_epochs = 1000
learning_rate = 0.01
##########
X = tf.placeholder(tf.float32, shape=(None, n+1), name="X")
y = tf.placeholder(tf.float32, shape=(None, 1), name="y")
##########
theta = tf.Variable(
tf.random_uniform([n+1, 1], -1.0, 1.0, seed=42),
name="theta")
y_pred = tf.matmul(
X, theta, name="predictions")
error = y_pred - y
mse = tf.reduce_mean(tf.square(error), name="mse")
optimizer = tf.train.GradientDescentOptimizer(
learning_rate=learning_rate)
training_op = optimizer.minimize(mse)
init = tf.global_variables_initializer()
# execution phase: fetch minibatches one-by-one.
# use feed_dict to provide values to dependent nodes
import numpy.random as rnd
n_epochs = 10
batch_size = 100
n_batches = int(np.ceil(m / batch_size))
print("m: ",m,"\n","n_batches: ",n_batches,"\n")
# Return one random minibatch (X_batch, y_batch) of size batch_size,
# sampled WITH replacement. Seeding with epoch*n_batches+batch_index makes
# every batch reproducible across runs. Reads module-level globals:
# n_batches, m, scaled_housing_data_plus_bias, housing.
def fetch_batch(epoch, batch_index, batch_size):
rnd.seed(epoch * n_batches + batch_index)
indices = rnd.randint(m, size=batch_size) # sampled with replacement
X_batch = scaled_housing_data_plus_bias[indices]
y_batch = housing.target.reshape(-1, 1)[indices]
return X_batch, y_batch
# Mini-batch gradient descent: feed one batch per run of training_op.
with tf.Session() as sess:
sess.run(init)
for epoch in range(n_epochs):
for batch_index in range(n_batches):
X_batch, y_batch = fetch_batch(
epoch, batch_index, batch_size)
sess.run(
training_op,
feed_dict={X: X_batch, y: y_batch})
best_theta = theta.eval()
print("Best theta: \n",best_theta)
# Same training again, this time checkpointing with tf.train.Saver.
tf.reset_default_graph()
n_epochs = 1000
learning_rate = 0.01
X = tf.constant(
scaled_housing_data_plus_bias,
dtype=tf.float32, name="X")
y = tf.constant(
housing.target.reshape(-1, 1),
dtype=tf.float32, name="y")
theta = tf.Variable(
tf.random_uniform([n+1, 1], -1.0, 1.0, seed=42),
name="theta")
y_pred = tf.matmul(
X, theta, name="predictions")
error = y_pred - y
mse = tf.reduce_mean(
tf.square(error),
name="mse")
optimizer = tf.train.GradientDescentOptimizer(
learning_rate=learning_rate)
training_op = optimizer.minimize(mse)
init = tf.global_variables_initializer()
saver = tf.train.Saver()
# can specify which vars to save:
# saver = tf.train.Saver({"weights": theta})
with tf.Session() as sess:
sess.run(init)
for epoch in range(n_epochs):
if epoch % 100 == 0:
print("Epoch", epoch, "MSE =", mse.eval())
# checkpoint every 100 epochs, plus a final save below
save_path = saver.save(sess, "/tmp/my_model.ckpt")
sess.run(training_op)
best_theta = theta.eval()
save_path = saver.save(sess, "my_model_final.ckpt")
print("Best theta:\n",best_theta)
# model restoration:
# 1) create Saver at end of construction phase
# 2) call saver.restore() at start of execution
!cat checkpoint
from IPython.display import clear_output, Image, display, HTML
# Copy a GraphDef, replacing the payload of any constant larger than
# max_const_size bytes with a short placeholder, so the serialized graph
# stays small enough to embed in a notebook.
def strip_consts(graph_def, max_const_size=32):
"""Strip large constant values from graph_def."""
strip_def = tf.GraphDef()
for n0 in graph_def.node:
n = strip_def.node.add()
n.MergeFrom(n0)
if n.op == 'Const':
tensor = n.attr['value'].tensor
size = len(tensor.tensor_content)
if size > max_const_size:
tensor.tensor_content = b"<stripped %d bytes>"%size
return strip_def
def show_graph(graph_def, max_const_size=32):
    """Visualize a TensorFlow graph inline in a Jupyter notebook.

    Accepts either a tf.Graph or a GraphDef; strips large constants first
    (see strip_consts) and renders the graph with TensorBoard's
    tf-graph-basic web component inside an <iframe>.
    """
    if hasattr(graph_def, 'as_graph_def'):
        # a tf.Graph was passed; serialize it to a GraphDef
        graph_def = graph_def.as_graph_def()
    strip_def = strip_consts(graph_def, max_const_size=max_const_size)
    code = """
        <script>
        function load() {{
        document.getElementById("{id}").pbtxt = {data};
        }}
        </script>
        <link rel="import" href="https://tensorboard.appspot.com/tf-graph-basic.build.html" onload=load()>
        <div style="height:600px">
        <tf-graph-basic id="{id}"></tf-graph-basic>
        </div>
    """.format(data=repr(str(strip_def)), id='graph' + str(np.random.rand()))
    # original was width=1200px, height=620px
    # BUG FIX: the document goes into the iframe's srcdoc *attribute*, so
    # every double quote inside it must be escaped as &quot;. The previous
    # code did code.replace('"', '"') -- a no-op that produced broken HTML.
    iframe = """
        <iframe seamless style="width:1200px;height:620px;border:0" srcdoc="{}"></iframe>
    """.format(code.replace('"', '&quot;'))
    display(HTML(iframe))
show_graph(tf.get_default_graph())
tf.reset_default_graph()
# Need a logging directory for TB data
# with timestamps to avoid mixing runs together
from datetime import datetime
now = datetime.utcnow().strftime("%Y%m%d%H%M%S")
root_logdir = "tf_logs"
logdir = "{}/run-{}/".format(root_logdir, now)
n_epochs = 1000
learning_rate = 0.01
X = tf.placeholder(
tf.float32,
shape=(None, n + 1),
name="X")
y = tf.placeholder(
tf.float32,
shape=(None, 1),
name="y")
theta = tf.Variable(
tf.random_uniform([n + 1, 1], -1.0, 1.0, seed=42),
name="theta")
y_pred = tf.matmul(
X, theta, name="predictions")
error = y_pred - y
mse = tf.reduce_mean(
tf.square(error),
name="mse")
optimizer = tf.train.GradientDescentOptimizer(
learning_rate=learning_rate)
training_op = optimizer.minimize(mse)
init = tf.global_variables_initializer()
# scalar summary node: serializes the current MSE for TensorBoard
mse_summary = tf.summary.scalar('MSE', mse)
# FileWriter - creates logdir if not already present,
# then writes graph def to a binary logfile.
summary_writer = tf.summary.FileWriter(
logdir,
tf.get_default_graph())
n_epochs = 10
batch_size = 100
n_batches = int(np.ceil(m / batch_size))
with tf.Session() as sess:
sess.run(init)
for epoch in range(n_epochs):
for batch_index in range(n_batches):
X_batch, y_batch = fetch_batch(
epoch,
batch_index,
batch_size)
# evaluate mse_summary on periodic basis,
# eg every 10 minibatches.
# adds summary for addition to events file.
if batch_index % 10 == 0:
summary_str = mse_summary.eval(
feed_dict={X: X_batch, y: y_batch})
step = epoch * n_batches + batch_index
summary_writer.add_summary(
summary_str,
step)
sess.run(
training_op,
feed_dict={X: X_batch, y: y_batch})
best_theta = theta.eval()
summary_writer.flush()
summary_writer.close()
print("Best theta:")
print(best_theta)
#!ls tf_logs/run*
#!ls tf_logs/run*
tf.reset_default_graph()
now = datetime.utcnow().strftime("%Y%m%d%H%M%S")
root_logdir = "tf_logs"
logdir = "{}/run-{}/".format(root_logdir, now)
n_epochs = 1000
learning_rate = 0.01
X = tf.placeholder(
tf.float32,
shape=(None, n + 1),
name="X")
y = tf.placeholder(
tf.float32,
shape=(None, 1),
name="y")
theta = tf.Variable(
tf.random_uniform([n + 1, 1], -1.0, 1.0, seed=42),
name="theta")
y_pred = tf.matmul(
X, theta,
name="predictions")
##### Name Scope
with tf.name_scope('loss') as scope:
error = y_pred - y
mse = tf.reduce_mean(tf.square(error), name="mse")
#####
optimizer = tf.train.GradientDescentOptimizer(
learning_rate=learning_rate)
training_op = optimizer.minimize(mse)
init = tf.global_variables_initializer()
mse_summary = tf.summary.scalar(
'MSE', mse)
summary_writer = tf.summary.FileWriter(
logdir, tf.get_default_graph())
n_epochs = 10
batch_size = 100
n_batches = int(np.ceil(m / batch_size))
with tf.Session() as sess:
sess.run(init)
for epoch in range(n_epochs):
for batch_index in range(n_batches):
X_batch, y_batch = fetch_batch(
epoch,
batch_index,
batch_size)
if batch_index % 10 == 0:
summary_str = mse_summary.eval(
feed_dict={X: X_batch, y: y_batch})
step = epoch * n_batches + batch_index
summary_writer.add_summary(
summary_str,
step)
sess.run(
training_op,
feed_dict={X: X_batch, y: y_batch})
best_theta = theta.eval()
summary_writer.flush()
summary_writer.close()
print("Best theta:")
print(best_theta)

# UGLY
# Two ReLU units built entirely by hand (weights, biases, linear, max)
# and summed -- motivates the relu() helper function defined next.
tf.reset_default_graph()
n_features = 3
X = tf.placeholder(
tf.float32,
shape=(None, n_features),
name="X")
w1 = tf.Variable(
tf.random_normal(
(n_features, 1)),
name="weights1")
w2 = tf.Variable(
tf.random_normal(
(n_features, 1)),
name="weights2")
b1 = tf.Variable(
0.0, name="bias1")
b2 = tf.Variable(
0.0, name="bias2")
linear1 = tf.add(
tf.matmul(X, w1), b1, name="linear1")
linear2 = tf.add(
tf.matmul(X, w2), b2, name="linear2")
# relu(z) = max(z, 0)
relu1 = tf.maximum(
linear1, 0, name="relu1")
relu2 = tf.maximum(
linear2, 0, name="relu2")
output = tf.add_n([relu1, relu2], name="output")
# better -- you can create functions that build ReLUs!
tf.reset_default_graph()
# Build one ReLU unit on top of X: max(X.w + b, 0). Each call creates a
# fresh weight vector and bias; TF auto-deduplicates the node names.
def relu(X):
w_shape = int(
X.get_shape()[1]), 1
w = tf.Variable(
tf.random_normal(w_shape),
name="weights")
b = tf.Variable(
0.0,
name="bias")
linear = tf.add(
tf.matmul(X, w),
b,
name="linear")
return tf.maximum(linear, 0, name="relu")
n_features = 3
X = tf.placeholder(
tf.float32,
shape=(None, n_features),
name="X")
# sum of 5 independently-initialized ReLU units
relus = [relu(X) for i in range(5)]
output = tf.add_n(
relus,
name="output")
summary_writer = tf.summary.FileWriter(
"logs/relu1",
tf.get_default_graph())
# better, with name scopes
# Same relu() as before, but each unit's ops are grouped under a "relu"
# name scope so the TensorBoard graph shows 5 collapsed relu nodes.
tf.reset_default_graph()
def relu(X):
with tf.name_scope("relu"):
w_shape = int(
X.get_shape()[1]), 1
w = tf.Variable(
tf.random_normal(w_shape), name="weights")
b = tf.Variable(
0.0, name="bias")
linear = tf.add(
tf.matmul(X, w), b, name="linear")
return tf.maximum(
linear, 0, name="max")
n_features = 3
X = tf.placeholder(
tf.float32,
shape=(None, n_features),
name="X")
relus = [relu(X) for i in range(5)]
output = tf.add_n(
relus, name="output")
summary_writer = tf.summary.FileWriter(
"logs/relu2",
tf.get_default_graph())
summary_writer.close()
!ls logs
tf.reset_default_graph()
def relu(X, threshold):
with tf.name_scope("relu"):
w_shape = int(X.get_shape()[1]), 1
w = tf.Variable(tf.random_normal(w_shape), name="weights")
b = tf.Variable(0.0, name="bias")
linear = tf.add(tf.matmul(X, w), b, name="linear")
return tf.maximum(linear, threshold, name="max")
threshold = tf.Variable(0.0, name="threshold")
X = tf.placeholder(tf.float32, shape=(None, n_features), name="X")
relus = [relu(X, threshold) for i in range(5)]
output = tf.add_n(relus, name="output")
tf.reset_default_graph()
# ReLU whose threshold variable is shared across all calls by stashing it
# as an attribute on the function object itself (created on the first call,
# reused afterwards).
def relu(X):
with tf.name_scope("relu"):
# first call creates the shared variable; later calls find it via hasattr
if not hasattr(relu, "threshold"):
relu.threshold = tf.Variable(0.0, name="threshold")
w_shape = int(X.get_shape()[1]), 1
w = tf.Variable(tf.random_normal(w_shape), name="weights")
b = tf.Variable(0.0, name="bias")
linear = tf.add(tf.matmul(X, w), b, name="linear")
return tf.maximum(linear, relu.threshold, name="max")
X = tf.placeholder(tf.float32, shape=(None, n_features), name="X")
relus = [relu(X) for i in range(5)]
output = tf.add_n(relus, name="output")
tf.reset_default_graph()
# ReLU that fetches a shared "relu/threshold" variable through a variable
# scope with reuse=True. The variable must already exist (it is created
# below, before the first call) or get_variable() raises.
def relu(X):
with tf.variable_scope("relu", reuse=True):
threshold = tf.get_variable("threshold", shape=(), initializer=tf.constant_initializer(0.0))
w_shape = int(X.get_shape()[1]), 1
w = tf.Variable(tf.random_normal(w_shape), name="weights")
b = tf.Variable(0.0, name="bias")
linear = tf.add(tf.matmul(X, w), b, name="linear")
return tf.maximum(linear, threshold, name="max")
X = tf.placeholder(tf.float32, shape=(None, n_features), name="X")
with tf.variable_scope("relu"):
threshold = tf.get_variable("threshold", shape=(), initializer=tf.constant_initializer(0.0))
relus = [relu(X) for i in range(5)]
output = tf.add_n(relus, name="output")
summary_writer = tf.summary.FileWriter("logs/relu6", tf.get_default_graph())
summary_writer.close()
# Perceptron with Iris dataset (Scikit)
import numpy as np
from sklearn.datasets import load_iris
iris = load_iris()
X = iris.data[:, (2, 3)]  # petal length, petal width
# Binary target: 1 for Iris-Setosa (class 0), 0 otherwise.
# np.int was deprecated in NumPy 1.20 and removed in 1.24; the builtin
# int is the drop-in replacement.
y = (iris.target == 0).astype(int)
from sklearn.linear_model import Perceptron
per_clf = Perceptron(random_state=42)
per_clf.fit(X, y)
# predict for a single flower with petal length 2, petal width 0.5
y_pred = per_clf.predict([[2, 0.5]])
print(y_pred)
import matplotlib.pyplot as plt
a = -per_clf.coef_[0][0] / per_clf.coef_[0][1]
b = -per_clf.intercept_ / per_clf.coef_[0][1]
axes = [0, 5, 0, 2]
x0, x1 = np.meshgrid(
np.linspace(axes[0], axes[1], 500).reshape(-1, 1),
np.linspace(axes[2], axes[3], 200).reshape(-1, 1),
)
X_new = np.c_[
x0.ravel(),
x1.ravel()]
y_predict = per_clf.predict(X_new)
zz = y_predict.reshape(x0.shape)
plt.figure(figsize=(10, 4))
plt.plot(X[y==0, 0], X[y==0, 1], "bs", label="Not Iris-Setosa")
plt.plot(X[y==1, 0], X[y==1, 1], "yo", label="Iris-Setosa")
plt.plot(
[axes[0],
axes[1]],
[a * axes[0] + b,
a * axes[1] + b],
"k-", linewidth=3)
from matplotlib.colors import ListedColormap
custom_cmap = ListedColormap(['#9898ff', '#fafab0'])
plt.contourf(x0, x1, zz, cmap=custom_cmap, linewidth=5)
plt.xlabel("Petal length", fontsize=14)
plt.ylabel("Petal width", fontsize=14)
plt.legend(loc="lower right", fontsize=14)
plt.axis(axes)
#save_fig("perceptron_iris_plot")
plt.show()
# Activation functions
def logit(z):
    """Logistic sigmoid: 1 / (1 + e^-z), elementwise."""
    return 1 / (1 + np.exp(-z))


def relu(z):
    """Rectified linear unit: elementwise max(0, z)."""
    return np.maximum(0, z)


def derivative(f, z, eps=0.000001):
    """Numerical derivative of f at z via the symmetric difference quotient."""
    return (f(z + eps) - f(z - eps)) / (2 * eps)
z = np.linspace(-5, 5, 200)
plt.figure(figsize=(11,4))
plt.subplot(121)
plt.plot(z, np.sign(z), "r-", linewidth=2, label="Step")
plt.plot(z, logit(z), "g--", linewidth=2, label="Logit")
plt.plot(z, np.tanh(z), "b-", linewidth=2, label="Tanh")
plt.plot(z, relu(z), "m-.", linewidth=2, label="ReLU")
plt.grid(True)
plt.legend(loc="center right", fontsize=14)
plt.title("Activation functions", fontsize=14)
plt.axis([-5, 5, -1.2, 1.2])
plt.subplot(122)
plt.plot(z, derivative(np.sign, z), "r-", linewidth=2, label="Step")
plt.plot(0, 0, "ro", markersize=5)
plt.plot(0, 0, "rx", markersize=10)
plt.plot(z, derivative(logit, z), "g--", linewidth=2, label="Logit")
plt.plot(z, derivative(np.tanh, z), "b-", linewidth=2, label="Tanh")
plt.plot(z, derivative(relu, z), "m-.", linewidth=2, label="ReLU")
plt.grid(True)
#plt.legend(loc="center right", fontsize=14)
plt.title("Derivatives", fontsize=14)
plt.axis([-5, 5, -0.2, 1.2])
#save_fig("activation_functions_plot")
plt.show()
# activation functions, continued
def heaviside(z):
    """Step function: 1 where z >= 0, else 0 (cast back to z's dtype)."""
    return (z >= 0).astype(z.dtype)


def sigmoid(z):
    """Logistic sigmoid activation."""
    return 1 / (1 + np.exp(-z))


def mlp_xor(x1, x2, activation=heaviside):
    """Two-layer perceptron computing XOR of x1 and x2 (inputs near {0, 1})."""
    hidden_and = activation(x1 + x2 - 1.5)  # fires only when both inputs are on
    hidden_or = activation(x1 + x2 - 0.5)   # fires when at least one input is on
    return activation(hidden_or - hidden_and - 0.5)
x1s = np.linspace(-0.2, 1.2, 100)
x2s = np.linspace(-0.2, 1.2, 100)
x1, x2 = np.meshgrid(x1s, x2s)
z1 = mlp_xor(x1, x2, activation=heaviside)
z2 = mlp_xor(x1, x2, activation=sigmoid)
plt.figure(figsize=(10,4))
plt.subplot(121)
plt.contourf(x1, x2, z1)
plt.plot([0, 1], [0, 1], "gs", markersize=20)
plt.plot([0, 1], [1, 0], "y^", markersize=20)
plt.title("Activation function: heaviside", fontsize=14)
plt.grid(True)
plt.subplot(122)
plt.contourf(x1, x2, z2)
plt.plot([0, 1], [0, 1], "gs", markersize=20)
plt.plot([0, 1], [1, 0], "y^", markersize=20)
plt.title("Activation function: sigmoid", fontsize=14)
plt.grid(True)
plt.show()
import tensorflow as tf
tf.reset_default_graph()
n_inputs = 28*28 # MNIST
n_hidden1 = 300
n_hidden2 = 100
n_outputs = 10
learning_rate = 0.01
# placeholders for training data & targets
# X,y only partially defined due to unknown #instances in training batches
X = tf.placeholder(tf.float32, shape=(None, n_inputs), name="X")
y = tf.placeholder(tf.int64, shape=(None), name="y")
# now need to create two hidden layers + one output layer
'''
No need to define your own. TF shortcuts:
fully_connected()
'''
# Fully connected layer built from scratch: returns activation(X @ W + b).
# X: [batch, n_inputs] float tensor; n_neurons: layer width; name: name-scope
# label; activation: the string "relu" for a ReLU output, anything else for
# a linear (identity) output.
def neuron_layer(X, n_neurons, name, activation=None):
# define a name scope to aid readability
with tf.name_scope(name):
n_inputs = int(X.get_shape()[1])
# create weights matrix. 2D (#inputs, #neurons)
# randomly initialized w/ truncated Gaussian, stdev = 1/sqrt(#inputs)
# NOTE(review): the book text suggests 2/sqrt(#inputs); this code uses
# 1/sqrt(#inputs) -- the comment previously disagreed with the code.
# aids convergence speed
stddev = 1 / np.sqrt(n_inputs)
init = tf.truncated_normal((n_inputs, n_neurons), stddev=stddev)
W = tf.Variable(init, name="weights")
# create bias variable, initialized to zero, one param per neuron
b = tf.Variable(tf.zeros([n_neurons]), name="biases")
# Z = X dot W + b
Z = tf.matmul(X, W) + b
# return relu(z), or simply z
if activation=="relu":
return tf.nn.relu(Z)
else:
return Z
with tf.name_scope("dnn"):
hidden1 = neuron_layer(X, n_hidden1, "hidden1", activation="relu")
hidden2 = neuron_layer(hidden1, n_hidden2, "hidden2", activation="relu")
# logits = NN output before going thru softmax activation
logits = neuron_layer(hidden2, n_outputs, "output")
with tf.name_scope("loss"):
# sparse_softmax_cross_entropy_with_logits() -- TF routine, handles corner cases for you.
xentropy = tf.nn.sparse_softmax_cross_entropy_with_logits(
labels=y,
logits=logits)
# use reduce_mean() to find mean cross-entropy over all instances.
loss = tf.reduce_mean(
xentropy,
name="loss")
# use GD to handle cost function, ie minimize loss
with tf.name_scope("train"):
optimizer = tf.train.GradientDescentOptimizer(learning_rate)
training_op = optimizer.minimize(loss)
# use accuracy as performance measure.
with tf.name_scope("eval"): # verify whether highest logit corresponds to target class
correct = tf.nn.in_top_k( # using in_top_k(), returns 1D tensor of booleans
logits, y, 1)
accuracy = tf.reduce_mean( # recast booleans to float & find avg.
tf.cast(correct, tf.float32)) # this gives overall accuracy number.
init = tf.global_variables_initializer() # initializer node
saver = tf.train.Saver() # to save trained params to disk
# load MNIST
from tensorflow.examples.tutorials.mnist import input_data
mnist = input_data.read_data_sets("/tmp/data/")
n_epochs = 20
batch_size = 50
# Train
with tf.Session() as sess:
init.run() # initialize all variables
for epoch in range(n_epochs):
for iteration in range(mnist.train.num_examples // batch_size):
# use next_batch() to fetch data
X_batch, y_batch = mnist.train.next_batch(batch_size)
sess.run(
training_op,
feed_dict={X: X_batch, y: y_batch})
acc_train = accuracy.eval(
feed_dict={X: X_batch, y: y_batch})
acc_test = accuracy.eval(
feed_dict={X: mnist.test.images,
y: mnist.test.labels})
print(epoch, "Train accuracy:", acc_train, "Test accuracy:", acc_test)
save_path = saver.save(sess, "./my_model_final.ckpt")
with tf.Session() as sess:
# load from disk
saver.restore(sess, save_path) #"my_model_final.ckpt")
# grab images you want to classify
X_new_scaled = mnist.test.images[:20]
Z = logits.eval(feed_dict={X: X_new_scaled})
print(np.argmax(Z, axis=1))
print(mnist.test.labels[:20])


import tensorflow as tf
from tensorflow.contrib.layers import fully_connected
n_inputs = 28*28
n_hidden1 = 300
X = tf.placeholder(tf.float32, shape=(None, n_inputs), name="X")
he_init = tf.contrib.layers.variance_scaling_initializer()
hidden1 = fully_connected(X, n_hidden1, weights_initializer=he_init, scope="h1")


# TF doesn't have leaky ReLU predefined, but easy to build.
def leaky_relu(z, name=None, alpha=0.01):
    """Leaky ReLU: elementwise max(alpha * z, z).

    alpha is the slope applied to negative inputs. It defaults to 0.01,
    the value previously hard-coded, so existing two-argument calls keep
    their behavior while other leak rates become possible.
    """
    return tf.maximum(alpha * z, z, name=name)
hidden1 = fully_connected(X, n_hidden1, activation_fn=leaky_relu)
Algorithm:

Does add computational complexity. Consider plain ELU + He initialization as well.
# use MNIST dataset again
from tensorflow.examples.tutorials.mnist import input_data
mnist = input_data.read_data_sets("/tmp/data/")
#setup
import tensorflow as tf
from tensorflow.contrib.layers import batch_norm, fully_connected
tf.reset_default_graph()
n_inputs = 28 * 28
n_hidden1 = 300
n_hidden2 = 100
n_outputs = 10
learning_rate = 0.01
X = tf.placeholder(tf.float32, shape=(None, n_inputs), name="X")
y = tf.placeholder(tf.int64, shape=(None), name="y")
# Leaky ReLU activation: slope 0.01 for negative inputs, identity otherwise.
def leaky_relu(z, name=None):
return tf.maximum(0.01 * z, z, name=name)
# is_training: tells batch_norm() whether to use current minibatch's mean & stdev
# (found during training) or use running avgs (during testing)
with tf.name_scope("dnn"):
hidden1 = fully_connected(X, n_hidden1, activation_fn=leaky_relu, scope="hidden1")
hidden2 = fully_connected(hidden1, n_hidden2, activation_fn=leaky_relu, scope="hidden2")
logits = fully_connected(hidden2, n_outputs, activation_fn=None, scope="outputs")
with tf.name_scope("loss"):
xentropy = tf.nn.sparse_softmax_cross_entropy_with_logits(labels=y, logits=logits)
loss = tf.reduce_mean(xentropy, name="loss")
with tf.name_scope("train"):
optimizer = tf.train.GradientDescentOptimizer(learning_rate)
training_op = optimizer.minimize(loss)
with tf.name_scope("eval"):
correct = tf.nn.in_top_k(logits, y, 1)
accuracy = tf.reduce_mean(tf.cast(correct, tf.float32))
init = tf.global_variables_initializer()
saver = tf.train.Saver()
# Training loop: run mini-batch SGD for n_epochs, printing batch-train and
# full-test accuracy each epoch, then checkpoint the model.
n_epochs = 20
batch_size = 100
with tf.Session() as sess:
init.run()
for epoch in range(n_epochs):
# NOTE(review): the iteration count here is derived from the TEST set
# size, while the equivalent loop earlier uses mnist.train.num_examples.
# This trains on far fewer batches per epoch than intended -- confirm.
for iteration in range(len(mnist.test.labels)//batch_size):
X_batch, y_batch = mnist.train.next_batch(batch_size)
sess.run(training_op, feed_dict={X: X_batch, y: y_batch})
acc_train = accuracy.eval(feed_dict={X: X_batch, y: y_batch})
acc_test = accuracy.eval(feed_dict={X: mnist.test.images, y: mnist.test.labels})
print(epoch, "Train accuracy:", acc_train, "Test accuracy:", acc_test)
save_path = saver.save(sess, "my_model_final.ckpt")
threshold = 1.0
optimizer = tf.train.GradientDescentOptimizer(
learning_rate)
grads_and_vars = optimizer.compute_gradients(
loss)
capped_gvs = [
(tf.clip_by_value(
grad, -threshold, threshold), var)
for grad, var in grads_and_vars]
training_op = optimizer.apply_gradients(capped_gvs)
# Reuse with TF
'''
original_w = [] # Load the weights from the other framework
original_b = [] # Load the biases from the other framework
X = tf.placeholder(tf.float32, shape=(None, n_inputs), name="X")
hidden1 = fully_connected(X, n_hidden1, scope="hidden1")
[...] # # Build the rest of the model
# Get a handle on the variables created by fully_connected()
with tf.variable_scope("", default_name="", reuse=True): # root scope
hidden1_weights = tf.get_variable("hidden1/weights")
hidden1_biases = tf.get_variable("hidden1/biases")
# Create nodes to assign arbitrary values to the weights and biases
original_weights = tf.placeholder(tf.float32, shape=(n_inputs, n_hidden1))
original_biases = tf.placeholder(tf.float32, shape=(n_hidden1))
assign_hidden1_weights = tf.assign(hidden1_weights, original_weights)
assign_hidden1_biases = tf.assign(hidden1_biases, original_biases)
init = tf.global_variables_initializer()
with tf.Session() as sess:
sess.run(init)
sess.run(
assign_hidden1_weights,
feed_dict={original_weights: original_w})
sess.run(
assign_hidden1_biases,
feed_dict={original_biases: original_b})
[...] # Train the model on your new task
'''
# provide all trainable var in hidden layers 3,4 & outputs to optimizer function
# (this omits vars in hidden layers 1,2)
'''
train_vars = tf.get_collection(
tf.GraphKeys.TRAINABLE_VARIABLES,
scope="hidden[34]|outputs")
# minimizer can't touch layers 1,2 - they're "frozen"
training_op = optimizer.minimize(
loss,
var_list=train_vars)
'''
'''import numpy as np
n_epochs = 100
n_batches = 500
for epoch in range(n_epochs):
shuffled_idx = rnd.permutation(
len(hidden2_outputs))
hidden2_batches = np.array_split(
hidden2_outputs[shuffled_idx],
n_batches)
y_batches = np.array_split(
y_train[shuffled_idx],
n_batches)
for hidden2_batch, y_batch in zip(hidden2_batches, y_batches):
sess.run(
training_op,
feed_dict={hidden2: hidden2_batch, y: y_batch})
'''

Better optimizer choices:
Worth noting: the techniques below rely on first-order partial derivatives (Jacobians). Other techniques in the literature use second-order derivatives (Hessians), but these are not viable for most deep learning because of their memory and computational requirements.
# in TF
optimizer = tf.train.MomentumOptimizer(
learning_rate=learning_rate,
momentum=0.9)

# in TF
optimizer = tf.train.MomentumOptimizer(
learning_rate=learning_rate,
momentum=0.9,
use_nesterov=True)

# in TF
optimizer = tf.train.RMSPropOptimizer(
learning_rate=learning_rate,
momentum=0.9,
decay=0.9,
epsilon=1e-10)
Keeps track of decaying past squared gradients (like RMSProp)
Default params in TF:
# in TF
optimizer = tf.train.AdamOptimizer(
learning_rate=learning_rate)
# in TF
initial_learning_rate = 0.1
decay_steps = 10000
decay_rate = 1/10
global_step = tf.Variable(
0, trainable=False)
learning_rate = tf.train.exponential_decay(
initial_learning_rate,
global_step,
decay_steps,
decay_rate)
optimizer = tf.train.MomentumOptimizer(
learning_rate,
momentum=0.9)
training_op = optimizer.minimize(
loss,
global_step=global_step)
Typical p = 50%
In TF: apply dropout() to input layer & output of every hidden layer.
# in TF
from tensorflow.contrib.layers import dropout
[...]
is_training = tf.placeholder(
tf.bool,
shape=(),
name='is_training')
keep_prob = 0.5
X_drop = dropout(
X,
keep_prob,
is_training=is_training)
hidden1 = fully_connected(
X_drop, n_hidden1, scope="hidden1")
hidden1_drop = dropout(
hidden1, keep_prob, is_training=is_training)
hidden2 = fully_connected(
hidden1_drop, n_hidden2, scope="hidden2")
hidden2_drop = dropout(
hidden2, keep_prob, is_training=is_training)
logits = fully_connected(
hidden2_drop, n_outputs,
activation_fn=None,
scope="outputs")
%%html
<style>
table,td,tr,th {border:none!important}
</style>
# utilities
import matplotlib.pyplot as plt
# Render a 2-D array as a grayscale image with the axes hidden.
def plot_image(image):
plt.imshow(image, cmap="gray", interpolation="nearest")
plt.axis("off")
# Render an RGB array (cast to uint8 for imshow) with the axes hidden.
def plot_color_image(image):
plt.imshow(image.astype(np.uint8),interpolation="nearest")
plt.axis("off")
| Layers | Padding | Strides |
|---|---|---|
![]() |
![]() |
![]() |

import numpy as np
fmap = np.zeros(shape=(7, 7, 1, 2), dtype=np.float32)
fmap[:, 3, 0, 0] = 1
fmap[3, :, 0, 1] = 1
print(fmap[:, :, 0, 0])
print(fmap[:, :, 0, 1])
plt.figure(figsize=(6,6))
plt.subplot(121)
plot_image(fmap[:, :, 0, 0])
plt.subplot(122)
plot_image(fmap[:, :, 0, 1])
plt.show()
from sklearn.datasets import load_sample_image
china = load_sample_image("china.jpg")
flower = load_sample_image("flower.jpg")
image = china[150:220, 130:250]
height, width, channels = image.shape
image_grayscale = image.mean(axis=2).astype(np.float32)
images = image_grayscale.reshape(1, height, width, 1)
import tensorflow as tf
tf.reset_default_graph()
# Define the model
X = tf.placeholder(
tf.float32,
shape=(None, height, width, 1))
feature_maps = tf.constant(fmap)
convolution = tf.nn.conv2d(
X,
feature_maps,
strides=[1,1,1,1],
padding="SAME",
use_cudnn_on_gpu=False)
# Run the model
with tf.Session() as sess:
output = convolution.eval(feed_dict={X: images})
plt.figure(figsize=(6,6))
#plt.subplot(121)
plot_image(images[0, :, :, 0])
#plt.subplot(122)
plot_image(output[0, :, :, 0])
#plt.subplot(123)
plot_image(output[0, :, :, 1])
plt.show()
%%html
<style>
img[alt=stacking] { width: 400px; }
</style>

import numpy as np
from sklearn.datasets import load_sample_images
# Load sample images
dataset = np.array(load_sample_images().images, dtype=np.float32)
batch_size, height, width, channels = dataset.shape
# Create 2 filters
filters = np.zeros(shape=(7, 7, channels, 2), dtype=np.float32)
filters[:, 3, :, 0] = 1 # vertical line
filters[3, :, :, 1] = 1 # horizontal line
# Create a graph with input X plus a convolutional layer applying the 2 filters
X = tf.placeholder(tf.float32,
shape=(None, height, width, channels))
convolution = tf.nn.conv2d(
X, filters, strides=[1,2,2,1], padding="SAME")
with tf.Session() as sess:
output = sess.run(convolution, feed_dict={X: dataset})
plt.imshow(output[0, :, :, 1])
plt.show()
%%html
<style>
img[alt=padding] { width: 400px; }
</style>

import tensorflow as tf
import numpy as np
tf.reset_default_graph()
filter_primes = np.array(
[2., 3., 5., 7., 11., 13.],
dtype=np.float32)
x = tf.constant(
np.arange(1, 13+1, dtype=np.float32).reshape([1, 1, 13, 1]))
print ("x:\n",x)
filters = tf.constant(
filter_primes.reshape(1, 6, 1, 1))
# conv2d arguments:
# x = input minibatch = 4D tensor
# filters = 4D tensor
# strides = 1D array (1, vstride, hstride, 1)
# padding = VALID = no zero padding, may ignore edge rows/cols
# padding = SAME = zero padding used if needed
valid_conv = tf.nn.conv2d(x, filters, strides=[1, 1, 5, 1], padding='VALID')
same_conv = tf.nn.conv2d(x, filters, strides=[1, 1, 5, 1], padding='SAME')
with tf.Session() as sess:
print("VALID:\n", valid_conv.eval())
print("SAME:\n", same_conv.eval())

dataset = np.array([china, flower], dtype=np.float32)
batch_size, height, width, channels = dataset.shape
filters = np.zeros(shape=(7, 7, channels, 2), dtype=np.float32)
filters[:, 3, :, 0] = 1 # vertical line
filters[3, :, :, 1] = 1 # horizontal line
X = tf.placeholder(tf.float32,
shape=(None, height, width, channels))
# alternative: avg_pool()
max_pool = tf.nn.max_pool(
X,
ksize=[1, 2, 2, 1],
strides=[1,2,2,1],
padding="VALID")
with tf.Session() as sess:
output = sess.run(max_pool, feed_dict={X: dataset})
plt.figure(figsize=(12,12))
plt.subplot(121)
plot_color_image(dataset[0])
plt.subplot(122)
plot_color_image(output[0])
plt.show()





%%html
<style>
img[alt=recurrent_unrolled] { width: 400px; }
</style>
<style>
img[alt=sequence_vector] { width: 400px; }
</style>
<style>
img[alt=gru-cell] { width: 400px; }
</style>
<style>
img[alt=encoder-decoder] { width: 400px; }
</style>



import tensorflow as tf
n_inputs = 3
n_neurons = 5
# two-layer net
X0 = tf.placeholder(tf.float32, [None, n_inputs])
X1 = tf.placeholder(tf.float32, [None, n_inputs])
Wx = tf.Variable(tf.random_normal(shape=[n_inputs, n_neurons],dtype=tf.float32))
Wy = tf.Variable(tf.random_normal(shape=[n_neurons,n_neurons],dtype=tf.float32))
b = tf.Variable(tf.zeros([1, n_neurons], dtype=tf.float32))
Y0 = tf.tanh(tf.matmul(X0, Wx) + b)
Y1 = tf.tanh(tf.matmul(Y0, Wy) + tf.matmul(X1, Wx) + b)
init = tf.global_variables_initializer()
# to feed inputs at both time steps,
import numpy as np
# Mini-batch: instance 0,instance 1,instance 2,instance 3
X0_batch = np.array([[0, 1, 2], [3, 4, 5], [6, 7, 8], [9, 0, 1]]) # t = 0
X1_batch = np.array([[9, 8, 7], [0, 0, 0], [6, 5, 4], [3, 2, 1]]) # t = 1
# Y0, Y1 = network outputs at both time steps
with tf.Session() as sess:
init.run()
Y0_val, Y1_val = sess.run([Y0, Y1], feed_dict={X0: X0_batch, X1: X1_batch})
print("output at t=0:\n",Y0_val,"\n","output at t=1\n",Y1_val)
tf.reset_default_graph()
n_inputs = 3
n_neurons = 5
X0 = tf.placeholder(tf.float32, [None, n_inputs])
X1 = tf.placeholder(tf.float32, [None, n_inputs])
# BasicRNNCell() -- memcell "factory"
basic_cell = tf.contrib.rnn.BasicRNNCell(
num_units=n_neurons)
# static_rnn() -- creates unrolled RNN net by chaining cells.
# returns 1) python list of output tensors for each time step
# 2) tensor of final network states
output_seqs, states = tf.contrib.rnn.static_rnn(
basic_cell,
[X0, X1],
dtype=tf.float32)
Y0, Y1 = output_seqs
init = tf.global_variables_initializer()
# to feed inputs at both time steps,
import numpy as np
# Mini-batch: instance 0,instance 1,instance 2,instance 3
X0_batch = np.array([[0, 1, 2], [3, 4, 5], [6, 7, 8], [9, 0, 1]]) # t = 0
X1_batch = np.array([[9, 8, 7], [0, 0, 0], [6, 5, 4], [3, 2, 1]]) # t = 1
# Y0, Y1 = network outputs at both time steps
with tf.Session() as sess:
init.run()
Y0_val, Y1_val = sess.run([Y0, Y1], feed_dict={X0: X0_batch, X1: X1_batch})
print("output at t=0:\n",Y0_val,"\n","output at t=1\n",Y1_val)
tf.reset_default_graph()
n_steps = 2
n_inputs = 3
n_neurons = 5
# this time, use placeholder with add'l dimension for #timesteps
#X0 = tf.placeholder(tf.float32, [None, n_inputs])
#X1 = tf.placeholder(tf.float32, [None, n_inputs])
X = tf.placeholder(tf.float32, [None, n_steps, n_inputs])
#print(X)
# transpose - make time steps = 1st dimension
# unstack - extract list of tensors
X_seqs = tf.unstack(
tf.transpose(
X, perm=[1, 0, 2]))
#print(X_seqs)
# BasicRNNCell() -- memcell "factory"
basic_cell = tf.contrib.rnn.BasicRNNCell(
num_units=n_neurons)
# static_rnn() -- creates unrolled RNN net by chaining cells.
# returns 1) python list of output tensors for each time step
# 2) tensor of final network states
output_seqs, states = tf.contrib.rnn.static_rnn(
basic_cell,
X_seqs,
dtype=tf.float32)
#Y0, Y1 = output_seqs
# stack - merge output tensors
# transpose - swap 1st two dimensions
# returns tensor shape [none, #steps, #neurons]
outputs = tf.transpose(
tf.stack(output_seqs),
perm=[1,0,2])
init = tf.global_variables_initializer()
X_batch = np.array([
# t = 0 t = 1
[[0, 1, 2], [9, 8, 7]], # instance 1
[[3, 4, 5], [0, 0, 0]], # instance 2
[[6, 7, 8], [6, 5, 4]], # instance 3
[[9, 0, 1], [3, 2, 1]], # instance 4
])
with tf.Session() as sess:
init.run()
outputs_val = outputs.eval(feed_dict={X: X_batch})
print(outputs_val)
tf.reset_default_graph()
X = tf.placeholder(tf.float32, [None, n_steps, n_inputs])
basic_cell = tf.contrib.rnn.BasicRNNCell(
num_units=n_neurons)
outputs, states = tf.nn.dynamic_rnn(
basic_cell, X, dtype=tf.float32)
init = tf.global_variables_initializer()
with tf.Session() as sess:
init.run()
outputs_val = outputs.eval(feed_dict={X: X_batch})
print(outputs_val)
tf.reset_default_graph()
X = tf.placeholder(tf.float32, [None, n_steps, n_inputs])
seq_length = tf.placeholder(tf.int32, [None])
basic_cell = tf.contrib.rnn.BasicRNNCell(
num_units=n_neurons)
outputs, states = tf.nn.dynamic_rnn(
basic_cell, X, dtype=tf.float32,
#
#
sequence_length=seq_length)
#
#
X_batch = np.array([
[[0, 1, 2], [9, 8, 7]], # instance 1
[[3, 4, 5], [0, 0, 0]], # instance 2 -- zero padded
[[6, 7, 8], [6, 5, 4]], # instance 3
[[9, 0, 1], [3, 2, 1]], # instance 4
])
seq_length_batch = np.array([2,1,2,2])
init = tf.global_variables_initializer()
with tf.Session() as sess:
init.run()
outputs_val, states_val = sess.run(
[outputs, states],
feed_dict={X: X_batch, seq_length: seq_length_batch})
# RNN should output zero vectors for any time step
# beyond input sequence length
print(outputs_val)
# states tensor contains final state of each cell
print(states_val)


# similar to MNIST classifier
# unrolled RNN replaces hidden layers
tf.reset_default_graph()
from tensorflow.contrib.layers import fully_connected
n_steps = 28
n_inputs = 28
n_neurons = 150
n_outputs = 10
learning_rate = 0.001
# y = placeholder for target classes
X = tf.placeholder(tf.float32, [None, n_steps, n_inputs])
y = tf.placeholder(tf.int32, [None])
basic_cell = tf.contrib.rnn.BasicRNNCell(
num_units=n_neurons)
outputs, states = tf.nn.dynamic_rnn(
basic_cell, X, dtype=tf.float32)
logits = fully_connected(
states, n_outputs, activation_fn=None)
xentropy = tf.nn.sparse_softmax_cross_entropy_with_logits(
labels=y, logits=logits)
loss = tf.reduce_mean(
xentropy)
optimizer = tf.train.AdamOptimizer(
learning_rate=learning_rate)
training_op = optimizer.minimize(
loss)
correct = tf.nn.in_top_k(
logits, y, 1)
accuracy = tf.reduce_mean(
tf.cast(correct, tf.float32))
init = tf.global_variables_initializer()
# load MNIST data, reshape to [batch_size, n_steps, n_inputs]
from tensorflow.examples.tutorials.mnist import input_data
mnist = input_data.read_data_sets("/tmp/data/")
X_test = mnist.test.images.reshape((-1, n_steps, n_inputs))
y_test = mnist.test.labels
# ready to run. reshape each training batch before feeding to net.
n_epochs = 10
batch_size = 150
with tf.Session() as sess:
init.run()
for epoch in range(n_epochs):
for iteration in range(mnist.train.num_examples // batch_size):
X_batch, y_batch = mnist.train.next_batch(batch_size)
X_batch = X_batch.reshape(
(-1, n_steps, n_inputs))
sess.run(
training_op,
feed_dict={X: X_batch, y: y_batch})
acc_train = accuracy.eval(
feed_dict={X: X_batch, y: y_batch})
acc_test = accuracy.eval(
feed_dict={X: X_test, y: y_test})
print(epoch,
"Train accuracy:", acc_train,
"Test accuracy:", acc_test)

t_min, t_max = 0, 30
resolution = 0.1
def time_series(t):
    """Synthetic signal: a slowly growing sine plus a fast ripple."""
    return t * np.sin(t) / 3 + 2 * np.sin(t * 5)


def next_batch(batch_size, n_steps):
    """Sample random training windows from the series.

    Returns (inputs, targets), each shaped [batch_size, n_steps, 1];
    targets are the inputs shifted one step into the future. Relies on
    module-level t_min, t_max and resolution.
    """
    starts = np.random.rand(batch_size, 1) * (t_max - t_min - n_steps * resolution)
    ts = starts + np.arange(0., n_steps + 1) * resolution
    values = time_series(ts)
    inputs = values[:, :-1].reshape(-1, n_steps, 1)
    targets = values[:, 1:].reshape(-1, n_steps, 1)
    return inputs, targets
t = np.linspace(t_min, t_max, (t_max - t_min) // resolution)
n_steps = 20
t_instance = np.linspace(
12.2, 12.2 + resolution * (n_steps + 1), n_steps + 1)
# each training instance = 20 inputs long
# targets = 20-input sequences
tf.reset_default_graph()
n_steps = 20
n_inputs = 1
n_neurons = 100
n_outputs = 1
X = tf.placeholder(tf.float32, [None, n_steps, n_inputs])
y = tf.placeholder(tf.float32, [None, n_steps, n_outputs])
cell = tf.contrib.rnn.BasicRNNCell(
num_units=n_neurons,
activation=tf.nn.relu)
outputs, states = tf.nn.dynamic_rnn(
cell, X, dtype=tf.float32)
print(outputs.shape)
# output at each time step now vector[100],
# but we want single output value at each step.
# use OutputProjectionWrapper()
# -- adds FC layer to top of each output
cell = tf.contrib.rnn.OutputProjectionWrapper(
tf.contrib.rnn.BasicRNNCell(
num_units=n_neurons,
activation=tf.nn.relu),
output_size=n_outputs)
# define cost function using MSE
# use Adam optimizer
learning_rate = 0.001
loss = tf.reduce_mean(
tf.square(outputs - y))
optimizer = tf.train.AdamOptimizer(
learning_rate=learning_rate)
training_op = optimizer.minimize(loss)
init = tf.global_variables_initializer()
# initialize & run
init = tf.global_variables_initializer()
n_iterations = 1000
batch_size = 50
with tf.Session() as sess:
init.run()
for iteration in range(n_iterations):
X_batch, y_batch = next_batch(batch_size, n_steps)
sess.run(training_op, feed_dict={X: X_batch, y: y_batch})
if iteration % 100 == 0:
mse = loss.eval(feed_dict={X: X_batch, y: y_batch})
print(iteration, "\tMSE:", mse)
# use trained model to make some predictions
X_new = time_series(np.array(t_instance[:-1].reshape(-1, n_steps, n_inputs)))
y_pred = sess.run(outputs, feed_dict={X: X_new})
print(y_pred)
import matplotlib.pyplot as plt
plt.title("Testing the model", fontsize=14)
plt.plot(
t_instance[:-1],
time_series(t_instance[:-1]),
"bo", markersize=10, label="instance")
plt.plot(
t_instance[1:],
time_series(t_instance[1:]),
"w*", markersize=10, label="target")
plt.plot(
t_instance[1:],
y_pred[0,:,0],
"r.", markersize=10, label="prediction")
plt.legend(loc="upper left")
plt.xlabel("Time")
#save_fig("time_series_pred_plot")
plt.show()
tf.reset_default_graph()
n_steps = 20
n_inputs = 1
n_neurons = 100
n_outputs = 1
X = tf.placeholder(tf.float32, [None, n_steps, n_inputs])
y = tf.placeholder(tf.float32, [None, n_steps, n_outputs])
cell = tf.contrib.rnn.BasicRNNCell(
num_units=n_neurons,
activation=tf.nn.relu)
rnn_outputs, states = tf.nn.dynamic_rnn(
cell, X, dtype=tf.float32)
# stack outputs using reshape
stacked_rnn_outputs = tf.reshape(
rnn_outputs, [-1, n_neurons])
print(stacked_rnn_outputs)
# add FC layer -- just a projection, so no activation fn needed
stacked_outputs = fully_connected(
stacked_rnn_outputs,
n_outputs,
activation_fn=None)
print(stacked_outputs)
# unstack outputs using reshape
outputs = tf.reshape(
stacked_outputs, [-1, n_steps, n_outputs])
print(outputs)
loss = tf.reduce_sum(tf.square(outputs - y))
optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate)
training_op = optimizer.minimize(loss)
#initialize & run
init = tf.global_variables_initializer()
n_iterations = 1000
batch_size = 50
with tf.Session() as sess:
init.run()
for iteration in range(n_iterations):
X_batch, y_batch = next_batch(batch_size, n_steps)
sess.run(training_op, feed_dict={X: X_batch, y: y_batch})
if iteration % 100 == 0:
mse = loss.eval(feed_dict={X: X_batch, y: y_batch})
print(iteration, "\tMSE:", mse)
# use trained model to make some predictions
X_new = time_series(np.array(t_instance[:-1].reshape(-1, n_steps, n_inputs)))
y_pred = sess.run(outputs, feed_dict={X: X_new})
print(y_pred)
plt.title("Testing the model", fontsize=14)
plt.plot(t_instance[:-1], time_series(t_instance[:-1]), "bo", markersize=10, label="instance")
plt.plot(t_instance[1:], time_series(t_instance[1:]), "w*", markersize=10, label="target")
plt.plot(t_instance[1:], y_pred[0,:,0], "r.", markersize=10, label="prediction")
plt.legend(loc="upper left")
plt.xlabel("Time")
plt.show()
n_iterations = 2000
batch_size = 50
with tf.Session() as sess:
init.run()
for iteration in range(n_iterations):
X_batch, y_batch = next_batch(batch_size, n_steps)
sess.run(training_op, feed_dict={X: X_batch, y: y_batch})
if iteration % 100 == 0:
mse = loss.eval(feed_dict={X: X_batch, y: y_batch})
print(iteration, "\tMSE:", mse)
# Generate a "creative" sequence by repeatedly feeding the model its own
# last prediction. Seed sequence 1 is all zeros.
sequence1 = [0. for i in range(n_steps)]
for iteration in range(len(t) - n_steps):
X_batch = np.array(sequence1[-n_steps:]).reshape(1, n_steps, 1)
y_pred = sess.run(outputs, feed_dict={X: X_batch})
sequence1.append(y_pred[0, -1, 0])
# Seed sequence 2 is a slice of the real series.
# NOTE(review): (t_max-t_min/3) divides only t_min by 3 because of operator
# precedence; the intended offset was probably (t_max - t_min) / 3 -- confirm.
sequence2 = [time_series(i * resolution + t_min + (t_max-t_min/3)) for i in range(n_steps)]
for iteration in range(len(t) - n_steps):
X_batch = np.array(sequence2[-n_steps:]).reshape(1, n_steps, 1)
y_pred = sess.run(outputs, feed_dict={X: X_batch})
sequence2.append(y_pred[0, -1, 0])
plt.figure(figsize=(11,4))
plt.subplot(121)
plt.plot(t, sequence1, "b-")
plt.plot(t[:n_steps], sequence1[:n_steps], "b-", linewidth=3)
plt.xlabel("Time")
plt.ylabel("Value")
plt.subplot(122)
plt.plot(t, sequence2, "b-")
plt.plot(t[:n_steps], sequence2[:n_steps], "b-", linewidth=3)
plt.xlabel("Time")
#save_fig("creative_sequence_plot")
plt.show()
tf.reset_default_graph()
n_inputs = 2
n_neurons = 100
n_layers = 3
n_steps = 5
keep_prob = 0.5
X = tf.placeholder(tf.float32, [None, n_steps, n_inputs])
basic_cell = tf.contrib.rnn.BasicRNNCell(
num_units=n_neurons)
print(basic_cell)
multi_layer_cell = tf.contrib.rnn.MultiRNNCell(
[basic_cell] * n_layers)
print(multi_layer_cell)
# states = tuple (one tensor/layer, = final state of layer's cell)
outputs, states = tf.nn.dynamic_rnn(
multi_layer_cell, X, dtype=tf.float32)
init = tf.global_variables_initializer()
import numpy.random as rnd
X_batch = rnd.rand(2, n_steps, n_inputs)
with tf.Session() as sess:
init.run()
outputs_val, states_val = sess.run(
[outputs, states],
feed_dict={X: X_batch})
print(outputs_val.shape)
# apply 50% dropout to inputs of RNN layers
# can apply dropout to outputs via output_keep_prob
tf.reset_default_graph()
from tensorflow.contrib.layers import fully_connected
n_inputs = 1
n_neurons = 100
n_layers = 3
n_steps = 20
n_outputs = 1
keep_prob = 0.5
learning_rate = 0.001
def deep_rnn_with_dropout(X, y, is_training):
    """Build a deep RNN regressor producing one output per timestep.

    Parameters
    ----------
    X : float32 tensor, shape [None, n_steps, n_inputs]
    y : float32 tensor, shape [None, n_steps, n_outputs]
    is_training : Python bool; when True, dropout is applied to the inputs
        of every RNN layer.  The TF-1.x DropoutWrapper cannot switch dropout
        off at run time, so the flag must be fixed at graph-construction time.

    Returns
    -------
    (outputs, loss, training_op) — per-timestep predictions, summed squared
    error, and the Adam training op.
    """
    # BUG FIX: the original built ONE cell and passed [cell] * n_layers to
    # MultiRNNCell, i.e. the same cell object stacked n_layers times; TF >= 1.1
    # rejects that pattern.  Build a fresh (optionally dropout-wrapped) cell
    # per layer instead.
    cells = []
    for _ in range(n_layers):
        cell = tf.contrib.rnn.BasicRNNCell(num_units=n_neurons)
        if is_training:
            cell = tf.contrib.rnn.DropoutWrapper(cell, input_keep_prob=keep_prob)
        cells.append(cell)
    multi_layer_cell = tf.contrib.rnn.MultiRNNCell(cells)
    rnn_outputs, states = tf.nn.dynamic_rnn(multi_layer_cell, X, dtype=tf.float32)
    # Project each timestep's n_neurons activations down to n_outputs by
    # folding time into the batch axis, applying one linear layer, then
    # restoring the [batch, n_steps, n_outputs] shape.
    stacked_rnn_outputs = tf.reshape(rnn_outputs, [-1, n_neurons])
    stacked_outputs = fully_connected(stacked_rnn_outputs, n_outputs, activation_fn=None)
    outputs = tf.reshape(stacked_outputs, [-1, n_steps, n_outputs])
    loss = tf.reduce_sum(tf.square(outputs - y))
    optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate)
    training_op = optimizer.minimize(loss)
    return outputs, loss, training_op
# BUG FIX: is_training must be assigned *before* the graph is built — the
# original set it only after the deep_rnn_with_dropout() call, raising a
# NameError (and the flag has to be chosen at graph-construction time anyway).
is_training = True
X = tf.placeholder(tf.float32, [None, n_steps, n_inputs])
y = tf.placeholder(tf.float32, [None, n_steps, n_outputs])
outputs, loss, training_op = deep_rnn_with_dropout(X, y, is_training)
init = tf.global_variables_initializer()
saver = tf.train.Saver()
n_iterations = 2000
batch_size = 50
with tf.Session() as sess:
    if is_training:
        # Train from scratch, reporting MSE every 100 iterations, then
        # checkpoint the trained model.
        init.run()
        for iteration in range(n_iterations):
            X_batch, y_batch = next_batch(batch_size, n_steps)
            sess.run(training_op, feed_dict={X: X_batch, y: y_batch})
            if iteration % 100 == 0:
                mse = loss.eval(feed_dict={X: X_batch, y: y_batch})
                print(iteration, "\tMSE:", mse)
        save_path = saver.save(sess, "/tmp/my_model.ckpt")
    else:
        saver.restore(sess, "/tmp/my_model.ckpt")
    # Predict the next value at every step of the held-out instance.
    X_new = time_series(np.array(t_instance[:-1].reshape(-1, n_steps, n_inputs)))
    y_pred = sess.run(outputs, feed_dict={X: X_new})

plt.title("Testing the model", fontsize=14)
plt.plot(t_instance[:-1], time_series(t_instance[:-1]), "bo", markersize=10, label="instance")
plt.plot(t_instance[1:], time_series(t_instance[1:]), "w*", markersize=10, label="target")
plt.plot(t_instance[1:], y_pred[0,:,0], "r.", markersize=10, label="prediction")
plt.legend(loc="upper left")
plt.xlabel("Time")
plt.show()
# testing
# Restore the checkpoint trained above and plot instance / target /
# prediction for one windowed slice of the series.
with tf.Session() as sess:
saver.restore(sess, "/tmp/my_model.ckpt")
X_new = time_series(
np.array(t_instance[:-1].reshape(-1, n_steps, n_inputs)))
y_pred = sess.run(
outputs, feed_dict={X: X_new})
plt.title("Testing the model", fontsize=14)
plt.plot(t_instance[:-1], time_series(t_instance[:-1]), "bo", markersize=10, label="instance")
plt.plot(t_instance[1:], time_series(t_instance[1:]), "w*", markersize=10, label="target")
plt.plot(t_instance[1:], y_pred[0,:,0], "r.", markersize=10, label="prediction")
plt.legend(loc="upper left")
plt.xlabel("Time")
plt.show()
# --- LSTM classifier on MNIST: each 28x28 image is fed as a sequence of
# 28 rows (timesteps) of 28 pixels; the top layer's final hidden state
# feeds a softmax over the 10 digit classes.
tf.reset_default_graph()
from tensorflow.contrib.layers import fully_connected
n_steps = 28
n_inputs = 28
n_neurons = 150
n_outputs = 10
learning_rate = 0.001
X = tf.placeholder(tf.float32, [None, n_steps, n_inputs])
y = tf.placeholder(tf.int32, [None])
lstm_cell = tf.contrib.rnn.BasicLSTMCell(
num_units=n_neurons)
multi_cell = tf.contrib.rnn.MultiRNNCell(
[lstm_cell]*3)
outputs, states = tf.nn.dynamic_rnn(
multi_cell, X, dtype=tf.float32)
# states[-1] is the top layer's LSTMStateTuple (c, h); take h.
top_layer_h_state = states[-1][1]
logits = fully_connected(
top_layer_h_state,
n_outputs,
activation_fn=None, scope="softmax")
xentropy = tf.nn.sparse_softmax_cross_entropy_with_logits(
labels=y, logits=logits)
loss = tf.reduce_mean(
xentropy, name="loss")
optimizer = tf.train.AdamOptimizer(
learning_rate=learning_rate)
training_op = optimizer.minimize(loss)
correct = tf.nn.in_top_k(
logits, y, 1)
accuracy = tf.reduce_mean(
tf.cast(correct, tf.float32))
init = tf.global_variables_initializer()
states
top_layer_h_state
n_epochs = 10
batch_size = 150
# NOTE(review): X_test / y_test and mnist are defined in an earlier
# notebook cell not visible here.
with tf.Session() as sess:
init.run()
for epoch in range(n_epochs):
for iteration in range(mnist.train.num_examples // batch_size):
X_batch, y_batch = mnist.train.next_batch(batch_size)
X_batch = X_batch.reshape((batch_size, n_steps, n_inputs))
sess.run(training_op, feed_dict={X: X_batch, y: y_batch})
acc_train = accuracy.eval(feed_dict={X: X_batch, y: y_batch})
acc_test = accuracy.eval(feed_dict={X: X_test, y: y_test})
print("Epoch", epoch, "Train accuracy =", acc_train, "Test accuracy =", acc_test)
# Peepholes in TF
# LSTM variant whose gates also see the cell state.
lstm_cell = tf.contrib.rnn.LSTMCell(
num_units=n_neurons,
use_peepholes=True)

# in TF
gru_cell = tf.contrib.rnn.GRUCell(num_units=n_neurons)
# create embeddings variable. init with random[-1,+1]
vocabulary_size = 50000
embedding_size = 150
embeddings = tf.Variable(
tf.random_uniform(
[vocabulary_size, embedding_size],
-1.0, 1.0))
train_inputs = tf.placeholder(
tf.int32, shape=[None]) # from ids...
embed = tf.nn.embedding_lookup(
embeddings, train_inputs) # ...to embeddings

# --- text8 corpus download location (used by fetch_words_data below).
from six.moves import urllib
import errno
import os
import zipfile
WORDS_PATH = "datasets/words"
WORDS_URL = 'http://mattmahoney.net/dc/text8.zip'
def mkdir_p(path):
    """Create directories, ok if they already exist.

    Python-2-compatible replacement for ``os.makedirs(path, exist_ok=True)``
    (available in Python >= 3.2).
    """
    try:
        os.makedirs(path)
    except OSError as exc:
        # Re-raise unless the path already exists as a directory.
        if exc.errno != errno.EEXIST or not os.path.isdir(path):
            raise
def fetch_words_data(words_url=WORDS_URL, words_path=WORDS_PATH):
    """Download (once) and extract the text8 corpus; return its word list.

    The archive is cached at ``words_path``/words.zip; the first zip member
    is decoded as ASCII and split on whitespace.
    """
    os.makedirs(words_path, exist_ok=True)
    zip_path = os.path.join(words_path, "words.zip")
    # Only hit the network when the archive is not already cached.
    if not os.path.exists(zip_path):
        urllib.request.urlretrieve(words_url, zip_path)
    with zipfile.ZipFile(zip_path) as archive:
        raw = archive.read(archive.namelist()[0])
    return raw.decode("ascii").split()
# --- Build the word2vec vocabulary: the 49,999 most frequent words plus a
# catch-all "UNK" token at index 0; `data` maps the corpus to word ids.
words = fetch_words_data()
words[:5]
from collections import Counter
vocabulary_size = 50000
vocabulary = [("UNK", None)] + Counter(words).most_common(vocabulary_size - 1)
vocabulary = np.array([word for word, _ in vocabulary])
dictionary = {word: code for code, word in enumerate(vocabulary)}
# Unknown words fall back to id 0 ("UNK").
data = np.array([dictionary.get(word, 0) for word in words])
" ".join(words[:9]), data[:9]
" ".join([vocabulary[word_index] for word_index in [5241, 3081, 12, 6, 195, 2, 3134, 46, 59]])
words[24], data[24]
import random
from collections import deque
def generate_batch(batch_size, num_skips, skip_window):
# Produce one skip-gram training batch from the global `data` word-id
# stream: each center word is paired with `num_skips` random context
# words drawn from a window of `skip_window` words on each side.
# Advances the global cursor `data_index` (wrapping at the corpus end),
# so successive calls walk through the corpus.
# Returns (batch, labels): int32 arrays of shape (batch_size,) and
# (batch_size, 1) respectively.
global data_index
assert batch_size % num_skips == 0
assert num_skips <= 2 * skip_window
batch = np.ndarray(shape=(batch_size), dtype=np.int32)
labels = np.ndarray(shape=(batch_size, 1), dtype=np.int32)
span = 2 * skip_window + 1 # [ skip_window target skip_window ]
# Sliding window over the corpus; maxlen makes append() drop the oldest.
buffer = deque(maxlen=span)
for _ in range(span):
buffer.append(data[data_index])
data_index = (data_index + 1) % len(data)
for i in range(batch_size // num_skips):
target = skip_window # target label at the center of the buffer
targets_to_avoid = [ skip_window ]
for j in range(num_skips):
# Rejection-sample a context position not used before (and not the center).
while target in targets_to_avoid:
target = random.randint(0, span - 1)
targets_to_avoid.append(target)
batch[i * num_skips + j] = buffer[skip_window]
labels[i * num_skips + j, 0] = buffer[target]
buffer.append(data[data_index])
data_index = (data_index + 1) % len(data)
return batch, labels
# --- Demo one skip-gram batch, then set the word2vec hyperparameters.
data_index=0
batch, labels = generate_batch(8, 2, 1)
batch, [vocabulary[word] for word in batch]
labels, [vocabulary[word] for word in labels[:, 0]]
batch_size = 128
embedding_size = 128 # Dimension of the embedding vector.
skip_window = 1 # How many words to consider left and right.
num_skips = 2 # How many times to reuse an input to generate a label.
# We pick a random validation set to sample nearest neighbors. Here we limit the
# validation samples to the words that have a low numeric ID, which by
# construction are also the most frequent.
valid_size = 16 # Random set of words to evaluate similarity on.
valid_window = 100 # Only pick dev samples in the head of the distribution.
valid_examples = rnd.choice(valid_window, valid_size, replace=False)
num_sampled = 64 # Number of negative examples to sample.
learning_rate = 0.01
# --- word2vec (skip-gram) graph: embedding lookup + NCE loss, with a
# cosine-similarity probe over a fixed validation word set.
tf.reset_default_graph()
# Input data.
train_inputs = tf.placeholder(tf.int32, shape=[batch_size])
train_labels = tf.placeholder(tf.int32, shape=[batch_size, 1])
valid_dataset = tf.constant(valid_examples, dtype=tf.int32)
# Look up embeddings for inputs.
init_embeddings = tf.random_uniform([vocabulary_size, embedding_size], -1.0, 1.0)
embeddings = tf.Variable(init_embeddings)
embed = tf.nn.embedding_lookup(embeddings, train_inputs)
# Construct the variables for the NCE loss
nce_weights = tf.Variable(
tf.truncated_normal([vocabulary_size, embedding_size],
stddev=1.0 / np.sqrt(embedding_size)))
nce_biases = tf.Variable(tf.zeros([vocabulary_size]))
# Compute the average NCE loss for the batch.
# tf.nce_loss automatically draws a new sample of the negative labels each
# time we evaluate the loss.
loss = tf.reduce_mean(
tf.nn.nce_loss(nce_weights, nce_biases, train_labels, embed,
num_sampled, vocabulary_size))
# Construct the Adam optimizer
optimizer = tf.train.AdamOptimizer(learning_rate)
training_op = optimizer.minimize(loss)
# Compute the cosine similarity between minibatch examples and all embeddings.
norm = tf.sqrt(tf.reduce_sum(tf.square(embeddings), axis=1, keep_dims=True))
normalized_embeddings = embeddings / norm
valid_embeddings = tf.nn.embedding_lookup(normalized_embeddings, valid_dataset)
similarity = tf.matmul(valid_embeddings, normalized_embeddings, transpose_b=True)
# Add variable initializer.
init = tf.global_variables_initializer()
# --- word2vec training loop: reports average loss every 2000 steps and the
# nearest neighbours of the validation words every 10000 steps, then saves
# the L2-normalized embedding matrix.
num_steps = 1000 # was 100000?
with tf.Session() as session:
init.run()
average_loss = 0
for step in range(num_steps):
print("\rIteration: {}".format(step), end="\t")
batch_inputs, batch_labels = generate_batch(batch_size, num_skips, skip_window)
feed_dict = {train_inputs : batch_inputs, train_labels : batch_labels}
# We perform one update step by evaluating the training op (including it
# in the list of returned values for session.run()
_, loss_val = session.run([training_op, loss], feed_dict=feed_dict)
average_loss += loss_val
if step % 2000 == 0:
if step > 0:
average_loss /= 2000
# The average loss is an estimate of the loss over the last 2000 batches.
print("Average loss at step ", step, ": ", average_loss)
average_loss = 0
# Note that this is expensive (~20% slowdown if computed every 500 steps)
if step % 10000 == 0:
sim = similarity.eval()
for i in range(valid_size):
valid_word = vocabulary[valid_examples[i]]
top_k = 8 # number of nearest neighbors
# Skip index 0 of the argsort: the word itself is its own nearest neighbour.
nearest = (-sim[i, :]).argsort()[1:top_k+1]
log_str = "Nearest to %s:" % valid_word
for k in range(top_k):
close_word = vocabulary[nearest[k]]
log_str = "%s %s," % (log_str, close_word)
print(log_str)
final_embeddings = normalized_embeddings.eval()
np.save("my_final_embeddings.npy", final_embeddings)
def plot_with_labels(low_dim_embs, labels):
    """Scatter-plot 2-D embeddings, annotating each point with its word."""
    assert low_dim_embs.shape[0] >= len(labels), "More labels than embeddings"
    plt.figure(figsize=(18, 18))  # in inches
    for idx, word in enumerate(labels):
        x, y = low_dim_embs[idx, :]
        plt.scatter(x, y)
        # Offset the text slightly from the point it labels.
        plt.annotate(word,
                     xy=(x, y),
                     xytext=(5, 2),
                     textcoords='offset points',
                     ha='right',
                     va='bottom')
    plt.show()
# --- Visualize the first 500 word embeddings with t-SNE.
from sklearn.manifold import TSNE
tsne = TSNE(perplexity=30, n_components=2, init='pca', n_iter=5000)
plot_only = 500
low_dim_embs = tsne.fit_transform(final_embeddings[:plot_only,:])
labels = [vocabulary[i] for i in range(plot_only)]
plot_with_labels(low_dim_embs, labels)
# --- Autoencoder chapter: build a noisy 3D dataset lying near a 2D manifold.
import matplotlib.pyplot as plt
import numpy as np
import numpy.random as rnd
import tensorflow as tf
import sys
# lets build a 3D dataset
rnd.seed(4)
m = 100
w1, w2 = 0.1, 0.3
noise = 0.1
angles = rnd.rand(m) * 3 * np.pi / 2 - 0.5
X_train = np.empty((m, 3))
X_train[:, 0] = np.cos(angles) + np.sin(angles)/2 + noise * rnd.randn(m) / 2
X_train[:, 1] = np.sin(angles) * 0.7 + noise * rnd.randn(m) / 2
# Third coordinate is a noisy linear combination of the first two.
X_train[:, 2] = X_train[:, 0] * w1 + X_train[:, 1] * w2 + noise * rnd.randn(m)
# normalize it
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
plt.plot(X_train)
plt.show()
# build AE
# Linear autoencoder (no activation, MSE loss): its 2D codings span the
# same subspace PCA would find.
from tensorflow.contrib.layers import fully_connected
n_inputs = 3 # 3D inputs
n_hidden = 2 # 2D codings
n_outputs = n_inputs
learning_rate = 0.01
X = tf.placeholder(
tf.float32, shape=[None, n_inputs])
#
# set activation_fn=None & use MSE for cost function
# to perform simple PCA.
#
hidden = fully_connected(
X,
n_hidden,
activation_fn=None)
outputs = fully_connected(
hidden,
n_outputs,
activation_fn=None)
# MSE
reconstruction_loss = tf.reduce_mean(
tf.square(outputs - X))
optimizer = tf.train.AdamOptimizer(
learning_rate)
training_op = optimizer.minimize(
reconstruction_loss)
init = tf.global_variables_initializer()
# run the AE
# Full-batch training; `codings` is the 2D bottleneck output.
n_iterations = 10000
codings = hidden
with tf.Session() as sess:
init.run()
for iteration in range(n_iterations):
training_op.run(feed_dict={X: X_train})
codings_val = codings.eval(feed_dict={X: X_train})
fig = plt.figure(figsize=(4,3))
plt.plot(codings_val[:,0], codings_val[:, 1], "b.")
plt.xlabel("$z_1$", fontsize=18)
plt.ylabel("$z_2$", fontsize=18, rotation=0)
#ave_fig("linear_autoencoder_pca_plot")
plt.show()
# plot: 2D projection with max variance
# plot: 2D projection with max variance

tf.reset_default_graph()
n_inputs = 28 * 28 # for MNIST
n_hidden1 = 300
n_hidden2 = 150 # codings
n_hidden3 = n_hidden1
n_outputs = n_inputs
learning_rate = 0.01
l2_reg = 0.0001
X = tf.placeholder(tf.float32,
shape=[None, n_inputs])
with tf.contrib.framework.arg_scope(
[fully_connected],
activation_fn=tf.nn.elu,
weights_initializer=tf.contrib.layers.variance_scaling_initializer(),
weights_regularizer=tf.contrib.layers.l2_regularizer(l2_reg)):
hidden1 = fully_connected(X, n_hidden1)
hidden2 = fully_connected(hidden1, n_hidden2) # codings
hidden3 = fully_connected(hidden2, n_hidden3)
outputs = fully_connected(hidden3, n_outputs, activation_fn=None)
# MSE
reconstruction_loss = tf.reduce_mean(
tf.square(outputs - X))
reg_losses = tf.get_collection(
tf.GraphKeys.REGULARIZATION_LOSSES)
loss = tf.add_n(
[reconstruction_loss] + reg_losses)
optimizer = tf.train.AdamOptimizer(
learning_rate)
training_op = optimizer.minimize(loss)
init = tf.global_variables_initializer()
saver = tf.train.Saver()
# use MNIST dataset
from tensorflow.examples.tutorials.mnist import input_data
mnist = input_data.read_data_sets("/tmp/data/")
# train the net. digit labels (y_batch) = unused.
n_epochs = 4
batch_size = 150
with tf.Session() as sess:
init.run()
for epoch in range(n_epochs):
n_batches = mnist.train.num_examples // batch_size
for iteration in range(n_batches):
print("\r{}%".format(100 * iteration // n_batches), end="")
sys.stdout.flush()
X_batch, y_batch = mnist.train.next_batch(batch_size)
sess.run(training_op, feed_dict={X: X_batch})
mse_train = reconstruction_loss.eval(feed_dict={X: X_batch})
print("\r{}".format(epoch), "Train MSE:", mse_train)
saver.save(sess, "./my_model_all_layers.ckpt")
# utility: plot grayscale 28x28 image
def plot_image(image, shape=(28, 28)):
    """Render a flat pixel vector as a greyscale image of the given shape.

    FIX: the default was a mutable list ([28, 28]); a tuple avoids the
    shared-mutable-default pitfall and behaves identically in reshape().
    """
    plt.imshow(image.reshape(shape), cmap="Greys", interpolation="nearest")
    plt.axis("off")
# load model, eval on test set (measure reconstruction error, display original & reconstruction)
def show_reconstructed_digits(X, outputs, model_path = None, n_test_digits = 2):
# Restore `model_path` (if given) into a fresh session, reconstruct the
# first n_test_digits MNIST test images, and plot original/reconstruction
# pairs side by side.  Relies on module-level `saver` and `mnist`.
with tf.Session() as sess:
if model_path:
saver.restore(sess, model_path)
X_test = mnist.test.images[:n_test_digits]
outputs_val = outputs.eval(feed_dict={X: X_test})
fig = plt.figure(figsize=(8, 3 * n_test_digits))
for digit_index in range(n_test_digits):
plt.subplot(n_test_digits, 2, digit_index * 2 + 1)
plot_image(X_test[digit_index])
plt.subplot(n_test_digits, 2, digit_index * 2 + 2)
plot_image(outputs_val[digit_index])
show_reconstructed_digits(X, outputs, "./my_model_all_layers.ckpt")
plt.show()
# --- Stacked autoencoder with tied weights: the decoder reuses the
# transposed encoder weight matrices, halving the number of parameters.
tf.reset_default_graph()
activation = tf.nn.elu
regularizer = tf.contrib.layers.l2_regularizer(l2_reg)
initializer = tf.contrib.layers.variance_scaling_initializer()
X = tf.placeholder(tf.float32, shape=[None, n_inputs])
weights1_init = initializer([n_inputs, n_hidden1])
weights2_init = initializer([n_hidden1, n_hidden2])
weights1 = tf.Variable(weights1_init, dtype=tf.float32, name="weights1")
weights2 = tf.Variable(weights2_init, dtype=tf.float32, name="weights2")
# weights 3,4 not vars!
weights3 = tf.transpose(weights2, name="weights3") # tied weights
weights4 = tf.transpose(weights1, name="weights4") # tied weights
# Biases are never tied — each layer keeps its own.
biases1 = tf.Variable(tf.zeros(n_hidden1),name="biases1")
biases2 = tf.Variable(tf.zeros(n_hidden2),name="biases2")
biases3 = tf.Variable(tf.zeros(n_hidden3),name="biases3")
biases4 = tf.Variable(tf.zeros(n_outputs),name="biases4")
hidden1 = activation(tf.matmul(X, weights1) + biases1)
hidden2 = activation(tf.matmul(hidden1, weights2) + biases2)
hidden3 = activation(tf.matmul(hidden2, weights3) + biases3)
outputs = tf.matmul(hidden3, weights4) + biases4
reconstruction_loss = tf.reduce_mean(
tf.square(outputs - X))
# Only the two free weight matrices are regularized.
reg_loss = regularizer(weights1) + regularizer(weights2)
loss = reconstruction_loss + reg_loss
optimizer = tf.train.AdamOptimizer(learning_rate)
training_op = optimizer.minimize(loss)
init = tf.global_variables_initializer()

def train_autoencoder(
X_train,
n_neurons,
n_epochs,
batch_size,
learning_rate = 0.01,
l2_reg = 0.0005,
activation_fn=tf.nn.elu):
"""Train one single-hidden-layer autoencoder on X_train in its own graph.

Returns (hidden_val, hidden weights, hidden biases, output weights,
output biases), where hidden_val is the hidden layer's activation for the
whole training set — used to feed the next autoencoder in the stack.
"""
# Build in a private Graph so repeated calls do not pollute the default graph.
graph = tf.Graph()
with graph.as_default():
n_inputs = X_train.shape[1]
X = tf.placeholder(tf.float32, shape=[None, n_inputs])
with tf.contrib.framework.arg_scope(
[fully_connected],
activation_fn=activation_fn,
weights_initializer=tf.contrib.layers.variance_scaling_initializer(),
weights_regularizer=tf.contrib.layers.l2_regularizer(
l2_reg)):
hidden = fully_connected(
X, n_neurons, scope="hidden")
outputs = fully_connected(
hidden, n_inputs, activation_fn=None, scope="outputs")
mse = tf.reduce_mean(tf.square(outputs - X))
reg_losses = tf.get_collection(
tf.GraphKeys.REGULARIZATION_LOSSES)
loss = tf.add_n([mse] + reg_losses)
optimizer = tf.train.AdamOptimizer(learning_rate)
training_op = optimizer.minimize(loss)
init = tf.global_variables_initializer()
with tf.Session(graph=graph) as sess:
init.run()
for epoch in range(n_epochs):
n_batches = len(X_train) // batch_size
for iteration in range(n_batches):
print("\r{}%".format(100 * iteration // n_batches), end="")
sys.stdout.flush()
# Random mini-batch sampled with replacement across iterations.
indices = rnd.permutation(
len(X_train))[:batch_size]
X_batch = X_train[indices]
sess.run(
training_op, feed_dict={X: X_batch})
mse_train = mse.eval(
feed_dict={X: X_batch})
print("\r{}".format(epoch), "Train MSE:", mse_train)
# Pull the trained parameters out by variable name before the session closes.
params = dict(
[(var.name, var.eval()) for var in tf.get_collection(
tf.GraphKeys.TRAINABLE_VARIABLES)])
hidden_val = hidden.eval(
feed_dict={X: X_train})
return hidden_val, params["hidden/weights:0"], params["hidden/biases:0"], params["outputs/weights:0"], params["outputs/biases:0"]
# train two AEs
# Greedy layer-wise pretraining: the second AE trains on the first AE's
# hidden activations.
hidden_output, W1, b1, W4, b4 = train_autoencoder(
mnist.train.images,
n_neurons=300,
n_epochs=4,
batch_size=150)
_, W2, b2, W3, b3 = train_autoencoder(
hidden_output,
n_neurons=150,
n_epochs=4,
batch_size=150)
# create stacked AE by reusing weights &and biases from above
tf.reset_default_graph()
n_inputs = 28*28
X = tf.placeholder(tf.float32, shape=[None, n_inputs])
# Encoder = layers 1-2 of the two AEs; decoder = their output layers in reverse.
hidden1 = tf.nn.elu(tf.matmul(X, W1) + b1)
hidden2 = tf.nn.elu(tf.matmul(hidden1, W2) + b2)
hidden3 = tf.nn.elu(tf.matmul(hidden2, W3) + b3)
outputs = tf.matmul(hidden3, W4) + b4
# Load model, evaluates it on test set (reconstruction error)
# display original & reconstructed images
# NOTE(review): re-definition of show_reconstructed_digits from earlier in
# the notebook, now with plt.show() inside the function.
def show_reconstructed_digits(
X,
outputs,
model_path = None,
n_test_digits = 2):
with tf.Session() as sess:
if model_path:
saver.restore(sess, model_path)
X_test = mnist.test.images[:n_test_digits]
outputs_val = outputs.eval(feed_dict={X: X_test})
fig = plt.figure(figsize=(8, 3 * n_test_digits))
for digit_index in range(n_test_digits):
plt.subplot(n_test_digits, 2, digit_index * 2 + 1)
plot_image(X_test[digit_index])
plt.subplot(n_test_digits, 2, digit_index * 2 + 2)
plot_image(outputs_val[digit_index])
plt.show()
#show_reconstructed_digits(X, outputs, "./my_model_all_layers.ckpt")
# No model_path: the weights are constants (W1..b4), not session variables.
show_reconstructed_digits(X, outputs)
# NOTE(review): "!pip3 ..." is an IPython shell magic from the notebook —
# not valid plain Python.
!pip3 install --upgrade gym
# --- Reinforcement-learning chapter: CartPole environment basics.
import gym
env = gym.make("CartPole-v0")
obs = env.reset()
obs
env.render()

img = env.render(mode="rgb_array")
img.shape
# what actions are possible?
# in this case: 0 = accelerate left, 1 = accelerate right
env.action_space
# pole is leaning right. let's go further to the right.
action = 1
obs, reward, done, info = env.step(action)
obs, reward, done, info
# example policy:
# (1) accelerate left when leaning left, (2) accelerate right when leaning right
# average reward over 500 episodes?
def basic_policy(obs):
    """Heuristic CartPole policy: push toward the side the pole leans.

    obs[2] is the pole angle; negative angle (leaning left) -> action 0
    (accelerate left), otherwise action 1 (accelerate right).
    """
    return 0 if obs[2] < 0 else 1
# Evaluate the heuristic over 500 episodes, collecting total reward each time.
totals = []
for episode in range(500):
episode_rewards = 0
obs = env.reset()
for step in range(1000): # 1000 steps max, we don't want to run forever
action = basic_policy(obs)
obs, reward, done, info = env.step(action)
episode_rewards += reward
if done:
break
totals.append(episode_rewards)
import numpy as np
# Summary statistics of the episode returns.
np.mean(totals), np.std(totals), np.min(totals), np.max(totals)
A common tactic is to apply a discount rate so that rewards further in the future count for less than immediate ones.
Normalizing scores across many episodes makes the resulting estimates more reliable.
| NN Policy | Discounts & Rewards |
|---|---|
![]() |
![]() |
import tensorflow as tf
from tensorflow.contrib.layers import fully_connected
# 1. Specify the neural network architecture
n_inputs = 4 # == env.observation_space.shape[0]
n_hidden = 4 # simple task, don't need more hidden neurons
n_outputs = 1 # only output prob(accelerating left)
initializer = tf.contrib.layers.variance_scaling_initializer()
# 2. Build the neural network
X = tf.placeholder(
tf.float32, shape=[None, n_inputs])
hidden = fully_connected(
X, n_hidden,
activation_fn=tf.nn.elu,
weights_initializer=initializer)
logits = fully_connected(
hidden, n_outputs,
activation_fn=None,
weights_initializer=initializer)
outputs = tf.nn.sigmoid(logits) # logistic (sigmoid) ==> return 0.0-1.0
# 3. Select a random action based on the estimated probabilities
# Column 0 = P(left), column 1 = P(right); multinomial samples action 0 or 1.
p_left_and_right = tf.concat(
axis=1, values=[outputs, 1 - outputs])
action = tf.multinomial(
tf.log(p_left_and_right),
num_samples=1)
init = tf.global_variables_initializer()
| Markov Chain | Markov Decision Process |
|---|---|
![]() |
![]() |
# Define MDP:
# T[s, a, s'] = transition probabilities, R[s, a, s'] = rewards; nan marks
# actions that are impossible in a given state.
nan=np.nan # represents impossible actions
T = np.array([ # shape=[s, a, s']
[[0.7, 0.3, 0.0], [1.0, 0.0, 0.0], [0.8, 0.2, 0.0]],
[[0.0, 1.0, 0.0], [nan, nan, nan], [0.0, 0.0, 1.0]],
[[nan, nan, nan], [0.8, 0.1, 0.1], [nan, nan, nan]],
])
R = np.array([ # shape=[s, a, s']
[[10., 0.0, 0.0], [0.0, 0.0, 0.0], [0.0, 0.0, 0.0]],
[[10., 0.0, 0.0], [nan, nan, nan], [0.0, 0.0, -50.]],
[[nan, nan, nan], [40., 0.0, 0.0], [nan, nan, nan]],
])
possible_actions = [[0, 1, 2], [0, 2], [1]]
# run Q-Value Iteration algo
# Impossible actions stay at -inf so argmax never selects them.
Q = np.full((3, 3), -np.inf)
for state, actions in enumerate(possible_actions):
Q[state, actions] = 0.0 # Initial value = 0.0, for all possible actions
learning_rate = 0.01
discount_rate = 0.95
n_iterations = 100
# Bellman optimality backup applied synchronously from the previous Q.
for iteration in range(n_iterations):
Q_prev = Q.copy()
for s in range(3):
for a in possible_actions[s]:
Q[s, a] = np.sum([
T[s, a, sp] * (R[s, a, sp] + discount_rate * np.max(Q_prev[sp]))
for sp in range(3)
])
print("Q: \n",Q)
print("Optimal action for each state:\n",np.argmax(Q, axis=1))
# change discount rate to 0.9, see how policy changes:
# NOTE(review): Q is not re-initialized here, so iteration continues from
# the gamma=0.95 solution; value iteration still converges to the gamma=0.9
# fixed point regardless of the starting values.
discount_rate = 0.90
for iteration in range(n_iterations):
Q_prev = Q.copy()
for s in range(3):
for a in possible_actions[s]:
Q[s, a] = np.sum([
T[s, a, sp] * (R[s, a, sp] + discount_rate * np.max(Q_prev[sp]))
for sp in range(3)
])
print("Q: \n",Q)
print("Optimal action for each state:\n",np.argmax(Q, axis=1))
The algorithm keeps a running average of the most recently observed rewards plus the anticipated future rewards.
The Q-Learning algorithm is an adaptation of Q-Value Iteration for the case where the transition probabilities and rewards are initially unknown.
import numpy.random as rnd
# Tabular Q-Learning on the same MDP: wander randomly, learn Q from
# observed (s, a, r, s') transitions with a decaying learning rate.
learning_rate0 = 0.05
learning_rate_decay = 0.1
n_iterations = 20000
s = 0 # start in state 0
Q = np.full((3, 3), -np.inf) # -inf for impossible actions
for state, actions in enumerate(possible_actions):
    Q[state, actions] = 0.0 # Initial value = 0.0, for all possible actions
for iteration in range(n_iterations):
    a = rnd.choice(possible_actions[s]) # choose an action (randomly)
    sp = rnd.choice(range(3), p=T[s, a]) # pick next state using T[s, a]
    reward = R[s, a, sp]
    learning_rate = learning_rate0 / (1 + iteration * learning_rate_decay)
    # BUG FIX: the Q-Learning update is
    #   Q(s,a) <- (1-alpha)*Q(s,a) + alpha*(r + gamma * max_a' Q(s',a'))
    # i.e. the NEW sample gets weight alpha (the learning rate).  The original
    # had the two weights swapped, so as alpha decayed the running average was
    # discarded in favor of single noisy one-step targets.
    Q[s, a] = (1 - learning_rate) * Q[s, a] + learning_rate * (reward + discount_rate * np.max(Q[sp]))
    s = sp # move to next state
print("Q: \n",Q)
print("Optimal action for each state:\n",np.argmax(Q, axis=1))
# --- Deep Q-Network chapter: Ms Pac-Man Atari environment.
env = gym.make('MsPacman-v0')
obs = env.reset()
obs.shape, env.action_space
# action_space = 9 possible joystick actions
# observations = atari screenshots as 3D NumPy arrays
# Mean grey level of Ms Pac-Man's sprite colour, RGB [210, 164, 74].
mspacman_color = np.array([210, 164, 74]).mean()
# crop image, shrink to 88x80 pixels, convert to grayscale, improve contrast
def preprocess_observation(obs):
    """Shrink a 210x160x3 Atari frame to an 88x80x1 float array in [-1, 1].

    Crops/downsamples by taking every 2nd row of rows 1..175 and every 2nd
    column, averages RGB to greyscale, zeroes Ms Pac-Man's own colour to
    improve contrast, then rescales pixel values.
    """
    img = obs[1:176:2, ::2] # crop and downsize
    img = img.mean(axis=2) # to greyscale
    img[img==mspacman_color] = 0 # improve contrast
    # BUG FIX: (img - 128) / 128 already maps 0..255 onto [-1, 1); the
    # original trailing "- 1" shifted the range to [-2, 0], contradicting
    # the intended "normalize from -1. to 1."
    img = (img - 128) / 128 # normalize from -1. to 1.
    return img.reshape(88, 80, 1)
| Ms PacMan Observation | Deep-Q net |
|---|---|
![]() |
![]() |
# Create DQN
# 3 convo layers, then 2 FC layers including output layer
from tensorflow.contrib.layers import convolution2d, fully_connected
input_height = 88
input_width = 80
input_channels = 1
# Per-conv-layer settings, consumed in lockstep by zip() in q_network below.
conv_n_maps = [32, 64, 64]
conv_kernel_sizes = [(8,8), (4,4), (3,3)]
conv_strides = [4, 2, 1]
conv_paddings = ["SAME"]*3
conv_activation = [tf.nn.relu]*3
n_hidden_in = 64 * 11 * 10 # conv3 has 64 maps of 11x10 each
n_hidden = 512
hidden_activation = tf.nn.relu
n_outputs = env.action_space.n # 9 discrete actions are available
initializer = tf.contrib.layers.variance_scaling_initializer()
# training will need ***TWO*** DQNs:
# one to train the actor
# another to learn from trials & errors (critic)
# q_network is our net builder.
def q_network(X_state, scope):
# Build one DQN tower (3 conv layers + 2 dense layers) under `scope`.
# Returns (outputs, trainable_vars_by_name): outputs has one Q-value per
# action; the dict keys are variable names with the scope prefix stripped,
# so actor and critic variables can be matched pairwise for copying.
prev_layer = X_state
conv_layers = []
with tf.variable_scope(scope) as scope:
for n_maps, kernel_size, stride, padding, activation in zip(
conv_n_maps,
conv_kernel_sizes,
conv_strides,
conv_paddings,
conv_activation):
prev_layer = convolution2d(
prev_layer,
num_outputs=n_maps,
kernel_size=kernel_size,
stride=stride,
padding=padding,
activation_fn=activation,
weights_initializer=initializer)
conv_layers.append(prev_layer)
last_conv_layer_flat = tf.reshape(
prev_layer,
shape=[-1, n_hidden_in])
hidden = fully_connected(
last_conv_layer_flat,
n_hidden,
activation_fn=hidden_activation,
weights_initializer=initializer)
# Linear output layer: raw Q-value estimates, one per action.
outputs = fully_connected(
hidden,
n_outputs,
activation_fn=None,
weights_initializer=initializer)
trainable_vars = tf.get_collection(
tf.GraphKeys.TRAINABLE_VARIABLES,
scope=scope.name)
trainable_vars_by_name = {var.name[len(scope.name):]: var
for var in trainable_vars}
return outputs, trainable_vars_by_name
# create input placeholders & two DQNs
X_state = tf.placeholder(
tf.float32,
shape=[None, input_height, input_width,
input_channels])
actor_q_values, actor_vars = q_network(X_state, scope="q_networks/actor")
critic_q_values, critic_vars = q_network(X_state, scope="q_networks/critic")
# Matching variables by their scope-stripped names pairs each critic
# variable with its actor counterpart.
copy_ops = [actor_var.assign(critic_vars[var_name])
for var_name, actor_var in actor_vars.items()]
# op to copy all trainable vars of critic DQN to actor DQN...
# use tf.group() to group all assignment ops together
copy_critic_to_actor = tf.group(*copy_ops)
# Critic DQN learns by matching Q-Value predictions
# to actor's Q-Value estimations during game play
# Actor will use a "replay memory" (5 tuples):
# state, action, next-state, reward, (0=over/1=continue)
# use normal supervised training ops
# occasionally copy critic DQN to actor DQN
# DQN normally returns one Q-Value for every poss. action
# only need Q-Value of action actually chosen
# So, convert action to one-hot vector [0...1...0], multiple by Q-values
# then sum over 1st axis.
X_action = tf.placeholder(
tf.int32, shape=[None])
q_value = tf.reduce_sum(
critic_q_values * tf.one_hot(X_action, n_outputs),
axis=1, keep_dims=True)
# training setup
# BUG FIX: the original called tf.reset_default_graph() at this point, which
# destroys the actor/critic networks and the q_value tensor built above — the
# cost below must live in the same graph as q_value, so no reset is performed.
y = tf.placeholder(tf.float32, shape=[None, 1])
# Squared error between target Q-values (y) and the critic's estimate for
# the action actually taken.
cost = tf.reduce_mean(tf.square(y - q_value))
# non-trainable. minimize() op will manage incrementing it
global_step = tf.Variable(0, trainable=False, name='global_step')
optimizer = tf.train.AdamOptimizer(learning_rate)
training_op = optimizer.minimize(cost, global_step=global_step)
init = tf.global_variables_initializer()
saver = tf.train.Saver()
# Replay memory: a bounded deque of (state, action, reward, next_state,
# continue) tuples; the oldest experiences fall off as new ones arrive.
from collections import deque
replay_memory_size = 10000
replay_memory = deque([], maxlen=replay_memory_size)

def sample_memories(batch_size):
    """Sample batch_size experiences (without replacement) from replay memory.

    Returns five column arrays: states, actions, rewards (as a column
    vector), next_states, continues (column vector; 0.0 if the episode
    ended on that transition).
    """
    indices = rnd.permutation(len(replay_memory))[:batch_size]
    cols = [[], [], [], [], []] # state, action, reward, next_state, continue
    for idx in indices:
        for col, value in zip(cols, replay_memory[idx]):
            col.append(value)
    states, actions, rewards, next_states, continues = [np.array(col) for col in cols]
    return (states, actions, rewards.reshape(-1, 1), next_states, continues.reshape(-1, 1))
# Actor's exploration policy: epsilon-greedy, with epsilon annealed linearly
# from eps_max down to eps_min over the first eps_decay_steps training steps.
eps_min = 0.05
eps_max = 1.0
eps_decay_steps = 50000
def epsilon_greedy(q_values, step):
    """With probability epsilon pick a random action, else the greedy one."""
    epsilon = max(eps_min, eps_max - (eps_max-eps_min) * step/eps_decay_steps)
    if rnd.rand() >= epsilon:
        return np.argmax(q_values) # optimal action
    return rnd.randint(n_outputs) # random action
# training setup: the variables
n_steps = 100000 # total number of training steps
training_start = 1000 # start training after 1,000 game iterations
training_interval = 3 # run a training step every 3 game iterations
save_steps = 50 # save the model every 50 training steps
copy_steps = 25 # copy the critic to the actor every 25 training steps
discount_rate = 0.95
skip_start = 90 # skip the start of every game (it's just waiting time)
batch_size = 50
iteration = 0 # game iterations
checkpoint_path = "./my_dqn.ckpt"
# Forces an env.reset() on the first pass through the main loop below.
done = True # env needs to be reset
# let's get busy
# Main DQN loop: the actor plays and fills the replay memory; every
# training_interval game steps the critic trains on a sampled batch; the
# critic is periodically copied into the actor and checkpointed.
import os
with tf.Session() as sess:
# restore models if checkpoint file exists
if os.path.isfile(checkpoint_path):
saver.restore(sess, checkpoint_path)
# otherwise normally initialize variables
else:
init.run()
while True:
step = global_step.eval()
if step >= n_steps:
break
# iteration = total number of game steps from beginning
iteration += 1
if done: # game over, start again
obs = env.reset()
for skip in range(skip_start): # skip the start of each game
obs, reward, done, info = env.step(0)
state = preprocess_observation(obs)
# Actor evaluates what to do
q_values = actor_q_values.eval(feed_dict={X_state: [state]})
action = epsilon_greedy(q_values, step)
# Actor plays
obs, reward, done, info = env.step(action)
next_state = preprocess_observation(obs)
# Let's memorize what just happened
# continue-flag is 1.0 while the episode is running, 0.0 when it ended.
replay_memory.append((state, action, reward, next_state, 1.0 - done))
state = next_state
if iteration < training_start or iteration % training_interval != 0:
continue
# Critic learns
X_state_val, X_action_val, rewards, X_next_state_val, continues = (
sample_memories(batch_size))
next_q_values = actor_q_values.eval(
feed_dict={X_state: X_next_state_val})
max_next_q_values = np.max(
next_q_values, axis=1, keepdims=True)
# Bellman target; `continues` zeroes the bootstrap term at episode ends.
y_val = rewards + continues * discount_rate * max_next_q_values
training_op.run(
feed_dict={X_state: X_state_val, X_action: X_action_val, y: y_val})
# Regularly copy critic to actor
if step % copy_steps == 0:
copy_critic_to_actor.run()
# And save regularly
if step % save_steps == 0:
saver.save(sess, checkpoint_path)
print("\n",np.average(y_val))