Repository: zlpure/CS234 Branch: master Commit: 2d92db348c3d Files: 72 Total size: 136.8 KB Directory structure: gitextract_onbbhi0g/ ├── LICENSE ├── README.md ├── assignment1/ │ ├── Makefile │ ├── collect_submission.sh │ ├── lake_envs.py │ ├── log │ ├── model_based_learning.py │ ├── model_free_learning.py │ ├── requirements.txt │ └── vi_and_pi.py ├── assignment2/ │ ├── .gitignore │ ├── Makefile │ ├── README.md │ ├── collect_submission.sh │ ├── configs/ │ │ ├── __init__.py │ │ ├── frozen_lake.py │ │ ├── q2_linear.py │ │ ├── q3_nature.py │ │ ├── q4_train_atari_linear.py │ │ ├── q5_train_atari_nature.py │ │ ├── q6_bonus_question.py │ │ └── test.py │ ├── core/ │ │ ├── __init__.py │ │ ├── deep_q_learning.py │ │ └── q_learning.py │ ├── q1_schedule.py │ ├── q2_linear.py │ ├── q3_nature.py │ ├── q4_train_atari_linear.py │ ├── q5_train_atari_nature.py │ ├── q6_double_q_learning.py │ ├── q6_dueling.py │ ├── requirements.txt │ ├── results/ │ │ ├── q2_linear/ │ │ │ ├── events.out.tfevents.1511874609.zengliang-PU551LD │ │ │ ├── log.txt │ │ │ └── model.weights/ │ │ │ ├── .data-00000-of-00001 │ │ │ ├── .index │ │ │ ├── .meta │ │ │ └── checkpoint │ │ ├── q3_nature/ │ │ │ ├── events.out.tfevents.1511876195.zengliang-PU551LD │ │ │ ├── log.txt │ │ │ └── model.weights/ │ │ │ ├── .index │ │ │ ├── .meta │ │ │ └── checkpoint │ │ └── q4_train_atari_linear/ │ │ ├── log.txt │ │ ├── model.weights/ │ │ │ ├── .data-00000-of-00001 │ │ │ ├── .index │ │ │ ├── .meta │ │ │ └── checkpoint │ │ └── monitor/ │ │ ├── openaigym.episode_batch.0.2799.stats.json │ │ ├── openaigym.episode_batch.0.3758.stats.json │ │ ├── openaigym.episode_batch.0.5469.stats.json │ │ ├── openaigym.manifest.0.2799.manifest.json │ │ ├── openaigym.manifest.0.3758.manifest.json │ │ ├── openaigym.manifest.0.5469.manifest.json │ │ ├── openaigym.video.0.2799.video000000.meta.json │ │ ├── openaigym.video.0.3758.video000000.meta.json │ │ └── openaigym.video.0.5469.video000000.meta.json │ └── utils/ │ ├── __init__.py │ ├── general.py │ 
├── preprocess.py │ ├── replay_buffer.py │ ├── test_env.py │ ├── viewer.py │ └── wrappers.py └── assignment3/ ├── discrete_env.py ├── frozen_lake.py ├── q1.py ├── q2.py ├── q3.py ├── requirements.txt └── utils.py ================================================ FILE CONTENTS ================================================ ================================================ FILE: LICENSE ================================================ MIT License Copyright (c) 2017 Liang Zeng Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. ================================================ FILE: README.md ================================================ ## My Solution to Assignments of CS234 This is my solution to three assignments of CS234.
[CS234: Deep Reinforcement Learning](http://cs234.stanford.edu/) is an interesting class that teaches you what reinforcement learning is: learning to make good sequences of decisions. The class provides basic knowledge of, and insights into, cutting-edge research in reinforcement learning. More details are as follows: * Define the key features of RL vs AI & other ML * Define MDP, POMDP, bandit, batch offline RL, online RL * Describe the exploration vs exploitation challenge and compare and contrast 2 or more approaches * Given an application problem (e.g. from computer vision, robotics, etc) decide if it should be formulated as a RL problem, if yes how to formulate, what algorithm (from class) is best suited to address, and justify an answer * Implement several RL algorithms incl. a deep RL approach * Describe multiple criteria for analyzing RL algorithms and evaluate algorithms on these metrics: e.g. regret, sample complexity, computational complexity, convergence, etc. * List at least two open challenges or hot topics in RL ****** **Note:** If you consult my source code and incorporate any of it into your own algorithm or system, you should clearly cite it in your code. 
****** ## Table of Contents * [Assignment 1](https://github.com/zlpure/CS234/tree/master/assignment1) * Bellman Operator Properties * Value Iteration * Grid Policies * Frozen Lake MDP * Frozen Lake Reinforcement Learning * [Assignment 2](https://github.com/zlpure/CS234/tree/master/assignment2) * Q-learning * Linear Approximation * DeepMind's DQN * (Bonus) Double DQN * (Bonus) Dueling DQN * [Assignment 3](https://github.com/zlpure/CS234/tree/master/assignment3) * R-max algorithm * epsilon-greedy q-learning * Expected Regret Bounds ## Dependencies * Anaconda * tensorflow>=0.12 * matplotlib * scipy * numpy * sklearn * six ## Author [@zlpure](https://github.com/zlpure) ================================================ FILE: assignment1/Makefile ================================================ submit: sh collect_submission.sh clean: rm -f assignment1.zip rm -f *.pyc *.png *.npy utils/*.pyc ================================================ FILE: assignment1/collect_submission.sh ================================================ rm -f assignment1.zip zip -r assignment1.zip *.py *.ipynb ================================================ FILE: assignment1/lake_envs.py ================================================ # coding: utf-8 """Defines some frozen lake maps.""" from gym.envs.toy_text import frozen_lake, discrete from gym.envs.registration import register register( id='Deterministic-4x4-FrozenLake-v0', entry_point='gym.envs.toy_text.frozen_lake:FrozenLakeEnv', kwargs={'map_name': '4x4', 'is_slippery': False}) register( id='Deterministic-8x8-FrozenLake-v0', entry_point='gym.envs.toy_text.frozen_lake:FrozenLakeEnv', kwargs={'map_name': '8x8', 'is_slippery': False}) register( id='Stochastic-4x4-FrozenLake-v0', entry_point='gym.envs.toy_text.frozen_lake:FrozenLakeEnv', kwargs={'map_name': '4x4', 'is_slippery': True}) ================================================ FILE: assignment1/log ================================================ Winter is here. 
You and your friends were tossing around a frisbee at the park when you made a wild throw that left the frisbee out in the middle of the lake. The water is mostly frozen, but there are a few holes where the ice has melted. If you step into one of those holes, you'll fall into the freezing water. At this time, there's an international frisbee shortage, so it's absolutely imperative that you navigate across the lake and retrieve the disc. However, the ice is slippery, so you won't always move in the direction you intend. The surface is described using a grid like the following SFFF FHFH FFFH HFFG S : starting point, safe F : frozen surface, safe H : hole, fall to your doom G : goal, where the frisbee is located The episode ends when you reach the goal or fall in a hole. You receive a reward of 1 if you reach the goal, and zero otherwise. [(0.3333333333333333, 13, 0.0, False), (0.3333333333333333, 14, 0.0, False), (0.3333333333333333, 9, 0.0, False)] [(0.3333333333333333, 13, 0.0, False), (0.3333333333333333, 14, 0.0, False), (0.3333333333333333, 15, 1.0, True)] 1.0 1 0.3 0 0.3 1 0.09 0 0.3 0 1.0 1 0.09 0 0.09 0 0.027 0 0.3 0 0.027 0 0.09 0 0.3 1 0.09 0 0.3 0 1.0 1 0.027 0 0.0081 0 0.0081 0 0.09 0 0.027 1 0.0081 0 0.09 0 0.027 0 0.3 0 0.027 0 0.09 0 0.3 1 0.09 0 0.3 0 1.0 1 0.0081 1 0.0081 0 0.027 0 0.00243 0 0.0081 0 0.0081 0 0.0081 0 0.09 0 0.00243 0 0.0081 0 0.027 1 0.0081 0 0.09 0 0.027 0 0.3 0 0.027 0 0.09 0 0.3 1 0.09 0 0.3 0 1.0 1 0.00243 0 0.00243 0 0.0081 1 0.0081 0 0.027 0 0.00243 0 0.0081 0 0.000729 0 0.00243 0 0.0081 0 0.0081 0 0.09 0 0.00243 0 0.0081 0 0.027 1 0.0081 0 0.09 0 0.027 0 0.3 0 0.027 0 0.09 0 0.3 1 0.09 0 0.3 0 1.0 1 0.000729 0 0.00243 0 0.00243 0 0.0081 1 0.0081 0 0.027 0 0.00243 0 0.0081 0 0.000729 0 0.00243 0 0.0081 0 0.0081 0 0.09 0 0.00243 0 0.0081 0 0.027 1 0.0081 0 0.09 0 0.027 0 0.3 0 0.027 0 0.09 0 0.3 1 0.09 0 0.3 0 1.0 1 0.000729 0 0.00243 0 0.00243 0 0.0081 1 0.0081 0 0.027 0 0.00243 0 0.0081 0 0.000729 0 0.00243 0 0.0081 0 0.0081 
0 0.09 0 0.00243 0 0.0081 0 0.027 1 0.0081 0 0.09 0 0.027 0 0.3 0 0.027 0 0.09 0 0.3 1 0.09 0 0.3 0 1.0 1 0.000729 0 0.00243 0 0.00243 0 0.0081 1 0.0081 0 0.027 0 0.00243 0 0.0081 0 0.000729 0 0.00243 0 0.0081 0 0.0081 0 0.09 0 0.00243 0 0.0081 0 0.027 1 0.0081 0 0.09 0 0.027 0 0.3 0 0.027 0 0.09 0 0.3 1 0.09 0 0.3 0 1.0 1 0.000729 0 0.00243 0 0.00243 0 0.0081 1 0.0081 0 0.027 0 0.00243 0 0.0081 0 0.000729 0 0.00243 0 0.0081 0 0.0081 0 0.09 0 0.00243 0 0.0081 0 0.027 1 0.0081 0 0.09 0 0.027 0 0.3 0 0.027 0 0.09 0 0.3 1 0.09 0 0.3 0 1.0 1 0.000729 0 0.00243 0 0.00243 0 0.0081 1 0.0081 0 0.027 0 0.00243 0 0.0081 0 0.000729 0 0.00243 0 0.0081 0 0.0081 0 0.09 0 0.00243 0 0.0081 0 0.027 1 0.0081 0 0.09 0 0.027 0 0.3 0 0.027 0 0.09 0 0.3 1 0.09 0 0.3 0 1.0 1 0.000729 0 0.00243 0 0.00243 0 0.0081 1 0.0081 0 0.027 0 0.00243 0 0.0081 0 0.000729 0 0.00243 0 0.0081 0 0.0081 0 0.09 0 0.00243 0 0.0081 0 0.027 1 0.0081 0 0.09 0 0.027 0 0.3 0 0.027 0 0.09 0 0.3 1 0.09 0 0.3 0 1.0 1 0.000729 0 0.00243 0 0.00243 0 0.0081 1 0.0081 0 0.027 0 0.00243 0 0.0081 0 0.000729 0 0.00243 0 0.0081 0 0.0081 0 0.09 0 0.00243 0 0.0081 0 0.027 1 0.0081 0 0.09 0 0.027 0 0.3 0 0.027 0 0.09 0 0.3 1 0.09 0 0.3 0 1.0 1 0.000729 0 0.00243 0 0.00243 0 0.0081 1 0.0081 0 0.027 0 0.00243 0 0.0081 0 0.000729 0 0.00243 0 0.0081 0 0.0081 0 0.09 0 0.00243 0 0.0081 0 0.027 1 0.0081 0 0.09 0 0.027 0 0.3 0 0.027 0 0.09 0 0.3 1 0.09 0 0.3 0 1.0 1 0.000729 0 0.00243 0 0.00243 0 0.0081 1 0.0081 0 0.027 0 0.00243 0 0.0081 0 0.000729 0 0.00243 0 0.0081 0 0.0081 0 0.09 0 0.00243 0 0.0081 0 0.027 1 0.0081 0 0.09 0 0.027 0 0.3 0 0.027 0 0.09 0 0.3 1 0.09 0 0.3 0 1.0 1 0.000729 0 0.00243 0 0.00243 0 0.0081 1 0.0081 0 0.027 0 0.00243 0 0.0081 0 0.000729 0 0.00243 0 0.0081 0 0.0081 0 0.09 0 0.00243 0 0.0081 0 0.027 1 0.0081 0 0.09 0 0.027 0 0.3 0 0.027 0 0.09 0 0.3 1 0.09 0 0.3 0 1.0 1 0.000729 0 0.00243 0 0.00243 0 0.0081 1 0.0081 0 0.027 0 0.00243 0 0.0081 0 0.000729 0 0.00243 0 0.0081 0 0.0081 0 0.09 0 0.00243 0 0.0081 0 
0.027 1 0.0081 0 0.09 0 0.027 0 0.3 0 0.027 0 0.09 0 0.3 1 0.09 0 0.3 0 1.0 1 0.000729 0 0.00243 0 0.00243 0 0.0081 1 0.0081 0 0.027 0 0.00243 0 0.0081 0 0.000729 0 0.00243 0 0.0081 0 0.0081 0 0.09 0 0.00243 0 0.0081 0 0.027 1 0.0081 0 0.09 0 0.027 0 0.3 0 0.027 0 0.09 0 0.3 1 0.09 0 0.3 0 1.0 1 0.000729 0 0.00243 0 0.00243 0 0.0081 1 0.0081 0 0.027 0 0.00243 0 0.0081 0 0.000729 0 0.00243 0 0.0081 0 0.0081 0 0.09 0 0.00243 0 0.0081 0 0.027 1 0.0081 0 0.09 0 0.027 0 0.3 0 0.027 0 0.09 0 0.3 1 0.09 0 0.3 0 1.0 1 0.000729 0 0.00243 0 0.00243 0 0.0081 1 0.0081 0 0.027 0 0.00243 0 0.0081 0 0.000729 0 0.00243 0 0.0081 0 0.0081 0 0.09 0 0.00243 0 0.0081 0 0.027 1 0.0081 0 0.09 0 0.027 0 0.3 0 0.027 0 0.09 0 0.3 1 0.09 0 0.3 0 1.0 1 0.000729 0 0.00243 0 0.00243 0 0.0081 1 0.0081 0 0.027 0 0.00243 0 0.0081 0 0.000729 0 0.00243 0 0.0081 0 0.0081 0 0.09 0 0.00243 0 0.0081 0 0.027 1 0.0081 0 0.09 0 0.027 0 0.3 0 0.027 0 0.09 0 0.3 1 0.09 0 0.3 0 1.0 1 [ 0.002 0.008 0.027 0.008 0.008 0. 0.09 0. 0.027 0.09 0.3 0. 0. 0.3 1. 0. ] [0 1 0 0 0 0 0 0 1 0 0 0 0 1 1 0] ================================================ FILE: assignment1/model_based_learning.py ================================================ ### Episodic Model Based Learning using Maximum Likelihood Estimate of the Environment # Do not change the arguments and output types of any of the functions provided! You may debug in Main and elsewhere. import numpy as np import gym import time from lake_envs import * import matplotlib.pyplot as plt from tqdm import * from vi_and_pi import value_iteration from vi_and_pi import policy_iteration def initialize_P(nS, nA): """Initializes a uniformly random model of the environment with 0 rewards. Parameters ---------- nS: int Number of states nA: int Number of actions Returns ------- P: np.array of shape [nS x nA x nS x 4] where items are tuples representing transition information P[state][action] is a list of (prob, next_state, reward, done) tuples. 
""" P = [[[(1.0/nS, i, 0, False) for i in range(nS)] for _ in range(nA)] for _ in range(nS)] return P def initialize_counts(nS, nA): """Initializes a counts array. Parameters ---------- nS: int Number of states nA: int Number of actions Returns ------- counts: np.array of shape [nS x nA x nS] counts[state][action][next_state] is the number of times that doing "action" at state "state" transitioned to "next_state" """ counts = [[[0 for _ in range(nS)] for _ in range(nA)] for _ in range(nS)] return counts def initialize_rewards(nS, nA): """Initializes a rewards array. Values represent running averages. Parameters ---------- nS: int Number of states nA: int Number of actions Returns ------- rewards: array of shape [nS x nA x nS] counts[state][action][next_state] is the running average of rewards of doing "action" at "state" transtioned to "next_state" """ rewards = [[[0 for _ in range (nS)] for _ in range(nA)] for _ in range(nS)] return rewards def counts_and_rewards_to_P(counts, rewards, terminal_state): """Converts counts and rewards arrays to a P array consistent with the Gym environment data structure for a model of the environment. Use this function to convert your counts and rewards arrays to a P that you can use in value iteration. Parameters ---------- counts: array of shape [nS x nA x nS] counts[state][action][next_state] is the number of times that doing "action" at state "state" transitioned to "next_state" rewards: array of shape [nS x nA x nS] counts[state][action][next_state] is the running average of rewards of doing "action" at "state" transtioned to "next_state" Returns ------- P: np.array of shape [nS x nA x nS' x 4] where items are tuples representing transition information P[state][action] is a list of (prob, next_state, reward, done) tuples. 
""" nS = len(counts) nA = len(counts[0]) P = [[[] for _ in range(nA)] for _ in range(nS)] for state in range(nS): for action in range(nA): if sum(counts[state][action]) != 0: for next_state in range(nS): if counts[state][action][next_state] != 0: prob = float(counts[state][action][next_state]) / float(sum(counts[state][action])) reward = rewards[state][action][next_state] if next_state in terminal_state: P[state][action].append((prob, next_state, reward, True)) else: P[state][action].append((prob, next_state, reward, False)) else: prob = 1.0 / float(nS) for next_state in range(nS): P[state][action].append((prob, next_state, 0, False)) #for action in range(nA): #P[nS-2][2][nS-1] = (1.0, nS-1, 1, True) return P def update_mdp_model_with_history(counts, rewards, history): """Given a history of an entire episode, update the count and rewards arrays Parameters ---------- counts: array of shape [nS x nA x nS] counts[state][action][next_state] is the number of times that doing "action" at state "state" transitioned to "next_state" rewards: array of shape [nS x nA x nS] counts[state][action][next_state] is the running average of rewards of doing "action" at "state" transtioned to "next_state" history: a list of [state, action, reward, next_state, done] """ # HINT: For terminal states, we define that the probability of any action returning the state to itself is 1 (with zero reward) # Make sure you record this information in your counts array by updating the counts for this accordingly for your # value iteration to work. 
############################ # YOUR IMPLEMENTATION HERE # for item in history: #print item (state, action, reward, next_state, done) = item #if not done: # counts[state][action][next_state] += 1 # rewards[state][action][next_state] = float(rewards[state][action][next_state]+reward) / counts[state][action][next_state] #else: # counts[state][action][next_state] = 1 # rewards[state][action][next_state] = float(rewards[state][action][next_state]+reward) / counts[state][action][next_state] counts[state][action][next_state] += 1 all_reward = float(rewards[state][action][next_state]*(counts[state][action][next_state]-1)+reward) rewards[state][action][next_state] = all_reward / counts[state][action][next_state] ############################ return counts, rewards def learn_with_mdp_model(env, method=None, num_episodes=5000, gamma = 0.95, e = 0.8, decay_rate = 0.99): """Build a model of the environment and use value iteration to learn a policy. In the next episode, play with the new policy using epsilon-greedy exploration. Your model of the environment should be based on updating counts and rewards arrays. The counts array counts the number of times that "state" with "action" led to "next_state", and the rewards array is the running average of rewards for going from at "state" with "action" leading to "next_state". For a single episode, create a list called "history" with all the experience from that episode, then update the "counts" and "rewards" arrays using the function "update_mdp_model_with_history". You may then call the prewritten function "counts_and_rewards_to_P" to convert your counts and rewards arrays to an environment data structure P consistent with the Gym environment's one. You may then call on value_iteration(P, nS, nA) to get a policy. Parameters ---------- env: gym.core.Environment Environment to compute Q function for. Must have nS, nA, and P as attributes. num_episodes: int Number of episodes of training. gamma: float Discount factor. 
Number in range [0, 1) learning_rate: float Learning rate. Number in range [0, 1) e: float Epsilon value used in the epsilon-greedy method. decay_rate: float Rate at which epsilon falls. Number in range [0, 1) Returns ------- policy: np.array An array of shape [env.nS] representing the action to take at a given state. """ P = initialize_P(env.nS, env.nA) counts = initialize_counts(env.nS, env.nA) rewards = initialize_rewards(env.nS, env.nA) ############################ # YOUR IMPLEMENTATION HERE # new_policy = np.zeros((env.nS)).astype(int) terminal_state = [] for i in range(num_episodes): done = False state = env.reset() his = [] while not done: if np.random.rand() > e: action = new_policy[state] else: action = np.random.randint(env.nA) nextstate, reward, done, _ = env.step(action) his.append([state, action, reward, nextstate, done]) state = nextstate if state not in terminal_state: terminal_state.append(state) counts, rewards = update_mdp_model_with_history(counts, rewards, his) P = counts_and_rewards_to_P(counts, rewards, terminal_state) _, new_policy = method(P, env.nS, env.nA, gamma) if i%10 == 0: e *= decay_rate ############################ return new_policy def render_single(env, policy): """Renders policy once on environment. Watch your agent play! Parameters ---------- env: gym.core.Environment Environment to play on. Must have nS, nA, and P as attributes. Policy: np.array of shape [env.nS] The action to take at a given state """ episode_reward = 0 state = env.reset() done = False while not done: #env.render() #time.sleep(0.5) # Seconds between frames. Modify as you wish. action = policy[state] state, reward, done, _ = env.step(action) episode_reward += reward #print "Episode reward: %f" % episode_reward return episode_reward # Feel free to run your own debug code in main! 
def main(): env = gym.make('Stochastic-4x4-FrozenLake-v0') #render_single(env, policy) #print policy score1 = [] score2 = [] average_score1 = [] average_score2 = [] for i in tqdm(np.arange(1, 5000, 50)): policy1 = learn_with_mdp_model(env, method=value_iteration, num_episodes=i+1) policy2 = learn_with_mdp_model(env, method=policy_iteration, num_episodes=i+1) episode_reward1 = render_single(env, policy1) episode_reward2 = render_single(env, policy2) score1.append(episode_reward1) score2.append(episode_reward2) for i in range(100): average_score1[i] = np.mean(score1[:i+1]) average_score2[i] = np.mean(score2[:i+1]) plt.plot(np.arange(1, 5000, 50),np.array(average_score1)) plt.plot(np.arange(1, 5000, 50),np.array(average_score2)) plt.title('The running average score of the model-based learning agent') plt.xlabel('traning episodes') plt.ylabel('score') plt.legend(['value-iteration', 'policy_iteration'], loc='upper right') #plt.show() plt.savefig('model-based.jpg') if __name__ == '__main__': main() ================================================ FILE: assignment1/model_free_learning.py ================================================ ### Episode model free learning using Q-learning and SARSA # Do not change the arguments and output types of any of the functions provided! You may debug in Main and elsewhere. import numpy as np import gym import time from lake_envs import * import matplotlib.pyplot as plt from tqdm import * def learn_Q_QLearning(env, num_episodes=2000, gamma=0.95, lr=0.1, e=0.8, decay_rate=0.99): """Learn state-action values using the Q-learning algorithm with epsilon-greedy exploration strategy. Update Q at the end of every episode. Parameters ---------- env: gym.core.Environment Environment to compute Q function for. Must have nS, nA, and P as attributes. num_episodes: int Number of episodes of training. gamma: float Discount factor. Number in range [0, 1) learning_rate: float Learning rate. 
Number in range [0, 1) e: float Epsilon value used in the epsilon-greedy method. decay_rate: float Rate at which epsilon falls. Number in range [0, 1) Returns ------- np.array An array of shape [env.nS x env.nA] representing state, action values """ ############################ # YOUR IMPLEMENTATION HERE # q_value = np.zeros([env.nS, env.nA]) for i in range(num_episodes): done = False state = env.reset() while not done: if np.random.rand() > e: action = np.argmax(q_value[state]) else: action = np.random.randint(env.nA) nextstate, reward, done, _ = env.step(action) q_value[state][action] = (1-lr)*q_value[state][action]+lr*(reward+gamma*np.max(q_value[nextstate])) state = nextstate if i%10 == 0: e *= decay_rate ''' print np.mean(q_value) plt.plot(np.arange(num_episodes),np.array(score)) plt.title('The running average score of the Q-learning agent') plt.xlabel('traning episodes') plt.ylabel('score') #plt.show() plt.savefig('c.jpg') ''' ############################ return q_value def learn_Q_SARSA(env, num_episodes=2000, gamma=0.95, lr=0.1, e=0.8, decay_rate=0.99): """Learn state-action values using the SARSA algorithm with epsilon-greedy exploration strategy Update Q at the end of every episode. Parameters ---------- env: gym.core.Environment Environment to compute Q function for. Must have nS, nA, and P as attributes. num_episodes: int Number of episodes of training. gamma: float Discount factor. Number in range [0, 1) learning_rate: float Learning rate. Number in range [0, 1) e: float Epsilon value used in the epsilon-greedy method. decay_rate: float Rate at which epsilon falls. 
Number in range [0, 1) Returns ------- np.array An array of shape [env.nS x env.nA] representing state-action values """ ############################ # YOUR IMPLEMENTATION HERE # q_value = np.zeros([env.nS, env.nA]) for i in range(num_episodes): done = False state = env.reset() if np.random.rand() > e: action = np.argmax(q_value[state]) else: action = np.random.randint(env.nA) while not done: nextstate, reward, done, _ = env.step(action) if np.random.rand() > e: nextaction = np.argmax(q_value[nextstate]) else: nextaction = np.random.randint(env.nA) q_value[state][action] = (1-lr)*q_value[state][action]+lr*(reward+gamma*q_value[nextstate][nextaction]) state = nextstate action = nextaction if i%10 == 0: e *= decay_rate ############################ return q_value def render_single_Q(env, Q): """Renders Q function once on environment. Watch your agent play! Parameters ---------- env: gym.core.Environment Environment to play Q function on. Must have nS, nA, and P as attributes. Q: np.array of shape [env.nS x env.nA] state-action values. """ episode_reward = 0 state = env.reset() done = False while not done: #env.render() #show frames #time.sleep(0.5) # Seconds between frames. Modify as you wish. action = np.argmax(Q[state]) state, reward, done, _ = env.step(action) episode_reward += reward #print "Episode reward: %f" % episode_reward return episode_reward # Feel free to run your own debug code in main! 
def main(): env = gym.make('Stochastic-4x4-FrozenLake-v0') score1 = [] score2 = [] average_score1 = [] average_score2 = [] for i in tqdm(range(4000)): Q1 = learn_Q_QLearning(env, num_episodes=i+1) Q2 = learn_Q_SARSA(env, num_episodes=i+1) episode_reward1 = render_single_Q(env, Q1) episode_reward2 = render_single_Q(env, Q2) score1.append(episode_reward1) score2.append(episode_reward2) for i in range(4000): average_score1.append(np.mean(score1[:i+1])) average_score2.append(np.mean(score2[:i+1])) plt.plot(np.arange(4000),np.array(average_score1)) plt.plot(np.arange(4000),np.array(average_score2)) plt.title('The running average score of the Q-learning agent') plt.xlabel('traning episodes') plt.ylabel('score') plt.legend(['q-learning', 'sarsa'], loc='upper right') #plt.show() plt.savefig('model-free.jpg') if __name__ == '__main__': main() ================================================ FILE: assignment1/requirements.txt ================================================ matplotlib numpy ================================================ FILE: assignment1/vi_and_pi.py ================================================ ### MDP Value Iteration and Policy Iteratoin # You might not need to use all parameters import numpy as np import gym import time from lake_envs import * np.set_printoptions(precision=3) def value_iteration(P, nS, nA, gamma=0.9, max_iteration=20, tol=1e-3): """ Learn value function and policy by using value iteration method for a given gamma and environment. Parameters: ---------- P: dictionary It is from gym.core.Environment P[state][action] is tuples with (probability, nextstate, reward, terminal) nS: int number of states nA: int number of actions gamma: float Discount factor. Number in range [0, 1) max_iteration: int The maximum number of iterations to run before stopping. Feel free to change it. tol: float Determines when value function has converged. 
Returns: ---------- value function: np.ndarray policy: np.ndarray """ V = np.zeros(nS) policy = np.zeros(nS, dtype=int) ############################ # YOUR IMPLEMENTATION HERE # idx = 1 new_V = V.copy() #print P[14][2] while idx<=max_iteration or np.sum(np.sqrt(np.square(new_V-V)))>tol: idx += 1 V = new_V for state in range(nS): max_result = -10 max_idx = 0 for action in range(nA): result = P[state][action] temp = np.array(result)[:,2].mean() #temp = result[0][2] for num in range(len(result)): (probability, nextstate, reward, terminal) = result[num] temp += gamma*probability*V[nextstate] if max_result < temp: max_result = temp max_idx = action new_V[state] = max_result policy[state] = max_idx #print new_V #print policy ############################ return V, policy def policy_evaluation(P, nS, nA, policy, gamma=0.9, max_iteration=100, tol=1e-3): """Evaluate the value function from a given policy. Parameters ---------- P: dictionary It is from gym.core.Environment P[state][action] is tuples with (probability, nextstate, reward, terminal) nS: int number of states nA: int number of actions gamma: float Discount factor. Number in range [0, 1) policy: np.array The policy to evaluate. Maps states to actions. max_iteration: int The maximum number of iterations to run before stopping. Feel free to change it. tol: float Determines when value function has converged. Returns ------- value function: np.ndarray The value function from the given policy. 
""" ############################ # YOUR IMPLEMENTATION HERE # value_function = np.zeros(nS) new_value_function = value_function.copy() i = 0 while i<=max_iteration or np.sum(np.sqrt(np.square(new_value_function-value_function)))>tol: i += 1 value_function = new_value_function.copy() for state in range(nS): result = P[state][policy[state]] new_value_function[state] = np.array(result)[:,2].mean() for num in range(len(result)): (probability, nextstate, reward, terminal) = result[num] new_value_function[state] += (gamma * probability * value_function[nextstate]) ############################ return new_value_function def policy_improvement(P, nS, nA, value_from_policy, policy, gamma=0.9): """Given the value function from policy improve the policy. Parameters ---------- P: dictionary It is from gym.core.Environment P[state][action] is tuples with (probability, nextstate, reward, terminal) nS: int number of states nA: int number of actions gamma: float Discount factor. Number in range [0, 1) value_from_policy: np.ndarray The value calculated from the policy policy: np.array The previous policy. Returns ------- new policy: np.ndarray An array of integers. Each integer is the optimal action to take in that state according to the environment dynamics and the given value function. """ ############################ # YOUR IMPLEMENTATION HERE # q_function = np.zeros([nS,nA]) for state in range(nS): for action in range(nA): result = P[state][action] for num in range(len(result)): (probability, nextstate, reward, terminal) = result[num] q_function[state][action] = reward q_function[state][action] += (gamma*probability*value_from_policy[nextstate]) new_policy = np.argmax(q_function, axis=1) ############################ return new_policy def policy_iteration(P, nS, nA, gamma=0.9, max_iteration=200, tol=1e-3): """Runs policy iteration. You should use the policy_evaluation and policy_improvement methods to implement this method. 
Parameters ---------- P: dictionary It is from gym.core.Environment P[state][action] is tuples with (probability, nextstate, reward, terminal) nS: int number of states nA: int number of actions gamma: float Discount factor. Number in range [0, 1) max_iteration: int The maximum number of iterations to run before stopping. Feel free to change it. tol: float Determines when value function has converged. Returns: ---------- value function: np.ndarray policy: np.ndarray """ V = np.zeros(nS) policy = np.zeros(nS, dtype=int) ############################ # YOUR IMPLEMENTATION HERE # i = 0 new_policy= policy.copy() while i<=max_iteration or np.sum(np.sqrt(np.square(new_policy-policy)))>tol: i += 1 policy = new_policy V = policy_evaluation(P, nS, nA, policy) new_policy = policy_improvement(P, nS, nA, V, policy) ############################ return V, policy def example(env): """Show an example of gym Parameters ---------- env: gym.core.Environment Environment to play on. Must have nS, nA, and P as attributes. """ env.seed(0); from gym.spaces import prng; prng.seed(10) # for print the location # Generate the episode ob = env.reset() for t in range(100): env.render() a = env.action_space.sample() ob, rew, done, _ = env.step(a) if done: break assert done env.render(); def render_single(env, policy): """Renders policy once on environment. Watch your agent play! Parameters ---------- env: gym.core.Environment Environment to play on. Must have nS, nA, and P as attributes. Policy: np.array of shape [env.nS] The action to take at a given state """ episode_reward = 0 ob = env.reset() for t in range(100): env.render() #time.sleep(0.5) # Seconds between frames. Modify as you wish. a = policy[ob] ob, rew, done, _ = env.step(a) episode_reward += rew if done: break assert done env.render(); print "Episode reward: %f" % episode_reward # Feel free to run your own debug code in main! # Play around with these hyperparameters. 
if __name__ == "__main__":
    # Script entry point: solve the stochastic 4x4 FrozenLake with value
    # iteration and watch the resulting policy play one episode.
    env = gym.make("Stochastic-4x4-FrozenLake-v0")
    # print(...) with a single argument behaves the same on Python 2 and 3.
    print(env.__doc__)
    # print("Here is an example of state, action, reward, and next state")
    # example(env)
    V_vi, p_vi = value_iteration(env.P, env.nS, env.nA, gamma=0.9, max_iteration=20, tol=1e-3)
    # V_pi, p_pi = policy_iteration(env.P, env.nS, env.nA, gamma=0.9, max_iteration=20, tol=1e-3)
    render_single(env, p_vi)
class config():
    # --- environment -----------------------------------------------------
    render_train = render_test = False
    env_name = "Pong-v0"
    RGB = True
    overwrite_render = True

    # --- where results are written (all derived from output_path) --------
    output_path = "results/test/"
    model_output = "{}model.weights/".format(output_path)
    log_path = "{}log.txt".format(output_path)
    plot_output = "{}scores.png".format(output_path)
    training_path = "results/train/"

    # --- evaluation, logging and gradient clipping ------------------------
    num_episodes_test = 20
    grad_clip = True
    clip_val = 10
    saving_freq = 500
    log_freq = 50
    eval_freq = 50000
    soft_epsilon = 0.05

    # --- training hyper-parameters ---------------------------------------
    nsteps_train = 200 * 2000
    batch_size = 32
    buffer_size = 50000
    target_update_freq = 5000
    gamma = 0.99
    learning_freq = 1
    state_history = 1
    skip_frame = 1
    lr = 0.1
    eps_begin = 0.1
    eps_end = 0.01
    # exploration is annealed over the whole training run
    eps_nsteps = nsteps_train
    learning_start = 5000
class config():
    """Hyper-parameters for the q3_nature run (results go to results/q3_nature/)."""
    # env config
    render_train = False
    render_test = False
    overwrite_render = True
    record = False
    high = 255.  # divisor applied to the uint8 state in DQN.process_state

    # output config
    output_path = "results/q3_nature/"
    model_output = output_path + "model.weights/"
    log_path = output_path + "log.txt"
    plot_output = output_path + "scores.png"

    # model and training config
    num_episodes_test = 20
    grad_clip = True
    clip_val = 10
    saving_freq = 5000
    log_freq = 50
    eval_freq = 100
    soft_epsilon = 0

    # hyper params
    nsteps_train = 1000
    batch_size = 32
    buffer_size = 500
    target_update_freq = 500
    gamma = 0.99
    learning_freq = 4
    state_history = 4
    lr_begin = 0.00025
    lr_end = 0.0001
    # Floor division keeps the step counts integral on Python 3 as well
    # (plain "/" would yield 500.0 there); the value on Python 2 is unchanged.
    lr_nsteps = nsteps_train // 2
    eps_begin = 1
    eps_end = 0.01
    eps_nsteps = nsteps_train // 2
    learning_start = 200
class config():
    """Hyper-parameters for the q5_train_atari_nature run on Pong-v0."""
    # env config
    render_train = False
    render_test = False
    env_name = "Pong-v0"
    overwrite_render = True
    record = True
    high = 255.  # divisor applied to the uint8 state in DQN.process_state

    # output config
    output_path = "results/q5_train_atari_nature/"
    model_output = output_path + "model.weights/"
    log_path = output_path + "log.txt"
    plot_output = output_path + "scores.png"
    record_path = output_path + "monitor/"

    # model and training config
    num_episodes_test = 50
    grad_clip = True
    clip_val = 10
    saving_freq = 250000
    log_freq = 50
    eval_freq = 250000
    record_freq = 250000
    soft_epsilon = 0.05

    # nature paper hyper params
    nsteps_train = 5000000
    batch_size = 32
    buffer_size = 1000000
    target_update_freq = 10000
    gamma = 0.99
    learning_freq = 4
    state_history = 4
    skip_frame = 4
    lr_begin = 0.00025
    lr_end = 0.00005
    # Floor division keeps the step count an int on Python 3 as well
    # (plain "/" would yield 2500000.0 there); unchanged on Python 2.
    lr_nsteps = nsteps_train // 2
    eps_begin = 1
    eps_end = 0.1
    eps_nsteps = 1000000
    learning_start = 50000
def get_q_values_op(self, state, scope, reuse=False):
    """
    Abstract hook: build and return the Q values for `state`,
    of shape = (batch_size, num_actions)

    The signature matches how build() invokes it
    (get_q_values_op(s, scope="q", reuse=False)) and how the Linear
    subclass implements it.

    Args:
        state: tf tensor holding the (already processed) batch of states
        scope: name of the variable scope to create the network in
        reuse: whether variables of that scope are reused

    Raises:
        NotImplementedError: always; subclasses must override.
    """
    raise NotImplementedError
def build(self):
    """
    Assemble the model: placeholders first, then the Q network on the
    current state, the target network on the next state, the op syncing
    the two, the loss and finally the optimizer.
    """
    online_scope, target_scope = "q", "target_q"

    # graph inputs
    self.add_placeholders_op()

    # online network evaluated on the current state
    current = self.process_state(self.s)
    self.q = self.get_q_values_op(current, scope=online_scope, reuse=False)

    # target network evaluated on the next state
    following = self.process_state(self.sp)
    self.target_q = self.get_q_values_op(following, scope=target_scope, reuse=False)

    # op copying the online network into the target network
    self.add_update_target_op(online_scope, target_scope)

    # loss between both outputs, optimised over the online network only
    self.add_loss_op(self.q, self.target_q)
    self.add_optimizer_op(online_scope)
def get_best_action(self, state):
    """
    Greedy action for a single state.

    Args:
        state: one network input (4 consecutive observations from gym);
            it is wrapped in a list to form a batch of size one
    Returns:
        action: index of the largest Q value
        action_values: (np array) q values for all actions
    """
    feed = {self.s: [state]}
    batch_q = self.sess.run(self.q, feed_dict=feed)
    # only one state was fed, so keep the single row of the batch
    q_row = batch_q[0]
    greedy = np.argmax(q_row)
    return greedy, q_row
def get_action(self, state):
    """
    Soft-epsilon action selection.

    One uniform sample is drawn; when it falls below
    config.soft_epsilon the environment's action space is sampled,
    otherwise the first element returned by get_best_action is used.

    Args:
        state: observation from gym
    """
    greedy = not (np.random.random() < self.config.soft_epsilon)
    if greedy:
        return self.get_best_action(state)[0]
    return self.env.action_space.sample()
def train(self, exp_schedule, lr_schedule):
    """
    Performs training of Q

    Interacts with self.env until config.nsteps_train steps have been
    taken, storing transitions in a replay buffer, calling train_step at
    every step, and periodically logging, evaluating and recording.
    Saves the model and exports the evaluation-score plot at the end.

    Args:
        exp_schedule: Exploration instance s.t.
            exp_schedule.get_action(best_action) returns an action
        lr_schedule: Schedule for learning rate
    """
    # initialize replay buffer and variables
    replay_buffer = ReplayBuffer(self.config.buffer_size, self.config.state_history)
    rewards = deque(maxlen=self.config.num_episodes_test)
    max_q_values = deque(maxlen=1000)
    q_values = deque(maxlen=1000)
    self.init_averages()

    t = last_eval = last_record = 0  # time control of nb of steps
    scores_eval = []  # list of scores computed at iteration time
    scores_eval += [self.evaluate()]

    prog = Progbar(target=self.config.nsteps_train)

    # interact with environment
    while t < self.config.nsteps_train:
        total_reward = 0
        state = self.env.reset()
        while True:
            t += 1
            last_eval += 1
            last_record += 1
            if self.config.render_train:
                self.env.render()

            # replay memory stuff
            idx = replay_buffer.store_frame(state)
            q_input = replay_buffer.encode_recent_observation()

            # chose action according to current Q and exploration
            # NOTE: the per-step values get their own name so they do not
            # shadow the `q_values` history deque defined above.
            best_action, action_values = self.get_best_action(q_input)
            action = exp_schedule.get_action(best_action)

            # store q values
            max_q_values.append(max(action_values))
            q_values.extend(action_values)

            # perform action in env
            new_state, reward, done, info = self.env.step(action)

            # store the transition
            replay_buffer.store_effect(idx, action, reward, done)
            state = new_state

            # perform a training step
            loss_eval, grad_eval = self.train_step(t, replay_buffer, lr_schedule.epsilon)

            # logging stuff
            if ((t > self.config.learning_start) and (t % self.config.log_freq == 0) and
                    (t % self.config.learning_freq == 0)):
                self.update_averages(rewards, max_q_values, q_values, scores_eval)
                exp_schedule.update(t)
                lr_schedule.update(t)
                if len(rewards) > 0:
                    prog.update(t + 1, exact=[("Loss", loss_eval), ("Avg R", self.avg_reward),
                                              ("Max R", np.max(rewards)), ("eps", exp_schedule.epsilon),
                                              ("Grads", grad_eval), ("Max Q", self.max_q),
                                              ("lr", lr_schedule.epsilon)])

            elif (t < self.config.learning_start) and (t % self.config.log_freq == 0):
                sys.stdout.write("\rPopulating the memory {}/{}...".format(t,
                                                                           self.config.learning_start))
                sys.stdout.flush()

            # count reward
            total_reward += reward
            if done or t >= self.config.nsteps_train:
                break

        # updates to perform at the end of an episode
        rewards.append(total_reward)

        if (t > self.config.learning_start) and (last_eval > self.config.eval_freq):
            # evaluate our policy
            last_eval = 0
            print("")
            scores_eval += [self.evaluate()]

        if (t > self.config.learning_start) and self.config.record and (last_record > self.config.record_freq):
            self.logger.info("Recording...")
            last_record = 0
            self.record()

    # last words
    self.logger.info("- Training done.")
    self.save()
    scores_eval += [self.evaluate()]
    export_plot(scores_eval, "Scores", self.config.plot_output)
def evaluate(self, env=None, num_episodes=None):
    """
    Play `num_episodes` episodes with self.get_action and return the
    average episode reward.

    Args:
        env: environment to play on; defaults to self.env
        num_episodes: number of episodes; when omitted the call is logged
            and config.num_episodes_test is used
    Returns:
        mean of the per-episode total rewards
    """
    if num_episodes is None:
        # default call: announce it, then fall back on the configured count
        self.logger.info("Evaluating...")
        num_episodes = self.config.num_episodes_test
    if env is None:
        env = self.env

    # local buffer used to build the network input from recent frames;
    # nothing is sampled from it in this method
    replay_buffer = ReplayBuffer(self.config.buffer_size, self.config.state_history)
    episode_returns = []

    for _ in range(num_episodes):
        episode_return = 0
        state = env.reset()
        done = False
        while not done:
            if self.config.render_test:
                env.render()

            # push the latest frame and fetch the stacked observation
            idx = replay_buffer.store_frame(state)
            q_input = replay_buffer.encode_recent_observation()

            action = self.get_action(q_input)
            state, reward, done, _info = env.step(action)

            # complete the stored transition
            replay_buffer.store_effect(idx, action, reward, done)
            episode_return += reward

        episode_returns.append(episode_return)

    avg_reward = np.mean(episode_returns)
    sigma_reward = np.sqrt(np.var(episode_returns) / len(episode_returns))

    if num_episodes > 1:
        self.logger.info("Average reward: {:04.2f} +/- {:04.2f}".format(avg_reward, sigma_reward))

    return avg_reward
class LinearSchedule(object):
    """Value that moves linearly from eps_begin to eps_end over nsteps steps."""

    def __init__(self, eps_begin, eps_end, nsteps):
        """
        Args:
            eps_begin: initial exploration
            eps_end: end exploration
            nsteps: number of steps between the two values of eps
        """
        self.epsilon = eps_begin
        self.eps_begin = eps_begin
        self.eps_end = eps_end
        self.nsteps = nsteps

    def update(self, t):
        """
        Updates epsilon

        For t = 0 epsilon equals eps_begin, for t >= nsteps it equals
        eps_end, with linear interpolation in between. The value is
        computed in closed form, so nsteps may be an int or a float and no
        per-call array is allocated.

        Args:
            t: (int) nth frames
        """
        if self.nsteps <= 0 or t >= self.nsteps:
            # past the end of the schedule (or degenerate schedule)
            self.epsilon = self.eps_end
        else:
            # fraction of the schedule already elapsed, never negative
            fraction = max(float(t), 0.0) / self.nsteps
            self.epsilon = self.eps_begin + fraction * (self.eps_end - self.eps_begin)
def add_placeholders_op(self):
    """
    Create the placeholders fed during training.

    The batch dimension is left as None so any batch size can be fed.
    Frame placeholders stack config.state_history observations along the
    channel axis.

    Sets:
        self.s: uint8, (batch, height, width, nchannels * state_history)
        self.a: int32, (batch,)
        self.r: float32, (batch,)
        self.sp: uint8, same shape as self.s
        self.done_mask: bool, (batch,)
        self.lr: float32 scalar
    """
    state_shape = list(self.env.observation_space.shape)
    height = state_shape[0]
    width = state_shape[1]
    stacked_channels = state_shape[2] * self.config.state_history
    # shared by the current-state and next-state placeholders
    frames_shape = [None, height, width, stacked_channels]

    self.s = tf.placeholder(dtype=tf.uint8, shape=frames_shape, name='state')
    self.a = tf.placeholder(dtype=tf.int32, shape=[None], name='action')
    self.r = tf.placeholder(dtype=tf.float32, shape=[None], name='reward')
    self.sp = tf.placeholder(dtype=tf.uint8, shape=frames_shape, name='next_state')
    self.done_mask = tf.placeholder(dtype=tf.bool, shape=[None], name='done_mask')
    self.lr = tf.placeholder(dtype=tf.float32, shape=(), name='lr')
def add_update_target_op(self, q_scope, target_q_scope):
    """
    Build self.update_target_op, an op that copies every trainable
    variable of the Q network into the corresponding variable of the
    target network.

    Variables are paired by their position in the TRAINABLE_VARIABLES
    collection of each scope, which assumes both networks were built by
    the same code and therefore register variables in the same order.

    Args:
        q_scope: (string) name of the scope of variables for q
        target_q_scope: (string) name of the scope of variables
                    for the target network

    Raises:
        ValueError: if the two scopes do not hold the same number of
            trainable variables (a positional pairing would then be wrong).
    """
    q_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope=q_scope)
    target_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope=target_q_scope)

    if len(q_vars) != len(target_vars):
        raise ValueError(
            "scopes '{}' and '{}' hold {} and {} trainable variables".format(
                q_scope, target_q_scope, len(q_vars), len(target_vars)))

    assign_ops = [tf.assign(target_var, q_var)
                  for target_var, q_var in zip(target_vars, q_vars)]
    # a single op that runs all the assignments
    self.update_target_op = tf.group(*assign_ops)
(be sure that you set self.loss) """ ############################################################## ##################### YOUR CODE HERE - 4-5 lines ############# #done = tf.cast(self.done_mask, tf.float32) temp = self.r + self.config.gamma*tf.reduce_max(target_q, axis=1) q_samp = tf.where(self.done_mask, self.r, temp) action = tf.one_hot(self.a, num_actions) q_new = tf.reduce_sum(tf.multiply(action,q), axis=1) self.loss = tf.reduce_mean(tf.square(q_new - q_samp)) ############################################################## ######################## END YOUR CODE ####################### def add_optimizer_op(self, scope): """ Set self.train_op and self.grad_norm """ ############################################################## """ TODO: 1. get Adam Optimizer (remember that we defined self.lr in the placeholders section) 2. compute grads wrt to variables in scope for self.loss 3. clip the grads by norm with self.config.clip_val if self.config.grad_clip is True 4. apply the gradients and store the train op in self.train_op (sess.run(train_op) must update the variables) 5. 
compute the global norm of the gradients and store this scalar in self.grad_norm HINT: you may find the following functinos useful - tf.get_collection - optimizer.compute_gradients - tf.clip_by_norm - optimizer.apply_gradients - tf.global_norm you can access config variable by writing self.config.variable_name (be sure that you set self.train_op and self.grad_norm) """ ############################################################## #################### YOUR CODE HERE - 8-12 lines ############# optimizer = tf.train.AdamOptimizer(learning_rate=self.lr) scope_variable = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope=scope) grads_and_vars = optimizer.compute_gradients(self.loss, scope_variable) if self.config.grad_clip: clipped_grads_and_vars = [(tf.clip_by_norm(item[0],self.config.clip_val),item[1]) for item in grads_and_vars] self.train_op = optimizer.apply_gradients(clipped_grads_and_vars) self.grad_norm = tf.global_norm([item[0] for item in grads_and_vars]) ############################################################## ######################## END YOUR CODE ####################### if __name__ == '__main__': env = EnvTest((5, 5, 1)) # exploration strategy exp_schedule = LinearExploration(env, config.eps_begin, config.eps_end, config.eps_nsteps) # learning rate schedule lr_schedule = LinearSchedule(config.lr_begin, config.lr_end, config.lr_nsteps) # train model model = Linear(env, config) model.run(exp_schedule, lr_schedule) ================================================ FILE: assignment2/q3_nature.py ================================================ import tensorflow as tf import tensorflow.contrib.layers as layers from utils.general import get_logger from utils.test_env import EnvTest from q1_schedule import LinearExploration, LinearSchedule from q2_linear import Linear from configs.q3_nature import config class NatureQN(Linear): """ Implementing DeepMind's Nature paper. Here are the relevant urls. 
def get_q_values_op(self, state, scope, reuse=False):
    """
    Returns Q values for all actions

    The network stacks three convolutions (32 filters 8x8 stride 4,
    64 filters 4x4 stride 2, 64 filters 3x3 stride 1), flattens the result,
    applies a 512 unit fully connected layer and ends with a fully
    connected layer of num_actions units without activation.

    Args:
        state: (tf tensor)
            shape = (batch_size, img height, img width, nchannels)
        scope: (string) scope name, that specifies if target network or not
        reuse: (bool) reuse of variables in the scope
    Returns:
        out: (tf tensor) of shape = (batch_size, num_actions)
    """
    num_actions = self.env.action_space.n
    # (num_outputs, kernel_size, stride) of each convolution, in order.
    conv_specs = ((32, 8, 4), (64, 4, 2), (64, 3, 1))

    with tf.variable_scope(scope, reuse=reuse):
        features = state
        for n_filters, kernel, step in conv_specs:
            features = layers.conv2d(features, num_outputs=n_filters,
                                     kernel_size=kernel, stride=step)
        flat = layers.flatten(features)
        hidden = layers.fully_connected(flat, num_outputs=512)
        # activation_fn=None keeps the Q values unbounded.
        q_values = layers.fully_connected(hidden, num_outputs=num_actions,
                                          activation_fn=None)

    return q_values
if __name__ == '__main__':
    # Test environment built with shape (80, 80, 1).
    test_env = EnvTest((80, 80, 1))

    # Exploration and learning-rate schedules, both read from the config.
    epsilon_schedule = LinearExploration(test_env, config.eps_begin,
                                         config.eps_end, config.eps_nsteps)
    learning_rate_schedule = LinearSchedule(config.lr_begin, config.lr_end,
                                            config.lr_nsteps)

    # Build the model, then run it with the two schedules.
    nature_model = NatureQN(test_env, config)
    nature_model.run(epsilon_schedule, learning_rate_schedule)
if __name__ == '__main__':
    # Environment pipeline: gym env -> MaxAndSkipEnv -> PreproWrapper with
    # the greyscale preprocessing and an (80, 80, 1) shape.
    gym_env = gym.make(config.env_name)
    skipping_env = MaxAndSkipEnv(gym_env, skip=config.skip_frame)
    atari_env = PreproWrapper(skipping_env, prepro=greyscale, shape=(80, 80, 1),
                              overwrite_render=config.overwrite_render)

    # Exploration and learning-rate schedules, both read from the config.
    epsilon_schedule = LinearExploration(atari_env, config.eps_begin,
                                         config.eps_end, config.eps_nsteps)
    learning_rate_schedule = LinearSchedule(config.lr_begin, config.lr_end,
                                            config.lr_nsteps)

    # Build the linear model, then run it with the two schedules.
    linear_model = Linear(atari_env, config)
    linear_model.run(epsilon_schedule, learning_rate_schedule)
if __name__ == '__main__':
    # Environment pipeline: gym env -> MaxAndSkipEnv -> PreproWrapper with
    # the greyscale preprocessing and an (80, 80, 1) shape.
    gym_env = gym.make(config.env_name)
    skipping_env = MaxAndSkipEnv(gym_env, skip=config.skip_frame)
    atari_env = PreproWrapper(skipping_env, prepro=greyscale, shape=(80, 80, 1),
                              overwrite_render=config.overwrite_render)

    # Exploration and learning-rate schedules, both read from the config.
    epsilon_schedule = LinearExploration(atari_env, config.eps_begin,
                                         config.eps_end, config.eps_nsteps)
    learning_rate_schedule = LinearSchedule(config.lr_begin, config.lr_end,
                                            config.lr_nsteps)

    # Build the NatureQN model, then run it with the two schedules.
    nature_model = NatureQN(atari_env, config)
    nature_model.run(epsilon_schedule, learning_rate_schedule)
def add_loss_op(self, q, target_q):
    """
    Sets the loss of a batch, self.loss is a scalar

    For each example the target value is target_q evaluated at the action
    that maximises `q`; the sampled target is the reward alone when the
    transition is done and reward + gamma * target value otherwise. The
    loss is the mean squared difference between that target and the value
    of `q` at the action stored in self.a.

    Args:
        q: (tf tensor) shape = (batch_size, num_actions)
        target_q: (tf tensor) shape = (batch_size, num_actions)
    """
    num_actions = self.env.action_space.n

    # NOTE(review): the greedy action is taken from `q`, the same tensor
    # that is indexed by the taken action self.a below, so it looks like
    # argmax_a Q(s, a) rather than the argmax over Q(s', a') that the
    # double Q-learning target calls for -- confirm against the caller.
    greedy_action = tf.argmax(q, axis=1)
    greedy_mask = tf.one_hot(greedy_action, num_actions)
    target_value = tf.reduce_sum(tf.multiply(target_q, greedy_mask), axis=1)

    # tf.where picks the plain reward wherever done_mask is True.
    bootstrapped = self.r + self.config.gamma * target_value
    q_samp = tf.where(self.done_mask, self.r, bootstrapped)

    # Keep only Q(s, a) for the action stored in self.a via a one-hot mask.
    taken_action_mask = tf.one_hot(self.a, num_actions)
    q_taken = tf.reduce_sum(tf.multiply(taken_action_mask, q), axis=1)

    self.loss = tf.reduce_mean(tf.square(q_taken - q_samp))
if __name__ == '__main__':
    # Environment pipeline: gym env -> MaxAndSkipEnv -> PreproWrapper with
    # the greyscale preprocessing and an (80, 80, 1) shape.
    gym_env = gym.make(config.env_name)
    skipping_env = MaxAndSkipEnv(gym_env, skip=config.skip_frame)
    atari_env = PreproWrapper(skipping_env, prepro=greyscale, shape=(80, 80, 1),
                              overwrite_render=config.overwrite_render)

    # Exploration and learning-rate schedules, both read from the config;
    # either one may be swapped for a different schedule.
    epsilon_schedule = LinearExploration(atari_env, config.eps_begin,
                                         config.eps_end, config.eps_nsteps)
    learning_rate_schedule = LinearSchedule(config.lr_begin, config.lr_end,
                                            config.lr_nsteps)

    # Build the custom model, then run it with the two schedules.
    custom_model = MyDQN(atari_env, config)
    custom_model.run(epsilon_schedule, learning_rate_schedule)
def get_q_values_op(self, state, scope, reuse=False):
    """
    Returns Q values for all actions, computed with a dueling head

    Three convolutions (32 filters 8x8 stride 4, 64 filters 4x4 stride 2,
    64 filters 3x3 stride 1) and a 512 unit fully connected layer feed two
    linear heads: a 1 unit state value and a num_actions unit advantage.
    The output is value + (advantage - mean of the advantage over actions).

    Args:
        state: (tf tensor)
            shape = (batch_size, img height, img width, nchannels)
        scope: (string) scope name, that specifies if target network or not
        reuse: (bool) reuse of variables in the scope
    Returns:
        out: (tf tensor) of shape = (batch_size, num_actions)
    """
    num_actions = self.env.action_space.n
    # (num_outputs, kernel_size, stride) of each convolution, in order.
    conv_specs = ((32, 8, 4), (64, 4, 2), (64, 3, 1))

    with tf.variable_scope(scope, reuse=reuse):
        features = state
        for n_filters, kernel, step in conv_specs:
            features = layers.conv2d(features, num_outputs=n_filters,
                                     kernel_size=kernel, stride=step)
        flat = layers.flatten(features)

        # NOTE(review): both heads read the same 512 unit layer, whereas an
        # architecture with separate hidden layers per stream would create
        # two of them -- confirm this sharing is intended.
        hidden = layers.fully_connected(flat, num_outputs=512)
        value = layers.fully_connected(hidden, num_outputs=1,
                                       activation_fn=None)
        advantage = layers.fully_connected(hidden, num_outputs=num_actions,
                                           activation_fn=None)

        # (batch_size, 1) tensors broadcast against (batch_size, num_actions).
        mean_advantage = tf.expand_dims(tf.reduce_mean(advantage, axis=1), -1)
        centered_advantage = advantage - mean_advantage
        q_values = centered_advantage + value

    return q_values
if __name__ == '__main__':
    # Environment pipeline: gym env -> MaxAndSkipEnv -> PreproWrapper with
    # the greyscale preprocessing and an (80, 80, 1) shape.
    gym_env = gym.make(config.env_name)
    skipping_env = MaxAndSkipEnv(gym_env, skip=config.skip_frame)
    atari_env = PreproWrapper(skipping_env, prepro=greyscale, shape=(80, 80, 1),
                              overwrite_render=config.overwrite_render)

    # Exploration and learning-rate schedules, both read from the config;
    # either one may be swapped for a different schedule.
    epsilon_schedule = LinearExploration(atari_env, config.eps_begin,
                                         config.eps_end, config.eps_nsteps)
    learning_rate_schedule = LinearSchedule(config.lr_begin, config.lr_end,
                                            config.lr_nsteps)

    # Build the dueling model, then run it with the two schedules.
    dueling_model = MyDQN(atari_env, config)
    dueling_model.run(epsilon_schedule, learning_rate_schedule)
2017-11-28 20:52:58,161:INFO: Evaluating... 2017-11-28 20:52:58,194:INFO: Average reward: -0.10 +/- 0.00 2017-11-28 21:10:09,597:INFO: Evaluating... 2017-11-28 21:10:09,634:INFO: Average reward: -0.30 +/- 0.00 2017-11-28 21:10:10,317:INFO: Evaluating... 2017-11-28 21:10:10,347:INFO: Average reward: 0.50 +/- 0.00 2017-11-28 21:10:11,113:INFO: Evaluating... 2017-11-28 21:10:11,145:INFO: Average reward: 0.10 +/- 0.00 2017-11-28 21:10:11,894:INFO: Evaluating... 2017-11-28 21:10:11,925:INFO: Average reward: -0.10 +/- 0.00 2017-11-28 21:10:12,685:INFO: Evaluating... 2017-11-28 21:10:12,717:INFO: Average reward: 0.50 +/- 0.00 2017-11-28 21:10:13,506:INFO: Evaluating... 2017-11-28 21:10:13,539:INFO: Average reward: 1.90 +/- 0.00 2017-11-28 21:10:14,291:INFO: Evaluating... 2017-11-28 21:10:14,322:INFO: Average reward: 2.10 +/- 0.00 2017-11-28 21:10:15,084:INFO: Evaluating... 2017-11-28 21:10:15,114:INFO: Average reward: 2.00 +/- 0.00 2017-11-28 21:10:15,876:INFO: Evaluating... 2017-11-28 21:10:15,907:INFO: Average reward: 2.10 +/- 0.00 2017-11-28 21:10:16,665:INFO: Evaluating... 2017-11-28 21:10:16,695:INFO: Average reward: 2.10 +/- 0.00 2017-11-28 21:10:17,432:INFO: - Training done. 2017-11-28 21:10:17,453:INFO: Evaluating... 2017-11-28 21:10:17,486:INFO: Average reward: 2.10 +/- 0.00 ================================================ FILE: assignment2/results/q2_linear/model.weights/checkpoint ================================================ model_checkpoint_path: "." all_model_checkpoint_paths: "." ================================================ FILE: assignment2/results/q3_nature/log.txt ================================================ 2017-11-28 21:36:35,366:INFO: Evaluating... 2017-11-28 21:36:35,752:INFO: Average reward: 0.00 +/- 0.00 2017-11-28 21:36:36,569:INFO: Evaluating... 2017-11-28 21:36:36,868:INFO: Average reward: -0.50 +/- 0.00 2017-11-28 21:36:40,918:INFO: Evaluating... 
2017-11-28 21:36:41,207:INFO: Average reward: 0.00 +/- 0.00 2017-11-28 21:36:45,230:INFO: Evaluating... 2017-11-28 21:36:45,520:INFO: Average reward: 0.50 +/- 0.00 2017-11-28 21:36:49,710:INFO: Evaluating... 2017-11-28 21:36:50,002:INFO: Average reward: 2.00 +/- 0.00 2017-11-28 21:36:54,073:INFO: Evaluating... 2017-11-28 21:36:54,361:INFO: Average reward: 2.00 +/- 0.00 2017-11-28 21:36:58,412:INFO: Evaluating... 2017-11-28 21:36:58,698:INFO: Average reward: 2.00 +/- 0.00 2017-11-28 21:37:02,752:INFO: Evaluating... 2017-11-28 21:37:03,044:INFO: Average reward: 2.10 +/- 0.00 2017-11-28 21:37:07,233:INFO: Evaluating... 2017-11-28 21:37:07,513:INFO: Average reward: 2.10 +/- 0.00 2017-11-28 21:37:09,855:INFO: - Training done. 2017-11-28 21:37:09,959:INFO: Evaluating... 2017-11-28 21:37:10,247:INFO: Average reward: 2.10 +/- 0.00 ================================================ FILE: assignment2/results/q3_nature/model.weights/checkpoint ================================================ model_checkpoint_path: "." all_model_checkpoint_paths: "." ================================================ FILE: assignment2/results/q4_train_atari_linear/log.txt ================================================ 2017-11-29 16:06:16,994:INFO: Making new env: Pong-v0 2017-11-29 16:06:17,179:INFO: Creating monitor directory results/q4_train_atari_linear/monitor/ 2017-11-29 16:06:17,187:INFO: Starting new video recorder writing to /home/zengliang/CS234/assignment2/results/q4_train_atari_linear/monitor/openaigym.video.0.5469.video000000.mp4 2017-11-29 16:06:18,628:INFO: Finished writing results. You can upload them to the scoreboard via gym.upload('/home/zengliang/CS234/assignment2/results/q4_train_atari_linear/monitor') 2017-11-29 16:06:18,629:INFO: Evaluating... 2017-11-29 16:07:00,357:INFO: Average reward: -20.98 +/- 0.02 2017-11-29 16:30:31,583:INFO: Evaluating... 
2017-11-30 12:01:58,705:INFO: Making new env: Pong-v0 2017-11-30 12:01:58,917:INFO: Starting new video recorder writing to /home/zengliang/CS234/assignment2/results/q4_train_atari_linear/monitor/openaigym.video.0.3758.video000000.mp4 2017-11-30 12:02:01,397:INFO: Finished writing results. You can upload them to the scoreboard via gym.upload('/home/zengliang/CS234/assignment2/results/q4_train_atari_linear/monitor') 2017-11-30 12:02:01,397:INFO: Evaluating... 2017-11-30 12:02:40,550:INFO: Average reward: -20.98 +/- 0.02 2017-11-30 14:37:22,473:INFO: Making new env: Pong-v0 2017-11-30 14:37:22,717:INFO: Starting new video recorder writing to /home/zengliang/CS234/assignment2/results/q4_train_atari_linear/monitor/openaigym.video.0.2799.video000000.mp4 2017-11-30 14:37:26,391:INFO: Finished writing results. You can upload them to the scoreboard via gym.upload('/home/zengliang/CS234/assignment2/results/q4_train_atari_linear/monitor') 2017-11-30 14:37:26,392:INFO: Evaluating... 2017-11-30 14:38:24,987:INFO: Average reward: -20.90 +/- 0.06 2017-11-30 15:03:46,854:INFO: Evaluating... ================================================ FILE: assignment2/results/q4_train_atari_linear/model.weights/checkpoint ================================================ model_checkpoint_path: "." all_model_checkpoint_paths: "." 
================================================ FILE: assignment2/results/q4_train_atari_linear/monitor/openaigym.episode_batch.0.2799.stats.json ================================================ {"timestamps": [1512023846.375136], "initial_reset_timestamp": 1512023842.709429, "episode_types": ["t"], "episode_lengths": [1254], "episode_rewards": [-21.0]} ================================================ FILE: assignment2/results/q4_train_atari_linear/monitor/openaigym.episode_batch.0.3758.stats.json ================================================ {"timestamps": [1512014521.383348], "initial_reset_timestamp": 1512014518.909645, "episode_types": ["t"], "episode_lengths": [1005], "episode_rewards": [-21.0]} ================================================ FILE: assignment2/results/q4_train_atari_linear/monitor/openaigym.episode_batch.0.5469.stats.json ================================================ {"timestamps": [1511942778.615624], "initial_reset_timestamp": 1511942777.179417, "episode_types": ["t"], "episode_lengths": [1195], "episode_rewards": [-21.0]} ================================================ FILE: assignment2/results/q4_train_atari_linear/monitor/openaigym.manifest.0.2799.manifest.json ================================================ {"env_info": {"env_id": "Pong-v0", "gym_version": "0.9.3"}, "stats": "openaigym.episode_batch.0.2799.stats.json", "videos": [["openaigym.video.0.2799.video000000.mp4", "openaigym.video.0.2799.video000000.meta.json"]]} ================================================ FILE: assignment2/results/q4_train_atari_linear/monitor/openaigym.manifest.0.3758.manifest.json ================================================ {"env_info": {"env_id": "Pong-v0", "gym_version": "0.9.3"}, "stats": "openaigym.episode_batch.0.3758.stats.json", "videos": [["openaigym.video.0.3758.video000000.mp4", "openaigym.video.0.3758.video000000.meta.json"]]} ================================================ FILE: 
assignment2/results/q4_train_atari_linear/monitor/openaigym.manifest.0.5469.manifest.json ================================================ {"env_info": {"env_id": "Pong-v0", "gym_version": "0.9.3"}, "stats": "openaigym.episode_batch.0.5469.stats.json", "videos": [["openaigym.video.0.5469.video000000.mp4", "openaigym.video.0.5469.video000000.meta.json"]]} ================================================ FILE: assignment2/results/q4_train_atari_linear/monitor/openaigym.video.0.2799.video000000.meta.json ================================================ {"encoder_version": {"cmdline": ["avconv", "-nostats", "-loglevel", "error", "-y", "-r", "30", "-f", "rawvideo", "-s:v", "160x210", "-pix_fmt", "rgb24", "-i", "-", "-vcodec", "libx264", "-pix_fmt", "yuv420p", "/home/zengliang/CS234/assignment2/results/q4_train_atari_linear/monitor/openaigym.video.0.2799.video000000.mp4"], "version": "avconv version 9.18-6:9.18-0ubuntu0.14.04.1, Copyright (c) 2000-2014 the Libav developers\n built on Mar 16 2015 13:19:10 with gcc 4.8 (Ubuntu 4.8.2-19ubuntu1)\navconv 9.18-6:9.18-0ubuntu0.14.04.1\nlibavutil 52. 3. 0 / 52. 3. 0\nlibavcodec 54. 35. 0 / 54. 35. 0\nlibavformat 54. 20. 4 / 54. 20. 4\nlibavdevice 53. 2. 0 / 53. 2. 0\nlibavfilter 3. 3. 0 / 3. 3. 0\nlibavresample 1. 0. 1 / 1. 0. 1\nlibswscale 2. 1. 1 / 2. 1. 
1\n", "backend": "avconv"}, "content_type": "video/mp4", "episode_id": 0} ================================================ FILE: assignment2/results/q4_train_atari_linear/monitor/openaigym.video.0.3758.video000000.meta.json ================================================ {"encoder_version": {"cmdline": ["avconv", "-nostats", "-loglevel", "error", "-y", "-r", "30", "-f", "rawvideo", "-s:v", "160x210", "-pix_fmt", "rgb24", "-i", "-", "-vcodec", "libx264", "-pix_fmt", "yuv420p", "/home/zengliang/CS234/assignment2/results/q4_train_atari_linear/monitor/openaigym.video.0.3758.video000000.mp4"], "version": "avconv version 9.18-6:9.18-0ubuntu0.14.04.1, Copyright (c) 2000-2014 the Libav developers\n built on Mar 16 2015 13:19:10 with gcc 4.8 (Ubuntu 4.8.2-19ubuntu1)\navconv 9.18-6:9.18-0ubuntu0.14.04.1\nlibavutil 52. 3. 0 / 52. 3. 0\nlibavcodec 54. 35. 0 / 54. 35. 0\nlibavformat 54. 20. 4 / 54. 20. 4\nlibavdevice 53. 2. 0 / 53. 2. 0\nlibavfilter 3. 3. 0 / 3. 3. 0\nlibavresample 1. 0. 1 / 1. 0. 1\nlibswscale 2. 1. 1 / 2. 1. 1\n", "backend": "avconv"}, "content_type": "video/mp4", "episode_id": 0} ================================================ FILE: assignment2/results/q4_train_atari_linear/monitor/openaigym.video.0.5469.video000000.meta.json ================================================ {"encoder_version": {"cmdline": ["avconv", "-nostats", "-loglevel", "error", "-y", "-r", "30", "-f", "rawvideo", "-s:v", "160x210", "-pix_fmt", "rgb24", "-i", "-", "-vcodec", "libx264", "-pix_fmt", "yuv420p", "/home/zengliang/CS234/assignment2/results/q4_train_atari_linear/monitor/openaigym.video.0.5469.video000000.mp4"], "version": "avconv version 9.18-6:9.18-0ubuntu0.14.04.1, Copyright (c) 2000-2014 the Libav developers\n built on Mar 16 2015 13:19:10 with gcc 4.8 (Ubuntu 4.8.2-19ubuntu1)\navconv 9.18-6:9.18-0ubuntu0.14.04.1\nlibavutil 52. 3. 0 / 52. 3. 0\nlibavcodec 54. 35. 0 / 54. 35. 0\nlibavformat 54. 20. 4 / 54. 20. 4\nlibavdevice 53. 2. 0 / 53. 2. 0\nlibavfilter 3. 3. 0 / 3. 3. 
def export_plot(ys, ylabel, filename):
    """
    Export a plot in filename

    The values are plotted against their index, with "Epoch" as x label.

    Args:
        ys: (list) of float / int to plot
        ylabel: (string) label of the y axis
        filename: (string) path the figure is saved to
    """
    plt.figure()
    plt.plot(range(len(ys)), ys)
    plt.xlabel("Epoch")
    plt.ylabel(ylabel)
    plt.savefig(filename)
    plt.close()


def get_logger(filename):
    """
    Return a logger instance to a file

    The root logger gets a basic configuration that prints bare messages
    and an extra file handler writing timestamped records to `filename`.
    Every call attaches a new file handler to the root logger.

    Args:
        filename: (string) path of the log file
    Returns:
        the logger named 'logger'
    """
    logger = logging.getLogger('logger')
    logger.setLevel(logging.DEBUG)
    logging.basicConfig(format='%(message)s', level=logging.DEBUG)
    handler = logging.FileHandler(filename)
    handler.setLevel(logging.DEBUG)
    handler.setFormatter(logging.Formatter('%(asctime)s:%(levelname)s: %(message)s'))
    logging.getLogger().addHandler(handler)
    return logger


class Progbar(object):
    """Progbar class copied from keras (https://github.com/fchollet/keras/)

    Displays a progress bar.
    Small edit : added strict arg to update

    # Arguments
        target: Total number of steps expected.
        width: Number of characters of the bar itself.
        verbose: 1 redraws the bar on every update, 2 prints a single
            summary line once the target is reached, any other value
            prints nothing.
        discount: Weight kept from the previous value when updating the
            exponential averages.
    """

    def __init__(self, target, width=30, verbose=1, discount=0.9):
        self.width = width
        self.target = target
        self.sum_values = {}
        self.exp_avg = {}
        self.unique_values = []
        self.start = time.time()
        self.total_width = 0
        self.seen_so_far = 0
        self.verbose = verbose
        self.discount = discount

    def _format_tracked_values(self):
        """Return the ' - name: value' text of every tracked value."""
        info = ''
        for k in self.unique_values:
            tracked = self.sum_values[k]
            if isinstance(tracked, list):
                # [weighted sum, weight] pair: display the average.
                info += ' - %s: %.4f' % (k, tracked[0] / max(1, tracked[1]))
            else:
                # `strict` values are stored and displayed as they are.
                info += ' - %s: %s' % (k, tracked)
        return info

    def update(self, current, values=(), exact=(), strict=(), exp_avg=()):
        """
        Updates the progress bar.

        # Arguments
            current: Index of current step.
            values: List of tuples (name, value_for_last_step).
                The progress bar will display averages for these values.
            exact: List of tuples (name, value_for_last_step).
                The progress bar will display these values directly.
            strict: List of tuples (name, value). The value is stored and
                displayed as it is, without any formatting as a number.
            exp_avg: List of tuples (name, value_for_last_step).
                The progress bar will display an exponential moving
                average of these values.
        """
        # Number of steps since the previous update, used as weight.
        step_delta = current - self.seen_so_far

        for k, v in values:
            if k not in self.sum_values:
                self.sum_values[k] = [v * step_delta, step_delta]
                self.unique_values.append(k)
            else:
                self.sum_values[k][0] += v * step_delta
                self.sum_values[k][1] += step_delta
        for k, v in exact:
            if k not in self.sum_values:
                self.unique_values.append(k)
            self.sum_values[k] = [v, 1]
        for k, v in strict:
            if k not in self.sum_values:
                self.unique_values.append(k)
            self.sum_values[k] = v
        for k, v in exp_avg:
            if k not in self.exp_avg:
                self.exp_avg[k] = v
            else:
                self.exp_avg[k] *= self.discount
                self.exp_avg[k] += (1-self.discount)*v
        self.seen_so_far = current

        now = time.time()
        if self.verbose == 1:
            # Move back to the start of the line to overwrite the old bar.
            prev_total_width = self.total_width
            sys.stdout.write("\b" * prev_total_width)
            sys.stdout.write("\r")

            numdigits = int(np.floor(np.log10(self.target))) + 1
            barstr = '%%%dd/%%%dd [' % (numdigits, numdigits)
            bar = barstr % (current, self.target)
            prog = float(current)/self.target
            prog_width = int(self.width*prog)
            if prog_width > 0:
                bar += ('='*(prog_width-1))
                if current < self.target:
                    bar += '>'
                else:
                    bar += '='
            bar += ('.'*(self.width-prog_width))
            bar += ']'
            sys.stdout.write(bar)
            self.total_width = len(bar)

            if current:
                time_per_unit = (now - self.start) / current
            else:
                time_per_unit = 0
            eta = time_per_unit*(self.target - current)
            info = ''
            if current < self.target:
                info += ' - ETA: %ds' % eta
            else:
                info += ' - %ds' % (now - self.start)
            info += self._format_tracked_values()
            # items() exists on both Python 2 and 3, unlike iteritems().
            for k, v in self.exp_avg.items():
                info += ' - %s: %.4f' % (k, v)

            self.total_width += len(info)
            # Pad with spaces so leftovers of a longer previous line vanish.
            if prev_total_width > self.total_width:
                info += ((prev_total_width-self.total_width) * " ")

            sys.stdout.write(info)
            sys.stdout.flush()

            if current >= self.target:
                sys.stdout.write("\n")

        if self.verbose == 2:
            if current >= self.target:
                info = '%ds' % (now - self.start)
                # Shared formatter: `strict` values are not [sum, weight]
                # pairs and must not be indexed.
                info += self._format_tracked_values()
                sys.stdout.write(info + "\n")

    def add(self, n, values=()):
        """Advance the bar by n steps, forwarding `values` to update."""
        self.update(self.seen_so_far+n, values)
def can_sample(self, batch_size):
    """Tell whether `batch_size` different transitions can be sampled.

    The buffer must hold one frame more than `batch_size`, because every
    sampled index also needs the frame stored right after it to serve as
    the next observation.
    """
    frames_needed = batch_size + 1
    return self.num_in_buffer >= frames_needed
def encode_recent_observation(self):
    """Encode the observation that ends at the most recently stored frame.

    Returns
    -------
    observation: np.array
        Whatever `_encode_observation` produces for the newest frame:
        the last `frame_history_len` frames stacked along the channel
        axis.
    """
    assert self.num_in_buffer > 0
    # `next_idx` points at the slot that will be written next, so the
    # newest frame sits one slot behind it (wrapping around the ring).
    newest_idx = (self.next_idx - 1) % self.size
    return self._encode_observation(newest_idx)
def store_frame(self, frame):
    """Store a single frame in the buffer at the next available index,
    overwriting old frames if necessary.

    Parameters
    ----------
    frame: np.array
        Array of shape (img_h, img_w, img_c) and dtype np.uint8
        the frame to be stored

    Returns
    -------
    idx: int
        Index at which the frame is stored. To be used for `store_effect` later.
    """
    if self.obs is None:
        # Storage is allocated lazily on the first frame because the frame
        # shape is only known once a frame has been seen.
        self.obs = np.empty([self.size] + list(frame.shape), dtype=np.uint8)
        self.action = np.empty([self.size], dtype=np.int32)
        self.reward = np.empty([self.size], dtype=np.float32)
        # The `np.bool` alias was removed in NumPy 1.24; the builtin `bool`
        # selects the same dtype on every NumPy version.
        self.done = np.empty([self.size], dtype=bool)
    self.obs[self.next_idx] = frame

    ret = self.next_idx
    # Ring buffer: wrap the write position and cap the fill count at `size`.
    self.next_idx = (self.next_idx + 1) % self.size
    self.num_in_buffer = min(self.size, self.num_in_buffer + 1)

    return ret
class ActionSpace(object):
    """Discrete action space holding `n` actions numbered 0 .. n-1."""

    def __init__(self, n):
        self.n = n

    def sample(self):
        """Draw one action uniformly at random."""
        return np.random.randint(0, self.n)


class ObservationSpace(object):
    """Three fixed random images, one per environment state.

    The pixel ranges do not overlap (0-49, 100-149, 200-249), so the
    "bad", "normal" and "good" states are easy to tell apart.
    """

    def __init__(self, shape):
        self.shape = shape
        # Drawn in bad / normal / good order so a seeded RNG yields the
        # same images as three separate sequential calls would.
        self.bad_state, self.normal_state, self.good_state = [
            np.random.randint(low, low + 50, shape, dtype=np.uint8)
            for low in (0, 100, 200)
        ]
        self.states = [self.bad_state, self.normal_state, self.good_state]


class EnvTest(object):
    """
    Tiny 3-state test environment with 4 actions and 5-step episodes.

    Adapted from Igor Gitman, CMU / Karan Goel
    """

    def __init__(self, shape=(84, 84, 3)):
        # One reward per state: bad, normal, good.
        self.rewards = [-0.1, 0, 0.1]
        self.action_space = ActionSpace(4)
        self.observation_space = ObservationSpace(shape)
        self.cur_state = 0
        self.num_iters = 0
        self.was_in_second = False

    def reset(self):
        """Start a new episode in state 0 and return its observation."""
        self.cur_state = 0
        self.num_iters = 0
        self.was_in_second = False
        return self.observation_space.states[self.cur_state]

    def step(self, action):
        """Apply `action`; return (observation, reward, done, info).

        Actions 0-2 jump to the state with that number, action 3 stays
        put. The reward of the state reached is multiplied by -10 when
        the previous step ended in state 1.
        """
        assert(0 <= action <= 3)
        self.num_iters += 1
        if action < 3:
            self.cur_state = action

        reward = self.rewards[self.cur_state]
        if self.was_in_second is True:
            reward *= -10

        # Kept as a real bool so the identity check above keeps working
        # even when `action` is a numpy integer.
        self.was_in_second = bool(self.cur_state == 1)

        observation = self.observation_space.states[self.cur_state]
        episode_over = self.num_iters >= 5
        return observation, reward, episode_over, {'ale.lives':0}

    def render(self):
        """Print the index of the current state."""
        print(self.cur_state)
class SimpleImageViewer(object):
    """
    Variant of the gym image viewer that picks the pixel format
    (RGB or single-channel intensity "I") from the image itself.
    see source here https://github.com/openai/gym/blob/master/gym/envs/classic_control/rendering.py
    """

    def __init__(self, display=None):
        self.display = display
        self.window = None
        self.isopen = False

    def imshow(self, arr):
        """Draw `arr` (height x width x channels) in the viewer window.

        The window is created on the first call, sized from that first
        image. Only 1-channel and 3-channel images are supported; any
        other channel count raises NotImplementedError.
        """
        if self.window is None:
            height, width, channels = arr.shape
            self.window = pyglet.window.Window(width=width, height=height, display=self.display)
            self.width, self.height = width, height
            self.isopen = True

        # Channel count -> pyglet pixel format string.
        pixel_formats = {1: "I", 3: "RGB"}
        nchannels = arr.shape[-1]
        if nchannels not in pixel_formats:
            raise NotImplementedError
        image = pyglet.image.ImageData(self.width, self.height, pixel_formats[nchannels], arr.tobytes())

        window = self.window
        window.clear()
        window.switch_to()
        window.dispatch_events()
        image.blit(0,0)
        window.flip()

    def close(self):
        """Close the window if one is open; safe to call repeatedly."""
        if not self.isopen:
            return
        self.window.close()
        self.isopen = False

    def __del__(self):
        self.close()
class PreproWrapper(gym.Wrapper):
    """
    Wrapper for Pong to apply preprocessing
    Stores the state into variable self.obs
    """

    def __init__(self, env, prepro, shape, overwrite_render=True, high=255):
        """
        Args:
            env: (gym env)
            prepro: (function) to apply to a state for preprocessing
            shape: (list) shape of obs after prepro
            overwrite_render: (bool) if True, render is overwriten to vizualise effect of prepro
            high: (int) max value of state after prepro
        """
        super(PreproWrapper, self).__init__(env)
        self.overwrite_render = overwrite_render
        self.viewer = None
        self.prepro = prepro
        self.observation_space = spaces.Box(low=0, high=high, shape=shape)
        self.high = high

    def _step(self, action):
        """
        Overwrites _step function from environment to apply preprocess
        """
        obs, reward, done, info = self.env.step(action)
        self.obs = self.prepro(obs)
        return self.obs, reward, done, info

    def _reset(self):
        """Reset the wrapped env and return the preprocessed first state."""
        self.obs = self.prepro(self.env.reset())
        return self.obs

    def _render(self, mode='human', close=False):
        """
        Overwrite _render function to vizualize preprocessing

        With `overwrite_render` set, the last preprocessed observation is
        returned (mode 'rgb_array') or shown in a SimpleImageViewer (mode
        'human'); `close=True` closes that viewer. Otherwise rendering is
        delegated to the parent wrapper and its result is returned.
        """
        if not self.overwrite_render:
            # The original named an undefined class (PongWrapper) here, so
            # this branch always raised NameError; it also dropped the
            # delegated return value (e.g. the frame for 'rgb_array').
            return super(PreproWrapper, self)._render(mode, close)

        if close:
            if self.viewer is not None:
                self.viewer.close()
                self.viewer = None
            return

        img = self.obs
        if mode == 'rgb_array':
            return img
        elif mode == 'human':
            if self.viewer is None:
                self.viewer = SimpleImageViewer()
            self.viewer.imshow(img)
class FrozenLakeEnv(discrete_env.DiscreteEnv):
    """
    Winter is here. You and your friends were tossing around a frisbee at the park
    when you made a wild throw that left the frisbee out in the middle of the lake.
    The water is mostly frozen, but there are a few holes where the ice has melted.
    If you step into one of those holes, you'll fall into the freezing water.
    At this time, there's an international frisbee shortage, so it's absolutely imperative that
    you navigate across the lake and retrieve the disc.
    However, the ice is slippery, so you won't always move in the direction you intend.
    The surface is described using a grid like the following

        SHHH
        FHHH
        FHHH
        FFFG

    S : starting point, safe
    F : frozen surface, safe
    H : hole, you cannot move to these place
    G : goal, where the frisbee is located

    The episode ends when you reach the goal or fall in a hole or reach max steps
    You receive a reward of 1 if you reach the goal, and zero otherwise.

    """

    metadata = {'render.modes': ['human', 'ansi']}

    def __init__(self, desc=None, map_name="4x4",is_slippery=False):
        # Builds the transition table P[s][a] = [(prob, next_state, reward,
        # done), ...] and hands it to DiscreteEnv.
        #
        # desc: optional grid given as a list of equal-length strings over
        #     the letters S, F, H and G; takes precedence over map_name.
        # map_name: key into MAPS, used when desc is None.
        # is_slippery: if True the intended action is taken with probability
        #     0.8 and each of its two neighbouring actions with 0.1.
        if desc is None and map_name is None:
            raise ValueError('Must provide either desc or map_name')
        elif desc is None:
            desc = MAPS[map_name]
        self.desc = desc = np.asarray(desc,dtype='c')
        self.nrow, self.ncol = nrow, ncol = desc.shape

        nA = 4
        nS = nrow * ncol

        # The initial state distribution is uniform over the 'S' cells.
        isd = np.array(desc == b'S').astype('float64').ravel()
        isd /= isd.sum()

        P = {s : {a : [] for a in range(nA)} for s in range(nS)}

        # Every state gets its own random permutation of the four moves, so
        # the effect of an action index differs from state to state.
        self.a_true = []
        for s in range(nS):
            a_true_table = np.arange(4)
            np.random.shuffle(a_true_table)
            self.a_true.append(a_true_table)

        def to_s(row, col):
            return row*ncol + col

        def inc(row, col, a):
            # Translate the action through this state's permutation, then
            # move one cell, clamping at the grid border.
            move = self.a_true[to_s(row, col)][a]
            if move == LEFT:
                col = max(col-1,0)
            elif move == DOWN:
                row = min(row+1,nrow-1)
            elif move == RIGHT:
                col = min(col+1,ncol-1)
            elif move == UP:
                row = max(row-1,0)
            return (row, col)

        def outcome(prob, row, col, a):
            # One (prob, next_state, reward, done) entry for taking `a`.
            newrow, newcol = inc(row, col, a)
            newletter = desc[newrow, newcol]
            if newletter == b'H':
                # if meet hole, stay at original place
                return (prob, to_s(row, col), 0.0, False)
            reached_goal = bytes(newletter) == b'G'
            return (prob, to_s(newrow, newcol), float(reached_goal), reached_goal)

        for row in range(nrow):
            for col in range(ncol):
                s = to_s(row, col)
                if desc[row, col] == b"H":
                    # Holes can never be entered, so they get no transitions.
                    continue
                for a in range(4):
                    li = P[s][a]
                    letter = desc[row, col]
                    if letter in b'GH':
                        # Terminal cell: absorbing, no further reward.
                        li.append((1.0, s, 0, True))
                    elif is_slippery:
                        # A move blocked by a hole keeps its own slip
                        # probability. The original appended it with
                        # probability 1.0, so the three entries no longer
                        # summed to one and sampling was skewed.
                        for slip_a in [(a-1)%4, a, (a+1)%4]:
                            prob = 0.8 if slip_a == a else 0.1
                            li.append(outcome(prob, row, col, slip_a))
                    else:
                        li.append(outcome(1.0, row, col, a))

        super(FrozenLakeEnv, self).__init__(nS, nA, P, isd)

    def _render(self, mode='human', close=False):
        # Writes the grid with the agent's cell highlighted, preceded by the
        # name of the last action index taken. Returns the stream written to:
        # a StringIO for mode 'ansi', otherwise sys.stdout.
        if close:
            return
        outfile = StringIO() if mode == 'ansi' else sys.stdout

        row, col = self.s // self.ncol, self.s % self.ncol
        desc = self.desc.tolist()
        desc = [[c.decode('utf-8') for c in line] for line in desc]
        desc[row][col] = utils.colorize(desc[row][col], "red", highlight=True)
        if self.lastaction is not None:
            outfile.write(" ({})\n".format(["Left","Down","Right","Up"][self.lastaction]))
        else:
            outfile.write("\n")
        outfile.write("\n".join(''.join(line) for line in desc)+"\n")

        return outfile
def main():
    """Train R-max for several visit thresholds `m` and plot the results.

    For every m in 1, 3, ..., 19 the agent is trained on the deterministic
    FrozenLake, its greedy policy is rendered once, and its running average
    score is added to a figure that is saved as 'r-max.jpg'.
    """
    env = FrozenLakeEnv(is_slippery=False)
    # The function-call form of print works on both Python 2 and Python 3.
    print(env.__doc__)

    num_episodes = 1000
    thresholds = np.arange(1,20,2)
    for m in tqdm(thresholds):
        (Q, average_score) = rmax(env, gamma = 0.99, m=m, R_max = 1, epsilon = 0.1, num_episodes = num_episodes)
        render_single_Q(env, Q)
        plt.plot(np.arange(num_episodes),np.array(average_score))
    plt.title('The running average score of the R-max learning agent')
    plt.xlabel('traning episodes')
    plt.ylabel('score')
    plt.legend(['m = '+str(i) for i in thresholds], loc='upper right')
    #plt.show()
    plt.savefig('r-max.jpg')


if __name__ == '__main__':
    print("haha")
    main()
def learn_Q_QLearning(env, num_episodes=10000, gamma = 0.99, lr = 0.1, e = 0.2, max_step=6):
    """Tabular Q-learning with a fixed (non-decaying) epsilon-greedy policy.

    Parameters
    ----------
    env: gym.core.Environment
      Environment to learn on. Must expose nS and nA, plus reset() and
      step(action).
    num_episodes: int
      Number of training episodes.
    gamma: float
      Discount factor. Number in range [0, 1)
    lr: float
      Learning rate. Number in range [0, 1)
    e: float
      Probability of picking a uniformly random action instead of the
      greedy one.
    max_step: int
      Maximum number of steps per episode.

    Returns
    -------
    (np.array, np.array)
      The state-action values, of shape [env.nS x env.nA], and the
      running average score, where entry i is the total reward collected
      so far divided by i + 1.
    """
    q_table = np.zeros((env.nS, env.nA))
    running_avg = np.zeros(num_episodes)
    cumulative_reward = 0

    for episode in range(num_episodes):
        state = env.reset()
        finished = False
        steps_taken = 0
        while steps_taken < max_step and not finished:
            steps_taken += 1

            # One uniform draw decides between exploring and exploiting.
            explore = not (np.random.rand() > e)
            if explore:
                action = np.random.randint(env.nA)
            else:
                action = np.argmax(q_table[state])

            successor, reward, finished, _ = env.step(action)

            # Move the estimate toward the one-step bootstrapped target.
            td_target = reward+gamma*np.max(q_table[successor])
            q_table[state][action] = (1-lr)*q_table[state][action]+lr*td_target

            cumulative_reward += reward
            state = successor

        running_avg[episode] = cumulative_reward / (episode+1)

    return (q_table, running_avg)
def rmax(env, gamma, m, R_max, epsilon, num_episodes, max_step = 6, e = 0.7):
    """Learn state-action values using the Rmax algorithm with
    epsilon-greedy action selection.

    Args:
    ----------
    env: gym.core.Environment
        Environment to compute Q function for. Must expose nS and nA,
        plus reset() and step(action).
    gamma: float
        Discount factor. Number in range [0, 1)
    m: int
        Threshold of visitance: a state-action pair counts as known once
        it has been tried m times.
    R_max: float
        The estimated max reward that could be obtained in the game
    epsilon:
        accuracy paramter; sets how many value-iteration sweeps are run
        each time a pair becomes known.
    num_episodes: int
        Number of episodes of training.
    max_step: Int
        max number of steps in each episode
    e: float
        Probability of taking a uniformly random action instead of the
        greedy one.

    Returns
    -------
    (np.array, np.array)
      The state-action values, of shape [env.nS x env.nA], and the
      running average score, where entry i is the total reward collected
      so far divided by i + 1.
    """

    # Optimistic initialisation: every pair starts at the best possible return.
    Q = np.ones((env.nS, env.nA)) * R_max / (1 - gamma)
    R = np.zeros((env.nS, env.nA))
    nSA = np.zeros((env.nS, env.nA))
    nSASP = np.zeros((env.nS, env.nA, env.nS))

    # The sweep count depends only on epsilon and gamma, so it is computed
    # once here rather than on every planning step.
    num_sweeps = int(np.ceil(np.log(1.0/(epsilon*(1.0-gamma)))/(1.0-gamma)))

    total_score = 0
    average_score = np.zeros(num_episodes)
    # Named `episode` so the loop variable does not shadow the `time` module.
    for episode in range(num_episodes):
        is_done = False
        cur_state = env.reset()
        for _ in range(max_step):
            if is_done:
                break
            if np.random.rand() > e:
                action = np.argmax(Q[cur_state])
            else:
                action = np.random.randint(env.nA)
            (next_state, reward, is_done, _) = env.step(action)
            total_score += reward

            if nSA[cur_state][action] < m:
                # Still learning this pair: update the empirical model.
                nSA[cur_state][action] += 1
                R[cur_state][action] += reward
                nSASP[cur_state][action][next_state] += 1

                if nSA[cur_state][action] == m:
                    # The pair just became known: re-plan by in-place value
                    # iteration over all known pairs. Unknown pairs keep
                    # their optimistic value.
                    for _sweep in range(num_sweeps):
                        for s in range(env.nS):
                            for a in range(env.nA):
                                if nSA[s][a] >= m:
                                    prob = nSASP[s][a] / nSA[s][a]
                                    # Expected value of the best action in
                                    # each successor state.
                                    expected_next = np.dot(prob, np.max(Q, axis=1))
                                    Q[s][a] = R[s][a] / nSA[s][a] + gamma*expected_next
            cur_state = next_state
        average_score[episode] = total_score / (episode+1)

    return (Q, average_score)
def render_single_Q(env, Q, max_step = 6):
    """Renders Q function once on environment.

    Follows the greedy policy of `Q` for one episode, rendering the
    environment before every step, then prints the total reward.

    Parameters
    ----------
    env: gym.core.Environment
      Environment to play Q function on. Must expose reset(), render()
      and step(action).
    Q: np.array of shape [env.nS x env.nA]
      Q function
    max_step: int
      Maximum number of steps to play if the episode does not end earlier.
    """

    state = env.reset()
    done = False
    episode_reward = 0
    count = 0
    while not done:
        env.render()
        time.sleep(0.5) # Seconds between frames. Modify as you wish.
        action = np.argmax(Q[state])
        state, reward, done, _ = env.step(action)
        episode_reward += reward
        count += 1
        if count >= max_step:
            break

    # The function-call form of print works on both Python 2 and Python 3.
    print("Episode reward: %d" % episode_reward)