Repository: NathanEpstein/reinforce Branch: master Commit: 06a698c91da1 Files: 9 Total size: 7.8 KB Directory structure: gitextract_z1jayhda/ ├── .gitignore ├── README.md ├── reinforce/ │ ├── __init__.py │ ├── encoding.py │ ├── learn.py │ ├── policy.py │ ├── rewards.py │ └── transitions.py └── setup.py ================================================ FILE CONTENTS ================================================ ================================================ FILE: .gitignore ================================================ *.pyc *.egg-info dist .DS_Store ================================================ FILE: README.md ================================================ # reinforce A 'plug and play' reinforcement learning library in Python. Infers a Markov Decision Process from data and solves for the optimal policy. Implementation based on Andrew Ng's notes. More information related to this project can be found here. ## Example Usage ```python observations = [ { 'state_transitions': [ { 'state': 'low', 'action': 'climb', 'state_': 'mid' }, { 'state': 'mid', 'action': 'climb', 'state_': 'high' }, { 'state': 'high', 'action': 'sink', 'state_': 'mid' }, { 'state': 'mid', 'action': 'sink', 'state_': 'low' }, { 'state': 'low', 'action': 'sink', 'state_': 'bottom' } ], 'reward': 0 }, { 'state_transitions': [ { 'state': 'low', 'action': 'climb', 'state_': 'mid' }, { 'state': 'mid', 'action': 'climb', 'state_': 'high' }, { 'state': 'high', 'action': 'climb', 'state_': 'top' }, ], 'reward': 0 } ] trap_states = [ { 'state_transitions': [ { 'state': 'bottom', 'action': 'sink', 'state_': 'bottom' }, { 'state': 'bottom', 'action': 'climb', 'state_': 'bottom' } ], 'reward': 0 }, { 'state_transitions': [ { 'state': 'top', 'action': 'sink', 'state_': 'top' }, { 'state': 'top', 'action': 'climb', 'state_': 'top' }, ], 'reward': 1 }, ] from learn import MarkovAgent mark = MarkovAgent(observations + trap_states) mark.learn() print(mark.policy) # {'high': 'climb', 'top': 'sink', 'bottom': 'sink', 'low': 'climb', 'mid': 'climb'} # NOTE: policy in top and bottom states is chosen randomly (doesn't affect state) ``` ================================================ FILE: reinforce/__init__.py ================================================ from .learn import * ================================================ FILE: reinforce/encoding.py ================================================ class StateActionEncoder: def __init__(self, observations): self.observations = observations self._parse_states_and_actions() def parse_dimensions(self): return { 'state_count': len(self.int_to_state), 'action_count': len(self.int_to_action) } def observations_to_int(self): for observation in self.observations: for transition in observation['state_transitions']: transition['state'] = self.state_to_int[transition['state']] transition['state_'] = self.state_to_int[transition['state_']] transition['action'] = self.action_to_int[transition['action']] def parse_encoded_policy(self, encoded_policy): policy = {} for index, encoded_action in enumerate(encoded_policy): state = self.int_to_state[index] action = self.int_to_action[int(encoded_action)] policy[state] = action return policy def _parse_states_and_actions(self): state_dict, action_dict = {}, {} state_array, action_array = [], [] state_index, action_index = 0, 0 for observation in self.observations: for transition in observation['state_transitions']: state = transition['state'] action = transition['action'] if state not in state_dict.keys(): state_dict[state] = state_index state_array.append(state) state_index += 1 if action not in action_dict.keys(): action_dict[action] = action_index action_array.append(action) action_index += 1 self.state_to_int = state_dict self.action_to_int = action_dict self.int_to_state = state_array self.int_to_action = action_array ================================================ FILE: reinforce/learn.py ================================================ from encoding import StateActionEncoder from rewards import RewardParser from transitions import TransitionParser from policy import PolicyParser class MarkovAgent: def __init__(self, observations): # encode observation data as int values self.state_action_encoder = StateActionEncoder(observations) self.state_action_encoder.observations_to_int() dimensions = self.state_action_encoder.parse_dimensions() # create reward, transition, and policy parsers self.reward_parser = RewardParser(observations, dimensions) self.transition_parser = TransitionParser(observations, dimensions) self.policy_parser = PolicyParser(dimensions) def learn(self): R = self.reward_parser.rewards() P = self.transition_parser.transition_probabilities() # learn int-encoded policy and convert to readable dictionary encoded_policy = self.policy_parser.policy(P, R) self.policy = self.state_action_encoder.parse_encoded_policy(encoded_policy) ================================================ FILE: reinforce/policy.py ================================================ import numpy as np class PolicyParser: def __init__(self, dimensions): self.state_count = dimensions['state_count'] self.action_count = dimensions['action_count'] def policy(self, P, rewards): print('COMPUTING POLICY') best_policy = np.zeros(self.state_count) state_values = np.zeros(self.state_count) GAMMA = 0.9 ITERATIONS = 125 for i in range(ITERATIONS): print ("iteration: {0} / {1}".format(i + 1, ITERATIONS)) for state in range(0, self.state_count): state_value = -float('Inf') for action in range(0, self.action_count): action_value = 0 for state_ in range(0, self.state_count): action_value += (P[state][action][state_] * state_values[state_] * GAMMA) if (action_value >= state_value): state_value = action_value best_policy[state] = action state_values[state] = rewards[state] + state_value return best_policy ================================================ FILE: reinforce/rewards.py ================================================ import numpy as np class RewardParser: def __init__(self, observations, dimensions): self.observations = observations self.state_count = dimensions['state_count'] def rewards(self): print('COMPUTING REWARDS') total_state_rewards = np.zeros(self.state_count) total_state_visits = np.zeros(self.state_count) for observation in self.observations: visits = float(len(observation['state_transitions'])) reward_per_visit = observation['reward'] / visits for state_transition in observation['state_transitions']: state = state_transition['state'] total_state_rewards[state] += reward_per_visit total_state_visits[state] += 1 average_state_rewards = total_state_rewards / total_state_visits average_state_rewards = np.nan_to_num(average_state_rewards) return average_state_rewards ================================================ FILE: reinforce/transitions.py ================================================ import numpy as np class TransitionParser: def __init__(self, observations, dimensions): self.observations = observations self.state_count = dimensions['state_count'] self.action_count = dimensions['action_count'] def transition_probabilities(self): print('COMPUTING TRANSITIONS') transition_count = self._count_transitions() return self._parse_probabilities(transition_count) def _count_transitions(self): transition_count = np.zeros((self.state_count, self.action_count, self.state_count)) for observation in self.observations: for state_transition in observation['state_transitions']: state = state_transition['state'] action = state_transition['action'] state_ = state_transition['state_'] transition_count[state][action][state_] += 1 return transition_count def _parse_probabilities(self, transition_count): P = np.zeros((self.state_count, self.action_count, self.state_count)) for state in range(0, self.state_count): for action in range(0, self.action_count): total_transitions = float(sum(transition_count[state][action])) if (total_transitions > 0): P[state][action] = transition_count[state][action] / total_transitions else: P[state][action] = 1.0 / self.state_count return P ================================================ FILE: setup.py ================================================ from setuptools import setup setup(name='reinforce', version='0.2.0', description='plug and play reinforcement learning', url='http://github.com/nathanepstein/reinforce', author='Nathan Epstein', author_email='ne2210@columbia.edu', license='MIT', packages=['reinforce'], )