Repository: NathanEpstein/reinforce
Branch: master
Commit: 06a698c91da1
Files: 9
Total size: 7.8 KB
Directory structure:
gitextract_z1jayhda/
├── .gitignore
├── README.md
├── reinforce/
│ ├── __init__.py
│ ├── encoding.py
│ ├── learn.py
│ ├── policy.py
│ ├── rewards.py
│ └── transitions.py
└── setup.py
================================================
FILE CONTENTS
================================================
================================================
FILE: .gitignore
================================================
*.pyc
*.egg-info
dist
.DS_Store
================================================
FILE: README.md
================================================
# reinforce
A 'plug and play' reinforcement learning library in Python.
Infers a Markov Decision Process from data and solves for the optimal policy.
Implementation based on Andrew Ng's notes.
More information related to this project can be found in Andrew Ng's CS229 lecture notes on Markov Decision Processes.
## Example Usage
```python
observations = [
{ 'state_transitions': [
{ 'state': 'low', 'action': 'climb', 'state_': 'mid' },
{ 'state': 'mid', 'action': 'climb', 'state_': 'high' },
{ 'state': 'high', 'action': 'sink', 'state_': 'mid' },
{ 'state': 'mid', 'action': 'sink', 'state_': 'low' },
{ 'state': 'low', 'action': 'sink', 'state_': 'bottom' }
],
'reward': 0
},
{ 'state_transitions': [
{ 'state': 'low', 'action': 'climb', 'state_': 'mid' },
{ 'state': 'mid', 'action': 'climb', 'state_': 'high' },
{ 'state': 'high', 'action': 'climb', 'state_': 'top' },
],
'reward': 0
}
]
trap_states = [
{
'state_transitions': [
{ 'state': 'bottom', 'action': 'sink', 'state_': 'bottom' },
{ 'state': 'bottom', 'action': 'climb', 'state_': 'bottom' }
],
'reward': 0
},
{
'state_transitions': [
{ 'state': 'top', 'action': 'sink', 'state_': 'top' },
{ 'state': 'top', 'action': 'climb', 'state_': 'top' },
],
'reward': 1
},
]
from reinforce import MarkovAgent
mark = MarkovAgent(observations + trap_states)
mark.learn()
print(mark.policy)
# {'high': 'climb', 'top': 'sink', 'bottom': 'sink', 'low': 'climb', 'mid': 'climb'}
# NOTE: policy in top and bottom states is chosen randomly (doesn't affect state)
```
================================================
FILE: reinforce/__init__.py
================================================
from .learn import *
================================================
FILE: reinforce/encoding.py
================================================
class StateActionEncoder:
    """Builds int encodings for the string-valued states and actions found in
    a list of observations, and converts between the two representations.

    Attributes:
        state_to_int / action_to_int: dict mapping raw value -> int code.
        int_to_state / int_to_action: list mapping int code -> raw value.
    """

    def __init__(self, observations):
        self.observations = observations
        self._parse_states_and_actions()

    def parse_dimensions(self):
        """Return {'state_count': S, 'action_count': A} for the parsed data."""
        return {
            'state_count': len(self.int_to_state),
            'action_count': len(self.int_to_action)
        }

    def observations_to_int(self):
        """Rewrite every transition in-place, replacing raw values with ints."""
        for observation in self.observations:
            for transition in observation['state_transitions']:
                transition['state'] = self.state_to_int[transition['state']]
                transition['state_'] = self.state_to_int[transition['state_']]
                transition['action'] = self.action_to_int[transition['action']]

    def parse_encoded_policy(self, encoded_policy):
        """Convert an int-encoded policy (indexable by state code) into a
        readable {state: action} dict."""
        return {
            self.int_to_state[index]: self.int_to_action[int(encoded_action)]
            for index, encoded_action in enumerate(encoded_policy)
        }

    def _parse_states_and_actions(self):
        """Assign a dense int code to each distinct state and action.

        Both 'state' and 'state_' are registered: a terminal state that only
        ever appears as a successor would otherwise be missing from the
        encoding and crash observations_to_int with a KeyError.
        """
        state_dict, action_dict = {}, {}
        state_array, action_array = [], []
        for observation in self.observations:
            for transition in observation['state_transitions']:
                action = transition['action']
                for state in (transition['state'], transition['state_']):
                    if state not in state_dict:
                        state_dict[state] = len(state_array)
                        state_array.append(state)
                if action not in action_dict:
                    action_dict[action] = len(action_array)
                    action_array.append(action)
        self.state_to_int = state_dict
        self.action_to_int = action_dict
        self.int_to_state = state_array
        self.int_to_action = action_array
================================================
FILE: reinforce/learn.py
================================================
# Explicit relative imports: this module is loaded as part of the
# `reinforce` package (see __init__.py), so absolute `from encoding import`
# style imports fail under Python 3 with ModuleNotFoundError.
from .encoding import StateActionEncoder
from .rewards import RewardParser
from .transitions import TransitionParser
from .policy import PolicyParser
class MarkovAgent:
    """Infers an MDP from observation data and solves for its optimal policy."""

    def __init__(self, observations):
        # Replace raw string states/actions with dense int codes up front so
        # the parsers below can index numpy arrays directly.
        encoder = StateActionEncoder(observations)
        encoder.observations_to_int()
        dims = encoder.parse_dimensions()

        self.state_action_encoder = encoder
        self.reward_parser = RewardParser(observations, dims)
        self.transition_parser = TransitionParser(observations, dims)
        self.policy_parser = PolicyParser(dims)

    def learn(self):
        """Solve the inferred MDP and store a readable {state: action} policy."""
        rewards = self.reward_parser.rewards()
        transition_probs = self.transition_parser.transition_probabilities()
        encoded = self.policy_parser.policy(transition_probs, rewards)
        self.policy = self.state_action_encoder.parse_encoded_policy(encoded)
================================================
FILE: reinforce/policy.py
================================================
import numpy as np
class PolicyParser:
    """Solves for a greedy policy over a finite MDP via value iteration."""

    def __init__(self, dimensions):
        # dimensions: {'state_count': int, 'action_count': int}
        self.state_count = dimensions['state_count']
        self.action_count = dimensions['action_count']

    def policy(self, P, rewards):
        """Run value iteration and return the best action index per state.

        P       -- transition probabilities, shape (S, A, S)
        rewards -- expected reward per state, shape (S,)
        Returns a length-S array where entry s is the greedy action for s.
        """
        print('COMPUTING POLICY')
        best_policy = np.zeros(self.state_count)
        state_values = np.zeros(self.state_count)
        GAMMA = 0.9        # discount factor
        ITERATIONS = 125   # fixed horizon; ample for convergence at GAMMA=0.9

        for i in range(ITERATIONS):
            print("iteration: {0} / {1}".format(i + 1, ITERATIONS))
            for state in range(self.state_count):
                state_value = -float('Inf')
                for action in range(self.action_count):
                    # Expected discounted value of `action` in `state`; the
                    # dot product replaces the original per-successor Python
                    # loop with a single C-level pass.
                    action_value = GAMMA * np.dot(P[state][action], state_values)
                    # `>=` preserves the original tie-breaking: on equal
                    # values the highest-indexed action wins.
                    if action_value >= state_value:
                        state_value = action_value
                        best_policy[state] = action
                state_values[state] = rewards[state] + state_value
        return best_policy
================================================
FILE: reinforce/rewards.py
================================================
import numpy as np
class RewardParser:
    """Computes the average per-visit reward for each int-encoded state."""

    def __init__(self, observations, dimensions):
        self.observations = observations
        self.state_count = dimensions['state_count']

    def rewards(self):
        """Return a (state_count,) array of average rewards per state.

        Each observation's reward is spread evenly across its transitions and
        credited to the visited source states; states never visited get 0.
        """
        print('COMPUTING REWARDS')
        total_state_rewards = np.zeros(self.state_count)
        total_state_visits = np.zeros(self.state_count)

        for observation in self.observations:
            transitions = observation['state_transitions']
            # An observation with no transitions carries no reward signal;
            # skipping it avoids a ZeroDivisionError below.
            if not transitions:
                continue
            reward_per_visit = observation['reward'] / float(len(transitions))
            for state_transition in transitions:
                state = state_transition['state']
                total_state_rewards[state] += reward_per_visit
                total_state_visits[state] += 1

        # Unvisited states yield 0/0 = nan; silence the RuntimeWarning and
        # map those entries to 0.
        with np.errstate(invalid='ignore'):
            average_state_rewards = total_state_rewards / total_state_visits
        return np.nan_to_num(average_state_rewards)
================================================
FILE: reinforce/transitions.py
================================================
import numpy as np
class TransitionParser:
    """Estimates transition probabilities P(s' | s, a) from observed data."""

    def __init__(self, observations, dimensions):
        self.observations = observations
        self.state_count = dimensions['state_count']
        self.action_count = dimensions['action_count']

    def transition_probabilities(self):
        """Return a (S, A, S) array of maximum-likelihood transition probs."""
        print('COMPUTING TRANSITIONS')
        transition_count = self._count_transitions()
        return self._parse_probabilities(transition_count)

    def _count_transitions(self):
        # Tally raw (state, action, state_) occurrences.
        transition_count = np.zeros((self.state_count, self.action_count, self.state_count))
        for observation in self.observations:
            for state_transition in observation['state_transitions']:
                state = state_transition['state']
                action = state_transition['action']
                state_ = state_transition['state_']
                transition_count[state][action][state_] += 1
        return transition_count

    def _parse_probabilities(self, transition_count):
        # Normalize counts row by row into probability distributions.
        P = np.zeros((self.state_count, self.action_count, self.state_count))
        for state in range(self.state_count):
            for action in range(self.action_count):
                # ndarray.sum() runs in C; the original used the (slow)
                # Python builtin sum() over the numpy row.
                total_transitions = transition_count[state][action].sum()
                if total_transitions > 0:
                    P[state][action] = transition_count[state][action] / total_transitions
                else:
                    # Unobserved (state, action) pair: assume uniform.
                    P[state][action] = 1.0 / self.state_count
        return P
================================================
FILE: setup.py
================================================
from setuptools import setup

# Packaging metadata for the `reinforce` distribution.
setup(
    name='reinforce',
    version='0.2.0',
    description='plug and play reinforcement learning',
    author='Nathan Epstein',
    author_email='ne2210@columbia.edu',
    url='http://github.com/nathanepstein/reinforce',
    license='MIT',
    packages=['reinforce'],
)