[
  {
    "path": ".gitignore",
    "content": "*.pyc\n*.egg-info\ndist\n.DS_Store\n"
  },
  {
    "path": "README.md",
    "content": "# reinforce\n\n<img src=\"./MDP.png\">\n\nA 'plug and play' reinforcement learning library in Python.\n\nInfers a Markov Decision Process from data and solves for the optimal policy.\n\nThe implementation is based on Andrew Ng's <a href=\"https://web.cs.wpi.edu/~kmlee/cs539/cs229-notes12.pdf\">notes</a>.\n\nMore information related to this project can be found <a href=\"https://github.com/NathanEpstein/pydata-reinforce\">here</a>.\n\n## Example Usage\n\n```python\n\nobservations = [\n  { 'state_transitions': [\n      { 'state': 'low', 'action': 'climb', 'state_': 'mid' },\n      { 'state': 'mid', 'action': 'climb', 'state_': 'high' },\n      { 'state': 'high', 'action': 'sink', 'state_': 'mid' },\n      { 'state': 'mid', 'action': 'sink', 'state_': 'low' },\n      { 'state': 'low', 'action': 'sink', 'state_': 'bottom' }\n    ],\n    'reward': 0\n  },\n  { 'state_transitions': [\n      { 'state': 'low', 'action': 'climb', 'state_': 'mid' },\n      { 'state': 'mid', 'action': 'climb', 'state_': 'high' },\n      { 'state': 'high', 'action': 'climb', 'state_': 'top' },\n    ],\n    'reward': 0\n  }\n]\n\ntrap_states = [\n  {\n    'state_transitions': [\n      { 'state': 'bottom', 'action': 'sink', 'state_': 'bottom' },\n      { 'state': 'bottom', 'action': 'climb', 'state_': 'bottom' }\n    ],\n    'reward': 0\n  },\n  {\n    'state_transitions': [\n      { 'state': 'top', 'action': 'sink', 'state_': 'top' },\n      { 'state': 'top', 'action': 'climb', 'state_': 'top' },\n    ],\n    'reward': 1\n  },\n]\n\nfrom learn import MarkovAgent\nmark = MarkovAgent(observations + trap_states)\nmark.learn()\n\nprint(mark.policy)\n# {'high': 'climb', 'top': 'sink', 'bottom': 'sink', 'low': 'climb', 'mid': 'climb'}\n# NOTE: the action chosen in the top and bottom states is arbitrary (no action there changes the state)\n\n```\n"
  },
  {
    "path": "reinforce/__init__.py",
    "content": "from .learn import *"
  },
  {
    "path": "reinforce/encoding.py",
    "content": "class StateActionEncoder:\n  \"\"\"Maps the state and action labels found in observations to integer codes.\n\n  Source states and actions are numbered in order of first appearance; states\n  that only ever appear as a destination are numbered after them.\n  \"\"\"\n\n  def __init__(self, observations):\n    self.observations = observations\n    self._parse_states_and_actions()\n\n  def parse_dimensions(self):\n    \"\"\"Return the number of distinct states and actions.\"\"\"\n    return {\n      'state_count': len(self.int_to_state),\n      'action_count': len(self.int_to_action)\n    }\n\n  def observations_to_int(self):\n    \"\"\"Replace the state and action labels in self.observations with their codes.\n\n    The transition dicts are overwritten in place.\n    \"\"\"\n    for observation in self.observations:\n      for transition in observation['state_transitions']:\n        transition['state'] = self.state_to_int[transition['state']]\n        transition['state_'] = self.state_to_int[transition['state_']]\n        transition['action'] = self.action_to_int[transition['action']]\n\n  def parse_encoded_policy(self, encoded_policy):\n    \"\"\"Turn a sequence of action codes, indexed by state code, into a {state: action} dict.\"\"\"\n    policy = {}\n    for index, encoded_action in enumerate(encoded_policy):\n      state = self.int_to_state[index]\n      # int() because the codes may arrive as floats\n      action = self.int_to_action[int(encoded_action)]\n      policy[state] = action\n\n    return policy\n\n  def _parse_states_and_actions(self):\n    \"\"\"Build the label-to-code dicts and the code-to-label lists.\"\"\"\n    state_dict, action_dict = {}, {}\n    state_array, action_array = [], []\n\n    for observation in self.observations:\n      for transition in observation['state_transitions']:\n        state = transition['state']\n        action = transition['action']\n\n        if state not in state_dict:\n          state_dict[state] = len(state_array)\n          state_array.append(state)\n\n        if action not in action_dict:\n          action_dict[action] = len(action_array)\n          action_array.append(action)\n\n    # Second pass: a state that only appears as a destination still needs a\n    # code, otherwise observations_to_int raises KeyError on it. Running this\n    # after the first pass leaves the codes of all source states unchanged.\n    for observation in self.observations:\n      for transition in observation['state_transitions']:\n        state_ = transition['state_']\n\n        if state_ not in state_dict:\n          state_dict[state_] = len(state_array)\n          state_array.append(state_)\n\n    self.state_to_int = state_dict\n    self.action_to_int = action_dict\n    self.int_to_state = state_array\n    self.int_to_action = action_array\n"
  },
  {
    "path": "reinforce/learn.py",
    "content": "# Use package-relative imports when this module is loaded as part of a\n# package: on Python 3 a plain `from encoding import ...` is an absolute import\n# and cannot find the sibling modules there. Keep the plain imports for when\n# this file is loaded as a top-level module from inside its own directory.\nif __package__:\n  from .encoding import StateActionEncoder\n  from .rewards import RewardParser\n  from .transitions import TransitionParser\n  from .policy import PolicyParser\nelse:\n  from encoding import StateActionEncoder\n  from rewards import RewardParser\n  from transitions import TransitionParser\n  from policy import PolicyParser\n\nclass MarkovAgent:\n  \"\"\"Learns a policy from observed state transitions and rewards.\"\"\"\n\n  def __init__(self, observations):\n    \"\"\"Encode the observations and set up the reward, transition and policy parsers.\n\n    observations is a list of dicts with 'state_transitions' and 'reward' keys.\n    The encoder rewrites the transition dicts in place, so the parsers below\n    receive the int-encoded data.\n    \"\"\"\n    # encode observation data as int values\n    self.state_action_encoder = StateActionEncoder(observations)\n    self.state_action_encoder.observations_to_int()\n    dimensions = self.state_action_encoder.parse_dimensions()\n\n    # create reward, transition, and policy parsers\n    self.reward_parser = RewardParser(observations, dimensions)\n    self.transition_parser = TransitionParser(observations, dimensions)\n    self.policy_parser = PolicyParser(dimensions)\n\n  def learn(self):\n    \"\"\"Compute the policy and store it on self.policy as a {state: action} dict.\"\"\"\n    R = self.reward_parser.rewards()\n    P = self.transition_parser.transition_probabilities()\n\n    # learn int-encoded policy and convert to readable dictionary\n    encoded_policy = self.policy_parser.policy(P, R)\n    self.policy = self.state_action_encoder.parse_encoded_policy(encoded_policy)\n"
  },
  {
    "path": "reinforce/policy.py",
    "content": "import numpy as np\n\nclass PolicyParser:\n  \"\"\"Derives a policy from transition probabilities and rewards by value iteration.\"\"\"\n\n  def __init__(self, dimensions):\n    self.state_count = dimensions['state_count']\n    self.action_count = dimensions['action_count']\n\n  def policy(self, P, rewards, gamma=0.9, iterations=125):\n    \"\"\"Run value iteration and return the best action code for each state.\n\n    P is indexed as P[state][action][next_state]; rewards is indexed by state.\n    gamma is the discount factor and iterations the number of sweeps over the\n    states; the defaults are the values that were previously hard-coded.\n    Returns a float numpy array of length state_count holding action codes.\n    \"\"\"\n    print('COMPUTING POLICY')\n\n    best_policy = np.zeros(self.state_count)\n    state_values = np.zeros(self.state_count)\n\n    for i in range(iterations):\n      print (\"iteration: {0} / {1}\".format(i + 1, iterations))\n\n      for state in range(0, self.state_count):\n        state_value = -float('Inf')\n\n        for action in range(0, self.action_count):\n          action_value = 0\n\n          for state_ in range(0, self.state_count):\n            action_value += (P[state][action][state_] * state_values[state_] * gamma)\n\n          # >= means a tie goes to the highest-numbered action\n          if (action_value >= state_value):\n            state_value = action_value\n            best_policy[state] = action\n\n        # written in place, so later states in this sweep already see the new value\n        state_values[state] = rewards[state] + state_value\n\n    return best_policy"
  },
  {
    "path": "reinforce/rewards.py",
    "content": "import numpy as np\n\nclass RewardParser:\n  \"\"\"Estimates a per-state reward from the reward attached to each observation.\"\"\"\n\n  def __init__(self, observations, dimensions):\n    self.observations = observations\n    self.state_count = dimensions['state_count']\n\n  def rewards(self):\n    \"\"\"Return a numpy array with the average reward credited to each state.\n\n    Each observation's reward is split evenly across its transitions and\n    credited to the source state of each one. The 'state' values are used as\n    array indices. Observations without transitions are skipped, and states\n    that were never visited get 0.\n    \"\"\"\n    print('COMPUTING REWARDS')\n    total_state_rewards = np.zeros(self.state_count)\n    total_state_visits = np.zeros(self.state_count)\n\n    for observation in self.observations:\n      transitions = observation['state_transitions']\n      if len(transitions) == 0:\n        # no state to credit; also avoids dividing the reward by zero\n        continue\n\n      reward_per_visit = observation['reward'] / float(len(transitions))\n\n      for state_transition in transitions:\n        state = state_transition['state']\n        total_state_rewards[state] += reward_per_visit\n        total_state_visits[state] += 1\n\n    # an unvisited state divides 0 by 0; silence that warning and let\n    # nan_to_num turn the resulting nan into 0\n    with np.errstate(divide='ignore', invalid='ignore'):\n      average_state_rewards = total_state_rewards / total_state_visits\n    average_state_rewards = np.nan_to_num(average_state_rewards)\n\n    return average_state_rewards"
  },
  {
    "path": "reinforce/transitions.py",
    "content": "import numpy as np\n\nclass TransitionParser:\n  \"\"\"Estimates transition probabilities from observed (state, action, next state) counts.\"\"\"\n\n  def __init__(self, observations, dimensions):\n    self.observations = observations\n    self.state_count = dimensions['state_count']\n    self.action_count = dimensions['action_count']\n\n  def transition_probabilities(self):\n    \"\"\"Count the observed transitions and normalise them into probabilities.\"\"\"\n    print('COMPUTING TRANSITIONS')\n    counts = self._count_transitions()\n    probabilities = self._parse_probabilities(counts)\n    return probabilities\n\n  def _count_transitions(self):\n    \"\"\"Return an array whose [s][a][s_] entry is how often action a in state s led to s_.\"\"\"\n    shape = (self.state_count, self.action_count, self.state_count)\n    counts = np.zeros(shape)\n\n    for observation in self.observations:\n      for step in observation['state_transitions']:\n        source = step['state']\n        chosen = step['action']\n        target = step['state_']\n        counts[source][chosen][target] += 1\n\n    return counts\n\n  def _parse_probabilities(self, transition_count):\n    \"\"\"Normalise each (state, action) row of counts; a row with no counts becomes uniform.\"\"\"\n    shape = (self.state_count, self.action_count, self.state_count)\n    P = np.zeros(shape)\n\n    for state in range(self.state_count):\n      for action in range(self.action_count):\n        row = transition_count[state][action]\n        total_transitions = float(sum(row))\n\n        if not (total_transitions > 0):\n          # never observed: treat every next state as equally likely\n          P[state][action] = 1.0 / self.state_count\n        else:\n          P[state][action] = row / total_transitions\n\n    return P"
  },
  {
    "path": "setup.py",
    "content": "from setuptools import setup\n\nsetup(name='reinforce',\n      version='0.2.0',\n      description='plug and play reinforcement learning',\n      url='http://github.com/nathanepstein/reinforce',\n      author='Nathan Epstein',\n      author_email='ne2210@columbia.edu',\n      license='MIT',\n      packages=['reinforce'],\n      # the package's modules import numpy, so declare it for installers\n      install_requires=['numpy'],\n      )"
  }
]