'''
  Example Markov Decision Process Learner 
  @author: Copyright 2010 Thomas Reidemeister
  @date: 2010.02.19
  @file: rich_and_famous.py

  This program is free software: you can redistribute it and/or modify
  it under the terms of the GNU Lesser General Public License as published by
  the Free Software Foundation, either version 3 of the License, or
  (at your option) any later version.

  This program is distributed in the hope that it will be useful,
  but WITHOUT ANY WARRANTY; without even the implied warranty of
  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  GNU Lesser General Public License for more details.
  You should have received a copy of the GNU Lesser General Public License
  along with this program.  If not, see <http://www.gnu.org/licenses/>.
'''
import numpy

# State labels. A state is referred to by its index into this array.
# (Presumably poor/rich x unknown/famous, going by the file name.)
S = numpy.array(('pu', 'pf', 'ru', 'rf'))
# Action labels. They are only used for output; the learner itself
# works with their integer indices.
A = numpy.array(('advertise', 'save'))

# Expected rewards, one table per action (same order as A).
# Each table is indexed as [current state, next state].
R = [
    # action 0: advertise
    numpy.array((
        # to: pu    pf    ru   rf
        (-1.0, -1.0, 0.0, 0.0),  # from pu
        (0.0, -1.0, 0.0, 0.0),   # from pf
        (-1.0, -1.0, 0.0, 0.0),  # from ru
        (0.0, -1.0, 0.0, 0.0),   # from rf
    )),
    # action 1: save
    numpy.array((
        # to: pu   pf   ru    rf
        (0.0, 0.0, 0.0, 0.0),    # from pu
        (0.0, 0.0, 0.0, 10.0),   # from pf
        (0.0, 0.0, 10.0, 0.0),   # from ru
        (0.0, 0.0, 10.0, 10.0),  # from rf
    )),
]

# Transition probabilities, one table per action (same order as A).
# Row = current state, column = next state; each row is the
# distribution the next state is sampled from.
T = [
    # action 0: advertise
    numpy.array((
        # to: pu  pf   ru   rf
        (0.5, 0.5, 0.0, 0.0),  # from pu
        (0.0, 1.0, 0.0, 0.0),  # from pf
        (0.5, 0.5, 0.0, 0.0),  # from ru
        (0.0, 1.0, 0.0, 0.0),  # from rf
    )),
    # action 1: save
    numpy.array((
        # to: pu  pf   ru   rf
        (1.0, 0.0, 0.0, 0.0),  # from pu
        (0.5, 0.0, 0.0, 0.5),  # from pf
        (0.5, 0.0, 0.5, 0.0),  # from ru
        (0.0, 0.0, 0.5, 0.5),  # from rf
    )),
]

class QLearner:
    '''Tabular Q-learner for a small Markov Decision Process.

    Training happens inside the constructor: the learner walks the MDP
    for n steps, sampling actions from a Boltzmann (softmax)
    distribution over the Q-values of the current state while the
    temperature is lowered linearly from k towards 1.

    Attributes set by the constructor:
      Qs -- Q-table, one row per state and one column per action.
      cs -- index of the current state.
      k  -- current Boltzmann temperature.
    '''

    def __init__(self, S, A, R, T, gamma=0.95, alpha=0.1, k=10, n=50000):
        '''Creates a Q-learner for a Markov Decision Process
        and immediately trains it for n steps.
        @param S: Model states (array of strings).
        @param A: Model actions (array of strings).
        @param R: Model rewards (one array per action, indexed
                  [current state, next state]).
        @param T: Model transition function (one array per action,
                  each row is the next-state distribution).
        @param k: Boltzmann Temperature
                  (i.e. initial stupidity).
        @param gamma: Discount factor.
        @param alpha: Learning rate.
        @param n: Number of iterations.
        '''
        self.S = S
        self.A = A
        self.R = R
        self.T = T
        self.gamma = gamma
        self.k = k
        self.alpha = alpha

        # initialise all q-values to the same value (1/|S|):
        # one row per state, one column per action
        self.Qs = numpy.ones(
            (len(self.S), len(self.A)),
            dtype=float) / len(self.S)

        # randomise initial state; randint excludes its upper bound,
        # so the bound has to be len(S) for the last state to be
        # reachable as a start state
        self.cs = numpy.random.randint(0, len(self.S))

        iterations = float(n)
        while n > 0:
            self.iterate()
            # anneal linearly from k (when n == iterations) towards 1;
            # the temperature stays positive for any k > 0
            self.k = 1 + (k - 1) * n / iterations
            n -= 1

    def iterate(self):
        '''Performs one action in the current
        state and updates the q-values'''

        # decide which action to take based
        # on Boltzmann distro of q-values of
        # the current state
        dist = self.BOLTZMANN(self.k, self.Qs[self.cs])

        # pick a random action
        a = self.DECIDE(dist)

        # decide new state based on transition
        # probabilities
        tsp = self.T[a][self.cs, :]
        newState = self.DECIDE(tsp)

        # reward of the sampled transition
        r = self.R[a][self.cs, newState]

        # blend the old estimate with the observed reward plus the
        # discounted best q-value of the successor state
        self.Qs[self.cs, a] = (
            self.Qs[self.cs, a] * (1.0 - self.alpha) +
            self.alpha * (r + self.gamma * numpy.max(self.Qs[newState])))

        # move on to the sampled state
        self.cs = newState

    @staticmethod
    def DECIDE(distro):
        ''' pick an index at random given a
        probability distribution over
        discrete outcomes
        @param distro: the prob distribution (sequence of
                       probabilities)
        @return: the sampled index (int)
        @raise ValueError: if distro is empty'''
        if len(distro) == 0:
            raise ValueError('cannot sample from an empty distribution')

        r = numpy.random.rand()
        cumulative = 0.0
        for index, p in enumerate(distro):
            cumulative += p
            if r < cumulative:
                # decision made
                return index
        # r was not below the total mass (e.g. rounding error in the
        # probabilities): fall back to the last index
        return len(distro) - 1

    @staticmethod
    def BOLTZMANN(k, Qs):
        '''Computes the Boltzmann distribution
        for doing something "stupid".
        @param k: temperature (aka stupidity)
        @param Qs: array of q values of the
        current state.
        @return: array of action probabilities (sums to 1)
        '''
        scaled = numpy.asarray(Qs, dtype=float) / k

        # compute e^(q/k) for all actions; shifting by the maximum
        # leaves the normalised result unchanged but keeps exp()
        # from overflowing for large q/k
        Es = numpy.exp(scaled - numpy.max(scaled))

        # [x_j : e^[ q(a_j,s)/k ] /
        # [sum_j e^[q(a_j, s)/k]]]
        return Es / numpy.sum(Es)

    def __str__(self):
        ''' output the q-table followed by the current greedy
        strategy (best action label per state)'''
        strategy = (self.A[row.argmax()] for row in self.Qs)
        return str(self.Qs) + '\n[' + ','.join(strategy) + ']'

if __name__ == '__main__':
    # Train a Q-learner on the example MDP defined above (training runs
    # inside the constructor), then print the Q-table and the greedy
    # strategy.
    ql = QLearner(S, A, R, T)
    # the call form of print is valid on both Python 2 and Python 3
    print(ql)
