package aima.core.learning.reinforcement.agent;

import java.util.HashMap;
import java.util.Map;

import aima.core.agent.Action;
import aima.core.learning.reinforcement.PerceptStateReward;
import aima.core.util.FrequencyCounter;

/**
 * Artificial Intelligence A Modern Approach (3rd Edition): page 837.<br>
 * <br>
 *
 * <pre>
 * function PASSIVE-TD-AGENT(percept) returns an action
 *   inputs: percept, a percept indicating the current state s' and reward signal r'
 *   persistent: π, a fixed policy
 *               U, a table of utilities, initially empty
 *               Ns, a table of frequencies for states, initially zero
 *               s,a,r, the previous state, action, and reward, initially null
 *               
 *   if s' is new then U[s'] <- r'
 *   if s is not null then
 *        increment Ns[s]
 *        U[s] <- U[s] + α(Ns[s])(r + γU[s'] - U[s])
 *   if s'.TERMINAL? then s,a,r <- null else s,a,r <- s',π[s'],r'
 *   return a
 * </pre>
 *
 * Figure 21.4 A passive reinforcement learning agent that learns utility
 * estimates using temporal differences. The step-size function α(n) is
 * chosen to ensure convergence, as described in the text.
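 * <p>
 * As a concrete illustration of the update rule (the numbers here are
 * invented for this comment only): with U[s] = 0.5, U[s'] = 0.8, r = -0.04,
 * γ = 1.0 and α = 0.1, the update gives
 * U[s] <- 0.5 + 0.1 * (-0.04 + 1.0 * 0.8 - 0.5) = 0.526, i.e. U[s] is nudged
 * a tenth of the way towards the observed one-step return r + γU[s'].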
 *
 * @param <S>
 *            the state type.
 * @param <A>
 *            the action type.
 *
 * @author Ciaran O'Reilly
 * @author Ravi Mohan
 */
public class PassiveTDAgent<S, A extends Action> extends
		ReinforcementAgent<S, A> {
	// persistent: π, a fixed policy
	private Map<S, A> pi = new HashMap<S, A>();
	// U, a table of utilities, initially empty
	private Map<S, Double> U = new HashMap<S, Double>();
	// Ns, a table of frequencies for states, initially zero
	private FrequencyCounter<S> Ns = new FrequencyCounter<S>();
	// s,a,r, the previous state, action, and reward, initially null
	private S s = null;
	private A a = null;
	private Double r = null;
	//
	private double alpha = 0.0;
	private double gamma = 0.0;

	/**
	 * Constructor.
	 *
	 * @param fixedPolicy
	 *            π a fixed policy.
	 * @param alpha
	 *            a fixed learning rate.
	 * @param gamma
	 *            discount to be used.
	 */
	public PassiveTDAgent(Map<S, A> fixedPolicy, double alpha, double gamma) {
		this.pi.putAll(fixedPolicy);
		this.alpha = alpha;
		this.gamma = gamma;
	}

	/**
	 * Passive reinforcement learning that learns utility estimates using
	 * temporal differences.
	 *
	 * @param percept
	 *            a percept indicating the current state s' and reward signal
	 *            r'.
	 * @return an action
	 */
	@Override
	public A execute(PerceptStateReward<S> percept) {
		// if s' is new then U[s'] <- r'
		S sDelta = percept.state();
		double rDelta = percept.reward();
		if (!U.containsKey(sDelta)) {
			U.put(sDelta, rDelta);
		}
		// if s is not null then
		if (null != s) {
			// increment Ns[s]
			Ns.incrementFor(s);
			// U[s] <- U[s] + α(Ns[s])(r + γU[s'] - U[s])
			double U_s = U.get(s);
			U.put(s, U_s + alpha(Ns, s) * (r + gamma * U.get(sDelta) - U_s));
		}
		// if s'.TERMINAL? then s,a,r <- null else s,a,r <- s',π[s'],r'
		if (isTerminal(sDelta)) {
			s = null;
			a = null;
			r = null;
		} else {
			s = sDelta;
			a = pi.get(sDelta);
			r = rDelta;
		}
		// return a
		return a;
	}

	@Override
	public Map<S, Double> getUtility() {
		return new HashMap<S, Double>(U);
	}

	@Override
	public void reset() {
		U = new HashMap<S, Double>();
		Ns.clear();
		s = null;
		a = null;
		r = null;
	}

	//
	// PROTECTED METHODS
	//
	/**
	 * AIMA3e pg. 836: 'if we change α from a fixed parameter to a function
	 * that decreases as the number of times a state has been visited
	 * increases, then U<sup>π</sup>(s) itself will converge to the correct
	 * value.'<br>
	 * <br>
	 * Note: override this method to obtain the desired behavior.
	 *
	 * @param Ns
	 *            a frequency counter of observed states.
	 * @param s
	 *            the current state.
	 * @return the learning rate to use based on the frequency of the state
	 *         passed in.
	 */
	protected double alpha(FrequencyCounter<S> Ns, S s) {
		// Default implementation is just to return a fixed parameter value
		// irrespective of the # of times a state has been encountered
		return alpha;
	}

	//
	// PRIVATE METHODS
	//
	private boolean isTerminal(S s) {
		boolean terminal = false;
		Action a = pi.get(s);
		if (null == a || a.isNoOp()) {
			// A state in which no action is possible is considered terminal.
			terminal = true;
		}
		return terminal;
	}
}
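
// Illustrative sketch only (not part of the original AIMA class): one way to
// supply the decreasing step-size function α(n) that the alpha(...) Javadoc
// above suggests overriding for. The class name and the particular schedule
// 1/(1 + n) are assumptions made for this example; it also assumes that
// FrequencyCounter exposes getCount(state), which here is only consulted after
// the agent has already incremented the count for s.
class DecayingAlphaPassiveTDAgent<S, A extends Action> extends
		PassiveTDAgent<S, A> {

	public DecayingAlphaPassiveTDAgent(Map<S, A> fixedPolicy, double gamma) {
		// The fixed alpha passed to the superclass is never used once alpha()
		// is overridden below.
		super(fixedPolicy, 1.0, gamma);
	}

	@Override
	protected double alpha(FrequencyCounter<S> Ns, S s) {
		// Step size shrinks as state s is visited more often, satisfying the
		// usual conditions for convergence of the utility estimates.
		return 1.0 / (1.0 + Ns.getCount(s));
	}
}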