enter reinforcement

reinforcement/learningAgents.py · 258 lines · new file

@@ -0,0 +1,258 @@
# learningAgents.py
# -----------------
# Licensing Information: You are free to use or extend these projects for
# educational purposes provided that (1) you do not distribute or publish
# solutions, (2) you retain this notice, and (3) you provide clear
# attribution to UC Berkeley, including a link to http://ai.berkeley.edu.
#
# Attribution Information: The Pacman AI projects were developed at UC Berkeley.
# The core projects and autograders were primarily created by John DeNero
# (denero@cs.berkeley.edu) and Dan Klein (klein@cs.berkeley.edu).
# Student side autograding was added by Brad Miller, Nick Hay, and
# Pieter Abbeel (pabbeel@cs.berkeley.edu).


from game import Directions, Agent, Actions

import random, util, time

class ValueEstimationAgent(Agent):
    """
    Abstract agent which assigns Q-values to the (state, action)
    pairs of an environment, as well as a value to each state and
    a policy, given respectively by

    V(s) = max_{a in actions} Q(s,a)
    policy(s) = arg_max_{a in actions} Q(s,a)

    Both ValueIterationAgent and QLearningAgent inherit
    from this agent. While a ValueIterationAgent has
    a model of the environment via a MarkovDecisionProcess
    (see mdp.py) that is used to estimate Q-values before
    ever actually acting, the QLearningAgent estimates
    Q-values while acting in the environment.
    """

    def __init__(self, alpha=1.0, epsilon=0.05, gamma=0.8, numTraining=10):
        """
        Sets options, which can be passed in via the Pacman command line using -a alpha=0.5,...
        alpha       - learning rate
        epsilon     - exploration rate
        gamma       - discount factor
        numTraining - number of training episodes, i.e., learning stops after this many episodes
        """
        self.alpha = float(alpha)
        self.epsilon = float(epsilon)
        self.discount = float(gamma)
        self.numTraining = int(numTraining)
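
    # A hedged usage sketch: per the docstring above, these options come in
    # through the Pacman command line's -a flag as key=value pairs. Assuming
    # the standard driver from this project (pacman.py and the PacmanQAgent
    # subclass defined elsewhere in it), an invocation might look like:
    #
    #   python pacman.py -p PacmanQAgent -a epsilon=0.1,alpha=0.3,gamma=0.7
    #
    # which would construct the agent with epsilon=0.1, alpha=0.3, gamma=0.7.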

    ####################################
    #    Override These Functions      #
    ####################################
    def getQValue(self, state, action):
        """
        Should return Q(state,action)
        """
        util.raiseNotDefined()
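
    # A minimal sketch of one possible override (illustrative, not the
    # assigned solution), assuming the subclass stores Q-values in a
    # util.Counter self.qValues keyed by (state, action), which defaults
    # missing entries to 0.0:
    #
    #   def getQValue(self, state, action):
    #       return self.qValues[(state, action)]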

    def getValue(self, state):
        """
        What is the value of this state under the best action?
        Concretely, this is given by

        V(s) = max_{a in actions} Q(s,a)
        """
        util.raiseNotDefined()
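
    # A hedged sketch of an override, assuming a self.getLegalActions helper
    # like the one ReinforcementAgent provides below, and treating terminal
    # states (no legal actions) as worth 0.0:
    #
    #   def getValue(self, state):
    #       actions = self.getLegalActions(state)
    #       if not actions:
    #           return 0.0
    #       return max(self.getQValue(state, a) for a in actions)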

    def getPolicy(self, state):
        """
        What is the best action to take in the state? Note that because
        we might want to explore, this might not coincide with getAction.
        Concretely, this is given by

        policy(s) = arg_max_{a in actions} Q(s,a)

        If many actions achieve the maximal Q-value,
        it doesn't matter which is selected.
        """
        util.raiseNotDefined()
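
    # One possible override, sketched under the same assumptions as above;
    # max breaks ties by first occurrence, which the docstring permits:
    #
    #   def getPolicy(self, state):
    #       actions = self.getLegalActions(state)
    #       if not actions:
    #           return None
    #       return max(actions, key=lambda a: self.getQValue(state, a))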

    def getAction(self, state):
        """
        state: can call state.getLegalActions()
        Choose an action and return it.
        """
        util.raiseNotDefined()
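
    # A hedged epsilon-greedy sketch, one common way to fill this in (the
    # base class does not mandate it): explore with probability epsilon,
    # otherwise act greedily. util.flipCoin(p) returns True with probability p.
    #
    #   def getAction(self, state):
    #       actions = self.getLegalActions(state)
    #       if not actions:
    #           return None
    #       if util.flipCoin(self.epsilon):
    #           return random.choice(actions)
    #       return self.getPolicy(state)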

class ReinforcementAgent(ValueEstimationAgent):
    """
    Abstract Reinforcement Agent: A ValueEstimationAgent
    which estimates Q-values (as well as policies) from experience
    rather than a model.

    What you need to know:
        - The environment will call
          observeTransition(state, action, nextState, deltaReward),
          which will call update(state, action, nextState, deltaReward),
          which you should override.
        - Use self.getLegalActions(state) to know which actions
          are available in a state
    """
    ####################################
    #    Override These Functions      #
    ####################################

    def update(self, state, action, nextState, reward):
        """
        This class will call this function, which you write, after
        observing a transition and reward
        """
        util.raiseNotDefined()
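
    # A hedged sketch of the standard Q-learning update as one example of an
    # override (illustrative, not the assigned solution). It assumes Q-values
    # live in a util.Counter self.qValues as in the earlier sketches:
    #
    #   def update(self, state, action, nextState, reward):
    #       # sample = r + gamma * max_a' Q(s',a'); a terminal nextState
    #       # (no legal actions) contributes 0.0
    #       nextValue = max((self.getQValue(nextState, a)
    #                        for a in self.getLegalActions(nextState)),
    #                       default=0.0)
    #       sample = reward + self.discount * nextValue
    #       # blend the old estimate with the new sample at learning rate alpha
    #       old = self.qValues[(state, action)]
    #       self.qValues[(state, action)] = (1 - self.alpha) * old + self.alpha * sample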

    ####################################
    #      Read These Functions        #
    ####################################

    def getLegalActions(self, state):
        """
        Get the actions available for a given
        state. This is what you should use to
        obtain legal actions for a state
        """
        return self.actionFn(state)

    def observeTransition(self, state, action, nextState, deltaReward):
        """
        Called by environment to inform agent that a transition has
        been observed. This will result in a call to self.update
        on the same arguments.

        NOTE: Do *not* override or call this function
        """
        self.episodeRewards += deltaReward
        self.update(state, action, nextState, deltaReward)

    def startEpisode(self):
        """
        Called by environment when a new episode is starting
        """
        self.lastState = None
        self.lastAction = None
        self.episodeRewards = 0.0

    def stopEpisode(self):
        """
        Called by environment when an episode is done
        """
        if self.episodesSoFar < self.numTraining:
            self.accumTrainRewards += self.episodeRewards
        else:
            self.accumTestRewards += self.episodeRewards
        self.episodesSoFar += 1
        if self.episodesSoFar >= self.numTraining:
            # Take off the training wheels
            self.epsilon = 0.0    # no exploration
            self.alpha = 0.0      # no learning

    def isInTraining(self):
        return self.episodesSoFar < self.numTraining

    def isInTesting(self):
        return not self.isInTraining()

    def __init__(self, actionFn=None, numTraining=100, epsilon=0.5, alpha=0.5, gamma=1):
        """
        actionFn: Function which takes a state and returns the list of legal actions

        alpha       - learning rate
        epsilon     - exploration rate
        gamma       - discount factor
        numTraining - number of training episodes, i.e., learning stops after this many episodes
        """
        if actionFn is None:
            actionFn = lambda state: state.getLegalActions()
        self.actionFn = actionFn
        self.episodesSoFar = 0
        self.accumTrainRewards = 0.0
        self.accumTestRewards = 0.0
        self.numTraining = int(numTraining)
        self.epsilon = float(epsilon)
        self.alpha = float(alpha)
        self.discount = float(gamma)

    ################################
    # Controls needed for Crawler  #
    ################################
    def setEpsilon(self, epsilon):
        self.epsilon = epsilon

    def setLearningRate(self, alpha):
        self.alpha = alpha

    def setDiscount(self, discount):
        self.discount = discount

    def doAction(self, state, action):
        """
        Called by a subclass when
        an action is taken in a state
        """
        self.lastState = state
        self.lastAction = action

    ###################
    # Pacman Specific #
    ###################
    def observationFunction(self, state):
        """
        This is where we ended up after our last action.
        The simulation should somehow ensure this is called.
        """
        if self.lastState is not None:
            # the reward is the change in game score since the last action
            reward = state.getScore() - self.lastState.getScore()
            self.observeTransition(self.lastState, self.lastAction, state, reward)
        return state

    def registerInitialState(self, state):
        self.startEpisode()
        if self.episodesSoFar == 0:
            print('Beginning %d episodes of Training' % (self.numTraining))

    def final(self, state):
        """
        Called by the Pacman game at the terminal state
        """
        deltaReward = state.getScore() - self.lastState.getScore()
        self.observeTransition(self.lastState, self.lastAction, state, deltaReward)
        self.stopEpisode()

        # Make sure we have this var
        if 'episodeStartTime' not in self.__dict__:
            self.episodeStartTime = time.time()
        if 'lastWindowAccumRewards' not in self.__dict__:
            self.lastWindowAccumRewards = 0.0
        self.lastWindowAccumRewards += state.getScore()

        NUM_EPS_UPDATE = 100
        if self.episodesSoFar % NUM_EPS_UPDATE == 0:
            print('Reinforcement Learning Status:')
            windowAvg = self.lastWindowAccumRewards / float(NUM_EPS_UPDATE)
            if self.episodesSoFar <= self.numTraining:
                trainAvg = self.accumTrainRewards / float(self.episodesSoFar)
                print('\tCompleted %d out of %d training episodes' % (
                    self.episodesSoFar, self.numTraining))
                print('\tAverage Rewards over all training: %.2f' % (
                    trainAvg))
            else:
                testAvg = float(self.accumTestRewards) / (self.episodesSoFar - self.numTraining)
                print('\tCompleted %d test episodes' % (self.episodesSoFar - self.numTraining))
                print('\tAverage Rewards over testing: %.2f' % testAvg)
            print('\tAverage Rewards for last %d episodes: %.2f' % (
                NUM_EPS_UPDATE, windowAvg))
            print('\tEpisode took %.2f seconds' % (time.time() - self.episodeStartTime))
            self.lastWindowAccumRewards = 0.0
            self.episodeStartTime = time.time()

        if self.episodesSoFar == self.numTraining:
            msg = 'Training Done (turning off epsilon and alpha)'
            print('%s\n%s' % (msg, '-' * len(msg)))
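
# A hedged sketch of the episode lifecycle this file assumes, written as
# commented pseudo-driver code; the real loop lives in the Pacman game
# engine, so the env and gameOver names below are hypothetical:
#
#   agent.registerInitialState(state)        # calls startEpisode()
#   while not gameOver(state):
#       state = agent.observationFunction(state)  # observeTransition -> update
#       action = agent.getAction(state)           # subclass also calls doAction()
#       state = env.step(action)                  # hypothetical environment step
#   agent.final(state)                       # final transition + stopEpisode()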