rein q1
@@ -65,6 +65,35 @@ class ValueIterationAgent(ValueEstimationAgent):
           value iteration, V_k+1(...) depends on V_k(...)'s.
         """
         "*** YOUR CODE HERE ***"
+        # Write value iteration code here
+        # Hints:
+        # - After each iteration, store the new values in self.values
+        # - When updating a value, use self.values[state] = <new value>
+        # - You will need to copy the state values into a separate dictionary
+        #   to avoid changing values before computing the update.
+        # - The difference between the new value and the old value (|V_k+1(s) - V_k(s)|)
+        #   should be less than self.epsilon for all states s
+        # - Make sure to use the discount factor self.discount
+        # - Make sure to use the Bellman equations to update the state values
+        # - The number of iterations is given by self.iterations
+        # - You may use the util.Counter() class
+        # - You may also use the self.mdp.getTransitionStatesAndProbs(state, action) method
+        # - You may also use the self.mdp.getReward(state, action, nextState) method
+        # - You may also use the self.mdp.getPossibleActions(state) method
+        # - You may also use the self.mdp.isTerminal(state) method
+
+        for i in range(self.iterations):
+            newValues = util.Counter()
+            for state in self.mdp.getStates():
+                if self.mdp.isTerminal(state):
+                    newValues[state] = 0
+                else:
+                    maxQValue = float("-inf")
+                    for action in self.mdp.getPossibleActions(state):
+                        qValue = self.computeQValueFromValues(state, action)
+                        maxQValue = max(maxQValue, qValue)
+                    newValues[state] = maxQValue
+            self.values = newValues


     def getValue(self, state):
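Note: the loop above implements the batch ("synchronous") Bellman backup the hints describe. Each sweep computes every new value from the frozen previous-sweep values and only swaps them in at the end:

    V_k+1(s) = max_a  sum_{s'}  T(s, a, s') * ( R(s, a, s') + discount * V_k(s') )

Writing each sweep into a fresh util.Counter() (newValues) and assigning it to self.values only after all states are processed is what keeps the update batch rather than in-place.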
@@ -79,6 +108,11 @@ class ValueIterationAgent(ValueEstimationAgent):
           value function stored in self.values.
         """
         "*** YOUR CODE HERE ***"
+        qValue = 0
+        for nextState, prob in self.mdp.getTransitionStatesAndProbs(state, action):
+            reward = self.mdp.getReward(state, action, nextState)
+            qValue += prob * (reward + self.discount * self.values[nextState])
+        return qValue


     def computeActionFromValues(self, state):
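A quick sanity check of computeQValueFromValues with made-up numbers (illustrative only, not from the project): suppose action a from state s reaches s1 with probability 0.8 (reward 0, V(s1) = 1.0) and s2 with probability 0.2 (reward 0, V(s2) = 0.0), with discount 0.9. Then

    Q(s, a) = 0.8 * (0 + 0.9 * 1.0) + 0.2 * (0 + 0.9 * 0.0) = 0.72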
@@ -91,6 +125,14 @@ class ValueIterationAgent(ValueEstimationAgent):
           terminal state, you should return None.
         """
         "*** YOUR CODE HERE ***"
+        bestAction = None
+        bestQValue = float("-inf")
+        for action in self.mdp.getPossibleActions(state):
+            qValue = self.computeQValueFromValues(state, action)
+            if qValue > bestQValue:
+                bestQValue = qValue
+                bestAction = action
+        return bestAction


     def getPolicy(self, state):
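A minimal usage sketch (the ValueIterationAgent(mdp, discount, iterations) constructor signature and mdp.getStartState() are assumptions here; neither appears in this diff):

    # Hypothetical usage; the constructor signature and mdp.getStartState()
    # are assumed, not shown in the diff above.
    agent = ValueIterationAgent(mdp, discount=0.9, iterations=100)
    start = mdp.getStartState()
    print(agent.getValue(start))                 # V(start) after 100 sweeps
    print(agent.computeActionFromValues(start))  # greedy action, argmax_a Q(start, a)

Note that computeActionFromValues breaks ties in favor of the first action returned by getPossibleActions, since the comparison is strict (>); for a terminal state the loop body never runs and None is returned, as the docstring requires.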