rein q1

2024-07-08 17:37:53 +08:00
parent 09c85f8942
commit 138a07eef6
5 changed files with 42 additions and 662 deletions
--- a/reinforcement/test_cases/q1/1-tinygrid.test_output
+++ b/reinforcement/test_cases/q1/1-tinygrid.test_output
@@ -1,132 +0,0 @@
-Values at iteration 0 are correct.
-   Student/correct solution:
- values_k_0: """
-            0.0000
-            0.0000
-            0.0000
-"""
-
-
-Q-Values at iteration 0 for action south are correct.
-   Student/correct solution:
- q_values_k_0_action_south: """
-           illegal
-            0.0000
-           illegal
-"""
-
-
-Q-Values at iteration 0 for action west are correct.
-   Student/correct solution:
- q_values_k_0_action_west: """
-           illegal
-            0.0000
-           illegal
-"""
-
-
-Q-Values at iteration 0 for action exit are correct.
-   Student/correct solution:
- q_values_k_0_action_exit: """
-          -10.0000
-           illegal
-           10.0000
-"""
-
-
-Q-Values at iteration 0 for action east are correct.
-   Student/correct solution:
- q_values_k_0_action_east: """
-           illegal
-            0.0000
-           illegal
-"""
-
-
-Q-Values at iteration 0 for action north are correct.
-   Student/correct solution:
- q_values_k_0_action_north: """
-           illegal
-            0.0000
-           illegal
-"""
-
-
-Values at iteration 1 are NOT correct.
-   Student solution:
- values_k_1: """
-            0.0000
-            0.0000
-            0.0000
-"""
-
-
-   Correct solution:
- values_k_1: """
-          -10.0000
-            0.0000
-           10.0000
-"""
-
-
-Q-Values at iteration 1 for action south are NOT correct.
-   Student solution:
- q_values_k_1_action_south: """
-           illegal
-            0.0000
-           illegal
-"""
-
-
-   Correct solution:
- q_values_k_1_action_south: """
-           illegal
-            5.0000
-           illegal
-"""
-
-
-Q-Values at iteration 1 for action west are correct.
-   Student/correct solution:
- q_values_k_1_action_west: """
-           illegal
-            0.0000
-           illegal
-"""
-
-
-Q-Values at iteration 1 for action exit are correct.
-   Student/correct solution:
- q_values_k_1_action_exit: """
-          -10.0000
-           illegal
-           10.0000
-"""
-
-
-Q-Values at iteration 1 for action east are correct.
-   Student/correct solution:
- q_values_k_1_action_east: """
-           illegal
-            0.0000
-           illegal
-"""
-
-
-Q-Values at iteration 1 for action north are NOT correct.
-   Student solution:
- q_values_k_1_action_north: """
-           illegal
-            0.0000
-           illegal
-"""
-
-
-   Correct solution:
- q_values_k_1_action_north: """
-           illegal
-           -5.0000
-           illegal
-"""
-
-
--- a/reinforcement/test_cases/q1/2-tinygrid-noisy.test_output
+++ b/reinforcement/test_cases/q1/2-tinygrid-noisy.test_output
@@ -1,132 +0,0 @@
-Values at iteration 0 are correct.
-   Student/correct solution:
- values_k_0: """
-            0.0000
-            0.0000
-            0.0000
-"""
-
-
-Q-Values at iteration 0 for action south are correct.
-   Student/correct solution:
- q_values_k_0_action_south: """
-           illegal
-            0.0000
-           illegal
-"""
-
-
-Q-Values at iteration 0 for action west are correct.
-   Student/correct solution:
- q_values_k_0_action_west: """
-           illegal
-            0.0000
-           illegal
-"""
-
-
-Q-Values at iteration 0 for action exit are correct.
-   Student/correct solution:
- q_values_k_0_action_exit: """
-          -10.0000
-           illegal
-           10.0000
-"""
-
-
-Q-Values at iteration 0 for action east are correct.
-   Student/correct solution:
- q_values_k_0_action_east: """
-           illegal
-            0.0000
-           illegal
-"""
-
-
-Q-Values at iteration 0 for action north are correct.
-   Student/correct solution:
- q_values_k_0_action_north: """
-           illegal
-            0.0000
-           illegal
-"""
-
-
-Values at iteration 1 are NOT correct.
-   Student solution:
- values_k_1: """
-            0.0000
-            0.0000
-            0.0000
-"""
-
-
-   Correct solution:
- values_k_1: """
-          -10.0000
-            0.0000
-           10.0000
-"""
-
-
-Q-Values at iteration 1 for action south are NOT correct.
-   Student solution:
- q_values_k_1_action_south: """
-           illegal
-            0.0000
-           illegal
-"""
-
-
-   Correct solution:
- q_values_k_1_action_south: """
-           illegal
-            5.6250
-           illegal
-"""
-
-
-Q-Values at iteration 1 for action west are correct.
-   Student/correct solution:
- q_values_k_1_action_west: """
-           illegal
-            0.0000
-           illegal
-"""
-
-
-Q-Values at iteration 1 for action exit are correct.
-   Student/correct solution:
- q_values_k_1_action_exit: """
-          -10.0000
-           illegal
-           10.0000
-"""
-
-
-Q-Values at iteration 1 for action east are correct.
-   Student/correct solution:
- q_values_k_1_action_east: """
-           illegal
-            0.0000
-           illegal
-"""
-
-
-Q-Values at iteration 1 for action north are NOT correct.
-   Student solution:
- q_values_k_1_action_north: """
-           illegal
-            0.0000
-           illegal
-"""
-
-
-   Correct solution:
- q_values_k_1_action_north: """
-           illegal
-           -5.6250
-           illegal
-"""
-
-
--- a/reinforcement/test_cases/q1/3-bridge.test_output
+++ b/reinforcement/test_cases/q1/3-bridge.test_output
@@ -1,216 +0,0 @@
-Values at iteration 0 are correct.
-   Student/correct solution:
- values_k_0: """
-        __________       0.0000   __________
-            0.0000       0.0000       0.0000
-            0.0000       0.0000       0.0000
-            0.0000       0.0000       0.0000
-            0.0000       0.0000       0.0000
-            0.0000       0.0000       0.0000
-        __________       0.0000   __________
-"""
-
-
-Q-Values at iteration 0 for action south are correct.
-   Student/correct solution:
- q_values_k_0_action_south: """
-        __________      illegal   __________
-           illegal       0.0000      illegal
-           illegal       0.0000      illegal
-           illegal       0.0000      illegal
-           illegal       0.0000      illegal
-           illegal       0.0000      illegal
-        __________      illegal   __________
-"""
-
-
-Q-Values at iteration 0 for action west are correct.
-   Student/correct solution:
- q_values_k_0_action_west: """
-        __________      illegal   __________
-           illegal       0.0000      illegal
-           illegal       0.0000      illegal
-           illegal       0.0000      illegal
-           illegal       0.0000      illegal
-           illegal       0.0000      illegal
-        __________      illegal   __________
-"""
-
-
-Q-Values at iteration 0 for action exit are correct.
-   Student/correct solution:
- q_values_k_0_action_exit: """
-        __________      10.0000   __________
-         -100.0000      illegal    -100.0000
-         -100.0000      illegal    -100.0000
-         -100.0000      illegal    -100.0000
-         -100.0000      illegal    -100.0000
-         -100.0000      illegal    -100.0000
-        __________       1.0000   __________
-"""
-
-
-Q-Values at iteration 0 for action east are correct.
-   Student/correct solution:
- q_values_k_0_action_east: """
-        __________      illegal   __________
-           illegal       0.0000      illegal
-           illegal       0.0000      illegal
-           illegal       0.0000      illegal
-           illegal       0.0000      illegal
-           illegal       0.0000      illegal
-        __________      illegal   __________
-"""
-
-
-Q-Values at iteration 0 for action north are correct.
-   Student/correct solution:
- q_values_k_0_action_north: """
-        __________      illegal   __________
-           illegal       0.0000      illegal
-           illegal       0.0000      illegal
-           illegal       0.0000      illegal
-           illegal       0.0000      illegal
-           illegal       0.0000      illegal
-        __________      illegal   __________
-"""
-
-
-Values at iteration 1 are NOT correct.
-   Student solution:
- values_k_1: """
-        __________       0.0000   __________
-            0.0000       0.0000       0.0000
-            0.0000       0.0000       0.0000
-            0.0000       0.0000       0.0000
-            0.0000       0.0000       0.0000
-            0.0000       0.0000       0.0000
-        __________       0.0000   __________
-"""
-
-
-   Correct solution:
- values_k_1: """
-        __________      10.0000   __________
-         -100.0000       0.0000    -100.0000
-         -100.0000       0.0000    -100.0000
-         -100.0000       0.0000    -100.0000
-         -100.0000       0.0000    -100.0000
-         -100.0000       0.0000    -100.0000
-        __________       1.0000   __________
-"""
-
-
-Q-Values at iteration 1 for action south are NOT correct.
-   Student solution:
- q_values_k_1_action_south: """
-        __________      illegal   __________
-           illegal       0.0000      illegal
-           illegal       0.0000      illegal
-           illegal       0.0000      illegal
-           illegal       0.0000      illegal
-           illegal       0.0000      illegal
-        __________      illegal   __________
-"""
-
-
-   Correct solution:
- q_values_k_1_action_south: """
-        __________      illegal   __________
-           illegal      -8.5000      illegal
-           illegal      -8.5000      illegal
-           illegal      -8.5000      illegal
-           illegal      -8.5000      illegal
-           illegal      -7.7350      illegal
-        __________      illegal   __________
-"""
-
-
-Q-Values at iteration 1 for action west are NOT correct.
-   Student solution:
- q_values_k_1_action_west: """
-        __________      illegal   __________
-           illegal       0.0000      illegal
-           illegal       0.0000      illegal
-           illegal       0.0000      illegal
-           illegal       0.0000      illegal
-           illegal       0.0000      illegal
-        __________      illegal   __________
-"""
-
-
-   Correct solution:
- q_values_k_1_action_west: """
-        __________      illegal   __________
-           illegal     -76.0750      illegal
-           illegal     -76.5000      illegal
-           illegal     -76.5000      illegal
-           illegal     -76.5000      illegal
-           illegal     -76.4575      illegal
-        __________      illegal   __________
-"""
-
-
-Q-Values at iteration 1 for action exit are correct.
-   Student/correct solution:
- q_values_k_1_action_exit: """
-        __________      10.0000   __________
-         -100.0000      illegal    -100.0000
-         -100.0000      illegal    -100.0000
-         -100.0000      illegal    -100.0000
-         -100.0000      illegal    -100.0000
-         -100.0000      illegal    -100.0000
-        __________       1.0000   __________
-"""
-
-
-Q-Values at iteration 1 for action east are NOT correct.
-   Student solution:
- q_values_k_1_action_east: """
-        __________      illegal   __________
-           illegal       0.0000      illegal
-           illegal       0.0000      illegal
-           illegal       0.0000      illegal
-           illegal       0.0000      illegal
-           illegal       0.0000      illegal
-        __________      illegal   __________
-"""
-
-
-   Correct solution:
- q_values_k_1_action_east: """
-        __________      illegal   __________
-           illegal     -76.0750      illegal
-           illegal     -76.5000      illegal
-           illegal     -76.5000      illegal
-           illegal     -76.5000      illegal
-           illegal     -76.4575      illegal
-        __________      illegal   __________
-"""
-
-
-Q-Values at iteration 1 for action north are NOT correct.
-   Student solution:
- q_values_k_1_action_north: """
-        __________      illegal   __________
-           illegal       0.0000      illegal
-           illegal       0.0000      illegal
-           illegal       0.0000      illegal
-           illegal       0.0000      illegal
-           illegal       0.0000      illegal
-        __________      illegal   __________
-"""
-
-
-   Correct solution:
- q_values_k_1_action_north: """
-        __________      illegal   __________
-           illegal      -0.8500      illegal
-           illegal      -8.5000      illegal
-           illegal      -8.5000      illegal
-           illegal      -8.5000      illegal
-           illegal      -8.5000      illegal
-        __________      illegal   __________
-"""
-
-
--- a/reinforcement/test_cases/q1/4-discountgrid.test_output
+++ b/reinforcement/test_cases/q1/4-discountgrid.test_output
@@ -1,182 +0,0 @@
-Values at iteration 0 are correct.
-   Student/correct solution:
- values_k_0: """
-            0.0000       0.0000       0.0000       0.0000       0.0000
-            0.0000       0.0000   __________       0.0000       0.0000
-            0.0000       0.0000       0.0000       0.0000       0.0000
-            0.0000       0.0000   __________   __________       0.0000
-            0.0000       0.0000       0.0000       0.0000       0.0000
-"""
-
-
-Q-Values at iteration 0 for action south are correct.
-   Student/correct solution:
- q_values_k_0_action_south: """
-           illegal       0.0000      illegal       0.0000       0.0000
-           illegal       0.0000   __________       0.0000       0.0000
-           illegal       0.0000      illegal       0.0000       0.0000
-           illegal       0.0000   __________   __________       0.0000
-           illegal       0.0000       0.0000       0.0000       0.0000
-"""
-
-
-Q-Values at iteration 0 for action west are correct.
-   Student/correct solution:
- q_values_k_0_action_west: """
-           illegal       0.0000      illegal       0.0000       0.0000
-           illegal       0.0000   __________       0.0000       0.0000
-           illegal       0.0000      illegal       0.0000       0.0000
-           illegal       0.0000   __________   __________       0.0000
-           illegal       0.0000       0.0000       0.0000       0.0000
-"""
-
-
-Q-Values at iteration 0 for action exit are correct.
-   Student/correct solution:
- q_values_k_0_action_exit: """
-          -10.0000      illegal      10.0000      illegal      illegal
-          -10.0000      illegal   __________      illegal      illegal
-          -10.0000      illegal       1.0000      illegal      illegal
-          -10.0000      illegal   __________   __________      illegal
-          -10.0000      illegal      illegal      illegal      illegal
-"""
-
-
-Q-Values at iteration 0 for action east are correct.
-   Student/correct solution:
- q_values_k_0_action_east: """
-           illegal       0.0000      illegal       0.0000       0.0000
-           illegal       0.0000   __________       0.0000       0.0000
-           illegal       0.0000      illegal       0.0000       0.0000
-           illegal       0.0000   __________   __________       0.0000
-           illegal       0.0000       0.0000       0.0000       0.0000
-"""
-
-
-Q-Values at iteration 0 for action north are correct.
-   Student/correct solution:
- q_values_k_0_action_north: """
-           illegal       0.0000      illegal       0.0000       0.0000
-           illegal       0.0000   __________       0.0000       0.0000
-           illegal       0.0000      illegal       0.0000       0.0000
-           illegal       0.0000   __________   __________       0.0000
-           illegal       0.0000       0.0000       0.0000       0.0000
-"""
-
-
-Values at iteration 1 are NOT correct.
-   Student solution:
- values_k_1: """
-            0.0000       0.0000       0.0000       0.0000       0.0000
-            0.0000       0.0000   __________       0.0000       0.0000
-            0.0000       0.0000       0.0000       0.0000       0.0000
-            0.0000       0.0000   __________   __________       0.0000
-            0.0000       0.0000       0.0000       0.0000       0.0000
-"""
-
-
-   Correct solution:
- values_k_1: """
-          -10.0000       0.0000      10.0000       0.0000       0.0000
-          -10.0000       0.0000   __________       0.0000       0.0000
-          -10.0000       0.0000       1.0000       0.0000       0.0000
-          -10.0000       0.0000   __________   __________       0.0000
-          -10.0000       0.0000       0.0000       0.0000       0.0000
-"""
-
-
-Q-Values at iteration 1 for action south are NOT correct.
-   Student solution:
- q_values_k_1_action_south: """
-           illegal       0.0000      illegal       0.0000       0.0000
-           illegal       0.0000   __________       0.0000       0.0000
-           illegal       0.0000      illegal       0.0000       0.0000
-           illegal       0.0000   __________   __________       0.0000
-           illegal       0.0000       0.0000       0.0000       0.0000
-"""
-
-
-   Correct solution:
- q_values_k_1_action_south: """
-           illegal       0.0000      illegal       0.9000       0.0000
-           illegal      -0.9000   __________       0.0000       0.0000
-           illegal      -0.8100      illegal       0.0900       0.0000
-           illegal      -0.9000   __________   __________       0.0000
-           illegal      -0.9000       0.0000       0.0000       0.0000
-"""
-
-
-Q-Values at iteration 1 for action west are NOT correct.
-   Student solution:
- q_values_k_1_action_west: """
-           illegal       0.0000      illegal       0.0000       0.0000
-           illegal       0.0000   __________       0.0000       0.0000
-           illegal       0.0000      illegal       0.0000       0.0000
-           illegal       0.0000   __________   __________       0.0000
-           illegal       0.0000       0.0000       0.0000       0.0000
-"""
-
-
-   Correct solution:
- q_values_k_1_action_west: """
-           illegal      -7.2000      illegal       7.2000       0.0000
-           illegal      -7.2000   __________       0.0000       0.0000
-           illegal      -7.2000      illegal       0.7200       0.0000
-           illegal      -7.2000   __________   __________       0.0000
-           illegal      -7.2000       0.0000       0.0000       0.0000
-"""
-
-
-Q-Values at iteration 1 for action exit are correct.
-   Student/correct solution:
- q_values_k_1_action_exit: """
-          -10.0000      illegal      10.0000      illegal      illegal
-          -10.0000      illegal   __________      illegal      illegal
-          -10.0000      illegal       1.0000      illegal      illegal
-          -10.0000      illegal   __________   __________      illegal
-          -10.0000      illegal      illegal      illegal      illegal
-"""
-
-
-Q-Values at iteration 1 for action east are NOT correct.
-   Student solution:
- q_values_k_1_action_east: """
-           illegal       0.0000      illegal       0.0000       0.0000
-           illegal       0.0000   __________       0.0000       0.0000
-           illegal       0.0000      illegal       0.0000       0.0000
-           illegal       0.0000   __________   __________       0.0000
-           illegal       0.0000       0.0000       0.0000       0.0000
-"""
-
-
-   Correct solution:
- q_values_k_1_action_east: """
-           illegal       7.2000      illegal       0.0000       0.0000
-           illegal       0.0000   __________       0.0000       0.0000
-           illegal       0.7200      illegal       0.0000       0.0000
-           illegal       0.0000   __________   __________       0.0000
-           illegal       0.0000       0.0000       0.0000       0.0000
-"""
-
-
-Q-Values at iteration 1 for action north are NOT correct.
-   Student solution:
- q_values_k_1_action_north: """
-           illegal       0.0000      illegal       0.0000       0.0000
-           illegal       0.0000   __________       0.0000       0.0000
-           illegal       0.0000      illegal       0.0000       0.0000
-           illegal       0.0000   __________   __________       0.0000
-           illegal       0.0000       0.0000       0.0000       0.0000
-"""
-
-
-   Correct solution:
- q_values_k_1_action_north: """
-           illegal       0.0000      illegal       0.9000       0.0000
-           illegal      -0.9000   __________       0.0000       0.0000
-           illegal      -0.8100      illegal       0.0900       0.0000
-           illegal      -0.9000   __________   __________       0.0000
-           illegal      -0.9000       0.0000       0.0000       0.0000
-"""
-
-
--- a/reinforcement/valueIterationAgents.py
+++ b/reinforcement/valueIterationAgents.py
@@ -65,6 +65,35 @@ class ValueIterationAgent(ValueEstimationAgent):
          value iteration, V_k+1(...) depends on V_k(...)'s.
        """
        "*** YOUR CODE HERE ***"
+        # Write value iteration code here
+        # Hints:
+        # - After each iteration, store the new values in self.values
+        # - When updating a value, use self.values[state] = <new value>
+        # - You will need to copy the state values into a separate dictionary
+        #   to avoid changing values before computing the update.
+        # - No convergence test (|V_k+1(s) - V_k(s)| < epsilon) is applied here:
+        #   the loop below always runs exactly self.iterations sweeps
+        # - Make sure to use the discount factor self.discount
+        # - Update each state with the Bellman optimality backup (max over actions of Q)
+        # - The number of iterations is given by self.iterations
+        # - You may use the util.Counter() class
+        # - You may also use the self.mdp.getTransitionStatesAndProbs(state, action) method
+        # - You may also use the self.mdp.getReward(state, action, nextState) method
+        # - You may also use the self.mdp.getPossibleActions(state) method
+        # - You may also use the self.mdp.isTerminal(state) method
+
+        for i in range(self.iterations):
+            newValues = util.Counter()
+            for state in self.mdp.getStates():
+                if self.mdp.isTerminal(state):
+                    newValues[state] = 0
+                else:
+                    maxQValue = float("-inf")
+                    for action in self.mdp.getPossibleActions(state):
+                        qValue = self.computeQValueFromValues(state, action)
+                        maxQValue = max(maxQValue, qValue)
+                    newValues[state] = maxQValue
+            self.values = newValues


    def getValue(self, state):
@@ -79,6 +108,11 @@ class ValueIterationAgent(ValueEstimationAgent):
          value function stored in self.values.
        """
        "*** YOUR CODE HERE ***"
+        qValue = 0
+        for nextState, prob in self.mdp.getTransitionStatesAndProbs(state, action):
+            reward = self.mdp.getReward(state, action, nextState)
+            qValue += prob * (reward + self.discount * self.values[nextState])
+        return qValue
 

    def computeActionFromValues(self, state):
@@ -91,6 +125,14 @@ class ValueIterationAgent(ValueEstimationAgent):
          terminal state, you should return None.
        """
        "*** YOUR CODE HERE ***"
+        bestAction = None
+        bestQValue = float("-inf")
+        for action in self.mdp.getPossibleActions(state):
+            qValue = self.computeQValueFromValues(state, action)
+            if qValue > bestQValue:
+                bestQValue = qValue
+                bestAction = action
+        return bestAction


    def getPolicy(self, state):