From 138a07eef6cca94f8541de47b1424d5bb9df0f85 Mon Sep 17 00:00:00 2001
From: ZhuangYumin <zhuangyumin@sjtu.edu.cn>
Date: Mon, 8 Jul 2024 17:37:53 +0800
Subject: [PATCH] rein q1

---
 .../test_cases/q1/1-tinygrid.test_output      | 132 -----------
 .../q1/2-tinygrid-noisy.test_output           | 132 -----------
 .../test_cases/q1/3-bridge.test_output        | 216 ------------------
 .../test_cases/q1/4-discountgrid.test_output  | 182 ---------------
 reinforcement/valueIterationAgents.py         |  42 ++++
 5 files changed, 42 insertions(+), 662 deletions(-)
 delete mode 100644 reinforcement/test_cases/q1/1-tinygrid.test_output
 delete mode 100644 reinforcement/test_cases/q1/2-tinygrid-noisy.test_output
 delete mode 100644 reinforcement/test_cases/q1/3-bridge.test_output
 delete mode 100644 reinforcement/test_cases/q1/4-discountgrid.test_output

diff --git a/reinforcement/test_cases/q1/1-tinygrid.test_output b/reinforcement/test_cases/q1/1-tinygrid.test_output
deleted file mode 100644
index 4824806..0000000
--- a/reinforcement/test_cases/q1/1-tinygrid.test_output
+++ /dev/null
@@ -1,132 +0,0 @@
-Values at iteration 0 are correct.
-   Student/correct solution:
- values_k_0: """
-            0.0000
-            0.0000
-            0.0000
-"""
-
-
-Q-Values at iteration 0 for action south are correct.
-   Student/correct solution:
- q_values_k_0_action_south: """
-           illegal
-            0.0000
-           illegal
-"""
-
-
-Q-Values at iteration 0 for action west are correct.
-   Student/correct solution:
- q_values_k_0_action_west: """
-           illegal
-            0.0000
-           illegal
-"""
-
-
-Q-Values at iteration 0 for action exit are correct.
-   Student/correct solution:
- q_values_k_0_action_exit: """
-          -10.0000
-           illegal
-           10.0000
-"""
-
-
-Q-Values at iteration 0 for action east are correct.
-   Student/correct solution:
- q_values_k_0_action_east: """
-           illegal
-            0.0000
-           illegal
-"""
-
-
-Q-Values at iteration 0 for action north are correct.
-   Student/correct solution:
- q_values_k_0_action_north: """
-           illegal
-            0.0000
-           illegal
-"""
-
-
-Values at iteration 1 are NOT correct.
-   Student solution:
- values_k_1: """
-            0.0000
-            0.0000
-            0.0000
-"""
-
-
-   Correct solution:
- values_k_1: """
-          -10.0000
-            0.0000
-           10.0000
-"""
-
-
-Q-Values at iteration 1 for action south are NOT correct.
-   Student solution:
- q_values_k_1_action_south: """
-           illegal
-            0.0000
-           illegal
-"""
-
-
-   Correct solution:
- q_values_k_1_action_south: """
-           illegal
-            5.0000
-           illegal
-"""
-
-
-Q-Values at iteration 1 for action west are correct.
-   Student/correct solution:
- q_values_k_1_action_west: """
-           illegal
-            0.0000
-           illegal
-"""
-
-
-Q-Values at iteration 1 for action exit are correct.
-   Student/correct solution:
- q_values_k_1_action_exit: """
-          -10.0000
-           illegal
-           10.0000
-"""
-
-
-Q-Values at iteration 1 for action east are correct.
-   Student/correct solution:
- q_values_k_1_action_east: """
-           illegal
-            0.0000
-           illegal
-"""
-
-
-Q-Values at iteration 1 for action north are NOT correct.
-   Student solution:
- q_values_k_1_action_north: """
-           illegal
-            0.0000
-           illegal
-"""
-
-
-   Correct solution:
- q_values_k_1_action_north: """
-           illegal
-           -5.0000
-           illegal
-"""
-
-
diff --git a/reinforcement/test_cases/q1/2-tinygrid-noisy.test_output b/reinforcement/test_cases/q1/2-tinygrid-noisy.test_output
deleted file mode 100644
index d7afe70..0000000
--- a/reinforcement/test_cases/q1/2-tinygrid-noisy.test_output
+++ /dev/null
@@ -1,132 +0,0 @@
-Values at iteration 0 are correct.
-   Student/correct solution:
- values_k_0: """
-            0.0000
-            0.0000
-            0.0000
-"""
-
-
-Q-Values at iteration 0 for action south are correct.
-   Student/correct solution:
- q_values_k_0_action_south: """
-           illegal
-            0.0000
-           illegal
-"""
-
-
-Q-Values at iteration 0 for action west are correct.
-   Student/correct solution:
- q_values_k_0_action_west: """
-           illegal
-            0.0000
-           illegal
-"""
-
-
-Q-Values at iteration 0 for action exit are correct.
-   Student/correct solution:
- q_values_k_0_action_exit: """
-          -10.0000
-           illegal
-           10.0000
-"""
-
-
-Q-Values at iteration 0 for action east are correct.
-   Student/correct solution:
- q_values_k_0_action_east: """
-           illegal
-            0.0000
-           illegal
-"""
-
-
-Q-Values at iteration 0 for action north are correct.
-   Student/correct solution:
- q_values_k_0_action_north: """
-           illegal
-            0.0000
-           illegal
-"""
-
-
-Values at iteration 1 are NOT correct.
-   Student solution:
- values_k_1: """
-            0.0000
-            0.0000
-            0.0000
-"""
-
-
-   Correct solution:
- values_k_1: """
-          -10.0000
-            0.0000
-           10.0000
-"""
-
-
-Q-Values at iteration 1 for action south are NOT correct.
-   Student solution:
- q_values_k_1_action_south: """
-           illegal
-            0.0000
-           illegal
-"""
-
-
-   Correct solution:
- q_values_k_1_action_south: """
-           illegal
-            5.6250
-           illegal
-"""
-
-
-Q-Values at iteration 1 for action west are correct.
-   Student/correct solution:
- q_values_k_1_action_west: """
-           illegal
-            0.0000
-           illegal
-"""
-
-
-Q-Values at iteration 1 for action exit are correct.
-   Student/correct solution:
- q_values_k_1_action_exit: """
-          -10.0000
-           illegal
-           10.0000
-"""
-
-
-Q-Values at iteration 1 for action east are correct.
-   Student/correct solution:
- q_values_k_1_action_east: """
-           illegal
-            0.0000
-           illegal
-"""
-
-
-Q-Values at iteration 1 for action north are NOT correct.
-   Student solution:
- q_values_k_1_action_north: """
-           illegal
-            0.0000
-           illegal
-"""
-
-
-   Correct solution:
- q_values_k_1_action_north: """
-           illegal
-           -5.6250
-           illegal
-"""
-
-
diff --git a/reinforcement/test_cases/q1/3-bridge.test_output b/reinforcement/test_cases/q1/3-bridge.test_output
deleted file mode 100644
index 3cbda5d..0000000
--- a/reinforcement/test_cases/q1/3-bridge.test_output
+++ /dev/null
@@ -1,216 +0,0 @@
-Values at iteration 0 are correct.
-   Student/correct solution:
- values_k_0: """
-        __________       0.0000   __________
-            0.0000       0.0000       0.0000
-            0.0000       0.0000       0.0000
-            0.0000       0.0000       0.0000
-            0.0000       0.0000       0.0000
-            0.0000       0.0000       0.0000
-        __________       0.0000   __________
-"""
-
-
-Q-Values at iteration 0 for action south are correct.
-   Student/correct solution:
- q_values_k_0_action_south: """
-        __________      illegal   __________
-           illegal       0.0000      illegal
-           illegal       0.0000      illegal
-           illegal       0.0000      illegal
-           illegal       0.0000      illegal
-           illegal       0.0000      illegal
-        __________      illegal   __________
-"""
-
-
-Q-Values at iteration 0 for action west are correct.
-   Student/correct solution:
- q_values_k_0_action_west: """
-        __________      illegal   __________
-           illegal       0.0000      illegal
-           illegal       0.0000      illegal
-           illegal       0.0000      illegal
-           illegal       0.0000      illegal
-           illegal       0.0000      illegal
-        __________      illegal   __________
-"""
-
-
-Q-Values at iteration 0 for action exit are correct.
-   Student/correct solution:
- q_values_k_0_action_exit: """
-        __________      10.0000   __________
-         -100.0000      illegal    -100.0000
-         -100.0000      illegal    -100.0000
-         -100.0000      illegal    -100.0000
-         -100.0000      illegal    -100.0000
-         -100.0000      illegal    -100.0000
-        __________       1.0000   __________
-"""
-
-
-Q-Values at iteration 0 for action east are correct.
-   Student/correct solution:
- q_values_k_0_action_east: """
-        __________      illegal   __________
-           illegal       0.0000      illegal
-           illegal       0.0000      illegal
-           illegal       0.0000      illegal
-           illegal       0.0000      illegal
-           illegal       0.0000      illegal
-        __________      illegal   __________
-"""
-
-
-Q-Values at iteration 0 for action north are correct.
-   Student/correct solution:
- q_values_k_0_action_north: """
-        __________      illegal   __________
-           illegal       0.0000      illegal
-           illegal       0.0000      illegal
-           illegal       0.0000      illegal
-           illegal       0.0000      illegal
-           illegal       0.0000      illegal
-        __________      illegal   __________
-"""
-
-
-Values at iteration 1 are NOT correct.
-   Student solution:
- values_k_1: """
-        __________       0.0000   __________
-            0.0000       0.0000       0.0000
-            0.0000       0.0000       0.0000
-            0.0000       0.0000       0.0000
-            0.0000       0.0000       0.0000
-            0.0000       0.0000       0.0000
-        __________       0.0000   __________
-"""
-
-
-   Correct solution:
- values_k_1: """
-        __________      10.0000   __________
-         -100.0000       0.0000    -100.0000
-         -100.0000       0.0000    -100.0000
-         -100.0000       0.0000    -100.0000
-         -100.0000       0.0000    -100.0000
-         -100.0000       0.0000    -100.0000
-        __________       1.0000   __________
-"""
-
-
-Q-Values at iteration 1 for action south are NOT correct.
-   Student solution:
- q_values_k_1_action_south: """
-        __________      illegal   __________
-           illegal       0.0000      illegal
-           illegal       0.0000      illegal
-           illegal       0.0000      illegal
-           illegal       0.0000      illegal
-           illegal       0.0000      illegal
-        __________      illegal   __________
-"""
-
-
-   Correct solution:
- q_values_k_1_action_south: """
-        __________      illegal   __________
-           illegal      -8.5000      illegal
-           illegal      -8.5000      illegal
-           illegal      -8.5000      illegal
-           illegal      -8.5000      illegal
-           illegal      -7.7350      illegal
-        __________      illegal   __________
-"""
-
-
-Q-Values at iteration 1 for action west are NOT correct.
-   Student solution:
- q_values_k_1_action_west: """
-        __________      illegal   __________
-           illegal       0.0000      illegal
-           illegal       0.0000      illegal
-           illegal       0.0000      illegal
-           illegal       0.0000      illegal
-           illegal       0.0000      illegal
-        __________      illegal   __________
-"""
-
-
-   Correct solution:
- q_values_k_1_action_west: """
-        __________      illegal   __________
-           illegal     -76.0750      illegal
-           illegal     -76.5000      illegal
-           illegal     -76.5000      illegal
-           illegal     -76.5000      illegal
-           illegal     -76.4575      illegal
-        __________      illegal   __________
-"""
-
-
-Q-Values at iteration 1 for action exit are correct.
-   Student/correct solution:
- q_values_k_1_action_exit: """
-        __________      10.0000   __________
-         -100.0000      illegal    -100.0000
-         -100.0000      illegal    -100.0000
-         -100.0000      illegal    -100.0000
-         -100.0000      illegal    -100.0000
-         -100.0000      illegal    -100.0000
-        __________       1.0000   __________
-"""
-
-
-Q-Values at iteration 1 for action east are NOT correct.
-   Student solution:
- q_values_k_1_action_east: """
-        __________      illegal   __________
-           illegal       0.0000      illegal
-           illegal       0.0000      illegal
-           illegal       0.0000      illegal
-           illegal       0.0000      illegal
-           illegal       0.0000      illegal
-        __________      illegal   __________
-"""
-
-
-   Correct solution:
- q_values_k_1_action_east: """
-        __________      illegal   __________
-           illegal     -76.0750      illegal
-           illegal     -76.5000      illegal
-           illegal     -76.5000      illegal
-           illegal     -76.5000      illegal
-           illegal     -76.4575      illegal
-        __________      illegal   __________
-"""
-
-
-Q-Values at iteration 1 for action north are NOT correct.
-   Student solution:
- q_values_k_1_action_north: """
-        __________      illegal   __________
-           illegal       0.0000      illegal
-           illegal       0.0000      illegal
-           illegal       0.0000      illegal
-           illegal       0.0000      illegal
-           illegal       0.0000      illegal
-        __________      illegal   __________
-"""
-
-
-   Correct solution:
- q_values_k_1_action_north: """
-        __________      illegal   __________
-           illegal      -0.8500      illegal
-           illegal      -8.5000      illegal
-           illegal      -8.5000      illegal
-           illegal      -8.5000      illegal
-           illegal      -8.5000      illegal
-        __________      illegal   __________
-"""
-
-
diff --git a/reinforcement/test_cases/q1/4-discountgrid.test_output b/reinforcement/test_cases/q1/4-discountgrid.test_output
deleted file mode 100644
index fbc2ce8..0000000
--- a/reinforcement/test_cases/q1/4-discountgrid.test_output
+++ /dev/null
@@ -1,182 +0,0 @@
-Values at iteration 0 are correct.
-   Student/correct solution:
- values_k_0: """
-            0.0000       0.0000       0.0000       0.0000       0.0000
-            0.0000       0.0000   __________       0.0000       0.0000
-            0.0000       0.0000       0.0000       0.0000       0.0000
-            0.0000       0.0000   __________   __________       0.0000
-            0.0000       0.0000       0.0000       0.0000       0.0000
-"""
-
-
-Q-Values at iteration 0 for action south are correct.
-   Student/correct solution:
- q_values_k_0_action_south: """
-           illegal       0.0000      illegal       0.0000       0.0000
-           illegal       0.0000   __________       0.0000       0.0000
-           illegal       0.0000      illegal       0.0000       0.0000
-           illegal       0.0000   __________   __________       0.0000
-           illegal       0.0000       0.0000       0.0000       0.0000
-"""
-
-
-Q-Values at iteration 0 for action west are correct.
-   Student/correct solution:
- q_values_k_0_action_west: """
-           illegal       0.0000      illegal       0.0000       0.0000
-           illegal       0.0000   __________       0.0000       0.0000
-           illegal       0.0000      illegal       0.0000       0.0000
-           illegal       0.0000   __________   __________       0.0000
-           illegal       0.0000       0.0000       0.0000       0.0000
-"""
-
-
-Q-Values at iteration 0 for action exit are correct.
-   Student/correct solution:
- q_values_k_0_action_exit: """
-          -10.0000      illegal      10.0000      illegal      illegal
-          -10.0000      illegal   __________      illegal      illegal
-          -10.0000      illegal       1.0000      illegal      illegal
-          -10.0000      illegal   __________   __________      illegal
-          -10.0000      illegal      illegal      illegal      illegal
-"""
-
-
-Q-Values at iteration 0 for action east are correct.
-   Student/correct solution:
- q_values_k_0_action_east: """
-           illegal       0.0000      illegal       0.0000       0.0000
-           illegal       0.0000   __________       0.0000       0.0000
-           illegal       0.0000      illegal       0.0000       0.0000
-           illegal       0.0000   __________   __________       0.0000
-           illegal       0.0000       0.0000       0.0000       0.0000
-"""
-
-
-Q-Values at iteration 0 for action north are correct.
-   Student/correct solution:
- q_values_k_0_action_north: """
-           illegal       0.0000      illegal       0.0000       0.0000
-           illegal       0.0000   __________       0.0000       0.0000
-           illegal       0.0000      illegal       0.0000       0.0000
-           illegal       0.0000   __________   __________       0.0000
-           illegal       0.0000       0.0000       0.0000       0.0000
-"""
-
-
-Values at iteration 1 are NOT correct.
-   Student solution:
- values_k_1: """
-            0.0000       0.0000       0.0000       0.0000       0.0000
-            0.0000       0.0000   __________       0.0000       0.0000
-            0.0000       0.0000       0.0000       0.0000       0.0000
-            0.0000       0.0000   __________   __________       0.0000
-            0.0000       0.0000       0.0000       0.0000       0.0000
-"""
-
-
-   Correct solution:
- values_k_1: """
-          -10.0000       0.0000      10.0000       0.0000       0.0000
-          -10.0000       0.0000   __________       0.0000       0.0000
-          -10.0000       0.0000       1.0000       0.0000       0.0000
-          -10.0000       0.0000   __________   __________       0.0000
-          -10.0000       0.0000       0.0000       0.0000       0.0000
-"""
-
-
-Q-Values at iteration 1 for action south are NOT correct.
-   Student solution:
- q_values_k_1_action_south: """
-           illegal       0.0000      illegal       0.0000       0.0000
-           illegal       0.0000   __________       0.0000       0.0000
-           illegal       0.0000      illegal       0.0000       0.0000
-           illegal       0.0000   __________   __________       0.0000
-           illegal       0.0000       0.0000       0.0000       0.0000
-"""
-
-
-   Correct solution:
- q_values_k_1_action_south: """
-           illegal       0.0000      illegal       0.9000       0.0000
-           illegal      -0.9000   __________       0.0000       0.0000
-           illegal      -0.8100      illegal       0.0900       0.0000
-           illegal      -0.9000   __________   __________       0.0000
-           illegal      -0.9000       0.0000       0.0000       0.0000
-"""
-
-
-Q-Values at iteration 1 for action west are NOT correct.
-   Student solution:
- q_values_k_1_action_west: """
-           illegal       0.0000      illegal       0.0000       0.0000
-           illegal       0.0000   __________       0.0000       0.0000
-           illegal       0.0000      illegal       0.0000       0.0000
-           illegal       0.0000   __________   __________       0.0000
-           illegal       0.0000       0.0000       0.0000       0.0000
-"""
-
-
-   Correct solution:
- q_values_k_1_action_west: """
-           illegal      -7.2000      illegal       7.2000       0.0000
-           illegal      -7.2000   __________       0.0000       0.0000
-           illegal      -7.2000      illegal       0.7200       0.0000
-           illegal      -7.2000   __________   __________       0.0000
-           illegal      -7.2000       0.0000       0.0000       0.0000
-"""
-
-
-Q-Values at iteration 1 for action exit are correct.
-   Student/correct solution:
- q_values_k_1_action_exit: """
-          -10.0000      illegal      10.0000      illegal      illegal
-          -10.0000      illegal   __________      illegal      illegal
-          -10.0000      illegal       1.0000      illegal      illegal
-          -10.0000      illegal   __________   __________      illegal
-          -10.0000      illegal      illegal      illegal      illegal
-"""
-
-
-Q-Values at iteration 1 for action east are NOT correct.
-   Student solution:
- q_values_k_1_action_east: """
-           illegal       0.0000      illegal       0.0000       0.0000
-           illegal       0.0000   __________       0.0000       0.0000
-           illegal       0.0000      illegal       0.0000       0.0000
-           illegal       0.0000   __________   __________       0.0000
-           illegal       0.0000       0.0000       0.0000       0.0000
-"""
-
-
-   Correct solution:
- q_values_k_1_action_east: """
-           illegal       7.2000      illegal       0.0000       0.0000
-           illegal       0.0000   __________       0.0000       0.0000
-           illegal       0.7200      illegal       0.0000       0.0000
-           illegal       0.0000   __________   __________       0.0000
-           illegal       0.0000       0.0000       0.0000       0.0000
-"""
-
-
-Q-Values at iteration 1 for action north are NOT correct.
-   Student solution:
- q_values_k_1_action_north: """
-           illegal       0.0000      illegal       0.0000       0.0000
-           illegal       0.0000   __________       0.0000       0.0000
-           illegal       0.0000      illegal       0.0000       0.0000
-           illegal       0.0000   __________   __________       0.0000
-           illegal       0.0000       0.0000       0.0000       0.0000
-"""
-
-
-   Correct solution:
- q_values_k_1_action_north: """
-           illegal       0.0000      illegal       0.9000       0.0000
-           illegal      -0.9000   __________       0.0000       0.0000
-           illegal      -0.8100      illegal       0.0900       0.0000
-           illegal      -0.9000   __________   __________       0.0000
-           illegal      -0.9000       0.0000       0.0000       0.0000
-"""
-
-
diff --git a/reinforcement/valueIterationAgents.py b/reinforcement/valueIterationAgents.py
index 6d0afd3..c987461 100644
--- a/reinforcement/valueIterationAgents.py
+++ b/reinforcement/valueIterationAgents.py
@@ -65,6 +65,35 @@ class ValueIterationAgent(ValueEstimationAgent):
           value iteration, V_k+1(...) depends on V_k(...)'s.
         """
         "*** YOUR CODE HERE ***"
+        # Write value iteration code here
+        # Hints:
+        # - After each iteration, store the new values in self.values
+        # - When updating a value, use self.values[state] = <new value>
+        # - You will need to copy the state values into a separate dictionary
+        #   to avoid changing values before computing the update.
+        # - No convergence test (|V_k+1(s) - V_k(s)| < epsilon) is applied here;
+        #   the loop below always runs exactly self.iterations sweeps over all states
+        # - Make sure to use the discount factor self.discount
+        # - Make sure to use the Bellman equations to update the state values
+        # - The number of iterations is given by self.iterations
+        # - You may use the util.Counter() class
+        # - You may also use the self.mdp.getTransitionStatesAndProbs(state, action) method
+        # - You may also use the self.mdp.getReward(state, action, nextState) method
+        # - You may also use the self.mdp.getPossibleActions(state) method
+        # - You may also use the self.mdp.isTerminal(state) method
+
+        for i in range(self.iterations):
+            newValues = util.Counter()
+            for state in self.mdp.getStates():
+                if self.mdp.isTerminal(state):
+                    newValues[state] = 0
+                else:
+                    maxQValue = float("-inf")
+                    for action in self.mdp.getPossibleActions(state):
+                        qValue = self.computeQValueFromValues(state, action)
+                        maxQValue = max(maxQValue, qValue)
+                    newValues[state] = maxQValue
+            self.values = newValues
 
 
     def getValue(self, state):
@@ -79,6 +108,11 @@ class ValueIterationAgent(ValueEstimationAgent):
           value function stored in self.values.
         """
         "*** YOUR CODE HERE ***"
+        qValue = 0
+        for nextState, prob in self.mdp.getTransitionStatesAndProbs(state, action):
+            reward = self.mdp.getReward(state, action, nextState)
+            qValue += prob * (reward + self.discount * self.values[nextState])
+        return qValue
  
 
     def computeActionFromValues(self, state):
@@ -91,6 +125,14 @@ class ValueIterationAgent(ValueEstimationAgent):
           terminal state, you should return None.
         """
         "*** YOUR CODE HERE ***"
+        bestAction = None
+        bestQValue = float("-inf")
+        for action in self.mdp.getPossibleActions(state):
+            qValue = self.computeQValueFromValues(state, action)
+            if qValue > bestQValue:
+                bestQValue = qValue
+                bestAction = action
+        return bestAction
 
 
     def getPolicy(self, state):