From 675529e9dc564a7c551b85d6ab0f67c2233757f1 Mon Sep 17 00:00:00 2001
From: ZhuangYumin <zhuangyumin@sjtu.edu.cn>
Date: Mon, 8 Jul 2024 18:50:23 +0800
Subject: [PATCH] reinforcement q4: implement QLearningAgent methods, delete q3 test_output files

---
 reinforcement/qlearningAgents.py              |  29 ++++-
 .../test_cases/q3/1-tinygrid.test_output      |  70 -----------
 .../q3/2-tinygrid-noisy.test_output           |  70 -----------
 .../test_cases/q3/3-bridge.test_output        | 110 ------------------
 .../test_cases/q3/4-discountgrid.test_output  |  90 --------------
 5 files changed, 28 insertions(+), 341 deletions(-)
 delete mode 100644 reinforcement/test_cases/q3/1-tinygrid.test_output
 delete mode 100644 reinforcement/test_cases/q3/2-tinygrid-noisy.test_output
 delete mode 100644 reinforcement/test_cases/q3/3-bridge.test_output
 delete mode 100644 reinforcement/test_cases/q3/4-discountgrid.test_output

diff --git a/reinforcement/qlearningAgents.py b/reinforcement/qlearningAgents.py
index 322125d..73c1ae5 100644
--- a/reinforcement/qlearningAgents.py
+++ b/reinforcement/qlearningAgents.py
@@ -58,7 +58,10 @@ class QLearningAgent(ReinforcementAgent):
           or the Q node value otherwise
         """
         "*** YOUR CODE HERE ***"
-
+        # Pairs absent from self.qVals get the default Q-value of 0.0.
+        if (state, action) not in self.qVals:
+            return 0.0
+        return self.qVals[(state, action)]
 
 
     def computeValueFromQValues(self, state):
@@ -69,6 +72,12 @@ class QLearningAgent(ReinforcementAgent):
           terminal state, you should return a value of 0.0.
         """
         "*** YOUR CODE HERE ***"
+        # A state with no legal actions (e.g. a terminal state) is worth 0.0.
+        actions = self.getLegalActions(state)
+        if not actions:
+            return 0.0
+        # Otherwise the state's value is the best Q-value over its legal actions.
+        return max(self.getQValue(state, a) for a in actions)
         
 
     def computeActionFromQValues(self, state):
@@ -78,6 +87,14 @@ class QLearningAgent(ReinforcementAgent):
           you should return None.
         """
         "*** YOUR CODE HERE ***"
+        actions = self.getLegalActions(state)
+        if not actions:
+            # No legal actions (terminal state): there is nothing to choose.
+            return None
+        bestValue = self.computeValueFromQValues(state)
+        # Collect every action tied for the best Q-value and pick one at random.
+        tied = [a for a in actions if self.getQValue(state, a) == bestValue]
+        return random.choice(tied)
         
 
     def getAction(self, state):
@@ -94,6 +111,13 @@ class QLearningAgent(ReinforcementAgent):
         legalActions = self.getLegalActions(state)
         action = None
         "*** YOUR CODE HERE ***"
+        if not legalActions:
+            return action  # still None here: no legal actions in this state
+        # Epsilon-greedy choice; assumes util.flipCoin(p) is True with probability p.
+        explore = util.flipCoin(self.epsilon)
+        if explore:
+            return random.choice(legalActions)
+        return self.computeActionFromQValues(state)
 
 
     def update(self, state, action, nextState, reward: float):
@@ -105,6 +129,9 @@ class QLearningAgent(ReinforcementAgent):
           it will be called on your behalf
         """
         "*** YOUR CODE HERE ***"
+        oldQ = self.getQValue(state, action)  # current estimate for this pair
+        target = reward + self.discount * self.computeValueFromQValues(nextState)  # reward plus discounted value of nextState
+        self.qVals[(state, action)] = (1 - self.alpha) * oldQ + self.alpha * target  # blend old estimate with target by alpha
  
 
     def getPolicy(self, state):
diff --git a/reinforcement/test_cases/q3/1-tinygrid.test_output b/reinforcement/test_cases/q3/1-tinygrid.test_output
deleted file mode 100644
index 689d16d..0000000
--- a/reinforcement/test_cases/q3/1-tinygrid.test_output
+++ /dev/null
@@ -1,70 +0,0 @@
-Q-Values at iteration 0 for action 'south' are NOT correct.   Student solution:
-	q_values_k_0_action_south: """
-           illegal
-           illegal
-           illegal
-"""
-
-   Correct solution:
-	q_values_k_0_action_south: """
-           illegal
-            0.0000
-           illegal
-"""
-
-Q-Values at iteration 0 for action 'west' are NOT correct.   Student solution:
-	q_values_k_0_action_west: """
-           illegal
-           illegal
-           illegal
-"""
-
-   Correct solution:
-	q_values_k_0_action_west: """
-           illegal
-            0.0000
-           illegal
-"""
-
-Q-Values at iteration 0 for action 'exit' are NOT correct.   Student solution:
-	q_values_k_0_action_exit: """
-           illegal
-           illegal
-           illegal
-"""
-
-   Correct solution:
-	q_values_k_0_action_exit: """
-            0.0000
-           illegal
-            0.0000
-"""
-
-Q-Values at iteration 0 for action 'east' are NOT correct.   Student solution:
-	q_values_k_0_action_east: """
-           illegal
-           illegal
-           illegal
-"""
-
-   Correct solution:
-	q_values_k_0_action_east: """
-           illegal
-            0.0000
-           illegal
-"""
-
-Q-Values at iteration 0 for action 'north' are NOT correct.   Student solution:
-	q_values_k_0_action_north: """
-           illegal
-           illegal
-           illegal
-"""
-
-   Correct solution:
-	q_values_k_0_action_north: """
-           illegal
-            0.0000
-           illegal
-"""
-
diff --git a/reinforcement/test_cases/q3/2-tinygrid-noisy.test_output b/reinforcement/test_cases/q3/2-tinygrid-noisy.test_output
deleted file mode 100644
index 689d16d..0000000
--- a/reinforcement/test_cases/q3/2-tinygrid-noisy.test_output
+++ /dev/null
@@ -1,70 +0,0 @@
-Q-Values at iteration 0 for action 'south' are NOT correct.   Student solution:
-	q_values_k_0_action_south: """
-           illegal
-           illegal
-           illegal
-"""
-
-   Correct solution:
-	q_values_k_0_action_south: """
-           illegal
-            0.0000
-           illegal
-"""
-
-Q-Values at iteration 0 for action 'west' are NOT correct.   Student solution:
-	q_values_k_0_action_west: """
-           illegal
-           illegal
-           illegal
-"""
-
-   Correct solution:
-	q_values_k_0_action_west: """
-           illegal
-            0.0000
-           illegal
-"""
-
-Q-Values at iteration 0 for action 'exit' are NOT correct.   Student solution:
-	q_values_k_0_action_exit: """
-           illegal
-           illegal
-           illegal
-"""
-
-   Correct solution:
-	q_values_k_0_action_exit: """
-            0.0000
-           illegal
-            0.0000
-"""
-
-Q-Values at iteration 0 for action 'east' are NOT correct.   Student solution:
-	q_values_k_0_action_east: """
-           illegal
-           illegal
-           illegal
-"""
-
-   Correct solution:
-	q_values_k_0_action_east: """
-           illegal
-            0.0000
-           illegal
-"""
-
-Q-Values at iteration 0 for action 'north' are NOT correct.   Student solution:
-	q_values_k_0_action_north: """
-           illegal
-           illegal
-           illegal
-"""
-
-   Correct solution:
-	q_values_k_0_action_north: """
-           illegal
-            0.0000
-           illegal
-"""
-
diff --git a/reinforcement/test_cases/q3/3-bridge.test_output b/reinforcement/test_cases/q3/3-bridge.test_output
deleted file mode 100644
index 4603d4e..0000000
--- a/reinforcement/test_cases/q3/3-bridge.test_output
+++ /dev/null
@@ -1,110 +0,0 @@
-Q-Values at iteration 0 for action 'south' are NOT correct.   Student solution:
-	q_values_k_0_action_south: """
-        __________      illegal   __________
-           illegal      illegal      illegal
-           illegal      illegal      illegal
-           illegal      illegal      illegal
-           illegal      illegal      illegal
-           illegal      illegal      illegal
-        __________      illegal   __________
-"""
-
-   Correct solution:
-	q_values_k_0_action_south: """
-        __________      illegal   __________
-           illegal       0.0000      illegal
-           illegal       0.0000      illegal
-           illegal       0.0000      illegal
-           illegal       0.0000      illegal
-           illegal       0.0000      illegal
-        __________      illegal   __________
-"""
-
-Q-Values at iteration 0 for action 'west' are NOT correct.   Student solution:
-	q_values_k_0_action_west: """
-        __________      illegal   __________
-           illegal      illegal      illegal
-           illegal      illegal      illegal
-           illegal      illegal      illegal
-           illegal      illegal      illegal
-           illegal      illegal      illegal
-        __________      illegal   __________
-"""
-
-   Correct solution:
-	q_values_k_0_action_west: """
-        __________      illegal   __________
-           illegal       0.0000      illegal
-           illegal       0.0000      illegal
-           illegal       0.0000      illegal
-           illegal       0.0000      illegal
-           illegal       0.0000      illegal
-        __________      illegal   __________
-"""
-
-Q-Values at iteration 0 for action 'exit' are NOT correct.   Student solution:
-	q_values_k_0_action_exit: """
-        __________      illegal   __________
-           illegal      illegal      illegal
-           illegal      illegal      illegal
-           illegal      illegal      illegal
-           illegal      illegal      illegal
-           illegal      illegal      illegal
-        __________      illegal   __________
-"""
-
-   Correct solution:
-	q_values_k_0_action_exit: """
-        __________       0.0000   __________
-            0.0000      illegal       0.0000
-            0.0000      illegal       0.0000
-            0.0000      illegal       0.0000
-            0.0000      illegal       0.0000
-            0.0000      illegal       0.0000
-        __________       0.0000   __________
-"""
-
-Q-Values at iteration 0 for action 'east' are NOT correct.   Student solution:
-	q_values_k_0_action_east: """
-        __________      illegal   __________
-           illegal      illegal      illegal
-           illegal      illegal      illegal
-           illegal      illegal      illegal
-           illegal      illegal      illegal
-           illegal      illegal      illegal
-        __________      illegal   __________
-"""
-
-   Correct solution:
-	q_values_k_0_action_east: """
-        __________      illegal   __________
-           illegal       0.0000      illegal
-           illegal       0.0000      illegal
-           illegal       0.0000      illegal
-           illegal       0.0000      illegal
-           illegal       0.0000      illegal
-        __________      illegal   __________
-"""
-
-Q-Values at iteration 0 for action 'north' are NOT correct.   Student solution:
-	q_values_k_0_action_north: """
-        __________      illegal   __________
-           illegal      illegal      illegal
-           illegal      illegal      illegal
-           illegal      illegal      illegal
-           illegal      illegal      illegal
-           illegal      illegal      illegal
-        __________      illegal   __________
-"""
-
-   Correct solution:
-	q_values_k_0_action_north: """
-        __________      illegal   __________
-           illegal       0.0000      illegal
-           illegal       0.0000      illegal
-           illegal       0.0000      illegal
-           illegal       0.0000      illegal
-           illegal       0.0000      illegal
-        __________      illegal   __________
-"""
-
diff --git a/reinforcement/test_cases/q3/4-discountgrid.test_output b/reinforcement/test_cases/q3/4-discountgrid.test_output
deleted file mode 100644
index 1cd1d98..0000000
--- a/reinforcement/test_cases/q3/4-discountgrid.test_output
+++ /dev/null
@@ -1,90 +0,0 @@
-Q-Values at iteration 0 for action 'south' are NOT correct.   Student solution:
-	q_values_k_0_action_south: """
-           illegal      illegal      illegal      illegal      illegal
-           illegal      illegal   __________      illegal      illegal
-           illegal      illegal      illegal      illegal      illegal
-           illegal      illegal   __________   __________      illegal
-           illegal      illegal      illegal      illegal      illegal
-"""
-
-   Correct solution:
-	q_values_k_0_action_south: """
-           illegal       0.0000      illegal       0.0000       0.0000
-           illegal       0.0000   __________       0.0000       0.0000
-           illegal       0.0000      illegal       0.0000       0.0000
-           illegal       0.0000   __________   __________       0.0000
-           illegal       0.0000       0.0000       0.0000       0.0000
-"""
-
-Q-Values at iteration 0 for action 'west' are NOT correct.   Student solution:
-	q_values_k_0_action_west: """
-           illegal      illegal      illegal      illegal      illegal
-           illegal      illegal   __________      illegal      illegal
-           illegal      illegal      illegal      illegal      illegal
-           illegal      illegal   __________   __________      illegal
-           illegal      illegal      illegal      illegal      illegal
-"""
-
-   Correct solution:
-	q_values_k_0_action_west: """
-           illegal       0.0000      illegal       0.0000       0.0000
-           illegal       0.0000   __________       0.0000       0.0000
-           illegal       0.0000      illegal       0.0000       0.0000
-           illegal       0.0000   __________   __________       0.0000
-           illegal       0.0000       0.0000       0.0000       0.0000
-"""
-
-Q-Values at iteration 0 for action 'exit' are NOT correct.   Student solution:
-	q_values_k_0_action_exit: """
-           illegal      illegal      illegal      illegal      illegal
-           illegal      illegal   __________      illegal      illegal
-           illegal      illegal      illegal      illegal      illegal
-           illegal      illegal   __________   __________      illegal
-           illegal      illegal      illegal      illegal      illegal
-"""
-
-   Correct solution:
-	q_values_k_0_action_exit: """
-            0.0000      illegal       0.0000      illegal      illegal
-            0.0000      illegal   __________      illegal      illegal
-            0.0000      illegal       0.0000      illegal      illegal
-            0.0000      illegal   __________   __________      illegal
-            0.0000      illegal      illegal      illegal      illegal
-"""
-
-Q-Values at iteration 0 for action 'east' are NOT correct.   Student solution:
-	q_values_k_0_action_east: """
-           illegal      illegal      illegal      illegal      illegal
-           illegal      illegal   __________      illegal      illegal
-           illegal      illegal      illegal      illegal      illegal
-           illegal      illegal   __________   __________      illegal
-           illegal      illegal      illegal      illegal      illegal
-"""
-
-   Correct solution:
-	q_values_k_0_action_east: """
-           illegal       0.0000      illegal       0.0000       0.0000
-           illegal       0.0000   __________       0.0000       0.0000
-           illegal       0.0000      illegal       0.0000       0.0000
-           illegal       0.0000   __________   __________       0.0000
-           illegal       0.0000       0.0000       0.0000       0.0000
-"""
-
-Q-Values at iteration 0 for action 'north' are NOT correct.   Student solution:
-	q_values_k_0_action_north: """
-           illegal      illegal      illegal      illegal      illegal
-           illegal      illegal   __________      illegal      illegal
-           illegal      illegal      illegal      illegal      illegal
-           illegal      illegal   __________   __________      illegal
-           illegal      illegal      illegal      illegal      illegal
-"""
-
-   Correct solution:
-	q_values_k_0_action_north: """
-           illegal       0.0000      illegal       0.0000       0.0000
-           illegal       0.0000   __________       0.0000       0.0000
-           illegal       0.0000      illegal       0.0000       0.0000
-           illegal       0.0000   __________   __________       0.0000
-           illegal       0.0000       0.0000       0.0000       0.0000
-"""
-