From 675529e9dc564a7c551b85d6ab0f67c2233757f1 Mon Sep 17 00:00:00 2001 From: ZhuangYumin Date: Mon, 8 Jul 2024 18:50:23 +0800 Subject: [PATCH] rein q4 --- reinforcement/qlearningAgents.py | 29 ++++- .../test_cases/q3/1-tinygrid.test_output | 70 ----------- .../q3/2-tinygrid-noisy.test_output | 70 ----------- .../test_cases/q3/3-bridge.test_output | 110 ------------------ .../test_cases/q3/4-discountgrid.test_output | 90 -------------- 5 files changed, 28 insertions(+), 341 deletions(-) delete mode 100644 reinforcement/test_cases/q3/1-tinygrid.test_output delete mode 100644 reinforcement/test_cases/q3/2-tinygrid-noisy.test_output delete mode 100644 reinforcement/test_cases/q3/3-bridge.test_output delete mode 100644 reinforcement/test_cases/q3/4-discountgrid.test_output diff --git a/reinforcement/qlearningAgents.py b/reinforcement/qlearningAgents.py index 322125d..73c1ae5 100644 --- a/reinforcement/qlearningAgents.py +++ b/reinforcement/qlearningAgents.py @@ -58,7 +58,10 @@ class QLearningAgent(ReinforcementAgent): or the Q node value otherwise """ "*** YOUR CODE HERE ***" - + if (state, action) in self.qVals: + return self.qVals[(state, action)] + else: + return 0.0 def computeValueFromQValues(self, state): @@ -69,6 +72,12 @@ class QLearningAgent(ReinforcementAgent): terminal state, you should return a value of 0.0. """ "*** YOUR CODE HERE ***" + legalActions = self.getLegalActions(state) + if not legalActions: + return 0.0 + + maxQValue = max(self.getQValue(state, action) for action in legalActions) + return maxQValue def computeActionFromQValues(self, state): @@ -78,6 +87,14 @@ class QLearningAgent(ReinforcementAgent): you should return None. """ "*** YOUR CODE HERE ***" + legalActions = self.getLegalActions(state) + if not legalActions: + return None + + maxQValue = self.computeValueFromQValues(state) + bestActions = [action for action in legalActions if self.getQValue(state, action) == maxQValue] + + return random.choice(bestActions) def getAction(self, state): @@ -94,6 +111,13 @@ class QLearningAgent(ReinforcementAgent): legalActions = self.getLegalActions(state) action = None "*** YOUR CODE HERE ***" + if not legalActions: + return None + + if util.flipCoin(self.epsilon): + return random.choice(legalActions) + else: + return self.computeActionFromQValues(state) def update(self, state, action, nextState, reward: float): @@ -105,6 +129,9 @@ class QLearningAgent(ReinforcementAgent): it will be called on your behalf """ "*** YOUR CODE HERE ***" + sample = reward + self.discount * self.computeValueFromQValues(nextState) + currentQValue = self.getQValue(state, action) + self.qVals[(state, action)] = (1 - self.alpha) * currentQValue + self.alpha * sample def getPolicy(self, state): diff --git a/reinforcement/test_cases/q3/1-tinygrid.test_output b/reinforcement/test_cases/q3/1-tinygrid.test_output deleted file mode 100644 index 689d16d..0000000 --- a/reinforcement/test_cases/q3/1-tinygrid.test_output +++ /dev/null @@ -1,70 +0,0 @@ -Q-Values at iteration 0 for action 'south' are NOT correct. Student solution: - q_values_k_0_action_south: """ - illegal - illegal - illegal -""" - - Correct solution: - q_values_k_0_action_south: """ - illegal - 0.0000 - illegal -""" - -Q-Values at iteration 0 for action 'west' are NOT correct. Student solution: - q_values_k_0_action_west: """ - illegal - illegal - illegal -""" - - Correct solution: - q_values_k_0_action_west: """ - illegal - 0.0000 - illegal -""" - -Q-Values at iteration 0 for action 'exit' are NOT correct. Student solution: - q_values_k_0_action_exit: """ - illegal - illegal - illegal -""" - - Correct solution: - q_values_k_0_action_exit: """ - 0.0000 - illegal - 0.0000 -""" - -Q-Values at iteration 0 for action 'east' are NOT correct. Student solution: - q_values_k_0_action_east: """ - illegal - illegal - illegal -""" - - Correct solution: - q_values_k_0_action_east: """ - illegal - 0.0000 - illegal -""" - -Q-Values at iteration 0 for action 'north' are NOT correct. Student solution: - q_values_k_0_action_north: """ - illegal - illegal - illegal -""" - - Correct solution: - q_values_k_0_action_north: """ - illegal - 0.0000 - illegal -""" - diff --git a/reinforcement/test_cases/q3/2-tinygrid-noisy.test_output b/reinforcement/test_cases/q3/2-tinygrid-noisy.test_output deleted file mode 100644 index 689d16d..0000000 --- a/reinforcement/test_cases/q3/2-tinygrid-noisy.test_output +++ /dev/null @@ -1,70 +0,0 @@ -Q-Values at iteration 0 for action 'south' are NOT correct. Student solution: - q_values_k_0_action_south: """ - illegal - illegal - illegal -""" - - Correct solution: - q_values_k_0_action_south: """ - illegal - 0.0000 - illegal -""" - -Q-Values at iteration 0 for action 'west' are NOT correct. Student solution: - q_values_k_0_action_west: """ - illegal - illegal - illegal -""" - - Correct solution: - q_values_k_0_action_west: """ - illegal - 0.0000 - illegal -""" - -Q-Values at iteration 0 for action 'exit' are NOT correct. Student solution: - q_values_k_0_action_exit: """ - illegal - illegal - illegal -""" - - Correct solution: - q_values_k_0_action_exit: """ - 0.0000 - illegal - 0.0000 -""" - -Q-Values at iteration 0 for action 'east' are NOT correct. Student solution: - q_values_k_0_action_east: """ - illegal - illegal - illegal -""" - - Correct solution: - q_values_k_0_action_east: """ - illegal - 0.0000 - illegal -""" - -Q-Values at iteration 0 for action 'north' are NOT correct. Student solution: - q_values_k_0_action_north: """ - illegal - illegal - illegal -""" - - Correct solution: - q_values_k_0_action_north: """ - illegal - 0.0000 - illegal -""" - diff --git a/reinforcement/test_cases/q3/3-bridge.test_output b/reinforcement/test_cases/q3/3-bridge.test_output deleted file mode 100644 index 4603d4e..0000000 --- a/reinforcement/test_cases/q3/3-bridge.test_output +++ /dev/null @@ -1,110 +0,0 @@ -Q-Values at iteration 0 for action 'south' are NOT correct. Student solution: - q_values_k_0_action_south: """ - __________ illegal __________ - illegal illegal illegal - illegal illegal illegal - illegal illegal illegal - illegal illegal illegal - illegal illegal illegal - __________ illegal __________ -""" - - Correct solution: - q_values_k_0_action_south: """ - __________ illegal __________ - illegal 0.0000 illegal - illegal 0.0000 illegal - illegal 0.0000 illegal - illegal 0.0000 illegal - illegal 0.0000 illegal - __________ illegal __________ -""" - -Q-Values at iteration 0 for action 'west' are NOT correct. Student solution: - q_values_k_0_action_west: """ - __________ illegal __________ - illegal illegal illegal - illegal illegal illegal - illegal illegal illegal - illegal illegal illegal - illegal illegal illegal - __________ illegal __________ -""" - - Correct solution: - q_values_k_0_action_west: """ - __________ illegal __________ - illegal 0.0000 illegal - illegal 0.0000 illegal - illegal 0.0000 illegal - illegal 0.0000 illegal - illegal 0.0000 illegal - __________ illegal __________ -""" - -Q-Values at iteration 0 for action 'exit' are NOT correct. Student solution: - q_values_k_0_action_exit: """ - __________ illegal __________ - illegal illegal illegal - illegal illegal illegal - illegal illegal illegal - illegal illegal illegal - illegal illegal illegal - __________ illegal __________ -""" - - Correct solution: - q_values_k_0_action_exit: """ - __________ 0.0000 __________ - 0.0000 illegal 0.0000 - 0.0000 illegal 0.0000 - 0.0000 illegal 0.0000 - 0.0000 illegal 0.0000 - 0.0000 illegal 0.0000 - __________ 0.0000 __________ -""" - -Q-Values at iteration 0 for action 'east' are NOT correct. Student solution: - q_values_k_0_action_east: """ - __________ illegal __________ - illegal illegal illegal - illegal illegal illegal - illegal illegal illegal - illegal illegal illegal - illegal illegal illegal - __________ illegal __________ -""" - - Correct solution: - q_values_k_0_action_east: """ - __________ illegal __________ - illegal 0.0000 illegal - illegal 0.0000 illegal - illegal 0.0000 illegal - illegal 0.0000 illegal - illegal 0.0000 illegal - __________ illegal __________ -""" - -Q-Values at iteration 0 for action 'north' are NOT correct. Student solution: - q_values_k_0_action_north: """ - __________ illegal __________ - illegal illegal illegal - illegal illegal illegal - illegal illegal illegal - illegal illegal illegal - illegal illegal illegal - __________ illegal __________ -""" - - Correct solution: - q_values_k_0_action_north: """ - __________ illegal __________ - illegal 0.0000 illegal - illegal 0.0000 illegal - illegal 0.0000 illegal - illegal 0.0000 illegal - illegal 0.0000 illegal - __________ illegal __________ -""" - diff --git a/reinforcement/test_cases/q3/4-discountgrid.test_output b/reinforcement/test_cases/q3/4-discountgrid.test_output deleted file mode 100644 index 1cd1d98..0000000 --- a/reinforcement/test_cases/q3/4-discountgrid.test_output +++ /dev/null @@ -1,90 +0,0 @@ -Q-Values at iteration 0 for action 'south' are NOT correct. Student solution: - q_values_k_0_action_south: """ - illegal illegal illegal illegal illegal - illegal illegal __________ illegal illegal - illegal illegal illegal illegal illegal - illegal illegal __________ __________ illegal - illegal illegal illegal illegal illegal -""" - - Correct solution: - q_values_k_0_action_south: """ - illegal 0.0000 illegal 0.0000 0.0000 - illegal 0.0000 __________ 0.0000 0.0000 - illegal 0.0000 illegal 0.0000 0.0000 - illegal 0.0000 __________ __________ 0.0000 - illegal 0.0000 0.0000 0.0000 0.0000 -""" - -Q-Values at iteration 0 for action 'west' are NOT correct. Student solution: - q_values_k_0_action_west: """ - illegal illegal illegal illegal illegal - illegal illegal __________ illegal illegal - illegal illegal illegal illegal illegal - illegal illegal __________ __________ illegal - illegal illegal illegal illegal illegal -""" - - Correct solution: - q_values_k_0_action_west: """ - illegal 0.0000 illegal 0.0000 0.0000 - illegal 0.0000 __________ 0.0000 0.0000 - illegal 0.0000 illegal 0.0000 0.0000 - illegal 0.0000 __________ __________ 0.0000 - illegal 0.0000 0.0000 0.0000 0.0000 -""" - -Q-Values at iteration 0 for action 'exit' are NOT correct. Student solution: - q_values_k_0_action_exit: """ - illegal illegal illegal illegal illegal - illegal illegal __________ illegal illegal - illegal illegal illegal illegal illegal - illegal illegal __________ __________ illegal - illegal illegal illegal illegal illegal -""" - - Correct solution: - q_values_k_0_action_exit: """ - 0.0000 illegal 0.0000 illegal illegal - 0.0000 illegal __________ illegal illegal - 0.0000 illegal 0.0000 illegal illegal - 0.0000 illegal __________ __________ illegal - 0.0000 illegal illegal illegal illegal -""" - -Q-Values at iteration 0 for action 'east' are NOT correct. Student solution: - q_values_k_0_action_east: """ - illegal illegal illegal illegal illegal - illegal illegal __________ illegal illegal - illegal illegal illegal illegal illegal - illegal illegal __________ __________ illegal - illegal illegal illegal illegal illegal -""" - - Correct solution: - q_values_k_0_action_east: """ - illegal 0.0000 illegal 0.0000 0.0000 - illegal 0.0000 __________ 0.0000 0.0000 - illegal 0.0000 illegal 0.0000 0.0000 - illegal 0.0000 __________ __________ 0.0000 - illegal 0.0000 0.0000 0.0000 0.0000 -""" - -Q-Values at iteration 0 for action 'north' are NOT correct. Student solution: - q_values_k_0_action_north: """ - illegal illegal illegal illegal illegal - illegal illegal __________ illegal illegal - illegal illegal illegal illegal illegal - illegal illegal __________ __________ illegal - illegal illegal illegal illegal illegal -""" - - Correct solution: - q_values_k_0_action_north: """ - illegal 0.0000 illegal 0.0000 0.0000 - illegal 0.0000 __________ 0.0000 0.0000 - illegal 0.0000 illegal 0.0000 0.0000 - illegal 0.0000 __________ __________ 0.0000 - illegal 0.0000 0.0000 0.0000 0.0000 -""" -