From 138a07eef6cca94f8541de47b1424d5bb9df0f85 Mon Sep 17 00:00:00 2001 From: ZhuangYumin Date: Mon, 8 Jul 2024 17:37:53 +0800 Subject: [PATCH] rein q1 --- .../test_cases/q1/1-tinygrid.test_output | 132 ----------- .../q1/2-tinygrid-noisy.test_output | 132 ----------- .../test_cases/q1/3-bridge.test_output | 216 ------------------ .../test_cases/q1/4-discountgrid.test_output | 182 --------------- reinforcement/valueIterationAgents.py | 42 ++++ 5 files changed, 42 insertions(+), 662 deletions(-) delete mode 100644 reinforcement/test_cases/q1/1-tinygrid.test_output delete mode 100644 reinforcement/test_cases/q1/2-tinygrid-noisy.test_output delete mode 100644 reinforcement/test_cases/q1/3-bridge.test_output delete mode 100644 reinforcement/test_cases/q1/4-discountgrid.test_output diff --git a/reinforcement/test_cases/q1/1-tinygrid.test_output b/reinforcement/test_cases/q1/1-tinygrid.test_output deleted file mode 100644 index 4824806..0000000 --- a/reinforcement/test_cases/q1/1-tinygrid.test_output +++ /dev/null @@ -1,132 +0,0 @@ -Values at iteration 0 are correct. - Student/correct solution: - values_k_0: """ - 0.0000 - 0.0000 - 0.0000 -""" - - -Q-Values at iteration 0 for action south are correct. - Student/correct solution: - q_values_k_0_action_south: """ - illegal - 0.0000 - illegal -""" - - -Q-Values at iteration 0 for action west are correct. - Student/correct solution: - q_values_k_0_action_west: """ - illegal - 0.0000 - illegal -""" - - -Q-Values at iteration 0 for action exit are correct. - Student/correct solution: - q_values_k_0_action_exit: """ - -10.0000 - illegal - 10.0000 -""" - - -Q-Values at iteration 0 for action east are correct. - Student/correct solution: - q_values_k_0_action_east: """ - illegal - 0.0000 - illegal -""" - - -Q-Values at iteration 0 for action north are correct. - Student/correct solution: - q_values_k_0_action_north: """ - illegal - 0.0000 - illegal -""" - - -Values at iteration 1 are NOT correct. - Student solution: - values_k_1: """ - 0.0000 - 0.0000 - 0.0000 -""" - - - Correct solution: - values_k_1: """ - -10.0000 - 0.0000 - 10.0000 -""" - - -Q-Values at iteration 1 for action south are NOT correct. - Student solution: - q_values_k_1_action_south: """ - illegal - 0.0000 - illegal -""" - - - Correct solution: - q_values_k_1_action_south: """ - illegal - 5.0000 - illegal -""" - - -Q-Values at iteration 1 for action west are correct. - Student/correct solution: - q_values_k_1_action_west: """ - illegal - 0.0000 - illegal -""" - - -Q-Values at iteration 1 for action exit are correct. - Student/correct solution: - q_values_k_1_action_exit: """ - -10.0000 - illegal - 10.0000 -""" - - -Q-Values at iteration 1 for action east are correct. - Student/correct solution: - q_values_k_1_action_east: """ - illegal - 0.0000 - illegal -""" - - -Q-Values at iteration 1 for action north are NOT correct. - Student solution: - q_values_k_1_action_north: """ - illegal - 0.0000 - illegal -""" - - - Correct solution: - q_values_k_1_action_north: """ - illegal - -5.0000 - illegal -""" - - diff --git a/reinforcement/test_cases/q1/2-tinygrid-noisy.test_output b/reinforcement/test_cases/q1/2-tinygrid-noisy.test_output deleted file mode 100644 index d7afe70..0000000 --- a/reinforcement/test_cases/q1/2-tinygrid-noisy.test_output +++ /dev/null @@ -1,132 +0,0 @@ -Values at iteration 0 are correct. - Student/correct solution: - values_k_0: """ - 0.0000 - 0.0000 - 0.0000 -""" - - -Q-Values at iteration 0 for action south are correct. - Student/correct solution: - q_values_k_0_action_south: """ - illegal - 0.0000 - illegal -""" - - -Q-Values at iteration 0 for action west are correct. - Student/correct solution: - q_values_k_0_action_west: """ - illegal - 0.0000 - illegal -""" - - -Q-Values at iteration 0 for action exit are correct. - Student/correct solution: - q_values_k_0_action_exit: """ - -10.0000 - illegal - 10.0000 -""" - - -Q-Values at iteration 0 for action east are correct. - Student/correct solution: - q_values_k_0_action_east: """ - illegal - 0.0000 - illegal -""" - - -Q-Values at iteration 0 for action north are correct. - Student/correct solution: - q_values_k_0_action_north: """ - illegal - 0.0000 - illegal -""" - - -Values at iteration 1 are NOT correct. - Student solution: - values_k_1: """ - 0.0000 - 0.0000 - 0.0000 -""" - - - Correct solution: - values_k_1: """ - -10.0000 - 0.0000 - 10.0000 -""" - - -Q-Values at iteration 1 for action south are NOT correct. - Student solution: - q_values_k_1_action_south: """ - illegal - 0.0000 - illegal -""" - - - Correct solution: - q_values_k_1_action_south: """ - illegal - 5.6250 - illegal -""" - - -Q-Values at iteration 1 for action west are correct. - Student/correct solution: - q_values_k_1_action_west: """ - illegal - 0.0000 - illegal -""" - - -Q-Values at iteration 1 for action exit are correct. - Student/correct solution: - q_values_k_1_action_exit: """ - -10.0000 - illegal - 10.0000 -""" - - -Q-Values at iteration 1 for action east are correct. - Student/correct solution: - q_values_k_1_action_east: """ - illegal - 0.0000 - illegal -""" - - -Q-Values at iteration 1 for action north are NOT correct. - Student solution: - q_values_k_1_action_north: """ - illegal - 0.0000 - illegal -""" - - - Correct solution: - q_values_k_1_action_north: """ - illegal - -5.6250 - illegal -""" - - diff --git a/reinforcement/test_cases/q1/3-bridge.test_output b/reinforcement/test_cases/q1/3-bridge.test_output deleted file mode 100644 index 3cbda5d..0000000 --- a/reinforcement/test_cases/q1/3-bridge.test_output +++ /dev/null @@ -1,216 +0,0 @@ -Values at iteration 0 are correct. - Student/correct solution: - values_k_0: """ - __________ 0.0000 __________ - 0.0000 0.0000 0.0000 - 0.0000 0.0000 0.0000 - 0.0000 0.0000 0.0000 - 0.0000 0.0000 0.0000 - 0.0000 0.0000 0.0000 - __________ 0.0000 __________ -""" - - -Q-Values at iteration 0 for action south are correct. - Student/correct solution: - q_values_k_0_action_south: """ - __________ illegal __________ - illegal 0.0000 illegal - illegal 0.0000 illegal - illegal 0.0000 illegal - illegal 0.0000 illegal - illegal 0.0000 illegal - __________ illegal __________ -""" - - -Q-Values at iteration 0 for action west are correct. - Student/correct solution: - q_values_k_0_action_west: """ - __________ illegal __________ - illegal 0.0000 illegal - illegal 0.0000 illegal - illegal 0.0000 illegal - illegal 0.0000 illegal - illegal 0.0000 illegal - __________ illegal __________ -""" - - -Q-Values at iteration 0 for action exit are correct. - Student/correct solution: - q_values_k_0_action_exit: """ - __________ 10.0000 __________ - -100.0000 illegal -100.0000 - -100.0000 illegal -100.0000 - -100.0000 illegal -100.0000 - -100.0000 illegal -100.0000 - -100.0000 illegal -100.0000 - __________ 1.0000 __________ -""" - - -Q-Values at iteration 0 for action east are correct. - Student/correct solution: - q_values_k_0_action_east: """ - __________ illegal __________ - illegal 0.0000 illegal - illegal 0.0000 illegal - illegal 0.0000 illegal - illegal 0.0000 illegal - illegal 0.0000 illegal - __________ illegal __________ -""" - - -Q-Values at iteration 0 for action north are correct. - Student/correct solution: - q_values_k_0_action_north: """ - __________ illegal __________ - illegal 0.0000 illegal - illegal 0.0000 illegal - illegal 0.0000 illegal - illegal 0.0000 illegal - illegal 0.0000 illegal - __________ illegal __________ -""" - - -Values at iteration 1 are NOT correct. - Student solution: - values_k_1: """ - __________ 0.0000 __________ - 0.0000 0.0000 0.0000 - 0.0000 0.0000 0.0000 - 0.0000 0.0000 0.0000 - 0.0000 0.0000 0.0000 - 0.0000 0.0000 0.0000 - __________ 0.0000 __________ -""" - - - Correct solution: - values_k_1: """ - __________ 10.0000 __________ - -100.0000 0.0000 -100.0000 - -100.0000 0.0000 -100.0000 - -100.0000 0.0000 -100.0000 - -100.0000 0.0000 -100.0000 - -100.0000 0.0000 -100.0000 - __________ 1.0000 __________ -""" - - -Q-Values at iteration 1 for action south are NOT correct. - Student solution: - q_values_k_1_action_south: """ - __________ illegal __________ - illegal 0.0000 illegal - illegal 0.0000 illegal - illegal 0.0000 illegal - illegal 0.0000 illegal - illegal 0.0000 illegal - __________ illegal __________ -""" - - - Correct solution: - q_values_k_1_action_south: """ - __________ illegal __________ - illegal -8.5000 illegal - illegal -8.5000 illegal - illegal -8.5000 illegal - illegal -8.5000 illegal - illegal -7.7350 illegal - __________ illegal __________ -""" - - -Q-Values at iteration 1 for action west are NOT correct. - Student solution: - q_values_k_1_action_west: """ - __________ illegal __________ - illegal 0.0000 illegal - illegal 0.0000 illegal - illegal 0.0000 illegal - illegal 0.0000 illegal - illegal 0.0000 illegal - __________ illegal __________ -""" - - - Correct solution: - q_values_k_1_action_west: """ - __________ illegal __________ - illegal -76.0750 illegal - illegal -76.5000 illegal - illegal -76.5000 illegal - illegal -76.5000 illegal - illegal -76.4575 illegal - __________ illegal __________ -""" - - -Q-Values at iteration 1 for action exit are correct. - Student/correct solution: - q_values_k_1_action_exit: """ - __________ 10.0000 __________ - -100.0000 illegal -100.0000 - -100.0000 illegal -100.0000 - -100.0000 illegal -100.0000 - -100.0000 illegal -100.0000 - -100.0000 illegal -100.0000 - __________ 1.0000 __________ -""" - - -Q-Values at iteration 1 for action east are NOT correct. - Student solution: - q_values_k_1_action_east: """ - __________ illegal __________ - illegal 0.0000 illegal - illegal 0.0000 illegal - illegal 0.0000 illegal - illegal 0.0000 illegal - illegal 0.0000 illegal - __________ illegal __________ -""" - - - Correct solution: - q_values_k_1_action_east: """ - __________ illegal __________ - illegal -76.0750 illegal - illegal -76.5000 illegal - illegal -76.5000 illegal - illegal -76.5000 illegal - illegal -76.4575 illegal - __________ illegal __________ -""" - - -Q-Values at iteration 1 for action north are NOT correct. - Student solution: - q_values_k_1_action_north: """ - __________ illegal __________ - illegal 0.0000 illegal - illegal 0.0000 illegal - illegal 0.0000 illegal - illegal 0.0000 illegal - illegal 0.0000 illegal - __________ illegal __________ -""" - - - Correct solution: - q_values_k_1_action_north: """ - __________ illegal __________ - illegal -0.8500 illegal - illegal -8.5000 illegal - illegal -8.5000 illegal - illegal -8.5000 illegal - illegal -8.5000 illegal - __________ illegal __________ -""" - - diff --git a/reinforcement/test_cases/q1/4-discountgrid.test_output b/reinforcement/test_cases/q1/4-discountgrid.test_output deleted file mode 100644 index fbc2ce8..0000000 --- a/reinforcement/test_cases/q1/4-discountgrid.test_output +++ /dev/null @@ -1,182 +0,0 @@ -Values at iteration 0 are correct. - Student/correct solution: - values_k_0: """ - 0.0000 0.0000 0.0000 0.0000 0.0000 - 0.0000 0.0000 __________ 0.0000 0.0000 - 0.0000 0.0000 0.0000 0.0000 0.0000 - 0.0000 0.0000 __________ __________ 0.0000 - 0.0000 0.0000 0.0000 0.0000 0.0000 -""" - - -Q-Values at iteration 0 for action south are correct. - Student/correct solution: - q_values_k_0_action_south: """ - illegal 0.0000 illegal 0.0000 0.0000 - illegal 0.0000 __________ 0.0000 0.0000 - illegal 0.0000 illegal 0.0000 0.0000 - illegal 0.0000 __________ __________ 0.0000 - illegal 0.0000 0.0000 0.0000 0.0000 -""" - - -Q-Values at iteration 0 for action west are correct. - Student/correct solution: - q_values_k_0_action_west: """ - illegal 0.0000 illegal 0.0000 0.0000 - illegal 0.0000 __________ 0.0000 0.0000 - illegal 0.0000 illegal 0.0000 0.0000 - illegal 0.0000 __________ __________ 0.0000 - illegal 0.0000 0.0000 0.0000 0.0000 -""" - - -Q-Values at iteration 0 for action exit are correct. - Student/correct solution: - q_values_k_0_action_exit: """ - -10.0000 illegal 10.0000 illegal illegal - -10.0000 illegal __________ illegal illegal - -10.0000 illegal 1.0000 illegal illegal - -10.0000 illegal __________ __________ illegal - -10.0000 illegal illegal illegal illegal -""" - - -Q-Values at iteration 0 for action east are correct. - Student/correct solution: - q_values_k_0_action_east: """ - illegal 0.0000 illegal 0.0000 0.0000 - illegal 0.0000 __________ 0.0000 0.0000 - illegal 0.0000 illegal 0.0000 0.0000 - illegal 0.0000 __________ __________ 0.0000 - illegal 0.0000 0.0000 0.0000 0.0000 -""" - - -Q-Values at iteration 0 for action north are correct. - Student/correct solution: - q_values_k_0_action_north: """ - illegal 0.0000 illegal 0.0000 0.0000 - illegal 0.0000 __________ 0.0000 0.0000 - illegal 0.0000 illegal 0.0000 0.0000 - illegal 0.0000 __________ __________ 0.0000 - illegal 0.0000 0.0000 0.0000 0.0000 -""" - - -Values at iteration 1 are NOT correct. - Student solution: - values_k_1: """ - 0.0000 0.0000 0.0000 0.0000 0.0000 - 0.0000 0.0000 __________ 0.0000 0.0000 - 0.0000 0.0000 0.0000 0.0000 0.0000 - 0.0000 0.0000 __________ __________ 0.0000 - 0.0000 0.0000 0.0000 0.0000 0.0000 -""" - - - Correct solution: - values_k_1: """ - -10.0000 0.0000 10.0000 0.0000 0.0000 - -10.0000 0.0000 __________ 0.0000 0.0000 - -10.0000 0.0000 1.0000 0.0000 0.0000 - -10.0000 0.0000 __________ __________ 0.0000 - -10.0000 0.0000 0.0000 0.0000 0.0000 -""" - - -Q-Values at iteration 1 for action south are NOT correct. - Student solution: - q_values_k_1_action_south: """ - illegal 0.0000 illegal 0.0000 0.0000 - illegal 0.0000 __________ 0.0000 0.0000 - illegal 0.0000 illegal 0.0000 0.0000 - illegal 0.0000 __________ __________ 0.0000 - illegal 0.0000 0.0000 0.0000 0.0000 -""" - - - Correct solution: - q_values_k_1_action_south: """ - illegal 0.0000 illegal 0.9000 0.0000 - illegal -0.9000 __________ 0.0000 0.0000 - illegal -0.8100 illegal 0.0900 0.0000 - illegal -0.9000 __________ __________ 0.0000 - illegal -0.9000 0.0000 0.0000 0.0000 -""" - - -Q-Values at iteration 1 for action west are NOT correct. - Student solution: - q_values_k_1_action_west: """ - illegal 0.0000 illegal 0.0000 0.0000 - illegal 0.0000 __________ 0.0000 0.0000 - illegal 0.0000 illegal 0.0000 0.0000 - illegal 0.0000 __________ __________ 0.0000 - illegal 0.0000 0.0000 0.0000 0.0000 -""" - - - Correct solution: - q_values_k_1_action_west: """ - illegal -7.2000 illegal 7.2000 0.0000 - illegal -7.2000 __________ 0.0000 0.0000 - illegal -7.2000 illegal 0.7200 0.0000 - illegal -7.2000 __________ __________ 0.0000 - illegal -7.2000 0.0000 0.0000 0.0000 -""" - - -Q-Values at iteration 1 for action exit are correct. - Student/correct solution: - q_values_k_1_action_exit: """ - -10.0000 illegal 10.0000 illegal illegal - -10.0000 illegal __________ illegal illegal - -10.0000 illegal 1.0000 illegal illegal - -10.0000 illegal __________ __________ illegal - -10.0000 illegal illegal illegal illegal -""" - - -Q-Values at iteration 1 for action east are NOT correct. - Student solution: - q_values_k_1_action_east: """ - illegal 0.0000 illegal 0.0000 0.0000 - illegal 0.0000 __________ 0.0000 0.0000 - illegal 0.0000 illegal 0.0000 0.0000 - illegal 0.0000 __________ __________ 0.0000 - illegal 0.0000 0.0000 0.0000 0.0000 -""" - - - Correct solution: - q_values_k_1_action_east: """ - illegal 7.2000 illegal 0.0000 0.0000 - illegal 0.0000 __________ 0.0000 0.0000 - illegal 0.7200 illegal 0.0000 0.0000 - illegal 0.0000 __________ __________ 0.0000 - illegal 0.0000 0.0000 0.0000 0.0000 -""" - - -Q-Values at iteration 1 for action north are NOT correct. - Student solution: - q_values_k_1_action_north: """ - illegal 0.0000 illegal 0.0000 0.0000 - illegal 0.0000 __________ 0.0000 0.0000 - illegal 0.0000 illegal 0.0000 0.0000 - illegal 0.0000 __________ __________ 0.0000 - illegal 0.0000 0.0000 0.0000 0.0000 -""" - - - Correct solution: - q_values_k_1_action_north: """ - illegal 0.0000 illegal 0.9000 0.0000 - illegal -0.9000 __________ 0.0000 0.0000 - illegal -0.8100 illegal 0.0900 0.0000 - illegal -0.9000 __________ __________ 0.0000 - illegal -0.9000 0.0000 0.0000 0.0000 -""" - - diff --git a/reinforcement/valueIterationAgents.py b/reinforcement/valueIterationAgents.py index 6d0afd3..c987461 100644 --- a/reinforcement/valueIterationAgents.py +++ b/reinforcement/valueIterationAgents.py @@ -65,6 +65,35 @@ class ValueIterationAgent(ValueEstimationAgent): value iteration, V_k+1(...) depends on V_k(...)'s. """ "*** YOUR CODE HERE ***" + # Write value iteration code here + # Hints: + # - After each iteration, store the new values in self.values + # - When updating a value, use self.values[state] = + # - You will need to copy the state values into a separate dictionary + # to avoid changing values before computing the update. + # - The difference between the new value and the old value (|V_k+1(s) - V_k(s)|) + # should be less than self.epsilon for all states s + # - Make sure to use the discount factor self.discount + # - Make sure to use the bellman equations to update the state values + # - The number of iterations is given by self.iterations + # - You may use the util.Counter() class + # - You may also use the self.mdp.getTransitionStatesAndProbs(state, action) method + # - You may also use the self.mdp.getReward(state, action, nextState) method + # - You may also use the self.mdp.getPossibleActions(state) method + # - You may also use the self.mdp.isTerminal(state) method + + for i in range(self.iterations): + newValues = util.Counter() + for state in self.mdp.getStates(): + if self.mdp.isTerminal(state): + newValues[state] = 0 + else: + maxQValue = float("-inf") + for action in self.mdp.getPossibleActions(state): + qValue = self.computeQValueFromValues(state, action) + maxQValue = max(maxQValue, qValue) + newValues[state] = maxQValue + self.values = newValues def getValue(self, state): @@ -79,6 +108,11 @@ class ValueIterationAgent(ValueEstimationAgent): value function stored in self.values. """ "*** YOUR CODE HERE ***" + qValue = 0 + for nextState, prob in self.mdp.getTransitionStatesAndProbs(state, action): + reward = self.mdp.getReward(state, action, nextState) + qValue += prob * (reward + self.discount * self.values[nextState]) + return qValue def computeActionFromValues(self, state): @@ -91,6 +125,14 @@ class ValueIterationAgent(ValueEstimationAgent): terminal state, you should return None. """ "*** YOUR CODE HERE ***" + bestAction = None + bestQValue = float("-inf") + for action in self.mdp.getPossibleActions(state): + qValue = self.computeQValueFromValues(state, action) + if qValue > bestQValue: + bestQValue = qValue + bestAction = action + return bestAction def getPolicy(self, state):