rein q1

2024-07-08 17:37:53 +08:00
parent 09c85f8942
commit 138a07eef6
5 changed files with 42 additions and 662 deletions
--- a/reinforcement/test_cases/q1/1-tinygrid.test_output
+++ b/reinforcement/test_cases/q1/1-tinygrid.test_output
@@ -1,132 +0,0 @@
 Values at iteration 0 are correct.
   Student/correct solution:
 values_k_0: """
            0.0000
            0.0000
            0.0000
 """
 Q-Values at iteration 0 for action south are correct.
   Student/correct solution:
 q_values_k_0_action_south: """
           illegal
            0.0000
           illegal
 """
 Q-Values at iteration 0 for action west are correct.
   Student/correct solution:
 q_values_k_0_action_west: """
           illegal
            0.0000
           illegal
 """
 Q-Values at iteration 0 for action exit are correct.
   Student/correct solution:
 q_values_k_0_action_exit: """
          -10.0000
           illegal
           10.0000
 """
 Q-Values at iteration 0 for action east are correct.
   Student/correct solution:
 q_values_k_0_action_east: """
           illegal
            0.0000
           illegal
 """
 Q-Values at iteration 0 for action north are correct.
   Student/correct solution:
 q_values_k_0_action_north: """
           illegal
            0.0000
           illegal
 """
 Values at iteration 1 are NOT correct.
   Student solution:
 values_k_1: """
            0.0000
            0.0000
            0.0000
 """
   Correct solution:
 values_k_1: """
          -10.0000
            0.0000
           10.0000
 """
 Q-Values at iteration 1 for action south are NOT correct.
   Student solution:
 q_values_k_1_action_south: """
           illegal
            0.0000
           illegal
 """
   Correct solution:
 q_values_k_1_action_south: """
           illegal
            5.0000
           illegal
 """
 Q-Values at iteration 1 for action west are correct.
   Student/correct solution:
 q_values_k_1_action_west: """
           illegal
            0.0000
           illegal
 """
 Q-Values at iteration 1 for action exit are correct.
   Student/correct solution:
 q_values_k_1_action_exit: """
          -10.0000
           illegal
           10.0000
 """
 Q-Values at iteration 1 for action east are correct.
   Student/correct solution:
 q_values_k_1_action_east: """
           illegal
            0.0000
           illegal
 """
 Q-Values at iteration 1 for action north are NOT correct.
   Student solution:
 q_values_k_1_action_north: """
           illegal
            0.0000
           illegal
 """
   Correct solution:
 q_values_k_1_action_north: """
           illegal
           -5.0000
           illegal
 """
--- a/reinforcement/test_cases/q1/2-tinygrid-noisy.test_output
+++ b/reinforcement/test_cases/q1/2-tinygrid-noisy.test_output
@@ -1,132 +0,0 @@
 Values at iteration 0 are correct.
   Student/correct solution:
 values_k_0: """
            0.0000
            0.0000
            0.0000
 """
 Q-Values at iteration 0 for action south are correct.
   Student/correct solution:
 q_values_k_0_action_south: """
           illegal
            0.0000
           illegal
 """
 Q-Values at iteration 0 for action west are correct.
   Student/correct solution:
 q_values_k_0_action_west: """
           illegal
            0.0000
           illegal
 """
 Q-Values at iteration 0 for action exit are correct.
   Student/correct solution:
 q_values_k_0_action_exit: """
          -10.0000
           illegal
           10.0000
 """
 Q-Values at iteration 0 for action east are correct.
   Student/correct solution:
 q_values_k_0_action_east: """
           illegal
            0.0000
           illegal
 """
 Q-Values at iteration 0 for action north are correct.
   Student/correct solution:
 q_values_k_0_action_north: """
           illegal
            0.0000
           illegal
 """
 Values at iteration 1 are NOT correct.
   Student solution:
 values_k_1: """
            0.0000
            0.0000
            0.0000
 """
   Correct solution:
 values_k_1: """
          -10.0000
            0.0000
           10.0000
 """
 Q-Values at iteration 1 for action south are NOT correct.
   Student solution:
 q_values_k_1_action_south: """
           illegal
            0.0000
           illegal
 """
   Correct solution:
 q_values_k_1_action_south: """
           illegal
            5.6250
           illegal
 """
 Q-Values at iteration 1 for action west are correct.
   Student/correct solution:
 q_values_k_1_action_west: """
           illegal
            0.0000
           illegal
 """
 Q-Values at iteration 1 for action exit are correct.
   Student/correct solution:
 q_values_k_1_action_exit: """
          -10.0000
           illegal
           10.0000
 """
 Q-Values at iteration 1 for action east are correct.
   Student/correct solution:
 q_values_k_1_action_east: """
           illegal
            0.0000
           illegal
 """
 Q-Values at iteration 1 for action north are NOT correct.
   Student solution:
 q_values_k_1_action_north: """
           illegal
            0.0000
           illegal
 """
   Correct solution:
 q_values_k_1_action_north: """
           illegal
           -5.6250
           illegal
 """
--- a/reinforcement/test_cases/q1/3-bridge.test_output
+++ b/reinforcement/test_cases/q1/3-bridge.test_output
@@ -1,216 +0,0 @@
 Values at iteration 0 are correct.
   Student/correct solution:
 values_k_0: """
        __________       0.0000   __________
            0.0000       0.0000       0.0000
            0.0000       0.0000       0.0000
            0.0000       0.0000       0.0000
            0.0000       0.0000       0.0000
            0.0000       0.0000       0.0000
        __________       0.0000   __________
 """
 Q-Values at iteration 0 for action south are correct.
   Student/correct solution:
 q_values_k_0_action_south: """
        __________      illegal   __________
           illegal       0.0000      illegal
           illegal       0.0000      illegal
           illegal       0.0000      illegal
           illegal       0.0000      illegal
           illegal       0.0000      illegal
        __________      illegal   __________
 """
 Q-Values at iteration 0 for action west are correct.
   Student/correct solution:
 q_values_k_0_action_west: """
        __________      illegal   __________
           illegal       0.0000      illegal
           illegal       0.0000      illegal
           illegal       0.0000      illegal
           illegal       0.0000      illegal
           illegal       0.0000      illegal
        __________      illegal   __________
 """
 Q-Values at iteration 0 for action exit are correct.
   Student/correct solution:
 q_values_k_0_action_exit: """
        __________      10.0000   __________
         -100.0000      illegal    -100.0000
         -100.0000      illegal    -100.0000
         -100.0000      illegal    -100.0000
         -100.0000      illegal    -100.0000
         -100.0000      illegal    -100.0000
        __________       1.0000   __________
 """
 Q-Values at iteration 0 for action east are correct.
   Student/correct solution:
 q_values_k_0_action_east: """
        __________      illegal   __________
           illegal       0.0000      illegal
           illegal       0.0000      illegal
           illegal       0.0000      illegal
           illegal       0.0000      illegal
           illegal       0.0000      illegal
        __________      illegal   __________
 """
 Q-Values at iteration 0 for action north are correct.
   Student/correct solution:
 q_values_k_0_action_north: """
        __________      illegal   __________
           illegal       0.0000      illegal
           illegal       0.0000      illegal
           illegal       0.0000      illegal
           illegal       0.0000      illegal
           illegal       0.0000      illegal
        __________      illegal   __________
 """
 Values at iteration 1 are NOT correct.
   Student solution:
 values_k_1: """
        __________       0.0000   __________
            0.0000       0.0000       0.0000
            0.0000       0.0000       0.0000
            0.0000       0.0000       0.0000
            0.0000       0.0000       0.0000
            0.0000       0.0000       0.0000
        __________       0.0000   __________
 """
   Correct solution:
 values_k_1: """
        __________      10.0000   __________
         -100.0000       0.0000    -100.0000
         -100.0000       0.0000    -100.0000
         -100.0000       0.0000    -100.0000
         -100.0000       0.0000    -100.0000
         -100.0000       0.0000    -100.0000
        __________       1.0000   __________
 """
 Q-Values at iteration 1 for action south are NOT correct.
   Student solution:
 q_values_k_1_action_south: """
        __________      illegal   __________
           illegal       0.0000      illegal
           illegal       0.0000      illegal
           illegal       0.0000      illegal
           illegal       0.0000      illegal
           illegal       0.0000      illegal
        __________      illegal   __________
 """
   Correct solution:
 q_values_k_1_action_south: """
        __________      illegal   __________
           illegal      -8.5000      illegal
           illegal      -8.5000      illegal
           illegal      -8.5000      illegal
           illegal      -8.5000      illegal
           illegal      -7.7350      illegal
        __________      illegal   __________
 """
 Q-Values at iteration 1 for action west are NOT correct.
   Student solution:
 q_values_k_1_action_west: """
        __________      illegal   __________
           illegal       0.0000      illegal
           illegal       0.0000      illegal
           illegal       0.0000      illegal
           illegal       0.0000      illegal
           illegal       0.0000      illegal
        __________      illegal   __________
 """
   Correct solution:
 q_values_k_1_action_west: """
        __________      illegal   __________
           illegal     -76.0750      illegal
           illegal     -76.5000      illegal
           illegal     -76.5000      illegal
           illegal     -76.5000      illegal
           illegal     -76.4575      illegal
        __________      illegal   __________
 """
 Q-Values at iteration 1 for action exit are correct.
   Student/correct solution:
 q_values_k_1_action_exit: """
        __________      10.0000   __________
         -100.0000      illegal    -100.0000
         -100.0000      illegal    -100.0000
         -100.0000      illegal    -100.0000
         -100.0000      illegal    -100.0000
         -100.0000      illegal    -100.0000
        __________       1.0000   __________
 """
 Q-Values at iteration 1 for action east are NOT correct.
   Student solution:
 q_values_k_1_action_east: """
        __________      illegal   __________
           illegal       0.0000      illegal
           illegal       0.0000      illegal
           illegal       0.0000      illegal
           illegal       0.0000      illegal
           illegal       0.0000      illegal
        __________      illegal   __________
 """
   Correct solution:
 q_values_k_1_action_east: """
        __________      illegal   __________
           illegal     -76.0750      illegal
           illegal     -76.5000      illegal
           illegal     -76.5000      illegal
           illegal     -76.5000      illegal
           illegal     -76.4575      illegal
        __________      illegal   __________
 """
 Q-Values at iteration 1 for action north are NOT correct.
   Student solution:
 q_values_k_1_action_north: """
        __________      illegal   __________
           illegal       0.0000      illegal
           illegal       0.0000      illegal
           illegal       0.0000      illegal
           illegal       0.0000      illegal
           illegal       0.0000      illegal
        __________      illegal   __________
 """
   Correct solution:
 q_values_k_1_action_north: """
        __________      illegal   __________
           illegal      -0.8500      illegal
           illegal      -8.5000      illegal
           illegal      -8.5000      illegal
           illegal      -8.5000      illegal
           illegal      -8.5000      illegal
        __________      illegal   __________
 """
--- a/reinforcement/test_cases/q1/4-discountgrid.test_output
+++ b/reinforcement/test_cases/q1/4-discountgrid.test_output
@@ -1,182 +0,0 @@
 Values at iteration 0 are correct.
   Student/correct solution:
 values_k_0: """
            0.0000       0.0000       0.0000       0.0000       0.0000
            0.0000       0.0000   __________       0.0000       0.0000
            0.0000       0.0000       0.0000       0.0000       0.0000
            0.0000       0.0000   __________   __________       0.0000
            0.0000       0.0000       0.0000       0.0000       0.0000
 """
 Q-Values at iteration 0 for action south are correct.
   Student/correct solution:
 q_values_k_0_action_south: """
           illegal       0.0000      illegal       0.0000       0.0000
           illegal       0.0000   __________       0.0000       0.0000
           illegal       0.0000      illegal       0.0000       0.0000
           illegal       0.0000   __________   __________       0.0000
           illegal       0.0000       0.0000       0.0000       0.0000
 """
 Q-Values at iteration 0 for action west are correct.
   Student/correct solution:
 q_values_k_0_action_west: """
           illegal       0.0000      illegal       0.0000       0.0000
           illegal       0.0000   __________       0.0000       0.0000
           illegal       0.0000      illegal       0.0000       0.0000
           illegal       0.0000   __________   __________       0.0000
           illegal       0.0000       0.0000       0.0000       0.0000
 """
 Q-Values at iteration 0 for action exit are correct.
   Student/correct solution:
 q_values_k_0_action_exit: """
          -10.0000      illegal      10.0000      illegal      illegal
          -10.0000      illegal   __________      illegal      illegal
          -10.0000      illegal       1.0000      illegal      illegal
          -10.0000      illegal   __________   __________      illegal
          -10.0000      illegal      illegal      illegal      illegal
 """
 Q-Values at iteration 0 for action east are correct.
   Student/correct solution:
 q_values_k_0_action_east: """
           illegal       0.0000      illegal       0.0000       0.0000
           illegal       0.0000   __________       0.0000       0.0000
           illegal       0.0000      illegal       0.0000       0.0000
           illegal       0.0000   __________   __________       0.0000
           illegal       0.0000       0.0000       0.0000       0.0000
 """
 Q-Values at iteration 0 for action north are correct.
   Student/correct solution:
 q_values_k_0_action_north: """
           illegal       0.0000      illegal       0.0000       0.0000
           illegal       0.0000   __________       0.0000       0.0000
           illegal       0.0000      illegal       0.0000       0.0000
           illegal       0.0000   __________   __________       0.0000
           illegal       0.0000       0.0000       0.0000       0.0000
 """
 Values at iteration 1 are NOT correct.
   Student solution:
 values_k_1: """
            0.0000       0.0000       0.0000       0.0000       0.0000
            0.0000       0.0000   __________       0.0000       0.0000
            0.0000       0.0000       0.0000       0.0000       0.0000
            0.0000       0.0000   __________   __________       0.0000
            0.0000       0.0000       0.0000       0.0000       0.0000
 """
   Correct solution:
 values_k_1: """
          -10.0000       0.0000      10.0000       0.0000       0.0000
          -10.0000       0.0000   __________       0.0000       0.0000
          -10.0000       0.0000       1.0000       0.0000       0.0000
          -10.0000       0.0000   __________   __________       0.0000
          -10.0000       0.0000       0.0000       0.0000       0.0000
 """
 Q-Values at iteration 1 for action south are NOT correct.
   Student solution:
 q_values_k_1_action_south: """
           illegal       0.0000      illegal       0.0000       0.0000
           illegal       0.0000   __________       0.0000       0.0000
           illegal       0.0000      illegal       0.0000       0.0000
           illegal       0.0000   __________   __________       0.0000
           illegal       0.0000       0.0000       0.0000       0.0000
 """
   Correct solution:
 q_values_k_1_action_south: """
           illegal       0.0000      illegal       0.9000       0.0000
           illegal      -0.9000   __________       0.0000       0.0000
           illegal      -0.8100      illegal       0.0900       0.0000
           illegal      -0.9000   __________   __________       0.0000
           illegal      -0.9000       0.0000       0.0000       0.0000
 """
 Q-Values at iteration 1 for action west are NOT correct.
   Student solution:
 q_values_k_1_action_west: """
           illegal       0.0000      illegal       0.0000       0.0000
           illegal       0.0000   __________       0.0000       0.0000
           illegal       0.0000      illegal       0.0000       0.0000
           illegal       0.0000   __________   __________       0.0000
           illegal       0.0000       0.0000       0.0000       0.0000
 """
   Correct solution:
 q_values_k_1_action_west: """
           illegal      -7.2000      illegal       7.2000       0.0000
           illegal      -7.2000   __________       0.0000       0.0000
           illegal      -7.2000      illegal       0.7200       0.0000
           illegal      -7.2000   __________   __________       0.0000
           illegal      -7.2000       0.0000       0.0000       0.0000
 """
 Q-Values at iteration 1 for action exit are correct.
   Student/correct solution:
 q_values_k_1_action_exit: """
          -10.0000      illegal      10.0000      illegal      illegal
          -10.0000      illegal   __________      illegal      illegal
          -10.0000      illegal       1.0000      illegal      illegal
          -10.0000      illegal   __________   __________      illegal
          -10.0000      illegal      illegal      illegal      illegal
 """
 Q-Values at iteration 1 for action east are NOT correct.
   Student solution:
 q_values_k_1_action_east: """
           illegal       0.0000      illegal       0.0000       0.0000
           illegal       0.0000   __________       0.0000       0.0000
           illegal       0.0000      illegal       0.0000       0.0000
           illegal       0.0000   __________   __________       0.0000
           illegal       0.0000       0.0000       0.0000       0.0000
 """
   Correct solution:
 q_values_k_1_action_east: """
           illegal       7.2000      illegal       0.0000       0.0000
           illegal       0.0000   __________       0.0000       0.0000
           illegal       0.7200      illegal       0.0000       0.0000
           illegal       0.0000   __________   __________       0.0000
           illegal       0.0000       0.0000       0.0000       0.0000
 """
 Q-Values at iteration 1 for action north are NOT correct.
   Student solution:
 q_values_k_1_action_north: """
           illegal       0.0000      illegal       0.0000       0.0000
           illegal       0.0000   __________       0.0000       0.0000
           illegal       0.0000      illegal       0.0000       0.0000
           illegal       0.0000   __________   __________       0.0000
           illegal       0.0000       0.0000       0.0000       0.0000
 """
   Correct solution:
 q_values_k_1_action_north: """
           illegal       0.0000      illegal       0.9000       0.0000
           illegal      -0.9000   __________       0.0000       0.0000
           illegal      -0.8100      illegal       0.0900       0.0000
           illegal      -0.9000   __________   __________       0.0000
           illegal      -0.9000       0.0000       0.0000       0.0000
 """
--- a/reinforcement/valueIterationAgents.py
+++ b/reinforcement/valueIterationAgents.py
@@ -65,6 +65,35 @@ class ValueIterationAgent(ValueEstimationAgent):
          value iteration, V_k+1(...) depends on V_k(...)'s.
        """
        "*** YOUR CODE HERE ***"
        # Write value iteration code here
        # Hints:
        # - After each iteration, store the new values in self.values
        # - When updating a value, use self.values[state] = <new value>
        # - You will need to copy the state values into a separate dictionary
        #   to avoid changing values before computing the update.
        # - NOTE: no convergence test is performed here. The loop below always runs
        #   exactly self.iterations sweeps and never compares the per-state change
        #   |V_k+1(s) - V_k(s)| against a tolerance such as self.epsilon.
        # - Make sure to use the discount factor self.discount
        # - Make sure to use the bellman equations to update the state values
        # - The number of iterations is given by self.iterations
        # - You may use the util.Counter() class
        # - You may also use the self.mdp.getTransitionStatesAndProbs(state, action) method
        # - You may also use the self.mdp.getReward(state, action, nextState) method
        # - You may also use the self.mdp.getPossibleActions(state) method
        # - You may also use the self.mdp.isTerminal(state) method
        # Each sweep builds V_{k+1} in a fresh Counter from the V_k snapshot
        # held in self.values, and only swaps it in once the sweep is done.
        for _ in range(self.iterations):
            newValues = util.Counter()
            for state in self.mdp.getStates():
                # Terminal states are never queried for actions (same as
                # before); they are treated as having none.
                if self.mdp.isTerminal(state):
                    actions = ()
                else:
                    actions = self.mdp.getPossibleActions(state)

                if not actions:
                    # Nothing can be done from this state, so no future reward
                    # is reachable. Without this guard a non-terminal state
                    # with zero legal actions would be assigned -inf, which
                    # then leaks into its predecessors on later sweeps.
                    newValues[state] = 0
                    continue

                # Bellman optimality backup: best Q-value over legal actions.
                newValues[state] = max(
                    self.computeQValueFromValues(state, action)
                    for action in actions
                )
            # Publish after the full sweep so every Q-value above was computed
            # from the previous iteration's values.
            self.values = newValues
    def getValue(self, state):
@@ -79,6 +108,11 @@ class ValueIterationAgent(ValueEstimationAgent):
          value function stored in self.values.
        """
        "*** YOUR CODE HERE ***"
        qValue = 0
        for nextState, prob in self.mdp.getTransitionStatesAndProbs(state, action):
            reward = self.mdp.getReward(state, action, nextState)
            qValue += prob * (reward + self.discount * self.values[nextState])
        return qValue
    def computeActionFromValues(self, state):
@@ -91,6 +125,14 @@ class ValueIterationAgent(ValueEstimationAgent):
          terminal state, you should return None.
        """
        "*** YOUR CODE HERE ***"
        bestAction = None
        bestQValue = float("-inf")
        for action in self.mdp.getPossibleActions(state):
            qValue = self.computeQValueFromValues(state, action)
            if qValue > bestQValue:
                bestQValue = qValue
                bestAction = action
        return bestAction
    def getPolicy(self, state):