rein q1
@@ -65,6 +65,35 @@ class ValueIterationAgent(ValueEstimationAgent):
           value iteration, V_k+1(...) depends on V_k(...)'s.
         """
         "*** YOUR CODE HERE ***"
+        # Write value iteration code here
+        # Hints:
+        # - After each iteration, store the new values in self.values
+        # - When updating a value, use self.values[state] = <new value>
+        # - You will need to copy the state values into a separate dictionary
+        #   to avoid changing values before computing the update.
+        # - The difference between the new value and the old value (|V_k+1(s) - V_k(s)|)
+        #   should be less than self.epsilon for all states s
+        # - Make sure to use the discount factor self.discount
+        # - Make sure to use the Bellman equations to update the state values
+        # - The number of iterations is given by self.iterations
+        # - You may use the util.Counter() class
+        # - You may also use the self.mdp.getTransitionStatesAndProbs(state, action) method
+        # - You may also use the self.mdp.getReward(state, action, nextState) method
+        # - You may also use the self.mdp.getPossibleActions(state) method
+        # - You may also use the self.mdp.isTerminal(state) method
+
+        for i in range(self.iterations):
+            newValues = util.Counter()
+            for state in self.mdp.getStates():
+                if self.mdp.isTerminal(state):
+                    newValues[state] = 0
+                else:
+                    maxQValue = float("-inf")
+                    for action in self.mdp.getPossibleActions(state):
+                        qValue = self.computeQValueFromValues(state, action)
+                        maxQValue = max(maxQValue, qValue)
+                    newValues[state] = maxQValue
+            self.values = newValues


     def getValue(self, state):
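Note: the loop above implements the batch ("synchronous") Bellman backup the hints describe. Each sweep computes every new value from the frozen previous-sweep values and only swaps them in at the end:

    V_k+1(s) = max_a  sum_{s'}  T(s, a, s') * ( R(s, a, s') + discount * V_k(s') )

Writing each sweep into a fresh util.Counter() (newValues) and assigning it to self.values only after all states are processed is what keeps the update batch rather than in-place.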
@@ -79,6 +108,11 @@ class ValueIterationAgent(ValueEstimationAgent):
           value function stored in self.values.
         """
         "*** YOUR CODE HERE ***"
+        qValue = 0
+        for nextState, prob in self.mdp.getTransitionStatesAndProbs(state, action):
+            reward = self.mdp.getReward(state, action, nextState)
+            qValue += prob * (reward + self.discount * self.values[nextState])
+        return qValue


     def computeActionFromValues(self, state):
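A quick sanity check of computeQValueFromValues with made-up numbers (illustrative only, not from the project): suppose action a from state s reaches s1 with probability 0.8 (reward 0, V(s1) = 1.0) and s2 with probability 0.2 (reward 0, V(s2) = 0.0), with discount 0.9. Then

    Q(s, a) = 0.8 * (0 + 0.9 * 1.0) + 0.2 * (0 + 0.9 * 0.0) = 0.72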
@@ -91,6 +125,14 @@ class ValueIterationAgent(ValueEstimationAgent):
           terminal state, you should return None.
         """
         "*** YOUR CODE HERE ***"
+        bestAction = None
+        bestQValue = float("-inf")
+        for action in self.mdp.getPossibleActions(state):
+            qValue = self.computeQValueFromValues(state, action)
+            if qValue > bestQValue:
+                bestQValue = qValue
+                bestAction = action
+        return bestAction


     def getPolicy(self, state):
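A minimal usage sketch (the ValueIterationAgent(mdp, discount, iterations) constructor signature and mdp.getStartState() are assumptions here; neither appears in this diff):

    # Hypothetical usage; the constructor signature and mdp.getStartState()
    # are assumed, not shown in the diff above.
    agent = ValueIterationAgent(mdp, discount=0.9, iterations=100)
    start = mdp.getStartState()
    print(agent.getValue(start))                 # V(start) after 100 sweeps
    print(agent.computeActionFromValues(start))  # greedy action, argmax_a Q(start, a)

Note that computeActionFromValues breaks ties in favor of the first action returned by getPossibleActions, since the comparison is strict (>); for a terminal state the loop body never runs and None is returned, as the docstring requires.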