rein q7
@@ -97,15 +97,15 @@ class PacmanDeepQAgent(PacmanQAgent):
         next_states = torch.tensor(next_states)
         done = np.array([x.done for x in minibatch])

-        Q_predict = network.run(states).data.detach().numpy()
+        Q_predict = network.run(states).data.detach().cpu().numpy()
         Q_target = np.copy(Q_predict)
         state_indices = states.int().detach().numpy()
         state_indices = (state_indices[:, 0], state_indices[:, 1])
         exploration_bonus = 1 / (2 * np.sqrt((self.counts[state_indices] / 100)))

         replace_indices = np.arange(actions.shape[0])
-        action_indices = np.argmax(network.run(next_states).data, axis=1)
-        target = rewards + exploration_bonus + (1 - done) * self.discount * target_network.run(next_states).data[replace_indices, action_indices].detach().numpy()
+        action_indices = np.argmax(network.run(next_states).data.cpu(), axis=1)
+        target = rewards + exploration_bonus + (1 - done) * self.discount * target_network.run(next_states).data[replace_indices, action_indices].detach().cpu().numpy()

         Q_target[replace_indices, actions] = target

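Note on this hunk: each changed line inserts .cpu() before .numpy(). Tensor.numpy() only works on CPU tensors, so once the network runs on a CUDA device its outputs have to be copied back to host memory before NumPy can read them. A minimal sketch of the pattern, with illustrative variable names rather than the project's:

    import torch

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    q_values = torch.rand(4, 5, device=device)  # stand-in for network.run(states).data

    # On a CUDA tensor, q_values.numpy() raises a TypeError; detach from the
    # autograd graph and move the data to the CPU first, then convert.
    q_numpy = q_values.detach().cpu().numpy()
    best_actions = q_numpy.argmax(axis=1)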
@@ -9,7 +9,7 @@ from torch.nn import Module
 from torch.nn import Linear
 from torch import tensor, double, optim
 from torch.nn.functional import relu, mse_loss
-
+import torch


 class DeepQNetwork(Module):
@@ -24,9 +24,23 @@ class DeepQNetwork(Module):
         # Remember to set self.learning_rate, self.numTrainingGames,
         # and self.batch_size!
         "*** YOUR CODE HERE ***"
-        self.learning_rate = 0
-        self.numTrainingGames = 0
-        self.batch_size = 0
+        # Initialize layers
+        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+        layer1_size = 300
+        layer2_size = 300
+        # layer3_size = 500
+        self.fc1 = Linear(state_dim, layer1_size).to(self.device)
+        self.fc2 = Linear(layer1_size, layer2_size).to(self.device)
+        # self.fc3 = Linear(layer2_size, layer3_size).to(self.device)
+        self.fc_out = Linear(layer2_size, action_dim).to(self.device)
+
+        # Set learning parameters
+        self.learning_rate = 0.1
+        self.numTrainingGames = 3800
+        self.batch_size = 128
+
+        # Optimizer
+        self.optimizer = optim.SGD(self.parameters(), lr=self.learning_rate)

         "**END CODE"""
         self.double()
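Note on this hunk: the constructor now picks CUDA when it is available and moves every Linear layer there with .to(self.device), while the existing self.double() call converts all parameters to float64. Inputs therefore have to match both the device and the dtype of the weights, which is what the get_loss and forward hunks below arrange. A small self-contained sketch of that pattern, with made-up layer sizes:

    import torch
    from torch.nn import Linear

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    # One layer placed on the chosen device, then converted to float64.
    fc = Linear(6, 3).to(device).double()

    # The input must live on the same device and use the same dtype as the weights.
    x = torch.rand(2, 6, dtype=torch.double, device=device)
    out = fc(x)  # shape (2, 3), dtype float64, on `device`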
@@ -43,6 +57,9 @@ class DeepQNetwork(Module):
             loss node between Q predictions and Q_target
         """
         "*** YOUR CODE HERE ***"
+        Q_target_tensor = tensor(Q_target, dtype=double, device=self.device)
+        loss = mse_loss(self.forward(states), Q_target_tensor)
+        return loss


     def forward(self, states):
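Note on this hunk: get_loss wraps the NumPy Q_target in a tensor with dtype=double on self.device before calling mse_loss, since the loss expects two tensors on the same device with compatible dtypes. A minimal sketch of that conversion (array contents are placeholders):

    import numpy as np
    import torch
    from torch.nn.functional import mse_loss

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    prediction = torch.rand(3, 5, dtype=torch.double, device=device, requires_grad=True)
    q_target_np = np.zeros((3, 5))  # placeholder NumPy target

    # Convert the NumPy target into a tensor matching the prediction's device/dtype.
    q_target = torch.tensor(q_target_np, dtype=torch.double, device=device)
    loss = mse_loss(prediction, q_target)  # scalar, differentiable
    loss.backward()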
@@ -59,6 +76,14 @@ class DeepQNetwork(Module):
             scores, for each of the actions
         """
         "*** YOUR CODE HERE ***"
+        if states.device.type != self.device.type:
+            states = states.to(self.device)
+        x = relu(self.fc1(states))
+        x = relu(self.fc2(x))
+        # x = relu(self.fc3(x))
+        Q_values = self.fc_out(x)
+        return Q_values
+


     def run(self, states):
@@ -77,4 +102,8 @@ class DeepQNetwork(Module):
         Output:
             None
         """
         "*** YOUR CODE HERE ***"
+        self.optimizer.zero_grad()
+        loss = self.get_loss(states, Q_target)
+        loss.backward()
+        self.optimizer.step()
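Note on this hunk: gradient_update now follows the standard PyTorch training step: clear stale gradients, compute the loss, backpropagate, and apply the optimizer. A condensed, self-contained sketch of the same loop on a toy problem; none of these names come from the project:

    import torch
    from torch import optim
    from torch.nn import Linear
    from torch.nn.functional import mse_loss

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model = Linear(4, 2).to(device)
    optimizer = optim.SGD(model.parameters(), lr=0.1)

    states = torch.rand(8, 4, device=device)
    targets = torch.rand(8, 2, device=device)

    for _ in range(10):
        optimizer.zero_grad()                    # clear gradients from the previous step
        loss = mse_loss(model(states), targets)  # Q-value regression stand-in
        loss.backward()                          # backpropagate
        optimizer.step()                         # one SGD update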