diff --git a/reinforcement/deepQLearningAgents.py b/reinforcement/deepQLearningAgents.py
index fa892f0..0bdbc04 100644
--- a/reinforcement/deepQLearningAgents.py
+++ b/reinforcement/deepQLearningAgents.py
@@ -97,15 +97,15 @@ class PacmanDeepQAgent(PacmanQAgent):
         next_states = torch.tensor(next_states)
         done = np.array([x.done for x in minibatch])
 
-        Q_predict = network.run(states).data.detach().numpy()
+        Q_predict = network.run(states).data.detach().cpu().numpy()
         Q_target = np.copy(Q_predict )
 
         state_indices = states.int().detach().numpy()
         state_indices = (state_indices[:, 0], state_indices[:, 1])
         exploration_bonus = 1 / (2 * np.sqrt((self.counts[state_indices] / 100)))
 
         replace_indices = np.arange(actions.shape[0])
-        action_indices = np.argmax(network.run(next_states).data, axis=1)
-        target = rewards + exploration_bonus + (1 - done) * self.discount * target_network.run(next_states).data[replace_indices, action_indices].detach().numpy()
+        action_indices = np.argmax(network.run(next_states).data.cpu(), axis=1)
+        target = rewards + exploration_bonus + (1 - done) * self.discount * target_network.run(next_states).data[replace_indices, action_indices].detach().cpu().numpy()
 
         Q_target[replace_indices, actions] = target
diff --git a/reinforcement/model.py b/reinforcement/model.py
index ff0da75..649d99f 100644
--- a/reinforcement/model.py
+++ b/reinforcement/model.py
@@ -9,7 +9,7 @@ from torch.nn import Module
 from torch.nn import Linear
 from torch import tensor, double, optim
 from torch.nn.functional import relu, mse_loss
-
+import torch
 
 
 class DeepQNetwork(Module):
@@ -24,9 +24,23 @@ class DeepQNetwork(Module):
         # Remember to set self.learning_rate, self.numTrainingGames,
         # and self.batch_size!
         "*** YOUR CODE HERE ***"
-        self.learning_rate = 0
-        self.numTrainingGames = 0
-        self.batch_size = 0
+        # Initialize layers
+        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+        layer1_size = 300
+        layer2_size = 300
+        # layer3_size = 500
+        self.fc1 = Linear(state_dim, layer1_size).to(self.device)
+        self.fc2 = Linear(layer1_size, layer2_size).to(self.device)
+        # self.fc3 = Linear(layer2_size, layer3_size).to(self.device)
+        self.fc_out = Linear(layer2_size, action_dim).to(self.device)
+
+        # Set learning parameters
+        self.learning_rate = 0.1
+        self.numTrainingGames = 3800
+        self.batch_size = 128
+
+        # Optimizer
+        self.optimizer = optim.SGD(self.parameters(), lr=self.learning_rate)
         "**END CODE"""
         self.double()
 
@@ -43,6 +57,9 @@ class DeepQNetwork(Module):
         loss node between Q predictions and Q_target
         """
         "*** YOUR CODE HERE ***"
+        Q_target_tensor = tensor(Q_target, dtype=double, device=self.device)
+        loss = mse_loss(self.forward(states), Q_target_tensor)
+        return loss
 
 
     def forward(self, states):
@@ -59,6 +76,14 @@ class DeepQNetwork(Module):
         scores, for each of the actions
         """
         "*** YOUR CODE HERE ***"
+        if states.device.type != self.device.type:
+            states = states.to(self.device)
+        x = relu(self.fc1(states))
+        x = relu(self.fc2(x))
+        # x = relu(self.fc3(x))
+        Q_values = self.fc_out(x)
+        return Q_values
+
 
 
     def run(self, states):
@@ -77,4 +102,8 @@ class DeepQNetwork(Module):
 
         Output: None
         """
-        "*** YOUR CODE HERE ***"
\ No newline at end of file
+        "*** YOUR CODE HERE ***"
+        self.optimizer.zero_grad()
+        loss = self.get_loss(states, Q_target)
+        loss.backward()
+        self.optimizer.step()
\ No newline at end of file
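
Note: the added `.cpu()` calls are what let the agent code interoperate with NumPy when the network lives on a GPU, since CUDA tensors must be moved to host memory before `.numpy()`. Below is a minimal, self-contained sketch of that pattern together with a double-DQN-style target construction and one SGD step, purely for illustration. Everything in it (`TinyQNet`, the dimensions, hyperparameters) is a placeholder and not the repo's actual `PacmanDeepQAgent`/`DeepQNetwork`; the count-based exploration bonus is omitted.

```python
import numpy as np
import torch
from torch import nn, optim
from torch.nn.functional import mse_loss, relu

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Hypothetical stand-in network, mirroring the two-hidden-layer shape in the diff.
class TinyQNet(nn.Module):
    def __init__(self, state_dim=2, action_dim=5, hidden=300):
        super().__init__()
        self.fc1 = nn.Linear(state_dim, hidden)
        self.fc2 = nn.Linear(hidden, hidden)
        self.fc_out = nn.Linear(hidden, action_dim)

    def forward(self, x):
        return self.fc_out(relu(self.fc2(relu(self.fc1(x)))))

net = TinyQNet().double().to(device)
target_net = TinyQNet().double().to(device)
opt = optim.SGD(net.parameters(), lr=0.1)

batch = 8
states = torch.rand(batch, 2, dtype=torch.double, device=device)
next_states = torch.rand(batch, 2, dtype=torch.double, device=device)
rewards = np.random.rand(batch)
done = np.zeros(batch)
actions = np.random.randint(0, 5, size=batch)
discount = 0.9

# GPU-safe conversion: detach and move to CPU before calling .numpy().
Q_predict = net(states).detach().cpu().numpy()
Q_target = np.copy(Q_predict)

# Double-DQN-style target: the online net picks actions, the target net evaluates them.
rows = np.arange(batch)
action_idx = np.argmax(net(next_states).detach().cpu().numpy(), axis=1)
bootstrap = target_net(next_states).detach().cpu().numpy()[rows, action_idx]
Q_target[rows, actions] = rewards + (1 - done) * discount * bootstrap

# One SGD step on the MSE between predictions and the constructed targets.
opt.zero_grad()
loss = mse_loss(net(states),
                torch.tensor(Q_target, dtype=torch.double, device=device))
loss.backward()
opt.step()
print(float(loss))
```

Running this on a CUDA machine exercises the same device boundary the diff addresses: forward passes stay on the GPU while the target arithmetic happens in NumPy on the CPU.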