# PPCA-AIPacMan-2024/reinforcement/model.py

"""
Functions you should use.
Please avoid importing any other torch functions or modules.
Your code will not pass if the Gradescope autograder detects any changed imports.
"""
from torch.nn import Module
from torch.nn import Linear
from torch import tensor, double, optim
from torch.nn.functional import relu, mse_loss
import torch

# When True, gradient_update() skips the optimization step, so the network's
# parameters are left unchanged (see gradient_update below).
kProductionMode = True


class DeepQNetwork(Module):
    """
    A model that uses a Deep Q-value Network (DQN) to approximate Q(s,a) as part
    of reinforcement learning.
    """
    def __init__(self, state_dim, action_dim):
        self.num_actions = action_dim
        self.state_size = state_dim
        super(DeepQNetwork, self).__init__()
        # Remember to set self.learning_rate, self.numTrainingGames,
        # and self.batch_size!
        "*** YOUR CODE HERE ***"
        # Run on the GPU when one is available, otherwise fall back to the CPU.
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        # Hidden-layer widths of the fully connected network.
        layer0_size = 256
        layer1_size = 512
        layer2_size = 128
        layer3_size = 64
        self.fc0 = Linear(state_dim, layer0_size).to(self.device)
        self.fc1 = Linear(layer0_size, layer1_size).to(self.device)
        self.fc2 = Linear(layer1_size, layer2_size).to(self.device)
        self.fc3 = Linear(layer2_size, layer3_size).to(self.device)
        self.fc_out = Linear(layer3_size, action_dim).to(self.device)
        # Training hyperparameters.
        self.learning_rate = 0.01
        self.numTrainingGames = 5000
        self.batch_size = 128
        # Optimizer: plain SGD; the learning-rate schedulers below are disabled.
        self.optimizer = optim.SGD(self.parameters(), lr=self.learning_rate)
        # self.scheduler1 = torch.optim.lr_scheduler.CosineAnnealingLR(self.optimizer, T_max=27500, eta_min=self.learning_rate * 0.25)
        # self.scheduler2 = torch.optim.lr_scheduler.ExponentialLR(self.optimizer, gamma=0.9999)
        # Report training progress every `output_step` gradient updates.
        self.output_step = 500
        self.output_cnt = 0
        "*** END YOUR CODE HERE ***"
        self.double()

    def get_loss(self, states, Q_target):
        """
        Returns the Squared Loss between Q values currently predicted
        by the network, and Q_target.

        Inputs:
            states: a (batch_size x state_dim) numpy array
            Q_target: a (batch_size x num_actions) numpy array, or None
        Output:
            loss node between Q predictions and Q_target
        """
        "*** YOUR CODE HERE ***"
        # Move Q_target onto the network's device/dtype, then compare the
        # network's predictions against it with mean squared error.
        Q_target_tensor = tensor(Q_target, dtype=double, device=self.device)
        loss = mse_loss(self.forward(states), Q_target_tensor)
        return loss

    def forward(self, states):
        """
        Runs the DQN for a batch of states.

        The DQN takes the state and returns the Q-values for all possible actions
        that can be taken. That is, if there are two actions, the network takes
        as input the state s and computes the vector [Q(s, a_1), Q(s, a_2)].

        Inputs:
            states: a (batch_size x state_dim) numpy array
        Output:
            result: (batch_size x num_actions) numpy array of Q-value
                scores, for each of the actions
        """
        "*** YOUR CODE HERE ***"
        # Make sure the input lives on the same device as the network weights.
        if states.device.type != self.device.type:
            states = states.to(self.device)
        # Four ReLU-activated hidden layers followed by a linear output layer.
        x = relu(self.fc0(states))
        x = relu(self.fc1(x))
        x = relu(self.fc2(x))
        x = relu(self.fc3(x))
        Q_values = self.fc_out(x)
        return Q_values

    def run(self, states):
        return self.forward(states)
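
    # Example (sketch; the sizes are illustrative, not fixed by this file):
    #   net = DeepQNetwork(6, 5)
    #   q = net.run(torch.rand(32, 6, dtype=double, device=net.device))  # -> (32, 5)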

    def gradient_update(self, states, Q_target):
        """
        Update your parameters by one gradient step with the .update(...) function.
        You can look at the ML project for an idea of how to do this, but note that rather
        than iterating through a dataset, you should only be applying a single gradient step
        to the given datapoints.

        Inputs:
            states: a (batch_size x state_dim) numpy array
            Q_target: a (batch_size x num_actions) numpy array, or None
        Output:
            None
        """
        "*** YOUR CODE HERE ***"
        # In production mode the network is frozen: skip the gradient step entirely.
        if kProductionMode:
            print("in production mode, no update")
            return
        # Single optimization step: zero the gradients, compute the loss,
        # backpropagate, and apply one SGD update.
        self.optimizer.zero_grad()
        loss = self.get_loss(states, Q_target)
        loss.backward()
        self.optimizer.step()
        # self.scheduler1.step()
        # self.scheduler2.step()
        # Periodically report the current learning rate and update count.
        self.output_cnt += 1
        if self.output_cnt % self.output_step == 0:
            print("now lr is:", self.optimizer.param_groups[0]['lr'], "update count:", self.output_cnt)