try to solve smallClassic

commit ceae34ea86
parent 1bf4cc1efe
Date: 2024-07-18 19:18:55 +08:00

6 changed files with 58 additions and 13 deletions

.gitignore

@@ -7,3 +7,4 @@ __pycache__/
 **/.env/
 **/.ipynb_checkpoints
 **/playground.ipynb
+**/*.bin


@@ -5,6 +5,7 @@ import layout
 import copy
 import torch
 import numpy as np
+import os

 class PacmanDeepQAgent(PacmanQAgent):
     def __init__(self, layout_input="smallGrid", target_update_rate=300, doubleQ=True, **args):
@@ -15,8 +16,9 @@ class PacmanDeepQAgent(PacmanQAgent):
         self.update_amount = 0
         self.epsilon_explore = 1.0
         self.epsilon0 = 0.4
+        self.minimal_epsilon = 0.01
         self.epsilon = self.epsilon0
-        self.discount = 0.9
+        self.discount = 0.95
         self.update_frequency = 3
         self.counts = None
         self.replay_memory = ReplayMemory(50000)
@@ -54,6 +56,27 @@ class PacmanDeepQAgent(PacmanQAgent):
         import model
         self.model = model.DeepQNetwork(state_dim, action_dim)
         self.target_model = model.DeepQNetwork(state_dim, action_dim)
+        if os.path.exists('para.bin'):
+            print("Loading model parameters from para.bin")
+            checkpoint = torch.load('para.bin')
+            self.model.load_state_dict(checkpoint['model_state_dict'])
+            self.target_model.load_state_dict(checkpoint['target_model_state_dict'])
+            self.model.optimizer.load_state_dict(checkpoint['model_optimizer_state_dict'])
+            self.target_model.optimizer.load_state_dict(checkpoint['target_model_optimizer_state_dict'])
+            self.replay_memory = checkpoint['memory']
+            print(self.model.state_dict())
+        else:
+            print("Initializing new model parameters")
+
+    def save_model(self, filename="para.bin"):
+        print(f"Saving model parameters to {filename}")
+        torch.save({
+            'model_state_dict': self.model.state_dict(),
+            'target_model_state_dict': self.target_model.state_dict(),
+            'model_optimizer_state_dict': self.model.optimizer.state_dict(),
+            'target_model_optimizer_state_dict': self.target_model.optimizer.state_dict(),
+            'memory': self.replay_memory
+        }, filename)
+        print(self.model.state_dict())

     def getQValue(self, state, action):
         """
@@ -136,7 +159,7 @@ class PacmanDeepQAgent(PacmanQAgent):
         if len(self.replay_memory) < self.min_transitions_before_training:
             self.epsilon = self.epsilon_explore
         else:
-            self.epsilon = max(self.epsilon0 * (1 - self.update_amount / 20000), 0)
+            self.epsilon = max(self.epsilon0 * (1 - self.update_amount / 20000), self.minimal_epsilon)

         if len(self.replay_memory) > self.min_transitions_before_training and self.update_amount % self.update_frequency == 0:
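The schedule change above keeps some exploration for the rest of training: epsilon decays linearly from epsilon0 = 0.4 and is now clamped at minimal_epsilon = 0.01 instead of 0, so with the 20000-update horizon it reaches the floor after roughly 19,500 updates and stays there. A standalone sketch of that schedule (function name and defaults are illustrative):

def epsilon_schedule(update_amount, epsilon0=0.4, minimal_epsilon=0.01, horizon=20000):
    # Linear decay from epsilon0 down to a fixed exploration floor,
    # mirroring the changed line above.
    return max(epsilon0 * (1 - update_amount / horizon), minimal_epsilon)

# epsilon_schedule(0) -> 0.4, epsilon_schedule(10000) -> 0.2,
# and from about update 19,500 onwards it stays at the 0.01 floor.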


@@ -249,6 +249,13 @@ class ReinforcementAgent(ValueEstimationAgent):
                 print('\tAverage Rewards over testing: %.2f' % testAvg)
             print('\tAverage Rewards for last %d episodes: %.2f' % (
                     NUM_EPS_UPDATE,windowAvg))
+            if windowAvg > -220:
+                if not hasattr(self, 'best_window_avg_score'):
+                    self.best_window_avg_score = -300
+                if windowAvg > self.best_window_avg_score:
+                    print("find an excellent policy, ready to save model")
+                    self.save_model("para.best.bin")
+                    self.best_window_avg_score = windowAvg
             print('\tEpisode took %.2f seconds' % (time.time() - self.episodeStartTime))
             self.lastWindowAccumRewards = 0.0
             self.episodeStartTime = time.time()
@@ -256,3 +263,5 @@ class ReinforcementAgent(ValueEstimationAgent):
         if self.episodesSoFar == self.numTraining:
             msg = 'Training Done (turning off epsilon and alpha)'
             print('%s\n%s' % (msg,'-' * len(msg)))
+            import traceback
+            traceback.print_stack()
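The best-model block above checkpoints the agent whenever the windowed average reward clears the -220 threshold and beats the best value seen so far (lazily initialised to -300). The same "save on new best" pattern, sketched as a standalone helper that starts from float('-inf') instead of a hard-coded -300 (names are illustrative, not from the repository):

def maybe_save_best(agent, window_avg, threshold=-220, path="para.best.bin"):
    # Sketch: remember the best windowed average on the agent and
    # checkpoint only when it improves past the threshold.
    best = getattr(agent, 'best_window_avg_score', float('-inf'))
    if window_avg > threshold and window_avg > best:
        print("found a better policy, saving model to", path)
        agent.save_model(path)
        agent.best_window_avg_score = window_avg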


@@ -26,21 +26,25 @@ class DeepQNetwork(Module):
         "*** YOUR CODE HERE ***"
         # Initialize layers
         self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
-        layer1_size = 300
-        layer2_size = 300
-        # layer3_size = 500
+        layer1_size = 512
+        layer2_size = 128
+        layer3_size = 64
         self.fc1 = Linear(state_dim, layer1_size).to(self.device)
         self.fc2 = Linear(layer1_size, layer2_size).to(self.device)
-        # self.fc3 = Linear(layer2_size, layer3_size).to(self.device)
-        self.fc_out = Linear(layer2_size, action_dim).to(self.device)
+        self.fc3 = Linear(layer2_size, layer3_size).to(self.device)
+        self.fc_out = Linear(layer3_size, action_dim).to(self.device)

         # Set learning parameters
-        self.learning_rate = 0.1
-        self.numTrainingGames = 3800
+        self.learning_rate = 0.01
+        self.numTrainingGames = 5000
         self.batch_size = 128

         # Optimizer
         self.optimizer = optim.SGD(self.parameters(), lr=self.learning_rate)
-        # self.scheduler1 = torch.optim.lr_scheduler.CosineAnnealingLR(self.optimizer, T_max=27500, eta_min=self.learning_rate*0.25)
+        # Replace with CosineAnnealingLR
+        # self.scheduler2 = torch.optim.lr_scheduler.ExponentialLR(self.optimizer, gamma=0.9999)
+        self.output_step = 500
+        self.output_cnt = 0
         "**END CODE"""
         self.double()
@@ -80,7 +84,7 @@ class DeepQNetwork(Module):
         states = states.to(self.device)
         x = relu(self.fc1(states))
         x = relu(self.fc2(x))
-        # x = relu(self.fc3(x))
+        x = relu(self.fc3(x))
         Q_values = self.fc_out(x)
         return Q_values
@@ -107,3 +111,8 @@ class DeepQNetwork(Module):
         loss = self.get_loss(states, Q_target)
         loss.backward()
         self.optimizer.step()
+        # self.scheduler1.step()
+        # self.scheduler2.step()
+        self.output_cnt += 1
+        if self.output_cnt % self.output_step == 0:
+            print("now lr is: ", self.optimizer.param_groups[0]['lr'], "update count", self.output_cnt)


@@ -745,6 +745,7 @@ if __name__ == '__main__':
     """
     args = readCommand(sys.argv[1:]) # Get game components based on input
     runGames(**args)
+    args["pacman"].save_model()

     # import cProfile
     # cProfile.run("runGames( **args )")
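Calling args["pacman"].save_model() unconditionally assumes the selected agent is the deep Q-learning agent; agents without a save_model method would raise AttributeError here. A hedged alternative sketch:

# Only checkpoint agents that actually support it (e.g. a keyboard agent
# defines no save_model method).
pacman_agent = args["pacman"]
if hasattr(pacman_agent, "save_model"):
    pacman_agent.save_model()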


@@ -564,7 +564,9 @@ class DeepQLearningTest(testClasses.TestCase):
         pacman = pacmanType(self.layout)

         # Load Ghost Agent
-        ghostType = loadAgent("RandomGhost", nographics)
+        ghost_agent_name = "DirectionalGhost"
+        ghostType = loadAgent(ghost_agent_name, nographics)
+        print("using ghost agent", ghost_agent_name)
         numghosts = 1
         ghosts = [ghostType(i+1) for i in range(numghosts)]