diff --git a/.gitignore b/.gitignore
index ba3ce20..a6c0622 100644
--- a/.gitignore
+++ b/.gitignore
@@ -6,4 +6,5 @@ __pycache__/
 /.github
 **/.env/
 **/.ipynb_checkpoints
-**/playground.ipynb
\ No newline at end of file
+**/playground.ipynb
+**/*.bin
\ No newline at end of file
diff --git a/reinforcement/deepQLearningAgents.py b/reinforcement/deepQLearningAgents.py
index 0bdbc04..f688a20 100644
--- a/reinforcement/deepQLearningAgents.py
+++ b/reinforcement/deepQLearningAgents.py
@@ -5,6 +5,7 @@ import layout
 import copy
 import torch
 import numpy as np
+import os
 
 class PacmanDeepQAgent(PacmanQAgent):
     def __init__(self, layout_input="smallGrid", target_update_rate=300, doubleQ=True, **args):
@@ -15,8 +16,9 @@ class PacmanDeepQAgent(PacmanQAgent):
         self.update_amount = 0
         self.epsilon_explore = 1.0
         self.epsilon0 = 0.4
+        self.minimal_epsilon = 0.01
         self.epsilon = self.epsilon0
-        self.discount = 0.9
+        self.discount = 0.95
         self.update_frequency = 3
         self.counts = None
         self.replay_memory = ReplayMemory(50000)
@@ -54,6 +56,27 @@
         import model
         self.model = model.DeepQNetwork(state_dim, action_dim)
         self.target_model = model.DeepQNetwork(state_dim, action_dim)
+        if os.path.exists('para.bin'):
+            print("Loading model parameters from para.bin")
+            checkpoint = torch.load('para.bin')
+            self.model.load_state_dict(checkpoint['model_state_dict'])
+            self.target_model.load_state_dict(checkpoint['target_model_state_dict'])
+            self.model.optimizer.load_state_dict(checkpoint['model_optimizer_state_dict'])
+            self.target_model.optimizer.load_state_dict(checkpoint['target_model_optimizer_state_dict'])
+            self.replay_memory = checkpoint['memory']
+            print(self.model.state_dict())
+        else:
+            print("Initializing new model parameters")
+    def save_model(self, filename="para.bin"):
+        print(f"Saving model parameters to {filename}")
+        torch.save({
+            'model_state_dict': self.model.state_dict(),
+            'target_model_state_dict': self.target_model.state_dict(),
+            'model_optimizer_state_dict': self.model.optimizer.state_dict(),
+            'target_model_optimizer_state_dict': self.target_model.optimizer.state_dict(),
+            'memory': self.replay_memory
+        }, filename)
+        print(self.model.state_dict())
 
     def getQValue(self, state, action):
         """
@@ -136,7 +159,7 @@
         if len(self.replay_memory) < self.min_transitions_before_training:
             self.epsilon = self.epsilon_explore
         else:
-            self.epsilon = max(self.epsilon0 * (1 - self.update_amount / 20000), 0)
+            self.epsilon = max(self.epsilon0 * (1 - self.update_amount / 20000), self.minimal_epsilon)
 
 
         if len(self.replay_memory) > self.min_transitions_before_training and self.update_amount % self.update_frequency == 0:
diff --git a/reinforcement/learningAgents.py b/reinforcement/learningAgents.py
index 704155e..162ebd1 100644
--- a/reinforcement/learningAgents.py
+++ b/reinforcement/learningAgents.py
@@ -249,6 +249,13 @@ class ReinforcementAgent(ValueEstimationAgent):
                 print('\tAverage Rewards over testing: %.2f' % testAvg)
             print('\tAverage Rewards for last %d episodes: %.2f' % (
                     NUM_EPS_UPDATE,windowAvg))
+            if windowAvg > -220:
+                if not hasattr(self, 'best_window_avg_score'):
+                    self.best_window_avg_score = -300
+                if windowAvg > self.best_window_avg_score:
+                    print("Found an excellent policy, saving model")
+                    self.save_model("para.best.bin")
+                    self.best_window_avg_score = windowAvg
             print('\tEpisode took %.2f seconds' % (time.time() - self.episodeStartTime))
             self.lastWindowAccumRewards = 0.0
             self.episodeStartTime = time.time()
@@ -256,3 +263,5 @@ class ReinforcementAgent(ValueEstimationAgent):
         if self.episodesSoFar == self.numTraining:
             msg = 'Training Done (turning off epsilon and alpha)'
             print('%s\n%s' % (msg,'-' * len(msg)))
+            import traceback
+            traceback.print_stack()
diff --git a/reinforcement/model.py b/reinforcement/model.py
index 649d99f..13ae146 100644
--- a/reinforcement/model.py
+++ b/reinforcement/model.py
@@ -26,21 +26,25 @@
         "*** YOUR CODE HERE ***"
         # Initialize layers
         self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
-        layer1_size=300
-        layer2_size=300
-        # layer3_size=500
+        layer1_size=512
+        layer2_size=128
+        layer3_size=64
         self.fc1 = Linear(state_dim, layer1_size).to(self.device)
         self.fc2 = Linear(layer1_size, layer2_size).to(self.device)
-        # self.fc3 = Linear(layer2_size, layer3_size).to(self.device)
-        self.fc_out= Linear(layer2_size, action_dim).to(self.device)
+        self.fc3 = Linear(layer2_size, layer3_size).to(self.device)
+        self.fc_out= Linear(layer3_size, action_dim).to(self.device)
 
         # Set learning parameters
-        self.learning_rate = 0.1
-        self.numTrainingGames = 3800
+        self.learning_rate = 0.01
+        self.numTrainingGames = 5000
         self.batch_size = 128
 
         # Optimizer
         self.optimizer = optim.SGD(self.parameters(), lr=self.learning_rate)
+        # self.scheduler1 = torch.optim.lr_scheduler.CosineAnnealingLR(self.optimizer, T_max=27500, eta_min=self.learning_rate*0.25)
+        # self.scheduler2 = torch.optim.lr_scheduler.ExponentialLR(self.optimizer, gamma=0.9999)
+        self.output_step = 500
+        self.output_cnt = 0
 
         "**END CODE"""
         self.double()
@@ -80,7 +84,7 @@
         states = states.to(self.device)
         x = relu(self.fc1(states))
         x = relu(self.fc2(x))
-        # x = relu(self.fc3(x))
+        x = relu(self.fc3(x))
         Q_values = self.fc_out(x)
         return Q_values
 
@@ -106,4 +110,9 @@
         self.optimizer.zero_grad()
         loss = self.get_loss(states, Q_target)
         loss.backward()
-        self.optimizer.step()
\ No newline at end of file
+        self.optimizer.step()
+        # self.scheduler1.step()
+        # self.scheduler2.step()
+        self.output_cnt += 1
+        if self.output_cnt % self.output_step == 0:
+            print("current lr:", self.optimizer.param_groups[0]['lr'], "| update count:", self.output_cnt)
\ No newline at end of file
diff --git a/reinforcement/pacman.py b/reinforcement/pacman.py
index 86b68bf..d1fd780 100644
--- a/reinforcement/pacman.py
+++ b/reinforcement/pacman.py
@@ -745,6 +745,7 @@ if __name__ == '__main__':
     """
     args = readCommand(sys.argv[1:])  # Get game components based on input
    runGames(**args)
+    args["pacman"].save_model()
 
     # import cProfile
     # cProfile.run("runGames( **args )")
diff --git a/reinforcement/reinforcementTestClasses.py b/reinforcement/reinforcementTestClasses.py
index a33ac67..bfb7ed5 100644
--- a/reinforcement/reinforcementTestClasses.py
+++ b/reinforcement/reinforcementTestClasses.py
@@ -564,7 +564,9 @@ class DeepQLearningTest(testClasses.TestCase):
         pacman = pacmanType(self.layout)
 
         # Load Ghost Agent
-        ghostType = loadAgent("RandomGhost", nographics)
+        ghost_agent_name = "DirectionalGhost"
+        ghostType = loadAgent(ghost_agent_name, nographics)
+        print("Using ghost agent:", ghost_agent_name)
         numghosts = 1
         ghosts = [ghostType(i+1) for i in range(numghosts)]
 
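
Note on the checkpointing added in deepQLearningAgents.py: it bundles the online
network, the target network, both optimizers, and the whole replay memory into one
file, so an interrupted run resumes with its full training state. A minimal
self-contained sketch of the same round-trip; TinyQNet is a stand-in for
model.DeepQNetwork (which likewise owns its optimizer), and the replay memory is
simplified to a plain list:

    import os
    import torch
    import torch.nn as nn
    import torch.optim as optim

    class TinyQNet(nn.Module):  # stand-in for model.DeepQNetwork
        def __init__(self):
            super().__init__()
            self.fc = nn.Linear(4, 2)
            # the network owns its optimizer, as DeepQNetwork does in model.py
            self.optimizer = optim.SGD(self.parameters(), lr=0.01)

    model, target = TinyQNet(), TinyQNet()
    replay_memory = []  # stand-in for ReplayMemory(50000)

    torch.save({
        'model_state_dict': model.state_dict(),
        'target_model_state_dict': target.state_dict(),
        'model_optimizer_state_dict': model.optimizer.state_dict(),
        'target_model_optimizer_state_dict': target.optimizer.state_dict(),
        'memory': replay_memory,  # pickled whole, as in the patch
    }, 'para.bin')

    if os.path.exists('para.bin'):
        # On PyTorch >= 2.6 torch.load defaults to weights_only=True; pass
        # weights_only=False since a real ReplayMemory is an arbitrary pickle.
        checkpoint = torch.load('para.bin', weights_only=False)
        model.load_state_dict(checkpoint['model_state_dict'])
        target.load_state_dict(checkpoint['target_model_state_dict'])
        model.optimizer.load_state_dict(checkpoint['model_optimizer_state_dict'])
        target.optimizer.load_state_dict(checkpoint['target_model_optimizer_state_dict'])
        replay_memory = checkpoint['memory']

Saving the optimizer state matters even for plain SGD once schedulers are enabled,
since the current learning rate lives in the optimizer's param groups.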
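Note on the epsilon schedule: with epsilon0 = 0.4, minimal_epsilon = 0.01, and the
20000-update horizon above, exploration still decays linearly but no longer reaches
zero. A standalone sketch of the schedule (the function name and the `horizon`
parameter are illustrative, not part of the patch):

    def epsilon_at(update_amount, epsilon0=0.4, minimal_epsilon=0.01, horizon=20000):
        """Linear decay from epsilon0 down to a fixed exploration floor."""
        return max(epsilon0 * (1 - update_amount / horizon), minimal_epsilon)

    # epsilon_at(0)     -> 0.40  (applies once the replay memory is warmed up)
    # epsilon_at(10000) -> 0.20
    # epsilon_at(19500) -> 0.01  (floor is reached at update 19500)
    # epsilon_at(50000) -> 0.01  (never 0, so some exploration always remains)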