try to solve smallClassic

commit ceae34ea86
parent 1bf4cc1efe
Date: 2024-07-18 19:18:55 +08:00

6 changed files with 58 additions and 13 deletions

.gitignore

@@ -7,3 +7,4 @@ __pycache__/
 **/.env/
 **/.ipynb_checkpoints
 **/playground.ipynb
+**/*.bin


@@ -5,6 +5,7 @@ import layout
 import copy
 import torch
 import numpy as np
+import os

 class PacmanDeepQAgent(PacmanQAgent):
     def __init__(self, layout_input="smallGrid", target_update_rate=300, doubleQ=True, **args):
@@ -15,8 +16,9 @@ class PacmanDeepQAgent(PacmanQAgent):
         self.update_amount = 0
         self.epsilon_explore = 1.0
         self.epsilon0 = 0.4
+        self.minimal_epsilon = 0.01
         self.epsilon = self.epsilon0
-        self.discount = 0.9
+        self.discount = 0.95
         self.update_frequency = 3
         self.counts = None
         self.replay_memory = ReplayMemory(50000)
@@ -54,6 +56,27 @@ class PacmanDeepQAgent(PacmanQAgent):
         import model
         self.model = model.DeepQNetwork(state_dim, action_dim)
         self.target_model = model.DeepQNetwork(state_dim, action_dim)
+        if os.path.exists('para.bin'):
+            print("Loading model parameters from para.bin")
+            checkpoint = torch.load('para.bin')
+            self.model.load_state_dict(checkpoint['model_state_dict'])
+            self.target_model.load_state_dict(checkpoint['target_model_state_dict'])
+            self.model.optimizer.load_state_dict(checkpoint['model_optimizer_state_dict'])
+            self.target_model.optimizer.load_state_dict(checkpoint['target_model_optimizer_state_dict'])
+            self.replay_memory = checkpoint['memory']
+            print(self.model.state_dict())
+        else:
+            print("Initializing new model parameters")
+
+    def save_model(self, filename="para.bin"):
+        print(f"Saving model parameters to {filename}")
+        torch.save({
+            'model_state_dict': self.model.state_dict(),
+            'target_model_state_dict': self.target_model.state_dict(),
+            'model_optimizer_state_dict': self.model.optimizer.state_dict(),
+            'target_model_optimizer_state_dict': self.target_model.optimizer.state_dict(),
+            'memory': self.replay_memory
+        }, filename)
+        print(self.model.state_dict())

     def getQValue(self, state, action):
         """
@@ -136,7 +159,7 @@ class PacmanDeepQAgent(PacmanQAgent):
         if len(self.replay_memory) < self.min_transitions_before_training:
             self.epsilon = self.epsilon_explore
         else:
-            self.epsilon = max(self.epsilon0 * (1 - self.update_amount / 20000), 0)
+            self.epsilon = max(self.epsilon0 * (1 - self.update_amount / 20000), self.minimal_epsilon)

         if len(self.replay_memory) > self.min_transitions_before_training and self.update_amount % self.update_frequency == 0:
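The schedule change above keeps some exploration for the rest of training: epsilon decays linearly from epsilon0 = 0.4 and is now clamped at minimal_epsilon = 0.01 instead of 0, so with the 20000-update horizon it reaches the floor after roughly 19,500 updates and stays there. A standalone sketch of that schedule (function name and defaults are illustrative):

def epsilon_schedule(update_amount, epsilon0=0.4, minimal_epsilon=0.01, horizon=20000):
    # Linear decay from epsilon0 down to a fixed exploration floor,
    # mirroring the changed line above.
    return max(epsilon0 * (1 - update_amount / horizon), minimal_epsilon)

# epsilon_schedule(0) -> 0.4, epsilon_schedule(10000) -> 0.2,
# and from about update 19,500 onwards it stays at the 0.01 floor.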


@@ -249,6 +249,13 @@ class ReinforcementAgent(ValueEstimationAgent):
                 print('\tAverage Rewards over testing: %.2f' % testAvg)
             print('\tAverage Rewards for last %d episodes: %.2f' % (
                     NUM_EPS_UPDATE,windowAvg))
+            if windowAvg > -220:
+                if not hasattr(self, 'best_window_avg_score'):
+                    self.best_window_avg_score = -300
+                if windowAvg > self.best_window_avg_score:
+                    print("find an excellent policy, ready to save model")
+                    self.save_model("para.best.bin")
+                    self.best_window_avg_score = windowAvg
             print('\tEpisode took %.2f seconds' % (time.time() - self.episodeStartTime))
             self.lastWindowAccumRewards = 0.0
             self.episodeStartTime = time.time()
@@ -256,3 +263,5 @@ class ReinforcementAgent(ValueEstimationAgent):
         if self.episodesSoFar == self.numTraining:
             msg = 'Training Done (turning off epsilon and alpha)'
             print('%s\n%s' % (msg,'-' * len(msg)))
+            import traceback
+            traceback.print_stack()
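The best-model block above checkpoints the agent whenever the windowed average reward clears the -220 threshold and beats the best value seen so far (lazily initialised to -300). The same "save on new best" pattern, sketched as a standalone helper that starts from float('-inf') instead of a hard-coded -300 (names are illustrative, not from the repository):

def maybe_save_best(agent, window_avg, threshold=-220, path="para.best.bin"):
    # Sketch: remember the best windowed average on the agent and
    # checkpoint only when it improves past the threshold.
    best = getattr(agent, 'best_window_avg_score', float('-inf'))
    if window_avg > threshold and window_avg > best:
        print("found a better policy, saving model to", path)
        agent.save_model(path)
        agent.best_window_avg_score = window_avg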


@@ -26,21 +26,25 @@ class DeepQNetwork(Module):
         "*** YOUR CODE HERE ***"
         # Initialize layers
         self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
-        layer1_size = 300
-        layer2_size = 300
-        # layer3_size = 500
+        layer1_size = 512
+        layer2_size = 128
+        layer3_size = 64
         self.fc1 = Linear(state_dim, layer1_size).to(self.device)
         self.fc2 = Linear(layer1_size, layer2_size).to(self.device)
-        # self.fc3 = Linear(layer2_size, layer3_size).to(self.device)
-        self.fc_out = Linear(layer2_size, action_dim).to(self.device)
+        self.fc3 = Linear(layer2_size, layer3_size).to(self.device)
+        self.fc_out = Linear(layer3_size, action_dim).to(self.device)

         # Set learning parameters
-        self.learning_rate = 0.1
-        self.numTrainingGames = 3800
+        self.learning_rate = 0.01
+        self.numTrainingGames = 5000
         self.batch_size = 128

         # Optimizer
         self.optimizer = optim.SGD(self.parameters(), lr=self.learning_rate)
-        # self.scheduler1 = torch.optim.lr_scheduler.CosineAnnealingLR(self.optimizer, T_max=27500, eta_min=self.learning_rate*0.25)
+        # Replace with CosineAnnealingLR
+        # self.scheduler2 = torch.optim.lr_scheduler.ExponentialLR(self.optimizer, gamma=0.9999)
+        self.output_step = 500
+        self.output_cnt = 0
         "**END CODE"""
         self.double()
@@ -80,7 +84,7 @@ class DeepQNetwork(Module):
         states = states.to(self.device)
         x = relu(self.fc1(states))
         x = relu(self.fc2(x))
-        # x = relu(self.fc3(x))
+        x = relu(self.fc3(x))
         Q_values = self.fc_out(x)
         return Q_values
@@ -107,3 +111,8 @@ class DeepQNetwork(Module):
         loss = self.get_loss(states, Q_target)
         loss.backward()
         self.optimizer.step()
+        # self.scheduler1.step()
+        # self.scheduler2.step()
+        self.output_cnt += 1
+        if self.output_cnt % self.output_step == 0:
+            print("now lr is: ", self.optimizer.param_groups[0]['lr'], "update count", self.output_cnt)


@@ -745,6 +745,7 @@ if __name__ == '__main__':
     """
     args = readCommand(sys.argv[1:]) # Get game components based on input
     runGames(**args)
+    args["pacman"].save_model()

     # import cProfile
     # cProfile.run("runGames( **args )")
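Calling args["pacman"].save_model() unconditionally assumes the selected agent is the deep Q-learning agent; agents without a save_model method would raise AttributeError here. A hedged alternative sketch:

# Only checkpoint agents that actually support it (e.g. a keyboard agent
# defines no save_model method).
pacman_agent = args["pacman"]
if hasattr(pacman_agent, "save_model"):
    pacman_agent.save_model()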


@@ -564,7 +564,9 @@ class DeepQLearningTest(testClasses.TestCase):
         pacman = pacmanType(self.layout)

         # Load Ghost Agent
-        ghostType = loadAgent("RandomGhost", nographics)
+        ghost_agent_name = "DirectionalGhost"
+        ghostType = loadAgent(ghost_agent_name, nographics)
+        print("using ghost agent", ghost_agent_name)
         numghosts = 1
         ghosts = [ghostType(i+1) for i in range(numghosts)]