try to solve smallClassic
.gitignore (vendored)
@@ -6,4 +6,5 @@ __pycache__/
 /.github
 **/.env/
 **/.ipynb_checkpoints
 **/playground.ipynb
+**/*.bin
@@ -5,6 +5,7 @@ import layout
 import copy
 import torch
 import numpy as np
+import os
 
 class PacmanDeepQAgent(PacmanQAgent):
     def __init__(self, layout_input="smallGrid", target_update_rate=300, doubleQ=True, **args):
@@ -15,8 +16,9 @@ class PacmanDeepQAgent(PacmanQAgent):
         self.update_amount = 0
         self.epsilon_explore = 1.0
         self.epsilon0 = 0.4
+        self.minimal_epsilon = 0.01
         self.epsilon = self.epsilon0
-        self.discount = 0.9
+        self.discount = 0.95
         self.update_frequency = 3
         self.counts = None
         self.replay_memory = ReplayMemory(50000)
@@ -54,6 +56,27 @@ class PacmanDeepQAgent(PacmanQAgent):
         import model
         self.model = model.DeepQNetwork(state_dim, action_dim)
         self.target_model = model.DeepQNetwork(state_dim, action_dim)
+        if os.path.exists('para.bin'):
+            print("Loading model parameters from para.bin")
+            checkpoint = torch.load('para.bin')
+            self.model.load_state_dict(checkpoint['model_state_dict'])
+            self.target_model.load_state_dict(checkpoint['target_model_state_dict'])
+            self.model.optimizer.load_state_dict(checkpoint['model_optimizer_state_dict'])
+            self.target_model.optimizer.load_state_dict(checkpoint['target_model_optimizer_state_dict'])
+            self.replay_memory = checkpoint['memory']
+            print(self.model.state_dict())
+        else:
+            print("Initializing new model parameters")
+
+    def save_model(self, filename="para.bin"):
+        print(f"Saving model parameters to {filename}")
+        torch.save({
+            'model_state_dict': self.model.state_dict(),
+            'target_model_state_dict': self.target_model.state_dict(),
+            'model_optimizer_state_dict': self.model.optimizer.state_dict(),
+            'target_model_optimizer_state_dict': self.target_model.optimizer.state_dict(),
+            'memory': self.replay_memory
+        }, filename)
+        print(self.model.state_dict())
 
     def getQValue(self, state, action):
         """
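
Note on the checkpointing added above: both networks, both optimizers, and the replay memory are bundled into one file with torch.save, and they are restored whenever para.bin exists. A minimal stand-alone sketch of that round-trip, using a toy nn.Linear in place of DeepQNetwork and a plain list in place of ReplayMemory (both are assumptions for illustration; the real ReplayMemory object is pickled whole, so its class must be importable at load time):

    import os
    import torch
    from torch import nn, optim

    model = nn.Linear(4, 2)                       # stand-in for DeepQNetwork
    target_model = nn.Linear(4, 2)
    model_opt = optim.SGD(model.parameters(), lr=0.01)
    target_opt = optim.SGD(target_model.parameters(), lr=0.01)
    memory = []                                   # stand-in for ReplayMemory

    torch.save({
        'model_state_dict': model.state_dict(),
        'target_model_state_dict': target_model.state_dict(),
        'model_optimizer_state_dict': model_opt.state_dict(),
        'target_model_optimizer_state_dict': target_opt.state_dict(),
        'memory': memory,
    }, 'para.bin')

    if os.path.exists('para.bin'):
        # newer PyTorch versions may need weights_only=False to unpickle the memory object
        checkpoint = torch.load('para.bin')
        model.load_state_dict(checkpoint['model_state_dict'])
        model_opt.load_state_dict(checkpoint['model_optimizer_state_dict'])
        memory = checkpoint['memory']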
@@ -136,7 +159,7 @@ class PacmanDeepQAgent(PacmanQAgent):
         if len(self.replay_memory) < self.min_transitions_before_training:
             self.epsilon = self.epsilon_explore
         else:
-            self.epsilon = max(self.epsilon0 * (1 - self.update_amount / 20000), 0)
+            self.epsilon = max(self.epsilon0 * (1 - self.update_amount / 20000), self.minimal_epsilon)
 
 
         if len(self.replay_memory) > self.min_transitions_before_training and self.update_amount % self.update_frequency == 0:
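
Note: with the floor changed from 0 to self.minimal_epsilon, exploration never fully switches off after the 20000-update linear decay. A quick sketch of the schedule this line produces, using the epsilon0 = 0.4 and minimal_epsilon = 0.01 values set in __init__:

    epsilon0, minimal_epsilon = 0.4, 0.01
    for t in (0, 5000, 10000, 19500, 25000):
        eps = max(epsilon0 * (1 - t / 20000), minimal_epsilon)
        print(t, round(eps, 3))
    # prints: 0 0.4, 5000 0.3, 10000 0.2, 19500 0.01, 25000 0.01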
@@ -249,6 +249,13 @@ class ReinforcementAgent(ValueEstimationAgent):
                 print('\tAverage Rewards over testing: %.2f' % testAvg)
             print('\tAverage Rewards for last %d episodes: %.2f' % (
                     NUM_EPS_UPDATE,windowAvg))
+            if windowAvg > -220:
+                if not hasattr(self, 'best_window_avg_score'):
+                    self.best_window_avg_score = -300
+                if windowAvg > self.best_window_avg_score:
+                    print("found an excellent policy, ready to save model")
+                    self.save_model("para.best.bin")
+                    self.best_window_avg_score = windowAvg
             print('\tEpisode took %.2f seconds' % (time.time() - self.episodeStartTime))
             self.lastWindowAccumRewards = 0.0
             self.episodeStartTime = time.time()
@@ -256,3 +263,5 @@ class ReinforcementAgent(ValueEstimationAgent):
         if self.episodesSoFar == self.numTraining:
             msg = 'Training Done (turning off epsilon and alpha)'
             print('%s\n%s' % (msg,'-' * len(msg)))
+            import traceback
+            traceback.print_stack()
@@ -26,21 +26,25 @@ class DeepQNetwork(Module):
         "*** YOUR CODE HERE ***"
         # Initialize layers
         self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
-        layer1_size = 300
-        layer2_size = 300
-        # layer3_size = 500
+        layer1_size = 512
+        layer2_size = 128
+        layer3_size = 64
         self.fc1 = Linear(state_dim, layer1_size).to(self.device)
         self.fc2 = Linear(layer1_size, layer2_size).to(self.device)
-        # self.fc3 = Linear(layer2_size, layer3_size).to(self.device)
-        self.fc_out = Linear(layer2_size, action_dim).to(self.device)
+        self.fc3 = Linear(layer2_size, layer3_size).to(self.device)
+        self.fc_out = Linear(layer3_size, action_dim).to(self.device)
 
         # Set learning parameters
-        self.learning_rate = 0.1
-        self.numTrainingGames = 3800
+        self.learning_rate = 0.01
+        self.numTrainingGames = 5000
         self.batch_size = 128
 
         # Optimizer
         self.optimizer = optim.SGD(self.parameters(), lr=self.learning_rate)
+        # self.scheduler1 = torch.optim.lr_scheduler.CosineAnnealingLR(self.optimizer, T_max=27500, eta_min=self.learning_rate*0.25)
+        # self.scheduler2 = torch.optim.lr_scheduler.ExponentialLR(self.optimizer, gamma=0.9999)
+        self.output_step = 500
+        self.output_cnt = 0
 
         "**END CODE"""
         self.double()
@@ -80,7 +84,7 @@ class DeepQNetwork(Module):
         states = states.to(self.device)
         x = relu(self.fc1(states))
         x = relu(self.fc2(x))
-        # x = relu(self.fc3(x))
+        x = relu(self.fc3(x))
         Q_values = self.fc_out(x)
         return Q_values
 
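
Note: with fc3 enabled, the Q-network is now a three-hidden-layer MLP (state_dim -> 512 -> 128 -> 64 -> action_dim) with ReLU activations. A self-contained sketch of the equivalent forward pass on CPU, with dummy dimensions chosen only for illustration:

    import torch
    from torch.nn import Linear, Module
    from torch.nn.functional import relu

    class TinyDQN(Module):                 # illustrative stand-in, not the project's DeepQNetwork
        def __init__(self, state_dim, action_dim):
            super().__init__()
            self.fc1 = Linear(state_dim, 512)
            self.fc2 = Linear(512, 128)
            self.fc3 = Linear(128, 64)
            self.fc_out = Linear(64, action_dim)

        def forward(self, states):
            x = relu(self.fc1(states))
            x = relu(self.fc2(x))
            x = relu(self.fc3(x))
            return self.fc_out(x)          # one Q-value per action

    q_values = TinyDQN(state_dim=10, action_dim=5)(torch.randn(3, 10))
    print(q_values.shape)                  # torch.Size([3, 5])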
@@ -106,4 +110,9 @@ class DeepQNetwork(Module):
         self.optimizer.zero_grad()
         loss = self.get_loss(states, Q_target)
         loss.backward()
         self.optimizer.step()
+        # self.scheduler1.step()
+        # self.scheduler2.step()
+        self.output_cnt += 1
+        if self.output_cnt % self.output_step == 0:
+            print("now lr is: ", self.optimizer.param_groups[0]['lr'], "update count", self.output_cnt)
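
Note: the scheduler1/scheduler2 calls stay commented out here, so the learning rate printed every output_step updates remains constant at 0.01. If the CosineAnnealingLR from __init__ were enabled, it would be stepped once per gradient update, roughly as in this sketch (stand-in model and optimizer, made under that assumption):

    import torch
    from torch import nn, optim

    net = nn.Linear(4, 2)                  # stand-in for the Q-network
    opt = optim.SGD(net.parameters(), lr=0.01)
    sched = optim.lr_scheduler.CosineAnnealingLR(opt, T_max=27500, eta_min=0.01 * 0.25)

    for step in range(1, 1501):
        opt.step()                         # would follow loss.backward() in real training
        sched.step()                       # anneal lr along a cosine toward eta_min
        if step % 500 == 0:
            print("now lr is:", opt.param_groups[0]['lr'], "update count", step)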
@@ -745,6 +745,7 @@ if __name__ == '__main__':
     """
     args = readCommand(sys.argv[1:])  # Get game components based on input
     runGames(**args)
+    args["pacman"].save_model()
 
     # import cProfile
     # cProfile.run("runGames( **args )")
@@ -564,7 +564,9 @@ class DeepQLearningTest(testClasses.TestCase):
         pacman = pacmanType(self.layout)
 
         # Load Ghost Agent
-        ghostType = loadAgent("RandomGhost", nographics)
+        ghost_agent_name = "DirectionalGhost"
+        ghostType = loadAgent(ghost_agent_name, nographics)
+        print("using ghost agent", ghost_agent_name)
         numghosts = 1
         ghosts = [ghostType(i+1) for i in range(numghosts)]
 