The PPO algorithm is on-policy, so its sample efficiency is low (collected experience cannot be reused after each policy update), and its stochastic policy makes fine-grained control in continuous action spaces difficult.
DDPG (Deep Deterministic Policy Gradient) is an off-policy algorithm that handles continuous action spaces.
DDPG learns a deterministic policy and trains it by gradient ascent on the Q value given by the action-value (critic) network.
A random noise term N is added to the behavior policy's actions to widen exploration of unvisited states.
Four neural networks are used in total (actor, critic, and a target copy of each); the target policy and value networks are kept in sync through soft updates (an Actor-Critic model). The corresponding update rules are written out below.
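For reference, the two updates can be written out explicitly. This is the standard DDPG formulation rather than notation taken from the original post: \mu_\theta denotes the deterministic actor, Q_\phi the critic, and \theta', \phi' the target-network parameters. In the code below the soft-update step is \tau = 0.005 and the discount factor is 0.98.

\nabla_\theta J(\theta) \approx \mathbb{E}_s\!\left[ \nabla_a Q_\phi(s, a)\,\big|_{a = \mu_\theta(s)} \; \nabla_\theta \mu_\theta(s) \right]

\theta' \leftarrow \tau\,\theta + (1 - \tau)\,\theta', \qquad \phi' \leftarrow \tau\,\phi + (1 - \tau)\,\phi'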
The algorithm flow is as follows:
""" @Date :2022/11/2 @Fun: DDPG算法 """ import random import gym import torch import numpy as np from matplotlib import pyplot as plt from IPython import display env = gym.make("Pendulum-v0") # 智能体状态 state = env.reset() # 动作空间(连续性问题) actions = env.action_space print(state, actions) # 演员模型:接收一个状态,使用抽样方式确定动作 class Model(torch.nn.Module): """ 继承nn.Module,必须实现__init__() 方法和forward()方法。其中__init__() 方法里创建子模块,在forward()方法里拼接子模块。 """ def __init__(self): super().__init__() self.fc_state = torch.nn.Sequential(torch.nn.Linear(3, 64), torch.nn.ReLU(), torch.nn.Linear(64, 64), torch.nn.ReLU(), torch.nn.Linear(64, 1), torch.nn.Tanh() ) def forward(self, state): # 将结果约束到[-2, 2] 动作空间 action = self.fc_state(state) * 2.0 return action actor_model = Model() # 逐步优化的演员模型 actor_model_next = Model() actor_model_next.load_state_dict(actor_model.state_dict()) class V_Model(torch.nn.Module): def __init__(self): super(V_Model, self).__init__() self.fc_state = torch.nn.Sequential(torch.nn.Linear(4, 64), torch.nn.ReLU(), torch.nn.Linear(64, 64), torch.nn.ReLU(), torch.nn.Linear(64, 1),) def forward(self, state_action): # 接收状态和动作特征,输出价值 return self.fc_state(state_action) # 评论员模型:评价一个状态的价值,给出多好的得分 critic_model = V_Model() critic_model_next = V_Model() critic_model_next.load_state_dict(critic_model.state_dict()) # 演员模型根据状态输出确定性动作值 def get_action(state): state = torch.FloatTensor(state).reshape(1, 3) action = actor_model(state).item() # 给动作加高斯噪声,增加探索 action += random.normalvariate(0, 0.01) return action # 离线学习策略,构建Replay Buffer 样本池 datas = [] def update_data(): state = env.reset() done = False while not done: action = get_action(state) next_state, reward, done, _ = env.step([action]) datas.append((state, action, reward, next_state, done)) state = next_state while len(datas) > 10000: datas.pop(0) # 从Buffer中获取一个batch样本,迭代训练时使用 def get_samples(): batch_size = 64 sample = random.sample(datas, batch_size) states = torch.FloatTensor([i[0] for i in sample]).reshape(-1, 3) actions = torch.FloatTensor([i[1] for i in sample]).reshape(-1, 1) rewards = torch.FloatTensor([i[2] for i in sample]).reshape(-1, 1) next_states = torch.FloatTensor([i[3] for i in sample]).reshape(-1, 3) dones = torch.LongTensor([i[4] for i in sample]).reshape(-1, 1) return states, actions, rewards, next_states, dones # 价值网络评估(state, action) 的价值 def get_value(state, action): input = torch.cat([state, action], dim=1) return critic_model(input) # 监督目标y的计算(时序差分) def get_target(next_state, reward, done): # 对next_state的价值估计,首先要计算其对应的动作 action = actor_model_next(next_state) input = torch.cat([next_state, action], dim=1) target = critic_model_next(input) * 0.98 target = target * (1 - done) # 折扣价值 target = reward + target return target # 计算策略网络(演员)的优化loss,需要借助价值网络(评论员)价值最大化特性 def get_loss_action(state): action = actor_model(state) input = torch.cat([state, action], dim=1) # 价值网络越大越好,要最小化loss,故前面要加负号 loss = -critic_model(input).mean() return loss # 模型软更新 def soft_update(model, model_next): # 模型以一个微小步长参数更新 for old, new in zip(model.parameters(), model_next.parameters()): value = new.data * (1 - 0.005) + old.data * 0.005 new.data.copy_(value) def test(): state = env.reset() reward_sum = 0 over = False while not over: action = get_action(state) state, reward, over, _ = env.step([action]) reward_sum += reward return reward_sum def train(): actor_model.train() critic_model.train() optimizer = torch.optim.Adam(actor_model.parameters(), lr=5e-4) optimizer_value = torch.optim.Adam(critic_model.parameters(), lr=5e-3) # 玩N局游戏,每局游戏玩M次 for epoch in range(200): # 更新一波数据 update_data() 
        # After each data refresh, train ~200 steps (one episode yields roughly 200 transitions)
        for _ in range(200):
            states, actions, rewards, next_states, dones = get_samples()
            value = get_value(states, actions)
            # Detach the TD target so no gradients flow back into the target networks
            target = get_target(next_states, rewards, dones).detach()
            # Minimize the squared difference between the two
            loss = torch.nn.MSELoss()(value, target)
            optimizer_value.zero_grad()
            loss.backward()
            optimizer_value.step()

            # Compute the actor loss through the critic and optimize the actor
            loss_action = get_loss_action(states)
            optimizer.zero_grad()
            loss_action.backward()
            optimizer.step()

            soft_update(actor_model, actor_model_next)
            soft_update(critic_model, critic_model_next)

        if epoch % 20 == 0:
            result = sum([test() for _ in range(10)]) / 10
            print(epoch, result)


train()
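Note that the listing targets the classic Gym API (Pendulum-v0, env.reset() returning only the observation, 4-tuple env.step() results); on recent gym / gymnasium releases the environment is Pendulum-v1 and step() returns a 5-tuple, so small adjustments would be needed there. After train() finishes, the learned actor can be saved and replayed greedily, without the exploration noise used during data collection. A minimal sketch, not part of the original post: it reuses actor_model and env defined above, and the file name and the helper play_greedy are illustrative choices.

# Save the trained actor weights (file name is arbitrary).
torch.save(actor_model.state_dict(), "ddpg_pendulum_actor.pt")

# Greedy rollout: query the actor directly, without exploration noise.
def play_greedy():
    state = env.reset()
    total_reward, done = 0.0, False
    while not done:
        with torch.no_grad():
            action = actor_model(torch.FloatTensor(state).reshape(1, 3)).item()
        state, reward, done, _ = env.step([action])
        total_reward += reward
    return total_reward

print(play_greedy())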