
Reinforcement Learning Code in Practice - 08: The DDPG Algorithm


PPO is an on-policy algorithm, so its sample efficiency is low, and sampling actions from a stochastic policy makes fine-grained control in continuous action spaces harder.

DDPG (Deep Deterministic Policy Gradient) learns off-policy and handles continuous action spaces.

DDPG builds a deterministic policy and improves it by gradient ascent on the Q-value produced by the critic (the state-action value network).

A random noise term N is added on top of the deterministic action when interacting with the environment (the behavior policy), to increase exploration.

Four neural networks are used in total: an actor, a critic, and a target copy of each; the target networks are updated with soft updates (an Actor-Critic architecture).

The training flow of the algorithm is as follows:
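
In short, writing the actor as \mu_\phi, the critic as Q_\theta, and their target copies as \mu_{\phi'} and Q_{\theta'}, with discount \gamma = 0.98 and soft-update rate \tau = 0.005 (the values used in the code below), each training step computes:

y = r + \gamma \, (1 - d) \, Q_{\theta'}\big(s', \mu_{\phi'}(s')\big)

L_{\mathrm{critic}} = \big(Q_\theta(s, a) - y\big)^2

L_{\mathrm{actor}} = -\, Q_\theta\big(s, \mu_\phi(s)\big)

\theta' \leftarrow \tau \theta + (1 - \tau)\, \theta', \qquad \phi' \leftarrow \tau \phi + (1 - \tau)\, \phi'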

 

"""
@Date   :2022/11/2
@Fun: DDPG algorithm
"""
import random
import gym
import torch
import numpy as np
from matplotlib import pyplot as plt
from IPython import display

# Old Gym API (< 0.26): reset() returns the observation and step() returns 4 values
env = gym.make("Pendulum-v0")
# Initial state of the agent
state = env.reset()
# Action space (a continuous control problem)
actions = env.action_space
print(state, actions)

# Actor model: takes a state and outputs a deterministic action (exploration noise is added later in get_action)
class Model(torch.nn.Module):
    """
    Subclasses nn.Module, so it must implement __init__() and forward(): sub-modules are created in __init__() and wired together in forward().
    """
    def __init__(self):
        super().__init__()
        self.fc_state = torch.nn.Sequential(torch.nn.Linear(3, 64),
                                            torch.nn.ReLU(),
                                            torch.nn.Linear(64, 64),
                                            torch.nn.ReLU(),
                                            torch.nn.Linear(64, 1),
                                            torch.nn.Tanh()
                                            )

    def forward(self, state):
        # Tanh output lies in [-1, 1]; scale it to the Pendulum action range [-2, 2]
        action = self.fc_state(state) * 2.0
        return action

actor_model = Model()
# Target actor network: tracks actor_model slowly via soft updates
actor_model_next = Model()
actor_model_next.load_state_dict(actor_model.state_dict())

class V_Model(torch.nn.Module):
    def __init__(self):
        super(V_Model, self).__init__()
        # Input: 3 state dims + 1 action dim, concatenated
        self.fc_state = torch.nn.Sequential(torch.nn.Linear(4, 64),
                                            torch.nn.ReLU(),
                                            torch.nn.Linear(64, 64),
                                            torch.nn.ReLU(),
                                            torch.nn.Linear(64, 1),)

    def forward(self, state_action):
        # Take the concatenated state and action features, output a scalar value
        return self.fc_state(state_action)

# Critic model: scores how good a (state, action) pair is; critic_model_next is its target copy
critic_model = V_Model()
critic_model_next = V_Model()
critic_model_next.load_state_dict(critic_model.state_dict())

# The actor outputs a deterministic action for the given state
def get_action(state):
    state = torch.FloatTensor(state).reshape(1, 3)
    action = actor_model(state).item()
    # Add Gaussian noise to the action to encourage exploration
    action += random.normalvariate(0, 0.01)

    return action
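
# Optional sketch (a hypothetical helper, not used elsewhere in this script): for pure
# evaluation the exploration noise can be skipped and the deterministic output used directly
def get_action_eval(state):
    state = torch.FloatTensor(state).reshape(1, 3)
    with torch.no_grad():
        return actor_model(state).item()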

# Off-policy learning: build a replay buffer of transitions
datas = []
def update_data():
    state = env.reset()
    done = False

    while not done:
        action = get_action(state)
        next_state, reward, done, _ = env.step([action])
        datas.append((state, action, reward, next_state, done))
        state = next_state

    # Cap the replay buffer at 10,000 transitions, dropping the oldest first
    while len(datas) > 10000:
        datas.pop(0)

# Sample a batch of transitions from the buffer for each training step
def get_samples():
    batch_size = 64
    sample = random.sample(datas, batch_size)
    states = torch.FloatTensor([i[0] for i in sample]).reshape(-1, 3)
    actions = torch.FloatTensor([i[1] for i in sample]).reshape(-1, 1)
    rewards = torch.FloatTensor([i[2] for i in sample]).reshape(-1, 1)
    next_states = torch.FloatTensor([i[3] for i in sample]).reshape(-1, 3)
    dones = torch.LongTensor([i[4] for i in sample]).reshape(-1, 1)

    return states, actions, rewards, next_states, dones

# The critic evaluates the value of a (state, action) pair
def get_value(state, action):
    input = torch.cat([state, action], dim=1)

    return critic_model(input)

# Compute the TD target y for the critic (temporal difference), using the target networks
def get_target(next_state, reward, done):
    # The target actor chooses the action for next_state
    action = actor_model_next(next_state)
    input = torch.cat([next_state, action], dim=1)
    # Discounted value of the next (state, action) pair, with gamma = 0.98
    target = critic_model_next(input) * 0.98
    # Terminal states contribute no future value
    target = target * (1 - done)
    target = reward + target

    # Detach so no gradients flow back into the target networks
    return target.detach()

# Actor (policy) loss: use the critic to push the actor towards actions with higher value
def get_loss_action(state):
    action = actor_model(state)
    input = torch.cat([state, action], dim=1)
    # The critic's value should be as large as possible; negate it to get a loss to minimize
    loss = -critic_model(input).mean()

    return loss

# Soft update: the target network slowly tracks the online network
def soft_update(model, model_next):
    # target = (1 - tau) * target + tau * online, with tau = 0.005
    for param, param_next in zip(model.parameters(), model_next.parameters()):
        value = param_next.data * (1 - 0.005) + param.data * 0.005
        param_next.data.copy_(value)

# Play one episode and return the total reward (get_action still adds a little exploration noise)
def test():
    state = env.reset()
    reward_sum = 0
    over = False

    while not over:
        action = get_action(state)

        state, reward, over, _ = env.step([action])
        reward_sum += reward

    return reward_sum

def train():
    actor_model.train()
    critic_model.train()
    # Separate optimizers: the actor uses a smaller learning rate than the critic
    optimizer = torch.optim.Adam(actor_model.parameters(), lr=5e-4)
    optimizer_value = torch.optim.Adam(critic_model.parameters(), lr=5e-3)

    # Outer loop: collect one episode per epoch, then take a batch of gradient steps
    for epoch in range(200):
        # Collect one fresh episode into the replay buffer
        update_data()

        # After each refresh, train roughly 200 times (one episode yields about 200 transitions)
        for _ in range(200):

            states, actions, rewards, next_states, dones = get_samples()

            value = get_value(states, actions)
            target = get_target(next_states, rewards, dones)
            # Minimize the squared difference between the critic's estimate and the TD target
            loss = torch.nn.MSELoss()(value, target)
            optimizer_value.zero_grad()
            loss.backward()
            optimizer_value.step()

            # Use the critic to compute the actor's loss, then update the actor's parameters
            loss_action = get_loss_action(states)
            optimizer.zero_grad()
            loss_action.backward()
            optimizer.step()

            # Soft-update both target networks towards their online counterparts
            soft_update(actor_model, actor_model_next)
            soft_update(critic_model, critic_model_next)

        # Every 20 epochs, report the average return over 10 evaluation episodes
        if epoch % 20 == 0:
            result = sum([test() for _ in range(10)]) / 10
            print(epoch, result)


train()
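
The script above uses the older Gym API ("Pendulum-v0", where reset() returns the observation and step() returns four values). On Gym >= 0.26 or Gymnasium the environment is registered as "Pendulum-v1", reset() returns (observation, info), and step() returns five values. A minimal sketch of the data-collection loop under that assumption (the helper name update_data_new_api is just for illustration):

# Sketch for Gym >= 0.26 / Gymnasium: adjust the environment id and the reset()/step() signatures
env = gym.make("Pendulum-v1")

def update_data_new_api():
    state, _ = env.reset()
    done = False

    while not done:
        action = get_action(state)
        next_state, reward, terminated, truncated, _ = env.step([action])
        done = terminated or truncated
        datas.append((state, action, reward, next_state, done))
        state = next_state

    while len(datas) > 10000:
        datas.pop(0)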

 

From: https://www.cnblogs.com/demo-deng/p/16906884.html
