
Reinforcement Learning Code in Practice - 08: The DDPG Algorithm


PPO is an on-policy algorithm, so its sample efficiency is low, and sampling actions from a stochastic policy makes fine-grained control in continuous action spaces harder.

DDPG (Deep Deterministic Policy Gradient) learns off-policy and handles continuous action spaces.

DDPG builds a deterministic policy and improves it by gradient ascent on the Q-value produced by the critic (the state-action value network).

A random noise term N is added on top of the deterministic action when interacting with the environment (the behavior policy), to increase exploration.

Four neural networks are used in total: an actor, a critic, and a target copy of each; the target networks are updated with soft updates (an Actor-Critic architecture).

The training flow of the algorithm is as follows:
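
In short, writing the actor as \mu_\phi, the critic as Q_\theta, and their target copies as \mu_{\phi'} and Q_{\theta'}, with discount \gamma = 0.98 and soft-update rate \tau = 0.005 (the values used in the code below), each training step computes:

y = r + \gamma \, (1 - d) \, Q_{\theta'}\big(s', \mu_{\phi'}(s')\big)

L_{\mathrm{critic}} = \big(Q_\theta(s, a) - y\big)^2

L_{\mathrm{actor}} = -\, Q_\theta\big(s, \mu_\phi(s)\big)

\theta' \leftarrow \tau \theta + (1 - \tau)\, \theta', \qquad \phi' \leftarrow \tau \phi + (1 - \tau)\, \phi'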

 

"""
@Date   :2022/11/2
@Fun: DDPG algorithm
"""
import random
import gym
import torch
import numpy as np
from matplotlib import pyplot as plt
from IPython import display

# Old Gym API (< 0.26): reset() returns the observation and step() returns 4 values
env = gym.make("Pendulum-v0")
# Initial state of the agent
state = env.reset()
# Action space (a continuous control problem)
actions = env.action_space
print(state, actions)

# Actor model: takes a state and outputs a deterministic action (exploration noise is added later in get_action)
class Model(torch.nn.Module):
    """
    Subclasses nn.Module, so it must implement __init__() and forward(): sub-modules are created in __init__() and wired together in forward().
    """
    def __init__(self):
        super().__init__()
        self.fc_state = torch.nn.Sequential(torch.nn.Linear(3, 64),
                                            torch.nn.ReLU(),
                                            torch.nn.Linear(64, 64),
                                            torch.nn.ReLU(),
                                            torch.nn.Linear(64, 1),
                                            torch.nn.Tanh()
                                            )

    def forward(self, state):
        # Tanh output lies in [-1, 1]; scale it to the Pendulum action range [-2, 2]
        action = self.fc_state(state) * 2.0
        return action

actor_model = Model()
# Target actor network: tracks actor_model slowly via soft updates
actor_model_next = Model()
actor_model_next.load_state_dict(actor_model.state_dict())

class V_Model(torch.nn.Module):
    def __init__(self):
        super(V_Model, self).__init__()
        # Input: 3 state dims + 1 action dim, concatenated
        self.fc_state = torch.nn.Sequential(torch.nn.Linear(4, 64),
                                            torch.nn.ReLU(),
                                            torch.nn.Linear(64, 64),
                                            torch.nn.ReLU(),
                                            torch.nn.Linear(64, 1),)

    def forward(self, state_action):
        # Take the concatenated state and action features, output a scalar value
        return self.fc_state(state_action)

# Critic model: scores how good a (state, action) pair is; critic_model_next is its target copy
critic_model = V_Model()
critic_model_next = V_Model()
critic_model_next.load_state_dict(critic_model.state_dict())

# The actor outputs a deterministic action for the given state
def get_action(state):
    state = torch.FloatTensor(state).reshape(1, 3)
    action = actor_model(state).item()
    # Add Gaussian noise to the action to encourage exploration
    action += random.normalvariate(0, 0.01)

    return action
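
# Optional sketch (a hypothetical helper, not used elsewhere in this script): for pure
# evaluation the exploration noise can be skipped and the deterministic output used directly
def get_action_eval(state):
    state = torch.FloatTensor(state).reshape(1, 3)
    with torch.no_grad():
        return actor_model(state).item()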

# Off-policy learning: build a replay buffer of transitions
datas = []
def update_data():
    state = env.reset()
    done = False

    while not done:
        action = get_action(state)
        next_state, reward, done, _ = env.step([action])
        datas.append((state, action, reward, next_state, done))
        state = next_state

    # Cap the replay buffer at 10,000 transitions, dropping the oldest first
    while len(datas) > 10000:
        datas.pop(0)

# Sample a batch of transitions from the buffer for each training step
def get_samples():
    batch_size = 64
    sample = random.sample(datas, batch_size)
    states = torch.FloatTensor([i[0] for i in sample]).reshape(-1, 3)
    actions = torch.FloatTensor([i[1] for i in sample]).reshape(-1, 1)
    rewards = torch.FloatTensor([i[2] for i in sample]).reshape(-1, 1)
    next_states = torch.FloatTensor([i[3] for i in sample]).reshape(-1, 3)
    dones = torch.LongTensor([i[4] for i in sample]).reshape(-1, 1)

    return states, actions, rewards, next_states, dones

# The critic evaluates the value of a (state, action) pair
def get_value(state, action):
    input = torch.cat([state, action], dim=1)

    return critic_model(input)

# Compute the TD target y for the critic (temporal difference), using the target networks
def get_target(next_state, reward, done):
    # The target actor chooses the action for next_state
    action = actor_model_next(next_state)
    input = torch.cat([next_state, action], dim=1)
    # Discounted value of the next (state, action) pair, with gamma = 0.98
    target = critic_model_next(input) * 0.98
    # Terminal states contribute no future value
    target = target * (1 - done)
    target = reward + target

    # Detach so no gradients flow back into the target networks
    return target.detach()

# Actor (policy) loss: use the critic to push the actor towards actions with higher value
def get_loss_action(state):
    action = actor_model(state)
    input = torch.cat([state, action], dim=1)
    # The critic's value should be as large as possible; negate it to get a loss to minimize
    loss = -critic_model(input).mean()

    return loss

# Soft update: the target network slowly tracks the online network
def soft_update(model, model_next):
    # target = (1 - tau) * target + tau * online, with tau = 0.005
    for param, param_next in zip(model.parameters(), model_next.parameters()):
        value = param_next.data * (1 - 0.005) + param.data * 0.005
        param_next.data.copy_(value)

# Play one episode and return the total reward (get_action still adds a little exploration noise)
def test():
    state = env.reset()
    reward_sum = 0
    over = False

    while not over:
        action = get_action(state)

        state, reward, over, _ = env.step([action])
        reward_sum += reward

    return reward_sum

def train():
    actor_model.train()
    critic_model.train()
    # Separate optimizers: the actor uses a smaller learning rate than the critic
    optimizer = torch.optim.Adam(actor_model.parameters(), lr=5e-4)
    optimizer_value = torch.optim.Adam(critic_model.parameters(), lr=5e-3)

    # Outer loop: collect one episode per epoch, then take a batch of gradient steps
    for epoch in range(200):
        # Collect one fresh episode into the replay buffer
        update_data()

        # After each refresh, train roughly 200 times (one episode yields about 200 transitions)
        for _ in range(200):

            states, actions, rewards, next_states, dones = get_samples()

            value = get_value(states, actions)
            target = get_target(next_states, rewards, dones)
            # Minimize the squared difference between the critic's estimate and the TD target
            loss = torch.nn.MSELoss()(value, target)
            optimizer_value.zero_grad()
            loss.backward()
            optimizer_value.step()

            # Use the critic to compute the actor's loss, then update the actor's parameters
            loss_action = get_loss_action(states)
            optimizer.zero_grad()
            loss_action.backward()
            optimizer.step()

            # Soft-update both target networks towards their online counterparts
            soft_update(actor_model, actor_model_next)
            soft_update(critic_model, critic_model_next)

        # Every 20 epochs, report the average return over 10 evaluation episodes
        if epoch % 20 == 0:
            result = sum([test() for _ in range(10)]) / 10
            print(epoch, result)


train()
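
The script above uses the older Gym API ("Pendulum-v0", where reset() returns the observation and step() returns four values). On Gym >= 0.26 or Gymnasium the environment is registered as "Pendulum-v1", reset() returns (observation, info), and step() returns five values. A minimal sketch of the data-collection loop under that assumption (the helper name update_data_new_api is just for illustration):

# Sketch for Gym >= 0.26 / Gymnasium: adjust the environment id and the reset()/step() signatures
env = gym.make("Pendulum-v1")

def update_data_new_api():
    state, _ = env.reset()
    done = False

    while not done:
        action = get_action(state)
        next_state, reward, terminated, truncated, _ = env.step([action])
        done = terminated or truncated
        datas.append((state, action, reward, next_state, done))
        state = next_state

    while len(datas) > 10000:
        datas.pop(0)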

 

From: https://www.cnblogs.com/demo-deng/p/16906884.html
