Building a Neuroevolutionary Agent
Evolutionary methods are based on black-box optimization; since no gradient computation is involved, they are also known as gradient-free methods.
This section uses TensorFlow 2.x to implement a simple neuroevolutionary agent that approximates the cross-entropy method.
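To make "gradient-free" concrete: a black-box optimizer only evaluates candidate parameters and keeps the better ones, with no backpropagation involved. A minimal hill-climbing sketch (all names and values here are illustrative, not part of this recipe):

```python
import numpy as np

def black_box_optimize(evaluate_fitness, init_params, population=50, iters=100, sigma=0.1):
    """Hill-climb by randomly perturbing parameters and keeping the best candidate."""
    best_params, best_fitness = init_params, evaluate_fitness(init_params)
    for _ in range(iters):
        for _ in range(population):
            # Propose a candidate by adding Gaussian noise -- no gradients required
            candidate = best_params + sigma * np.random.randn(*best_params.shape)
            fitness = evaluate_fitness(candidate)
            if fitness > best_fitness:
                best_params, best_fitness = candidate, fitness
    return best_params
```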
Getting ready
The following packages are required:
from collections import namedtuple
import gym
import matplotlib.pyplot as plt
import numpy as np
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from tqdm import tqdm
How to do it
We will build a neural network that uses an evolutionary process to improve its policy for navigating the Gridworld environment.
- Define the Agent and Brain classes
```python
class Brain(keras.Model):
    def __init__(self, action_dim=5, input_shape=(1, 8 * 8)):
        """Initialize the Agent's Brain model

        Args:
            action_dim (int): Number of actions
        """
        super(Brain, self).__init__()
        self.dense1 = layers.Dense(32, input_shape=input_shape, activation="relu")
        self.logits = layers.Dense(action_dim)

    def call(self, inputs):
        x = tf.convert_to_tensor(inputs)
        logits = self.logits(self.dense1(x))
        return logits

    def process(self, observations):
        # Process batch observations using `call(inputs)` behind-the-scenes
        action_logits = self.predict_on_batch(observations)
        return action_logits


class Agent(object):
    def __init__(self, brain):
        """Agent with a neural-network brain powered policy

        Args:
            brain (keras.Model): Neural Network based model
        """
        self.brain = brain
        self.policy = self.policy_mlp

    def policy_mlp(self, observations):
        observations = observations.reshape(1, -1)
        action_logits = self.brain.process(observations)
        action = tf.random.categorical(tf.math.log(action_logits), num_samples=1)
        return action  # tf.squeeze(action, axis=0)

    def get_action(self, observations):
        return self.policy(observations)

    def learn(self, obs, actions, **kwargs):
        self.brain.fit(obs, actions, **kwargs)
```
- Implement the rollout() function, which simulates the agent's trajectory in the given environment for one episode and returns obs_batch, actions_batch, and episode_reward
```python
def rollout(agent, env, render=False):
    """Rollout `agent` in the `environment` for 1 episode

    Args:
        agent (Agent): Agent/policy to generate state-conditioned actions
        env (gym.Env): A Gym environment
        render (bool, optional): Enable/disable rendering. Defaults to False.

    Returns:
        obs_batch (List): Batch of observations collected in the episode
        actions_batch (List): Batch of actions performed in the episode
        episode_reward (float): Total rewards accumulated in this episode
    """
    obs, episode_reward, done, step_num = env.reset(), 0.0, False, 0
    observations, actions = [], []
    episode_reward = 0.0
    while not done:
        action = agent.get_action(obs)
        next_obs, reward, done, info = env.step(action)
        # Save experience
        observations.append(
            np.array(obs).reshape(-1)
        )  # Convert to numpy & reshape (8, 8) to (1, 64)
        actions.append(np.squeeze(action, 0))
        episode_reward += reward
        obs = next_obs
        step_num += 1
        if render:
            env.render()
    env.close()
    return observations, actions, episode_reward
```
- Test the trajectory rollout method
```python
env = gym.make("Gridworld-v0")
brain = Brain(env.action_space.n)
brain.compile(loss="categorical_crossentropy", optimizer="adam", metrics=["accuracy"])
print(env.action_space.n)
agent = Agent(brain)
obs_batch, actions_batch, episode_reward = rollout(agent, env)
```
- Verify that the experience data generated by the rollout is consistent
assert len(obs_batch) == len(actions_batch)
- Roll out several complete trajectories to collect experience data
trajectories = [rollout(agent, env, render=True) for _ in tqdm(range(10))]
- Use samples of the experience data to look at the distribution of rewards
```python
from tqdm.auto import tqdm
import matplotlib.pyplot as plt
%matplotlib inline

sample_ep_rewards = [rollout(agent, env)[-1] for _ in tqdm(range(100))]
plt.hist(sample_ep_rewards, bins=10, histtype="bar")
```
- Create a container for storing trajectories
Trajectory = namedtuple("Trajectory", ["obs", "actions", "reward"])
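For example, a single rollout can be packed into this container as follows (the field order matches the values returned by rollout()):

```python
t = Trajectory(*rollout(agent, env))
print(len(t.obs), len(t.actions), t.reward)
```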
- Select elite experiences for the evolutionary process
```python
def gather_elite_xp(trajectories, elitism_criterion):
    """Gather elite trajectories from the batch of trajectories

    Args:
        trajectories (List): List of episode trajectories containing experiences
            (obs, actions, episode_reward)
        elitism_criterion (int): Reward percentile used to select the elite trajectories

    Returns:
        elite_batch_obs
        elite_batch_actions
        elite_reward_threshold
    """
    trajectory_obs, trajectory_actions, trajectory_rewards = zip(*trajectories)
    reward_threshold = np.percentile(trajectory_rewards, elitism_criterion)
    indices = [
        index
        for index, value in enumerate(trajectory_rewards)
        if value >= reward_threshold
    ]
    elite_trajectory_obs = [trajectory_obs[i] for i in indices]
    elite_trajectory_actions = [trajectory_actions[i] for i in indices]
    unpacked_elite_batch_obs = [
        item for items in elite_trajectory_obs for item in items
    ]
    unpacked_elite_batch_actions = [
        item for items in elite_trajectory_actions for item in items
    ]
    return (
        np.array(unpacked_elite_batch_obs),
        np.array(unpacked_elite_batch_actions),
        reward_threshold,
    )
```
- Test the elite-experience gathering routine
elite_obs, elite_actions, reward_threshold = gather_elite_xp(trajectories, elitism_criterion=75)
- Define a helper function that converts a discrete action index into a one-hot encoded vector / action probability distribution
```python
def gen_action_distribution(action_index, action_dim=5):
    action_distribution = np.zeros(action_dim).astype(type(action_index))
    action_distribution[action_index] = 1
    # action_distribution = np.expand_dims(action_distribution, 0)
    return action_distribution
```
- Test the action-distribution generation function
elite_action_distributions = np.array([gen_action_distribution(a.item()) for a in elite_actions])
- Create and compile the neural brain
```python
brain = Brain(env.action_space.n)
brain.compile(loss="categorical_crossentropy", optimizer="adam", metrics=["accuracy"])
```
- Test the brain's training loop
```python
elite_obs, elite_action_distributions = elite_obs.astype("float16"), elite_action_distributions.astype("float16")
# elite_obs = elite_obs.squeeze(-2)
# elite_action_distributions = elite_action_distributions.squeeze(-2)
print(elite_obs.shape, elite_action_distributions.shape)
brain.fit(elite_obs, elite_action_distributions, batch_size=128, epochs=1)
```
- Implement the helper function evaluate(), which evaluates the agent in a given environment
```python
def evaluate(agent, env, render=True):
    global info
    obs, episode_reward, done, step_num = env.reset(), 0.0, False, 0
    while not done:
        action = agent.get_action(obs)
        next_obs, reward, done, info = env.step(action)
        episode_reward += reward
        obs = next_obs
        step_num += 1
        if render:
            env.render()
    return step_num, episode_reward, done, info
```
- Test the agent evaluation loop
```python
env = gym.make("Gridworld-v0")
agent = Agent(brain)
for _ in tqdm(range(10)):
    steps, episode_reward, done, info = evaluate(agent, env)
    print(f"Episode reward: {episode_reward}, Steps: {steps}")
env.close()
```
Episode reward: -5.899999999999995, Steps: 100
Episode reward: -6.399999999999993, Steps: 100
Episode reward: -6.699999999999992, Steps: 100
Episode reward: -7.099999999999991, Steps: 100
Episode reward: -7.29999999999999, Steps: 100
Episode reward: -6.699999999999992, Steps: 100
Episode reward: -5.799999999999995, Steps: 100
Episode reward: -5.899999999999995, Steps: 100
Episode reward: -6.8999999999999915, Steps: 100
- Define the training-loop parameters and create the environment, Brain, and Agent
```python
total_trajectory_rollouts = 70
elitism_criterion = 70
num_epochs = 200

mean_rewards = []
elite_reward_thresholds = []
```
```python
env = gym.make("Gridworld-v0")
input_shape = (env.observation_space.shape[0], env.observation_space.shape[1])
brain = Brain(env.action_space.n)
brain.compile(loss="categorical_crossentropy", optimizer="adam", metrics=["accuracy"])
agent = Agent(brain)

for i in tqdm(range(num_epochs)):
    trajectories = [
        Trajectory(*rollout(agent, env)) for _ in range(total_trajectory_rollouts)
    ]
    _, _, batch_rewards = zip(*trajectories)
    elite_obs, elite_actions, elite_threshold = gather_elite_xp(
        trajectories, elitism_criterion
    )
    elite_action_distributions = np.array(
        [gen_action_distribution(a.item()) for a in elite_actions]
    )
    elite_obs, elite_action_distributions = (
        elite_obs.astype("float16"),
        elite_action_distributions.astype("float16"),
    )
    brain.fit(elite_obs, elite_action_distributions, batch_size=128, epochs=1)
    elite_reward_thresholds.append(elite_threshold)
    mean_rewards.append(np.mean(batch_rewards))
    print(f"Episode#:{i + 1} elite-reward-threshold:{elite_reward_thresholds[-1]:.2f} reward: {mean_rewards[-1]:.2f}")

plt.plot(mean_rewards, "r", label="Mean rewards")
plt.plot(elite_reward_thresholds, "g", label="Elite reward thresholds")
plt.legend()
plt.grid()
plt.show()
```
How it works
In each iteration, the evolutionary process rolls out a set of trajectories, collecting experience data generated with the current set of neural weights in the agent's brain.
An experience-selection step then keeps the trajectories/experiences in the top k-th percentile (the elitism criterion), ranked by the episode reward they obtained.
The selected experience data is then used to update the agent's brain model.
This process is repeated for a preset number of iterations, allowing the agent's brain model to keep improving and accumulate more reward.
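The whole loop can be condensed into the following sketch, reusing rollout(), gather_elite_xp(), and gen_action_distribution() from the steps above (num_generations is an illustrative name and value):

```python
num_generations = 100  # illustrative value
for generation in range(num_generations):
    # 1. Roll out the current brain to collect a batch of trajectories
    batch = [Trajectory(*rollout(agent, env)) for _ in range(total_trajectory_rollouts)]
    # 2. Keep only the trajectories above the elite reward percentile
    elite_obs, elite_actions, _ = gather_elite_xp(batch, elitism_criterion)
    # 3. Fit the brain to imitate the elite (observation -> action) pairs
    targets = np.array([gen_action_distribution(a.item()) for a in elite_actions])
    agent.learn(elite_obs.astype("float16"), targets.astype("float16"),
                batch_size=128, epochs=1)
```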
Chapter 2: Implementing Value-Based, Policy-Based, and Actor-Critic Deep Reinforcement Learning Algorithms
This chapter provides practical recipes for building value-based, policy-based, and actor-critic (A2C) reinforcement learning agents:
- Building stochastic environments for training RL agents
- Building value-based RL agent algorithms
- Implementing temporal difference (TD) learning
- Building Monte Carlo prediction and control algorithms for RL
- Implementing the SARSA algorithm and a SARSA-based RL agent
- Building a Q-learning-based agent
- Implementing policy gradients
- Implementing the actor-critic (A2C) algorithm
Technical requirements
Python 3.6 or later
Anaconda
Building stochastic environments for training RL agents
Real-world problems are inherently stochastic, so stochastic learning environments are needed to train RL agents intended for real-world applications.
This section builds a Maze environment for training RL agents.
The Maze is a stochastic environment whose world is represented as a grid; each location on the grid is called a cell.
In this environment, the agent's goal is to find a path to the goal state.
The agent starts in the top-left corner, and black cells are walls. The agent has to find a path through the grid to the goal in the top-right corner of the maze, collecting as many coins as possible along the way while avoiding the walls.
The positions of the goal, the coins, the agent, and the walls can all be modified.
The Maze environment supports four discrete actions:
0: Move up
1: Move down
2: Move left
3: Move right
The magnitude of the reward is determined by the number of coins the agent collects before reaching the goal state.
Because the environment is stochastic, each action the agent takes has a small "slip" probability, meaning the executed action may be randomly altered.
The slip direction is clockwise (LEFT → UP, UP → RIGHT, and so on).
The probability is controlled by slip_probability=0.2.
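Concretely, the slip can be pictured with a small helper (maybe_slip is a hypothetical name used only for illustration; the real logic lives inside step(), shown later in this recipe):

```python
import numpy as np

# Clockwise slip mapping: UP -> RIGHT, DOWN -> LEFT, LEFT -> UP, RIGHT -> DOWN
slip_action_map = {0: 3, 1: 2, 2: 0, 3: 1}
slip_probability = 0.2

def maybe_slip(action):
    """With probability slip_probability, replace the action by its clockwise neighbour."""
    if np.random.rand() < slip_probability:
        return slip_action_map[action]
    return action
```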
Getting ready
import gym
import numpy as np
from typing import List
How to do it
- First, define the MazeEnv class and the map of the environment
```python
class MazeEnv(gym.Env):
    def __init__(self, stochastic=True):
        """Stochastic Maze environment with coins, obstacles/walls and a goal state.

        Actions: 0: Move Up, 1: Move Down, 2: Move Left, 3: Move Right
        Reward is based on the number of coins collected by the agent before
        reaching the goal state.
        Stochasticity in the env arises from the `slip_probability`, which alters
        the action. The slip action is the clockwise directional action
        (LEFT -> UP, UP -> RIGHT etc.).
        Example with `slip_probability=0.2`: with 0.2 probability a "RIGHT" action
        may result in "DOWN".
        """
        self.map = np.asarray(["SWFWG", "OOOOO", "WOOOW", "FOWFW"])
```
- Place the obstacles/walls at the appropriate positions in the environment
```python
        self.observation_space = gym.spaces.Discrete(1)
        self.dim = (4, 5)  # Used for plotting policy & value function
        self.img_map = np.ones(self.dim)
        self.obstacles = [(0, 1), (0, 3), (2, 0), (2, 4), (3, 2), (3, 4)]
        for x in self.obstacles:
            self.img_map[x[0]][x[1]] = 0
```
- Define the clockwise "slip" action mapping
```python
        self.slip_action_map = {
            0: 3,
            1: 2,
            2: 0,
            3: 1,
        }
```
- Define a dictionary lookup table that maps indices to cells in the Maze environment
```python
        self.index_to_coordinate_map = {
            0: (0, 0),
            1: (1, 0),
            2: (3, 0),
            3: (1, 1),
            4: (2, 1),
            5: (3, 1),
            6: (0, 2),
            7: (1, 2),
            8: (2, 2),
            9: (1, 3),
            10: (2, 3),
            11: (3, 3),
            12: (0, 4),
            13: (1, 4),
        }
```
- Define the reverse lookup table, which maps a given cell back to its index
```python
        self.coordinate_to_index_map = dict(
            (val, key) for key, val in self.index_to_coordinate_map.items()
        )
```
This completes the initialization of the environment.
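The excerpt above does not show a few attributes that the later code relies on (slip_probability, goal_pos, distinct_states, action_space, the initial state, and a reset() method). A minimal sketch of what they could look like, inferred from how they are used later and from the map defined above (the concrete values are assumptions, not taken verbatim from the original):

```python
        # Remaining initialization (a hedged sketch inferred from later usage)
        self.slip_probability = 0.2  # slip chance described in the text above (assumed here)
        self.slip = stochastic       # enable/disable stochastic slipping
        self.start_pos = (0, 0)      # "S" cell in self.map
        self.goal_pos = (0, 4)       # "G" cell in self.map
        self.action_space = gym.spaces.Discrete(4)
        # 14 reachable cells x 8 coin-collection combinations = 112 distinct states
        self.distinct_states = len(self.index_to_coordinate_map) * 8
        self.state = self.coordinate_to_index_map[self.start_pos] * 8

    def reset(self):
        """Reset the agent to the start cell with no coins collected."""
        self.state = self.coordinate_to_index_map[self.start_pos] * 8
        return self.state
```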
- Define num2coin(), which handles the coins and their collection status in the environment; 0 means a coin has not been collected and 1 means it has been collected by the agent
```python
    def num2coin(self, n: int):
        # Each element of the below tuple correspond to a status of each coin. 0 for not collected, 1 for collected.
        coinlist = [
            (0, 0, 0),
            (1, 0, 0),
            (0, 1, 0),
            (0, 0, 1),
            (1, 1, 0),
            (1, 0, 1),
            (0, 1, 1),
            (1, 1, 1),
        ]
        return list(coinlist[n])
```
- Define a quick helper that performs the reverse operation, looking up the numeric coin status/value from the coin vector
```python
    def coin2num(self, v: List):
        if sum(v) < 2:
            return np.inner(v, [1, 2, 3])
        else:
            return np.inner(v, [1, 2, 3]) + 1
```
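In the state encoding used by this environment (state = 8 * cell_index + coin_status, as the step() code below shows), num2coin() decodes the coin_status part and coin2num() re-encodes it; the two invert each other for all eight combinations. A quick check, assuming the class has been assembled as above:

```python
env = MazeEnv()
for n in range(8):
    v = env.num2coin(n)          # e.g. num2coin(5) -> [1, 0, 1]
    assert env.coin2num(v) == n  # coin2num() inverts num2coin()
```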
- Define the set_state() function to set the state of the environment. This is useful for algorithms such as value iteration, which need to visit every state of the environment in order to compute the value function
```python
    def set_state(self, state: int) -> None:
        """Set the current state of the environment. Useful for value iteration

        Args:
            state (int): A valid state in the Maze env int: [0, 112]
        """
        self.state = state
```
- Start implementing step(), applying the "slip" to the action according to slip_probability
```python
    def step(self, action, slip=True):
        """Run one step into the Maze env

        Args:
            state (Any): Current index state of the maze
            action (int): Discrete action for up, down, left, right
            slip (bool, optional): Stochasticity in the env. Defaults to True.

        Raises:
            ValueError: If invalid action is provided as input

        Returns:
            Tuple : Next state, reward, done
        """
        self.slip = slip
        if self.slip:
            if np.random.rand() < self.slip_probability:
                action = self.slip_action_map[action]
```
- Continue the step() implementation by updating the environment's state based on the action taken by the agent
```python
        cell = self.index_to_coordinate_map[int(self.state / 8)]
        if action == 0:
            c_next = cell[1]
            r_next = max(0, cell[0] - 1)
        elif action == 1:
            c_next = cell[1]
            r_next = min(self.dim[0] - 1, cell[0] + 1)
        elif action == 2:
            c_next = max(0, cell[1] - 1)
            r_next = cell[0]
        elif action == 3:
            c_next = min(self.dim[1] - 1, cell[1] + 1)
            r_next = cell[0]
        else:
            raise ValueError(f"Invalid action:{action}")
```
- Determine whether the agent has reached the goal
```python
        if (r_next == self.goal_pos[0]) and (
            c_next == self.goal_pos[1]
        ):  # Check if goal reached
            v_coin = self.num2coin(self.state % 8)
            self.state = (
                8 * self.coordinate_to_index_map[(r_next, c_next)] + self.state % 8
            )
            return (
                self.state,
                float(sum(v_coin)),
                True,
            )
```
- Handle the case where the agent's action leads to a collision with an obstacle/wall
```python
        else:
            if (r_next, c_next) in self.obstacles:  # obstacle tuple list
                return self.state, 0.0, False
```
- The final case checks whether the agent's action collects a coin
```python
            else:  # Coin locations
                v_coin = self.num2coin(self.state % 8)
                if (r_next, c_next) == (0, 2):
                    v_coin[0] = 1
                elif (r_next, c_next) == (3, 0):
                    v_coin[1] = 1
                elif (r_next, c_next) == (3, 3):
                    v_coin[2] = 1
                self.state = 8 * self.coordinate_to_index_map[
                    (r_next, c_next)
                ] + self.coin2num(v_coin)
                return (
                    self.state,
                    0.0,
                    False,
                )
```
- Implement the render() function to print a text representation of the maze state
```python
    def render(self):
        cell = self.index_to_coordinate_map[int(self.state / 8)]
        desc = self.map.tolist()
        desc[cell[0]] = (
            desc[cell[0]][: cell[1]]
            + "\x1b[1;34m"  # Blue font
            + "\x1b[4m"  # Underline
            + "\x1b[1m"  # Bold
            + "\x1b[7m"  # Reversed
            + desc[cell[0]][cell[1]]
            + "\x1b[0m"
            + desc[cell[0]][cell[1] + 1 :]
        )
        print("\n".join("".join(row) for row in desc))
```
- Test the environment
```python
if __name__ == "__main__":
    env = MazeEnv()
    obs = env.reset()
    env.render()
    done = False
    step_num = 1
    action_list = ["UP", "DOWN", "LEFT", "RIGHT"]
    # Run one episode
    while not done:
        # Sample a random action from the action space
        action = env.action_space.sample()
        next_obs, reward, done = env.step(action)
        print(
            f"step#:{step_num} action:{action_list[action]} reward:{reward} done:{done}"
        )
        step_num += 1
        env.render()
    env.close()
```
How it works
The Maze defines an observation space, an action space, and a reward mechanism, which together realize a Markov decision process (MDP):
a valid action is sampled from the environment's action space and executed in the environment, which yields a new observation, a reward, and a Boolean done flag.
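Formally, what the environment implements is a Markov decision process:

$$
\text{MDP} = (\mathcal{S}, \mathcal{A}, P, R, \gamma), \qquad
P(s' \mid s, a) = \Pr\left(S_{t+1} = s' \mid S_t = s, A_t = a\right)
$$

where $\mathcal{S}$ is the state space, $\mathcal{A}$ the action space, $R$ the reward function, and $\gamma$ the discount factor.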
Building a value-based reinforcement learning agent
A value-based agent works by learning the state-value function or the action-value function of a given environment.
This section shows how to create and update a value function for the Maze environment in order to obtain an optimal policy.
Learning a value function is particularly effective in model-free RL problems, where the environment dynamics are unknown, especially for problems with low-dimensional state spaces.
Figure: the sequence of optimal actions produced by the value-based RL algorithm.
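For reference, the two quantities learned in this recipe are the state-value function and the action-value function:

$$
V^{\pi}(s) = \mathbb{E}_{\pi}\left[\sum_{t=0}^{\infty} \gamma^{t} r_{t} \,\middle|\, s_{0} = s\right],
\qquad
Q^{\pi}(s, a) = \mathbb{E}_{\pi}\left[\sum_{t=0}^{\infty} \gamma^{t} r_{t} \,\middle|\, s_{0} = s,\, a_{0} = a\right]
$$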
Getting ready
import numpy as np
How to do it
- Import the Maze learning environment from envs.maze
from envs.maze import MazeEnv
- Create a MazeEnv instance and print the observation space and action space
```python
env = MazeEnv()
print(f"Observation space: {env.observation_space}")
print(f"Action space: {env.action_space}")
```
Observation space: Discrete(1)
Action space: Discrete(4)
- Define the state dimension and initialize state_values, q_values, and the policy
```python
state_dim = env.distinct_states
state_values = np.zeros(state_dim)
q_values = np.zeros((state_dim, env.action_space.n))  # Action values
policy = np.zeros(state_dim)
```
- Implement a function that computes the state/action value for a given environment state and action. First declare the calculate_values function; it will be completed over the following steps
```python
def calculate_values(state, action):
    """Evaluate Value function for given state and action

    Args:
        state (int): Valid (discrete) state in discrete `env.observation_space`
        action (int): Valid (discrete) action in `env.action_space`

    Returns:
        v_sum: value for given state, action
    """
```
- Generate slip_action, the random action that arises from the stochasticity of the learning environment
```python
    slip_action = env.slip_action_map[action]
```
- A crucial step in computing the value of a given state-action pair is being able to set the environment's state before executing the action, so that the resulting reward/outcome can be observed. The Maze environment provides a convenient set_state() function for setting the current state. Use set_state() and then execute the desired action in the environment
```python
    env.set_state(state)
    slip_next_state, slip_reward, _ = env.step(slip_action, slip=False)
```
- According to the Bellman equation, computing the value requires the environment's transitions. Create a transitions list and append the newly obtained transition information
```python
    transitions = []
    transitions.append((slip_reward, slip_next_state, env.slip_probability))
    env.set_state(state)
```
- Another transition can be obtained deterministically from the state and action. This is done by setting slip=False when stepping the Maze environment
```python
    env.set_state(state)
    next_state, reward, _ = env.step(action, slip=False)
    transitions.append((reward, next_state, 1 - env.slip_probability))
```
- Complete the calculate_values function by computing the value
```python
    v_sum = 0.0  # initialize the accumulator (needed for the sum below)
    for reward, next_state, pi in transitions:
        # Mathematically: v_sum = Σ π * (r + γ * V(s'))
        v_sum += pi * (reward + discount * state_values[next_state])
    return v_sum
```
- Start learning the state/action values. First, define a hyperparameter for the maximum number of iterations
iters = 1000
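The Bellman backup inside calculate_values() also uses a discount factor that this excerpt never defines; a reasonable, assumed value (not taken from the original text) would be:

```python
discount = 0.9  # discount factor gamma; assumed value, not shown in the excerpt
```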
- Implement the learning loop for state_values using value iteration
```python
# Value Iteration
for i in range(iters):
    v_s = np.zeros(state_dim)
    for state in range(state_dim):
        if env.index_to_coordinate_map[int(state / 8)] == env.goal_pos:
            continue
        v_max = float("-inf")
        for action in range(env.action_space.n):
            v_sum = calculate_values(state, action)
            v_max = max(v_max, v_sum)
        v_s[state] = v_max
    state_values = np.copy(v_s)
```
- Next, compute the action values (Q-values)
```python
# Action values
for state in range(state_dim):
    for action in range(env.action_space.n):
        q_values[state, action] = calculate_values(state, action)
```
- Extract the optimal policy greedily from the action values
```python
for state in range(state_dim):
    policy[state] = np.argmax(q_values[state, :])
```
- Save and print the Q-values and the policy with the following code
Path("results").mkdir(exist_ok=True) np.save("results/q_values", q_values) np.save("results/optimal_policy", policy) print(f"Q-values: {q_values}") print("Action mapping:[0 - UP; 1 - DOWN; 2 - LEFT; 3 - RIGHT") print("Optimal actions:") print(policy)
- Visualize the learned value function and the updated policy
```python
from value_function_utils import visualize_maze_values

visualize_maze_values(q_values, env)
```
```
Q-values: [[0.47627195 0.52919106 0.47627195 0.48215185]
 [0.61934329 0.68815921 0.61934329 0.6269895 ]
 ... (remaining rows of the 112 x 4 Q-value matrix omitted for brevity) ...
 [2.96703297 2.6384299  2.41619878 2.67032967]]
Action mapping:[0 - UP; 1 - DOWN; 2 - LEFT; 3 - RIGHT
Optimal actions:
[1. 1. 1. 1. 1. 1. 1. 1. 3. 3. 3. 3. 3. 3. 3. 3. 1. 1. 3. 1. 3. 1. 3. 3.
 3. 3. 3. 3. 3. 3. 3. 3. 3. 3. 0. 0. 3. 3. 0. 0. 2. 2. 0. 2. 0. 2. 0. 0.
 0. 1. 0. 0. 1. 1. 0. 1. 0. 3. 0. 0. 3. 3. 0. 3. 0. 3. 0. 0. 3. 0. 0. 0.
 1. 1. 2. 2. 3. 3. 3. 3. 1. 1. 1. 2. 1. 0. 0. 0. 1. 1. 1. 0. 1. 0. 0. 0.
 1. 0. 0. 0. 0. 0. 0. 0. 2. 0. 2. 0. 0. 0. 0. 0.]
```
This renders the figure showing the value-function learning and policy-update process.
How it works
The maze contains a start cell, a goal cell, and a number of coin cells, wall cells, and empty cells.
Because each coin cell can independently be collected or not, the Maze environment has 112 distinct states (14 reachable cells × 8 coin-collection combinations).
q_values is a large 112 × 4 matrix, which is printed at the end of the steps above.
Value-function learning via value iteration follows the Bellman equation, and the optimal policy is obtained by choosing, in each state, the action with the highest Q-value/action value.
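In equation form, the value-iteration update and the greedy policy extraction used in the steps above are:

$$
V_{k+1}(s) = \max_{a} \sum_{s'} P(s' \mid s, a)\left[R(s, a, s') + \gamma V_{k}(s')\right],
\qquad
\pi^{*}(s) = \arg\max_{a} Q(s, a)
$$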
In the figure, the value function is represented with colors and the policy with green arrows.
Initially, the state values are nearly uniform.
As learning progresses, states containing coins acquire more value than states without coins, and states leading to the goal acquire a very high value.
Black cells represent walls, and the arrows indicate the direction of movement given by the policy.
As learning converges, the policy becomes optimal, guiding the agent to the goal after it has collected every coin.