ray.rllib-入门实践-12：自定义多智能体强化学习

我在博客 ray.rllib-入门实践-10：自定义环境中介绍了自定义 rllib 强化学习环境的方法与使用示例，但是那个示例是针对单智能体强化学习环境的。自定义多智能体强化学习环境及其使用，与单智能体的方法步骤类似，但细节上有许多不同，相对更复杂一些。

大概的三个步骤是：

1. 定义多智能体强化学习环境

2. 向ray注册环境

3. 配置并使用环境执行训练与评估。

环境配置：

torch==2.5.1

ray==2.10.0

ray[rllib]==2.10.0

ray[tune]==2.10.0

ray[serve]==2.10.0

numpy==1.23.0

python==3.9.18

一、自定义多智能体强化学习环境

需要继承自 “ray.rllib.env.multi_agent_env.MultiAgentEnv”. reset 函数和step函数接口需要与下面保持严格一致。每一个输出都变成了 key:value 字典。

import ray 
from ray import tune 
from ray.rllib.algorithms.ppo import PPO, PPOConfig 
from ray.tune.registry import register_env
from ray.rllib.env.multi_agent_env import MultiAgentEnv 
from gymnasium import spaces
import numpy as np
from sympy import pretty_print ## 1. 定义多智能体环境
class My_Multi_Agent_Env(MultiAgentEnv):
    """Toy multi-agent environment that emits random observations.

    All agents share one observation space and one action space. Every
    reward is 0, and the episode ends once the step counter exceeds
    ``max_steps``.
    """

    def __init__(self, env_config):
        """Create the environment.

        Args:
            env_config: Mapping with an ``"agents"`` entry (list of agent IDs)
                and an optional ``"max_steps"`` entry (defaults to 10).
        """
        # RLlib asks MultiAgentEnv subclasses to call the base constructor so
        # that the bookkeeping attributes it relies on exist.
        super().__init__()
        ## List of agent IDs
        self.agents = env_config["agents"]
        # Agent IDs in the attribute the MultiAgentEnv base class reads.
        self._agent_ids = set(self.agents)
        ## Number of steps after which the episode is ended
        self.max_steps = env_config.get("max_steps", 10)
        ## Per-agent action space; all agents share the same one here
        self.action_space = spaces.Box(low=-1, high=1, shape=(3,))
        ## Per-agent observation space; all agents share the same one here
        self.observation_space = spaces.Box(low=-1, high=1, shape=(5,))
        self.step_count = 0

    def reset(self, *, seed=None, options=None):
        """Start a new episode.

        Returns:
            ``(obs, info)``: two dicts keyed by agent ID. Observations are
            sampled from the observation space; infos are empty dicts.
        """
        self.step_count = 0
        obs = {name: self.observation_space.sample() for name in self.agents}
        info = {name: {} for name in self.agents}
        return obs, info

    def step(self, action):
        """Advance the environment by one step.

        Args:
            action: Dict of actions keyed by agent ID. It is ignored by this
                toy environment.

        Returns:
            ``(obs, reward, terminated, truncated, info)``: five dicts keyed
            by agent ID. ``terminated`` and ``truncated`` also carry the
            ``"__all__"`` key that signals the end of the whole episode.
        """
        self.step_count += 1
        episode_over = self.step_count > self.max_steps
        obs, reward, terminated, truncated, info = {}, {}, {}, {}, {}
        for agent_name in self.agents:
            obs[agent_name] = self.observation_space.sample()
            reward[agent_name] = 0
            terminated[agent_name] = False
            truncated[agent_name] = False
            info[agent_name] = {}
        # Both flags are raised together so callers that check only one of
        # them still see the end of the episode.
        terminated["__all__"] = episode_over
        truncated["__all__"] = episode_over
        return obs, reward, terminated, truncated, info

    def render(self):
        """Print a placeholder message; there is no real state to draw."""
        print(f"rensering..........")

    def close(self):
        """Nothing to release."""
        pass

多智能体环境的测试：

import gymnasium as gym 
from gymnasium import spaces 
import numpy as np
from ray.rllib.env.multi_agent_env import MultiAgentEnv 
from ray.rllib.algorithms.ppo import PPO, PPOConfig
from ray.tune.registry import register_env


class My_Multi_Agent_Env(MultiAgentEnv):
    """Stand-alone test version of the toy multi-agent environment.

    Agents are named ``agent_0`` ... ``agent_{n-1}`` and share one
    observation space and one action space. Observations are random, every
    reward is 0, and the episode ends once more than 10 steps were taken.
    """

    def __init__(self, num_agents=3):
        """Create the environment.

        Args:
            num_agents: Number of agents to create.
        """
        # RLlib asks MultiAgentEnv subclasses to call the base constructor so
        # that the bookkeeping attributes it relies on exist.
        super().__init__()
        ## Number of agents
        self.num_agents = num_agents
        ## List of agent IDs
        self.agents = [f"agent_{i}" for i in range(self.num_agents)]
        # Agent IDs in the attribute the MultiAgentEnv base class reads.
        self._agent_ids = set(self.agents)
        ## Per-agent action space; all agents share the same one here
        action_dim = 3
        self.action_space = spaces.Box(low=-1, high=1, shape=(action_dim,))
        ## Per-agent observation space; all agents share the same one here
        obs_dim = 5
        self.observation_space = spaces.Box(low=-1, high=1, shape=(obs_dim,))
        ## Global state of the environment (only read by render())
        self.state = np.random.rand(self.num_agents, obs_dim)
        self.step_count = 0

    def reset(self, *, seed=None, options=None):
        """Start a new episode and return ``(obs, info)`` dicts keyed by agent ID."""
        self.step_count = 0
        obs = {name: self.observation_space.sample() for name in self.agents}
        info = {name: {} for name in self.agents}
        return obs, info

    def step(self, action):
        """Advance one step; ``action`` (dict keyed by agent ID) is ignored.

        Returns:
            ``(obs, reward, terminated, truncated, info)``: five dicts keyed
            by agent ID. ``terminated`` and ``truncated`` also carry the
            ``"__all__"`` key that signals the end of the whole episode.
        """
        self.step_count += 1
        episode_over = self.step_count > 10
        obs, reward, terminated, truncated, info = {}, {}, {}, {}, {}
        for agent_name in self.agents:
            obs[agent_name] = self.observation_space.sample()
            reward[agent_name] = 0
            terminated[agent_name] = False
            truncated[agent_name] = False
            info[agent_name] = {}
        terminated["__all__"] = episode_over
        truncated["__all__"] = episode_over
        return obs, reward, terminated, truncated, info

    def render(self):
        """Print the stored global state."""
        print(f"Current state: {self.state}")

    def close(self):
        """Nothing to release."""
        pass


## Usage example
# Roll out three episodes with randomly sampled actions.
env = My_Multi_Agent_Env(num_agents=3)
for i in range(3):
    ## Reset the environment
    obs, info = env.reset()
    print(f"reset, episode_{i}")
    while True:
        ## Sample actions; replace with a neural-network policy later
        actions = {
            agent_name: env.action_space.sample() for agent_name in env.agents
        }
        ## Execute the actions
        obs, rewards, dones, truncates, infos = env.step(actions)
        print(f"\t step = {env.step_count}")
        # An episode can end by termination or by truncation; stop on either.
        if dones["__all__"] or truncates["__all__"]:
            break

二、注册多智能体环境

与单智能体环境方法一致。

from ray.tune.registry import register_env


def env_creator(env_config):
    """Build a ``My_Multi_Agent_Env`` from the given ``env_config``."""
    return My_Multi_Agent_Env(env_config)


# Make the environment available to RLlib under a string name.
register_env("My_Multi_Agent_Env", env_creator)

三、配置并使用多智能体环境

在config中配置多智能体强化学习训练，主要需要配置两个模块：

1. 配置使用多智能体环境

通过以下代码实现：

agents = ["red", "blue"]
# Point the PPO configuration at the registered environment and pass the
# agent IDs through env_config.
config = PPOConfig().environment(
    env="My_Multi_Agent_Env",
    env_config={"agents": agents},
)

2. 给每个智能体配置一个policy

主要包含三个元素：

（1）policies: policy 字典，里面存储了多个 policy，每个 policy 都有一个名字。

（2）policy_mapping_fn 函数：根据 agent 的名字或 ID，从 policies 中为该智能体选择对应的 policy。

（3）policies_to_train列表，记录哪个智能体的policy需要进行训练。

这三个元素通过 config.multi_agent(...) 方法统一进行配置。

具体配置方式如下：

## 4. 配置多智能体策略
agents = ["red","blue"]
env_config = {"agents":agents}
env = My_Multi_Agent_Env(env_config)
obs_space = env.observation_space
action_space = env.action_space## case-1: 同构的智能体设置
# policies = {}
# policies["default_policy"] = (None, obs_space, action_space,{})  ## None 表示默认的智能体策略， 比如PPOConfig对应的就是PPOTorchPolicy
# def policy_mapping_fn(agent_name):
#     return "default_policy" ## case-2: 异构的智能体设置， 异构效果更好。
policies = {}
for agent_name in agents:policies[agent_name] = (None, obs_space, action_space,{})  ## None 表示默认的智能体策略， 比如PPOConfig对应的就是PPOTorchPolicydef policy_mapping_fn(agent_name,*args, **kwargs):return agent_name multi_agent_config = {"policies":policies,"policy_mapping_fn":policy_mapping_fn
}policies_to_train = ["red","blue"]
## 5. Configure the algorithm
agents = ["red", "blue"]
config = (
    PPOConfig()
    .environment(env="My_Multi_Agent_Env", env_config={"agents": agents})
    # The multi-agent models are configured through this method call.
    .multi_agent(
        policies=policies,
        policy_mapping_fn=policy_mapping_fn,
        policies_to_train=["red", "blue"],
    )
    # Required: without an evaluation config, algo.evaluate() cannot be used later.
    .evaluation(evaluation_num_workers=1)
)
# config.multi_agent = multi_agent_config  ## Do not do this. Training still
## runs, but the saved model can no longer be loaded: newer PPOConfig versions
## define config.multi_agent as a method, and model loading calls it as one.
## Assigning to it replaces that method, so the configuration silently fails
## and the run does not produce the intended result.
algo = config.build()

四、训练并保存多智能体

## 6. 执行训练
# Two training iterations; each call to train() returns that iteration's result.
for i in range(2):
    iter_result = algo.train()
    print(f"iter_{i}")

## 7. Save the model
import os

checkpoint_dir = "F:/codes/RLlib_study/outputs"
# Create the output directory first so saving does not depend on it existing.
os.makedirs(checkpoint_dir, exist_ok=True)
algo.save(checkpoint_dir)
print(f"save checkpoints to {checkpoint_dir}")

五、评估模型

评估方式1：

## 8. 评估模型
evaluation_result = algo.evaluate()
print("evaluated")
print(pretty_print(evaluation_result["evaluation"]))

评估方式2：可以实现与其他算法的智能体pk评估。

# Manual rollout: query one policy per agent and step the environment by hand.
env_config = {"agents": ["red", "blue"]}
env = My_Multi_Agent_Env(env_config)
obs, info = env.reset()
step, done = 0, False
print("evaluate")
while not done:
    actions = {}
    # Act only for agents that received an observation this step; the policy
    # ID equals the agent ID because of the identity policy mapping.
    for agent_id, agent_obs in obs.items():
        actions[agent_id] = algo.compute_single_action(agent_obs, policy_id=agent_id)
    ## To play against agents from another source, compute their actions here.
    ## Any agent model can be plugged in at this point.
    obs, reward, terminated, truncated, info = env.step(actions)
    # Stop on termination or truncation so the loop cannot run forever.
    done = terminated["__all__"] or truncated["__all__"]
    step += 1
    print(f"step = {step}, \n\treward = {reward}, \n\taction = {actions}, \n\tobs = {obs}")

六、汇总代码

import ray 
from ray import tune 
from ray.rllib.algorithms.ppo import PPO, PPOConfig 
from ray.tune.registry import register_env
from ray.rllib.env.multi_agent_env import MultiAgentEnv 
from gymnasium import spaces
import numpy as np
from sympy import pretty_print ## 1. 定义多智能体环境
class My_Multi_Agent_Env(MultiAgentEnv):
    """Toy multi-agent environment that emits random observations.

    All agents share one observation space and one action space. Every
    reward is 0, and the episode ends once the step counter exceeds
    ``max_steps``.
    """

    def __init__(self, env_config):
        """Create the environment.

        Args:
            env_config: Mapping with an ``"agents"`` entry (list of agent IDs)
                and an optional ``"max_steps"`` entry (defaults to 10).
        """
        # RLlib asks MultiAgentEnv subclasses to call the base constructor so
        # that the bookkeeping attributes it relies on exist.
        super().__init__()
        ## List of agent IDs
        self.agents = env_config["agents"]
        # Agent IDs in the attribute the MultiAgentEnv base class reads.
        self._agent_ids = set(self.agents)
        ## Number of steps after which the episode is ended
        self.max_steps = env_config.get("max_steps", 10)
        ## Per-agent action space; all agents share the same one here
        self.action_space = spaces.Box(low=-1, high=1, shape=(3,))
        ## Per-agent observation space; all agents share the same one here
        self.observation_space = spaces.Box(low=-1, high=1, shape=(5,))
        self.step_count = 0

    def reset(self, *, seed=None, options=None):
        """Start a new episode.

        Returns:
            ``(obs, info)``: two dicts keyed by agent ID. Observations are
            sampled from the observation space; infos are empty dicts.
        """
        self.step_count = 0
        obs = {name: self.observation_space.sample() for name in self.agents}
        info = {name: {} for name in self.agents}
        return obs, info

    def step(self, action):
        """Advance the environment by one step.

        Args:
            action: Dict of actions keyed by agent ID. It is ignored by this
                toy environment.

        Returns:
            ``(obs, reward, terminated, truncated, info)``: five dicts keyed
            by agent ID. ``terminated`` and ``truncated`` also carry the
            ``"__all__"`` key that signals the end of the whole episode.
        """
        self.step_count += 1
        episode_over = self.step_count > self.max_steps
        obs, reward, terminated, truncated, info = {}, {}, {}, {}, {}
        for agent_name in self.agents:
            obs[agent_name] = self.observation_space.sample()
            reward[agent_name] = 0
            terminated[agent_name] = False
            truncated[agent_name] = False
            info[agent_name] = {}
        # Both flags are raised together so callers that check only one of
        # them still see the end of the episode.
        terminated["__all__"] = episode_over
        truncated["__all__"] = episode_over
        return obs, reward, terminated, truncated, info

    def render(self):
        """Print a placeholder message; there is no real state to draw."""
        print(f"rensering..........")

    def close(self):
        """Nothing to release."""
        pass


## 2. Register the multi-agent environment
def env_creator(env_config):
    """Build a ``My_Multi_Agent_Env`` from the given ``env_config``."""
    return My_Multi_Agent_Env(env_config)


# Make the environment available to RLlib under a string name.
register_env("My_Multi_Agent_Env", env_creator)

## 3. Initialize ray
ray.init()

## 4. Configure the multi-agent policies
agents = ["red", "blue"]
env_config = {"agents": agents}
# A throw-away env instance, used only to read the spaces.
env = My_Multi_Agent_Env(env_config)
obs_space = env.observation_space
action_space = env.action_space

## case-1: homogeneous agents (all agents share one policy)
# policies = {}
# policies["default_policy"] = (None, obs_space, action_space, {})  ## None selects the default policy class, e.g. PPOTorchPolicy for PPOConfig
# def policy_mapping_fn(agent_name, *args, **kwargs):
#     return "default_policy"

## case-2: heterogeneous agents (one policy per agent); the author reports
## that this setup works better.
## None selects the default policy class, e.g. PPOTorchPolicy for PPOConfig.
policies = {
    agent_name: (None, obs_space, action_space, {}) for agent_name in agents
}


def policy_mapping_fn(agent_name, *args, **kwargs):
    """Map each agent to the policy that carries the same name."""
    return agent_name


multi_agent_config = {
    "policies": policies,
    "policy_mapping_fn": policy_mapping_fn,
}
policies_to_train = ["red", "blue"]
## 5. Configure the algorithm
agents = ["red", "blue"]
# config.multi_agent = multi_agent_config  ## Do not do this. Training still
## runs, but the saved model can no longer be loaded: newer PPOConfig versions
## define config.multi_agent as a method, and model loading calls it as one.
## Assigning to it replaces that method, so the configuration silently fails
## and the run does not produce the intended result.
config = (
    PPOConfig()
    .environment(env="My_Multi_Agent_Env", env_config={"agents": agents})
    .multi_agent(
        policies=policies,
        policy_mapping_fn=policy_mapping_fn,
        policies_to_train=["red", "blue"],
    )
    # Required: without an evaluation config, algo.evaluate() cannot be used later.
    .evaluation(evaluation_num_workers=1)
)
algo = config.build()

## 6. Run training
# Two training iterations; each call to train() returns that iteration's result.
for i in range(2):
    iter_result = algo.train()
    print(f"iter_{i}")

## 7. Save the model
import os

checkpoint_dir = "F:/codes/RLlib_study/outputs"
# Create the output directory first so saving does not depend on it existing.
os.makedirs(checkpoint_dir, exist_ok=True)
algo.save(checkpoint_dir)
print(f"save checkpoints to {checkpoint_dir}")

## 8. Evaluate the model
## Method 1: use the algorithm's built-in evaluation
evaluation_result = algo.evaluate()
print("evaluated")
# Show only the "evaluation" section of the returned result.
evaluation_metrics = evaluation_result["evaluation"]
print(evaluation_metrics)

## Method 2: manual rollout
# Manual rollout: query one policy per agent and step the environment by hand.
env_config = {"agents": ["red", "blue"]}
env = My_Multi_Agent_Env(env_config)
obs, info = env.reset()
step, done = 0, False
print("evaluate")
while not done:
    actions = {}
    # Act only for agents that received an observation this step; the policy
    # ID equals the agent ID because of the identity policy mapping.
    for agent_id, agent_obs in obs.items():
        actions[agent_id] = algo.compute_single_action(agent_obs, policy_id=agent_id)
    ## To play against agents from another source, compute their actions here.
    ## Any agent model can be plugged in at this point.
    obs, reward, terminated, truncated, info = env.step(actions)
    # Stop on termination or truncation so the loop cannot run forever.
    done = terminated["__all__"] or truncated["__all__"]
    step += 1
    print(f"step = {step}, \n\treward = {reward}, \n\taction = {actions}, \n\tobs = {obs}")