I. Usage and testing
- What we run here is isaacgymenvs, Isaac Gym's official reinforcement-learning environment code.
Download link: https://zhuanlan.zhihu.com/p/671309384
1. Commands and results
- Training command
python train.py task=Cartpole #headless=True
This runs the cart-pole (inverted pendulum) task; it converges after running for a little while. Setting headless=True disables the graphical output.
- Test command
python train.py task=Cartpole test=True checkpoint=./runs/Cartpole-21-22-15/nn/Cartpole.pth num_envs=2
test enables test mode, num_envs sets the number of environments, and checkpoint is the path to the trained model.
II. train.py walkthrough
1. The hydra module
@hydra.main(version_base="1.1", config_name="config", config_path="./cfg")
- This module reads the configuration from yaml files via a decorator. It loads ./cfg/config.yaml and also picks up options given on the command line; command-line options override the values in the yaml file. A minimal sketch of the pattern follows.
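- For reference, a self-contained sketch of the same pattern (a hypothetical my_app.py with a ./conf directory, not part of isaacgymenvs); overrides such as `python my_app.py seed=7` take precedence over the yaml values:
```python
import hydra
from omegaconf import DictConfig, OmegaConf

@hydra.main(version_base="1.1", config_name="config", config_path="./conf")
def main(cfg: DictConfig):
    # cfg is ./conf/config.yaml merged with any key=value overrides from the command line
    print(OmegaConf.to_yaml(cfg))

if __name__ == "__main__":
    main()
```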
2. Basic parameter-dictionary setup
```python
# ensure checkpoints can be specified as relative paths
if cfg.checkpoint:
    cfg.checkpoint = to_absolute_path(cfg.checkpoint)

cfg_dict = omegaconf_to_dict(cfg)
print_dict("config dict:", cfg_dict)

# set numpy formatting for printing only
set_np_formatting()

# global rank of the GPU
global_rank = int(os.getenv("RANK", "0"))

# sets seed. if seed is -1 will pick a random one
cfg.seed = set_seed(cfg.seed, torch_deterministic=cfg.torch_deterministic, rank=global_rank)
```
- This block just makes a few adjustments on top of cfg: an absolute checkpoint path, numpy print formatting, the process rank, and the random seed.
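- omegaconf_to_dict (an isaacgymenvs helper) simply converts the OmegaConf object into a plain nested Python dict; a rough equivalent using OmegaConf's own API:
```python
from omegaconf import OmegaConf

cfg = OmegaConf.create({"task": {"name": "Cartpole", "env": {"numEnvs": 512}}, "seed": 42})
cfg_dict = OmegaConf.to_container(cfg, resolve=True)        # plain nested dict, interpolations resolved
print(type(cfg_dict), cfg_dict["task"]["env"]["numEnvs"])   # <class 'dict'> 512
```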
3. Registering the environment
```python
def create_isaacgym_env(**kwargs):
    # create the Isaac Gym environment
    envs = isaacgymenvs.make(
        cfg.seed,
        cfg.task_name,
        cfg.task.env.numEnvs,
        cfg.sim_device,
        cfg.rl_device,
        cfg.graphics_device_id,
        cfg.headless,
        cfg.multi_gpu,
        cfg.capture_video,
        cfg.force_render,
        cfg,
        **kwargs,
    )
    if cfg.capture_video:
        envs.is_vector_env = True
        envs = gym.wrappers.RecordVideo(
            envs,
            f"videos/{run_name}",
            step_trigger=lambda step: step % cfg.capture_video_freq == 0,
            video_length=cfg.capture_video_len,
        )
    return envs

# Register an environment named 'rlgpu'. This name is later set as env_name in CartpolePPO.yaml.
env_configurations.register('rlgpu', {
    'vecenv_type': 'RLGPU1',   # name of the vectorized-env type; must match the RLGPU1 registered below
    'env_creator': lambda **kwargs: create_isaacgym_env(**kwargs),
})

# Below, the vectorized env type RLGPU1 is registered; RLGPU1 is a class that manages many parallel environments.
# Look up the task's environment class from the task map defined in the package __init__.
ige_env_cls = isaacgym_task_map[cfg.task_name]   # this yields the environment class for the task
# ternary expression: does this task use dict observations?
dict_cls = ige_env_cls.dict_obs_cls if hasattr(ige_env_cls, 'dict_obs_cls') and ige_env_cls.dict_obs_cls else False

if dict_cls:
    obs_spec = {}
    actor_net_cfg = cfg.train.params.network
    obs_spec['obs'] = {'names': list(actor_net_cfg.inputs.keys()),
                       'concat': not actor_net_cfg.name == "complex_net",
                       'space_name': 'observation_space'}
    if "central_value_config" in cfg.train.params.config:
        critic_net_cfg = cfg.train.params.config.central_value_config.network
        obs_spec['states'] = {'names': list(critic_net_cfg.inputs.keys()),
                              'concat': not critic_net_cfg.name == "complex_net",
                              'space_name': 'state_space'}

    vecenv.register('RLGPU1',
                    lambda config_name, num_actors, **kwargs: ComplexObsRLGPUEnv(config_name, num_actors, obs_spec, **kwargs))
else:
    vecenv.register('RLGPU1',
                    lambda config_name, num_actors, **kwargs: RLGPUEnv(config_name, num_actors, **kwargs))
```
- This configures 'rlgpu': how its parallel environments are managed (the vecenv type) and the function that creates the environment. The name 'rlgpu' is the env_name referenced in the RL task's training yaml.
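- The same two-step registration pattern in isolation (a toy sketch: DummyEnv and the name 'my_env' are made up, and I assume rl_games' built-in 'RAY' vecenv type; env_configurations.register / vecenv.register are the same calls used above):
```python
import gym
import numpy as np
from rl_games.common import env_configurations, vecenv

class DummyEnv(gym.Env):
    """Stand-in for a single environment."""
    observation_space = gym.spaces.Box(-np.inf, np.inf, (4,))
    action_space = gym.spaces.Box(-1.0, 1.0, (1,))

# step 1: tell rl_games how to build the raw env and which vecenv type manages copies of it
env_configurations.register('my_env', {
    'vecenv_type': 'RAY',                          # assumed built-in Ray-based vec env
    'env_creator': lambda **kwargs: DummyEnv(),
})

# step 2 (what train.py does with 'RLGPU1'): register a custom vecenv type, e.g.
# vecenv.register('MY_VEC', lambda config_name, num_actors, **kwargs: MyVecEnv(config_name, num_actors, **kwargs))
```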
4. Completing the config dictionary
```python
print(cfg)
print('11111111111111111111111111111111111')
rlg_config_dict = omegaconf_to_dict(cfg.train)   # pull out the training-related parameters
print(rlg_config_dict)
rlg_config_dict = preprocess_train_config(cfg, rlg_config_dict)
print('11111111111111111111111111111111111')
print(rlg_config_dict)
```
```python
def preprocess_train_config(cfg, config_dict):
    """
    Adding common configuration parameters to the rl_games train config
    (the added parameters are the four assignments below).
    An alternative to this is inferring them in task-specific .yaml files, but that requires repeating the same
    variable interpolations in each config.
    """
    train_cfg = config_dict['params']['config']

    train_cfg['device'] = cfg.rl_device
    train_cfg['population_based_training'] = cfg.pbt.enabled
    train_cfg['pbt_idx'] = cfg.pbt.policy_idx if cfg.pbt.enabled else None
    train_cfg['full_experiment_name'] = cfg.get('full_experiment_name')

    # print(f'Using rl_device: {cfg.rl_device}')
    # print(f'Using sim_device: {cfg.sim_device}')
    # print(train_cfg)

    try:
        # check whether the MLP unit counts need to be scaled
        model_size_multiplier = config_dict['params']['network']['mlp']['model_size_multiplier']
        if model_size_multiplier != 1:
            units = config_dict['params']['network']['mlp']['units']
            for i, u in enumerate(units):
                units[i] = u * model_size_multiplier
            print(f'Modified MLP units by x{model_size_multiplier} to {config_dict["params"]["network"]["mlp"]["units"]}')
    except KeyError:
        pass

    return config_dict
```
- As the printed logs show, what actually gets added is just the handful of keys assigned in this helper (device, population_based_training, pbt_idx, full_experiment_name), presumably for convenience; the try-block additionally scales the MLP layer sizes when model_size_multiplier is set.
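- A quick illustration of that scaling branch with made-up numbers:
```python
# Hypothetical network config: with model_size_multiplier = 2 the hidden layer
# sizes are scaled in place, mirroring the try-block above.
config_dict = {'params': {'network': {'mlp': {'units': [256, 128, 64],
                                              'model_size_multiplier': 2}}}}

mlp = config_dict['params']['network']['mlp']
for i, u in enumerate(mlp['units']):
    mlp['units'][i] = u * mlp['model_size_multiplier']
print(mlp['units'])   # [512, 256, 128]
```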
5. Defining observers
```python
observers = [RLGPUAlgoObserver()]

if cfg.pbt.enabled:
    pbt_observer = PbtAlgoObserver(cfg)
    observers.append(pbt_observer)

if cfg.wandb_activate:
    cfg.seed += global_rank
    if global_rank == 0:
        # initialize wandb only once per multi-gpu run
        wandb_observer = WandbAlgoObserver(cfg)
        observers.append(wandb_observer)

# register new AMP network builder and agent
def build_runner(algo_observer):
    runner = Runner(algo_observer)
    # The algorithms registered here are not used for Cartpole; they are prepared for other (AMP) tasks.
    runner.algo_factory.register_builder('amp_continuous', lambda **kwargs: amp_continuous.AMPAgent(**kwargs))
    runner.player_factory.register_builder('amp_continuous', lambda **kwargs: amp_players.AMPPlayerContinuous(**kwargs))
    model_builder.register_model('continuous_amp', lambda network, **kwargs: amp_models.ModelAMPContinuous(network))
    model_builder.register_network('amp', lambda **kwargs: amp_network_builder.AMPBuilder())
    return runner
```
- Up to three observers are added (RLGPU, plus PBT and wandb when enabled); they are bundled and handed to the runner below.
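- An observer is just a hook object that the training code calls at fixed points (the rollout code later calls process_infos and after_steps on it). A minimal custom observer, assuming the AlgoObserver base class from rl_games.common.algo_observer:
```python
from rl_games.common.algo_observer import AlgoObserver

class PrintInfosObserver(AlgoObserver):
    """Hypothetical observer that just prints episode infos as they arrive."""

    def after_init(self, algo):
        self.algo = algo   # keep a handle on the agent

    def process_infos(self, infos, done_indices):
        # called every env step from play_steps()
        if isinstance(infos, dict) and 'episode' in infos:
            print('episode info:', infos['episode'])

    def after_steps(self):
        # called once per epoch, right after the rollout has been collected
        pass

# it would simply be appended to the list above: observers.append(PrintInfosObserver())
```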
6. Configuring the runner
```python
runner = build_runner(MultiObserver(observers))
runner.load(rlg_config_dict)   # load the full config
runner.reset()                 # reset the environment

# dump config dict: this is the config.yaml saved in each run's log directory
if not cfg.test:
    experiment_dir = os.path.join('runs', cfg.train.params.config.name +
                                  '_{date:%d-%H-%M-%S}'.format(date=datetime.now()))
    os.makedirs(experiment_dir, exist_ok=True)
    with open(os.path.join(experiment_dir, 'config.yaml'), 'w') as f:
        f.write(OmegaConf.to_yaml(cfg))

print("entering the run loop")
print(cfg)
runner.run({
    'train': not cfg.test,
    'play': cfg.test,
    'checkpoint': cfg.checkpoint,
    'sigma': cfg.sigma if cfg.sigma != '' else None
})
print("left the run loop")
```
III. torch_runner.py walkthrough
- Runner is a complete class that lives in the rl_games package.
1. Initialization
- algo_factory essentially maintains a dictionary mapping algorithm names to builder functions (player_factory does the same for the inference-time players).
```python
def __init__(self, algo_observer=None):
    self.algo_factory = object_factory.ObjectFactory()
    self.algo_factory.register_builder('a2c_continuous', lambda **kwargs: a2c_continuous.A2CAgent(**kwargs))
    self.algo_factory.register_builder('a2c_discrete', lambda **kwargs: a2c_discrete.DiscreteA2CAgent(**kwargs))
    self.algo_factory.register_builder('sac', lambda **kwargs: sac_agent.SACAgent(**kwargs))
    #self.algo_factory.register_builder('dqn', lambda **kwargs : dqnagent.DQNAgent(**kwargs))

    self.player_factory = object_factory.ObjectFactory()
    self.player_factory.register_builder('a2c_continuous', lambda **kwargs: players.PpoPlayerContinuous(**kwargs))
    self.player_factory.register_builder('a2c_discrete', lambda **kwargs: players.PpoPlayerDiscrete(**kwargs))
    self.player_factory.register_builder('sac', lambda **kwargs: players.SACPlayer(**kwargs))
    #self.player_factory.register_builder('dqn', lambda **kwargs : players.DQNPlayer(**kwargs))

    self.algo_observer = algo_observer if algo_observer else DefaultAlgoObserver()
    torch.backends.cudnn.benchmark = True
    ### it didnot help for lots for openai gym envs anyway :(
    #torch.backends.cudnn.deterministic = True
    #torch.use_deterministic_algorithms(True)
```
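- The factory itself is just a name-to-builder registry. A sketch of how it is used (register_builder/create are the same calls that appear in this file; SomeAgent is a made-up class):
```python
from rl_games.common import object_factory

class SomeAgent:
    def __init__(self, base_name=None, params=None):
        self.base_name, self.params = base_name, params

factory = object_factory.ObjectFactory()
factory.register_builder('some_algo', lambda **kwargs: SomeAgent(**kwargs))

# run_train() below does essentially this, with self.algo_name taken from the
# training yaml (e.g. 'a2c_continuous' for Cartpole PPO):
agent = factory.create('some_algo', base_name='run', params={'lr': 3e-4})
```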
2. run(self, args)
- This function simply picks training or test (play) mode based on the arguments.
```python
def run(self, args):
    if args['train']:
        self.run_train(args)
    elif args['play']:
        self.run_play(args)
    else:
        self.run_train(args)
```
3. run_train(self, args)
- Training mode creates the agent according to the algorithm type,
- then calls the agent's train() method.
```python
def run_train(self, args):
    print('Started to train')
    print('aaaaaaaaaaaaaaaaaaaaaaaaaaaa')
    print(self.algo_name)
    agent = self.algo_factory.create(self.algo_name, base_name='run', params=self.params)
    _restore(agent, args)
    _override_sigma(agent, args)
    print('1111112222333')
    agent.train()
```
IV. a2c_continuous.py walkthrough
- This file contains a complete class, A2CAgent(a2c_common.ContinuousA2CBase).
- It derives from a2c_common.ContinuousA2CBase and overrides several of the base class's virtual methods, including calc_gradients (gradient computation) and reg_loss (regularization). These overrides are called from the train() function.
- The train() invoked in the previous section is the base class's method.
```python
def calc_gradients(self, input_dict):
    value_preds_batch = input_dict['old_values']
    old_action_log_probs_batch = input_dict['old_logp_actions']
    advantage = input_dict['advantages']
    old_mu_batch = input_dict['mu']
    old_sigma_batch = input_dict['sigma']
    return_batch = input_dict['returns']
    actions_batch = input_dict['actions']
    obs_batch = input_dict['obs']
    obs_batch = self._preproc_obs(obs_batch)

    lr_mul = 1.0
    curr_e_clip = self.e_clip

    batch_dict = {
        'is_train': True,
        'prev_actions': actions_batch,
        'obs': obs_batch,
    }

    rnn_masks = None
    if self.is_rnn:
        rnn_masks = input_dict['rnn_masks']
        batch_dict['rnn_states'] = input_dict['rnn_states']
        batch_dict['seq_length'] = self.seq_length

        if self.zero_rnn_on_done:
            batch_dict['dones'] = input_dict['dones']

    with torch.cuda.amp.autocast(enabled=self.mixed_precision):
        # 1. run the model and unpack its outputs
        res_dict = self.model(batch_dict)
        action_log_probs = res_dict['prev_neglogp']
        values = res_dict['values']
        entropy = res_dict['entropy']
        mu = res_dict['mus']
        sigma = res_dict['sigmas']

        # 2. compute the losses: a_loss is the actor loss, c_loss the critic loss,
        #    entropy the entropy term, b_loss the bound loss
        a_loss = self.actor_loss_func(old_action_log_probs_batch, action_log_probs, advantage, self.ppo, curr_e_clip)

        if self.has_value_loss:
            c_loss = common_losses.critic_loss(self.model, value_preds_batch, values, curr_e_clip, return_batch, self.clip_value)
        else:
            c_loss = torch.zeros(1, device=self.ppo_device)
        if self.bound_loss_type == 'regularisation':
            b_loss = self.reg_loss(mu)
        elif self.bound_loss_type == 'bound':
            b_loss = self.bound_loss(mu)
        else:
            b_loss = torch.zeros(1, device=self.ppo_device)
        losses, sum_mask = torch_ext.apply_masks([a_loss.unsqueeze(1), c_loss, entropy.unsqueeze(1), b_loss.unsqueeze(1)], rnn_masks)
        a_loss, c_loss, entropy, b_loss = losses[0], losses[1], losses[2], losses[3]

        loss = a_loss + 0.5 * c_loss * self.critic_coef - entropy * self.entropy_coef + b_loss * self.bounds_loss_coef

        if self.multi_gpu:
            self.optimizer.zero_grad()
        else:
            for param in self.model.parameters():
                param.grad = None

    self.scaler.scale(loss).backward()
    #TODO: Refactor this ugliest code of they year
    self.trancate_gradients_and_step()

    with torch.no_grad():
        reduce_kl = rnn_masks is None
        kl_dist = torch_ext.policy_kl(mu.detach(), sigma.detach(), old_mu_batch, old_sigma_batch, reduce_kl)
        if rnn_masks is not None:
            kl_dist = (kl_dist * rnn_masks).sum() / rnn_masks.numel()  #/ sum_mask

    self.diagnostics.mini_batch(self,
        {
            'values': value_preds_batch,
            'returns': return_batch,
            'new_neglogp': action_log_probs,
            'old_neglogp': old_action_log_probs_batch,
            'masks': rnn_masks
        }, curr_e_clip, 0)

    self.train_result = (a_loss, c_loss, entropy,
                         kl_dist, self.last_lr, lr_mul,
                         mu.detach(), sigma.detach(), b_loss)
```
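- For reference, a minimal sketch of what the clipped-surrogate actor loss computes (working with negative log-probs, as the prev_neglogp name above suggests). This is a generic PPO illustration, not the exact rl_games implementation:
```python
import torch

def ppo_actor_loss(old_neglogp, new_neglogp, advantage, e_clip=0.2):
    # ratio = pi_new(a|s) / pi_old(a|s), computed from negative log-probabilities
    ratio = torch.exp(old_neglogp - new_neglogp)
    surr1 = advantage * ratio
    surr2 = advantage * torch.clamp(ratio, 1.0 - e_clip, 1.0 + e_clip)
    # maximize the clipped surrogate, i.e. minimize its negative
    return torch.max(-surr1, -surr2).mean()

# the total loss above then combines it as:
# loss = a_loss + 0.5 * c_loss * critic_coef - entropy * entropy_coef + b_loss * bounds_loss_coef
```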
V. a2c_common.py walkthrough
- This file contains the A2CBase base class and the continuous and discrete A2C classes derived from it.
1. train()
- The body of train() in ContinuousA2CBase(A2CBase) is a loop that calls self.train_epoch() to run the simulation, collect data, and update the model.
```python
def train(self):
    print('zzzzzzzzzzzz')
    self.init_tensors()
    self.last_mean_rewards = -100500
    start_time = time.time()
    total_time = 0
    rep_count = 0
    self.obs = self.env_reset()
    self.curr_frames = self.batch_size_envs

    if self.multi_gpu:
        print("====================broadcasting parameters")
        model_params = [self.model.state_dict()]
        dist.broadcast_object_list(model_params, 0)
        self.model.load_state_dict(model_params[0])

    while True:
        epoch_num = self.update_epoch()
        # the key call: one epoch of rollout + optimization
        step_time, play_time, update_time, sum_time, a_losses, c_losses, b_losses, entropies, kls, last_lr, lr_mul = self.train_epoch()
        total_time += sum_time
        frame = self.frame // self.num_agents

        # cleaning memory to optimize space
        self.dataset.update_values_dict(None)
        should_exit = False

        if self.global_rank == 0:
            self.diagnostics.epoch(self, current_epoch=epoch_num)
            # do we need scaled_time?
            scaled_time = self.num_agents * sum_time
            scaled_play_time = self.num_agents * play_time
            curr_frames = self.curr_frames * self.world_size if self.multi_gpu else self.curr_frames
            self.frame += curr_frames

            print_statistics(self.print_stats, curr_frames, step_time, scaled_play_time, scaled_time,
                             epoch_num, self.max_epochs, frame, self.max_frames)

            self.write_stats(total_time, epoch_num, step_time, play_time, update_time,
                             a_losses, c_losses, entropies, kls, last_lr, lr_mul, frame,
                             scaled_time, scaled_play_time, curr_frames)

            if len(b_losses) > 0:
                self.writer.add_scalar('losses/bounds_loss', torch_ext.mean_list(b_losses).item(), frame)

            if self.has_soft_aug:
                self.writer.add_scalar('losses/aug_loss', np.mean(aug_losses), frame)

            if self.game_rewards.current_size > 0:
                mean_rewards = self.game_rewards.get_mean()
                mean_shaped_rewards = self.game_shaped_rewards.get_mean()
                mean_lengths = self.game_lengths.get_mean()
                self.mean_rewards = mean_rewards[0]

                for i in range(self.value_size):
                    rewards_name = 'rewards' if i == 0 else 'rewards{0}'.format(i)
                    self.writer.add_scalar(rewards_name + '/step'.format(i), mean_rewards[i], frame)
                    self.writer.add_scalar(rewards_name + '/iter'.format(i), mean_rewards[i], epoch_num)
                    self.writer.add_scalar(rewards_name + '/time'.format(i), mean_rewards[i], total_time)
                    self.writer.add_scalar('shaped_' + rewards_name + '/step'.format(i), mean_shaped_rewards[i], frame)
                    self.writer.add_scalar('shaped_' + rewards_name + '/iter'.format(i), mean_shaped_rewards[i], epoch_num)
                    self.writer.add_scalar('shaped_' + rewards_name + '/time'.format(i), mean_shaped_rewards[i], total_time)

                self.writer.add_scalar('episode_lengths/step', mean_lengths, frame)
                self.writer.add_scalar('episode_lengths/iter', mean_lengths, epoch_num)
                self.writer.add_scalar('episode_lengths/time', mean_lengths, total_time)

                if self.has_self_play_config:
                    self.self_play_manager.update(self)

                checkpoint_name = self.config['name'] + '_ep_' + str(epoch_num) + '_rew_' + str(mean_rewards[0])

                if self.save_freq > 0:
                    if epoch_num % self.save_freq == 0:
                        self.save(os.path.join(self.nn_dir, 'last_' + checkpoint_name))

                if mean_rewards[0] > self.last_mean_rewards and epoch_num >= self.save_best_after:
                    print('saving next best rewards: ', mean_rewards)
                    self.last_mean_rewards = mean_rewards[0]
                    self.save(os.path.join(self.nn_dir, self.config['name']))

                    if 'score_to_win' in self.config:
                        if self.last_mean_rewards > self.config['score_to_win']:
                            print('Maximum reward achieved. Network won!')
                            self.save(os.path.join(self.nn_dir, checkpoint_name))
                            should_exit = True

            if epoch_num >= self.max_epochs and self.max_epochs != -1:
                if self.game_rewards.current_size == 0:
                    print('WARNING: Max epochs reached before any env terminated at least once')
                    mean_rewards = -np.inf
                self.save(os.path.join(self.nn_dir, 'last_' + self.config['name'] + '_ep_' + str(epoch_num)
                                       + '_rew_' + str(mean_rewards).replace('[', '_').replace(']', '_')))
                print('MAX EPOCHS NUM!')
                should_exit = True

            if self.frame >= self.max_frames and self.max_frames != -1:
                if self.game_rewards.current_size == 0:
                    print('WARNING: Max frames reached before any env terminated at least once')
                    mean_rewards = -np.inf
                self.save(os.path.join(self.nn_dir, 'last_' + self.config['name'] + '_frame_' + str(self.frame)
                                       + '_rew_' + str(mean_rewards).replace('[', '_').replace(']', '_')))
                print('MAX FRAMES NUM!')
                should_exit = True

            update_time = 0

        if self.multi_gpu:
            should_exit_t = torch.tensor(should_exit, device=self.device).float()
            dist.broadcast(should_exit_t, 0)
            should_exit = should_exit_t.float().item()
        if should_exit:
            return self.last_mean_rewards, epoch_num

        if should_exit:
            return self.last_mean_rewards, epoch_num
```
2. self.train_epoch()
- This function runs the rollout and then the gradient updates.
- self.train_actor_critic(self.dataset[i]) computes the policy and value gradients (it ends up in calc_gradients above).
- batch_dict = self.play_steps() collects the rollout data from the simulation.
```python
def train_epoch(self):
    super().train_epoch()

    self.set_eval()
    play_time_start = time.time()
    with torch.no_grad():
        if self.is_rnn:
            batch_dict = self.play_steps_rnn()
        else:
            batch_dict = self.play_steps()

    self.set_train()

    play_time_end = time.time()
    update_time_start = time.time()
    rnn_masks = batch_dict.get('rnn_masks', None)

    self.curr_frames = batch_dict.pop('played_frames')
    self.prepare_dataset(batch_dict)
    self.algo_observer.after_steps()
    a_losses = []
    c_losses = []
    entropies = []
    kls = []

    if self.has_central_value:
        self.train_central_value()

    for mini_ep in range(0, self.mini_epochs_num):
        ep_kls = []
        for i in range(len(self.dataset)):
            a_loss, c_loss, entropy, kl, last_lr, lr_mul = self.train_actor_critic(self.dataset[i])
            a_losses.append(a_loss)
            c_losses.append(c_loss)
            ep_kls.append(kl)
            entropies.append(entropy)

        av_kls = torch_ext.mean_list(ep_kls)
        if self.multi_gpu:
            dist.all_reduce(av_kls, op=dist.ReduceOp.SUM)
            av_kls /= self.world_size

        self.last_lr, self.entropy_coef = self.scheduler.update(self.last_lr, self.entropy_coef, self.epoch_num, 0, av_kls.item())
        self.update_lr(self.last_lr)

        kls.append(av_kls)
        self.diagnostics.mini_epoch(self, mini_ep)
        if self.normalize_input:
            self.model.running_mean_std.eval()  # don't need to update statstics more than one miniepoch

    update_time_end = time.time()
    play_time = play_time_end - play_time_start
    update_time = update_time_end - update_time_start
    total_time = update_time_end - play_time_start

    return batch_dict['step_time'], play_time, update_time, total_time, a_losses, c_losses, entropies, kls, last_lr, lr_mul
```
3. play_steps(self)
- This function computes actions and collects states and rewards; it is again a loop (over horizon_length steps).
- Compute actions: res_dict = self.get_action_values(self.obs)
- Step the simulation: self.obs, rewards, self.dones, infos = self.env_step(res_dict['actions'])
```python
def play_steps(self):
    update_list = self.update_list

    step_time = 0.0

    for n in range(self.horizon_length):
        if self.use_action_masks:
            masks = self.vec_env.get_action_masks()
            res_dict = self.get_masked_action_values(self.obs, masks)
        else:
            res_dict = self.get_action_values(self.obs)

        self.experience_buffer.update_data('obses', n, self.obs['obs'])
        self.experience_buffer.update_data('dones', n, self.dones)

        for k in update_list:
            self.experience_buffer.update_data(k, n, res_dict[k])
        if self.has_central_value:
            self.experience_buffer.update_data('states', n, self.obs['states'])

        step_time_start = time.time()
        self.obs, rewards, self.dones, infos = self.env_step(res_dict['actions'])  # step the simulation
        step_time_end = time.time()

        step_time += (step_time_end - step_time_start)

        shaped_rewards = self.rewards_shaper(rewards)
        if self.value_bootstrap and 'time_outs' in infos:
            shaped_rewards += self.gamma * res_dict['values'] * self.cast_obs(infos['time_outs']).unsqueeze(1).float()

        self.experience_buffer.update_data('rewards', n, shaped_rewards)

        self.current_rewards += rewards
        self.current_shaped_rewards += shaped_rewards
        self.current_lengths += 1
        all_done_indices = self.dones.nonzero(as_tuple=False)
        env_done_indices = all_done_indices[::self.num_agents]

        self.game_rewards.update(self.current_rewards[env_done_indices])
        self.game_shaped_rewards.update(self.current_shaped_rewards[env_done_indices])
        self.game_lengths.update(self.current_lengths[env_done_indices])
        self.algo_observer.process_infos(infos, env_done_indices)

        not_dones = 1.0 - self.dones.float()

        self.current_rewards = self.current_rewards * not_dones.unsqueeze(1)
        self.current_shaped_rewards = self.current_shaped_rewards * not_dones.unsqueeze(1)
        self.current_lengths = self.current_lengths * not_dones

    last_values = self.get_values(self.obs)

    fdones = self.dones.float()
    mb_fdones = self.experience_buffer.tensor_dict['dones'].float()
    mb_values = self.experience_buffer.tensor_dict['values']
    mb_rewards = self.experience_buffer.tensor_dict['rewards']
    mb_advs = self.discount_values(fdones, last_values, mb_fdones, mb_values, mb_rewards)
    mb_returns = mb_advs + mb_values

    batch_dict = self.experience_buffer.get_transformed_list(swap_and_flatten01, self.tensor_list)
    batch_dict['returns'] = swap_and_flatten01(mb_returns)
    batch_dict['played_frames'] = self.batch_size
    batch_dict['step_time'] = step_time

    return batch_dict
```
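- The advantage computation at the end (self.discount_values) is GAE(λ). A standalone sketch of the recursion, assuming tensors shaped [horizon, num_envs] and generic gamma/tau parameters (not the exact rl_games code):
```python
import torch

def gae_advantages(rewards, values, dones, last_values, last_dones, gamma=0.99, tau=0.95):
    """rewards/values/dones: [horizon, num_envs]; last_values/last_dones: [num_envs]."""
    horizon = rewards.shape[0]
    advs = torch.zeros_like(rewards)
    lastgaelam = torch.zeros_like(last_values)
    for t in reversed(range(horizon)):
        if t == horizon - 1:
            next_nonterminal = 1.0 - last_dones
            next_values = last_values
        else:
            next_nonterminal = 1.0 - dones[t + 1]
            next_values = values[t + 1]
        # TD error, then the exponentially weighted GAE recursion
        delta = rewards[t] + gamma * next_values * next_nonterminal - values[t]
        lastgaelam = delta + gamma * tau * next_nonterminal * lastgaelam
        advs[t] = lastgaelam
    return advs   # returns = advs + values, exactly as mb_returns is formed above
```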
4. env_step(self, actions)
- The key call is self.vec_env.step(actions), which advances the simulation.
```python
def env_step(self, actions):
    actions = self.preprocess_actions(actions)                # preprocess the actions
    obs, rewards, dones, infos = self.vec_env.step(actions)   # advance the environments

    if self.is_tensor_obses:
        if self.value_size == 1:
            rewards = rewards.unsqueeze(1)
        return self.obs_to_tensors(obs), rewards.to(self.ppo_device), dones.to(self.ppo_device), infos
    else:
        if self.value_size == 1:
            rewards = np.expand_dims(rewards, axis=1)
        return self.obs_to_tensors(obs), torch.from_numpy(rewards).to(self.ppo_device).float(), torch.from_numpy(dones).to(self.ppo_device), infos
```
VI. cartpole.py walkthrough
- vec_env is created by the create_vec_env function. From the code below you can see that the environment is looked up through the vecenv_config dictionary; that dictionary was filled by the register calls back in train.py (the first file), where the Cartpole task was already selected through the config.
```python
vecenv_config = {}

def register(config_name, func):
    vecenv_config[config_name] = func

def create_vec_env(config_name, num_actors, **kwargs):
    vec_env_name = configurations[config_name]['vecenv_type']
    return vecenv_config[vec_env_name](config_name, num_actors, **kwargs)
```
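- A self-contained toy mimic of the two dictionaries involved (the names are the ones registered in train.py; the returned string stands in for the real RLGPUEnv instance):
```python
# env_configurations.register(...) fills `configurations`, vecenv.register(...) fills `vecenv_config`
configurations = {'rlgpu': {'vecenv_type': 'RLGPU1'}}
vecenv_config = {'RLGPU1': lambda name, n, **kw: f"RLGPUEnv({name!r}, num_actors={n})"}

def create_vec_env(config_name, num_actors, **kwargs):
    vec_env_name = configurations[config_name]['vecenv_type']   # 'rlgpu' -> 'RLGPU1'
    return vecenv_config[vec_env_name](config_name, num_actors, **kwargs)

print(create_vec_env('rlgpu', 2))   # RLGPUEnv('rlgpu', num_actors=2)
```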
- vec_env.step() is a method of the task class Cartpole(VecTask); it comes from the base class VecTask.
- self.pre_physics_step(action_tensor) is overridden in Cartpole and applies the actions.
- self.render() renders the viewer; it is also implemented in this class and calls step_graphics and draw_viewer.
- self.gym.simulate(self.sim) advances the physics by one step.
- self.post_physics_step() reads back and stores the new state of the simulated environments.
```python
def step(self, actions: torch.Tensor) -> Tuple[Dict[str, torch.Tensor], torch.Tensor, torch.Tensor, Dict[str, Any]]:
    """Step the physics of the environment.

    Args:
        actions: actions to apply
    Returns:
        Observations, rewards, resets, info
        Observations are dict of observations (currently only one member called 'obs')
    """

    # randomize actions
    if self.dr_randomizations.get('actions', None):
        actions = self.dr_randomizations['actions']['noise_lambda'](actions)

    action_tensor = torch.clamp(actions, -self.clip_actions, self.clip_actions)
    # apply actions
    self.pre_physics_step(action_tensor)

    # step physics and render each frame
    for i in range(self.control_freq_inv):
        if self.force_render:
            self.render()
        self.gym.simulate(self.sim)   # one physics step

    # to fix!
    if self.device == 'cpu':
        self.gym.fetch_results(self.sim, True)

    # compute observations, rewards, resets, ...
    self.post_physics_step()

    self.control_steps += 1

    # fill time out buffer: set to 1 if we reached the max episode length AND the reset buffer is 1.
    # Timeout == 1 makes sense only if the reset buffer is 1.
    self.timeout_buf = (self.progress_buf >= self.max_episode_length - 1) & (self.reset_buf != 0)

    # randomize observations
    if self.dr_randomizations.get('observations', None):
        self.obs_buf = self.dr_randomizations['observations']['noise_lambda'](self.obs_buf)

    self.extras["time_outs"] = self.timeout_buf.to(self.rl_device)

    self.obs_dict["obs"] = torch.clamp(self.obs_buf, -self.clip_obs, self.clip_obs).to(self.rl_device)

    # asymmetric actor-critic
    if self.num_states > 0:
        self.obs_dict["states"] = self.get_state()

    return self.obs_dict, self.rew_buf.to(self.rl_device), self.reset_buf.to(self.rl_device), self.extras
```
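- Structurally, a task like Cartpole only has to fill in the hooks that step() calls. A bare, hypothetical skeleton (class name and bodies are mine, not the real Cartpole implementation):
```python
# Hypothetical skeleton showing which hooks VecTask.step() relies on; the real
# Cartpole fills these in with Isaac Gym tensor-API calls.
class MyCartpoleLikeTask(VecTask):
    def pre_physics_step(self, actions):
        # map policy actions to DOF forces/targets and push them into the sim
        ...

    def post_physics_step(self):
        # advance the episode counter, reset finished envs, then recompute
        # observations and rewards into self.obs_buf / self.rew_buf / self.reset_buf
        self.progress_buf += 1
        self.compute_observations()
        self.compute_reward()
```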
VII. Overall flow
- This is the call chain from train.py down to the Isaac Gym simulation, invoked top to bottom:
train.py[main] (run from the command line)
train.py[runner.run]
torch_runner.py[self.run_train(args)]
torch_runner.py[agent.train()]
a2c_continuous.py[base class a2c_common.ContinuousA2CBase]
a2c_common.py[train(self)]
a2c_common.py[train_epoch(self)]
a2c_common.py[self.play_steps()]
a2c_common.py[self.env_step()]
cartpole.py[base class VecTask]
vec_task.py[step()]
vec_task.py[self.gym.simulate(self.sim)] (calls Isaac Gym's simulation function)