1from typing import List, Dict, Any, Tuple, Union 2from collections import namedtuple 3import copy 4import torch 5from torch.utils.data import Dataset, DataLoader 6 7from ding.utils import POLICY_REGISTRY, split_data_generator, RunningMeanStd 8from ding.utils.data import default_collate, default_decollate 9from ding.torch_utils import Adam, to_device 10from ding.rl_utils import get_gae_with_default_last_value, get_train_sample, gae, gae_data, get_gae, \ 11 ppo_policy_data, ppo_policy_error, ppo_value_data, ppo_value_error, ppg_data, ppg_joint_error 12from ding.model import model_wrap 13from .base_policy import Policy 14 15 16class ExperienceDataset(Dataset): 17 """ 18 Overview: 19 A dataset class for storing and accessing experience data. 20 21 Interface: 22 ``__init__``, ``__len__``, ``__getitem__``. 23 """ 24 25 def __init__(self, data): 26 """ 27 Arguments: 28 - data (:obj:`dict`): A dictionary containing the experience data, where the keys represent the data types \ 29 and the values are the corresponding data arrays. 30 """ 31 super().__init__() 32 self.data = data 33 34 def __len__(self): 35 return list(self.data.values())[0].shape[0] 36 37 def __getitem__(self, ind): 38 data = {} 39 for key in self.data.keys(): 40 data[key] = self.data[key][ind] 41 return data 42 43 44def create_shuffled_dataloader(data, batch_size): 45 ds = ExperienceDataset(data) 46 return DataLoader(ds, batch_size=batch_size, shuffle=True) 47 48 49@POLICY_REGISTRY.register('ppg') 50class PPGPolicy(Policy): 51 """ 52 Overview: 53 Policy class of PPG algorithm. PPG is a policy gradient algorithm with auxiliary phase training. \ 54 The auxiliary phase training is proposed to distill the value into the policy network, \ 55 while making sure the policy network does not change the action predictions (kl div loss). \ 56 Paper link: https://arxiv.org/abs/2009.04416. 
57 58 Interface: 59 ``_init_learn``, ``_data_preprocess_learn``, ``_forward_learn``, ``_state_dict_learn``, \ 60 ``_load_state_dict_learn``, ``_init_collect``, ``_forward_collect``, ``_process_transition``, \ 61 ``_get_train_sample``, ``_get_batch_size``, ``_init_eval``, ``_forward_eval``, ``default_model``, \ 62 ``_monitor_vars_learn``, ``learn_aux``. 63 Config: 64 == ==================== ======== ============== ======================================== ======================= 65 ID Symbol Type Default Value Description Other(Shape) 66 == ==================== ======== ============== ======================================== ======================= 67 1 ``type`` str ppg | RL policy register name, refer to | this arg is optional, 68 | registry ``POLICY_REGISTRY`` | a placeholder 69 2 ``cuda`` bool False | Whether to use cuda for network | this arg can be diff- 70 | erent from modes 71 3 ``on_policy`` bool True | Whether the RL algorithm is on-policy 72 | or off-policy 73 4. ``priority`` bool False | Whether use priority(PER) | priority sample, 74 | update priority 75 5 | ``priority_`` bool False | Whether use Importance Sampling | IS weight 76 | ``IS_weight`` | Weight to correct biased update. 77 6 | ``learn.update`` int 5 | How many updates(iterations) to train | this args can be vary 78 | ``_per_collect`` | after collector's one collection. Only | from envs. 
Bigger val 79 | valid in serial training | means more off-policy 80 7 | ``learn.value_`` float 1.0 | The loss weight of value network | policy network weight 81 | ``weight`` | is set to 1 82 8 | ``learn.entropy_`` float 0.01 | The loss weight of entropy | policy network weight 83 | ``weight`` | regularization | is set to 1 84 9 | ``learn.clip_`` float 0.2 | PPO clip ratio 85 | ``ratio`` 86 10 | ``learn.adv_`` bool False | Whether to use advantage norm in 87 | ``norm`` | a whole training batch 88 11 | ``learn.aux_`` int 5 | The frequency(normal update times) 89 | ``freq`` | of auxiliary phase training 90 12 | ``learn.aux_`` int 6 | The training epochs of auxiliary 91 | ``train_epoch`` | phase 92 13 | ``learn.aux_`` int 1 | The loss weight of behavioral_cloning 93 | ``bc_weight`` | in auxiliary phase 94 14 | ``collect.dis`` float 0.99 | Reward's future discount factor, aka. | may be 1 when sparse 95 | ``count_factor`` | gamma | reward env 96 15 | ``collect.gae_`` float 0.95 | GAE lambda factor for the balance 97 | ``lambda`` | of bias and variance(1-step td and mc) 98 == ==================== ======== ============== ======================================== ======================= 99 """ 100 config = dict( 101 # (str) RL policy register name (refer to function "POLICY_REGISTRY"). 102 type='ppg', 103 # (bool) Whether to use cuda for network. 104 cuda=False, 105 # (bool) Whether the RL algorithm is on-policy or off-policy. (Note: in practice PPO can be off-policy used) 106 on_policy=True, 107 priority=False, 108 # (bool) Whether use Importance Sampling Weight to correct biased update. If True, priority must be True. 
109 priority_IS_weight=False, 110 learn=dict( 111 actor_epoch_per_collect=1, 112 critic_epoch_per_collect=1, 113 batch_size=64, 114 learning_rate=0.001, 115 # ============================================================== 116 # The following configs is algorithm-specific 117 # ============================================================== 118 # (float) The loss weight of value network, policy network weight is set to 1 119 value_weight=0.5, 120 # (float) The loss weight of entropy regularization, policy network weight is set to 1 121 entropy_weight=0.01, 122 # (float) PPO clip ratio, defaults to 0.2 123 clip_ratio=0.2, 124 value_norm=False, 125 # (bool) Whether to use advantage norm in a whole training batch 126 adv_norm=False, 127 # (int) The frequency(normal update times) of auxiliary phase training 128 aux_freq=8, 129 # (int) The training epochs of auxiliary phase 130 aux_train_epoch=6, 131 # (int) The loss weight of behavioral_cloning in auxiliary phase 132 aux_bc_weight=1, 133 grad_clip_type='clip_norm', 134 grad_clip_value=10, 135 ignore_done=False, 136 ), 137 collect=dict( 138 # n_sample=64, 139 unroll_len=1, 140 # ============================================================== 141 # The following configs is algorithm-specific 142 # ============================================================== 143 # (float) Reward's future discount factor, aka. gamma. 144 discount_factor=0.99, 145 # (float) GAE lambda factor for the balance of bias and variance(1-step td and mc) 146 gae_lambda=0.95, 147 ), 148 eval=dict(), 149 ) 150 151 def default_model(self) -> Tuple[str, List[str]]: 152 """ 153 Overview: 154 Return this algorithm default neural network model setting for demonstration. ``__init__`` method will \ 155 automatically call this method to get the default model setting and create model. 156 157 Returns: 158 - model_info (:obj:`Tuple[str, List[str]]`): The registered model name and model's import_names. 
159 """ 160 return 'ppg', ['ding.model.template.ppg'] 161 162 def _init_learn(self) -> None: 163 """ 164 Overview: 165 Initialize the learn mode of policy, including related attributes and modules. For PPG, it mainly \ 166 contains optimizer, algorithm-specific arguments such as aux_bc_weight and aux_train_epoch. This method \ 167 also executes some special network initializations and prepares running mean/std monitor for value. \ 168 This method will be called in ``__init__`` method if ``learn`` field is in ``enable_field``. 169 170 .. note:: 171 For the member variables that need to be saved and loaded, please refer to the ``_state_dict_learn`` \ 172 and ``_load_state_dict_learn`` methods. 173 174 .. note:: 175 For the member variables that need to be monitored, please refer to the ``_monitor_vars_learn`` method. 176 177 .. note:: 178 If you want to set some spacial member variables in ``_init_learn`` method, you'd better name them \ 179 with prefix ``_learn_`` to avoid conflict with other modes, such as ``self._learn_attr1``. 
180 """ 181 # Optimizer 182 self._optimizer_ac = Adam(self._model.actor_critic.parameters(), lr=self._cfg.learn.learning_rate) 183 self._optimizer_aux_critic = Adam(self._model.aux_critic.parameters(), lr=self._cfg.learn.learning_rate) 184 self._learn_model = model_wrap(self._model, wrapper_name='base') 185 186 # Algorithm config 187 self._priority = self._cfg.priority 188 self._priority_IS_weight = self._cfg.priority_IS_weight 189 assert not self._priority and not self._priority_IS_weight, "Priority is not implemented in PPG" 190 self._value_weight = self._cfg.learn.value_weight 191 self._entropy_weight = self._cfg.learn.entropy_weight 192 self._value_norm = self._cfg.learn.value_norm 193 if self._value_norm: 194 self._running_mean_std = RunningMeanStd(epsilon=1e-4, device=self._device) 195 self._clip_ratio = self._cfg.learn.clip_ratio 196 self._adv_norm = self._cfg.learn.adv_norm 197 198 # Main model 199 self._learn_model.reset() 200 201 # Auxiliary memories 202 self._aux_train_epoch = self._cfg.learn.aux_train_epoch 203 self._train_iteration = 0 204 self._aux_memories = [] 205 self._aux_bc_weight = self._cfg.learn.aux_bc_weight 206 207 def _data_preprocess_learn(self, data: List[Any]) -> dict: 208 """ 209 Overview: 210 Preprocess the data to fit the required data format for learning, including \ 211 collate(stack data into batch), ignore done(in some fake terminate env),\ 212 prepare loss weight per training sample, and cpu tensor to cuda. 213 Arguments: 214 - data (:obj:`List[Dict[str, Any]]`): The data collected from collect function. 215 Returns: 216 - data (:obj:`Dict[str, Any]`): The processed data, including at least ['done', 'weight']. 
217 """ 218 # data preprocess 219 data = default_collate(data) 220 ignore_done = self._cfg.learn.ignore_done 221 if ignore_done: 222 data['done'] = None 223 else: 224 data['done'] = data['done'].float() 225 data['weight'] = None 226 if self._cuda: 227 data = to_device(data, self._device) 228 return data 229 230 def _forward_learn(self, data: dict) -> Dict[str, Any]: 231 """ 232 Overview: 233 Forward and backward function of learn mode. 234 Arguments: 235 - data (:obj:`Dict[str, Any]`): Input data used for policy forward, including the \ 236 collected training samples from replay buffer. For each element in dict, the key of the \ 237 dict is the name of data items and the value is the corresponding data. Usually, the value is \ 238 torch.Tensor or np.ndarray or there dict/list combinations. In the ``_forward_learn`` method, data \ 239 often need to first be stacked in the batch dimension by some utility functions such as \ 240 ``default_preprocess_learn``. \ 241 For PPG, each element in list is a dict containing at least the following keys: ``obs``, ``action``, \ 242 ``reward``, ``logit``, ``value``, ``done``. Sometimes, it also contains other keys such as ``weight``. 243 Returns: 244 - info_dict (:obj:`Dict[str, Any]`): Dict type data, a info dict indicated training result, which will be \ 245 recorded in text log and tensorboard, values are python scalar or a list of scalars. \ 246 For the detailed definition of the dict, refer to the code of ``_monitor_vars_learn`` method. 247 248 .. note:: 249 The input value can be torch.Tensor or dict/list combinations and current policy supports all of them. \ 250 For the data type that not supported, the main reason is that the corresponding model does not support it. \ 251 You can implement you own model rather than use the default model. For more information, please raise an \ 252 issue in GitHub repo and we will continue to follow up. 253 254 .. 
note:: 255 For more detailed examples, please refer to our unittest for PPGPolicy: ``ding.policy.tests.test_ppgs``. 256 """ 257 data = self._data_preprocess_learn(data) 258 # ==================== 259 # PPG forward 260 # ==================== 261 self._learn_model.train() 262 return_infos = [] 263 if self._value_norm: 264 unnormalized_return = data['adv'] + data['value'] * self._running_mean_std.std 265 data['return'] = unnormalized_return / self._running_mean_std.std 266 self._running_mean_std.update(unnormalized_return.cpu().numpy()) 267 else: 268 data['return'] = data['adv'] + data['value'] 269 270 for epoch in range(self._cfg.learn.actor_epoch_per_collect): 271 for policy_data in split_data_generator(data, self._cfg.learn.batch_size, shuffle=True): 272 policy_adv = policy_data['adv'] 273 if self._adv_norm: 274 # Normalize advantage in a total train_batch 275 policy_adv = (policy_adv - policy_adv.mean()) / (policy_adv.std() + 1e-8) 276 # Policy Phase(Policy) 277 policy_output = self._learn_model.forward(policy_data['obs'], mode='compute_actor') 278 policy_error_data = ppo_policy_data( 279 policy_output['logit'], policy_data['logit'], policy_data['action'], policy_adv, 280 policy_data['weight'], None 281 ) 282 ppo_policy_loss, ppo_info = ppo_policy_error(policy_error_data, self._clip_ratio) 283 policy_loss = ppo_policy_loss.policy_loss - self._entropy_weight * ppo_policy_loss.entropy_loss 284 self._optimizer_ac.zero_grad() 285 policy_loss.backward() 286 self._optimizer_ac.step() 287 288 for epoch in range(self._cfg.learn.critic_epoch_per_collect): 289 for value_data in split_data_generator(data, self._cfg.learn.batch_size, shuffle=True): 290 value_adv = value_data['adv'] 291 return_ = value_data['return'] 292 if self._adv_norm: 293 # Normalize advantage in a total train_batch 294 value_adv = (value_adv - value_adv.mean()) / (value_adv.std() + 1e-8) 295 # Policy Phase(Value) 296 value_output = self._learn_model.forward(value_data['obs'], mode='compute_critic') 297 
value_error_data = ppo_value_data( 298 value_output['value'], value_data['value'], return_, value_data['weight'] 299 ) 300 value_loss = self._value_weight * ppo_value_error(value_error_data, self._clip_ratio) 301 self._optimizer_aux_critic.zero_grad() 302 value_loss.backward() 303 self._optimizer_aux_critic.step() 304 305 data['return_'] = data['return'] 306 307 self._aux_memories.append(copy.deepcopy(data)) 308 309 self._train_iteration += 1 310 311 # ==================== 312 # PPG update 313 # use aux loss after iterations and reset aux_memories 314 # ==================== 315 316 # Auxiliary Phase 317 # record data for auxiliary head 318 319 if self._train_iteration % self._cfg.learn.aux_freq == 0: 320 aux_loss, bc_loss, aux_value_loss = self.learn_aux() 321 return { 322 'policy_cur_lr': self._optimizer_ac.defaults['lr'], 323 'value_cur_lr': self._optimizer_aux_critic.defaults['lr'], 324 'policy_loss': ppo_policy_loss.policy_loss.item(), 325 'value_loss': value_loss.item(), 326 'entropy_loss': ppo_policy_loss.entropy_loss.item(), 327 'policy_adv_abs_max': policy_adv.abs().max().item(), 328 'approx_kl': ppo_info.approx_kl, 329 'clipfrac': ppo_info.clipfrac, 330 'aux_value_loss': aux_value_loss, 331 'auxiliary_loss': aux_loss, 332 'behavioral_cloning_loss': bc_loss, 333 } 334 else: 335 return { 336 'policy_cur_lr': self._optimizer_ac.defaults['lr'], 337 'value_cur_lr': self._optimizer_aux_critic.defaults['lr'], 338 'policy_loss': ppo_policy_loss.policy_loss.item(), 339 'value_loss': value_loss.item(), 340 'entropy_loss': ppo_policy_loss.entropy_loss.item(), 341 'policy_adv_abs_max': policy_adv.abs().max().item(), 342 'approx_kl': ppo_info.approx_kl, 343 'clipfrac': ppo_info.clipfrac, 344 } 345 346 def _state_dict_learn(self) -> Dict[str, Any]: 347 """ 348 Overview: 349 Return the state_dict of learn mode, usually including model and optimizer. 350 Returns: 351 - state_dict (:obj:`Dict[str, Any]`): the dict of current policy learn state, for saving and restoring. 
352 """ 353 return { 354 'model': self._learn_model.state_dict(), 355 'optimizer_ac': self._optimizer_ac.state_dict(), 356 'optimizer_aux_critic': self._optimizer_aux_critic.state_dict(), 357 } 358 359 def _load_state_dict_learn(self, state_dict: Dict[str, Any]) -> None: 360 """ 361 Overview: 362 Load the state_dict variable into policy learn mode. 363 Arguments: 364 - state_dict (:obj:`Dict[str, Any]`): the dict of policy learn state saved before.\ 365 When the value is distilled into the policy network, we need to make sure the policy \ 366 network does not change the action predictions, we need two optimizers, \ 367 _optimizer_ac is used in policy net, and _optimizer_aux_critic is used in value net. 368 369 .. tip:: 370 If you want to only load some parts of model, you can simply set the ``strict`` argument in \ 371 load_state_dict to ``False``, or refer to ``ding.torch_utils.checkpoint_helper`` for more \ 372 complicated operation. 373 """ 374 self._learn_model.load_state_dict(state_dict['model']) 375 self._optimizer_ac.load_state_dict(state_dict['optimizer_ac']) 376 self._optimizer_aux_critic.load_state_dict(state_dict['optimizer_aux_critic']) 377 378 def _init_collect(self) -> None: 379 """ 380 Overview: 381 Initialize the collect mode of policy, including related attributes and modules. For PPG, it contains the \ 382 collect_model to balance the exploration and exploitation (e.g. the multinomial sample mechanism in \ 383 discrete action space), and other algorithm-specific arguments such as unroll_len and gae_lambda. 384 This method will be called in ``__init__`` method if ``collect`` field is in ``enable_field``. 385 386 .. note:: 387 If you want to set some spacial member variables in ``_init_collect`` method, you'd better name them \ 388 with prefix ``_collect_`` to avoid conflict with other modes, such as ``self._collect_attr1``. 
389 """ 390 self._unroll_len = self._cfg.collect.unroll_len 391 self._collect_model = model_wrap(self._model, wrapper_name='multinomial_sample') 392 # TODO continuous action space exploration 393 self._collect_model.reset() 394 self._gamma = self._cfg.collect.discount_factor 395 self._gae_lambda = self._cfg.collect.gae_lambda 396 397 def _forward_collect(self, data: dict) -> dict: 398 """ 399 Overview: 400 Policy forward function of collect mode (collecting training data by interacting with envs). Forward means \ 401 that the policy gets some necessary data (mainly observation) from the envs and then returns the output \ 402 data, such as the action to interact with the envs. 403 404 Arguments: 405 - data (:obj:`Dict[str, Any]`): Dict type data, stacked env data for predicting policy_output(action), \ 406 values are torch.Tensor or np.ndarray or dict/list combinations, keys are env_id indicated by integer. 407 408 Returns: 409 - output (:obj:`Dict[int, Any]`): The output data of policy forward, including at least the action and \ 410 other necessary data (action logit and value) for learn mode defined in \ 411 ``self._process_transition`` method. The key of the dict is the same as the input data, \ 412 i.e. environment id. 413 414 .. tip:: 415 If you want to add more tricks on this policy, like temperature factor in multinomial sample, you can pass \ 416 related data as extra keyword arguments of this method. 417 418 .. note:: 419 The input value can be torch.Tensor or dict/list combinations and current policy supports all of them. \ 420 For the data type that not supported, the main reason is that the corresponding model does not support it. \ 421 You can implement you own model rather than use the default model. For more information, please raise an \ 422 issue in GitHub repo and we will continue to follow up. 423 424 .. note:: 425 For more detailed examples, please refer to our unittest for PPGPolicy: ``ding.policy.tests.test_ppg``. 
426 """ 427 data_id = list(data.keys()) 428 data = default_collate(list(data.values())) 429 if self._cuda: 430 data = to_device(data, self._device) 431 self._collect_model.eval() 432 with torch.no_grad(): 433 output = self._collect_model.forward(data, mode='compute_actor_critic') 434 if self._cuda: 435 output = to_device(output, 'cpu') 436 output = default_decollate(output) 437 return {i: d for i, d in zip(data_id, output)} 438 439 def _process_transition(self, obs: Any, model_output: dict, timestep: namedtuple) -> dict: 440 """ 441 Overview: 442 Process and pack one timestep transition data into a dict, which can be directly used for training and \ 443 saved in replay buffer. For PPG, it contains obs, next_obs, action, reward, done, logit, value. 444 Arguments: 445 - obs (:obj:`Any`): Env observation 446 - model_output (:obj:`dict`): The output of the policy network with the observation \ 447 as input. For PPG, it contains the state value, action and the logit of the action. 448 - timestep (:obj:`namedtuple`): The execution result namedtuple returned by the environment step \ 449 method, except all the elements have been transformed into tensor data. Usually, it contains the next \ 450 obs, reward, done, info, etc. 451 Returns: 452 - transition (:obj:`dict`): The processed transition data of the current timestep. 453 454 .. note:: 455 ``next_obs`` is used to calculate nstep return when necessary, so we place in into transition by default. \ 456 You can delete this field to save memory occupancy if you do not need nstep return. 
457 """ 458 transition = { 459 'obs': obs, 460 'next_obs': timestep.obs, 461 'logit': model_output['logit'], 462 'action': model_output['action'], 463 'value': model_output['value'], 464 'reward': timestep.reward, 465 'done': timestep.done, 466 } 467 return transition 468 469 def _get_train_sample(self, data: List[Dict[str, Any]]) -> Union[None, List[Any]]: 470 """ 471 Overview: 472 For a given trajectory (transitions, a list of transition) data, process it into a list of sample that \ 473 can be used for training directly. In PPG, a train sample is a processed transition with new computed \ 474 ``adv`` field. This method is usually used in collectors to execute necessary. \ 475 RL data preprocessing before training, which can help learner amortize revelant time consumption. \ 476 In addition, you can also implement this method as an identity function and do the data processing \ 477 in ``self._forward_learn`` method. 478 Arguments: 479 - data (:obj:`List[Dict[str, Any]]`): The trajectory data (a list of transition), each element is \ 480 the same format as the return value of ``self._process_transition`` method. 481 Returns: 482 - samples (:obj:`dict`): The processed train samples, each element is the similar format \ 483 as input transitions, but may contain more data for training, such as GAE advantage. 
484 """ 485 data = to_device(data, self._device) 486 if self._cfg.learn.ignore_done: 487 data[-1]['done'] = False 488 489 if data[-1]['done']: 490 last_value = torch.zeros_like(data[-1]['value']) 491 else: 492 with torch.no_grad(): 493 last_value = self._collect_model.forward( 494 data[-1]['next_obs'].unsqueeze(0), mode='compute_actor_critic' 495 )['value'] 496 if self._value_norm: 497 last_value *= self._running_mean_std.std 498 for i in range(len(data)): 499 data[i]['value'] *= self._running_mean_std.std 500 data = get_gae( 501 data, 502 to_device(last_value, self._device), 503 gamma=self._gamma, 504 gae_lambda=self._gae_lambda, 505 cuda=False, 506 ) 507 if self._value_norm: 508 for i in range(len(data)): 509 data[i]['value'] /= self._running_mean_std.std 510 511 return get_train_sample(data, self._unroll_len) 512 513 def _get_batch_size(self) -> Dict[str, int]: 514 """ 515 Overview: 516 Get learn batch size. In the PPG algorithm, different networks require different data.\ 517 We need to get data['policy'] and data['value'] to train policy net and value net,\ 518 this function is used to get the batch size of data['policy'] and data['value']. 519 Returns: 520 - output (:obj:`dict[str, int]`): Dict type data, including str type batch size and int type batch size. 521 """ 522 bs = self._cfg.learn.batch_size 523 return {'policy': bs, 'value': bs} 524 525 def _init_eval(self) -> None: 526 """ 527 Overview: 528 Initialize the eval mode of policy, including related attributes and modules. For PPG, it contains the \ 529 eval model to select optimial action (e.g. greedily select action with argmax mechanism in discrete \ 530 action). This method will be called in ``__init__`` method if ``eval`` field is in ``enable_field``. 531 532 .. note:: 533 If you want to set some spacial member variables in ``_init_eval`` method, you'd better name them \ 534 with prefix ``_eval_`` to avoid conflict with other modes, such as ``self._eval_attr1``. 
535 """ 536 self._eval_model = model_wrap(self._model, wrapper_name='argmax_sample') 537 self._eval_model.reset() 538 539 def _forward_eval(self, data: dict) -> dict: 540 """ 541 Overview: 542 Policy forward function of eval mode (evaluation policy performance by interacting with envs). Forward \ 543 means that the policy gets some necessary data (mainly observation) from the envs and then returns the \ 544 action to interact with the envs. ``_forward_eval`` in PPG often uses deterministic sample method to get \ 545 actions while ``_forward_collect`` usually uses stochastic sample method for balance exploration and \ 546 exploitation. 547 Arguments: 548 - data (:obj:`Dict[str, Any]`): The input data used for policy forward, including at least the obs. The \ 549 key of the dict is environment id and the value is the corresponding data of the env. 550 551 Returns: 552 - output (:obj:`Dict[int, Any]`): The output data of policy forward, including at least the action. The \ 553 key of the dict is the same as the input data, i.e. environment id. 554 555 .. note:: 556 The input value can be torch.Tensor or dict/list combinations and current policy supports all of them. \ 557 For the data type that not supported, the main reason is that the corresponding model does not support it. \ 558 You can implement you own model rather than use the default model. For more information, please raise an \ 559 issue in GitHub repo and we will continue to follow up. 560 561 .. note:: 562 For more detailed examples, please refer to our unittest for PPGPolicy: ``ding.policy.tests.test_ppg``. 
563 """ 564 data_id = list(data.keys()) 565 data = default_collate(list(data.values())) 566 if self._cuda: 567 data = to_device(data, self._device) 568 self._eval_model.eval() 569 with torch.no_grad(): 570 output = self._eval_model.forward(data, mode='compute_actor') 571 if self._cuda: 572 output = to_device(output, 'cpu') 573 output = default_decollate(output) 574 return {i: d for i, d in zip(data_id, output)} 575 576 def _monitor_vars_learn(self) -> List[str]: 577 """ 578 Overview: 579 Return the necessary keys for logging the return dict of ``self._forward_learn``. The logger module, such \ 580 as text logger, tensorboard logger, will use these keys to save the corresponding data. 581 Returns: 582 - vars (:obj:`List[str]`): The list of the necessary keys to be logged. 583 """ 584 return [ 585 'policy_cur_lr', 586 'value_cur_lr', 587 'policy_loss', 588 'value_loss', 589 'entropy_loss', 590 'policy_adv_abs_max', 591 'approx_kl', 592 'clipfrac', 593 'aux_value_loss', 594 'auxiliary_loss', 595 'behavioral_cloning_loss', 596 ] 597 598 def learn_aux(self) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: 599 """ 600 Overview: 601 The auxiliary phase training, where the value is distilled into the policy network. In PPG algorithm, \ 602 we use the value function loss as the auxiliary objective, thereby sharing features between the policy \ 603 and value function while minimizing distortions to the policy. We also use behavioral cloning loss to \ 604 optimize the auxiliary objective while otherwise preserving the original policy. 605 Returns: 606 - aux_loss (:obj:`Tuple[torch.Tensor, torch.Tensor, torch.Tensor]`): Including average auxiliary loss\ 607 average behavioral cloning loss, and average auxiliary value loss. 
608 """ 609 aux_memories = self._aux_memories 610 # gather states and target values into one tensor 611 data = {} 612 states = [] 613 actions = [] 614 return_ = [] 615 old_values = [] 616 weights = [] 617 for memory in aux_memories: 618 # for memory in memories: 619 states.append(memory['obs']) 620 actions.append(memory['action']) 621 return_.append(memory['return_']) 622 old_values.append(memory['value']) 623 if memory['weight'] is None: 624 weight = torch.ones_like(memory['action']) 625 else: 626 weight = torch.tensor(memory['weight']) 627 weights.append(weight) 628 629 data['obs'] = torch.cat(states) 630 data['action'] = torch.cat(actions) 631 data['return_'] = torch.cat(return_) 632 data['value'] = torch.cat(old_values) 633 data['weight'] = torch.cat(weights).float() 634 # compute current policy logit_old 635 with torch.no_grad(): 636 data['logit_old'] = self._model.forward(data['obs'], mode='compute_actor')['logit'] 637 638 # prepared dataloader for auxiliary phase training 639 dl = create_shuffled_dataloader(data, self._cfg.learn.batch_size) 640 641 # the proposed auxiliary phase training 642 # where the value is distilled into the policy network, 643 # while making sure the policy network does not change the action predictions (kl div loss) 644 645 i = 0 646 auxiliary_loss_ = 0 647 behavioral_cloning_loss_ = 0 648 value_loss_ = 0 649 650 for epoch in range(self._aux_train_epoch): 651 for data in dl: 652 policy_output = self._model.forward(data['obs'], mode='compute_actor_critic') 653 654 # Calculate ppg error 'logit_new', 'logit_old', 'action', 'value_new', 'value_old', 'return_', 'weight' 655 data_ppg = ppg_data( 656 policy_output['logit'], data['logit_old'], data['action'], policy_output['value'], data['value'], 657 data['return_'], data['weight'] 658 ) 659 ppg_joint_loss = ppg_joint_error(data_ppg, self._clip_ratio) 660 wb = self._aux_bc_weight 661 total_loss = ppg_joint_loss.auxiliary_loss + wb * ppg_joint_loss.behavioral_cloning_loss 662 663 # # policy 
network loss composes of both the kl div loss as well as the auxiliary loss
                # aux_loss = clipped_value_loss(policy_values, rewards, old_values, self.value_clip)
                # loss_kl = F.kl_div(action_logprobs, old_action_probs, reduction='batchmean')
                # policy_loss = aux_loss + loss_kl

                self._optimizer_ac.zero_grad()
                total_loss.backward()
                self._optimizer_ac.step()

                # paper says it is important to train the value network extra during the auxiliary phase
                # Calculate ppg error 'value_new', 'value_old', 'return_', 'weight'
                values = self._model.forward(data['obs'], mode='compute_critic')['value']
                data_aux = ppo_value_data(values, data['value'], data['return_'], data['weight'])

                value_loss = ppo_value_error(data_aux, self._clip_ratio)

                self._optimizer_aux_critic.zero_grad()
                value_loss.backward()
                self._optimizer_aux_critic.step()

                # accumulate per-minibatch scalars so the averages below are over all aux minibatches
                auxiliary_loss_ += ppg_joint_loss.auxiliary_loss.item()
                behavioral_cloning_loss_ += ppg_joint_loss.behavioral_cloning_loss.item()
                value_loss_ += value_loss.item()
                i += 1

        # clear the collected auxiliary memories once the aux phase consumed them
        self._aux_memories = []

        return auxiliary_loss_ / i, behavioral_cloning_loss_ / i, value_loss_ / i


@POLICY_REGISTRY.register('ppg_offpolicy')
class PPGOffPolicy(Policy):
    """
    Overview:
        Policy class of PPG algorithm with off-policy training mode. Off-policy PPG contains two different data \
        max_use buffers. The policy buffer offers data for the policy phase, while the value buffer provides the \
        auxiliary phase's data. The whole training procedure is similar to off-policy PPO but executes an \
        additional auxiliary phase with a fixed frequency.
    Interface:
        ``_init_learn``, ``_data_preprocess_learn``, ``_forward_learn``, ``_state_dict_learn``, \
        ``_load_state_dict_learn``, ``_init_collect``, ``_forward_collect``, ``_process_transition``, \
        ``_get_train_sample``, ``_get_batch_size``, ``_init_eval``, ``_forward_eval``, ``default_model``, \
        ``_monitor_vars_learn``, ``learn_aux``.
    Config:
        == ==================== ======== ============== ======================================== =======================
        ID Symbol               Type     Default Value  Description                              Other(Shape)
        == ==================== ======== ============== ======================================== =======================
        1  ``type``             str      ppg            | RL policy register name, refer to      | this arg is optional,
                                                        | registry ``POLICY_REGISTRY``           | a placeholder
        2  ``cuda``             bool     False          | Whether to use cuda for network        | this arg can be diff-
                                                                                                 | erent from modes
        3  ``on_policy``        bool     True           | Whether the RL algorithm is on-policy
                                                        | or off-policy
        4. ``priority``         bool     False          | Whether use priority(PER)              | priority sample,
                                                                                                 | update priority
        5  | ``priority_``      bool     False          | Whether use Importance Sampling        | IS weight
           | ``IS_weight``                              | Weight to correct biased update.
        6  | ``learn.update``   int      5              | How many updates(iterations) to train  | this args can be vary
           | ``_per_collect``                           | after collector's one collection. Only | from envs. Bigger val
                                                        | valid in serial training               | means more off-policy
        7  | ``learn.value_``   float    0.5            | The loss weight of value network       | policy network weight
           | ``weight``                                 |                                        | is set to 1
        8  | ``learn.entropy_`` float    0.01           | The loss weight of entropy             | policy network weight
           | ``weight``                                 | regularization                         | is set to 1
        9  | ``learn.clip_``    float    0.2            | PPO clip ratio
           | ``ratio``
        10 | ``learn.adv_``     bool     False          | Whether to use advantage norm in
           | ``norm``                                   | a whole training batch
        11 | ``learn.aux_``     int      5              | The frequency(normal update times)
           | ``freq``                                   | of auxiliary phase training
        12 | ``learn.aux_``     int      6              | The training epochs of auxiliary
           | ``train_epoch``                            | phase
        13 | ``learn.aux_``     int      1              | The loss weight of behavioral_cloning
           | ``bc_weight``                              | in auxiliary phase
        14 | ``collect.dis``    float    0.99           | Reward's future discount factor, aka.  | may be 1 when sparse
           | ``count_factor``                           | gamma                                  | reward env
        15 | ``collect.gae_``   float    0.95           | GAE lambda factor for the balance
           | ``lambda``                                 | of bias and variance(1-step td and mc)
        == ==================== ======== ============== ======================================== =======================
    """
    config = dict(
        # (str) RL policy register name (refer to function "POLICY_REGISTRY").
        type='ppg_offpolicy',
        # (bool) Whether to use cuda for network.
        cuda=False,
        # (bool) Whether the RL algorithm is on-policy or off-policy. (Note: in practice PPO can be off-policy used)
        on_policy=False,
        # (bool) Whether to use priority (PER). Not implemented for PPG, must stay False.
        priority=False,
        # (bool) Whether use Importance Sampling Weight to correct biased update. If True, priority must be True.
        priority_IS_weight=False,
        # (bool) Whether to need policy data in process transition
        transition_with_policy_data=True,
        learn=dict(
            # (int) How many updates(iterations) to train after collector's one collection.
            update_per_collect=5,
            # (int) Minibatch size for one gradient descent step (also used by the auxiliary-phase dataloader).
            batch_size=64,
            # (float) Learning rate shared by both optimizers (actor-critic and aux critic).
            learning_rate=0.001,
            # ==============================================================
            # The following configs is algorithm-specific
            # ==============================================================
            # (float) The loss weight of value network, policy network weight is set to 1
            value_weight=0.5,
            # (float) The loss weight of entropy regularization, policy network weight is set to 1
            entropy_weight=0.01,
            # (float) PPO clip ratio, defaults to 0.2
            clip_ratio=0.2,
            # (bool) Whether to use advantage norm in a whole training batch
            adv_norm=False,
            # (int) The frequency(normal update times) of auxiliary phase training
            aux_freq=5,
            # (int) The training epochs of auxiliary phase
            aux_train_epoch=6,
            # (int) The loss weight of behavioral_cloning in auxiliary phase
            aux_bc_weight=1,
            # (bool) Whether to ignore the done flag (useful for fake-terminate envs, e.g. time-limit episodes).
            ignore_done=False,
        ),
        collect=dict(
            # n_sample=64,
            # (int) Trajectory unroll length for each training sample.
            unroll_len=1,
            # ==============================================================
            # The following configs is algorithm-specific
            # ==============================================================
            # (float) Reward's future discount factor, aka. gamma.
            discount_factor=0.99,
            # (float) GAE lambda factor for the balance of bias and variance(1-step td and mc)
            gae_lambda=0.95,
        ),
        eval=dict(),
        other=dict(
            replay_buffer=dict(
                # PPG use two separate buffer for different reuse
                multi_buffer=True,
                policy=dict(replay_buffer_size=1000, ),
                value=dict(replay_buffer_size=1000, ),
            ),
        ),
    )

    def default_model(self) -> Tuple[str, List[str]]:
        """
        Overview:
            Return this algorithm default neural network model setting for demonstration. ``__init__`` method will \
            automatically call this method to get the default model setting and create model.

        Returns:
            - model_info (:obj:`Tuple[str, List[str]]`): The registered model name and model's import_names.

        .. note::
            The user can define and use customized network model but must obey the same interface definition indicated \
            by import_names path.
        """
        return 'ppg', ['ding.model.template.ppg']

    def _init_learn(self) -> None:
        """
        Overview:
            Initialize the learn mode of policy, including related attributes and modules. For PPG, it mainly \
            contains optimizer, algorithm-specific arguments such as aux_bc_weight and aux_train_epoch. This method \
            also executes some special network initializations and prepares running mean/std monitor for value. \
            This method will be called in ``__init__`` method if ``learn`` field is in ``enable_field``.

        .. note::
            For the member variables that need to be saved and loaded, please refer to the ``_state_dict_learn`` \
            and ``_load_state_dict_learn`` methods.

        .. note::
            For the member variables that need to be monitored, please refer to the ``_monitor_vars_learn`` method.

        .. note::
            If you want to set some special member variables in ``_init_learn`` method, you'd better name them \
            with prefix ``_learn_`` to avoid conflict with other modes, such as ``self._learn_attr1``.
        """
        # Two optimizers: one for the shared actor-critic (policy phase + aux joint loss),
        # one for the separate auxiliary critic head (value phase + extra aux value training).
        self._optimizer_ac = Adam(self._model.actor_critic.parameters(), lr=self._cfg.learn.learning_rate)
        self._optimizer_aux_critic = Adam(self._model.aux_critic.parameters(), lr=self._cfg.learn.learning_rate)
        self._learn_model = model_wrap(self._model, wrapper_name='base')

        # Algorithm config
        self._priority = self._cfg.priority
        self._priority_IS_weight = self._cfg.priority_IS_weight
        assert not self._priority and not self._priority_IS_weight, "Priority is not implemented in PPG"
        self._value_weight = self._cfg.learn.value_weight
        self._entropy_weight = self._cfg.learn.entropy_weight
        self._clip_ratio = self._cfg.learn.clip_ratio
        self._adv_norm = self._cfg.learn.adv_norm

        # Main model
        self._learn_model.reset()

        # Auxiliary memories: value-phase batches accumulated between aux phases,
        # consumed (and cleared) by ``learn_aux`` every ``aux_freq`` train iterations.
        self._aux_train_epoch = self._cfg.learn.aux_train_epoch
        self._train_iteration = 0
        self._aux_memories = []
        self._aux_bc_weight = self._cfg.learn.aux_bc_weight

    def _data_preprocess_learn(self, data: Dict[str, List[Any]]) -> dict:
        """
        Overview:
            Preprocess the data to fit the required data format for learning, including \
            collate(stack data into batch), ignore done(in some fake terminate env),\
            prepare loss weight per training sample, and cpu tensor to cuda.
        Arguments:
            - data (:obj:`Dict[str, List[Any]]`): The data collected from collect function, keyed by buffer name \
              (``'policy'`` and ``'value'``), each value being a list of transitions.
        Returns:
            - data (:obj:`Dict[str, Any]`): The processed data, including at least ['done', 'weight'] per buffer.
        """
        # data preprocess: collate each buffer's sample list into batched tensors
        for k, data_item in data.items():
            data_item = default_collate(data_item)
            ignore_done = self._cfg.learn.ignore_done
            if ignore_done:
                data_item['done'] = None
            else:
                data_item['done'] = data_item['done'].float()
            # weight is None here: no priority/IS weighting is used in PPG
            data_item['weight'] = None
            data[k] = data_item
        if self._cuda:
            data = to_device(data, self._device)
        return data

    def _forward_learn(self, data: dict) -> Dict[str, Any]:
        """
        Overview:
            Forward and backward function of learn mode.
        Arguments:
            - data (:obj:`Dict[str, Any]`): Input data used for policy forward, including the \
                collected training samples from replay buffer. For each element in dict, the key of the \
                dict is the name of data items and the value is the corresponding data. Usually, \
                the class type of value is either torch.Tensor or np.ndarray, or a dict/list containing \
                either torch.Tensor or np.ndarray items In the ``_forward_learn`` method, data \
                often need to first be stacked in the batch dimension by some utility functions such as \
                ``default_preprocess_learn``. \
                For PPGOff, each element in list is a dict containing at least the following keys: ``obs``, \
                ``action``, ``reward``, ``logit``, ``value``, ``done``. Sometimes, it also contains other keys \
                such as ``weight``.
        Returns:
            - info_dict (:obj:`Dict[str, Any]`): Dict type data, a info dict indicated training result, which will be \
                recorded in text log and tensorboard, values are python scalar or a list of scalars. \
                For the detailed definition of the dict, refer to the code of ``_monitor_vars_learn`` method.

        ReturnsKeys:
            - necessary: "policy_cur_lr", "value_cur_lr", "total_loss", "policy_loss", "value_loss", \
                "entropy_loss", "policy_adv_abs_max", "approx_kl", "clipfrac".

            - optional (only when the auxiliary phase runs this iteration): "aux_value_loss", "auxiliary_loss", \
                "behavioral_cloning_loss".

            - policy_cur_lr / value_cur_lr (:obj:`float`): Current learning rate of each optimizer.
            - total_loss (:obj:`float`): The calculated loss.
            - policy_loss (:obj:`float`): The policy(actor) loss of ppg.
            - value_loss (:obj:`float`): The value(critic) loss of ppg.
            - entropy_loss (:obj:`float`): The entropy loss.
            - auxiliary_loss (:obj:`float`): The auxiliary loss, we use the value function loss \
                as the auxiliary objective, thereby sharing features between the policy and value function\
                while minimizing distortions to the policy.
            - aux_value_loss (:obj:`float`): The auxiliary value loss, we need to train the value network extra \
                during the auxiliary phase, it's the value loss we train the value network during auxiliary phase.
            - behavioral_cloning_loss (:obj:`float`): The behavioral cloning loss, used to optimize the auxiliary\
                objective while otherwise preserving the original policy.
        """
        data = self._data_preprocess_learn(data)
        # ====================
        # PPG forward
        # ====================
        self._learn_model.train()
        policy_data, value_data = data['policy'], data['value']
        policy_adv, value_adv = policy_data['adv'], value_data['adv']
        # value target = old value estimate + GAE advantage
        return_ = value_data['value'] + value_adv
        if self._adv_norm:
            # Normalize advantage in a total train_batch
            policy_adv = (policy_adv - policy_adv.mean()) / (policy_adv.std() + 1e-8)
            value_adv = (value_adv - value_adv.mean()) / (value_adv.std() + 1e-8)
        # Policy Phase(Policy): PPO clipped surrogate + entropy bonus, updates the actor-critic optimizer
        policy_output = self._learn_model.forward(policy_data['obs'], mode='compute_actor')
        policy_error_data = ppo_policy_data(
            policy_output['logit'], policy_data['logit'], policy_data['action'], policy_adv, policy_data['weight'], None
        )
        ppo_policy_loss, ppo_info = ppo_policy_error(policy_error_data, self._clip_ratio)
        policy_loss = ppo_policy_loss.policy_loss - self._entropy_weight * ppo_policy_loss.entropy_loss
        self._optimizer_ac.zero_grad()
        policy_loss.backward()
        self._optimizer_ac.step()

        # Policy Phase(Value): clipped value loss, updates the auxiliary-critic optimizer
        value_output = self._learn_model.forward(value_data['obs'], mode='compute_critic')
        value_error_data = ppo_value_data(value_output['value'], value_data['value'], return_, value_data['weight'])
        value_loss = self._value_weight * ppo_value_error(value_error_data, self._clip_ratio)
        self._optimizer_aux_critic.zero_grad()
        value_loss.backward()
        self._optimizer_aux_critic.step()

        # ====================
        # PPG update
        # use aux loss after iterations and reset aux_memories
        # ====================

        # Auxiliary Phase
        # record data for auxiliary head (deep-copied so later in-place changes don't corrupt the memory)
        data = data['value']
        data['return_'] = return_.data
        self._aux_memories.append(copy.deepcopy(data))

        self._train_iteration += 1
        total_loss = policy_loss + value_loss
        if self._train_iteration % self._cfg.learn.aux_freq == 0:
            # run the auxiliary phase every aux_freq normal updates
            aux_loss, bc_loss, aux_value_loss = self.learn_aux()
            total_loss += aux_loss + bc_loss + aux_value_loss
            return {
                'policy_cur_lr': self._optimizer_ac.defaults['lr'],
                'value_cur_lr': self._optimizer_aux_critic.defaults['lr'],
                'policy_loss': ppo_policy_loss.policy_loss.item(),
                'value_loss': value_loss.item(),
                'entropy_loss': ppo_policy_loss.entropy_loss.item(),
                'policy_adv_abs_max': policy_adv.abs().max().item(),
                'approx_kl': ppo_info.approx_kl,
                'clipfrac': ppo_info.clipfrac,
                'aux_value_loss': aux_value_loss,
                'auxiliary_loss': aux_loss,
                'behavioral_cloning_loss': bc_loss,
                'total_loss': total_loss.item(),
            }
        else:
            return {
                'policy_cur_lr': self._optimizer_ac.defaults['lr'],
                'value_cur_lr': self._optimizer_aux_critic.defaults['lr'],
                'policy_loss': ppo_policy_loss.policy_loss.item(),
                'value_loss': value_loss.item(),
                'entropy_loss': ppo_policy_loss.entropy_loss.item(),
                'policy_adv_abs_max': policy_adv.abs().max().item(),
                'approx_kl': ppo_info.approx_kl,
                'clipfrac': ppo_info.clipfrac,
                'total_loss': total_loss.item(),
            }

    def _state_dict_learn(self) -> Dict[str, Any]:
        """
        Overview:
            Return the state_dict of learn mode, usually including model and optimizer.
        Returns:
            - state_dict (:obj:`Dict[str, Any]`): the dict of current policy learn state, for saving and restoring.
        """
        return {
            'model': self._learn_model.state_dict(),
            'optimizer_ac': self._optimizer_ac.state_dict(),
            'optimizer_aux_critic': self._optimizer_aux_critic.state_dict(),
        }

    def _load_state_dict_learn(self, state_dict: Dict[str, Any]) -> None:
        """
        Overview:
            Load the state_dict variable into policy learn mode.
        Arguments:
            - state_dict (:obj:`Dict[str, Any]`): the dict of policy learn state saved before.\
                When the value is distilled into the policy network, we need to make sure the policy \
                network does not change the action predictions, we need two optimizers, \
                _optimizer_ac is used in policy net, and _optimizer_aux_critic is used in value net.

        .. tip::
            If you want to only load some parts of model, you can simply set the ``strict`` argument in \
            load_state_dict to ``False``, or refer to ``ding.torch_utils.checkpoint_helper`` for more \
            complicated operation.
        """
        self._learn_model.load_state_dict(state_dict['model'])
        self._optimizer_ac.load_state_dict(state_dict['optimizer_ac'])
        self._optimizer_aux_critic.load_state_dict(state_dict['optimizer_aux_critic'])

    def _init_collect(self) -> None:
        """
        Overview:
            Initialize the collect mode of policy, including related attributes and modules. For PPO, it contains the \
            collect_model to balance the exploration and exploitation (e.g. the multinomial sample mechanism in \
            discrete action space), and other algorithm-specific arguments such as unroll_len and gae_lambda.
            This method will be called in ``__init__`` method if ``collect`` field is in ``enable_field``.

        .. note::
            If you want to set some special member variables in ``_init_collect`` method, you'd better name them \
            with prefix ``_collect_`` to avoid conflict with other modes, such as ``self._collect_attr1``.
        """
        self._unroll_len = self._cfg.collect.unroll_len
        # multinomial sampling over the actor logits gives stochastic exploration during collection
        self._collect_model = model_wrap(self._model, wrapper_name='multinomial_sample')
        # TODO continuous action space exploration
        self._collect_model.reset()
        self._gamma = self._cfg.collect.discount_factor
        self._gae_lambda = self._cfg.collect.gae_lambda

    def _forward_collect(self, data: dict) -> dict:
        """
        Overview:
            Policy forward function of collect mode (collecting training data by interacting with envs). Forward means \
            that the policy gets some necessary data (mainly observation) from the envs and then returns the output \
            data, such as the action to interact with the envs.

        Arguments:
            - data (:obj:`Dict[str, Any]`): Dict type data, stacked env data for predicting policy_output(action), \
                values are torch.Tensor or np.ndarray or dict/list combinations, keys are env_id indicated by integer.

        Returns:
            - output (:obj:`Dict[int, Any]`): The output data of policy forward, including at least the action and \
                other necessary data (action logit and value) for learn mode defined in \
                ``self._process_transition`` method. The key of the dict is the same as the input data, \
                i.e. environment id.

        .. tip::
            If you want to add more tricks on this policy, like temperature factor in multinomial sample, you can pass \
            related data as extra keyword arguments of this method.

        .. note::
            The input value can be torch.Tensor or dict/list combinations and current policy supports all of them. \
            For the data type that not supported, the main reason is that the corresponding model does not support it. \
            You can implement you own model rather than use the default model. For more information, please raise an \
            issue in GitHub repo and we will continue to follow up.

        .. note::
            For more detailed examples, please refer to our unittest for PPGOffPolicy: ``ding.policy.tests.test_ppg``.
        """
        data_id = list(data.keys())
        data = default_collate(list(data.values()))
        if self._cuda:
            data = to_device(data, self._device)
        self._collect_model.eval()
        with torch.no_grad():
            # need both actor (action/logit) and critic (value) outputs to build transitions
            output = self._collect_model.forward(data, mode='compute_actor_critic')
        if self._cuda:
            output = to_device(output, 'cpu')
        output = default_decollate(output)
        return {i: d for i, d in zip(data_id, output)}

    def _process_transition(self, obs: Any, model_output: dict, timestep: namedtuple) -> dict:
        """
        Overview:
            Process and pack one timestep transition data into a dict, which can be directly used for training and \
            saved in replay buffer. For PPG, it contains obs, next_obs, action, reward, done, logit, value.
        Arguments:
            - obs (:obj:`Any`): Env observation
            - model_output (:obj:`dict`): The output of the policy network with the observation \
                as input. For PPG, it contains the state value, action and the logit of the action.
            - timestep (:obj:`namedtuple`): The execution result namedtuple returned by the environment step \
                method, except all the elements have been transformed into tensor data. Usually, it contains the next \
                obs, reward, done, info, etc.
        Returns:
            - transition (:obj:`dict`): The processed transition data of the current timestep.

        .. note::
            ``next_obs`` is used to calculate nstep return when necessary, so we place in into transition by default. \
            You can delete this field to save memory occupancy if you do not need nstep return.
        """
        transition = {
            'obs': obs,
            'next_obs': timestep.obs,
            'logit': model_output['logit'],
            'action': model_output['action'],
            'value': model_output['value'],
            'reward': timestep.reward,
            'done': timestep.done,
        }
        return transition

    def _get_train_sample(self, data: list) -> Union[None, List[Any]]:
        """
        Overview:
            For a given trajectory (transitions, a list of transition) data, process it into a list of sample that \
            can be used for training directly. In PPG, a train sample is a processed transition with new computed \
            ``adv`` field. This method is usually used in collectors to execute necessary. \
            RL data preprocessing before training, which can help learner amortize relevant time consumption. \
            In addition, you can also implement this method as an identity function and do the data processing \
            in ``self._forward_learn`` method.
        Arguments:
            - data (:obj:`list`): The trajectory data (a list of transition), each element is \
                the same format as the return value of ``self._process_transition`` method.
        Returns:
            - samples (:obj:`dict`): The processed train samples, each element is the similar format \
                as input transitions, but may contain more data for training, such as GAE advantage.
        """
        data = get_gae_with_default_last_value(
            data,
            data[-1]['done'],
            gamma=self._gamma,
            gae_lambda=self._gae_lambda,
            cuda=False,
        )
        data = get_train_sample(data, self._unroll_len)
        # each sample is routed to both the policy buffer and the value buffer
        for d in data:
            d['buffer_name'] = ["policy", "value"]
        return data

    def _get_batch_size(self) -> Dict[str, int]:
        """
        Overview:
            Get learn batch size. In the PPG algorithm, different networks require different data.\
            We need to get data['policy'] and data['value'] to train policy net and value net,\
            this function is used to get the batch size of data['policy'] and data['value'].
        Returns:
            - output (:obj:`dict[str, int]`): Dict type data, including str type batch size and int type batch size.
        """
        bs = self._cfg.learn.batch_size
        return {'policy': bs, 'value': bs}

    def _init_eval(self) -> None:
        """
        Overview:
            Initialize the eval mode of policy, including related attributes and modules. For PPG, it contains the \
            eval model to select optimal action (e.g. greedily select action with argmax mechanism in discrete \
            action). This method will be called in ``__init__`` method if ``eval`` field is in ``enable_field``.

        .. note::
            If you want to set some special member variables in ``_init_eval`` method, you'd better name them \
            with prefix ``_eval_`` to avoid conflict with other modes, such as ``self._eval_attr1``.
        """
        self._eval_model = model_wrap(self._model, wrapper_name='argmax_sample')
        self._eval_model.reset()

    def _forward_eval(self, data: dict) -> dict:
        r"""
        Overview:
            Policy forward function of eval mode (evaluation policy performance by interacting with envs). Forward \
            means that the policy gets some necessary data (mainly observation) from the envs and then returns the \
            action to interact with the envs. ``_forward_eval`` in PPG often uses deterministic sample method to get \
            actions while ``_forward_collect`` usually uses stochastic sample method for balance exploration and \
            exploitation.
        Arguments:
            - data (:obj:`Dict[str, Any]`): The input data used for policy forward, including at least the obs. The \
                key of the dict is environment id and the value is the corresponding data of the env.

        Returns:
            - output (:obj:`Dict[int, Any]`): The output data of policy forward, including at least the action. The \
                key of the dict is the same as the input data, i.e. environment id.

        .. note::
            The input value can be torch.Tensor or dict/list combinations and current policy supports all of them. \
            For the data type that not supported, the main reason is that the corresponding model does not support it. \
            You can implement you own model rather than use the default model. For more information, please raise an \
            issue in GitHub repo and we will continue to follow up.

        .. note::
            For more detailed examples, please refer to our unittest for PPGOffPolicy: ``ding.policy.tests.test_ppg``.
        """
        data_id = list(data.keys())
        data = default_collate(list(data.values()))
        if self._cuda:
            data = to_device(data, self._device)
        self._eval_model.eval()
        with torch.no_grad():
            # eval only needs the actor output; argmax_sample wrapper picks the greedy action
            output = self._eval_model.forward(data, mode='compute_actor')
        if self._cuda:
            output = to_device(output, 'cpu')
        output = default_decollate(output)
        return {i: d for i, d in zip(data_id, output)}

    def _monitor_vars_learn(self) -> List[str]:
        """
        Overview:
            Return the necessary keys for logging the return dict of ``self._forward_learn``. The logger module, such \
            as text logger, tensorboard logger, will use these keys to save the corresponding data.
        Returns:
            - vars (:obj:`List[str]`): The list of the necessary keys to be logged.
        """
        return [
            'policy_cur_lr',
            'value_cur_lr',
            'policy_loss',
            'value_loss',
            'entropy_loss',
            'policy_adv_abs_max',
            'approx_kl',
            'clipfrac',
            'aux_value_loss',
            'auxiliary_loss',
            'behavioral_cloning_loss',
        ]

    def learn_aux(self) -> Tuple[float, float, float]:
        """
        Overview:
            The auxiliary phase training, where the value is distilled into the policy network. In PPG algorithm, \
            we use the value function loss as the auxiliary objective, thereby sharing features between the policy \
            and value function while minimizing distortions to the policy. We also use behavioral cloning loss to \
            optimize the auxiliary objective while otherwise preserving the original policy.
        Returns:
            - aux_loss (:obj:`Tuple[float, float, float]`): Including average auxiliary loss, \
                average behavioral cloning loss, and average auxiliary value loss.
        """
        aux_memories = self._aux_memories
        # gather states and target values into one tensor
        data = {}
        states = []
        actions = []
        return_ = []
        old_values = []
        weights = []
        for memory in aux_memories:
            # for memory in memories:
            states.append(memory['obs'])
            actions.append(memory['action'])
            return_.append(memory['return_'])
            old_values.append(memory['value'])
            # weight is None when no priority/IS weighting was used; substitute all-ones
            if memory['weight'] is None:
                weight = torch.ones_like(memory['action'])
            else:
                weight = torch.tensor(memory['weight'])
            weights.append(weight)

        data['obs'] = torch.cat(states)
        data['action'] = torch.cat(actions)
        data['return_'] = torch.cat(return_)
        data['value'] = torch.cat(old_values)
        data['weight'] = torch.cat(weights)
        # compute current policy logit_old: snapshot of the policy before aux updates,
        # used as the behavioral-cloning (KL) target during the aux epochs
        with torch.no_grad():
            data['logit_old'] = self._model.forward(data['obs'], mode='compute_actor')['logit']

        # prepared dataloader for auxiliary phase training
        dl = create_shuffled_dataloader(data, self._cfg.learn.batch_size)

        # the proposed auxiliary phase training
        # where the value is distilled into the policy network,
        # while making sure the policy network does not change the action predictions (kl div loss)

        i = 0
        auxiliary_loss_ = 0
        behavioral_cloning_loss_ = 0
        value_loss_ = 0

        for epoch in range(self._aux_train_epoch):
            for data in dl:
                policy_output = self._model.forward(data['obs'], mode='compute_actor_critic')

                # Calculate ppg error 'logit_new', 'logit_old', 'action', 'value_new', 'value_old', 'return_', 'weight'
                data_ppg = ppg_data(
                    policy_output['logit'], data['logit_old'], data['action'], policy_output['value'], data['value'],
                    data['return_'], data['weight']
                )
                ppg_joint_loss = ppg_joint_error(data_ppg, self._clip_ratio)
                wb = self._aux_bc_weight
                total_loss = ppg_joint_loss.auxiliary_loss + wb * ppg_joint_loss.behavioral_cloning_loss

                # # policy network loss composes of both the kl div loss as well as the auxiliary loss
                # aux_loss = clipped_value_loss(policy_values, rewards, old_values, self.value_clip)
                # loss_kl = F.kl_div(action_logprobs, old_action_probs, reduction='batchmean')
                # policy_loss = aux_loss + loss_kl

                self._optimizer_ac.zero_grad()
                total_loss.backward()
                self._optimizer_ac.step()

                # paper says it is important to train the value network extra during the auxiliary phase
                # Calculate ppg error 'value_new', 'value_old', 'return_', 'weight'
                values = self._model.forward(data['obs'], mode='compute_critic')['value']
                data_aux = ppo_value_data(values, data['value'], data['return_'], data['weight'])

                value_loss = ppo_value_error(data_aux, self._clip_ratio)

                self._optimizer_aux_critic.zero_grad()
                value_loss.backward()
                self._optimizer_aux_critic.step()

                # accumulate per-minibatch scalars so the averages below are over all aux minibatches
                auxiliary_loss_ += ppg_joint_loss.auxiliary_loss.item()
                behavioral_cloning_loss_ += ppg_joint_loss.behavioral_cloning_loss.item()
                value_loss_ += value_loss.item()
                i += 1

        # clear the collected auxiliary memories once the aux phase consumed them
        self._aux_memories = []

        return auxiliary_loss_ / i, behavioral_cloning_loss_ / i, value_loss_ / i