ding.reward_model.ngu_reward_model¶
ding.reward_model.ngu_reward_model
¶
RndNGURewardModel
¶
Bases: BaseRewardModel
Overview
inter-episodic/RND reward model for NGU.
The corresponding paper is `Never Give Up: Learning Directed Exploration Strategies`.
EpisodicNGURewardModel
¶
Bases: BaseRewardModel
Overview
Episodic reward model for NGU.
The corresponding paper is `Never Give Up: Learning Directed Exploration Strategies`.
estimate(data)
¶
Rewrite the reward key in each row of the data.
Full Source Code
../ding/reward_model/ngu_reward_model.py
import copy
import random
from typing import Union, Tuple, Dict, List

import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from easydict import EasyDict

from ding.model import FCEncoder, ConvEncoder
from ding.utils import RunningMeanStd
from ding.utils import SequenceType, REWARD_MODEL_REGISTRY
from .base_reward_model import BaseRewardModel


def _has_null_padding(null_flags) -> bool:
    """Return True if the sequence contains at least one null (padding) transition."""
    return torch.nonzero(torch.tensor(null_flags).float()).shape[0] != 0


def _null_start_index(null_flags) -> int:
    """Return the index of the first null (padding) transition.

    Assumes the null padding is contiguous at the tail of the sequence.
    """
    return int(torch.nonzero(torch.tensor(null_flags).float()).squeeze(-1)[0])


def collect_data_and_exclude_null_data_rnd(data_in):
    """
    Overview:
        Extract the obs sequence of each sample in ``data_in``, dropping the trailing
        null-padding transitions. Used to build the RND training set.
    """
    res = []
    for item in data_in:
        if _has_null_padding(item['null']):  # if have null padding in data
            start = _null_start_index(item['null'])
            obs = item['obs'][:start]  # exclude the null padding data
        else:
            obs = item['obs']  # whole sequence data
        res.append(obs)
    return res


def collect_data_rnd(data_in):
    """
    Overview:
        Extract the obs sequence and the null-padding mask of each sample in ``data_in``.
    """
    res = []
    is_null_list = []
    for item in data_in:
        res.append(item['obs'])
        is_null_list.append(item['null'])
    return res, is_null_list


def collect_data_and_exclude_null_data_episodic(data_in):
    """
    Overview:
        Extract the obs and action sequences of each sample in ``data_in``, dropping the
        trailing null-padding transitions. Used to build the episodic (inverse-model)
        training set.
    """
    obs_list = []
    action_list = []
    for item in data_in:
        if _has_null_padding(item['null']):  # if have null padding in data
            start = _null_start_index(item['null'])
            obs = item['obs'][:start]  # exclude the null padding data
            action = item['action'][:start]
        else:
            obs = item['obs']  # whole sequence data
            action = item['action']
        obs_list.append(obs)
        action_list.append(action)
    return obs_list, action_list


def collect_data_episodic(data_in):
    """
    Overview:
        Extract the obs sequence and the null-padding mask of each sample in ``data_in``.
    """
    res = []
    is_null_list = []
    for item in data_in:
        res.append(item['obs'])
        is_null_list.append(item['null'])
    return res, is_null_list


class RndNetwork(nn.Module):
    """
    Overview:
        RND network: a fixed, randomly initialized ``target`` encoder and a trainable
        ``predictor`` encoder. The prediction error between the two embeddings serves
        as the inter-episodic novelty signal.
    """

    def __init__(self, obs_shape: Union[int, SequenceType], hidden_size_list: SequenceType) -> None:
        super(RndNetwork, self).__init__()
        if isinstance(obs_shape, int) or len(obs_shape) == 1:
            self.target = FCEncoder(obs_shape, hidden_size_list)
            self.predictor = FCEncoder(obs_shape, hidden_size_list)
        elif len(obs_shape) == 3:
            self.target = ConvEncoder(obs_shape, hidden_size_list)
            self.predictor = ConvEncoder(obs_shape, hidden_size_list)
        else:
            raise KeyError(
                "not support obs_shape for pre-defined encoder: {}, "
                "please customize your own RND model".format(obs_shape)
            )
        # the target network is never trained
        for param in self.target.parameters():
            param.requires_grad = False

    def forward(self, obs: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]:
        predict_feature = self.predictor(obs)
        with torch.no_grad():
            target_feature = self.target(obs)
        return predict_feature, target_feature


@REWARD_MODEL_REGISTRY.register('rnd-ngu')
class RndNGURewardModel(BaseRewardModel):
    r"""
    Overview:
        Inter-episodic/RND reward model for NGU.
        The corresponding paper is `Never Give Up: Learning Directed Exploration Strategies`.
    """
    config = dict(
        type='rnd-ngu',
        intrinsic_reward_type='add',
        learning_rate=1e-3,
        batch_size=64,
        hidden_size_list=[64, 64, 128],
        update_per_collect=100,
    )

    def __init__(self, config: EasyDict, device: str, tb_logger: 'SummaryWriter') -> None:  # noqa
        super(RndNGURewardModel, self).__init__()
        self.cfg = config
        assert device == "cpu" or device.startswith("cuda")
        self.device = device
        self.tb_logger = tb_logger
        self.reward_model = RndNetwork(config.obs_shape, config.hidden_size_list)
        self.reward_model.to(self.device)
        self.intrinsic_reward_type = config.intrinsic_reward_type
        assert self.intrinsic_reward_type in ['add', 'new', 'assign']
        self.train_data_total = []
        self.train_data = []
        # only the predictor is optimized; the target encoder stays frozen
        self.opt = optim.Adam(self.reward_model.predictor.parameters(), config.learning_rate)
        self.estimate_cnt_rnd = 0
        self._running_mean_std_rnd = RunningMeanStd(epsilon=1e-4)
        self.only_use_last_five_frames = config.only_use_last_five_frames_for_icm_rnd

    def _train(self) -> None:
        # randomly sample a minibatch of single observations
        train_data: list = random.sample(list(self.train_data_cur), self.cfg.batch_size)
        train_data: torch.Tensor = torch.stack(train_data).to(self.device)

        predict_feature, target_feature = self.reward_model(train_data)
        loss = F.mse_loss(predict_feature, target_feature.detach())
        self.opt.zero_grad()
        loss.backward()
        self.opt.step()

    def train(self) -> None:
        if self.only_use_last_five_frames:
            # self.train_data_total shape: list(list) [batch_size, seq_length, obs_dim]
            # stack episode dim, keeping only the last 5 frames of each episode
            self.train_obs = [torch.stack(episode_obs[-5:], dim=0) for episode_obs in self.train_data_total]

            # stack batch dim
            if isinstance(self.cfg.obs_shape, int):
                self.train_data_cur = torch.stack(
                    self.train_obs, dim=0
                ).view(len(self.train_obs) * len(self.train_obs[0]), self.cfg.obs_shape)
            else:  # len(self.cfg.obs_shape) == 3 for image obs
                self.train_data_cur = torch.stack(
                    self.train_obs, dim=0
                ).view(len(self.train_obs) * self.train_obs[0].shape[0], *self.cfg.obs_shape)
        else:
            # flatten the list of episodes into a flat list of observations
            self.train_data_cur = sum(self.train_data_total, [])

        for _ in range(self.cfg.update_per_collect):
            self._train()

    def estimate(self, data: list) -> torch.Tensor:
        """
        Rewrite the reward key in each row of the data.
        """
        obs, is_null = collect_data_rnd(data)
        if isinstance(obs[0], list):  # if obs shape list( list(torch.tensor) )
            obs = sum(obs, [])

        obs = torch.stack(obs).to(self.device)

        with torch.no_grad():
            predict_feature, target_feature = self.reward_model(obs)
            reward = F.mse_loss(predict_feature, target_feature, reduction='none').mean(dim=1)
            self._running_mean_std_rnd.update(reward.cpu().numpy())
            # transform to mean 1 std 1
            reward = 1 + (reward - self._running_mean_std_rnd.mean) / (self._running_mean_std_rnd.std + 1e-11)
            self.estimate_cnt_rnd += 1
            self.tb_logger.add_scalar('rnd_reward/rnd_reward_max', reward.max(), self.estimate_cnt_rnd)
            self.tb_logger.add_scalar('rnd_reward/rnd_reward_mean', reward.mean(), self.estimate_cnt_rnd)
            self.tb_logger.add_scalar('rnd_reward/rnd_reward_min', reward.min(), self.estimate_cnt_rnd)
        return reward

    def collect_data(self, data: list) -> None:
        self.train_data_total.extend(collect_data_and_exclude_null_data_rnd(data))

    def clear_data(self) -> None:
        self.train_data_total.clear()

    def reward_deepcopy(self, train_data):
        """
        This method deepcopies the reward part in train_data, and other parts keep shallow copy,
        to avoid the reward part of train_data in the replay buffer being incorrectly modified.
        """
        train_data_reward_deepcopy = [
            {k: copy.deepcopy(v) if k == 'reward' else v
             for k, v in sample.items()} for sample in train_data
        ]
        return train_data_reward_deepcopy


class InverseNetwork(nn.Module):
    """
    Overview:
        Embedding network trained with an inverse dynamics model: the embeddings of
        ``obs`` and ``next_obs`` are concatenated to predict the action taken in
        between, so the embedding keeps only controllable state features.
    """

    def __init__(self, obs_shape: Union[int, SequenceType], action_shape, hidden_size_list: SequenceType) -> None:
        super(InverseNetwork, self).__init__()
        if isinstance(obs_shape, int) or len(obs_shape) == 1:
            self.embedding_net = FCEncoder(obs_shape, hidden_size_list)
        elif len(obs_shape) == 3:
            self.embedding_net = ConvEncoder(obs_shape, hidden_size_list)
        else:
            raise KeyError(
                "not support obs_shape for pre-defined encoder: {}, please customize your own RND model".
                format(obs_shape)
            )
        self.inverse_net = nn.Sequential(
            nn.Linear(hidden_size_list[-1] * 2, 512), nn.ReLU(inplace=True), nn.Linear(512, action_shape)
        )

    def forward(self, inputs: Dict, inference: bool = False) -> Dict:
        if inference:
            # inference mode only embeds the current obs, no gradients needed
            with torch.no_grad():
                cur_obs_embedding = self.embedding_net(inputs['obs'])
            return cur_obs_embedding
        else:
            # obs: torch.Tensor, next_obs: torch.Tensor
            cur_obs_embedding = self.embedding_net(inputs['obs'])
            next_obs_embedding = self.embedding_net(inputs['next_obs'])
            # get pred action
            obs_plus_next_obs = torch.cat([cur_obs_embedding, next_obs_embedding], dim=-1)
            pred_action_logits = self.inverse_net(obs_plus_next_obs)
            pred_action_probs = nn.Softmax(dim=-1)(pred_action_logits)
            return pred_action_logits, pred_action_probs


@REWARD_MODEL_REGISTRY.register('episodic')
class EpisodicNGURewardModel(BaseRewardModel):
    r"""
    Overview:
        Episodic reward model for NGU.
        The corresponding paper is `Never Give Up: Learning Directed Exploration Strategies`.
    """
    config = dict(
        type='episodic',
        intrinsic_reward_type='add',
        learning_rate=1e-3,
        batch_size=64,
        hidden_size_list=[64, 64, 128],
        update_per_collect=100,
        # means if using rescale trick to the last non-zero reward
        # when combing extrinsic and intrinsic reward.
        # the rescale trick only used in:
        # 1. sparse reward env minigrid, in which the last non-zero reward is a strong positive signal
        # 2. the last reward of each episode directly reflects the agent's completion of the task, e.g. lunarlander
        # Note that the ngu intrinsic reward is a positive value (max value is 5), in these envs,
        # the last non-zero reward should not be overwhelmed by intrinsic rewards, so we need rescale the
        # original last nonzero extrinsic reward.
        last_nonzero_reward_rescale=False,
        # means the rescale value for the last non-zero reward, only used when last_nonzero_reward_rescale is True
        last_nonzero_reward_weight=1,
    )

    def __init__(self, config: EasyDict, device: str, tb_logger: 'SummaryWriter') -> None:  # noqa
        super(EpisodicNGURewardModel, self).__init__()
        self.cfg = config
        assert device == "cpu" or device.startswith("cuda")
        self.device = device
        self.tb_logger = tb_logger
        self.episodic_reward_model = InverseNetwork(config.obs_shape, config.action_shape, config.hidden_size_list)
        self.episodic_reward_model.to(self.device)
        self.intrinsic_reward_type = config.intrinsic_reward_type
        assert self.intrinsic_reward_type in ['add', 'new', 'assign']
        self.train_obs_total = []
        self.train_action_total = []
        self.opt = optim.Adam(self.episodic_reward_model.parameters(), config.learning_rate)
        self.estimate_cnt_episodic = 0
        self._running_mean_std_episodic_dist = RunningMeanStd(epsilon=1e-4)
        self._running_mean_std_episodic_reward = RunningMeanStd(epsilon=1e-4)
        self.only_use_last_five_frames = config.only_use_last_five_frames_for_icm_rnd

    def _train(self) -> None:
        # sample episode's timestep index
        train_index = np.random.randint(low=0, high=self.train_obs.shape[0], size=self.cfg.batch_size)

        train_obs: torch.Tensor = self.train_obs[train_index].to(self.device)  # shape (self.cfg.batch_size, obs_dim)
        train_next_obs: torch.Tensor = self.train_next_obs[train_index].to(self.device)
        train_action: torch.Tensor = self.train_action[train_index].to(self.device)

        train_data = {'obs': train_obs, 'next_obs': train_next_obs}
        pred_action_logits, pred_action_probs = self.episodic_reward_model(train_data)

        inverse_loss = F.cross_entropy(pred_action_logits, train_action.squeeze(-1))
        self.opt.zero_grad()
        inverse_loss.backward()
        self.opt.step()

    def train(self) -> None:
        self.train_next_obs_total = copy.deepcopy(self.train_obs_total)

        if self.only_use_last_five_frames:
            # self.train_obs shape: list(list) [batch_size, seq_length, obs_dim]
            # (obs, next_obs, action) triples come from the last 5 transitions of each episode
            self.train_obs = [torch.stack(episode_obs[-6:-1], dim=0) for episode_obs in self.train_obs_total]
            self.train_next_obs = [torch.stack(episode_obs[-5:], dim=0) for episode_obs in self.train_next_obs_total]
            self.train_action = [
                torch.stack(episode_action[-6:-1], dim=0) for episode_action in self.train_action_total
            ]
        else:
            # shift by one step: obs[t] -> next_obs[t+1]; skip length-1 episodes
            self.train_obs = [
                torch.stack(episode_obs[:-1], dim=0) for episode_obs in self.train_obs_total if len(episode_obs) > 1
            ]
            self.train_next_obs = [
                torch.stack(episode_next_obs[1:], dim=0) for episode_next_obs in self.train_next_obs_total
                if len(episode_next_obs) > 1
            ]
            self.train_action = [
                torch.stack(episode_action[:-1], dim=0) for episode_action in self.train_action_total
                if len(episode_action) > 1
            ]

        # stack batch dim
        self.train_obs = torch.cat(self.train_obs, 0)
        self.train_next_obs = torch.cat(self.train_next_obs, 0)
        self.train_action = torch.cat(self.train_action, 0)

        for _ in range(self.cfg.update_per_collect):
            self._train()

    def _compute_intrinsic_reward(
            self,
            episodic_memory: List,
            current_controllable_state: torch.Tensor,
            k=10,
            kernel_cluster_distance=0.008,
            kernel_epsilon=0.0001,
            c=0.001,
            siminarity_max=8,
    ) -> torch.Tensor:
        # this function is modified from https://github.com/Coac/never-give-up/blob/main/embedding_model.py
        # distances to the k nearest neighbours in the episodic memory
        state_dist = torch.cdist(current_controllable_state.unsqueeze(0), episodic_memory, p=2).squeeze(0).sort()[0][:k]
        self._running_mean_std_episodic_dist.update(state_dist.cpu().numpy())
        # normalize by the running mean distance
        state_dist = state_dist / (self._running_mean_std_episodic_dist.mean + 1e-11)

        state_dist = torch.clamp(state_dist - kernel_cluster_distance, min=0, max=None)
        kernel = kernel_epsilon / (state_dist + kernel_epsilon)
        s = torch.sqrt(torch.clamp(torch.sum(kernel), min=0, max=None)) + c

        if s > siminarity_max:
            print('s > siminarity_max:', s.max(), s.min())
            # BUGFIX: return a float tensor (was an int tensor) so it can be stacked
            # with the other float rewards in ``estimate``
            return torch.tensor(0.)
        return 1 / s
        # average value 1/( ( 10* 1e-4/(1+1e-4) )**(1/2)+1e-3 ) = 30

    def estimate(self, data: list) -> torch.Tensor:
        """
        Rewrite the reward key in each row of the data.
        """
        obs, is_null = collect_data_episodic(data)
        # obs shape list(list()) [batch_size, seq_length, obs_dim]
        batch_size = len(obs)
        seq_length = len(obs[0])

        # stack episode dim
        obs = [torch.stack(episode_obs, dim=0) for episode_obs in obs]

        # stack batch dim
        if isinstance(self.cfg.obs_shape, int):
            obs = torch.stack(obs, dim=0).view(batch_size * seq_length, self.cfg.obs_shape).to(self.device)
        else:  # len(self.cfg.obs_shape) == 3 for image obs
            obs = torch.stack(obs, dim=0).view(batch_size * seq_length, *self.cfg.obs_shape).to(self.device)

        inputs = {'obs': obs, 'is_null': is_null}
        with torch.no_grad():
            cur_obs_embedding = self.episodic_reward_model(inputs, inference=True)
        cur_obs_embedding = cur_obs_embedding.view(batch_size, seq_length, -1)
        episodic_reward = [[] for _ in range(batch_size)]
        null_cnt = 0  # the number of null transitions in the whole minibatch
        for i in range(batch_size):
            for j in range(seq_length):
                if j < 10:
                    # the first 10 steps have too little episodic memory for a kNN estimate
                    episodic_reward[i].append(torch.tensor(0.).to(self.device))
                else:
                    episodic_memory = cur_obs_embedding[i][:j]
                    reward = self._compute_intrinsic_reward(episodic_memory,
                                                            cur_obs_embedding[i][j]).to(self.device)
                    episodic_reward[i].append(reward)

            if _has_null_padding(is_null[i]):
                # TODO(pu): if have null padding, the episodic_reward should be 0
                null_start_index = _null_start_index(is_null[i])
                # add the number of null transitions in i'th sequence in batch
                null_cnt = null_cnt + seq_length - null_start_index
                for k in range(null_start_index, seq_length):
                    # BUGFIX: use a float tensor (was int) so torch.stack below does
                    # not fail on mixed dtypes
                    episodic_reward[i][k] = torch.tensor(0.).to(self.device)

        # list(list(tensor)) -> tensor
        tmp = [torch.stack(episodic_reward_tmp, dim=0) for episodic_reward_tmp in episodic_reward]
        # stack batch dim
        episodic_reward = torch.stack(tmp, dim=0)  # TODO(pu): image case
        episodic_reward = episodic_reward.view(-1)  # (batch_size, seq_length) -> (batch_size * seq_length,)

        episodic_reward_real_mean = sum(episodic_reward) / (
            batch_size * seq_length - null_cnt
        )  # TODO(pu): recompute mean
        self.estimate_cnt_episodic += 1
        self._running_mean_std_episodic_reward.update(episodic_reward.cpu().numpy())

        self.tb_logger.add_scalar(
            'episodic_reward/episodic_reward_max', episodic_reward.max(), self.estimate_cnt_episodic
        )
        self.tb_logger.add_scalar(
            'episodic_reward/episodic_reward_mean', episodic_reward_real_mean, self.estimate_cnt_episodic
        )
        self.tb_logger.add_scalar(
            'episodic_reward/episodic_reward_min', episodic_reward.min(), self.estimate_cnt_episodic
        )
        self.tb_logger.add_scalar(
            'episodic_reward/episodic_reward_std_', episodic_reward.std(), self.estimate_cnt_episodic
        )
        # transform to [0,1]: er01
        # NOTE: other normalizations (batch-mean-1, long-term-mean-1, unit-std) were
        # considered; zero-mean-unit-std is wrong here because the episodic reward must
        # stay positive: rnd_reward lies in [1, 5], and a negative episodic reward would
        # make a larger rnd_reward shrink the total intrinsic reward.
        episodic_reward = (episodic_reward -
                           episodic_reward.min()) / (episodic_reward.max() - episodic_reward.min() + 1e-11)

        return episodic_reward

    def collect_data(self, data: list) -> None:
        train_obs, train_action = collect_data_and_exclude_null_data_episodic(data)
        self.train_obs_total.extend(train_obs)
        self.train_action_total.extend(train_action)

    def clear_data(self) -> None:
        self.train_obs_total = []
        self.train_action_total = []

    def fusion_reward(
        self, train_data, inter_episodic_reward, episodic_reward, nstep, collector_env_num, tb_logger, estimate_cnt
    ):
        """
        Overview:
            Combine the extrinsic reward in ``train_data`` with the NGU intrinsic reward
            ``episodic_reward * clamp(inter_episodic_reward, 1, 5)``, scaled per
            environment index by its exploration factor beta.
        """
        # NOTE: deepcopy reward part of train_data is very important,
        # otherwise the reward of train_data in the replay buffer will be incorrectly modified.
        data = self.reward_deepcopy(train_data)
        estimate_cnt += 1
        # beta grows with the env index: low-index envs are near-greedy, high-index ones explore
        index_to_beta = {
            i: 0.3 * torch.sigmoid(torch.tensor(10 * (2 * i - (collector_env_num - 2)) / (collector_env_num - 2)))
            for i in range(collector_env_num)
        }
        batch_size = len(data)
        seq_length = len(data[0]['reward'])
        device = data[0]['reward'][0].device
        intrinsic_reward_type = 'add'
        intrinsic_rewards = episodic_reward * torch.clamp(inter_episodic_reward, min=1, max=5)
        tb_logger.add_scalar('intrinsic_reward/intrinsic_reward_max', intrinsic_rewards.max(), estimate_cnt)
        tb_logger.add_scalar('intrinsic_reward/intrinsic_reward_mean', intrinsic_rewards.mean(), estimate_cnt)
        tb_logger.add_scalar('intrinsic_reward/intrinsic_reward_min', intrinsic_rewards.min(), estimate_cnt)

        if not isinstance(data[0], (list, dict)):
            # not rnn based rl algorithm
            intrinsic_rewards = intrinsic_rewards.to(device)
            intrinsic_rewards = torch.chunk(intrinsic_rewards, intrinsic_rewards.shape[0], dim=0)
            for item, rew in zip(data, intrinsic_rewards):
                if intrinsic_reward_type == 'add':
                    # BUGFIX: was ``index_to_beta[data['beta']]`` — ``data`` is a list, so
                    # indexing it with a string raises TypeError; beta belongs to each item
                    item['reward'] += rew * index_to_beta[int(item['beta'])]
        else:
            # rnn based rl algorithm
            intrinsic_rewards = intrinsic_rewards.to(device)

            # tensor to tuple
            intrinsic_rewards = torch.chunk(intrinsic_rewards, int(intrinsic_rewards.shape[0]), dim=0)

            if self.cfg.last_nonzero_reward_weight is None and self.cfg.last_nonzero_reward_rescale:
                # for minigrid env
                self.cfg.last_nonzero_reward_weight = seq_length

            # this is for the nstep rl algorithms
            for i in range(batch_size):  # batch_size typically 64
                for j in range(seq_length):  # burnin+unroll_len is the sequence length, e.g. 100=2+98
                    if j < seq_length - nstep:
                        intrinsic_reward = torch.cat(
                            [intrinsic_rewards[i * seq_length + j + k] for k in range(nstep)], dim=0
                        )
                        if not data[i]['null'][j]:
                            # if data[i]['null'][j]==True, it's null data; only the not null data
                            # gets an intrinsic reward added
                            if data[i]['done'][j] and self.cfg.last_nonzero_reward_rescale:
                                # if not null data, and data[i]['done'][j]==True, this is the last
                                # nstep transition in the original data.
                                # The rescale trick is used when combining extrinsic and intrinsic
                                # reward, only in sparse reward envs (e.g. minigrid) where the last
                                # non-zero reward is a strong positive signal that should not be
                                # overwhelmed by intrinsic rewards.
                                for k in reversed(range(nstep)):
                                    # find the last nonzero reward in the nstep reward list
                                    # data[i]['reward'][j]; the sequence length is assumed large
                                    # enough to cover the whole episode plus null padding.
                                    # TODO(pu): what should we do if the last reward in the whole
                                    # episode is zero?
                                    if data[i]['reward'][j][k] != 0:
                                        # find the last nonzero one and enlarge it
                                        # <last_nonzero_reward_weight> times
                                        last_nonzero_rew = copy.deepcopy(data[i]['reward'][j][k])
                                        data[i]['reward'][j][k] = \
                                            self.cfg.last_nonzero_reward_weight * last_nonzero_rew + \
                                            intrinsic_reward[k] * index_to_beta[int(data[i]['beta'][j])]
                                        break
                            else:
                                data[i]['reward'][j] = data[i]['reward'][j] + intrinsic_reward * index_to_beta[
                                    int(data[i]['beta'][j])]

        return data, estimate_cnt