ding.policy.pc
==============
Full Source Code
../ding/policy/pc.py
import math
from typing import List, Dict, Any, Tuple
from collections import namedtuple

import torch
import torch.nn as nn
from torch.optim import Adam, SGD, AdamW
from torch.optim.lr_scheduler import LambdaLR

from ding.policy import Policy
from ding.model import model_wrap
from ding.torch_utils import to_device
from ding.utils import EasyTimer
from ding.utils import POLICY_REGISTRY


@POLICY_REGISTRY.register('pc_bfs')
class ProcedureCloningBFSPolicy(Policy):
    """Procedure-cloning policy that imitates BFS expansion maps on grid mazes.

    At evaluation time the model is applied repeatedly to a (state, partial
    BFS action map) pair until the map assigns an action to the agent's
    current cell, or the ``max_bfs_steps`` budget is exhausted.
    """

    def default_model(self) -> Tuple[str, List[str]]:
        """Return the default model registry name and its import path."""
        return 'pc_bfs', ['ding.model.template.procedure_cloning']

    config = dict(
        type='pc',
        cuda=False,
        on_policy=False,
        continuous=False,
        # Upper bound on model applications per evaluated observation.
        max_bfs_steps=100,
        learn=dict(
            update_per_collect=1,
            batch_size=32,
            learning_rate=1e-5,
            lr_decay=False,
            decay_epoch=30,
            decay_rate=0.1,
            warmup_lr=1e-4,
            warmup_epoch=3,
            optimizer='SGD',
            momentum=0.9,
            weight_decay=1e-4,
        ),
        collect=dict(
            unroll_len=1,
            noise=False,
            noise_sigma=0.2,
            noise_range=dict(
                min=-0.5,
                max=0.5,
            ),
        ),
        eval=dict(),
        other=dict(replay_buffer=dict(replay_buffer_size=10000)),
    )

    def _init_learn(self) -> None:
        """Build the optimizer, optional LR scheduler, loss and learn-mode model."""
        assert self._cfg.learn.optimizer in ['SGD', 'Adam'], self._cfg.learn.optimizer
        if self._cfg.learn.optimizer == 'SGD':
            self._optimizer = SGD(
                self._model.parameters(),
                lr=self._cfg.learn.learning_rate,
                weight_decay=self._cfg.learn.weight_decay,
                momentum=self._cfg.learn.momentum
            )
        elif self._cfg.learn.optimizer == 'Adam':
            # AdamW (decoupled weight decay) is preferred whenever decay is requested.
            if self._cfg.learn.weight_decay is None:
                self._optimizer = Adam(
                    self._model.parameters(),
                    lr=self._cfg.learn.learning_rate,
                )
            else:
                self._optimizer = AdamW(
                    self._model.parameters(),
                    lr=self._cfg.learn.learning_rate,
                    weight_decay=self._cfg.learn.weight_decay
                )
        if self._cfg.learn.lr_decay:

            def lr_scheduler_fn(epoch):
                # Warm-up phase: hold lr at warmup_lr (expressed as a multiplier
                # of the base learning rate, as LambdaLR expects).
                if epoch <= self._cfg.learn.warmup_epoch:
                    return self._cfg.learn.warmup_lr / self._cfg.learn.learning_rate
                # Afterwards: step decay, one `decay_rate` factor every `decay_epoch` epochs.
                ratio = (epoch - self._cfg.learn.warmup_epoch) // self._cfg.learn.decay_epoch
                return math.pow(self._cfg.learn.decay_rate, ratio)

            self._lr_scheduler = LambdaLR(self._optimizer, lr_scheduler_fn)
        self._timer = EasyTimer(cuda=True)
        self._learn_model = model_wrap(self._model, 'base')
        self._learn_model.reset()
        self._max_bfs_steps = self._cfg.max_bfs_steps
        self._maze_size = self._cfg.maze_size
        self._num_actions = self._cfg.num_actions

        self._loss = nn.CrossEntropyLoss()

    def process_states(self, observations, maze_maps):
        """Returns [B, W, W, 3] binary values. Channels are (wall; goal; obs).

        ``observations`` holds (row, col) agent coordinates; they are encoded
        as a one-hot position plane and concatenated onto ``maze_maps``.
        """
        # One-hot encode the flattened (row * W + col) agent position.
        loc = torch.nn.functional.one_hot(
            (observations[:, 0] * self._maze_size + observations[:, 1]).long(),
            self._maze_size * self._maze_size,
        ).long()
        loc = torch.reshape(loc, [observations.shape[0], self._maze_size, self._maze_size])
        # NOTE(review): `loc` is rank-3 here while the docstring implies a
        # channel concatenation; verify `maze_maps`'s layout makes this cat valid.
        states = torch.cat([maze_maps, loc], dim=-1).long()
        return states

    def _forward_learn(self, data):
        """One supervised step: predict the next BFS action map from the current one.

        Returns a dict of logging scalars: current lr, cross-entropy loss and
        per-cell prediction accuracy.
        """
        if self._cuda:
            collated_data = to_device(data, self._device)
        else:
            collated_data = data
        # Fix: the original had a trailing comma here, turning `observations`
        # into a 1-tuple and breaking the torch.cat below.
        observations = collated_data['obs']
        bfs_input_maps, bfs_output_maps = collated_data['bfs_in'].long(), collated_data['bfs_out'].long()
        states = observations
        bfs_input_onehot = torch.nn.functional.one_hot(bfs_input_maps, self._num_actions + 1).float()

        bfs_states = torch.cat([
            states,
            bfs_input_onehot,
        ], dim=-1)
        logits = self._model(bfs_states)['logit']
        # Flatten every leading (batch/spatial) dim so cells become independent samples.
        logits = logits.flatten(0, -2)
        labels = bfs_output_maps.flatten(0, -1)

        loss = self._loss(logits, labels)
        preds = torch.argmax(logits, dim=-1)
        # .item() so 'acc' is a plain float like the other logged scalars.
        acc = torch.sum(preds == labels).item() / preds.shape[0]

        self._optimizer.zero_grad()
        loss.backward()
        self._optimizer.step()
        pred_loss = loss.item()

        # Average lr across param groups for logging.
        cur_lr = [param_group['lr'] for param_group in self._optimizer.param_groups]
        cur_lr = sum(cur_lr) / len(cur_lr)
        return {'cur_lr': cur_lr, 'total_loss': pred_loss, 'acc': acc}

    def _monitor_vars_learn(self) -> List[str]:
        """Variables recorded by the learner logger."""
        return ['cur_lr', 'total_loss', 'acc']

    def _init_eval(self) -> None:
        """Wrap the shared model for evaluation."""
        self._eval_model = model_wrap(self._model, wrapper_name='base')
        self._eval_model.reset()

    def _forward_eval(self, data):
        """Iteratively expand the BFS action map until the agent's cell is labeled.

        ``data`` maps env ids to state tensors; the returned dict maps each id
        to ``{'action': tensor, 'info': {}}``.
        """
        if self._cuda:
            data = to_device(data, self._device)
        max_len = self._max_bfs_steps
        data_id = list(data.keys())
        output = {}

        for ii in data_id:
            states = data[ii].unsqueeze(0)
            # Initialize every cell with the sentinel value `num_actions` ("unlabeled").
            bfs_input_maps = self._num_actions * torch.ones([1, self._maze_size, self._maze_size]).long()
            if self._cuda:
                bfs_input_maps = to_device(bfs_input_maps, self._device)
            # The last state channel one-hot encodes the agent position.
            xy = torch.where(states[:, :, :, -1] == 1)
            observation = (xy[1][0].item(), xy[2][0].item())

            i = 0
            # Re-apply the model until the agent cell gets a real action or budget runs out.
            while bfs_input_maps[0, observation[0], observation[1]].item() == self._num_actions and i < max_len:
                bfs_input_onehot = torch.nn.functional.one_hot(bfs_input_maps, self._num_actions + 1).long()

                bfs_states = torch.cat([
                    states,
                    bfs_input_onehot,
                ], dim=-1)
                logits = self._model(bfs_states)['logit']
                bfs_input_maps = torch.argmax(logits, dim=-1)
                i += 1
            action = bfs_input_maps[0, observation[0], observation[1]]
            # Fix: the original only built the output dict on the CUDA path,
            # so non-CUDA runs crashed on `output[ii]['action']` below.
            if self._cuda:
                action = to_device(action, 'cpu')
            # Fall back to a uniformly random action if the cell was never labeled.
            if action.item() == self._num_actions:
                action = torch.randint(low=0, high=self._num_actions, size=[1])[0]
            output[ii] = {'action': action, 'info': {}}
        return output

    def _init_collect(self) -> None:
        # Collect mode is intentionally unsupported for this offline policy.
        raise NotImplementedError

    def _forward_collect(self, data: Dict[int, Any], **kwargs) -> Dict[int, Any]:
        raise NotImplementedError

    def _process_transition(self, obs: Any, model_output: dict, timestep: namedtuple) -> dict:
        raise NotImplementedError

    def _get_train_sample(self, data: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
        raise NotImplementedError