Skip to content

ding.policy.qgpo

ding.policy.qgpo

QGPOPolicy

Bases: Policy

Overview

Policy class of the QGPO algorithm (https://arxiv.org/abs/2304.12824), from the paper "Contrastive Energy Prediction for Exact Energy-Guided Diffusion Sampling in Offline Reinforcement Learning".

Interfaces: __init__, forward, learn, eval, state_dict, load_state_dict

Full Source Code

../ding/policy/qgpo.py

#############################################################
# This QGPO model is a modification implementation from https://github.com/ChenDRAG/CEP-energy-guided-diffusion
#############################################################

from typing import List, Dict, Any
import torch
from ding.utils import POLICY_REGISTRY
from ding.utils.data import default_collate
from ding.torch_utils import to_device
from .base_policy import Policy


@POLICY_REGISTRY.register('qgpo')
class QGPOPolicy(Policy):
    """
    Overview:
        Policy class of QGPO algorithm (https://arxiv.org/abs/2304.12824).
        Contrastive Energy Prediction for Exact Energy-Guided Diffusion Sampling in Offline Reinforcement Learning
    Interfaces:
        ``__init__``, ``forward``, ``learn``, ``eval``, ``state_dict``, ``load_state_dict``
    """

    config = dict(
        # (str) RL policy register name (refer to function "POLICY_REGISTRY").
        type='qgpo',
        # (bool) Whether to use cuda for network.
        cuda=False,
        # (bool type) on_policy: Determine whether on-policy or off-policy.
        # on-policy setting influences the behaviour of buffer.
        # Default False in QGPO.
        on_policy=False,
        multi_agent=False,
        model=dict(
            qgpo_critic=dict(
                # (float) The scale of the energy guidance when training qt.
                # \pi_{behavior}\exp(f(s,a)) \propto \pi_{behavior}\exp(alpha * Q(s,a))
                alpha=3,
                # (float) The scale of the energy guidance when training q0.
                # \mathcal{T}Q(s,a)=r(s,a)+\mathbb{E}_{s'\sim P(s'|s,a),a'\sim\pi_{support}(a'|s')}Q(s',a')
                # \pi_{support} \propto \pi_{behavior}\exp(q_alpha * Q(s,a))
                q_alpha=1,
            ),
            device='cuda',
            # obs_dim
            # action_dim
        ),
        learn=dict(
            # (float) learning rate for behavior model training
            learning_rate=1e-4,
            # (int) batch size during the training of behavior model
            batch_size=4096,
            # (int) batch size during the training of q value
            batch_size_q=256,
            # (int) number of fake action support
            M=16,
            # (int) number of diffusion time steps
            diffusion_steps=15,
            # (int) training iterations when behavior model is fixed
            behavior_policy_stop_training_iter=600000,
            # (int) training iterations when energy-guided policy begin training
            energy_guided_policy_begin_training_iter=600000,
            # (int) training iterations after which the q value stops training.
            # NOTE: must be an int; the learn loop decrements this counter each
            # iteration, so ``None`` is not supported despite older docs.
            q_value_stop_training_iter=1100000,
        ),
        eval=dict(
            # (int) number of diffusion time steps used when sampling actions in evaluation.
            # Added so that ``_init_eval`` (which reads ``cfg.eval.diffusion_steps``) works
            # with the default config; keep consistent with ``learn.diffusion_steps``.
            diffusion_steps=15,
            # (List[float]) energy guidance scale for policy in evaluation
            # \pi_{evaluation} \propto \pi_{behavior}\exp(guidance_scale * alpha * Q(s,a))
            guidance_scale=[0.0, 1.0, 2.0, 3.0, 5.0, 8.0, 10.0],
        ),
    )

    def _init_learn(self) -> None:
        """
        Overview:
            Learn mode initialization method. For QGPO, it mainly contains the optimizer, \
            algorithm-specific arguments such as qt_update_momentum, discount, behavior_policy_stop_training_iter, \
            energy_guided_policy_begin_training_iter and q_value_stop_training_iter, etc.
            This method will be called in ``__init__`` method if ``learn`` field is in ``enable_field``.
        """
        self.cuda = self._cfg.cuda

        # Optimizer for the diffusion-based behavior model (score function).
        self.behavior_model_optimizer = torch.optim.Adam(
            self._model.score_model.parameters(), lr=self._cfg.learn.learning_rate
        )
        # Optimizers for q0 (value) and qt (energy guidance); lr is hard-coded
        # upstream at 3e-4 and intentionally kept as-is for reproducibility.
        self.q_optimizer = torch.optim.Adam(self._model.q.q0.parameters(), lr=3e-4)
        self.qt_optimizer = torch.optim.Adam(self._model.q.qt.parameters(), lr=3e-4)

        # Soft-update momentum for the q0 target network (Polyak averaging).
        self.qt_update_momentum = 0.005
        # Discount factor for the Bellman target.
        self.discount = 0.99

        # Iteration counters controlling the staged training schedule
        # (behavior model first, then q value / energy guidance).
        self.behavior_policy_stop_training_iter = self._cfg.learn.behavior_policy_stop_training_iter
        self.energy_guided_policy_begin_training_iter = self._cfg.learn.energy_guided_policy_begin_training_iter
        self.q_value_stop_training_iter = self._cfg.learn.q_value_stop_training_iter

    def _forward_learn(self, data: dict) -> Dict[str, Any]:
        """
        Overview:
            Forward function for learning mode.
            The training of QGPO algorithm is based on contrastive energy prediction, \
            which needs true action and fake action. The true action is sampled from the dataset, and the fake action \
            is sampled from the action support generated by the behavior policy.
            The training process is divided into two stages:
            1. Train the behavior model, which is modeled as a diffusion model by parameterizing the score function.
            2. Train the Q function by fake action support generated by the behavior model.
            3. Train the energy-guided policy by the Q function.
        Arguments:
            - data (:obj:`dict`): Dict type data with keys ``s`` (state), ``a`` (action), ``r`` (reward), \
              ``s_`` (next state), ``d`` (done), ``fake_a`` / ``fake_a_`` (fake action supports).
        Returns:
            - result (:obj:`dict`): Dict type data of algorithm results.
        """

        if self.cuda:
            data = to_device(data, self._device)

        s = data['s']
        a = data['a']
        r = data['r']
        s_ = data['s_']
        d = data['d']
        fake_a = data['fake_a']
        fake_a_ = data['fake_a_']

        # Stage 1: train the behavior model (diffusion score model) until the
        # configured iteration budget is exhausted.
        if self.behavior_policy_stop_training_iter > 0:

            behavior_model_training_loss = self._model.score_model_loss_fn(a, s)

            self.behavior_model_optimizer.zero_grad()
            behavior_model_training_loss.backward()
            self.behavior_model_optimizer.step()

            self.behavior_policy_stop_training_iter -= 1
            behavior_model_training_loss = behavior_model_training_loss.item()
        else:
            behavior_model_training_loss = 0

        # Stage 2/3: train the Q function and the energy guidance once the
        # energy-guided phase has begun.
        self.energy_guided_policy_begin_training_iter -= 1
        self.q_value_stop_training_iter -= 1
        if self.energy_guided_policy_begin_training_iter < 0:
            if self.q_value_stop_training_iter > 0:
                q0_loss = self._model.q_loss_fn(a, s, r, s_, d, fake_a_, discount=self.discount)

                self.q_optimizer.zero_grad()
                q0_loss.backward()
                self.q_optimizer.step()

                # Polyak soft update of the q0 target network.
                for param, target_param in zip(self._model.q.q0.parameters(), self._model.q.q0_target.parameters()):
                    target_param.data.copy_(
                        self.qt_update_momentum * param.data + (1 - self.qt_update_momentum) * target_param.data
                    )

                q0_loss = q0_loss.item()

            else:
                q0_loss = 0
            # Energy guidance (qt) keeps training as long as the energy-guided
            # phase is active, even after q0 training has stopped.
            qt_loss = self._model.qt_loss_fn(s, fake_a)

            self.qt_optimizer.zero_grad()
            qt_loss.backward()
            self.qt_optimizer.step()

            qt_loss = qt_loss.item()

        else:
            q0_loss = 0
            qt_loss = 0

        total_loss = behavior_model_training_loss + q0_loss + qt_loss

        return dict(
            total_loss=total_loss,
            behavior_model_training_loss=behavior_model_training_loss,
            q0_loss=q0_loss,
            qt_loss=qt_loss,
        )

    def _init_collect(self) -> None:
        """
        Overview:
            Collect mode initialization method. Not supported for QGPO.
        """
        pass

    def _forward_collect(self) -> None:
        """
        Overview:
            Forward function for collect mode. Not supported for QGPO.
        """
        pass

    def _init_eval(self) -> None:
        """
        Overview:
            Eval mode initialization method. For QGPO, it mainly contains the diffusion_steps used for sampling.
            This method will be called in ``__init__`` method if ``eval`` field is in ``enable_field``.
        """

        # Provided by the default config's ``eval.diffusion_steps``.
        self.diffusion_steps = self._cfg.eval.diffusion_steps

    def _forward_eval(self, data: dict, guidance_scale: float) -> dict:
        """
        Overview:
            Forward function for eval mode. The eval process is based on the energy-guided policy, \
            which is modeled as a diffusion model by parameterizing the score function.
        Arguments:
            - data (:obj:`dict`): Dict type data, mapping env id to observation.
            - guidance_scale (:obj:`float`): The scale of the energy guidance.
        Returns:
            - output (:obj:`dict`): Dict type data of algorithm output, mapping env id to ``{"action": ...}``.
        """

        data_id = list(data.keys())
        states = default_collate(list(data.values()))
        actions = self._model.select_actions(
            states, diffusion_steps=self.diffusion_steps, guidance_scale=guidance_scale
        )
        output = actions

        return {i: {"action": d} for i, d in zip(data_id, output)}

    def _get_train_sample(self, transitions: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
        """
        Overview:
            Get the train sample from the replay buffer, currently not supported for QGPO.
        Arguments:
            - transitions (:obj:`List[Dict[str, Any]]`): The data from replay buffer.
        Returns:
            - samples (:obj:`List[Dict[str, Any]]`): The data for training.
        """
        pass

    def _process_transition(self) -> None:
        """
        Overview:
            Process the transition data, currently not supported for QGPO.
        """
        pass

    def _state_dict_learn(self) -> Dict[str, Any]:
        """
        Overview:
            Return the state dict for saving.
        Returns:
            - state_dict (:obj:`Dict[str, Any]`): Dict type data of state dict, containing the model and \
              the behavior model optimizer (q optimizers are not checkpointed here).
        """
        return {
            'model': self._model.state_dict(),
            'behavior_model_optimizer': self.behavior_model_optimizer.state_dict(),
        }

    def _load_state_dict_learn(self, state_dict: Dict[str, Any]) -> None:
        """
        Overview:
            Load the state dict.
        Arguments:
            - state_dict (:obj:`Dict[str, Any]`): Dict type data of state dict.
        """
        self._model.load_state_dict(state_dict['model'])
        self.behavior_model_optimizer.load_state_dict(state_dict['behavior_model_optimizer'])

    def _monitor_vars_learn(self) -> List[str]:
        """
        Overview:
            Return the variables names to be monitored.
        Returns:
            - vars (:obj:`List[str]`): List of monitored variable names, matching the keys \
              returned by ``_forward_learn``.
        """
        return ['total_loss', 'behavior_model_training_loss', 'q0_loss', 'qt_loss']