Skip to content

ding.utils.linklink_dist_helper

Overview

A context manager for linklink distribution

Interfaces: __init__, __enter__, __exit__

Overview

Initialize the DistContext

Overview

Initialize linklink distribution

Overview

Finalize linklink distribution

Arguments: - args (:obj:Tuple): The arguments passed to the __exit__ function. - kwargs (:obj:Dict): The keyword arguments passed to the __exit__ function.

Overview

Get the rank of linklink model, return 0 if use FakeLink.

.. note:: Reference import_helper.try_import_link and linklink.get_rank.

Overview

Get the world_size of linklink model, return 1 if use FakeLink.

.. note:: Reference import_helper.try_import_link and linklink.get_world_size.

Overview

Use linklink.broadcast and raise error when using FakeLink

Arguments: - value (:obj:obj): the value to broadcast - rank (:obj:int): the rank to broadcast on

Overview

Call linklink.allreduce on the data

Arguments: - data (:obj:obj): the data to reduce - op (:obj:str): the operation to perform on data, support ['sum', 'max']

Overview

Call linklink.allreduce_async on the data

Arguments: - data (:obj:obj): the data to reduce - op (:obj:str): the operation to perform on data, support ['sum', 'max']

Overview

Get the group segmentation of group_size each group

Arguments: - group_size (:obj:int): the group_size

Overview

Wrap the function so that it can init and finalize automatically before each call

Arguments: - func (:obj:Callable): the function to wrap

Overview

Init the distribution

Arguments: - method (:obj:str): Support ['slurm', 'single_node'] - device_id (:obj:int): Default device when using single_node method

Overview

Finalize linklink, see linklink.finalize()

Overview

Split the group according to world_size, rank and num_groups

Arguments: - world_size (:obj:int): The world size - rank (:obj:int): The rank - num_groups (:obj:int): The number of groups

.. note:: With faulty input, raise array split does not result in an equal division

Overview

Synchronize the process

Full Source Code

../ding/utils/linklink_dist_helper.py

from functools import lru_cache, wraps
from typing import Callable, Tuple, List, Any

import numpy as np
import torch

from .default_helper import error_wrapper
from .fake_linklink import FakeLink
from .import_helper import try_import_link


@lru_cache()
def get_link():
    """
    Overview:
        Lazily import the ``linklink`` module (or the ``FakeLink`` fallback) and cache it.
    """
    return try_import_link()


@lru_cache()
def is_fake_link():
    """
    Overview:
        Whether the cached link module is the ``FakeLink`` stand-in.
    """
    return isinstance(get_link(), FakeLink)


def get_rank() -> int:
    """
    Overview:
        Get the rank of ``linklink`` model, return 0 if use ``FakeLink``.

    .. note::
        Reference ``import_helper.try_import_link`` and ``linklink.get_rank``.
    """
    if is_fake_link():
        return 0
    return error_wrapper(get_link().get_rank, 0, "[WARNING]: call linklink error, return default_ret.")()


def get_world_size() -> int:
    """
    Overview:
        Get the ``world_size`` of ``linklink model``, return 1 if use ``FakeLink``.

    .. note::
        Reference ``import_helper.try_import_link`` and ``linklink.get_world_size``.
    """
    if is_fake_link():
        return 1
    return error_wrapper(get_link().get_world_size, 1, "[WARNING]: call linklink error, return default_ret.")()


def broadcast(value: torch.Tensor, rank: int) -> None:
    """
    Overview:
        Use ``linklink.broadcast`` and raise error when using ``FakeLink``
    Arguments:
        - value (:obj:`obj`): the value to broadcast
        - rank (:obj:`int`): the rank to broadcast on
    """
    if is_fake_link():
        raise NotImplementedError
    get_link().broadcast(value, rank)


def allreduce(data: torch.Tensor, op: str = 'sum') -> None:
    """
    Overview:
        Call ``linklink.allreduce`` on the data
    Arguments:
        - data (:obj:`obj`): the data to reduce
        - op (:obj:`str`): the operation to perform on data, support ``['sum', 'max']``
    """
    link_op_map = {'sum': get_link().allreduceOp_t.Sum, 'max': get_link().allreduceOp_t.Max}
    if op not in link_op_map.keys():
        raise KeyError("not support allreduce op type: {}".format(op))
    else:
        link_op = link_op_map[op]
    if is_fake_link():
        return data
    get_link().allreduce(data, reduce_op=link_op)
    if op == 'sum':
        # divide by world_size so 'sum' actually produces the mean across processes
        data.div_(get_world_size())


def allreduce_async(data: torch.Tensor, op: str = 'sum') -> None:
    """
    Overview:
        Call ``linklink.allreduce_async`` on the data
    Arguments:
        - data (:obj:`obj`): the data to reduce
        - op (:obj:`str`): the operation to perform on data, support ``['sum', 'max']``
    """
    link_op_map = {'sum': get_link().allreduceOp_t.Sum, 'max': get_link().allreduceOp_t.Max}
    if op not in link_op_map.keys():
        raise KeyError("not support allreduce op type: {}".format(op))
    else:
        link_op = link_op_map[op]
    if is_fake_link():
        return data
    if op == 'sum':
        # pre-divide before the async reduce; summing the pre-divided values yields the mean
        data.div_(get_world_size())
    get_link().allreduce_async(data, reduce_op=link_op)


def get_group(group_size: int) -> List:
    """
    Overview:
        Get the group segmentation of ``group_size`` each group
    Arguments:
        - group_size (:obj:`int`): the ``group_size``; ``None`` means one group of the whole world
    """
    rank = get_rank()
    world_size = get_world_size()
    if group_size is None:
        group_size = world_size
    assert (world_size % group_size == 0)
    return simple_group_split(world_size, rank, world_size // group_size)


def dist_mode(func: Callable) -> Callable:
    """
    Overview:
        Wrap the function so that it can init and finalize automatically before each call
    Arguments:
        - func (:obj:`Callable`): the function to wrap
    """

    @wraps(func)
    def wrapper(*args, **kwargs):
        dist_init()
        # fix: propagate the wrapped function's return value, and make sure
        # dist_finalize() runs even when the wrapped function raises
        try:
            return func(*args, **kwargs)
        finally:
            dist_finalize()

    return wrapper


def dist_init(method: str = 'slurm', device_id: int = 0) -> Tuple[int, int]:
    """
    Overview:
        Init the distribution
    Arguments:
        - method (:obj:`str`): Support ``['slurm', 'single_node']``
        - device_id (:obj:`int`): Default device when using ``single_node`` method
    Returns:
        - (:obj:`Tuple[int, int]`): the ``(rank, world_size)`` pair of this process
    """
    get_link().initialize()
    world_size = get_link().get_world_size()
    rank = get_link().get_rank()

    if method == 'slurm':
        # proc_id = int(os.environ['SLURM_PROCID'])
        # ntasks = int(os.environ['SLURM_NTASKS'])
        # node_list = os.environ['SLURM_NODELIST']
        num_gpus = torch.cuda.device_count()
        torch.cuda.set_device(rank % num_gpus)
    elif method == 'single_node':
        torch.cuda.set_device(device_id)

    return rank, world_size


def dist_finalize() -> None:
    """
    Overview:
        Finalize ``linklink``, see ``linklink.finalize()``
    """
    get_link().finalize()


class DistContext:
    """
    Overview:
        A context manager for ``linklink`` distribution
    Interfaces:
        ``__init__``, ``__enter__``, ``__exit__``
    """

    def __init__(self) -> None:
        """
        Overview:
            Initialize the ``DistContext``
        """
        pass

    def __enter__(self) -> None:
        """
        Overview:
            Initialize ``linklink`` distribution
        """
        dist_init()

    def __exit__(self, *args, **kwargs) -> Any:
        """
        Overview:
            Finalize ``linklink`` distribution
        Arguments:
            - args (:obj:`Tuple`): The arguments passed to the ``__exit__`` function.
            - kwargs (:obj:`Dict`): The keyword arguments passed to the ``__exit__`` function.
        """
        dist_finalize()


def simple_group_split(world_size: int, rank: int, num_groups: int) -> List:
    """
    Overview:
        Split the group according to ``world_size``, ``rank`` and ``num_groups``
    Arguments:
        - world_size (:obj:`int`): The world size
        - rank (:obj:`int`): The rank
        - num_groups (:obj:`int`): The number of groups

    .. note::
        With faulty input, raise ``array split does not result in an equal division``
    """
    groups = []
    rank_list = np.split(np.arange(world_size), num_groups)
    rank_list = [list(map(int, x)) for x in rank_list]
    for i in range(num_groups):
        groups.append(get_link().new_group(rank_list[i]))
    group_size = world_size // num_groups
    return groups[rank // group_size]


def synchronize():
    """
    Overview:
        Synchronize the process
    """
    get_link().synchronize()