Skip to content

ding.utils.slurm_helper

ding.utils.slurm_helper

get_ip()

Overview

Get the ip of the current node

get_manager_node_ip(node_ip=None)

Overview

Look up the manager node of the slurm cluster and return the node ip

Arguments: - node_ip (:obj:`Optional[str]`): The ip of the current node. If omitted (or empty), it is obtained from get_ip().

get_cls_info()

Overview

Get the cluster info

node_to_partition(target_node)

Overview

Get the partition of the target node

Arguments: - target_node (:obj:str): The target node

node_to_host(node)

Overview

Get the host of the node

Arguments: - node (:obj:str): The node

find_free_port_slurm(node)

Overview

Find a free port on the node

Arguments: - node (:obj:str): The node

Full Source Code

../ding/utils/slurm_helper.py

import os
import subprocess
from typing import Optional, Dict, Tuple

# Maps the first three octets of a node ip to the ip of that cluster's manager node.
MANAGER_NODE_TABLE = {
    '10.198.8': '10.198.8.31',
    '10.198.6': '10.198.6.31',
    '10.5.38': '10.5.38.31',
    '10.5.39': '10.5.38.31',
    '10.5.36': '10.5.36.31',
    '10.5.37': '10.5.36.31',
    '10.10.30': '10.10.30.91',
}


def get_ip() -> str:
    """
    Overview:
        Get the ip of the current node, derived from the ``SLURMD_NODENAME`` environment variable.
    Returns:
        - ip (:obj:`str`): The last four ``-`` separated fields of the node name, joined with ``.``
    Raises:
        - AssertionError: If ``SLURMD_NODENAME`` is unset or empty.
    """
    nodename = os.environ.get('SLURMD_NODENAME')
    if not nodename:
        # Raised explicitly rather than via ``assert`` so the check is not stripped by ``python -O``.
        raise AssertionError('not found SLURMD_NODENAME env variable')
    # expecting nodename to be like: 'SH-IDC1-10-5-36-64'
    return '.'.join(nodename.split('-')[-4:])


def get_manager_node_ip(node_ip: Optional[str] = None) -> str:
    """
    Overview:
        Look up the manager node of the slurm cluster and return the node ip
    Arguments:
        - node_ip (:obj:`Optional[str]`): The ip of the current node; falls back to ``get_ip()`` when \
            omitted or empty
    Returns:
        - manager_ip (:obj:`str`): The entry of ``MANAGER_NODE_TABLE`` for the first three octets of \
            ``node_ip``, or ``'127.0.0.1'`` when ``SLURM_JOB_ID`` is not in the environment
    Raises:
        - KeyError: If the ip prefix is not present in ``MANAGER_NODE_TABLE``
    """
    if 'SLURM_JOB_ID' not in os.environ:
        from ditk import logging
        logging.error(
            'We are not running on slurm!, \'auto\' for manager_ip or '
            'coordinator_ip is only intended for running on multiple slurm clusters'
        )
        return '127.0.0.1'
    node_ip = node_ip or get_ip()
    # The table is keyed by the first three octets of the ip.
    learner_manager_ip_prefix = '.'.join(node_ip.split('.')[0:3])

    if learner_manager_ip_prefix not in MANAGER_NODE_TABLE:
        raise KeyError("Cluster not found, please add it to the MANAGER_NODE_TABLE in {}".format(__file__))
    return MANAGER_NODE_TABLE[learner_manager_ip_prefix]


# get all info of cluster
def get_cls_info() -> Dict[str, list]:
    """
    Overview:
        Get the cluster info by parsing the output of ``sinfo -Nh``
    Returns:
        - ret_dict (:obj:`Dict[str, list]`): Maps each partition seen in the output to the list of its \
            nodes whose state is ``idle`` or ``mix``
    """
    ret_dict = {}
    for line in subprocess.getoutput('sinfo -Nh').split('\n'):
        fields = line.strip().split()
        # Only rows with exactly four whitespace-separated fields are considered.
        if len(fields) != 4:
            continue
        node, _, partition, state = fields
        # Every partition gets an entry, even if none of its nodes end up being listed.
        nodes = ret_dict.setdefault(partition, [])
        assert node not in nodes
        if state in ['idle', 'mix']:
            nodes.append(node)

    return ret_dict


def node_to_partition(target_node: str) -> str:
    """
    Overview:
        Get the partition of the target node by scanning the output of ``sinfo -Nh``
    Arguments:
        - target_node (:obj:`str`): The target node
    Returns:
        - partition (:obj:`str`): The partition field of the first row whose node field equals \
            ``target_node``
    Raises:
        - RuntimeError: If no row of the output matches ``target_node``
    """
    for line in subprocess.getoutput('sinfo -Nh').split('\n'):
        fields = line.strip().split()
        # Only rows with exactly four whitespace-separated fields are considered.
        if len(fields) != 4:
            continue
        node, _, partition, _ = fields
        if node == target_node:
            return partition
    raise RuntimeError("not found target_node: {}".format(target_node))


def node_to_host(node: str) -> str:
    """
    Overview:
        Get the host of the node
    Arguments:
        - node (:obj:`str`): The node
    Returns:
        - host (:obj:`str`): The last four ``-`` separated fields of ``node``, joined with ``.``
    """
    return '.'.join(node.split('-')[-4:])


def find_free_port_slurm(node: str) -> int:
    """
    Overview:
        Find a free port on the node by running ``ding.utils.find_free_port`` there through ``srun``
    Arguments:
        - node (:obj:`str`): The node
    Returns:
        - port (:obj:`int`): The port number printed by the remote command
    Raises:
        - RuntimeError: If ``node`` is not found by ``node_to_partition``
        - ValueError: If no integer port can be parsed from the ``srun`` output
    """
    partition = node_to_partition(node)
    if partition == 'spring_scheduler':
        comment = '--comment=spring-submit'
    else:
        comment = ''
    output = subprocess.getoutput(
        "srun -p {} -w {} {} python -c \"from ding.utils import find_free_port; print('port' + str(find_free_port(0)))\""  # noqa
        .format(partition, node, comment)
    )
    # The remote command prefixes the number with 'port'; take whatever follows the last marker.
    port = output.split('port')[-1]
    try:
        return int(port)
    except ValueError:
        # Same exception type as before, but with the raw output so the failure can be diagnosed.
        raise ValueError("failed to parse a free port from srun output: {!r}".format(output)) from None