Dear * ,
I have a question, mainly for my own understanding.
Essentially, I use the following job script:
--------------------------->
#!/bin/bash
#SBATCH -J srun_test
#SBATCH --time=0:02:00
#SBATCH --export=NONE
#SBATCH --partition=test
#SBATCH --nodes=2
#SBATCH -o ./%x.%j.out
#SBATCH -D .
srun --export=all --mpi=pmi2 --verbose --cpu-bind=verbose -n 96 --ntasks-per-node=48 --ntasks-per-socket=24 -c 2 -m block:block -B 2:56:1 /lrz/sys/tools/placement_test_2021/bin/placement-test.intel_impi
--------------------------->
This runs on Sapphire Rapids nodes with 2 sockets and 56 physical cores per socket. The executable is just an MPI dummy, built with the Intel compilers and Intel MPI (although I guess that's not essential here).
The related MPI environment is shown below under (1). The Slurm environment is shown below under (2).
Now, what I would intuitively expect is that 24 ranks are placed on each socket, and that they are bound to the even-numbered CPUs in sequence, starting from CPU 0 of each socket.
What I get is
srun: defined options
srun: -------------------- --------------------
srun: (null) : i20r01c01s[06,08]
srun: cpu-bind : verbose
srun: cpus-per-task : 2
srun: distribution : block:block
srun: export : all
srun: extra-node-info : 2:56:1
srun: jobid : 395726
srun: job-name : MGLET_srun
srun: mpi : pmi2
srun: nodes : 2
srun: ntasks : 96
srun: ntasks-per-node : 48
srun: ntasks-per-socket : 24
srun: verbose : 1
srun: -------------------- --------------------
srun: end of defined options
srun: jobid 395726: nodes(2):`i20r01c01s[06,08]', cpu counts: 224(x2)
srun: Implicitly setting --exact, because -c/--cpus-per-task given.
srun: CpuBindType=verbose,threads
srun: launching StepId=395726.0 on host i20r01c01s06, 48 tasks: [0-47]
srun: launching StepId=395726.0 on host i20r01c01s08, 48 tasks: [48-95]
srun: topology/tree: init: topology tree plugin loaded
cpu-bind=MASK - i20r01c01s06, task 1 1 [98437]: mask 0xc set
cpu-bind=MASK - i20r01c01s06, task 2 2 [98438]: mask 0x30 set
cpu-bind=MASK - i20r01c01s06, task 3 3 [98439]: mask 0xc0 set
cpu-bind=MASK - i20r01c01s06, task 4 4 [98440]: mask 0x300 set
cpu-bind=MASK - i20r01c01s06, task 5 5 [98441]: mask 0xc00 set
cpu-bind=MASK - i20r01c01s06, task 6 6 [98442]: mask 0x3000 set
cpu-bind=MASK - i20r01c01s06, task 7 7 [98443]: mask 0xc000 set
cpu-bind=MASK - i20r01c01s06, task 8 8 [98444]: mask 0x30000 set
cpu-bind=MASK - i20r01c01s06, task 9 9 [98445]: mask 0xc0000 set
cpu-bind=MASK - i20r01c01s06, task 10 10 [98446]: mask 0x300000 set
cpu-bind=MASK - i20r01c01s06, task 11 11 [98447]: mask 0xc00000 set
cpu-bind=MASK - i20r01c01s06, task 12 12 [98448]: mask 0x3000000 set
cpu-bind=MASK - i20r01c01s06, task 13 13 [98449]: mask 0xc000000 set
cpu-bind=MASK - i20r01c01s06, task 14 14 [98450]: mask 0x30000000 set
cpu-bind=MASK - i20r01c01s06, task 15 15 [98451]: mask 0xc0000000 set
cpu-bind=MASK - i20r01c01s06, task 16 16 [98452]: mask 0x300000000 set
cpu-bind=MASK - i20r01c01s06, task 17 17 [98453]: mask 0xc00000000 set
cpu-bind=MASK - i20r01c01s06, task 18 18 [98454]: mask 0x3000000000 set
cpu-bind=MASK - i20r01c01s06, task 19 19 [98455]: mask 0xc000000000 set
cpu-bind=MASK - i20r01c01s06, task 20 20 [98456]: mask 0x30000000000 set
cpu-bind=MASK - i20r01c01s06, task 21 21 [98457]: mask 0xc0000000000 set
cpu-bind=MASK - i20r01c01s06, task 22 22 [98458]: mask 0x300000000000 set
cpu-bind=MASK - i20r01c01s06, task 23 23 [98459]: mask 0xc00000000000 set
cpu-bind=MASK - i20r01c01s06, task 24 24 [98460]: mask 0x300000000000000 set
cpu-bind=MASK - i20r01c01s06, task 25 25 [98461]: mask 0xc00000000000000 set
cpu-bind=MASK - i20r01c01s06, task 26 26 [98462]: mask 0x3000000000000000 set
cpu-bind=MASK - i20r01c01s06, task 27 27 [98463]: mask 0xc000000000000000 set
cpu-bind=MASK - i20r01c01s06, task 28 28 [98464]: mask 0x30000000000000000 set
cpu-bind=MASK - i20r01c01s06, task 29 29 [98465]: mask 0xc0000000000000000 set
cpu-bind=MASK - i20r01c01s06, task 30 30 [98466]: mask 0x300000000000000000 set
cpu-bind=MASK - i20r01c01s06, task 31 31 [98467]: mask 0xc00000000000000000 set
cpu-bind=MASK - i20r01c01s06, task 32 32 [98468]: mask 0x3000000000000000000 set
cpu-bind=MASK - i20r01c01s06, task 33 33 [98469]: mask 0xc000000000000000000 set
cpu-bind=MASK - i20r01c01s06, task 34 34 [98470]: mask 0x30000000000000000000 set
cpu-bind=MASK - i20r01c01s06, task 35 35 [98471]: mask 0xc0000000000000000000 set
cpu-bind=MASK - i20r01c01s06, task 36 36 [98472]: mask 0x300000000000000000000 set
cpu-bind=MASK - i20r01c01s06, task 37 37 [98473]: mask 0xc00000000000000000000 set
cpu-bind=MASK - i20r01c01s06, task 38 38 [98474]: mask 0x3000000000000000000000 set
cpu-bind=MASK - i20r01c01s06, task 39 39 [98475]: mask 0xc000000000000000000000 set
cpu-bind=MASK - i20r01c01s06, task 40 40 [98476]: mask 0x30000000000000000000000 set
cpu-bind=MASK - i20r01c01s06, task 41 41 [98477]: mask 0xc0000000000000000000000 set
cpu-bind=MASK - i20r01c01s06, task 42 42 [98478]: mask 0x300000000000000000000000 set
cpu-bind=MASK - i20r01c01s06, task 43 43 [98479]: mask 0xc00000000000000000000000 set
cpu-bind=MASK - i20r01c01s06, task 44 44 [98480]: mask 0x3 set
cpu-bind=MASK - i20r01c01s06, task 45 45 [98481]: mask 0xc set
cpu-bind=MASK - i20r01c01s06, task 46 46 [98482]: mask 0x30 set
srun: Node i20r01c01s06, 48 tasks started
cpu-bind=MASK - i20r01c01s08, task 48 0 [179006]: mask 0x3 set
cpu-bind=MASK - i20r01c01s08, task 49 1 [179007]: mask 0xc set
cpu-bind=MASK - i20r01c01s08, task 50 2 [179008]: mask 0x30 set
cpu-bind=MASK - i20r01c01s08, task 51 3 [179009]: mask 0xc0 set
cpu-bind=MASK - i20r01c01s08, task 52 4 [179010]: mask 0x300 set
cpu-bind=MASK - i20r01c01s08, task 53 5 [179011]: mask 0xc00 set
cpu-bind=MASK - i20r01c01s08, task 54 6 [179012]: mask 0x3000 set
cpu-bind=MASK - i20r01c01s08, task 55 7 [179013]: mask 0xc000 set
cpu-bind=MASK - i20r01c01s08, task 56 8 [179014]: mask 0x30000 set
cpu-bind=MASK - i20r01c01s08, task 57 9 [179015]: mask 0xc0000 set
cpu-bind=MASK - i20r01c01s08, task 58 10 [179016]: mask 0x300000 set
cpu-bind=MASK - i20r01c01s08, task 59 11 [179017]: mask 0xc00000 set
cpu-bind=MASK - i20r01c01s08, task 60 12 [179018]: mask 0x3000000 set
cpu-bind=MASK - i20r01c01s08, task 61 13 [179019]: mask 0xc000000 set
cpu-bind=MASK - i20r01c01s08, task 62 14 [179020]: mask 0x30000000 set
cpu-bind=MASK - i20r01c01s08, task 63 15 [179021]: mask 0xc0000000 set
cpu-bind=MASK - i20r01c01s08, task 64 16 [179022]: mask 0x300000000 set
cpu-bind=MASK - i20r01c01s08, task 65 17 [179023]: mask 0xc00000000 set
cpu-bind=MASK - i20r01c01s08, task 66 18 [179024]: mask 0x3000000000 set
cpu-bind=MASK - i20r01c01s08, task 67 19 [179025]: mask 0xc000000000 set
cpu-bind=MASK - i20r01c01s08, task 68 20 [179026]: mask 0x30000000000 set
cpu-bind=MASK - i20r01c01s08, task 69 21 [179027]: mask 0xc0000000000 set
cpu-bind=MASK - i20r01c01s08, task 70 22 [179028]: mask 0x300000000000 set
cpu-bind=MASK - i20r01c01s08, task 71 23 [179029]: mask 0xc00000000000 set
cpu-bind=MASK - i20r01c01s08, task 72 24 [179030]: mask 0x300000000000000 set
cpu-bind=MASK - i20r01c01s08, task 73 25 [179031]: mask 0xc00000000000000 set
cpu-bind=MASK - i20r01c01s08, task 74 26 [179032]: mask 0x3000000000000000 set
cpu-bind=MASK - i20r01c01s08, task 75 27 [179033]: mask 0xc000000000000000 set
cpu-bind=MASK - i20r01c01s08, task 76 28 [179034]: mask 0x30000000000000000 set
cpu-bind=MASK - i20r01c01s08, task 77 29 [179035]: mask 0xc0000000000000000 set
cpu-bind=MASK - i20r01c01s08, task 78 30 [179036]: mask 0x300000000000000000 set
cpu-bind=MASK - i20r01c01s08, task 79 31 [179037]: mask 0xc00000000000000000 set
cpu-bind=MASK - i20r01c01s08, task 80 32 [179038]: mask 0x3000000000000000000 set
cpu-bind=MASK - i20r01c01s08, task 81 33 [179039]: mask 0xc000000000000000000 set
cpu-bind=MASK - i20r01c01s08, task 82 34 [179040]: mask 0x30000000000000000000 set
cpu-bind=MASK - i20r01c01s08, task 83 35 [179041]: mask 0xc0000000000000000000 set
cpu-bind=MASK - i20r01c01s08, task 84 36 [179042]: mask 0x300000000000000000000 set
cpu-bind=MASK - i20r01c01s08, task 85 37 [179043]: mask 0xc00000000000000000000 set
cpu-bind=MASK - i20r01c01s08, task 86 38 [179044]: mask 0x3000000000000000000000 set
cpu-bind=MASK - i20r01c01s08, task 87 39 [179045]: mask 0xc000000000000000000000 set
cpu-bind=MASK - i20r01c01s08, task 88 40 [179046]: mask 0x30000000000000000000000 set
cpu-bind=MASK - i20r01c01s08, task 89 41 [179047]: mask 0xc0000000000000000000000 set
cpu-bind=MASK - i20r01c01s08, task 90 42 [179048]: mask 0x300000000000000000000000 set
cpu-bind=MASK - i20r01c01s08, task 91 43 [179049]: mask 0xc00000000000000000000000 set
cpu-bind=MASK - i20r01c01s08, task 92 44 [179050]: mask 0x3 set
cpu-bind=MASK - i20r01c01s08, task 93 45 [179051]: mask 0xc set
cpu-bind=MASK - i20r01c01s08, task 94 46 [179052]: mask 0x30 set
cpu-bind=MASK - i20r01c01s08, task 95 47 [179053]: mask 0xc0 set
srun: Node i20r01c01s08, 48 tasks started
cpu-bind=MASK - i20r01c01s06, task 47 47 [98483]: mask 0xc0 set
In other words: the last 4 ranks intended for the second socket of each node are actually placed on the first socket. So is --ntasks-per-socket effectively ignored here?
I can't see an obvious reason for that. Maybe I am missing some important point, or overlooking some interference from Intel MPI's runtime environment (unsetting KMP_AFFINITY did not change anything in the masks).
I'd also like to mention that explicit placement via --cpu-bind=map_cpu does the right thing:
--------------------------->
#!/bin/bash
#SBATCH -J srun_test
#SBATCH --time=0:02:00
#SBATCH --export=NONE
#SBATCH --partition=test
#SBATCH --nodes=2
#SBATCH --ntasks-per-node=48
#SBATCH -o ./%x.%j.out
#SBATCH -D .
srun --export=all --mpi=pmi2 --cpu-bind=map_cpu:$(seq 0 2 46 | tr '\n' ',')$(seq 56 2 102 | tr '\n' ',') /lrz/sys/tools/placement_test_2021/bin/placement-test.intel_impi
--------------------------->
So, a workaround is available for me.
But if I could get some illuminating hint on where my intuition failed above, I'd be very grateful.
Thank you!
Cheers, Martin
(1)
I_MPI_FILESYSTEM=on
I_MPI_HYDRA_BOOTSTRAP=slurm
I_MPI_HYDRA_BOOTSTRAP_EXEC_EXTRA_ARGS=--external-launcher
I_MPI_HYDRA_BRANCH_COUNT=128
I_MPI_OFFLOAD=0
I_MPI_OFFLOAD_FAST_MEMCPY_COLL=0
I_MPI_OFFLOAD_RDMA=0
I_MPI_OFI_PROVIDER=psm3
I_MPI_ROOT=/dss/lrzsys/sys/spack/release/24.1.0/opt/x86_64/intel-oneapi-mpi/2021.11.0-gcc-w56vuor/mpi/2021.11
KMP_AFFINITY=granularity=thread,compact,1,0
(2)
SLURMD_NODENAME=i20r01c01s06
SLURM_CLUSTER_NAME=sng2
SLURM_CONF=/etc/slurm/slurm.conf
SLURM_CPUS_ON_NODE=224
SLURM_ECLIBR=0
SLURM_ECPLUG=1
SLURM_ERLAST=sbatch
SLURM_ERSBAC=1
SLURM_GET_USER_ENV=0
SLURM_GTIDS=0
SLURM_JOBID=395726
SLURM_JOB_ACCOUNT=pr28fa
SLURM_JOB_CPUS_PER_NODE=224(x2)
SLURM_JOB_END_TIME=1732781080
SLURM_JOB_GID=3000114
SLURM_JOB_ID=395726
SLURM_JOB_NAME=MGLET_srun
SLURM_JOB_NODELIST=i20r01c01s[06,08]
SLURM_JOB_NUM_NODES=2
SLURM_JOB_PARTITION=test
SLURM_JOB_QOS=test
SLURM_JOB_START_TIME=1732780959
SLURM_JOB_UID=3808660
SLURM_JOB_USER=di49zop
SLURM_LOCALID=0
SLURM_NNODES=2
SLURM_NODEID=0
SLURM_NODELIST=i20r01c01s[06,08]
SLURM_PRIO_PROCESS=0
SLURM_PROCID=0
SLURM_SCRIPT_CONTEXT=prolog_task
SLURM_SETUP_LICENSE=open source - no access restrictions
SLURM_SETUP_MAINTAINER_LIST=Bader(a)lrz.de
SLURM_SUBMIT_DIR=/dss/dsshome1/00/di49zop/test_mglet
SLURM_SUBMIT_HOST=login26
SLURM_TASKS_PER_NODE=224(x2)
SLURM_TASK_PID=98399
SLURM_TOPOLOGY_ADDR=leaf.m02r05.i20r01c01s06
SLURM_TOPOLOGY_ADDR_PATTERN=switch.switch.node