Hello,
We have a two-node GPU cluster with 8 NVIDIA L40 GPUs per node. GRES is configured and works when a user requests it in their sbatch/interactive job submission (--gres=gpu:3): users then only have access to the GPUs they requested. However, when they omit --gres=gpu:n, they can use every GPU, which interferes with running jobs that did request GRES. I'm at a loss as to why this is happening. Can someone please look at our configuration and see if anything stands out?
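For reference, this is roughly what we observe (partition name taken from the slurm.conf below; nvidia-smi output summarized):
# Job that requests GRES: only the 3 allocated GPUs are visible
sbatch --partition=all --gres=gpu:3 --wrap="nvidia-smi -L"
# Job that omits --gres: it can still see and use all 8 GPUs on the node
sbatch --partition=all --wrap="nvidia-smi -L"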
SLURM Version = 21.08.5
*Slurm.conf*
ClusterName=ommit
SlurmctldHost=headnode
ProctrackType=proctrack/cgroup
ReturnToService=2
SlurmdPidFile=/run/slurmd.pid
SlurmdSpoolDir=/var/lib/slurm/slurmd
StateSaveLocation=/var/lib/slurm/slurmctld
SlurmUser=slurm
TaskPlugin=task/cgroup
SchedulerType=sched/backfill
SelectType=select/cons_tres
SelectTypeParameters=CR_Core_Memory
AccountingStorageType=accounting_storage/slurmdbd
# AccountingStorageType for other resources
#
AccountingStorageTRES=gres/gpu
#DebugFlags=CPU_Bind,gres
JobCompType=jobcomp/none
JobAcctGatherType=jobacct_gather/cgroup
SlurmctldDebug=info
SlurmctldLogFile=/var/log/slurm/slurmctld.log
SlurmdDebug=info
SlurmdLogFile=/var/log/slurm/slurmd.log
DefMemPerCPU=4000
#NodeName=n01 CPUs=256 Boards=1 SocketsPerBoard=2 CoresPerSocket=64 ThreadsPerCore=2 RealMemory=1000000
NodeName=n01 Gres=gpu:nvidia-l40:8 CPUs=256 Boards=1 SocketsPerBoard=2 CoresPerSocket=64 ThreadsPerCore=2 RealMemory=1000000
NodeName=n02 Gres=gpu:nvidia-l40:8 CPUs=256 Boards=1 SocketsPerBoard=2 CoresPerSocket=64 ThreadsPerCore=2 RealMemory=1000000
#Gres config for GPUs
GresTypes=gpu
PreemptType=preempt/qos
PreemptMode=REQUEUE
# reset usage after 1 week
PriorityUsageResetPeriod=WEEKLY
# The job's age factor reaches 1.0 after waiting in the
# queue for 2 weeks.
PriorityMaxAge=14-0
# This next group determines the weighting of each of the
# components of the Multifactor Job Priority Plugin.
# The default value for each of the following is 1.
PriorityWeightAge=1000
PriorityWeightFairshare=10000
PriorityWeightJobSize=1000
PriorityWeightPartition=1000
PriorityWeightQOS=1500
# Primary partitions
PartitionName=debug Nodes=ALL Default=YES MaxTime=INFINITE State=UP
PartitionName=all Nodes=n01,n02 Default=YES MaxTime=01:00:00 DefaultTime=00:30:00 State=UP
PartitionName=statds Nodes=n01 Default=NO MaxTime=48:00:00 State=UP Priority=100 State=UP OverSubscribe=FORCE AllowAccounts=statds
PartitionName=phil Nodes=n02 Default=NO MaxTime=48:00:00 State=UP Priority=100 State=UP OverSubscribe=FORCE AllowAccounts=phil
#Set up condo mode
# Condo partitions
PartitionName=phil_condo Nodes=n02 Default=NO MaxTime=48:00:00 DefaultTime=00:01:00 State=UP Priority=50 OverSubscribe=FORCE AllowQos=normal
PartitionName=statds_condo Nodes=n01 Default=NO MaxTime=48:00:00 DefaultTime=00:01:00 State=UP Priority=50 OverSubscribe=FORCE AllowQos=normal
JobSubmitPlugins=lua
*Gres.conf*
NodeName=n01 Name=gpu Type=nvidia-l40 File=/dev/nvidia0
NodeName=n01 Name=gpu Type=nvidia-l40 File=/dev/nvidia1
NodeName=n01 Name=gpu Type=nvidia-l40 File=/dev/nvidia2
NodeName=n01 Name=gpu Type=nvidia-l40 File=/dev/nvidia3
NodeName=n01 Name=gpu Type=nvidia-l40 File=/dev/nvidia4
NodeName=n01 Name=gpu Type=nvidia-l40 File=/dev/nvidia5
NodeName=n01 Name=gpu Type=nvidia-l40 File=/dev/nvidia6
NodeName=n01 Name=gpu Type=nvidia-l40 File=/dev/nvidia7
NodeName=n02 Name=gpu Type=nvidia-l40 File=/dev/nvidia0
NodeName=n02 Name=gpu Type=nvidia-l40 File=/dev/nvidia1
NodeName=n02 Name=gpu Type=nvidia-l40 File=/dev/nvidia2
NodeName=n02 Name=gpu Type=nvidia-l40 File=/dev/nvidia3
NodeName=n02 Name=gpu Type=nvidia-l40 File=/dev/nvidia4
NodeName=n02 Name=gpu Type=nvidia-l40 File=/dev/nvidia5
NodeName=n02 Name=gpu Type=nvidia-l40 File=/dev/nvidia6
NodeName=n02 Name=gpu Type=nvidia-l40 File=/dev/nvidia7
*Cgroup.conf*
CgroupMountpoint="/sys/fs/cgroup"
CgroupAutomount=yes
CgroupReleaseAgentDir="/etc/slurm/cgroup"
AllowedDevicesFile="/etc/slurm/cgroup_allowed_devices_file.conf"
ConstrainCores=yes
ConstrainDevices=yes
ConstrainRAMSpace=yes
*cgroup_allowed_devices_file.conf*
/dev/null
/dev/urandom
/dev/zero
/dev/sda*
/dev/cpu/*/*
/dev/pts/*
/dev/nvidia*
I'm not entirely sure, and I can't vouch for differences in a (relatively) older version of Slurm, but I'm pretty sure that on our cluster we have to specify the GRES in the partition definition in order for Slurm to treat them as allocatable resources. On our interactive nodes we have GPUs but don't list them as a GRES in the partition, which lets anyone on those nodes use them. On our other partitions we do specify the GRES, and that prevents a user from accessing the GPUs unless they request --gres.
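For what it's worth, a quick way to compare what the controller actually registers for a node versus a partition (node and partition names taken from the posted config):
# Node-level GRES as seen by slurmctld
scontrol show node n01 | grep -i gres
# Partition definition as seen by slurmctld
scontrol show partition all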
Rob
Ensure cgroups is working and is configured to limit access to devices (which includes GPUs).
Check your cgroup.conf to see that there is an entry for:
ConstrainDevices=yes
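A rough way to confirm the device constraint is actually being enforced (partition name taken from the posted slurm.conf):
# Should list only the single allocated GPU
srun -p all --gres=gpu:1 nvidia-smi -L
# With ConstrainDevices working, a job that requests no GRES should
# see no GPUs here; seeing all 8 means devices are not being constrained
srun -p all nvidia-smi -L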
Brian Andrus