[slurm-users] Only 2 jobs will start per GPU node despite 4 GPUs being present

Tina Friedrich tina.friedrich at it.ox.ac.uk
Fri Aug 7 15:12:07 UTC 2020


Hello,

This is something I've seen once on our systems & it took me a while to 
figure out what was going on.

The cause turned out to be the system topology: all of the GPUs were 
connected to one CPU. There were no free cores left on that particular 
CPU, so Slurm did not schedule any more jobs to the GPUs. We needed to 
disable binding in the job submission to schedule to all of them.
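
If it helps: disabling the binding at submission time looks roughly
like this (option name as documented in the sbatch man page - check it
against your Slurm version):

  #SBATCH --gres-flags=disable-binding

That tells Slurm not to restrict the job to the cores listed for the
GPU in gres.conf.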

Not sure that applies in your situation (don't know your system), but 
it's something to check?
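
A quick way to check is to compare the GPU/CPU affinity on a node with
what is already allocated there, for example (the node name is just a
placeholder):

  nvidia-smi topo -m              # CPU affinity of each GPU
  scontrol show node <nodename>   # CPUAlloc and CfgTRES vs AllocTRES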

Tina


On 07/08/2020 15:42, Jodie H. Sprouse wrote:
> Good  morning.
> I am having the same experience here. Wondering if you found a resolution?
> Thank you.
> Jodie
>
>
> On Jun 11, 2020, at 3:27 PM, Rhian Resnick <rresnick at fau.edu> wrote:
>
> We have several users submitting single-GPU jobs to our cluster. We 
> expected the jobs to fill each node and fully utilize the available 
> GPUs, but instead we find that only 2 out of the 4 GPUs in each node 
> get allocated.
>
> If we request 2 GPUs in the job and start two jobs, both jobs will 
> start on the same node, fully allocating it. We are puzzled about what 
> is going on and any hints are welcome.
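>
> (As a side note, a quick way to see what a running job was actually
> given is something like
>
> scontrol -d show job <jobid>
>
> where the job id is a placeholder; the CPU_IDs= and GRES=...(IDX:...)
> fields show what was allocated on each node.)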
>
> Thanks for your help,
>
> Rhian
>
>
>
> *Example SBATCH Script*
> #!/bin/bash
> #SBATCH --job-name=test
> #SBATCH --partition=longq7-mri
> #SBATCH -N 1
> #SBATCH -n 1
> #SBATCH --gres=gpu:1
> #SBATCH --mail-type=ALL
> hostname
> echo CUDA_VISIBLE_DEVICES $CUDA_VISIBLE_DEVICES
>
> set | grep SLURM
> nvidia-smi
> sleep 500
>
>
>
>
> *gres.conf*
> #AutoDetect=nvml
> Name=gpu Type=v100  File=/dev/nvidia0 Cores=0
> Name=gpu Type=v100  File=/dev/nvidia1 Cores=1
> Name=gpu Type=v100  File=/dev/nvidia2 Cores=2
> Name=gpu Type=v100  File=/dev/nvidia3 Cores=3
>
>
> *slurm.conf*
> #
> # Example slurm.conf file. Please run configurator.html
> # (in doc/html) to build a configuration file customized
> # for your environment.
> #
> #
> # slurm.conf file generated by configurator.html.
> #
> # See the slurm.conf man page for more information.
> #
> ClusterName=cluster
> ControlMachine=cluster-slurm1.example.com
> ControlAddr=10.116.0.11
> BackupController=cluster-slurm2.example.com
> BackupAddr=10.116.0.17
> #
> SlurmUser=slurm
> #SlurmdUser=root
> SlurmctldPort=6817
> SlurmdPort=6818
> SchedulerPort=7321
>
> RebootProgram="/usr/sbin/reboot"
>
>
> AuthType=auth/munge
> #JobCredentialPrivateKey=
> #JobCredentialPublicCertificate=
> StateSaveLocation=/var/spool/slurm/ctld
> SlurmdSpoolDir=/var/spool/slurm/d
> SwitchType=switch/none
> MpiDefault=none
> SlurmctldPidFile=/var/run/slurmctld.pid
> SlurmdPidFile=/var/run/slurmd.pid
> ProctrackType=proctrack/pgid
>
> GresTypes=gpu,mps,bandwidth
>
> PrologFlags=x11
> #PluginDir=
> #FirstJobId=
> #MaxJobCount=
> #PlugStackConfig=
> #PropagatePrioProcess=
> #PropagateResourceLimits=
> #PropagateResourceLimitsExcept=
> #Prolog=
> #Epilog=/etc/slurm/slurm.epilog.clean
> #SrunProlog=
> #SrunEpilog=
> #TaskProlog=
> #TaskEpilog=
> #TaskPlugin=
> #TrackWCKey=no
> #TreeWidth=50
> #TmpFS=
> #UsePAM=
> #
> # TIMERS
> SlurmctldTimeout=300
> SlurmdTimeout=300
> InactiveLimit=0
> MinJobAge=300
> KillWait=30
> Waittime=0
> #
> # SCHEDULING
> SchedulerType=sched/backfill
> #bf_interval=10
> #SchedulerAuth=
> #SelectType=select/linear
> # Cores and memory are consumable
> #SelectType=select/cons_res
> #SelectTypeParameters=CR_Core_Memory
> SchedulerParameters=bf_interval=10
> SelectType=select/cons_res
> SelectTypeParameters=CR_Core
>
> FastSchedule=1
> #PriorityType=priority/multifactor
> #PriorityDecayHalfLife=14-0
> #PriorityUsageResetPeriod=14-0
> #PriorityWeightFairshare=100000
> #PriorityWeightAge=1000
> #PriorityWeightPartition=10000
> #PriorityWeightJobSize=1000
> #PriorityMaxAge=1-0
> #
> # LOGGING
> SlurmctldDebug=3
> SlurmctldLogFile=/var/log/slurmctld.log
> SlurmdDebug=3
> SlurmdLogFile=/var/log/slurmd.log
> JobCompType=jobcomp/none
> #JobCompLoc=
> #
> # ACCOUNTING
> #JobAcctGatherType=jobacct_gather/linux
> #JobAcctGatherFrequency=30
> #
> #AccountingStorageType=accounting_storage/slurmdbd
> #AccountingStorageHost=
> #AccountingStorageLoc=
> #AccountingStoragePass=
> #AccountingStorageUser=
> #
> #
> #
> # Default values
> # DefMemPerNode=64000
> # DefCpuPerGPU=4
> # DefMemPerCPU=4000
> # DefMemPerGPU=16000
>
>
>
> # OpenHPC default configuration
> #TaskPlugin=task/affinity
> TaskPlugin=task/affinity,task/cgroup
> PropagateResourceLimitsExcept=MEMLOCK
> TaskPluginParam=autobind=cores
> #AccountingStorageType=accounting_storage/mysql
> #StorageLoc=slurm_acct_db
>
> AccountingStorageType=accounting_storage/slurmdbd
> AccountingStorageHost=cluster-slurmdbd1.example.com
> #AccountingStorageType=accounting_storage/filetxt
> Epilog=/etc/slurm/slurm.epilog.clean
>
>
> #PartitionName=normal Nodes=c[1-5] Default=YES MaxTime=24:00:00 State=UP
> PartitionName=DEFAULT State=UP Default=NO AllowGroups=ALL Priority=10 
> DisableRootJobs=NO RootOnly=NO Hidden=NO Shared=NO GraceTime=0 
> PreemptMode=OFF ReqResv=NO AllowAccounts=ALL AllowQos=ALL LLN=NO 
> ExclusiveUser=NO  Nodes=nodeamd[009-016],c[1-4],nodehtc[001-025]
>
>
> # Partitions
>
> # Group Limited Queues
>
> # OIT DEBUG QUEUE
> PartitionName=debug Nodes=c[1-4] MaxTime=24:00:00 State=UP 
> AllowGroups=oit-hpc-admin
>
> # RNA CHEM
> PartitionName=longq7-rna MinNodes=1 MaxNodes=30 DefaultTime=168:00:00 
> MaxTime=UNLIMITED Priority=200 Nodes=nodeamd[001-008],nodegpu[021-025] 
> AllowGroups=gpu-rnachem
>
> # V100's
> PartitionName=longq7-mri MinNodes=1 MaxNodes=30 DefaultTime=168:00:00 
> MaxTime=168:00:00 Priority=200 Nodes=nodenviv100[001-016] 
> AllowGroups=gpu-mri
>
> # BIGDATA GRANT
> PartitionName=longq-bigdata7 MinNodes=1 MaxNodes=30 
> DefaultTime=168:00:00 MaxTime=168:00:00 Priority=200 
> Nodes=node[087-098],nodegpu001 AllowGroups=fau-bigdata,nsf-bigdata
>
> PartitionName=gpu-bigdata7 Default=NO MinNodes=1 Priority=10 
>  AllowAccounts=ALL  Nodes=nodegpu001 AllowGroups=fau-bigdata,nsf-bigdata
>
> # CogNeuroLab
> PartitionName=CogNeuroLab Default=NO MinNodes=1 MaxNodes=4 
> MaxTime=7-12:00:00 AllowGroups=cogneurolab Priority=200 State=UP 
> Nodes=node[001-004]
>
>
> # Standard queues
>
> # OPEN TO ALL
>
> #Short Queue
> PartitionName=shortq7 MinNodes=1 MaxNodes=30 DefaultTime=06:00:00 
> MaxTime=06:00:00 Priority=100 
> Nodes=nodeamd[001-016],nodenviv100[001-015],nodegpu[001-025],node[001-100],nodehtc[001-025] 
>  Default=YES
>
> # Medium Queue
> PartitionName=mediumq7 MinNodes=1 MaxNodes=30 DefaultTime=72:00:00 
> MaxTime=72:00:00 Priority=50 Nodes=nodeamd[009-016],node[004-100]
>
> # Long Queue
> PartitionName=longq7 MinNodes=1 MaxNodes=30 DefaultTime=168:00:00 
> MaxTime=168:00:00 Priority=30 Nodes=nodeamd[009-016],node[004-100]
>
>
> # Interactive
> PartitionName=interactive MinNodes=1 MaxNodes=4 DefaultTime=06:00:00 
> MaxTime=06:00:00 Priority=101 Nodes=node[001-100]  Default=No Hidden=YES
>
> # Nodes
>
> # Test nodes, (vms)
> NodeName=c[1-4] Cpus=4 Feature=virtual RealMemory=16000
>
> # AMD Nodes
> NodeName=nodeamd[001-016] Procs=64 Boards=1 SocketsPerBoard=8 
> CoresPerSocket=8 ThreadsPerCore=1 Features=amd,epyc RealMemory=225436
>
> # V100 MRI
> NodeName=nodenviv100[001-016] CPUs=64 Boards=1 SocketsPerBoard=2 
> CoresPerSocket=16 ThreadsPerCore=2 Gres=gpu:v100:4 Feature=v100 
> RealMemory=192006
>
> # GPU nodes
> NodeName=nodegpu001 Procs=40 Boards=1 SocketsPerBoard=2 
> CoresPerSocket=10 ThreadsPerCore=2 Gres=gpu:k80:8 Feature=k80,intel 
> RealMemory=64000
> NodeName=nodegpu002 Procs=40 Boards=1 SocketsPerBoard=2 
> CoresPerSocket=10 ThreadsPerCore=2 Gres=gpu:gk1:8 Feature=gk1,intel 
> RealMemory=128000
> NodeName=nodegpu[003-020] Boards=1 SocketsPerBoard=2 CoresPerSocket=8 
> ThreadsPerCore=2 Gres=gpu:gk1:8 Feature=gk1,intel RealMemory=128000
> NodeName=nodegpu[021-025] Procs=16 Boards=1 SocketsPerBoard=2 
> CoresPerSocket=8 ThreadsPerCore=1 Gres=gpu:4 Feature=exxact,intel 
> RealMemory=128000
>
> # IvyBridge nodes
> NodeName=node[001-021] Procs=20 Boards=1 SocketsPerBoard=2 
> CoresPerSocket=10 ThreadsPerCore=1 Feature=intel,ivybridge 
> RealMemory=112750
> # SandyBridge node(2)
> NodeName=node022 Procs=16 Boards=1 SocketsPerBoard=2 CoresPerSocket=8 
> ThreadsPerCore=1 Feature=intel,sandybridge RealMemory=64000
> # IvyBridge
> NodeName=node[023-050] Procs=20 Boards=1 SocketsPerBoard=2 
> CoresPerSocket=10 ThreadsPerCore=1 Feature=intel,ivybridge 
> RealMemory=112750
> # Haswell
> NodeName=node[051-100] Procs=20 Boards=1 SocketsPerBoard=2 
> CoresPerSocket=10 ThreadsPerCore=1 Feature=intel,haswell RealMemory=112750
>
>
> # Node health monitoring
> HealthCheckProgram=/usr/sbin/nhc
> HealthCheckInterval=300
> ReturnToService=2
>
> # Fix for X11 issues
> X11Parameters=use_raw_hostname
>
>
>
> Rhian Resnick
> Associate Director Research Computing
> Enterprise Systems
> Office of Information Technology
>
> Florida Atlantic University
> 777 Glades Road, CM22, Rm 173B
> Boca Raton, FL 33431
> Phone 561.297.2647
> Fax 561.297.0222
>


