[slurm-users] Only 2 jobs will start per GPU node despite 4 GPU's being present

Thu Jun 11 19:27:07 UTC 2020

We have several users submitting single GPU jobs to our cluster.  We expected the jobs to fill each node and fully utilize the available GPU's but we instead find that only 2 out of the 4 gpu's in each node gets allocated.

If we request 2 GPU's in the job and start two jobs, both jobs will start on the same node fully allocating the node. We are puzzled about is going on and any hints are welcome.

Thanks for your help,

Rhian

Example SBATCH Script
#!/bin/bash
#SBATCH --job-name=test
#SBATCH --partition=longq7-mri
#SBATCH -N 1
#SBATCH -n 1
#SBATCH --gres=gpu:1
#SBATCH --mail-type=ALL
hostname
echo CUDA_VISIBLE_DEVICES $CUDA_VISIBLE_DEVICES

set | grep SLURM
nvidia-smi
sleep 500

gres.conf
#AutoDetect=nvml
Name=gpu Type=v100  File=/dev/nvidia0 Cores=0
Name=gpu Type=v100  File=/dev/nvidia1 Cores=1
Name=gpu Type=v100  File=/dev/nvidia2 Cores=2
Name=gpu Type=v100  File=/dev/nvidia3 Cores=3

slurm.conf
#
# Example slurm.conf file. Please run configurator.html
# (in doc/html) to build a configuration file customized
# for your environment.
#
#
# slurm.conf file generated by configurator.html.
#
# See the slurm.conf man page for more information.
#
ClusterName=cluster
ControlMachine=cluster-slurm1.example.com
ControlAddr=10.116.0.11
BackupController=cluster-slurm2.example.com
BackupAddr=10.116.0.17
#
SlurmUser=slurm
#SlurmdUser=root
SlurmctldPort=6817
SlurmdPort=6818
SchedulerPort=7321

RebootProgram="/usr/sbin/reboot"

AuthType=auth/munge
#JobCredentialPrivateKey=
#JobCredentialPublicCertificate=
StateSaveLocation=/var/spool/slurm/ctld
SlurmdSpoolDir=/var/spool/slurm/d
SwitchType=switch/none
MpiDefault=none
SlurmctldPidFile=/var/run/slurmctld.pid
SlurmdPidFile=/var/run/slurmd.pid
ProctrackType=proctrack/pgid

GresTypes=gpu,mps,bandwidth

PrologFlags=x11
#PluginDir=
#FirstJobId=
#MaxJobCount=
#PlugStackConfig=
#PropagatePrioProcess=
#PropagateResourceLimits=
#PropagateResourceLimitsExcept=
#Prolog=
#Epilog=/etc/slurm/slurm.epilog.clean
#SrunProlog=
#SrunEpilog=
#TaskProlog=
#TaskEpilog=
#TaskPlugin=
#TrackWCKey=no
#TreeWidth=50
#TmpFS=
#UsePAM=
#
# TIMERS
SlurmctldTimeout=300
SlurmdTimeout=300
InactiveLimit=0
MinJobAge=300
KillWait=30
Waittime=0
#
# SCHEDULING
SchedulerType=sched/backfill
#bf_interval=10
#SchedulerAuth=
#SelectType=select/linear
# Cores and memory are consumable
#SelectType=select/cons_res
#SelectTypeParameters=CR_Core_Memory
SchedulerParameters=bf_interval=10
SelectType=select/cons_res
SelectTypeParameters=CR_Core

FastSchedule=1
#PriorityType=priority/multifactor
#PriorityDecayHalfLife=14-0
#PriorityUsageResetPeriod=14-0
#PriorityWeightFairshare=100000
#PriorityWeightAge=1000
#PriorityWeightPartition=10000
#PriorityWeightJobSize=1000
#PriorityMaxAge=1-0
#
# LOGGING
SlurmctldDebug=3
SlurmctldLogFile=/var/log/slurmctld.log
SlurmdDebug=3
SlurmdLogFile=/var/log/slurmd.log
JobCompType=jobcomp/none
#JobCompLoc=
#
# ACCOUNTING
#JobAcctGatherType=jobacct_gather/linux
#JobAcctGatherFrequency=30
#
#AccountingStorageType=accounting_storage/slurmdbd
#AccountingStorageHost=
#AccountingStorageLoc=
#AccountingStoragePass=
#AccountingStorageUser=
#
#
#
# Default values
# DefMemPerNode=64000
# DefCpuPerGPU=4
# DefMemPerCPU=4000
# DefMemPerGPU=16000

# OpenHPC default configuration
#TaskPlugin=task/affinity
TaskPlugin=task/affinity,task/cgroup
PropagateResourceLimitsExcept=MEMLOCK
TaskPluginParam=autobind=cores
#AccountingStorageType=accounting_storage/mysql
#StorageLoc=slurm_acct_db

AccountingStorageType=accounting_storage/slurmdbd
AccountingStorageHost=cluster-slurmdbd1.example.com
#AccountingStorageType=accounting_storage/filetxt
Epilog=/etc/slurm/slurm.epilog.clean

#PartitionName=normal Nodes=c[1-5] Default=YES MaxTime=24:00:00 State=UP
PartitionName=DEFAULT State=UP Default=NO AllowGroups=ALL Priority=10 DisableRootJobs=NO RootOnly=NO Hidden=NO Shared=NO GraceTime=0 PreemptMode=OFF ReqResv=NO AllowAccounts=ALL AllowQos=ALL LLN=NO ExclusiveUser=NO  Nodes=nodeamd[009-016],c[1-4],nodehtc[001-025]

# Partitions

# Group Limited Queues

# OIT DEBUG QUEUE
PartitionName=debug Nodes=c[1-4] MaxTime=24:00:00 State=UP AllowGroups=oit-hpc-admin

# RNA CHEM
PartitionName=longq7-rna MinNodes=1 MaxNodes=30 DefaultTime=168:00:00 MaxTime=UNLIMITED Priority=200 Nodes=nodeamd[001-008],nodegpu[021-025] AllowGroups=gpu-rnachem

# V100's
PartitionName=longq7-mri MinNodes=1 MaxNodes=30 DefaultTime=168:00:00 MaxTime=168:00:00 Priority=200 Nodes=nodenviv100[001-016] AllowGroups=gpu-mri

# BIGDATA GRANT
PartitionName=longq-bigdata7 MinNodes=1 MaxNodes=30 DefaultTime=168:00:00 MaxTime=168:00:00 Priority=200 Nodes=node[087-098],nodegpu001 AllowGroups=fau-bigdata,nsf-bigdata

PartitionName=gpu-bigdata7 Default=NO MinNodes=1 Priority=10  AllowAccounts=ALL  Nodes=nodegpu001 AllowGroups=fau-bigdata,nsf-bigdata

# CogNeuroLab
PartitionName=CogNeuroLab Default=NO MinNodes=1 MaxNodes=4 MaxTime=7-12:00:00 AllowGroups=cogneurolab Priority=200 State=UP Nodes=node[001-004]

# Standard queues

# OPEN TO ALL

#Short Queue
PartitionName=shortq7 MinNodes=1 MaxNodes=30 DefaultTime=06:00:00 MaxTime=06:00:00 Priority=100 Nodes=nodeamd[001-016],nodenviv100[001-015],nodegpu[001-025],node[001-100],nodehtc[001-025]  Default=YES

# Medium Queue
PartitionName=mediumq7 MinNodes=1 MaxNodes=30 DefaultTime=72:00:00 MaxTime=72:00:00 Priority=50 Nodes=nodeamd[009-016],node[004-100]

# Long Queue
PartitionName=longq7 MinNodes=1 MaxNodes=30 DefaultTime=168:00:00 MaxTime=168:00:00 Priority=30 Nodes=nodeamd[009-016],node[004-100]

# Interactive
PartitionName=interactive MinNodes=1 MaxNodes=4 DefaultTime=06:00:00 MaxTime=06:00:00 Priority=101 Nodes=node[001-100]  Default=No Hidden=YES

# Nodes

# Test nodes, (vms)
NodeName=c[1-4] Cpus=4 Feature=virtual RealMemory=16000

# AMD Nodes
NodeName=nodeamd[001-016] Procs=64 Boards=1 SocketsPerBoard=8 CoresPerSocket=8 ThreadsPerCore=1 Features=amd,epyc RealMemory=225436

# V100 MRI
NodeName=nodenviv100[001-016] CPUs=64 Boards=1 SocketsPerBoard=2 CoresPerSocket=16 ThreadsPerCore=2 Gres=gpu:v100:4 Feature=v100 RealMemory=192006

# GPU nodes
NodeName=nodegpu001 Procs=40 Boards=1 SocketsPerBoard=2 CoresPerSocket=10 ThreadsPerCore=2 Gres=gpu:k80:8 Feature=k80,intel RealMemory=64000
NodeName=nodegpu002 Procs=40 Boards=1 SocketsPerBoard=2 CoresPerSocket=10 ThreadsPerCore=2 Gres=gpu:gk1:8 Feature=gk1,intel RealMemory=128000
NodeName=nodegpu[003-020] Boards=1 SocketsPerBoard=2 CoresPerSocket=8 ThreadsPerCore=2 Gres=gpu:gk1:8 Feature=gk1,intel RealMemory=128000
NodeName=nodegpu[021-025] Procs=16 Boards=1 SocketsPerBoard=2 CoresPerSocket=8 ThreadsPerCore=1 Gres=gpu:4 Feature=exxact,intel RealMemory=128000

# IvyBridge nodes
NodeName=node[001-021] Procs=20 Boards=1 SocketsPerBoard=2 CoresPerSocket=10 ThreadsPerCore=1 Feature=intel,ivybridge RealMemory=112750
# SandyBridge node(2)
NodeName=node022 Procs=16 Boards=1 SocketsPerBoard=2 CoresPerSocket=8 ThreadsPerCore=1 Feature=intel,sandybridge RealMemory=64000
# IvyBridge
NodeName=node[023-050] Procs=20 Boards=1 SocketsPerBoard=2 CoresPerSocket=10 ThreadsPerCore=1 Feature=intel,ivybridge RealMemory=112750
# Haswell
NodeName=node[051-100] Procs=20 Boards=1 SocketsPerBoard=2 CoresPerSocket=10 ThreadsPerCore=1 Feature=intel,haswell RealMemory=112750

# Node health monitoring
HealthCheckProgram=/usr/sbin/nhc
HealthCheckInterval=300
ReturnToService=2

# Fix for X11 issues
X11Parameters=use_raw_hostname

Rhian Resnick

Associate Director Research Computing

Enterprise Systems

Office of Information Technology

Florida Atlantic University

777 Glades Road, CM22, Rm 173B

Boca Raton, FL 33431

Phone 561.297.2647

Fax 561.297.0222

 [image] <https://hpc.fau.edu/wp-content/uploads/2015/01/image.jpg>

-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://lists.schedmd.com/pipermail/slurm-users/attachments/20200611/fdda3ada/attachment-0001.htm>