[slurm-users] Queue size, slow/unresponsive head node

Colas Rivière riviere at umdgrb.umd.edu
Thu Jan 11 14:39:43 MST 2018


Hello,

I'm managing a small cluster (one head node, 24 workers, 1160 total 
worker threads). The head node has two E5-2680 v3 CPUs (hyper-threaded), 
~100 GB of memory and spinning disks.
The head node occasionally becomes less responsive when there are more 
than 10k jobs in the queue, and becomes nearly unmanageable when the 
queue reaches 100k jobs, with error messages such as:
> sbatch: error: Slurm temporarily unable to accept job, sleeping and 
> retrying.
or
> Running: slurm_load_jobs error: Socket timed out on send/recv operation
Is it normal to experience slowdowns when the queue reaches a few tens 
of thousands of jobs? What queue size should I expect the head node to 
handle? Would adding an SSD drive for SlurmdSpoolDir help? What can be 
done to push this limit?
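
I have not dug very far into the scheduler statistics yet; I assume 
something like the following would show whether slurmctld's RPC queue or 
the backfill cycle is the bottleneck (only a sketch, based on the sdiag 
man page):

    # Scheduler statistics from slurmctld: server thread count, agent
    # queue size, main scheduler cycle times, backfill cycle times/depth.
    sdiag

    # Reset the counters, then check again after the next large submission.
    sdiag --reset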

The cluster runs Slurm 17.02.4 on CentOS 6 and the config is attached 
(from `scontrol show config`).
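
As the attached config shows, SchedulerParameters is currently unset and 
StateSaveLocation is /var/spool/slurm on the spinning disks. In case it 
is relevant, the kind of change I was considering looks like this 
(untested here; defer, max_rpc_cnt, bf_continue and bf_max_job_test are 
options from the slurm.conf documentation, and the values and the SSD 
mount point are placeholders, not recommendations):

    # slurm.conf sketch (untested): defer per-submission scheduling and
    # cap the pending RPC count before the scheduler backs off.
    SchedulerParameters=defer,max_rpc_cnt=150,bf_continue,bf_max_job_test=1000
    # Move controller state to faster storage (hypothetical SSD mount).
    StateSaveLocation=/ssd/slurm/state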

Thanks,
Colas
-------------- next part --------------
Configuration data as of 2018-01-11T15:47:25
AccountingStorageBackupHost = (null)
AccountingStorageEnforce = associations,limits
AccountingStorageHost   = mckenzie
AccountingStorageLoc    = N/A
AccountingStoragePort   = 6819
AccountingStorageTRES   = cpu,mem,energy,node
AccountingStorageType   = accounting_storage/slurmdbd
AccountingStorageUser   = N/A
AccountingStoreJobComment = Yes
AcctGatherEnergyType    = acct_gather_energy/none
AcctGatherFilesystemType = acct_gather_filesystem/none
AcctGatherInfinibandType = acct_gather_infiniband/none
AcctGatherNodeFreq      = 0 sec
AcctGatherProfileType   = acct_gather_profile/none
AllowSpecResourcesUsage = 0
AuthInfo                = (null)
AuthType                = auth/munge
BackupAddr              = (null)
BackupController        = (null)
BatchStartTimeout       = 10 sec
BOOT_TIME               = 2017-09-05T13:00:11
BurstBufferType         = (null)
CacheGroups             = 0
CheckpointType          = checkpoint/none
ChosLoc                 = (null)
ClusterName             = blackbox
CompleteWait            = 0 sec
ControlAddr             = mckenzie
ControlMachine          = mckenzie
CoreSpecPlugin          = core_spec/none
CpuFreqDef              = Unknown
CpuFreqGovernors        = Performance,OnDemand
CryptoType              = crypto/munge
DebugFlags              = (null)
DefMemPerCPU            = 1291
DisableRootJobs         = Yes
EioTimeout              = 60
EnforcePartLimits       = ANY
Epilog                  = /etc/slurm/job_scripts/epilog
EpilogMsgTime           = 2000 usec
EpilogSlurmctld         = (null)
ExtSensorsType          = ext_sensors/none
ExtSensorsFreq          = 0 sec
FairShareDampeningFactor = 1
FastSchedule            = 1
FirstJobId              = 1
GetEnvTimeout           = 2 sec
GresTypes               = bandwidth
GroupUpdateForce        = 1
GroupUpdateTime         = 600 sec
HASH_VAL                = Match
HealthCheckInterval     = 0 sec
HealthCheckNodeState    = ANY
HealthCheckProgram      = (null)
InactiveLimit           = 0 sec
JobAcctGatherFrequency  = 30
JobAcctGatherType       = jobacct_gather/linux
JobAcctGatherParams     = (null)
JobCheckpointDir        = /var/slurm/checkpoint
JobCompHost             = localhost
JobCompLoc              = /var/log/slurm/jobcomp.log
JobCompPort             = 0
JobCompType             = jobcomp/none
JobCompUser             = root
JobContainerType        = job_container/none
JobCredentialPrivateKey = (null)
JobCredentialPublicCertificate = (null)
JobFileAppend           = 0
JobRequeue              = 1
JobSubmitPlugins        = (null)
KeepAliveTime           = SYSTEM_DEFAULT
KillOnBadExit           = 0
KillWait                = 30 sec
LaunchParameters        = (null)
LaunchType              = launch/slurm
Layouts                 = 
Licenses                = (null)
LicensesUsed            = (null)
MailDomain              = (null)
MailProg                = /bin/mail
MaxArraySize            = 1001
MaxJobCount             = 100000
MaxJobId                = 67043328
MaxMemPerNode           = UNLIMITED
MaxStepCount            = 40000
MaxTasksPerNode         = 512
MCSPlugin               = mcs/none
MCSParameters           = (null)
MemLimitEnforce         = Yes
MessageTimeout          = 10 sec
MinJobAge               = 100 sec
MpiDefault              = none
MpiParams               = (null)
MsgAggregationParams    = (null)
NEXT_JOB_ID             = 13077546
NodeFeaturesPlugins     = (null)
OverTimeLimit           = 0 min
PluginDir               = /usr/lib64/slurm
PlugStackConfig         = /etc/slurm/plugstack.conf
PowerParameters         = (null)
PowerPlugin             = 
PreemptMode             = OFF
PreemptType             = preempt/none
PriorityParameters      = (null)
PriorityDecayHalfLife   = 1-00:00:00
PriorityCalcPeriod      = 00:05:00
PriorityFavorSmall      = No
PriorityFlags           = SMALL_RELATIVE_TO_TIME,FAIR_TREE,MAX_TRES
PriorityMaxAge          = 7-00:00:00
PriorityUsageResetPeriod = NONE
PriorityType            = priority/multifactor
PriorityWeightAge       = 0
PriorityWeightFairShare = 100000
PriorityWeightJobSize   = 0
PriorityWeightPartition = 0
PriorityWeightQOS       = 0
PriorityWeightTRES      = (null)
PrivateData             = none
ProctrackType           = proctrack/linuxproc
Prolog                  = /etc/slurm/job_scripts/prolog
PrologEpilogTimeout     = 65534
PrologSlurmctld         = (null)
PrologFlags             = (null)
PropagatePrioProcess    = 2
PropagateResourceLimits = ALL
PropagateResourceLimitsExcept = (null)
RebootProgram           = (null)
ReconfigFlags           = (null)
RequeueExit             = (null)
RequeueExitHold         = (null)
ResumeProgram           = (null)
ResumeRate              = 300 nodes/min
ResumeTimeout           = 60 sec
ResvEpilog              = (null)
ResvOverRun             = 0 min
ResvProlog              = (null)
ReturnToService         = 0
RoutePlugin             = route/default
SallocDefaultCommand    = (null)
SbcastParameters        = (null)
SchedulerParameters     = (null)
SchedulerTimeSlice      = 30 sec
SchedulerType           = sched/backfill
SelectType              = select/cons_res
SelectTypeParameters    = CR_CORE_MEMORY,CR_LLN
SlurmUser               = slurm(1041)
SlurmctldDebug          = error
SlurmctldLogFile        = /var/log/slurm/slurmctld.log
SlurmctldPort           = 6817
SlurmctldTimeout        = 120 sec
SlurmdDebug             = error
SlurmdLogFile           = /var/log/slurm/slurmd.log
SlurmdPidFile           = /var/run/slurmd.pid
SlurmdPlugstack         = (null)
SlurmdPort              = 6818
SlurmdSpoolDir          = /var/spool/slurmd
SlurmdTimeout           = 300 sec
SlurmdUser              = root(0)
SlurmSchedLogFile       = /var/log/slurm/slurm-sched.log
SlurmSchedLogLevel      = 1
SlurmctldPidFile        = /var/run/slurmctld.pid
SlurmctldPlugstack      = (null)
SLURM_CONF              = /etc/slurm/slurm.conf
SLURM_VERSION           = 17.02.4
SrunEpilog              = /etc/slurm/job_scripts/epilog.srun
SrunPortRange           = 0-0
SrunProlog              = /etc/slurm/job_scripts/prolog.srun
StateSaveLocation       = /var/spool/slurm
SuspendExcNodes         = (null)
SuspendExcParts         = (null)
SuspendProgram          = (null)
SuspendRate             = 60 nodes/min
SuspendTime             = NONE
SuspendTimeout          = 30 sec
SwitchType              = switch/none
TaskEpilog              = /etc/slurm/job_scripts/epilog.task
TaskPlugin              = task/none
TaskPluginParam         = (null type)
TaskProlog              = /etc/slurm/job_scripts/prolog.task
TCPTimeout              = 2 sec
TmpFS                   = /local
TopologyParam           = (null)
TopologyPlugin          = topology/none
TrackWCKey              = No
TreeWidth               = 50
UsePam                  = 0
UnkillableStepProgram   = (null)
UnkillableStepTimeout   = 60 sec
VSizeFactor             = 0 percent
WaitTime                = 0 sec

Slurmctld(primary/backup) at mckenzie/(NULL) are UP/DOWN

