[slurm-users] Preemption / Oversubscribe
Hidas, Dean
dhidas at bnl.gov
Tue Apr 14 18:27:35 UTC 2020
Dear Slurm Experts,
I am trying to implement a low-priority partition that overlaps with other partitions and whose jobs would be suspended when jobs from higher-priority-tier partitions need the resources. I noticed that when I do this and switch SelectTypeParameters from CR_Core to CR_CPU (my intention is to use each hyperthread as a separate CPU, for better or worse), I get oversubscription on nodes running jobs from PreemptMode=OFF partitions, for instance long and gdfidl below.
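For reference, this is roughly how I am checking the per-node allocation (node-003 is just one example node from the node list below):

# show allocated vs. total CPUs on one node
scontrol show node node-003 | grep -E 'CPUAlloc|CPUTot'
# list the jobs currently placed on that node, with partition, CPU count and state
squeue -w node-003 -o '%.10i %.10P %.5C %.8T'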
I've tried several variations of the first "Another Example" from https://slurm.schedmd.com/preempt.html, but they sometimes give me this behavior. I admittedly do not fully understand OverSubscribe, so I'm not sure I'm using it correctly. Basically, I would just like jobs on the covid partition to be suspended and resumed whenever a job from another partition needs the resources.
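For what it's worth, this is the shape of configuration I took away from that page (a minimal sketch with placeholder partition names, not my actual config, which is pasted in full below):

# cluster-wide: partition-priority preemption with suspend/gang scheduling
PreemptType=preempt/partition_prio
PreemptMode=SUSPEND,GANG
# low tier: preemptable scavenger partition (this is the role covid plays for me)
PartitionName=scavenge Nodes=node-00[1-5] PriorityTier=1 PreemptMode=SUSPEND OverSubscribe=FORCE:1
# higher tier: its jobs should suspend anything running from the low tier
PartitionName=primary Nodes=node-00[1-5] PriorityTier=2 PreemptMode=OFF OverSubscribe=FORCE:1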
Any pointers you could give would be much appreciated.
Best Regards,
-Dean Hidas
# slurm.conf file generated by configurator.html.
# Put this file on all nodes of your cluster.
# See the slurm.conf man page for more information.
#
SlurmctldHost=node-master
SlurmctldHost=node-001
#SlurmctldAddr=
#SlurmctldHost=
#
AuthType=auth/munge
#CheckpointType=checkpoint/none
CryptoType=crypto/munge
#DisableRootJobs=NO
#EnforcePartLimits=NO
#Epilog=
#EpilogSlurmctld=
#FirstJobId=1
#MaxJobId=999999
#GresTypes=
#GroupUpdateForce=0
#GroupUpdateTime=600
#JobCheckpointDir=/var/slurm/checkpoint
#JobCredentialPrivateKey=
#JobCredentialPublicCertificate=
#JobFileAppend=0
#JobRequeue=1
#JobSubmitPlugins=1
#KillOnBadExit=0
#LaunchType=launch/slurm
#Licenses=foo*4,bar
#MailProg=/bin/mail
MaxJobCount=1000000
MaxArraySize=2000000
#MaxStepCount=40000
#MaxTasksPerNode=128
MpiDefault=pmix_v3
MpiParams=ports=12000-12999
#PluginDir=
#PlugStackConfig=
#PrivateData=jobs
ProctrackType=proctrack/cgroup
#ProctrackType=proctrack/linuxproc
#Prolog=
PrologFlags=x11
#PrologSlurmctld=
#PropagatePrioProcess=0
#PropagateResourceLimits=
#PropagateResourceLimitsExcept=
#RebootProgram=
ReturnToService=1
#SallocDefaultCommand=
SlurmctldPidFile=/var/run/slurmctld.pid
SlurmctldPort=6817
SlurmdPidFile=/var/run/slurmd.pid
SlurmdPort=6818
SlurmdSpoolDir=/var/spool/slurmd
SlurmUser=slurm
#SlurmdUser=root
#SrunEpilog=
#SrunProlog=
#StateSaveLocation=/var/spool/slurmctld
StateSaveLocation=/sh/slurm/var/spool/slurmctld
SwitchType=switch/none
#TaskEpilog=
TaskPlugin=task/affinity,task/cgroup
#TaskPlugin=task/none
##TaskPluginParam=Sched
TaskPluginParam=None
#TaskProlog=
TopologyPlugin=topology/tree
#TopologyPlugin=topology/none
#TmpFS=/tmp
#TrackWCKey=no
#TreeWidth=
#UnkillableStepProgram=
#UsePAM=0
#X11Parameters=local_xauthority
#
# TIMERS
#BatchStartTimeout=10
#CompleteWait=0
#EpilogMsgTime=2000
#GetEnvTimeout=2
#HealthCheckInterval=0
#HealthCheckProgram=
InactiveLimit=0
KillWait=30
#MessageTimeout=10
#ResvOverRun=0
MinJobAge=300
#OverTimeLimit=0
SlurmctldTimeout=120
SlurmdTimeout=300
UnkillableStepTimeout=120
#VSizeFactor=0
Waittime=0
#
#
# SCHEDULING
#DefMemPerCPU=0
FastSchedule=1
#MaxMemPerCPU=0
#SchedulerTimeSlice=30
SchedulerType=sched/backfill
#SchedulerType=sched/builtin
SelectType=select/cons_res
#SelectTypeParameters=CR_Core
SelectTypeParameters=CR_CPU
#
#
# JOB PRIORITY
#PriorityFlags=
PriorityType=priority/multifactor
PriorityDecayHalfLife=7-0
#PriorityCalcPeriod=
#PriorityFavorSmall=
#PriorityMaxAge=
#PriorityUsageResetPeriod=
#PriorityWeightAge=
#PriorityWeightFairshare=
#PriorityWeightJobSize=
PriorityWeightPartition=100
#PriorityWeightQOS=
#
#
# LOGGING AND ACCOUNTING
AccountingStorageEnforce=safe
AccountingStorageHost=node-001
#AccountingStorageLoc=
#AccountingStoragePass=
#AccountingStoragePort=
AccountingStorageType=accounting_storage/slurmdbd
#AccountingStorageUser=
AccountingStoreJobComment=YES
ClusterName=apcluster
#DebugFlags=
#JobCompHost=
#JobCompLoc=
#JobCompPass=
#JobCompPort=
JobCompType=jobcomp/none
#JobCompUser=
#JobContainerType=job_container/none
JobAcctGatherFrequency=30
JobAcctGatherType=jobacct_gather/linux
SlurmctldDebug=3
SlurmctldLogFile=/var/log/slurm/slurmctld.log
SlurmdDebug=3
SlurmdLogFile=/var/log/slurm/slurmd.log
#SlurmSchedLogFile=
#SlurmSchedLogLevel=
#
#
# POWER SAVE SUPPORT FOR IDLE NODES (optional)
#SuspendProgram=
#ResumeProgram=
#SuspendTimeout=
#ResumeTimeout=
#ResumeRate=
#SuspendExcNodes=
#SuspendExcParts=
#SuspendRate=
#SuspendTime=
#
#
#
# PREEMPTION
PreemptMode=SUSPEND,GANG
PreemptType=preempt/partition_prio
#
#
# COMPUTE NODES
NodeName=node-001 CPUs=88 RealMemory=516911
NodeName=node-002 CPUs=88 RealMemory=516911
NodeName=node-003 CPUs=88 RealMemory=516911
NodeName=node-004 CPUs=88 RealMemory=516911
NodeName=node-005 CPUs=88 RealMemory=516911
NodeName=node-master CPUs=20 RealMemory=31914
NodeName=nodeb-002 CPUs=24 RealMemory=516917
NodeName=nodeb-003 CPUs=24 RealMemory=516917
NodeName=nodeb-004 CPUs=72 RealMemory=516914
NodeName=nodeb-005 CPUs=72 RealMemory=516914
NodeName=nodeb-007 CPUs=24 RealMemory=516917
NodeName=nodeb-008 CPUs=24 RealMemory=516917
NodeName=nodeb-009 CPUs=88 RealMemory=516911
NodeName=nodeb-010 CPUs=44 RealMemory=516915
NodeName=nodeb-011 CPUs=88 RealMemory=386655
NodeName=nodeb-012 CPUs=88 RealMemory=386655
NodeName=nodeb-013 CPUs=88 RealMemory=386655
NodeName=nodeb-014 CPUs=88 RealMemory=354399
NodeName=nodeb-015 CPUs=88 RealMemory=354399
NodeName=nodeb-019 CPUs=72 RealMemory=386657
NodeName=nodeb-020 CPUs=72 RealMemory=386657
NodeName=nodeb-021 CPUs=72 RealMemory=386657
NodeName=nodeb-022 CPUs=72 RealMemory=386657
NodeName=nodeb-023 CPUs=72 RealMemory=386657
NodeName=nodeb-024 CPUs=72 RealMemory=386657
NodeName=nodeb-025 CPUs=72 RealMemory=386657
#NodeName=nodeb-026 CPUs=72 RealMemory=386657
PartitionName=default Nodes=node-00[1-5] State=UP PreemptMode=OFF Default=NO OverSubscribe=FORCE:1
PartitionName=normal MaxTime=00-12:00:00 PriorityJobFactor=1500 PriorityTier=2 OverSubscribe=FORCE:1 PreemptMode=OFF Default=YES
PartitionName=tiny MaxTime=00-00:05:00 PriorityJobFactor=1500 PriorityTier=2 OverSubscribe=FORCE:1 PreemptMode=OFF
PartitionName=short MaxTime=00-04:00:00 PriorityJobFactor=1500 PriorityTier=2 OverSubscribe=FORCE:1 PreemptMode=OFF
PartitionName=low MaxTime=04-00:00:00 PriorityJobFactor=10 PriorityTier=1 OverSubscribe=FORCE:1 PreemptMode=OFF
PartitionName=high MaxTime=01-00:00:00 PriorityJobFactor=35000 PriorityTier=3 OverSubscribe=FORCE:1 PreemptMode=OFF
PartitionName=long MaxTime=04-00:00:00 PriorityJobFactor=1500 PriorityTier=2 OverSubscribe=FORCE:1 PreemptMode=OFF
PartitionName=longlong MaxTime=14-00:00:00 PriorityJobFactor=1500 PriorityTier=2 OverSubscribe=FORCE:1 PreemptMode=OFF
PartitionName=gdfidl MaxTime=14-00:00:00 PriorityJobFactor=1500 PriorityTier=2 OverSubscribe=FORCE:1 PreemptMode=OFF
PartitionName=debug MaxTime=00-00:30:00 PriorityJobFactor=50000 PriorityTier=2 OverSubscribe=FORCE:1 PreemptMode=OFF #MaxNodes=3 MaxCPUsPerNode=20
PartitionName=lix-atsas MaxTime=00-01:00:00 PriorityJobFactor=1500 PriorityTier=2 OverSubscribe=FORCE:1 PreemptMode=OFF Nodes=nodeb-002,nodeb-007
PartitionName=covid MaxTime=UNLIMITED PriorityJobFactor=1000 PriorityTier=1 OverSubscribe=FORCE:1 PreemptMode=SUSPEND Nodes=node-00[1-5],nodeb-[002-005],nodeb-[007-015],nodeb-[019-025]
--
Dean Andrew Hidas, Ph.D.
Associate Physicist
Brookhaven National Laboratory
Upton, NY 11973, U.S.A.
Phone: +1 631 344 3568
https://hidas.org