[slurm-users] Job not running with Resource Reason even though resources appear to be available

Paul Raines raines at nmr.mgh.harvard.edu
Sat Jan 23 17:54:11 UTC 2021


Yes, I meant job 38692.  Sorry.

I am still having the problem.  I suspect it has something to do with
the GPU configuration, since this does not happen on my partitions of
non-GPU nodes.  Also, if I submit non-GPU jobs to the rtx8000 partition,
they use up all the cores on the nodes just fine.

The upshot is that on my nodes with 10 GPUs each, I never see more than
6 GPUs in use, and jobs asking for just 1 or 2 GPUs are left waiting in the queue.

Here is an example.  The state of the nodes in the rtx8000 partition before
I submit any jobs:

rtx-04
    CfgTRES=cpu=32,mem=1546000M,billing=99,gres/gpu=10
    AllocTRES=cpu=15,mem=120G,gres/gpu=5
rtx-05
    CfgTRES=cpu=32,mem=1546000M,billing=99,gres/gpu=10
    AllocTRES=cpu=15,mem=328G,gres/gpu=5
rtx-06
    CfgTRES=cpu=32,mem=1546000M,billing=99,gres/gpu=10
    AllocTRES=cpu=15,mem=224G,gres/gpu=5
rtx-07
    CfgTRES=cpu=32,mem=1546000M,billing=99,gres/gpu=10
    AllocTRES=cpu=16,mem=232G,gres/gpu=6
rtx-08
    CfgTRES=cpu=32,mem=1546000M,billing=81,gres/gpu=4
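
Those per-node TRES lines can be pulled with a small loop over the rtx8000
nodes, something like:

    # print configured vs. allocated TRES for each rtx8000 node
    for n in rtx-04 rtx-05 rtx-06 rtx-07 rtx-08; do
        echo $n
        scontrol show node=$n | grep -E 'CfgTRES|AllocTRES'
    done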

I then submit 10 test jobs, each asking for 2 GPUs, 4 CPUs, and 32G of memory
(a sketch of the job script follows the listing).  The queue for rtx8000 is then:

NODELIST    JOBID PARTITION  ST TIME_LIMIT  TRES_ALLOC           TRES_PER
rtx-04      40365 rtx8000    R  7-00:00:00  cpu=3,mem=24G,node=1 gpu:1
rtx-04      38676 rtx8000    R  7-00:00:00  cpu=3,mem=24G,node=1 gpu:1
rtx-04      38673 rtx8000    R  7-00:00:00  cpu=3,mem=24G,node=1 gpu:1
rtx-04      38670 rtx8000    R  7-00:00:00  cpu=3,mem=24G,node=1 gpu:1
rtx-04      38409 rtx8000    R  7-00:00:00  cpu=3,mem=24G,node=1 gpu:1
rtx-05      40214 rtx8000    R  6-10:00:00  cpu=3,mem=128G,node= gpu:1
rtx-05      38677 rtx8000    R  7-00:00:00  cpu=3,mem=24G,node=1 gpu:1
rtx-05      38674 rtx8000    R  7-00:00:00  cpu=3,mem=24G,node=1 gpu:1
rtx-05      37450 rtx8000    R  6-10:00:00  cpu=3,mem=128G,node= gpu:1
rtx-05      37278 rtx8000    R  7-00:00:00  cpu=3,mem=24G,node=1 gpu:1
rtx-06      40366 rtx8000    R  7-00:00:00  cpu=3,mem=24G,node=1 gpu:1
rtx-06      40364 rtx8000    R  6-10:00:00  cpu=3,mem=128G,node= gpu:1
rtx-06      38648 rtx8000    R  7-00:00:00  cpu=3,mem=24G,node=1 gpu:1
rtx-06      38646 rtx8000    R  7-00:00:00  cpu=3,mem=24G,node=1 gpu:1
rtx-06      37267 rtx8000    R  7-00:00:00  cpu=3,mem=24G,node=1 gpu:1
rtx-07      40760 rtx8000    R  50:00       cpu=4,mem=32G,node=1 gpu:2
rtx-07      38675 rtx8000    R  7-00:00:00  cpu=3,mem=24G,node=1 gpu:1
rtx-07      38672 rtx8000    R  7-00:00:00  cpu=3,mem=24G,node=1 gpu:1
rtx-07      38671 rtx8000    R  7-00:00:00  cpu=3,mem=24G,node=1 gpu:1
rtx-07      37451 rtx8000    R  6-10:00:00  cpu=3,mem=128G,node= gpu:1
rtx-08      40785 rtx8000    R  50:00       cpu=4,mem=32G,node=1 gpu:2
rtx-08      40786 rtx8000    R  50:00       cpu=4,mem=32G,node=1 gpu:2
(Priorit    40794 rtx8000    PD 50:00       cpu=4,mem=32G,node=1 gpu:2
(Priorit    40793 rtx8000    PD 50:00       cpu=4,mem=32G,node=1 gpu:2
(Priorit    40792 rtx8000    PD 50:00       cpu=4,mem=32G,node=1 gpu:2
(Priorit    40791 rtx8000    PD 50:00       cpu=4,mem=32G,node=1 gpu:2
(Priorit    40790 rtx8000    PD 50:00       cpu=4,mem=32G,node=1 gpu:2
(Priorit    40789 rtx8000    PD 50:00       cpu=4,mem=32G,node=1 gpu:2
(Priorit    40788 rtx8000    PD 50:00       cpu=4,mem=32G,node=1 gpu:2
(Resourc    40787 rtx8000    PD 50:00       cpu=4,mem=32G,node=1 gpu:2
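
Each sjob_N here is the kind of "request resources and sleep" test job Gareth
suggested, submitted with a plain sbatch.  Roughly like this (the exact script
body does not matter, only the resource requests, which match the scontrol
output below):

    #!/bin/bash
    # minimal test job: 2 GPUs, 4 cores, 32G, 50 minute limit
    #SBATCH -p rtx8000
    #SBATCH -G 2
    #SBATCH -c 4
    #SBATCH --mem=32G
    #SBATCH -t 50:00
    sleep 2700    # just hold the allocation for the duration of the test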

[root@mlsc-head ~]# scontrol show job=40787
JobId=40787 JobName=sjob_5
    UserId=raines(5829) GroupId=raines(5829) MCS_label=N/A
    Priority=19836243 Nice=0 Account=sysadm QOS=normal
    JobState=PENDING Reason=Resources Dependency=(null)
    Requeue=1 Restarts=0 BatchFlag=1 Reboot=0 ExitCode=0:0
    RunTime=00:00:00 TimeLimit=00:50:00 TimeMin=N/A
    SubmitTime=2021-01-23T12:37:51 EligibleTime=2021-01-23T12:37:51
    AccrueTime=2021-01-23T12:37:51
    StartTime=2021-01-23T13:08:52 EndTime=2021-01-23T13:58:52 Deadline=N/A
    SuspendTime=None SecsPreSuspend=0 LastSchedEval=2021-01-23T12:38:36
    Partition=rtx8000 AllocNode:Sid=mlsc-head:1268664
    ReqNodeList=(null) ExcNodeList=(null)
    NodeList=(null) SchedNodeList=rtx-07
    NumNodes=1-2 NumCPUs=4 NumTasks=1 CPUs/Task=4 ReqB:S:C:T=0:0:*:*
    TRES=cpu=4,mem=32G,node=1,billing=11,gres/gpu=2
    Socks/Node=* NtasksPerN:B:S:C=0:0:*:1 CoreSpec=*
    MinCPUsNode=4 MinMemoryNode=32G MinTmpDiskNode=0
    Features=(null) DelayBoot=00:00:00
    OverSubscribe=OK Contiguous=0 Licenses=(null) Network=(null)
    Command=/autofs/cluster/batch/raines/sjob_5
    WorkDir=/autofs/cluster/batch/raines
    StdErr=/autofs/cluster/batch/raines/sjob_5.err40787
    StdIn=/dev/null
    StdOut=/autofs/cluster/batch/raines/sjob_5.out40787
    Power=
    TresPerJob=gpu:2
    MailUser=(null) MailType=NONE


[root@mlsc-head ~]# scontrol show node=rtx-04
NodeName=rtx-04 Arch=x86_64 CoresPerSocket=16
    CPUAlloc=15 CPUTot=32 CPULoad=18.21
    AvailableFeatures=intel,cascade,rtx8000
    ActiveFeatures=intel,cascade,rtx8000
    Gres=gpu:quadro_rtx_8000:10(S:0)
    NodeAddr=rtx-04 NodeHostName=rtx-04 Version=20.02.3
    OS=Linux 4.18.0-193.28.1.el8_2.x86_64 #1 SMP Thu Oct 22 00:20:22 UTC 2020
    RealMemory=1546000 AllocMem=122880 FreeMem=1413061 Sockets=2 Boards=1
    MemSpecLimit=2048
    State=MIXED ThreadsPerCore=1 TmpDisk=6000000 Weight=1 Owner=N/A
    MCS_label=N/A Partitions=rtx8000
    BootTime=2020-12-29T13:40:45 SlurmdStartTime=2020-12-29T13:44:12
    CfgTRES=cpu=32,mem=1546000M,billing=99,gres/gpu=10
    AllocTRES=cpu=15,mem=120G,gres/gpu=5
    CapWatts=n/a
    CurrentWatts=0 AveWatts=0
    ExtSensorsJoules=n/s ExtSensorsWatts=0 ExtSensorsTemp=n/s

And here is the per-node TRES with those test jobs in the queue:

rtx-04
    CfgTRES=cpu=32,mem=1546000M,billing=99,gres/gpu=10
    AllocTRES=cpu=15,mem=120G,gres/gpu=5
rtx-05
    CfgTRES=cpu=32,mem=1546000M,billing=99,gres/gpu=10
    AllocTRES=cpu=15,mem=328G,gres/gpu=5
rtx-06
    CfgTRES=cpu=32,mem=1546000M,billing=99,gres/gpu=10
    AllocTRES=cpu=15,mem=224G,gres/gpu=5
rtx-07
    CfgTRES=cpu=32,mem=1546000M,billing=99,gres/gpu=10
    AllocTRES=cpu=16,mem=232G,gres/gpu=6
rtx-08
    CfgTRES=cpu=32,mem=1546000M,billing=81,gres/gpu=4
    AllocTRES=cpu=8,mem=64G,gres/gpu=4

Now rtx-08, which has only 4 GPUs, always seems to get all 4 used.
But the others only ever seem to get half their GPUs used (except rtx-07,
which somehow gets 6 in use, so that is another weird thing).

Again, if I submit non-GPU jobs, they end up allocating all the
cores/cpus on the nodes just fine.
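
(For example, something like

    sbatch -p rtx8000 -c 4 --mem=32G -t 50:00 --wrap='sleep 2700'

submitted repeatedly keeps packing cores onto the nodes, while the same job
with a GPU request added gets stuck pending.)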

I have two nodes with RTX6000s in an rtx6000 partition, and those fill up,
using all their GPUs, just fine:


NODELIST    JOBID PARTITION  ST TIME_LIMIT  TRES_ALLOC           TRES_PER
rtx-01      40830 rtx6000    R  50:00       cpu=4,mem=32G,node=1 gpu:2
rtx-01      40831 rtx6000    R  50:00       cpu=4,mem=32G,node=1 gpu:2
rtx-01      40833 rtx6000    R  50:00       cpu=4,mem=32G,node=1 gpu:2
rtx-01      40835 rtx6000    R  50:00       cpu=4,mem=32G,node=1 gpu:2
rtx-02      40832 rtx6000    R  50:00       cpu=4,mem=32G,node=1 gpu:2
rtx-02      40834 rtx6000    R  50:00       cpu=4,mem=32G,node=1 gpu:2
rtx-02      40836 rtx6000    R  50:00       cpu=4,mem=32G,node=1 gpu:2
(Priorit    40839 rtx6000    PD 50:00       cpu=4,mem=32G,node=1 gpu:2
(Priorit    40838 rtx6000    PD 50:00       cpu=4,mem=32G,node=1 gpu:2
(Resourc    40837 rtx6000    PD 50:00       cpu=4,mem=32G,node=1 gpu:2

rtx-01
    CfgTRES=cpu=32,mem=1546000M,billing=89,gres/gpu=8
    AllocTRES=cpu=16,mem=128G,gres/gpu=8
rtx-02
    CfgTRES=cpu=32,mem=1546000M,billing=84,gres/gpu=6
    AllocTRES=cpu=12,mem=96G,gres/gpu=6

So maybe it is something odd about those 7-day jobs already running
on the rtx8000 boxes.  Here are two examples:

[root@mlsc-head ~]# scontrol show job=40365
JobId=40365 JobName=unet_1
    UserId=mu40(4181545) GroupId=mu40(4181545) MCS_label=N/A
    Priority=8813 Nice=0 Account=lcn QOS=normal
    JobState=RUNNING Reason=None Dependency=(null)
    Requeue=1 Restarts=0 BatchFlag=1 Reboot=0 ExitCode=0:0
    RunTime=01:50:42 TimeLimit=7-00:00:00 TimeMin=N/A
    SubmitTime=2021-01-23T10:56:01 EligibleTime=2021-01-23T10:56:01
    AccrueTime=2021-01-23T10:56:01
    StartTime=2021-01-23T10:56:02 EndTime=2021-01-30T10:56:02 Deadline=N/A
    SuspendTime=None SecsPreSuspend=0 LastSchedEval=2021-01-23T10:56:02
    Partition=rtx8000 AllocNode:Sid=mlsc-head:1266838
    ReqNodeList=(null) ExcNodeList=(null)
    NodeList=rtx-04
    BatchHost=rtx-04
    NumNodes=1 NumCPUs=3 NumTasks=1 CPUs/Task=3 ReqB:S:C:T=0:0:*:*
    TRES=cpu=3,mem=24G,node=1,billing=7,gres/gpu=1
    Socks/Node=* NtasksPerN:B:S:C=0:0:*:1 CoreSpec=*
    MinCPUsNode=3 MinMemoryNode=24G MinTmpDiskNode=0
    Features=(null) DelayBoot=00:00:00
    OverSubscribe=OK Contiguous=0 Licenses=(null) Network=(null)
    Command=(null)
    WorkDir=/homes/9/mu40/l/ge
    StdErr=/homes/9/mu40/l/jobs/mlsc-login.40365.log
    StdIn=/dev/null
    StdOut=/homes/9/mu40/l/jobs/mlsc-login.40365.log
    Power=
    TresPerJob=gpu:1
    MailUser=mu40 MailType=FAIL

[root@mlsc-head ~]# scontrol show job=38676
JobId=38676 JobName=int_sos
    UserId=mu40(4181545) GroupId=mu40(4181545) MCS_label=N/A
    Priority=96466 Nice=0 Account=lcn QOS=normal
    JobState=RUNNING Reason=None Dependency=(null)
    Requeue=0 Restarts=0 BatchFlag=1 Reboot=0 ExitCode=0:0
    RunTime=2-02:04:59 TimeLimit=7-00:00:00 TimeMin=N/A
    SubmitTime=2021-01-21T10:42:01 EligibleTime=2021-01-21T10:42:01
    AccrueTime=2021-01-21T10:42:01
    StartTime=2021-01-21T10:42:01 EndTime=2021-01-28T10:42:01 Deadline=N/A
    SuspendTime=None SecsPreSuspend=0 LastSchedEval=2021-01-21T10:42:01
    Partition=rtx8000 AllocNode:Sid=mlsc-head:965521
    ReqNodeList=(null) ExcNodeList=(null)
    NodeList=rtx-04
    BatchHost=rtx-04
    NumNodes=1 NumCPUs=3 NumTasks=1 CPUs/Task=3 ReqB:S:C:T=0:0:*:*
    TRES=cpu=3,mem=24G,node=1,billing=7,gres/gpu=1
    Socks/Node=* NtasksPerN:B:S:C=0:0:*:1 CoreSpec=*
    MinCPUsNode=3 MinMemoryNode=24G MinTmpDiskNode=0
    Features=(null) DelayBoot=00:00:00
    OverSubscribe=OK Contiguous=0 Licenses=(null) Network=(null)
    Command=(null)
    WorkDir=/homes/9/mu40/l/ge
    StdErr=/homes/9/mu40/l/jobs/mlsc-login.38676.log
    StdIn=/dev/null
    StdOut=/homes/9/mu40/l/jobs/mlsc-login.38676.log
    Power=
    TresPerJob=gpu:1
    MailUser=mu40 MailType=FAIL

I don't see anything obvious here.  Is it maybe the 7-day time limit?
Though if I submit my own jobs with a 7-day limit to the rtx6000
partition, I don't see the problem.
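
One thing I can still try is dumping the detailed allocation of the running
jobs on one of these nodes, to see exactly which cores and GPU indices the
long 7-day jobs are bound to (assuming I am reading the --details output
right), e.g.:

    # list running jobs on rtx-04 and show their per-node CPU id / GRES binding
    squeue -h -w rtx-04 -t R -o %i | while read j; do
        scontrol -d show job=$j | grep -E 'JobId=|CPU_IDs'
    done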

-- Paul Raines (http://help.nmr.mgh.harvard.edu)



On Thu, 21 Jan 2021 5:47pm, Williams, Gareth (IM&T, Black Mountain) wrote:

> I think job 38687 *is* being run on the rtx-06 node.
> I think you mean why job 38692 is not being run on the rtx-06 node (the top prio pending job).
>
> I can't see the problem... This (and other info) does seem to indicate that there is enough resource for the extra job:
>    CfgTRES=cpu=32,mem=1546000M,billing=99,gres/gpu=10
>    AllocTRES=cpu=16,mem=143G,gres/gpu=5
>
> If I were debugging this, I'd submit some test jobs that just request resource and sleep, and look for if a node ever allocates more than 16 cores/cpus or 5 gpus.
>
> Maybe the answer is in the comprehensive info you posted and someone will see the gem. Not me, sorry.
>
> Gareth
>
> -----Original Message-----
> From: slurm-users <slurm-users-bounces at lists.schedmd.com> On Behalf Of Paul Raines
> Sent: Friday, 22 January 2021 7:12 AM
> To: slurm-users at lists.schedmd.com
> Subject: [slurm-users] Job not running with Resource Reason even though resources appear to be available
>
>
> I am in the beginning of setting up my first SLURM cluster and I am trying to understand why jobs are pending when resources are available
>
> These are the pending jobs:
>
> # squeue -P --sort=-p,i --states=PD -O "JobID:.12 ,Partition:9 ,StateCompact:2
> ,Priority:.12 ,ReasonList"
>        JOBID PARTITION ST     PRIORITY NODELIST(REASON)
>        38692 rtx8000   PD 0.0046530945 (Resources)
>        38693 rtx8000   PD 0.0046530945 (Priority)
>        38694 rtx8000   PD 0.0046530906 (Priority)
>        38695 rtx8000   PD 0.0046530866 (Priority)
>        38696 rtx8000   PD 0.0046530866 (Priority)
>        38697 rtx8000   PD 0.0000208867 (Priority)
>
> The job at the top is as follows:
>
> Submission command line:
>
>   sbatch -p rtx8000 -G 1 -c 4 -t 12:00:00 --mem=47G \
>    -o /cluster/batch/iman/%j.out --wrap='cmd .....'
>
> # scontrol show job=38692
> JobId=38692 JobName=wrap
>    UserId=iman(8084) GroupId=iman(8084) MCS_label=N/A
>    Priority=19989863 Nice=0 Account=imanlab QOS=normal
>    JobState=PENDING Reason=Resources Dependency=(null)
>    Requeue=1 Restarts=0 BatchFlag=1 Reboot=0 ExitCode=0:0
>    RunTime=00:00:00 TimeLimit=12:00:00 TimeMin=N/A
>    SubmitTime=2021-01-21T13:05:02 EligibleTime=2021-01-21T13:05:02
>    AccrueTime=2021-01-21T13:05:02
>    StartTime=2021-01-22T01:05:02 EndTime=2021-01-22T13:05:02 Deadline=N/A
>    SuspendTime=None SecsPreSuspend=0 LastSchedEval=2021-01-21T14:04:32
>    Partition=rtx8000 AllocNode:Sid=mlsc-head:974529
>    ReqNodeList=(null) ExcNodeList=(null)
>    NodeList=(null) SchedNodeList=rtx-06
>    NumNodes=1-1 NumCPUs=4 NumTasks=1 CPUs/Task=4 ReqB:S:C:T=0:0:*:*
>    TRES=cpu=4,mem=47G,node=1,billing=8,gres/gpu=1
>    Socks/Node=* NtasksPerN:B:S:C=0:0:*:1 CoreSpec=*
>    MinCPUsNode=4 MinMemoryNode=47G MinTmpDiskNode=0
>    Features=(null) DelayBoot=00:00:00
>    OverSubscribe=OK Contiguous=0 Licenses=(null) Network=(null)
>    Command=(null)
>    WorkDir=/autofs/homes/008/iman
>    StdErr=/cluster/batch/iman/38692.out
>    StdIn=/dev/null
>    StdOut=/cluster/batch/iman/38692.out
>    Power=
>    TresPerJob=gpu:1
>    MailUser=(null) MailType=NONE
>
> This node shows it has enough free resources (cpu,mem,gpus) for the job in the partition
>
> # scontrol show node=rtx-06
> NodeName=rtx-06 Arch=x86_64 CoresPerSocket=16
>    CPUAlloc=16 CPUTot=32 CPULoad=5.77
>    AvailableFeatures=intel,cascade,rtx8000
>    ActiveFeatures=intel,cascade,rtx8000
>    Gres=gpu:quadro_rtx_8000:10(S:0)
>    NodeAddr=rtx-06 NodeHostName=rtx-06 Version=20.02.3
>    OS=Linux 4.18.0-193.28.1.el8_2.x86_64 #1 SMP Thu Oct 22 00:20:22 UTC 2020
>    RealMemory=1546000 AllocMem=146432 FreeMem=1420366 Sockets=2 Boards=1
>    MemSpecLimit=2048
>    State=MIXED ThreadsPerCore=1 TmpDisk=6000000 Weight=1 Owner=N/A MCS_label=N/A
>    Partitions=rtx8000
>    BootTime=2020-12-30T10:35:34 SlurmdStartTime=2020-12-30T10:37:21
>    CfgTRES=cpu=32,mem=1546000M,billing=99,gres/gpu=10
>    AllocTRES=cpu=16,mem=143G,gres/gpu=5
>    CapWatts=n/a
>    CurrentWatts=0 AveWatts=0
>    ExtSensorsJoules=n/s ExtSensorsWatts=0 ExtSensorsTemp=n/s
>
> # squeue --partition=rtx8000 --states=R -O "NodeList:10 ,JobID:.8 ,Partition:10,tres-alloc,tres-per-job" -w rtx-06
> NODELIST      JOBID PARTITION  TRES_ALLOC           TRES_PER_JOB
> rtx-06        38687 rtx8000    cpu=4,mem=47G,node=1 gpu:1
> rtx-06        37267 rtx8000    cpu=3,mem=24G,node=1 gpu:1
> rtx-06        37495 rtx8000    cpu=3,mem=24G,node=1 gpu:1
> rtx-06        38648 rtx8000    cpu=3,mem=24G,node=1 gpu:1
> rtx-06        38646 rtx8000    cpu=3,mem=24G,node=1 gpu:1
>
> In case this is needed
>
> # scontrol show part=rtx8000
> PartitionName=rtx8000
>    AllowGroups=ALL AllowAccounts=ALL AllowQos=ALL
>    AllocNodes=ALL Default=NO QoS=N/A
>    DefaultTime=04:00:00 DisableRootJobs=NO ExclusiveUser=NO GraceTime=0 Hidden=NO
>    MaxNodes=UNLIMITED MaxTime=7-00:00:00 MinNodes=0 LLN=NO MaxCPUsPerNode=UNLIMITED
>    Nodes=rtx-[04-08]
>    PriorityJobFactor=1 PriorityTier=4 RootOnly=NO ReqResv=NO OverSubscribe=NO
>    OverTimeLimit=NONE PreemptMode=OFF
>    State=UP TotalCPUs=160 TotalNodes=5 SelectTypeParameters=NONE
>    JobDefaults=(null)
>    DefMemPerNode=UNLIMITED MaxMemPerNode=UNLIMITED
>    TRESBillingWeights=CPU=1.24,Mem=0.02G,Gres/gpu=3.0
>
>
> Scheduling parameters from slurm.conf are:
>
> EnforcePartLimits=ALL
> LaunchParameters=mem_sort,slurmstepd_memlock_all,test_exec
> MaxJobCount=300000
> MaxArraySize=10000
> DefMemPerCPU=10240
> DefCpuPerGPU=1
> DefMemPerGPU=10240
> GpuFreqDef=medium
> CompleteWait=0
> EpilogMsgTime=3000000
> InactiveLimit=60
> KillWait=30
> UnkillableStepTimeout=180
> ResvOverRun=UNLIMITED
> MinJobAge=600
> Waittime=5
> SchedulerType=sched/backfill
> SelectType=select/cons_tres
> SelectTypeParameters=CR_Core_Memory,CR_CORE_DEFAULT_DIST_BLOCK,CR_ONE_TASK_PER_CORE
> PreemptType=preempt/partition_prio
> PreemptMode=REQUEUE
>
> SchedulerParameters=\
> default_queue_depth=1500,\
> partition_job_depth=10,\
> bf_continue,\
> bf_interval=30,\
> bf_resolution=600,\
> bf_window=11520,\
> bf_max_job_part=0,\
> bf_max_job_user=10,\
> bf_max_job_test=100000,\
> bf_max_job_start=1000,\
> bf_ignore_newly_avail_nodes,\
> enable_user_top,\
> pack_serial_at_end,\
> nohold_on_prolog_fail,\
> permit_job_expansion,\
> preempt_strict_order,\
> preempt_youngest_first,\
> reduce_completing_frag,\
> max_rpc_cnt=16
>
> DependencyParameters=kill_invalid_depend
>
>
> So any idea why job 38687 is not being run on the rtx-06 node
>
> ---------------------------------------------------------------
> Paul Raines                     http://help.nmr.mgh.harvard.edu
> MGH/MIT/HMS Athinoula A. Martinos Center for Biomedical Imaging
> 149 (2301) 13th Street     Charlestown, MA 02129	    USA
>


