Hello all.
A user reported that a job wasn't starting, so I tried to replicate the
request and I got:
-8<--
[root@ophfe1 root.old]# scontrol show job 113936
JobId=113936 JobName=test.sh
UserId=root(0) GroupId=root(0) MCS_label=N/A
Priority=1 Nice=0 Account=root QOS=long
JobState=PENDING Reason=Priority Dependency=(null)
Requeue=1 Restarts=0 BatchFlag=1 Reboot=0 ExitCode=0:0
RunTime=00:00:00 TimeLimit=2-00:00:00 TimeMin=N/A
SubmitTime=2024-12-06T13:19:36 EligibleTime=2024-12-06T13:19:36
AccrueTime=2024-12-06T13:19:36
StartTime=Unknown EndTime=Unknown Deadline=N/A
SuspendTime=None SecsPreSuspend=0 LastSchedEval=2024-12-06T13:21:32
Scheduler=Backfill:*
Partition=m3 AllocNode:Sid=ophfe1:855189
ReqNodeList=(null) ExcNodeList=(null)
NodeList=
NumNodes=1-1 NumCPUs=96 NumTasks=96 CPUs/Task=1 ReqB:S:C:T=0:0:*:*
TRES=cpu=96,mem=95000M,node=1,billing=1296
Socks/Node=* NtasksPerN:B:S:C=0:0:*:* CoreSpec=*
MinCPUsNode=1 MinMemoryNode=95000M MinTmpDiskNode=0
Features=(null) DelayBoot=00:00:00
OverSubscribe=OK Contiguous=0 Licenses=(null) Network=(null)
Command=/home/root.old/test.sh
WorkDir=/home/root.old
StdErr=/home/root.old/%N-%J.err
StdIn=/dev/null
StdOut=/home/root.old/%N-%J.out
Power=
[root@ophfe1 root.old]# scontrol sho partition m3
PartitionName=m3
AllowGroups=ALL DenyAccounts=formazione AllowQos=ALL
AllocNodes=ALL Default=NO QoS=N/A
DefaultTime=NONE DisableRootJobs=NO ExclusiveUser=NO GraceTime=0
Hidden=NO
MaxNodes=UNLIMITED MaxTime=UNLIMITED MinNodes=0 LLN=NO
MaxCPUsPerNode=UNLIMITED
Nodes=mtx20
PriorityJobFactor=1 PriorityTier=1 RootOnly=NO ReqResv=NO
OverSubscribe=NO
OverTimeLimit=NONE PreemptMode=CANCEL
State=UP TotalCPUs=192 TotalNodes=1
SelectTypeParameters=CR_SOCKET_MEMORY
JobDefaults=(null)
DefMemPerNode=UNLIMITED MaxMemPerNode=UNLIMITED
TRES=cpu=192,mem=1150000M,node=1,billing=2592
TRESBillingWeights=CPU=13.500,Mem=2.2378G
[root@ophfe1 root.old]# scontrol show node mtx20
NodeName=mtx20 Arch=x86_64 CoresPerSocket=24
CPUAlloc=0 CPUEfctv=192 CPUTot=192 CPULoad=0.00
AvailableFeatures=ib,matrix,intel,avx
ActiveFeatures=ib,matrix,intel,avx
Gres=(null)
NodeAddr=mtx20 NodeHostName=mtx20 Version=22.05.6
OS=Linux 4.18.0-372.9.1.el8.x86_64 #1 SMP Tue May 10 14:48:47 UTC 2022
RealMemory=1150000 AllocMem=0 FreeMem=1156606 Sockets=4 Boards=1
MemSpecLimit=2048
State=IDLE ThreadsPerCore=2 TmpDisk=0 Weight=8 Owner=N/A MCS_label=N/A
Partitions=m3
BootTime=2024-12-06T10:01:42 SlurmdStartTime=2024-12-06T10:02:54
LastBusyTime=2024-12-06T10:51:58
CfgTRES=cpu=192,mem=1150000M,billing=2592
AllocTRES=
CapWatts=n/a
CurrentWatts=0 AveWatts=0
ExtSensorsJoules=n/s ExtSensorsWatts=0 ExtSensorsTemp=n/s
-8<--
So the node is idle and the partition does not impose any extra limits (it is
used only for accounting factors), but the job stays pending
(Reason=Priority) and does not start.
Any hints?
Thanks
--
Diego Zuccato
DIFA - Dip. di Fisica e Astronomia
Servizi Informatici
Alma Mater Studiorum - Università di Bologna
V.le Berti-Pichat 6/2 - 40127 Bologna - Italy
tel.: +39 051 20 95786