[slurm-users] Slurmstepd errors
Matthew BETTINGER
matthew.bettinger at external.total.com
Tue Jul 28 04:52:59 UTC 2020
Hello,
We are running Slurm 17.02.6 on a Cray system, and all of a sudden we have been receiving these error messages from slurmstepd. Not sure what triggers this?
srun -N 4 -n 4 hostname
nid00031
slurmstepd: error: task/cgroup: unable to add task[pid=903] to memory cg '(null)'
nid00029
nid00030
slurmstepd: error: task/cgroup: unable to add task[pid=50322] to memory cg '(null)'
nid00032
The jobs seem to be running fine, but this just popped up for some reason.
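In case it helps, here is what I was planning to check on one of the affected nodes, on the guess that the '(null)' memory cg path means the memory cgroup hierarchy is not being found. These are just my guesses; the cgroup.conf path is an assumption based on it usually sitting next to slurm.conf in /etc/opt/slurm.

# Is the memory cgroup controller enabled and mounted on the node?
grep memory /proc/cgroups
mount | grep 'cgroup.*memory'

# Where does a job step actually land?
srun -N 1 -n 1 cat /proc/self/cgroup

# cgroup.conf is assumed to live next to slurm.conf
cat /etc/opt/slurm/cgroup.conf

# Which of our plugins reference cgroups?
scontrol show config | grep -iE 'TaskPlugin|ProctrackType|JobContainer'

The full configuration dump follows.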
Configuration data as of 2020-07-27T23:51:10
AccountingStorageBackupHost = (null)
AccountingStorageEnforce = none
AccountingStorageHost = 1
AccountingStorageLoc = N/A
AccountingStoragePort = 6819
AccountingStorageTRES = gres/gpu,gres/craynetwork,bb/cray,cpu,mem,energy,node
AccountingStorageType = accounting_storage/slurmdbd
AccountingStorageUser = N/A
AccountingStoreJobComment = Yes
AcctGatherEnergyType = acct_gather_energy/rapl
AcctGatherFilesystemType = acct_gather_filesystem/none
AcctGatherInfinibandType = acct_gather_infiniband/none
AcctGatherNodeFreq = 30 sec
AcctGatherProfileType = acct_gather_profile/none
AllowSpecResourcesUsage = 1
AuthInfo = (null)
AuthType = auth/munge
BackupAddr = hickory-2
BackupController = hickory-2
BatchStartTimeout = 10 sec
BOOT_TIME = 2020-07-27T14:27:51
BurstBufferType = burst_buffer/cray
CacheGroups = 0
CheckpointType = checkpoint/none
ChosLoc = (null)
ClusterName = hickory
CompleteWait = 0 sec
ControlAddr = hickory-1
ControlMachine = hickory-1
CoreSpecPlugin = cray
CpuFreqDef = Performance
CpuFreqGovernors = Performance,OnDemand
CryptoType = crypto/munge
DebugFlags = (null)
DefMemPerNode = UNLIMITED
DisableRootJobs = No
EioTimeout = 60
EnforcePartLimits = NO
Epilog = (null)
EpilogMsgTime = 2000 usec
EpilogSlurmctld = (null)
ExtSensorsType = ext_sensors/none
ExtSensorsFreq = 0 sec
FairShareDampeningFactor = 1
FastSchedule = 0
FirstJobId = 1
GetEnvTimeout = 2 sec
GresTypes = gpu,craynetwork
GroupUpdateForce = 1
GroupUpdateTime = 600 sec
HASH_VAL = Match
HealthCheckInterval = 0 sec
HealthCheckNodeState = ANY
HealthCheckProgram = (null)
InactiveLimit = 0 sec
JobAcctGatherFrequency = 30
JobAcctGatherType = jobacct_gather/linux
JobAcctGatherParams = (null)
JobCheckpointDir = /var/slurm/checkpoint
JobCompHost = localhost
JobCompLoc = /var/log/slurm_jobcomp.log
JobCompPort = 0
JobCompType = jobcomp/none
JobCompUser = root
JobContainerType = job_container/cncu
JobCredentialPrivateKey = (null)
JobCredentialPublicCertificate = (null)
JobFileAppend = 0
JobRequeue = 1
JobSubmitPlugins = cray
KeepAliveTime = SYSTEM_DEFAULT
KillOnBadExit = 1
KillWait = 30 sec
LaunchParameters = (null)
LaunchType = launch/slurm
Layouts =
Licenses = (null)
LicensesUsed = (null)
MailDomain = (null)
MailProg = /bin/mail
MaxArraySize = 1001
MaxJobCount = 10000
MaxJobId = 67043328
MaxMemPerCPU = 128450
MaxStepCount = 40000
MaxTasksPerNode = 512
MCSPlugin = mcs/none
MCSParameters = (null)
MemLimitEnforce = Yes
MessageTimeout = 10 sec
MinJobAge = 300 sec
MpiDefault = none
MpiParams = ports=20000-32767
MsgAggregationParams = (null)
NEXT_JOB_ID = 2760029
NodeFeaturesPlugins = (null)
OverTimeLimit = 0 min
PluginDir = /opt/slurm/17.02.6/lib64/slurm
PlugStackConfig = /etc/opt/slurm/plugstack.conf
PowerParameters = (null)
PowerPlugin =
PreemptMode = CANCEL
PreemptType = preempt/partition_prio
PriorityParameters = (null)
PriorityDecayHalfLife = 7-00:00:00
PriorityCalcPeriod = 00:05:00
PriorityFavorSmall = No
PriorityFlags =
PriorityMaxAge = 7-00:00:00
PriorityUsageResetPeriod = NONE
PriorityType = priority/multifactor
PriorityWeightAge = 0
PriorityWeightFairShare = 0
PriorityWeightJobSize = 0
PriorityWeightPartition = 0
PriorityWeightQOS = 0
PriorityWeightTRES = (null)
PrivateData = none
ProctrackType = proctrack/cray
Prolog = (null)
PrologEpilogTimeout = 65534
PrologSlurmctld = (null)
PrologFlags = (null)
PropagatePrioProcess = 0
PropagateResourceLimits = (null)
PropagateResourceLimitsExcept = AS
RebootProgram = (null)
ReconfigFlags = (null)
RequeueExit = (null)
RequeueExitHold = (null)
ResumeProgram = (null)
ResumeRate = 300 nodes/min
ResumeTimeout = 60 sec
ResvEpilog = (null)
ResvOverRun = 0 min
ResvProlog = (null)
ReturnToService = 2
RoutePlugin = route/default
SallocDefaultCommand = (null)
SbcastParameters = (null)
SchedulerParameters = bf_max_job_test=1000,bf_max_job_user=50,bf_continue
SchedulerTimeSlice = 30 sec
SchedulerType = sched/backfill
SelectType = select/cray
SelectTypeParameters = CR_CORE_MEMORY,OTHER_CONS_RES,NHC_ABSOLUTELY_NO
SlurmUser = root(0)
SlurmctldDebug = info
SlurmctldLogFile = /var/spool/slurm/slurmctld.log
SlurmctldPort = 6817
SlurmctldTimeout = 120 sec
SlurmdDebug = info
SlurmdLogFile = /var/spool/slurmd/%h.log
SlurmdPidFile = /var/spool/slurmd/slurmd.pid
SlurmdPlugstack = (null)
SlurmdPort = 6818
SlurmdSpoolDir = /var/spool/slurmd
SlurmdTimeout = 300 sec
SlurmdUser = root(0)
SlurmSchedLogFile = (null)
SlurmSchedLogLevel = 0
SlurmctldPidFile = /var/spool/slurm/slurmctld.pid
SlurmctldPlugstack = (null)
SLURM_CONF = /etc/opt/slurm/slurm.conf
SLURM_VERSION = 17.02.6
SrunEpilog = (null)
SrunPortRange = 0-0
SrunProlog = (null)
StateSaveLocation = /apps/cluster/hickory/slurm/
SuspendExcNodes = (null)
SuspendExcParts = (null)
SuspendProgram = (null)
SuspendRate = 60 nodes/min
SuspendTime = NONE
SuspendTimeout = 30 sec
SwitchType = switch/cray
TaskEpilog = (null)
TaskPlugin = task/cray,task/affinity,task/cgroup
TaskPluginParam = (null type)
TaskProlog = (null)
TCPTimeout = 2 sec
TmpFS = /tmp
TopologyParam = (null)
TopologyPlugin = topology/none
TrackWCKey = No
TreeWidth = 50
UsePam = 0
UnkillableStepProgram = (null)
UnkillableStepTimeout = 60 sec
VSizeFactor = 0 percent
WaitTime = 0 sec
Slurmctld(primary/backup) at hickory-1/hickory-2 are UP/UP
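Since TaskPlugin here includes task/cgroup, my next step (again, just a guess at where to look) is to pull the cgroup-related lines out of slurmd's log on one of the affected nodes; with SlurmdLogFile = /var/spool/slurmd/%h.log that should be something like:

ssh nid00031 "grep -i cgroup /var/spool/slurmd/nid00031.log | tail -n 20"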