[slurm-users] GRES GPU issues

Lou Nicotra lnicotra at interactions.com
Mon Dec 3 13:44:52 MST 2018


Here you go... Thanks for looking into this...
lnicotra at tiger11 run# scontrol show config
Configuration data as of 2018-12-03T15:39:51
AccountingStorageBackupHost = (null)
AccountingStorageEnforce = none
AccountingStorageHost   = panther02
AccountingStorageLoc    = N/A
AccountingStoragePort   = 6819
AccountingStorageTRES   =
cpu,mem,energy,node,billing,fs/disk,vmem,pages,gres/gpu,gres/gpu:1080gtx,gres/gpu:k20
AccountingStorageType   = accounting_storage/slurmdbd
AccountingStorageUser   = N/A
AccountingStoreJobComment = Yes
AcctGatherEnergyType    = acct_gather_energy/none
AcctGatherFilesystemType = acct_gather_filesystem/none
AcctGatherInterconnectType = acct_gather_interconnect/none
AcctGatherNodeFreq      = 0 sec
AcctGatherProfileType   = acct_gather_profile/none
AllowSpecResourcesUsage = 0
AuthInfo                = (null)
AuthType                = auth/munge
BatchStartTimeout       = 10 sec
BOOT_TIME               = 2018-12-03T12:13:48
BurstBufferType         = (null)
CheckpointType          = checkpoint/none
ClusterName             = sltgroup
CommunicationParameters = (null)
CompleteWait            = 0 sec
CoreSpecPlugin          = core_spec/none
CpuFreqDef              = Unknown
CpuFreqGovernors        = Performance,OnDemand
CryptoType              = crypto/munge
DebugFlags              = NO_CONF_HASH
DefMemPerNode           = UNLIMITED
DisableRootJobs         = No
EioTimeout              = 60
EnforcePartLimits       = NO
Epilog                  = (null)
EpilogMsgTime           = 2000 usec
EpilogSlurmctld         = (null)
ExtSensorsType          = ext_sensors/none
ExtSensorsFreq          = 0 sec
FastSchedule            = 1
FederationParameters    = (null)
FirstJobId              = 1
GetEnvTimeout           = 2 sec
GresTypes               = gpu
GroupUpdateForce        = 1
GroupUpdateTime         = 600 sec
HASH_VAL                = Different Ours=0xcfc037c0 Slurmctld=0x940764e7
HealthCheckInterval     = 0 sec
HealthCheckNodeState    = ANY
HealthCheckProgram      = (null)
InactiveLimit           = 0 sec
JobAcctGatherFrequency  = 30
JobAcctGatherType       = jobacct_gather/linux
JobAcctGatherParams     = (null)
JobCheckpointDir        = /var/slurm/checkpoint
JobCompHost             = localhost
JobCompLoc              = /var/log/slurm_jobcomp.log
JobCompPort             = 0
JobCompType             = jobcomp/none
JobCompUser             = root
JobContainerType        = job_container/none
JobCredentialPrivateKey = (null)
JobCredentialPublicCertificate = (null)
JobDefaults             = (null)
JobFileAppend           = 0
JobRequeue              = 1
JobSubmitPlugins        = (null)
KeepAliveTime           = SYSTEM_DEFAULT
KillOnBadExit           = 0
KillWait                = 30 sec
LaunchParameters        = (null)
LaunchType              = launch/slurm
Layouts                 =
Licenses                = (null)
LicensesUsed            = (null)
LogTimeFormat           = iso8601_ms
MailDomain              = (null)
MailProg                = /bin/mail
MaxArraySize            = 1001
MaxJobCount             = 10000
MaxJobId                = 67043328
MaxMemPerNode           = UNLIMITED
MaxStepCount            = 40000
MaxTasksPerNode         = 512
MCSPlugin               = mcs/none
MCSParameters           = (null)
MemLimitEnforce         = Yes
MessageTimeout          = 10 sec
MinJobAge               = 300 sec
MpiDefault              = none
MpiParams               = (null)
MsgAggregationParams    = (null)
NEXT_JOB_ID             = 749
NodeFeaturesPlugins     = (null)
OverTimeLimit           = 0 min
PluginDir               = /usr/lib64/slurm
PlugStackConfig         = /etc/slurm/plugstack.conf
PowerParameters         = (null)
PowerPlugin             =
PreemptMode             = OFF
PreemptType             = preempt/none
PriorityParameters      = (null)
PriorityType            = priority/basic
PrivateData             = none
ProctrackType           = proctrack/pgid
Prolog                  = (null)
PrologEpilogTimeout     = 65534
PrologSlurmctld         = (null)
PrologFlags             = (null)
PropagatePrioProcess    = 0
PropagateResourceLimits = ALL
PropagateResourceLimitsExcept = (null)
RebootProgram           = (null)
ReconfigFlags           = (null)
RequeueExit             = (null)
RequeueExitHold         = (null)
ResumeFailProgram       = (null)
ResumeProgram           = (null)
ResumeRate              = 300 nodes/min
ResumeTimeout           = 60 sec
ResvEpilog              = (null)
ResvOverRun             = 0 min
ResvProlog              = (null)
ReturnToService         = 0
RoutePlugin             = route/default
SallocDefaultCommand    = (null)
SbcastParameters        = (null)
SchedulerParameters     = (null)
SchedulerTimeSlice      = 30 sec
SchedulerType           = sched/backfill
SelectType              = select/linear
SlurmUser               = slurm(902)
SlurmctldAddr           = (null)
SlurmctldDebug          = info
SlurmctldHost[0]        = panther02
SlurmctldHost[1]        = perf2(X.X.X.X)
SlurmctldLogFile        = /var/log/slurm/slurmctld.log
SlurmctldPort           = 6817
SlurmctldSyslogDebug    = unknown
SlurmctldPrimaryOffProg = (null)
SlurmctldPrimaryOnProg  = (null)
SlurmctldTimeout        = 300 sec
SlurmctldParameters     = (null)
SlurmdDebug             = info
SlurmdLogFile           = /var/log/slurm/slurmd.log
SlurmdParameters        = (null)
SlurmdPidFile           = /var/run/slurmd.pid
SlurmdPort              = 6818
SlurmdSpoolDir          = /var/spool/slurmd
SlurmdSyslogDebug       = unknown
SlurmdTimeout           = 300 sec
SlurmdUser              = root(0)
SlurmSchedLogFile       = (null)
SlurmSchedLogLevel      = 0
SlurmctldPidFile        = /var/run/slurmctld.pid
SlurmctldPlugstack      = (null)
SLURM_CONF              = /etc/slurm/slurm.conf
SLURM_VERSION           = 18.08.0
SrunEpilog              = (null)
SrunPortRange           = 0-0
SrunProlog              = (null)
StateSaveLocation       = /n/common/opt/slurm/slurmctld
SuspendExcNodes         = (null)
SuspendExcParts         = (null)
SuspendProgram          = (null)
SuspendRate             = 60 nodes/min
SuspendTime             = NONE
SuspendTimeout          = 30 sec
SwitchType              = switch/none
TaskEpilog              = (null)
TaskPlugin              = task/none
TaskPluginParam         = (null type)
TaskProlog              = (null)
TCPTimeout              = 2 sec
TmpFS                   = /tmp
TopologyParam           = (null)
TopologyPlugin          = topology/none
TrackWCKey              = No
TreeWidth               = 50
UsePam                  = 0
UnkillableStepProgram   = (null)
UnkillableStepTimeout   = 60 sec
VSizeFactor             = 0 percent
WaitTime                = 0 sec
X11Parameters           = (null)

Slurmctld(primary) at panther02 is UP
Slurmctld(backup) at perf2 is UP
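
One thing that jumps out of that dump is the HASH_VAL line: this node is
computing a different hash of slurm.conf than the slurmctld is, which (as far
as I understand the hash check) usually means the compute node's copy of
slurm.conf is out of sync with the controller's, and DebugFlags=NO_CONF_HASH
keeps the daemons from logging the mismatch. A rough way to compare the
copies, assuming they all live in /etc/slurm and using hosts from this thread
as examples:

  md5sum /etc/slurm/slurm.conf              # on the controller (panther02)
  ssh tiger11 md5sum /etc/slurm/slurm.conf  # on the GPU node
  # if the hashes differ, sync the controller's slurm.conf to the node, then:
  scontrol reconfigure                      # or restart slurmd on that node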


On Mon, Dec 3, 2018 at 2:44 PM Michael Di Domenico <mdidomenico4 at gmail.com>
wrote:

> are you willing to paste an `scontrol show config` from the machine
> having trouble?
> On Mon, Dec 3, 2018 at 12:10 PM Lou Nicotra <lnicotra at interactions.com>
> wrote:
> >
> > I'm running  slurmd version 18.08.0...
> >
> > It seems that the system recognizes the GPUs after a slurmd restart. I
> turned debug up to 5, restarted, and then submitted a job. Nothing gets
> logged to the log file on the local server...
> > [2018-12-03T11:55:18.442] Slurmd shutdown completing
> > [2018-12-03T11:55:18.484] debug:  Log file re-opened
> > [2018-12-03T11:55:18.485] debug:  CPUs:48 Boards:1 Sockets:2
> CoresPerSocket:12 ThreadsPerCore:2
> > [2018-12-03T11:55:18.485] Message aggregation disabled
> > [2018-12-03T11:55:18.486] debug:  CPUs:48 Boards:1 Sockets:2
> CoresPerSocket:12 ThreadsPerCore:2
> > [2018-12-03T11:55:18.486] debug:  init: Gres GPU plugin loaded
> > [2018-12-03T11:55:18.486] Gres Name=gpu Type=K20 Count=2
> > [2018-12-03T11:55:18.487] gpu device number 0(/dev/nvidia0):c 195:0 rwm
> > [2018-12-03T11:55:18.487] gpu device number 1(/dev/nvidia1):c 195:1 rwm
> > [2018-12-03T11:55:18.487] topology NONE plugin loaded
> > [2018-12-03T11:55:18.487] route default plugin loaded
> > [2018-12-03T11:55:18.530] debug:  Resource spec: No specialized cores
> configured by default on this node
> > [2018-12-03T11:55:18.530] debug:  Resource spec: Reserved system memory
> limit not configured for this node
> > [2018-12-03T11:55:18.530] debug:  task NONE plugin loaded
> > [2018-12-03T11:55:18.530] debug:  Munge authentication plugin loaded
> > [2018-12-03T11:55:18.530] debug:  spank: opening plugin stack
> /etc/slurm/plugstack.conf
> > [2018-12-03T11:55:18.530] Munge cryptographic signature plugin loaded
> > [2018-12-03T11:55:18.532] slurmd version 18.08.0 started
> > [2018-12-03T11:55:18.532] debug:  Job accounting gather LINUX plugin
> loaded
> > [2018-12-03T11:55:18.532] debug:  job_container none plugin loaded
> > [2018-12-03T11:55:18.532] debug:  switch NONE plugin loaded
> > [2018-12-03T11:55:18.532] slurmd started on Mon, 03 Dec 2018 11:55:18
> -0500
> > [2018-12-03T11:55:18.533] CPUs=48 Boards=1 Sockets=2 Cores=12 Threads=2
> Memory=386757 TmpDisk=4758 Uptime=21165906 CPUSpecList=(null)
> FeaturesAvail=(null) FeaturesActive=(null)
> > [2018-12-03T11:55:18.533] debug:  AcctGatherEnergy NONE plugin loaded
> > [2018-12-03T11:55:18.533] debug:  AcctGatherProfile NONE plugin loaded
> > [2018-12-03T11:55:18.533] debug:  AcctGatherInterconnect NONE plugin
> loaded
> > [2018-12-03T11:55:18.533] debug:  AcctGatherFilesystem NONE plugin loaded
> > root at tiger11 slurm#
> >
> > So, I turned debug up to 5 in slurmctld on the master server, and after I
> submitted my job, it shows...
> > [2018-12-03T12:02:10.355] _job_create: account 'lnicotra' has no
> association for user 1498 using default account 'slt'
> > [2018-12-03T12:02:10.356] _slurm_rpc_submit_batch_job: Invalid Trackable
> RESource (TRES) specification
> >
> > So, we use LDAP for authentication and my UID is 1498, but I created a
> user in slurm using my login name. The default account for all users is
> "slt". Is this the cause of my problems?
> > root at panther02 slurm# getent passwd lnicotra
> > lnicotra:*:1498:1152:Lou Nicotra:/home/lnicotra:/bin/bash
> >
> > If so, how is this resolved, given that we use multiple servers and there
> are no local accounts on them?
> >
> > Thanks!
> > Lou
> >
> >
> >
> > On Mon, Dec 3, 2018 at 11:36 AM Michael Di Domenico <
> mdidomenico4 at gmail.com> wrote:
> >>
> >> do you get anything additional in the slurm logs?  have you tried
> >> adding gres to the debugflags?  what version of slurm are you running?
> >> On Mon, Dec 3, 2018 at 9:18 AM Lou Nicotra <lnicotra at interactions.com>
> wrote:
> >> >
> >> > Hi All, I have recently set up a slurm cluster with my servers and
> I'm running into an issue while submitting GPU jobs. It has something to do
> with gres configurations, but I just can't seem to figure out what is
> wrong. Non-GPU jobs run fine.
> >> >
> >> > The error after submitting a batch job is as follows:
> >> > sbatch: error: Batch job submission failed: Invalid Trackable
> RESource (TRES) specification
> >> >
> >> > My batch job is as follows:
> >> > #!/bin/bash
> >> > #SBATCH --partition=tiger_1   # partition name
> >> > #SBATCH --gres=gpu:k20:1
> >> > #SBATCH --gres-flags=enforce-binding
> >> > #SBATCH --time=0:20:00  # wall clock limit
> >> > #SBATCH --output=gpu-%J.txt
> >> > #SBATCH --account=lnicotra
> >> > module load cuda
> >> > python gpu1
> >> >
> >> > Where gpu1 is a GPU test script that runs correctly when invoked
> directly via python. The tiger_1 partition has servers with GPUs, a mix of
> 1080GTX and K20, as specified in slurm.conf.
> >> >
> >> > I have defined GRES resources in the slurm.conf file:
> >> > # GPU GRES
> >> > GresTypes=gpu
> >> > NodeName=tiger[01,05,10,15,20] Gres=gpu:1080gtx:2
> >> > NodeName=tiger[02-04,06-09,11-14,16-19,21-22] Gres=gpu:k20:2
> >> >
> >> > And have a local gres.conf on the servers containing GPUs...
> >> > lnicotra at tiger11 ~# cat /etc/slurm/gres.conf
> >> > # GPU Definitions
> >> > # NodeName=tiger[02-04,06-09,11-14,16-19,21-22] Name=gpu Type=K20
> File=/dev/nvidia[0-1]
> >> > Name=gpu Type=K20 File=/dev/nvidia[0-1] Cores=0,1
> >> >
> >> > and a similar one for the 1080GTX
> >> > # GPU Definitions
> >> > # NodeName=tiger[01,05,10,15,20] Name=gpu Type=1080GTX
> File=/dev/nvidia[0-1]
> >> > Name=gpu Type=1080GTX File=/dev/nvidia[0-1] Cores=0,1
> >> >
> >> > The account manager seems to know about the GPUs...
> >> > lnicotra at tiger11 ~# sacctmgr show tres
> >> >     Type            Name     ID
> >> > -------- --------------- ------
> >> >      cpu                      1
> >> >      mem                      2
> >> >   energy                      3
> >> >     node                      4
> >> >  billing                      5
> >> >       fs            disk      6
> >> >     vmem                      7
> >> >    pages                      8
> >> >     gres             gpu   1001
> >> >     gres         gpu:k20   1002
> >> >     gres     gpu:1080gtx   1003
> >> >
> >> > Can anyone point out what I am missing?
> >> >
> >> > Thanks!
> >> > Lou
> >> >
> >> >
> >> > --
> >> >
> >> > Lou Nicotra
> >> >
> >> > IT Systems Engineer - SLT
> >> >
> >> > Interactions LLC
> >> >
> >> > o:  908-673-1833
> >> >
> >> > m: 908-451-6983
> >> >
> >> > lnicotra at interactions.com
> >> >
> >> > www.interactions.com
> >> >
> >> >
> >>
> >
> >
> > --
> >
> > Lou Nicotra
> >
> > IT Systems Engineer - SLT
> >
> > Interactions LLC
> >
> > o:  908-673-1833
> >
> > m: 908-451-6983
> >
> > lnicotra at interactions.com
> >
> > www.interactions.com
> >
> >
>
>
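
PS: re-reading the slurmctld log quoted above, the "account 'lnicotra' has no
association for user 1498 using default account 'slt'" line makes me think
the --account=lnicotra in my batch script does not match any association in
slurmdbd, which would also explain the TRES rejection. As a rough sketch (the
account and cluster names are just the ones from this thread), the fix would
be either to submit against the account that exists:

  #SBATCH --account=slt

or to create the missing account/association and then re-check what the
controller sees for the node:

  sacctmgr add account lnicotra cluster=sltgroup
  sacctmgr add user lnicotra account=lnicotra
  sacctmgr show assoc where user=lnicotra format=cluster,account,user
  scontrol show node tiger11 | grep -i gres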

-- 

Lou Nicotra

IT Systems Engineer - SLT

Interactions LLC

o: 908-673-1833

m: 908-451-6983

lnicotra at interactions.com
www.interactions.com
