<div dir="ltr"><div dir="ltr">Here you go... Thanks for looking into this...<div><div>lnicotra@tiger11 run# scontrol show config</div><div>Configuration data as of 2018-12-03T15:39:51</div><div>AccountingStorageBackupHost = (null)</div><div>AccountingStorageEnforce = none</div><div>AccountingStorageHost = panther02</div><div>AccountingStorageLoc = N/A</div><div>AccountingStoragePort = 6819</div><div>AccountingStorageTRES = cpu,mem,energy,node,billing,fs/disk,vmem,pages,gres/gpu,gres/gpu:1080gtx,gres/gpu:k20</div><div>AccountingStorageType = accounting_storage/slurmdbd</div><div>AccountingStorageUser = N/A</div><div>AccountingStoreJobComment = Yes</div><div>AcctGatherEnergyType = acct_gather_energy/none</div><div>AcctGatherFilesystemType = acct_gather_filesystem/none</div><div>AcctGatherInterconnectType = acct_gather_interconnect/none</div><div>AcctGatherNodeFreq = 0 sec</div><div>AcctGatherProfileType = acct_gather_profile/none</div><div>AllowSpecResourcesUsage = 0</div><div>AuthInfo = (null)</div><div>AuthType = auth/munge</div><div>BatchStartTimeout = 10 sec</div><div>BOOT_TIME = 2018-12-03T12:13:48</div><div>BurstBufferType = (null)</div><div>CheckpointType = checkpoint/none</div><div>ClusterName = sltgroup</div><div>CommunicationParameters = (null)</div><div>CompleteWait = 0 sec</div><div>CoreSpecPlugin = core_spec/none</div><div>CpuFreqDef = Unknown</div><div>CpuFreqGovernors = Performance,OnDemand</div><div>CryptoType = crypto/munge</div><div>DebugFlags = NO_CONF_HASH</div><div>DefMemPerNode = UNLIMITED</div><div>DisableRootJobs = No</div><div>EioTimeout = 60</div><div>EnforcePartLimits = NO</div><div>Epilog = (null)</div><div>EpilogMsgTime = 2000 usec</div><div>EpilogSlurmctld = (null)</div><div>ExtSensorsType = ext_sensors/none</div><div>ExtSensorsFreq = 0 sec</div><div>FastSchedule = 1</div><div>FederationParameters = (null)</div><div>FirstJobId = 1</div><div>GetEnvTimeout = 2 sec</div><div>GresTypes = gpu</div><div>GroupUpdateForce = 1</div><div>GroupUpdateTime = 600 sec</div><div>HASH_VAL = Different Ours=0xcfc037c0 Slurmctld=0x940764e7</div><div>HealthCheckInterval = 0 sec</div><div>HealthCheckNodeState = ANY</div><div>HealthCheckProgram = (null)</div><div>InactiveLimit = 0 sec</div><div>JobAcctGatherFrequency = 30</div><div>JobAcctGatherType = jobacct_gather/linux</div><div>JobAcctGatherParams = (null)</div><div>JobCheckpointDir = /var/slurm/checkpoint</div><div>JobCompHost = localhost</div><div>JobCompLoc = /var/log/slurm_jobcomp.log</div><div>JobCompPort = 0</div><div>JobCompType = jobcomp/none</div><div>JobCompUser = root</div><div>JobContainerType = job_container/none</div><div>JobCredentialPrivateKey = (null)</div><div>JobCredentialPublicCertificate = (null)</div><div>JobDefaults = (null)</div><div>JobFileAppend = 0</div><div>JobRequeue = 1</div><div>JobSubmitPlugins = (null)</div><div>KeepAliveTime = SYSTEM_DEFAULT</div><div>KillOnBadExit = 0</div><div>KillWait = 30 sec</div><div>LaunchParameters = (null)</div><div>LaunchType = launch/slurm</div><div>Layouts =</div><div>Licenses = (null)</div><div>LicensesUsed = (null)</div><div>LogTimeFormat = iso8601_ms</div><div>MailDomain = (null)</div><div>MailProg = /bin/mail</div><div>MaxArraySize = 1001</div><div>MaxJobCount = 10000</div><div>MaxJobId = 67043328</div><div>MaxMemPerNode = UNLIMITED</div><div>MaxStepCount = 40000</div><div>MaxTasksPerNode = 512</div><div>MCSPlugin = mcs/none</div><div>MCSParameters = (null)</div><div>MemLimitEnforce = Yes</div><div>MessageTimeout = 10 sec</div><div>MinJobAge = 300 
sec</div><div>MpiDefault = none</div><div>MpiParams = (null)</div><div>MsgAggregationParams = (null)</div><div>NEXT_JOB_ID = 749</div><div>NodeFeaturesPlugins = (null)</div><div>OverTimeLimit = 0 min</div><div>PluginDir = /usr/lib64/slurm</div><div>PlugStackConfig = /etc/slurm/plugstack.conf</div><div>PowerParameters = (null)</div><div>PowerPlugin =</div><div>PreemptMode = OFF</div><div>PreemptType = preempt/none</div><div>PriorityParameters = (null)</div><div>PriorityType = priority/basic</div><div>PrivateData = none</div><div>ProctrackType = proctrack/pgid</div><div>Prolog = (null)</div><div>PrologEpilogTimeout = 65534</div><div>PrologSlurmctld = (null)</div><div>PrologFlags = (null)</div><div>PropagatePrioProcess = 0</div><div>PropagateResourceLimits = ALL</div><div>PropagateResourceLimitsExcept = (null)</div><div>RebootProgram = (null)</div><div>ReconfigFlags = (null)</div><div>RequeueExit = (null)</div><div>RequeueExitHold = (null)</div><div>ResumeFailProgram = (null)</div><div>ResumeProgram = (null)</div><div>ResumeRate = 300 nodes/min</div><div>ResumeTimeout = 60 sec</div><div>ResvEpilog = (null)</div><div>ResvOverRun = 0 min</div><div>ResvProlog = (null)</div><div>ReturnToService = 0</div><div>RoutePlugin = route/default</div><div>SallocDefaultCommand = (null)</div><div>SbcastParameters = (null)</div><div>SchedulerParameters = (null)</div><div>SchedulerTimeSlice = 30 sec</div><div>SchedulerType = sched/backfill</div><div>SelectType = select/linear</div><div>SlurmUser = slurm(902)</div><div>SlurmctldAddr = (null)</div><div>SlurmctldDebug = info</div><div>SlurmctldHost[0] = panther02</div><div>SlurmctldHost[1] = perf2(X.X.X.X)</div><div>SlurmctldLogFile = /var/log/slurm/slurmctld.log</div><div>SlurmctldPort = 6817</div><div>SlurmctldSyslogDebug = unknown</div><div>SlurmctldPrimaryOffProg = (null)</div><div>SlurmctldPrimaryOnProg = (null)</div><div>SlurmctldTimeout = 300 sec</div><div>SlurmctldParameters = (null)</div><div>SlurmdDebug = info</div><div>SlurmdLogFile = /var/log/slurm/slurmd.log</div><div>SlurmdParameters = (null)</div><div>SlurmdPidFile = /var/run/slurmd.pid</div><div>SlurmdPort = 6818</div><div>SlurmdSpoolDir = /var/spool/slurmd</div><div>SlurmdSyslogDebug = unknown</div><div>SlurmdTimeout = 300 sec</div><div>SlurmdUser = root(0)</div><div>SlurmSchedLogFile = (null)</div><div>SlurmSchedLogLevel = 0</div><div>SlurmctldPidFile = /var/run/slurmctld.pid</div><div>SlurmctldPlugstack = (null)</div><div>SLURM_CONF = /etc/slurm/slurm.conf</div><div>SLURM_VERSION = 18.08.0</div><div>SrunEpilog = (null)</div><div>SrunPortRange = 0-0</div><div>SrunProlog = (null)</div><div>StateSaveLocation = /n/common/opt/slurm/slurmctld</div><div>SuspendExcNodes = (null)</div><div>SuspendExcParts = (null)</div><div>SuspendProgram = (null)</div><div>SuspendRate = 60 nodes/min</div><div>SuspendTime = NONE</div><div>SuspendTimeout = 30 sec</div><div>SwitchType = switch/none</div><div>TaskEpilog = (null)</div><div>TaskPlugin = task/none</div><div>TaskPluginParam = (null type)</div><div>TaskProlog = (null)</div><div>TCPTimeout = 2 sec</div><div>TmpFS = /tmp</div><div>TopologyParam = (null)</div><div>TopologyPlugin = topology/none</div><div>TrackWCKey = No</div><div>TreeWidth = 50</div><div>UsePam = 0</div><div>UnkillableStepProgram = (null)</div><div>UnkillableStepTimeout = 60 sec</div><div>VSizeFactor = 0 percent</div><div>WaitTime = 0 sec</div><div>X11Parameters = (null)</div><div><br></div><div>Slurmctld(primary) at panther02 is UP</div><div>Slurmctld(backup) at perf2 is 
UP</div></div><div><br></div></div></div><br><div class="gmail_quote"><div dir="ltr">On Mon, Dec 3, 2018 at 2:44 PM Michael Di Domenico <<a href="mailto:mdidomenico4@gmail.com">mdidomenico4@gmail.com</a>> wrote:<br></div><blockquote class="gmail_quote" style="margin:0 0 0 .8ex;border-left:1px #ccc solid;padding-left:1ex">are you willing to paste an `scontrol show config` from the machine<br>
having trouble?
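One line in the dump above stands out: HASH_VAL = Different means the slurm.conf that slurmd read on tiger11 does not hash to the same value as the one slurmctld on panther02 is running with (the NO_CONF_HASH debug flag suppresses the usual log message about this). A quick way to compare the two files, assuming the SLURM_CONF path shown in the dump:

    # run on both tiger11 and panther02 and compare the checksums
    md5sum /etc/slurm/slurm.conf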
On Mon, Dec 3, 2018 at 12:10 PM Lou Nicotra <lnicotra@interactions.com> wrote:
>
> I'm running slurmd version 18.08.0...
>
> It seems that the system recognizes the GPUs after a slurmd restart. I turned debug up to 5, restarted, and then submitted a job. Nothing gets logged to the log file on the local server...
> [2018-12-03T11:55:18.442] Slurmd shutdown completing
> [2018-12-03T11:55:18.484] debug: Log file re-opened
> [2018-12-03T11:55:18.485] debug: CPUs:48 Boards:1 Sockets:2 CoresPerSocket:12 ThreadsPerCore:2
> [2018-12-03T11:55:18.485] Message aggregation disabled
> [2018-12-03T11:55:18.486] debug: CPUs:48 Boards:1 Sockets:2 CoresPerSocket:12 ThreadsPerCore:2
> [2018-12-03T11:55:18.486] debug: init: Gres GPU plugin loaded
> [2018-12-03T11:55:18.486] Gres Name=gpu Type=K20 Count=2
> [2018-12-03T11:55:18.487] gpu device number 0(/dev/nvidia0):c 195:0 rwm
> [2018-12-03T11:55:18.487] gpu device number 1(/dev/nvidia1):c 195:1 rwm
> [2018-12-03T11:55:18.487] topology NONE plugin loaded
> [2018-12-03T11:55:18.487] route default plugin loaded
> [2018-12-03T11:55:18.530] debug: Resource spec: No specialized cores configured by default on this node
> [2018-12-03T11:55:18.530] debug: Resource spec: Reserved system memory limit not configured for this node
> [2018-12-03T11:55:18.530] debug: task NONE plugin loaded
> [2018-12-03T11:55:18.530] debug: Munge authentication plugin loaded
> [2018-12-03T11:55:18.530] debug: spank: opening plugin stack /etc/slurm/plugstack.conf
> [2018-12-03T11:55:18.530] Munge cryptographic signature plugin loaded
> [2018-12-03T11:55:18.532] slurmd version 18.08.0 started
> [2018-12-03T11:55:18.532] debug: Job accounting gather LINUX plugin loaded
> [2018-12-03T11:55:18.532] debug: job_container none plugin loaded
> [2018-12-03T11:55:18.532] debug: switch NONE plugin loaded
> [2018-12-03T11:55:18.532] slurmd started on Mon, 03 Dec 2018 11:55:18 -0500
> [2018-12-03T11:55:18.533] CPUs=48 Boards=1 Sockets=2 Cores=12 Threads=2 Memory=386757 TmpDisk=4758 Uptime=21165906 CPUSpecList=(null) FeaturesAvail=(null) FeaturesActive=(null)
> [2018-12-03T11:55:18.533] debug: AcctGatherEnergy NONE plugin loaded
> [2018-12-03T11:55:18.533] debug: AcctGatherProfile NONE plugin loaded
> [2018-12-03T11:55:18.533] debug: AcctGatherInterconnect NONE plugin loaded
> [2018-12-03T11:55:18.533] debug: AcctGatherFilesystem NONE plugin loaded
> root@tiger11 slurm#
>
> So, I turned debug up to 5 in slurmctld on the master server, and after I submitted my job, it shows...
> [2018-12-03T12:02:10.355] _job_create: account 'lnicotra' has no association for user 1498 using default account 'slt'
> [2018-12-03T12:02:10.356] _slurm_rpc_submit_batch_job: Invalid Trackable RESource (TRES) specification
>
> So, we use LDAP for authentication and my UID is 1498, but I created a user in slurm using my login name. The default account for all users is "slt". Is this the cause of my problems?
> root@panther02 slurm# getent passwd lnicotra
> lnicotra:*:1498:1152:Lou Nicotra:/home/lnicotra:/bin/bash
>
> If so, how is this resolved, as we use multiple servers and there are no local accounts for them?
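A missing association is normally fixed in the accounting database rather than with OS accounts: Slurm only needs the username to resolve through getent (LDAP is fine for that), and the association itself lives in slurmdbd. A minimal sketch with sacctmgr, assuming the cluster and account names shown in this thread (sltgroup, slt):

    # run as a Slurm administrator on a host that can reach slurmdbd
    sacctmgr add account slt cluster=sltgroup       # only if "slt" does not exist yet
    sacctmgr add user lnicotra account=slt cluster=sltgroup
    # verify the association slurmctld will look up at submit time
    sacctmgr show assoc where user=lnicotra format=Cluster,Account,User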
>
> Thanks!
> Lou
>
> On Mon, Dec 3, 2018 at 11:36 AM Michael Di Domenico <mdidomenico4@gmail.com> wrote:
>>
>> do you get anything additional in the slurm logs? have you tried
>> adding gres to the debugflags? what version of slurm are you running?
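For reference, GRES debugging can be turned on without touching anything else; a minimal sketch, using the stock slurm.conf path from this thread:

    # /etc/slurm/slurm.conf
    DebugFlags=Gres

    # or set it on the fly, without restarting the daemons:
    scontrol setdebugflags +gres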
>> On Mon, Dec 3, 2018 at 9:18 AM Lou Nicotra <lnicotra@interactions.com> wrote:
>> >
>> > Hi All, I have recently set up a slurm cluster with my servers and I'm running into an issue while submitting GPU jobs. It has something to do with gres configurations, but I just can't seem to figure out what is wrong. Non-GPU jobs run fine.
>> >
>> > After submitting a batch job, the error is as follows:
>> > sbatch: error: Batch job submission failed: Invalid Trackable RESource (TRES) specification
>> >
>> > My batch job is as follows:
>> > #!/bin/bash
>> > #SBATCH --partition=tiger_1  # partition name
>> > #SBATCH --gres=gpu:k20:1
>> > #SBATCH --gres-flags=enforce-binding
>> > #SBATCH --time=0:20:00  # wall clock limit
>> > #SBATCH --output=gpu-%J.txt
>> > #SBATCH --account=lnicotra
>> > module load cuda
>> > python gpu1
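Worth noting: the script requests --account=lnicotra, and the slurmctld log quoted earlier in the thread shows that account has no association and falls back to the default "slt". Once the accounting side is settled, the TRES spec can be re-tested without actually running anything; a sketch, with a hypothetical script name:

    # --test-only validates the submission and prints a scheduling
    # estimate instead of queueing the job
    sbatch --test-only --account=slt gpu_job.sh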
>> >
>> > Where gpu1 is a GPU test script that runs correctly when invoked via python. The tiger_1 partition has servers with GPUs, a mix of 1080GTX and K20, as specified in slurm.conf.
>> >
>> > I have defined GRES resources in the slurm.conf file:
>> > # GPU GRES
>> > GresTypes=gpu
>> > NodeName=tiger[01,05,10,15,20] Gres=gpu:1080gtx:2
>> > NodeName=tiger[02-04,06-09,11-14,16-19,21-22] Gres=gpu:k20:2
>> >
>> > And have a local gres.conf on the servers containing GPUs...
>> > lnicotra@tiger11 ~# cat /etc/slurm/gres.conf
>> > # GPU Definitions
>> > # NodeName=tiger[02-04,06-09,11-14,16-19,21-22] Name=gpu Type=K20 File=/dev/nvidia[0-1]
>> > Name=gpu Type=K20 File=/dev/nvidia[0-1] Cores=0,1
>> >
>> > and a similar one for the 1080GTX
>> > # GPU Definitions
>> > # NodeName=tiger[01,05,10,15,20] Name=gpu Type=1080GTX File=/dev/nvidia[0-1]
>> > Name=gpu Type=1080GTX File=/dev/nvidia[0-1] Cores=0,1
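One detail worth double-checking: slurm.conf spells the types "k20"/"1080gtx" while gres.conf uses "K20"/"1080GTX"; making the spellings identical removes one variable while debugging. To see whether the controller and the nodes agree on the GRES definitions (standard commands, node name taken from the lists above):

    # per-node GRES as slurmctld sees it
    scontrol show node tiger11 | grep -i gres
    # quick cluster-wide view of configured GRES per node
    sinfo -N -o "%N %G"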
>> >
>> > The account manager seems to know about the GPUs...
>> > lnicotra@tiger11 ~# sacctmgr show tres
>> >     Type            Name     ID
>> > -------- --------------- ------
>> >      cpu                      1
>> >      mem                      2
>> >   energy                      3
>> >     node                      4
>> >  billing                      5
>> >       fs            disk      6
>> >     vmem                      7
>> >    pages                      8
>> >     gres             gpu   1001
>> >     gres         gpu:k20   1002
>> >     gres     gpu:1080gtx   1003
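The controller keeps its own TRES list as well; the scontrol dump at the top of the thread shows AccountingStorageTRES already carrying gres/gpu:k20 and gres/gpu:1080gtx. A quick consistency check (plain greps, config path from the dump above):

    # TRES the controller is accounting for
    scontrol show config | grep AccountingStorageTRES
    # spelling of the gres definitions the daemons were started with
    grep -i gres /etc/slurm/slurm.conf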
>> >
>> > Can anyone point out what I am missing?
>> >
>> > Thanks!
>> > Lou
<br>
--
Lou Nicotra
IT Systems Engineer - SLT
Interactions LLC
o: 908-673-1833
m: 908-451-6983
lnicotra@interactions.com
www.interactions.com