[slurm-users] Slurm : compute node status is UNKNOWN and Reason=NO NETWORK ADDRESS FOUND
Zainul Abiddin
zainul1114 at gmail.com
Tue Feb 2 12:45:55 UTC 2021
Hi All,
Please help me resolve this issue.
My compute node (snode) is in state UNKNOWN with Reason=NO NETWORK ADDRESS FOUND.
Master node (smaster):
[root@smaster ~]# cat /etc/slurm/slurm.conf
# slurm.conf file generated by configurator easy.html.
# Put this file on all nodes of your cluster.
# See the slurm.conf man page for more information.
#
ControlMachine=smaster
ControlAddr=192.168.1.195
#
#MailProg=/bin/mail
MpiDefault=none
#MpiParams=ports=#-#
ProctrackType=proctrack/pgid
ReturnToService=1
SlurmctldPidFile=/var/run/slurmctld.pid
#SlurmctldPort=6817
SlurmdPidFile=/var/run/slurmd.pid
#SlurmdPort=6818
SlurmdSpoolDir=/var/spool/slurmd
SlurmUser=slurm
#SlurmdUser=root
StateSaveLocation=/var/spool/slurmctld
SwitchType=switch/none
TaskPlugin=task/none
#
#
# TIMERS
#KillWait=30
#MinJobAge=300
#SlurmctldTimeout=120
#SlurmdTimeout=300
#
#
# SCHEDULING
SchedulerType=sched/backfill
SelectType=select/cons_tres
SelectTypeParameters=CR_Core
#
# LOGGING AND ACCOUNTING
AccountingStorageType=accounting_storage/none
ClusterName=scluster
#JobAcctGatherFrequency=30
JobAcctGatherType=jobacct_gather/none
#SlurmctldDebug=3
SlurmctldLogFile=/var/log/slurmctld.log
#SlurmdDebug=3
SlurmdLogFile=/var/log/slurmd.log
#
#
# COMPUTE NODES
NodeName=smaster NodeAddr=192.168.1.195 CPUs=2 RealMemory=1024 State=UNKNOWN
NodeName=sndode NodeAddr=192.168.1.196 CPUs=2 RealMemory=1024 State=UNKNOWN
#PartitionName=debug Nodes=sndode Default=YES MaxTime=INFINITE State=UP
PartitionName=debug Nodes=sndode Default=YES MaxTime=INFINITE State=UP
PartitionName=hpc Nodes=smaster Default=YES MaxTime=INFINITE State=UP
*On Master Node (smaster):*
[root@smaster ~]# sinfo -Nl
Tue Feb 02 18:11:00 2021
NODELIST NODES PARTITION STATE CPUS S:C:T MEMORY TMP_DISK WEIGHT
AVAIL_FE REASON
smaster 1 hpc* idle 2 2:1:1 1024 0 1
(null) none
sndode 1 debug unknown* 2 2:1:1 1024 0 1
(null) NO NETWORK ADDRESS F
[root@smaster ~]# scontrol show nodes
NodeName=smaster Arch=x86_64 CoresPerSocket=1
CPUAlloc=0 CPUTot=2 CPULoad=0.01
AvailableFeatures=(null)
ActiveFeatures=(null)
Gres=(null)
NodeAddr=192.168.1.195 NodeHostName=smaster Version=20.11.2
OS=Linux 3.10.0-1160.11.1.el7.x86_64 #1 SMP Fri Dec 18 16:34:56 UTC 2020
RealMemory=1024 AllocMem=0 FreeMem=4500 Sockets=2 Boards=1
State=IDLE ThreadsPerCore=1 TmpDisk=0 Weight=1 Owner=N/A MCS_label=N/A
Partitions=hpc
BootTime=2021-02-02T10:53:56 SlurmdStartTime=2021-02-02T13:21:10
CfgTRES=cpu=2,mem=1G,billing=2
AllocTRES=
CapWatts=n/a
CurrentWatts=0 AveWatts=0
ExtSensorsJoules=n/s ExtSensorsWatts=0 ExtSensorsTemp=n/s
Comment=(null)
NodeName=sndode CoresPerSocket=1
CPUAlloc=0 CPUTot=2 CPULoad=N/A
AvailableFeatures=(null)
ActiveFeatures=(null)
Gres=(null)
NodeAddr=192.168.1.196 NodeHostName=sndode
RealMemory=1024 AllocMem=0 FreeMem=N/A Sockets=2 Boards=1
State=UNKNOWN* ThreadsPerCore=1 TmpDisk=0 Weight=1 Owner=N/A
MCS_label=N/A
Partitions=debug
BootTime=None SlurmdStartTime=None
CfgTRES=cpu=2,mem=1G,billing=2
AllocTRES=
CapWatts=n/a
CurrentWatts=0 AveWatts=0
ExtSensorsJoules=n/s ExtSensorsWatts=0 ExtSensorsTemp=n/s
Reason=NO NETWORK ADDRESS FOUND [slurm@2021-02-02T10:58:11]
Comment=(null)
[root@smaster ~]#
*On Compute Node (snode):*
[root@snode ~]# for i in munge slurmd; do service $i status; done
Redirecting to /bin/systemctl status munge.service
● munge.service - MUNGE authentication service
Loaded: loaded (/usr/lib/systemd/system/munge.service; enabled; vendor
preset: disabled)
Active: active (running) since Tue 2021-02-02 13:29:11 IST; 4h 43min ago
Docs: man:munged(8)
Process: 17759 ExecStart=/usr/sbin/munged (code=exited, status=0/SUCCESS)
Main PID: 17761 (munged)
Tasks: 4
Memory: 600.0K
CGroup: /system.slice/munge.service
└─17761 /usr/sbin/munged
Feb 02 13:29:11 snode.calligotech.com systemd[1]: Starting MUNGE
authentication service...
Feb 02 13:29:11 snode.calligotech.com systemd[1]: Started MUNGE
authentication service.
Redirecting to /bin/systemctl status slurmd.service
● slurmd.service - Slurm node daemon
Loaded: loaded (/usr/lib/systemd/system/slurmd.service; enabled; vendor
preset: disabled)
Active: failed (Result: exit-code) since Tue 2021-02-02 13:29:12 IST; 4h
43min ago
Process: 17785 ExecStart=/usr/sbin/slurmd -D $SLURMD_OPTIONS
(code=exited, status=1/FAILURE)
Main PID: 17785 (code=exited, status=1/FAILURE)
Feb 02 13:29:11 snode.calligotech.com systemd[1]: Started Slurm node daemon.
Feb 02 13:29:12 snode.calligotech.com systemd[1]: slurmd.service: main
process exited, code=exited, status=1/FAILURE
Feb 02 13:29:12 snode.calligotech.com systemd[1]: Unit slurmd.service
entered failed state.
Feb 02 13:29:12 snode.calligotech.com systemd[1]: slurmd.service failed.
[root@snode ~]# sinfo -Nl
Tue Feb 02 18:12:47 2021
NODELIST NODES PARTITION STATE CPUS S:C:T MEMORY TMP_DISK WEIGHT
AVAIL_FE REASON
smaster 1 hpc* idle 2 2:1:1 1024 0 1
(null) none
sndode 1 debug unknown* 2 2:1:1 1024 0 1
(null) NO NETWORK ADDRESS F
[root@snode ~]# scontrol show nodes
NodeName=smaster Arch=x86_64 CoresPerSocket=1
CPUAlloc=0 CPUTot=2 CPULoad=0.01
AvailableFeatures=(null)
ActiveFeatures=(null)
Gres=(null)
NodeAddr=192.168.1.195 NodeHostName=smaster Version=20.11.2
OS=Linux 3.10.0-1160.11.1.el7.x86_64 #1 SMP Fri Dec 18 16:34:56 UTC 2020
RealMemory=1024 AllocMem=0 FreeMem=4502 Sockets=2 Boards=1
State=IDLE ThreadsPerCore=1 TmpDisk=0 Weight=1 Owner=N/A MCS_label=N/A
Partitions=hpc
BootTime=2021-02-02T10:53:56 SlurmdStartTime=2021-02-02T13:21:10
CfgTRES=cpu=2,mem=1G,billing=2
AllocTRES=
CapWatts=n/a
CurrentWatts=0 AveWatts=0
ExtSensorsJoules=n/s ExtSensorsWatts=0 ExtSensorsTemp=n/s
Comment=(null)
NodeName=sndode CoresPerSocket=1
CPUAlloc=0 CPUTot=2 CPULoad=N/A
AvailableFeatures=(null)
ActiveFeatures=(null)
Gres=(null)
NodeAddr=192.168.1.196 NodeHostName=sndode
RealMemory=1024 AllocMem=0 FreeMem=N/A Sockets=2 Boards=1
State=UNKNOWN* ThreadsPerCore=1 TmpDisk=0 Weight=1 Owner=N/A
MCS_label=N/A
Partitions=debug
BootTime=None SlurmdStartTime=None
CfgTRES=cpu=2,mem=1G,billing=2
AllocTRES=
CapWatts=n/a
CurrentWatts=0 AveWatts=0
ExtSensorsJoules=n/s ExtSensorsWatts=0 ExtSensorsTemp=n/s
Reason=NO NETWORK ADDRESS FOUND [slurm@2021-02-02T10:58:11]
Comment=(null)
[root@snode ~]# sinfo
PARTITION AVAIL TIMELIMIT NODES STATE NODELIST
debug up 1:00:00 1 unk* sndode
hpc* up infinite 1 idle smaster
[root@snode ~]#
Please help me to resolve this issue.
Regards,
Zain
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://lists.schedmd.com/pipermail/slurm-users/attachments/20210202/853548d5/attachment.htm>
More information about the slurm-users
mailing list