[slurm-users] slurmd: error: Error binding slurm stream socket: Address already in use

Alseny Diallo alsdia at yahoo.com
Wed Jan 9 23:45:43 UTC 2019


Hello, 

I am trying to install Slurm on a small test cluster. Just after installation the nodes were up and running, but after rebooting the machines the following error appears:

  slurmd: debug:  switch NONE plugin loaded
  slurmd: error: Error binding slurm stream socket: Address already in use
  slurmd: error: Unable to bind listen port (192.168.70.213:8018): Address already in use
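
If it helps, I can also check what is already holding port 8018 on a compute node, e.g. with something like this (assuming ss and pgrep are available):

  ss -tlnp | grep 8018
  pgrep -a slurmd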

Below are the details of my installation history and configuration files. If you have time to take a look and give me an idea of how to solve the problem, please let me know.

Thank you.
Kind regards,
Alseny
-------------------------------------------------------------------------------
# INSTALLATION HISTORY DETAILS

# mini cluster test set up: 
# master node : hostname -> toklap124, IP 192.168.70.214
# compute node 1 : hostname -> tokwor112, IP 192.168.70.212
# compute node 2 : hostname -> toklap120, IP 192.168.70.213


# All the following commands have been done as root in each of the 3 machines

############################################################### AT MASTER NODE 

# checking IP and hostname 
ip route get 8.8.8.8 | awk '{print $NF; exit}'
    # 192.168.70.214
# checking that the hostname is correct
vi /etc/hostname
    # toklap124
vi /etc/hosts
    #    127.0.0.1   localhost localhost.localdomain localhost4 localhost4.localdomain4
    #    ::1         localhost localhost.localdomain localhost6 localhost6.localdomain6
    #    192.168.70.214 toklap124

############################################################### AT COMPUTE NODE 1

# checking IP and hostname 
ip route get 8.8.8.8 | awk '{print $NF; exit}'
    # 192.168.70.212
vi /etc/hostname
    # tokwor112    
vi /etc/hosts
    # 127.0.0.1 localhost.localdomain localhost4 localhost4.localdomain4
    # ::1 localhost localhost.localdomain localhost6 localhost6.localdomain6
    # 192.168.70.212 tokwor112

############################################################### AT COMPUTE NODE 2

# checking IP and hostname 
ip route get 8.8.8.8 | awk '{print $NF; exit}'
    # 192.168.70.213
vi /etc/hostname
    # toklap120
vi /etc/hosts
    # 127.0.0.1 localhost.localdomain localhost4 localhost4.localdomain4
    # ::1 localhost localhost.localdomain localhost6 localhost6.localdomain6
    # 192.168.70.213 toklap120


-------------------------------------------------------------- REMOVE PREVIOUS INSTALLATION

########################################################## ON ALL NODES 
yum remove mariadb-server mariadb-devel -y
yum remove slurm munge munge-libs munge-devel -y
userdel -r slurm
userdel -r munge

# install MariaDB
yum install mariadb-server mariadb-devel -y

cd /
# create the new user group "munge" (-g option is used to assign a numerical group ID)
export MUNGEUSER=1127
groupadd -g $MUNGEUSER munge
useradd -m -c "MUNGE Uid 'N' Gid Emporium" -d /var/lib/munge -u $MUNGEUSER -g munge -s /sbin/nologin munge

# create the new user group "slurm"
export SLURMUSER=1128
groupadd -g $SLURMUSER slurm
useradd -m -c "SLURM workload manager" -d /var/lib/slurm -u $SLURMUSER -g slurm -s /bin/bash slurm

# check
grep '1127' /etc/passwd
#OUTPUT  -> munge:x:1127:1127:MUNGE Uid 'N' Gid Emporium:/var/lib/munge:/sbin/nologin

grep '1128' /etc/passwd
#OUTPUT  -> slurm:x:1128:1128:SLURM workload manager:/var/lib/slurm:/bin/bash
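
# optional check (my addition, run once from the master; root ssh to the compute
# nodes is assumed to work): confirm munge/slurm got the same UID/GID on all three machines
getent passwd munge slurm
ssh root@192.168.70.212 getent passwd munge slurm
ssh root@192.168.70.213 getent passwd munge slurm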


############################################################# BACK TO MASTER NODE

# at master node 
yum install epel-release -y
yum install munge munge-libs munge-devel -y
yum install rng-tools -y
/usr/sbin/create-munge-key -r  # overwrite key? yes
dd if=/dev/urandom bs=1 count=1024 > /etc/munge/munge.key


############################################################# AT COMPUTE NODES 

# at compute nodes 
yum install epel-release -y
yum install munge munge-libs munge-devel -y
rm -rf /etc/munge/munge.key

############################################################# AT MASTER NODE 
# at master node 
# sending the key to each of the compute nodes 
# compute node 1 tokwor112: 
scp /etc/munge/munge.key root@192.168.70.212:/etc/munge
# compute node 2 toklap120:
scp /etc/munge/munge.key root@192.168.70.213:/etc/munge
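
# optional check (my addition): the key must be identical on every node, so compare
# checksums before fixing permissions
md5sum /etc/munge/munge.key
ssh root@192.168.70.212 md5sum /etc/munge/munge.key
ssh root@192.168.70.213 md5sum /etc/munge/munge.key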

# starting munge services 
chown munge: /etc/munge/munge.key
chmod 400 /etc/munge/munge.key
chown -R munge: /etc/munge/ /var/log/munge/
chmod 0700 /etc/munge/ /var/log/munge/
systemctl enable munge
systemctl start munge

############################################################## AT COMPUTE NODES 

chown munge: /etc/munge/munge.key
chmod 400 /etc/munge/munge.key
chown -R munge: /etc/munge/ /var/log/munge/
chmod 0700 /etc/munge/ /var/log/munge/
systemctl enable munge
systemctl start munge

############################################################## AT MASTER 
# testing munge 
[root@toklap124 /]# munge -n | unmunge
STATUS:           Success (0)
ENCODE_HOST:      toklap124 (192.168.70.214)
ENCODE_TIME:      2018-12-27 14:44:19 +0900 (1545889459)
DECODE_TIME:      2018-12-27 14:44:19 +0900 (1545889459)



[root@toklap124 /]# munge -n | ssh 192.168.70.212 unmunge
root@192.168.70.212's password: 
STATUS:           Success (0)
ENCODE_HOST:      ??? (192.168.70.214)
ENCODE_TIME:      2018-12-27 14:44:52 +0900 (1545889492)
DECODE_TIME:      2018-12-27 14:44:57 +0900 (1545889497)


[root@toklap124 /]# munge -n | ssh 192.168.70.213 unmunge
root@192.168.70.213's password: 
STATUS:           Success (0)
ENCODE_HOST:      ??? (192.168.70.214)
ENCODE_TIME:      2018-12-27 14:46:08 +0900 (1545889568)
DECODE_TIME:      2018-12-27 14:46:13 +0900 (1545889573)

# OK, it is working, but are those ENCODE_HOST values right?
# "???" does not look quite right; anyway, let's keep going.
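
# my guess: unmunge seems to print "???" when the decoding host cannot resolve the
# encoder's IP to a name, and the compute nodes' /etc/hosts above only contain their
# own entry; adding the master there would probably fix it, e.g. on each compute node:
echo "192.168.70.214 toklap124" >> /etc/hosts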

# at compute node 192.168.70.212:
[root@tokwor112 /]# munge -n | unmunge
STATUS:           Success (0)
ENCODE_HOST:      tokwor112 (192.168.70.212)
ENCODE_TIME:      2018-12-27 14:48:40 +0900 (1545889720)
DECODE_TIME:      2018-12-27 14:48:40 +0900 (1545889720)

# at compute node 192.168.70.213:
[root@toklap120 /]# munge -n | unmunge
STATUS:           Success (0)
ENCODE_HOST:      toklap120 (192.168.70.213)
ENCODE_TIME:      2018-12-27 14:49:39 +0900 (1545889779)
DECODE_TIME:      2018-12-27 14:49:39 +0900 (1545889779)


###################################################### SLURM INSTALLATION 

###################################################### IN EACH NODE (both master and compute)

yum install gcc gcc-c++ gcc-gfortran kernel-devel -y 
yum install openssl openssl-devel pam-devel numactl numactl-devel hwloc hwloc-devel lua lua-devel readline-devel rrdtool-devel ncurses-devel man2html libibmad libibumad cpanm* -y  
yum install wget gcc gcc-c++ hdf5 hdf5-devel -y   
yum install libcurl-devel json-c-devel lz4-devel libibmad-devel libssh2-devel glibc-devel glib2-devel gtk2-devel -y   
yum install rpmdevtools -y 
cd ~ 
 
######################## at MASTER NODE: 
rm -rf rpmbuild
rpmbuild -ta slurm-17.11.5.tar.bz2
libtool --finish /lib64/security
rm -rf ~/slurm_rpms/
mkdir ~/slurm_rpms
mv rpmbuild/RPMS/x86_64/slurm*.rpm ~/slurm_rpms

# sending the rpm to the compute nodes: 
scp -r ~/slurm_rpms root at 192.168.70.212:~/
scp -r ~/slurm_rpms root at 192.168.70.213:~/


########################################################## IN EACH NODE 
yum install ntp -y
yum install mailx -y  
yum install ~/slurm_rpms/*.rpm -y  
vi /etc/slurm/slurm.conf 
# and inside we copy the following : 

############################################################################ /etc/slurm/slurm.conf STARTS
# slurm.conf file generated by configurator easy.html. 
# Put this file on all nodes of your cluster. 
# See the slurm.conf man page for more information. 
# 
ControlMachine=toklap124 
ControlAddr=192.168.70.214 
#
# additional suggestions from https://wiki.fysik.dtu.dk/niflheim/Slurm_configuration#reboot-option
RebootProgram="/usr/sbin/reboot"
UnkillableStepTimeout=120
# end additional suggestions
# 
MailProg=/bin/mail 
MpiDefault=none 
#MpiParams=ports=#-# 
ProctrackType=proctrack/cgroup 
ReturnToService=1 
SlurmctldPidFile=/var/run/slurm/slurmctld.pid 
SlurmctldPort=8017 
SlurmdPidFile=/var/run/slurm/slurmd.pid 
SlurmdPort=8018 
SlurmdSpoolDir=/var/spool/slurm 
SlurmUser=slurm 
#SlurmdUser=root 
StateSaveLocation=/var/spool/slurm 
SwitchType=switch/none 
TaskPlugin=task/affinity 
#
# 
# TIMERS 
#KillWait=30 
#MinJobAge=300 
#SlurmctldTimeout=120 
#SlurmdTimeout=300 
# 
# 
# SCHEDULING 
FastSchedule=1 
SchedulerType=sched/backfill 
SelectType=select/cons_res 
SelectTypeParameters=CR_Core 
# 
# 
# LOGGING AND ACCOUNTING 
AccountingStorageType=accounting_storage/none 
ClusterName=cluster 
#JobAcctGatherFrequency=30 
JobAcctGatherType=jobacct_gather/none 
#SlurmctldDebug=3 
SlurmctldLogFile=/var/log/slurmctld.log 
#SlurmdDebug=3 
SlurmdLogFile=/var/log/slurmd.log 
# 
# 
# COMPUTE NODES 
NodeName=tokwor112 NodeAddr=192.168.70.212 CPUs=8 Sockets=1 CoresPerSocket=4 ThreadsPerCore=2 State=UNKNOWN
NodeName=toklap120 NodeAddr=192.168.70.213 CPUs=4 Sockets=1 CoresPerSocket=2 ThreadsPerCore=2 State=UNKNOWN
PartitionName=production Nodes=ALL Default=YES MaxTime=INFINITE State=UP
#################################################################### /etc/slurm/slurm.conf   ENDS 
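
# sanity check (my addition): slurm.conf is supposed to be identical on every node
# (see the comment at the top of the file); a quick diff against the master's copy:
ssh root@192.168.70.212 cat /etc/slurm/slurm.conf | diff - /etc/slurm/slurm.conf && echo tokwor112 OK
ssh root@192.168.70.213 cat /etc/slurm/slurm.conf | diff - /etc/slurm/slurm.conf && echo toklap120 OK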

# we also set the cgroup.conf as follows in each of the nodes 

############################################################## /etc/slurm/cgroup.conf STARTS
CgroupAutomount=yes
CgroupReleaseAgentDir="/etc/slurm/cgroup"

ConstrainCores=no
ConstrainRAMSpace=yes
TaskAffinity=no
ConstrainSwapSpace=yes
AllowedSwapSpace=0
############################################################## /etc/slurm/cgroup.conf ENDS

########################################### both on cluster and on compute nodes 
mkdir /var/run/slurm
chown slurm: /var/run/slurm
chmod 755 /var/run/slurm
mkdir /var/spool/slurm
chown slurm: /var/spool/slurm
chmod 755 /var/spool/slurm
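
# note (my addition): on CentOS/RHEL 7 /var/run is a symlink to the tmpfs /run, so a
# directory created here by hand is gone after a reboot; a tmpfiles.d entry (file name
# below is my choice) would recreate it at boot:
cat > /etc/tmpfiles.d/slurm.conf << 'EOF'
d /var/run/slurm 0755 slurm slurm -
EOF
systemd-tmpfiles --create /etc/tmpfiles.d/slurm.conf
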
slurmd -C
  # OUTPUT @ master node NodeName=toklap124 CPUs=4 Boards=1 SocketsPerBoard=1 CoresPerSocket=2 ThreadsPerCore=2 RealMemory=7727 
  # OUTPUT @ compute node1 NodeName=tokwor112 CPUs=8 Boards=1 SocketsPerBoard=1 CoresPerSocket=4 ThreadsPerCore=2 RealMemory=15811
  # OUTPUT @ compute node2 NodeName=toklap120 CPUs=4 Boards=1 SocketsPerBoard=1 CoresPerSocket=2 ThreadsPerCore=2 RealMemory=7728

sed -i -e 's/PIDFile=.*/PIDFile=\/var\/run\/slurm\/slurmctld.pid/g' /usr/lib/systemd/system/slurmctld.service
sed -i -e 's/PIDFile=.*/PIDFile=\/var\/run\/slurm\/slurmd.pid/g' /usr/lib/systemd/system/slurmd.service
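
# note (my addition): edits under /usr/lib/systemd/system need a "systemctl daemon-reload"
# to take effect and are overwritten by package updates; a drop-in override avoids both,
# e.g. for slurmd (and similarly for slurmctld):
mkdir -p /etc/systemd/system/slurmd.service.d
cat > /etc/systemd/system/slurmd.service.d/pidfile.conf << 'EOF'
[Service]
PIDFile=/var/run/slurm/slurmd.pid
EOF
systemctl daemon-reload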


# start services ##############################  master node only 
systemctl enable slurmctld
systemctl start slurmctld
systemctl status slurmctld.service
# OK now master node is up and running
        # OUTPUT
        #● slurmctld.service - Slurm controller daemon
        #   Loaded: loaded (/usr/lib/systemd/system/slurmctld.service; enabled; vendor preset: disabled)
        #   Active: active (running) since Tue 2019-01-08 14:04:45 JST; 491ms ago
        #  Process: 30750 ExecStart=/usr/sbin/slurmctld $SLURMCTLD_OPTIONS (code=exited, status=0/SUCCESS)
        # Main PID: 30753 (slurmctld)
        #    Tasks: 7
        #   CGroup: /system.slice/slurmctld.service
        #           └─30753 /usr/sbin/slurmctld
        # Jan 08 14:04:45 toklap124 systemd[1]: Starting Slurm controller daemon...
        # Jan 08 14:04:45 toklap124 systemd[1]: Started Slurm controller daemon.


# start services ############################## compute nodes only
systemctl enable slurmd.service
systemctl start slurmd.service
systemctl status slurmd.service

      # OUTPUT 

      # COMPUTE NODE 1
      # ● slurmd.service - Slurm node daemon
      #   Loaded: loaded (/usr/lib/systemd/system/slurmd.service; enabled; vendor preset: disabled)
      #   Active: active (running) since Tue 2019-01-08 14:05:09 JST; 453ms ago
      #  Process: 22335 ExecStart=/usr/sbin/slurmd $SLURMD_OPTIONS (code=exited, status=0/SUCCESS)
      # Main PID: 22340 (slurmd)
      #    Tasks: 2
      #   CGroup: /system.slice/slurmd.service
      #           ├─ 4960 /usr/sbin/slurmd
      #           └─22340 /usr/sbin/slurmd
      #
      # Jan 08 14:05:09 tokwor112 systemd[1]: Starting Slurm node daemon...
      # Jan 08 14:05:09 tokwor112 systemd[1]: PID file /var/run/slurm/slurmd.pid not readable (yet?) after start.
      # Jan 08 14:05:09 tokwor112 systemd[1]: Started Slurm node daemon.

      # COMPUTE NODE 2
      # ● slurmd.service - Slurm node daemon
      #   Loaded: loaded (/usr/lib/systemd/system/slurmd.service; enabled; vendor preset: disabled)
      #   Active: active (running) since Tue 2019-01-08 14:05:17 JST; 541ms ago
      #  Process: 7873 ExecStart=/usr/sbin/slurmd $SLURMD_OPTIONS (code=exited, status=0/SUCCESS)
      # Main PID: 7878 (slurmd)
      #    Tasks: 3
      #   CGroup: /system.slice/slurmd.service
      #           ├─1236 /usr/sbin/slurmd
      #           └─7878 /usr/sbin/slurmd
      #
      # Jan 08 14:05:17 toklap120 systemd[1]: Starting Slurm node daemon...
      # Jan 08 14:05:17 toklap120 systemd[1]: Started Slurm node daemon.


############################################# AT NODE 1
# now compute node 1 is up and running
# you can launch sinfo successfully 
sinfo
  # OUTPUT
  # PARTITION   AVAIL  TIMELIMIT  NODES  STATE NODELIST
  # production*    up   infinite      2   idle toklap120,tokwor112


############################################# AT NODE 2
sinfo
  # OUTPUT
  # PARTITION   AVAIL  TIMELIMIT  NODES  STATE NODELIST
  # production*    up   infinite      2   idle toklap120,tokwor112


################################################################### COMPUTE NODE 2 job test 
[root@toklap120 ~]# srun --ntasks=1 --label /bin/hostname && pwd && whoami
# OUTPUT 
    0: toklap120
    /root/testSlurm
    root


  cd ~
  mkdir testSlurm
  cd testSlurm/
  vi job.slurm
# and inside we copy:

#!/bin/bash
#SBATCH -J pbe_delta       # Job name
#SBATCH -o pbe_delta.o%j   # Name of stdout output file(%j expands to jobId)
#SBATCH -e pbe_delta.o%j   # Name of stderr output file(%j expands to jobId)
#SBATCH -N 1                # Total number of nodes requested (16 cores/node)
#SBATCH -n 1
#SBATCH -t 48:00:00         # Run time (hh:mm:ss) 
date > output.out
pwd >> output.out
hostname >> output.out
ls -lah


# launching the job 
sbatch job.slurm

# getting the message  
# Submitted batch job 17

# inside the directory two new files are present:
  output.out  pbe_delta.o17
# output.out contains the date, pwd, and hostname, as in the submitted job:
  Tue  8 Jan 14:22:57 JST 2019
  /root/testSlurm
  toklap120


############################## >> REBOOT MACHINE TESTING 

# Rebooting each node  
reboot now 

# AFTER RESTART COMPLETES 
################################################## IN EACH COMPUTE NODE
# disabling firewall on the compute nodes
systemctl stop firewalld
systemctl disable firewalld  # Ok no errors
# synchronizing clocks
chkconfig ntpd on
ntpdate pool.ntp.org
systemctl start ntpd  # the clock should now be synchronized
systemctl enable munge
systemctl start munge
systemctl enable slurmd
systemctl stop slurmd
systemctl start slurmd
systemctl status slurmd
slurmd -D -vvv
  # OUTPUT 
  # from NODE 1 
  slurmd: debug:  switch NONE plugin loaded
  slurmd: error: Error binding slurm stream socket: Address already in use
  slurmd: error: Unable to bind listen port (192.168.70.212:8018): Address already in use

  # from NODE 2
  slurmd: debug:  switch NONE plugin loaded
  slurmd: error: Error binding slurm stream socket: Address already in use
  slurmd: error: Unable to bind listen port (192.168.70.213:8018): Address already in use


# slurmd is not working anymore after the reboot
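
# my current guess: slurmd.service was already enabled and started by systemd above, so
# the manual "slurmd -D -vvv" finds port 8018 taken by that instance (the earlier
# systemctl status output even shows two slurmd processes in the cgroup). To confirm:
systemctl is-active slurmd
pgrep -a slurmd
ss -tlnp | grep 8018
# and to retest in the foreground without the conflict:
systemctl stop slurmd
slurmd -D -vvv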
