[slurm-users] slurmd: error: Error binding slurm stream socket: Address already in use
Alseny Diallo
alsdia at yahoo.com
Wed Jan 9 23:45:43 UTC 2019
Hello,
I am trying to install Slurm on a small test cluster. Just after the installation the nodes were up and running, but after rebooting the machines the following error appears:
slurmd: debug: switch NONE plugin loaded
slurmd: error: Error binding slurm stream socket: Address already in use
slurmd: error: Unable to bind listen port (192.168.70.213:8018): Address already in use
Below are the details of my installation history and configuration files. If you have the time to take a look and give me an idea of how to solve the problem, please let me know.
Thank you.
Kind regards,
Alseny
-------------------------------------------------------------------------------
# INSTALLATION HISTORY DETAILS
# mini cluster test set up:
# master node : hostname -> toklap124, IP 192.168.70.214
# compute node 1 : hostname -> tokwor112, IP 192.168.70.212
# compute node 2 : hostname -> toklap120, IP 192.168.70.213
# All the following commands have been run as root on each of the 3 machines
############################################################### AT MASTER NODE
# checking IP and hostname
ip route get 8.8.8.8 | awk '{print $NF; exit}'
# 192.168.70.214
# checking that the hostname is correct
vi /etc/hostname
# toklap124
vi /etc/hosts
# 127.0.0.1 localhost localhost.localdomain localhost4 localhost4.localdomain4
# ::1 localhost localhost.localdomain localhost6 localhost6.localdomain6
# 192.168.70.214 toklap124
############################################################### AT COMPUTE NODE1
# checking IP and hostname
ip route get 8.8.8.8 | awk '{print $NF; exit}'
# 192.168.70.212
vi /etc/hostname
# tokwor112
vi /etc/hosts
# 127.0.0.1 localhost.localdomain localhost4 localhost4.localdomain4
# ::1 localhost localhost.localdomain localhost6 localhost6.localdomain6
# 192.168.70.212 tokwor112
############################################################### AT COMPUTE NODE2
# checking IP and hostname
ip route get 8.8.8.8 | awk '{print $NF; exit}'
# 192.168.70.213
vi /etc/hostname
# toklap120
vi /etc/hosts
# 127.0.0.1 localhost.localdomain localhost4 localhost4.localdomain4
# ::1 localhost localhost.localdomain localhost6 localhost6.localdomain6
# 192.168.70.213 toklap120
-------------------------------------------------------------- REMOVE PREVIOUS INSTALLATION
########################################################## ON ALL NODES
yum remove mariadb-server mariadb-devel -y
yum remove slurm munge munge-libs munge-devel -y
userdel -r slurm
userdel -r munge
# install maria
yum install mariadb-server mariadb-devel -y
cd /
# create the new user group "munge" (-g option is used to assign a numerical group ID)
export MUNGEUSER=1127
groupadd -g $MUNGEUSER munge
useradd -m -c "MUNGE Uid 'N' Gid Emporium" -d /var/lib/munge -u $MUNGEUSER -g munge -s /sbin/nologin munge
# create the new user group "slurm"
export SLURMUSER=1128
groupadd -g $SLURMUSER slurm
useradd -m -c "SLURM workload manager" -d /var/lib/slurm -u $SLURMUSER -g slurm -s /bin/bash slurm
# check
grep '1127' /etc/passwd
#OUTPUT -> munge:x:1127:1127:MUNGE Uid 'N' Gid Emporium:/var/lib/munge:/sbin/nologin
grep '1128' /etc/passwd
#OUTPUT -> slurm:x:1128:1128:SLURM workload manager:/var/lib/slurm:/bin/bash
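# (side check, not in the original history) it is usually recommended that the
# munge and slurm users have the same UID/GID on every node; a quick way to
# verify this on each machine:
id munge   # expected: uid=1127(munge) gid=1127(munge)
id slurm   # expected: uid=1128(slurm) gid=1128(slurm)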
############################################################# BACK TO MASTER NODE
# at master node
yum install epel-release -y
yum install munge munge-libs munge-devel -y
yum install rng-tools -y
/usr/sbin/create-munge-key -r # overwrite key? yes
dd if=/dev/urandom bs=1 count=1024 > /etc/munge/munge.key
############################################################# AT COMPUTE NODES
# at compute nodes
yum install epel-release -y
yum install munge munge-libs munge-devel -y
rm -rf /etc/munge/munge.key
############################################################# AT MASTER NODE
# at master node
# sending the key to each of the compute nodes
# compute node 1 tokwor112:
scp /etc/munge/munge.key root@192.168.70.212:/etc/munge
# compute node 2 toklap120:
scp /etc/munge/munge.key root@192.168.70.213:/etc/munge
# starting munge services
chown munge: /etc/munge/munge.key
chmod 400 /etc/munge/munge.key
chown -R munge: /etc/munge/ /var/log/munge/
chmod 0700 /etc/munge/ /var/log/munge/
systemctl enable munge
systemctl start munge
############################################################## AT COMPUTE NODES
chown munge: /etc/munge/munge.key
chmod 400 /etc/munge/munge.key
chown -R munge: /etc/munge/ /var/log/munge/
chmod 0700 /etc/munge/ /var/log/munge/
systemctl enable munge
systemctl start munge
############################################################## AT MASTER
# testing munge
[root@toklap124 /]# munge -n | unmunge
STATUS: Success (0)
ENCODE_HOST: toklap124 (192.168.70.214)
ENCODE_TIME: 2018-12-27 14:44:19 +0900 (1545889459)
DECODE_TIME: 2018-12-27 14:44:19 +0900 (1545889459)
[root@toklap124 /]# munge -n | ssh 192.168.70.212 unmunge
root@192.168.70.212's password:
STATUS: Success (0)
ENCODE_HOST: ??? (192.168.70.214)
ENCODE_TIME: 2018-12-27 14:44:52 +0900 (1545889492)
DECODE_TIME: 2018-12-27 14:44:57 +0900 (1545889497)
[root@toklap124 /]# munge -n | ssh 192.168.70.213 unmunge
root@192.168.70.213's password:
STATUS: Success (0)
ENCODE_HOST: ??? (192.168.70.214)
ENCODE_TIME: 2018-12-27 14:46:08 +0900 (1545889568)
DECODE_TIME: 2018-12-27 14:46:13 +0900 (1545889573)
# OK, it is working, but are those ENCODE_HOST values right?
# "???" does not look quite right; anyway, let's keep going.
# at compute node 192.168.70.212 :
[root@tokwor112 /]# munge -n | unmunge
STATUS: Success (0)
ENCODE_HOST: tokwor112 (192.168.70.212)
ENCODE_TIME: 2018-12-27 14:48:40 +0900 (1545889720)
DECODE_TIME: 2018-12-27 14:48:40 +0900 (1545889720)
[root@toklap120 /]# munge -n | unmunge
STATUS: Success (0)
ENCODE_HOST: toklap120 (192.168.70.213)
ENCODE_TIME: 2018-12-27 14:49:39 +0900 (1545889779)
DECODE_TIME: 2018-12-27 14:49:39 +0900 (1545889779)
###################################################### SLURM INSTALLATION
###################################################### IN EACH NODE (both master and compute)
yum install gcc gcc-c++ gcc-gfortran kernel-devel -y
yum install openssl openssl-devel pam-devel numactl numactl-devel hwloc hwloc-devel lua lua-devel readline-devel rrdtool-devel ncurses-devel man2html libibmad libibumad cpanm* -y
yum install wget gcc gcc-c++ hdf5 hdf5-devel -y
yum install libcurl-devel json-c-devel lz4-devel libibmad-devel libssh2-devel glibc-devel glib2-devel gtk2-devel -y
yum install rpmdevtools -y
cd ~
######################## at MASTER NODE:
rm -rf rpmbuild
rpmbuild -ta slurm-17.11.5.tar.bz2
libtool --finish /lib64/security
rm -rf ~/slurm_rpms/
mkdir ~/slurm_rpms
mv rpmbuild/RPMS/x86_64/slurm*.rpm ~/slurm_rpms
# sending the rpm to the compute nodes:
scp -r ~/slurm_rpms root at 192.168.70.212:~/
scp -r ~/slurm_rpms root at 192.168.70.213:~/
########################################################## IN EACH NODE
yum install ntp -y
yum install mailx -y
yum install ~/slurm_rpms/*.rpm -y
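# (optional check, not in the original history) confirming that the same slurm
# packages landed on every node:
rpm -qa | grep -i slurm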
vi /etc/slurm/slurm.conf
# and inside we copy the following:
############################################################################ slurm.conf start
# slurm.conf file generated by configurator easy.html.
# Put this file on all nodes of your cluster.
# See the slurm.conf man page for more information.
#
ControlMachine=toklap124
ControlAddr=192.168.70.214
#
# additional suggestions from https://wiki.fysik.dtu.dk/niflheim/Slurm_configuration#reboot-option
RebootProgram="/usr/sbin/reboot"
UnkillableStepTimeout=120
# end additional suggestions
#
MailProg=/bin/mail
MpiDefault=none
#MpiParams=ports=#-#
ProctrackType=proctrack/cgroup
ReturnToService=1
SlurmctldPidFile=/var/run/slurm/slurmctld.pid
SlurmctldPort=8017
SlurmdPidFile=/var/run/slurm/slurmd.pid
SlurmdPort=8018
SlurmdSpoolDir=/var/spool/slurm
SlurmUser=slurm
#SlurmdUser=root
StateSaveLocation=/var/spool/slurm
SwitchType=switch/none
TaskPlugin=task/affinity
#
#
# TIMERS
#KillWait=30
#MinJobAge=300
#SlurmctldTimeout=120
#SlurmdTimeout=300
#
#
# SCHEDULING
FastSchedule=1
SchedulerType=sched/backfill
SelectType=select/cons_res
SelectTypeParameters=CR_Core
#
#
# LOGGING AND ACCOUNTING
AccountingStorageType=accounting_storage/none
ClusterName=cluster
#JobAcctGatherFrequency=30
JobAcctGatherType=jobacct_gather/none
#SlurmctldDebug=3
SlurmctldLogFile=/var/log/slurmctld.log
#SlurmdDebug=3
SlurmdLogFile=/var/log/slurmd.log
#
#
# COMPUTE NODES
NodeName=tokwor112 NodeAddr=192.168.70.212 CPUs=8 Sockets=1 CoresPerSocket=4 ThreadsPerCore=2 State=UNKNOWN
NodeName=toklap120 NodeAddr=192.168.70.213 CPUs=4 Sockets=1 CoresPerSocket=2 ThreadsPerCore=2 State=UNKNOWN
PartitionName=production Nodes=ALL Default=YES MaxTime=INFINITE State=UP
#################################################################### /etc/slurm/slurm.conf ENDS
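# (quick check, assumed, not shown above) slurm.conf has to be identical on
# every node, so comparing checksums is an easy way to verify the copies match:
md5sum /etc/slurm/slurm.conf   # run on each node; the hash should be the same everywhere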
# we also set the cgroup.conf as follows in each of the nodes
############################################################## /etc/slurm/cgroup.conf STARTS
CgroupAutomount=yes
CgroupReleaseAgentDir="/etc/slurm/cgroup"
ConstrainCores=no
ConstrainRAMSpace=yes
TaskAffinity=no
ConstrainSwapSpace=yes
AllowedSwapSpace=0
############################################################## /etc/slurm/cgroup.conf ENDS
########################################### both on cluster and on compute nodes
mkdir /var/run/slurm
chown slurm: /var/run/slurm
chmod 755 /var/run/slurm
mkdir /var/spool/slurm
chown slurm: /var/spool/slurm
chmod 755 /var/spool/slurm
slurmd -C
# OUTPUT @ master node NodeName=toklap124 CPUs=4 Boards=1 SocketsPerBoard=1 CoresPerSocket=2 ThreadsPerCore=2 RealMemory=7727
# OUTPUT @ compute node1 NodeName=tokwor112 CPUs=8 Boards=1 SocketsPerBoard=1 CoresPerSocket=4 ThreadsPerCore=2 RealMemory=15811
# OUTPUT @ compute node2 NodeName=toklap120 CPUs=4 Boards=1 SocketsPerBoard=1 CoresPerSocket=2 ThreadsPerCore=2 RealMemory=7728
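# (side note) the values printed by "slurmd -C" are what the NodeName lines in
# slurm.conf should reflect; a quick cross-check:
grep -i '^NodeName' /etc/slurm/slurm.conf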
sed -i -e 's/PIDFile=.*/PIDFile=\/var\/run\/slurm\/slurmctld.pid/g' /usr/lib/systemd/system/slurmctld.service
sed -i -e 's/PIDFile=.*/PIDFile=\/var\/run\/slurm\/slurmd.pid/g' /usr/lib/systemd/system/slurmd.service
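# (assumed step, not shown in the history above) after editing the unit files,
# systemd has to re-read them before the new PIDFile setting takes effect:
systemctl daemon-reload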
# start services ############################## master node only
systemctl enable slurmctld
systemctl start slurmctld
systemctl status slurmctld.service
# OK now master node is up and running
# OUTPUT
#● slurmctld.service - Slurm controller daemon
# Loaded: loaded (/usr/lib/systemd/system/slurmctld.service; enabled; vendor preset: disabled)
# Active: active (running) since Tue 2019-01-08 14:04:45 JST; 491ms ago
# Process: 30750 ExecStart=/usr/sbin/slurmctld $SLURMCTLD_OPTIONS (code=exited, status=0/SUCCESS)
# Main PID: 30753 (slurmctld)
# Tasks: 7
# CGroup: /system.slice/slurmctld.service
# └─30753 /usr/sbin/slurmctld
# Jan 08 14:04:45 toklap124 systemd[1]: Starting Slurm controller daemon...
# Jan 08 14:04:45 toklap124 systemd[1]: Started Slurm controller daemon.
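# (optional check, not part of the original history) since a non-default
# SlurmctldPort=8017 is used, one can confirm the controller is listening on it:
ss -tlnp | grep 8017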
# start services ############################## compute nodes only
systemctl enable slurmd.service
systemctl start slurmd.service
systemctl status slurmd.service
# OUTPUT
# COMPUTE NODE 1
# ● slurmd.service - Slurm node daemon
# Loaded: loaded (/usr/lib/systemd/system/slurmd.service; enabled; vendor preset: disabled)
# Active: active (running) since Tue 2019-01-08 14:05:09 JST; 453ms ago
# Process: 22335 ExecStart=/usr/sbin/slurmd $SLURMD_OPTIONS (code=exited, status=0/SUCCESS)
# Main PID: 22340 (slurmd)
# Tasks: 2
# CGroup: /system.slice/slurmd.service
# ├─ 4960 /usr/sbin/slurmd
# └─22340 /usr/sbin/slurmd
#
# Jan 08 14:05:09 tokwor112 systemd[1]: Starting Slurm node daemon...
# Jan 08 14:05:09 tokwor112 systemd[1]: PID file /var/run/slurm/slurmd.pid not readable (yet?) after start.
# Jan 08 14:05:09 tokwor112 systemd[1]: Started Slurm node daemon.
# COMPUTE NODE 2
# ● slurmd.service - Slurm node daemon
# Loaded: loaded (/usr/lib/systemd/system/slurmd.service; enabled; vendor preset: disabled)
# Active: active (running) since Tue 2019-01-08 14:05:17 JST; 541ms ago
# Process: 7873 ExecStart=/usr/sbin/slurmd $SLURMD_OPTIONS (code=exited, status=0/SUCCESS)
# Main PID: 7878 (slurmd)
# Tasks: 3
# CGroup: /system.slice/slurmd.service
# ├─1236 /usr/sbin/slurmd
# └─7878 /usr/sbin/slurmd
#
# Jan 08 14:05:17 toklap120 systemd[1]: Starting Slurm node daemon...
# Jan 08 14:05:17 toklap120 systemd[1]: Started Slurm node daemon.
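# (optional check, not part of the original history) similarly, on each compute
# node one can confirm slurmd is listening on the configured SlurmdPort=8018:
ss -tlnp | grep 8018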
############################################# AT NODE 1
# now compute node 1 is up and running
# you can launch sinfo successfully
sinfo
# OUTPUT
# PARTITION AVAIL TIMELIMIT NODES STATE NODELIST
# production* up infinite 2 idle toklap120,tokwor112
############################################# AT NODE 2
sinfo
# OUTPUT
# PARTITION AVAIL TIMELIMIT NODES STATE NODELIST
# production* up infinite 2 idle toklap120,tokwor112
################################################################### COMPUTE NODE 2 job test
[root@toklap120 ~]# srun --ntasks=1 --label /bin/hostname && pwd && whoami
# OUTPUT
0: toklap120
/root/testSlurm
root
cd ~
mkdir testSlurm
cd testSlurm/
vi job.slurm
# and inside we copy:
#!/bin/bash
#SBATCH -J pbe_delta # Job name
#SBATCH -o pbe_delta.o%j # Name of stdout output file(%j expands to jobId)
#SBATCH -e pbe_delta.o%j # Name of stderr output file(%j expands to jobId)
#SBATCH -N 1 # Total number of nodes requested (16 cores/node)
#SBATCH -n 1
#SBATCH -t 48:00:00 # Run time (hh:mm:ss)
date> output.out
pwd >> output.out
hostname >> output.out
ls -lah
# launching the job
sbatch job.slurm
# getting the message
# Submitted batch job 17
# inside the directory 2 new files are present:
output.out pbe_delta.o17
# output.out contains the date, pwd and hostname as in the submitted job:
Tue 8 Jan 14:22:57 JST 2019
/root/testSlurm
toklap120
############################## >> REBOOT MACHINE TESTING
# Rebooting each node
reboot now
# AFTER RESTART COMPLETES
################################################## IN EACH COMPUTE NODE
# disabling firewall on the compute nodes
systemctl stop firewalld
systemctl disable firewalld # Ok no errors
# synchronizing clocks
chkconfig ntpd on
ntpdate pool.ntp.org
systemctl start ntpd # clock should now be synchronized
systemctl enable munge
systemctl start munge
systemctl enable slurmd
systemctl stop slurmd
systemctl start slurmd
systemctl status slurmd
slurmd -D -vvv
# OUTPUT
# from NODE 1
slurmd: debug: switch NONE plugin loaded
slurmd: error: Error binding slurm stream socket: Address already in use
slurmd: error: Unable to bind listen port (192.168.70.212:8018): Address already in use
# from NODE 2
slurmd: debug: switch NONE plugin loaded
slurmd: error: Error binding slurm stream socket: Address already in use
slurmd: error: Unable to bind listen port (192.168.70.213:8018): Address already in use
# not working anymore
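# (diagnostic sketch, output not included here) "Address already in use" means
# something is already bound to port 8018 on the node, so a next step could be
# to check what is holding it, e.g.:
pgrep -a slurmd            # is another slurmd already running (e.g. started by systemd)?
ss -tlnp | grep 8018       # which process owns the SlurmdPort?
systemctl status slurmd    # state of the systemd-managed slurmd unit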