[slurm-users] unable to Hold and release the job using scontrol
Zainul Abiddin
zainul1114 at gmail.com
Sun May 23 06:05:54 UTC 2021
Hi All,
I am trying to hold a job with scontrol, but I am not able to hold it.
I do not understand why. Can anyone please explain the concepts of Hold
and Release, and Suspend and Resume?
Please find below the steps which I have tried.
[root at master ~]# cat test.sh
#!/bin/bash
#SBATCH -N 1
#SBATCH -n 1
#SBATCH -p hpc
#SBATCH -t 01:00:00
#SBATCH -J testjob
#SBATCH -o testjob.o%j
#SBATCH -e testjob.e%j
cd $SLURM_SUBMIT_DIR
/bin/hostname
date
sleep 120
[root at master ~]# sbatch test.sh
Submitted batch job 28
[root at master ~]# sbatch test.sh
Submitted batch job 29
[root at master ~]# sbatch test.sh
Submitted batch job 30
[root at master ~]# squeue
JOBID PARTITION NAME USER ST TIME NODES
NODELIST(REASON)
30 hpc testjob root PD 0:00 1
(Resources)
28 hpc testjob root R 0:06 1 master
29 hpc testjob root R 0:05 1 master
[root at master ~]# sinfo -Nl
Sun May 23 11:16:55 2021
NODELIST NODES PARTITION STATE CPUS S:C:T MEMORY TMP_DISK WEIGHT
AVAIL_FE REASON
master 1 hpc* allocated 2 2:1:1 1024 0 1
(null) none
[root at master ~]# squeue
JOBID PARTITION NAME USER ST TIME NODES
NODELIST(REASON)
30 hpc testjob root PD 0:00 1
(Resources)
28 hpc testjob root R 0:39 1 master
29 hpc testjob root R 0:38 1 master
[root at master ~]# scontrol hold 28
[root at master ~]# squeue
JOBID PARTITION NAME USER ST TIME NODES
NODELIST(REASON)
30 hpc testjob root PD 0:00 1
(Resources)
29 hpc testjob root R 1:04 1 master
28 hpc testjob root R 1:05 1 master
[root at master ~]# scontrol hold 28
[root at master ~]# scontrol hold 28
[root at master ~]# squeue
JOBID PARTITION NAME USER ST TIME NODES
NODELIST(REASON)
30 hpc testjob root PD 0:00 1
(Resources)
29 hpc testjob root R 1:14 1 master
28 hpc testjob root R 1:15 1 master
[root at master ~]# scontrol suspend 28
[root at master ~]# squeue
JOBID PARTITION NAME USER ST TIME NODES
NODELIST(REASON)
29 hpc testjob root R 1:38 1 master
30 hpc testjob root R 0:01 1 master
28 hpc testjob root S 1:37 1 master
[root at master ~]# squeue
JOBID PARTITION NAME USER ST TIME NODES
NODELIST(REASON)
29 hpc testjob root R 1:59 1 master
30 hpc testjob root R 0:22 1 master
28 hpc testjob root S 1:37 1 master
[root at master ~]# squeue
JOBID PARTITION NAME USER ST TIME NODES
NODELIST(REASON)
30 hpc testjob root R 0:41 1 master
28 hpc testjob root S 1:37 1 master
[root at master ~]# squeue
JOBID PARTITION NAME USER ST TIME NODES
NODELIST(REASON)
30 hpc testjob root R 0:55 1 master
28 hpc testjob root S 1:37 1 master
[root at master ~]# scontrol release 28
[root at master ~]# squeue
JOBID PARTITION NAME USER ST TIME NODES
NODELIST(REASON)
30 hpc testjob root R 1:20 1 master
28 hpc testjob root S 1:37 1 master
[root at master ~]# squeue
JOBID PARTITION NAME USER ST TIME NODES
NODELIST(REASON)
30 hpc testjob root R 1:22 1 master
28 hpc testjob root S 1:37 1 master
[root at master ~]# squeue
JOBID PARTITION NAME USER ST TIME NODES
NODELIST(REASON)
30 hpc testjob root R 1:23 1 master
28 hpc testjob root S 1:37 1 master
[root at master ~]# squeue
JOBID PARTITION NAME USER ST TIME NODES
NODELIST(REASON)
30 hpc testjob root R 1:25 1 master
28 hpc testjob root S 1:37 1 master
[root at master ~]# scontrol resume 28
[root at master ~]# squeue
JOBID PARTITION NAME USER ST TIME NODES
NODELIST(REASON)
30 hpc testjob root R 1:40 1 master
[root at master ~]# squeue
JOBID PARTITION NAME USER ST TIME NODES
NODELIST(REASON)
30 hpc testjob root R 2:00 1 master
[root at master ~]# squeue
JOBID PARTITION NAME USER ST TIME NODES
NODELIST(REASON)
[root at master ~]#
--
*Regards*
*Zain*
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://lists.schedmd.com/pipermail/slurm-users/attachments/20210523/ff860b07/attachment.htm>
More information about the slurm-users
mailing list