[slurm-users] unable to Hold and release the job using scontrol

Zainul Abiddin zainul1114 at gmail.com
Sun May 23 06:05:54 UTC 2021


Hi All,
I am trying to hold a job using scontrol, but I am not able to hold it.
I do not understand why. Can anyone please explain the concepts of Hold
and Release, and Suspend and Resume?

Please find below the steps that I have tried.

[root at master ~]# cat test.sh
#!/bin/bash

#SBATCH -N 1
#SBATCH -n 1
#SBATCH -p hpc
#SBATCH -t 01:00:00
#SBATCH -J testjob
#SBATCH -o testjob.o%j
#SBATCH -e testjob.e%j

cd $SLURM_SUBMIT_DIR
/bin/hostname
date
sleep 120

[root at master ~]# sbatch test.sh
Submitted batch job 28
[root at master ~]# sbatch test.sh
Submitted batch job 29
[root at master ~]# sbatch test.sh
Submitted batch job 30
[root at master ~]# squeue
             JOBID PARTITION     NAME     USER ST       TIME  NODES
NODELIST(REASON)
                30       hpc  testjob     root PD       0:00      1
(Resources)
                28       hpc  testjob     root  R       0:06      1 master
                29       hpc  testjob     root  R       0:05      1 master
[root at master ~]# sinfo -Nl
Sun May 23 11:16:55 2021
NODELIST   NODES PARTITION       STATE CPUS    S:C:T MEMORY TMP_DISK WEIGHT
AVAIL_FE REASON
master         1      hpc*   allocated 2       2:1:1   1024        0      1
  (null) none
[root at master ~]# squeue
             JOBID PARTITION     NAME     USER ST       TIME  NODES
NODELIST(REASON)
                30       hpc  testjob     root PD       0:00      1
(Resources)
                28       hpc  testjob     root  R       0:39      1 master
                29       hpc  testjob     root  R       0:38      1 master
[root at master ~]# scontrol hold 28
[root at master ~]# squeue
             JOBID PARTITION     NAME     USER ST       TIME  NODES
NODELIST(REASON)
                30       hpc  testjob     root PD       0:00      1
(Resources)
                29       hpc  testjob     root  R       1:04      1 master
                28       hpc  testjob     root  R       1:05      1 master
[root at master ~]# scontrol hold 28
[root at master ~]# scontrol hold 28
[root at master ~]# squeue
             JOBID PARTITION     NAME     USER ST       TIME  NODES
NODELIST(REASON)
                30       hpc  testjob     root PD       0:00      1
(Resources)
                29       hpc  testjob     root  R       1:14      1 master
                28       hpc  testjob     root  R       1:15      1 master
[root at master ~]# scontrol suspend 28
[root at master ~]# squeue
             JOBID PARTITION     NAME     USER ST       TIME  NODES
NODELIST(REASON)
                29       hpc  testjob     root  R       1:38      1 master
                30       hpc  testjob     root  R       0:01      1 master
                28       hpc  testjob     root  S       1:37      1 master
[root at master ~]# squeue
             JOBID PARTITION     NAME     USER ST       TIME  NODES
NODELIST(REASON)
                29       hpc  testjob     root  R       1:59      1 master
                30       hpc  testjob     root  R       0:22      1 master
                28       hpc  testjob     root  S       1:37      1 master
[root at master ~]# squeue
             JOBID PARTITION     NAME     USER ST       TIME  NODES
NODELIST(REASON)
                30       hpc  testjob     root  R       0:41      1 master
                28       hpc  testjob     root  S       1:37      1 master
[root at master ~]# squeue
             JOBID PARTITION     NAME     USER ST       TIME  NODES
NODELIST(REASON)
                30       hpc  testjob     root  R       0:55      1 master
                28       hpc  testjob     root  S       1:37      1 master
[root at master ~]# scontrol release 28
[root at master ~]# squeue
             JOBID PARTITION     NAME     USER ST       TIME  NODES
NODELIST(REASON)
                30       hpc  testjob     root  R       1:20      1 master
                28       hpc  testjob     root  S       1:37      1 master
[root at master ~]# squeue
             JOBID PARTITION     NAME     USER ST       TIME  NODES
NODELIST(REASON)
                30       hpc  testjob     root  R       1:22      1 master
                28       hpc  testjob     root  S       1:37      1 master
[root at master ~]# squeue
             JOBID PARTITION     NAME     USER ST       TIME  NODES
NODELIST(REASON)
                30       hpc  testjob     root  R       1:23      1 master
                28       hpc  testjob     root  S       1:37      1 master
[root at master ~]# squeue
             JOBID PARTITION     NAME     USER ST       TIME  NODES
NODELIST(REASON)
                30       hpc  testjob     root  R       1:25      1 master
                28       hpc  testjob     root  S       1:37      1 master
[root at master ~]# scontrol resume 28
[root at master ~]# squeue
             JOBID PARTITION     NAME     USER ST       TIME  NODES
NODELIST(REASON)
                30       hpc  testjob     root  R       1:40      1 master
[root at master ~]# squeue
             JOBID PARTITION     NAME     USER ST       TIME  NODES
NODELIST(REASON)
                30       hpc  testjob     root  R       2:00      1 master
[root at master ~]# squeue
             JOBID PARTITION     NAME     USER ST       TIME  NODES
NODELIST(REASON)
[root at master ~]#
-- 
*Regards*

*Zain*
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://lists.schedmd.com/pipermail/slurm-users/attachments/20210523/ff860b07/attachment.htm>


More information about the slurm-users mailing list