[slurm-users] Fwd: Slurm/cgroups on a single head/compute node

David da Silva Pires david.pires at butantan.gov.br
Thu Aug 29 19:58:27 UTC 2019


Hi Alex.

Thank you very much for sending the cgroup-related settings of your cluster.

I implemented a solution for the problem, based on your advice and on the
instructions that I found at the following URL:

http://rolk.github.io/2015/04/20/slurm-cluster

Now, user processes started without sbatch, srun or salloc are confined to
the last 8 threads of the server (threads 217-224, i.e. cpuset IDs 216-223).
If one of those three commands is used, the jobs can be allocated on any of
the 224 threads.
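
As a quick sanity check, I confirm where a plain login shell ends up (the
paths below assume the usual cgroup v1 layout under /sys/fs/cgroup, so treat
this as a sketch):

  # which cgroup did this interactive shell land in?
  grep cpuset /proc/self/cgroup

  # which CPUs is the interactive group allowed to use? (should print 216-223)
  cat /sys/fs/cgroup/cpuset/interactive/cpuset.cpus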

This situation is almost perfect. I would still like to keep Slurm jobs off
those last 8 threads, so that jobs and regular processes do not share the
same threads, which can hurt the performance of both.
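
One idea I still have to test is to also reserve those threads on the Slurm
side with CpuSpecList on the node definition, something like the line below
(assuming Slurm's abstract CPU IDs match the cpuset numbering 216-223, which
I have not verified yet):

  NodeName=vital NodeAddr=172.25.2.25 CPUs=224 RealMemory=1031517 Sockets=4 CoresPerSocket=28 ThreadsPerCore=2 State=UNKNOWN Gres=gpu:1 MemSpecLimit=32768 CpuSpecList=216-223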

While I try to do this, here is my current configuration:


=============================================================================
/etc/slurm-llnl/slurm.conf
=============================================================================
ControlAddr=172.25.2.25
AuthType=auth/munge
CacheGroups=0
CryptoType=crypto/munge

GresTypes=gpu
MaxTasksPerNode=216

SlurmUser=slurm
SlurmctldPort=6817
SlurmdPort=6818
SlurmdSpoolDir=/var/lib/slurm-llnl/slurmd
StateSaveLocation=/var/lib/slurm-llnl/slurmctld
SlurmctldPidFile=/var/run/slurm-llnl/slurmctld.pid
SlurmdPidFile=/var/run/slurm-llnl/slurmd%n.pid
SwitchType=switch/none
ProctrackType=proctrack/cgroup
MpiDefault=none
RebootProgram=/sbin/reboot

ReturnToService=2

TaskPluginParam=sched
InactiveLimit=0
KillWait=30
MinJobAge=300
SlurmctldTimeout=120
SlurmdTimeout=1800
Waittime=0
SchedulerType=sched/backfill
PreemptMode=suspend,gang
PreemptType=preempt/partition_prio
DefMemPerNode=998749
FastSchedule=1

SelectType=select/cons_res
SelectTypeParameters=CR_Core_Memory,CR_CORE_DEFAULT_DIST_BLOCK
TaskPlugin=task/affinity,task/cgroup
PriorityType=priority/multifactor
PriorityDecayHalfLife=3-0
PriorityFavorSmall=YES
PriorityMaxAge=7-0
PriorityWeightAge=1000
PriorityWeightFairshare=0
PriorityWeightJobSize=125
PriorityWeightPartition=1000
PriorityWeightQOS=0
SlurmctldLogFile=/var/log/slurm-llnl/slurmctld.log
SlurmdLogFile=/var/log/slurm-llnl/slurmd.log

AccountingStorageType=accounting_storage/filetxt

CheckpointType=checkpoint/none

AccountingStorageHost=vital
AccountingStorageLoc=/var/log/slurm-llnl/accounting
AccountingStoragePass=/var/run/munge/munge.socket.2
AccountingStoragePort=6819
AccountingStorageUser=slurm
AccountingStoreJobComment=YES

ClusterName=bioinfo
ControlMachine=vital

JobCompHost=vital
JobCompLoc=/var/log/slurm-llnl/job_completions
JobCompPass=<xxxxxxxx>
JobCompPort=6819
JobCompType=jobcomp/filetxt
JobCompUser=slurm
JobAcctGatherFrequency=30
JobAcctGatherType=jobacct_gather/cgroup
SlurmctldDebug=verbose
SlurmdDebug=verbose
BurstBufferType=burst_buffer/generic
NodeName=vital NodeAddr=172.25.2.25 CPUs=224 RealMemory=1031517 Sockets=4 CoresPerSocket=28 ThreadsPerCore=2 State=UNKNOWN Gres=gpu:1 MemSpecLimit=32768

PartitionName=batch Nodes=vital Shared=FORCE:1 OverSubscribe=YES Default=YES MaxTime=INFINITE State=UP
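
After changing slurm.conf I restart slurmctld/slurmd and check that the node
layout was picked up (field names can vary a bit between Slurm versions):

  scontrol show node vital | grep -E 'CPUTot|RealMemory|MemSpecLimit'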


=============================================================================
/etc/slurm-llnl/cgroup.conf
=============================================================================
CgroupMountpoint="/sys/fs/cgroup"
CgroupAutomount=yes
CgroupReleaseAgentDir="/etc/slurm-llnl/cgroup"
ConstrainCores=yes
ConstrainRAMSpace=yes


=============================================================================
/etc/cgconfig.conf
=============================================================================
group interactive {
  cpu {
  }
  cpuset {
     cpuset.cpus = 216-223;
     cpuset.cpu_exclusive = 1;
     cpuset.mem_exclusive = 1;
     cpuset.mem_hardwall = 1;
     cpuset.memory_migrate = 0;
     cpuset.memory_spread_page = 0;
     cpuset.memory_spread_slab = 0;
     cpuset.mems = 0;
     cpuset.sched_load_balance = 0;
     cpuset.sched_relax_domain_level = -1;
  }
  memory {
     memory.limit_in_bytes = "8G";
     memory.memsw.limit_in_bytes = "8G";
  }
}
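
To (re)load this file without rebooting I use cgconfigparser from libcgroup
(package cgroup-tools on Debian/Ubuntu; other distributions ship a cgconfig
service instead):

  sudo cgconfigparser -l /etc/cgconfig.conf
  cat /sys/fs/cgroup/cpuset/interactive/cpuset.cpus   # should now print 216-223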


=============================================================================
/etc/cgrules.conf
=============================================================================
root  cpu,memory /
slurm cpu,memory /
* cpuset,memory /interactive
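
These rules only take effect while cgrulesengd (also from libcgroup,
sometimes packaged as the cgred service) is running. A quick test that a
fresh process gets classified into /interactive:

  sudo cgrulesengd
  bash -c 'grep cpuset /proc/self/cgroup'   # should show .../interactive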


The idea is to create a cgroup with a cpuset for Slurm, covering threads 1 to
216. Let's see if it works.
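
A first sketch of that group, mirroring the interactive one above (the
0-based cpuset IDs 0-215 correspond to threads 1-216; the cpuset.mems value
still has to be adjusted to the memory nodes actually present on this
machine):

group slurm {
  cpuset {
     cpuset.cpus = 0-215;
     cpuset.mems = 0;    # placeholder: set to all NUMA nodes of the box (see /sys/devices/system/node/)
  }
}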

Best.

--
David da Silva Pires