[slurm-users] Fwd: Slurm/cgroups on a single head/compute node
David da Silva Pires
david.pires at butantan.gov.br
Thu Aug 29 19:58:27 UTC 2019
Hi Alex.
Thank you very much for sending the cgroup-related settings of your cluster.
I now have a working solution to the problem. It is based on your advice and
on what I found at the following URL:
http://rolk.github.io/2015/04/20/slurm-cluster
Now, user processes started without sbatch, srun or salloc are confined to
the last 8 threads of the server (numbered 217-224). If one of those three
commands is used, the job can be allocated on any of the 224 threads.
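(As a quick way to double-check the confinement, assuming nothing else
overrides the affinity, something like the following shows the difference;
the CPU numbers are the kernel's 0-based ones:)

  grep Cpus_allowed_list /proc/self/status             # plain login shell: only 216-223
  srun -n1 grep Cpus_allowed_list /proc/self/status    # launched through Slurm: the job's allocated CPUs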
This is almost the ideal setup. I would still like to keep Slurm jobs off
those last 8 threads, so that jobs and ordinary processes never share the
same threads, which could hurt the performance of both (see the sketch below).
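One thing I still have to test (so take it as an untested sketch) is whether
Slurm's core specialization can do this on its own: the node definition
already uses MemSpecLimit, and adding CpuSpecList with the IDs of those 8
threads should, if I read the documentation correctly, reserve them for
system use while task/cgroup confinement is active:

NodeName=vital NodeAddr=172.25.2.25 CPUs=224 RealMemory=1031517 Sockets=4 CoresPerSocket=28 ThreadsPerCore=2 State=UNKNOWN Gres=gpu:1 MemSpecLimit=32768 CpuSpecList=216-223

Note that CpuSpecList takes Slurm's abstract CPU IDs, which do not
necessarily match the kernel's numbering, so 216-223 is only illustrative.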
While I try to do this, here is my current configuration:
=============================================================================
/etc/slurm-llnl/slurm.conf
=============================================================================
ControlAddr=172.25.2.25
AuthType=auth/munge
CacheGroups=0
CryptoType=crypto/munge
GresTypes=gpu
MaxTasksPerNode=216
SlurmUser=slurm
SlurmctldPort=6817
SlurmdPort=6818
SlurmdSpoolDir=/var/lib/slurm-llnl/slurmd
StateSaveLocation=/var/lib/slurm-llnl/slurmctld
SlurmctldPidFile=/var/run/slurm-llnl/slurmctld.pid
SlurmdPidFile=/var/run/slurm-llnl/slurmd%n.pid
SwitchType=switch/none
ProctrackType=proctrack/cgroup
MpiDefault=none
RebootProgram=/sbin/reboot
ReturnToService=2
TaskPluginParam=sched
InactiveLimit=0
KillWait=30
MinJobAge=300
SlurmctldTimeout=120
SlurmdTimeout=1800
Waittime=0
SchedulerType=sched/backfill
PreemptMode=suspend,gang
PreemptType=preempt/partition_prio
DefMemPerNode=998749
FastSchedule=1
SelectType=select/cons_res
SelectTypeParameters=CR_Core_Memory,CR_CORE_DEFAULT_DIST_BLOCK
TaskPlugin=task/affinity,task/cgroup
PriorityType=priority/multifactor
PriorityDecayHalfLife=3-0
PriorityFavorSmall=YES
PriorityMaxAge=7-0
PriorityWeightAge=1000
PriorityWeightFairshare=0
PriorityWeightJobSize=125
PriorityWeightPartition=1000
PriorityWeightQOS=0
SlurmctldLogFile=/var/log/slurm-llnl/slurmctld.log
SlurmdLogFile=/var/log/slurm-llnl/slurmd.log
AccountingStorageType=accounting_storage/filetxt
CheckpointType=checkpoint/none
AccountingStorageHost=vital
AccountingStorageLoc=/var/log/slurm-llnl/accounting
AccountingStoragePass=/var/run/munge/munge.socket.2
AccountingStoragePort=6819
AccountingStorageUser=slurm
AccountingStoreJobComment=YES
ClusterName=bioinfo
ControlMachine=vital
JobCompHost=vital
JobCompLoc=/var/log/slurm-llnl/job_completions
JobCompPass=<xxxxxxxx>
JobCompPort=6819
JobCompType=jobcomp/filetxt
JobCompUser=slurm
JobAcctGatherFrequency=30
JobAcctGatherType=jobacct_gather/cgroup
SlurmctldDebug=verbose
SlurmdDebug=verbose
BurstBufferType=burst_buffer/generic
NodeName=vital NodeAddr=172.25.2.25 CPUs=224 RealMemory=1031517 Sockets=4 CoresPerSocket=28 ThreadsPerCore=2 State=UNKNOWN Gres=gpu:1 MemSpecLimit=32768
PartitionName=batch Nodes=vital Shared=FORCE:1 OverSubscribe=YES Default=YES MaxTime=INFINITE State=UP
=============================================================================
/etc/slurm-llnl/cgroup.conf
=============================================================================
CgroupMountpoint="/sys/fs/cgroup"
CgroupAutomount=yes
CgroupReleaseAgentDir="/etc/slurm-llnl/cgroup"
ConstrainCores=yes
ConstrainRAMSpace=yes
=============================================================================
/etc/cgconfig.conf
=============================================================================
group interactive {
  cpu {
  }
  cpuset {
    cpuset.cpus = 216-223;
    cpuset.cpu_exclusive = 1;
    cpuset.mem_exclusive = 1;
    cpuset.mem_hardwall = 1;
    cpuset.memory_migrate = 0;
    cpuset.memory_spread_page = 0;
    cpuset.memory_spread_slab = 0;
    cpuset.mems = 0;
    cpuset.sched_load_balance = 0;
    cpuset.sched_relax_domain_level = -1;
  }
  memory {
    memory.limit_in_bytes = "8G";
    memory.memsw.limit_in_bytes = "8G";
  }
}
=============================================================================
/etc/cgrules.conf
=============================================================================
root cpu,memory /
slurm cpu,memory /
* cpuset,memory /interactive
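(For reference, these two files have to be (re)applied after editing; with
the libcgroup tools that is roughly the following, although the exact
service names vary by distribution:)

  cgconfigparser -l /etc/cgconfig.conf    # recreate the static groups defined in cgconfig.conf
  pkill cgrulesengd; cgrulesengd          # restart the rules daemon so new processes are classified per cgrules.conf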
The idea is to create a cgroup with a cpuset for Slurm, covering threads 1
to 216, along the lines of the sketch below. Let's see if it works.
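Something like this in /etc/cgconfig.conf is what I have in mind (completely
untested; the thread numbers use the kernel's 0-based numbering and the
memory nodes are only a guess at the topology), plus a corresponding cpuset
rule for the slurm user in /etc/cgrules.conf:

group slurm {
  cpuset {
    cpuset.cpus = 0-215;   # everything except the 8 interactive threads
    cpuset.mems = 0-3;     # assuming one NUMA node per socket; adjust to the real topology
  }
}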
Best.
--
David da Silva Pires