#
# slurm.conf file generated by configurator easy.html.
# Put this file on all nodes of your cluster.
# See the slurm.conf man page for more information.
#
# cluster master and backup
#
ControlMachine=g-vm03
ControlAddr=g-vm03.cmm.in.tum.de
MailProg="/bin/mail"
MpiDefault="none"
MpiParams=ports=55000-60000
ProctrackType="proctrack/cgroup"

##
# Rebooting nodes
##
# return after reboot
ReturnToService=2
# reboot a node once it becomes idle
RebootProgram="/sbin/reboot"
# raise the timeout for rebooting to 10 mins
ResumeTimeout=600

##
# Health check
##
HealthCheckProgram=/etc/slurm/healthcheck.sh
# check every 10 mins
HealthCheckInterval=600

# authentication
AuthType=auth/munge
CryptoType=crypto/munge

# job control

##
# slurm run files and communication ports
##
SlurmctldPidFile=/run/slurmctld.pid
SlurmctldPort=6817
SlurmdPidFile=/run/slurmd.pid
SlurmdPort=6818
SlurmdSpoolDir=/var/spool/slurmd/
SlurmUser=slurm
SrunPortRange=60001-63000

##
# slurm task switches
###
SwitchType=switch/none
#TaskEpilog=
TaskPlugin=task/cgroup,task/affinity
#TaskPluginParam=
# send group id's as listed on the job-spawning node
LaunchParameters=send_gids

#
#
# TIMERS
# increase timeout until 'Kill Task Failed' is thrown
UnkillableStepTimeout=300
# slurmctld connection management
MessageTimeout=60
TCPTimeout=30
SlurmctldParameters=cloud_dns,conmgr_max_connections=64,conmgr_threads=8

#
#
# SCHEDULING
EnforcePartLimits="yes"
SchedulerType=sched/backfill
SelectType=select/cons_tres
SelectTypeParameters=CR_CPU_Memory
DefMemPerCPU=4000

#
#
# LOGGING AND ACCOUNTING
ClusterName=ag_gagneur
AccountingStorageType="accounting_storage/slurmdbd"
AccountingStorageHost="g-vm03"
AccountingStoragePort="6819"
AccountingStorageTRES=gres/gpu
JobAcctGatherType=jobacct_gather/cgroup

###
# Submitting jobs
###
MinJobAge=300

###
# Slurm Debugging infos
###
SlurmctldDebug="error"
SlurmctldLogFile=/var/log/slurmd/slurmctl.log
SlurmdDebug="info"
SlurmdLogFile=/var/log/slurmd/slurmd.log
#DebugFlags="CPU_Bind,gres"
StateSaveLocation="/var/spool/slurmd/"

####
# Slurm Prolog and Epilog scripts
####
Epilog="/etc/slurm/epilog.d/*.sh"
TaskProlog="/etc/slurm/TaskProlog.sh"

####
# Types of Generic Resources
####
GresTypes=gpu,mps,tmp

PriorityType=priority/multifactor
PreemptMode=REQUEUE
PreemptType=preempt/partition_prio
PriorityWeightAge=100
PriorityWeightTRES=GRES/gpu=1000
PriorityWeightPartition=1000

# COMPUTE NODES
NodeName="ouga03" CPUs=64 RealMemory=256000 CoresPerSocket=8 ThreadsPerCore=2 Weight=10 State=UNKNOWN Feature=sse2,sse4_1,sse4_2,avx
NodeName="ouga04" CPUs=80 RealMemory=512000 CoresPerSocket=10 ThreadsPerCore=2 Weight=20 State=UNKNOWN Feature=fma,sse2,sse4_1,sse4_2,avx,avx2
NodeName="ouga05" CPUs=128 RealMemory=512000 CoresPerSocket=64 ThreadsPerCore=2 Weight=100 State=UNKNOWN Feature=fma,sse2,sse4_1,sse4_2,avx,avx2 Gres=gpu:titanrtx:1
NodeName="ouga06" CPUs=128 RealMemory=512000 CoresPerSocket=64 ThreadsPerCore=2 Weight=400 State=UNKNOWN Feature=fma,sse2,sse4_1,sse4_2,avx,avx2 Gres=gpu:a6000:4
NodeName="ouga07" CPUs=256 RealMemory=1024000 CoresPerSocket=64 ThreadsPerCore=2 Weight=200 State=UNKNOWN Feature=fma,sse2,sse4_1,sse4_2,avx,avx2 Gres=gpu:a40:2
NodeName="ouga08" CPUs=128 RealMemory=512000 CoresPerSocket=64 ThreadsPerCore=2 Weight=200 State=UNKNOWN Feature=fma,sse2,sse4_1,sse4_2,avx,avx2 Gres=gpu:rtx3090:1
NodeName="ouga09" CPUs=32 RealMemory=61440 CoresPerSocket=8 ThreadsPerCore=2 Weight=10 State=UNKNOWN Feature=fma,sse2,sse4_1,sse4_2,avx,avx2
NodeName="ouga10" CPUs=128 RealMemory=512000 CoresPerSocket=64 ThreadsPerCore=2 Weight=400 State=UNKNOWN Feature=fma,sse2,sse4_1,sse4_2,avx,avx2 Gres=gpu:a40:4
NodeName="ouga11" CPUs=128 RealMemory=512000 CoresPerSocket=64 ThreadsPerCore=2 Weight=400 State=UNKNOWN Feature=fma,sse2,sse4_1,sse4_2,avx,avx2 Gres=gpu:a40:4
NodeName="ouga12" CPUs=128 RealMemory=512000 CoresPerSocket=64 ThreadsPerCore=2 Weight=800 State=UNKNOWN Feature=fma,sse2,sse4_1,sse4_2,avx,avx2 Gres=gpu:a40:8
NodeName="ouga13" CPUs=128 \
RealMemory=512000 CoresPerSocket=64 ThreadsPerCore=2 Weight=800 State=UNKNOWN Feature=fma,sse2,sse4_1,sse4_2,avx,avx2 Gres=gpu:a40:8
NodeName="ouga14" CPUs=128 RealMemory=512000 CoresPerSocket=64 ThreadsPerCore=2 Weight=800 State=UNKNOWN Feature=fma,sse2,sse4_1,sse4_2,avx,avx2 Gres=gpu:a40:8
NodeName="ouga15" CPUs=128 RealMemory=512000 CoresPerSocket=64 ThreadsPerCore=2 Weight=400 State=UNKNOWN Feature=fma,sse2,sse4_1,sse4_2,avx,avx2 Gres=gpu:a40:4
NodeName="ouga16" CPUs=256 RealMemory=1024000 CoresPerSocket=64 ThreadsPerCore=2 Weight=400 State=UNKNOWN Feature=fma,sse2,sse4_1,sse4_2,avx,avx2 Gres=gpu:a40:4
NodeName="ouga17" CPUs=256 RealMemory=1024000 CoresPerSocket=64 ThreadsPerCore=2 Weight=400 State=UNKNOWN Feature=fma,sse2,sse4_1,sse4_2,avx,avx2 Gres=gpu:a40:4
NodeName="ouga18" CPUs=128 RealMemory=512000 CoresPerSocket=64 ThreadsPerCore=2 Weight=30 State=UNKNOWN Feature=fma,sse2,sse4_1,sse4_2,avx,avx2
NodeName="ouga19" CPUs=128 RealMemory=512000 CoresPerSocket=64 ThreadsPerCore=2 Weight=30 State=UNKNOWN Feature=fma,sse2,sse4_1,sse4_2,avx,avx2
NodeName="ouga20" CPUs=128 RealMemory=512000 CoresPerSocket=64 ThreadsPerCore=2 Weight=30 State=UNKNOWN Feature=fma,sse2,sse4_1,sse4_2,avx,avx2
NodeName="ouga21" CPUs=128 RealMemory=512000 CoresPerSocket=64 ThreadsPerCore=2 Weight=30 State=UNKNOWN Feature=fma,sse2,sse4_1,sse4_2,avx,avx2
NodeName="ouga22" CPUs=128 RealMemory=512000 CoresPerSocket=64 ThreadsPerCore=2 Weight=30 State=UNKNOWN Feature=fma,sse2,sse4_1,sse4_2,avx,avx2
NodeName="ouga23" CPUs=128 RealMemory=512000 CoresPerSocket=64 ThreadsPerCore=2 Weight=30 State=UNKNOWN Feature=fma,sse2,sse4_1,sse4_2,avx,avx2
NodeName="ouga24" CPUs=128 RealMemory=512000 CoresPerSocket=64 ThreadsPerCore=2 Weight=30 State=UNKNOWN Feature=fma,sse2,sse4_1,sse4_2,avx,avx2
NodeName="ouga25" CPUs=128 RealMemory=512000 CoresPerSocket=32 ThreadsPerCore=2 Weight=2000 State=UNKNOWN Feature=fma,sse2,sse4_1,sse4_2,avx,avx2 Gres=gpu:l40s:4
NodeName="ouga26" CPUs=128 RealMemory=512000 \
CoresPerSocket=32 ThreadsPerCore=2 Weight=2000 State=UNKNOWN Feature=fma,sse2,sse4_1,sse4_2,avx,avx2 Gres=gpu:l40s:4
NodeName="ouga27" CPUs=128 RealMemory=512000 CoresPerSocket=32 ThreadsPerCore=2 Weight=2000 State=UNKNOWN Feature=fma,sse2,sse4_1,sse4_2,avx,avx2 Gres=gpu:l40s:4
NodeName="ouga28" CPUs=240 RealMemory=1536000 CoresPerSocket=60 ThreadsPerCore=2 Weight=5000 State=UNKNOWN Feature=fma,sse2,sse4_1,sse4_2,avx,avx2 Gres=gpu:h200:8

# PARTITIONS
PartitionName=lowprio Nodes="ouga[03-27]" Default="NO" PriorityTier=5 PreemptMode="REQUEUE" MaxTime="INFINITE" State="UP" OverSubscribe="NO" AllowGroups="cluster_access"
PartitionName=noninterruptive Nodes="ouga[03-10],ouga[12-14],ouga[24-26]" Default="NO" PriorityTier=10 PreemptMode="off" MaxTime="INFINITE" State="UP" OverSubscribe="NO" AllowGroups="cluster_access"
PartitionName=standard Nodes="ouga[03-27]" Default="YES" PriorityTier=10 PreemptMode="REQUEUE" MaxTime="INFINITE" State="UP" OverSubscribe="NO" AllowGroups="cluster_access"
PartitionName=urgent Nodes="ouga[03-28]" Default="NO" PriorityTier=20 PreemptMode="REQUEUE" MaxTime="1-0" State="UP" OverSubscribe="NO" AllowGroups="cluster_access"
PartitionName=highperformance Nodes="ouga[25-28]" Default="NO" PriorityTier=15 PreemptMode="REQUEUE" MaxTime="INFINITE" State="UP" OverSubscribe="NO" AllowGroups="cluster_access"
PartitionName=jupyterhub Nodes="ouga[05-08],ouga[10-11],ouga[14-22],ouga24" Default="NO" PriorityTier=30 PreemptMode="off" MaxTime="0-12" State="UP" OverSubscribe="NO" AllowGroups="cluster_access"