[slurm-users] seff Not Calculating [FIXED?]
Diego Zuccato
diego.zuccato at unibo.it
Tue Nov 17 11:38:51 UTC 2020
Il 09/11/20 12:53, Diego Zuccato ha scritto:
> Seems my corrections actually work only for single-node jobs.
> In case of multi-node jobs, it only considers the memory used on one
> node, hence underestimates the real efficiency.
> Can someone more knowledgeable than me spot the error?
It seems I managed to make it account for the memory on all the nodes.
See attached file.
The results seem quite meaningful and match the ones done by hand.
--
Diego Zuccato
DIFA - Dip. di Fisica e Astronomia
Servizi Informatici
Alma Mater Studiorum - Università di Bologna
V.le Berti-Pichat 6/2 - 40127 Bologna - Italy
tel.: +39 051 20 95786
-------------- next part --------------
#!/usr/bin/perl
use warnings;
use strict qw/vars/;
use Getopt::Std;
use POSIX qw/pow/;
use Sys::Hostname;
use Slurmdb ':all';
use Slurm ':all';
#use Data::Dumper;
# Version string reported by the -v option.
my $VERSION = "2.1";
# This script is roughly equivalent to:
# sacct -P -n -a --format JobID,User,Group,State,Cluster,AllocCPUS,REQMEM,TotalCPU,Elapsed,MaxRSS,ExitCode,NNodes,NTasks -j <job_id>

# Parse the command line. -h, -v and -d are plain flags; 'f:' is accepted
# with an argument but is not used anywhere in the visible script.
my %opts;
getopts('hvdf:',\%opts);

# -v: print the version and stop.
if (exists $opts{v}) {
    print "seff Version $VERSION\n";
    exit 1;
}

# -h, or anything other than exactly one positional argument: print usage.
my $wants_help = exists $opts{h};
my $arg_count  = scalar @ARGV;
if ($wants_help || $arg_count != 1) {
    print "Usage: seff [Options] <Jobid>\n",
          " Options:\n",
          " -h Help menu\n",
          " -v Version\n",
          " -d Debug mode: display raw Slurm data\n";
    exit 1;
}

# -d switches on the raw-data dump printed before the report.
my $mydebug = exists $opts{d} ? 1 : 0;

# The single remaining argument is the job id to look up.
my $jobid_arg = $ARGV[0];
# Obtain an accounting-database connection and a Slurm API handle.
my $db_conn = Slurmdb::connection_get();
my $slurm = Slurm::new();
# Get cluster name from Slurm config file.
my $conf = $slurm->load_ctl_conf();
my $clustername = $conf->{'cluster_name'};
# Build the query: this cluster only, the job id given on the command line.
my %job_cond = ();
$job_cond{without_usage_truncation} = 1;
$job_cond{cluster_list} = [$clustername];
$job_cond{step_list} = $jobid_arg;
$job_cond{usage_start} = 0;
$job_cond{usage_end} = 0;
# Get and test for a single job. An undefined result is treated the same
# as an empty list so that it is never dereferenced.
my $jobs = Slurmdb::jobs_get($db_conn, \%job_cond);
if (!defined $jobs || scalar @$jobs < 1) {
    print STDERR "Job not found.\n";
    exit 2;
}
# Only the first returned job record is reported.
my $job = $jobs->[0];
#print Dumper($job);
my $jobid = $job->{'jobid'};
my $user = $job->{'user'};
# Resolve the group name locally; when the gid is unknown on this host,
# fall back to the numeric gid instead of printing an undefined value.
my $group = getgrgid($job->{'gid'});
$group = $job->{'gid'} unless defined $group;
my $state = $slurm->job_state_string($job->{'state'});
# From here on use the cluster name recorded in the job itself.
$clustername = $job->{'cluster'};
my $ncpus = $job->{'req_cpus'}; #@@@ was alloc_cpus
# Check for missing number of cpus.
if ($ncpus == 0) {
    $ncpus = 1;
}
my $reqmem = $job->{'req_mem'};
my $nnodes = $job->{'alloc_nodes'};
# Check for missing number of nodes.
if ($nnodes == 0) {
    $nnodes = 1;
}
# req_mem is either a per-CPU or a per-node amount, distinguished by the
# MEM_PER_CPU flag bit. Scale it to a job-wide total; the result is later
# formatted by kbytes2str, so the factor 1024 presumably converts MB to KB
# -- TODO confirm against the Slurm version in use.
my $pernode;
if ($reqmem & MEM_PER_CPU) {
    $reqmem = ($reqmem & ~MEM_PER_CPU) * 1024 * $ncpus;
    $pernode = 0;
} else {
    $reqmem = $reqmem * 1024 * $nnodes;
    $pernode = 1;
}
my $walltime = $job->{'elapsed'};
# Only use hi-order byte for error code.
my $exit_status = $job->{'exitcode'} >> 8;
# For array jobs, build the "<array_job_id>_<task_id>" label.
my $array_job_id = $job->{'array_job_id'};
my $array_jobid = "";
if ($array_job_id != 0) {
    # Convert array_task_id to a signed long integer.
    my $array_task_id = unpack('l', pack('l', $job->{'array_task_id'}));
    if ($array_task_id == -2) {
        print STDERR "Badly formatted array jobid $array_job_id with task_id = -2\n";
        exit 3;
    }
    $array_jobid = "${array_job_id}_${array_task_id}";
}
# Accumulate CPU time and memory usage over every step of the job.
my $tot_cpu_sec = 0;
my $tot_cpu_usec = 0;
my $mem = 0;
my $ntasks = 0;
for my $step (@{$job->{'steps'}}) {
    $tot_cpu_sec += $step->{'tot_cpu_sec'};
    $tot_cpu_usec += $step->{'tot_cpu_usec'};

    # The per-task peak is taken from entry "2" of tres_usage_in_max, a
    # comma-separated list of "id=value" pairs (stats->rss_max was used
    # before). Presumably id 2 is the memory TRES reported in bytes, hence
    # the division by 1024 -- TODO confirm.
    # Steps without usage data (field undefined, empty, or lacking id 2)
    # contribute zero memory instead of raising warnings.
    my $tres_max = $step->{'stats'}{'tres_usage_in_max'};
    my %usage_of;
    if (defined $tres_max) {
        for my $pair (split /,/, $tres_max) {
            my ($id, $value) = split /=/, $pair, 2;
            $usage_of{$id} = $value if defined $value;
        }
    }
    my $lmem = defined $usage_of{'2'} ? $usage_of{'2'} / 1024 : 0;

    # Estimate the step's total as (per-task peak) x (number of tasks) and
    # add it to the job total. An earlier version kept only the largest
    # per-step peak and multiplied it by that step's task count.
    # NOTE(review): summing over steps may over-count steps that ran one
    # after another -- confirm this matches the intended estimate.
    my $ltasks = $step->{'ntasks'};
    $ntasks += $ltasks;
    $mem += $lmem * $ltasks;
}
# Total CPU seconds, with the microsecond part rounded to the nearest second.
my $cput = $tot_cpu_sec + int(($tot_cpu_usec / 1000000) + 0.5);
# In debug mode, dump every gathered value on one line under a header line.
if ($mydebug) {
    print "Slurm data: JobID ArrayJobID User Group State Clustername Ncpus Nnodes Ntasks Reqmem PerNode Cput Walltime Mem ExitStatus\n",
          "Slurm data: $jobid $array_jobid $user $group $state $clustername $ncpus $nnodes $ntasks $reqmem $pernode $cput $walltime $mem $exit_status\n\n";
}

# Identification header.
print "Job ID: $jobid\n";
print "Array Job ID: $array_jobid\n" if length $array_jobid;
print "Cluster: $clustername\n";
print "User/Group: $user/$group\n";

# The exit code is shown for every state except PENDING and RUNNING.
if ($state ne "PENDING" && $state ne "RUNNING") {
    print "State: $state (exit code $exit_status)\n";
}
else {
    print "State: $state\n";
}

# Multi-core jobs get a node count and a per-node core count.
if ($ncpus != 1) {
    print "Nodes: $nnodes\n";
    printf "Cores per node: %d\n", $ncpus/$nnodes;
}
else {
    print "Cores: $ncpus\n";
}

# Efficiency figures are only produced once the job has left PENDING.
if ($state eq "PENDING") {
    print "Efficiency not available for jobs in the PENDING state.\n";
}
else {
    # CPU efficiency: CPU time used versus walltime multiplied by cores.
    my $core_seconds = $walltime * $ncpus;
    my $cpu_pct = $core_seconds != 0 ? $cput / $core_seconds * 100 : 0.0;
    printf("CPU Utilized: %s\n", time2str($cput));
    printf("CPU Efficiency: %.2f%% of %s core-walltime\n", $cpu_pct, time2str($core_seconds));

    # With more than one task the memory figure is flagged as an estimate.
    my $single_task = ($ntasks == 1);
    my $used_fmt = $single_task
        ? "Memory Utilized: %s\n"
        : "Memory Utilized: %s (estimated maximum)\n";
    printf($used_fmt, kbytes2str($mem));

    # Memory efficiency: memory used versus the total requested memory.
    my $mem_pct = $reqmem != 0 ? $mem / $reqmem * 100 : 0.0;
    if ($single_task) {
        printf("Memory Efficiency: %.2f%% of %s\n", $mem_pct, kbytes2str($reqmem));
    }
    elsif ($pernode) {
        printf("Memory Efficiency: %.2f%% of %s (%s\/node)\n", $mem_pct, kbytes2str($reqmem), kbytes2str($reqmem / $nnodes));
    }
    else {
        printf("Memory Efficiency: %.2f%% of %s (%s\/core)\n", $mem_pct, kbytes2str($reqmem), kbytes2str($reqmem / $ncpus));
    }

    print "WARNING: Efficiency statistics may be misleading for $state jobs.\n"
        if $state eq "RUNNING";
}
# Format a duration given in seconds as "[D-]HH:MM:SS".
#
# Takes:   the number of seconds.
# Returns: hours, minutes and seconds each padded to two characters; the
#          "D-" day prefix is present only when at least one full day
#          has elapsed.
sub time2str {
    my ($secs_left) = @_;

    # Peel off whole days, then whole hours, from the running remainder.
    my $day_count = int($secs_left / 86400);
    $secs_left -= $day_count * 86400;

    my $hour_count = int($secs_left / 3600);
    $secs_left -= $hour_count * 3600;

    # What is left is below one hour: split it into minutes and seconds.
    my @fields = ($hour_count, int($secs_left / 60), $secs_left % 60);
    my $clock = sprintf("%02s:%02s:%02s", @fields);

    return $clock if $day_count < 1;
    return "$day_count-" . $clock;
}
# Convert a memory amount given in kilobytes to a human-readable string.
#
# Takes:   the amount in KB (fractional values are allowed).
# Returns: the amount with two decimals followed by a unit from
#          KB, MB, GB, TB, PB, EB, scaled in steps of 1024.
#
# Amounts below 1024 KB are reported in KB, and amounts too large for the
# unit table stay in the largest unit, so the prefix index can never run
# off either end of the table.
sub kbytes2str {
    my $kbytes = shift;
    # Zero is reported in MB, as it always has been.
    if ($kbytes == 0) {
        return sprintf("%.2f %sB", 0.0, 'M');
    }
    my $mul = 1024;
    my @pre = qw/ K M G T P E /;
    my $idx = 0;
    my $scaled = $kbytes;
    # Repeated division instead of log()/pow(): no floating-point rounding
    # at exact powers of 1024, and no fatal error on negative input.
    while (abs($scaled) >= $mul && $idx < $#pre) {
        $scaled /= $mul;
        $idx++;
    }
    return sprintf("%.2f %sB", $scaled, $pre[$idx]);
}
More information about the slurm-users
mailing list