[slurm-users] seff Not Calculating [FIXED?]

Diego Zuccato diego.zuccato at unibo.it
Tue Nov 17 11:38:51 UTC 2020


Il 09/11/20 12:53, Diego Zuccato ha scritto:

> Seems my corrections actually work only for single-node jobs.
> In case of multi-node jobs, it only considers the memory used on one
> node, hence underestimates the real efficiency.
> Can someone more knowledgeable than me spot the error?

It seems I managed to have it account for the memory on all the nodes.
See attached file.
The results seem quite meaningful and match the ones done by hand.

-- 
Diego Zuccato
DIFA - Dip. di Fisica e Astronomia
Servizi Informatici
Alma Mater Studiorum - Università di Bologna
V.le Berti-Pichat 6/2 - 40127 Bologna - Italy
tel.: +39 051 20 95786
-------------- next part --------------
#!/usr/bin/perl

use warnings;
use strict qw/vars/;

use Getopt::Std;
use POSIX qw/pow/;
use Sys::Hostname;
use Slurmdb ':all';
use Slurm ':all';
#use Data::Dumper;

my $VERSION = "2.1";

# This script is roughly equivalent to:
# sacct -P -n -a --format JobID,User,Group,State,Cluster,AllocCPUS,REQMEM,TotalCPU,Elapsed,MaxRSS,ExitCode,NNodes,NTasks -j <job_id>

# Recognised switches: -h (help), -v (version), -d (debug). The parser also
# accepts -f with a value, but nothing in this script reads it.
my %opts;
getopts('hvdf:',\%opts);

# -v wins over everything else, including a missing job id.
if (exists $opts{v}) {
    print "seff Version $VERSION\n";
    exit 1;
}

# Show the usage text on -h, or whenever there is not exactly one argument.
if (exists $opts{h} || scalar @ARGV != 1) {
    print "Usage: seff [Options] <Jobid>\n",
          "       Options:\n",
          "       -h    Help menu\n",
          "       -v    Version\n",
          "       -d    Debug mode: display raw Slurm data\n";
    exit 1;
}

# Debug flag as a plain 0/1 value.
my $mydebug = exists $opts{d} ? 1 : 0;

# Exactly one positional argument is left at this point: the job id.
my ($jobid_arg) = @ARGV;

my $db_conn = Slurmdb::connection_get();
my $slurm = Slurm::new();

# The cluster name is read from the loaded Slurm configuration.
my $conf = $slurm->load_ctl_conf();
my $clustername = $conf->{'cluster_name'};

# Conditions handed to Slurmdb::jobs_get to select the requested job.
my %job_cond = (
    without_usage_truncation => 1,
    cluster_list             => [$clustername],
    step_list                => $jobid_arg,
    usage_start              => 0,
    usage_end                => 0,
);

# Run the query; only the first job returned is reported on.
my $jobs = Slurmdb::jobs_get($db_conn, \%job_cond);
if (!@$jobs) {
    print STDERR "Job not found.\n";
    exit 2;
}
my $job = $jobs->[0];
#print Dumper($job);

# Pull the job-level fields out of the accounting record.
my $jobid = $job->{'jobid'};
my $user = $job->{'user'};
# getgrgid returns undef when the gid has no group entry on this host.
# Fall back to the numeric gid (or an empty string if the record has no gid)
# so the report does not trip an "uninitialized value" warning.
my $group = defined $job->{'gid'} ? getgrgid($job->{'gid'}) : undef;
if (!defined $group) {
    $group = defined $job->{'gid'} ? $job->{'gid'} : '';
}
my $state = $slurm->job_state_string($job->{'state'});
# From here on, report the cluster recorded with the job itself.
$clustername = $job->{'cluster'};
# Uses req_cpus; this line previously read alloc_cpus.
my $ncpus = $job->{'req_cpus'};
# Check for missing number of cpus.
if ($ncpus == 0) { $ncpus = 1; }
my $reqmem = $job->{'req_mem'};
my $nnodes = $job->{'alloc_nodes'};
# Check for missing number of nodes.
if ($nnodes == 0) { $nnodes = 1; }
# Scale the request to a job-wide total. When the MEM_PER_CPU flag bit is
# set, the remaining bits are multiplied by the CPU count; otherwise the
# value is multiplied by the node count. $pernode records which case applied.
# NOTE(review): the factor 1024 presumably converts MB to KB - confirm.
my $pernode;
if ($reqmem & MEM_PER_CPU) {
    $reqmem = ($reqmem & ~MEM_PER_CPU) * 1024 * $ncpus;
    $pernode = 0;
} else {
    $reqmem = $reqmem * 1024 * $nnodes;
    $pernode = 1;
}
my $walltime = $job->{'elapsed'};
# Only use hi-order byte for error code.
my $exit_status = $job->{'exitcode'} >> 8;

# Build the "<array_job_id>_<task_id>" label for array jobs; it stays empty
# for jobs whose array_job_id is 0.
my $array_job_id = $job->{'array_job_id'};
my $array_jobid = "";
if ($array_job_id != 0) {
    # Convert array_task_id to a signed long integer.
    my $array_task_id = unpack('l', pack('l', $job->{'array_task_id'}));
    if ($array_task_id == -2) {
        print STDERR "Badly formatted array jobid $array_job_id with task_id = -2\n";
        exit 3;
    }
    $array_jobid = "${array_job_id}_${array_task_id}";
}
# Accumulate CPU time, task count and memory use over all steps of the job.
my $tot_cpu_sec = 0;
my $tot_cpu_usec = 0;
my $mem = 0;
my $ntasks = 0;
for my $step (@{$job->{'steps'}}) {
    $tot_cpu_sec += $step->{'tot_cpu_sec'};
    $tot_cpu_usec += $step->{'tot_cpu_usec'};

    # Memory is taken from the entry with id 2 in tres_usage_in_max, a
    # comma-separated list of id=value pairs (an earlier version used
    # rss_max and kept only the largest step). A step may carry no usage
    # string, or one without an id-2 entry; such steps contribute zero
    # memory instead of raising "uninitialized value" warnings.
    my $tres_usage = $step->{'stats'}{'tres_usage_in_max'};
    my $lmem = 0;
    if (defined $tres_usage && length $tres_usage) {
        my %usage_of;
        for my $pair (split /,/, $tres_usage) {
            my ($id, $value) = split /=/, $pair, 2;
            next unless defined $id && defined $value;
            $usage_of{$id} = $value;
        }
        if (defined $usage_of{'2'}) {
            $lmem = $usage_of{'2'} / 1024;
        }
    }

    # Each task of the step is assumed to have used the step's maximum, so
    # the job total is an upper estimate.
    my $ltasks = $step->{'ntasks'};
    $ltasks = 0 unless defined $ltasks;
    $ntasks += $ltasks;
    $mem += $lmem * $ltasks;
}
# Total CPU seconds, with the microsecond part rounded to the nearest second.
my $cput = $tot_cpu_sec + int(($tot_cpu_usec / 1000000) + 0.5);

# Optional raw dump of every value the report below is built from.
if ($mydebug) {
    print "Slurm data: JobID ArrayJobID User Group State Clustername Ncpus Nnodes Ntasks Reqmem PerNode Cput Walltime Mem ExitStatus\n",
          "Slurm data: $jobid $array_jobid $user $group $state $clustername $ncpus $nnodes $ntasks $reqmem $pernode $cput $walltime $mem $exit_status\n\n";
}

# Identification header.
print "Job ID: $jobid\n";
print "Array Job ID: $array_jobid\n" if length $array_jobid;
print "Cluster: $clustername\n";
print "User/Group: $user/$group\n";

# The exit code is shown for every state except PENDING and RUNNING.
if ($state ne "PENDING" && $state ne "RUNNING") {
    print "State: $state (exit code $exit_status)\n";
} else {
    print "State: $state\n";
}

# Single-CPU jobs get a one-line summary; everything else is broken down
# into node count and cores per node.
if ($ncpus != 1) {
    print "Nodes: $nnodes\n";
    printf "Cores per node: %d\n", $ncpus/$nnodes;
} else {
    print "Cores: $ncpus\n";
}

if ($state eq "PENDING") {
    print "Efficiency not available for jobs in the PENDING state.\n";
} else {
    # CPU efficiency: CPU time used as a share of elapsed time times CPUs.
    my $core_seconds = $walltime * $ncpus;
    my $cpu_pct = ($core_seconds != 0) ? $cput / $core_seconds * 100 : 0.0;
    printf("CPU Utilized: %s\n", time2str($cput));
    printf("CPU Efficiency: %.2f%% of %s core-walltime\n", $cpu_pct, time2str($core_seconds));

    # With more than one task the memory figure is an estimate.
    if ($ntasks != 1) {
        printf("Memory Utilized: %s (estimated maximum)\n", kbytes2str($mem));
    } else {
        printf("Memory Utilized: %s\n", kbytes2str($mem));
    }

    # Memory efficiency: memory used as a share of the total requested.
    my $mem_pct = ($reqmem != 0) ? $mem / $reqmem * 100 : 0.0;
    if ($ntasks == 1) {
        printf("Memory Efficiency: %.2f%% of %s\n", $mem_pct, kbytes2str($reqmem));
    } elsif ($pernode) {
        printf("Memory Efficiency: %.2f%% of %s (%s\/node)\n", $mem_pct, kbytes2str($reqmem), kbytes2str($reqmem / $nnodes));
    } else {
        printf("Memory Efficiency: %.2f%% of %s (%s\/core)\n", $mem_pct, kbytes2str($reqmem), kbytes2str($reqmem / $ncpus));
    }

    print "WARNING: Efficiency statistics may be misleading for $state jobs.\n"
        if $state eq "RUNNING";
}

# Format a number of seconds as "HH:MM:SS", prefixed with "D-" when the
# duration spans at least one full day.
sub time2str {
    my ($remaining) = @_;

    # Peel off whole days, then whole hours, leaving the rest in $remaining.
    my @whole;
    for my $unit (86400, 3600) {
        my $count = int($remaining / $unit);
        $remaining -= $count * $unit;
        push @whole, $count;
    }
    my ($days, $hours) = @whole;

    my $clock = sprintf("%02s:%02s:%02s",
                        $hours, int($remaining / 60), $remaining % 60);

    return $days < 1 ? $clock : "$days-" . $clock;
}

# Convert memory to human-readable string.
#
# Takes an amount in kilobytes and returns it with two decimals and a
# 1024-based unit between KB and EB, e.g. 1536 -> "1.50 MB".
# Zero (or an undefined value) is reported as "0.00 MB".
sub kbytes2str {
    my $kbytes = shift;
    if (!defined $kbytes || $kbytes == 0) {
        return sprintf("%.2f %sB", 0.0, 'M');
    }
    my $mul = 1024;

    # The table starts at K so that amounts below 1024 KB get a proper unit
    # rather than wrapping around to the last entry of the list.
    my @pre = qw/ K M G T P E /;

    # Scale by repeated division instead of log(): this avoids a fatal error
    # on negative input and keeps the unit index inside the table for
    # amounts beyond the largest prefix.
    my $value = $kbytes;
    my $exp = 0;
    while (abs($value) >= $mul && $exp < $#pre) {
        $value /= $mul;
        $exp++;
    }
    return sprintf("%.2f %sB", $value, $pre[$exp]);
}


More information about the slurm-users mailing list