[slurm-users] slurm 17 node feature Failed to reboot nodes

Tueur Volvo huitre39 at gmail.com
Tue Mar 27 06:39:06 MDT 2018


Hello, I migrated from Slurm 16 to Slurm 17.11.4

but my node feature plugin does not work: I apply my feature and reboot, but when
the machine has rebooted Slurm puts my node in the drain state.

With sinfo -r I get this error message:
Failed to reboot nodes machine415 into expected state for job 945

My problem is located in the Slurm source, in this file:
slurm-17.11.4/src/slurmctld/job_scheduler.c
line 4222

static void *_wait_boot(void *arg)
{

    if (boot_node_bitmap && bit_set_count(boot_node_bitmap)) {
        char *node_list = bitmap2node_name(boot_node_bitmap);
        error("Failed to reboot nodes %s into expected state for job %u",
              node_list, job_ptr->job_id);
        (void) drain_nodes(node_list, "Node mode change failure",
                   getuid());
        xfree(node_list);
        (void) job_requeue(getuid(), job_ptr->job_id, NULL, false, 0);
    }




When I comment out this function, my node feature plugin works!


But I want to solve my problem by modifying my plugin code,
and I don't know what to change in my code.
What should I save as a variable in my code? With xmalloc or xrealloc?

I am sharing my basic source code. I think that I should save active_features when
the function node_features_p_node_set is called, but how can I save this
variable? I tried to save *active_features into char **current_mode, but I
get the same error.

Thanks in advance for your help.




#include "config.h"

#define _GNU_SOURCE    /* For POLLRDHUP */
#include <ctype.h>
#include <poll.h>
#include <signal.h>
#include <stdlib.h>
#include <sys/stat.h>
#include <sys/types.h>
#include <unistd.h>

#if defined(__FreeBSD__) || defined(__NetBSD__)
#define POLLRDHUP POLLHUP
#endif

#include "slurm/slurm.h"

#include "src/common/assoc_mgr.h"
#include "src/common/bitstring.h"
#include "src/common/fd.h"
#include "src/common/gres.h"
#include "src/common/list.h"
#include "src/common/macros.h"
#include "src/common/pack.h"
#include "src/common/parse_config.h"
#include "src/common/slurm_protocol_api.h"
//#include "src/common/slurm_strcasestr.h"
#include "src/common/timers.h"
#include "src/common/uid.h"
#include "src/common/xmalloc.h"
#include "src/common/xstring.h"
#include "src/slurmctld/job_scheduler.h"
#include "src/slurmctld/locks.h"
#include "src/slurmctld/node_scheduler.h"
#include "src/slurmctld/reservation.h"
#include "src/slurmctld/slurmctld.h"
#include "src/slurmctld/state_save.h"


/*
 * Plugin identity constants.
 * NOTE(review): presumably read by the Slurm plugin loader to identify and
 * version-check this plugin - confirm against the plugin loader.
 */
/* Free-form description of this plugin. */
const char plugin_name[]        = "node_features update_linux plugin";
/* Type string written as "node_features/<plugin name>". */
const char plugin_type[]        = "node_features/update_linux";
/* Taken from SLURM_VERSION_NUMBER when the plugin is compiled. */
const uint32_t plugin_version   = SLURM_VERSION_NUMBER;


/*
 * Report this plugin's configuration.
 * The plugin has no configuration of its own, so nothing is filled in.
 * RET SLURM_SUCCESS always
 */
extern int node_features_p_get_config()  {
    /* The function is declared to return int, so it must return a value:
     * falling off the end left the result undefined for any caller that
     * reads it. */
    return SLURM_SUCCESS;
}

/*
 * Decide whether a requested node update is acceptable to this plugin.
 * IN node_ptr - node being updated (not inspected)
 * IN update_node_msg - requested update (not inspected)
 * RET true always: every update is accepted
 */
extern bool node_features_p_node_update_valid(void *node_ptr,
                                              update_node_msg_t *update_node_msg)
{
    const bool update_ok = true;

    return update_ok;
}

/*
 * Estimated node boot time.
 * RET 360, i.e. the six-minute estimate computed as 6 * 60
 */
extern uint32_t node_features_p_boot_time(void)
{
    const uint32_t estimated_minutes = 6;

    return estimated_minutes * 60;
}

/*
 * Report whether a feature is under this plugin's control.
 * IN feature - feature name (not inspected)
 * RET true always: every feature is reported as changeable
 */
extern bool node_features_p_changible_feature(char *feature)
{
    (void) feature;    /* the answer does not depend on the name */

    return true;
}

/*
 * Report whether a feature is under this plugin's control.
 * IN feature - feature name (not inspected)
 * RET true always: every feature is reported as changeable
 */
extern bool node_features_p_changeable_feature(char *feature)
{
    (void) feature;    /* the answer does not depend on the name */

    return true;
}

/*
 * Step configuration hook.
 * IN mem_sort - not read
 * IN numa_bitmap - not read
 * No-op: this plugin performs no per-step configuration.
 */
extern void node_features_p_step_config(bool mem_sort, bitstr_t *numa_bitmap)
{
    (void) mem_sort;
    (void) numa_bitmap;
}

/*
 * Translate a node's new feature specification.
 * This plugin applies no reordering, so the result has the same content as
 * the input.
 * IN new_features - node's new feature specification
 * RET copy of new_features allocated with xstrdup()
 * NOTE(review): the Slurm node_features plugin API describes the result as
 * "must be xfreed" by the caller - confirm against the 17.11 sources.
 * NOTE(review): xstrdup() presumably maps a NULL input to a NULL result -
 * confirm in src/common/xstring.c.
 */
extern char *node_features_p_node_xlate2(char *new_features)  {
    /* Return a private copy rather than the caller's own pointer: handing
     * back the argument makes the "result" alias memory the caller still
     * owns, so releasing the result would free that memory twice. */
    return xstrdup(new_features);
}


/* Load configuration */
extern int init(void)
{
    debug("init_new
plugin*********************************************************************************");
    int rc = SLURM_SUCCESS;
    return rc;
}

/*
 * Plugin unload hook: emits a debug trace and reports success.
 * RET SLURM_SUCCESS always
 */
extern int fini(void)
{
    int rc = SLURM_SUCCESS;

    debug("fini_*********************************************************************************");

    return rc;
}

/*
 * Node state refresh hook.
 * IN node_list - nodes to refresh (not inspected)
 * Emits a debug trace only; no node information is gathered.
 * RET SLURM_SUCCESS always
 */
extern int node_features_p_get_node(char *node_list)
{
    int rc = SLURM_SUCCESS;

    debug("node_features_p_get_node_*********************************************************************************");

    return rc;
}

/*
 * Translate a node's feature specification.
 * This plugin ignores new_features and orig_features and reports the node's
 * available features as the result.
 * IN new_features - newly reported active features (not inspected)
 * IN orig_features - previously active features (not inspected)
 * IN avail_features - node's available features
 * RET copy of avail_features allocated with xstrdup()
 * NOTE(review): the Slurm node_features plugin API describes the result as
 * "must be xfreed" by the caller - confirm against the 17.11 sources.
 * NOTE(review): xstrdup() presumably maps a NULL input to a NULL result -
 * confirm in src/common/xstring.c.
 */
extern char *node_features_p_node_xlate(char *new_features, char *orig_features,
                                        char *avail_features)
{
    /* Return a private copy rather than avail_features itself: the caller
     * still owns that string, so a result that aliases it would be freed
     * twice once both the argument and the result are released. */
    return xstrdup(avail_features);
}

/*
 * Node state query hook.
 * IN/OUT avail_modes - left untouched
 * IN/OUT current_mode - left untouched
 * Emits a debug trace only.
 * NOTE(review): neither *avail_modes nor *current_mode is written, so the
 * caller learns nothing about the node's features from this plugin. If the
 * caller relies on current_mode to see which features are active after a
 * reboot, this is where they would have to be reported - confirm against
 * the caller.
 */
extern void node_features_p_node_state(char **avail_modes, char **current_mode)
{
    (void) avail_modes;
    (void) current_mode;

    debug("node_features_p_node_state_*********************************************************************************");
}

/*
 * Translate a job's feature request to the node features needed at boot
 * time. This plugin applies no translation, so the result has the same
 * content as the request.
 * IN job_features - job's feature request
 * RET copy of job_features allocated with xstrdup()
 * NOTE(review): the Slurm node_features plugin API describes the result as
 * "must xfree() to release memory" - confirm against the 17.11 sources.
 * NOTE(review): xstrdup() presumably maps a NULL input to a NULL result -
 * confirm in src/common/xstring.c.
 */
extern char *node_features_p_job_xlate(char *job_features)
{
    debug("node_features_p_job_xlate_*********************************************************************************");

    /* Return a private copy rather than the job's own string: a caller that
     * releases the result would otherwise free memory the job record still
     * points at. */
    return xstrdup(job_features);
}

/*
 * Check a job's feature specification.
 * IN job_features - job's feature request (not inspected)
 * RET SLURM_SUCCESS always: every specification is accepted
 */
extern int node_features_p_job_valid(char *job_features)
{
    int rc = SLURM_SUCCESS;

    debug("node_features_p_job_valid_*********************************************************************************");

    return rc;
}

/*
 * Accept any node update.
 * IN node_ptr - node being updated (not inspected)
 * IN update_node_msg - requested update (not inspected)
 * RET true always
 * NOTE(review): every other entry point in this file uses the
 * node_features_p_ prefix, and node_features_p_node_update_valid() already
 * exists above. Confirm that exporting this node_features_g_ symbol from
 * the plugin is intended.
 */
bool node_features_g_node_update_valid(void *node_ptr,
                                       update_node_msg_t *update_node_msg)
{
    const bool update_ok = true;

    return update_ok;
}

/*
 * Set the node's active features based upon job constraints.
 * NOTE: Executed by the slurmd daemon.
 * This plugin changes nothing on the node; it only truncates the caller's
 * feature string to the empty string.
 * IN/OUT active_features - new active features; emptied in place on return.
 *                          A NULL pointer is accepted and left alone.
 * RET SLURM_SUCCESS always
 * NOTE(review): presumably the string is emptied so the caller does not
 * forward the features any further - confirm against the slurmd caller.
 */
extern int node_features_p_node_set(char *active_features)
{
    debug("node_features_p_node_set_*********************************************************************************");

    /* Without a feature string there is nothing to clear, and writing
     * through a NULL pointer would crash the daemon. */
    if (!active_features) {
        return SLURM_SUCCESS;
    }

    active_features[0] = '\0';

    return SLURM_SUCCESS;
}

/*
 * Report whether the plugin requires PowerSave mode for booting nodes.
 * RET false always
 */
extern bool node_features_p_node_power(void)
{
    const bool needs_power_save = false;

    debug("node_features_p_node_power_*********************************************************************************");

    return needs_power_save;
}

/*
 * Node feature update hook.
 * IN active_features - active features being applied (not inspected)
 * IN node_bitmap - nodes being updated (not inspected)
 * Emits a debug trace only.
 * RET SLURM_SUCCESS always
 */
extern int node_features_p_node_update(char *active_features,
                                       bitstr_t *node_bitmap)
{
    int rc = SLURM_SUCCESS;

    debug("_node_features_p_node_update********************************************************************************");

    return rc;
}

/*
 * Reload configuration.
 * The plugin keeps no configuration, so this only emits a debug trace.
 * RET SLURM_SUCCESS always
 */
extern int node_features_p_reconfig(void)
{
    debug("_node_features_p_reconfig********************************************************************************");

    /* This used to return the bare literal 1, while every other int entry
     * point in this file reports success as SLURM_SUCCESS.
     * NOTE(review): assumes callers compare the result with SLURM_SUCCESS,
     * in which case 1 would read as a failure - confirm against the caller. */
    return SLURM_SUCCESS;
}

/*
 * Determine if the specified user can modify the currently available node
 * features.
 * IN uid - user making the request (not inspected)
 * RET true always: every user is allowed
 */
extern bool node_features_p_user_update(uid_t uid)
{
    const bool allowed = true;

    debug("_node_features_p_user_update********************************************************************************");

    return allowed;
}
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://lists.schedmd.com/pipermail/slurm-users/attachments/20180327/370d0fa8/attachment-0001.html>


More information about the slurm-users mailing list