<div dir="ltr">I posted this yesterday and this does appear to be related to a specific job. Note this error: "gres/gpu: count changed for node node002 from 0 to 1" Could it be misleading? What could cause the node to drain? Here are the contents of the user's SBATCH file. Could the piping be having an effect here?<div><br><div>#!/bin/sh<br>#SBATCH -N 1<br>#SBATCH -n 1<br>#SBATCH --mail-type=ALL<br>#SBATCH --gres=gpu:1<br>#SBATCH --job-name=$1sequentialBlur_squeezenet_training_imagewoof_crossval<br>module purge<br>module load gcc5 cuda10.0<br>module load openmpi/cuda/64<br>module load pytorch-py36-cuda10.1-gcc/1.3.1<br>module load ml-pythondeps-py36-cuda10.1-gcc/3.0.0<br>python3.6 SequentialBlur_untrained.py squeezenet 100 imagewoof $1 | tee squeeze_100_imwoof_seq_longtrain_cv_$1.txt<br><br>Here are the script contents:<br><br># Banks 1978 paper:<br># 1 month:  2.4 cyc/deg<br># 2 month:  2.8 cyc/deg<br># 3 month:  4 cyc/deg<br># 224 pixels:<br># 20 deg -> 11 pix in deg;  4.6 pix blur;  4 pix blur;  2.8 pix blur<br># 4 deg -> 56 pix in deg; 23 pix blur (1 mo); 20 pix blur (2 mo); 14 pix blur (3 mo)<br><br>import torch<br>import torchvision<br>import torchvision.transforms as transforms<br>from torchvision import models<br>import torchvision.datasets<br>import matplotlib.pyplot as plt<br>import numpy as np<br>import matplotlib.pyplot as plt<br>import numpy as np<br>import torch.nn as nn<br>import torch.nn.functional as F<br>import torch.optim as optim<br>import os<br>import sys<br>import scipy<br>from torch.utils.data.sampler import SubsetRandomSampler<br>import h5py<br><br>args = sys.argv<br>modelType = args[1] # 'alexnet', 'squeezenet', 'vgg16'<br>numEpochs = args[2] # int<br>image_set = str(args[3]) # 'imagewoof', 'imagenette'<br>block_call = args[4] # int {0:4}<br><br># Example call:<br># python3 alexnet 100 imagenette 1<br><br>def get_train_valid_loader(data_dir,block,augment=0,random_seed=69420,valid_size=0.2,shuffle=False,<br>
         show_sample=False,num_workers=4, pin_memory=False, batch_size=128):<br>        # valid_size gotta be in [0,1]<br>        # block must be an int between 0:(1/valid_size) (0:4 for valid_size==0.2)<br>        transform = transforms.Compose([<br>                transforms.Resize(256),<br>                transforms.CenterCrop(224),<br>                transforms.ToTensor(),<br>                transforms.Normalize(<br>                mean=[0.485, 0.456, 0.406],<br>                std=[0.229, 0.224, 0.225]<br>        )])<br>        train_dataset = torchvision.datasets.ImageFolder(root=data_dir,transform=transform)<br>        valid_dataset = torchvision.datasets.ImageFolder(root=data_dir,transform=transform)<br>        num_train = len(train_dataset)<br>        indices = list(range(num_train))<br>        split = int(np.floor(valid_size * num_train))<br>        split1 = int(np.floor(block*split))<br>        split2 = int(np.floor((block+1)*split))<br>        # if shuffle:<br>        np.random.seed(100)<br>        np.random.shuffle(indices)<br>        valid_idx = indices[split1:split2]<br>        train_idx = np.append(indices[:split1],indices[split2:])<br>        train_idx = train_idx.astype('int32')<br>        if block != 0:<br>                for b in range(block):<br>                        indices = [indices[(i + split) % len(indices)] for i, x in enumerate(indices)]<br>        # train_idx, valid_idx = indices[split:], indices[:split]<br>        train_sampler = SubsetRandomSampler(train_idx)<br>        # train_sampler = torch.utils.data.Subset(dataset, indices)<br>        valid_sampler = SubsetRandomSampler(valid_idx)<br>        train_loader = torch.utils.data.DataLoader(<br>                train_dataset, sampler=train_sampler, batch_size=batch_size,<br>                num_workers=num_workers, pin_memory=pin_memory,<br>        )<br>        valid_loader = torch.utils.data.DataLoader(<br>                valid_dataset, sampler=valid_sampler, batch_size=batch_size,<br> 
               num_workers=num_workers, pin_memory=pin_memory,<br>        )<br>        return (train_loader, valid_loader)<br><br>transform = transforms.Compose([<br>        transforms.Resize(256),<br>        transforms.CenterCrop(224),<br>        transforms.ToTensor(),<br>        transforms.Normalize(<br>        mean=[0.485, 0.456, 0.406],<br>        std=[0.229, 0.224, 0.225]<br> )])<br><br><br>blurTypes = ['gaussian']<br><br>data_dir = "/path/to/dir/" + image_set + "-320_blur/"<br><br><br>classes = []<br>for directory, subdirectories, files in os.walk(data_dir):<br>        for file in files:<br>                if directory.split("\\")[-1] not in classes:<br>                        classes.append(directory.split("\\")[-1])<br><br>criterion = nn.CrossEntropyLoss()<br>device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")<br>def train():<br><br>        for epoch in range(int(numEpochs)):<br>                prev_loss = 100000.0<br>                running_loss = 0.0<br>                for i, data in enumerate(trainloader, 0):<br>                        # get the inputs; data is a list of [inputs, labels]<br>                        inputs, labels = data<br>                        inputs = inputs.to(device)<br>                        labels = labels.to(device)<br><br>                        # zero the parameter gradients<br>                        optimizer.zero_grad()<br><br>                        # forward + backward + optimize<br>                        outputs = net(inputs)<br>                        loss = criterion(outputs, labels)<br>                        loss.backward()<br>                        optimizer.step()<br><br>                        running_loss += loss.item()<br><br>                if epoch % 10 == 9:<br>                        print('[%d, %5d] loss: %.3f' %<br>                                (epoch + 1, i + 1, running_loss / 100))<br><br>allAccs = []<br>for blurType in 
blurTypes: # multiple types of blur<br>        print(blurType)<br>        print('-' * 10)<br>        # for block in range(5):<br>        block = int(block_call)<br>        print("\nFOLD " + str(block+1) + ":")<br>        for i in range(5):<br>                if i == 0:<br>                        blurLevels = [23, 11, 5, 3, 1]<br>                elif i == 1:<br>                        blurLevels = [11, 5, 3, 1]<br>                elif i == 2:<br>                        blurLevels = [5, 3, 1]<br>                elif i == 3:<br>                        blurLevels = [3, 1]<br>                elif i == 4:<br>                        blurLevels = [1]<br><br>                if modelType == 'vgg16':<br>                        net = torchvision.models.vgg16(pretrained=False)<br>                        num_ftrs = net.classifier[6].in_features<br>                        net.classifier[6] = nn.Linear(num_ftrs, len(classes))<br>                elif modelType == 'alexnet':<br>                        net = torchvision.models.alexnet(pretrained=False)<br>                        num_ftrs = net.classifier[6].in_features<br>                        net.classifier[6] = nn.Linear(num_ftrs, len(classes))<br>                else:<br>                        net = torchvision.models.squeezenet1_1(pretrained=False)<br>                        net.classifier[1] = nn.Conv2d(512, len(classes), kernel_size=(1, 1), stride=(1, 1))<br>                        net.num_classes = len(classes)<br>                optimizer = optim.SGD(net.parameters(), lr=0.001, momentum=0.9)<br>                net = net.to(device)<br>                for i in range(len(blurLevels)): #5 levels of blur: 1, 3, 5, 11, 23<br>                        mult = blurLevels[i]<br><br>                        trainloader, validloader = get_train_valid_loader(data_dir=data_dir + blurType + '/' + image_set +<br>                                '-320_' + str(mult) + '/train',<br>                                
block=block,shuffle=False,num_workers=0,batch_size=128)<br>                        print('Start training on blur window of ' + str(mult))<br>                        train()<br>                        print('Finished Training on ' + blurType + ' with blur window of ' + str(mult))<br><br>                accs = []<br>                permBlurLevels = [23, 11, 5, 3, 1]<br>                for j in range(len(permBlurLevels)):<br>                        tempMult = permBlurLevels[j]<br>                        correct = 0<br>                        total = 0<br>                        # newTestSet = torchvision.datasets.ImageFolder(root=data_dir + blurType + '/' + image_set + '-320_' +<br>                        #       str(tempMult) + '/val',<br>                        #       transform=transform)<br>                        # newTestLoader = torch.utils.data.DataLoader(newTestSet, batch_size=128,<br>                        #       shuffle=True, num_workers=0)<br>                        t2, validloader2 = get_train_valid_loader(data_dir=data_dir + blurType + '/' + image_set +<br>                                '-320_' + str(mult) + '/train',<br>                                block=block,shuffle=False,num_workers=0,batch_size=128)<br><br>                        with torch.no_grad():<br>                                for data in validloader2:<br>                                        images, labels = data<br>                                        images = images.to(device)<br>                                        labels = labels.to(device)<br>                                        outputs = net(images)<br>                                        _, predicted = torch.max(outputs.data, 1)<br>                                        total += labels.size(0)<br>                                        correct += (predicted == labels).sum().item()<br>                                        acc = 100 * correct / 
total<br>                        print('Accuracy: %f %%' % (acc))<br>                        accs.append(acc)<br>                allAccs.append(accs)<br><br><br><div class="gmail_quote"><div dir="ltr" class="gmail_attr">---------- Forwarded message ---------<br>From: <strong class="gmail_sendername" dir="auto">Robert Kudyba</strong> <span dir="auto"><<a href="mailto:rkudyba@fordham.edu">rkudyba@fordham.edu</a>></span><br>Date: Fri, Mar 13, 2020 at 11:36 AM<br>Subject: gres/gpu: count changed for node node002 from 0 to 1<br>To: Slurm User Community List <<a href="mailto:slurm-users@lists.schedmd.com">slurm-users@lists.schedmd.com</a>><br></div><br><br><div dir="ltr"><div>We're running slurm-17.11.12 on Bright Cluster 8.1 and our node002 keeps going into a draining state:</div><div> sinfo -a<br>PARTITION AVAIL  TIMELIMIT  NODES  STATE NODELIST<br>defq*        up   infinite      1   drng node002<br></div><div><br></div><div><font face="monospace">sinfo -N -o "%.20N %.15C %.10t %.10m %.15P %.15G %.35E"<br>            NODELIST   CPUS(A/I/O/T)      STATE     MEMORY       PARTITION            GRES                              REASON<br>             node001       9/15/0/24        mix     191800           defq*           gpu:1                                none<br>             node002       1/0/23/24       drng     191800           defq*           gpu:1 gres/gpu count changed and jobs are<br>             node003       1/23/0/24        mix     191800           defq*           gpu:1                                none<br></font></div><div><br></div><div>None of the nodes has a separate slurm.conf file, it's all shared from the head node. 
What else could be causing this?</div><div>[2020-03-13T08:54:02.269] gres/gpu: count changed for node node002 from 0 to 1<br>[2020-03-13T08:54:02.269] error: Setting node node002 state to DRAIN<br>[2020-03-13T08:54:02.269] drain_nodes: node node002 state set to DRAIN<br>[2020-03-13T08:54:02.269] error: _slurm_rpc_node_registration node=node002: Invalid argument<br></div></div>
</div></div></div></div>