server freezes when training


Mostapha Benhenda

Nov 7, 2015, 6:22:09 PM
to lasagne-users
Hello,

When I train a network on the server, it freezes right after it prints the '### Epoch ###' header. Ctrl+C does not even work.

I rebooted the server, but the problem remains.

My code is here:

# THEANO_FLAGS is read when theano is first imported, so it must be set
# before the import (setting it afterwards has no effect)
import os
os.environ['THEANO_FLAGS'] = 'floatX=float32,device=gpu0,nvcc.fastmath=True'

import matplotlib
matplotlib.use('Agg')  # non-interactive backend: no display on the server

import pickle
import time, sys
from collections import OrderedDict

import numpy as np
import theano
import theano.tensor as T

import lasagne
from lasagne.layers import InputLayer, DenseLayer, NonlinearityLayer, DropoutLayer
from lasagne.layers import Conv2DLayer as ConvLayer
from lasagne.layers import Pool2DLayer as PoolLayer
from lasagne.nonlinearities import softmax

from nolearn.lasagne import NeuralNet
from nolearn.lasagne import TrainSplit
from nolearn.lasagne import objective

from vgg16_solution import build_model
from load_data_solution import batch_gen


print 'Building model...'
#build net model using vgg16 architecture (changed to have 5 outputs rather than 1000)
#net_vgg16=build_model()

print 'Loading weights...'
#load pretrained weights for all layers before output layer
model = pickle.load(open('vgg16.pkl', 'rb'))  # pickle files should be opened in binary mode
pretrained_weights = model['param values'][:-2] # drop W, b for Imagenet 1000 class softmax
#lasagne.layers.set_all_param_values(net_vgg16['fc7'], pretrained_weights)

#store maximal number of instances 

#N_instances= 30 #maximal number of images in one bag. average number is 10.7, and median is 11.

#N_instances=2

print 'building the modified net...'

#My modified net (add dropout later)
net = OrderedDict()
net['input'] = InputLayer((None, 3, 224, 224))
net['conv1_1'] = ConvLayer(net['input'], 64, 3, pad=1)
net['conv1_2'] = ConvLayer(net['conv1_1'], 64, 3, pad=1)
net['pool1'] = PoolLayer(net['conv1_2'], 2)
net['conv2_1'] = ConvLayer(net['pool1'], 128, 3, pad=1)
net['conv2_2'] = ConvLayer(net['conv2_1'], 128, 3, pad=1)
net['pool2'] = PoolLayer(net['conv2_2'], 2)
net['conv3_1'] = ConvLayer(net['pool2'], 256, 3, pad=1)
net['conv3_2'] = ConvLayer(net['conv3_1'], 256, 3, pad=1)
net['conv3_3'] = ConvLayer(net['conv3_2'], 256, 3, pad=1)
net['pool3'] = PoolLayer(net['conv3_3'], 2)
net['conv4_1'] = ConvLayer(net['pool3'], 512, 3, pad=1)
net['conv4_2'] = ConvLayer(net['conv4_1'], 512, 3, pad=1)
net['conv4_3'] = ConvLayer(net['conv4_2'], 512, 3, pad=1)
net['pool4'] = PoolLayer(net['conv4_3'], 2)
net['conv5_1'] = ConvLayer(net['pool4'], 512, 3, pad=1)
net['conv5_2'] = ConvLayer(net['conv5_1'], 512, 3, pad=1)
net['conv5_3'] = ConvLayer(net['conv5_2'], 512, 3, pad=1)
net['pool5'] = PoolLayer(net['conv5_3'], 2)
net['fc6'] = DenseLayer(net['pool5'], num_units=4096)
net['fc7'] = DenseLayer(net['fc6'], num_units=4096)
 
#Merge the upper layers of each instance by taking the maximum (refinement: replace the maximum with a maximum over a random subset, i.e. dropout on instances)
#net['fc_merged']  =  lasagne.layers.ElemwiseMergeLayer( [net['fc7'+ str(n)] for n in range(N_instances) ],  theano.tensor.maximum)

#Add a dense output layer for the class labels (e.g. 'plants', 'window', 'grass', 'lake'), followed by a softmax

net['fc8'] = DenseLayer(net['fc7'], num_units=10, nonlinearity=None)
net['prob'] = NonlinearityLayer(net['fc8'], softmax)


print 'modified net built!'
#end of modified net


print 'loading pretrained weights into the modified net...'

# initialise all layers up to fc7 with the pretrained VGG16 weights
#for n in range(N_instances):
lasagne.layers.set_all_param_values(net['fc7'], pretrained_weights)

#print 'pretrained weights loaded!'



#define tensor variables for the network input and the labels
#X = {'input' + str(n): T.tensor4('x') for n in range(N_instances)}

X = T.tensor4('x')
y = T.imatrix('y')

#get the output of the network in terms of the tensor X
output_layer = net['prob']
output = lasagne.layers.get_output(output_layer, X)

#get the output with dropout switched off (for evaluation)
output_deterministic = lasagne.layers.get_output(output_layer, X, deterministic=True)

#prediction
pred = T.argmax(output_deterministic, axis=1)
accuracy = T.mean(T.eq(pred, T.argmax(y, axis=1)), dtype=theano.config.floatX)

# Create a loss expression for training, i.e., a scalar objective we want
# to minimize (for our multi-class problem, it is the cross-entropy loss):

train_loss = lasagne.objectives.categorical_crossentropy(output, y)
train_loss = lasagne.objectives.aggregate(train_loss, mode='mean')

# Create update expressions for training, i.e., how to modify the
# parameters at each training step. Here, we'll use Stochastic Gradient
# Descent (SGD) with Nesterov momentum, but Lasagne offers plenty more.
params = lasagne.layers.get_all_params(output_layer, trainable=True)
updates = lasagne.updates.nesterov_momentum(train_loss, params, learning_rate=0.001, momentum=0.9)


t0=time.time()
print 'Compiling train function...',
sys.stdout.flush()
# Compile a function performing a training step on a mini-batch (by giving
# the updates dictionary) and returning the corresponding training loss:
iter_train = theano.function([X,y],
                             [train_loss,accuracy],
                             updates=updates
                             )

print 'Done [{} s]'.format(time.time()-t0)

#loop through the data and train the network
n_epochs = 10
batch_size = 32
for epoch in range(n_epochs):
    print '#########'
    print ' Epoch ', epoch
    print '#########'
    sys.stdout.flush()  # show the header even if the next step blocks

    batches = batch_gen('train.txt', batch_size)
    for i, batch in enumerate(batches):
        x_i, y_i = batch
        loss, acc = iter_train(x_i, y_i)
        print 'Iteration {0}, train loss: {1}, train accuracy: {2}'.format(i, loss, acc)
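
Since the hang happens right after the epoch header is printed, the first suspect is either batch_gen (for instance, if it loads and preprocesses every image in train.txt up front) or the very first GPU call. A minimal sketch that separates the two, assuming the network expects float32 batches of shape (batch_size, 3, 224, 224) and one-hot int32 labels (x_fake and y_fake are made up purely for this test):

# isolation test: feed a synthetic batch straight to iter_train,
# bypassing batch_gen entirely
x_fake = np.random.rand(batch_size, 3, 224, 224).astype(np.float32)
y_fake = np.zeros((batch_size, 10), dtype=np.int32)
y_fake[:, 0] = 1  # arbitrary one-hot labels
print 'calling iter_train on synthetic data...'
sys.stdout.flush()
print iter_train(x_fake, y_fake)

If the synthetic batch trains fine, the problem is in the data loading; if this hangs too, it is on the Theano/CUDA side.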

Sander Dieleman

Nov 8, 2015, 11:15:16 AM
to lasagne-users
That sounds bad! I think it's probably a problem with your setup, rather than with Lasagne specifically. Have you tried running the Theano test suite? Or any other program that uses CUDA? It might even be a hardware issue.
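
For example, a variant of the GPU check from the Theano documentation (a minimal sketch; the vector size is arbitrary) exercises CUDA without any Lasagne code. If this also hangs, the problem is in Theano/CUDA/the driver rather than in your network:

# run with: THEANO_FLAGS=floatX=float32,device=gpu0 python gpu_check.py
import numpy
import theano
import theano.tensor as T

x = theano.shared(numpy.random.rand(100000).astype(theano.config.floatX))
f = theano.function([], T.exp(x))
print f()[:5]
if any(node.op.__class__.__name__.lower().startswith('gpu')
       for node in f.maker.fgraph.toposort()):
    print 'used the GPU'
else:
    print 'used the CPU'

The full Theano test suite can be run with python -c "import theano; theano.test()" (it needs nose installed and takes quite a while).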

Sander

prafulag...@gmail.com

Jun 12, 2018, 6:07:27 AM
to lasagne-users
I am facing the same issue; please let me know if you found anything.

Thanks