server freezes when training


Mostapha Benhenda

Nov 7, 2015, 6:22:09 PM
to lasagne-users
Hello,

When I train a network on the server, it freezes right after it prints the '### Epoch ###' header. Ctrl+C does not even work.

I rebooted the server, but the problem remains.

My code is here:

# THEANO_FLAGS is read when theano is first imported, so it must be set
# before the import (setting it afterwards has no effect)
import os
os.environ['THEANO_FLAGS'] = 'floatX=float32,device=gpu0,nvcc.fastmath=True'

import matplotlib
matplotlib.use('Agg')  # non-interactive backend: no display on the server

import pickle
import time, sys
from collections import OrderedDict

import numpy as np
import theano
import theano.tensor as T

import lasagne
from lasagne.layers import InputLayer, DenseLayer, NonlinearityLayer, DropoutLayer
from lasagne.layers import Conv2DLayer as ConvLayer
from lasagne.layers import Pool2DLayer as PoolLayer
from lasagne.nonlinearities import softmax

from nolearn.lasagne import NeuralNet
from nolearn.lasagne import TrainSplit
from nolearn.lasagne import objective

from vgg16_solution import build_model
from load_data_solution import batch_gen


print 'Building model...'
#build net model using vgg16 architecture (changed to have 5 outputs rather than 1000)
#net_vgg16=build_model()

print 'Loading weights...'
#load pretrained weights for all layers before output layer
model = pickle.load(open('vgg16.pkl', 'rb'))  # pickle files should be opened in binary mode
pretrained_weights = model['param values'][:-2] # drop W, b for Imagenet 1000 class softmax
#lasagne.layers.set_all_param_values(net_vgg16['fc7'], pretrained_weights)

#store maximal number of instances 

#N_instances= 30 #maximal number of images in one bag. average number is 10.7, and median is 11.

#N_instances=2

print 'building the modified net...'

#My modified net (add dropout later)
net = OrderedDict()
net['input'] = InputLayer((None, 3, 224, 224))
net['conv1_1'] = ConvLayer(net['input'], 64, 3, pad=1)
net['conv1_2'] = ConvLayer(net['conv1_1'], 64, 3, pad=1)
net['pool1'] = PoolLayer(net['conv1_2'], 2)
net['conv2_1'] = ConvLayer(net['pool1'], 128, 3, pad=1)
net['conv2_2'] = ConvLayer(net['conv2_1'], 128, 3, pad=1)
net['pool2'] = PoolLayer(net['conv2_2'], 2)
net['conv3_1'] = ConvLayer(net['pool2'], 256, 3, pad=1)
net['conv3_2'] = ConvLayer(net['conv3_1'], 256, 3, pad=1)
net['conv3_3'] = ConvLayer(net['conv3_2'], 256, 3, pad=1)
net['pool3'] = PoolLayer(net['conv3_3'], 2)
net['conv4_1'] = ConvLayer(net['pool3'], 512, 3, pad=1)
net['conv4_2'] = ConvLayer(net['conv4_1'], 512, 3, pad=1)
net['conv4_3'] = ConvLayer(net['conv4_2'], 512, 3, pad=1)
net['pool4'] = PoolLayer(net['conv4_3'], 2)
net['conv5_1'] = ConvLayer(net['pool4'], 512, 3, pad=1)
net['conv5_2'] = ConvLayer(net['conv5_1'], 512, 3, pad=1)
net['conv5_3'] = ConvLayer(net['conv5_2'], 512, 3, pad=1)
net['pool5'] = PoolLayer(net['conv5_3'], 2)
net['fc6'] = DenseLayer(net['pool5'], num_units=4096)
net['fc7'] = DenseLayer(net['fc6'], num_units=4096)
 
#Merge the upper layers of each instance by taking the maximum (refinement: replace the maximum with a maximum over a random subset, i.e. dropout on instances)
#net['fc_merged']  =  lasagne.layers.ElemwiseMergeLayer( [net['fc7'+ str(n)] for n in range(N_instances) ],  theano.tensor.maximum)

#Add a dense output layer for the class labels (e.g. 'plants', 'window', 'grass', 'lake'), followed by a softmax

net['fc8'] = DenseLayer(net['fc7'], num_units=10, nonlinearity=None)
net['prob'] = NonlinearityLayer(net['fc8'], softmax)


print 'modified net built!'
#end of modified net


print 'loading pretrained weights into the modified net...'

# initialise all layers up to fc7 with the pretrained VGG16 weights
#for n in range(N_instances):
lasagne.layers.set_all_param_values(net['fc7'], pretrained_weights)

#print 'pretrained weights loaded!'



#define tensor variables for the network input and the labels
#X = {'input' + str(n): T.tensor4('x') for n in range(N_instances)}

X = T.tensor4('x')
y = T.imatrix('y')

#get the output of the network in terms of the tensor X
output_layer = net['prob']
output = lasagne.layers.get_output(output_layer, X)

#get the output with dropout switched off (for evaluation)
output_deterministic = lasagne.layers.get_output(output_layer, X, deterministic=True)

#prediction
pred = T.argmax(output_deterministic, axis=1)
accuracy = T.mean(T.eq(pred, T.argmax(y, axis=1)), dtype=theano.config.floatX)

# Create a loss expression for training, i.e., a scalar objective we want
# to minimize (for our multi-class problem, it is the cross-entropy loss):

train_loss = lasagne.objectives.categorical_crossentropy(output, y)
train_loss = lasagne.objectives.aggregate(train_loss, mode='mean')

# Create update expressions for training, i.e., how to modify the
# parameters at each training step. Here, we'll use Stochastic Gradient
# Descent (SGD) with Nesterov momentum, but Lasagne offers plenty more.
params = lasagne.layers.get_all_params(output_layer, trainable=True)
updates = lasagne.updates.nesterov_momentum(train_loss, params, learning_rate=0.001, momentum=0.9)


t0=time.time()
print 'Compiling train function...',
sys.stdout.flush()
# Compile a function performing a training step on a mini-batch (by giving
# the updates dictionary) and returning the corresponding training loss:
iter_train = theano.function([X,y],
                             [train_loss,accuracy],
                             updates=updates
                             )

print 'Done [{} s]'.format(time.time()-t0)

#loop through the data and train the network
n_epochs = 10
batch_size = 32
for epoch in range(n_epochs):
    print '#########'
    print ' Epoch ', epoch
    print '#########'
    sys.stdout.flush()  # show the header even if the next step blocks

    batches = batch_gen('train.txt', batch_size)
    for i, batch in enumerate(batches):
        x_i, y_i = batch
        loss, acc = iter_train(x_i, y_i)
        print 'Iteration {0}, train loss: {1}, train accuracy: {2}'.format(i, loss, acc)
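
Since the hang happens right after the epoch header is printed, the first suspect is either batch_gen (for instance, if it loads and preprocesses every image in train.txt up front) or the very first GPU call. A minimal sketch that separates the two, assuming the network expects float32 batches of shape (batch_size, 3, 224, 224) and one-hot int32 labels (x_fake and y_fake are made up purely for this test):

# isolation test: feed a synthetic batch straight to iter_train,
# bypassing batch_gen entirely
x_fake = np.random.rand(batch_size, 3, 224, 224).astype(np.float32)
y_fake = np.zeros((batch_size, 10), dtype=np.int32)
y_fake[:, 0] = 1  # arbitrary one-hot labels
print 'calling iter_train on synthetic data...'
sys.stdout.flush()
print iter_train(x_fake, y_fake)

If the synthetic batch trains fine, the problem is in the data loading; if this hangs too, it is on the Theano/CUDA side.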

Sander Dieleman

Nov 8, 2015, 11:15:16 AM
to lasagne-users
That sounds bad! I think it's probably a problem with your setup, rather than with Lasagne specifically. Have you tried running the Theano test suite? Or any other program that uses CUDA? It might even be a hardware issue.
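
For example, a variant of the GPU check from the Theano documentation (a minimal sketch; the vector size is arbitrary) exercises CUDA without any Lasagne code. If this also hangs, the problem is in Theano/CUDA/the driver rather than in your network:

# run with: THEANO_FLAGS=floatX=float32,device=gpu0 python gpu_check.py
import numpy
import theano
import theano.tensor as T

x = theano.shared(numpy.random.rand(100000).astype(theano.config.floatX))
f = theano.function([], T.exp(x))
print f()[:5]
if any(node.op.__class__.__name__.lower().startswith('gpu')
       for node in f.maker.fgraph.toposort()):
    print 'used the GPU'
else:
    print 'used the CPU'

The full Theano test suite can be run with python -c "import theano; theano.test()" (it needs nose installed and takes quite a while).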

Sander

prafulag...@gmail.com

Jun 12, 2018, 6:07:27 AM
to lasagne-users
I am facing the same issue; please let me know if you found anything.

Thanks