Hi guys,
I get an error using cudnn for a CNN I made. Lasagne's use of cudnn is working for me in another CNN so it is specific to this file. I think this is the relevant part of the code but I also dumped the entire file. The error I am getting is CUDNN_STATUS_BAD_PARAM. I assume I am making an error.
Best,
Mike
def build_layers(self,input_var=None):
network = lasagne.layers.InputLayer(shape=(None, self.n_channels, 360,360),
input_var=input_var)
network = lasagne.layers.Conv2DLayer(
network, num_filters=64, filter_size=(4, 4),stride=1,
nonlinearity=lasagne.nonlinearities.rectify,
W=lasagne.init.GlorotUniform())
network = lasagne.layers.MaxPool2DLayer(network, pool_size=(2, 2))
network = lasagne.layers.dropout(network, p=.2)
network = lasagne.layers.DenseLayer(network,num_units=100,
nonlinearity=lasagne.nonlinearities.rectify)
network = lasagne.layers.dropout(network, p=.5)
network = lasagne.layers.DenseLayer(network,
num_units=1,nonlinearity=lasagne.nonlinearities.tanh)
return network
mike@mike-MS-7881:~/Desktop/CFD$ THEANO_FLAGS=floatX=float32,device=gpu0,exception_verbosity=high python CNN.py
Using gpu device 0: GeForce GTX TITAN X (CNMeM is disabled)
Building model and compiling functions...
------------------------------------------------------------
('Iteration', 1, 'Fold', 1)
------------------------------------------------------------
('X', (1235, 3, 360, 360), 'X_train', (988, 3, 360, 360))
('pictures:', 988, 'macro size', 98, 'micro_count', 19)
Traceback (most recent call last):
File "CNN.py", line 186, in <module>
main()
File "CNN.py", line 183, in main
test_real(X=faces_X1.astype(theano.config.floatX), Y=faces_Y.astype(theano.config.floatX), n_channels=3)
File "CNN.py", line 155, in test_real
model.fit()
File "CNN.py", line 132, in fit
a += self.train_fn(micro_index)
File "/home/mike/anaconda/lib/python2.7/site-packages/theano/compile/function_module.py", line 871, in __call__
storage_map=getattr(self.fn, 'storage_map', None))
File "/home/mike/anaconda/lib/python2.7/site-packages/theano/gof/link.py", line 314, in raise_with_op
reraise(exc_type, exc_value, exc_trace)
File "/home/mike/anaconda/lib/python2.7/site-packages/theano/compile/function_module.py", line 859, in __call__
outputs = self.fn()
RuntimeError: Could not set tensorNd descriptor: CUDNN_STATUS_BAD_PARAMdim=4
Apply node that caused the error: GpuDnnConv{algo='small', inplace=True}(GpuContiguous.0, GpuContiguous.0, GpuAllocEmpty.0, GpuDnnConvDesc{border_mode='valid', subsample=(1, 1), conv_mode='conv'}.0, Constant{1.0}, Constant{0.0})
Toposort index: 38
Inputs types: [CudaNdarrayType(float32, 4D), CudaNdarrayType(float32, 4D), CudaNdarrayType(float32, 4D), <theano.gof.type.CDataType object at 0x7fa8d6abe250>, Scalar(float32), Scalar(float32)]
Inputs shapes: [(0, 3, 360, 360), (64, 3, 4, 4), (0, 64, 357, 357), 'No shapes', (), ()]
Inputs strides: [(388800, 129600, 360, 1), (48, 16, 4, 1), (8156736, 127449, 357, 1), 'No strides', (), ()]
Inputs values: [<CudaNdarray object at 0x7fa8cd3a9770>, 'not shown', <CudaNdarray object at 0x7fa8cd3466b0>, <PyCObject object at 0x7fa8cd732530>, 1.0, 0.0]
Inputs name: ('image', 'kernel', 'output', 'descriptor', 'alpha', 'beta')
Outputs clients: [[GpuElemwise{Composite{(i0 * ((i1 + i2) + Abs((i1 + i2))))}}[(0, 1)](CudaNdarrayConstant{[[[[ 0.5]]]]}, GpuDnnConv{algo='small', inplace=True}.0, GpuDimShuffle{x,0,x,x}.0)]]
Debugprint of the apply node:
GpuDnnConv{algo='small', inplace=True} [@A] <CudaNdarrayType(float32, 4D)> ''
|GpuContiguous [@B] <CudaNdarrayType(float32, 4D)> ''
| |GpuSubtensor{int32:int32:} [@C] <CudaNdarrayType(float32, 4D)> ''
| |macro_batch_x [@D] <CudaNdarrayType(float32, 4D)>
| |ScalarFromTensor [@E] <int32> ''
| | |Elemwise{mul,no_inplace} [@F] <TensorType(int32, scalar)> ''
| | |TensorConstant{5} [@G] <TensorType(int32, scalar)>
| | |<TensorType(int32, scalar)> [@H] <TensorType(int32, scalar)>
| |ScalarFromTensor [@I] <int32> ''
| |Elemwise{Composite{(i0 * (i1 + i2))}} [@J] <TensorType(int32, scalar)> ''
| |TensorConstant{5} [@G] <TensorType(int32, scalar)>
| |TensorConstant{1} [@K] <TensorType(int32, scalar)>
| |<TensorType(int32, scalar)> [@H] <TensorType(int32, scalar)>
|GpuContiguous [@L] <CudaNdarrayType(float32, 4D)> ''
| |W [@M] <CudaNdarrayType(float32, 4D)>
|GpuAllocEmpty [@N] <CudaNdarrayType(float32, 4D)> ''
| |Elemwise{Composite{(Composite{Switch(LT(i0, i1), i1, i0)}(Composite{Switch(GE(i0, i1), i1, i0)}(Composite{Switch(LT(i0, i1), i2, i0)}(Composite{Switch(LT(i0, i1), (i0 + i2), i0)}(i0, i1, i2), i1, i3), i2), i1) - Switch(LT(Composite{Switch(LT(i0, i1), i1, i0)}(Composite{Switch(GE(i0, i1), i1, i0)}(Composite{Switch(LT(i0, i1), i1, i0)}(Composite{Switch(LT(i0, i1), (i0 + i2), i0)}(i4, i1, i2), i1), i2), i1), Composite{Switch(LT(i0, i1), i1, i0)}(Composite{Switch(GE(i0, i1), i1, i0)}(Composite{Switch(LT(i0, i1), i2, i0)}(Composite{Switch(LT(i0, i1), (i0 + i2), i0)}(i0, i1, i2), i1, i3), i2), i1)), Composite{Switch(LT(i0, i1), i1, i0)}(Composite{Switch(GE(i0, i1), i1, i0)}(Composite{Switch(LT(i0, i1), i1, i0)}(Composite{Switch(LT(i0, i1), (i0 + i2), i0)}(i4, i1, i2), i1), i2), i1), Composite{Switch(LT(i0, i1), i1, i0)}(Composite{Switch(GE(i0, i1), i1, i0)}(Composite{Switch(LT(i0, i1), i2, i0)}(Composite{Switch(LT(i0, i1), (i0 + i2), i0)}(i0, i1, i2), i1, i3), i2), i1)))}}[(0, 2)] [@O] <TensorType(int64, scalar)> ''
| | |Elemwise{Composite{(i0 * (i1 + i2))}} [@J] <TensorType(int32, scalar)> ''
| | |TensorConstant{0} [@P] <TensorType(int8, scalar)>
| | |Shape_i{0} [@Q] <TensorType(int64, scalar)> ''
| | | |macro_batch_x [@D] <CudaNdarrayType(float32, 4D)>
| | |TensorConstant{-1} [@R] <TensorType(int8, scalar)>
| | |Elemwise{mul,no_inplace} [@F] <TensorType(int32, scalar)> ''
| |Shape_i{0} [@S] <TensorType(int64, scalar)> ''
| | |W [@M] <CudaNdarrayType(float32, 4D)>
| |Elemwise{Composite{((((i0 + i1) - i2) // i3) + i3)}}[(0, 0)] [@T] <TensorType(int64, scalar)> ''
| | |Shape_i{2} [@U] <TensorType(int64, scalar)> ''
| | | |macro_batch_x [@D] <CudaNdarrayType(float32, 4D)>
| | |TensorConstant{0} [@P] <TensorType(int8, scalar)>
| | |Shape_i{2} [@V] <TensorType(int64, scalar)> ''
| | | |W [@M] <CudaNdarrayType(float32, 4D)>
| | |TensorConstant{1} [@W] <TensorType(int8, scalar)>
| |Elemwise{Composite{((((i0 + i1) - i2) // i3) + i3)}}[(0, 0)] [@X] <TensorType(int64, scalar)> ''
| |Shape_i{3} [@Y] <TensorType(int64, scalar)> ''
| | |macro_batch_x [@D] <CudaNdarrayType(float32, 4D)>
| |TensorConstant{0} [@P] <TensorType(int8, scalar)>
| |Shape_i{3} [@Z] <TensorType(int64, scalar)> ''
| | |W [@M] <CudaNdarrayType(float32, 4D)>
| |TensorConstant{1} [@W] <TensorType(int8, scalar)>
|GpuDnnConvDesc{border_mode='valid', subsample=(1, 1), conv_mode='conv'} [@BA] <CDataType{cudnnConvolutionDescriptor_t}> ''
| |MakeVector{dtype='int64'} [@BB] <TensorType(int64, vector)> ''
| | |Elemwise{Composite{(Composite{Switch(LT(i0, i1), i1, i0)}(Composite{Switch(GE(i0, i1), i1, i0)}(Composite{Switch(LT(i0, i1), i2, i0)}(Composite{Switch(LT(i0, i1), (i0 + i2), i0)}(i0, i1, i2), i1, i3), i2), i1) - Switch(LT(Composite{Switch(LT(i0, i1), i1, i0)}(Composite{Switch(GE(i0, i1), i1, i0)}(Composite{Switch(LT(i0, i1), i1, i0)}(Composite{Switch(LT(i0, i1), (i0 + i2), i0)}(i4, i1, i2), i1), i2), i1), Composite{Switch(LT(i0, i1), i1, i0)}(Composite{Switch(GE(i0, i1), i1, i0)}(Composite{Switch(LT(i0, i1), i2, i0)}(Composite{Switch(LT(i0, i1), (i0 + i2), i0)}(i0, i1, i2), i1, i3), i2), i1)), Composite{Switch(LT(i0, i1), i1, i0)}(Composite{Switch(GE(i0, i1), i1, i0)}(Composite{Switch(LT(i0, i1), i1, i0)}(Composite{Switch(LT(i0, i1), (i0 + i2), i0)}(i4, i1, i2), i1), i2), i1), Composite{Switch(LT(i0, i1), i1, i0)}(Composite{Switch(GE(i0, i1), i1, i0)}(Composite{Switch(LT(i0, i1), i2, i0)}(Composite{Switch(LT(i0, i1), (i0 + i2), i0)}(i0, i1, i2), i1, i3), i2), i1)))}}[(0, 2)] [@O] <TensorType(int64, scalar)> ''
| | |Shape_i{1} [@BC] <TensorType(int64, scalar)> ''
| | | |macro_batch_x [@D] <CudaNdarrayType(float32, 4D)>
| | |Shape_i{2} [@U] <TensorType(int64, scalar)> ''
| | |Shape_i{3} [@Y] <TensorType(int64, scalar)> ''
| |MakeVector{dtype='int64'} [@BD] <TensorType(int64, vector)> ''
| |Shape_i{0} [@S] <TensorType(int64, scalar)> ''
| |Shape_i{1} [@BE] <TensorType(int64, scalar)> ''
| | |W [@M] <CudaNdarrayType(float32, 4D)>
| |Shape_i{2} [@V] <TensorType(int64, scalar)> ''
| |Shape_i{3} [@Z] <TensorType(int64, scalar)> ''
|Constant{1.0} [@BF] <float32>
|Constant{0.0} [@BG] <float32>
Storage map footprint:
- W, Shared Input, Shape: (2027776, 100), ElemSize: 4 Byte(s), TotalSize: 811110400 Byte(s)
- macro_batch_x, Shared Input, Shape: (10, 3, 360, 360), ElemSize: 4 Byte(s), TotalSize: 15552000 Byte(s)
- <CudaNdarrayType(float32, vector)>, Shared Input, Shape: (92160,), ElemSize: 4 Byte(s), TotalSize: 368640 Byte(s)
- <CudaNdarrayType(float32, vector)>, Shared Input, Shape: (92160,), ElemSize: 4 Byte(s), TotalSize: 368640 Byte(s)
- W, Shared Input, Shape: (64, 3, 4, 4), ElemSize: 4 Byte(s), TotalSize: 12288 Byte(s)
- GpuContiguous.0, Shape: (64, 3, 4, 4), ElemSize: 4 Byte(s), TotalSize: 12288 Byte(s)
- b, Shared Input, Shape: (100,), ElemSize: 4 Byte(s), TotalSize: 400 Byte(s)
- W, Shared Input, Shape: (100, 1), ElemSize: 4 Byte(s), TotalSize: 400 Byte(s)
- b, Shared Input, Shape: (64,), ElemSize: 4 Byte(s), TotalSize: 256 Byte(s)
- macro_batch_y, Shared Input, Shape: (10,), ElemSize: 4 Byte(s), TotalSize: 40 Byte(s)
- Elemwise{Composite{(Composite{Switch(LT(i0, i1), i1, i0)}(Composite{Switch(GE(i0, i1), i1, i0)}(Composite{Switch(LT(i0, i1), i2, i0)}(Composite{Switch(LT(i0, i1), (i0 + i2), i0)}(i0, i1, i2), i1, i3), i2), i1) - Switch(LT(Composite{Switch(LT(i0, i1), i1, i0)}(Composite{Switch(GE(i0, i1), i1, i0)}(Composite{Switch(LT(i0, i1), i1, i0)}(Composite{Switch(LT(i0, i1), (i0 + i2), i0)}(i4, i1, i2), i1), i2), i1), Composite{Switch(LT(i0, i1), i1, i0)}(Composite{Switch(GE(i0, i1), i1, i0)}(Composite{Switch(LT(i0, i1), i2, i0)}(Composite{Switch(LT(i0, i1), (i0 + i2), i0)}(i0, i1, i2), i1, i3), i2), i1)), Composite{Switch(LT(i0, i1), i1, i0)}(Composite{Switch(GE(i0, i1), i1, i0)}(Composite{Switch(LT(i0, i1), i1, i0)}(Composite{Switch(LT(i0, i1), (i0 + i2), i0)}(i4, i1, i2), i1), i2), i1), Composite{Switch(LT(i0, i1), i1, i0)}(Composite{Switch(GE(i0, i1), i1, i0)}(Composite{Switch(LT(i0, i1), i2, i0)}(Composite{Switch(LT(i0, i1), (i0 + i2), i0)}(i0, i1, i2), i1, i3), i2), i1)))}}[(0, 2)].0, Shape: (), ElemSize: 8 Byte(s), TotalSize: 8.0 Byte(s)
- TensorConstant{178}, Shape: (), ElemSize: 8 Byte(s), TotalSize: 8.0 Byte(s)
- TensorConstant{64}, Shape: (), ElemSize: 8 Byte(s), TotalSize: 8.0 Byte(s)
- ScalarFromTensor.0, Shape: (), ElemSize: 4 Byte(s), TotalSize: 4.0 Byte(s)
- CudaNdarrayConstant{[[[[ 1.25]]]]}, Shape: (1, 1, 1, 1), ElemSize: 4 Byte(s), TotalSize: 4 Byte(s)
- Constant{1.0}, Shape: (), ElemSize: 4 Byte(s), TotalSize: 4.0 Byte(s)
- CudaNdarrayConstant{[[[[ 0.80000001]]]]}, Shape: (1, 1, 1, 1), ElemSize: 4 Byte(s), TotalSize: 4 Byte(s)
- <TensorType(int32, scalar)>, Input, Shape: (), ElemSize: 4 Byte(s), TotalSize: 4.0 Byte(s)
- CudaNdarrayConstant{[[ 0.5]]}, Shape: (1, 1), ElemSize: 4 Byte(s), TotalSize: 4 Byte(s)
- b, Shared Input, Shape: (1,), ElemSize: 4 Byte(s), TotalSize: 4 Byte(s)
- Constant{0.0}, Shape: (), ElemSize: 4 Byte(s), TotalSize: 4.0 Byte(s)
- TensorConstant{1}, Shape: (), ElemSize: 4 Byte(s), TotalSize: 4.0 Byte(s)
- ScalarFromTensor.0, Shape: (), ElemSize: 4 Byte(s), TotalSize: 4.0 Byte(s)
- CudaNdarrayConstant{[[[[ 0.5]]]]}, Shape: (1, 1, 1, 1), ElemSize: 4 Byte(s), TotalSize: 4 Byte(s)
- TensorConstant{5}, Shape: (), ElemSize: 4 Byte(s), TotalSize: 4.0 Byte(s)
- TensorConstant{0}, Shape: (), ElemSize: 1 Byte(s), TotalSize: 1.0 Byte(s)
- TensorConstant{-1}, Shape: (), ElemSize: 1 Byte(s), TotalSize: 1.0 Byte(s)
- TensorConstant{1}, Shape: (), ElemSize: 1 Byte(s), TotalSize: 1.0 Byte(s)
- GpuContiguous.0, Shape: (0, 3, 360, 360), ElemSize: 4 Byte(s), TotalSize: 0 Byte(s)
- GpuAllocEmpty.0, Shape: (0, 64, 357, 357), ElemSize: 4 Byte(s), TotalSize: 0 Byte(s)
TotalSize: 827413139.0 Byte(s) 0.771 GB
TotalSize inputs: 827413123.0 Byte(s) 0.771 GB
class CNN(object):
def __init__(self,macro_splits=10,micro_splits=5,X=None,Y=None, n_channels=3):
self.n_channels = int(n_channels)
self.macro_splits = int(macro_splits)
self.micro_splits = int(micro_splits)
self.X=X
self.Y=Y
self.build_cnn()
def build_cnn(self):
print("Building model and compiling functions...")
x = T.ftensor4('x')
y = T.fvector('y')
i = T.iscalar()
self.network = self.build_layers(x)
self.init = write_model_data(self.network, 'init')
prediction = lasagne.layers.get_output(self.network)
prediction = T.flatten(prediction)
if y.ndim != prediction.ndim:
raise TypeError('wrong prediction dim', ('y', y.type, 'prediction', prediction))
loss = lasagne.objectives.squared_error(prediction, y).mean()
params = lasagne.layers.get_all_params(self.network, trainable=True)
updates = lasagne.updates.adadelta(loss, params)
test_prediction = lasagne.layers.get_output(self.network, deterministic=True)
test_loss = lasagne.objectives.squared_error(test_prediction, y).mean()
self.macro_batch_x = shared(np.empty((self.macro_splits,)+self.X.shape[1:],dtype=(config.floatX)), name='macro_batch_x', borrow=True)
self.macro_batch_y = shared(np.empty((self.macro_splits,),dtype=(config.floatX)), name='macro_batch_y', borrow=True)
givenBoth = {
x: self.macro_batch_x[i*self.micro_splits : (i+1)*self.micro_splits],
y: self.macro_batch_y[i*self.micro_splits : (i+1)*self.micro_splits]
}
givenInput = {
x: self.macro_batch_x[i*self.micro_splits:(i+1)*self.micro_splits]
}
self.train_fn = theano.function([i], loss, givens=givenBoth)
self.test_fn = theano.function([i], test_loss, givens=givenBoth)
self.pred_fn = theano.function([i], test_prediction, givens=givenInput)
def build_layers(self,input_var=None):
network = lasagne.layers.InputLayer(shape=(None, self.n_channels, 360,360),
input_var=input_var)
network = lasagne.layers.Conv2DLayer(
network, num_filters=64, filter_size=(4, 4),stride=1,
nonlinearity=lasagne.nonlinearities.rectify,
W=lasagne.init.GlorotUniform())
# network = lasagne.layers.cuda_convnet.Conv2DCCLayer(
# network, num_filters=64, filter_size=(4, 4),
# nonlinearity=lasagne.nonlinearities.rectify,
# W=lasagne.init.GlorotUniform())
network = lasagne.layers.MaxPool2DLayer(network, pool_size=(2, 2))
network = lasagne.layers.dropout(network, p=.2)
network = lasagne.layers.DenseLayer(network,num_units=100,
nonlinearity=lasagne.nonlinearities.rectify)
network = lasagne.layers.dropout(network, p=.5)
network = lasagne.layers.DenseLayer(network,
num_units=1,nonlinearity=lasagne.nonlinearities.tanh)
return network
def fit(self):
av_loss = 0
av_acc = 0
iters = 1
nb_epochs = 1000
nb_folds = 5
f = 1
for _ in xrange(iters):
kfolds = KFold(len(self.Y), nb_folds)
for train, test in kfolds:
print('---'*20)
print('Iteration', iters, 'Fold', f)
print('---'*20)
f += 1
X_train = self.X[train]
X_test = self.X[test]
Y_train = self.Y[train]
Y_test = self.Y[test]
print('X',self.X.shape,'X_train',X_train.shape)
macro_count = X_train.shape[0]/self.macro_splits
micro_count = macro_count/self.micro_splits
print('pictures:', X_train.shape[0], 'macro size', macro_count, 'micro_count', micro_count)
# Train
lasagne.layers.set_all_param_values(self.network, self.init)
for epoch in range(nb_epochs):
train_loss = 0
start = timeit.default_timer()
for macro_index in xrange(macro_count):
self.macro_batch_x.set_value(X_train[macro_index * self.macro_splits : (macro_index+1) * self.macro_splits], borrow=True)
self.macro_batch_y.set_value(Y_train[macro_index * self.macro_splits : (macro_index+1) * self.macro_splits], borrow=True)
a=0.
for micro_index in xrange(micro_count):
a += self.train_fn(micro_index)
train_loss += a/macro_count
print('epoch:',epoch, 'train loss:',a, 'images/s', X_train.shape[0]/(timeit.default_timer()-start))
# Test
macro_count = X_test.shape[0]/self.macro_splits
micro_count = self.macro_splits/self.micro_splits
test_loss = 0
for macro_index in xrange(macro_count):
self.macro_batch_x.set_value(X_test[macro_index * self.macro_splits: (macro_index + 1) * self.macro_splits], borrow=True)
self.macro_batch_y.set_value(Y_test[macro_index * self.macro_splits: (macro_index + 1) * self.macro_splits], borrow=True)
a = 0.
for micro_index in xrange(micro_count):
a += self.test_fn(micro_index)
test_loss += a/macro_count
print('epoch:',epoch, 'test loss:',a)
av_loss += test_loss
print('Average loss:', av_loss/nb_folds/iters)
write_model_data(self.network, 'saved')
def read_model_data(model, filename):
filename = os.path.join('./', '%s.%s' % (filename, 'params'))
with open(filename, 'r') as f:
data = pickle.load(f)
lasagne.layers.set_all_param_values(model, data)
def write_model_data(model, filename):
data = lasagne.layers.get_all_param_values(model)
filename = os.path.join('./', filename)
filename = '%s.%s' % (filename, 'params')
with open(filename, 'w') as f:
pickle.dump(data, f)
return data
def main():
seed = 1
random.seed(seed)
np.random.seed(seed)
faces = np.load('saved.npz')
faces_X1, faces_X2, faces_Y = faces['faces_X1'], faces['faces_X2'], faces['faces_Y']
faces.close()
faces_X1 = faces_X1.reshape((-1, 3, 360, 360))
faces_Y *= 1/faces_Y.max()
# logging.basicConfig(level=logging.INFO)
t0 = time.time()
model = CNN(X=faces_X1.astype(theano.config.floatX),Y=faces_Y.astype(theano.config.floatX), n_channels=3)
model.fit()
print "Elapsed time: %f" % (time.time() - t0)
if __name__ == "__main__":
main()