I'm trying to load the network from Yu & Koltun (http://arxiv.org/abs/1511.07122) using the model available at https://github.com/fyu/dilation, and I'm running into some problems. I'm trying to follow the recipe here to load the weights. Here's what I've got:

import caffe
net_caffe = caffe.Net('yu-koltun-net.prototxt', 'yu-koltun-net.caffemodel', caffe.TEST)
import lasagne
from lasagne.layers.dnn import Conv2DDNNLayer as ConvLayer
from lasagne.layers import InputLayer, DropoutLayer
from lasagne.layers import Pool2DLayer as PoolLayer
from lasagne.utils import floatX
from lasagne.nonlinearities import rectify as relu
from lasagne.nonlinearities import softmax
from lasagne.layers import DilatedConv2DLayer as DilatedConvLayer
from lasagne.layers import DenseLayer
from lasagne.layers import NonlinearityLayer
nnet = {}
nnet['input'] = InputLayer((None, 3, None, None))
nnet['conv1_1'] = ConvLayer(nnet['input'], num_filters=64, filter_size=3, pad=0, flip_filters=False, nonlinearity=relu)
nnet['conv1_2'] = ConvLayer(nnet['conv1_1'], num_filters=64, filter_size=3, pad=0, flip_filters=False, nonlinearity=relu)
nnet['pool1'] = PoolLayer(nnet['conv1_2'], pool_size=2, stride=2, mode='max', ignore_border=False)
nnet['conv2_1'] = ConvLayer(nnet['pool1'], num_filters=128, filter_size=3, pad=0, flip_filters=False, nonlinearity=relu)
nnet['conv2_2'] = ConvLayer(nnet['conv2_1'], num_filters=128, filter_size=3, pad=0, flip_filters=False, nonlinearity=relu)
nnet['pool2'] = PoolLayer(nnet['conv2_2'], pool_size=2, stride=2, mode='max', ignore_border=False)
nnet['conv3_1'] = ConvLayer(nnet['pool2'], num_filters=256, filter_size=3, pad=0, flip_filters=False, nonlinearity=relu)
nnet['conv3_2'] = ConvLayer(nnet['conv3_1'], num_filters=256, filter_size=3, pad=0, flip_filters=False, nonlinearity=relu)
nnet['conv3_3'] = ConvLayer(nnet['conv3_2'], num_filters=256, filter_size=3, pad=0, flip_filters=False, nonlinearity=relu)
nnet['pool3'] = PoolLayer(nnet['conv3_3'], pool_size=2, stride=2, mode='max', ignore_border=False)
nnet['conv4_1'] = ConvLayer(nnet['pool3'], num_filters=512, filter_size=3, pad=0, flip_filters=False, nonlinearity=relu)
nnet['conv4_2'] = ConvLayer(nnet['conv4_1'], num_filters=512, filter_size=3, pad=0, flip_filters=False, nonlinearity=relu)
nnet['conv4_3'] = ConvLayer(nnet['conv4_2'], num_filters=512, filter_size=3, pad=0, flip_filters=False, nonlinearity=relu)
nnet['conv5_1'] = DilatedConvLayer(nnet['conv4_3'], num_filters=512, dilation=(2,2), filter_size=3, pad=0, flip_filters=False, nonlinearity=relu)
nnet['conv5_2'] = DilatedConvLayer(nnet['conv5_1'], num_filters=512, dilation=(2,2), filter_size=3, pad=0, flip_filters=False, nonlinearity=relu)
nnet['conv5_3'] = DilatedConvLayer(nnet['conv5_2'], num_filters=512, dilation=(2,2), filter_size=3, pad=0, flip_filters=False, nonlinearity=relu)
nnet['fc6'] = DilatedConvLayer(nnet['conv5_3'], num_filters=4096, dilation=(4,4), filter_size=7, pad=0, flip_filters=False, nonlinearity=relu)
nnet['drop6'] = DropoutLayer(nnet['fc6'], p=0.5)
nnet['fc7'] = ConvLayer(nnet['drop6'], num_filters=4096, filter_size=1, pad=0, flip_filters=False, nonlinearity=relu)
nnet['drop7'] = DropoutLayer(nnet['fc7'], p=0.5)
nnet['fc-final'] = ConvLayer(nnet['drop7'], num_filters=21, filter_size=1, pad=0, flip_filters=False, nonlinearity=lasagne.nonlinearities.linear)
# begin context network
nnet['ct_conv1_1'] = ConvLayer(nnet['fc-final'], num_filters=42, filter_size=3, pad=33, flip_filters=False, nonlinearity=relu) # has lr_mult and decay_mult
nnet['ct_conv1_2'] = ConvLayer(nnet['ct_conv1_1'], num_filters=42, filter_size=3, pad=0, flip_filters=False, nonlinearity=relu) # has lr_mult, decay_mult
nnet['ct_conv2_1'] = DilatedConvLayer(nnet['ct_conv1_2'], num_filters=84, filter_size=3, dilation=(2,2), pad=0, flip_filters=False, nonlinearity=relu)
nnet['ct_conv3_1'] = DilatedConvLayer(nnet['ct_conv2_1'], num_filters=168, filter_size=3, dilation=(4,4), pad=0, flip_filters=False, nonlinearity=relu)
nnet['ct_conv4_1'] = DilatedConvLayer(nnet['ct_conv3_1'], num_filters=336, filter_size=3, dilation=(8,8), pad=0, flip_filters=False, nonlinearity=relu)
nnet['ct_conv5_1'] = DilatedConvLayer(nnet['ct_conv4_1'], num_filters=672, filter_size=3, dilation=(16,16), pad=0, flip_filters=False, nonlinearity=relu)
nnet['ct_fc1'] = ConvLayer(nnet['ct_conv5_1'], num_filters=672, filter_size=3, pad=0, flip_filters=False, nonlinearity=relu)
nnet['ct_final'] = ConvLayer(nnet['ct_fc1'], num_filters=21, filter_size=1, pad=0, flip_filters=False, nonlinearity=lasagne.nonlinearities.linear)
#nnet['prob'] = DenseLayer(nnet['ct_final'], num_units=21, nonlinearity=lasagne.nonlinearities.softmax)
nnet['output'] = lasagne.layers.FlattenLayer(nnet['ct_final'])
nnet['prob'] = NonlinearityLayer(nnet['output'], nonlinearity=softmax)
# Copy parameters from Caffe to Lasagne
layers_caffe = dict(zip(list(net_caffe._layer_names), net_caffe.layers))
for name, layer in nnet.items():
    try:
        layer.W.set_value(layers_caffe[name].blobs[0].data)
        layer.b.set_value(layers_caffe[name].blobs[1].data)
    except AttributeError:
        continue
prob = lasagne.layers.get_output(nnet['prob'], the_image_to_classify, deterministic=True).eval()
---------------------------------------------------------------------------
RuntimeError Traceback (most recent call last)
<ipython-input-42-4bb404431448> in <module>()
----> 1 prob = lasagne.layers.get_output(nnet['prob'], im, deterministic=True).eval()
/data/deep-learning-env/anaconda2/lib/python2.7/site-packages/theano/gof/graph.pyc in eval(self, inputs_to_values)
521 args = [inputs_to_values[param] for param in inputs]
522
--> 523 rval = self._fn_cache[inputs](*args)
524
525 return rval
/data/deep-learning-env/anaconda2/lib/python2.7/site-packages/theano/compile/function_module.pyc in __call__(self, *args, **kwargs)
869 node=self.fn.nodes[self.fn.position_of_error],
870 thunk=thunk,
--> 871 storage_map=getattr(self.fn, 'storage_map', None))
872 else:
873 # old-style linkers raise their own exceptions
/data/deep-learning-env/anaconda2/lib/python2.7/site-packages/theano/gof/link.pyc in raise_with_op(node, thunk, exc_info, storage_map)
312 # extra long error message in that case.
313 pass
--> 314 reraise(exc_type, exc_value, exc_trace)
315
316
/data/deep-learning-env/anaconda2/lib/python2.7/site-packages/theano/compile/function_module.pyc in __call__(self, *args, **kwargs)
857 t0_fn = time.time()
858 try:
--> 859 outputs = self.fn()
860 except Exception:
861 if hasattr(self.fn, 'position_of_error'):
RuntimeError: GpuDnnConvGradW: error getting worksize: CUDNN_STATUS_BAD_PARAM
Apply node that caused the error: GpuDnnConvGradW{algo='none', inplace=True}(GpuContiguous.0, GpuContiguous.0, GpuAllocEmpty.0, GpuDnnConvDesc{border_mode='valid', subsample=(4, 4), conv_mode='cross', precision='float32'}.0, Constant{1.0}, Constant{0.0})
Toposort index: 292
Inputs types: [CudaNdarrayType(float32, (False, True, False, False)), CudaNdarrayType(float32, 4D), CudaNdarrayType(float32, (False, True, False, False)), <theano.gof.type.CDataType object at 0x7fbe43ea2350>, Scalar(float32), Scalar(float32)]
Inputs shapes: [(512, 1, 90, 90), (4096, 512, 7, 7), (512, 1, 66, 66), 'No shapes', (), ()]
Inputs strides: [(8100, 0, 90, 1), (25088, 49, 7, 1), (4356, 0, 66, 1), 'No strides', (), ()]
Inputs values: ['not shown', 'not shown', 'not shown', <PyCObject object at 0x7fbd8b0cbfa8>, 1.0, 0.0]
Inputs name: ('image', 'grad', 'output', 'descriptor', 'alpha', 'beta')
Outputs clients: [[GpuDimShuffle{1,0,2,3}(GpuDnnConvGradW{algo='none', inplace=True}.0)]]
HINT: Re-running with most Theano optimization disabled could give you a back-trace of when this node was created. This can be done with by setting the Theano flag 'optimizer=fast_compile'. If that does not work, Theano optimizations can be disabled with 'optimizer=None'.
HINT: Use the Theano flag 'exception_verbosity=high' for a debugprint and storage map footprint of this apply node.
(None, 4096, 66, 66)
(512, 4096, 66, 66)
The kernel (i.e., the output of the dilated convolution) should have been (4096, 1, 66, 66). Looks like the output shape computation is wrong.
How do I actually swap those in the dilated layers? I'm very new to Lasagne and not at all familiar with it yet.
layer.W.set_value(layers_caffe[name].blobs[0].data)
Do something like:
W = layers_caffe[name].blobs[0].data
if isinstance(layer, DilatedConvLayer):
    W = W.transpose(1, 0, 2, 3)
layer.W.set_value(W)
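For reference, Lasagne's DilatedConv2DLayer stores its weights with the first two axes swapped relative to an ordinary convolution, (num_input_channels, num_filters, rows, cols); the Caffe blob is laid out as (num_filters, num_input_channels, rows, cols), hence the transpose. A quick shape check for fc6 (just a sketch, assuming the nnet and layers_caffe objects defined above):
W_caffe = layers_caffe['fc6'].blobs[0].data    # (4096, 512, 7, 7), Caffe layout
W_lasagne = nnet['fc6'].W.get_value()          # (512, 4096, 7, 7), DilatedConv2DLayer layout
print W_caffe.shape, W_lasagne.shape
print W_caffe.transpose(1, 0, 2, 3).shape == W_lasagne.shape   # True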
names = ['input', 'conv1_1', 'conv1_2', 'pool1', 'conv2_1', 'conv2_2',
         'pool2', 'conv3_1', 'conv3_2', 'conv3_3', 'pool3', 'conv4_1', 'conv4_2', 'conv4_3',
         'conv5_1', 'conv5_2', 'conv5_3', 'fc6', 'drop6', 'fc7', 'drop7', 'fc-final',
         'ct_conv1_1', 'ct_conv1_2', 'ct_conv2_1', 'ct_conv3_1', 'ct_conv4_1', 'ct_conv5_1', 'ct_fc1', 'ct_final']
for name in names:
    if len(lasagne.layers.get_all_param_values(net[name])) > 0 and "pool" not in name and "drop" not in name:
        lasagne_params = lasagne.layers.get_all_param_values(net[name])[0]
        caffe_params = caffe_net.params[name][0].data
        print name, np.array_equal(caffe_params, lasagne_params)
print np.shape(lasagne.layers.get_all_param_values(net['conv1_2'])[0])
print np.shape(caffe_net.params['conv1_2'][0].data)
# output is
# (64, 3, 3, 3)
# (64, 64, 3, 3)
print np.shape(lasagne.layers.get_all_param_values(net['conv2_1'])[0])
print np.shape(caffe_net.params['conv2_1'][0].data)
# output is
# (64, 3, 3, 3)
# (128, 64, 3, 3)
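Note that lasagne.layers.get_all_param_values(net[name]) returns every parameter of every layer below name, so index [0] is always conv1_1's weights no matter which layer you pass; that's why both checks above print (64, 3, 3, 3) on the Lasagne side. To compare a layer's own weights, indexing the layer's W directly is less error-prone. A sketch, assuming the nnet / net_caffe objects defined at the top of the thread:
import numpy as np
for name in ['conv1_2', 'conv2_1']:
    W_lasagne = nnet[name].W.get_value()      # this layer's own weights
    W_caffe = net_caffe.params[name][0].data
    print name, W_lasagne.shape, W_caffe.shape, np.array_equal(W_lasagne, W_caffe)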
nnet = {}
nnet['input'] = InputLayer((1, 3, 900, 900))
nnet['conv1_1'] = ConvLayer(nnet['input'], num_filters=64, filter_size=3, pad=0, flip_filters=False, nonlinearity=relu)
nnet['conv1_2'] = ConvLayer(nnet['conv1_1'], num_filters=64, filter_size=3, pad=0, flip_filters=False, nonlinearity=relu)
nnet['pool1'] = PoolLayer(nnet['conv1_2'], pool_size=2, stride=2, mode='max', ignore_border=False)
print 'Lasagne layers', lasagne.layers.get_all_param_values(net['conv5_1'])[20][0]
print 'Caffe layers', caffe_net.params['conv5_1'][0].data[0]
Lasagne layers [[[ 1.79773569e-03 -3.86781082e-03 1.44778879e-03]
[ 5.62629709e-03 -3.78476828e-03 -3.50363902e-03]
[ -9.70753608e-04 -1.55057211e-03 2.05189921e-03]]
[[ -4.44258876e-05 -2.85256910e-03 -7.51745538e-04]
[ -5.39085083e-03 -8.67442507e-03 -6.84367260e-03]
[ -1.69579440e-03 -2.00542971e-03 -2.40873150e-03]]
[[ 3.54722259e-03 7.02135731e-04 5.49884560e-03]
[ 1.63380158e-04 -2.63992278e-03 4.20169148e-04]
[ 1.86512922e-03 -1.90990162e-03 1.24418832e-04]]
...,
[[ -3.51253781e-03 -1.02917319e-02 -4.89753392e-03]
[ -3.92112508e-03 -7.78933940e-03 -7.75852590e-04]
[ -5.20925329e-04 8.36972799e-03 7.79767148e-03]]
[[ -1.18238491e-03 -8.94870237e-03 -5.45267761e-03]
[ -5.66318957e-03 -1.76325385e-02 -1.12886960e-02]
[ -8.11124220e-03 -1.42779266e-02 -8.83196760e-03]]
[[ -3.31869884e-03 -4.01778612e-03 -3.69090540e-03]
[ -3.45548079e-03 -3.44294333e-03 -2.09339289e-03]
[ -1.11368170e-03 -1.81419810e-03 -4.53128887e-04]]]
Caffe layers [[[ 1.79773569e-03 -3.86781082e-03 1.44778879e-03]
[ 5.62629709e-03 -3.78476828e-03 -3.50363902e-03]
[ -9.70753608e-04 -1.55057211e-03 2.05189921e-03]]
[[ -2.17772159e-03 -2.09413143e-03 2.61314330e-03]
[ -1.32299084e-02 -4.97253053e-03 6.14153594e-03]
[ -1.26133068e-02 -5.66283567e-03 1.61643873e-03]]
[[ 4.73600952e-03 -1.91315322e-03 4.68450657e-04]
[ 4.06997791e-03 -5.24714636e-03 6.18363556e-04]
[ 1.02492245e-02 -7.65518111e-04 1.23429857e-03]]
...,
[[ -1.94619840e-03 -2.44452502e-03 -5.26150083e-03]
[ -2.70234491e-03 -3.81989777e-03 -5.41628478e-03]
[ 6.36245997e-04 9.92348650e-04 -2.53115897e-03]]
[[ 6.45608827e-03 -1.09352032e-02 -2.37631015e-02]
[ -3.14260926e-03 4.69462713e-03 -1.03884107e-02]
[ -6.74673310e-03 -4.62803198e-03 -6.75826194e-03]]
[[ 9.95074442e-05 3.66775156e-03 2.45841919e-04]
[ -3.43237282e-03 -4.81389143e-04 -4.60039213e-04]
[ -2.06783577e-03 4.17942210e-05 5.18482819e-04]]]
layers_caffe = dict(zip(list(net_caffe._layer_names), net_caffe.layers))
for name, layer in nnet.items():
    if "pool" in name or "drop" in name or isinstance(layer, DenseLayer) or isinstance(layer, InputLayer):
        continue
    try:
        caffe_layer = layers_caffe[name]
        W = caffe_layer.blobs[0].data
        if isinstance(layer, DilatedConvLayer):
            W = W.transpose(1, 0, 2, 3)
        assert W.shape == layer.W.get_value().shape
        layer.W.set_value(W)
        layer.b.set_value(caffe_layer.blobs[1].data)
    except AttributeError:
        continue
Clearly these aren't the same, even though the first few numbers are. Is this something weird with the transpose in the weight copy?
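One thing to keep in mind when eyeballing these: for a dilated layer like conv5_1 the Lasagne weights were stored transposed (input channels first), so the Lasagne slice above runs along the input-channel axis while the Caffe slice runs along the output-filter axis; only their first 3x3 block (the [0, 0] filter) coincides, which is exactly the pattern in the printout. A like-for-like comparison would be something along these lines (a sketch, assuming the net / caffe_net objects used above):
import numpy as np
W_lasagne = lasagne.layers.get_all_param_values(net['conv5_1'])[20]   # (in_channels, n_filters, 3, 3)
W_caffe = caffe_net.params['conv5_1'][0].data                         # (n_filters, in_channels, 3, 3)
print np.array_equal(W_lasagne, W_caffe.transpose(1, 0, 2, 3))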
for i in range(len(lasagne_outputs)):
    print i, np.array_equal(caffe_outputs[i], lasagne_outputs[i])
print 'lasagne', lasagne_outputs[18][0][0][0]
print 'caffe', caffe_outputs[18][0][0][0]
# gives
lasagne [ 0.05386817 0.0505662 0.05117193 0.04846499 0.04983476 0.04654144
0.03697921 0.01228332 0. 0. 0. 0. 0.
0. 0. 0. 0. 0. 0. 0.
0. 0. 0. 0. 0. 0. 0.
0. 0. 0. 0. 0. 0. 0.
0. 0. 0. 0. 0. 0. 0.
0. 0. 0. 0. 0. 0. 0.
0. 0. 0. 0. 0. 0. 0.
0. 0. 0. 0. 0. 0. 0.
0. 0. 0. 0. ]
caffe [ 0. 0.10113241 0.10234386 0.09692999 0. 0.09308289
0.07395843 0. 0. 0. 0. 0. 0.
0. 0. 0. 0. 0. 0. 0.
0. 0. 0. 0. 0. 0. 0.
0. 0. 0. 0. 0. 0. 0.
0. 0. 0. 0. 0. 0. 0.
0. 0. 0. 0. 0. 0. 0.
0. 0. 0. 0. 0. 0. 0.
0. 0. 0. 0. 0. 0. 0.
0. 0. 0. 0. ]
# get caffe output
net.blobs['data'].data[...] = caffe_in
caffe_out = net.forward(start='conv1_1', end='conv1_1')
# get lasagne output (fn is a compiled Theano function returning each layer's output)
lasagne_out = fn(caffe_in)
# compare lasagne and caffe
print np.array_equal(lasagne_out[1], caffe_out['conv1_1'])
# prints False
# check shapes
print np.shape(caffe_out['conv1_1'])
print np.shape(lasagne_out[1])
# prints
# (1, 64, 898, 898)
# (1, 64, 898, 898)
# print out the first 5 elements for comparison
print 'lasagne', lasagne_out[1][0][0][0][:5]
print 'caffe', caffe_out['conv1_1'][0][0][0][:5]
# prints
# lasagne [ 0. 0. 0. 0. 0.]
# caffe [ 0.08922774 -1.54430449 -0.95565462 -0.32640123 -0.32640123]
OK, so I'm narrowing down the problem: there was an issue with the bias not being applied in the first layer. Now it looks like the first convolution layer is correct. I'm going to map it out layer by layer and try to figure this out. I'll update when I've got something concrete.
Maybe I'm not wrong, though. In fc-final, which is the end of the front-end module, the maximum difference is 17.3796. This is across 21 feature maps of size 66x66. Then at the end of the context network that this front-end feeds into, the maximum difference is 24.7394. The last dilated convolution layer, ct_conv5_1, has a maximum error of 0.96.
I'll add that, even stronger than getting allclose on the first layer, I also get array_equal returning True.
def max_rel_error(x, y):
    return np.max(np.abs(x - y) / (np.maximum(1e-8, np.abs(x) + np.abs(y))))
caffe_conv1_1 = caffe_outputs[0]
lasagne_conv1_1 = lasagne_outputs[1]
print np.array_equal(caffe_conv1_1, lasagne_conv1_1) # True
caffe_conv1_2 = caffe_outputs[1]
lasagne_conv1_2 = lasagne_outputs[2]
print np.max(np.abs(caffe_conv1_2 - lasagne_conv1_2)) # 0.00109863
# Check to see if the weights are the same
lasagne_conv1_2_weights = lasagne.layers.get_all_param_values(nnet['conv1_2'])[2]
caffe_conv1_2_weights = net_caffe.params['conv1_2'][0].data
print np.array_equal(lasagne_conv1_2_weights, caffe_conv1_2_weights) # True
# Now check biases
lasagne_conv1_2_biases = lasagne.layers.get_all_param_values(nnet['conv1_2'])[3]
caffe_conv1_2_biases = net_caffe.params['conv1_2'][1].data
print np.array_equal(lasagne_conv1_2_biases, caffe_conv1_2_biases) # True
So as far as I can tell, somehow the input is identical, the weights are identical, the biases are identical, and the outputs are different. But it couldn't be an issue with the convolution operation, because the first layer works, right?
Here the outputs of each Caffe and Lasagne layer are flattened and their differences plotted. Errors tend to be low (on the order of 1e-6) throughout the network, and the max errors are pretty small for most of it: at layer 13 the max error is 1.9e-5. At layer 14 the max error jumps to 12.435 and the differences tend toward 2. This corresponds to the first dilated convolution layer.
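For reference, a sketch of the per-layer summary behind that comparison, using the max_rel_error helper and the caffe_outputs / lasagne_outputs lists from the snippets above (the one-index offset between the two lists is carried over from the conv1_1 / conv1_2 checks above and may need adjusting):
import numpy as np
for i in range(len(caffe_outputs)):
    c, l = caffe_outputs[i], lasagne_outputs[i + 1]
    print i, np.max(np.abs(c - l)), max_rel_error(c, l)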
I think I understand what the Lasagne implementation is doing now. It's computing the dilated convolution as the backward pass of a strided convolution with respect to its weights, passing the filters as the output gradient and using the dilation as the subsampling; the resulting "weight gradient" is the layer's output.
Has that been tested out well?
Is there something I need to do since the weight initialization required a transposition?
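For what it's worth, here is a minimal NumPy sketch (not the Lasagne code itself) of what the layer computes: a dilated convolution reads the input with a stride equal to the dilation under each kernel tap, which is the same as an ordinary convolution with a kernel that has zeros inserted between its taps. The backward pass in the error node above (GpuDnnConvGradW with subsample=(4, 4)) produces exactly this strided sampling, with the filters passed in place of the output gradient.
import numpy as np
from scipy.signal import correlate2d   # plain cross-correlation, matching flip_filters=False

def dilate_kernel(k, d):
    # insert d - 1 zeros between the taps of a 2-D kernel
    kh, kw = k.shape
    out = np.zeros(((kh - 1) * d + 1, (kw - 1) * d + 1), dtype=k.dtype)
    out[::d, ::d] = k
    return out

rng = np.random.RandomState(0)
x = rng.randn(20, 20).astype('float32')   # single-channel input
k = rng.randn(3, 3).astype('float32')     # single 3x3 filter
d = 2                                     # dilation factor

# dilated correlation computed directly: each kernel tap reads the input with stride d
out_size = 20 - (3 - 1) * d
ref = np.zeros((out_size, out_size), dtype='float32')
for i in range(3):
    for j in range(3):
        ref += k[i, j] * x[i * d:i * d + out_size, j * d:j * d + out_size]

# the same values from an ordinary 'valid' cross-correlation with the zero-dilated kernel
alt = correlate2d(x, dilate_kernel(k, d), mode='valid')
print np.allclose(ref, alt)   # True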
In our case, we should perform the softmax across all 21 channels. So before applying the softmax, we must switch the dimensions of the output (the ct_final layer) to get something with shape (size, 21), and then apply the softmax on the last dimension.
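A minimal sketch of one way to do that with standard Lasagne layers (the new layer names are just illustrative; this would replace the FlattenLayer / NonlinearityLayer combination used above):
from lasagne.layers import DimshuffleLayer, ReshapeLayer, NonlinearityLayer
from lasagne.nonlinearities import softmax

# ct_final produces (batch, 21, H, W); move the class axis last ...
nnet['ct_final_dimshuffle'] = DimshuffleLayer(nnet['ct_final'], (0, 2, 3, 1))
# ... collapse batch and spatial axes so every row holds one pixel's 21 class scores ...
nnet['ct_final_flat'] = ReshapeLayer(nnet['ct_final_dimshuffle'], (-1, 21))
# ... and apply the softmax over the last (class) dimension
nnet['prob'] = NonlinearityLayer(nnet['ct_final_flat'], nonlinearity=softmax)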