
liulu...@gmail.com

Dec 27, 2016, 3:58:04 AM
to Caffe Users

Issue summary

Hello everyone, I am new to Caffe. I want to do action recognition with a 3D CNN, but I run into trouble when training my model. At first the loss becomes nan; after changing the parameters, it still does not converge. Can anyone tell me how to solve this?


Steps to reproduce

First, I concatenate two data sources into one input, forming 6 channels. In total, I have 10821 training samples and 2064 test samples. The model is the same as the one I use with 3-channel input data (with 3 channels I get good results). It is as follows:
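For intuition, the Concat layer's concat_dim: 1 stacks the two inputs along the channel axis. A minimal numpy sketch of what I expect; the 5D blob shape (batch, channels, length, height, width) is my assumption based on the batch_size, new_length, new_height and new_width settings below:

import numpy as np

# Hypothetical blobs matching the two VIDEO_DATA layers below:
# (batch, channels, length, height, width) = (10, 3, 8, 60, 80)
data1 = np.random.randn(10, 3, 8, 60, 80).astype(np.float32)
data2 = np.random.randn(10, 3, 8, 60, 80).astype(np.float32)

# concat_dim: 1 concatenates along axis 1 (channels), giving 6 channels
data = np.concatenate([data1, data2], axis=1)
print(data.shape)  # (10, 6, 8, 60, 80)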


name: "cnn_kth_gray"
layers {
name: "data"
type: VIDEO_DATA
top: "data1"
top: "label"
image_data_param {
source: "../threechannels9/test_list.txt"
use_image: true
mean_file: "../threechannels9/train_mean.binaryproto"
batch_size: 10
show_data: 0
new_height: 60
new_width: 80
new_length: 8
shuffle: false
}
}
layers {
name: "data"
type: VIDEO_DATA
top: "data2"

top: "label2"

image_data_param {
source: "../threechannels7/test_list.txt"
use_image: true
mean_file: "../threechannels7/train_mean.binaryproto"
batch_size: 10
show_data: 0
new_height: 60
new_width: 80
new_length: 8
shuffle: false
}
}
layers{
name:"concat_data"
type:CONCAT
bottom:"data1"
bottom:"data2"
top:"data"
concat_param{
concat_dim:1
}
}

# ----------- 1st layer group ---------------

layers {
name: "conv1a"
type: CONVOLUTION3D
bottom: "data"
top: "conv1a"
blobs_lr: 1
blobs_lr: 2
weight_decay: 1
weight_decay: 0
convolution_param {
num_output: 64
kernel_size: 7
kernel_depth: 3
pad: 3
temporal_pad: 1
stride: 1
weight_filler {
type: "gaussian"
std: 0.01
}
bias_filler {
type: "constant"
value: 0
}
}
}
layers {
name: "relu1a"
type: RELU
bottom: "conv1a"
top: "conv1a"
}
layers {
name: "pool1"
type: POOLING3D
bottom: "conv1a"
top: "pool1"
pooling_param {
pool: MAX
kernel_size: 2
kernel_depth: 2
stride: 2
temporal_stride: 2
}
}

# ------------- 2nd layer group --------------

layers {
name: "conv2a"
type: CONVOLUTION3D
bottom: "pool1"
top: "conv2a"
blobs_lr: 1
blobs_lr: 2
weight_decay: 1
weight_decay: 0
convolution_param {
num_output: 128
kernel_size: 7
kernel_depth: 3
pad: 3
temporal_pad: 1
weight_filler {
type: "gaussian"
std: 0.01
}
bias_filler {
type: "constant"

}

}
}
layers {
name: "relu2a"
type: RELU
bottom: "conv2a"
top: "conv2a"
}
layers {
name: "pool2"
type: POOLING3D
bottom: "conv2a"
top: "pool2"
pooling_param {
pool: MAX
kernel_size: 2
kernel_depth: 2
stride: 2
temporal_stride: 2
}
}

# ---------------- fc layers -------------

layers {
name: "fc4"
type: INNER_PRODUCT
bottom: "pool2"
top: "fc4"
blobs_lr: 1
blobs_lr: 2
weight_decay: 1
weight_decay: 0
inner_product_param {
num_output: 256
weight_filler {
type: "gaussian"
std: 0.005
}
bias_filler {
type: "constant"
}
}
}
layers {
name: "relu4"
type: RELU
bottom: "fc4"
top: "fc4"
}
layers {
name: "drop4"
type: DROPOUT
bottom: "fc4"
top: "fc4"
dropout_param {
dropout_ratio: 0.5
}
}
layers {
name: "fc5"
type: INNER_PRODUCT
bottom: "fc4"
top: "fc5"
blobs_lr: 1
blobs_lr: 2
weight_decay: 1
weight_decay: 0
inner_product_param {
num_output: 128
weight_filler {
type: "gaussian"
std: 0.005
}
bias_filler {
type: "constant"
}
}
}
layers {
name: "relu5"
type: RELU
bottom: "fc5"
top: "fc5"
}
layers {
name: "drop5"
type: DROPOUT
bottom: "fc5"
top: "fc5"
dropout_param {
dropout_ratio: 0.5
}
}
layers {
name: "fc6"
type: INNER_PRODUCT
bottom: "fc5"
top: "fc6"
blobs_lr: 1
blobs_lr: 2
weight_decay: 1
weight_decay: 0
inner_product_param {
num_output: 6
weight_filler {
type: "gaussian"
std: 0.01
}
bias_filler {
type: "constant"
value: 0
}
}
}
layers {
name: "prob"
type: SOFTMAX
bottom: "fc6"
top: "prob"
}
layers {
top: "accuracy"
name: "accuracy"
type: ACCURACY
bottom: "prob"
bottom: "label"
}
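One thing I double-check in this two-source setup: only "label" from the first data layer feeds the accuracy layer, so with shuffle: false the two list files must stay aligned line by line. A minimal sketch, assuming the usual C3D-style lists where the integer class label is the last token on each line (an assumption; adjust to your list format):

# Verify that sample i in both lists carries the same label; a mismatch
# would mean the concatenated channels belong to different actions.
with open("../threechannels9/test_list.txt") as f9, \
     open("../threechannels7/test_list.txt") as f7:
    for i, (a, b) in enumerate(zip(f9, f7)):
        la, lb = a.split()[-1], b.split()[-1]  # assumed: label is last token
        if la != lb:
            print("label mismatch at line %d: %s vs %s" % (i, la, lb))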


The above is the test model; below is the training model:


name: "cnn_kth_gray"
layers {
name: "data"
type: VIDEO_DATA
top: "data1"
top: "label"
image_data_param {
source: "../threechannels9/train_list.txt"
use_image: true
mean_file: "../threechannels9/train_mean.binaryproto"
batch_size: 10
mirror:false
show_data: 0
new_height: 60
new_width: 80
new_length: 8
shuffle: false
}
}
layers {
name: "data"
type: VIDEO_DATA
top: "data2"

top: "label2"

image_data_param {
source: "../threechannels7/train_list.txt"
use_image: true
mean_file: "../threechannels7/train_mean.binaryproto"
batch_size: 10
mirror:false
show_data: 0
new_height: 60
new_width: 80
new_length: 8
shuffle: false
}
}
layers{
name:"concat_data"
type:CONCAT
bottom:"data1"
bottom:"data2"
top:"data"
concat_param{
concat_dim:1
}
}

# ----------- 1st layer group ---------------

layers {
name: "conv1a"
type: CONVOLUTION3D
bottom: "data"
top: "conv1a"
blobs_lr: 1
blobs_lr: 2
weight_decay: 1
weight_decay: 0
convolution_param {
num_output: 64
kernel_size: 7
kernel_depth: 3
pad: 3
temporal_pad: 1
stride: 1
weight_filler {
type: "gaussian"
std: 0.01
}
bias_filler {
type: "constant"
value: 0
}
}
}
layers {
name: "relu1a"
type: RELU
bottom: "conv1a"
top: "conv1a"
}
layers {
name: "pool1"
type: POOLING3D
bottom: "conv1a"
top: "pool1"
pooling_param {
pool: MAX
kernel_size: 2
kernel_depth: 2
stride: 2
temporal_stride: 2
}
}

# ------------- 2nd layer group --------------

layers {
name: "conv2a"
type: CONVOLUTION3D
bottom: "pool1"
top: "conv2a"
blobs_lr: 1
blobs_lr: 2
weight_decay: 1
weight_decay: 0
convolution_param {
num_output: 128
kernel_size: 7
kernel_depth: 3
pad: 3
temporal_pad: 1
weight_filler {
#type: "gaussian"
#std: 0.01
type: "xavier"
}
bias_filler {
type: "constant"
}
}
}
layers {
name: "relu2a"
type: RELU
bottom: "conv2a"
top: "conv2a"
}
layers {
name: "pool2"
type: POOLING3D
bottom: "conv2a"
top: "pool2"
pooling_param {
pool: MAX
kernel_size: 2
kernel_depth: 2
stride: 2
temporal_stride: 2
}
}

# ---------------- fc layers -------------

layers {
name: "fc4"
type: INNER_PRODUCT
bottom: "pool2"
top: "fc4"
blobs_lr: 1
blobs_lr: 2
weight_decay: 1
weight_decay: 0
inner_product_param {
num_output: 256
weight_filler {
#type: "gaussian"
#std: 0.005
type: "xavier"
}
bias_filler {
type: "constant"
}
}
}
layers {
name: "relu4"
type: RELU
bottom: "fc4"
top: "fc4"
}
layers {
name: "drop4"
type: DROPOUT
bottom: "fc4"
top: "fc4"
dropout_param {
dropout_ratio: 0.5
}
}
layers {
name: "fc5"
type: INNER_PRODUCT
bottom: "fc4"
top: "fc5"
blobs_lr: 1
blobs_lr: 2
weight_decay: 1
weight_decay: 0
inner_product_param {
num_output: 128
weight_filler {
#type: "gaussian"
#std: 0.005
type: "xavier"
}
bias_filler {
type: "constant"
}
}
}
layers {
name: "relu5"
type: RELU
bottom: "fc5"
top: "fc5"
}
layers {
name: "drop5"
type: DROPOUT
bottom: "fc5"
top: "fc5"
dropout_param {
dropout_ratio: 0.5
}
}
layers {
name: "fc6"
type: INNER_PRODUCT
bottom: "fc5"
top: "fc6"
blobs_lr: 1
blobs_lr: 2
weight_decay: 1
weight_decay: 0
inner_product_param {
num_output: 6
weight_filler {
type: "xavier"
}
bias_filler {
type: "constant"
value: 0
}
}
}
layers {
name: "loss"
type: SOFTMAX_LOSS
bottom: "fc6"
bottom: "label"
}
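Note that the training model replaces the gaussian fillers of conv2a and the fc layers with "xavier" (one of the parameter changes I tried). For reference, a sketch of the weight scale Caffe's xavier filler uses, assuming the default FAN_IN normalization (uniform in [-a, a] with a = sqrt(3 / fan_in)); the fan-in is computed from the conv2a parameters above:

import math

# conv2a: 64 input channels, kernel_depth 3, kernel_size 7
fan_in = 64 * 3 * 7 * 7
a = math.sqrt(3.0 / fan_in)
print(a)  # ~0.0179, versus the fixed std 0.01 of the original gaussian filler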


And this is the solver:


train_net: "multi_channels_train.prototxt"
test_net: "multi_channels_test.prototxt"
test_iter: 300
test_interval: 1000
base_lr: 0.0003
momentum: 0.9
weight_decay: 0.005
lr_policy: "inv"
gamma: 0.0001
power: 0.75

display: 1000

max_iter: 15000

#snapshot: 15000
#snapshot_prefix: "chan3"

solver_mode: GPU
device_id:0
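As a sanity check, Caffe's "inv" policy computes lr = base_lr * (1 + gamma * iter)^(-power); this reproduces the learning rates printed in the logs below:

# Caffe "inv" learning-rate policy
base_lr, gamma, power = 0.0003, 0.0001, 0.75

def inv_lr(it):
    return base_lr * (1.0 + gamma * it) ** (-power)

print(inv_lr(1000))  # ~0.000279304, matches "Iteration 1000, lr = 0.000279304"
print(inv_lr(2000))  # ~0.000261659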


With these settings, I get the following result:


I1224 16:55:25.943403 31888 solver.cpp:61] Solving cnn_kth_gray
I1224 16:55:25.943426 31888 solver.cpp:106] Iteration 0, Testing net
I1224 16:56:24.839272 31888 solver.cpp:142] Test score #0: 0.13
I1224 16:56:24.839352 31888 solver.cpp:142] Test score #1: 1.79222
I1224 17:08:05.139917 31888 solver.cpp:237] Iteration 1000, lr = 0.000279304
I1224 17:08:05.140704 31888 solver.cpp:87] Iteration 1000, loss = 3.57628e-08
I1224 17:08:05.140821 31888 solver.cpp:106] Iteration 1000, Testing net
I1224 17:09:12.241538 31888 solver.cpp:142] Test score #0: 0.106667
I1224 17:09:12.241590 31888 solver.cpp:142] Test score #1: 37.4691
I1224 17:20:54.375815 31888 solver.cpp:237] Iteration 2000, lr = 0.000261659
I1224 17:20:54.376219 31888 solver.cpp:87] Iteration 2000, loss = 1.30465
I1224 17:20:54.376232 31888 solver.cpp:106] Iteration 2000, Testing net
I1224 17:22:02.605139 31888 solver.cpp:142] Test score #0: 0.079
I1224 17:22:02.605213 31888 solver.cpp:142] Test score #1: 27.376
I1224 17:33:49.042558 31888 solver.cpp:237] Iteration 3000, lr = 0.000246413
I1224 17:33:49.043254 31888 solver.cpp:87] Iteration 3000, loss = 0
I1224 17:33:49.043361 31888 solver.cpp:106] Iteration 3000, Testing net
I1224 17:34:55.220796 31888 solver.cpp:142] Test score #0: 0.126333
I1224 17:34:55.220849 31888 solver.cpp:142] Test score #1: 38.0356
I1224 17:46:39.299548 31888 solver.cpp:237] Iteration 4000, lr = 0.000233091
I1224 17:46:39.299958 31888 solver.cpp:87] Iteration 4000, loss = 0
I1224 17:46:39.299970 31888 solver.cpp:106] Iteration 4000, Testing net
I1224 17:47:52.172710 31888 solver.cpp:142] Test score #0: 0.126333
I1224 17:47:52.172760 31888 solver.cpp:142] Test score #1: 38.8023
I1224 17:59:29.775135 31888 solver.cpp:237] Iteration 5000, lr = 0.000221336
I1224 17:59:29.775545 31888 solver.cpp:87] Iteration 5000, loss = nan
I1224 17:59:29.775560 31888 solver.cpp:106] Iteration 5000, Testing net
I1224 18:00:34.618185 31888 solver.cpp:142] Test score #0: 0.173667
I1224 18:00:34.618242 31888 solver.cpp:142] Test score #1: nan
I1224 18:12:12.438012 31888 solver.cpp:237] Iteration 6000, lr = 0.000210878
I1224 18:12:12.438431 31888 solver.cpp:87] Iteration 6000, loss = nan
I1224 18:12:12.438449 31888 solver.cpp:106] Iteration 6000, Testing net
I1224 18:13:16.184010 31888 solver.cpp:142] Test score #0: 0.173667
I1224 18:13:16.184063 31888 solver.cpp:142] Test score #1: nan
I1224 18:24:57.493057 31888 solver.cpp:237] Iteration 7000, lr = 0.000201504
I1224 18:24:57.493491 31888 solver.cpp:87] Iteration 7000, loss = nan
I1224 18:24:57.493508 31888 solver.cpp:106] Iteration 7000, Testing net
I1224 18:26:16.571517 31888 solver.cpp:142] Test score #0: 0.181667
I1224 18:26:16.571570 31888 solver.cpp:142] Test score #1: nan
I1224 18:37:54.199434 31888 solver.cpp:237] Iteration 8000, lr = 0.000193049
I1224 18:37:54.199985 31888 solver.cpp:87] Iteration 8000, loss = nan
I1224 18:37:54.200021 31888 solver.cpp:106] Iteration 8000, Testing net
I1224 18:39:00.424139 31888 solver.cpp:142] Test score #0: 0.285667
I1224 18:39:00.424192 31888 solver.cpp:142] Test score #1: nan
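A training loss that collapses to 0 (or 3.57628e-08) and then jumps to nan looks to me like softmax overflow: once the logits grow very large the per-sample loss underflows toward 0, and a further blow-up produces nan. A toy numpy illustration of that pattern (not Caffe's actual implementation, which is more stable than this):

import numpy as np

def naive_softmax_loss(logits, label):
    # deliberately unstable softmax cross-entropy
    p = np.exp(logits) / np.exp(logits).sum()
    return -np.log(p[label])

print(naive_softmax_loss(np.array([30.0, 0, 0, 0, 0, 0]), 0))   # ~0 (underflow)
print(naive_softmax_loss(np.array([800.0, 0, 0, 0, 0, 0]), 0))  # nan (exp overflows)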

There are many loss = nan entries. Then I reduce base_lr to 0.0001, and the result is:

I1224 11:32:09.807268 24040 solver.cpp:61] Solving cnn_kth_gray
I1224 11:32:09.807291 24040 solver.cpp:106] Iteration 0, Testing net
I1224 11:33:07.635201 24040 solver.cpp:142] Test score #0: 0.130667
I1224 11:33:07.635284 24040 solver.cpp:142] Test score #1: 1.79162
I1224 11:44:45.316015 24040 solver.cpp:237] Iteration 1000, lr = 9.31012e-05
I1224 11:44:45.316423 24040 solver.cpp:87] Iteration 1000, loss = 0.470303
I1224 11:44:45.316437 24040 solver.cpp:106] Iteration 1000, Testing net
I1224 11:45:46.747316 24040 solver.cpp:142] Test score #0: 0.106667
I1224 11:45:46.747375 24040 solver.cpp:142] Test score #1: 3.3449
I1224 11:57:26.854066 24040 solver.cpp:237] Iteration 2000, lr = 8.72196e-05
I1224 11:57:26.854576 24040 solver.cpp:87] Iteration 2000, loss = 0.002719
I1224 11:57:26.854609 24040 solver.cpp:106] Iteration 2000, Testing net
I1224 11:58:33.282390 24040 solver.cpp:142] Test score #0: 0.079
I1224 11:58:33.282449 24040 solver.cpp:142] Test score #1: 20.6993
I1224 12:10:12.140779 24040 solver.cpp:237] Iteration 3000, lr = 8.21377e-05
I1224 12:10:12.141168 24040 solver.cpp:87] Iteration 3000, loss = 2.38419e-07
I1224 12:10:12.141180 24040 solver.cpp:106] Iteration 3000, Testing net
I1224 12:11:21.869127 24040 solver.cpp:142] Test score #0: 0.126333
I1224 12:11:21.869180 24040 solver.cpp:142] Test score #1: 11.7715
I1224 12:22:57.055192 24040 solver.cpp:237] Iteration 4000, lr = 7.76969e-05
I1224 12:22:57.055614 24040 solver.cpp:87] Iteration 4000, loss = 0.979086
I1224 12:22:57.055632 24040 solver.cpp:106] Iteration 4000, Testing net
I1224 12:24:03.803319 24040 solver.cpp:142] Test score #0: 0.334333
I1224 12:24:03.803407 24040 solver.cpp:142] Test score #1: 1.84655
I1224 12:35:39.197751 24040 solver.cpp:237] Iteration 5000, lr = 7.37788e-05
I1224 12:35:39.198192 24040 solver.cpp:87] Iteration 5000, loss = 0.0016107
I1224 12:35:39.198210 24040 solver.cpp:106] Iteration 5000, Testing net
I1224 12:36:42.912531 24040 solver.cpp:142] Test score #0: 0.242667
I1224 12:36:42.912593 24040 solver.cpp:142] Test score #1: 28.0197
I1224 12:48:18.505199 24040 solver.cpp:237] Iteration 6000, lr = 7.02927e-05
I1224 12:48:18.505659 24040 solver.cpp:87] Iteration 6000, loss = 0
I1224 12:48:18.505678 24040 solver.cpp:106] Iteration 6000, Testing net
I1224 12:49:22.231613 24040 solver.cpp:142] Test score #0: 0.226667
I1224 12:49:22.231674 24040 solver.cpp:142] Test score #1: 7.86658
I1224 13:00:57.767045 24040 solver.cpp:237] Iteration 7000, lr = 6.71681e-05
I1224 13:00:57.767477 24040 solver.cpp:87] Iteration 7000, loss = 1.65985
I1224 13:00:57.767493 24040 solver.cpp:106] Iteration 7000, Testing net
I1224 13:02:04.090325 24040 solver.cpp:142] Test score #0: 0.210667
I1224 13:02:04.090385 24040 solver.cpp:142] Test score #1: 2.0796
I1224 13:13:45.222671 24040 solver.cpp:237] Iteration 8000, lr = 6.43496e-05
I1224 13:13:45.223103 24040 solver.cpp:87] Iteration 8000, loss = 0.00018522
I1224 13:13:45.223119 24040 solver.cpp:106] Iteration 8000, Testing net
I1224 13:14:52.831725 24040 solver.cpp:142] Test score #0: 0.175667
I1224 13:14:52.831780 24040 solver.cpp:142] Test score #1: 21.2707
I1224 13:26:29.929746 24040 solver.cpp:237] Iteration 9000, lr = 6.17924e-05
I1224 13:26:29.930202 24040 solver.cpp:87] Iteration 9000, loss = 1.74052
I1224 13:26:29.930222 24040 solver.cpp:106] Iteration 9000, Testing net
I1224 13:27:39.307377 24040 solver.cpp:142] Test score #0: 0.227333
I1224 13:27:39.307441 24040 solver.cpp:142] Test score #1: 1.74413
I1224 13:39:18.619287 24040 solver.cpp:237] Iteration 10000, lr = 5.94604e-05
I1224 13:39:18.619700 24040 solver.cpp:87] Iteration 10000, loss = 1.6683
I1224 13:39:18.619720 24040 solver.cpp:106] Iteration 10000, Testing net
I1224 13:40:23.578016 24040 solver.cpp:142] Test score #0: 0.173667
I1224 13:40:23.578075 24040 solver.cpp:142] Test score #1: 2.71767
I1224 13:52:02.817840 24040 solver.cpp:237] Iteration 11000, lr = 5.73239e-05
I1224 13:52:02.818285 24040 solver.cpp:87] Iteration 11000, loss = 9.85268e-05
I1224 13:52:02.818303 24040 solver.cpp:106] Iteration 11000, Testing net
I1224 13:53:04.384310 24040 solver.cpp:142] Test score #0: 0.173667
I1224 13:53:04.384362 24040 solver.cpp:142] Test score #1: 4.03042


As we can see, the loss is sometimes large and sometimes small, and Test score #0 does not converge.
Then I suspect overfitting, because the loss on the training set is small enough while the loss on the test set stays unchanged. So I try changing some other parameters, for example setting weight_decay to 0.5 while keeping base_lr at 0.0003. The result is:


I1224 18:49:38.281743 22199 solver.cpp:61] Solving cnn_kth_gray
I1224 18:49:38.281769 22199 solver.cpp:106] Iteration 0, Testing net
I1224 18:50:35.910184 22199 solver.cpp:142] Test score #0: 0.125667
I1224 18:50:35.910266 22199 solver.cpp:142] Test score #1: 1.79233
I1224 19:02:20.779150 22199 solver.cpp:237] Iteration 1000, lr = 0.000279304
I1224 19:02:20.779824 22199 solver.cpp:87] Iteration 1000, loss = 1.94739
I1224 19:02:20.779846 22199 solver.cpp:106] Iteration 1000, Testing net
I1224 19:03:32.471354 22199 solver.cpp:142] Test score #0: 0.126333
I1224 19:03:32.471415 22199 solver.cpp:142] Test score #1: 1.89274
I1224 19:15:13.761765 22199 solver.cpp:237] Iteration 2000, lr = 0.000261659
I1224 19:15:13.762218 22199 solver.cpp:87] Iteration 2000, loss = 2.04903
I1224 19:15:13.762238 22199 solver.cpp:106] Iteration 2000, Testing net
I1224 19:16:25.087585 22199 solver.cpp:142] Test score #0: 0.226667
I1224 19:16:25.087637 22199 solver.cpp:142] Test score #1: 1.78218
I1224 19:28:06.180078 22199 solver.cpp:237] Iteration 3000, lr = 0.000246413
I1224 19:28:06.180554 22199 solver.cpp:87] Iteration 3000, loss = 1.68554
I1224 19:28:06.180575 22199 solver.cpp:106] Iteration 3000, Testing net
I1224 19:29:16.247740 22199 solver.cpp:142] Test score #0: 0.325333
I1224 19:29:16.247791 22199 solver.cpp:142] Test score #1: 1.69897
I1224 19:41:00.501526 22199 solver.cpp:237] Iteration 4000, lr = 0.000233091
I1224 19:41:00.501942 22199 solver.cpp:87] Iteration 4000, loss = 2.08878
I1224 19:41:00.501956 22199 solver.cpp:106] Iteration 4000, Testing net
I1224 19:42:13.518661 22199 solver.cpp:142] Test score #0: 0.338667
I1224 19:42:13.518718 22199 solver.cpp:142] Test score #1: 1.67799
I1224 19:53:53.451045 22199 solver.cpp:237] Iteration 5000, lr = 0.000221336
I1224 19:53:53.451524 22199 solver.cpp:87] Iteration 5000, loss = 1.09684
I1224 19:53:53.451541 22199 solver.cpp:106] Iteration 5000, Testing net
I1224 19:54:56.529981 22199 solver.cpp:142] Test score #0: 0.242667
I1224 19:54:56.530036 22199 solver.cpp:142] Test score #1: 1.79733
I1224 20:06:40.013425 22199 solver.cpp:237] Iteration 6000, lr = 0.000210878
I1224 20:06:40.013837 22199 solver.cpp:87] Iteration 6000, loss = 1.3376
I1224 20:06:40.013852 22199 solver.cpp:106] Iteration 6000, Testing net
I1224 20:07:45.888494 22199 solver.cpp:142] Test score #0: 0.226667
I1224 20:07:45.888546 22199 solver.cpp:142] Test score #1: 1.81561
I1224 20:19:23.910408 22199 solver.cpp:237] Iteration 7000, lr = 0.000201504
I1224 20:19:23.910861 22199 solver.cpp:87] Iteration 7000, loss = 1.60183
I1224 20:19:23.910879 22199 solver.cpp:106] Iteration 7000, Testing net
I1224 20:20:35.471875 22199 solver.cpp:142] Test score #0: 0.175667
I1224 20:20:35.471928 22199 solver.cpp:142] Test score #1: 1.79461
I1224 20:32:13.073523 22199 solver.cpp:237] Iteration 8000, lr = 0.000193049
I1224 20:32:13.073933 22199 solver.cpp:87] Iteration 8000, loss = 1.32693
I1224 20:32:13.073948 22199 solver.cpp:106] Iteration 8000, Testing net
I1224 20:33:15.383118 22199 solver.cpp:142] Test score #0: 0.175667
I1224 20:33:15.383174 22199 solver.cpp:142] Test score #1: 1.73514
I1224 20:44:52.600426 22199 solver.cpp:237] Iteration 9000, lr = 0.000185377
I1224 20:44:52.600903 22199 solver.cpp:87] Iteration 9000, loss = 1.57691
I1224 20:44:52.600921 22199 solver.cpp:106] Iteration 9000, Testing net
I1224 20:45:55.698966 22199 solver.cpp:142] Test score #0: 0.227333
I1224 20:45:55.699019 22199 solver.cpp:142] Test score #1: 1.72745
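For reference, Test score #1 hovering around 1.79 in this run is -ln(1/6) ≈ 1.792, the softmax loss of a uniform prediction over the 6 classes (num_output: 6), i.e. the network is stuck at chance level:

import math
print(-math.log(1.0 / 6.0))  # 1.7917..., the plateau value of Test score #1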


Now the loss no longer becomes so small, but it still does not converge. Does anyone know what is wrong and how I can improve it?


Your system configuration

Operating system: Ubuntu
Compiler: gcc
CUDA version (if applicable): 7.5