Loss is stuck at 8.2 and does not decrease during training

41 views

Skip to first unread message

kai

unread,

Oct 30, 2017, 11:10:53 PM10/30/17

to Caffe Users

I am training a classifier for printed Chinese characters. My dataset includes 3755 categories, and for each category I have 512 images. I tried to train on this data using both AlexNet and GoogLeNet. The weird thing is that during training the loss always gets stuck at 8.2 with both networks. I tried lowering the learning rate to 0.0000001; the loss then starts at around 15 and begins to decrease, but once it gets close to 8 it gets stuck there again... I cannot figure out where the problem is. I trained and tested the same networks with other datasets and found that the loss does decrease to a small number, so I do not know whether the cause is my training data. All of my data is handmade by image augmentation (rotation, blur, added noise, color changes, etc.) from one original image. Below are my network definition, my training log, and some samples of my data.

# Training-phase input layer: reads (image, label) pairs from an LMDB.
# NOTE(review): the pasted text had no `name:` field and never closed this
# `layer {` block; both are restored here. The name is derived from the top
# blob -- substitute the original name if you still have it.
# NOTE(review): with 3755 classes, a batch of 10 gives very noisy gradients,
# and an LMDB written in class order (unshuffled) commonly keeps the loss at
# chance level -- TODO confirm how new_train_lmdb was built.
layer {
  name: "data"
  type: "Data"
  top: "data"
  top: "label"
  include {
    phase: TRAIN
  }
  transform_param {
    # No horizontal flipping.
    mirror: false
    # Per-channel mean values subtracted from the input.
    mean_value: 104
    mean_value: 117
    mean_value: 123
  }
  data_param {
    source: "new_train_lmdb"
    batch_size: 10
    backend: LMDB
  }
}

# Test-phase input layer: reads (image, label) pairs from the validation LMDB
# with the same mean subtraction as the training input.
# NOTE(review): the pasted text had no `name:` field and never closed this
# `layer {` block; both are restored here. The name is derived from the top
# blob -- substitute the original name if you still have it.
layer {
  name: "data"
  type: "Data"
  top: "data"
  top: "label"
  include {
    phase: TEST
  }
  transform_param {
    mirror: false
    # Per-channel mean values subtracted from the input.
    mean_value: 104
    mean_value: 117
    mean_value: 123
  }
  data_param {
    source: "new_val_lmdb"
    batch_size: 20
    backend: LMDB
  }
}

# Stem, stage 1: 7x7/2 convolution -> ReLU -> 3x3/2 max-pool -> LRN.
# NOTE(review): the pasted text had no `name:` fields and was missing the
# closing braces of the nested messages/layers; both are restored here. Names
# are derived from the top blobs -- substitute the originals if you have them
# (they matter when loading weights from an existing snapshot).
# In every learnable layer the first `param` applies to the weights and the
# second to the biases (2x learning rate, no weight decay).
layer {
  name: "conv1"
  type: "Convolution"
  bottom: "data"
  top: "conv1"
  param { lr_mult: 1 decay_mult: 1 }
  param { lr_mult: 2 decay_mult: 0 }
  convolution_param {
    num_output: 64
    kernel_size: 7
    stride: 2
    pad: 0
    weight_filler {
      type: "xavier"
      #std: 0.015
    }
    bias_filler { type: "constant" value: 0 }
  }
}
# In-place ReLU on conv1.
layer {
  name: "relu_conv1"
  type: "ReLU"
  bottom: "conv1"
  top: "conv1"
}
layer {
  name: "pool1"
  type: "Pooling"
  bottom: "conv1"
  top: "pool1"
  pooling_param {
    pool: MAX
    kernel_size: 3
    stride: 2
    pad: 0
  }
}
# Local response normalization over 5 neighbouring channels.
layer {
  name: "norm1"
  type: "LRN"
  bottom: "pool1"
  top: "norm1"
  lrn_param {
    local_size: 5
    alpha: 0.0001
    beta: 0.75
  }
}

# Stem, stage 2: 1x1 reduction -> ReLU -> 3x3 convolution -> ReLU -> LRN ->
# 3x3/2 max-pool.
# NOTE(review): the pasted text had no `name:` fields and was missing the
# closing braces of the nested messages/layers; both are restored here. Names
# are derived from the top blobs -- substitute the originals if you have them.
# In every learnable layer the first `param` applies to the weights and the
# second to the biases (2x learning rate, no weight decay).
layer {
  name: "reduction2"
  type: "Convolution"
  bottom: "norm1"
  top: "reduction2"
  param { lr_mult: 1 decay_mult: 1 }
  param { lr_mult: 2 decay_mult: 0 }
  convolution_param {
    num_output: 64
    pad: 0
    kernel_size: 1
    group: 1
    weight_filler { type: "xavier" }
    bias_filler { type: "constant" value: 0 }
  }
}
layer {
  name: "relu_reduction2"
  type: "ReLU"
  bottom: "reduction2"
  top: "reduction2"
}
layer {
  name: "conv2"
  type: "Convolution"
  bottom: "reduction2"
  top: "conv2"
  param { lr_mult: 1 decay_mult: 1 }
  param { lr_mult: 2 decay_mult: 0 }
  convolution_param {
    num_output: 192
    pad: 1
    kernel_size: 3
    group: 1
    weight_filler {
      type: "xavier"
      #std: 0.02
    }
    bias_filler { type: "constant" value: 0 }
  }
}
layer {
  name: "relu_conv2"
  type: "ReLU"
  bottom: "conv2"
  top: "conv2"
}
layer {
  name: "norm2"
  type: "LRN"
  bottom: "conv2"
  top: "norm2"
  lrn_param {
    local_size: 5
    alpha: 0.0001
    beta: 0.75
  }
}
layer {
  name: "pool2"
  type: "Pooling"
  bottom: "norm2"
  top: "pool2"
  pooling_param {
    pool: MAX
    kernel_size: 3
    stride: 2
    pad: 0
  }
}

# Inception module 1 ***************
# Input: pool2. Four parallel branches (1x1; 1x1->3x3; 1x1->5x5; pool->1x1)
# are concatenated into icp2_in.
# NOTE(review): the pasted text had no `name:` fields and was missing the
# closing braces of the nested messages/layers; both are restored here. Names
# are derived from the top blobs -- substitute the originals if you have them.
# In every learnable layer the first `param` applies to the weights and the
# second to the biases (2x learning rate, no weight decay).

# 1x1 reduction feeding the 3x3 branch.
layer {
  name: "icp1_reduction1"
  type: "Convolution"
  bottom: "pool2"
  top: "icp1_reduction1"
  param { lr_mult: 1 decay_mult: 1 }
  param { lr_mult: 2 decay_mult: 0 }
  convolution_param {
    num_output: 96
    pad: 0
    kernel_size: 1
    weight_filler { type: "xavier" }
    bias_filler { type: "constant" value: 0 }
  }
}
layer {
  name: "relu_icp1_reduction1"
  type: "ReLU"
  bottom: "icp1_reduction1"
  top: "icp1_reduction1"
}
# 1x1 reduction feeding the 5x5 branch.
layer {
  name: "icp1_reduction2"
  type: "Convolution"
  bottom: "pool2"
  top: "icp1_reduction2"
  param { lr_mult: 1 decay_mult: 1 }
  param { lr_mult: 2 decay_mult: 0 }
  convolution_param {
    num_output: 16
    pad: 0
    kernel_size: 1
    weight_filler { type: "xavier" }
    bias_filler { type: "constant" value: 0 }
  }
}
layer {
  name: "relu_icp1_reduction2"
  type: "ReLU"
  bottom: "icp1_reduction2"
  top: "icp1_reduction2"
}
# 3x3 stride-1 max-pool feeding the pool-projection branch.
layer {
  name: "icp1_pool"
  type: "Pooling"
  bottom: "pool2"
  top: "icp1_pool"
  pooling_param {
    pool: MAX
    kernel_size: 3
    stride: 1
    pad: 1
  }
}

# ***********
# Branch 0: 1x1 convolution directly on the module input.
layer {
  name: "icp1_out0"
  type: "Convolution"
  bottom: "pool2"
  top: "icp1_out0"
  param { lr_mult: 1 decay_mult: 1 }
  param { lr_mult: 2 decay_mult: 0 }
  convolution_param {
    num_output: 64
    pad: 0
    kernel_size: 1
    weight_filler { type: "xavier" }
    bias_filler { type: "constant" value: 0 }
  }
}
layer {
  name: "relu_icp1_out0"
  type: "ReLU"
  bottom: "icp1_out0"
  top: "icp1_out0"
}
# Branch 1: 3x3 convolution on the first reduction.
layer {
  name: "icp1_out1"
  type: "Convolution"
  bottom: "icp1_reduction1"
  top: "icp1_out1"
  param { lr_mult: 1 decay_mult: 1 }
  param { lr_mult: 2 decay_mult: 0 }
  convolution_param {
    num_output: 128
    pad: 1
    kernel_size: 3
    weight_filler {
      type: "xavier"
      #std: 0.04
    }
    bias_filler { type: "constant" value: 0 }
  }
}
layer {
  name: "relu_icp1_out1"
  type: "ReLU"
  bottom: "icp1_out1"
  top: "icp1_out1"
}
# Branch 2: 5x5 convolution on the second reduction.
layer {
  name: "icp1_out2"
  type: "Convolution"
  bottom: "icp1_reduction2"
  top: "icp1_out2"
  param { lr_mult: 1 decay_mult: 1 }
  param { lr_mult: 2 decay_mult: 0 }
  convolution_param {
    num_output: 32
    pad: 2
    kernel_size: 5
    weight_filler {
      type: "xavier"
      #std: 0.08
    }
    bias_filler { type: "constant" value: 0 }
  }
}
layer {
  name: "relu_icp1_out2"
  type: "ReLU"
  bottom: "icp1_out2"
  top: "icp1_out2"
}
# Branch 3: 1x1 projection of the pooled input.
layer {
  name: "icp1_out3"
  type: "Convolution"
  bottom: "icp1_pool"
  top: "icp1_out3"
  param { lr_mult: 1 decay_mult: 1 }
  param { lr_mult: 2 decay_mult: 0 }
  convolution_param {
    num_output: 32
    pad: 0
    kernel_size: 1
    weight_filler { type: "xavier" }
    bias_filler { type: "constant" value: 0 }
  }
}
layer {
  name: "relu_icp1_out3"
  type: "ReLU"
  bottom: "icp1_out3"
  top: "icp1_out3"
}

# Concat them together
layer {
  name: "icp2_in"
  type: "Concat"
  bottom: "icp1_out0"
  bottom: "icp1_out1"
  bottom: "icp1_out2"
  bottom: "icp1_out3"
  top: "icp2_in"
}

# Inception module 2 ***************
# Input: icp2_in. Four parallel branches are concatenated into icp2_out, which
# is then max-pooled (3x3, stride 2) into icp3_in.
# NOTE(review): the pasted text had no `name:` fields and was missing the
# closing braces of the nested messages/layers; both are restored here. Names
# are derived from the top blobs -- substitute the originals if you have them.
# In every learnable layer the first `param` applies to the weights and the
# second to the biases (2x learning rate, no weight decay).

# 1x1 reduction feeding the 3x3 branch.
layer {
  name: "icp2_reduction1"
  type: "Convolution"
  bottom: "icp2_in"
  top: "icp2_reduction1"
  param { lr_mult: 1 decay_mult: 1 }
  param { lr_mult: 2 decay_mult: 0 }
  convolution_param {
    num_output: 128
    pad: 0
    kernel_size: 1
    weight_filler { type: "xavier" }
    bias_filler { type: "constant" value: 0 }
  }
}
layer {
  name: "relu_icp2_reduction1"
  type: "ReLU"
  bottom: "icp2_reduction1"
  top: "icp2_reduction1"
}
# 1x1 reduction feeding the 5x5 branch.
layer {
  name: "icp2_reduction2"
  type: "Convolution"
  bottom: "icp2_in"
  top: "icp2_reduction2"
  param { lr_mult: 1 decay_mult: 1 }
  param { lr_mult: 2 decay_mult: 0 }
  convolution_param {
    num_output: 32
    pad: 0
    kernel_size: 1
    weight_filler { type: "xavier" }
    bias_filler { type: "constant" value: 0 }
  }
}
layer {
  name: "relu_icp2_reduction2"
  type: "ReLU"
  bottom: "icp2_reduction2"
  top: "icp2_reduction2"
}
# 3x3 stride-1 max-pool feeding the pool-projection branch.
layer {
  name: "icp2_pool"
  type: "Pooling"
  bottom: "icp2_in"
  top: "icp2_pool"
  pooling_param {
    pool: MAX
    kernel_size: 3
    stride: 1
    pad: 1
  }
}

# ***********
# Branch 0: 1x1 convolution directly on the module input.
layer {
  name: "icp2_out0"
  type: "Convolution"
  bottom: "icp2_in"
  top: "icp2_out0"
  param { lr_mult: 1 decay_mult: 1 }
  param { lr_mult: 2 decay_mult: 0 }
  convolution_param {
    num_output: 128
    pad: 0
    kernel_size: 1
    weight_filler { type: "xavier" }
    bias_filler { type: "constant" value: 0 }
  }
}
layer {
  name: "relu_icp2_out0"
  type: "ReLU"
  bottom: "icp2_out0"
  top: "icp2_out0"
}
# Branch 1: 3x3 convolution on the first reduction.
layer {
  name: "icp2_out1"
  type: "Convolution"
  bottom: "icp2_reduction1"
  top: "icp2_out1"
  param { lr_mult: 1 decay_mult: 1 }
  param { lr_mult: 2 decay_mult: 0 }
  convolution_param {
    num_output: 192
    pad: 1
    kernel_size: 3
    weight_filler {
      type: "xavier"
      #std: 0.04
    }
    bias_filler { type: "constant" value: 0 }
  }
}
layer {
  name: "relu_icp2_out1"
  type: "ReLU"
  bottom: "icp2_out1"
  top: "icp2_out1"
}
# Branch 2: 5x5 convolution on the second reduction.
layer {
  name: "icp2_out2"
  type: "Convolution"
  bottom: "icp2_reduction2"
  top: "icp2_out2"
  param { lr_mult: 1 decay_mult: 1 }
  param { lr_mult: 2 decay_mult: 0 }
  convolution_param {
    num_output: 96
    pad: 2
    kernel_size: 5
    weight_filler {
      type: "xavier"
      #std: 0.08
    }
    bias_filler { type: "constant" value: 0 }
  }
}
layer {
  name: "relu_icp2_out2"
  type: "ReLU"
  bottom: "icp2_out2"
  top: "icp2_out2"
}
# Branch 3: 1x1 projection of the pooled input.
layer {
  name: "icp2_out3"
  type: "Convolution"
  bottom: "icp2_pool"
  top: "icp2_out3"
  param { lr_mult: 1 decay_mult: 1 }
  param { lr_mult: 2 decay_mult: 0 }
  convolution_param {
    num_output: 64
    pad: 0
    kernel_size: 1
    weight_filler { type: "xavier" }
    bias_filler { type: "constant" value: 0 }
  }
}
layer {
  name: "relu_icp2_out3"
  type: "ReLU"
  bottom: "icp2_out3"
  top: "icp2_out3"
}

# Concat them together
layer {
  name: "icp2_out"
  type: "Concat"
  bottom: "icp2_out0"
  bottom: "icp2_out1"
  bottom: "icp2_out2"
  bottom: "icp2_out3"
  top: "icp2_out"
}
# Downsample between module 2 and module 3.
layer {
  name: "icp3_in"
  type: "Pooling"
  bottom: "icp2_out"
  top: "icp3_in"
  pooling_param {
    pool: MAX
    kernel_size: 3
    stride: 2
    pad: 0
  }
}

# Inception module 3 ***************
# Input: icp3_in. Four parallel branches (1x1; 1x1->3x3; 1x1->5x5; pool->1x1)
# are concatenated into icp3_out.
# NOTE(review): the pasted text had no `name:` fields and was missing the
# closing braces of the nested messages/layers; both are restored here. Names
# are derived from the top blobs -- substitute the originals if you have them.
# In every learnable layer the first `param` applies to the weights and the
# second to the biases (2x learning rate, no weight decay).

# 1x1 reduction feeding the 3x3 branch.
layer {
  name: "icp3_reduction1"
  type: "Convolution"
  bottom: "icp3_in"
  top: "icp3_reduction1"
  param { lr_mult: 1 decay_mult: 1 }
  param { lr_mult: 2 decay_mult: 0 }
  convolution_param {
    num_output: 112
    pad: 0
    kernel_size: 1
    weight_filler { type: "xavier" }
    bias_filler { type: "constant" value: 0 }
  }
}
layer {
  name: "relu_icp3_reduction1"
  type: "ReLU"
  bottom: "icp3_reduction1"
  top: "icp3_reduction1"
}
# 1x1 reduction feeding the 5x5 branch.
layer {
  name: "icp3_reduction2"
  type: "Convolution"
  bottom: "icp3_in"
  top: "icp3_reduction2"
  param { lr_mult: 1 decay_mult: 1 }
  param { lr_mult: 2 decay_mult: 0 }
  convolution_param {
    num_output: 24
    pad: 0
    kernel_size: 1
    weight_filler { type: "xavier" }
    bias_filler { type: "constant" value: 0 }
  }
}
layer {
  name: "relu_icp3_reduction2"
  type: "ReLU"
  bottom: "icp3_reduction2"
  top: "icp3_reduction2"
}
# 3x3 stride-1 max-pool feeding the pool-projection branch.
layer {
  name: "icp3_pool"
  type: "Pooling"
  bottom: "icp3_in"
  top: "icp3_pool"
  pooling_param {
    pool: MAX
    kernel_size: 3
    stride: 1
    pad: 1
  }
}

# ***********
# Branch 0: 1x1 convolution directly on the module input.
layer {
  name: "icp3_out0"
  type: "Convolution"
  bottom: "icp3_in"
  top: "icp3_out0"
  param { lr_mult: 1 decay_mult: 1 }
  param { lr_mult: 2 decay_mult: 0 }
  convolution_param {
    num_output: 160
    pad: 0
    kernel_size: 1
    weight_filler { type: "xavier" }
    bias_filler { type: "constant" value: 0 }
  }
}
layer {
  name: "relu_icp3_out0"
  type: "ReLU"
  bottom: "icp3_out0"
  top: "icp3_out0"
}
# Branch 1: 3x3 convolution on the first reduction.
layer {
  name: "icp3_out1"
  type: "Convolution"
  bottom: "icp3_reduction1"
  top: "icp3_out1"
  param { lr_mult: 1 decay_mult: 1 }
  param { lr_mult: 2 decay_mult: 0 }
  convolution_param {
    num_output: 224
    pad: 1
    kernel_size: 3
    weight_filler {
      type: "xavier"
      #std: 0.04
    }
    bias_filler { type: "constant" value: 0 }
  }
}
layer {
  name: "relu_icp3_out1"
  type: "ReLU"
  bottom: "icp3_out1"
  top: "icp3_out1"
}
# Branch 2: 5x5 convolution on the second reduction.
layer {
  name: "icp3_out2"
  type: "Convolution"
  bottom: "icp3_reduction2"
  top: "icp3_out2"
  param { lr_mult: 1 decay_mult: 1 }
  param { lr_mult: 2 decay_mult: 0 }
  convolution_param {
    num_output: 64
    pad: 2
    kernel_size: 5
    weight_filler {
      type: "xavier"
      #std: 0.08
    }
    bias_filler { type: "constant" value: 0 }
  }
}
layer {
  name: "relu_icp3_out2"
  type: "ReLU"
  bottom: "icp3_out2"
  top: "icp3_out2"
}
# Branch 3: 1x1 projection of the pooled input.
layer {
  name: "icp3_out3"
  type: "Convolution"
  bottom: "icp3_pool"
  top: "icp3_out3"
  param { lr_mult: 1 decay_mult: 1 }
  param { lr_mult: 2 decay_mult: 0 }
  convolution_param {
    num_output: 64
    pad: 0
    kernel_size: 1
    weight_filler { type: "xavier" }
    bias_filler { type: "constant" value: 0 }
  }
}
layer {
  name: "relu_icp3_out3"
  type: "ReLU"
  bottom: "icp3_out3"
  top: "icp3_out3"
}

# Concat them together
layer {
  name: "icp3_out"
  type: "Concat"
  bottom: "icp3_out0"
  bottom: "icp3_out1"
  bottom: "icp3_out2"
  bottom: "icp3_out3"
  top: "icp3_out"
}

# Inception module 4 ***************
# Input: icp3_out. Four parallel branches (1x1; 1x1->3x3; 1x1->5x5; pool->1x1)
# are concatenated into icp4_out.
# NOTE(review): the pasted text had no `name:` fields and was missing the
# closing braces of the nested messages/layers; both are restored here. Names
# are derived from the top blobs -- substitute the originals if you have them.
# In every learnable layer the first `param` applies to the weights and the
# second to the biases (2x learning rate, no weight decay).

# 1x1 reduction feeding the 3x3 branch.
layer {
  name: "icp4_reduction1"
  type: "Convolution"
  bottom: "icp3_out"
  top: "icp4_reduction1"
  param { lr_mult: 1 decay_mult: 1 }
  param { lr_mult: 2 decay_mult: 0 }
  convolution_param {
    num_output: 160
    pad: 0
    kernel_size: 1
    weight_filler { type: "xavier" }
    bias_filler { type: "constant" value: 0 }
  }
}
layer {
  name: "relu_icp4_reduction1"
  type: "ReLU"
  bottom: "icp4_reduction1"
  top: "icp4_reduction1"
}
# 1x1 reduction feeding the 5x5 branch.
layer {
  name: "icp4_reduction2"
  type: "Convolution"
  bottom: "icp3_out"
  top: "icp4_reduction2"
  param { lr_mult: 1 decay_mult: 1 }
  param { lr_mult: 2 decay_mult: 0 }
  convolution_param {
    num_output: 32
    pad: 0
    kernel_size: 1
    weight_filler { type: "xavier" }
    bias_filler { type: "constant" value: 0 }
  }
}
layer {
  name: "relu_icp4_reduction2"
  type: "ReLU"
  bottom: "icp4_reduction2"
  top: "icp4_reduction2"
}
# 3x3 stride-1 max-pool feeding the pool-projection branch.
layer {
  name: "icp4_pool"
  type: "Pooling"
  bottom: "icp3_out"
  top: "icp4_pool"
  pooling_param {
    pool: MAX
    kernel_size: 3
    stride: 1
    pad: 1
  }
}

# ***********
# Branch 0: 1x1 convolution directly on the module input.
layer {
  name: "icp4_out0"
  type: "Convolution"
  bottom: "icp3_out"
  top: "icp4_out0"
  param { lr_mult: 1 decay_mult: 1 }
  param { lr_mult: 2 decay_mult: 0 }
  convolution_param {
    num_output: 256
    pad: 0
    kernel_size: 1
    weight_filler { type: "xavier" }
    bias_filler { type: "constant" value: 0 }
  }
}
layer {
  name: "relu_icp4_out0"
  type: "ReLU"
  bottom: "icp4_out0"
  top: "icp4_out0"
}
# Branch 1: 3x3 convolution on the first reduction.
layer {
  name: "icp4_out1"
  type: "Convolution"
  bottom: "icp4_reduction1"
  top: "icp4_out1"
  param { lr_mult: 1 decay_mult: 1 }
  param { lr_mult: 2 decay_mult: 0 }
  convolution_param {
    num_output: 320
    pad: 1
    kernel_size: 3
    weight_filler {
      type: "xavier"
      #std: 0.04
    }
    bias_filler { type: "constant" value: 0 }
  }
}
layer {
  name: "relu_icp4_out1"
  type: "ReLU"
  bottom: "icp4_out1"
  top: "icp4_out1"
}
# Branch 2: 5x5 convolution on the second reduction.
layer {
  name: "icp4_out2"
  type: "Convolution"
  bottom: "icp4_reduction2"
  top: "icp4_out2"
  param { lr_mult: 1 decay_mult: 1 }
  param { lr_mult: 2 decay_mult: 0 }
  convolution_param {
    num_output: 128
    pad: 2
    kernel_size: 5
    weight_filler {
      type: "xavier"
      #std: 0.08
    }
    bias_filler { type: "constant" value: 0 }
  }
}
layer {
  name: "relu_icp4_out2"
  type: "ReLU"
  bottom: "icp4_out2"
  top: "icp4_out2"
}
# Branch 3: 1x1 projection of the pooled input.
layer {
  name: "icp4_out3"
  type: "Convolution"
  bottom: "icp4_pool"
  top: "icp4_out3"
  param { lr_mult: 1 decay_mult: 1 }
  param { lr_mult: 2 decay_mult: 0 }
  convolution_param {
    num_output: 128
    pad: 0
    kernel_size: 1
    weight_filler { type: "xavier" }
    bias_filler { type: "constant" value: 0 }
  }
}
layer {
  name: "relu_icp4_out3"
  type: "ReLU"
  bottom: "icp4_out3"
  top: "icp4_out3"
}

# Concat them together
layer {
  name: "icp4_out"
  type: "Concat"
  bottom: "icp4_out0"
  bottom: "icp4_out1"
  bottom: "icp4_out2"
  bottom: "icp4_out3"
  top: "icp4_out"
}

# classification branch
# icp4_out -> 5x5/3 average pool -> 1x1 conv (128) -> ReLU -> FC 1024 -> ReLU
# -> dropout 0.4 -> FC 3755 (one output per character class).
# NOTE(review): the pasted text had no `name:` fields and was missing the
# closing braces of the nested messages/layers; both are restored here. Names
# are derived from the top blobs -- substitute the originals if you have them.
# In every learnable layer the first `param` applies to the weights and the
# second to the biases (2x learning rate, no weight decay).
layer {
  name: "cls3_pool"
  type: "Pooling"
  bottom: "icp4_out"
  top: "cls3_pool"
  pooling_param {
    pool: AVE
    kernel_size: 5
    stride: 3
    pad: 0
    # This padding is somewhat special
  }
}
layer {
  name: "cls3_reduction"
  type: "Convolution"
  bottom: "cls3_pool"
  top: "cls3_reduction"
  param { lr_mult: 1 decay_mult: 1 }
  param { lr_mult: 2 decay_mult: 0 }
  convolution_param {
    num_output: 128
    pad: 0
    kernel_size: 1
    weight_filler { type: "xavier" }
    bias_filler { type: "constant" value: 0 }
  }
}
layer {
  name: "relu_cls3_reduction"
  type: "ReLU"
  bottom: "cls3_reduction"
  top: "cls3_reduction"
}
layer {
  name: "cls3_fc1"
  type: "InnerProduct"
  bottom: "cls3_reduction"
  top: "cls3_fc1"
  param { lr_mult: 1 decay_mult: 1 }
  param { lr_mult: 2 decay_mult: 0 }
  inner_product_param {
    num_output: 1024
    weight_filler {
      type: "xavier"
      #std: 0.01
    }
    bias_filler { type: "constant" value: 0 }
  }
}
layer {
  name: "relu_cls3_fc1"
  type: "ReLU"
  bottom: "cls3_fc1"
  top: "cls3_fc1"
}
# In-place dropout on the 1024-d feature.
layer {
  name: "drop_cls3_fc1"
  type: "Dropout"
  bottom: "cls3_fc1"
  top: "cls3_fc1"
  dropout_param {
    dropout_ratio: 0.4
  }
}
# Final classifier: num_output must equal the number of label classes (3755).
layer {
  name: "cls3_fc2"
  type: "InnerProduct"
  bottom: "cls3_fc1"
  top: "cls3_fc2"
  param { lr_mult: 1 decay_mult: 1 }
  param { lr_mult: 2 decay_mult: 0 }
  inner_product_param {
    num_output: 3755
    weight_filler {
      type: "xavier"
      #std: 0.01
    }
    bias_filler { type: "constant" value: 0 }
  }
}

# Training objective and test-time metrics computed on the cls3_fc2 scores.
# NOTE(review): the pasted text had no `name:` fields and left both Accuracy
# layers unclosed; both are restored here. Names are derived from the top
# blobs -- substitute the originals if you have them.
# NOTE(review): ln(3755) ~= 8.23, so a softmax loss that plateaus near 8.2 on
# this 3755-way output equals uniform (chance-level) prediction, i.e. the
# network is not learning anything from the inputs yet.
layer {
  name: "loss"
  type: "SoftmaxWithLoss"
  bottom: "cls3_fc2"
  bottom: "label"
  top: "loss"
  loss_weight: 1
}
# Top-1 accuracy, evaluated in the TEST phase only.
layer {
  name: "top-1"
  type: "Accuracy"
  bottom: "cls3_fc2"
  bottom: "label"
  top: "top-1"
  include {
    phase: TEST
  }
}
# Top-5 accuracy, evaluated in the TEST phase only.
layer {
  name: "top-5"
  type: "Accuracy"
  bottom: "cls3_fc2"
  bottom: "label"
  top: "top-5"
  include {
    phase: TEST
  }
  accuracy_param {
    top_k: 5
  }
}

Reply all

Reply to author

Forward

0 new messages