I am training a classifier for printed Chinese characters. My dataset includes 3755 categories, and for each category I have 512 images. I tried to train on my data using AlexNet and GoogLeNet, but the weird thing is that during training the loss always gets stuck at about 8.2 with both networks. I tried setting the learning rate to a smaller value, 0.0000001; the loss then starts at around 15 and begins to decrease. However, once the loss gets close to 8 it gets stuck there again, and I cannot figure out where the problem is. I trained and tested the networks with another dataset and found that the loss does decrease to a small number, so I do not know whether the cause is my training data. All of my data is handmade by image augmentation (rotation, blur, added noise, color changes, etc.) from one original image. Below are my network, the training log, and some samples of my data.
# Network definition (Caffe prototxt).
name: "GoogleNet"

# ---------------------------------------------------------------------------
# Input layers. Both are named "data" and produce the same tops ("data" and
# "label"); the `include { phase: ... }` rule selects which one is active.
# ---------------------------------------------------------------------------

# Training input, read from the LMDB database "new_train_lmdb".
layer {
  name: "data"
  type: "Data"
  top: "data"
  top: "label"
  include { phase: TRAIN }
  transform_param {
    mirror: false
    # One mean value per input channel, subtracted from every pixel.
    mean_value: 104
    mean_value: 117
    mean_value: 123
  }
  data_param {
    source: "new_train_lmdb"
    backend: LMDB
    batch_size: 10
  }
}

# Validation input, read from the LMDB database "new_val_lmdb".
# Uses the same preprocessing as the training input.
layer {
  name: "data"
  type: "Data"
  top: "data"
  top: "label"
  include { phase: TEST }
  transform_param {
    mirror: false
    mean_value: 104
    mean_value: 117
    mean_value: 123
  }
  data_param {
    source: "new_val_lmdb"
    backend: LMDB
    batch_size: 20
  }
}
# ---------------------------------------------------------------------------
# Stem: conv1 -> pool1 -> norm1 -> reduction2 (1x1) -> conv2 (3x3) -> norm2
#       -> pool2
# In every learnable layer the first `param` block applies to the weights
# and the second to the bias (Caffe convention).
# ---------------------------------------------------------------------------

# 7x7 convolution, stride 2, 64 filters, no padding.
layer {
  name: "conv1"
  type: "Convolution"
  bottom: "data"
  top: "conv1"
  param { lr_mult: 1 decay_mult: 1 }
  param { lr_mult: 2 decay_mult: 0 }
  convolution_param {
    num_output: 64
    kernel_size: 7
    stride: 2
    pad: 0
    weight_filler {
      type: "xavier"
      # std: 0.015
    }
    bias_filler { type: "constant" value: 0 }
  }
}
# In-place ReLU on conv1.
layer { name: "relu1" type: "ReLU" bottom: "conv1" top: "conv1" }

# 3x3 max pooling, stride 2.
layer {
  name: "pool1"
  type: "Pooling"
  bottom: "conv1"
  top: "pool1"
  pooling_param { pool: MAX kernel_size: 3 stride: 2 pad: 0 }
}

# Local response normalization over 5 neighbouring entries.
layer {
  name: "norm1"
  type: "LRN"
  bottom: "pool1"
  top: "norm1"
  lrn_param { local_size: 5 alpha: 0.0001 beta: 0.75 }
}

# 1x1 convolution, 64 filters, applied before conv2.
layer {
  name: "reduction2"
  type: "Convolution"
  bottom: "norm1"
  top: "reduction2"
  param { lr_mult: 1 decay_mult: 1 }
  param { lr_mult: 2 decay_mult: 0 }
  convolution_param {
    num_output: 64
    kernel_size: 1
    pad: 0
    group: 1
    weight_filler { type: "xavier" }
    bias_filler { type: "constant" value: 0 }
  }
}
# In-place ReLU on reduction2.
layer { name: "relu_reduction2" type: "ReLU" bottom: "reduction2" top: "reduction2" }

# 3x3 convolution, 192 filters, padding 1.
layer {
  name: "conv2"
  type: "Convolution"
  bottom: "reduction2"
  top: "conv2"
  param { lr_mult: 1 decay_mult: 1 }
  param { lr_mult: 2 decay_mult: 0 }
  convolution_param {
    num_output: 192
    kernel_size: 3
    pad: 1
    group: 1
    weight_filler {
      type: "xavier"
      # std: 0.02
    }
    bias_filler { type: "constant" value: 0 }
  }
}
# In-place ReLU on conv2.
layer { name: "relu2" type: "ReLU" bottom: "conv2" top: "conv2" }

# Second local response normalization (same settings as norm1).
layer {
  name: "norm2"
  type: "LRN"
  bottom: "conv2"
  top: "norm2"
  lrn_param { local_size: 5 alpha: 0.0001 beta: 0.75 }
}

# 3x3 max pooling, stride 2; its output feeds inception module 1.
layer {
  name: "pool2"
  type: "Pooling"
  bottom: "norm2"
  top: "pool2"
  pooling_param { pool: MAX kernel_size: 3 stride: 2 pad: 0 }
}
# ---------------------------------------------------------------------------
# Inception module 1 (input: pool2, output: icp2_in)
#   branch 0: 1x1 conv (64)
#   branch 1: 1x1 conv (96)  -> 3x3 conv (128)
#   branch 2: 1x1 conv (16)  -> 5x5 conv (32)
#   branch 3: 3x3 max pool   -> 1x1 conv (32)
# Every convolution is followed by an in-place ReLU.
# ---------------------------------------------------------------------------

# 1x1 convolution ahead of the 3x3 branch.
layer {
  name: "icp1_reduction1"
  type: "Convolution"
  bottom: "pool2"
  top: "icp1_reduction1"
  param { lr_mult: 1 decay_mult: 1 }
  param { lr_mult: 2 decay_mult: 0 }
  convolution_param {
    num_output: 96
    kernel_size: 1
    pad: 0
    weight_filler { type: "xavier" }
    bias_filler { type: "constant" value: 0 }
  }
}
layer { name: "relu_icp1_reduction1" type: "ReLU" bottom: "icp1_reduction1" top: "icp1_reduction1" }

# 1x1 convolution ahead of the 5x5 branch.
layer {
  name: "icp1_reduction2"
  type: "Convolution"
  bottom: "pool2"
  top: "icp1_reduction2"
  param { lr_mult: 1 decay_mult: 1 }
  param { lr_mult: 2 decay_mult: 0 }
  convolution_param {
    num_output: 16
    kernel_size: 1
    pad: 0
    weight_filler { type: "xavier" }
    bias_filler { type: "constant" value: 0 }
  }
}
layer { name: "relu_icp1_reduction2" type: "ReLU" bottom: "icp1_reduction2" top: "icp1_reduction2" }

# 3x3 max pooling with stride 1 and padding 1 for the pooling branch.
layer {
  name: "icp1_pool"
  type: "Pooling"
  bottom: "pool2"
  top: "icp1_pool"
  pooling_param { pool: MAX kernel_size: 3 stride: 1 pad: 1 }
}

# --- branch outputs ---

# Branch 0: direct 1x1 convolution.
layer {
  name: "icp1_out0"
  type: "Convolution"
  bottom: "pool2"
  top: "icp1_out0"
  param { lr_mult: 1 decay_mult: 1 }
  param { lr_mult: 2 decay_mult: 0 }
  convolution_param {
    num_output: 64
    kernel_size: 1
    pad: 0
    weight_filler { type: "xavier" }
    bias_filler { type: "constant" value: 0 }
  }
}
layer { name: "relu_icp1_out0" type: "ReLU" bottom: "icp1_out0" top: "icp1_out0" }

# Branch 1: 3x3 convolution, padding 1.
layer {
  name: "icp1_out1"
  type: "Convolution"
  bottom: "icp1_reduction1"
  top: "icp1_out1"
  param { lr_mult: 1 decay_mult: 1 }
  param { lr_mult: 2 decay_mult: 0 }
  convolution_param {
    num_output: 128
    kernel_size: 3
    pad: 1
    weight_filler {
      type: "xavier"
      # std: 0.04
    }
    bias_filler { type: "constant" value: 0 }
  }
}
layer { name: "relu_icp1_out1" type: "ReLU" bottom: "icp1_out1" top: "icp1_out1" }

# Branch 2: 5x5 convolution, padding 2.
layer {
  name: "icp1_out2"
  type: "Convolution"
  bottom: "icp1_reduction2"
  top: "icp1_out2"
  param { lr_mult: 1 decay_mult: 1 }
  param { lr_mult: 2 decay_mult: 0 }
  convolution_param {
    num_output: 32
    kernel_size: 5
    pad: 2
    weight_filler {
      type: "xavier"
      # std: 0.08
    }
    bias_filler { type: "constant" value: 0 }
  }
}
layer { name: "relu_icp1_out2" type: "ReLU" bottom: "icp1_out2" top: "icp1_out2" }

# Branch 3: 1x1 convolution on the pooled input.
layer {
  name: "icp1_out3"
  type: "Convolution"
  bottom: "icp1_pool"
  top: "icp1_out3"
  param { lr_mult: 1 decay_mult: 1 }
  param { lr_mult: 2 decay_mult: 0 }
  convolution_param {
    num_output: 32
    kernel_size: 1
    pad: 0
    weight_filler { type: "xavier" }
    bias_filler { type: "constant" value: 0 }
  }
}
layer { name: "relu_icp1_out3" type: "ReLU" bottom: "icp1_out3" top: "icp1_out3" }

# Join the four branches (64 + 128 + 32 + 32 filters) into the input of
# inception module 2.
layer {
  name: "icp2_in"
  type: "Concat"
  bottom: "icp1_out0"
  bottom: "icp1_out1"
  bottom: "icp1_out2"
  bottom: "icp1_out3"
  top: "icp2_in"
}
# ---------------------------------------------------------------------------
# Inception module 2 (input: icp2_in, output: icp2_out, then pooled to
# icp3_in)
#   branch 0: 1x1 conv (128)
#   branch 1: 1x1 conv (128) -> 3x3 conv (192)
#   branch 2: 1x1 conv (32)  -> 5x5 conv (96)
#   branch 3: 3x3 max pool   -> 1x1 conv (64)
# Every convolution is followed by an in-place ReLU.
# ---------------------------------------------------------------------------

# 1x1 convolution ahead of the 3x3 branch.
layer {
  name: "icp2_reduction1"
  type: "Convolution"
  bottom: "icp2_in"
  top: "icp2_reduction1"
  param { lr_mult: 1 decay_mult: 1 }
  param { lr_mult: 2 decay_mult: 0 }
  convolution_param {
    num_output: 128
    kernel_size: 1
    pad: 0
    weight_filler { type: "xavier" }
    bias_filler { type: "constant" value: 0 }
  }
}
layer { name: "relu_icp2_reduction1" type: "ReLU" bottom: "icp2_reduction1" top: "icp2_reduction1" }

# 1x1 convolution ahead of the 5x5 branch.
layer {
  name: "icp2_reduction2"
  type: "Convolution"
  bottom: "icp2_in"
  top: "icp2_reduction2"
  param { lr_mult: 1 decay_mult: 1 }
  param { lr_mult: 2 decay_mult: 0 }
  convolution_param {
    num_output: 32
    kernel_size: 1
    pad: 0
    weight_filler { type: "xavier" }
    bias_filler { type: "constant" value: 0 }
  }
}
layer { name: "relu_icp2_reduction2" type: "ReLU" bottom: "icp2_reduction2" top: "icp2_reduction2" }

# 3x3 max pooling with stride 1 and padding 1 for the pooling branch.
layer {
  name: "icp2_pool"
  type: "Pooling"
  bottom: "icp2_in"
  top: "icp2_pool"
  pooling_param { pool: MAX kernel_size: 3 stride: 1 pad: 1 }
}

# --- branch outputs ---

# Branch 0: direct 1x1 convolution.
layer {
  name: "icp2_out0"
  type: "Convolution"
  bottom: "icp2_in"
  top: "icp2_out0"
  param { lr_mult: 1 decay_mult: 1 }
  param { lr_mult: 2 decay_mult: 0 }
  convolution_param {
    num_output: 128
    kernel_size: 1
    pad: 0
    weight_filler { type: "xavier" }
    bias_filler { type: "constant" value: 0 }
  }
}
layer { name: "relu_icp2_out0" type: "ReLU" bottom: "icp2_out0" top: "icp2_out0" }

# Branch 1: 3x3 convolution, padding 1.
layer {
  name: "icp2_out1"
  type: "Convolution"
  bottom: "icp2_reduction1"
  top: "icp2_out1"
  param { lr_mult: 1 decay_mult: 1 }
  param { lr_mult: 2 decay_mult: 0 }
  convolution_param {
    num_output: 192
    kernel_size: 3
    pad: 1
    weight_filler {
      type: "xavier"
      # std: 0.04
    }
    bias_filler { type: "constant" value: 0 }
  }
}
layer { name: "relu_icp2_out1" type: "ReLU" bottom: "icp2_out1" top: "icp2_out1" }

# Branch 2: 5x5 convolution, padding 2.
layer {
  name: "icp2_out2"
  type: "Convolution"
  bottom: "icp2_reduction2"
  top: "icp2_out2"
  param { lr_mult: 1 decay_mult: 1 }
  param { lr_mult: 2 decay_mult: 0 }
  convolution_param {
    num_output: 96
    kernel_size: 5
    pad: 2
    weight_filler {
      type: "xavier"
      # std: 0.08
    }
    bias_filler { type: "constant" value: 0 }
  }
}
layer { name: "relu_icp2_out2" type: "ReLU" bottom: "icp2_out2" top: "icp2_out2" }

# Branch 3: 1x1 convolution on the pooled input.
layer {
  name: "icp2_out3"
  type: "Convolution"
  bottom: "icp2_pool"
  top: "icp2_out3"
  param { lr_mult: 1 decay_mult: 1 }
  param { lr_mult: 2 decay_mult: 0 }
  convolution_param {
    num_output: 64
    kernel_size: 1
    pad: 0
    weight_filler { type: "xavier" }
    bias_filler { type: "constant" value: 0 }
  }
}
layer { name: "relu_icp2_out3" type: "ReLU" bottom: "icp2_out3" top: "icp2_out3" }

# Join the four branches (128 + 192 + 96 + 64 filters).
layer {
  name: "icp2_out"
  type: "Concat"
  bottom: "icp2_out0"
  bottom: "icp2_out1"
  bottom: "icp2_out2"
  bottom: "icp2_out3"
  top: "icp2_out"
}

# 3x3 max pooling, stride 2, between inception modules 2 and 3.
layer {
  name: "icp3_in"
  type: "Pooling"
  bottom: "icp2_out"
  top: "icp3_in"
  pooling_param { pool: MAX kernel_size: 3 stride: 2 pad: 0 }
}
# ---------------------------------------------------------------------------
# Inception module 3 (input: icp3_in, output: icp3_out)
#   branch 0: 1x1 conv (160)
#   branch 1: 1x1 conv (112) -> 3x3 conv (224)
#   branch 2: 1x1 conv (24)  -> 5x5 conv (64)
#   branch 3: 3x3 max pool   -> 1x1 conv (64)
# Every convolution is followed by an in-place ReLU.
# ---------------------------------------------------------------------------

# 1x1 convolution ahead of the 3x3 branch.
layer {
  name: "icp3_reduction1"
  type: "Convolution"
  bottom: "icp3_in"
  top: "icp3_reduction1"
  param { lr_mult: 1 decay_mult: 1 }
  param { lr_mult: 2 decay_mult: 0 }
  convolution_param {
    num_output: 112
    kernel_size: 1
    pad: 0
    weight_filler { type: "xavier" }
    bias_filler { type: "constant" value: 0 }
  }
}
layer { name: "relu_icp3_reduction1" type: "ReLU" bottom: "icp3_reduction1" top: "icp3_reduction1" }

# 1x1 convolution ahead of the 5x5 branch.
layer {
  name: "icp3_reduction2"
  type: "Convolution"
  bottom: "icp3_in"
  top: "icp3_reduction2"
  param { lr_mult: 1 decay_mult: 1 }
  param { lr_mult: 2 decay_mult: 0 }
  convolution_param {
    num_output: 24
    kernel_size: 1
    pad: 0
    weight_filler { type: "xavier" }
    bias_filler { type: "constant" value: 0 }
  }
}
layer { name: "relu_icp3_reduction2" type: "ReLU" bottom: "icp3_reduction2" top: "icp3_reduction2" }

# 3x3 max pooling with stride 1 and padding 1 for the pooling branch.
layer {
  name: "icp3_pool"
  type: "Pooling"
  bottom: "icp3_in"
  top: "icp3_pool"
  pooling_param { pool: MAX kernel_size: 3 stride: 1 pad: 1 }
}

# --- branch outputs ---

# Branch 0: direct 1x1 convolution.
layer {
  name: "icp3_out0"
  type: "Convolution"
  bottom: "icp3_in"
  top: "icp3_out0"
  param { lr_mult: 1 decay_mult: 1 }
  param { lr_mult: 2 decay_mult: 0 }
  convolution_param {
    num_output: 160
    kernel_size: 1
    pad: 0
    weight_filler { type: "xavier" }
    bias_filler { type: "constant" value: 0 }
  }
}
layer { name: "relu_icp3_out0" type: "ReLU" bottom: "icp3_out0" top: "icp3_out0" }

# Branch 1: 3x3 convolution, padding 1.
layer {
  name: "icp3_out1"
  type: "Convolution"
  bottom: "icp3_reduction1"
  top: "icp3_out1"
  param { lr_mult: 1 decay_mult: 1 }
  param { lr_mult: 2 decay_mult: 0 }
  convolution_param {
    num_output: 224
    kernel_size: 3
    pad: 1
    weight_filler {
      type: "xavier"
      # std: 0.04
    }
    bias_filler { type: "constant" value: 0 }
  }
}
layer { name: "relu_icp3_out1" type: "ReLU" bottom: "icp3_out1" top: "icp3_out1" }

# Branch 2: 5x5 convolution, padding 2.
layer {
  name: "icp3_out2"
  type: "Convolution"
  bottom: "icp3_reduction2"
  top: "icp3_out2"
  param { lr_mult: 1 decay_mult: 1 }
  param { lr_mult: 2 decay_mult: 0 }
  convolution_param {
    num_output: 64
    kernel_size: 5
    pad: 2
    weight_filler {
      type: "xavier"
      # std: 0.08
    }
    bias_filler { type: "constant" value: 0 }
  }
}
layer { name: "relu_icp3_out2" type: "ReLU" bottom: "icp3_out2" top: "icp3_out2" }

# Branch 3: 1x1 convolution on the pooled input.
layer {
  name: "icp3_out3"
  type: "Convolution"
  bottom: "icp3_pool"
  top: "icp3_out3"
  param { lr_mult: 1 decay_mult: 1 }
  param { lr_mult: 2 decay_mult: 0 }
  convolution_param {
    num_output: 64
    kernel_size: 1
    pad: 0
    weight_filler { type: "xavier" }
    bias_filler { type: "constant" value: 0 }
  }
}
layer { name: "relu_icp3_out3" type: "ReLU" bottom: "icp3_out3" top: "icp3_out3" }

# Join the four branches (160 + 224 + 64 + 64 filters).
layer {
  name: "icp3_out"
  type: "Concat"
  bottom: "icp3_out0"
  bottom: "icp3_out1"
  bottom: "icp3_out2"
  bottom: "icp3_out3"
  top: "icp3_out"
}
# ---------------------------------------------------------------------------
# Inception module 4 (input: icp3_out, output: icp4_out)
#   branch 0: 1x1 conv (256)
#   branch 1: 1x1 conv (160) -> 3x3 conv (320)
#   branch 2: 1x1 conv (32)  -> 5x5 conv (128)
#   branch 3: 3x3 max pool   -> 1x1 conv (128)
# Every convolution is followed by an in-place ReLU.
# ---------------------------------------------------------------------------

# 1x1 convolution ahead of the 3x3 branch.
layer {
  name: "icp4_reduction1"
  type: "Convolution"
  bottom: "icp3_out"
  top: "icp4_reduction1"
  param { lr_mult: 1 decay_mult: 1 }
  param { lr_mult: 2 decay_mult: 0 }
  convolution_param {
    num_output: 160
    kernel_size: 1
    pad: 0
    weight_filler { type: "xavier" }
    bias_filler { type: "constant" value: 0 }
  }
}
layer { name: "relu_icp4_reduction1" type: "ReLU" bottom: "icp4_reduction1" top: "icp4_reduction1" }

# 1x1 convolution ahead of the 5x5 branch.
layer {
  name: "icp4_reduction2"
  type: "Convolution"
  bottom: "icp3_out"
  top: "icp4_reduction2"
  param { lr_mult: 1 decay_mult: 1 }
  param { lr_mult: 2 decay_mult: 0 }
  convolution_param {
    num_output: 32
    kernel_size: 1
    pad: 0
    weight_filler { type: "xavier" }
    bias_filler { type: "constant" value: 0 }
  }
}
layer { name: "relu_icp4_reduction2" type: "ReLU" bottom: "icp4_reduction2" top: "icp4_reduction2" }

# 3x3 max pooling with stride 1 and padding 1 for the pooling branch.
layer {
  name: "icp4_pool"
  type: "Pooling"
  bottom: "icp3_out"
  top: "icp4_pool"
  pooling_param { pool: MAX kernel_size: 3 stride: 1 pad: 1 }
}

# --- branch outputs ---

# Branch 0: direct 1x1 convolution.
layer {
  name: "icp4_out0"
  type: "Convolution"
  bottom: "icp3_out"
  top: "icp4_out0"
  param { lr_mult: 1 decay_mult: 1 }
  param { lr_mult: 2 decay_mult: 0 }
  convolution_param {
    num_output: 256
    kernel_size: 1
    pad: 0
    weight_filler { type: "xavier" }
    bias_filler { type: "constant" value: 0 }
  }
}
layer { name: "relu_icp4_out0" type: "ReLU" bottom: "icp4_out0" top: "icp4_out0" }

# Branch 1: 3x3 convolution, padding 1.
layer {
  name: "icp4_out1"
  type: "Convolution"
  bottom: "icp4_reduction1"
  top: "icp4_out1"
  param { lr_mult: 1 decay_mult: 1 }
  param { lr_mult: 2 decay_mult: 0 }
  convolution_param {
    num_output: 320
    kernel_size: 3
    pad: 1
    weight_filler {
      type: "xavier"
      # std: 0.04
    }
    bias_filler { type: "constant" value: 0 }
  }
}
layer { name: "relu_icp4_out1" type: "ReLU" bottom: "icp4_out1" top: "icp4_out1" }

# Branch 2: 5x5 convolution, padding 2.
layer {
  name: "icp4_out2"
  type: "Convolution"
  bottom: "icp4_reduction2"
  top: "icp4_out2"
  param { lr_mult: 1 decay_mult: 1 }
  param { lr_mult: 2 decay_mult: 0 }
  convolution_param {
    num_output: 128
    kernel_size: 5
    pad: 2
    weight_filler {
      type: "xavier"
      # std: 0.08
    }
    bias_filler { type: "constant" value: 0 }
  }
}
layer { name: "relu_icp4_out2" type: "ReLU" bottom: "icp4_out2" top: "icp4_out2" }

# Branch 3: 1x1 convolution on the pooled input.
layer {
  name: "icp4_out3"
  type: "Convolution"
  bottom: "icp4_pool"
  top: "icp4_out3"
  param { lr_mult: 1 decay_mult: 1 }
  param { lr_mult: 2 decay_mult: 0 }
  convolution_param {
    num_output: 128
    kernel_size: 1
    pad: 0
    weight_filler { type: "xavier" }
    bias_filler { type: "constant" value: 0 }
  }
}
layer { name: "relu_icp4_out3" type: "ReLU" bottom: "icp4_out3" top: "icp4_out3" }

# Join the four branches (256 + 320 + 128 + 128 filters).
layer {
  name: "icp4_out"
  type: "Concat"
  bottom: "icp4_out0"
  bottom: "icp4_out1"
  bottom: "icp4_out2"
  bottom: "icp4_out3"
  top: "icp4_out"
}
# ---------------------------------------------------------------------------
# Classification branch (input: icp4_out, output: cls3_fc2)
#   5x5 average pool -> 1x1 conv (128) -> FC (1024) -> dropout -> FC (3755)
# ---------------------------------------------------------------------------

# 5x5 average pooling, stride 3.
layer {
  name: "cls3_pool"
  type: "Pooling"
  bottom: "icp4_out"
  top: "cls3_pool"
  pooling_param {
    pool: AVE
    kernel_size: 5
    stride: 3
    # This padding is somewhat special
    pad: 0
  }
}

# 1x1 convolution, 128 filters.
layer {
  name: "cls3_reduction"
  type: "Convolution"
  bottom: "cls3_pool"
  top: "cls3_reduction"
  param { lr_mult: 1 decay_mult: 1 }
  param { lr_mult: 2 decay_mult: 0 }
  convolution_param {
    num_output: 128
    kernel_size: 1
    pad: 0
    weight_filler { type: "xavier" }
    bias_filler { type: "constant" value: 0 }
  }
}
layer { name: "relu_cls3_reduction" type: "ReLU" bottom: "cls3_reduction" top: "cls3_reduction" }

# First fully connected layer, 1024 outputs.
layer {
  name: "cls3_fc1"
  type: "InnerProduct"
  bottom: "cls3_reduction"
  top: "cls3_fc1"
  param { lr_mult: 1 decay_mult: 1 }
  param { lr_mult: 2 decay_mult: 0 }
  inner_product_param {
    num_output: 1024
    weight_filler {
      type: "xavier"
      # std: 0.01
    }
    bias_filler { type: "constant" value: 0 }
  }
}
layer { name: "relu_cls3_fc1" type: "ReLU" bottom: "cls3_fc1" top: "cls3_fc1" }

# In-place dropout on the first fully connected layer's output.
layer {
  name: "cls3_drop"
  type: "Dropout"
  bottom: "cls3_fc1"
  top: "cls3_fc1"
  dropout_param { dropout_ratio: 0.4 }
}

# Final fully connected layer: one output per character category (3755).
layer {
  name: "cls3_fc2"
  type: "InnerProduct"
  bottom: "cls3_fc1"
  top: "cls3_fc2"
  param { lr_mult: 1 decay_mult: 1 }
  param { lr_mult: 2 decay_mult: 0 }
  inner_product_param {
    num_output: 3755
    weight_filler {
      type: "xavier"
      # std: 0.01
    }
    bias_filler { type: "constant" value: 0 }
  }
}
# ---------------------------------------------------------------------------
# Objective and evaluation metrics, all computed from the cls3_fc2 scores
# and the ground-truth labels.
# ---------------------------------------------------------------------------

# Softmax + multinomial logistic loss; active in both phases.
# Reference point: with 3755 classes, a prediction that is uniform over all
# classes yields a loss of ln(3755) ~= 8.23.
layer {
  name: "loss"
  type: "SoftmaxWithLoss"
  bottom: "cls3_fc2"
  bottom: "label"
  top: "loss"
  loss_weight: 1
}

# Top-1 accuracy, evaluated only in the TEST phase.
layer {
  name: "top-1"
  type: "Accuracy"
  bottom: "cls3_fc2"
  bottom: "label"
  top: "top-1"
  include { phase: TEST }
}

# Top-5 accuracy, evaluated only in the TEST phase.
layer {
  name: "top-5"
  type: "Accuracy"
  bottom: "cls3_fc2"
  bottom: "label"
  top: "top-5"
  accuracy_param { top_k: 5 }
  include { phase: TEST }
}