Help, network is not learning. Training bvlc_reference_caffenet WITHOUT PADDING on multiple GPUs.


Jumabek Alikhanov

Sep 7, 2016, 9:44:27 PM
to Caffe Users
Hi,
I need to know why my network is not learning.

I wanted to train the bvlc_reference_caffenet model with one slight modification: I simply removed the padding from the convolutional layers (the full prototxt is at the end).

After almost 70K iterations, the training loss is still stuck at 6.9:


I0908 19:17:41.481381 11378 solver.cpp:404]     Test net output #0: accuracy = 0.001
I0908 19:17:41.481497 11378 solver.cpp:404]     Test net output #1: loss = 6.91003 (* 1 = 6.91003 loss)
I0908 19:17:41.623545 11378 solver.cpp:228] Iteration 67000, loss = 6.90454
I0908 19:17:41.623757 11378 solver.cpp:244]     Train net output #0: loss = 6.90454 (* 1 = 6.90454 loss)
I0908 19:17:42.265553 11378 sgd_solver.cpp:106] Iteration 67000, lr = 0.01
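
Side note: 6.91 is almost exactly chance level for 1000 ImageNet classes, since -ln(1/1000) ≈ 6.9078, so the net is effectively guessing uniformly at random. A one-line Python check:

import math

# Cross-entropy loss of a classifier that assigns uniform probability
# 1/1000 to each of the 1000 ImageNet classes
num_classes = 1000
chance_loss = -math.log(1.0 / num_classes)
print(chance_loss)  # 6.9078 -- matches the stuck loss of ~6.91 above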

 
I have two GPUs:
  • Titan X
  • Tesla K40

The machine has 24 GB of RAM.

Thanks a lot!

Below is my network prototxt:


name: "CaffeNet"
layer
{
  name
: "data"
  type
: "Data"
  top
: "data"
  top
: "label"
  include
{
    phase
: TRAIN
 
}
  transform_param
{
    mirror
: true
    crop_size
: 227
    mean_file
: "/media/ailab/Data/imagenet/imagenet_mean.binaryproto"
 
}
# mean pixel / channel-wise mean instead of mean image
#  transform_param {
#    crop_size: 227
#    mean_value: 104
#    mean_value: 117
#    mean_value: 123
#    mirror: true
#  }
  data_param
{
    source
: "/media/ailab/Data/imagenet/ilsvrc12_train_lmdb"
    batch_size
: 256
    backend
: LMDB
 
}
}
layer
{
  name
: "data"
  type
: "Data"
  top
: "data"
  top
: "label"
  include
{
    phase
: TEST
 
}
  transform_param
{
    mirror
: false
    crop_size
: 227
    mean_file
: "/media/ailab/Data/imagenet/imagenet_mean.binaryproto"
 
}
# mean pixel / channel-wise mean instead of mean image
#  transform_param {
#    crop_size: 227
#    mean_value: 104
#    mean_value: 117
#    mean_value: 123
#    mirror: false
#  }
  data_param
{
    source
: "/media/ailab/Data/imagenet/ilsvrc12_val_lmdb"
    batch_size
: 50
    backend
: LMDB
 
}
}
layer
{
  name
: "conv1"
  type
: "Convolution"
  bottom
: "data"
  top
: "conv1"
  param
{
    lr_mult
: 1
    decay_mult
: 1
 
}
  param
{
    lr_mult
: 2
    decay_mult
: 0
 
}
  convolution_param
{
    num_output
: 96
    kernel_size
: 11
    stride
: 4
    weight_filler
{
      type
: "gaussian"
      std
: 0.01
   
}
    bias_filler
{
      type
: "constant"
      value
: 0
   
}
 
}
}
layer
{
  name
: "relu1"
  type
: "ReLU"
  bottom
: "conv1"
  top
: "conv1"
}
layer
{
  name
: "pool1"
  type
: "Pooling"
  bottom
: "conv1"
  top
: "pool1"
  pooling_param
{
    pool
: MAX
    kernel_size
: 3
    stride
: 2
 
}
}
layer
{
  name
: "norm1"
  type
: "LRN"
  bottom
: "pool1"
  top
: "norm1"
  lrn_param
{
    local_size
: 5
    alpha
: 0.0001
    beta
: 0.75
 
}
}
layer
{
  name
: "conv2"
  type
: "Convolution"
  bottom
: "norm1"
  top
: "conv2"
  param
{
    lr_mult
: 1
    decay_mult
: 1
 
}
  param
{
    lr_mult
: 2
    decay_mult
: 0
 
}
  convolution_param
{
    num_output
: 256
    kernel_size
: 5
   
group: 2
    weight_filler
{
      type
: "gaussian"
      std
: 0.01
   
}
    bias_filler
{
      type
: "constant"
      value
: 1
   
}
 
}
}
layer
{
  name
: "relu2"
  type
: "ReLU"
  bottom
: "conv2"
  top
: "conv2"
}
layer
{
  name
: "pool2"
  type
: "Pooling"
  bottom
: "conv2"
  top
: "pool2"
  pooling_param
{
    pool
: MAX
    kernel_size
: 3
    stride
: 2
 
}
}
layer
{
  name
: "norm2"
  type
: "LRN"
  bottom
: "pool2"
  top
: "norm2"
  lrn_param
{
    local_size
: 5
    alpha
: 0.0001
    beta
: 0.75
 
}
}
layer
{
  name
: "conv3"
  type
: "Convolution"
  bottom
: "norm2"
  top
: "conv3"
  param
{
    lr_mult
: 1
    decay_mult
: 1
 
}
  param
{
    lr_mult
: 2
    decay_mult
: 0
 
}
  convolution_param
{
    num_output
: 384
    kernel_size
: 3
    weight_filler
{
      type
: "gaussian"
      std
: 0.01
   
}
    bias_filler
{
      type
: "constant"
      value
: 0
   
}
 
}
}
layer
{
  name
: "relu3"
  type
: "ReLU"
  bottom
: "conv3"
  top
: "conv3"
}
layer
{
  name
: "conv4"
  type
: "Convolution"
  bottom
: "conv3"
  top
: "conv4"
  param
{
    lr_mult
: 1
    decay_mult
: 1
 
}
  param
{
    lr_mult
: 2
    decay_mult
: 0
 
}
  convolution_param
{
    num_output
: 384
    kernel_size
: 3
   
group: 2
    weight_filler
{
      type
: "gaussian"
      std
: 0.01
   
}
    bias_filler
{
      type
: "constant"
      value
: 1
   
}
 
}
}
layer
{
  name
: "relu4"
  type
: "ReLU"
  bottom
: "conv4"
  top
: "conv4"
}
layer
{
  name
: "conv5"
  type
: "Convolution"
  bottom
: "conv4"
  top
: "conv5"
  param
{
    lr_mult
: 1
    decay_mult
: 1
 
}
  param
{
    lr_mult
: 2
    decay_mult
: 0
 
}
  convolution_param
{
    num_output
: 256
    kernel_size
: 3
   
group: 2
    weight_filler
{
      type
: "gaussian"
      std
: 0.01
   
}
    bias_filler
{
      type
: "constant"
      value
: 1
   
}
 
}
}
layer
{
  name
: "relu5"
  type
: "ReLU"
  bottom
: "conv5"
  top
: "conv5"
}
layer
{
  name
: "pool5"
  type
: "Pooling"
  bottom
: "conv5"
  top
: "pool5"
  pooling_param
{
    pool
: MAX
    kernel_size
: 3
    stride
: 2
 
}
}
layer
{
  name
: "fc6"
  type
: "InnerProduct"
  bottom
: "pool5"
  top
: "fc6"
  param
{
    lr_mult
: 1
    decay_mult
: 1
 
}
  param
{
    lr_mult
: 2
    decay_mult
: 0
 
}
  inner_product_param
{
    num_output
: 4096
    weight_filler
{
      type
: "gaussian"
      std
: 0.005
   
}
    bias_filler
{
      type
: "constant"
      value
: 1
   
}
 
}
}
layer
{
  name
: "relu6"
  type
: "ReLU"
  bottom
: "fc6"
  top
: "fc6"
}
layer
{
  name
: "drop6"
  type
: "Dropout"
  bottom
: "fc6"
  top
: "fc6"
  dropout_param
{
    dropout_ratio
: 0.5
 
}
}
layer
{
  name
: "fc7"
  type
: "InnerProduct"
  bottom
: "fc6"
  top
: "fc7"
  param
{
    lr_mult
: 1
    decay_mult
: 1
 
}
  param
{
    lr_mult
: 2
    decay_mult
: 0
 
}
  inner_product_param
{
    num_output
: 4096
    weight_filler
{
      type
: "gaussian"
      std
: 0.005
   
}
    bias_filler
{
      type
: "constant"
      value
: 1
   
}
 
}
}
layer
{
  name
: "relu7"
  type
: "ReLU"
  bottom
: "fc7"
  top
: "fc7"
}
layer
{
  name
: "drop7"
  type
: "Dropout"
  bottom
: "fc7"
  top
: "fc7"
  dropout_param
{
    dropout_ratio
: 0.5
 
}
}
layer
{
  name
: "fc8"
  type
: "InnerProduct"
  bottom
: "fc7"
  top
: "fc8"
  param
{
    lr_mult
: 1
    decay_mult
: 1
 
}
  param
{
    lr_mult
: 2
    decay_mult
: 0
 
}
  inner_product_param
{
    num_output
: 1000
    weight_filler
{
      type
: "gaussian"
      std
: 0.01
   
}
    bias_filler
{
      type
: "constant"
      value
: 0
   
}
 
}
}
layer
{
  name
: "accuracy"
  type
: "Accuracy"
  bottom
: "fc8"
  bottom
: "label"
  top
: "accuracy"
  include
{
    phase
: TEST
 
}
}
layer
{
  name
: "loss"
  type
: "SoftmaxWithLoss"
  bottom
: "fc8"
  bottom
: "label"
  top
: "loss"
}�





shai harel

Sep 8, 2016, 11:38:14 AM
to Caffe Users
What is in your solver.prototxt?
  • base learning rate
  • learning algorithm
  • step_size

Are your images aligned? Are they all in the same format, i.e. BGR, 0-255 values, minus the mean image?
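
For reference, a minimal numpy sketch of that convention; the file names are placeholders, and the mean is assumed to have been converted from imagenet_mean.binaryproto to a 3x256x256 .npy array beforehand:

import numpy as np
import cv2  # OpenCV loads images as BGR uint8, which is Caffe's convention

img = cv2.imread("example.jpg")                       # HxWx3, BGR, 0-255
img = cv2.resize(img, (256, 256)).astype(np.float32)
mean = np.load("imagenet_mean.npy")                   # 3x256x256, BGR
img = img.transpose(2, 0, 1) - mean                   # CxHxW, mean subtracted
# at train time the Data layer then takes a random 227x227 crop (+ mirror)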

Jumabek Alikhanov

Sep 8, 2016, 11:55:16 PM
to Caffe Users
My solver file is the same as bvlc_reference_caffenet's. Since I didn't change train.prototxt much, I thought the same solver would work. I created the LMDBs using the create_imagenet.sh script.

And this is my solver.prototxt:

net: "models/bvlc_reference_caffenet_npd/train_val.prototxt"
test_iter: 1000
test_interval: 1000
base_lr: 0.01
lr_policy: "step"
gamma: 0.1
stepsize: 100000
display: 20
max_iter: 450000
momentum: 0.9
weight_decay: 0.0005
snapshot: 10000
snapshot_prefix: "models/bvlc_reference_caffenet_npd/caffenet_train"
solver_mode: GPU
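
For what it's worth, with this "step" policy the rate only drops every stepsize iterations, which matches the lr = 0.01 at iteration 67000 in the log above. A minimal sketch of Caffe's step schedule with the values from this solver:

# Caffe's "step" policy: lr = base_lr * gamma ^ floor(iteration / stepsize)
def step_lr(iteration, base_lr=0.01, gamma=0.1, stepsize=100000):
    return base_lr * gamma ** (iteration // stepsize)

print(step_lr(67000))   # 0.01  -- matches "lr = 0.01" in the log above
print(step_lr(150000))  # 0.001 -- the first drop happens only at 100000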
