Training a multitask CNN for the detection of facial landmarks and pose


Fabian Schrumpf

May 20, 2015, 10:09:20 AM5/20/15
to caffe...@googlegroups.com
Hi folks,

I'm trying to train my own CNN from scratch. Basically, I want to reproduce the network from Zhang et al., "Improving Multiview Face Detection with Multi-Task Deep Convolutional Neural Networks". The problem is that the loss never decreases. I have recreated their CNN and stored my own training and validation data (60,000 non-faces and 25,000 faces in total) in HDF5 files. The network is trained with these images and a label vector that contains:
  • face / non-face classification: {1,0} for faces and {0,1} for non-faces
  • face pose: a five-element vector, each element corresponding to a yaw interval between -180° and 180°; e.g. a person looking directly into the camera gets {0,0,1,0,0}, while a right profile gets {1,0,0,0,0}
  • the coordinates of 7 facial landmarks
resulting in a 21-element label vector for every image (a rough sketch of how such an HDF5 file can be written follows the network definition below). The network consists of 3 convolutional layers followed by fully connected layers that split into 3 branches for face/non-face classification, pose estimation, and landmark detection, respectively. It looks like this:


name: "FaceNet"
layer
{
  name
: "data"
  type
: "HDF5Data"
  top
: "data"
  top
: "label"
  include
{
    phase
: TRAIN
 
}
  hdf5_data_param
{
    source
: "..."
    batch_size
: 1000
    shuffle
: true
 
}
}

layer
{
  name
: "data"
  type
: "HDF5Data"
  top
: "data"
  top
: "label"
  include
{
    phase
: TEST
 
}

  hdf5_data_param
{
    source
: "..."
    batch_size
: 1000
    shuffle
: true
 
}
}

layer
{
  name
: "slice0"
  type
: "Slice"
  bottom
: "label"
  top
: "label_face"
  top
: "label_yaw"
  top
: "label_landmarks"
  slice_param
{
    slice_dim
: 1
    slice_point
: 2
    slice_point
: 7    
 
}
}
# 1ST LAYER ###############################
layer
{
  name
: "conv1"
  type
: "Convolution"
  bottom
: "data"
  top
: "conv1"
  param
{
    lr_mult
: 1
    decay_mult
: 1
 
}
  param
{
    lr_mult
: 2
    decay_mult
: 0
 
}
  convolution_param
{
    num_output
: 32
    kernel_size
: 5
    pad
: 2
    weight_filler
{
      type
: "gaussian"
      std
: 0.01
   
}
    bias_filler
{
      type
: "constant"
      value
: 0.1
   
}
 
}
}
layer
{
  name
: "relu1"
  type
: "ReLU"
  bottom
: "conv1"
  top
: "relu1"
}
layer
{
  name
: "pool1"
  type
: "Pooling"
  bottom
: "relu1"
  top
: "pool1"
  pooling_param
{
    pool
: MAX
    kernel_size
: 2
 
}
}
# 2ND Layer ###############################
layer
{
  name
: "conv2"
  type
: "Convolution"
  bottom
: "pool1"
  top
: "conv2"
  param
{
    lr_mult
: 1
    decay_mult
: 1
 
}
  param
{
    lr_mult
: 2
    decay_mult
: 0
 
}
  convolution_param
{
    num_output
: 32
    kernel_size
: 3
    pad
: 1
    weight_filler
{
      type
: "gaussian"
      std
: 0.01
   
}
    bias_filler
{
      type
: "constant"
      value
: 0.1
   
}
 
}
}
layer
{
  name
: "relu2"
  type
: "ReLU"
  bottom
: "conv2"
  top
: "relu2"
}
# 3 RD LAYER ###############################
layer
{
  name
: "conv3"
  type
: "Convolution"
  bottom
: "relu2"
  top
: "conv3"
  param
{
    lr_mult
: 1
    decay_mult
: 1
 
}
  param
{
    lr_mult
: 2
    decay_mult
: 0
 
}
  convolution_param
{
    num_output
: 24
    kernel_size
: 3
    pad
: 1
    weight_filler
{
      type
: "gaussian"
      std
: 0.01
   
}
    bias_filler
{
      type
: "constant"
      value
: 0.1
   
}
 
}
}
layer
{
  name
: "relu3"
  type
: "ReLU"
  bottom
: "conv3"
  top
: "relu3"
}
layer
{
  name
: "pool3"
  type
: "Pooling"
  bottom
: "relu3"
  top
: "pool3"
  pooling_param
{
    pool
: MAX
    kernel_size
: 2
 
}
}
# 4 TH LAYER ###############################
layer
{
  name
: "fc4"
  type
: "InnerProduct"
  bottom
: "pool3"
  top
: "fc4"
  param
{
    lr_mult
: 1
    decay_mult
: 1
 
}
  param
{
    lr_mult
: 2
    decay_mult
: 0
 
}
  inner_product_param
{
    num_output
: 512
    weight_filler
{
      type
: "gaussian"
      std
: 0.01
   
}
    bias_filler
{
      type
: "constant"
      value
: 0.1
   
}
 
}
}
layer
{
  name
: "relu4"
  type
: "ReLU"
  bottom
: "fc4"
  top
: "relu4"
}
layer
{
  name
: "split"
  type
: "Split"
  bottom
: "relu4"
  top
: "fc4_face"
  top
: "fc4_yaw"
  top
: "fc4_landmarks"
}
# 5 TH LAYER ###############################
layer
{
  name
: "fc5_face"
  type
: "InnerProduct"
  bottom
: "fc4_face"
  top
: "fc5_face"
  param
{
    lr_mult
: 1
    decay_mult
: 1
 
}
  param
{
    lr_mult
: 2
    decay_mult
: 0
 
}
  inner_product_param
{
    num_output
: 128
    weight_filler
{
      type
: "gaussian"
      std
: 0.01
   
}
    bias_filler
{
      type
: "constant"
      value
: 0.1
   
}
 
}
}
layer
{
  name
: "relu5_face"
  type
: "ReLU"
  bottom
: "fc5_face"
  top
: "relu5_face"
}
layer
{
  name
: "drop5_face"
  type
: "Dropout"
  bottom
: "relu5_face"
  top
: "drop5_face"
  dropout_param
{
    dropout_ratio
: 0.7
 
}
}

layer
{
  name
: "fc5_yaw"
  type
: "InnerProduct"
  bottom
: "fc4_yaw"
  top
: "fc5_yaw"
  param
{
    lr_mult
: 1
    decay_mult
: 1
 
}
  param
{
    lr_mult
: 2
    decay_mult
: 0
 
}
  inner_product_param
{
    num_output
: 128
    weight_filler
{
      type
: "gaussian"
      std
: 0.01
   
}
    bias_filler
{
      type
: "constant"
      value
: 0.1
   
}
 
}
}
layer
{
  name
: "relu5_yaw"
  type
: "ReLU"
  bottom
: "fc5_yaw"
  top
: "relu5_yaw"
}
layer
{
  name
: "drop5_yaw"
  type
: "Dropout"
  bottom
: "relu5_yaw"
  top
: "drop5_yaw"
  dropout_param
{
    dropout_ratio
: 0.7
 
}
}

layer
{
  name
: "fc5_landmarks"
  type
: "InnerProduct"
  bottom
: "fc4_landmarks"
  top
: "fc5_landmarks"
  param
{
    lr_mult
: 1
    decay_mult
: 1
 
}
  param
{
    lr_mult
: 2
    decay_mult
: 0
 
}
  inner_product_param
{
    num_output
: 256
    weight_filler
{
      type
: "gaussian"
      std
: 0.01
   
}
    bias_filler
{
      type
: "constant"
      value
: 0.1
   
}
 
}
}
layer
{
  name
: "tanh5_landmarks"
  type
: "TanH"
  bottom
: "fc5_landmarks"
  top
: "tanh5_landmarks"
}
layer
{
  name
: "drop5_landmarks"
  type
: "Dropout"
  bottom
: "tanh5_landmarks"
  top
: "drop5_landmarks"
  dropout_param
{
    dropout_ratio
: 0.7
 
}
}
# 6 TH LAYER ###############################
layer
{
  name
: "fc6_face"
  type
: "InnerProduct"
  bottom
: "drop5_face"
  top
: "fc6_face"
  param
{
    lr_mult
: 1
    decay_mult
: 1
 
}
  param
{
    lr_mult
: 2
    decay_mult
: 0
 
}
  inner_product_param
{
    num_output
: 2
    weight_filler
{
      type
: "gaussian"
      std
: 0.01
   
}
    bias_filler
{
      type
: "constant"
      value
: 0.1
   
}
 
}
}
layer {
  name: "relu6_face"
  type: "ReLU"
  bottom: "fc6_face"
  top: "relu6_face"
}
layer {
  name: "fc6_yaw"
  type: "InnerProduct"
  bottom: "drop5_yaw"
  top: "fc6_yaw"
  param { lr_mult: 1 decay_mult: 1 }
  param { lr_mult: 2 decay_mult: 0 }
  inner_product_param {
    num_output: 5
    weight_filler { type: "gaussian" std: 0.01 }
    bias_filler { type: "constant" value: 0.1 }
  }
}
layer {
  name: "relu6_yaw"
  type: "ReLU"
  bottom: "fc6_yaw"
  top: "relu6_yaw"
}
layer {
  name: "fc6_landmarks"
  type: "InnerProduct"
  bottom: "drop5_landmarks"
  top: "fc6_landmarks"
  param { lr_mult: 1 decay_mult: 1 }
  param { lr_mult: 2 decay_mult: 0 }
  inner_product_param {
    num_output: 196
    weight_filler { type: "gaussian" std: 0.01 }
    bias_filler { type: "constant" value: 0.1 }
  }
}
layer {
  name: "tanh6_landmarks"
  type: "TanH"
  bottom: "fc6_landmarks"
  top: "tanh6_landmarks"
}
layer {
  name: "drop6_landmarks"
  type: "Dropout"
  bottom: "tanh6_landmarks"
  top: "drop6_landmarks"
  dropout_param { dropout_ratio: 0.7 }
}
# 7TH LAYER ###############################
layer {
  name: "fc7_landmarks"
  type: "InnerProduct"
  bottom: "drop6_landmarks"
  top: "fc7_landmarks"
  param { lr_mult: 1 decay_mult: 1 }
  param { lr_mult: 2 decay_mult: 0 }
  inner_product_param {
    num_output: 14
    weight_filler { type: "gaussian" std: 0.01 }
    bias_filler { type: "constant" value: 0.1 }
  }
}
layer {
  name: "tanh7_landmarks"
  type: "TanH"
  bottom: "fc7_landmarks"
  top: "tanh7_landmarks"
}
# SOFTMAX LAYER ###########################
layer {
  name: "face_Softmax"
  type: "Softmax"
  bottom: "relu6_face"
  top: "softmax6_face"
}
layer {
  name: "yaw_Softmax"
  type: "Softmax"
  bottom: "relu6_yaw"
  top: "softmax6_yaw"
}
# LOSS LAYER ##############################
layer {
  name: "loss_face"
  type: "SigmoidCrossEntropyLoss"
  loss_weight: 1
  bottom: "softmax6_face"
  bottom: "label_face"
  top: "loss_face"
}
layer {
  name: "loss_yaw"
  type: "SigmoidCrossEntropyLoss"
  loss_weight: 1
  bottom: "softmax6_yaw"
  bottom: "label_yaw"
  top: "loss_yaw"
}
layer {
  name: "loss_landmarks"
  type: "EuclideanLoss"
  loss_weight: 1
  bottom: "tanh7_landmarks"
  bottom: "label_landmarks"
  top: "loss_landmarks"
}
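As mentioned above, the data and the 21-element label vectors are stored in HDF5 files. For reference, here is only a rough sketch of how such a file, and the list file referenced by the HDF5Data layer's "source", can be written with h5py; the file names, the image size (32x32 grayscale) and the number of samples are placeholders, not my actual values:

# Rough sketch: writing one HDF5 chunk for the HDF5Data layers above.
# Shapes and file names are placeholders; the real arrays come from my face crops.
import h5py
import numpy as np

n = 1000                                              # samples in this chunk
images = np.zeros((n, 1, 32, 32), dtype=np.float32)   # N x C x H x W, image size assumed
labels = np.zeros((n, 21), dtype=np.float32)          # [face (2) | yaw (5) | landmarks (14)]

with h5py.File("train_0.h5", "w") as f:
    f.create_dataset("data", data=images)    # dataset name must match the "data" top
    f.create_dataset("label", data=labels)   # dataset name must match the "label" top

# The "source" of the HDF5Data layer is a plain text file listing the .h5 files:
with open("train_h5.txt", "w") as f:
    f.write("train_0.h5\n")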


solver.txt looks like this:

net: "train_val.prototxt"
test_iter
: 1000
test_interval
: 10000
base_lr
: 0.001
lr_policy
: "step"
gamma
: 0.1
stepsize
: 10000
display
: 20
max_iter
: 200000
momentum
: 0.9
weight_decay
: 0.0005
snapshot
: 20000
snapshot_prefix
: "FaceNet_train"
solver_mode
: GPU

A sample of the output from my training procedure:


I0520 15:24:31.982103 57005 solver.cpp:189] Iteration 31760, loss = 1.27058
I0520 15:24:31.982327 57005 solver.cpp:204]     Train net output #0: loss_face = 1.27058 (* 1 = 1.27058 loss)
I0520 15:24:31.982352 57005 solver.cpp:204]     Train net output #1: loss_landmarks = 424.336
I0520 15:24:31.982372 57005 solver.cpp:204]     Train net output #2: loss_yaw = 3.93837
I0520 15:24:31.982390 57005 solver.cpp:464] Iteration 31760, lr = 0.00125
I0520 15:24:47.112720 57005 solver.cpp:189] Iteration 31780, loss = 1.27626
I0520 15:24:47.112793 57005 solver.cpp:204]     Train net output #0: loss_face = 1.27626 (* 1 = 1.27626 loss)
I0520 15:24:47.112810 57005 solver.cpp:204]     Train net output #1: loss_landmarks = 463.532
I0520 15:24:47.112828 57005 solver.cpp:204]     Train net output #2: loss_yaw = 3.9372
I0520 15:24:47.112846 57005 solver.cpp:464] Iteration 31780, lr = 0.00125
I0520 15:25:02.242328 57005 solver.cpp:189] Iteration 31800, loss = 1.27833
I0520 15:25:02.242540 57005 solver.cpp:204]     Train net output #0: loss_face = 1.27833 (* 1 = 1.27833 loss)
I0520 15:25:02.242565 57005 solver.cpp:204]     Train net output #1: loss_landmarks = 448.197
I0520 15:25:02.242583 57005 solver.cpp:204]     Train net output #2: loss_yaw = 3.9368
I0520 15:25:02.242601 57005 solver.cpp:464] Iteration 31800, lr = 0.00125
I0520 15:25:17.376849 57005 solver.cpp:189] Iteration 31820, loss = 1.26867
I0520 15:25:17.376956 57005 solver.cpp:204]     Train net output #0: loss_face = 1.26867 (* 1 = 1.26867 loss)
I0520 15:25:17.376978 57005 solver.cpp:204]     Train net output #1: loss_landmarks = 423.358
I0520 15:25:17.376998 57005 solver.cpp:204]     Train net output #2: loss_yaw = 3.93876
I0520 15:25:17.377022 57005 solver.cpp:464] Iteration 31820, lr = 0.00125
I0520 15:25:32.505827 57005 solver.cpp:189] Iteration 31840, loss = 1.27058
I0520 15:25:32.506081 57005 solver.cpp:204]     Train net output #0: loss_face = 1.27058 (* 1 = 1.27058 loss)
I0520 15:25:32.506106 57005 solver.cpp:204]     Train net output #1: loss_landmarks = 443.968
I0520 15:25:32.506124 57005 solver.cpp:204]     Train net output #2: loss_yaw = 3.93836
I0520 15:25:32.506166 57005 solver.cpp:464] Iteration 31840, lr = 0.00125


The losses are pretty much stuck at the values shown above, which seem quite high to me (especially the landmark loss). My guess is that, since I have a rather small dataset (the original paper uses millions of images), I have to train for longer than 200,000 iterations to reach a sufficiently small loss. But before accelerating global warming by starting a training run that takes days to complete, I wanted the community to have a look at my network and my training settings. Maybe there is something wrong with my network, or you have suggestions for tuning my training parameters. Any help would be greatly appreciated.
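In case it is useful for judging whether the losses drift at all over a longer run, this is roughly how they can be pulled out of the log and plotted. This is just a sketch; "train.log" is a placeholder for the solver output captured e.g. with `caffe train --solver=solver.txt 2>&1 | tee train.log`:

# Sketch: extract the per-branch losses from the training log shown above and plot them.
# "train.log" is a placeholder for the captured solver output.
import re
import matplotlib.pyplot as plt

pattern = re.compile(r"Train net output #\d+: (loss_\w+) = ([0-9.eE+-]+)")
losses = {}
with open("train.log") as f:
    for line in f:
        m = pattern.search(line)
        if m:
            losses.setdefault(m.group(1), []).append(float(m.group(2)))

for name, values in losses.items():
    plt.plot(values, label=name)
plt.xlabel("display interval (every 20 iterations)")
plt.ylabel("loss")
plt.legend()
plt.show()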

Best wishes,


Fabian

Peng Li

Jun 10, 2015, 6:11:54 AM6/10/15
to caffe...@googlegroups.com
It seems that the loss is inappropriate. The loss you have used appears to be meant for classification; you should use a Euclidean loss for this task. There might be other problems besides this one, but that is all I can find so far.