ELU layer


Ihor Menshykov

May 12, 2016, 12:11:06 AM
to Caffe Users
The ELU layer is supposed to be better than ReLU, but in my project it just makes the loss climb to some value around 157 and never come down. Has anyone had any experience with it?

Ihor Menshykov

May 12, 2016, 1:11:45 AM
to Caffe Users
Starting with a much lower learning rate might do the trick. I'll write back when I have more positive experience with this thing.
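
In case it helps anyone trying the same thing, the change is just in the solver. A rough sketch with a deliberately low starting rate (the net path and the exact values here are illustrative, not from my actual runs):

net: "train_val.prototxt"
base_lr: 0.0001      # an order of magnitude or two below a ReLU-tuned rate
lr_policy: "step"
gamma: 0.1
stepsize: 100000
momentum: 0.9
weight_decay: 0.0005
solver_mode: GPU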

Ihor Menshykov

May 12, 2016, 8:16:15 AM
to Caffe Users
Later it climbed to that 157.2 loss too. (157.2 is 1.8 × 87.3365, which would mean both softmax losses, weighted 0.8 and 1.0, are stuck at -ln(FLT_MIN), the value Caffe's SoftmaxWithLoss reports once the predicted probability of the correct class underflows to zero.) Looks to me like the ELU layer in Caffe is bugged.

SRQ

May 12, 2016, 9:30:47 AM
to Caffe Users
Are the results fine with ReLU?

Ihor Menshykov

May 12, 2016, 9:59:18 AM
to Caffe Users
Yes, results are fine with ReLU and PReLU. Just to be absolutely clear: can I use ELU without any special parameters? The Caffe docs say alpha defaults to 1, which is also what the ELU paper uses: http://arxiv.org/pdf/1511.07289v5.pdf
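
For reference, this is the ELU layer I'm dropping in, with alpha written out explicitly even though 1.0 is already Caffe's default (ELU computes x for x > 0 and alpha * (exp(x) - 1) otherwise):

layer {
  bottom: "conv_1"
  top: "conv_1"
  name: "conv_1_rectifier"
  type: "ELU"
  elu_param {
    alpha: 1.0
  }
}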

Here's what the full network looks like with PReLU; I was just replacing "PReLU" with "ELU" in this net.


name: "CaffeNet"
layer {
  name: "data"
  type: "Data"
  top: "data"
  top: "label"
  include {
    phase: TRAIN
  }
  
  transform_param {
    crop_size: 45
  }

  
  data_param {
    source: "/storage/lmdb/10"
    batch_size: 32
    backend: LMDB
  }
}
layer {
  name: "data"
  type: "Data"
  top: "data"
  top: "label"
  include {
    phase: TEST
  }

  transform_param {
    crop_size: 45
  }

  data_param {
    source: "/home/sharpy/local-net/lmdb-server/from-ram-37g/2/train_labels"
    batch_size: 32
    backend: LMDB
  }
}



layer {
  name: "scale"
  bottom: "data"
  top: "scale"
  type: "Power"
  power_param {
    scale: -1
    shift: 1  # white shift: output = 1 - input
  }
}


# 46




layer {
  bottom: "scale"
  type: "Convolution"
  convolution_param {
    num_output: 48
    pad: 2
    kernel_size: 5
    weight_filler {
      type: "xavier"
      std: 0.1  # note: the "xavier" filler ignores std
    }
    bias_filler {
      type: "constant"
      value: 0.2
    }
  }
  param {
    lr_mult: 1
    decay_mult: 1
  }
  top: "conv_1"
  name: "conv_1"
}

layer {
  bottom: "conv_1"
  top: "conv_1"
  name: "conv_1_rectifier"
  type: "PReLU"
}



layer {
  type: "Dropout"
  name: "conv_1_d"
  top: "conv_1"
  bottom: "conv_1"
  dropout_param {
    dropout_ratio: 0.1
  }
}




# 45



layer {
  type: "Pooling"
  bottom: "conv_1"
  name: "conv_1_pool"
  top: "conv_1_pool"
  pooling_param {
    pool: MAX
    pad: 0
    kernel_size: 4
    stride: 3
  }
}


# 15





layer {
  bottom: "conv_1_pool"
  type: "Convolution"
  convolution_param {
    num_output: 64
    pad: 1
    kernel_size: 3
    group: 16
    weight_filler {
      type: "xavier"
      std: 0.03
    }
    bias_filler {
      type: "constant"
      value: 0.2
    }
  }
  param {
    lr_mult: 1
    decay_mult: 1
  }
  top: "conv_2_pre"
  name: "conv_2_pre"
}

layer {
  bottom: "conv_2_pre"
  top: "conv_2_pre"
  name: "conv_2_pre_rectifier"
  type: "PReLU"
}



layer {
  type: "Dropout"
  name: "conv_2_pre_d"
  top: "conv_2_pre"
  bottom: "conv_2_pre"
  dropout_param {
    dropout_ratio: 0.1
  }
}




layer {
  bottom: "conv_2_pre"
  type: "Convolution"
  convolution_param {
    num_output: 48
    pad: 0
    kernel_size: 1
    group: 1
    weight_filler {
      type: "xavier"
      std: 0.04
    }
    bias_filler {
      type: "constant"
      value: 0.2
    }
  }
  param {
    lr_mult: 1
    decay_mult: 1
  }
  top: "conv_2_m"
  name: "conv_2_m"
}

layer {
  bottom: "conv_2_m"
  top: "conv_2_m"
  name: "conv_2_m_rectifier"
  type: "PReLU"
}


layer {
  type: "Dropout"
  name: "conv_2_m_d"
  top: "conv_2_m"
  bottom: "conv_2_m"
  dropout_param {
    dropout_ratio: 0.1
  }
}


layer {
  bottom: "conv_2_m"
  type: "Convolution"
  convolution_param {
    num_output: 128
    pad: 3
    kernel_size: 7
    group: 16
    weight_filler {
      type: "xavier"
      std: 0.1
    }
    bias_filler {
      type: "constant"
      value: 0.2
    }
  }
  param {
    lr_mult: 1
    decay_mult: 1
  }
  top: "conv_2"
  name: "conv_2"
}

layer {
  bottom: "conv_2"
  top: "conv_2"
  name: "conv_2_rectifier"
  type: "PReLU"
}

layer {
  type: "Dropout"
  name: "conv_2_d"
  top: "conv_2"
  bottom: "conv_2"
  dropout_param {
    dropout_ratio: 0.1
  }
}



layer {
  bottom: "conv_2"
  type: "InnerProduct"
  name: "Before_Loss_4"
  top: "Before_Loss_4"
  param {
    lr_mult: 1
    decay_mult: 1
  }
  param {
    lr_mult: 2
    decay_mult: 0
  }
  inner_product_param {
    num_output: 1024
    weight_filler {
      type: "xavier"
      std: 0.1
    }
    bias_filler {
      type: "constant"
      value: 0.01
    }
  }
}

layer {
  type: "PReLU"
  name: "Before_Loss_4Relu"
  top: "Before_Loss_4"
  bottom: "Before_Loss_4"
}

layer {
  type: "Dropout"
  name: "Before_Loss_4_d"
  top: "Before_Loss_4"
  bottom: "Before_Loss_4"
  dropout_param {
    dropout_ratio: 0.1
  }
}



layer {
  bottom: "Before_Loss_4"
  type: "InnerProduct"
  name: "loss/classifier4"
  top: "loss/classifier4"
  param {
    lr_mult: 1
    decay_mult: 1
  }
  param {
    lr_mult: 2
    decay_mult: 0
  }
  inner_product_param {
    num_output: 33512
    weight_filler {
      type: "xavier"
      std: 0.01
    }
    bias_filler {
      type: "constant"
      value: 0.1
    }
  }
}


layer {
  type: "SoftmaxWithLoss"
  top: "loss4"
  name: "loss/loss4"
  bottom: "loss/classifier4"
  bottom: "label"
  loss_weight: 0.8
}




layer {
  bottom: "conv_2_pre"
  type: "InnerProduct"
  name: "Before_Loss_2"
  top: "Before_Loss_2"
  param {
    lr_mult: 1
    decay_mult: 1
  }
  param {
    lr_mult: 2
    decay_mult: 0
  }
  inner_product_param {
    num_output: 200
    weight_filler {
      type: "xavier"
      std: 0.1
    }
    bias_filler {
      type: "constant"
      value: 0.01
    }
  }
}

layer {
  type: "PReLU"
  name: "Before_Loss_2Relu"
  top: "Before_Loss_2"
  bottom: "Before_Loss_2"
}

layer {
  type: "Dropout"
  name: "Before_Loss_2_d"
  top: "Before_Loss_2"
  bottom: "Before_Loss_2"
  dropout_param {
    dropout_ratio: 0.3
  }
}





layer {
  bottom: "Before_Loss_2"
  type: "InnerProduct"
  name: "loss/classifier2"
  top: "loss/classifier2"
  param {
    lr_mult: 1
    decay_mult: 1
  }
  param {
    lr_mult: 2
    decay_mult: 0
  }
  inner_product_param {
    num_output: 33512
    weight_filler {
      type: "xavier"
      std: 0.01
    }
    bias_filler {
      type: "constant"
      value: 0.1
    }
  }
}




layer {
  type: "SoftmaxWithLoss"
  top: "loss2"
  name: "loss/loss2"
  bottom: "loss/classifier2"
  bottom: "label"
  loss_weight: 1
}



layer {
  type: "Accuracy"
  name: "loss/top-5-conv_1_pool"
  top: "exact_conv_1_train"
  bottom: "loss/classifier2"
  bottom: "label"
  include {
    phase: TRAIN
  }
}


layer {
  type: "Accuracy"
  name: "loss/top-5-conv_1_pool"
  top: "exact_conv_1"
  bottom: "loss/classifier2"
  bottom: "label"
  include {
    phase: TEST
  }
}



layer {
  type: "Accuracy"
  name: "loss/top-5-conv_1_pool"
  top: "accuracy5conv_1_pool"
  bottom: "loss/classifier2"
  bottom: "label"
  include {
    phase: TRAIN
  }
  accuracy_param {
    top_k: 5
  }
}



layer {
  type: "Accuracy"
  name: "loss/top-5-conv_4_final"
  top: "accuracy5conv_4_final"
  bottom: "loss/classifier4"
  bottom: "label"
  include {
    phase: TRAIN
  }
  accuracy_param {
    top_k: 5
  }
}




layer {
  type: "Accuracy"
  name: "loss/top-5-conv_1_pool_VAL"
  top: "accuracy5conv_1_pool_VAL"
  bottom: "loss/classifier2"
  bottom: "label"
  include {
    phase: TEST
  }
  accuracy_param {
    top_k: 5
  }
}



layer {
  type: "Accuracy"
  name: "loss/top-5-conv_4_final_VAL"
  top: "accuracy5conv_4_final_VAL"
  bottom: "loss/classifier4"
  bottom: "label"
  include {
    phase: TEST
  }
  accuracy_param {
    top_k: 5
  }
}

layer {
  type: "Accuracy"
  name: "loss/top-1"
  top: "Exact"
  bottom: "loss/classifier4"
  bottom: "label"
  include {
    phase: TEST
  }
}





layer {
  type: "Accuracy"
  name: "loss/top-10"
  top: "accuracy-top-10"
  bottom: "loss/classifier4"
  bottom: "label"
  include {
    phase: TEST
  }
  accuracy_param {
    top_k: 10
  }
}


layer {
  name: "loss/top-5-conv_4_final"
  type: "Accuracy"
  bottom: "loss/classifier4"
  bottom: "label"
  top: "exact_conv_4_final"
  include {
    phase: TRAIN
  }
}


layer {
  name: "prob"
  type: "Softmax"
  bottom: "loss/classifier4"
  top: "prob"
}




On Thursday, May 12, 2016 at 4:30:47 PM UTC+3, SRQ wrote:

Ihor Menshykov

May 17, 2016, 2:46:50 PM
to Caffe Users
On more basic stuff it works fine. It seems it just doesn't work with this network and this data I have; it produces the dreaded NaNs.
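
For anyone hitting the same wall: a generic way to see which layer blows up first (not something I have logs from to post here) is the solver's debug_info flag, which makes Caffe print per-layer activation and gradient magnitudes every iteration:

# add to the solver prototxt
debug_info: true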