ELU layer


Ihor Menshykov

May 12, 2016, 12:11:06 AM
to Caffe Users
The ELU layer is supposed to be better than ReLU, but in my project it just makes the loss climb to some value around 157 and never come down. Has anyone had any experience with it?

Ihor Menshykov

May 12, 2016, 1:11:45 AM
to Caffe Users
Starting with a much lower learning rate might do the trick. I'll write back when I have more positive experience with this thing.
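
In case it helps anyone trying the same thing, the change is just in the solver. A rough sketch with a deliberately low starting rate (the net path and the exact values here are illustrative, not from my actual runs):

net: "train_val.prototxt"
base_lr: 0.0001      # an order of magnitude or two below a ReLU-tuned rate
lr_policy: "step"
gamma: 0.1
stepsize: 100000
momentum: 0.9
weight_decay: 0.0005
solver_mode: GPU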

Ihor Menshykov

May 12, 2016, 8:16:15 AM
to Caffe Users
Later it climbed to that 157.2 loss too. (157.2 is 1.8 × 87.3365, which would mean both softmax losses, weighted 0.8 and 1.0, are stuck at -ln(FLT_MIN), the value Caffe's SoftmaxWithLoss reports once the predicted probability of the correct class underflows to zero.) Looks to me like the ELU layer in Caffe is bugged.

SRQ

May 12, 2016, 9:30:47 AM
to Caffe Users
Are the results fine with ReLU?

Ihor Menshykov

May 12, 2016, 9:59:18 AM
to Caffe Users
Yes, results are fine with ReLU and PReLU. Just to be absolutely clear: can I use ELU without any special parameters? The Caffe docs say alpha defaults to 1, which is also what the ELU paper uses: http://arxiv.org/pdf/1511.07289v5.pdf
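
For reference, this is the ELU layer I'm dropping in, with alpha written out explicitly even though 1.0 is already Caffe's default (ELU computes x for x > 0 and alpha * (exp(x) - 1) otherwise):

layer {
  bottom: "conv_1"
  top: "conv_1"
  name: "conv_1_rectifier"
  type: "ELU"
  elu_param {
    alpha: 1.0
  }
}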

Here's what the full network looks like with PReLU; I was just replacing "PReLU" with "ELU" in this net.


name: "CaffeNet"
layer {
  name: "data"
  type: "Data"
  top: "data"
  top: "label"
  include {
    phase: TRAIN
  }
  
  transform_param {
    crop_size: 45
  }

  
  data_param {
    source: "/storage/lmdb/10"
    batch_size: 32
    backend: LMDB
  }
}
layer {
  name: "data"
  type: "Data"
  top: "data"
  top: "label"
  include {
    phase: TEST
  }

  transform_param {
    crop_size: 45
  }

  data_param {
    source: "/home/sharpy/local-net/lmdb-server/from-ram-37g/2/train_labels"
    batch_size: 32
    backend: LMDB
  }
}



layer {
  name: "scale"
  bottom: "data"
  top: "scale"
  type: "Power"
  power_param {
    scale: -1
    shift: 1  # white shift: output = 1 - input
  }
}


# 46




layer {
  bottom: "scale"
  type: "Convolution"
  convolution_param {
    num_output: 48
    pad: 2
    kernel_size: 5
    weight_filler {
      type: "xavier"
      std: 0.1  # note: the "xavier" filler ignores std
    }
    bias_filler {
      type: "constant"
      value: 0.2
    }
  }
  param {
    lr_mult: 1
    decay_mult: 1
  }
  top: "conv_1"
  name: "conv_1"
}

layer {
  bottom: "conv_1"
  top: "conv_1"
  name: "conv_1_rectifier"
  type: "PReLU"
}



layer {
  type: "Dropout"
  name: "conv_1_d"
  top: "conv_1"
  bottom: "conv_1"
  dropout_param {
    dropout_ratio: 0.1
  }
}




# 45



layer {
  type: "Pooling"
  bottom: "conv_1"
  name: "conv_1_pool"
  top: "conv_1_pool"
  pooling_param {
    pool: MAX
    pad: 0
    kernel_size: 4
    stride: 3
  }
}


# 15





layer {
  bottom: "conv_1_pool"
  type: "Convolution"
  convolution_param {
    num_output: 64
    pad: 1
    kernel_size: 3
    group: 16
    weight_filler {
      type: "xavier"
      std: 0.03
    }
    bias_filler {
      type: "constant"
      value: 0.2
    }
  }
  param {
    lr_mult: 1
    decay_mult: 1
  }
  top: "conv_2_pre"
  name: "conv_2_pre"
}

layer {
  bottom: "conv_2_pre"
  top: "conv_2_pre"
  name: "conv_2_pre_rectifier"
  type: "PReLU"
}



layer {
  type: "Dropout"
  name: "conv_2_pre_d"
  top: "conv_2_pre"
  bottom: "conv_2_pre"
  dropout_param {
    dropout_ratio: 0.1
  }
}




layer {
  bottom: "conv_2_pre"
  type: "Convolution"
  convolution_param {
    num_output: 48
    pad: 0
    kernel_size: 1
    group: 1
    weight_filler {
      type: "xavier"
      std: 0.04
    }
    bias_filler {
      type: "constant"
      value: 0.2
    }
  }
  param {
    lr_mult: 1
    decay_mult: 1
  }
  top: "conv_2_m"
  name: "conv_2_m"
}

layer {
  bottom: "conv_2_m"
  top: "conv_2_m"
  name: "conv_2_m_rectifier"
  type: "PReLU"
}


layer {
  type: "Dropout"
  name: "conv_2_m_d"
  top: "conv_2_m"
  bottom: "conv_2_m"
  dropout_param {
    dropout_ratio: 0.1
  }
}


layer {
  bottom: "conv_2_m"
  type: "Convolution"
  convolution_param {
    num_output: 128
    pad: 3
    kernel_size: 7
    group: 16
    weight_filler {
      type: "xavier"
      std: 0.1
    }
    bias_filler {
      type: "constant"
      value: 0.2
    }
  }
  param {
    lr_mult: 1
    decay_mult: 1
  }
  top: "conv_2"
  name: "conv_2"
}

layer {
  bottom: "conv_2"
  top: "conv_2"
  name: "conv_2_rectifier"
  type: "PReLU"
}

layer {
  type: "Dropout"
  name: "conv_2_d"
  top: "conv_2"
  bottom: "conv_2"
  dropout_param {
    dropout_ratio: 0.1
  }
}



layer {
  bottom: "conv_2"
  type: "InnerProduct"
  name: "Before_Loss_4"
  top: "Before_Loss_4"
  param {
    lr_mult: 1
    decay_mult: 1
  }
  param {
    lr_mult: 2
    decay_mult: 0
  }
  inner_product_param {
    num_output: 1024
    weight_filler {
      type: "xavier"
      std: 0.1
    }
    bias_filler {
      type: "constant"
      value: 0.01
    }
  }
}

layer {
  type: "PReLU"
  name: "Before_Loss_4Relu"
  top: "Before_Loss_4"
  bottom: "Before_Loss_4"
}

layer {
  type: "Dropout"
  name: "Before_Loss_4_d"
  top: "Before_Loss_4"
  bottom: "Before_Loss_4"
  dropout_param {
    dropout_ratio: 0.1
  }
}



layer {
  bottom: "Before_Loss_4"
  type: "InnerProduct"
  name: "loss/classifier4"
  top: "loss/classifier4"
  param {
    lr_mult: 1
    decay_mult: 1
  }
  param {
    lr_mult: 2
    decay_mult: 0
  }
  inner_product_param {
    num_output: 33512
    weight_filler {
      type: "xavier"
      std: 0.01
    }
    bias_filler {
      type: "constant"
      value: 0.1
    }
  }
}


layer {
  type: "SoftmaxWithLoss"
  top: "loss4"
  name: "loss/loss4"
  bottom: "loss/classifier4"
  bottom: "label"
  loss_weight: 0.8
}




layer {
  bottom: "conv_2_pre"
  type: "InnerProduct"
  name: "Before_Loss_2"
  top: "Before_Loss_2"
  param {
    lr_mult: 1
    decay_mult: 1
  }
  param {
    lr_mult: 2
    decay_mult: 0
  }
  inner_product_param {
    num_output: 200
    weight_filler {
      type: "xavier"
      std: 0.1
    }
    bias_filler {
      type: "constant"
      value: 0.01
    }
  }
}

layer {
  type: "PReLU"
  name: "Before_Loss_2Relu"
  top: "Before_Loss_2"
  bottom: "Before_Loss_2"
}

layer {
  type: "Dropout"
  name: "Before_Loss_2_d"
  top: "Before_Loss_2"
  bottom: "Before_Loss_2"
  dropout_param {
    dropout_ratio: 0.3
  }
}





layer {
  bottom: "Before_Loss_2"
  type: "InnerProduct"
  name: "loss/classifier2"
  top: "loss/classifier2"
  param {
    lr_mult: 1
    decay_mult: 1
  }
  param {
    lr_mult: 2
    decay_mult: 0
  }
  inner_product_param {
    num_output: 33512
    weight_filler {
      type: "xavier"
      std: 0.01
    }
    bias_filler {
      type: "constant"
      value: 0.1
    }
  }
}




layer {
  type: "SoftmaxWithLoss"
  top: "loss2"
  name: "loss/loss2"
  bottom: "loss/classifier2"
  bottom: "label"
  loss_weight: 1
}



layer {
  type: "Accuracy"
  name: "loss/top-5-conv_1_pool"
  top: "exact_conv_1_train"
  bottom: "loss/classifier2"
  bottom: "label"
  include {
    phase: TRAIN
  }
}


layer {
  type: "Accuracy"
  name: "loss/top-5-conv_1_pool"
  top: "exact_conv_1"
  bottom: "loss/classifier2"
  bottom: "label"
  include {
    phase: TEST
  }
}



layer {
  type: "Accuracy"
  name: "loss/top-5-conv_1_pool"
  top: "accuracy5conv_1_pool"
  bottom: "loss/classifier2"
  bottom: "label"
  include {
    phase: TRAIN
  }
  accuracy_param {
    top_k: 5
  }
}



layer {
  type: "Accuracy"
  name: "loss/top-5-conv_4_final"
  top: "accuracy5conv_4_final"
  bottom: "loss/classifier4"
  bottom: "label"
  include {
    phase: TRAIN
  }
  accuracy_param {
    top_k: 5
  }
}




layer {
  type: "Accuracy"
  name: "loss/top-5-conv_1_pool_VAL"
  top: "accuracy5conv_1_pool_VAL"
  bottom: "loss/classifier2"
  bottom: "label"
  include {
    phase: TEST
  }
  accuracy_param {
    top_k: 5
  }
}



layer {
  type: "Accuracy"
  name: "loss/top-5-conv_4_final_VAL"
  top: "accuracy5conv_4_final_VAL"
  bottom: "loss/classifier4"
  bottom: "label"
  include {
    phase: TEST
  }
  accuracy_param {
    top_k: 5
  }
}

layer {
  type: "Accuracy"
  name: "loss/top-1"
  top: "Exact"
  bottom: "loss/classifier4"
  bottom: "label"
  include {
    phase: TEST
  }
}





layer {
  type: "Accuracy"
  name: "loss/top-10"
  top: "accuracy-top-10"
  bottom: "loss/classifier4"
  bottom: "label"
  include {
    phase: TEST
  }
  accuracy_param {
    top_k: 10
  }
}


layer {
  name: "loss/top-5-conv_4_final"
  type: "Accuracy"
  bottom: "loss/classifier4"
  bottom: "label"
  top: "exact_conv_4_final"
  include {
    phase: TRAIN
  }
}


layer {
  name: "prob"
  type: "Softmax"
  bottom: "loss/classifier4"
  top: "prob"
}




On Thursday, May 12, 2016 at 4:30:47 PM UTC+3, SRQ wrote:

Ihor Menshykov

May 17, 2016, 2:46:50 PM
to Caffe Users
On more basic stuff it works fine. It seems it just doesn't work with this network and this data I have; it produces the dreaded NaNs.
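
For anyone hitting the same wall: a generic way to see which layer blows up first (not something I have logs from to post here) is the solver's debug_info flag, which makes Caffe print per-layer activation and gradient magnitudes every iteration:

# add to the solver prototxt
debug_info: true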