FROM nvidia/cuda:8.0-cudnn5-devel-ubuntu16.04

# MAINTAINER is deprecated (hadolint DL4000); use a LABEL instead.
LABEL maintainer="caffe...@googlegroups.com"

# Build and runtime dependencies for BVLC Caffe; clean apt lists in the
# same layer to keep the image small.
RUN apt-get update && apt-get install -y --no-install-recommends \
        build-essential \
        cmake \
        git \
        wget \
        libatlas-base-dev \
        libboost-all-dev \
        libgflags-dev \
        libgoogle-glog-dev \
        libhdf5-serial-dev \
        libleveldb-dev \
        liblmdb-dev \
        libopencv-dev \
        libprotobuf-dev \
        libsnappy-dev \
        protobuf-compiler \
        python-dev \
        python-numpy \
        python-pip \
        python-scipy && \
    rm -rf /var/lib/apt/lists/*

ENV CAFFE_ROOT=/opt/caffe
WORKDIR $CAFFE_ROOT

# FIXME: clone a specific git tag and use ARG instead of ENV once DockerHub supports this.
ENV CLONE_TAG=master

# No GPU is visible during `docker build`, so cmake cannot auto-detect the
# compute capability and compiles CUDA kernels only for a default/wrong
# architecture — that is what produces "invalid device function"
# (cudaSuccess 8 vs. 0) at runtime. Build for all supported architectures
# explicitly with CUDA_ARCH_NAME=All (or use Manual + CUDA_ARCH_BIN for a
# single target GPU).
RUN git clone -b ${CLONE_TAG} --depth 1 https://github.com/BVLC/caffe.git . && \
    for req in $(cat python/requirements.txt) pydot; do pip install --no-cache-dir $req; done && \
    mkdir build && cd build && \
    cmake -DUSE_CUDNN=1 -DCUDA_ARCH_NAME=All .. && \
    make -j"$(nproc)"

# key=value form: the legacy space-separated ENV syntax is deprecated.
ENV PYCAFFE_ROOT=$CAFFE_ROOT/python
ENV PYTHONPATH=$PYCAFFE_ROOT:$PYTHONPATH
ENV PATH=$CAFFE_ROOT/build/tools:$PYCAFFE_ROOT:$PATH

# Make libcaffe discoverable by the dynamic linker.
RUN echo "$CAFFE_ROOT/build/lib" >> /etc/ld.so.conf.d/caffe.conf && ldconfig

WORKDIR /workspace
I then start Caffe training inside the container with the following command:
nvidia-docker run -v `pwd`:`pwd` -w `pwd` -i -t caffe:gpu caffe train --solver=solver.prototxt -weights Emotiw.caffemodel
I am facing the following error:
I1111 04:25:39.775985 1 net.cpp:228] data does not need backward computation.
I1111 04:25:39.775993 1 net.cpp:270] This network produces output accuracy
I1111 04:25:39.776005 1 net.cpp:270] This network produces output loss
I1111 04:25:39.776032 1 net.cpp:283] Network initialization done.
I1111 04:25:39.776175 1 solver.cpp:60] Solver scaffolding done.
I1111 04:25:39.776871 1 caffe.cpp:155] Finetuning from Emotiw.caffemodel
I1111 04:25:48.997071 1 net.cpp:761] Ignoring source layer fc8_cat
I1111 04:27:41.813612 1 net.cpp:761] Ignoring source layer fc8_cat
I1111 04:27:41.852115 1 caffe.cpp:251] Starting Optimization
I1111 04:27:41.852200 1 solver.cpp:279] Solving PainwildNet
I1111 04:27:41.852221 1 solver.cpp:280] Learning Rate Policy: step
I1111 04:27:41.871546 1 solver.cpp:337] Iteration 0, Testing net (#0)
F1111 04:27:41.975798 1 pooling_layer.cu:212] Check failed: error == cudaSuccess (8 vs. 0) invalid device function
*** Check failure stack trace: ***
@ 0x7fcd40cf25cd google::LogMessage::Fail()
@ 0x7fcd40cf4433 google::LogMessage::SendToLog()
@ 0x7fcd40cf215b google::LogMessage::Flush()
@ 0x7fcd40cf4e1e google::LogMessageFatal::~LogMessageFatal()
@ 0x7fcd41488851 caffe::PoolingLayer<>::Forward_gpu()
@ 0x7fcd4128a652 caffe::Net<>::ForwardFromTo()
@ 0x7fcd4128a777 caffe::Net<>::Forward()
@ 0x7fcd412c244a caffe::Solver<>::Test()
@ 0x7fcd412c30ce caffe::Solver<>::TestAll()
@ 0x7fcd412c31ec caffe::Solver<>::Step()
@ 0x7fcd412c3f19 caffe::Solver<>::Solve()
@ 0x40cf5f train()
@ 0x4088d8 main
@ 0x7fcd3f205830 __libc_start_main
@ 0x4091a9 _start
@ (nil) (unknown)
*** Aborted at 1478838462 (unix time) try "date -d @1478838462" if you are using GNU date ***
PC: @ 0x7fcd3f21c186 abort
*** SIGSEGV (@0x0) received by PID 1 (TID 0x7fcd41d84ac0) from PID 0; stack trace: ***
@ 0x7fcd3f21a4a0 (unknown)
@ 0x7fcd3f21c186 abort
@ 0x7fcd40cfb12c (unknown)
@ 0x7fcd40cf25cd google::LogMessage::Fail()
@ 0x7fcd40cf4433 google::LogMessage::SendToLog()
@ 0x7fcd40cf215b google::LogMessage::Flush()
@ 0x7fcd40cf4e1e google::LogMessageFatal::~LogMessageFatal()
@ 0x7fcd41488851 caffe::PoolingLayer<>::Forward_gpu()
@ 0x7fcd4128a652 caffe::Net<>::ForwardFromTo()
@ 0x7fcd4128a777 caffe::Net<>::Forward()
@ 0x7fcd412c244a caffe::Solver<>::Test()
@ 0x7fcd412c30ce caffe::Solver<>::TestAll()
@ 0x7fcd412c31ec caffe::Solver<>::Step()
@ 0x7fcd412c3f19 caffe::Solver<>::Solve()
@ 0x40cf5f train()
@ 0x4088d8 main
@ 0x7fcd3f205830 __libc_start_main
@ 0x4091a9 _start
@ 0x0 (unknown)
# NOTE(review): the log says "Solving PainwildNet" but this file names the
# net "TempWLDNET" — confirm the solver actually loads this exact prototxt.
name: "TempWLDNET"
# Training input: image paths/labels from train.txt, resized to 256x256,
# mean-subtracted (mean.binaryproto), randomly mirrored and cropped to
# 224x224, in mini-batches of 25.
layer {
name: "data"
type: "ImageData"
top: "data"
top: "label"
include {
phase: TRAIN
}
transform_param {
mirror: true
crop_size: 224
mean_file: "mean.binaryproto"
}
image_data_param {
source: "train.txt"
batch_size: 25
new_height: 256
new_width: 256
}
}
# Test input: same preprocessing as TRAIN but deterministic (no mirroring),
# reading from test.txt.
layer {
name: "data"
type: "ImageData"
top: "data"
top: "label"
include {
phase: TEST
}
transform_param {
mirror: false
crop_size: 224
mean_file: "mean.binaryproto"
}
image_data_param {
source: "test.txt"
batch_size: 25
new_height: 256
new_width: 256
}
}
# conv1: 96 filters, 7x7, stride 2, applied to the 224x224 input crop.
layer {
name: "conv1"
type: "Convolution"
bottom: "data"
top: "conv1"
# Weights: base learning rate and weight decay.
param {
lr_mult: 1
decay_mult: 1
}
# Biases: double learning rate, no weight decay (common Caffe convention).
param {
lr_mult: 2
decay_mult: 0
}
convolution_param {
num_output: 96
kernel_size: 7
stride: 2
weight_filler {
type: "gaussian"
std: 0.01
}
bias_filler {
type: "constant"
value: 0
}
}
}
# In-place ReLU after conv1.
layer {
name: "relu1"
type: "ReLU"
bottom: "conv1"
top: "conv1"
}
# Local response normalization over a window of 5.
layer {
name: "norm1"
type: "LRN"
bottom: "conv1"
top: "norm1"
lrn_param {
local_size: 5
alpha: 0.0005
beta: 0.75
}
}
# 3x3 max pooling with stride 3 (non-overlapping windows).
layer {
name: "pool1"
type: "Pooling"
bottom: "norm1"
top: "pool1"
pooling_param {
pool: MAX
kernel_size: 3
stride: 3
}
}
# conv2: 256 filters, 5x5, pad 2.
layer {
name: "conv2"
type: "Convolution"
bottom: "pool1"
top: "conv2"
param {
lr_mult: 1
decay_mult: 1
}
param {
lr_mult: 2
decay_mult: 0
}
convolution_param {
num_output: 256
pad: 2
kernel_size: 5
weight_filler {
type: "gaussian"
std: 0.01
}
bias_filler {
type: "constant"
value: 1
}
}
}
# In-place ReLU after conv2.
layer {
name: "relu2"
type: "ReLU"
bottom: "conv2"
top: "conv2"
}
# 2x2 max pooling with stride 2.
layer {
name: "pool2"
type: "Pooling"
bottom: "conv2"
top: "pool2"
pooling_param {
pool: MAX
kernel_size: 2
stride: 2
}
}
# conv3: 512 filters, 3x3, pad 1.
layer {
name: "conv3"
type: "Convolution"
bottom: "pool2"
top: "conv3"
param {
lr_mult: 1
decay_mult: 1
}
param {
lr_mult: 2
decay_mult: 0
}
convolution_param {
num_output: 512
pad: 1
kernel_size: 3
weight_filler {
type: "gaussian"
std: 0.01
}
bias_filler {
type: "constant"
value: 0
}
}
}
layer {
name: "relu3"
type: "ReLU"
bottom: "conv3"
top: "conv3"
}
# conv4: 512 filters, 3x3, pad 1 (same shape as conv3).
layer {
name: "conv4"
type: "Convolution"
bottom: "conv3"
top: "conv4"
param {
lr_mult: 1
decay_mult: 1
}
param {
lr_mult: 2
decay_mult: 0
}
convolution_param {
num_output: 512
pad: 1
kernel_size: 3
weight_filler {
type: "gaussian"
std: 0.01
}
bias_filler {
type: "constant"
value: 1
}
}
}
layer {
name: "relu4"
type: "ReLU"
bottom: "conv4"
top: "conv4"
}
# conv5: 512 filters, 3x3, pad 1 (same shape as conv4).
layer {
name: "conv5"
type: "Convolution"
bottom: "conv4"
top: "conv5"
param {
lr_mult: 1
decay_mult: 1
}
param {
lr_mult: 2
decay_mult: 0
}
convolution_param {
num_output: 512
pad: 1
kernel_size: 3
weight_filler {
type: "gaussian"
std: 0.01
}
bias_filler {
type: "constant"
value: 0
}
}
}
layer {
name: "relu5"
type: "ReLU"
bottom: "conv5"
top: "conv5"
}
# Final spatial pooling before the fully-connected stack: 3x3 max, stride 3.
layer {
name: "pool5"
type: "Pooling"
bottom: "conv5"
top: "pool5"
pooling_param {
pool: MAX
kernel_size: 3
stride: 3
}
}
# fc6: first fully-connected layer.
# NOTE(review): num_output 4048 is unusual — VGG/AlexNet-style nets use 4096.
# Confirm 4048 matches the pretrained Emotiw.caffemodel, otherwise weight
# loading/finetuning of these layers will not line up.
layer {
name: "fc6"
type: "InnerProduct"
bottom: "pool5"
top: "fc6"
param {
lr_mult: 1
decay_mult: 1
}
param {
lr_mult: 2
decay_mult: 0
}
inner_product_param {
num_output: 4048
weight_filler {
type: "gaussian"
std: 0.005
}
bias_filler {
type: "constant"
value: 1
}
}
}
layer {
name: "relu6"
type: "ReLU"
bottom: "fc6"
top: "fc6"
}
# 50% dropout after fc6 (active at TRAIN time only).
layer {
name: "drop6"
type: "Dropout"
bottom: "fc6"
top: "fc6"
dropout_param {
dropout_ratio: 0.5
}
}
# fc7: second fully-connected layer (same 4048 width as fc6 — see note above fc6).
layer {
name: "fc7"
type: "InnerProduct"
bottom: "fc6"
top: "fc7"
# Note that lr_mult can be set to 0 to disable any fine-tuning of this, and any other, layer
param {
lr_mult: 1
decay_mult: 1
}
param {
lr_mult: 2
decay_mult: 0
}
inner_product_param {
num_output: 4048
weight_filler {
type: "gaussian"
std: 0.005
}
bias_filler {
type: "constant"
value: 1
}
}
}
layer {
name: "relu7"
type: "ReLU"
bottom: "fc7"
top: "fc7"
}
# 50% dropout after fc7.
layer {
name: "drop7"
type: "Dropout"
bottom: "fc7"
top: "fc7"
dropout_param {
dropout_ratio: 0.5
}
}
# fc8_temp: new 16-way classifier head. It is deliberately named differently
# from the pretrained net's head so Caffe re-initializes it instead of
# copying weights — consistent with the log line
# "Ignoring source layer fc8_cat" during finetuning.
layer {
name: "fc8_temp"
type: "InnerProduct"
bottom: "fc7"
top: "fc8_temp"
# lr_mult is set to higher than for other layers, because this layer is starting from random while the others are already trained
param {
lr_mult: 10
decay_mult: 1
}
param {
lr_mult: 20
decay_mult: 0
}
inner_product_param {
num_output: 16
weight_filler {
type: "gaussian"
std: 0.01
}
bias_filler {
type: "constant"
value: 0
}
}
}
# Top-1 accuracy, reported only in the TEST phase.
layer {
name: "accuracy"
type: "Accuracy"
bottom: "fc8_temp"
bottom: "label"
top: "accuracy"
include {
phase: TEST
}
}
# Softmax cross-entropy training loss over the 16 classes.
layer {
name: "loss"
type: "SoftmaxWithLoss"
bottom: "fc8_temp"
bottom: "label"
top: "loss"
}
Can someone please help me understand why I am seeing this error and how to fix it? Please let me know if you need any additional information. Thank you in advance.