Hi everyone,
While training my network, I get the following error:
KernelException: exception thrown during kernel execution on device Tesla V100-SXM2-32GB
I do not know exactly what causes the error, so I cannot point to a specific piece of my code to share. What might cause this error? Any ideas?
The full stack trace is given below:
ERROR: a exception was thrown during kernel execution.
Run Julia on debug level 2 for device stack traces.
ERROR: a exception was thrown during kernel execution.
Run Julia on debug level 2 for device stack traces.
ERROR: a exception was thrown during kernel execution.
Run Julia on debug level 2 for device stack traces.
Stacktrace:
[1] check_exceptions() at /kuacc/users/baristopal20/.julia/packages/CUDA/YeS8q/src/compiler/exceptions.jl:94
[2] prepare_cuda_call() at /kuacc/users/baristopal20/.julia/packages/CUDA/YeS8q/src/state.jl:85
[3] initialize_api() at /kuacc/users/baristopal20/.julia/packages/CUDA/YeS8q/lib/cudadrv/error.jl:92
[4] macro expansion at /kuacc/users/baristopal20/.julia/packages/CUDA/YeS8q/lib/cudadrv/libcuda.jl:505 [inlined]
[5] macro expansion at /kuacc/users/baristopal20/.julia/packages/CUDA/YeS8q/lib/cudadrv/error.jl:102 [inlined]
[6] cuMemcpyDtoH_v2(::Ptr{Float32}, ::CUDA.CuPtr{Nothing}, ::Int64) at /kuacc/users/baristopal20/.julia/packages/CUDA/YeS8q/lib/utils/call.jl:93
[7] _unsafe_copy!(::Array{Float32,3}, ::Int64, ::KnetArray{Float32,3}, ::Int64, ::Int64) at /kuacc/users/baristopal20/.julia/packages/Knet/C0PoK/src/knetarrays/karray.jl:102
[8] Array at /kuacc/users/baristopal20/.julia/packages/Knet/C0PoK/src/knetarrays/karray.jl:115 [inlined]
[9] Array at /kuacc/users/baristopal20/.julia/packages/Knet/C0PoK/src/knetarrays/karray.jl:113 [inlined]
[10] get_loss(::AutoGrad.Result{KnetArray{Float32,3}}, ::AutoGrad.Result{KnetArray{Float32,3}}, ::AutoGrad.Result{KnetArray{Float32,3}}, ::Array{Any,1}, ::Array{Float64,2}; mode::Int64) at /scratch/users/baristopal20/retinaface/BBTNet/models/retinaface.jl:176
[11] (::RetinaFace)(::KnetArray{Float32,4}, ::Array{Any,1}, ::Int64, ::Bool, ::Float64) at /scratch/users/baristopal20/retinaface/BBTNet/models/retinaface.jl:132
[12] (::Knet.Train20.var"#27#28"{Knet.Train20.Minimize{Array{Tuple{KnetArray{Float32,4},Array{Any,1},Int64,Bool,Float64},1}},Tuple{KnetArray{Float32,4},Array{Any,1},Int64,Bool,Float64}})() at /kuacc/users/baristopal20/.julia/packages/AutoGrad/VFrAv/src/core.jl:205
[13] differentiate(::Function; o::Base.Iterators.Pairs{Union{},Union{},Tuple{},NamedTuple{(),Tuple{}}}) at /kuacc/users/baristopal20/.julia/packages/AutoGrad/VFrAv/src/core.jl:144
[14] differentiate at /kuacc/users/baristopal20/.julia/packages/AutoGrad/VFrAv/src/core.jl:135 [inlined]
[15] iterate(::Knet.Train20.Minimize{Array{Tuple{KnetArray{Float32,4},Array{Any,1},Int64,Bool,Float64},1}}) at /kuacc/users/baristopal20/.julia/packages/Knet/C0PoK/src/train20/train.jl:26
[16] momentum!(::RetinaFace, ::Vararg{Any,N} where N; o::Base.Iterators.Pairs{Symbol,Float64,Tuple{Symbol,Symbol},NamedTuple{(:lr, :gamma),Tuple{Float64,Float64}}}) at /kuacc/users/baristopal20/.julia/packages/Knet/C0PoK/src/train20/update.jl:181
[17] train_model(::RetinaFace, ::WIDER_Data; val_data::Nothing, save_dir::String) at /scratch/users/baristopal20/retinaface/BBTNet/models/retinaface.jl:276
[18] top-level scope at /scratch/users/baristopal20/retinaface/train.jl:21
[19] include(::Function, ::Module, ::String) at ./Base.jl:380
[20] include(::Module, ::String) at ./Base.jl:368
[21] exec_options(::Base.JLOptions) at ./client.jl:296
[22] _start() at ./client.jl:506
ERROR: LoadError: KernelException: exception thrown during kernel execution on device Tesla V100-SXM2-32GB
Stacktrace:
[1] differentiate(::Function; o::Base.Iterators.Pairs{Union{},Union{},Tuple{},NamedTuple{(),Tuple{}}}) at /kuacc/users/baristopal20/.julia/packages/AutoGrad/VFrAv/src/core.jl:148
[2] differentiate at /kuacc/users/baristopal20/.julia/packages/AutoGrad/VFrAv/src/core.jl:135 [inlined]
[3] iterate(::Knet.Train20.Minimize{Array{Tuple{KnetArray{Float32,4},Array{Any,1},Int64,Bool,Float64},1}}) at /kuacc/users/baristopal20/.julia/packages/Knet/C0PoK/src/train20/train.jl:26
[4] momentum!(::RetinaFace, ::Vararg{Any,N} where N; o::Base.Iterators.Pairs{Symbol,Float64,Tuple{Symbol,Symbol},NamedTuple{(:lr, :gamma),Tuple{Float64,Float64}}}) at /kuacc/users/baristopal20/.julia/packages/Knet/C0PoK/src/train20/update.jl:181
[5] train_model(::RetinaFace, ::WIDER_Data; val_data::Nothing, save_dir::String) at /scratch/users/baristopal20/retinaface/BBTNet/models/retinaface.jl:276
[6] top-level scope at /scratch/users/baristopal20/retinaface/train.jl:21
[7] include(::Function, ::Module, ::String) at ./Base.jl:380
[8] include(::Module, ::String) at ./Base.jl:368
[9] exec_options(::Base.JLOptions) at ./client.jl:296
[10] _start() at ./client.jl:506
in expression starting at /scratch/users/baristopal20/retinaface/train.jl:21
caused by [exception 1]
KernelException: exception thrown during kernel execution on device Tesla V100-SXM2-32GB
Stacktrace:
[1] check_exceptions() at /kuacc/users/baristopal20/.julia/packages/CUDA/YeS8q/src/compiler/exceptions.jl:94
[2] prepare_cuda_call() at /kuacc/users/baristopal20/.julia/packages/CUDA/YeS8q/src/state.jl:85
[3] initialize_api() at /kuacc/users/baristopal20/.julia/packages/CUDA/YeS8q/lib/cudadrv/error.jl:92
[4] macro expansion at /kuacc/users/baristopal20/.julia/packages/CUDA/YeS8q/lib/cudadrv/libcuda.jl:505 [inlined]
[5] macro expansion at /kuacc/users/baristopal20/.julia/packages/CUDA/YeS8q/lib/cudadrv/error.jl:102 [inlined]
[6] cuMemcpyDtoH_v2(::Ptr{Float32}, ::CUDA.CuPtr{Nothing}, ::Int64) at /kuacc/users/baristopal20/.julia/packages/CUDA/YeS8q/lib/utils/call.jl:93
[7] _unsafe_copy!(::Array{Float32,3}, ::Int64, ::KnetArray{Float32,3}, ::Int64, ::Int64) at /kuacc/users/baristopal20/.julia/packages/Knet/C0PoK/src/knetarrays/karray.jl:102
[8] Array at /kuacc/users/baristopal20/.julia/packages/Knet/C0PoK/src/knetarrays/karray.jl:115 [inlined]
[9] Array at /kuacc/users/baristopal20/.julia/packages/Knet/C0PoK/src/knetarrays/karray.jl:113 [inlined]
[10] get_loss(::AutoGrad.Result{KnetArray{Float32,3}}, ::AutoGrad.Result{KnetArray{Float32,3}}, ::AutoGrad.Result{KnetArray{Float32,3}}, ::Array{Any,1}, ::Array{Float64,2}; mode::Int64) at /scratch/users/baristopal20/retinaface/BBTNet/models/retinaface.jl:176
[11] (::RetinaFace)(::KnetArray{Float32,4}, ::Array{Any,1}, ::Int64, ::Bool, ::Float64) at /scratch/users/baristopal20/retinaface/BBTNet/models/retinaface.jl:132
[12] (::Knet.Train20.var"#27#28"{Knet.Train20.Minimize{Array{Tuple{KnetArray{Float32,4},Array{Any,1},Int64,Bool,Float64},1}},Tuple{KnetArray{Float32,4},Array{Any,1},Int64,Bool,Float64}})() at /kuacc/users/baristopal20/.julia/packages/AutoGrad/VFrAv/src/core.jl:205
[13] differentiate(::Function; o::Base.Iterators.Pairs{Union{},Union{},Tuple{},NamedTuple{(),Tuple{}}}) at /kuacc/users/baristopal20/.julia/packages/AutoGrad/VFrAv/src/core.jl:144
[14] differentiate at /kuacc/users/baristopal20/.julia/packages/AutoGrad/VFrAv/src/core.jl:135 [inlined]
[15] iterate(::Knet.Train20.Minimize{Array{Tuple{KnetArray{Float32,4},Array{Any,1},Int64,Bool,Float64},1}}) at /kuacc/users/baristopal20/.julia/packages/Knet/C0PoK/src/train20/train.jl:26
[16] momentum!(::RetinaFace, ::Vararg{Any,N} where N; o::Base.Iterators.Pairs{Symbol,Float64,Tuple{Symbol,Symbol},NamedTuple{(:lr, :gamma),Tuple{Float64,Float64}}}) at /kuacc/users/baristopal20/.julia/packages/Knet/C0PoK/src/train20/update.jl:181
[17] train_model(::RetinaFace, ::WIDER_Data; val_data::Nothing, save_dir::String) at /scratch/users/baristopal20/retinaface/BBTNet/models/retinaface.jl:276
[18] top-level scope at /scratch/users/baristopal20/retinaface/train.jl:21
[19] include(::Function, ::Module, ::String) at ./Base.jl:380
[20] include(::Module, ::String) at ./Base.jl:368
[21] exec_options(::Base.JLOptions) at ./client.jl:296
[22] _start() at ./client.jl:506
make: *** [train] Error 1
Thanks.
Barış Batuhan Topal
M.Sc. Computer Science Student
Koç University