HTH.
// Backward pass of cross-channel LRN, sliding-window portion (interior of a
// larger function/kernel: head, post_pad, size, step, cache_ratio,
// negative_beta and the data pointers are set up by the enclosing scope,
// which is not visible here -- TODO confirm against the full kernel).
//
// Prerequisite: the forward pass must have been performed first, so that
//   scale[i]    = k + alpha/size * sum( a[j]^2 )  over the local window, and
//   top_data[i] = b[i] = a[i] * scale[i]^(-beta), where a[i] = bottom_data[i].
//
// Running sum of top_diff[j] * a[j] * scale[j]^(-beta-1) over the window.
Dtype accum_ratio = 0;
// Prime the accumulator over the leading part of the window. Since
// top_data = a * scale^(-beta), dividing top_data by scale gives
// a * scale^(-beta-1), so each term is top_diff * a * scale^(-beta-1).
while (head < post_pad) {
accum_ratio += top_diff[head * step] * top_data[head * step] / scale[head * step];
++head;
}
// Main sweep: each newly entered element extends the running sum, and the
// gradient for position (head - post_pad) is written out. Until head reaches
// size, nothing needs to be subtracted from the running sum (the
// window-shrinking phase presumably follows this fragment -- the loop's
// closing brace and any trailing phase are outside the visible view).
while (head < size) {
// Add the contribution of the element entering the window:
// top_diff * a * scale^(-beta-1).
accum_ratio += top_diff[head * step] * top_data[head * step] / scale[head * step];
// Gradient w.r.t. the input at index i = head - post_pad:
//   bottom_diff[i] = top_diff[i] * scale[i]^(-beta)
//                    - cache_ratio * a[i] * accum_ratio
// where cache_ratio = Dtype(2. * alpha_ * beta_ / size_) per the original
// author's note, and accum_ratio = sum_j( top_diff[j] * a[j] * scale[j]^(-beta-1) ).
bottom_diff[(head - post_pad) * step] =
// First term: d b[i] / d a[i] (diagonal part), top_diff * scale^(-beta).
top_diff[(head - post_pad) * step] * pow(scale[(head - post_pad) * step], negative_beta)
// Second term: cross-channel part, -2*alpha*beta/size * a[i] * accum_ratio.
- cache_ratio * bottom_data[(head - post_pad) * step] * accum_ratio;
// This matches direct differentiation of the LRN forward expression:
//   d b[c]/d a[c] = scale^(-beta) - 2*alpha*beta/size * a[c]^2 * scale^(-beta-1)
//   d b[c]/d a[o] = -2*alpha*beta/size * a[c] * a[o] * scale^(-beta-1)
// so summing the chain rule over every output whose window contains a[i]
// yields exactly the two terms computed above.
++head;