Hi,
In fact it might be faster to do it in one pass. You might want to try something like this:
-- Evaluate the criterion on all slices in a single forward/backward pass by
-- folding dimension 2 of the transposed outputs into the batch dimension,
-- then unfold the resulting gradient back to the layout of `outputs`.
-- NOTE(review): assumes `outputs` is a 5-D tensor laid out as BVHWC (per the
-- layout comments below) and that `criterion` accepts the flattened
-- outputs/targets pair -- confirm against the caller.

-- Prepare outputs
local flat_outputs = outputs:transpose(2, 3) -- BVHWC -> BHVWC
local transposed_size = flat_outputs:size() -- save size for reshaping the gradient
-- transpose() only swaps strides, so the result is not contiguous; view()
-- requires a contiguous tensor, hence the copy has to happen BEFORE the view.
flat_outputs = flat_outputs:contiguous()
flat_outputs = flat_outputs:view(-1, flat_outputs:size(3), flat_outputs:size(4), flat_outputs:size(5)) -- merge BH into one dimension

-- Prepare targets
-- contiguous() comes first for the same reason as above; it returns the
-- tensor itself when the memory is already contiguous, so this costs nothing
-- in the common case.
local flat_targets = targets:contiguous():view(-1, targets:size(2), targets:size(3))

-- Get the gradient
local err = criterion:forward(flat_outputs, flat_targets)
local flat_df_do = criterion:backward(flat_outputs, flat_targets)
-- Undo the flattening: restore the BHVWC shape, swap back to BVHWC, and make
-- the result contiguous so it can be passed on to the model's backward pass.
-- NOTE(review): `df_do` is deliberately left undeclared here because the
-- surrounding code presumably owns it (e.g. as an upvalue); if it does not,
-- declare it `local` to avoid creating a global.
df_do = flat_df_do:reshape(transposed_size):transpose(2, 3):contiguous()
Best,
Adam