Gradient is NaN


fausto....@gmail.com

Jul 31, 2020, 4:22:09 AM
to theano-users

I have a problem with Theano: the Jacobian of a Theano function, computed by automatic differentiation, comes out as all NaNs. I suspect the problem might be the switch statements, because the gradient calculation worked for a similar function that didn't have any switch statements; in theory, though, switch should not break the gradient. I was wondering what could be causing this.
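(As a sanity check that switch on its own differentiates fine, here is a minimal sketch with toy values, unrelated to the model itself:)

```python
import numpy as np
import theano
import theano.tensor as T

# Minimal sketch: the gradient through a bare T.switch is well-defined.
x = T.dvector("x")
y = T.switch(x > 0, x ** 2, 0.0)             # piecewise function built with switch
grad_fn = theano.function([x], T.grad(y.sum(), x))
print(grad_fn(np.array([-1.0, 0.5, 2.0])))   # finite gradients expected
```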

The following function returns the Theano function from a bunch of inputs to the Jacobian of the flattened s3 variable:

```python
import numpy as np
import theano
import theano.tensor as T


def theano_RSA(
        possible_signals_array=T.ltensor3("possible_signals_array"),
        real_signals_indices=T.lvector("real_signals_indices"),
        alphas=T.dvector("alphas"),
        choice_alphas=T.dvector("choice_alphas"),
        cost_factors=T.dvector("cost_factors"),
        objective_costs_possible=T.dvector("objective_costs_possible"),
        at_most_as_costly=T.lmatrix("at_most_as_costly"),
        types=T.lvector("types"),
        distances=T.dmatrix("distances")):

    real_signals_array = possible_signals_array[:, real_signals_indices]
    objective_costs_real = objective_costs_possible[real_signals_indices]

    max_pic_size = possible_signals_array.shape[0] - 1

    considered_signals = at_most_as_costly & types.dimshuffle("x", 0)

    language_l = possible_signals_array / possible_signals_array.sum(axis=-1, keepdims=True)
    expected_dist_l0 = T.tensordot(language_l, distances, axes=[[2], [0]])

    unnorm_l0 = T.exp(choice_alphas[:, np.newaxis, np.newaxis, np.newaxis] * -expected_dist_l0)
    shape = unnorm_l0.shape
    _, picsize_index, _, state_index = T.mgrid[
        0:shape[0],
        0:shape[1],
        0:shape[2],
        0:shape[3]
    ]
    # zero out states that exceed the picture size
    unnorm_l0 = T.switch(state_index > picsize_index, 0, unnorm_l0)
    l0 = unnorm_l0 / T.sum(unnorm_l0, axis=-1, keepdims=True)

    l0_extended = l0[:, :, np.newaxis, :, :]

    costs_possible = T.outer(cost_factors, objective_costs_possible)
    utility_l0 = T.log(l0_extended)
    unnorm_s1 = T.exp(
        alphas[:, np.newaxis, np.newaxis, np.newaxis, np.newaxis] *
        (utility_l0 - costs_possible[:, np.newaxis, np.newaxis, :, np.newaxis])
    )

    unnorm_s1 = unnorm_s1 * considered_signals[np.newaxis, np.newaxis, :, :, np.newaxis]
    s1 = unnorm_s1 / unnorm_s1.sum(axis=-2, keepdims=True)
    # replace NaNs from 0/0 normalisations with 0
    s1 = T.switch(T.isnan(s1), 0., s1)

    l2 = s1 / s1.sum(axis=-1, keepdims=True)
    expected_dist_l2 = T.tensordot(l2, distances, axes=[[4], [0]])

    unnorm_l2 = T.exp(choice_alphas[:, np.newaxis, np.newaxis, np.newaxis, np.newaxis] * -expected_dist_l2)
    shape = unnorm_l2.shape
    _, picsize_index, _, _, state_index = T.mgrid[
        0:shape[0],
        0:shape[1],
        0:shape[2],
        0:shape[3],
        0:shape[4]
    ]
    # zero out states that exceed the picture size
    unnorm_l2 = T.switch(state_index > picsize_index, 0, unnorm_l2)
    l2 = unnorm_l2 / T.sum(unnorm_l2, axis=-1, keepdims=True)

    l2_language = l2[:, :, T.arange(real_signals_indices.shape[0]), real_signals_indices, :].squeeze()
    costs_real = T.outer(cost_factors, objective_costs_real)

    utility_l2 = T.log(l2_language)

    unnorm_s3 = T.exp(
        alphas[:, np.newaxis, np.newaxis, np.newaxis] *
        (utility_l2 - costs_real[:, np.newaxis, :, np.newaxis])
    )

    s3 = unnorm_s3 / unnorm_s3.sum(axis=-2, keepdims=True)
    # replace NaNs from 0/0 normalisations with 0
    s3 = T.switch(T.isnan(s3), 0, s3)

    # Jacobian of the flattened s3 with respect to alphas
    return_value = T.jacobian(s3.flatten(), alphas)

    return theano.function(
        [
            possible_signals_array,
            real_signals_indices,
            alphas,
            choice_alphas,
            cost_factors,
            objective_costs_possible,
            at_most_as_costly,
            types,
            distances
        ],
        return_value,
        on_unused_input='warn'
    )
```

So, to compile the Jacobian function:

```python
s3_function_grad = theano_RSA()
```
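(For reference, the optimized graph of the compiled function can be inspected like this, in case someone spots something I'm missing:)

```python
import theano

# Print the optimized graph of the compiled Jacobian function for inspection.
theano.printing.debugprint(s3_function_grad)
```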

To evaluate the function at a point, create some toy data:

```python
num_participants = 1
num_states = 2

possible_signals_array = np.array([
    [[0, 0],
     [0, 0]],
    [[1, 0],
     [0, 1]]
])
real_signals_indices = np.array([0])
costs_possible = np.array([1, 1])
at_most_as_costly = np.array([
    [1, 1],
    [1, 1]
])
types = np.array([1, 1])
distances = np.array([
    [0, 1],
    [1, 0]
])

picked_signals_indices = np.array([0])
picsizes_values = np.array([1])
participants_indices = np.array([0])
states_values = np.array([1])

alphas = np.array([1.])
choice_alphas = np.array([1.])
cost_factors = np.array([0.01])
```


And finally, evaluate the gradient, which comes out as all NaNs:

```python
s3_function_grad(
    possible_signals_array=possible_signals_array,
    real_signals_indices=real_signals_indices,
    alphas=alphas,
    choice_alphas=choice_alphas,
    cost_factors=cost_factors,
    objective_costs_possible=costs_possible,
    at_most_as_costly=at_most_as_costly,
    types=types,
    distances=distances
)
```
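In case it helps to track down where the NaNs first appear, one thing I'm considering is recompiling with Theano's NanGuardMode, so that evaluation raises an error at the first op that produces a NaN or inf instead of propagating it silently. A sketch of what I mean (it assumes the `theano.function` call inside `theano_RSA` is given the extra `mode` argument):

```python
from theano.compile.nanguardmode import NanGuardMode

# Sketch: pass this mode to the theano.function call inside theano_RSA so the
# first apply node that produces a NaN/inf raises an error during evaluation.
nan_guard = NanGuardMode(nan_is_error=True, inf_is_error=True, big_is_error=False)
# e.g. theano.function([...], return_value, on_unused_input='warn', mode=nan_guard)
```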
