Gradient is NaN


fausto....@gmail.com

Jul 31, 2020, 4:22:09 AM
to theano-users

I have a problem with Theano: the Jacobian of a Theano function, computed by automatic differentiation, comes out as all NaNs. I suspect the problem might be the switch statements, because the gradient calculation worked for a similar function that didn't have any switch statements; in theory, though, switch should not break the gradient. I was wondering what could be causing this.
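(As a sanity check that switch on its own differentiates fine, here is a minimal sketch with toy values, unrelated to the model itself:)

```python
import numpy as np
import theano
import theano.tensor as T

# Minimal sketch: the gradient through a bare T.switch is well-defined.
x = T.dvector("x")
y = T.switch(x > 0, x ** 2, 0.0)             # piecewise function built with switch
grad_fn = theano.function([x], T.grad(y.sum(), x))
print(grad_fn(np.array([-1.0, 0.5, 2.0])))   # finite gradients expected
```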

The following function returns the Theano function from a bunch of inputs to the Jacobian of the flattened s3 variable:

```python
import numpy as np
import theano
import theano.tensor as T


def theano_RSA(
        possible_signals_array=T.ltensor3("possible_signals_array"),
        real_signals_indices=T.lvector("real_signals_indices"),
        alphas=T.dvector("alphas"),
        choice_alphas=T.dvector("choice_alphas"),
        cost_factors=T.dvector("cost_factors"),
        objective_costs_possible=T.dvector("objective_costs_possible"),
        at_most_as_costly=T.lmatrix("at_most_as_costly"),
        types=T.lvector("types"),
        distances=T.dmatrix("distances")):

    real_signals_array = possible_signals_array[:, real_signals_indices]
    objective_costs_real = objective_costs_possible[real_signals_indices]

    max_pic_size = possible_signals_array.shape[0] - 1

    considered_signals = at_most_as_costly & types.dimshuffle("x", 0)

    language_l = possible_signals_array / possible_signals_array.sum(axis=-1, keepdims=True)
    expected_dist_l0 = T.tensordot(language_l, distances, axes=[[2], [0]])

    unnorm_l0 = T.exp(choice_alphas[:, np.newaxis, np.newaxis, np.newaxis] * -expected_dist_l0)
    shape = unnorm_l0.shape
    _, picsize_index, _, state_index = T.mgrid[
        0:shape[0],
        0:shape[1],
        0:shape[2],
        0:shape[3]
    ]
    # zero out states that exceed the picture size
    unnorm_l0 = T.switch(state_index > picsize_index, 0, unnorm_l0)
    l0 = unnorm_l0 / T.sum(unnorm_l0, axis=-1, keepdims=True)

    l0_extended = l0[:, :, np.newaxis, :, :]

    costs_possible = T.outer(cost_factors, objective_costs_possible)
    utility_l0 = T.log(l0_extended)
    unnorm_s1 = T.exp(
        alphas[:, np.newaxis, np.newaxis, np.newaxis, np.newaxis] *
        (utility_l0 - costs_possible[:, np.newaxis, np.newaxis, :, np.newaxis])
    )

    unnorm_s1 = unnorm_s1 * considered_signals[np.newaxis, np.newaxis, :, :, np.newaxis]
    s1 = unnorm_s1 / unnorm_s1.sum(axis=-2, keepdims=True)
    # replace NaNs from 0/0 normalisations with 0
    s1 = T.switch(T.isnan(s1), 0., s1)

    l2 = s1 / s1.sum(axis=-1, keepdims=True)
    expected_dist_l2 = T.tensordot(l2, distances, axes=[[4], [0]])

    unnorm_l2 = T.exp(choice_alphas[:, np.newaxis, np.newaxis, np.newaxis, np.newaxis] * -expected_dist_l2)
    shape = unnorm_l2.shape
    _, picsize_index, _, _, state_index = T.mgrid[
        0:shape[0],
        0:shape[1],
        0:shape[2],
        0:shape[3],
        0:shape[4]
    ]
    # zero out states that exceed the picture size
    unnorm_l2 = T.switch(state_index > picsize_index, 0, unnorm_l2)
    l2 = unnorm_l2 / T.sum(unnorm_l2, axis=-1, keepdims=True)

    l2_language = l2[:, :, T.arange(real_signals_indices.shape[0]), real_signals_indices, :].squeeze()
    costs_real = T.outer(cost_factors, objective_costs_real)

    utility_l2 = T.log(l2_language)

    unnorm_s3 = T.exp(
        alphas[:, np.newaxis, np.newaxis, np.newaxis] *
        (utility_l2 - costs_real[:, np.newaxis, :, np.newaxis])
    )

    s3 = unnorm_s3 / unnorm_s3.sum(axis=-2, keepdims=True)
    # replace NaNs from 0/0 normalisations with 0
    s3 = T.switch(T.isnan(s3), 0, s3)

    # Jacobian of the flattened s3 with respect to alphas
    return_value = T.jacobian(s3.flatten(), alphas)

    return theano.function(
        [
            possible_signals_array,
            real_signals_indices,
            alphas,
            choice_alphas,
            cost_factors,
            objective_costs_possible,
            at_most_as_costly,
            types,
            distances
        ],
        return_value,
        on_unused_input='warn'
    )
```

So, to compile the Jacobian function:

```python
s3_function_grad = theano_RSA()
```
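(For reference, the optimized graph of the compiled function can be inspected like this, in case someone spots something I'm missing:)

```python
import theano

# Print the optimized graph of the compiled Jacobian function for inspection.
theano.printing.debugprint(s3_function_grad)
```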

To evaluate the function at a point, create some toy data:

```python
num_participants = 1
num_states = 2

possible_signals_array = np.array([
    [[0, 0],
     [0, 0]],
    [[1, 0],
     [0, 1]]
])
real_signals_indices = np.array([0])
costs_possible = np.array([1, 1])
at_most_as_costly = np.array([
    [1, 1],
    [1, 1]
])
types = np.array([1, 1])
distances = np.array([
    [0, 1],
    [1, 0]
])

picked_signals_indices = np.array([0])
picsizes_values = np.array([1])
participants_indices = np.array([0])
states_values = np.array([1])

alphas = np.array([1.])
choice_alphas = np.array([1.])
cost_factors = np.array([0.01])
```


And finally, evaluate the gradient, which comes out as all NaNs:

```python
s3_function_grad(
    possible_signals_array=possible_signals_array,
    real_signals_indices=real_signals_indices,
    alphas=alphas,
    choice_alphas=choice_alphas,
    cost_factors=cost_factors,
    objective_costs_possible=costs_possible,
    at_most_as_costly=at_most_as_costly,
    types=types,
    distances=distances
)
```
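In case it helps to track down where the NaNs first appear, one thing I'm considering is recompiling with Theano's NanGuardMode, so that evaluation raises an error at the first op that produces a NaN or inf instead of propagating it silently. A sketch of what I mean (it assumes the `theano.function` call inside `theano_RSA` is given the extra `mode` argument):

```python
from theano.compile.nanguardmode import NanGuardMode

# Sketch: pass this mode to the theano.function call inside theano_RSA so the
# first apply node that produces a NaN/inf raises an error during evaluation.
nan_guard = NanGuardMode(nan_is_error=True, inf_is_error=True, big_is_error=False)
# e.g. theano.function([...], return_value, on_unused_input='warn', mode=nan_guard)
```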
