simple variable length sequence masking with LSTM + Dense


philip....@gmail.com

Feb 2, 2017, 1:11:34 PM
to Keras-users
Hi all,

I've seen a lot of discussion about this, but no simple answers. Dense does not support masking, so how do I get around it?
If I don't mask, I'm asking the model to learn non-useful input-output mappings.

My simple-minded approach was to add an Embedding layer (with mask_zero=True) before the LSTM, but that gives a "Layer dense_1 does not support masking" error.

Without the Embedding layer the following code runs; the version below includes it and reproduces the error.

I have 2-dimensional inputs and binary one-hot outputs, with a maximum sequence length of 9 and 5 such sequences.

Many thanks for any pointers, perhaps on how to replace the Dense layer with a new Layer like "DenseWithMasking"? (A rough sketch of that idea is included after the code below.)

Phil


```python
import numpy as np

from keras.models import Sequential
from keras.engine.topology import Layer
from keras.layers import Dense, LSTM, Embedding
from keras.optimizers import SGD

def test():  
  input_n_features = 2
  input_max_length = 9
  n_classes = 2
  n_lstm_cells = 1
  n_epochs = 10
  n_sequences = 5

  train_input = np.array([[[1,1],[2,2],[3,3],[4,4],[5,5],[6,6],[7,7],[8,8],[9,9]],
                          [[1,1],[2,2],[3,3],[4,4],[5,5],[6,6],[7,7],[8,8],[0,0]], # last timestep is padding (to mask)
                          [[1,1],[2,2],[3,3],[4,4],[5,5],[6,6],[7,7],[0,0],[0,0]], # last 2 timesteps are padding
                          [[1,1],[2,2],[3,3],[4,4],[5,5],[6,6],[0,0],[0,0],[0,0]], # last 3 timesteps are padding
                          [[1,1],[2,2],[3,3],[4,4],[5,5],[0,0],[0,0],[0,0],[0,0]]], # last 4 timesteps are padding
                          dtype=np.float64)
  
  train_output =  np.array([[[0,1],[0,1],[0,1],[0,1],[0,1],[0,1],[0,1],[0,1],[0,1]],
                            [[0,1],[0,1],[0,1],[0,1],[0,1],[0,1],[0,1],[0,1],[0,0]],
                            [[1,0],[1,0],[1,0],[1,0],[1,0],[1,0],[1,0],[0,0],[0,0]],
                            [[1,0],[1,0],[1,0],[1,0],[1,0],[1,0],[0,0],[0,0],[0,0]],
                            [[1,0],[1,0],[1,0],[1,0],[1,0],[0,0],[0,0],[0,0],[0,0]]],
                            dtype=np.float64)                   
                   
                   
  model = Sequential() 
  
  # mask_zero=True makes the Embedding emit a mask, which the Dense layer below
  # rejects with "Layer dense_1 does not support masking".
  model.add(Embedding(output_dim=n_classes, input_dim=input_n_features, mask_zero=True))

  model.add(LSTM(n_lstm_cells,
                  #input_shape=(input_max_length, input_n_features),
                  return_sequences=True,
                  stateful=False,
                  init='he_normal',
                  activation='tanh'))
    
  model.add(Dense(n_classes, init='he_normal',
                  activation='softmax'))  # Dense cannot consume the propagated mask
 
  sgd = SGD()
  
  model.compile(loss='categorical_crossentropy',
                optimizer=sgd,
                #optimizer=rmsprop,
                metrics=['accuracy'])
  
  
  print('Training')
  for i in range(n_epochs):
      print('Epoch', i, '/', n_epochs)
      model.fit(train_input,
                train_output,
                verbose=1,
                nb_epoch=1,
                shuffle=True)      
      #model.reset_states()
  
  print('Predicting')
  validation_predicted_output = model.predict(train_input) 
  
if __name__ == "__main__":
  test()
  
  
```
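
For what it's worth, here is a rough, untested sketch of the "DenseWithMasking" idea mentioned above. It assumes the Keras 1.x-style compute_mask signature that matches the rest of this code; later Keras versions use compute_mask(inputs, mask=None) instead.

```python
from keras.layers import Dense

class DenseWithMasking(Dense):
    """A Dense variant that accepts an incoming mask and passes it through."""

    def __init__(self, *args, **kwargs):
        super(DenseWithMasking, self).__init__(*args, **kwargs)
        # Declaring mask support stops Keras from raising
        # "Layer ... does not support masking" when a mask reaches this layer.
        self.supports_masking = True

    def compute_mask(self, input, input_mask=None):
        # Dense acts feature-wise, so the per-timestep mask is unchanged.
        return input_mask
```

If it behaves as hoped, it would drop into the model above in place of the plain Dense.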

Philip Warrick

Feb 2, 2017, 7:14:57 PM
to Keras-users
I think I've answered my own question.

The previous discussions about the TimeDistributed layer wrapper were important:

I replaced the Dense() layer with TimeDistributed(Dense(...)) and used a Masking layer as the first layer.

That works, and the loss and accuracy metrics look better now, in the sense that they seem to be less skewed by the masked timesteps.

Am I right that the loss (and metrics like 'accuracy') are not calculated over these masked timesteps? (A rough way to check is sketched after the code below.)

The one downside is that the training time per epoch on a larger network and dataset (not shown below: a 50x50x50 LSTM stack, 20 input features, 5 one-hot outputs) seems to have increased by roughly 50%, to 300 seconds on a Titan X GPU, with the masking on.

Below is a working version of the code with the masking on (i.e., using Masking and TimeDistributed).

Maybe the docs could be clarified to indicate that if you want a Dense layer with masking, you need to use something like TimeDistributed(Dense(...)).

Any insights and answers would be most appreciated.

Thanks for the great toolkit.

Phil



 
```python
import numpy as np

from keras.models import Sequential
from keras.layers import Dense, LSTM
from keras.layers.core import Masking
from keras.layers.wrappers import TimeDistributed
from keras.optimizers import SGD

def test():
  input_n_features = 2
  input_max_length = 9
  n_classes = 2
  n_lstm_cells = 1
  n_epochs = 10
  n_sequences = 5

  train_input = np.array([[[1,1],[2,2],[3,3],[4,4],[5,5],[6,6],[7,7],[8,8],[9,9]],
                          [[1,1],[2,2],[3,3],[4,4],[5,5],[6,6],[7,7],[8,8],[0,0]], # last timestep is padding (to mask)
                          [[1,1],[2,2],[3,3],[4,4],[5,5],[6,6],[7,7],[0,0],[0,0]], # last 2 timesteps are padding
                          [[1,1],[2,2],[3,3],[4,4],[5,5],[6,6],[0,0],[0,0],[0,0]], # last 3 timesteps are padding
                          [[1,1],[2,2],[3,3],[4,4],[5,5],[0,0],[0,0],[0,0],[0,0]]], # last 4 timesteps are padding
                          dtype=np.float64)

  train_output = np.array([[[0,1],[0,1],[0,1],[0,1],[0,1],[0,1],[0,1],[0,1],[0,1]],
                           [[0,1],[0,1],[0,1],[0,1],[0,1],[0,1],[0,1],[0,1],[0,0]],
                           [[1,0],[1,0],[1,0],[1,0],[1,0],[1,0],[1,0],[0,0],[0,0]],
                           [[1,0],[1,0],[1,0],[1,0],[1,0],[1,0],[0,0],[0,0],[0,0]],
                           [[1,0],[1,0],[1,0],[1,0],[1,0],[0,0],[0,0],[0,0],[0,0]]],
                           dtype=np.float64)

  model = Sequential()

  # Masking replaces the Embedding: timesteps that are all zeros are masked downstream.
  model.add(Masking(mask_value=0., input_shape=(input_max_length, input_n_features)))

  model.add(LSTM(n_lstm_cells,
                 return_sequences=True,
                 stateful=False,
                 init='he_normal',
                 activation='tanh'))

  # Wrapping Dense in TimeDistributed lets the mask pass through to the output.
  model.add(TimeDistributed(Dense(n_classes, init='he_normal',
                                  activation='softmax')))

  sgd = SGD()

  model.compile(loss='categorical_crossentropy',
                optimizer=sgd,
                metrics=['accuracy'])

  print('Training')
  for i in range(n_epochs):
      print('Epoch', i, '/', n_epochs)
      model.fit(train_input,
                train_output,
                verbose=1,
                nb_epoch=1,
                shuffle=True)

  print('Predicting')
  validation_predicted_output = model.predict(train_input)

if __name__ == "__main__":
  test()
```
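
On the question above about whether the loss and metrics skip the masked timesteps: one way to check is to recompute the categorical crossentropy by hand, ignoring the padded positions, and compare it with what model.evaluate() reports. A minimal sketch, assuming the padded timesteps are exactly the rows whose one-hot target is all zeros (masked_crossentropy is a made-up helper, not part of Keras):

```python
import numpy as np

def masked_crossentropy(y_true, y_pred, eps=1e-7):
    # Timesteps whose one-hot target row is all zeros are treated as padding.
    valid = y_true.sum(axis=-1) > 0                       # shape (n_sequences, max_length)
    ce = -np.sum(y_true * np.log(y_pred + eps), axis=-1)  # per-timestep crossentropy
    return ce[valid].mean()                               # average over real timesteps only

# Compare against the loss from model.evaluate(train_input, train_output); if Keras
# is applying the mask to the loss, the two numbers should roughly agree.
```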

rain.bes...@seecs.edu.pk

Aug 19, 2019, 10:53:56 AM
to Keras-users
Hey, I get this error when I try this approach. Also, I'm not sure whether it is the correct way, but it is the only way in Keras that I came across during my search.
ValueError: Output tensors to a Model must be the output of a Keras `Layer` (thus holding past layer metadata). Found: <keras.layers.wrappers.TimeDistributed object at @#$@%>
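
That error usually means the TimeDistributed wrapper was handed to Model as an output without being called on a tensor first, so Keras sees a layer object where it expects a tensor. A minimal functional-API sketch of the difference (the shapes and sizes here are made up for illustration):

```python
from keras.models import Model
from keras.layers import Input, LSTM, Dense
from keras.layers.wrappers import TimeDistributed

inputs = Input(shape=(9, 2))
x = LSTM(1, return_sequences=True)(inputs)

# Wrong: passes the wrapper object itself, which triggers
# "Output tensors to a Model must be the output of a Keras `Layer`".
# model = Model(inputs, TimeDistributed(Dense(2, activation='softmax')))

# Right: call the wrapper on a tensor, so the model output is a tensor.
outputs = TimeDistributed(Dense(2, activation='softmax'))(x)
model = Model(inputs, outputs)
```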