I am trying to do a relatively simple operation: computing an attention mask over the activations produced by an LSTM that sits on top of an Embedding layer, which crucially uses mask_zero=True.
I can get this working without masking, but not with masking, because I am using Flatten and Reshape layers, which do not support masking, so I am stuck. Can anyone suggest how to accomplish this? Can I get away with not using masking somehow? Should I write a custom layer, and if so, could someone provide an example?
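The only custom-layer idea I have come up with so far is a pass-through layer that swallows the mask before Flatten ever sees it. Here is a rough, untested sketch (assuming the Keras 1.x Layer API in keras.engine.topology; DropMask is just a name I made up):

from keras.engine.topology import Layer

class DropMask(Layer):
    """Pass the input through unchanged, but stop the mask from propagating."""
    def __init__(self, **kwargs):
        self.supports_masking = True  # accept an incoming mask without complaining
        super(DropMask, self).__init__(**kwargs)

    def call(self, x, mask=None):
        return x  # values are untouched

    def compute_mask(self, input, input_mask=None):
        return None  # downstream layers (Flatten, RepeatVector, ...) see no mask

I am not sure whether silently discarding the mask defeats the purpose of mask_zero=True in the first place, though. Here is the full model as it stands, without any workaround: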
from keras.layers import Input, Embedding, Dense, LSTM, merge, Activation, Permute
from keras.layers import Flatten, TimeDistributed, RepeatVector
from keras.layers.convolutional import AveragePooling1D
from keras.models import Model
max_doclen = 12
word_dim, vocab_size = 5, 10
nb_class = 2
doc_input = Input(shape=(max_doclen,), dtype='int32')  # sequences of word ids, padded with 0
# embed and lstm the document
embedded = Embedding(output_dim=word_dim, input_dim=vocab_size, input_length=max_doclen, mask_zero=True)(doc_input)
activations = LSTM(16, return_sequences=True)(embedded)  # (batch, max_doclen, 16)
# attention
mask = TimeDistributed(Dense(1))(activations)  # one attention score per timestep: (batch, max_doclen, 1)
mask = Flatten()(mask)  # flatten to (batch, max_doclen) for RepeatVector - Flatten DOES NOT SUPPORT MASKING!
mask = Activation('softmax')(mask)  # attention weights over the timesteps
mask = RepeatVector(16)(mask)  # repeat the weights for each of the 16 LSTM units: (batch, 16, max_doclen)
mask = Permute([2, 1])(mask)  # (batch, max_doclen, 16), same shape as the LSTM activations
# apply mask
activations = merge([activations, mask], mode='mul')  # element-wise weighting of the activations
activations = AveragePooling1D(pool_length=max_doclen)(activations)  # average over time: (batch, 1, 16)
activations = Flatten()(activations)  # (batch, 16)
probas = Dense(nb_class, activation='softmax')(activations)
# compile
model = Model(input=doc_input, output=probas)
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
model.summary()
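The other option I have considered, in case I can get away with not using masking at all, is to set mask_zero=False and rebuild the padding mask by hand from the integer input, pushing the padded positions to a large negative value before the softmax so they end up with roughly zero attention weight. A rough, untested sketch of just the attention-score part (Keras 1.x merge with a callable mode; -1e9 is an arbitrary large negative constant, and this assumes padding tokens have id 0):

from keras import backend as K
from keras.layers import Lambda

# 1.0 where the token is real, 0.0 where it is padding (id 0)
pad_mask = Lambda(lambda x: K.cast(K.greater(x, 0), 'float32'),
                  output_shape=(max_doclen,))(doc_input)

# activations here is the return_sequences output of the LSTM
scores = TimeDistributed(Dense(1))(activations)  # (batch, max_doclen, 1)
scores = Flatten()(scores)                       # (batch, max_doclen)
# padded positions -> very negative -> ~0 weight after softmax
scores = merge([scores, pad_mask],
               mode=lambda t: t[0] + (1.0 - t[1]) * -1e9,
               output_shape=lambda shapes: shapes[0])
attn = Activation('softmax')(scores)
# ... then RepeatVector / Permute / merge(mode='mul') as above

Is that a sane way to go, or is there a cleaner, masking-aware way of doing this?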