# Import necessary libraries
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Embedding, LSTM, Dense, Concatenate, TimeDistributed, Bidirectional, Attention
from tensorflow.keras.callbacks import EarlyStopping
# Define hyperparameters
vocab_size = 5000
max_len_text = 200
max_len_summary = 20
embedding_dim = 200
hidden_units = 256
# Load and preprocess data
with open('train.txt', 'r', encoding='utf-8') as f:
    text_lines = f.read().split('\n')
with open('summary.txt', 'r', encoding='utf-8') as f:
    summary_lines = f.read().split('\n')
# Wrap each summary in 'start'/'end' tokens so the decoder can learn where to begin and stop
summary_lines = ['start ' + line + ' end' for line in summary_lines]
# Tokenize the input and output text
input_tokenizer = Tokenizer(num_words=vocab_size)
input_tokenizer.fit_on_texts(text_lines)
input_seq = input_tokenizer.texts_to_sequences(text_lines)
input_seq = pad_sequences(input_seq, maxlen=max_len_text, padding='post')
output_tokenizer = Tokenizer(num_words=vocab_size)
output_tokenizer.fit_on_texts(summary_lines)
output_seq = output_tokenizer.texts_to_sequences(summary_lines)
output_seq = pad_sequences(output_seq, maxlen=max_len_summary, padding='post')
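# Optional sanity check on the preprocessed arrays (a quick sketch; the
# expected shapes assume the hyperparameters defined above)
print('input_seq shape:', input_seq.shape)    # expected (num_examples, max_len_text)
print('output_seq shape:', output_seq.shape)  # expected (num_examples, max_len_summary)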
# Define the encoder-decoder model with attention
# Encoder model
encoder_inputs = Input(shape=(max_len_text,))
enc_emb = Embedding(vocab_size, embedding_dim, mask_zero=True)(encoder_inputs)  # mask_zero=True masks the zero padding for downstream layers
encoder_lstm = Bidirectional(LSTM(hidden_units, return_sequences=True, return_state=True))
encoder_outputs, forward_h, forward_c, backward_h, backward_c = encoder_lstm(enc_emb)
state_h = Concatenate()([forward_h, backward_h])
state_c = Concatenate()([forward_c, backward_c])
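# The concatenated bidirectional states are 2*hidden_units wide, which is why
# the decoder LSTM below uses hidden_units*2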
# Decoder model
decoder_inputs = Input(shape=(None,))
dec_emb = Embedding(vocab_size, embedding_dim, mask_zero=True)(decoder_inputs)  # mask padded target positions so they are ignored in the loss
decoder_lstm = LSTM(hidden_units*2, return_sequences=True, return_state=True)
decoder_outputs, _, _ = decoder_lstm(dec_emb, initial_state=[state_h, state_c])
# Attention mechanism
attn_layer = Attention()
# Keras Attention expects [query, value]: the decoder states query the encoder states
attn_out = attn_layer([decoder_outputs, encoder_outputs])
# Concatenate the attention context with the decoder output at each timestep
decoder_concat_input = Concatenate(axis=-1)([decoder_outputs, attn_out])
# Dense layer
decoder_dense = TimeDistributed(Dense(vocab_size, activation='softmax'))
decoder_outputs = decoder_dense(decoder_concat_input)
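# Note: TimeDistributed is optional here; a Dense layer already applies per
# timestep when given 3-D input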
# Define the model
model = Model([encoder_inputs, decoder_inputs], decoder_outputs)
# Compile the model
model.compile(optimizer='rmsprop', loss='sparse_categorical_crossentropy')
# Define early stopping callback
early_stop = EarlyStopping(monitor='val_loss', mode='min', verbose=1, patience=2, restore_best_weights=True)
# Train the model
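# Teacher forcing: the decoder input is the summary without its last token,
# and the target is the same summary shifted one step left (past the 'start' token)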
model.fit([input_seq, output_seq[:,:-1]], output_seq[:,1:], epochs=50, batch_size=64, validation_split=0.2, callbacks=[early_stop])
# Save the model
model.save('tigrinya_summarization_model.h5')
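# Reloading for inference later (a minimal sketch; the model uses only
# built-in Keras layers, so no custom_objects argument is needed):
reloaded_model = tf.keras.models.load_model('tigrinya_summarization_model.h5')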
# Generate summaries for new text data
def generate_summary(input_text):
    # Preprocess the input text
    input_seq = input_tokenizer.texts_to_sequences([input_text])
    input_seq = pad_sequences(input_seq, maxlen=max_len_text, padding='post')
    # Greedy decoding: start from the 'start' token and feed the growing
    # summary back through the trained model one step at a time
    start_id = output_tokenizer.word_index['start']
    end_id = output_tokenizer.word_index['end']
    decoded_ids = [start_id]
    summary_words = []
    while len(summary_words) < max_len_summary:
        decoder_input = tf.expand_dims(decoded_ids, 0)
        preds = model.predict([input_seq, decoder_input], verbose=0)
        next_id = int(tf.argmax(preds[0, -1, :]))
        if next_id == end_id or next_id == 0:  # stop on 'end' or padding
            break
        summary_words.append(output_tokenizer.index_word[next_id])
        decoded_ids.append(next_id)
    return ' '.join(summary_words)
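# Example usage (a sketch; the placeholder string stands in for a real
# Tigrinya article, and output quality depends entirely on the training data):
sample_text = '...'
print(generate_summary(sample_text))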