#!/usr/bin/env python
# -*- coding: utf-8 -*-
from __future__ import print_function
import sys
import json
import pandas as pd
from keras.models import Sequential
from keras.engine.training import slice_X
from keras.layers.core import Activation, RepeatVector, Dense
from keras.layers import recurrent, TimeDistributed
import numpy as np
from six.moves import range
class CharacterTable(object):
'''
Given a set of characters:
+ Encode them to a one hot integer representation
+ Decode the one hot integer representation to their character output
+ Decode a vector of probabilties to their character output
'''
def __init__(self, chars, maxlen):
self.chars = sorted(set(chars))
self.char_indices = dict((c, i) for i, c in enumerate(self.chars))
self.indices_char = dict((i, c) for i, c in enumerate(self.chars))
self.maxlen = maxlen
def encode(self, C, maxlen=None):
maxlen = maxlen if maxlen else self.maxlen
X = np.zeros((maxlen, len(self.chars)))
for i, c in enumerate(C):
X[i, self.char_indices[c]] = 1
return X
def decode(self, X, calc_argmax=True):
if calc_argmax:
X = X.argmax(axis=-1)
return ''.join(self.indices_char[x] for x in X)
class colors:
ok = '\033[92m'
fail = '\033[91m'
close = '\033[0m'
INVERT = True
HIDDEN_SIZE = 128
BATCH_SIZE = 64
LAYERS = 3
# Try replacing GRU, or SimpleRNN
RNN = recurrent.LSTM
def main():
"""
Epitope_core = answers
Antigen = questions
"""
epi_antigen_df = pd.io.parsers.read_table("http://dpaste.com/0BPTWTN.txt")
antigens = epi_antigen_df["Antigen"].tolist()
epitopes = epi_antigen_df["Epitope Core"].tolist()
allchars = "".join(antigens+epitopes)
allchars = list(set(allchars))
aa_chars = "".join(allchars)
sys.stderr.write(aa_chars + "\n")
max_antigen_len = len(max(antigens, key=len))
max_epitope_len = len(max(epitopes, key=len))
X = np.zeros((len(antigens),max_antigen_len, len(aa_chars)),dtype=np.bool)
y = np.zeros((len(epitopes),max_epitope_len, len(aa_chars)),dtype=np.bool)
ctable = CharacterTable(aa_chars, max_antigen_len)
sys.stderr.write("Begin vectorization\n")
for i, antigen in enumerate(antigens):
X[i] = ctable.encode(antigen, maxlen=max_antigen_len)
for i, epitope in enumerate(epitopes):
y[i] = ctable.encode(epitope, maxlen=max_epitope_len)
# Shuffle (X, y) in unison as the later parts of X will almost all be larger digits
indices = np.arange(len(y))
np.random.shuffle(indices)
X = X[indices]
y = y[indices]
# Explicitly set apart 10% for validation data that we never train over
split_at = len(X) - len(X) / 10
(X_train, X_val) = (slice_X(X, 0, split_at), slice_X(X, split_at))
(y_train, y_val) = (y[:split_at], y[split_at:])
sys.stderr.write("Build model\n")
model = Sequential()
# "Encode" the input sequence using an RNN, producing an output of HIDDEN_SIZE
# note: in a situation where your input sequences have a variable length,
# use input_shape=(None, nb_feature).
model.add(RNN(HIDDEN_SIZE, input_shape=(max_antigen_len, len(aa_chars))))
# For the decoder's input, we repeat the encoded input for each time step
model.add(RepeatVector(max_epitope_len))
# The decoder RNN could be multiple layers stacked or a single layer
for _ in range(LAYERS):
model.add(RNN(HIDDEN_SIZE, return_sequences=True))
# For each of step of the output sequence, decide which character should be chosen
model.add(TimeDistributed(Dense(len(aa_chars))))
model.add(Activation('softmax'))
model.compile(loss='categorical_crossentropy',
optimizer='adam',
metrics=['accuracy'])
# Train the model each generation and show predictions against the validation dataset
for iteration in range(1, 200):
print()
print('-' * 50)
print('Iteration', iteration)
model.fit(X_train, y_train, batch_size=BATCH_SIZE, nb_epoch=5,
validation_data=(X_val, y_val))
###
# Select 10 samples from the validation set at random so we can visualize errors
for i in range(10):
ind = np.random.randint(0, len(X_val))
rowX, rowy = X_val[np.array([ind])], y_val[np.array([ind])]
preds = model.predict_classes(rowX, verbose=0)
q = ctable.decode(rowX[0])
correct = ctable.decode(rowy[0])
guess = ctable.decode(preds[0], calc_argmax=False)
# print('Q', q[::-1] if INVERT else q)
print('T', correct)
print(colors.ok + '☑' + colors.close if correct == guess else colors.fail + '☒' + colors.close, guess)
print('---')
if __name__ == '__main__':
main()
Unless the image of the data is truncated, I don't see that the Epitope is a substring of the Antigen, but a shorter different sequence.
In the case that the Epitope is always a substring from the antigen, you could tackle the problem as a labeling problem (for each character of the antigen, decide if it is a part of the Epitope sequence), instead of a RNN encoder-decoder.
Also, I may be wrong, but, does your model correctly model the padded sequences? Do you have an output class for the padding character? Otherwise, your model would try to predictcorrect characters for the padding.
For the error analysis, try to get global numbers, specially at the beginning if you have a lot of errors, to catch if you are making always the same mistake in all sequences.