import sys
# In a notebook, clear argv so argparse does not choke on the kernel's arguments.
sys.argv = ['']
del sys

import re
import argparse

import numpy as np
import pandas as pd
from bs4 import BeautifulSoup
from sklearn.model_selection import StratifiedKFold

from keras import backend as K
from keras import losses, optimizers
from keras.layers import Dense, Dropout, Embedding, Input, Lambda, SimpleRNN
from keras.models import Model
from keras.preprocessing.sequence import pad_sequences
from keras.preprocessing.text import Tokenizer

SPLITTOR = '$$$'  # separator between the two halves of each review
def load_embedding(embedding_file='/content/drive/MyDrive/glove.6B.100d.txt'):
    '''
    loads pre-trained GloVe vectors (e.g. glove.6B.100d) into a dict of word -> vector
    '''
    embeddings_index = {}
    # the original hard-coded the path here, ignoring the embedding_file argument
    with open(embedding_file) as f:
        for line in f:
            values = line.split()
            word = values[0]
            coefs = np.asarray(values[1:], dtype='float32')
            embeddings_index[word] = coefs
    return embeddings_index
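# Each line of a GloVe file is the token followed by its vector components,
# whitespace-separated, e.g. for the 100-d file:
#   word v1 v2 ... v100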
def clean_str(string):
    '''
    removes backslashes, single quotes and double quotes, then strips whitespace
    '''
    string = re.sub(r"\\", "", string)
    string = re.sub(r"\'", "", string)
    string = re.sub(r"\"", "", string)
    return string.strip()
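# Example: clean_str('He said \\"don\'t\\" ') returns 'He said dont'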
def extract_text_label(data_panda):
    '''
    splits each review on SPLITTOR into a known/unknown text pair and collects labels and ids
    '''
    all_texts_kn, all_texts_unkn, labels_all, id_all = [], [], [], []
    for idx in range(data_panda.review.shape[0]):
        temp = data_panda.review[idx].split(SPLITTOR)
        if len(temp) != 2:
            raise ValueError('review %d does not contain exactly one %r separator' % (idx, SPLITTOR))
        # strip HTML, then drop non-ASCII characters (decode back to str for re.sub)
        text_kn = BeautifulSoup(temp[0], "html.parser").get_text().encode('ascii', 'ignore').decode('ascii')
        text_unkn = BeautifulSoup(temp[1], "html.parser").get_text().encode('ascii', 'ignore').decode('ascii')
        text_kn, text_unkn = clean_str(text_kn), clean_str(text_unkn)
        all_texts_kn.append(text_kn)
        all_texts_unkn.append(text_unkn)
        labels_all.append(data_panda.sentiment[idx])
        id_all.append(data_panda.id[idx])  # assumes an 'id' column; `id_doc` was undefined in the original
    return all_texts_kn, all_texts_unkn, labels_all, id_all
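# Each row of the input TSV is expected to look like (illustrative example):
#   id    sentiment    review
#   doc1  1            <p>known-author text</p>$$$<p>unknown-author text</p>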
def load_data(dataset_id='2015.cnn', isonefile=False, MAX_SEQUENCE_LENGTH=1000):
    '''
    reads the dataset, tokenizes both text sides and pads them to MAX_SEQUENCE_LENGTH
    '''
    if not isonefile:
        # separate train/test files (this branch was commented out in the original)
        data_train_ = pd.read_csv('train{}'.format(dataset_id), sep='\t')
        data_test_ = pd.read_csv('test{}'.format(dataset_id), sep='\t')
        print('size of original training and test files :')
        print(data_train_.shape, data_test_.shape)
        texts_kn, texts_unkn, labels_all, id_alls = extract_text_label(data_train_)
        t0, t1, l1, id1 = extract_text_label(data_test_)
        texts_kn = texts_kn + t0
        texts_unkn = texts_unkn + t1
        labels_all = labels_all + l1
        id_alls = id_alls + id1
    else:
        # a single file holding both train and test data
        data_test_train_ = pd.read_csv('{}'.format(dataset_id), sep='\t')
        print(data_test_train_.shape)
        texts_kn, texts_unkn, labels_all, id_alls = extract_text_label(data_test_train_)
    print('Total # of docs : %s' % len(texts_kn))
    assert len(texts_kn) == len(texts_unkn) == len(id_alls) == len(labels_all)
    all_text = texts_kn + texts_unkn
    l = [len(t.split()) for t in all_text]
    print('Mean of doc length: %.4f max of doc length: %.4f' % (np.mean(l), np.max(l)))
    tokenizer = Tokenizer(char_level=False)
    tokenizer.fit_on_texts(all_text)
    l_sequences = tokenizer.texts_to_sequences(texts_kn)
    r_sequences = tokenizer.texts_to_sequences(texts_unkn)
    word_index = tokenizer.word_index
    print('Found %s unique tokens in all text' % len(word_index))
    l_data = pad_sequences(l_sequences, maxlen=MAX_SEQUENCE_LENGTH)
    r_data = pad_sequences(r_sequences, maxlen=MAX_SEQUENCE_LENGTH)
    assert l_data.shape == r_data.shape
    print('Shape of left/right-side tensor:', l_data.shape)
    return [l_data, r_data], labels_all, tokenizer.word_index, id_alls
def euclid_dist(inputs):
    # Euclidean distance between the two representations
    output = K.sqrt(K.sum(K.square(inputs[0] - inputs[1]), axis=-1))
    output = K.expand_dims(output, 1)
    return output

def cosine_dist(inputs):
    # cosine similarity between the two representations
    x1 = K.l2_normalize(inputs[0], axis=-1)
    x2 = K.l2_normalize(inputs[1], axis=-1)
    output = K.sum(x1 * x2, axis=1, keepdims=True)
    return output

def dotm(inputs, axis=1):
    # dot product along the given axis
    return K.sum(inputs[0] * inputs[1], axis=axis, keepdims=True)

def mean_of_l1(inputs):
    # mean absolute (L1) difference
    return K.mean(K.abs(inputs[0] - inputs[1]), axis=1, keepdims=True)
def sigmoid_kernel(inputs, lamda=None, c=1):
    # sigmoid kernel tanh(lamda * <x1, x2> + c); by default lamda is the feature dimension
    if lamda is None:
        lamda = int(inputs[0].shape[1])
    output = K.tanh(lamda * dotm(inputs, axis=-1) + c)
    return output
def chi_squared(inputs, lamda=1):
    # additive chi-squared kernel, exp(-lamda * sum((x1-x2)^2 / (x1+x2))); the original
    # divided by (x1 - x2), which cancels to (x1 - x2), and used a positive exponent
    output = K.exp(-lamda * K.sum(K.square(inputs[0] - inputs[1]) / (inputs[0] + inputs[1] + K.epsilon()),
                                  axis=-1, keepdims=True))
    return output
def rbf_kernel(inputs, gamma=1):
    # RBF kernel exp(-gamma * ||x1 - x2||^2)
    output = K.sum(K.square(inputs[0] - inputs[1]), axis=-1, keepdims=True)
    output = K.exp(-gamma * output)
    return output

def all_distances_moremetrics(inputs):
    # concatenates six similarity/distance metrics between the two representations
    euc = euclid_dist(inputs)
    cos = cosine_dist(inputs)
    rbf = rbf_kernel(inputs)
    #chi = chi_squared(inputs)
    sig = sigmoid_kernel(inputs)
    dt = dotm(inputs)
    mean = mean_of_l1(inputs)
    return K.concatenate([euc, cos, rbf, sig, dt, mean], -1)

def out_shape_moremetrics(shape):
    # one output column per metric above
    return (None, 6)
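# Hedged sanity check (not part of the original script): on toy 2-D inputs the
# fused metric vector should have shape (batch, 6). Uncomment to try:
# a = K.constant(np.array([[1., 2.], [3., 4.]], dtype='float32'))
# b = K.constant(np.array([[1., 0.], [3., 4.]], dtype='float32'))
# print(K.eval(all_distances_moremetrics([a, b])).shape)  # -> (2, 6)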
def model(drop=0.3, hidden_units=64, word_index=None, embedding_index=None, EMBEDDING_DIM=8, MAX_SEQUENCE_LENGTH=1000):
    '''
    specifies the NN architecture
    :param drop: dropout rate
    :param hidden_units: size of the recurrent hidden layer
    :param word_index: token -> index mapping from the tokenizer
    :param embedding_index: word -> vector mapping from load_embedding
    :param EMBEDDING_DIM: dimensionality of the word embeddings
    :param MAX_SEQUENCE_LENGTH: padded document length
    :return: complete model and intermediate model
    '''
    # words not found in the embedding index keep their random initialization
    embedding_matrix = np.random.random((len(word_index) + 1, EMBEDDING_DIM))
    for word, i in word_index.items():
        embedding_vector = embedding_index.get(word)
        if embedding_vector is not None:
            embedding_matrix[i] = embedding_vector
    # left branch
    l_embedding_layer = Embedding(len(word_index) + 1,
                                  EMBEDDING_DIM,
                                  input_length=MAX_SEQUENCE_LENGTH,
                                  weights=[embedding_matrix],
                                  trainable=True)
    l_sequence_input = Input(shape=(MAX_SEQUENCE_LENGTH,), dtype='int32')  # int16 would overflow on large vocabularies
    l_embedded_sequences = l_embedding_layer(l_sequence_input)
    l_bi = SimpleRNN(hidden_units)(l_embedded_sequences)
    l_drop = Dropout(drop)(l_bi)
    # right branch
    r_embedding_layer = Embedding(len(word_index) + 1,
                                  EMBEDDING_DIM,
                                  input_length=MAX_SEQUENCE_LENGTH,
                                  weights=[embedding_matrix],
                                  trainable=True)
    r_sequence_input = Input(shape=(MAX_SEQUENCE_LENGTH,), dtype='int32')
    r_embedded_sequences = r_embedding_layer(r_sequence_input)
    r_bi = SimpleRNN(hidden_units)(r_embedded_sequences)
    r_drop = Dropout(drop)(r_bi)
    # fusion layer: the Keras 1 Merge layer used in the original is gone in Keras 2,
    # so a Lambda layer computes the same six-metric fusion over the two branches
    merged = Lambda(all_distances_moremetrics, output_shape=out_shape_moremetrics, name='fusion')([l_drop, r_drop])
    preds = Dense(1, activation='sigmoid')(merged)
    final_model = Model([l_sequence_input, r_sequence_input], preds)
    print(final_model.inputs)
    final_model.compile(loss=losses.binary_crossentropy,
                        optimizer=optimizers.RMSprop(),
                        metrics=['acc'])
    intermediate_layer_model = Model(inputs=final_model.input,
                                     outputs=final_model.get_layer('fusion').output)
    return final_model, intermediate_layer_model
def train_and_evaluate_model(model_, data_train, y_train, data_test, y_test, epochs, inter_model, batch, verbose=1):
    '''
    trains and evaluates the model
    :param model_: NN model
    :param data_train: training data
    :param y_train: labels of training set
    :param data_test: test data
    :param y_test: labels of test set
    :param epochs: number of training epochs
    :param inter_model: intermediate model to get the output of the fusion layer for plotting
    :param batch: batch size
    :param verbose: Keras verbose parameter of the fit function
    :return: train [loss, acc], test [loss, acc], and the fusion-layer outputs on train/test
    '''
    [l_x_train, r_x_train] = data_train
    hist = model_.fit([l_x_train, r_x_train], y_train,
                      epochs=epochs, batch_size=batch, verbose=verbose)
    [l_x_test, r_x_test] = data_test
    loss_train, acc_train = model_.evaluate([l_x_train, r_x_train], y_train, batch_size=batch)
    loss_test, acc_test = model_.evaluate([l_x_test, r_x_test], y_test, batch_size=batch)
    intermediate_output_train = inter_model.predict([l_x_train, r_x_train])
    intermediate_output_test = inter_model.predict([l_x_test, r_x_test])
    print('train error : {0} train accuracy: {1} '.format(loss_train, acc_train))
    print('test error : {0} test accuracy: {1} '.format(loss_test, acc_test))
    return [loss_train, acc_train], [loss_test, acc_test], [intermediate_output_train, intermediate_output_test]
def get_args():
    '''
    gets arguments from the command line
    :return: a dict of all arguments
    '''
    parser = argparse.ArgumentParser()
    parser.add_argument('-ds', action='store', default='2015.cnn', help='dataset name')
    parser.add_argument('-bs', action='store', default=1, help='batch size', type=int)
    parser.add_argument('-do', action='store', default=0.2, help='dropout', type=float)
    parser.add_argument('-hs', action='store', default=8, help='hidden layer size', type=int)
    parser.add_argument('-nf', action='store', default=5, help='# of folds in CV', type=int)
    parser.add_argument('-ep', action='store', default=50, help='# of epochs', type=int)
    parser.add_argument('-ms', action='store', default=1000, help='maximum sequence length', type=int)
    parser.add_argument('-vb', action='store', default=1, help='verbose: 0, 1 or 2', type=int)
    results = parser.parse_args()
    print(results)
    return vars(results)
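# Example invocation (hypothetical script name):
#   python siamese_cv.py -ds 2015.cnn -bs 32 -do 0.3 -hs 64 -nf 5 -ep 10 -ms 500 -vb 1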
def CV(args):
    '''
    k-fold Cross-Validation
    :param args: model arguments
    '''
    # loading model parameters
    MAX_SEQUENCE_LENGTH = args['ms']
    embeddings_index = load_embedding()  # uses the default GloVe path
    EMBEDDING_DIM = 100
    drops = args['do']
    batch = args['bs']
    hidden = args['hs']
    n_folds = args['nf']
    epochNo = args['ep']
    ds_id = args['ds']
    verbose = args['vb']
    # loading data (single-file mode, matching the original's behavior)
    data, labels, word_idx, id_all = load_data(dataset_id=ds_id, isonefile=True,
                                               MAX_SEQUENCE_LENGTH=MAX_SEQUENCE_LENGTH)
    skf = StratifiedKFold(n_splits=n_folds, shuffle=True, random_state=None)
    # truncating output files for fold bins and fusion-layer outputs
    i = 0
    open(ds_id + '.interbins', 'w').close()
    open(ds_id + '.interout_train', 'wt').close()
    open(ds_id + '.interout_test', 'wt').close()
    id_all = np.array(id_all)
    # Cross-Validation
    avg_acc_train, avg_acc_test, avg_error_train, avg_error_test = 0, 0, 0, 0
    for train_index, test_index in skf.split(np.zeros(len(labels)), labels):
        with open(ds_id + '.interbins', 'a') as f_bin:
            np.savetxt(f_bin, [id_all[train_index]], fmt='%s')
            np.savetxt(f_bin, [id_all[test_index]], fmt='%s')
        print("size of train index ", len(train_index))
        print("size of test index ", len(test_index))
        print("Running Fold %d/%d " % (i + 1, n_folds))
        my_model = None  # clearing the NN
        my_model, inter_model = model(drop=drops, hidden_units=hidden, word_index=word_idx,
                                      embedding_index=embeddings_index, EMBEDDING_DIM=EMBEDDING_DIM,
                                      MAX_SEQUENCE_LENGTH=MAX_SEQUENCE_LENGTH)
        [data_l, data_r] = data
        my_data_train = [data_l[train_index], data_r[train_index]]
        my_data_test = [data_l[test_index], data_r[test_index]]
        labels = np.asarray(labels)
        [loss_train, acc_train], [loss_test, acc_test], [inter_out_train, inter_out_test] = \
            train_and_evaluate_model(my_model, my_data_train, labels[train_index], my_data_test,
                                     labels[test_index], epochNo, inter_model, batch, verbose)
        # saving the fusion-layer outputs; the original created these files but
        # never wrote to them, so this is the assumed intent
        with open(ds_id + '.interout_train', 'a') as f_train:
            np.savetxt(f_train, inter_out_train)
        with open(ds_id + '.interout_test', 'a') as f_test:
            np.savetxt(f_test, inter_out_test)
        # accumulating fold metrics (the original initialized these but never updated them)
        avg_error_train += loss_train
        avg_acc_train += acc_train
        avg_error_test += loss_test
        avg_acc_test += acc_test
        i += 1  # the original never incremented the fold counter
    print('avg train error: %.4f avg train acc: %.4f' % (avg_error_train / n_folds, avg_acc_train / n_folds))
    print('avg test error: %.4f avg test acc: %.4f' % (avg_error_test / n_folds, avg_acc_test / n_folds))
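# Hedged entry point (the original never invoked CV; this is the assumed usage):
if __name__ == '__main__':
    CV(get_args())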