from six import iteritems, itervalues, string_types
from numpy import exp, log, dot, zeros, outer, random, dtype, float32 as REAL,\
ndarray, array
from gensim import matutils
def most_similar_iguk(self, positive=None, negative=None, topn=10, restrict_vocab=None, indexer=None):
    """Find the top-N most similar words, ignoring unknown input words.

    Adaptation of ``most_similar`` in which input words that are missing from
    ``self.vocab`` are skipped (and printed) rather than used in the query.

    Parameters
    ----------
    positive : str, list or None
        Words (or vectors) that contribute with weight ``1.0``. Each item may
        be a word, an ``ndarray`` vector, or an explicit ``(item, weight)``
        pair. A single bare string is treated as a one-element list.
    negative : str, list or None
        Same as ``positive`` but with a default weight of ``-1.0``.
    topn : int or None
        Number of results to return. If falsy (``None``, ``0``, ``False``)
        the full array of similarities is returned instead.
    restrict_vocab : int or None
        If given, only the first ``restrict_vocab`` rows of
        ``self.vectors_norm`` are searched.
    indexer : object or None
        If given, the search is delegated to
        ``indexer.most_similar(mean, topn)``.

    Returns
    -------
    list of (word, float), or ndarray, or whatever ``indexer`` returns
        ``(word, similarity)`` pairs sorted by decreasing similarity, with
        the in-vocabulary input words removed.

    Raises
    ------
    ValueError
        If no usable input remains (nothing was given, or every given word is
        missing from the vocabulary).
    """
    self.init_sims()

    # Normalise each argument independently: ``None`` means "no words" and a
    # bare string is shorthand for a one-element list. Without this, a bare
    # string would be iterated character by character.
    if positive is None:
        positive = []
    elif isinstance(positive, string_types):
        positive = [positive]
    if negative is None:
        negative = []
    elif isinstance(negative, string_types):
        negative = [negative]

    # Attach the default weight to every item that does not carry one already.
    unweighted_types = string_types + (ndarray,)
    weighted = [
        (item, 1.0) if isinstance(item, unweighted_types) else item
        for item in positive
    ]
    weighted += [
        (item, -1.0) if isinstance(item, unweighted_types) else item
        for item in negative
    ]

    # Collect the weighted vectors of all usable inputs.
    all_words, mean, ignr_words = set(), [], []
    for word, weight in weighted:
        if isinstance(word, ndarray):
            # Raw vectors are used as given and have no vocabulary index.
            mean.append(weight * word)
        elif word in self.vocab:
            index = self.vocab[word].index
            mean.append(weight * self.vectors_norm[index])
            all_words.add(index)
        else:
            # Unknown word: remember it for the report below and move on.
            ignr_words.append(word)

    print("Palavras não encontradas no vocabulário:")
    print(ignr_words)

    if not mean:
        raise ValueError("cannot compute similarity with no input")
    mean = matutils.unitvec(array(mean).mean(axis=0)).astype(REAL)

    if indexer is not None:
        return indexer.most_similar(mean, topn)

    limited = self.vectors_norm if restrict_vocab is None else self.vectors_norm[:restrict_vocab]
    dists = dot(limited, mean)
    if not topn:
        return dists

    # Ask for extra candidates so that dropping the input words below still
    # leaves up to ``topn`` results.
    best = matutils.argsort(dists, topn=topn + len(all_words), reverse=True)
    # ignore (don't return) words from the input
    result = [(self.index2word[sim], float(dists[sim])) for sim in best if sim not in all_words]
    return result[:topn]