# A model loaded with load_word2vec_format() carries only the word vectors,
# not the training state (no corpus_count, no output weights), so calling
# .train() on it raises AttributeError (see traceback below).  To continue
# training on top of the GoogleNews vectors: build a fresh Word2Vec, build
# its vocabulary from our corpus, seed the overlapping words with the
# pretrained vectors, and only then train.
wv = Word2Vec(size=300, min_count=1)
wv.build_vocab(doc_list)
# Copy pretrained vectors for words that also occur in doc_list.
wv.intersect_word2vec_format("GoogleNews-vectors-negative300.bin.gz", binary=True)
wv.train(doc_list, total_examples=wv.corpus_count, epochs=wv.iter)
# Traceback (most recent call last):
#   File "word2vec_trained.py", line 66, in <module>
#     wv.train(doc_list)
#   File "C:\Users\bgsingh\Anaconda2\lib\site-packages\gensim\models\word2vec.py", line 695, in train
#     if self.corpus_count:
# AttributeError: 'Word2Vec' object has no attribute 'corpus_count'

def w2v_tokenize_text(text):
    """Tokenise *text* into a flat list of words.

    Sentences are split with NLTK, then each sentence is word-tokenised;
    tokens shorter than 2 characters are dropped.
    """
    tokens = []
    for sent in nltk.sent_tokenize(text, language='english'):
        for word in nltk.word_tokenize(sent, language='english'):
            if len(word) < 2:
                continue
            tokens.append(word)
    return tokens
# Build the training corpus: one token-list per line of the FAQ file.
doc_list = []
#with open('new_faq.txt') as alldata:
with smart_open.smart_open('new_faq.txt', encoding="utf-8") as alldata:
    for line_no, line in enumerate(alldata):
        # smart_open already decodes when encoding= is supplied, so an
        # unconditional line.decode('utf-8') is redundant (and fails on
        # str under Python 3).  Only decode if we still got raw bytes.
        if isinstance(line, bytes):
            line = line.decode('utf-8')
        token = w2v_tokenize_text(line)
        doc_list.append(token)
# Load the pre-trained GoogleNews vectors for inference only (this model
# cannot be further trained -- it has no training state).
wv = Word2Vec.load_word2vec_format("GoogleNews-vectors-negative300.bin.gz", binary=True)
# Pre-compute the L2-normalised vectors (syn0norm) that word_averaging()
# reads; replace=True frees the raw vectors to save memory.
wv.init_sims(replace=True)
def word_averaging(wv, words):
    """Return the unit-normalised mean embedding of *words*.

    Entries that are already numpy arrays are used as-is; string tokens
    are looked up in ``wv.vocab`` (out-of-vocabulary tokens are skipped).
    Returns a zero vector (and logs a warning) when nothing could be
    embedded.
    """
    all_words, mean = set(), []
    for word in words:
        if isinstance(word, np.ndarray):
            mean.append(word)
        elif word in wv.vocab:
            # syn0norm holds the L2-normalised vectors; requires that
            # wv.init_sims() has been called beforehand.
            mean.append(wv.syn0norm[wv.vocab[word].index])
            all_words.add(wv.vocab[word].index)
    if not mean:
        logging.warning("cannot compute similarity with no input %s", words)
        # FIXME: remove these examples in pre-processing
        # NOTE(review): some gensim versions expose this as vector_size /
        # layer1_size instead of layer_size -- confirm against the
        # installed version.
        return np.zeros(wv.layer_size,)
    # (debug prints removed: `print "biswa"` / `print mean`)
    mean = gensim.matutils.unitvec(np.array(mean).mean(axis=0)).astype(np.float32)
    return mean
def word_averaging_list(wv, text_list):
    """Stack one averaged word-vector row per document in *text_list*."""
    rows = [word_averaging(wv, doc) for doc in text_list]
    return np.vstack(rows)
# (L44 in the paste fused this import onto a duplicated assignment,
# producing a syntax error; separated and de-duplicated here.)
from scipy.spatial.distance import cosine

X_train_word_average = word_averaging_list(wv, doc_list)
# Cosine *similarity* = 1 - cosine *distance*.
similarity = 1 - cosine(X_train_word_average[0], X_train_word_average[1])
print(similarity)
# (editor placeholder removed: "Enter code here...")