Thanks, Miel.shayne, for your suggestions on the inference stage!
I believed these steps would let me predict a new sentence's vector; however, I got a nonsensical result, which does not convince me that this approach is OK for generating a new sentence's vector.
If you have any time, would you please help clear up my confusion?
The predict.py code is as follows:
def MyLabeledLineSentence(train_sen_len, model, test_file):
    """Add one new label per line of ``test_file`` to ``model`` and return the sentences.

    For every line of ``test_file`` this function:

    * builds the label ``'SENT_<n>'`` with ``n = train_sen_len + line offset``,
    * stores a fresh ``Vocab`` entry for that label in ``model.vocab``,
    * appends one row to ``model.syn0`` and the label to ``model.index2word``,
    * overwrites row ``n`` of ``model.syn0`` with seeded random values,
    * collects a ``LabeledSentence`` made of the line's whitespace-separated
      tokens and that label.

    Args:
        train_sen_len: Index assigned to the first new label; later lines get
            consecutive indices. NOTE(review): the caller passes
            ``len(model.vocab)``; this only lines up with the appended
            ``syn0`` / ``index2word`` rows if all three have the same length
            on entry -- confirm.
        model: Loaded Doc2Vec model; it is mutated in place.
        test_file: Path of a text file with one sentence per line.

    Returns:
        List of ``LabeledSentence`` objects, one per input line.

    On any ordinary error the traceback is printed and the process exits with
    status -1 (unchanged from the original behaviour).
    """
    sens = []
    try:
        # ``with`` guarantees the file is closed, also when a line fails.
        with open(test_file, 'r') as handle:
            for inner_id, line in enumerate(handle):
                line = line.strip()
                item_no = train_sen_len + inner_id
                label = 'SENT_' + str(item_no)

                newvocab = gensim.models.doc2vec.Vocab()
                newvocab.index = item_no
                newvocab.sample_probability = 1.0
                # Placeholder code of floor(log2(item_no)) + 1 ones.
                # math.log raises ValueError for item_no == 0; that ends up
                # in the handler below.
                newvocab.code = [1] * int(math.log(item_no, 2) + 1)
                model.vocab[label] = newvocab

                # Grow the weight matrix by one row (a copy of row 0) and
                # register the label; the row is overwritten just below.
                model.syn0 = numpy.vstack((model.syn0, model.syn0[0]))
                model.index2word.append(label)

                # Seed from the entry stored at position item_no so the
                # random row is reproducible for a given label and model seed.
                # NOTE(review): ``random`` must be numpy.random here (stdlib
                # ``random`` has no ``rand``) -- confirm against the imports.
                random.seed(uint32(model.hashfxn(model.index2word[item_no] + str(model.seed))))
                model.syn0[item_no] = (random.rand(model.layer1_size) - 0.5) / model.layer1_size

                sens.append(gensim.models.doc2vec.LabeledSentence(
                    utils.to_unicode(line).split(), [label]))
    except Exception:
        # ``except Exception`` instead of a bare ``except`` so that Ctrl-C
        # (KeyboardInterrupt) and SystemExit are not reported as load errors.
        print("test_file: %s  Load error!" % test_file)
        print(traceback.format_exc())
        sys.exit(-1)
    return sens
# Driver: load a trained Doc2Vec model, register the test sentences as new
# labels, train with only the label flag enabled, then write each test
# sentence's most similar entries to ``out_file``.
model = gensim.models.doc2vec.Doc2Vec.load(model_file)

# The first new label gets the index equal to the current vocabulary size.
train_sen_len = len(model.vocab)
sentences = MyLabeledLineSentence(train_sen_len, model, test_file)

# Enable training for labels and disable it for words before the extra pass.
model.train_labels = True
model.train_words = False
model.train(sentences)

# ``with`` closes (and therefore flushes) the report even if a lookup fails
# part-way through; the original never closed the handle.
with open(out_file, 'w') as wfile:
    for sen in sentences:
        label = sen.labels[0]
        similar_array = model.most_similar(label)
        wfile.write("Input test sentence:%s\n" % (' '.join(sen.words).encode('utf-8')))
        # Each entry of similar_array is indexed as (name, score).
        for sim in similar_array:
            wfile.write("\t\t%20s\t%.6f\n" % (sim[0].encode('utf-8'), sim[1]))
        wfile.write("\n")
在 2014年12月15日星期一 UTC+8下午10:21:53,miel.shayne写道: