# Train a 50-topic LDA model on the positive movie-review corpus.
# NOTE(review): LDA is a generative model over raw term *counts*; feeding it a
# tf-idf-weighted (real-valued) corpus is statistically questionable and is the
# likely reason the topics below all have flat, near-uniform ~0.001 weights.
# Consider training on `corpus` directly and reserving tf-idf for LSI.
corpus = MyCorpus('/home/test/nltk_data/corpora/movie_reviews/pos')
tfidf = gensim.models.TfidfModel(corpus)  # fit IDF weights on the corpus
corpus_tfidf = tfidf[corpus]  # lazily tf-idf-transform each bag-of-words vector
# `alpha=None` dropped: gensim's LdaModel already defaults to a symmetric
# document-topic prior, so passing None only obscured the default.
model = gensim.models.ldamodel.LdaModel(corpus=corpus_tfidf, id2word=corpus.dictionary, num_topics=50)
In [171]: model.show_topics(10)
Out[171]:
['0.001*leila + 0.001*matilda + 0.001*truman + 0.001*virgil + 0.001*leigh + 0.001*beth + 0.001*mighty + 0.001*stewart + 0.001*bergman + 0.001*spacey',
'0.001*truman + 0.001*pauline + 0.001*trekkies + 0.001*cole + 0.001*baby + 0.001*scream + 0.001*mel + 0.001*indian + 0.001*valek + 0.000*reeves',
'0.001*bulworth + 0.001*memphis + 0.001*scream + 0.001*girls + 0.000*dance + 0.000*jamaican + 0.000*nikki + 0.000*thai + 0.000*patti + 0.000*simon',
'0.001*flynt + 0.001*gibson + 0.001*toy + 0.001*mel + 0.001*linklater + 0.001*mullen + 0.001*gladiator + 0.001*tarantino + 0.001*ordell + 0.001*jackie',
'0.001*hrundi + 0.001*flynt + 0.001*dolores + 0.001*bava + 0.001*stempel + 0.001*aliens + 0.001*barlow + 0.001*scream + 0.001*cinque + 0.001*claiborne',
'0.001*beaumarchais + 0.001*grodin + 0.001*hunting + 0.001*damon + 0.001*endings + 0.001*furtwangler + 0.001*vail + 0.001*dvd + 0.001*chan + 0.001*damme',
'0.001*lambeau + 0.001*cauldron + 0.001*taran + 0.001*sean + 0.001*quaid + 0.001*brown + 0.001*maximus + 0.001*ryan + 0.000*rosalba + 0.000*tarantino',
'0.001*quilt + 0.001*faculty + 0.001*nixon + 0.001*li + 0.001*frankenstein + 0.001*vivian + 0.001*hogarth + 0.001*sonny + 0.001*giant + 0.001*pam',
'0.001*lola + 0.001*maggie + 0.001*frequency + 0.001*smith + 0.001*bacon + 0.001*ronna + 0.001*dragon + 0.001*skinheads + 0.001*lucas + 0.001*derek',
'0.001*gallo + 0.001*hauer + 0.001*chucky + 0.001*cynthia + 0.001*fingernail + 0.001*titanic + 0.001*doll + 0.001*chan + 0.001*hortense + 0.001*turner']
In [172]: model.show_topic(0,10)
Out[172]:
[(0.00080199922395261867, 'truman'),
(0.00077141573371340949, 'tarzan'),
(0.00066999736068072029, 'lola'),
(0.00066889472058921271, 'chocolat'),
(0.0006608367820080598, 'gugino'),
(0.00064430353454066709, 'kissed'),
(0.00063166619926945361, 'roger'),
(0.00055236832395473968, 'patlabor'),
(0.00054165617319329332, 'kurosawa'),
(0.00053950152038525709, 'spoon')]
# --- HDP baseline on a previously saved unsupervised corpus ---
corpus_in_mm = corpora.MmCorpus('corpus_unsup.mm')
dictionary = corpora.Dictionary.load('dict_unsup_50k.dict')
# NOTE(review): `hdp` is built but never used below — drop it or query it.
hdp = gensim.models.hdpmodel.HdpModel(corpus_in_mm, dictionary)

# --- LDA similarity query over the preprocessed negative reviews ---
corpus = MyCorpus('/home/test/nltk_data/corpora/movie_reviews/neg_pre')  # preprocessed corpus
dictionary = corpus.dictionary  # keep a handle on the dictionary (shadows the HDP one above)
# MatrixSimilarity needs len(corpus), so round-trip through Matrix Market format.
corpora.MmCorpus.serialize('testing_corpus.mm', corpus)
corpus = corpora.MmCorpus('testing_corpus.mm')  # read back the saved corpus
# Alternative: model = models.LsiModel(corpus_tfidf, id2word=dictionary, num_topics=200)
# `alpha=None` dropped: gensim's LdaModel already defaults to a symmetric prior.
model = gensim.models.ldamodel.LdaModel(corpus=corpus, id2word=dictionary, num_topics=200, update_every=1, chunksize=10000, passes=15)

doc = read_texts()  # read one sample review
bow = dictionary.doc2bow(utils.simple_preprocess(doc))
vec_lda = model[bow]  # topic distribution of the query document

# Index every document in LDA topic space; for tf-idf vectors one would use
# similarities.SparseMatrixSimilarity(tfidf[corpus], num_features=...) instead.
index = similarities.MatrixSimilarity(model[corpus])
sims = index[vec_lda]  # cosine similarity of the query against every document
# Best match first: `reverse=True` is clearer than negating the key.
sims = sorted(enumerate(sims), key=lambda item: item[1], reverse=True)
print(sims[:10])
=====================
[(84, 0.58722341), (156, 0.58722341), (257, 0.58722341), (622, 0.58722341), (705, 0.58722341), (474, 0.39097035), (855, 0.39097035), (978, 0.34163111), (623, 0.3177681), (629, 0.3177681)]
=============================
In order to find a better similarity measure, I'm now thinking of computing the KL divergence. For KL divergence, I would need the topic distribution of each document. Any hints on how to get that from the model?
Also, in Radim's nearest-neighbour benchmark, LSI is used. Is that because LSI is faster (on a corpus as big as Wikipedia), or because LSI is preferred over LDA for finding similarity? After reading so many older posts on LSI vs. LDA, I'm still confused: when is LDA a good choice for finding similarities? At least for human interpretation, the topics generated by LDA are quite pleasant, but I don't know whether that matters when measuring similarity among documents/movies.