import os
import codecs

import nltk
from nltk.tokenize import RegexpTokenizer
from nltk.stem.porter import PorterStemmer

import gensim
from gensim import corpora

import pyLDAvis
import pyLDAvis.gensim

setDocs1 = []
allDocuments = []
# read every document in set 1, keeping a copy in the combined list
for file_name in os.listdir("/home/vagrant/shared/Test/1"):
    with codecs.open("/home/vagrant/shared/Test/1/" + file_name, "r", "utf-8") as f:
        aux = f.read()
    setDocs1.append(aux)
    allDocuments.append(aux)
# same for set 2
setDocs2 = []
for file_name in os.listdir("/home/vagrant/shared/Test/2"):
    with codecs.open("/home/vagrant/shared/Test/2/" + file_name, "r", "utf-8") as f:
        aux = f.read()
    setDocs2.append(aux)
    allDocuments.append(aux)
texts1 = []
texts2 = []
all_texts = []
tokenizer = RegexpTokenizer(r'\w+')
stoplist_tw=['amp','get','got','hey','hmm','hoo','hop','iep','let','ooo','par',
'pdt','pln','pst','wha','yep','yer','aest','didn','nzdt','via',
'one','com','new','like','great','make','top','awesome','best',
'good','wow','yes','say','yay','would','thanks','thank','going',
'new','use','should','could','best','really','see','want','nice',
'while','know']
# collect 1- and 2-character tokens across all documents so they can be
# treated as stopwords (iterating the raw strings directly would yield
# single characters rather than words, so tokenize first)
unigrams = [w for doc in allDocuments
            for w in tokenizer.tokenize(doc.lower()) if len(w) == 1]
bigrams = [w for doc in allDocuments
           for w in tokenizer.tokenize(doc.lower()) if len(w) == 2]
en_stop = set(nltk.corpus.stopwords.words("english") + stoplist_tw
              + unigrams + bigrams)
p_stemmer = PorterStemmer()
# loop through document list
for doc in setDocs1:
    # clean and tokenize document string
    raw = doc.lower()
    tokens = tokenizer.tokenize(raw)
    # remove stop words from tokens
    stopped_tokens = [t for t in tokens if t not in en_stop]
    # stem tokens
    stemmed_tokens = [p_stemmer.stem(t) for t in stopped_tokens]
    # add tokens to list
    texts1.append(stemmed_tokens)
    all_texts.append(stemmed_tokens)
for doc in setDocs2:
    # clean and tokenize document string
    raw = doc.lower()
    tokens = tokenizer.tokenize(raw)
    # remove stop words from tokens
    stopped_tokens = [t for t in tokens if t not in en_stop]
    # stem tokens
    stemmed_tokens = [p_stemmer.stem(t) for t in stopped_tokens]
    # add tokens to list
    texts2.append(stemmed_tokens)
    all_texts.append(stemmed_tokens)
# turn our tokenized documents into a id <-> term dictionary
dictionary = corpora.Dictionary(all_texts)
# convert tokenized documents into a document-term matrix
corpus1 = [dictionary.doc2bow(text) for text in texts1]
corpus2 = [dictionary.doc2bow(text) for text in texts2]

# train an LDA model on the first corpus and print its topics
lda_model_1 = gensim.models.ldamodel.LdaModel(corpus1, num_topics=3, id2word=dictionary, passes=10, alpha=0.001)
for i in xrange(3):
    print i
    for tup in lda_model_1.get_topic_terms(i):
        print dictionary[tup[0]] + ' ' + str(tup[1])

data1 = pyLDAvis.gensim.prepare(lda_model_1, corpus1, dictionary)
pyLDAvis.display(data1)
IndexError                                Traceback (most recent call last)
<ipython-input-17-209fc1d6a743> in <module>()
----> 1 data1 = pyLDAvis.gensim.prepare(lda_model_1, corpus1, dictionary)
2 pyLDAvis.display(data1)
/usr/local/lib/python2.7/dist-packages/pyLDAvis/gensim.pyc in prepare(topic_model, corpus, dictionary, doc_topic_dist, **kwargs)
95 See `pyLDAvis.prepare` for **kwargs.
96 """
---> 97 opts = fp.merge(_extract_data(topic_model, corpus, dictionary, doc_topic_dist), kwargs)
98 return vis_prepare(**opts)
/usr/local/lib/python2.7/dist-packages/pyLDAvis/gensim.pyc in _extract_data(topic_model, corpus, dictionary, doc_topic_dists)
26 beta = 0.01
27 fnames_argsort = np.asarray(list(dictionary.token2id.values()), dtype=np.int_)
---> 28 term_freqs = corpus_csc.sum(axis=1).A.ravel()[fnames_argsort]
29 term_freqs[term_freqs == 0] = beta
30 doc_lengths = corpus_csc.sum(axis=0).A.ravel()
IndexError: index 1098 is out of bounds for axis 1 with size 707
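The shapes in that last frame suggest what is going on: fnames_argsort holds every id in the shared dictionary (at least up to 1098), while corpus_csc was built from corpus1 alone and only has 707 term rows, because corpus1 never uses the ids that occur exclusively in set 2. A quick check of that hypothesis (my own sketch, not part of the original code):

# ids present in the shared dictionary but never used by corpus1;
# if this set is non-empty, the matrix pyLDAvis builds is smaller
# than the id list it indexes with
used_ids = set(tid for doc in corpus1 for tid, _ in doc)
all_ids = set(dictionary.token2id.values())
print len(all_ids - used_ids), 'dictionary ids never appear in corpus1'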
lda_model_2 = gensim.models.ldamodel.LdaModel(corpus2, num_topics=3, id2word = dictionary, passes=10, alpha=0.001)
data2 = pyLDAvis.gensim.prepare(lda_model_2, corpus2, dictionary)
pyLDAvis.display(data2)
# this one renders fine; only the first document set fails
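If the unused ids are indeed the cause, one workaround (again my own sketch, under that assumption) is to give each document set its own dictionary, so every id in the dictionary is guaranteed to occur in the matching corpus:

# per-set dictionary: every id occurs in corpus1 by construction
dictionary1 = corpora.Dictionary(texts1)
corpus1 = [dictionary1.doc2bow(text) for text in texts1]
lda_model_1 = gensim.models.ldamodel.LdaModel(corpus1, num_topics=3,
                                              id2word=dictionary1,
                                              passes=10, alpha=0.001)
data1 = pyLDAvis.gensim.prepare(lda_model_1, corpus1, dictionary1)
pyLDAvis.display(data1)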
vis_data = gensimvis.prepare(lda, bow_corpus, dictionary)
pyLDAvis.save_html(vis_data, 'data/lda_75_lem_5_pass.html')
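In this last snippet gensimvis is presumably an alias (an assumption on my part, since the import is not shown), and lda/bow_corpus are that poster's own model and corpus:

# presumed import behind the snippet above -- not shown in the original
import pyLDAvis.gensim as gensimvis

save_html then writes the visualization to a standalone HTML file that can be opened outside the notebook.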