def get_doc_topic(corpus, model):
    """Return a documents x topics pandas DataFrame for *corpus* under *model*.

    Each row is one document and each column a topic id.  Topics that
    gensim omits for a document (probability below the model's
    ``minimum_probability`` cutoff) come back as NaN from the DataFrame
    constructor and are filled with 0.

    Parameters
    ----------
    corpus : iterable of bag-of-words documents (e.g. an MmCorpus)
    model : trained gensim LDA model.  NOTE(review): the model and corpus
        must share a vocabulary — applying a model to a corpus with more
        features than the model was trained on raises IndexError inside
        ``LdaModel.inference`` (see the traceback further down this file).

    Returns
    -------
    pandas.DataFrame with one row per document, NaNs replaced by 0.
    """
    # Comprehension instead of the original append loop (same behavior);
    # tqdm wraps the iterable purely for a console progress bar.
    doc_topic = [dict(model.get_document_topics(doc)) for doc in tqdm(corpus)]
    doc_topic = pd.DataFrame(doc_topic)
    doc_topic.fillna(value=0, inplace=True)
    return doc_topic
def train_lda_model(corpus, ntops):
    """Train an ``LdaMulticore`` model on *corpus* with *ntops* topics.

    BUG FIX: ``num_topics`` was hard-coded to 50 in both branches, so the
    ``ntops`` argument was silently ignored; it is now used.

    Small corpora (<= 10k documents) get ``passes=10`` so the model sees
    the data more often; large corpora use gensim's default single pass.

    Parameters
    ----------
    corpus : gensim corpus with a ``.dictionary`` attribute and ``len()``
    ntops : int, number of topics to fit

    Returns
    -------
    gensim.models.ldamulticore.LdaMulticore
    """
    no_docs = len(corpus)
    # Hyper-parameters shared by both branches (removes the duplication
    # of the original two near-identical constructor calls).
    params = dict(
        corpus=corpus,
        num_topics=ntops,  # was hard-coded to 50
        id2word=corpus.dictionary,
        workers=3,
        iterations=800,
        chunksize=1000,
    )
    if no_docs > 10000:
        print("Training Model > 10k docs")
    else:
        print("Training Model less than 10k docs")
        params["passes"] = 10  # small corpus: compensate with extra passes
    return gensim.models.ldamulticore.LdaMulticore(**params)
#!/usr/bin/env python2
# -*- coding: utf-8 -*-
"""Created on Thu Nov 24 13:04:30 2016

@author: matthias
"""
import gensim
import os
import pandas as pd
# import glob
from tqdm import tqdm
import logging
# Configure root logging once at import time: DEBUG level, plus a console
# handler that prefixes every record with timestamp, thread name and level.
logging.basicConfig(level=logging.DEBUG)
logFormatter = logging.Formatter(
    "%(asctime)s [%(threadName)-12.12s] [%(levelname)-5.5s] %(message)s")
rootLogger = logging.getLogger()

# Console handler so progress is visible while the script runs; per-model
# file handlers are attached later in get_doc_topic_one_model_dir().
consoleHandler = logging.StreamHandler()
consoleHandler.setFormatter(logFormatter)
rootLogger.addHandler(consoleHandler)
def listdir_fullpath(d):
    """Return the entries of directory *d* as full paths, in listdir order."""
    join = os.path.join  # hoist the attribute lookup out of the loop
    return [join(d, entry) for entry in os.listdir(d)]
# -----------------------------------
# This should be the relevant part.
# -----------------------------------
def get_doc_topic(corpus, model):
    """Build a documents x topics DataFrame for *corpus* under *model*.

    NOTE(review): this re-defines get_doc_topic from earlier in the file
    (later definition wins at import time); behavior is the same apart
    from the extra log messages.
    """
    logging.info("Getting topic to document matrix")
    logging.info("Applying model to documents...")
    per_doc = []
    for doc in tqdm(corpus):
        # Sometimes gets stuck on a single doc in this loop
        per_doc.append(model.get_document_topics(doc))
    frame = pd.DataFrame([dict(pairs) for pairs in per_doc])
    frame.fillna(value=0, inplace=True)
    return frame
# -----------------------------------
# End of relevant part
# -----------------------------------
def get_doc_topic_one_model_dir(comp_model_dir):
    """Process one model directory: for both the 'Event' and 'Pre'
    corpus/model pairs, compute the doc-topic matrix and save it as xlsx.

    Side effects: attaches a per-directory FileHandler to the root logger
    (never removed, so handlers accumulate across calls) and writes
    Doc_Top_Event.xlsx / Doc_Top_Pre.xlsx into *comp_model_dir*.
    """
    # Log everything for this directory into its own file as well.
    fileHandler = logging.FileHandler(
        "{0}/{1}.log".format(comp_model_dir, 'Topic_Doc_All.log'))
    fileHandler.setFormatter(logFormatter)
    rootLogger.addHandler(fileHandler)
    logging.info("Running on: \n {}".format(comp_model_dir))

    # --- Event pair ---
    logging.info("Reading Corpus Event")
    event_corpus = gensim.corpora.MmCorpus(comp_model_dir + "/Corpus_Event.mm")
    logging.info("Reading Model Event")
    event_model = gensim.models.LdaMulticore.load(
        comp_model_dir + "/Model_Event.gensim")
    doc_topic_event = get_doc_topic(event_corpus, event_model)
    doc_topic_event.to_excel(comp_model_dir + "/Doc_Top_Event.xlsx", index=False)

    # --- Pre pair ---
    logging.info("Reading Corpus Pre")
    pre_corpus = gensim.corpora.MmCorpus(comp_model_dir + "/Corpus_Pre.mm")
    logging.info("Reading Model Pre")
    pre_model = gensim.models.LdaMulticore.load(
        comp_model_dir + "/Model_Pre.gensim")
    doc_topic_pre = get_doc_topic(pre_corpus, pre_model)
    doc_topic_pre.to_excel(comp_model_dir + "/Doc_Top_Pre.xlsx", index=False)
if __name__ == "__main__":
    # Process every subdirectory of the models root; non-directories
    # (stray files) are filtered out before the loop.
    models_main = "/path/to/my/project/Models"
    candidates = listdir_fullpath(models_main)
    for m_dir in [p for p in candidates if os.path.isdir(p)]:
        get_doc_topic_one_model_dir(m_dir)
2016-11-28 22:40:31,150 : INFO : loaded corpus index from data/wiki_corpus.mm.index
2016-11-28 22:40:31,150 : INFO : initializing corpus reader from data/wiki_corpus.mm
2016-11-28 22:40:31,151 : INFO : accepted corpus with 1319343 documents, 170000 features, 435221157 non-zero entries
2016-11-28 22:40:31,151 : INFO : loading LdaModel object from data/tweets_25_lem_5_pass.model
2016-11-28 22:40:31,572 : INFO : loading id2word recursively from data/tweets_25_lem_5_pass.model.id2word.* with mmap=None
2016-11-28 22:40:31,572 : INFO : setting ignored attribute state to None
2016-11-28 22:40:31,572 : INFO : setting ignored attribute dispatcher to None
2016-11-28 22:40:31,573 : INFO : loading LdaModel object from data/tweets_100_lem_5_pass.model.state
0it [00:00, ?it/s]
Traceback (most recent call last):
  File "test.py", line 28, in <module>
    topics = [item for item in tqdm(get_doc_topics())]
  File "/usr/local/lib/python2.7/dist-packages/tqdm/_tqdm.py", line 816, in __iter__
    for obj in iterable:
  File "test.py", line 14, in get_doc_topics
    yield dict(lda.get_document_topics(doc))
  File "/usr/local/lib/python2.7/dist-packages/gensim/models/ldamodel.py", line 910, in get_document_topics
    gamma, phis = self.inference([bow], collect_sstats=True)
  File "/usr/local/lib/python2.7/dist-packages/gensim/models/ldamodel.py", line 433, in inference
    expElogbetad = self.expElogbeta[:, ids]
IndexError: index 100225 is out of bounds for axis 1 with size 100000
2016-11-28 22:54:35,954 : INFO : initializing corpus reader from data/tweets.mm
2016-11-28 22:54:35,954 : INFO : accepted corpus with 40537 documents, 100000 features, 74069118 non-zero entries
2016-11-28 22:54:35,954 : INFO : loading LdaModel object from data/tweets_100_lem_5_pass.model
2016-11-28 22:54:36,092 : INFO : loading id2word recursively from data/tweets_100_lem_5_pass.model.id2word.* with mmap=None
2016-11-28 22:54:36,092 : INFO : setting ignored attribute state to None
2016-11-28 22:54:36,092 : INFO : setting ignored attribute dispatcher to None
2016-11-28 22:54:36,093 : INFO : loading LdaModel object from data/tweets_100_lem_5_pass.model.state
12325it [05:02, 12.53it/s]
2016-11-28 22:35:30,597 : INFO : loaded corpus index from data/wiki_corpus.mm.index
2016-11-28 22:35:30,597 : INFO : initializing corpus reader from data/wiki_corpus.mm
2016-11-28 22:35:30,597 : INFO : accepted corpus with 1319343 documents, 170000 features, 435221157 non-zero entries
2016-11-28 22:35:30,597 : INFO : loading LdaModel object from data/lda_100_lem_5_pass.model
2016-11-28 22:35:30,964 : INFO : loading id2word recursively from data/lda_100_lem_5_pass.model.id2word.* with mmap=None
2016-11-28 22:35:30,964 : INFO : loading expElogbeta from data/lda_100_lem_5_pass.model.expElogbeta.npy with mmap=None
2016-11-28 22:35:31,373 : INFO : setting ignored attribute state to None
2016-11-28 22:35:31,373 : INFO : setting ignored attribute dispatcher to None
2016-11-28 22:35:31,373 : INFO : loading LdaModel object from data/lda_100_lem_5_pass.model.state
2016-11-28 22:35:31,374 : INFO : loading sstats from data/lda_100_lem_5_pass.model.state.sstats.npy with mmap=None
25it [00:05, 5.13it/s]

# From self.inference (line 433):
# Excerpt quoted from gensim's LdaModel.inference for reference: the debug
# message is only emitted for chunks containing more than one document.
if len(chunk) > 1:
    logger.debug("performing inference on a chunk of %i documents", len(chunk))