Hello!
I am trying to map topics to each document in a corpus of roughly 2,000 academic papers.
Following Blei's (2003) recommendation to train an LDA model on a training set of roughly 10% of the corpus, I used the following code:
************************************************************************************************************************
import logging
import os

import gensim
from gensim import corpora


def iter_documents(top_directory):
    """Iterate over all documents, yielding one document (= list of utf8 tokens) at a time."""
    for root, dirs, files in os.walk(top_directory):
        for file in filter(lambda file: file.endswith('.txt'), files):
            document = open(os.path.join(root, file)).read()  # read the entire document, as one big string
            yield gensim.utils.tokenize(document, lower=True)  # or whatever tokenization suits you


class MyCorpus(object):
    def __init__(self, top_dir):
        self.top_dir = top_dir
        self.dictionary = gensim.corpora.Dictionary(iter_documents(top_dir))
        self.dictionary.save('/tmp/deerwester.dict')

    def __iter__(self):
        for tokens in iter_documents(self.top_dir):
            yield self.dictionary.doc2bow(tokens)


def main():
    top_directory = 'path to my training set'
    logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
    corpus = MyCorpus(top_directory)  # creates a MyCorpus object containing all the documents
    dictionary = corpora.Dictionary.load('/tmp/deerwester.dict')
    # Not entirely sure what this does, but it was necessary to get a proper corpus object.
    # I think this is because gensim's algorithms require the corpus to be stored on disk.
    corpora.MmCorpus.serialize('/tmp/corpus.mm', corpus)
    corpus = corpora.MmCorpus('/tmp/corpus.mm')
    lda = gensim.models.ldamulticore.LdaMulticore(corpus=corpus, id2word=dictionary,
                                                  num_topics=100, passes=50, batch=True,
                                                  workers=2, chunksize=3000, iterations=1000)
    lda.save('model directory')
    print("Ending!")

main()
************************************************************************************************************************
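As far as I understand from the gensim docs, the serialize step just writes the streamed bag-of-words vectors to disk once so they can be iterated over repeatedly afterwards. A toy sketch of what I think is happening (made-up vectors and a hypothetical /tmp/toy.mm path, not my real data):
************************************************************************************************************************
from gensim import corpora

# two toy bag-of-words documents, each a list of (token_id, count) pairs
toy_corpus = [[(0, 1), (1, 2)], [(1, 1)]]

# write the vectors to disk once, in Matrix Market format ...
corpora.MmCorpus.serialize('/tmp/toy.mm', toy_corpus)

# ... then re-open them as a corpus that can be streamed over many times
mm = corpora.MmCorpus('/tmp/toy.mm')
print(mm)          # e.g. MmCorpus(2 documents, 2 features, 3 non-zero entries)
for doc in mm:
    print(doc)     # the same (token_id, count) vectors, with float counts
************************************************************************************************************************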
I then assumed it would be fine to reuse this saved model on the entire set of papers, so I point almost the same code at a different directory. Only the last lines of main() matter here, I guess:
************************************************************************************************************************
import logging
import os

import gensim
from gensim import corpora


def iter_documents(top_directory):
    """Iterate over all documents, yielding one document (= list of utf8 tokens) at a time."""
    for root, dirs, files in os.walk(top_directory):
        for file in filter(lambda file: file.endswith('.txt'), files):
            document = open(os.path.join(root, file)).read()  # read the entire document, as one big string
            yield gensim.utils.tokenize(document, lower=True)  # or whatever tokenization suits you


class MyCorpus(object):
    def __init__(self, top_dir):
        self.top_dir = top_dir
        self.dictionary = gensim.corpora.Dictionary(iter_documents(top_dir))

    def __iter__(self):
        for tokens in iter_documents(self.top_dir):
            yield self.dictionary.doc2bow(tokens)


def main():
    top_directory = 'my already lemmatized and pre-processed documents'
    logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
    dictionary = corpora.Dictionary.load('/tmp/deerwester.dict')
    corpus = MyCorpus(top_directory)  # creates a MyCorpus object containing all the documents
    # Again, not entirely sure what this does, but it was necessary to get a proper corpus
    # object. I think this is because gensim's algorithms require the corpus to be stored on disk.
    corpora.MmCorpus.serialize('/tmp/runCorpus.mm', corpus)
    corpus = corpora.MmCorpus('/tmp/runCorpus.mm')
    lda = gensim.models.LdaModel.load('my already trained model')
    #lda.print_topics(num_topics=100, num_words=7)
    print(corpus)
    # a counter so I can see which document it breaks on
    x = 1
    for i in corpus:
        print(x)
        print(lda[i])  # map document i onto the trained topics
        x = x + 1

main()
************************************************************************************************************************
This is where it all goes down the drain. I have tried a number of gensim's built-in functions to get at least some topic extraction working, but I think I am doing something wrong with the dictionary.
The error I receive is:
************************************************************************************************************************
Traceback (most recent call last):
  File "runModel.py", line 55, in <module>
    main()
  File "runModel.py", line 49, in main
    print(lda[i])
  File "/usr/local/lib/python2.7/dist-packages/gensim-0.12.4-py2.7-linux-x86_64.egg/gensim/models/ldamodel.py", line 921, in __getitem__
    return self.get_document_topics(bow, eps)
  File "/usr/local/lib/python2.7/dist-packages/gensim-0.12.4-py2.7-linux-x86_64.egg/gensim/models/ldamodel.py", line 908, in get_document_topics
    gamma, _ = self.inference([bow])
  File "/usr/local/lib/python2.7/dist-packages/gensim-0.12.4-py2.7-linux-x86_64.egg/gensim/models/ldamodel.py", line 432, in inference
    expElogbetad = self.expElogbeta[:, ids]
IndexError: index 30213 is out of bounds for axis 1 with size 30213
************************************************************************************************************************
According to Radim in other threads about similar errors, this usually means the dictionary is being used incorrectly. But maybe I am just being slow, because I can't seem to figure it out.
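If it helps, these are the numbers I plan to compare next (len(dictionary), lda.num_terms and mm.num_terms are standard gensim attributes; the paths and model name are the ones from my scripts above):
************************************************************************************************************************
from gensim import corpora, models

# the dictionary saved by the training script, and the trained model
dictionary = corpora.Dictionary.load('/tmp/deerwester.dict')
lda = models.LdaModel.load('my already trained model')

# the model's topic-term matrix has exactly lda.num_terms columns, so every
# token id passed to lda[bow] must be strictly smaller than lda.num_terms
print(len(dictionary))  # size of the saved training dictionary
print(lda.num_terms)    # 30213, judging by the traceback

# vocabulary size of the serialized full corpus; if this is larger than
# lda.num_terms, the token ids cannot all be valid for the model
mm = corpora.MmCorpus('/tmp/runCorpus.mm')
print(mm.num_terms)
************************************************************************************************************************
My suspicion is that these will not match, but I don't see why, since I load /tmp/deerwester.dict in the second script as well.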
Can anyone help?