import time
import gensim.corpora.dictionary as gsdict
from nltk.stem.porter import PorterStemmer as nltkPS
from gensim import corpora, models, similarities
import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
my_tfidf_corpus = corpora.MmCorpus('E:/Corpus/tfidf_toy_corpus.mm')
my_mini_corpus = my_tfidf_corpus[0:50000]
print(time.process_time())
my_dictionary = gsdict.Dictionary.load('compact_dictionary.pickle')
print("before model initialization")
lda_model = models.LdaModel(my_mini_corpus,num_topics=40,id2word=my_dictionary,chunksize=2000)
But if I attempt to use the multi-processor version, I run into problems. I change the last line to:
lda_model = models.LdaMulticore(my_mini_corpus,num_topics=40,id2word=my_dictionary,workers=1)
and I get the following error over and over again:
Traceback (most recent call last):
File "<string>", line 1, in <module>
File "C:\Winpython\WinPython-64bit-3.5.4.0Qt5\python-3.5.4.amd64\lib\multiprocessing\spawn.py", line 106, in spawn_main
exitcode = _main(fd)
File "C:\Winpython\WinPython-64bit-3.5.4.0Qt5\python-3.5.4.amd64\lib\multiprocessing\spawn.py", line 115, in _main
prepare(preparation_data)
File "C:\Winpython\WinPython-64bit-3.5.4.0Qt5\python-3.5.4.amd64\lib\multiprocessing\spawn.py", line 226, in prepare
_fixup_main_from_path(data['init_main_from_path'])
File "C:\Winpython\WinPython-64bit-3.5.4.0Qt5\python-3.5.4.amd64\lib\multiprocessing\spawn.py", line 278, in _fixup_main_from_path
run_name="__mp_main__")
File "C:\Winpython\WinPython-64bit-3.5.4.0Qt5\python-3.5.4.amd64\lib\runpy.py", line 263, in run_path
pkg_name=pkg_name, script_name=fname)
File "C:\Winpython\WinPython-64bit-3.5.4.0Qt5\python-3.5.4.amd64\lib\runpy.py", line 96, in _run_module_code
mod_name, mod_spec, pkg_name, script_name)
File "C:\Winpython\WinPython-64bit-3.5.4.0Qt5\python-3.5.4.amd64\lib\runpy.py", line 85, in _run_code
exec(code, run_globals)
File "C:\Users\David\Desktop\AB\AB_2017_Dev\NLP\Create_LDA_Model.py", line 26, in <module>
lda_model = models.LdaMulticore(my_mini_corpus,num_topics=40,id2word=my_dictionary,workers=1)
File "C:\Winpython\WinPython-64bit-3.5.4.0Qt5\python-3.5.4.amd64\lib\site-packages\gensim\models\ldamulticore.py", line 151, in __init__
minimum_phi_value=minimum_phi_value, per_word_topics=per_word_topics
File "C:\Winpython\WinPython-64bit-3.5.4.0Qt5\python-3.5.4.amd64\lib\site-packages\gensim\models\ldamodel.py", line 340, in __init__
self.update(corpus, chunks_as_numpy=use_numpy)
File "C:\Winpython\WinPython-64bit-3.5.4.0Qt5\python-3.5.4.amd64\lib\site-packages\gensim\models\ldamulticore.py", line 214, in update
pool = Pool(self.workers, worker_e_step, (job_queue, result_queue,))
File "C:\Winpython\WinPython-64bit-3.5.4.0Qt5\python-3.5.4.amd64\lib\multiprocessing\context.py", line 118, in Pool
context=self.get_context())
File "C:\Winpython\WinPython-64bit-3.5.4.0Qt5\python-3.5.4.amd64\lib\multiprocessing\pool.py", line 174, in __init__
self._repopulate_pool()
File "C:\Winpython\WinPython-64bit-3.5.4.0Qt5\python-3.5.4.amd64\lib\multiprocessing\pool.py", line 239, in _repopulate_pool
w.start()
File "C:\Winpython\WinPython-64bit-3.5.4.0Qt5\python-3.5.4.amd64\lib\multiprocessing\process.py", line 105, in start
self._popen = self._Popen(self)
File "C:\Winpython\WinPython-64bit-3.5.4.0Qt5\python-3.5.4.amd64\lib\multiprocessing\context.py", line 313, in _Popen
return Popen(process_obj)
File "C:\Winpython\WinPython-64bit-3.5.4.0Qt5\python-3.5.4.amd64\lib\multiprocessing\popen_spawn_win32.py", line 34, in __init__
prep_data = spawn.get_preparation_data(process_obj._name)
File "C:\Winpython\WinPython-64bit-3.5.4.0Qt5\python-3.5.4.amd64\lib\multiprocessing\spawn.py", line 144, in get_preparation_data
_check_not_importing_main()
File "C:\Winpython\WinPython-64bit-3.5.4.0Qt5\python-3.5.4.amd64\lib\multiprocessing\spawn.py", line 137, in _check_not_importing_main
is not going to be frozen to produce an executable.''')
RuntimeError:
An attempt has been made to start a new process before the
current process has finished its bootstrapping phase.
This probably means that you are not using fork to start your
child processes and you have forgotten to use the proper idiom
in the main module:
if __name__ == '__main__':
freeze_support()
...
The "freeze_support()" line can be omitted if the program
is not going to be frozen to produce an executable.
After each of these errors, it looks like a new process is started because I see:
C:\Winpython\WinPython-64bit-3.5.4.0Qt5\python-3.5.4.amd64\lib\site-packages\gensim\utils.py:862: UserWarning: detected Windows; aliasing chunkize to chunkize_serial
warnings.warn("detected Windows; aliasing chunkize to chunkize_serial")
Using TensorFlow backend.
Then I see some logging info and another copy of the same error.