For the past week I have been facing a strange issue with the LdaMallet wrapper in gensim. I am training a mallet model on a fairly large dataset and saving the trained model along with the dictionary for later use.
I passed a prefix while training so I could look at the temp files created by the model. The following files are created when the model is trained:
'corpus.mallet'
'corpus.txt'
'doctopics.txt'
'inferencer.mallet'
'state.mallet.gz'
'topickeys.txt'
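For context, this is roughly how the prefix is passed at training time (a minimal sketch, not my full code; the paths are placeholders, and I am assuming the wrapper's fcorpustxt()/fdoctopics()/ftopickeys() helpers are what map the prefix to the files listed above):

import gensim

# Placeholders -- the real paths are in the full training code further down.
mallet_path = r'C:\path\to\mallet-2.0.8\bin\mallet.bat'
prefix_path = r'C:\path\to\mallet_temp\\'

# corpus / dictionary are assumed to be an already-built BoW corpus and gensim Dictionary
model = gensim.models.wrappers.LdaMallet(mallet_path, corpus=corpus, num_topics=10,
                                         id2word=dictionary, prefix=prefix_path,
                                         random_seed=42)

# The wrapper derives the temp-file names from the prefix:
print(model.fcorpustxt())    # prefix + 'corpus.txt'
print(model.fdoctopics())    # prefix + 'doctopics.txt'
print(model.ftopickeys())    # prefix + 'topickeys.txt'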
Now, when I load the saved model in a different console and pass an unseen corpus created using the saved dictionary, no other temp files are created and the call produces the following error:
FileNotFoundError: No such file or directory: 'my_directory\\doctopics.txt.infer'
For some odd reason, if I load the saved model in the same console (the one it was trained in) and pass an unseen corpus as above, 'corpus.txt' is updated and two new temp files are created:
'corpus.mallet.infer'
'doctopics.txt.infer'
Any idea why I might be having this issue?
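To be clear about what I am checking: my understanding (an assumption based on reading the wrapper, not something I have confirmed) is that calling the loaded model on a new corpus rewrites corpus.txt under the model's stored prefix, shells out to mallet to create corpus.mallet.infer, and then reads back doctopics.txt.infer. So in the new console I have been inspecting the paths baked into the unpickled model like this:

import os
import joblib

loaded_model = joblib.load('optimal_lda_model.pkl')   # same file my training script saves

print(loaded_model.prefix)        # temp-file prefix stored at training time
print(loaded_model.mallet_path)   # mallet.bat path stored at training time
print(os.path.isdir(os.path.dirname(loaded_model.prefix)))   # does the temp dir still exist?
print(os.environ.get('MALLET_HOME'))   # the new console may not have this set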
Here is my training code:

import pandas as pd
import numpy as np
import re
import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel
import spacy
import en_core_web_sm
from nltk.corpus import stopwords
import os
import joblib
all_news = pd.read_csv('all_news.csv')
def processed_text(news_df):
    def sent_to_words():
        data = news_df.Text.values.tolist()
        data = [re.sub('\S*@\S*\s?', '', sent) for sent in data]
        data = [re.sub('\s+', ' ', sent) for sent in data]
        data = [re.sub("\'", "", sent) for sent in data]
        for sentence in data:
            yield simple_preprocess(str(sentence), deacc=True)

    def remove_stopwords():
        stop_words = stopwords.words('english')
        stop_words.extend(['from', 'subject', 're', 'edu', 'use'])
        return [[word for word in simple_preprocess(str(doc)) if word not in stop_words]
                for doc in sent_to_words()]

    def make_trigrams():
        bigram = gensim.models.phrases.Phrases(sent_to_words(), min_count=5, threshold=100)
        bigram_mod = gensim.models.phrases.Phraser(bigram)
        bigram_data_words = [bigram_mod[doc] for doc in remove_stopwords()]
        trigram = gensim.models.phrases.Phrases(bigram[sent_to_words()], threshold=100)
        trigram_mod = gensim.models.phrases.Phraser(trigram)
        return [trigram_mod[bigram_mod[doc]] for doc in bigram_data_words]

    def lemmatization():
        texts_out = []
        allowed_postags = ['NOUN', 'ADJ', 'VERB', 'ADV']
        nlp = en_core_web_sm.load(disable=['parser', 'ner'])
        for sent in make_trigrams():
            doc = nlp(" ".join(sent))
            texts_out.append([token.lemma_ for token in doc if token.pos_ in allowed_postags])
        return texts_out

    return lemmatization()
clean_text = processed_text(all_news[0:15])
def find_optimum_model(clean_text):
    lemmatized_words = clean_text
    id2word = corpora.Dictionary(lemmatized_words)
    all_corpus = [id2word.doc2bow(text) for text in lemmatized_words]

    # For the two lines below, update with your path to new_mallet
    os.environ.update({'MALLET_HOME': r'C:\\users\\axk0er8\\Sentiment_Analysis_Working\\NewsSentimentAnalysis\\mallet-2.0.8'})
    mallet_path = r'C:\\users\\axk0er8\\Sentiment_Analysis_Working\\NewsSentimentAnalysis\\mallet-2.0.8\\bin\\mallet.bat'
    prefix_path = r'C:\\users\\axk0er8\\Sentiment_Analysis_Working\\NewsSentimentAnalysis\\mallet_temp\\'

    def compute_coherence_values(dictionary, all_corpus, texts, limit, start=2, step=4):
        coherence_values = []
        model_list = []
        num_topics_list = []
        for num_topics in range(start, limit, step):
            model = gensim.models.wrappers.LdaMallet(mallet_path, corpus=all_corpus, num_topics=num_topics,
                                                     id2word=dictionary, prefix=prefix_path,
                                                     optimize_interval=50, random_seed=42)
            model_list.append(model)
            coherencemodel = CoherenceModel(model=model, texts=texts, dictionary=dictionary, coherence='c_v')
            coherence_values.append(coherencemodel.get_coherence())
            num_topics_list.append(num_topics)
        return model_list, coherence_values, num_topics_list

    model_list, coherence_values, num_topics_list = compute_coherence_values(
        dictionary=id2word, all_corpus=all_corpus, texts=lemmatized_words, start=5, limit=40, step=6)
    model_values_df = pd.DataFrame({'model_list': model_list,
                                    'coherence_values': coherence_values,
                                    'num_topics': num_topics_list})

    # Pick the number of topics with the highest c_v coherence and retrain the final model
    optimal_num_topics = model_values_df.loc[model_values_df['coherence_values'].idxmax()]['num_topics']
    optimal_model = gensim.models.wrappers.LdaMallet(mallet_path, corpus=all_corpus, num_topics=optimal_num_topics,
                                                     id2word=id2word, prefix=prefix_path, random_seed=42)

    # Save the dictionary and the trained model for use in a different console/session
    id2word.save('id2word_dictionary.gensim')
    joblib.dump(optimal_model, 'optimal_lda_model.pkl')
find_optimum_model(clean_text)
And this is the script I run in the other console to get dominant topics for the unseen documents:

import pandas as pd
import numpy as np
import re
import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel
import spacy
import en_core_web_sm
from nltk.corpus import stopwords
import os
import joblib
unseen_news = pd.read_csv('article_df.csv')
def processed_text(news_df):
    def sent_to_words():
        data = news_df.Text.values.tolist()
        data = [re.sub('\S*@\S*\s?', '', sent) for sent in data]
        data = [re.sub('\s+', ' ', sent) for sent in data]
        data = [re.sub("\'", "", sent) for sent in data]
        for sentence in data:
            yield simple_preprocess(str(sentence), deacc=True)

    def remove_stopwords():
        stop_words = stopwords.words('english')
        stop_words.extend(['from', 'subject', 're', 'edu', 'use'])
        return [[word for word in simple_preprocess(str(doc)) if word not in stop_words]
                for doc in sent_to_words()]

    def make_trigrams():
        bigram = gensim.models.phrases.Phrases(sent_to_words(), min_count=5, threshold=100)
        bigram_mod = gensim.models.phrases.Phraser(bigram)
        bigram_data_words = [bigram_mod[doc] for doc in remove_stopwords()]
        trigram = gensim.models.phrases.Phrases(bigram[sent_to_words()], threshold=100)
        trigram_mod = gensim.models.phrases.Phraser(trigram)
        return [trigram_mod[bigram_mod[doc]] for doc in bigram_data_words]

    def lemmatization():
        texts_out = []
        allowed_postags = ['NOUN', 'ADJ', 'VERB', 'ADV']
        nlp = en_core_web_sm.load(disable=['parser', 'ner'])
        for sent in make_trigrams():
            doc = nlp(" ".join(sent))
            texts_out.append([token.lemma_ for token in doc if token.pos_ in allowed_postags])
        return texts_out

    return lemmatization()
clean_unseen_text = processed_text(unseen_news[0:5])
def generate_dominant_topic(clean_text):
    lemmatized_words = clean_text
    id2word = corpora.Dictionary.load('id2word_dictionary.gensim')
    new_corpus = [id2word.doc2bow(text) for text in lemmatized_words]
    optimal_model = joblib.load('optimal_lda_model.pkl')

    def format_topics_sentences(ldamodel, new_corpus):
        sent_topics_df = pd.DataFrame()
        # ldamodel[new_corpus] triggers mallet inference; this is where the FileNotFoundError is raised
        for i, row in enumerate(ldamodel[new_corpus]):
            row = sorted(row, key=lambda x: (x[1]), reverse=True)
            for j, (topic_num, prop_topic) in enumerate(row):
                if j == 0:
                    wp = ldamodel.show_topic(topic_num)
                    topic_keywords = ", ".join([word for word, prop in wp])
                    sent_topics_df = sent_topics_df.append(
                        pd.Series([int(topic_num), round(prop_topic, 4), topic_keywords]),
                        ignore_index=True)
                else:
                    break
        sent_topics_df.columns = ['Dominant_Topic', 'Perc_Contribution', 'Topic_Keywords']
        return sent_topics_df

    df_topic_sents_keywords = format_topics_sentences(ldamodel=optimal_model, new_corpus=new_corpus)
    return df_topic_sents_keywords
unseen_topic_df = generate_dominant_topic(clean_unseen_text)
unseen_topic_df.shape