def extract_phrase_score(phrases_model, documents):
    """
    Lazily yield every (phrase, score) pair the GenSim phrases model found.

    :param phrases_model: trained GenSim Phrases model to pull phrases from
    :param documents: the corpus of documents the model was trained on
    :return: generator of (phrase, score) tuples; wrap in list() to
        materialize. Duplicate phrases are yielded as-is and are expected
        to be de-duplicated downstream.
    """
    phrase_iterator = phrases_model.export_phrases(documents)
    for pair in phrase_iterator:
        yield pair[0], pair[1]
def search_for_three_word_phrases(tokenized_document_list,
                                  min_count=10,
                                  threshold=100,
                                  save_to_disk=True,
                                  stopwords=None):
    """
    Iteratively apply GenSim Phrases to find bigram and trigram phrases.

    :param tokenized_document_list: output of preprocess_string_for_phraser();
        like GenSim Phrases(), a list of tokenized documents (list of lists).
    :param min_count: minimum number of occurrences for a candidate to be
        considered a phrase
    :param threshold: minimum GenSim phrase statistic for acceptance
    :param save_to_disk: if True, write the scored phrases as a timestamped
        CSV under model/phrases/ below ROOT_DIRECTORY
    :param stopwords: connector words passed to Phrases(common_terms=...);
        defaults to the NLTK English stopword list. Resolved at call time —
        the old def-time default `stopwords.words('english')` ran before the
        module-level nltk import and also shared one mutable list across calls.
    :return: pandas DataFrame of unique phrases with their phraser score,
        sorted by score descending.
    """
    if stopwords is None:
        # Local import so the default works regardless of where the
        # module-level nltk import sits in this file.
        from nltk.corpus import stopwords as nltk_stopwords
        stopwords = nltk_stopwords.words('english')
    # First pass: bigram phrases over the raw token stream.
    bigram_phrases = Phrases(tokenized_document_list,
                             common_terms=stopwords,
                             min_count=min_count,
                             threshold=threshold)
    bigram_phraser = Phraser(bigram_phrases)
    # Second pass: re-run Phrases over the bigram-merged stream so that
    # bigram+word combinations surface as trigrams.
    trigram_phrases = Phrases(bigram_phraser[tokenized_document_list],
                              common_terms=stopwords,
                              min_count=min_count,
                              threshold=threshold)
    phrase_list = list(extract_phrase_score(trigram_phrases,
                                            tokenized_document_list))
    out_df = pd.DataFrame(phrase_list,
                          columns=['phrase', 'score']).drop_duplicates()
    # export_phrases() yields byte strings; decode for a readable frame.
    out_df['phrase'] = [i.decode('UTF-8') for i in out_df['phrase']]
    out_df_sorted = (out_df.sort_values('score', ascending=False)
                           .reset_index(drop=True))
    if save_to_disk:
        create_module_directory(join(ROOT_DIRECTORY, 'model'))
        create_module_directory(join(ROOT_DIRECTORY, 'model', 'phrases'))
        timestamp = datetime.datetime.now().strftime('%Y%m%d_%H%M%S')
        phrase_filename = join('model', 'phrases',
                               timestamp + ' - phrases_export.csv')
        out_df_sorted.to_csv(join(ROOT_DIRECTORY, phrase_filename))
    # BUG FIX: original returned the misspelled name `out_df_sorte`,
    # which raised NameError on every call.
    return out_df_sorted
def phrases_pipeline(document_list,
                     min_count=10,
                     threshold=100,
                     save_to_disk=True):
    """
    High-level pipeline over all phrases functions.

    Tokenizes the raw documents, trains the trigram phrases model
    (optionally persisting it to disk), and returns the unique phrases
    with their scores.

    :param document_list: list of text documents to search for phrases
    :param min_count: minimum number of appearances for phrase eligibility
    :param threshold: minimum GenSim phrase statistic to count as a phrase
    :param save_to_disk: whether to persist the scored phrases to disk
    :return: data frame of phrase strings with their associated scores
    """
    token_streams = [tokenize_string(document) for document in document_list]
    return search_for_three_word_phrases(token_streams,
                                         min_count=min_count,
                                         threshold=threshold,
                                         save_to_disk=save_to_disk)

import datetime
from os.path import join

import pandas as pd
from nltk.corpus import stopwords
from gensim.models.phrases import (Phrases, Phraser)
# NOTE(review): this is an exact redefinition of extract_phrase_score
# declared earlier in this file; at import time this second definition
# wins. Consider deleting one of the two copies.
def extract_phrase_score(phrases_model, documents):
    """
    Yield each (phrase, score) pair identified by the GenSim model.

    :param phrases_model: trained GenSim Phrases model to pull phrases from
    :param documents: the corpus of documents the model was trained on
    :return: generator over (phrase, score) tuples — call list() to
        materialize. May contain duplicates; callers de-duplicate.
    """
    yield from phrases_model.export_phrases(documents)
if __name__ == '__main__':
    ## The brown corpus must be installed once before first use:
    # import nltk
    # nltk.download('brown')
    from nltk.corpus import brown

    sentence_stream = brown.sents()
    english_stopwords = stopwords.words("english")

    def _scored_phrase_table(model, corpus):
        # Extract (phrase, score) pairs, drop duplicates, decode the
        # byte-string phrases, and sort by score descending.
        pairs = list(extract_phrase_score(model, corpus))
        table = pd.DataFrame(pairs,
                             columns=['phrase', 'score']).drop_duplicates()
        table['phrase'] = [p.decode('UTF-8') for p in table['phrase']]
        return (table.sort_values('score', ascending=False)
                     .reset_index(drop=True))

    # Sweep seven thresholds; print the top 25 trigram phrases per setting.
    for thresh in [100, 200, 400, 800, 1600, 3200, 6400]:
        # Identify bigram phrases over the raw sentence stream.
        bigram_phrases = Phrases(sentence_stream,
                                 common_terms=english_stopwords,
                                 min_count=10,
                                 threshold=thresh)
        bigram_table = _scored_phrase_table(bigram_phrases, sentence_stream)
        # print(bigram_table.head(25))
        # Create a bigram phraser object to iteratively build trigrams.
        bigram_phraser = Phraser(bigram_phrases)
        trigram_phrases = Phrases(bigram_phraser[sentence_stream],
                                  common_terms=english_stopwords,
                                  min_count=10,
                                  threshold=thresh)
        trigram_table = _scored_phrase_table(trigram_phrases, sentence_stream)
        print(trigram_table.head(25))