import logging
import os
import sys
import bz2
import re
import itertools
import tarfile
import multiprocessing
from functools import partial
import gensim
from gensim.corpora import MmCorpus, Dictionary, WikiCorpus
from gensim import models, utils
import pyLDAvis
from pyLDAvis import gensim as gensim_vis
import argparse
import argcomplete
from nltk.tokenize import TweetTokenizer
from nltk.corpus import stopwords
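# minimal stop-word list filtered out of every document during preprocessing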
ignore_words = frozenset(['the', 'at', 'and', 'if', 'are', 'am', 'be', 'is', 'etc'])
def list_to_gen(directory):
    for filename in os.listdir(directory):
        yield os.path.join(directory, filename)
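# read one document and return its tokens: lemmatize via gensim's utils.lemmatize,
# apply Twitter-aware tokenization, or fall back to gensim's simple_preprocess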
def preprocess_text(lemma, tweet, document):
    # transform the document into one string
    with open(document, 'r') as infile:
        text = ' '.join(line.rstrip('\n') for line in infile)
    # convert the string to unicode
    text = gensim.utils.any2unicode(text)
    # remove URLs
    text = re.sub(r'\w+:\/{2}[\d\w-]+(\.[\d\w-]+)*(?:(?:\/[^\s/]*))*', '', text)
    if lemma:
        return utils.lemmatize(text, stopwords=ignore_words, min_length=3)
    if tweet:
        # remove symbols except @, # and whitespace
        text = re.sub(r'[^\w@#\s]', '', text)
        # tokenize words using the NLTK Twitter tokenizer
        tknzr = TweetTokenizer()
        text = tknzr.tokenize(text)
        # lowercase, drop tokens shorter than 3 characters and pure numbers
        text = [word.lower() for word in text if len(word) > 2 and not word.isdigit()]
        # remove stopwords
        return [word for word in text if word not in ignore_words]
    return utils.simple_preprocess(text, deacc=True, min_len=3)
# inherit from the TextCorpus class and override the get_texts method
class DocCorpus(gensim.corpora.TextCorpus):
    def __init__(self, docs_loc, lemmatize, twitterize, dictionary=None, metadata=None):
        self.docs_loc = docs_loc
        self.lemmatize = lemmatize
        self.twitterize = twitterize
        self.metadata = metadata
        if dictionary is None:
            self.dictionary = Dictionary(self.get_texts())
        else:
            self.dictionary = dictionary
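    # tokenize every document in parallel; TextCorpus calls this both to build
    # the dictionary and to stream the bag-of-words vectors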
    def get_texts(self):
        pool = multiprocessing.Pool(max(1, multiprocessing.cpu_count() - 1))
        func = partial(preprocess_text, self.lemmatize, self.twitterize)
        # map() blocks until every document has been tokenized, so the pool
        # can be shut down before the results are streamed to the caller
        results = pool.map(func, list_to_gen(self.docs_loc))
        pool.terminate()
        for tokens in results:
            logging.debug(tokens)
            yield tokens
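# small driver with hard-coded settings; argparse and argcomplete are imported
# but unused here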
def main():
    lemma = False
    twitterize = True
    docs_loc = '/path/to/dir/containing/text_docs/'
    doc_corpus = DocCorpus(docs_loc, lemma, twitterize)
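    # A possible continuation, not in the original: the unused imports above
    # (models, MmCorpus, pyLDAvis) suggest training and visualising an LDA model.
    # Minimal sketch; num_topics=10 and both file names are assumptions.
    MmCorpus.serialize('doc_corpus.mm', doc_corpus)  # tokenize once, store as bag-of-words
    bow_corpus = MmCorpus('doc_corpus.mm')
    lda = models.LdaModel(bow_corpus, id2word=doc_corpus.dictionary, num_topics=10)
    vis = gensim_vis.prepare(lda, bow_corpus, doc_corpus.dictionary)
    pyLDAvis.save_html(vis, 'doc_corpus_lda.html')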
if __name__ == '__main__':
    sys.exit(main())