from __future__ import unicode_literals, print_function
from gensim.parsing import PorterStemmer
from spacy.en import English
from gensim.models import Word2Vec, Phrases, phrases, KeyedVectors
from nltk.corpus import stopwords
from nltk import word_tokenize
from nltk import tokenize
import string
import re
import os
stemmer = PorterStemmer()
stopwords = stopwords.words('english')
nlp = English() #nlp = spacy.load("en")
data_dir_path = "full_path"
base_dir = os.path.dirname(data_dir_path)
os.chdir(base_dir)
class Stemming(object):
word_lookup = {}
@classmethod
def stem(cls, word):
stemmed = stemmer.stem(word)
if stemmed not in cls.word_lookup:
cls.word_lookup[stemmed] = {}
cls.word_lookup[stemmed][word] = (
cls.word_lookup[stemmed].get(word, 0) + 1)
return stemmed
@classmethod
def original_form(cls, word):
if word in cls.word_lookup:
return max(cls.word_lookup[word].keys(),
key=lambda x: cls.word_lookup[word][x])
else:
return word
class SentenceClass(object):
def __init__(self, dirname):
self.dirname = dirname
def __iter__(self):
for fname in os.listdir(self.dirname):
with open(os.path.join(self.dirname,fname), 'r') as myfile:
doc = myfile.read().replace('\n', ' ')
for sent in tokenize.sent_tokenize(doc.lower()):
yield [Stemming.stem(word)\
for word in word_tokenize(re.sub("[^A-Za-z]", " ",sent))\
if word not in stopwords]
class PhrasingIterable(object):
def __init__(self, phrasifier, texts):
self.phrasifier, self.texts = phrasifier, texts
def __iter__(self):
yield self.phrasifier[self.texts]
my_sentences = SentenceClass(data_dir_path)
my_phrases = Phrases(my_sentences, min_count=1)
my_corpus = PhrasingIterable(my_phrases,my_sentences)
model = Word2Vec(my_corpus, size=100, window=2, min_count=1, workers=2)
Hey Gordon,
Above is my complete code, the error I am getting as of now is below, this code above is passing a list somewhere when it is suppose to pass a words.
File "C:/Users/Adubey4/Desktop/rasagit/mycode/error_bigram.py", line 65, in <module>
model = Word2Vec(my_corpus, size=100, window=2, min_count=1, workers=2)
File "C:\Anaconda3\lib\site-packages\gensim\models\word2vec.py", line 503, in __init__
self.build_vocab(sentences, trim_rule=trim_rule)
File "C:\Anaconda3\lib\site-packages\gensim\models\word2vec.py", line 577, in build_vocab
self.scan_vocab(sentences, progress_per=progress_per, trim_rule=trim_rule) # initial survey
File "C:\Anaconda3\lib\site-packages\gensim\models\word2vec.py", line 601, in scan_vocab
vocab[word] += 1
TypeError: unhashable type: 'list'