I am trying to train FastText using gensim on the entire PMC/PubMed dump (about 150 GB), but I can't figure out how to initialize the model and/or set up the iterator that feeds sentences to training. I tried it with some dummy text data to check that everything works, and it does. I just can't figure out how to train on a very large corpus that doesn't fit into RAM.
import os
import spacy
from multiprocessing import cpu_count
from gensim.models import FastText

class MySentences(object):
    def __init__(self, dirname):
        self.dirname = dirname
        self.nlp = spacy.load('en')
        self.textFields = ['title', 'full_title', 'abstract']

    def __iter__(self):
        # Walk the dump directory and stream one tokenized sentence at a time,
        # so the whole corpus never has to sit in RAM.
        for root, dirs, files in os.walk(self.dirname, topdown=True):
            for filename in files:
                fullpath = os.path.join(root, filename)
                print(fullpath)  # progress indicator
                articles = getArticle(fullpath)  # my own parser for the article files
                for article in articles:
                    # Concatenate the text fields of interest into one string.
                    text = u'. '.join([article[x].strip() for x in article if x in self.textFields]).strip()
                    # spaCy's parser supplies the sentence boundaries.
                    tokens = self.nlp(text)
                    sentences = [sent.string.strip() for sent in tokens.sents]
                    for line in sentences:
                        yield line.split()
sentences = MySentences(r"/home/docClass/files/")
model = FastText(sentences=sentences, size=300, workers=cpu_count())

Passing sentences to the constructor makes gensim build the vocabulary and then train in one call, so no separate train() call is needed here. (A model.train(ft_path=HOME + 'tools/fastText/fasttext', model='skipgram') call belongs to gensim's wrapper around the compiled fastText binary, gensim.models.wrappers.fasttext, which reads its corpus from a file on disk and cannot consume a Python iterator, so the two APIs cannot be mixed; see the sketches below.)
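For a corpus this size it can be clearer to separate the vocabulary pass from the training passes. A minimal sketch, assuming gensim 3.x's native FastText (where the dimensionality parameter is still called size) and the MySentences class above; the epoch count and save path are illustrative:

from multiprocessing import cpu_count
from gensim.models import FastText

sentences = MySentences(r"/home/docClass/files/")

model = FastText(size=300, workers=cpu_count(), sg=1)  # sg=1: skip-gram
model.build_vocab(sentences)   # first streaming pass over the corpus: vocabulary only
model.train(sentences,         # further streaming passes: one per epoch
            total_examples=model.corpus_count,
            epochs=5)
model.save('/home/docClass/pmc_fasttext.model')  # hypothetical output path

Because MySentences is an iterable object whose __iter__ can be called repeatedly (unlike a one-shot generator), gensim can restart the stream for the vocabulary pass and again for every epoch, so only a small window of sentences is ever in memory.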
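If the compiled binary is the actual target (the ft_path argument suggests gensim's gensim.models.wrappers.fasttext wrapper), that wrapper's train() classmethod expects a corpus file on disk, so the stream would first have to be flattened into one tokenized sentence per line. A hedged sketch under that assumption; corpus_path is a hypothetical scratch location, and the binary path reuses the HOME prefix from the attempt above:

import io
from gensim.models.wrappers.fasttext import FastText as FastTextWrapper

corpus_path = '/home/docClass/pmc_corpus.txt'  # hypothetical scratch file
with io.open(corpus_path, 'w', encoding='utf8') as out:
    # One streaming pass: write each tokenized sentence as a space-separated line.
    for tokens in MySentences(r"/home/docClass/files/"):
        out.write(u' '.join(tokens) + u'\n')

model = FastTextWrapper.train(
    ft_path=HOME + 'tools/fastText/fasttext',  # path to the compiled fastText binary
    corpus_file=corpus_path,
    model='skipgram',
    size=300)

The trade-off is one extra copy of the tokenized corpus on disk, in exchange for letting the binary handle the multi-epoch training itself.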