# build the model
# Report the worker count, train Doc2Vec over the document stream, then
# write the result twice under general_settings["save_dir"].
status_line = 'Now building doc2vec model with {} CPUs'.format(str(n_cpus))
print(status_line)

doc_iterator = DocIterator()
training_options = dict(workers=n_cpus, size=100, dbow_words=1)
model = gensim.models.Doc2Vec(documents=doc_iterator, **training_options)

# Paths are resolved only after training, one at a time, so a missing
# settings key surfaces at the same point as before.
doc2vec_path = os.path.join(
    general_settings["save_dir"], general_settings["doc2vec_model"])
model.save(doc2vec_path)

word2vec_path = os.path.join(
    general_settings["save_dir"], general_settings["word2vec_model"])
model.save_word2vec_format(word2vec_path)
class DocIterator(object):
    """Stream of ``TaggedDocument`` objects built from ``keystore`` records.

    ``__iter__`` is a generator function, so every ``iter()`` call issues a
    fresh ``keystore.get_docs`` request instead of resuming an old one.
    """

    def __iter__(self):
        """Yield one ``TaggedDocument`` per stored record.

        ``keystore.get_docs(size=100)`` is consumed as an iterable of
        batches, each batch being an iterable of dict-like records with
        ``"words"`` and ``"tags"`` keys.
        NOTE(review): ``size`` is presumably the batch size -- confirm
        against the keystore implementation.
        """
        for batch in keystore.get_docs(size=100):
            for record in batch:
                yield TaggedDocument(record["words"], record["tags"])

    @staticmethod
    def get_tagged_document(text, label):
        """Tokenize *text* and pair the resulting tokens with *label*.

        Processing order: collapse newline runs, split into word and
        punctuation tokens, lowercase, drop stop words and one-character
        tokens, then blank out digits and re-split on whitespace.

        Args:
            text: Raw document text (a string).
            label: Tag for the document; stored as the only element of
                the ``"tags"`` list.

        Returns:
            A dict with keys ``"words"`` (list of token strings) and
            ``"tags"`` (``[label]``) -- the same keys ``__iter__`` reads
            from each record.
        """
        # str.replace() matches its argument literally, so the former
        # text.replace('\n+', ' ') only hit a newline followed by '+'.
        # Collapsing newline runs requires a regex substitution.
        text = re.sub(r'\n+', ' ', text).strip()

        # Words (letters, digits, underscore, apostrophes) or one of . , ! ? ;
        tokens = re.findall(r"[\w']+|[.,!?;]", text)

        # lowercase. perhaps lemmatize too?
        tokens = [token.lower() for token in tokens]

        # Remove stop words. The length check also discards the
        # single-character punctuation tokens matched above.
        stop_words = TextProcessor._en_stop
        tokens = [
            token for token in tokens
            if token not in stop_words and len(token) > 1
        ]

        # Remove numbers: each digit becomes a space, then join + split
        # breaks tokens apart at those positions and drops empty pieces.
        # NOTE(review): pieces created here are not re-checked against the
        # stop-word / length filter above.
        tokens = ' '.join(re.sub(r'[\d]', ' ', token) for token in tokens).split()

        return {"words": tokens, "tags": [label]}
--
You received this message because you are subscribed to a topic in the Google Groups "gensim" group.
To unsubscribe from this topic, visit https://groups.google.com/d/topic/gensim/VmCai9ciUz0/unsubscribe.
To unsubscribe from this group and all its topics, send an email to gensim+unsubscribe@googlegroups.com.
For more options, visit https://groups.google.com/d/optout.
--
You received this message because you are subscribed to a topic in the Google Groups "gensim" group.
To unsubscribe from this topic, visit https://groups.google.com/d/topic/gensim/VmCai9ciUz0/unsubscribe.
To unsubscribe from this group and all its topics, send an email to gensim+unsubscribe@googlegroups.com.
For more options, visit https://groups.google.com/d/optout.
--
You received this message because you are subscribed to a topic in the Google Groups "gensim" group.
To unsubscribe from this topic, visit https://groups.google.com/d/topic/gensim/VmCai9ciUz0/unsubscribe.
To unsubscribe from this group and all its topics, send an email to gensim+unsubscribe@googlegroups.com.
For more options, visit https://groups.google.com/d/optout.