"""Build (centre, context) word pairs from a few sentences and train Word2Vec.

The helper functions only need the standard library; gensim is required only
when the file is executed as a script.
"""

import string


def remove_punc(paragraph: list[str]) -> list[str]:
    """Strip ASCII punctuation from every string and lower-case it, in place.

    Args:
        paragraph: Strings to clean. The list is modified in place.

    Returns:
        The same list object that was passed in, now holding cleaned strings.
    """
    # Build the deletion table once instead of once per string.
    table = str.maketrans('', '', string.punctuation)
    # Slice assignment keeps the original in-place contract for callers that
    # hold a reference to the list they passed in.
    paragraph[:] = [text.translate(table).lower() for text in paragraph]
    return paragraph


def get_list_of_lists(data: list[str]) -> list[list[str]]:
    """Tokenise each string on whitespace.

    Args:
        data: Strings to tokenise.

    Returns:
        One list of tokens per input string, in input order.
    """
    # split() without an argument collapses runs of whitespace, so the gaps
    # left behind by removed punctuation do not become empty-string tokens
    # (which split(' ') would produce).
    return [line.split() for line in data]


def lists_to_list(lst: list[list[str]], windows: int) -> list[list[str]]:
    """Flatten tokenised sentences into two-element [word, neighbour] pairs.

    For every distance from 1 to ``windows`` and every token position, the
    token is paired with the token that many positions after it and with the
    token that many positions before it, whenever such a neighbour exists
    inside the same sentence.

    NOTE(review): the original condition was garbled to ``if j+i=0:``; the
    forward/backward bounds checks below are a reconstruction - confirm
    against the author's intent.

    Args:
        lst: Tokenised sentences.
        windows: Largest neighbour distance to pair; values below 1 give an
            empty result.

    Returns:
        A list of ``[word, neighbour]`` lists, grouped by distance first,
        then by sentence, then by token position.
    """
    pairs = []
    for offset in range(1, windows + 1):
        for line in lst:
            for j, word in enumerate(line):
                if j + offset < len(line):
                    pairs.append([word, line[j + offset]])
                # Explicit lower bound: a negative index would silently wrap
                # around to the end of the sentence.
                if j - offset >= 0:
                    pairs.append([word, line[j - offset]])
    return pairs


origin_text = [
    'This article is about military actions primarily.',
    'The American Revolutionary War (April 19, 1775 September 3, 1783), also known as the Revolutionary War or American War of Independence, was a major war of the American Revolution',
    'Widely considered as the war that secured the independence of the United States, fighting began on April 19, 1775, followed by the Lee Resolution on July 2, 1776, and the Declaration of Independence on July 4, 1776',
    'The American Patriots were supported by the Kingdom of France and, to a lesser extent, the Dutch Republic and the Spanish Empire, in a conflict taking place in North America, the Caribbean, and the Atlantic Ocean.',
]


def main() -> None:
    """Train two skip-gram models on the word pairs and print a similarity."""
    # Imported here so the helpers above stay importable without gensim.
    from gensim.models.word2vec import Word2Vec

    # remove_punc cleans origin_text in place and returns the same list.
    processed = remove_punc(origin_text)
    traindata = lists_to_list(get_list_of_lists(processed), 2)

    # NOTE(review): every training "sentence" is a two-word pair, so the
    # differing window sizes may have little effect - confirm this is intended.
    model1 = Word2Vec(traindata, window=5, sg=1, min_count=2, seed=666, vector_size=50)
    model2 = Word2Vec(traindata, window=3, sg=1, min_count=2, seed=666, vector_size=50)

    similarity1 = model1.wv.similarity('american', 'war')
    similarity2 = model2.wv.similarity('american', 'war')
    print('model1 similarity: ', similarity1)
    print('model2 similarity: ', similarity2)


if __name__ == "__main__":
    main()