import string
from gensim.models.word2vec import Word2Vec 

def remove_punc(paragraph: list[str]) -> list[str]:
    """Strip ASCII punctuation from every string and lower-case it, in place.

    Args:
        paragraph: Strings to normalize. Each element is overwritten with its
            normalized form, so the caller's list is modified.

    Returns:
        The same list object that was passed in, now holding the normalized
        strings.
    """
    # The deletion table is identical for every element, so build it once
    # instead of once per string.
    strip_table = str.maketrans('', '', string.punctuation)
    for index, text in enumerate(paragraph):
        paragraph[index] = text.translate(strip_table).lower()
    return paragraph

def get_list_of_lists(data: list[str]) -> list[list[str]]:
    """Tokenize each string into a list of whitespace-separated words.

    Args:
        data: Strings to tokenize, one per sentence/line.

    Returns:
        A list with one token list per input string, in input order. A string
        that is empty or contains only whitespace yields an empty token list.
    """
    # split() with no argument collapses runs of whitespace and ignores
    # leading/trailing whitespace, so no empty-string tokens are produced
    # (split(' ') would emit '' for every extra space, e.g. where a
    # free-standing punctuation mark was stripped).
    return [line.split() for line in data]

def lists_to_list(lst: list[list[str]], windows: int) -> list[list[str]]:
    """Build [word, neighbour] pairs for every word within a distance window.

    For each distance ``d`` from 1 to ``windows`` (inclusive), every token
    list is scanned and, for each position, the word is paired with the word
    ``d`` positions to its right (if any) and then with the word ``d``
    positions to its left (if any).

    Args:
        lst: Token lists, one per sentence.
        windows: Maximum distance between the two words of a pair. Values
            below 1 produce no pairs.

    Returns:
        A flat list of two-element lists ``[word, neighbour]``. Pairs are
        ordered by distance first, then by sentence, then by word position;
        pairs never cross sentence boundaries.
    """
    pairs: list[list[str]] = []
    for distance in range(1, windows + 1):
        for line in lst:
            length = len(line)
            for position, word in enumerate(line):
                # Right-hand neighbour, when it is still inside the sentence.
                if position + distance < length:
                    pairs.append([word, line[position + distance]])
                # Left-hand neighbour; the explicit bound check prevents a
                # negative index from wrapping around to the sentence end.
                if position - distance >= 0:
                    pairs.append([word, line[position - distance]])
    return pairs
    
# Small demo corpus: one string per sentence.
origin_text = [
    'This article is about military actions primarily.',
    'The American Revolutionary War (April 19, 1775 September 3, 1783), also known as the Revolutionary War or American War of Independence, was a major war of the American Revolution',
    'Widely considered as the war that secured the independence of the United States, fighting began on April 19, 1775, followed by the Lee Resolution on July 2, 1776, and the Declaration of Independence on July 4, 1776',
    'The American Patriots were supported by the Kingdom of France and, to a lesser extent, the Dutch Republic and the Spanish Empire, in a conflict taking place in North America, the Caribbean, and the Atlantic Ocean.'
]

# remove_punc overwrites the list it is given, so pass a copy to keep
# origin_text as the untouched original.
processed = remove_punc(list(origin_text))
# Training data is a flat list of 2-word [word, neighbour] pairs taken from
# up to 2 positions away within each sentence.
traindata = lists_to_list(get_list_of_lists(processed), 2)

# Two skip-gram (sg=1) models that differ only in their window size.
# workers=1 is required in addition to seed for a reproducible run: with
# multiple worker threads, OS scheduling changes the training order.
# NOTE(review): every training "sentence" here is just 2 tokens long, so the
# window setting presumably has little influence on which contexts are seen;
# confirm that comparing window=5 against window=3 on this data is intended.
model1 = Word2Vec(traindata, window=5, sg=1, min_count=2, seed=666, vector_size=50, workers=1)
model2 = Word2Vec(traindata, window=3, sg=1, min_count=2, seed=666, vector_size=50, workers=1)

# Similarity between the same word pair under each model.
similarity1 = model1.wv.similarity('american', 'war')
similarity2 = model2.wv.similarity('american', 'war')

print('model1 similarity: ', similarity1)
print('model2 similarity: ', similarity2)