import nltk
from nltk import word_tokenize, sent_tokenize
# Sample corpus: two short sentences used to build a toy trigram language model.
text = "Natural language processing is an area of computer science. NLP is an area of computer science."

# Use the tokenizers imported directly at the top of the file (consistent with
# the `from nltk import word_tokenize, sent_tokenize` line) rather than going
# back through the `nltk.` module namespace.
sents = sent_tokenize(text)
print("The number of sentences is", len(sents)) #prints the number of sentences

tokens = word_tokenize(text)
print("The number of tokens is", len(tokens)) #prints the number of tokens

unique_tokens = set(tokens) #set() Build an unordered collection of unique elements.
print("The number of unique tokens are", len(unique_tokens)) #prints the number of unique tokens
# output: The number of sentences is 2 The number of tokens is 18 The number of unique tokens are 11
# Tokenize each sentence individually: the LM preprocessing pipeline below
# expects a list of tokenized sentences (a list of lists of token strings).
# (The original append-loop had lost its body indentation in the paste; a
# comprehension expresses the same construction unambiguously.)
tokenized_text = [word_tokenize(sentence) for sentence in sents]
#tokenized_text
from nltk.lm.preprocessing import padded_everygram_pipeline

# Build the training stream (per-sentence everygrams of orders 1..3, padded
# with <s>/</s>) and the flattened vocabulary stream for a trigram model.
train, vocab = padded_everygram_pipeline(3, tokenized_text)

# NOTE: `train` is a one-shot generator of generators — iterating it here for
# inspection CONSUMES it, so a fresh pipeline must be built before fitting a
# model with it.
for ngramlize_sent in train:
    print(list(ngramlize_sent))
    print()
# output: [('<s>',), ('<s>', '<s>'), ('<s>', '<s>', 'Natural'), ('<s>',), ('<s>', 'Natural'), ('<s>', 'Natural', 'language'), ('Natural',), ('Natural', 'language'), ('Natural', 'language', 'processing'), ('language',), ('language', 'processing'), ('language', 'processing', 'is'), ('processing',), ('processing', 'is'), ('processing', 'is', 'an'), ('is',), ('is', 'an'), ('is', 'an', 'area'), ('an',), ('an', 'area'), ('an', 'area', 'of'), ('area',), ('area', 'of'), ('area', 'of', 'computer'), ('of',), ('of', 'computer'), ('of', 'computer', 'science'), ('computer',), ('computer', 'science'), ('computer', 'science', '.'), ('science',), ('science', '.'), ('science', '.', '</s>'), ('.',), ('.', '</s>'), ('.', '</s>', '</s>'), ('</s>',), ('</s>', '</s>'), ('</s>',)] [('<s>',), ('<s>', '<s>'), ('<s>', '<s>', 'NLP'), ('<s>',), ('<s>', 'NLP'), ('<s>', 'NLP', 'is'), ('NLP',), ('NLP', 'is'), ('NLP', 'is', 'an'), ('is',), ('is', 'an'), ('is', 'an', 'area'), ('an',), ('an', 'area'), ('an', 'area', 'of'), ('area',), ('area', 'of'), ('area', 'of', 'computer'), ('of',), ('of', 'computer'), ('of', 'computer', 'science'), ('computer',), ('computer', 'science'), ('computer', 'science', '.'), ('science',), ('science', '.'), ('science', '.', '</s>'), ('.',), ('.', '</s>'), ('.', '</s>', '</s>'), ('</s>',), ('</s>', '</s>'), ('</s>',)]
#list(vocab)
from nltk.lm import MLE

# BUG FIX: `train`/`vocab` from padded_everygram_pipeline are one-shot
# generators, and the inspection loop above already consumed `train`.
# Fitting on the exhausted generator produced a model with 0 ngrams
# (as the printed NgramCounter showed).  Rebuild the pipeline immediately
# before fitting so the model actually sees the training data.
train, vocab = padded_everygram_pipeline(3, tokenized_text)

lm1 = MLE(3)  # maximum-likelihood trigram language model
lm1.fit(train, vocab)
len(lm1.vocab)  # vocabulary size (corpus tokens plus <s>, </s>, <UNK>)
# output: 14
# Inspect the fitted NgramCounter (reports the ngram orders and total ngrams).
print(lm1.counts)
# output (symptom of fitting on an already-consumed generator — 0 ngrams): <NgramCounter with 1 ngram orders and 0 ngrams>
# Map the tokens of the second sentence through the model vocabulary;
# out-of-vocabulary tokens would be replaced by <UNK>.
print(lm1.vocab.lookup(tokenized_text[1]))
# output: ('NLP', 'is', 'an', 'area', 'of', 'computer', 'science', '.')
# Unigram probability of "the": it never occurs in the training text, so the
# MLE score is 0.  (Bare expression — result is only displayed in a notebook.)
lm1.score("the")
# output: 0
# P('processing' | 'Natural-language'): the context token 'Natural-language'
# was never seen in training, so the score is 0.
lm1.score('processing', 'Natural-language'.split())
# output: 0
# P('an' | 'area'): 'an' never follows 'area' in the corpus, so the score is 0.
lm1.score('an','area'.split())
# output: 0