import nltk
from nltk import word_tokenize, sent_tokenize
# Sample corpus: two short sentences used to build a toy trigram language model.
text = "Natural language processing is an area of computer science. NLP is an area of computer science."

# Use the tokenizers imported directly at the top of the file (consistent with
# the `from nltk import word_tokenize, sent_tokenize` line) rather than going
# back through the `nltk.` module namespace.
sents = sent_tokenize(text)
print("The number of sentences is", len(sents)) #prints the number of sentences

tokens = word_tokenize(text)
print("The number of tokens is", len(tokens)) #prints the number of tokens

unique_tokens = set(tokens) #set() Build an unordered collection of unique elements.
print("The number of unique tokens are", len(unique_tokens)) #prints the number of unique tokens
# output: The number of sentences is 2 The number of tokens is 18 The number of unique tokens are 11
# Tokenize each sentence individually: the LM preprocessing pipeline below
# expects a list of tokenized sentences (a list of lists of token strings).
# (The original append-loop had lost its body indentation in the paste; a
# comprehension expresses the same construction unambiguously.)
tokenized_text = [word_tokenize(sentence) for sentence in sents]
#tokenized_text
from nltk.lm.preprocessing import padded_everygram_pipeline

# Build the training stream (per-sentence everygrams of orders 1..3, padded
# with <s>/</s>) and the flattened vocabulary stream for a trigram model.
train, vocab = padded_everygram_pipeline(3, tokenized_text)

# NOTE: `train` is a one-shot generator of generators — iterating it here for
# inspection CONSUMES it, so a fresh pipeline must be built before fitting a
# model with it.
for ngramlize_sent in train:
    print(list(ngramlize_sent))
    print()
# output: [('<s>',), ('<s>', '<s>'), ('<s>', '<s>', 'Natural'), ('<s>',), ('<s>', 'Natural'), ('<s>', 'Natural', 'language'), ('Natural',), ('Natural', 'language'), ('Natural', 'language', 'processing'), ('language',), ('language', 'processing'), ('language', 'processing', 'is'), ('processing',), ('processing', 'is'), ('processing', 'is', 'an'), ('is',), ('is', 'an'), ('is', 'an', 'area'), ('an',), ('an', 'area'), ('an', 'area', 'of'), ('area',), ('area', 'of'), ('area', 'of', 'computer'), ('of',), ('of', 'computer'), ('of', 'computer', 'science'), ('computer',), ('computer', 'science'), ('computer', 'science', '.'), ('science',), ('science', '.'), ('science', '.', '</s>'), ('.',), ('.', '</s>'), ('.', '</s>', '</s>'), ('</s>',), ('</s>', '</s>'), ('</s>',)] [('<s>',), ('<s>', '<s>'), ('<s>', '<s>', 'NLP'), ('<s>',), ('<s>', 'NLP'), ('<s>', 'NLP', 'is'), ('NLP',), ('NLP', 'is'), ('NLP', 'is', 'an'), ('is',), ('is', 'an'), ('is', 'an', 'area'), ('an',), ('an', 'area'), ('an', 'area', 'of'), ('area',), ('area', 'of'), ('area', 'of', 'computer'), ('of',), ('of', 'computer'), ('of', 'computer', 'science'), ('computer',), ('computer', 'science'), ('computer', 'science', '.'), ('science',), ('science', '.'), ('science', '.', '</s>'), ('.',), ('.', '</s>'), ('.', '</s>', '</s>'), ('</s>',), ('</s>', '</s>'), ('</s>',)]
#list(vocab)
from nltk.lm import MLE

# BUG FIX: `train`/`vocab` from padded_everygram_pipeline are one-shot
# generators, and the inspection loop above already consumed `train`.
# Fitting on the exhausted generator produced a model with 0 ngrams
# (as the printed NgramCounter showed).  Rebuild the pipeline immediately
# before fitting so the model actually sees the training data.
train, vocab = padded_everygram_pipeline(3, tokenized_text)

lm1 = MLE(3)  # maximum-likelihood trigram language model
lm1.fit(train, vocab)
len(lm1.vocab)  # vocabulary size (corpus tokens plus <s>, </s>, <UNK>)
# output: 14
# Inspect the fitted NgramCounter (reports the ngram orders and total ngrams).
print(lm1.counts)
# output (symptom of fitting on an already-consumed generator — 0 ngrams): <NgramCounter with 1 ngram orders and 0 ngrams>
# Map the tokens of the second sentence through the model vocabulary;
# out-of-vocabulary tokens would be replaced by <UNK>.
print(lm1.vocab.lookup(tokenized_text[1]))
# output: ('NLP', 'is', 'an', 'area', 'of', 'computer', 'science', '.')
# Unigram probability of "the": it never occurs in the training text, so the
# MLE score is 0.  (Bare expression — result is only displayed in a notebook.)
lm1.score("the")
# output: 0
# P('processing' | 'Natural-language'): the context token 'Natural-language'
# was never seen in training, so the score is 0.
lm1.score('processing', 'Natural-language'.split())
# output: 0
# P('an' | 'area'): 'an' never follows 'area' in the corpus, so the score is 0.
lm1.score('an','area'.split())
# output: 0