# Assumed setup: the crude corpus ships with tm; tokenize() and tagPOS() come
# from the old (pre-0.2) openNLP API; create_matrix() and friends are from RTextTools
library(tm)
library(openNLP)
library(RTextTools)
data(crude)

# Tokenize the data and put it in a vector called tokens
tokens <- tokenize(crude[[2L]])
# Cleaning the tokenized data by removing all punctuation (since this will be our true set)
punct <- c(",", ".", "\\", "(", ")", "\"")
tokens <- tokens[!(tokens %in% punct)]
# Using openNLP to get a tagged set to be used as our TRUE SET
tagged <- tagPOS(tokens)
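# Note: tagPOS() was removed from openNLP in version 0.2. A minimal sketch of
# the replacement annotation API (assuming the default English Maxent models
# from openNLPdata are available):
# library(NLP)
# s <- as.String(paste(tokens, collapse = " "))
# a <- NLP::annotate(s, list(Maxent_Sent_Token_Annotator(),
#                            Maxent_Word_Token_Annotator(),
#                            Maxent_POS_Tag_Annotator()))
# words  <- subset(a, type == "word")
# tags   <- sapply(words$features, `[[`, "POS")
# tagged <- paste(s[words], tags, sep = "/")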
# Organising the data in a data frame holding both token and tag
# (avoids the hard-coded 447 and the fragile column-by-column loop)
testList <- strsplit(tagged, "/", fixed = TRUE)
test <- data.frame(
  token   = vapply(testList, `[`, character(1), 1L),
  PennTag = vapply(testList, `[`, character(1), 2L),
  stringsAsFactors = FALSE
)
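# Sanity check (assumption: no token itself contains "/"; a token such as
# "1/2" would split into more than two pieces and corrupt the frame)
stopifnot(all(lengths(testList) == 2L))
head(test)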
# Creating a document term matrix
docTermMat <- create_matrix(test$token, language = "english",
                            removeNumbers = TRUE, removeSparseTerms = 0.998)
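# Quick dimension check: one row per token is expected here, so that the
# train/test indices below line up with the tagged tokens
dim(docTermMat)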
# Assigning which part of the document term matrix is to be used for training the model
# and which part to test the accuracy of the trained model
# (the RTextTools function is create_container(), not create_corpus(), and the
# labels must come from test$PennTag, not from the document term matrix)
corpus1 <- create_container(docTermMat, as.numeric(factor(test$PennTag)),
                            trainSize = 1:150, testSize = 151:nrow(test),
                            virgin = FALSE)
# Training model using the Maximum Entropy Algorithm
MAXENT1 <- train_model(corpus1,"MAXENT")
# Tagging using the Maximum Entropy trained model
MAXENT_CLASSIFY1 <- classify_model(corpus1, MAXENT1)
# RStudio crashes at this point
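# If the crash can be avoided (e.g. by running the script from a plain R
# console rather than RStudio), accuracy could be summarised with RTextTools'
# analytics. A minimal sketch, assuming MAXENT_CLASSIFY1 was produced:
# analytics <- create_analytics(corpus1, MAXENT_CLASSIFY1)
# summary(analytics)
# head(analytics@document_summary)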