I can't seem to get 'tag_sents' to work in the expected manner. If I segment to sentences and then tag the words using 'tag', I don't have any issues. But each time I try to do "batches" of sentences, the tagger splits up the input sentences into single letters which are then tagged. I've tried to track down the issue using print statements and everything appears to work as expected right up until it enters the Stanford POS black box. Any clues?
I've included my working code below. I want to adapt it to use 'pos.tag_sents' instead of 'pos.tag'. Don't be too harsh on it — I'm new to Python and my C days are well behind me. Let me know if there's a better or more efficient way of coding anything.
# includes
import nltk
import textmining
from nltk.tokenize import sent_tokenize
from nltk.tag.stanford import StanfordPOSTagger
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet
import os
# setJavaPath — the Stanford tagger shells out to Java; NLTK locates it via JAVAHOME
java_path = "<path here>"
os.environ['JAVAHOME'] = java_path
# readFiles — use a context manager so the file handle is closed
# (the original bare open('corpus.txt').read() leaked it)
with open('corpus.txt') as corpus_file:
    corpusText = corpus_file.read()
# tokenizer — split the raw corpus into a list of sentence strings
tokenized = sent_tokenize(corpusText)
# stanfordPOS — model + jar paths for the 2016-10-31 full distribution
stanford_dir = '<path here>/Python/stanford-postagger-full-2016-10-31/'
modelfile = stanford_dir + 'models/english-left3words-distsim.tagger'
jarfile = stanford_dir + 'stanford-postagger.jar'
pos = StanfordPOSTagger(model_filename=modelfile, path_to_jar=jarfile)
# wnLemmatizer
wn_lemmatizer = WordNetLemmatizer()
# tdmMatrix — accumulates one document per processed sentence
termDocuMatrix = textmining.TermDocumentMatrix()
# globalVariables — index of the next sentence to process
indexGlobal = 0
def processContent(indexGlobal):  # processFunction
    """Tokenize and POS-tag the single sentence at tokenized[indexGlobal].

    Returns a list of (word, tag) tuples, or None when indexGlobal is past
    the end of the corpus (or tagging failed) — the caller's while-loop
    relies on that None to terminate.

    NOTE(review): the original looped over tokenized[indexGlobal:] but
    unconditionally returned inside the first iteration, so only one
    sentence was ever tagged per call; the slice-loop is dropped here.
    To batch with tag_sents instead, pass a list of TOKEN LISTS, e.g.
    pos.tag_sents([nltk.word_tokenize(s) for s in tokenized]) — passing
    raw sentence strings makes the tagger iterate them character by
    character, which is the single-letter bug described above.
    """
    try:
        if indexGlobal >= len(tokenized):
            return None  # corpus exhausted
        sentence = tokenized[indexGlobal]
        print(sentence)
        words = nltk.word_tokenize(sentence)
        return pos.tag(words)
    except Exception as e:
        # best-effort: report and return None so the caller stops cleanly
        print(str(e))
def getWordnetPOS(popPOS):  # retagFunction
    """Map a Penn Treebank tag prefix to the matching WordNet POS constant.

    J* -> adjective, V* -> verb, N* -> noun, R* -> adverb; anything else
    yields '' (no WordNet equivalent).
    """
    prefix_to_wn = (
        ('J', wordnet.ADJ),
        ('V', wordnet.VERB),
        ('N', wordnet.NOUN),
        ('R', wordnet.ADV),
    )
    for prefix, wn_pos in prefix_to_wn:
        if popPOS.startswith(prefix):
            return wn_pos
    return ''
# Main pass: one sentence per iteration until the corpus runs out.
while True:
    try:
        taggedWords = processContent(indexGlobal)
        if not taggedWords:
            break  # processContent returned None/empty: end of corpus (or it failed)
        # keep only nouns; the adjective tags stay deliberately disabled
        nouns = [tag for tag in taggedWords if tag[1] in ('NN', 'NNP', 'NNS')]
        # or tag[1] in ('JJ', 'JJR', 'JJS')
        # Lemmatize each noun in sentence order. The original pop()/insert(0)
        # pair reversed the list twice, which is just a forward walk.
        bitSentence = [wn_lemmatizer.lemmatize(word.lower(), getWordnetPOS(pos_tag))
                       for word, pos_tag in nouns]
        strSentence = ' '.join(bitSentence)
        termDocuMatrix.add_doc(strSentence)
        indexGlobal += 1
    except Exception as e:
        # was a bare `except: break` that silently hid every error,
        # including real bugs — at least say why we stopped
        print('stopping: ' + str(e))
        break
termDocuMatrix.append_csv("doc_termmatrix.csv", cutoff=1)
for row in termDocuMatrix.rows(cutoff=1):
    print(row)