MEGAM not working for training a chunker

59 views

Skip to first unread message

tanlik...@gmail.com

unread,

Apr 8, 2016, 7:36:02 PM4/8/16

to nltk-users

Hello everybody!

I am new to this place, but I hope you guys can help! I want to train a chunker based on this corpus. Here is the code:

import nltk
from nltk.tree import *

# Load the corpus: one bracketed tree string per line (parsed later with
# Tree.fromstring). The context manager closes the file as soon as the list
# is built instead of leaving the handle open for the life of the script.
with open('4000qs.txt') as corpus_file:
    qtrees = [line.rstrip('\n') for line in corpus_file]

import subprocess


# Tell NLTK where the MEGAM executable lives; the 'megam' training algorithm
# used by the tagger below depends on it.
# NOTE(review): this path is relative, so it only resolves when the script is
# started from the directory that contains question-search/ -- confirm, and
# confirm the file is executable on this machine.
nltk.config_megam('question-search/megam_i686.opt')

class ConsecutiveNPChunkTagger(nltk.TaggerI):
    """Greedy left-to-right chunk tagger built on a maximum-entropy classifier.

    Each token is classified from the features returned by
    ``npchunk_features``; the chunk tags assigned so far in the sentence are
    passed along as ``history``.
    """

    def __init__(self, train_sents):
        """Train the underlying classifier.

        :param train_sents: iterable of sentences, each a sequence of
            ``((word, pos), chunk_tag)`` pairs.
        """
        train_set = []
        for tagged_sent in train_sents:
            untagged_sent = nltk.tag.untag(tagged_sent)
            history = []
            # Only the gold chunk tag is needed here; the token itself is
            # read from untagged_sent by the feature extractor.
            for i, (_, tag) in enumerate(tagged_sent):
                featureset = npchunk_features(untagged_sent, i, history)
                train_set.append((featureset, tag))
                history.append(tag)
        # algorithm='megam' relies on the external MEGAM optimizer, which must
        # have been located beforehand (see nltk.config_megam).
        self.classifier = nltk.MaxentClassifier.train(
            train_set, algorithm='megam', trace=0)

    def tag(self, sentence):
        """Assign a chunk tag to every token of ``sentence``.

        :param sentence: sequence of ``(word, pos)`` tuples.
        :return: list of ``((word, pos), chunk_tag)`` pairs, one per token.
        """
        history = []
        for i, _ in enumerate(sentence):
            featureset = npchunk_features(sentence, i, history)
            tag = self.classifier.classify(featureset)
            history.append(tag)
        # Materialise the pairs: on Python 3 a bare zip() is a one-shot
        # iterator, whereas TaggerI.tag is documented to return a list.
        return list(zip(sentence, history))

class ConsecutiveNPChunker(nltk.ChunkParserI):
    """Chunk parser that wraps a ``ConsecutiveNPChunkTagger``.

    Chunk trees are converted to CoNLL-style triples for training, and the
    tagger's output is converted back into a chunk tree when parsing.
    """

    def __init__(self, train_sents):
        """Train the wrapped tagger from an iterable of chunk trees."""
        training_data = []
        for chunk_tree in train_sents:
            triples = nltk.chunk.tree2conlltags(chunk_tree)
            # Regroup (word, pos, chunk) as ((word, pos), chunk) so the
            # (word, pos) pair plays the role of the "token" being tagged.
            training_data.append(
                [((token, pos), iob) for (token, pos, iob) in triples])
        self.tagger = ConsecutiveNPChunkTagger(training_data)

    def parse(self, sentence):
        """Chunk a POS-tagged sentence and return the resulting tree."""
        iob_triples = []
        for (token, pos), iob in self.tagger.tag(sentence):
            iob_triples.append((token, pos, iob))
        return nltk.chunk.conlltags2tree(iob_triples)


def npchunk_features(sentence, i, history):
    """Build the feature dict for the token at position ``i``.

    :param sentence: sequence of ``(word, pos)`` tuples.
    :param i: index of the token to describe.
    :param history: chunk tags assigned so far; accepted but not used by
        this feature set.
    :return: dict with the token's ``pos`` and ``word`` plus ``prevpos``,
        the POS tag of the preceding token (``"<START>"`` at index 0).
    """
    token, token_pos = sentence[i]
    if i != 0:
        _, before_pos = sentence[i - 1]
    else:
        before_pos = "<START>"
    return dict(pos=token_pos, word=token, prevpos=before_pos)


def flatten_childtrees(trees):
    """Flatten a list of subtrees into a single level of chunks and tokens.

    Subtrees of height below 3 contribute their ``pos()`` pairs directly,
    subtrees of height exactly 3 become one ``Tree`` carrying their label
    and ``pos()`` pairs, and taller subtrees are replaced by the flattened
    form of their children.

    :param trees: iterable of tree nodes.
    :return: list mixing ``(word, tag)`` pairs and height-2 ``Tree`` chunks.
    """
    flat = []
    for node in trees:
        depth = node.height()
        if depth > 3:
            # Too deep to be a chunk: discard this node, keep its children.
            flat += flatten_childtrees(list(node))
        elif depth == 3:
            flat.append(Tree(node.label(), node.pos()))
        else:
            flat += node.pos()
    return flat
def flatten_deeptree(tree):
    """Return a new ``Tree`` with ``tree``'s label and flattened children.

    The children are flattened with ``flatten_childtrees``.
    """
    root_label = tree.label()
    flattened = flatten_childtrees(list(tree))
    return Tree(root_label, flattened)

# Parse and flatten the question trees: lines 0-199 for training,
# lines 200-399 for testing.
train_sents = [flatten_deeptree(Tree.fromstring(i)) for i in qtrees[0:200]]
test_sents = [flatten_deeptree(Tree.fromstring(i)) for i in qtrees[200:400]]

# Single-argument print(...) calls behave the same on Python 2 and Python 3.
print(test_sents[0])

# Score a RegexpParser built from an empty grammar on the question trees.
cp = nltk.RegexpParser("")
print(cp.evaluate(test_sents))

# Same empty-grammar parser, scored on the CoNLL-2000 NP chunks.
from nltk.corpus import conll2000
cp = nltk.RegexpParser("")
test_sents = conll2000.chunked_sents('test.txt', chunk_types=['NP'])
print(test_sents)
print(cp.evaluate(test_sents))

# NOTE: test_sents was rebound above, so this chunker is trained on the
# question trees but evaluated on the CoNLL-2000 test set.
chunker = ConsecutiveNPChunker(train_sents)
print(chunker.evaluate(test_sents))

I have also trained on conll2000 to make sure the chunker is working. But it isn't! I get the following error:

OSError: [Errno 2] No such file or directory

I tried to add this line of code but it gave a Permission Denied error:

subprocess.call(["/projects/72d28544-74a5-4ef0-8a93-b838af66bdc8/.local/lib/python2.7/site-packages/nltk/classify/maxent.py","/projects/72d28544-74a5-4ef0-8a93-b838af66bdc8/.local/lib/python2.7/site-packages/nltk/classify/megam.py","question-search/megam_i686.opt"])