--
You received this message because you are subscribed to the Google Groups "nltk-users" group.
To unsubscribe from this group and stop receiving emails from it, send an email to nltk-users+...@googlegroups.com.
For more options, visit https://groups.google.com/d/optout.
def load_corpus():
    """Read the tagged-sentence dump from disk and return its full contents.

    NOTE(review): despite the name, this returns a single raw ``str`` (the
    whole file), not a list of tagged sentences — code that iterates the
    result will see individual characters. Confirm against the caller.
    """
    with open('testout_data/123.txt', 'r') as corpus_file:
        return corpus_file.read()
def chunk_sents(tagged_sents, pos_pattern):
    """Chunk every tagged sentence with *pos_pattern* and count the chunks.

    Bug fixed: the original created ``chunk_freq_dict`` and returned it but
    never put anything in it, so callers always received an empty dict; the
    ``itertools.chain`` call merely flattened tree nodes for a debug print.

    Parameters
    ----------
    tagged_sents : iterable of list[(str, str)]
        Sentences as lists of (token, POS-tag) pairs.
    pos_pattern : str
        Grammar string for ``nltk.RegexpParser`` (e.g. an NP rule).

    Returns
    -------
    collections.defaultdict(int)
        Frequency of each matched chunk, keyed by the space-joined tokens.
    """
    chunk_freq_dict = defaultdict(int)
    chunker = nltk.RegexpParser(pos_pattern)
    for tagged_sent in tagged_sents:
        tree = chunker.parse(tagged_sent)
        # Count only labelled chunk subtrees; skip the sentence root itself.
        for subtree in tree.subtrees(lambda t: t.label() != tree.label()):
            phrase = ' '.join(word for word, _tag in subtree.leaves())
            chunk_freq_dict[phrase] += 1
    return chunk_freq_dict
def main(domain_corpus, pos_pattern):
    """Run the extraction pipeline and return the chunk frequencies.

    STEP 1: the corpus is already in sentence form, so it is chunked
    directly against *pos_pattern*.
    """
    # Extract matching patterns
    return chunk_sents(domain_corpus, pos_pattern)
if __name__ == '__main__':
    # NP = zero or more noun/adjective tags followed by a noun head.
    # Bug fixed: the corpus tags are upper-case ('ADJ', 'NN', ... — see the
    # sample output below), so the lower-case 'adj' alternative could never
    # match; it must be 'ADJ'.
    PATTERN = r"""
    NP: {<NN.*|ADJ>*<NN.*>}
    """
    domain_corpus = load_corpus()
    candidates = main(domain_corpus, PATTERN)
domain_sents: [['فوٹو', 'PN'], ['گرافر', 'PN'], ['انتونیو', 'PN'], ['ریپیسی', 'PN'], ['نے', 'P'], ['سنہ', 'NN'], ['2011', 'CA'], ['میں', 'P'], ['ری', 'ADJ'], ['سائیکل', 'NN'], ['ہونے', 'VB'], ['والی', 'WALA'], ['ذاتی', 'ADJ'], ['استعمال', 'NN'], ['کی', 'P'], ['اشیا', 'NN'], ['کا', 'P'], ['کچرا', 'NN'], ['جمع', 'ADJ'], ['کرنا', 'VB'], ['شروع', 'NN'], ['کیا', 'VB'], ['اور', 'CC'], ['چار', 'CA'], ['سال', 'NN'], ['بعد', 'NN'], ['ان', 'PP'], ['کی', 'P'], ['مدد', 'NN'], ['سے', 'SE'], ['طاقتور', 'ADJ'], ['تصاویر', 'NN'], ['کی', 'P'], ['ایک', 'CA'], ['سیریز', 'NN'], ['بنائی', 'VB'], ['جس', 'REP'], ['کی', 'P'], ['مدد', 'NN'], ['سے', 'SE'], ['انھوں', 'NN'], ['کے', 'P'], ['لوگوں', 'NN'], ['کے', 'P'], ['بحیثیت', 'ADV'], ['صارف', 'NN'], ['خیالات', 'NN'], ['بدلنے', 'VB'], ['کی', 'P'], ['کوشش', 'NN'], ['کی', 'P'], ['ہے', 'VB']]
type(domain_sents): <class 'str'>
def load_corpus():
    """Load every tagged sentence from the ``../out1_data`` corpus directory.

    Bugs fixed: the pasted original fused ``tagged_sents = sent`` with the
    ``return`` on one line (a SyntaxError), and even unfused it would have
    returned only the *last* sentence of the loop. This version returns the
    reader's full sequence of tagged sentences, each a list of
    (token, tag) pairs.
    """
    corpus_root = os.path.abspath('../out1_data')
    mycorpus = nltk.corpus.reader.TaggedCorpusReader(corpus_root, '.*')
    tagged_sents = mycorpus.tagged_sents()
    for sent in tagged_sents:
        print(sent)  # debug: echo each sentence as it is read
    return tagged_sents
def chunk_sents(tagged_sents, pos_pattern):
    """Parse each tagged sentence with *pos_pattern*; return chunk frequencies.

    Bug fixed: the original built and printed ``chunked`` but had no
    ``return``, so the caller (which assigns and returns the result) always
    received ``None``; ``chunk_freq_dict`` was created but never filled.

    Returns
    -------
    collections.defaultdict(int)
        Frequency of each matched chunk, keyed by the space-joined tokens.
    """
    chunk_freq_dict = defaultdict(int)
    chunker = nltk.RegexpParser(pos_pattern)
    chunked = [chunker.parse(sent) for sent in tagged_sents]
    print(chunked)  # debug output kept from the original
    for tree in chunked:
        # Count only labelled chunk subtrees; skip the sentence root itself.
        for subtree in tree.subtrees(lambda t: t.label() != tree.label()):
            phrase = ' '.join(word for word, _tag in subtree.leaves())
            chunk_freq_dict[phrase] += 1
    return chunk_freq_dict
def main(domain_corpus, pos_pattern):
    """Chunk *domain_corpus* with *pos_pattern* and return the frequencies.

    Bug fixed: in the pasted original the assignment was split across two
    lines by the email line-wrap (``domain_sents =`` / ``domain_corpus``),
    which is a SyntaxError; it is rejoined into one statement here.
    """
    # STEP 1: the corpus already arrives as tagged sentences.
    domain_sents = domain_corpus
    # print("domain_sents:", domain_sents)
    # print("type(domain_sents):", type(domain_sents))
    # Extract matching patterns
    chunks_freqs = chunk_sents(domain_sents, pos_pattern)
    return chunks_freqs
if __name__ == '__main__':
    # Bug fixed: the email wrap split the raw-string literal into a bare
    # name ``r`` (NameError) followed by a dangling triple-quoted string;
    # the prefix and the literal are rejoined here.
    # NP = zero or more noun/adjective tags followed by a noun head.
    PATTERN = r"""
    NP: {<NN.*|ADJ>*<NN.*>}
    """
    domain_corpus = load_corpus()
    candidates = main(domain_corpus, PATTERN)
def load_corpus():
    """Return all tagged sentences from files under ``../out1_data``.

    Bug fixed: the fileid pattern's closing quote was a typographic
    apostrophe (U+2019, ``'.*’``) — a SyntaxError — and is replaced with a
    plain ASCII quote.
    """
    corpus_root = os.path.abspath('../out1_data')
    mycorpus = nltk.corpus.reader.TaggedCorpusReader(corpus_root, '.*')
    return mycorpus.tagged_sents()