[hltdi-l3] push by onlysk...@gmail.com - Hiiktuu: learn.py: corpora from L3Morpho-analyzed documents; MWE patte... on 2014-07-11 19:47 GMT

0 views
Skip to first unread message

hltd...@googlecode.com

unread,
Jul 11, 2014, 3:47:22 PM7/11/14
to hltdi-...@googlegroups.com
Revision: 53eb28521f58
Branch: default
Author: Michael Gasser <gas...@cs.indiana.edu>
Date: Fri Jul 11 19:47:01 2014 UTC
Log: Hiiktuu: learn.py: corpora from L3Morpho-analyzed documents; MWE
patterns
http://code.google.com/p/hltdi-l3/source/detail?r=53eb28521f58

Added:
/hiiktuu/learn.py
/hiiktuu/utils.py
Modified:
/hiiktuu.py
/hiiktuu/__init__.py
/hiiktuu/features.py
/l3xdg/languages/es/n_chunk.inst

=======================================
--- /dev/null
+++ /hiiktuu/learn.py Fri Jul 11 19:47:01 2014 UTC
@@ -0,0 +1,293 @@
+#
+# Learning Hiiktuu groups
+#
+########################################################################
+#
+# This file is part of the HLTDI L^3 project
+# for parsing, generation, translation, and computer-assisted
+# human translation.
+#
+# Copyright (C) 2014, HLTDI <gas...@cs.indiana.edu>
+#
+# This program is free software: you can redistribute it and/or
+# modify it under the terms of the GNU General Public License as
+# published by the Free Software Foundation, either version 3 of
+# the License, or (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program. If not, see <http://www.gnu.org/licenses/>.
+#
+# =========================================================================
+
+# 2014.07.04
+# -- Created
+# Possible classes:
+# Pattern -- head (root or POS), dependent (root or POS);
+# relative position; gaps
+# Corpus -- list (dict?) of sentences, consisting of lists
+# of word strings or word representations
+
+# Need this to parse and interpret features
+
+from .features import *
+from .utils import *
+
class Corpus(list):
    """A list of sentences, each a tuple of words.

    A word is either a plain string (the wordform) or a tuple
    (form, root1, feats1, root2, feats2, ...) pairing the form with one
    or more morphological analyses produced by L3Morpho."""

    def __init__(self, name=''):
        list.__init__(self)
        self.name = name or 'anon'
        # Cache analysis-string -> Features mappings to save time when
        # reading in a corpus: many words share the same analysis string.
        self.feat_cache = {}

    def __repr__(self):
        return "C~~{}".format(self.name)

    def read(self, file, lines=0, expand=True):
        """Extend the corpus with sentences read from a file.

        file is a sentence-per-line file with words analyzed by L3Morpho
        anal_file() with minim=True; an analyzed word looks like
        form;root1:pos1:fs1|root2:pos2:fs2|...
        If lines is non-zero, read at most that many lines.
        If expand is True, expand word analyses into
        (form, root, Features, ...) tuples."""
        with open(file, encoding='utf8') as f:
            n = 0
            for line in f:
                if lines and n >= lines:
                    return
                n += 1
                if n % 50000 == 0:
                    print("Read {} lines".format(n))
                words = line.split()
                if expand:
                    for i, word in enumerate(words):
                        if ';' in word:
                            # There is an analysis of the word.  Split on
                            # the first ';' only, so a stray ';' inside the
                            # analyses cannot raise ValueError.
                            form, analyses = word.split(';', 1)
                            w = [form]
                            for analysis in analyses.split('|'):
                                anal_attribs = analysis.split(':')
                                root = anal_attribs[0]
                                feats = False
                                if root == '*':
                                    # Root is same as wordform, so don't record
                                    root = False
                                if len(anal_attribs) > 2:
                                    # POS and additional grammatical features.
                                    # Key the cache on (pos, fs): keying on fs
                                    # alone would let words with the same
                                    # feature string but different POS share
                                    # the first-seen POS.
                                    pos = anal_attribs[1]
                                    fs = anal_attribs[2]
                                    key = (pos, fs)
                                    if key in self.feat_cache:
                                        feats = self.feat_cache[key]
                                    else:
                                        feats = Features.from_string(fs)
                                        feats['p'] = pos
                                        self.feat_cache[key] = feats
                                elif len(anal_attribs) == 2:
                                    # POS but no additional grammatical
                                    # constraints
                                    pos = anal_attribs[1]
                                    if pos in self.feat_cache:
                                        feats = self.feat_cache[pos]
                                    else:
                                        feats = Features({'p': pos})
                                        self.feat_cache[pos] = feats
                                w.extend([root, feats])
                            words[i] = tuple(w)
                self.append(tuple(words))

    @staticmethod
    def get_root(word, word_tup):
        """Return the root from analysis pair word_tup, falling back to the
        wordform word when no root was recorded.  Return None when word_tup
        is a plain string (an unanalyzed word)."""
        if isinstance(word_tup, str):
            return None
        root = word_tup[0]
        if not root:
            return word
        return root

    @staticmethod
    def get_pos(word, word_tup):
        """Return the POS (the 'p' feature) from analysis pair word_tup,
        or False when there is no grammar or no POS recorded."""
        gram = word_tup[1]
        if gram:
            return gram.get('p', False)
        return False

    @staticmethod
    def get_gram(word, word_tup):
        """Return the grammatical Features object (or False) from analysis
        pair word_tup."""
        return word_tup[1]

    @staticmethod
    def get_form_anals(word):
        """Split a corpus word into (form, analyses).

        word is a string (form) or a tuple consisting of the form followed
        by unsegmented pairs representing analyses.  Return the wordform
        and a list of [root, features] pairs; a False root is replaced by
        the wordform itself."""
        if isinstance(word, str):
            return word, ()
        else:
            form = word[0]
            anals = [list(word[i+1:i+3]) for i in range(0, len(word)-1, 2)]
            # Replace False root with form
            for index, (root, gram) in enumerate(anals):
                if not root:
                    anals[index][0] = form
            return form, anals

    def count_roots(self, roots, sort=True):
        """Count occurrences of roots in the corpus.

        roots is a root string or set of root strings (or None for any).
        Return a dict of root->count, or, if sort is True, a list of
        (root, count) pairs sorted by decreasing frequency."""
        d = {}
        # Constrain only the root, not the wordform or the grammar.
        constraint = (None, (roots, None))
        for sent in self:
            for w in sent:
                match = Pattern.match_item(w, constraint)
                if match:
                    # match is a (root, feats) analysis pair.
                    root = match[0]
                    d[root] = d.get(root, 0) + 1
        if sort:
            # Sort by decreasing frequency
            l = list(d.items())
            l.sort(key=lambda x: x[1], reverse=True)
            return l
        else:
            return d

    def sents(self, constraints=(None, None)):
        """Find all sentences containing a word matching constraints.

        constraints is (forms, (roots, feats)):
          forms is None or a set of wordforms;
          roots is None or a string or a set of root strings;
          feats is None or a list/tuple of feat-val constraint tuples.
        Return a list of (sentence index, [matching word indices]) pairs."""
        result = []
        for sindex, sent in enumerate(self):
            indices = [index for index, w in enumerate(sent)
                       if Pattern.match_item(w, constraints)]
            if indices:
                result.append((sindex, indices))
        return result
+
class Pattern(list):
    """A list of items to look for in sentences.

    Each list element is a pair:
    ({set of word forms}, ({set of roots}, (tuple of grammatical constraints)))
    Any of the components may be None."""

    def __init__(self, lst):
        list.__init__(self, lst)
        self.complete()

    def __repr__(self):
        return "&{}".format(list.__repr__(self))

    def complete(self):
        """Expand shorthand items into full (forms, (roots, grams)) pairs."""
        for index, item in enumerate(self):
            if isinstance(item, str):
                # A single form
                self[index] = ({item}, None)
            elif isinstance(item, set):
                # A set of forms
                self[index] = (item, None)
            elif isinstance(item, tuple):
                if isinstance(item[1], str):
                    # A single root string
                    self[index] = (None, ({item[1]}, None))
                elif isinstance(item[1], set):
                    # A set of roots
                    self[index] = (None, (item[1], None))

    @staticmethod
    def match_item(s_word, constraints=(None, None), gap=0):
        """Does a word from a sentence match the constraints?

        constraints is (forms, (roots, feats)).  If forms is given, only
        the wordform is checked; otherwise each of the word's analyses is
        checked against the root/feature constraints.
        Return the matching form or (root, gram) pair, or None.
        (gap is currently unused.)"""
        s_form, s_anals = Corpus.get_form_anals(s_word)
        forms, rg = constraints
        if forms:
            if s_form in forms:
                return s_form
        else:
            for s_rg in s_anals:
                if Pattern.match_constraints(s_rg, rg):
                    return s_rg
        return None

    @staticmethod
    def match_constraints(w_rg, p_rg):
        """Does a pair of word properties (root, grammatical features/POS)
        match the corresponding constraints (if any) from a pattern?
        Pattern root constraint is a string or a set of root strings."""
        roots, grams = zip(w_rg, p_rg)
        wr, pr = roots
        if isinstance(pr, str):
            pr = {pr}
        if pr and wr not in pr:
            # Either there is no pattern root constraint or the
            # word's root must be in the constraint set
            return False
        wg, pg = grams
        if pg and wg and not wg.match_list(pg):
            # Either there is no list of feature pair constraints in
            # the pattern or the word has no grammatical FS or the
            # word's FS must match the feature pair list
            return False
        return True

    def match(self, sentence, verbose=True):
        """Does the Pattern match a contiguous sequence in the sentence?

        If so, return the boundary indices (start, end) of the matching
        words within the sentence; end is the last index + 1.  Otherwise
        return False.  No gaps are allowed within a match."""
        p_index = 0
        s_start = 0
        for s_index, word in enumerate(sentence):
            p_item = self[p_index]
            if verbose:
                print("Matching pattern item {} against sentence item #{}: {}".format(p_item, s_index, word))
            if Pattern.match_item(word, p_item):
                if verbose:
                    print("{} matches pattern element {}: {}".format(word, p_index, p_item))
                if p_index == 0:
                    s_start = s_index
                p_index += 1
                if p_index == len(self):
                    # Done with pattern; return bounds (end is last index + 1)
                    return s_start, s_index + 1
            elif p_index > 0:
                # A partial match just failed.  Restart, but re-test the
                # current word as a possible start of a new match; the
                # previous version skipped it, missing e.g. pattern
                # [a, b] within sentence [a, a, b].
                p_index = 0
                if Pattern.match_item(word, self[0]):
                    s_start = s_index
                    p_index = 1
                    if p_index == len(self):
                        return s_start, s_index + 1
        return False

    def search(self, corpus, verbose=False):
        """Search a corpus for instances of this pattern.

        Return a list of (start, end) bounds, one per matching sentence.
        NOTE: the sentence's own index is not recorded, only the word
        bounds within each matching sentence."""
        result = []
        for sentence in corpus:
            matched = self.match(sentence, verbose=verbose)
            if matched:
                result.append(matched)
        return result
=======================================
--- /dev/null
+++ /hiiktuu/utils.py Fri Jul 11 19:47:01 2014 UTC
@@ -0,0 +1,81 @@
+#
+# Hiiktuu UI: initial attempt at a user interface for creating languages
+#
+########################################################################
+#
+# This file is part of the HLTDI L^3 project
+# for parsing, generation, translation, and computer-assisted
+# human translation.
+#
+# Copyright (C) 2014, HLTDI <gas...@cs.indiana.edu>
+#
+# This program is free software: you can redistribute it and/or
+# modify it under the terms of the GNU General Public License as
+# published by the Free Software Foundation, either version 3 of
+# the License, or (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program. If not, see <http://www.gnu.org/licenses/>.
+#
+# =========================================================================
+
+# 2014.07.08
+# -- Created
+
+from __future__ import print_function
+from sys import getsizeof, stderr
+from itertools import chain
+from collections import deque
+try:
+ from reprlib import repr
+except ImportError:
+ pass
+
+### Measure the size of an object (recursively)
+
def total_size(o, handlers={}, verbose=False):
    """ Returns the approximate memory footprint an object and all of its
    contents.

    Automatically finds the contents of the following builtin containers and
    their subclasses: tuple, list, deque, dict, set and frozenset.
    To search other containers, add handlers to iterate over their contents:

        handlers = {SomeContainerClass: iter,
                    OtherContainerClass: OtherContainerClass.get_elements}
    Example call
        d = dict(a=1, b=2, c=3, d=[4,5,6,7], e='a string of chars')
        print(total_size(d, verbose=True))
    """
    # Container type -> function yielding its contents.  A dict yields
    # both its keys and its values.
    all_handlers = {tuple: iter,
                    list: iter,
                    deque: iter,
                    dict: lambda d: chain.from_iterable(d.items()),
                    set: iter,
                    frozenset: iter,
                    }
    all_handlers.update(handlers)    # user handlers take precedence
    # Ids already visited, so shared objects are counted only once.
    seen = set()
    # Fallback size for objects that don't implement __sizeof__.
    default_size = getsizeof(0)

    def sizeof(obj):
        if id(obj) in seen:
            # Already counted this object; don't count it twice.
            return 0
        seen.add(id(obj))
        size = getsizeof(obj, default_size)
        if verbose:
            print(size, type(obj), repr(obj), file=stderr)
        # Recurse into the first handler whose type matches.
        for container_type, handler in all_handlers.items():
            if isinstance(obj, container_type):
                size += sum(map(sizeof, handler(obj)))
                break
        return size

    return sizeof(o)
=======================================
--- /hiiktuu.py Tue May 20 06:19:15 2014 UTC
+++ /hiiktuu.py Fri Jul 11 19:47:01 2014 UTC
@@ -36,6 +36,14 @@
#import cProfile
#import pstats

def europarl_corpus(corpus=None, suffix='a', lines=0):
    """Load sentences from an L3Morpho-analyzed Europarl file.

    Extend corpus with up to lines sentences (0 = no limit) from the
    file selected by suffix; create a new hiiktuu.Corpus when corpus is
    None or empty.  Return the corpus."""
    if not corpus:
        # An empty Corpus (a list subclass) is falsy, so it is replaced
        # just like None, matching the original `corpus or ...` idiom.
        corpus = hiiktuu.Corpus('ep')
    path = "../../LingData/Es/Europarl/es-en/es-ep7" + suffix + ".anl"
    corpus.read(path, lines=lines)
    return corpus
+
def monton():
    """Build a Pattern for 'montón de' followed by any noun (POS 'n')."""
    # Third item: any form, any root, POS constrained to 'n'.
    any_noun = (None, (None, {('p', 'n')}))
    return hiiktuu.Pattern(['montón', 'de', any_noun])
+
def test(verbosity=0):
piece_of_mind_parse_ung(verbosity=verbosity)
piece_of_mind_trans(verbosity=verbosity)
=======================================
--- /hiiktuu/__init__.py Tue May 6 07:09:41 2014 UTC
+++ /hiiktuu/__init__.py Fri Jul 11 19:47:01 2014 UTC
@@ -1,5 +1,6 @@
"""Hiiktuu: do-it-yourself L3. Create simple bilingual lexicons and
grammars for language pairs."""

-__all__ =
['language', 'entry', 'ui', 'constraint', 'variable', 'sentence', 'features', 'cs']
+__all__ =
['language', 'entry', 'ui', 'constraint', 'variable', 'sentence', 'features', 'cs', 'learn', 'utils']

from .sentence import *
+from .learn import *
=======================================
--- /hiiktuu/features.py Mon May 19 06:57:46 2014 UTC
+++ /hiiktuu/features.py Fri Jul 11 19:47:01 2014 UTC
@@ -36,6 +36,8 @@
# -- mutual_agree() makes two Features agree with one another on
# feature pairs.

+import re
+
class Features(dict):

def __init__(self, dct):
@@ -53,6 +55,13 @@
l.sort()
return l

+ @staticmethod
+ def from_string(string):
+ """Convert a string representation of a FeatStruct (from L3Morpho)
to
+ a Features object."""
+ d = PARSER.parse(string)
+ return Features(d)
+
@staticmethod
def unify_sets(x, y):
"""If both are sets, their intersection. If one is a set,
@@ -164,7 +173,7 @@

@staticmethod
def n_agree(feats1, feats2, agrs):
- """Return could of feats1 objects that agree with some feats2
objects and feats2
+ """Return feats1 objects that agree with some feats2 objects and
feats2
objects that agree with some feats1 objects."""
f1agr = 0
f2agr = 0
@@ -197,7 +206,7 @@

@staticmethod
def agree_with_none2(feats1, feats2, agrs):
- """Return all Features objects in feats1 that fail to agree with
any objects in feats1
+ """Return all Features objects in feats1 that fail to agree with
any objects in feats2
on agrs features."""
failures = []
for feat2 in feats2:
@@ -214,7 +223,13 @@
"""Does this Features object match list or tuple of feature/value
pairs?"""
for feat, val in feat_list:
if feat in self:
- if Features.simple_unify(val, self[feat]) == 'fail':
+ selfval = self[feat]
+ # val could be a set, in which case selfval has to unify
+ # with some element in val
+ if isinstance(val, set):
+ if all([Features.simple_unify(v, selfval) == 'fail'
for v in val]):
+ return False
+ elif Features.simple_unify(val, selfval) == 'fail':
return False
return True

@@ -230,3 +245,171 @@
return 'fail'
return result

class DictStringParser:
    """Parse the bracketed string representation of a feature structure
    (as produced by L3Morpho) into a plain dict.

    Adapted (simplified) from NLTK's feature-structure parser; the
    reentrance-target, slash-feature and quoted-string machinery is
    present but effectively disabled here."""

    def __init__(self):
        # Registry of *special* feature names (none registered here).
        self._features = {}
        # Class instantiated for each parsed structure.
        self._class = dict
        # These stay None/empty in this simplified parser, which disables
        # the prefix/slash code paths below.
        self._prefix_feature = None
        self._slash_feature = None
        self._features_with_defaults = []

    def parse(self, s, fstruct=None):
        """Parse the entire string s into a feature structure; raise
        ValueError if anything remains after a complete structure."""
        s = s.strip()
        value, position = self.partial_parse(s, 0, fstruct)
        if position != len(s):
            self._error(s, 'end of string', position)
        return value

    # Regexps for the pieces of a feature-structure string.
    _START_FSTRUCT_RE = re.compile(r'\s*(?:\((\d+)\)\s*)?(\??[\w-]+)?(\[)')
    _END_FSTRUCT_RE = re.compile(r'\s*]\s*')
    _FEATURE_NAME_RE = re.compile(r'\s*([+-]?)([^\s\(\)"\'\-=\[\],]+)\s*')
    _TARGET_RE = re.compile(r'\s*\((\d+)\)\s*')
    _ASSIGN_RE = re.compile(r'\s*=\s*')
    _COMMA_RE = re.compile(r'\s*,\s*')
    _BARE_PREFIX_RE = re.compile(r'\s*(?:\((\d+)\)\s*)?(\??[\w-]+\s*)()')

    def partial_parse(self, s, position=0, fstruct=None):
        """Parse one feature structure starting at position and return
        (value, end position), converting internal two-argument
        ValueErrors into readable error messages."""
        try:
            return self._partial_parse(s, position, fstruct)
        except ValueError as e:
            if len(e.args) != 2: raise
            self._error(s, *e.args)

    def _partial_parse(self, s, position, fstruct=None):
        """Recursive-descent workhorse for partial_parse(); raises
        ValueError(expected, position) on malformed input."""
        # Create the new feature structure
        if fstruct is None:
            fstruct = self._class()
        else:
            fstruct.clear()

        # Read up to the open bracket.
        match = self._START_FSTRUCT_RE.match(s, position)
        if not match:
            match = self._BARE_PREFIX_RE.match(s, position)
            if not match:
                raise ValueError('open bracket or identifier', position)
        position = match.end()

        # If there is an identifier, record it.
        # NOTE(review): identifier is recorded but never used in this
        # simplified parser (reentrance is not supported).
        if match.group(1):
            identifier = match.group(1)

        # If there was a prefix feature, record it.
        # NOTE(review): _prefix_feature is always None here, so a prefix
        # always raises; the Variable branch below is unreachable (and
        # Variable is not defined in this module).
        if match.group(2):
            if self._prefix_feature is None:
                raise ValueError('open bracket or identifier', match.start(2))
            prefixval = match.group(2).strip()
            if prefixval.startswith('?'):
                prefixval = Variable(prefixval)
            fstruct[self._prefix_feature] = prefixval

        # If group 3 is empty, then we just have a bare prefix, so
        # we're done.
        if not match.group(3):
            return fstruct, match.end()

        # Build a list of the features defined by the structure.
        # Each feature has one of the three following forms:
        #     name = value
        #     name -> (target)
        #     +name
        #     -name
        while position < len(s):
            # Use these variables to hold info about each feature:
            name = target = value = None

            # Check for the close bracket.
            match = self._END_FSTRUCT_RE.match(s, position)
            if match is not None:
                # NOTE(review): returns position + 1 rather than
                # match.end(), so whitespace before ']' is not consumed;
                # parse() would then report a premature end-of-string.
                return fstruct, position + 1

            # Get the feature name's name
            match = self._FEATURE_NAME_RE.match(s, position)
            if match is None: raise ValueError('feature name', position)
            name = match.group(2)
            position = match.end()

            # Check if it's a special feature.
            if name[0] == '*' and name[-1] == '*':
                name = self._features.get(name[1:-1])
                if name is None:
                    raise ValueError('known special feature', match.start(2))

            # Check if this feature has a value already.
            if name in fstruct:
                raise ValueError('new name', match.start(2))

            # Boolean value ("+name" or "-name")
            if match.group(1) == '+': value = True
            if match.group(1) == '-': value = False

            # Assignment ("= value").
            if value is None:
                match = self._ASSIGN_RE.match(s, position)
                if match:
                    position = match.end()
                    value, position = (self.parse_value(s, position))
                # None of the above: error.
                else:
                    raise ValueError('equals sign', position)

            # Store the value.
            fstruct[name] = value

            # If there's a close bracket, handle it at the top of the loop.
            if self._END_FSTRUCT_RE.match(s, position):
                continue

            # Otherwise, there should be a comma
            match = self._COMMA_RE.match(s, position)
            if match is None: raise ValueError('comma', position)
            position = match.end()

        # We never saw a close bracket.
        raise ValueError('close bracket', position)

    def parse_value(self, s, position):
        """Parse a feature value starting at position by dispatching on
        the first matching regexp in VALUE_HANDLERS; return
        (value, end position)."""
        for (handler, regexp) in self.VALUE_HANDLERS:
            match = regexp.match(s, position)
            if match:
                handler_func = getattr(self, handler)
                return handler_func(s, position, match)
        raise ValueError('value', position)

    def _error(self, s, expected, position):
        """Raise ValueError with a caret pointing at the parse error."""
        estr = ('Error parsing feature structure\n    ' +
                s + '\n    ' + ' '*position + '^ ' +
                'Expected %s' % expected)
        raise ValueError(estr)

    # (handler method name, regexp) pairs tried in order by parse_value().
    VALUE_HANDLERS = [
        ('parse_fstruct_value', _START_FSTRUCT_RE),
        # One or more digits followed by alphabetic characters
        ('parse_digit_pre_value', re.compile(r'[0-9]+[a-zA-Z_]+')),
        # Digits only
        ('parse_int_value', re.compile(r'-?\d+')),
        # Other string combinations
        ('parse_sym_value', re.compile(r'\w\w*'))
#        ('parse_str_value', re.compile("[uU]?[rR]?(['\"])")),
        ]

    def parse_fstruct_value(self, s, position, match):
        """A nested feature structure value: recurse."""
        return self.partial_parse(s, position)

    def parse_digit_pre_value(self, s, position, match):
        """Digits followed by letters (e.g. '3sf'): keep as a string."""
        val, end = match.group(), match.end()
        return val, end

#    def parse_str_value(self, s, position, match):
#        return internals.parse_str(s, position)

    def parse_int_value(self, s, position, match):
        """Digits only: convert to an int."""
        return int(match.group()), match.end()

    # Bare symbols that map to Python constants.
    _SYM_CONSTS = {'None':None, 'True':True, 'False':False}
    def parse_sym_value(self, s, position, match):
        """A plain symbol: a known constant if recognized, otherwise the
        string itself."""
        val, end = match.group(), match.end()
        return self._SYM_CONSTS.get(val, val), end

# Module-level parser instance used by Features.from_string().
PARSER = DictStringParser()
=======================================
--- /l3xdg/languages/es/n_chunk.inst Thu Oct 31 23:45:08 2013 UTC
+++ /l3xdg/languages/es/n_chunk.inst Fri Jul 11 19:47:01 2014 UTC
@@ -499,6 +499,7 @@
& chicle pl= chicles
& chico pl= chicos
& chileno pl= chilenos
+& chimpancé pl= chimpancés
& chino pl= chinos
& chiste pl= chistes
& chivo pl= chivos
@@ -7738,7 +7739,6 @@
& ceniceros
& cerezos
& cetáceos
-& chimpancés
& chivatos
& cigarros
& claroscuros
@@ -8029,7 +8029,6 @@
& vándalos
& ácaros
& álamos
-& vacaciones

^nfp
& acequias
@@ -8207,3 +8206,4 @@
& virutas
& vísceras
& zapatillas
+& vacaciones
Reply all
Reply to author
Forward
0 new messages