Revision: 7c6dd0caf9be
Branch: default
Author: Michael Gasser <gas...@cs.indiana.edu>
Date: Sun Apr 20 19:19:51 2014 UTC
Log: L3Lite: languages/amh.lg and eng.lg initial examples of lexicons
http://code.google.com/p/hltdi-l3/source/detail?r=7c6dd0caf9be
Added:
/l3lite/languages/amh.lg
/l3lite/languages/eng.lg
Modified:
/l3lite/entry.py
/l3lite/language.py
/l3lite/sentence.py
=======================================
--- /dev/null
+++ /l3lite/languages/amh.lg Sun Apr 20 19:19:51 2014 UTC
@@ -0,0 +1,29 @@
+name: አማርኛ
+abbrev: amh
+groups:
+ በላ:
+ - words: [$food, በላ]
+ አወቀ:
+ - words: [$vb, አወቀ]
+ features: [{tam: ger}, {tam: imf, pol: neg}]
+forms:
+ ያውቃል:
+ root: አወቀ
+ features: {tam: imf, pol: aff, sp: 3, sn: 0, sg: 0}
+ cats: [$vb]
+ አያውቅም:
+ root: አወቀ
+ features: {tam: imf, pol: neg, sp: 3, sn: 0, sg: 0}
+ cats: [$vb]
+ በልቶ:
+ root: በላ
+ features: {tam: ger, sp: 3, sn: 0, sg: 0}
+ cats: [$vb]
+ ድንች:
+ root: ድንች
+ features: {num: 0, case: 0, poss: 0, def: 0}
+ cats: [$food, $thing]
+ አሳ:
+ root: አሳ
+ features: {num: 0, case: 0, poss: 0, def: 0}
+ cats: [$food, $animal, $thing]
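Note: the .lg files are plain YAML, so a lexicon like the one above can be
inspected with PyYAML before any of the l3lite loading machinery runs. A
minimal sketch (the relative path and the use of yaml.safe_load are
assumptions for illustration, not part of this commit):

    import yaml

    # Load the Amharic lexicon added above and inspect a form entry.
    with open('l3lite/languages/amh.lg', encoding='utf-8') as f:
        lex = yaml.safe_load(f)

    print(lex['name'])                   # አማርኛ
    print(lex['forms']['ድንች']['cats'])   # ['$food', '$thing']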
=======================================
--- /dev/null
+++ /l3lite/languages/eng.lg Sun Apr 20 19:19:51 2014 UTC
@@ -0,0 +1,34 @@
+name: English
+abbrev: eng
+forms:
+ "it's":
+ seg:
+ - [it, {root: it, features: {num: 0, per: 3, gen: 2}, cats: [$pron]}]
+ - [is, {root: be, features: {tns: prs, per: 3, num: 0}, cats: [$aux, $cop]}]
+ end:
+ root: end
+ features: {num: 0, prs: 3}
+ cats: [$abs]
+ boy:
+ root: boy
+ features: {num: 0, prs: 3}
+ cats: [$sbd]
+ act:
+ root: act
+ features: {num: 0, prs: 3}
+ cats: [$sth]
+ us:
+ root: we
+ features: {num: 1, prs: 1, case: 1}
+ cats: [$sbd]
+
+groups:
+ end:
+ - words: [the, end, of, the, world]
+ read:
+ - words: [read, $sbd, the, riot, act]
+ - words: [read, $sth]
+ boy:
+ - words: [the, boy]
+ us:
+ - words: [us]
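Note: the quoted "it's" entry uses a 'seg' key to segment a contraction into
two analyzed tokens; Sentence.tokenize (in sentence.py below) creates one
SNode per segment. A small sketch of reading that entry (same loading
assumptions as above):

    import yaml

    with open('l3lite/languages/eng.lg', encoding='utf-8') as f:
        forms = yaml.safe_load(f)['forms']

    # Each segment pairs a surface token with its analysis dict.
    for tok, analysis in forms["it's"]['seg']:
        print(tok, analysis['root'], analysis['cats'])
    # it it ['$pron']
    # is be ['$aux', '$cop']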
=======================================
--- /l3lite/entry.py Sun Apr 20 07:07:10 2014 UTC
+++ /l3lite/entry.py Sun Apr 20 19:19:51 2014 UTC
@@ -643,21 +643,24 @@
class Group(Entry):
"""Primitive multi-word expressions. Default is a head with unlabeled
dependencies
- to all other words and translations, including alignments, to one or
more
+ to all other tokens and translations, including alignments, to one or
more
other languages."""
- def __init__(self, words, head_index=-1, head='', language=None, name='',
+ def __init__(self, tokens, head_index=-1, head='', language=None, name='',
features=None):
- # words is a list of strings or (string, dict) tuples
+ """Either head_index or head (a string) must be specified."""
+ # tokens is a list of strings
# name may be specified explicitly or not
- name = name or Group.make_name(words)
+ name = name or Group.make_name(tokens)
Entry.__init__(self, name, language)
- self.words = words
+ self.tokens = tokens
if head:
self.head = head
+ self.head_index = tokens.index(head)
else:
- self.head = words[head_index]
- # Either None or a list of feat-val dicts for words that require them
+ self.head = tokens[head_index]
+ self.head_index = head_index
+ # Either None or a list of feat-val dicts for tokens that require them
# Convert dicts to Features objects
if isinstance(features, list):
features = [Features(d) for d in features]
@@ -668,63 +671,65 @@
 return '<{}:{}>'.format(self.name, self.id)
@staticmethod
- def make_name(words):
- # Each word is either a string or a (string, feat_dict) pair
+ def make_name(tokens):
+ # Each token is either a string or a (string, feat_dict) pair
# strings = []
-# for word in words:
-# if isinstance(word, str):
-# strings.append(word)
+# for token in tokens:
+# if isinstance(token, str):
+# strings.append(token)
# else:
-# form, feat_dict = word
+# form, feat_dict = token
# fv = ['{}={}'.format(f, v) for f, v in feat_dict.items()]
# fv = ','.join(fv)
# strings.append("{}:{}".format(form, fv))
- return '_'.join(words)
+ return '_'.join(tokens)
# Serialization
def to_dict(self):
"""Convert the group to a dictionary to be serialized in a yaml
file."""
d = Entry.to_dict(self)
- d['words'] = self.words
+ d['words'] = self.tokens
d['features'] = self.features
return d
@staticmethod
def from_dict(d, language, head):
"""Convert a dict (loaded from a yaml file) to a Group object."""
- words = d['words']
+ tokens = d['words']
features = d.get('features')
- p = Group(words, head=head, language=language, features=features)
+ p = Group(tokens, head=head, language=language, features=features)
return p
- def match_nodes(self, nodes, head_index):
- """Attempt to match the group words (and features) with nodes from
a sentence."""
-# print("Does {} match {}".format(self, nodes))
- tok_indices = []
- for index, word in enumerate(self.words):
-# print(" Attempting to match {}".format(word))
+ def match_nodes(self, snodes, head_index):
+ """Attempt to match the group tokens (and features) with snodes
from a sentence."""
+# print("Does {} match {}".format(self, snodes))
+ sent_indices = []
+ for index, token in enumerate(self.tokens):
+ sent_tok_indices = []
+# print(" Attempting to match {}".format(token))
matched = False
- for node in nodes:# for word, feats in zip(self.words, self.features):
+ for node in snodes:# for token, feats in zip(self.tokens, self.features):
# print(" Trying {}".format(node))
if index == node.index == head_index:
# This is the token corresponding to the group head
- tok_indices.append(node.index)
+ sent_tok_indices.append(node.index)
# print(" Head matched already".format(node))
matched = True
break
else:
feats = self.features[index] if self.features else None
- if node.match(word, feats):
- tok_indices.append(node.index)
+ if node.match(token, feats):
+ sent_tok_indices.append(node.index)
# print(" Matched node {}".format(node))
matched = True
- break
if not matched:
-# print(" {} not matched; failed".format(word))
+# print(" {} not matched; failed".format(token))
return False
- return tok_indices
+ else:
+ sent_indices.append(sent_tok_indices)
+ return sent_indices
class EntryError(Exception):
'''Class for errors encountered when attempting to update an entry.'''
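Note: the substantive change to match_nodes, beyond the words-to-tokens
renaming, is that it no longer breaks out after a token's first match: it now
collects every compatible sentence index per group token and returns a list
of index lists rather than a flat list. A toy model of that contract (ToyNode
is an illustrative stand-in for SNode; the real matching also checks
categories, features, and the head special case):

    class ToyNode:
        def __init__(self, token, index):
            self.token, self.index = token, index
        def match(self, tok, feats=None):
            # Treat $-prefixed group tokens as wildcards here.
            return tok == self.token or tok.startswith('$')

    def match_nodes(tokens, snodes):
        """Simplified mirror: one candidate index list per group token."""
        sent_indices = []
        for token in tokens:
            matches = [n.index for n in snodes if n.match(token)]
            if not matches:
                return False    # some group token went unmatched
            sent_indices.append(matches)
        return sent_indices

    snodes = [ToyNode(t, i) for i, t in enumerate("read us the riot act".split())]
    print(match_nodes(['read', '$sbd', 'the', 'riot', 'act'], snodes))
    # [[0], [0, 1, 2, 3, 4], [2], [3], [4]]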
=======================================
--- /l3lite/language.py Sun Apr 20 07:07:10 2014 UTC
+++ /l3lite/language.py Sun Apr 20 19:19:51 2014 UTC
@@ -133,7 +133,7 @@
yaml.dump(self.to_dict(), file)
@staticmethod
- def from_dict(d):
+ def from_dict(d, reverse=True):
"""Convert a dict (loaded from a yaml file) to a Language
object."""
l = Language(d.get('name'), d.get('abbrev'))
l.possible = d.get('possible')
@@ -181,6 +181,14 @@
if 'features' in d:
d['features'] = Features(d['features'])
l.forms[k] = v
+ if reverse:
+ # Add item to genform dict
+ if isinstance(v, dict):
+ if 'seg' not in v:
+ l.add_genform(k, v['root'], v.get('features'))
+ else:
+ for d in v:
+ l.add_genform(k, d['root'], d.get('features'))
return l
@staticmethod
@@ -280,12 +288,12 @@
self.changed = True
return entry
- def add_group(self, words, head_index=-1, head='', name='', features=None):
- group = Group(words, head_index=head_index, head=head,
+ def add_group(self, tokens, head_index=-1, head='', name='', features=None):
+ group = Group(tokens, head_index=head_index, head=head,
language=self, name=name, features=features)
# print('Group {}, head {}'.format(group, group.head))
if features:
- head_i = words.index(group.head)
+ head_i = tokens.index(group.head)
head_feats = features[head_i]
else:
head_feats = None
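Note: add_genform itself is not in this diff; judging from the call sites in
from_dict above, it indexes surface forms by root for the generation
direction. A hypothetical stand-in showing that apparent role:

    # Hypothetical stand-in for Language.add_genform (implementation not
    # shown in this commit): map each root to its surface forms + features.
    genforms = {}

    def add_genform(form, root, features=None):
        genforms.setdefault(root, []).append((form, features))

    add_genform('ያውቃል', 'አወቀ', {'tam': 'imf', 'pol': 'aff'})
    add_genform('አያውቅም', 'አወቀ', {'tam': 'imf', 'pol': 'neg'})
    print([f for f, _ in genforms['አወቀ']])   # ['ያውቃል', 'አያውቅም']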
=======================================
--- /l3lite/sentence.py Sun Apr 20 07:07:10 2014 UTC
+++ /l3lite/sentence.py Sun Apr 20 19:19:51 2014 UTC
@@ -26,6 +26,8 @@
# 2014.04.15
# -- Created.
+# 2014.04.19-20
+# -- Group matching. GInst, GNode, and SNode classes.
# ui.py loads language, etc.
from .ui import *
@@ -46,7 +48,7 @@
# self.analyses = analyses or []
# A language object
self.language = language
- # A list of Node objects, one for each token
+ # A list of SNode objects, one for each token
self.nodes = nodes or []
# A list of candidate groups found during lexicalization
self.groups = groups or []
@@ -60,9 +62,14 @@
else:
return '|| {} sentence ||'.format(self.language)
+ def initialize(self):
+ """Things to do before running constraint satisfaction."""
+ self.tokenize()
+ self.lexicalize()
+
def tokenize(self):
"""Segment the sentence string into tokens, analyze them
morphologically,
- and create a Node object for each."""
+ and create a SNode object for each."""
if not self.nodes:
# (Otherwise it's already done.)
 # Split at spaces by default (later allow for dedicated language-specific tokenizers).
@@ -73,7 +80,7 @@
# Look up token in language.forms
if token not in self.language.forms:
# Not found, just use the raw string
- self.nodes.append(Node(token, index, None, self))
+ self.nodes.append(SNode(token, index, None, self))
index += 1
else:
 # A dict, for unambiguous forms, or a list of dicts, for ambiguous forms
@@ -84,20 +91,20 @@
segs = formdict['seg']
for seg in segs:
tok, analysis = seg
- self.nodes.append(Node(tok, index, analysis, self))
+ self.nodes.append(SNode(tok, index, analysis, self))
index += 1
else:
- self.nodes.append(Node(token, index, formdict, self))
+ self.nodes.append(SNode(token, index, formdict, self))
index += 1
else:
# Multiple dicts: ambiguity; let node handle it
- self.nodes.append(Node(token, index, formdict, self))
+ self.nodes.append(SNode(token, index, formdict, self))
index += 1
- def activate_groups(self):
- """Activate all groups that are compatible with the tokens in the
sentence."""
+ def lexicalize(self):
+ """Find and instantiate all groups that are compatible with the
tokens in the sentence."""
if not self.nodes:
- print("Tokenization must precede group activation.")
+ print("Tokenization must precede lexicalization.")
return
candidates = []
for node in self.nodes:
@@ -117,16 +124,17 @@
 # For each group, save a list of lists of sentence token indices that correspond
# to the group's words
groups = []
- for head_index, group in candidates:
- indices = group.match_nodes(self.nodes, head_index)
+ for head_i, group in candidates:
+ indices = group.match_nodes(self.nodes, head_i)
if not indices:
# This group is out
continue
- groups.append((head_index, indices, group))
+# print('Found indices {} for group {}, head index {}'.format(indices, group, head_i))
+ groups.append((head_i, indices, group))
- self.groups = groups
+ self.groups = [GInst(group, self, head_i, indices) for head_i, indices, group in groups]
-class Node:
+class SNode:
"""Sentence token and its associated analyses and variables."""
def __init__(self, token, index, analyses, sentence):
@@ -145,7 +153,7 @@
def __repr__(self):
"""Print name."""
- return "*{}".format(self.token)
+ return "*{}:{}".format(self.token, self.index)
def get_cats(self):
"""The set of categories for the node's token, or None."""
@@ -180,3 +188,35 @@
elif node_features.unify(features):
return True
return False
+
+class GInst:
+
+ """Instantiation of a group; holds variables and GNode objects."""
+
+ def __init__(self, group, sentence, head_index, snode_indices):
+ # The Group object that this "instantiates"
+ self.group = group
+ self.sentence = sentence
+ # Index of SNode associated with group head
+ self.head_index = head_index
+ # List of GNodes
+ self.nodes = [GNode(self, index, indices) for index, indices in enumerate(snode_indices)]
+
+ def __repr__(self):
+ return '<<{}:{}>>'.format(self.group.name, self.group.id)
+
+class GNode:
+
+ """Representation of a single node (word, position) within a GInst
object."""
+
+ def __init__(self, ginst, index, snode_indices):
+ self.ginst = ginst
+ self.index = index
+ self.snode_indices = snode_indices
+ # Whether this is the head of the group
+ self.head = index == ginst.group.head_index
+ # Group word, etc. associated with this node
+ self.token = ginst.group.tokens[index]
+
+ def __repr__(self):
+ return "{}|{}".format(self.ginst, self.token)