[clml] r129 committed - update

0 views

Skip to first unread message

cl...@googlecode.com

unread,

Jul 1, 2011, 9:36:55 PM (7/1/11)

to cl...@googlegroups.com

Revision: 129
Author: sbo...@isetsu.net
Date: Fri Jul 1 18:36:02 2011
Log: update
http://code.google.com/p/clml/source/detail?r=129

Added:
/trunk/pyphon/src/pyphon/permutationdistance.py
/trunk/pyphon/src/pyphon/twotierlangs.py
Deleted:
/trunk/pyphon/src/permutationdistance.py
/trunk/pyphon/src/twotierlangs.py
Modified:
/trunk/pyphon/src/pyphon/__init__.py
/trunk/pyphon/src/pyphon_recurse.py
/trunk/pyphon/src/pyphon_twotier.py

=======================================
--- /dev/null
+++ /trunk/pyphon/src/pyphon/permutationdistance.py Fri Jul 1 18:36:02 2011
@@ -0,0 +1,132 @@
+#!/usr/bin/env python
+
+#Very much a draft.
+import itertools, sys
+
+#ercset1s = sys.argv[-2]
+#ercset2s = sys.argv[-1]
+
+#ercset1 = ercset1s.rsplit("_")
+#ercset2 = ercset2s.rsplit("_")
+#print ercset1
+#print ercset2
+
+def compatible(r, erc):
+ """Check if two ERCs are compatible"""
+ havew = False
+ for i in range(len(r)):
+ if erc[r[i]] == 'w':
+ havew = True
+ elif erc[r[i]] == 'l' and not havew:
+ return False
+ return True
+
+def getrankings(ercset, n=0):
+ """Get all of the rankings consistent with an erc set"""
+ pset = itertools.permutations(range(len(ercset[0])))
+ psetl = list(pset)
+ ret = []
+ for p in psetl:
+ okay = True
+ for erc in ercset:
+ if okay and not compatible(p, erc):
+ okay = False
+ if okay:
+ ret.append(p)
+ if n!=0 and len(ret)>n-1:
+ return ret
+
+ return ret
+
+#Below: Code for permutation distance testing
+def merge_and_count(a, b):
+ assert a == sorted(a) and b == sorted(b)
+ c = []
+ count = 0
+ i, j = 0, 0
+ while i < len(a) and j < len(b):
+ c.append(min(b[j], a[i]))
+ if b[j] < a[i]:
+ count += len(a) - i # number of elements remaining in `a`
+ j+=1
+ else:
+ i+=1
+ # now we reached the end of one the lists
+ c += a[i:] + b[j:] # append the remainder of the list to C
+ return count, c
+
+
+def sort_and_count(L):
+ if len(L) == 1: return 0, L
+ n = len(L) // 2
+ a, b = L[:n], L[n:]
+ ra, a = sort_and_count(a)
+ rb, b = sort_and_count(b)
+ r, L = merge_and_count(a, b)
+ return ra+rb+r, L
+
+
+def get_permutation(L1, L2):
+ """Find permutation that converts L1 into L2.
+
+ See http://en.wikipedia.org/wiki/Cycle_representation#Notation
+ """
+ if sorted(L1) != sorted(L2):
+ raise ValueError("L2 must be permutation of L1 (%s, %s)" % (L1,L2))
+
+ permutation = map(dict((v, i) for i, v in enumerate(L1)).get, L2)
+ assert [L1[p] for p in permutation] == L2
+ return permutation
+
+
+def number_of_swaps(permutation):
+ """Find number of swaps required to convert the permutation into
+ identity one.
+
+ """
+ # decompose the permutation into disjoint cycles
+ nswaps = 0
+ seen = set()
+ for i in xrange(len(permutation)):
+ if i not in seen:
+ j = i
+ while permutation[j] != i:
+ j = permutation[j]
+ seen.add(j)
+ nswaps += 1
+
+ return nswaps
+
+def set_dif(rset1, rset2):
+ """Get the minimum distance between rankings defined by two erc sets, and the best rankings"""
+ m = -1
+ pairs = []
+ i = 0
+ for r1 in rset1:
+ if i%50==0:
+ print "Processing ranking", i+1,'/',len(rset1),"..."
+ i+=1
+ for r2 in rset2:
+ per = get_permutation(list(r1),list(r2))
+ dist = sort_and_count(per)[0]
+ if m > 0 and dist < m:
+ m = dist
+ pairs = []
+ pairs.append([r1, r2])
+ elif m < 0:
+ m = dist
+ pairs.append([r1, r2])
+ elif dist == m:
+ pairs.append([r1, r2])
+ return [m, pairs]
+
+
+#Get permutation sets, compute distances, find min
+
+#r1 = getrankings(ercset1)
+#r2 = getrankings(ercset2)
+
+#print r1
+#print r2
+
+#print set_dif(r1, r2)
=======================================
--- /dev/null
+++ /trunk/pyphon/src/pyphon/twotierlangs.py Fri Jul 1 18:36:02 2011
@@ -0,0 +1,135 @@
+#!/usr/bin/env python
+
+#Give this the output filename from pyphon_twotier -- it'll give you ERC-set pairs which together define an Input-intermediate-output relationship consistent with the provided observations.
+
+import RNF
+from permutationdistance import getrankings, set_dif
+
+def print_languages(languages, log, i):
+ for language in languages:
+ print "Language:", i
+ log.write("Language "+str(i)+"\n")
+ print "Cyclic ERCs", language[0]
+ log.write(str(language[0]))
+ log.write("\n")
+ rs=getrankings(language[0], 10)
+ print "Sample Cyclic Rankings", rs
+ log.write(str(language[1]))
+ log.write("\n")
+ print "Postlex ERCs", language[1]
+ rs=getrankings(language[1], 10)
+ print "Sample Postlex Rankings", rs
+
+ for member in language[2]:
+ print member[0], member[1], member[2]
+ log.write(str(member[0])+" "+str(member[1])+" "+str(member[2])+"\n")
+ #print language[3]
+
+def compatible_languages(cohort, languages):
+ retlanguages = []
+ if languages == []:
+ for member in cohort:
+ retlanguages.append([member[3].rsplit("_"), member[4].rsplit("_"), [member]])
+
+ else:
+ for language in languages:
+ for member in cohort:
+ es1 = RNF.RNF(language[0]+member[3].rsplit("_"))
+ es2 = RNF.RNF(language[1]+member[4].rsplit("_"))
+
+ #are the above really equivalent to concatenate + simplify?
+
+ es1 = language[0]+member[3].rsplit("_")
+
+ es2 = language[1]+member[4].rsplit("_")
+
+ for erc in es1:
+ if not "l" in erc:
+ es1.remove(erc)
+
+ for erc in es2:
+ if not "l" in erc:
+ es2.remove(erc)
+
+
+ if (RNF.consistent(es1) and RNF.consistent(es2)):
+ #print "success"
+ retlanguages.append([es1, es2, language[2]+[member]])
+ #else:
+ #print "fail"
+ #print RNF.consistent(es1), es1
+ #print RNF.consistent(es2), es2
+
+
+ todelete = []
+ for i in range(len(retlanguages)):
+ for j in range(i):
+ #print "checking", i, j
+ if retlanguages[i][0] == retlanguages[j][0] and retlanguages[i][1] == retlanguages[j][1]:
+ todelete.append(i)
+ pfretlanguages=[]
+ for i in range(len(retlanguages)):
+ if i not in todelete:
+ pfretlanguages.append(retlanguages[i])
+
+ return pfretlanguages
+
+def find2tlangs(infilename):
+ if ".csv" in infilename:
+ infilename = infilename[:-4]
+
+ log = open(infilename+"_languages.csv",'w')
+
+
+ infile = open(infilename + ".csv", 'r')
+
+
+
+ seqs = []
+ for line in infile:
+ if ("input form" in line):
+ continue
+ line = line[:-1]
+ seq = line.rsplit(",") #build pairs of IO mappings
+ seqs.append(seq)
+
+ cohorts = []
+ cohort = []
+ #def next_input(erc, combos):
+
+ seqs.sort()
+
+ prev = ""
+ for i in range(len(seqs)):
+ if prev == "" or seqs[i][0] == prev:
+ cohort.append(seqs[i])
+ else:
+ cohorts.append(cohort)
+ cohort=[seqs[i]]
+ prev = seqs[i][0]
+ cohorts.append(cohort)
+
+
+ #next_input()
+
+
+
+
+ languages = []
+
+ for cohort in cohorts:
+ languages = compatible_languages(cohort, languages)
+ print "Adding cohort with", len(languages), "languages:", cohort[0][0]
+ #print_languages(languages)
+ if len(languages)==0:
+ print "No languages found. Sorry!"
+ break
+
+ annotated_languages = []
+ i=0
+ for language in languages:
+ i+=1
+ #language.append(cover_dist(language[0], language[1]))
+ annotated_languages.append(language)
+ print_languages([language], log, i)
+
=======================================
--- /trunk/pyphon/src/permutationdistance.py Tue Jun 28 16:12:00 2011
+++ /dev/null
@@ -1,132 +0,0 @@
-#!/usr/bin/env python
-
-#Very much a draft.
-import itertools, sys
-
-#ercset1s = sys.argv[-2]
-#ercset2s = sys.argv[-1]
-
-#ercset1 = ercset1s.rsplit("_")
-#ercset2 = ercset2s.rsplit("_")
-#print ercset1
-#print ercset2
-
-def compatible(r, erc):
- """Check if two ERCs are compatible"""
- havew = False
- for i in range(len(r)):
- if erc[r[i]] == 'w':
- havew = True
- elif erc[r[i]] == 'l' and not havew:
- return False
- return True
-
-def getrankings(ercset, n=0):
- """Get all of the rankings consistent with an erc set"""
- pset = itertools.permutations(range(len(ercset[0])))
- psetl = list(pset)
- ret = []
- for p in psetl:
- okay = True
- for erc in ercset:
- if okay and not compatible(p, erc):
- okay = False
- if okay:
- ret.append(p)
- if n!=0 and len(ret)>n-1:
- return ret
-
- return ret
-
-#Below: Code for permutation distance testing
-def merge_and_count(a, b):
- assert a == sorted(a) and b == sorted(b)
- c = []
- count = 0
- i, j = 0, 0
- while i < len(a) and j < len(b):
- c.append(min(b[j], a[i]))
- if b[j] < a[i]:
- count += len(a) - i # number of elements remaining in `a`
- j+=1
- else:
- i+=1
- # now we reached the end of one the lists
- c += a[i:] + b[j:] # append the remainder of the list to C
- return count, c
-
-
-def sort_and_count(L):
- if len(L) == 1: return 0, L
- n = len(L) // 2
- a, b = L[:n], L[n:]
- ra, a = sort_and_count(a)
- rb, b = sort_and_count(b)
- r, L = merge_and_count(a, b)
- return ra+rb+r, L
-
-
-def get_permutation(L1, L2):
- """Find permutation that converts L1 into L2.
-
- See http://en.wikipedia.org/wiki/Cycle_representation#Notation
- """
- if sorted(L1) != sorted(L2):
- raise ValueError("L2 must be permutation of L1 (%s, %s)" % (L1,L2))
-
- permutation = map(dict((v, i) for i, v in enumerate(L1)).get, L2)
- assert [L1[p] for p in permutation] == L2
- return permutation
-
-
-def number_of_swaps(permutation):
- """Find number of swaps required to convert the permutation into
- identity one.
-
- """
- # decompose the permutation into disjoint cycles
- nswaps = 0
- seen = set()
- for i in xrange(len(permutation)):
- if i not in seen:
- j = i
- while permutation[j] != i:
- j = permutation[j]
- seen.add(j)
- nswaps += 1
-
- return nswaps
-
-def set_dif(rset1, rset2):
- """Get the minimum distance between rankings defined by two erc sets, and the best rankings"""
- m = -1
- pairs = []
- i = 0
- for r1 in rset1:
- if i%50==0:
- print "Processing ranking", i+1,'/',len(rset1),"..."
- i+=1
- for r2 in rset2:
- per = get_permutation(list(r1),list(r2))
- dist = sort_and_count(per)[0]
- if m > 0 and dist < m:
- m = dist
- pairs = []
- pairs.append([r1, r2])
- elif m < 0:
- m = dist
- pairs.append([r1, r2])
- elif dist == m:
- pairs.append([r1, r2])
- return [m, pairs]
-
-
-#Get permutation sets, compute distances, find min
-
-#r1 = getrankings(ercset1)
-#r2 = getrankings(ercset2)
-
-#print r1
-#print r2
-
-#print set_dif(r1, r2)
=======================================
--- /trunk/pyphon/src/twotierlangs.py Wed Jun 29 12:51:08 2011
+++ /dev/null
@@ -1,135 +0,0 @@
-#!/usr/bin/env python
-
-#Give this the output filename from pyphon_twotier -- it'll give you ERC-set pairs which together define an Input-intermediate-output relationship consistent with the provided observations.
-
-from pyphon import *
-from permutationdistance import getrankings, set_dif
-
-def print_languages(languages, log, i):
- for language in languages:
- print "Language:", i
- log.write("Language "+str(i)+"\n")
- print "Cyclic ERCs", language[0]
- log.write(str(language[0]))
- log.write("\n")
- rs=getrankings(language[0], 10)
- print "Sample Cyclic Rankings", rs
- log.write(str(language[1]))
- log.write("\n")
- print "Postlex ERCs", language[1]
- rs=getrankings(language[1], 10)
- print "Sample Postlex Rankings", rs
-
- for member in language[2]:
- print member[0], member[1], member[2]
- log.write(str(member[0])+" "+str(member[1])+" "+str(member[2])+"\n")
- #print language[3]
-
-def compatible_languages(cohort, languages):
- retlanguages = []
- if languages == []:
- for member in cohort:
- retlanguages.append([member[3].rsplit("_"), member[4].rsplit("_"), [member]])
-
- else:
- for language in languages:
- for member in cohort:
- es1 = RNF.RNF(language[0]+member[3].rsplit("_"))
- es2 = RNF.RNF(language[1]+member[4].rsplit("_"))
-
- #are the above really equivalent to concatenate + simplify?
-
- es1 = language[0]+member[3].rsplit("_")
-
- es2 = language[1]+member[4].rsplit("_")
-
- for erc in es1:
- if not "l" in erc:
- es1.remove(erc)
-
- for erc in es2:
- if not "l" in erc:
- es2.remove(erc)
-
-
- if (RNF.consistent(es1) and RNF.consistent(es2)):
- #print "success"
- retlanguages.append([es1, es2, language[2]+[member]])
- #else:
- #print "fail"
- #print RNF.consistent(es1), es1
- #print RNF.consistent(es2), es2
-
-
- todelete = []
- for i in range(len(retlanguages)):
- for j in range(i):
- #print "checking", i, j
- if retlanguages[i][0] == retlanguages[j][0] and retlanguages[i][1] == retlanguages[j][1]:
- todelete.append(i)
- pfretlanguages=[]
- for i in range(len(retlanguages)):
- if i not in todelete:
- pfretlanguages.append(retlanguages[i])
-
- return pfretlanguages
-
-def find2tlangs(infilename):
- if ".csv" in infilename:
- infilename = infilename[:-4]
-
- log = open(infilename+"_languages.csv",'w')
-
-
- infile = open(infilename + ".csv", 'r')
-
-
-
- seqs = []
- for line in infile:
- if ("input form" in line):
- continue
- line = line[:-1]
- seq = line.rsplit(",") #build pairs of IO mappings
- seqs.append(seq)
-
- cohorts = []
- cohort = []
- #def next_input(erc, combos):
-
- seqs.sort()
-
- prev = ""
- for i in range(len(seqs)):
- if prev == "" or seqs[i][0] == prev:
- cohort.append(seqs[i])
- else:
- cohorts.append(cohort)
- cohort=[seqs[i]]
- prev = seqs[i][0]
- cohorts.append(cohort)
-
-
- #next_input()
-
-
-
-
- languages = []
-
- for cohort in cohorts:
- languages = compatible_languages(cohort, languages)
- print "Adding cohort with", len(languages), "languages:", cohort[0][0]
- #print_languages(languages)
- if len(languages)==0:
- print "No languages found. Sorry!"
- break
-
- annotated_languages = []
- i=0
- for language in languages:
- i+=1
- #language.append(cover_dist(language[0], language[1]))
- annotated_languages.append(language)
- print_languages([language], log, i)
-
=======================================
--- /trunk/pyphon/src/pyphon/__init__.py Wed Sep 1 08:34:37 2010
+++ /trunk/pyphon/src/pyphon/__init__.py Fri Jul 1 18:36:02 2011
@@ -2,10 +2,12 @@
Package documentation goes here.
"""

-__version__ = "1.5.0-dev"
+__version__ = "1.5.0-dev with two-tier extensions"

from pyphon import *

from featureMap import ipaFeatures, ipaDiacriticFeatures

import fsa, pycbg, pyspe, regex
+
+import twotierlangs, permutationdistance
=======================================
--- /trunk/pyphon/src/pyphon_recurse.py Wed Jun 29 12:51:08 2011
+++ /trunk/pyphon/src/pyphon_recurse.py Fri Jul 1 18:36:02 2011
@@ -1,221 +1,252 @@
#!/usr/bin/env python

+#from Bowman, Samuel. 2011. Vowel Harmony, Opacity, and Finite-State OT. MA Thesis, The University of Chicago.
+
import sys, os, re, string, copy
+from optparse import OptionParser
sys.path[0:0] = 'src' # puts the /foo directory at the start of your path
-from pyphon_maketableaux import *
from pyphon_generate import *
+from pyphon_maketableaux import *
import pyphon_maketableaux, pyphon_generate

usage = \
-"""pyphon_recurse.py modelfile inputfile
-
-Read a PyPhon model and a single inputs and write a list of cyclic-evaluation contenders to disk.
+"""
+pyphon_recurse.py modelfile inputfile
+
+Read a PyPhon model and a file of inputs (on separate lines) and write a list of cyclic-evaluation contenders to disk. Since this depends on ERCs, it is presently limited to OT, and will not generate HG typologies.

The model filename must have no suffix, and must refer both to a model file (.mod), as built by pyphon_makemodel.py, and to a model description file (.csv).

-The input file must be a single input on a single line, with affixes separated by \. and the stem deliminated by \=s:, as in:
-
-a\.b\.c\=de\=f\.g
-
-The output defaults to inputfile-modelfile-recursive_typology.csv.
+The input file must be a single input on a single line, with affixes separated by \. and the stem delimited by \#:, as in:
+
+a\.b\.c\#de\#f\.g
+
+The output will be [inputfile]-[modelfile]-recursive_typology.csv.
"""

-def process(output, con, words):
- 'Run one tier'
- loutput = open(infilename + "_ri.csv",'w')
-
- wordsrem = []
-
- for word in words:
- stem = word[0]
- laffixes = word[1]
- raffixes = word[2]
- input = word[3]
- erc = word[4]
- interm = copy.deepcopy(word[5])
- interm.append(stem)
- laffixes_rem = laffixes[:]
- raffixes_rem = raffixes[:]
-
- la = ra = ''
- if laffixes_rem:
- la = laffixes_rem.pop()
- if raffixes_rem:
- ra = raffixes_rem.pop()
- if not (la + ra):
- output.write(input+', '+stem+', '+erc+', '+('>'.join(word[5]))+'\n')
- continue
- else:
-
- word[1] = laffixes_rem
- word[2] = raffixes_rem
-
-
- fstem = la + "#" + stem + "#" + ra
- stem = la + "\#" + stem + "\#" + ra
-
- # Produce the stem input
- # print "writing", stem, infilename
- loutput.write(stem+'\n')
- word[5] = interm
- word[0] = stem
- wordsrem.append(word)
-
- loutput.close()
-
- if wordsrem==[]:
- return
-
- pyphon_maketableaux.main(model, infilename+"_ri", "ot", "temptab")
-
- cmd = "cat temptab.csv"
- os.system(cmd)
-
-
- cache = []
-
- for word in wordsrem:
- erc = word[4]
- stem = word[0]
-
- # Generate ERCs
- if erc != "0":
- eoutput = open('ercs.csv','w')
- eoutput.write('\nERCS, gram\n')
- w=[]
- l=[]
- e=[]
- i=0
- ci=0
- while 1:
- #order W L E
- if i==len(erc):
- break
- if erc[i] == 'w':
- w.append(con[ci])
- elif erc[i] == 'l':
- l.append(con[ci])
- elif erc[i] == 'e':
- e.append(con[ci])
- elif erc[i] == '_':
- eoutput.write((' '.join(w)) + ", " + (' '.join(l)) + ", " + (' '.join(e)) + "\n")
- w=[]
- l=[]
- e=[]
- ci=-1
- i=i+1
- ci=ci+1
-
-
- eoutput.write((' '.join(w)) + ", " + (' '.join(l)) + ", " + (' '.join(e)) + "\n\nERCS, blank\n,,\n")
- eoutput.close()
-
- pyphon_generate.main("temptab.csv", "temptyp", "ot", "ercs", "gram")
- else:
- pyphon_generate.main("temptab.csv", "temptyp", "ot")
- #Here we generate a typology for all of the inputs when we only need one. It may shave runtime to limit this, but not my much.
-
-
- infile = open("temptyp.csv", 'r')
- erc = ""
-
-
- for line in infile:
-
- #if "Input,Output," in line:
- # sectioned = line.rsplit(',')
- # con = sectioned[2:]
-
- if line[0]=="/":
- line = line[:-1]
-
- n = re.search('(?<=/).+(?=/)', line)
- input = n.group(0)
-
- stem = re.sub('(\\\)','',stem)
- if input != stem:
- continue
-
- n = re.search('(?<=\{).*?(?=\})', line)
- if n:
- ercset = n.group(0)
- else:
- ercset = ""
-
- ercs=[]
-
- for n in re.finditer('[wel]+', ercset):
- if n.group(0) not in ercs:
- ercs.append(n.group(0))
- if word[4]!="0":
- ercs.append(word[4])
-
- if ercs:
- erc = "_".join(ercs)
- else:
- erc = "0"
-
- for m in re.finditer('(?<=\[).*?(?=\])', line):
- fstem_out = re.sub('[\#\.]','',m.group(0))
- candidate = [fstem_out, word[1], word[2], word[3], erc, word[5]]
- if candidate not in cache:
- cache.append(candidate)
- infile.close()
- loutput.close()
- process(output, con, cache)
- #process_erc(output, con, words)
-
-
-
-infilename = sys.argv[-1]
-if ".csv" in infilename:
- infilename = infilename[:-4]
-model = sys.argv[-2]
-
-# Read the first line of the input file. Hopefully this is the only line.
-infile = open(infilename + ".csv", 'r')
-print "Recurse over:"
-
-inputs = []
-
-for line in infile:
- print line
- line = line.rstrip("\n")
-
- # Separate the affixes
- sectioned = line.rsplit('\#')
- if len(sectioned)>1:
- stem = sectioned[1];
- laffixes = sectioned[0].rsplit('\.') # Affixes are ordered by distance, not orthography, so reverse the left
- raffixes = (sectioned[2].rsplit('\.'))[::-1]
- else:
- stem = sectioned[0] #robustness to inputs without morpheme boundries
- laffixes = raffixes = []
- inputs.append([stem, laffixes, raffixes, line, '0', []])
-
-#print stem, laffixes, raffixes
-
-# Produce the stem input
-output = open(infilename + '_ri.csv','w')
-for word in inputs:
- output.write('\#'+word[0]+'\#\n')
-output.close()
-
-
-output = open(model + '_' + infilename + '_recursive_typology.csv','w')
-#output.write('Input, '+input+"\n")
-
-infile = open(model + ".csv", 'r')
-for line in infile:
- if line[0:3]=="CON":
- line = line[:-1]
- sectioned = line.rsplit(',')
- con = sectioned[2:]
- for i in range(len(con)):
- if con[i][0]==' ':
- con[i]=con[i][1:]
-infile.close()
-
-process(output, con, inputs)
-output.close()
-
+
+
+def main(model, ifn):
+
+ def process(output, con, words):
+ 'Run one tier'
+ loutput = open("recurse_temp.csv",'w')
+
+ wordsrem = []
+
+ for word in words:
+ stem = word[0]
+ laffixes = word[1]
+ raffixes = word[2]
+ input = word[3]
+ erc = word[4]
+ interm = copy.deepcopy(word[5])
+ interm.append(stem)
+ laffixes_rem = laffixes[:]
+ raffixes_rem = raffixes[:]
+
+ la = ra = ''
+ if laffixes_rem:
+ la = laffixes_rem.pop()
+ if raffixes_rem:
+ ra = raffixes_rem.pop()
+ if not (la + ra):
+ output.write(input+', '+stem+', '+erc+', '+('>'.join(word[5]))+'\n')
+ continue
+ else:
+
+ word[1] = laffixes_rem
+ word[2] = raffixes_rem
+
+
+ fstem = la + "#" + stem + "#" + ra
+ stem = la + "\#" + stem + "\#" + ra
+
+ # Produce the stem input
+ # print "writing", stem, infilename
+ loutput.write(stem+'\n')
+ word[5] = interm
+ word[0] = stem
+ wordsrem.append(word)
+
+ loutput.close()
+
+ if wordsrem==[]:
+ return
+
+ pyphon_maketableaux.main(model, "recurse_temp", "ot", "recurse_tab_temp")
+
+ cmd = "cat recurse_tab_temp.csv"
+ os.system(cmd)
+
+ if not con:
+ infile = open("recurse_tab_temp.csv", 'r')
+ for line in infile:
+ if line and line[0:3]==",,,":
+ line = line.replace("\r","")
+ line = line.replace("\n","")
+ sectioned = line.rsplit(',')
+ con = sectioned[3:]
+ #print "Found con:", con
+ break
+ infile.close()
+
+ cache = []
+
+ for word in wordsrem:
+ erc = word[4]
+ stem = word[0]
+
+ # Generate ERCs
+ if erc != "0":
+ eoutput = open('ercs.csv','w')
+ eoutput.write('\nERCS, gram\n')
+ w=[]
+ l=[]
+ e=[]
+ i=0
+ ci=0
+ while 1:
+ #order W L E
+ if i==len(erc):
+ break
+ if erc[i] == 'w':
+ w.append(con[ci])
+ elif erc[i] == 'l':
+ l.append(con[ci])
+ elif erc[i] == 'e':
+ e.append(con[ci])
+ elif erc[i] == '_':
+ eoutput.write((' '.join(w)) + ", " + (' '.join(l)) + ", " + (' '.join(e)) + "\n")
+ w=[]
+ l=[]
+ e=[]
+ ci=-1
+ i=i+1
+ ci=ci+1
+
+
+ eoutput.write((' '.join(w)) + ", " + (' '.join(l)) + ", " + (' '.join(e)) + "\n\nERCS, blank\n,,\n")
+ eoutput.close()
+
+ pyphon_generate.main("recurse_tab_temp.csv", "recurse_typ_temp", "ot", "ercs", "gram")
+ else:
+ pyphon_generate.main("recurse_tab_temp.csv", "recurse_typ_temp", "ot")
+ #Here we generate a typology for all of the inputs when we only need one. It may shave runtime to limit this, but not my much.
+
+
+ infile = open("recurse_typ_temp.csv", 'r')
+ erc = ""
+
+
+ for line in infile:
+
+ #if "Input,Output," in line:
+ # sectioned = line.rsplit(',')
+ # con = sectioned[2:]
+
+ if line and line[0]=="/":
+ line = line.replace("\r","")
+ line = line.replace("\n","")
+ n = re.search('(?<=/).+(?=/)', line)
+ input = n.group(0)
+
+ stem = re.sub('(\\\)','',stem)
+ if input != stem:
+ continue
+
+ n = re.search('(?<=\{).*?(?=\})', line)
+ if n:
+ ercset = n.group(0)
+ else:
+ ercset = ""
+
+ ercs=[]
+
+ for n in re.finditer('[wel]+', ercset):
+ if n.group(0) not in ercs:
+ ercs.append(n.group(0))
+ if word[4]!="0":
+ ercs.append(word[4])
+
+ if ercs:
+ erc = "_".join(ercs)
+ else:
+ erc = "0"
+
+ for m in re.finditer('(?<=\[).*?(?=\])', line):
+ fstem_out = re.sub('[\#\.]','',m.group(0))
+ candidate = [fstem_out, word[1], word[2], word[3], erc, word[5]]
+ if candidate not in cache:
+ cache.append(candidate)
+ infile.close()
+ loutput.close()
+ process(output, con, cache)
+ #process_erc(output, con, words)
+
+ # Read the first line of the input file. Hopefully this is the only line.
+ infilename = ifn
+ if ".csv" in infilename:
+ infilename = infilename[:-4]
+
+ infile = open(infilename + ".csv", 'r')
+ print "Recurse over:"
+
+ inputs = []
+
+ for line in infile:
+ print line
+ line = line.rstrip("\n")
+
+ # Separate the affixes
+ sectioned = line.rsplit('\#')
+ if len(sectioned)>1:
+ stem = sectioned[1];
+ laffixes = sectioned[0].rsplit('\.') # Affixes are ordered by distance, not orthography, so reverse the left
+ raffixes = (sectioned[2].rsplit('\.'))[::-1]
+ else:
+ stem = sectioned[0] #robustness to inputs without morpheme boundries
+ laffixes = raffixes = []
+ inputs.append([stem, laffixes, raffixes, line, '0', []])
+
+ #print stem, laffixes, raffixes
+
+ # Produce the stem input
+ output = open('recurse_inp_temp.csv','w')
+ for word in inputs:
+ output.write('\#'+word[0]+'\#\n')
+ output.close()
+
+
+ output = open(model + '_' + infilename + '_recursive_typology.csv','w')
+ #output.write('Input, '+input+"\n")
+
+
+ process(output, [], inputs)
+ output.close()
+ os.remove('recurse_temp.csv')
+ os.remove('recurse_inp_temp.csv')
+ os.remove('recurse_tab_temp.csv')
+ os.remove('recurse_typ_temp.csv')
+ os.remove('ercs.csv')
+
+
+
+if __name__ == "__main__":
+ parser = OptionParser(usage=usage)
+ parser.add_option('-V', '--version', action='store_true', dest='version',
+ default=False,
+ help='Print pyPhon version and quit.')
+ p_options, args = parser.parse_args()
+
+ if p_options.version:
+ print 'pyPhon version %s.' % pyphon.__version__
+ raise SystemExit
+
+ if len(args) < 2:
+ print usage
+ elif len(args) < 3:
+ main(args[0], args[1])
+ else:
+ print usage
+
=======================================
--- /trunk/pyphon/src/pyphon_twotier.py Wed Jun 29 12:51:08 2011
+++ /trunk/pyphon/src/pyphon_twotier.py Fri Jul 1 18:36:02 2011
@@ -1,149 +1,217 @@
#!/usr/bin/env python
+
+#from Bowman, Samuel. 2011. Vowel Harmony, Opacity, and Finite-State OT. MA Thesis, The University of Chicago.
import sys, os, re, string
+from optparse import OptionParser
sys.path[0:0] = 'src'
-import pyphon
-import twotierlangs
-
-
-#Performs two tier evaluation: If an input file contains comma-separated input output pairs, then this finds intermediate forms and ercs/rankings that can allow two-tier generation of those pairs. If only inputs are provided, a complete typology of outputs is provided.
-#Takes arguments: input filename; model filename
-
-infilename = sys.argv[-1]
-if ".csv" in infilename:
- infilename = infilename[:-4]
-model = sys.argv[-2]
-
-log = open(model+"_twotier.csv",'w')
-log.write("input form, intermediate form, output form, erc for cyclic eval, erc for second eval\n")
-
-infile = open(infilename + ".csv", 'r')
-ios = []
-combinations = []
-con = []
-failed = []
-
-for line in infile: #read the input
- if (line[0] == "#"):
- continue
- line = line[:-1]
- pair = line.rsplit(", ") #build pairs of IO mappings
- if len(pair)!= 2:
- continue
- pair.extend([[],[],[],False, re.sub("(\\\#)|(\\\.)", "", pair[1])]) #checked, intermediates, interm ercs, locsat, target
- ios.append(pair)
-
-loutput = open("twotier_temp.csv",'w')
-for pair in ios:
- loutput.write(pair[0]+'\n') #write inputs for the first tier
-loutput.close()
-
-cmd = "pyphon_recurse.py " + model + " " + "twotier_temp" #run the first tier
-print cmd
-os.system(cmd)
+from pyphon import *
+import pyphon_recurse, pyphon_maketableaux, pyphon_generate
+
+usage = \
+"""
+pyphon_twotier.py modelfile inputfile
+
+Performs two-tier OT evaluation: If an input file contains comma-separated input output pairs, then this finds intermediate forms and ERCs/rankings that can allow two-tier generation of those pairs. If only inputs are provided, a complete typology of outputs is provided.
+
+The model filename must have no suffix, and must refer both to a model file (.mod), as built by pyphon_makemodel.py, and to a model description file (.csv).
+
+Input forms with affixes must me marked as follows, with \. dividing affixes and \# delimiting the stem, as in:
+
+a\.b\.c\#de\#f\.g
+
+Output forms should be bare segments:
+
+abcdejg
+
+"""

-#scan the resulting typology
-infile = open(model + "_twotier_temp_recursive_typology.csv", 'r')
-
-loutput = open("twotier_temp.csv",'w')
-
-for line in infile: #take these as inputs for MakeTab
- line = line[:-1]
- if (line[0] == ","):
- continue
- print line
- sectioned = line.rsplit(', ')
-
- for pair in ios:
- if sectioned[0] != pair[0]:
+
+
+def main(model, infilename):
+ IO = True
+ if ".csv" in infilename:
+ infilename = infilename[:-4]
+
+ log = open(model+"_twotier.csv",'w')
+ log.write("input form, intermediate form, output form, erc for cyclic eval, erc for second eval\n")
+
+ infile = open(infilename + ".csv", 'r')
+ ios = []
+ combinations = []
+ con = []
+ failed = []
+
+ for line in infile: #read the input
+ if (line and line[0] == "#") or len(line)<2:
continue
-
- pair[3].append(sectioned[1])
- pair[4].append(sectioned[2])
-
- #produce input line
- loutput.write(sectioned[1]+'\n')
-
-loutput.close()
-infile.close()
-
-cmd = "pyphon_maketableaux.py " + model + " twotier_temp.csv OT twotier_tab_temp.csv"
-print cmd
-os.system(cmd)
-
-#Read tableau
-infile = open("twotier_tab_temp.csv", 'r')
-
-for line in infile: #take these as inputs for MakeTab
- line = line[:-1]
- if (con == [] and ",,," in line):
- sectioned = line.rsplit(',')
- con = sectioned[3:]
- continue
- if(line[0]==","):
- continue
-
-print "Twotier Read CON:", con
-
-infile.close()
-
-cmd = "pyphon_generate.py twotier_tab_temp twotier_temp OT"
-print cmd
-os.system(cmd)
-
-infile = open("twotier_temp.csv", 'r')
-
-for line in infile: #scan the intermediate-final form pairs
- line = line[:-1]
- if(line[0]=="/"):
- seg = line.rsplit(",")
- n = re.search('(?<=/).*(?=/)', seg[0])
- input = n.group(0)
- n = re.search('(?<=\[).*(?=\])', seg[1])
- output = re.sub("(#)|(\.)", "", n.group(0))
-
+ line = line.replace("\r","")
+ line = line.replace("\n","")
+ pair = line.rsplit(", ") #build pairs of input-output mappings (or input-blank mappings)
+ if len(pair)>2:
+ continue
+ if len(pair)==1:
+ IO = False
+ pair.append("")
+ print pair
+ pair.extend([[],[],[],False, re.sub("(\\\#)|(\\\.)", "", pair[1])]) #checked, intermediates, interm ercs, locsat, target
+ ios.append(pair)
+
+ if not IO:
+ print "Output forms not found. Generating typology."
+
+ loutput = open("twotier_temp.csv",'w')
+ for pair in ios:
+ loutput.write(pair[0]+'\n') #write inputs for the first tier
+ loutput.close()
+
+ pyphon_recurse.main(model, "twotier_temp")
+
+ #scan the resulting typology
+ infile = open(model + "_twotier_temp_recursive_typology.csv", 'r')
+
+ loutput = open("twotier_temp.csv",'w')
+
+ for line in infile: #take these as inputs for MakeTab
+ line = line.replace("\r","")
+ line = line.replace("\n","")
+ if (line and line[0] == ","):
+ continue
+ print line
+ sectioned = line.rsplit(', ')
+
+ for pair in ios:
+ if sectioned[0] != pair[0]:
+ continue
+
+ pair[3].append(sectioned[1])
+ pair[4].append(sectioned[2])
+
+ #produce input line
+ loutput.write(sectioned[1]+'\n')
+
+ loutput.close()
+ infile.close()
+
+ pyphon_maketableaux.main(model, "twotier_temp", "ot", "twotier_tab_temp")
+
+ #Read tableau
+ infile = open("twotier_tab_temp.csv", 'r')
+
+ for line in infile: #take these as inputs for MakeTab
+ line = line.replace("\r","")
+ line = line.replace("\n","")
+ if (con == [] and ",,," in line):
+ sectioned = line.rsplit(',')
+ con = sectioned[3:]
+ continue
+ if(line and line[0]==","):
+ continue
+
+ print "Twotier Read CON:", con
+
+ infile.close()
+
+ pyphon_generate.main("twotier_tab_temp", "twotier_temp", "ot")
+
+ infile = open("twotier_temp.csv", 'r')
+
+ for line in infile: #scan the intermediate-final form pairs
+ line = line.replace("\r","")
+ line = line.replace("\n","")
+ if(line and line[0]=="/"):
+ seg = line.rsplit(",")
+ n = re.search('(?<=/).*(?=/)', seg[0])
+ input = n.group(0)
+ n = re.search('(?<=\[).*(?=\])', seg[1])
+ output = re.sub("(#)|(\.)", "", n.group(0))
+
+
+
+ if len(seg)>len(con)+2:
+ ercs = ",".join(seg[(len(con)+1):])
+ localercs=[]
+ for n in re.finditer('[wel]+', ercs):
+ localercs.append(n.group(0))
+ localerc = "_".join(localercs)
+ else:
+ localerc = 'e'*len(con)
+
+ for pair in ios:
+ if IO and pair[6]==output and [input, output] not in pair[2]:
+ pair[2].append([input, output])
+ found = False
+ for i in range(len(pair[3])):
+ if input == pair[3][i]:
+ initialerc = pair[4][i]
+ if initialerc == '0':
+ initialerc = 'e'*len(con)
+ found = True
+ break
+ if not found:
+ continue
+
+ #Found an I-I-O mapping!
+ pair[5] = True
+ #print "Found",pair[0], input, output,initialerc,localerc,"\n"
+ log.write(pair[0]+","+input+","+output+","+initialerc+","+localerc+'\n')
+ combinations.append([pair[0], pair[6], output, initialerc, localerc])
+
+ if not IO and [input, output] not in pair[2]:
+ pair[2].append([input, output])
+ found = False
+ for i in range(len(pair[3])):
+ if input == pair[3][i]:
+ initialerc = pair[4][i]
+ if initialerc == '0':
+ initialerc = 'e'*len(con)
+ found = True
+ break
+ if not found:
+ continue
+
+ #Found an I-I-O mapping!
+ pair[5] = True
+ print "Found",pair[0], input, output,initialerc,localerc,"\n"
+ log.write(pair[0]+","+input+","+output+","+initialerc+","+localerc+'\n')
+ combinations.append([pair[0], pair[6], output, initialerc, localerc])
+
+ for pair in ios:
+ if not pair[5]:
+ failed.append(pair)
+
+ infile.close()
+
+ log.close()
+
+ if failed==[]: #If we have a complete set, send it to the language-finder
+ twotierlangs.find2tlangs(model+"_twotier.csv")
+ print "I-I-O sets written to "+model+"_twotier.csv"
+ print "Language definitions written to "+model+"_twotier_languages.csv"
+ else:
+ print "Some IO pairs could not be generated:", failed
+
+ os.remove('twotier_temp.csv')
+ os.remove('twotier_tab_temp.csv')
+ os.remove(model + "_twotier_temp_recursive_typology.csv")

- if len(seg)>len(con)+2:
- ercs = ",".join(seg[(len(con)+1):])
- localercs=[]
- for n in re.finditer('[wel]+', ercs):
- localercs.append(n.group(0))
- localerc = "_".join(localercs)
- else:
- localerc = 'e'*len(con)
-
- for pair in ios:
- if pair[6]==output and [input, output] not in pair[2]:
- pair[2].append([input, output])
- found = False
- for i in range(len(pair[3])):
- if input == pair[3][i]:
- initialerc = pair[4][i]
- if initialerc == '0':
- initialerc = 'e'*len(con)
- found = True
- break
- if not found:
- continue
-
- #Found an I-I-O mapping!
- pair[5] = True
- print "Found",pair[0], input, output,initialerc,localerc,"\n"
- log.write(pair[0]+","+input+","+output+","+initialerc+","+localerc+'\n')
- combinations.append([pair[0], pair[6], output, initialerc, localerc])
-
-for pair in ios:
- if not pair[5]:
- failed.append(pair)
-
-infile.close()
-
-log.close()
-
-if failed==[]:
- twotierlangs.find2tlangs(model+"_twotier.csv")
- print "I-I-O sets written to "+model+"_twotier.csv"
- print "Language definitions written to "+model+"_twotier_languages.csv"
-else:
- print "Some IO pairs could not be generated:", failed
+
+if __name__ == "__main__":
+ parser = OptionParser(usage=usage)
+ parser.add_option('-V', '--version', action='store_true', dest='version',
+ default=False,
+ help='Print pyPhon version and quit.')
+ p_options, args = parser.parse_args()
+
+ if p_options.version:
+ print 'pyPhon version %s.' % pyphon.__version__
+ raise SystemExit
+
+ if len(args) < 2:
+ print usage
+ elif len(args) < 3:
+ main(args[0], args[1])
+ else:
+ print usage
+

Reply all

Reply to author

Forward

0 new messages