[clml] r127 committed - Tuning up the two-tier code.

0 views

Skip to first unread message

cl...@googlecode.com

unread,

Jun 28, 2011, 7:12:43 PM (6/28/11)

to cl...@googlegroups.com

Revision: 127
Author: sbo...@isetsu.net
Date: Tue Jun 28 16:12:00 2011
Log: Tuning up the two-tier code.
http://code.google.com/p/clml/source/detail?r=127

Modified:
/trunk/pyphon/src/permutationdistance.py
/trunk/pyphon/src/pyphon/bitERCs.py
/trunk/pyphon/src/pyphon_find2tlanguages.py
/trunk/pyphon/src/pyphon_recurse.py
/trunk/pyphon/src/pyphon_twotier.py

=======================================
--- /trunk/pyphon/src/permutationdistance.py Thu Feb 10 14:55:26 2011
+++ /trunk/pyphon/src/permutationdistance.py Tue Jun 28 16:12:00 2011
@@ -21,7 +21,7 @@
return False
return True

-def getrankings(ercset):
+def getrankings(ercset, n=0):
"""Get all of the rankings consistent with an erc set"""
pset = itertools.permutations(range(len(ercset[0])))
psetl = list(pset)
@@ -33,6 +33,8 @@
okay = False
if okay:
ret.append(p)
+ if n!=0 and len(ret)>n-1:
+ return ret

return ret

@@ -121,7 +123,6 @@

#Get permutation sets, compute distances, find min

-
#r1 = getrankings(ercset1)
#r2 = getrankings(ercset2)

=======================================
--- /trunk/pyphon/src/pyphon_find2tlanguages.py Fri Feb 11 12:18:05 2011
+++ /trunk/pyphon/src/pyphon_find2tlanguages.py Tue Jun 28 16:12:00 2011
@@ -5,10 +5,16 @@
import sys
from pyphon import *
from permutationdistance import getrankings, set_dif
+#from temperctools import cover_dist
+from permutationdistance import getrankings
+

infilename = sys.argv[-1]
if ".csv" in infilename:
infilename = infilename[:-4]
+
+log = open(infilename+"_languages.csv",'w')
+

infile = open(infilename + ".csv", 'r')

@@ -37,14 +43,25 @@

#next_input()

-def print_languages(languages):
+def print_languages(languages, log, i):
for language in languages:
- print "Language:"
- print language[0]
- print language[1]
+ print "Language:", i
+ log.write("Language "+str(i)+"\n")
+ print "Cyclic ERCs", language[0]
+ log.write(str(language[0]))
+ log.write("\n")
+ rs=getrankings(language[0], 10)
+ print "Sample Cyclic Rankings", rs
+ log.write(str(language[1]))
+ log.write("\n")
+ print "Postlex ERCs", language[1]
+ rs=getrankings(language[1], 10)
+ print "Sample Postlex Rankings", rs
+
for member in language[2]:
print member[0], member[1], member[2]
- #print language[3], language[4]
+ log.write(str(member[0])+" "+str(member[1])+" "+str(member[2])+"\n")
+ #print language[3]

def compatible_languages(cohort, languages):
retlanguages = []
@@ -58,28 +75,28 @@
es1 = RNF.RNF(language[0]+member[3].rsplit("_"))
es2 = RNF.RNF(language[1]+member[4].rsplit("_"))

- #are the above really equivalent to concatenate+simplify
+ #are the above really equivalent to concatenate + simplify?

es1 = language[0]+member[3].rsplit("_")

es2 = language[1]+member[4].rsplit("_")

for erc in es1:
- if not "w" in erc:
+ if not "l" in erc:
es1.remove(erc)

for erc in es2:
- if not "w" in erc:
+ if not "l" in erc:
es2.remove(erc)

if (RNF.consistent(es1) and RNF.consistent(es2)):
#print "success"
retlanguages.append([es1, es2, language[2]+[member]])
- else:
+ #else:
#print "fail"
- print RNF.consistent(es1), es1
- print RNF.consistent(es2), es2
+ #print RNF.consistent(es1), es1
+ #print RNF.consistent(es2), es2

todelete = []
@@ -100,15 +117,21 @@
languages = []

for cohort in cohorts:
- print "Adding cohort with", len(languages), "languages:",
cohort[0][0], "->", cohort[0][2]
- #print_languages(languages)
languages = compatible_languages(cohort, languages)
+ print "Adding cohort with", len(languages), "languages:", cohort[0][0]
+ #print_languages(languages)
if len(languages)==0:
print "No languages found. Sorry!"
break

-
annotated_languages = []
+i=0
+for language in languages:
+ i+=1
+ #language.append(cover_dist(language[0], language[1]))
+ annotated_languages.append(language)
+ print_languages([language], log, i)
+
#for language in languages:
# r1 = getrankings(language[0])
# r2 = getrankings(language[1])
@@ -117,4 +140,3 @@
# language.append(sd[1])
# annotated_languages.append(language)
# print_languages([language])
-print_languages(languages)
=======================================
--- /trunk/pyphon/src/pyphon_recurse.py Thu Feb 10 14:55:26 2011
+++ /trunk/pyphon/src/pyphon_recurse.py Tue Jun 28 16:12:00 2011
@@ -1,5 +1,10 @@
#!/usr/bin/env python
-import sys, os, re, string
+
+import sys, os, re, string, copy
+sys.path[0:0] = 'src' # puts the /foo directory at the start of your path
+from pyphon_maketableaux import *
+from pyphon_generate import *
+import pyphon_maketableaux, pyphon_generate

usage = \
"""pyphon_recurse.py modelfile inputfile
@@ -14,165 +19,198 @@

The output defaults to inputfile-modelfile-recursive_typology.csv.
"""
-
-def process_typology(output, fstem, laffixes_rem, raffixes_rem):
- 'Read the typology file and pull out (output, erc) pairs'
- infile = open(infilename + "_ro.csv", 'r')
- cache = []
- erc = ""
-
- for line in infile:
-
- if "Input,Output," in line:
- sectioned = line.rsplit(',')
- con = sectioned[2:]
-
- if "ERC Set:," in line:
- n = re.search('(?<=\{).*?(?=\})', line)
- ercset = n.group(0)
- ercs=[]
-
- for n in re.finditer('[wel]+', ercset):
- if n.group(0) not in ercs:
- ercs.append(n.group(0))
-
- erc = "_".join(ercs)
-
- elif "/,[" in line:
- line = line[:-1]
- print "Read output:", line
-
- for m in re.finditer('(?<=\[).*?(?=\])', line):
- fstem_out = re.sub('[\#\.]','',m.group(0))
- if not erc:
- erc = "0"
- cache.append([fstem_out, erc])
- infile.close()
- for pair in cache:
- process_erc(output, pair[0], pair[1], laffixes_rem, raffixes_rem, con)
-
-def process_erc(output, stem, erc, laffixes, raffixes, con):
- 'Take (output, erc) pairs, add affixes, rinse, repeat'
- laffixes_rem = laffixes[:];
- raffixes_rem = raffixes[:];
-
- la = ra = ''
- if laffixes_rem: la = laffixes_rem.pop()
- if raffixes_rem: ra = raffixes_rem.pop()
-
- if not (la + ra):
- output.write(stem+', '+erc+'\n')
+
+def process(output, con, words):
+ 'Run one tier'
+ loutput = open(infilename + "_ri.csv",'w')
+
+ wordsrem = []
+
+ for word in words:
+ stem = word[0]
+ laffixes = word[1]
+ raffixes = word[2]
+ input = word[3]
+ erc = word[4]
+ interm = copy.deepcopy(word[5])
+ interm.append(stem)
+ laffixes_rem = laffixes[:]
+ raffixes_rem = raffixes[:]
+
+ la = ra = ''
+ if laffixes_rem:
+ la = laffixes_rem.pop()
+ if raffixes_rem:
+ ra = raffixes_rem.pop()
+ if not (la + ra):
+ output.write(input+', '+stem+', '+erc+', '+('>'.join(word[5]))+'\n')
+ continue
+ else:
+
+ word[1] = laffixes_rem
+ word[2] = raffixes_rem
+
+
+ fstem = la + "#" + stem + "#" + ra
+ stem = la + "\#" + stem + "\#" + ra
+
+ # Produce the stem input
+ # print "writing", stem, infilename
+ loutput.write(stem+'\n')
+ word[5] = interm
+ word[0] = stem
+ wordsrem.append(word)
+
+ loutput.close()
+
+ if wordsrem==[]:
return
- # Show intermediate stages
- #else:
- #output.write(stem+', '+erc+':\n')
-
- fstem = la + "#" + stem + "#" + ra
- stem = la + "\#" + stem + "\#" + ra
-
- print "\n\nRecurse is processing the input:", fstem, "(remaining
affixes:", laffixes_rem, raffixes_rem,"ERCs: "+erc+")"
-
-
- # Produce the stem input
- loutput = open(infilename + "_ri.csv",'w')
- loutput.write(stem+'\n')
- loutput.close()
-
- # Generate ERCs
- if erc != "0":
- loutput = open('ercs.csv','w')
- loutput.write('\nERCS, gram\n')
- w=[]
- l=[]
- e=[]
- i=0
- ci=0
- while 1:
- #order W L E
- if i==len(erc):
- break
- if erc[i] == 'w':
- w.append(con[ci])
- elif erc[i] == 'l':
- l.append(con[ci])
- elif erc[i] == 'e':
- e.append(con[ci])
- elif erc[i] == '_':
- loutput.write((' '.join(w)) + ", " + (' '.join(l)) + ", " +
(' '.join(e)) + "\n")
- w=[]
- l=[]
- e=[]
- ci=-1
- i=i+1
- ci=ci+1
-
-
- loutput.write((' '.join(w)) + ", " + (' '.join(l)) + ", " +
(' '.join(e)) + "\n\nERCS, blank\n,,\n")
- loutput.close()
-
-
- cmd = "pyphon_maketableaux.py " + model + " " + infilename + "_ri OT"
- print cmd
- os.system(cmd)
-
- cmd = "pyphon_generate.py " + model + "-" + infilename + "_ri-OT " +
infilename + "_ro.csv OT"
- if erc != "0":
- cmd = cmd + " -f ercs -g gram"
- print cmd
- os.system(cmd)
-
- cmd = "cat " + infilename + "_ro.csv"
- print cmd
- os.system(cmd)
-
- process_typology(output, fstem, laffixes_rem, raffixes_rem)
-
+
+ pyphon_maketableaux.main(model, infilename+"_ri", "ot", "temptab")
+
+ cache = []
+
+ for word in wordsrem:
+ erc = word[4]
+ stem = word[0]
+
+ # Generate ERCs
+ if erc != "0":
+ eoutput = open('ercs.csv','w')
+ eoutput.write('\nERCS, gram\n')
+ w=[]
+ l=[]
+ e=[]
+ i=0
+ ci=0
+ while 1:
+ #order W L E
+ if i==len(erc):
+ break
+ if erc[i] == 'w':
+ w.append(con[ci])
+ elif erc[i] == 'l':
+ l.append(con[ci])
+ elif erc[i] == 'e':
+ e.append(con[ci])
+ elif erc[i] == '_':
+ eoutput.write((' '.join(w)) + ", " + (' '.join(l)) + ", " +
(' '.join(e)) + "\n")
+ w=[]
+ l=[]
+ e=[]
+ ci=-1
+ i=i+1
+ ci=ci+1
+
+
+ eoutput.write((' '.join(w)) + ", " + (' '.join(l)) + ", " +
(' '.join(e)) + "\n\nERCS, blank\n,,\n")
+ eoutput.close()
+
+ pyphon_generate.main("temptab.csv", "temptyp", "ot", "ercs", "gram")
+ else:
+ pyphon_generate.main("temptab.csv", "temptyp", "ot")
+ #Here we generate a typology for all of the inputs when we only need
one. It may shave runtime to limit this, but not by much.
+
+
+ infile = open("temptyp.csv", 'r')
+ erc = ""
+
+
+ for line in infile:
+
+ #if "Input,Output," in line:
+ # sectioned = line.rsplit(',')
+ # con = sectioned[2:]
+
+ if line[0]=="/":
+ line = line[:-1]
+
+ n = re.search('(?<=/).+(?=/)', line)
+ input = n.group(0)
+
+ stem = re.sub('(\\\)','',stem)
+ if input != stem:
+ continue
+
+ n = re.search('(?<=\{).*?(?=\})', line)
+ if n:
+ ercset = n.group(0)
+ else:
+ ercset = ""
+
+ ercs=[]
+
+ for n in re.finditer('[wel]+', ercset):
+ if n.group(0) not in ercs:
+ ercs.append(n.group(0))
+ if word[4]!="0":
+ ercs.append(word[4])
+
+ if ercs:
+ erc = "_".join(ercs)
+ else:
+ erc = "0"
+
+ for m in re.finditer('(?<=\[).*?(?=\])', line):
+ fstem_out = re.sub('[\#\.]','',m.group(0))
+ candidate = [fstem_out, word[1], word[2], word[3], erc, word[5]]
+ if candidate not in cache:
+ cache.append(candidate)
+ infile.close()
+ loutput.close()
+ process(output, con, cache)
+ #process_erc(output, con, words)
+
+
+
infilename = sys.argv[-1]
if ".csv" in infilename:
infilename = infilename[:-4]
model = sys.argv[-2]

# Read the first line of the input file. Hopefully this is the only line.
-input = open(infilename + ".csv", 'r').readline()
-input = input.rstrip("\n")
-print "Recurse over:", input, ""
-
-# Separate the affixes
-sectioned = input.rsplit('\#')
-if len(sectioned)>1:
- stem = sectioned[1];
- laffixes = sectioned[0].rsplit('\.') # Affixes are ordered by distance,
not orthography, so reverse the left
- raffixes = (sectioned[2].rsplit('\.'))[::-1]
-else:
- stem = sectioned[0] #robustness to inputs without morpheme boundries
- laffixes = raffixes = []
+infile = open(infilename + ".csv", 'r')
+print "Recurse over:"
+
+inputs = []
+
+for line in infile:
+ line = line.rstrip("\n")
+
+ # Separate the affixes
+ sectioned = line.rsplit('\#')
+ if len(sectioned)>1:
+ stem = sectioned[1];
+ laffixes = sectioned[0].rsplit('\.') # Affixes are ordered by distance,
not orthography, so reverse the left
+ raffixes = (sectioned[2].rsplit('\.'))[::-1]
+ else:
+ stem = sectioned[0] #robustness to inputs without morpheme boundries
+ laffixes = raffixes = []
+ inputs.append([stem, laffixes, raffixes, line, '0', []])

#print stem, laffixes, raffixes

# Produce the stem input
output = open(infilename + '_ri.csv','w')
-output.write('\#'+stem+'\#\n')
+for word in inputs:
+ output.write('\#'+word[0]+'\#\n')
output.close()

-# Process the stem
-print "\n\nRecurse is processing the stem:", stem, ""
-
-cmd = "pyphon_maketableaux.py " + model + " " + infilename + "_ri OT"
-print cmd
-os.system(cmd)
-
-cmd = "pyphon_generate.py " + model + "-" + infilename + "_ri-OT " +
infilename + "_ro.csv OT"
-print cmd
-os.system(cmd)
-
-cmd = "cat " + infilename + "_ro.csv"
-print cmd
-os.system(cmd)
-
output = open(model + '_' + infilename + '_recursive_typology.csv','w')
-output.write('Input, '+input+"\n")
-
-process_typology(output, stem, laffixes, raffixes)
+#output.write('Input, '+input+"\n")
+
+infile = open(model + ".csv", 'r')
+for line in infile:
+ if line[0:3]=="CON":
+ line = line[:-1]
+ sectioned = line.rsplit(',')
+ con = sectioned[2:]
+ for i in range(len(con)):
+ if con[i][0]==' ':
+ con[i]=con[i][1:]
+infile.close()
+
+process(output, con, inputs)
output.close()

=======================================
--- /trunk/pyphon/src/pyphon_twotier.py Thu Feb 10 14:55:26 2011
+++ /trunk/pyphon/src/pyphon_twotier.py Tue Jun 28 16:12:00 2011
@@ -3,40 +3,6 @@
sys.path[0:0] = 'src' # puts the /foo directory at the start of your path
from pyphon import *

-def writeercs(erc, con, filename):
- # Generate ERCs - not yet used here
- if erc != "0":
- loutput = open(filename+'.csv','w')
- loutput.write('\nERCS, gram\n')
- w=[]
- l=[]
- e=[]
- i=0
- ci=0
- while 1:
- #order W L E
- if i==len(erc):
- break
- if erc[i] == 'w':
- w.append(con[ci])
- elif erc[i] == 'l':
- l.append(con[ci])
- elif erc[i] == 'e':
- e.append(con[ci])
- elif erc[i] == '_':
- loutput.write((' '.join(w)) + ", " + (' '.join(l)) + ", " +
(' '.join(e)) + "\n")
- w=[]
- l=[]
- e=[]
- ci=-1
- i=i+1
- ci=ci+1
-
-
- loutput.write((' '.join(w)) + ", " + (' '.join(l)) + ", " + (' '.join(e))
+ "\n\nERCS, blank\n,,\n")
- loutput.close()
-
-
infilename = sys.argv[-1]
if ".csv" in infilename:
infilename = infilename[:-4]
@@ -49,6 +15,8 @@
ios = []
combinations = []
con = []
+failed = []
+
for line in infile:
if (line[0] == "#"):
continue
@@ -59,9 +27,11 @@
ios.append(pair)

for pair in ios: #For each IO pair
+ checked = []
intermediates = []
intermediate_ercs = []
- target = re.sub("\\\#", "", pair[1])
+ locally_satisfied = False
+ target = re.sub("(\\\#)|(\\\.)", "", pair[1])
print "Analyzing pair", pair[0], target

@@ -85,10 +55,13 @@
print line
sectioned = line.rsplit(', ')

+ if sectioned[0] in intermediates:
+ print 'hrnh?'
+
intermediates.append(sectioned[0])
- intermediate_ercs.append(sectioned[1])
-
- print "GotInterm", sectioned[1]
+ intermediate_ercs.append(sectioned[1])
+
+ #print "Twotier got intermmediate form", sectioned[0], sectioned[1]

#produce input line
loutput.write(sectioned[0]+'\n')
@@ -142,27 +115,37 @@
localercs.append(n.group(0))
localerc = "_".join(localercs)
else:
- print seg[-1]
localerc = 'e'*len(con)

- if target==output:
+ if target==output and [input, output] not in checked:
+ checked.append([input, output])
for i in range(len(intermediates)):
if input == intermediates[i]:
initialerc = intermediate_ercs[i]
if initialerc == '0':
initialerc = 'e'*len(con)
break
-
- print "Found output!",pair[0], input, output,initialerc,localerc,"\n"
+
+ #Found an I-I-O mapping!
+ locally_satisfied = True
+ #print "Found output!",pair[0], input, output,initialerc,localerc,"\n"
+ print line

log.write(pair[0]+","+input+","+output+","+initialerc+","+localerc+'\n')
combinations.append([pair[0], target, output, initialerc, localerc])
initialerc = re.sub("_", ", ", pair[1])
continue

-
-
+ if not locally_satisfied:
+ failed.append(pair)

infile.close()

log.close()
+
+if failed==[]:
+ cmd = "pyphon_find2tlanguages.py "+model+"_twotier.csv"
+ print cmd
+ os.system(cmd)
+else:
+ print "Some IO pairs could not be generated:", failed

Reply all

Reply to author

Forward

0 new messages