Modified:
/branches/thefuture-modules/thebeast-apps/src/main/scala/org/riedelcastro/thebeast/apps/DependencyParsing.scala
=======================================
---
/branches/thefuture-modules/thebeast-apps/src/main/scala/org/riedelcastro/thebeast/apps/DependencyParsing.scala
Sun Mar 21 13:01:44 2010
+++
/branches/thefuture-modules/thebeast-apps/src/main/scala/org/riedelcastro/thebeast/apps/DependencyParsing.scala
Thu Mar 25 22:50:25 2010
@@ -6,58 +6,113 @@
import vectors.{Vector, VectorVar}
import org.riedelcastro.thebeast.solve.SumProductBeliefPropagation
import org.riedelcastro.thebeast.env.TheBeastImplicits._
+import io.Source
+import collection.mutable.{HashMap, ArrayBuffer}
/**
* Simple Dependency Parsing model
*/
object DependencyParsing extends TheBeastEnv {
+
+ val maxLength = 5
+
+ val Tokens = Ints(0 until maxLength)
+ val Words = new MutableValues[String]()
+ val Tags = new MutableValues[String]()
+ val length = Var("length", Ints(1 until maxLength))
+ val link = Predicate("link", Tokens x Tokens)
+ val word = Predicate("word", Tokens x Words)
+ val pos = Predicate("pos", Tokens x Tags)
+ val token = Predicate("token", Tokens)
+
+ val ROOT = "Root"
+
+ def asTokenProperties[T](seq:Seq[T]):Seq[(Int,T)] = {
+ for (i <- 0 until seq.size) yield i -> seq(i)
+ }
+
+ def asTokenProperties[T](root:T, seq:Seq[T]):Seq[(Int,T)] = {
+ Seq((0,root)) ++ (for (i <- 0 until seq.size) yield i -> seq(i))
+ }
+
+
+ def loadCoNLLFile(file: String): Seq[Env] = {
+ val result = new ArrayBuffer[Env]
+ val rows = new ArrayBuffer[Array[String]]
+ for (line <- Source.fromFile(file).getLines.map(_.trim)) {
+ if ("" == line) {
+ val env = new MutableEnv
+ env.atoms(token) ++= Seq(0) ++ rows.map(row => row(0).toInt)
+ env.atoms(word) ++= asTokenProperties(ROOT,rows.map(row => row(1)))
+ env.atoms(pos) ++= asTokenProperties(ROOT,rows.map(row => row(3)))
+ env.atoms(link) ++= rows.map(row => row(6).toInt->row(0).toInt)
+ } else {
+ rows += line.split("\\s+")
+ }
+ }
+ result
+ }
+
+ def trainNaively(data:Seq[Env]) : Map[(String,String),Double] = {
+ val childCounts = new HashMap[String,Int]
+ val edgeCounts = new HashMap[(String,String),Int]
+ for (env:Env <- data){
+ val tags = Map() ++ env(pos).getSources(Some(true))
+ for (edge <- env(link).getSources(Some(true))){
+ val childTag = tags(edge._2)
+ var headTag = tags(edge._1)
+ childCounts(childTag) = childCounts.getOrElse(childTag,0) + 1
+ edgeCounts(headTag->childTag) =
edgeCounts.getOrElse(headTag->childTag,0) + 1
+ }
+ }
+ Map() ++ (for (edge <- edgeCounts.keys) yield edge ->
edgeCounts(edge).toDouble / childCounts(edge._2))
+ }
+
+
def main(args: Array[String]): Unit = {
Logging.level = Logging.DEBUG
- val maxLength = 5
-
- val Tokens = Ints(0 until maxLength)
- val Words = new MutableValues[String]()
- val Tags = new MutableValues[String]()
- val length = Var("length", Ints(1 until maxLength))
- val link = Predicate("link", Tokens x Tokens)
- val word = Predicate("word", Tokens x Words)
- val pos = Predicate("pos", Tokens x Tags)
- val token = Predicate("token", Tokens)
+
//first order formulae
- val bias = vectorSum(Tokens, Tokens) {(h, m) =>
- $(link(h, m)) * unit("bias")}
- val wordPair = vectorSum(Tokens, Tokens, Words, Words) {(h, m, h_word,
m_word) =>
- $(word(h, h_word) && word(m, m_word) && link(h, m)) * unit(h_word,
m_word)}
- val posPair = vectorSum(Tokens, Tokens, Tags, Tags) {(h, m, h_pos,
m_pos) =>
- $(pos(h, h_pos) && pos(m, m_pos) && link(h, m)) * unit(h_pos, m_pos)}
+ val bias = vectorSum(Tokens, Tokens) {
+ (h, m) =>
+ $(link(h, m)) * unit("bias")
+ }
+ val wordPair = vectorSum(Tokens, Tokens, Words, Words) {
+ (h, m, h_word, m_word) =>
+ $(word(h, h_word) && word(m, m_word) && link(h, m)) * unit(h_word,
m_word)
+ }
+ val posPair = vectorSum(Tokens, Tokens, Tags, Tags) {
+ (h, m, h_pos, m_pos) =>
+ $(pos(h, h_pos) && pos(m, m_pos) && link(h, m)) * unit(h_pos,
m_pos)
+ }
val treeConstraint = SpanningTreeConstraint(link, token, 0,
LessThan(Tokens))
val weightVar = VectorVar("weights")
//val linearModel = ((wordPair + posPair + bias) dot weightVar) +
treeConstraint
- val linearModel = ((wordPair + posPair + bias) dot weightVar)
- val probModel = normalize(exp(linearModel) *
ptree(link,token,0,LessThan(Tokens)))
+ val linearModel = ((wordPair + posPair + bias) dot weightVar)
+ val probModel = normalize(exp(linearModel) * ptree(link, token, 0,
LessThan(Tokens)))
//some example data
val sentence1 = new MutableEnv
sentence1(length) = 5
sentence1.atoms(word) ++=
List("Root", "The", "man", "is", "fast").zipWithIndex.map(_.swap)
- sentence1.atoms(pos) ++=
List("Root", "DT", "NN", "VB", "AD").zipWithIndex.map(_.swap)
- sentence1.atoms(link) ++= List((0,3),(3,2),(3,4),(2,1))
+ sentence1.atoms(pos) ++=
List("Root", "DT", "NN", "VB", "AD").zipWithIndex.map(_.swap)
+ sentence1.atoms(link) ++= List((0, 3), (3, 2), (3, 4), (2, 1))
sentence1.atoms(token) ++= (0 until 5)
- sentence1.close(word,true)
- sentence1.close(pos,true)
- sentence1.close(link,true)
- sentence1.close(token,true)
+ sentence1.close(word, true)
+ sentence1.close(pos, true)
+ sentence1.close(link, true)
+ sentence1.close(token, true)
val weights = new Vector
weights("bias") = -2.0
- weights("NN","DT") = 1.0
- weights("VB","NN") = 1.0
+ weights("NN", "DT") = 1.0
+ weights("VB", "NN") = 1.0
weights("Root", "VB") = 1.0
weights("VB", "AD") = 1.0
@@ -77,10 +132,10 @@
var sum = 0.0
for (h <- 0 until 5; if (h != 1)) {
- sum += marginals.belief(FunAppVar(link,(h,1))).belief(true)
+ sum += marginals.belief(FunAppVar(link, (h, 1))).belief(true)
}
println(sum)
- println(ptree(link,token,0,LessThan(Tokens)).asLogic)
+ println(ptree(link, token, 0, LessThan(Tokens)).asLogic)
}