Jaql really needs better pretty-printing, but it is
straightforward to do by hand if you have parenthesis
matching in your editor of choice. I reformatted the explain output and
added some comments to highlight possible trouble spots:
jaql> explain $scores -> write(hdfs("hdfs/nb/1c/
genre.colorInfo.certificate.keyword/scores-sum.bernoulli"));
(
// $signature is the path component spliced into the HDFS paths used by the
// jobs below. It is built by combining the four attribute names pairwise
// with strcat(a, ".", b) -- presumably yielding
// "genre.colorInfo.certificate.keyword"; confirm against system::combine.
$signature = ( system::const([
"genre",
"colorInfo",
"certificate",
"keyword"
])
-> system::combine(fn(a, b) (system::strcat(a, ".", b))) ),
// First map-reduce job: reads the docAttrValueTF file under the $signature
// directory and regroups it by docid, so that each output record holds one
// docid plus a nested "values" array of { attribute, value, TF } records.
// The result goes to a HadoopTemp output bound to $fd_0.
$fd_0 = system::mapReduce({
  ("input"):(system::hdfs(system::strcat("hdfs/nb/1c/", $signature,
      "/docAttrValueTF"))),
  ("output"):(system::HadoopTemp(schema=schema { "docid"?: any,
      "values":
        [ { "attribute"?: any, "value"?: any, "TF"?: any } ... ] })),
  // Map: emit [key, value] pairs keyed by the record's docid.
  // (The body must reference the declared parameter $mapIn0.)
  ("map"):(fn(schema [ any ... ] $mapIn0) (
      $mapIn0
      -> transform each $i0 ([{ ($i0).("docid") }, $i0]))),
  //-----------------------------------------------------------------
  // All the values for one group form a nested array, which can
  // cause performance issues if the arrays are large.
  // However, in this case, I believe that is exactly what you wanted
  // to do, and the nested arrays are modest in size.
  //-----------------------------------------------------------------
  ("reduce"): (fn(schema { "docid"?: any } keys, schema [ any ... ]
      values) (
      [{ (keys).*,
         ("values"): (values -> transform each $ ({ ($).("attribute"),
             ($).("value"), ($).("TF") })) }])),
  ("schema"):(system::const({ "key": schema { "docid"?: any },
      "value": schema any })),
  ("options"):(null) }),
system::mapReduce({
("input"):(system::hdfs(system::strcat("hdfs/nb/1c/",
$signature, "/userClassAttrValueDefaultProb"))),
("map"): (fn(schema [ any ... ] $mapIn) (
$mapIn
-> expand each u (
//-----------------------------------------------------------------
// For each outer element, the entire inner file is re-read.
//-----------------------------------------------------------------
system::read($fd_0)
-> transform each d (
{ (u).("userid"),
(u).("class"),
(d).("docid"),
("P"):((
//-----------------------------------------------------------------
// For each pair of values (u, d) [O(N^2) pairs], a join is
// performed, so the total work is at least O(N^3), possibly O(N^4).
// Of course, these are different Ns, so it might be OK, or it
// might take a really long time. Is there some way to reuse
// computations across these O(N^2) joins?
//-----------------------------------------------------------------
group each $join_in in
( u.defaults -> transform each
$defaults { defaults?: $defaults })
by $join_on =
[ $join_in.defaults.attribute, $join_in.defaults.value ]
as $as_0,
( d.values -> transform each $values
{ values?: $values } )
by [ $join_in.values.attribute,
$join_in.values.value ]
as $as_1
expand (
if( not (isnull ($join_on)) ) (
system::nullElementOnEmpty($as_1)
-> expand each $b (
$as_0 -> transform each $a
({ ($a).*, ($b).* })
)
) else (
$as_0 -> transform each $a
({ ($a).* })
))
-> transform each $ {
$.defaults.attribute,
$.defaults.value,
P: $.defaults.P,
TF: system::firstNonNull($.values.TF,
$.defaults.TF) }
-> transform each $ {
$.attribute,
$.value,
P:( if( $.TF > 0 ) $.P else 1-$.P ) }
-> group each $ by $__unused__ = null as $
expand (
$ -> aggregate as $ full [ $ ->
transform each $ $.P -> system::sum() ]
-> transform each $ system::index($,
0)
-> system::singleton()
))
}))
-> transform each $fv [null, $fv]
)),