;; Load the PigPen Pig-platform support, the core query API, and the
;; Parquet loader/storer namespaces.
(require '[pigpen.pig])
(require '[pigpen.core :as pig])
(require '[pigpen.parquet :as pqt])
;; Write a two-row, three-column tab-separated test fixture to disk.
(spit "/tmp/test.tsv" "1\t2\t3\n4\t5\t6\n")
(defn copy-to-parquet
  "Builds a PigPen query that reads tab-separated rows from `input` and
  stores them as Parquet at `output` with three binary columns v1/v2/v3.

  `pig/load-tsv` produces each row as a *vector* ([v1 v2 v3]), while the
  Parquet storer selects fields by the keyword names in the schema
  (:v1 :v2 :v3). Applying a keyword lookup to a vector fails at runtime
  with `IllegalArgumentException: Key must be integer`, so each row
  vector must first be converted into a map keyed by the schema field
  names."
  [input output]
  (->> (pig/load-tsv input)
       ;; vector row -> map whose keys match the Parquet schema fields
       (pig/map (fn [[v1 v2 v3]] {:v1 v1, :v2 v2, :v3 v3}))
       (pqt/store-parquet
         output
         (pqt/message "test"
           (pqt/binary "v1")
           (pqt/binary "v2")
           (pqt/binary "v3")))))
;; NOTE: calling copy-to-parquet by itself only *builds* the query value —
;; nothing is executed, which is why this call alone doesn't produce an
;; output file. The query has to be handed to an executor such as
;; pigpen.pig/write-script (below) or pig/dump.
(copy-to-parquet "/tmp/test.tsv" "/tmp/test.pq")
;; Generate a Pig Latin script for the query; running that script with
;; Pig is what actually materializes /tmp/test.pq.
(pigpen.pig/write-script
"my-script.pig"
(copy-to-parquet "/tmp/test.tsv" "/tmp/test.pq"))
Caused by: pigpen.PigPenException: java.lang.IllegalArgumentException: Key must be integer
Caused by: java.lang.IllegalArgumentException: Key must be integer
at clojure.lang.APersistentVector.invoke(APersistentVector.java:284)
at clojure.core$map$fn__4553.invoke(core.clj:2622)
at clojure.lang.LazySeq.sval(LazySeq.java:40)
at clojure.lang.LazySeq.seq(LazySeq.java:49)
at clojure.lang.RT.seq(RT.java:507)
at clojure.core$seq__4128.invoke(core.clj:137)
at clojure.core$apply.invoke(core.clj:630)
at pigpen.pig.runtime$eval9900$fn__9901$fn__9902.invoke(runtime.clj:266)
at pigpen.runtime$process__GT_bind$fn__6872$fn__6873.invoke(runtime.clj:152)
at pigpen.runtime$keyword_field_selector__GT_bind$fn__6858$fn__6860.invoke(runtime.clj:125)
at pigpen.runtime$map__GT_bind$fn__6838$fn__6839.invoke(runtime.clj:48)
at pigpen.runtime$process__GT_bind$fn__6872$fn__6873.invoke(runtime.clj:152)
at pigpen.pig.runtime$eval_udf.invoke(runtime.clj:153)
at clojure.lang.Var.invoke(Var.java:383)
at pigpen.PigPenFn.exec(PigPenFn.java:63)
at pigpen.PigPenFn.exec(PigPenFn.java:39)
at org.apache.pig.backend.hadoop.executionengine.physicalLayer.expressionOperators.POUserFunc.getNext(POUserFunc.java:345)
at org.apache.pig.backend.hadoop.executionengine.physicalLayer.expressionOperators.POUserFunc.getNextDataBag(POUserFunc.java:389)
at org.apache.pig.backend.hadoop.executionengine.physicalLayer.PhysicalOperator.getNext(PhysicalOperator.java:310)
at org.apache.pig.backend.hadoop.executionengine.physicalLayer.relationalOperators.POForEach.processPlan(POForEach.java:378)
at org.apache.pig.backend.hadoop.executionengine.physicalLayer.relationalOperators.POForEach.getNextTuple(POForEach.java:298)
at org.apache.pig.backend.hadoop.executionengine.physicalLayer.PhysicalOperator.processInput(PhysicalOperator.java:281)
at org.apache.pig.backend.hadoop.executionengine.physicalLayer.relationalOperators.POForEach.getNextTuple(POForEach.java:242)
at org.apache.pig.backend.hadoop.executionengine.mapReduceLayer.PigGenericMapBase.runPipeline(PigGenericMapBase.java:282)
at org.apache.pig.backend.hadoop.executionengine.mapReduceLayer.PigGenericMapBase.map(PigGenericMapBase.java:277)
at org.apache.pig.backend.hadoop.executionengine.mapReduceLayer.PigGenericMapBase.map(PigGenericMapBase.java:64)
at org.apache.hadoop.mapreduce.Mapper.run(Mapper.java:144)
at org.apache.hadoop.mapred.MapTask.runNewMapper(MapTask.java:764)
at org.apache.hadoop.mapred.MapTask.run(MapTask.java:370)
at org.apache.hadoop.mapred.LocalJobRunner$Job.run(LocalJobRunner.java:212)

REGISTER pigpen.jar;
-- Load each line of the TSV file as a single chararray value.
load34545 = LOAD '/tmp/test.tsv' USING PigStorage('\n') AS (value:chararray);
-- UDF pipeline: split each line on tabs, then select fields by the
-- keywords :v1 :v2 :v3. The keyword lookup runs against the split-result
-- *vector*, which is where the "Key must be integer" exception is thrown
-- (see keyword_field_selector->bind in the stack trace above).
DEFINE udf34551 pigpen.PigPenFn('(clojure.core/require (quote [pigpen.runtime]) (quote [pigpen.extensions.core]))','(clojure.core/comp (pigpen.runtime/process->bind (pigpen.runtime/pre-process :pig :native)) (pigpen.runtime/map->bind (clojure.core/fn [s] (if s (pigpen.extensions.core/structured-split s "\\t")))) (pigpen.runtime/keyword-field-selector->bind [:v1 :v2 :v3]) (pigpen.runtime/process->bind (pigpen.runtime/post-process :pig :native)))');
-- Apply the UDF to each line, then flatten the result into three named
-- chararray columns.
project34549_0 = FOREACH load34545 GENERATE udf34551(value) AS (value0);
project34549 = FOREACH project34549_0 GENERATE FLATTEN(value0) AS (v1:chararray, v2:chararray, v3:chararray);
STORE project34549 INTO '/tmp/test.pq' USING parquet.pig.ParquetStorer();

:dependencies [[org.clojure/clojure "1.7.0"] [com.netflix.pigpen/pigpen-pig "0.3.1"] [com.netflix.pigpen/pigpen-parquet-pig "0.3.1"]]
:profiles {:dev {:dependencies [[org.apache.pig/pig "0.13.0"] [org.apache.hadoop/hadoop-core "1.1.2"]]}})

(defn copy-to-parquet
  "Builds a query that reads tab-separated rows from `input` and stores
  them at `output` as Parquet with three binary columns v1/v2/v3. Each
  row vector is converted to a map so its keys line up with the field
  names declared in the Parquet schema."
  [input output]
  (->> (pig/load-tsv input)
       ;; key each row by the schema's field names
       (pig/map (fn [[a b c]] {:v1 a, :v2 b, :v3 c}))
       (pqt/store-parquet
         output
         (pqt/message "test"
           (pqt/binary "v1")
           (pqt/binary "v2")
           (pqt/binary "v3")))))
--
You received this message because you are subscribed to the Google Groups "PigPen Support" group.
To unsubscribe from this group and stop receiving emails from it, send an email to pigpen-suppor...@googlegroups.com.
For more options, visit https://groups.google.com/d/optout.
This should be stated more explicitly in the documentation, perhaps with a simple example —
something like packaging an uberjar and then using `hadoop jar` to submit the job.
/tmp/test.pq
/tmp/test.pq/._SUCCESS.crc
/tmp/test.pq/.part-m-00000.parquet.crc
/tmp/test.pq/_SUCCESS
/tmp/test.pq/part-m-00000.parquet

(pig/dump
(pqt/load-parquet
"/tmp/test.pq"
(pqt/message "test"
(pqt/binary "v1")
(pqt/binary "v2")
(pqt/binary "v3"))))
InvalidInputException Input path does not exist: file:/tmp/test.pq/_SUCCESS org.apache.hadoop.mapreduce.lib.input.FileInputFormat.listStatus (FileInputFormat.java:235)
(pigpen.pig/write-script "my-script.pig" (pig/dump (load-from-parquet "/tmp/test.pq/part-m-00000.parquet")))

AssertionError Assert failed: Query was not a pigpen query
(->> query meta keys (some #{:pig :baked}))  pigpen.oven/bake (oven.clj:366)

(pigpen.pig/write-script
  "my-script.pig"
  (load-from-parquet "/tmp/test.pq/part-m-00000.parquet"))
-- Load the file as tab-delimited tuples; no schema is declared, so
-- fields are accessed positionally ($0, $1, ...).
A = LOAD 'myfile.txt' USING PigStorage('\t');
-- Print the relation's tuples to the console.
DUMP A;