The input file is at /cdap/data/namespaces/testspace/data/file.txt. Below is the code:
import co.cask.cdap.api.Resources;
import co.cask.cdap.api.dataset.lib.FileSet;
import co.cask.cdap.api.dataset.lib.FileSetProperties;
import co.cask.cdap.api.spark.AbstractSpark;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;

public class AppSpark extends AbstractSpark {
  @Override
  protected void configure() {
    // FileSet "gold" reads text files under the base path "cdap".
    createDataset("gold", FileSet.class,
        FileSetProperties.builder()
            .setBasePath("cdap")
            .setInputFormat(TextInputFormat.class)
            .setOutputFormat(TextOutputFormat.class)
            .build());
    // FileSet "out" writes comma-separated text under the base path "result".
    // (SEPERATOR is Hadoop's own spelling of the TextOutputFormat separator key.)
    createDataset("out", FileSet.class,
        FileSetProperties.builder()
            .setOutputProperty(TextOutputFormat.SEPERATOR, ",")
            .setBasePath("result")
            .setInputFormat(TextInputFormat.class)
            .setOutputFormat(TextOutputFormat.class)
            .build());
    setName(AppSpark.class.getName());
    setDescription("Spark program to access HDFS data");
    setMainClass(ScalaSparkHDFS.class);
    setDriverResources(new Resources(1024));
    setExecutorResources(new Resources(1024, 2));
  }
}
import co.cask.cdap.api.spark.{SparkExecutionContext, SparkMain}
import org.apache.hadoop.io.{LongWritable, Text}
import org.apache.spark.SparkContext
import org.apache.spark.rdd.RDD

class ScalaSparkHDFS extends SparkMain {
  override def run(implicit sec: SparkExecutionContext): Unit = {
    val sc = new SparkContext
    // Read the "gold" FileSet as (file offset, line) pairs.
    val lines: RDD[(LongWritable, Text)] = sc.fromDataset("gold")
    val data = lines.map(line => (line._1, line._2))
    println(data.count())
    // Write the pairs to the "out" FileSet in the "nd2629" namespace.
    data.saveAsDataset("nd2629", "out")
  }
}
public class App extends AbstractApplication {
  @Override
  public void configure() {
    addSpark(new AppSpark());
  }
}
The following error is thrown:
SNAPSHOT.spark.sparkhdfs.AppSpark, runId=23ede9f3-5789-11e7-b613-acde48001122}
2017-06-22 15:27:02,870 - INFO [sparkDriverActorSystem-akka.actor.default-dispatcher-3:a.r.RemoteActorRefProvider$RemotingTerminator@74] - Shutting down remote daemon.
2017-06-22 15:27:02,875 - INFO [sparkDriverActorSystem-akka.actor.default-dispatcher-3:a.r.RemoteActorRefProvider$RemotingTerminator@74] - Remote daemon shut down; proceeding with flushing remote transports.
2017-06-22 15:27:02,892 - ERROR [SparkRunnersparkhdfs.AppSpark:c.c.c.i.a.r.ProgramControllerServiceAdapter@93] - Program terminated with exception java.util.concurrent.ExecutionException: java.io.IOException: No input paths specified in job at com.google.common.util.concurrent.AbstractFuture$Sync.getValue(AbstractFuture.java:294) ~[com.google.guava.guava-13.0.1.jar:na] at com.google.common.util.concurrent.AbstractFuture$Sync.get(AbstractFuture.java:281) ~[com.google.guava.guava-13.0.1.jar:na] at com.google.common.util.concurrent.AbstractFuture.get(AbstractFuture.java:116) ~[com.google.guava.guava-13.0.1.jar:na] at co.cask.cdap.app.runtime.spark.SparkRuntimeService.run(SparkRuntimeService.java:277) ~[co.cask.cdap.cdap-spark-core-4.1.1.jar:na] at com.google.common.util.concurrent.AbstractExecutionThreadService$1$1.run(AbstractExecutionThreadService.java:52) ~[com.google.guava.guava-13.0.1.jar:na] at co.cask.cdap.app.runtime.spark.SparkRuntimeService$2$1.run(SparkRuntimeService.java:335) [co.cask.cdap.cdap-spark-core-4.1.1.jar:na] at java.lang.Thread.run(Thread.java:745) [na:1.8.0_121]
Caused by: java.io.IOException: No input paths specified in job at org.apache.hadoop.mapreduce.lib.input.FileInputFormat.listStatus(FileInputFormat.java:231) ~[org.apache.hadoop.hadoop-mapreduce-client-core-2.3.0.jar:na] at org.apache.hadoop.mapreduce.lib.input.FileInputFormat.getSplits(FileInputFormat.java:340) ~[org.apache.hadoop.hadoop-mapreduce-client-core-2.3.0.jar:na] at org.apache.spark.rdd.NewHadoopRDD.getPartitions(NewHadoopRDD.scala:120) ~[na:na] at org.apache.spark.rdd.RDD$$anonfun$partitions$2.apply(RDD.scala:239) ~[na:na] at org.apache.spark.rdd.RDD$$anonfun$partitions$2.apply(RDD.scala:237) ~[na:na] at scala.Option.getOrElse(Option.scala:120) ~[org.scala-lang.scala-library-2.10.4.jar:na] at org.apache.spark.rdd.RDD.partitions(RDD.scala:237) ~[na:na] at co.cask.cdap.app.runtime.spark.DatasetRDD.getPartitions(DatasetRDD.scala:83) ~[na:na] at org.apache.spark.rdd.RDD$$anonfun$partitions$2.apply(RDD.scala:239) ~[na:na] at org.apache.spark.rdd.RDD$$anonfun$partitions$2.apply(RDD.scala:237) ~[na:na] at scala.Option.getOrElse(Option.scala:120) ~[org.scala-lang.scala-library-2.10.4.jar:na] at org.apache.spark.rdd.RDD.partitions(RDD.scala:237) ~[na:na] at org.apache.spark.rdd.MapPartitionsRDD.getPartitions(MapPartitionsRDD.scala:35) ~[na:na] at org.apache.spark.rdd.RDD$$anonfun$partitions$2.apply(RDD.scala:239) ~[na:na] at org.apache.spark.rdd.RDD$$anonfun$partitions$2.apply(RDD.scala:237) ~[na:na] at scala.Option.getOrElse(Option.scala:120) ~[org.scala-lang.scala-library-2.10.4.jar:na] at org.apache.spark.rdd.RDD.partitions(RDD.scala:237) ~[na:na] at org.apache.spark.SparkContext.runJob(SparkContext.scala:1929) ~[na:na] at org.apache.spark.rdd.RDD.count(RDD.scala:1157) ~[na:na] at sparkhdfs.ScalaSparkHDFS.run(ScalaSparkHDFS.scala:22) ~[na:na] at co.cask.cdap.app.runtime.spark.SparkMainWrapper$.main(SparkMainWrapper.scala:100) ~[na:na] at co.cask.cdap.app.runtime.spark.SparkMainWrapper.main(SparkMainWrapper.scala) ~[na:na] at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method) ~[na:1.8.0_121] at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62) ~[na:1.8.0_121] at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43) ~[na:1.8.0_121] at java.lang.reflect.Method.invoke(Method.java:498) ~[na:1.8.0_121] at org.apache.spark.deploy.SparkSubmit$.org$apache$spark$deploy$SparkSubmit$$runMain(SparkSubmit.scala:731) ~[na:na] at org.apache.spark.deploy.SparkSubmit$.doRunMain$1(SparkSubmit.scala:181) ~[na:na] at org.apache.spark.deploy.SparkSubmit$.submit(SparkSubmit.scala:206) ~[na:na] at org.apache.spark.deploy.SparkSubmit$.main(SparkSubmit.scala:121) ~[na:na] at org.apache.spark.deploy.SparkSubmit.main(SparkSubmit.scala) ~[na:na] at co.cask.cdap.app.runtime.spark.submit.AbstractSparkSubmitter.submit(AbstractSparkSubmitter.java:168) ~[na:na] at co.cask.cdap.app.runtime.spark.submit.AbstractSparkSubmitter.access$000(AbstractSparkSubmitter.java:55) ~[na:na] at co.cask.cdap.app.runtime.spark.submit.AbstractSparkSubmitter$5.run(AbstractSparkSubmitter.java:112) ~[na:na] at java.util.concurrent.Executors$RunnableAdapter.call(Executors.java:511) ~[na:1.8.0_121] at java.util.concurrent.FutureTask.run(FutureTask.java:266) ~[na:1.8.0_121] at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1142) ~[na:1.8.0_121] at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:617) ~[na:1.8.0_121] ... 1 common frames omitted
2017-06-22 15:27:02,893 - DEBUG [pcontroller-program:nd2629.sparkhdfs.-SNAPSHOT.spark.sparkhdfs.AppSpark-23ede9f3-5789-11e7-b613-acde48001122:c.c.c.a.r.AbstractProgramRuntimeService@433] - Removing RuntimeInfo: Spark sparkhdfs.AppSpark 23ede9f3-5789-11e7-b613-acde48001122
2017-06-22 15:27:02,893 - DEBUG [pcontroller-program:nd2629.sparkhdfs.-SNAPSHOT.spark.sparkhdfs.AppSpark-23ede9f3-5789-11e7-b613-acde48001122:c.c.c.a.r.AbstractProgramRuntimeService@436] - RuntimeInfo removed: RuntimeInfo{programId=program:nd2629.sparkhdfs.-SNAPSHOT.spark.sparkhdfs.AppSpark, twillRunId=null}
2017-06-22 15:27:02,906 - DEBUG [SparkRunnersparkhdfs.AppSpark:c.g.c.b.Throwables@33] - Uncaught exception in thread Thread[SparkRunnersparkhdfs.AppSpark,5,appfabric-executor-thread] java.lang.RuntimeException: java.util.concurrent.ExecutionException: java.io.IOException: No input paths specified in job at com.google.common.base.Throwables.propagate(Throwables.java:160) ~[com.google.guava.guava-13.0.1.jar:na] at com.google.common.util.concurrent.AbstractExecutionThreadService$1$1.run(AbstractExecutionThreadService.java:69) ~[com.google.guava.guava-13.0.1.jar:na] at co.cask.cdap.app.runtime.spark.SparkRuntimeService$2$1.run(SparkRuntimeService.java:335) ~[na:na] at java.lang.Thread.run(Thread.java:745) ~[na:1.8.0_121]
Caused by: java.util.concurrent.ExecutionException: java.io.IOException: No input paths specified in job at com.google.common.util.concurrent.AbstractFuture$Sync.getValue(AbstractFuture.java:294) ~[com.google.guava.guava-13.0.1.jar:na] at com.google.common.util.concurrent.AbstractFuture$Sync.get(AbstractFuture.java:281) ~[com.google.guava.guava-13.0.1.jar:na] at com.google.common.util.concurrent.AbstractFuture.get(AbstractFuture.java:116) ~[com.google.guava.guava-13.0.1.jar:na] at co.cask.cdap.app.runtime.spark.SparkRuntimeService.run(SparkRuntimeService.java:277) ~[na:na] at com.google.common.util.concurrent.AbstractExecutionThreadService$1$1.run(AbstractExecutionThreadService.java:52) ~[com.google.guava.guava-13.0.1.jar:na] ... 2 common frames omitted
Caused by: java.io.IOException: No input paths specified in job at org.apache.hadoop.mapreduce.lib.input.FileInputFormat.listStatus(FileInputFormat.java:231) ~[org.apache.hadoop.hadoop-mapreduce-client-core-2.3.0.jar:na] at org.apache.hadoop.mapreduce.lib.input.FileInputFormat.getSplits(FileInputFormat.java:340) ~[org.apache.hadoop.hadoop-mapreduce-client-core-2.3.0.jar:na] at org.apache.spark.rdd.NewHadoopRDD.getPartitions(NewHadoopRDD.scala:120) ~[na:na] at org.apache.spark.rdd.RDD$$anonfun$partitions$2.apply(RDD.scala:239) ~[na:na] at org.apache.spark.rdd.RDD$$anonfun$partitions$2.apply(RDD.scala:237) ~[na:na] at scala.Option.getOrElse(Option.scala:120) ~[org.scala-lang.scala-library-2.10.4.jar:na] at org.apache.spark.rdd.RDD.partitions(RDD.scala:237) ~[na:na] at co.cask.cdap.app.runtime.spark.DatasetRDD.getPartitions(DatasetRDD.scala:83) ~[na:na] at org.apache.spark.rdd.RDD$$anonfun$partitions$2.apply(RDD.scala:239) ~[na:na] at org.apache.spark.rdd.RDD$$anonfun$partitions$2.apply(RDD.scala:237) ~[na:na] at scala.Option.getOrElse(Option.scala:120) ~[org.scala-lang.scala-library-2.10.4.jar:na] at org.apache.spark.rdd.RDD.partitions(RDD.scala:237) ~[na:na] at org.apache.spark.rdd.MapPartitionsRDD.getPartitions(MapPartitionsRDD.scala:35) ~[na:na] at org.apache.spark.rdd.RDD$$anonfun$partitions$2.apply(RDD.scala:239) ~[na:na] at org.apache.spark.rdd.RDD$$anonfun$partitions$2.apply(RDD.scala:237) ~[na:na] at scala.Option.getOrElse(Option.scala:120) ~[org.scala-lang.scala-library-2.10.4.jar:na] at org.apache.spark.rdd.RDD.partitions(RDD.scala:237) ~[na:na] at org.apache.spark.SparkContext.runJob(SparkContext.scala:1929) ~[na:na] at org.apache.spark.rdd.RDD.count(RDD.scala:1157) ~[na:na] at sparkhdfs.ScalaSparkHDFS.run(ScalaSparkHDFS.scala:22) ~[na:na] at co.cask.cdap.app.runtime.spark.SparkMainWrapper$.main(SparkMainWrapper.scala:100) ~[na:na] at co.cask.cdap.app.runtime.spark.SparkMainWrapper.main(SparkMainWrapper.scala) ~[na:na] at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method) ~[na:1.8.0_121] at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62) ~[na:1.8.0_121] at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43) ~[na:1.8.0_121] at java.lang.reflect.Method.invoke(Method.java:498) ~[na:1.8.0_121] at org.apache.spark.deploy.SparkSubmit$.org$apache$spark$deploy$SparkSubmit$$runMain(SparkSubmit.scala:731) ~[na:na] at org.apache.spark.deploy.SparkSubmit$.doRunMain$1(SparkSubmit.scala:181) ~[na:na] at org.apache.spark.deploy.SparkSubmit$.submit(SparkSubmit.scala:206) ~[na:na] at org.apache.spark.deploy.SparkSubmit$.main(SparkSubmit.scala:121) ~[na:na] at org.apache.spark.deploy.SparkSubmit.main(SparkSubmit.scala) ~[na:na] at co.cask.cdap.app.runtime.spark.submit.AbstractSparkSubmitter.submit(AbstractSparkSubmitter.java:168) ~[na:na] at co.cask.cdap.app.runtime.spark.submit.AbstractSparkSubmitter.access$000(AbstractSparkSubmitter.java:55) ~[na:na] at co.cask.cdap.app.runtime.spark.submit.AbstractSparkSubmitter$5.run(AbstractSparkSubmitter.java:112) ~[na:na] at java.util.concurrent.Executors$RunnableAdapter.call(Executors.java:511) ~[na:1.8.0_121] at java.util.concurrent.FutureTask.run(FutureTask.java:266) ~[na:1.8.0_121] at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1142) ~[na:1.8.0_121] at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:617) ~[na:1.8.0_121] ... 1 common frames omitted
2017-06-22 15:27:02,915 - INFO [sparkDriverActorSystem-akka.actor.default-dispatcher-3:a.r.RemoteActorRefProvider$RemotingTerminator@74] - Remoting shut down.
createDataset("gold", FileSet.class,FileSetProperties.builder().setBasePath("cdap").setInputFormat(TextInputFormat.class).setOutputFormat(TextOutputFormat.class).build());
2017-06-23 15:17:11,616 - DEBUG [spark-submitter-sparkhdfs.AppSpark-f052ddd1-5850-11e7-af09-acde48001122:c.c.c.a.r.s.s.AbstractSparkSubmitter@162] - Calling SparkSubmit for program:nd2629.sparkhdfs.-SNAPSHOT.spark.sparkhdfs.AppSpark f052ddd1-5850-11e7-af09-acde48001122: [--master, local[2], --class, co.cask.cdap.app.runtime.spark.SparkMainWrapper, --conf, spark.app.name=sparkhdfs.AppSpark, --conf, spark.executor.memory=512m, --conf, spark.driver.memory=512m, --conf, spark.local.dir=/Users/nd2629/cdap/data/tmp/1498249031179-0, --conf, spark.driver.cores=1, --conf, spark.ui.port=0, --conf, spark.executor.cores=1, --conf, spark.metrics.conf=/Users/nd2629/cdap/data/tmp/1498249031179-0/metrics1320711439257084548.properties, --conf, spark.cdap.localized.resources=[], /Users/nd2629/cdap/data/tmp/1498249031179-0/emptyJob.jar, --userClass=sparkhdfs.ScalaSparkHDFS]
2017-06-23 15:17:11,780 - INFO [NettyHttpService STARTING:c.c.h.NettyHttpService@257] - Starting sparkhdfs.AppSpark-spark-tx http service on address localhost/127.0.0.1:0...
2017-06-23 15:17:11,782 - INFO [NettyHttpService STARTING:c.c.h.NettyHttpService@262] - Started sparkhdfs.AppSpark-spark-tx http service on address /127.0.0.1:61260
2017-06-23 15:17:11,788 - DEBUG [spark-submitter-sparkhdfs.AppSpark-f052ddd1-5850-11e7-af09-acde48001122:s.ScalaSparkHDFS@21] - ***********11**********
2017-06-23 15:17:11,827 - WARN [spark-submitter-sparkhdfs.AppSpark-f052ddd1-5850-11e7-af09-acde48001122:o.a.s.SparkConf@70] - In Spark 1.0 and later spark.local.dir will be overridden by the value set by the cluster manager (via SPARK_LOCAL_DIRS in mesos/standalone and LOCAL_DIRS in YARN).
2017-06-23 15:17:15,463 - INFO [sparkDriverActorSystem-akka.actor.default-dispatcher-4:a.e.s.Slf4jLogger@80] - Slf4jLogger started
2017-06-23 15:17:15,600 - INFO [sparkDriverActorSystem-akka.actor.default-dispatcher-2:Remoting@74] - Starting remoting
2017-06-23 15:17:15,708 - INFO [sparkDriverActorSystem-akka.actor.default-dispatcher-2:Remoting@74] - Remoting started; listening on addresses :[akka.tcp://sparkDriverActorSys...@10.20.13.80:61339]
2017-06-23 15:17:16,490 - DEBUG [spark-submitter-sparkhdfs.AppSpark-f052ddd1-5850-11e7-af09-acde48001122:c.c.c.a.r.s.SparkMetricsSink@51] - Using SparkMetricsSink for reporting metrics: {class=co.cask.cdap.app.runtime.spark.SparkMetricsSink}
2017-06-23 15:17:16,605 - DEBUG [spark-submitter-sparkhdfs.AppSpark-f052ddd1-5850-11e7-af09-acde48001122:s.ScalaSparkHDFS@23] - ***********2**********
2017-06-23 15:17:16,671 - DEBUG [spark-submitter-sparkhdfs.AppSpark-f052ddd1-5850-11e7-af09-acde48001122:s.ScalaSparkHDFS@25] - ***********3**********
2017-06-23 15:17:17,002 - DEBUG [spark-submitter-sparkhdfs.AppSpark-f052ddd1-5850-11e7-af09-acde48001122:s.ScalaSparkHDFS@28] - ***********4**********
2017-06-23 15:17:17,015 - DEBUG [spark-submitter-sparkhdfs.AppSpark-f052ddd1-5850-11e7-af09-acde48001122:c.c.c.d.m.w.BasicLineageWriter@63] - Writing access for run program_run:nd2629.sparkhdfs.-SNAPSHOT.spark.sparkhdfs.AppSpark.f052ddd1-5850-11e7-af09-acde48001122, dataset dataset:nd2629.inputdataset, accessType READ, component null, accessTime = 1498249037015
2017-06-23 15:17:17,658 - DEBUG [spark-submitter-sparkhdfs.AppSpark-f052ddd1-5850-11e7-af09-acde48001122:c.c.c.a.r.s.SparkRuntimeEnv$@247] - Shutting down Server and ThreadPool used by Spark org.apache.spark.SparkContext@6ea552db
2017-06-23 15:17:17,667 - INFO [sparkDriverActorSystem-akka.actor.default-dispatcher-3:a.r.RemoteActorRefProvider$RemotingTerminator@74] - Shutting down remote daemon.
2017-06-23 15:17:17,671 - INFO [sparkDriverActorSystem-akka.actor.default-dispatcher-3:a.r.RemoteActorRefProvider$RemotingTerminator@74] - Remote daemon shut down; proceeding with flushing remote transports.
2017-06-23 15:17:17,703 - INFO [sparkDriverActorSystem-akka.actor.default-dispatcher-3:a.r.RemoteActorRefProvider$RemotingTerminator@74] - Remoting shut down.
2017-06-23 15:17:17,711 - DEBUG [spark-submitter-sparkhdfs.AppSpark-f052ddd1-5850-11e7-af09-acde48001122:c.c.c.a.r.s.SparkRuntimeEnv$@247] - Shutting down Server and ThreadPool used by Spark org.apache.spark.SparkContext@6ea552db
2017-06-23 15:17:17,714 - INFO [NettyHttpService STOPPING:c.c.h.NettyHttpService@274] - Stopping sparkhdfs.AppSpark-spark-tx http service on address localhost/127.0.0.1:61260...
2017-06-23 15:17:17,715 - INFO [NettyHttpService STOPPING:c.c.h.NettyHttpService@288] - Stopped sparkhdfs.AppSpark-spark-tx http service on address localhost/127.0.0.1:61260
2017-06-23 15:17:17,721 - DEBUG [SparkRunnersparkhdfs.AppSpark:c.c.c.a.r.s.SparkRuntimeService@717] - Running Spark shutdown hook org.apache.spark.util.SparkShutdownHookManager$$anon$2@294b96f9
2017-06-23 15:17:17,733 - DEBUG [SparkRunnersparkhdfs.AppSpark:c.c.c.a.r.s.SparkRuntimeService@752] - BeanIntrospector.ctorParamNamesCache has been invalidated.
2017-06-23 15:17:17,734 - DEBUG [SparkRunnersparkhdfs.AppSpark:c.c.c.a.r.s.SparkRuntimeService@307] - Spark program completed: SparkRuntimeContext{id=program:nd2629.sparkhdfs.-SNAPSHOT.spark.sparkhdfs.AppSpark, runId=f052ddd1-5850-11e7-af09-acde48001122}
2017-06-23 15:17:17,753 - ERROR [SparkRunnersparkhdfs.AppSpark:c.c.c.i.a.r.ProgramControllerServiceAdapter@93] - Program terminated with exception java.util.concurrent.ExecutionException: java.io.IOException: No input paths specified in job at com.google.common.util.concurrent.AbstractFuture$Sync.getValue(AbstractFuture.java:294) ~[com.google.guava.guava-13.0.1.jar:na] at com.google.common.util.concurrent.AbstractFuture$Sync.get(AbstractFuture.java:281) ~[com.google.guava.guava-13.0.1.jar:na] at com.google.common.util.concurrent.AbstractFuture.get(AbstractFuture.java:116) ~[com.google.guava.guava-13.0.1.jar:na] at co.cask.cdap.app.runtime.spark.SparkRuntimeService.run(SparkRuntimeService.java:277) ~[co.cask.cdap.cdap-spark-core-4.1.1.jar:na] at com.google.common.util.concurrent.AbstractExecutionThreadService$1$1.run(AbstractExecutionThreadService.java:52) ~[com.google.guava.guava-13.0.1.jar:na] at co.cask.cdap.app.runtime.spark.SparkRuntimeService$2$1.run(SparkRuntimeService.java:335) [co.cask.cdap.cdap-spark-core-4.1.1.jar:na] at java.lang.Thread.run(Thread.java:745) [na:1.8.0_121]
Caused by: java.io.IOException: No input paths specified in job at org.apache.hadoop.mapreduce.lib.input.FileInputFormat.listStatus(FileInputFormat.java:231) ~[org.apache.hadoop.hadoop-mapreduce-client-core-2.3.0.jar:na] at org.apache.hadoop.mapreduce.lib.input.FileInputFormat.getSplits(FileInputFormat.java:340) ~[org.apache.hadoop.hadoop-mapreduce-client-core-2.3.0.jar:na] at org.apache.spark.rdd.NewHadoopRDD.getPartitions(NewHadoopRDD.scala:120) ~[na:na] at org.apache.spark.rdd.RDD$$anonfun$partitions$2.apply(RDD.scala:239) ~[na:na] at org.apache.spark.rdd.RDD$$anonfun$partitions$2.apply(RDD.scala:237) ~[na:na] at scala.Option.getOrElse(Option.scala:120) ~[org.scala-lang.scala-library-2.10.4.jar:na] at org.apache.spark.rdd.RDD.partitions(RDD.scala:237) ~[na:na] at co.cask.cdap.app.runtime.spark.DatasetRDD.getPartitions(DatasetRDD.scala:83) ~[na:na] at org.apache.spark.rdd.RDD$$anonfun$partitions$2.apply(RDD.scala:239) ~[na:na] at org.apache.spark.rdd.RDD$$anonfun$partitions$2.apply(RDD.scala:237) ~[na:na] at scala.Option.getOrElse(Option.scala:120) ~[org.scala-lang.scala-library-2.10.4.jar:na] at org.apache.spark.rdd.RDD.partitions(RDD.scala:237) ~[na:na] at org.apache.spark.rdd.MapPartitionsRDD.getPartitions(MapPartitionsRDD.scala:35) ~[na:na] at org.apache.spark.rdd.RDD$$anonfun$partitions$2.apply(RDD.scala:239) ~[na:na] at org.apache.spark.rdd.RDD$$anonfun$partitions$2.apply(RDD.scala:237) ~[na:na] at scala.Option.getOrElse(Option.scala:120) ~[org.scala-lang.scala-library-2.10.4.jar:na] at org.apache.spark.rdd.RDD.partitions(RDD.scala:237) ~[na:na] at org.apache.spark.SparkContext.runJob(SparkContext.scala:1929) ~[na:na] at org.apache.spark.rdd.RDD.count(RDD.scala:1157) ~[na:na] at sparkhdfs.ScalaSparkHDFS.run(ScalaSparkHDFS.scala:29) ~[na:na] at co.cask.cdap.app.runtime.spark.SparkMainWrapper$.main(SparkMainWrapper.scala:100) ~[na:na] at co.cask.cdap.app.runtime.spark.SparkMainWrapper.main(SparkMainWrapper.scala) ~[na:na] at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method) ~[na:1.8.0_121] at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62) ~[na:1.8.0_121] at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43) ~[na:1.8.0_121] at java.lang.reflect.Method.invoke(Method.java:498) ~[na:1.8.0_121] at org.apache.spark.deploy.SparkSubmit$.org$apache$spark$deploy$SparkSubmit$$runMain(SparkSubmit.scala:731) ~[na:na] at org.apache.spark.deploy.SparkSubmit$.doRunMain$1(SparkSubmit.scala:181) ~[na:na] at org.apache.spark.deploy.SparkSubmit$.submit(SparkSubmit.scala:206) ~[na:na] at org.apache.spark.deploy.SparkSubmit$.main(SparkSubmit.scala:121) ~[na:na] at org.apache.spark.deploy.SparkSubmit.main(SparkSubmit.scala) ~[na:na] at co.cask.cdap.app.runtime.spark.submit.AbstractSparkSubmitter.submit(AbstractSparkSubmitter.java:168) ~[na:na] at co.cask.cdap.app.runtime.spark.submit.AbstractSparkSubmitter.access$000(AbstractSparkSubmitter.java:55) ~[na:na] at co.cask.cdap.app.runtime.spark.submit.AbstractSparkSubmitter$5.run(AbstractSparkSubmitter.java:112) ~[na:na] at java.util.concurrent.Executors$RunnableAdapter.call(Executors.java:511) ~[na:1.8.0_121] at java.util.concurrent.FutureTask.run(FutureTask.java:266) ~[na:1.8.0_121] at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1142) ~[na:1.8.0_121] at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:617) ~[na:1.8.0_121] ... 1 common frames omitted
2017-06-23 15:17:17,754 - DEBUG [pcontroller-program:nd2629.sparkhdfs.-SNAPSHOT.spark.sparkhdfs.AppSpark-f052ddd1-5850-11e7-af09-acde48001122:c.c.c.a.r.AbstractProgramRuntimeService@433] - Removing RuntimeInfo: Spark sparkhdfs.AppSpark f052ddd1-5850-11e7-af09-acde48001122
2017-06-23 15:17:17,754 - DEBUG [pcontroller-program:nd2629.sparkhdfs.-SNAPSHOT.spark.sparkhdfs.AppSpark-f052ddd1-5850-11e7-af09-acde48001122:c.c.c.a.r.AbstractProgramRuntimeService@436] - RuntimeInfo removed: RuntimeInfo{programId=program:nd2629.sparkhdfs.-SNAPSHOT.spark.sparkhdfs.AppSpark, twillRunId=null}
2017-06-23 15:17:17,765 - DEBUG [SparkRunnersparkhdfs.AppSpark:c.g.c.b.Throwables@33] - Uncaught exception in thread Thread[SparkRunnersparkhdfs.AppSpark,5,appfabric-executor-thread] java.lang.RuntimeException: java.util.concurrent.ExecutionException: java.io.IOException: No input paths specified in job at com.google.common.base.Throwables.propagate(Throwables.java:160) ~[com.google.guava.guava-13.0.1.jar:na] at com.google.common.util.concurrent.AbstractExecutionThreadService$1$1.run(AbstractExecutionThreadService.java:69) ~[com.google.guava.guava-13.0.1.jar:na] at co.cask.cdap.app.runtime.spark.SparkRuntimeService$2$1.run(SparkRuntimeService.java:335) ~[na:na] at java.lang.Thread.run(Thread.java:745) ~[na:1.8.0_121]
Caused by: java.util.concurrent.ExecutionException: java.io.IOException: No input paths specified in job at com.google.common.util.concurrent.AbstractFuture$Sync.getValue(AbstractFuture.java:294) ~[com.google.guava.guava-13.0.1.jar:na] at com.google.common.util.concurrent.AbstractFuture$Sync.get(AbstractFuture.java:281) ~[com.google.guava.guava-13.0.1.jar:na] at com.google.common.util.concurrent.AbstractFuture.get(AbstractFuture.java:116) ~[com.google.guava.guava-13.0.1.jar:na] at co.cask.cdap.app.runtime.spark.SparkRuntimeService.run(SparkRuntimeService.java:277) ~[na:na] at com.google.common.util.concurrent.AbstractExecutionThreadService$1$1.run(AbstractExecutionThreadService.java:52) ~[com.google.guava.guava-13.0.1.jar:na] ... 2 common frames omitted
Caused by: java.io.IOException: No input paths specified in job at org.apache.hadoop.mapreduce.lib.input.FileInputFormat.listStatus(FileInputFormat.java:231) ~[org.apache.hadoop.hadoop-mapreduce-client-core-2.3.0.jar:na] at org.apache.hadoop.mapreduce.lib.input.FileInputFormat.getSplits(FileInputFormat.java:340) ~[org.apache.hadoop.hadoop-mapreduce-client-core-2.3.0.jar:na] at org.apache.spark.rdd.NewHadoopRDD.getPartitions(NewHadoopRDD.scala:120) ~[na:na] at org.apache.spark.rdd.RDD$$anonfun$partitions$2.apply(RDD.scala:239) ~[na:na] at org.apache.spark.rdd.RDD$$anonfun$partitions$2.apply(RDD.scala:237) ~[na:na] at scala.Option.getOrElse(Option.scala:120) ~[org.scala-lang.scala-library-2.10.4.jar:na] at org.apache.spark.rdd.RDD.partitions(RDD.scala:237) ~[na:na] at co.cask.cdap.app.runtime.spark.DatasetRDD.getPartitions(DatasetRDD.scala:83) ~[na:na] at org.apache.spark.rdd.RDD$$anonfun$partitions$2.apply(RDD.scala:239) ~[na:na] at org.apache.spark.rdd.RDD$$anonfun$partitions$2.apply(RDD.scala:237) ~[na:na] at scala.Option.getOrElse(Option.scala:120) ~[org.scala-lang.scala-library-2.10.4.jar:na] at org.apache.spark.rdd.RDD.partitions(RDD.scala:237) ~[na:na] at org.apache.spark.rdd.MapPartitionsRDD.getPartitions(MapPartitionsRDD.scala:35) ~[na:na] at org.apache.spark.rdd.RDD$$anonfun$partitions$2.apply(RDD.scala:239) ~[na:na] at org.apache.spark.rdd.RDD$$anonfun$partitions$2.apply(RDD.scala:237) ~[na:na] at scala.Option.getOrElse(Option.scala:120) ~[org.scala-lang.scala-library-2.10.4.jar:na] at org.apache.spark.rdd.RDD.partitions(RDD.scala:237) ~[na:na] at org.apache.spark.SparkContext.runJob(SparkContext.scala:1929) ~[na:na] at org.apache.spark.rdd.RDD.count(RDD.scala:1157) ~[na:na] at sparkhdfs.ScalaSparkHDFS.run(ScalaSparkHDFS.scala:29) ~[na:na] at co.cask.cdap.app.runtime.spark.SparkMainWrapper$.main(SparkMainWrapper.scala:100) ~[na:na] at co.cask.cdap.app.runtime.spark.SparkMainWrapper.main(SparkMainWrapper.scala) ~[na:na] at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method) ~[na:1.8.0_121] at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62) ~[na:1.8.0_121] at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43) ~[na:1.8.0_121] at java.lang.reflect.Method.invoke(Method.java:498) ~[na:1.8.0_121] at org.apache.spark.deploy.SparkSubmit$.org$apache$spark$deploy$SparkSubmit$$runMain(SparkSubmit.scala:731) ~[na:na] at org.apache.spark.deploy.SparkSubmit$.doRunMain$1(SparkSubmit.scala:181) ~[na:na] at org.apache.spark.deploy.SparkSubmit$.submit(SparkSubmit.scala:206) ~[na:na] at org.apache.spark.deploy.SparkSubmit$.main(SparkSubmit.scala:121) ~[na:na] at org.apache.spark.deploy.SparkSubmit.main(SparkSubmit.scala) ~[na:na] at co.cask.cdap.app.runtime.spark.submit.AbstractSparkSubmitter.submit(AbstractSparkSubmitter.java:168) ~[na:na] at co.cask.cdap.app.runtime.spark.submit.AbstractSparkSubmitter.access$000(AbstractSparkSubmitter.java:55) ~[na:na] at co.cask.cdap.app.runtime.spark.submit.AbstractSparkSubmitter$5.run(AbstractSparkSubmitter.java:112) ~[na:na] at java.util.concurrent.Executors$RunnableAdapter.call(Executors.java:511) ~[na:1.8.0_121] at java.util.concurrent.FutureTask.run(FutureTask.java:266) ~[na:1.8.0_121] at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1142) ~[na:1.8.0_121] at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:617) ~[na:1.8.0_121] ... 1 common frames omitted
Please find the logs from the CDAP UI; they are the 2017-06-23 logs shown above.
Map<String, String> inputArgs = Maps.newHashMap();
FileSetArguments.setInputPaths(inputArgs, "file1.txt");
But it is not feasible to specify an individual file each time; the requirement is to read all of the files in a directory. I went through the APIs and don't see anything that reads the files under a base directory. Is there an API to read all files from the base path?
Thank you,
Naresh
createDataset("inputdataset", .... ,.setBasePath("cdap"), .. );
createDataset("out",... , setBasePath("result"), ......);
FileSetArguments.setInputPaths(inputArgs, "input");
JavaPairRDD<LongWritable,Text> data = jse.fromDataset("inputdataset",inputArgs);
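Two notes on the snippet above. First, Hadoop's FileInputFormat accepts a directory as an input path and enumerates the files directly under it, which is why pointing setInputPaths at a directory (relative to the FileSet's base path) reads every file in that directory. Second, the output side works the same way with an output path argument. A sketch under assumptions: the class name is invented, and the saveAsDataset(rdd, datasetName, arguments) call is my guess at the Java counterpart of the Scala saveAsDataset used earlier, not an API confirmed in this thread:

import java.util.HashMap;
import java.util.Map;

import co.cask.cdap.api.dataset.lib.FileSetArguments;
import co.cask.cdap.api.spark.JavaSparkExecutionContext;
import co.cask.cdap.api.spark.JavaSparkMain;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.spark.api.java.JavaPairRDD;

public class ReadDirWriteSketch implements JavaSparkMain {
  @Override
  public void run(JavaSparkExecutionContext jse) throws Exception {
    Map<String, String> inputArgs = new HashMap<>();
    Map<String, String> outputArgs = new HashMap<>();
    // "input" is a directory under the FileSet's base path ("cdap");
    // FileInputFormat reads every file directly inside it.
    FileSetArguments.setInputPaths(inputArgs, "input");
    // Output files land under the "out" FileSet's base path, i.e. result/output.
    FileSetArguments.setOutputPath(outputArgs, "output");
    JavaPairRDD<LongWritable, Text> data = jse.fromDataset("inputdataset", inputArgs);
    // Assumed overload; see the note above.
    jse.saveAsDataset(data, "out", outputArgs);
  }
}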
FileSetArguments.setInputPaths(inputArgs, "user");
FileSetArguments.setOutputPath(outputArgs,"output");
JavaPairRDD<LongWritable,Text> userRdd = jse.fromDataset("inputdataset",inputArgs);
FileSetArguments.setInputPaths(inputArgs, "transaction");
JavaPairRDD<LongWritable,Text> txRdd = jse.fromDataset("inputdataset",inputArgs);
2017-06-26 11:34:50,090 - DEBUG [spark-submitter-sparkhdfs.AppSpark-5f9d439b-5a8d-11e7-a413-acde48001122:c.c.c.a.r.s.s.AbstractSparkSubmitter@162] - Calling SparkSubmit for program:etl2629.sparkhdfs.-SNAPSHOT.spark.sparkhdfs.AppSpark 5f9d439b-5a8d-11e7-a413-acde48001122: [--master, local[2], --class, co.cask.cdap.app.runtime.spark.SparkMainWrapper, --conf, spark.app.name=sparkhdfs.AppSpark, --conf, spark.executor.memory=512m, --conf, spark.driver.memory=512m, --conf, spark.local.dir=/Users/nd2629/cdap/data/tmp/1498494890059-0, --conf, spark.driver.cores=1, --conf, spark.ui.port=0, --conf, spark.executor.cores=1, --conf, spark.metrics.conf=/Users/nd2629/cdap/data/tmp/1498494890059-0/metrics8183355214313707732.properties, --conf, spark.cdap.localized.resources=[], /Users/nd2629/cdap/data/tmp/1498494890059-0/emptyJob.jar, --userClass=sparkhdfs.JAVASparkHDFS]
2017-06-26 11:34:50,619 - INFO [NettyHttpService STARTING:c.c.h.NettyHttpService@257] - Starting sparkhdfs.AppSpark-spark-tx http service on address localhost/127.0.0.1:0...
2017-06-26 11:34:50,620 - INFO [NettyHttpService STARTING:c.c.h.NettyHttpService@262] - Started sparkhdfs.AppSpark-spark-tx http service on address /127.0.0.1:60599
2017-06-26 11:34:50,675 - WARN [spark-submitter-sparkhdfs.AppSpark-5f9d439b-5a8d-11e7-a413-acde48001122:o.a.s.SparkConf@70] - In Spark 1.0 and later spark.local.dir will be overridden by the value set by the cluster manager (via SPARK_LOCAL_DIRS in mesos/standalone and LOCAL_DIRS in YARN).
2017-06-26 11:34:51,752 - INFO [sparkDriverActorSystem-akka.actor.default-dispatcher-2:a.e.s.Slf4jLogger@80] - Slf4jLogger started
2017-06-26 11:34:52,927 - INFO [sparkDriverActorSystem-akka.actor.default-dispatcher-4:Remoting@74] - Starting remoting
2017-06-26 11:34:53,437 - INFO [sparkDriverActorSystem-akka.actor.default-dispatcher-4:Remoting@74] - Remoting started; listening on addresses :[akka.tcp://sparkDriver...@10.100.38.105:60657]
2017-06-26 11:34:54,631 - DEBUG [spark-submitter-sparkhdfs.AppSpark-5f9d439b-5a8d-11e7-a413-acde48001122:c.c.c.a.r.s.SparkMetricsSink@51] - Using SparkMetricsSink for reporting metrics: {class=co.cask.cdap.app.runtime.spark.SparkMetricsSink}
2017-06-26 11:34:58,699 - DEBUG [spark-submitter-sparkhdfs.AppSpark-5f9d439b-5a8d-11e7-a413-acde48001122:c.c.c.d.m.w.BasicLineageWriter@63] - Writing access for run program_run:etl2629.sparkhdfs.-SNAPSHOT.spark.sparkhdfs.AppSpark.5f9d439b-5a8d-11e7-a413-acde48001122, dataset dataset:etl2629.inputdataset, accessType READ, component null, accessTime = 1498494898699
2017-06-26 11:34:59,406 - DEBUG [spark-submitter-sparkhdfs.AppSpark-5f9d439b-5a8d-11e7-a413-acde48001122:c.c.c.d.m.w.BasicLineageWriter@63] - Writing access for run program_run:etl2629.sparkhdfs.-SNAPSHOT.spark.sparkhdfs.AppSpark.5f9d439b-5a8d-11e7-a413-acde48001122, dataset dataset:etl2629.out, accessType WRITE, component null, accessTime = 1498494899406
2017-06-26 11:34:59,681 - DEBUG [SparkListenerBus:c.c.c.a.r.s.DefaultSparkExecutionContext@106] - Spark program=program:etl2629.sparkhdfs.-SNAPSHOT.spark.sparkhdfs.AppSpark, runId=5f9d439b-5a8d-11e7-a413-acde48001122, jobId=0 starts with auto-commit=false on transaction Transaction{readPointer: 1498494898683000000, transactionId: 1498494898692000000, writePointer: 1498494898692000000, invalids: [1498357848719000000, 1498363599519000000, 1498402426936000000, 1498402526771000000, 1498404408367000000, 1498404439066000000, 1498404971830000000, 1498491200958000000], inProgress: [1498494190406000000], firstShortInProgress: 9223372036854775807, type: LONG, checkpointWritePointers: [], visibilityLevel: SNAPSHOT}
2017-06-26 11:34:59,683 - DEBUG [SparkListenerBus:c.c.c.a.r.s.SparkTransactionService@140] - Spark job started: JobTransaction{jobId=0, stageIds=[0, 1], transaction=Transaction{readPointer: 1498494898683000000, transactionId: 1498494898692000000, writePointer: 1498494898692000000, invalids: [1498357848719000000, 1498363599519000000, 1498402426936000000, 1498402526771000000, 1498404408367000000, 1498404439066000000, 1498404971830000000, 1498491200958000000], inProgress: [1498494190406000000], firstShortInProgress: 9223372036854775807, type: LONG, checkpointWritePointers: [], visibilityLevel: SNAPSHOT}}
2017-06-26 11:35:03,472 - DEBUG [Executor task launch worker-0:s.JAVASparkHDFS@55] - user line is 1234,naresh,Dallas,8037167180
2017-06-26 11:35:03,472 - DEBUG [Executor task launch worker-1:s.JAVASparkHDFS@55] - user line is 1234,naresh,Dallas,8037167180
2017-06-26 11:35:03,485 - DEBUG [Executor task launch worker-0:s.JAVASparkHDFS@55] - user line is 2343,Aparna,Hyderabad,9441652089
2017-06-26 11:35:03,486 - DEBUG [Executor task launch worker-0:s.JAVASparkHDFS@55] - user line is 1233,Chitti,Karimnangar,1234567
2017-06-26 11:35:03,486 - DEBUG [Executor task launch worker-0:s.JAVASparkHDFS@55] - user line is 3434,Kelly,Dallas,1233
2017-06-26 11:35:03,488 - DEBUG [Executor task launch worker-1:s.JAVASparkHDFS@55] - user line is 2343,Aparna,Hyderabad,9441652089
2017-06-26 11:35:03,489 - DEBUG [Executor task launch worker-1:s.JAVASparkHDFS@55] - user line is 1233,Chitti,Karimnangar,1234567
2017-06-26 11:35:03,489 - DEBUG [Executor task launch worker-1:s.JAVASparkHDFS@55] - user line is 3434,Kelly,Dallas,1233
2017-06-26 11:35:13,618 - ERROR [driver-heartbeater:o.a.s.u.Utils@95] - Uncaught exception in thread driver-heartbeater java.io.IOException: java.lang.ClassCastException: cannot assign instance of scala.collection.immutable.HashMap$SerializationProxy to field org.apache.spark.executor.TaskMetrics._accumulatorUpdates of type scala.collection.immutable.Map in instance of org.apache.spark.executor.TaskMetrics at org.apache.spark.util.Utils$.tryOrIOException(Utils.scala:1207) ~[co.cask.cdap.spark-assembly-1.6.1.jar:na] at org.apache.spark.executor.TaskMetrics.readObject(TaskMetrics.scala:219) ~[co.cask.cdap.spark-assembly-1.6.1.jar:na] at sun.reflect.GeneratedMethodAccessor196.invoke(Unknown Source) ~[na:na] at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43) ~[na:1.8.0_121] at java.lang.reflect.Method.invoke(Method.java:498) ~[na:1.8.0_121] at java.io.ObjectStreamClass.invokeReadObject(ObjectStreamClass.java:1058) ~[na:1.8.0_121] at java.io.ObjectInputStream.readSerialData(ObjectInputStream.java:2122) ~[na:1.8.0_121] at java.io.ObjectInputStream.readOrdinaryObject(ObjectInputStream.java:2013) ~[na:1.8.0_121] at java.io.ObjectInputStream.readObject0(ObjectInputStream.java:1535) ~[na:1.8.0_121] at java.io.ObjectInputStream.readObject(ObjectInputStream.java:422) ~[na:1.8.0_121] at org.apache.spark.util.Utils$.deserialize(Utils.scala:92) ~[co.cask.cdap.spark-assembly-1.6.1.jar:na] at org.apache.spark.executor.Executor$$anonfun$org$apache$spark$executor$Executor$$reportHeartBeat$1$$anonfun$apply$6.apply(Executor.scala:437) ~[co.cask.cdap.spark-assembly-1.6.1.jar:na] at org.apache.spark.executor.Executor$$anonfun$org$apache$spark$executor$Executor$$reportHeartBeat$1$$anonfun$apply$6.apply(Executor.scala:427) ~[co.cask.cdap.spark-assembly-1.6.1.jar:na] at scala.Option.foreach(Option.scala:236) ~[org.scala-lang.scala-library-2.10.4.jar:na] at org.apache.spark.executor.Executor$$anonfun$org$apache$spark$executor$Executor$$reportHeartBeat$1.apply(Executor.scala:427) ~[co.cask.cdap.spark-assembly-1.6.1.jar:na] at org.apache.spark.executor.Executor$$anonfun$org$apache$spark$executor$Executor$$reportHeartBeat$1.apply(Executor.scala:425) ~[co.cask.cdap.spark-assembly-1.6.1.jar:na] at scala.collection.Iterator$class.foreach(Iterator.scala:727) ~[org.scala-lang.scala-library-2.10.4.jar:na] at scala.collection.AbstractIterator.foreach(Iterator.scala:1157) ~[org.scala-lang.scala-library-2.10.4.jar:na] at scala.collection.IterableLike$class.foreach(IterableLike.scala:72) ~[org.scala-lang.scala-library-2.10.4.jar:na] at scala.collection.AbstractIterable.foreach(Iterable.scala:54) ~[org.scala-lang.scala-library-2.10.4.jar:na] at org.apache.spark.executor.Executor.org$apache$spark$executor$Executor$$reportHeartBeat(Executor.scala:425) ~[co.cask.cdap.spark-assembly-1.6.1.jar:na] at org.apache.spark.executor.Executor$$anon$1$$anonfun$run$1.apply$mcV$sp(Executor.scala:470) ~[co.cask.cdap.spark-assembly-1.6.1.jar:na] at org.apache.spark.executor.Executor$$anon$1$$anonfun$run$1.apply(Executor.scala:470) ~[co.cask.cdap.spark-assembly-1.6.1.jar:na] at org.apache.spark.executor.Executor$$anon$1$$anonfun$run$1.apply(Executor.scala:470) ~[co.cask.cdap.spark-assembly-1.6.1.jar:na] at org.apache.spark.util.Utils$.logUncaughtExceptions(Utils.scala:1765) ~[co.cask.cdap.spark-assembly-1.6.1.jar:na] at org.apache.spark.executor.Executor$$anon$1.run(Executor.scala:470) [co.cask.cdap.spark-assembly-1.6.1.jar:na] at java.util.concurrent.Executors$RunnableAdapter.call(Executors.java:511) [na:1.8.0_121] at java.util.concurrent.FutureTask.runAndReset(FutureTask.java:308) [na:1.8.0_121] at java.util.concurrent.ScheduledThreadPoolExecutor$ScheduledFutureTask.access$301(ScheduledThreadPoolExecutor.java:180) [na:1.8.0_121] at java.util.concurrent.ScheduledThreadPoolExecutor$ScheduledFutureTask.run(ScheduledThreadPoolExecutor.java:294) [na:1.8.0_121] at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1142) [na:1.8.0_121] at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:617) [na:1.8.0_121] at java.lang.Thread.run(Thread.java:745) [na:1.8.0_121]
Caused by: java.lang.ClassCastException: cannot assign instance of scala.collection.immutable.HashMap$SerializationProxy to field org.apache.spark.executor.TaskMetrics._accumulatorUpdates of type scala.collection.immutable.Map in instance of org.apache.spark.executor.TaskMetrics at java.io.ObjectStreamClass$FieldReflector.setObjFieldValues(ObjectStreamClass.java:2133) ~[na:1.8.0_121] at java.io.ObjectStreamClass.setObjFieldValues(ObjectStreamClass.java:1305) ~[na:1.8.0_121] at java.io.ObjectInputStream.defaultReadFields(ObjectInputStream.java:2237) ~[na:1.8.0_121] at java.io.ObjectInputStream.defaultReadObject(ObjectInputStream.java:552) ~[na:1.8.0_121] at org.apache.spark.executor.TaskMetrics$$anonfun$readObject$1.apply$mcV$sp(TaskMetrics.scala:220) ~[co.cask.cdap.spark-assembly-1.6.1.jar:na] at org.apache.spark.util.Utils$.tryOrIOException(Utils.scala:1204) ~[co.cask.cdap.spark-assembly-1.6.1.jar:na] ... 32 common frames omitted
2017-06-26 11:35:41,535 - DEBUG [SparkListenerBus:c.c.c.a.r.s.SparkTransactionService@174] - Spark job ended: JobTransaction{jobId=0, stageIds=[0, 1], transaction=Transaction{readPointer: 1498494898683000000, transactionId: 1498494898692000000, writePointer: 1498494898692000000, invalids: [1498357848719000000, 1498363599519000000, 1498402426936000000, 1498402526771000000, 1498404408367000000, 1498404439066000000, 1498404971830000000, 1498491200958000000], inProgress: [1498494190406000000], firstShortInProgress: 9223372036854775807, type: LONG, checkpointWritePointers: [], visibilityLevel: SNAPSHOT}}
2017-06-26 11:35:42,132 - DEBUG [spark-submitter-sparkhdfs.AppSpark-5f9d439b-5a8d-11e7-a413-acde48001122:c.c.c.a.r.s.SparkRuntimeEnv$@247] - Shutting down Server and ThreadPool used by Spark org.apache.spark.SparkContext@2fbb79cd
2017-06-26 11:35:42,139 - INFO [sparkDriverActorSystem-akka.actor.default-dispatcher-3:a.r.RemoteActorRefProvider$RemotingTerminator@74] - Shutting down remote daemon.
2017-06-26 11:35:42,143 - INFO [sparkDriverActorSystem-akka.actor.default-dispatcher-3:a.r.RemoteActorRefProvider$RemotingTerminator@74] - Remote daemon shut down; proceeding with flushing remote transports.
2017-06-26 11:35:42,175 - INFO [sparkDriverActorSystem-akka.actor.default-dispatcher-3:a.r.RemoteActorRefProvider$RemotingTerminator@74] - Remoting shut down.
2017-06-26 11:35:42,185 - DEBUG [spark-submitter-sparkhdfs.AppSpark-5f9d439b-5a8d-11e7-a413-acde48001122:c.c.c.a.r.s.SparkRuntimeEnv$@247] - Shutting down Server and ThreadPool used by Spark org.apache.spark.SparkContext@2fbb79cd
2017-06-26 11:35:42,187 - INFO [NettyHttpService STOPPING:c.c.h.NettyHttpService@274] - Stopping sparkhdfs.AppSpark-spark-tx http service on address localhost/127.0.0.1:60599...
2017-06-26 11:35:42,188 - INFO [NettyHttpService STOPPING:c.c.h.NettyHttpService@288] - Stopped sparkhdfs.AppSpark-spark-tx http service on address localhost/127.0.0.1:60599
2017-06-26 11:35:42,189 - DEBUG [spark-submitter-sparkhdfs.AppSpark-5f9d439b-5a8d-11e7-a413-acde48001122:c.c.c.a.r.s.s.AbstractSparkSubmitter@169] - SparkSubmit returned for program:etl2629.sparkhdfs.-SNAPSHOT.spark.sparkhdfs.AppSpark 5f9d439b-5a8d-11e7-a413-acde48001122
2017-06-26 11:35:42,193 - DEBUG [SparkRunnersparkhdfs.AppSpark:c.c.c.a.r.s.SparkRuntimeService@717] - Running Spark shutdown hook org.apache.spark.util.SparkShutdownHookManager$$anon$2@7de41c59
2017-06-26 11:35:42,203 - DEBUG [SparkRunnersparkhdfs.AppSpark:c.c.c.a.r.s.SparkRuntimeService@752] - BeanIntrospector.ctorParamNamesCache has been invalidated.
2017-06-26 11:35:42,203 - DEBUG [SparkRunnersparkhdfs.AppSpark:c.c.c.a.r.s.SparkRuntimeService@307] - Spark program completed: SparkRuntimeContext{id=program:etl2629.sparkhdfs.-SNAPSHOT.spark.sparkhdfs.AppSpark, runId=5f9d439b-5a8d-11e7-a413-acde48001122}
2017-06-26 11:35:42,215 - DEBUG [pcontroller-program:etl2629.sparkhdfs.-SNAPSHOT.spark.sparkhdfs.AppSpark-5f9d439b-5a8d-11e7-a413-acde48001122:c.c.c.a.r.AbstractProgramRuntimeService@433] - Removing RuntimeInfo: Spark sparkhdfs.AppSpark 5f9d439b-5a8d-11e7-a413-acde48001122
2017-06-26 11:35:42,215 - DEBUG [pcontroller-program:etl2629.sparkhdfs.-SNAPSHOT.spark.sparkhdfs.AppSpark-5f9d439b-5a8d-11e7-a413-acde48001122:c.c.c.a.r.AbstractProgramRuntimeService@436] - RuntimeInfo removed: RuntimeInfo{programId=program:etl2629.sparkhdfs.-SNAPSHOT.spark.sparkhdfs.AppSpark, twillRunId=null}
Code snippet 2 (separate argument maps):
Map<String, String> userinputArgs = new HashMap<>();
Map<String, String> txinputArgs = new HashMap<>();
Map<String, String> outputArgs = new HashMap<>();
FileSetArguments.setInputPaths(userinputArgs, "user");
FileSetArguments.setInputPaths(txinputArgs, "transaction");
FileSetArguments.setOutputPath(outputArgs, "output");
JavaPairRDD<LongWritable,Text> userRdd = jse.fromDataset("inputdataset", userinputArgs);
JavaPairRDD<LongWritable,Text> txRdd = jse.fromDataset("inputdataset", txinputArgs);
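Assembled into one self-contained program for clarity (the class name and the count() printouts are illustrative; only the two-map pattern itself comes from the snippet above):

import java.util.HashMap;
import java.util.Map;

import co.cask.cdap.api.dataset.lib.FileSetArguments;
import co.cask.cdap.api.spark.JavaSparkExecutionContext;
import co.cask.cdap.api.spark.JavaSparkMain;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.spark.api.java.JavaPairRDD;

public class TwoInputsSketch implements JavaSparkMain {
  @Override
  public void run(JavaSparkExecutionContext jse) throws Exception {
    // One argument map per read, so each fromDataset call keeps its own input path.
    Map<String, String> userInputArgs = new HashMap<>();
    Map<String, String> txInputArgs = new HashMap<>();
    FileSetArguments.setInputPaths(userInputArgs, "user");
    FileSetArguments.setInputPaths(txInputArgs, "transaction");

    JavaPairRDD<LongWritable, Text> userRdd = jse.fromDataset("inputdataset", userInputArgs);
    JavaPairRDD<LongWritable, Text> txRdd = jse.fromDataset("inputdataset", txInputArgs);

    System.out.println("user lines: " + userRdd.count());
    System.out.println("transaction lines: " + txRdd.count());
  }
}

The reason separate maps are safer: RDD reads can be evaluated lazily, so mutating one shared inputArgs map between the two fromDataset calls (as in snippet 1) risks both reads seeing whichever path was set last. Whether that is exactly what happened in the run above is not confirmed in this thread, but the two-map version removes the ambiguity.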
Hi Ali,

I am not able to read using inputArgs "/"; the error message is attached. The directory structure is:

<cdap>/namespace/<nameofnamespace>/data/cdap/user/file.txt
<cdap>/namespace/<nameofnamespace>/data/cdap/transaction/txns.txt

Another question: how can we create two different RDDs from two different paths in the directory structure shown above? I used the code below, but it is not able to read data from the transaction directory.

FileSetArguments.setInputPaths(inputArgs, "user");
FileSetArguments.setOutputPath(outputArgs, "output");
JavaPairRDD<LongWritable,Text> userRdd = jse.fromDataset("inputdataset", inputArgs);
FileSetArguments.setInputPaths(inputArgs, "transaction");
JavaPairRDD<LongWritable,Text> txRdd = jse.fromDataset("inputdataset", inputArgs);

The error message thrown is the 2017-06-26 log shown above.
FileSetArguments.setInputPaths(inputArgs, "user");
JavaPairRDD<LongWritable,Text> userRdd = jse.fromDataset("inputdataset",inputArgs);
FileSetArguments.setInputPaths(inputArgs, "transaction");
JavaPairRDD<LongWritable,Text> txRdd = jse.fromDataset("inputdataset",inputArgs);
Code snippet2:
Map<String, String> userinputArgs = new HashMap();
Map<String, String> txinputArgs = new HashMap();
Map<String, String> outputArgs = new HashMap();
FileSetArguments.setInputPaths(userinputArgs,"user");
FileSetArguments.setInputPath(txinputArgs,"transaction");
JavaPairRDD<LongWritable,Text> userRdd = jse.fromDataset("inputdataset",userinputArgs);
JavaPairRDD<LongWritable,Text> txRdd = jse.fromDataset("inputdataset",txinputArgs);
Error message:
- show quoted text -
...
To unsubscribe from this group and stop receiving emails from it, send an email to cdap-user+unsubscribe@googlegroups.com.
To post to this group, send email to cdap...@googlegroups.com.
To view this discussion on the web visit https://groups.google.com/d/msgid/cdap-user/f01d72fc-76e5-4710-8fd2-b56f9d0a0311%40googlegroups.com.