I want to run a Spark job on a YARN cluster, and I have put the related Alluxio jars under $HADOOP_HOME/share/hadoop/yarn/.
First, copy a file into Alluxio:
$ bin/alluxio fs copyFromLocal LICENSE /LICENSE
Second, submit a Spark job to the YARN cluster. It fails with: java.lang.NoSuchFieldError: mProtocol
The job cannot read the file from Alluxio, and writing a file from Spark fails in the same way. Strangely, on some machines the Spark job runs successfully and on others it fails...
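A NoSuchFieldError like this usually means two different versions of the Alluxio client end up on the classpath, so the class defining mProtocol is loaded from an unexpected jar. To rule out Spark, the read path can be reproduced through the plain Hadoop FileSystem API. This is only a minimal sketch, not my actual code; the object name AlluxioCheck is made up, and it assumes the Alluxio master listens on port 19998 and the 1.0.1 client jars are on the classpath:

import java.net.URI
import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.{FileSystem, Path}

object AlluxioCheck {
  def main(args: Array[String]) {
    val conf = new Configuration()
    // Map the alluxio:// scheme to Alluxio's Hadoop-compatible client,
    // the same setting the Spark demo below applies via sc.hadoopConfiguration
    conf.set("fs.alluxio.impl", "alluxio.hadoop.FileSystem")
    // args(0) is the Alluxio master IP
    val fs = FileSystem.get(new URI(s"alluxio://${args(0)}:19998/"), conf)
    // Print which jar the FileSystem class was actually loaded from;
    // different jars on different machines would explain the mixed results
    println(fs.getClass.getProtectionDomain.getCodeSource.getLocation)
    val in = fs.open(new Path("/LICENSE"))
    println(in.read()) // read one byte; a NoSuchFieldError here reproduces the problem without Spark
    in.close()
  }
}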
However, when I pass the jars explicitly to spark-submit with --jars, the job succeeds:
$spark-submit --class com.test --master yarn-cluster --num-executors 4 --executor-memory 1g --executor-cores 4 --jars alluxio-core-client-1.0.1.jar,alluxio-examples-1.0.1.jar,alluxio-underfs-hdfs-1.0.1.jar,alluxio-core-client-internal-1.0.1.jar,alluxio-keyvalue-client-internal-1.0.1.jar,alluxio-underfs-local-1.0.1.jar,alluxio-core-common-1.0.1.jar,alluxio-keyvalue-common-1.0.1.jar,alluxio-underfs-s3-1.0.1.jar /test.jar $LocalIP
My Spark Test Demo:
import org.apache.log4j.Logger
import org.apache.spark.{SparkConf, SparkContext}

object test {
  private val LOG = Logger.getLogger(this.getClass)

  def main(args: Array[String]) {
    LOG.error("======================================")
    val ip = args(0)
    val alluxioPath = s"alluxio://$ip:19998/LICENSE"
    val conf = new SparkConf().setAppName("test")
    val sc = new SparkContext(conf)
    // Map the alluxio:// scheme to Alluxio's Hadoop-compatible FileSystem
    sc.hadoopConfiguration.set("fs.alluxio.impl", "alluxio.hadoop.FileSystem")

    // READ a file from Alluxio
    val s = sc.textFile(alluxioPath)
    LOG.error(s.first())

    // WRITE a DataFrame into Alluxio
    val sqlContext = new org.apache.spark.sql.SQLContext(sc)
    import sqlContext.implicits._
    // Create a simple DataFrame and store it as Parquet
    val df1 = sc.makeRDD(1 to 5).map(i => (i, i * 2)).toDF("single", "double")
    df1.write.parquet(s"alluxio://$ip:19998/test")
    LOG.error("SUCCESS!")
  }
}
And here is the error message:
16/06/15 13:39:47 ERROR ApplicationMaster: User class threw exception: org.apache.spark.sql.AnalysisException: path alluxio://208.208.102.230:19998/test already exists.;
org.apache.spark.sql.AnalysisException: path alluxio://208.208.102.230:19998/test already exists.;
at org.apache.spark.sql.execution.datasources.InsertIntoHadoopFsRelation.run(InsertIntoHadoopFsRelation.scala:76)
at org.apache.spark.sql.execution.ExecutedCommand.sideEffectResult$lzycompute(commands.scala:57)
at org.apache.spark.sql.execution.ExecutedCommand.sideEffectResult(commands.scala:57)
at org.apache.spark.sql.execution.ExecutedCommand.doExecute(commands.scala:69)
at org.apache.spark.sql.execution.SparkPlan$$anonfun$execute$5.apply(SparkPlan.scala:140)
at org.apache.spark.sql.execution.SparkPlan$$anonfun$execute$5.apply(SparkPlan.scala:138)
at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:147)
at org.apache.spark.sql.execution.SparkPlan.execute(SparkPlan.scala:138)
at org.apache.spark.sql.SQLContext$QueryExecution.toRdd$lzycompute(SQLContext.scala:933)
at org.apache.spark.sql.SQLContext$QueryExecution.toRdd(SQLContext.scala:933)
at org.apache.spark.sql.execution.datasources.ResolvedDataSource$.apply(ResolvedDataSource.scala:197)
at org.apache.spark.sql.DataFrameWriter.save(DataFrameWriter.scala:146)
at org.apache.spark.sql.DataFrameWriter.save(DataFrameWriter.scala:137)
at org.apache.spark.sql.DataFrameWriter.parquet(DataFrameWriter.scala:304)
at com.uniview.salut.spark.traffic.test$.main(test.scala:46)
at com.uniview.salut.spark.traffic.test.main(test.scala)
at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
at java.lang.reflect.Method.invoke(Method.java:497)
at org.apache.spark.deploy.yarn.ApplicationMaster$$anon$2.run(ApplicationMaster.scala:525)
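This last AnalysisException looks unrelated to the classpath problem: by default DataFrameWriter refuses to write to a path that already exists, and alluxio://208.208.102.230:19998/test was presumably left behind by an earlier (possibly partial) run. A minimal variant of the write that tolerates re-runs, assuming overwriting the old output is acceptable:

import org.apache.spark.sql.SaveMode

// Overwrite output left behind by a previous run instead of failing
df1.write.mode(SaveMode.Overwrite).parquet(s"alluxio://$ip:19998/test")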