org.apache.spark.SparkException: Job aborted due to stage failure: Task 1 in stage 0.0 failed 4 times, most recent failure: Lost task 1.3 in stage 0.0 (TID 4, dcsac028): java.io.IOException: Could not read footer: java.lang.IllegalArgumentException: No enum constant alluxio.exception.AlluxioExceptionType.alluxio.exception.BlockDoesNotExistException at org.apache.parquet.hadoop.ParquetFileReader.readAllFootersInParallel(ParquetFileReader.java:247) at org.apache.spark.sql.execution.datasources.parquet.ParquetRelation$$anonfun$27.apply(ParquetRelation.scala:786) at org.apache.spark.sql.execution.datasources.parquet.ParquetRelation$$anonfun$27.apply(ParquetRelation.scala:775) at org.apache.spark.rdd.RDD$$anonfun$mapPartitions$1$$anonfun$apply$20.apply(RDD.scala:710) at org.apache.spark.rdd.RDD$$anonfun$mapPartitions$1$$anonfun$apply$20.apply(RDD.scala:710) at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:38) at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:306) at org.apache.spark.rdd.RDD.iterator(RDD.scala:270) at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:66) at org.apache.spark.scheduler.Task.run(Task.scala:89) at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:214) at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1145) at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:615) at java.lang.Thread.run(Thread.java:745)Caused by: java.lang.IllegalArgumentException: No enum constant alluxio.exception.AlluxioExceptionType.alluxio.exception.BlockDoesNotExistException at java.lang.Enum.valueOf(Enum.java:236) at alluxio.exception.AlluxioExceptionType.valueOf(AlluxioExceptionType.java:19) at alluxio.exception.AlluxioException.from(AlluxioException.java:69) at alluxio.AbstractClient.retryRPC(AbstractClient.java:324) at alluxio.client.block.BlockWorkerClient.cancelBlock(BlockWorkerClient.java:156) at 
alluxio.client.block.RemoteBlockOutStream.cancel(RemoteBlockOutStream.java:83) at alluxio.client.file.FileInStream.closeCacheStream(FileInStream.java:288) at alluxio.client.file.FileInStream.close(FileInStream.java:117) at alluxio.hadoop.HdfsFileInputStream.close(HdfsFileInputStream.java:114) at java.io.FilterInputStream.close(FilterInputStream.java:181) at org.apache.parquet.hadoop.ParquetFileReader.readFooter(ParquetFileReader.java:432) at org.apache.parquet.hadoop.ParquetFileReader$2.call(ParquetFileReader.java:237) at org.apache.parquet.hadoop.ParquetFileReader$2.call(ParquetFileReader.java:233) at java.util.concurrent.FutureTask.run(FutureTask.java:262) ... 3 more
Driver stacktrace: at org.apache.spark.scheduler.DAGScheduler.org$apache$spark$scheduler$DAGScheduler$$failJobAndIndependentStages(DAGScheduler.scala:1431) at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1419) at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1418) at scala.collection.mutable.ResizableArray$class.foreach(ResizableArray.scala:59) at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:48) at org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:1418) at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:799) at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:799) at scala.Option.foreach(Option.scala:257) at org.apache.spark.scheduler.DAGScheduler.handleTaskSetFailed(DAGScheduler.scala:799) at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.doOnReceive(DAGScheduler.scala:1640) at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:1599) at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:1588) at org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:48) at org.apache.spark.scheduler.DAGScheduler.runJob(DAGScheduler.scala:620) at org.apache.spark.SparkContext.runJob(SparkContext.scala:1832) at org.apache.spark.SparkContext.runJob(SparkContext.scala:1845) at org.apache.spark.SparkContext.runJob(SparkContext.scala:1858) at org.apache.spark.SparkContext.runJob(SparkContext.scala:1929) at org.apache.spark.rdd.RDD$$anonfun$collect$1.apply(RDD.scala:927) at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:150) at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:111) at org.apache.spark.rdd.RDD.withScope(RDD.scala:316) at org.apache.spark.rdd.RDD.collect(RDD.scala:926) at 
org.apache.spark.sql.execution.datasources.parquet.ParquetRelation$.mergeSchemasInParallel(ParquetRelation.scala:799) at org.apache.spark.sql.execution.datasources.parquet.ParquetRelation$MetadataCache.org$apache$spark$sql$execution$datasources$parquet$ParquetRelation$MetadataCache$$readSchema(ParquetRelation.scala:517) at org.apache.spark.sql.execution.datasources.parquet.ParquetRelation$MetadataCache$$anonfun$12.apply(ParquetRelation.scala:421) at org.apache.spark.sql.execution.datasources.parquet.ParquetRelation$MetadataCache$$anonfun$12.apply(ParquetRelation.scala:421) at scala.Option.orElse(Option.scala:289) at org.apache.spark.sql.execution.datasources.parquet.ParquetRelation$MetadataCache.refresh(ParquetRelation.scala:421) at org.apache.spark.sql.execution.datasources.parquet.ParquetRelation.org$apache$spark$sql$execution$datasources$parquet$ParquetRelation$$metadataCache$lzycompute(ParquetRelation.scala:145) at org.apache.spark.sql.execution.datasources.parquet.ParquetRelation.org$apache$spark$sql$execution$datasources$parquet$ParquetRelation$$metadataCache(ParquetRelation.scala:143) at org.apache.spark.sql.execution.datasources.parquet.ParquetRelation$$anonfun$6.apply(ParquetRelation.scala:202) at org.apache.spark.sql.execution.datasources.parquet.ParquetRelation$$anonfun$6.apply(ParquetRelation.scala:202) at scala.Option.getOrElse(Option.scala:121) at org.apache.spark.sql.execution.datasources.parquet.ParquetRelation.dataSchema(ParquetRelation.scala:202) at org.apache.spark.sql.sources.HadoopFsRelation.schema$lzycompute(interfaces.scala:636) at org.apache.spark.sql.sources.HadoopFsRelation.schema(interfaces.scala:635) at org.apache.spark.sql.execution.datasources.LogicalRelation.<init>(LogicalRelation.scala:37) at org.apache.spark.sql.SQLContext.baseRelationToDataFrame(SQLContext.scala:442) at org.apache.spark.sql.DataFrameReader.parquet(DataFrameReader.scala:316) ... 
48 elidedCaused by: java.io.IOException: Could not read footer: java.lang.IllegalArgumentException: No enum constant alluxio.exception.AlluxioExceptionType.alluxio.exception.BlockDoesNotExistException at org.apache.parquet.hadoop.ParquetFileReader.readAllFootersInParallel(ParquetFileReader.java:247) at org.apache.spark.sql.execution.datasources.parquet.ParquetRelation$$anonfun$27.apply(ParquetRelation.scala:786) at org.apache.spark.sql.execution.datasources.parquet.ParquetRelation$$anonfun$27.apply(ParquetRelation.scala:775) at org.apache.spark.rdd.RDD$$anonfun$mapPartitions$1$$anonfun$apply$20.apply(RDD.scala:710) at org.apache.spark.rdd.RDD$$anonfun$mapPartitions$1$$anonfun$apply$20.apply(RDD.scala:710) at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:38) at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:306) at org.apache.spark.rdd.RDD.iterator(RDD.scala:270) at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:66) at org.apache.spark.scheduler.Task.run(Task.scala:89) at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:214) at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1145) at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:615) at java.lang.Thread.run(Thread.java:745)Caused by: java.lang.IllegalArgumentException: No enum constant alluxio.exception.AlluxioExceptionType.alluxio.exception.BlockDoesNotExistException at java.lang.Enum.valueOf(Enum.java:236) at alluxio.exception.AlluxioExceptionType.valueOf(AlluxioExceptionType.java:19) at alluxio.exception.AlluxioException.from(AlluxioException.java:69) at alluxio.AbstractClient.retryRPC(AbstractClient.java:324) at alluxio.client.block.BlockWorkerClient.cancelBlock(BlockWorkerClient.java:156) at alluxio.client.block.RemoteBlockOutStream.cancel(RemoteBlockOutStream.java:83) at alluxio.client.file.FileInStream.closeCacheStream(FileInStream.java:288) at 
alluxio.client.file.FileInStream.close(FileInStream.java:117) at alluxio.hadoop.HdfsFileInputStream.close(HdfsFileInputStream.java:114) at java.io.FilterInputStream.close(FilterInputStream.java:181) at org.apache.parquet.hadoop.ParquetFileReader.readFooter(ParquetFileReader.java:432) at org.apache.parquet.hadoop.ParquetFileReader$2.call(ParquetFileReader.java:237) at org.apache.parquet.hadoop.ParquetFileReader$2.call(ParquetFileReader.java:233) at java.util.concurrent.FutureTask.run(FutureTask.java:262) ... 3 more
val df = sqlContext.read.parquet("alluxio://dcsac035:19998/directory/file.parquet")
#!/usr/bin/env bash
# This file contains environment variables required to run Alluxio. Copy it as alluxio-env.sh and
# edit that to configure Alluxio for your site. At a minimum,
# the following variables should be set:
#
# - JAVA_HOME, to point to your Java installation
# - ALLUXIO_MASTER_ADDRESS, to bind the master to a different IP address or hostname
# - ALLUXIO_UNDERFS_ADDRESS, to set the under filesystem address.
# - ALLUXIO_WORKER_MEMORY_SIZE, to set how much memory to use (e.g. 1000mb, 2gb) per worker
# - ALLUXIO_RAM_FOLDER, to set where worker stores in memory data
#
# Site-specific values. Deliberately NOT exported here: they are re-exported
# further below via the ${VAR:-default} fallbacks, so leaving any of them
# unset there falls back to the shipped defaults.
JAVA_HOME=/usr/lib/jvm/java-8-oracle
ALLUXIO_MASTER_ADDRESS=dcsac035
ALLUXIO_UNDERFS_ADDRESS=hdfs://dcsac022:8020
ALLUXIO_WORKER_MEMORY_SIZE=100gb
ALLUXIO_RAM_FOLDER=/dev/shm
# Support for Multihomed Networks.
# You can specify a hostname to bind each of the services. If a wildcard
# is applied, you should select one of the network interfaces and use its hostname to connect to the service.
# If no hostname is defined, Alluxio will automatically select an externally visible localhost name.
# The various possibilities are shown in the following table:
#
# +--------------------+------------------------+---------------------+
# | ALLUXIO_*_HOSTNAME | ALLUXIO_*_BIND_HOST | Actual Connect Host |
# +--------------------+------------------------+---------------------+
# | hostname | hostname | hostname |
# | not defined | hostname | hostname |
# | hostname | 0.0.0.0 or not defined | hostname |
# | not defined | 0.0.0.0 or not defined | localhost |
# +--------------------+------------------------+---------------------+
#
# Configuration Examples:
#
# Environment variables for service bind
# ALLUXIO_MASTER_BIND_HOST=${ALLUXIO_MASTER_BIND_HOST:-$(hostname -A | cut -d" " -f1)}
# ALLUXIO_MASTER_WEB_BIND_HOST=${ALLUXIO_MASTER_WEB_BIND_HOST:-0.0.0.0}
# ALLUXIO_WORKER_BIND_HOST=${ALLUXIO_WORKER_BIND_HOST:-$(hostname -A | cut -d" " -f1)}
# ALLUXIO_WORKER_DATA_BIND_HOST=${ALLUXIO_WORKER_DATA_BIND_HOST:-$(hostname -A | cut -d" " -f1)}
# ALLUXIO_WORKER_WEB_BIND_HOST=${ALLUXIO_WORKER_WEB_BIND_HOST:-0.0.0.0}
#
# Environment variables for service connection
# ALLUXIO_MASTER_HOSTNAME=${ALLUXIO_MASTER_HOSTNAME:-$(hostname -A | cut -d" " -f1)}
# ALLUXIO_MASTER_WEB_HOSTNAME=${ALLUXIO_MASTER_WEB_HOSTNAME:-$(hostname -A | cut -d" " -f1)}
# ALLUXIO_WORKER_HOSTNAME=${ALLUXIO_WORKER_HOSTNAME:-$(hostname -A | cut -d" " -f1)}
# ALLUXIO_WORKER_DATA_HOSTNAME=${ALLUXIO_WORKER_DATA_HOSTNAME:-$(hostname -A | cut -d" " -f1)}
# ALLUXIO_WORKER_WEB_HOSTNAME=${ALLUXIO_WORKER_WEB_HOSTNAME:-$(hostname -A | cut -d" " -f1)}
#
# Uncomment this section to add a local installation of Hadoop to Alluxio's CLASSPATH.
# The hadoop command must be in the path to automatically populate the Hadoop classpath.
#
# if type "hadoop" > /dev/null 2>&1; then
# export HADOOP_ALLUXIO_CLASSPATH=$(hadoop classpath)
# fi
# export ALLUXIO_CLASSPATH=${ALLUXIO_CLASSPATH:-${HADOOP_ALLUXIO_CLASSPATH}}
#
# Pick platform-appropriate defaults for JAVA_HOME and the ramdisk location.
if [[ $(uname -s) == Darwin ]]; then
# Assuming Mac OS X
export JAVA_HOME=${JAVA_HOME:-$(/usr/libexec/java_home)}
export ALLUXIO_RAM_FOLDER=/Volumes/ramdisk
# Seed ALLUXIO_JAVA_OPTS here; the += append below adds to this value.
export ALLUXIO_JAVA_OPTS="-Djava.security.krb5.realm= -Djava.security.krb5.kdc="
else
# Assuming Linux
if [[ -z "$JAVA_HOME" ]]; then
if [ -d /usr/lib/jvm/java-7-oracle ]; then
export JAVA_HOME=/usr/lib/jvm/java-7-oracle
else
# openjdk will set this
if [[ -d /usr/lib/jvm/jre-1.7.0 ]]; then
export JAVA_HOME=/usr/lib/jvm/jre-1.7.0
fi
fi
fi
if [[ -z "${ALLUXIO_RAM_FOLDER}" ]]; then
export ALLUXIO_RAM_FOLDER="/mnt/ramdisk"
fi
fi
# Last resort: derive JAVA_HOME from the java binary found on PATH.
if [[ -z "${JAVA_HOME}" ]]; then
export JAVA_HOME="$(dirname $(which java))/.."
fi
export JAVA="${JAVA_HOME}/bin/java"
# Export the effective values, falling back to defaults when unset above.
export ALLUXIO_MASTER_ADDRESS=${ALLUXIO_MASTER_ADDRESS:-localhost}
export ALLUXIO_UNDERFS_ADDRESS=${ALLUXIO_UNDERFS_ADDRESS:-${ALLUXIO_HOME}/underFSStorage}
#export ALLUXIO_UNDERFS_ADDRESS=${ALLUXIO_UNDERFS_ADDRESS:-hdfs://localhost:9000/alluxio}
export ALLUXIO_WORKER_MEMORY_SIZE=${ALLUXIO_WORKER_MEMORY_SIZE:-1GB}
export ALLUXIO_SSH_FOREGROUND=${ALLUXIO_SSH_FOREGROUND:-"yes"}
export ALLUXIO_WORKER_SLEEP=${ALLUXIO_WORKER_SLEEP:-"0.02"}
# Prepend Alluxio classes before classes specified by ALLUXIO_CLASSPATH
# in the Java classpath. May be necessary if there are jar conflicts
#export ALLUXIO_PREPEND_ALLUXIO_CLASSES=${ALLUXIO_PREPEND_ALLUXIO_CLASSES:-"yes"}
# Where log files are stored. $ALLUXIO_HOME/logs by default.
#export ALLUXIO_LOGS_DIR=${ALLUXIO_LOGS_DIR:-${ALLUXIO_HOME}/logs}
# Directory containing this script; used to locate log4j.properties below.
CONF_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )"
# JVM system properties, appended to whatever ALLUXIO_JAVA_OPTS was set above
# (only the Darwin branch seeds it; on Linux this append defines the value).
# NOTE(review): the alluxio.user.* properties below (readtype/writetype/
# cache.incomplete.block) are client-side settings. Placing them here affects
# only JVMs launched with these opts (master, worker, Alluxio shell);
# Spark executors do not read this file, so they must also be passed to the
# Spark application (e.g. via its own extraJavaOptions or alluxio-site
# properties on the Spark classpath) — TODO confirm against the Alluxio
# "Running Spark on Alluxio" docs; this mismatch is a plausible cause of the
# incomplete-block failures seen in the attached stack traces.
export ALLUXIO_JAVA_OPTS+="
-Dlog4j.configuration=file:${CONF_DIR}/log4j.properties
-Dalluxio.worker.tieredstore.levels=1
-Dalluxio.worker.tieredstore.level0.alias=MEM
-Dalluxio.worker.tieredstore.level0.dirs.path=${ALLUXIO_RAM_FOLDER}
-Dalluxio.worker.tieredstore.level0.dirs.quota=${ALLUXIO_WORKER_MEMORY_SIZE}
-Dalluxio.underfs.address=${ALLUXIO_UNDERFS_ADDRESS}
-Dalluxio.worker.memory.size=${ALLUXIO_WORKER_MEMORY_SIZE}
-Dalluxio.master.hostname=${ALLUXIO_MASTER_ADDRESS}
-Dorg.apache.jasper.compiler.disablejsr199=true
-Djava.net.preferIPv4Stack=true
-Dalluxio.user.file.readtype.default=CACHE_PROMOTE
-Dalluxio.user.file.writetype.default=CACHE_THROUGH
-Dalluxio.user.file.cache.incomplete.block=true
-Dalluxio.underfs.hdfs.configuration=/etc/hadoop/conf/core-site.xml
-Dalluxio.debug=true
"
# Master specific parameters. Default to ALLUXIO_JAVA_OPTS.
export ALLUXIO_MASTER_JAVA_OPTS="${ALLUXIO_JAVA_OPTS}"
# Worker specific parameters that will be shared to all workers. Default to ALLUXIO_JAVA_OPTS.
export ALLUXIO_WORKER_JAVA_OPTS="${ALLUXIO_JAVA_OPTS}"
I forgot to note that I have built Alluxio from source based on this pull request: https://github.com/Alluxio/alluxio/pull/3089, as it seemed to address the incomplete-block issue, but I have had no luck.
--
You received this message because you are subscribed to the Google Groups "Alluxio Users" group.
To unsubscribe from this group and stop receiving emails from it, send an email to alluxio-user...@googlegroups.com.
For more options, visit https://groups.google.com/d/optout.
I have -Dalluxio.user.file.cache.incomplete.block=true as part of my ALLUXIO_JAVA_OPTS in alluxio-env.sh. Is that what you're referring to?
scala> val df = sqlContext.read.parquet("alluxio://dcsac035:19998/daily/2016-01-01")
16/05/06 13:46:28 INFO logger.type: initialize(alluxio://dcsac035:19998/daily/2016-01-01,
Configuration: core-default.xml, core-site.xml, mapred-default.xml,
mapred-site.xml, yarn-default.xml, yarn-site.xml, hdfs-default.xml,
hdfs-site.xml). Connecting to Alluxio:
alluxio://dcsac035:19998/daily/2016-01-01
16/05/06 13:46:28 INFO logger.type: Loading Alluxio properties from Hadoop configuration: {}
16/05/06 13:46:28 INFO logger.type: alluxio://dcsac035:19998 alluxio://dcsac035:19998
16/05/06 13:46:28 INFO logger.type: getWorkingDirectory: /
16/05/06 13:46:28 INFO logger.type: getWorkingDirectory: /
16/05/06 13:46:28 INFO parquet.ParquetRelation: Listing alluxio://dcsac035:19998/daily/2016-01-01 on driver
16/05/06 13:46:28 INFO logger.type: listStatus(alluxio://dcsac035:19998/daily/2016-01-01)
16/05/06 13:46:28 INFO logger.type: Alluxio client (version 1.1.0-SNAPSHOT) is trying to connect with FileSystemMasterClient master @ dcsac035/10.5.151.135:19998
16/05/06 13:46:28 INFO logger.type: Client registered with FileSystemMasterClient master @ dcsac035/10.5.151.135:19998
java.lang.AssertionError: assertion failed: No predefined schema found, and no Parquet data files or summary files found under alluxio://dcsac035:19998/daily/2016-01-01.
at scala.Predef$.assert(Predef.scala:165)
at org.apache.spark.sql.execution.datasources.parquet.ParquetRelation$MetadataCache.org$apache$spark$sql$execution$datasources$parquet$ParquetRelation$MetadataCache$$readSchema(ParquetRelation.scala:512)
at org.apache.spark.sql.execution.datasources.parquet.ParquetRelation$MetadataCache$$anonfun$12.apply(ParquetRelation.scala:421)
at org.apache.spark.sql.execution.datasources.parquet.ParquetRelation$MetadataCache$$anonfun$12.apply(ParquetRelation.scala:421)
at scala.Option.orElse(Option.scala:289)
at org.apache.spark.sql.execution.datasources.parquet.ParquetRelation$MetadataCache.refresh(ParquetRelation.scala:421)
at org.apache.spark.sql.execution.datasources.parquet.ParquetRelation.org$apache$spark$sql$execution$datasources$parquet$ParquetRelation$$metadataCache$lzycompute(ParquetRelation.scala:145)
at org.apache.spark.sql.execution.datasources.parquet.ParquetRelation.org$apache$spark$sql$execution$datasources$parquet$ParquetRelation$$metadataCache(ParquetRelation.scala:143)
at org.apache.spark.sql.execution.datasources.parquet.ParquetRelation$$anonfun$6.apply(ParquetRelation.scala:202)
at org.apache.spark.sql.execution.datasources.parquet.ParquetRelation$$anonfun$6.apply(ParquetRelation.scala:202)
at scala.Option.getOrElse(Option.scala:121)
at org.apache.spark.sql.execution.datasources.parquet.ParquetRelation.dataSchema(ParquetRelation.scala:202)
at org.apache.spark.sql.sources.HadoopFsRelation.schema$lzycompute(interfaces.scala:636)
at org.apache.spark.sql.sources.HadoopFsRelation.schema(interfaces.scala:635)
at org.apache.spark.sql.execution.datasources.LogicalRelation.<init>(LogicalRelation.scala:37)
at org.apache.spark.sql.SQLContext.baseRelationToDataFrame(SQLContext.scala:442)
at org.apache.spark.sql.DataFrameReader.parquet(DataFrameReader.scala:316)
... 48 elided
scala> val df = sqlContext.read.parquet("alluxio://dcsac035:19998/daily/2016-01-01")
16/05/06 13:56:10 INFO logger.type: initialize(alluxio://dcsac035:19998/daily/2016-01-01,
Configuration: core-default.xml, core-site.xml, mapred-default.xml,
mapred-site.xml, yarn-default.xml, yarn-site.xml, hdfs-default.xml,
hdfs-site.xml). Connecting to Alluxio:
alluxio://dcsac035:19998/daily/2016-01-01
16/05/06 13:56:10 INFO logger.type: Loading Alluxio properties from Hadoop configuration: {}
16/05/06 13:56:10 INFO logger.type: alluxio://dcsac035:19998 alluxio://dcsac035:19998
16/05/06 13:56:10 INFO logger.type: getWorkingDirectory: /
16/05/06 13:56:10 INFO logger.type: getWorkingDirectory: /
16/05/06 13:56:10 INFO parquet.ParquetRelation: Listing alluxio://dcsac035:19998/daily/2016-01-01 on driver
16/05/06 13:56:10 INFO logger.type: listStatus(alluxio://dcsac035:19998/daily/2016-01-01)
16/05/06 13:56:10 INFO logger.type: Alluxio client (version 1.1.0-SNAPSHOT) is trying to connect with FileSystemMasterClient master @ dcsac035/10.5.151.135:19998
16/05/06 13:56:10 INFO logger.type: Client registered with FileSystemMasterClient master @ dcsac035/10.5.151.135:19998
16/05/06 13:56:11 INFO spark.SparkContext: Starting job: parquet at <console>:22
16/05/06 13:56:11 INFO scheduler.DAGScheduler: Got job 0 (parquet at <console>:22) with 8 output partitions
16/05/06 13:56:11 INFO scheduler.DAGScheduler: Final stage: ResultStage 0 (parquet at <console>:22)
16/05/06 13:56:11 INFO scheduler.DAGScheduler: Parents of final stage: List()
16/05/06 13:56:11 INFO scheduler.DAGScheduler: Missing parents: List()
16/05/06 13:56:11 INFO scheduler.DAGScheduler: Submitting ResultStage 0 (MapPartitionsRDD[1] at parquet at <console>:22), which has no missing parents
16/05/06 13:56:11 INFO storage.MemoryStore: Block broadcast_0 stored as values in memory (estimated size 67.3 KB, free 67.3 KB)
16/05/06 13:56:11 INFO storage.MemoryStore: Block broadcast_0_piece0 stored as bytes in memory (estimated size 23.7 KB, free 91.1 KB)
16/05/06 13:56:11 INFO storage.BlockManagerInfo: Added broadcast_0_piece0 in memory on 10.5.151.121:47578 (size: 23.7 KB, free: 5.5 GB)
16/05/06 13:56:11 INFO spark.SparkContext: Created broadcast 0 from broadcast at DAGScheduler.scala:1006
16/05/06 13:56:11 INFO scheduler.DAGScheduler: Submitting 8 missing tasks from ResultStage 0 (MapPartitionsRDD[1] at parquet at <console>:22)
16/05/06 13:56:11 INFO cluster.YarnScheduler: Adding task set 0.0 with 8 tasks
16/05/06 13:56:11 INFO scheduler.FairSchedulableBuilder: Added task set TaskSet_0 tasks to pool default
16/05/06 13:56:11 INFO scheduler.TaskSetManager: Starting task 0.0 in stage 0.0 (TID 0, dcsac027, partition 0,PROCESS_LOCAL, 1894 bytes)
16/05/06 13:56:11 INFO scheduler.TaskSetManager: Starting task 1.0 in stage 0.0 (TID 1, dcsac023, partition 1,PROCESS_LOCAL, 1894 bytes)
16/05/06 13:56:11 INFO scheduler.TaskSetManager: Starting task 2.0 in stage 0.0 (TID 2, dcsac031, partition 2,PROCESS_LOCAL, 1894 bytes)
16/05/06 13:56:11 INFO scheduler.TaskSetManager: Starting task 3.0 in stage 0.0 (TID 3, dcsac026, partition 3,PROCESS_LOCAL, 1894 bytes)
16/05/06 13:56:11 INFO scheduler.TaskSetManager: Starting task 4.0 in stage 0.0 (TID 4, dcsac032, partition 4,PROCESS_LOCAL, 1894 bytes)
16/05/06 13:56:11 INFO scheduler.TaskSetManager: Starting task 5.0 in stage 0.0 (TID 5, dcsac030, partition 5,PROCESS_LOCAL, 1894 bytes)
16/05/06 13:56:11 INFO scheduler.TaskSetManager: Starting task 6.0 in stage 0.0 (TID 6, dcsac034, partition 6,PROCESS_LOCAL, 1894 bytes)
16/05/06 13:56:11 INFO scheduler.TaskSetManager: Starting task 7.0 in stage 0.0 (TID 7, dcsac024, partition 7,PROCESS_LOCAL, 1960 bytes)
16/05/06 13:56:12 INFO storage.BlockManagerInfo: Added broadcast_0_piece0 in memory on dcsac032:54069 (size: 23.7 KB, free: 2.7 GB)
16/05/06 13:56:12 INFO storage.BlockManagerInfo: Added broadcast_0_piece0 in memory on dcsac027:38286 (size: 23.7 KB, free: 2.7 GB)
16/05/06 13:56:12 INFO storage.BlockManagerInfo: Added broadcast_0_piece0 in memory on dcsac034:60178 (size: 23.7 KB, free: 2.7 GB)
16/05/06 13:56:12 INFO storage.BlockManagerInfo: Added broadcast_0_piece0 in memory on dcsac026:53379 (size: 23.7 KB, free: 2.7 GB)
16/05/06 13:56:12 INFO storage.BlockManagerInfo: Added broadcast_0_piece0 in memory on dcsac030:60346 (size: 23.7 KB, free: 2.7 GB)
16/05/06 13:56:12 INFO storage.BlockManagerInfo: Added broadcast_0_piece0 in memory on dcsac031:32866 (size: 23.7 KB, free: 2.7 GB)
16/05/06 13:56:12 INFO storage.BlockManagerInfo: Added broadcast_0_piece0 in memory on dcsac023:52648 (size: 23.7 KB, free: 2.7 GB)
16/05/06 13:56:12 INFO storage.BlockManagerInfo: Added broadcast_0_piece0 in memory on dcsac024:56921 (size: 23.7 KB, free: 2.7 GB)
16/05/06 13:56:12 INFO scheduler.TaskSetManager: Finished task 0.0 in stage 0.0 (TID 0) in 629 ms on dcsac027 (1/8)
16/05/06 13:56:12 INFO scheduler.TaskSetManager: Finished task 3.0 in stage 0.0 (TID 3) in 580 ms on dcsac026 (2/8)
16/05/06 13:56:12 INFO scheduler.TaskSetManager: Finished task 5.0 in stage 0.0 (TID 5) in 572 ms on dcsac030 (3/8)
16/05/06 13:56:12 INFO scheduler.TaskSetManager: Finished task 4.0 in stage 0.0 (TID 4) in 577 ms on dcsac032 (4/8)
16/05/06 13:56:12 INFO scheduler.TaskSetManager: Finished task 6.0 in stage 0.0 (TID 6) in 570 ms on dcsac034 (5/8)
16/05/06 13:56:12 INFO scheduler.TaskSetManager: Finished task 2.0 in stage 0.0 (TID 2) in 590 ms on dcsac031 (6/8)
16/05/06 13:56:12 INFO scheduler.TaskSetManager: Finished task 1.0 in stage 0.0 (TID 1) in 609 ms on dcsac023 (7/8)
16/05/06 13:56:13 WARN scheduler.TaskSetManager: Lost task 7.0 in stage 0.0 (TID 7, dcsac024): java.io.IOException: Could not read footer: java.io.IOException: Could not read footer for file FileStatus{path=alluxio://dcsac035:19998/daily/2016-01-01/_common_metadata;
isDirectory=false; length=49271; replication=0; blocksize=0;
modification_time=0; access_time=0; owner=; group=;
permission=rw-rw-rw-; isSymlink=false}
at org.apache.parquet.hadoop.ParquetFileReader.readAllFootersInParallel(ParquetFileReader.java:247)
at org.apache.spark.sql.execution.datasources.parquet.ParquetRelation$$anonfun$27.apply(ParquetRelation.scala:786)
at org.apache.spark.sql.execution.datasources.parquet.ParquetRelation$$anonfun$27.apply(ParquetRelation.scala:775)
at org.apache.spark.rdd.RDD$$anonfun$mapPartitions$1$$anonfun$apply$20.apply(RDD.scala:710)
at org.apache.spark.rdd.RDD$$anonfun$mapPartitions$1$$anonfun$apply$20.apply(RDD.scala:710)
at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:38)
at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:306)
at org.apache.spark.rdd.RDD.iterator(RDD.scala:270)
at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:66)
at org.apache.spark.scheduler.Task.run(Task.scala:89)
at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:214)
at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1145)
at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:615)
at java.lang.Thread.run(Thread.java:745)
Caused by: java.io.IOException: Could not read footer for file FileStatus{path=alluxio://dcsac035:19998/daily/2016-01-01/_common_metadata;
isDirectory=false; length=49271; replication=0; blocksize=0;
modification_time=0; access_time=0; owner=; group=;
permission=rw-rw-rw-; isSymlink=false}
at org.apache.parquet.hadoop.ParquetFileReader$2.call(ParquetFileReader.java:239)
at org.apache.parquet.hadoop.ParquetFileReader$2.call(ParquetFileReader.java:233)
at java.util.concurrent.FutureTask.run(FutureTask.java:262)
... 3 more
Caused by: java.io.IOException: alluxio.exception.BlockDoesNotExistException: TempBlockMeta not found for blockId 496,555,261,952
at alluxio.client.block.RemoteBlockOutStream.cancel(RemoteBlockOutStream.java:67)
at alluxio.client.file.FileInStream.closeCacheStream(FileInStream.java:295)
at alluxio.client.file.FileInStream.close(FileInStream.java:123)
at alluxio.hadoop.HdfsFileInputStream.close(HdfsFileInputStream.java:115)
at java.io.FilterInputStream.close(FilterInputStream.java:181)
at org.apache.parquet.hadoop.ParquetFileReader.readFooter(ParquetFileReader.java:432)
at org.apache.parquet.hadoop.ParquetFileReader$2.call(ParquetFileReader.java:237)
... 5 more
Caused by: alluxio.exception.BlockDoesNotExistException: TempBlockMeta not found for blockId 496,555,261,952
at sun.reflect.NativeConstructorAccessorImpl.newInstance0(Native Method)
at sun.reflect.NativeConstructorAccessorImpl.newInstance(NativeConstructorAccessorImpl.java:57)
at sun.reflect.DelegatingConstructorAccessorImpl.newInstance(DelegatingConstructorAccessorImpl.java:45)
at java.lang.reflect.Constructor.newInstance(Constructor.java:526)
at alluxio.exception.AlluxioException.from(AlluxioException.java:83)
at alluxio.AbstractClient.retryRPC(AbstractClient.java:328)
at alluxio.client.block.BlockWorkerClient.cancelBlock(BlockWorkerClient.java:164)
at alluxio.client.block.RemoteBlockOutStream.cancel(RemoteBlockOutStream.java:65)
... 11 more
16/05/06 13:56:13 INFO scheduler.TaskSetManager: Starting task 7.1 in stage 0.0 (TID 8, dcsac024, partition 7,PROCESS_LOCAL, 1960 bytes)
16/05/06 13:56:13 INFO scheduler.TaskSetManager: Lost task 7.1 in stage 0.0 (TID 8) on executor dcsac024: java.io.IOException (Could not read footer: java.io.IOException: Could not read footer for file FileStatus{path=alluxio://dcsac035:19998/daily/2016-01-01/_common_metadata;
isDirectory=false; length=49271; replication=0; blocksize=0;
modification_time=0; access_time=0; owner=; group=;
permission=rw-rw-rw-; isSymlink=false}) [duplicate 1]
16/05/06 13:56:13 INFO scheduler.TaskSetManager: Starting task 7.2 in stage 0.0 (TID 9, dcsac026, partition 7,PROCESS_LOCAL, 1960 bytes)
16/05/06 13:56:14 INFO scheduler.TaskSetManager: Lost task 7.2 in stage 0.0 (TID 9) on executor dcsac026: java.io.IOException (Could not read footer: java.io.IOException: Could not read footer for file FileStatus{path=alluxio://dcsac035:19998/daily/2016-01-01/_common_metadata;
isDirectory=false; length=49271; replication=0; blocksize=0;
modification_time=0; access_time=0; owner=; group=;
permission=rw-rw-rw-; isSymlink=false}) [duplicate 2]
16/05/06 13:56:14 INFO scheduler.TaskSetManager: Starting task 7.3 in stage 0.0 (TID 10, dcsac026, partition 7,PROCESS_LOCAL, 1960 bytes)
16/05/06 13:56:14 INFO scheduler.TaskSetManager: Lost task 7.3 in stage 0.0 (TID 10) on executor dcsac026: java.io.IOException (Could not read footer: java.io.IOException: Could not read footer for file FileStatus{path=alluxio://dcsac035:19998/daily/2016-01-01/_common_metadata;
isDirectory=false; length=49271; replication=0; blocksize=0;
modification_time=0; access_time=0; owner=; group=;
permission=rw-rw-rw-; isSymlink=false}) [duplicate 3]
16/05/06 13:56:14 ERROR scheduler.TaskSetManager: Task 7 in stage 0.0 failed 4 times; aborting job
16/05/06 13:56:14 INFO cluster.YarnScheduler: Removed TaskSet 0.0, whose tasks have all completed, from pool default
16/05/06 13:56:14 INFO cluster.YarnScheduler: Cancelling stage 0
16/05/06 13:56:14 INFO scheduler.DAGScheduler: ResultStage 0 (parquet at <console>:22) failed in 2.790 s
16/05/06 13:56:14 INFO scheduler.DAGScheduler: Job 0 failed: parquet at <console>:22, took 3.033282 s
org.apache.spark.SparkException: Job aborted due to stage failure: Task 7 in stage 0.0 failed 4 times, most recent failure: Lost task 7.3 in stage 0.0 (TID 10, dcsac026): java.io.IOException: Could not read footer: java.io.IOException: Could not read footer for file FileStatus{path=alluxio://dcsac035:19998/daily/2016-01-01/_common_metadata;
isDirectory=false; length=49271; replication=0; blocksize=0;
modification_time=0; access_time=0; owner=; group=;
permission=rw-rw-rw-; isSymlink=false}
at org.apache.parquet.hadoop.ParquetFileReader.readAllFootersInParallel(ParquetFileReader.java:247)
at org.apache.spark.sql.execution.datasources.parquet.ParquetRelation$$anonfun$27.apply(ParquetRelation.scala:786)
at org.apache.spark.sql.execution.datasources.parquet.ParquetRelation$$anonfun$27.apply(ParquetRelation.scala:775)
at org.apache.spark.rdd.RDD$$anonfun$mapPartitions$1$$anonfun$apply$20.apply(RDD.scala:710)
at org.apache.spark.rdd.RDD$$anonfun$mapPartitions$1$$anonfun$apply$20.apply(RDD.scala:710)
at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:38)
at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:306)
at org.apache.spark.rdd.RDD.iterator(RDD.scala:270)
at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:66)
at org.apache.spark.scheduler.Task.run(Task.scala:89)
at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:214)
at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1145)
at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:615)
at java.lang.Thread.run(Thread.java:745)
Caused by: java.io.IOException: Could not read footer for file FileStatus{path=alluxio://dcsac035:19998/daily/2016-01-01/_common_metadata;
isDirectory=false; length=49271; replication=0; blocksize=0;
modification_time=0; access_time=0; owner=; group=;
permission=rw-rw-rw-; isSymlink=false}
at org.apache.parquet.hadoop.ParquetFileReader$2.call(ParquetFileReader.java:239)
at org.apache.parquet.hadoop.ParquetFileReader$2.call(ParquetFileReader.java:233)
at java.util.concurrent.FutureTask.run(FutureTask.java:262)
... 3 more
Caused by: java.io.IOException: alluxio.exception.BlockDoesNotExistException: TempBlockMeta not found for blockId 496,555,261,952
at alluxio.client.block.RemoteBlockOutStream.cancel(RemoteBlockOutStream.java:67)
at alluxio.client.file.FileInStream.closeCacheStream(FileInStream.java:295)
at alluxio.client.file.FileInStream.close(FileInStream.java:123)
at alluxio.hadoop.HdfsFileInputStream.close(HdfsFileInputStream.java:115)
at java.io.FilterInputStream.close(FilterInputStream.java:181)
at org.apache.parquet.hadoop.ParquetFileReader.readFooter(ParquetFileReader.java:432)
at org.apache.parquet.hadoop.ParquetFileReader$2.call(ParquetFileReader.java:237)
... 5 more
Caused by: alluxio.exception.BlockDoesNotExistException: TempBlockMeta not found for blockId 496,555,261,952
at sun.reflect.NativeConstructorAccessorImpl.newInstance0(Native Method)
at sun.reflect.NativeConstructorAccessorImpl.newInstance(NativeConstructorAccessorImpl.java:57)
at sun.reflect.DelegatingConstructorAccessorImpl.newInstance(DelegatingConstructorAccessorImpl.java:45)
at java.lang.reflect.Constructor.newInstance(Constructor.java:526)
at alluxio.exception.AlluxioException.from(AlluxioException.java:83)
at alluxio.AbstractClient.retryRPC(AbstractClient.java:328)
at alluxio.client.block.BlockWorkerClient.cancelBlock(BlockWorkerClient.java:164)
at alluxio.client.block.RemoteBlockOutStream.cancel(RemoteBlockOutStream.java:65)
Caused by: java.io.IOException: Could not read footer: java.io.IOException: Could not read footer for file FileStatus{path=alluxio://dcsac035:19998/daily/2016-01-01/_common_metadata;
isDirectory=false; length=49271; replication=0; blocksize=0;
modification_time=0; access_time=0; owner=; group=;
permission=rw-rw-rw-; isSymlink=false}
at org.apache.parquet.hadoop.ParquetFileReader.readAllFootersInParallel(ParquetFileReader.java:247)
at org.apache.spark.sql.execution.datasources.parquet.ParquetRelation$$anonfun$27.apply(ParquetRelation.scala:786)
at org.apache.spark.sql.execution.datasources.parquet.ParquetRelation$$anonfun$27.apply(ParquetRelation.scala:775)
at org.apache.spark.rdd.RDD$$anonfun$mapPartitions$1$$anonfun$apply$20.apply(RDD.scala:710)
at org.apache.spark.rdd.RDD$$anonfun$mapPartitions$1$$anonfun$apply$20.apply(RDD.scala:710)
at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:38)
at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:306)
at org.apache.spark.rdd.RDD.iterator(RDD.scala:270)
at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:66)
at org.apache.spark.scheduler.Task.run(Task.scala:89)
at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:214)
at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1145)
at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:615)
at java.lang.Thread.run(Thread.java:745)
Caused by: java.io.IOException: Could not read footer for file FileStatus{path=alluxio://dcsac035:19998/daily/2016-01-01/_common_metadata;
isDirectory=false; length=49271; replication=0; blocksize=0;
modification_time=0; access_time=0; owner=; group=;
permission=rw-rw-rw-; isSymlink=false}
at org.apache.parquet.hadoop.ParquetFileReader$2.call(ParquetFileReader.java:239)
at org.apache.parquet.hadoop.ParquetFileReader$2.call(ParquetFileReader.java:233)
at java.util.concurrent.FutureTask.run(FutureTask.java:262)
... 3 more
Caused by: java.io.IOException: alluxio.exception.BlockDoesNotExistException: TempBlockMeta not found for blockId 496,555,261,952
at alluxio.client.block.RemoteBlockOutStream.cancel(RemoteBlockOutStream.java:67)
at alluxio.client.file.FileInStream.closeCacheStream(FileInStream.java:295)
at alluxio.client.file.FileInStream.close(FileInStream.java:123)
at alluxio.hadoop.HdfsFileInputStream.close(HdfsFileInputStream.java:115)
at java.io.FilterInputStream.close(FilterInputStream.java:181)
at org.apache.parquet.hadoop.ParquetFileReader.readFooter(ParquetFileReader.java:432)
at org.apache.parquet.hadoop.ParquetFileReader$2.call(ParquetFileReader.java:237)
... 5 more
Caused by: alluxio.exception.BlockDoesNotExistException: TempBlockMeta not found for blockId 496,555,261,952
at sun.reflect.NativeConstructorAccessorImpl.newInstance0(Native Method)
at sun.reflect.NativeConstructorAccessorImpl.newInstance(NativeConstructorAccessorImpl.java:57)
at sun.reflect.DelegatingConstructorAccessorImpl.newInstance(DelegatingConstructorAccessorImpl.java:45)
at java.lang.reflect.Constructor.newInstance(Constructor.java:526)
at alluxio.exception.AlluxioException.from(AlluxioException.java:83)
at alluxio.AbstractClient.retryRPC(AbstractClient.java:328)
at alluxio.client.block.BlockWorkerClient.cancelBlock(BlockWorkerClient.java:164)
at alluxio.client.block.RemoteBlockOutStream.cancel(RemoteBlockOutStream.java:65)
... 11 more
scala> val df = sqlContext.read.parquet("alluxio://dcsac035:19998/daily/2016-01-01")
16/05/06 14:05:20 INFO logger.type: getWorkingDirectory: /
16/05/06 14:05:20 INFO logger.type: getWorkingDirectory: /
16/05/06 14:05:20 INFO parquet.ParquetRelation: Listing alluxio://dcsac035:19998/daily/2016-01-01 on driver
16/05/06 14:05:20 INFO logger.type: listStatus(alluxio://dcsac035:19998/daily/2016-01-01)
16/05/06 14:05:20 INFO spark.SparkContext: Starting job: parquet at <console>:22
16/05/06 14:05:20 INFO scheduler.DAGScheduler: Got job 3 (parquet at <console>:22) with 2 output partitions
16/05/06 14:05:20 INFO scheduler.DAGScheduler: Final stage: ResultStage 3 (parquet at <console>:22)
16/05/06 14:05:20 INFO scheduler.DAGScheduler: Parents of final stage: List()
16/05/06 14:05:20 INFO scheduler.DAGScheduler: Missing parents: List()
16/05/06 14:05:20 INFO scheduler.DAGScheduler: Submitting ResultStage 3 (MapPartitionsRDD[7] at parquet at <console>:22), which has no missing parents
16/05/06 14:05:20 INFO storage.MemoryStore: Block broadcast_3 stored as values in memory (estimated size 67.3 KB, free 249.4 KB)
16/05/06 14:05:20 INFO storage.MemoryStore: Block broadcast_3_piece0 stored as bytes in memory (estimated size 23.7 KB, free 273.2 KB)
16/05/06 14:05:20 INFO storage.BlockManagerInfo: Added broadcast_3_piece0 in memory on 10.5.151.121:47578 (size: 23.7 KB, free: 5.5 GB)
16/05/06 14:05:20 INFO spark.SparkContext: Created broadcast 3 from broadcast at DAGScheduler.scala:1006
16/05/06 14:05:20 INFO scheduler.DAGScheduler: Submitting 2 missing tasks from ResultStage 3 (MapPartitionsRDD[7] at parquet at <console>:22)
16/05/06 14:05:20 INFO cluster.YarnScheduler: Adding task set 3.0 with 2 tasks
16/05/06 14:05:20 INFO scheduler.FairSchedulableBuilder: Added task set TaskSet_3 tasks to pool default
16/05/06 14:05:21 INFO spark.ExecutorAllocationManager: Requesting 1 new executor because tasks are backlogged (new desired total will be 1)
16/05/06 14:05:22 INFO spark.ExecutorAllocationManager: Requesting 1 new executor because tasks are backlogged (new desired total will be 2)
16/05/06 14:05:24 INFO cluster.YarnClientSchedulerBackend: Registered executor NettyRpcEndpointRef(null) (dcsac026:33926) with ID 13
16/05/06 14:05:24 INFO spark.ExecutorAllocationManager: New executor 13 has registered (new total is 1)
16/05/06 14:05:24 INFO scheduler.TaskSetManager: Starting task 0.0 in stage 3.0 (TID 21, dcsac026, partition 0,PROCESS_LOCAL, 1894 bytes)
16/05/06 14:05:24 INFO storage.BlockManagerMasterEndpoint: Registering block manager dcsac026:50064 with 2.7 GB RAM, BlockManagerId(13, dcsac026, 50064)
16/05/06 14:05:25 INFO storage.BlockManagerInfo: Added broadcast_3_piece0 in memory on dcsac026:50064 (size: 23.7 KB, free: 2.7 GB)
16/05/06 14:05:25 INFO scheduler.TaskSetManager: Starting task 1.0 in stage 3.0 (TID 22, dcsac026, partition 1,PROCESS_LOCAL, 1960 bytes)
16/05/06 14:05:25 INFO scheduler.TaskSetManager: Finished task 0.0 in stage 3.0 (TID 21) in 487 ms on dcsac026 (1/2)
16/05/06 14:05:26 INFO cluster.YarnClientSchedulerBackend: Registered executor NettyRpcEndpointRef(null) (dcsac024:56234) with ID 14
16/05/06 14:05:26 INFO spark.ExecutorAllocationManager: New executor 14 has registered (new total is 2)
16/05/06 14:05:26 INFO storage.BlockManagerMasterEndpoint: Registering block manager dcsac024:39410 with 2.7 GB RAM, BlockManagerId(14, dcsac024, 39410)
16/05/06 14:05:27 INFO scheduler.TaskSetManager: Finished task 1.0 in stage 3.0 (TID 22) in 2048 ms on dcsac026 (2/2)
16/05/06 14:05:27 INFO cluster.YarnScheduler: Removed TaskSet 3.0, whose tasks have all completed, from pool default
16/05/06 14:05:27 INFO scheduler.DAGScheduler: ResultStage 3 (parquet at <console>:22) finished in 6.307 s
16/05/06 14:05:27 INFO scheduler.DAGScheduler: Job 3 finished: parquet at <console>:22, took 6.347910 s
16/05/06 14:05:27 INFO logger.type: getWorkingDirectory: /
df: org.apache.spark.sql.DataFrame = [...
It is recommended to co-locate Alluxio and Spark (i.e. run Spark on Alluxio workers). You can see a bigger performance gain that way.
Thank you.
Pei
spark.driver.extraJavaOptions="-Dalluxio.user.file.cache.partially.read.block=false -Dalluxio.user.file.readtype.default=CACHE_PROMOTE"
mikew@dcsac035:/opt/alluxio$ ./bin/alluxio loadufs alluxio://dcsac035:19998/daily hdfs://dcsac022:8020/daily
java.io.IOException: ThriftIOException(message:Mount path alluxio://dcsac035:19998/daily shadows an existing path in the default underlying filesystem)
at alluxio.AbstractClient.retryRPC(AbstractClient.java:330)
at alluxio.client.file.FileSystemMasterClient.mount(FileSystemMasterClient.java:258)
at alluxio.client.file.BaseFileSystem.mount(BaseFileSystem.java:234)
at alluxio.client.file.BaseFileSystem.mount(BaseFileSystem.java:225)
at alluxio.client.UfsUtils.loadUfs(UfsUtils.java:79)
at alluxio.client.UfsUtils.main(UfsUtils.java:129)
Caused by: ThriftIOException(message:Mount path alluxio://dcsac035:19998/daily shadows an existing path in the default underlying filesystem)
at alluxio.thrift.FileSystemMasterClientService$mount_result$mount_resultStandardScheme.read(FileSystemMasterClientService.java:13326)
at alluxio.thrift.FileSystemMasterClientService$mount_result$mount_resultStandardScheme.read(FileSystemMasterClientService.java:13303)
at alluxio.thrift.FileSystemMasterClientService$mount_result.read(FileSystemMasterClientService.java:13245)
at org.apache.thrift.TServiceClient.receiveBase(TServiceClient.java:86)
at alluxio.thrift.FileSystemMasterClientService$Client.recv_mount(FileSystemMasterClientService.java:556)
at alluxio.thrift.FileSystemMasterClientService$Client.mount(FileSystemMasterClientService.java:541)
at alluxio.client.file.FileSystemMasterClient$10.call(FileSystemMasterClient.java:261)
at alluxio.client.file.FileSystemMasterClient$10.call(FileSystemMasterClient.java:258)
at alluxio.AbstractClient.retryRPC(AbstractClient.java:326)
... 5 more
Usage: java -cp target/alluxio-1.1.0-SNAPSHOT-jar-with-dependencies.jar alluxio.client.UfsUtils <AlluxioUri> <UfsUri> [<Optional ExcludePathPrefix, separated by ;>]
Example: java -cp target/alluxio-1.1.0-SNAPSHOT-jar-with-dependencies.jar alluxio.client.UfsUtils alluxio://127.0.0.1:19998/a hdfs://localhost:9000/b c
Example: java -cp target/alluxio-1.1.0-SNAPSHOT-jar-with-dependencies.jar alluxio.client.UfsUtils alluxio://127.0.0.1:19998/a file:///b c
Example: java -cp target/alluxio-1.1.0-SNAPSHOT-jar-with-dependencies.jar alluxio.client.UfsUtils alluxio://127.0.0.1:19998/a /b c
In the Alluxio file system, /b will be mounted to /a, and the metadata
for all files under /b will be loaded except for those with prefix c
...
--
You received this message because you are subscribed to the Google Groups "Alluxio Users" group.
To unsubscribe from this group and stop receiving emails from it, send an email to alluxio-user...@googlegroups.com.
For more options, visit https://groups.google.com/d/optout.
spark.driver.extraJavaOptions="-Dalluxio.user.file.cache.partially.read.block=true -Dalluxio.user.file.readtype.default=CACHE_PROMOTE"
data = sqlContext.read.parquet("alluxio://dcsac035:19998/daily/03-01-01")
fdata = data.filter(data.endTime.substr(0,11) == "01 Mar 2016")
fdata.count()
var a = sqlContext.read.parquet("alluxio://Master:19998/data_500m_copy")
[Stage 0:> (0 + 0) / 4]16/05/14 00:54:01 WARN type: Should cache partially read block.
[Stage 0:============================================> (3 + 1) / 4]SLF4J: Failed to load class "org.slf4j.impl.StaticLoggerBinder".
SLF4J: Defaulting to no-operation (NOP) logger implementation
SLF4J: See http://www.slf4j.org/codes.html#StaticLoggerBinder for further details.
a: org.apache.spark.sql.DataFrame = [single: int, double: int]
...
- Error returned:
Enter code here...
16/05/06 13...
--
You received this message because you are subscribed to the Google Groups "Alluxio Users" group.
To unsubscribe from this group and stop receiving emails from it, send an email to alluxio-user...@googlegroups.com.
For more options, visit https://groups.google.com/d/optout.
var a = sqlContext.read.parquet("alluxio://dcsac035:19998/user/keith/file.parquet")
spark.driver.extraJavaOptions="-Dalluxio.user.file.cache.partially.read.block=true -Dalluxio.user.file.readtype.default=CACHE_PROMOTE"
export SPARK_CLASSPATH="/opt/alluxio/alluxio-keith.jar:$SPARK_CLASSPATH". Is this what you're referring to?
-Keith
Pei
- Error returned:
Enter code here...
/
16/05/06...
--
You received this message because you are subscribed to the Google Groups "Alluxio Users" group.
To unsubscribe from this group and stop receiving emails from it, send an email to alluxio-user...@googlegroups.com.
For more options, visit https://groups.google.com/d/optout.
SLF4J: Class path contains multiple SLF4J bindings.
SLF4J: Found binding in [jar:file:/data/3/yarn/nm/filecache/10187/spark16.jar!/org/slf4j/impl/StaticLoggerBinder.class]
SLF4J: Found binding in [jar:file:/opt/cloudera/parcels/CDH-5.5.2-1.cdh5.5.2.p0.4/jars/slf4j-log4j12-1.7.5.jar!/org/slf4j/impl/StaticLoggerBinder.class]
SLF4J: Found binding in [jar:file:/opt/cloudera/parcels/CDH-5.5.2-1.cdh5.5.2.p0.4/jars/avro-tools-1.7.6-cdh5.5.2.jar!/org/slf4j/impl/StaticLoggerBinder.class]
SLF4J: See http://www.slf4j.org/codes.html#multiple_bindings for an explanation.
SLF4J: Actual binding is of type [org.slf4j.impl.Log4jLoggerFactory]
16/05/24 14:27:48 INFO executor.CoarseGrainedExecutorBackend: Registered signal handlers for [TERM, HUP, INT]
16/05/24 14:27:48 INFO spark.SecurityManager: Changing view acls to: yarn,keith
16/05/24 14:27:48 INFO spark.SecurityManager: Changing modify acls to: yarn,keith
16/05/24 14:27:48 INFO spark.SecurityManager: SecurityManager: authentication disabled; ui acls disabled; users with view permissions: Set(yarn, keith); users with modify permissions: Set(yarn, keith)
16/05/24 14:27:49 INFO spark.SecurityManager: Changing view acls to: yarn,keith
16/05/24 14:27:49 INFO spark.SecurityManager: Changing modify acls to: yarn,keith
16/05/24 14:27:49 INFO spark.SecurityManager: SecurityManager: authentication disabled; ui acls disabled; users with view permissions: Set(yarn, keith); users with modify permissions: Set(yarn, keith)
16/05/24 14:27:49 INFO slf4j.Slf4jLogger: Slf4jLogger started
16/05/24 14:27:49 INFO Remoting: Starting remoting
16/05/24 14:27:50 INFO Remoting: Remoting started; listening on addresses :[akka.tcp://sparkExecutorActorSystem@dcsac029:40358]
16/05/24 14:27:50 INFO util.Utils: Successfully started service 'sparkExecutorActorSystem' on port 40358.
16/05/24 14:27:50 INFO storage.DiskBlockManager: Created local directory at /data/0/yarn/nm/usercache/keith/appcache/application_1464015602645_0090/blockmgr-a5940aeb-8af2-402d-9ff5-569cccb99816
16/05/24 14:27:50 INFO storage.DiskBlockManager: Created local directory at /data/1/yarn/nm/usercache/keith/appcache/application_1464015602645_0090/blockmgr-1a78ccd4-fcc7-4dcc-b4d2-2062feef4901
16/05/24 14:27:50 INFO storage.DiskBlockManager: Created local directory at /data/2/yarn/nm/usercache/keith/appcache/application_1464015602645_0090/blockmgr-184ec2d6-cbea-4523-ac49-b851b2bbd99f
16/05/24 14:27:50 INFO storage.DiskBlockManager: Created local directory at /data/3/yarn/nm/usercache/keith/appcache/application_1464015602645_0090/blockmgr-e93e6525-65dc-46fa-b479-a2a77b7dccb1
16/05/24 14:27:50 INFO storage.MemoryStore: MemoryStore started with capacity 2.7 GB
16/05/24 14:27:50 INFO executor.CoarseGrainedExecutorBackend: Connecting to driver: spark://CoarseGrain...@10.5.151.121:59557
16/05/24 14:27:50 INFO executor.CoarseGrainedExecutorBackend: Successfully registered with driver
16/05/24 14:27:50 INFO executor.Executor: Starting executor ID 7 on host 10.5.151.129
16/05/24 14:27:50 INFO util.Utils: Successfully started service 'org.apache.spark.network.netty.NettyBlockTransferService' on port 50573.
16/05/24 14:27:50 INFO netty.NettyBlockTransferService: Server created on 50573
16/05/24 14:27:50 INFO storage.BlockManager: external shuffle service port = 7337
16/05/24 14:27:50 INFO storage.BlockManagerMaster: Trying to register BlockManager
16/05/24 14:27:50 INFO storage.BlockManagerMaster: Registered BlockManager
16/05/24 14:27:50 INFO storage.BlockManager: Registering executor with local external shuffle service.
16/05/24 14:27:50 INFO executor.Executor: Using REPL class URI: http://10.5.151.121:58322
16/05/24 14:28:22 INFO executor.CoarseGrainedExecutorBackend: Got assigned task 7
16/05/24 14:28:22 INFO executor.Executor: Running task 7.0 in stage 0.0 (TID 7)
16/05/24 14:28:22 INFO broadcast.TorrentBroadcast: Started reading broadcast variable 0
16/05/24 14:28:22 INFO storage.MemoryStore: Block broadcast_0_piece0 stored as bytes in memory (estimated size 23.7 KB, free 23.7 KB)
16/05/24 14:28:22 INFO broadcast.TorrentBroadcast: Reading broadcast variable 0 took 100 ms
16/05/24 14:28:22 INFO storage.MemoryStore: Block broadcast_0 stored as values in memory (estimated size 67.3 KB, free 91.1 KB)
16/05/24 14:28:23 INFO logger.type: initialize(alluxio://dcsac035:19998/user/keith/test.parquet/_common_metadata, Configuration: ). Connecting to Alluxio: alluxio://dcsac035:19998/user/keith/test.parquet/_common_metadata
16/05/24 14:28:23 INFO logger.type: Loading Alluxio properties from Hadoop configuration: {}
16/05/24 14:28:23 INFO logger.type: alluxio://dcsac035:19998 alluxio://dcsac035:19998
16/05/24 14:28:23 INFO logger.type: open(alluxio://dcsac035:19998/user/keith/test.parquet/_common_metadata, 65536)
16/05/24 14:28:23 INFO logger.type: Alluxio client (version 1.1.0-SNAPSHOT) is trying to connect with FileSystemMasterClient master @ dcsac035/10.5.151.135:19998
16/05/24 14:28:23 INFO logger.type: Client registered with FileSystemMasterClient master @ dcsac035/10.5.151.135:19998
16/05/24 14:29:23 ERROR logger.type: java.net.SocketTimeoutException: Read timed out
alluxio.org.apache.thrift.transport.TTransportException: java.net.SocketTimeoutException: Read timed out
at alluxio.org.apache.thrift.transport.TIOStreamTransport.read(TIOStreamTransport.java:129)
at alluxio.org.apache.thrift.transport.TTransport.readAll(TTransport.java:86)
at alluxio.org.apache.thrift.transport.TFramedTransport.readFrame(TFramedTransport.java:129)
at alluxio.org.apache.thrift.transport.TFramedTransport.read(TFramedTransport.java:101)
at alluxio.org.apache.thrift.transport.TTransport.readAll(TTransport.java:86)
at alluxio.org.apache.thrift.protocol.TBinaryProtocol.readStringBody(TBinaryProtocol.java:380)
at alluxio.org.apache.thrift.protocol.TBinaryProtocol.readString(TBinaryProtocol.java:374)
at alluxio.org.apache.thrift.protocol.TProtocolDecorator.readString(TProtocolDecorator.java:207)
at alluxio.thrift.FileBlockInfo$FileBlockInfoStandardScheme.read(FileBlockInfo.java:544)
at alluxio.thrift.FileBlockInfo$FileBlockInfoStandardScheme.read(FileBlockInfo.java:507)
at alluxio.thrift.FileBlockInfo.read(FileBlockInfo.java:440)
at alluxio.thrift.FileInfo$FileInfoStandardScheme.read(FileInfo.java:2225)
at alluxio.thrift.FileInfo$FileInfoStandardScheme.read(FileInfo.java:2026)
at alluxio.thrift.FileInfo.read(FileInfo.java:1862)
at alluxio.thrift.FileSystemMasterClientService$getStatus_result$getStatus_resultStandardScheme.read(FileSystemMasterClientService.java:8008)
at alluxio.thrift.FileSystemMasterClientService$getStatus_result$getStatus_resultStandardScheme.read(FileSystemMasterClientService.java:7993)
at alluxio.thrift.FileSystemMasterClientService$getStatus_result.read(FileSystemMasterClientService.java:7932)
at alluxio.org.apache.thrift.TServiceClient.receiveBase(TServiceClient.java:86)
at alluxio.thrift.FileSystemMasterClientService$Client.recv_getStatus(FileSystemMasterClientService.java:398)
at alluxio.thrift.FileSystemMasterClientService$Client.getStatus(FileSystemMasterClientService.java:385)
at alluxio.client.file.FileSystemMasterClient$6.call(FileSystemMasterClient.java:187)
at alluxio.client.file.FileSystemMasterClient$6.call(FileSystemMasterClient.java:184)
at alluxio.AbstractClient.retryRPC(AbstractClient.java:326)
at alluxio.client.file.FileSystemMasterClient.getStatus(FileSystemMasterClient.java:184)
at alluxio.client.file.BaseFileSystem.getStatus(BaseFileSystem.java:175)
at alluxio.client.file.BaseFileSystem.getStatus(BaseFileSystem.java:167)
at alluxio.hadoop.HdfsFileInputStream.<init>(HdfsFileInputStream.java:89)
at alluxio.hadoop.AbstractFileSystem.open(AbstractFileSystem.java:495)
at alluxio.hadoop.FileSystem.open(FileSystem.java:25)
at org.apache.hadoop.fs.FileSystem.open(FileSystem.java:775)
at org.apache.parquet.hadoop.ParquetFileReader.readFooter(ParquetFileReader.java:406)
at org.apache.parquet.hadoop.ParquetFileReader$2.call(ParquetFileReader.java:237)
at org.apache.parquet.hadoop.ParquetFileReader$2.call(ParquetFileReader.java:233)
at java.util.concurrent.FutureTask.run(FutureTask.java:262)
at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1145)
at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:615)
at java.lang.Thread.run(Thread.java:745)
Caused by: java.net.SocketTimeoutException: Read timed out
at java.net.SocketInputStream.socketRead0(Native Method)
at java.net.SocketInputStream.read(SocketInputStream.java:152)
at java.net.SocketInputStream.read(SocketInputStream.java:122)
at java.io.BufferedInputStream.fill(BufferedInputStream.java:235)
at java.io.BufferedInputStream.read1(BufferedInputStream.java:275)
at java.io.BufferedInputStream.read(BufferedInputStream.java:334)
at alluxio.org.apache.thrift.transport.TIOStreamTransport.read(TIOStreamTransport.java:127)
... 36 more
16/05/24 14:29:23 INFO logger.type: Alluxio client (version 1.1.0-SNAPSHOT) is trying to connect with FileSystemMasterClient master @ dcsac035/10.5.151.135:19998
16/05/24 14:29:23 INFO logger.type: Client registered with FileSystemMasterClient master @ dcsac035/10.5.151.135:19998
--
You received this message because you are subscribed to the Google Groups "Alluxio Users" group.
To unsubscribe from this group and stop receiving emails from it, send an email to alluxio-user...@googlegroups.com.
For more options, visit https://groups.google.com/d/optout.