spark_v2.py
from pyspark_cassandra import CassandraSparkContext, Row
from pyspark import SparkContext, SparkConf
from pyspark.sql import SQLContext
import datetime
now = datetime.datetime.now().strftime("%Y%m%d %H:%M:%S")
conf = (SparkConf()
        .setAppName("test_" + now)
        .setMaster("spark://192.168.23.31:7077")
        .set("spark.cassandra.connection.host", "192.168.23.31"))
sc = CassandraSparkContext(conf=conf)
table = sc.cassandraTable("lebara_diameter_codes", "nl_lebara_diameter_codes")
#food_count = table.select("errorcode2001").groupBy("errorcode2001").count()
# Filter on errorcode2001 and pull back a single matching row; the job dies inside take().
count = table.where('errorcode2001 > 1200')
count.take(1)
print(count)
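For comparison only (this is not part of the failing run), the same filter could also be expressed through the connector's DataFrame source via the SQLContext that the script imports but never uses. A minimal sketch, assuming a spark-cassandra-connector build matching this Spark version is already on the driver and executor classpaths; the keyspace and table names are taken from the script above.

# Sketch: alternative read path through the Cassandra DataFrame source.
sql = SQLContext(sc)
df = (sql.read
      .format("org.apache.spark.sql.cassandra")
      .options(keyspace="lebara_diameter_codes", table="nl_lebara_diameter_codes")
      .load())
# Same predicate as table.where(...), then fetch one matching row.
print(df.filter("errorcode2001 > 1200").take(1))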
Output of running spark_v2.py:
16/06/30 16:53:59 ERROR TaskSchedulerImpl: Lost executor 1 on as4: Remote RPC client disassociated. Likely due to containers exceeding thresholds, or network issues. Check driver logs for WARN messages.
16/06/30 16:54:00 ERROR TaskSchedulerImpl: Lost executor 0 on as5: Remote RPC client disassociated. Likely due to containers exceeding thresholds, or network issues. Check driver logs for WARN messages.
16/06/30 16:54:01 ERROR TaskSchedulerImpl: Lost executor 2 on as3: Remote RPC client disassociated. Likely due to containers exceeding thresholds, or network issues. Check driver logs for WARN messages.
16/06/30 16:54:06 ERROR TaskSchedulerImpl: Lost executor 3 on as5: Remote RPC client disassociated. Likely due to containers exceeding thresholds, or network issues. Check driver logs for WARN messages.
16/06/30 16:54:06 ERROR TaskSetManager: Task 1 in stage 1.0 failed 4 times; aborting job
Traceback (most recent call last):
File "/mnt/spark-1.6.1-bin-hadoop2.6/spark_v2.py", line 15, in <module>
count.take(1)
File "build/bdist.linux-x86_64/egg/pyspark_cassandra/rdd.py", line 185, in take
File "build/bdist.linux-x86_64/egg/pyspark_cassandra/rdd.py", line 183, in take
File "/mnt/spark/python/lib/pyspark.zip/pyspark/rdd.py", line 1297, in take
File "/mnt/spark/python/lib/pyspark.zip/pyspark/context.py", line 939, in runJob
File "/mnt/spark/python/lib/py4j-0.9-src.zip/py4j/java_gateway.py", line 813, in __call__
File "/mnt/spark/python/lib/py4j-0.9-src.zip/py4j/protocol.py", line 308, in get_return_value
py4j.protocol.Py4JJavaError: An error occurred while calling z:org.apache.spark.api.python.PythonRDD.runJob.
: org.apache.spark.SparkException: Job aborted due to stage failure: Task 1 in stage 1.0 failed 4 times, most recent failure: Lost task 1.3 in stage 1.0 (TID 7, as5): ExecutorLostFailure (executor 3 exited caused by one of the running tasks) Reason: Remote RPC client disassociated. Likely due to containers exceeding thresholds, or network issues. Check driver logs for WARN messages.
Driver stacktrace:
at org.apache.spark.scheduler.DAGScheduler.org$apache$spark$scheduler$DAGScheduler$$failJobAndIndependentStages(DAGScheduler.scala:1431)
at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1419)
at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1418)
at scala.collection.mutable.ResizableArray$class.foreach(ResizableArray.scala:59)
at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:47)
at org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:1418)
at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:799)
at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:799)
at scala.Option.foreach(Option.scala:236)
at org.apache.spark.scheduler.DAGScheduler.handleTaskSetFailed(DAGScheduler.scala:799)
at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.doOnReceive(DAGScheduler.scala:1640)
at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:1599)
at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:1588)
at org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:48)
at org.apache.spark.scheduler.DAGScheduler.runJob(DAGScheduler.scala:620)
at org.apache.spark.SparkContext.runJob(SparkContext.scala:1832)
at org.apache.spark.SparkContext.runJob(SparkContext.scala:1845)
at org.apache.spark.SparkContext.runJob(SparkContext.scala:1858)
at org.apache.spark.api.python.PythonRDD$.runJob(PythonRDD.scala:393)
at org.apache.spark.api.python.PythonRDD.runJob(PythonRDD.scala)
at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
at java.lang.reflect.Method.invoke(Method.java:498)
at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:231)
at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:381)
at py4j.Gateway.invoke(Gateway.java:259)
at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:133)
at py4j.commands.CallCommand.execute(CallCommand.java:79)
at py4j.GatewayConnection.run(GatewayConnection.java:209)
at java.lang.Thread.run(Thread.java:745)
On the Workers:
16/06/30 16:54:06 ERROR Utils: Uncaught exception in thread stdout writer for python
java.lang.AbstractMethodError: pyspark_cassandra.DeferringRowReader.read(Lcom/datastax/driver/core/Row;Lcom/datastax/spark/connector/CassandraRowMetadata;)Ljava/lang/Object;
at com.datastax.spark.connector.rdd.CassandraTableScanRDD$$anonfun$17.apply(CassandraTableScanRDD.scala:315)
at com.datastax.spark.connector.rdd.CassandraTableScanRDD$$anonfun$17.apply(CassandraTableScanRDD.scala:315)
at scala.collection.Iterator$$anon$11.next(Iterator.scala:328)
at scala.collection.Iterator$$anon$13.next(Iterator.scala:372)
at com.datastax.spark.connector.util.CountingIterator.next(CountingIterator.scala:16)
at scala.collection.Iterator$$anon$11.next(Iterator.scala:328)
at scala.collection.Iterator$GroupedIterator.takeDestructively(Iterator.scala:914)
at scala.collection.Iterator$GroupedIterator.go(Iterator.scala:929)
at scala.collection.Iterator$GroupedIterator.fill(Iterator.scala:968)
at scala.collection.Iterator$GroupedIterator.hasNext(Iterator.scala:972)
at scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:327)
at scala.collection.Iterator$class.foreach(Iterator.scala:727)
at scala.collection.AbstractIterator.foreach(Iterator.scala:1157)
at org.apache.spark.api.python.PythonRDD$.writeIteratorToStream(PythonRDD.scala:452)
at org.apache.spark.api.python.PythonRunner$WriterThread$$anonfun$run$3.apply(PythonRDD.scala:280)
at org.apache.spark.util.Utils$.logUncaughtExceptions(Utils.scala:1765)
at org.apache.spark.api.python.PythonRunner$WriterThread.run(PythonRDD.scala:239)
16/06/30 16:54:06 ERROR SparkUncaughtExceptionHandler: Uncaught exception in thread Thread[stdout writer for python,5,main]
java.lang.AbstractMethodError: pyspark_cassandra.DeferringRowReader.read(Lcom/datastax/driver/core/Row;Lcom/datastax/spark/connector/CassandraRowMetadata;)Ljava/lang/Object;
at com.datastax.spark.connector.rdd.CassandraTableScanRDD$$anonfun$17.apply(CassandraTableScanRDD.scala:315)
at com.datastax.spark.connector.rdd.CassandraTableScanRDD$$anonfun$17.apply(CassandraTableScanRDD.scala:315)
at scala.collection.Iterator$$anon$11.next(Iterator.scala:328)
at scala.collection.Iterator$$anon$13.next(Iterator.scala:372)
at com.datastax.spark.connector.util.CountingIterator.next(CountingIterator.scala:16)
at scala.collection.Iterator$$anon$11.next(Iterator.scala:328)
at scala.collection.Iterator$GroupedIterator.takeDestructively(Iterator.scala:914)
at scala.collection.Iterator$GroupedIterator.go(Iterator.scala:929)
at scala.collection.Iterator$GroupedIterator.fill(Iterator.scala:968)
at scala.collection.Iterator$GroupedIterator.hasNext(Iterator.scala:972)
at scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:327)
at scala.collection.Iterator$class.foreach(Iterator.scala:727)
at scala.collection.AbstractIterator.foreach(Iterator.scala:1157)
at org.apache.spark.api.python.PythonRDD$.writeIteratorToStream(PythonRDD.scala:452)
at org.apache.spark.api.python.PythonRunner$WriterThread$$anonfun$run$3.apply(PythonRDD.scala:280)
at org.apache.spark.util.Utils$.logUncaughtExceptions(Utils.scala:1765)
at org.apache.spark.api.python.PythonRunner$WriterThread.run(PythonRDD.scala:239)
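One reading of the AbstractMethodError above is a binary mismatch: DeferringRowReader in the pyspark_cassandra jar was compiled against an older RowReader interface than the spark-cassandra-connector version running on the executors (the connector now calls the two-argument read(Row, CassandraRowMetadata) form). A hedged sketch of resubmitting with a matching artifact is below; the package coordinate is a placeholder and should be replaced with a pyspark-cassandra build that targets the connector line actually in use.

# Sketch: resubmit with a pyspark-cassandra package built against the cluster's
# spark-cassandra-connector version; the version below is a placeholder.
./bin/spark-submit \
  --master spark://192.168.23.31:7077 \
  --packages TargetHolding:pyspark-cassandra:<version-matching-your-connector> \
  /mnt/spark-1.6.1-bin-hadoop2.6/spark_v2.py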