I have a problem processing ~1000 files from S3, totalling only about 150 MB, on a machine with 8 GB of RAM.
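The job itself is nothing fancy. Roughly, it looks like the sketch below (simplified; the bucket path and the key extraction are placeholders, but the shape -- read from S3, shuffle, count -- is the same):

    import spark.SparkContext
    import spark.SparkContext._   // implicit conversions for pair-RDD operations like groupByKey

    object App {
      def main(args: Array[String]) {
        // Local mode, which matches the spark.LocalScheduler in the trace below.
        val sc = new SparkContext("local[4]", "LogAnalyzer")

        // ~1000 small log files on S3, ~150 MB in total (path is a placeholder).
        val lines = sc.textFile("s3n://my-bucket/logs/2012-08-31/*")

        // Key each line and group -- this is the shuffle whose fetch side
        // dies in SimpleShuffleFetcher with the OOM below.
        val grouped = lines
          .map(line => (line.split("\t")(0), line))
          .groupByKey()

        // count() is what triggers the job (RDD.count in the trace).
        println(grouped.count())
      }
    }

When count() runs, the whole thing dies with: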
12/08/31 17:47:53 WARN io.nio: java.net.SocketException: Transport endpoint is not connected
12/08/31 17:47:53 ERROR spark.LocalScheduler: Exception in task 0
java.lang.OutOfMemoryError: GC overhead limit exceeded
at java.util.Arrays.copyOf(Arrays.java:2894)
at java.lang.AbstractStringBuilder.expandCapacity(AbstractStringBuilder.java:117)
at java.lang.AbstractStringBuilder.append(AbstractStringBuilder.java:532)
at java.lang.StringBuilder.append(StringBuilder.java:206)
at java.io.ObjectInputStream$BlockDataInputStream.readUTFSpan(ObjectInputStream.java:3115)
at java.io.ObjectInputStream$BlockDataInputStream.readUTFBody(ObjectInputStream.java:3023)
at java.io.ObjectInputStream$BlockDataInputStream.readUTF(ObjectInputStream.java:2836)
at java.io.ObjectInputStream.readString(ObjectInputStream.java:1616)
at java.io.ObjectInputStream.readObject0(ObjectInputStream.java:1337)
at java.io.ObjectInputStream.readArray(ObjectInputStream.java:1684)
at java.io.ObjectInputStream.readObject0(ObjectInputStream.java:1340)
at java.io.ObjectInputStream.defaultReadFields(ObjectInputStream.java:1963)
at java.io.ObjectInputStream.readSerialData(ObjectInputStream.java:1887)
at java.io.ObjectInputStream.readOrdinaryObject(ObjectInputStream.java:1770)
at java.io.ObjectInputStream.readObject0(ObjectInputStream.java:1346)
at java.io.ObjectInputStream.defaultReadFields(ObjectInputStream.java:1963)
at java.io.ObjectInputStream.readSerialData(ObjectInputStream.java:1887)
at java.io.ObjectInputStream.readOrdinaryObject(ObjectInputStream.java:1770)
at java.io.ObjectInputStream.readObject0(ObjectInputStream.java:1346)
at java.io.ObjectInputStream.readObject(ObjectInputStream.java:368)
at spark.JavaDeserializationStream.readObject(JavaSerializer.scala:18)
at spark.SimpleShuffleFetcher$$anonfun$fetch$5$$anonfun$apply$1.apply$mcVI$sp(SimpleShuffleFetcher.scala:30)
at spark.SimpleShuffleFetcher$$anonfun$fetch$5$$anonfun$apply$1.apply(SimpleShuffleFetcher.scala:21)
at spark.SimpleShuffleFetcher$$anonfun$fetch$5$$anonfun$apply$1.apply(SimpleShuffleFetcher.scala:21)
at scala.collection.mutable.ResizableArray$class.foreach(ResizableArray.scala:60)
at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:47)
at spark.SimpleShuffleFetcher$$anonfun$fetch$5.apply(SimpleShuffleFetcher.scala:21)
at spark.SimpleShuffleFetcher$$anonfun$fetch$5.apply(SimpleShuffleFetcher.scala:20)
at scala.collection.mutable.ResizableArray$class.foreach(ResizableArray.scala:60)
at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:47)
at spark.SimpleShuffleFetcher.fetch(SimpleShuffleFetcher.scala:20)
at spark.ShuffledRDD.compute(ShuffledRDD.scala:39)
Exception in thread "main" spark.SparkException: Task failed: ResultTask(0, 0), reason: ExceptionFailure(java.lang.OutOfMemoryError: GC overhead limit exceeded)
at spark.DAGScheduler$class.runJob(DAGScheduler.scala:312)
at spark.LocalScheduler.runJob(LocalScheduler.scala:11)
at spark.SparkContext.runJob(SparkContext.scala:284)
at spark.SparkContext.runJob(SparkContext.scala:295)
at spark.SparkContext.runJob(SparkContext.scala:306)
at spark.RDD.count(RDD.scala:215)
at com.celtra.analyzer.LogAnalyzer.analyze(LogAnalyzer.scala:81)
at com.celtra.analyzer.LogAnalyzer.analyzeSufficientS3Logs(LogAnalyzer.scala:55)
at com.celtra.analyzer.App.main(App.scala)
I don't understand how this is possible. It seems to me that 150 MB should easily fit into memory, and even if it didn't, shouldn't the data just spill to disk?
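As a rough sanity check on that expectation (the overhead multiplier below is just my own guess, to account for deserialized Java Strings being UTF-16 plus per-object headers):

    // Back-of-envelope estimate, runnable as a Scala script.
    val rawBytes      = 150L * 1024 * 1024            // ~150 MB of input on S3
    val deserOverhead = 4                             // assume a 3-4x blow-up after deserialization
    val estimatedHeap = rawBytes * deserOverhead
    println(estimatedHeap / (1024 * 1024) + " MB")    // ~600 MB, against 8 GB of RAM

Even with that generous overhead it should be nowhere near the heap limit, which is why the GC overhead error surprises me.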