I didn’t have much time today, but it looks like there’s an issue reading null values in Parquet. This is easily reproducible by running a Sqoop import with the --as-parquet-file option against a table that has a nullable column with no data in it. Sqoop succeeds, but checking the resulting files with parquet-tools throws an error. I was able to produce this stack trace via the copy task used to move the data to a second dataset.
I also changed Sqoop’s Avro schema generator to add null as the default value, but that didn’t help.
INFO jobcontrol.CrunchControlledJob: java.io.IOException: Could not read footer: java.lang.NullPointerException
at parquet.hadoop.ParquetFileReader.readAllFootersInParallel(ParquetFileReader.java:193)
at parquet.hadoop.ParquetFileReader.readAllFootersInParallelUsingSummaryFiles(ParquetFileReader.java:148)
at parquet.hadoop.ParquetInputFormat.getFooters(ParquetInputFormat.java:597)
at parquet.hadoop.ParquetInputFormat.getFooters(ParquetInputFormat.java:573)
at parquet.hadoop.ParquetInputFormat.getSplits(ParquetInputFormat.java:412)
at org.kitesdk.data.spi.filesystem.FileSystemViewKeyInputFormat.getSplits(FileSystemViewKeyInputFormat.java:123)
at org.kitesdk.data.mapreduce.DatasetKeyInputFormat.getSplits(DatasetKeyInputFormat.java:254)
at org.apache.hadoop.mapreduce.JobSubmitter.writeNewSplits(JobSubmitter.java:589)
at org.apache.hadoop.mapreduce.JobSubmitter.writeSplits(JobSubmitter.java:606)
at org.apache.hadoop.mapreduce.JobSubmitter.submitJobInternal(JobSubmitter.java:490)
at org.apache.hadoop.mapreduce.Job$10.run(Job.java:1295)
at org.apache.hadoop.mapreduce.Job$10.run(Job.java:1292)
at java.security.AccessController.doPrivileged(Native Method)
at javax.security.auth.Subject.doAs(Subject.java:415)
at org.apache.hadoop.security.UserGroupInformation.doAs(UserGroupInformation.java:1642)
at org.apache.hadoop.mapreduce.Job.submit(Job.java:1292)
at org.apache.crunch.hadoop.mapreduce.lib.jobcontrol.CrunchControlledJob.submit(CrunchControlledJob.java:329)
at org.apache.crunch.hadoop.mapreduce.lib.jobcontrol.CrunchJobControl.startReadyJobs(CrunchJobControl.java:204)
at org.apache.crunch.hadoop.mapreduce.lib.jobcontrol.CrunchJobControl.pollJobStatusAndStartNewOnes(CrunchJobControl.java:238)
at org.apache.crunch.impl.mr.exec.MRExecutor.monitorLoop(MRExecutor.java:112)
at org.apache.crunch.impl.mr.exec.MRExecutor.access$000(MRExecutor.java:55)
at org.apache.crunch.impl.mr.exec.MRExecutor$1.run(MRExecutor.java:83)
at java.lang.Thread.run(Thread.java:745)
Caused by: java.lang.NullPointerException
at parquet.format.converter.ParquetMetadataConverter.fromParquetStatistics(ParquetMetadataConverter.java:248)
at parquet.format.converter.ParquetMetadataConverter.fromParquetMetadata(ParquetMetadataConverter.java:425)
at parquet.format.converter.ParquetMetadataConverter.readParquetMetadata(ParquetMetadataConverter.java:403)
at parquet.hadoop.ParquetFileReader.readFooter(ParquetFileReader.java:299)
at parquet.hadoop.ParquetFileReader$2.call(ParquetFileReader.java:183)
at parquet.hadoop.ParquetFileReader$2.call(ParquetFileReader.java:179)
at java.util.concurrent.FutureTask.run(FutureTask.java:262)
at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1145)
at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:615)
... 1 more
1 job failure(s) occurred:
org.kitesdk.tools.CopyTask: Kite(dataset:hdfs://quickstart.cloudera:8020/data/lz/quic... ID=1 (1/1)(1): java.io.IOException: Could not read footer: java.lang.NullPointerException
at parquet.hadoop.ParquetFileReader.readAllFootersInParallel(ParquetFileReader.java:193)
at parquet.hadoop.ParquetFileReader.readAllFootersInParallelUsingSummaryFiles(ParquetFileReader.java:148)
at parquet.hadoop.ParquetInputFormat.getFooters(ParquetInputFormat.java:597)
at parquet.hadoop.ParquetInputFormat.getFooters(ParquetInputFormat.java:573)
at parquet.hadoop.ParquetInputFormat.getSplits(ParquetInputFormat.java:412)
at org.kitesdk.data.spi.filesystem.FileSystemViewKeyInputFormat.getSplits(FileSystemViewKeyInputFormat.java:123)
at org.kitesdk.data.mapreduce.DatasetKeyInputFormat.getSplits(DatasetKeyInputFormat.java:254)
at org.apache.hadoop.mapreduce.JobSubmitter.writeNewSplits(JobSubmitter.java:589)
at org.apache.hadoop.mapreduce.JobSubmitter.writeSplits(JobSubmitter.java:606)
at org.apache.hadoop.mapreduce.JobSubmitter.submitJobInternal(JobSubmitter.java:490)
at org.apache.hadoop.mapreduce.Job$10.run(Job.java:1295)
at org.apache.hadoop.mapreduce.Job$10.run(Job.java:1292)
at java.security.AccessController.doPrivileged(Native Method)
at javax.security.auth.Subject.doAs(Subject.java:415)
at org.apache.hadoop.security.UserGroupInformation.doAs(UserGroupInformation.java:1642)
at org.apache.hadoop.mapreduce.Job.submit(Job.java:1292)
at org.apache.crunch.hadoop.mapreduce.lib.jobcontrol.CrunchControlledJob.submit(CrunchControlledJob.java:329)
at org.apache.crunch.hadoop.mapreduce.lib.jobcontrol.CrunchJobControl.startReadyJobs(CrunchJobControl.java:204)
at org.apache.crunch.hadoop.mapreduce.lib.jobcontrol.CrunchJobControl.pollJobStatusAndStartNewOnes(CrunchJobControl.java:238)
at org.apache.crunch.impl.mr.exec.MRExecutor.monitorLoop(MRExecutor.java:112)
at org.apache.crunch.impl.mr.exec.MRExecutor.access$000(MRExecutor.java:55)
at org.apache.crunch.impl.mr.exec.MRExecutor$1.run(MRExecutor.java:83)
at java.lang.Thread.run(Thread.java:745)
Caused by: java.lang.NullPointerException
at parquet.format.converter.ParquetMetadataConverter.fromParquetStatistics(ParquetMetadataConverter.java:248)
at parquet.format.converter.ParquetMetadataConverter.fromParquetMetadata(ParquetMetadataConverter.java:425)
at parquet.format.converter.ParquetMetadataConverter.readParquetMetadata(ParquetMetadataConverter.java:403)
at parquet.hadoop.ParquetFileReader.readFooter(ParquetFileReader.java:299)
at parquet.hadoop.ParquetFileReader$2.call(ParquetFileReader.java:183)
at parquet.hadoop.ParquetFileReader$2.call(ParquetFileReader.java:179)
at java.util.concurrent.FutureTask.run(FutureTask.java:262)
at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1145)
at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:615)
... 1 more