Not able to do batch data ingestion


Akash Deep Verma

May 22, 2018, 2:01:24 PM
to Druid User
Hi,

I'm submitting a Hadoop batch indexing job. The task definition is as follows:
{
  "task": "index_hadoop_package_2018-05-22T17:49:47.440Z",
  "payload": {
    "id": "index_hadoop_package_2018-05-22T17:49:47.440Z",
    "spec": {
      "dataSchema": {
        "dataSource": "package",
        "parser": {
          "type": "hadoopyString",
          "parseSpec": {
            "format": "json",
            "timestampSpec": {
              "column": "ud",
              "format": "auto"
            },
            "dimensionsSpec": {
              "dimensions": [
                "wbn"
              ],
              "dimensionExclusions": [],
              "spatialDimensions": []
            }
          }
        },
        "metricsSpec": [
          {
            "type": "count",
            "name": "count"
          },
          {
            "type": "doubleSum",
            "name": "added",
            "fieldName": "added",
            "expression": null
          },
          {
            "type": "doubleSum",
            "name": "deleted",
            "fieldName": "deleted",
            "expression": null
          },
          {
            "type": "doubleSum",
            "name": "delta",
            "fieldName": "delta",
            "expression": null
          }
        ],
        "granularitySpec": {
          "type": "uniform",
          "segmentGranularity": "DAY",
          "queryGranularity": {
            "type": "none"
          },
          "rollup": true,
          "intervals": [
            "2018-03-24T00:00:00.000Z/2018-03-25T00:00:00.000Z"
          ]
        }
      },
      "ioConfig": {
        "type": "hadoop",
        "inputSpec": {
          "type": "static",
          "paths": "s3n://prod-integration-s3-package-ad-json/ad=2018-03-24-04/Integrate-Package.info+10+0085289945.json"
        },
        "metadataUpdateSpec": null,
        "segmentOutputPath": null
      },
      "tuningConfig": {
        "type": "hadoop",
        "workingPath": null,
        "version": "2018-05-22T17:49:47.440Z",
        "partitionsSpec": {
          "type": "hashed",
          "targetPartitionSize": -1,
          "maxPartitionSize": -1,
          "assumeGrouped": false,
          "numShards": -1,
          "partitionDimensions": []
        },
        "shardSpecs": {},
        "indexSpec": {
          "bitmap": {
            "type": "concise"
          },
          "dimensionCompression": "lz4",
          "metricCompression": "lz4",
          "longEncoding": "longs"
        },
        "maxRowsInMemory": 75000,
        "leaveIntermediate": false,
        "cleanupOnFailure": true,
        "overwriteFiles": false,
        "ignoreInvalidRows": false,
        "jobProperties": {
          "mapreduce.job.classloader": "true",
          "mapreduce.job.classloader.system.classes": "-javax.validation.,java.,javax.,org.apache.commons.logging.,org.apache.log4j.,org.apache.hadoop.",
          "fs.s3.awsAccessKeyId": "<ACCESS_KEY>",
          "fs.s3.awsSecretAccessKey": "<SECRET_KEY>",
          "fs.s3n.awsAccessKeyId": "<ACCESS_KEY>",
          "fs.s3n.awsSecretAccessKey": "<SECRET_KEY>",
          "fs.s3.impl": "org.apache.hadoop.fs.s3native.NativeS3FileSystem",
          "fs.s3n.impl": "org.apache.hadoop.fs.s3native.NativeS3FileSystem",
          "io.compression.codecs": "org.apache.hadoop.io.compress.GzipCodec,org.apache.hadoop.io.compress.DefaultCodec,org.apache.hadoop.io.compress.BZip2Codec,org.apache.hadoop.io.compress.SnappyCodec"
        },
        "combineText": false,
        "useCombiner": false,
        "buildV9Directly": true,
        "numBackgroundPersistThreads": 0,
        "forceExtendableShardSpecs": false,
        "useExplicitVersion": false,
        "allowedHadoopPrefix": []
      },
      "uniqueId": "da4ff84e675b45f7908e6737a02435a7"
    },
    "hadoopDependencyCoordinates": [
      "org.apache.hadoop:hadoop-client:2.7.3",
      "org.apache.hadoop:hadoop-aws:2.7.3"
    ],
    "classpathPrefix": null,
    "context": null,
    "groupId": "index_hadoop_package_2018-05-22T17:49:47.440Z",
    "dataSource": "package",
    "resource": {
      "availabilityGroup": "index_hadoop_package_2018-05-22T17:49:47.440Z",
      "requiredCapacity": 1
    }
  }
}


but I'm still getting an error, even though I followed the doc http://druid.io/docs/latest/operations/other-hadoop.html and fixed several issues along the way. I'm now stuck here:

2018-05-22T17:01:37,603 INFO [task-runner-0-priority-0] org.apache.hadoop.mapreduce.Job -  map 0% reduce 0%
2018-05-22T17:01:55,151 INFO [task-runner-0-priority-0] org.apache.hadoop.mapreduce.Job - Task Id : attempt_1527007990232_0001_m_000000_0, Status : FAILED
Error: org.apache.hadoop.fs.s3.S3Exception: org.jets3t.service.ServiceException: Request Error: java.lang.ClassCastException: org.jets3t.service.utils.RestUtils$ConnManagerFactory cannot be cast to org.apache.http.conn.ClientConnectionManagerFactory
	at org.apache.hadoop.fs.s3native.Jets3tNativeFileSystemStore.processException(Jets3tNativeFileSystemStore.java:478)
	at org.apache.hadoop.fs.s3native.Jets3tNativeFileSystemStore.processException(Jets3tNativeFileSystemStore.java:427)
	at org.apache.hadoop.fs.s3native.Jets3tNativeFileSystemStore.handleException(Jets3tNativeFileSystemStore.java:411)
	at org.apache.hadoop.fs.s3native.Jets3tNativeFileSystemStore.retrieveMetadata(Jets3tNativeFileSystemStore.java:181)
	at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
	at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
	at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
	at java.lang.reflect.Method.invoke(Method.java:498)
	at org.apache.hadoop.io.retry.RetryInvocationHandler.invokeMethod(RetryInvocationHandler.java:191)
	at org.apache.hadoop.io.retry.RetryInvocationHandler.invoke(RetryInvocationHandler.java:102)
	at org.apache.hadoop.fs.s3native.$Proxy31.retrieveMetadata(Unknown Source)
	at org.apache.hadoop.fs.s3native.NativeS3FileSystem.getFileStatus(NativeS3FileSystem.java:477)
	at org.apache.hadoop.fs.s3native.NativeS3FileSystem.open(NativeS3FileSystem.java:625)
	at org.apache.hadoop.fs.FileSystem.open(FileSystem.java:773)
	at org.apache.hadoop.mapreduce.lib.input.LineRecordReader.initialize(LineRecordReader.java:85)
	at org.apache.hadoop.mapreduce.lib.input.DelegatingRecordReader.initialize(DelegatingRecordReader.java:84)
	at org.apache.hadoop.mapred.MapTask$NewTrackingRecordReader.initialize(MapTask.java:557)
	at org.apache.hadoop.mapred.MapTask.runNewMapper(MapTask.java:795)
	at org.apache.hadoop.mapred.MapTask.run(MapTask.java:342)
	at org.apache.hadoop.mapred.YarnChild$2.run(YarnChild.java:164)
	at java.security.AccessController.doPrivileged(Native Method)
	at javax.security.auth.Subject.doAs(Subject.java:422)
	at org.apache.hadoop.security.UserGroupInformation.doAs(UserGroupInformation.java:1698)
	at org.apache.hadoop.mapred.YarnChild.main(YarnChild.java:158)
Caused by: org.jets3t.service.ServiceException: Request Error: java.lang.ClassCastException: org.jets3t.service.utils.RestUtils$ConnManagerFactory cannot be cast to org.apache.http.conn.ClientConnectionManagerFactory
	at org.jets3t.service.impl.rest.httpclient.RestStorageService.performRequest(RestStorageService.java:574)
	at org.jets3t.service.impl.rest.httpclient.RestStorageService.performRequest(RestStorageService.java:281)
	at org.jets3t.service.impl.rest.httpclient.RestStorageService.performRestHead(RestStorageService.java:942)
	at org.jets3t.service.impl.rest.httpclient.RestStorageService.getObjectImpl(RestStorageService.java:2148)
	at org.jets3t.service.impl.rest.httpclient.RestStorageService.getObjectDetailsImpl(RestStorageService.java:2075)
	at org.jets3t.service.StorageService.getObjectDetails(StorageService.java:1093)
	at org.jets3t.service.StorageService.getObjectDetails(StorageService.java:548)
	at org.apache.hadoop.fs.s3native.Jets3tNativeFileSystemStore.retrieveMetadata(Jets3tNativeFileSystemStore.java:174)
	... 20 more
Caused by: java.lang.ClassCastException: org.jets3t.service.utils.RestUtils$ConnManagerFactory cannot be cast to org.apache.http.conn.ClientConnectionManagerFactory
	at org.apache.http.impl.client.AbstractHttpClient.createClientConnectionManager(AbstractHttpClient.java:284)
	at org.apache.http.impl.client.AbstractHttpClient.getConnectionManager(AbstractHttpClient.java:437)
	at org.apache.http.impl.client.AbstractHttpClient.createHttpContext(AbstractHttpClient.java:246)
	at org.apache.http.impl.client.AbstractHttpClient.doExecute(AbstractHttpClient.java:771)
	at org.apache.http.impl.client.CloseableHttpClient.execute(CloseableHttpClient.java:83)
	at org.apache.http.impl.client.CloseableHttpClient.execute(CloseableHttpClient.java:56)
	at org.jets3t.service.impl.rest.httpclient.RestStorageService.performRequest(RestStorageService.java:334)
	... 27 more

Container killed by the ApplicationMaster.
Container killed on request. Exit code is 143
Container exited with a non-zero exit code 143

Akash Deep Verma

May 23, 2018, 3:26:15 AM
to Druid User
I have solved it by upgrading EMR. It's working fine now.

Akash Deep Verma

May 23, 2018, 1:55:31 PM
to Druid User
I'm still getting the same error.
It turns out the problem was not actually resolved earlier: I had copied the wrong Hadoop XML files into the Druid conf directory, which caused the batch ingestion tasks to be run by the MiddleManager node itself instead of being pushed to EMR.
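In case it helps someone else: whether the MapReduce job goes to the remote cluster or runs locally inside the indexing task is driven by the Hadoop client configs on Druid's classpath (core-site.xml, hdfs-site.xml, yarn-site.xml and mapred-site.xml from the EMR master). As a rough sketch of what those files need to provide (the EMR master hostname below is a placeholder, same style as the redacted keys above), the equivalent jobProperties would be:

"jobProperties": {
  "fs.defaultFS": "hdfs://<EMR_MASTER_DNS>:8020",
  "mapreduce.framework.name": "yarn",
  "yarn.resourcemanager.hostname": "<EMR_MASTER_DNS>"
}

Without mapreduce.framework.name set to "yarn" it defaults to "local", which is exactly the symptom I saw: the MiddleManager runs the job itself.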

Alexander Ryazanov

Sep 14, 2018, 5:56:30 AM
to Druid User
I was lucky and fixed it by setting `mapreduce.job.user.classpath.first = true` instead of `mapreduce.job.classloader = true`. Worth trying.
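Applied to the task spec above, that would mean swapping the two classloader lines in jobProperties for something like this (a sketch; the other keys stay unchanged):

"jobProperties": {
  "mapreduce.job.user.classpath.first": "true",
  "fs.s3n.awsAccessKeyId": "<ACCESS_KEY>",
  ...
}

As far as I understand it, the ClassCastException is a classic classpath conflict: jets3t gets loaded against a different version of httpclient than the one it was built with, and both `mapreduce.job.classloader` and `mapreduce.job.user.classpath.first` are ways of controlling which copy of the jars wins.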