Hi Guys,
I’m trying to update an external HIVE dataset's schema but I’m having problems with schema evolution. I’m using the Sqoop to generate the Avro Schema and use this to initially create the dataset, then I update the source database table and extract the schema again. Finally I want to merge the schemas and update the dataset with the new repo but I get an incompatible schema error?
The inbound data will be from Sqoop but I can't directly write as Parquet since I need to use the direct modes to get the best extract performance.
def merge(source_schema: Schema) = {
val target_descriptor = dataset.getDescriptor
val target_schema = target_descriptor.getSchema
log.info(source_schema.toString(true))
if (!repo.exists(database, name)) log.error("Dataset %s not found".format(dataset_path))
else {
if (source_schema == target_schema) {
log.info("No change in schemas detected.")
}
else {
val updated_descriptor: DatasetDescriptor = new DatasetDescriptor.Builder(target_descriptor)
.schema(source_schema)
.build()
//Datasets.update(dataset.getUri, updated_descriptor)
repo.update(database, name, updated_descriptor)
}
}
}
Exception in thread "main" org.kitesdk.data.IncompatibleSchemaException: Schema cannot read data written using existing schema. Schema: {
"type" : "record",
"name" : "sqoop_import_categories",
"doc" : "Sqoop import of categories",
"fields" : [ {
"name" : "category_id",
"type" : [ "int", "null" ],
"columnName" : "category_id",
"sqlType" : "4"
}, {
"name" : "category_department_id",
"type" : [ "int", "null" ],
"columnName" : "category_department_id",
"sqlType" : "4"
}, {
"name" : "category_name",
"type" : [ "string", "null" ],
"columnName" : "category_name",
"sqlType" : "12"
}, {
"name" : "my_test_col",
"type" : [ "int", "null" ],
"columnName" : "my_test_col",
"sqlType" : "4"
}, {
"name" : "my_test_col2",
"type" : [ "int", "null" ],
"columnName" : "my_test_col2",
"sqlType" : "4"
}, {
"name" : "my_test_col3",
"type" : [ "int", "null" ],
"columnName" : "my_test_col3",
"sqlType" : "4"
} ],
"tableName" : "categories"
}
Existing schema: {
"type" : "record",
"name" : "sqoop_import_categories",
"doc" : "Sqoop import of categories",
"fields" : [ {
"name" : "category_id",
"type" : [ "int", "null" ],
"columnName" : "category_id",
"sqlType" : "4"
}, {
"name" : "category_department_id",
"type" : [ "int", "null" ],
"columnName" : "category_department_id",
"sqlType" : "4"
}, {
"name" : "category_name",
"type" : [ "string", "null" ],
"columnName" : "category_name",
"sqlType" : "12"
}, {
"name" : "my_test_col",
"type" : [ "int", "null" ],
"columnName" : "my_test_col",
"sqlType" : "4"
}, {
"name" : "my_test_col2",
"type" : [ "int", "null" ],
"columnName" : "my_test_col2",
"sqlType" : "4"
} ],
"tableName" : "categories"
}
Thanks
Andrew