I have started writing some support for reading ORC files in scoobi, mostly by copying and modifying the sequence input. It is not really polished yet, but would you be interested in adding it to scoobi?
Joe and I also worked on a macro-based generator of case classes from the ORC schema, which makes it simple to get a nice API for ORC files.
If you are interested I can try to clean it up and send you a pull request.
import scala.annotation.StaticAnnotation
import scala.language.experimental.macros
import scala.reflect.macros.Context
import scala.collection.JavaConversions._
import org.apache.hadoop.hive.serde2.objectinspector._
import org.apache.hadoop.hive.serde2.typeinfo.TypeInfoUtils
import org.apache.hadoop.hive.ql.io.orc.OrcStruct
/**
 * Macro annotation carrying an ORC schema string.
 *
 * The annotated definition is handed to `OrcGenerator.impl`, which reads the
 * schema out of the annotation application and aborts compilation unless the
 * argument is a string literal.
 *
 * @param schema the schema string; it is parsed by the macro implementation
 *               with `TypeInfoUtils.getTypeInfoFromTypeString`
 */
class OrcGenerator(schema: String) extends StaticAnnotation {
// Delegates the expansion of the annotated definitions to the macro implementation.
def macroTransform(annottees: Any*): Any = macro OrcGenerator.impl
}
/**
 * Holds the macro implementation used by the `OrcGenerator` annotation.
 */
object OrcGenerator {

  /**
   * Expands a body-less `case class Name(orc: OrcStruct)` annotated with
   * `@OrcGenerator("<schema>")` into:
   *
   *  - an object `Name` exposing `schema`, `ti`, `oi` and, for every field of
   *    the schema, a `def <field>` returning `oi.getStructFieldRef("<field>")`;
   *  - a case class `Name(orc: OrcStruct)` with one typed accessor per field.
   *
   * Supported field types are `date`, `timestamp`, `decimal...`, `int`,
   * `bigint` and `string`. Every problem detected during expansion (schema
   * that is not a string literal, schema that cannot be parsed, schema that is
   * not a struct, unsupported field type, wrong annottee shape) is reported
   * through `c.abort`, so the user gets a compile error rather than an
   * exception thrown from inside the compiler.
   *
   * @param c         the macro context
   * @param annottees the definitions the annotation was applied to
   * @return the tree containing the generated object and case class
   */
  def impl(c: Context)(annottees: c.Expr[Any]*): c.Expr[Any] = {
    import c.universe._

    // The annotation argument is recovered from the tree of the macro
    // application itself, which is why it has to be a literal.
    val schema = c.macroApplication match {
      case Apply(Select(Apply(_, List(Literal(Constant(s: String)))), _), _) => s
      case _ => c.abort(c.enclosingPosition, s"Schema must be a string literal and not ${showRaw(c.macroApplication)}")
    }

    // Parse the schema while expanding; any parse failure becomes a compile error.
    val ti =
      try TypeInfoUtils.getTypeInfoFromTypeString(schema)
      catch {
        case scala.util.control.NonFatal(e) =>
          c.abort(c.enclosingPosition, s"Could not parse ORC schema '$schema': ${e.getMessage}")
      }

    // Only struct schemas have named fields to generate accessors for.
    val oi = OrcStruct.createObjectInspector(ti) match {
      case struct: StructObjectInspector => struct
      case other =>
        c.abort(c.enclosingPosition, s"ORC schema must describe a struct, but found type '${other.getTypeName}'")
    }
    val fields = oi.getAllStructFieldRefs

    annottees.map(_.tree) match {
      case List(q"case class $className(orc: OrcStruct) { ..$body }") if body.isEmpty =>
        val termName = className.toTermName

        // Companion members: `oi` inside the quasiquote refers to the `oi` of
        // the generated object, not to the expansion-time value above.
        val companionDefs = fields.map { f =>
          val name = f.getFieldName
          q"""def ${newTermName(name)} = oi.getStructFieldRef($name)"""
        }.toList

        // One typed accessor per schema field on the generated case class.
        val caseClassDefs = fields.map { f =>
          val name = f.getFieldName
          val fieldInspector = f.getFieldObjectInspector
          val orcType = fieldInspector.getTypeName
          // The inspector's simple class name with a lower-cased first letter
          // is used as the member name on PrimitiveObjectInspectorFactory.
          val inspector = fieldInspector.getClass.getSimpleName
          val inspectorFactory = inspector(0).toLower + inspector.substring(1)
          val accessor =
            if (inspectorFactory == "writableStringObjectInspector" || inspectorFactory.toLowerCase.contains("java"))
              "getPrimitiveJavaObject"
            else
              "get"
          val defbody =
            q"""org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory.${newTermName(inspectorFactory)}.${newTermName(accessor)}(
              $termName.oi.getStructFieldData(orc, $termName.${newTermName(name)}))"""
          val (scalaType, converter) = orcType match {
            case "date" => (tq"org.joda.time.LocalDate", q"org.joda.time.LocalDate.fromDateFields($defbody)")
            case "timestamp" => (tq"org.joda.time.DateTime", q"new org.joda.time.DateTime($defbody)")
            case s if s.startsWith("decimal") => (tq"BigDecimal", q"BigDecimal($defbody.bigDecimalValue())")
            case "int" => (tq"Int", defbody)
            case "bigint" => (tq"Long", defbody)
            case "string" => (tq"String", defbody)
            // Previously a MatchError escaped from the macro here.
            case other =>
              c.abort(c.enclosingPosition, s"Unsupported ORC type '$other' for field '$name'")
          }
          q"""def ${newTermName(name)} : $scalaType = $converter"""
        }.toList

        c.Expr[Any](
          q"""
            object $termName {
              val schema = $schema
              val ti = org.apache.hadoop.hive.serde2.typeinfo.TypeInfoUtils.getTypeInfoFromTypeString(schema)
              val oi = org.apache.hadoop.hive.ql.io.orc.OrcStruct.createObjectInspector(ti).asInstanceOf[org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector]
              ..$companionDefs
            }
            case class ${className.toTypeName}(orc: org.apache.hadoop.hive.ql.io.orc.OrcStruct) {
              ..$caseClassDefs
            }
          """)

      case expr =>
        c.abort(c.enclosingPosition, s"This annotation should be used on an case class with no body instead of ${expr}.")
    }
  }
}