I just committed a change to the dkpro-bigdata git repository. You will need to check it out and
compile it, or add the UKP snapshot Maven repository to your Maven configuration.
With this, you should be able to use the example given below (see the examples project in the repository).
The class defines a custom DocumentTextExtractor and DocumentMetadataExtractor for Text2CASInputFormat. Here you can plug in
your code that parses the tweet JSON; the prerequisite is that each record is on a separate line.
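Since the example below actually parses tab-separated values rather than JSON, here is a rough, untested sketch of what a JSON-based text extractor could look like. It assumes the Jackson library is on the classpath and that each tweet record has a top-level "text" field; the class name is just for illustration (same imports as in the example below):

public static class JsonTweetTextExtractor implements DocumentTextExtractor {
    @Override
    public Text extractDocumentText(Text key, Text value) {
        try {
            // one JSON object per line; pull the tweet text out of the "text" field (Jackson, assumed available)
            com.fasterxml.jackson.databind.JsonNode tweet =
                    new com.fasterxml.jackson.databind.ObjectMapper().readTree(value.toString());
            return new Text(tweet.path("text").asText());
        } catch (java.io.IOException e) {
            throw new RuntimeException("Could not parse tweet JSON", e);
        }
    }
}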
The example runs a simple pipeline on the data and stores the results as XMI CAS in a SequenceFile.
These files can then be read and processed again by dkpro-bigdata directly.
For small documents such as tweets, the XMI format is more efficient because it does not
need to store a type system with each document.
Disclaimer: I didn't really test the code below, use at your own risk :-)
Best,
-hp
import static org.apache.uima.fit.factory.AnalysisEngineFactory.createEngineDescription;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.util.ToolRunner;
import org.apache.uima.analysis_engine.AnalysisEngineDescription;
import org.apache.uima.resource.ResourceInitializationException;
import de.tudarmstadt.ukp.dkpro.bigdata.hadoop.DkproHadoopDriver;
import de.tudarmstadt.ukp.dkpro.bigdata.hadoop.DkproMapper;
import de.tudarmstadt.ukp.dkpro.bigdata.hadoop.DkproReducer;
import de.tudarmstadt.ukp.dkpro.bigdata.io.hadoop.CASWritable;
import de.tudarmstadt.ukp.dkpro.bigdata.io.hadoop.Text2CASInputFormat;
import de.tudarmstadt.ukp.dkpro.bigdata.io.hadoop.Text2CASInputFormat.DocumentMetadataExtractor;
import de.tudarmstadt.ukp.dkpro.bigdata.io.hadoop.Text2CASInputFormat.DocumentTextExtractor;
import de.tudarmstadt.ukp.dkpro.core.api.metadata.type.DocumentMetaData;
import de.tudarmstadt.ukp.dkpro.core.snowball.SnowballStemmer;
import de.tudarmstadt.ukp.dkpro.core.tokit.BreakIteratorSegmenter;
public class Tweet2CASExample extends DkproHadoopDriver {
// This extractor assumes the tweet data as tab-separated values: timestamp\tuser\ttweet
public static class TweetTextExtractor implements DocumentTextExtractor,
DocumentMetadataExtractor {
@Override
public void extractDocumentMetaData(Text key, Text value,
DocumentMetaData metadata) {
String[] values = value.toString().split("\t");
// set the document id to user%timestamp (values[1] is the user, values[0] the timestamp)
metadata.setDocumentId(values[1] + "%" + values[0]);
}
@Override
public Text extractDocumentText(Text key, Text value) {
// Put your JSON parser here if the tweets are stored as JSON;
// for the tab-separated format above, the tweet text is the third field.
String[] values = value.toString().split("\t");
return new Text(values[2]);
}
}
public AnalysisEngineDescription buildMapperEngine(Configuration job)
throws ResourceInitializationException {
AnalysisEngineDescription tokenizer = createEngineDescription(BreakIteratorSegmenter.class);
AnalysisEngineDescription stemmer = createEngineDescription(
SnowballStemmer.class, SnowballStemmer.PARAM_LANGUAGE, "en");
return createEngineDescription(tokenizer, stemmer);
}
public static void main(String[] args) throws Exception {
Tweet2CASExample pipeline = new Tweet2CASExample();
pipeline.setMapperClass(DkproMapper.class);
pipeline.setReducerClass(DkproReducer.class);
ToolRunner.run(new Configuration(), pipeline, args);
}
@Override
public void configure(JobConf job) {
/*
* Use custom extractors
*/
job.set("dkpro.uima.text2casinputformat.documentmetadataextractor",TweetTextExtractor.class.getCanonicalName());
job.set("dkpro.uima.text2casinputformat.documenttextextractor",TweetTextExtractor.class.getCanonicalName());
/*
* Tweets are very small documents; the default BinCasWithTypeSystem output is very inefficient for
* this kind of data, so we use XMI serialization instead.
*/
job.setOutputValueClass(CASWritable.class);
/*
* Use Text2CASInputFormat to read texts directly from HDFS
*/
job.setInputFormat(Text2CASInputFormat.class);
}
@Override
public Class getInputFormatClass() {
return Text2CASInputFormat.class;
}
}
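To inspect the resulting SequenceFile outside of a dkpro-bigdata job, something along these lines should work (again untested; the output path is only a placeholder, and I am assuming CASWritable has a no-arg constructor and a getCAS() accessor):

// Needs org.apache.hadoop.fs.FileSystem, org.apache.hadoop.fs.Path and org.apache.hadoop.io.SequenceFile in addition to the imports above.
Configuration conf = new Configuration();
Path path = new Path("output/part-00000"); // placeholder, adjust to your output location
SequenceFile.Reader reader = new SequenceFile.Reader(FileSystem.get(conf), path, conf);
Text key = new Text();
CASWritable value = new CASWritable(); // assumes a public no-arg constructor
while (reader.next(key, value)) {
    System.out.println(key); // the document id set by the metadata extractor
    // value.getCAS() should give you the deserialized CAS for further processing (assumption)
}
reader.close();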