It's my first time using Hadoop and R. Here I'm trying to run a word count in R, but I always get the error below. I'm using Hadoop 2.7.1 on Mac OS X 10.9.5 with R 3.2.1.
Error: Could not find or load main class org.apache.hadoop.util.RunJar
Show Traceback
Rerun with Debug
Error in mr(map = map, reduce = reduce, combine = combine, vectorized.reduce, :
hadoop streaming failed with error code 1
---------------------------------------------------------------------------------------------------------------
## Point rmr2 at the local Hadoop install.
## NOTE: all three paths must use the exact on-disk casing ("Cellar", not
## "CELLAR") -- a mismatched HADOOP_CMD path is a common cause of
## "Could not find or load main class org.apache.hadoop.util.RunJar".
Sys.setenv("HADOOP_PREFIX" = "/usr/local/Cellar/hadoop/2.7.1")
Sys.setenv("HADOOP_CMD" = "/usr/local/Cellar/hadoop/2.7.1/bin/hadoop")
Sys.setenv("HADOOP_STREAMING" = "/usr/local/Cellar/hadoop/2.7.1/libexec/share/hadoop/tools/lib/hadoop-streaming-2.7.1.jar")
library(rmr2)
## Map: tokenize each input line and emit one (word, 1) pair per token.
## Splitting on '\\s+' (one or more whitespace characters) instead of a
## single '\\s' avoids emitting empty-string tokens when words are
## separated by runs of whitespace; any empties from leading whitespace
## are dropped explicitly with nzchar().
map <- function(k, lines) {
  words <- unlist(strsplit(lines, '\\s+'))
  words <- words[nzchar(words)]  # drop zero-length tokens
  keyval(words, 1)
}
## Reduce: collapse all counts seen for one word into a single total.
reduce <- function(word, counts) {
  total <- sum(counts)
  keyval(word, total)
}
## Run the streaming wordcount job over plain-text input on HDFS.
## `input`/`output` are HDFS paths; output may be NULL for a temp location.
wordcount <- function(input, output = NULL) {
  mapreduce(
    input = input,
    output = output,
    input.format = "text",
    map = map,
    reduce = reduce
  )
}
## Remove output from a previous run, if any (note: `fs -rm -r` needs the
## space between the flag and -r; adjust the hadoop path before enabling).
#system("/Users/hadoop/hadoop-1.1.2/bin/hadoop fs -rm -r hdfs://localhost:9000/data/wordcount/outcome")
## Build HDFS paths and submit the job
hdfs.root <- '/wordscount/data'
hdfs.data <- file.path(hdfs.root, 'README.text')  # input file on HDFS
hdfs.out  <- file.path(hdfs.root, 'res')          # job output directory
out <- wordcount(hdfs.data, hdfs.out)
## Pull the key/value results back from HDFS into local R memory
results <- from.dfs(out)
## Show the 30 most frequent words
results.df <- as.data.frame(results, stringsAsFactors = FALSE)
colnames(results.df) <- c('word', 'count')
ranked <- results.df[order(results.df$count, decreasing = TRUE), ]
head(ranked, 30)