library("rmr2")
## Settings.
n <- 200 # n sets the size of the mapper input (number of distinct input keys).
w.size <- 10000 # w.size sets how many words are drawn, i.e. roughly the number of reducer keys.
## w.size <- 5000 # No error occurs.
## Build the vocabulary: w.size random lowercase words of 4 to 10 letters each,
## with repeated words removed (so the result may hold fewer than w.size words).
word.set <- unique(vapply(
  seq_len(w.size),
  function(i) {
    ## Draw the word length first, then that many letters (with replacement).
    word.length <- sample(4:10, 1)
    paste0(sample(letters, word.length, replace = TRUE), collapse = "")
  },
  character(1)
))
## Keys "1" .. "n", each repeated 500 times in a row (one per value below).
key <- as.character(rep(seq_len(n), each = 500))
## For every key, draw 500 counts (mostly zero) and name them with 500
## distinct words sampled from the vocabulary; then flatten into one vector.
val <- unlist(lapply(seq_len(n), function(i) {
  counts <- sample(c(0, 0, 0, 0, 0, 1, 2), 500, replace = TRUE)
  names(counts) <- sample(word.set, 500)
  counts
}))
## Pair the keys with the named counts; this is the input handed to to.dfs().
test <- keyval(key, val)
## Word-count job used by this script.
##
## Args:
##   input: forwarded unchanged as the `input` argument of mapreduce();
##     the script passes `to.dfs(test)`.
## Returns:
##   The value returned by mapreduce(); the script reads it with from.dfs().
mp <- function(input) {
  ## Map step: for every distinct name found on `word.count`, emit the pair
  ## (name, number of entries carrying that name). The incoming keys and the
  ## values of `word.count` are not used, only its names.
  mapper <- function(keys, word.count) {
    words <- names(word.count)
    word.set <- sort(unique(words))
    if (length(word.set) == 0) {
      ## Nothing to count: emit an empty key-value pair.
      return(keyval(NULL, NULL))
    }
    ## Count all words in one vectorised pass: match() maps each entry to its
    ## position in the sorted vocabulary and tabulate() counts per position.
    ## Entries with a missing name match nothing and are ignored.
    doc.count <- tabulate(match(words, word.set), nbins = length(word.set))
    keyval(word.set, doc.count)
  }
  ## Reduce step (also used as the combiner, see `combine = TRUE` below):
  ## sum the partial counts of one term. Fails fast if it is handed more than
  ## one key at a time or a non-finite count.
  reducer <- function(term, freq) {
    stopifnot(length(term) == 1, all(is.finite(freq)))
    keyval(term, sum(freq))
  }
  mapreduce(input = input,
            map = mapper, reduce = reducer, combine = TRUE)
}
## Hadoop backend: run the job and check the output keys for duplicates.
rmr.options(backend = "hadoop")
DTDF <- mp(input = to.dfs(test))
anyDuplicated(from.dfs(DTDF)[["key"]])
## Expected result: [1] 0
## Actual result:   [1] <non-zero integer>
## Local backend: run the same job and check the output keys for duplicates.
rmr.options(backend = "local")
DTDF <- mp(input = to.dfs(test))
anyDuplicated(from.dfs(DTDF)[["key"]])
## Expected result: [1] 0
## Actual result:   [1] 0
--
post: rha...@googlegroups.com ||
unsubscribe: rhadoop+u...@googlegroups.com ||
web: https://groups.google.com/d/forum/rhadoop?hl=en-US
---
You received this message because you are subscribed to the Google Groups "RHadoop" group.
To unsubscribe from this group and stop receiving emails from it, send an email to rhadoop+u...@googlegroups.com.
For more options, visit https://groups.google.com/groups/opt_out.
library(rmr2)
## NOTE(review): test.RData presumably supplies `test`; `mp` must be defined
## before this runs - confirm.
load("test.RData")
rmr.options(keyval.length = 3)
## Pull the output keys, then print the index of the first duplicated key
## (0 when every key is unique).
result <- from.dfs(mp(to.dfs(test)))$key
anyDuplicated(result)
[1] 50
[1] 0
library(rmr2)
## NOTE(review): test.RData presumably supplies `test`; `mp` must be defined
## before this runs - confirm.
load("test.RData")
rmr.options(keyval.length = 3)
## Pull the output keys, then print the index of the first duplicated key
## (0 when every key is unique).
result <- from.dfs(mp(to.dfs(test)))$key
anyDuplicated(result)
I tried this on several machines running different operating systems.
Machine 1
OS: Mac OS X 10.8.4 64-bit
R version 2.15.3 (2013-03-01)
Platform: x86_64-apple-darwin9.8.0/x86_64 (64-bit)
Hadoop version: Apache 1.1.2
Machine 2
OS: Ubuntu Saucy Salamander (development branch) 32-bit
R version 3.0.0 (2013-04-03)
Platform: i686-pc-linux-gnu (32-bit)
Hadoop version: Apache 1.0.4
--
R version 3.0.1 (2013-05-16)