The code is:
# 1692082 total records (1.4 GB)
library(rmr2)
library(arules)
## Local apriori: finding frequent itemsets with support = 0.5
readRows <- function(file, sep = "\n", split = " ", ...) {
  # read the file line by line, then split each line into item IDs
  tt <- strsplit(
    scan(file, what = "character", sep = sep, ...),
    split = split)
  lapply(tt, as.numeric)   # one numeric vector of item IDs per transaction
}
webdoc <- readRows('/home/hduser/data/webdoc-10000.dat')
# Question: is this path on the local file system or in the HDFS user directory?
tr_webdoc <- as(webdoc, "transactions")
fItemL <- apriori(tr_webdoc, parameter=new("APparameter", support=0.5,target="frequent itemsets", maxlen=5))
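## (Optional check, not part of the original code: summary() and inspect() are
## standard arules accessors, so the local baseline can be examined before
## moving on to the MapReduce version.)
summary(fItemL)                          # number of itemsets found, size distribution
inspect(sort(fItemL, by = "support"))    # itemsets ordered by local support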
## Parallel apriori: finding frequent itemsets with support = 0.3 in the mappers
Reason: the mappers compute only the "local" support counts of frequent itemsets
on their splits, and the reducer then accumulates these counts per itemset. An
itemset with global support 0.5 will not necessarily reach local support 0.5 on
every data node, so the mappers use the lower threshold 0.3 to collect candidate
itemsets and their partial counts. After the MapReduce job, the globally frequent
itemsets are obtained by eliminating every itemset whose accumulated support is
below the higher threshold 0.5 (see below).
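A small numeric illustration (hypothetical counts, not taken from the webdocs data)
shows why the mapper threshold has to be lower than the global one:
N1 <- 500000; N2 <- 500000     # records seen by two mappers (hypothetical)
c1 <- 320000; c2 <- 180000     # partial counts of one itemset (hypothetical)
c1 / N1                        # local support 0.64 -> kept at threshold 0.3
c2 / N2                        # local support 0.36 -> kept at 0.3, lost at 0.5
(c1 + c2) / (N1 + N2)          # global support 0.50, only recoverable if both
                               # partial counts survive the mapper threshold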
papriori =
  function(
    input,
    output = NULL,
    pattern = " ",
    support = 0.3,
    maxlen = 5   # this is important!
  ) {
  ## papriori-map
  pa.map =
    function(., lines) {
      LL <- length(lines)
      if (LL > 5000) {
        # mine the locally frequent itemsets of this split
        fItems <- apriori(
          as(lapply(strsplit(x = lines, split = pattern), unique),
             "transactions"),
          parameter = new("APparameter",
                          support = support,
                          target = "frequent itemsets",
                          maxlen = maxlen))
        recNum <- fItems@info$ntransactions[1]     # transactions in this split
        # key: itemset (vector of item labels); value: local support count
        keyval(as(items(fItems), "list"),
               fItems@quality$support * recNum)
      } else {
        keyval(list("-1"), LL)   # number of records skipped (split too small)
      }
    }
  ## papriori-reduce
  pa.reduce =
    function(word, counts) {
      keyval(word, sum(counts))   # accumulate the local support counts per itemset
    }
  ## papriori-mapreduce
  mapreduce(
    input = input,
    output = output,
    input.format = "text",
    map = pa.map,
    reduce = pa.reduce,
    combine = TRUE)
}
rmr.options(backend = "hadoop")
rmr.options(keyval.length = 10000)
# Question: is this the right way to set keyval.length?
out.hadoop = from.dfs(papriori("/user/hduser/webdoc", pattern = " +"))
# Question: should this path point to local or HDFS data? With a local path the
# command fails with "path not found"; with the HDFS path it gives the following error.
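For reference, the elimination step mentioned above ("see below") could look roughly
like this once the job runs. This is only a sketch, not part of the original post: it
assumes from.dfs() returns the keys as a list of item-label vectors and the values as
a numeric vector of summed counts, and it takes the total of 1692082 records from the
comment at the top.
k <- keys(out.hadoop)                            # itemsets (and the "-1" marker)
v <- values(out.hadoop)                          # summed local support counts
lbl     <- sapply(k, paste, collapse = " ")      # "-1" marks skipped small splits
skipped <- sum(v[lbl == "-1"])                   # records never mined by a mapper
total   <- 1692082 - skipped                     # records actually mined
globalFreq <- data.frame(itemset = lbl[lbl != "-1"],
                         support = v[lbl != "-1"] / total,
                         stringsAsFactors = FALSE)
globalFreq <- globalFreq[globalFreq$support >= 0.5, ]   # keep only global support >= 0.5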