R code: my CSV file contains 30 columns and 200,000 records. Please suggest code to build a GLM model using SparkR (it should be fast), and tell me which function to use to build that model. (A spark.glm sketch follows the error trace below.)
# Set Spark Home
Sys.setenv(SPARK_HOME="C:/spark/spark-2.0.0-bin-hadoop2.7")
# add SparkR's bundled R package to the library path
.libPaths(c(file.path(Sys.getenv("SPARK_HOME"), "R", "lib"), .libPaths()))
# point at the JDK (Java 7 support is deprecated as of Spark 2.0; Java 8 is recommended)
Sys.setenv(JAVA_HOME="C:/Program Files/Java/jdk1.7.0_71")
# load the SparkR library (rJava is not required by SparkR)
library(SparkR)
# start a SparkR session (the Spark 2.0 entry point; returns a SparkSession)
sc <- sparkR.session(enableHiveSupport = FALSE, master = "local[*]", appName = "SparkR-Modi",
                     sparkConfig = list(spark.sql.warehouse.dir = "file:///c:/tmp/spark-warehouse"))
# sparkRSQL.init() is deprecated in Spark 2.0; kept here as in the original run
sqlContext <- sparkRSQL.init(sc)
# load the CSV through the spark-csv package (Spark 1.x style; see the note below)
spdf <- read.df(sqlContext, "C:/Users/prasann/Desktop/V/bigdata11.csv",
                source = "com.databricks.spark.csv", header = "true")
showDF(spdf)
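Side note on the API: passing a sqlContext to read.df and using the com.databricks.spark.csv source is the Spark 1.x style; Spark 2.0 ships a built-in csv source, so the same load can be written without the context (this is unrelated to the error below). A minimal 2.0-style sketch using the same path as above; inferSchema is optional but convenient for modeling:

library(SparkR)
sparkR.session(master = "local[*]", appName = "SparkR-Modi")
# built-in csv source in Spark 2.0; inferSchema makes numeric columns numeric
spdf <- read.df("C:/Users/prasann/Desktop/V/bigdata11.csv",
                source = "csv", header = "true", inferSchema = "true")
printSchema(spdf)   # verify the column names and types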
ERROR:
> showDF(spdf)
Error in invokeJava(isStatic = FALSE, objId$id, methodName, ...) :
org.apache.spark.sql.AnalysisException: Unable to resolve No.Rcg_CurrMth given [Geo_Zone, Mbrshp_Tier, Vintagemths, FamilySize, Mth_Income, Employment, Gender, NetpackSMSdate, NP_SMSsent, NP_SMS_TIMEID, NP_SMS_hr, NP_SMS_Time, NP_SMS_Day, NP_OfferCode, No.Rcg_CurrMth, RcgAmt_CurrMth, Start_Balance, NO_OF_RCG_L1 , RcgAmt_L1, NO_OF_RCG_L2 , RcgAmt_L2, NO_OF_RCG_L3 , RcgAmt_L3, I_T_NO_INT2XL_L1 , I_T_NO_INT2XL_L2 , I_T_NO_INT2XL_L3 , I_T_NO_OTHER2XL_L1 , I_T_NO_OTHER2XL_L2 , CustomerFeedback, NP_OfferCurrentResponse];
at org.apache.spark.sql.catalyst.plans.logical.LogicalPlan$$anonfun$resolve$1$$anonfun$apply$5.apply(LogicalPlan.scala:134)
at org.apache.spark.sql.catalyst.plans.logical.LogicalPlan$$anonfun$resolve$1$$anonfun$apply$5.apply(LogicalPlan.scala:134)
at scala.Option.getOrElse(Option.scala:121)
at org.apache.spark.sql.catalyst.plans.logical.LogicalPlan$$anonfun$re
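The analysis exception above is most likely caused by the dot in the column name No.Rcg_CurrMth: Spark SQL interprets a dot as a struct-field accessor, so the analyzer cannot resolve the column even though it appears in the schema list (several names in that list also seem to carry trailing spaces, which is worth cleaning at the same time). A sketch of a workaround, renaming the offending columns right after the read, assuming spdf loaded as above:

# replace dots (and trim whitespace) in column names so Spark SQL can resolve them
for (old in colnames(spdf)) {
  clean <- gsub("\\.", "_", trimws(old))
  if (clean != old) spdf <- withColumnRenamed(spdf, old, clean)
}
showDF(spdf)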
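On the modeling question itself: SparkR 2.0 provides spark.glm() (plus a glm() wrapper with the usual R formula interface), which fits the model on the Spark DataFrame in parallel, so 200,000 rows by 30 columns should train quickly on local[*]. A sketch, assuming the cleaned spdf from above and assuming NP_OfferCurrentResponse is the target column; substitute your real response and predictors:

# fit a binomial GLM on the Spark DataFrame; formula syntax mirrors base R glm()
model <- spark.glm(spdf, NP_OfferCurrentResponse ~ Mth_Income + FamilySize + Gender,
                   family = "binomial")
summary(model)                 # coefficients and fit statistics
preds <- predict(model, spdf)  # returns a SparkDataFrame with a "prediction" column
head(select(preds, "prediction"))

For a continuous response, use family = "gaussian" (the default).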
Thanks
Prasann Modi