Hi
Below is config.json -
{
"engine_config": "./engine.json",
"splitting": {
"version": "1",
"source_file": "hdfs://x.x.x.x:9000/cv-exp001/data",
"train_file": "hdfs://x.x.x.x:9000/cv-exp001/train",
"test_file": "hdfs://x.x.x.x:9000/cv-exp001/test",
"type": "date",
"train_ratio": 0.8,
"random_seed": 29750,
"split_event": "eventTime"
},
"reporting": {
"file": "./report.xlsx"
},
"testing": {
"map_k": 10,
"non_zero_users_file": "./non_zero_users.dat",
"consider_non_zero_scores_only": true,
"custom_combos": {
"event_groups": [["purchased","viewed"]]
}
},
"spark": {
"master": "spark://x.x.x.x:7077"
}
}
I have following setup -
1) All files of the analysis tool are present in the universal recommender folder, including config.json
2) I have remote spark cluster, remote hbase and separate VM where pio is installed
3) Running this command from the pio VM -
SPARK_HOME=/usr/local/spark PYTHONPATH=/usr/local/spark/python:/usr/local/spark/python/lib/py4j-0.9-src.zip ./map_test.py split
4) The data is also present in HDFS.
Any help? I am still getting the error below.
--
You received this message because you are subscribed to the Google Groups "actionml-user" group.
To unsubscribe from this group and stop receiving emails from it, send an email to actionml-use...@googlegroups.com.
To post to this group, send email to action...@googlegroups.com.
To view this discussion on the web visit https://groups.google.com/d/msgid/actionml-user/ea66f2a2-5617-417d-9977-99af91529e03%40googlegroups.com.
To view this discussion on the web visit https://groups.google.com/d/msgid/actionml-user/e64b578f-e8e0-4d91-ac38-8de0683cbfa9%40googlegroups.com.
date_rdd = (df .filter("event = '%s'" % (PRIMARY_EVENT_NAME)) .select("Date") .sort("Date", ascending=True) .rdd)
date_rdd.collect()
[Row(Date=datetime.datetime(2016, 8, 24, 20, 25, 7, 572000)), Row(Date=datetime.datetime(2016, 8, 25, 15, 37, 7, 572000)), Row(Date=datetime.datetime(2016, 8, 26, 10, 49, 7, 572000)), Row(Date=datetime.datetime(2016, 8, 27, 6, 1, 7, 572000)), Row(Date=datetime.datetime(2016, 8, 28, 1, 13, 7, 572000)), Row(Date=datetime.datetime(2016, 8, 28, 20, 25, 7, 572000)), Row(Date=datetime.datetime(2016, 9, 6, 15, 37, 7, 572000)), Row(Date=datetime.datetime(2016, 9, 7, 10, 49, 7, 572000)), Row(Date=datetime.datetime(2016, 9, 8, 6, 1, 7, 572000))]
total_primary_events = date_rdd.count()
Py4JJavaError Traceback (most recent call last) <ipython-input-18-84dbd0a5d19b> in <module>() 5 .rdd) 6 ----> 7 total_primary_events = date_rdd.count() /home/rasna/PredictionIO/vendors/spark-1.6.2/python/pyspark/rdd.py in count(self) 1002 3 1003 """ -> 1004 return self.mapPartitions(lambda i: [sum(1 for _ in i)]).sum() 1005 1006 def stats(self): /home/rasna/PredictionIO/vendors/spark-1.6.2/python/pyspark/rdd.py in sum(self) 993 6.0 994 """ --> 995 return self.mapPartitions(lambda x: [sum(x)]).fold(0, operator.add) 996 997 def count(self): /home/rasna/PredictionIO/vendors/spark-1.6.2/python/pyspark/rdd.py in fold(self, zeroValue, op) 867 # zeroValue provided to each partition is unique from the one provided 868 # to the final reduce call --> 869 vals = self.mapPartitions(func).collect() 870 return reduce(op, vals, zeroValue) 871 /home/rasna/PredictionIO/vendors/spark-1.6.2/python/pyspark/rdd.py in collect(self) 769 """ 770 with SCCallSiteSync(self.context) as css: --> 771 port = self.ctx._jvm.PythonRDD.collectAndServe(self._jrdd.rdd()) 772 return list(_load_from_socket(port, self._jrdd_deserializer)) 773 /usr/local/lib/python3.5/dist-packages/py4j/java_gateway.py in __call__(self, *args) 1131 answer = self.gateway_client.send_command(command) 1132 return_value = get_return_value( -> 1133 answer, self.gateway_client, self.target_id, self.name) 1134 1135 for temp_arg in temp_args: /home/rasna/PredictionIO/vendors/spark-1.6.2/python/pyspark/sql/utils.py in deco(*a, **kw) 43 def deco(*a, **kw): 44 try: ---> 45 return f(*a, **kw) 46 except py4j.protocol.Py4JJavaError as e: 47 s = e.java_exception.toString() /usr/local/lib/python3.5/dist-packages/py4j/protocol.py in get_return_value(answer, gateway_client, target_id, name) 317 raise Py4JJavaError( 318 "An error occurred while calling {0}{1}{2}.\n". 
--> 319 format(target_id, ".", name), value) 320 else: 321 raise Py4JError( Py4JJavaError: An error occurred while calling z:org.apache.spark.api.python.PythonRDD.collectAndServe. : org.apache.spark.SparkException: Job aborted due to stage failure: Task 0 in stage 37.0 failed 1 times, most recent failure: Lost task 0.0 in stage 37.0 (TID 952, localhost): org.apache.spark.api.python.PythonException: Traceback (most recent call last): File "/home/rasna/PredictionIO/vendors/spark-1.6.2/python/lib/pyspark.zip/pyspark/worker.py", line 64, in main ("%d.%d" % sys.version_info[:2], version)) Exception: Python in worker has different version 2.7 than that in driver 3.5, PySpark cannot run with different minor versions at org.apache.spark.api.python.PythonRunner$$anon$1.read(PythonRDD.scala:166) at org.apache.spark.api.python.PythonRunner$$anon$1.<init>(PythonRDD.scala:207) at org.apache.spark.api.python.PythonRunner.compute(PythonRDD.scala:125) at org.apache.spark.api.python.PythonRDD.compute(PythonRDD.scala:70) at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:306) at org.apache.spark.rdd.RDD.iterator(RDD.scala:270) at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:66) at org.apache.spark.scheduler.Task.run(Task.scala:89) at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:227) at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1142) at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:617) at java.lang.Thread.run(Thread.java:745)
--
You received this message because you are subscribed to the Google Groups "actionml-user" group.
To unsubscribe from this group and stop receiving emails from it, send an email to actionml-use...@googlegroups.com.
To post to this group, send email to action...@googlegroups.com.
To view this discussion on the web visit https://groups.google.com/d/msgid/actionml-user/61d26573-85fe-40dd-833e-7cebc08bc7d8%40googlegroups.com.