pyspark_location = 'lib/pymongo_spark.py'
HDFS_HOME = 'hdfs://1.1.1.1/'
INPUT_FILE = 'really_large_bson.gz'
class BsonEncoder(JSONEncoder):
def default(self, obj):
if isinstance(obj, ObjectId):
return str(obj)
elif isinstance(obj, datetime):
return obj.isoformat()
return JSONEncoder.default(self, obj)
def setup_spark_with_pymongo(app_name='PySprkApp'):
conf = SparkConf().setAppName(app_name)
sc = SparkContext(conf=conf)
sc.addPyFile(pyspark_location)
return sc
def main():
spark_context = setup_spark_with_pymongo()
filename = HDFS_HOME + INPUT_FILE
import pymongo_spark
pymongo_spark.activate()
rdd = spark_context.BSONFileRDD(filename)
print(rdd.first()) # ValueError: RDD is empty