from pyspark.sql import SparkSession
import urllib.parse
import pprint

ssl_directory = r'/etc'
password = urllib.parse.quote_plus('Hv@cint006')
host_ip = "192.168.0.21"

# Input and output URIs are identical, so build the connection string once.
mongo_uri = ("mongodb://Gandalf:" + password + "@" + host_ip + ":27017/Test.Value"
             + "?ssl=true"
             + "&ssl_ca_certs=" + ssl_directory + r'/rootCA.pem'
             + "&ssl_certfile=" + ssl_directory + r'/mongo-client.pem'
             + "&ssl_pem_passphrase=" + password
             + "&ssl_cert_reqs=CERT_NONE"
             + "&authSource=admin")

spark_session = SparkSession \
    .builder \
    .appName("Test_Spark_MongoDB") \
    .config("spark.mongodb.input.uri", mongo_uri) \
    .config("spark.mongodb.output.uri", mongo_uri) \
    .getOrCreate()

data_frame = spark_session.read.format("com.mongodb.spark.sql.DefaultSource").load()
data_frame.printSchema()
pprint.pprint(data_frame)

This fails with:

Traceback (most recent call last):
  File "/usr/local/spark/python/pyspark/sql/utils.py", line 63, in deco
    return f(*a, **kw)
  File "/usr/local/spark/python/lib/py4j-0.10.4-src.zip/py4j/protocol.py", line 319, in get_return_value
    format(target_id, ".", name), value)
py4j.protocol.Py4JJavaError: An error occurred while calling o308.load.
: java.lang.IllegalArgumentException: requirement failed: Invalid uri: 'mongodb://Gandalf:Hv%40ci...@192.168.0.21:27017/Test.Value?ssl=true&ssl_ca_certs=/etc/rootCA.pem&ssl_certfile=/etc/mongo-client.pem&ssl_pem_passphrase=Hv%40cint006&ssl_cert_reqs=CERT_NONE&authSource=admin'
    at scala.Predef$.require(Predef.scala:224)
    at com.mongodb.spark.config.MongoCompanionConfig$class.connectionString(MongoCompanionConfig.scala:279)
    at com.mongodb.spark.config.ReadConfig$.connectionString(ReadConfig.scala:39)
    at com.mongodb.spark.config.ReadConfig$.apply(ReadConfig.scala:51)
    at com.mongodb.spark.config.ReadConfig$.apply(ReadConfig.scala:39)
    at com.mongodb.spark.config.MongoCompanionConfig$class.apply(MongoCompanionConfig.scala:124)
    at com.mongodb.spark.config.ReadConfig$.apply(ReadConfig.scala:39)
    at com.mongodb.spark.config.MongoCompanionConfig$class.apply(MongoCompanionConfig.scala:113)
    at com.mongodb.spark.config.ReadConfig$.apply(ReadConfig.scala:39)
    at com.mongodb.spark.sql.DefaultSource.createRelation(DefaultSource.scala:67)
    at com.mongodb.spark.sql.DefaultSource.createRelation(DefaultSource.scala:50)
    at org.apache.spark.sql.execution.datasources.DataSource.resolveRelation(DataSource.scala:330)
    at org.apache.spark.sql.DataFrameReader.load(DataFrameReader.scala:152)
    at org.apache.spark.sql.DataFrameReader.load(DataFrameReader.scala:125)
    at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
    at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
    at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
    at java.lang.reflect.Method.invoke(Method.java:498)
    at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
    at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
    at py4j.Gateway.invoke(Gateway.java:280)
    at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
    at py4j.commands.CallCommand.execute(CallCommand.java:79)
    at py4j.GatewayConnection.run(GatewayConnection.java:214)
    at java.lang.Thread.run(Thread.java:748)

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/tmp/zeppelin_python-7072083241956426957.py", line 270, in <module>
    exec(code, _zcUserQueryNameSpace)
  File "<stdin>", line 23, in <module>
  File "/usr/local/spark/python/pyspark/sql/readwriter.py", line 155, in load
    return self._df(self._jreader.load())
  File "/usr/local/spark/python/lib/py4j-0.10.4-src.zip/py4j/java_gateway.py", line 1133, in __call__
    answer, self.gateway_client, self.target_id, self.name)
  File "/usr/local/spark/python/pyspark/sql/utils.py", line 79, in deco
    raise IllegalArgumentException(s.split(': ', 1)[1], stackTrace)
pyspark.sql.utils.IllegalArgumentException: "requirement failed: Invalid uri: 'mongodb://Gandalf:Hv%40ci...@192.168.0.21:27017/Test.Value?ssl=true&ssl_ca_certs=/etc/rootCA.pem&ssl_certfile=/etc/mongo-client.pem&ssl_pem_passphrase=Hv%40cint006&ssl_cert_reqs=CERT_NONE&authSource=admin'"

Even though the reporter is using Python, the MongoDB Spark Connector is built on the Java MongoDB driver. Unlike the Python driver, the Java driver does not accept trust store and CA information as connection-URI options, so the PyMongo-specific parameters in the URI (ssl_ca_certs, ssl_certfile, ssl_pem_passphrase, ssl_cert_reqs) cause the connection string to be rejected as invalid. Instead, you have to tell the JVM where your trust store lives by updating zeppelin-env.sh and adding/updating the following line:
export SPARK_SUBMIT_OPTIONS="--driver-java-options -Djavax.net.ssl.trustStore=/your/location/truststore.ts --conf spark.executor.extraJavaOptions=-Djavax.net.ssl.trustStore=/your/location/truststore.ts"
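With the trust store supplied through the JVM options above (for example, a JKS trust store built from rootCA.pem with keytool), the PyMongo-specific SSL parameters should be dropped from the URI, leaving only options the Java driver understands. A minimal sketch of what the rewritten session setup might then look like, under that assumption — the host, credentials, and database/collection names are simply carried over from the report, and the exact option set may vary by connector version:

from pyspark.sql import SparkSession
import urllib.parse

# Credentials and host carried over from the original report.
password = urllib.parse.quote_plus('Hv@cint006')
host_ip = "192.168.0.21"

# The URI now carries only options the Java driver understands; the
# trust store location comes from -Djavax.net.ssl.trustStore set via
# SPARK_SUBMIT_OPTIONS in zeppelin-env.sh (see above).
mongo_uri = ("mongodb://Gandalf:" + password + "@" + host_ip
             + ":27017/Test.Value?ssl=true&authSource=admin")

spark_session = SparkSession \
    .builder \
    .appName("Test_Spark_MongoDB") \
    .config("spark.mongodb.input.uri", mongo_uri) \
    .config("spark.mongodb.output.uri", mongo_uri) \
    .getOrCreate()

data_frame = spark_session.read \
    .format("com.mongodb.spark.sql.DefaultSource") \
    .load()
data_frame.printSchema()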
I am able to get this working on my HDP 2.5 sandbox, but not on our HDP DEV / PROD clusters. I am working with the admin team and Hortonworks to resolve this, but there is a lot of confusion… It would be great if MongoDB would step up and document how this is supposed to work.
Reference: https://jira.mongodb.org/browse/SPARK-115