# Download a CSV from a URL through the SparkContext and load it as a DataFrame.
# NOTE: `sc` is not defined in this snippet; presumably it is the SparkContext
# that the pyspark shell creates at start-up -- confirm before running elsewhere.
from pyspark.sql import SQLContext
from pyspark import SparkFiles
# Restrict Spark's log output to the ERROR level.
sc.setLogLevel("ERROR")
url = "https://raw.githubusercontent.com/guru99-edu/R-Programming/master/adult_data.csv"
# Register the remote file with the SparkContext; it is looked up below by its
# base name, "adult_data.csv".
sc.addFile(url)
sqlContext = SQLContext(sc)
# The result of this call is discarded; in an interactive shell the line only
# echoes the directory, in a script it has no visible effect.
SparkFiles.getRootDirectory()
# NOTE(review): the path passed to read.csv carries no explicit URI scheme.
# On a cluster whose default filesystem is HDFS it is presumably resolved
# against HDFS rather than the local disk, which would match the
# "file does not exist" error reported for this line -- confirm.
df = sqlContext.read.csv(SparkFiles.get("adult_data.csv"), header=True, inferSchema= True)
At this point I receive errors saying that the file does not exist. I then downloaded the file and copied it into HDFS manually:
user@cluster-6ef9-m:~$ wget https://raw.githubusercontent.com/guru99-edu/R-Programming/master/adult_data.csv
user@cluster-6ef9-m:~$ hdfs dfs -put adult_data.csv
user@cluster-6ef9-m:~$ hdfs dfs -ls
Found 2 items
drwxrwxrwt - user hadoop 0 2020-05-02 18:38 .sparkStaging
-rw-r--r-- 2 user hadoop 5608318 2020-05-02 18:39 adult_data.csv
But pyspark also fails to find this file.
# Load the CSV directly from an explicit HDFS path instead of going through
# sc.addFile() / SparkFiles.
# NOTE: `sc` is not defined in this snippet; presumably it is the SparkContext
# that the pyspark shell creates at start-up -- confirm before running elsewhere.
from pyspark.sql import SQLContext
from pyspark import SparkFiles
# Restrict Spark's log output to the ERROR level.
sc.setLogLevel("ERROR")
# NOTE: neither `url` nor the SparkFiles import is used anywhere in this snippet.
url = "https://raw.githubusercontent.com/guru99-edu/R-Programming/master/adult_data.csv"
sqlContext = SQLContext(sc)
# Reads /mydata/adult_data.csv from HDFS; the first row supplies the column
# names (header=True) and column types are inferred (inferSchema=True).
# NOTE(review): an `hdfs dfs -put adult_data.csv` with no destination argument
# presumably uploads to the user's HDFS home directory, not /mydata -- verify
# that the file really exists at this path.
df = sqlContext.read.csv("hdfs:///mydata/adult_data.csv", header=True, inferSchema= True)
Output:
Using Python version 2.7.13 (default, Sep 26 2018 18:42:22)
SparkSession available as 'spark'.
>>> from pyspark.sql import SQLContext
>>> from pyspark import SparkFiles
>>> sc.setLogLevel("ERROR")
>>> url = "https://raw.githubusercontent.com/guru99-edu/R-Programming/master/adult_data.csv"
>>> sqlContext = SQLContext(sc)
>>> df = sqlContext.read.csv("hdfs:///mydata/adult_data.csv", header=True, inferSchema= True)
ivysettings.xml file not found in HIVE_HOME or HIVE_CONF_DIR,/etc/hive/conf.dist/ivysettings.xml will be used
>>>
>>> df.head(5)
[Row(x=1, age=25, workclass=u'Private', fnlwgt=226802, education=u'11th', educational-num=7, marital-status=u'Never-married', occupation=u'Machine-op-inspct', relationship=u'Own-child', race=u'Black', gender=u'Male', capital-gain=0, capital-loss=0, hours-per-week=40, native-country=u'United-States', income=u'<=50K'), Row(x=2, age=38, workclass=u'Private', fnlwgt=89814, education=u'HS-grad', educational-num=9, marital-status=u'Married-civ-spouse', occupation=u'Farming-fishing', relationship=u'Husband', race=u'White', gender=u'Male', capital-gain=0, capital-loss=0, hours-per-week=50, native-country=u'United-States', income=u'<=50K'), Row(x=3, age=28, workclass=u'Local-gov', fnlwgt=336951, education=u'Assoc-acdm', educational-num=12, marital-status=u'Married-civ-spouse', occupation=u'Protective-serv', relationship=u'Husband', race=u'White', gender=u'Male', capital-gain=0, capital-loss=0, hours-per-week=40, native-country=u'United-States', income=u'>50K'), Row(x=4, age=44, workclass=u'Private', fnlwgt=160323, education=u'Some-college', educational-num=10, marital-status=u'Married-civ-spouse', occupation=u'Machine-op-inspct', relationship=u'Husband', race=u'Black', gender=u'Male', capital-gain=7688, capital-loss=0, hours-per-week=40, native-country=u'United-States', income=u'>50K'), Row(x=5, age=18, workclass=u'?', fnlwgt=103497, education=u'Some-college', educational-num=10, marital-status=u'Never-married', occupation=u'?', relationship=u'Own-child', race=u'White', gender=u'Female', capital-gain=0, capital-loss=0, hours-per-week=30, native-country=u'United-States', income=u'<=50K')]