pd.read_csv(compression='gzip') fails with HTTP URL

396 views
Skip to first unread message

Nick Schultz

unread,
Sep 26, 2014, 8:01:25 PM (9/26/14)
to pyd...@googlegroups.com
I think there is a problem with pandas when reading a gzip'd CSV file via an HTTP URL.  Loading gzip'd CSVs from the filesystem works fine, so I suspect the problem is specific to reading from a URL. Can anybody reproduce the following outcomes (see below)?  What would be some possible workarounds?

Thanks,

Nick

uncompressed CSV:

import pandas as pd

filename = r'http://samplecsvs.s3.amazonaws.com/SalesJan2009.csv'
df = pd.read_csv(filename)
print(df.shape)

Output:
(998, 12)

with gzip'd CSV:
import pandas as pd


filename = r'http://nodestreams.com/input/people.csv.gz'
df = pd.read_csv(filename, compression='gzip')
print(df.shape)

Output:
Traceback (most recent call last):
  File "/nfs/site/home/nschultz/mydisk4/web/test.py", line 33, in <module>
    df = pd.read_csv(filename, compression='gzip')
  File "/nfs/fm/disks/fm_cse_05026/nschultz/python/lib/python3.3/site-packages/pandas-0.14.1-py3.3-linux-x86_64.egg/pandas/io/parsers.py", line 452, in parser_f
    return _read(filepath_or_buffer, kwds)
  File "/nfs/fm/disks/fm_cse_05026/nschultz/python/lib/python3.3/site-packages/pandas-0.14.1-py3.3-linux-x86_64.egg/pandas/io/parsers.py", line 234, in _read
    parser = TextFileReader(filepath_or_buffer, **kwds)
  File "/nfs/fm/disks/fm_cse_05026/nschultz/python/lib/python3.3/site-packages/pandas-0.14.1-py3.3-linux-x86_64.egg/pandas/io/parsers.py", line 542, in __init__
    self._make_engine(self.engine)
  File "/nfs/fm/disks/fm_cse_05026/nschultz/python/lib/python3.3/site-packages/pandas-0.14.1-py3.3-linux-x86_64.egg/pandas/io/parsers.py", line 679, in _make_engine
    self._engine = CParserWrapper(self.f, **self.options)
  File "/nfs/fm/disks/fm_cse_05026/nschultz/python/lib/python3.3/site-packages/pandas-0.14.1-py3.3-linux-x86_64.egg/pandas/io/parsers.py", line 1041, in __init__
    self._reader = _parser.TextReader(src, **kwds)
  File "parser.pyx", line 485, in pandas.parser.TextReader.__cinit__ (pandas/parser.c:4413)
  File "parser.pyx", line 600, in pandas.parser.TextReader._get_header (pandas/parser.c:5649)
  File "parser.pyx", line 791, in pandas.parser.TextReader._tokenize_rows (pandas/parser.c:7599)
  File "parser.pyx", line 1699, in pandas.parser.raise_parser_error (pandas/parser.c:19062)
pandas.parser.CParserError: Error tokenizing data. C error: Calling read(nbytes) on source failed. Try engine='python'.



with gzip'd CSV (engine ='python'):
import pandas as pd


filename = r'http://nodestreams.com/input/people.csv.gz'
df = pd.read_csv(filename, compression='gzip', engine= 'python')
print(df.shape)

 
Output:
Traceback (most recent call last):
  File "/nfs/site/home/nschultz/mydisk4/web/test.py", line 33, in <module>
    df = pd.read_csv(filename, compression='gzip', engine= 'python')
  File "/nfs/fm/disks/fm_cse_05026/nschultz/python/lib/python3.3/site-packages/pandas-0.14.1-py3.3-linux-x86_64.egg/pandas/io/parsers.py", line 452, in parser_f
    return _read(filepath_or_buffer, kwds)
  File "/nfs/fm/disks/fm_cse_05026/nschultz/python/lib/python3.3/site-packages/pandas-0.14.1-py3.3-linux-x86_64.egg/pandas/io/parsers.py", line 234, in _read
    parser = TextFileReader(filepath_or_buffer, **kwds)
  File "/nfs/fm/disks/fm_cse_05026/nschultz/python/lib/python3.3/site-packages/pandas-0.14.1-py3.3-linux-x86_64.egg/pandas/io/parsers.py", line 542, in __init__
    self._make_engine(self.engine)
  File "/nfs/fm/disks/fm_cse_05026/nschultz/python/lib/python3.3/site-packages/pandas-0.14.1-py3.3-linux-x86_64.egg/pandas/io/parsers.py", line 685, in _make_engine
    self._engine = klass(self.f, **self.options)
  File "/nfs/fm/disks/fm_cse_05026/nschultz/python/lib/python3.3/site-packages/pandas-0.14.1-py3.3-linux-x86_64.egg/pandas/io/parsers.py", line 1373, in __init__
    self.columns, self.num_original_columns = self._infer_columns()
  File "/nfs/fm/disks/fm_cse_05026/nschultz/python/lib/python3.3/site-packages/pandas-0.14.1-py3.3-linux-x86_64.egg/pandas/io/parsers.py", line 1587, in _infer_columns
    line = self._buffered_line()
  File "/nfs/fm/disks/fm_cse_05026/nschultz/python/lib/python3.3/site-packages/pandas-0.14.1-py3.3-linux-x86_64.egg/pandas/io/parsers.py", line 1713, in _buffered_line
    return self._next_line()
  File "/nfs/fm/disks/fm_cse_05026/nschultz/python/lib/python3.3/site-packages/pandas-0.14.1-py3.3-linux-x86_64.egg/pandas/io/parsers.py", line 1738, in _next_line
    orig_line = next(self.data)
  File "/usr/intel/pkgs/python/3.3.2/lib/python3.3/gzip.py", line 393, in read1
    self._read()
  File "/usr/intel/pkgs/python/3.3.2/lib/python3.3/gzip.py", line 441, in _read
    self._read_gzip_header()
  File "/usr/intel/pkgs/python/3.3.2/lib/python3.3/gzip.py", line 285, in _read_gzip_header
    magic = self.fileobj.read(2)
  File "/usr/intel/pkgs/python/3.3.2/lib/python3.3/gzip.py", line 93, in read
    self.file.read(size-self._length+read)
TypeError: can't concat bytes to str



 
Reply all
Reply to author
Forward
0 new messages