0.10.0b1
Out: ['2', '3']
ValueError Traceback (most recent call last) <ipython-input-2-1731c0f94070> in <module>() ----> 1 df = readTable() <ipython-input-1-b533b535b474> in readTable() 40 41 def readTable(): ---> 42 store = pd.HDFStore(r'C:\Table.h5',mode='r') 43 df = store.select('df',[ Term('index<1000'), Term('columns', '=', ['X','Y']) ]) 44 return df C:\Python27\lib\site-packages\pandas\io\pytables.pyc in __init__(self, path, mode, complevel, complib, fletcher32) 196 self.fletcher32 = fletcher32 197 self.filters = None --> 198 self.open(mode=mode, warn=False) 199 200 @property C:\Python27\lib\site-packages\pandas\io\pytables.pyc in open(self, mode, warn) 291 292 try: --> 293 self.handle = _tables().openFile(self.path, self.mode) 294 except IOError, e: # pragma: no cover 295 if 'can not be written' in str(e): C:\Python27\lib\site-packages\pandas\io\pytables.pyc in _tables() 86 87 # version requirements ---> 88 major, minor, subv = tables.__version__.split('.') 89 if int(major) >= 2 and int(minor[0]) >= 3: 90 _table_supports_index = True ValueError: need more than 2 values to unpack
--------------------------------------------------------------------------- NotImplementedError Traceback (most recent call last) <ipython-input-5-1731c0f94070> in <module>() ----> 1 df = readTable() <ipython-input-4-7460fc5e8e3b> in readTable() 43 def readTable(): 44 store = pd.HDFStore(r'C:\Table.h5',mode='r') ---> 45 df = store.select('df',[ Term('index<1000'), Term('columns', '=', ['X','Y']) ]) 46 return df C:\Python27\lib\site-packages\pandas\io\pytables.pyc in select(self, key, where) 344 if where is not None and not _is_table_type(group): 345 raise Exception('can only select with where on objects written as tables') --> 346 return self._read_group(group, where) 347 348 def put(self, key, value, table=False, append=False, C:\Python27\lib\site-packages\pandas\io\pytables.pyc in _read_group(self, group, where) 832 kind = _LEGACY_MAP.get(kind, kind) 833 handler = self._get_handler(op='read', kind=kind) --> 834 return handler(group, where) 835 836 def _read_series(self, group, where=None): C:\Python27\lib\site-packages\pandas\io\pytables.pyc in _read_frame_table(self, group, where) 863 def _read_frame_table(self, group, where=None): 864 t = create_table(self, group) --> 865 return t.read(where) 866 867 C:\Python27\lib\site-packages\pandas\io\pytables.pyc in read(self, where) 1606 def read(self, where=None): 1607 -> 1608 self.read_axes(where) 1609 1610 index = Index(self.index_axes[0].values) C:\Python27\lib\site-packages\pandas\io\pytables.pyc in read_axes(self, where) 1273 # create the selection 1274 self.selection = Selection(self, where) -> 1275 self.selection.select() 1276 1277 # convert the data C:\Python27\lib\site-packages\pandas\io\pytables.pyc in select(self) 2045 """ 2046 if self.condition is not None: -> 2047 self.values = self.table.table.readWhere(self.condition) 2048 else: 2049 self.values = self.table.table.read() C:\Python27\lib\site-packages\tables\table.pyc in readWhere(self, condition, condvars, field, start, stop, step) 1482 1483 coords = [ 
p.nrow for p in -> 1484 self._where(condition, condvars, start, stop, step) ] 1485 self._whereCondition = None # reset the conditions 1486 if len(coords) > 1: C:\Python27\lib\site-packages\tables\table.pyc in _where(self, condition, condvars, start, stop, step) 1435 # Compile the condition and extract usable index conditions. 1436 condvars = self._requiredExprVars(condition, condvars, depth=3) -> 1437 compiled = self._compileCondition(condition, condvars) 1438 1439 # Can we use indexes? C:\Python27\lib\site-packages\tables\table.pyc in _compileCondition(self, condition, condvars) 1314 indexedcols = frozenset(indexedcols) 1315 # Now let ``compile_condition()`` do the Numexpr-related job. -> 1316 compiled = compile_condition(condition, typemap, indexedcols, copycols) 1317 1318 # Check that there actually are columns in the condition. C:\Python27\lib\site-packages\tables\conditions.pyc in compile_condition(condition, typemap, indexedcols, copycols) 392 except NotImplementedError, nie: 393 # Try to make this Numexpr error less cryptic. --> 394 raise _unsupported_operation_error(nie) 395 params = varnames 396 NotImplementedError: unsupported operand types for *gt*: str, long
<class 'pandas.io.pytables.HDFStore'> File path: C:\Table.h5 /df frame_table (typ->appendable,nrows->2742428)
C:\TableAppend.h5 (File) '' Last modif.: 'Wed Dec 12 11:09:43 2012' Object Tree: / (RootGroup) '' /df (Group) '' /df/table (Table(2742428,)) ''
--
ValueError: cannot convert float NaN to integer

%time df = appendCSV()
CPU times: user 298.00 s, sys: 0.00 s, total: 298.00 s Wall time: 298.00 s
In [3]:
%time loadTableRegular(df)
CPU times: user 74.65 s, sys: 0.00 s, total: 74.65 s Wall time: 74.66 s
%time loadTableAppend(df)
---------------------------------------------------------------------------
ValueError Traceback (most recent call last)
<ipython-input-3-fe79bd31133f> in <module>() ----> 1 get_ipython().magic(u'time loadTableAppend(df)') C:\Python27\lib\site-packages\IPython\core\interactiveshell.pyc in magic(self, arg_s) 2134 magic_name, _, magic_arg_s = arg_s.partition(' ') 2135 magic_name = magic_name.lstrip(prefilter.ESC_MAGIC) -> 2136 return self.run_line_magic(magic_name, magic_arg_s) 2137 2138 #------------------------------------------------------------------------- C:\Python27\lib\site-packages\IPython\core\interactiveshell.pyc in run_line_magic(self, magic_name, line) 2060 args.append(sys._getframe(stack_depth).f_locals) 2061 with self.builtin_trap: -> 2062 result = fn(*args) 2063 return result 2064 C:\Python27\lib\site-packages\IPython\core\magics\execution.pyc in time(self, parameter_s, user_locals) C:\Python27\lib\site-packages\IPython\core\magic.pyc in <lambda>(f, *a, **k) 189 # but it's overkill for just that one bit of state. 190 def magic_deco(arg): --> 191 call = lambda f, *a, **k: f(*a, **k) 192 193 if callable(arg): C:\Python27\lib\site-packages\IPython\core\magics\execution.pyc in time(self, parameter_s, user_locals) 893 if mode=='eval': 894 st = clock2() --> 895 out = eval(code, glob, user_locals) 896 end = clock2() 897 else: <timed eval> in <module>() <ipython-input-1-3aa5c26906a5> in loadTableAppend(df) 20 def loadTableAppend(df): 21 store = pd.HDFStore(r'C:\TableAppend.h5',mode='w') ---> 22 store.append('df',df) 23 store.close() 24 C:\Python27\lib\site-packages\pandas\io\pytables.pyc in append(self, key, value, **kwargs) 426 data in the table, so be careful 427 """ --> 428 self._write_to_group(key, value, table=True, append=True, **kwargs) 429 430 def create_table_index(self, key, **kwargs): C:\Python27\lib\site-packages\pandas\io\pytables.pyc in _write_to_group(self, key, value, table, append, comp, **kwargs) 505 wrapper = lambda value: handler(group, value) 506 --> 507 wrapper(value) 508 group._v_attrs.pandas_type = kind 509 group._v_attrs.pandas_version = _version 
C:\Python27\lib\site-packages\pandas\io\pytables.pyc in <lambda>(value) 495 handler = self._get_handler(op='write', kind=kind) 496 wrapper = lambda value: handler(group, value, append=append, --> 497 comp=comp, **kwargs) 498 else: 499 if append: C:\Python27\lib\site-packages\pandas\io\pytables.pyc in _write_frame_table(self, group, df, append, comp, axes, **kwargs) 650 axes = [0] 651 t = create_table(self, group, typ = 'appendable_frame') --> 652 t.write(axes=axes, obj=df, append=append, compression=comp, **kwargs) 653 654 _read_frame_table = _read_ndim_table C:\Python27\lib\site-packages\pandas\io\pytables.pyc in write(self, axes, obj, append, compression, complevel, min_itemsize, **kwargs) 1675 1676 # create the axes -> 1677 self.create_axes(axes = axes, obj = obj, validate = append, min_itemsize = min_itemsize) 1678 1679 if 'table' not in self.group: C:\Python27\lib\site-packages\pandas\io\pytables.pyc in create_axes(self, axes, obj, validate, min_itemsize) 1459 itemsize = eci 1460 -> 1461 atom = _tables().StringCol(itemsize = itemsize, shape = shape) 1462 utype = 'S%s' % itemsize 1463 kind = 'string' C:\Python27\lib\site-packages\tables\description.pyc in __init__(self, *args, **kwargs) 189 pos = kwargs.pop('pos', None) 190 class_from_prefix = self._class_from_prefix --> 191 atombase.__init__(self, *args, **kwargs) 192 # The constructor of an abstract atom may have changed 193 # the class of `self` to something different of `NewCol` C:\Python27\lib\site-packages\tables\atom.pyc in __init__(self, itemsize, shape, dflt) 586 587 def __init__(self, itemsize, shape=(), dflt=_defvalue): --> 588 if not hasattr(itemsize, '__int__') or int(itemsize) < 0: 589 raise ValueError( "invalid item size for kind ``%s``: %r; " 590 "it must be a positive integer" ValueError: cannot convert float NaN to integer
Thanks for any help,
-Gagi
--
%time loadTableAppend(df)
CPU times: user 444.99 s, sys: 0.00 s, total: 444.99 s
Wall time: 444.99 s <-- Create 8.37 GB (8,987,793,386 bytes) HDF5 Table @ ~18.8 MB/sec
%time df2 = readTable()
CPU times: user 3.79 s, sys: 0.00 s, total: 3.79 s Wall time: 3.79 s <-- Read From Disk 1M row x 4 column slice. Pretty quick Reading back 1 object and 3 floats.
%time df2 = readTable()
CPU times: user 1.99 s, sys: 0.00 s, total: 1.99 s Wall time: 1.99 s
df2.shape
(100000, 4)
--
tbl.cols.column_name.createIndex() --
CPU times: user 396.95 s, sys: 0.00 s, total: 396.95 s
Wall time: 396.95 s <-- Writing 8.36 GB (8,986,835,144 bytes) HDF5 table @ ~21 MB/Sec

%time df2 = readTable()
print df2.shape
CPU times: user 41.09 s, sys: 0.00 s, total: 41.09 s Wall time: 41.10 s <-- Get back 1M rows by 2 cols in 41 Sec not too bad. :) (1000000, 2)
If I modify the code to pull all columns:
df = store.select('df',[ Term('index<100000') ])
CPU times: user 54.81 s, sys: 0.00 s, total: 54.81 s Wall time: 54.82 s <-- Takes 10 Sec more time and probably most of the 10 Sec difference (1000000, 350) is attributed to converting more columns to the pandas DataFrame object from table.
CPU times: user 7.72 s, sys: 0.00 s, total: 7.72 s Wall time: 7.72 s <-- Pulls 48.4k rows by 350 columns in 7.72 sec. Quite good indexing on column 'X' (48405, 350) Not sure if calling any other indexing command/options can help speed this up?

Selecting only two columns makes not much performance difference, as Jeff stated:
CPU times: user 7.70 s, sys: 0.00 s, total: 7.70 s
Wall time: 7.70 s <-- Get back 48.4k rows by 2 columns in almost the same time.
(48405, 2) This confirms what Jeff was saying. There is no speedup in sub-selecting columns since hdf5 table entries are stored as rows and columns are only filtered after all--
CPU times: user 5.50 s, sys: 0.00 s, total: 5.50 s
Wall time: 5.50 s <-- Versus 7.72 sec without calling store.create_table_index('df', columns = ['index','X'])
(48405, 2)

CPU times: user 40.59 s, sys: 0.00 s, total: 40.59 s Wall time: 40.59 s <-- vs 54.82 Sec without indexing. (1000000, 350)
--
--
--------------------------------------------------------------------------- AttributeError Traceback (most recent call last) <ipython-input-12-cd8b6f868aec> in <module>() 1 data = 'a,b,c\n1.1,2,3\nDog,5,6\n7.7,8,9.5' ----> 2 df = pd.read_csv(StringIO.StringIO(data), dtype={'a': np.float}) 3 df.dtypes C:\Python27\lib\site-packages\pandas\io\parsers.pyc in parser_f(filepath_or_buffer, sep, dialect, compression, doublequote, escapechar, quotechar, quoting, skipinitialspace, lineterminator, header, index_col, names, prefix, skiprows, skipfooter, skip_footer, na_values, true_values, false_values, delimiter, converters, dtype, usecols, engine, delim_whitespace, as_recarray, na_filter, compact_ints, use_unsigned, low_memory, buffer_lines, warn_bad_lines, error_bad_lines, keep_default_na, thousands, comment, decimal, parse_dates, keep_date_col, dayfirst, date_parser, memory_map, nrows, iterator, chunksize, verbose, encoding, squeeze) 389 buffer_lines=buffer_lines) 390 --> 391 return _read(filepath_or_buffer, kwds) 392 393 parser_f.__name__ = name C:\Python27\lib\site-packages\pandas\io\parsers.pyc in _read(filepath_or_buffer, kwds) 205 return parser 206 --> 207 return parser.read() 208 209 _parser_defaults = { C:\Python27\lib\site-packages\pandas\io\parsers.pyc in read(self, nrows) 622 # self._engine.set_error_bad_lines(False) 623 --> 624 ret = self._engine.read(nrows) 625 626 if self.options.get('as_recarray'): C:\Python27\lib\site-packages\pandas\io\parsers.pyc in read(self, nrows) 943 944 try: --> 945 data = self._reader.read(nrows) 946 except StopIteration: 947 if nrows is None: C:\Python27\lib\site-packages\pandas\_parser.pyd in pandas._parser.TextReader.read (pandas\src\parser.c:5785)() C:\Python27\lib\site-packages\pandas\_parser.pyd in pandas._parser.TextReader._read_low_memory (pandas\src\parser.c:6002)() C:\Python27\lib\site-packages\pandas\_parser.pyd in pandas._parser.TextReader._read_rows (pandas\src\parser.c:6870)() 
C:\Python27\lib\site-packages\pandas\_parser.pyd in pandas._parser.TextReader._convert_column_data (pandas\src\parser.c:7919)() AttributeError: 'NoneType' object has no attribute 'dtype'
C:\Python27\lib\site-packages\pandas\io\pytables.pyc in validate_attr(self, append) 1158 if (existing_fields is not None and 1159 existing_fields != list(self.values)): -> 1160 raise Exception("appended items do not match existing items" 1161 " in table!") 1162 Exception: appended items do not match existing items in table!
df = store.select('df',[ Term('index<1000'), Term('columns', '=', ['X','Y']) ])
return df
Output:
In [2]:%time df = appendCSV()
CPU times: user 298.00 s, sys: 0.00 s, total: 298.00 s Wall time: 298.00 s
In [3]:%time loadTableRegular(df)
CPU times: user 74.65 s, sys: 0.00 s, total: 74.65 s Wall time: 74.66 s

In [3]: %time loadTableAppend(df)
---------------------------------------------------------------------------
ValueError Traceback (most recent call last)
------------
--
%time df3 = AppendCSV2HDF5()
...
CPU times: user 327.91 s, sys: 0.00 s, total: 327.91 s Wall time: 327.91 s <-- Sniffed then Appended all 38 CSV files to a 8.37 GB (8,996,754,715 bytes) HDF5 file @ ~25.5 MB/Sec
I did notice that the 'index' data column in this new HDF5 table is non-unique after this type of serialized append.
This makes sense because it's just a repeating index between 0~75k for each csv file appended in. It makes me wonder if we even
need this index column or if there is a way to make it unique like the pandas 'ignore_index=True' read_csv attribute.
Perhaps I'm missing an hdf5 flush somewhere that can create a unique 'index' column.
So part of my HDF5 file looks like:
absolute_index | index | values_block_0
756753 | 0 | 0.5
756754 | 1 | 0.6
756755 | 2 | 0.7
...
831754 | 0 | 0.8 <-- the 'index' column repeats itself but the absolute table index does not.
Thanks,
-Gagi
--
--
pandas-0.10.0.dev-6826609.win-amd64-py2.7.exe 12-Dec-2012 00:07 1974432...
pandas-0.10.1.dev-c934e02.win-amd64-py2.6.exe 30-Dec-2012 22:45 2031018 pandas-0.10.1.dev-c934e02.win-amd64-py3.1.exe 30-Dec-2012 22:02 2013126 pandas-0.10.1.dev-c934e02.win-amd64-py3.2.exe 30-Dec-2012 22:02 2024535 pandas-0.10.1.dev-c934e02.win32-py2.6.exe 30-Dec-2012 22:25 1869961 pandas-0.10.1.dev-c934e02.win32-py3.1.exe 30-Dec-2012 22:36 1852440 pandas-0.10.1.dev-c934e02.win32-py3.2.exe 30-Dec-2012 22:41 1852436
-> 2047 self<sp...
Show original
--
--
min_itemsize = { 'COl_A' : 50, 'COl_B' : 25, 'COl_C' : 10 }