Re: HDFStore & PyTables Version Tuple Unpacking issue


Gagi

unread,
Dec 12, 2012, 1:46:31 PM12/12/12
to pyd...@googlegroups.com
I currently have PyTables 2.3 installed on Windows 64-bit from the UCI repository installer.

import pandas as pd
print pd.__version__
0.10.0b1

import tables
print tables.__version__.split('.')

Out:['2', '3']

When I try to open a local HDF5 table, I get the following error while pandas checks the PyTables version:
store = pd.HDFStore(r'C:\Table.h5',mode='r')

ValueError                                Traceback (most recent call last)
<ipython-input-2-1731c0f94070> in <module>()
----> 1 df = readTable()

<ipython-input-1-b533b535b474> in readTable()
     40 
     41 def readTable():
---> 42     store = pd.HDFStore(r'C:\Table.h5',mode='r')
     43     df = store.select('df',[ Term('index<1000'), Term('columns', '=', ['X','Y']) ])
     44     return df

C:\Python27\lib\site-packages\pandas\io\pytables.pyc in __init__(self, path, mode, complevel, complib, fletcher32)
    196         self.fletcher32 = fletcher32
    197         self.filters = None
--> 198         self.open(mode=mode, warn=False)
    199 
    200     @property

C:\Python27\lib\site-packages\pandas\io\pytables.pyc in open(self, mode, warn)
    291 
    292         try:
--> 293             self.handle = _tables().openFile(self.path, self.mode)
    294         except IOError, e:  # pragma: no cover
    295             if 'can not be written' in str(e):

C:\Python27\lib\site-packages\pandas\io\pytables.pyc in _tables()
     86 
     87         # version requirements
---> 88         major, minor, subv = tables.__version__.split('.')
     89         if int(major) >= 2 and int(minor[0]) >= 3:
     90             _table_supports_index = True

ValueError: need more than 2 values to unpack

Looks like it's expecting to unpack a three-element version tuple, but my PyTables 2.3 version string only has two components. I will upgrade to PyTables 2.4 to see if this goes away, but it's something to check.
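For reference, a minimal sketch of the kind of padding that would make the check tolerant of two-part version strings (my own variable names, not the pandas internals, and it assumes a plain numeric version string):

import tables

# pad the split so both '2.3' and '2.4.0' yield three fields before unpacking
major, minor, subv = (tables.__version__.split('.') + ['0', '0'])[:3]
# subv is unused, kept only to mirror the three-way unpack in pandas
print 'table indexing supported:', (int(major), int(minor)) >= (2, 3)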


Now here comes the interesting part: if I run the exact same line of code again (without restarting the Python kernel), it bypasses the above error and raises a different exception when trying to select a few columns and rows out of an HDF5 table.

---------------------------------------------------------------------------
NotImplementedError                       Traceback (most recent call last)
<ipython-input-5-1731c0f94070> in <module>()
----> 1 df = readTable()

<ipython-input-4-7460fc5e8e3b> in readTable()
     43 def readTable():
     44     store = pd.HDFStore(r'C:\Table.h5',mode='r')
---> 45     df = store.select('df',[ Term('index<1000'), Term('columns', '=', ['X','Y']) ])
     46     return df

C:\Python27\lib\site-packages\pandas\io\pytables.pyc in select(self, key, where)
    344         if where is not None and not _is_table_type(group):
    345             raise Exception('can only select with where on objects written as tables')
--> 346         return self._read_group(group, where)
    347 
    348     def put(self, key, value, table=False, append=False,

C:\Python27\lib\site-packages\pandas\io\pytables.pyc in _read_group(self, group, where)
    832         kind = _LEGACY_MAP.get(kind, kind)
    833         handler = self._get_handler(op='read', kind=kind)
--> 834         return handler(group, where)
    835 
    836     def _read_series(self, group, where=None):

C:\Python27\lib\site-packages\pandas\io\pytables.pyc in _read_frame_table(self, group, where)
    863     def _read_frame_table(self, group, where=None):
    864         t = create_table(self, group)
--> 865         return t.read(where)
    866 
    867 

C:\Python27\lib\site-packages\pandas\io\pytables.pyc in read(self, where)
   1606     def read(self, where=None):
   1607 
-> 1608         self.read_axes(where)
   1609 
   1610         index   = Index(self.index_axes[0].values)

C:\Python27\lib\site-packages\pandas\io\pytables.pyc in read_axes(self, where)
   1273         # create the selection
   1274         self.selection = Selection(self, where)
-> 1275         self.selection.select()
   1276 
   1277         # convert the data

C:\Python27\lib\site-packages\pandas\io\pytables.pyc in select(self)
   2045         """
   2046         if self.condition is not None:
-> 2047             self.values = self.table.table.readWhere(self.condition)
   2048         else:
   2049             self.values = self.table.table.read()

C:\Python27\lib\site-packages\tables\table.pyc in readWhere(self, condition, condvars, field, start, stop, step)
   1482 
   1483         coords = [ p.nrow for p in
-> 1484                    self._where(condition, condvars, start, stop, step) ]
   1485         self._whereCondition = None  # reset the conditions
   1486         if len(coords) > 1:

C:\Python27\lib\site-packages\tables\table.pyc in _where(self, condition, condvars, start, stop, step)
   1435         # Compile the condition and extract usable index conditions.
   1436         condvars = self._requiredExprVars(condition, condvars, depth=3)
-> 1437         compiled = self._compileCondition(condition, condvars)
   1438 
   1439         # Can we use indexes?

C:\Python27\lib\site-packages\tables\table.pyc in _compileCondition(self, condition, condvars)
   1314         indexedcols = frozenset(indexedcols)
   1315         # Now let ``compile_condition()`` do the Numexpr-related job.
-> 1316         compiled = compile_condition(condition, typemap, indexedcols, copycols)
   1317 
   1318         # Check that there actually are columns in the condition.

C:\Python27\lib\site-packages\tables\conditions.pyc in compile_condition(condition, typemap, indexedcols, copycols)
    392     except NotImplementedError, nie:
    393         # Try to make this Numexpr error less cryptic.
--> 394         raise _unsupported_operation_error(nie)
    395     params = varnames
    396 

NotImplementedError: unsupported operand types for *gt*: str, long

I'm sure the first error is an easy fix, but the error above seems to be more involved. BTW, the HDF5 table object itself seems to be correct, as printed below:

print store
<class 'pandas.io.pytables.HDFStore'>
File path: C:\Table.h5
/df     frame_table (typ->appendable,nrows->2742428)

Thanks in advance for any help,
-Gagi

Jeff

unread,
Dec 12, 2012, 2:14:32 PM12/12/12
to pyd...@googlegroups.com
Gagi,
 
ok...first part is fixed (I just pushed changes)....once merged you can try again (https://github.com/jreback/pandas/commit/6c6e19da4b6add0944455e8e4c55cfd36030fbd7)
it was not detecting the version correctly
 
2nd part is a bit more complicated.
 
can you provide a small sample of your df?
and post this:
 
store.handle

Gagi

unread,
Dec 12, 2012, 2:32:22 PM12/12/12
to pyd...@googlegroups.com
Here is the handle:

print store.handle
C:\TableAppend.h5 (File) ''
Last modif.: 'Wed Dec 12 11:09:43 2012'
Object Tree: 
/ (RootGroup) ''
/df (Group) ''
/df/table (Table(2742428,)) ''

I can't show the data itself, but I can work on creating a similarly huge data set to test the HDF5 reading and writing capability.

The df has 300 real-valued columns and 50 categorical ones. Most of the column names contain symbols such as "@"; hopefully this won't confuse anything. There are occasional missing values but the data is mostly dense. I'll try a synthetic data set and see how it behaves.

Thanks,
-Gagi

Jeff

unread,
Dec 12, 2012, 2:40:49 PM12/12/12
to pyd...@googlegroups.com
sorry...meant this (or, equivalently, outside of python:   ptdump -av file.hdf5):
 
store.handle.root.df.table
 
( I want to see what the table description looks like)
 
If you can do a pseudo data set that would be great
 
pls include the same columns/types that you have
 
doesn't need to be that long
 
I suspect it has to do with some of the mixed dtypes and/or index types
 
thanks
 


Gagi

unread,
Dec 12, 2012, 3:40:27 PM12/12/12
to pyd...@googlegroups.com
Hi Jeff,

Here is the HDF5 PyTables file dump. Note: I have commented out the highlighted sections that list all of my table's attribute names. If this file looks good, perhaps a native call from PyTables to index it might work?

Note:

I have not explicitly chosen an index column, so the first list of parameters had all 350 of the attribute (column) names.
The second list of float64 columns had 314 column names.
The last list of string64 columns had 36 column names.

From looking over the column (attribute) names, every one looked to be parsed & stored correctly. The next question is: do we actually need to run create_table_index prior to attempting selection from an appendable_frame HDF5 table?

Thanks,
-Gagi

/ (RootGroup) ''
  /._v_attrs (AttributeSet), 4 attributes:
   [CLASS := 'GROUP',
    PYTABLES_FORMAT_VERSION := '2.0',
    TITLE := '',
    VERSION := '1.0']
/df (Group) ''
  /df._v_attrs (AttributeSet), 8 attributes:
   [CLASS := 'GROUP',
    TITLE := '',
    VERSION := '1.0',
    index_cols := [(0, 'index')],
    non_index_axes := [(1, ['commented', 'out', 'list', 'of', 'attributes'])],
    pandas_type := 'frame_table',
    table_type := 'appendable_frame',
    values_cols := ['values_block_0', 'values_block_1']]
/df/table (Table(2742428,)) ''
  description := {
  "index": Int64Col(shape=(), dflt=0, pos=0),
  "values_block_0": Float64Col(shape=(314,), dflt=0.0, pos=1),
  "values_block_1": StringCol(itemsize=8, shape=(36,), dflt='', pos=2)}
  byteorder := 'little'
  chunkshape := (46,)
  /df/table._v_attrs (AttributeSet), 15 attributes:
   [CLASS := 'TABLE',
    FIELD_0_FILL := 0,
    FIELD_0_NAME := 'index',
    FIELD_1_FILL := 0.0,
    FIELD_1_NAME := 'values_block_0',
    FIELD_2_FILL := '',
    FIELD_2_NAME := 'values_block_1',
    NROWS := 2742428,
    TITLE := '',
    VERSION := '2.6',
    index_kind := 'integer',
    values_0_dtype := 'float64',
    values_0_kind := ['commented', 'out', 'list', 'of', 'attributes'],
    values_1_dtype := 'string64',
    values_1_kind := ['commented', 'out', 'list', 'of', 'attributes']]

Jeff

unread,
Dec 12, 2012, 4:01:04 PM12/12/12
to pyd...@googlegroups.com
ok...found the problem - it was not converting the indexing specification (e.g. index<1000) correctly
fixed up...
once this is done, then you should be able to select (and create the index) correctly.....
 
let me know how it works out and check out query speeds!
 
(also have pushed several memory fix issues.....)
 
 


Gagi

unread,
Dec 12, 2012, 4:14:35 PM12/12/12
to pyd...@googlegroups.com
Awesome work Jeff!

Will there be a build of these fixes on "http://pandas.pydata.org/pandas-build/dev/"? I rely on the installers because of firewall/proxy issues with GitHub, and I have yet to correctly set up building Python modules in my Windows environment. :] Once there is an installer up I will gladly test the system and report back performance figures.

Thanks,
-Gagi

Jeff

unread,
Dec 12, 2012, 4:16:16 PM12/12/12
to pyd...@googlegroups.com
git master will prob be updated in next day or 2.....i am sure Wes is planning a beta2....i think he wants the final on monday.....so should be soon...
let us know results!


Jeff Reback

unread,
Dec 13, 2012, 6:57:36 PM12/13/12
to pyd...@googlegroups.com
we merged in my changes - so this should all work (and I updated docs to better explain indexing)
I think that u said that u can't update from master (u r on windows)?
builds might be available soon



Gagi

unread,
Dec 13, 2012, 7:18:21 PM12/13/12
to pyd...@googlegroups.com
Just shoot me a reply when the builds are up I'll gladly test it out and get back to you guys.

Thanks,
-Gagi

Gagi

unread,
Dec 17, 2012, 2:45:51 PM12/17/12
to pyd...@googlegroups.com
I ran my code again on pandas 0.10.0 and PyTables 2.4.0 and got the following error. This time I can't write an appendable table to disk, whereas last time the table could not be selected from disk. The code and data have not changed.

ValueError: cannot convert float NaN to integer

Code:
import numpy as np
import pandas as pd
import os, re
from pandas.io.pytables import Term

def sort_nicely( l ):
    """Sort the given list in the way that humans expect."""
    convert = lambda text: int(text) if text.isdigit() else text
    alphanum_key = lambda key: [ convert(c) for c in re.split('([0-9]+)', key) ]
    l.sort( key=alphanum_key )

def appendCSV():
    """Appends many CSVs into one CSV file checking for matching & aligning columns."""
    path = r'C:\CSV'
    files = os.listdir(path)
    sort_nicely(files)
    df_append = pd.concat([pd.read_csv(os.path.join(path,name)) for name in files], ignore_index=True)
    return df_append

def loadTableAppend(df):
    store = pd.HDFStore(r'C:\TableAppend.h5',mode='w')
    store.append('df',df)
    store.close()
   
def loadTableRegular(df):
    store = pd.HDFStore(r'C:\TableRegular.h5',mode='w')
    store['df'] = df
    store.close()
   
def readTable():
    store = pd.HDFStore(r'C:\TableAppend.h5',mode='r')
    print store
    print store.handle.root.df.table

    df = store.select('df',[ Term('index<1000'), Term('columns', '=', ['X','Y']) ])
    return df

Output:
In [2]:
%time df = appendCSV()
CPU times: user 298.00 s, sys: 0.00 s, total: 298.00 s
Wall time: 298.00 s

In [3]:
%time loadTableRegular(df)
CPU times: user 74.65 s, sys: 0.00 s, total: 74.65 s
Wall time: 74.66 s

In [3]:
%time loadTableAppend(df)
---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
<ipython-input-3-fe79bd31133f> in <module>()
----> 1 get_ipython().magic(u'time loadTableAppend(df)')

C:\Python27\lib\site-packages\IPython\core\interactiveshell.pyc in magic(self, arg_s)
   2134         magic_name, _, magic_arg_s = arg_s.partition(' ')
   2135         magic_name = magic_name.lstrip(prefilter.ESC_MAGIC)
-> 2136         return self.run_line_magic(magic_name, magic_arg_s)
   2137 
   2138     #-------------------------------------------------------------------------

C:\Python27\lib\site-packages\IPython\core\interactiveshell.pyc in run_line_magic(self, magic_name, line)
   2060                 args.append(sys._getframe(stack_depth).f_locals)
   2061             with self.builtin_trap:
-> 2062                 result = fn(*args)
   2063             return result
   2064 

C:\Python27\lib\site-packages\IPython\core\magics\execution.pyc in time(self, parameter_s, user_locals)

C:\Python27\lib\site-packages\IPython\core\magic.pyc in <lambda>(f, *a, **k)
    189     # but it's overkill for just that one bit of state.
    190     def magic_deco(arg):
--> 191         call = lambda f, *a, **k: f(*a, **k)
    192 
    193         if callable(arg):

C:\Python27\lib\site-packages\IPython\core\magics\execution.pyc in time(self, parameter_s, user_locals)
    893         if mode=='eval':
    894             st = clock2()
--> 895             out = eval(code, glob, user_locals)
    896             end = clock2()
    897         else:

<timed eval> in <module>()

<ipython-input-1-3aa5c26906a5> in loadTableAppend(df)
     20 def loadTableAppend(df):
     21     store = pd.HDFStore(r'C:\TableAppend.h5',mode='w')
---> 22     store.append('df',df)
     23     store.close()
     24 

C:\Python27\lib\site-packages\pandas\io\pytables.pyc in append(self, key, value, **kwargs)
    426         data in the table, so be careful
    427         """
--> 428         self._write_to_group(key, value, table=True, append=True, **kwargs)
    429 
    430     def create_table_index(self, key, **kwargs):

C:\Python27\lib\site-packages\pandas\io\pytables.pyc in _write_to_group(self, key, value, table, append, comp, **kwargs)
    505             wrapper = lambda value: handler(group, value)
    506 
--> 507         wrapper(value)
    508         group._v_attrs.pandas_type = kind
    509         group._v_attrs.pandas_version = _version

C:\Python27\lib\site-packages\pandas\io\pytables.pyc in <lambda>(value)
    495             handler = self._get_handler(op='write', kind=kind)
    496             wrapper = lambda value: handler(group, value, append=append,
--> 497                                             comp=comp, **kwargs)
    498         else:
    499             if append:

C:\Python27\lib\site-packages\pandas\io\pytables.pyc in _write_frame_table(self, group, df, append, comp, axes, **kwargs)
    650             axes = [0]
    651         t = create_table(self, group, typ = 'appendable_frame')
--> 652         t.write(axes=axes, obj=df, append=append, compression=comp, **kwargs)
    653 
    654     _read_frame_table = _read_ndim_table

C:\Python27\lib\site-packages\pandas\io\pytables.pyc in write(self, axes, obj, append, compression, complevel, min_itemsize, **kwargs)
   1675 
   1676         # create the axes
-> 1677         self.create_axes(axes = axes, obj = obj, validate = append, min_itemsize = min_itemsize)
   1678 
   1679         if 'table' not in self.group:

C:\Python27\lib\site-packages\pandas\io\pytables.pyc in create_axes(self, axes, obj, validate, min_itemsize)
   1459                         itemsize = eci
   1460 
-> 1461                 atom  = _tables().StringCol(itemsize = itemsize, shape = shape)
   1462                 utype = 'S%s' % itemsize
   1463                 kind  = 'string'

C:\Python27\lib\site-packages\tables\description.pyc in __init__(self, *args, **kwargs)
    189                 pos = kwargs.pop('pos', None)
    190                 class_from_prefix = self._class_from_prefix
--> 191                 atombase.__init__(self, *args, **kwargs)
    192                 # The constructor of an abstract atom may have changed
    193                 # the class of `self` to something different of `NewCol`

C:\Python27\lib\site-packages\tables\atom.pyc in __init__(self, itemsize, shape, dflt)
    586 
    587     def __init__(self, itemsize, shape=(), dflt=_defvalue):
--> 588         if not hasattr(itemsize, '__int__') or int(itemsize) < 0:
    589             raise ValueError( "invalid item size for kind ``%s``: %r; "
    590                               "it must be a positive integer"

ValueError: cannot convert float NaN to integer

Thanks for any help,
-Gagi


Jeff Reback

unread,
Dec 17, 2012, 2:58:23 PM12/17/12
to pyd...@googlegroups.com
can you send this:

df.get_dtype_counts()
and
df._data.blocks

then do:

df = df.convert_objects()

and print again

and then try the appending again 



I think you have floats in an 'int' column...which is 

Jeff Reback

unread,
Dec 17, 2012, 3:52:25 PM12/17/12
to pyd...@googlegroups.com
gagi,

ok...I reproduced your problem (I think)

 df = DataFrame(data=[[1,2],[0,1],[1,2],[0,0]])
 df['mixed_column'] = 'testing'
 df.ix[2,'mixed_column'] = np.nan

then store.append('df', df) will give you the same error...

essentially you have a non-string in a string column.....
so on your string columns do this

df['colum'].fillna(value=nan_value,inplace=True)

where nan_value is a string
you shouldn't be allowed to have np.nan in there (or they should be represented somehow else)

I think this will allow your append to work

Gagi

unread,
Dec 17, 2012, 9:47:35 PM12/17/12
to pyd...@googlegroups.com, je...@reback.net
Hi Jeff,

Thanks for getting back to me, I believe your intuition is correct.

Indeed I have columns that contain both numeric and non-numeric values, for example: ['1.0', '0.0', 'F', 'G', 'W']. The CSV parser correctly sees this column as an 'object' type because of the non-parseable strings. I can see why these 'object' type columns cause issues when writing to an HDF5 table, as they do not have a fixed binary width. This brings up a good point: how should/does pandas store missing string/char data? Currently I believe it's represented as np.nan, but as you pointed out this makes string-type columns not directly writable to an HDF5 table. It seems that one has to manually convert all missing string column values to empty strings prior to writing to an HDF5 appendable table. Is there any way to convert NaN to empty strings on the fly for 'object' column types? What are the best practices for these string/char objects?

On to the testing:

I modified my loadTableAppend function to convert all the nan vales to empty strings for each object type column and it worked! :)

def loadTableAppend(df):
    store = pd.HDFStore(r'C:\Users\ddrmanac\Work\Python\TableAppend.h5',mode='w')
    for name in df.ix[:,df.dtypes == object].columns.tolist():
        df[name].fillna(value='',inplace=True)

    store.append('df',df)
    store.create_table_index('df')
    store.close()



%time loadTableAppend(df) 
CPU times: user 444.99 s, sys: 0.00 s, total: 444.99 s
Wall time: 444.99 s <-- Create 8.37 GB (8,987,793,386 bytes) HDF5 Table @ ~18.8 MB/sec

%time df2 = readTable() 
CPU times: user 3.79 s, sys: 0.00 s, total: 3.79 s
Wall time: 3.79 s <-- Read From Disk 1M row x 4 column slice. Pretty quick Reading back 1 object and 3 floats.

With several back-to-back reads you get improved performance - more proof that caching works. : )

%time df2 = readTable()
CPU times: user 1.99 s, sys: 0.00 s, total: 1.99 s
Wall time: 1.99 s
 

df2.shape
(100000, 4)


Now for some questions:

1. How can I set a specific column as an index in the HDF5 table so I can slice on something other than 'index'? For example, select all rows where column_bob = 'ABC'. I couldn't find any docstring on store.create_table_index('df').

2. Is there any guidance on converting missing (NaN) entries for non-numeric data to empty string/char objects prior to writing an appendable table? Could this be automated so that NaN entries are automatically converted to empty strings for string-type data?

Thanks for all your help,
-Gagi

Jeff Reback

unread,
Dec 17, 2012, 10:23:42 PM12/17/12
to pyd...@googlegroups.com, pyd...@googlegroups.com, je...@reback.net
Gagi

I am glad you were able to track down the issue in strings. 

your questions in reverse order

2) strings are stored as object with np.nan as the missing value - as u found out we cannot serialize this to hdf5 like that, so prob easiest will be to have automatic nan conversion (like the parser does - eg allow the user to specify the nan to/from string) - maybe empty string (though I suppose nan would work for some people)

this seems straightforward 

1) u can't index on columns - the data is stored in rows that are arrays - the search Terms allow u to search on the columns but in essence are just filtering what u get back (u can examine the data in its raw form by: ptdump -d file.h5)

however there are options depending on what u want to do

say u had a query that u do that u apriori know u only ever want a subset of the columns - it might make sense to split your frame up into smaller (width wise) tables - they could even be in separate files 

similar to this is making a panel out of the frame with your 'fixed' columns as items; then u can have 2 searchable dimensions, major_axis (eg index), and minor_axis (columns)

so the bottom line is that the nature of your storing (eg r u writing big files or piece by piece) and your queries determine how best to store your data
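eg a rough sketch of the width-wise split (made-up column names and a toy frame standing in for your 350-column one - not tested against your data):

import numpy as np
import pandas as pd

df = pd.DataFrame(np.random.rand(10, 4), columns=['X', 'Y', 'A', 'B'])

query_cols = ['X', 'Y']                                     # the columns u actually query on
rest_cols = [c for c in df.columns if c not in query_cols]

store = pd.HDFStore(r'C:\Split.h5', mode='w')
store.append('df_query', df[query_cols])   # small table, cheap to scan with Terms
store.append('df_rest', df[rest_cols])     # the wide remainder
store.close()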

as an aside - I would try appending your data in smaller chunks - unless u have a lot of main memory u could be paging on the writing

I just reread your question - you want to  essentially do on-disk queries like u would with a DataFrame? 

Jeff

Gagi

unread,
Dec 18, 2012, 1:55:18 AM12/18/12
to pyd...@googlegroups.com, je...@reback.net
Thanks for the explanation Jeff,

Yes, basically I want to write a big block of data to disk and select subsets of rows indexed by the values in one or more columns. From the PyTables documentation it looks like this indexing should be possible by calling createIndex() on the column name:

tbl.cols.column_name.createIndex()

Is this how the pandas HDF5 interface is intended to work: select a column to create an index, and then use that index to select subsets of rows?

Is it possible to have multiple index columns, e.g. if you have a DF with a hierarchical index? I thought the point of using PyTables was to enable this type of on-disk indexing/selection of parts of big files rather than loading everything into memory just to select a subset. I suppose it's possible to create multiple HDF5 files on disk split by the index column, but that limits you to one type of indexing rather than allowing dynamic subset selection.

Thanks,
-Gagi

Jeff Reback

unread,
Dec 18, 2012, 2:21:42 PM12/18/12
to pyd...@googlegroups.com
Gagi,

you have inspired me to fix this!

take the attached pytables.py file and replace the existing one in pandas/io/ (backup your existing one!)

Should be completely backwards compatible and will allow this:

arbitrary selection on data_columns
---------------------------------------------------
store.append('df', df, columns = ['B'], nan_rep = 'nan')
store.select('df', [ Term('B>0') ])
you can also do store.create_table_index('df', columns = ['B']) to create an index on this data_column

note...currently B cannot be a string column....working on fixing this (it will throw an exception)

nan_rep defaults to 'nan' (string) for any string columns (e.g. you don't need to do fillna at the top level)


try this and let me know

hth,

Jeff



pytables.py

Gagi

unread,
Dec 18, 2012, 9:07:43 PM12/18/12
to pyd...@googlegroups.com, je...@reback.net
Jeff this is wonderful! Thanks for the great work!

I used your new pytables.py file and it allowed me to index by a specific numeric column. I noticed that the new attribute you added is called 'columns'. Will this eventually support adding multiple indexed columns, e.g. columns=['X', 'Y']? I also noticed that the indexed column is added as a separate data block in the hdf5 file. It seems like we can have multiple index columns joined on as needed. As it stands with the code below I can index using the Terms 'index<1000000' or 'X=10', since both the 'index' and 'X' data columns are present in the hdf5 file.

Unfortunately my main indexing columns of interest are strings. : ) Hopefully indexing on string columns will be enabled soon.


Code Testing:
I modified my code with the following. Here I'm appending a df to an hdf5 file and additionally requesting to index on column X, while replacing any NaNs in string (object) columns with the empty string.

store.append('df', df, columns=['X'], nan_rep='') <-- Using pytables.py file Jeff provided.

CPU times: user 396.95 s, sys: 0.00 s, total: 396.95 s
Wall time: 396.95 s <-- Writing 8.36 GB (8,986,835,144 bytes) HDF5 table @ ~21 MB/Sec


I use the following code to slice into and select 1M rows & 2 columns of my 8.7GB on-disk hdf5 file by the table 'index':
df = store.select('df',[ Term('index<100000'), Term('columns', '=', ['X', 'Y']) ])

%time df2 = readTable()
print df2.shape
CPU times: user 41.09 s, sys: 0.00 s, total: 41.09 s
Wall time: 41.10 s <-- Get back 1M rows by 2 cols in 41 Sec not too bad. :)
(1000000, 2)

If I modify the code to pull all columns:
df = store.select('df',[ Term('index<100000') ])

CPU times: user 54.81 s, sys: 0.00 s, total: 54.81 s
Wall time: 54.82 s <-- Takes 10 sec more, and most of the difference is probably attributed to converting more columns from the table into the pandas DataFrame object.
(1000000, 350)


Now the cool part, I can index on my own column 'X':
df = store.select('df',[Term('X=9')])

On a fresh kernel reading from HDF5 for the first time:
CPU times: user 7.72 s, sys: 0.00 s, total: 7.72 s
Wall time: 7.72 s <-- Pulls 48.4k rows by 350 columns in 7.72 sec. Quite good indexing on column 'X'
(48405, 350)          Not sure if calling any other indexing command/options can help speed this up?

Selecting only two columns not much performance difference as Jeff stated:
df = store.select('df',[Term('X=9'), Term('columns', '=', ['X', 'Y'])])

CPU times: user 7.70 s, sys: 0.00 s, total: 7.70 s
Wall time: 7.70 s <-- Get back 48.4k rows by 2 columns in almost the same time. 
(48405, 2)            This confirms what Jeff was saying. There is no speedup in sub-selecting columns
                      since hdf5 table entries are stored as rows and columns are only filtered after all
                      data in selected rows are returned. This works for me since I usually want all cols.


So far this is looking amazing. I've been itching for an on-disk, high-speed, query-able binary file format, and the fact that this is being wrapped to interface with pandas objects makes it that much better. My final testing will be to see how I can index from multiple columns (including strings) and, possibly more importantly, how to store all my data as 16-bit to save lots of space/time. I simply don't need 64-bit float resolution for numbers with only 6 significant digits.

Thanks for the updates, looking forward to future enhancements.
-Gagi


Jeff Reback

unread,
Dec 18, 2012, 9:29:42 PM12/18/12
to pyd...@googlegroups.com
glad it works

I have pushed a new version that supports string indexing and multiple columns (I had to change some cython code, so giving u a new pytables.py is not going to work) - though I think Wes does builds of master pretty frequently (and creates windows binaries)

there are some doc updates here
https://github.com/pydata/pandas/pull/2561/files

also indexing on these columns should work (and should make queries much faster)

u prob want to index everything u r querying
eg
store.create_table_index('df', columns = ['index','X'])

API is still in flux - eg maybe if u specify columns in the append call then it should automatically index

I'll let u know when some binaries come out and u can try further

but give indexing a try and let me know

Gagi

unread,
Dec 19, 2012, 1:13:02 AM12/19/12
to pyd...@googlegroups.com
Hi Jeff,

Indeed selecting with the indexed columns did improve performance!

Selecting with indexed column 'X':

df = store.select('df',[Term('X=9')])
CPU times: user 5.50 s, sys: 0.00 s, total: 5.50 s
Wall time: 5.50 s <-- Versus 7.72 sec without calling store.create_table_index('df', columns = ['index','X'])
(48405, 2)

When selecting using the indexed 'index' column:
df = store.select('df',[Term('index<1000000')])

CPU times: user 40.59 s, sys: 0.00 s, total: 40.59 s
Wall time: 40.59 s <-- vs 54.82 Sec without indexing.
(1000000, 350)

Looks like indexing the columns improves selection performance but does not really impact how quickly the HDF5 file is written to disk or its overall size. I would suggest modifying the API to automatically index the selected index columns by default, possibly with an index=True attribute.

Let me know when the next build is up, I look forward to further test various types of indexing on more diverse data types.

One thing I did notice was that writing a big DF to an HDF5 table seems to spike memory usage. Since the table format allows appending, is there a way to serialize the appending in smaller chunks to keep memory usage down?

Thanks,
-Gagi

Jeff Reback

unread,
Dec 19, 2012, 6:25:10 AM12/19/12
to pyd...@googlegroups.com
Gagi

here are 2 more things to try:

1) instead of reading all of your csv files in and concatenating and then writing out - much better to append as you go

for f in files:
     store.append('df', read_csv(f), columns = [' ....)

this should be much faster and will avoid the paging issue - HDFStore uses at least 2x of the original memory size when writing -
could do some sort of chunking in memory
but as I said - much better to have the user chunk

you MAY need to specify the argument min_itemsize = { 'values' : min size of your strings }
see the notes section in pandas HDF5 docs

2) after your file is written try using ptrepack (comes with pytables)

u want something like

ptrepack --chunkshape=auto --complib=blosc --complevel=9 --propindexes in.h5 out.h5

will create a new h5 file that is compressed and that recreates the indices; chunkshape will be recomputed, which matters if u append

your file should be much smaller - but more important, I'll bet your queries will get faster (a bit counterintuitive, but basically the disk io is the blocking factor here rather than CPU - so compression helps!)

you might want to read pytables.org optimization tips as well 

Hth

Jeff Reback

unread,
Dec 19, 2012, 6:37:54 AM12/19/12
to pyd...@googlegroups.com
I just created a table naively like yours
300 float columns, 20 string (short strings though)
2.5m rows - about 6.7gb 

took about 160s

I am running with 24gb main memory (and it maxed it out) - 6 fast cores; I am storing via a pretty fast nfs which is on a disk array. I am pretty sure it writes at 50 mb/s+, so that shouldn't be a problem

what is your config?






Gagi

unread,
Dec 19, 2012, 1:11:09 PM12/19/12
to pyd...@googlegroups.com
Hi Jeff,

I'm running a 3.47GHz Core i7 X990, 6 physical / 12 virtual cores, 24GB Ram, 240GB and 600GB SSDs for storage.

The reason I read all my CSVs into memory when appending a pandas DF is that the data comes from possibly unstable sources, where entire columns may be sporadically missing between CSV files. To ensure I have the super-set of all columns (and missing values (NaNs) for columns that have missing data) I use the in-memory append. Currently the HDF5 table format does not support appending columns. I suppose it's possible to read/append the first two rows of each CSV as a first pass and then, using that column super-set as a base table, append the rest of the data to HDF5. I will need to do a bit more work to decide on the exact data types and widths prior to appending on the fly. I do like the idea of appending massive amounts of CSV data directly to HDF5 with a small memory footprint. This looks promising for caching big data directly from a database and enabling quick in-memory analytics.

Regarding the memory usage, I'm wondering if it's possible to convert the in-memory pandas DF into the PyTables table format one row chunk at a time rather than doubling memory usage. Does the conversion from a column-centric DF to a row-centric PyTables table require everything to be in memory at once, or is there a way to serialize to HDF5 row by row, keeping a smaller working set of converted data prior to writing the HDF5 file?

I'll give your suggestions a try and report back. I'm very interested in the HDF5 compression too. Is it possible to compress the data as it is being written to HDF5, or is it always a cleanup operation?

Thanks again for the great advice,
-Gagi

Jeff

unread,
Dec 19, 2012, 3:19:26 PM12/19/12
to pyd...@googlegroups.com
you can see the list of updated features here:

added index=True and chunksize on append....
this significantly fixes the writing memory issue...and doesn't seem to affect write speed at all...so win-win
also added docs on compression and more advanced queries (like you are doing).....

once this is merged....i think you can try a dev build...

I would reiterate that looping thru your files and computing your final column selection (e.g. sniffing them)....
then reindexing the frames before appending should be significantly faster
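in code, roughly (a sketch with made-up paths; dtype consistency between files is a separate issue):

import os
import pandas as pd

path = r'C:\CSV'
files = sorted(os.listdir(path))

# pass 1: sniff one row per file just to collect the superset of column names
all_columns = []
for name in files:
    for c in pd.read_csv(os.path.join(path, name), nrows=1).columns:
        if c not in all_columns:
            all_columns.append(c)

# pass 2: conform each frame to the full column set, then append
store = pd.HDFStore(r'C:\Sniffed.h5', mode='w')
for name in files:
    frame = pd.read_csv(os.path.join(path, name))
    store.append('df', frame.reindex(columns=all_columns), nan_rep='')
store.close()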

Jeff


Gagi

unread,
Dec 19, 2012, 4:30:19 PM12/19/12
to pyd...@googlegroups.com
Looks great Jeff! I'm excited for this upcoming dev build. :)

I ran a quick test and, as expected, my non-clean CSVs have columns that may or may not be present in each file. Also, depending on whether there are erroneous data entries in a column, it might be parsed differently between separate CSV reads, as string/object vs int/float. So there are two things going on: 1. missing columns between files, 2. possibly differently parsed columns between files.

I am inspired to find a way around this and I think it's possible with the file sniffing.

Questions:
Regarding the second question: is there a way to force a dtype on a column and ensure that any non-parseable entry is filled as NaN? E.g. below I want column 'a' to be parsed as np.float, but the 'Dog' entry messes the call up. Is there a way to tell read_csv to force parsing as np.float and, if an entry is not parseable as the specified dtype, fill it in as NaN?

data = 'a,b,c\n1.1,2,3\nDog,5,6\n7.7,8,9.5'
df = pd.read_csv(StringIO.StringIO(data), dtype={'a': np.float})
df.dtypes

---------------------------------------------------------------------------
AttributeError                            Traceback (most recent call last)
<ipython-input-12-cd8b6f868aec> in <module>()
      1 data = 'a,b,c\n1.1,2,3\nDog,5,6\n7.7,8,9.5'
----> 2 df = pd.read_csv(StringIO.StringIO(data), dtype={'a': np.float})
      3 df.dtypes

C:\Python27\lib\site-packages\pandas\io\parsers.pyc in parser_f(filepath_or_buffer, sep, dialect, compression, doublequote, escapechar, quotechar, quoting, skipinitialspace, lineterminator, header, index_col, names, prefix, skiprows, skipfooter, skip_footer, na_values, true_values, false_values, delimiter, converters, dtype, usecols, engine, delim_whitespace, as_recarray, na_filter, compact_ints, use_unsigned, low_memory, buffer_lines, warn_bad_lines, error_bad_lines, keep_default_na, thousands, comment, decimal, parse_dates, keep_date_col, dayfirst, date_parser, memory_map, nrows, iterator, chunksize, verbose, encoding, squeeze)
    389                     buffer_lines=buffer_lines)
    390 
--> 391         return _read(filepath_or_buffer, kwds)
    392 
    393     parser_f.__name__ = name

C:\Python27\lib\site-packages\pandas\io\parsers.pyc in _read(filepath_or_buffer, kwds)
    205         return parser
    206 
--> 207     return parser.read()
    208 
    209 _parser_defaults = {

C:\Python27\lib\site-packages\pandas\io\parsers.pyc in read(self, nrows)
    622             #     self._engine.set_error_bad_lines(False)
    623 
--> 624         ret = self._engine.read(nrows)
    625 
    626         if self.options.get('as_recarray'):

C:\Python27\lib\site-packages\pandas\io\parsers.pyc in read(self, nrows)
    943 
    944         try:
--> 945             data = self._reader.read(nrows)
    946         except StopIteration:
    947             if nrows is None:

C:\Python27\lib\site-packages\pandas\_parser.pyd in pandas._parser.TextReader.read (pandas\src\parser.c:5785)()

C:\Python27\lib\site-packages\pandas\_parser.pyd in pandas._parser.TextReader._read_low_memory (pandas\src\parser.c:6002)()

C:\Python27\lib\site-packages\pandas\_parser.pyd in pandas._parser.TextReader._read_rows (pandas\src\parser.c:6870)()

C:\Python27\lib\site-packages\pandas\_parser.pyd in pandas._parser.TextReader._convert_column_data (pandas\src\parser.c:7919)()

AttributeError: 'NoneType' object has no attribute 'dtype'


Regarding the first question: after I sniff the column headers I need to force a dtype on each column name to ensure parsing is consistent for each subset of columns in each CSV. I'm hoping I can set everything up on the sniffed rows and then pass the { column_name : dtype } dict to the subsequent CSV reads and get the job done. Before this can work I'll have to guarantee that the CSV parser can force a dtype on a column, which comes back to the dtype question above.
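One workaround I might try in the meantime is the converters argument (a rough sketch against the toy example above; I haven't tested it on the real data):

import numpy as np
import pandas as pd
import StringIO

def to_float(val):
    """Coerce a field to float, falling back to NaN for entries like 'Dog'."""
    try:
        return float(val)
    except (TypeError, ValueError):
        return np.nan

data = 'a,b,c\n1.1,2,3\nDog,5,6\n7.7,8,9.5'
df = pd.read_csv(StringIO.StringIO(data), converters={'a': to_float})
print df.dtypes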

Here is the naive CSV-append-to-HDF5 implementation (it was working quite fast until it hit the extra columns):

def AppendCSV2HDF5():
    """Appends many CSVs into one HDF5 file one by one minimizing memory usage."""

    path = r'C:\CSV'
    store = pd.HDFStore(r'C:\OnTheFlyAppend.h5', mode='a')
    files = os.listdir(path)
    sort_nicely(files)
    for name in files:
        store.append('df', pd.read_csv(os.path.join(path,name)), columns=['X'], nan_rep='')
    store.close()


...And the non-matching columns cause a table exception.

C:\Python27\lib\site-packages\pandas\io\pytables.pyc in validate_attr(self, append)
   1158             if (existing_fields is not None and
   1159                 existing_fields != list(self.values)):
-> 1160                 raise Exception("appended items do not match existing items"
   1161                                 " in table!")
   1162 

Exception: appended items do not match existing items in table!

Thanks,
-Gagi
Jeff

unread,
Dec 19, 2012, 5:11:22 PM12/19/12
to pyd...@googlegroups.com
I would ask this on the pandas list separately about read_csv, I really don't use it much
that said...the parser has lots of options to control things like that

as for your second question:

you must have the same fields in a subsequent append.

here is another strategy you might try, kind of off the top of my head

loop thru your files

    read as csv, clean up as needed
    append as a SEPARATE node to an h5 file (or a separate file) (so you are doing a single append of a smallish table)
    kind of like converting the csvs -> h5

then loop thru the keys() of the file (or the separate h5) files
and create a big one (after either keeping track of the columns in your first pass or sniffing the h5 files)

and this would be pretty quick as a) your columns are already conformed and b) finding the common columns is easy

# will get you the columns of a table without reading anything (a little hacky)...maybe I should make a method for this....
store.get_node('df')._v_attrs.non_index_axes[0][1]

then just loop:

     store.append(cleaned_up_store.select().reindex(columns = all_columns))
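pulling that together, a rough sketch (made-up paths/keys, going thru an intermediate h5 file):

import os
import pandas as pd

path = r'C:\CSV'

# pass 1: park each csv in its own node of a scratch file, collecting the column superset
scratch = pd.HDFStore(r'C:\scratch.h5', mode='w')
all_columns = []
for i, name in enumerate(sorted(os.listdir(path))):
    frame = pd.read_csv(os.path.join(path, name))
    for c in frame.columns:
        if c not in all_columns:
            all_columns.append(c)
    scratch.append('csv_%d' % i, frame)

# pass 2: conform each small table to the full column set and build the big one
big = pd.HDFStore(r'C:\big.h5', mode='w')
for key in scratch.keys():
    big.append('df', scratch.select(key).reindex(columns=all_columns), nan_rep='')
scratch.close()
big.close()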





Gagi

unread,
Dec 19, 2012, 8:07:33 PM12/19/12
to pyd...@googlegroups.com
Hi Jeff,

I'm trying to use HDFStore with the Python with statement to avoid unclosed HDF5 files.

with HDFStore('C:\store.h5') as store:
    # do something with store; the file should get closed even if something goes wrong

Seems like the __enter__ and __exit__ methods are not implemented for this yet. Any plans for this? I suppose I can always use try blocks and exceptions.

Thanks,
-Gagi

Jeff Reback

unread,
Dec 19, 2012, 8:26:51 PM12/19/12
to pyd...@googlegroups.com
it's documented in pytables.py

from pandas.io.pytables import get_store

with get_store('file.h5') as store:
    store.append('df', df)




Gagi

unread,
Dec 19, 2012, 9:28:48 PM12/19/12
to pyd...@googlegroups.com
Thanks Jeff,

CSV sniffing did the trick so far. Like you predicted, it was extremely memory efficient (python was hovering at only around 300MB of RAM used, peaking for a few seconds to 700MB), and the end result produced a very similar 8.37 GB (8,996,754,715 bytes) HDF5 file!

In pass one I append the first row of each CSV file to a data frame 'df_append' in memory. Then, when appending the full CSVs to HDF5, I set the read_csv dtype argument to df_append.dtypes.to_dict(). Fortunately the dtypes dict can contain extra unseen columns without causing any exception in read_csv. This way I ensure I have the super-set of all columns and have essentially serialized appending CSVs to HDF5 in an elegant, fast, and memory efficient way. Thanks for all the help! If you or anyone on the pandas team is ever in the SF bay area (specifically the peninsula) let me know - I owe you a beer/coffee/tea! : )

def AppendCSV2HDF5():
    """Appends many CSVs into one HDF5 file one by one minimizing memory usage."""
    path = r'C:\CSV'
    store = pd.HDFStore(r'C:\OnTheFlyAppend.h5', mode='a')
    files = os.listdir(path)
    sort_nicely(files)
    df_append = pd.concat([pd.read_csv(os.path.join(path,name), nrows=1) for name in files], ignore_index=True)
    for name in files:
        print name
        store.append('df', pd.read_csv(os.path.join(path,name), dtype=df_append.dtypes.to_dict()), columns=['X'], nan_rep='')

    store.create_table_index('df', columns = ['index', 'X'])
    store.close()
    return df_append



%time df3 = AppendCSV2HDF5()
...

CPU times: user 327.91 s, sys: 0.00 s, total: 327.91 s Wall time: 327.91 s <-- Sniffed then Appended all 38 CSV files to a 8.37 GB (8,996,754,715 bytes) HDF5 file @ ~25.5 MB/Sec

I did notice that the 'index' data column in this new HDF5 table is non-unique after this type of serialized append.
This makes sense because it's just a repeating index between 0~75k for each csv file appended in. It makes me wonder if we even
need this index column, or if there is a way to make it unique like the 'ignore_index=True' option I used with pd.concat.
Perhaps I'm missing an hdf5 flush somewhere that can create a unique 'index' column.

So part my HDF5 File looks like

absolute_index | index | values_block_0
756753 | 0 | 0.5
756754 | 1 | 0.6
756755 | 2 | 0.7
...
831754 | 0 | 0.8 <-- the 'index' column repeats itself but the absolute table index does not.

Thanks,
-Gagi

Jeff Reback

unread,
Dec 19, 2012, 9:52:49 PM12/19/12
to pyd...@googlegroups.com
hmm

I usually have a unique index on my frames, but it is not a requirement - depends on how u r selecting on them
Esp with data columns functionality I suppose it's not so important 

that said u can create one pretty easily
the table has an nrows attribute - u can use that as the base for the next set of data (u will be able to do: store.get_table('df').nrows)

I think u need a unique index column (have u been getting a print message in select that says: duplicate values????)
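eg something like this to make it unique (rough sketch - just tracking the offset yourself instead of reading nrows back from the table):

import os
import numpy as np
import pandas as pd

path = r'C:\CSV'
store = pd.HDFStore(r'C:\OnTheFlyAppend.h5', mode='w')
offset = 0
for name in sorted(os.listdir(path)):
    frame = pd.read_csv(os.path.join(path, name))
    # shift each chunk so the stored 'index' column is globally unique
    frame.index = np.arange(offset, offset + len(frame))
    offset += len(frame)
    store.append('df', frame, nan_rep='')
store.close()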


not sure if it's useful, but I'm going to add start/stop in select so u can limit the rows which u select from as well

also u may want to experiment (again, have to wait for new binary) with passing expectedrows=2.5m or whatever your total expected size is - this optimizes pytables (u should make a guess up front btw)

any luck with compression?
try blosc and lzo (which u prob have to install separately)

in theory should give u a big speed up in queries that r out of core (eg big data sets)

if ever in San Fran will let u know - I am in NYC - so don't get there too often


Jeff Reback

unread,
Dec 28, 2012, 3:57:01 PM12/28/12
to pyd...@googlegroups.com

these will be updated tonight I think 

pls check out build and let me know

lots of new features as well

Jeff



Gagi

unread,
Jan 2, 2013, 12:56:15 PM1/2/13
to pyd...@googlegroups.com
Hi Jeff,

It looks like the latest AMD64 Python 2.7 dev build is not appearing on the download site:

pandas-0.10.0.dev-6826609.win-amd64-py2.7.exe      12-Dec-2012 00:07             1974432
...
pandas-0.10.1.dev-c934e02.win-amd64-py2.6.exe      30-Dec-2012 22:45             2031018
pandas-0.10.1.dev-c934e02.win-amd64-py3.1.exe      30-Dec-2012 22:02             2013126
pandas-0.10.1.dev-c934e02.win-amd64-py3.2.exe      30-Dec-2012 22:02             2024535
pandas-0.10.1.dev-c934e02.win32-py2.6.exe          30-Dec-2012 22:25             1869961
pandas-0.10.1.dev-c934e02.win32-py3.1.exe          30-Dec-2012 22:36             1852440
pandas-0.10.1.dev-c934e02.win32-py3.2.exe          30-Dec-2012 22:41             1852436

Thanks,
-Gagi

Jeff Reback

unread,
Jan 6, 2013, 10:21:59 AM1/6/13
to pyd...@googlegroups.com
track this issue.
hopefully the build will be posted soon
when you do try, pls reread the docs ...lots of new features (some of which you suggested!)
 
some things to try:
 
append now gets passed index=True, which creates the indexes on all indexables and data_columns
you can try this with index=False to not create initially (and then call create_table_index to create later)
also you can pass index=list of columns to index (to index a specific list of indexables/data_columns)
 
chunksize=50000 is automatically passed, this controls how much data is written in each chunk, you can play with this number (smaller means smaller memory footprint)...
i don't think writing speed is affected (though smaller is slightly faster)....fwiw..your original case was effectively chunksize = the total number of rows that you are writing....(2.7m)!
 
expectedrows = the total number of rows you expect to write to this table....so pass something like 2.7m here....pytables will optimize your queries based on this (note - if you copy or ptrepack the table
then this will be the number of rows in the table....this is most useful when you append sequentially like you are doing)
 
min_itemsize = dict( a_column = an_int )....if you need to increase the string size for a column that you are appending to...this may not be necessary...it depends if you have variable string lengths IN A SINGLE column (and they are not in the same append)....
 
so all this keeps your data in a single table.....
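eg, pulling the options above into one call (a sketch with a stand-in frame and placeholder numbers):

import numpy as np
import pandas as pd

df = pd.DataFrame(np.random.rand(100000, 5), columns=list('ABCDE'))

store = pd.HDFStore(r'C:\Tuned.h5', mode='w')
store.append('df', df,
             index=False,              # skip index creation while appending
             chunksize=50000,          # rows written per chunk (controls memory footprint)
             expectedrows=2700000,     # rough final row count, lets pytables optimize
             data_columns=['A'],       # columns u want to query on
             nan_rep='')
store.create_table_index('df')         # build the indexes once, at the end
store.close()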
 
you can also try: append_to_multiple/select_as_multiple to create 2 or more tables, with the selector table being smaller and indexed on most/all of its columns, and another table to hold 'the data'...
the example should be clear....i'll bet this will really speed up your queries..
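something like (a rough sketch, hypothetical column names; df as in the sketch above):

from pandas.io.pytables import Term
import numpy as np
import pandas as pd

df = pd.DataFrame(np.random.rand(100000, 5), columns=list('ABCDE'))
store = pd.HDFStore(r'C:\Multi.h5', mode='w')

# split the frame across two tables; query the small one, pull matching rows from both
store.append_to_multiple(
    {'df_selector': ['A', 'B'],   # small, indexed table with the query columns
     'df_data': None},            # None -> everything else goes here
    df, selector='df_selector')

result = store.select_as_multiple(['df_selector', 'df_data'],
                                  where=[Term('A<0.5')],
                                  selector='df_selector')
store.close()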
 
 
good luck
 
Jeff

--
 
 

Gagi

unread,
Jan 6, 2013, 3:17:59 PM1/6/13
to pyd...@googlegroups.com, je...@reback.net
Thanks for the updates and detailed information Jeff! This is looking better every day!

I'm really looking forward to building my big data repository with this implementation. I'll give the append/select_as_multiple a shot and see how it works and report back.

Thanks,
-Gagi

Gagi

unread,
Jan 18, 2013, 5:59:27 PM1/18/13
to pyd...@googlegroups.com, je...@reback.net
Hi Jeff,

I have been trying to get indexing on data columns working but seem to be running into some issues. I have created a simple random float data set with 3 columns A, B, C. I would like to set all the columns as indexers and return all rows where 'A<0.5'. Using Term('index<100') works, but indexing on the other columns raises an exception.

In [4]: print pd.__version__
0.10.1.dev-6e2b6ea

In [5]: print np.__version__
1.6.2
 
PyTables 2.4.0

When I run this code:

import numpy as np
import pandas as pd
from pandas.io.pytables import Term

print pd.__version__
print np.__version__

def loadTableAppend(df):
    with pd.io.pytables.get_store(r'C:\Append.h5',mode='w') as store:
        store.append('df', df, complevel=9, complib='blosc')
        store.create_table_index('df', columns=True)
       
def readTableAppend():
    with pd.io.pytables.get_store(r'C:\Append.h5',mode='r') as store:
        return store['df']

def indexTableAppend():
    with pd.io.pytables.get_store(r'C:\Append.h5',mode='r') as store:
        #df = store.select('df',[Term('index<10')])
        df = store.select('df', [Term('A<0.5')])
    return df

NUM_ROWS = 1000
NUM_COLS = 3
#columns=map(str,np.arange(NUM_COLS).tolist())
%time df = pd.DataFrame(np.random.random((NUM_ROWS,NUM_COLS)), dtype=np.float64, columns=['A','B','C'])
%time loadTableAppend(df)
%time df2 = readTableAppend()
%time df2 = df2.astype(np.float32)
%time df3 = indexTableAppend()


I get this exception:


C:\Python27\lib\site-packages\pandas\io\pytables.pyc in generate(self, where)
   3023
   3024         queryables = self.table.queryables()
-> 3025         return [Term(c, queryables=queryables) for c in where]
   3026
   3027     def select(self):

C:\Python27\lib\site-packages\pandas\io\pytables.pyc in __init__(self, field, op, value, queryables)
   2873
   2874         if len(self.q):
-> 2875             self.eval()
   2876
   2877     def __str__(self):

C:\Python27\lib\site-packages\pandas\io\pytables.pyc in eval(self)
   2899
   2900         if not self.is_valid:
-> 2901             raise Exception("query term is not valid [%s]" % str(self))
   2902
   2903         # convert values if we are in the table

Exception: query term is not valid [field->A,op-><,value->['0.5']] <-- This actually looks like it correctly parsed the term.


Thanks in advance for any help,
-Gagi

Jeff

unread,
Jan 18, 2013, 6:08:52 PM1/18/13
to pyd...@googlegroups.com, je...@reback.net
you are very close....i think I used the keyword columns at one time...(now it's data_columns) as a keyword to append....try this:
 
 
store.append('df', df, complevel=9, complib='blosc', data_columns = ['A','B'])
# also index will be created automatically on the main df index, and A, B,
# you can control by passing index = ['A'], in practice you would probably pass index=False (so it wouldn't create as you go)...not sure if this actually makes any difference though
# then create the index after
 
the columns keyword is only used in select (and in create_table_index, where your usage is correct - though the index is already created)

Gagi

unread,
Jan 18, 2013, 9:10:29 PM1/18/13
to pyd...@googlegroups.com, je...@reback.net
Thanks Jeff!

That did the trick! I ran some tests to confirm. I created a 1M row x 500 col 32-bit float HDF5 table and indexed 10 of the columns. Then I looped through the indexer columns, selected about 15% of the rows (effectively at random) by each index column, and compared the resulting table to the same selection done in memory.

Everything seemed to work fine with compression and even with 32-bit floats stored in the HDF5 file. Unfortunately the select method returns the data as 64-bit floats when reading from hdf5; perhaps there is a way around this. Also, is there any fundamental limit to indexing on date-time cols? I thought they were just stored as 64-bit ints.

Regardless this code seems to work great! Thanks again for all the hard work!

Code:

import numpy as np
import pandas as pd
from pandas.io.pytables import Term

NUM_ROWS = 1000000
NUM_COLS = 500
col_names = ['A'+num for num in map(str,np.arange(NUM_COLS).tolist())]
index_cols = col_names[:10]

def loadTableAppend(df):
    with pd.io.pytables.get_store(r'C:\32BitAppend.h5',mode='w') as store:
        store.append('df', df, complevel=9, complib='blosc', data_columns=index_cols)

def readTableAppend():
    with pd.io.pytables.get_store(r'C:\32BitAppend.h5',mode='r') as store:
        return store['df']

def indexTableAppend(cols, val='0.15', df=None, check_equal=False):
    with pd.io.pytables.get_store(r'C:\32BitAppend.h5',mode='r') as store:
        for col in cols:
            df2 = store.select('df', [Term(col + '<' + val)])
            if check_equal is True:
                eq = np.array_equal(df2, df[df[col]<0.15])
                print 'Correct Index of Column %s is %s!' % (col, eq)
        return df2

%time df = pd.DataFrame(np.random.random((NUM_ROWS,NUM_COLS)), dtype=np.float32, columns=col_names)
%time loadTableAppend(df)
#%time readTableAppend()
%time df3 = indexTableAppend(index_cols, df=df, check_equal=True)
%time df3 = indexTableAppend(['A5'], val='0.15')
%time df3 = indexTableAppend(['index'], val='150000')

Output:
In [11]: %time df = pd.DataFrame(np.random.random((NUM_ROWS,NUM_COLS)), dtype=np.float32, columns=col_names)
CPU times: user 9.52 s, sys: 0.00 s, total: 9.52 s
Wall time: 9.52 s

In [12]: %time loadTableAppend(df)
CPU times: user 112.26 s, sys: 0.00 s, total: 112.26 s
Wall time: 112.27 s <-- Write HDF5 Table to Disk is about 3X as fast if Index=False

In [13]: #%time readTableAppend()
   ....: %time df3 = indexTableAppend(index_cols, df=df, check_equal=True)
   ....:
Correct Index of Column A0 is True!
Correct Index of Column A1 is True!
Correct Index of Column A2 is True!
Correct Index of Column A3 is True!
Correct Index of Column A4 is True!
Correct Index of Column A5 is True!
Correct Index of Column A6 is True!
Correct Index of Column A7 is True!
Correct Index of Column A8 is True!
Correct Index of Column A9 is True!
CPU times: user 132.59 s, sys: 0.00 s, total: 132.59 s
Wall time: 132.60 s <-- Indexing on all columns from disk produces same result as in-memory indexing. :)

In [14]: %time df3 = indexTableAppend(['A5'], val='0.15')
CPU times: user 12.46 s, sys: 0.00 s, total: 12.46 s
Wall time: 12.46 s <-- Selecting random 150k rows takes 12.5 sec, not so bad ~36MB/sec

In [15]: %time df3 = indexTableAppend(['index'], val='150000')
CPU times: user 3.96 s, sys: 0.00 s, total: 3.96 s
Wall time: 3.96 s <-- Slicing contiguous first 150k rows takes 4 sec, 3x faster ~112MB/sec!

-Gagi

Jeff Reback

unread,
Jan 18, 2013, 9:32:21 PM1/18/13
to pyd...@googlegroups.com
ok....good news and bad news
 
bad news is that trying to store float32 first converted it to float64, then shoved it into float32 columns in the table (not sure what effect this
actually has....maybe nothing, maybe just made the file bigger, don't really know)
 
selecting was always converting to float64 (a bug in how it was recording what was going on)...
 
so good news is that I fixed this (I know you are windows, but you can look at https://github.com/pydata/pandas/pull/2675 if interested)
 
other bad news is that for pretty deep reasons can't get int32 to work easily
 
but float32 should help you I think
 
one caveat - do not mix float32 and float64 AT ALL or it will convert all to float64....not much I can do about this right now
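eg a quick sanity check before appending (assumes an all-numeric frame like your test case):

import numpy as np
import pandas as pd

df = pd.DataFrame(np.random.rand(5, 3), columns=['A', 'B', 'C'])  # defaults to float64
print df.get_dtype_counts()           # float64: 3
df = df.astype(np.float32)            # cast everything; only safe for an all-numeric frame
print df.get_dtype_counts()           # float32: 3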
 
Jeff

Gagi

unread,
Jan 25, 2013, 6:42:44 PM1/25/13
to pyd...@googlegroups.com, je...@reback.net
Hi Jeff,

Is there a way to store different-width string columns in an HDF5 table? E.g. can we set a different min_itemsize for each column?

min_itemsize = { 'Col_A' : 50, 'Col_B' : 25, 'Col_C' : 10 }

Thanks,
-Gagi

Gagi

unread,
Jan 25, 2013, 10:40:25 PM1/25/13
to pyd...@googlegroups.com, je...@reback.net
Hi Jeff,

I ran an example to test this and it appears that the string types are still stored in one fixed-width block despite explicitly setting a different column width in min_itemsize. I have 3 columns: a 3-char string, a 1-char string, and a float. After appending to an HDF5 table, it looks like (in the HDF5 table viewer) there is only one block of 3-char-wide strings holding both the 1-char and 3-char columns.

-Gagi


Code:

import numpy as np
import pandas as pd
import StringIO

data = 'a,b,c\nAAA,T,3.3\nBBB,F,6.6\nCCC,F,9.9'
df = pd.read_csv(StringIO.StringIO(data))

with pd.io.pytables.get_store(r'C:\MixedType.h5', mode='w') as store:
    store.append('df', df, min_itemsize={'a':3, 'b':1}, nan_rep='')
    store.root.df.table

Gagi

unread,
Jan 28, 2013, 5:25:17 PM1/28/13
to pyd...@googlegroups.com
Hi Jeff,

I'm trying to store this simple data frame to an HDF5 table. I'm getting an error when attempting to store the datetime column.


import numpy as np
import pandas as pd
import StringIO
from datetime import datetime

data = 'a,b,c,d\nAAA,T,3.3,2013-01-01 01:02:03\n,,6.6,\nCCC,F,,2013-01-02 03:02:03'
data2 = 'a,b,c,d\nAAA,T\n,,6.6\nCCC,F,'
dt_format = '%Y-%m-%d %H:%M:%S'
dt_parse = lambda dt: datetime.strptime(dt, dt_format) if dt is not np.nan else dt

df = pd.read_csv(StringIO.StringIO(data), dtype={'a':np.dtype('S3'), 'b':np.dtype('S1'), 'c':np.dtype('f4')}, parse_dates=['d'], date_parser=dt_parse)


with pd.io.pytables.get_store(r'C:\MixedType.h5', mode='w') as store:
    store.append('df', df, min_itemsize={'a':3, 'b':1}, nan_rep='')
    print store.root.df.table

-------------------------------------------------------------------------------------
Exception                                 Traceback (most recent call last)
<ipython-input-10-9eebda890adc> in <module>()
      1 with pd.io.pytables.get_store(r'C:\MixedType.h5', mode='w') as store:
----> 2     store.append('df', df, min_itemsize={'a':3, 'b':1}, nan_rep='')
      3     print store.root.df.table

C:\Python27\lib\site-packages\pandas\io\pytables.pyc in append(self, key, value, columns, **kwargs)
    530             raise Exception("columns is not a supported keyword in append, try data_columns")
    531
--> 532         self._write_to_group(key, value, table=True, append=True, **kwargs)
    533
    534     def append_to_multiple(self, d, value, selector, data_columns=None, axes=None, **kwargs):

C:\Python27\lib\site-packages\pandas\io\pytables.pyc in _write_to_group(self, key, value, index, table, append, complib, **kwargs)
    786             raise ValueError('Compression not supported on non-table')
    787
--> 788         s.write(obj = value, append=append, complib=complib, **kwargs)
    789         if s.is_table and index:
    790             s.create_index(columns = index)

C:\Python27\lib\site-packages\pandas\io\pytables.pyc in write(self, obj, axes, append, complib, complevel, fletcher32, min_itemsize, chunksize, expectedrows, **kwargs)
   2489         # create the axes
   2490         self.create_axes(axes=axes, obj=obj, validate=append,
-> 2491                          min_itemsize=min_itemsize, **kwargs)
   2492
   2493         if not self.is_exists:

C:\Python27\lib\site-packages\pandas\io\pytables.pyc in create_axes(self, axes, obj, validate, nan_rep, data_columns, min_itemsize, **kwargs)
   2252                 raise
   2253             except (Exception), detail:
-> 2254                 raise Exception("cannot find the correct atom type -> [dtype->%s,items->%s] %s" % (b.dtype.name, b.items, str(detail)))
   2255             j += 1
   2256

Exception: cannot find the correct atom type -> [dtype->object,items->Index([a, b, d], dtype=object)] object of type 'datetime.datetime' has no len()

Here is how the dtypes look:
In [11]: df.dtypes
Out[11]: a     object
b     object
c    float64
d     object

The 'd' column is actually a date time with one missing value:
In [12]: df['d'][0]
Out[12]: datetime.datetime(2013, 1, 1, 1, 2, 3)

In [13]: df
Out[13]:     
         a  b    c                    d
0  AAA  T  3.3  2013-01-01 01:02:03
1          6.6                  NaN
2  CCC  F  NaN  2013-01-02 03:02:03

Thanks,
-Gagi

Jeff

unread,
Jan 28, 2013, 6:00:57 PM1/28/13
to pyd...@googlegroups.com
try this to force the dtype (i don't believe you can do this with read_csv, do it after)

df['d'] = pd.Series(df['d'].values,dtype='M8[ns]')

the dtype of the datetime column must be datetime64[ns] for the storing to work; yours
is object. I think this is a bug that is fixed after 0.10.1 (in 0.10.2 or 0.11)
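ie applied to your frame above, something like this (a sketch - I haven't run it against 0.10.1):

import numpy as np
import pandas as pd
import StringIO
from datetime import datetime

data = 'a,b,c,d\nAAA,T,3.3,2013-01-01 01:02:03\n,,6.6,\nCCC,F,,2013-01-02 03:02:03'
dt_parse = lambda dt: datetime.strptime(dt, '%Y-%m-%d %H:%M:%S') if dt is not np.nan else dt
df = pd.read_csv(StringIO.StringIO(data),
                 dtype={'a': np.dtype('S3'), 'b': np.dtype('S1'), 'c': np.dtype('f4')},
                 parse_dates=['d'], date_parser=dt_parse)

df['d'] = pd.Series(df['d'].values, dtype='M8[ns]')   # force datetime64[ns]
print df.dtypes

with pd.io.pytables.get_store(r'C:\MixedType.h5', mode='w') as store:
    store.append('df', df, min_itemsize={'a': 3, 'b': 1}, nan_rep='')
    print store.root.df.table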