pandas groupby aggregate histogram bin columns

1,629 views
Skip to first unread message

DavidT

unread,
Mar 6, 2016, 10:26:04 PM3/6/16
to PyData
I'm trying to take a dataframe of format:

  timestamp,type,value

Group the data by minutes and type and bucket the values for each into histogram bin labeled columns containing the count of values for that bin, minute and type.

Here is code that attempts to do this, but it's not quite right:

import datetime
import random
import pandas as pd
import numpy as np

start_time = datetime.datetime.now()
type_list = ['a', 'b', 'c', 'd']
ten_minute_time_frame = 10 * 60 * 1000000
hist_bins = [i*20 for i in range(5)]

data = []
for i in range(500):
    d = {
        'timestamp': start_time + datetime.timedelta(microseconds=random.randint(0, ten_minute_time_frame)),
        'type': random.choice(type_list),
        'value': random.randint(0, 100),
    }
    data.append(d)

df = pd.DataFrame(data)
df['minute_stamp'] = map(lambda x: x.to_period(freq='T'), df['timestamp'])
print df.groupby(['minute_stamp', 'type']).agg(lambda x: np.histogram(x, bins=hist_bins))

What I'm looking for instead is output like this:

minute_stamp,type,20,40,60,80,100
2016-03-06 15:01,a,3,4,2,6,1
2016-03-06 15:02,a,1,7,4,5,4
2016-03-06 15:03,a,4,3,3,2,4

Can someone help?

Thanks,
David

Joshua Baboo

unread,
Mar 7, 2016, 10:02:02 AM3/7/16
to PyData
slightly modified bins & grouping:
    hist_bins = [i*20 for i in range(6)]
    df['bins'] = pd.cut(df['value'], bins=hist_bins)
    df_bin = df.groupby(['minute_stamp', 'type', 'bins'], as_index=False).agg({'value': pd.Series.count})
    print df_bin.pivot_table(index=['minute_stamp', 'type'], columns=['bins'], values='value').reset_index()

output:
bins     minute_stamp type  (0, 20]  (20, 40]  (40, 60]  (60, 80]  (80, 100]
0    2016-03-07 20:25    a      NaN       NaN         2       NaN        NaN
1    2016-03-07 20:25    b      NaN       NaN         1         1        NaN
2    2016-03-07 20:25    c      NaN         1       NaN         1          3
3    2016-03-07 20:25    d        1         1         1         1          1
4    2016-03-07 20:26    a        1         1         3         2          4
5    2016-03-07 20:26    b        3         3         3         2          3
6    2016-03-07 20:26    c      NaN       NaN         3         3          2
7    2016-03-07 20:26    d      NaN         5         3         3        NaN

see if this serves your requirement

Regards,
Joshua

DavidT

unread,
Mar 7, 2016, 12:28:21 PM3/7/16
to PyData
This is great.  Thanks for the help!

Here is exactly where I ended up:

import datetime
import random
import re
import pandas as pd
import numpy as np

start_time = datetime.datetime.now()
type_list = ['a', 'b', 'c', 'd']
ten_minute_time_frame = 10 * 60 * 1000000
hist_bins = [i*20 for i in range(5)]
column_name_re = re.compile('\(\d+\,\ \d+\]')


def translate_column_names(c_name):
    if column_name_re.match(c_name):
        return c_name.split(',')[-1].strip(']').strip()
    return c_name


data = []
for i in range(500):
    d = {
        'timestamp': start_time + datetime.timedelta(microseconds=random.randint(0, ten_minute_time_frame)),
        'type': random.choice(type_list),
        'value': random.randint(0, 100),
    }
    data.append(d)

df = pd.DataFrame(data)
df['minute_stamp'] = map(lambda x: x.to_period(freq='T'), df['timestamp'])
df['bins'] = pd.cut(df['value'], bins=hist_bins)
df_bin = df.groupby(['minute_stamp', 'type', 'bins'], as_index=False).agg({'value': pd.Series.count})
df_bin = df_bin.pivot_table(index=['minute_stamp', 'type'], columns=['bins'], values='value', fill_value=0).reset_index()
df_bin.columns = map(translate_column_names, list(df_bin.columns.values))
print df_bin
Reply all
Reply to author
Forward
0 new messages