pandas groupby aggregate histogram bin columns

DavidT

unread,

Mar 6, 2016, 10:26:04 PM3/6/16

to PyData

I'm trying to take a dataframe of format:

timestamp,type,value

Group the data by minutes and type and bucket the values for each into histogram bin labeled columns containing the count of values for that bin, minute and type.

Here is code that attempts to do this, but it's not quite right:

import datetime
import random
import pandas as pd
import numpy as np

start_time = datetime.datetime.now()
type_list = ['a', 'b', 'c', 'd']
ten_minute_time_frame = 10 * 60 * 1000000
hist_bins = [i*20 for i in range(5)]

data = []
for i in range(500):
    d = {
        'timestamp': start_time + datetime.timedelta(microseconds=random.randint(0, ten_minute_time_frame)),
        'type': random.choice(type_list),
        'value': random.randint(0, 100),
    }
    data.append(d)

df = pd.DataFrame(data)
df['minute_stamp'] = map(lambda x: x.to_period(freq='T'), df['timestamp'])
print df.groupby(['minute_stamp', 'type']).agg(lambda x: np.histogram(x, bins=hist_bins))

What I'm looking for instead is output like this:

minute_stamp,type,20,40,60,80,100
2016-03-06 15:01,a,3,4,2,6,1
2016-03-06 15:02,a,1,7,4,5,42016-03-06 15:03,a,4,3,3,2,4

Can someone help?

Thanks,

David

Joshua Baboo

unread,

Mar 7, 2016, 10:02:02 AM3/7/16

to PyData

slightly modified bins & grouping:

    hist_bins = [i*20 for i in range(6)]
    df['bins'] = pd.cut(df['value'], bins=hist_bins)
    df_bin = df.groupby(['minute_stamp', 'type', 'bins'], as_index=False).agg({'value': pd.Series.count})
    print df_bin.pivot_table(index=['minute_stamp', 'type'], columns=['bins'], values='value').reset_index()

output:

bins     minute_stamp type  (0, 20]  (20, 40]  (40, 60]  (60, 80]  (80, 100]
0    2016-03-07 20:25    a      NaN       NaN         2       NaN        NaN
1    2016-03-07 20:25    b      NaN       NaN         1         1        NaN
2    2016-03-07 20:25    c      NaN         1       NaN         1          3
3    2016-03-07 20:25    d        1         1         1         1          1
4    2016-03-07 20:26    a        1         1         3         2          4
5    2016-03-07 20:26    b        3         3         3         2          3
6    2016-03-07 20:26    c      NaN       NaN         3         3          2
7    2016-03-07 20:26    d      NaN         5         3         3        NaN

see if this serves your requirement

Regards,

Joshua

DavidT

unread,

Mar 7, 2016, 12:28:21 PM3/7/16

to PyData

This is great. Thanks for the help!

Here is exactly where I ended up:

import datetime
import random
import re

import pandas as pd
import numpy as np

start_time = datetime.datetime.now()
type_list = ['a', 'b', 'c', 'd']
ten_minute_time_frame = 10 * 60 * 1000000
hist_bins = [i*20 for i in range(5)]

column_name_re = re.compile('\(\d+\,\ \d+\]')


def translate_column_names(c_name):
    if column_name_re.match(c_name):
        return c_name.split(',')[-1].strip(']').strip()
    return c_name



data = []
for i in range(500):
    d = {
        'timestamp': start_time + datetime.timedelta(microseconds=random.randint(0, ten_minute_time_frame)),
        'type': random.choice(type_list),
        'value': random.randint(0, 100),
    }
    data.append(d)

df = pd.DataFrame(data)
df['minute_stamp'] = map(lambda x: x.to_period(freq='T'), df['timestamp'])

df['bins'] = pd.cut(df['value'], bins=hist_bins)
df_bin = df.groupby(['minute_stamp', 'type', 'bins'], as_index=False).agg({'value': pd.Series.count})

df_bin = df_bin.pivot_table(index=['minute_stamp', 'type'], columns=['bins'], values='value', fill_value=0).reset_index()
df_bin.columns = map(translate_column_names, list(df_bin.columns.values))
print df_bin

Reply all

Reply to author

Forward