import datetimeimport randomimport pandas as pdimport numpy as np
start_time = datetime.datetime.now()type_list = ['a', 'b', 'c', 'd']ten_minute_time_frame = 10 * 60 * 1000000hist_bins = [i*20 for i in range(5)]
data = []for i in range(500): d = { 'timestamp': start_time + datetime.timedelta(microseconds=random.randint(0, ten_minute_time_frame)), 'type': random.choice(type_list), 'value': random.randint(0, 100), } data.append(d)
df = pd.DataFrame(data)df['minute_stamp'] = map(lambda x: x.to_period(freq='T'), df['timestamp'])print df.groupby(['minute_stamp', 'type']).agg(lambda x: np.histogram(x, bins=hist_bins))minute_stamp,type,20,40,60,80,1002016-03-06 15:01,a,3,4,2,6,12016-03-06 15:02,a,1,7,4,5,42016-03-06 15:03,a,4,3,3,2,4 hist_bins = [i*20 for i in range(6)] df['bins'] = pd.cut(df['value'], bins=hist_bins) df_bin = df.groupby(['minute_stamp', 'type', 'bins'], as_index=False).agg({'value': pd.Series.count}) print df_bin.pivot_table(index=['minute_stamp', 'type'], columns=['bins'], values='value').reset_index()
bins minute_stamp type (0, 20] (20, 40] (40, 60] (60, 80] (80, 100]
0 2016-03-07 20:25 a NaN NaN 2 NaN NaN
1 2016-03-07 20:25 b NaN NaN 1 1 NaN
2 2016-03-07 20:25 c NaN 1 NaN 1 3
3 2016-03-07 20:25 d 1 1 1 1 1
4 2016-03-07 20:26 a 1 1 3 2 4
5 2016-03-07 20:26 b 3 3 3 2 3
6 2016-03-07 20:26 c NaN NaN 3 3 2
7 2016-03-07 20:26 d NaN 5 3 3 NaN
import datetimeimport randomimport reimport pandas as pdimport numpy as np
start_time = datetime.datetime.now()type_list = ['a', 'b', 'c', 'd']ten_minute_time_frame = 10 * 60 * 1000000hist_bins = [i*20 for i in range(5)]column_name_re = re.compile('\(\d+\,\ \d+\]')
def translate_column_names(c_name): if column_name_re.match(c_name): return c_name.split(',')[-1].strip(']').strip() return c_name
data = []for i in range(500): d = { 'timestamp': start_time + datetime.timedelta(microseconds=random.randint(0, ten_minute_time_frame)), 'type': random.choice(type_list), 'value': random.randint(0, 100), } data.append(d)
df = pd.DataFrame(data)df['minute_stamp'] = map(lambda x: x.to_period(freq='T'), df['timestamp'])df['bins'] = pd.cut(df['value'], bins=hist_bins)df_bin = df.groupby(['minute_stamp', 'type', 'bins'], as_index=False).agg({'value': pd.Series.count})df_bin = df_bin.pivot_table(index=['minute_stamp', 'type'], columns=['bins'], values='value', fill_value=0).reset_index()df_bin.columns = map(translate_column_names, list(df_bin.columns.values))print df_bin