In [10]: df = pd.DataFrame({'key1' : ['a', 'a', 'b', 'b', 'a'],....: 'key2' : ['one', 'two', 'one', 'two', 'one'],....: 'data1' : np.random.randn(5),....: 'data2' : np.random.randn(5)})In [11]: df
Out[11]: data1 data2 key1 key2
0 -0.204708 1.393406 a one
1 0.478943 0.092908 a two
2 -0.519439 0.281746 b one
3 -0.555730 0.769023 b two
4 1.965781 1.246435 a one
In [12]: grouped = df['data1'].groupby(df['key1'])In [13]: grouped
Out[13]: &roupby.SeriesGroupBy object at 0x7faa31537390>
In [14]: an()
Out[14]:
key1
a 0.746672
b -0.537585
Name: data1, dtype: float64
In [15]: means = df['data1'].groupby([df['key1'], df['key2']]).mean()In [16]: means
Out[16]:
key1 key2
a one 0.880536two 0.478943
b one -0.519439two -0.555730
Name: data1, dtype: float64
In [17]: means.unstack()
Out[17]:
key2 one two
key1
a 0.880536 0.478943
b -0.519439 -0.555730
In [18]: states = np.array(['Ohio', 'California', 'California', 'Ohio', 'Ohio'])In [19]: years = np.array([2005, 2005, 2006, 2005, 2006])In [20]: df['data1'].groupby([states, years]).mean()
Out[20]:
California 2005 0.4789432006 -0.519439
Ohio 2005 -0.3802192006 1.965781
Name: data1, dtype: float64
In [21]: df.groupby('key1').mean()
Out[21]: data1 data2
key1
a 0.746672 0.910916
b -0.537585 0.525384In [22]: df.groupby(['key1', 'key2']).mean()
Out[22]: data1 data2
key1 key2
a one 0.880536 1.319920two 0.478943 0.092908
b one -0.519439 0.281746two -0.555730 0.769023
In [23]: df.groupby(['key1', 'key2']).size()
Out[23]:
key1 key2
a one 2two 1
b one 1two 1
dtype: int64
In [24]: for name, group upby('key1'):print(name)print(group)
adata1 data2 key1 key2
0 -0.204708 1.393406 a one
1 0.478943 0.092908 a two
4 1.965781 1.246435 a one
bdata1 data2 key1 key2
2 -0.519439 0.281746 b one
3 -0.555730 0.769023 b two
In [25]:for (k1, k2), group upby(['key1', 'key2']):print((k1, k2))print(group)
('a', 'one')data1 data2 key1 key2
0 -0.204708 1.393406 a one
4 1.965781 1.246435 a one
('a', 'two')data1 data2 key1 key2
1 0.478943 0.092908 a two
('b', 'one')data1 data2 key1 key2
2 -0.519439 0.281746 b one
('b', 'two')data1 data2 key1 key2
3 -0.55573 0.769023 b two
In [26]: pieces = dict(upby('key1')))In [27]: pieces['b']
Out[27]: data1 data2 key1 key2
2 -0.519439 0.281746 b one
3 -0.555730 0.769023 b two
In [28]: df.dtypes
Out[28]:
data1 float64
data2 float64
key1 object
key2 object
dtype: objectIn [29]: grouped = df.groupby(df.dtypes, axis=1)
In [30]:for dtype, group in grouped:print(dtype)print(group)float64data1 data2
0 -0.204708 1.393406
1 0.478943 0.092908
2 -0.519439 0.281746
3 -0.555730 0.769023
4 1.965781 1.246435
objectkey1 key2
0 a one
1 a two
2 b one
3 b two
4 a one
df.groupby('key1')['data1']
df.groupby('key1')[['data2']]
df['data1'].groupby(df['key1'])
df[['data2']].groupby(df['key1'])
In [31]: df.groupby(['key1', 'key2'])[['data2']].mean()
Out[31]: data2
key1 key2
a one 1.319920two 0.092908
b one 0.281746two 0.769023
In [35]: people = pd.DataFrame(np.random.randn(5, 5),....: columns=['a', 'b', 'c', 'd', 'e'],....: index=['Joe', 'Steve', 'Wes', 'Jim', 'Travis'])In [36]: people.iloc[2:3, [1, 2]] = np.nan # Add a few NA valuesIn [37]: people
Out[37]: a b c d e
Joe 1.007189 -1.296221 0.274992 0.228913 1.352917
Steve 0.886429 -2.001637 -0.371843 1.669025 -0.438570
Wes -0.539741 NaN NaN -1.021228 -0.577087
Jim 0.124121 0.302614 0.523772 0.000940 1.343810
Travis -0.713544 -0.831154 -2.370232 -1.860761 -0.860757
In [38]: mapping = {'a': 'red', 'b': 'red', 'c': 'blue','d': 'blue', 'e': 'red', 'f' : 'orange'}
In [39]: by_column = upby(mapping, axis=1)In [40]: by_column.sum()
Out[40]: blue red
Joe 0.503905 1.063885
Steve 1.297183 -1.553778
Wes -1.021228 -1.116829
Jim 0.524712 1.770545
Travis -4.230992 -2.405455
In [41]: map_series = pd.Series(mapping)In [42]: map_series
Out[42]:
a red
b red
c blue
d blue
e red
f orange
dtype: objectIn [43]: upby(map_series, axis=1).count()
Out[43]: blue red
Joe 2 3
Steve 2 3
Wes 1 2
Jim 2 3
Travis 2 3
In [44]: upby(len).sum()
Out[44]: a b c d e
3 0.591569 -0.993608 0.798764 -0.791374 2.119639
5 0.886429 -2.001637 -0.371843 1.669025 -0.438570
6 -0.713544 -0.831154 -2.370232 -1.860761 -0.860757
In [45]: key_list = ['one', 'one', 'one', 'two', 'two']In [46]: upby([len, key_list]).min()
Out[46]: a b c d e
3 one -0.539741 -1.296221 0.274992 -1.021228 -0.577087two 0.124121 0.302614 0.523772 0.000940 1.343810
5 one 0.886429 -2.001637 -0.371843 1.669025 -0.438570
6 two -0.713544 -0.831154 -2.370232 -1.860761 -0.860757
In [47]: columns = pd.MultiIndex.from_arrays([['US', 'US', 'US', 'JP', 'JP'],[1, 3, 5, 1, 3]],names=['cty', 'tenor'])In [48]: hier_df = pd.DataFrame(np.random.randn(4, 5), columns=columns)In [49]: hier_df
Out[49]:
cty US JP
tenor 1 3 5 1 3
0 0.560145 -1.265934 0.119827 -1.063512 0.332883
1 -2.359419 -0.199543 -1.541996 -0.970736 -1.307030
2 0.286350 0.377984 -0.753887 0.331286 1.349742
3 0.069877 0.246674 -0.011862 1.004812 1.327195
In [50]: upby(level='cty', axis=1).count()
Out[50]:
cty JP US
0 2 3
1 2 3
2 2 3
3 2 3
In [51]: df
Out[51]: data1 data2 key1 key2
0 -0.204708 1.393406 a one
1 0.478943 0.092908 a two
2 -0.519439 0.281746 b one
3 -0.555730 0.769023 b two
4 1.965781 1.246435 a oneIn [52]: grouped = df.groupby('key1')In [53]: grouped['data1'].quantile(0.9)
Out[53]:
key1
a 1.668413
b -0.523068
Name: data1, dtype: float64
In [54]: def peak_to_peak(arr):return arr.max() - arr.min()
In [55]: grouped.agg(peak_to_peak)
Out[55]: data1 data2
key1
a 2.170488 1.300498
b 0.036292 0.487276
In [56]: grouped.describe()
Out[56]: data1 count mean std min 25% 50% 75%
key1
a 3.0 0.746672 1.109736 -0.204708 0.137118 0.478943 1.222362
b 2.0 -0.537585 0.025662 -0.555730 -0.546657 -0.537585 -0.528512 data2
max count mean std min 25% 50%
key1
a 1.965781 3.0 0.910916 0.712217 0.092908 0.669671 1.246435
b -0.519439 2.0 0.525384 0.344556 0.281746 0.403565 0.525384 75% max
key1
a 1.319920 1.393406
b 0.647203 0.769023
笔记:自定义聚合函数要比表10-1中那些经过优化的函数慢得多。这是因为在构造中间分组数据块时存在非常大的开销(函数调用、数据重排等)。
In [57]: tips = pd.read_csv('tips.csv')# Add tip percentage of total bill
In [58]: tips['tip_pct'] = tips['tip'] / tips['total_bill']In [59]: tips[:6]
Out[59]: total_bill tip smoker day time size tip_pct
0 16.99 1.01 No Sun Dinner 2 0.059447
1 10.34 1.66 No Sun Dinner 3 0.160542
2 21.01 3.50 No Sun Dinner 3 0.166587
3 23.68 3.31 No Sun Dinner 2 0.139780
4 24.59 3.61 No Sun Dinner 4 0.146808
5 25.29 4.71 No Sun Dinner 4 0.186240
In [60]: grouped = upby(['day', 'smoker'])
In [61]: grouped_pct = grouped['tip_pct']In [62]:
Out[62]:
day smoker
Fri No 0.151650Yes 0.174783
Sat No 0.158048Yes 0.147906
Sun No 0.160113Yes 0.187250
Thur No 0.160298Yes 0.163863
Name: tip_pct, dtype: float64
In [63]: grouped_pct.agg(['mean', 'std', peak_to_peak])
Out[63]: mean std peak_to_peak
day smoker
Fri No 0.151650 0.028123 0.067349Yes 0.174783 0.051293 0.159925
Sat No 0.158048 0.039767 0.235193Yes 0.147906 0.061375 0.290095
Sun No 0.160113 0.042347 0.193226Yes 0.187250 0.154134 0.644685
Thur No 0.160298 0.038774 0.193350Yes 0.163863 0.039389 0.151240
In [64]: grouped_pct.agg([('foo', 'mean'), ('bar', np.std)])
Out[64]: foo bar
day smoker
Fri No 0.151650 0.028123Yes 0.174783 0.051293
Sat No 0.158048 0.039767Yes 0.147906 0.061375
Sun No 0.160113 0.042347Yes 0.187250 0.154134
Thur No 0.160298 0.038774Yes 0.163863 0.039389
In [65]: functions = ['count', 'mean', 'max']In [66]: result = grouped['tip_pct', 'total_bill'].agg(functions)In [67]: result
Out[67]: tip_pct total_bill count mean max count mean max
day smoker
Fri No 4 0.151650 0.187735 4 18.420000 22.75Yes 15 0.174783 0.263480 15 16.813333 40.17
Sat No 45 0.158048 0.291990 45 19.661778 48.33Yes 42 0.147906 0.325733 42 21.276667 50.81
Sun No 57 0.160113 0.252672 57 20.506667 48.17Yes 19 0.187250 0.710345 19 24.120000 45.35
Thur No 45 0.160298 0.266312 45 17.113111 41.19Yes 17 0.163863 0.241255 17 19.190588 43.11
In [68]: result['tip_pct']
Out[68]: count mean max
day smoker
Fri No 4 0.151650 0.187735Yes 15 0.174783 0.263480
Sat No 45 0.158048 0.291990Yes 42 0.147906 0.325733
Sun No 57 0.160113 0.252672Yes 19 0.187250 0.710345
Thur No 45 0.160298 0.266312Yes 17 0.163863 0.241255
In [69]: ftuples = [('Durchschnitt', 'mean'),('Abweichung', np.var)]In [70]: grouped['tip_pct', 'total_bill'].agg(ftuples)
Out[70]: tip_pct total_bill Durchschnitt Abweichung Durchschnitt Abweichung
day smoker
Fri No 0.151650 0.000791 18.420000 25.596333Yes 0.174783 0.002631 16.813333 82.562438
Sat No 0.158048 0.001581 19.661778 79.908965Yes 0.147906 0.003767 21.276667 101.387535
Sun No 0.160113 0.001793 20.506667 66.099980Yes 0.187250 0.023757 24.120000 109.046044
Thur No 0.160298 0.001503 17.113111 59.625081Yes 0.163863 0.001551 19.190588 69.808518
In [71]: grouped.agg({'tip' : np.max, 'size' : 'sum'})
Out[71]: tip size
day smoker
Fri No 3.50 9Yes 4.73 31
Sat No 9.00 115Yes 10.00 104
Sun No 6.00 167Yes 6.50 49
Thur No 6.70 112Yes 5.00 40In [72]: grouped.agg({'tip_pct' : ['min', 'max', 'mean', 'std'],....: 'size' : 'sum'})
Out[72]: tip_pct sizemin max mean std sum
day smoker
Fri No 0.120385 0.187735 0.151650 0.028123 9Yes 0.103555 0.263480 0.174783 0.051293 31
Sat No 0.056797 0.291990 0.158048 0.039767 115Yes 0.035638 0.325733 0.147906 0.061375 104
Sun No 0.059447 0.252672 0.160113 0.042347 167Yes 0.065660 0.710345 0.187250 0.154134 49
Thur No 0.072961 0.266312 0.160298 0.038774 112Yes 0.090014 0.241255 0.163863 0.039389 40
In [73]: upby(['day', 'smoker'], as_index=False).mean()
Out[73]: day smoker total_bill tip size tip_pct
0 Fri No 18.420000 2.812500 2.250000 0.151650
1 Fri Yes 16.813333 2.714000 2.066667 0.174783
2 Sat No 19.661778 3.102889 2.555556 0.158048
3 Sat Yes 21.276667 2.875476 2.476190 0.147906
4 Sun No 20.506667 3.167895 2.929825 0.160113
5 Sun Yes 24.120000 3.516842 2.578947 0.187250
6 Thur No 17.113111 2.673778 2.488889 0.160298
7 Thur Yes 19.190588 3.030000 2.352941 0.163863
In [74]: def top(df, n=5, column='tip_pct'):....: return df.sort_values(by=column)[-n:]In [75]: top(tips, n=6)
Out[75]: total_bill tip smoker day time size tip_pct
109 14.31 4.00 Yes Sat Dinner 2 0.279525
183 23.17 6.50 Yes Sun Dinner 4 0.280535
232 11.61 3.39 No Sat Dinner 2 0.291990
67 3.07 1.00 Yes Sat Dinner 1 0.325733
178 9.60 4.00 Yes Sun Dinner 2 0.416667
172 7.25 5.15 Yes Sun Dinner 2 0.710345
In [76]: upby('smoker').apply(top)
Out[76]: total_bill tip smoker day time size tip_pct
smoker
No 88 24.71 5.85 No Thur Lunch 2 0.236746185 20.69 5.00 No Sun Dinner 5 0.24166351 10.29 2.60 No Sun Dinner 2 0.252672149 7.51 2.00 No Thur Lunch 2 0.266312232 11.61 3.39 No Sat Dinner 2 0.291990
Yes 109 14.31 4.00 Yes Sat Dinner 2 0.279525183 23.17 6.50 Yes Sun Dinner 4 0.28053567 3.07 1.00 Yes Sat Dinner 1 0.325733178 9.60 4.00 Yes Sun Dinner 2 0.416667172 7.25 5.15 Yes Sun Dinner 2 0.710345
In [77]: upby(['smoker', 'day']).apply(top, n=1, column='total_bill')
Out[77]: total_bill tip smoker day time size tip_pct
smoker day
No Fri 94 22.75 3.25 No Fri Dinner 2 0.142857Sat 212 48.33 9.00 No Sat Dinner 4 0.186220Sun 156 48.17 5.00 No Sun Dinner 6 0.103799Thur 142 41.19 5.00 No Thur Lunch 5 0.121389
Yes Fri 95 40.17 4.73 Yes Fri Dinner 4 0.117750Sat 170 50.81 10.00 Yes Sat Dinner 3 0.196812Sun 182 45.35 3.50 Yes Sun Dinner 3 0.077178Thur 197 43.11 5.00 Yes Thur Lunch 4 0.115982
笔记:除这些基本用法之外,能否充分发挥apply的威力很大程度上取决于你的创造力。传入的那个函数能做什么全由你说了算,它只需返回一个pandas对象或标量值即可。本章后续部分的示例主要用于讲解如何利用groupby解决各种各样的问题。
In [78]: result = upby('smoker')['tip_pct'].describe()In [79]: result
Out[79]: count mean std min 25% 50% 75%
smoker
No 151.0 0.159328 0.039910 0.056797 0.136906 0.155625 0.185014
Yes 93.0 0.163196 0.085119 0.035638 0.106771 0.153846 0.195059 max
smokerNo 0.291990
Yes 0.710345 In [80]: result.unstack('smoker')
Out[80]: smoker
count No 151.000000Yes 93.000000
mean No 0.159328Yes 0.163196
std No 0.039910Yes 0.085119
min No 0.056797Yes 0.035638
25% No 0.136906Yes 0.106771
50% No 0.155625Yes 0.153846
75% No 0.185014Yes 0.195059
max No 0.291990Yes 0.710345
dtype: float64
f = lambda x: x.describe()
grouped.apply(f)
In [81]: upby('smoker', group_keys=False).apply(top)
Out[81]: total_bill tip smoker day time size tip_pct
88 24.71 5.85 No Thur Lunch 2 0.236746
185 20.69 5.00 No Sun Dinner 5 0.241663
51 10.29 2.60 No Sun Dinner 2 0.252672
149 7.51 2.00 No Thur Lunch 2 0.266312
232 11.61 3.39 No Sat Dinner 2 0.291990
109 14.31 4.00 Yes Sat Dinner 2 0.279525
183 23.17 6.50 Yes Sun Dinner 4 0.280535
67 3.07 1.00 Yes Sat Dinner 1 0.325733
178 9.60 4.00 Yes Sun Dinner 2 0.416667
172 7.25 5.15 Yes Sun Dinner 2 0.710345
In [82]: frame = pd.DataFrame({'data1': np.random.randn(1000),....: 'data2': np.random.randn(1000)})In [83]: quartiles = pd.cut(frame.data1, 4)In [84]: quartiles[:10]
Out[84]:
0 (-1.23, 0.489]
1 (-2.956, -1.23]
2 (-1.23, 0.489]
3 (0.489, 2.208]
4 (-1.23, 0.489]
5 (0.489, 2.208]
6 (-1.23, 0.489]
7 (-1.23, 0.489]
8 (0.489, 2.208]
9 (0.489, 2.208]
Name: data1, dtype: category
Categories (4, interval[float64]): [(-2.956, -1.23] < (-1.23, 0.489] < (0.489, 2.
208] < (2.208, 3.928]]
In [85]: def get_stats(group):....: return {'min': group.min(), 'max': group.max(),....: 'count': unt(), 'mean': an()}In [86]: grouped = upby(quartiles)In [87]: grouped.apply(get_stats).unstack()
Out[87]: count max mean min
data1
(-2.956, -1.23] 95.0 1.670835 -0.039521 -3.399312
(-1.23, 0.489] 598.0 3.260383 -0.002051 -2.989741
(0.489, 2.208] 297.0 2.954439 0.081822 -3.745356
(2.208, 3.928] 10.0 1.765640 0.024750 -1.929776
# Return quantile numbers
In [88]: grouping = pd.qcut(frame.data1, 10, labels=False)In [89]: grouped = upby(grouping)In [90]: grouped.apply(get_stats).unstack()
Out[90]: count max mean min
data1
0 100.0 1.670835 -0.049902 -3.399312
1 100.0 2.628441 0.030989 -1.950098
2 100.0 2.527939 -0.067179 -2.925113
3 100.0 3.260383 0.065713 -2.315555
4 100.0 2.074345 -0.111653 -2.047939
5 100.0 2.184810 0.052130 -2.989741
6 100.0 2.458842 -0.021489 -2.223506
7 100.0 2.954439 -0.026459 -3.056990
8 100.0 2.735527 0.103406 -3.745356
9 100.0 2.377020 0.220122 -2.064111
In [91]: s = pd.Series(np.random.randn(6))In [92]: s[::2] = np.nanIn [93]: s
Out[93]:
0 NaN
1 -0.125921
2 NaN
3 -0.884475
4 NaN
5 0.227290
dtype: float64In [94]: s.an())
Out[94]:
0 -0.261035
1 -0.125921
2 -0.261035
3 -0.884475
4 -0.261035
5 0.227290
dtype: float64
In [95]: states = ['Ohio', 'New York', 'Vermont', 'Florida',....: 'Oregon', 'Nevada', 'California', 'Idaho']In [96]: group_key = ['East'] * 4 + ['West'] * 4In [97]: data = pd.Series(np.random.randn(8), index=states)In [98]: data
Out[98]:
Ohio 0.922264
New York -2.153545
Vermont -0.365757
Florida -0.375842
Oregon 0.329939
Nevada 0.981994
California 1.105913
Idaho -1.613716
dtype: float64
In [99]: data[['Vermont', 'Nevada', 'Idaho']] = np.nanIn [100]: data
Out[100]:
Ohio 0.922264
New York -2.153545
Vermont NaN
Florida -0.375842
Oregon 0.329939
Nevada NaN
California 1.105913
Idaho NaN
dtype: float64In [101]: upby(group_key).mean()
Out[101]:
East -0.535707
West 0.717926
dtype: float64
In [102]: fill_mean = lambda g: g.an())In [103]: upby(group_key).apply(fill_mean)
Out[103]:
Ohio 0.922264
New York -2.153545
Vermont -0.535707
Florida -0.375842
Oregon 0.329939
Nevada 0.717926
California 1.105913
Idaho 0.717926
dtype: float64
In [104]: fill_values = {'East': 0.5, 'West': -1}In [105]: fill_func = lambda g: g.fillna(fill_values[g.name])In [106]: upby(group_key).apply(fill_func)
Out[106]:
Ohio 0.922264
New York -2.153545
Vermont 0.500000
Florida -0.375842
Oregon 0.329939
Nevada -1.000000
California 1.105913
Idaho -1.000000
dtype: float64
# Hearts, Spades, Clubs, Diamonds(红桃、黑桃、梅花、方块)
suits = ['H', 'S', 'C', 'D']
card_val = (list(range(1, 11)) + [10] * 3) * 4
base_names = ['A'] + list(range(2, 11)) + ['J', 'K', 'Q']
cards = []
for suit in ['H', 'S', 'C', 'D']:d(str(num) + suit for num in base_names)deck = pd.Series(card_val, index=cards)
In [108]: deck[:13]
Out[108]:
AH 1
2H 2
3H 3
4H 4
5H 5
6H 6
7H 7
8H 8
9H 9
10H 10
JH 10
KH 10
QH 10
dtype: int64
In [109]:def draw(deck, n=5):return deck.sample(n)In [110]: draw(deck)
Out[110]:
AD 1
8C 8
5H 5
KC 10
2C 2
dtype: int64
In [111]: get_suit = lambda card: card[-1] # last letter is suitIn [112]: upby(get_suit).apply(draw, n=2)
Out[112]:
C 2C 23C 3
D KD 108D 8
H KH 103H 3
S 2S 24S 4
dtype: int64
In [113]: upby(get_suit, group_keys=False).apply(draw, n=2)
Out[113]:
KC 10
JC 10
AD 1
5D 5
5H 5
6H 6
7S 7
KS 10
dtype: int64
In [114]:df = pd.DataFrame({'category': ['a', 'a', 'a', 'a','b', 'b', 'b', 'b'],'data': np.random.randn(8),'weights': np.random.rand(8)})In [115]: df
Out[115]: category data weights
0 a 1.561587 0.957515
1 a 1.219984 0.347267
2 a -0.482239 0.581362
3 a 0.315667 0.217091
4 b -0.047852 0.894406
5 b -0.454145 0.918564
6 b -0.556774 0.277825
7 b 0.253321 0.955905
In [116]: grouped = df.groupby('category')In [117]: get_wavg = lambda g: np.average(g['data'], weights=g['weights'])In [118]: grouped.apply(get_wavg)
Out[118]:
category
a 0.811643
b -0.122262
dtype: float64
In [119]:close_px = pd.read_csv('stock_px_2.csv', parse_dates=True,index_col=0)In [120]: close_px.info()
<class frame.DataFrame'>
DatetimeIndex: 2214 entries, 2003-01-02 to 2011-10-14
Data columns (total 4 columns):
AAPL 2214 non-null float64
MSFT 2214 non-null float64
XOM 2214 non-null float64
SPX 2214 non-null float64
dtypes: float64(4)
memory usage: 86.5 KBIn [121]: close_px[-4:]
Out[121]: AAPL MSFT XOM SPX
2011-10-11 400.29 27.00 76.27 1195.54
2011-10-12 402.19 26.96 77.16 1207.25
2011-10-13 408.43 27.18 76.37 1203.66
2011-10-14 422.00 27.27 78.11 1224.58
In [122]: spx_corr = lambda x: x.corrwith(x['SPX'])
In [123]: rets = close_px.pct_change().dropna()
In [124]: get_year = lambda x: x.yearIn [125]: by_year = upby(get_year)In [126]: by_year.apply(spx_corr)
Out[126]: AAPL MSFT XOM SPX
2003 0.541124 0.745174 0.661265 1.0
2004 0.374283 0.588531 0.557742 1.0
2005 0.467540 0.562374 0.631010 1.0
2006 0.428267 0.406126 0.518514 1.0
2007 0.508118 0.658770 0.786264 1.0
2008 0.681434 0.804626 0.828303 1.0
2009 0.707103 0.654902 0.797921 1.0
2010 0.710105 0.730118 0.839057 1.0
2011 0.691931 0.800996 0.859975 1.0
In [127]: by_year.apply(lambda g: g['AAPL'].corr(g['MSFT']))
Out[127]:
2003 0.480868
2004 0.259024
2005 0.300093
2006 0.161735
2007 0.417738
2008 0.611901
2009 0.432738
2010 0.571946
2011 0.581987
dtype: float64
import statsmodels.api as sm
def regress(data, yvar, xvars):Y = data[yvar]X = data[xvars]X['intercept'] = 1.result = sm.OLS(Y, X).fit()return result.params
In [129]: by_year.apply(regress, 'AAPL', ['SPX'])
Out[129]: SPX intercept
2003 1.195406 0.000710
2004 1.363463 0.004201
2005 1.766415 0.003246
2006 1.645496 0.000080
2007 1.198761 0.003438
2008 0.968016 -0.001110
2009 0.879103 0.002954
2010 1.052608 0.001261
2011 0.806605 0.001514
In [130]: tips.pivot_table(index=['day', 'smoker'])
Out[130]: size tip tip_pct total_bill
day smoker
Fri No 2.250000 2.812500 0.151650 18.420000Yes 2.066667 2.714000 0.174783 16.813333
Sat No 2.555556 3.102889 0.158048 19.661778Yes 2.476190 2.875476 0.147906 21.276667
Sun No 2.929825 3.167895 0.160113 20.506667Yes 2.578947 3.516842 0.187250 24.120000
Thur No 2.488889 2.673778 0.160298 17.113111Yes 2.352941 3.030000 0.163863 19.190588
In [131]: tips.pivot_table(['tip_pct', 'size'], index=['time', 'day'],.....: columns='smoker')
Out[131]: size tip_pct
smoker No Yes No Yes
time day
Dinner Fri 2.000000 2.222222 0.139622 0.165347Sat 2.555556 2.476190 0.158048 0.147906Sun 2.929825 2.578947 0.160113 0.187250Thur 2.000000 NaN 0.159744 NaN
Lunch Fri 3.000000 1.833333 0.187735 0.188937Thur 2.500000 2.352941 0.160311 0.163863
In [132]: tips.pivot_table(['tip_pct', 'size'], index=['time', 'day'],.....: columns='smoker', margins=True)
Out[132]: size tip_pct
smoker No Yes All No Yes All
time day
Dinner Fri 2.000000 2.222222 2.166667 0.139622 0.165347 0.158916Sat 2.555556 2.476190 2.517241 0.158048 0.147906 0.153152Sun 2.929825 2.578947 2.842105 0.160113 0.187250 0.166897Thur 2.000000 NaN 2.000000 0.159744 NaN 0.159744
Lunch Fri 3.000000 1.833333 2.000000 0.187735 0.188937 0.188765Thur 2.500000 2.352941 2.459016 0.160311 0.163863 0.161301
All 2.668874 2.408602 2.569672 0.159328 0.163196 0.160803
In [133]: tips.pivot_table('tip_pct', index=['time', 'smoker'], columns='day',.....: aggfunc=len, margins=True)
Out[133]:
day Fri Sat Sun Thur All
time smoker
Dinner No 3.0 45.0 57.0 1.0 106.0Yes 9.0 42.0 19.0 NaN 70.0
Lunch No 1.0 NaN NaN 44.0 45.0Yes 6.0 NaN NaN 17.0 23.0
All 19.0 87.0 76.0 62.0 244.0
In [134]: tips.pivot_table('tip_pct', index=['time', 'size', 'smoker'],.....: columns='day', aggfunc='mean', fill_value=0)
Out[134]:
day Fri Sat Sun Thur
time size smoker
Dinner 1 No 0.000000 0.137931 0.000000 0.000000Yes 0.000000 0.325733 0.000000 0.0000002 No 0.139622 0.162705 0.168859 0.159744Yes 0.171297 0.148668 0.207893 0.0000003 No 0.000000 0.154661 0.152663 0.000000Yes 0.000000 0.144995 0.152660 0.0000004 No 0.000000 0.150096 0.148143 0.000000Yes 0.117750 0.124515 0.193370 0.0000005 No 0.000000 0.000000 0.206928 0.000000
Yes 0.000000 0.106572 0.065660 0.000000
... ... ... ... ...
Lunch 1 No 0.000000 0.000000 0.000000 0.181728Yes 0.223776 0.000000 0.000000 0.0000002 No 0.000000 0.000000 0.000000 0.166005Yes 0.181969 0.000000 0.000000 0.1588433 No 0.187735 0.000000 0.000000 0.084246Yes 0.000000 0.000000 0.000000 0.2049524 No 0.000000 0.000000 0.000000 0.138919Yes 0.000000 0.000000 0.000000 0.1554105 No 0.000000 0.000000 0.000000 0.1213896 No 0.000000 0.000000 0.000000 0.173706
[21 rows x 4 columns]
In [138]: data
Out[138]:Sample Nationality Handedness
0 1 USA Right-handed
1 2 Japan Left-handed
2 3 USA Right-handed
3 4 Japan Right-handed
4 5 Japan Left-handed
5 6 Japan Right-handed
6 7 USA Right-handed
7 8 USA Left-handed
8 9 Japan Right-handed
9 10 USA Right-handed
In [139]: pd.crosstab(data.Nationality, data.Handedness, margins=True)
Out[139]:
Handedness Left-handed Right-handed All
Nationality
Japan 2 3 5
USA 1 4 5
All 3 7 10
In [140]: pd.crosstab([tips.time, tips.day], tips.smoker, margins=True)
Out[140]:
smoker No Yes All
time day
Dinner Fri 3 9 12Sat 45 42 87Sun 57 19 76Thur 1 0 1
Lunch Fri 1 6 7Thur 44 17 61
All 151 93 244
本文发布于:2024-02-01 16:05:19,感谢您对本站的认可!
本文链接:https://www.4u4v.net/it/170677471737812.html
版权声明:本站内容均来自互联网,仅供演示用,请勿用于商业和其他非法用途。如果侵犯了您的权益请与我们联系,我们将在24小时内删除。
留言与评论(共有 0 条评论) |