pandas索引

阅读: 评论:0

pandas索引

pandas索引

# .html
# MultiIndex / Advanced Indexing
# pandas 0.22.0 
import pandas as pd
import numpy as np
import random  # 注意:random.shuffle(tuples) 必须在 tuples 定义之后才能调用(用于后面的“多重索引排序”一节)
# Hierarchical indexing (MultiIndex)  分层索引 多重索引# 创建多重索引对象,如同标准的索引类,他们存放轴axis标签labels
# 创建方式:
# from a list of arrays -- using MultiIndex.from_arrays
# from an array of tuples -- using MultiIndex.from_tuples
# from a crossed set of iterables -- using MultiIndex.from_product
arrays = [['bar', 'bar', 'baz', 'baz', 'foo', 'foo', 'qux', 'qux'],['one', 'two', 'one', 'two', 'one', 'two', 'one', 'two']]
arrays
# *arrays
zip(*arrays)  # 这里的* 是“解开”list , list[0] list[1] ……
list(zip(*arrays))
list(zip(arrays[0], arrays[1]))
tuples = list(zip(*arrays))
tuples
[['bar', 'bar', 'baz', 'baz', 'foo', 'foo', 'qux', 'qux'],['one', 'two', 'one', 'two', 'one', 'two', 'one', 'two']]<zip at 0xf96f888>[('bar', 'one'),('bar', 'two'),('baz', 'one'),('baz', 'two'),('foo', 'one'),('foo', 'two'),('qux', 'one'),('qux', 'two')][('bar', 'one'),('bar', 'two'),('baz', 'one'),('baz', 'two'),('foo', 'one'),('foo', 'two'),('qux', 'one'),('qux', 'two')][('bar', 'one'),('bar', 'two'),('baz', 'one'),('baz', 'two'),('foo', 'one'),('foo', 'two'),('qux', 'one'),('qux', 'two')]
index = pd.MultiIndex.from_tuples(tuples, names=['first', 'second'])
index
MultiIndex(levels=[['bar', 'baz', 'foo', 'qux'], ['one', 'two']],labels=[[0, 0, 1, 1, 2, 2, 3, 3], [0, 1, 0, 1, 0, 1, 0, 1]],names=['first', 'second'])
s = pd.Series(np.random.randn(8), index=index)
s
first  second
bar    one       0.271000two      -1.276230
baz    one      -1.018103two      -0.620292
foo    one       1.008070two       0.759145
qux    one      -2.141050two      -0.927688
dtype: float64
# 更简洁的方式, 当每个元素对都来自于可迭代对象时
iterables = [['bar', 'baz', 'foo', 'qux'], ['one', 'two']]  # 创建了与上面相同的index
pd.MultiIndex.from_product(iterables, names=['first', 'second'])  # 多重索引可以接受命名,默认为None
MultiIndex(levels=[['bar', 'baz', 'foo', 'qux'], ['one', 'two']],labels=[[0, 0, 1, 1, 2, 2, 3, 3], [0, 1, 0, 1, 0, 1, 0, 1]],names=['first', 'second'])
# 也可以直接在创建df或series时传入矩阵array 的 列表list
arrays = [np.array(['bar', 'bar', 'baz', 'baz', 'foo', 'foo', 'qux', 'qux']),np.array(['one', 'two', 'one', 'two', 'one', 'two', 'one', 'two'])]
arrays  
[array(['bar', 'bar', 'baz', 'baz', 'foo', 'foo', 'qux', 'qux'],dtype='<U3'),array(['one', 'two', 'one', 'two', 'one', 'two', 'one', 'two'],dtype='<U3')]
s = pd.Series(np.random.randn(8), index=arrays)
s
bar  one    0.817791two    0.510420
baz  one   -0.494160two   -0.529997
foo  one    0.641282two   -0.202762
qux  one    0.050320two    2.097300
dtype: float64
df = pd.DataFrame(np.random.randn(8, 4), index=arrays)
df
0123
barone-2.0909890.0010521.4676370.267938
two1.2246100.8518940.765531-0.505116
bazone1.444246-0.2477950.267462-0.945641
two0.8360460.2747320.530525-0.560081
fooone-3.709465-0.1570890.608778-0.003217
two-0.8488181.478306-0.389401-1.205956
quxone-1.0697751.272440-0.797613-0.194223
two1.5972180.454815-0.7560220.481038
# 可以对不同的轴向(如行索引/行名或列索引/列名)设置
df = pd.DataFrame(np.random.randn(3, 8), index=['A', 'B', 'C'], columns=index)
df
firstbarbazfooqux
secondonetwoonetwoonetwoonetwo
A-1.608162-0.0073121.048244-0.029907-0.437866-1.8533982.0268750.359521
B1.207609-0.272366-0.530191-0.689641-0.244362-1.4762520.8184930.353771
C-0.3694631.862253-0.118297-0.1483261.147616-1.3899650.8177160.787394
# 同时对两个方向设置index,注意index的长度与数据在不同方向上的长度
pd.DataFrame(np.random.randn(6, 6), index=index[:6], columns=index[:6])
firstbarbazfoo
secondonetwoonetwoonetwo
firstsecond
barone0.020204-0.5490890.3818300.326558-1.420590-1.551863
two1.3117752.2949080.2039811.381199-0.7433872.119027
bazone0.6408561.089627-1.4635030.727607-0.959549-0.037316
two-0.906859-0.7207020.8626140.0820660.209276-0.391039
fooone-0.328704-1.0151170.2798260.141166-0.053601-1.171920
two0.342074-0.196049-0.3879460.196228-1.2649320.144251
pd.Series(np.random.randn(8), index=tuples)  # 多重索引,相当于元组index
(bar, one)   -0.267177
(bar, two)   -0.239632
(baz, one)    1.212249
(baz, two)    0.289517
(foo, one)    1.311922
(foo, two)   -0.797733
(qux, one)   -1.395485
(qux, two)   -0.451327
dtype: float64
# 可以控制索引的显示方式,通过在 pandas.set_options() 设置 multi_sparse 选项
pd.set_option('display.multi_sparse', False)
df
pd.set_option('display.multi_sparse', True)
df
firstbarbarbazbazfoofooquxqux
secondonetwoonetwoonetwoonetwo
A-1.608162-0.0073121.048244-0.029907-0.437866-1.8533982.0268750.359521
B1.207609-0.272366-0.530191-0.689641-0.244362-1.4762520.8184930.353771
C-0.3694631.862253-0.118297-0.1483261.147616-1.3899650.8177160.787394
firstbarbazfooqux
secondonetwoonetwoonetwoonetwo
A-1.608162-0.0073121.048244-0.029907-0.437866-1.8533982.0268750.359521
B1.207609-0.272366-0.530191-0.689641-0.244362-1.4762520.8184930.353771
C-0.3694631.862253-0.118297-0.1483261.147616-1.3899650.8177160.787394
# Reconstructing the level labels 重建层级标签
# The method get_level_values will return a vector of the labels for each location at a particular level:
# get_level_values 方法返回指定层级的标签向量
index.get_level_values(0)  # 使用整数序号
index.get_level_values("second")  # 使用name
Index(['bar', 'bar', 'baz', 'baz', 'foo', 'foo', 'qux', 'qux'], dtype='object', name='first')Index(['one', 'two', 'one', 'two', 'one', 'two', 'one', 'two'], dtype='object', name='second')
# Basic indexing on axis with MultiIndex # 在轴方向的基础索引
df['bar']
df['bar', 'one']
df['bar']['one']  # 不建议使用,链式
s['qux']
secondonetwo
A-1.608162-0.007312
B1.207609-0.272366
C-0.3694631.862253
A   -1.608162
B    1.207609
C   -0.369463
Name: (bar, one), dtype: float64A   -1.608162
B    1.207609
C   -0.369463
Name: one, dtype: float64one    0.05032
two    2.09730
dtype: float64
# Defined Levels 指定层级
df.columns  # 原index
df[['foo','qux']].columns  # 切片后的结果,层级levels中的项目没有减少,labels减少了
# 这样做避免了重新计算层级,使切片保持高效
MultiIndex(levels=[['bar', 'baz', 'foo', 'qux'], ['one', 'two']],labels=[[0, 0, 1, 1, 2, 2, 3, 3], [0, 1, 0, 1, 0, 1, 0, 1]],names=['first', 'second'])MultiIndex(levels=[['bar', 'baz', 'foo', 'qux'], ['one', 'two']],labels=[[2, 2, 3, 3], [0, 1, 0, 1]],names=['first', 'second'])
# 查看切片实际选择levels
df[['foo','qux']].columns.values
df[['foo','qux']].columns.get_level_values(0)  # 指定层级
array([('foo', 'one'), ('foo', 'two'), ('qux', 'one'), ('qux', 'two')],dtype=object)Index(['foo', 'foo', 'qux', 'qux'], dtype='object', name='first')
# 用有效的used层级重建多重索引
df[['foo','qux']].columns.remove_unused_levels()
MultiIndex(levels=[['foo', 'qux'], ['one', 'two']],labels=[[0, 0, 1, 1], [0, 1, 0, 1]],names=['first', 'second'])
# Data alignment and using reindex 数据定位和使用reindex
s
bar  one    0.817791two    0.510420
baz  one   -0.494160two   -0.529997
foo  one    0.641282two   -0.202762
qux  one    0.050320two    2.097300
dtype: float64
# 当两个index不同的对象计算时与一般的index一样
s + s[:-2]
s + s[::2]
bar  one    1.635582two    1.020840
baz  one   -0.988319two   -1.059993
foo  one    1.282563two   -0.405523
qux  one         NaNtwo         NaN
dtype: float64bar  one    1.635582two         NaN
baz  one   -0.988319two         NaN
foo  one    1.282563two         NaN
qux  one    0.100640two         NaN
dtype: float64
# reindex 可以被另外一个 multiindex 或者 元组的list 或 array 调用
index
index[:3]
s.reindex(index[:3])
s.reindex([('foo', 'two'), ('bar', 'one'), ('qux', 'one'), ('baz', 'one')])
MultiIndex(levels=[['bar', 'baz', 'foo', 'qux'], ['one', 'two']],labels=[[0, 0, 1, 1, 2, 2, 3, 3], [0, 1, 0, 1, 0, 1, 0, 1]],names=['first', 'second'])MultiIndex(levels=[['bar', 'baz', 'foo', 'qux'], ['one', 'two']],labels=[[0, 0, 1], [0, 1, 0]],names=['first', 'second'])first  second
bar    one       0.817791two       0.510420
baz    one      -0.494160
dtype: float64foo  two   -0.202762
bar  one    0.817791
qux  one    0.050320
baz  one   -0.494160
dtype: float64
# Advanced indexing with hierarchical index
# 使用层次索引的高级索引方法
df = df.T
df
ABC
firstsecond
barone-1.6081621.207609-0.369463
two-0.007312-0.2723661.862253
bazone1.048244-0.530191-0.118297
two-0.029907-0.689641-0.148326
fooone-0.437866-0.2443621.147616
two-1.853398-1.476252-1.389965
quxone2.0268750.8184930.817716
two0.3595210.3537710.787394
# .loc 定位
df.loc["bar"]
df.loc["bar", "two"]  # 返回了一个series(不是一行,而是“一列”),其索引是原df的列名
ABC
second
one-1.6081621.207609-0.369463
two-0.007312-0.2723661.862253
A   -0.007312
B   -0.272366
C    1.862253
Name: (bar, two), dtype: float64
# loc 中使用切片,切片的值可以是元组
df.loc['baz':'foo']
df.loc[('baz', 'two'):('qux', 'one')]
df.loc[('baz', 'two'):'foo']
ABC
firstsecond
bazone1.048244-0.530191-0.118297
two-0.029907-0.689641-0.148326
fooone-0.437866-0.2443621.147616
two-1.853398-1.476252-1.389965
ABC
firstsecond
baztwo-0.029907-0.689641-0.148326
fooone-0.437866-0.2443621.147616
two-1.853398-1.476252-1.389965
quxone2.0268750.8184930.817716
ABC
firstsecond
baztwo-0.029907-0.689641-0.148326
fooone-0.437866-0.2443621.147616
two-1.853398-1.476252-1.389965
# 可以给loc传入元组或标签列表,取得不连续的索引
df.loc[[('bar', 'two'), ('qux', 'one')]]
ABC
firstsecond
bartwo-0.007312-0.2723661.862253
quxone2.0268750.8184930.817716
# Using slicers 使用切片
# 可以用多重索引对象进行切片。可以用 切片值、 标签或标签列表、 布尔索引等选择器
# 可以用slice(None) 选择那一级的所有的内容。不用特别指定所有深度的级别,他们默认是slice(None)
# 注意:使用loc应该规定所有的轴方向,包括行index和列columns。
# 推荐方式:df.loc[(slice('A1','A3'),.....), :]   注意 冒号前面的逗号,逗号前表示行方向切片(选择器),逗号后面表示列方向切片(选择器)
# 不推荐:  df.loc[(slice('A1','A3'),.....)]  可能产生歧义
def mklbl(prefix, n):
    """Build a list of ``n`` labels made of ``prefix`` followed by 0..n-1.

    Example: mklbl("a", 3) --> ['a0', 'a1', 'a2']

    Args:
        prefix: Value placed in front of each running number.
        n: Number of labels to generate; 0 yields an empty list.

    Returns:
        A list of ``n`` strings.
    """
    return ["%s%s" % (prefix, i) for i in range(n)]
mklbl("a", 3)
['a0', 'a1', 'a2']
miindex = pd.MultiIndex.from_product([mklbl('A',4),mklbl('B',2),mklbl('C',4),mklbl('D',2)])
miindex  # 由列表生成4重(4级)索引对象,共生成4*2*4*2=64行
MultiIndex(levels=[['A0', 'A1', 'A2', 'A3'], ['B0', 'B1'], ['C0', 'C1', 'C2', 'C3'], ['D0', 'D1']],labels=[[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3], [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1], [0, 0, 1, 1, 2, 2, 3, 3, 0, 0, 1, 1, 2, 2, 3, 3, 0, 0, 1, 1, 2, 2, 3, 3, 0, 0, 1, 1, 2, 2, 3, 3, 0, 0, 1, 1, 2, 2, 3, 3, 0, 0, 1, 1, 2, 2, 3, 3, 0, 0, 1, 1, 2, 2, 3, 3, 0, 0, 1, 1, 2, 2, 3, 3], [0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1]])
micolumns = pd.MultiIndex.from_tuples([('a','foo'),('a','bar'),('b','foo'),('b','bah')],names=['lvl0', 'lvl1'])
micolumns  # 由元组生成2重索引对象,共4行,并对2重(两级)分别命名
MultiIndex(levels=[['a', 'b'], ['bah', 'bar', 'foo']],labels=[[0, 0, 1, 1], [2, 1, 2, 0]],names=['lvl0', 'lvl1'])
row_l = len(miindex)
col_l = len(micolumns)
dfmi = pd.DataFrame(np.arange(row_l * col_l).reshape((row_l, col_l)),index=miindex,columns=micolumns).sort_index().sort_index(axis=1)
dfmi 
lvl0ab
lvl1barfoobahfoo
A0B0C0D01032
D15476
C1D0981110
D113121514
C2D017161918
D121202322
C3D025242726
D129283130
B1C0D033323534
D137363938
C1D041404342
D145444746
C2D049485150
D153525554
C3D057565958
D161606362
A1B0C0D065646766
D169687170
C1D073727574
D177767978
C2D081808382
D185848786
C3D089889190
D193929594
B1C0D097969998
D1101100103102
C1D0105104107106
D1109108111110
C2D0113112115114
D1117116119118
........................
A2B0C1D0137136139138
D1141140143142
C2D0145144147146
D1149148151150
C3D0153152155154
D1157156159158
B1C0D0161160163162
D1165164167166
C1D0169168171170
D1173172175174
C2D0177176179178
D1181180183182
C3D0185184187186
D1189188191190
A3B0C0D0193192195194
D1197196199198
C1D0201200203202
D1205204207206
C2D0209208211210
D1213212215214
C3D0217216219218
D1221220223222
B1C0D0225224227226
D1229228231230
C1D0233232235234
D1237236239238
C2D0241240243242
D1245244247246
C3D0249248251250
D1253252255254

64 rows × 4 columns

# Basic multi-index slicing using slices, lists, and labels.
dfmi.loc[(slice('A1','A3'), slice(None), ['C1', 'C3']), :]  # slice('A1','A3') 相当于 ['A1':'A3']
# dfmi.loc[(slice('A1','A3'), slice(None), ['C1', 'C3']) :]  # 错误 冒号前面必须有逗号
lvl0ab
lvl1barfoobahfoo
A1B0C1D073727574
D177767978
C3D089889190
D193929594
B1C1D0105104107106
D1109108111110
C3D0121120123122
D1125124127126
A2B0C1D0137136139138
D1141140143142
C3D0153152155154
D1157156159158
B1C1D0169168171170
D1173172175174
C3D0185184187186
D1189188191190
A3B0C1D0201200203202
D1205204207206
C3D0217216219218
D1221220223222
B1C1D0233232235234
D1237236239238
C3D0249248251250
D1253252255254
# You can use a pd.IndexSlice to have a more natural syntax using : 
# rather than using slice(None)
# 使用pd.IndexSlice 可以用冒号 : 代替 slice(None)
idx = pd.IndexSlice
dfmi.loc[idx[:, :, ['C1', 'C3']], idx[:, 'foo']]  # 默认必须是一个元组()来指定关键字,而且全选的话只能使用slice(None)
lvl0ab
lvl1foofoo
A0B0C1D0810
D11214
C3D02426
D12830
B1C1D04042
D14446
C3D05658
D16062
A1B0C1D07274
D17678
C3D08890
D19294
B1C1D0104106
D1108110
C3D0120122
D1124126
A2B0C1D0136138
D1140142
C3D0152154
D1156158
B1C1D0168170
D1172174
C3D0184186
D1188190
A3B0C1D0200202
D1204206
C3D0216218
D1220222
B1C1D0232234
D1236238
C3D0248250
D1252254
# 一次执行复杂选取
dfmi.loc['A1', (slice(None), 'foo')]
lvl0ab
lvl1foofoo
B0C0D06466
D16870
C1D07274
D17678
C2D08082
D18486
C3D08890
D19294
B1C0D09698
D1100102
C1D0104106
D1108110
C2D0112114
D1116118
C3D0120122
D1124126
dfmi.loc[idx[:, :, ['C1', 'C3']], idx[:, 'foo']]
lvl0ab
lvl1foofoo
A0B0C1D0810
D11214
C3D02426
D12830
B1C1D04042
D14446
C3D05658
D16062
A1B0C1D07274
D17678
C3D08890
D19294
B1C1D0104106
D1108110
C3D0120122
D1124126
A2B0C1D0136138
D1140142
C3D0152154
D1156158
B1C1D0168170
D1172174
C3D0184186
D1188190
A3B0C1D0200202
D1204206
C3D0216218
D1220222
B1C1D0232234
D1236238
C3D0248250
D1252254
# Using a boolean indexer you can provide selection related to the values. 
# 使用布尔索引
mask = dfmi[('a', 'foo')] > 200
mask
dfmi.loc[idx[mask, :, ['C1', 'C3']], idx[:, 'foo']]
A0  B0  C0  D0    FalseD1    FalseC1  D0    FalseD1    FalseC2  D0    FalseD1    FalseC3  D0    FalseD1    FalseB1  C0  D0    FalseD1    FalseC1  D0    FalseD1    FalseC2  D0    FalseD1    FalseC3  D0    FalseD1    False
A1  B0  C0  D0    FalseD1    FalseC1  D0    FalseD1    FalseC2  D0    FalseD1    FalseC3  D0    FalseD1    FalseB1  C0  D0    FalseD1    FalseC1  D0    FalseD1    FalseC2  D0    FalseD1      
A2  B0  C1  D0    FalseD1    FalseC2  D0    FalseD1    FalseC3  D0    FalseD1    FalseB1  C0  D0    FalseD1    FalseC1  D0    FalseD1    FalseC2  D0    FalseD1    FalseC3  D0    FalseD1    False
A3  B0  C0  D0    FalseD1    FalseC1  D0    FalseD1     TrueC2  D0     TrueD1     TrueC3  D0     TrueD1     TrueB1  C0  D0     TrueD1     TrueC1  D0     TrueD1     TrueC2  D0     TrueD1     TrueC3  D0     TrueD1     True
Name: (a, foo), Length: 64, dtype: bool
lvl0ab
lvl1foofoo
A3B0C1D1204206
C3D0216218
D1220222
B1C1D0232234
D1236238
C3D0248250
D1252254
# 指定轴参数axis,说明传入的切片在一个轴上
dfmi.loc(axis=0)[:, :, ['C1', 'C3']]
lvl0ab
lvl1barfoobahfoo
A0B0C1D0981110
D113121514
C3D025242726
D129283130
B1C1D041404342
D145444746
C3D057565958
D161606362
A1B0C1D073727574
D177767978
C3D089889190
D193929594
B1C1D0105104107106
D1109108111110
C3D0121120123122
D1125124127126
A2B0C1D0137136139138
D1141140143142
C3D0153152155154
D1157156159158
B1C1D0169168171170
D1173172175174
C3D0185184187186
D1189188191190
A3B0C1D0201200203202
D1205204207206
C3D0217216219218
D1221220223222
B1C1D0233232235234
D1237236239238
C3D0249248251250
D1253252255254
# 可以使用这种指定轴方向的方式赋值
df2 = dfmi.copy()
df2.loc(axis=0)[:, :, ['C1', 'C3']] = -10
df2
lvl0ab
lvl1barfoobahfoo
A0B0C0D01032
D15476
C1D0-10-10-10-10
D1-10-10-10-10
C2D017161918
D121202322
C3D0-10-10-10-10
D1-10-10-10-10
B1C0D033323534
D137363938
C1D0-10-10-10-10
D1-10-10-10-10
C2D049485150
D153525554
C3D0-10-10-10-10
D1-10-10-10-10
A1B0C0D065646766
D169687170
C1D0-10-10-10-10
D1-10-10-10-10
C2D081808382
D185848786
C3D0-10-10-10-10
D1-10-10-10-10
B1C0D097969998
D1101100103102
C1D0-10-10-10-10
D1-10-10-10-10
C2D0113112115114
D1117116119118
........................
A2B0C1D0-10-10-10-10
D1-10-10-10-10
C2D0145144147146
D1149148151150
C3D0-10-10-10-10
D1-10-10-10-10
B1C0D0161160163162
D1165164167166
C1D0-10-10-10-10
D1-10-10-10-10
C2D0177176179178
D1181180183182
C3D0-10-10-10-10
D1-10-10-10-10
A3B0C0D0193192195194
D1197196199198
C1D0-10-10-10-10
D1-10-10-10-10
C2D0209208211210
D1213212215214
C3D0-10-10-10-10
D1-10-10-10-10
B1C0D0225224227226
D1229228231230
C1D0-10-10-10-10
D1-10-10-10-10
C2D0241240243242
D1245244247246
C3D0-10-10-10-10
D1-10-10-10-10

64 rows × 4 columns

# You can use a right-hand-side of an alignable object as well.
# 可以在右侧使用可定位的对象?
df2 = dfmi.copy()
df2.loc[idx[:, :, ['C1', 'C3']], :] = df2 * 1000
df2
lvl0ab
lvl1barfoobahfoo
A0B0C0D01032
D15476
C1D0900080001100010000
D113000120001500014000
C2D017161918
D121202322
C3D025000240002700026000
D129000280003100030000
B1C0D033323534
D137363938
C1D041000400004300042000
D145000440004700046000
C2D049485150
D153525554
C3D057000560005900058000
D161000600006300062000
A1B0C0D065646766
D169687170
C1D073000720007500074000
D177000760007900078000
C2D081808382
D185848786
C3D089000880009100090000
D193000920009500094000
B1C0D097969998
D1101100103102
C1D0105000104000107000106000
D1109000108000111000110000
C2D0113112115114
D1117116119118
........................
A2B0C1D0137000136000139000138000
D1141000140000143000142000
C2D0145144147146
D1149148151150
C3D0153000152000155000154000
D1157000156000159000158000
B1C0D0161160163162
D1165164167166
C1D0169000168000171000170000
D1173000172000175000174000
C2D0177176179178
D1181180183182
C3D0185000184000187000186000
D1189000188000191000190000
A3B0C0D0193192195194
D1197196199198
C1D0201000200000203000202000
D1205000204000207000206000
C2D0209208211210
D1213212215214
C3D0217000216000219000218000
D1221000220000223000222000
B1C0D0225224227226
D1229228231230
C1D0233000232000235000234000
D1237000236000239000238000
C2D0241240243242
D1245244247246
C3D0249000248000251000250000
D1253000252000255000254000

64 rows × 4 columns

# Cross-section 断面
# xs 方法 另外提供了一个级别level参数 用来选择多重索引中的部分级别
# xs 当提供轴参数时,也可用于列的选择
df
ABC
firstsecond
barone-1.6081621.207609-0.369463
two-0.007312-0.2723661.862253
bazone1.048244-0.530191-0.118297
two-0.029907-0.689641-0.148326
fooone-0.437866-0.2443621.147616
two-1.853398-1.476252-1.389965
quxone2.0268750.8184930.817716
two0.3595210.3537710.787394
df.xs("one", level="second")
df.loc[(slice(None), "one"), :]  # 使用切片得到同样的选择
ABC
first
bar-1.6081621.207609-0.369463
baz1.048244-0.530191-0.118297
foo-0.437866-0.2443621.147616
qux2.0268750.8184930.817716
ABC
firstsecond
barone-1.6081621.207609-0.369463
bazone1.048244-0.530191-0.118297
fooone-0.437866-0.2443621.147616
quxone2.0268750.8184930.817716
df = df.T
df.xs('one', level='second', axis=1)  # 在列方向选择
df.loc[:, (slice(None),'one')]  # 使用切片方式
firstbarbazfooqux
A-1.6081621.048244-0.4378662.026875
B1.207609-0.530191-0.2443620.818493
C-0.369463-0.1182971.1476160.817716
firstbarbazfooqux
secondoneoneoneone
A-1.6081621.048244-0.4378662.026875
B1.207609-0.530191-0.2443620.818493
C-0.369463-0.1182971.1476160.817716
# xs 方法使用多重关键字keys,关键字元组
df.xs(('one', 'bar'), level=('second', 'first'), axis=1)  # 关键字元组可以与级层顺序不一致,与给定level顺序一致
df.loc[:, ("bar", "one")]
# df.loc[:, ("one", "bar")]  # 错误,元组元素的顺序与级层顺序一致
firstbar
secondone
A-1.608162
B1.207609
C-0.369463
A   -1.608162
B    1.207609
C   -0.369463
Name: (bar, one), dtype: float64
# drop_level=False 参数可以使xs保留选定的层级,而不是舍弃,这样的话,与切片得到的结果完全相同
df.xs('one', level='second', axis=1, drop_level=False)  # 默认 drop_level=True
df.loc[:, (slice(None) ,"one")]
firstbarbazfooqux
secondoneoneoneone
A-1.6081621.048244-0.4378662.026875
B1.207609-0.530191-0.2443620.818493
C-0.369463-0.1182971.1476160.817716
firstbarbazfooqux
secondoneoneoneone
A-1.6081621.048244-0.4378662.026875
B1.207609-0.530191-0.2443620.818493
C-0.369463-0.1182971.1476160.817716
# Advanced reindexing and alignment 高级索引和定位
# level参数加在索引reindex和定位align方法中。可用于通过级层进行广播值。
midx = pd.MultiIndex(levels=[['zero', 'one'], ['x','y']],labels=[[1,1,0,0],[1,0,1,0]])  # 指定了labels,指定了层级间对应关系
midx
MultiIndex(levels=[['zero', 'one'], ['x', 'y']],labels=[[1, 1, 0, 0], [1, 0, 1, 0]])
df = pd.DataFrame(np.random.randn(4,2), index=midx)
df
01
oney-1.388542-1.170054
x0.240534-0.656707
zeroy-0.848351-1.394871
x-0.2122480.051445
# 不同层级索引的广播计算
df2 = df.mean(level=1)
df2
df2 = df.mean(level=0)
df2
01
y-1.118447-1.282462
x0.014143-0.302631
01
one-0.574004-0.913381
zero-0.530300-0.671713
# 重构索引
df2.reindex(df.index, level=0)
01
oney-0.574004-0.913381
x-0.574004-0.913381
zeroy-0.530300-0.671713
x-0.530300-0.671713
# 定位/对齐
df
df2
df.align(df2, level=0)
df_aligned, df2_aligned = df.align(df2, level=0)  # ??
df_aligned
df2_aligned
01
oney-1.388542-1.170054
x0.240534-0.656707
zeroy-0.848351-1.394871
x-0.2122480.051445
01
one-0.574004-0.913381
zero-0.530300-0.671713
(               0         1one  y -1.388542 -1.170054x  0.240534 -0.656707zero y -0.848351 -1.394871x -0.212248  0.051445,                0         1one  y -0.574004 -0.913381x -0.574004 -0.913381zero y -0.530300 -0.671713x -0.530300 -0.671713)
01
oney-1.388542-1.170054
x0.240534-0.656707
zeroy-0.848351-1.394871
x-0.2122480.051445
01
oney-0.574004-0.913381
x-0.574004-0.913381
zeroy-0.530300-0.671713
x-0.530300-0.671713
# 交换层级 swaplevel()
df.swaplevel(0, 1, axis=0)
01
yone-1.388542-1.170054
xone0.240534-0.656707
yzero-0.848351-1.394871
xzero-0.2122480.051445
# reorder_levels 概括(推广)了 swaplevel 函数, 可以一步重排多个层级索引
df.reorder_levels([1,0], axis=0)  # 看上去结果与swaplevel一样,传入参数不一样
01
yone-1.388542-1.170054
xone0.240534-0.656707
yzero-0.848351-1.394871
xzero-0.2122480.051445
# 多重索引排序
# 排序是为了高效的索引和切片。任何索引都可以使用sort_index
import random; random.shuffle(tuples)  # 先打乱tuples的顺序
tuples
s = pd.Series(np.random.randn(8), index=pd.MultiIndex.from_tuples(tuples))
s
[('baz', 'two'),('qux', 'two'),('bar', 'one'),('foo', 'one'),('qux', 'one'),('baz', 'one'),('foo', 'two'),('bar', 'two')]baz  two    1.365155
qux  two   -1.331225
bar  one   -1.512430
foo  one    0.468294
qux  one   -0.667115
baz  one   -0.502417
foo  two    1.685553
bar  two   -1.611271
dtype: float64
s.sort_index()
s.sort_index(level=1)  # 默认是level=0排序
s.sort_index(level=0)
bar  one   -1.512430two   -1.611271
baz  one   -0.502417two    1.365155
foo  one    0.468294two    1.685553
qux  one   -0.667115two   -1.331225
dtype: float64bar  one   -1.512430
baz  one   -0.502417
foo  one    0.468294
qux  one   -0.667115
bar  two   -1.611271
baz  two    1.365155
foo  two    1.685553
qux  two   -1.331225
dtype: float64bar  one   -1.512430two   -1.611271
baz  one   -0.502417two    1.365155
foo  one    0.468294two    1.685553
qux  one   -0.667115two   -1.331225
dtype: float64
# level参数除了可以用整型序号,还可以使用层级的names
s.index.set_names(['L1', 'L2'], inplace=True)
s
s.sort_index(level="L1")
s.sort_index(level="L2")
L1   L2 
baz  two    1.365155
qux  two   -1.331225
bar  one   -1.512430
foo  one    0.468294
qux  one   -0.667115
baz  one   -0.502417
foo  two    1.685553
bar  two   -1.611271
dtype: float64L1   L2 
bar  one   -1.512430two   -1.611271
baz  one   -0.502417two    1.365155
foo  one    0.468294two    1.685553
qux  one   -0.667115two   -1.331225
dtype: float64L1   L2 
bar  one   -1.512430
baz  one   -0.502417
foo  one    0.468294
qux  one   -0.667115
bar  two   -1.611271
baz  two    1.365155
foo  two    1.685553
qux  two   -1.331225
dtype: float64
# 可以指定排序的轴方向
df.T
df.T.sort_index(level=1, axis=1)
onezero
yxyx
0-1.3885420.240534-0.848351-0.212248
1-1.170054-0.656707-1.3948710.051445
zeroonezeroone
xxyy
0-0.2122480.240534-0.848351-1.388542
10.051445-0.656707-1.394871-1.170054
# 即使数据没有排序也可以索引,但是这样效率低下。
# 返回值是拷贝
dfm = pd.DataFrame({'jim': [0, 0, 1, 1],'joe': ['x', 'x', 'z', 'y'],'jolie': np.random.rand(4)})
dfm
jimjoejolie
00x0.844228
10x0.317508
21z0.413824
31y0.074264
dfm = dfm.set_index(["jim", "joe"])
dfm
jolie
jimjoe
0x0.844228
x0.317508
1z0.413824
y0.074264
dfm.loc[(1, "z")]  # 会提示PerformanceWarning
d:python36-64libsite-packagesipykernel_launcher.py:1: PerformanceWarning: indexing past lexsort depth may impact performance."""Entry point for launching an IPython kernel.
jolie
jimjoe
1z0.413824
# dfm.loc[(0,'y'):(1, 'z')]  # 错误 无法定位
dfm.index.is_lexsorted()
dfm.index.lexsort_depth
False1
dfm = dfm.sort_index()  # 索引排序,默认对所有层级
dfm
jolie
jimjoe
0x0.844228
x0.317508
1y0.074264
z0.413824
dfm.index.is_lexsorted()
dfm.index.lexsort_depth
dfm.loc[(0,'y'):(1, 'z')]
True2
jolie
jimjoe
1y0.074264
z0.413824
# Take Methods take 方法 (拿、取)
# 与numpy的数组类似,pandas的index、series、Dataframe也提供take方法
# 用来检索给定轴方向上给定的指数indices(必须是整数列表或者整数数组,可以是负整数)
# 在性能方面,由于take方法管理了一个更窄的输入范围,它能提供比花式索引(fancy indexing)更快的性能
index = pd.Index(np.random.randint(0, 1000, 10))
index
Int64Index([523, 532, 386, 998, 832, 71, 965, 274, 389, 59], dtype='int64')
positions = [0, 9, 3]
index[positions]
index.take(positions)
Int64Index([523, 59, 998], dtype='int64')Int64Index([523, 59, 998], dtype='int64')
ser = pd.Series(np.random.randn(10))
ser
0    0.733196
1    0.975773
2   -0.261602
3   -0.055134
4    0.959253
5    1.189025
6   -0.434102
7    0.653628
8    0.248894
9   -0.203562
dtype: float64
ser.iloc[positions]
ser.take(positions)
0    0.733196
9   -0.203562
3   -0.055134
dtype: float640    0.733196
9   -0.203562
3   -0.055134
dtype: float64
# 对DataFrame,indices应该是一个一维 的列表或数组,规定了行或列的位置
frm = pd.DataFrame(np.random.randn(5, 3))
frm
012
0-0.722107-1.7582710.580805
10.555332-0.856173-1.143862
2-0.6369941.3123400.046131
3-0.1548130.3119310.933192
4-1.277001-0.144097-1.871135
frm.take([1, 4, 3])  # 默认取行方向
frm.take([0, 2], axis=1)
012
10.555332-0.856173-1.143862
4-1.277001-0.144097-1.871135
3-0.1548130.3119310.933192
02
0-0.7221070.580805
10.555332-1.143862
2-0.6369940.046131
3-0.1548130.933192
4-1.277001-1.871135
# 注意:take方法不要用于布尔indices
arr = np.random.randn(10)
arr
array([-0.00772525,  0.95419469,  1.80636718, -2.46742236, -0.025503  ,0.44203691,  0.48626739, -0.74160374, -0.22453771,  0.8813933 ])
arr.take([False, False, True, True])  # 相当于取了[0,0,1,1]
arr[[0, 1]]
array([-0.00772525, -0.00772525,  0.95419469,  0.95419469])array([-0.00772525,  0.95419469])
ser = pd.Series(np.random.randn(10))
ser
0    1.782426
1    0.531882
2   -0.339277
3    0.500497
4   -0.333816
5   -1.713753
6   -0.125252
7   -0.857100
8    0.385080
9    1.247962
dtype: float64
ser.take([False, False, True, True])  # 相当于取了[0,0,1,1]
ser.iloc[[0, 1]]
0    1.782426
0    1.782426
1    0.531882
1    0.531882
dtype: float640    1.782426
1    0.531882
dtype: float64
# Index Types 索引 index 对象
# 其他一些索引对象
# CategoricalIndex   绝对索引?类别索引?
# 用于支持重复的索引
from pandas.api.types import CategoricalDtype
df = pd.DataFrame({'A': np.arange(6),'B': list('aabbca')})
df
AB
00a
11a
22b
33b
44c
55a
df['B'] = df['B'].astype(CategoricalDtype(list('cab')))
df
df.dtypes
df.B.cat.categories
AB
00a
11a
22b
33b
44c
55a
A       int32
B    category
dtype: objectIndex(['c', 'a', 'b'], dtype='object')
df2 = df.set_index('B')
df2
df2.index
A
B
a0
a1
b2
b3
c4
a5
CategoricalIndex(['a', 'a', 'b', 'b', 'c', 'a'], categories=['c', 'a', 'b'], ordered=False, name='B', dtype='category')
# 使用 __getitem__/.iloc/.loc 索引时,索引对象 必须 在类别里面,否则操作将抛出异常
df2.loc['a']
df2.loc['a'].index  # 保留了全部的 CategoricalIndex 
df2.sort_index()  # 按照categories给定的顺序排序
A
B
a0
a1
a5
CategoricalIndex(['a', 'a', 'a'], categories=['c', 'a', 'b'], ordered=False, name='B', dtype='category')
A
B
c4
a0
a1
a5
b2
b3
<pandas.core.groupby.DataFrameGroupBy object at 0x00000000117AC710>
A
B
c4
a6
b5
CategoricalIndex(['c', 'a', 'b'], categories=['c', 'a', 'b'], ordered=False, name='B', dtype='category')
A
B
a0.0
a1.0
a5.0
eNaN
Index(['a', 'a', 'a', 'e'], dtype='object', name='B')
A
B
a0.0
a1.0
a5.0
eNaN
CategoricalIndex(['a', 'a', 'a', 'e'], categories=['a', 'b', 'c', 'd', 'e'], ordered=False, name='B', dtype='category')
# 注意:变形和比较操作必须有同样的categories,否则报错
# Int64Index and RangeIndex
# Int64Index 是pandas基础索引。
# RangeIndex是Int64Index的一个子集,现在作为所有NDFrame对象的默认索引。
# Float64Index 当创建索引index时,传入浮点数或者浮点与整数混合值,就默认是Float64Index
indexf = pd.Index([1.5, 2, 3, 4.5, 5])
indexf
Float64Index([1.5, 2.0, 3.0, 4.5, 5.0], dtype='float64')
sf = pd.Series(range(5), index=indexf)
sf
1.5    0
2.0    1
3.0    2
4.5    3
5.0    4
dtype: int64
# [] .loc 基于 label,整数将被转为浮点值
sf[1.5:4.5]
sf[1:4]
sf.loc[3]  # label,不是位置索引
# sf[3.2]  # 错误,传入值必须在labels中
1.5    0
2.0    1
3.0    2
4.5    3
dtype: int641.5    0
2.0    1
3.0    2
dtype: int642
sf.iloc[3]  # 基于位置,传入整数,不能传入浮点数
3
# 例子: 有不规则的数据表,其索引类似时间间隔,但数值是浮点型的
dfir1 = pd.DataFrame(np.random.randn(5,2),index=np.arange(5) * 250.0,columns=list('AB'))
dfir1
dfir2 = pd.DataFrame(np.random.randn(6,2),index=np.arange(4,10) * 250.1,columns=list('AB'))
dfir2
dfir = pd.concat([dfir1,dfir2])
dfir
AB
0.01.1584610.595743
250.01.4575560.268541
500.0-0.437650-0.299700
750.0-1.095812-2.079684
1000.00.242220-0.868812
AB
1000.4-0.858327-0.364968
1250.5-1.445806-2.129608
1500.60.7990491.232102
1750.7-1.1325380.283472
2000.8-1.1578840.398119
2250.9-1.330821-0.563333
AB
0.01.1584610.595743
250.01.4575560.268541
500.0-0.437650-0.299700
750.0-1.095812-2.079684
1000.00.242220-0.868812
1000.4-0.858327-0.364968
1250.5-1.445806-2.129608
1500.60.7990491.232102
1750.7-1.1325380.283472
2000.8-1.1578840.398119
2250.9-1.330821-0.563333
# 选取第1秒前的数据
dfir[:1000]
AB
0.01.1584610.595743
250.01.4575560.268541
500.0-0.437650-0.299700
750.0-1.095812-2.079684
1000.00.242220-0.868812
# IntervalIndex  区间索引  (数学上的开闭区间)
df = pd.DataFrame({'A': [1, 2, 3, 4]},index=pd.IntervalIndex.from_breaks([0, 1, 2, 3, 4]))
df
A
(0, 1]1
(1, 2]2
(2, 3]3
(3, 4]4
df.loc[2]  # loc 可以是区间的边缘
df.loc[2.5]
df.loc[1.5:2.5]
A    2
Name: (1, 2], dtype: int64A    3
Name: (2, 3], dtype: int64
A
(1, 2]2
(2, 3]3
# Interval and IntervalIndex are used by cut and qcut
# cut 和 qcut 的返回结果使用了区间类型 Interval 和区间索引 IntervalIndex
c = pd.cut(range(4), bins=2)
c
c.categories
[(-0.003, 1.5], (-0.003, 1.5], (1.5, 3.0], (1.5, 3.0]]
Categories (2, interval[float64]): [(-0.003, 1.5] < (1.5, 3.0]]IntervalIndex([(-0.003, 1.5], (1.5, 3.0]]closed='right',dtype='interval[float64]')
pd.cut([0, 3, 5, 1], bins=c.categories)  # 允许一个interval类型去bin(分隔)其他数据
[(-0.003, 1.5], (1.5, 3.0], NaN, (-0.003, 1.5]]
Categories (2, interval[float64]): [(-0.003, 1.5] < (1.5, 3.0]]
# Miscellaneous indexing FAQ 杂项 常见问题
# Integer indexing  整数型索引
# 整型的索引是label,应满足label的要求
# 在pandas中,一般认为标签label比整数位置更重要。
s = pd.Series(range(5))
s
# s[-1]  # 异常
# s.loc[-1]  # 异常
s.loc[-1:]  # 允许
s.iloc[-1]  # 允许
df = pd.DataFrame(np.random.randn(5, 4))
df
df.loc[-2:]
# df.loc[-2]  # 异常
0    0
1    1
2    2
3    3
4    4
dtype: int640    0
1    1
2    2
3    3
4    4
dtype: int644
0123
00.0210330.127054-0.864734-1.835828
1-0.4006110.594981-1.758866-1.059539
2-0.1085970.7840000.306035-0.695933
3-0.078048-1.742895-0.1597400.934115
4-0.5246330.433224-0.7323340.442827
0123
00.0210330.127054-0.864734-1.835828
1-0.4006110.594981-1.758866-1.059539
2-0.1085970.7840000.306035-0.695933
3-0.078048-1.742895-0.1597400.934115
4-0.5246330.433224-0.7323340.442827
# Non-monotonic indexes require exact matches 非单调索引要求精确匹配
# 如果series或Dataframe的索引是单调增或单调减的,则基于标签的切片可以超出索引的范围。
# 就像对一般python列表list的索引切片。
# 可以用is_monotonic_increasing和is_monotonic_decreasing测试单调属性
df = pd.DataFrame(index=[2,3,3,4,5], columns=['data'], data=list(range(5)))
df
data
20
31
32
43
54
df.index.is_monotonic_increasing
df.index.is_monotonic_decreasing
TrueFalse
df.loc[0:4, :]  # 没有0和1行,但是返回了label为2、3、4的行
df.loc[13:15, :]  # 超出界限,返回空
data
20
31
32
43
# 非单调索引,切片必须在index内,而且边界的值必须是唯一的
df = pd.DataFrame(index=[2,3,1,4,3,5], columns=['data'], data=list(range(6)))
df
df.index.is_monotonic_increasing
data
20
31
12
43
34
55
False
df.loc[2:4, :]
# df.loc[0:4, :]  # 错误没有0标签
# df.loc[2:3, :]  # 错误,边界标签3不是唯一的
data
20
31
12
43
# Index.is_monotonic_increasing() and Index.is_monotonic_decreasing() 只检测弱单调(可以有重复值)
# 结合使用 Index.is_unique  可以检测严格单调性
weakly_monotonic = pd.Index(['a', 'b', 'c', 'c'])
weakly_monotonic
weakly_monotonic.is_monotonic_increasing
weakly_monotonic.is_monotonic_increasing & weakly_monotonic.is_unique
Index(['a', 'b', 'c', 'c'], dtype='object')TrueFalse
# Endpoints are inclusive 端点(边界)包括在内
# 与标准的python切片(不包括右端点值)相比,pandas中的标签切片包含端点值。
# 主要原因是经常不可能轻易断定 在索引的局部标签后 的 后继或者下一个元素
s = pd.Series(np.random.randn(6), index=list('abcdef'))
s
a    1.280483
b    1.562738
c    0.904503
d   -0.470785
e   -0.008048
f   -0.413812
dtype: float64
s[2:5]  # 基于整型的索引,与既有标签的不同,不包括右端点
c    0.904503
d   -0.470785
e   -0.008048
dtype: float64
# 如果用标签,不容易取得下一个标签
# s.loc['c':'e'+1]  # 错误
s.loc['c':'e']
c    0.904503
d   -0.470785
e   -0.008048
dtype: float64
# Indexing potentially changes underlying Series dtype 
# 在series类型下索引可能出现变化
# The different indexing operation can potentially change the dtype of a Series.
# 不同的索引操作可能会潜在的改变series的类型
series1 = pd.Series([1, 2, 3])
series1.dtype  # int
series1
res = series1.reindex([0, 4])
res.dtype  # float
res
dtype('int64')0    1
1    2
2    3
dtype: int64dtype('float64')0    1.0
4    NaN
dtype: float64
series2 = pd.Series([True])
series2.dtype  # 布尔类型
series2
res = series2.reindex_like(series1)
res.dtype  # 'O' 型(object,对象类型)
res
dtype('bool')0    True
dtype: booldtype('O')0    True
1     NaN
2     NaN
dtype: object
# 由于默认插入NaN,引起了dtype的改变。
# 这会导致一些问题,当使用如 numpy.logical_and. 的np ufuncs 时

#  2018-02-22

转载于:.html

本文发布于:2024-01-28 13:25:18,感谢您对本站的认可!

本文链接:https://www.4u4v.net/it/17064195237727.html

版权声明:本站内容均来自互联网,仅供演示用,请勿用于商业和其他非法用途。如果侵犯了您的权益请与我们联系,我们将在24小时内删除。

标签:索引   pandas
留言与评论(共有 0 条评论)
   
验证码:

Copyright ©2019-2022 Comsenz Inc.Powered by ©

网站地图1 网站地图2 网站地图3 网站地图4 网站地图5 网站地图6 网站地图7 网站地图8 网站地图9 网站地图10 网站地图11 网站地图12 网站地图13 网站地图14 网站地图15 网站地图16 网站地图17 网站地图18 网站地图19 网站地图20 网站地图21 网站地图22 网站地图23