pandas索引

阅读：评论：0

pandas索引

# .html
# MultiIndex / Advanced Indexing
# pandas 0.22.0

import pandas as pd
import numpy as np
import random  # shuffle `tuples` only after it is defined; calling random.shuffle(tuples) here raises NameError

# Hierarchical indexing (MultiIndex)  分层索引 多重索引
# 创建多重索引对象，如同标准的索引类，他们存放轴axis标签labels
# 创建方式：
# from a list of arrays -- using MultiIndex.from_arrays
# from an array of tuples -- using MultiIndex.from_tuples
# from a crossed set of iterables -- using MultiIndex.from_product
arrays = [['bar', 'bar', 'baz', 'baz', 'foo', 'foo', 'qux', 'qux'],['one', 'two', 'one', 'two', 'one', 'two', 'one', 'two']]
arrays
# *arrays
zip(*arrays)  # 这里的* 是“解开”list ， list[0] list[1] ……
list(zip(*arrays))
list(zip(arrays[0], arrays[1]))
tuples = list(zip(*arrays))
tuples

[['bar', 'bar', 'baz', 'baz', 'foo', 'foo', 'qux', 'qux'],['one', 'two', 'one', 'two', 'one', 'two', 'one', 'two']]<zip at 0xf96f888>[('bar', 'one'),('bar', 'two'),('baz', 'one'),('baz', 'two'),('foo', 'one'),('foo', 'two'),('qux', 'one'),('qux', 'two')][('bar', 'one'),('bar', 'two'),('baz', 'one'),('baz', 'two'),('foo', 'one'),('foo', 'two'),('qux', 'one'),('qux', 'two')][('bar', 'one'),('bar', 'two'),('baz', 'one'),('baz', 'two'),('foo', 'one'),('foo', 'two'),('qux', 'one'),('qux', 'two')]

index = pd.MultiIndex.from_tuples(tuples, names=['first', 'second'])
index

MultiIndex(levels=[['bar', 'baz', 'foo', 'qux'], ['one', 'two']],labels=[[0, 0, 1, 1, 2, 2, 3, 3], [0, 1, 0, 1, 0, 1, 0, 1]],names=['first', 'second'])

s = pd.Series(np.random.randn(8), index=index)
s

first  second
bar    one       0.271000two      -1.276230
baz    one      -1.018103two      -0.620292
foo    one       1.008070two       0.759145
qux    one      -2.141050two      -0.927688
dtype: float64

# 更简洁的方式, 当每个元素对都来自于可迭代对象时
iterables = [['bar', 'baz', 'foo', 'qux'], ['one', 'two']]  # 创建了与上面相同的index
pd.MultiIndex.from_product(iterables, names=['first', 'second'])  # 多重索引可以接受命名，默认为None

MultiIndex(levels=[['bar', 'baz', 'foo', 'qux'], ['one', 'two']],labels=[[0, 0, 1, 1, 2, 2, 3, 3], [0, 1, 0, 1, 0, 1, 0, 1]],names=['first', 'second'])

# 也可以直接在创建df或series时传入矩阵array 的 列表list
arrays = [np.array(['bar', 'bar', 'baz', 'baz', 'foo', 'foo', 'qux', 'qux']),np.array(['one', 'two', 'one', 'two', 'one', 'two', 'one', 'two'])]
arrays

[array(['bar', 'bar', 'baz', 'baz', 'foo', 'foo', 'qux', 'qux'],dtype='<U3'),array(['one', 'two', 'one', 'two', 'one', 'two', 'one', 'two'],dtype='<U3')]

s = pd.Series(np.random.randn(8), index=arrays)
s

bar  one    0.817791two    0.510420
baz  one   -0.494160two   -0.529997
foo  one    0.641282two   -0.202762
qux  one    0.050320two    2.097300
dtype: float64

df = pd.DataFrame(np.random.randn(8, 4), index=arrays)
df

		0	1	2	3
bar	one	-2.090989	0.001052	1.467637	0.267938
bar	two	1.224610	0.851894	0.765531	-0.505116
baz	one	1.444246	-0.247795	0.267462	-0.945641
baz	two	0.836046	0.274732	0.530525	-0.560081
foo	one	-3.709465	-0.157089	0.608778	-0.003217
foo	two	-0.848818	1.478306	-0.389401	-1.205956
qux	one	-1.069775	1.272440	-0.797613	-0.194223
qux	two	1.597218	0.454815	-0.756022	0.481038

# 可以对不同的轴向（如行索引/行名或列索引/列名）设置
df = pd.DataFrame(np.random.randn(3, 8), index=['A', 'B', 'C'], columns=index)
df

first	bar		baz		foo		qux
second	one	two	one	two	one	two	one	two
A	-1.608162	-0.007312	1.048244	-0.029907	-0.437866	-1.853398	2.026875	0.359521
B	1.207609	-0.272366	-0.530191	-0.689641	-0.244362	-1.476252	0.818493	0.353771
C	-0.369463	1.862253	-0.118297	-0.148326	1.147616	-1.389965	0.817716	0.787394

# 同时对两个方向设置index，注意index的长度与数据在不同方向上的长度
pd.DataFrame(np.random.randn(6, 6), index=index[:6], columns=index[:6])

	first	bar		baz		foo
	second	one	two	one	two	one	two
first	second
bar	one	0.020204	-0.549089	0.381830	0.326558	-1.420590	-1.551863
bar	two	1.311775	2.294908	0.203981	1.381199	-0.743387	2.119027
baz	one	0.640856	1.089627	-1.463503	0.727607	-0.959549	-0.037316
baz	two	-0.906859	-0.720702	0.862614	0.082066	0.209276	-0.391039
foo	one	-0.328704	-1.015117	0.279826	0.141166	-0.053601	-1.171920
foo	two	0.342074	-0.196049	-0.387946	0.196228	-1.264932	0.144251

pd.Series(np.random.randn(8), index=tuples)  # 多重索引，相当于元组index

(bar, one)   -0.267177
(bar, two)   -0.239632
(baz, one)    1.212249
(baz, two)    0.289517
(foo, one)    1.311922
(foo, two)   -0.797733
(qux, one)   -1.395485
(qux, two)   -0.451327
dtype: float64

# 可以控制索引的显示方式，通过 pandas.set_option() 设置 display.multi_sparse 选项
pd.set_option('display.multi_sparse', False)
df
pd.set_option('display.multi_sparse', True)
df

first	bar	bar	baz	baz	foo	foo	qux	qux
second	one	two	one	two	one	two	one	two
A	-1.608162	-0.007312	1.048244	-0.029907	-0.437866	-1.853398	2.026875	0.359521
B	1.207609	-0.272366	-0.530191	-0.689641	-0.244362	-1.476252	0.818493	0.353771
C	-0.369463	1.862253	-0.118297	-0.148326	1.147616	-1.389965	0.817716	0.787394

first	bar		baz		foo		qux
second	one	two	one	two	one	two	one	two
A	-1.608162	-0.007312	1.048244	-0.029907	-0.437866	-1.853398	2.026875	0.359521
B	1.207609	-0.272366	-0.530191	-0.689641	-0.244362	-1.476252	0.818493	0.353771
C	-0.369463	1.862253	-0.118297	-0.148326	1.147616	-1.389965	0.817716	0.787394

# Reconstructing the level labels 重建层级标签
# The method get_level_values will return a vector of the labels for each location at a particular level:
# get_level_values 方法返回指定层级的标签向量
index.get_level_values(0)  # select the level by integer position
index.get_level_values("second")  # select the level by name

Index(['bar', 'bar', 'baz', 'baz', 'foo', 'foo', 'qux', 'qux'], dtype='object', name='first')Index(['one', 'two', 'one', 'two', 'one', 'two', 'one', 'two'], dtype='object', name='second')

# Basic indexing on axis with MultiIndex # 在轴方向的基础索引
df['bar']
df['bar', 'one']
df['bar']['one']  # 不建议使用，链式
s['qux']

second	one	two
A	-1.608162	-0.007312
B	1.207609	-0.272366
C	-0.369463	1.862253

A   -1.608162
B    1.207609
C   -0.369463
Name: (bar, one), dtype: float64A   -1.608162
B    1.207609
C   -0.369463
Name: one, dtype: float64one    0.05032
two    2.09730
dtype: float64

# Defined Levels 指定层级
df.columns  # 原index
df[['foo','qux']].columns  # 切片后的结果，层级levels中的项目没有减少，labels减少了
# 这样做避免了重新计算层级，使切片保持高效

MultiIndex(levels=[['bar', 'baz', 'foo', 'qux'], ['one', 'two']],labels=[[0, 0, 1, 1, 2, 2, 3, 3], [0, 1, 0, 1, 0, 1, 0, 1]],names=['first', 'second'])MultiIndex(levels=[['bar', 'baz', 'foo', 'qux'], ['one', 'two']],labels=[[2, 2, 3, 3], [0, 1, 0, 1]],names=['first', 'second'])

# 查看切片实际选择levels
df[['foo','qux']].columns.values
df[['foo','qux']].columns.get_level_values(0)  # level-0 labels actually present in the sliced columns

array([('foo', 'one'), ('foo', 'two'), ('qux', 'one'), ('qux', 'two')],dtype=object)Index(['foo', 'foo', 'qux', 'qux'], dtype='object', name='first')

# 用有效的used层级重建多重索引
df[['foo','qux']].columns.remove_unused_levels()  # new MultiIndex whose levels keep only the labels still in use

MultiIndex(levels=[['foo', 'qux'], ['one', 'two']],labels=[[0, 0, 1, 1], [0, 1, 0, 1]],names=['first', 'second'])

# Data alignment and using reindex 数据对齐和使用reindex
s

bar  one    0.817791two    0.510420
baz  one   -0.494160two   -0.529997
foo  one    0.641282two   -0.202762
qux  one    0.050320two    2.097300
dtype: float64

# 当两个index不同的对象计算时与一般的index一样
s + s[:-2]
s + s[::2]

bar  one    1.635582two    1.020840
baz  one   -0.988319two   -1.059993
foo  one    1.282563two   -0.405523
qux  one         NaNtwo         NaN
dtype: float64bar  one    1.635582two         NaN
baz  one   -0.988319two         NaN
foo  one    1.282563two         NaN
qux  one    0.100640two         NaN
dtype: float64

# reindex 可以被另外一个 multiindex 或者 元组的list 或 array 调用
index
index[:3]
s.reindex(index[:3])
s.reindex([('foo', 'two'), ('bar', 'one'), ('qux', 'one'), ('baz', 'one')])

MultiIndex(levels=[['bar', 'baz', 'foo', 'qux'], ['one', 'two']],labels=[[0, 0, 1, 1, 2, 2, 3, 3], [0, 1, 0, 1, 0, 1, 0, 1]],names=['first', 'second'])MultiIndex(levels=[['bar', 'baz', 'foo', 'qux'], ['one', 'two']],labels=[[0, 0, 1], [0, 1, 0]],names=['first', 'second'])first  second
bar    one       0.817791two       0.510420
baz    one      -0.494160
dtype: float64foo  two   -0.202762
bar  one    0.817791
qux  one    0.050320
baz  one   -0.494160
dtype: float64

# Advanced indexing with hierarchical index
# 使用层次索引的高级索引方法
df = df.T
df

		A	B	C
first	second
bar	one	-1.608162	1.207609	-0.369463
bar	two	-0.007312	-0.272366	1.862253
baz	one	1.048244	-0.530191	-0.118297
baz	two	-0.029907	-0.689641	-0.148326
foo	one	-0.437866	-0.244362	1.147616
foo	two	-1.853398	-1.476252	-1.389965
qux	one	2.026875	0.818493	0.817716
qux	two	0.359521	0.353771	0.787394

# .loc 定位
df.loc["bar"]
df.loc["bar", "two"]  # 返回了一个series（不是一行，而是“一列”），其索引是原df的列名

	A	B	C
second
one	-1.608162	1.207609	-0.369463
two	-0.007312	-0.272366	1.862253

A   -0.007312
B   -0.272366
C    1.862253
Name: (bar, two), dtype: float64

# loc 中使用切片，切片的值可以是元组
df.loc['baz':'foo']
df.loc[('baz', 'two'):('qux', 'one')]
df.loc[('baz', 'two'):'foo']

		A	B	C
first	second
baz	one	1.048244	-0.530191	-0.118297
baz	two	-0.029907	-0.689641	-0.148326
foo	one	-0.437866	-0.244362	1.147616
foo	two	-1.853398	-1.476252	-1.389965

		A	B	C
first	second
baz	two	-0.029907	-0.689641	-0.148326
foo	one	-0.437866	-0.244362	1.147616
foo	two	-1.853398	-1.476252	-1.389965
qux	one	2.026875	0.818493	0.817716

		A	B	C
first	second
baz	two	-0.029907	-0.689641	-0.148326
foo	one	-0.437866	-0.244362	1.147616
foo	two	-1.853398	-1.476252	-1.389965

# 可以给loc传入元组或标签列表，取得不连续的索引
df.loc[[('bar', 'two'), ('qux', 'one')]]

		A	B	C
first	second
bar	two	-0.007312	-0.272366	1.862253
qux	one	2.026875	0.818493	0.817716

# Using slicers 使用切片
# 可以用多重索引对象进行切片。可以用 切片值、 标签或标签列表、 布尔索引等选择器
# 可以用slice(None) 选择那一级的所有的内容。不用特别指定所有深度的级别，他们默认是slice(None)
# 注意：使用loc应该规定所有的轴方向，包括行index和列columns。
# 推荐方式：df.loc[(slice('A1','A3'),.....), :]   注意 冒号前面的逗号，逗号前表示行方向切片（选择器），逗号后面表示列方向切片（选择器）
# 不推荐：  df.loc[(slice('A1','A3'),.....)]  可能产生歧义

def mklbl(prefix, n):
    """Return ``n`` labels formed by appending 0..n-1 to ``prefix``.

    Example: mklbl("a", 3) --> ['a0', 'a1', 'a2']

    Args:
        prefix: value placed at the start of every label.
        n: number of labels to generate; 0 yields an empty list.

    Returns:
        A list of ``n`` strings.
    """
    return [f"{prefix}{i}" for i in range(n)]

mklbl("a", 3)

['a0', 'a1', 'a2']

miindex = pd.MultiIndex.from_product([mklbl('A',4),mklbl('B',2),mklbl('C',4),mklbl('D',2)])
miindex  # 由列表生成4重（4级）索引对象，共生成4*2*4*2=64行

MultiIndex(levels=[['A0', 'A1', 'A2', 'A3'], ['B0', 'B1'], ['C0', 'C1', 'C2', 'C3'], ['D0', 'D1']],labels=[[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3], [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1], [0, 0, 1, 1, 2, 2, 3, 3, 0, 0, 1, 1, 2, 2, 3, 3, 0, 0, 1, 1, 2, 2, 3, 3, 0, 0, 1, 1, 2, 2, 3, 3, 0, 0, 1, 1, 2, 2, 3, 3, 0, 0, 1, 1, 2, 2, 3, 3, 0, 0, 1, 1, 2, 2, 3, 3, 0, 0, 1, 1, 2, 2, 3, 3], [0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1]])

micolumns = pd.MultiIndex.from_tuples([('a','foo'),('a','bar'),('b','foo'),('b','bah')],names=['lvl0', 'lvl1'])
micolumns  # 由元组生成2重索引对象，共4行，并对2重（两级）分别命名

MultiIndex(levels=[['a', 'b'], ['bah', 'bar', 'foo']],labels=[[0, 0, 1, 1], [2, 1, 2, 0]],names=['lvl0', 'lvl1'])

row_l = len(miindex)
col_l = len(micolumns)
dfmi = pd.DataFrame(np.arange(row_l * col_l).reshape((row_l, col_l)),index=miindex,columns=micolumns).sort_index().sort_index(axis=1)
dfmi

			lvl0	a		b
			lvl1	bar	foo	bah	foo
A0	B0	C0	D0	1	0	3	2
		C0	D1	5	4	7	6
		C1	D0	9	8	11	10
		C1	D1	13	12	15	14
		C2	D0	17	16	19	18
		C2	D1	21	20	23	22
		C3	D0	25	24	27	26
		C3	D1	29	28	31	30
	B1	C0	D0	33	32	35	34
		C0	D1	37	36	39	38
		C1	D0	41	40	43	42
		C1	D1	45	44	47	46
		C2	D0	49	48	51	50
		C2	D1	53	52	55	54
		C3	D0	57	56	59	58
		C3	D1	61	60	63	62
A1	B0	C0	D0	65	64	67	66
		C0	D1	69	68	71	70
		C1	D0	73	72	75	74
		C1	D1	77	76	79	78
		C2	D0	81	80	83	82
		C2	D1	85	84	87	86
		C3	D0	89	88	91	90
		C3	D1	93	92	95	94
	B1	C0	D0	97	96	99	98
		C0	D1	101	100	103	102
		C1	D0	105	104	107	106
		C1	D1	109	108	111	110
		C2	D0	113	112	115	114
		C2	D1	117	116	119	118
...	...	...	...	...	...	...	...
A2	B0	C1	D0	137	136	139	138
		C1	D1	141	140	143	142
		C2	D0	145	144	147	146
		C2	D1	149	148	151	150
		C3	D0	153	152	155	154
		C3	D1	157	156	159	158
	B1	C0	D0	161	160	163	162
		C0	D1	165	164	167	166
		C1	D0	169	168	171	170
		C1	D1	173	172	175	174
		C2	D0	177	176	179	178
		C2	D1	181	180	183	182
		C3	D0	185	184	187	186
		C3	D1	189	188	191	190
A3	B0	C0	D0	193	192	195	194
		C0	D1	197	196	199	198
		C1	D0	201	200	203	202
		C1	D1	205	204	207	206
		C2	D0	209	208	211	210
		C2	D1	213	212	215	214
		C3	D0	217	216	219	218
		C3	D1	221	220	223	222
	B1	C0	D0	225	224	227	226
		C0	D1	229	228	231	230
		C1	D0	233	232	235	234
		C1	D1	237	236	239	238
		C2	D0	241	240	243	242
		C2	D1	245	244	247	246
		C3	D0	249	248	251	250
		C3	D1	253	252	255	254

64 rows × 4 columns

# Basic multi-index slicing using slices, lists, and labels.
dfmi.loc[(slice('A1','A3'), slice(None), ['C1', 'C3']), :]  # slice('A1','A3') 相当于 ['A1':'A3']
# dfmi.loc[(slice('A1','A3'), slice(None), ['C1', 'C3']) :]  # 错误 冒号前面必须有逗号

			lvl0	a		b
			lvl1	bar	foo	bah	foo
A1	B0	C1	D0	73	72	75	74
		C1	D1	77	76	79	78
		C3	D0	89	88	91	90
		C3	D1	93	92	95	94
	B1	C1	D0	105	104	107	106
		C1	D1	109	108	111	110
		C3	D0	121	120	123	122
		C3	D1	125	124	127	126
A2	B0	C1	D0	137	136	139	138
		C1	D1	141	140	143	142
		C3	D0	153	152	155	154
		C3	D1	157	156	159	158
	B1	C1	D0	169	168	171	170
		C1	D1	173	172	175	174
		C3	D0	185	184	187	186
		C3	D1	189	188	191	190
A3	B0	C1	D0	201	200	203	202
		C1	D1	205	204	207	206
		C3	D0	217	216	219	218
		C3	D1	221	220	223	222
	B1	C1	D0	233	232	235	234
		C1	D1	237	236	239	238
		C3	D0	249	248	251	250
		C3	D1	253	252	255	254

# You can use a pd.IndexSlice to have a more natural syntax using : 
# rather than using slice(None)
# 使用pd.IndexSlice 可以用冒号 : 代替 slice(None)
idx = pd.IndexSlice
dfmi.loc[idx[:, :, ['C1', 'C3']], idx[:, 'foo']]  # 默认必须是一个元组()来指定关键字，而且全选的话只能使用slice(None)

			lvl0	a	b
			lvl1	foo	foo
A0	B0	C1	D0	8	10
		C1	D1	12	14
		C3	D0	24	26
		C3	D1	28	30
	B1	C1	D0	40	42
		C1	D1	44	46
		C3	D0	56	58
		C3	D1	60	62
A1	B0	C1	D0	72	74
		C1	D1	76	78
		C3	D0	88	90
		C3	D1	92	94
	B1	C1	D0	104	106
		C1	D1	108	110
		C3	D0	120	122
		C3	D1	124	126
A2	B0	C1	D0	136	138
		C1	D1	140	142
		C3	D0	152	154
		C3	D1	156	158
	B1	C1	D0	168	170
		C1	D1	172	174
		C3	D0	184	186
		C3	D1	188	190
A3	B0	C1	D0	200	202
		C1	D1	204	206
		C3	D0	216	218
		C3	D1	220	222
	B1	C1	D0	232	234
		C1	D1	236	238
		C3	D0	248	250
		C3	D1	252	254

# 一次执行复杂选取
dfmi.loc['A1', (slice(None), 'foo')]

		lvl0	a	b
		lvl1	foo	foo
B0	C0	D0	64	66
	C0	D1	68	70
	C1	D0	72	74
	C1	D1	76	78
	C2	D0	80	82
	C2	D1	84	86
	C3	D0	88	90
	C3	D1	92	94
B1	C0	D0	96	98
	C0	D1	100	102
	C1	D0	104	106
	C1	D1	108	110
	C2	D0	112	114
	C2	D1	116	118
	C3	D0	120	122
	C3	D1	124	126

dfmi.loc[idx[:, :, ['C1', 'C3']], idx[:, 'foo']]

			lvl0	a	b
			lvl1	foo	foo
A0	B0	C1	D0	8	10
		C1	D1	12	14
		C3	D0	24	26
		C3	D1	28	30
	B1	C1	D0	40	42
		C1	D1	44	46
		C3	D0	56	58
		C3	D1	60	62
A1	B0	C1	D0	72	74
		C1	D1	76	78
		C3	D0	88	90
		C3	D1	92	94
	B1	C1	D0	104	106
		C1	D1	108	110
		C3	D0	120	122
		C3	D1	124	126
A2	B0	C1	D0	136	138
		C1	D1	140	142
		C3	D0	152	154
		C3	D1	156	158
	B1	C1	D0	168	170
		C1	D1	172	174
		C3	D0	184	186
		C3	D1	188	190
A3	B0	C1	D0	200	202
		C1	D1	204	206
		C3	D0	216	218
		C3	D1	220	222
	B1	C1	D0	232	234
		C1	D1	236	238
		C3	D0	248	250
		C3	D1	252	254

# Using a boolean indexer you can provide selection related to the values. 
# 使用布尔索引
mask = dfmi[('a', 'foo')] > 200
mask
dfmi.loc[idx[mask, :, ['C1', 'C3']], idx[:, 'foo']]

A0  B0  C0  D0    FalseD1    FalseC1  D0    FalseD1    FalseC2  D0    FalseD1    FalseC3  D0    FalseD1    FalseB1  C0  D0    FalseD1    FalseC1  D0    FalseD1    FalseC2  D0    FalseD1    FalseC3  D0    FalseD1    False
A1  B0  C0  D0    FalseD1    FalseC1  D0    FalseD1    FalseC2  D0    FalseD1    FalseC3  D0    FalseD1    FalseB1  C0  D0    FalseD1    FalseC1  D0    FalseD1    FalseC2  D0    FalseD1      
A2  B0  C1  D0    FalseD1    FalseC2  D0    FalseD1    FalseC3  D0    FalseD1    FalseB1  C0  D0    FalseD1    FalseC1  D0    FalseD1    FalseC2  D0    FalseD1    FalseC3  D0    FalseD1    False
A3  B0  C0  D0    FalseD1    FalseC1  D0    FalseD1     TrueC2  D0     TrueD1     TrueC3  D0     TrueD1     TrueB1  C0  D0     TrueD1     TrueC1  D0     TrueD1     TrueC2  D0     TrueD1     TrueC3  D0     TrueD1     True
Name: (a, foo), Length: 64, dtype: bool

			lvl0	a	b
			lvl1	foo	foo
A3	B0	C1	D1	204	206
		C3	D0	216	218
		C3	D1	220	222
	B1	C1	D0	232	234
		C1	D1	236	238
		C3	D0	248	250
		C3	D1	252	254

# 指定轴参数axis，说明传入的切片在一个轴上
dfmi.loc(axis=0)[:, :, ['C1', 'C3']]

			lvl0	a		b
			lvl1	bar	foo	bah	foo
A0	B0	C1	D0	9	8	11	10
		C1	D1	13	12	15	14
		C3	D0	25	24	27	26
		C3	D1	29	28	31	30
	B1	C1	D0	41	40	43	42
		C1	D1	45	44	47	46
		C3	D0	57	56	59	58
		C3	D1	61	60	63	62
A1	B0	C1	D0	73	72	75	74
		C1	D1	77	76	79	78
		C3	D0	89	88	91	90
		C3	D1	93	92	95	94
	B1	C1	D0	105	104	107	106
		C1	D1	109	108	111	110
		C3	D0	121	120	123	122
		C3	D1	125	124	127	126
A2	B0	C1	D0	137	136	139	138
		C1	D1	141	140	143	142
		C3	D0	153	152	155	154
		C3	D1	157	156	159	158
	B1	C1	D0	169	168	171	170
		C1	D1	173	172	175	174
		C3	D0	185	184	187	186
		C3	D1	189	188	191	190
A3	B0	C1	D0	201	200	203	202
		C1	D1	205	204	207	206
		C3	D0	217	216	219	218
		C3	D1	221	220	223	222
	B1	C1	D0	233	232	235	234
		C1	D1	237	236	239	238
		C3	D0	249	248	251	250
		C3	D1	253	252	255	254

# 可以使用这种指定轴方向的方式赋值
df2 = dfmi.copy()  # work on a copy so the assignment below does not modify dfmi
df2.loc(axis=0)[:, :, ['C1', 'C3']] = -10
df2

			lvl0	a		b
			lvl1	bar	foo	bah	foo
A0	B0	C0	D0	1	0	3	2
		C0	D1	5	4	7	6
		C1	D0	-10	-10	-10	-10
		C1	D1	-10	-10	-10	-10
		C2	D0	17	16	19	18
		C2	D1	21	20	23	22
		C3	D0	-10	-10	-10	-10
		C3	D1	-10	-10	-10	-10
	B1	C0	D0	33	32	35	34
		C0	D1	37	36	39	38
		C1	D0	-10	-10	-10	-10
		C1	D1	-10	-10	-10	-10
		C2	D0	49	48	51	50
		C2	D1	53	52	55	54
		C3	D0	-10	-10	-10	-10
		C3	D1	-10	-10	-10	-10
A1	B0	C0	D0	65	64	67	66
		C0	D1	69	68	71	70
		C1	D0	-10	-10	-10	-10
		C1	D1	-10	-10	-10	-10
		C2	D0	81	80	83	82
		C2	D1	85	84	87	86
		C3	D0	-10	-10	-10	-10
		C3	D1	-10	-10	-10	-10
	B1	C0	D0	97	96	99	98
		C0	D1	101	100	103	102
		C1	D0	-10	-10	-10	-10
		C1	D1	-10	-10	-10	-10
		C2	D0	113	112	115	114
		C2	D1	117	116	119	118
...	...	...	...	...	...	...	...
A2	B0	C1	D0	-10	-10	-10	-10
		C1	D1	-10	-10	-10	-10
		C2	D0	145	144	147	146
		C2	D1	149	148	151	150
		C3	D0	-10	-10	-10	-10
		C3	D1	-10	-10	-10	-10
	B1	C0	D0	161	160	163	162
		C0	D1	165	164	167	166
		C1	D0	-10	-10	-10	-10
		C1	D1	-10	-10	-10	-10
		C2	D0	177	176	179	178
		C2	D1	181	180	183	182
		C3	D0	-10	-10	-10	-10
		C3	D1	-10	-10	-10	-10
A3	B0	C0	D0	193	192	195	194
		C0	D1	197	196	199	198
		C1	D0	-10	-10	-10	-10
		C1	D1	-10	-10	-10	-10
		C2	D0	209	208	211	210
		C2	D1	213	212	215	214
		C3	D0	-10	-10	-10	-10
		C3	D1	-10	-10	-10	-10
	B1	C0	D0	225	224	227	226
		C0	D1	229	228	231	230
		C1	D0	-10	-10	-10	-10
		C1	D1	-10	-10	-10	-10
		C2	D0	241	240	243	242
		C2	D1	245	244	247	246
		C3	D0	-10	-10	-10	-10
		C3	D1	-10	-10	-10	-10

64 rows × 4 columns

# You can use a right-hand-side of an alignable object as well.
# 赋值语句的右侧也可以使用可对齐（alignable）的对象
df2 = dfmi.copy()  # start again from a fresh copy of dfmi
df2.loc[idx[:, :, ['C1', 'C3']], :] = df2 * 1000
df2

			lvl0	a		b
			lvl1	bar	foo	bah	foo
A0	B0	C0	D0	1	0	3	2
		C0	D1	5	4	7	6
		C1	D0	9000	8000	11000	10000
		C1	D1	13000	12000	15000	14000
		C2	D0	17	16	19	18
		C2	D1	21	20	23	22
		C3	D0	25000	24000	27000	26000
		C3	D1	29000	28000	31000	30000
	B1	C0	D0	33	32	35	34
		C0	D1	37	36	39	38
		C1	D0	41000	40000	43000	42000
		C1	D1	45000	44000	47000	46000
		C2	D0	49	48	51	50
		C2	D1	53	52	55	54
		C3	D0	57000	56000	59000	58000
		C3	D1	61000	60000	63000	62000
A1	B0	C0	D0	65	64	67	66
		C0	D1	69	68	71	70
		C1	D0	73000	72000	75000	74000
		C1	D1	77000	76000	79000	78000
		C2	D0	81	80	83	82
		C2	D1	85	84	87	86
		C3	D0	89000	88000	91000	90000
		C3	D1	93000	92000	95000	94000
	B1	C0	D0	97	96	99	98
		C0	D1	101	100	103	102
		C1	D0	105000	104000	107000	106000
		C1	D1	109000	108000	111000	110000
		C2	D0	113	112	115	114
		C2	D1	117	116	119	118
...	...	...	...	...	...	...	...
A2	B0	C1	D0	137000	136000	139000	138000
		C1	D1	141000	140000	143000	142000
		C2	D0	145	144	147	146
		C2	D1	149	148	151	150
		C3	D0	153000	152000	155000	154000
		C3	D1	157000	156000	159000	158000
	B1	C0	D0	161	160	163	162
		C0	D1	165	164	167	166
		C1	D0	169000	168000	171000	170000
		C1	D1	173000	172000	175000	174000
		C2	D0	177	176	179	178
		C2	D1	181	180	183	182
		C3	D0	185000	184000	187000	186000
		C3	D1	189000	188000	191000	190000
A3	B0	C0	D0	193	192	195	194
		C0	D1	197	196	199	198
		C1	D0	201000	200000	203000	202000
		C1	D1	205000	204000	207000	206000
		C2	D0	209	208	211	210
		C2	D1	213	212	215	214
		C3	D0	217000	216000	219000	218000
		C3	D1	221000	220000	223000	222000
	B1	C0	D0	225	224	227	226
		C0	D1	229	228	231	230
		C1	D0	233000	232000	235000	234000
		C1	D1	237000	236000	239000	238000
		C2	D0	241	240	243	242
		C2	D1	245	244	247	246
		C3	D0	249000	248000	251000	250000
		C3	D1	253000	252000	255000	254000

64 rows × 4 columns

# Cross-section 断面
# xs 方法 另外提供了一个级别level参数 用来选择多重索引中的部分级别
# xs 当提供轴参数时，也可用于列的选择
df

		A	B	C
first	second
bar	one	-1.608162	1.207609	-0.369463
bar	two	-0.007312	-0.272366	1.862253
baz	one	1.048244	-0.530191	-0.118297
baz	two	-0.029907	-0.689641	-0.148326
foo	one	-0.437866	-0.244362	1.147616
foo	two	-1.853398	-1.476252	-1.389965
qux	one	2.026875	0.818493	0.817716
qux	two	0.359521	0.353771	0.787394

df.xs("one", level="second")
df.loc[(slice(None), "one"), :]  # 使用切片得到同样的选择

	A	B	C
first
bar	-1.608162	1.207609	-0.369463
baz	1.048244	-0.530191	-0.118297
foo	-0.437866	-0.244362	1.147616
qux	2.026875	0.818493	0.817716

		A	B	C
first	second
bar	one	-1.608162	1.207609	-0.369463
baz	one	1.048244	-0.530191	-0.118297
foo	one	-0.437866	-0.244362	1.147616
qux	one	2.026875	0.818493	0.817716

df = df.T
df.xs('one', level='second', axis=1)  # 在列方向选择
df.loc[:, (slice(None),'one')]  # 使用切片方式

first	bar	baz	foo	qux
A	-1.608162	1.048244	-0.437866	2.026875
B	1.207609	-0.530191	-0.244362	0.818493
C	-0.369463	-0.118297	1.147616	0.817716

first	bar	baz	foo	qux
second	one	one	one	one
A	-1.608162	1.048244	-0.437866	2.026875
B	1.207609	-0.530191	-0.244362	0.818493
C	-0.369463	-0.118297	1.147616	0.817716

# xs 方法使用多重关键字keys，关键字元组
df.xs(('one', 'bar'), level=('second', 'first'), axis=1)  # 关键字元组可以与级层顺序不一致，与给定level顺序一致
df.loc[:, ("bar", "one")]
# df.loc[:, ("one", "bar")]  # 错误，元组元素的顺序与级层顺序一致

first	bar
second	one
A	-1.608162
B	1.207609
C	-0.369463

A   -1.608162
B    1.207609
C   -0.369463
Name: (bar, one), dtype: float64

# drop_level=False 参数可以使xs保留选定的层级，而不是舍弃，这样的话，与切片得到的结果完全相同
df.xs('one', level='second', axis=1, drop_level=False)  # 默认 drop_level=True
df.loc[:, (slice(None) ,"one")]

first	bar	baz	foo	qux
second	one	one	one	one
A	-1.608162	1.048244	-0.437866	2.026875
B	1.207609	-0.530191	-0.244362	0.818493
C	-0.369463	-0.118297	1.147616	0.817716

first	bar	baz	foo	qux
second	one	one	one	one
A	-1.608162	1.048244	-0.437866	2.026875
B	1.207609	-0.530191	-0.244362	0.818493
C	-0.369463	-0.118297	1.147616	0.817716

# Advanced reindexing and alignment 高级重建索引和对齐
# level参数可用于 reindex（重建索引）和 align（对齐）方法，用来按层级广播值。
midx = pd.MultiIndex(levels=[['zero', 'one'], ['x','y']],labels=[[1,1,0,0],[1,0,1,0]])  # 指定了labels,指定了层级间对应关系
midx

MultiIndex(levels=[['zero', 'one'], ['x', 'y']],labels=[[1, 1, 0, 0], [1, 0, 1, 0]])

df = pd.DataFrame(np.random.randn(4,2), index=midx)
df

		0	1
one	y	-1.388542	-1.170054
one	x	0.240534	-0.656707
zero	y	-0.848351	-1.394871
zero	x	-0.212248	0.051445

# 不同层级索引的广播计算
df2 = df.mean(level=1)
df2
df2 = df.mean(level=0)
df2

	0	1
y	-1.118447	-1.282462
x	0.014143	-0.302631

	0	1
one	-0.574004	-0.913381
zero	-0.530300	-0.671713

# 重构索引
df2.reindex(df.index, level=0)  # expand df2 onto df's full index, matching rows on level 0

		0	1
one	y	-0.574004	-0.913381
one	x	-0.574004	-0.913381
zero	y	-0.530300	-0.671713
zero	x	-0.530300	-0.671713

# 定位/对齐
df
df2
df.align(df2, level=0)
df_aligned, df2_aligned = df.align(df2, level=0)  # ??
df_aligned
df2_aligned

		0	1
one	y	-1.388542	-1.170054
one	x	0.240534	-0.656707
zero	y	-0.848351	-1.394871
zero	x	-0.212248	0.051445

	0	1
one	-0.574004	-0.913381
zero	-0.530300	-0.671713

(               0         1one  y -1.388542 -1.170054x  0.240534 -0.656707zero y -0.848351 -1.394871x -0.212248  0.051445,                0         1one  y -0.574004 -0.913381x -0.574004 -0.913381zero y -0.530300 -0.671713x -0.530300 -0.671713)

		0	1
one	y	-1.388542	-1.170054
one	x	0.240534	-0.656707
zero	y	-0.848351	-1.394871
zero	x	-0.212248	0.051445

		0	1
one	y	-0.574004	-0.913381
one	x	-0.574004	-0.913381
zero	y	-0.530300	-0.671713
zero	x	-0.530300	-0.671713

# 交换层级 swaplevel()
df.swaplevel(0, 1, axis=0)

		0	1
y	one	-1.388542	-1.170054
x	one	0.240534	-0.656707
y	zero	-0.848351	-1.394871
x	zero	-0.212248	0.051445

# reorder_levels 概括（泛化）了 swaplevel 函数，可以一步重排层级索引的顺序
df.reorder_levels([1,0], axis=0)  # 看上去结果与swaplevel一样，传入参数不一样

		0	1
y	one	-1.388542	-1.170054
x	one	0.240534	-0.656707
y	zero	-0.848351	-1.394871
x	zero	-0.212248	0.051445

# 多重索引排序
# 排序是为了高效的索引和切片。任何索引都可以使用sort_index
import random; random.shuffle(tuples)
tuples
s = pd.Series(np.random.randn(8), index=pd.MultiIndex.from_tuples(tuples))
s

[('baz', 'two'),('qux', 'two'),('bar', 'one'),('foo', 'one'),('qux', 'one'),('baz', 'one'),('foo', 'two'),('bar', 'two')]baz  two    1.365155
qux  two   -1.331225
bar  one   -1.512430
foo  one    0.468294
qux  one   -0.667115
baz  one   -0.502417
foo  two    1.685553
bar  two   -1.611271
dtype: float64

s.sort_index()
s.sort_index(level=1)  # 默认是level=0排序
s.sort_index(level=0)

bar  one   -1.512430two   -1.611271
baz  one   -0.502417two    1.365155
foo  one    0.468294two    1.685553
qux  one   -0.667115two   -1.331225
dtype: float64bar  one   -1.512430
baz  one   -0.502417
foo  one    0.468294
qux  one   -0.667115
bar  two   -1.611271
baz  two    1.365155
foo  two    1.685553
qux  two   -1.331225
dtype: float64bar  one   -1.512430two   -1.611271
baz  one   -0.502417two    1.365155
foo  one    0.468294two    1.685553
qux  one   -0.667115two   -1.331225
dtype: float64

# level参数除了可以用整型序号，还可以使用层级的names
s.index.set_names(['L1', 'L2'], inplace=True)
s
s.sort_index(level="L1")
s.sort_index(level="L2")

L1   L2 
baz  two    1.365155
qux  two   -1.331225
bar  one   -1.512430
foo  one    0.468294
qux  one   -0.667115
baz  one   -0.502417
foo  two    1.685553
bar  two   -1.611271
dtype: float64L1   L2 
bar  one   -1.512430two   -1.611271
baz  one   -0.502417two    1.365155
foo  one    0.468294two    1.685553
qux  one   -0.667115two   -1.331225
dtype: float64L1   L2 
bar  one   -1.512430
baz  one   -0.502417
foo  one    0.468294
qux  one   -0.667115
bar  two   -1.611271
baz  two    1.365155
foo  two    1.685553
qux  two   -1.331225
dtype: float64

# 可以指定排序的轴方向
df.T
df.T.sort_index(level=1, axis=1)

	one		zero
	y	x	y	x
0	-1.388542	0.240534	-0.848351	-0.212248
1	-1.170054	-0.656707	-1.394871	0.051445

	zero	one	zero	one
	x	x	y	y
0	-0.212248	0.240534	-0.848351	-1.388542
1	0.051445	-0.656707	-1.394871	-1.170054

# 即使数据没有排序也可以索引，但是这样效率低下。
# 返回值是拷贝
dfm = pd.DataFrame({'jim': [0, 0, 1, 1],'joe': ['x', 'x', 'z', 'y'],'jolie': np.random.rand(4)})
dfm

	jim	joe	jolie
0	0	x	0.844228
1	0	x	0.317508
2	1	z	0.413824
3	1	y	0.074264

dfm = dfm.set_index(["jim", "joe"])
dfm

		jolie
jim	joe
0	x	0.844228
0	x	0.317508
1	z	0.413824
1	y	0.074264

dfm.loc[(1, "z")]  # 会提示PerformanceWarning

d:\python36-64\lib\site-packages\ipykernel_launcher.py:1: PerformanceWarning: indexing past lexsort depth may impact performance.
  """Entry point for launching an IPython kernel.

		jolie
jim	joe
1	z	0.413824

# dfm.loc[(0,'y'):(1, 'z')]  # 错误 无法定位
dfm.index.is_lexsorted()
dfm.index.lexsort_depth

False1

dfm = dfm.sort_index()  # 索引排序，默认对所有层级
dfm

		jolie
jim	joe
0	x	0.844228
0	x	0.317508
1	y	0.074264
1	z	0.413824

dfm.index.is_lexsorted()
dfm.index.lexsort_depth
dfm.loc[(0,'y'):(1, 'z')]

True2

		jolie
jim	joe
1	y	0.074264
1	z	0.413824

# Take Methods take 方法 (拿、取)
# 与numpy的数组类似，pandas的index、series、Dataframe也提供take方法
# 用来检索给定轴方向上给定位置的元素，位置 indices 必须是整数列表或者整数数组，可以是负整数
# 在性能方面，由于take方法处理的输入范围更窄，它能提供比花式索引（fancy indexing）更快的性能
index = pd.Index(np.random.randint(0, 1000, 10))
index

Int64Index([523, 532, 386, 998, 832, 71, 965, 274, 389, 59], dtype='int64')

positions = [0, 9, 3]
index[positions]
index.take(positions)

Int64Index([523, 59, 998], dtype='int64')Int64Index([523, 59, 998], dtype='int64')

ser = pd.Series(np.random.randn(10))
ser

0    0.733196
1    0.975773
2   -0.261602
3   -0.055134
4    0.959253
5    1.189025
6   -0.434102
7    0.653628
8    0.248894
9   -0.203562
dtype: float64

ser.iloc[positions]
ser.take(positions)

0    0.733196
9   -0.203562
3   -0.055134
dtype: float640    0.733196
9   -0.203562
3   -0.055134
dtype: float64

# 对DataFrame，indices应该是一个一维 的列表或数组，规定了行或列的位置
frm = pd.DataFrame(np.random.randn(5, 3))
frm

	0	1	2
0	-0.722107	-1.758271	0.580805
1	0.555332	-0.856173	-1.143862
2	-0.636994	1.312340	0.046131
3	-0.154813	0.311931	0.933192
4	-1.277001	-0.144097	-1.871135

frm.take([1, 4, 3])  # 默认取行方向
frm.take([0, 2], axis=1)

	0	1	2
1	0.555332	-0.856173	-1.143862
4	-1.277001	-0.144097	-1.871135
3	-0.154813	0.311931	0.933192

	0	2
0	-0.722107	0.580805
1	0.555332	-1.143862
2	-0.636994	0.046131
3	-0.154813	0.933192
4	-1.277001	-1.871135

# 注意：take方法不要用于布尔indices
arr = np.random.randn(10)
arr

array([-0.00772525,  0.95419469,  1.80636718, -2.46742236, -0.025503  ,0.44203691,  0.48626739, -0.74160374, -0.22453771,  0.8813933 ])

arr.take([False, False, True, True])  # 相当于取了[0,0,1,1]
arr[[0, 1]]

array([-0.00772525, -0.00772525,  0.95419469,  0.95419469])array([-0.00772525,  0.95419469])

ser = pd.Series(np.random.randn(10))
ser

0    1.782426
1    0.531882
2   -0.339277
3    0.500497
4   -0.333816
5   -1.713753
6   -0.125252
7   -0.857100
8    0.385080
9    1.247962
dtype: float64

ser.take([False, False, True, True])  # 相当于取了[0,0,1,1]
ser.iloc[[0, 1]]

0    1.782426
0    1.782426
1    0.531882
1    0.531882
dtype: float640    1.782426
1    0.531882
dtype: float64

# Index Types 索引 index 对象
# 其他一些索引对象

# CategoricalIndex   类别索引（分类索引）
# 用于支持重复的索引
from pandas.api.types import CategoricalDtype

df = pd.DataFrame({'A': np.arange(6),'B': list('aabbca')})
df

	A	B
0	0	a
1	1	a
2	2	b
3	3	b
4	4	c
5	5	a

df['B'] = df['B'].astype(CategoricalDtype(list('cab')))
df
df.dtypes
df.B.cat.categories

	A	B
0	0	a
1	1	a
2	2	b
3	3	b
4	4	c
5	5	a

A       int32
B    category
dtype: object

Index(['c', 'a', 'b'], dtype='object')

df2 = df.set_index('B')
df2
df2.index

	A
B
a	0
a	1
b	2
b	3
c	4
a	5

CategoricalIndex(['a', 'a', 'b', 'b', 'c', 'a'], categories=['c', 'a', 'b'], ordered=False, name='B', dtype='category')

# 使用 __getitem__/.iloc/.loc 索引时，用于索引的值 必须 在类别（categories）里面，否则操作将抛出异常
df2.loc['a']
df2.loc['a'].index  # 保留了全部的 CategoricalIndex 
df2.sort_index()  # 按照categories给定的顺序排序

	A
B
a	0
a	1
a	5

CategoricalIndex(['a', 'a', 'a'], categories=['c', 'a', 'b'], ordered=False, name='B', dtype='category')

	A
B
c	4
a	0
a	1
a	5
b	2
b	3

<pandas.core.groupby.DataFrameGroupBy object at 0x00000000117AC710>

	A
B
c	4
a	6
b	5

CategoricalIndex(['c', 'a', 'b'], categories=['c', 'a', 'b'], ordered=False, name='B', dtype='category')

	A
B
a	0.0
a	1.0
a	5.0
e	NaN

Index(['a', 'a', 'a', 'e'], dtype='object', name='B')

	A
B
a	0.0
a	1.0
a	5.0
e	NaN

CategoricalIndex(['a', 'a', 'a', 'e'], categories=['a', 'b', 'c', 'd', 'e'], ordered=False, name='B', dtype='category')

# 注意：变形和比较操作必须有同样的categories，否则报错

# Int64Index and RangeIndex
# Int64Index 是pandas基础索引。
# RangeIndex是Int64Index的一个子集，现在作为所有NDFrame对象的默认索引。

# Float64Index 当创建索引index时，传入浮点数或者浮点与整数混合值，就默认是Float64Index
indexf = pd.Index([1.5, 2, 3, 4.5, 5])
indexf

Float64Index([1.5, 2.0, 3.0, 4.5, 5.0], dtype='float64')

sf = pd.Series(range(5), index=indexf)
sf

1.5    0
2.0    1
3.0    2
4.5    3
5.0    4
dtype: int64

# [] .loc 基于 label，整数将被转为浮点值
sf[1.5:4.5]
sf[1:4]
sf.loc[3]  # label，不是位置索引
# sf[3.2]  # 错误，传入值必须在labels中

1.5    0
2.0    1
3.0    2
4.5    3
dtype: int64

1.5    0
2.0    1
3.0    2
dtype: int64

2

sf.iloc[3]  # 基于位置，传入整数，不能传入浮点数

# 例子： 有不规则的数据表，其索引类似时间间隔，但数值是浮点型的
dfir1 = pd.DataFrame(np.random.randn(5,2),index=np.arange(5) * 250.0,columns=list('AB'))
dfir1
dfir2 = pd.DataFrame(np.random.randn(6,2),index=np.arange(4,10) * 250.1,columns=list('AB'))
dfir2
dfir = pd.concat([dfir1,dfir2])
dfir

	A	B
0.0	1.158461	0.595743
250.0	1.457556	0.268541
500.0	-0.437650	-0.299700
750.0	-1.095812	-2.079684
1000.0	0.242220	-0.868812

	A	B
1000.4	-0.858327	-0.364968
1250.5	-1.445806	-2.129608
1500.6	0.799049	1.232102
1750.7	-1.132538	0.283472
2000.8	-1.157884	0.398119
2250.9	-1.330821	-0.563333

	A	B
0.0	1.158461	0.595743
250.0	1.457556	0.268541
500.0	-0.437650	-0.299700
750.0	-1.095812	-2.079684
1000.0	0.242220	-0.868812
1000.4	-0.858327	-0.364968
1250.5	-1.445806	-2.129608
1500.6	0.799049	1.232102
1750.7	-1.132538	0.283472
2000.8	-1.157884	0.398119
2250.9	-1.330821	-0.563333

# 选取第1秒前的数据
dfir[:1000]

	A	B
0.0	1.158461	0.595743
250.0	1.457556	0.268541
500.0	-0.437650	-0.299700
750.0	-1.095812	-2.079684
1000.0	0.242220	-0.868812

# IntervalIndex  区间索引  (数学上的开闭区间)
df = pd.DataFrame({'A': [1, 2, 3, 4]},index=pd.IntervalIndex.from_breaks([0, 1, 2, 3, 4]))
df

	A
(0, 1]	1
(1, 2]	2
(2, 3]	3
(3, 4]	4

df.loc[2]  # loc 可以是区间的边缘
df.loc[2.5]
df.loc[1.5:2.5]

A    2
Name: (1, 2], dtype: int64

A    3
Name: (2, 3], dtype: int64

	A
(1, 2]	2
(2, 3]	3

# Interval and IntervalIndex are used by cut and qcut
# cut 和 qcut 方法会用到 Interval（区间）和 IntervalIndex（区间索引）：它们返回的分箱类别就是区间
c = pd.cut(range(4), bins=2)
c
c.categories

[(-0.003, 1.5], (-0.003, 1.5], (1.5, 3.0], (1.5, 3.0]]
Categories (2, interval[float64]): [(-0.003, 1.5] < (1.5, 3.0]]

IntervalIndex([(-0.003, 1.5], (1.5, 3.0]]
              closed='right',
              dtype='interval[float64]')

pd.cut([0, 3, 5, 1], bins=c.categories)  # 允许一个interval类型去bin（分隔）其他数据

[(-0.003, 1.5], (1.5, 3.0], NaN, (-0.003, 1.5]]
Categories (2, interval[float64]): [(-0.003, 1.5] < (1.5, 3.0]]

# Miscellaneous indexing FAQ 杂项 常见问题

# Integer indexing  整数型索引
# 整型的索引是label，应满足label的要求
# 在pandas中，一般认为标签label比整数位置更重要。
s = pd.Series(range(5))
s
# s[-1]  # 异常
# s.loc[-1]  # 异常
s.loc[-1:]  # 允许
s.iloc[-1]  # 允许
df = pd.DataFrame(np.random.randn(5, 4))
df
df.loc[-2:]
# df.loc[-2]  # 异常

0    0
1    1
2    2
3    3
4    4
dtype: int64

0    0
1    1
2    2
3    3
4    4
dtype: int64

4

	0	1	2	3
0	0.021033	0.127054	-0.864734	-1.835828
1	-0.400611	0.594981	-1.758866	-1.059539
2	-0.108597	0.784000	0.306035	-0.695933
3	-0.078048	-1.742895	-0.159740	0.934115
4	-0.524633	0.433224	-0.732334	0.442827

	0	1	2	3
0	0.021033	0.127054	-0.864734	-1.835828
1	-0.400611	0.594981	-1.758866	-1.059539
2	-0.108597	0.784000	0.306035	-0.695933
3	-0.078048	-1.742895	-0.159740	0.934115
4	-0.524633	0.433224	-0.732334	0.442827

# Non-monotonic indexes require exact matches 非单调索引要求精确匹配
# 如果series或Dataframe的索引是单调增或单调减的，则基于标签的切片可以超出索引的范围。
# 就像对一般python列表list的索引切片。
# 可以用is_monotonic_increasing和is_monotonic_decreasing测试单调属性
df = pd.DataFrame(index=[2,3,3,4,5], columns=['data'], data=list(range(5)))
df

	data
2	0
3	1
3	2
4	3
5	4

df.index.is_monotonic_increasing
df.index.is_monotonic_decreasing

True

False

df.loc[0:4, :]  # 没有0和1行，但是返回了label为2、3、4的行
df.loc[13:15, :]  # 超出界限，返回空

	data
2	0
3	1
3	2
4	3

# 非单调索引，切片必须在index内，而且边界的值必须是唯一的
df = pd.DataFrame(index=[2,3,1,4,3,5], columns=['data'], data=list(range(6)))
df
df.index.is_monotonic_increasing

	data
2	0
3	1
1	2
4	3
3	4
5	5

False

df.loc[2:4, :]
# df.loc[0:4, :]  # 错误没有0标签
# df.loc[2:3, :]  # 错误，边界标签3不是唯一的

	data
2	0
3	1
1	2
4	3

# Index.is_monotonic_increasing() and Index.is_monotonic_decreasing() 只检测弱单调（可以有重复值）
# 结合使用 Index.is_unique  可以检测严格单调性
weakly_monotonic = pd.Index(['a', 'b', 'c', 'c'])
weakly_monotonic
weakly_monotonic.is_monotonic_increasing
weakly_monotonic.is_monotonic_increasing & weakly_monotonic.is_unique

Index(['a', 'b', 'c', 'c'], dtype='object')

True

False

# Endpoints are inclusive 端点（边界）包括在内
# 与标准的python切片（不包括右端点值）相比，pandas中的标签切片包含端点值。
# 主要原因是：通常很难确定索引中某个特定标签之后的“后继”（即下一个）元素

s = pd.Series(np.random.randn(6), index=list('abcdef'))
s

a    1.280483
b    1.562738
c    0.904503
d   -0.470785
e   -0.008048
f   -0.413812
dtype: float64

s[2:5]  # 基于整型的索引，与既有标签的不同，不包括右端点

c    0.904503
d   -0.470785
e   -0.008048
dtype: float64

# 如果用标签，不容易取得下一个标签
# s.loc['c':'e'+1]  # 错误
s.loc['c':'e']

c    0.904503
d   -0.470785
e   -0.008048
dtype: float64

# Indexing potentially changes underlying Series dtype 
# 索引操作可能改变底层 Series 的 dtype
# The different indexing operation can potentially change the dtype of a Series.
# 不同的索引操作可能会潜在的改变series的dtype
series1 = pd.Series([1, 2, 3])
series1.dtype  # int
series1
res = series1.reindex([0, 4])
res.dtype  # float
res

dtype('int64')

0    1
1    2
2    3
dtype: int64

dtype('float64')

0    1.0
4    NaN
dtype: float64

series2 = pd.Series([True])
series2.dtype  # 布尔类型
series2
res = series2.reindex_like(series1)
res.dtype  # 'O' 型，即 object 类型（大写字母 O，不是数字 0）
res

dtype('bool')

0    True
dtype: bool

dtype('O')

0    True
1     NaN
2     NaN
dtype: object

# 由于默认插入NaN，引起了dtype的改变。
# 这会导致一些问题，当使用如 numpy.logical_and. 的np ufuncs 时


#  2018-02-22

转载于:.html

本文发布于:2024-01-28 13:25:18，感谢您对本站的认可！

本文链接：https://www.4u4v.net/it/17064195237727.html

上一篇：【Vue】Vue基础自用笔记Day04

下一篇：渗透测试工具Nmap从初级到高级使用教程

标签：索引 pandas

留言与评论（共有 0 条评论）