目录
1. Series
1.1 Series 创建和切片
1.2 Series 的索引和值
2. DataFrame
2.1 pandas 读取外部数据
2.2 DataFrame 的基础属性
2.3 pandas 的索引
2.3.1 df.loc 方法
2.3.2 df.iloc 方法
2.3.3 布尔索引
2.4 缺失数据的处理
Series:一维,带标签的数组。
#测试pandas.Series的创建和切片
import pandas as pd
t = pd.Series([13,34,45,2])
print(t)
print(t[t>20])
#通过字典创建一个Series,其中的索引就是字典的键
a = {"name":"老白","age":26,"career":"waiter","company":"同福客栈"}
t1 = pd.Series(a)
print(t1)
print("*"*50)
print(t1["career"])
print("*"*50)
print(t1[0])
print("*"*50)
print(t1[["name","company"]])
#重新指定索引
t2 = pd.Series([445,234,523,56,45],index=list("abcde"))
print(t2)
print("*"*50)
print(t2["c"])
print("*"*50)
print(t2[:2])
print("*"*50)
print(t2[[1,3]])
'''
输出结果:
0 13
1 34
2 45
3 2
dtype: int64
1 34
2 45
dtype: int64
name 老白
age 26
career waiter
company 同福客栈
dtype: object
**************************************************
waiter
**************************************************
老白
**************************************************
name 老白
company 同福客栈
dtype: object
a 445
b 234
c 523
d 56
e 45
dtype: int64
**************************************************
523
**************************************************
a 445
b 234
dtype: int64
**************************************************
b 234
d 56
dtype: int64
'''
Series对象本质上由两个数组构成:
一个数组构成对象的键(index,索引);一个数组构成对象的值(values)。
#测试pandas.Series的索引和值
import pandas as pd
a = {"name":"老白","age":26,"career":"waiter","company":"同福客栈"}
t = pd.Series(a)
for i in t.index:
    print(i)
print("*"*50)
print(type(t.index))
print("*"*50)
print(len(t.index))
print("*"*50)
print(list(t.index)[:3])
print("*"*50)
print(t.values)
print("*"*50)
print(type(t.values))
'''
输出结果:
name
age
career
company
**************************************************
<class 'pandas.core.indexes.base.Index'>
**************************************************
4
**************************************************
['name', 'age', 'career']
**************************************************
['老白' 26 'waiter' '同福客栈']
**************************************************
<class 'numpy.ndarray'>
'''
DataFrame:二维,Series容器。
#测试pandas读取外部数据
import pandas as pd
df = pd.read_csv("./dogNames2.csv")
print(df.info())
print("*"*50)
#DataFrame中排序的方法
df = df.sort_values(by="Count_AnimalName",ascending=False)
print(df.head()) #显示头部几行,默认5行
'''
输出结果:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 16220 entries, 0 to 16219
Data columns (total 2 columns):
 #   Column            Non-Null Count  Dtype
---  ------            --------------  -----
 0   Row_Labels        16217 non-null  object
 1   Count_AnimalName  16220 non-null  int64
dtypes: int64(1), object(1)
memory usage: 253.6+ KB
None
**************************************************
Row_Labels Count_AnimalName
1156 BELLA 1195
9140 MAX 1153
2660 CHARLIE 856
3251 COCO 852
12368 ROCKY 823
'''
#测试DataFrame的创建、基础属性
import pandas as pd
import numpy as np
t = pd.DataFrame(np.arange(12).reshape((3,4)),index=list("abc"),columns=list("WXYZ"))
print(t)
d1 = {"name":["老白","赛貂蝉"],"age":[26,20],"career":["waiter","CEO"],"company":["同福客栈","怡红楼"]}
t1 = pd.DataFrame(d1)
print(t1)
print("*"*50)
print(type(t1))
print("*"*50)
print(t1.index) #行索引
print("*"*50)
print(t1.columns) #列索引
print("*"*50)
print(t1.values) #对象值,二维ndarray数组
print("*"*50)
print(t1.shape) #行数,列数
print("*"*50)
print(t1.dtypes) #列数据类型
print("*"*50)
print(t1.ndim) #数据维度
print("*"*50)
print(t1.info()) #相关信息
print("*"*50)
print(t1.describe()) #快速综合统计结果
'''
输出结果:
W X Y Z
a 0 1 2 3
b 4 5 6 7
c 8 9 10 11
name age career company
0 老白 26 waiter 同福客栈
1 赛貂蝉 20 CEO 怡红楼
**************************************************
<class 'pandas.core.frame.DataFrame'>
**************************************************
RangeIndex(start=0, stop=2, step=1)
**************************************************
Index(['name', 'age', 'career', 'company'], dtype='object')
**************************************************
[['老白' 26 'waiter' '同福客栈']
 ['赛貂蝉' 20 'CEO' '怡红楼']]
**************************************************
(2, 4)
**************************************************
name object
age int64
career object
company object
dtype: object
**************************************************
2
**************************************************
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2 entries, 0 to 1
Data columns (total 4 columns):
 #   Column   Non-Null Count  Dtype
---  ------   --------------  -----
 0   name     2 non-null      object
 1   age      2 non-null      int64
 2   career   2 non-null      object
 3   company  2 non-null      object
dtypes: int64(1), object(3)
memory usage: 192.0+ bytes
None
**************************************************
age
count 2.000000
mean 23.000000
std 4.242641
min 20.000000
25% 21.500000
50% 23.000000
75% 24.500000
max 26.000000
'''
- 方括号内写切片(如 df[:5])表示按行取数据,对行进行操作;
- 方括号内写字符串(列名,如 df["Row_Labels"])表示取列索引,对列进行操作。
#测试DataFrame的索引
import pandas as pd
df = pd.read_csv("./dogNames2.csv")
#进行降序排列
df = df.sort_values(by="Count_AnimalName",ascending=False)
print(df[:5]) #按照行进行操作
print("*"*50)
print(df[:5]["Row_Labels"]) #取前5行的第“Row_Labels”列
print("*"*50)
print(df["Row_Labels"][:5]) #取第“Row_Labels”列的前5行
print("*"*50)
print(type(df["Row_Labels"])) #Series类型
print("*"*50)
print(df[["Row_Labels","Count_AnimalName"]][:5])
'''
输出结果:
Row_Labels Count_AnimalName
1156 BELLA 1195
9140 MAX 1153
2660 CHARLIE 856
3251 COCO 852
12368 ROCKY 823
**************************************************
1156 BELLA
9140 MAX
2660 CHARLIE
3251 COCO
12368 ROCKY
Name: Row_Labels, dtype: object
**************************************************
1156 BELLA
9140 MAX
2660 CHARLIE
3251 COCO
12368 ROCKY
Name: Row_Labels, dtype: object
**************************************************
<class 'pandas.core.series.Series'>
**************************************************
Row_Labels Count_AnimalName
1156 BELLA 1195
9140 MAX 1153
2660 CHARLIE 856
3251 COCO 852
12368 ROCKY 823
'''
通过标签索引数据
import pandas as pd
import numpy as np
t = pd.DataFrame(np.arange(12).reshape((3,4)),index=list("abc"),columns=list("WXYZ"))
# loc方法,通过标签索引数据
print(t.loc["a","Z"])
print("*"*50)
print(t.loc["a"])
# print(t.loc["a",:])
print("*"*50)
print(t.loc[:,"Y"])
print("*"*50)
print(t.loc[["a","c"],["W","Z"]])
print("*"*50)
print(t.loc["a":"c",["W","Z"]]) #在loc中,冒号索引是闭合的,即会选择到冒号后面的数据
'''
输出结果:
3
**************************************************
W 0
X 1
Y 2
Z 3
Name: a, dtype: int32
**************************************************
a 2
b 6
c 10
Name: Y, dtype: int32
**************************************************
W Z
a 0 3
c 8 11
**************************************************
W Z
a 0 3
b 4 7
c 8 11
'''
通过位置获取数据
import pandas as pd
import numpy as np
t = pd.DataFrame(np.arange(12).reshape((3,4)),index=list("abc"),columns=list("WXYZ"))
# iloc方法,通过位置获取行数据
print(t.iloc[1])
print("*"*50)
print(t.iloc[:,[2,1]])
print("*"*50)
print(t.iloc[[1,0],[2,1]])
print("*"*50)
print(t.iloc[1:,:2]) #取第二行及其之后的每一行、第三列之前的每一列
print("*"*50)
t.iloc[1:,:2] = np.nan
print(t)
'''
输出结果:
W 4
X 5
Y 6
Z 7
Name: b, dtype: int32
**************************************************
Y X
a 2 1
b 6 5
c 10 9
**************************************************
Y X
b 6 5
a 2 1
**************************************************
W X
b 4 5
c 8 9
**************************************************
W X Y Z
a 0.0 1.0 2 3
b NaN NaN 6 7
c NaN NaN 10 11
'''
#测试pandas的布尔索引
import pandas as pd
df = pd.read_csv("./dogNames2.csv")
#找到所有的使用次数超过700并且名字的字符串的长度大于4的狗的名字
print(df[(df["Row_Labels"].str.len()>4)&(df["Count_AnimalName"]>700)])
'''
输出结果:
Row_Labels Count_AnimalName
1156 BELLA 1195
2660 CHARLIE 856
8552 LUCKY 723
12368 ROCKY 823
'''
df.dropna(axis=0,how='any') #只要该行存在nan,就删除该行
df.dropna(axis=0,how='all',inplace=False) #只有该行全部为nan,才删除该行;inplace参数决定是否进行原地修改
#测试pandas缺失数据的处理
import pandas as pd
import numpy as np
t = pd.DataFrame(np.arange(12).reshape((3,4)),index=list("abc"),columns=list("WXYZ"))
t.iloc[1:,:2] = np.nan
print(pd.isnull(t))
print("*"*50)
print(t[pd.notnull(t["W"])]) #t中"W"这一列数值不为nan的所有行
print("*"*50)
#删除NaN所在的行、列
t1 = t.dropna(axis=0,how="any",inplace=False)
print(t1)
print("*"*50)
#填充NaN的数据
t2 = t.fillna(0) #用0填充NaN
print(t2)
print("*"*50)
t3 = t.fillna(t.mean()) #用平均值填充NaN
print(t3)
print("*"*50)
t4 = t["X"].fillna(t["X"].median()) #用"X"列的中位数填充"X"列的NaN
print(t4)
'''
输出数据:
W X Y Z
a False False False False
b True True False False
c True True False False
**************************************************
W X Y Z
a 0.0 1.0 2 3
**************************************************
W X Y Z
a 0.0 1.0 2 3
**************************************************
W X Y Z
a 0.0 1.0 2 3
b 0.0 0.0 6 7
c 0.0 0.0 10 11
**************************************************
W X Y Z
a 0.0 1.0 2 3
b 0.0 1.0 6 7
c 0.0 1.0 10 11
**************************************************
a 1.0
b 1.0
c 1.0
Name: X, dtype: float64
'''
本文发布于:2024-02-02 01:23:39,感谢您对本站的认可!
本文链接:https://www.4u4v.net/it/170681271140499.html
版权声明:本站内容均来自互联网,仅供演示用,请勿用于商业和其他非法用途。如果侵犯了您的权益请与我们联系,我们将在24小时内删除。
留言与评论(共有 0 条评论) |