2.pandas

joker ... 2022-4-7 大约 3 分钟

# 2.pandas

# 2.1 基本介绍

import pandas as pd
import numpy as np
# A Series is a one-dimensional labelled array; np.nan marks a missing value.
s = pd.Series([1, 3, 6, np.nan, 44, 1])
print(s)

# Six consecutive daily timestamps starting on 2021-07-01.
dates = pd.date_range(start="20210701", periods=6)
print(dates)

# 6x4 frame of standard-normal samples: rows labelled by date, columns a..d.
df = pd.DataFrame(
    data=np.random.randn(6, 4),
    index=dates,
    columns=list("abcd"),
)
print(df)

# A DataFrame can also be built from a dict mapping column name -> values;
# the scalar entries are repeated to match the length of the array column.
df2 = pd.DataFrame(
    {
        "A": 1,
        "B": "kk",
        "C": np.array([1, 2, 3]),
    }
)
print(df2)

# dtype of every column.
print(df2.dtypes)
# Column labels (the names, not the values stored in the columns).
print(df2.columns)
# Transposed view: rows become columns and vice versa.
print(df2.transpose())

# Sort by labels: axis=1 orders the column names, axis=0 orders the row index;
# both in descending order here.
print(df2.sort_index(axis=1, ascending=False))

print(df2.sort_index(axis=0, ascending=False))
# Sort the rows by the values held in column "C", largest first.
print(df2.sort_values("C", ascending=False))
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22

# 2.2 选择数据

import pandas as pd
import numpy as np
# Six daily timestamps starting on 2021-07-03 label the rows.
dates = pd.date_range("20210703", periods=6)
# Column labels are passed as a list. A set has no defined order (and recent
# pandas versions reject it), so labels could be attached to the wrong data.
df = pd.DataFrame(
    np.arange(24).reshape((6, 4)),
    index=dates,
    columns=["A", "B", "C", "D"],
)

# Select one column, by key or by attribute access.
print(df["A"])
print(df.A)

# Select rows by position (first three rows) ...
print("0-3 行", df.iloc[0:3])
# ... or by a label range; label slices include both end points.
print(df.loc["20210703":"20210705"])

# loc selects purely by label.
print(df.loc["20210704"])
print(df.loc["20210704", ["A", "B"]])

# iloc selects purely by integer position.
print(df.iloc[3, 1])
print(df.iloc[3:5, 1:3])

# `.ix` (mixed position/label indexing) was removed in pandas 1.0. To combine
# a positional row selection with column labels, translate the positions to
# labels first and then use loc.
first_three_ac = df.loc[df.index[:3], ["A", "C"]]
print(first_three_ac)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16

# 2.3 设置值

import pandas as pd
import numpy as np
# Six daily timestamps starting on 2021-07-03 label the rows.
dates = pd.date_range("20210703", periods=6)
# Column labels are passed as a list. A set has no defined order (and recent
# pandas versions reject it), so labels could be attached to the wrong data.
df = pd.DataFrame(
    np.arange(24).reshape((6, 4)),
    index=dates,
    columns=["A", "B", "C", "D"],
)

# Assign a single cell by integer position (third row, third column).
df.iloc[2, 2] = 1111
print(df)

# Assign a single cell by row label and column label.
df.loc["20210703", "B"] = 2222
print(df)

# Conditional assignment: set B to 0 on every row where A > 8.
# This must be one .loc call; the chained form `df.B[mask] = 0` may write to a
# temporary object and silently leave df unchanged.
df.loc[df.A > 8, "B"] = 0
print(df)

# Add a new column filled with a single value ...
df["E"] = np.nan
print(df)
# ... or from a list with one entry per row.
df["F"] = [1, 2, 3, 4, 5, 6]
print(df)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16

# 2.4 处理丢失数据

import pandas as pd
import numpy as np
# Six daily timestamps starting on 2021-07-03 label the rows.
dates = pd.date_range("20210703", periods=6)
# Column labels are passed as a list. With a set the column order is undefined,
# so the positional writes below could hit an arbitrary column.
df = pd.DataFrame(
    np.arange(24).reshape((6, 4)),
    index=dates,
    columns=["A", "B", "C", "D"],
)
# NaN cannot be stored in an integer column; cast the two columns that will
# receive NaN to float explicitly instead of relying on silent upcasting.
df = df.astype({"B": float, "C": float})

# Introduce missing values: a column that is entirely NaN plus two single cells.
df["E"] = np.nan
df.iloc[0, 1] = np.nan
df.iloc[1, 2] = np.nan
print(df)

# Drop every row that contains at least one NaN. Because column E is all NaN,
# this removes every row here.
print(df.dropna(axis=0, how="any"))
# Drop a column only when all of its values are NaN.
print(df.dropna(axis=1, how="all"))

# Replace every NaN with 0.
print(df.fillna(value=0))

# Boolean mask marking the missing cells.
print(df.isnull())
# True if the frame contains at least one NaN anywhere.
has_nan = df.isnull().values.any()
print(has_nan)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19

# 2.5 导入导出数据

api 网址

https://pandas.pydata.org/docs/reference/io.html
1

样例

import pandas as pd
import numpy as np
# Prepare a CSV file first; the path below is an absolute Windows location,
# so adjust it to wherever your own file lives.
csv_path = r"C:\Users\bn\Desktop\1.csv"
data = pd.read_csv(csv_path)
print(data)

# Serialise the loaded frame to a pickle file (relative path).
pickle_path = "1.pickle"
data.to_pickle(pickle_path)
1
2
3
4
5
6

image-20210703105902829

# 2.6 合并

# concat

import pandas as pd
import numpy as np
# Three 3x4 frames with identical columns, filled with 0, 1 and 2.
df1 = pd.DataFrame(np.full((3, 4), 0.0), columns=["a", "b", "c", "d"])
df2 = pd.DataFrame(np.full((3, 4), 1.0), columns=["a", "b", "c", "d"])
df3 = pd.DataFrame(np.full((3, 4), 2.0), columns=["a", "b", "c", "d"])
print(df1)
print(df2)
# axis=0 stacks the frames vertically; each keeps its own row labels.
res = pd.concat([df1, df2, df3], axis=0)
print(res)

# Frames whose columns and row labels only partly overlap.
df1 = pd.DataFrame(np.full((3, 4), 0.0), columns=["a", "b", "c", "d"], index=[1, 2, 3])
df2 = pd.DataFrame(np.full((3, 4), 1.0), columns=["b", "c", "d", "e"], index=[2, 3, 4])
# The default join is "outer": the union of columns is kept and cells that a
# frame has no data for are filled with NaN.
res = pd.concat([df1, df2])
print(res)

# join="inner" keeps only the columns present in every frame;
# ignore_index=True renumbers the rows 0..n-1.
res = pd.concat([df1, df2], join="inner", ignore_index=True)
print(res)

# Appending frames row-wise. DataFrame.append was removed in pandas 2.0;
# pd.concat gives the same result (original row labels are kept).
df1 = pd.DataFrame(np.full((3, 4), 0.0), columns=["a", "b", "c", "d"])
df2 = pd.DataFrame(np.full((3, 4), 1.0), columns=["a", "b", "c", "d"])
df3 = pd.DataFrame(np.full((3, 4), 2.0), columns=["a", "b", "c", "d"])
res = pd.concat([df1, df2, df3])
print(res)

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27

# merge

import pandas as pd
import numpy as np
# Two tables that share a "key" column holding the same four key values.
left = pd.DataFrame(
    {
        "key": ["K0", "K1", "K2", "K3"],
        "A": ["A0", "A1", "A2", "A3"],
        "B": ["B0", "B1", "B2", "B3"],
    }
)
right = pd.DataFrame(
    {
        "key": ["K0", "K1", "K2", "K3"],
        "C": ["C0", "C1", "C2", "C3"],
        "D": ["D0", "D1", "D2", "D3"],
    }
)
print(left)
print(right)

# Join the two tables on "key": rows with equal key values are combined into
# one row carrying the columns of both tables.
res = left.merge(right, on="key")
print(res)
1
2
3
4
5
6
7
8
9
10
11
12
13
14

# 2.7 绘图

import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
# Series example: the cumulative sum of 1000 standard-normal samples,
# indexed 0..999, drawn as a single line.
row_labels = np.arange(1000)
data = pd.Series(np.random.randn(1000), index=row_labels).cumsum()
data.plot()
plt.show()

# DataFrame example: four columns (A-D) of standard-normal samples.
data = pd.DataFrame(
    np.random.randn(1000, 4),
    index=np.arange(1000),
    columns=["A", "B", "C", "D"],
)
# Show the first rows of the raw samples before they are accumulated.
print(data.head())

# Accumulate down each column, then plot one line per column.
data = data.cumsum()
data.plot()
plt.show()
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16

image-20210703160523722