Python进阶—Pandas

标签：index 01 进阶 Python df1 df2 K0 pd Pandas

Pandas再来一次

文章目录
一、Series和DataFrame
二、选择数据
三、赋值及操作(增、删、改)
四、处理丢失数据
五、读取并写入文件
六、concat拼接
七、merge合并
八、pandas的plot

配合机器学习食用更佳。

一、Series和DataFrame

series的创建（列表、数组、字典）、values/index、索引（默认、指定）取一个值取多个值、遍历
dataframe的创建、values/index/columns、df.sort_index(axis=0) 行排序和列排序、df.sort_values(by=’’) # 根据值排序，可以指定行还是列
df.describe()/df.T

import pandas as pd
import numpy as np

s1 = pd.Series([1,2,3,-5]) # 创建一个series，索引为默认值
print(s1)

0    1
1    2
2    3
3   -5
dtype: int64

s1.values # series的值

array([ 1,  2,  3, -5])

s1.index # series的索引

RangeIndex(start=0, stop=4, step=1)

for i in s1.index:
    print(i)

s1[0] # 根据索引取值

s2 = pd.Series([1,2,3,4],index=['a','b','c','d']) # 指定索引
print(s2)

a    1
b    2
c    3
d    4
dtype: int64

s2[0]

s2['a']

s2[['a','b','c','d']]

a    1
b    2
c    3
d    4
dtype: int64

s2[[0,1,2]]

a    1
b    2
c    3
dtype: int64

for i in s1:
    print(i)

for i in s2:
    print(i)

'a' in s2

True

# Series可以看成是一个定长的有序字典
dic1 = {'apple':5,'pen':3}
s3 = pd.Series(dic1)
print(s3)

apple    5
pen      3
dtype: int64

# DataFrame
data = {'year':[2014,2015,2016,2017],
       'income':[1000,2000,3000,4000],
       'pay':[5000,20000,30000,40000]}
df1 = pd.DataFrame(data)
df1

	year	income	pay
0	2014	1000	5000
1	2015	2000	20000
2	2016	3000	30000
3	2017	4000	40000

df2 = pd.DataFrame(np.arange(12).reshape(3,-1))
print(df2)

0  1   2   3
0  0  1   2   3
1  4  5   6   7
2  8  9  10  11

df3 = pd.DataFrame(np.arange(12).reshape(3,4),index=['a','b','c'],columns=['A','B','C','D'])
print(df3)

A  B   C   D
a  0  1   2   3
b  4  5   6   7
c  8  9  10  11

df1.index

RangeIndex(start=0, stop=4, step=1)

df1.columns

Index(['year', 'income', 'pay'], dtype='object')

df1.values

array([[ 2014,  1000,  5000],
       [ 2015,  2000, 20000],
       [ 2016,  3000, 30000],
       [ 2017,  4000, 40000]])

df1.describe()

	year	income	pay
count	4.000000	4.000000	4.000000
mean	2015.500000	2500.000000	23750.000000
std	1.290994	1290.994449	14930.394056
min	2014.000000	1000.000000	5000.000000
25%	2014.750000	1750.000000	16250.000000
50%	2015.500000	2500.000000	25000.000000
75%	2016.250000	3250.000000	32500.000000
max	2017.000000	4000.000000	40000.000000

print(df1)

year  income    pay
0  2014    1000   5000
1  2015    2000  20000
2  2016    3000  30000
3  2017    4000  40000

print(df1.T)

0      1      2      3
year    2014   2015   2016   2017
income  1000   2000   3000   4000
pay     5000  20000  30000  40000

df1.sort_index(axis=1) # columns索引排序

	income	pay	year
0	1000	5000	2014
1	2000	20000	2015
2	3000	30000	2016
3	4000	40000	2017

df1.sort_index(axis=0)

	year	income	pay
0	2014	1000	5000
1	2015	2000	20000
2	2016	3000	30000
3	2017	4000	40000

df3.sort_values(by='a',axis=1) # 根据某一列排序

	A	B	C	D
a	0	1	2	3
b	4	5	6	7
c	8	9	10	11

df3

	A	B	C	D
a	0	1	2	3
b	4	5	6	7
c	8	9	10	11

二、选择数据

取行、取列、取多行、取多列、取元素、取True元素、切片
df[] # 只能取列可以取多列
df.loc[] # 按照标签取
df.iloc[] # 按照位置取
df[df>6] # 取True元素

dates = pd.date_range('20170101',periods=6)
df1 = pd.DataFrame(np.arange(24).reshape(6,4),index=dates,columns=['A','B','C','D'])
print(df1)

A   B   C   D
2017-01-01   0   1   2   3
2017-01-02   4   5   6   7
2017-01-03   8   9  10  11
2017-01-04  12  13  14  15
2017-01-05  16  17  18  19
2017-01-06  20  21  22  23

df1['A'] # 获取一列 == df1.A  df1[['A','B']]  取多列

2017-01-01     0
2017-01-02     4
2017-01-03     8
2017-01-04    12
2017-01-05    16
2017-01-06    20
Freq: D, Name: A, dtype: int64

df1[0:2] # 切片 前两行

	A	B	C	D
2017-01-01	0	1	2	3
2017-01-02	4	5	6	7

df1['20170102':'20170104']

	A	B	C	D
2017-01-02	4	5	6	7
2017-01-03	8	9	10	11
2017-01-04	12	13	14	15

# 通过标签选择数据
df1.loc['20170102'] # 取行

A    4
B    5
C    6
D    7
Name: 2017-01-02 00:00:00, dtype: int64

df1.loc['20170101','A'] # 取元素

df1.loc[:,'A'] # 取列

2017-01-01     0
2017-01-02     4
2017-01-03     8
2017-01-04    12
2017-01-05    16
2017-01-06    20
Freq: D, Name: A, dtype: int64

df1.loc[:,['A','B']]

	A	B
2017-01-01	0	1
2017-01-02	4	5
2017-01-03	8	9
2017-01-04	12	13
2017-01-05	16	17
2017-01-06	20	21

df1.loc[::2,['A','B']]

	A	B
2017-01-01	0	1
2017-01-03	8	9
2017-01-05	16	17

# 通过位置选择数据
df1.iloc[2] # 第二行

A     8
B     9
C    10
D    11
Name: 2017-01-03 00:00:00, dtype: int64

df1.iloc[[1,2]]

	A	B	C	D
2017-01-02	4	5	6	7
2017-01-03	8	9	10	11

df1.iloc[1:3,2:4]

	C	D
2017-01-02	6	7
2017-01-03	10	11

df1 > 6

	A	B	C	D
2017-01-01	False	False	False	False
2017-01-02	False	False	False	True
2017-01-03	True	True	True	True
2017-01-04	True	True	True	True
2017-01-05	True	True	True	True
2017-01-06	True	True	True	True

df1.A > 6

2017-01-01    False
2017-01-02    False
2017-01-03     True
2017-01-04     True
2017-01-05     True
2017-01-06     True
Freq: D, Name: A, dtype: bool

df1[df1>6]

	A	B	C	D
2017-01-01	NaN	NaN	NaN	NaN
2017-01-02	NaN	NaN	NaN	7.0
2017-01-03	8.0	9.0	10.0	11.0
2017-01-04	12.0	13.0	14.0	15.0
2017-01-05	16.0	17.0	18.0	19.0
2017-01-06	20.0	21.0	22.0	23.0

三、赋值及操作(增、删、改)

增：在尾部增加一行、一列、在指定位置插入一列(insert) 添加一行只能先拆开后合并
删：del pop
改：找到相应位置改即可

df1

	A	B	C	D
2017-01-01	0	1	2	3
2017-01-02	4	5	6	7
2017-01-03	8	9	10	11
2017-01-04	12	13	14	15
2017-01-05	16	17	18	19
2017-01-06	20	21	22	23

df1.loc['20170101','A'] = 10000

df1

	A	B	C	D
2017-01-01	10000	1	2	3
2017-01-02	4	5	6	7
2017-01-03	8	9	10	11
2017-01-04	12	13	14	15
2017-01-05	16	17	18	19
2017-01-06	20	21	22	23

df1[df1>10] = -1

df1

	A	B	C	D
2017-01-01	-1	1	2	3
2017-01-02	4	5	6	7
2017-01-03	8	9	10	-1
2017-01-04	-1	-1	-1	-1
2017-01-05	-1	-1	-1	-1
2017-01-06	-1	-1	-1	-1

df1['E'] = 10 # 添加一列

df1

	A	B	C	D	E
2017-01-01	-1	1	2	3	10
2017-01-02	4	5	6	7	10
2017-01-03	8	9	10	-1	10
2017-01-04	-1	-1	-1	-1	10
2017-01-05	-1	-1	-1	-1	10
2017-01-06	-1	-1	-1	-1	10

df1['F'] = pd.Series([1,2,3,4,5,6],index=dates)#添加一列
df1

	A	B	C	D	E	F
2017-01-01	-1	1	2	3	10	1
2017-01-02	4	5	6	7	10	2
2017-01-03	8	9	10	-1	10	3
2017-01-04	-1	-1	-1	-1	10	4
2017-01-05	-1	-1	-1	-1	10	5
2017-01-06	-1	-1	-1	-1	10	6

s1 = pd.Series([1,2,3,4],index=['A','B','C','D'],)
df2 = df1.append(s1,ignore_index=True) # 增加一行
df2

	A	B	C	D	E	F
0	-1.0	1.0	2.0	3.0	10.0	1.0
1	4.0	5.0	6.0	7.0	10.0	2.0
2	8.0	9.0	10.0	-1.0	10.0	3.0
3	-1.0	-1.0	-1.0	-1.0	10.0	4.0
4	-1.0	-1.0	-1.0	-1.0	10.0	5.0
5	-1.0	-1.0	-1.0	-1.0	10.0	6.0
6	1.0	2.0	3.0	4.0	NaN	NaN

df2.loc[7,['A','B','C']] = [1,2,3] # 增加一行

df2

	A	B	C	D	E	F
0	-1.0	1.0	2.0	3.0	10.0	1.0
1	4.0	5.0	6.0	7.0	10.0	2.0
2	8.0	9.0	10.0	-1.0	10.0	3.0
3	-1.0	-1.0	-1.0	-1.0	10.0	4.0
4	-1.0	-1.0	-1.0	-1.0	10.0	5.0
5	-1.0	-1.0	-1.0	-1.0	10.0	6.0
6	1.0	2.0	3.0	4.0	NaN	NaN
7	1.0	2.0	3.0	NaN	NaN	NaN

s2 = pd.Series([1,2,4,5,6],index=['A','B','C','D','E']) # 增加一行，必须指定name的请客
s2.name = 'wang'
df2.append(s2)

	A	B	C	D	E	F
0	-1.0	1.0	2.0	3.0	10.0	1.0
1	4.0	5.0	6.0	7.0	10.0	2.0
2	8.0	9.0	10.0	-1.0	10.0	3.0
3	-1.0	-1.0	-1.0	-1.0	10.0	4.0
4	-1.0	-1.0	-1.0	-1.0	10.0	5.0
5	-1.0	-1.0	-1.0	-1.0	10.0	6.0
6	1.0	2.0	3.0	4.0	NaN	NaN
7	1.0	2.0	3.0	NaN	NaN	NaN
wang	1.0	2.0	4.0	5.0	6.0	NaN

df1

	A	B	C	D	E	F
2017-01-01	-1	1	2	3	10	1
2017-01-02	4	5	6	7	10	2
2017-01-03	8	9	10	-1	10	3
2017-01-04	-1	-1	-1	-1	10	4
2017-01-05	-1	-1	-1	-1	10	5
2017-01-06	-1	-1	-1	-1	10	6

df1.insert(1,'Z',df2['E']) # 在第一列插入索引为G的df2中的E列

df1

	A	Z	G	B	C	D	E	F
2017-01-01	-1	NaN	NaN	1	2	3	10	1
2017-01-02	4	NaN	NaN	5	6	7	10	2
2017-01-03	8	NaN	NaN	9	10	-1	10	3
2017-01-04	-1	NaN	NaN	-1	-1	-1	10	4
2017-01-05	-1	NaN	NaN	-1	-1	-1	10	5
2017-01-06	-1	NaN	NaN	-1	-1	-1	10	6

df2['E']

0    10.0
1    10.0
2    10.0
3    10.0
4    10.0
5    10.0
6     NaN
7     NaN
Name: E, dtype: float64

g = df1.pop('G') # 弹出G列
g

2017-01-01   NaN
2017-01-02   NaN
2017-01-03   NaN
2017-01-04   NaN
2017-01-05   NaN
2017-01-06   NaN
Freq: D, Name: G, dtype: float64

del df1['Z'] # 删除Z列

df1

	A	B	C	D	E	F
2017-01-01	-1	1	2	3	10	1
2017-01-02	4	5	6	7	10	2
2017-01-03	8	9	10	-1	10	3
2017-01-04	-1	-1	-1	-1	10	4
2017-01-05	-1	-1	-1	-1	10	5
2017-01-06	-1	-1	-1	-1	10	6

df2 = df1.drop(['A','B'],axis=1)#删除AB列

df2

	C	D	E	F
2017-01-01	2	3	10	1
2017-01-02	6	7	10	2
2017-01-03	10	-1	10	3
2017-01-04	-1	-1	10	4
2017-01-05	-1	-1	10	5
2017-01-06	-1	-1	10	6

df1

	A	B	C	D	E	F
2017-01-01	-1	1	2	3	10	1
2017-01-02	4	5	6	7	10	2
2017-01-03	8	9	10	-1	10	3
2017-01-04	-1	-1	-1	-1	10	4
2017-01-05	-1	-1	-1	-1	10	5
2017-01-06	-1	-1	-1	-1	10	6

四、处理丢失数据

dropna
fillna
isnull
np.any
np.all

dates = np.arange(20170101,20170105)
df1 = pd.DataFrame(np.arange(12).reshape(4,3),index=dates,columns=['A','B','C'])
df1

	A	B	C
20170101	0	1	2
20170102	3	4	5
20170103	6	7	8
20170104	9	10	11

df2 = pd.DataFrame(df1,index=dates,columns=['A','B','C','D','E'])
df2

	A	B	C	D	E
20170101	0	1	2	NaN	NaN
20170102	3	4	5	NaN	NaN
20170103	6	7	8	NaN	NaN
20170104	9	10	11	NaN	NaN

s1 = pd.Series([3,4,6],index=dates[:3])
s2 = pd.Series([32,5,2],index=dates[1:])
df2['D'] = s1
df2['E'] = s2
df2

	A	B	C	D	E
20170101	0	1	2	3.0	NaN
20170102	3	4	5	4.0	32.0
20170103	6	7	8	6.0	5.0
20170104	9	10	11	NaN	2.0

df2.dropna(axis=0,how='any') #axis=[0,1] 0代表行，1代表列。how=['any','all'] any任意一个或多个 all全部

	A	B	C	D	E
20170102	3	4	5	4.0	32.0
20170103	6	7	8	6.0	5.0

df2.dropna(axis=0,how='all')

	A	B	C	D	E
20170101	0	1	2	3.0	NaN
20170102	3	4	5	4.0	32.0
20170103	6	7	8	6.0	5.0
20170104	9	10	11	NaN	2.0

df2.dropna(axis=1,how='any')

	A	B	C
20170101	0	1	2
20170102	3	4	5
20170103	6	7	8
20170104	9	10	11

df2.dropna(axis=1,how='all')  # 当一行所有的值为空的时候就删除这列

	A	B	C	D	E
20170101	0	1	2	3.0	NaN
20170102	3	4	5	4.0	32.0
20170103	6	7	8	6.0	5.0
20170104	9	10	11	NaN	2.0

df2.fillna(value=0) # 空值赋为0

	A	B	C	D	E
20170101	0	1	2	3.0	0.0
20170102	3	4	5	4.0	32.0
20170103	6	7	8	6.0	5.0
20170104	9	10	11	0.0	2.0

df2.isnull() # 查看空值

	A	B	C	D	E
20170101	False	False	False	False	True
20170102	False	False	False	False	False
20170103	False	False	False	False	False
20170104	False	False	False	True	False

np.any(df2.isnull()) # 只要有一个或多个空值就会返回True

True

np.all(df2.isnull()) # 所有为空值才返回NTrue

False

五、读取并写入文件

read
to

file = pd.read_csv('../people.csv',encoding='gbk')
file

	地点	名字	职位	工资	在职情况
0	北京	小红	渠道合作经理	15000	在职
1	北京	小明	行政专员	8000	离职
2	北京	小白	行政专员	9000	在职
3	上海	小绿	商户经理	12000	在职
4	上海	小黄	商户经理	10000	离职
5	上海	小黑	团队长	12000	在职
6	广州	小紫	渠道合作主管	20000	在职
7	广州	小粉	渠道合作主管	20000	在职
8	广州	小青	经理	10000	离职
9	广州	小蓝	团队长	13000	在职

file.loc[1,'地点'] = '志哥'

file

	地点	名字	职位	工资	在职情况
0	北京	小红	渠道合作经理	15000	在职
1	志哥	小明	行政专员	8000	离职
2	北京	小白	行政专员	9000	在职
3	上海	小绿	商户经理	12000	在职
4	上海	小黄	商户经理	10000	离职
5	上海	小黑	团队长	12000	在职
6	广州	小紫	渠道合作主管	20000	在职
7	广州	小粉	渠道合作主管	20000	在职
8	广州	小青	经理	10000	离职
9	广州	小蓝	团队长	13000	在职

file.to_csv('peo2.csv')

六、concat拼接

0行1列 outer:默认，缺少部分填空 inner：缺少部分去掉 ignore_index=True不考虑原来的index

df1 = pd.DataFrame(np.arange(12).reshape((3,4)),columns=['a','b','c','d'])
df2 = pd.DataFrame(np.arange(12,24).reshape((3,4)),columns=['a','b','c','d'])
df3 = pd.DataFrame(np.arange(24,36).reshape((3,4)),columns=['a','b','c','d'])
print(df1)
print(df2)
print(df3)

a  b   c   d
0  0  1   2   3
1  4  5   6   7
2  8  9  10  11
    a   b   c   d
0  12  13  14  15
1  16  17  18  19
2  20  21  22  23
    a   b   c   d
0  24  25  26  27
1  28  29  30  31
2  32  33  34  35

df4 = pd.concat([df1,df2,df3],axis=0)#纵向合并
df4

	a	b	c	d
0	0	1	2	3
1	4	5	6	7
2	8	9	10	11
0	12	13	14	15
1	16	17	18	19
2	20	21	22	23
0	24	25	26	27
1	28	29	30	31
2	32	33	34	35

df4 = pd.concat([df1,df2,df3],axis=0,ignore_index=True)#纵向合并，不考虑原来的index
df4

	a	b	c	d
0	0	1	2	3
1	4	5	6	7
2	8	9	10	11
3	12	13	14	15
4	16	17	18	19
5	20	21	22	23
6	24	25	26	27
7	28	29	30	31
8	32	33	34	35

df5 = pd.concat([df1,df2,df3],axis=1)#横向合并
df5

	a	b	c	d	a	b	c	d	a	b	c	d
0	0	1	2	3	12	13	14	15	24	25	26	27
1	4	5	6	7	16	17	18	19	28	29	30	31
2	8	9	10	11	20	21	22	23	32	33	34	35

df1 = pd.DataFrame(np.arange(12).reshape((3,4)),columns=['a','b','c','f'])
df2 = pd.DataFrame(np.arange(12,24).reshape((3,4)),columns=['a','c','d','e'])
print(df1)
print(df2)

a  b   c   f
0  0  1   2   3
1  4  5   6   7
2  8  9  10  11
    a   c   d   e
0  12  13  14  15
1  16  17  18  19
2  20  21  22  23

df6 = pd.concat([df1,df2],join='outer',ignore_index=True)#合并两个表，缺少的部分填充NaN
df6

	a	b	c	f	d	e
0	0	1.0	2	3.0	NaN	NaN
1	4	5.0	6	7.0	NaN	NaN
2	8	9.0	10	11.0	NaN	NaN
3	12	NaN	13	NaN	14.0	15.0
4	16	NaN	17	NaN	18.0	19.0
5	20	NaN	21	NaN	22.0	23.0

df7 = pd.concat([df1,df2],join='inner',ignore_index=True)#合并两个表，缺少的部分去掉
df7

	a	c
0	0	2
1	4	6
2	8	10
3	12	13
4	16	17
5	20	21

df1 = pd.DataFrame(np.arange(12).reshape((3,4)),columns=['a','b','c','f'])
df2 = pd.DataFrame(np.arange(12,24).reshape((4,3)),columns=['a','c','d'])
print(df1)
print(df2)

a  b   c   f
0  0  1   2   3
1  4  5   6   7
2  8  9  10  11
    a   c   d
0  12  13  14
1  15  16  17
2  18  19  20
3  21  22  23

df8 = pd.concat([df1,df2],axis=0)#纵向向合并，index使用df1的index
df8

	a	b	c	f	d
0	0	1.0	2	3.0	NaN
1	4	5.0	6	7.0	NaN
2	8	9.0	10	11.0	NaN
0	12	NaN	13	NaN	14.0
1	15	NaN	16	NaN	17.0
2	18	NaN	19	NaN	20.0
3	21	NaN	22	NaN	23.0

df8 = pd.concat([df1,df2],axis=1)#横向合并
df8

	a	b	c	f	a	c	d
0	0.0	1.0	2.0	3.0	12	13	14
1	4.0	5.0	6.0	7.0	15	16	17
2	8.0	9.0	10.0	11.0	18	19	20
3	NaN	NaN	NaN	NaN	21	22	23

七、merge合并

outter、inner（默认）、left、right
suffixes 为columns名增加后缀
left_index right_index 左index索引和右index索引拼接
indicator 显示merge信息

left = pd.DataFrame({'key':['K0','K1','K2','K3'],
                     'A':['A0','A1','A2','A3'],
                     'B':['B0','B1','B2','B3']})

right = pd.DataFrame({'key':['K0','K1','K2','K3'],
                     'C':['C0','C1','C2','C3'],
                     'D':['D0','D1','D2','D3']})

print(left)
print(right)

key   A   B
0  K0  A0  B0
1  K1  A1  B1
2  K2  A2  B2
3  K3  A3  B3
  key   C   D
0  K0  C0  D0
1  K1  C1  D1
2  K2  C2  D2
3  K3  C3  D3

res = pd.merge(left,right,on='key')
res

	key	A	B	C	D
0	K0	A0	B0	C0	D0
1	K1	A1	B1	C1	D1
2	K2	A2	B2	C2	D2
3	K3	A3	B3	C3	D3

pd.concat([left,right],axis=1)

	key	A	B	key	C	D
0	K0	A0	B0	K0	C0	D0
1	K1	A1	B1	K1	C1	D1
2	K2	A2	B2	K2	C2	D2
3	K3	A3	B3	K3	C3	D3

left = pd.DataFrame({'key1':['K0','K0','K1','K2'],
                     'key2':['K0','K1','K0','K1'],
                     'A':['A0','A1','A2','A3'],
                     'B':['B0','B1','B2','B3']})

right = pd.DataFrame({'key1':['K0','K1','K1','K3'],
                      'key2':['K0','K0','K0','K0'],
                     'C':['C0','C1','C2','C3'],
                     'D':['D0','D1','D2','D3']})

print(left)
print(right)

key1 key2   A   B
0   K0   K0  A0  B0
1   K0   K1  A1  B1
2   K1   K0  A2  B2
3   K2   K1  A3  B3
  key1 key2   C   D
0   K0   K0  C0  D0
1   K1   K0  C1  D1
2   K1   K0  C2  D2
3   K3   K0  C3  D3

#how = ['left','right','inner','outer']
res = pd.merge(left,right,on=['key1','key2'],how='outer')#how默认inner
res

	key1	key2	A	B	C	D
0	K0	K0	A0	B0	C0	D0
1	K0	K1	A1	B1	NaN	NaN
2	K1	K0	A2	B2	C1	D1
3	K1	K0	A2	B2	C2	D2
4	K2	K1	A3	B3	NaN	NaN
5	K3	K0	NaN	NaN	C3	D3

#how = ['left','right','inner','outer']
res = pd.merge(left,right,on=['key1','key2'],how='inner')#how默认inner
res

	key1	key2	A	B	C	D
0	K0	K0	A0	B0	C0	D0
1	K1	K0	A2	B2	C1	D1
2	K1	K0	A2	B2	C2	D2

#how = ['left','right','inner','outer']
res = pd.merge(left,right,on=['key1','key2'],how='left')#how默认inner
res

	key1	key2	A	B	C	D
0	K0	K0	A0	B0	C0	D0
1	K0	K1	A1	B1	NaN	NaN
2	K1	K0	A2	B2	C1	D1
3	K1	K0	A2	B2	C2	D2
4	K2	K1	A3	B3	NaN	NaN

#how = ['left','right','inner','outer']
res = pd.merge(left,right,on=['key1','key2'],how='outer',indicator=True)#显示merge信息
res

	key1	key2	A	B	C	D	_merge
0	K0	K0	A0	B0	C0	D0	both
1	K0	K1	A1	B1	NaN	NaN	left_only
2	K1	K0	A2	B2	C1	D1	both
3	K1	K0	A2	B2	C2	D2	both
4	K2	K1	A3	B3	NaN	NaN	left_only
5	K3	K0	NaN	NaN	C3	D3	right_only

#how = ['left','right','inner','outer']
res = pd.merge(left,right,on=['key1','key2'],how='outer',indicator='indicator_column')#显示merge信息
res

	key1	key2	A	B	C	D	indicator_column
0	K0	K0	A0	B0	C0	D0	both
1	K0	K1	A1	B1	NaN	NaN	left_only
2	K1	K0	A2	B2	C1	D1	both
3	K1	K0	A2	B2	C2	D2	both
4	K2	K1	A3	B3	NaN	NaN	left_only
5	K3	K0	NaN	NaN	C3	D3	right_only

left = pd.DataFrame({'A':['A0','A1','A2'],
                     'B':['B0','B1','B2']},
                     index = ['K0','K1','K2'])
right = pd.DataFrame({'C':['C0','C2','C3'],
                      'D':['D0','D2','D3']},
                      index=['K0','K2','K3'])
print(left)
print(right)

A   B
K0  A0  B0
K1  A1  B1
K2  A2  B2
     C   D
K0  C0  D0
K2  C2  D2
K3  C3  D3

res = pd.merge(left,right,left_index=True,right_index=True,how='outer')
res

	A	B	C	D
K0	A0	B0	C0	D0
K1	A1	B1	NaN	NaN
K2	A2	B2	C2	D2
K3	NaN	NaN	C3	D3

boys = pd.DataFrame({'k':['K0','K1','K2'],'age':[1,2,3]})

girls = pd.DataFrame({'k':['K0','K0','K3'],'age':[4,5,6]})

print(boys)
print(girls)

# suffixes 第一张表格签名加_boy 后面加_girl
res = pd.merge(boys,girls,on='k',suffixes=['_boy','_girl'],how='outer')
res

	k	age_boy	age_girl
0	K0	1.0	4.0
1	K0	1.0	5.0
2	K1	2.0	NaN
3	K2	3.0	NaN
4	K3	NaN	6.0

八、pandas的plot

画二维数据 index和value

import matplotlib.pyplot as plt

data = pd.Series(np.random.randn(1000),index=np.arange(1000)) # np.random.randn(1000):返回长度为1000 范围[0,1)的数组
data = data.cumsum() # 累计求和
data.plot()
plt.show()
print(data.head())

Python进阶—Pandas_多列

0    1.034952
1    1.375026
2    1.193759
3    1.519312
4    1.499621
dtype: float64

data = pd.DataFrame(np.random.randn(1000,4),index=np.arange(1000),columns=['A','B','C','D'])
data = data.cumsum()
print(data.head()) # 显示前五条

A         B         C         D
0  0.026638 -0.482280 -1.087122  0.081180
1 -0.855294  0.472860 -1.712153 -0.122494
2 -2.542649  0.781326 -1.618599 -1.055795
3 -2.540139  1.202146  0.936892 -2.537980
4 -1.896418  0.601826  1.585485 -2.012536

data.plot()
plt.show()

Python进阶—Pandas_开发语言_02

ax = data.plot.scatter(x='A',y='B',color='Blue',label='class 1')
data.plot.scatter(x='A',y='C',color='Green',label='class 2',ax=ax) # ax=ax 放一个图里
plt.show()

Python进阶—Pandas_多列_03

标签：index,01,进阶,Python,df1,df2,K0,pd,Pandas
From： https://blog.51cto.com/u_14608932/5976372

文章目录