标签:03 04 ... pdata 2019 80 pandas
1.pandas基础
1.1 Series
import pandas as pd
import numpy as np
创建
## 1.创建Series对象
sdata = pd.Series(np.arange(1,4), index=list('abc'))
sdata
a 1
b 2
c 3
dtype: int32
访问
## 2.Series对象访问
# 默认数字索引
print(sdata.iloc[0])
# 使用标签[a,b,c]
print(sdata['b'])
# 使用loc方式,只能使用标签
print(sdata.loc['c'])
1
2
3
## 3.获取index与value
print(sdata.index)
print(sdata.values)
Index(['a', 'b', 'c'], dtype='object')
[1 2 3]
## 4.index与value转成列表
# 将索引转成列表
sdata.index.tolist()
sdata.values.tolist()
[1, 2, 3]
# series对象访问
for item in sdata.items():
print(item)
('a', 1)
('b', 2)
('c', 3)
1.2 dataframe
创建
# 一维数据
pd.DataFrame(data=np.arange(1,4))
# 多维数据 data为4X4
data = np.arange(16).reshape(4,4)
pd.DataFrame(data=data)
|
0 |
1 |
2 |
3 |
0 |
0 |
1 |
2 |
3 |
1 |
4 |
5 |
6 |
7 |
2 |
8 |
9 |
10 |
11 |
3 |
12 |
13 |
14 |
15 |
# 设置index与columns
data = np.arange(16).reshape(4,4)
pdata = pd.DataFrame(data=data, index=list('abcd'), columns=['c1','c2','c3','c4'])
pdata
|
c1 |
c2 |
c3 |
c4 |
a |
0 |
1 |
2 |
3 |
b |
4 |
5 |
6 |
7 |
c |
8 |
9 |
10 |
11 |
d |
12 |
13 |
14 |
15 |
# 设置index与columns
data = {'c1':[1,2,3], 'c2':[4,5,6]}
pdata = pd.DataFrame(data=data)
pdata
# 设置列标签
pdata.columns = ['t1','t2']
pdata
访问
# 取一列,返回Series对象
pdata['t1']
0 1
1 2
2 3
Name: t1, dtype: int64
# 取多列数据
pdata[['t1','t2']]
loc操作
# 获取第一行
pdata.loc[0]
t1 1
t2 4
Name: 0, dtype: int64
# 获取第一、二行指定t1,t2列
pdata.loc[[0,1],['t1','t2']]
DataFrame修改
data = {'c1':[1,2,3], 'c2':[4,5,6], 'c3':[7,8,9]}
pdata = pd.DataFrame(data=data)
pdata
|
c1 |
c2 |
c3 |
0 |
1 |
4 |
7 |
1 |
2 |
5 |
8 |
2 |
3 |
6 |
9 |
# 修改c1列值
pdata['c1'] = 0
pdata
|
c1 |
c2 |
c3 |
0 |
0 |
4 |
7 |
1 |
0 |
5 |
8 |
2 |
0 |
6 |
9 |
# 插入新列
pdata['c4'] = [1,1,1]
pdata
|
c1 |
c2 |
c3 |
c4 |
0 |
0 |
4 |
7 |
1 |
1 |
0 |
5 |
8 |
1 |
2 |
0 |
6 |
9 |
1 |
# 插入行
pdata.loc[3] = [2,2,2,2]
pdata
|
c1 |
c2 |
c3 |
c4 |
0 |
0 |
4 |
7 |
1 |
1 |
0 |
5 |
8 |
1 |
2 |
0 |
6 |
9 |
1 |
3 |
2 |
2 |
2 |
2 |
2 pandas数据导入与保存
数据导入
pd.read_csv(filepath, sep=',', delimiter=None, header='infer', names=None, index_col=None, ...) #读取CSV文件
pd.read_excel(filepath,names=None, index_col=None, usecols=None, ...) #读取Excel文件
pd.read_json(path, orient=None, dtype=None, ...) #读取JSON文件
fpath = r"D:\AI实践课\pokemon.csv"
df = pd.read_csv(fpath,names = ['妖怪','姓名','属性1','属性2','种族值','体力','物攻','防御','特攻','特防','速度'],header = 0)
df
|
妖怪 |
姓名 |
属性1 |
属性2 |
种族值 |
体力 |
物攻 |
防御 |
特攻 |
特防 |
速度 |
0 |
1 |
Bulbasaur |
Grass |
Poison |
318 |
45 |
49 |
49 |
65 |
65 |
45 |
1 |
2 |
Ivysaur |
Grass |
Poison |
405 |
60 |
62 |
63 |
80 |
80 |
60 |
2 |
3 |
Venusaur |
Grass |
Poison |
525 |
80 |
82 |
83 |
100 |
100 |
80 |
3 |
3 |
VenusaurMega Venusaur |
Grass |
Poison |
625 |
80 |
100 |
123 |
122 |
120 |
80 |
4 |
4 |
Charmander |
Fire |
NaN |
309 |
39 |
52 |
43 |
60 |
50 |
65 |
... |
... |
... |
... |
... |
... |
... |
... |
... |
... |
... |
... |
795 |
719 |
Diancie |
Rock |
Fairy |
600 |
50 |
100 |
150 |
100 |
150 |
50 |
796 |
719 |
DiancieMega Diancie |
Rock |
Fairy |
700 |
50 |
160 |
110 |
160 |
110 |
110 |
797 |
720 |
HoopaHoopa Confined |
Psychic |
Ghost |
600 |
80 |
110 |
60 |
150 |
130 |
70 |
798 |
720 |
HoopaHoopa Unbound |
Psychic |
Dark |
680 |
80 |
160 |
60 |
170 |
130 |
80 |
799 |
721 |
Volcanion |
Fire |
Water |
600 |
80 |
110 |
120 |
130 |
90 |
70 |
800 rows × 11 columns
# 无表头指定None
pdata = pd.read_csv(fpath,header=None)
pdata
|
0 |
1 |
2 |
3 |
4 |
5 |
6 |
7 |
8 |
9 |
10 |
0 |
# |
Name |
Type 1 |
Type 2 |
Total |
HP |
Attack |
Defense |
Sp. Atk |
Sp. Def |
Speed |
1 |
1 |
Bulbasaur |
Grass |
Poison |
318 |
45 |
49 |
49 |
65 |
65 |
45 |
2 |
2 |
Ivysaur |
Grass |
Poison |
405 |
60 |
62 |
63 |
80 |
80 |
60 |
3 |
3 |
Venusaur |
Grass |
Poison |
525 |
80 |
82 |
83 |
100 |
100 |
80 |
4 |
3 |
VenusaurMega Venusaur |
Grass |
Poison |
625 |
80 |
100 |
123 |
122 |
120 |
80 |
... |
... |
... |
... |
... |
... |
... |
... |
... |
... |
... |
... |
796 |
719 |
Diancie |
Rock |
Fairy |
600 |
50 |
100 |
150 |
100 |
150 |
50 |
797 |
719 |
DiancieMega Diancie |
Rock |
Fairy |
700 |
50 |
160 |
110 |
160 |
110 |
110 |
798 |
720 |
HoopaHoopa Confined |
Psychic |
Ghost |
600 |
80 |
110 |
60 |
150 |
130 |
70 |
799 |
720 |
HoopaHoopa Unbound |
Psychic |
Dark |
680 |
80 |
160 |
60 |
170 |
130 |
80 |
800 |
721 |
Volcanion |
Fire |
Water |
600 |
80 |
110 |
120 |
130 |
90 |
70 |
801 rows × 11 columns
# 导入指定表头
pdata = pd.read_csv(fpath,header=1)
pdata
|
1 |
Bulbasaur |
Grass |
Poison |
318 |
45 |
49 |
49.1 |
65 |
65.1 |
45.1 |
0 |
2 |
Ivysaur |
Grass |
Poison |
405 |
60 |
62 |
63 |
80 |
80 |
60 |
1 |
3 |
Venusaur |
Grass |
Poison |
525 |
80 |
82 |
83 |
100 |
100 |
80 |
2 |
3 |
VenusaurMega Venusaur |
Grass |
Poison |
625 |
80 |
100 |
123 |
122 |
120 |
80 |
3 |
4 |
Charmander |
Fire |
NaN |
309 |
39 |
52 |
43 |
60 |
50 |
65 |
4 |
5 |
Charmeleon |
Fire |
NaN |
405 |
58 |
64 |
58 |
80 |
65 |
80 |
... |
... |
... |
... |
... |
... |
... |
... |
... |
... |
... |
... |
794 |
719 |
Diancie |
Rock |
Fairy |
600 |
50 |
100 |
150 |
100 |
150 |
50 |
795 |
719 |
DiancieMega Diancie |
Rock |
Fairy |
700 |
50 |
160 |
110 |
160 |
110 |
110 |
796 |
720 |
HoopaHoopa Confined |
Psychic |
Ghost |
600 |
80 |
110 |
60 |
150 |
130 |
70 |
797 |
720 |
HoopaHoopa Unbound |
Psychic |
Dark |
680 |
80 |
160 |
60 |
170 |
130 |
80 |
798 |
721 |
Volcanion |
Fire |
Water |
600 |
80 |
110 |
120 |
130 |
90 |
70 |
799 rows × 11 columns
数据保存
pdata.to_csv(path_or_buf=None, sep=',', ...)
pdata.to_excel(excel_writer, sheet_name='Sheet1', na_rep='', ...)
pdata.to_json(path_or_buf=None, orient=None, ...)
3 缺失数据处理
s1 = [10, 10.5, None, 11]
s2 = [7, 6.9,7.5,None]
df = pd.DataFrame({'s1':s1, 's2':s2})
df
|
s1 |
s2 |
0 |
10.0 |
7.0 |
1 |
10.5 |
6.9 |
2 |
NaN |
7.5 |
3 |
11.0 |
NaN |
缺失值判断
s1 = [10, 10.5, None, 11]
s2 = [7, 6.9,7.5,None]
pdata = pd.DataFrame({'s1':s1, 's2':s2})
pd.isnull(pdata)
|
s1 |
s2 |
0 |
False |
False |
1 |
False |
False |
2 |
True |
False |
3 |
False |
True |
# pd.notnull,若包含缺省值,缺省值对应值为False
# np.all:若对象中包含假,返回False, 否则返回真
np.all(pd.notnull(pdata))
# 返回False, 说明包含缺省值,否则不包含缺省值
False
# isnull:缺省值对应值为True
# any:对象中包含真,返回True
np.any(pd.isnull(pdata))
# 返回False,说明不含缺省值,返回True说明包括缺省值
True
缺省值处理方式
1 缺省值过滤
bindex = np.all(pdata.notnull(), axis=1)
bindex
0 True
1 True
2 False
3 False
dtype: bool
pdata[bindex]
|
s1 |
s2 |
0 |
10.0 |
7.0 |
1 |
10.5 |
6.9 |
2 删除缺省值
pdata.dropna(axis=0, how='any', thresh=None, subset=None, inplace=False)
- axis : 0 或 'index':按行操作,1 或 'columns':按列操作
- how :根据 axis 指定操作方式,any:只要有一个 Na 就删除,all:全部为 Na 才删除
- thresh: 指定非 Na 值的数量,非 Na 数量大于等于 thresh 时不删除
- subset: 指定操作的列子集
- inplace: True:在原始数据中进行修改
s1 = [10, 10.5, None, 11]
s2 = [7, 6.9,7.5,None]
s3 = [7, 6.9,7.5,7]
s4 = [None, 6.9,None,7.2]
pdata = pd.DataFrame({'s1':s1, 's2':s2, 's3':s3,'s4':s4})
pdata
|
s1 |
s2 |
s3 |
s4 |
0 |
10.0 |
7.0 |
7.0 |
NaN |
1 |
10.5 |
6.9 |
6.9 |
6.9 |
2 |
NaN |
7.5 |
7.5 |
NaN |
3 |
11.0 |
NaN |
7.0 |
7.2 |
# 删除包含缺省值行
pdata.dropna()
|
s1 |
s2 |
s3 |
s4 |
1 |
10.5 |
6.9 |
6.9 |
6.9 |
# 缺省值数量大于1,thresh设置为3
# thresh:指定非Na数量(非Na数量>=thresh,不删除)
pdata.dropna(thresh=3)
|
s1 |
s2 |
s3 |
s4 |
0 |
10.0 |
7.0 |
7.0 |
NaN |
1 |
10.5 |
6.9 |
6.9 |
6.9 |
3 |
11.0 |
NaN |
7.0 |
7.2 |
# 指定列:['s1','s4']
pdata.dropna(subset=['s1','s4'])
|
s1 |
s2 |
s3 |
s4 |
1 |
10.5 |
6.9 |
6.9 |
6.9 |
3 |
11.0 |
NaN |
7.0 |
7.2 |
# 删除包含缺省值列
pdata.dropna(axis=1)
|
s3 |
0 |
7.0 |
1 |
6.9 |
2 |
7.5 |
3 |
7.0 |
3 缺失值填充
pdata.fillna(value=None, method=None, axis=None, inplace=False, limit=None, downcast=None, **kwargs)
- value : 填充值
- method : 填充方式:{'backfill', 'bfill', 'pad', 'ffill', None}
- axis : 指定行列:0 或 'index' 表示按行,1 或 'columns' 表示按列
- limit : 插入数量限制
pdata
|
s1 |
s2 |
s3 |
s4 |
0 |
10.0 |
7.0 |
7.0 |
NaN |
1 |
10.5 |
6.9 |
6.9 |
6.9 |
2 |
NaN |
7.5 |
7.5 |
NaN |
3 |
11.0 |
NaN |
7.0 |
7.2 |
# 固定值0
pdata.fillna(0)
|
s1 |
s2 |
s3 |
s4 |
0 |
10.0 |
7.0 |
7.0 |
0.0 |
1 |
10.5 |
6.9 |
6.9 |
6.9 |
2 |
0.0 |
7.5 |
7.5 |
0.0 |
3 |
11.0 |
0.0 |
7.0 |
7.2 |
# 使用前填充 (ffill)
pdata.ffill()
|
s1 |
s2 |
s3 |
s4 |
0 |
10.0 |
7.0 |
7.0 |
NaN |
1 |
10.5 |
6.9 |
6.9 |
6.9 |
2 |
10.5 |
7.5 |
7.5 |
6.9 |
3 |
11.0 |
7.5 |
7.0 |
7.2 |
# 使用后填充 (bfill),行
pdata.bfill()
|
s1 |
s2 |
s3 |
s4 |
0 |
10.0 |
7.0 |
7.0 |
6.9 |
1 |
10.5 |
6.9 |
6.9 |
6.9 |
2 |
11.0 |
7.5 |
7.5 |
7.2 |
3 |
11.0 |
NaN |
7.0 |
7.2 |
# 使用后填充 (bfill),列
pdata.bfill(axis=1)
|
s1 |
s2 |
s3 |
s4 |
0 |
10.0 |
7.0 |
7.0 |
NaN |
1 |
10.5 |
6.9 |
6.9 |
6.9 |
2 |
7.5 |
7.5 |
7.5 |
NaN |
3 |
11.0 |
7.0 |
7.0 |
7.2 |
#插入均值
pdata.fillna(pdata.mean())
|
s1 |
s2 |
s3 |
s4 |
0 |
10.0 |
7.000000 |
7.0 |
7.05 |
1 |
10.5 |
6.900000 |
6.9 |
6.90 |
2 |
10.5 |
7.500000 |
7.5 |
7.05 |
3 |
11.0 |
7.133333 |
7.0 |
7.20 |
#插入中位数
pdata.fillna(pdata.median())
|
s1 |
s2 |
s3 |
s4 |
0 |
10.0 |
7.0 |
7.0 |
7.05 |
1 |
10.5 |
6.9 |
6.9 |
6.90 |
2 |
10.5 |
7.5 |
7.5 |
7.05 |
3 |
11.0 |
7.0 |
7.0 |
7.20 |
4 数据清洗
names = list('ABCD')
math = [90,100,50,80]
chinese = [89,96,58,77]
pdata = pd.DataFrame({'name':names, 'math':math, 'chinese':chinese})
pdata
|
name |
math |
chinese |
0 |
A |
90 |
89 |
1 |
B |
100 |
96 |
2 |
C |
50 |
58 |
3 |
D |
80 |
77 |
# 需求1:数学成绩大于80的所有成绩;
bindex = pdata['math'] > 80
pdata[bindex]
|
name |
math |
chinese |
0 |
A |
90 |
89 |
1 |
B |
100 |
96 |
# 需求2:获取同学A的成绩;
pdata[pdata['name']=='A']
|
name |
math |
chinese |
0 |
A |
90 |
89 |
# 需求3:获取数学语文都及格成绩
# 注意:两个条件要加括号
pdata[(pdata['math']>59) & (pdata['chinese']>59)]
|
name |
math |
chinese |
0 |
A |
90 |
89 |
1 |
B |
100 |
96 |
3 |
D |
80 |
77 |
# 需求4:获取数学语文有一门大于等于80分
# 注意:两个条件要加括号
pdata[(pdata['math']>=80) | (pdata['chinese']>=80)]
|
name |
math |
chinese |
0 |
A |
90 |
89 |
1 |
B |
100 |
96 |
3 |
D |
80 |
77 |
# 根据集合获取数据
bindex = pdata['math'].isin([100, 90])
pdata[bindex]
|
name |
math |
chinese |
0 |
A |
90 |
89 |
1 |
B |
100 |
96 |
# 根据索引排序,降序,ascending=False
pdata.sort_index(ascending=False)
|
name |
math |
chinese |
3 |
D |
80 |
77 |
2 |
C |
50 |
58 |
1 |
B |
100 |
96 |
0 |
A |
90 |
89 |
# 根据数学与语文成绩排序,降序,ascending=False
# sort_values中加入两列数据
pdata.sort_values(['math', 'chinese'], ascending=False)
|
name |
math |
chinese |
1 |
B |
100 |
96 |
0 |
A |
90 |
89 |
3 |
D |
80 |
77 |
2 |
C |
50 |
58 |
5 pandas汇总与描述性统计
#最大值,最小值,四分位数,均值,数量,标准差
pdata.describe()
|
math |
chinese |
count |
4.000000 |
4.0000 |
mean |
80.000000 |
80.0000 |
std |
21.602469 |
16.6333 |
min |
50.000000 |
58.0000 |
25% |
72.500000 |
72.2500 |
50% |
85.000000 |
83.0000 |
75% |
92.500000 |
90.7500 |
max |
100.000000 |
96.0000 |
#计算每个学生总分,平均分
print(pdata.iloc[:, 1:].sum(axis=1))
print(pdata.iloc[:, 1:].mean(axis=1))
0 179
1 196
2 108
3 157
dtype: int64
0 89.5
1 98.0
2 54.0
3 78.5
dtype: float64
6 索引
pdata
|
name |
math |
chinese |
0 |
A |
90 |
89 |
1 |
B |
100 |
96 |
2 |
C |
50 |
58 |
3 |
D |
80 |
77 |
# 设置索引
# set_index:默认 inplace=False,返回修改后的新副本,原数据不变
ndata = pdata.set_index('name')
ndata
|
math |
chinese |
name |
|
|
A |
90 |
89 |
B |
100 |
96 |
C |
50 |
58 |
D |
80 |
77 |
7 时间与时间序列
#年月日
print(pd.Timestamp(2020, 6, 2))
#字符串
print(pd.Timestamp('2020-05-07'))
#字符串时间
print(pd.Timestamp('2020-05-07 04:02:01'))
#2017-02-01与format对应
print(pd.to_datetime('2017-02-01',format="%Y-%m-%d"))
#20170301与年月日对应
print(pd.to_datetime('20170301',format="%Y%m%d"))
2020-06-02 00:00:00
2020-05-07 00:00:00
2020-05-07 04:02:01
2017-02-01 00:00:00
2017-03-01 00:00:00
# 周期单位为Day
print(pd.date_range('2017-01-01', periods=2))
# 周期单位为hour
print(pd.date_range('2017-01-01 02', periods=2, freq='h'))
# 每个月月初
print(pd.date_range('2017-01', periods=3, freq='MS'))
# 每个月月底
print(pd.date_range('2017-01', periods=3, freq='ME'))
DatetimeIndex(['2017-01-01', '2017-01-02'], dtype='datetime64[ns]', freq='D')
DatetimeIndex(['2017-01-01 02:00:00', '2017-01-01 03:00:00'], dtype='datetime64[ns]', freq='h')
DatetimeIndex(['2017-01-01', '2017-02-01', '2017-03-01'], dtype='datetime64[ns]', freq='MS')
DatetimeIndex(['2017-01-31', '2017-02-28', '2017-03-31'], dtype='datetime64[ns]', freq='ME')
8 数据清洗
ts = ['2019-03-25','2019-03-26','2019-03-26','2019-03-26','2019-03-29','2019-03-30','2019-03-31',
'2019-04-01','2019-04-02','2019-04-03','2019-04-04','2019-05-01','2019-04-05',]
values = np.arange(len(ts))
pdata = pd.DataFrame({'ts':ts, 'values':values})
pdata
|
ts |
values |
0 |
2019-03-25 |
0 |
1 |
2019-03-26 |
1 |
2 |
2019-03-26 |
2 |
3 |
2019-03-26 |
3 |
4 |
2019-03-29 |
4 |
5 |
2019-03-30 |
5 |
6 |
2019-03-31 |
6 |
7 |
2019-04-01 |
7 |
8 |
2019-04-02 |
8 |
9 |
2019-04-03 |
9 |
10 |
2019-04-04 |
10 |
11 |
2019-05-01 |
11 |
12 |
2019-04-05 |
12 |
# 唯一值
pdata.ts.unique()
array(['2019-03-25', '2019-03-26', '2019-03-29', '2019-03-30',
'2019-03-31', '2019-04-01', '2019-04-02', '2019-04-03',
'2019-04-04', '2019-05-01', '2019-04-05'], dtype=object)
# 数值出现次数
pdata.ts.value_counts()
ts
2019-03-26 3
2019-03-25 1
2019-03-29 1
2019-03-30 1
2019-03-31 1
2019-04-01 1
2019-04-02 1
2019-04-03 1
2019-04-04 1
2019-05-01 1
2019-04-05 1
Name: count, dtype: int64
# 删除指定行列
# 删除单行
print(pdata.drop(0))
ts values
1 2019-03-26 1
2 2019-03-26 2
3 2019-03-26 3
4 2019-03-29 4
5 2019-03-30 5
6 2019-03-31 6
7 2019-04-01 7
8 2019-04-02 8
9 2019-04-03 9
10 2019-04-04 10
11 2019-05-01 11
12 2019-04-05 12
# 删除多行
print(pdata.drop(index=[1,2,3]))
ts values
0 2019-03-25 0
4 2019-03-29 4
5 2019-03-30 5
6 2019-03-31 6
7 2019-04-01 7
8 2019-04-02 8
9 2019-04-03 9
10 2019-04-04 10
11 2019-05-01 11
12 2019-04-05 12
# 删除列
print(pdata.drop(columns='ts'))
values
0 0
1 1
2 2
3 3
4 4
5 5
6 6
7 7
8 8
9 9
10 10
11 11
12 12
# 删除index为0值,删除列为ts的值
pdata.drop(index=0, columns='ts')
|
values |
1 |
1 |
2 |
2 |
3 |
3 |
4 |
4 |
5 |
5 |
6 |
6 |
7 |
7 |
8 |
8 |
9 |
9 |
10 |
10 |
11 |
11 |
12 |
12 |
# 去重
# 保留第一次
pdata.drop_duplicates('ts')
|
ts |
values |
0 |
2019-03-25 |
0 |
1 |
2019-03-26 |
1 |
4 |
2019-03-29 |
4 |
5 |
2019-03-30 |
5 |
6 |
2019-03-31 |
6 |
7 |
2019-04-01 |
7 |
8 |
2019-04-02 |
8 |
9 |
2019-04-03 |
9 |
10 |
2019-04-04 |
10 |
11 |
2019-05-01 |
11 |
12 |
2019-04-05 |
12 |
#保留最后一次
pdata.drop_duplicates(subset='ts', keep='last')
|
ts |
values |
0 |
2019-03-25 |
0 |
3 |
2019-03-26 |
3 |
4 |
2019-03-29 |
4 |
5 |
2019-03-30 |
5 |
6 |
2019-03-31 |
6 |
7 |
2019-04-01 |
7 |
8 |
2019-04-02 |
8 |
9 |
2019-04-03 |
9 |
10 |
2019-04-04 |
10 |
11 |
2019-05-01 |
11 |
12 |
2019-04-05 |
12 |
9 pandas数据处理常用函数
apply
n = list('ABCD')
math = [90,80,47,69]
chinese = [95,78,96,59]
nclass = ['001','001','002','002']
df = pd.DataFrame({'name':n,'math':math, 'chinese':chinese,'class':nclass})
df
|
name |
math |
chinese |
class |
0 |
A |
90 |
95 |
001 |
1 |
B |
80 |
78 |
001 |
2 |
C |
47 |
96 |
002 |
3 |
D |
69 |
59 |
002 |
# 修改name名字
name =df['name']
name.apply(lambda x:x+x)
0 AA
1 BB
2 CC
3 DD
Name: name, dtype: object
# 将成绩转成True或者False
df[['math','chinese']].apply(lambda x : x>59)
|
math |
chinese |
0 |
True |
True |
1 |
True |
True |
2 |
False |
True |
3 |
True |
False |
func
def func(value):
    """Print the runtime type of *value*, then return its arithmetic mean."""
    kind = type(value)
    print(kind)  # shows what apply() hands in: one column as a Series
    return np.mean(value)
df[['math','chinese']].apply(func)
<class 'pandas.core.series.Series'>
<class 'pandas.core.series.Series'>
math 71.5
chinese 82.0
dtype: float64
map
df['math'].map(lambda x: 'pass' if x > 59 else 'failed')
0 pass
1 pass
2 failed
3 pass
Name: math, dtype: object
replace
# 将[A,B]替换成*
df.replace(['A','B'], '*')
|
name |
math |
chinese |
class |
0 |
* |
90 |
95 |
001 |
1 |
* |
80 |
78 |
001 |
2 |
C |
47 |
96 |
002 |
3 |
D |
69 |
59 |
002 |
# 一组数据替换
df.replace(list('ABCD'), list('abcd'))
|
name |
math |
chinese |
class |
0 |
a |
90 |
95 |
001 |
1 |
b |
80 |
78 |
001 |
2 |
c |
47 |
96 |
002 |
3 |
d |
69 |
59 |
002 |
# 正则:将所有字母替换成*
df.replace(r'[A-Z]','*', regex=True)
|
name |
math |
chinese |
class |
0 |
* |
90 |
95 |
001 |
1 |
* |
80 |
78 |
001 |
2 |
* |
47 |
96 |
002 |
3 |
* |
69 |
59 |
002 |
10 分组处理
classname = ['001','001','002','002','003','003']
name = ['sun','li','zhou','wang','zao','wu']
height = [169, 172,180,170,165,175]
weights = [61,53,75,64,50,58]
df = pd.DataFrame({'cname':classname, 'user':name, 'height':height, 'weights':weights})
df
|
cname |
user |
height |
weights |
0 |
001 |
sun |
169 |
61 |
1 |
001 |
li |
172 |
53 |
2 |
002 |
zhou |
180 |
75 |
3 |
002 |
wang |
170 |
64 |
4 |
003 |
zao |
165 |
50 |
5 |
003 |
wu |
175 |
58 |
groupby分组
dfg = df.groupby('cname')
dfg.groups
{'001': [0, 1], '002': [2, 3], '003': [4, 5]}
# 分组统计
dfg.count()
|
user |
height |
weights |
cname |
|
|
|
001 |
2 |
2 |
2 |
002 |
2 |
2 |
2 |
003 |
2 |
2 |
2 |
# 根据多列进行分组:
dfg = df.groupby(['cname','height'])
dfg.groups
{('001', 169): [0], ('001', 172): [1], ('002', 170): [3], ('002', 180): [2], ('003', 165): [4], ('003', 175): [5]}
# 统计结果为多级索引
dfg.count()
|
|
user |
weights |
cname |
height |
|
|
001 |
169 |
1 |
1 |
172 |
1 |
1 |
002 |
170 |
1 |
1 |
180 |
1 |
1 |
003 |
165 |
1 |
1 |
175 |
1 |
1 |
标签:03,
04,
...,
pdata,
2019,
80,
pandas
From: https://www.cnblogs.com/1019-Yan/p/18315461