1.什么是异常值
异常值是指明显偏离数据集中其余大多数观测值的值。
# Open the help page for the built-in mtcars data set. Only useful in an
# interactive session, so it is skipped when the script runs in batch mode.
if (interactive()) {
  help("mtcars")
}

# Work on a copy of the built-in data set.
mydata <- mtcars

# Drop columns 8 and 9 (vs and am), keeping the nine remaining variables.
mydata1 <- mydata[, -c(8, 9)]
mydata1
运行结果:
> #删除第8,9列 > mydata1 <- mydata[,-c(8,9)] > mydata1 mpg cyl disp hp drat wt qsec gear carb Mazda RX4 21.0 6 160.0 110 3.90 2.620 16.46 4 4 Mazda RX4 Wag 21.0 6 160.0 110 3.90 2.875 17.02 4 4 Datsun 710 22.8 4 108.0 93 3.85 2.320 18.61 4 1 Hornet 4 Drive 21.4 6 258.0 110 3.08 3.215 19.44 3 1 Hornet Sportabout 18.7 8 360.0 175 3.15 3.440 17.02 3 2 Valiant 18.1 6 225.0 105 2.76 3.460 20.22 3 1 Duster 360 14.3 8 360.0 245 3.21 3.570 15.84 3 4 Merc 240D 24.4 4 146.7 62 3.69 3.190 20.00 4 2 Merc 230 22.8 4 140.8 95 3.92 3.150 22.90 4 2 Merc 280 19.2 6 167.6 123 3.92 3.440 18.30 4 4 Merc 280C 17.8 6 167.6 123 3.92 3.440 18.90 4 4 Merc 450SE 16.4 8 275.8 180 3.07 4.070 17.40 3 3 Merc 450SL 17.3 8 275.8 180 3.07 3.730 17.60 3 3 Merc 450SLC 15.2 8 275.8 180 3.07 3.780 18.00 3 3 Cadillac Fleetwood 10.4 8 472.0 205 2.93 5.250 17.98 3 4 Lincoln Continental 10.4 8 460.0 215 3.00 5.424 17.82 3 4 Chrysler Imperial 14.7 8 440.0 230 3.23 5.345 17.42 3 4 Fiat 128 32.4 4 78.7 66 4.08 2.200 19.47 4 1 Honda Civic 30.4 4 75.7 52 4.93 1.615 18.52 4 2 Toyota Corolla 33.9 4 71.1 65 4.22 1.835 19.90 4 1 Toyota Corona 21.5 4 120.1 97 3.70 2.465 20.01 3 1 Dodge Challenger 15.5 8 318.0 150 2.76 3.520 16.87 3 2 AMC Javelin 15.2 8 304.0 150 3.15 3.435 17.30 3 2 Camaro Z28 13.3 8 350.0 245 3.73 3.840 15.41 3 4 Pontiac Firebird 19.2 8 400.0 175 3.08 3.845 17.05 3 2 Fiat X1-9 27.3 4 79.0 66 4.08 1.935 18.90 4 1 Porsche 914-2 26.0 4 120.3 91 4.43 2.140 16.70 5 2 Lotus Europa 30.4 4 95.1 113 3.77 1.513 16.90 5 2 Ford Pantera L 15.8 8 351.0 264 4.22 3.170 14.50 5 4 Ferrari Dino 19.7 6 145.0 175 3.62 2.770 15.50 5 6 Maserati Bora 15.0 8 301.0 335 3.54 3.570 14.60 5 8 Volvo 142E 21.4 4 121.0 109 4.11 2.780 18.60 4 2
2.识别异常值
2.1 range 查看第一列的取值范围(最大最小值)
# range() returns the minimum and maximum of the first column (mpg).
range(mydata1[, 1])
结果:
> range(mydata1[,1]) [1] 10.4 33.9
2.2 sapply 查看每一列的取值范围(最大最小值)
# sapply() applies range() to every column. The result is a two-row matrix
# (row 1 = minimum, row 2 = maximum) with one column per variable.
sapply(mydata1, range)
结果:
> sapply(mydata1, range) mpg cyl disp hp drat wt qsec gear carb [1,] 10.4 4 71.1 52 2.76 1.513 14.5 3 1 [2,] 33.9 8 472.0 335 4.93 5.424 22.9 5 8
# Add a column "A" with every row set to 10.
mydata1["A"] <- 10
mydata1

# Remove column "A" again (it is the 10th column). Selecting by name instead
# of by a hard-coded position stays correct if the column count ever changes.
mydata1 <- mydata1[, names(mydata1) != "A"]
mydata1
结果:
> #给mydata1添加一列“A” ,全部赋值10 > mydata1["A"] <- 10 > mydata1 mpg cyl disp hp drat wt qsec gear carb A Mazda RX4 21.0 6 160.0 110 3.90 2.620 16.46 4 4 10 Mazda RX4 Wag 21.0 6 160.0 110 3.90 2.875 17.02 4 4 10 Datsun 710 22.8 4 108.0 93 3.85 2.320 18.61 4 1 10 Hornet 4 Drive 21.4 6 258.0 110 3.08 3.215 19.44 3 1 10 Hornet Sportabout 18.7 8 360.0 175 3.15 3.440 17.02 3 2 10 Valiant 18.1 6 225.0 105 2.76 3.460 20.22 3 1 10 Duster 360 14.3 8 360.0 245 3.21 3.570 15.84 3 4 10 Merc 240D 24.4 4 146.7 62 3.69 3.190 20.00 4 2 10 Merc 230 22.8 4 140.8 95 3.92 3.150 22.90 4 2 10 Merc 280 19.2 6 167.6 123 3.92 3.440 18.30 4 4 10 Merc 280C 17.8 6 167.6 123 3.92 3.440 18.90 4 4 10 Merc 450SE 16.4 8 275.8 180 3.07 4.070 17.40 3 3 10 Merc 450SL 17.3 8 275.8 180 3.07 3.730 17.60 3 3 10 Merc 450SLC 15.2 8 275.8 180 3.07 3.780 18.00 3 3 10 Cadillac Fleetwood 10.4 8 472.0 205 2.93 5.250 17.98 3 4 10 Lincoln Continental 10.4 8 460.0 215 3.00 5.424 17.82 3 4 10 Chrysler Imperial 14.7 8 440.0 230 3.23 5.345 17.42 3 4 10 Fiat 128 32.4 4 78.7 66 4.08 2.200 19.47 4 1 10 Honda Civic 30.4 4 75.7 52 4.93 1.615 18.52 4 2 10 Toyota Corolla 33.9 4 71.1 65 4.22 1.835 19.90 4 1 10 Toyota Corona 21.5 4 120.1 97 3.70 2.465 20.01 3 1 10 Dodge Challenger 15.5 8 318.0 150 2.76 3.520 16.87 3 2 10 AMC Javelin 15.2 8 304.0 150 3.15 3.435 17.30 3 2 10 Camaro Z28 13.3 8 350.0 245 3.73 3.840 15.41 3 4 10 Pontiac Firebird 19.2 8 400.0 175 3.08 3.845 17.05 3 2 10 Fiat X1-9 27.3 4 79.0 66 4.08 1.935 18.90 4 1 10 Porsche 914-2 26.0 4 120.3 91 4.43 2.140 16.70 5 2 10 Lotus Europa 30.4 4 95.1 113 3.77 1.513 16.90 5 2 10 Ford Pantera L 15.8 8 351.0 264 4.22 3.170 14.50 5 4 10 Ferrari Dino 19.7 6 145.0 175 3.62 2.770 15.50 5 6 10 Maserati Bora 15.0 8 301.0 335 3.54 3.570 14.60 5 8 10 Volvo 142E 21.4 4 121.0 109 4.11 2.780 18.60 4 2 10 > mydata1 <- mydata1[,-10] > mydata1 mpg cyl disp hp drat wt qsec gear carb Mazda RX4 21.0 6 160.0 110 3.90 2.620 16.46 4 4 Mazda RX4 Wag 21.0 6 160.0 110 3.90 2.875 17.02 4 4 Datsun 710 
22.8 4 108.0 93 3.85 2.320 18.61 4 1 Hornet 4 Drive 21.4 6 258.0 110 3.08 3.215 19.44 3 1 Hornet Sportabout 18.7 8 360.0 175 3.15 3.440 17.02 3 2 Valiant 18.1 6 225.0 105 2.76 3.460 20.22 3 1 Duster 360 14.3 8 360.0 245 3.21 3.570 15.84 3 4 Merc 240D 24.4 4 146.7 62 3.69 3.190 20.00 4 2 Merc 230 22.8 4 140.8 95 3.92 3.150 22.90 4 2 Merc 280 19.2 6 167.6 123 3.92 3.440 18.30 4 4 Merc 280C 17.8 6 167.6 123 3.92 3.440 18.90 4 4 Merc 450SE 16.4 8 275.8 180 3.07 4.070 17.40 3 3 Merc 450SL 17.3 8 275.8 180 3.07 3.730 17.60 3 3 Merc 450SLC 15.2 8 275.8 180 3.07 3.780 18.00 3 3 Cadillac Fleetwood 10.4 8 472.0 205 2.93 5.250 17.98 3 4 Lincoln Continental 10.4 8 460.0 215 3.00 5.424 17.82 3 4 Chrysler Imperial 14.7 8 440.0 230 3.23 5.345 17.42 3 4 Fiat 128 32.4 4 78.7 66 4.08 2.200 19.47 4 1 Honda Civic 30.4 4 75.7 52 4.93 1.615 18.52 4 2 Toyota Corolla 33.9 4 71.1 65 4.22 1.835 19.90 4 1 Toyota Corona 21.5 4 120.1 97 3.70 2.465 20.01 3 1 Dodge Challenger 15.5 8 318.0 150 2.76 3.520 16.87 3 2 AMC Javelin 15.2 8 304.0 150 3.15 3.435 17.30 3 2 Camaro Z28 13.3 8 350.0 245 3.73 3.840 15.41 3 4 Pontiac Firebird 19.2 8 400.0 175 3.08 3.845 17.05 3 2 Fiat X1-9 27.3 4 79.0 66 4.08 1.935 18.90 4 1 Porsche 914-2 26.0 4 120.3 91 4.43 2.140 16.70 5 2 Lotus Europa 30.4 4 95.1 113 3.77 1.513 16.90 5 2 Ford Pantera L 15.8 8 351.0 264 4.22 3.170 14.50 5 4 Ferrari Dino 19.7 6 145.0 175 3.62 2.770 15.50 5 6 Maserati Bora 15.0 8 301.0 335 3.54 3.570 14.60 5 8 Volvo 142E 21.4 4 121.0 109 4.11 2.780 18.60 4 2
2.3 rnorm
# Draw 100 values from the standard normal distribution.
norm_sample <- rnorm(100)
norm_sample

# Plot the kernel density estimate of that same sample, so the printed values
# and the plotted curve describe the same data.
plot(density(norm_sample))
结果:
> rnorm(100) [1] 1.16812126 0.09121299 0.83385038 -0.65646582 -0.29552396 -0.23749601 [7] -1.79031884 -1.11282169 0.19235486 -0.52366489 0.13862362 1.62990541 [13] -1.11707451 -0.49513437 0.79154736 -1.09576397 -0.10932575 -0.01755107 [19] -0.09119959 -1.24834172 1.83782360 0.02472749 -1.56179781 0.04525409 [25] -0.72421202 -0.46173428 0.96779545 -0.38041404 -0.12286390 1.17213849 [31] -1.78280748 -0.76331701 -2.29322659 0.71971508 0.77251970 1.14962475 [37] 2.10222194 -0.91887320 0.43631280 0.29946445 0.06919667 -0.88430606 [43] 0.08755868 0.10892476 -0.45061797 -0.15228235 -1.06998692 0.70049225 [49] 0.99716511 -0.48687140 0.25055817 1.63833960 0.51469283 0.16183685 [55] 1.11752290 0.56776107 0.40052308 1.03374586 0.58174528 0.41615202 [61] -0.54214179 0.71509075 -0.81845411 -1.42947307 0.02506982 0.00746341 [67] 0.89657339 -0.22376070 0.81466225 -0.00293954 -0.89429036 -0.54005125 [73] -0.55493918 -0.48971420 1.00472584 -1.55206657 -1.49556127 0.20762548 [79] -0.31241761 1.49858090 0.38622734 -0.17299046 1.95234367 -0.79700794 [85] -1.13437636 1.07083286 -0.03549790 -1.40575778 0.26609714 1.05151806 [91] -0.56646006 1.53527908 -0.84863214 0.33583422 -0.79183892 -0.16254438 [97] -0.41619916 0.12012751 1.03583217 -1.06057864
# Build a vector of 100 standard-normal draws plus one obvious outlier, 200.
d <- c(rnorm(100), 200)
d
结果:
> #构造一个向量,包含正态矩阵和一个异常值200 > d <- c(rnorm(100),200) > d [1] -0.56001355 1.05238153 0.12994937 0.97432264 -0.60238840 [6] 0.42358562 0.81800149 0.72166480 0.28844043 -1.37637747 [11] 1.57283573 -0.07490066 -0.19501345 -0.38519336 -1.99926797 [16] 0.06252341 0.03724015 -1.01916856 -1.15813221 -0.77049628 [21] 1.03625025 -0.62089178 0.37124752 -0.91627128 -2.11080250 [26] -1.50630384 -0.42341748 -0.02465207 -0.45395521 -0.52911020 [31] -0.89918862 1.80961574 0.49575298 1.11614184 1.44507961 [36] 0.86854770 0.62513437 0.63165574 -0.72413959 -0.19831873 [41] -1.08628031 1.68811785 -1.65809492 -1.02777044 -0.80751298 [46] 0.07285811 -0.84382591 -0.11219811 1.08828834 0.90835285 [51] 0.23240490 0.34246963 0.55302456 1.08317735 -0.86765258 [56] 0.52689518 0.06547722 -1.16916802 0.20016424 1.24468497 [61] 0.63140325 0.76889757 2.16373627 0.01097345 -1.21209642 [66] -1.14192094 1.20751949 -1.21909596 -0.51250581 0.17740712 [71] 0.47884778 -1.54210797 -1.71087851 -1.02963780 -1.70329772 [76] -0.44682489 0.87068263 0.28908129 -2.21313570 -0.04418836 [81] 0.85362812 1.45023516 -1.19358314 0.22766356 -0.46518512 [86] 0.05028882 -2.07874394 -2.06102070 1.25449825 -2.69988603 [91] 0.04762420 -0.25764206 0.27635400 -0.82097771 0.40923734 [96] -0.99378444 2.16360257 -0.31332890 0.69996468 -0.66460736 [101] 200.00000000
2.4 3σ方法找异常值
# 3-sigma rule: flag values more than three standard deviations from the mean.
# Mean of d.
m <- mean(d)
# Standard deviation of d.
s <- sd(d)
# Upper limit: mean + 3 * sd.
u <- m + 3 * s
# Lower limit: mean - 3 * sd.
l <- m - 3 * s
# TRUE marks an outlier: a value below the lower limit or above the upper one.
d < l | d > u
结果:
> #3δ方法找异常值 > #找到数据d的均值 > m <- mean(d) > #找到数据d的标准差 > s <- sd(d) > #计算3倍的标准差 > #上限 > u <- m+3*s > #下限 > l <- m-3*s > #d小于下限l或者d大于上限u,则为异常值 > d<l|d>u [1] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE [13] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE [25] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE [37] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE [49] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE [61] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE [73] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE [85] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE [97] FALSE FALSE FALSE FALSE TRUE(异常值)
# Show the outlier values.
d[d < l | d > u]
# Show the positions of the outliers within d.
which(d < l | d > u)
结果:
> #查看异常值 > d[d<l|d>u] [1] 200 > #查看异常值的位置 > which(d<l|d>u) [1] 101
将上面的 3σ 代码提取为函数(命名为 sigma):在 RStudio 中选中代码片段,点击菜单栏 Code → Extract Function。
#' Find outliers with the 3-sigma rule
#'
#' Reconstructed from the step-by-step 3-sigma code; the extracted definition
#' itself is not shown in this document. NOTE(review): this name masks
#' `stats::sigma()` for the rest of the session.
#'
#' @param d A numeric vector.
#' @return Integer positions of the elements lying more than three standard
#'   deviations away from the mean of `d`.
sigma <- function(d) {
  m <- mean(d)
  s <- sd(d)
  u <- m + 3 * s
  l <- m - 3 * s
  which(d < l | d > u)
}

# Outlier positions in the first column.
sigma(mydata1[, 1])
# Outlier positions for every column.
sapply(mydata1, sigma)
2.5 箱线图分析寻找异常值
# Box plot of the first column (mpg); points drawn beyond the whiskers are
# candidate outliers.
boxplot(mydata1[, 1])
mydata1
结果:
> #箱线图分析寻找异常值 > boxplot(mydata1[,1]) > mydata1 mpg cyl disp hp drat wt qsec gear carb Mazda RX4 21.0 6 160.0 110 3.90 2.620 16.46 4 4 Mazda RX4 Wag 21.0 6 160.0 110 3.90 2.875 17.02 4 4 Datsun 710 22.8 4 108.0 93 3.85 2.320 18.61 4 1 Hornet 4 Drive 21.4 6 258.0 110 3.08 3.215 19.44 3 1 Hornet Sportabout 18.7 8 360.0 175 3.15 3.440 17.02 3 2 Valiant 18.1 6 225.0 105 2.76 3.460 20.22 3 1 Duster 360 14.3 8 360.0 245 3.21 3.570 15.84 3 4 Merc 240D 24.4 4 146.7 62 3.69 3.190 20.00 4 2 Merc 230 22.8 4 140.8 95 3.92 3.150 22.90 4 2 Merc 280 19.2 6 167.6 123 3.92 3.440 18.30 4 4 Merc 280C 17.8 6 167.6 123 3.92 3.440 18.90 4 4 Merc 450SE 16.4 8 275.8 180 3.07 4.070 17.40 3 3 Merc 450SL 17.3 8 275.8 180 3.07 3.730 17.60 3 3 Merc 450SLC 15.2 8 275.8 180 3.07 3.780 18.00 3 3 Cadillac Fleetwood 10.4 8 472.0 205 2.93 5.250 17.98 3 4 Lincoln Continental 10.4 8 460.0 215 3.00 5.424 17.82 3 4 Chrysler Imperial 14.7 8 440.0 230 3.23 5.345 17.42 3 4 Fiat 128 32.4 4 78.7 66 4.08 2.200 19.47 4 1 Honda Civic 30.4 4 75.7 52 4.93 1.615 18.52 4 2 Toyota Corolla 33.9 4 71.1 65 4.22 1.835 19.90 4 1 Toyota Corona 21.5 4 120.1 97 3.70 2.465 20.01 3 1 Dodge Challenger 15.5 8 318.0 150 2.76 3.520 16.87 3 2 AMC Javelin 15.2 8 304.0 150 3.15 3.435 17.30 3 2 Camaro Z28 13.3 8 350.0 245 3.73 3.840 15.41 3 4 Pontiac Firebird 19.2 8 400.0 175 3.08 3.845 17.05 3 2 Fiat X1-9 27.3 4 79.0 66 4.08 1.935 18.90 4 1 Porsche 914-2 26.0 4 120.3 91 4.43 2.140 16.70 5 2 Lotus Europa 30.4 4 95.1 113 3.77 1.513 16.90 5 2 Ford Pantera L 15.8 8 351.0 264 4.22 3.170 14.50 5 4 Ferrari Dino 19.7 6 145.0 175 3.62 2.770 15.50 5 6 Maserati Bora 15.0 8 301.0 335 3.54 3.570 14.60 5 8 Volvo 142E 21.4 4 121.0 109 4.11 2.780 18.60 4 2
#' Find outliers with the box-plot (IQR) rule
#'
#' A value is flagged when it lies below Q1 - coef * IQR or above
#' Q3 + coef * IQR, where Q1 and Q3 are the lower and upper quartiles.
#'
#' @param x A numeric vector. Missing values are ignored when the quartiles
#'   are computed and are never reported as outliers.
#' @param coef Multiplier applied to the interquartile range; defaults to 1.5.
#' @return Integer positions of the outliers in `x` (`integer(0)` if none).
boxout <- function(x, coef = 1.5) {
  # Lower and upper quartiles in one call; names dropped to keep plain numbers.
  quartiles <- quantile(x, probs = c(0.25, 0.75), na.rm = TRUE, names = FALSE)
  # Interquartile range.
  iqr <- quartiles[2] - quartiles[1]
  # Lower and upper fences.
  lower <- quartiles[1] - coef * iqr
  upper <- quartiles[2] + coef * iqr
  # Positions of the values outside the fences.
  which(x < lower | x > upper)
}

# Outlier positions in column 9 (carb).
boxout(mydata1[, 9])
运行结果:
[1] 31
3. 异常值的处理
3.1 删除异常值所在的整行
# Positions of the outliers in column 9 (carb).
outlier_rows <- boxout(mydata1[, 9])
outlier_rows

# Drop the rows that contain an outlier. Indexing with the positions directly
# would keep ONLY the outlier rows; selecting the complement removes them and
# also behaves correctly when there are no outliers at all.
mydata1[setdiff(seq_len(nrow(mydata1)), outlier_rows), ]
3.2 对数变换
log:对取值为正的变量取对数(例如 log(mydata1[, 9])),可以压缩右偏分布的长尾,减弱极端值的影响。
标签:10,No.11,识别,FALSE,mydata1,3.07,3.92,Merc,异常 From: https://www.cnblogs.com/bltstop/p/18686411