<?php
/**
* nginx日志过滤脚本
*/
// Input log (resolved against the current working directory) and the
// intermediate file written by each filter stage. $access5 holds the final
// filtered result of the active pipeline.
$file = 'access.log';
$access1 = './access1.log';
$access2 = './access2.log';
$access3 = './access3.log';
$access4 = './access4.log';
$access5 = './access5.log';
// Target of the optional formatting stage that is currently disabled (see the
// commented-out command below); it is still cleaned up so that an old result
// never lingers next to fresh intermediate files.
$access_res = './access6.log';

// Remove every artefact of a previous run up front. Because the pipeline now
// aborts on the first failing stage, a leftover file (including the final
// $access5) must not survive and be mistaken for the output of this run.
foreach ([$access_res, $access1, $access2, $access3, $access4, $access5] as $staleFile) {
    if (file_exists($staleFile) && !unlink($staleFile)) {
        throw new \RuntimeException(sprintf('Unable to remove stale file "%s".', $staleFile));
    }
}

// Fail early with a clear message instead of letting awk produce empty files.
if (!is_readable($file)) {
    throw new \RuntimeException(sprintf('Input log "%s" does not exist or is not readable.', $file));
}

// Stage 1: drop every line whose first field contains a '-' character.
// NOTE(review): the regex matches '-' anywhere in $1, not only a field that is
// exactly "-" — confirm this is the intended rule.
$command1 = '/bin/awk \'{if($1 !~ /-/) print $0}\' '
    . escapeshellarg($file) . ' > ' . escapeshellarg($access1);

// Stage 2: split each line on the literal text '+0800]' and keep only the part
// after it. The UTC offset is hard-coded, so a line without '+0800]' yields an
// empty line here (and is then discarded by stage 3).
$command2 = '/bin/awk -F \'+0800]\' \'{print $2}\' '
    . escapeshellarg($access1) . ' > ' . escapeshellarg($access2);

// Stage 3: keep lines whose second whitespace-separated field matches
// content/<digits> (presumably the request path — verify against the log format).
$command3 = '/bin/awk -F \' \' \'{if($2 ~ /content\/[0-9]+/) print $0}\' '
    . escapeshellarg($access2) . ' > ' . escapeshellarg($access3);

// Stage 4: drop lines that mention a known crawler / search-engine bot anywhere.
$command4 = '/bin/awk \'{if($0 !~ ' .
    '/Googlebot|dotbot|spider|Baiduspider|python-requests|Sogou|bingbot|ToutiaoSpider|Spider|BLEXBot|MJ12bot|Yahoo|AhrefsBot|grapeshot|archive|seokicks|linkdex|SinaWeiboBot/' .
    ') print $0}\' ' . escapeshellarg($access3) . ' > ' . escapeshellarg($access4);

// Stage 5: keep lines whose fourth field matches 200, i.e. drop failed requests
// (presumably the HTTP status column — verify against the log format).
$command5 = '/bin/awk \'{if($4 ~ /200/ ) print $0}\' '
    . escapeshellarg($access4) . ' > ' . escapeshellarg($access5);

// Disabled formatting stage that would write $access_res:
// $command = '/bin/awk -F \'[ /]\' \'{print $5,"\t",$0}\' ' . $access4 . ' > ' . $access_res;

// Run the stages in order; each one reads the file written by the previous
// stage. exec() is used instead of shell_exec() because it exposes the exit
// status, which lets a failing stage stop the pipeline instead of silently
// feeding an empty file to the next one.
foreach ([$command1, $command2, $command3, $command4, $command5] as $stageCommand) {
    $stageOutput = [];
    $exitStatus = 0;
    exec($stageCommand, $stageOutput, $exitStatus);

    if ($exitStatus !== 0) {
        throw new \RuntimeException(
            sprintf('Filter stage exited with status %d: %s', $exitStatus, $stageCommand)
        );
    }
}
//多条awk命令组合方式
// $command = '/bin/awk -F\'"\' \'{if($6 !~ /Googlebot|Baiduspider|Sogou|bingbot|Yahoo|AhrefsBot|grapeshot|archive|seokicks|linkdex|SinaWeiboBot/) print $0}\' ' . $access . '| /bin/awk \'{if($9 == 200 || $9 == 301 || $9 == 302) print $0}\' | /bin/awk \'{if($7 ~ /content\/[0-9]+/) print $1,$7}\' > ' . $access6;