一、PV统计(页面访问量)
(1) 基本概念
通常是衡量一个网络新闻频道或网站甚至一条网络新闻的主要指标。网页浏览数是评价网站流量最常用的指标之一,简称为PV。监测网站PV的变化趋势和分析其变化原因是很多站长定期要做的工作。 Page Views中的Page一般是指普通的html网页,也包含php、jsp等动态产生的html内容。来自浏览器的一次html内容请求会被看作一个PV,逐渐累计成为PV总数。
(2) 计算方法
用户每1次对网站中的每个网页访问均被记录1次。用户对同一页面的多次访问,访问量累计。
(3) 统计分析
--1.创建数据库
create database jfyun;
use jfyun;
--2. 创建用户访问记录表,先预先创建好分区表
create external table data_collect(
accessDate string,
accessHour int,
requestMethod string,
referUrl string,
requestProtocal string,
returnStatus string,
requestUrl string,
referDomain string,
userOrigin string,
originWord string,
browser string,
browserVersion string,
operateSystem string,
requestIp string,
ipNumber int,
userProvince string,
screenSize string,
screenColor string,
pageTitle string,
siteType string,
userFlag string,
visitFlag string,
sFlag string,
timeOnPage int
)partitioned by (access_day string)
row format delimited
fields terminated by '\t'
location '/user/hadoop/external/jfpc/output';
--3. 为表创建分区(先创建分区,然后再使用分区即往分区中加载数据
alter table data_collect add partition(access_day='20150705' );
alter table data_collect add partition(access_day='20150706' );
--4. 执行mapreduce程序存放数据/加载数据到分区表
hadoop jar jfyun.jar com.yun.job.AccessLogEnhanceImportHDFS external/jfpc/input/20150705/130/clickdata-2015070500.log external/jfpc/output/access_day=20150705
hadoop jar jfyun.jar com.yun.job.AccessLogEnhanceImportHDFS external/jfpc/input/20150705/131/clickdata-2015070500.log external/jfpc/output/access_day=20150705
hadoop jar jfyun.jar com.yun.job.AccessLogEnhanceImportHDFS external/jfpc/input/20150705/130/clickdata-2015070501.log external/jfpc/output/access_day=20150705
hadoop jar jfyun.jar com.yun.job.AccessLogEnhanceImportHDFS external/jfpc/input/20150705/131/clickdata-2015070501.log external/jfpc/output/access_day=20150705
hadoop jar jfyun.jar com.yun.job.AccessLogEnhanceImportHDFS external/jfpc/input/20150706 external/jfpc/output/access_day=20150706
--5. 显示表的分区
show partitions data_collect;
--6. 根据分区条件查看分区数据
select * from data_collect where access_day='20150705';
select * from data_collect where access_day='20150706';
--7. 通过hive分析pv数据
--7.1. pv 按天统计
select substr (accessdate,1,8) ,count(1) from data_collect where access_day='20150706' group by substr(accessdate,1,8);
--7.2. pv 按小时统计,插入到指定的表
select accesshour,count (1) staCount from data_collect where access_day='20150706' group by accesshour;
--7.3 每天每个省份PV
select substr (accessdate,1,8),userprovince,count(1) from data_collect where access_day='20150706' group by substr (accessdate,1,8),userprovince
--7.4 每天每个省份 每个小时统计
select substr(accessdate,1,8),userprovince,accesshour,count(1) from data_collect where access_day='20150706' group by substr(accessdate,1,8),userprovince,accesshour
二、UV统计(独立访客)
(1) 基本概念
独立IP:是指独立用户/独立访客。指访问某个站点或点击某条新闻的不同IP地址的人数
(2) 计算方法
在同一天的00:00-24:00内,独立IP只记录第一次进入网站的具有独立IP的访问者,可以通过设置cookie,记录第一次访问设置新用户,后续为老用户
(3) 统计分析
项目需求:
(1)用户访问电商网站,通过js插码的方式收集用户行为日志,然后通过mapreduce程序把用户日志统计到hbase中,按照UV表进行计算。
(2)入库后的数据要进行统计分析
(3)用户日志格式(模拟的数据)
"06/Jul/2015:00:01:04 +0800" "GET" "http%3A//jf.10086.cn/m/" "HTTP/1.1" "200" "http://jf.10086.cn/m/subject/100000000000009_0.html" "Mozilla/5.0 (Linux; U; Android 4.4.2; zh-cn; Lenovo A3800-d Build/LenovoA3800-d) AppleWebKit/533.1 (KHTML, like Gecko)Version/4.0 MQQBrowser/5.4 TBS/025438 Mobile Safari/533.1 MicroMessenger/6.2.0.70_r1180778.561 NetType/cmnet Language/zh_CN" "10.139.198.176" "480x854" "24" "%u5927%u7C7B%u5217%u8868%u9875_%u4E2D%u56FD%u79FB%u52A8%u79EF%u5206%u5546%u57CE" "0" "3037487029517069460000" "3037487029517069460000" "1" "75"
"06/Jul/2015:01:01:04 +0800" "GET" "http%3A//jf.10086.cn/portal/ware/web/SearchWareAction%3Faction%3DsearchWareInfo%26pager.offset%3D144" "HTTP/1.1" "200" "http://jf.10086.cn/portal/ware/web/SearchWareAction?action=searchWareInfo&pager.offset=156" "Mozilla/5.0 (Linux; U; Android 4.4.2; zh-CN; HUAWEI MT2-L01 Build/HuaweiMT2-L01) AppleWebKit/534.30 (KHTML, like Gecko) Version/4.0 UCBrowser/10.5.2.598 U3/0.8.0 Mobile Safari/534.30" "223.73.104.224" "720x1208" "32" "%u641C%u7D22_%u4E2D%u56FD%u79FB%u52A8%u79EF%u5206%u5546%u57CE" "0" "3046252153674140570000" "3046252153674140570000" "1" "2699"
"06/Jul/2015:02:01:04 +0800" "GET" "" "HTTP/1.1" "200" "http://jf.10086.cn/" "Mozilla/5.0 (Linux; Android 4.4.4; vivo Y13L Build/KTU84P) AppleWebKit/537.36 (KHTML, like Gecko) Version/4.0 Chrome/33.0.0.0 Mobile Safari/537.36 baiduboxapp/5.1 (Baidu; P1 4.4.4)" "10.154.210.240" "480x855" "32" "%u9996%u9875_%u4E2D%u56FD%u79FB%u52A8%u79EF%u5206%u5546%u57CE" "0" "3098781670304015290000" "3098781670304015290000" "0" "831"
"06/Jul/2015:03:01:07 +0800" "GET" "http%3A//wx.10086.cn/wechat-website/wechatwebsite/AccumulatePoints" "HTTP/1.1" "200" "http://jf.10086.cn/m/" "Mozilla/5.0 (Linux; U; Android 4.4.2; zh-cn; Lenovo A3800-d Build/LenovoA3800-d) AppleWebKit/533.1 (KHTML, like Gecko)Version/4.0 MQQBrowser/5.4 TBS/025438 Mobile Safari/533.1 MicroMessenger/6.2.0.70_r1180778.561 NetType/cmnet Language/zh_CN" "10.139.198.176" "480x854" "24" "%u9996%u9875_%u4E2D%u56FD%u79FB%u52A8%u79EF%u5206%u5546%u57CE" "0" "3037487029517069460000" "3037487029517069460000" "1" "135"
(4)数据来源,可以参考下面的网站
技术方案:
(1) 编写Mapreduce,读取每行数据然后保存HBase
(2) 让Hive操作HBase表的数据
(3) Hive统计分析HBase表的数据,分析用户访客行为
1、创建表
create 'UserVisitInfo' , {NAME => 'info'}
2、导入hbase
hadoop jar jfyun.jar com.yun.job.AccessLogImportHBase external/jfpc/input/20150705/130/clickdata-2015070500.log
hadoop jar jfyun.jar com.yun.job.AccessLogImportHBase external/jfpc/input/20150705/131/clickdata-2015070500.log
hadoop jar jfyun.jar com.yun.job.AccessLogImportHBase external/jfpc/input/20150705/130/clickdata-2015070501.log
hadoop jar jfyun.jar com.yun.job.AccessLogImportHBase external/jfpc/input/20150705/131/clickdata-2015070501.log
hadoop jar jfyun.jar com.yun.job.AccessLogImportHBase external/jfpc/input/20150706
3、查看hbase中的数据
3.1 全表查看
scan 'UserVisitInfo'
3.2 根据rowkey查看
hbase(main):012:0> get 'UserVisitInfo','20150706_3037487029517069460000'
COLUMN CELL
info:FirstAccessUrl timestamp=1443000064923, value=/m/subject/100000000000009_0.html
info:browser timestamp=1443000064923, value=Safari
info:browserVersion timestamp=1443000064923, value=533.1
info:firstAccessTime timestamp=1443000064923, value=20150706000104
info:operateSystem timestamp=1443000064923, value=linux
info:recentAccessTime timestamp=1443000065001, value=20150706030107
info:recentAccessUrl timestamp=1443000065001, value=/m/
info:screenColor timestamp=1443000064923, value=24
info:screenSize timestamp=1443000064923, value=480x854
info:siteType timestamp=1443000064923, value=0
info:userFlag timestamp=1443000064923, value=3037487029517069460000
info:userProvince timestamp=1443000064923, value=999
info:userVisitId timestamp=1443000064923, value=20150706_3037487029517069460000
info:visitCount timestamp=1443000065001, value=2
info:visitDay timestamp=1443000064923, value=20150706
info:visitFlag timestamp=1443000064923, value=3037487029517069460000
info:visitHour timestamp=1443000064923, value=0
info:visitIp timestamp=1443000064923, value=10.139.198.176
info:visitKeepTime timestamp=1443000065001, value=10803
4、统计hive分析hbase表的数据
4.1 创建HBase表,将数据添加到HBase表中
UserVisitInfo
4.2 创建HBase表映射的Hive表
(1) 创建表
CREATE external TABLE User_Visit_Info
(
userVisitId string,
FirstAccessUrl string,
browserVersion string,
firstAccessTime string ,
operateSystem string,
recentAccessTime string,
recentAccessUrl string,
screenColor string,
screenSize string,
siteType string,
userFlag string,
userProvince string,
visitCount string,
visitDay string,
visitFlag string,
visitHour string,
visitIp string,
visitKeepTime string
)
STORED BY 'org.apache.hadoop.hive.hbase.HBaseStorageHandler'
WITH SERDEPROPERTIES ("hbase.columns.mapping" = ":key,
info:FirstAccessUrl,info:browserVersion,info:firstAccessTime,info:operateSystem,
info:recentAccessTime,info:recentAccessUrl,info:screenColor,info:screenSize,info:siteType,
info:userFlag,info:userProvince,info:visitCount,info:visitDay,info:visitFlag,info:visitHour,
info:visitIp,info:visitKeepTime
")
TBLPROPERTIES ("hbase.table.name" = "UserVisitInfo");
4.3 使用Hive统计分析