今天完成了大数据的测试以下是相关代码,代码没有最后的可视化展示
代码作者万事胜意k的主页 - 博客园 (cnblogs.com)
## 数据库
**创建数据库**
```sql
create database journal;
```
**切换到数据库**
```sql
use journal
```
**建立初始表**
```sql
--建立初始表
create table data(
`ip` string comment "城市",
`time` string comment "时间",
`day` string comment "天数",
`traffic` double comment "流量",
`type` string comment "类型 视频/文章",
`id` string comment "视频或者文章的id"
)
row format delimited
fields terminated by ','
lines terminated by '\n';
```
**数据导入**
```sql
--导入数据
load data local inpath '/home/hadoop/result.txt' into table data;
```
**验证**
```sql
SELECT * from data d ;
```
![image-20231013102914842](https://ysk-de-img.oss-cn-beijing.aliyuncs.com/img/202310131029952.png)
## 数据清洗
```sql
-- 数据清洗,将 10/Nov/2016:00:01:02 +0800 修改为 2016-11-10 00:01:03 形式
create table newdata as
select ip,
date_format(from_unixtime(unix_timestamp(`time`,'dd/MMM/yyyy:HH:mm:ss Z'), 'yyyy-MM-dd HH:mm:ss'),
'yyyy-MM-dd HH:mm:ss') as `time`,
day,
traffic,
type,
id
from data;
```
**统计最受欢迎的视频/文章的Top10访问次数**
```sql
--统计最受欢迎的视频/文章的Top10访问次数
CREATE TABLE top_visits AS
SELECT type, id, COUNT(*) AS visit_count
FROM newdata n
GROUP BY type, id
ORDER BY visit_count DESC
LIMIT 10;
```
**按照地市统计最受欢迎的Top10课程**
```sql
CREATE TABLE top_courses_by_city AS
SELECT ip , type, id, COUNT(*) AS visit_count
FROM newdata
GROUP BY ip, type, id
ORDER BY visit_count DESC
LIMIT 10;
```
**按照流量统计最受欢迎的Top10课程**
```sql
CREATE TABLE top_courses_by_traffic AS
SELECT type, id, SUM(traffic) AS total_traffic
FROM newdata
GROUP BY type, id
ORDER BY total_traffic DESC
LIMIT 10;
```
## 数据导入到mysql
**创建数据库**
```sql
create database journal
```
**统计最受欢迎的视频/文章的Top10访问次数**
```sql
CREATE TABLE `top_visits` (
`type` varchar(20) DEFAULT NULL,
`id` varchar(20) DEFAULT NULL,
`visit_count` int(11) DEFAULT NULL
)
```
**导出**
```shell
bin/sqoop export \
--connect jdbc:mysql://node1:3306/journal \
--username root \
--password 123456 \
--table top_visits \
--export-dir /user/hive/warehouse/journal.db/top_visits --input-fields-terminated-by '\001' \
--columns 'type,id,visit_count'
```
**按照地市统计最受欢迎的Top10课程**
```sql
CREATE TABLE `top_courses_by_city` (
`ip` VARCHAR(255),
`type` VARCHAR(255),
`id` VARCHAR(255),
`visit_count` BIGINT
);
```
**导出**
```shell
bin/sqoop export \
--connect jdbc:mysql://node1:3306/journal \
--username root \
--password 123456 \
--table top_courses_by_city \
--export-dir /user/hive/warehouse/journal.db/top_courses_by_city --input-fields-terminated-by '\001' \
--columns 'ip,type,id,visit_count'
```
**按照流量统计最受欢迎的Top10课程**
**创建表**
```sql
CREATE TABLE `top_courses_by_traffic` (
`type` VARCHAR(255),
`id` VARCHAR(255),
`total_traffic` DOUBLE
);
```
**导出**
```sql
bin/sqoop export \
--connect jdbc:mysql://node1:3306/journal \
--username root \
--password 123456 \
--table top_courses_by_traffic \
--export-dir /user/hive/warehouse/journal.db/top_courses_by_traffic --input-fields-terminated-by '\001' \
--columns 'type,id,total_traffic'
```