#大数据处理与应用hive (Big-data processing with Hive — setup and analysis notes)
#Edit the cloud host's /etc/hosts: map the internal IP to the hostname
#hadoop000 so the host can ssh to itself as root via hadoop000 without a password.
vi /etc/hosts
172.18.39.103 hadoop000
hostnamectl set-hostname hadoop000
bash
#Passwordless ssh check (assumes root's key pair is already generated and in
#authorized_keys — TODO confirm, no ssh-keygen/ssh-copy-id step is shown here)
ssh hadoop000
exit
#2. Format the HDFS filesystem ########################################################
hdfs namenode -format
#3. Start the Hadoop cluster ############################################################
#Start only the NameNode daemon on this host:
hadoop-daemon.sh start namenode
#Or one-shot start/stop of all daemons (HDFS + YARN) via the bundled script:
start-all.sh
#4. Start the MySQL service ###############################################################
#MySQL backs the Hive metastore:
systemctl start mysqld.service
#5. Initialize the Hive metastore schema (database type mysql), then enter the
#   Hive client and create a `hive` database
schematool -dbType mysql -initSchema
#Create the `hive` database (a Hive warehouse database; distinct from the MySQL
#metastore schema that schematool just created):
create database hive;
use hive;
#数据分析#################################
########################################
#1. Create the project database ###################################
#Fix: the original issued CREATE DATABASE twice; the first form (without
#IF NOT EXISTS) fails on any re-run once the database exists. Keep only the
#idempotent form.
create database if not exists project;
#2. Create the theft table under the project database; fields per the task
#   description, comma-delimited text rows.
USE project;
CREATE TABLE IF NOT EXISTS project.theft (
    id              STRING,
    case_type       STRING,
    case_subtype    STRING,
    casename        STRING,
    loss            STRING,  -- amount with a trailing '元' unit, e.g. '500元' — verify against data
    case_source     STRING,
    time_toplimit   STRING,
    time_lowerlimit STRING,
    address         STRING,
    accept_time     STRING,
    report_time     STRING
)
ROW FORMAT DELIMITED FIELDS TERMINATED BY ',';
SHOW TABLES;
#3. Load the host-local CSV /root/college/theft.csv into the managed table theft
LOAD DATA LOCAL INPATH '/root/college/theft.csv' INTO TABLE project.theft;
#4. Total number of cases reported in May 2021 (by report time); result is
#   written to /root/theft/result01/000000_0 on the cloud host.
#   NOTE(review): this filter assumes a zero-padded month ('2021年05月') while
#   later queries in these notes use '2021年4月' — confirm the data's format.
INSERT OVERWRITE LOCAL DIRECTORY '/root/theft/result01'
ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t'
SELECT COUNT(report_time) AS num
FROM project.theft
WHERE substr(report_time, 1, 8) = '2021年05月';
#5. Total economic loss for April 2021 (by report time); result is written to
#   /root/theft/result02/000000_0 on the cloud host.
#   Fixes vs original:
#   - substr(report_time,1,8) yields 8 characters but the literal '2021年4月'
#     has 7, so the predicate could never match; use length 7.
#   - split(loss,'元')[0] is a string; cast it explicitly to DECIMAL instead of
#     relying on implicit string->double conversion for a money amount.
insert overwrite local directory '/root/theft/result02'
row format delimited fields terminated by '\t'
select sum(cast(split(loss,'元')[0] as decimal(18,2))) sum_loss from project.theft
where substr(report_time,1,7)='2021年4月';
#Alternative for task 5: strip the trailing '元' by position instead of split.
#Fixes vs original:
#- add LOCAL: the task targets the cloud host path /root/theft/result02, and
#  without LOCAL the directory is interpreted as an HDFS path;
#- substr(report_time,1,8) (8 chars) vs 7-char literal could never match — use 7;
#- DECIMAL instead of FLOAT for a money amount.
INSERT OVERWRITE LOCAL DIRECTORY '/root/theft/result02'
ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t'
SELECT SUM(CAST(SUBSTR(loss, 1, LENGTH(loss) - 1) AS DECIMAL(18,2))) as total_loss
FROM project.theft
WHERE substr(report_time,1,7)='2021年4月';
#6. Region with the highest case frequency, plus that frequency; result is
#   written to /root/theft/result03/000000_0 on the cloud host.
#   Fixes vs original:
#   - the theft table declares no `district` column — the region lives in
#     `address`, so the original query would fail to compile;
#   - add LOCAL and use the /root/theft/result03 path the task requires,
#     instead of the HDFS path /user/hadoop/result03.
INSERT OVERWRITE LOCAL DIRECTORY '/root/theft/result03'
ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t'
SELECT address, count(*) as freq
FROM project.theft
GROUP BY address
ORDER BY freq DESC
LIMIT 1;
#7. Case subtype with the smallest total economic loss; result is written to
#   /root/theft/result04/000000_0 on the cloud host (this query was missing
#   from the original notes).
insert overwrite local directory '/root/theft/result04'
row format delimited fields terminated by '\t'
select case_subtype,sum(cast(split(loss,'元')[0] as decimal(18,2))) sum_loss
from project.theft
group by case_subtype
order by sum_loss asc limit 1;
#8. Top-3 case subtypes by total loss among cases in A市C区; result is written
#   to /root/theft/result05/000000_0 on the cloud host.
#   Typo fixes vs original: '1ocal' -> 'local' and '1imit' -> 'limit' (digit
#   one instead of letter l — both made the statement unparsable); loss is
#   cast to DECIMAL before summing, consistent with the other loss queries.
insert overwrite local directory '/root/theft/result05'
row format delimited fields terminated by '\t'
select case_subtype,sum(cast(split(loss,'元')[0] as decimal(18,2))) sum_loss
from project.theft
where address='A市C区'
group by case_subtype
order by sum_loss desc limit 3;
#9. Number of cases whose subtype is '盗窃居民小区车辆' (theft of vehicles in
#   residential areas); result is written to /root/theft/result06/000000_0 on
#   the cloud host.
INSERT OVERWRITE LOCAL DIRECTORY '/root/theft/result06'
ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t'
SELECT COUNT(case_subtype) AS num
FROM project.theft
WHERE case_subtype = '盗窃居民小区车辆';
#10. Case subtype with the largest total economic loss in June 2021; result is
#    written to /root/theft/result07/000000_0 on the cloud host (this query was
#    missing from the original notes).
#    NOTE(review): assumes report_time months are not zero-padded ('2021年6月'),
#    matching the other month filters in these notes — confirm against the data.
insert overwrite local directory '/root/theft/result07'
row format delimited fields terminated by '\t'
select case_subtype,sum(cast(split(loss,'元')[0] as decimal(18,2))) sum_loss
from project.theft
where substr(report_time,1,7)='2021年6月'
group by case_subtype
order by sum_loss desc limit 1;
#Reference: https://blog.csdn.net/lulu001128/article/details/131031727
#标签:数据分析,loss,string,project,官方,theft,time,root
#From: https://www.cnblogs.com/Cuckoo123456/p/17470504.html