一、临时表创建技巧:
-- Demo source view: three rows, c1 = 1, 2, 3.
-- FIX: "if exists" so the script is re-runnable (a bare "drop view"
-- errors out the first time, when the view does not exist yet).
drop view if exists mytab2;
create or replace view mytab2 as
select 1 as c1 union all
select 2 as c1 union all
select 3 as c1 ;
--1. Derived table (subquery): compute c2 in the inner query so the
--   outer query can reference it when building c3.
select t.*,
       t.c1 * (t.c2 * 3) as c3
from (
    select c1,
           c1 + 2 as c2
    from mytab2
) t ;
--2. with ... as (CTEs): each step gets a readable name and can be
--   referenced multiple times downstream.
with base as (
    -- step 1: derive c2 from c1
    select c1,
           c1 + 2 as c2
    from mytab2
),
enriched as (
    -- step 2: derive c3 from c1 and c2
    select *,
           c1 * (c2 * 3) as c3
    from base
)
select *
from enriched
;
--3. Permanent views.
-- FIX: use "create or replace view" so the script is re-runnable
-- (plain "create view" fails if the view already exists), matching
-- the convention used elsewhere in this file.
create or replace view temp_view1 as
select c1,
       c1 + 2 as c2
from mytab2;
create or replace view temp_view2 as
select *,
       c1 * (c2 * 3) as c3
from temp_view1;
select * from temp_view2;
--4. Temporary views (session-scoped; Spark SQL only, not Hive).
-- FIX: "create or replace temporary view" so re-running the script in
-- the same session does not fail with "view already exists".
create or replace temporary view temp_view3 as
select c1,
       c1 + 2 as c2
from mytab2;
create or replace temporary view temp_view4 as
select *,
       c1 * (c2 * 3) as c3
from temp_view3;
select * from temp_view4;
--5. Cached tables (Spark SQL only): materialize a query result in
--   memory under a table name, valid for the current session.
-- FIX: "if exists" so the first run does not fail, and drop BOTH
-- cached tables (cache_table2 was never dropped before).
drop table if exists cache_table1;
drop table if exists cache_table2;
cache table cache_table1 as
select c1,
       c1 + 2 as c2
from mytab2;
cache table cache_table2 as
select *,
       c1 * (c2 * 3) as c3
from cache_table1;
select * from cache_table2;
-
总结:with as , 视图view,临时视图temporary view, 缓存表cache table,表 table的区别
-
1、with T1 as (查询语句1)查询语句N
- 好处:比子查询的可读性高。可以复用多次某个逻辑。
-
2、(不建议)视图view:保存一段查询语句的【逻辑】,而不是查询语句的【结果集】,【永久】有效,查询这个视图,相当于查询一个【虚拟表逻辑】,如果保存的查询逻辑复杂,这查询视图也【耗时】。
- create or replace view 这种方式创建的表不是结果集,保存的是表形成的逻辑,也就是说,用到它时,得从头开始运算得到结果,极其耗时
- 支持重新覆盖 【create or replace view view1 as】
-
3、(不建议)临时视图temporary view,只在当前会话【有效】,如果会话结束,则临时视图【销毁】。用 show tables 来查看临时的表或视图,isTemporary = true,表示是临时的。
- 类似于SparkSQL中的DataFrame.createOrReplaceTempView(临时视图名)
- hive【不支持这个语法】
- 支持重新覆盖【create or replace temporary view temp_view3 as】
-
4、(不建议)缓存表cache table :只在当前会话【有效】,将一段查询结果集缓存到【内存】,并赋予一个表名。
- 立即触发。程序结束,表消失。
- hive【不支持这个语法】
-
5、table:永久有效,保存数据结构和数据本身到磁盘。
- create table xxxx as select * from yyyy;
-
二、快速生成一个序列表
-- Option 1: repeat a dummy value N times, then number the rows.
select explode(array_repeat(0,5)) id;
-- FIX: the derived table was missing its closing ')' and the exploded
-- column had no alias, so "order by id" could not resolve.
select row_number() over(order by id) as id
from (select explode(array_repeat(0,5)) as id) t;
-- Option 2: generate the sequence directly (simplest form).
select explode(sequence(1,5)) id;
-- Option 3: stack literal values into rows (5 = number of rows).
select stack(5,1,2,3,4,5) id;
三、max和min 、greatest和least
-- max/min are AGGREGATE functions: they compare one column ACROSS rows
-- (here: per class_id group). `table` is a placeholder name; it is a
-- reserved keyword, so it is backtick-quoted.
select class_id,
       max(height),
       min(height)
from `table` group by class_id;
-- greatest/least are SCALAR functions: they compare several columns
-- WITHIN a single row.
-- FIX: a comma was missing after h3, which made "greatest(...)" parse
-- as an alias expression instead of a fourth output column.
select h1,
       h2,
       h3,
       greatest(h1,h2,h3),
       least(h1,h2,h3)
from `table` ;
四、sequence(to_date(xx),to_date(xx), interval 1 day )
-- sequence(start, stop, step) builds an array of dates; explode() turns
-- it into one row per element.
select sequence(to_date('2021-09-02'),to_date('2021-09-06'), interval 1 day ) days;
-- Fixture: marketing campaigns per brand with start/end dates
-- (dates stored as strings, cast with to_date below).
create or replace temporary view marketing as
select '华为' brand, '2018-08-04' startdate, '2018-08-05' enddate union all
select '华为' brand, '2018-08-04' startdate, '2020-12-25' enddate union all
select '小米' brand, '2018-08-15' startdate, '2018-08-20' enddate union all
select '小米' brand, '2020-01-01' startdate, '2020-01-05' enddate union all
select '苹果' brand, '2018-09-01' startdate, '2018-09-05' enddate union all
select '苹果' brand, '2018-09-03' startdate, '2018-09-06' enddate union all
select '苹果' brand, '2018-09-09' startdate, '2018-09-15' enddate;
-- temp: one row per (brand, active day); distinct collapses days that
-- overlapping campaigns of the same brand would otherwise double-count.
with temp as (
select distinct
brand,
explode(sequence(to_date(startdate),to_date(enddate),interval 1 day)) day
from marketing
)
-- Total = number of distinct active days per brand.
select
brand ,
count(1) Total
from temp
group by brand
order by count(1) ;
五、SparkSQL工具类
package util
import org.apache.commons.lang3.StringUtils
import org.apache.spark.sql.{DataFrame, SparkSession}
import scala.io.Source
object SparkUtil {
  // Load a text resource from the classpath, split it into individual SQL
  // statements on ';', and run each one via spark.sql, printing timing.
  def executeSQLFile(spark: SparkSession, filename: String): Unit = {
    // 01. Locate the resource on the classpath.
    //     FIX: fail fast with a clear message instead of an opaque NPE
    //     from Source.fromInputStream when the file is missing.
    val inputStream = this.getClass.getClassLoader.getResourceAsStream(filename)
    require(inputStream != null, s"resource not found on classpath: $filename")
    // 02. Read the whole file into one string.
    val text: String = Source.fromInputStream(inputStream).mkString
    // 03. Split into statements and execute each one.
    //     NOTE(review): a ';' inside a string literal would split a
    //     statement in half — acceptable for simple test scripts.
    text.split(";")
      // drop blank fragments (e.g. after the trailing ';')
      .filter(sql => StringUtils.isNotBlank(sql))
      .foreach(sql => {
        // statement start time (millis)
        val startTime: Long = System.currentTimeMillis()
        println(sql + ";")
        val df: DataFrame = spark.sql(sql)
        // Show results for select/show statements. Strip '--' comment lines
        // first so a leading comment does not hide the keyword.
        // FIX: compare case-insensitively so "SELECT ..." is also shown.
        val temp = sql.split("\n").filter(!_.trim.startsWith("--")).mkString.trim.toLowerCase
        if (temp.startsWith("select") || temp.startsWith("show")) {
          df.show()
        }
        // statement end time (millis)
        val endTime: Long = System.currentTimeMillis()
        // elapsed time in whole seconds
        println("耗时:"+ (endTime-startTime)/1000 + "秒")
        println("====================================")
      })
  }
}
- 测试SparkUtil工具类
- 使用时注意:
- 复制hive-site.xml到resources目录
- SparkSession代码中加上 enableHiveSupport()
package policy
import org.apache.spark.sql.SparkSession
import util.SparkUtil
object Main {
// Entry point: build a Hive-enabled local SparkSession and run the SQL
// script bundled under src/main/resources (see SparkUtil.executeSQLFile).
def main(args: Array[String]): Unit = {
val spark:SparkSession = SparkSession.builder()
// optional: connect to an external Hive metastore / spark-thriftServer
//.config("hive.metastore.uris", "thrift://node3:9083")
// optional: override where SparkSQL/Hive table data is stored
// .config("spark.sql.warehouse.dir","hdfs://node1:8020/user/hive/warehouse")
.enableHiveSupport()
.appName("policy_calc")
.master("local[*]")
.getOrCreate()
// keep console output readable: only ERROR-level Spark logs
spark.sparkContext.setLogLevel("ERROR")
SparkUtil.executeSQLFile(spark,"test.sql")
}
}
- 注意:从resource文件夹里找文件加载执行。