查询
查询
数据类型
官方文档 LanguageManual Types
复杂类型使用示例
CREATE TABLE movies(
participants ARRAY<string>,
release_dates MAP<string,timestamp>,
studio_addr STRUICT<state:string,city:string,zip:string,streetnbr:int,streetname:string,unit:string>,
complex_participants MAP<string,STRUCT<address:string,attributes MAP<string,string>>>
misc UNIONTYPE<int,string,ARRAY<double>>
);
select movie_name,
release_dates["USA"],
studio_addr.zip,
complex_participants["Leonardo Dicaprio"].attributes["fav_color"],
misc
from movies;
Array
创建包含Array类型的表
create table mobilephones (
id string,
title string,
cost float,
colors array<string>,
screen_size array<float>
);
insert into table mobilephones
select
"redminote7", "Redmi Note 7", 300,
array("white", "silver", "black"), array(float(4.5))
UNION ALL
select
"motoGplus", "Moto G Plus", 200, array("black", "gold"),
array(float(4.5), float(5.5));
select * from mobilephones;
select id, colors from mobilephones;
select id, colors[0] from mobilephones;
执行数组类型字段的分隔符并导入数据
create table mobilephones (
id string,
title string,
cost float,
colors array<string>,
screen_size array<float>
)
row format delimited fields terminated by ','
collection items terminated by '#';
load data local inpath 'mobilephones.csv'
into table mobilephones;
查询数组
SELECT name,subordinates[0] FROM employees;
collect_set (会去重)collect_list(不去重)
select cookie_id,collect_set(ad_id) as orders
from click_log
group by cookie_id;
-- 输出
cookie_id orders
11 ["ad_101","ad_104"]
22 ["ad_104","ad_102","ad_103"]
impala中不支持数组,但是可以用group_concat函数达到同样的效果
select
cookie_id,group_concat(ad_id,'|') as orders
from click_log
group by cookie_id;
LATERAL VIEW
这里catalogs是数组. 通过LATERAL VIEW 语句让数组横向展示.
select click.cookie_id,ad.catalog from click_log click
left outer join (
select ad_ikd,catalog from ad_list LATERAL VIEW OUTER expolode(catalogs) t AS catalog
) ad
on (click.ad_id = ad.ad_id);
数组排序
select ad_id,sort_array(catalogs) from ad_list;
查询数组是否包含某值
select ad_id,catalogs from ad_list where array_contains(catalogs,'catalog1');
Map
create table mobilephones (
id string,
title string,
cost float,
colors array<string>,
screen_size array<float>,
features map<string, boolean>
)
row format delimited fields terminated by ','
collection items terminated by '#'
map keys terminated by ':';
Edit the file to add features
load data local inpath 'mobilephones.csv'
into table mobilephones;
select id, features[' camera'] from mobilephones;
创建Map
-- 使用 map 函数,格式: (key1, value1, key2, value2, ...)
select map('key1',1,'key2',2)
-- 使用 str_to_map 函数,格式:str_to_map(text[, delimiter1, delimiter2])
-- Delimiter1用来分割键值对,默认是逗号','
-- Delimiter2用来区分键和值,默认为冒号':'
select str_to_map('aaa:11&bbb:22', '&', ':')
取key,value: map_keys
、map_values
使用: map
与lateral view
查看大小:size(Map<K.V>)
查看是否包含某个key:array_contains(a,'test') from t1
查找key中包含item8的行
select * from f_orders where array_contains(map_keys(items),'item8');
侧向显示items
select user_id,order_id,item,amount from f_orders LATERAL VIEW explode(items) t AS item,amount;
侧向展示,并显示在原来map中的位置,貌似只能用数组
select username, pos, ts, page_id from ts_int
lateral view posexplode(visits) t as pos, ts, page_id;
将两个字段之前相差的数字进行行转列
select t.f1
,t.start_r - pe.i as seq_no
from (select 'ABC' as f1,62971 as start_r,62937 as end_r) t
lateral view posexplode(split(space(start_r - end_r),' ')) pe as i,s
;
Struct
create table mobilephones (
id string,
title string,
cost float,
colors array<string>,
screen_size array<float>,
features map<string, boolean>,
information struct<battery:string,camera:string>
)
row format delimited fields terminated by ','
collection items terminated by '#'
map keys terminated by ':';
--Edit the file to add information
load data local inpath 'mobilephones.csv'
into table mobilephones;
select id, features, information
from mobilephones;
select id, features['camera'], information.battery
from mobilephones;
select id, features['camera'] as CameraPresent, information.battery
from mobilephones;
查询
Grouping Sets,Cube,Rollup
Grouping Sets
SELECT a, b, c, SUM(d) FROM t1 GROUP BY a, b WITH CUBE
--相当于
SELECT a, b, SUM(c) FROM t1 GROUP BY a, b
UNION ALL
SELECT a, NULL, SUM(c) FROM t1 GROUP BY a
SELECT a, b, SUM(c) FROM t1 GROUP BY a, b GROUPING SETS (a,b,())
--相当于
SELECT a, NULL, SUM(c) FROM t1 GROUP BY a
UNION ALL
SELECT NULL, b, SUM(c) FROM t1 GROUP BY b
UNION ALL
SELECT NULL, NULL, SUM(c) FROM t1
Cube
SELECT a, b, c, SUM(d) FROM t1 GROUP BY a, b WITH CUBE
--相当于
SELECT a, b, c, SUM(d) FROM t1 GROUP BY a, b, c GROUPING SETS
((a,b,c),(a,b),(b,c),(a,c),a,b,c,())
Rollup
SELECT a, b, c, SUM(d) FROM t1 GROUP BY a, b WITH ROLLUP
--相当于
SELECT a, b, c, SUM(d) FROM t1 GROUP BY a, b, c GROUPING SETS
((a,b,c),(a,b),a,())
显示函数
SHOW FUNCTIONS;
DESCRIBE FUNCTION length;
Lateral View
Lateral View通过UDTF函数作为输入,然后提供组合的查询结果。
--语法
SELECT a,b,columnAlias
FROM baseTable
LATERAL VIEW UDTF(expression) tableAlias AS columnAlias;
--例子
SELECT a,b,col1,col2
FROM baseTable
LATERAL VIEW UDTF(x )t1 AS col1
LATERAL VIEW UDTF(col1)52 AS col2;
RLIKE 语句
RLIKE可以让我们可以在Hive中使用Java正则表达式进行查询。
SELECT name,address.street
FROM employees WHERE address.street RLIKE '.*(Chicago|Ontario).*';
通过正则选择字段
SELECT语句支持通过正则表达式选择字段,用的是Java的正则语法,可以通过该网站进行正则的验证http://www.fileformat.info/tool/regex.htm,以下语句表示选择除ds和hr之外的所有字段。
SELECT `(ds|hr)?+.+` FROM sales
提示
实际应用中不建议使用,维护和理解都会有问题。
JOIN
HIVE的JOIN类型有很多,Inner Join、LEFT OUTER Join、RIGHT OUTER Join、FULL OUTER Join、LEFT SEMI Join、Cartesian Product Join、Map-side Join。挑几个说下。
LEFT SEMI JOIN
返回符合ON谓词条件的左边表的记录。比Inner JOIN效率高
SELECT a.val FROM a LEFT SEMI JOIN b ON (a.key = b.key);
--与下面语句等价
SELECT a.val FROM a WHERE a.key IN (SELECT b.key FROM b) - Not Supported
SELECT a.val FROM a WHERE EXISTS (SELECT 1 FROM b WHERE b.key = a.key) - Not Supported
Cartesian Product Join
左右两边笛卡尔积
SELECT * FROM CUSTOMERS JOIN ORDERS;
Map-side Join
根据mapjoin的计算原理,MAPJION会把小表全部读入内存中,在map阶段直接拿另外一个表的数据和内存中表数据做匹配。
SELECT /*+MAPJOIN(o)*/ c.ID,c.NAME,o.AMOUNT
FROM CUSTOMERS c JOIN ORDERS o
ON (c.ID=o.CUSTOMER_ID)
其他
Job合并文件
set hive.merge.smallfiles.avgsize=256000000;当输出平均大小小于该值,启用新的job合并文件。
set hive.merge.size.per.task = 64000000;合并后的文件大小。
限制用户查询
设置以下选项用户查询数据的时候必须指定分区和返回数量
配置文件中
<property>
<name>hive.mapred.mode</name>
<value>strict</value>
</property>
或者
set hive.mapred.mode=strict;
压缩
设置压缩
-- SEQUENCEFILE
set hive.exec.compress.output=true;
set mapred.output.compress=true;
set mapred.output.compression.codec=org.apache.hadoop.io.compress.SnappyCodec;
set mapred.output.compression.type=BLOCK;
select count(1) from odl_order_eb_snappy;
-- ORC
set hive.exec.orc.default.compress=SNAPPY
或者
<property>
<name>hive.exec.compress.output</name>
<value>true</value>
</property>
<property>
<name>mapred.output.compression.codec</name>
<value>org.apache.hadoop.io.compress.SnappyCodec</value>
</property>
<property>
<name>mapred.output.compression.type</name>
<value>BLOCK</value>
</property>