【大数据】HIVE大全-526互联

HIVE 内部表与外部表的区别


# HIVE储存数据位置
[hadoop@test1 hive]$ hadoop fs -ls /user/hive/warehouse/testhive.db/info/
Found 1 items
-rw-r--r--   3 hadoop supergroup       ...   /user/hive/warehouse/testhive.db/info/000000_0

# 内部表-创建:  create table ......
# 外部表-创建:  create external table ......

# 内部表-存储位置:    Hive管理，默认/user/hive/warehouse
# 外部表-存储位置:    随意，location关键字指定

# 内部表-删除数据:    删除元数据（表信息）, 删除数据
# 外部表-存储位置:    删除元数据（表信息），保留数据

# 内部表-理念:        Hive管理表持久使用，删除时会全部一起删除
# 外部表-理念:        临时链接外部数据用，删除时会删除元数据与表结构，但是不会删除实际数据

# 总的来说，内部表适合在 Hive 中进行数据管理和查询操作，而外部表适合在 Hive 中进行数据分析和集成外部数据。外部表通常用于引入和处理外部数据源，而内部表通常用于存储经过处理的数据。

HIVE 内/外表的命令区别


# 内部表命令
create database testHive;
use testHive;
create table testHive.info(id int,name string);
insert into testHive.info values (1,'周杰伦'),(2,'林俊杰');
select * from testHive.info;


# 外部表命令
# 先创建外部表，然后移动数据到LOCATION目录
hadoop fs -ls /tmp						        # 确认不存在/tmp/test_ext1目录
create external table test_ext1(id int, name string) row format delimited fields terminated by '\t' location '/tmp/test_ext1';   					 # 创建表
select * from test_ext1; 						 # 空结果，无数据
hadoop fs -put test_external.txt /tmp/test_ext1/   # 上传数据： 
select * from test_ext1                            # 即可看到数据结果
# 演示先存在数据，后创建外部表
hadoop fs -mkdir /tmp/test_ext2
hadoop fs -put test_external.txt /tmp/test_ext2/
create external table test_ext2(id int, name string) row format delimited fields terminated by ‘\t’ location ‘/tmp/test_ext2’;
select * from test_ext2;

HIVE 数据导入


# 使用local，数据不在HDFS，需使用file://协议指定路径。 不使用local，数据在HDFS，可以使用HDFS://协议指定路径
# 使用OVERWRITE进行覆盖数据  不使用OVERWRITE则不覆盖数据
load data local inpath '本地文件路径' overwrite into table 表名
# <表名>是要导入数据的目标表名。如果使用OVERWRITE参数，将会覆盖目标表中的数据。

# 示例命令：
load data local inpath '/home/hadoop/search_log.txt' into table myhive.test_load;
load data inpath '/tmp/search_log.txt' overwrite into table myhive.test_load;

# 将查出的书 追加INTO 或者覆盖OVERWRITE 到指定的表里面
insert into table tbl1 select * from tbl2;
insert overwrite table tbl1 select * from tbl2;


数据在本地:
		推荐 load data local加载
数据在HDFS:
		如果不保留原始文件:推荐使用LOAD方式直接加载
		如果保留原始文件:推荐使用外部表先关联数据，然后通过INSERT SELECT 外部表的形式加载数据
数据已经在表中:
		只可以insert select

# 推荐选择加载数据到内部表内

HIVE 数据导出


# 保存
# 将查询的结果导出到本地 - 使用默认列分隔符
insert overwrite local directory '/home/hadoop/export1' select * from test_load;

# 将查询的结果导出到本地 - 指定列分隔符
insert overwrite local directory '/home/hadoop/export2' row format delimited fields terminated by '\t' select * from test_load;

# 将查询的结果导出到HDFS上(不带local关键字)
insert overwrite directory '/tmp/export' row format delimited fields terminated by '\t' select * from test_load;


# hive表数据导出 - hive shell：hive -f/-e 执行语句或者脚本 > file）
bin/hive -e "select * from myhive.test_load;" > /home/hadoop/export3/export4.txt
bin/hive -f export.sql > /home/hadoop/export4/export4.txt

HIVE 分区表

-- 创建分区 多分区的时候使用逗号隔开partitioned by (year string, month string)
create table testhive.score(id string, cid string, score int)  partitioned by (month string)
row format delimited fields terminated by '\t';

-- 加载数据   partition(month='202005')：将这段加载的数据全部标志成month='202005'
-- 多分区的时候使用逗号隔开    partition(year='2023',month='202005');
load data local inpath '/home/hadoop/score.txt' overwrite into table testHive.score partition(month='202005');
周杰轮,语文,99,202005
周杰轮,数学,66,202005
周杰轮,英语,88,202005
林均街,语文,66,202005
林均街,数学,77,202005
...

HIVE 分桶表

# 开启分桶自动优化  自动匹配reduce task数量和桶数量一致
SET hive.enforce.bucketing=true;

-- 创建分桶表
create table course(c_id string,c_name string,t_id string) clustered by (c_id) into 3 buckets
row format delimited  fields terminated by '\t';


-- 加载数据 只能走insert select 需要创建临时表导入数据后 查询数据后导入
-- 1：创建临时表
create table testhive.lst(c_id string,c_name string,t_id string) row format delimited  fields terminated by '\t';
-- 2: 加载数据到临时表
load data local inpath '/home/hadoop/course.txt' into table testhive.lst;
-- 3: insert select 向分桶表里面加载数据   cluster不需要带ed
insert overwrite table testHive.course select * from testHive.lst cluster by (c_id);


# 查看分桶数量
[hadoop@test1 ~]$ hadoop fs -ls /user/hive/warehouse/testhive.db/course
Found 3 items
-rw-r--r--   3 hadoop supergroup      /user/hive/warehouse/testhive.db/course/000000_0
-rw-r--r--   3 hadoop supergroup      /user/hive/warehouse/testhive.db/course/000001_0
-rw-r--r--   3 hadoop supergroup      /user/hive/warehouse/testhive.db/course/000002_0


# 分桶时为什么不能直接使用load数据加载
需要insert select触发MapReduce进行hash取模计算，来基于分桶列的值，确定哪一条数据进入到哪一个桶文件中
（哈希加密后除分桶数量取余分配），同样key (分桶列的值)的数据，会在同一个桶中。

# 好处：
单值过滤
join
group by

HIVE 基本命令

# T为自定义数据表

# 表重命名
alter table T1 rename to T2;

# 修改表属性值
alter table T set tblproperties table_properties;

# 查看表类型：
desc formatted T;

# 内部表转外部表
alter table T set tblproperties('EXTERNAL'='TRUE');

# 外部表转内部表
alter table T set tblproperties('EXTERNAL'='FALSE');

# 添加分区 新分区是空的没数据，需要手动添加或上传数据文件
alter table T add partition(month='201101');

# 修改分区值
alter table T partition(month='202005') rename to partition (month='201105');

# 删除分区
alter table T drop partition(month='201105');

# 添加列
alter table T add columns (v1 int, v2 string);

# 修改列名            当前名称        新列名        数据类型
alter table T change column_name  new_column_name column_type;
alter table T change v1   v1New  int;

# 删除表
drop table T;

# 清空表   只可以清空内部表
truncate table T;

HIVE Array类型

# 建表语句
create table testhive.test_array(name string, work_locations array<string>)
row format delimited fields terminated by '\t'
COLLECTION ITEMS TERMINATED BY ',';       # 数组以','相隔开

-- 导入本地数据
load data local inpath '/home/hadoop/data_for_array_type.txt' into table testHive.test_array;

-- 查询所有数据
select * from testhive.test_array;

-- 查询loction数组中第一个元素
select name, work_locations[0] location from testhive.test_array;

-- 查询location数组中元素的个数
select name, size(work_locations) location from testhive.test_array;

-- 查询location数组中包含tianjin的信息
select * from testhive.test_array where array_contains(work_locations,'tianjin');

HIVE Map类型

# 建表语句      k-v分隔符 ':'       字段分隔符 '#'
create table testhive.test_map(id int, name string, members map<string,string>, age int)
row format delimited fields terminated by ','
COLLECTION ITEMS TERMINATED BY '#'    
MAP KEYS TERMINATED BY ':';

# 导入数据
load data local inpath '/home/hadoop/data_for_map_type.txt' into table testhive.test_map;

# 查询全部
select * from testhive.test_map;

# 查询所有列        表   ['key 名'] 自定义列名
select id, name, members['father'] father, members['mother'] mother, age from testhive.test_map;
id,name,father,mother,age
1,林杰均,林大明,小甜甜,28
2,周杰伦,马小云,黄大奕,22

# 查询全部map的key，使用map_keys函数，结果是array类型
select id, name, map_keys(members) as relation from testhive.test_map;
1,林杰均,"[""father"",""mother"",""brother""]"
2,周杰伦,"[""father"",""mother"",""brother""]"


# 查询全部map的value，使用map_values函数，结果是array类型
select id, name, map_values(members) as relation from testhive.test_map;
1,林杰均,"[""林大明"",""小甜甜"",""小甜""]"
2,周杰伦,"[""马小云"",""黄大奕"",""小天""]"

# 查询map类型的KV对数量
select id,name,size(members) num from testhive.test_map;

# 查询map的key中有brother的数据  精确匹配
select * from testhive.test_map where array_contains(map_keys(members), 'brother');
select * from testhive.test_map where array_contains(map_values(members), '马小云');

# 模糊查询
select * from testhive.test_map where map_values(members) like '%马%';

HIVE Struct类型

# 建表语句 字段分隔符'#'
create table testhive.test_struct(id string, info struct<name:string, age:int>)
row format delimited fields terminated by '#'
COLLECTION ITEMS TERMINATED BY ':';

load data local inpath '/home/hadoop/data_for_struct_type.txt' into table testHive.test_struct;

select * from test_struct;

# 直接使用列名.字段名  即可从struct中取出子列查询
select id,info.name from test_struct;

HIVE Sampling采样（数据抽样）

# 随机分桶抽样
select ... from tbl tablesample(bucket x out of y on(colname | rand()))

y表示将表数据随机划分成y份（y个桶）
x表示从y里面随机抽取x份数据作为取样
colname表示随机的依据基于某个列的值
rand()表示随机的依据基于整行

# 示例：
# 按username使用哈希分成10个桶，每个桶抽取1个拼接到一起
SELECT username, orderId, totalmoney FROM itheima.orders TABLESAMPLE(BUCKET 1 OUT OF 10 ON username);

# 按照随机字段进行分分成10个桶，每个桶抽取1个拼接到一起
SELECT * FROM itheima.orders TABLESAMPLE(BUCKET 1 OUT OF 10 ON rand());



# 数据块抽样
select ... from tbl tablesample(num rows | num percent | num(k|m|g));
num ROWS 表示抽样num条数据
num PERCENT 表示抽样num百分百比例的数据
num(K|M|G) 表示抽取num大小的数据，单位可以是K、M、G表示KB、MB、GB

# 示例
select * from orders tablesample ( 100 rows )

HIVE 虚拟列

# 虚拟列是Hive内置的可以在查询语句中使用的特殊标记，可以查询数据本身的详细参数。
Hive目前可用3个虚拟列：
    INPUT__FILE__NAME，显示数据行所在的具体文件
    BLOCK__OFFSET__INSIDE__FILE，显示数据行所在文件的偏移量
    ROW__OFFSET__INSIDE__BLOCK，显示数据所在HDFS块的偏移量
    此虚拟列需要设置：SET hive.exec.rowoffset=true 才可使用

# 示例：
SELECT *, INPUT__FILE__NAME, BLOCK__OFFSET__INSIDE__FILE, ROW__OFFSET__INSIDE__BLOCK FROM testhive.course;

HIVE 常用内置函数

# 使用 show functions 查看当下可用的所有函数；

# 通过 describe function extended funcname 来查看函数的使用方式。


--取整函数: round  返回double类型的整数值部分 （遵循四舍五入）
select round(3.1415926);

--指定精度取整函数: round(double a, int d) 返回指定精度d的double类型
select round(3.1415926,4);

--取随机数函数: rand 每次执行都不一样 返回一个0到1范围内的随机数
select rand();

--指定种子取随机数函数: rand(int seed) 得到一个稳定的随机数序列
select rand(3);

--求数字的绝对值
select abs(-3);

--得到pi值（小数点后15位精度）
select pi();

-- 集合函数：
    size(array/map)：返回数组或map的元素个数。
    array(value1, value2, ...)：创建一个数组。
    map(key1, value1, key2, value2, ...)：创建一个map。
    
-- 数组函数：
    array_contains(array, value)：检查数组中是否包含指定的元素。
    array_length(array)：返回数组的长度。
    array_sort(array)：对数组进行排序。
    array_max(array)：返回数组中的最大值。
    array_min(array)：返回数组中的最小值。
    array_distinct(array)：返回数组中的唯一值。
    array_join(array, delimiter)：将数组元素连接为一个字符串。
    
-- map函数：
    map_keys(map)：返回map中的所有键。
    map_values(map)：返回map中的所有值。
    map_keys_sorted(map)：返回按键排序的map。
    map_values_sorted(map)：返回按值排序的map。
    map_contains_key(map, key)：检查map中是否包含指定的键。
    
-- 集合操作函数：
    union(array1, array2)：返回两个数组的并集。
    intersect(array1, array2)：返回两个数组的交集。
    difference(array1, array2)：返回两个数组的差集。
    

-------- 类型转换函数 -------

to string:
    cast(value as string)：将值转换为字符串类型。
    string(value)：将值转换为字符串类型。
    concat_ws(separator, value1, value2, ...)：将多个值按指定分隔符连接为一个字符串。
    
to numeric:
    cast(value as int)：将值转换为整数类型。
    cast(value as bigint)：将值转换为长整数类型。
    cast(value as float)：将值转换为浮点数类型。
    cast(value as double)：将值转换为双精度浮点数类型。
    
to boolean:
    cast(value as boolean)：将值转换为布尔类型。
    
to date/time:
    cast(value as date)：将值转换为日期类型。
    cast(value as timestamp)：将值转换为时间戳类型。
    
to collection:
    array(value1, value2, ...)：将多个值转换为数组类型。
    map(key1, value1, key2, value2, ...)：将多个键值对转换为map类型。
    
to complex types:
	struct(value1, value2, ...)：将多个值转换为结构类型。	

-------- 日期函数 -------

current_date(): 返回当前日期，格式为yyyy-MM-dd。

current_timestamp(): 返回当前时间戳，格式为yyyy-MM-dd HH:mm:ss.SSS。

year(date): 返回给定日期的年份。

month(date): 返回给定日期的月份（1-12）。

day(date): 返回给定日期的天数（1-31）。

hour(timestamp): 返回给定时间戳的小时数（0-23）。

minute(timestamp): 返回给定时间戳的分钟数（0-59）。

second(timestamp): 返回给定时间戳的秒数（0-59）。

date_format(date, pattern): 使用指定的模式格式化日期。常见的模式包括"yyyy-MM-dd"（年-月-日），"MM/dd/yyyy"（月/日/年）等。

from_unixtime(unixtime, pattern): 将Unix时间戳转换为指定格式的日期。

unix_timestamp(): 返回当前时间的Unix时间戳。

datediff(end_date, start_date): 返回两个日期之间的天数差。

add_months(start_date, num_months): 返回在给定日期上增加指定月数后的日期。

trunc(date, format): 根据指定的格式截断日期。常见的格式包括"YYYY"（年），"MM"（月），"DD"（日）等。

date_add(start_date, num_days): 返回在给定日期上增加指定天数后的日期。

date_sub(start_date, num_days): 返回在给定日期上减去指定天数后的日期。


------   条件函数  -------   

if(condition, value_if_true, value_if_false): 如果条件为真，则返回value_if_true；否则返回value_if_false。

case expr when value1 then result1 when value2 then result2 ... else default_result end: 根据表达式的值匹配多个可能的结果，并返回匹配的结果。如果没有匹配项，则返回默认结果。

coalesce(value1, value2, ...): 返回第一个非空值。如果所有值都为空，则返回NULL。

isnull(value): 如果值为空，则返回true；否则返回false。

isnotnull(value): 如果值不为空，则返回true；否则返回false。

nullif(expr1, expr2): 如果expr1等于expr2，则返回NULL；否则返回expr1。


------   字符串函数  -------

length(string): 返回字符串的长度。

lower(string)/upper(string): 将字符串转换为小写/大写。

trim(string): 去除字符串两端的空格。

substring(string, start, length)/substr(string, start, length): 返回从指定位置开始的子字符串。

concat(string1, string2, ...): 连接多个字符串。

replace(string, search, replace): 将字符串中的指定子字符串替换为新字符串。

split(string, delimiter): 将字符串按指定分隔符拆分为数组。

regexp_extract(string, pattern, index): 从字符串中提取符合正则表达式的指定位置的子字符串。

locate(substr, string)/instr(string, substr): 返回子字符串在字符串中第一次出现的位置。

lpad/rpad(string, length, pad): 在字符串的左侧/右侧填充指定字符，使其达到指定长度。


------ 脱敏函数 --------

mask(string, n): 将字符串的前n个字符替换为*号。

hash(string): 对字符串进行哈希处理，生成不可逆的摘要。

sha1(string)/md5(string): 对字符串进行SHA1/MD5散列处理，生成不可逆的摘要。

encrypt(string, key): 使用指定密钥对字符串进行加密。

decrypt(string, key): 使用指定密钥对字符串进行解密。

scramble(string): 对字符串进行乱序处理。

shuffle(string): 对字符串进行随机打乱处理。

redact(string, pattern): 将字符串中符合指定正则表达式的部分替换为指定字符。



--------- 其他函数 ---------

int hash(a1[, a2...]): 返回参数的hash数字。这个函数可以用于快速生成输入参数的哈希值，用于数据的散列分布和分区操作。

string current_user(): 返回当前登录用户的用户名。这个函数可以用于获取当前会话中正在操作的用户，用于权限控制和审计。

string current_database(): 返回当前选择的数据库的名称。这个函数可以用于获取当前会话中正在使用的数据库，用于操作数据库对象。

string version(): 返回当前 Hive 的版本号。这个函数可以用于获取 Hive 的版本信息，用于判断功能支持和升级决策。

string md5(string/binary): 返回给定参数的 MD5 值。这个函数可以用于对字符串或二进制数据进行 MD5 哈希计算，用于数据完整性校验或加密存储。