HBase 分布式部署
安装部署 hadoop ha 分布式环境(前面部署完成,这里直接跳过)
部署HA参考:https://www.cnblogs.com/skyrainmom/p/17438814.html
解压安装文件,配置环境变量
#解压
[root@master ~]# tar xf hbase-1.2.1-bin.tar.gz -C /usr/local/src/
[root@master ~]# cd /usr/local/src/
[root@master src]# ls
hadoop hbase-1.2.1 hive jdk sqoop zookeeper
#更改名称
[root@master src]# mv hbase-1.2.1/ hbase
[root@master src]# ls
hadoop hbase hive jdk sqoop zookeeper
#配置环境变量
[root@master src]# vim /etc/profile.d/hbase.sh
#写入以下内容
export HBASE_HOME=/usr/local/src/hbase
export PATH=$PATH:$HBASE_HOME/bin
#同步到slave节点
[root@master ~]# scp -r /etc/profile.d/hbase.sh slave1:/etc/profile.d/
hbase.sh 100% 73 0.7KB/s 00:00
[root@master ~]# scp -r /etc/profile.d/hbase.sh slave2:/etc/profile.d/
hbase.sh 100% 73 0.7KB/s 00:00
修改配置文件(master 节点,conf文件修改)
[root@master ~]# cd /usr/local/src/hbase/conf/
[root@master conf]# vim hbase-env.sh
#写入以下
export JAVA_HOME=/usr/local/src/jdk
export HADOOP_HOME=/usr/local/src/hadoop
export HADOOP_CONF_DIR=${HADOOP_HOME}/etc/hadoop
export HBASE_MANAGES_ZK=false
export HBASE_LOG_DIR=${HBASE_HOME}/logs
export HBASE_PID_DIR=${HBASE_HOME}/pid
#注释掉下面两行
#export HBASE_MASTER_OPTS="$HBASE_MASTER_OPTS -XX:PermSize=128m -XX:MaxPermSize=128m"
#export HBASE_REGIONSERVER_OPTS="$HBASE_REGIONSERVER_OPTS -XX:PermSize=128m -XX:MaxPermSize=128m"
#修改配置文件 hbase-site.xml
[root@master conf]# vim hbase-site.xml
<configuration>
<property>
<name>hbase.rootdir</name>
<value>hdfs://master:8020/hbase</value>
</property>
<property>
<name>hbase.master.info.port</name>
<value>16010</value>
</property>
<property>
<name>hbase.zookeeper.property.clientPort</name>
<value>2181</value>
</property>
<property>
<name>hbase.tmp.dir</name>
<value>/usr/local/src/hbase/tmp</value>
</property>
<property>
<name>zookeeper.session.timeout</name>
<value>120000</value>
</property>
<property>
<name>hbase.cluster.distributed</name>
<value>true</value>
</property>
<property>
<name>hbase.zookeeper.quorum</name>
<value>master,slave1,slave2</value>
</property>
<property>
<name>hbase.zookeeper.property.dataDir</name>
<value>/usr/local/src/hbase/tmp/zookeeper-hbase</value>
</property>
</configuration>
#修改 regionservers 文件,删除 localhost,添加以下内容
[root@master conf]# vim regionservers
slave1
slave2
#将 core-site.xml 和 hdfs-site.xml 两个文件拷贝到 $HBASE_HOME/conf/ 目录下
[root@master conf]# cp /usr/local/src/hadoop/etc/hadoop/core-site.xml /usr/local/src/hbase/conf/
[root@master conf]# cp /usr/local/src/hadoop/etc/hadoop/hdfs-site.xml /usr/local/src/hbase/conf/
集群分发
#将 master 节点配置好的 HBase 安装包分发给 slave1,slave2 节点
[root@master ~]# scp -r /usr/local/src/hbase/ slave1:/usr/local/src/
[root@master ~]# scp -r /usr/local/src/hbase/ slave2:/usr/local/src/
#更改所有节点文件夹权限
[root@master ~]# chown -R hadoop.hadoop /usr/local/src/
[root@slave1 ~]# chown -R hadoop.hadoop /usr/local/src/
[root@slave2 ~]# chown -R hadoop.hadoop /usr/local/src/
HBase 集群启动
#先切换到 hadoop 用户,在三个节点上分别启动 zookeeper
[hadoop@master ~]$ zkServer.sh start
ZooKeeper JMX enabled by default
Using config: /usr/local/src/zookeeper/bin/../conf/zoo.cfg
Starting zookeeper ... STARTED
[hadoop@master ~]$ zkServer.sh status
ZooKeeper JMX enabled by default
Using config: /usr/local/src/zookeeper/bin/../conf/zoo.cfg
Mode: follower
[hadoop@slave1 ~]$ zkServer.sh start
ZooKeeper JMX enabled by default
Using config: /usr/local/src/zookeeper/bin/../conf/zoo.cfg
Starting zookeeper ... STARTED
[hadoop@slave1 ~]$ zkServer.sh status
ZooKeeper JMX enabled by default
Using config: /usr/local/src/zookeeper/bin/../conf/zoo.cfg
Mode: follower
[hadoop@slave2 ~]$ zkServer.sh start
ZooKeeper JMX enabled by default
Using config: /usr/local/src/zookeeper/bin/../conf/zoo.cfg
Starting zookeeper ... STARTED
[hadoop@slave2 ~]$ zkServer.sh status
ZooKeeper JMX enabled by default
Using config: /usr/local/src/zookeeper/bin/../conf/zoo.cfg
Mode: leader
#启动 HBase 前必须先启动 hadoop 集群(即先执行下面的 start-all.sh,再执行 start-hbase.sh),否则 master 节点会持续退出
[hadoop@master ~]$ start-hbase.sh
starting master, logging to /usr/local/src/hbase/logs/hbase-hadoop-master-master.out
slave1: regionserver running as process 29293. Stop it first.
slave2: regionserver running as process 28815. Stop it first.
[hadoop@master ~]$ start-all.sh
[hadoop@master ~]$ yarn-daemon.sh start proxyserver
starting proxyserver, logging to /usr/local/src/hadoop/logs/yarn-hadoop-proxyserver-master.out
[hadoop@master ~]$ mr-jobhistory-daemon.sh start historyserver
starting historyserver, logging to /usr/local/src/hadoop/logs/mapred-hadoop-historyserver-master.out
[hadoop@master ~]$ jps
46320 NodeManager
53409 HMaster
67525 JobHistoryServer
45429 JournalNode
45030 DataNode
67639 Jps
45783 DFSZKFailoverController
28591 QuorumPeerMain
44799 NameNode
46095 ResourceManager
[hadoop@slave1 ~]$ jps
42240 DataNode
42513 JournalNode
42770 DFSZKFailoverController
41990 NameNode
43062 NodeManager
56921 Jps
26762 QuorumPeerMain
29293 HRegionServer
[hadoop@slave2 ~]$ jps
41682 DataNode
26371 QuorumPeerMain
41975 JournalNode
56920 Jps
28815 HRegionServer
42495 NodeManager
Web界面查看:http://192.168.88.10:16010(特别强调:HBase 1.0 及之后版本的 Master Web 端口是 16010,更早的版本是 60010)
HBase 库操作与表操作
HBase 库操作
HBase 动态删除节点
节点升级或者硬盘扩容在存储服务器上属于正常现象,当某存储节点需要扩容升级而短暂停机时,需要先将该节点从集群中下线
#graceful_stop.sh 脚本会自行关闭平衡器,并把 slave2 节点上的数据(region)移动到其他节点上,此步骤需要较长时间等待
[hadoop@master ~]$ graceful_stop.sh slave2
......
2023-05-31 13:11:17,016 INFO [RubyThread-6: /usr/local/src/hbase/bin/thread-pool.rb:28] region_mover: Moved region hbase:namespace,,1685509218437.02f6bf95c972e49580aca5a27dbc853f. cost: 2.605
2023-05-31 13:11:17,019 INFO [main] region_mover: Pool completed
2023-05-31 13:11:17,030 INFO [main] region_mover: Wrote list of moved regions to /tmp/slave2
2023-05-31T13:11:17 Unloaded slave2 region(s)
2023-05-31T13:11:17 Stopping regionserver on slave2
slave2: stopping regionserver.........
2023-05-31T13:11:27 Restoring balancer state to true
#同时需要在 hadoop 中删除该节点:新建 exclude 文件,写入要删除的节点名称;再在 hdfs-site.xml 中添加 dfs.hosts.exclude 配置并指向该文件,表示需要下线 exclude 文件中列出的节点
[hadoop@master ~]$ vim /usr/local/src/hadoop/etc/hadoop/exclude
slave2
[hadoop@master ~]$ vim /usr/local/src/hadoop/etc/hadoop/hdfs-site.xml
<property>
<name>dfs.hosts.exclude</name>
<value>/usr/local/src/hadoop/etc/hadoop/exclude</value>
</property>
#刷新配置生效
[hadoop@master ~]$ hadoop dfsadmin -refreshNodes
[hadoop@master ~]$ hadoop dfsadmin -refreshNodes
DEPRECATED: Use of this script to execute hdfs command is deprecated.
Instead use the hdfs command for it.
Refresh nodes successful for slave1/192.168.88.20:8020
Refresh nodes successful for master/192.168.88.10:8020
Web界面查看:http://192.168.88.10:16010
打开 HDFS 的 Web UI 监控页面(http://192.168.88.10:50070)查看,发现此节点显示(Decommission In Progress),表示节点正在做数据迁移;等待迁移完成后节点停止,dead node 列表会显示已下线的节点,然后进行收尾
#节点下线后需要将 slaves 与 exclude 文件中的 slave2 删除,再次执行 hadoop 刷新节点命令,此时全部结束
[hadoop@master ~]$ vim /usr/local/src/hadoop/etc/hadoop/exclude
slave2 #删除它
[hadoop@master ~]$ hadoop dfsadmin -refreshNodes
DEPRECATED: Use of this script to execute hdfs command is deprecated.
Instead use the hdfs command for it.
Refresh nodes successful for slave1/192.168.88.20:8020
Refresh nodes successful for master/192.168.88.10:8020
HBase 动态增加节点
#在新的节点上启动服务。切换到新增节点上,使用以下命令
[hadoop@slave2 ~]$ hbase-daemon.sh start regionserver
starting regionserver, logging to /usr/local/src/hbase/logs/hbase-hadoop-regionserver-slave2.out
Web界面查看:http://192.168.88.10:16010
HBase 表管理
建立表,两个列簇:name 和 num
#进入 HBase 命令行
[hadoop@master ~]$ hbase shell
SLF4J: Class path contains multiple SLF4J bindings.
SLF4J: Found binding in [jar:file:/usr/local/src/hbase/lib/slf4j-log4j12-1.7.5.jar!/org/slf4j/impl/StaticLoggerBinder.class]
SLF4J: Found binding in [jar:file:/usr/local/src/hadoop/share/hadoop/common/lib/slf4j-log4j12-1.7.10.jar!/org/slf4j/impl/StaticLoggerBinder.class]
SLF4J: See http://www.slf4j.org/codes.html#multiple_bindings for an explanation.
SLF4J: Actual binding is of type [org.slf4j.impl.Log4jLoggerFactory]
HBase Shell; enter 'help<RETURN>' for list of supported commands.
Type "exit<RETURN>" to leave the HBase Shell
Version 1.2.1, r8d8a7107dc4ccbf36a92f64675dc60392f85c015, Wed Mar 30 11:19:21 CDT 2016
hbase(main):001:0>
#建立表 student,两个列簇:name 和 num
hbase(main):001:0> create 'student',{NAME=>'name'},{NAME=>'num'}
0 row(s) in 4.7690 seconds
=> Hbase::Table - student
#新建学生表,存储姓名与学号。
#语法:create <table>, {NAME => <family>, VERSIONS => <VERSIONS>}
Web界面查看:http://192.168.88.10:16010
查看所有表与详细信息
hbase(main):002:0> list
TABLE
student
1 row(s) in 0.0250 seconds
=> ["student"]
#查看建表详细信息
hbase(main):003:0> describe 'student'
Table student is ENABLED
student
COLUMN FAMILIES DESCRIPTION
{NAME => 'name', BLOOMFILTER => 'ROW', VERSIONS => '1', IN_MEMORY => 'false', KEE
P_DELETED_CELLS => 'FALSE', DATA_BLOCK_ENCODING => 'NONE', TTL => 'FOREVER', COMP
RESSION => 'NONE', MIN_VERSIONS => '0', BLOCKCACHE => 'true', BLOCKSIZE => '65536
', REPLICATION_SCOPE => '0'}
{NAME => 'num', BLOOMFILTER => 'ROW', VERSIONS => '1', IN_MEMORY => 'false', KEEP
_DELETED_CELLS => 'FALSE', DATA_BLOCK_ENCODING => 'NONE', TTL => 'FOREVER', COMPR
ESSION => 'NONE', MIN_VERSIONS => '0', BLOCKCACHE => 'true', BLOCKSIZE => '65536'
, REPLICATION_SCOPE => '0'}
2 row(s) in 0.1310 seconds
#在建立表时没有指定列的详细属性,系统根据默认设置。
#语法:describe <table>
修改表
#新增列族 tel;alter 也可以删除列族,或对列族的属性进行修改
hbase(main):002:0> alter 'student' ,{NAME=>'tel'}
Updating all regions with the new schema...
0/1 regions updated.
1/1 regions updated.
Done.
0 row(s) in 2.9960 seconds
#修改原 name 列的 VERSIONS 属性为 2。删除刚增加的 tel 列
hbase(main):004:0> alter 'student' ,{'NAME'=>'name',VERSIONS=>'2'}
Updating all regions with the new schema...
1/1 regions updated.
Done.
0 row(s) in 1.9430 seconds
hbase(main):003:0> alter 'student',NAME=>'tel',METHOD=>'delete'
Updating all regions with the new schema...
1/1 regions updated.
Done.
0 row(s) in 2.1870 seconds
删除表
hbase(main):009:0> disable 'student'
0 row(s) in 2.3020 seconds
hbase(main):010:0> drop 'student'
0 row(s) in 1.2800 seconds
hbase(main):011:0> list
TABLE
0 row(s) in 0.0310 seconds
=> []
hbase(main):012:0> status
1 active master, 0 backup masters, 2 servers, 0 dead, 1.0000 average load
HBase 数据操作
插入数据和修改
#建立表 student,两个列簇:name 和 num
hbase(main):001:0> create 'student',{NAME=>'name'},{NAME=>'num'}
0 row(s) in 1.5070 seconds
=> Hbase::Table - student
hbase(main):002:0> list
TABLE
student
1 row(s) in 0.0150 seconds
=> ["student"]
#插入两条数据
hbase(main):003:0> put 'student','rk1','name','Tom'
0 row(s) in 0.1710 seconds
hbase(main):004:0> put 'student','rk1','num','123456'
0 row(s) in 0.0410 seconds
hbase(main):005:0> put 'student','rk2','name','Sun'
0 row(s) in 0.0160 seconds
hbase(main):006:0> put 'student','rk2','num','123456'
0 row(s) in 0.0120 seconds
hbase(main):007:0> put 'student','rk3','name:cha','wangyu'
0 row(s) in 0.2690 seconds
#查看整个表记录,修改操作也是用 put 命令,就是重新添加内容,把以前的内容覆盖。
#语法:put <table>,<rowkey>,<family:column>,<value>,<timestamp>。其中,'student' 为表名,'rk1' 为 rowkey,'name:cha' 中 name 为列族、cha 为列,'Tom' 为值;同一个列族下可以有多个列,同一个 rowkey 视为同一行
读取指定行、指定行中的列的信息
hbase(main):002:0> get 'student','rk1'
COLUMN CELL
name: timestamp=1685511775270, value=Tom
num: timestamp=1685511788806, value=123456
2 row(s) in 0.0420 seconds
hbase(main):003:0> get 'student','rk1','name'
COLUMN CELL
name: timestamp=1685511775270, value=Tom
1 row(s) in 0.0140 seconds
scan 命令扫描全表
#语法:scan <table>, {COLUMNS => [ <family:column>,.... ], LIMIT => num}
#注:数据导入时,要注意数据的格式,否则显示为十六进制
hbase(main):004:0> scan 'student'
ROW COLUMN+CELL
rk1 column=name:, timestamp=1685511775270, value=Tom
rk1 column=num:, timestamp=1685511788806, value=123456
rk2 column=name:, timestamp=1685511809717, value=Sun
rk2 column=num:, timestamp=1685511817420, value=123456
rk3 column=name:cha, timestamp=1685511868653, value=wangyu
3 row(s) in 0.0550 seconds
删除指定行中的列、指定行,清空表。
#语法:delete <table>, <rowkey>, <family:column> , <timestamp>,必须指定列名,这里需要注意,如果该列保存有多个版本的数据,将一并被删除
hbase(main):005:0> delete 'student','rk2','name'
0 row(s) in 0.0420 seconds
#语法:deleteall <table>, <rowkey>, <family:column> , <timestamp>,可以不指定列名,删除整行数据
hbase(main):006:0> deleteall 'student','rk2'
0 row(s) in 0.0280 seconds
#使用 truncate 命令,删除 table_name 表中的所有数据。
#语法:truncate <table> 其具体过程是:disable table -> drop table -> create table
hbase(main):007:0> truncate 'student'
Truncating 'student' table (it may take a while):
- Disabling table...
- Truncating table...
0 row(s) in 4.6550 seconds
模糊查询
限制查询
hbase(main):008:0> put 'student','rk1','name','Tom'
0 row(s) in 0.1380 seconds
hbase(main):009:0> put 'student','rk1','num','123456'
0 row(s) in 0.0230 seconds
hbase(main):010:0> put 'student','rk2','name','Sun'
0 row(s) in 0.0110 seconds
hbase(main):011:0> put 'student','rk2','num','123456'
0 row(s) in 0.0130 seconds
hbase(main):012:0> put 'student','rk3','name:cha','wangyu'
0 row(s) in 0.2650 seconds
#语法:scan <table> ,{COLUMNS=>' column '}
#count 对表计数时,INTERVAL:每隔多少行显示一次 count,默认是 1000;CACHE:每次去取的缓存区大小,默认是 10,调整该参数可提高查询速度,大表查询通过参数设置可以加快计算速度
hbase(main):002:0> scan 'student',{COLUMNS=>'name'}
ROW COLUMN+CELL
rk1 column=name:, timestamp=1685512161987, value=Tom
rk2 column=name:, timestamp=1685512189067, value=Sun
rk3 column=name:cha, timestamp=1685512266266, value=wangyu
3 row(s) in 0.0500 seconds
#语法:count <table>, {INTERVAL => intervalNum, CACHE => cacheNum}
#下面的命令使用 scan 的 LIMIT 参数限制返回的行数(这里只返回前 2 行)
hbase(main):013:0> scan 'student',{COLUMNS=>['name','num'],LIMIT=>2}
ROW COLUMN+CELL
rk1 column=name:, timestamp=1685512161987, value=Tom
rk1 column=num:, timestamp=1685512177938, value=123456
rk2 column=name:, timestamp=1685512189067, value=Sun
rk2 column=num:, timestamp=1685512197213, value=123456
2 row(s) in 0.0200 seconds
限制时间范围
#[]里面的值需要看上面那两条命令得出的结果去推断时间范围
hbase(main):004:0> scan 'student', {TIMERANGE =>[1685512161987,1685512266266]}
ROW COLUMN+CELL
rk1 column=name:, timestamp=1685512161987, value=Tom
rk1 column=num:, timestamp=1685512177938, value=123456
rk2 column=name:, timestamp=1685512189067, value=Sun
rk2 column=num:, timestamp=1685512197213, value=123456
2 row(s) in 0.0470 seconds
#时间戳是 1970 年 01 月 01 日 00 时 00 分 00 秒(UTC)起至当下的总毫秒数(HBase 默认使用毫秒级时间戳,例如上面的 1685512161987)。通常表示提供一份电子证据,以证明用户的某些数据的产生时间
PrefixFilter:rowKey 前缀过滤
hbase(main):005:0> scan 'student',{FILTER=>"PrefixFilter('rk')"}
ROW COLUMN+CELL
rk1 column=name:, timestamp=1685512161987, value=Tom
rk1 column=num:, timestamp=1685512177938, value=123456
rk2 column=name:, timestamp=1685512189067, value=Sun
rk2 column=num:, timestamp=1685512197213, value=123456
rk3 column=name:cha, timestamp=1685512266266, value=wangyu
3 row(s) in 0.0550 seconds
#同时也有 QualifierFilter:列名过滤器、TimestampsFilter:时间戳过滤器等,支持“且”操作。
ValueFilter:值确定查询(value=Tom)与模糊查询(value 包含 m)
hbase(main):006:0> scan 'student',FILTER=>"ValueFilter(=,'binary:Tom')"
ROW COLUMN+CELL
rk1 column=name:, timestamp=1685512161987, value=Tom
1 row(s) in 0.0450 seconds
hbase(main):007:0> scan 'student',FILTER=>"ValueFilter(=,'substring:m')"
ROW COLUMN+CELL
rk1 column=name:, timestamp=1685512161987, value=Tom
1 row(s) in 0.0540 seconds
批量导入/导出
ImportTsv 工具
#先将导入导出的文件上传到节点
#命令:bin/hbase org.apache.hadoop.hbase.mapreduce.ImportTsv
#Usage: importtsv -Dimporttsv.columns=a,b,c <tablename> <inputdir>
[hadoop@master ~]$ mkdir student
[hadoop@master ~]$ cd student
[hadoop@master student]$ ls
student.csv student.txt
[hadoop@master student]$ hdfs dfs -put /home/hadoop/student/student.csv /input
[hadoop@master student]$ hbase org.apache.hadoop.hbase.mapreduce.ImportTsv -Dimporttsv.separator="," -Dimporttsv.columns=HBASE_ROW_KEY,name,num student /input/student.csv
.......
2023-05-31 14:09:10,193 INFO [main] mapreduce.Job: Running job: job_1685509012864_0001
2023-05-31 14:09:28,392 INFO [main] mapreduce.Job: Job job_1685509012864_0001 running in uber mode : false
2023-05-31 14:09:28,393 INFO [main] mapreduce.Job: map 0% reduce 0%
2023-05-31 14:09:41,539 INFO [main] mapreduce.Job: map 100% reduce 0%
2023-05-31 14:09:42,570 INFO [main] mapreduce.Job: Job job_1685509012864_0001 completed successfully
2023-05-31 14:09:42,917 INFO [main] mapreduce.Job: Counters: 31
........
[hadoop@master student]$ hdfs dfs -ls /input
Found 1 items
-rw-r--r-- 2 hadoop supergroup 8696 2023-05-31 14:07 /input/student.csv
Export 数据导出
#命令:bin/hbase org.apache.hadoop.hbase.mapreduce.Export
#Usage: <tablename> <hdfsdir>
[hadoop@master bin]$ cd /usr/local/src/hbase/bin
[hadoop@master bin]$ hbase org.apache.hadoop.hbase.mapreduce.Export student /output/hbase-data-back
.........
2023-05-31 14:14:11,176 INFO [main] mapreduce.Job: The url to track the job: http://master:8088/proxy/application_1685509012864_0002/
2023-05-31 14:14:11,177 INFO [main] mapreduce.Job: Running job: job_1685509012864_0002
2023-05-31 14:14:21,443 INFO [main] mapreduce.Job: Job job_1685509012864_0002 running in uber mode : false
2023-05-31 14:14:21,445 INFO [main] mapreduce.Job: map 0% reduce 0%
2023-05-31 14:14:40,636 INFO [main] mapreduce.Job: map 100% reduce 0%
2023-05-31 14:14:40,658 INFO [main] mapreduce.Job: Job job_1685509012864_0002 completed successfully
...........
Web界面查看:http://192.168.88.10:50070
标签:src,可用,hadoop,master,student,HBase,main,hbase From: https://www.cnblogs.com/skyrainmom/p/17489362.html