HBase Filter Queries
Reposted from: http://chengjianxiaoxue.iteye.com/blog/2312179
list shows tables and also accepts a regular expression:
- hbase(main):014:0> list 'zm.*'
- TABLE
- zmtest1
- 1 row(s) in 0.0060 seconds
- => ["zmtest1"]
The commands below are summarized from the case study that follows:
- rowkey: PrefixFilter
- column qualifier: ColumnPrefixFilter
- column value: ValueFilter (exact or substring match)
- ValueFilter(>,'binary:\x00\x00\x00\x00\x00\x00\x00\x0F')
- ValueFilter(=,'binary:sku188')
- ValueFilter(=,'substring:1')
- can be combined with a row limit: LIMIT => 10
- scan 'zmtest1', LIMIT => 10, FILTER=>"( ColumnPrefixFilter('app') AND ValueFilter(>,'binary:\x00\x00\x00\x00\x00\x00\x00\x0F') )"
- scan 'zmtest1', LIMIT => 10 -- show the first 10 rows
- FirstKeyOnlyFilter: returns only the first column (and its value) of each row
- RowFilter: filters rows by matching on the rowkey
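- A minimal sketch of how these combine for fast row counting, assuming the zmtest1 table built in the case study below: together they return one empty-valued cell per row, so far less data comes back from the region servers.
- scan 'zmtest1', FILTER => "FirstKeyOnlyFilter() AND KeyOnlyFilter()"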
- Mapping HBase binary column values into Hive:
- CREATE EXTERNAL TABLE user_app_cookie_list ( username STRING, app1_cookie_id BIGINT, app2_cookie_id BIGINT )
- STORED BY 'org.apache.hadoop.hive.hbase.HBaseStorageHandler'
- WITH SERDEPROPERTIES ("hbase.columns.mapping" = ":key, lf:app1#b, lf:app2#b")
- TBLPROPERTIES("hbase.table.name" = "zmtest1");
- In the case study below, the values written by the hbase shell puts are binary, so the Hive external table must carry the #b suffix in the mapping; otherwise the mapped Hive columns come back as NULL after the join.
- In the HBase + Hive integration example (/blog/2279407), the shell puts wrote plain strings, so the external table needs no format suffix there. Creating that table with either #b or #s gave correct query results,
- so #s is presumably the default; #b only matters when the hbase shell inserts binary data.
- -- add a column family
- alter 'zmtest1', NAME => 'cf'
- -- delete a column family
- alter 'zmtest1', NAME => 'cf', METHOD => 'delete'
- or
- alter 'zmtest1', 'delete' => 'cf'
=============The case study============================
- create 'zmtest1', 'lf', 'sf'
- -- lf: column family of LONG values (binary value)
- -- sf: column family of STRING values
- -- rowkey: a user (userX) plus a time (tsX)
- -- column qualifier: the action taken on a product; value: the product (skuXXX). e.g. c1: click from homepage; c2: click from ad; s1: search from homepage; b1: buy
- put 'zmtest1', 'user1|ts1', 'sf:c1', 'sku1' -- in table zmtest1, for rowkey user1|ts1, set column sf:c1 to value sku1
- put 'zmtest1', 'user1|ts2', 'sf:c1', 'sku188'
- put 'zmtest1', 'user1|ts3', 'sf:s1', 'sku123'
- put 'zmtest1', 'user2|ts4', 'sf:c1', 'sku2'
- put 'zmtest1', 'user2|ts5', 'sf:c2', 'sku288'
- put 'zmtest1', 'user2|ts6', 'sf:s1', 'sku222'
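- To eyeball the full dataset before filtering, run a plain scan (output omitted here; your timestamps will differ):
- scan 'zmtest1'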
- ====================================hbase shell filter=======================================================
- scan 'zmtest1', FILTER=>"ValueFilter(=,'binary:sku188')" -- filter on the cell value; binary means an exact byte-for-byte match
- hbase(main):012:0> scan 'zmtest1',FILTER=>"ValueFilter(=,'binary:sku188')"
- ROW COLUMN+CELL
- user1|ts2 column=sf:c1, timestamp=1468893517729, value=sku188
- scan 'zmtest1', FILTER=>"ValueFilter(=,'substring:1')" -- substring matches any cell value containing the given substring
- hbase(main):013:0> scan 'zmtest1',FILTER=>"ValueFilter(=,'substring:1')"
- ROW COLUMN+CELL
- user1|ts1 column=sf:c1, timestamp=1468893512967, value=sku1
- user1|ts2 column=sf:c1, timestamp=1468893517729, value=sku188
- user1|ts3 column=sf:s1, timestamp=1468893522589, value=sku123
- scan 'zmtest1', FILTER=>"ValueFilter(=,'substring:88')"
- scan 'zmtest1', FILTER=>"ColumnPrefixFilter('c2') AND ValueFilter(=,'substring:88')" -- ColumnPrefixFilter表示列名的substring包含,列名和列值的组合过滤
- scan 'zmtest1', FILTER=>"ColumnPrefixFilter('s') AND ( ValueFilter(=,'substring:123') OR ValueFilter(=,'substring:222') )"
- =============================================================================================================
- FirstKeyOnlyFilter(): a rowkey can have multiple versions (distinguished by timestamp) and multiple columns; this filter returns only the first version of the first column in each row.
- KeyOnlyFilter(): returns only the key of each cell and drops the value, which makes operations like counting more efficient.
- scan 'zmtest1', FILTER=>"FirstKeyOnlyFilter() AND ValueFilter(=,'binary:sku188') AND KeyOnlyFilter()"
- -- result
- ROW COLUMN+CELL
- user1|ts2 column=sf:c1, timestamp=1468893517729, value=
- scan 'zmtest1', FILTER=>"FirstKeyOnlyFilter() AND ValueFilter(=,'binary:sku188')"
- -- result
- ROW COLUMN+CELL
- user1|ts2 column=sf:c1, timestamp=1468893517729, value=sku188
- -- rowkey filtering with PrefixFilter
- scan 'zmtest1', FILTER => "PrefixFilter('user1')"
- -- result:
- ROW COLUMN+CELL
- user1|ts1 column=sf:c1, timestamp=1468893512967, value=sku1
- user1|ts2 column=sf:c1, timestamp=1468893517729, value=sku188
- user1|ts3 column=sf:s1, timestamp=1468893522589, value=sku123
- -- STARTROW: the scan starts at this row, inclusive
- -- STOPROW: the scan stops at this row, exclusive
- -- STARTROW and STOPROW need not be exact rowkeys; any prefix of an existing rowkey works, since rows are scanned in lexicographic order
- scan 'zmtest1', {STARTROW=>'user1|ts2', FILTER => "PrefixFilter ('user1')"}
- -- result
- ROW COLUMN+CELL
- user1|ts2 column=sf:c1, timestamp=1468893517729, value=sku188
- user1|ts3 column=sf:s1, timestamp=1468893522589, value=sku123
- scan 'zmtest1', {STARTROW=>'user1|ts2', STOPROW=>'user2'}
- -- result
- ROW COLUMN+CELL
- user1|ts2 column=sf:c1, timestamp=1468893517729, value=sku188
- user1|ts3 column=sf:s1, timestamp=1468893522589, value=sku123
- scan 'zmtest1', {STARTROW=>'user1', STOPROW=>'user2'} -- user2 need not be an exact rowkey
- -- result
- ROW COLUMN+CELL
- user1|ts1 column=sf:c1, timestamp=1468893512967, value=sku1
- user1|ts2 column=sf:c1, timestamp=1468893517729, value=sku188
- user1|ts3 column=sf:s1, timestamp=1468893522589, value=sku123
- ==========================The filters above can be called directly; the ones below must be imported before use===============================
- --
- import org.apache.hadoop.hbase.filter.CompareFilter
- import org.apache.hadoop.hbase.filter.SubstringComparator
- import org.apache.hadoop.hbase.filter.RowFilter
- -- rowkeys containing ts3
- scan 'zmtest1', {FILTER => RowFilter.new(CompareFilter::CompareOp.valueOf('EQUAL'), SubstringComparator.new('ts3'))}
- e.g.:
- hbase(main):024:0> import org.apache.hadoop.hbase.filter.CompareFilter
- => Java::OrgApacheHadoopHbaseFilter::CompareFilter
- hbase(main):025:0> import org.apache.hadoop.hbase.filter.SubstringComparator
- => Java::OrgApacheHadoopHbaseFilter::SubstringComparator
- hbase(main):026:0> import org.apache.hadoop.hbase.filter.RowFilter
- => Java::OrgApacheHadoopHbaseFilter::RowFilter
- hbase(main):027:0> scan 'zmtest1', {FILTER => RowFilter.new(CompareFilter::CompareOp.valueOf('EQUAL'), SubstringComparator.new('ts3'))}
- ROW COLUMN+CELL
- user1|ts3 column=sf:s1, timestamp=1468893522589, value=sku123
- -- regex comparator (import required)
- import org.apache.hadoop.hbase.filter.RegexStringComparator
- put 'zmtest1', 'user2|err', 'sf:s1', 'sku999'
- scan 'zmtest1', {FILTER => RowFilter.new(CompareFilter::CompareOp.valueOf('EQUAL'),RegexStringComparator.new('^user\d+\|ts\d+$'))}
- -- result (the user2|err row just inserted does not match the pattern and is filtered out)
- ROW COLUMN+CELL
- user1|ts1 column=sf:c1, timestamp=1468893512967, value=sku1
- user1|ts2 column=sf:c1, timestamp=1468893517729, value=sku188
- user1|ts3 column=sf:s1, timestamp=1468893522589, value=sku123
- user2|ts4 column=sf:c1, timestamp=1468893527021, value=sku2
- user2|ts5 column=sf:c2, timestamp=1468893527046, value=sku288
- user2|ts6 column=sf:s1, timestamp=1468893528197, value=sku222
- import org.apache.hadoop.hbase.filter.CompareFilter
- import org.apache.hadoop.hbase.filter.SingleColumnValueFilter
- import org.apache.hadoop.hbase.filter.SubstringComparator
- import org.apache.hadoop.hbase.util.Bytes
- scan 't1', { COLUMNS => 'family:qualifier', FILTER =>
-     SingleColumnValueFilter.new(
-       Bytes.toBytes('family'),
-       Bytes.toBytes('qualifier'),
-       CompareFilter::CompareOp.valueOf('EQUAL'),
-       SubstringComparator.new('somevalue')) }
- put 'zmtest1', 'user1|ts9', 'sf:b1', 'sku1'
- scan 'zmtest1', FILTER=>"ColumnPrefixFilter('b1') AND ValueFilter(=,'binary:sku1')"
- scan 'zmtest1', {COLUMNS => 'sf:b1', FILTER => SingleColumnValueFilter.new(Bytes.toBytes('sf'), Bytes.toBytes('b1'), CompareFilter::CompareOp.valueOf('EQUAL'), Bytes.toBytes('sku1'))}
- -- binary value --
- org.apache.hadoop.hbase.util.Bytes.toString("Hello HBase".to_java_bytes)
- org.apache.hadoop.hbase.util.Bytes.toString("\x48\x65\x6c\x6c\x6f\x20\x48\x42\x61\x73\x65".to_java_bytes)
- -- rowkey: the user (userX); column qualifiers: the user's devices (browser, app, pc); values: the corresponding cookie_id (a long)
- put 'zmtest1', 'user1', 'lf:browser1', "\x00\x00\x00\x00\x00\x00\x00\x02"
- put 'zmtest1', 'user1', 'lf:app1', "\x00\x00\x00\x00\x00\x00\x00\x0F"
- put 'zmtest1', 'user1', 'lf:app2', "\x00\x00\x00\x00\x00\x00\x00\x10"
- put 'zmtest1', 'user2', 'lf:app1', "\x00\x00\x00\x00\x00\x00\x00\x11"
- put 'zmtest1', 'user2', 'lf:pc1', "\x00\x00\x00\x00\x00\x00\x00\x12"
- -- ValueFilter accepts comparison operators such as =, >, <
- scan 'zmtest1', {STOPROW=>'user2', FILTER=>"( ColumnPrefixFilter('app') AND ValueFilter(>,'binary:\x00\x00\x00\x00\x00\x00\x00\x0F') )"}
- scan 'zmtest1', LIMIT => 10, FILTER=>"( ColumnPrefixFilter('app') AND ValueFilter(>,'binary:\x00\x00\x00\x00\x00\x00\x00\x0F') )"
- -- result
- ROW COLUMN+CELL
- user1 column=lf:app2, timestamp=1468906199761, value=\x00\x00\x00\x00\x00\x00\x00\x10
- user2 column=lf:app1, timestamp=1468906199806, value=\x00\x00\x00\x00\x00\x00\x00\x11
- -- add a column family
- alter 'zmtest1', NAME => 'cf'
- -- delete a column family
- alter 'zmtest1', NAME => 'cf', METHOD => 'delete'
- or
- alter 'zmtest1', 'delete' => 'cf'
- -- desc 'zmtest1'
- -- hive hbase mapping -- the :key entry can be omitted here
- CREATE EXTERNAL TABLE user_app_cookie_list ( username STRING, app1_cookie_id BIGINT, app2_cookie_id BIGINT )
- STORED BY 'org.apache.hadoop.hive.hbase.HBaseStorageHandler'
- WITH SERDEPROPERTIES ("hbase.columns.mapping" = ":key, lf:app1#b, lf:app2#b")
- TBLPROPERTIES("hbase.table.name" = "zmtest1");
- select * from user_app_cookie_list;
- --
- #b means binary
- #s means string
- both are keywords in hbase.columns.mapping
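- For contrast, a hypothetical table (the name user_sku_string is made up) that maps the string-valued sf:c1 column with an explicit #s suffix; per the note at the top of this post, the result should match leaving the suffix off entirely:
- CREATE EXTERNAL TABLE user_sku_string ( rk STRING, c1 STRING )
- STORED BY 'org.apache.hadoop.hive.hbase.HBaseStorageHandler'
- WITH SERDEPROPERTIES ("hbase.columns.mapping" = ":key, sf:c1#s")
- TBLPROPERTIES("hbase.table.name" = "zmtest1");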
- --
- The HBase data backing this query:
- user1 column=lf:app1, timestamp=1468906199730, value=\x00\x00\x00\x00\x00\x00\x00\x0F
- user1 column=lf:app2, timestamp=1468906199761, value=\x00\x00\x00\x00\x00\x00\x00\x10
- user2 column=lf:app1, timestamp=1468906199806, value=\x00\x00\x00\x00\x00\x00\x00\x11
- user2 column=lf:pc1, timestamp=1468906200773, value=\x00\x00\x00\x00\x00\x00\x00\x12
- hive> select * from user_app_cookie_list;
- OK
- user1 15 16
- user2 17 NULL
- -- rowkey: the user (userX); column qualifier: the visit time (timestamp); value: the id of the page visited, page_id (an int)
- put 'zmtest1', 'user1', 'cf:1399999999', "\x00\x00\x00\x09"
- put 'zmtest1', 'user1', 'cf:1400000000', "\x00\x00\x00\x08"
- put 'zmtest1', 'user1', 'cf:1400000001', "\x00\x00\x00\x07"
- put 'zmtest1', 'user1', 'cf:1400000002', "\x00\x00\x20\xFB"
- put 'zmtest1', 'user2', 'cf:1500000000', "\x00\x00\x00\x11"
- put 'zmtest1', 'user2', 'cf:1500000001', "\x00\x00\x20\xFC"
- -- put arguments above: table, rowkey, columnfamily:qualifier, value
- -- hive hbase mapping cf with binary --
- http://www.abcn.net/2013/11/hive-hbase-mapping-column-family-with-binary-value.html
- CREATE EXTERNAL TABLE ts_string ( username STRING, visits map<string, int> )
- STORED BY 'org.apache.hadoop.hive.hbase.HBaseStorageHandler'
- WITH SERDEPROPERTIES ("hbase.columns.mapping" = ":key, cf:#s:b")
- TBLPROPERTIES("hbase.table.name" = "zmtest1");
- CREATE EXTERNAL TABLE ts_int ( username STRING, visits map<int, int> )
- STORED BY 'org.apache.hadoop.hive.hbase.HBaseStorageHandler'
- WITH SERDEPROPERTIES ("hbase.columns.mapping" = ":key, cf:#s:b")
- TBLPROPERTIES("hbase.table.name" = "zmtest1");
- CREATE EXTERNAL TABLE ts_int_long ( username STRING, visits map<int, bigint> )
- STORED BY 'org.apache.hadoop.hive.hbase.HBaseStorageHandler'
- WITH SERDEPROPERTIES ("hbase.columns.mapping" = ":key, cf:#s:b")
- TBLPROPERTIES("hbase.table.name" = "zmtest1");
- select * from ts_int
- lateral view explode(visits) t as ts, page;
- select username, ts, page_id from ts_int
- lateral view explode(visits) t as ts, page_id;
- select username, pos, ts, page_id from ts_int
- lateral view posexplode(visits) t as pos, ts, page_id;
- username pos ts page_id
- user1 1 1399999999 9
- user1 2 1400000000 8
- user1 3 1400000001 7
- user1 4 1400000002 8443
- user2 1 1500000000 17
- user2 2 1500000001 8444
- select username, from_unixtime(ts), page_id from ts_int lateral view explode(visits) t as ts, page_id;