HBASE-11676 Scan FORMATTER is not applied for columns using non-printable name in shell (#2161)

- In HBase::Table, the instance variable @converters is used to map column
  names to converters. This patch fixes how HBase::Table#_get_internal and
  HBase::Table#_scan_internal generate the column name key used to access
  @converters.
- Refactor parsing of family:qualifier:converter specifications so that the
  code is more readable and reusable. As part of this change, I added two
  private methods and marked HBase::Table#set_converter as deprecated for
  removal in HBase 4.0.0.
- Add unit testing for the fixed bug

Signed-off-by: stack <stack@apache.org>
This commit is contained in:
Elliot 2020-07-28 23:43:19 -04:00 committed by GitHub
parent 4471a644f6
commit 7974a1e9bf
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
2 changed files with 93 additions and 24 deletions

View File

@ -449,18 +449,23 @@ EOF
# Print out results. Result can be Cell or RowResult.
res = {}
result.listCells.each do |c|
family = convert_bytes_with_position(c.getFamilyArray,
c.getFamilyOffset, c.getFamilyLength, converter_class, converter)
qualifier = convert_bytes_with_position(c.getQualifierArray,
c.getQualifierOffset, c.getQualifierLength, converter_class, converter)
# Get the family and qualifier of the cell without escaping non-printable characters. It is crucial that
# column is constructed in this consistent way so that it can be used as a key.
family_bytes = org.apache.hadoop.hbase.util.Bytes.copy(c.getFamilyArray, c.getFamilyOffset, c.getFamilyLength)
qualifier_bytes = org.apache.hadoop.hbase.util.Bytes.copy(c.getQualifierArray, c.getQualifierOffset, c.getQualifierLength)
column = "#{family_bytes}:#{qualifier_bytes}"
column = "#{family}:#{qualifier}"
value = to_string(column, c, maxlength, converter_class, converter)
# Use the FORMATTER to determine how column is printed
family = convert_bytes(family_bytes, converter_class, converter)
qualifier = convert_bytes(qualifier_bytes, converter_class, converter)
formatted_column = "#{family}:#{qualifier}"
if block_given?
yield(column, value)
yield(formatted_column, value)
else
res[column] = value
res[formatted_column] = value
end
end
@ -604,19 +609,24 @@ EOF
is_stale |= row.isStale
row.listCells.each do |c|
family = convert_bytes_with_position(c.getFamilyArray,
c.getFamilyOffset, c.getFamilyLength, converter_class, converter)
qualifier = convert_bytes_with_position(c.getQualifierArray,
c.getQualifierOffset, c.getQualifierLength, converter_class, converter)
# Get the family and qualifier of the cell without escaping non-printable characters. It is crucial that
# column is constructed in this consistent way so that it can be used as a key.
family_bytes = org.apache.hadoop.hbase.util.Bytes.copy(c.getFamilyArray, c.getFamilyOffset, c.getFamilyLength)
qualifier_bytes = org.apache.hadoop.hbase.util.Bytes.copy(c.getQualifierArray, c.getQualifierOffset, c.getQualifierLength)
column = "#{family_bytes}:#{qualifier_bytes}"
column = "#{family}:#{qualifier}"
cell = to_string(column, c, maxlength, converter_class, converter)
# Use the FORMATTER to determine how column is printed
family = convert_bytes(family_bytes, converter_class, converter)
qualifier = convert_bytes(qualifier_bytes, converter_class, converter)
formatted_column = "#{family}:#{qualifier}"
if block_given?
yield(key, "column=#{column}, #{cell}")
yield(key, "column=#{formatted_column}, #{cell}")
else
res[key] ||= {}
res[key][column] = cell
res[key][formatted_column] = cell
end
end
# One more row processed
@ -729,11 +739,15 @@ EOF
org.apache.hadoop.hbase.TableName::META_TABLE_NAME.equals(@table.getName)
end
# Returns family and (when has it) qualifier for a column name
# Given a column specification in the format FAMILY[:QUALIFIER[:CONVERTER]]
# 1. Save the converter for the given column
# 2. Return a 2-element Array with [family, qualifier or nil], discarding the converter if provided
#
# @param [String] column specification
def parse_column_name(column)
split = org.apache.hadoop.hbase.CellUtil.parseColumn(column.to_java_bytes)
set_converter(split) if split.length > 1
[split[0], split.length > 1 ? split[1] : nil]
spec = parse_column_format_spec(column)
set_column_converter(spec.family, spec.qualifier, spec.converter) unless spec.converter.nil?
[spec.family, spec.qualifier]
end
def toISO8601(millis)
@ -806,9 +820,46 @@ EOF
eval(converter_class).method(converter_method).call(bytes, offset, len)
end
# Struct holding the pieces of one column format specification: the column
# family, the column qualifier, and the converter that designates how the
# column should be printed.
ColumnFormatSpec = Struct.new(:family, :qualifier, :converter)
##
# Break a column format specification, as used by shell commands like :scan,
# into its family, qualifier and converter parts.
#
# Accepted layout:
#   FAMILY[:QUALIFIER[:CONVERTER]]
# Where:
#   - FAMILY is the column family
#   - QUALIFIER is the column qualifier. Non-printable characters should be left AS-IS and should NOT BE escaped.
#   - CONVERTER is optional and is the name of a converter (like toLong) to apply
#
# @param [String] column
# @return [ColumnFormatSpec] family, qualifier, and converter as Java bytes;
#   qualifier and converter are nil when the specification omits them
private def parse_column_format_spec(column)
  cell_util = org.apache.hadoop.hbase.CellUtil
  family_and_rest = cell_util.parseColumn(column.to_java_bytes)

  # A bare FAMILY carries neither a qualifier nor a converter.
  return ColumnFormatSpec.new(family_and_rest[0], nil, nil) unless family_and_rest.length > 1

  # The remainder is QUALIFIER[:CONVERTER]; split it the same way.
  qualifier_and_converter = cell_util.parseColumn(family_and_rest[1])
  converter_bytes = qualifier_and_converter.length > 1 ? qualifier_and_converter[1] : nil
  ColumnFormatSpec.new(family_and_rest[0], qualifier_and_converter[0], converter_bytes)
end
# Register the converter to use for one column in @converters.
#
# The entry is keyed by "family:qualifier", built from the unescaped family and
# qualifier, and holds the converter name as a String.
#
# @param family Java bytes of the column family
# @param qualifier Java bytes of the column qualifier
# @param converter Java bytes of the converter name
# @return [String] the converter name that was stored
private def set_column_converter(family, qualifier, converter)
  family_name = String.from_java_bytes(family)
  qualifier_name = String.from_java_bytes(qualifier)
  @converters["#{family_name}:#{qualifier_name}"] = String.from_java_bytes(converter)
end
# If the column spec contains CONVERTER information, strip the :CONVERTER info from the column pair.
# 1. return back normal column pair as usual, i.e., "cf:qualifier[:CONVERTER]" to "cf" and "qualifier" only
# 2. register the CONVERTER information based on column spec - "cf:qualifier"
#
# Deprecated for removal in 4.0.0
def set_converter(column)
family = String.from_java_bytes(column[0])
parts = org.apache.hadoop.hbase.CellUtil.parseColumn(column[1])
@ -817,6 +868,8 @@ EOF
column[1] = parts[0]
end
end
# Mix in RubyGems' deprecation helper and flag set_converter as deprecated.
# NOTE(review): Gem::Deprecate#deprecate is documented as
# deprecate(name, repl, year, month), so "4.0.0" appears to land in the
# replacement slot and year/month are nil here -- confirm the emitted warning
# reads as intended.
extend Gem::Deprecate
deprecate :set_converter, "4.0.0", nil, nil
#----------------------------------------------------------------------------------------------
# Get the split points for the table

View File

@ -239,6 +239,7 @@ module Hbase
@test_ts = 12345678
@test_table.put(1, "x:a", 1)
@test_table.put(1, "x:b", 2, @test_ts)
@test_table.put(1, "x:\x11", [921].pack("N"))
@test_table.put(2, "x:a", 11)
@test_table.put(2, "x:b", 12, @test_ts)
@ -333,9 +334,10 @@ module Hbase
end
define_test "get should work with hash columns spec and an array of strings COLUMN parameter" do
res = @test_table._get_internal('1', COLUMN => [ 'x:a', 'x:b' ])
res = @test_table._get_internal('1', COLUMN => [ "x:\x11", 'x:a', 'x:b' ])
assert_not_nil(res)
assert_kind_of(Hash, res)
assert_not_nil(res['x:\x11'])
assert_not_nil(res['x:a'])
assert_not_nil(res['x:b'])
end
@ -356,6 +358,18 @@ module Hbase
assert_not_nil(res['x:b'])
end
# A column whose qualifier is a non-printable byte must be retrievable both
# raw (value shown escaped) and with a converter attached (value shown decoded).
# In both cases the result is keyed by the escaped column name.
define_test "get should work with non-printable columns and values" do
  {
    "x:\x11" => /value=\\x00\\x00\\x03\\x99/,
    "x:\x11:toInt" => /value=921/
  }.each do |column_spec, expected_value|
    res = @test_table._get_internal('1', COLUMNS => [ column_spec ])
    assert_not_nil(res)
    assert_kind_of(Hash, res)
    assert_match(expected_value, res[ 'x:\x11' ])
  end
end
define_test "get should work with hash columns spec and TIMESTAMP only" do
res = @test_table._get_internal('1', TIMESTAMP => @test_ts)
assert_not_nil(res)
@ -412,10 +426,10 @@ module Hbase
assert_not_nil(res['x:b'])
end
define_test "get with a block should yield (column, value) pairs" do
define_test "get with a block should yield (formatted column, value) pairs" do
res = {}
@test_table._get_internal('1') { |col, val| res[col] = val }
assert_equal(res.keys.sort, [ 'x:a', 'x:b' ])
assert_equal([ 'x:\x11', 'x:a', 'x:b' ], res.keys.sort)
end
define_test "get should support COLUMNS with value CONVERTER information" do
@ -709,12 +723,14 @@ module Hbase
define_test "scan should support COLUMNS with value CONVERTER information" do
@test_table.put(1, "x:c", [1024].pack('N'))
@test_table.put(1, "x:d", [98].pack('N'))
@test_table.put(1, "x:\x11", [712].pack('N'))
begin
res = @test_table._scan_internal COLUMNS => ['x:c:toInt', 'x:d:c(org.apache.hadoop.hbase.util.Bytes).toInt']
res = @test_table._scan_internal COLUMNS => ['x:c:toInt', 'x:d:c(org.apache.hadoop.hbase.util.Bytes).toInt', "x:\x11:toInt"]
assert_not_nil(res)
assert_kind_of(Hash, res)
assert_not_nil(/value=1024/.match(res['1']['x:c']))
assert_not_nil(/value=98/.match(res['1']['x:d']))
assert_match(/value=1024/, res['1']['x:c'])
assert_match(/value=98/, res['1']['x:d'])
assert_match(/value=712/, res['1']['x:\x11'])
ensure
# clean up newly added columns for this test only.
@test_table.deleteall(1, 'x:c')