# HBase ruby classes. # Has wrapper classes for org.apache.hadoop.hbase.client.HBaseAdmin # and for org.apache.hadoop.hbase.client.HTable. Classes take # Formatters on construction and outputs any results using # Formatter methods. These classes are only really for use by # the hirb.rb HBase Shell script; they don't make much sense elsewhere. # For example, the exists method on Admin class prints to the formatter # whether the table exists and returns nil regardless. include Java include_class('java.lang.Integer') {|package,name| "J#{name}" } include_class('java.lang.Long') {|package,name| "J#{name}" } include_class('java.lang.Boolean') {|package,name| "J#{name}" } import org.apache.hadoop.hbase.KeyValue import org.apache.hadoop.hbase.client.HBaseAdmin import org.apache.hadoop.hbase.client.HTable import org.apache.hadoop.hbase.client.Get import org.apache.hadoop.hbase.client.Put import org.apache.hadoop.hbase.client.Scan import org.apache.hadoop.hbase.client.Delete import org.apache.hadoop.hbase.filter.FirstKeyOnlyFilter import org.apache.hadoop.hbase.HConstants import org.apache.hadoop.hbase.io.hfile.Compression import org.apache.hadoop.hbase.HBaseConfiguration import org.apache.hadoop.hbase.HColumnDescriptor import org.apache.hadoop.hbase.HTableDescriptor import org.apache.hadoop.hbase.util.Bytes import org.apache.hadoop.hbase.util.Writables import org.apache.hadoop.hbase.HRegionInfo import org.apache.zookeeper.ZooKeeper import org.apache.zookeeper.ZooKeeperMain module HBase COLUMN = "COLUMN" COLUMNS = "COLUMNS" TIMESTAMP = "TIMESTAMP" NAME = HConstants::NAME VERSIONS = HConstants::VERSIONS IN_MEMORY = HConstants::IN_MEMORY STOPROW = "STOPROW" STARTROW = "STARTROW" ENDROW = STOPROW LIMIT = "LIMIT" METHOD = "METHOD" MAXLENGTH = "MAXLENGTH" CACHE_BLOCKS = "CACHE_BLOCKS" # Wrapper for org.apache.hadoop.hbase.client.HBaseAdmin class Admin def initialize(configuration, formatter) @admin = HBaseAdmin.new(configuration) connection = @admin.getConnection() @zkWrapper = connection.getZooKeeperWrapper() zk = @zkWrapper.getZooKeeper() @zkMain = ZooKeeperMain.new(zk) @formatter = formatter end def list now = Time.now @formatter.header() for t in @admin.listTables() @formatter.row([t.getNameAsString()]) end @formatter.footer(now) end def describe(tableName) now = Time.now @formatter.header(["DESCRIPTION", "ENABLED"], [64]) found = false tables = @admin.listTables().to_a tables.push(HTableDescriptor::META_TABLEDESC, HTableDescriptor::ROOT_TABLEDESC) for t in tables if t.getNameAsString() == tableName @formatter.row([t.to_s, "%s" % [@admin.isTableEnabled(tableName)]], true, [64]) found = true end end if not found raise ArgumentError.new("Failed to find table named " + tableName) end @formatter.footer(now) end def exists(tableName) now = Time.now @formatter.header() e = @admin.tableExists(tableName) @formatter.row([e.to_s]) @formatter.footer(now) end def flush(tableNameOrRegionName) now = Time.now @formatter.header() @admin.flush(tableNameOrRegionName) @formatter.footer(now) end def compact(tableNameOrRegionName) now = Time.now @formatter.header() @admin.compact(tableNameOrRegionName) @formatter.footer(now) end def major_compact(tableNameOrRegionName) now = Time.now @formatter.header() @admin.majorCompact(tableNameOrRegionName) @formatter.footer(now) end def split(tableNameOrRegionName) now = Time.now @formatter.header() @admin.split(tableNameOrRegionName) @formatter.footer(now) end def enable(tableName) # TODO: Need an isEnabled method now = Time.now @admin.enableTable(tableName) @formatter.header() @formatter.footer(now) end def disable(tableName) # TODO: Need an isDisabled method now = Time.now @admin.disableTable(tableName) @formatter.header() @formatter.footer(now) end def enable_region(regionName) online(regionName, false) end def disable_region(regionName) online(regionName, true) end def online(regionName, onOrOff) now = Time.now meta = HTable.new(HConstants::META_TABLE_NAME) bytes = Bytes.toBytes(regionName) g = Get.new(bytes) g.addColumn(HConstants::CATALOG_FAMILY, HConstants::REGIONINFO_QUALIFIER) hriBytes = meta.get(g).value() hri = Writables.getWritable(hriBytes, HRegionInfo.new()); hri.setOffline(onOrOff) put = Put.new(bytes) put.add(HConstants::CATALOG_FAMILY, HConstants::REGIONINFO_QUALIFIER, Writables.getBytes(hri)) meta.put(put); @formatter.header() @formatter.footer(now) end def drop(tableName) now = Time.now @formatter.header() if @admin.isTableEnabled(tableName) raise IOError.new("Table " + tableName + " is enabled. Disable it first") else @admin.deleteTable(tableName) flush(HConstants::META_TABLE_NAME); major_compact(HConstants::META_TABLE_NAME); end @formatter.footer(now) end def truncate(tableName) now = Time.now @formatter.header() hTable = HTable.new(tableName) tableDescription = hTable.getTableDescriptor() puts 'Truncating ' + tableName + '; it may take a while' puts 'Disabling table...' disable(tableName) puts 'Dropping table...' drop(tableName) puts 'Creating table...' @admin.createTable(tableDescription) @formatter.footer(now) end # Pass tablename and an array of Hashes def create(tableName, args) now = Time.now # Pass table name and an array of Hashes. Later, test the last # array to see if its table options rather than column family spec. raise TypeError.new("Table name must be of type String") \ unless tableName.instance_of? String # For now presume all the rest of the args are column family # hash specifications. TODO: Add table options handling. htd = HTableDescriptor.new(tableName) for arg in args if arg.instance_of? String htd.addFamily(HColumnDescriptor.new(arg)) else raise TypeError.new(arg.class.to_s + " of " + arg.to_s + " is not of Hash type") \ unless arg.instance_of? Hash htd.addFamily(hcd(arg)) end end @admin.createTable(htd) @formatter.header() @formatter.footer(now) end def alter(tableName, args) now = Time.now raise TypeError.new("Table name must be of type String") \ unless tableName.instance_of? String htd = @admin.getTableDescriptor(tableName.to_java_bytes) method = args.delete(METHOD) if method == "delete" @admin.deleteColumn(tableName, args[NAME]) elsif method == "table_att" args[MAX_FILESIZE]? htd.setMaxFileSize(JLong.valueOf(args[MAX_FILESIZE])) : htd.setMaxFileSize(HTableDescriptor::DEFAULT_MAX_FILESIZE); args[READONLY]? htd.setReadOnly(JBoolean.valueOf(args[READONLY])) : htd.setReadOnly(HTableDescriptor::DEFAULT_READONLY); args[MEMSTORE_FLUSHSIZE]? htd.setMemStoreFlushSize(JLong.valueOf(args[MEMSTORE_FLUSHSIZE])) : htd.setMemStoreFlushSize(HTableDescriptor::DEFAULT_MEMSTORE_FLUSH_SIZE); @admin.modifyTable(tableName.to_java_bytes, htd) else descriptor = hcd(args) if (htd.hasFamily(descriptor.getNameAsString().to_java_bytes)) @admin.modifyColumn(tableName, descriptor.getNameAsString(), descriptor); else @admin.addColumn(tableName, descriptor); end end @formatter.header() @formatter.footer(now) end def close_region(regionName, server) now = Time.now s = nil s = [server].to_java if server @admin.closeRegion(regionName, s) @formatter.header() @formatter.footer(now) end def shutdown() @admin.shutdown() end def status(format) status = @admin.getClusterStatus() if format != nil and format == "detailed" puts("version %s" % [ status.getHBaseVersion() ]) # Put regions in transition first because usually empty puts("%d regionsInTransition" % status.getRegionsInTransition().size()) for k, v in status.getRegionsInTransition() puts(" %s" % [v]) end puts("%d live servers" % [ status.getServers() ]) for server in status.getServerInfo() puts(" %s:%d %d" % \ [ server.getServerAddress().getHostname(), \ server.getServerAddress().getPort(), server.getStartCode() ]) puts(" %s" % [ server.getLoad().toString() ]) for region in server.getLoad().getRegionsLoad() puts(" %s" % [ region.getNameAsString() ]) puts(" %s" % [ region.toString() ]) end end puts("%d dead servers" % [ status.getDeadServers() ]) for server in status.getDeadServerNames() puts(" %s" % [ server ]) end elsif format != nil and format == "simple" puts("%d live servers" % [ status.getServers() ]) for server in status.getServerInfo() puts(" %s:%d %d" % \ [ server.getServerAddress().getHostname(), \ server.getServerAddress().getPort(), server.getStartCode() ]) puts(" %s" % [ server.getLoad().toString() ]) end puts("%d dead servers" % [ status.getDeadServers() ]) for server in status.getDeadServerNames() puts(" %s" % [ server ]) end else puts("%d servers, %d dead, %.4f average load" % \ [ status.getServers(), status.getDeadServers(), \ status.getAverageLoad()]) end end def hcd(arg) # Return a new HColumnDescriptor made of passed args # TODO: This is brittle code. # Here is current HCD constructor: # public HColumnDescriptor(final byte [] familyName, final int maxVersions, # final String compression, final boolean inMemory, # final boolean blockCacheEnabled, final int blocksize, # final int maxValueLength, # final int timeToLive, final boolean bloomFilter) { name = arg[NAME] raise ArgumentError.new("Column family " + arg + " must have a name") \ unless name # TODO: What encoding are Strings in jruby? return HColumnDescriptor.new(name.to_java_bytes, # JRuby uses longs for ints. Need to convert. Also constants are String arg[VERSIONS]? JInteger.new(arg[VERSIONS]): HColumnDescriptor::DEFAULT_VERSIONS, arg[HColumnDescriptor::COMPRESSION]? arg[HColumnDescriptor::COMPRESSION]: HColumnDescriptor::DEFAULT_COMPRESSION, arg[IN_MEMORY]? JBoolean.valueOf(arg[IN_MEMORY]): HColumnDescriptor::DEFAULT_IN_MEMORY, arg[HColumnDescriptor::BLOCKCACHE]? JBoolean.valueOf(arg[HColumnDescriptor::BLOCKCACHE]): HColumnDescriptor::DEFAULT_BLOCKCACHE, arg[HColumnDescriptor::BLOCKSIZE]? JInteger.valueOf(arg[HColumnDescriptor::BLOCKSIZE]): HColumnDescriptor::DEFAULT_BLOCKSIZE, arg[HColumnDescriptor::TTL]? JInteger.new(arg[HColumnDescriptor::TTL]): HColumnDescriptor::DEFAULT_TTL, arg[HColumnDescriptor::BLOOMFILTER]? JBoolean.valueOf(arg[HColumnDescriptor::BLOOMFILTER]): HColumnDescriptor::DEFAULT_BLOOMFILTER) end def zk(args) line = args.join(' ') line = 'help' if line.empty? @zkMain.executeLine(line) end def zk_dump puts @zkWrapper.dump end end # Wrapper for org.apache.hadoop.hbase.client.HTable class Table def initialize(configuration, tableName, formatter) @table = HTable.new(configuration, tableName) @formatter = formatter end # Delete a cell def delete(row, column, timestamp = HConstants::LATEST_TIMESTAMP) now = Time.now d = Delete.new(row.to_java_bytes, timestamp, nil) split = KeyValue.parseColumn(column.to_java_bytes) d.deleteColumn(split[0], split.length > 1 ? split[1] : nil, timestamp) @table.delete(d) @formatter.header() @formatter.footer(now) end def deleteall(row, column = nil, timestamp = HConstants::LATEST_TIMESTAMP) now = Time.now d = Delete.new(row.to_java_bytes, timestamp, nil) if column != nil split = KeyValue.parseColumn(column.to_java_bytes) d.deleteColumns(split[0], split.length > 1 ? split[1] : nil, timestamp) end @table.delete(d) @formatter.header() @formatter.footer(now) end def getAllColumns htd = @table.getTableDescriptor() result = [] for f in htd.getFamilies() n = f.getNameAsString() n << ':' result << n end result end def scan(args = {}) now = Time.now limit = -1 maxlength = -1 if args != nil and args.length > 0 limit = args["LIMIT"] || -1 maxlength = args["MAXLENGTH"] || -1 filter = args["FILTER"] || nil startrow = args["STARTROW"] || "" stoprow = args["STOPROW"] || nil timestamp = args["TIMESTAMP"] || nil columns = args["COLUMNS"] || getAllColumns() cache = args["CACHE_BLOCKS"] || true versions = args["VERSIONS"] || 1 if columns.class == String columns = [columns] elsif columns.class != Array raise ArgumentError.new("COLUMNS must be specified as a String or an Array") end if stoprow scan = Scan.new(startrow.to_java_bytes, stoprow.to_java_bytes) else scan = Scan.new(startrow.to_java_bytes) end for c in columns split = KeyValue.parseColumn(c.to_java_bytes) if split.length > 1 scan.addColumn(split[0], split[1]) else scan.addFamily(split[0]) end end if filter != nil scan.setFilter(filter) end if timestamp != nil scan.setTimeStamp(timestamp) end scan.setCacheBlocks(cache) scan.setMaxVersions(versions) if versions > 1 else scan = Scan.new() end s = @table.getScanner(scan) count = 0 @formatter.header(["ROW", "COLUMN+CELL"]) i = s.iterator() while i.hasNext() r = i.next() row = String.from_java_bytes r.getRow() count += 1 if limit != -1 and count >= limit break end for kv in r.list family = String.from_java_bytes kv.getFamily() qualifier = String.from_java_bytes kv.getQualifier() column = family + ':' + qualifier cell = toString(column, kv, maxlength) @formatter.row([row, "column=%s, %s" % [column, cell]]) end end @formatter.footer(now, count) end def put(row, column, value, timestamp = nil) now = Time.now p = nil if timestamp p = Put.new(row.to_java_bytes, timestamp) else p = Put.new(row.to_java_bytes) end split = KeyValue.parseColumn(column.to_java_bytes) if split.length > 1 p.add(split[0], split[1], value.to_java_bytes) else p.add(split[0], nil, value.to_java_bytes) end @table.put(p) @formatter.header() @formatter.footer(now) end def isMetaTable() tn = @table.getTableName() return Bytes.equals(tn, HConstants::META_TABLE_NAME) || Bytes.equals(tn, HConstants::ROOT_TABLE_NAME) end # Make a String of the passed kv # Intercept cells whose format we know such as the info:regioninfo in .META. def toString(column, kv, maxlength) if isMetaTable() if column == 'info:regioninfo' hri = Writables.getHRegionInfoOrNull(kv.getValue()) return "timestamp=%d, value=%s" % [kv.getTimestamp(), hri.toString()] elsif column == 'info:serverstartcode' return "timestamp=%d, value=%s" % [kv.getTimestamp(), \ Bytes.toLong(kv.getValue())] end end val = "timestamp=" + kv.getTimestamp().to_s + ", value=" + Bytes.toStringBinary(kv.getValue()) maxlength != -1 ? val[0, maxlength] : val end # Get from table def get(row, args = {}) now = Time.now result = nil if args == nil or args.length == 0 or (args.length == 1 and args[MAXLENGTH] != nil) get = Get.new(row.to_java_bytes) else # Its a hash. columns = args[COLUMN] if columns == nil # Maybe they used the COLUMNS key columns = args[COLUMNS] end if columns == nil # May have passed TIMESTAMP and row only; wants all columns from ts. ts = args[TIMESTAMP] if not ts raise ArgumentError.new("Failed parse of " + args + ", " + args.class) end get = Get.new(row.to_java_bytes, ts) else get = Get.new(row.to_java_bytes) # Columns are non-nil if columns.class == String # Single column split = KeyValue.parseColumn(columns.to_java_bytes) if (split.length > 1) get.addColumn(split[0], split[1]) else get.addFamily(split[0]) end elsif columns.class == Array for column in columns split = KeyValue.parseColumn(columns.to_java_bytes) if (split.length > 1) get.addColumn(split[0], split[1]) else get.addFamily(split[0]) end end else raise ArgumentError.new("Failed parse column argument type " + args + ", " + args.class) end get.setMaxVersions(args[VERSIONS] ? args[VERSIONS] : 1) if args[TIMESTAMP] get.setTimeStamp(args[TIMESTAMP]) end end end result = @table.get(get) # Print out results. Result can be Cell or RowResult. maxlength = args[MAXLENGTH] || -1 @formatter.header(["COLUMN", "CELL"]) if !result.isEmpty() for kv in result.list() family = String.from_java_bytes kv.getFamily() qualifier = String.from_java_bytes kv.getQualifier() column = family + ':' + qualifier @formatter.row([column, toString(column, kv, maxlength)]) end end @formatter.footer(now) end def count(interval = 1000) now = Time.now scan = Scan.new() scan.setCacheBlocks(false) # We can safely set scanner caching with the first key only filter scan.setCaching(10) scan.setFilter(FirstKeyOnlyFilter.new()) s = @table.getScanner(scan) count = 0 i = s.iterator() @formatter.header() while i.hasNext() r = i.next() count += 1 if count % interval == 0 @formatter.row(["Current count: " + count.to_s + ", row: " + \ (String.from_java_bytes r.getRow())]) end end @formatter.footer(now, count) end end # Testing. To run this test, there needs to be an hbase cluster up and # running. Then do: ${HBASE_HOME}/bin/hbase org.jruby.Main bin/HBase.rb if $0 == __FILE__ # Add this directory to LOAD_PATH; presumption is that Formatter module # sits beside this one. Then load it up. $LOAD_PATH.unshift File.dirname($PROGRAM_NAME) require 'Formatter' # Make a console formatter formatter = Formatter::Console.new(STDOUT) # Now add in java and hbase classes configuration = HBaseConfiguration.new() admin = Admin.new(configuration, formatter) # Drop old table. If it does not exist, get an exception. Catch and # continue TESTTABLE = "HBase_rb_testtable" begin admin.disable(TESTTABLE) admin.drop(TESTTABLE) rescue org.apache.hadoop.hbase.TableNotFoundException # Just suppress not found exception end admin.create(TESTTABLE, [{NAME => 'x', VERSIONS => 5}]) # Presume it exists. If it doesn't, next items will fail. table = Table.new(configuration, TESTTABLE, formatter) for i in 1..10 table.put('x%d' % i, 'x:%d' % i, 'x%d' % i) end table.get('x1', {COLUMNS => 'x:1'}) if formatter.rowCount() != 1 raise IOError.new("Failed first put") end table.scan({COLUMNS => ['x:']}) if formatter.rowCount() != 10 raise IOError.new("Failed scan of expected 10 rows") end # Verify that limit works. table.scan({COLUMNS => ['x:'], LIMIT => 4}) if formatter.rowCount() != 3 raise IOError.new("Failed scan of expected 3 rows") end # Should only be two rows if we start at 8 (Row x10 sorts beside x1). table.scan({COLUMNS => ['x:'], STARTROW => 'x8', LIMIT => 3}) if formatter.rowCount() != 2 raise IOError.new("Failed scan of expected 2 rows") end # Scan between two rows table.scan({COLUMNS => ['x:'], STARTROW => 'x5', ENDROW => 'x8'}) if formatter.rowCount() != 3 raise IOError.new("Failed endrow test") end # Verify that delete works table.delete('x1', 'x:1'); table.scan({COLUMNS => ['x:1']}) scan1 = formatter.rowCount() table.scan({COLUMNS => ['x:']}) scan2 = formatter.rowCount() if scan1 != 0 or scan2 != 9 raise IOError.new("Failed delete test") end # Verify that deletall works table.put('x2', 'x:1', 'x:1') table.deleteall('x2') table.scan({COLUMNS => ['x:2']}) scan1 = formatter.rowCount() table.scan({COLUMNS => ['x:']}) scan2 = formatter.rowCount() if scan1 != 0 or scan2 != 8 raise IOError.new("Failed deleteall test") end admin.disable(TESTTABLE) admin.drop(TESTTABLE) end end