From f0731894479d90deace11d4560e2218776a8b2ac Mon Sep 17 00:00:00 2001 From: Michael Stack Date: Wed, 25 Feb 2009 05:59:26 +0000 Subject: [PATCH] HBASE-61 Create an HBase-specific MapFile implementation git-svn-id: https://svn.apache.org/repos/asf/hadoop/hbase/trunk@747672 13f79535-47bb-0310-9956-ffa450edef68 --- conf/hbase-default.xml | 16 + .../hadoop/hbase/io/HalfHFileReader.java | 198 ++ .../hadoop/hbase/io/hfile/BlockCache.java | 42 + .../io/hfile/BoundedRangeFileInputStream.java | 143 ++ .../hadoop/hbase/io/hfile/Compression.java | 324 +++ .../apache/hadoop/hbase/io/hfile/HFile.java | 1423 +++++++++++ .../hadoop/hbase/io/hfile/HFileScanner.java | 112 + .../hbase/io/hfile/SimpleBlockCache.java | 56 + .../apache/hadoop/hbase/io/hfile/package.html | 25 + .../hadoop/hbase/regionserver/HStore.java | 2207 ----------------- .../hadoop/hbase/regionserver/HStoreFile.java | 558 ----- .../hadoop/hbase/regionserver/Store.java | 1807 ++++++++++++++ .../hadoop/hbase/regionserver/StoreFile.java | 443 ++++ .../{HStoreScanner.java => StoreScanner.java} | 36 +- .../hbase/HFilePerformanceEvaluation.java | 365 +++ .../hbase/PerformanceEvaluationCommons.java | 78 + .../apache/hadoop/hbase/TestHStoreKey.java | 128 + .../hadoop/hbase/io/hfile/KVGenerator.java | 111 + .../hadoop/hbase/io/hfile/KeySampler.java | 64 + .../hadoop/hbase/io/hfile/NanoTimer.java | 198 ++ .../hbase/io/hfile/RandomDistribution.java | 271 ++ .../hadoop/hbase/io/hfile/RandomSeek.java | 124 + .../hadoop/hbase/io/hfile/TestHFile.java | 248 ++ .../hbase/io/hfile/TestHFilePerformance.java | 387 +++ .../hadoop/hbase/io/hfile/TestHFileSeek.java | 510 ++++ .../hadoop/hbase/io/hfile/TestSeekTo.java | 146 ++ .../hbase/regionserver/TestHStoreFile.java | 388 --- .../hbase/regionserver/TestStoreFile.java | 295 +++ 28 files changed, 7528 insertions(+), 3175 deletions(-) create mode 100644 src/java/org/apache/hadoop/hbase/io/HalfHFileReader.java create mode 100644 src/java/org/apache/hadoop/hbase/io/hfile/BlockCache.java create mode 100644 src/java/org/apache/hadoop/hbase/io/hfile/BoundedRangeFileInputStream.java create mode 100644 src/java/org/apache/hadoop/hbase/io/hfile/Compression.java create mode 100644 src/java/org/apache/hadoop/hbase/io/hfile/HFile.java create mode 100644 src/java/org/apache/hadoop/hbase/io/hfile/HFileScanner.java create mode 100644 src/java/org/apache/hadoop/hbase/io/hfile/SimpleBlockCache.java create mode 100644 src/java/org/apache/hadoop/hbase/io/hfile/package.html delete mode 100644 src/java/org/apache/hadoop/hbase/regionserver/HStore.java delete mode 100644 src/java/org/apache/hadoop/hbase/regionserver/HStoreFile.java create mode 100644 src/java/org/apache/hadoop/hbase/regionserver/Store.java create mode 100644 src/java/org/apache/hadoop/hbase/regionserver/StoreFile.java rename src/java/org/apache/hadoop/hbase/regionserver/{HStoreScanner.java => StoreScanner.java} (91%) create mode 100644 src/test/org/apache/hadoop/hbase/HFilePerformanceEvaluation.java create mode 100644 src/test/org/apache/hadoop/hbase/PerformanceEvaluationCommons.java create mode 100644 src/test/org/apache/hadoop/hbase/TestHStoreKey.java create mode 100644 src/test/org/apache/hadoop/hbase/io/hfile/KVGenerator.java create mode 100644 src/test/org/apache/hadoop/hbase/io/hfile/KeySampler.java create mode 100644 src/test/org/apache/hadoop/hbase/io/hfile/NanoTimer.java create mode 100644 src/test/org/apache/hadoop/hbase/io/hfile/RandomDistribution.java create mode 100644 src/test/org/apache/hadoop/hbase/io/hfile/RandomSeek.java create mode 100644 
src/test/org/apache/hadoop/hbase/io/hfile/TestHFile.java create mode 100644 src/test/org/apache/hadoop/hbase/io/hfile/TestHFilePerformance.java create mode 100644 src/test/org/apache/hadoop/hbase/io/hfile/TestHFileSeek.java create mode 100644 src/test/org/apache/hadoop/hbase/io/hfile/TestSeekTo.java delete mode 100644 src/test/org/apache/hadoop/hbase/regionserver/TestHStoreFile.java create mode 100644 src/test/org/apache/hadoop/hbase/regionserver/TestStoreFile.java diff --git a/conf/hbase-default.xml b/conf/hbase-default.xml index cbc51518b05..30b4f422ffa 100644 --- a/conf/hbase-default.xml +++ b/conf/hbase-default.xml @@ -323,6 +323,22 @@ if you have small cells and want faster random-access of individual cells. + + hfile.min.blocksize.size + 65536 + Minimum store file block size. The smaller you make this, the + bigger your index and the less you fetch on a random-access. Set size down + if you have small cells and want faster random-access of individual cells. + + + + hfile.min.blocksize.size + 65536 + Minimum store file block size. The smaller you make this, the + bigger your index and the less you fetch on a random-access. Set size down + if you have small cells and want faster random-access of individual cells. + + hbase.io.seqfile.compression.type NONE diff --git a/src/java/org/apache/hadoop/hbase/io/HalfHFileReader.java b/src/java/org/apache/hadoop/hbase/io/HalfHFileReader.java new file mode 100644 index 00000000000..7794cfa51a9 --- /dev/null +++ b/src/java/org/apache/hadoop/hbase/io/HalfHFileReader.java @@ -0,0 +1,198 @@ +/** + * Copyright 2008 The Apache Software Foundation + * + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.hadoop.hbase.io; + +import java.io.IOException; +import java.nio.ByteBuffer; + +import org.apache.commons.logging.Log; +import org.apache.commons.logging.LogFactory; +import org.apache.hadoop.fs.FileSystem; +import org.apache.hadoop.fs.Path; +import org.apache.hadoop.hbase.io.hfile.BlockCache; +import org.apache.hadoop.hbase.io.hfile.HFile; +import org.apache.hadoop.hbase.io.hfile.HFileScanner; +import org.apache.hadoop.hbase.io.hfile.HFile.Reader; +import org.apache.hadoop.hbase.util.Bytes; + +/** + * A facade for a {@link org.apache.hadoop.io.hfile.HFile.Reader} that serves up + * either the top or bottom half of a HFile where 'bottom' is the first half + * of the file containing the keys that sort lowest and 'top' is the second half + * of the file with keys that sort greater than those of the bottom half. + * The top includes the split files midkey, of the key that follows if it does + * not exist in the file. + * + *

This type works in tandem with the {@link Reference} type. This class + * is used when reading, while Reference is used when writing. + * + *

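A minimal usage sketch (an editor's illustration, not code from this patch; the Hadoop Configuration, the store file path, and the Reference instance 'r' are assumed to exist) showing how a reader over one half of a store file is obtained and scanned with the HFileScanner it returns:

  // Open the half of the store file described by Reference 'r'; whether it is
  // the top or bottom half is carried by r.getFileRegion().
  FileSystem fs = FileSystem.get(conf);                  // conf: assumed Hadoop Configuration
  Path p = new Path("/hbase/table/region/family/file");  // hypothetical store file path
  HalfHFileReader half = new HalfHFileReader(fs, p, null, r); // null = no block cache
  half.loadFileInfo();                                    // required before reading, as with HFile.Reader
  HFileScanner scanner = half.getScanner();
  if (scanner.seekTo()) {                                 // position at the first key of this half
    do {
      ByteBuffer key = scanner.getKey();
      ByteBuffer value = scanner.getValue();
      // ... use key/value ...
    } while (scanner.next());
  }
  half.close();
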
This file is not splitable. Calls to {@link #midKey()} return null. + */ +public class HalfHFileReader extends HFile.Reader { + static final Log LOG = LogFactory.getLog(HalfHFileReader.class); + private final boolean top; + // This is the key we split around. Its the first possible entry on a row: + // i.e. empty column and a timestamp of LATEST_TIMESTAMP. + private final byte [] splitkey; + + /** + * @param fs + * @param f + * @param c + * @param r + * @throws IOException + */ + public HalfHFileReader(final FileSystem fs, final Path p, final BlockCache c, + final Reference r) + throws IOException { + super(fs, p, c); + // This is not actual midkey for this half-file; its just border + // around which we split top and bottom. Have to look in files to find + // actual last and first keys for bottom and top halves. Half-files don't + // have an actual midkey themselves. No midkey is how we indicate file is + // not splittable. + this.splitkey = r.getSplitKey(); + // Is it top or bottom half? + this.top = Reference.isTopFileRegion(r.getFileRegion()); + } + + public HFileScanner getScanner() { + final HFileScanner s = super.getScanner(); + return new HFileScanner() { + final HFileScanner delegate = s; + + public ByteBuffer getKey() { + return delegate.getKey(); + } + + public String getKeyString() { + return delegate.getKeyString(); + } + + public ByteBuffer getValue() { + return delegate.getValue(); + } + + public String getValueString() { + return delegate.getValueString(); + } + + public boolean next() throws IOException { + boolean b = delegate.next(); + if (!b) { + return b; + } + if (!top) { + ByteBuffer bb = getKey(); + if (getComparator().compare(bb.array(), bb.arrayOffset(), bb.limit(), + splitkey, 0, splitkey.length) >= 0) { + return false; + } + } + return true; + } + + public boolean seekBefore(byte[] key) throws IOException { + if (top) { + if (getComparator().compare(key, splitkey) < 0) { + return false; + } + } else { + if (getComparator().compare(key, splitkey) >= 0) { + return seekBefore(splitkey); + } + } + return this.delegate.seekBefore(key); + } + + public boolean seekTo() throws IOException { + if (top) { + int r = this.delegate.seekTo(splitkey); + if (r < 0) { + // midkey is < first key in file + return this.delegate.seekTo(); + } + if (r > 0) { + return this.delegate.next(); + } + return true; + } + + boolean b = delegate.seekTo(); + if (!b) { + return b; + } + // Check key. + ByteBuffer k = this.delegate.getKey(); + return this.delegate.getReader().getComparator(). + compare(k.array(), k.arrayOffset(), k.limit(), + splitkey, 0, splitkey.length) < 0; + } + + public int seekTo(byte[] key) throws IOException { + if (top) { + if (getComparator().compare(key, splitkey) < 0) { + return -1; + } + } else { + if (getComparator().compare(key, splitkey) >= 0) { + // we would place the scanner in the second half. + // it might be an error to return false here ever... 
+ boolean res = delegate.seekBefore(splitkey); + if (!res) { + throw new IOException("Seeking for a key in bottom of file, but key exists in top of file, failed on seekBefore(midkey)"); + } + return 1; + } + } + return delegate.seekTo(key); + } + + public Reader getReader() { + return this.delegate.getReader(); + } + + public boolean isSeeked() { + return this.delegate.isSeeked(); + } + }; + } + + public byte[] getLastKey() { + if (top) { + return super.getLastKey(); + } else { + HFileScanner scanner = getScanner(); + try { + if (scanner.seekBefore(this.splitkey)) { + return Bytes.toBytes(scanner.getKey()); + } + } catch (IOException e) { + LOG.warn("Failed seekBefore " + Bytes.toString(this.splitkey), e); + } + return null; + } + } + + public byte[] midkey() throws IOException { + // Returns null to indicate file is not splitable. + return null; + } +} \ No newline at end of file diff --git a/src/java/org/apache/hadoop/hbase/io/hfile/BlockCache.java b/src/java/org/apache/hadoop/hbase/io/hfile/BlockCache.java new file mode 100644 index 00000000000..91cfaf6613f --- /dev/null +++ b/src/java/org/apache/hadoop/hbase/io/hfile/BlockCache.java @@ -0,0 +1,42 @@ +/** + * Copyright 2009 The Apache Software Foundation + * + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.hadoop.hbase.io.hfile; + +import java.nio.ByteBuffer; + +/** + * Block cache interface. + * TODO: Add filename or hash of filename to block cache key. + */ +public interface BlockCache { + /** + * Add block to cache. + * @param blockNumber Zero-based file block number. + * @param buf The block contents wrapped in a ByteBuffer. + */ + public void cacheBlock(String blockName, ByteBuffer buf); + + /** + * Fetch block from cache. + * @param blockNumber Block number to fetch. + * @return Block or null if block is not in the cache. + */ + public ByteBuffer getBlock(String blockName); +} \ No newline at end of file diff --git a/src/java/org/apache/hadoop/hbase/io/hfile/BoundedRangeFileInputStream.java b/src/java/org/apache/hadoop/hbase/io/hfile/BoundedRangeFileInputStream.java new file mode 100644 index 00000000000..ae7734a941b --- /dev/null +++ b/src/java/org/apache/hadoop/hbase/io/hfile/BoundedRangeFileInputStream.java @@ -0,0 +1,143 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with this + * work for additional information regarding copyright ownership. The ASF + * licenses this file to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the + * License for the specific language governing permissions and limitations under + * the License. + */ + +package org.apache.hadoop.hbase.io.hfile; + +import java.io.IOException; +import java.io.InputStream; + +import org.apache.hadoop.fs.FSDataInputStream; + +/** + * BoundedRangeFIleInputStream abstracts a contiguous region of a Hadoop + * FSDataInputStream as a regular input stream. One can create multiple + * BoundedRangeFileInputStream on top of the same FSDataInputStream and they + * would not interfere with each other. + * Copied from hadoop-335 tfile. + */ +class BoundedRangeFileInputStream extends InputStream { + + private FSDataInputStream in; + private long pos; + private long end; + private long mark; + private final byte[] oneByte = new byte[1]; + + /** + * Constructor + * + * @param in + * The FSDataInputStream we connect to. + * @param offset + * Beginning offset of the region. + * @param length + * Length of the region. + * + * The actual length of the region may be smaller if (off_begin + + * length) goes beyond the end of FS input stream. + */ + public BoundedRangeFileInputStream(FSDataInputStream in, long offset, + long length) { + if (offset < 0 || length < 0) { + throw new IndexOutOfBoundsException("Invalid offset/length: " + offset + + "/" + length); + } + + this.in = in; + this.pos = offset; + this.end = offset + length; + this.mark = -1; + } + + @Override + public int available() throws IOException { + int avail = in.available(); + if (pos + avail > end) { + avail = (int) (end - pos); + } + + return avail; + } + + @Override + public int read() throws IOException { + int ret = read(oneByte); + if (ret == 1) return oneByte[0] & 0xff; + return -1; + } + + @Override + public int read(byte[] b) throws IOException { + return read(b, 0, b.length); + } + + @Override + public int read(byte[] b, int off, int len) throws IOException { + if ((off | len | (off + len) | (b.length - (off + len))) < 0) { + throw new IndexOutOfBoundsException(); + } + + int n = (int) Math.min(Integer.MAX_VALUE, Math.min(len, (end - pos))); + if (n == 0) return -1; + int ret = 0; + synchronized (in) { + in.seek(pos); + ret = in.read(b, off, n); + } + // / ret = in.read(pos, b, off, n); + if (ret < 0) { + end = pos; + return -1; + } + pos += ret; + return ret; + } + + @Override + /* + * We may skip beyond the end of the file. + */ + public long skip(long n) throws IOException { + long len = Math.min(n, end - pos); + pos += len; + return len; + } + + @Override + public void mark(int readlimit) { + mark = pos; + } + + @Override + public void reset() throws IOException { + if (mark < 0) throw new IOException("Resetting to invalid mark"); + pos = mark; + } + + @Override + public boolean markSupported() { + return true; + } + + @Override + public void close() { + // Invalidate the state of the stream. 
+ in = null; + pos = end; + mark = -1; + } +} \ No newline at end of file diff --git a/src/java/org/apache/hadoop/hbase/io/hfile/Compression.java b/src/java/org/apache/hadoop/hbase/io/hfile/Compression.java new file mode 100644 index 00000000000..249bc425913 --- /dev/null +++ b/src/java/org/apache/hadoop/hbase/io/hfile/Compression.java @@ -0,0 +1,324 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with this + * work for additional information regarding copyright ownership. The ASF + * licenses this file to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the + * License for the specific language governing permissions and limitations under + * the License. + */ +package org.apache.hadoop.hbase.io.hfile; + +import java.io.BufferedInputStream; +import java.io.BufferedOutputStream; +import java.io.FilterOutputStream; +import java.io.IOException; +import java.io.InputStream; +import java.io.OutputStream; + +import org.apache.commons.logging.Log; +import org.apache.commons.logging.LogFactory; +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.io.compress.CodecPool; +import org.apache.hadoop.io.compress.CompressionCodec; +import org.apache.hadoop.io.compress.CompressionInputStream; +import org.apache.hadoop.io.compress.CompressionOutputStream; +import org.apache.hadoop.io.compress.Compressor; +import org.apache.hadoop.io.compress.Decompressor; +import org.apache.hadoop.io.compress.GzipCodec; +import org.apache.hadoop.io.compress.LzoCodec; + +/** + * Compression related stuff. + * Copied from hadoop-3315 tfile. + */ +public final class Compression { + static final Log LOG = LogFactory.getLog(Compression.class); + + /** + * Prevent the instantiation of class. + */ + private Compression() { + super(); + } + + static class FinishOnFlushCompressionStream extends FilterOutputStream { + public FinishOnFlushCompressionStream(CompressionOutputStream cout) { + super(cout); + } + + @Override + public void write(byte b[], int off, int len) throws IOException { + out.write(b, off, len); + } + + @Override + public void flush() throws IOException { + CompressionOutputStream cout = (CompressionOutputStream) out; + cout.finish(); + cout.flush(); + cout.resetState(); + } + } + + /** + * Compression algorithms. 
+ */ + public static enum Algorithm { + LZO("lzo") { + private LzoCodec codec; + + @Override + CompressionCodec getCodec() { + if (codec == null) { + Configuration conf = new Configuration(); + conf.setBoolean("hadoop.native.lib", true); + codec = new LzoCodec(); + codec.setConf(conf); + } + + return codec; + } + + @Override + public synchronized InputStream createDecompressionStream( + InputStream downStream, Decompressor decompressor, + int downStreamBufferSize) throws IOException { + InputStream bis1 = null; + if (downStreamBufferSize > 0) { + bis1 = new BufferedInputStream(downStream, downStreamBufferSize); + } + else { + bis1 = downStream; + } + codec.getConf() + .setInt("io.compression.codec.lzo.buffersize", 64 * 1024); + CompressionInputStream cis = + codec.createInputStream(bis1, decompressor); + BufferedInputStream bis2 = new BufferedInputStream(cis, DATA_IBUF_SIZE); + return bis2; + } + + @Override + public synchronized OutputStream createCompressionStream( + OutputStream downStream, Compressor compressor, + int downStreamBufferSize) throws IOException { + OutputStream bos1 = null; + if (downStreamBufferSize > 0) { + bos1 = new BufferedOutputStream(downStream, downStreamBufferSize); + } + else { + bos1 = downStream; + } + codec.getConf() + .setInt("io.compression.codec.lzo.buffersize", 64 * 1024); + CompressionOutputStream cos = + codec.createOutputStream(bos1, compressor); + BufferedOutputStream bos2 = + new BufferedOutputStream(new FinishOnFlushCompressionStream(cos), + DATA_OBUF_SIZE); + return bos2; + } + }, + + GZ("gz") { + private GzipCodec codec; + + @Override + CompressionCodec getCodec() { + if (codec == null) { + Configuration conf = new Configuration(); + conf.setBoolean("hadoop.native.lib", true); + codec = new GzipCodec(); + codec.setConf(conf); + } + + return codec; + } + + @Override + public synchronized InputStream createDecompressionStream( + InputStream downStream, Decompressor decompressor, + int downStreamBufferSize) throws IOException { + // Set the internal buffer size to read from down stream. + if (downStreamBufferSize > 0) { + codec.getConf().setInt("io.file.buffer.size", downStreamBufferSize); + } + CompressionInputStream cis = + codec.createInputStream(downStream, decompressor); + BufferedInputStream bis2 = new BufferedInputStream(cis, DATA_IBUF_SIZE); + return bis2; + } + + @Override + public synchronized OutputStream createCompressionStream( + OutputStream downStream, Compressor compressor, + int downStreamBufferSize) throws IOException { + OutputStream bos1 = null; + if (downStreamBufferSize > 0) { + bos1 = new BufferedOutputStream(downStream, downStreamBufferSize); + } + else { + bos1 = downStream; + } + codec.getConf().setInt("io.file.buffer.size", 32 * 1024); + CompressionOutputStream cos = + codec.createOutputStream(bos1, compressor); + BufferedOutputStream bos2 = + new BufferedOutputStream(new FinishOnFlushCompressionStream(cos), + DATA_OBUF_SIZE); + return bos2; + } + }, + + NONE("none") { + @Override + CompressionCodec getCodec() { + return null; + } + + @Override + public synchronized InputStream createDecompressionStream( + InputStream downStream, Decompressor decompressor, + int downStreamBufferSize) throws IOException { + if (downStreamBufferSize > 0) { + return new BufferedInputStream(downStream, downStreamBufferSize); + } + // else { + // Make sure we bypass FSInputChecker buffer. 
+ // return new BufferedInputStream(downStream, 1024); + // } + // } + return downStream; + } + + @Override + public synchronized OutputStream createCompressionStream( + OutputStream downStream, Compressor compressor, + int downStreamBufferSize) throws IOException { + if (downStreamBufferSize > 0) { + return new BufferedOutputStream(downStream, downStreamBufferSize); + } + + return downStream; + } + }; + + private final String compressName; + // data input buffer size to absorb small reads from application. + private static final int DATA_IBUF_SIZE = 1 * 1024; + // data output buffer size to absorb small writes from application. + private static final int DATA_OBUF_SIZE = 4 * 1024; + + Algorithm(String name) { + this.compressName = name; + } + + abstract CompressionCodec getCodec(); + + public abstract InputStream createDecompressionStream( + InputStream downStream, Decompressor decompressor, + int downStreamBufferSize) throws IOException; + + public abstract OutputStream createCompressionStream( + OutputStream downStream, Compressor compressor, int downStreamBufferSize) + throws IOException; + + public Compressor getCompressor() { + CompressionCodec codec = getCodec(); + if (codec != null) { + Compressor compressor = CodecPool.getCompressor(codec); + if (compressor != null) { + if (compressor.finished()) { + // Somebody returns the compressor to CodecPool but is still using + // it. + LOG + .warn("Compressor obtained from CodecPool is already finished()"); + // throw new AssertionError( + // "Compressor obtained from CodecPool is already finished()"); + } + else { + LOG.debug("Got a compressor: " + compressor.hashCode()); + } + compressor.reset(); + } + return compressor; + } + return null; + } + + public void returnCompressor(Compressor compressor) { + if (compressor != null) { + LOG.debug("Return a compressor: " + compressor.hashCode()); + CodecPool.returnCompressor(compressor); + } + } + + public Decompressor getDecompressor() { + CompressionCodec codec = getCodec(); + if (codec != null) { + Decompressor decompressor = CodecPool.getDecompressor(codec); + if (decompressor != null) { + if (decompressor.finished()) { + // Somebody returns the decompressor to CodecPool but is still using + // it. 
+ LOG + .warn("Deompressor obtained from CodecPool is already finished()"); + // throw new AssertionError( + // "Decompressor obtained from CodecPool is already finished()"); + } + else { + LOG.debug("Got a decompressor: " + decompressor.hashCode()); + } + decompressor.reset(); + } + return decompressor; + } + + return null; + } + + public void returnDecompressor(Decompressor decompressor) { + if (decompressor != null) { + LOG.debug("Returned a decompressor: " + decompressor.hashCode()); + CodecPool.returnDecompressor(decompressor); + } + } + + public String getName() { + return compressName; + } + } + + static Algorithm getCompressionAlgorithmByName(String compressName) { + Algorithm[] algos = Algorithm.class.getEnumConstants(); + + for (Algorithm a : algos) { + if (a.getName().equals(compressName)) { + return a; + } + } + + throw new IllegalArgumentException( + "Unsupported compression algorithm name: " + compressName); + } + + static String[] getSupportedAlgorithms() { + Algorithm[] algos = Algorithm.class.getEnumConstants(); + + String[] ret = new String[algos.length]; + int i = 0; + for (Algorithm a : algos) { + ret[i++] = a.getName(); + } + + return ret; + } +} diff --git a/src/java/org/apache/hadoop/hbase/io/hfile/HFile.java b/src/java/org/apache/hadoop/hbase/io/hfile/HFile.java new file mode 100644 index 00000000000..3a2915d05d0 --- /dev/null +++ b/src/java/org/apache/hadoop/hbase/io/hfile/HFile.java @@ -0,0 +1,1423 @@ +/** + * Copyright 2009 The Apache Software Foundation + * + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.hadoop.hbase.io.hfile; + +import java.io.Closeable; +import java.io.DataInputStream; +import java.io.DataOutputStream; +import java.io.IOException; +import java.io.InputStream; +import java.io.OutputStream; +import java.nio.ByteBuffer; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.List; +import java.util.Map; + +import org.apache.commons.logging.Log; +import org.apache.commons.logging.LogFactory; +import org.apache.hadoop.fs.FSDataInputStream; +import org.apache.hadoop.fs.FSDataOutputStream; +import org.apache.hadoop.fs.FileSystem; +import org.apache.hadoop.fs.Path; +import org.apache.hadoop.hbase.io.HbaseMapWritable; +import org.apache.hadoop.hbase.io.HeapSize; +import org.apache.hadoop.hbase.util.Bytes; +import org.apache.hadoop.io.IOUtils; +import org.apache.hadoop.io.RawComparator; +import org.apache.hadoop.io.compress.Compressor; +import org.apache.hadoop.io.compress.Decompressor; + +/** + * File format for hbase. + * A file of sorted key/value pairs. Both keys and values are byte arrays. + *

+ * The memory footprint of a HFile includes the following (below is taken from + * Hadoop-3315 tfile + * but applies also to HFile): + *

+ * Suggestions on performance optimization. + * + * + * For more on the background behind HFile, see HBASE-61. + *

+ * File is made of data blocks followed by meta data blocks (if any), a fileinfo + * block, data block index, meta data block index, and a fixed size trailer + * which records the offsets at which file changes content type. + *

<data blocks><meta blocks><fileinfo><data index><meta index><trailer>
+ * Each block has a bit of magic at its start. Blocks are comprised of + * key/values. In data blocks, they are both byte arrays. Metadata blocks are + * a String key and a byte array value. An empty file looks like this: + *
<fileinfo><trailer>
. That is, there are no data or meta + * blocks present. + *

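To make the layout above concrete, here is a rough write/read round trip using the Writer and Reader defined later in this file (an editor's sketch, not part of the patch; the Configuration, the output path, and the key/value contents are made up):

  // Write a small HFile; keys must be appended in comparator order.
  FileSystem fs = FileSystem.get(conf);              // conf: assumed Hadoop Configuration
  Path p = new Path("/tmp/example.hfile");           // hypothetical output path
  HFile.Writer w = new HFile.Writer(fs, p, HFile.DEFAULT_BLOCKSIZE, "none", null);
  w.append(Bytes.toBytes("row1"), Bytes.toBytes("value1"));
  w.append(Bytes.toBytes("row2"), Bytes.toBytes("value2"));
  w.close();                                         // writes remaining data blocks, fileinfo, indices, trailer

  // Read it back; file info must be loaded before the scanner is used.
  HFile.Reader r = new HFile.Reader(fs, p, null);    // null = no block cache
  r.loadFileInfo();
  HFileScanner s = r.getScanner();
  if (s.seekTo(Bytes.toBytes("row2")) == 0) {        // 0 means an exact match was found
    ByteBuffer v = s.getValue();
    // ... use v ...
  }
  r.close();
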
+ * TODO: Bloomfilters. Need to add hadoop 0.20. first since it has bug fixes + * on the hadoop bf package. + * * TODO: USE memcmp by default? Write the keys out in an order that allows + * my using this -- reverse the timestamp. + * TODO: Add support for fast-gzip and for lzo. + * TODO: Do scanners need to be able to take a start and end row? + * TODO: Should BlockIndex know the name of its file? Should it have a Path + * that points at its file say for the case where an index lives apart from + * an HFile instance? + */ +public class HFile { + static final Log LOG = LogFactory.getLog(HFile.class); + + /* These values are more or less arbitrary, and they are used as a + * form of check to make sure the file isn't completely corrupt. + */ + final static byte [] DATABLOCKMAGIC = + {'D', 'A', 'T', 'A', 'B', 'L', 'K', 42 }; + final static byte [] INDEXBLOCKMAGIC = + { 'I', 'D', 'X', 'B', 'L', 'K', 41, 43 }; + final static byte [] METABLOCKMAGIC = + { 'M', 'E', 'T', 'A', 'B', 'L', 'K', 99 }; + final static byte [] TRAILERBLOCKMAGIC = + { 'T', 'R', 'A', 'B', 'L', 'K', 34, 36 }; + + /** + * Maximum length of key in HFile. + */ + public final static int MAXIMUM_KEY_LENGTH = 64 * 1024; + + /** + * Default blocksize for hfile. + */ + public final static int DEFAULT_BLOCKSIZE = 64 * 1024; + + /** + * Default compression: none. + */ + public final static String DEFAULT_COMPRESSION = + Compression.Algorithm.NONE.getName(); + + /** + * HFile Writer. + */ + public static class Writer implements Closeable { + // FileSystem stream to write on. + private FSDataOutputStream outputStream; + // True if we opened the outputStream (and so will close it). + private boolean closeOutputStream; + + // Name for this object used when logging or in toString. Is either + // the result of a toString on stream or else toString of passed file Path. + private String name; + + // Total uncompressed bytes, maybe calculate a compression ratio later. + private int totalBytes = 0; + + // Total # of key/value entries, ie: how many times add() was called. + private int entryCount = 0; + + // Used calculating average key and value lengths. + private long keylength = 0; + private long valuelength = 0; + + // Used to ensure we write in order. + private final RawComparator comparator; + + // A stream made per block written. + private DataOutputStream out; + + // Number of uncompressed bytes per block. Reinitialized when we start + // new block. + private int blocksize; + + // Offset where the current block began. + private long blockBegin; + + // First key in a block (Not first key in file). + private byte [] firstKey = null; + + // Key previously appended. Becomes the last key in the file. + private byte [] lastKey = null; + + // See {@link BlockIndex}. Below four fields are used to write the block + // index. + ArrayList blockKeys = new ArrayList(); + // Block offset in backing stream. + ArrayList blockOffsets = new ArrayList(); + // Raw (decompressed) data size. + ArrayList blockDataSizes = new ArrayList(); + + // Meta block system. + private ArrayList metaNames = new ArrayList(); + private ArrayList metaData = new ArrayList(); + + // Used compression. Used even if no compression -- 'none'. + private final Compression.Algorithm compressAlgo; + private Compressor compressor; + + // Special datastructure to hold fileinfo. + private FileInfo fileinfo = new FileInfo(); + + // May be null if we were passed a stream. + private Path path = null; + + /** + * Constructor that uses all defaults for compression and block size. 
+ * @param fs + * @param path + * @throws IOException + */ + public Writer(FileSystem fs, Path path) + throws IOException { + this(fs, path, DEFAULT_BLOCKSIZE, null, null); + } + + /** + * Constructor that takes a Path. + * @param fs + * @param path + * @param blocksize + * @param compress + * @param comparator + * @throws IOException + */ + public Writer(FileSystem fs, Path path, int blocksize, String compress, + final RawComparator comparator) + throws IOException { + this(fs.create(path), blocksize, compress, comparator); + this.closeOutputStream = true; + this.name = path.toString(); + this.path = path; + } + + /** + * Constructor that takes a stream. + * @param ostream Stream to use. + * @param blocksize + * @param compress + * @param c + * @throws IOException + */ + public Writer(final FSDataOutputStream ostream, final int blocksize, + final String compress, final RawComparator c) + throws IOException { + this.outputStream = ostream; + this.closeOutputStream = false; + this.blocksize = blocksize; + this.comparator = c == null? Bytes.BYTES_RAWCOMPARATOR: c; + this.name = this.outputStream.toString(); + this.compressAlgo = + Compression.getCompressionAlgorithmByName(compress == null? + Compression.Algorithm.NONE.getName(): compress); + } + + /* + * If at block boundary, opens new block. + * @throws IOException + */ + private void checkBlockBoundary() throws IOException { + if (this.out != null && this.out.size() < blocksize) return; + finishBlock(); + newBlock(); + } + + /* + * Do the cleanup if a current block. + * @throws IOException + */ + private void finishBlock() throws IOException { + if (this.out == null) return; + long size = releaseCompressingStream(this.out); + this.out = null; + + blockKeys.add(firstKey); + int written = longToInt(size); + blockOffsets.add(Long.valueOf(blockBegin)); + blockDataSizes.add(Integer.valueOf(written)); + this.totalBytes += written; + } + + /* + * Ready a new block for writing. + * @throws IOException + */ + private void newBlock() throws IOException { + // This is where the next block begins. + blockBegin = outputStream.getPos(); + this.out = getCompressingStream(); + this.out.write(DATABLOCKMAGIC); + firstKey = null; + } + + /* + * Sets up a compressor and creates a compression stream on top of + * this.outputStream. Get one per block written. + * @return A compressing stream; if 'none' compression, returned stream + * does not compress. + * @throws IOException + * @see {@link #releaseCompressingStream(DataOutputStream)} + */ + private DataOutputStream getCompressingStream() throws IOException { + this.compressor = compressAlgo.getCompressor(); + // Get new DOS compression stream. In tfile, the DOS, is not closed, + // just finished, and that seems to be fine over there. TODO: Check + // no memory retention of the DOS. Should I disable the 'flush' on the + // DOS as the BCFile over in tfile does? It wants to make it so flushes + // don't go through to the underlying compressed stream. Flush on the + // compressed downstream should be only when done. I was going to but + // looks like when we call flush in here, its legitimate flush that + // should go through to the compressor. + OutputStream os = + this.compressAlgo.createCompressionStream(this.outputStream, + this.compressor, 0); + return new DataOutputStream(os); + } + + /* + * Let go of block compressor and compressing stream gotten in call + * {@link #getCompressingStream}. + * @param dos + * @return How much was written on this stream since it was taken out. 
+ * @see #getCompressingStream() + * @throws IOException + */ + private int releaseCompressingStream(final DataOutputStream dos) + throws IOException { + dos.flush(); + this.compressAlgo.returnCompressor(this.compressor); + this.compressor = null; + return dos.size(); + } + + /** + * Add a meta block to the end of the file. Call before close(). + * Metadata blocks are expensive. Fill one with a bunch of serialized data + * rather than do a metadata block per metadata instance. If metadata is + * small, consider adding to file info using + * {@link #appendFileInfo(byte[], byte[])} + * @param metaBlockName name of the block + * @param bytes uninterpreted bytes of the block. + */ + public void appendMetaBlock(String metaBlockName, byte [] bytes) { + metaNames.add(Bytes.toBytes(metaBlockName)); + metaData.add(bytes); + } + + /** + * Add to the file info. Added key value can be gotten out of the return + * from {@link Reader#loadFileInfo()}. + * @param k Key + * @param v Value + * @throws IOException + */ + public void appendFileInfo(final byte [] k, final byte [] v) + throws IOException { + appendFileInfo(this.fileinfo, k, v, true); + } + + FileInfo appendFileInfo(FileInfo fi, final byte [] k, final byte [] v, + final boolean checkPrefix) + throws IOException { + if (k == null || v == null) { + throw new NullPointerException("Key nor value may be null"); + } + if (checkPrefix && + Bytes.toString(k).toLowerCase().startsWith(FileInfo.RESERVED_PREFIX)) { + throw new IOException("Keys with a " + FileInfo.RESERVED_PREFIX + + " are reserved"); + } + fi.put(k, v); + return fi; + } + + /** + * @return Path or null if we were passed a stream rather than a Path. + */ + public Path getPath() { + return this.path; + } + + public String toString() { + return "writer=" + this.name + ", compression=" + + this.compressAlgo.getName(); + } + + /** + * Add key/value to file. + * Keys must be added in an order that agrees with the RawComparator passed + * on construction. + * @param key Key to add. Cannot be empty nor null. + * @param value Value to add. Cannot be empty nor null. + * @throws IOException + */ + public void append(final byte [] key, final byte [] value) + throws IOException { + checkKey(key); + checkValue(value); + checkBlockBoundary(); + // Write length of key and value and then actual key and value bytes. + this.out.writeInt(key.length); + this.keylength += key.length; + this.out.writeInt(value.length); + this.valuelength += valuelength; + this.out.write(key); + this.out.write(value); + // Are we the first key in this block? + if (this.firstKey == null) this.firstKey = key; + this.lastKey = key; + this.entryCount ++; + } + + /* + * @param key Key to check. 
+ * @throws IOException + */ + private void checkKey(final byte [] key) throws IOException { + if (key == null || key.length <= 0) { + throw new IOException("Key cannot be null or empty"); + } + if (key.length > MAXIMUM_KEY_LENGTH) { + throw new IOException("Key length " + key.length + " > " + + MAXIMUM_KEY_LENGTH); + } + if (this.lastKey != null) { + if (this.comparator.compare(this.lastKey, key) > 0) { + throw new IOException("Added a key not lexically larger than" + + " previous: key=" + Bytes.toString(key) + ", lastkey=" + + Bytes.toString(lastKey)); + } + } + } + + private void checkValue(final byte [] value) throws IOException { + if (value == null || value.length <= 0) { + throw new IOException("Value cannot be null or empty"); + } + } + + public void close() throws IOException { + if (this.outputStream == null) { + return; + } + // Write out the end of the data blocks, then write meta data blocks. + // followed by fileinfo, data block index and meta block index. + + finishBlock(); + + FixedFileTrailer trailer = new FixedFileTrailer(); + + // Write out the metadata blocks if any. + ArrayList metaOffsets = null; + ArrayList metaDataSizes = null; + if (metaNames.size() > 0) { + metaOffsets = new ArrayList(metaNames.size()); + metaDataSizes = new ArrayList(metaNames.size()); + for (int i = 0 ; i < metaNames.size() ; ++ i ) { + metaOffsets.add(Long.valueOf(outputStream.getPos())); + metaDataSizes. + add(Integer.valueOf(METABLOCKMAGIC.length + metaData.get(i).length)); + writeMetaBlock(metaData.get(i)); + } + } + + // Write fileinfo. + trailer.fileinfoOffset = writeFileInfo(this.outputStream); + + // Write the data block index. + trailer.dataIndexOffset = BlockIndex.writeIndex(this.outputStream, + this.blockKeys, this.blockOffsets, this.blockDataSizes); + + // Meta block index. + if (metaNames.size() > 0) { + trailer.metaIndexOffset = BlockIndex.writeIndex(this.outputStream, + this.metaNames, metaOffsets, metaDataSizes); + } + + // Now finish off the trailer. + trailer.dataIndexCount = blockKeys.size(); + trailer.metaIndexCount = metaNames.size(); + + trailer.totalUncompressedBytes = totalBytes; + trailer.entryCount = entryCount; + + trailer.compressionCodec = this.compressAlgo.ordinal(); + + trailer.serialize(outputStream); + + if (this.closeOutputStream) { + this.outputStream.close(); + this.outputStream = null; + } + } + + /* Write a metadata block. + * @param metadata + * @throws IOException + */ + private void writeMetaBlock(final byte [] b) throws IOException { + DataOutputStream dos = getCompressingStream(); + dos.write(METABLOCKMAGIC); + dos.write(b); + releaseCompressingStream(dos); + } + + /* + * Add last bits of metadata to fileinfo and then write it out. + * Reader will be expecting to find all below. + * @param o Stream to write on. + * @return Position at which we started writing. + * @throws IOException + */ + private long writeFileInfo(FSDataOutputStream o) throws IOException { + if (this.lastKey != null) { + appendFileInfo(this.fileinfo, FileInfo.LASTKEY, this.lastKey, false); + } + int avgKeyLen = this.entryCount == 0? 0: + (int)(this.keylength/this.entryCount); + appendFileInfo(this.fileinfo, FileInfo.AVG_KEY_LEN, + Bytes.toBytes(avgKeyLen), false); + int avgValueLen = this.entryCount == 0? 
0: + (int)(this.keylength/this.entryCount); + appendFileInfo(this.fileinfo, FileInfo.AVG_VALUE_LEN, + Bytes.toBytes(avgValueLen), false); + appendFileInfo(this.fileinfo, FileInfo.COMPARATOR, + Bytes.toBytes(this.comparator.getClass().getName()), false); + long pos = o.getPos(); + this.fileinfo.write(o); + return pos; + } + } + + /** + * HFile Reader. + */ + public static class Reader implements Closeable { + // Stream to read from. + private FSDataInputStream istream; + // True if we should close istream when done. We don't close it if we + // didn't open it. + private boolean closeIStream; + + // These are read in when the file info is loaded. + HFile.BlockIndex blockIndex; + private BlockIndex metaIndex; + FixedFileTrailer trailer; + private volatile boolean fileInfoLoaded = false; + + // Filled when we read in the trailer. + private Compression.Algorithm compressAlgo; + + // Last key in the file. Filled in when we read in the file info + private byte [] lastkey = null; + // Stats read in when we load file info. + private int avgKeyLen = -1; + private int avgValueLen = -1; + + // Used to ensure we seek correctly. + RawComparator comparator; + + // Size of this file. + private final long fileSize; + + // Block cache to use. + private final BlockCache cache; + public int cacheHits = 0; + public int blockLoads = 0; + + // Name for this object used when logging or in toString. Is either + // the result of a toString on the stream or else is toString of passed + // file Path plus metadata key/value pairs. + private String name; + + /* + * Do not expose the default constructor. + */ + @SuppressWarnings("unused") + private Reader() throws IOException { + this(null, null, null); + } + + /** + * Opens a HFile. You must load the file info before you can + * use it by calling {@link #loadFileInfo()}. + * + * @param fs filesystem to load from + * @param path path within said filesystem + * @param cache block cache. Pass null if none. + * @throws IOException + */ + public Reader(FileSystem fs, Path path, BlockCache cache) + throws IOException { + this(fs.open(path), fs.getFileStatus(path).getLen(), cache); + this.closeIStream = true; + this.name = path.toString(); + } + + /** + * Opens a HFile. You must load the index before you can + * use it by calling {@link #loadFileInfo()}. + * + * @param fsdis input stream. Caller is responsible for closing the passed + * stream. + * @param size Length of the stream. + * @param cache block cache. Pass null if none. + * @throws IOException + */ + public Reader(final FSDataInputStream fsdis, final long size, + final BlockCache cache) + throws IOException { + this.cache = cache; + this.fileSize = size; + this.istream = fsdis; + this.closeIStream = false; + this.name = this.istream.toString(); + } + + public String toString() { + return "reader=" + this.name + + (!isFileInfoLoaded()? "": + ", compression=" + this.compressAlgo.getName() + + ", firstKey=" + Bytes.toString(getFirstKey()) + + ", lastKey=" + Bytes.toString(getLastKey()) + + ", avgKeyLen=" + this.avgKeyLen + + ", avgValueLen=" + this.avgValueLen + + ", entries=" + this.trailer.entryCount + + ", length=" + this.fileSize); + } + + public long length() { + return this.fileSize; + } + + /** + * Read in the index and file info. + * @return A map of fileinfo data. + * See {@link Writer#appendFileInfo(byte[], byte[])}. + * @throws IOException + */ + public Map loadFileInfo() throws IOException { + this.trailer = readTrailer(); + + // Read in the fileinfo and get what we need from it. 
+ this.istream.seek(this.trailer.fileinfoOffset); + FileInfo fi = new FileInfo(); + fi.readFields(this.istream); + this.lastkey = fi.get(FileInfo.LASTKEY); + this.avgKeyLen = Bytes.toInt(fi.get(FileInfo.AVG_KEY_LEN)); + this.avgValueLen = Bytes.toInt(fi.get(FileInfo.AVG_VALUE_LEN)); + String clazzName = Bytes.toString(fi.get(FileInfo.COMPARATOR)); + this.comparator = getComparator(clazzName); + + // Read in the data index. + this.blockIndex = BlockIndex.readIndex(this.comparator, this.istream, + this.trailer.dataIndexOffset, this.trailer.dataIndexCount); + + // Read in the metadata index. + if (trailer.metaIndexCount > 0) { + this.metaIndex = BlockIndex.readIndex(Bytes.BYTES_RAWCOMPARATOR, + this.istream, this.trailer.metaIndexOffset, trailer.metaIndexCount); + } + this.fileInfoLoaded = true; + return fi; + } + + boolean isFileInfoLoaded() { + return this.fileInfoLoaded; + } + + @SuppressWarnings("unchecked") + private RawComparator getComparator(final String clazzName) + throws IOException { + if (clazzName == null || clazzName.length() == 0) { + return null; + } + try { + return (RawComparator)Class.forName(clazzName).newInstance(); + } catch (InstantiationException e) { + throw new IOException(e); + } catch (IllegalAccessException e) { + throw new IOException(e); + } catch (ClassNotFoundException e) { + throw new IOException(e); + } + } + + /* Read the trailer off the input stream. As side effect, sets the + * compression algorithm. + * @return Populated FixedFileTrailer. + * @throws IOException + */ + private FixedFileTrailer readTrailer() throws IOException { + FixedFileTrailer fft = new FixedFileTrailer(); + long seekPoint = this.fileSize - FixedFileTrailer.trailerSize(); + this.istream.seek(seekPoint); + fft.deserialize(this.istream); + // Set up the codec. + this.compressAlgo = + Compression.Algorithm.values()[fft.compressionCodec]; + return fft; + } + + /** + * Create a Scanner on this file. No seeks or reads are done on creation. + * Call {@link HFileScanner#seekTo(byte[])} to position an start the read. + * There is nothing to clean up in a Scanner. Letting go of your references + * to the scanner is sufficient. + * @return Scanner on this file. + */ + public HFileScanner getScanner() { + return new Scanner(this); + } + /** + * @param key Key to search. + * @return Block number of the block containing the key or -1 if not in this + * file. + */ + protected int blockContainingKey(final byte [] key) { + if (blockIndex == null) { + throw new RuntimeException("Block index not loaded"); + } + return blockIndex.blockContainingKey(key); + } + /** + * @param metaBlockName + * @return Block wrapped in a ByteBuffer + * @throws IOException + */ + public ByteBuffer getMetaBlock(String metaBlockName) throws IOException { + if (metaIndex == null) { + throw new IOException("Meta index not loaded"); + } + int block = metaIndex.blockContainingKey(Bytes.toBytes(metaBlockName)); + if (block == -1) + return null; + long blockSize; + if (block == metaIndex.count - 1) { + blockSize = trailer.fileinfoOffset - metaIndex.blockOffsets[block]; + } else { + blockSize = metaIndex.blockOffsets[block+1] - metaIndex.blockOffsets[block]; + } + + ByteBuffer buf = decompress(metaIndex.blockOffsets[block], + longToInt(blockSize), metaIndex.blockDataSizes[block]); + byte [] magic = new byte[METABLOCKMAGIC.length]; + buf.get(magic, 0, magic.length); + + if (! Arrays.equals(magic, METABLOCKMAGIC)) { + throw new IOException("Meta magic is bad in block " + block); + } + // Toss the header. 
May have to remove later due to performance. + buf.compact(); + buf.limit(buf.limit() - METABLOCKMAGIC.length); + buf.rewind(); + return buf; + } + /** + * Read in a file block. + * @param block Index of block to read. + * @return Block wrapped in a ByteBuffer. + * @throws IOException + */ + ByteBuffer readBlock(int block) throws IOException { + if (blockIndex == null) { + throw new IOException("Block index not loaded"); + } + if (block < 0 || block > blockIndex.count) { + throw new IOException("Requested block is out of range: " + block + + ", max: " + blockIndex.count); + } + + // For any given block from any given file, synchronize reads for said + // block. + // Without a cache, this synchronizing is needless overhead, but really + // the other choice is to duplicate work (which the cache would prevent you from doing). + synchronized (blockIndex.blockKeys[block]) { + blockLoads++; + // Check cache for block. If found return. + if (cache != null) { + ByteBuffer cachedBuf = cache.getBlock(name + block); + if (cachedBuf != null) { + // LOG.debug("Reusing block for: " + block); + // Return a distinct 'copy' of the block, so pos doesnt get messed by + // the scanner + cacheHits++; + return cachedBuf.duplicate(); + } + // Carry on, please load. + } + + // Load block from filesystem. + long onDiskBlockSize; + if (block == blockIndex.count - 1) { + // last block! The end of data block is first meta block if there is + // one or if there isn't, the fileinfo offset. + long offset = this.metaIndex != null? + this.metaIndex.blockOffsets[0]: this.trailer.fileinfoOffset; + onDiskBlockSize = offset - blockIndex.blockOffsets[block]; + } else { + onDiskBlockSize = blockIndex.blockOffsets[block+1] - + blockIndex.blockOffsets[block]; + } + ByteBuffer buf = decompress(blockIndex.blockOffsets[block], + longToInt(onDiskBlockSize), this.blockIndex.blockDataSizes[block]); + + byte [] magic = new byte[DATABLOCKMAGIC.length]; + buf.get(magic, 0, magic.length); + // LOG.debug("read block:"+buf.position() + " lim:" + buf.limit()); + if (!Arrays.equals(magic, DATABLOCKMAGIC)) { + throw new IOException("Data magic is bad in block " + block); + } + // Toss the header. May have to remove later due to performance. + buf.compact(); + buf.limit(buf.limit() - DATABLOCKMAGIC.length); + // LOG.debug("read block:"+buf.position() + " lim:" + buf.limit()); + buf.rewind(); + // LOG.debug("read block:"+buf.position() + " lim:" + buf.limit()); + + // Cache a copy, not the one we are sending back, so the position doesnt + // get messed. + if (cache != null) { + cache.cacheBlock(name + block, buf.duplicate()); + } + + return buf; + } + } + + /* + * Decompress compressedSize bytes off the backing + * FSDataInputStream. + * @param offset + * @param compressedSize + * @param decompressedSize + * @return + * @throws IOException + */ + private ByteBuffer decompress(final long offset, final int compressedSize, + final int decompressedSize) + throws IOException { + Decompressor decompressor = this.compressAlgo.getDecompressor(); + // My guess is that the bounded range fis is needed to stop the + // decompressor reading into next block -- IIRC, it just grabs a + // bunch of data w/o regard to whether decompressor is coming to end of a + // decompression. 
+ InputStream is = this.compressAlgo.createDecompressionStream( + new BoundedRangeFileInputStream(this.istream, offset, compressedSize), + decompressor, 0); + ByteBuffer buf = ByteBuffer.allocate(decompressedSize); + IOUtils.readFully(is, buf.array(), 0, buf.capacity()); + return buf; + } + + /** + * @return First key in the file. + */ + public byte [] getFirstKey() { + if (blockIndex == null) { + throw new RuntimeException("Block index not loaded"); + } + return blockIndex.blockKeys[0]; + } + + public int getEntries() { + if (!this.isFileInfoLoaded()) { + throw new RuntimeException("File info not loaded"); + } + return this.trailer.entryCount; + } + + /** + * @return Last key in the file. + */ + public byte [] getLastKey() { + if (!isFileInfoLoaded()) { + throw new RuntimeException("Load file info first"); + } + return this.lastkey; + } + + /** + * @return Comparator. + */ + public RawComparator getComparator() { + return this.comparator; + } + + /** + * @return index size + */ + public long indexSize() { + return (this.blockIndex != null? this.blockIndex.heapSize(): 0) + + ((this.metaIndex != null)? this.metaIndex.heapSize(): 0); + } + + /** + * @return Midkey for this file. We work with block boundaries only so + * returned midkey is an approximation only. + * @throws IOException + */ + public byte [] midkey() throws IOException { + if (!isFileInfoLoaded() || this.blockIndex.isEmpty()) { + return null; + } + return this.blockIndex.midkey(); + } + + public void close() throws IOException { + if (this.closeIStream && this.istream != null) { + this.istream.close(); + this.istream = null; + } + } + + /* + * Implementation of {@link HFileScanner} interface. + */ + private static class Scanner implements HFileScanner { + private final Reader reader; + private ByteBuffer block; + private int currBlock; + + private int currKeyLen = 0; + private int currValueLen = 0; + + public int blockFetches = 0; + + public Scanner(Reader r) { + this.reader = r; + } + + public ByteBuffer getKey() { + if (this.block == null || this.currKeyLen == 0) { + throw new RuntimeException("you need to seekTo() before calling getKey()"); + } + ByteBuffer keyBuff = this.block.slice(); + keyBuff.limit(this.currKeyLen); + keyBuff.rewind(); + // Do keyBuff.asReadOnly()? + return keyBuff; + } + + public ByteBuffer getValue() { + if (block == null || currKeyLen == 0) { + throw new RuntimeException("you need to seekTo() before calling getValue()"); + } + // TODO: Could this be done with one ByteBuffer rather than create two? 
+ ByteBuffer valueBuff = this.block.slice(); + valueBuff.position(this.currKeyLen); + valueBuff = valueBuff.slice(); + valueBuff.limit(currValueLen); + valueBuff.rewind(); + return valueBuff; + } + + public boolean next() throws IOException { + // LOG.deug("rem:" + block.remaining() + " p:" + block.position() + + // " kl: " + currKeyLen + " kv: " + currValueLen); + if (block == null) { + throw new IOException("Next called on non-seeked scanner"); + } + block.position(block.position() + currKeyLen + currValueLen); + if (block.remaining() <= 0) { + // LOG.debug("Fetch next block"); + currBlock++; + if (currBlock >= reader.blockIndex.count) { + // damn we are at the end + currBlock = 0; + block = null; + return false; + } + block = reader.readBlock(currBlock); + currKeyLen = block.getInt(); + currValueLen = block.getInt(); + blockFetches++; + return true; + } + // LOG.debug("rem:" + block.remaining() + " p:" + block.position() + + // " kl: " + currKeyLen + " kv: " + currValueLen); + + currKeyLen = block.getInt(); + currValueLen = block.getInt(); + return true; + } + + public int seekTo(byte[] key) throws IOException { + int b = reader.blockContainingKey(key); + if (b < 0) return -1; // falls before the beginning of the file! :-( + // Avoid re-reading the same block (that'd be dumb). + loadBlock(b); + + return blockSeek(key, false); + } + + /** + * Within a loaded block, seek looking for the first key + * that is smaller than (or equal to?) the key we are interested in. + * + * A note on the seekBefore - if you have seekBefore = true, AND the + * first key in the block = key, then you'll get thrown exceptions. + * @param key to find + * @param seekBefore find the key before the exact match. + * @return + */ + private int blockSeek(byte[] key, boolean seekBefore) { + int klen, vlen; + int lastLen = 0; + do { + klen = block.getInt(); + vlen = block.getInt(); + int comp = this.reader.comparator.compare(key, 0, key.length, + block.array(), block.arrayOffset() + block.position(), klen); + if (comp == 0) { + if (seekBefore) { + block.position(block.position() - lastLen - 16); + currKeyLen = block.getInt(); + currValueLen = block.getInt(); + return 1; // non exact match. + } + currKeyLen = klen; + currValueLen = vlen; + return 0; // indicate exact match + } + if (comp < 0) { + // go back one key: + block.position(block.position() - lastLen - 16); + currKeyLen = block.getInt(); + currValueLen = block.getInt(); + return 1; + } + block.position(block.position() + klen + vlen); + lastLen = klen + vlen ; + } while( block.remaining() > 0 ); + // ok we are at the end, so go back a littleeeeee.... + block.position(block.position() - lastLen - 8); + currKeyLen = block.getInt(); + currValueLen = block.getInt(); + return 1; // didn't exactly find it. + } + + public boolean seekBefore(byte[] key) throws IOException { + int b = reader.blockContainingKey(key); + if (b < 0) + return false; // key is before the start of the file. + + // Question: does this block begin with 'key'? + if (this.reader.comparator.compare(reader.blockIndex.blockKeys[b], key) == 0) { + // Ok the key we're interested in is the first of the block, so go back one. + if (b == 0) { + // we have a 'problem', the key we want is the first of the file. + return false; + } + b--; + // TODO shortcut: seek forward in this block to the last key of the block. 
+ } + loadBlock(b); + blockSeek(key, true); + return true; + } + + public String getKeyString() { + return Bytes.toString(block.array(), block.arrayOffset() + + block.position(), currKeyLen); + } + + public String getValueString() { + return Bytes.toString(block.array(), block.arrayOffset() + + block.position() + currKeyLen, currValueLen); + } + + public Reader getReader() { + return this.reader; + } + + public boolean isSeeked(){ + return this.block != null; + } + + public boolean seekTo() throws IOException { + if (this.reader.blockIndex.isEmpty()) { + return false; + } + if (block != null && currBlock == 0) { + block.rewind(); + currKeyLen = block.getInt(); + currValueLen = block.getInt(); + } + currBlock = 0; + block = reader.readBlock(currBlock); + currKeyLen = block.getInt(); + currValueLen = block.getInt(); + blockFetches++; + return true; + } + + private void loadBlock(int bloc) throws IOException { + if (block == null) { + block = reader.readBlock(bloc); + currBlock = bloc; + blockFetches++; + } else { + if (bloc != currBlock) { + block = reader.readBlock(bloc); + currBlock = bloc; + blockFetches++; + } else { + // we are already in the same block, just rewind to seek again. + block.rewind(); + } + } + } + } + } + /* + * The RFile has a fixed trailer which contains offsets to other variable + * parts of the file. Also includes basic metadata on this file. + */ + private static class FixedFileTrailer { + // Offset to the data block index. + long dataIndexOffset; + // Offset to the fileinfo data, a small block of vitals.. + long fileinfoOffset; + // How many index counts are there (aka: block count) + int dataIndexCount; + // Offset to the meta block index. + long metaIndexOffset; + // How many meta block index entries (aka: meta block count) + int metaIndexCount; + long totalUncompressedBytes; + int entryCount; + int compressionCodec; + int version = 1; + + FixedFileTrailer() { + super(); + } + + static int trailerSize() { + // Keep this up to date... 
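+ // The fixed trailer is TRAILERBLOCKMAGIC plus four longs (fileinfoOffset, dataIndexOffset, metaIndexOffset, totalUncompressedBytes) and five ints (dataIndexCount, metaIndexCount, entryCount, compressionCodec, version); see serialize() below.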
+ final int intSize = 4; + final int longSize = 8; + return + ( intSize * 5 ) + + ( longSize * 4 ) + + TRAILERBLOCKMAGIC.length; + } + + void serialize(DataOutputStream outputStream) throws IOException { + outputStream.write(TRAILERBLOCKMAGIC); + outputStream.writeLong(fileinfoOffset); + outputStream.writeLong(dataIndexOffset); + outputStream.writeInt(dataIndexCount); + outputStream.writeLong(metaIndexOffset); + outputStream.writeInt(metaIndexCount); + outputStream.writeLong(totalUncompressedBytes); + outputStream.writeInt(entryCount); + outputStream.writeInt(compressionCodec); + outputStream.writeInt(version); + } + + void deserialize(DataInputStream inputStream) throws IOException { + byte [] header = new byte[TRAILERBLOCKMAGIC.length]; + inputStream.readFully(header); + if ( !Arrays.equals(header, TRAILERBLOCKMAGIC)) { + throw new IOException("Trailer 'header' is wrong; does the trailer " + + "size match content?"); + } + fileinfoOffset = inputStream.readLong(); + dataIndexOffset = inputStream.readLong(); + dataIndexCount = inputStream.readInt(); + + metaIndexOffset = inputStream.readLong(); + metaIndexCount = inputStream.readInt(); + + totalUncompressedBytes = inputStream.readLong(); + entryCount = inputStream.readInt(); + compressionCodec = inputStream.readInt(); + version = inputStream.readInt(); + + if (version != 1) { + throw new IOException("Wrong version: " + version); + } + } + + public String toString() { + return "fileinfoOffset=" + fileinfoOffset + + ", dataIndexOffset=" + dataIndexOffset + + ", dataIndexCount=" + dataIndexCount + + ", metaIndexOffset=" + metaIndexOffset + + ", metaIndexCount=" + metaIndexCount + + ", totalBytes=" + totalUncompressedBytes + + ", entryCount=" + entryCount + + ", version=" + version; + } + } + + /* + * The block index for a RFile. + * Used reading. + */ + static class BlockIndex implements HeapSize { + // How many actual items are there? The next insert location too. + int count = 0; + byte [][] blockKeys; + long [] blockOffsets; + int [] blockDataSizes; + int size = 0; + + /* Needed doing lookup on blocks. + */ + RawComparator comparator; + + /* + * Shutdown default constructor + */ + @SuppressWarnings("unused") + private BlockIndex() { + this(null); + } + + /** + * Constructor + * @param trailer File tail structure with index stats. + */ + BlockIndex(final RawComparator c) { + this.comparator = c; + // Guess that cost of three arrays + this object is 4 * 8 bytes. + this.size += (4 * 8); + } + + /** + * @return True if block index is empty. + */ + boolean isEmpty() { + return this.blockKeys.length <= 0; + } + + /** + * Adds a new entry in the block index. + * + * @param key Last key in the block + * @param offset file offset where the block is stored + * @param dataSize the uncompressed data size + */ + void add(final byte[] key, final long offset, final int dataSize) { + blockOffsets[count] = offset; + blockKeys[count] = key; + blockDataSizes[count] = dataSize; + count++; + this.size += (Bytes.SIZEOF_INT * 2 + key.length); + } + + /** + * @param key Key to find + * @return Offset of block containing key or -1 if this file + * does not contain the request. + */ + int blockContainingKey(final byte[] key) { + int pos = Arrays.binarySearch(blockKeys, key, this.comparator); + if (pos < 0) { + pos ++; + pos *= -1; + if (pos == 0) { + // falls before the beginning of the file. + return -1; + } + // When switched to "first key in block" index, binarySearch now returns + // the block with a firstKey < key. 
This means the value we want is potentially + // in the next block. + pos --; // in previous block. + + return pos; + } + // wow, a perfect hit, how unlikely? + return pos; + } + + /* + * @return File midkey. Inexact. Operates on block boundaries. Does + * not go into blocks. + */ + byte [] midkey() throws IOException { + int pos = ((this.count - 1)/2); // middle of the index + if (pos < 0) { + throw new IOException("HFile empty"); + } + return this.blockKeys[pos]; + } + + /* + * Write out index. Whatever we write here must jibe with what + * BlockIndex#readIndex is expecting. Make sure the two ends of the + * index serialization match. + * @param o + * @param keys + * @param offsets + * @param sizes + * @param c + * @return Position at which we entered the index. + * @throws IOException + */ + static long writeIndex(final FSDataOutputStream o, + final List keys, final List offsets, + final List sizes) + throws IOException { + long pos = o.getPos(); + // Don't write an index if nothing in the index. + if (keys.size() > 0) { + o.write(INDEXBLOCKMAGIC); + // Write the index. + for (int i = 0; i < keys.size(); ++i) { + o.writeLong(offsets.get(i).longValue()); + o.writeInt(sizes.get(i).intValue()); + byte [] key = keys.get(i); + Bytes.writeByteArray(o, key); + } + } + return pos; + } + + /* + * Read in the index that is at indexOffset + * Must match what was written by writeIndex in the Writer.close. + * @param in + * @param indexOffset + * @throws IOException + */ + static BlockIndex readIndex(final RawComparator c, + final FSDataInputStream in, final long indexOffset, final int indexSize) + throws IOException { + BlockIndex bi = new BlockIndex(c); + bi.blockOffsets = new long[indexSize]; + bi.blockKeys = new byte[indexSize][]; + bi.blockDataSizes = new int[indexSize]; + // If index size is zero, no index was written. + if (indexSize > 0) { + in.seek(indexOffset); + byte [] magic = new byte[INDEXBLOCKMAGIC.length]; + IOUtils.readFully(in, magic, 0, magic.length); + if (!Arrays.equals(magic, INDEXBLOCKMAGIC)) { + throw new IOException("Index block magic is wrong: " + + Arrays.toString(magic)); + } + for (int i = 0; i < indexSize; ++i ) { + long offset = in.readLong(); + int dataSize = in.readInt(); + byte [] key = Bytes.readByteArray(in); + bi.add(key, offset, dataSize); + } + } + return bi; + } + + public String toString() { + StringBuilder sb = new StringBuilder(); + sb.append("size=" + count); + for (int i = 0; i < count ; i++) { + sb.append(", "); + sb.append("key=").append(Bytes.toString(blockKeys[i])). + append(", offset=").append(blockOffsets[i]). + append(", dataSize=" + blockDataSizes[i]); + } + return sb.toString(); + } + + public long heapSize() { + return this.size; + } + } + + /* + * Metadata for this file. Conjured by the writer. Read in by the reader. + */ + static class FileInfo extends HbaseMapWritable { + static final String RESERVED_PREFIX = "hfile."; + static final byte [] LASTKEY = Bytes.toBytes(RESERVED_PREFIX + "LASTKEY"); + static final byte [] AVG_KEY_LEN = + Bytes.toBytes(RESERVED_PREFIX + "AVG_KEY_LEN"); + static final byte [] AVG_VALUE_LEN = + Bytes.toBytes(RESERVED_PREFIX + "AVG_VALUE_LEN"); + static final byte [] COMPARATOR = + Bytes.toBytes(RESERVED_PREFIX + "COMPARATOR"); + + /* + * Constructor. + */ + FileInfo() { + super(); + } + } + + /** + * Get names of supported compression algorithms. The names are acceptable by + * HFile.Writer. + * + * @return Array of strings, each represents a supported compression + * algorithm. 
Currently, the supported names are exactly + *         those returned by Compression.getSupportedAlgorithms().
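+ *         For example (illustrative sketch only):
+ *           String [] algos = HFile.getSupportedCompressionAlgorithms();
+ *           // any returned name is acceptable to HFile.Writer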
+ */ + public static String[] getSupportedCompressionAlgorithms() { + return Compression.getSupportedAlgorithms(); + } + + // Utility methods. + /* + * @param l Long to convert to an int. + * @return l cast as an int. + */ + static int longToInt(final long l) { + // Expecting the size() of a block not exceeding 4GB. Assuming the + // size() will wrap to negative integer if it exceeds 2GB (From tfile). + return (int)(l & 0x00000000ffffffffL); + } +} \ No newline at end of file diff --git a/src/java/org/apache/hadoop/hbase/io/hfile/HFileScanner.java b/src/java/org/apache/hadoop/hbase/io/hfile/HFileScanner.java new file mode 100644 index 00000000000..b16ee765280 --- /dev/null +++ b/src/java/org/apache/hadoop/hbase/io/hfile/HFileScanner.java @@ -0,0 +1,112 @@ +/** + * Copyright 2009 The Apache Software Foundation + * + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.hadoop.hbase.io.hfile; + +import java.io.IOException; +import java.nio.ByteBuffer; + +/** + * A scanner allows you to position yourself within a HFile and + * scan through it. It allows you to reposition yourself as well. + * + *
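+ * A minimal usage sketch (illustrative only; it assumes an HFile.Reader named
+ * "reader" has already been opened elsewhere and that its getScanner() method
+ * hands back an HFileScanner over the file):
+ *
+ *   HFileScanner scanner = reader.getScanner();
+ *   if (scanner.seekTo()) {                    // position at the first key
+ *     do {
+ *       ByteBuffer key = scanner.getKey();     // view over the current key
+ *       ByteBuffer value = scanner.getValue(); // view over the current value
+ *       // ... consume key/value ...
+ *     } while (scanner.next());                // false once the last entry is passed
+ *   }
+ *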
A scanner doesn't always have a key/value that it is pointing to + * when it is first created and before + * {@link #seekTo()}/{@link #seekTo(byte[])} are called. + * In this case, {@link #getKey()}/{@link #getValue()} returns null. At most + * other times, a key and value will be available. The general pattern is that + * you position the Scanner using the seekTo variants and then getKey and + * getValue. + */ +public interface HFileScanner { + /** + * SeekTo or just before the passed key. Examine the return + * code to figure whether we found the key or not. + * Consider the key stream of all the keys in the file, + * k[0] .. k[n], where there are n keys in the file. + * @param key Key to find. + * @return -1, if key < k[0], no position; + * 0, such that k[i] = key and scanner is left in position i; and + * 1, such that k[i] < key, and scanner is left in position i. + * Furthermore, there may be a k[i+1], such that k[i] < key < k[i+1] + * but there may not be a k[i+1], and next() will return false (EOF). + * @throws IOException + */ + public int seekTo(byte[] key) throws IOException; + /** + * Consider the key stream of all the keys in the file, + * k[0] .. k[n], where there are n keys in the file. + * @param key Key to find + * @return false if key <= k[0] or true with scanner in position 'i' such + * that: k[i] < key. Furthermore: there may be a k[i+1], such that + * k[i] < key <= k[i+1] but there may also NOT be a k[i+1], and next() will + * return false (EOF). + */ + public boolean seekBefore(byte [] key) throws IOException; + /** + * Positions this scanner at the start of the file. + * @return False if empty file; i.e. a call to next would return false and + * the current key and value are undefined. + * @throws IOException + */ + public boolean seekTo() throws IOException; + /** + * Scans to the next entry in the file. + * @return Returns false if you are at the end otherwise true if more in file. + * @throws IOException + */ + public boolean next() throws IOException; + /** + * Gets a buffer view to the current key. You must call + * {@link #seekTo(byte[])} before this method. + * @return byte buffer for the key. The limit is set to the key size, and the + * position is 0, the start of the buffer view. + */ + public ByteBuffer getKey(); + /** + * Gets a buffer view to the current value. You must call + * {@link #seekTo(byte[])} before this method. + * + * @return byte buffer for the value. The limit is set to the value size, and + * the position is 0, the start of the buffer view. + */ + public ByteBuffer getValue(); + /** + * Convenience method to get a copy of the key as a string - interpreting the + * bytes as UTF8. You must call {@link #seekTo(byte[])} before this method. + * @return key as a string + */ + public String getKeyString(); + /** + * Convenience method to get a copy of the value as a string - interpreting + * the bytes as UTF8. You must call {@link #seekTo(byte[])} before this method. + * @return value as a string + */ + public String getValueString(); + /** + * @return Reader that underlies this Scanner instance. + */ + public HFile.Reader getReader(); + /** + * @return True is scanner has had one of the seek calls invoked; i.e. + * {@link #seekBefore(byte[])} or {@link #seekTo()} or {@link #seekTo(byte[])}. + * Otherwise returns false. 
+ */ + public boolean isSeeked(); +} \ No newline at end of file diff --git a/src/java/org/apache/hadoop/hbase/io/hfile/SimpleBlockCache.java b/src/java/org/apache/hadoop/hbase/io/hfile/SimpleBlockCache.java new file mode 100644 index 00000000000..7f934e19e74 --- /dev/null +++ b/src/java/org/apache/hadoop/hbase/io/hfile/SimpleBlockCache.java @@ -0,0 +1,56 @@ +package org.apache.hadoop.hbase.io.hfile; + +import java.lang.ref.ReferenceQueue; +import java.lang.ref.SoftReference; +import java.nio.ByteBuffer; +import java.util.HashMap; +import java.util.Map; + + +/** + * Simple one RFile soft reference cache. + */ +public class SimpleBlockCache implements BlockCache { + private static class Ref extends SoftReference { + public String blockId; + public Ref(String blockId, ByteBuffer buf, ReferenceQueue q) { + super(buf, q); + this.blockId = blockId; + } + } + private Map cache = + new HashMap(); + + private ReferenceQueue q = new ReferenceQueue(); + public int dumps = 0; + + public SimpleBlockCache() { + super(); + } + + void processQueue() { + Ref r; + while ( (r = (Ref)q.poll()) != null) { + cache.remove(r.blockId); + dumps++; + } + } + + public synchronized int size() { + processQueue(); + return cache.size(); + } + @Override + public synchronized ByteBuffer getBlock(String blockName) { + processQueue(); // clear out some crap. + Ref ref = cache.get(blockName); + if (ref == null) + return null; + return ref.get(); + } + + @Override + public synchronized void cacheBlock(String blockName, ByteBuffer buf) { + cache.put(blockName, new Ref(blockName, buf, q)); + } +} diff --git a/src/java/org/apache/hadoop/hbase/io/hfile/package.html b/src/java/org/apache/hadoop/hbase/io/hfile/package.html new file mode 100644 index 00000000000..fa9244f735a --- /dev/null +++ b/src/java/org/apache/hadoop/hbase/io/hfile/package.html @@ -0,0 +1,25 @@ + + + + + + + +Provides the hbase data+index+metadata file. + + diff --git a/src/java/org/apache/hadoop/hbase/regionserver/HStore.java b/src/java/org/apache/hadoop/hbase/regionserver/HStore.java deleted file mode 100644 index a9e86b32847..00000000000 --- a/src/java/org/apache/hadoop/hbase/regionserver/HStore.java +++ /dev/null @@ -1,2207 +0,0 @@ -/** - * Copyright 2007 The Apache Software Foundation - * - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ -package org.apache.hadoop.hbase.regionserver; - -import java.io.EOFException; -import java.io.IOException; -import java.io.UnsupportedEncodingException; -import java.util.ArrayList; -import java.util.Arrays; -import java.util.Collection; -import java.util.Collections; -import java.util.HashMap; -import java.util.HashSet; -import java.util.List; -import java.util.Map; -import java.util.Set; -import java.util.SortedMap; -import java.util.TreeMap; -import java.util.concurrent.CopyOnWriteArraySet; -import java.util.concurrent.locks.ReentrantReadWriteLock; -import java.util.regex.Matcher; -import java.util.regex.Pattern; - -import org.apache.commons.logging.Log; -import org.apache.commons.logging.LogFactory; -import org.apache.hadoop.fs.FileStatus; -import org.apache.hadoop.fs.FileSystem; -import org.apache.hadoop.fs.Path; -import org.apache.hadoop.hbase.HBaseConfiguration; -import org.apache.hadoop.hbase.HColumnDescriptor; -import org.apache.hadoop.hbase.HConstants; -import org.apache.hadoop.hbase.HRegionInfo; -import org.apache.hadoop.hbase.HStoreKey; -import org.apache.hadoop.hbase.RemoteExceptionHandler; -import org.apache.hadoop.hbase.filter.RowFilterInterface; -import org.apache.hadoop.hbase.io.BloomFilterMapFile; -import org.apache.hadoop.hbase.io.Cell; -import org.apache.hadoop.hbase.io.HBaseMapFile; -import org.apache.hadoop.hbase.io.ImmutableBytesWritable; -import org.apache.hadoop.hbase.io.Reference; -import org.apache.hadoop.hbase.util.Bytes; -import org.apache.hadoop.hbase.util.FSUtils; -import org.apache.hadoop.hbase.io.MapFile; -import org.apache.hadoop.hbase.io.SequenceFile; -import org.apache.hadoop.util.Progressable; -import org.apache.hadoop.util.StringUtils; - -/** - * HStore maintains a bunch of data files. It is responsible for maintaining - * the memory/file hierarchy and for periodic flushes to disk and compacting - * edits to the file. - * - * Locking and transactions are handled at a higher level. This API should not - * be called directly by any writer, but rather by an HRegion manager. - */ -public class HStore implements HConstants { - static final Log LOG = LogFactory.getLog(HStore.class); - - /* - * Regex that will work for straight filenames and for reference names. - * If reference, then the regex has more than just one group. Group 1 is - * this files id. Group 2 the referenced region name, etc. - */ - private static final Pattern REF_NAME_PARSER = - Pattern.compile("^(\\d+)(?:\\.(.+))?$"); - - protected final Memcache memcache; - private final Path basedir; - private final HRegionInfo info; - private final HColumnDescriptor family; - private final SequenceFile.CompressionType compression; - final FileSystem fs; - private final HBaseConfiguration conf; - // ttl in milliseconds. - protected long ttl; - private long majorCompactionTime; - private int maxFilesToCompact; - private final long desiredMaxFileSize; - private volatile long storeSize; - - private final Integer flushLock = new Integer(0); - - final ReentrantReadWriteLock lock = new ReentrantReadWriteLock(); - - final byte [] storeName; - private final String storeNameStr; - - /* - * Sorted Map of readers keyed by sequence id (Most recent should be last in - * in list). - */ - private final SortedMap storefiles = - Collections.synchronizedSortedMap(new TreeMap()); - - /* - * Sorted Map of readers keyed by sequence id (Most recent is last in list). - */ - private final SortedMap readers = - new TreeMap(); - - // The most-recent log-seq-ID that's present. 
The most-recent such ID means - // we can ignore all log messages up to and including that ID (because they're - // already reflected in the TreeMaps). - private volatile long maxSeqId; - - private final Path compactionDir; - private final Integer compactLock = new Integer(0); - private final int compactionThreshold; - - // All access must be synchronized. - private final CopyOnWriteArraySet changedReaderObservers = - new CopyOnWriteArraySet(); - - /** - * An HStore is a set of zero or more MapFiles, which stretch backwards over - * time. A given HStore is responsible for a certain set of columns for a - * row in the HRegion. - * - *
The HRegion starts writing to its set of HStores when the HRegion's - * memcache is flushed. This results in a round of new MapFiles, one for - * each HStore. - * - *
There's no reason to consider append-logging at this level; all logging - * and locking is handled at the HRegion level. HStore just provides - * services to manage sets of MapFiles. One of the most important of those - * services is MapFile-compaction services. - * - *
The only thing having to do with logs that HStore needs to deal with is - * the reconstructionLog. This is a segment of an HRegion's log that might - * NOT be present upon startup. If the param is NULL, there's nothing to do. - * If the param is non-NULL, we need to process the log to reconstruct - * a TreeMap that might not have been written to disk before the process - * died. - * - *
It's assumed that after this constructor returns, the reconstructionLog - * file will be deleted (by whoever has instantiated the HStore). - * - * @param basedir qualified path under which the region directory lives - * @param info HRegionInfo for this region - * @param family HColumnDescriptor for this column - * @param fs file system object - * @param reconstructionLog existing log file to apply if any - * @param conf configuration object - * @param reporter Call on a period so hosting server can report we're - * making progress to master -- otherwise master might think region deploy - * failed. Can be null. - * @throws IOException - */ - protected HStore(Path basedir, HRegionInfo info, HColumnDescriptor family, - FileSystem fs, Path reconstructionLog, HBaseConfiguration conf, - final Progressable reporter) - throws IOException { - this.basedir = basedir; - this.info = info; - this.family = family; - this.fs = fs; - this.conf = conf; - // getTimeToLive returns ttl in seconds. Convert to milliseconds. - this.ttl = family.getTimeToLive(); - if (ttl != HConstants.FOREVER) { - this.ttl *= 1000; - } - this.memcache = new Memcache(this.ttl, info); - this.compactionDir = HRegion.getCompactionDir(basedir); - this.storeName = Bytes.toBytes(this.info.getEncodedName() + "/" + - Bytes.toString(this.family.getName())); - this.storeNameStr = Bytes.toString(this.storeName); - - // By default, we compact if an HStore has more than - // MIN_COMMITS_FOR_COMPACTION map files - this.compactionThreshold = - conf.getInt("hbase.hstore.compactionThreshold", 3); - - // By default we split region if a file > DEFAULT_MAX_FILE_SIZE. - long maxFileSize = info.getTableDesc().getMaxFileSize(); - if (maxFileSize == HConstants.DEFAULT_MAX_FILE_SIZE) { - maxFileSize = conf.getLong("hbase.hregion.max.filesize", - HConstants.DEFAULT_MAX_FILE_SIZE); - } - this.desiredMaxFileSize = maxFileSize; - - this.majorCompactionTime = - conf.getLong(HConstants.MAJOR_COMPACTION_PERIOD, 86400000); - if (family.getValue(HConstants.MAJOR_COMPACTION_PERIOD) != null) { - String strCompactionTime = - family.getValue(HConstants.MAJOR_COMPACTION_PERIOD); - this.majorCompactionTime = (new Long(strCompactionTime)).longValue(); - } - - this.maxFilesToCompact = conf.getInt("hbase.hstore.compaction.max", 10); - this.storeSize = 0L; - - if (family.getCompression() == HColumnDescriptor.CompressionType.BLOCK) { - this.compression = SequenceFile.CompressionType.BLOCK; - } else if (family.getCompression() == - HColumnDescriptor.CompressionType.RECORD) { - this.compression = SequenceFile.CompressionType.RECORD; - } else { - this.compression = SequenceFile.CompressionType.NONE; - } - - Path mapdir = checkdir(HStoreFile.getMapDir(basedir, info.getEncodedName(), - family.getName())); - Path infodir = checkdir(HStoreFile.getInfoDir(basedir, info.getEncodedName(), - family.getName())); - - // Go through the 'mapdir' and 'infodir' together, make sure that all - // MapFiles are in a reliable state. Every entry in 'mapdir' must have a - // corresponding one in 'loginfodir'. Without a corresponding log info - // file, the entry in 'mapdir' must be deleted. - // loadHStoreFiles also computes the max sequence id internally. - this.maxSeqId = -1L; - this.storefiles.putAll(loadHStoreFiles(infodir, mapdir)); - if (LOG.isDebugEnabled() && this.storefiles.size() > 0) { - LOG.debug("Loaded " + this.storefiles.size() + " file(s) in hstore " + - Bytes.toString(this.storeName) + ", max sequence id " + this.maxSeqId); - } - - // Do reconstruction log. 
- runReconstructionLog(reconstructionLog, this.maxSeqId, reporter); - - // Finally, start up all the map readers! - setupReaders(); - } - - /* - * Setup the mapfile readers for this store. There could be more than one - * since we haven't compacted yet. - * @throws IOException - */ - private void setupReaders() throws IOException { - boolean first = true; - for(Map.Entry e: this.storefiles.entrySet()) { - MapFile.Reader r = null; - if (first) { - // Use a block cache (if configured) for the first reader only - // so as to control memory usage. - r = e.getValue().getReader(this.fs, this.family.isBloomfilter(), - family.isBlockCacheEnabled()); - first = false; - } else { - r = e.getValue().getReader(this.fs, this.family.isBloomfilter(), - false); - } - this.readers.put(e.getKey(), r); - } - } - - /* - * @param dir If doesn't exist, create it. - * @return Passed dir. - * @throws IOException - */ - private Path checkdir(final Path dir) throws IOException { - if (!fs.exists(dir)) { - fs.mkdirs(dir); - } - return dir; - } - - HColumnDescriptor getFamily() { - return this.family; - } - - long getMaxSequenceId() { - return this.maxSeqId; - } - - /* - * Run reconstuction log - * @param reconstructionLog - * @param msid - * @param reporter - * @throws IOException - */ - private void runReconstructionLog(final Path reconstructionLog, - final long msid, final Progressable reporter) - throws IOException { - try { - doReconstructionLog(reconstructionLog, msid, reporter); - } catch (EOFException e) { - // Presume we got here because of lack of HADOOP-1700; for now keep going - // but this is probably not what we want long term. If we got here there - // has been data-loss - LOG.warn("Exception processing reconstruction log " + reconstructionLog + - " opening " + this.storeName + - " -- continuing. Probably lack-of-HADOOP-1700 causing DATA LOSS!", e); - } catch (IOException e) { - // Presume we got here because of some HDFS issue. Don't just keep going. - // Fail to open the HStore. Probably means we'll fail over and over - // again until human intervention but alternative has us skipping logs - // and losing edits: HBASE-642. - LOG.warn("Exception processing reconstruction log " + reconstructionLog + - " opening " + this.storeName, e); - throw e; - } - } - - /* - * Read the reconstructionLog to see whether we need to build a brand-new - * MapFile out of non-flushed log entries. - * - * We can ignore any log message that has a sequence ID that's equal to or - * lower than maxSeqID. (Because we know such log messages are already - * reflected in the MapFiles.) - */ - @SuppressWarnings("unchecked") - private void doReconstructionLog(final Path reconstructionLog, - final long maxSeqID, final Progressable reporter) - throws UnsupportedEncodingException, IOException { - if (reconstructionLog == null || !fs.exists(reconstructionLog)) { - // Nothing to do. - return; - } - // Check its not empty. - FileStatus[] stats = fs.listStatus(reconstructionLog); - if (stats == null || stats.length == 0) { - LOG.warn("Passed reconstruction log " + reconstructionLog + " is zero-length"); - return; - } - long maxSeqIdInLog = -1; - TreeMap reconstructedCache = - new TreeMap(new HStoreKey.HStoreKeyWritableComparator(this.info)); - - SequenceFile.Reader logReader = new SequenceFile.Reader(this.fs, - reconstructionLog, this.conf); - - try { - HLogKey key = new HLogKey(); - HLogEdit val = new HLogEdit(); - long skippedEdits = 0; - long editsCount = 0; - // How many edits to apply before we send a progress report. 
- int reportInterval = this.conf.getInt("hbase.hstore.report.interval.edits", 2000); - while (logReader.next(key, val)) { - maxSeqIdInLog = Math.max(maxSeqIdInLog, key.getLogSeqNum()); - if (key.getLogSeqNum() <= maxSeqID) { - skippedEdits++; - continue; - } - // Check this edit is for me. Also, guard against writing - // METACOLUMN info such as HBASE::CACHEFLUSH entries - byte [] column = val.getColumn(); - if (val.isTransactionEntry() || Bytes.equals(column, HLog.METACOLUMN) - || !Bytes.equals(key.getRegionName(), info.getRegionName()) - || !HStoreKey.matchingFamily(family.getName(), column)) { - continue; - } - HStoreKey k = new HStoreKey(key.getRow(), column, val.getTimestamp(), - this.info); - reconstructedCache.put(k, val.getVal()); - editsCount++; - // Every 2k edits, tell the reporter we're making progress. - // Have seen 60k edits taking 3minutes to complete. - if (reporter != null && (editsCount % reportInterval) == 0) { - reporter.progress(); - } - } - if (LOG.isDebugEnabled()) { - LOG.debug("Applied " + editsCount + ", skipped " + skippedEdits + - " because sequence id <= " + maxSeqID); - } - } finally { - logReader.close(); - } - - if (reconstructedCache.size() > 0) { - // We create a "virtual flush" at maxSeqIdInLog+1. - if (LOG.isDebugEnabled()) { - LOG.debug("flushing reconstructionCache"); - } - internalFlushCache(reconstructedCache, maxSeqIdInLog + 1); - } - } - - /* - * Creates a series of HStoreFiles loaded from the given directory. - * There must be a matching 'mapdir' and 'loginfo' pair of files. - * If only one exists, we'll delete it. Does other consistency tests - * checking files are not zero, etc. - * - * @param infodir qualified path for info file directory - * @param mapdir qualified path for map file directory - * @throws IOException - */ - private SortedMap loadHStoreFiles(Path infodir, Path mapdir) - throws IOException { - // Look first at info files. If a reference, these contain info we need - // to create the HStoreFile. - FileStatus infofiles[] = fs.listStatus(infodir); - SortedMap results = new TreeMap(); - ArrayList mapfiles = new ArrayList(infofiles.length); - for (int i = 0; i < infofiles.length; i++) { - Path p = infofiles[i].getPath(); - // Check for empty info file. Should never be the case but can happen - // after data loss in hdfs for whatever reason (upgrade, etc.): HBASE-646 - if (this.fs.getFileStatus(p).getLen() <= 0) { - LOG.warn("Skipping " + p + " because its empty. DATA LOSS? Can " + - "this scenario be repaired? HBASE-646"); - continue; - } - - Matcher m = REF_NAME_PARSER.matcher(p.getName()); - /* - * * * * * N O T E * * * * * - * - * We call isReference(Path, Matcher) here because it calls - * Matcher.matches() which must be called before Matcher.group(int) - * and we don't want to call Matcher.matches() twice. - * - * * * * * N O T E * * * * * - */ - boolean isReference = isReference(p, m); - long fid = Long.parseLong(m.group(1)); - - HStoreFile curfile = null; - Reference reference = null; - if (isReference) { - reference = HStoreFile.readSplitInfo(p, fs); - } - curfile = new HStoreFile(conf, fs, basedir, this.info, - family.getName(), fid, reference); - long storeSeqId = -1; - try { - storeSeqId = curfile.loadInfo(fs); - if (storeSeqId > this.maxSeqId) { - this.maxSeqId = storeSeqId; - } - } catch (IOException e) { - // If the HSTORE_LOGINFOFILE doesn't contain a number, just ignore it. - // That means it was built prior to the previous run of HStore, and so - // it cannot contain any updates also contained in the log. 
- LOG.info("HSTORE_LOGINFOFILE " + curfile + - " does not contain a sequence number - ignoring"); - } - Path mapfile = curfile.getMapFilePath(); - if (!fs.exists(mapfile)) { - fs.delete(curfile.getInfoFilePath(), false); - LOG.warn("Mapfile " + mapfile.toString() + " does not exist. " + - "Cleaned up info file. Continuing...Probable DATA LOSS!!!"); - continue; - } - // References don't have data or index components under mapfile. - if (!isReference && isEmptyDataFile(mapfile)) { - curfile.delete(); - // We can have empty data file if data loss in hdfs. - LOG.warn("Mapfile " + mapfile.toString() + " has empty data. " + - "Deleting. Continuing...Probable DATA LOSS!!! See HBASE-646."); - continue; - } - if (!isReference && isEmptyIndexFile(mapfile)) { - try { - // Try fixing this file.. if we can. Use the hbase version of fix. - // Need to remove the old index file first else fix won't go ahead. - this.fs.delete(new Path(mapfile, MapFile.INDEX_FILE_NAME), false); - // TODO: This is going to fail if we are to rebuild a file from - // meta because it won't have right comparator: HBASE-848. - long count = MapFile.fix(this.fs, mapfile, HStoreKey.class, - HBaseMapFile.VALUE_CLASS, false, this.conf); - if (LOG.isDebugEnabled()) { - LOG.debug("Fixed index on " + mapfile.toString() + "; had " + - count + " entries"); - } - } catch (Exception e) { - LOG.warn("Failed fix of " + mapfile.toString() + - "...continuing; Probable DATA LOSS!!!", e); - continue; - } - } - long length = curfile.length(); - storeSize += length; - - // TODO: Confirm referent exists. - - // Found map and sympathetic info file. Add this hstorefile to result. - if (LOG.isDebugEnabled()) { - LOG.debug("loaded " + FSUtils.getPath(p) + ", isReference=" + - isReference + ", sequence id=" + storeSeqId + - ", length=" + length + ", majorCompaction=" + - curfile.isMajorCompaction()); - } - results.put(Long.valueOf(storeSeqId), curfile); - // Keep list of sympathetic data mapfiles for cleaning info dir in next - // section. Make sure path is fully qualified for compare. - mapfiles.add(this.fs.makeQualified(mapfile)); - } - cleanDataFiles(mapfiles, mapdir); - return results; - } - - /* - * If no info file delete the sympathetic data file. - * @param mapfiles List of mapfiles. - * @param mapdir Directory to check. - * @throws IOException - */ - private void cleanDataFiles(final List mapfiles, final Path mapdir) - throws IOException { - // List paths by experience returns fully qualified names -- at least when - // running on a mini hdfs cluster. - FileStatus [] datfiles = fs.listStatus(mapdir); - for (int i = 0; i < datfiles.length; i++) { - Path p = datfiles[i].getPath(); - // If does not have sympathetic info file, delete. - Path qualifiedP = fs.makeQualified(p); - if (!mapfiles.contains(qualifiedP)) { - fs.delete(p, true); - } - } - } - - /* - * @param mapfile - * @return True if the passed mapfile has a zero-length data component (its - * broken). - * @throws IOException - */ - private boolean isEmptyDataFile(final Path mapfile) - throws IOException { - // Mapfiles are made of 'data' and 'index' files. Confirm 'data' is - // non-null if it exists (may not have been written to yet). - return isEmptyFile(new Path(mapfile, MapFile.DATA_FILE_NAME)); - } - - /* - * @param mapfile - * @return True if the passed mapfile has a zero-length index component (its - * broken). - * @throws IOException - */ - private boolean isEmptyIndexFile(final Path mapfile) - throws IOException { - // Mapfiles are made of 'data' and 'index' files. 
Confirm 'data' is - // non-null if it exists (may not have been written to yet). - return isEmptyFile(new Path(mapfile, MapFile.INDEX_FILE_NAME)); - } - - /* - * @param f - * @return True if the passed file does not exist or is zero-length (its - * broken). - * @throws IOException - */ - private boolean isEmptyFile(final Path f) - throws IOException { - return !this.fs.exists(f) || this.fs.getFileStatus(f).getLen() == 0; - } - - /** - * Adds a value to the memcache - * - * @param key - * @param value - * @return memcache size delta - */ - protected long add(HStoreKey key, byte[] value) { - lock.readLock().lock(); - try { - return this.memcache.add(key, value); - } finally { - lock.readLock().unlock(); - } - } - - /** - * Close all the MapFile readers - * - * We don't need to worry about subsequent requests because the HRegion holds - * a write lock that will prevent any more reads or writes. - * - * @throws IOException - */ - List close() throws IOException { - ArrayList result = null; - this.lock.writeLock().lock(); - try { - for (MapFile.Reader reader: this.readers.values()) { - reader.close(); - } - synchronized (this.storefiles) { - result = new ArrayList(storefiles.values()); - } - LOG.debug("closed " + this.storeNameStr); - return result; - } finally { - this.lock.writeLock().unlock(); - } - } - - ////////////////////////////////////////////////////////////////////////////// - // Flush changes to disk - ////////////////////////////////////////////////////////////////////////////// - - /** - * Snapshot this stores memcache. Call before running - * {@link #flushCache(long)} so it has some work to do. - */ - void snapshot() { - this.memcache.snapshot(); - } - - /** - * Write out current snapshot. Presumes {@link #snapshot()} has been called - * previously. - * @param logCacheFlushId flush sequence number - * @return true if a compaction is needed - * @throws IOException - */ - boolean flushCache(final long logCacheFlushId) throws IOException { - // Get the snapshot to flush. Presumes that a call to - // this.memcache.snapshot() has happened earlier up in the chain. - SortedMap cache = this.memcache.getSnapshot(); - boolean compactionNeeded = internalFlushCache(cache, logCacheFlushId); - // If an exception happens flushing, we let it out without clearing - // the memcache snapshot. The old snapshot will be returned when we say - // 'snapshot', the next time flush comes around. - this.memcache.clearSnapshot(cache); - return compactionNeeded; - } - - private boolean internalFlushCache(final SortedMap cache, - final long logCacheFlushId) - throws IOException { - long flushed = 0; - // Don't flush if there are no entries. - if (cache.size() == 0) { - return false; - } - - // TODO: We can fail in the below block before we complete adding this - // flush to list of store files. Add cleanup of anything put on filesystem - // if we fail. - synchronized(flushLock) { - long now = System.currentTimeMillis(); - // A. Write the Maps out to the disk - HStoreFile flushedFile = new HStoreFile(conf, fs, basedir, - this.info, family.getName(), -1L, null); - MapFile.Writer out = flushedFile.getWriter(this.fs, this.compression, - this.family.isBloomfilter(), cache.size()); - setIndexInterval(out); - - // Here we tried picking up an existing HStoreFile from disk and - // interlacing the memcache flush compacting as we go. The notion was - // that interlacing would take as long as a pure flush with the added - // benefit of having one less file in the store. 
Experiments showed that - // it takes two to three times the amount of time flushing -- more column - // families makes it so the two timings come closer together -- but it - // also complicates the flush. The code was removed. Needed work picking - // which file to interlace (favor references first, etc.) - // - // Related, looks like 'merging compactions' in BigTable paper interlaces - // a memcache flush. We don't. - int entries = 0; - try { - for (Map.Entry es: cache.entrySet()) { - HStoreKey curkey = es.getKey(); - byte[] bytes = es.getValue(); - if (HStoreKey.matchingFamily(this.family.getName(), curkey.getColumn())) { - if (!isExpired(curkey, ttl, now)) { - entries++; - out.append(curkey, new ImmutableBytesWritable(bytes)); - flushed += this.memcache.heapSize(curkey, bytes, null); - } - } - } - } finally { - out.close(); - } - long newStoreSize = flushedFile.length(); - storeSize += newStoreSize; - - // B. Write out the log sequence number that corresponds to this output - // MapFile. The MapFile is current up to and including the log seq num. - flushedFile.writeInfo(fs, logCacheFlushId); - - // C. Finally, make the new MapFile available. - updateReaders(logCacheFlushId, flushedFile); - if(LOG.isDebugEnabled()) { - LOG.debug("Added " + FSUtils.getPath(flushedFile.getMapFilePath()) + - " with " + entries + - " entries, sequence id " + logCacheFlushId + ", data size ~" + - StringUtils.humanReadableInt(flushed) + ", file size " + - StringUtils.humanReadableInt(newStoreSize) + " to " + - this.info.getRegionNameAsString()); - } - } - return storefiles.size() >= compactionThreshold; - } - - /* - * Change readers adding into place the Reader produced by this new flush. - * @param logCacheFlushId - * @param flushedFile - * @throws IOException - */ - private void updateReaders(final long logCacheFlushId, - final HStoreFile flushedFile) - throws IOException { - this.lock.writeLock().lock(); - try { - Long flushid = Long.valueOf(logCacheFlushId); - // Open the map file reader. - this.readers.put(flushid, - flushedFile.getReader(this.fs, this.family.isBloomfilter(), - this.family.isBlockCacheEnabled())); - this.storefiles.put(flushid, flushedFile); - // Tell listeners of the change in readers. - notifyChangedReadersObservers(); - } finally { - this.lock.writeLock().unlock(); - } - } - - /* - * Notify all observers that set of Readers has changed. - * @throws IOException - */ - private void notifyChangedReadersObservers() throws IOException { - for (ChangedReadersObserver o: this.changedReaderObservers) { - o.updateReaders(); - } - } - - /* - * @param o Observer who wants to know about changes in set of Readers - */ - void addChangedReaderObserver(ChangedReadersObserver o) { - this.changedReaderObservers.add(o); - } - - /* - * @param o Observer no longer interested in changes in set of Readers. - */ - void deleteChangedReaderObserver(ChangedReadersObserver o) { - if (!this.changedReaderObservers.remove(o)) { - LOG.warn("Not in set" + o); - } - } - - ////////////////////////////////////////////////////////////////////////////// - // Compaction - ////////////////////////////////////////////////////////////////////////////// - - /* - * @param files - * @return True if any of the files in files are References. 
- */ - private boolean hasReferences(Collection files) { - if (files != null && files.size() > 0) { - for (HStoreFile hsf: files) { - if (hsf.isReference()) { - return true; - } - } - } - return false; - } - - /* - * Gets lowest timestamp from files in a dir - * - * @param fs - * @param dir - * @throws IOException - */ - private static long getLowestTimestamp(FileSystem fs, Path dir) - throws IOException { - FileStatus[] stats = fs.listStatus(dir); - if (stats == null || stats.length == 0) { - return 0l; - } - long lowTimestamp = Long.MAX_VALUE; - for (int i = 0; i < stats.length; i++) { - long timestamp = stats[i].getModificationTime(); - if (timestamp < lowTimestamp){ - lowTimestamp = timestamp; - } - } - return lowTimestamp; - } - - /** - * Compact the back-HStores. This method may take some time, so the calling - * thread must be able to block for long periods. - * - *
During this time, the HStore can work as usual, getting values from - * MapFiles and writing new MapFiles from the Memcache. - * - * Existing MapFiles are not destroyed until the new compacted TreeMap is - * completely written-out to disk. - * - * The compactLock prevents multiple simultaneous compactions. - * The structureLock prevents us from interfering with other write operations. - * - * We don't want to hold the structureLock for the whole time, as a compact() - * can be lengthy and we want to allow cache-flushes during this period. - * - * @param majorCompaction True to force a major compaction regardless of - * thresholds - * @return mid key if a split is needed, null otherwise - * @throws IOException - */ - StoreSize compact(final boolean majorCompaction) throws IOException { - boolean forceSplit = this.info.shouldSplit(false); - boolean doMajorCompaction = majorCompaction; - synchronized (compactLock) { - long maxId = -1; - List filesToCompact = null; - synchronized (storefiles) { - if (this.storefiles.size() <= 0) { - LOG.debug(this.storeNameStr + ": no store files to compact"); - return null; - } - // filesToCompact are sorted oldest to newest. - filesToCompact = new ArrayList(this.storefiles.values()); - - // The max-sequenceID in any of the to-be-compacted TreeMaps is the - // last key of storefiles. - maxId = this.storefiles.lastKey().longValue(); - } - // Check to see if we need to do a major compaction on this region. - // If so, change doMajorCompaction to true to skip the incremental - // compacting below. Only check if doMajorCompaction is not true. - if (!doMajorCompaction) { - doMajorCompaction = isMajorCompaction(filesToCompact); - } - boolean references = hasReferences(filesToCompact); - if (!doMajorCompaction && !references && - (forceSplit || (filesToCompact.size() < compactionThreshold))) { - return checkSplit(forceSplit); - } - if (!fs.exists(compactionDir) && !fs.mkdirs(compactionDir)) { - LOG.warn("Mkdir on " + compactionDir.toString() + " failed"); - return checkSplit(forceSplit); - } - - // HBASE-745, preparing all store file sizes for incremental compacting - // selection. - int countOfFiles = filesToCompact.size(); - long totalSize = 0; - long[] fileSizes = new long[countOfFiles]; - long skipped = 0; - int point = 0; - for (int i = 0; i < countOfFiles; i++) { - HStoreFile file = filesToCompact.get(i); - Path path = file.getMapFilePath(); - if (path == null) { - LOG.warn("Path is null for " + file); - return null; - } - int len = 0; - // listStatus can come back null. - FileStatus [] fss = this.fs.listStatus(path); - for (int ii = 0; fss != null && ii < fss.length; ii++) { - len += fss[ii].getLen(); - } - fileSizes[i] = len; - totalSize += len; - } - if (!doMajorCompaction && !references) { - // Here we select files for incremental compaction. - // The rule is: if the largest(oldest) one is more than twice the - // size of the second, skip the largest, and continue to next..., - // until we meet the compactionThreshold limit. 
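// Illustration of the selection rule above: with store file sizes [500, 200, 90, 80] (oldest to newest) and the default hbase.hstore.compaction.max of 10, the loop below skips 500 and 200 (each at least twice the size of the next file) and selects only [90, 80] for compaction.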
- for (point = 0; point < countOfFiles - 1; point++) { - if ((fileSizes[point] < fileSizes[point + 1] * 2) && - (countOfFiles - point) <= maxFilesToCompact) { - break; - } - skipped += fileSizes[point]; - } - filesToCompact = new ArrayList(filesToCompact.subList(point, - countOfFiles)); - if (filesToCompact.size() <= 1) { - if (LOG.isDebugEnabled()) { - LOG.debug("Skipped compaction of 1 file; compaction size of " + - this.storeNameStr + ": " + - StringUtils.humanReadableInt(totalSize) + "; Skipped " + point + - " files, size: " + skipped); - } - return checkSplit(forceSplit); - } - if (LOG.isDebugEnabled()) { - LOG.debug("Compaction size of " + this.storeNameStr + ": " + - StringUtils.humanReadableInt(totalSize) + "; Skipped " + point + - " file(s), size: " + skipped); - } - } - - /* - * We create a new list of MapFile.Reader objects so we don't screw up - * the caching associated with the currently-loaded ones. Our iteration- - * based access pattern is practically designed to ruin the cache. - */ - List rdrs = new ArrayList(); - int nrows = createReaders(rdrs, filesToCompact); - - // Step through them, writing to the brand-new MapFile - HStoreFile compactedOutputFile = new HStoreFile(conf, fs, - this.compactionDir, this.info, family.getName(), -1L, null); - if (LOG.isDebugEnabled()) { - LOG.debug("Started compaction of " + rdrs.size() + " file(s)" + - (references? ", hasReferences=true,": " ") + " into " + - FSUtils.getPath(compactedOutputFile.getMapFilePath())); - } - MapFile.Writer writer = compactedOutputFile.getWriter(this.fs, - this.compression, this.family.isBloomfilter(), nrows); - setIndexInterval(writer); - try { - compact(writer, rdrs, doMajorCompaction); - } finally { - writer.close(); - } - - // Now, write out an HSTORE_LOGINFOFILE for the brand-new TreeMap. - compactedOutputFile.writeInfo(fs, maxId, doMajorCompaction); - - // Move the compaction into place. - completeCompaction(filesToCompact, compactedOutputFile); - if (LOG.isDebugEnabled()) { - LOG.debug("Completed " + (doMajorCompaction? "major": "") + - " compaction of " + this.storeNameStr + - " store size is " + StringUtils.humanReadableInt(storeSize)); - } - } - return checkSplit(forceSplit); - } - - /* - * Set the index interval for the mapfile. There are two sources for - * configuration information: the HCD, and the global HBase config. - * If a source returns the default value, it is ignored. Otherwise, - * the smallest non-default value is preferred. - */ - private void setIndexInterval(MapFile.Writer writer) { - int familyInterval = this.family.getMapFileIndexInterval(); - int interval = this.conf.getInt("hbase.io.index.interval", - HColumnDescriptor.DEFAULT_MAPFILE_INDEX_INTERVAL); - if (familyInterval != HColumnDescriptor.DEFAULT_MAPFILE_INDEX_INTERVAL) { - if (interval != HColumnDescriptor.DEFAULT_MAPFILE_INDEX_INTERVAL) { - if (familyInterval < interval) - interval = familyInterval; - } else { - interval = familyInterval; - } - } - writer.setIndexInterval(interval); - } - - /* - * @return True if we should run a major compaction. - */ - boolean isMajorCompaction() throws IOException { - List filesToCompact = null; - synchronized (storefiles) { - // filesToCompact are sorted oldest to newest. - filesToCompact = new ArrayList(this.storefiles.values()); - } - return isMajorCompaction(filesToCompact); - } - - /* - * @param filesToCompact Files to compact. Can be null. - * @return True if we should run a major compaction. 
- */ - private boolean isMajorCompaction(final List filesToCompact) - throws IOException { - boolean result = false; - Path mapdir = HStoreFile.getMapDir(this.basedir, this.info.getEncodedName(), - this.family.getName()); - long lowTimestamp = getLowestTimestamp(fs, mapdir); - long now = System.currentTimeMillis(); - if (lowTimestamp > 0l && lowTimestamp < (now - this.majorCompactionTime)) { - // Major compaction time has elapsed. - long elapsedTime = now - lowTimestamp; - if (filesToCompact != null && filesToCompact.size() == 1 && - filesToCompact.get(0).isMajorCompaction() && - (this.ttl == HConstants.FOREVER || elapsedTime < this.ttl)) { - if (LOG.isDebugEnabled()) { - LOG.debug("Skipping major compaction of " + this.storeNameStr + - " because one (major) compacted file only and elapsedTime " + - elapsedTime + "ms is < ttl=" + this.ttl); - } - } else { - if (LOG.isDebugEnabled()) { - LOG.debug("Major compaction triggered on store " + this.storeNameStr + - "; time since last major compaction " + (now - lowTimestamp) + "ms"); - } - result = true; - } - } - return result; - } - - /* - * Create readers for the passed in list of HStoreFiles and add them to - * readers list. Used compacting. - * @param readers Add Readers here. - * @param files List of HSFs to make Readers for. - * @return Count of rows for bloom filter sizing. Returns -1 if no bloom - * filter wanted. - */ - private int createReaders(final List rs, - final List files) - throws IOException { - /* We create a new list of MapFile.Reader objects so we don't screw up - * the caching associated with the currently-loaded ones. Our iteration- - * based access pattern is practically designed to ruin the cache. - */ - int nrows = -1; - for (HStoreFile file: files) { - try { - // TODO: Readers are opened without block-cache enabled. Means we don't - // get the prefetch that makes the read faster. But we don't want to - // enable block-cache for these readers that are about to be closed. - // The compaction of soon-to-be closed readers will probably force out - // blocks that may be needed servicing real-time requests whereas - // compaction runs in background. TODO: We know we're going to read - // this file straight through. Leverage this fact. Use a big buffer - // client side to speed things up or read it all up into memory one file - // at a time or pull local and memory-map the file but leave the writer - // up in hdfs? - BloomFilterMapFile.Reader reader = file.getReader(fs, false, false); - rs.add(reader); - // Compute the size of the new bloomfilter if needed - if (this.family.isBloomfilter()) { - nrows += reader.getBloomFilterSize(); - } - } catch (IOException e) { - // Add info about which file threw exception. It may not be in the - // exception message so output a message here where we know the - // culprit. 
- LOG.warn("Failed with " + e.toString() + ": " + file.toString()); - closeCompactionReaders(rs); - throw e; - } - } - return nrows; - } - - /* - * @param r List to reverse - * @return A reversed array of content of readers - */ - private MapFile.Reader [] reverse(final List r) { - List copy = new ArrayList(r); - Collections.reverse(copy); - return copy.toArray(new MapFile.Reader[0]); - } - - /* - * @param rdrs List of readers - * @param keys Current keys - * @param done Which readers are done - * @return The lowest current key in passed rdrs - */ - private int getLowestKey(final MapFile.Reader [] rdrs, - final HStoreKey [] keys, final boolean [] done) { - int lowestKey = -1; - for (int i = 0; i < rdrs.length; i++) { - if (done[i]) { - continue; - } - if (lowestKey < 0) { - lowestKey = i; - } else { - if (keys[i].compareTo(keys[lowestKey]) < 0) { - lowestKey = i; - } - } - } - return lowestKey; - } - - /* - * Compact a list of MapFile.Readers into MapFile.Writer. - * - * We work by iterating through the readers in parallel looking at newest - * store file first. We always increment the lowest-ranked one. Updates to a - * single row/column will appear ranked by timestamp. - * @param compactedOut Where to write compaction. - * @param pReaders List of readers sorted oldest to newest. - * @param majorCompaction True to force a major compaction regardless of - * thresholds - * @throws IOException - */ - private void compact(final MapFile.Writer compactedOut, - final List pReaders, final boolean majorCompaction) - throws IOException { - // Reverse order so newest store file is first. - MapFile.Reader[] rdrs = reverse(pReaders); - try { - HStoreKey [] keys = new HStoreKey[rdrs.length]; - ImmutableBytesWritable [] vals = new ImmutableBytesWritable[rdrs.length]; - boolean [] done = new boolean[rdrs.length]; - for(int i = 0; i < rdrs.length; i++) { - keys[i] = new HStoreKey(HConstants.EMPTY_BYTE_ARRAY, this.info); - vals[i] = new ImmutableBytesWritable(); - done[i] = false; - } - - // Now, advance through the readers in order. This will have the - // effect of a run-time sort of the entire dataset. - int numDone = 0; - for (int i = 0; i < rdrs.length; i++) { - rdrs[i].reset(); - done[i] = !rdrs[i].next(keys[i], vals[i]); - if (done[i]) { - numDone++; - } - } - - long now = System.currentTimeMillis(); - int timesSeen = 0; - HStoreKey lastSeen = new HStoreKey(); - HStoreKey lastDelete = null; - while (numDone < done.length) { - // Get lowest key in all store files. - int lowestKey = getLowestKey(rdrs, keys, done); - HStoreKey sk = keys[lowestKey]; - // If its same row and column as last key, increment times seen. - if (HStoreKey.equalsTwoRowKeys(info, lastSeen.getRow(), sk.getRow()) - && Bytes.equals(lastSeen.getColumn(), sk.getColumn())) { - timesSeen++; - // Reset last delete if not exact timestamp -- lastDelete only stops - // exactly the same key making it out to the compacted store file. - if (lastDelete != null && - lastDelete.getTimestamp() != sk.getTimestamp()) { - lastDelete = null; - } - } else { - timesSeen = 1; - lastDelete = null; - } - - // Don't write empty rows or columns. Only remove cells on major - // compaction. Remove if expired of > VERSIONS - if (sk.getRow().length != 0 && sk.getColumn().length != 0) { - ImmutableBytesWritable value = vals[lowestKey]; - if (!majorCompaction) { - // Write out all values if not a major compaction. 
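// In other words: a minor compaction copies every cell through unchanged, while the major-compaction branch below drops delete markers and any cell that exceeds the family's maximum versions, has outlived the TTL, or is masked by a delete.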
- compactedOut.append(sk, value); - } else { - boolean expired = false; - boolean deleted = false; - if (timesSeen <= family.getMaxVersions() && - !(expired = isExpired(sk, ttl, now))) { - // If this value key is same as a deleted key, skip - if (lastDelete != null && sk.equals(lastDelete)) { - deleted = true; - } else if (HLogEdit.isDeleted(value.get())) { - // If a deleted value, skip - deleted = true; - lastDelete = new HStoreKey(sk); - } else { - compactedOut.append(sk, vals[lowestKey]); - } - } - if (expired || deleted) { - // HBASE-855 remove one from timesSeen because it did not make it - // past expired check -- don't count against max versions. - timesSeen--; - } - } - } - - // Update last-seen items - lastSeen = new HStoreKey(sk); - - // Advance the smallest key. If that reader's all finished, then - // mark it as done. - if (!rdrs[lowestKey].next(keys[lowestKey], vals[lowestKey])) { - done[lowestKey] = true; - rdrs[lowestKey].close(); - rdrs[lowestKey] = null; - numDone++; - } - } - } finally { - closeCompactionReaders(Arrays.asList(rdrs)); - } - } - - private void closeCompactionReaders(final List rdrs) { - for (MapFile.Reader r: rdrs) { - try { - if (r != null) { - r.close(); - } - } catch (IOException e) { - LOG.warn("Exception closing reader for " + this.storeNameStr, e); - } - } - } - - /* - * It's assumed that the compactLock will be acquired prior to calling this - * method! Otherwise, it is not thread-safe! - * - * It works by processing a compaction that's been written to disk. - * - *

It is usually invoked at the end of a compaction, but might also be - * invoked at HStore startup, if the prior execution died midway through. - * - *

Moving the compacted TreeMap into place means: - *

-   * 1) Moving the new compacted MapFile into place
-   * 2) Unloading all replaced MapFiles, closing them and collecting the list of files to delete.
-   * 3) Loading the new TreeMap.
-   * 4) Computing the new store size (a condensed sketch of steps 2 and 3 follows this list)
-   * 
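A condensed, generic sketch of the ordering the list above relies on: the replaced readers are closed and unloaded, the compacted file is published, observers are notified, and only then are the old files deleted. Every name and type below is an illustrative stand-in, not an actual HStore member.

    import java.io.Closeable;
    import java.io.IOException;
    import java.util.List;
    import java.util.SortedMap;

    class CompactionSwapSketch<F, R extends Closeable> {
      // files and readers are keyed by sequence id, as in HStore.
      void swapCompactedFile(SortedMap<Long, F> files, SortedMap<Long, R> readers,
          List<Long> replacedKeys, long newKey, F newFile, R newReader,
          Runnable notifyObservers, Runnable deleteOldFiles) throws IOException {
        synchronized (files) {
          for (Long k : replacedKeys) {
            R r = readers.remove(k);
            if (r != null) {
              r.close();                  // step 2: unload and close replaced readers
            }
            files.remove(k);
          }
          readers.put(newKey, newReader); // step 3: publish the compacted file
          files.put(newKey, newFile);
          notifyObservers.run();          // tell scanners the reader set changed...
          deleteOldFiles.run();           // ...before the replaced files are removed
        }
      }
    }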
- * - * @param compactedFiles list of files that were compacted - * @param compactedFile HStoreFile that is the result of the compaction - * @throws IOException - */ - private void completeCompaction(final List compactedFiles, - final HStoreFile compactedFile) - throws IOException { - this.lock.writeLock().lock(); - try { - // 1. Moving the new MapFile into place. - HStoreFile finalCompactedFile = new HStoreFile(conf, fs, basedir, - this.info, family.getName(), -1, null, - compactedFile.isMajorCompaction()); - if (LOG.isDebugEnabled()) { - LOG.debug("moving " + FSUtils.getPath(compactedFile.getMapFilePath()) + - " to " + FSUtils.getPath(finalCompactedFile.getMapFilePath())); - } - if (!compactedFile.rename(this.fs, finalCompactedFile)) { - LOG.error("Failed move of compacted file " + - finalCompactedFile.getMapFilePath().toString()); - return; - } - - // 2. Unload all replaced MapFiles, close and collect list to delete. - synchronized (storefiles) { - Map toDelete = new HashMap(); - for (Map.Entry e : this.storefiles.entrySet()) { - if (!compactedFiles.contains(e.getValue())) { - continue; - } - Long key = e.getKey(); - MapFile.Reader reader = this.readers.remove(key); - if (reader != null) { - reader.close(); - } - toDelete.put(key, e.getValue()); - } - - try { - // 3. Loading the new TreeMap. - // Change this.storefiles so it reflects new state but do not - // delete old store files until we have sent out notification of - // change in case old files are still being accessed by outstanding - // scanners. - for (Long key : toDelete.keySet()) { - this.storefiles.remove(key); - } - // Add new compacted Reader and store file. - Long orderVal = Long.valueOf(finalCompactedFile.loadInfo(fs)); - this.readers.put(orderVal, - // Use a block cache (if configured) for this reader since - // it is the only one. - finalCompactedFile.getReader(this.fs, - this.family.isBloomfilter(), - this.family.isBlockCacheEnabled())); - this.storefiles.put(orderVal, finalCompactedFile); - // Tell observers that list of Readers has changed. - notifyChangedReadersObservers(); - // Finally, delete old store files. - for (HStoreFile hsf : toDelete.values()) { - hsf.delete(); - } - } catch (IOException e) { - e = RemoteExceptionHandler.checkIOException(e); - LOG.error("Failed replacing compacted files for " + - this.storeNameStr + - ". Compacted file is " + finalCompactedFile.toString() + - ". Files replaced are " + compactedFiles.toString() + - " some of which may have been already removed", e); - } - // 4. Compute new store size - storeSize = 0L; - for (HStoreFile hsf : storefiles.values()) { - storeSize += hsf.length(); - } - } - } finally { - this.lock.writeLock().unlock(); - } - } - - // //////////////////////////////////////////////////////////////////////////// - // Accessors. - // (This is the only section that is directly useful!) - ////////////////////////////////////////////////////////////////////////////// - - /** - * Return all the available columns for the given key. The key indicates a - * row and timestamp, but not a column name. - * - * The returned object should map column names to Cells. - */ - void getFull(HStoreKey key, final Set columns, - final int numVersions, Map results) - throws IOException { - int versions = versionsToReturn(numVersions); - - Map deletes = - new TreeMap(Bytes.BYTES_COMPARATOR); - - // if the key is null, we're not even looking for anything. return. - if (key == null) { - return; - } - - this.lock.readLock().lock(); - - // get from the memcache first. 
- memcache.getFull(key, columns, versions, deletes, results); - - try { - MapFile.Reader[] maparray = getReaders(); - - // examine each mapfile - for (int i = maparray.length - 1; i >= 0; i--) { - MapFile.Reader map = maparray[i]; - - // synchronize on the map so that no one else iterates it at the same - // time - getFullFromMapFile(map, key, columns, versions, deletes, results); - } - } finally { - this.lock.readLock().unlock(); - } - } - - private void getFullFromMapFile(MapFile.Reader map, HStoreKey key, - Set columns, int numVersions, Map deletes, - Map results) - throws IOException { - synchronized(map) { - long now = System.currentTimeMillis(); - - // seek back to the beginning - map.reset(); - - // seek to the closest key that should match the row we're looking for - ImmutableBytesWritable readval = new ImmutableBytesWritable(); - HStoreKey readkey = (HStoreKey)map.getClosest(key, readval); - if (readkey == null) { - return; - } - - do { - byte [] readcol = readkey.getColumn(); - - // if we're looking for this column (or all of them), and there isn't - // already a value for this column in the results map or there is a value - // but we haven't collected enough versions yet, and the key we - // just read matches, then we'll consider it - if ((columns == null || columns.contains(readcol)) - && (!results.containsKey(readcol) - || results.get(readcol).getNumValues() < numVersions) - && key.matchesWithoutColumn(readkey)) { - // if the value of the cell we're looking at right now is a delete, - // we need to treat it differently - if(HLogEdit.isDeleted(readval.get())) { - // if it's not already recorded as a delete or recorded with a more - // recent delete timestamp, record it for later - if (!deletes.containsKey(readcol) - || deletes.get(readcol).longValue() < readkey.getTimestamp()) { - deletes.put(readcol, Long.valueOf(readkey.getTimestamp())); - } - } else if (!(deletes.containsKey(readcol) - && deletes.get(readcol).longValue() >= readkey.getTimestamp()) ) { - // So the cell itself isn't a delete, but there may be a delete - // pending from earlier in our search. Only record this result if - // there aren't any pending deletes. - if (!(deletes.containsKey(readcol) && - deletes.get(readcol).longValue() >= readkey.getTimestamp())) { - if (!isExpired(readkey, ttl, now)) { - if (!results.containsKey(readcol)) { - results.put(readcol, - new Cell(readval.get(), readkey.getTimestamp())); - } else { - results.get(readcol).add(readval.get(), - readkey.getTimestamp()); - } - // need to reinstantiate the readval so we can reuse it, - // otherwise next iteration will destroy our result - readval = new ImmutableBytesWritable(); - } - } - } - } else if (HStoreKey.compareTwoRowKeys(info,key.getRow(), readkey.getRow()) < 0) { - // if we've crossed into the next row, then we can just stop - // iterating - break; - } - - } while(map.next(readkey, readval)); - } - } - - /** - * @return Array of readers ordered oldest to newest. - */ - public MapFile.Reader [] getReaders() { - return this.readers.values(). - toArray(new MapFile.Reader[this.readers.size()]); - } - - /* - * @param wantedVersions How many versions were asked for. - * @return wantedVersions or this families' MAX_VERSIONS. - */ - private int versionsToReturn(final int wantedVersions) { - if (wantedVersions <= 0) { - throw new IllegalArgumentException("Number of versions must be > 0"); - } - // Make sure we do not return more than maximum versions for this store. - return wantedVersions > this.family.getMaxVersions()? 
- this.family.getMaxVersions(): wantedVersions; - } - - /** - * Get the value for the indicated HStoreKey. Grab the target value and the - * previous numVersions - 1 values, as well. - * - * Use {@link HConstants.ALL_VERSIONS} to retrieve all versions. - * @param key - * @param numVersions Number of versions to fetch. Must be > 0. - * @return values for the specified versions - * @throws IOException - */ - Cell[] get(final HStoreKey key, final int numVersions) throws IOException { - // This code below is very close to the body of the getKeys method. Any - // changes in the flow below should also probably be done in getKeys. - // TODO: Refactor so same code used. - long now = System.currentTimeMillis(); - int versions = versionsToReturn(numVersions); - // Keep a list of deleted cell keys. We need this because as we go through - // the memcache and store files, the cell with the delete marker may be - // in one store and the old non-delete cell value in a later store. - // If we don't keep around the fact that the cell was deleted in a newer - // record, we end up returning the old value if user is asking for more - // than one version. This List of deletes should not be large since we - // are only keeping rows and columns that match those set on the get and - // which have delete values. If memory usage becomes an issue, could - // redo as bloom filter. - Set deletes = new HashSet(); - this.lock.readLock().lock(); - try { - // Check the memcache - List results = this.memcache.get(key, versions, deletes, now); - // If we got sufficient versions from memcache, return. - if (results.size() == versions) { - return results.toArray(new Cell[results.size()]); - } - MapFile.Reader[] maparray = getReaders(); - // Returned array is sorted with the most recent addition last. - for(int i = maparray.length - 1; - i >= 0 && !hasEnoughVersions(versions, results); i--) { - MapFile.Reader r = maparray[i]; - synchronized (r) { - // Do the priming read - ImmutableBytesWritable readval = new ImmutableBytesWritable(); - HStoreKey readkey = (HStoreKey)r.getClosest(key, readval); - if (readkey == null) { - // map.getClosest returns null if the passed key is > than the - // last key in the map file. getClosest is a bit of a misnomer - // since it returns exact match or the next closest key AFTER not - // BEFORE. We use getClosest because we're usually passed a - // key that has a timestamp of maximum long to indicate we want - // most recent update. - continue; - } - if (!readkey.matchesRowCol(key)) { - continue; - } - if (get(readkey, readval.get(), versions, results, deletes, now)) { - break; - } - for (readval = new ImmutableBytesWritable(); - r.next(readkey, readval) && readkey.matchesRowCol(key); - readval = new ImmutableBytesWritable()) { - if (get(readkey, readval.get(), versions, results, deletes, now)) { - break; - } - } - } - } - return results.size() == 0 ? - null : results.toArray(new Cell[results.size()]); - } finally { - this.lock.readLock().unlock(); - } - } - - /* - * Look at one key/value. - * @param key - * @param value - * @param versions - * @param results - * @param deletes - * @param now - * @return True if we have enough versions. - */ - private boolean get(final HStoreKey key, final byte [] value, - final int versions, final List results, - final Set deletes, final long now) { - if (!HLogEdit.isDeleted(value)) { - if (notExpiredAndNotInDeletes(this.ttl, key, now, deletes)) { - results.add(new Cell(value, key.getTimestamp())); - } - // Perhaps only one version is wanted. 
I could let this - // test happen later in the for loop test but it would cost - // the allocation of an ImmutableBytesWritable. - if (hasEnoughVersions(versions, results)) { - return true; - } - } else { - // Is this copy necessary? - deletes.add(new HStoreKey(key)); - } - return false; - } - - /* - * Small method to check if we are over the max number of versions - * or we acheived this family max versions. - * The later happens when we have the situation described in HBASE-621. - * @param versions - * @param c - * @return - */ - private boolean hasEnoughVersions(final int versions, final List c) { - return c.size() >= versions; - } - - /** - * Get versions of keys matching the origin key's - * row/column/timestamp and those of an older vintage. - * @param origin Where to start searching. - * @param versions How many versions to return. Pass - * {@link HConstants#ALL_VERSIONS} to retrieve all. - * @param now - * @param columnPattern regex pattern for column matching. if columnPattern - * is not null, we use column pattern to match columns. And the columnPattern - * only works when origin's column is null or its length is zero. - * @return Matching keys. - * @throws IOException - */ - public List getKeys(final HStoreKey origin, final int versions, - final long now, final Pattern columnPattern) - throws IOException { - // This code below is very close to the body of the get method. Any - // changes in the flow below should also probably be done in get. TODO: - // Refactor so same code used. - Set deletes = new HashSet(); - this.lock.readLock().lock(); - try { - // Check the memcache - List keys = - this.memcache.getKeys(origin, versions, deletes, now, columnPattern); - // If we got sufficient versions from memcache, return. - if (keys.size() >= versions) { - return keys; - } - MapFile.Reader[] maparray = getReaders(); - // Returned array is sorted with the most recent addition last. - for (int i = maparray.length - 1; - i >= 0 && keys.size() < versions; i--) { - MapFile.Reader map = maparray[i]; - synchronized(map) { - map.reset(); - // Do the priming read - ImmutableBytesWritable readval = new ImmutableBytesWritable(); - HStoreKey readkey = (HStoreKey)map.getClosest(origin, readval); - if (readkey == null) { - // map.getClosest returns null if the passed key is > than the - // last key in the map file. getClosest is a bit of a misnomer - // since it returns exact match or the next closest key AFTER not - // BEFORE. - continue; - } - do { - // if the row matches, we might want this one. - if (rowMatches(origin, readkey)) { - // if the column pattern is not null, we use it for column matching. - // we will skip the keys whose column doesn't match the pattern. - if (columnPattern != null) { - if (!(columnPattern.matcher(Bytes.toString(readkey.getColumn())).matches())) { - continue; - } - } - // if the cell address matches, then we definitely want this key. - if (cellMatches(origin, readkey)) { - // Store key if isn't deleted or superceded by memcache - if (!HLogEdit.isDeleted(readval.get())) { - if (notExpiredAndNotInDeletes(this.ttl, readkey, now, deletes)) { - keys.add(new HStoreKey(readkey)); - } - if (keys.size() >= versions) { - break; - } - } else { - deletes.add(new HStoreKey(readkey)); - } - } else { - // the cell doesn't match, but there might be more with different - // timestamps, so move to the next key - continue; - } - } else { - // the row doesn't match, so we've gone too far. 
- break; - } - } while (map.next(readkey, readval)); // advance to the next key - } - } - return keys; - } finally { - this.lock.readLock().unlock(); - } - } - - /** - * Find the key that matches row exactly, or the one that immediately - * preceeds it. WARNING: Only use this method on a table where writes occur - * with stricly increasing timestamps. This method assumes this pattern of - * writes in order to make it reasonably performant. - * @param row - * @return Found row - * @throws IOException - */ - @SuppressWarnings("unchecked") - byte [] getRowKeyAtOrBefore(final byte [] row) - throws IOException{ - // Map of HStoreKeys that are candidates for holding the row key that - // most closely matches what we're looking for. We'll have to update it as - // deletes are found all over the place as we go along before finally - // reading the best key out of it at the end. - SortedMap candidateKeys = new TreeMap( - new HStoreKey.HStoreKeyWritableComparator(info)); - - // Keep a list of deleted cell keys. We need this because as we go through - // the store files, the cell with the delete marker may be in one file and - // the old non-delete cell value in a later store file. If we don't keep - // around the fact that the cell was deleted in a newer record, we end up - // returning the old value if user is asking for more than one version. - // This List of deletes should not be large since we are only keeping rows - // and columns that match those set on the scanner and which have delete - // values. If memory usage becomes an issue, could redo as bloom filter. - Set deletes = new HashSet(); - this.lock.readLock().lock(); - try { - // First go to the memcache. Pick up deletes and candidates. - this.memcache.getRowKeyAtOrBefore(row, candidateKeys, deletes); - - // Process each store file. Run through from newest to oldest. - // This code below is very close to the body of the getKeys method. - MapFile.Reader[] maparray = getReaders(); - for (int i = maparray.length - 1; i >= 0; i--) { - // Update the candidate keys from the current map file - rowAtOrBeforeFromMapFile(maparray[i], row, candidateKeys, deletes); - } - // Return the best key from candidateKeys - byte [] result = - candidateKeys.isEmpty()? null: candidateKeys.lastKey().getRow(); - return result; - } finally { - this.lock.readLock().unlock(); - } - } - - /* - * Check an individual MapFile for the row at or before a given key - * and timestamp - * @param map - * @param row - * @param candidateKeys - * @throws IOException - */ - private void rowAtOrBeforeFromMapFile(final MapFile.Reader map, - final byte [] row, final SortedMap candidateKeys, - final Set deletes) - throws IOException { - HStoreKey startKey = new HStoreKey(); - ImmutableBytesWritable startValue = new ImmutableBytesWritable(); - synchronized(map) { - // Don't bother with the rest of this if the file is empty - map.reset(); - if (!map.next(startKey, startValue)) { - return; - } - startKey.setHRegionInfo(this.info); - // If start row for this file is beyond passed in row, return; nothing - // in here is of use to us. 
- if (HStoreKey.compareTwoRowKeys(this.info, startKey.getRow(), row) > 0) { - return; - } - long now = System.currentTimeMillis(); - // if there aren't any candidate keys yet, we'll do some things different - if (candidateKeys.isEmpty()) { - rowAtOrBeforeCandidate(startKey, map, row, candidateKeys, deletes, now); - } else { - rowAtOrBeforeWithCandidates(startKey, map, row, candidateKeys, deletes, - now); - } - } - } - - /* Find a candidate for row that is at or before passed row in passed - * mapfile. - * @param startKey First key in the mapfile. - * @param map - * @param row - * @param candidateKeys - * @param now - * @throws IOException - */ - private void rowAtOrBeforeCandidate(final HStoreKey startKey, - final MapFile.Reader map, final byte[] row, - final SortedMap candidateKeys, - final Set deletes, final long now) - throws IOException { - // if the row we're looking for is past the end of this mapfile, set the - // search key to be the last key. If its a deleted key, then we'll back - // up to the row before and return that. - HStoreKey finalKey = getFinalKey(map); - HStoreKey searchKey = null; - if (HStoreKey.compareTwoRowKeys(info,finalKey.getRow(), row) < 0) { - searchKey = finalKey; - } else { - searchKey = new HStoreKey(row, this.info); - if (searchKey.compareTo(startKey) < 0) { - searchKey = startKey; - } - } - rowAtOrBeforeCandidate(map, searchKey, candidateKeys, deletes, now); - } - - /* - * @param ttlSetting - * @param hsk - * @param now - * @param deletes - * @return True if key has not expired and is not in passed set of deletes. - */ - static boolean notExpiredAndNotInDeletes(final long ttl, - final HStoreKey hsk, final long now, final Set deletes) { - return !isExpired(hsk, ttl, now) && - (deletes == null || !deletes.contains(hsk)); - } - - static boolean isExpired(final HStoreKey hsk, final long ttl, - final long now) { - return ttl != HConstants.FOREVER && now > hsk.getTimestamp() + ttl; - } - - /* Find a candidate for row that is at or before passed key, sk, in mapfile. - * @param map - * @param sk Key to go search the mapfile with. - * @param candidateKeys - * @param now - * @throws IOException - * @see {@link #rowAtOrBeforeCandidate(HStoreKey, org.apache.hadoop.io.MapFile.Reader, byte[], SortedMap, long)} - */ - private void rowAtOrBeforeCandidate(final MapFile.Reader map, - final HStoreKey sk, final SortedMap candidateKeys, - final Set deletes, final long now) - throws IOException { - HStoreKey searchKey = sk; - if (searchKey.getHRegionInfo() == null) { - searchKey.setHRegionInfo(this.info); - } - HStoreKey readkey = null; - ImmutableBytesWritable readval = new ImmutableBytesWritable(); - HStoreKey knownNoGoodKey = null; - for (boolean foundCandidate = false; !foundCandidate;) { - // Seek to the exact row, or the one that would be immediately before it - readkey = (HStoreKey)map.getClosest(searchKey, readval, true); - if (readkey == null) { - // If null, we are at the start or end of the file. - break; - } - HStoreKey deletedOrExpiredRow = null; - do { - // Set this region into the readkey. - readkey.setHRegionInfo(this.info); - // If we have an exact match on row, and it's not a delete, save this - // as a candidate key - if (HStoreKey.equalsTwoRowKeys(this.info, readkey.getRow(), - searchKey.getRow())) { - if (!HLogEdit.isDeleted(readval.get())) { - if (handleNonDelete(readkey, now, deletes, candidateKeys)) { - foundCandidate = true; - // NOTE! Continue. 
- continue; - } - } - HStoreKey copy = addCopyToDeletes(readkey, deletes); - if (deletedOrExpiredRow == null) { - deletedOrExpiredRow = copy; - } - } else if (HStoreKey.compareTwoRowKeys(this.info, readkey.getRow(), - searchKey.getRow()) > 0) { - // if the row key we just read is beyond the key we're searching for, - // then we're done. - break; - } else { - // So, the row key doesn't match, but we haven't gone past the row - // we're seeking yet, so this row is a candidate for closest - // (assuming that it isn't a delete). - if (!HLogEdit.isDeleted(readval.get())) { - if (handleNonDelete(readkey, now, deletes, candidateKeys)) { - foundCandidate = true; - // NOTE: Continue - continue; - } - } - HStoreKey copy = addCopyToDeletes(readkey, deletes); - if (deletedOrExpiredRow == null) { - deletedOrExpiredRow = copy; - } - } - } while(map.next(readkey, readval) && (knownNoGoodKey == null || - readkey.compareTo(knownNoGoodKey) < 0)); - - // If we get here and have no candidates but we did find a deleted or - // expired candidate, we need to look at the key before that - if (!foundCandidate && deletedOrExpiredRow != null) { - knownNoGoodKey = deletedOrExpiredRow; - searchKey = new HStoreKey.BeforeThisStoreKey(deletedOrExpiredRow); - } else { - // No candidates and no deleted or expired candidates. Give up. - break; - } - } - - // Arriving here just means that we consumed the whole rest of the map - // without going "past" the key we're searching for. we can just fall - // through here. - } - - /* - * @param key Key to copy and add to deletes - * @param deletes - * @return Instance of the copy added to deletes - */ - private HStoreKey addCopyToDeletes(final HStoreKey key, - final Set deletes) { - HStoreKey copy = new HStoreKey(key); - deletes.add(copy); - return copy; - } - - private void rowAtOrBeforeWithCandidates(final HStoreKey startKey, - final MapFile.Reader map, final byte[] row, - final SortedMap candidateKeys, - final Set deletes, final long now) - throws IOException { - HStoreKey readkey = null; - ImmutableBytesWritable readval = new ImmutableBytesWritable(); - - // if there are already candidate keys, we need to start our search - // at the earliest possible key so that we can discover any possible - // deletes for keys between the start and the search key. Back up to start - // of the row in case there are deletes for this candidate in this mapfile - // BUT do not backup before the first key in the mapfile else getClosest - // will return null - HStoreKey searchKey = new HStoreKey(candidateKeys.firstKey().getRow(), this.info); - if (searchKey.compareTo(startKey) < 0) { - searchKey = startKey; - } - - // Seek to the exact row, or the one that would be immediately before it - readkey = (HStoreKey)map.getClosest(searchKey, readval, true); - if (readkey == null) { - // If null, we are at the start or end of the file. - // Didn't find anything that would match, so return - return; - } - - do { - // if we have an exact match on row, and it's not a delete, save this - // as a candidate key - if (Bytes.equals(readkey.getRow(), row)) { - handleKey(readkey, readval.get(), now, deletes, candidateKeys); - } else if (HStoreKey.compareTwoRowKeys(info, - readkey.getRow(), row) > 0 ) { - // if the row key we just read is beyond the key we're searching for, - // then we're done. - break; - } else { - // So, the row key doesn't match, but we haven't gone past the row - // we're seeking yet, so this row is a candidate for closest - // (assuming that it isn't a delete). 
- handleKey(readkey, readval.get(), now, deletes, candidateKeys); - } - } while(map.next(readkey, readval)); - } - - /* - * @param readkey - * @param now - * @param deletes - * @param candidateKeys - */ - private void handleKey(final HStoreKey readkey, final byte [] value, - final long now, final Set deletes, - final SortedMap candidateKeys) { - if (!HLogEdit.isDeleted(value)) { - handleNonDelete(readkey, now, deletes, candidateKeys); - } else { - // Pass copy because readkey will change next time next is called. - handleDeleted(new HStoreKey(readkey), candidateKeys, deletes); - } - } - - /* - * @param readkey - * @param now - * @param deletes - * @param candidateKeys - * @return True if we added a candidate. - */ - private boolean handleNonDelete(final HStoreKey readkey, final long now, - final Set deletes, final Map candidateKeys) { - if (notExpiredAndNotInDeletes(this.ttl, readkey, now, deletes)) { - candidateKeys.put(stripTimestamp(readkey), - Long.valueOf(readkey.getTimestamp())); - return true; - } - return false; - } - - /* Handle keys whose values hold deletes. - * Add to the set of deletes and then if the candidate keys contain any that - * might match by timestamp, then check for a match and remove it if it's too - * young to survive the delete - * @param k Be careful; if key was gotten from a Mapfile, pass in a copy. - * Values gotten by 'nexting' out of Mapfiles will change in each invocation. - * @param candidateKeys - * @param deletes - */ - static void handleDeleted(final HStoreKey k, - final SortedMap candidateKeys, - final Set deletes) { - deletes.add(k); - HStoreKey strippedKey = stripTimestamp(k); - if (candidateKeys.containsKey(strippedKey)) { - long bestCandidateTs = - candidateKeys.get(strippedKey).longValue(); - if (bestCandidateTs <= k.getTimestamp()) { - candidateKeys.remove(strippedKey); - } - } - } - - /* - * @param mf MapFile to dig in. - * @return Final key from passed mf - * @throws IOException - */ - private HStoreKey getFinalKey(final MapFile.Reader mf) throws IOException { - HStoreKey finalKey = new HStoreKey(); - mf.finalKey(finalKey); - finalKey.setHRegionInfo(this.info); - return finalKey; - } - - static HStoreKey stripTimestamp(HStoreKey key) { - return new HStoreKey(key.getRow(), key.getColumn(), key.getHRegionInfo()); - } - - /* - * Test that the target matches the origin cell address. If the - * origin has an empty column, then it's assumed to mean any column - * matches and only match on row and timestamp. Otherwise, it compares the - * keys with HStoreKey.matchesRowCol(). - * @param origin The key we're testing against - * @param target The key we're testing - */ - private boolean cellMatches(HStoreKey origin, HStoreKey target){ - // if the origin's column is empty, then we're matching any column - if (Bytes.equals(origin.getColumn(), HConstants.EMPTY_BYTE_ARRAY)) { - // if the row matches, then... - if (HStoreKey.equalsTwoRowKeys(info, target.getRow(), origin.getRow())) { - // check the timestamp - return target.getTimestamp() <= origin.getTimestamp(); - } - return false; - } - // otherwise, we want to match on row and column - return target.matchesRowCol(origin); - } - - /* - * Test that the target matches the origin. If the origin - * has an empty column, then it just tests row equivalence. Otherwise, it uses - * HStoreKey.matchesRowCol(). 
- * @param origin Key we're testing against - * @param target Key we're testing - */ - private boolean rowMatches(HStoreKey origin, HStoreKey target){ - // if the origin's column is empty, then we're matching any column - if (Bytes.equals(origin.getColumn(), HConstants.EMPTY_BYTE_ARRAY)) { - // if the row matches, then... - return HStoreKey.equalsTwoRowKeys(info, target.getRow(), origin.getRow()); - } - // otherwise, we want to match on row and column - return target.matchesRowCol(origin); - } - - /** - * Determines if HStore can be split - * @param force Whether to force a split or not. - * @return a StoreSize if store can be split, null otherwise - */ - StoreSize checkSplit(final boolean force) { - if (this.storefiles.size() <= 0) { - return null; - } - if (!force && (storeSize < this.desiredMaxFileSize)) { - return null; - } - this.lock.readLock().lock(); - try { - // Not splitable if we find a reference store file present in the store. - boolean splitable = true; - long maxSize = 0L; - Long mapIndex = Long.valueOf(0L); - // Iterate through all the MapFiles - synchronized (storefiles) { - for (Map.Entry e: storefiles.entrySet()) { - HStoreFile curHSF = e.getValue(); - if (splitable) { - splitable = !curHSF.isReference(); - if (!splitable) { - // RETURN IN MIDDLE OF FUNCTION!!! If not splitable, just return. - if (LOG.isDebugEnabled()) { - LOG.debug(curHSF + " is not splittable"); - } - return null; - } - } - long size = curHSF.length(); - if (size > maxSize) { - // This is the largest one so far - maxSize = size; - mapIndex = e.getKey(); - } - } - } - - // Cast to HbaseReader. - HBaseMapFile.HBaseReader r = - (HBaseMapFile.HBaseReader)this.readers.get(mapIndex); - // Get first, last, and mid keys. - r.reset(); - HStoreKey firstKey = new HStoreKey(); - HStoreKey lastKey = new HStoreKey(); - r.next(firstKey, new ImmutableBytesWritable()); - r.finalKey(lastKey); - HStoreKey mk = (HStoreKey)r.midKey(); - if (mk != null) { - // if the midkey is the same as the first and last keys, then we cannot - // (ever) split this region. - if (HStoreKey.equalsTwoRowKeys(info, mk.getRow(), firstKey.getRow()) && - HStoreKey.equalsTwoRowKeys(info, mk.getRow(), lastKey.getRow())) { - if (LOG.isDebugEnabled()) { - LOG.debug("cannot split because midkey is the same as first or last row"); - } - return null; - } - return new StoreSize(maxSize, mk.getRow()); - } - } catch(IOException e) { - LOG.warn("Failed getting store size for " + this.storeNameStr, e); - } finally { - this.lock.readLock().unlock(); - } - return null; - } - - /** @return aggregate size of HStore */ - public long getSize() { - return storeSize; - } - - ////////////////////////////////////////////////////////////////////////////// - // File administration - ////////////////////////////////////////////////////////////////////////////// - - /** - * Return a scanner for both the memcache and the HStore files - */ - protected InternalScanner getScanner(long timestamp, byte [][] targetCols, - byte [] firstRow, RowFilterInterface filter) - throws IOException { - lock.readLock().lock(); - try { - return new HStoreScanner(this, targetCols, firstRow, timestamp, filter); - } finally { - lock.readLock().unlock(); - } - } - - @Override - public String toString() { - return this.storeNameStr; - } - - /** - * @param p Path to check. - * @return True if the path has format of a HStoreFile reference. 
- */ - public static boolean isReference(final Path p) { - return isReference(p, REF_NAME_PARSER.matcher(p.getName())); - } - - private static boolean isReference(final Path p, final Matcher m) { - if (m == null || !m.matches()) { - LOG.warn("Failed match of store file name " + p.toString()); - throw new RuntimeException("Failed match of store file name " + - p.toString()); - } - return m.groupCount() > 1 && m.group(2) != null; - } - - /** - * @return Current list of store files. - */ - SortedMap getStorefiles() { - synchronized (this.storefiles) { - SortedMap copy = - new TreeMap(this.storefiles); - return copy; - } - } - - /** - * @return Count of store files - */ - int getStorefilesCount() { - return this.storefiles.size(); - } - - /** - * @return The size of the store file indexes, in bytes. - * @throws IOException if there was a problem getting file sizes from the - * filesystem - */ - long getStorefilesIndexSize() throws IOException { - long size = 0; - for (HStoreFile s: storefiles.values()) - size += s.indexLength(); - return size; - } - - /* - * Datastructure that holds size and key. - */ - class StoreSize { - private final long size; - private final byte[] key; - StoreSize(long size, byte[] key) { - this.size = size; - this.key = new byte[key.length]; - System.arraycopy(key, 0, this.key, 0, key.length); - } - /* @return the size */ - long getSize() { - return size; - } - /* @return the key */ - byte[] getKey() { - return key; - } - } - - HRegionInfo getHRegionInfo() { - return this.info; - } -} diff --git a/src/java/org/apache/hadoop/hbase/regionserver/HStoreFile.java b/src/java/org/apache/hadoop/hbase/regionserver/HStoreFile.java deleted file mode 100644 index 2162276d05a..00000000000 --- a/src/java/org/apache/hadoop/hbase/regionserver/HStoreFile.java +++ /dev/null @@ -1,558 +0,0 @@ -/** - * Copyright 2007 The Apache Software Foundation - * - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ -package org.apache.hadoop.hbase.regionserver; - -import java.io.DataInputStream; -import java.io.FileNotFoundException; -import java.io.IOException; -import java.util.Random; - -import org.apache.commons.logging.Log; -import org.apache.commons.logging.LogFactory; -import org.apache.hadoop.fs.FSDataInputStream; -import org.apache.hadoop.fs.FSDataOutputStream; -import org.apache.hadoop.fs.FileSystem; -import org.apache.hadoop.fs.Path; -import org.apache.hadoop.hbase.HBaseConfiguration; -import org.apache.hadoop.hbase.HConstants; -import org.apache.hadoop.hbase.HRegionInfo; -import org.apache.hadoop.hbase.io.BloomFilterMapFile; -import org.apache.hadoop.hbase.io.HalfMapFileReader; -import org.apache.hadoop.hbase.io.Reference; -import org.apache.hadoop.hbase.util.Bytes; -import org.apache.hadoop.hbase.io.MapFile; -import org.apache.hadoop.hbase.io.SequenceFile; - -/** - * A HStore data file. HStores usually have one or more of these files. They - * are produced by flushing the memcache to disk. - * - *

Each HStore maintains a bunch of different data files. The filename is a - * mix of the parent dir, the region name, the column name, and a file - * identifier. The name may also be a reference to a store file located - * elsewhere. This class handles all that path-building stuff for you. - * - *

An HStoreFile usually tracks 4 things: its parent dir, the region - * identifier, the column family, and the file identifier. If you know those - * four things, you know how to obtain the right HStoreFile. HStoreFiles may - * also reference store files in another region, serving either - * the top half or the bottom half of the remote file. Such references - * are created when a region is split; the daughter regions serve the parent's - * data through them until a compaction rewrites the files. - * - *
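The half that a reference serves is decided against the referenced file's midkey (described further below). A minimal sketch of that decision, assuming the top half includes the midkey; the enum and method names are made up, and the real handling lives in the HalfMapFileReader returned by getReader() below.

    import java.util.Comparator;

    class HalfFileSketch {
      enum Range { TOP, BOTTOM }  // which half of the referenced file is served

      // Bottom half: keys that sort before the midkey.
      // Top half (in this sketch): the midkey and everything after it.
      static boolean inServedHalf(byte[] key, byte[] midkey, Range half,
          Comparator<byte[]> comparator) {
        int cmp = comparator.compare(key, midkey);
        return half == Range.BOTTOM ? cmp < 0 : cmp >= 0;
      }
    }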

Plain HStoreFiles are named for a randomly generated id as in: - * 1278437856009925445. A file by this name is made in both the - * mapfiles and info subdirectories of a - * HStore columnfamily directory. E.g., if the column family is 'anchor:', then - * under the region directory there is a subdirectory named 'anchor' within - * which are 'mapfiles' and 'info' subdirectories. In each will be found a - * file named something like 1278437856009925445, one to hold the - * data in 'mapfiles' and one under 'info' that holds the sequence id for this - * store file. - * - *
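A plain-JDK sketch of the layout just described; it mirrors what getMapDir(), getInfoDir() and createHStoreFilename() at the bottom of this class compute, with simplified parameter types.

    class StoreFileLayoutSketch {
      // e.g. <basedir>/<encodedRegionName>/anchor/mapfiles/1278437856009925445
      static String mapFilePath(String basedir, int encodedRegionName,
          String family, long fileId) {
        return basedir + "/" + encodedRegionName + "/" + family + "/mapfiles/" + fileId;
      }

      // e.g. <basedir>/<encodedRegionName>/anchor/info/1278437856009925445
      static String infoFilePath(String basedir, int encodedRegionName,
          String family, long fileId) {
        return basedir + "/" + encodedRegionName + "/" + family + "/info/" + fileId;
      }
    }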

References to store files located over in some other region look like - * this: - * 1278437856009925445.hbaserepository,qAReLZD-OyQORZWq_vqR1k==,959247014679548184: - * i.e. an id followed by the name of the referenced region. The data - * ('mapfiles') of HStoreFile references are empty. The accompanying - * info file contains the - * midkey, the id of the remote store we're referencing and whether we're - * to serve the top or bottom region of the remote store file. Note that a region - * is not splittable if it has instances of store file references (references - * are cleaned up by compactions). - * - *
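A small sketch of telling such a reference name apart from a plain one. The exact pattern is an assumption modeled on the REF_NAME_PARSER/isReference() check in HStore above; the point is that a trailing '.<referenced region>' suffix marks a reference.

    import java.util.regex.Matcher;
    import java.util.regex.Pattern;

    class ReferenceNameSketch {
      // group(1): file id; optional group(2): name of the referenced region.
      private static final Pattern NAME = Pattern.compile("^(\\d+)(?:\\.(.+))?$");

      static boolean isReference(String storeFileName) {
        Matcher m = NAME.matcher(storeFileName);
        return m.matches() && m.group(2) != null;
      }
    }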

When merging or splitting HRegions, we might want to modify one of the - * params for an HStoreFile (effectively moving it elsewhere). - */ -public class HStoreFile implements HConstants { - static final Log LOG = LogFactory.getLog(HStoreFile.class.getName()); - static final byte INFO_SEQ_NUM = 0; - static final byte MAJOR_COMPACTION = INFO_SEQ_NUM + 1; - static final String HSTORE_DATFILE_DIR = "mapfiles"; - static final String HSTORE_INFO_DIR = "info"; - static final String HSTORE_FILTER_DIR = "filter"; - - private final static Random rand = new Random(); - - private final Path basedir; - private final int encodedRegionName; - private final byte [] colFamily; - private final long fileId; - private final HBaseConfiguration conf; - private final FileSystem fs; - private final Reference reference; - private final HRegionInfo hri; - /* If true, this file was product of a major compaction. - */ - private boolean majorCompaction = false; - private long indexLength; - - /** - * Constructor that fully initializes the object - * @param conf Configuration object - * @param basedir qualified path that is parent of region directory - * @param colFamily name of the column family - * @param fileId file identifier - * @param ref Reference to another HStoreFile. - * @param hri The region info for this file (HACK HBASE-868). TODO: Fix. - * @throws IOException - */ - HStoreFile(HBaseConfiguration conf, FileSystem fs, Path basedir, - final HRegionInfo hri, byte [] colFamily, long fileId, - final Reference ref) - throws IOException { - this(conf, fs, basedir, hri, colFamily, fileId, ref, false); - } - - /** - * Constructor that fully initializes the object - * @param conf Configuration object - * @param basedir qualified path that is parent of region directory - * @param colFamily name of the column family - * @param fileId file identifier - * @param ref Reference to another HStoreFile. - * @param hri The region info for this file (HACK HBASE-868). TODO: Fix. - * @param mc Try if this file was result of a major compression. - * @throws IOException - */ - HStoreFile(HBaseConfiguration conf, FileSystem fs, Path basedir, - final HRegionInfo hri, byte [] colFamily, long fileId, - final Reference ref, final boolean mc) - throws IOException { - this.conf = conf; - this.fs = fs; - this.basedir = basedir; - this.encodedRegionName = hri.getEncodedName(); - this.colFamily = colFamily; - this.hri = hri; - - long id = fileId; - if (id == -1) { - Path mapdir = HStoreFile.getMapDir(basedir, encodedRegionName, colFamily); - Path testpath = null; - do { - id = Math.abs(rand.nextLong()); - testpath = new Path(mapdir, createHStoreFilename(id, -1)); - } while(fs.exists(testpath)); - } - this.fileId = id; - - // If a reference, construction does not write the pointer files. Thats - // done by invocations of writeReferenceFiles(hsf, fs). Happens at split. 
- this.reference = ref; - this.majorCompaction = mc; - } - - /** @return the region name */ - boolean isReference() { - return reference != null; - } - - Reference getReference() { - return reference; - } - - int getEncodedRegionName() { - return this.encodedRegionName; - } - - /** @return the column family */ - byte [] getColFamily() { - return colFamily; - } - - /** @return the file identifier */ - long getFileId() { - return fileId; - } - - // Build full filenames from those components - - /** @return path for MapFile */ - Path getMapFilePath() { - if (isReference()) { - return getMapFilePath(encodedRegionName, fileId, - reference.getEncodedRegionName()); - } - return getMapFilePath(this.encodedRegionName, fileId); - } - - private Path getMapFilePath(final Reference r) { - if (r == null) { - return getMapFilePath(); - } - return getMapFilePath(r.getEncodedRegionName(), r.getFileId()); - } - - private Path getMapFilePath(final int encodedName, final long fid) { - return getMapFilePath(encodedName, fid, HRegionInfo.NO_HASH); - } - - private Path getMapFilePath(final int encodedName, final long fid, - final int ern) { - return new Path(HStoreFile.getMapDir(basedir, encodedName, colFamily), - createHStoreFilename(fid, ern)); - } - - /** @return path for info file */ - Path getInfoFilePath() { - if (isReference()) { - return getInfoFilePath(encodedRegionName, fileId, - reference.getEncodedRegionName()); - - } - return getInfoFilePath(encodedRegionName, fileId); - } - - private Path getInfoFilePath(final int encodedName, final long fid) { - return getInfoFilePath(encodedName, fid, HRegionInfo.NO_HASH); - } - - private Path getInfoFilePath(final int encodedName, final long fid, - final int ern) { - return new Path(HStoreFile.getInfoDir(basedir, encodedName, colFamily), - createHStoreFilename(fid, ern)); - } - - // File handling - - /* - * Split by making two new store files that reference top and bottom regions - * of original store file. - * @param midKey - * @param dstA - * @param dstB - * @param fs - * @param c - * @throws IOException - * - * @param midKey the key which will be the starting key of the second region - * @param dstA the file which will contain keys from the start of the source - * @param dstB the file which will contain keys from midKey to end of source - * @param fs file system - * @param c configuration - * @throws IOException - */ - void splitStoreFile(final HStoreFile dstA, final HStoreFile dstB, - final FileSystem fs) - throws IOException { - dstA.writeReferenceFiles(fs); - dstB.writeReferenceFiles(fs); - } - - void writeReferenceFiles(final FileSystem fs) - throws IOException { - createOrFail(fs, getMapFilePath()); - writeSplitInfo(fs); - } - - /* - * If reference, create and write the remote store file id, the midkey and - * whether we're going against the top file region of the referent out to - * the info file. - * @param p Path to info file. 
- * @param hsf - * @param fs - * @throws IOException - */ - private void writeSplitInfo(final FileSystem fs) throws IOException { - Path p = getInfoFilePath(); - if (fs.exists(p)) { - throw new IOException("File already exists " + p.toString()); - } - FSDataOutputStream out = fs.create(p); - try { - reference.write(out); - } finally { - out.close(); - } - } - - /** - * @see #writeSplitInfo(FileSystem fs) - */ - static Reference readSplitInfo(final Path p, final FileSystem fs) - throws IOException { - FSDataInputStream in = fs.open(p); - try { - Reference r = new Reference(); - r.readFields(in); - return r; - } finally { - in.close(); - } - } - - private void createOrFail(final FileSystem fs, final Path p) - throws IOException { - if (fs.exists(p)) { - throw new IOException("File already exists " + p.toString()); - } - if (!fs.createNewFile(p)) { - throw new IOException("Failed create of " + p); - } - } - - /** - * Reads in an info file - * - * @param filesystem file system - * @return The sequence id contained in the info file - * @throws IOException - */ - long loadInfo(final FileSystem filesystem) throws IOException { - Path p = null; - if (isReference()) { - p = getInfoFilePath(reference.getEncodedRegionName(), - this.reference.getFileId()); - } else { - p = getInfoFilePath(); - } - long length = filesystem.getFileStatus(p).getLen(); - boolean hasMoreThanSeqNum = length > (Byte.SIZE + Bytes.SIZEOF_LONG); - DataInputStream in = new DataInputStream(filesystem.open(p)); - try { - byte flag = in.readByte(); - if (flag == INFO_SEQ_NUM) { - if (hasMoreThanSeqNum) { - flag = in.readByte(); - if (flag == MAJOR_COMPACTION) { - this.majorCompaction = in.readBoolean(); - } - } - return in.readLong(); - } - throw new IOException("Cannot process log file: " + p); - } finally { - in.close(); - } - } - - /** - * Writes the file-identifier to disk - * - * @param filesystem file system - * @param infonum file id - * @throws IOException - */ - void writeInfo(final FileSystem filesystem, final long infonum) - throws IOException { - writeInfo(filesystem, infonum, false); - } - - /** - * Writes the file-identifier to disk - * - * @param filesystem file system - * @param infonum file id - * @param mc True if this file is product of a major compaction - * @throws IOException - */ - void writeInfo(final FileSystem filesystem, final long infonum, - final boolean mc) - throws IOException { - Path p = getInfoFilePath(); - FSDataOutputStream out = filesystem.create(p); - try { - out.writeByte(INFO_SEQ_NUM); - out.writeLong(infonum); - if (mc) { - // Set whether major compaction flag on this file. - this.majorCompaction = mc; - out.writeByte(MAJOR_COMPACTION); - out.writeBoolean(mc); - } - } finally { - out.close(); - } - } - - /** - * Delete store map files. - * @throws IOException - */ - public void delete() throws IOException { - fs.delete(getMapFilePath(), true); - fs.delete(getInfoFilePath(), true); - } - - /** - * Renames the mapfiles and info directories under the passed - * hsf directory. - * @param fs - * @param hsf - * @return True if succeeded. 
- * @throws IOException - */ - public boolean rename(final FileSystem fs, final HStoreFile hsf) - throws IOException { - Path src = getMapFilePath(); - if (!fs.exists(src)) { - throw new FileNotFoundException(src.toString()); - } - boolean success = fs.rename(src, hsf.getMapFilePath()); - if (!success) { - LOG.warn("Failed rename of " + src + " to " + hsf.getMapFilePath()); - } else { - src = getInfoFilePath(); - if (!fs.exists(src)) { - throw new FileNotFoundException(src.toString()); - } - success = fs.rename(src, hsf.getInfoFilePath()); - if (!success) { - LOG.warn("Failed rename of " + src + " to " + hsf.getInfoFilePath()); - } - } - return success; - } - - /** - * Get reader for the store file map file. - * Client is responsible for closing file when done. - * @param fs - * @param bloomFilter If true, a bloom filter exists - * @param blockCacheEnabled If true, MapFile blocks should be cached. - * @return BloomFilterMapFile.Reader - * @throws IOException - */ - public synchronized BloomFilterMapFile.Reader getReader(final FileSystem fs, - final boolean bloomFilter, final boolean blockCacheEnabled) - throws IOException { - if (isReference()) { - return new HalfMapFileReader(fs, - getMapFilePath(reference).toString(), conf, - reference.getFileRegion(), reference.getMidkey(), bloomFilter, - blockCacheEnabled, this.hri); - } - return new BloomFilterMapFile.Reader(fs, getMapFilePath().toString(), - conf, bloomFilter, blockCacheEnabled, this.hri); - } - - /** - * Get a store file writer. - * Client is responsible for closing file when done. - * @param fs - * @param compression Pass SequenceFile.CompressionType.NONE - * for none. - * @param bloomFilter If true, create a bloom filter - * @param nrows number of rows expected. Required if bloomFilter is true. - * @return MapFile.Writer - * @throws IOException - */ - public MapFile.Writer getWriter(final FileSystem fs, - final SequenceFile.CompressionType compression, - final boolean bloomFilter, int nrows) - throws IOException { - if (isReference()) { - throw new IOException("Illegal Access: Cannot get a writer on a" + - "HStoreFile reference"); - } - return new BloomFilterMapFile.Writer(conf, fs, - getMapFilePath().toString(), compression, bloomFilter, nrows, this.hri); - } - - /** - * @return Length of the store map file. If a reference, size is - * approximation. - * @throws IOException - */ - public long length() throws IOException { - Path p = new Path(getMapFilePath(reference), MapFile.DATA_FILE_NAME); - long l = p.getFileSystem(conf).getFileStatus(p).getLen(); - return (isReference())? l / 2: l; - } - - /** - * @return Length of the store map file index. - * @throws IOException - */ - public synchronized long indexLength() throws IOException { - if (indexLength == 0) { - Path p = new Path(getMapFilePath(reference), MapFile.INDEX_FILE_NAME); - indexLength = p.getFileSystem(conf).getFileStatus(p).getLen(); - } - return indexLength; - } - - @Override - public String toString() { - return encodedRegionName + "/" + Bytes.toString(colFamily) + "/" + fileId + - (isReference()? "-" + reference.toString(): ""); - } - - /** - * @return True if this file was made by a major compaction. - */ - public boolean isMajorCompaction() { - return this.majorCompaction; - } - - private static String createHStoreFilename(final long fid, - final int encodedRegionName) { - return Long.toString(fid) + - ((encodedRegionName != HRegionInfo.NO_HASH)? - "." 
+ encodedRegionName : ""); - } - - /** - * @param dir Base directory - * @param encodedRegionName Encoding of region name. - * @param f Column family. - * @return path for map file directory - */ - public static Path getMapDir(Path dir, int encodedRegionName, - final byte [] f) { - return getFamilySubDir(dir, encodedRegionName, f, HSTORE_DATFILE_DIR); - } - - /** - * @param dir Base directory - * @param encodedRegionName Encoding of region name. - * @param f Column family. - * @return the info directory path - */ - public static Path getInfoDir(Path dir, int encodedRegionName, byte [] f) { - return getFamilySubDir(dir, encodedRegionName, f, HSTORE_INFO_DIR); - } - - /** - * @param dir Base directory - * @param encodedRegionName Encoding of region name. - * @param f Column family. - * @return the bloom filter directory path - */ - @Deprecated - public static Path getFilterDir(Path dir, int encodedRegionName, - final byte [] f) { - return getFamilySubDir(dir, encodedRegionName, f, HSTORE_FILTER_DIR); - } - - /* - * @param base Base directory - * @param encodedRegionName Encoding of region name. - * @param f Column family. - * @param subdir Subdirectory to create under column family/store directory. - * @return - */ - private static Path getFamilySubDir(final Path base, - final int encodedRegionName, final byte [] f, final String subdir) { - return new Path(base, new Path(Integer.toString(encodedRegionName), - new Path(Bytes.toString(f), subdir))); - } -} \ No newline at end of file diff --git a/src/java/org/apache/hadoop/hbase/regionserver/Store.java b/src/java/org/apache/hadoop/hbase/regionserver/Store.java new file mode 100644 index 00000000000..075891dd210 --- /dev/null +++ b/src/java/org/apache/hadoop/hbase/regionserver/Store.java @@ -0,0 +1,1807 @@ +/** + * Copyright 2009 The Apache Software Foundation + * + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.hadoop.hbase.regionserver; + +import java.io.EOFException; +import java.io.IOException; +import java.io.UnsupportedEncodingException; +import java.nio.ByteBuffer; +import java.util.ArrayList; +import java.util.Collection; +import java.util.Collections; +import java.util.HashMap; +import java.util.HashSet; +import java.util.Iterator; +import java.util.List; +import java.util.Map; +import java.util.NavigableMap; +import java.util.Set; +import java.util.SortedMap; +import java.util.TreeMap; +import java.util.concurrent.ConcurrentSkipListMap; +import java.util.concurrent.CopyOnWriteArraySet; +import java.util.concurrent.locks.ReentrantReadWriteLock; +import java.util.regex.Pattern; + +import org.apache.commons.logging.Log; +import org.apache.commons.logging.LogFactory; +import org.apache.hadoop.fs.FileStatus; +import org.apache.hadoop.fs.FileSystem; +import org.apache.hadoop.fs.Path; +import org.apache.hadoop.hbase.HBaseConfiguration; +import org.apache.hadoop.hbase.HColumnDescriptor; +import org.apache.hadoop.hbase.HConstants; +import org.apache.hadoop.hbase.HRegionInfo; +import org.apache.hadoop.hbase.HStoreKey; +import org.apache.hadoop.hbase.RemoteExceptionHandler; +import org.apache.hadoop.hbase.filter.RowFilterInterface; +import org.apache.hadoop.hbase.io.Cell; +import org.apache.hadoop.hbase.io.SequenceFile; +import org.apache.hadoop.hbase.io.hfile.HFile; +import org.apache.hadoop.hbase.io.hfile.HFileScanner; +import org.apache.hadoop.hbase.util.Bytes; +import org.apache.hadoop.hbase.util.FSUtils; +import org.apache.hadoop.io.RawComparator; +import org.apache.hadoop.util.Progressable; +import org.apache.hadoop.util.StringUtils; + +/** + * A Store holds a column family in a Region. Its a memcache and a set of zero + * or more StoreFiles, which stretch backwards over time. + * + *

There's no reason to consider append-logging at this level; all logging + * and locking are handled at the HRegion level. Store just provides + * services to manage sets of StoreFiles. One of the most important of those + * services is compaction, in which store files are aggregated once their + * number passes a configurable threshold. + * + *
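The threshold in question is the store-file count. A minimal sketch of the check, reading the same configuration key the Store constructor below uses; the helper class is hypothetical, and whether the real trigger compares with > or >= is left to the compaction code.

    import org.apache.hadoop.conf.Configuration;

    class CompactionTriggerSketch {
      /** True once a store holds at least the configured number of files. */
      static boolean needsCompaction(Configuration conf, int storeFileCount) {
        int threshold = conf.getInt("hbase.hstore.compactionThreshold", 3);
        return storeFileCount >= threshold;   // illustrative comparison only
      }
    }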

The only log-related input Store has to deal with is + * the reconstructionLog. This is a segment of an HRegion's log that might + * NOT be present upon startup. If the param is NULL, there's nothing to do; + * if it is non-NULL, we need to process the log to reconstruct + * a TreeMap that might not have been written to disk before the process + * died. + * + *
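A stripped-down sketch of that replay rule, using simplified stand-in types (the real work, including skipping edits that belong to other regions or column families, is done in doReconstructionLog() further down): edits already covered by the store's maximum flushed sequence id are skipped, and the rest are collected into a sorted map that is then flushed as one 'virtual' flush.

    import java.util.Iterator;
    import java.util.TreeMap;

    class ReconstructionReplaySketch {
      static class Edit {
        final long seqId; final String key; final byte[] value;
        Edit(long seqId, String key, byte[] value) {
          this.seqId = seqId; this.key = key; this.value = value;
        }
      }

      static TreeMap<String, byte[]> replay(Iterator<Edit> logEdits,
          long maxFlushedSeqId) {
        TreeMap<String, byte[]> reconstructed = new TreeMap<String, byte[]>();
        while (logEdits.hasNext()) {
          Edit e = logEdits.next();
          if (e.seqId <= maxFlushedSeqId) {
            continue;                   // already present in a flushed store file
          }
          reconstructed.put(e.key, e.value);
        }
        return reconstructed;           // caller flushes this as a new store file
      }
    }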

It's assumed that after this constructor returns, the reconstructionLog + * file will be deleted (by whoever has instantiated the Store). + * + *
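A sketch of that contract from the caller's side (a hypothetical fragment, not self-contained; the real call sites are in HRegion, which shares the package and can reach the protected constructor):

    Store store = new Store(basedir, regionInfo, familyDescriptor, fs,
        reconstructionLog, conf, reporter);
    // The constructor has replayed any pending edits, so the caller may now
    // remove the reconstruction log.
    if (reconstructionLog != null && fs.exists(reconstructionLog)) {
      fs.delete(reconstructionLog, true);
    }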

Locking and transactions are handled at a higher level. This API should + * not be called directly but by an HRegion manager. + */ +public class Store implements HConstants { + static final Log LOG = LogFactory.getLog(Store.class); + protected final Memcache memcache; + // This stores directory in the filesystem. + private final Path homedir; + private final HRegionInfo regioninfo; + private final HColumnDescriptor family; + final FileSystem fs; + private final HBaseConfiguration conf; + // ttl in milliseconds. + protected long ttl; + private long majorCompactionTime; + private int maxFilesToCompact; + private final long desiredMaxFileSize; + private volatile long storeSize = 0L; + private final Integer flushLock = new Integer(0); + final ReentrantReadWriteLock lock = new ReentrantReadWriteLock(); + final byte [] storeName; + private final String storeNameStr; + + /* + * Sorted Map of readers keyed by maximum edit sequence id (Most recent should + * be last in in list). ConcurrentSkipListMap is lazily consistent so no + * need to lock it down when iterating; iterator view is that of when the + * iterator was taken out. + */ + private final NavigableMap storefiles = + new ConcurrentSkipListMap(); + + // All access must be synchronized. + private final CopyOnWriteArraySet changedReaderObservers = + new CopyOnWriteArraySet(); + + // The most-recent log-seq-ID. The most-recent such ID means we can ignore + // all log messages up to and including that ID (because they're already + // reflected in the TreeMaps). + private volatile long maxSeqId = -1; + + private final Path compactionDir; + private final Integer compactLock = new Integer(0); + private final int compactionThreshold; + + /** + * Constructor + * @param basedir qualified path under which the region directory lives; + * generally the table subdirectory + * @param info HRegionInfo for this region + * @param family HColumnDescriptor for this column + * @param fs file system object + * @param reconstructionLog existing log file to apply if any + * @param conf configuration object + * @param reporter Call on a period so hosting server can report we're + * making progress to master -- otherwise master might think region deploy + * failed. Can be null. + * @throws IOException + */ + protected Store(Path basedir, HRegionInfo info, HColumnDescriptor family, + FileSystem fs, Path reconstructionLog, HBaseConfiguration conf, + final Progressable reporter) + throws IOException { + this.homedir = getStoreHomedir(basedir, info.getEncodedName(), + family.getName()); + this.regioninfo = info; + this.family = family; + this.fs = fs; + this.conf = conf; + // getTimeToLive returns ttl in seconds. Convert to milliseconds. + this.ttl = family.getTimeToLive(); + if (ttl != HConstants.FOREVER) { + this.ttl *= 1000; + } + this.memcache = new Memcache(this.ttl); + this.compactionDir = HRegion.getCompactionDir(basedir); + this.storeName = Bytes.toBytes(this.regioninfo.getEncodedName() + "/" + + Bytes.toString(this.family.getName())); + this.storeNameStr = Bytes.toString(this.storeName); + + // By default, we compact if an HStore has more than + // MIN_COMMITS_FOR_COMPACTION map files + this.compactionThreshold = + conf.getInt("hbase.hstore.compactionThreshold", 3); + + // By default we split region if a file > DEFAULT_MAX_FILE_SIZE. 
+ long maxFileSize = info.getTableDesc().getMaxFileSize(); + if (maxFileSize == HConstants.DEFAULT_MAX_FILE_SIZE) { + maxFileSize = conf.getLong("hbase.hregion.max.filesize", + HConstants.DEFAULT_MAX_FILE_SIZE); + } + this.desiredMaxFileSize = maxFileSize; + + this.majorCompactionTime = + conf.getLong(HConstants.MAJOR_COMPACTION_PERIOD, 86400000); + if (family.getValue(HConstants.MAJOR_COMPACTION_PERIOD) != null) { + String strCompactionTime = + family.getValue(HConstants.MAJOR_COMPACTION_PERIOD); + this.majorCompactionTime = (new Long(strCompactionTime)).longValue(); + } + + this.maxFilesToCompact = conf.getInt("hbase.hstore.compaction.max", 10); + + // loadStoreFiles calculates this.maxSeqId. as side-effect. + this.storefiles.putAll(loadStoreFiles()); + if (LOG.isDebugEnabled() && this.storefiles.size() > 0) { + LOG.debug("Loaded " + this.storefiles.size() + " file(s) in Store " + + Bytes.toString(this.storeName) + ", max sequence id " + this.maxSeqId); + } + + // Do reconstruction log. + runReconstructionLog(reconstructionLog, this.maxSeqId, reporter); + } + + HColumnDescriptor getFamily() { + return this.family; + } + + long getMaxSequenceId() { + return this.maxSeqId; + } + + /** + * @param tabledir + * @param encodedName Encoded region name. + * @param family + * @return Path to family/Store home directory. + */ + public static Path getStoreHomedir(final Path tabledir, + final int encodedName, final byte [] family) { + return new Path(tabledir, new Path(Integer.toString(encodedName), + new Path(Bytes.toString(family)))); + } + + /* + * Run reconstruction log + * @param reconstructionLog + * @param msid + * @param reporter + * @throws IOException + */ + private void runReconstructionLog(final Path reconstructionLog, + final long msid, final Progressable reporter) + throws IOException { + try { + doReconstructionLog(reconstructionLog, msid, reporter); + } catch (EOFException e) { + // Presume we got here because of lack of HADOOP-1700; for now keep going + // but this is probably not what we want long term. If we got here there + // has been data-loss + LOG.warn("Exception processing reconstruction log " + reconstructionLog + + " opening " + this.storeName + + " -- continuing. Probably lack-of-HADOOP-1700 causing DATA LOSS!", e); + } catch (IOException e) { + // Presume we got here because of some HDFS issue. Don't just keep going. + // Fail to open the HStore. Probably means we'll fail over and over + // again until human intervention but alternative has us skipping logs + // and losing edits: HBASE-642. + LOG.warn("Exception processing reconstruction log " + reconstructionLog + + " opening " + this.storeName, e); + throw e; + } + } + + /* + * Read the reconstructionLog to see whether we need to build a brand-new + * file out of non-flushed log entries. + * + * We can ignore any log message that has a sequence ID that's equal to or + * lower than maxSeqID. (Because we know such log messages are already + * reflected in the MapFiles.) + */ + @SuppressWarnings("unchecked") + private void doReconstructionLog(final Path reconstructionLog, + final long maxSeqID, final Progressable reporter) + throws UnsupportedEncodingException, IOException { + if (reconstructionLog == null || !this.fs.exists(reconstructionLog)) { + // Nothing to do. + return; + } + // Check its not empty. 
+ FileStatus [] stats = this.fs.listStatus(reconstructionLog); + if (stats == null || stats.length == 0) { + LOG.warn("Passed reconstruction log " + reconstructionLog + + " is zero-length"); + return; + } + // TODO: This could grow large and blow heap out. Need to get it into + // general memory usage accounting. + long maxSeqIdInLog = -1; + NavigableMap reconstructedCache = + new TreeMap(new HStoreKey.HStoreKeyWritableComparator()); + SequenceFile.Reader logReader = new SequenceFile.Reader(this.fs, + reconstructionLog, this.conf); + try { + HLogKey key = new HLogKey(); + HLogEdit val = new HLogEdit(); + long skippedEdits = 0; + long editsCount = 0; + // How many edits to apply before we send a progress report. + int reportInterval = + this.conf.getInt("hbase.hstore.report.interval.edits", 2000); + while (logReader.next(key, val)) { + maxSeqIdInLog = Math.max(maxSeqIdInLog, key.getLogSeqNum()); + if (key.getLogSeqNum() <= maxSeqID) { + skippedEdits++; + continue; + } + // Check this edit is for me. Also, guard against writing + // METACOLUMN info such as HBASE::CACHEFLUSH entries + byte [] column = val.getColumn(); + if (val.isTransactionEntry() || Bytes.equals(column, HLog.METACOLUMN) + || !Bytes.equals(key.getRegionName(), regioninfo.getRegionName()) + || !HStoreKey.matchingFamily(family.getName(), column)) { + continue; + } + HStoreKey k = new HStoreKey(key.getRow(), column, val.getTimestamp()); + reconstructedCache.put(k, val.getVal()); + editsCount++; + // Every 2k edits, tell the reporter we're making progress. + // Have seen 60k edits taking 3minutes to complete. + if (reporter != null && (editsCount % reportInterval) == 0) { + reporter.progress(); + } + } + if (LOG.isDebugEnabled()) { + LOG.debug("Applied " + editsCount + ", skipped " + skippedEdits + + " because sequence id <= " + maxSeqID); + } + } finally { + logReader.close(); + } + + if (reconstructedCache.size() > 0) { + // We create a "virtual flush" at maxSeqIdInLog+1. + if (LOG.isDebugEnabled()) { + LOG.debug("flushing reconstructionCache"); + } + internalFlushCache(reconstructedCache, maxSeqIdInLog + 1); + } + } + + /* + * Creates a series of StoreFile loaded from the given directory. + * @throws IOException + */ + private Map loadStoreFiles() + throws IOException { + Map results = new HashMap(); + FileStatus files[] = this.fs.listStatus(this.homedir); + for (int i = 0; files != null && i < files.length; i++) { + // Skip directories. + if (files[i].isDir()) { + continue; + } + Path p = files[i].getPath(); + // Check for empty file. Should never be the case but can happen + // after data loss in hdfs for whatever reason (upgrade, etc.): HBASE-646 + if (this.fs.getFileStatus(p).getLen() <= 0) { + LOG.warn("Skipping " + p + " because its empty. 
HBASE-646 DATA LOSS?"); + continue; + } + StoreFile curfile = new StoreFile(fs, p); + long storeSeqId = curfile.getMaxSequenceId(); + if (storeSeqId > this.maxSeqId) { + this.maxSeqId = storeSeqId; + } + long length = curfile.getReader().length(); + this.storeSize += length; + if (LOG.isDebugEnabled()) { + LOG.debug("loaded " + FSUtils.getPath(p) + ", isReference=" + + curfile.isReference() + ", sequence id=" + storeSeqId + + ", length=" + length + ", majorCompaction=" + + curfile.isMajorCompaction()); + } + results.put(Long.valueOf(storeSeqId), curfile); + } + return results; + } + + /** + * Adds a value to the memcache + * + * @param key + * @param value + * @return memcache size delta + */ + protected long add(HStoreKey key, byte[] value) { + lock.readLock().lock(); + try { + return this.memcache.add(key, value); + } finally { + lock.readLock().unlock(); + } + } + + /** + * @return All store files. + */ + NavigableMap getStorefiles() { + return this.storefiles; + } + + /** + * Close all the readers + * + * We don't need to worry about subsequent requests because the HRegion holds + * a write lock that will prevent any more reads or writes. + * + * @throws IOException + */ + List close() throws IOException { + this.lock.writeLock().lock(); + try { + ArrayList result = + new ArrayList(storefiles.values()); + // Clear so metrics doesn't find them. + this.storefiles.clear(); + for (StoreFile f: result) { + f.close(); + } + LOG.debug("closed " + this.storeNameStr); + return result; + } finally { + this.lock.writeLock().unlock(); + } + } + + /** + * Snapshot this stores memcache. Call before running + * {@link #flushCache(long)} so it has some work to do. + */ + void snapshot() { + this.memcache.snapshot(); + } + + /** + * Write out current snapshot. Presumes {@link #snapshot()} has been called + * previously. + * @param logCacheFlushId flush sequence number + * @return true if a compaction is needed + * @throws IOException + */ + boolean flushCache(final long logCacheFlushId) throws IOException { + // Get the snapshot to flush. Presumes that a call to + // this.memcache.snapshot() has happened earlier up in the chain. + SortedMap cache = this.memcache.getSnapshot(); + // If an exception happens flushing, we let it out without clearing + // the memcache snapshot. The old snapshot will be returned when we say + // 'snapshot', the next time flush comes around. + StoreFile sf = internalFlushCache(cache, logCacheFlushId); + if (sf == null) { + return false; + } + // Add new file to store files. Clear snapshot too while we have the + // Store write lock. + int size = updateStorefiles(logCacheFlushId, sf, cache); + return size >= this.compactionThreshold; + } + + /* + * @param cache + * @param logCacheFlushId + * @return StoreFile created. + * @throws IOException + */ + private StoreFile internalFlushCache(final SortedMap cache, + final long logCacheFlushId) + throws IOException { + HFile.Writer writer = null; + long flushed = 0; + // Don't flush if there are no entries. + if (cache.size() == 0) { + return null; + } + long now = System.currentTimeMillis(); + // TODO: We can fail in the below block before we complete adding this + // flush to list of store files. Add cleanup of anything put on filesystem + // if we fail. + synchronized (flushLock) { + // A. 
Write the map out to the disk + writer = StoreFile.getWriter(this.fs, this.homedir); + int entries = 0; + try { + for (Map.Entry es: cache.entrySet()) { + HStoreKey curkey = es.getKey(); + byte[] bytes = es.getValue(); + if (!isExpired(curkey, ttl, now)) { + writer.append(curkey.getBytes(), bytes); + entries++; + flushed += this.memcache.heapSize(curkey, bytes, null); + } + } + // B. Write out the log sequence number that corresponds to this output + // MapFile. The MapFile is current up to and including logCacheFlushId. + StoreFile.appendMetadata(writer, logCacheFlushId); + } finally { + writer.close(); + } + } + StoreFile sf = new StoreFile(this.fs, writer.getPath()); + this.storeSize += sf.getReader().length(); + if(LOG.isDebugEnabled()) { + LOG.debug("Added " + sf + ", entries=" + sf.getReader().getEntries() + + ", sequenceid=" + logCacheFlushId + + ", memsize=" + StringUtils.humanReadableInt(flushed) + + ", filesize=" + StringUtils.humanReadableInt(sf.getReader().length()) + + " to " + this.regioninfo.getRegionNameAsString()); + } + return sf; + } + + /* + * Change storefiles adding into place the Reader produced by this new flush. + * @param logCacheFlushId + * @param sf + * @param cache That was used to make the passed file p. + * @throws IOException + * @return Count of store files. + */ + private int updateStorefiles(final long logCacheFlushId, + final StoreFile sf, final SortedMap cache) + throws IOException { + int count = 0; + this.lock.writeLock().lock(); + try { + this.storefiles.put(Long.valueOf(logCacheFlushId), sf); + count = this.storefiles.size(); + // Tell listeners of the change in readers. + notifyChangedReadersObservers(); + this.memcache.clearSnapshot(cache); + return count; + } finally { + this.lock.writeLock().unlock(); + } + } + + /* + * Notify all observers that set of Readers has changed. + * @throws IOException + */ + private void notifyChangedReadersObservers() throws IOException { + for (ChangedReadersObserver o: this.changedReaderObservers) { + o.updateReaders(); + } + } + + /* + * @param o Observer who wants to know about changes in set of Readers + */ + void addChangedReaderObserver(ChangedReadersObserver o) { + this.changedReaderObservers.add(o); + } + + /* + * @param o Observer no longer interested in changes in set of Readers. + */ + void deleteChangedReaderObserver(ChangedReadersObserver o) { + if (!this.changedReaderObservers.remove(o)) { + LOG.warn("Not in set" + o); + } + } + + ////////////////////////////////////////////////////////////////////////////// + // Compaction + ////////////////////////////////////////////////////////////////////////////// + + /** + * Compact the StoreFiles. This method may take some time, so the calling + * thread must be able to block for long periods. + * + *

During this time, the Store can work as usual, getting values from + * MapFiles and writing new MapFiles from the Memcache. + * + * Existing MapFiles are not destroyed until the new compacted TreeMap is + * completely written-out to disk. + * + * The compactLock prevents multiple simultaneous compactions. + * The structureLock prevents us from interfering with other write operations. + * + * We don't want to hold the structureLock for the whole time, as a compact() + * can be lengthy and we want to allow cache-flushes during this period. + * + * @param mc True to force a major compaction regardless of + * thresholds + * @return row to split around if a split is needed, null otherwise + * @throws IOException + */ + StoreSize compact(final boolean mc) throws IOException { + boolean forceSplit = this.regioninfo.shouldSplit(false); + boolean majorcompaction = mc; + synchronized (compactLock) { + long maxId = -1; + // filesToCompact are sorted oldest to newest. + List filesToCompact = null; + filesToCompact = new ArrayList(this.storefiles.values()); + if (filesToCompact.size() <= 0) { + LOG.debug(this.storeNameStr + ": no store files to compact"); + return null; + } + // The max-sequenceID in any of the to-be-compacted TreeMaps is the + // last key of storefiles. + maxId = this.storefiles.lastKey().longValue(); + // Check to see if we need to do a major compaction on this region. + // If so, change doMajorCompaction to true to skip the incremental + // compacting below. Only check if doMajorCompaction is not true. + if (!majorcompaction) { + majorcompaction = isMajorCompaction(filesToCompact); + } + boolean references = hasReferences(filesToCompact); + if (!majorcompaction && !references && + (forceSplit || (filesToCompact.size() < compactionThreshold))) { + return checkSplit(forceSplit); + } + if (!fs.exists(compactionDir) && !fs.mkdirs(compactionDir)) { + LOG.warn("Mkdir on " + compactionDir.toString() + " failed"); + return checkSplit(forceSplit); + } + + // HBASE-745, preparing all store file sizes for incremental compacting + // selection. + int countOfFiles = filesToCompact.size(); + long totalSize = 0; + long[] fileSizes = new long[countOfFiles]; + long skipped = 0; + int point = 0; + for (int i = 0; i < countOfFiles; i++) { + StoreFile file = filesToCompact.get(i); + Path path = file.getPath(); + if (path == null) { + LOG.warn("Path is null for " + file); + return null; + } + long len = file.getReader().length(); + fileSizes[i] = len; + totalSize += len; + } + if (!majorcompaction && !references) { + // Here we select files for incremental compaction. + // The rule is: if the largest(oldest) one is more than twice the + // size of the second, skip the largest, and continue to next..., + // until we meet the compactionThreshold limit. 
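+        // Illustrative example (numbers are made up, not from a real run):
+        // with fileSizes = [300, 100, 60, 50] and maxFilesToCompact = 10,
+        // 300 >= 2 * 100 so point advances to 1 and the 300-size file is
+        // skipped; only the remaining [100, 60, 50] are compacted together.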
+ for (point = 0; point < countOfFiles - 1; point++) { + if ((fileSizes[point] < fileSizes[point + 1] * 2) && + (countOfFiles - point) <= maxFilesToCompact) { + break; + } + skipped += fileSizes[point]; + } + filesToCompact = new ArrayList(filesToCompact.subList(point, + countOfFiles)); + if (filesToCompact.size() <= 1) { + if (LOG.isDebugEnabled()) { + LOG.debug("Skipped compaction of 1 file; compaction size of " + + this.storeNameStr + ": " + + StringUtils.humanReadableInt(totalSize) + "; Skipped " + point + + " files, size: " + skipped); + } + return checkSplit(forceSplit); + } + if (LOG.isDebugEnabled()) { + LOG.debug("Compaction size of " + this.storeNameStr + ": " + + StringUtils.humanReadableInt(totalSize) + "; Skipped " + point + + " file(s), size: " + skipped); + } + } + + // Step through them, writing to the brand-new file + HFile.Writer writer = StoreFile.getWriter(this.fs, this.homedir); + if (LOG.isDebugEnabled()) { + LOG.debug("Started compaction of " + filesToCompact.size() + " file(s)" + + (references? ", hasReferences=true,": " ") + " into " + + FSUtils.getPath(writer.getPath())); + } + try { + compact(writer, filesToCompact, majorcompaction); + } finally { + // Now, write out an HSTORE_LOGINFOFILE for the brand-new TreeMap. + StoreFile.appendMetadata(writer, maxId, majorcompaction); + writer.close(); + } + + // Move the compaction into place. + completeCompaction(filesToCompact, writer); + if (LOG.isDebugEnabled()) { + LOG.debug("Completed " + (majorcompaction? "major": "") + + " compaction of " + this.storeNameStr + + " store size is " + StringUtils.humanReadableInt(storeSize)); + } + } + return checkSplit(forceSplit); + } + + /* + * @param files + * @return True if any of the files in files are References. + */ + private boolean hasReferences(Collection files) { + if (files != null && files.size() > 0) { + for (StoreFile hsf: files) { + if (hsf.isReference()) { + return true; + } + } + } + return false; + } + + /* + * Gets lowest timestamp from files in a dir + * + * @param fs + * @param dir + * @throws IOException + */ + private static long getLowestTimestamp(FileSystem fs, Path dir) + throws IOException { + FileStatus[] stats = fs.listStatus(dir); + if (stats == null || stats.length == 0) { + return 0l; + } + long lowTimestamp = Long.MAX_VALUE; + for (int i = 0; i < stats.length; i++) { + long timestamp = stats[i].getModificationTime(); + if (timestamp < lowTimestamp){ + lowTimestamp = timestamp; + } + } + return lowTimestamp; + } + + /* + * @return True if we should run a major compaction. + */ + boolean isMajorCompaction() throws IOException { + List filesToCompact = null; + // filesToCompact are sorted oldest to newest. + filesToCompact = new ArrayList(this.storefiles.values()); + return isMajorCompaction(filesToCompact); + } + + /* + * @param filesToCompact Files to compact. Can be null. + * @return True if we should run a major compaction. + */ + private boolean isMajorCompaction(final List filesToCompact) + throws IOException { + boolean result = false; + if (filesToCompact == null || filesToCompact.size() <= 0) { + return result; + } + long lowTimestamp = getLowestTimestamp(fs, + filesToCompact.get(0).getPath().getParent()); + long now = System.currentTimeMillis(); + if (lowTimestamp > 0l && lowTimestamp < (now - this.majorCompactionTime)) { + // Major compaction time has elapsed. 
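+      // With the default MAJOR_COMPACTION_PERIOD fallback of 86400000 ms (one
+      // day), this store becomes a candidate once its oldest file is over a
+      // day old, unless it is already a single major-compacted file still
+      // within the family TTL, in which case rewriting it gains nothing.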
+ long elapsedTime = now - lowTimestamp; + if (filesToCompact.size() == 1 && + filesToCompact.get(0).isMajorCompaction() && + (this.ttl == HConstants.FOREVER || elapsedTime < this.ttl)) { + if (LOG.isDebugEnabled()) { + LOG.debug("Skipping major compaction of " + this.storeNameStr + + " because one (major) compacted file only and elapsedTime " + + elapsedTime + "ms is < ttl=" + this.ttl); + } + } else { + if (LOG.isDebugEnabled()) { + LOG.debug("Major compaction triggered on store " + this.storeNameStr + + "; time since last major compaction " + (now - lowTimestamp) + "ms"); + } + result = true; + } + } + return result; + } + + /* + * @param r StoreFile list to reverse + * @return A new array of content of readers, reversed. + */ + private StoreFile [] reverse(final List r) { + List copy = new ArrayList(r); + Collections.reverse(copy); + // MapFile.Reader is instance of StoreFileReader so this should be ok. + return copy.toArray(new StoreFile[0]); + } + + /* + * @param rdrs List of StoreFiles + * @param keys Current keys + * @param done Which readers are done + * @return The lowest current key in passed rdrs + */ + private int getLowestKey(final HFileScanner [] rdrs, final ByteBuffer [] keys, + final boolean [] done) { + int lowestKey = -1; + for (int i = 0; i < rdrs.length; i++) { + if (done[i]) { + continue; + } + if (lowestKey < 0) { + lowestKey = i; + } else { + RawComparator c = rdrs[i].getReader().getComparator(); + if (c.compare(keys[i].array(), keys[i].arrayOffset(), keys[i].limit(), + keys[lowestKey].array(), keys[lowestKey].arrayOffset(), + keys[lowestKey].limit()) < 0) { + lowestKey = i; + } + } + } + return lowestKey; + } + + /* + * Compact a list of StoreFiles. + * + * We work by iterating through the readers in parallel looking at newest + * store file first. We always increment the lowest-ranked one. Updates to a + * single row/column will appear ranked by timestamp. + * @param compactedOut Where to write compaction. + * @param pReaders List of readers sorted oldest to newest. + * @param majorCompaction True to force a major compaction regardless of + * thresholds + * @throws IOException + */ + private void compact(final HFile.Writer compactedOut, + final List pReaders, final boolean majorCompaction) + throws IOException { + // Reverse order so newest store file is first. + StoreFile[] files = reverse(pReaders); + HFileScanner[] rdrs = new HFileScanner[files.length]; + ByteBuffer[] keys = new ByteBuffer[rdrs.length]; + ByteBuffer[] vals = new ByteBuffer[rdrs.length]; + boolean[] done = new boolean[rdrs.length]; + // Now, advance through the readers in order. This will have the + // effect of a run-time sort of the entire dataset. + int numDone = 0; + for (int i = 0; i < rdrs.length; i++) { + rdrs[i] = files[i].getReader().getScanner(); + done[i] = !rdrs[i].seekTo(); + if (done[i]) { + numDone++; + } else { + keys[i] = rdrs[i].getKey(); + vals[i] = rdrs[i].getValue(); + } + } + + long now = System.currentTimeMillis(); + int timesSeen = 0; + HStoreKey lastSeen = new HStoreKey(); + HStoreKey lastDelete = null; + while (numDone < done.length) { + // Get lowest key in all store files. + int lowestKey = getLowestKey(rdrs, keys, done); + // TODO: Suboptimal. And below where we are going from ByteBuffer to + // byte array. FIX!! Can we get rid of HSK instantiations? + HStoreKey hsk = HStoreKey.create(keys[lowestKey]); + // If its same row and column as last key, increment times seen. 
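+        // timesSeen counts cells seen so far for this row/column; on a major
+        // compaction only the newest family.getMaxVersions() of them are kept,
+        // and expired or deleted cells hand back their slot via the
+        // timesSeen-- below (HBASE-855) so they don't count against the limit.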
+ if (HStoreKey.equalsTwoRowKeys(lastSeen.getRow(), hsk.getRow()) + && Bytes.equals(lastSeen.getColumn(), hsk.getColumn())) { + timesSeen++; + // Reset last delete if not exact timestamp -- lastDelete only stops + // exactly the same key making it out to the compacted store file. + if (lastDelete != null + && lastDelete.getTimestamp() != hsk.getTimestamp()) { + lastDelete = null; + } + } else { + timesSeen = 1; + lastDelete = null; + } + + // Don't write empty rows or columns. Only remove cells on major + // compaction. Remove if expired of > VERSIONS + if (hsk.getRow().length != 0 && hsk.getColumn().length != 0) { + ByteBuffer value = vals[lowestKey]; + if (!majorCompaction) { + // Write out all values if not a major compaction. + compactedOut.append(hsk.getBytes(), Bytes.toBytes(value)); + } else { + boolean expired = false; + boolean deleted = false; + if (timesSeen <= family.getMaxVersions() + && !(expired = isExpired(hsk, ttl, now))) { + // If this value key is same as a deleted key, skip + if (lastDelete != null && hsk.equals(lastDelete)) { + deleted = true; + } else if (HLogEdit.isDeleted(value)) { + // If a deleted value, skip + deleted = true; + lastDelete = hsk; + } else { + compactedOut.append(hsk.getBytes(), Bytes.toBytes(value)); + } + } + if (expired || deleted) { + // HBASE-855 remove one from timesSeen because it did not make it + // past expired check -- don't count against max versions. + timesSeen--; + } + } + } + + // Update last-seen items + lastSeen = hsk; + + // Advance the smallest key. If that reader's all finished, then + // mark it as done. + if (!rdrs[lowestKey].next()) { + done[lowestKey] = true; + rdrs[lowestKey] = null; + numDone++; + } else { + keys[lowestKey] = rdrs[lowestKey].getKey(); + vals[lowestKey] = rdrs[lowestKey].getValue(); + } + } + } + + /* + * It's assumed that the compactLock will be acquired prior to calling this + * method! Otherwise, it is not thread-safe! + * + * It works by processing a compaction that's been written to disk. + * + *

It is usually invoked at the end of a compaction, but might also be + * invoked at Store startup, if the prior execution died midway through. + * + *

Moving the compacted StoreFile into place means: + *

+   * 1) Moving the new compacted StoreFile into place.
+   * 2) Unloading all replaced StoreFiles, closing them and collecting the list to delete.
+   * 3) Loading the new StoreFile into the storefiles map.
+   * 4) Computing the new store size.
+   * 
+ * + * @param compactedFiles list of files that were compacted + * @param compactedFile HStoreFile that is the result of the compaction + * @throws IOException + */ + private void completeCompaction(final List compactedFiles, + final HFile.Writer compactedFile) + throws IOException { + // 1. Moving the new files into place. + Path p = null; + try { + p = StoreFile.rename(this.fs, compactedFile.getPath(), + StoreFile.getRandomFilename(fs, this.homedir)); + } catch (IOException e) { + LOG.error("Failed move of compacted file " + compactedFile.getPath(), e); + return; + } + StoreFile finalCompactedFile = new StoreFile(this.fs, p); + this.lock.writeLock().lock(); + try { + try { + // 3. Loading the new TreeMap. + // Change this.storefiles so it reflects new state but do not + // delete old store files until we have sent out notification of + // change in case old files are still being accessed by outstanding + // scanners. + for (Map.Entry e: this.storefiles.entrySet()) { + if (compactedFiles.contains(e.getValue())) { + this.storefiles.remove(e.getKey()); + } + } + // Add new compacted Reader and store file. + Long orderVal = Long.valueOf(finalCompactedFile.getMaxSequenceId()); + this.storefiles.put(orderVal, finalCompactedFile); + // Tell observers that list of Readers has changed. + notifyChangedReadersObservers(); + // Finally, delete old store files. + for (StoreFile hsf: compactedFiles) { + hsf.delete(); + } + } catch (IOException e) { + e = RemoteExceptionHandler.checkIOException(e); + LOG.error("Failed replacing compacted files for " + + this.storeNameStr + + ". Compacted file is " + finalCompactedFile.toString() + + ". Files replaced are " + compactedFiles.toString() + + " some of which may have been already removed", e); + } + // 4. Compute new store size + this.storeSize = 0L; + for (StoreFile hsf : this.storefiles.values()) { + this.storeSize += hsf.getReader().length(); + } + } finally { + this.lock.writeLock().unlock(); + } + } + + // //////////////////////////////////////////////////////////////////////////// + // Accessors. + // (This is the only section that is directly useful!) + ////////////////////////////////////////////////////////////////////////////// + + /** + * Return all the available columns for the given key. The key indicates a + * row and timestamp, but not a column name. + * + * The returned object should map column names to Cells. + */ + void getFull(HStoreKey key, final Set columns, + final int numVersions, Map results) + throws IOException { + int versions = versionsToReturn(numVersions); + Map deletes = + new TreeMap(Bytes.BYTES_COMPARATOR); + // if the key is null, we're not even looking for anything. return. + if (key == null) { + return; + } + + this.lock.readLock().lock(); + // get from the memcache first. 
+ this.memcache.getFull(key, columns, versions, deletes, results); + try { + Map m = this.storefiles.descendingMap(); + for (Iterator> i = m.entrySet().iterator(); + i.hasNext();) { + getFullFromStoreFile(i.next().getValue(), key, columns, versions, deletes, results); + } + } finally { + this.lock.readLock().unlock(); + } + } + + private void getFullFromStoreFile(StoreFile f, HStoreKey key, + Set columns, int numVersions, Map deletes, + Map results) + throws IOException { + long now = System.currentTimeMillis(); + HFileScanner scanner = f.getReader().getScanner(); + if (!getClosest(scanner, key.getBytes())) { + return; + } + do { + HStoreKey readkey = HStoreKey.create(scanner.getKey()); + byte[] readcol = readkey.getColumn(); + + // if we're looking for this column (or all of them), and there isn't + // already a value for this column in the results map or there is a value + // but we haven't collected enough versions yet, and the key we + // just read matches, then we'll consider it + if ((columns == null || columns.contains(readcol)) && + (!results.containsKey(readcol) || + results.get(readcol).getNumValues() < numVersions) && + key.matchesWithoutColumn(readkey)) { + // if the value of the cell we're looking at right now is a delete, + // we need to treat it differently + ByteBuffer value = scanner.getValue(); + if (HLogEdit.isDeleted(value)) { + // if it's not already recorded as a delete or recorded with a more + // recent delete timestamp, record it for later + if (!deletes.containsKey(readcol) + || deletes.get(readcol).longValue() < readkey.getTimestamp()) { + deletes.put(readcol, Long.valueOf(readkey.getTimestamp())); + } + } else if (!(deletes.containsKey(readcol) && deletes.get(readcol) + .longValue() >= readkey.getTimestamp())) { + // So the cell itself isn't a delete, but there may be a delete + // pending from earlier in our search. Only record this result if + // there aren't any pending deletes. + if (!(deletes.containsKey(readcol) && deletes.get(readcol) + .longValue() >= readkey.getTimestamp())) { + if (!isExpired(readkey, ttl, now)) { + if (!results.containsKey(readcol)) { + results.put(readcol, new Cell(value, readkey.getTimestamp())); + } else { + results.get(readcol).add(Bytes.toBytes(value), + readkey.getTimestamp()); + } + } + } + } + } else if (HStoreKey.compareTwoRowKeys(key.getRow(), readkey.getRow()) < 0) { + // if we've crossed into the next row, then we can just stop + // iterating + break; + } + } while (scanner.next()); + } + + /* + * @param wantedVersions How many versions were asked for. + * @return wantedVersions or this families' MAX_VERSIONS. + */ + private int versionsToReturn(final int wantedVersions) { + if (wantedVersions <= 0) { + throw new IllegalArgumentException("Number of versions must be > 0"); + } + // Make sure we do not return more than maximum versions for this store. + return wantedVersions > this.family.getMaxVersions()? + this.family.getMaxVersions(): wantedVersions; + } + + /** + * Get the value for the indicated HStoreKey. Grab the target value and the + * previous numVersions - 1 values, as well. + * + * Use {@link HConstants.ALL_VERSIONS} to retrieve all versions. + * @param key + * @param numVersions Number of versions to fetch. Must be > 0. + * @return values for the specified versions + * @throws IOException + */ + Cell[] get(final HStoreKey key, final int numVersions) throws IOException { + // This code below is very close to the body of the getKeys method. 
Any + // changes in the flow below should also probably be done in getKeys. + // TODO: Refactor so same code used. + long now = System.currentTimeMillis(); + int versions = versionsToReturn(numVersions); + // Keep a list of deleted cell keys. We need this because as we go through + // the memcache and store files, the cell with the delete marker may be + // in one store and the old non-delete cell value in a later store. + // If we don't keep around the fact that the cell was deleted in a newer + // record, we end up returning the old value if user is asking for more + // than one version. This List of deletes should not be large since we + // are only keeping rows and columns that match those set on the get and + // which have delete values. If memory usage becomes an issue, could + // redo as bloom filter. + Set deletes = new HashSet(); + this.lock.readLock().lock(); + try { + // Check the memcache + List results = this.memcache.get(key, versions, deletes, now); + // If we got sufficient versions from memcache, return. + if (results.size() == versions) { + return results.toArray(new Cell[results.size()]); + } + Map m = this.storefiles.descendingMap(); + byte [] keyBytes = key.getBytes(); + for (Iterator> i = m.entrySet().iterator(); + i.hasNext() && !hasEnoughVersions(versions, results);) { + StoreFile f = i.next().getValue(); + HFileScanner scanner = f.getReader().getScanner(); + if (!getClosest(scanner, keyBytes)) { + continue; + } + HStoreKey readkey = HStoreKey.create(scanner.getKey()); + if (!readkey.matchesRowCol(key)) { + continue; + } + if (get(readkey, scanner.getValue(), versions, results, deletes, now)) { + break; + } + while (scanner.next()) { + readkey = HStoreKey.create(scanner.getKey()); + if (!readkey.matchesRowCol(key)) { + break; + } + if (get(readkey, scanner.getValue(), versions, results, deletes, now)) { + break; + } + } + } + return results.size() == 0 ? + null : results.toArray(new Cell[results.size()]); + } finally { + this.lock.readLock().unlock(); + } + } + + /* + * Look at one key/value. + * @param key + * @param value + * @param versions + * @param results + * @param deletes + * @param now + * @return True if we have enough versions. + */ + private boolean get(final HStoreKey key, ByteBuffer value, + final int versions, final List results, + final Set deletes, final long now) { + if (!HLogEdit.isDeleted(value)) { + if (notExpiredAndNotInDeletes(this.ttl, key, now, deletes)) { + results.add(new Cell(value, key.getTimestamp())); + } + // Perhaps only one version is wanted. I could let this + // test happen later in the for loop test but it would cost + // the allocation of an ImmutableBytesWritable. + if (hasEnoughVersions(versions, results)) { + return true; + } + } else { + // Is this copy necessary? + deletes.add(new HStoreKey(key)); + } + return false; + } + + /* + * Small method to check if we are over the max number of versions + * or we acheived this family max versions. + * The later happens when we have the situation described in HBASE-621. + * @param versions + * @param c + * @return + */ + private boolean hasEnoughVersions(final int versions, final List c) { + return c.size() >= versions; + } + + /** + * Get versions of keys matching the origin key's + * row/column/timestamp and those of an older vintage. + * @param origin Where to start searching. + * @param versions How many versions to return. Pass + * {@link HConstants#ALL_VERSIONS} to retrieve all. + * @param now + * @param columnPattern regex pattern for column matching. 
if columnPattern + * is not null, we use column pattern to match columns. And the columnPattern + * only works when origin's column is null or its length is zero. + * @return Matching keys. + * @throws IOException + */ + public List getKeys(final HStoreKey origin, final int versions, + final long now, final Pattern columnPattern) + throws IOException { + // This code below is very close to the body of the get method. Any + // changes in the flow below should also probably be done in get. + // TODO: Refactor so same code used. + Set deletes = new HashSet(); + this.lock.readLock().lock(); + try { + // Check the memcache + List keys = + this.memcache.getKeys(origin, versions, deletes, now, columnPattern); + // If we got sufficient versions from memcache, return. + if (keys.size() >= versions) { + return keys; + } + Map m = this.storefiles.descendingMap(); + for (Iterator> i = m.entrySet().iterator(); + i.hasNext() && keys.size() < versions;) { + StoreFile f = i.next().getValue(); + HFileScanner scanner = f.getReader().getScanner(); + if (!getClosest(scanner, origin.getBytes())) { + continue; + } + do { + HStoreKey readkey = HStoreKey.create(scanner.getKey()); + // if the row and column matches, we might want this one. + if (rowMatches(origin, readkey)) { + // if the column pattern is not null, we use it for column matching. + // we will skip the keys whose column doesn't match the pattern. + if (columnPattern != null) { + if (!(columnPattern. + matcher(Bytes.toString(readkey.getColumn())).matches())) { + continue; + } + } + // if the cell address matches, then we definitely want this key. + if (cellMatches(origin, readkey)) { + ByteBuffer readval = scanner.getValue(); + // Store key if isn't deleted or superceded by memcache + if (!HLogEdit.isDeleted(readval)) { + if (notExpiredAndNotInDeletes(this.ttl, readkey, now, deletes)) { + keys.add(readkey); + } + if (keys.size() >= versions) { + break; + } + } else { + deletes.add(readkey); + } + } else { + // the cell doesn't match, but there might be more with different + // timestamps, so move to the next key + continue; + } + } else { + // the row doesn't match, so we've gone too far. + break; + } + } while (scanner.next()); // advance to the next key + } + return keys; + } finally { + this.lock.readLock().unlock(); + } + } + + /** + * Find the key that matches row exactly, or the one that immediately + * preceeds it. WARNING: Only use this method on a table where writes occur + * with stricly increasing timestamps. This method assumes this pattern of + * writes in order to make it reasonably performant. + * @param row + * @return Found row + * @throws IOException + */ + @SuppressWarnings("unchecked") + byte [] getRowKeyAtOrBefore(final byte [] row) + throws IOException{ + // Map of HStoreKeys that are candidates for holding the row key that + // most closely matches what we're looking for. We'll have to update it as + // deletes are found all over the place as we go along before finally + // reading the best key out of it at the end. + NavigableMap candidateKeys = new TreeMap( + new HStoreKey.HStoreKeyWritableComparator()); + + // Keep a list of deleted cell keys. We need this because as we go through + // the store files, the cell with the delete marker may be in one file and + // the old non-delete cell value in a later store file. If we don't keep + // around the fact that the cell was deleted in a newer record, we end up + // returning the old value if user is asking for more than one version. 
+ // This List of deletes should not be large since we are only keeping rows + // and columns that match those set on the scanner and which have delete + // values. If memory usage becomes an issue, could redo as bloom filter. + Set deletes = new HashSet(); + this.lock.readLock().lock(); + try { + // First go to the memcache. Pick up deletes and candidates. + this.memcache.getRowKeyAtOrBefore(row, candidateKeys, deletes); + // Process each store file. Run through from newest to oldest. + // This code below is very close to the body of the getKeys method. + Map m = this.storefiles.descendingMap(); + for (Map.Entry e: m.entrySet()) { + // Update the candidate keys from the current map file + rowAtOrBeforeFromStoreFile(e.getValue(), row, candidateKeys, deletes); + } + // Return the best key from candidateKeys + byte [] result = + candidateKeys.isEmpty()? null: candidateKeys.lastKey().getRow(); + return result; + } finally { + this.lock.readLock().unlock(); + } + } + + /* + * Check an individual MapFile for the row at or before a given key + * and timestamp + * @param f + * @param row + * @param candidateKeys + * @throws IOException + */ + private void rowAtOrBeforeFromStoreFile(final StoreFile f, + final byte [] row, final SortedMap candidateKeys, + final Set deletes) + throws IOException { + HFileScanner scanner = f.getReader().getScanner(); + // TODO: FIX THIS PROFLIGACY!!! + if (!scanner.seekBefore(new HStoreKey(row).getBytes())) { + return; + } + long now = System.currentTimeMillis(); + HStoreKey startKey = HStoreKey.create(scanner.getKey()); + // if there aren't any candidate keys yet, we'll do some things different + if (candidateKeys.isEmpty()) { + rowAtOrBeforeCandidate(startKey, f, row, candidateKeys, deletes, now); + } else { + rowAtOrBeforeWithCandidates(startKey, f, row, candidateKeys, deletes, now); + } + } + + /* Find a candidate for row that is at or before passed row in passed + * mapfile. + * @param startKey First key in the mapfile. + * @param map + * @param row + * @param candidateKeys + * @param now + * @throws IOException + */ + private void rowAtOrBeforeCandidate(final HStoreKey startKey, + final StoreFile f, final byte[] row, + final SortedMap candidateKeys, + final Set deletes, final long now) + throws IOException { + // if the row we're looking for is past the end of this mapfile, set the + // search key to be the last key. If its a deleted key, then we'll back + // up to the row before and return that. + HStoreKey finalKey = HStoreKey.create(f.getReader().getLastKey()); + HStoreKey searchKey = null; + if (HStoreKey.compareTwoRowKeys(finalKey.getRow(), row) < 0) { + searchKey = finalKey; + } else { + searchKey = new HStoreKey(row); + if (searchKey.compareTo(startKey) < 0) { + searchKey = startKey; + } + } + rowAtOrBeforeCandidate(f, searchKey, candidateKeys, deletes, now); + } + + /* + * @param ttlSetting + * @param hsk + * @param now + * @param deletes + * @return True if key has not expired and is not in passed set of deletes. + */ + static boolean notExpiredAndNotInDeletes(final long ttl, + final HStoreKey hsk, final long now, final Set deletes) { + return !isExpired(hsk, ttl, now) && + (deletes == null || !deletes.contains(hsk)); + } + + static boolean isExpired(final HStoreKey hsk, final long ttl, + final long now) { + return ttl != HConstants.FOREVER && now > hsk.getTimestamp() + ttl; + } + + /* Find a candidate for row that is at or before passed key, sk, in mapfile. + * @param f + * @param sk Key to go search the mapfile with. 
+ * @param candidateKeys + * @param now + * @throws IOException + * @see {@link #rowAtOrBeforeCandidate(HStoreKey, org.apache.hadoop.io.MapFile.Reader, byte[], SortedMap, long)} + */ + private void rowAtOrBeforeCandidate(final StoreFile f, + final HStoreKey sk, final SortedMap candidateKeys, + final Set deletes, final long now) + throws IOException { + HStoreKey searchKey = sk; + HStoreKey readkey = null; + HStoreKey knownNoGoodKey = null; + HFileScanner scanner = f.getReader().getScanner(); + for (boolean foundCandidate = false; !foundCandidate;) { + // Seek to the exact row, or the one that would be immediately before it + int result = scanner.seekTo(searchKey.getBytes()); + if (result < 0) { + // Not in file. + continue; + } + HStoreKey deletedOrExpiredRow = null; + do { + readkey = HStoreKey.create(scanner.getKey()); + ByteBuffer value = scanner.getValue(); + // If we have an exact match on row, and it's not a delete, save this + // as a candidate key + if (HStoreKey.equalsTwoRowKeys(readkey.getRow(), searchKey.getRow())) { + if (!HLogEdit.isDeleted(value)) { + if (handleNonDelete(readkey, now, deletes, candidateKeys)) { + foundCandidate = true; + // NOTE! Continue. + continue; + } + } + HStoreKey copy = addCopyToDeletes(readkey, deletes); + if (deletedOrExpiredRow == null) { + deletedOrExpiredRow = copy; + } + } else if (HStoreKey.compareTwoRowKeys(readkey.getRow(), + searchKey.getRow()) > 0) { + // if the row key we just read is beyond the key we're searching for, + // then we're done. + break; + } else { + // So, the row key doesn't match, but we haven't gone past the row + // we're seeking yet, so this row is a candidate for closest + // (assuming that it isn't a delete). + if (!HLogEdit.isDeleted(value)) { + if (handleNonDelete(readkey, now, deletes, candidateKeys)) { + foundCandidate = true; + // NOTE: Continue + continue; + } + } + HStoreKey copy = addCopyToDeletes(readkey, deletes); + if (deletedOrExpiredRow == null) { + deletedOrExpiredRow = copy; + } + } + } while(scanner.next() && (knownNoGoodKey == null || + readkey.compareTo(knownNoGoodKey) < 0)); + + // If we get here and have no candidates but we did find a deleted or + // expired candidate, we need to look at the key before that + if (!foundCandidate && deletedOrExpiredRow != null) { + knownNoGoodKey = deletedOrExpiredRow; + if (!scanner.seekBefore(deletedOrExpiredRow.getBytes())) { + // Is this right? + break; + } + searchKey = HStoreKey.create(scanner.getKey()); + } else { + // No candidates and no deleted or expired candidates. Give up. + break; + } + } + + // Arriving here just means that we consumed the whole rest of the map + // without going "past" the key we're searching for. we can just fall + // through here. + } + + /* + * @param key Key to copy and add to deletes + * @param deletes + * @return Instance of the copy added to deletes + */ + private HStoreKey addCopyToDeletes(final HStoreKey key, + final Set deletes) { + HStoreKey copy = new HStoreKey(key); + deletes.add(copy); + return copy; + } + + private void rowAtOrBeforeWithCandidates(final HStoreKey startKey, + final StoreFile f, final byte[] row, + final SortedMap candidateKeys, + final Set deletes, final long now) + throws IOException { + // if there are already candidate keys, we need to start our search + // at the earliest possible key so that we can discover any possible + // deletes for keys between the start and the search key. 
Back up to start + // of the row in case there are deletes for this candidate in this mapfile + // BUT do not backup before the first key in the store file. + // TODO: FIX THIS PROFLIGATE OBJECT MAKING!!! + byte [] searchKey = + new HStoreKey(candidateKeys.firstKey().getRow()).getBytes(); + if (f.getReader().getComparator().compare(searchKey, 0, searchKey.length, + startKey.getRow(), 0, startKey.getRow().length) < 0) { + searchKey = startKey.getBytes(); + } + + // Seek to the exact row, or the one that would be immediately before it + HFileScanner scanner = f.getReader().getScanner(); + int result = scanner.seekTo(searchKey); + if (result < 0) { + // Key is before start of this file. Return. + return; + } + do { + HStoreKey k = HStoreKey.create(scanner.getKey()); + ByteBuffer v = scanner.getValue(); + // if we have an exact match on row, and it's not a delete, save this + // as a candidate key + if (HStoreKey.equalsTwoRowKeys(k.getRow(), row)) { + handleKey(k, v, now, deletes, candidateKeys); + } else if (HStoreKey.compareTwoRowKeys(k.getRow(), row) > 0 ) { + // if the row key we just read is beyond the key we're searching for, + // then we're done. + break; + } else { + // So, the row key doesn't match, but we haven't gone past the row + // we're seeking yet, so this row is a candidate for closest + // (assuming that it isn't a delete). + handleKey(k, v, now, deletes, candidateKeys); + } + } while(scanner.next()); + } + + /* + * @param readkey + * @param now + * @param deletes + * @param candidateKeys + */ + private void handleKey(final HStoreKey readkey, ByteBuffer value, + final long now, final Set deletes, + final SortedMap candidateKeys) { + if (!HLogEdit.isDeleted(value)) { + handleNonDelete(readkey, now, deletes, candidateKeys); + } else { + // Pass copy because readkey will change next time next is called. + handleDeleted(new HStoreKey(readkey), candidateKeys, deletes); + } + } + + /* + * @param readkey + * @param now + * @param deletes + * @param candidateKeys + * @return True if we added a candidate. + */ + private boolean handleNonDelete(final HStoreKey readkey, final long now, + final Set deletes, final Map candidateKeys) { + if (notExpiredAndNotInDeletes(this.ttl, readkey, now, deletes)) { + candidateKeys.put(stripTimestamp(readkey), + Long.valueOf(readkey.getTimestamp())); + return true; + } + return false; + } + + /* Handle keys whose values hold deletes. + * Add to the set of deletes and then if the candidate keys contain any that + * might match by timestamp, then check for a match and remove it if it's too + * young to survive the delete + * @param k Be careful; if key was gotten from a Mapfile, pass in a copy. + * Values gotten by 'nexting' out of Mapfiles will change in each invocation. + * @param candidateKeys + * @param deletes + */ + static void handleDeleted(final HStoreKey k, + final SortedMap candidateKeys, + final Set deletes) { + deletes.add(k); + HStoreKey strippedKey = stripTimestamp(k); + if (candidateKeys.containsKey(strippedKey)) { + long bestCandidateTs = + candidateKeys.get(strippedKey).longValue(); + if (bestCandidateTs <= k.getTimestamp()) { + candidateKeys.remove(strippedKey); + } + } + } + + static HStoreKey stripTimestamp(HStoreKey key) { + return new HStoreKey(key.getRow(), key.getColumn()); + } + + /* + * Test that the target matches the origin cell address. If the + * origin has an empty column, then it's assumed to mean any column + * matches and only match on row and timestamp. Otherwise, it compares the + * keys with HStoreKey.matchesRowCol(). 
+ * @param origin The key we're testing against + * @param target The key we're testing + */ + private boolean cellMatches(HStoreKey origin, HStoreKey target){ + // if the origin's column is empty, then we're matching any column + if (Bytes.equals(origin.getColumn(), HConstants.EMPTY_BYTE_ARRAY)) { + // if the row matches, then... + if (HStoreKey.equalsTwoRowKeys(target.getRow(), origin.getRow())) { + // check the timestamp + return target.getTimestamp() <= origin.getTimestamp(); + } + return false; + } + // otherwise, we want to match on row and column + return target.matchesRowCol(origin); + } + + /* + * Test that the target matches the origin. If the origin + * has an empty column, then it just tests row equivalence. Otherwise, it uses + * HStoreKey.matchesRowCol(). + * @param origin Key we're testing against + * @param target Key we're testing + */ + private boolean rowMatches(final HStoreKey origin, final HStoreKey target){ + // if the origin's column is empty, then we're matching any column + if (Bytes.equals(origin.getColumn(), HConstants.EMPTY_BYTE_ARRAY)) { + // if the row matches, then... + return HStoreKey.equalsTwoRowKeys(target.getRow(), origin.getRow()); + } + // otherwise, we want to match on row and column + return target.matchesRowCol(origin); + } + + /** + * Determines if HStore can be split + * @param force Whether to force a split or not. + * @return a StoreSize if store can be split, null otherwise. + */ + StoreSize checkSplit(final boolean force) { + this.lock.readLock().lock(); + try { + // Iterate through all store files + if (this.storefiles.size() <= 0) { + return null; + } + if (!force && (storeSize < this.desiredMaxFileSize)) { + return null; + } + // Not splitable if we find a reference store file present in the store. + boolean splitable = true; + long maxSize = 0L; + Long mapIndex = Long.valueOf(0L); + for (Map.Entry e: storefiles.entrySet()) { + StoreFile curHSF = e.getValue(); + if (splitable) { + splitable = !curHSF.isReference(); + if (!splitable) { + // RETURN IN MIDDLE OF FUNCTION!!! If not splitable, just return. + if (LOG.isDebugEnabled()) { + LOG.debug(curHSF + " is not splittable"); + } + return null; + } + } + long size = curHSF.getReader().length(); + if (size > maxSize) { + // This is the largest one so far + maxSize = size; + mapIndex = e.getKey(); + } + } + + HFile.Reader r = this.storefiles.get(mapIndex).getReader(); + // Get first, last, and mid keys. Midkey is the key that starts block + // in middle of hfile. Has column and timestamp. Need to return just + // the row we want to split on as midkey. + byte [] midkey = r.midkey(); + if (midkey != null) { + HStoreKey mk = HStoreKey.create(midkey); + HStoreKey firstKey = HStoreKey.create(r.getFirstKey()); + HStoreKey lastKey = HStoreKey.create(r.getLastKey()); + // if the midkey is the same as the first and last keys, then we cannot + // (ever) split this region. 
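+        // e.g. a store file whose every key shares one row yields identical
+        // first, mid and last rows, so there is no row boundary to split at.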
+ if (HStoreKey.equalsTwoRowKeys(mk.getRow(), firstKey.getRow()) && + HStoreKey.equalsTwoRowKeys( mk.getRow(), lastKey.getRow())) { + if (LOG.isDebugEnabled()) { + LOG.debug("cannot split because midkey is the same as first or " + + "last row"); + } + return null; + } + return new StoreSize(maxSize, mk.getRow()); + } + } catch(IOException e) { + LOG.warn("Failed getting store size for " + this.storeNameStr, e); + } finally { + this.lock.readLock().unlock(); + } + return null; + } + + /** @return aggregate size of HStore */ + public long getSize() { + return storeSize; + } + + ////////////////////////////////////////////////////////////////////////////// + // File administration + ////////////////////////////////////////////////////////////////////////////// + + /** + * Return a scanner for both the memcache and the HStore files + */ + protected InternalScanner getScanner(long timestamp, byte [][] targetCols, + byte [] firstRow, RowFilterInterface filter) + throws IOException { + lock.readLock().lock(); + try { + return new StoreScanner(this, targetCols, firstRow, timestamp, filter); + } finally { + lock.readLock().unlock(); + } + } + + @Override + public String toString() { + return this.storeNameStr; + } + + /** + * @return Count of store files + */ + int getStorefilesCount() { + return this.storefiles.size(); + } + + /** + * @return The size of the store file indexes, in bytes. + * @throws IOException if there was a problem getting file sizes from the + * filesystem + */ + long getStorefilesIndexSize() throws IOException { + long size = 0; + for (StoreFile s: storefiles.values()) + size += s.getReader().indexSize(); + return size; + } + + /* + * Datastructure that holds size and row to split a file around. + */ + class StoreSize { + private final long size; + private final byte[] key; + StoreSize(long size, byte[] key) { + this.size = size; + this.key = new byte[key.length]; + System.arraycopy(key, 0, this.key, 0, key.length); + } + /* @return the size */ + long getSize() { + return size; + } + /* @return the key */ + byte[] getSplitRow() { + return key; + } + } + + HRegionInfo getHRegionInfo() { + return this.regioninfo; + } + + /** + * Convenience method that implements the old MapFile.getClosest on top of + * HFile Scanners. getClosest used seek to the asked-for key or just after + * (HFile seeks to the key or just before). + * @param s + * @param b + * @return True if we were able to seek the scanner to b or to + * the key just after. + * @throws IOException + */ + static boolean getClosest(final HFileScanner s, final byte [] b) + throws IOException { + int result = s.seekTo(b); + if (result < 0) { + // Not in file. Will the first key do? + if (!s.seekTo()) { + return false; + } + } else if (result > 0) { + // Less than what was asked for but maybe < because we're asking for + // r/c/LATEST_TIMESTAMP -- what was returned was r/c-1/SOME_TS... + // A next will get us a r/c/SOME_TS. + if (!s.next()) { + return false; + } + } + return true; + } +} diff --git a/src/java/org/apache/hadoop/hbase/regionserver/StoreFile.java b/src/java/org/apache/hadoop/hbase/regionserver/StoreFile.java new file mode 100644 index 00000000000..133ef2cef8c --- /dev/null +++ b/src/java/org/apache/hadoop/hbase/regionserver/StoreFile.java @@ -0,0 +1,443 @@ +/** + * Copyright 2009 The Apache Software Foundation + * + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. 
See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.hadoop.hbase.regionserver; + +import java.io.FileNotFoundException; +import java.io.IOException; +import java.util.Map; +import java.util.Random; +import java.util.concurrent.atomic.AtomicBoolean; +import java.util.regex.Matcher; +import java.util.regex.Pattern; + +import org.apache.commons.logging.Log; +import org.apache.commons.logging.LogFactory; +import org.apache.hadoop.fs.FileSystem; +import org.apache.hadoop.fs.Path; +import org.apache.hadoop.hbase.HConstants; +import org.apache.hadoop.hbase.HStoreKey; +import org.apache.hadoop.hbase.io.HalfHFileReader; +import org.apache.hadoop.hbase.io.Reference; +import org.apache.hadoop.hbase.io.hfile.HFile; +import org.apache.hadoop.hbase.util.Bytes; +import org.apache.hadoop.io.RawComparator; + +/** + * A Store data file. Stores usually have one or more of these files. They + * are produced by flushing the memcache to disk. To + * create, call {@link #getWriter(FileSystem, Path)} and append data. Be + * sure to add any metadata before calling close on the Writer + * (Use the appendMetadata convenience methods). On close, a StoreFile is + * sitting in the Filesystem. To refer to it, create a StoreFile instance + * passing filesystem and path. To read, call {@link #getReader()}. + *
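+ * <p>A minimal write/read sketch (illustrative only; fs, familyDir, key,
+ * value and maxSeqId are placeholders, and error handling is omitted):
+ * <pre>
+ *   HFile.Writer w = StoreFile.getWriter(fs, familyDir);
+ *   try {
+ *     w.append(key.getBytes(), value);
+ *     StoreFile.appendMetadata(w, maxSeqId);
+ *   } finally {
+ *     w.close();
+ *   }
+ *   StoreFile sf = new StoreFile(fs, w.getPath());
+ *   HFile.Reader r = sf.getReader();
+ * </pre>
+ *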

StoreFiles may also reference store files in another Store. + */ +public class StoreFile implements HConstants { + static final Log LOG = LogFactory.getLog(StoreFile.class.getName()); + + // Make default block size for StoreFiles 8k while testing. TODO: FIX! + // Need to make it 8k for testing. + private static final int DEFAULT_BLOCKSIZE_SMALL = 8 * 1024; + + private final FileSystem fs; + // This file's path. + private final Path path; + // If this storefile references another, this is the reference instance. + private Reference reference; + // If this StoreFile references another, this is the other files path. + private Path referencePath; + + // Keys for metadata stored in backing HFile. + private static final byte [] MAX_SEQ_ID_KEY = Bytes.toBytes("MAX_SEQ_ID_KEY"); + // Set when we obtain a Reader. + private long sequenceid = -1; + + private static final byte [] MAJOR_COMPACTION_KEY = + Bytes.toBytes("MAJOR_COMPACTION_KEY"); + // If true, this file was product of a major compaction. Its then set + // whenever you get a Reader. + private AtomicBoolean majorCompaction = null; + + /* + * Regex that will work for straight filenames and for reference names. + * If reference, then the regex has more than just one group. Group 1 is + * this files id. Group 2 the referenced region name, etc. + */ + private static final Pattern REF_NAME_PARSER = + Pattern.compile("^(\\d+)(?:\\.(.+))?$"); + + private volatile HFile.Reader reader; + + // Used making file ids. + private final static Random rand = new Random(); + + /** + * Constructor. + * Loads up a Reader (and its indices, etc.). + * @param fs Filesystem. + * @param p qualified path + * @throws IOException + */ + StoreFile(final FileSystem fs, final Path p) + throws IOException { + this.fs = fs; + this.path = p; + if (isReference(p)) { + this.reference = Reference.read(fs, p); + this.referencePath = getReferredToFile(this.path); + } + this.reader = open(); + } + + /** + * @return Path or null if this StoreFile was made with a Stream. + */ + Path getPath() { + return this.path; + } + + /** + * @return The Store/ColumnFamily this file belongs to. + */ + byte [] getFamily() { + return Bytes.toBytes(this.path.getParent().getName()); + } + + /** + * @return True if this is a StoreFile Reference; call after {@link #open()} + * else may get wrong answer. + */ + boolean isReference() { + return this.reference != null; + } + + /** + * @param p Path to check. + * @return True if the path has format of a HStoreFile reference. + */ + public static boolean isReference(final Path p) { + return isReference(p, REF_NAME_PARSER.matcher(p.getName())); + } + + /** + * @param p Path to check. + * @param m Matcher to use. + * @return True if the path has format of a HStoreFile reference. + */ + public static boolean isReference(final Path p, final Matcher m) { + if (m == null || !m.matches()) { + LOG.warn("Failed match of store file name " + p.toString()); + throw new RuntimeException("Failed match of store file name " + + p.toString()); + } + return m.groupCount() > 1 && m.group(2) != null; + } + + /* + * Return path to the file referred to by a Reference. Presumes a directory + * hierarchy of ${hbase.rootdir}/tablename/regionname/familyname. + * @param p Path to a Reference file. + * @return Calculated path to parent region file. 
+ * @throws IOException + */ + static Path getReferredToFile(final Path p) throws IOException { + Matcher m = REF_NAME_PARSER.matcher(p.getName()); + if (m == null || !m.matches()) { + LOG.warn("Failed match of store file name " + p.toString()); + throw new RuntimeException("Failed match of store file name " + + p.toString()); + } + // The other region name is the suffix on the passed Reference file name. + String otherRegion = m.group(2); + // Tabledir is up two directories from where the Reference was written. + Path tableDir = p.getParent().getParent().getParent(); + String nameStrippedOfSuffix = m.group(1); + // Build up new path with the referenced region in place of our current + // region in the reference path. Also strip regionname suffix from name. + return new Path(new Path(new Path(tableDir, otherRegion), + p.getParent().getName()), nameStrippedOfSuffix); + } + + /** + * @return True if this file was made by a major compaction. + */ + boolean isMajorCompaction() { + if (this.majorCompaction == null) { + throw new NullPointerException("This has not been set yet"); + } + return this.majorCompaction.get(); + } + + /** + * @return This file's maximum edit sequence id. + */ + public long getMaxSequenceId() { + if (this.sequenceid == -1) { + throw new IllegalAccessError("Has not been initialized"); + } + return this.sequenceid; + } + + /** + * Opens the reader on this store file. Called by the constructor. + * @return Reader for the store file. + * @throws IOException + * @see #close() + */ + protected HFile.Reader open() + throws IOException { + if (this.reader != null) { + throw new IllegalAccessError("Already open"); + } + if (isReference()) { + this.reader = new HalfHFileReader(this.fs, this.referencePath, null, + this.reference); + } else { + this.reader = new HFile.Reader(this.fs, this.path, null); + } + // Load up indices and fileinfo. + Map map = this.reader.loadFileInfo(); + // Read in our metadata. + byte [] b = map.get(MAX_SEQ_ID_KEY); + if (b != null) { + // By convention, if this is a half HFile, the top half has a sequence + // number greater than the bottom half. That's why we add one below. It's + // done in case the two halves are ever merged back together (rare). + // Without it, on open of the store, since store files are distinguished + // by sequence id, the one half would subsume the other. + this.sequenceid = Bytes.toLong(b); + if (isReference()) { + if (Reference.isTopFileRegion(this.reference.getFileRegion())) { + this.sequenceid += 1; + } + } + + } + b = map.get(MAJOR_COMPACTION_KEY); + if (b != null) { + boolean mc = Bytes.toBoolean(b); + if (this.majorCompaction == null) { + this.majorCompaction = new AtomicBoolean(mc); + } else { + this.majorCompaction.set(mc); + } + } + return this.reader; + } + + /** + * @return Current reader. Must call open first. + */ + public HFile.Reader getReader() { + if (this.reader == null) { + throw new IllegalAccessError("Call open first"); + } + return this.reader; + } + + /** + * @throws IOException + * @see #open() + */ + public synchronized void close() throws IOException { + if (this.reader != null) { + this.reader.close(); + this.reader = null; + } + } + + public String toString() { + return this.path.toString() + + (isReference()? "-" + this.referencePath + "-" + reference.toString(): ""); + } + + /** + * Delete this file. + * @throws IOException + */ + public void delete() throws IOException { + close(); + this.fs.delete(getPath(), true); + } + + /** + * Utility to help with rename. + * @param fs + * @param src + * @param tgt + * @return True if succeeded.
+ * @throws IOException + */ + public static Path rename(final FileSystem fs, final Path src, + final Path tgt) + throws IOException { + if (!fs.exists(src)) { + throw new FileNotFoundException(src.toString()); + } + if (!fs.rename(src, tgt)) { + throw new IOException("Failed rename of " + src + " to " + tgt); + } + return tgt; + } + + /** + * Get a store file writer. Client is responsible for closing file when done. + * If metadata, add BEFORE closing using + * {@link #appendMetadata(org.apache.hadoop.hbase.io.hfile.HFile.Writer, long)}. + * @param fs + * @param dir Path to family directory. Makes the directory if doesn't exist. + * Creates a file with a unique name in this directory. + * @return HFile.Writer + * @throws IOException + */ + public static HFile.Writer getWriter(final FileSystem fs, final Path dir) + throws IOException { + return getWriter(fs, dir, DEFAULT_BLOCKSIZE_SMALL, null, null); + } + + /** + * Get a store file writer. Client is responsible for closing file when done. + * If metadata, add BEFORE closing using + * {@link #appendMetadata(org.apache.hadoop.hbase.io.hfile.HFile.Writer, long)}. + * @param fs + * @param dir Path to family directory. Makes the directory if doesn't exist. + * Creates a file with a unique name in this directory. + * @param blocksize + * @param algorithm Pass null to get default. + * @param c Pass null to get default. + * @return HFile.Writer + * @throws IOException + */ + public static HFile.Writer getWriter(final FileSystem fs, final Path dir, + final int blocksize, final String algorithm, final RawComparator c) + throws IOException { + if (!fs.exists(dir)) { + fs.mkdirs(dir); + } + Path path = getUniqueFile(fs, dir); + return new HFile.Writer(fs, path, blocksize, + algorithm == null? HFile.DEFAULT_COMPRESSION: algorithm, + c == null? HStoreKey.BYTECOMPARATOR: c); + } + + /** + * @param fs + * @param p + * @return random filename inside passed dir + */ + static Path getUniqueFile(final FileSystem fs, final Path p) + throws IOException { + if (!fs.getFileStatus(p).isDir()) { + throw new IOException("Expecting a directory"); + } + return fs.getFileStatus(p).isDir()? getRandomFilename(fs, p): p; + } + + /** + * @param fs + * @param dir + * @param encodedRegionName + * @param family + * @return Path to a file that doesn't exist at time of this invocation. + * @throws IOException + */ + static Path getRandomFilename(final FileSystem fs, final Path dir) + throws IOException { + return getRandomFilename(fs, dir, null); + } + + /** + * @param fs + * @param dir + * @param encodedRegionName + * @param family + * @param suffix + * @return Path to a file that doesn't exist at time of this invocation. + * @throws IOException + */ + static Path getRandomFilename(final FileSystem fs, final Path dir, + final String suffix) + throws IOException { + long id = -1; + Path p = null; + do { + id = Math.abs(rand.nextLong()); + p = new Path(dir, Long.toString(id) + + ((suffix == null || suffix.length() <= 0)? "": suffix)); + } while(fs.exists(p)); + return p; + } + + /** + * Write file metadata. + * Call before you call close on the passed w since its written + * as metadata to that file. + * + * @param filesystem file system + * @param maxSequenceId Maximum sequence id. + * @throws IOException + */ + static void appendMetadata(final HFile.Writer w, final long maxSequenceId) + throws IOException { + appendMetadata(w, maxSequenceId, false); + } + + /** + * Writes metadata. + * Call before you call close on the passed w since its written + * as metadata to that file. 
+ * @param maxSequenceId Maximum sequence id. + * @param mc True if this file is product of a major compaction + * @throws IOException + */ + static void appendMetadata(final HFile.Writer w, final long maxSequenceId, + final boolean mc) + throws IOException { + w.appendFileInfo(MAX_SEQ_ID_KEY, Bytes.toBytes(maxSequenceId)); + w.appendFileInfo(MAJOR_COMPACTION_KEY, Bytes.toBytes(mc)); + } + + /* + * Write out a split reference. + * @param fs + * @param splitDir Presumes path format is actually + * SOME_DIRECTORY/REGIONNAME/FAMILY. + * @param f File to split. + * @param splitRow + * @param range + * @return Path to created reference. + * @throws IOException + */ + static Path split(final FileSystem fs, final Path splitDir, + final StoreFile f, final byte [] splitRow, final Reference.Range range) + throws IOException { + // A reference to the bottom half of the hsf store file. + Reference r = new Reference(new HStoreKey(splitRow).getBytes(), range); + // Add the referred-to regions name as a dot separated suffix. + // See REF_NAME_PARSER regex above. The referred-to regions name is + // up in the path of the passed in f -- parentdir is family, + // then the directory above is the region name. + String parentRegionName = f.getPath().getParent().getParent().getName(); + // Write reference with same file id only with the other region name as + // suffix and into the new region location (under same family). + Path p = new Path(splitDir, f.getPath().getName() + "." + parentRegionName); + return r.write(fs, p); + } +} \ No newline at end of file diff --git a/src/java/org/apache/hadoop/hbase/regionserver/HStoreScanner.java b/src/java/org/apache/hadoop/hbase/regionserver/StoreScanner.java similarity index 91% rename from src/java/org/apache/hadoop/hbase/regionserver/HStoreScanner.java rename to src/java/org/apache/hadoop/hbase/regionserver/StoreScanner.java index 33f5a3b9a8c..ccd5d605048 100644 --- a/src/java/org/apache/hadoop/hbase/regionserver/HStoreScanner.java +++ b/src/java/org/apache/hadoop/hbase/regionserver/StoreScanner.java @@ -36,13 +36,12 @@ import org.apache.hadoop.hbase.HStoreKey; import org.apache.hadoop.hbase.filter.RowFilterInterface; import org.apache.hadoop.hbase.io.Cell; import org.apache.hadoop.hbase.util.Bytes; -import org.apache.hadoop.hbase.io.MapFile; /** * Scanner scans both the memcache and the HStore */ -class HStoreScanner implements InternalScanner, ChangedReadersObserver { - static final Log LOG = LogFactory.getLog(HStoreScanner.class); +class StoreScanner implements InternalScanner, ChangedReadersObserver { + static final Log LOG = LogFactory.getLog(StoreScanner.class); private InternalScanner[] scanners; private TreeMap[] resultSets; @@ -50,7 +49,7 @@ class HStoreScanner implements InternalScanner, ChangedReadersObserver { private boolean wildcardMatch = false; private boolean multipleMatchers = false; private RowFilterInterface dataFilter; - private HStore store; + private Store store; private final long timestamp; private final byte [][] targetCols; @@ -66,7 +65,7 @@ class HStoreScanner implements InternalScanner, ChangedReadersObserver { /** Create an Scanner with a handle on the memcache and HStore files. 
*/ @SuppressWarnings("unchecked") - HStoreScanner(HStore store, byte [][] targetCols, byte [] firstRow, + StoreScanner(Store store, byte [][] targetCols, byte [] firstRow, long timestamp, RowFilterInterface filter) throws IOException { this.store = store; @@ -81,7 +80,6 @@ class HStoreScanner implements InternalScanner, ChangedReadersObserver { // See updateReaders below. this.timestamp = timestamp; this.targetCols = targetCols; - try { scanners[MEMS_INDEX] = store.memcache.getScanner(timestamp, targetCols, firstRow); @@ -124,7 +122,8 @@ class HStoreScanner implements InternalScanner, ChangedReadersObserver { private void setupScanner(final int i) throws IOException { this.keys[i] = new HStoreKey(); this.resultSets[i] = new TreeMap(Bytes.BYTES_COMPARATOR); - if (this.scanners[i] != null && !this.scanners[i].next(this.keys[i], this.resultSets[i])) { + if (this.scanners[i] != null && !this.scanners[i].next(this.keys[i], + this.resultSets[i])) { closeScanner(i); } } @@ -154,10 +153,8 @@ class HStoreScanner implements InternalScanner, ChangedReadersObserver { for (int i = 0; i < this.keys.length; i++) { if (scanners[i] != null && (chosenRow == null || - (HStoreKey.compareTwoRowKeys(store.getHRegionInfo(), - keys[i].getRow(), chosenRow) < 0) || - ((HStoreKey.compareTwoRowKeys(store.getHRegionInfo(), - keys[i].getRow(), chosenRow) == 0) && + (HStoreKey.compareTwoRowKeys(this.keys[i].getRow(), chosenRow) < 0) || + ((HStoreKey.compareTwoRowKeys(this.keys[i].getRow(), chosenRow) == 0) && (keys[i].getTimestamp() > chosenTimestamp)))) { chosenRow = keys[i].getRow(); chosenTimestamp = keys[i].getTimestamp(); @@ -187,8 +184,7 @@ class HStoreScanner implements InternalScanner, ChangedReadersObserver { while ((scanners[i] != null && !filtered && moreToFollow) - && (HStoreKey.compareTwoRowKeys(store.getHRegionInfo(), - keys[i].getRow(), chosenRow) == 0)) { + && (HStoreKey.compareTwoRowKeys(this.keys[i].getRow(), chosenRow) == 0)) { // If we are doing a wild card match or there are multiple // matchers per column, we need to scan all the older versions of // this row to pick up the rest of the family members @@ -204,7 +200,7 @@ class HStoreScanner implements InternalScanner, ChangedReadersObserver { // a result if the map does not contain the key. HStoreKey hsk = new HStoreKey(key.getRow(), HConstants.EMPTY_BYTE_ARRAY, - key.getTimestamp(), this.store.getHRegionInfo()); + key.getTimestamp()); for (Map.Entry e : resultSets[i].entrySet()) { hsk.setColumn(e.getKey()); if (HLogEdit.isDeleted(e.getValue().getValue())) { @@ -231,15 +227,13 @@ class HStoreScanner implements InternalScanner, ChangedReadersObserver { closeScanner(i); } } - } + } } - for (int i = 0; i < scanners.length; i++) { // If the current scanner is non-null AND has a lower-or-equal // row label, then its timestamp is bad. We need to advance it. 
while ((scanners[i] != null) && - (HStoreKey.compareTwoRowKeys(store.getHRegionInfo(), - keys[i].getRow(), chosenRow) <= 0)) { + (HStoreKey.compareTwoRowKeys(this.keys[i].getRow(), chosenRow) <= 0)) { resultSets[i].clear(); if (!scanners[i].next(keys[i], resultSets[i])) { closeScanner(i); @@ -248,7 +242,6 @@ class HStoreScanner implements InternalScanner, ChangedReadersObserver { } moreToFollow = chosenTimestamp >= 0; - if (dataFilter != null) { if (dataFilter.filterAllRemaining()) { moreToFollow = false; @@ -319,9 +312,8 @@ class HStoreScanner implements InternalScanner, ChangedReadersObserver { } this.lock.writeLock().lock(); try { - MapFile.Reader [] readers = this.store.getReaders(); - if (this.scanners[HSFS_INDEX] == null && readers != null && - readers.length > 0) { + Map map = this.store.getStorefiles(); + if (this.scanners[HSFS_INDEX] == null && map != null && map.size() > 0) { // Presume that we went from no readers to at least one -- need to put // a HStoreScanner in place. try { diff --git a/src/test/org/apache/hadoop/hbase/HFilePerformanceEvaluation.java b/src/test/org/apache/hadoop/hbase/HFilePerformanceEvaluation.java new file mode 100644 index 00000000000..a6d2ea08bfc --- /dev/null +++ b/src/test/org/apache/hadoop/hbase/HFilePerformanceEvaluation.java @@ -0,0 +1,365 @@ +/** + * Copyright 2007 The Apache Software Foundation + * + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.hadoop.hbase; + +import java.io.IOException; +import java.nio.ByteBuffer; +import java.util.Random; + +import org.apache.commons.logging.Log; +import org.apache.commons.logging.LogFactory; +import org.apache.commons.math.random.RandomData; +import org.apache.commons.math.random.RandomDataImpl; +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.fs.FileSystem; +import org.apache.hadoop.fs.Path; +import org.apache.hadoop.hbase.io.ImmutableBytesWritable; +import org.apache.hadoop.hbase.io.hfile.HFile; +import org.apache.hadoop.hbase.io.hfile.HFileScanner; +import org.apache.hadoop.hbase.util.Bytes; + +/** + *

+ * This class runs performance benchmarks for {@link HFile}. + *
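+ * <p>It writes {@code ROW_COUNT} (one million) rows of {@code ROW_LENGTH}
+ * (ten byte) values with {@code SequentialWriteBenchmark}, then runs
+ * uniform-random small scans, uniform-random reads, Gaussian-random reads and
+ * a sequential read over the same file, each wrapped in
+ * {@code PerformanceEvaluationCommons.concurrentReads(Runnable)}.
+ * The read benchmarks all follow the same basic HFile pattern; a rough sketch
+ * ({@code fs} and {@code path} are placeholders):
+ * <pre>
+ *   HFile.Reader reader = new HFile.Reader(fs, path, null);
+ *   reader.loadFileInfo();
+ *   HFileScanner scanner = reader.getScanner();
+ *   scanner.seekTo();                 // position at the first key
+ *   while (scanner.next()) {
+ *     // process scanner.getKey() and scanner.getValue()
+ *   }
+ *   reader.close();
+ * </pre>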

+ */ +public class HFilePerformanceEvaluation { + + private static final int ROW_LENGTH = 10; + private static final int ROW_COUNT = 1000000; + private static final int RFILE_BLOCKSIZE = 8 * 1024; + + static final Log LOG = + LogFactory.getLog(HFilePerformanceEvaluation.class.getName()); + + static byte [] format(final int i) { + String v = Integer.toString(i); + return Bytes.toBytes("0000000000".substring(v.length()) + v); + } + + static ImmutableBytesWritable format(final int i, ImmutableBytesWritable w) { + w.set(format(i)); + return w; + } + + private void runBenchmarks() throws Exception { + final Configuration conf = new Configuration(); + final FileSystem fs = FileSystem.get(conf); + final Path mf = fs.makeQualified(new Path("performanceevaluation.mapfile")); + if (fs.exists(mf)) { + fs.delete(mf, true); + } + + runBenchmark(new SequentialWriteBenchmark(conf, fs, mf, ROW_COUNT), + ROW_COUNT); + PerformanceEvaluationCommons.concurrentReads(new Runnable() { + public void run() { + try { + runBenchmark(new UniformRandomSmallScan(conf, fs, mf, ROW_COUNT), + ROW_COUNT); + } catch (Exception e) { + e.printStackTrace(); + } + } + }); + PerformanceEvaluationCommons.concurrentReads(new Runnable() { + public void run() { + try { + runBenchmark(new UniformRandomReadBenchmark(conf, fs, mf, ROW_COUNT), + ROW_COUNT); + } catch (Exception e) { + e.printStackTrace(); + } + } + }); + PerformanceEvaluationCommons.concurrentReads(new Runnable() { + public void run() { + try { + runBenchmark(new GaussianRandomReadBenchmark(conf, fs, mf, ROW_COUNT), + ROW_COUNT); + } catch (Exception e) { + e.printStackTrace(); + } + } + }); + PerformanceEvaluationCommons.concurrentReads(new Runnable() { + public void run() { + try { + runBenchmark(new SequentialReadBenchmark(conf, fs, mf, ROW_COUNT), + ROW_COUNT); + } catch (Exception e) { + e.printStackTrace(); + } + } + }); + + } + + private void runBenchmark(RowOrientedBenchmark benchmark, int rowCount) + throws Exception { + LOG.info("Running " + benchmark.getClass().getSimpleName() + " for " + + rowCount + " rows."); + long elapsedTime = benchmark.run(); + LOG.info("Running " + benchmark.getClass().getSimpleName() + " for " + + rowCount + " rows took " + elapsedTime + "ms."); + } + + static abstract class RowOrientedBenchmark { + + protected final Configuration conf; + protected final FileSystem fs; + protected final Path mf; + protected final int totalRows; + + public RowOrientedBenchmark(Configuration conf, FileSystem fs, Path mf, + int totalRows) { + this.conf = conf; + this.fs = fs; + this.mf = mf; + this.totalRows = totalRows; + } + + void setUp() throws Exception { + // do nothing + } + + abstract void doRow(int i) throws Exception; + + protected int getReportingPeriod() { + return this.totalRows / 10; + } + + void tearDown() throws Exception { + // do nothing + } + + /** + * Run benchmark + * @return elapsed time. 
+ * @throws Exception + */ + long run() throws Exception { + long elapsedTime; + setUp(); + long startTime = System.currentTimeMillis(); + try { + for (int i = 0; i < totalRows; i++) { + if (i > 0 && i % getReportingPeriod() == 0) { + LOG.info("Processed " + i + " rows."); + } + doRow(i); + } + elapsedTime = System.currentTimeMillis() - startTime; + } finally { + tearDown(); + } + return elapsedTime; + } + + } + + static class SequentialWriteBenchmark extends RowOrientedBenchmark { + protected HFile.Writer writer; + private Random random = new Random(); + private byte[] bytes = new byte[ROW_LENGTH]; + + public SequentialWriteBenchmark(Configuration conf, FileSystem fs, Path mf, + int totalRows) { + super(conf, fs, mf, totalRows); + } + + @Override + void setUp() throws Exception { + writer = new HFile.Writer(this.fs, this.mf, RFILE_BLOCKSIZE, null, null); + } + + @Override + void doRow(int i) throws Exception { + writer.append(format(i), generateValue()); + } + + private byte[] generateValue() { + random.nextBytes(bytes); + return bytes; + } + + @Override + protected int getReportingPeriod() { + return this.totalRows; // don't report progress + } + + @Override + void tearDown() throws Exception { + writer.close(); + } + + } + + static abstract class ReadBenchmark extends RowOrientedBenchmark { + ImmutableBytesWritable key = new ImmutableBytesWritable(); + ImmutableBytesWritable value = new ImmutableBytesWritable(); + + protected HFile.Reader reader; + + public ReadBenchmark(Configuration conf, FileSystem fs, Path mf, + int totalRows) { + super(conf, fs, mf, totalRows); + } + + @Override + void setUp() throws Exception { + reader = new HFile.Reader(this.fs, this.mf, null); + this.reader.loadFileInfo(); + } + + @Override + void tearDown() throws Exception { + reader.close(); + } + + } + + static class SequentialReadBenchmark extends ReadBenchmark { + private HFileScanner scanner; + + public SequentialReadBenchmark(Configuration conf, FileSystem fs, + Path mf, int totalRows) + throws IOException { + super(conf, fs, mf, totalRows); + } + + @Override + void setUp() throws Exception { + super.setUp(); + this.scanner = this.reader.getScanner(); + this.scanner.seekTo(); + } + + @Override + void doRow(@SuppressWarnings("unused") int i) throws Exception { + if (this.scanner.next()) { + ByteBuffer k = this.scanner.getKey(); + PerformanceEvaluationCommons.assertKey(format(i + 1), k); + ByteBuffer v = scanner.getValue(); + PerformanceEvaluationCommons.assertValueSize(v.limit(), ROW_LENGTH); + } + } + + @Override + protected int getReportingPeriod() { + return this.totalRows; // don't report progress + } + + } + + static class UniformRandomReadBenchmark extends ReadBenchmark { + + private Random random = new Random(); + + public UniformRandomReadBenchmark(Configuration conf, FileSystem fs, + Path mf, int totalRows) { + super(conf, fs, mf, totalRows); + } + + @Override + void doRow(@SuppressWarnings("unused") int i) throws Exception { + HFileScanner scanner = this.reader.getScanner(); + byte [] b = getRandomRow(); + scanner.seekTo(b); + ByteBuffer k = scanner.getKey(); + PerformanceEvaluationCommons.assertKey(b, k); + ByteBuffer v = scanner.getValue(); + PerformanceEvaluationCommons.assertValueSize(v.limit(), ROW_LENGTH); + } + + private byte [] getRandomRow() { + return format(random.nextInt(totalRows)); + } + } + + static class UniformRandomSmallScan extends ReadBenchmark { + private Random random = new Random(); + + public UniformRandomSmallScan(Configuration conf, FileSystem fs, + Path mf, int 
totalRows) { + super(conf, fs, mf, totalRows/10); + } + + @Override + void doRow(@SuppressWarnings("unused") int i) throws Exception { + HFileScanner scanner = this.reader.getScanner(); + byte [] b = getRandomRow(); + if (scanner.seekTo(b) != 0) { + System.out.println("Nonexistent row: " + new String(b)); + return; + } + ByteBuffer k = scanner.getKey(); + PerformanceEvaluationCommons.assertKey(b, k); + // System.out.println("Found row: " + new String(b)); + for (int ii = 0; ii < 30; ii++) { + if (!scanner.next()) { + System.out.println("NOTHING FOLLOWS"); + } + ByteBuffer v = scanner.getValue(); + PerformanceEvaluationCommons.assertValueSize(v.limit(), ROW_LENGTH); + } + } + + private byte [] getRandomRow() { + return format(random.nextInt(totalRows)); + } + } + + static class GaussianRandomReadBenchmark extends ReadBenchmark { + + private RandomData randomData = new RandomDataImpl(); + + public GaussianRandomReadBenchmark(Configuration conf, FileSystem fs, + Path mf, int totalRows) { + super(conf, fs, mf, totalRows); + } + + @Override + void doRow(@SuppressWarnings("unused") int i) throws Exception { + HFileScanner scanner = this.reader.getScanner(); + scanner.seekTo(getGaussianRandomRowBytes()); + for (int ii = 0; ii < 30; ii++) { + if (!scanner.next()) { + System.out.println("NOTHING FOLLOWS"); + } + scanner.getKey(); + scanner.getValue(); + } + } + + private byte [] getGaussianRandomRowBytes() { + int r = (int) randomData.nextGaussian(totalRows / 2, totalRows / 10); + return format(r); + } + } + + /** + * @param args + * @throws IOException + */ + public static void main(String[] args) throws Exception { + new HFilePerformanceEvaluation().runBenchmarks(); + } +} \ No newline at end of file diff --git a/src/test/org/apache/hadoop/hbase/PerformanceEvaluationCommons.java b/src/test/org/apache/hadoop/hbase/PerformanceEvaluationCommons.java new file mode 100644 index 00000000000..78d984cee54 --- /dev/null +++ b/src/test/org/apache/hadoop/hbase/PerformanceEvaluationCommons.java @@ -0,0 +1,78 @@ +/** + * Copyright 2009 The Apache Software Foundation + * + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.hadoop.hbase; + +import java.nio.ByteBuffer; +import java.util.ArrayList; +import java.util.List; + +import org.apache.commons.logging.Log; +import org.apache.commons.logging.LogFactory; + + +/** + * Code shared by PE tests. 
+ */ +public class PerformanceEvaluationCommons { + static final Log LOG = + LogFactory.getLog(PerformanceEvaluationCommons.class.getName()); + + public static void assertValueSize(final int expectedSize, final int got) { + if (got != expectedSize) { + throw new AssertionError("Expected " + expectedSize + " but got " + got); + } + } + + public static void assertKey(final byte [] expected, final ByteBuffer got) { + byte [] b = new byte[got.limit()]; + got.get(b, 0, got.limit()); + assertKey(expected, b); + } + + public static void assertKey(final byte [] expected, final byte [] got) { + if (!org.apache.hadoop.hbase.util.Bytes.equals(expected, got)) { + throw new AssertionError("Expected " + + org.apache.hadoop.hbase.util.Bytes.toString(expected) + + " but got " + org.apache.hadoop.hbase.util.Bytes.toString(got)); + } + } + + public static void concurrentReads(final Runnable r) { + final int count = 1; + long now = System.currentTimeMillis(); + List threads = new ArrayList(count); + for (int i = 0; i < count; i++) { + Thread t = new Thread(r); + t.setName("" + i); + threads.add(t); + } + for (Thread t: threads) { + t.start(); + } + for (Thread t: threads) { + try { + t.join(); + } catch (InterruptedException e) { + e.printStackTrace(); + } + } + LOG.info("Test took " + (System.currentTimeMillis() - now)); + } +} \ No newline at end of file diff --git a/src/test/org/apache/hadoop/hbase/TestHStoreKey.java b/src/test/org/apache/hadoop/hbase/TestHStoreKey.java new file mode 100644 index 00000000000..f03f01cf1b0 --- /dev/null +++ b/src/test/org/apache/hadoop/hbase/TestHStoreKey.java @@ -0,0 +1,128 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.hadoop.hbase; + +import java.io.IOException; +import java.nio.ByteBuffer; + +import junit.framework.TestCase; + +import org.apache.hadoop.hbase.HStoreKey.StoreKeyByteComparator; +import org.apache.hadoop.hbase.util.Bytes; +import org.apache.hadoop.hbase.util.Writables; + +/** + * Tests for the HStoreKey Plain and Meta RawComparators. + */ +public class TestHStoreKey extends TestCase { + protected void setUp() throws Exception { + super.setUp(); + } + + protected void tearDown() throws Exception { + super.tearDown(); + } + + public void testByteBuffer() throws Exception { + final long ts = 123; + final byte [] row = Bytes.toBytes("row"); + final byte [] column = Bytes.toBytes("column"); + HStoreKey hsk = new HStoreKey(row, column, ts); + ByteBuffer bb = ByteBuffer.wrap(hsk.getBytes()); + assertTrue(Bytes.equals(row, HStoreKey.getRow(bb))); + assertTrue(Bytes.equals(column, HStoreKey.getColumn(bb))); + assertEquals(ts, HStoreKey.getTimestamp(bb)); + } + + /** + * Test the byte comparator works same as the object comparator. 
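+ * <p>HStoreKeys sort newest-first: for the same row and column, the key with
+ * the larger timestamp compares as the lesser, which is why the assertions
+ * below expect {@code past.compareTo(now) > 0} and
+ * {@code future.compareTo(now) < 0}.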
+ */ + public void testRawComparator() throws IOException { + long timestamp = System.currentTimeMillis(); + byte [] a = Bytes.toBytes("a"); + HStoreKey past = new HStoreKey(a, a, timestamp - 10); + byte [] pastBytes = Writables.getBytes(past); + HStoreKey now = new HStoreKey(a, a, timestamp); + byte [] nowBytes = Writables.getBytes(now); + HStoreKey future = new HStoreKey(a, a, timestamp + 10); + byte [] futureBytes = Writables.getBytes(future); + StoreKeyByteComparator comparator = new HStoreKey.StoreKeyByteComparator(); + assertTrue(past.compareTo(now) > 0); + assertTrue(comparator.compare(pastBytes, nowBytes) > 0); + assertTrue(now.compareTo(now) == 0); + assertTrue(comparator.compare(nowBytes, nowBytes) == 0); + assertTrue(future.compareTo(now) < 0); + assertTrue(comparator.compare(futureBytes, nowBytes) < 0); + // Check that empty column comes before one with a column + HStoreKey nocolumn = new HStoreKey(a, timestamp); + byte [] nocolumnBytes = Writables.getBytes(nocolumn); + HStoreKey withcolumn = new HStoreKey(a, a, timestamp); + byte [] withcolumnBytes = Writables.getBytes(withcolumn); + assertTrue(nocolumn.compareTo(withcolumn) < 0); + assertTrue(comparator.compare(nocolumnBytes, withcolumnBytes) < 0); + // Check that empty column comes and LATEST comes before one with a column + // and old timestamp. + nocolumn = new HStoreKey(a, HConstants.LATEST_TIMESTAMP); + nocolumnBytes = Writables.getBytes(nocolumn); + withcolumn = new HStoreKey(a, a, timestamp); + withcolumnBytes = Writables.getBytes(withcolumn); + assertTrue(nocolumn.compareTo(withcolumn) < 0); + assertTrue(comparator.compare(nocolumnBytes, withcolumnBytes) < 0); + } + +// /** +// * Tests cases where rows keys have characters below the ','. +// * See HBASE-832 +// * @throws IOException +// */ +// public void testHStoreKeyBorderCases() throws IOException { +// HRegionInfo info = new HRegionInfo(new HTableDescriptor("testtable"), +// HConstants.EMPTY_BYTE_ARRAY, HConstants.EMPTY_BYTE_ARRAY); +// +// HStoreKey rowA = new HStoreKey("testtable,www.hbase.org/,1234", +// "", Long.MAX_VALUE, info); +// byte [] rowABytes = Writables.getBytes(rowA); +// HStoreKey rowB = new HStoreKey("testtable,www.hbase.org/%20,99999", +// "", Long.MAX_VALUE, info); +// byte [] rowBBytes = Writables.getBytes(rowB); +// assertTrue(rowA.compareTo(rowB) > 0); +// HStoreKey.Comparator comparator = new HStoreKey.PlainStoreKeyComparator(); +// assertTrue(comparator.compare(rowABytes, rowBBytes) > 0); +// +// rowA = new HStoreKey("testtable,www.hbase.org/,1234", +// "", Long.MAX_VALUE, HRegionInfo.FIRST_META_REGIONINFO); +// rowB = new HStoreKey("testtable,www.hbase.org/%20,99999", +// "", Long.MAX_VALUE, HRegionInfo.FIRST_META_REGIONINFO); +// assertTrue(rowA.compareTo(rowB) < 0); +// assertTrue(comparator.compare(rowABytes, rowBBytes) < 0); +// +// rowA = new HStoreKey("testtable,,1234", +// "", Long.MAX_VALUE, HRegionInfo.FIRST_META_REGIONINFO); +// rowB = new HStoreKey("testtable,$www.hbase.org/,99999", +// "", Long.MAX_VALUE, HRegionInfo.FIRST_META_REGIONINFO); +// assertTrue(rowA.compareTo(rowB) < 0); +// assertTrue(comparator.compare(rowABytes, rowBBytes) < 0); +// +// rowA = new HStoreKey(".META.,testtable,www.hbase.org/,1234,4321", +// "", Long.MAX_VALUE, HRegionInfo.ROOT_REGIONINFO); +// rowB = new HStoreKey(".META.,testtable,www.hbase.org/%20,99999,99999", +// "", Long.MAX_VALUE, HRegionInfo.ROOT_REGIONINFO); +// assertTrue(rowA.compareTo(rowB) > 0); +// assertTrue(comparator.compare(rowABytes, rowBBytes) > 0); +// } +} \ No newline at end of 
file diff --git a/src/test/org/apache/hadoop/hbase/io/hfile/KVGenerator.java b/src/test/org/apache/hadoop/hbase/io/hfile/KVGenerator.java new file mode 100644 index 00000000000..ca8b80ad0e1 --- /dev/null +++ b/src/test/org/apache/hadoop/hbase/io/hfile/KVGenerator.java @@ -0,0 +1,111 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with this + * work for additional information regarding copyright ownership. The ASF + * licenses this file to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the + * License for the specific language governing permissions and limitations under + * the License. + */ +package org.apache.hadoop.hbase.io.hfile; + +import java.util.Random; + +import org.apache.hadoop.io.BytesWritable; +import org.apache.hadoop.io.WritableComparator; + +/** + * Generate random pairs. + *

+ * Copied from + * hadoop-3315 tfile. + * Remove after tfile is committed and use the tfile version of this class + * instead.
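+ * <p>Keys and values are assembled from a fixed dictionary of random byte
+ * words. Every key carries a four byte prefix; when sorted output is
+ * requested and a new key would otherwise sort before the previous one, the
+ * prefix is incremented so the overall key order stays ascending.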

+ */ +class KVGenerator { + private final Random random; + private final byte[][] dict; + private final boolean sorted; + private final RandomDistribution.DiscreteRNG keyLenRNG, valLenRNG; + private BytesWritable lastKey; + private static final int MIN_KEY_LEN = 4; + private final byte prefix[] = new byte[MIN_KEY_LEN]; + + public KVGenerator(Random random, boolean sorted, + RandomDistribution.DiscreteRNG keyLenRNG, + RandomDistribution.DiscreteRNG valLenRNG, + RandomDistribution.DiscreteRNG wordLenRNG, int dictSize) { + this.random = random; + dict = new byte[dictSize][]; + this.sorted = sorted; + this.keyLenRNG = keyLenRNG; + this.valLenRNG = valLenRNG; + for (int i = 0; i < dictSize; ++i) { + int wordLen = wordLenRNG.nextInt(); + dict[i] = new byte[wordLen]; + random.nextBytes(dict[i]); + } + lastKey = new BytesWritable(); + fillKey(lastKey); + } + + private void fillKey(BytesWritable o) { + int len = keyLenRNG.nextInt(); + if (len < MIN_KEY_LEN) len = MIN_KEY_LEN; + o.setSize(len); + int n = MIN_KEY_LEN; + while (n < len) { + byte[] word = dict[random.nextInt(dict.length)]; + int l = Math.min(word.length, len - n); + System.arraycopy(word, 0, o.get(), n, l); + n += l; + } + if (sorted + && WritableComparator.compareBytes(lastKey.get(), MIN_KEY_LEN, lastKey + .getSize() + - MIN_KEY_LEN, o.get(), MIN_KEY_LEN, o.getSize() - MIN_KEY_LEN) > 0) { + incrementPrefix(); + } + + System.arraycopy(prefix, 0, o.get(), 0, MIN_KEY_LEN); + lastKey.set(o); + } + + private void fillValue(BytesWritable o) { + int len = valLenRNG.nextInt(); + o.setSize(len); + int n = 0; + while (n < len) { + byte[] word = dict[random.nextInt(dict.length)]; + int l = Math.min(word.length, len - n); + System.arraycopy(word, 0, o.get(), n, l); + n += l; + } + } + + private void incrementPrefix() { + for (int i = MIN_KEY_LEN - 1; i >= 0; --i) { + ++prefix[i]; + if (prefix[i] != 0) return; + } + + throw new RuntimeException("Prefix overflown"); + } + + public void next(BytesWritable key, BytesWritable value, boolean dupKey) { + if (dupKey) { + key.set(lastKey); + } + else { + fillKey(key); + } + fillValue(value); + } +} diff --git a/src/test/org/apache/hadoop/hbase/io/hfile/KeySampler.java b/src/test/org/apache/hadoop/hbase/io/hfile/KeySampler.java new file mode 100644 index 00000000000..b90a3ea7529 --- /dev/null +++ b/src/test/org/apache/hadoop/hbase/io/hfile/KeySampler.java @@ -0,0 +1,64 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with this + * work for additional information regarding copyright ownership. The ASF + * licenses this file to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the + * License for the specific language governing permissions and limitations under + * the License. + */ +package org.apache.hadoop.hbase.io.hfile; + +import java.io.IOException; +import java.nio.ByteBuffer; +import java.util.Random; + +import org.apache.hadoop.io.BytesWritable; +import org.apache.hadoop.hbase.io.hfile.RandomDistribution.DiscreteRNG; + +/* +*

+* Copied from +* hadoop-3315 tfile. +* Remove after tfile is committed and use the tfile version of this class +* instead.
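+* Samples keys between a first and last key: both bounding keys are reduced to
+* their four byte integer prefixes, a uniform value is drawn in that range,
+* and it is written big-endian into the first four bytes of an otherwise
+* random key whose length comes from the supplied DiscreteRNG.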

+*/ +class KeySampler { + Random random; + int min, max; + DiscreteRNG keyLenRNG; + private static final int MIN_KEY_LEN = 4; + + public KeySampler(Random random, byte [] first, byte [] last, + DiscreteRNG keyLenRNG) throws IOException { + this.random = random; + min = keyPrefixToInt(first); + max = keyPrefixToInt(last); + this.keyLenRNG = keyLenRNG; + } + + private int keyPrefixToInt(byte [] key) throws IOException { + byte[] b = key; + int o = 0; + return (b[o] & 0xff) << 24 | (b[o + 1] & 0xff) << 16 + | (b[o + 2] & 0xff) << 8 | (b[o + 3] & 0xff); + } + + public void next(BytesWritable key) { + key.setSize(Math.max(MIN_KEY_LEN, keyLenRNG.nextInt())); + random.nextBytes(key.get()); + int n = random.nextInt(max - min) + min; + byte[] b = key.get(); + b[0] = (byte) (n >> 24); + b[1] = (byte) (n >> 16); + b[2] = (byte) (n >> 8); + b[3] = (byte) n; + } +} diff --git a/src/test/org/apache/hadoop/hbase/io/hfile/NanoTimer.java b/src/test/org/apache/hadoop/hbase/io/hfile/NanoTimer.java new file mode 100644 index 00000000000..1312da047ab --- /dev/null +++ b/src/test/org/apache/hadoop/hbase/io/hfile/NanoTimer.java @@ -0,0 +1,198 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with this + * work for additional information regarding copyright ownership. The ASF + * licenses this file to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the + * License for the specific language governing permissions and limitations under + * the License. + */ +package org.apache.hadoop.hbase.io.hfile; + +/** + * A nano-second timer. + *

+ * Copied from + * hadoop-3315 tfile. + * Remove after tfile is committed and use the tfile version of this class + * instead.
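+ * <p>A minimal usage sketch:
+ * <pre>
+ *   NanoTimer timer = new NanoTimer(false);
+ *   timer.start();
+ *   // ... timed work ...
+ *   timer.stop();
+ *   System.out.println(timer);   // e.g. "12.34ms"
+ * </pre>
+ * Elapsed time accumulates across start/stop pairs until {@link #reset()} is
+ * called.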

+ */ +public class NanoTimer { + private long last = -1; + private boolean started = false; + private long cumulate = 0; + + /** + * Constructor + * + * @param start + * Start the timer upon construction. + */ + public NanoTimer(boolean start) { + if (start) this.start(); + } + + /** + * Start the timer. + * + * Note: No effect if timer is already started. + */ + public void start() { + if (!this.started) { + this.last = System.nanoTime(); + this.started = true; + } + } + + /** + * Stop the timer. + * + * Note: No effect if timer is already stopped. + */ + public void stop() { + if (this.started) { + this.started = false; + this.cumulate += System.nanoTime() - this.last; + } + } + + /** + * Read the timer. + * + * @return the elapsed time in nano-seconds. Note: If the timer is never + * started before, -1 is returned. + */ + public long read() { + if (!readable()) return -1; + + return this.cumulate; + } + + /** + * Reset the timer. + */ + public void reset() { + this.last = -1; + this.started = false; + this.cumulate = 0; + } + + /** + * Checking whether the timer is started + * + * @return true if timer is started. + */ + public boolean isStarted() { + return this.started; + } + + /** + * Format the elapsed time to a human understandable string. + * + * Note: If timer is never started, "ERR" will be returned. + */ + public String toString() { + if (!readable()) { + return "ERR"; + } + + return NanoTimer.nanoTimeToString(this.cumulate); + } + + /** + * A utility method to format a time duration in nano seconds into a human + * understandable stirng. + * + * @param t + * Time duration in nano seconds. + * @return String representation. + */ + public static String nanoTimeToString(long t) { + if (t < 0) return "ERR"; + + if (t == 0) return "0"; + + if (t < 1000) { + return t + "ns"; + } + + double us = (double) t / 1000; + if (us < 1000) { + return String.format("%.2fus", us); + } + + double ms = us / 1000; + if (ms < 1000) { + return String.format("%.2fms", ms); + } + + double ss = ms / 1000; + if (ss < 1000) { + return String.format("%.2fs", ss); + } + + long mm = (long) ss / 60; + ss -= mm * 60; + long hh = mm / 60; + mm -= hh * 60; + long dd = hh / 24; + hh -= dd * 24; + + if (dd > 0) { + return String.format("%dd%dh", dd, hh); + } + + if (hh > 0) { + return String.format("%dh%dm", hh, mm); + } + + if (mm > 0) { + return String.format("%dm%.1fs", mm, ss); + } + + return String.format("%.2fs", ss); + + /** + * StringBuilder sb = new StringBuilder(); String sep = ""; + * + * if (dd > 0) { String unit = (dd > 1) ? "days" : "day"; + * sb.append(String.format("%s%d%s", sep, dd, unit)); sep = " "; } + * + * if (hh > 0) { String unit = (hh > 1) ? "hrs" : "hr"; + * sb.append(String.format("%s%d%s", sep, hh, unit)); sep = " "; } + * + * if (mm > 0) { String unit = (mm > 1) ? "mins" : "min"; + * sb.append(String.format("%s%d%s", sep, mm, unit)); sep = " "; } + * + * if (ss > 0) { String unit = (ss > 1) ? "secs" : "sec"; + * sb.append(String.format("%s%.3f%s", sep, ss, unit)); sep = " "; } + * + * return sb.toString(); + */ + } + + private boolean readable() { + return this.last != -1; + } + + /** + * Simple tester. 
+ * + * @param args + */ + public static void main(String[] args) { + long i = 7; + + for (int x = 0; x < 20; ++x, i *= 7) { + System.out.println(NanoTimer.nanoTimeToString(i)); + } + } +} + diff --git a/src/test/org/apache/hadoop/hbase/io/hfile/RandomDistribution.java b/src/test/org/apache/hadoop/hbase/io/hfile/RandomDistribution.java new file mode 100644 index 00000000000..32196648901 --- /dev/null +++ b/src/test/org/apache/hadoop/hbase/io/hfile/RandomDistribution.java @@ -0,0 +1,271 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with this + * work for additional information regarding copyright ownership. The ASF + * licenses this file to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the + * License for the specific language governing permissions and limitations under + * the License. + */ +package org.apache.hadoop.hbase.io.hfile; + +import java.util.ArrayList; +import java.util.Arrays; +import java.util.Collections; +import java.util.Random; + +/** + * A class that generates random numbers that follow some distribution. + *

+ * Copied from + * hadoop-3315 tfile. + * Remove after tfile is committed and use the tfile version of this class + * instead.
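+ * <p>A minimal usage sketch:
+ * <pre>
+ *   Random rng = new Random();
+ *   RandomDistribution.DiscreteRNG flat = new RandomDistribution.Flat(rng, 0, 100);
+ *   RandomDistribution.DiscreteRNG zipf = new RandomDistribution.Zipf(rng, 0, 100, 1.2);
+ *   int uniformSample = flat.nextInt();   // drawn from [0, 100)
+ *   int skewedSample = zipf.nextInt();    // small values favored
+ * </pre>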

+ */ +public class RandomDistribution { + /** + * Interface for discrete (integer) random distributions. + */ + public static interface DiscreteRNG { + /** + * Get the next random number + * + * @return the next random number. + */ + public int nextInt(); + } + + /** + * P(i)=1/(max-min) + */ + public static final class Flat implements DiscreteRNG { + private final Random random; + private final int min; + private final int max; + + /** + * Generate random integers from min (inclusive) to max (exclusive) + * following even distribution. + * + * @param random + * The basic random number generator. + * @param min + * Minimum integer + * @param max + * maximum integer (exclusive). + * + */ + public Flat(Random random, int min, int max) { + if (min >= max) { + throw new IllegalArgumentException("Invalid range"); + } + this.random = random; + this.min = min; + this.max = max; + } + + /** + * @see DiscreteRNG#nextInt() + */ + @Override + public int nextInt() { + return random.nextInt(max - min) + min; + } + } + + /** + * Zipf distribution. The ratio of the probabilities of integer i and j is + * defined as follows: + * + * P(i)/P(j)=((j-min+1)/(i-min+1))^sigma. + */ + public static final class Zipf implements DiscreteRNG { + private static final double DEFAULT_EPSILON = 0.001; + private final Random random; + private final ArrayList k; + private final ArrayList v; + + /** + * Constructor + * + * @param r + * The random number generator. + * @param min + * minimum integer (inclusvie) + * @param max + * maximum integer (exclusive) + * @param sigma + * parameter sigma. (sigma > 1.0) + */ + public Zipf(Random r, int min, int max, double sigma) { + this(r, min, max, sigma, DEFAULT_EPSILON); + } + + /** + * Constructor. + * + * @param r + * The random number generator. + * @param min + * minimum integer (inclusvie) + * @param max + * maximum integer (exclusive) + * @param sigma + * parameter sigma. (sigma > 1.0) + * @param epsilon + * Allowable error percentage (0 < epsilon < 1.0). + */ + public Zipf(Random r, int min, int max, double sigma, double epsilon) { + if ((max <= min) || (sigma <= 1) || (epsilon <= 0) + || (epsilon >= 0.5)) { + throw new IllegalArgumentException("Invalid arguments"); + } + random = r; + k = new ArrayList(); + v = new ArrayList(); + + double sum = 0; + int last = -1; + for (int i = min; i < max; ++i) { + sum += Math.exp(-sigma * Math.log(i - min + 1)); + if ((last == -1) || i * (1 - epsilon) > last) { + k.add(i); + v.add(sum); + last = i; + } + } + + if (last != max - 1) { + k.add(max - 1); + v.add(sum); + } + + v.set(v.size() - 1, 1.0); + + for (int i = v.size() - 2; i >= 0; --i) { + v.set(i, v.get(i) / sum); + } + } + + /** + * @see DiscreteRNG#nextInt() + */ + @Override + public int nextInt() { + double d = random.nextDouble(); + int idx = Collections.binarySearch(v, d); + + if (idx > 0) { + ++idx; + } + else { + idx = -(idx + 1); + } + + if (idx >= v.size()) { + idx = v.size() - 1; + } + + if (idx == 0) { + return k.get(0); + } + + int ceiling = k.get(idx); + int lower = k.get(idx - 1); + + return ceiling - random.nextInt(ceiling - lower); + } + } + + /** + * Binomial distribution. 
+ * + * P(k)=select(n, k)*p^k*(1-p)^(n-k) (k = 0, 1, ..., n) + * + * P(k)=select(max-min-1, k-min)*p^(k-min)*(1-p)^(k-min)*(1-p)^(max-k-1) + */ + public static final class Binomial implements DiscreteRNG { + private final Random random; + private final int min; + private final int n; + private final double[] v; + + private static double select(int n, int k) { + double ret = 1.0; + for (int i = k + 1; i <= n; ++i) { + ret *= (double) i / (i - k); + } + return ret; + } + + private static double power(double p, int k) { + return Math.exp(k * Math.log(p)); + } + + /** + * Generate random integers from min (inclusive) to max (exclusive) + * following Binomial distribution. + * + * @param random + * The basic random number generator. + * @param min + * Minimum integer + * @param max + * maximum integer (exclusive). + * @param p + * parameter. + * + */ + public Binomial(Random random, int min, int max, double p) { + if (min >= max) { + throw new IllegalArgumentException("Invalid range"); + } + this.random = random; + this.min = min; + this.n = max - min - 1; + if (n > 0) { + v = new double[n + 1]; + double sum = 0.0; + for (int i = 0; i <= n; ++i) { + sum += select(n, i) * power(p, i) * power(1 - p, n - i); + v[i] = sum; + } + for (int i = 0; i <= n; ++i) { + v[i] /= sum; + } + } + else { + v = null; + } + } + + /** + * @see DiscreteRNG#nextInt() + */ + @Override + public int nextInt() { + if (v == null) { + return min; + } + double d = random.nextDouble(); + int idx = Arrays.binarySearch(v, d); + if (idx > 0) { + ++idx; + } else { + idx = -(idx + 1); + } + + if (idx >= v.length) { + idx = v.length - 1; + } + return idx + min; + } + } +} diff --git a/src/test/org/apache/hadoop/hbase/io/hfile/RandomSeek.java b/src/test/org/apache/hadoop/hbase/io/hfile/RandomSeek.java new file mode 100644 index 00000000000..2845c073937 --- /dev/null +++ b/src/test/org/apache/hadoop/hbase/io/hfile/RandomSeek.java @@ -0,0 +1,124 @@ +/** + * Copyright 2009 The Apache Software Foundation + * + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.hadoop.hbase.io.hfile; + +import java.io.BufferedReader; +import java.io.FileReader; +import java.io.IOException; +import java.nio.ByteBuffer; +import java.util.ArrayList; +import java.util.List; +import java.util.Random; + +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.fs.LocalFileSystem; +import org.apache.hadoop.fs.Path; +import org.apache.hadoop.fs.RawLocalFileSystem; +import org.apache.hadoop.hbase.io.hfile.HFile.Reader; +import org.apache.hadoop.hbase.util.Bytes; + +/** + * Random seek test. 
+ */ +public class RandomSeek { + private static List slurp(String fname) throws IOException { + BufferedReader istream = new BufferedReader(new FileReader(fname)); + String str; + List l = new ArrayList(); + while ( (str=istream.readLine()) != null) { + String [] parts = str.split(","); + l.add(parts[0] + ":" + parts[1] + ":" + parts[2]); + } + return l; + } + private static String randKey(List keys) { + Random r = new Random(); + //return keys.get(r.nextInt(keys.size())); + return "2" + Integer.toString(7+r.nextInt(2)) + Integer.toString(r.nextInt(100)); + //return new String(r.nextInt(100)); + } + + public static void main(String [] argv) throws IOException { + Configuration conf = new Configuration(); + conf.setInt("io.file.buffer.size", 64*1024); + RawLocalFileSystem rlfs = new RawLocalFileSystem(); + rlfs.setConf(conf); + LocalFileSystem lfs = new LocalFileSystem(rlfs); + + Path path = new Path("/Users/ryan/rfile.big.txt"); + long start = System.currentTimeMillis(); + SimpleBlockCache cache = new SimpleBlockCache(); + //LruBlockCache cache = new LruBlockCache(); + Reader reader = new HFile.Reader(lfs, path, cache); + reader.loadFileInfo(); + System.out.println(reader.trailer); + long end = System.currentTimeMillis(); + + System.out.println("Index read time: " + (end - start)); + + List keys = slurp("/Users/ryan/xaa.50k"); + + HFileScanner scanner = reader.getScanner(); + int count; + long totalBytes = 0; + int notFound = 0; + + start = System.nanoTime(); + for(count = 0; count < 500000; ++count) { + String key = randKey(keys); + byte [] bkey = Bytes.toBytes(key); + int res = scanner.seekTo(bkey); + if (res == 0) { + ByteBuffer k = scanner.getKey(); + ByteBuffer v = scanner.getValue(); + totalBytes += k.limit(); + totalBytes += v.limit(); + } else { + ++ notFound; + } + if (res == -1) { + scanner.seekTo(); + } + // Scan for another 1000 rows. + for (int i = 0; i < 1000; ++i) { + if (!scanner.next()) + break; + ByteBuffer k = scanner.getKey(); + ByteBuffer v = scanner.getValue(); + totalBytes += k.limit(); + totalBytes += v.limit(); + } + + if ( count % 1000 == 0 ) { + end = System.nanoTime(); + + System.out.println("Cache block count: " + cache.size() + " dumped: "+ cache.dumps); + //System.out.println("Cache size: " + cache.heapSize()); + double msTime = ((end - start) / 1000000.0); + System.out.println("Seeked: "+ count + " in " + msTime + " (ms) " + + (1000.0 / msTime ) + " seeks/ms " + + (msTime / 1000.0) + " ms/seek"); + + start = System.nanoTime(); + } + } + System.out.println("Total bytes: " + totalBytes + " not found: " + notFound); + } +} diff --git a/src/test/org/apache/hadoop/hbase/io/hfile/TestHFile.java b/src/test/org/apache/hadoop/hbase/io/hfile/TestHFile.java new file mode 100644 index 00000000000..0ab42b80931 --- /dev/null +++ b/src/test/org/apache/hadoop/hbase/io/hfile/TestHFile.java @@ -0,0 +1,248 @@ +/** + * Copyright 2009 The Apache Software Foundation + * + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.hadoop.hbase.io.hfile; + +import java.io.IOException; +import java.nio.ByteBuffer; +import java.util.Arrays; + +import junit.framework.TestCase; + +import org.apache.commons.logging.Log; +import org.apache.commons.logging.LogFactory; +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.fs.FSDataInputStream; +import org.apache.hadoop.fs.FSDataOutputStream; +import org.apache.hadoop.fs.FileSystem; +import org.apache.hadoop.fs.LocalFileSystem; +import org.apache.hadoop.fs.Path; +import org.apache.hadoop.fs.RawLocalFileSystem; +import org.apache.hadoop.hbase.HBaseConfiguration; +import org.apache.hadoop.hbase.io.hfile.HFile.Reader; +import org.apache.hadoop.hbase.io.hfile.HFile.Writer; +import org.apache.hadoop.hbase.util.Bytes; +import org.apache.hadoop.io.RawComparator; + +/** + * test hfile features. + *

+ * Copied from + * hadoop-3315 tfile. + * Remove after tfile is committed and use the tfile version of this class + * instead.
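+ * <p>Covers a write/read round trip with the "none" and "gz" codecs, meta
+ * block storage and retrieval, stability of the compression algorithm
+ * ordinals, and writing through a custom {@link RawComparator}.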

+ */ +public class TestHFile extends TestCase { + static final Log LOG = LogFactory.getLog(TestHFile.class); + + private static String ROOT_DIR = + System.getProperty("test.build.data", "/tmp/TestHFile"); + private FileSystem fs; + private Configuration conf; + private final int minBlockSize = 512; + private static String localFormatter = "%010d"; + + @Override + public void setUp() { + conf = new HBaseConfiguration(); + RawLocalFileSystem rawLFS = new RawLocalFileSystem(); + rawLFS.setConf(conf); + fs = new LocalFileSystem(rawLFS); + } + + // write some records into the tfile + // write them twice + private int writeSomeRecords(Writer writer, int start, int n) + throws IOException { + String value = "value"; + for (int i = start; i < (start + n); i++) { + String key = String.format(localFormatter, Integer.valueOf(i)); + writer.append(Bytes.toBytes(key), Bytes.toBytes(value + key)); + } + return (start + n); + } + + private void readAllRecords(HFileScanner scanner) throws IOException { + readAndCheckbytes(scanner, 0, 100); + } + + // read the records and check + private int readAndCheckbytes(HFileScanner scanner, int start, int n) + throws IOException { + String value = "value"; + int i = start; + for (; i < (start + n); i++) { + ByteBuffer key = scanner.getKey(); + ByteBuffer val = scanner.getValue(); + String keyStr = String.format(localFormatter, Integer.valueOf(i)); + String valStr = value + keyStr; + byte [] keyBytes = Bytes.toBytes(key); + assertTrue("bytes for keys do not match " + keyStr + " " + + Bytes.toString(Bytes.toBytes(key)), + Arrays.equals(Bytes.toBytes(keyStr), keyBytes)); + byte [] valBytes = Bytes.toBytes(val); + assertTrue("bytes for vals do not match " + valStr + " " + + Bytes.toString(valBytes), + Arrays.equals(Bytes.toBytes(valStr), valBytes)); + if (!scanner.next()) { + break; + } + } + assertEquals(i, start + n - 1); + return (start + n); + } + + private byte[] getSomeKey(int rowId) { + return String.format(localFormatter, Integer.valueOf(rowId)).getBytes(); + } + + private void writeRecords(Writer writer) throws IOException { + writeSomeRecords(writer, 0, 100); + writer.close(); + } + + private FSDataOutputStream createFSOutput(Path name) throws IOException { + if (fs.exists(name)) fs.delete(name, true); + FSDataOutputStream fout = fs.create(name); + return fout; + } + + /** + * test none codecs + */ + void basicWithSomeCodec(String codec) throws IOException { + Path ncTFile = new Path(ROOT_DIR, "basic.hfile"); + FSDataOutputStream fout = createFSOutput(ncTFile); + Writer writer = new Writer(fout, minBlockSize, codec, null); + LOG.info(writer); + writeRecords(writer); + fout.close(); + FSDataInputStream fin = fs.open(ncTFile); + Reader reader = new Reader(fs.open(ncTFile), + fs.getFileStatus(ncTFile).getLen(), null); + // Load up the index. + reader.loadFileInfo(); + LOG.info(reader); + HFileScanner scanner = reader.getScanner(); + // Align scanner at start of the file. 
+ scanner.seekTo(); + readAllRecords(scanner); + scanner.seekTo(getSomeKey(50)); + assertTrue("location lookup failed", scanner.seekTo(getSomeKey(50)) == 0); + // read the key and see if it matches + ByteBuffer readKey = scanner.getKey(); + assertTrue("seeked key does not match", Arrays.equals(getSomeKey(50), + Bytes.toBytes(readKey))); + + scanner.seekTo(new byte[0]); + ByteBuffer val1 = scanner.getValue(); + scanner.seekTo(new byte[0]); + ByteBuffer val2 = scanner.getValue(); + assertTrue(Arrays.equals(Bytes.toBytes(val1), Bytes.toBytes(val2))); + + reader.close(); + fin.close(); + fs.delete(ncTFile, true); + } + + public void testTFileFeatures() throws IOException { + basicWithSomeCodec("none"); + basicWithSomeCodec("gz"); + } + + private void writeNumMetablocks(Writer writer, int n) { + for (int i = 0; i < n; i++) { + writer.appendMetaBlock("TfileMeta" + i, ("something to test" + i).getBytes()); + } + } + + private void someTestingWithMetaBlock(Writer writer) { + writeNumMetablocks(writer, 10); + } + + private void readNumMetablocks(Reader reader, int n) throws IOException { + for (int i = 0; i < n; i++) { + ByteBuffer b = reader.getMetaBlock("TfileMeta" + i); + byte [] found = Bytes.toBytes(b); + assertTrue("failed to match metadata", Arrays.equals( + ("something to test" + i).getBytes(), found)); + } + } + + private void someReadingWithMetaBlock(Reader reader) throws IOException { + readNumMetablocks(reader, 10); + } + + private void metablocks(final String compress) throws Exception { + Path mFile = new Path(ROOT_DIR, "meta.tfile"); + FSDataOutputStream fout = createFSOutput(mFile); + Writer writer = new Writer(fout, minBlockSize, compress, null); + someTestingWithMetaBlock(writer); + writer.close(); + fout.close(); + FSDataInputStream fin = fs.open(mFile); + Reader reader = new Reader(fs.open(mFile), this.fs.getFileStatus(mFile) + .getLen(), null); + reader.loadFileInfo(); + // No data -- this should return false. + assertFalse(reader.getScanner().seekTo()); + someReadingWithMetaBlock(reader); + fs.delete(mFile, true); + reader.close(); + fin.close(); + } + + // test meta blocks for tfiles + public void testMetaBlocks() throws Exception { + metablocks("none"); + metablocks("gz"); + } + + /** + * Make sure the orginals for our compression libs doesn't change on us. 
+ */ + public void testCompressionOrdinance() { + assertTrue(Compression.Algorithm.LZO.ordinal() == 0); + assertTrue(Compression.Algorithm.GZ.ordinal() == 1); + assertTrue(Compression.Algorithm.NONE.ordinal() == 2); + } + + + public void testComparator() throws IOException { + Path mFile = new Path(ROOT_DIR, "meta.tfile"); + FSDataOutputStream fout = createFSOutput(mFile); + Writer writer = new Writer(fout, minBlockSize, "none", + new RawComparator() { + @Override + public int compare(byte[] b1, int s1, int l1, byte[] b2, int s2, + int l2) { + return -Bytes.compareTo(b1, s1, l1, b2, s2, l2); + + } + @Override + public int compare(byte[] o1, byte[] o2) { + return compare(o1, 0, o1.length, o2, 0, o2.length); + } + }); + writer.append("3".getBytes(), "0".getBytes()); + writer.append("2".getBytes(), "0".getBytes()); + writer.append("1".getBytes(), "0".getBytes()); + writer.close(); + } +} \ No newline at end of file diff --git a/src/test/org/apache/hadoop/hbase/io/hfile/TestHFilePerformance.java b/src/test/org/apache/hadoop/hbase/io/hfile/TestHFilePerformance.java new file mode 100644 index 00000000000..b59e30f5ea0 --- /dev/null +++ b/src/test/org/apache/hadoop/hbase/io/hfile/TestHFilePerformance.java @@ -0,0 +1,387 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.hadoop.hbase.io.hfile; + +import java.io.IOException; +import java.nio.ByteBuffer; +import java.text.DateFormat; +import java.text.SimpleDateFormat; +import java.util.Random; + +import junit.framework.TestCase; + +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.fs.FSDataInputStream; +import org.apache.hadoop.fs.FSDataOutputStream; +import org.apache.hadoop.fs.FileSystem; +import org.apache.hadoop.fs.Path; +import org.apache.hadoop.io.BytesWritable; +import org.apache.hadoop.io.SequenceFile; +import org.apache.hadoop.io.compress.CompressionCodec; +import org.apache.hadoop.io.compress.GzipCodec; +import org.apache.hadoop.io.compress.LzoCodec; + +/** + * Set of long-running tests to measure performance of HFile. + *

+ * Copied from + * hadoop-3315 tfile. + * Remove after tfile is committed and use the tfile version of this class + * instead.

+ */ +public class TestHFilePerformance extends TestCase { + private static String ROOT_DIR = + System.getProperty("test.build.data", "/tmp/TestHFilePerformance"); + private FileSystem fs; + private Configuration conf; + private long startTimeEpoch; + private long finishTimeEpoch; + private DateFormat formatter; + + @Override + public void setUp() throws IOException { + conf = new Configuration(); + fs = FileSystem.get(conf); + formatter = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss"); + } + + public void startTime() { + startTimeEpoch = System.currentTimeMillis(); + System.out.println(formatTime() + " Started timing."); + } + + public void stopTime() { + finishTimeEpoch = System.currentTimeMillis(); + System.out.println(formatTime() + " Stopped timing."); + } + + public long getIntervalMillis() { + return finishTimeEpoch - startTimeEpoch; + } + + public void printlnWithTimestamp(String message) { + System.out.println(formatTime() + " " + message); + } + + /* + * Format millis into minutes and seconds. + */ + public String formatTime(long milis){ + return formatter.format(milis); + } + + public String formatTime(){ + return formatTime(System.currentTimeMillis()); + } + + private FSDataOutputStream createFSOutput(Path name) throws IOException { + if (fs.exists(name)) + fs.delete(name, true); + FSDataOutputStream fout = fs.create(name); + return fout; + } + + //TODO have multiple ways of generating key/value e.g. dictionary words + //TODO to have a sample compressable data, for now, made 1 out of 3 values random + // keys are all random. + + private static class KeyValueGenerator { + Random keyRandomizer; + Random valueRandomizer; + long randomValueRatio = 3; // 1 out of randomValueRatio generated values will be random. + long valueSequence = 0 ; + + + KeyValueGenerator() { + keyRandomizer = new Random(0L); //TODO with seed zero + valueRandomizer = new Random(1L); //TODO with seed one + } + + // Key is always random now. + void getKey(byte[] key) { + keyRandomizer.nextBytes(key); + } + + void getValue(byte[] value) { + if (valueSequence % randomValueRatio == 0) + valueRandomizer.nextBytes(value); + valueSequence++; + } + } + + /** + * + * @param fileType "HFile" or "SequenceFile" + * @param keyLength + * @param valueLength + * @param codecName "none", "lzo", "gz" + * @param rows number of rows to be written. + * @param writeMethod used for HFile only. + * @param minBlockSize used for HFile only. + * @throws IOException + */ + //TODO writeMethod: implement multiple ways of writing e.g. A) known length (no chunk) B) using a buffer and streaming (for many chunks). + public void timeWrite(String fileType, int keyLength, int valueLength, + String codecName, long rows, String writeMethod, int minBlockSize) + throws IOException { + System.out.println("File Type: " + fileType); + System.out.println("Writing " + fileType + " with codecName: " + codecName); + long totalBytesWritten = 0; + + + //Using separate randomizer for key/value with seeds matching Sequence File. + byte[] key = new byte[keyLength]; + byte[] value = new byte[valueLength]; + KeyValueGenerator generator = new KeyValueGenerator(); + + startTime(); + + Path path = new Path(ROOT_DIR, fileType + ".Performance"); + System.out.println(ROOT_DIR + path.getName()); + FSDataOutputStream fout = createFSOutput(path); + + if ("HFile".equals(fileType)){ + System.out.println("HFile write method: "); + HFile.Writer writer = + new HFile.Writer(fout, minBlockSize, codecName, null); + + // Writing value in one shot. 
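+ // (The Writer above takes the raw output stream, the minimum block size, a codec name of + // "none", "lzo" or "gz", and an optional RawComparator -- the same HFile.Writer API that + // TestHFile exercises; each pass of the loop below presumably appends one generated key/value.)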
+ for (long l=0 ; l + * Copied from + * hadoop-3315 tfile. + * Remove after tfile is committed and use the tfile version of this class + * instead.

+ */ +public class TestHFileSeek extends TestCase { + private MyOptions options; + private Configuration conf; + private Path path; + private FileSystem fs; + private NanoTimer timer; + private Random rng; + private RandomDistribution.DiscreteRNG keyLenGen; + private KVGenerator kvGen; + + @Override + public void setUp() throws IOException { + if (options == null) { + options = new MyOptions(new String[0]); + } + + conf = new Configuration(); + conf.setInt("tfile.fs.input.buffer.size", options.fsInputBufferSize); + conf.setInt("tfile.fs.output.buffer.size", options.fsOutputBufferSize); + path = new Path(new Path(options.rootDir), options.file); + fs = path.getFileSystem(conf); + timer = new NanoTimer(false); + rng = new Random(options.seed); + keyLenGen = + new RandomDistribution.Zipf(new Random(rng.nextLong()), + options.minKeyLen, options.maxKeyLen, 1.2); + RandomDistribution.DiscreteRNG valLenGen = + new RandomDistribution.Flat(new Random(rng.nextLong()), + options.minValLength, options.maxValLength); + RandomDistribution.DiscreteRNG wordLenGen = + new RandomDistribution.Flat(new Random(rng.nextLong()), + options.minWordLen, options.maxWordLen); + kvGen = + new KVGenerator(rng, true, keyLenGen, valLenGen, wordLenGen, + options.dictSize); + } + + @Override + public void tearDown() { + try { + fs.close(); + } + catch (Exception e) { + // Nothing + } + } + + private static FSDataOutputStream createFSOutput(Path name, FileSystem fs) + throws IOException { + if (fs.exists(name)) { + fs.delete(name, true); + } + FSDataOutputStream fout = fs.create(name); + return fout; + } + + private void createTFile() throws IOException { + long totalBytes = 0; + FSDataOutputStream fout = createFSOutput(path, fs); + try { + Writer writer = + new Writer(fout, options.minBlockSize, options.compress, null); + try { + BytesWritable key = new BytesWritable(); + BytesWritable val = new BytesWritable(); + timer.start(); + for (long i = 0; true; ++i) { + if (i % 1000 == 0) { // test the size for every 1000 rows. + if (fs.getFileStatus(path).getLen() >= options.fileSize) { + break; + } + } + kvGen.next(key, val, false); + byte [] k = new byte [key.getLength()]; + System.arraycopy(key.getBytes(), 0, k, 0, key.getLength()); + byte [] v = new byte [val.getLength()]; + System.arraycopy(val.getBytes(), 0, v, 0, val.getLength()); + writer.append(k, v); + totalBytes += key.getLength(); + totalBytes += val.getLength(); + } + timer.stop(); + } + finally { + writer.close(); + } + } + finally { + fout.close(); + } + double duration = (double)timer.read()/1000; // in us.
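+ // (Assuming NanoTimer.read() reports nanoseconds, duration is in microseconds here, so the + // bytes/duration figures printed below come out roughly in MB/s.)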
+ long fsize = fs.getFileStatus(path).getLen(); + + System.out.printf( + "time: %s...uncompressed: %.2fMB...raw thrpt: %.2fMB/s\n", + timer.toString(), (double) totalBytes / 1024 / 1024, totalBytes + / duration); + System.out.printf("time: %s...file size: %.2fMB...disk thrpt: %.2fMB/s\n", + timer.toString(), (double) fsize / 1024 / 1024, fsize / duration); + } + + public void seekTFile() throws IOException { + int miss = 0; + long totalBytes = 0; + FSDataInputStream fsdis = fs.open(path); + Reader reader = + new Reader(fsdis, fs.getFileStatus(path).getLen(), null); + reader.loadFileInfo(); + System.out.println(reader); + KeySampler kSampler = + new KeySampler(rng, reader.getFirstKey(), reader.getLastKey(), + keyLenGen); + HFileScanner scanner = reader.getScanner(); + BytesWritable key = new BytesWritable(); + BytesWritable val = new BytesWritable(); + timer.reset(); + timer.start(); + for (int i = 0; i < options.seekCount; ++i) { + kSampler.next(key); + byte [] k = new byte [key.getLength()]; + System.arraycopy(key.getBytes(), 0, k, 0, key.getLength()); + if (scanner.seekTo(k) >= 0) { + ByteBuffer bbkey = scanner.getKey(); + ByteBuffer bbval = scanner.getValue(); + totalBytes += bbkey.limit(); + totalBytes += bbval.limit(); + } + else { + ++miss; + } + } + timer.stop(); + double duration = (double) timer.read() / 1000; // in us. + System.out.printf( + "time: %s...avg seek: %s...%d hit...%d miss...avg I/O size: %.2fKB\n", + timer.toString(), NanoTimer.nanoTimeToString(timer.read() + / options.seekCount), options.seekCount - miss, miss, + (double) totalBytes / 1024 / (options.seekCount - miss)); + + } + + public void testSeeks() throws IOException { + if (options.doCreate()) { + createTFile(); + } + + if (options.doRead()) { + seekTFile(); + } + + if (options.doCreate()) { + fs.delete(path, true); + } + } + + private static class IntegerRange { + private final int from, to; + + public IntegerRange(int from, int to) { + this.from = from; + this.to = to; + } + + public static IntegerRange parse(String s) throws ParseException { + StringTokenizer st = new StringTokenizer(s, " \t,"); + if (st.countTokens() != 2) { + throw new ParseException("Bad integer specification: " + s); + } + int from = Integer.parseInt(st.nextToken()); + int to = Integer.parseInt(st.nextToken()); + return new IntegerRange(from, to); + } + + public int from() { + return from; + } + + public int to() { + return to; + } + } + + private static class MyOptions { + // hard coded constants + int dictSize = 1000; + int minWordLen = 5; + int maxWordLen = 20; + int osInputBufferSize = 64 * 1024; + int osOutputBufferSize = 64 * 1024; + int fsInputBufferSizeNone = 0; + int fsInputBufferSizeLzo = 0; + int fsInputBufferSizeGz = 0; + int fsOutputBufferSizeNone = 1; + int fsOutputBufferSizeLzo = 1; + int fsOutputBufferSizeGz = 1; + + String rootDir = + System.getProperty("test.build.data", "/tmp/TestTFileSeek"); + String file = "TestTFileSeek"; + // String compress = "lzo"; DISABLED + String compress = "none"; + int minKeyLen = 10; + int maxKeyLen = 50; + int minValLength = 1024; + int maxValLength = 2 * 1024; + int minBlockSize = 1 * 1024 * 1024; + int fsOutputBufferSize = 1; + int fsInputBufferSize = 0; + // Default writing 10MB. 
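+ // (The create phase keeps appending generated key/values until the file on disk reaches this + // size; createTFile above checks the file length every 1000 rows.)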
+ long fileSize = 10 * 1024 * 1024; + long seekCount = 1000; + long seed; + + static final int OP_CREATE = 1; + static final int OP_READ = 2; + int op = OP_CREATE | OP_READ; + + boolean proceed = false; + + public MyOptions(String[] args) { + seed = System.nanoTime(); + + try { + Options opts = buildOptions(); + CommandLineParser parser = new GnuParser(); + CommandLine line = parser.parse(opts, args, true); + processOptions(line, opts); + validateOptions(); + } + catch (ParseException e) { + System.out.println(e.getMessage()); + System.out.println("Try \"--help\" option for details."); + setStopProceed(); + } + } + + public boolean proceed() { + return proceed; + } + + private Options buildOptions() { + Option compress = + OptionBuilder.withLongOpt("compress").withArgName("[none|lzo|gz]") + .hasArg().withDescription("compression scheme").create('c'); + + Option fileSize = + OptionBuilder.withLongOpt("file-size").withArgName("size-in-MB") + .hasArg().withDescription("target size of the file (in MB).") + .create('s'); + + Option fsInputBufferSz = + OptionBuilder.withLongOpt("fs-input-buffer").withArgName("size") + .hasArg().withDescription( + "size of the file system input buffer (in bytes).").create( + 'i'); + + Option fsOutputBufferSize = + OptionBuilder.withLongOpt("fs-output-buffer").withArgName("size") + .hasArg().withDescription( + "size of the file system output buffer (in bytes).").create( + 'o'); + + Option keyLen = + OptionBuilder + .withLongOpt("key-length") + .withArgName("min,max") + .hasArg() + .withDescription( + "the length range of the key (in bytes)") + .create('k'); + + Option valueLen = + OptionBuilder + .withLongOpt("value-length") + .withArgName("min,max") + .hasArg() + .withDescription( + "the length range of the value (in bytes)") + .create('v'); + + Option blockSz = + OptionBuilder.withLongOpt("block").withArgName("size-in-KB").hasArg() + .withDescription("minimum block size (in KB)").create('b'); + + Option seed = + OptionBuilder.withLongOpt("seed").withArgName("long-int").hasArg() + .withDescription("specify the seed").create('S'); + + Option operation = + OptionBuilder.withLongOpt("operation").withArgName("r|w|rw").hasArg() + .withDescription( + "action: seek-only, create-only, seek-after-create").create( + 'x'); + + Option rootDir = + OptionBuilder.withLongOpt("root-dir").withArgName("path").hasArg() + .withDescription( + "specify root directory where files will be created.") + .create('r'); + + Option file = + OptionBuilder.withLongOpt("file").withArgName("name").hasArg() + .withDescription("specify the file name to be created or read.") + .create('f'); + + Option seekCount = + OptionBuilder + .withLongOpt("seek") + .withArgName("count") + .hasArg() + .withDescription( + "specify how many seek operations we perform (requires -x r or -x rw.") + .create('n'); + + Option help = + OptionBuilder.withLongOpt("help").hasArg(false).withDescription( + "show this screen").create("h"); + + return new Options().addOption(compress).addOption(fileSize).addOption( + fsInputBufferSz).addOption(fsOutputBufferSize).addOption(keyLen) + .addOption(blockSz).addOption(rootDir).addOption(valueLen).addOption( + operation).addOption(seekCount).addOption(file).addOption(help); + + } + + private void processOptions(CommandLine line, Options opts) + throws ParseException { + // --help -h and --version -V must be processed first. 
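+ // Illustrative command line (hypothetical classpath and paths, not part of this patch): + // java ... org.apache.hadoop.hbase.io.hfile.TestHFileSeek -x rw -s 10 -k 10,50 -v 1024,2048 -n 1000 -c none + // i.e. create a roughly 10MB uncompressed file, then run 1000 random seeks against it.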
+ if (line.hasOption('h')) { + HelpFormatter formatter = new HelpFormatter(); + System.out.println("TFile and SeqFile benchmark."); + System.out.println(); + formatter.printHelp(100, + "java ... TestTFileSeqFileComparison [options]", + "\nSupported options:", opts, ""); + return; + } + + if (line.hasOption('c')) { + compress = line.getOptionValue('c'); + } + + if (line.hasOption('d')) { + dictSize = Integer.parseInt(line.getOptionValue('d')); + } + + if (line.hasOption('s')) { + fileSize = Long.parseLong(line.getOptionValue('s')) * 1024 * 1024; + } + + if (line.hasOption('i')) { + fsInputBufferSize = Integer.parseInt(line.getOptionValue('i')); + } + + if (line.hasOption('o')) { + fsOutputBufferSize = Integer.parseInt(line.getOptionValue('o')); + } + + if (line.hasOption('n')) { + seekCount = Integer.parseInt(line.getOptionValue('n')); + } + + if (line.hasOption('k')) { + IntegerRange ir = IntegerRange.parse(line.getOptionValue('k')); + minKeyLen = ir.from(); + maxKeyLen = ir.to(); + } + + if (line.hasOption('v')) { + IntegerRange ir = IntegerRange.parse(line.getOptionValue('v')); + minValLength = ir.from(); + maxValLength = ir.to(); + } + + if (line.hasOption('b')) { + minBlockSize = Integer.parseInt(line.getOptionValue('b')) * 1024; + } + + if (line.hasOption('r')) { + rootDir = line.getOptionValue('r'); + } + + if (line.hasOption('f')) { + file = line.getOptionValue('f'); + } + + if (line.hasOption('S')) { + seed = Long.parseLong(line.getOptionValue('S')); + } + + if (line.hasOption('x')) { + String strOp = line.getOptionValue('x'); + if (strOp.equals("r")) { + op = OP_READ; + } + else if (strOp.equals("w")) { + op = OP_CREATE; + } + else if (strOp.equals("rw")) { + op = OP_CREATE | OP_READ; + } + else { + throw new ParseException("Unknown action specifier: " + strOp); + } + } + + proceed = true; + } + + private void validateOptions() throws ParseException { + if (!compress.equals("none") && !compress.equals("lzo") + && !compress.equals("gz")) { + throw new ParseException("Unknown compression scheme: " + compress); + } + + if (minKeyLen >= maxKeyLen) { + throw new ParseException( + "Max key length must be greater than min key length."); + } + + if (minValLength >= maxValLength) { + throw new ParseException( + "Max value length must be greater than min value length."); + } + + if (minWordLen >= maxWordLen) { + throw new ParseException( + "Max word length must be greater than min word length."); + } + return; + } + + private void setStopProceed() { + proceed = false; + } + + public boolean doCreate() { + return (op & OP_CREATE) != 0; + } + + public boolean doRead() { + return (op & OP_READ) != 0; + } + } + + public static void main(String[] argv) throws IOException { + TestHFileSeek testCase = new TestHFileSeek(); + MyOptions options = new MyOptions(argv); + + if (options.proceed == false) { + return; + } + + testCase.options = options; + testCase.setUp(); + testCase.testSeeks(); + testCase.tearDown(); + } +} diff --git a/src/test/org/apache/hadoop/hbase/io/hfile/TestSeekTo.java b/src/test/org/apache/hadoop/hbase/io/hfile/TestSeekTo.java new file mode 100644 index 00000000000..cedfb3e9e07 --- /dev/null +++ b/src/test/org/apache/hadoop/hbase/io/hfile/TestSeekTo.java @@ -0,0 +1,146 @@ +/** + * Copyright 2009 The Apache Software Foundation + * + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. 
The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.hadoop.hbase.io.hfile; + +import java.io.IOException; + +import junit.framework.TestCase; + +import org.apache.hadoop.fs.FSDataOutputStream; +import org.apache.hadoop.fs.LocalFileSystem; +import org.apache.hadoop.fs.Path; +import org.apache.hadoop.fs.RawLocalFileSystem; +import org.apache.hadoop.hbase.HBaseConfiguration; +import org.apache.hadoop.hbase.io.hfile.HFile; +import org.apache.hadoop.hbase.util.Bytes; + +/** + * Test {@link HFileScanner#seekTo(byte[])} and its variants. + */ +public class TestSeekTo extends TestCase { + private static String ROOT_DIR = + System.getProperty("test.build.data", "/tmp/TestHFile"); + + private HBaseConfiguration conf; + private LocalFileSystem fs; + + @Override + public void setUp() { + conf = new HBaseConfiguration(); + RawLocalFileSystem rawLFS = new RawLocalFileSystem(); + rawLFS.setConf(conf); + fs = new LocalFileSystem(rawLFS); + } + private FSDataOutputStream createFSOutput(Path name) throws IOException { + if (fs.exists(name)) fs.delete(name, true); + FSDataOutputStream fout = fs.create(name); + return fout; + } + + Path makeNewFile() throws IOException { + Path ncTFile = new Path(ROOT_DIR, "basic.hfile"); + FSDataOutputStream fout = createFSOutput(ncTFile); + HFile.Writer writer = new HFile.Writer(fout, 40, "none", null); + // 4 bytes * 3 * 2 for each key/value + + // 3 for keys, 15 for values = 42 (woot) + writer.append(Bytes.toBytes("c"), Bytes.toBytes("value")); + writer.append(Bytes.toBytes("e"), Bytes.toBytes("value")); + writer.append(Bytes.toBytes("g"), Bytes.toBytes("value")); + // block transition + writer.append(Bytes.toBytes("i"), Bytes.toBytes("value")); + writer.append(Bytes.toBytes("k"), Bytes.toBytes("value")); + writer.close(); + fout.close(); + return ncTFile; + } + public void testSeekBefore() throws Exception { + Path p = makeNewFile(); + HFile.Reader reader = new HFile.Reader(fs, p, null); + reader.loadFileInfo(); + HFileScanner scanner = reader.getScanner(); + assertEquals(false, scanner.seekBefore(Bytes.toBytes("a"))); + + assertEquals(false, scanner.seekBefore(Bytes.toBytes("c"))); + + assertEquals(true, scanner.seekBefore(Bytes.toBytes("d"))); + assertEquals("c", scanner.getKeyString()); + + assertEquals(true, scanner.seekBefore(Bytes.toBytes("e"))); + assertEquals("c", scanner.getKeyString()); + + assertEquals(true, scanner.seekBefore(Bytes.toBytes("f"))); + assertEquals("e", scanner.getKeyString()); + + assertEquals(true, scanner.seekBefore(Bytes.toBytes("g"))); + assertEquals("e", scanner.getKeyString()); + + assertEquals(true, scanner.seekBefore(Bytes.toBytes("h"))); + assertEquals("g", scanner.getKeyString()); + assertEquals(true, scanner.seekBefore(Bytes.toBytes("i"))); + assertEquals("g", scanner.getKeyString()); + assertEquals(true, scanner.seekBefore(Bytes.toBytes("j"))); + assertEquals("i", scanner.getKeyString()); + assertEquals(true, scanner.seekBefore(Bytes.toBytes("k"))); + assertEquals("i", 
scanner.getKeyString()); + assertEquals(true, scanner.seekBefore(Bytes.toBytes("l"))); + assertEquals("k", scanner.getKeyString()); + } + + public void testSeekTo() throws Exception { + Path p = makeNewFile(); + HFile.Reader reader = new HFile.Reader(fs, p, null); + reader.loadFileInfo(); + assertEquals(2, reader.blockIndex.count); + HFileScanner scanner = reader.getScanner(); + // lies before the start of the file. + assertEquals(-1, scanner.seekTo(Bytes.toBytes("a"))); + + assertEquals(1, scanner.seekTo(Bytes.toBytes("d"))); + assertEquals("c", scanner.getKeyString()); + + // Across a block boundary now. + assertEquals(1, scanner.seekTo(Bytes.toBytes("h"))); + assertEquals("g", scanner.getKeyString()); + + assertEquals(1, scanner.seekTo(Bytes.toBytes("l"))); + assertEquals("k", scanner.getKeyString()); + } + + public void testBlockContainingKey() throws Exception { + Path p = makeNewFile(); + HFile.Reader reader = new HFile.Reader(fs, p, null); + reader.loadFileInfo(); + System.out.println(reader.blockIndex.toString()); + // falls before the start of the file. + assertEquals(-1, reader.blockIndex.blockContainingKey(Bytes.toBytes("a"))); + assertEquals(0, reader.blockIndex.blockContainingKey(Bytes.toBytes("c"))); + assertEquals(0, reader.blockIndex.blockContainingKey(Bytes.toBytes("d"))); + assertEquals(0, reader.blockIndex.blockContainingKey(Bytes.toBytes("e"))); + assertEquals(0, reader.blockIndex.blockContainingKey(Bytes.toBytes("g"))); + assertEquals(0, reader.blockIndex.blockContainingKey(Bytes.toBytes("h"))); + assertEquals(1, reader.blockIndex.blockContainingKey(Bytes.toBytes("i"))); + assertEquals(1, reader.blockIndex.blockContainingKey(Bytes.toBytes("j"))); + assertEquals(1, reader.blockIndex.blockContainingKey(Bytes.toBytes("k"))); + assertEquals(1, reader.blockIndex.blockContainingKey(Bytes.toBytes("l"))); + + + + } +} \ No newline at end of file diff --git a/src/test/org/apache/hadoop/hbase/regionserver/TestHStoreFile.java b/src/test/org/apache/hadoop/hbase/regionserver/TestHStoreFile.java deleted file mode 100644 index 64d8e0ed6b9..00000000000 --- a/src/test/org/apache/hadoop/hbase/regionserver/TestHStoreFile.java +++ /dev/null @@ -1,388 +0,0 @@ -/** - * Copyright 2007 The Apache Software Foundation - * - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ -package org.apache.hadoop.hbase.regionserver; - -import java.io.IOException; - -import org.apache.commons.logging.Log; -import org.apache.commons.logging.LogFactory; -import org.apache.hadoop.fs.Path; -import org.apache.hadoop.hbase.HBaseTestCase; -import org.apache.hadoop.hbase.HConstants; -import org.apache.hadoop.hbase.HRegionInfo; -import org.apache.hadoop.hbase.HStoreKey; -import org.apache.hadoop.hbase.io.HalfMapFileReader; -import org.apache.hadoop.hbase.io.ImmutableBytesWritable; -import org.apache.hadoop.hbase.io.MapFile; -import org.apache.hadoop.hbase.io.SequenceFile; -import org.apache.hadoop.hbase.io.Reference; -import org.apache.hadoop.hbase.util.Bytes; -import org.apache.hadoop.hdfs.MiniDFSCluster; -import org.apache.hadoop.io.WritableComparable; -/** - * Test HStoreFile - */ -public class TestHStoreFile extends HBaseTestCase { - static final Log LOG = LogFactory.getLog(TestHStoreFile.class); - private static String DIR = "/"; - private MiniDFSCluster cluster; - private Path dir = null; - - @Override - public void setUp() throws Exception { - try { - this.cluster = new MiniDFSCluster(this.conf, 2, true, (String[])null); - // Set the hbase.rootdir to be the home directory in mini dfs. - this.conf.set(HConstants.HBASE_DIR, - this.cluster.getFileSystem().getHomeDirectory().toString()); - this.dir = new Path(DIR, getName()); - } catch (IOException e) { - shutdownDfs(cluster); - } - super.setUp(); - } - - @Override - public void tearDown() throws Exception { - super.tearDown(); - shutdownDfs(cluster); - // ReflectionUtils.printThreadInfo(new PrintWriter(System.out), - // "Temporary end-of-test thread dump debugging HADOOP-2040: " + getName()); - } - - private Path writeMapFile(final String name) - throws IOException { - Path path = new Path(DIR, name); - MapFile.Writer writer = new MapFile.Writer(this.conf, fs, path.toString(), - HStoreKey.class, ImmutableBytesWritable.class); - writeStoreFile(writer); - return path; - } - - private Path writeSmallMapFile(final String name) - throws IOException { - Path path = new Path(DIR, name); - MapFile.Writer writer = new MapFile.Writer(this.conf, fs, path.toString(), - HStoreKey.class, ImmutableBytesWritable.class); - try { - for (char d = FIRST_CHAR; d <= LAST_CHAR; d++) { - byte[] b = new byte[] {(byte)d}; - byte [] t = Bytes.toBytes(new String(b, HConstants.UTF8_ENCODING)); - writer.append(new HStoreKey(t, t, System.currentTimeMillis()), - new ImmutableBytesWritable(t)); - } - } finally { - writer.close(); - } - return path; - } - - /* - * Writes HStoreKey and ImmutableBytes data to passed writer and - * then closes it. - * @param writer - * @throws IOException - */ - private void writeStoreFile(final MapFile.Writer writer) - throws IOException { - try { - for (char d = FIRST_CHAR; d <= LAST_CHAR; d++) { - for (char e = FIRST_CHAR; e <= LAST_CHAR; e++) { - byte[] b = new byte[] { (byte) d, (byte) e }; - byte [] t = Bytes.toBytes(new String(b, HConstants.UTF8_ENCODING)); - writer.append(new HStoreKey(t, t, System.currentTimeMillis()), - new ImmutableBytesWritable(t)); - } - } - } finally { - writer.close(); - } - } - - /** - * Test that our mechanism of writing store files in one region to reference - * store files in other regions works. - * @throws IOException - */ - public void testReference() - throws IOException { - // Make a store file and write data to it. 
- HStoreFile hsf = new HStoreFile(this.conf, this.fs, this.dir, - HRegionInfo.FIRST_META_REGIONINFO, - Bytes.toBytes("colfamily"), 1234567890L, null); - MapFile.Writer writer = - hsf.getWriter(this.fs, SequenceFile.CompressionType.NONE, false, 0); - writeStoreFile(writer); - MapFile.Reader reader = hsf.getReader(this.fs, false, false); - // Split on a row, not in middle of row. Midkey returned by reader - // may be in middle of row. Create new one with empty column and - // timestamp. - HStoreKey midkey = new HStoreKey(((HStoreKey)reader.midKey()).getRow()); - HStoreKey hsk = new HStoreKey(); - reader.finalKey(hsk); - byte [] finalKey = hsk.getRow(); - // Make a reference for the bottom half of the just written file. - Reference reference = - new Reference(hsf.getEncodedRegionName(), hsf.getFileId(), - midkey, Reference.Range.top); - HStoreFile refHsf = new HStoreFile(this.conf, this.fs, - new Path(DIR, getName()), - HRegionInfo.FIRST_META_REGIONINFO, - hsf.getColFamily(), 456, reference); - // Assert that reference files are written and that we can write and - // read the info reference file at least. - refHsf.writeReferenceFiles(this.fs); - assertTrue(this.fs.exists(refHsf.getMapFilePath())); - assertTrue(this.fs.exists(refHsf.getInfoFilePath())); - Reference otherReference = - HStoreFile.readSplitInfo(refHsf.getInfoFilePath(), this.fs); - assertEquals(reference.getEncodedRegionName(), - otherReference.getEncodedRegionName()); - assertEquals(reference.getFileId(), - otherReference.getFileId()); - assertEquals(reference.getMidkey().toString(), - otherReference.getMidkey().toString()); - // Now confirm that I can read from the reference and that it only gets - // keys from top half of the file. - MapFile.Reader halfReader = refHsf.getReader(this.fs, false, false); - HStoreKey key = new HStoreKey(); - ImmutableBytesWritable value = new ImmutableBytesWritable(); - boolean first = true; - while(halfReader.next(key, value)) { - if (first) { - assertTrue(Bytes.equals(key.getRow(), midkey.getRow())); - first = false; - } - } - assertTrue(Bytes.equals(key.getRow(), finalKey)); - } - - /** - * Write a file and then assert that we can read from top and bottom halves - * using two HalfMapFiles. - * @throws Exception - */ - public void testBasicHalfMapFile() throws Exception { - Path p = writeMapFile(getName()); - WritableComparable midkey = getMidkey(p); - checkHalfMapFile(p, midkey); - } - - /** - * Check HalfMapFile works even if file we're to go against is smaller than - * the default MapFile interval of 128: i.e. index gets entry every 128 - * keys. - * @throws Exception - */ - public void testSmallHalfMapFile() throws Exception { - Path p = writeSmallMapFile(getName()); - // I know keys are a-z. Let the midkey we want to use be 'd'. See if - // HalfMapFiles work even if size of file is < than default MapFile - // interval. 
- checkHalfMapFile(p, new HStoreKey("d")); - } - - private WritableComparable getMidkey(final Path p) throws IOException { - MapFile.Reader reader = - new MapFile.Reader(this.fs, p.toString(), this.conf); - HStoreKey key = new HStoreKey(); - ImmutableBytesWritable value = new ImmutableBytesWritable(); - reader.next(key, value); - String firstKey = key.toString(); - WritableComparable midkey = reader.midKey(); - reader.finalKey(key); - LOG.info("First key " + firstKey + ", midkey " + midkey.toString() - + ", last key " + key.toString()); - reader.close(); - return midkey; - } - - private void checkHalfMapFile(final Path p, WritableComparable midkey) - throws IOException { - MapFile.Reader top = null; - MapFile.Reader bottom = null; - HStoreKey key = new HStoreKey(); - ImmutableBytesWritable value = new ImmutableBytesWritable(); - String previous = null; - try { - // Now make two HalfMapFiles and assert they can read the full backing - // file, one from the top and the other from the bottom. - // Test bottom half first. - bottom = new HalfMapFileReader(this.fs, p.toString(), - this.conf, Reference.Range.bottom, midkey, null); - boolean first = true; - while (bottom.next(key, value)) { - previous = key.toString(); - if (first) { - first = false; - LOG.info("First in bottom: " + previous); - } - assertTrue(key.compareTo((HStoreKey)midkey) < 0); - } - if (previous != null) { - LOG.info("Last in bottom: " + previous.toString()); - } - // Now test reading from the top. - top = new HalfMapFileReader(this.fs, p.toString(), this.conf, - Reference.Range.top, midkey, null); - first = true; - while (top.next(key, value)) { - assertTrue(key.compareTo((HStoreKey)midkey) >= 0); - if (first) { - first = false; - LOG.info("First in top: " + key.toString()); - } - } - LOG.info("Last in top: " + key.toString()); - - // Next test using a midkey that does not exist in the file. - // First, do a key that is < than first key. Ensure splits behave - // properly. - WritableComparable badkey = new HStoreKey(" "); - bottom = new HalfMapFileReader(this.fs, p.toString(), - this.conf, Reference.Range.bottom, badkey, null); - // When badkey is < than the bottom, should return no values. - assertFalse(bottom.next(key, value)); - // Now read from the top. - top = new HalfMapFileReader(this.fs, p.toString(), this.conf, - Reference.Range.top, badkey, null); - first = true; - while (top.next(key, value)) { - assertTrue(key.compareTo((HStoreKey)badkey) >= 0); - if (first) { - first = false; - LOG.info("First top when key < bottom: " + key.toString()); - String tmp = Bytes.toString(key.getRow()); - for (int i = 0; i < tmp.length(); i++) { - assertTrue(tmp.charAt(i) == 'a'); - } - } - } - LOG.info("Last top when key < bottom: " + key.toString()); - String tmp = Bytes.toString(key.getRow()); - for (int i = 0; i < tmp.length(); i++) { - assertTrue(tmp.charAt(i) == 'z'); - } - - // Test when badkey is > than last key in file ('||' > 'zz'). 
- badkey = new HStoreKey("|||"); - bottom = new HalfMapFileReader(this.fs, p.toString(), - this.conf, Reference.Range.bottom, badkey, null); - first = true; - while (bottom.next(key, value)) { - if (first) { - first = false; - LOG.info("First bottom when key > top: " + key.toString()); - tmp = Bytes.toString(key.getRow()); - for (int i = 0; i < tmp.length(); i++) { - assertTrue(tmp.charAt(i) == 'a'); - } - } - } - LOG.info("Last bottom when key > top: " + key.toString()); - tmp = Bytes.toString(key.getRow()); - for (int i = 0; i < tmp.length(); i++) { - assertTrue(tmp.charAt(i) == 'z'); - } - // Now look at top. Should not return any values. - top = new HalfMapFileReader(this.fs, p.toString(), this.conf, - Reference.Range.top, badkey, null); - assertFalse(top.next(key, value)); - - } finally { - if (top != null) { - top.close(); - } - if (bottom != null) { - bottom.close(); - } - fs.delete(p, true); - } - } - - /** - * Assert HalFMapFile does right thing when midkey does not exist in the - * backing file (its larger or smaller than any of the backing mapfiles keys). - * - * @throws Exception - */ - public void testOutOfRangeMidkeyHalfMapFile() throws Exception { - MapFile.Reader top = null; - MapFile.Reader bottom = null; - HStoreKey key = new HStoreKey(); - ImmutableBytesWritable value = new ImmutableBytesWritable(); - Path p = writeMapFile(getName()); - try { - try { - // Test using a midkey that does not exist in the file. - // First, do a key that is < than first key. Ensure splits behave - // properly. - HStoreKey midkey = new HStoreKey(" "); - bottom = new HalfMapFileReader(this.fs, p.toString(), - this.conf, Reference.Range.bottom, midkey, null); - // When midkey is < than the bottom, should return no values. - assertFalse(bottom.next(key, value)); - // Now read from the top. - top = new HalfMapFileReader(this.fs, p.toString(), - this.conf, Reference.Range.top, midkey, null); - boolean first = true; - while (top.next(key, value)) { - assertTrue(key.compareTo(midkey) >= 0); - if (first) { - first = false; - LOG.info("First top when key < bottom: " + key.toString()); - assertEquals("aa", Bytes.toString(key.getRow())); - } - } - LOG.info("Last top when key < bottom: " + key.toString()); - assertEquals("zz", Bytes.toString(key.getRow())); - - // Test when midkey is > than last key in file ('||' > 'zz'). - midkey = new HStoreKey("|||"); - bottom = new HalfMapFileReader(this.fs, p.toString(), - this.conf, Reference.Range.bottom, midkey, null); - first = true; - while (bottom.next(key, value)) { - if (first) { - first = false; - LOG.info("First bottom when key > top: " + key.toString()); - assertEquals("aa", Bytes.toString(key.getRow())); - } - } - LOG.info("Last bottom when key > top: " + key.toString()); - assertEquals("zz", Bytes.toString(key.getRow())); - // Now look at top. Should not return any values. 
- top = new HalfMapFileReader(this.fs, p.toString(), - this.conf, Reference.Range.top, midkey, null); - assertFalse(top.next(key, value)); - } finally { - if (top != null) { - top.close(); - } - if (bottom != null) { - bottom.close(); - } - fs.delete(p, true); - } - } finally { - this.fs.delete(p, true); - } - } -} \ No newline at end of file diff --git a/src/test/org/apache/hadoop/hbase/regionserver/TestStoreFile.java b/src/test/org/apache/hadoop/hbase/regionserver/TestStoreFile.java new file mode 100644 index 00000000000..f91d53d49fd --- /dev/null +++ b/src/test/org/apache/hadoop/hbase/regionserver/TestStoreFile.java @@ -0,0 +1,295 @@ +/** + * Copyright 2007 The Apache Software Foundation + * + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.hadoop.hbase.regionserver; + +import java.io.IOException; +import java.nio.ByteBuffer; + +import org.apache.commons.logging.Log; +import org.apache.commons.logging.LogFactory; +import org.apache.hadoop.fs.Path; +import org.apache.hadoop.hbase.HBaseTestCase; +import org.apache.hadoop.hbase.HConstants; +import org.apache.hadoop.hbase.HStoreKey; +import org.apache.hadoop.hbase.io.Reference.Range; +import org.apache.hadoop.hbase.io.hfile.HFile; +import org.apache.hadoop.hbase.io.hfile.HFileScanner; +import org.apache.hadoop.hbase.util.Bytes; +import org.apache.hadoop.hdfs.MiniDFSCluster; +/** + * Test HStoreFile + */ +public class TestStoreFile extends HBaseTestCase { + static final Log LOG = LogFactory.getLog(TestStoreFile.class); + private MiniDFSCluster cluster; + + @Override + public void setUp() throws Exception { + try { + this.cluster = new MiniDFSCluster(this.conf, 2, true, (String[])null); + // Set the hbase.rootdir to be the home directory in mini dfs. + this.conf.set(HConstants.HBASE_DIR, + this.cluster.getFileSystem().getHomeDirectory().toString()); + } catch (IOException e) { + shutdownDfs(cluster); + } + super.setUp(); + } + + @Override + public void tearDown() throws Exception { + super.tearDown(); + shutdownDfs(cluster); + // ReflectionUtils.printThreadInfo(new PrintWriter(System.out), + // "Temporary end-of-test thread dump debugging HADOOP-2040: " + getName()); + } + + /** + * Write a file and then assert that we can read from top and bottom halves + * using two HalfMapFiles. + * @throws Exception + */ + public void testBasicHalfMapFile() throws Exception { + // Make up a directory hierarchy that has a regiondir and familyname. + HFile.Writer writer = StoreFile.getWriter(this.fs, + new Path(new Path(this.testDir, "regionname"), "familyname"), + 2 * 1024, null, null); + writeStoreFile(writer); + checkHalfHFile(new StoreFile(this.fs, writer.getPath())); + } + + /* + * Writes HStoreKey and ImmutableBytes data to passed writer and + * then closes it. 
+ * @param writer + * @throws IOException + */ + private void writeStoreFile(final HFile.Writer writer) + throws IOException { + try { + for (char d = FIRST_CHAR; d <= LAST_CHAR; d++) { + for (char e = FIRST_CHAR; e <= LAST_CHAR; e++) { + byte[] b = new byte[] { (byte) d, (byte) e }; + byte [] t = Bytes.toBytes(new String(b, HConstants.UTF8_ENCODING)); + HStoreKey hsk = new HStoreKey(t, t, System.currentTimeMillis()); + writer.append(hsk.getBytes(), t); + } + } + } finally { + writer.close(); + } + } + + /** + * Test that our mechanism of writing store files in one region to reference + * store files in other regions works. + * @throws IOException + */ + public void testReference() + throws IOException { + Path storedir = new Path(new Path(this.testDir, "regionname"), "familyname"); + Path dir = new Path(storedir, "1234567890"); + // Make a store file and write data to it. + HFile.Writer writer = StoreFile.getWriter(this.fs, dir, 8 * 1024, null, null); + writeStoreFile(writer); + StoreFile hsf = new StoreFile(this.fs, writer.getPath()); + HFile.Reader reader = hsf.getReader(); + // Split on a row, not in middle of row. Midkey returned by reader + // may be in middle of row. Create new one with empty column and + // timestamp. + HStoreKey hsk = HStoreKey.create(reader.midkey()); + byte [] midkey = hsk.getRow(); + hsk = HStoreKey.create(reader.getLastKey()); + byte [] finalKey = hsk.getRow(); + // Make a reference + Path refPath = StoreFile.split(fs, dir, hsf, reader.midkey(), Range.top); + StoreFile refHsf = new StoreFile(this.fs, refPath); + // Now confirm that I can read from the reference and that it only gets + // keys from top half of the file. + HFileScanner s = refHsf.getReader().getScanner(); + for(boolean first = true; (!s.isSeeked() && s.seekTo()) || s.next();) { + ByteBuffer bb = s.getKey(); + hsk = HStoreKey.create(bb.array(), bb.arrayOffset(), bb.limit()); + if (first) { + assertTrue(Bytes.equals(hsk.getRow(), midkey)); + first = false; + } + } + assertTrue(Bytes.equals(hsk.getRow(), finalKey)); + } + + private void checkHalfHFile(final StoreFile f) + throws IOException { + byte [] midkey = f.getReader().midkey(); + // Create top split. + Path topDir = Store.getStoreHomedir(this.testDir, 1, + Bytes.toBytes(f.getPath().getParent().getName())); + if (this.fs.exists(topDir)) { + this.fs.delete(topDir, true); + } + Path topPath = StoreFile.split(this.fs, topDir, f, midkey, Range.top); + // Create bottom split. + Path bottomDir = Store.getStoreHomedir(this.testDir, 2, + Bytes.toBytes(f.getPath().getParent().getName())); + if (this.fs.exists(bottomDir)) { + this.fs.delete(bottomDir, true); + } + Path bottomPath = StoreFile.split(this.fs, bottomDir, + f, midkey, Range.bottom); + // Make readers on top and bottom. + HFile.Reader top = new StoreFile(this.fs, topPath).getReader(); + HFile.Reader bottom = new StoreFile(this.fs, bottomPath).getReader(); + ByteBuffer previous = null; + LOG.info("Midkey: " + Bytes.toString(midkey)); + byte [] midkeyBytes = new HStoreKey(midkey).getBytes(); + ByteBuffer bbMidkeyBytes = ByteBuffer.wrap(midkeyBytes); + try { + // Now make two HalfMapFiles and assert they can read the full backing + // file, one from the top and the other from the bottom. + // Test bottom half first. + // Now test reading from the top. 
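+ // (Invariant checked here: every key served by the top half compares >= the split midkey, + // while the bottom half, scanned next, must stay strictly below it.)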
+ boolean first = true; + ByteBuffer key = null; + HFileScanner topScanner = top.getScanner(); + while ((!topScanner.isSeeked() && topScanner.seekTo()) || + (topScanner.isSeeked() && topScanner.next())) { + key = topScanner.getKey(); + + assertTrue(topScanner.getReader().getComparator().compare(key.array(), + key.arrayOffset(), key.limit(), midkeyBytes, 0, midkeyBytes.length) >= 0); + if (first) { + first = false; + LOG.info("First in top: " + Bytes.toString(Bytes.toBytes(key))); + } + } + LOG.info("Last in top: " + Bytes.toString(Bytes.toBytes(key))); + + first = true; + HFileScanner bottomScanner = bottom.getScanner(); + while ((!bottomScanner.isSeeked() && bottomScanner.seekTo()) || + bottomScanner.next()) { + previous = bottomScanner.getKey(); + key = bottomScanner.getKey(); + if (first) { + first = false; + LOG.info("First in bottom: " + + Bytes.toString(Bytes.toBytes(previous))); + } + assertTrue(key.compareTo(bbMidkeyBytes) < 0); + } + if (previous != null) { + LOG.info("Last in bottom: " + Bytes.toString(Bytes.toBytes(previous))); + } + // Remove references. + this.fs.delete(topPath, false); + this.fs.delete(bottomPath, false); + + // Next test using a midkey that does not exist in the file. + // First, do a key that is < than first key. Ensure splits behave + // properly. + byte [] badmidkey = Bytes.toBytes(" ."); + topPath = StoreFile.split(this.fs, topDir, f, badmidkey, Range.top); + bottomPath = StoreFile.split(this.fs, bottomDir, f, badmidkey, + Range.bottom); + top = new StoreFile(this.fs, topPath).getReader(); + bottom = new StoreFile(this.fs, bottomPath).getReader(); + bottomScanner = bottom.getScanner(); + int count = 0; + while ((!bottomScanner.isSeeked() && bottomScanner.seekTo()) || + bottomScanner.next()) { + count++; + } + // When badkey is < than the bottom, should return no values. + assertTrue(count == 0); + // Now read from the top. + first = true; + topScanner = top.getScanner(); + while ((!topScanner.isSeeked() && topScanner.seekTo()) || + topScanner.next()) { + key = topScanner.getKey(); + assertTrue(topScanner.getReader().getComparator().compare(key.array(), + key.arrayOffset(), key.limit(), badmidkey, 0, badmidkey.length) >= 0); + if (first) { + first = false; + first = false; + HStoreKey keyhsk = HStoreKey.create(key); + LOG.info("First top when key < bottom: " + keyhsk); + String tmp = Bytes.toString(keyhsk.getRow()); + for (int i = 0; i < tmp.length(); i++) { + assertTrue(tmp.charAt(i) == 'a'); + } + } + } + HStoreKey keyhsk = HStoreKey.create(key); + LOG.info("Last top when key < bottom: " + keyhsk); + String tmp = Bytes.toString(keyhsk.getRow()); + for (int i = 0; i < tmp.length(); i++) { + assertTrue(tmp.charAt(i) == 'z'); + } + // Remove references. + this.fs.delete(topPath, false); + this.fs.delete(bottomPath, false); + + // Test when badkey is > than last key in file ('||' > 'zz'). 
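+ // (A split point that sorts above every key should leave all data in the bottom half -- + // first key all 'a's, last key all 'z's -- and an empty top half.)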
+ badmidkey = Bytes.toBytes("|||"); + topPath = StoreFile.split(this.fs, topDir, f, badmidkey, Range.top); + bottomPath = StoreFile.split(this.fs, bottomDir, f, badmidkey, + Range.bottom); + top = new StoreFile(this.fs, topPath).getReader(); + bottom = new StoreFile(this.fs, bottomPath).getReader(); + first = true; + bottomScanner = bottom.getScanner(); + while ((!bottomScanner.isSeeked() && bottomScanner.seekTo()) || + bottomScanner.next()) { + key = bottomScanner.getKey(); + if (first) { + first = false; + keyhsk = HStoreKey.create(key); + LOG.info("First bottom when key > top: " + keyhsk); + tmp = Bytes.toString(keyhsk.getRow()); + for (int i = 0; i < tmp.length(); i++) { + assertTrue(tmp.charAt(i) == 'a'); + } + } + } + keyhsk = HStoreKey.create(key); + LOG.info("Last bottom when key > top: " + keyhsk); + tmp = Bytes.toString(keyhsk.getRow()); + for (int i = 0; i < tmp.length(); i++) { + assertTrue(tmp.charAt(i) == 'z'); + } + count = 0; + topScanner = top.getScanner(); + while ((!topScanner.isSeeked() && topScanner.seekTo()) || + (topScanner.isSeeked() && topScanner.next())) { + count++; + } + // When badkey sorts after the last key in the file, the top should return no values. + assertTrue(count == 0); + } finally { + if (top != null) { + top.close(); + } + if (bottom != null) { + bottom.close(); + } + fs.delete(f.getPath(), true); + } + } +} \ No newline at end of file