HBASE-15554 - StoreFile$Writer.appendGeneralBloomFilter generates extra KV

(Ram)
Ramkrishna 2016-08-12 10:21:41 +05:30
parent fb668a0d3f
commit 285e104176
21 changed files with 656 additions and 215 deletions

View File

@ -26,6 +26,7 @@ import java.io.IOException;
import java.math.BigDecimal;
import java.nio.ByteBuffer;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Iterator;
import java.util.List;
import java.util.Map.Entry;
@ -33,6 +34,7 @@ import java.util.NavigableMap;
import org.apache.hadoop.hbase.KeyValue.Type;
import org.apache.hadoop.hbase.classification.InterfaceAudience;
import org.apache.hadoop.hbase.classification.InterfaceAudience.Private;
import org.apache.hadoop.hbase.classification.InterfaceStability;
import org.apache.hadoop.hbase.io.HeapSize;
import org.apache.hadoop.hbase.io.TagCompressionContext;
@ -139,6 +141,22 @@ public final class CellUtil {
return destinationOffset + rowLen;
}
/**
* Copies the row to a new byte[]
* @param cell the cell from which the row has to be copied
* @return the byte[] containing the row
*/
public static byte[] copyRow(Cell cell) {
if (cell instanceof ByteBufferedCell) {
return ByteBufferUtils.copyOfRange(((ByteBufferedCell) cell).getRowByteBuffer(),
((ByteBufferedCell) cell).getRowPosition(),
((ByteBufferedCell) cell).getRowPosition() + cell.getRowLength());
} else {
return Arrays.copyOfRange(cell.getRowArray(), cell.getRowOffset(),
cell.getRowOffset() + cell.getRowLength());
}
}
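A minimal standalone sketch (plain java.nio, no HBase types, names illustrative only) of the two cases copyRow has to handle, array-backed and ByteBuffer-backed rows:

import java.nio.ByteBuffer;
import java.util.Arrays;

public class CopyRowSketch {
  // Array-backed cell: the row is a slice of a larger backing array.
  static byte[] copyRowFromArray(byte[] backing, int rowOffset, short rowLength) {
    return Arrays.copyOfRange(backing, rowOffset, rowOffset + rowLength);
  }

  // ByteBuffer-backed cell: copy without assuming hasArray(), using absolute reads
  // so the source buffer's position and limit are left untouched.
  static byte[] copyRowFromBuffer(ByteBuffer backing, int rowPosition, short rowLength) {
    byte[] copy = new byte[rowLength];
    for (int i = 0; i < rowLength; i++) {
      copy[i] = backing.get(rowPosition + i);
    }
    return copy;
  }

  public static void main(String[] args) {
    byte[] key = "row1:cf:q".getBytes();
    System.out.println(new String(copyRowFromArray(key, 0, (short) 4)));                   // row1
    System.out.println(new String(copyRowFromBuffer(ByteBuffer.wrap(key), 0, (short) 4))); // row1
  }
}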
public static int copyFamilyTo(Cell cell, byte[] destination, int destinationOffset) {
byte fLen = cell.getFamilyLength();
if (cell instanceof ByteBufferedCell) {
@ -200,6 +218,7 @@ public final class CellUtil {
/********************* misc *************************************/
@Private
public static byte getRowByte(Cell cell, int index) {
if (cell instanceof ByteBufferedCell) {
return ((ByteBufferedCell) cell).getRowByteBuffer().get(
@ -208,6 +227,15 @@ public final class CellUtil {
return cell.getRowArray()[cell.getRowOffset() + index];
}
@Private
public static byte getQualifierByte(Cell cell, int index) {
if (cell instanceof ByteBufferedCell) {
return ((ByteBufferedCell) cell).getQualifierByteBuffer().get(
((ByteBufferedCell) cell).getQualifierPosition() + index);
}
return cell.getQualifierArray()[cell.getQualifierOffset() + index];
}
public static ByteBuffer getValueBufferShallowCopy(Cell cell) {
ByteBuffer buffer = ByteBuffer.wrap(cell.getValueArray(), cell.getValueOffset(),
cell.getValueLength());
@ -1701,7 +1729,7 @@ public final class CellUtil {
/**
* Create a Cell that is smaller than all other possible Cells for the given Cell's row.
*
* The family length is considered to be 0
* @param cell
* @return First possible Cell on passed Cell's row.
*/

View File

@ -0,0 +1,33 @@
/**
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hadoop.hbase.util;
import org.apache.hadoop.hbase.classification.InterfaceAudience;
@InterfaceAudience.Private
public class ByteArrayHashKey extends HashKey<byte[]> {
public ByteArrayHashKey(byte[] t) {
super(t);
}
@Override
public byte get(int pos) {
return t[pos];
}
}

View File

@ -24,6 +24,7 @@ import java.io.OutputStream;
import java.math.BigDecimal;
import java.math.BigInteger;
import java.nio.ByteBuffer;
import java.util.Arrays;
import org.apache.hadoop.hbase.classification.InterfaceAudience;
import org.apache.hadoop.hbase.classification.InterfaceStability;
@ -984,6 +985,21 @@ public final class ByteBufferUtils {
}
}
/**
* Similar to {@link Arrays#copyOfRange(byte[], int, int)}
* @param original the buffer from which the copy has to happen
* @param from the starting index (inclusive)
* @param to the ending index (exclusive)
* @return a byte[] created out of the copy
*/
public static byte[] copyOfRange(ByteBuffer original, int from, int to) {
int newLength = to - from;
if (newLength < 0) throw new IllegalArgumentException(from + " > " + to);
byte[] copy = new byte[newLength];
ByteBufferUtils.copyFromBufferToArray(copy, original, from, 0, newLength);
return copy;
}
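A rough standalone equivalent of the new copyOfRange using only java.nio; the real method delegates to copyFromBufferToArray, which likewise copies without moving the source buffer's position:

import java.nio.ByteBuffer;

public class BufferCopyOfRangeSketch {
  // Copy [from, to) of the buffer into a fresh byte[] using absolute reads.
  static byte[] copyOfRange(ByteBuffer original, int from, int to) {
    int newLength = to - from;
    if (newLength < 0) throw new IllegalArgumentException(from + " > " + to);
    byte[] copy = new byte[newLength];
    for (int i = 0; i < newLength; i++) {
      copy[i] = original.get(from + i);
    }
    return copy;
  }

  public static void main(String[] args) {
    ByteBuffer bb = ByteBuffer.wrap("hello-bloom".getBytes());
    System.out.println(new String(copyOfRange(bb, 6, 11)));  // bloom
    System.out.println(bb.position());                       // 0 -- source buffer untouched
  }
}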
// For testing purpose
public static String toStringBinary(final ByteBuffer b, int off, int len) {
StringBuilder result = new StringBuilder();

View File

@ -0,0 +1,37 @@
/**
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hadoop.hbase.util;
import org.apache.hadoop.hbase.Cell;
import org.apache.hadoop.hbase.HConstants;
import org.apache.hadoop.hbase.KeyValue;
import org.apache.hadoop.hbase.classification.InterfaceAudience;
/**
* Extracts the byte for the hash calculation from the given cell
*/
@InterfaceAudience.Private
public abstract class CellHashKey extends HashKey<Cell> {
protected static final byte[] LATEST_TS = Bytes.toBytes(HConstants.LATEST_TIMESTAMP);
protected static final byte MAX_TYPE = KeyValue.Type.Maximum.getCode();
public CellHashKey(Cell cell) {
super(cell);
}
}

View File

@ -139,7 +139,20 @@ public abstract class Hash {
* @param initval seed value
* @return hash value
*/
public abstract int hash(byte[] bytes, int offset, int length, int initval);
// TODO : remove this once read path is updated to work with Cells
public int hash(byte[] bytes, int offset, int length, int initval) {
return hash(new ByteArrayHashKey(bytes), offset, length, initval);
}
// TODO : a buffer-based hash function would be needed. Not adding it for now
/**
* Calculate a hash using bytes from <code>offset</code> to <code>offset +
* length</code>, and the provided seed value.
* @param hashKey the key from which bytes are extracted for the hash
* @param offset offset to be used by the hash algorithm
* @param length length to be used by the hash algorithm
* @param initval the seed value
* @return hash value
*/
// TODO : Remove offset and length when the byte[] version of hash() is removed
public abstract int hash(HashKey hashKey, int offset, int length, int initval);
}
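With this change the old byte[] entry point just wraps the array, so both call styles return the same value. A small usage sketch, assuming the Hash factory/constants and the ByteArrayHashKey class added in this commit:

import org.apache.hadoop.hbase.util.ByteArrayHashKey;
import org.apache.hadoop.hbase.util.Bytes;
import org.apache.hadoop.hbase.util.Hash;

public class HashEntryPointSketch {
  public static void main(String[] args) {
    byte[] key = Bytes.toBytes("row-0001");
    Hash hash = Hash.getInstance(Hash.MURMUR_HASH3);
    // Old byte[] overload -- after this patch it just wraps the array in a ByteArrayHashKey:
    int h1 = hash.hash(key, 0, key.length, 0);
    // New overload -- the HashKey decides where each byte comes from:
    int h2 = hash.hash(new ByteArrayHashKey(key), 0, key.length, 0);
    System.out.println(h1 == h2);  // true: same bytes, same range, same seed
  }
}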

View File

@ -0,0 +1,36 @@
/**
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hadoop.hbase.util;
import org.apache.hadoop.hbase.classification.InterfaceAudience;
/**
* Used by the hash {@link Hash} algorithms to extract the bytes to be hashed, e.g. for Bloom filters.
*
* @param <T> the type of HashKey
*/
@InterfaceAudience.Private
public abstract class HashKey<T> {
protected final T t;
public HashKey(T t) {
this.t = t;
}
public abstract byte get(int pos);
}
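The TODO in Hash.java notes that a buffer-based key is still missing. A hypothetical extractor in the same spirit, not part of this commit, showing how a ByteBuffer-backed key could plug into hash(HashKey, ...) without copying:

import java.nio.ByteBuffer;

public class ByteBufferHashKeySketch {
  // Mirrors the shape of o.a.h.hbase.util.HashKey; standalone here for illustration.
  abstract static class HashKey<T> {
    protected final T t;
    HashKey(T t) { this.t = t; }
    abstract byte get(int pos);
  }

  // Hypothetical buffer-backed key: absolute reads, no copy of the underlying bytes.
  static class ByteBufferHashKey extends HashKey<ByteBuffer> {
    ByteBufferHashKey(ByteBuffer buf) { super(buf); }
    @Override byte get(int pos) { return t.get(pos); }
  }

  public static void main(String[] args) {
    ByteBuffer row = ByteBuffer.wrap("row-0001".getBytes());
    HashKey<ByteBuffer> key = new ByteBufferHashKey(row);
    int sum = 0;
    for (int i = 0; i < row.remaining(); i++) {
      sum += key.get(i) & 0xff;  // a hash algorithm only ever reads bytes through get()
    }
    System.out.println(sum);
  }
}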

View File

@ -53,10 +53,34 @@ public class JenkinsHash extends Hash {
return _instance;
}
/**
* Compute the hash of the specified file
* @param args name of file to compute hash of.
* @throws IOException e
*/
public static void main(String[] args) throws IOException {
if (args.length != 1) {
System.err.println("Usage: JenkinsHash filename");
System.exit(-1);
}
FileInputStream in = new FileInputStream(args[0]);
byte[] bytes = new byte[512];
int value = 0;
JenkinsHash hash = new JenkinsHash();
try {
for (int length = in.read(bytes); length > 0; length = in.read(bytes)) {
value = hash.hash(bytes, length, value);
}
} finally {
in.close();
}
System.out.println(Math.abs(value));
}
/**
* taken from hashlittle() -- hash a variable-length key into a 32-bit value
*
* @param key the key (the unaligned variable-length array of bytes)
* @param hashKey the key from which bytes are extracted for the hash algorithm
* @param nbytes number of bytes to include in hash
* @param initval can be any integer value
* @return a 32-bit value. Every bit of the key affects every bit of the
@ -78,26 +102,26 @@ public class JenkinsHash extends Hash {
* <p>Use for hash table lookup, or anything where one collision in 2^^32 is
* acceptable. Do NOT use for cryptographic purposes.
*/
@Override
@SuppressWarnings("fallthrough")
public int hash(byte[] key, int off, int nbytes, int initval) {
@Override
public int hash(HashKey hashKey, int off, int nbytes, int initval) {
int length = nbytes;
int a, b, c;
a = b = c = 0xdeadbeef + length + initval;
int offset = off;
for (; length > 12; offset += 12, length -= 12) {
a += (key[offset] & BYTE_MASK);
a += ((key[offset + 1] & BYTE_MASK) << 8);
a += ((key[offset + 2] & BYTE_MASK) << 16);
a += ((key[offset + 3] & BYTE_MASK) << 24);
b += (key[offset + 4] & BYTE_MASK);
b += ((key[offset + 5] & BYTE_MASK) << 8);
b += ((key[offset + 6] & BYTE_MASK) << 16);
b += ((key[offset + 7] & BYTE_MASK) << 24);
c += (key[offset + 8] & BYTE_MASK);
c += ((key[offset + 9] & BYTE_MASK) << 8);
c += ((key[offset + 10] & BYTE_MASK) << 16);
c += ((key[offset + 11] & BYTE_MASK) << 24);
a += (hashKey.get(offset) & BYTE_MASK);
a += ((hashKey.get(offset + 1) & BYTE_MASK) << 8);
a += ((hashKey.get(offset + 2) & BYTE_MASK) << 16);
a += ((hashKey.get(offset + 3) & BYTE_MASK) << 24);
b += (hashKey.get(offset + 4) & BYTE_MASK);
b += ((hashKey.get(offset + 5) & BYTE_MASK) << 8);
b += ((hashKey.get(offset + 6) & BYTE_MASK) << 16);
b += ((hashKey.get(offset + 7) & BYTE_MASK) << 24);
c += (hashKey.get(offset + 8) & BYTE_MASK);
c += ((hashKey.get(offset + 9) & BYTE_MASK) << 8);
c += ((hashKey.get(offset + 10) & BYTE_MASK) << 16);
c += ((hashKey.get(offset + 11) & BYTE_MASK) << 24);
/*
* mix -- mix 3 32-bit values reversibly.
@ -164,30 +188,30 @@ public class JenkinsHash extends Hash {
//-------------------------------- last block: affect all 32 bits of (c)
switch (length) { // all the case statements fall through
case 12:
c += ((key[offset + 11] & BYTE_MASK) << 24);
c += ((hashKey.get(offset + 11) & BYTE_MASK) << 24);
case 11:
c += ((key[offset + 10] & BYTE_MASK) << 16);
c += ((hashKey.get(offset + 10) & BYTE_MASK) << 16);
case 10:
c += ((key[offset + 9] & BYTE_MASK) << 8);
c += ((hashKey.get(offset + 9) & BYTE_MASK) << 8);
case 9:
c += (key[offset + 8] & BYTE_MASK);
c += (hashKey.get(offset + 8) & BYTE_MASK);
case 8:
b += ((key[offset + 7] & BYTE_MASK) << 24);
b += ((hashKey.get(offset + 7) & BYTE_MASK) << 24);
case 7:
b += ((key[offset + 6] & BYTE_MASK) << 16);
b += ((hashKey.get(offset + 6) & BYTE_MASK) << 16);
case 6:
b += ((key[offset + 5] & BYTE_MASK) << 8);
b += ((hashKey.get(offset + 5) & BYTE_MASK) << 8);
case 5:
b += (key[offset + 4] & BYTE_MASK);
b += (hashKey.get(offset + 4) & BYTE_MASK);
case 4:
a += ((key[offset + 3] & BYTE_MASK) << 24);
a += ((hashKey.get(offset + 3) & BYTE_MASK) << 24);
case 3:
a += ((key[offset + 2] & BYTE_MASK) << 16);
a += ((hashKey.get(offset + 2) & BYTE_MASK) << 16);
case 2:
a += ((key[offset + 1] & BYTE_MASK) << 8);
a += ((hashKey.get(offset + 1) & BYTE_MASK) << 8);
case 1:
//noinspection PointlessArithmeticExpression
a += (key[offset + 0] & BYTE_MASK);
a += (hashKey.get(offset + 0) & BYTE_MASK);
break;
case 0:
return c;
@ -238,28 +262,4 @@ public class JenkinsHash extends Hash {
c ^= b; c -= rotateLeft(b, 24);
return c;
}
/**
* Compute the hash of the specified file
* @param args name of file to compute hash of.
* @throws IOException e
*/
public static void main(String[] args) throws IOException {
if (args.length != 1) {
System.err.println("Usage: JenkinsHash filename");
System.exit(-1);
}
FileInputStream in = new FileInputStream(args[0]);
byte[] bytes = new byte[512];
int value = 0;
JenkinsHash hash = new JenkinsHash();
try {
for (int length = in.read(bytes); length > 0; length = in.read(bytes)) {
value = hash.hash(bytes, length, value);
}
} finally {
in.close();
}
System.out.println(Math.abs(value));
}
}

View File

@ -39,7 +39,7 @@ public class MurmurHash extends Hash {
}
@Override
public int hash(byte[] data, int offset, int length, int seed) {
public int hash(HashKey hashKey, int offset, int length, int seed) {
int m = 0x5bd1e995;
int r = 24;
@ -49,14 +49,14 @@ public class MurmurHash extends Hash {
for (int i = 0; i < len_4; i++) {
int i_4 = (i << 2) + offset;
int k = data[i_4 + 3];
int k = hashKey.get(i_4 + 3);
k = k << 8;
k = k | (data[i_4 + 2] & 0xff);
k = k | (hashKey.get(i_4 + 2) & 0xff);
k = k << 8;
k = k | (data[i_4 + 1] & 0xff);
k = k | (hashKey.get(i_4 + 1) & 0xff);
k = k << 8;
//noinspection PointlessArithmeticExpression
k = k | (data[i_4 + 0] & 0xff);
// noinspection PointlessArithmeticExpression
k = k | (hashKey.get(i_4 + 0) & 0xff);
k *= m;
k ^= k >>> r;
k *= m;
@ -71,13 +71,13 @@ public class MurmurHash extends Hash {
if (left != 0) {
if (left >= 3) {
h ^= data[i_m + 2] << 16;
h ^= hashKey.get(i_m + 2) << 16;
}
if (left >= 2) {
h ^= data[i_m + 1] << 8;
h ^= hashKey.get(i_m + 1) << 8;
}
if (left >= 1) {
h ^= data[i_m];
h ^= hashKey.get(i_m);
}
h *= m;

View File

@ -40,7 +40,7 @@ public class MurmurHash3 extends Hash {
/** Returns the MurmurHash3_x86_32 hash. */
@edu.umd.cs.findbugs.annotations.SuppressWarnings("SF")
@Override
public int hash(byte[] bytes, int offset, int length, int initval) {
public int hash(HashKey hashKey, int offset, int length, int initval) {
final int c1 = 0xcc9e2d51;
final int c2 = 0x1b873593;
@ -49,8 +49,10 @@ public class MurmurHash3 extends Hash {
for (int i = offset; i < roundedEnd; i += 4) {
// little endian load order
int k1 = (bytes[i] & 0xff) | ((bytes[i + 1] & 0xff) << 8) | ((bytes[i + 2] & 0xff) << 16)
| (bytes[i + 3] << 24);
int k1 =
(hashKey.get(i) & 0xff) | ((hashKey.get(i + 1) & 0xff) << 8)
| ((hashKey.get(i + 2) & 0xff) << 16)
| (hashKey.get(i + 3) << 24);
k1 *= c1;
k1 = (k1 << 15) | (k1 >>> 17); // ROTL32(k1,15);
k1 *= c2;
@ -65,13 +67,13 @@ public class MurmurHash3 extends Hash {
switch (length & 0x03) {
case 3:
k1 = (bytes[roundedEnd + 2] & 0xff) << 16;
k1 = (hashKey.get(roundedEnd + 2) & 0xff) << 16;
// FindBugs SF_SWITCH_FALLTHROUGH
case 2:
k1 |= (bytes[roundedEnd + 1] & 0xff) << 8;
k1 |= (hashKey.get(roundedEnd + 1) & 0xff) << 8;
// FindBugs SF_SWITCH_FALLTHROUGH
case 1:
k1 |= (bytes[roundedEnd] & 0xff);
k1 |= (hashKey.get(roundedEnd) & 0xff);
k1 *= c1;
k1 = (k1 << 15) | (k1 >>> 17); // ROTL32(k1,15);
k1 *= c2;

View File

@ -0,0 +1,35 @@
/**
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hadoop.hbase.util;
import org.apache.hadoop.hbase.Cell;
import org.apache.hadoop.hbase.CellUtil;
import org.apache.hadoop.hbase.classification.InterfaceAudience;
@InterfaceAudience.Private
public class RowBloomHashKey extends CellHashKey {
public RowBloomHashKey(Cell cell) {
super(cell);
}
@Override
public byte get(int offset) {
return CellUtil.getRowByte(t, offset);
}
}

View File

@ -0,0 +1,80 @@
/**
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hadoop.hbase.util;
import org.apache.hadoop.hbase.Cell;
import org.apache.hadoop.hbase.CellUtil;
import org.apache.hadoop.hbase.KeyValue;
import org.apache.hadoop.hbase.classification.InterfaceAudience;
/**
* A hash key for ROWCOL blooms. This assumes the cell to be serialized in the KeyValue
* serialization format with an empty column family. Note that the byte representing the family
* length is considered to be 0
*/
@InterfaceAudience.Private
public class RowColBloomHashKey extends CellHashKey {
private final int rowLength;
private final int qualLength;
public RowColBloomHashKey(Cell cell) {
super(cell);
rowLength = cell.getRowLength();
// We don't consider the family length for ROWCOL bloom. So subtract the famLen from the
// length calculation. Timestamp and type are of no relevance here
qualLength = cell.getQualifierLength();
}
@Override
public byte get(int offset) {
// Always assume that this cell has keyvalue serialized key structure.
// rk len + row key + 0 byte for family length + qual + ts + type
if (offset < Bytes.SIZEOF_SHORT) {
// assign locally
int rowlen = rowLength;
byte b = (byte) rowlen;
if (offset == 0) {
rowlen >>= 8;
b = (byte) rowlen;
}
return b;
}
int refLen = Bytes.SIZEOF_SHORT + rowLength;
if (offset < refLen) {
return CellUtil.getRowByte(t, offset - Bytes.SIZEOF_SHORT);
}
if (offset == refLen) {
// The fam length should return 0 assuming there is no column family.
// Because for ROWCOL blooms family is not considered
return 0;
}
refLen += qualLength + Bytes.SIZEOF_BYTE;
// skip the family len because actual cells may have family also
if (offset < refLen) {
return CellUtil.getQualifierByte(t,
offset - (Bytes.SIZEOF_SHORT + rowLength + Bytes.SIZEOF_BYTE));
}
// TODO : check if ts and type can be removed
refLen += KeyValue.TIMESTAMP_SIZE;
if (offset < refLen) {
return LATEST_TS[offset - (Bytes.SIZEOF_SHORT + rowLength + qualLength + Bytes.SIZEOF_BYTE)];
}
return MAX_TYPE;
}
}
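get() above walks the offsets of the KeyValue key it pretends was serialized: a 2-byte row length, the row, one zero byte for the family length, the qualifier, the 8-byte LATEST timestamp and the Maximum type byte. A standalone sketch that builds that reference key and checks the positional mapping (plain Java; LATEST_TIMESTAMP = Long.MAX_VALUE and Maximum = 255 match KeyValue's constants, everything else is illustrative):

import java.nio.ByteBuffer;

public class RowColKeyLayoutSketch {
  // Reference layout: rowLen(2) | row | famLen(1)=0 | qualifier | ts(8)=LATEST | type(1)=Maximum
  static byte[] serialize(byte[] row, byte[] qual) {
    ByteBuffer bb = ByteBuffer.allocate(2 + row.length + 1 + qual.length + 8 + 1);
    bb.putShort((short) row.length);  // big endian: offset 0 is the high byte, offset 1 the low byte
    bb.put(row);
    bb.put((byte) 0);                 // empty column family for ROWCOL blooms
    bb.put(qual);
    bb.putLong(Long.MAX_VALUE);       // HConstants.LATEST_TIMESTAMP
    bb.put((byte) 255);               // KeyValue.Type.Maximum
    return bb.array();
  }

  // The same byte computed positionally, the way the patch does it, without serializing anything.
  static byte get(byte[] row, byte[] qual, int offset) {
    if (offset < 2) return (byte) (offset == 0 ? (row.length >> 8) : row.length);
    if (offset < 2 + row.length) return row[offset - 2];
    if (offset == 2 + row.length) return 0;                           // fake family length byte
    int qualEnd = 2 + row.length + 1 + qual.length;
    if (offset < qualEnd) return qual[offset - (2 + row.length + 1)];
    if (offset < qualEnd + 8) {
      return ByteBuffer.allocate(8).putLong(Long.MAX_VALUE).array()[offset - qualEnd];
    }
    return (byte) 255;
  }

  public static void main(String[] args) {
    byte[] row = "row1".getBytes(), qual = "q1".getBytes();
    byte[] reference = serialize(row, qual);
    for (int i = 0; i < reference.length; i++) {
      if (reference[i] != get(row, qual, i)) throw new IllegalStateException("mismatch at " + i);
    }
    System.out.println("layout matches at all " + reference.length + " offsets");
  }
}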

View File

@ -150,9 +150,11 @@ public class CompoundBloomFilter extends CompoundBloomFilterBase
// At least we now avoid multiple copies until it comes here. If we want to make this work
// with BBs then the Hash.java APIs should also be changed to work with BBs.
if (keyCell instanceof KeyValue) {
// TODO : directly use Cell here
return checkContains(((KeyValue) keyCell).getBuffer(), ((KeyValue) keyCell).getKeyOffset(),
((KeyValue) keyCell).getKeyLength(), block);
}
// TODO : Avoid this copy in read path also
byte[] key = CellUtil.getCellKeySerializedAsKeyValueKey(keyCell);
return checkContains(key, 0, key.length, block);
}

View File

@ -21,15 +21,16 @@ package org.apache.hadoop.hbase.io.hfile;
import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;
import java.util.Arrays;
import java.util.LinkedList;
import java.util.Queue;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.hbase.classification.InterfaceAudience;
import org.apache.hadoop.hbase.Cell;
import org.apache.hadoop.hbase.CellComparator;
import org.apache.hadoop.hbase.regionserver.StoreFileWriter;
import org.apache.hadoop.hbase.CellUtil;
import org.apache.hadoop.hbase.classification.InterfaceAudience;
import org.apache.hadoop.hbase.regionserver.BloomType;
import org.apache.hadoop.hbase.util.BloomFilterChunk;
import org.apache.hadoop.hbase.util.BloomFilterUtil;
import org.apache.hadoop.hbase.util.BloomFilterWriter;
@ -78,6 +79,8 @@ public class CompoundBloomFilterWriter extends CompoundBloomFilterBase
/** Whether to cache-on-write compound Bloom filter chunks */
private boolean cacheOnWrite;
private BloomType bloomType;
/**
* @param chunkByteSizeHint
* each chunk's size in bytes. The real chunk size might be different
@ -88,10 +91,12 @@ public class CompoundBloomFilterWriter extends CompoundBloomFilterBase
* hash function type to use
* @param maxFold
* maximum degree of folding allowed
* @param bloomType
* the bloom type
*/
public CompoundBloomFilterWriter(int chunkByteSizeHint, float errorRate,
int hashType, int maxFold, boolean cacheOnWrite,
CellComparator comparator) {
CellComparator comparator, BloomType bloomType) {
chunkByteSize = BloomFilterUtil.computeFoldableByteSize(
chunkByteSizeHint * 8L, maxFold);
@ -100,6 +105,7 @@ public class CompoundBloomFilterWriter extends CompoundBloomFilterBase
this.maxFold = maxFold;
this.cacheOnWrite = cacheOnWrite;
this.comparator = comparator;
this.bloomType = bloomType;
}
@Override
@ -152,16 +158,9 @@ public class CompoundBloomFilterWriter extends CompoundBloomFilterBase
chunk = null;
}
/**
* Adds a Bloom filter key. This key must be greater than the previous key,
* as defined by the comparator this compound Bloom filter is configured
* with. For efficiency, key monotonicity is not checked here. See
* {@link StoreFileWriter#append(
* org.apache.hadoop.hbase.Cell)} for the details of deduplication.
*/
@Override
public void add(byte[] bloomKey, int keyOffset, int keyLength) {
if (bloomKey == null)
public void add(Cell cell) {
if (cell == null)
throw new NullPointerException();
enqueueReadyChunk(false);
@ -171,32 +170,39 @@ public class CompoundBloomFilterWriter extends CompoundBloomFilterBase
throw new IllegalStateException("First key in chunk already set: "
+ Bytes.toStringBinary(firstKeyInChunk));
}
firstKeyInChunk = Arrays.copyOfRange(bloomKey, keyOffset, keyOffset
+ keyLength);
if (prevChunk == null) {
// First chunk
chunk = BloomFilterUtil.createBySize(chunkByteSize, errorRate,
hashType, maxFold);
// This will be done only once per chunk
if (bloomType == BloomType.ROW) {
firstKeyInChunk = CellUtil.copyRow(cell);
} else {
// Use the same parameters as the last chunk, but a new array and
// a zero key count.
chunk = prevChunk.createAnother();
firstKeyInChunk =
CellUtil.getCellKeySerializedAsKeyValueKey(CellUtil.createFirstOnRowCol(cell));
}
if (chunk.getKeyCount() != 0) {
throw new IllegalStateException("keyCount=" + chunk.getKeyCount()
+ " > 0");
}
chunk.allocBloom();
++numChunks;
allocateNewChunk();
}
chunk.add(bloomKey, keyOffset, keyLength);
chunk.add(cell);
++totalKeyCount;
}
private void allocateNewChunk() {
if (prevChunk == null) {
// First chunk
chunk = BloomFilterUtil.createBySize(chunkByteSize, errorRate,
hashType, maxFold, bloomType);
} else {
// Use the same parameters as the last chunk, but a new array and
// a zero key count.
chunk = prevChunk.createAnother();
}
if (chunk.getKeyCount() != 0) {
throw new IllegalStateException("keyCount=" + chunk.getKeyCount()
+ " > 0");
}
chunk.allocBloom();
++numChunks;
}
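allocateNewChunk() is factored out here: it runs for the very first chunk and again each time the current chunk fills up and is queued as ready. A toy, self-contained sketch of that rollover (Chunk is a stand-in for BloomFilterChunk; the ready-chunk queue's interaction with inline-block writing is omitted):

import java.util.ArrayDeque;
import java.util.Deque;

public class ChunkRolloverSketch {
  static class Chunk {
    final int maxKeys; int keyCount;
    Chunk(int maxKeys) { this.maxKeys = maxKeys; }
    Chunk createAnother() { return new Chunk(maxKeys); }  // same parameters, zero key count
    boolean isFull() { return keyCount >= maxKeys; }
    void add(String key) { keyCount++; }
  }

  public static void main(String[] args) {
    Deque<Chunk> readyChunks = new ArrayDeque<>();
    Chunk chunk = new Chunk(3);                            // first chunk
    for (int i = 0; i < 10; i++) {
      if (chunk.isFull()) {                                // enqueueReadyChunk(false)
        readyChunks.add(chunk);
        chunk = chunk.createAnother();                     // allocateNewChunk()
      }
      chunk.add("key-" + i);
    }
    readyChunks.add(chunk);                                // final chunk is enqueued on close
    System.out.println(readyChunks.size() + " chunks");    // 4 chunks for 10 keys
  }
}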
@Override
public void writeInlineBlock(DataOutput out) throws IOException {
// We don't remove the chunk from the queue here, because we might need it

View File

@ -18,7 +18,9 @@
*/
package org.apache.hadoop.hbase.regionserver;
import com.google.common.base.Preconditions;
import java.io.IOException;
import java.net.InetSocketAddress;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.conf.Configuration;
@ -29,20 +31,20 @@ import org.apache.hadoop.hbase.CellComparator;
import org.apache.hadoop.hbase.CellUtil;
import org.apache.hadoop.hbase.HConstants;
import org.apache.hadoop.hbase.KeyValue;
import org.apache.hadoop.hbase.KeyValueUtil;
import org.apache.hadoop.hbase.classification.InterfaceAudience;
import org.apache.hadoop.hbase.io.hfile.CacheConfig;
import org.apache.hadoop.hbase.io.hfile.HFile;
import org.apache.hadoop.hbase.io.hfile.HFileContext;
import org.apache.hadoop.hbase.regionserver.compactions.Compactor;
import org.apache.hadoop.hbase.util.BloomContext;
import org.apache.hadoop.hbase.util.BloomFilterFactory;
import org.apache.hadoop.hbase.util.BloomFilterWriter;
import org.apache.hadoop.hbase.util.Bytes;
import org.apache.hadoop.hbase.util.RowBloomContext;
import org.apache.hadoop.hbase.util.RowColBloomContext;
import org.apache.hadoop.io.WritableUtils;
import java.io.IOException;
import java.net.InetSocketAddress;
import java.util.Arrays;
import com.google.common.base.Preconditions;
/**
* A StoreFile writer. Use this to read/write HBase Store Files. It is package
@ -55,12 +57,10 @@ public class StoreFileWriter implements Compactor.CellSink {
private final BloomFilterWriter generalBloomFilterWriter;
private final BloomFilterWriter deleteFamilyBloomFilterWriter;
private final BloomType bloomType;
private byte[] lastBloomKey;
private int lastBloomKeyOffset, lastBloomKeyLen;
private Cell lastCell = null;
private long earliestPutTs = HConstants.LATEST_TIMESTAMP;
private Cell lastDeleteFamilyCell = null;
private long deleteFamilyCnt = 0;
private BloomContext bloomContext = null;
/**
* timeRangeTrackerSet is used to figure if we were passed a filled-out TimeRangeTracker or not.
@ -73,7 +73,6 @@ public class StoreFileWriter implements Compactor.CellSink {
final TimeRangeTracker timeRangeTracker;
protected HFile.Writer writer;
private KeyValue.KeyOnlyKeyValue lastBloomKeyOnlyKV = null;
/**
* Creates an HFile.Writer that also write helpful meta data.
@ -134,9 +133,6 @@ public class StoreFileWriter implements Compactor.CellSink {
if (generalBloomFilterWriter != null) {
this.bloomType = bloomType;
if(this.bloomType == BloomType.ROWCOL) {
lastBloomKeyOnlyKV = new KeyValue.KeyOnlyKeyValue();
}
if (LOG.isTraceEnabled()) {
LOG.trace("Bloom filter type for " + path + ": " + this.bloomType + ", " +
generalBloomFilterWriter.getClass().getSimpleName());
@ -218,87 +214,30 @@ public class StoreFileWriter implements Compactor.CellSink {
private void appendGeneralBloomfilter(final Cell cell) throws IOException {
if (this.generalBloomFilterWriter != null) {
// only add to the bloom filter on a new, unique key
boolean newKey = true;
if (this.lastCell != null) {
switch(bloomType) {
case ROW:
newKey = ! CellUtil.matchingRows(cell, lastCell);
break;
case ROWCOL:
newKey = ! CellUtil.matchingRowColumn(cell, lastCell);
break;
case NONE:
newKey = false;
break;
default:
throw new IOException("Invalid Bloom filter type: " + bloomType +
" (ROW or ROWCOL expected)");
}
}
if (newKey) {
/*
* http://2.bp.blogspot.com/_Cib_A77V54U/StZMrzaKufI/AAAAAAAAADo/ZhK7bGoJdMQ/s400/KeyValue.png
* Key = RowLen + Row + FamilyLen + Column [Family + Qualifier] + TimeStamp
*
* 2 Types of Filtering:
* 1. Row = Row
* 2. RowCol = Row + Qualifier
*/
byte[] bloomKey = null;
// Used with ROW_COL bloom
KeyValue bloomKeyKV = null;
int bloomKeyOffset, bloomKeyLen;
if (this.bloomContext == null) {
// init bloom context
switch (bloomType) {
case ROW:
bloomKey = cell.getRowArray();
bloomKeyOffset = cell.getRowOffset();
bloomKeyLen = cell.getRowLength();
break;
case ROWCOL:
// merge(row, qualifier)
// TODO: could save one buffer copy in case of compound Bloom
// filters when this involves creating a KeyValue
// TODO : Handle while writes also
bloomKeyKV = KeyValueUtil.createFirstOnRow(cell.getRowArray(), cell.getRowOffset(),
cell.getRowLength(),
HConstants.EMPTY_BYTE_ARRAY, 0, 0, cell.getQualifierArray(),
cell.getQualifierOffset(),
cell.getQualifierLength());
bloomKey = bloomKeyKV.getBuffer();
bloomKeyOffset = bloomKeyKV.getKeyOffset();
bloomKeyLen = bloomKeyKV.getKeyLength();
break;
default:
throw new IOException("Invalid Bloom filter type: " + bloomType +
" (ROW or ROWCOL expected)");
case ROW:
bloomContext = new RowBloomContext(generalBloomFilterWriter);
break;
case ROWCOL:
bloomContext = new RowColBloomContext(generalBloomFilterWriter);
break;
default:
throw new IOException(
"Invalid Bloom filter type: " + bloomType + " (ROW or ROWCOL expected)");
}
generalBloomFilterWriter.add(bloomKey, bloomKeyOffset, bloomKeyLen);
if (lastBloomKey != null) {
int res = 0;
// hbase:meta does not have blooms. So we need not have special interpretation
// of the hbase:meta cells. We can safely use Bytes.BYTES_RAWCOMPARATOR for ROW Bloom
if (bloomType == BloomType.ROW) {
res = Bytes.BYTES_RAWCOMPARATOR.compare(bloomKey, bloomKeyOffset, bloomKeyLen,
lastBloomKey, lastBloomKeyOffset, lastBloomKeyLen);
} else {
// TODO : Caching of kv components becomes important in these cases
res = CellComparator.COMPARATOR.compare(bloomKeyKV, lastBloomKeyOnlyKV);
}
if (res <= 0) {
throw new IOException("Non-increasing Bloom keys: "
+ Bytes.toStringBinary(bloomKey, bloomKeyOffset, bloomKeyLen) + " after "
+ Bytes.toStringBinary(lastBloomKey, lastBloomKeyOffset, lastBloomKeyLen));
}
}
lastBloomKey = bloomKey;
lastBloomKeyOffset = bloomKeyOffset;
lastBloomKeyLen = bloomKeyLen;
if (bloomType == BloomType.ROWCOL) {
lastBloomKeyOnlyKV.setKey(bloomKey, bloomKeyOffset, bloomKeyLen);
}
this.lastCell = cell;
}
/*
* http://2.bp.blogspot.com/_Cib_A77V54U/StZMrzaKufI/AAAAAAAAADo/ZhK7bGoJdMQ/s400/KeyValue.png
* Key = RowLen + Row + FamilyLen + Column [Family + Qualifier] + TimeStamp
*
* 2 Types of Filtering:
* 1. Row = Row
* 2. RowCol = Row + Qualifier
*/
bloomContext.writeBloom(cell);
}
}
@ -317,9 +256,9 @@ public class StoreFileWriter implements Compactor.CellSink {
// of the hbase:meta cells
newKey = !CellUtil.matchingRows(cell, lastDeleteFamilyCell);
}
// TODO : Use bloom context for delete family bloom filter also
if (newKey) {
this.deleteFamilyBloomFilterWriter.add(cell.getRowArray(),
cell.getRowOffset(), cell.getRowLength());
this.deleteFamilyBloomFilterWriter.add(cell);
this.lastDeleteFamilyCell = cell;
}
}
@ -365,11 +304,7 @@ public class StoreFileWriter implements Compactor.CellSink {
writer.addGeneralBloomFilter(generalBloomFilterWriter);
writer.appendFileInfo(StoreFile.BLOOM_FILTER_TYPE_KEY,
Bytes.toBytes(bloomType.toString()));
if (lastBloomKey != null) {
writer.appendFileInfo(StoreFile.LAST_BLOOM_KEY, Arrays.copyOfRange(
lastBloomKey, lastBloomKeyOffset, lastBloomKeyOffset
+ lastBloomKeyLen));
}
bloomContext.addLastBloomKey(writer);
}
return hasGeneralBloom;
}

View File

@ -0,0 +1,71 @@
/**
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hadoop.hbase.util;
import java.io.IOException;
import org.apache.hadoop.hbase.Cell;
import org.apache.hadoop.hbase.classification.InterfaceAudience;
import org.apache.hadoop.hbase.io.hfile.HFile;
/**
* The bloom context that is used by the StoreFileWriter to add the bloom details
* per cell
*/
@InterfaceAudience.Private
public abstract class BloomContext {
// TODO : Avoid holding references to lastCell
protected Cell lastCell;
protected BloomFilterWriter generalBloomFilterWriter;
public BloomContext(BloomFilterWriter generalBloomFilterWriter) {
this.generalBloomFilterWriter = generalBloomFilterWriter;
}
public Cell getLastCell() {
return this.lastCell;
}
/**
* Adds the cell to the bloom filter if it represents a new key, as defined by the bloom type
* @param cell the cell whose bloom information is to be written
* @throws IOException
*/
public void writeBloom(Cell cell) throws IOException {
if (isNewKey(cell)) {
generalBloomFilterWriter.add(cell);
this.lastCell = cell;
}
}
/**
* Adds the last bloom key to the HFile Writer as part of StoreFileWriter close.
* @param writer
* @throws IOException
*/
public abstract void addLastBloomKey(HFile.Writer writer) throws IOException;
/**
* Returns true if the cell is a new key as per the bloom type
* @param cell the cell to be verified
* @return true if a new key else false
*/
protected abstract boolean isNewKey(Cell cell);
}
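The net effect on the write path: StoreFileWriter no longer builds a throwaway bloom key (the "extra KV" of the JIRA title); it hands each Cell to a BloomContext, which deduplicates by bloom type and forwards new keys to the bloom writer. A simplified, self-contained sketch of that pattern (the Sink and Context types below are stand-ins for BloomFilterWriter and BloomContext, not the real classes):

import java.util.ArrayList;
import java.util.List;

public class BloomContextSketch {
  interface Sink { void add(String key); }       // stand-in for BloomFilterWriter.add(Cell)

  abstract static class Context {                // stand-in for BloomContext
    final Sink sink;
    String lastKey;                              // stand-in for lastCell
    Context(Sink sink) { this.sink = sink; }
    abstract String keyOf(String cell);          // ROW -> row, ROWCOL -> row + qualifier
    void writeBloom(String cell) {
      String key = keyOf(cell);
      if (!key.equals(lastKey)) {                // isNewKey(): skip repeats of the previous key
        sink.add(key);
        lastKey = key;
      }
    }
  }

  static class RowContext extends Context {      // stand-in for RowBloomContext
    RowContext(Sink sink) { super(sink); }
    @Override String keyOf(String cell) { return cell.split("/")[0]; }
  }

  public static void main(String[] args) {
    List<String> added = new ArrayList<>();
    Context ctx = new RowContext(added::add);
    for (String cell : new String[] {"r1/q1", "r1/q2", "r2/q1"}) {
      ctx.writeBloom(cell);
    }
    System.out.println(added);                   // [r1, r2]: one bloom entry per row
  }
}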

View File

@ -24,7 +24,10 @@ import java.io.DataOutput;
import java.io.IOException;
import java.nio.ByteBuffer;
import org.apache.hadoop.hbase.Cell;
import org.apache.hadoop.hbase.KeyValueUtil;
import org.apache.hadoop.hbase.classification.InterfaceAudience;
import org.apache.hadoop.hbase.regionserver.BloomType;
/**
* The basic building block for the {@link org.apache.hadoop.hbase.io.hfile.CompoundBloomFilter}
@ -46,6 +49,8 @@ public class BloomFilterChunk implements BloomFilterBase {
protected int maxKeys;
/** Bloom bits */
protected ByteBuffer bloom;
/** The type of bloom */
protected BloomType bloomType;
/**
* Loads bloom filter meta data from file input.
@ -80,9 +85,10 @@ public class BloomFilterChunk implements BloomFilterBase {
return BloomFilterUtil.actualErrorRate(keyCount, byteSize * 8, hashCount);
}
public BloomFilterChunk(int hashType) {
public BloomFilterChunk(int hashType, BloomType bloomType) {
this.hashType = hashType;
this.hash = Hash.getInstance(hashType);
this.bloomType = bloomType;
}
/**
@ -100,9 +106,10 @@ public class BloomFilterChunk implements BloomFilterBase {
* than maxKeys.
* @throws IllegalArgumentException
*/
// Used only in testcases
public BloomFilterChunk(int maxKeys, double errorRate, int hashType,
int foldFactor) throws IllegalArgumentException {
this(hashType);
this(hashType, BloomType.ROW);
long bitSize = BloomFilterUtil.computeBitSize(maxKeys, errorRate);
hashCount = BloomFilterUtil.optimalFunctionCount(maxKeys, bitSize);
@ -121,7 +128,7 @@ public class BloomFilterChunk implements BloomFilterBase {
* @return a Bloom filter with the same configuration as this
*/
public BloomFilterChunk createAnother() {
BloomFilterChunk bbf = new BloomFilterChunk(hashType);
BloomFilterChunk bbf = new BloomFilterChunk(hashType, this.bloomType);
bbf.byteSize = byteSize;
bbf.hashCount = hashCount;
bbf.maxKeys = maxKeys;
@ -173,6 +180,38 @@ public class BloomFilterChunk implements BloomFilterBase {
int hash1 = this.hash.hash(buf, offset, len, 0);
int hash2 = this.hash.hash(buf, offset, len, hash1);
setHashLoc(hash1, hash2);
}
public void add(Cell cell) {
/*
* For faster hashing, use combinatorial generation
* http://www.eecs.harvard.edu/~kirsch/pubs/bbbf/esa06.pdf
*/
int hash1;
int hash2;
HashKey<Cell> hashKey;
if (this.bloomType == BloomType.ROW) {
// TODO : Move this length into the HashKey once the read path also works with the
// extractor, so that the byte[] version of hash() can be removed
int length = cell.getRowLength();
hashKey = new RowBloomHashKey(cell);
hash1 = this.hash.hash(hashKey, 0, length, 0);
hash2 = this.hash.hash(hashKey, 0, length, hash1);
} else {
int famLen = cell.getFamilyLength();
// TODO : Move this length into the HashKey once the read path also works with the
// extractor, so that the byte[] version of hash() can be removed
int length = KeyValueUtil.keyLength(cell) - famLen;
hashKey = new RowColBloomHashKey(cell);
hash1 = this.hash.hash(hashKey, 0, length, 0);
hash2 = this.hash.hash(hashKey, 0, length, hash1);
}
setHashLoc(hash1, hash2);
}
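Both branches feed the same combinatorial scheme from the paper cited above (Kirsch and Mitzenmacher): two base hashes generate all hashCount probe positions as hash1 + i * hash2. A toy standalone illustration of what setHashLoc() below does with them (the hash values here are stand-ins, not the HBase hash functions):

import java.util.Arrays;
import java.util.BitSet;

public class DoubleHashingSketch {
  public static void main(String[] args) {
    int bits = 1 << 16;        // size of the bloom bit array (byteSize * 8)
    int hashCount = 4;         // number of probes per key
    BitSet bloom = new BitSet(bits);

    byte[] key = "row-0001".getBytes();
    int hash1 = Arrays.hashCode(key);            // stand-in for hash(hashKey, 0, length, 0)
    int hash2 = Arrays.hashCode(key) * 31 + 7;   // stand-in for hash(hashKey, 0, length, hash1)

    for (int i = 0; i < hashCount; i++) {
      int hashLoc = Math.abs((hash1 + i * hash2) % bits);  // same formula as setHashLoc()
      bloom.set(hashLoc);
    }
    System.out.println("probes set: " + bloom.cardinality());
  }
}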
private void setHashLoc(int hash1, int hash2) {
for (int i = 0; i < this.hashCount; i++) {
long hashLoc = Math.abs((hash1 + i * hash2) % (this.byteSize * 8));
set(hashLoc);

View File

@ -196,7 +196,7 @@ public final class BloomFilterFactory {
// In case of compound Bloom filters we ignore the maxKeys hint.
CompoundBloomFilterWriter bloomWriter = new CompoundBloomFilterWriter(getBloomBlockSize(conf),
err, Hash.getHashType(conf), maxFold, cacheConf.shouldCacheBloomsOnWrite(),
bloomType == BloomType.ROWCOL ? CellComparator.COMPARATOR : null);
bloomType == BloomType.ROWCOL ? CellComparator.COMPARATOR : null, bloomType);
writer.addInlineBlockWriter(bloomWriter);
return bloomWriter;
}
@ -227,7 +227,7 @@ public final class BloomFilterFactory {
// In case of compound Bloom filters we ignore the maxKeys hint.
CompoundBloomFilterWriter bloomWriter = new CompoundBloomFilterWriter(getBloomBlockSize(conf),
err, Hash.getHashType(conf), maxFold, cacheConf.shouldCacheBloomsOnWrite(),
null);
null, BloomType.ROW);
writer.addInlineBlockWriter(bloomWriter);
return bloomWriter;
}

View File

@ -22,6 +22,7 @@ import java.util.Random;
import org.apache.hadoop.hbase.classification.InterfaceAudience;
import org.apache.hadoop.hbase.nio.ByteBuff;
import org.apache.hadoop.hbase.regionserver.BloomType;
/**
* Utility methods related to BloomFilters
@ -173,11 +174,12 @@ public final class BloomFilterUtil {
* @param errorRate target false positive rate of the Bloom filter
* @param hashType Bloom filter hash function type
* @param foldFactor
* @param bloomType
* @return the new Bloom filter of the desired size
*/
public static BloomFilterChunk createBySize(int byteSizeHint,
double errorRate, int hashType, int foldFactor) {
BloomFilterChunk bbf = new BloomFilterChunk(hashType);
double errorRate, int hashType, int foldFactor, BloomType bloomType) {
BloomFilterChunk bbf = new BloomFilterChunk(hashType, bloomType);
bbf.byteSize = computeFoldableByteSize(byteSizeHint * 8L, foldFactor);
long bitSize = bbf.byteSize * 8;
@ -195,11 +197,12 @@ public final class BloomFilterUtil {
public static boolean contains(byte[] buf, int offset, int length,
ByteBuff bloomBuf, int bloomOffset, int bloomSize, Hash hash,
int hashCount) {
int hash1 = hash.hash(buf, offset, length, 0);
int hash2 = hash.hash(buf, offset, length, hash1);
// TODO : this will get removed once the read path also works with Cells for blooms.
ByteArrayHashKey hashKey = new ByteArrayHashKey(buf);
int hash1 = hash.hash(hashKey, offset, length, 0);
int hash2 = hash.hash(hashKey, offset, length, hash1);
int bloomBitSize = bloomSize << 3;
if (randomGeneratorForTest == null) {
// Production mode.
int compositeHash = hash1;

View File

@ -19,6 +19,7 @@
package org.apache.hadoop.hbase.util;
import org.apache.hadoop.hbase.Cell;
import org.apache.hadoop.hbase.classification.InterfaceAudience;
import org.apache.hadoop.io.Writable;
@ -48,11 +49,7 @@ public interface BloomFilterWriter extends BloomFilterBase {
/**
* Add the specified binary to the bloom filter.
*
* @param buf data to be added to the bloom
* @param offset offset into the data to be added
* @param len length of the data to be added
* @param cell the cell data to be added to the bloom
*/
void add(byte[] buf, int offset, int len);
void add(Cell cell);
}

View File

@ -0,0 +1,52 @@
/**
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hadoop.hbase.util;
import java.io.IOException;
import org.apache.hadoop.hbase.Cell;
import org.apache.hadoop.hbase.CellUtil;
import org.apache.hadoop.hbase.classification.InterfaceAudience;
import org.apache.hadoop.hbase.io.hfile.HFile.Writer;
import org.apache.hadoop.hbase.regionserver.StoreFile;
/**
* Handles ROW bloom related context. It works with both ByteBufferedCell and byte[] backed cells
*/
@InterfaceAudience.Private
public class RowBloomContext extends BloomContext {
public RowBloomContext(BloomFilterWriter generalBloomFilterWriter) {
super(generalBloomFilterWriter);
}
public void addLastBloomKey(Writer writer) throws IOException {
if (lastCell != null) {
byte[] key = CellUtil.copyRow(this.lastCell);
writer.appendFileInfo(StoreFile.LAST_BLOOM_KEY, key);
}
}
@Override
protected boolean isNewKey(Cell cell) {
if (this.lastCell != null) {
return !CellUtil.matchingRows(cell, this.lastCell);
}
return true;
}
}

View File

@ -0,0 +1,56 @@
/**
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hadoop.hbase.util;
import java.io.IOException;
import org.apache.hadoop.hbase.Cell;
import org.apache.hadoop.hbase.CellUtil;
import org.apache.hadoop.hbase.classification.InterfaceAudience;
import org.apache.hadoop.hbase.io.hfile.HFile.Writer;
import org.apache.hadoop.hbase.regionserver.StoreFile;
/**
* Handles ROWCOL bloom related context. It can work with both ByteBufferedCell
* and byte[] backed cells
*/
@InterfaceAudience.Private
public class RowColBloomContext extends BloomContext {
public RowColBloomContext(BloomFilterWriter generalBloomFilterWriter) {
super(generalBloomFilterWriter);
}
@Override
public void addLastBloomKey(Writer writer) throws IOException {
if (this.lastCell != null) {
Cell firstOnRow = CellUtil.createFirstOnRowCol(this.lastCell);
// This copy happens only once when the writer is closed
byte[] key = CellUtil.getCellKeySerializedAsKeyValueKey(firstOnRow);
writer.appendFileInfo(StoreFile.LAST_BLOOM_KEY, key);
}
}
@Override
protected boolean isNewKey(Cell cell) {
if (this.lastCell != null) {
return !CellUtil.matchingRowColumn(cell, this.lastCell);
}
return true;
}
}