HBASE-2824 A filter that randomly includes rows based on a configured chance

git-svn-id: https://svn.apache.org/repos/asf/hbase/trunk@1067232 13f79535-47bb-0310-9956-ffa450edef68
Andrew Kyle Purtell 2011-02-04 17:32:02 +00:00
parent 5bb4725494
commit 1aa394d882
4 changed files with 216 additions and 2 deletions

View File

@@ -63,7 +63,8 @@ Release 0.91.0 - Unreleased
HBASE-3393 Update Avro gateway to use Avro 1.4.1 and the new
server.join() method (Jeff Hammerbacher via Stack)
HBASE-3437 Support Explicit Split Points from the Shell
HBASE-3433 KeyValue API to explicitly distinguish between deep & shallow copies
HBASE-3433 KeyValue API to explicitly distinguish between deep & shallow
copies
HBASE-3305 Allow round-robin distribution for table created with
multiple regions (ted yu via jgray)
@@ -74,6 +75,9 @@ Release 0.91.0 - Unreleased
HBASE-3335 Add BitComparator for filtering (Nathaniel Cook via Stack)
HBASE-3256 Coprocessors: Coprocessor host and observer for HMaster
HBASE-3448 RegionSplitter, utility class to manually split tables
HBASE-2824 A filter that randomly includes rows based on a configured
chance (Ferdy via Andrew Purtell)
Release 0.90.1 - Unreleased

View File

@@ -0,0 +1,118 @@
/**
* Copyright 2011 The Apache Software Foundation
*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hadoop.hbase.filter;

import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;
import java.util.Random;

import org.apache.hadoop.hbase.KeyValue;

/**
 * A filter that includes rows based on a chance.
 */
public class RandomRowFilter extends FilterBase {
  protected static final Random random = new Random();

  protected float chance;
  protected boolean filterOutRow;

  /**
   * Writable constructor, do not use.
   */
  public RandomRowFilter() {
  }

  /**
   * Create a new filter with a specified chance for a row to be included.
   *
   * @param chance the chance (between 0 and 1) that a row is included
   */
  public RandomRowFilter(float chance) {
    this.chance = chance;
  }

  /**
   * @return The chance that a row gets included.
   */
  public float getChance() {
    return chance;
  }

  /**
   * Set the chance that a row is included.
   *
   * @param chance the chance (between 0 and 1) that a row is included
   */
  public void setChance(float chance) {
    this.chance = chance;
  }

  @Override
  public boolean filterAllRemaining() {
    return false;
  }

  @Override
  public ReturnCode filterKeyValue(KeyValue v) {
    if (filterOutRow) {
      return ReturnCode.NEXT_ROW;
    }
    return ReturnCode.INCLUDE;
  }

  @Override
  public boolean filterRow() {
    return filterOutRow;
  }

  @Override
  public boolean filterRowKey(byte[] buffer, int offset, int length) {
    if (chance < 0) {
      // a negative chance always excludes the row
      filterOutRow = true;
    } else if (chance > 1) {
      // a chance above one always includes the row
      filterOutRow = false;
    } else {
      // roll the dice
      filterOutRow = !(random.nextFloat() < chance);
    }
    return filterOutRow;
  }

  @Override
  public void reset() {
    filterOutRow = false;
  }

  @Override
  public void readFields(DataInput in) throws IOException {
    chance = in.readFloat();
  }

  @Override
  public void write(DataOutput out) throws IOException {
    out.writeFloat(chance);
  }
}
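
For orientation, the filter is attached to a client-side Scan like any other filter. The sketch below is illustrative only and not part of this commit; it assumes an already-opened HTable named table and an existing column family named f, and samples roughly half of the rows:

import org.apache.hadoop.hbase.client.Result;
import org.apache.hadoop.hbase.client.ResultScanner;
import org.apache.hadoop.hbase.client.Scan;
import org.apache.hadoop.hbase.util.Bytes;

// "table" is an already-opened HTable and "f" an existing family (both assumed).
Scan scan = new Scan();
scan.addFamily(Bytes.toBytes("f"));
scan.setFilter(new RandomRowFilter(0.5f)); // include each row with ~50% probability

ResultScanner scanner = table.getScanner(scan);
try {
  for (Result result : scanner) {
    System.out.println(Bytes.toString(result.getRow())); // process the sampled row
  }
} finally {
  scanner.close();
}

Because filterRowKey makes one random decision per row, either all KeyValues of a row are returned or none are, so sampled rows come back complete.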

View File

@@ -22,7 +22,6 @@ import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.DataInput;
import java.io.DataOutput;
import java.io.DataOutputStream;
import java.io.IOException;
import java.io.ObjectInputStream;
import java.io.ObjectOutputStream;
@@ -73,6 +72,7 @@ import org.apache.hadoop.hbase.filter.KeyOnlyFilter;
import org.apache.hadoop.hbase.filter.PageFilter;
import org.apache.hadoop.hbase.filter.PrefixFilter;
import org.apache.hadoop.hbase.filter.QualifierFilter;
import org.apache.hadoop.hbase.filter.RandomRowFilter;
import org.apache.hadoop.hbase.filter.RowFilter;
import org.apache.hadoop.hbase.filter.SingleColumnValueExcludeFilter;
import org.apache.hadoop.hbase.filter.SingleColumnValueFilter;
@@ -218,6 +218,7 @@ public class HbaseObjectWritable implements Writable, WritableWithSize, Configur
    // serializable
    addToMap(Serializable.class, code++);
    addToMap(RandomRowFilter.class, code++);
  }

  private Class<?> declaredClass;
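
The single added line above is what makes the new filter usable over RPC: HbaseObjectWritable writes a small integer code for each registered class instead of its full name, and new classes are appended at the end of the map (code++) so the codes of previously registered classes, and therefore wire compatibility, stay unchanged. A minimal round-trip sketch, not part of this commit, assuming the usual static writeObject/readObject helpers and an existing Configuration named conf:

import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.DataInputStream;
import java.io.DataOutputStream;
import org.apache.hadoop.hbase.filter.RandomRowFilter;
import org.apache.hadoop.hbase.io.HbaseObjectWritable;

// Serialize the filter the way HBase RPC would.
ByteArrayOutputStream bytes = new ByteArrayOutputStream();
DataOutputStream out = new DataOutputStream(bytes);
HbaseObjectWritable.writeObject(out, new RandomRowFilter(0.5f),
    RandomRowFilter.class, conf);
out.close();

// Read it back; the class code written first selects RandomRowFilter again.
DataInputStream in =
    new DataInputStream(new ByteArrayInputStream(bytes.toByteArray()));
RandomRowFilter copy = (RandomRowFilter) HbaseObjectWritable.readObject(in, conf);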

View File

@@ -0,0 +1,91 @@
/**
* Copyright 2011 The Apache Software Foundation
*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hadoop.hbase.filter;

import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.DataInputStream;
import java.io.DataOutputStream;

import junit.framework.TestCase;

import org.apache.hadoop.hbase.util.Bytes;

public class TestRandomRowFilter extends TestCase {
  protected RandomRowFilter quarterChanceFilter;

  @Override
  protected void setUp() throws Exception {
    super.setUp();
    quarterChanceFilter = new RandomRowFilter(0.25f);
  }

  /**
   * Tests basics
   *
   * @throws Exception
   */
  public void testBasics() throws Exception {
    int included = 0;
    int max = 1000000;
    byte[] row = Bytes.toBytes("row");
    for (int i = 0; i < max; i++) {
      if (!quarterChanceFilter.filterRowKey(row, 0, row.length)) {
        included++;
      }
    }
    // Now let's check if the filter included the right number of rows;
    // since we're dealing with randomness, we must allow an epsilon of
    // tolerance: 1% of max, i.e. 10,000 rows around the expected 250,000.
    int epsilon = max / 100;
    assertTrue("Roughly 25% should pass the filter",
        Math.abs(included - max / 4) < epsilon);
  }

  /**
   * Tests serialization
   *
   * @throws Exception
   */
  public void testSerialization() throws Exception {
    RandomRowFilter newFilter = serializationTest(quarterChanceFilter);
    // use an epsilon float comparison
    assertTrue("float should be equal",
        Math.abs(newFilter.getChance() - quarterChanceFilter.getChance()) < 0.000001f);
  }

  private RandomRowFilter serializationTest(RandomRowFilter filter)
      throws Exception {
    // Decompose filter to bytes.
    ByteArrayOutputStream stream = new ByteArrayOutputStream();
    DataOutputStream out = new DataOutputStream(stream);
    filter.write(out);
    out.close();
    byte[] buffer = stream.toByteArray();

    // Recompose filter.
    DataInputStream in = new DataInputStream(new ByteArrayInputStream(buffer));
    RandomRowFilter newFilter = new RandomRowFilter();
    newFilter.readFields(in);
    return newFilter;
  }
}
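
The test exercises the filter in isolation; in practice it composes with other filters through FilterList. An illustrative sketch, not part of this commit, that keeps roughly 10% of the rows under a hypothetical "user-" key prefix (prefix and rate are made up for the example):

import java.util.Arrays;
import org.apache.hadoop.hbase.client.Scan;
import org.apache.hadoop.hbase.filter.Filter;
import org.apache.hadoop.hbase.filter.FilterList;
import org.apache.hadoop.hbase.filter.PrefixFilter;
import org.apache.hadoop.hbase.filter.RandomRowFilter;
import org.apache.hadoop.hbase.util.Bytes;

// Keep only rows whose key starts with "user-" and, of those, about 10%.
FilterList filters = new FilterList(FilterList.Operator.MUST_PASS_ALL,
    Arrays.<Filter> asList(
        new PrefixFilter(Bytes.toBytes("user-")),
        new RandomRowFilter(0.1f)));

Scan scan = new Scan();
scan.setFilter(filters);

With MUST_PASS_ALL both filters must include a row, so the random decision effectively applies only to rows that already match the prefix.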