HBASE-21520 TestMultiColumnScanner cost long time when using ROWCOL bloom type

This commit is contained in:
huzheng 2018-12-13 15:04:12 +08:00
parent 491153488e
commit ac0b3bb547
5 changed files with 219 additions and 67 deletions

View File

@ -32,11 +32,9 @@ import java.util.Map;
import java.util.Random;
import java.util.Set;
import java.util.TreeSet;
import org.apache.commons.lang3.ArrayUtils;
import org.apache.hadoop.hbase.Cell;
import org.apache.hadoop.hbase.CellComparatorImpl;
import org.apache.hadoop.hbase.CellUtil;
import org.apache.hadoop.hbase.HBaseClassTestRule;
import org.apache.hadoop.hbase.HBaseTestingUtility;
import org.apache.hadoop.hbase.HColumnDescriptor;
import org.apache.hadoop.hbase.KeyValue;
@ -47,29 +45,27 @@ import org.apache.hadoop.hbase.client.Put;
import org.apache.hadoop.hbase.client.Scan;
import org.apache.hadoop.hbase.io.compress.Compression;
import org.apache.hadoop.hbase.io.encoding.DataBlockEncoding;
import org.apache.hadoop.hbase.testclassification.MediumTests;
import org.apache.hadoop.hbase.testclassification.RegionServerTests;
import org.apache.hadoop.hbase.util.BloomFilterUtil;
import org.apache.hadoop.hbase.util.Bytes;
import org.junit.ClassRule;
import org.junit.Test;
import org.junit.experimental.categories.Category;
import org.junit.runner.RunWith;
import org.junit.runners.Parameterized;
import org.junit.runners.Parameterized.Parameters;
import org.junit.runners.Parameterized.Parameter;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
/**
* Tests optimized scanning of multiple columns.
* Tests optimized scanning of multiple columns. <br>
* We separated the big test into several sub-class UT, because When in ROWCOL bloom type, we will
* test the row-col bloom filter frequently for saving HDFS seek once we switch from one column to
* another in our UT. It's cpu time consuming (~45s for each case), so moved the ROWCOL case into a
* separated LargeTests to avoid timeout failure. <br>
* <br>
* To be clear: In TestMultiColumnScanner, we will flush 10 (NUM_FLUSHES=10) HFiles here, and the
* table will put ~1000 cells (rows=20, ts=6, qualifiers=8, total=20*6*8 ~ 1000) . Each full table
* scan will check the ROWCOL bloom filter 20 (rows)* 8 (column) * 10 (hfiles)= 1600 times, beside
* it will scan the full table 6*2^8=1536 times, so finally will have 1600*1536=2457600 bloom filter
* testing. (See HBASE-21520)
*/
@RunWith(Parameterized.class)
@Category({RegionServerTests.class, MediumTests.class})
public class TestMultiColumnScanner {
@ClassRule
public static final HBaseClassTestRule CLASS_RULE =
HBaseClassTestRule.forClass(TestMultiColumnScanner.class);
public abstract class TestMultiColumnScanner {
private static final Logger LOG = LoggerFactory.getLogger(TestMultiColumnScanner.class);
@ -104,20 +100,19 @@ public class TestMultiColumnScanner {
/** The probability that a column is skipped in a store file. */
private static final double COLUMN_SKIP_IN_STORE_FILE_PROB = 0.7;
/** The probability of skipping a column in a single row */
private static final double COLUMN_SKIP_IN_ROW_PROB = 0.1;
/** The probability of skipping a column everywhere */
private static final double COLUMN_SKIP_EVERYWHERE_PROB = 0.1;
/** The probability to delete a row/column pair */
private static final double DELETE_PROBABILITY = 0.02;
private final static HBaseTestingUtility TEST_UTIL = HBaseTestingUtility.createLocalHTU();
private final Compression.Algorithm comprAlgo;
private final BloomType bloomType;
private final DataBlockEncoding dataBlockEncoding;
@Parameter(0)
public Compression.Algorithm comprAlgo;
@Parameter(1)
public BloomType bloomType;
@Parameter(2)
public DataBlockEncoding dataBlockEncoding;
// Some static sanity-checking.
static {
@ -128,27 +123,17 @@ public class TestMultiColumnScanner {
assertTrue(TIMESTAMPS[i] < TIMESTAMPS[i + 1]);
}
@Parameters
public static final Collection<Object[]> parameters() {
public static Collection<Object[]> generateParams(Compression.Algorithm algo,
boolean useDataBlockEncoding) {
List<Object[]> parameters = new ArrayList<>();
for (Object[] bloomAndCompressionParams :
HBaseTestingUtility.BLOOM_AND_COMPRESSION_COMBINATIONS) {
for (boolean useDataBlockEncoding : new boolean[]{false, true}) {
parameters.add(ArrayUtils.add(bloomAndCompressionParams,
useDataBlockEncoding));
}
for (BloomType bloomType : BloomType.values()) {
DataBlockEncoding dataBlockEncoding =
useDataBlockEncoding ? DataBlockEncoding.PREFIX : DataBlockEncoding.NONE;
parameters.add(new Object[] { algo, bloomType, dataBlockEncoding });
}
return parameters;
}
public TestMultiColumnScanner(Compression.Algorithm comprAlgo,
BloomType bloomType, boolean useDataBlockEncoding) {
this.comprAlgo = comprAlgo;
this.bloomType = bloomType;
this.dataBlockEncoding = useDataBlockEncoding ? DataBlockEncoding.PREFIX :
DataBlockEncoding.NONE;
}
@Test
public void testMultiColumnScanner() throws IOException {
TEST_UTIL.getConfiguration().setInt(BloomFilterUtil.PREFIX_LENGTH_KEY, 10);
@ -170,24 +155,6 @@ public class TestMultiColumnScanner {
Map<String, Long> lastDelTimeMap = new HashMap<>();
Random rand = new Random(29372937L);
Set<String> rowQualSkip = new HashSet<>();
// Skip some columns in some rows. We need to test scanning over a set
// of columns when some of the columns are not there.
for (String row : rows)
for (String qual : qualifiers)
if (rand.nextDouble() < COLUMN_SKIP_IN_ROW_PROB) {
LOG.info("Skipping " + qual + " in row " + row);
rowQualSkip.add(rowQualKey(row, qual));
}
// Also skip some columns in all rows.
for (String qual : qualifiers)
if (rand.nextDouble() < COLUMN_SKIP_EVERYWHERE_PROB) {
LOG.info("Skipping " + qual + " in all rows");
for (String row : rows)
rowQualSkip.add(rowQualKey(row, qual));
}
for (int iFlush = 0; iFlush < NUM_FLUSHES; ++iFlush) {
for (String qual : qualifiers) {
@ -316,10 +283,6 @@ public class TestMultiColumnScanner {
kv.getQualifierLength());
}
private static String rowQualKey(String row, String qual) {
return row + "_" + qual;
}
static String createValue(String row, String qual, long ts) {
return "value_for_" + row + "_" + qual + "_" + ts;
}
@ -339,10 +302,7 @@ public class TestMultiColumnScanner {
lst.add(sb.toString());
}
return lst;
}
}

View File

@ -0,0 +1,48 @@
/**
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hadoop.hbase.regionserver;
import java.util.Collection;
import org.apache.hadoop.hbase.HBaseClassTestRule;
import org.apache.hadoop.hbase.io.compress.Compression.Algorithm;
import org.apache.hadoop.hbase.testclassification.LargeTests;
import org.apache.hadoop.hbase.testclassification.RegionServerTests;
import org.junit.ClassRule;
import org.junit.experimental.categories.Category;
import org.junit.runner.RunWith;
import org.junit.runners.Parameterized;
import org.junit.runners.Parameterized.Parameters;
/**
* Test case for Compression.Algorithm.GZ and no use data block encoding.
* @see org.apache.hadoop.hbase.regionserver.TestMultiColumnScanner
*/
@RunWith(Parameterized.class)
@Category({ RegionServerTests.class, LargeTests.class })
public class TestMultiColumnScannerWithAlgoGZAndNoDataEncoding extends TestMultiColumnScanner {
@ClassRule
public static final HBaseClassTestRule CLASS_RULE =
HBaseClassTestRule.forClass(TestMultiColumnScannerWithAlgoGZAndNoDataEncoding.class);
@Parameters
public static Collection<Object[]> parameters() {
return TestMultiColumnScanner.generateParams(Algorithm.GZ, false);
}
}

View File

@ -0,0 +1,48 @@
/**
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hadoop.hbase.regionserver;
import java.util.Collection;
import org.apache.hadoop.hbase.HBaseClassTestRule;
import org.apache.hadoop.hbase.io.compress.Compression.Algorithm;
import org.apache.hadoop.hbase.testclassification.LargeTests;
import org.apache.hadoop.hbase.testclassification.RegionServerTests;
import org.junit.ClassRule;
import org.junit.experimental.categories.Category;
import org.junit.runner.RunWith;
import org.junit.runners.Parameterized;
import org.junit.runners.Parameterized.Parameters;
/**
* Test case for Compression.Algorithm.GZ and use data block encoding.
* @see org.apache.hadoop.hbase.regionserver.TestMultiColumnScanner
*/
@RunWith(Parameterized.class)
@Category({ RegionServerTests.class, LargeTests.class })
public class TestMultiColumnScannerWithAlgoGZAndUseDataEncoding extends TestMultiColumnScanner {
@ClassRule
public static final HBaseClassTestRule CLASS_RULE =
HBaseClassTestRule.forClass(TestMultiColumnScannerWithAlgoGZAndUseDataEncoding.class);
@Parameters
public static Collection<Object[]> parameters() {
return TestMultiColumnScanner.generateParams(Algorithm.GZ, true);
}
}

View File

@ -0,0 +1,48 @@
/**
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hadoop.hbase.regionserver;
import java.util.Collection;
import org.apache.hadoop.hbase.HBaseClassTestRule;
import org.apache.hadoop.hbase.io.compress.Compression.Algorithm;
import org.apache.hadoop.hbase.testclassification.LargeTests;
import org.apache.hadoop.hbase.testclassification.RegionServerTests;
import org.junit.ClassRule;
import org.junit.experimental.categories.Category;
import org.junit.runner.RunWith;
import org.junit.runners.Parameterized;
import org.junit.runners.Parameterized.Parameters;
/**
* Test case for Compression.Algorithm.NONE and no use data block encoding.
* @see org.apache.hadoop.hbase.regionserver.TestMultiColumnScanner
*/
@RunWith(Parameterized.class)
@Category({ RegionServerTests.class, LargeTests.class })
public class TestMultiColumnScannerWithNoneAndNoDataEncoding extends TestMultiColumnScanner {
@ClassRule
public static final HBaseClassTestRule CLASS_RULE =
HBaseClassTestRule.forClass(TestMultiColumnScannerWithNoneAndNoDataEncoding.class);
@Parameters
public static Collection<Object[]> parameters() {
return TestMultiColumnScanner.generateParams(Algorithm.NONE, false);
}
}

View File

@ -0,0 +1,48 @@
/**
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hadoop.hbase.regionserver;
import java.util.Collection;
import org.apache.hadoop.hbase.HBaseClassTestRule;
import org.apache.hadoop.hbase.io.compress.Compression.Algorithm;
import org.apache.hadoop.hbase.testclassification.LargeTests;
import org.apache.hadoop.hbase.testclassification.RegionServerTests;
import org.junit.ClassRule;
import org.junit.experimental.categories.Category;
import org.junit.runner.RunWith;
import org.junit.runners.Parameterized;
import org.junit.runners.Parameterized.Parameters;
/**
* Test case for Compression.Algorithm.NONE and no use data block encoding.
* @see org.apache.hadoop.hbase.regionserver.TestMultiColumnScanner
*/
@RunWith(Parameterized.class)
@Category({ RegionServerTests.class, LargeTests.class })
public class TestMultiColumnScannerWithNoneAndUseDataEncoding extends TestMultiColumnScanner {
@ClassRule
public static final HBaseClassTestRule CLASS_RULE =
HBaseClassTestRule.forClass(TestMultiColumnScannerWithNoneAndUseDataEncoding.class);
@Parameters
public static Collection<Object[]> parameters() {
return TestMultiColumnScanner.generateParams(Algorithm.NONE, true);
}
}