From a287431293e4a7b171b672153686037183f62a64 Mon Sep 17 00:00:00 2001 From: Shai Erera Date: Mon, 10 Jun 2013 18:41:17 +0000 Subject: [PATCH] LUCENE-5048: limit CategoryPath total length git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1491562 13f79535-47bb-0310-9956-ffa450edef68 --- lucene/CHANGES.txt | 6 ++ .../lucene/facet/taxonomy/CategoryPath.java | 28 +++++++++ .../writercache/cl2o/CategoryPathUtils.java | 6 +- .../facet/taxonomy/TestCategoryPath.java | 28 +++++++++ .../TestDirectoryTaxonomyWriter.java | 58 ++++++++++++++++++- .../org/apache/lucene/util/_TestUtil.java | 8 ++- 6 files changed, 128 insertions(+), 6 deletions(-) diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt index b0c43cf15ed..559be7ca050 100644 --- a/lucene/CHANGES.txt +++ b/lucene/CHANGES.txt @@ -148,6 +148,12 @@ Bug Fixes into out-of-memory errors when working with large stored fields. (Adrien Grand) +* LUCENE-5048: CategoryPath with a long path could result in hitting + NegativeArraySizeException, categories being added multiple times to the + taxonomy or drill-down terms silently discarded by the indexer. CategoryPath + is now limited to MAX_CATEGORY_PATH_LENGTH characters. + (Colton Jamieson, Mike McCandless, Shai Erera) + Optimizations * LUCENE-4936: Improve numeric doc values compression in case all values share diff --git a/lucene/facet/src/java/org/apache/lucene/facet/taxonomy/CategoryPath.java b/lucene/facet/src/java/org/apache/lucene/facet/taxonomy/CategoryPath.java index d31556ed36a..7fe36506d24 100644 --- a/lucene/facet/src/java/org/apache/lucene/facet/taxonomy/CategoryPath.java +++ b/lucene/facet/src/java/org/apache/lucene/facet/taxonomy/CategoryPath.java @@ -17,6 +17,8 @@ package org.apache.lucene.facet.taxonomy; * limitations under the License. */ +import static org.apache.lucene.util.ByteBlockPool.BYTE_BLOCK_SIZE; + import java.util.Arrays; import java.util.regex.Pattern; @@ -28,6 +30,18 @@ import java.util.regex.Pattern; */ public class CategoryPath implements Comparable { + /* + * copied from DocumentWriterPerThread -- if a CategoryPath is resolved to a + * drill-down term which is encoded to a larger term than that length, it is + * silently dropped! Therefore we limit the number of characters to MAX/4 to + * be on the safe side. + */ + /** + * The maximum number of characters a {@link CategoryPath} can have. That is + * {@link CategoryPath#toString(char)} length must not exceed that limit. + */ + public final static int MAX_CATEGORY_PATH_LENGTH = (BYTE_BLOCK_SIZE - 2) / 4; + /** An empty {@link CategoryPath} */ public static final CategoryPath EMPTY = new CategoryPath(); @@ -63,10 +77,18 @@ public class CategoryPath implements Comparable { /** Construct from the given path components. */ public CategoryPath(final String... components) { assert components.length > 0 : "use CategoryPath.EMPTY to create an empty path"; + long len = 0; for (String comp : components) { if (comp == null || comp.isEmpty()) { throw new IllegalArgumentException("empty or null components not allowed: " + Arrays.toString(components)); } + len += comp.length(); + } + len += components.length - 1; // add separators + if (len > MAX_CATEGORY_PATH_LENGTH) { + throw new IllegalArgumentException("category path exceeds maximum allowed path length: max=" + + MAX_CATEGORY_PATH_LENGTH + " len=" + len + + " path=" + Arrays.toString(components).substring(0, 30) + "..."); } this.components = components; length = components.length; @@ -74,6 +96,12 @@ public class CategoryPath implements Comparable { /** Construct from a given path, separating path components with {@code delimiter}. */ public CategoryPath(final String pathString, final char delimiter) { + if (pathString.length() > MAX_CATEGORY_PATH_LENGTH) { + throw new IllegalArgumentException("category path exceeds maximum allowed path length: max=" + + MAX_CATEGORY_PATH_LENGTH + " len=" + pathString.length() + + " path=" + pathString.substring(0, 30) + "..."); + } + String[] comps = pathString.split(Pattern.quote(Character.toString(delimiter))); if (comps.length == 1 && comps[0].isEmpty()) { components = null; diff --git a/lucene/facet/src/java/org/apache/lucene/facet/taxonomy/writercache/cl2o/CategoryPathUtils.java b/lucene/facet/src/java/org/apache/lucene/facet/taxonomy/writercache/cl2o/CategoryPathUtils.java index 2ceff4c11ab..f1d2e962cbd 100644 --- a/lucene/facet/src/java/org/apache/lucene/facet/taxonomy/writercache/cl2o/CategoryPathUtils.java +++ b/lucene/facet/src/java/org/apache/lucene/facet/taxonomy/writercache/cl2o/CategoryPathUtils.java @@ -39,14 +39,14 @@ class CategoryPathUtils { * {@link #serialize(CategoryPath, CharBlockArray)}. */ public static int hashCodeOfSerialized(CharBlockArray charBlockArray, int offset) { - int length = (short) charBlockArray.charAt(offset++); + int length = charBlockArray.charAt(offset++); if (length == 0) { return 0; } int hash = length; for (int i = 0; i < length; i++) { - int len = (short) charBlockArray.charAt(offset++); + int len = charBlockArray.charAt(offset++); hash = hash * 31 + charBlockArray.subSequence(offset, offset + len).hashCode(); offset += len; } @@ -67,7 +67,7 @@ class CategoryPathUtils { } for (int i = 0; i < cp.length; i++) { - int len = (short) charBlockArray.charAt(offset++); + int len = charBlockArray.charAt(offset++); if (len != cp.components[i].length()) { return false; } diff --git a/lucene/facet/src/test/org/apache/lucene/facet/taxonomy/TestCategoryPath.java b/lucene/facet/src/test/org/apache/lucene/facet/taxonomy/TestCategoryPath.java index 8091c6ac4b3..de5d3cb0160 100644 --- a/lucene/facet/src/test/org/apache/lucene/facet/taxonomy/TestCategoryPath.java +++ b/lucene/facet/src/test/org/apache/lucene/facet/taxonomy/TestCategoryPath.java @@ -3,6 +3,7 @@ package org.apache.lucene.facet.taxonomy; import java.util.Arrays; import org.apache.lucene.facet.FacetTestCase; +import org.apache.lucene.util._TestUtil; import org.junit.Test; /* @@ -274,5 +275,32 @@ public class TestCategoryPath extends FacetTestCase { // expected } } + + @Test + public void testLongPath() throws Exception { + String bigComp = null; + while (true) { + int len = CategoryPath.MAX_CATEGORY_PATH_LENGTH; + bigComp = _TestUtil.randomSimpleString(random(), len, len); + if (bigComp.indexOf('\u001f') != -1) { + continue; + } + break; + } + + try { + assertNotNull(new CategoryPath("dim", bigComp)); + fail("long paths should not be allowed; len=" + bigComp.length()); + } catch (IllegalArgumentException e) { + // expected + } + + try { + assertNotNull(new CategoryPath("dim\u001f" + bigComp, '\u001f')); + fail("long paths should not be allowed; len=" + bigComp.length()); + } catch (IllegalArgumentException e) { + // expected + } + } } diff --git a/lucene/facet/src/test/org/apache/lucene/facet/taxonomy/directory/TestDirectoryTaxonomyWriter.java b/lucene/facet/src/test/org/apache/lucene/facet/taxonomy/directory/TestDirectoryTaxonomyWriter.java index 9aeb72dec37..fd515eb0cd6 100644 --- a/lucene/facet/src/test/org/apache/lucene/facet/taxonomy/directory/TestDirectoryTaxonomyWriter.java +++ b/lucene/facet/src/test/org/apache/lucene/facet/taxonomy/directory/TestDirectoryTaxonomyWriter.java @@ -1,13 +1,19 @@ package org.apache.lucene.facet.taxonomy.directory; import java.io.IOException; +import java.util.Collections; import java.util.HashMap; import java.util.Map; import java.util.Random; import java.util.concurrent.ConcurrentHashMap; import java.util.concurrent.atomic.AtomicInteger; +import org.apache.lucene.analysis.MockAnalyzer; +import org.apache.lucene.document.Document; import org.apache.lucene.facet.FacetTestCase; +import org.apache.lucene.facet.index.FacetFields; +import org.apache.lucene.facet.params.FacetIndexingParams; +import org.apache.lucene.facet.search.DrillDownQuery; import org.apache.lucene.facet.taxonomy.CategoryPath; import org.apache.lucene.facet.taxonomy.TaxonomyReader; import org.apache.lucene.facet.taxonomy.directory.DirectoryTaxonomyWriter.MemoryOrdinalMap; @@ -20,8 +26,11 @@ import org.apache.lucene.index.IndexWriter; import org.apache.lucene.index.IndexWriterConfig; import org.apache.lucene.index.IndexWriterConfig.OpenMode; import org.apache.lucene.index.SegmentInfos; +import org.apache.lucene.search.IndexSearcher; import org.apache.lucene.store.AlreadyClosedException; import org.apache.lucene.store.Directory; +import org.apache.lucene.util.IOUtils; +import org.apache.lucene.util._TestUtil; import org.junit.Test; /* @@ -412,5 +421,52 @@ public class TestDirectoryTaxonomyWriter extends FacetTestCase { taxoWriter.close(); dir.close(); } - + + @Test + public void testHugeLabel() throws Exception { + Directory indexDir = newDirectory(), taxoDir = newDirectory(); + IndexWriter indexWriter = new IndexWriter(indexDir, newIndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(random()))); + DirectoryTaxonomyWriter taxoWriter = new DirectoryTaxonomyWriter(taxoDir, OpenMode.CREATE, new Cl2oTaxonomyWriterCache(2, 1f, 1)); + FacetFields facetFields = new FacetFields(taxoWriter); + + // Add one huge label: + String bigs = null; + int ordinal = -1; + CategoryPath cp = null; + while (true) { + int len = CategoryPath.MAX_CATEGORY_PATH_LENGTH - 4; // for the dimension and separator + bigs = _TestUtil.randomSimpleString(random(), len, len); + cp = new CategoryPath("dim", bigs); + ordinal = taxoWriter.addCategory(cp); + Document doc = new Document(); + facetFields.addFields(doc, Collections.singletonList(cp)); + indexWriter.addDocument(doc); + break; + } + + // Add tiny ones to cause a re-hash + for (int i = 0; i < 3; i++) { + String s = _TestUtil.randomSimpleString(random(), 1, 10); + taxoWriter.addCategory(new CategoryPath("dim", s)); + Document doc = new Document(); + facetFields.addFields(doc, Collections.singletonList(new CategoryPath("dim", s))); + indexWriter.addDocument(doc); + } + + // when too large components were allowed to be added, this resulted in a new added category + assertEquals(ordinal, taxoWriter.addCategory(cp)); + + IOUtils.close(indexWriter, taxoWriter); + + DirectoryReader indexReader = DirectoryReader.open(indexDir); + TaxonomyReader taxoReader = new DirectoryTaxonomyReader(taxoDir); + IndexSearcher searcher = new IndexSearcher(indexReader); + DrillDownQuery ddq = new DrillDownQuery(FacetIndexingParams.DEFAULT); + ddq.add(cp); + assertEquals(1, searcher.search(ddq, 10).totalHits); + + IOUtils.close(indexReader, taxoReader); + + IOUtils.close(indexDir, taxoDir); + } } diff --git a/lucene/test-framework/src/java/org/apache/lucene/util/_TestUtil.java b/lucene/test-framework/src/java/org/apache/lucene/util/_TestUtil.java index 919f45d4713..21b8eec8514 100644 --- a/lucene/test-framework/src/java/org/apache/lucene/util/_TestUtil.java +++ b/lucene/test-framework/src/java/org/apache/lucene/util/_TestUtil.java @@ -293,7 +293,11 @@ public class _TestUtil { } public static String randomSimpleString(Random r, int maxLength) { - final int end = nextInt(r, 0, maxLength); + return randomSimpleString(r, 0, maxLength); + } + + public static String randomSimpleString(Random r, int minLength, int maxLength) { + final int end = nextInt(r, minLength, maxLength); if (end == 0) { // allow 0 length return ""; @@ -319,7 +323,7 @@ public class _TestUtil { } public static String randomSimpleString(Random r) { - return randomSimpleString(r, 10); + return randomSimpleString(r, 0, 10); } /** Returns random string, including full unicode range. */