LUCENE-5048: limit CategoryPath total length

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1491562 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Shai Erera 2013-06-10 18:41:17 +00:00
parent eef4c01c99
commit a287431293
6 changed files with 128 additions and 6 deletions

View File

@ -148,6 +148,12 @@ Bug Fixes
into out-of-memory errors when working with large stored fields. into out-of-memory errors when working with large stored fields.
(Adrien Grand) (Adrien Grand)
* LUCENE-5048: CategoryPath with a long path could result in hitting
NegativeArraySizeException, categories being added multiple times to the
taxonomy or drill-down terms silently discarded by the indexer. CategoryPath
is now limited to MAX_CATEGORY_PATH_LENGTH characters.
(Colton Jamieson, Mike McCandless, Shai Erera)
Optimizations Optimizations
* LUCENE-4936: Improve numeric doc values compression in case all values share * LUCENE-4936: Improve numeric doc values compression in case all values share

View File

@ -17,6 +17,8 @@ package org.apache.lucene.facet.taxonomy;
* limitations under the License. * limitations under the License.
*/ */
import static org.apache.lucene.util.ByteBlockPool.BYTE_BLOCK_SIZE;
import java.util.Arrays; import java.util.Arrays;
import java.util.regex.Pattern; import java.util.regex.Pattern;
@ -28,6 +30,18 @@ import java.util.regex.Pattern;
*/ */
public class CategoryPath implements Comparable<CategoryPath> { public class CategoryPath implements Comparable<CategoryPath> {
/*
* copied from DocumentWriterPerThread -- if a CategoryPath is resolved to a
* drill-down term which is encoded to a larger term than that length, it is
* silently dropped! Therefore we limit the number of characters to MAX/4 to
* be on the safe side.
*/
/**
* The maximum number of characters a {@link CategoryPath} can have. That is
* {@link CategoryPath#toString(char)} length must not exceed that limit.
*/
public final static int MAX_CATEGORY_PATH_LENGTH = (BYTE_BLOCK_SIZE - 2) / 4;
/** An empty {@link CategoryPath} */ /** An empty {@link CategoryPath} */
public static final CategoryPath EMPTY = new CategoryPath(); public static final CategoryPath EMPTY = new CategoryPath();
@ -63,10 +77,18 @@ public class CategoryPath implements Comparable<CategoryPath> {
/** Construct from the given path components. */ /** Construct from the given path components. */
public CategoryPath(final String... components) { public CategoryPath(final String... components) {
assert components.length > 0 : "use CategoryPath.EMPTY to create an empty path"; assert components.length > 0 : "use CategoryPath.EMPTY to create an empty path";
long len = 0;
for (String comp : components) { for (String comp : components) {
if (comp == null || comp.isEmpty()) { if (comp == null || comp.isEmpty()) {
throw new IllegalArgumentException("empty or null components not allowed: " + Arrays.toString(components)); throw new IllegalArgumentException("empty or null components not allowed: " + Arrays.toString(components));
} }
len += comp.length();
}
len += components.length - 1; // add separators
if (len > MAX_CATEGORY_PATH_LENGTH) {
throw new IllegalArgumentException("category path exceeds maximum allowed path length: max="
+ MAX_CATEGORY_PATH_LENGTH + " len=" + len
+ " path=" + Arrays.toString(components).substring(0, 30) + "...");
} }
this.components = components; this.components = components;
length = components.length; length = components.length;
@ -74,6 +96,12 @@ public class CategoryPath implements Comparable<CategoryPath> {
/** Construct from a given path, separating path components with {@code delimiter}. */ /** Construct from a given path, separating path components with {@code delimiter}. */
public CategoryPath(final String pathString, final char delimiter) { public CategoryPath(final String pathString, final char delimiter) {
if (pathString.length() > MAX_CATEGORY_PATH_LENGTH) {
throw new IllegalArgumentException("category path exceeds maximum allowed path length: max="
+ MAX_CATEGORY_PATH_LENGTH + " len=" + pathString.length()
+ " path=" + pathString.substring(0, 30) + "...");
}
String[] comps = pathString.split(Pattern.quote(Character.toString(delimiter))); String[] comps = pathString.split(Pattern.quote(Character.toString(delimiter)));
if (comps.length == 1 && comps[0].isEmpty()) { if (comps.length == 1 && comps[0].isEmpty()) {
components = null; components = null;

View File

@ -39,14 +39,14 @@ class CategoryPathUtils {
* {@link #serialize(CategoryPath, CharBlockArray)}. * {@link #serialize(CategoryPath, CharBlockArray)}.
*/ */
public static int hashCodeOfSerialized(CharBlockArray charBlockArray, int offset) { public static int hashCodeOfSerialized(CharBlockArray charBlockArray, int offset) {
int length = (short) charBlockArray.charAt(offset++); int length = charBlockArray.charAt(offset++);
if (length == 0) { if (length == 0) {
return 0; return 0;
} }
int hash = length; int hash = length;
for (int i = 0; i < length; i++) { for (int i = 0; i < length; i++) {
int len = (short) charBlockArray.charAt(offset++); int len = charBlockArray.charAt(offset++);
hash = hash * 31 + charBlockArray.subSequence(offset, offset + len).hashCode(); hash = hash * 31 + charBlockArray.subSequence(offset, offset + len).hashCode();
offset += len; offset += len;
} }
@ -67,7 +67,7 @@ class CategoryPathUtils {
} }
for (int i = 0; i < cp.length; i++) { for (int i = 0; i < cp.length; i++) {
int len = (short) charBlockArray.charAt(offset++); int len = charBlockArray.charAt(offset++);
if (len != cp.components[i].length()) { if (len != cp.components[i].length()) {
return false; return false;
} }

View File

@ -3,6 +3,7 @@ package org.apache.lucene.facet.taxonomy;
import java.util.Arrays; import java.util.Arrays;
import org.apache.lucene.facet.FacetTestCase; import org.apache.lucene.facet.FacetTestCase;
import org.apache.lucene.util._TestUtil;
import org.junit.Test; import org.junit.Test;
/* /*
@ -275,4 +276,31 @@ public class TestCategoryPath extends FacetTestCase {
} }
} }
@Test
public void testLongPath() throws Exception {
String bigComp = null;
while (true) {
int len = CategoryPath.MAX_CATEGORY_PATH_LENGTH;
bigComp = _TestUtil.randomSimpleString(random(), len, len);
if (bigComp.indexOf('\u001f') != -1) {
continue;
}
break;
}
try {
assertNotNull(new CategoryPath("dim", bigComp));
fail("long paths should not be allowed; len=" + bigComp.length());
} catch (IllegalArgumentException e) {
// expected
}
try {
assertNotNull(new CategoryPath("dim\u001f" + bigComp, '\u001f'));
fail("long paths should not be allowed; len=" + bigComp.length());
} catch (IllegalArgumentException e) {
// expected
}
}
} }

View File

@ -1,13 +1,19 @@
package org.apache.lucene.facet.taxonomy.directory; package org.apache.lucene.facet.taxonomy.directory;
import java.io.IOException; import java.io.IOException;
import java.util.Collections;
import java.util.HashMap; import java.util.HashMap;
import java.util.Map; import java.util.Map;
import java.util.Random; import java.util.Random;
import java.util.concurrent.ConcurrentHashMap; import java.util.concurrent.ConcurrentHashMap;
import java.util.concurrent.atomic.AtomicInteger; import java.util.concurrent.atomic.AtomicInteger;
import org.apache.lucene.analysis.MockAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.facet.FacetTestCase; import org.apache.lucene.facet.FacetTestCase;
import org.apache.lucene.facet.index.FacetFields;
import org.apache.lucene.facet.params.FacetIndexingParams;
import org.apache.lucene.facet.search.DrillDownQuery;
import org.apache.lucene.facet.taxonomy.CategoryPath; import org.apache.lucene.facet.taxonomy.CategoryPath;
import org.apache.lucene.facet.taxonomy.TaxonomyReader; import org.apache.lucene.facet.taxonomy.TaxonomyReader;
import org.apache.lucene.facet.taxonomy.directory.DirectoryTaxonomyWriter.MemoryOrdinalMap; import org.apache.lucene.facet.taxonomy.directory.DirectoryTaxonomyWriter.MemoryOrdinalMap;
@ -20,8 +26,11 @@ import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig; import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.IndexWriterConfig.OpenMode; import org.apache.lucene.index.IndexWriterConfig.OpenMode;
import org.apache.lucene.index.SegmentInfos; import org.apache.lucene.index.SegmentInfos;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.store.AlreadyClosedException; import org.apache.lucene.store.AlreadyClosedException;
import org.apache.lucene.store.Directory; import org.apache.lucene.store.Directory;
import org.apache.lucene.util.IOUtils;
import org.apache.lucene.util._TestUtil;
import org.junit.Test; import org.junit.Test;
/* /*
@ -413,4 +422,51 @@ public class TestDirectoryTaxonomyWriter extends FacetTestCase {
dir.close(); dir.close();
} }
@Test
public void testHugeLabel() throws Exception {
Directory indexDir = newDirectory(), taxoDir = newDirectory();
IndexWriter indexWriter = new IndexWriter(indexDir, newIndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(random())));
DirectoryTaxonomyWriter taxoWriter = new DirectoryTaxonomyWriter(taxoDir, OpenMode.CREATE, new Cl2oTaxonomyWriterCache(2, 1f, 1));
FacetFields facetFields = new FacetFields(taxoWriter);
// Add one huge label:
String bigs = null;
int ordinal = -1;
CategoryPath cp = null;
while (true) {
int len = CategoryPath.MAX_CATEGORY_PATH_LENGTH - 4; // for the dimension and separator
bigs = _TestUtil.randomSimpleString(random(), len, len);
cp = new CategoryPath("dim", bigs);
ordinal = taxoWriter.addCategory(cp);
Document doc = new Document();
facetFields.addFields(doc, Collections.singletonList(cp));
indexWriter.addDocument(doc);
break;
}
// Add tiny ones to cause a re-hash
for (int i = 0; i < 3; i++) {
String s = _TestUtil.randomSimpleString(random(), 1, 10);
taxoWriter.addCategory(new CategoryPath("dim", s));
Document doc = new Document();
facetFields.addFields(doc, Collections.singletonList(new CategoryPath("dim", s)));
indexWriter.addDocument(doc);
}
// when too large components were allowed to be added, this resulted in a new added category
assertEquals(ordinal, taxoWriter.addCategory(cp));
IOUtils.close(indexWriter, taxoWriter);
DirectoryReader indexReader = DirectoryReader.open(indexDir);
TaxonomyReader taxoReader = new DirectoryTaxonomyReader(taxoDir);
IndexSearcher searcher = new IndexSearcher(indexReader);
DrillDownQuery ddq = new DrillDownQuery(FacetIndexingParams.DEFAULT);
ddq.add(cp);
assertEquals(1, searcher.search(ddq, 10).totalHits);
IOUtils.close(indexReader, taxoReader);
IOUtils.close(indexDir, taxoDir);
}
} }

View File

@ -293,7 +293,11 @@ public class _TestUtil {
} }
public static String randomSimpleString(Random r, int maxLength) { public static String randomSimpleString(Random r, int maxLength) {
final int end = nextInt(r, 0, maxLength); return randomSimpleString(r, 0, maxLength);
}
public static String randomSimpleString(Random r, int minLength, int maxLength) {
final int end = nextInt(r, minLength, maxLength);
if (end == 0) { if (end == 0) {
// allow 0 length // allow 0 length
return ""; return "";
@ -319,7 +323,7 @@ public class _TestUtil {
} }
public static String randomSimpleString(Random r) { public static String randomSimpleString(Random r) {
return randomSimpleString(r, 10); return randomSimpleString(r, 0, 10);
} }
/** Returns random string, including full unicode range. */ /** Returns random string, including full unicode range. */