mirror of https://github.com/apache/lucene.git
LUCENE-5048: limit CategoryPath total length
git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1491562 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent
eef4c01c99
commit
a287431293
|
@ -148,6 +148,12 @@ Bug Fixes
|
|||
into out-of-memory errors when working with large stored fields.
|
||||
(Adrien Grand)
|
||||
|
||||
* LUCENE-5048: CategoryPath with a long path could result in hitting
|
||||
NegativeArraySizeException, categories being added multiple times to the
|
||||
taxonomy or drill-down terms silently discarded by the indexer. CategoryPath
|
||||
is now limited to MAX_CATEGORY_PATH_LENGTH characters.
|
||||
(Colton Jamieson, Mike McCandless, Shai Erera)
|
||||
|
||||
Optimizations
|
||||
|
||||
* LUCENE-4936: Improve numeric doc values compression in case all values share
|
||||
|
|
|
@ -17,6 +17,8 @@ package org.apache.lucene.facet.taxonomy;
|
|||
* limitations under the License.
|
||||
*/
|
||||
|
||||
import static org.apache.lucene.util.ByteBlockPool.BYTE_BLOCK_SIZE;
|
||||
|
||||
import java.util.Arrays;
|
||||
import java.util.regex.Pattern;
|
||||
|
||||
|
@ -28,6 +30,18 @@ import java.util.regex.Pattern;
|
|||
*/
|
||||
public class CategoryPath implements Comparable<CategoryPath> {
|
||||
|
||||
/*
|
||||
* copied from DocumentWriterPerThread -- if a CategoryPath is resolved to a
|
||||
* drill-down term which is encoded to a larger term than that length, it is
|
||||
* silently dropped! Therefore we limit the number of characters to MAX/4 to
|
||||
* be on the safe side.
|
||||
*/
|
||||
/**
|
||||
* The maximum number of characters a {@link CategoryPath} can have. That is
|
||||
* {@link CategoryPath#toString(char)} length must not exceed that limit.
|
||||
*/
|
||||
public final static int MAX_CATEGORY_PATH_LENGTH = (BYTE_BLOCK_SIZE - 2) / 4;
|
||||
|
||||
/** An empty {@link CategoryPath} */
|
||||
public static final CategoryPath EMPTY = new CategoryPath();
|
||||
|
||||
|
@ -63,10 +77,18 @@ public class CategoryPath implements Comparable<CategoryPath> {
|
|||
/** Construct from the given path components. */
|
||||
public CategoryPath(final String... components) {
|
||||
assert components.length > 0 : "use CategoryPath.EMPTY to create an empty path";
|
||||
long len = 0;
|
||||
for (String comp : components) {
|
||||
if (comp == null || comp.isEmpty()) {
|
||||
throw new IllegalArgumentException("empty or null components not allowed: " + Arrays.toString(components));
|
||||
}
|
||||
len += comp.length();
|
||||
}
|
||||
len += components.length - 1; // add separators
|
||||
if (len > MAX_CATEGORY_PATH_LENGTH) {
|
||||
throw new IllegalArgumentException("category path exceeds maximum allowed path length: max="
|
||||
+ MAX_CATEGORY_PATH_LENGTH + " len=" + len
|
||||
+ " path=" + Arrays.toString(components).substring(0, 30) + "...");
|
||||
}
|
||||
this.components = components;
|
||||
length = components.length;
|
||||
|
@ -74,6 +96,12 @@ public class CategoryPath implements Comparable<CategoryPath> {
|
|||
|
||||
/** Construct from a given path, separating path components with {@code delimiter}. */
|
||||
public CategoryPath(final String pathString, final char delimiter) {
|
||||
if (pathString.length() > MAX_CATEGORY_PATH_LENGTH) {
|
||||
throw new IllegalArgumentException("category path exceeds maximum allowed path length: max="
|
||||
+ MAX_CATEGORY_PATH_LENGTH + " len=" + pathString.length()
|
||||
+ " path=" + pathString.substring(0, 30) + "...");
|
||||
}
|
||||
|
||||
String[] comps = pathString.split(Pattern.quote(Character.toString(delimiter)));
|
||||
if (comps.length == 1 && comps[0].isEmpty()) {
|
||||
components = null;
|
||||
|
|
|
@ -39,14 +39,14 @@ class CategoryPathUtils {
|
|||
* {@link #serialize(CategoryPath, CharBlockArray)}.
|
||||
*/
|
||||
public static int hashCodeOfSerialized(CharBlockArray charBlockArray, int offset) {
|
||||
int length = (short) charBlockArray.charAt(offset++);
|
||||
int length = charBlockArray.charAt(offset++);
|
||||
if (length == 0) {
|
||||
return 0;
|
||||
}
|
||||
|
||||
int hash = length;
|
||||
for (int i = 0; i < length; i++) {
|
||||
int len = (short) charBlockArray.charAt(offset++);
|
||||
int len = charBlockArray.charAt(offset++);
|
||||
hash = hash * 31 + charBlockArray.subSequence(offset, offset + len).hashCode();
|
||||
offset += len;
|
||||
}
|
||||
|
@ -67,7 +67,7 @@ class CategoryPathUtils {
|
|||
}
|
||||
|
||||
for (int i = 0; i < cp.length; i++) {
|
||||
int len = (short) charBlockArray.charAt(offset++);
|
||||
int len = charBlockArray.charAt(offset++);
|
||||
if (len != cp.components[i].length()) {
|
||||
return false;
|
||||
}
|
||||
|
|
|
@ -3,6 +3,7 @@ package org.apache.lucene.facet.taxonomy;
|
|||
import java.util.Arrays;
|
||||
|
||||
import org.apache.lucene.facet.FacetTestCase;
|
||||
import org.apache.lucene.util._TestUtil;
|
||||
import org.junit.Test;
|
||||
|
||||
/*
|
||||
|
@ -274,5 +275,32 @@ public class TestCategoryPath extends FacetTestCase {
|
|||
// expected
|
||||
}
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testLongPath() throws Exception {
|
||||
String bigComp = null;
|
||||
while (true) {
|
||||
int len = CategoryPath.MAX_CATEGORY_PATH_LENGTH;
|
||||
bigComp = _TestUtil.randomSimpleString(random(), len, len);
|
||||
if (bigComp.indexOf('\u001f') != -1) {
|
||||
continue;
|
||||
}
|
||||
break;
|
||||
}
|
||||
|
||||
try {
|
||||
assertNotNull(new CategoryPath("dim", bigComp));
|
||||
fail("long paths should not be allowed; len=" + bigComp.length());
|
||||
} catch (IllegalArgumentException e) {
|
||||
// expected
|
||||
}
|
||||
|
||||
try {
|
||||
assertNotNull(new CategoryPath("dim\u001f" + bigComp, '\u001f'));
|
||||
fail("long paths should not be allowed; len=" + bigComp.length());
|
||||
} catch (IllegalArgumentException e) {
|
||||
// expected
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
|
|
|
@ -1,13 +1,19 @@
|
|||
package org.apache.lucene.facet.taxonomy.directory;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.util.Collections;
|
||||
import java.util.HashMap;
|
||||
import java.util.Map;
|
||||
import java.util.Random;
|
||||
import java.util.concurrent.ConcurrentHashMap;
|
||||
import java.util.concurrent.atomic.AtomicInteger;
|
||||
|
||||
import org.apache.lucene.analysis.MockAnalyzer;
|
||||
import org.apache.lucene.document.Document;
|
||||
import org.apache.lucene.facet.FacetTestCase;
|
||||
import org.apache.lucene.facet.index.FacetFields;
|
||||
import org.apache.lucene.facet.params.FacetIndexingParams;
|
||||
import org.apache.lucene.facet.search.DrillDownQuery;
|
||||
import org.apache.lucene.facet.taxonomy.CategoryPath;
|
||||
import org.apache.lucene.facet.taxonomy.TaxonomyReader;
|
||||
import org.apache.lucene.facet.taxonomy.directory.DirectoryTaxonomyWriter.MemoryOrdinalMap;
|
||||
|
@ -20,8 +26,11 @@ import org.apache.lucene.index.IndexWriter;
|
|||
import org.apache.lucene.index.IndexWriterConfig;
|
||||
import org.apache.lucene.index.IndexWriterConfig.OpenMode;
|
||||
import org.apache.lucene.index.SegmentInfos;
|
||||
import org.apache.lucene.search.IndexSearcher;
|
||||
import org.apache.lucene.store.AlreadyClosedException;
|
||||
import org.apache.lucene.store.Directory;
|
||||
import org.apache.lucene.util.IOUtils;
|
||||
import org.apache.lucene.util._TestUtil;
|
||||
import org.junit.Test;
|
||||
|
||||
/*
|
||||
|
@ -412,5 +421,52 @@ public class TestDirectoryTaxonomyWriter extends FacetTestCase {
|
|||
taxoWriter.close();
|
||||
dir.close();
|
||||
}
|
||||
|
||||
|
||||
@Test
|
||||
public void testHugeLabel() throws Exception {
|
||||
Directory indexDir = newDirectory(), taxoDir = newDirectory();
|
||||
IndexWriter indexWriter = new IndexWriter(indexDir, newIndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(random())));
|
||||
DirectoryTaxonomyWriter taxoWriter = new DirectoryTaxonomyWriter(taxoDir, OpenMode.CREATE, new Cl2oTaxonomyWriterCache(2, 1f, 1));
|
||||
FacetFields facetFields = new FacetFields(taxoWriter);
|
||||
|
||||
// Add one huge label:
|
||||
String bigs = null;
|
||||
int ordinal = -1;
|
||||
CategoryPath cp = null;
|
||||
while (true) {
|
||||
int len = CategoryPath.MAX_CATEGORY_PATH_LENGTH - 4; // for the dimension and separator
|
||||
bigs = _TestUtil.randomSimpleString(random(), len, len);
|
||||
cp = new CategoryPath("dim", bigs);
|
||||
ordinal = taxoWriter.addCategory(cp);
|
||||
Document doc = new Document();
|
||||
facetFields.addFields(doc, Collections.singletonList(cp));
|
||||
indexWriter.addDocument(doc);
|
||||
break;
|
||||
}
|
||||
|
||||
// Add tiny ones to cause a re-hash
|
||||
for (int i = 0; i < 3; i++) {
|
||||
String s = _TestUtil.randomSimpleString(random(), 1, 10);
|
||||
taxoWriter.addCategory(new CategoryPath("dim", s));
|
||||
Document doc = new Document();
|
||||
facetFields.addFields(doc, Collections.singletonList(new CategoryPath("dim", s)));
|
||||
indexWriter.addDocument(doc);
|
||||
}
|
||||
|
||||
// when too large components were allowed to be added, this resulted in a new added category
|
||||
assertEquals(ordinal, taxoWriter.addCategory(cp));
|
||||
|
||||
IOUtils.close(indexWriter, taxoWriter);
|
||||
|
||||
DirectoryReader indexReader = DirectoryReader.open(indexDir);
|
||||
TaxonomyReader taxoReader = new DirectoryTaxonomyReader(taxoDir);
|
||||
IndexSearcher searcher = new IndexSearcher(indexReader);
|
||||
DrillDownQuery ddq = new DrillDownQuery(FacetIndexingParams.DEFAULT);
|
||||
ddq.add(cp);
|
||||
assertEquals(1, searcher.search(ddq, 10).totalHits);
|
||||
|
||||
IOUtils.close(indexReader, taxoReader);
|
||||
|
||||
IOUtils.close(indexDir, taxoDir);
|
||||
}
|
||||
}
|
||||
|
|
|
@ -293,7 +293,11 @@ public class _TestUtil {
|
|||
}
|
||||
|
||||
public static String randomSimpleString(Random r, int maxLength) {
|
||||
final int end = nextInt(r, 0, maxLength);
|
||||
return randomSimpleString(r, 0, maxLength);
|
||||
}
|
||||
|
||||
public static String randomSimpleString(Random r, int minLength, int maxLength) {
|
||||
final int end = nextInt(r, minLength, maxLength);
|
||||
if (end == 0) {
|
||||
// allow 0 length
|
||||
return "";
|
||||
|
@ -319,7 +323,7 @@ public class _TestUtil {
|
|||
}
|
||||
|
||||
public static String randomSimpleString(Random r) {
|
||||
return randomSimpleString(r, 10);
|
||||
return randomSimpleString(r, 0, 10);
|
||||
}
|
||||
|
||||
/** Returns random string, including full unicode range. */
|
||||
|
|
Loading…
Reference in New Issue