mirror of https://github.com/apache/lucene.git
LUCENE-5048: limit CategoryPath total length
git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1491562 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent
eef4c01c99
commit
a287431293
|
@ -148,6 +148,12 @@ Bug Fixes
|
||||||
into out-of-memory errors when working with large stored fields.
|
into out-of-memory errors when working with large stored fields.
|
||||||
(Adrien Grand)
|
(Adrien Grand)
|
||||||
|
|
||||||
|
* LUCENE-5048: CategoryPath with a long path could result in hitting
|
||||||
|
NegativeArraySizeException, categories being added multiple times to the
|
||||||
|
taxonomy or drill-down terms silently discarded by the indexer. CategoryPath
|
||||||
|
is now limited to MAX_CATEGORY_PATH_LENGTH characters.
|
||||||
|
(Colton Jamieson, Mike McCandless, Shai Erera)
|
||||||
|
|
||||||
Optimizations
|
Optimizations
|
||||||
|
|
||||||
* LUCENE-4936: Improve numeric doc values compression in case all values share
|
* LUCENE-4936: Improve numeric doc values compression in case all values share
|
||||||
|
|
|
@ -17,6 +17,8 @@ package org.apache.lucene.facet.taxonomy;
|
||||||
* limitations under the License.
|
* limitations under the License.
|
||||||
*/
|
*/
|
||||||
|
|
||||||
|
import static org.apache.lucene.util.ByteBlockPool.BYTE_BLOCK_SIZE;
|
||||||
|
|
||||||
import java.util.Arrays;
|
import java.util.Arrays;
|
||||||
import java.util.regex.Pattern;
|
import java.util.regex.Pattern;
|
||||||
|
|
||||||
|
@ -28,6 +30,18 @@ import java.util.regex.Pattern;
|
||||||
*/
|
*/
|
||||||
public class CategoryPath implements Comparable<CategoryPath> {
|
public class CategoryPath implements Comparable<CategoryPath> {
|
||||||
|
|
||||||
|
/*
|
||||||
|
* copied from DocumentWriterPerThread -- if a CategoryPath is resolved to a
|
||||||
|
* drill-down term which is encoded to a larger term than that length, it is
|
||||||
|
* silently dropped! Therefore we limit the number of characters to MAX/4 to
|
||||||
|
* be on the safe side.
|
||||||
|
*/
|
||||||
|
/**
|
||||||
|
* The maximum number of characters a {@link CategoryPath} can have. That is
|
||||||
|
* {@link CategoryPath#toString(char)} length must not exceed that limit.
|
||||||
|
*/
|
||||||
|
public final static int MAX_CATEGORY_PATH_LENGTH = (BYTE_BLOCK_SIZE - 2) / 4;
|
||||||
|
|
||||||
/** An empty {@link CategoryPath} */
|
/** An empty {@link CategoryPath} */
|
||||||
public static final CategoryPath EMPTY = new CategoryPath();
|
public static final CategoryPath EMPTY = new CategoryPath();
|
||||||
|
|
||||||
|
@ -63,10 +77,18 @@ public class CategoryPath implements Comparable<CategoryPath> {
|
||||||
/** Construct from the given path components. */
|
/** Construct from the given path components. */
|
||||||
public CategoryPath(final String... components) {
|
public CategoryPath(final String... components) {
|
||||||
assert components.length > 0 : "use CategoryPath.EMPTY to create an empty path";
|
assert components.length > 0 : "use CategoryPath.EMPTY to create an empty path";
|
||||||
|
long len = 0;
|
||||||
for (String comp : components) {
|
for (String comp : components) {
|
||||||
if (comp == null || comp.isEmpty()) {
|
if (comp == null || comp.isEmpty()) {
|
||||||
throw new IllegalArgumentException("empty or null components not allowed: " + Arrays.toString(components));
|
throw new IllegalArgumentException("empty or null components not allowed: " + Arrays.toString(components));
|
||||||
}
|
}
|
||||||
|
len += comp.length();
|
||||||
|
}
|
||||||
|
len += components.length - 1; // add separators
|
||||||
|
if (len > MAX_CATEGORY_PATH_LENGTH) {
|
||||||
|
throw new IllegalArgumentException("category path exceeds maximum allowed path length: max="
|
||||||
|
+ MAX_CATEGORY_PATH_LENGTH + " len=" + len
|
||||||
|
+ " path=" + Arrays.toString(components).substring(0, 30) + "...");
|
||||||
}
|
}
|
||||||
this.components = components;
|
this.components = components;
|
||||||
length = components.length;
|
length = components.length;
|
||||||
|
@ -74,6 +96,12 @@ public class CategoryPath implements Comparable<CategoryPath> {
|
||||||
|
|
||||||
/** Construct from a given path, separating path components with {@code delimiter}. */
|
/** Construct from a given path, separating path components with {@code delimiter}. */
|
||||||
public CategoryPath(final String pathString, final char delimiter) {
|
public CategoryPath(final String pathString, final char delimiter) {
|
||||||
|
if (pathString.length() > MAX_CATEGORY_PATH_LENGTH) {
|
||||||
|
throw new IllegalArgumentException("category path exceeds maximum allowed path length: max="
|
||||||
|
+ MAX_CATEGORY_PATH_LENGTH + " len=" + pathString.length()
|
||||||
|
+ " path=" + pathString.substring(0, 30) + "...");
|
||||||
|
}
|
||||||
|
|
||||||
String[] comps = pathString.split(Pattern.quote(Character.toString(delimiter)));
|
String[] comps = pathString.split(Pattern.quote(Character.toString(delimiter)));
|
||||||
if (comps.length == 1 && comps[0].isEmpty()) {
|
if (comps.length == 1 && comps[0].isEmpty()) {
|
||||||
components = null;
|
components = null;
|
||||||
|
|
|
@ -39,14 +39,14 @@ class CategoryPathUtils {
|
||||||
* {@link #serialize(CategoryPath, CharBlockArray)}.
|
* {@link #serialize(CategoryPath, CharBlockArray)}.
|
||||||
*/
|
*/
|
||||||
public static int hashCodeOfSerialized(CharBlockArray charBlockArray, int offset) {
|
public static int hashCodeOfSerialized(CharBlockArray charBlockArray, int offset) {
|
||||||
int length = (short) charBlockArray.charAt(offset++);
|
int length = charBlockArray.charAt(offset++);
|
||||||
if (length == 0) {
|
if (length == 0) {
|
||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
int hash = length;
|
int hash = length;
|
||||||
for (int i = 0; i < length; i++) {
|
for (int i = 0; i < length; i++) {
|
||||||
int len = (short) charBlockArray.charAt(offset++);
|
int len = charBlockArray.charAt(offset++);
|
||||||
hash = hash * 31 + charBlockArray.subSequence(offset, offset + len).hashCode();
|
hash = hash * 31 + charBlockArray.subSequence(offset, offset + len).hashCode();
|
||||||
offset += len;
|
offset += len;
|
||||||
}
|
}
|
||||||
|
@ -67,7 +67,7 @@ class CategoryPathUtils {
|
||||||
}
|
}
|
||||||
|
|
||||||
for (int i = 0; i < cp.length; i++) {
|
for (int i = 0; i < cp.length; i++) {
|
||||||
int len = (short) charBlockArray.charAt(offset++);
|
int len = charBlockArray.charAt(offset++);
|
||||||
if (len != cp.components[i].length()) {
|
if (len != cp.components[i].length()) {
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
|
|
@ -3,6 +3,7 @@ package org.apache.lucene.facet.taxonomy;
|
||||||
import java.util.Arrays;
|
import java.util.Arrays;
|
||||||
|
|
||||||
import org.apache.lucene.facet.FacetTestCase;
|
import org.apache.lucene.facet.FacetTestCase;
|
||||||
|
import org.apache.lucene.util._TestUtil;
|
||||||
import org.junit.Test;
|
import org.junit.Test;
|
||||||
|
|
||||||
/*
|
/*
|
||||||
|
@ -275,4 +276,31 @@ public class TestCategoryPath extends FacetTestCase {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
public void testLongPath() throws Exception {
|
||||||
|
String bigComp = null;
|
||||||
|
while (true) {
|
||||||
|
int len = CategoryPath.MAX_CATEGORY_PATH_LENGTH;
|
||||||
|
bigComp = _TestUtil.randomSimpleString(random(), len, len);
|
||||||
|
if (bigComp.indexOf('\u001f') != -1) {
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
|
||||||
|
try {
|
||||||
|
assertNotNull(new CategoryPath("dim", bigComp));
|
||||||
|
fail("long paths should not be allowed; len=" + bigComp.length());
|
||||||
|
} catch (IllegalArgumentException e) {
|
||||||
|
// expected
|
||||||
|
}
|
||||||
|
|
||||||
|
try {
|
||||||
|
assertNotNull(new CategoryPath("dim\u001f" + bigComp, '\u001f'));
|
||||||
|
fail("long paths should not be allowed; len=" + bigComp.length());
|
||||||
|
} catch (IllegalArgumentException e) {
|
||||||
|
// expected
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
|
@ -1,13 +1,19 @@
|
||||||
package org.apache.lucene.facet.taxonomy.directory;
|
package org.apache.lucene.facet.taxonomy.directory;
|
||||||
|
|
||||||
import java.io.IOException;
|
import java.io.IOException;
|
||||||
|
import java.util.Collections;
|
||||||
import java.util.HashMap;
|
import java.util.HashMap;
|
||||||
import java.util.Map;
|
import java.util.Map;
|
||||||
import java.util.Random;
|
import java.util.Random;
|
||||||
import java.util.concurrent.ConcurrentHashMap;
|
import java.util.concurrent.ConcurrentHashMap;
|
||||||
import java.util.concurrent.atomic.AtomicInteger;
|
import java.util.concurrent.atomic.AtomicInteger;
|
||||||
|
|
||||||
|
import org.apache.lucene.analysis.MockAnalyzer;
|
||||||
|
import org.apache.lucene.document.Document;
|
||||||
import org.apache.lucene.facet.FacetTestCase;
|
import org.apache.lucene.facet.FacetTestCase;
|
||||||
|
import org.apache.lucene.facet.index.FacetFields;
|
||||||
|
import org.apache.lucene.facet.params.FacetIndexingParams;
|
||||||
|
import org.apache.lucene.facet.search.DrillDownQuery;
|
||||||
import org.apache.lucene.facet.taxonomy.CategoryPath;
|
import org.apache.lucene.facet.taxonomy.CategoryPath;
|
||||||
import org.apache.lucene.facet.taxonomy.TaxonomyReader;
|
import org.apache.lucene.facet.taxonomy.TaxonomyReader;
|
||||||
import org.apache.lucene.facet.taxonomy.directory.DirectoryTaxonomyWriter.MemoryOrdinalMap;
|
import org.apache.lucene.facet.taxonomy.directory.DirectoryTaxonomyWriter.MemoryOrdinalMap;
|
||||||
|
@ -20,8 +26,11 @@ import org.apache.lucene.index.IndexWriter;
|
||||||
import org.apache.lucene.index.IndexWriterConfig;
|
import org.apache.lucene.index.IndexWriterConfig;
|
||||||
import org.apache.lucene.index.IndexWriterConfig.OpenMode;
|
import org.apache.lucene.index.IndexWriterConfig.OpenMode;
|
||||||
import org.apache.lucene.index.SegmentInfos;
|
import org.apache.lucene.index.SegmentInfos;
|
||||||
|
import org.apache.lucene.search.IndexSearcher;
|
||||||
import org.apache.lucene.store.AlreadyClosedException;
|
import org.apache.lucene.store.AlreadyClosedException;
|
||||||
import org.apache.lucene.store.Directory;
|
import org.apache.lucene.store.Directory;
|
||||||
|
import org.apache.lucene.util.IOUtils;
|
||||||
|
import org.apache.lucene.util._TestUtil;
|
||||||
import org.junit.Test;
|
import org.junit.Test;
|
||||||
|
|
||||||
/*
|
/*
|
||||||
|
@ -413,4 +422,51 @@ public class TestDirectoryTaxonomyWriter extends FacetTestCase {
|
||||||
dir.close();
|
dir.close();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
public void testHugeLabel() throws Exception {
|
||||||
|
Directory indexDir = newDirectory(), taxoDir = newDirectory();
|
||||||
|
IndexWriter indexWriter = new IndexWriter(indexDir, newIndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(random())));
|
||||||
|
DirectoryTaxonomyWriter taxoWriter = new DirectoryTaxonomyWriter(taxoDir, OpenMode.CREATE, new Cl2oTaxonomyWriterCache(2, 1f, 1));
|
||||||
|
FacetFields facetFields = new FacetFields(taxoWriter);
|
||||||
|
|
||||||
|
// Add one huge label:
|
||||||
|
String bigs = null;
|
||||||
|
int ordinal = -1;
|
||||||
|
CategoryPath cp = null;
|
||||||
|
while (true) {
|
||||||
|
int len = CategoryPath.MAX_CATEGORY_PATH_LENGTH - 4; // for the dimension and separator
|
||||||
|
bigs = _TestUtil.randomSimpleString(random(), len, len);
|
||||||
|
cp = new CategoryPath("dim", bigs);
|
||||||
|
ordinal = taxoWriter.addCategory(cp);
|
||||||
|
Document doc = new Document();
|
||||||
|
facetFields.addFields(doc, Collections.singletonList(cp));
|
||||||
|
indexWriter.addDocument(doc);
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Add tiny ones to cause a re-hash
|
||||||
|
for (int i = 0; i < 3; i++) {
|
||||||
|
String s = _TestUtil.randomSimpleString(random(), 1, 10);
|
||||||
|
taxoWriter.addCategory(new CategoryPath("dim", s));
|
||||||
|
Document doc = new Document();
|
||||||
|
facetFields.addFields(doc, Collections.singletonList(new CategoryPath("dim", s)));
|
||||||
|
indexWriter.addDocument(doc);
|
||||||
|
}
|
||||||
|
|
||||||
|
// when too large components were allowed to be added, this resulted in a new added category
|
||||||
|
assertEquals(ordinal, taxoWriter.addCategory(cp));
|
||||||
|
|
||||||
|
IOUtils.close(indexWriter, taxoWriter);
|
||||||
|
|
||||||
|
DirectoryReader indexReader = DirectoryReader.open(indexDir);
|
||||||
|
TaxonomyReader taxoReader = new DirectoryTaxonomyReader(taxoDir);
|
||||||
|
IndexSearcher searcher = new IndexSearcher(indexReader);
|
||||||
|
DrillDownQuery ddq = new DrillDownQuery(FacetIndexingParams.DEFAULT);
|
||||||
|
ddq.add(cp);
|
||||||
|
assertEquals(1, searcher.search(ddq, 10).totalHits);
|
||||||
|
|
||||||
|
IOUtils.close(indexReader, taxoReader);
|
||||||
|
|
||||||
|
IOUtils.close(indexDir, taxoDir);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -293,7 +293,11 @@ public class _TestUtil {
|
||||||
}
|
}
|
||||||
|
|
||||||
public static String randomSimpleString(Random r, int maxLength) {
|
public static String randomSimpleString(Random r, int maxLength) {
|
||||||
final int end = nextInt(r, 0, maxLength);
|
return randomSimpleString(r, 0, maxLength);
|
||||||
|
}
|
||||||
|
|
||||||
|
public static String randomSimpleString(Random r, int minLength, int maxLength) {
|
||||||
|
final int end = nextInt(r, minLength, maxLength);
|
||||||
if (end == 0) {
|
if (end == 0) {
|
||||||
// allow 0 length
|
// allow 0 length
|
||||||
return "";
|
return "";
|
||||||
|
@ -319,7 +323,7 @@ public class _TestUtil {
|
||||||
}
|
}
|
||||||
|
|
||||||
public static String randomSimpleString(Random r) {
|
public static String randomSimpleString(Random r) {
|
||||||
return randomSimpleString(r, 10);
|
return randomSimpleString(r, 0, 10);
|
||||||
}
|
}
|
||||||
|
|
||||||
/** Returns random string, including full unicode range. */
|
/** Returns random string, including full unicode range. */
|
||||||
|
|
Loading…
Reference in New Issue