diff --git a/build.xml b/build.xml index 636e844242a..686dec86dfd 100644 --- a/build.xml +++ b/build.xml @@ -281,18 +281,9 @@ - - - - - - - - diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt index 770155943df..9ac8f0c3e65 100644 --- a/lucene/CHANGES.txt +++ b/lucene/CHANGES.txt @@ -106,7 +106,15 @@ Changes in backwards compatibility policy * LUCENE-4659: Massive cleanup to CategoryPath API. Additionally, CategoryPath is now immutable, so you don't need to clone() it. (Shai Erera) - + +* LUCENE-4670: StoredFieldsWriter and TermVectorsWriter have new finish* callbacks + which are called after a doc/field/term has been completely added. + (Adrien Grand, Robert Muir) + +* LUCENE-4620: IntEncoder/Decoder were changed to do bulk encoding/decoding. As a + result, few other classes such as Aggregator and CategoryListIterator were + changed to handle bulk category ordinals. (Shai Erera) + New Features * LUCENE-4226: New experimental StoredFieldsFormat that compresses chunks of @@ -324,6 +332,8 @@ Bug Fixes * LUCENE-4662: Add missing elided articles and prepositions to FrenchAnalyzer's DEFAULT_ARTICLES list passed to ElisionFilter. (David Leunen via Steve Rowe) + +* LUCENE-4671: Fix CharsRef.subSequence method. (Tim Smith via Robert Muir) Changes in Runtime Behavior diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/core/TestRandomChains.java b/lucene/analysis/common/src/test/org/apache/lucene/analysis/core/TestRandomChains.java index 3bdfdd411bf..b100b8d6752 100644 --- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/core/TestRandomChains.java +++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/core/TestRandomChains.java @@ -34,6 +34,7 @@ import java.util.Collection; import java.util.Collections; import java.util.Comparator; import java.util.Enumeration; +import java.util.HashMap; import java.util.HashSet; import java.util.IdentityHashMap; import java.util.List; @@ -66,6 +67,8 @@ import org.apache.lucene.analysis.compound.hyphenation.HyphenationTree; import org.apache.lucene.analysis.hunspell.HunspellDictionary; import org.apache.lucene.analysis.hunspell.HunspellDictionaryTest; import org.apache.lucene.analysis.miscellaneous.HyphenatedWordsFilter; +import org.apache.lucene.analysis.miscellaneous.KeepWordFilter; +import org.apache.lucene.analysis.miscellaneous.LengthFilter; import org.apache.lucene.analysis.miscellaneous.LimitTokenCountFilter; import org.apache.lucene.analysis.miscellaneous.TrimFilter; import org.apache.lucene.analysis.miscellaneous.WordDelimiterFilter; @@ -103,67 +106,145 @@ public class TestRandomChains extends BaseTokenStreamTestCase { static List> tokenfilters; static List> charfilters; - // TODO: fix those and remove - private static final Set> brokenComponents = Collections.newSetFromMap(new IdentityHashMap,Boolean>()); + private static interface Predicate { + boolean apply(T o); + } + + private static final Predicate ALWAYS = new Predicate() { + public boolean apply(Object[] args) { + return true; + }; + }; + + private static final Map,Predicate> brokenConstructors = new HashMap, Predicate>(); static { - // TODO: can we promote some of these to be only - // offsets offenders? - Collections.>addAll(brokenComponents, - // doesn't actual reset itself! - CachingTokenFilter.class, - // doesn't consume whole stream! 
- LimitTokenCountFilter.class, - // Not broken: we forcefully add this, so we shouldn't - // also randomly pick it: - ValidatingTokenFilter.class, - // NOTE: these by themselves won't cause any 'basic assertions' to fail. - // but see https://issues.apache.org/jira/browse/LUCENE-3920, if any - // tokenfilter that combines words (e.g. shingles) comes after them, - // this will create bogus offsets because their 'offsets go backwards', - // causing shingle or whatever to make a single token with a - // startOffset thats > its endOffset - // (see LUCENE-3738 for a list of other offenders here) - // broken! - NGramTokenizer.class, - // broken! - NGramTokenFilter.class, - // broken! - EdgeNGramTokenizer.class, - // broken! - EdgeNGramTokenFilter.class, - // broken! - WordDelimiterFilter.class, - // broken! - TrimFilter.class - ); + try { + brokenConstructors.put( + LimitTokenCountFilter.class.getConstructor(TokenStream.class, int.class), + ALWAYS); + brokenConstructors.put( + LimitTokenCountFilter.class.getConstructor(TokenStream.class, int.class, boolean.class), + new Predicate() { + @Override + public boolean apply(Object[] args) { + assert args.length == 3; + return !((Boolean) args[2]); // args are broken if consumeAllTokens is false + } + }); + for (Class c : Arrays.>asList( + // TODO: can we promote some of these to be only + // offsets offenders? + // doesn't actual reset itself! + CachingTokenFilter.class, + // Not broken: we forcefully add this, so we shouldn't + // also randomly pick it: + ValidatingTokenFilter.class, + // NOTE: these by themselves won't cause any 'basic assertions' to fail. + // but see https://issues.apache.org/jira/browse/LUCENE-3920, if any + // tokenfilter that combines words (e.g. shingles) comes after them, + // this will create bogus offsets because their 'offsets go backwards', + // causing shingle or whatever to make a single token with a + // startOffset thats > its endOffset + // (see LUCENE-3738 for a list of other offenders here) + // broken! + NGramTokenizer.class, + // broken! + NGramTokenFilter.class, + // broken! + EdgeNGramTokenizer.class, + // broken! + EdgeNGramTokenFilter.class, + // broken! + WordDelimiterFilter.class)) { + for (Constructor ctor : c.getConstructors()) { + brokenConstructors.put(ctor, ALWAYS); + } + } + } catch (Exception e) { + throw new Error(e); + } } // TODO: also fix these and remove (maybe): - // Classes that don't produce consistent graph offsets: - private static final Set> brokenOffsetsComponents = Collections.newSetFromMap(new IdentityHashMap,Boolean>()); + // Classes/options that don't produce consistent graph offsets: + private static final Map,Predicate> brokenOffsetsConstructors = new HashMap, Predicate>(); static { - Collections.>addAll(brokenOffsetsComponents, - ReversePathHierarchyTokenizer.class, - PathHierarchyTokenizer.class, - HyphenationCompoundWordTokenFilter.class, - DictionaryCompoundWordTokenFilter.class, - // TODO: corrumpts graphs (offset consistency check): - PositionFilter.class, - // TODO: it seems to mess up offsets!? - WikipediaTokenizer.class, - // TODO: doesn't handle graph inputs - ThaiWordFilter.class, - // TODO: doesn't handle graph inputs - CJKBigramFilter.class, - // TODO: doesn't handle graph inputs (or even look at positionIncrement) - HyphenatedWordsFilter.class, - // LUCENE-4065: only if you pass 'false' to enablePositionIncrements! 
- TypeTokenFilter.class, - // TODO: doesn't handle graph inputs - CommonGramsQueryFilter.class - ); + try { + brokenOffsetsConstructors.put( + TrimFilter.class.getConstructor(TokenStream.class, boolean.class), + new Predicate() { + @Override + public boolean apply(Object[] args) { + assert args.length == 2; + return (Boolean) args[1]; // args are broken if updateOffsets is true + } + }); + brokenOffsetsConstructors.put( + TypeTokenFilter.class.getConstructor(boolean.class, TokenStream.class, Set.class, boolean.class), + new Predicate() { + @Override + public boolean apply(Object[] args) { + assert args.length == 4; + // LUCENE-4065: only if you pass 'false' to enablePositionIncrements! + return !(Boolean) args[0]; + } + }); + brokenOffsetsConstructors.put( + TypeTokenFilter.class.getConstructor(boolean.class, TokenStream.class, Set.class), + new Predicate() { + @Override + public boolean apply(Object[] args) { + assert args.length == 3; + // LUCENE-4065: only if you pass 'false' to enablePositionIncrements! + return !(Boolean) args[0]; + } + }); + brokenOffsetsConstructors.put( + LengthFilter.class.getConstructor(boolean.class, TokenStream.class, int.class, int.class), + new Predicate() { + @Override + public boolean apply(Object[] args) { + assert args.length == 4; + // LUCENE-4065: only if you pass 'false' to enablePositionIncrements! + return !(Boolean) args[0]; + } + }); + brokenOffsetsConstructors.put( + KeepWordFilter.class.getConstructor(boolean.class, TokenStream.class, CharArraySet.class), + new Predicate() { + @Override + public boolean apply(Object[] args) { + assert args.length == 3; + // LUCENE-4065: only if you pass 'false' to enablePositionIncrements! + return !(Boolean) args[0]; + } + }); + for (Class c : Arrays.>asList( + ReversePathHierarchyTokenizer.class, + PathHierarchyTokenizer.class, + HyphenationCompoundWordTokenFilter.class, + DictionaryCompoundWordTokenFilter.class, + // TODO: corrumpts graphs (offset consistency check): + PositionFilter.class, + // TODO: it seems to mess up offsets!? 
+ WikipediaTokenizer.class, + // TODO: doesn't handle graph inputs + ThaiWordFilter.class, + // TODO: doesn't handle graph inputs + CJKBigramFilter.class, + // TODO: doesn't handle graph inputs (or even look at positionIncrement) + HyphenatedWordsFilter.class, + // TODO: doesn't handle graph inputs + CommonGramsQueryFilter.class)) { + for (Constructor ctor : c.getConstructors()) { + brokenOffsetsConstructors.put(ctor, ALWAYS); + } + } + } catch (Exception e) { + throw new Error(e); + } } - + @BeforeClass public static void beforeClass() throws Exception { List> analysisClasses = getClassesForPackage("org.apache.lucene.analysis"); @@ -176,7 +257,6 @@ public class TestRandomChains extends BaseTokenStreamTestCase { // don't waste time with abstract classes or deprecated known-buggy ones Modifier.isAbstract(modifiers) || !Modifier.isPublic(modifiers) || c.isSynthetic() || c.isAnonymousClass() || c.isMemberClass() || c.isInterface() - || brokenComponents.contains(c) || c.isAnnotationPresent(Deprecated.class) || !(Tokenizer.class.isAssignableFrom(c) || TokenFilter.class.isAssignableFrom(c) || CharFilter.class.isAssignableFrom(c)) ) { @@ -185,7 +265,7 @@ public class TestRandomChains extends BaseTokenStreamTestCase { for (final Constructor ctor : c.getConstructors()) { // don't test synthetic or deprecated ctors, they likely have known bugs: - if (ctor.isSynthetic() || ctor.isAnnotationPresent(Deprecated.class)) { + if (ctor.isSynthetic() || ctor.isAnnotationPresent(Deprecated.class) || brokenConstructors.get(ctor) == ALWAYS) { continue; } if (Tokenizer.class.isAssignableFrom(c)) { @@ -679,7 +759,17 @@ public class TestRandomChains extends BaseTokenStreamTestCase { } return null; // no success } - + + private boolean broken(Constructor ctor, Object[] args) { + final Predicate pred = brokenConstructors.get(ctor); + return pred != null && pred.apply(args); + } + + private boolean brokenOffsets(Constructor ctor, Object[] args) { + final Predicate pred = brokenOffsetsConstructors.get(ctor); + return pred != null && pred.apply(args); + } + // create a new random tokenizer from classpath private TokenizerSpec newTokenizer(Random random, Reader reader) { TokenizerSpec spec = new TokenizerSpec(); @@ -688,11 +778,12 @@ public class TestRandomChains extends BaseTokenStreamTestCase { final StringBuilder descr = new StringBuilder(); final CheckThatYouDidntReadAnythingReaderWrapper wrapper = new CheckThatYouDidntReadAnythingReaderWrapper(reader); final Object args[] = newTokenizerArgs(random, wrapper, ctor.getParameterTypes()); + if (broken(ctor, args)) { + continue; + } spec.tokenizer = createComponent(ctor, args, descr); if (spec.tokenizer != null) { - if (brokenOffsetsComponents.contains(ctor.getDeclaringClass())) { - spec.offsetsAreCorrect = false; - } + spec.offsetsAreCorrect &= !brokenOffsets(ctor, args); spec.toString = descr.toString(); } else { assertFalse(ctor.getDeclaringClass().getName() + " has read something in ctor but failed with UOE/IAE", wrapper.readSomething); @@ -710,6 +801,9 @@ public class TestRandomChains extends BaseTokenStreamTestCase { while (true) { final Constructor ctor = charfilters.get(random.nextInt(charfilters.size())); final Object args[] = newCharFilterArgs(random, spec.reader, ctor.getParameterTypes()); + if (broken(ctor, args)) { + continue; + } reader = createComponent(ctor, args, descr); if (reader != null) { spec.reader = reader; @@ -746,11 +840,12 @@ public class TestRandomChains extends BaseTokenStreamTestCase { } final Object args[] = newFilterArgs(random, 
spec.stream, ctor.getParameterTypes()); + if (broken(ctor, args)) { + continue; + } final TokenFilter flt = createComponent(ctor, args, descr); if (flt != null) { - if (brokenOffsetsComponents.contains(ctor.getDeclaringClass())) { - spec.offsetsAreCorrect = false; - } + spec.offsetsAreCorrect &= !brokenOffsets(ctor, args); spec.stream = flt; break; } diff --git a/lucene/analysis/kuromoji/src/tools/java/org/apache/lucene/analysis/ja/util/TokenInfoDictionaryBuilder.java b/lucene/analysis/kuromoji/src/tools/java/org/apache/lucene/analysis/ja/util/TokenInfoDictionaryBuilder.java index 569908da694..2df460dd023 100644 --- a/lucene/analysis/kuromoji/src/tools/java/org/apache/lucene/analysis/ja/util/TokenInfoDictionaryBuilder.java +++ b/lucene/analysis/kuromoji/src/tools/java/org/apache/lucene/analysis/ja/util/TokenInfoDictionaryBuilder.java @@ -132,7 +132,7 @@ public class TokenInfoDictionaryBuilder { System.out.println(" encode..."); PositiveIntOutputs fstOutput = PositiveIntOutputs.getSingleton(true); - Builder fstBuilder = new Builder(FST.INPUT_TYPE.BYTE2, 0, 0, true, true, Integer.MAX_VALUE, fstOutput, null, true); + Builder fstBuilder = new Builder(FST.INPUT_TYPE.BYTE2, 0, 0, true, true, Integer.MAX_VALUE, fstOutput, null, true, true); IntsRef scratch = new IntsRef(); long ord = -1; // first ord will be 0 String lastValue = null; diff --git a/lucene/build.xml b/lucene/build.xml index 71dd7305ad9..75c6b788f50 100644 --- a/lucene/build.xml +++ b/lucene/build.xml @@ -458,7 +458,20 @@ - + + + + + + + + + + + + + diff --git a/lucene/classification/build.xml b/lucene/classification/build.xml index a0cd03b0848..39cc28d3753 100644 --- a/lucene/classification/build.xml +++ b/lucene/classification/build.xml @@ -24,23 +24,25 @@ - - + + + + - - - - - + + + + + diff --git a/lucene/codecs/src/java/org/apache/lucene/codecs/memory/MemoryPostingsFormat.java b/lucene/codecs/src/java/org/apache/lucene/codecs/memory/MemoryPostingsFormat.java index 4e61dad538f..f35fb6c416b 100644 --- a/lucene/codecs/src/java/org/apache/lucene/codecs/memory/MemoryPostingsFormat.java +++ b/lucene/codecs/src/java/org/apache/lucene/codecs/memory/MemoryPostingsFormat.java @@ -113,7 +113,7 @@ public final class MemoryPostingsFormat extends PostingsFormat { this.field = field; this.doPackFST = doPackFST; this.acceptableOverheadRatio = acceptableOverheadRatio; - builder = new Builder(FST.INPUT_TYPE.BYTE1, 0, 0, true, true, Integer.MAX_VALUE, outputs, null, doPackFST, acceptableOverheadRatio); + builder = new Builder(FST.INPUT_TYPE.BYTE1, 0, 0, true, true, Integer.MAX_VALUE, outputs, null, doPackFST, acceptableOverheadRatio, true); } private class PostingsWriter extends PostingsConsumer { diff --git a/lucene/core/src/java/org/apache/lucene/codecs/BlockTreeTermsWriter.java b/lucene/core/src/java/org/apache/lucene/codecs/BlockTreeTermsWriter.java index 39ced1d8e44..e80c9028be6 100644 --- a/lucene/core/src/java/org/apache/lucene/codecs/BlockTreeTermsWriter.java +++ b/lucene/core/src/java/org/apache/lucene/codecs/BlockTreeTermsWriter.java @@ -419,7 +419,7 @@ public class BlockTreeTermsWriter extends FieldsConsumer { final ByteSequenceOutputs outputs = ByteSequenceOutputs.getSingleton(); final Builder indexBuilder = new Builder(FST.INPUT_TYPE.BYTE1, 0, 0, true, false, Integer.MAX_VALUE, - outputs, null, false); + outputs, null, false, true); //if (DEBUG) { // System.out.println(" compile index for prefix=" + prefix); //} @@ -962,7 +962,7 @@ public class BlockTreeTermsWriter extends FieldsConsumer { 0, 0, true, true, Integer.MAX_VALUE, 
noOutputs, - new FindBlocks(), false); + new FindBlocks(), false, true); postingsWriter.setField(fieldInfo); } diff --git a/lucene/core/src/java/org/apache/lucene/codecs/StoredFieldsWriter.java b/lucene/core/src/java/org/apache/lucene/codecs/StoredFieldsWriter.java index 5babf7b10a9..5a32aae79eb 100644 --- a/lucene/core/src/java/org/apache/lucene/codecs/StoredFieldsWriter.java +++ b/lucene/core/src/java/org/apache/lucene/codecs/StoredFieldsWriter.java @@ -55,7 +55,10 @@ public abstract class StoredFieldsWriter implements Closeable { * called even if the document has no stored fields, in * this case numStoredFields will be zero. */ public abstract void startDocument(int numStoredFields) throws IOException; - + + /** Called when a document and all its fields have been added. */ + public void finishDocument() throws IOException {} + /** Writes a single stored field. */ public abstract void writeField(FieldInfo info, StorableField field) throws IOException; @@ -116,6 +119,8 @@ public abstract class StoredFieldsWriter implements Closeable { for (StorableField field : doc) { writeField(fieldInfos.fieldInfo(field.name()), field); } + + finishDocument(); } @Override diff --git a/lucene/core/src/java/org/apache/lucene/codecs/TermVectorsWriter.java b/lucene/core/src/java/org/apache/lucene/codecs/TermVectorsWriter.java index f6fcb86712a..c15be770d8c 100644 --- a/lucene/core/src/java/org/apache/lucene/codecs/TermVectorsWriter.java +++ b/lucene/core/src/java/org/apache/lucene/codecs/TermVectorsWriter.java @@ -71,18 +71,27 @@ public abstract class TermVectorsWriter implements Closeable { * has no vector fields, in this case numVectorFields * will be zero. */ public abstract void startDocument(int numVectorFields) throws IOException; - + + /** Called after a doc and all its fields have been added. */ + public void finishDocument() throws IOException {}; + /** Called before writing the terms of the field. * {@link #startTerm(BytesRef, int)} will be called numTerms times. */ public abstract void startField(FieldInfo info, int numTerms, boolean positions, boolean offsets, boolean payloads) throws IOException; - + + /** Called after a field and all its terms have been added. */ + public void finishField() throws IOException {}; + /** Adds a term and its term frequency freq. * If this field has positions and/or offsets enabled, then * {@link #addPosition(int, int, int, BytesRef)} will be called * freq times respectively. */ public abstract void startTerm(BytesRef term, int freq) throws IOException; - + + /** Called after a term and all its positions have been added. */ + public void finishTerm() throws IOException {} + /** Adds a term position and offsets */ public abstract void addPosition(int position, int startOffset, int endOffset, BytesRef payload) throws IOException; @@ -97,7 +106,7 @@ public abstract class TermVectorsWriter implements Closeable { * check that this is the case to detect the JRE bug described * in LUCENE-1282. */ public abstract void finish(FieldInfos fis, int numDocs) throws IOException; - + /** * Called by IndexWriter when writing new segments. *

@@ -197,6 +206,7 @@ public abstract class TermVectorsWriter implements Closeable { protected final void addAllDocVectors(Fields vectors, MergeState mergeState) throws IOException { if (vectors == null) { startDocument(0); + finishDocument(); return; } @@ -275,10 +285,13 @@ public abstract class TermVectorsWriter implements Closeable { addPosition(pos, startOffset, endOffset, payload); } } + finishTerm(); } assert termCount == numTerms; + finishField(); } assert fieldCount == numFields; + finishDocument(); } /** Return the BytesRef Comparator used to sort terms diff --git a/lucene/core/src/java/org/apache/lucene/codecs/compressing/CompressingStoredFieldsReader.java b/lucene/core/src/java/org/apache/lucene/codecs/compressing/CompressingStoredFieldsReader.java index 797cdb7b6fc..7d06240281a 100644 --- a/lucene/core/src/java/org/apache/lucene/codecs/compressing/CompressingStoredFieldsReader.java +++ b/lucene/core/src/java/org/apache/lucene/codecs/compressing/CompressingStoredFieldsReader.java @@ -395,8 +395,10 @@ public final class CompressingStoredFieldsReader extends StoredFieldsReader { * Copy compressed data. */ void copyCompressedData(DataOutput out) throws IOException { - final int chunkSize = chunkSize(); - decompressor.copyCompressedData(fieldsStream, chunkSize, out); + final long chunkEnd = docBase + chunkDocs == numDocs + ? fieldsStream.length() + : indexReader.getStartPointer(docBase + chunkDocs); + out.copyBytes(fieldsStream, chunkEnd - fieldsStream.getFilePointer()); } } diff --git a/lucene/core/src/java/org/apache/lucene/codecs/compressing/CompressingStoredFieldsWriter.java b/lucene/core/src/java/org/apache/lucene/codecs/compressing/CompressingStoredFieldsWriter.java index d1bf861e9ea..e0498ed2980 100644 --- a/lucene/core/src/java/org/apache/lucene/codecs/compressing/CompressingStoredFieldsWriter.java +++ b/lucene/core/src/java/org/apache/lucene/codecs/compressing/CompressingStoredFieldsWriter.java @@ -136,19 +136,8 @@ public final class CompressingStoredFieldsWriter extends StoredFieldsWriter { } } - private void endWithPreviousDocument() throws IOException { - if (numBufferedDocs > 0) { - endOffsets[numBufferedDocs - 1] = bufferedDocs.length; - } - } - @Override public void startDocument(int numStoredFields) throws IOException { - endWithPreviousDocument(); - if (triggerFlush()) { - flush(); - } - if (numBufferedDocs == this.numStoredFields.length) { final int newLength = ArrayUtil.oversize(numBufferedDocs + 1, 4); this.numStoredFields = Arrays.copyOf(this.numStoredFields, newLength); @@ -158,6 +147,14 @@ public final class CompressingStoredFieldsWriter extends StoredFieldsWriter { ++numBufferedDocs; } + @Override + public void finishDocument() throws IOException { + endOffsets[numBufferedDocs - 1] = bufferedDocs.length; + if (triggerFlush()) { + flush(); + } + } + private static void saveInts(int[] values, int length, DataOutput out) throws IOException { assert length > 0; if (length == 1) { @@ -295,9 +292,10 @@ public final class CompressingStoredFieldsWriter extends StoredFieldsWriter { @Override public void finish(FieldInfos fis, int numDocs) throws IOException { - endWithPreviousDocument(); if (numBufferedDocs > 0) { flush(); + } else { + assert bufferedDocs.length == 0; } if (docBase != numDocs) { throw new RuntimeException("Wrote " + docBase + " docs, finish called with numDocs=" + numDocs); @@ -351,17 +349,13 @@ public final class CompressingStoredFieldsWriter extends StoredFieldsWriter { } if (compressionMode == matchingFieldsReader.getCompressionMode() // same 
compression mode - && (numBufferedDocs == 0 || triggerFlush()) // starting a new chunk + && numBufferedDocs == 0 // starting a new chunk && startOffsets[it.chunkDocs - 1] < chunkSize // chunk is small enough && startOffsets[it.chunkDocs - 1] + it.lengths[it.chunkDocs - 1] >= chunkSize // chunk is large enough && nextDeletedDoc(it.docBase, liveDocs, it.docBase + it.chunkDocs) == it.docBase + it.chunkDocs) { // no deletion in the chunk assert docID == it.docBase; // no need to decompress, just copy data - endWithPreviousDocument(); - if (triggerFlush()) { - flush(); - } indexWriter.writeIndex(it.chunkDocs, fieldsStream.getFilePointer()); writeHeader(this.docBase, it.chunkDocs, it.numStoredFields, it.lengths); it.copyCompressedData(fieldsStream); @@ -380,6 +374,7 @@ public final class CompressingStoredFieldsWriter extends StoredFieldsWriter { final int diff = docID - it.docBase; startDocument(it.numStoredFields[diff]); bufferedDocs.writeBytes(it.bytes.bytes, it.bytes.offset + startOffsets[diff], it.lengths[diff]); + finishDocument(); ++docCount; mergeState.checkAbort.work(300); } diff --git a/lucene/core/src/java/org/apache/lucene/codecs/compressing/CompressionMode.java b/lucene/core/src/java/org/apache/lucene/codecs/compressing/CompressionMode.java index b38490225ee..700258fdd28 100644 --- a/lucene/core/src/java/org/apache/lucene/codecs/compressing/CompressionMode.java +++ b/lucene/core/src/java/org/apache/lucene/codecs/compressing/CompressionMode.java @@ -140,14 +140,6 @@ public abstract class CompressionMode { bytes.length = length; } - @Override - public void copyCompressedData(DataInput in, int originalLength, DataOutput out) throws IOException { - final int copied = LZ4.copyCompressedData(in, originalLength, out); - if (copied != originalLength) { - throw new CorruptIndexException("Currupted compressed stream: expected " + originalLength + " bytes, but got at least" + copied); - } - } - @Override public Decompressor clone() { return this; @@ -224,13 +216,6 @@ public abstract class CompressionMode { bytes.length = length; } - @Override - public void copyCompressedData(DataInput in, int originalLength, DataOutput out) throws IOException { - final int compressedLength = in.readVInt(); - out.writeVInt(compressedLength); - out.copyBytes(in, compressedLength); - } - @Override public Decompressor clone() { return new DeflateDecompressor(); diff --git a/lucene/core/src/java/org/apache/lucene/codecs/compressing/Compressor.java b/lucene/core/src/java/org/apache/lucene/codecs/compressing/Compressor.java index a6529994b5d..ef4fcc27789 100644 --- a/lucene/core/src/java/org/apache/lucene/codecs/compressing/Compressor.java +++ b/lucene/core/src/java/org/apache/lucene/codecs/compressing/Compressor.java @@ -24,7 +24,10 @@ import org.apache.lucene.store.DataOutput; /** * A data compressor. */ -abstract class Compressor { +public abstract class Compressor { + + /** Sole constructor, typically called from sub-classes. */ + protected Compressor() {} /** * Compress bytes into out. 
It is the responsibility of the
diff --git a/lucene/core/src/java/org/apache/lucene/codecs/compressing/Decompressor.java b/lucene/core/src/java/org/apache/lucene/codecs/compressing/Decompressor.java
index 43f30c2c08f..c7fffc7392d 100644
--- a/lucene/core/src/java/org/apache/lucene/codecs/compressing/Decompressor.java
+++ b/lucene/core/src/java/org/apache/lucene/codecs/compressing/Decompressor.java
@@ -20,13 +20,15 @@ package org.apache.lucene.codecs.compressing;
import java.io.IOException;
import org.apache.lucene.store.DataInput;
-import org.apache.lucene.store.DataOutput;
import org.apache.lucene.util.BytesRef;
/**
- * An decompressor.
+ * A decompressor.
*/
-abstract class Decompressor implements Cloneable {
+public abstract class Decompressor implements Cloneable {
+
+ /** Sole constructor, typically called from sub-classes. */
+ protected Decompressor() {}
/**
* Decompress bytes that were stored between offsets offset and
@@ -44,10 +46,6 @@ abstract class Decompressor implements Cloneable {
*/
public abstract void decompress(DataInput in, int originalLength, int offset, int length, BytesRef bytes) throws IOException;
- /** Copy a compressed stream whose original length is
- * originalLength from in to out. */
- public abstract void copyCompressedData(DataInput in, int originalLength, DataOutput out) throws IOException;
-
@Override
public abstract Decompressor clone();
diff --git a/lucene/core/src/java/org/apache/lucene/codecs/compressing/LZ4.java b/lucene/core/src/java/org/apache/lucene/codecs/compressing/LZ4.java
index 8807018fb75..7e52339657d 100644
--- a/lucene/core/src/java/org/apache/lucene/codecs/compressing/LZ4.java
+++ b/lucene/core/src/java/org/apache/lucene/codecs/compressing/LZ4.java
@@ -506,51 +506,4 @@ class LZ4 {
encodeLastLiterals(src, anchor, srcEnd - anchor, out);
}
- /** Copy bytes from in to out where
- * in is a LZ4-encoded stream. This method copies enough bytes
- * so that out can be used later on to restore the first
- * length bytes of the stream. This method always reads at
- * least one byte from in so make sure not to call this method
- * if in reached the end of the stream, even if
- * length=0.
*/ - public static int copyCompressedData(DataInput in, int length, DataOutput out) throws IOException { - int n = 0; - do { - // literals - final byte token = in.readByte(); - out.writeByte(token); - int literalLen = (token & 0xFF) >>> 4; - if (literalLen == 0x0F) { - byte len; - while ((len = in.readByte()) == (byte) 0xFF) { - literalLen += 0xFF; - out.writeByte(len); - } - literalLen += len & 0xFF; - out.writeByte(len); - } - out.copyBytes(in, literalLen); - n += literalLen; - if (n >= length) { - break; - } - - // matchs - out.copyBytes(in, 2); // match dec - int matchLen = token & 0x0F; - if (matchLen == 0x0F) { - byte len; - while ((len = in.readByte()) == (byte) 0xFF) { - matchLen += 0xFF; - out.writeByte(len); - } - matchLen += len & 0xFF; - out.writeByte(len); - } - matchLen += MIN_MATCH; - n += matchLen; - } while (n < length); - return n; - } - } diff --git a/lucene/core/src/java/org/apache/lucene/codecs/lucene40/Lucene40TermVectorsWriter.java b/lucene/core/src/java/org/apache/lucene/codecs/lucene40/Lucene40TermVectorsWriter.java index 36529b63fff..0386369e336 100644 --- a/lucene/core/src/java/org/apache/lucene/codecs/lucene40/Lucene40TermVectorsWriter.java +++ b/lucene/core/src/java/org/apache/lucene/codecs/lucene40/Lucene40TermVectorsWriter.java @@ -124,17 +124,16 @@ public final class Lucene40TermVectorsWriter extends TermVectorsWriter { if (payloads) bits |= Lucene40TermVectorsReader.STORE_PAYLOAD_WITH_TERMVECTOR; tvf.writeByte(bits); - - assert fieldCount <= numVectorFields; - if (fieldCount == numVectorFields) { - // last field of the document - // this is crazy because the file format is crazy! - for (int i = 1; i < fieldCount; i++) { - tvd.writeVLong(fps[i] - fps[i-1]); - } - } } + @Override + public void finishDocument() throws IOException { + assert fieldCount == numVectorFields; + for (int i = 1; i < fieldCount; i++) { + tvd.writeVLong(fps[i] - fps[i-1]); + } + } + private final BytesRef lastTerm = new BytesRef(10); // NOTE: we override addProx, so we don't need to buffer when indexing. @@ -222,20 +221,6 @@ public final class Lucene40TermVectorsWriter extends TermVectorsWriter { } bufferedIndex++; - - // dump buffer if we are done - if (bufferedIndex == bufferedFreq) { - if (payloads) { - tvf.writeBytes(payloadData.bytes, payloadData.offset, payloadData.length); - } - for (int i = 0; i < bufferedIndex; i++) { - if (offsets) { - tvf.writeVInt(offsetStartBuffer[i] - lastOffset); - tvf.writeVInt(offsetEndBuffer[i] - offsetStartBuffer[i]); - lastOffset = offsetEndBuffer[i]; - } - } - } } else if (positions) { // write position delta writePosition(position - lastPosition, payload); @@ -248,6 +233,25 @@ public final class Lucene40TermVectorsWriter extends TermVectorsWriter { } } + @Override + public void finishTerm() throws IOException { + if (bufferedIndex > 0) { + // dump buffer + assert positions && (offsets || payloads); + assert bufferedIndex == bufferedFreq; + if (payloads) { + tvf.writeBytes(payloadData.bytes, payloadData.offset, payloadData.length); + } + for (int i = 0; i < bufferedIndex; i++) { + if (offsets) { + tvf.writeVInt(offsetStartBuffer[i] - lastOffset); + tvf.writeVInt(offsetEndBuffer[i] - offsetStartBuffer[i]); + lastOffset = offsetEndBuffer[i]; + } + } + } + } + private void writePosition(int delta, BytesRef payload) throws IOException { if (payloads) { int payloadLength = payload == null ? 
0 : payload.length; diff --git a/lucene/core/src/java/org/apache/lucene/index/StoredFieldsProcessor.java b/lucene/core/src/java/org/apache/lucene/index/StoredFieldsProcessor.java index b0d7cf7b8ac..eea05b13200 100644 --- a/lucene/core/src/java/org/apache/lucene/index/StoredFieldsProcessor.java +++ b/lucene/core/src/java/org/apache/lucene/index/StoredFieldsProcessor.java @@ -108,6 +108,7 @@ final class StoredFieldsProcessor extends StoredFieldsConsumer { while(lastDocID < docID) { fieldsWriter.startDocument(0); lastDocID++; + fieldsWriter.finishDocument(); } } @@ -123,6 +124,7 @@ final class StoredFieldsProcessor extends StoredFieldsConsumer { for (int i = 0; i < numStoredFields; i++) { fieldsWriter.writeField(fieldInfos[i], storedFields[i]); } + fieldsWriter.finishDocument(); lastDocID++; } diff --git a/lucene/core/src/java/org/apache/lucene/index/TermVectorsConsumer.java b/lucene/core/src/java/org/apache/lucene/index/TermVectorsConsumer.java index 40756e1fa25..bafba22ca80 100644 --- a/lucene/core/src/java/org/apache/lucene/index/TermVectorsConsumer.java +++ b/lucene/core/src/java/org/apache/lucene/index/TermVectorsConsumer.java @@ -78,6 +78,7 @@ final class TermVectorsConsumer extends TermsHashConsumer { void fill(int docID) throws IOException { while(lastDocID < docID) { writer.startDocument(0); + writer.finishDocument(); lastDocID++; } } @@ -108,6 +109,7 @@ final class TermVectorsConsumer extends TermsHashConsumer { for (int i = 0; i < numVectorFields; i++) { perFields[i].finishDocument(); } + writer.finishDocument(); assert lastDocID == docState.docID: "lastDocID=" + lastDocID + " docState.docID=" + docState.docID; diff --git a/lucene/core/src/java/org/apache/lucene/index/TermVectorsConsumerPerField.java b/lucene/core/src/java/org/apache/lucene/index/TermVectorsConsumerPerField.java index 0e5a366f327..c3cb2c8d4cc 100644 --- a/lucene/core/src/java/org/apache/lucene/index/TermVectorsConsumerPerField.java +++ b/lucene/core/src/java/org/apache/lucene/index/TermVectorsConsumerPerField.java @@ -182,7 +182,9 @@ final class TermVectorsConsumerPerField extends TermsHashConsumerPerField { } tv.addProx(freq, posReader, offReader); } + tv.finishTerm(); } + tv.finishField(); termsHashPerField.reset(); diff --git a/lucene/core/src/java/org/apache/lucene/util/CharsRef.java b/lucene/core/src/java/org/apache/lucene/util/CharsRef.java index 60537bfdeb9..30e95439c76 100644 --- a/lucene/core/src/java/org/apache/lucene/util/CharsRef.java +++ b/lucene/core/src/java/org/apache/lucene/util/CharsRef.java @@ -218,7 +218,7 @@ public final class CharsRef implements Comparable, CharSequence, Clone if (start < 0 || end > length || start > end) { throw new IndexOutOfBoundsException(); } - return new CharsRef(chars, offset + start, offset + end); + return new CharsRef(chars, offset + start, end - start); } /** @deprecated This comparator is only a transition mechanism */ diff --git a/lucene/core/src/java/org/apache/lucene/util/fst/Builder.java b/lucene/core/src/java/org/apache/lucene/util/fst/Builder.java index 8ab2d929708..19ee5412943 100644 --- a/lucene/core/src/java/org/apache/lucene/util/fst/Builder.java +++ b/lucene/core/src/java/org/apache/lucene/util/fst/Builder.java @@ -84,11 +84,11 @@ public class Builder { /** * Instantiates an FST/FSA builder without any pruning. A shortcut * to {@link #Builder(FST.INPUT_TYPE, int, int, boolean, - * boolean, int, Outputs, FreezeTail, boolean)} with + * boolean, int, Outputs, FreezeTail, boolean, boolean)} with * pruning options turned off. 
*/ public Builder(FST.INPUT_TYPE inputType, Outputs outputs) { - this(inputType, 0, 0, true, true, Integer.MAX_VALUE, outputs, null, false, PackedInts.COMPACT); + this(inputType, 0, 0, true, true, Integer.MAX_VALUE, outputs, null, false, PackedInts.COMPACT, true); } /** @@ -97,9 +97,9 @@ public class Builder { */ public Builder(FST.INPUT_TYPE inputType, int minSuffixCount1, int minSuffixCount2, boolean doShareSuffix, boolean doShareNonSingletonNodes, int shareMaxTailLength, Outputs outputs, - FreezeTail freezeTail, boolean willPackFST) { + FreezeTail freezeTail, boolean willPackFST, boolean allowArrayArcs) { this(inputType, minSuffixCount1, minSuffixCount2, doShareSuffix, doShareNonSingletonNodes, - shareMaxTailLength, outputs, freezeTail, willPackFST, PackedInts.DEFAULT); + shareMaxTailLength, outputs, freezeTail, willPackFST, PackedInts.DEFAULT, allowArrayArcs); } /** @@ -143,10 +143,14 @@ public class Builder { * * @param acceptableOverheadRatio How to trade speed for space when building the FST. This option * is only relevant when doPackFST is true. @see PackedInts#getMutable(int, int, float) + * + * @param allowArrayArcs Pass false to disable the array arc optimization + * while building the FST; this will make the resulting + * FST smaller but slower to traverse. */ public Builder(FST.INPUT_TYPE inputType, int minSuffixCount1, int minSuffixCount2, boolean doShareSuffix, boolean doShareNonSingletonNodes, int shareMaxTailLength, Outputs outputs, - FreezeTail freezeTail, boolean doPackFST, float acceptableOverheadRatio) { + FreezeTail freezeTail, boolean doPackFST, float acceptableOverheadRatio, boolean allowArrayArcs) { this.minSuffixCount1 = minSuffixCount1; this.minSuffixCount2 = minSuffixCount2; this.freezeTail = freezeTail; @@ -154,7 +158,7 @@ public class Builder { this.shareMaxTailLength = shareMaxTailLength; this.doPackFST = doPackFST; this.acceptableOverheadRatio = acceptableOverheadRatio; - fst = new FST(inputType, outputs, doPackFST, acceptableOverheadRatio); + fst = new FST(inputType, outputs, doPackFST, acceptableOverheadRatio, allowArrayArcs); if (doShareSuffix) { dedupHash = new NodeHash(fst); } else { @@ -182,13 +186,6 @@ public class Builder { return dedupHash == null ? 0 : fst.nodeCount; } - /** Pass false to disable the array arc optimization - * while building the FST; this will make the resulting - * FST smaller but slower to traverse. 
*/ - public void setAllowArrayArcs(boolean b) { - fst.setAllowArrayArcs(b); - } - private CompiledNode compileNode(UnCompiledNode nodeIn, int tailLength) throws IOException { final int node; if (dedupHash != null && (doShareNonSingletonNodes || nodeIn.numArcs <= 1) && tailLength <= shareMaxTailLength) { diff --git a/lucene/core/src/java/org/apache/lucene/util/fst/FST.java b/lucene/core/src/java/org/apache/lucene/util/fst/FST.java index 24f30681dd9..10d326e58e8 100644 --- a/lucene/core/src/java/org/apache/lucene/util/fst/FST.java +++ b/lucene/core/src/java/org/apache/lucene/util/fst/FST.java @@ -33,6 +33,7 @@ import org.apache.lucene.store.DataInput; import org.apache.lucene.store.DataOutput; import org.apache.lucene.store.InputStreamDataInput; import org.apache.lucene.store.OutputStreamDataOutput; +import org.apache.lucene.store.RAMOutputStream; import org.apache.lucene.util.ArrayUtil; import org.apache.lucene.util.IOUtils; import org.apache.lucene.util.IntsRef; @@ -137,16 +138,18 @@ public final class FST { // if non-null, this FST accepts the empty string and // produces this output T emptyOutput; - private byte[] emptyOutputBytes; // Not private to avoid synthetic access$NNN methods: byte[] bytes; - int byteUpto = 0; private int startNode = -1; public final Outputs outputs; + // Used for the BIT_TARGET_NEXT optimization (whereby + // instead of storing the address of the target node for + // a given arc, we mark a single bit noting that the next + // node in the byte[] is the target node): private int lastFrozenNode; private final T NO_OUTPUT; @@ -161,7 +164,7 @@ public final class FST { /** If arc has this label then that arc is final/accepted */ public static final int END_LABEL = -1; - private boolean allowArrayArcs = true; + private final boolean allowArrayArcs; private Arc cachedRootArcs[]; @@ -262,9 +265,10 @@ public final class FST { // make a new empty FST, for building; Builder invokes // this ctor - FST(INPUT_TYPE inputType, Outputs outputs, boolean willPackFST, float acceptableOverheadRatio) { + FST(INPUT_TYPE inputType, Outputs outputs, boolean willPackFST, float acceptableOverheadRatio, boolean allowArrayArcs) { this.inputType = inputType; this.outputs = outputs; + this.allowArrayArcs = allowArrayArcs; bytes = new byte[128]; NO_OUTPUT = outputs.getNoOutput(); if (willPackFST) { @@ -293,14 +297,15 @@ public final class FST { if (in.readByte() == 1) { // accepts empty string int numBytes = in.readVInt(); - // messy bytes = new byte[numBytes]; in.readBytes(bytes, 0, numBytes); + + // De-serialize empty-string output: BytesReader reader; if (packed) { - reader = getBytesReader(0); + reader = new ForwardBytesReader(bytes, 0); } else { - reader = getBytesReader(numBytes-1); + reader = new ReverseBytesReader(bytes, bytes.length-1); } emptyOutput = outputs.readFinalOutput(reader); } else { @@ -335,6 +340,11 @@ public final class FST { NO_OUTPUT = outputs.getNoOutput(); cacheRootArcs(); + + // NOTE: bogus because this is only used during + // building; we need to break out mutable FST from + // immutable + allowArrayArcs = false; } public INPUT_TYPE getInputType() { @@ -412,26 +422,6 @@ public final class FST { } else { emptyOutput = v; } - - // TODO: this is messy -- replace with sillyBytesWriter; maybe make - // bytes private - final int posSave = writer.getPosition(); - outputs.writeFinalOutput(emptyOutput, writer); - emptyOutputBytes = new byte[writer.getPosition()-posSave]; - - if (!packed) { - // reverse - final int stopAt = (writer.getPosition() - posSave)/2; - int upto = 0; 
- while(upto < stopAt) { - final byte b = bytes[posSave + upto]; - bytes[posSave+upto] = bytes[writer.getPosition()-upto-1]; - bytes[writer.getPosition()-upto-1] = b; - upto++; - } - } - System.arraycopy(bytes, posSave, emptyOutputBytes, 0, writer.getPosition()-posSave); - writer.setPosition(posSave); } public void save(DataOutput out) throws IOException { @@ -453,7 +443,27 @@ public final class FST { // TODO: really we should encode this as an arc, arriving // to the root node, instead of special casing here: if (emptyOutput != null) { + // Accepts empty string out.writeByte((byte) 1); + + // Serialize empty-string output: + RAMOutputStream ros = new RAMOutputStream(); + outputs.writeFinalOutput(emptyOutput, ros); + + byte[] emptyOutputBytes = new byte[(int) ros.getFilePointer()]; + ros.writeTo(emptyOutputBytes, 0); + + if (!packed) { + // reverse + final int stopAt = emptyOutputBytes.length/2; + int upto = 0; + while(upto < stopAt) { + final byte b = emptyOutputBytes[upto]; + emptyOutputBytes[upto] = emptyOutputBytes[emptyOutputBytes.length-upto-1]; + emptyOutputBytes[emptyOutputBytes.length-upto-1] = b; + upto++; + } + } out.writeVInt(emptyOutputBytes.length); out.writeBytes(emptyOutputBytes, 0, emptyOutputBytes.length); } else { @@ -1160,10 +1170,6 @@ public final class FST { return arcWithOutputCount; } - public void setAllowArrayArcs(boolean v) { - allowArrayArcs = v; - } - /** * Nodes will be expanded if their depth (distance from the root node) is * <= this value and their number of arcs is >= @@ -1453,6 +1459,11 @@ public final class FST { this.outputs = outputs; NO_OUTPUT = outputs.getNoOutput(); writer = new DefaultBytesWriter(); + + // NOTE: bogus because this is only used during + // building; we need to break out mutable FST from + // immutable + allowArrayArcs = false; } /** Expert: creates an FST by packing this one. This diff --git a/lucene/core/src/java/org/apache/lucene/util/packed/BlockPackedReader.java b/lucene/core/src/java/org/apache/lucene/util/packed/BlockPackedReader.java new file mode 100644 index 00000000000..9d2aed79120 --- /dev/null +++ b/lucene/core/src/java/org/apache/lucene/util/packed/BlockPackedReader.java @@ -0,0 +1,227 @@ +package org.apache.lucene.util.packed; + +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +import static org.apache.lucene.util.packed.BlockPackedWriter.BPV_SHIFT; +import static org.apache.lucene.util.packed.BlockPackedWriter.MIN_VALUE_EQUALS_0; +import static org.apache.lucene.util.packed.BlockPackedWriter.checkBlockSize; + +import java.io.EOFException; +import java.io.IOException; +import java.util.Arrays; + +import org.apache.lucene.store.DataInput; +import org.apache.lucene.store.IndexInput; +import org.apache.lucene.util.LongsRef; + +/** + * Reader for sequences of longs written with {@link BlockPackedWriter}. + * @see BlockPackedWriter + * @lucene.internal + */ +public final class BlockPackedReader { + + static long zigZagDecode(long n) { + return ((n >>> 1) ^ -(n & 1)); + } + + // same as DataInput.readVLong but supports negative values + static long readVLong(DataInput in) throws IOException { + byte b = in.readByte(); + if (b >= 0) return b; + long i = b & 0x7FL; + b = in.readByte(); + i |= (b & 0x7FL) << 7; + if (b >= 0) return i; + b = in.readByte(); + i |= (b & 0x7FL) << 14; + if (b >= 0) return i; + b = in.readByte(); + i |= (b & 0x7FL) << 21; + if (b >= 0) return i; + b = in.readByte(); + i |= (b & 0x7FL) << 28; + if (b >= 0) return i; + b = in.readByte(); + i |= (b & 0x7FL) << 35; + if (b >= 0) return i; + b = in.readByte(); + i |= (b & 0x7FL) << 42; + if (b >= 0) return i; + b = in.readByte(); + i |= (b & 0x7FL) << 49; + if (b >= 0) return i; + b = in.readByte(); + i |= (b & 0xFFL) << 56; + return i; + } + + final DataInput in; + final int packedIntsVersion; + final long valueCount; + final int blockSize; + final LongsRef values; + byte[] blocks; + int off; + long ord; + + /** Sole constructor. + * @param blockSize the number of values of a block, must be equal to the + * block size of the {@link BlockPackedWriter} which has + * been used to write the stream + */ + public BlockPackedReader(DataInput in, int packedIntsVersion, int blockSize, long valueCount) { + checkBlockSize(blockSize); + this.in = in; + this.packedIntsVersion = packedIntsVersion; + this.blockSize = blockSize; + this.values = new LongsRef(blockSize); + assert valueCount >= 0; + this.valueCount = valueCount; + off = blockSize; + ord = 0; + } + + /** Skip exactly count values. */ + public void skip(long count) throws IOException { + assert count >= 0; + if (ord + count > valueCount || ord + count < 0) { + throw new EOFException(); + } + + // 1. skip buffered values + final int skipBuffer = (int) Math.min(count, blockSize - off); + off += skipBuffer; + ord += skipBuffer; + count -= skipBuffer; + if (count == 0L) { + return; + } + + // 2. skip as many blocks as necessary + assert off == blockSize; + while (count >= blockSize) { + final int token = in.readByte() & 0xFF; + final int bitsPerValue = token >>> BPV_SHIFT; + if (bitsPerValue > 64) { + throw new IOException("Corrupted"); + } + if ((token & MIN_VALUE_EQUALS_0) == 0) { + readVLong(in); + } + final long blockBytes = PackedInts.Format.PACKED.byteCount(packedIntsVersion, blockSize, bitsPerValue); + skipBytes(blockBytes); + ord += blockSize; + count -= blockSize; + } + if (count == 0L) { + return; + } + + // 3. 
skip last values + assert count < blockSize; + refill(); + ord += count; + off += count; + } + + private void skipBytes(long count) throws IOException { + if (in instanceof IndexInput) { + final IndexInput iin = (IndexInput) in; + iin.seek(iin.getFilePointer() + count); + } else { + if (blocks == null) { + blocks = new byte[blockSize]; + } + long skipped = 0; + while (skipped < count) { + final int toSkip = (int) Math.min(blocks.length, count - skipped); + in.readBytes(blocks, 0, toSkip); + skipped += toSkip; + } + } + } + + /** Read the next value. */ + public long next() throws IOException { + next(1); + assert values.length == 1; + return values.longs[values.offset]; + } + + /** Read between 1 and count values. */ + public LongsRef next(int count) throws IOException { + assert count > 0; + if (ord == valueCount) { + throw new EOFException(); + } + if (off == blockSize) { + refill(); + } + + count = Math.min(count, blockSize - off); + count = (int) Math.min(count, valueCount - ord); + + values.offset = off; + values.length = count; + off += count; + ord += count; + return values; + } + + private void refill() throws IOException { + final int token = in.readByte() & 0xFF; + final boolean minEquals0 = (token & MIN_VALUE_EQUALS_0) != 0; + final int bitsPerValue = token >>> BPV_SHIFT; + if (bitsPerValue > 64) { + throw new IOException("Corrupted"); + } + final long minValue = minEquals0 ? 0L : zigZagDecode(1L + readVLong(in)); + assert minEquals0 || minValue != 0; + + if (bitsPerValue == 0) { + Arrays.fill(values.longs, minValue); + } else { + final PackedInts.Decoder decoder = PackedInts.getDecoder(PackedInts.Format.PACKED, packedIntsVersion, bitsPerValue); + final int iterations = blockSize / decoder.valueCount(); + final int blocksSize = iterations * 8 * decoder.blockCount(); + if (blocks == null || blocks.length < blocksSize) { + blocks = new byte[blocksSize]; + } + + final int valueCount = (int) Math.min(this.valueCount - ord, blockSize); + final int blocksCount = (int) PackedInts.Format.PACKED.byteCount(packedIntsVersion, valueCount, bitsPerValue); + in.readBytes(blocks, 0, blocksCount); + + decoder.decode(blocks, 0, values.longs, 0, iterations); + + if (minValue != 0) { + for (int i = 0; i < valueCount; ++i) { + values.longs[i] += minValue; + } + } + } + off = 0; + } + + /** Return the offset of the next value to read. */ + public long ord() { + return ord; + } + +} diff --git a/lucene/core/src/java/org/apache/lucene/util/packed/BlockPackedWriter.java b/lucene/core/src/java/org/apache/lucene/util/packed/BlockPackedWriter.java new file mode 100644 index 00000000000..43670cc1a5f --- /dev/null +++ b/lucene/core/src/java/org/apache/lucene/util/packed/BlockPackedWriter.java @@ -0,0 +1,164 @@ +package org.apache.lucene.util.packed; + +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import java.io.IOException; +import java.util.Arrays; + +import org.apache.lucene.store.DataOutput; + +/** + * A writer for large sequences of longs. + *

+ * The sequence is divided into fixed-size blocks and for each block, the + * difference between each value and the minimum value of the block is encoded + * using as few bits as possible. Memory usage of this class is proportional to + * the block size. Each block has an overhead between 1 and 10 bytes to store + * the minimum value and the number of bits per value of the block. + * @see BlockPackedReader + * @lucene.internal + */ +public final class BlockPackedWriter { + + static final int MAX_BLOCK_SIZE = 1 << (30 - 3); + static final int MIN_VALUE_EQUALS_0 = 1 << 0; + static final int BPV_SHIFT = 1; + + static void checkBlockSize(int blockSize) { + if (blockSize <= 0 || blockSize > MAX_BLOCK_SIZE) { + throw new IllegalArgumentException("blockSize must be > 0 and < " + MAX_BLOCK_SIZE + ", got " + blockSize); + } + if (blockSize % 64 != 0) { + throw new IllegalArgumentException("blockSize must be a multiple of 64, got " + blockSize); + } + } + + static long zigZagEncode(long n) { + return (n >> 63) ^ (n << 1); + } + + // same as DataOutput.writeVLong but accepts negative values + static void writeVLong(DataOutput out, long i) throws IOException { + int k = 0; + while ((i & ~0x7FL) != 0L && k++ < 8) { + out.writeByte((byte)((i & 0x7FL) | 0x80L)); + i >>>= 7; + } + out.writeByte((byte) i); + } + + final DataOutput out; + final long[] values; + byte[] blocks; + int off; + long ord; + boolean finished; + + /** + * Sole constructor. + * @param blockSize the number of values of a single block, must be a multiple of 64 + */ + public BlockPackedWriter(DataOutput out, int blockSize) { + checkBlockSize(blockSize); + this.out = out; + values = new long[blockSize]; + off = 0; + ord = 0L; + finished = false; + } + + private void checkNotFinished() { + if (finished) { + throw new IllegalStateException("Already finished"); + } + } + + /** Append a new long. */ + public void add(long l) throws IOException { + checkNotFinished(); + if (off == values.length) { + flush(); + } + values[off++] = l; + ++ord; + } + + /** Flush all buffered data to disk. This instance is not usable anymore + * after this method has been called. */ + public void finish() throws IOException { + checkNotFinished(); + if (off > 0) { + flush(); + } + finished = true; + } + + private void flush() throws IOException { + assert off > 0; + long min = Long.MAX_VALUE, max = Long.MIN_VALUE; + for (int i = 0; i < off; ++i) { + min = Math.min(values[i], min); + max = Math.max(values[i], max); + } + + final long delta = max - min; + final int bitsRequired = delta < 0 ? 64 : delta == 0L ? 0 : PackedInts.bitsRequired(delta); + if (bitsRequired == 64) { + // no need to delta-encode + min = 0L; + } else if (min > 0L) { + // make min as small as possible so that writeVLong requires fewer bytes + min = Math.max(0L, max - PackedInts.maxValue(bitsRequired)); + } + + final int token = (bitsRequired << BPV_SHIFT) | (min == 0 ? 
MIN_VALUE_EQUALS_0 : 0); + out.writeByte((byte) token); + + if (min != 0) { + writeVLong(out, zigZagEncode(min) - 1); + } + + if (bitsRequired > 0) { + if (min != 0) { + for (int i = 0; i < off; ++i) { + values[i] -= min; + } + } + final PackedInts.Encoder encoder = PackedInts.getEncoder(PackedInts.Format.PACKED, PackedInts.VERSION_CURRENT, bitsRequired); + final int iterations = values.length / encoder.valueCount(); + final int blockSize = encoder.blockCount() * 8 * iterations; + if (blocks == null || blocks.length < blockSize) { + blocks = new byte[blockSize]; + } + if (off < values.length) { + Arrays.fill(values, off, values.length, 0L); + } + encoder.encode(values, 0, blocks, 0, iterations); + final int blockCount = (int) PackedInts.Format.PACKED.byteCount(PackedInts.VERSION_CURRENT, off, bitsRequired); + out.writeBytes(blocks, blockCount); + } + + off = 0; + } + + /** Return the number of values which have been added. */ + public long ord() { + return ord; + } + +} diff --git a/lucene/core/src/test/org/apache/lucene/codecs/compressing/AbstractTestCompressionMode.java b/lucene/core/src/test/org/apache/lucene/codecs/compressing/AbstractTestCompressionMode.java index 7b25839179f..fd579f673e2 100644 --- a/lucene/core/src/test/org/apache/lucene/codecs/compressing/AbstractTestCompressionMode.java +++ b/lucene/core/src/test/org/apache/lucene/codecs/compressing/AbstractTestCompressionMode.java @@ -80,16 +80,6 @@ public abstract class AbstractTestCompressionMode extends LuceneTestCase { return Arrays.copyOfRange(bytes.bytes, bytes.offset, bytes.offset + bytes.length); } - static byte[] copyCompressedData(Decompressor decompressor, byte[] compressed, int originalLength) throws IOException { - GrowableByteArrayDataOutput out = new GrowableByteArrayDataOutput(compressed.length); - decompressor.copyCompressedData(new ByteArrayDataInput(compressed), originalLength, out); - return Arrays.copyOf(out.bytes, out.length); - } - - byte[] copyCompressedData(byte[] compressed, int originalLength) throws IOException { - return copyCompressedData(mode.newDecompressor(), compressed, originalLength); - } - public void testDecompress() throws IOException { final int iterations = atLeast(10); for (int i = 0; i < iterations; ++i) { @@ -117,17 +107,10 @@ public abstract class AbstractTestCompressionMode extends LuceneTestCase { } } - public void testCopyCompressedData() throws IOException { - final byte[] decompressed = randomArray(); - final byte[] compressed = compress(decompressed); - assertArrayEquals(compressed, copyCompressedData(compressed, decompressed.length)); - } - public byte[] test(byte[] decompressed) throws IOException { final byte[] compressed = compress(decompressed); final byte[] restored = decompress(compressed, decompressed.length); assertEquals(decompressed.length, restored.length); - assertArrayEquals(compressed, copyCompressedData(compressed, decompressed.length)); return compressed; } diff --git a/lucene/core/src/test/org/apache/lucene/util/TestCharsRef.java b/lucene/core/src/test/org/apache/lucene/util/TestCharsRef.java index 4237a6d0478..1997bcc4296 100644 --- a/lucene/core/src/test/org/apache/lucene/util/TestCharsRef.java +++ b/lucene/core/src/test/org/apache/lucene/util/TestCharsRef.java @@ -116,11 +116,28 @@ public class TestCharsRef extends LuceneTestCase { } // LUCENE-3590: fix off-by-one in subsequence, and fully obey interface + // LUCENE-4671: fix subSequence public void testCharSequenceSubSequence() { - CharSequence c = new CharsRef("abc"); + CharSequence sequences[] = { + new 
CharsRef("abc"), + new CharsRef("0abc".toCharArray(), 1, 3), + new CharsRef("abc0".toCharArray(), 0, 3), + new CharsRef("0abc0".toCharArray(), 1, 3) + }; + + for (CharSequence c : sequences) { + doTestSequence(c); + } + } + + private void doTestSequence(CharSequence c) { // slice assertEquals("a", c.subSequence(0, 1).toString()); + // mid subsequence + assertEquals("b", c.subSequence(1, 2).toString()); + // end subsequence + assertEquals("bc", c.subSequence(1, 3).toString()); // empty subsequence assertEquals("", c.subSequence(0, 0).toString()); diff --git a/lucene/core/src/test/org/apache/lucene/util/fst/TestFSTs.java b/lucene/core/src/test/org/apache/lucene/util/fst/TestFSTs.java index 176dbca3770..8fb4b2ab3a5 100644 --- a/lucene/core/src/test/org/apache/lucene/util/fst/TestFSTs.java +++ b/lucene/core/src/test/org/apache/lucene/util/fst/TestFSTs.java @@ -310,7 +310,7 @@ public class TestFSTs extends LuceneTestCase { final boolean doRewrite = random().nextBoolean(); - Builder builder = new Builder(FST.INPUT_TYPE.BYTE1, 0, 0, true, true, Integer.MAX_VALUE, outputs, null, doRewrite); + Builder builder = new Builder(FST.INPUT_TYPE.BYTE1, 0, 0, true, true, Integer.MAX_VALUE, outputs, null, doRewrite, true); boolean storeOrd = random().nextBoolean(); if (VERBOSE) { @@ -453,8 +453,7 @@ public class TestFSTs extends LuceneTestCase { this.outputs = outputs; this.doPack = doPack; - builder = new Builder(inputMode == 0 ? FST.INPUT_TYPE.BYTE1 : FST.INPUT_TYPE.BYTE4, 0, prune, prune == 0, true, Integer.MAX_VALUE, outputs, null, doPack); - builder.setAllowArrayArcs(!noArcArrays); + builder = new Builder(inputMode == 0 ? FST.INPUT_TYPE.BYTE1 : FST.INPUT_TYPE.BYTE4, 0, prune, prune == 0, true, Integer.MAX_VALUE, outputs, null, doPack, !noArcArrays); } protected abstract T getOutput(IntsRef input, int ord) throws IOException; @@ -1063,7 +1062,7 @@ public class TestFSTs extends LuceneTestCase { public void testFinalOutputOnEndState() throws Exception { final PositiveIntOutputs outputs = PositiveIntOutputs.getSingleton(true); - final Builder builder = new Builder(FST.INPUT_TYPE.BYTE4, 2, 0, true, true, Integer.MAX_VALUE, outputs, null, random().nextBoolean()); + final Builder builder = new Builder(FST.INPUT_TYPE.BYTE4, 2, 0, true, true, Integer.MAX_VALUE, outputs, null, random().nextBoolean(), true); builder.add(Util.toUTF32("stat", new IntsRef()), 17L); builder.add(Util.toUTF32("station", new IntsRef()), 10L); final FST fst = builder.finish(); @@ -1078,7 +1077,7 @@ public class TestFSTs extends LuceneTestCase { public void testInternalFinalState() throws Exception { final PositiveIntOutputs outputs = PositiveIntOutputs.getSingleton(true); final boolean willRewrite = random().nextBoolean(); - final Builder builder = new Builder(FST.INPUT_TYPE.BYTE1, 0, 0, true, true, Integer.MAX_VALUE, outputs, null, willRewrite); + final Builder builder = new Builder(FST.INPUT_TYPE.BYTE1, 0, 0, true, true, Integer.MAX_VALUE, outputs, null, willRewrite, true); builder.add(Util.toIntsRef(new BytesRef("stat"), new IntsRef()), outputs.getNoOutput()); builder.add(Util.toIntsRef(new BytesRef("station"), new IntsRef()), outputs.getNoOutput()); final FST fst = builder.finish(); @@ -1101,7 +1100,7 @@ public class TestFSTs extends LuceneTestCase { final Long nothing = outputs.getNoOutput(); final Builder b = new Builder(FST.INPUT_TYPE.BYTE1, outputs); - final FST fst = new FST(FST.INPUT_TYPE.BYTE1, outputs, false, PackedInts.COMPACT); + final FST fst = new FST(FST.INPUT_TYPE.BYTE1, outputs, false, PackedInts.COMPACT, true); final 
Builder.UnCompiledNode rootNode = new Builder.UnCompiledNode(b, 0); diff --git a/lucene/core/src/test/org/apache/lucene/util/packed/TestPackedInts.java b/lucene/core/src/test/org/apache/lucene/util/packed/TestPackedInts.java index 59e2d670d0c..18629bc22c7 100644 --- a/lucene/core/src/test/org/apache/lucene/util/packed/TestPackedInts.java +++ b/lucene/core/src/test/org/apache/lucene/util/packed/TestPackedInts.java @@ -27,6 +27,8 @@ import java.util.Locale; import java.util.Random; import org.apache.lucene.codecs.CodecUtil; +import org.apache.lucene.store.ByteArrayDataInput; +import org.apache.lucene.store.DataInput; import org.apache.lucene.store.Directory; import org.apache.lucene.store.IOContext; import org.apache.lucene.store.IndexInput; @@ -875,4 +877,102 @@ public class TestPackedInts extends LuceneTestCase { in.close(); dir.close(); } + public void testBlockPackedReaderWriter() throws IOException { + final int iters = atLeast(2); + for (int iter = 0; iter < iters; ++iter) { + final int blockSize = 64 * _TestUtil.nextInt(random(), 1, 1 << 12); + final int valueCount = random().nextInt(1 << 18); + final long[] values = new long[valueCount]; + long minValue = 0; + int bpv = 0; + for (int i = 0; i < valueCount; ++i) { + if (i % blockSize == 0) { + minValue = rarely() ? random().nextInt(256) : rarely() ? -5 : random().nextLong(); + bpv = random().nextInt(65); + } + if (bpv == 0) { + values[i] = minValue; + } else if (bpv == 64) { + values[i] = random().nextLong(); + } else { + values[i] = minValue + _TestUtil.nextLong(random(), 0, (1L << bpv) - 1); + } + } + + final Directory dir = newDirectory(); + final IndexOutput out = dir.createOutput("out.bin", IOContext.DEFAULT); + final BlockPackedWriter writer = new BlockPackedWriter(out, blockSize); + for (int i = 0; i < valueCount; ++i) { + assertEquals(i, writer.ord()); + writer.add(values[i]); + } + assertEquals(valueCount, writer.ord()); + writer.finish(); + assertEquals(valueCount, writer.ord()); + final long fp = out.getFilePointer(); + out.close(); + + DataInput in = dir.openInput("out.bin", IOContext.DEFAULT); + if (random().nextBoolean()) { + byte[] buf = new byte[(int) fp]; + in.readBytes(buf, 0, (int) fp); + ((IndexInput) in).close(); + in = new ByteArrayDataInput(buf); + } + final BlockPackedReader reader = new BlockPackedReader(in, PackedInts.VERSION_CURRENT, blockSize, valueCount); + for (int i = 0; i < valueCount; ) { + if (random().nextBoolean()) { + assertEquals("" + i, values[i], reader.next()); + ++i; + } else { + final LongsRef nextValues = reader.next(_TestUtil.nextInt(random(), 1, 1024)); + for (int j = 0; j < nextValues.length; ++j) { + assertEquals("" + (i + j), values[i + j], nextValues.longs[nextValues.offset + j]); + } + i += nextValues.length; + } + assertEquals(i, reader.ord()); + } + assertEquals(fp, in instanceof ByteArrayDataInput ? 
((ByteArrayDataInput) in).getPosition() : ((IndexInput) in).getFilePointer()); + try { + reader.next(); + assertTrue(false); + } catch (IOException e) { + // OK + } + + if (in instanceof ByteArrayDataInput) { + ((ByteArrayDataInput) in).setPosition(0); + } else { + ((IndexInput) in).seek(0L); + } + final BlockPackedReader reader2 = new BlockPackedReader(in, PackedInts.VERSION_CURRENT, blockSize, valueCount); + int i = 0; + while (true) { + final int skip = _TestUtil.nextInt(random(), 0, valueCount - i); + reader2.skip(skip); + i += skip; + assertEquals(i, reader2.ord()); + if (i == valueCount) { + break; + } else { + assertEquals(values[i], reader2.next()); + ++i; + } + } + assertEquals(fp, in instanceof ByteArrayDataInput ? ((ByteArrayDataInput) in).getPosition() : ((IndexInput) in).getFilePointer()); + try { + reader2.skip(1); + assertTrue(false); + } catch (IOException e) { + // OK + } + + if (in instanceof IndexInput) { + ((IndexInput) in).close(); + } + dir.close(); + } + } + } diff --git a/lucene/facet/build.xml b/lucene/facet/build.xml index 209f5c7e2f2..9cdd5e995bb 100644 --- a/lucene/facet/build.xml +++ b/lucene/facet/build.xml @@ -81,5 +81,12 @@ - + + + + + + + + diff --git a/lucene/facet/src/examples/org/apache/lucene/facet/example/association/AssociationIndexer.java b/lucene/facet/src/examples/org/apache/lucene/facet/example/association/CategoryAssociationsIndexer.java similarity index 93% rename from lucene/facet/src/examples/org/apache/lucene/facet/example/association/AssociationIndexer.java rename to lucene/facet/src/examples/org/apache/lucene/facet/example/association/CategoryAssociationsIndexer.java index 145d13e7826..85c34783cde 100644 --- a/lucene/facet/src/examples/org/apache/lucene/facet/example/association/AssociationIndexer.java +++ b/lucene/facet/src/examples/org/apache/lucene/facet/example/association/CategoryAssociationsIndexer.java @@ -39,7 +39,7 @@ import org.apache.lucene.store.Directory; * * @lucene.experimental */ -public class AssociationIndexer { +public class CategoryAssociationsIndexer { /** * Create an index, and adds to it sample documents and categories. 
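Taken together, the writer above and this test pin down the new block-packed API. A minimal round-trip sketch, not part of the patch: the file name and the input values are illustrative assumptions, while the constructors and calls are exactly the ones testBlockPackedReaderWriter exercises.

// Minimal sketch of a BlockPackedWriter/BlockPackedReader round-trip; assumes
// the org.apache.lucene.store and org.apache.lucene.util.packed classes above.
static void blockPackedRoundTrip() throws IOException {
  Directory dir = new RAMDirectory();
  IndexOutput out = dir.createOutput("longs.bin", IOContext.DEFAULT);
  BlockPackedWriter writer = new BlockPackedWriter(out, 64); // multiple of 64
  long[] data = new long[] { 100, 101, 103, -5 };
  for (long l : data) {
    writer.add(l); // negative values are fine: the block minimum is zig-zag
                   // encoded, e.g. zigZagEncode(-5) == 9, so it stays small
  }
  writer.finish(); // flushes the last, partially filled block
  out.close();

  IndexInput in = dir.openInput("longs.bin", IOContext.DEFAULT);
  BlockPackedReader reader = new BlockPackedReader(in, PackedInts.VERSION_CURRENT, 64, data.length);
  for (int i = 0; i < data.length; ++i) {
    long value = reader.next(); // yields 100, 101, 103, -5 in order
  }
  in.close();
  dir.close();
}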
@@ -75,13 +75,11 @@ public class AssociationIndexer { ++nFacetsAdded; } // and also those with associations - CategoryPath[] associationsPaths = AssociationUtils.categories[docNum]; - CategoryAssociation[] associationsValues = AssociationUtils.associations[docNum]; + CategoryPath[] associationsPaths = CategoryAssociationsUtils.categories[docNum]; + CategoryAssociation[] associationsValues = CategoryAssociationsUtils.associations[docNum]; for (int i = 0; i < associationsPaths.length; i++) { associations.setAssociation(associationsPaths[i], associationsValues[i]); - ExampleUtils.log("\t $$$$ Association: (" - + associationsPaths[i] + "," + associationsValues[i] - + ")"); + ExampleUtils.log("\t $$$$ Association: (" + associationsPaths[i] + "," + associationsValues[i] + ")"); ++nFacetsAdded; } diff --git a/lucene/facet/src/examples/org/apache/lucene/facet/example/association/AssociationMain.java b/lucene/facet/src/examples/org/apache/lucene/facet/example/association/CategoryAssociationsMain.java similarity index 83% rename from lucene/facet/src/examples/org/apache/lucene/facet/example/association/AssociationMain.java rename to lucene/facet/src/examples/org/apache/lucene/facet/example/association/CategoryAssociationsMain.java index 619377b48a6..bdd99be72dd 100644 --- a/lucene/facet/src/examples/org/apache/lucene/facet/example/association/AssociationMain.java +++ b/lucene/facet/src/examples/org/apache/lucene/facet/example/association/CategoryAssociationsMain.java @@ -31,15 +31,15 @@ import org.apache.lucene.facet.search.results.FacetResult; * * @lucene.experimental */ -public class AssociationMain { +public class CategoryAssociationsMain { /** * Driver for the simple sample. * @throws Exception on error (no detailed exception handling here for sample simplicity */ public static void main(String[] args) throws Exception { - new AssociationMain().runSumIntAssociationSample(); - new AssociationMain().runSumFloatAssociationSample(); + new CategoryAssociationsMain().runSumIntAssociationSample(); + new CategoryAssociationsMain().runSumFloatAssociationSample(); ExampleUtils.log("DONE"); } @@ -51,10 +51,10 @@ public class AssociationMain { // index the sample documents ExampleUtils.log("index the sample documents..."); - AssociationIndexer.index(indexDir, taxoDir); + CategoryAssociationsIndexer.index(indexDir, taxoDir); ExampleUtils.log("search the sample documents..."); - List facetRes = AssociationSearcher.searchSumIntAssociation(indexDir, taxoDir); + List facetRes = CategoryAssociationsSearcher.searchSumIntAssociation(indexDir, taxoDir); ExampleResult res = new ExampleResult(); res.setFacetResults(facetRes); @@ -69,10 +69,10 @@ public class AssociationMain { // index the sample documents ExampleUtils.log("index the sample documents..."); - AssociationIndexer.index(indexDir, taxoDir); + CategoryAssociationsIndexer.index(indexDir, taxoDir); ExampleUtils.log("search the sample documents..."); - List facetRes = AssociationSearcher.searchSumFloatAssociation(indexDir, taxoDir); + List facetRes = CategoryAssociationsSearcher.searchSumFloatAssociation(indexDir, taxoDir); ExampleResult res = new ExampleResult(); res.setFacetResults(facetRes); diff --git a/lucene/facet/src/examples/org/apache/lucene/facet/example/association/AssociationSearcher.java b/lucene/facet/src/examples/org/apache/lucene/facet/example/association/CategoryAssociationsSearcher.java similarity index 90% rename from lucene/facet/src/examples/org/apache/lucene/facet/example/association/AssociationSearcher.java rename to 
lucene/facet/src/examples/org/apache/lucene/facet/example/association/CategoryAssociationsSearcher.java index 1e643df2c50..57c9524266b 100644 --- a/lucene/facet/src/examples/org/apache/lucene/facet/example/association/AssociationSearcher.java +++ b/lucene/facet/src/examples/org/apache/lucene/facet/example/association/CategoryAssociationsSearcher.java @@ -37,18 +37,15 @@ import org.apache.lucene.facet.taxonomy.directory.DirectoryTaxonomyReader; * * @lucene.experimental */ -public class AssociationSearcher { +public class CategoryAssociationsSearcher { /** Search an index with a sum of int-association. */ - public static List searchSumIntAssociation(Directory indexDir, - Directory taxoDir) throws Exception { + public static List searchSumIntAssociation(Directory indexDir, Directory taxoDir) throws Exception { // prepare index reader IndexReader indexReader = DirectoryReader.open(indexDir); TaxonomyReader taxo = new DirectoryTaxonomyReader(taxoDir); - AssociationIntSumFacetRequest facetRequest = new AssociationIntSumFacetRequest( - new CategoryPath("tags"), 10); - + AssociationIntSumFacetRequest facetRequest = new AssociationIntSumFacetRequest(new CategoryPath("tags"), 10); List res = SimpleSearcher.searchWithRequest(indexReader, taxo, null, facetRequest); // close readers @@ -59,14 +56,12 @@ public class AssociationSearcher { } /** Search an index with a sum of float-association. */ - public static List searchSumFloatAssociation(Directory indexDir, - Directory taxoDir) throws Exception { + public static List searchSumFloatAssociation(Directory indexDir, Directory taxoDir) throws Exception { // prepare index reader IndexReader indexReader = DirectoryReader.open(indexDir); TaxonomyReader taxo = new DirectoryTaxonomyReader(taxoDir); - AssociationFloatSumFacetRequest facetRequest = new AssociationFloatSumFacetRequest( - new CategoryPath("genre"), 10); + AssociationFloatSumFacetRequest facetRequest = new AssociationFloatSumFacetRequest(new CategoryPath("genre"), 10); List res = SimpleSearcher.searchWithRequest(indexReader, taxo, null, facetRequest); diff --git a/lucene/facet/src/examples/org/apache/lucene/facet/example/association/AssociationUtils.java b/lucene/facet/src/examples/org/apache/lucene/facet/example/association/CategoryAssociationsUtils.java similarity index 98% rename from lucene/facet/src/examples/org/apache/lucene/facet/example/association/AssociationUtils.java rename to lucene/facet/src/examples/org/apache/lucene/facet/example/association/CategoryAssociationsUtils.java index 022a233a5d6..f2cc8520599 100644 --- a/lucene/facet/src/examples/org/apache/lucene/facet/example/association/AssociationUtils.java +++ b/lucene/facet/src/examples/org/apache/lucene/facet/example/association/CategoryAssociationsUtils.java @@ -25,7 +25,7 @@ import org.apache.lucene.facet.taxonomy.CategoryPath; /** * @lucene.experimental */ -public class AssociationUtils { +public class CategoryAssociationsUtils { /** * Categories: categories[D][N] == category-path with association no. 
N for diff --git a/lucene/facet/src/examples/org/apache/lucene/facet/example/simple/SimpleUtils.java b/lucene/facet/src/examples/org/apache/lucene/facet/example/simple/SimpleUtils.java index 087f7397ae5..08ae45027ea 100644 --- a/lucene/facet/src/examples/org/apache/lucene/facet/example/simple/SimpleUtils.java +++ b/lucene/facet/src/examples/org/apache/lucene/facet/example/simple/SimpleUtils.java @@ -1,11 +1,7 @@ package org.apache.lucene.facet.example.simple; -import java.util.ArrayList; -import java.util.List; - import org.apache.lucene.analysis.Analyzer; import org.apache.lucene.analysis.core.WhitespaceAnalyzer; - import org.apache.lucene.facet.example.ExampleUtils; import org.apache.lucene.facet.taxonomy.CategoryPath; diff --git a/lucene/facet/src/java/org/apache/lucene/facet/associations/AssociationsCategoryListBuilder.java b/lucene/facet/src/java/org/apache/lucene/facet/associations/AssociationsCategoryListBuilder.java deleted file mode 100644 index c19898cbbb3..00000000000 --- a/lucene/facet/src/java/org/apache/lucene/facet/associations/AssociationsCategoryListBuilder.java +++ /dev/null @@ -1,92 +0,0 @@ -package org.apache.lucene.facet.associations; - -import java.io.IOException; -import java.util.HashMap; - -import org.apache.lucene.facet.index.CategoryListBuilder; -import org.apache.lucene.facet.index.params.CategoryListParams; -import org.apache.lucene.facet.index.params.FacetIndexingParams; -import org.apache.lucene.facet.taxonomy.CategoryPath; -import org.apache.lucene.facet.taxonomy.TaxonomyWriter; -import org.apache.lucene.store.ByteArrayDataOutput; -import org.apache.lucene.util.BytesRef; - -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -/** - * A {@link CategoryListBuilder} which encodes category-association value pairs - * in addition to regular counting list category ordinals. Every - * category-association pair is written under the respective association's - * {@link CategoryAssociation#getCategoryListID()}. 
- */ -public class AssociationsCategoryListBuilder extends CategoryListBuilder { - - private final CategoryAssociationsContainer associations; - private final HashMap perAssociationBytes = new HashMap(); - private final ByteArrayDataOutput output = new ByteArrayDataOutput(); - - public AssociationsCategoryListBuilder(CategoryAssociationsContainer associations, - CategoryListParams categoryListParams, FacetIndexingParams indexingParams, TaxonomyWriter taxoWriter) { - super(categoryListParams, indexingParams, taxoWriter); - this.associations = associations; - } - - @Override - public void handle(int ordinal, CategoryPath cp) throws IOException { - super.handle(ordinal, cp); - - // build per-association key BytesRef - CategoryAssociation association = associations.getAssociation(cp); - if (association == null) { - // it is ok to set a null association for a category - it's treated as a - // regular category in that case. - return; - } - - BytesRef bytes = perAssociationBytes.get(association.getCategoryListID()); - if (bytes == null) { - bytes = new BytesRef(); - perAssociationBytes.put(association.getCategoryListID(), bytes); - } - - int maxBytesNeeded = 4 /* int */ + association.maxBytesNeeded(); - if (bytes.bytes.length - bytes.length < maxBytesNeeded) { - bytes.grow(bytes.bytes.length + maxBytesNeeded); - } - - // reset the output to write from bytes.length (current position) until the end - output.reset(bytes.bytes, bytes.length, bytes.bytes.length - bytes.length); - output.writeInt(ordinal); - - // encode the association bytes - association.serialize(output); - - // update BytesRef - bytes.length = output.getPosition(); - } - - @Override - public HashMap finish() { - // build the ordinals list - HashMap result = super.finish(); - // add per association bytes - result.putAll(perAssociationBytes); - return result; - } - -} diff --git a/lucene/facet/src/java/org/apache/lucene/facet/associations/AssociationsFacetFields.java b/lucene/facet/src/java/org/apache/lucene/facet/associations/AssociationsFacetFields.java index 00f61e17149..1ed3985afef 100644 --- a/lucene/facet/src/java/org/apache/lucene/facet/associations/AssociationsFacetFields.java +++ b/lucene/facet/src/java/org/apache/lucene/facet/associations/AssociationsFacetFields.java @@ -7,7 +7,7 @@ import java.util.Map; import org.apache.lucene.document.Document; import org.apache.lucene.document.FieldType; import org.apache.lucene.document.TextField; -import org.apache.lucene.facet.index.CategoryListBuilder; +import org.apache.lucene.facet.index.CountingListBuilder; import org.apache.lucene.facet.index.DrillDownStream; import org.apache.lucene.facet.index.FacetFields; import org.apache.lucene.facet.index.params.CategoryListParams; @@ -15,6 +15,8 @@ import org.apache.lucene.facet.index.params.FacetIndexingParams; import org.apache.lucene.facet.taxonomy.CategoryPath; import org.apache.lucene.facet.taxonomy.TaxonomyWriter; import org.apache.lucene.index.FieldInfo.IndexOptions; +import org.apache.lucene.util.BytesRef; +import org.apache.lucene.util.IntsRef; /* * Licensed to the Apache Software Foundation (ASF) under one or more @@ -94,15 +96,16 @@ public class AssociationsFacetFields extends FacetFields { return categoryLists; } - /** - * Returns a {@link CategoryListBuilder} for encoding the given categories and - * associations. 
- */ @Override - protected CategoryListBuilder getCategoryListBuilder(CategoryListParams categoryListParams, - Iterable categories) { - return new AssociationsCategoryListBuilder((CategoryAssociationsContainer) categories, categoryListParams, - indexingParams, taxonomyWriter); + protected Map getCategoryListData(CategoryListParams categoryListParams, IntsRef ordinals, + Iterable categories) throws IOException { + AssociationsListBuilder associations = new AssociationsListBuilder((CategoryAssociationsContainer) categories); + CountingListBuilder counting = new CountingListBuilder(categoryListParams, indexingParams, taxonomyWriter); + // CountingListBuilder modifies the ordinals array, by e.g. adding parent ordinals, sorting etc. + // Therefore first build the associations list and only afterwards the counting list. + final Map res = associations.build(ordinals, categories); + res.putAll(counting.build(ordinals, categories)); + return res; } @Override diff --git a/lucene/facet/src/java/org/apache/lucene/facet/associations/AssociationsListBuilder.java b/lucene/facet/src/java/org/apache/lucene/facet/associations/AssociationsListBuilder.java new file mode 100644 index 00000000000..42a4218f323 --- /dev/null +++ b/lucene/facet/src/java/org/apache/lucene/facet/associations/AssociationsListBuilder.java @@ -0,0 +1,89 @@ +package org.apache.lucene.facet.associations; + +import java.io.IOException; +import java.util.HashMap; +import java.util.Map; + +import org.apache.lucene.facet.index.CategoryListBuilder; +import org.apache.lucene.facet.index.CountingListBuilder; +import org.apache.lucene.facet.taxonomy.CategoryPath; +import org.apache.lucene.store.ByteArrayDataOutput; +import org.apache.lucene.util.BytesRef; +import org.apache.lucene.util.IntsRef; + +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** + * A {@link AssociationsListBuilder} which encodes category-association value pairs. + * Every category-association pair is written under the respective association's + * {@link CategoryAssociation#getCategoryListID()}. + *
<p>
+ * NOTE: associations lists do not encode the counting list data. You + * should use {@link CountingListBuilder} to build that information and then + * merge the results of both {@link #build(IntsRef, Iterable)} calls. + */ +public class AssociationsListBuilder implements CategoryListBuilder { + + private final CategoryAssociationsContainer associations; + private final ByteArrayDataOutput output = new ByteArrayDataOutput(); + + public AssociationsListBuilder(CategoryAssociationsContainer associations) { + this.associations = associations; + } + + @Override + public Map build(IntsRef ordinals, Iterable categories) throws IOException { + final HashMap res = new HashMap(); + int idx = 0; + for (CategoryPath cp : categories) { + // build per-association key BytesRef + CategoryAssociation association = associations.getAssociation(cp); + + if (association == null) { + // it is ok to set a null association for a category - it's treated as a + // regular category in that case. + ++idx; + continue; + } + + BytesRef bytes = res.get(association.getCategoryListID()); + if (bytes == null) { + bytes = new BytesRef(32); + res.put(association.getCategoryListID(), bytes); + } + + int maxBytesNeeded = 4 /* int */ + association.maxBytesNeeded() + bytes.length; + if (bytes.bytes.length < maxBytesNeeded) { + bytes.grow(maxBytesNeeded); + } + + // reset the output to write from bytes.length (current position) until the end + output.reset(bytes.bytes, bytes.length, bytes.bytes.length - bytes.length); + output.writeInt(ordinals.ints[idx++]); + + // encode the association bytes + association.serialize(output); + + // update BytesRef + bytes.length = output.getPosition(); + } + + return res; + } + +} diff --git a/lucene/facet/src/java/org/apache/lucene/facet/associations/AssociationsPayloadIterator.java b/lucene/facet/src/java/org/apache/lucene/facet/associations/AssociationsPayloadIterator.java index d5ffcd7632d..6ced1c70ce8 100644 --- a/lucene/facet/src/java/org/apache/lucene/facet/associations/AssociationsPayloadIterator.java +++ b/lucene/facet/src/java/org/apache/lucene/facet/associations/AssociationsPayloadIterator.java @@ -53,20 +53,22 @@ public abstract class AssociationsPayloadIterator } /** - * Skip to the requested document. Returns true iff the document has categort - association values and they were read successfully. + * Skip to the requested document. Returns true iff the document has category + * association values and they were read successfully. Associations are + * handled through {@link #handleAssociation(int, CategoryAssociation)} by + * subclasses.
*/ - public boolean setNextDoc(int docId) throws IOException { + protected final boolean setNextDoc(int docID) throws IOException { if (!hasAssociations) { // there are no associations at all return false; } - if (!pi.setdoc(docId)) { // no associations for the requested document + BytesRef bytes = pi.getPayload(docID); + if (bytes == null) { // no associations for the requested document return false; } - - BytesRef associations = pi.getPayload(); - ByteArrayDataInput in = new ByteArrayDataInput(associations.bytes, associations.offset, associations.length); + + ByteArrayDataInput in = new ByteArrayDataInput(bytes.bytes, bytes.offset, bytes.length); while (!in.eof()) { int ordinal = in.readInt(); association.deserialize(in); diff --git a/lucene/facet/src/java/org/apache/lucene/facet/associations/CategoryAssociationsContainer.java b/lucene/facet/src/java/org/apache/lucene/facet/associations/CategoryAssociationsContainer.java index 27b3e58470b..67e6c4c8e90 100644 --- a/lucene/facet/src/java/org/apache/lucene/facet/associations/CategoryAssociationsContainer.java +++ b/lucene/facet/src/java/org/apache/lucene/facet/associations/CategoryAssociationsContainer.java @@ -55,5 +55,10 @@ public class CategoryAssociationsContainer implements Iterable { public void clear() { categoryAssociations.clear(); } + + @Override + public String toString() { + return categoryAssociations.toString(); + } } diff --git a/lucene/facet/src/java/org/apache/lucene/facet/associations/CategoryFloatAssociation.java b/lucene/facet/src/java/org/apache/lucene/facet/associations/CategoryFloatAssociation.java index bd826dfb60c..459e523e15d 100644 --- a/lucene/facet/src/java/org/apache/lucene/facet/associations/CategoryFloatAssociation.java +++ b/lucene/facet/src/java/org/apache/lucene/facet/associations/CategoryFloatAssociation.java @@ -71,5 +71,10 @@ public class CategoryFloatAssociation implements CategoryAssociation { public float getValue() { return value; } + + @Override + public String toString() { + return getClass().getSimpleName() + "(" + value + ")"; + } } diff --git a/lucene/facet/src/java/org/apache/lucene/facet/associations/CategoryIntAssociation.java b/lucene/facet/src/java/org/apache/lucene/facet/associations/CategoryIntAssociation.java index 6758d7d00fc..1d3b691b0d3 100644 --- a/lucene/facet/src/java/org/apache/lucene/facet/associations/CategoryIntAssociation.java +++ b/lucene/facet/src/java/org/apache/lucene/facet/associations/CategoryIntAssociation.java @@ -72,4 +72,9 @@ public class CategoryIntAssociation implements CategoryAssociation { return value; } + @Override + public String toString() { + return getClass().getSimpleName() + "(" + value + ")"; + } + } diff --git a/lucene/facet/src/java/org/apache/lucene/facet/associations/FloatAssociationsPayloadIterator.java b/lucene/facet/src/java/org/apache/lucene/facet/associations/FloatAssociationsPayloadIterator.java index 958dfb49ff7..0708910523d 100644 --- a/lucene/facet/src/java/org/apache/lucene/facet/associations/FloatAssociationsPayloadIterator.java +++ b/lucene/facet/src/java/org/apache/lucene/facet/associations/FloatAssociationsPayloadIterator.java @@ -40,23 +40,17 @@ public class FloatAssociationsPayloadIterator extends AssociationsPayloadIterato protected void handleAssociation(int ordinal, CategoryFloatAssociation association) { ordinalAssociations.put(ordinal, association.getValue()); } - - @Override - public boolean setNextDoc(int docId) throws IOException { - ordinalAssociations.clear(); - return super.setNextDoc(docId); - } /** - * Get the float 
association value for the given ordinal, or - * {@link Float#NaN} in case the ordinal has no association value. + * Returns the float association values of the categories that are associated + * with the given document, or {@code null} if the document has no + * associations. + *
<p>
+ * NOTE: you are not expected to modify the returned map. */ - public float getAssociation(int ordinal) { - if (ordinalAssociations.containsKey(ordinal)) { - return ordinalAssociations.get(ordinal); - } - - return Float.NaN; + public IntToFloatMap getAssociations(int docID) throws IOException { + ordinalAssociations.clear(); + return setNextDoc(docID) ? ordinalAssociations : null; } - + } diff --git a/lucene/facet/src/java/org/apache/lucene/facet/associations/IntAssociationsPayloadIterator.java b/lucene/facet/src/java/org/apache/lucene/facet/associations/IntAssociationsPayloadIterator.java index ad983114084..e3bed4f51e5 100644 --- a/lucene/facet/src/java/org/apache/lucene/facet/associations/IntAssociationsPayloadIterator.java +++ b/lucene/facet/src/java/org/apache/lucene/facet/associations/IntAssociationsPayloadIterator.java @@ -31,12 +31,6 @@ public class IntAssociationsPayloadIterator extends AssociationsPayloadIterator< private final IntToIntMap ordinalAssociations = new IntToIntMap(); - /** - * The long-special-value returned for ordinals which have no associated int - * value. It is not in the int range of values making it a valid mark. - */ - public final static long NO_ASSOCIATION = Integer.MAX_VALUE + 1; - public IntAssociationsPayloadIterator(IndexReader reader, String field, CategoryIntAssociation association) throws IOException { super(reader, field, association); @@ -47,22 +41,16 @@ public class IntAssociationsPayloadIterator extends AssociationsPayloadIterator< ordinalAssociations.put(ordinal, association.getValue()); } - @Override - public boolean setNextDoc(int docId) throws IOException { - ordinalAssociations.clear(); - return super.setNextDoc(docId); - } - /** - * Get the integer association value for the given ordinal, or - * {@link #NO_ASSOCIATION} in case the ordinal has no association value. + * Returns the integer association values of the categories that are + * associated with the given document, or {@code null} if the document has no + * associations. + *
<p>
+ * NOTE: you are not expected to modify the returned map. */ - public long getAssociation(int ordinal) { - if (ordinalAssociations.containsKey(ordinal)) { - return ordinalAssociations.get(ordinal); - } - - return NO_ASSOCIATION; + public IntToIntMap getAssociations(int docID) throws IOException { + ordinalAssociations.clear(); + return setNextDoc(docID) ? ordinalAssociations : null; } } diff --git a/lucene/facet/src/java/org/apache/lucene/facet/index/CategoryListBuilder.java b/lucene/facet/src/java/org/apache/lucene/facet/index/CategoryListBuilder.java index 509c0c61684..0bea43235b4 100644 --- a/lucene/facet/src/java/org/apache/lucene/facet/index/CategoryListBuilder.java +++ b/lucene/facet/src/java/org/apache/lucene/facet/index/CategoryListBuilder.java @@ -1,19 +1,11 @@ package org.apache.lucene.facet.index; import java.io.IOException; -import java.util.HashMap; -import java.util.Map.Entry; +import java.util.Map; -import org.apache.lucene.facet.index.categorypolicy.OrdinalPolicy; -import org.apache.lucene.facet.index.params.CategoryListParams; -import org.apache.lucene.facet.index.params.FacetIndexingParams; import org.apache.lucene.facet.taxonomy.CategoryPath; -import org.apache.lucene.facet.taxonomy.TaxonomyWriter; -import org.apache.lucene.facet.util.PartitionsUtils; import org.apache.lucene.util.BytesRef; -import org.apache.lucene.util.IOUtils; -import org.apache.lucene.util.UnsafeByteArrayOutputStream; -import org.apache.lucene.util.encoding.IntEncoder; +import org.apache.lucene.util.IntsRef; /* * Licensed to the Apache Software Foundation (ASF) under one or more @@ -33,149 +25,14 @@ import org.apache.lucene.util.encoding.IntEncoder; */ /** - * Builds a category list by encoding the category ordinals into one or more - * {@link BytesRef}. Each {@link BytesRef} corresponds to a set of ordinals that - * belong to the same partition. When partitions are not enabled (i.e. - * {@link FacetIndexingParams#getPartitionSize()} returns - * {@link Integer#MAX_VALUE}), only one {@link BytesRef} is returned by this - * class. + * Builds category list data by encoding the appropriate information for every + * category and ordinal given to {@link #build(IntsRef, Iterable)}. + * + * @lucene.experimental */ -public class CategoryListBuilder { - - /** Specializes encoding ordinals when partitions are enabled/disabled.
*/ - private static abstract class OrdinalsEncoder { - OrdinalsEncoder() {} - public abstract void encode(int ordinal); - public abstract HashMap finish(); - } +public interface CategoryListBuilder { - private static final class NoPartitionsOrdinalsEncoder extends OrdinalsEncoder { - - private final IntEncoder encoder; - private final UnsafeByteArrayOutputStream ubaos; - private final String name; - - NoPartitionsOrdinalsEncoder(CategoryListParams categoryListParams) { - name = categoryListParams.getTerm().text(); - encoder = categoryListParams.createEncoder(); - ubaos = new UnsafeByteArrayOutputStream(); - encoder.reInit(ubaos); - } - - @Override - public void encode(int ordinal) { - try { - encoder.encode(ordinal); - } catch (IOException e) { - // shouldn't happen as we're writing to byte[] - throw new RuntimeException("unexpected exception", e); - } - } - - @Override - public HashMap finish() { - try { - encoder.close(); - } catch (IOException e) { - // shouldn't happen as we're writing to byte[] - throw new RuntimeException("unexpected exception", e); - } - HashMap result = new HashMap(); - result.put(name, new BytesRef(ubaos.toByteArray(), ubaos.getStartPos(), ubaos.length())); - return result; - } - - } - - private static final class PerPartitionOrdinalsEncoder extends OrdinalsEncoder { - - private final FacetIndexingParams indexingParams; - private final CategoryListParams categoryListParams; - private final int partitionSize; - private final HashMap partitionEncoder = new HashMap(); - private final HashMap partitionBytes = new HashMap(); - - PerPartitionOrdinalsEncoder(FacetIndexingParams indexingParams, CategoryListParams categoryListParams) { - this.indexingParams = indexingParams; - this.categoryListParams = categoryListParams; - this.partitionSize = indexingParams.getPartitionSize(); - } - - @Override - public void encode(int ordinal) { - final String name = PartitionsUtils.partitionNameByOrdinal(indexingParams, categoryListParams, ordinal); - IntEncoder encoder = partitionEncoder.get(name); - if (encoder == null) { - encoder = categoryListParams.createEncoder(); - final UnsafeByteArrayOutputStream ubaos = new UnsafeByteArrayOutputStream(); - encoder.reInit(ubaos); - partitionEncoder.put(name, encoder); - partitionBytes.put(name, ubaos); - } - try { - encoder.encode(ordinal % partitionSize); - } catch (IOException e) { - // shouldn't happen as we're writing to byte[] - throw new RuntimeException("unexpected exception", e); - } - } - - @Override - public HashMap finish() { - // finish encoding - IOUtils.closeWhileHandlingException(partitionEncoder.values()); - - HashMap bytes = new HashMap(); - for (Entry e : partitionBytes.entrySet()) { - UnsafeByteArrayOutputStream ubaos = e.getValue(); - bytes.put(e.getKey(), new BytesRef(ubaos.toByteArray(), ubaos.getStartPos(), ubaos.length())); - } - return bytes; - } - - } - - private final TaxonomyWriter taxoWriter; - private final OrdinalsEncoder ordinalsEncoder; - private final OrdinalPolicy ordinalPolicy; - - public CategoryListBuilder(CategoryListParams categoryListParams, FacetIndexingParams indexingParams, - TaxonomyWriter taxoWriter) { - this.taxoWriter = taxoWriter; - this.ordinalPolicy = indexingParams.getOrdinalPolicy(); - if (indexingParams.getPartitionSize() == Integer.MAX_VALUE) { - ordinalsEncoder = new NoPartitionsOrdinalsEncoder(categoryListParams); - } else { - ordinalsEncoder = new PerPartitionOrdinalsEncoder(indexingParams, categoryListParams); - } - } - - /** - * Encodes the given ordinal as well as any of its parent 
ordinals (per - * {@link OrdinalPolicy}). - */ - public void handle(int ordinal, CategoryPath cp) throws IOException { - ordinalsEncoder.encode(ordinal); - - // add all parent ordinals, per OrdinalPolicy - int parent = taxoWriter.getParent(ordinal); - while (parent > 0) { - if (ordinalPolicy.shouldAdd(parent)) { - ordinalsEncoder.encode(parent); - } - parent = taxoWriter.getParent(parent); - } - } - - /** - * Returns the encoded ordinals data. Every returned {@link BytesRef} - * corresponds to a single partition (as defined by - * {@link FacetIndexingParams#getPartitionSize()}) and the key denotes the - * partition ID. When no partitions are defined, the returned map includes - * only one value. - */ - public HashMap finish() { - return ordinalsEncoder.finish(); - } + /** Returns the encoded ordinals data. */ + public Map build(IntsRef ordinals, Iterable categories) throws IOException; } diff --git a/lucene/facet/src/java/org/apache/lucene/facet/index/CountingListBuilder.java b/lucene/facet/src/java/org/apache/lucene/facet/index/CountingListBuilder.java new file mode 100644 index 00000000000..29b87b3f7b2 --- /dev/null +++ b/lucene/facet/src/java/org/apache/lucene/facet/index/CountingListBuilder.java @@ -0,0 +1,160 @@ +package org.apache.lucene.facet.index; + +import java.io.IOException; +import java.util.Collections; +import java.util.HashMap; +import java.util.Map; +import java.util.Map.Entry; + +import org.apache.lucene.facet.index.categorypolicy.OrdinalPolicy; +import org.apache.lucene.facet.index.params.CategoryListParams; +import org.apache.lucene.facet.index.params.FacetIndexingParams; +import org.apache.lucene.facet.taxonomy.CategoryPath; +import org.apache.lucene.facet.taxonomy.TaxonomyWriter; +import org.apache.lucene.facet.util.PartitionsUtils; +import org.apache.lucene.util.BytesRef; +import org.apache.lucene.util.IntsRef; +import org.apache.lucene.util.encoding.IntEncoder; + +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** + * A {@link CategoryListBuilder} which builds a counting list data by encoding + * the category ordinals into one or more {@link BytesRef}. Each + * {@link BytesRef} corresponds to a set of ordinals that belong to the same + * partition. When partitions are not enabled (i.e. + * {@link FacetIndexingParams#getPartitionSize()} returns + * {@link Integer#MAX_VALUE}), only one {@link BytesRef} is returned by this + * class. + *
<p>
+ * Counting lists are used usually for computing the weight of categories by + * summing their number of occurrences (hence counting) in a result set. + */ +public class CountingListBuilder implements CategoryListBuilder { + + /** Specializes encoding ordinals when partitions are enabled/disabled. */ + private static abstract class OrdinalsEncoder { + OrdinalsEncoder() {} + public abstract Map encode(IntsRef ordinals); + } + + private static final class NoPartitionsOrdinalsEncoder extends OrdinalsEncoder { + + private final IntEncoder encoder; + private final String name; + + NoPartitionsOrdinalsEncoder(CategoryListParams categoryListParams) { + name = categoryListParams.getTerm().text(); + encoder = categoryListParams.createEncoder(); + } + + @Override + public Map encode(IntsRef ordinals) { + final BytesRef bytes = new BytesRef(128); // should be enough for most common applications + encoder.encode(ordinals, bytes); + return Collections.singletonMap(name, bytes); + } + + } + + private static final class PerPartitionOrdinalsEncoder extends OrdinalsEncoder { + + private final FacetIndexingParams indexingParams; + private final CategoryListParams categoryListParams; + private final int partitionSize; + private final HashMap partitionEncoder = new HashMap(); + + PerPartitionOrdinalsEncoder(FacetIndexingParams indexingParams, CategoryListParams categoryListParams) { + this.indexingParams = indexingParams; + this.categoryListParams = categoryListParams; + this.partitionSize = indexingParams.getPartitionSize(); + } + + @Override + public HashMap encode(IntsRef ordinals) { + // build the partitionOrdinals map + final HashMap partitionOrdinals = new HashMap(); + for (int i = 0; i < ordinals.length; i++) { + int ordinal = ordinals.ints[i]; + final String name = PartitionsUtils.partitionNameByOrdinal(indexingParams, categoryListParams, ordinal); + IntsRef partitionOrds = partitionOrdinals.get(name); + if (partitionOrds == null) { + partitionOrds = new IntsRef(32); + partitionOrdinals.put(name, partitionOrds); + partitionEncoder.put(name, categoryListParams.createEncoder()); + } + partitionOrds.ints[partitionOrds.length++] = ordinal % partitionSize; + } + + HashMap partitionBytes = new HashMap(); + for (Entry e : partitionOrdinals.entrySet()) { + String name = e.getKey(); + final IntEncoder encoder = partitionEncoder.get(name); + final BytesRef bytes = new BytesRef(128); // should be enough for most common applications + encoder.encode(e.getValue(), bytes); + partitionBytes.put(name, bytes); + } + return partitionBytes; + } + + } + + private final OrdinalsEncoder ordinalsEncoder; + private final TaxonomyWriter taxoWriter; + private final OrdinalPolicy ordinalPolicy; + + public CountingListBuilder(CategoryListParams categoryListParams, FacetIndexingParams indexingParams, + TaxonomyWriter taxoWriter) { + this.taxoWriter = taxoWriter; + this.ordinalPolicy = indexingParams.getOrdinalPolicy(); + if (indexingParams.getPartitionSize() == Integer.MAX_VALUE) { + ordinalsEncoder = new NoPartitionsOrdinalsEncoder(categoryListParams); + } else { + ordinalsEncoder = new PerPartitionOrdinalsEncoder(indexingParams, categoryListParams); + } + } + + /** + * Every returned {@link BytesRef} corresponds to a single partition (as + * defined by {@link FacetIndexingParams#getPartitionSize()}) and the key + * denotes the partition ID. When no partitions are defined, the returned map + * contains only one value. + *
<p>
+ * NOTE: the {@code ordinals} array is modified by adding parent + * ordinals to it. Also, some encoders may sort the array and remove duplicate + * ordinals. Therefore you may want to invoke this method after you have finished + * processing the array for other purposes. + */ + @Override + public Map build(IntsRef ordinals, Iterable categories) throws IOException { + int upto = ordinals.length; // since we add ordinals to IntsRef, iterate up to original length + + for (int i = 0; i < upto; i++) { + int ordinal = ordinals.ints[i]; + int parent = taxoWriter.getParent(ordinal); + while (parent > 0) { + if (ordinalPolicy.shouldAdd(parent)) { + ordinals.ints[ordinals.length++] = parent; + } + parent = taxoWriter.getParent(parent); + } + } + return ordinalsEncoder.encode(ordinals); + } + +} diff --git a/lucene/facet/src/java/org/apache/lucene/facet/index/FacetFields.java b/lucene/facet/src/java/org/apache/lucene/facet/index/FacetFields.java index 07249aaec99..9fda2396ee5 100644 --- a/lucene/facet/src/java/org/apache/lucene/facet/index/FacetFields.java +++ b/lucene/facet/src/java/org/apache/lucene/facet/index/FacetFields.java @@ -2,6 +2,7 @@ package org.apache.lucene.facet.index; import java.io.IOException; import java.util.ArrayList; +import java.util.Collections; import java.util.HashMap; import java.util.Iterator; import java.util.List; @@ -21,6 +22,7 @@ import org.apache.lucene.facet.taxonomy.CategoryPath; import org.apache.lucene.facet.taxonomy.TaxonomyWriter; import org.apache.lucene.index.FieldInfo.IndexOptions; import org.apache.lucene.util.BytesRef; +import org.apache.lucene.util.IntsRef; /* * Licensed to the Apache Software Foundation (ASF) under one or more @@ -69,7 +71,7 @@ public class FacetFields { return true; } - void setCategoriesData(HashMap categoriesData) { + void setCategoriesData(Map categoriesData) { this.categoriesData = categoriesData.entrySet().iterator(); } @@ -132,6 +134,9 @@ */ protected Map> createCategoryListMapping( Iterable categories) { + if (indexingParams.getAllCategoryListParams().size() == 1) { + return Collections.singletonMap(indexingParams.getCategoryListParams(null), categories); + } HashMap> categoryLists = new HashMap>(); for (CategoryPath cp : categories) { @@ -147,10 +152,15 @@ return categoryLists; } - /** Returns a {@link CategoryListBuilder} for encoding the given categories. */ - protected CategoryListBuilder getCategoryListBuilder(CategoryListParams categoryListParams, - Iterable categories /* needed for AssociationsFacetFields */) { - return new CategoryListBuilder(categoryListParams, indexingParams, taxonomyWriter); + /** + * Returns the category list data, as a mapping from key to {@link BytesRef} + * which includes the encoded data. Every ordinal in {@code ordinals} + * corresponds to a {@link CategoryPath} returned from {@code categories}. + */ + protected Map getCategoryListData(CategoryListParams categoryListParams, + IntsRef ordinals, Iterable categories /* needed for AssociationsFacetFields */) + throws IOException { + return new CountingListBuilder(categoryListParams, indexingParams, taxonomyWriter).build(ordinals, categories); } /** @@ -185,17 +195,25 @@ // for each CLP we add a different field for drill-down terms as well as for // counting list data.
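Because CountingListBuilder.build() appends parents (and its encoders may sort or dedupe), the loop below intentionally collects leaf ordinals only. A small sketch of that contract, assuming the default OrdinalPolicy that adds all non-root parents; the category path is illustrative:

// Hedged sketch: effect of CountingListBuilder.build() on a leaf-only IntsRef.
CategoryPath cp = new CategoryPath("date", "2010", "march");
IntsRef ordinals = new IntsRef(32);
ordinals.ints[ordinals.length++] = taxonomyWriter.addCategory(cp); // leaf only
CountingListBuilder clb = new CountingListBuilder(clp, indexingParams, taxonomyWriter);
Map<String,BytesRef> data = clb.build(ordinals, Collections.singletonList(cp));
// ordinals now also holds the parents "date" and "date/2010", and data maps
// each category list name to its bulk-encoded ordinals.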
+ IntsRef ordinals = new IntsRef(32); // should be enough for most common applications for (Entry> e : categoryLists.entrySet()) { final CategoryListParams clp = e.getKey(); final String field = clp.getTerm().field(); - // add the counting list data - CategoryListBuilder categoriesPayloadBuilder = getCategoryListBuilder(clp, e.getValue()); + // build category list data + ordinals.length = 0; // reset + int maxNumOrds = 0; for (CategoryPath cp : e.getValue()) { int ordinal = taxonomyWriter.addCategory(cp); - categoriesPayloadBuilder.handle(ordinal , cp); + maxNumOrds += cp.length; // ordinal and potentially all parents + if (ordinals.ints.length < maxNumOrds) { + ordinals.grow(maxNumOrds); + } + ordinals.ints[ordinals.length++] = ordinal; } - HashMap categoriesData = categoriesPayloadBuilder.finish(); + Map categoriesData = getCategoryListData(clp, ordinals, e.getValue()); + + // add the counting list data CountingListStream ts = new CountingListStream(); ts.setCategoriesData(categoriesData); doc.add(new Field(field, ts, COUNTING_LIST_PAYLOAD_TYPE)); diff --git a/lucene/facet/src/java/org/apache/lucene/facet/index/OrdinalMappingAtomicReader.java b/lucene/facet/src/java/org/apache/lucene/facet/index/OrdinalMappingAtomicReader.java index e2f6a5acd88..522f383e4d4 100644 --- a/lucene/facet/src/java/org/apache/lucene/facet/index/OrdinalMappingAtomicReader.java +++ b/lucene/facet/src/java/org/apache/lucene/facet/index/OrdinalMappingAtomicReader.java @@ -17,10 +17,7 @@ package org.apache.lucene.facet.index; * limitations under the License. */ -import java.io.ByteArrayInputStream; -import java.io.ByteArrayOutputStream; import java.io.IOException; -import java.io.InputStream; import java.util.HashMap; import java.util.Map; @@ -36,6 +33,7 @@ import org.apache.lucene.index.Terms; import org.apache.lucene.index.TermsEnum; import org.apache.lucene.util.Bits; import org.apache.lucene.util.BytesRef; +import org.apache.lucene.util.IntsRef; import org.apache.lucene.util.encoding.IntDecoder; import org.apache.lucene.util.encoding.IntEncoder; @@ -187,7 +185,7 @@ public class OrdinalMappingAtomicReader extends FilterAtomicReader { private class OrdinalMappingDocsAndPositionsEnum extends FilterDocsAndPositionsEnum { private final IntEncoder encoder; private final IntDecoder decoder; - private final ByteArrayOutputStream os = new ByteArrayOutputStream(); + private final IntsRef ordinals = new IntsRef(32); private final BytesRef payloadOut = new BytesRef(); public OrdinalMappingDocsAndPositionsEnum(DocsAndPositionsEnum in, CategoryListParams params) { @@ -202,21 +200,14 @@ public class OrdinalMappingAtomicReader extends FilterAtomicReader { if (payload == null) { return payload; } else { - InputStream is = new ByteArrayInputStream(payload.bytes, payload.offset, payload.length); - decoder.reInit(is); - os.reset(); - encoder.reInit(os); - long ordinal; - while ((ordinal = decoder.decode()) != IntDecoder.EOS) { - int newOrdinal = ordinalMap[(int)ordinal]; - encoder.encode(newOrdinal); + decoder.decode(payload, ordinals); + + // map the ordinals + for (int i = 0; i < ordinals.length; i++) { + ordinals.ints[i] = ordinalMap[ordinals.ints[i]]; } - encoder.close(); - // TODO (Facet): avoid copy? 
- byte out[] = os.toByteArray(); - payloadOut.bytes = out; - payloadOut.offset = 0; - payloadOut.length = out.length; + + encoder.encode(ordinals, payloadOut); return payloadOut; } } diff --git a/lucene/facet/src/java/org/apache/lucene/facet/index/params/CategoryListParams.java b/lucene/facet/src/java/org/apache/lucene/facet/index/params/CategoryListParams.java index 349da82d7ef..576b9be077a 100644 --- a/lucene/facet/src/java/org/apache/lucene/facet/index/params/CategoryListParams.java +++ b/lucene/facet/src/java/org/apache/lucene/facet/index/params/CategoryListParams.java @@ -7,7 +7,7 @@ import org.apache.lucene.index.IndexReader; import org.apache.lucene.index.Term; import org.apache.lucene.facet.search.CategoryListIterator; -import org.apache.lucene.facet.search.PayloadIntDecodingIterator; +import org.apache.lucene.facet.search.PayloadCategoryListIteraor; import org.apache.lucene.facet.search.TotalFacetCounts; import org.apache.lucene.facet.util.PartitionsUtils; import org.apache.lucene.util.encoding.DGapIntEncoder; @@ -142,7 +142,7 @@ public class CategoryListParams implements Serializable { int partition) throws IOException { String categoryListTermStr = PartitionsUtils.partitionName(this, partition); Term payloadTerm = new Term(term.field(), categoryListTermStr); - return new PayloadIntDecodingIterator(reader, payloadTerm, + return new PayloadCategoryListIteraor(reader, payloadTerm, createEncoder().createMatchingDecoder()); } diff --git a/lucene/facet/src/java/org/apache/lucene/facet/index/params/FacetIndexingParams.java b/lucene/facet/src/java/org/apache/lucene/facet/index/params/FacetIndexingParams.java index 62502e75443..3fca0e3a4aa 100644 --- a/lucene/facet/src/java/org/apache/lucene/facet/index/params/FacetIndexingParams.java +++ b/lucene/facet/src/java/org/apache/lucene/facet/index/params/FacetIndexingParams.java @@ -83,18 +83,9 @@ public class FacetIndexingParams { } /** - * The name of the category list to put this category in, or {@code null} if - * this category should not be aggregatable. - *
 * <p>
- * By default, all categories are written to the same category list, but - * applications which know in advance that in some situations only parts of - * the category hierarchy needs to be counted can divide the categories into - * two or more different category lists. - *
 * <p>
- * If {@code null} is returned for a category, it means that this category - * should not appear in any category list, and thus weights for it cannot be - * aggregated. This category can still be used for drill-down, even though the - * its weight is unknown. + * Returns the {@link CategoryListParams} for this {@link CategoryPath}. The + * default implementation returns the same {@link CategoryListParams} for all + * categories (even if {@code category} is {@code null}). * * @see PerDimensionIndexingParams */ diff --git a/lucene/facet/src/java/org/apache/lucene/facet/index/params/PerDimensionIndexingParams.java b/lucene/facet/src/java/org/apache/lucene/facet/index/params/PerDimensionIndexingParams.java index 1ccb58d309f..c4b971a5ca9 100644 --- a/lucene/facet/src/java/org/apache/lucene/facet/index/params/PerDimensionIndexingParams.java +++ b/lucene/facet/src/java/org/apache/lucene/facet/index/params/PerDimensionIndexingParams.java @@ -78,7 +78,9 @@ public class PerDimensionIndexingParams extends FacetIndexingParams { /** * Returns the {@link CategoryListParams} for the corresponding dimension - * which is returned by {@code category.getComponent(0)}. + * which is returned by {@code category.getComponent(0)}. If {@code category} + * is {@code null}, or was not specified in the map given to the constructor, + * returns the default {@link CategoryListParams}. */ @Override public CategoryListParams getCategoryListParams(CategoryPath category) { diff --git a/lucene/facet/src/java/org/apache/lucene/facet/search/CategoryListIterator.java b/lucene/facet/src/java/org/apache/lucene/facet/search/CategoryListIterator.java index 79f89909030..5132e930264 100644 --- a/lucene/facet/src/java/org/apache/lucene/facet/search/CategoryListIterator.java +++ b/lucene/facet/src/java/org/apache/lucene/facet/search/CategoryListIterator.java @@ -2,6 +2,8 @@ package org.apache.lucene.facet.search; import java.io.IOException; +import org.apache.lucene.util.IntsRef; + /* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with @@ -20,20 +22,10 @@ import java.io.IOException; */ /** - * An interface for iterating over a "category list", i.e., the list of - * categories per document. + * An interface for obtaining the category ordinals of documents. *
<p>
- * NOTE:
- * <ul>
- * <li>This class operates as a key to a Map. Appropriate implementation of
- * hashCode() and equals() must be provided.
- * <li>{@link #init()} must be called before you consume any categories, or call
- * {@link #skipTo(int)}.
- * <li>{@link #skipTo(int)} must be called before any calls to
- * {@link #nextCategory()}.
- * <li>{@link #nextCategory()} returns values < {@link Integer#MAX_VALUE}, so
- * you can use it as a stop condition.
- * </ul>
+ * NOTE: this class operates as a key to a map, and therefore you should + * implement {@code equals()} and {@code hashCode()} for proper behavior. * * @lucene.experimental */ @@ -41,29 +33,20 @@ public interface CategoryListIterator { /** * Initializes the iterator. This method must be called before any calls to - * {@link #skipTo(int)}, and its return value indicates whether there are - * any relevant documents for this iterator. If it returns false, any call - * to {@link #skipTo(int)} will return false as well.
- * NOTE: calling this method twice may result in skipping over - * documents for some implementations. Also, calling it again after all - * documents were consumed may yield unexpected behavior. + * {@link #getOrdinals(int, IntsRef)}, and its return value indicates whether there are + * any relevant documents for this iterator. */ public boolean init() throws IOException; /** - * Skips forward to document docId. Returns true iff this document exists - * and has any categories. This method must be called before calling - * {@link #nextCategory()} for a particular document.
- * NOTE: Users should call this method with increasing docIds, and - * implementations can assume that this is the case. + * Stores the category ordinals of the given document ID in the given + * {@link IntsRef}, starting at position 0 up to {@link IntsRef#length}. Grows + * the {@link IntsRef} if it is not large enough. + * + *

+ * NOTE: if the requested document does not have category ordinals + * associated with it, {@link IntsRef#length} is set to zero. */ - public boolean skipTo(int docId) throws IOException; - - /** - * Returns the next category for the current document that is set through - * {@link #skipTo(int)}, or a number higher than {@link Integer#MAX_VALUE}. - * No assumptions can be made on the order of the categories. - */ - public long nextCategory() throws IOException; - + public void getOrdinals(int docID, IntsRef ints) throws IOException; + } diff --git a/lucene/facet/src/java/org/apache/lucene/facet/search/PayloadIntDecodingIterator.java b/lucene/facet/src/java/org/apache/lucene/facet/search/PayloadCategoryListIteraor.java similarity index 52% rename from lucene/facet/src/java/org/apache/lucene/facet/search/PayloadIntDecodingIterator.java rename to lucene/facet/src/java/org/apache/lucene/facet/search/PayloadCategoryListIteraor.java index db4803e1019..3deba112f3e 100644 --- a/lucene/facet/src/java/org/apache/lucene/facet/search/PayloadIntDecodingIterator.java +++ b/lucene/facet/src/java/org/apache/lucene/facet/search/PayloadCategoryListIteraor.java @@ -5,7 +5,7 @@ import java.io.IOException; import org.apache.lucene.index.IndexReader; import org.apache.lucene.index.Term; import org.apache.lucene.util.BytesRef; -import org.apache.lucene.util.UnsafeByteArrayInputStream; +import org.apache.lucene.util.IntsRef; import org.apache.lucene.util.encoding.IntDecoder; /* * Licensed to the Apache Software Foundation (ASF) under one or more @@ -26,44 +26,21 @@ import org.apache.lucene.util.encoding.IntDecoder; */ /** - * A payload deserializer comes with its own working space (buffer). One need to - * define the {@link IndexReader} and {@link Term} in which the payload resides. - * The iterator then consumes the payload information of each document and - * decodes it into categories. A typical use case of this class is: - * - *

- * IndexReader reader = [open your reader];
- * Term t = new Term("field", "where-payload-exists");
- * CategoryListIterator cli = new PayloadIntDecodingIterator(reader, t);
- * if (!cli.init()) {
- *   // it means there are no payloads / documents associated with that term.
- *   // Usually a sanity check. However, init() must be called.
- * }
- * DocIdSetIterator disi = [you usually iterate on something else, such as a Scorer];
- * int doc;
- * while ((doc = disi.nextDoc()) != DocIdSetIterator.NO_MORE_DOCS) {
- *   cli.setdoc(doc);
- *   long category;
- *   while ((category = cli.nextCategory()) < Integer.MAX_VALUE) {
- *   }
- * }
- * 
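With the bulk API introduced by this change, the per-category loop above collapses into one getOrdinals() call per document. A rough sketch of the equivalent flow (bracketed parts are placeholders, as in the removed snippet; the decoder choice here is an assumption and must mirror the encoder that was used at indexing time):

<pre>
IndexReader reader = [open your reader];
Term t = new Term("field", "where-payload-exists");
// assumed decoder; it must match the encoder used when the payloads were written
IntDecoder decoder = new DGapIntEncoder(new VInt8IntEncoder()).createMatchingDecoder();
CategoryListIterator cli = new PayloadCategoryListIteraor(reader, t, decoder);
if (!cli.init()) {
  // no payloads / documents associated with that term.
  // Usually a sanity check. However, init() must be called.
}
DocIdSetIterator disi = [you usually iterate on something else, such as a Scorer];
IntsRef ordinals = new IntsRef(32); // reused across all documents
int doc;
while ((doc = disi.nextDoc()) != DocIdSetIterator.NO_MORE_DOCS) {
  cli.getOrdinals(doc, ordinals);
  for (int i = 0; i < ordinals.length; i++) {
    int ordinal = ordinals.ints[i];
    // consume the ordinal
  }
}
</pre>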
+ * A {@link CategoryListIterator} which reads the category ordinals from a + * payload. * * @lucene.experimental */ -public class PayloadIntDecodingIterator implements CategoryListIterator { +public class PayloadCategoryListIteraor implements CategoryListIterator { - private final UnsafeByteArrayInputStream ubais; private final IntDecoder decoder; - private final IndexReader indexReader; private final Term term; private final PayloadIterator pi; private final int hashCode; - public PayloadIntDecodingIterator(IndexReader indexReader, Term term, IntDecoder decoder) throws IOException { + public PayloadCategoryListIteraor(IndexReader indexReader, Term term, IntDecoder decoder) throws IOException { pi = new PayloadIterator(indexReader, term); - ubais = new UnsafeByteArrayInputStream(); this.decoder = decoder; hashCode = indexReader.hashCode() ^ term.hashCode(); this.term = term; @@ -72,10 +49,10 @@ public class PayloadIntDecodingIterator implements CategoryListIterator { @Override public boolean equals(Object other) { - if (!(other instanceof PayloadIntDecodingIterator)) { + if (!(other instanceof PayloadCategoryListIteraor)) { return false; } - PayloadIntDecodingIterator that = (PayloadIntDecodingIterator) other; + PayloadCategoryListIteraor that = (PayloadCategoryListIteraor) other; if (hashCode != that.hashCode) { return false; } @@ -95,21 +72,12 @@ public class PayloadIntDecodingIterator implements CategoryListIterator { } @Override - public long nextCategory() throws IOException { - return decoder.decode(); - } - - @Override - public boolean skipTo(int docId) throws IOException { - if (!pi.setdoc(docId)) { - return false; + public void getOrdinals(int docID, IntsRef ints) throws IOException { + ints.length = 0; + BytesRef payload = pi.getPayload(docID); + if (payload != null) { + decoder.decode(payload, ints); } - - // Initializing the decoding mechanism with the new payload data - BytesRef data = pi.getPayload(); - ubais.reInit(data.bytes, data.offset, data.length + data.offset); - decoder.reInit(ubais); - return true; } } diff --git a/lucene/facet/src/java/org/apache/lucene/facet/search/PayloadIterator.java b/lucene/facet/src/java/org/apache/lucene/facet/search/PayloadIterator.java index a12f2cd91f6..7cc7527280d 100644 --- a/lucene/facet/src/java/org/apache/lucene/facet/search/PayloadIterator.java +++ b/lucene/facet/src/java/org/apache/lucene/facet/search/PayloadIterator.java @@ -34,9 +34,9 @@ import org.apache.lucene.util.BytesRef; * A utility class for iterating through a posting list of a given term and * retrieving the payload of the first position in every document. For * efficiency, this class does not check if documents passed to - * {@link #setdoc(int)} are deleted, since it is usually used to iterate on + * {@link #getPayload(int)} are deleted, since it is usually used to iterate on * payloads of documents that matched a query. If you need to skip over deleted - * documents, you should do so before calling {@link #setdoc(int)}. + * documents, you should do so before calling {@link #getPayload(int)}. * * @lucene.experimental */ @@ -84,8 +84,8 @@ public class PayloadIterator { /** * Initialize the iterator. Should be done before the first call to - * {@link #setdoc(int)}. Returns {@code false} if no category list is found, - * or the category list has no documents. + * {@link #getPayload(int)}. Returns {@code false} if no category list is + * found, or the category list has no documents. 
*/ public boolean init() throws IOException { nextSegment(); @@ -93,30 +93,29 @@ public class PayloadIterator { } /** - * Skip forward to document docId. Return true if this document exists and - * has any payload. - *

- * Users should call this method with increasing docIds, and implementations - * can assume that this is the case. + * Returns the {@link BytesRef payload} of the given document, or {@code null} + * if the document does not exist, there are no more documents in the posting + * list, or the document exists but has no payload. You should call + * {@link #init()} before the first call to this method. */ - public boolean setdoc(int docId) throws IOException { + public BytesRef getPayload(int docID) throws IOException { if (!hasMore) { - return false; + return null; } // re-basing docId->localDocID is done fewer times than currentDoc->globalDoc - int localDocID = docId - curDocBase; + int localDocID = docID - curDocBase; if (curDocID > localDocID) { // document does not exist - return false; + return null; } if (curDocID < localDocID) { // look for the document either in that segment, or others while (hasMore && (curDocID = currentDPE.advance(localDocID)) == DocIdSetIterator.NO_MORE_DOCS) { nextSegment(); // also updates curDocID - localDocID = docId - curDocBase; + localDocID = docID - curDocBase; // nextSegment advances to nextDoc, so check if we still need to advance if (curDocID >= localDocID) { break; } @@ -127,7 +126,7 @@ // 1. we iterated over all segments (hasMore=false) // 2. current segment advanced to a doc, either requested or higher if (!hasMore || curDocID != localDocID) { - return false; + return null; } } @@ -135,12 +134,7 @@ assert currentDPE.freq() == 1 : "expecting freq=1 (got " + currentDPE.freq() + ") term=" + term + " doc=" + (curDocID + curDocBase); int pos = currentDPE.nextPosition(); assert pos != -1 : "no positions for term=" + term + " doc=" + (curDocID + curDocBase); - data = currentDPE.getPayload(); - return data != null; + return currentDPE.getPayload(); } - public BytesRef getPayload() { - return data; - } - } diff --git a/lucene/facet/src/java/org/apache/lucene/facet/search/StandardFacetsAccumulator.java b/lucene/facet/src/java/org/apache/lucene/facet/search/StandardFacetsAccumulator.java index 78b362ea063..32466269d60 100644 --- a/lucene/facet/src/java/org/apache/lucene/facet/search/StandardFacetsAccumulator.java +++ b/lucene/facet/src/java/org/apache/lucene/facet/search/StandardFacetsAccumulator.java @@ -10,6 +10,7 @@ import java.util.logging.Level; import java.util.logging.Logger; import org.apache.lucene.index.IndexReader; +import org.apache.lucene.util.IntsRef; import org.apache.lucene.facet.search.aggregator.Aggregator; import org.apache.lucene.facet.search.params.FacetSearchParams; @@ -231,9 +232,9 @@ public class StandardFacetsAccumulator extends FacetsAccumulator { facetArrays.free(); // to get a cleared array for this partition } - HashMap categoryLists = getCategoryListMap( - facetArrays, partition); + HashMap categoryLists = getCategoryListMap(facetArrays, partition); + IntsRef ordinals = new IntsRef(32); // a reasonable start capacity for most common apps for (Entry entry : categoryLists.entrySet()) { CategoryListIterator categoryList = entry.getKey(); if (!categoryList.init()) { @@ -244,14 +245,11 @@ ScoredDocIDsIterator iterator = docids.iterator(); while (iterator.next()) { int docID = iterator.getDocID(); - if (!categoryList.skipTo(docID)) { + categoryList.getOrdinals(docID, ordinals); + if (ordinals.length == 0) { continue; } - categorator.setNextDoc(docID, iterator.getScore()); - long ordinal; - while ((ordinal = 
categoryList.nextCategory()) <= Integer.MAX_VALUE) { - categorator.aggregate((int) ordinal); - } + categorator.aggregate(docID, iterator.getScore(), ordinals); } } } diff --git a/lucene/facet/src/java/org/apache/lucene/facet/search/aggregator/Aggregator.java b/lucene/facet/src/java/org/apache/lucene/facet/search/aggregator/Aggregator.java index 447ddd9b49d..a3743317cc8 100644 --- a/lucene/facet/src/java/org/apache/lucene/facet/search/aggregator/Aggregator.java +++ b/lucene/facet/src/java/org/apache/lucene/facet/search/aggregator/Aggregator.java @@ -2,6 +2,8 @@ package org.apache.lucene.facet.search.aggregator; import java.io.IOException; +import org.apache.lucene.util.IntsRef; + /* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with @@ -36,16 +38,9 @@ import java.io.IOException; public interface Aggregator { /** - * Specify the document (and its score in the search) that the following - * {@link #aggregate(int)} calls will pertain to. + * Aggregate the ordinals of the given document ID (and its score). The given + * ordinals offset is always zero. */ - void setNextDoc(int docid, float score) throws IOException; - - /** - * Collect (and do whatever an implementation deems appropriate) the - * category given by its ordinal. This category belongs to a document - * given earlier by {@link #setNextDoc(int, float)}. - */ - void aggregate(int ordinal); - + public void aggregate(int docID, float score, IntsRef ordinals) throws IOException; + } diff --git a/lucene/facet/src/java/org/apache/lucene/facet/search/aggregator/ComplementCountingAggregator.java b/lucene/facet/src/java/org/apache/lucene/facet/search/aggregator/ComplementCountingAggregator.java index 3bf62c4e520..50ca39fa352 100644 --- a/lucene/facet/src/java/org/apache/lucene/facet/search/aggregator/ComplementCountingAggregator.java +++ b/lucene/facet/src/java/org/apache/lucene/facet/search/aggregator/ComplementCountingAggregator.java @@ -1,5 +1,9 @@ package org.apache.lucene.facet.search.aggregator; +import java.io.IOException; + +import org.apache.lucene.util.IntsRef; + /* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with @@ -29,9 +33,12 @@ public class ComplementCountingAggregator extends CountingAggregator { } @Override - public void aggregate(int ordinal) { - assert counterArray[ordinal]!=0:"complement aggregation: count is about to become negative for ordinal "+ordinal; - --counterArray[ordinal]; + public void aggregate(int docID, float score, IntsRef ordinals) throws IOException { + for (int i = 0; i < ordinals.length; i++) { + int ord = ordinals.ints[i]; + assert counterArray[ord] != 0 : "complement aggregation: count is about to become negative for ordinal " + ord; + --counterArray[ord]; + } } } diff --git a/lucene/facet/src/java/org/apache/lucene/facet/search/aggregator/CountingAggregator.java b/lucene/facet/src/java/org/apache/lucene/facet/search/aggregator/CountingAggregator.java index 5dd54a06b3a..8cd71595dc8 100644 --- a/lucene/facet/src/java/org/apache/lucene/facet/search/aggregator/CountingAggregator.java +++ b/lucene/facet/src/java/org/apache/lucene/facet/search/aggregator/CountingAggregator.java @@ -1,5 +1,9 @@ package org.apache.lucene.facet.search.aggregator; +import java.io.IOException; + +import org.apache.lucene.util.IntsRef; + /* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. 
See the NOTICE file distributed with @@ -27,21 +31,17 @@ package org.apache.lucene.facet.search.aggregator; public class CountingAggregator implements Aggregator { protected int[] counterArray; - - @Override - public void aggregate(int ordinal) { - ++counterArray[ordinal]; - } - - @Override - public void setNextDoc(int docid, float score) { - // There's nothing for us to do here since we only increment the count by 1 - // in this aggregator. - } - + public CountingAggregator(int[] counterArray) { this.counterArray = counterArray; } + + @Override + public void aggregate(int docID, float score, IntsRef ordinals) throws IOException { + for (int i = 0; i < ordinals.length; i++) { + counterArray[ordinals.ints[i]]++; + } + } @Override public boolean equals(Object obj) { @@ -54,8 +54,7 @@ public class CountingAggregator implements Aggregator { @Override public int hashCode() { - int hashCode = counterArray == null ? 0 : counterArray.hashCode(); - - return hashCode; + return counterArray == null ? 0 : counterArray.hashCode(); } + } diff --git a/lucene/facet/src/java/org/apache/lucene/facet/search/aggregator/ScoringAggregator.java b/lucene/facet/src/java/org/apache/lucene/facet/search/aggregator/ScoringAggregator.java index ef788968138..6c3dc492703 100644 --- a/lucene/facet/src/java/org/apache/lucene/facet/search/aggregator/ScoringAggregator.java +++ b/lucene/facet/src/java/org/apache/lucene/facet/search/aggregator/ScoringAggregator.java @@ -1,5 +1,9 @@ package org.apache.lucene.facet.search.aggregator; +import java.io.IOException; + +import org.apache.lucene.util.IntsRef; + /* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with @@ -26,7 +30,6 @@ package org.apache.lucene.facet.search.aggregator; public class ScoringAggregator implements Aggregator { private final float[] scoreArray; - private float score; private final int hashCode; public ScoringAggregator(float[] counterArray) { @@ -35,10 +38,12 @@ public class ScoringAggregator implements Aggregator { } @Override - public void aggregate(int ordinal) { - scoreArray[ordinal] += score; + public void aggregate(int docID, float score, IntsRef ordinals) throws IOException { + for (int i = 0; i < ordinals.length; i++) { + scoreArray[ordinals.ints[i]] += score; + } } - + @Override public boolean equals(Object obj) { if (obj == null || obj.getClass() != this.getClass()) { @@ -53,8 +58,4 @@ public class ScoringAggregator implements Aggregator { return hashCode; } - @Override - public void setNextDoc(int docid, float score) { - this.score = score; - } } diff --git a/lucene/facet/src/java/org/apache/lucene/facet/search/aggregator/associations/AssociationFloatSumAggregator.java b/lucene/facet/src/java/org/apache/lucene/facet/search/aggregator/associations/AssociationFloatSumAggregator.java index 57814d7cf57..22ebfecdba9 100644 --- a/lucene/facet/src/java/org/apache/lucene/facet/search/aggregator/associations/AssociationFloatSumAggregator.java +++ b/lucene/facet/src/java/org/apache/lucene/facet/search/aggregator/associations/AssociationFloatSumAggregator.java @@ -7,6 +7,8 @@ import org.apache.lucene.facet.associations.FloatAssociationsPayloadIterator; import org.apache.lucene.facet.index.params.CategoryListParams; import org.apache.lucene.facet.search.aggregator.Aggregator; import org.apache.lucene.index.IndexReader; +import org.apache.lucene.util.IntsRef; +import org.apache.lucene.util.collections.IntToFloatMap; /* * Licensed to the Apache Software Foundation (ASF) under 
one or more @@ -48,13 +50,18 @@ public class AssociationFloatSumAggregator implements Aggregator { } @Override - public void aggregate(int ordinal) { - float association = associations.getAssociation(ordinal); - if (!Float.isNaN(association)) { - sumArray[ordinal] += association; + public void aggregate(int docID, float score, IntsRef ordinals) throws IOException { + IntToFloatMap values = associations.getAssociations(docID); + if (values != null) { + for (int i = 0; i < ordinals.length; i++) { + int ord = ordinals.ints[i]; + if (values.containsKey(ord)) { + sumArray[ord] += values.get(ord); + } + } } } - + @Override public boolean equals(Object obj) { if (obj == null || obj.getClass() != this.getClass()) { @@ -69,9 +76,4 @@ public class AssociationFloatSumAggregator implements Aggregator { return field.hashCode(); } - @Override - public void setNextDoc(int docid, float score) throws IOException { - associations.setNextDoc(docid); - } - } diff --git a/lucene/facet/src/java/org/apache/lucene/facet/search/aggregator/associations/AssociationIntSumAggregator.java b/lucene/facet/src/java/org/apache/lucene/facet/search/aggregator/associations/AssociationIntSumAggregator.java index 42260597376..2f12080a35b 100644 --- a/lucene/facet/src/java/org/apache/lucene/facet/search/aggregator/associations/AssociationIntSumAggregator.java +++ b/lucene/facet/src/java/org/apache/lucene/facet/search/aggregator/associations/AssociationIntSumAggregator.java @@ -7,6 +7,8 @@ import org.apache.lucene.facet.associations.IntAssociationsPayloadIterator; import org.apache.lucene.facet.index.params.CategoryListParams; import org.apache.lucene.facet.search.aggregator.Aggregator; import org.apache.lucene.index.IndexReader; +import org.apache.lucene.util.IntsRef; +import org.apache.lucene.util.collections.IntToIntMap; /* * Licensed to the Apache Software Foundation (ASF) under one or more @@ -48,13 +50,18 @@ public class AssociationIntSumAggregator implements Aggregator { } @Override - public void aggregate(int ordinal) { - long association = associations.getAssociation(ordinal); - if (association != IntAssociationsPayloadIterator.NO_ASSOCIATION) { - sumArray[ordinal] += association; + public void aggregate(int docID, float score, IntsRef ordinals) throws IOException { + IntToIntMap values = associations.getAssociations(docID); + if (values != null) { + for (int i = 0; i < ordinals.length; i++) { + int ord = ordinals.ints[i]; + if (values.containsKey(ord)) { + sumArray[ord] += values.get(ord); + } + } } } - + @Override public boolean equals(Object obj) { if (obj == null || obj.getClass() != this.getClass()) { @@ -69,9 +76,4 @@ public class AssociationIntSumAggregator implements Aggregator { return field.hashCode(); } - @Override - public void setNextDoc(int docid, float score) throws IOException { - associations.setNextDoc(docid); - } - } diff --git a/lucene/facet/src/java/org/apache/lucene/facet/search/cache/CategoryListData.java b/lucene/facet/src/java/org/apache/lucene/facet/search/cache/CategoryListData.java index cc0daa4f6fe..4dc5c694a8c 100644 --- a/lucene/facet/src/java/org/apache/lucene/facet/search/cache/CategoryListData.java +++ b/lucene/facet/src/java/org/apache/lucene/facet/search/cache/CategoryListData.java @@ -2,13 +2,12 @@ package org.apache.lucene.facet.search.cache; import java.io.IOException; -import org.apache.lucene.index.IndexReader; - import org.apache.lucene.facet.index.params.CategoryListParams; import org.apache.lucene.facet.index.params.FacetIndexingParams; import 
org.apache.lucene.facet.search.CategoryListIterator; import org.apache.lucene.facet.taxonomy.TaxonomyReader; -import org.apache.lucene.util.collections.IntArray; +import org.apache.lucene.index.IndexReader; +import org.apache.lucene.util.IntsRef; /* * Licensed to the Apache Software Foundation (ASF) under one or more @@ -56,33 +55,26 @@ public class CategoryListData { protected CategoryListData() { } - /** - * Compute category list data for caching for faster iteration. - */ + /** Compute category list data for caching for faster iteration. */ CategoryListData(IndexReader reader, TaxonomyReader taxo, FacetIndexingParams iparams, CategoryListParams clp) throws IOException { final int maxDoc = reader.maxDoc(); int[][][]dpf = new int[maxDoc][][]; int numPartitions = (int)Math.ceil(taxo.getSize()/(double)iparams.getPartitionSize()); - IntArray docCategories = new IntArray(); - for (int part=0; part 0) { + if (dpf[doc] == null) { dpf[doc] = new int[numPartitions][]; } - long category; - while ((category = cli.nextCategory()) <= Integer.MAX_VALUE) { - docCategories.addToArray((int)category); - } - final int size = docCategories.size(); - dpf[doc][part] = new int[size]; - for (int i=0; ipart; + return dpc != null && dpc.length > part; } @Override - public long nextCategory() throws IOException { - if (nextCategoryIndex >= dpc[currDoc][part].length) { - return 1L+Integer.MAX_VALUE; + public void getOrdinals(int docID, IntsRef ints) throws IOException { + ints.length = 0; + if (dpc.length > docID && dpc[docID] != null && dpc[docID][part] != null) { + if (ints.ints.length < dpc[docID][part].length) { + ints.grow(dpc[docID][part].length); + } + ints.length = 0; + for (int i = 0; i < dpc[docID][part].length; i++) { + ints.ints[ints.length++] = dpc[docID][part][i]; + } } - return dpc[currDoc][part][nextCategoryIndex++]; - } - - @Override - public boolean skipTo(int docId) throws IOException { - final boolean res = dpc.length>docId && dpc[docId]!=null && dpc[docId][part]!=null; - if (res) { - currDoc = docId; - nextCategoryIndex = 0; - } - return res; } } + } \ No newline at end of file diff --git a/lucene/facet/src/java/org/apache/lucene/facet/search/params/CountFacetRequest.java b/lucene/facet/src/java/org/apache/lucene/facet/search/params/CountFacetRequest.java index 9a55244a5ee..71c7df7f0c4 100644 --- a/lucene/facet/src/java/org/apache/lucene/facet/search/params/CountFacetRequest.java +++ b/lucene/facet/src/java/org/apache/lucene/facet/search/params/CountFacetRequest.java @@ -48,8 +48,7 @@ public class CountFacetRequest extends FacetRequest { @Override public Aggregator createAggregator(boolean useComplements, - FacetArrays arrays, IndexReader reader, - TaxonomyReader taxonomy) { + FacetArrays arrays, IndexReader reader, TaxonomyReader taxonomy) { // we rely on that, if needed, result is cleared by arrays! 
int[] a = arrays.getIntArray(); if (useComplements) { diff --git a/lucene/facet/src/java/org/apache/lucene/facet/util/MultiCategoryListIterator.java b/lucene/facet/src/java/org/apache/lucene/facet/util/MultiCategoryListIterator.java index 13fc778bad1..402ba3dcf69 100644 --- a/lucene/facet/src/java/org/apache/lucene/facet/util/MultiCategoryListIterator.java +++ b/lucene/facet/src/java/org/apache/lucene/facet/util/MultiCategoryListIterator.java @@ -5,6 +5,7 @@ import java.util.ArrayList; import java.util.List; import org.apache.lucene.facet.search.CategoryListIterator; +import org.apache.lucene.util.IntsRef; /* * Licensed to the Apache Software Foundation (ASF) under one or more @@ -33,16 +34,13 @@ public class MultiCategoryListIterator implements CategoryListIterator { private final CategoryListIterator[] iterators; private final List validIterators; - private final List perDocValidIterators; /** Receives the iterators to iterate on */ public MultiCategoryListIterator(CategoryListIterator... iterators) { this.iterators = iterators; this.validIterators = new ArrayList(); - this.perDocValidIterators = new ArrayList(); } - /** Fails if all given iterators fail to init */ @Override public boolean init() throws IOException { for (CategoryListIterator cli : iterators) { @@ -52,35 +50,18 @@ } return !validIterators.isEmpty(); } - - /** - * Return a value larger than {@link Integer#MAX_VALUE} only if all - * iterators are exhausted - */ + @Override - public long nextCategory() throws IOException { - while (!perDocValidIterators.isEmpty()) { - long value = perDocValidIterators.get(0).nextCategory(); - if (value <= Integer.MAX_VALUE) { - return value; - } - perDocValidIterators.remove(0); - } - return 0x100000000L; - } - - /** - * Fails only if skipTo on all the provided iterators returned {@code false} - */ - @Override - public boolean skipTo(int docId) throws IOException { - perDocValidIterators.clear(); + public void getOrdinals(int docID, IntsRef ints) throws IOException { + IntsRef tmp = new IntsRef(ints.length); for (CategoryListIterator cli : validIterators) { - if (cli.skipTo(docId)) { - perDocValidIterators.add(cli); + cli.getOrdinals(docID, tmp); + if (ints.ints.length < ints.length + tmp.length) { + ints.grow(ints.length + tmp.length); } + System.arraycopy(tmp.ints, 0, ints.ints, ints.length, tmp.length); // copy this iterator's ordinals into the shared result + ints.length += tmp.length; } - return !perDocValidIterators.isEmpty(); } } diff --git a/lucene/facet/src/java/org/apache/lucene/util/Vint8.java b/lucene/facet/src/java/org/apache/lucene/util/Vint8.java deleted file mode 100644 index 40dd96daebd..00000000000 --- a/lucene/facet/src/java/org/apache/lucene/util/Vint8.java +++ /dev/null @@ -1,229 +0,0 @@ -package org.apache.lucene.util; - -import java.io.EOFException; -import java.io.IOException; -import java.io.InputStream; -import java.io.OutputStream; - -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. 
You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -/** - * Variable-length encoding of 32-bit integers, into 8-bit bytes. A number is encoded as follows: - *

    - *
  • If it is less than 127 and non-negative (i.e., if the number uses only 7 bits), it is encoded as - * as single byte: 0bbbbbbb. - *
  • If its highest nonzero bit is greater than bit 6 (0x40), it is represented as a series of - * bytes, each byte's - * 7 LSB containing bits from the original value, with the MSB set for all but the last - * byte. The first encoded byte contains the highest nonzero bits from the - * original; the second byte contains the next 7 MSB; and so on, with the last byte - * containing the 7 LSB of the original. - *
- * Examples: - *
    - *
  1. n = 117 = 1110101: This has fewer than 8 significant bits, and so is encoded as - * 01110101 = 0x75. - *
  2. n = 100000 = (binary) 11000011010100000. This has 17 significant bits, and so needs - * three Vint8 bytes. Left-zero-pad it to a multiple of 7 bits, then split it into chunks of 7 - * and add an MSB, 0 for the last byte, 1 for the others: 1|0000110 1|0001101 0|0100000 - * = 0x86 0x8D 0x20. - *
- * This encoder/decoder will correctly handle any 32-bit integer, but for negative numbers, - * and positive numbers with more than 28 significant bits, encoding requires 5 bytes; this - * is not an efficient encoding scheme for large - * positive numbers or any negative number. - *
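The stream-based class deleted here is superseded by a BytesRef-based VInt8 helper in org.apache.lucene.util.encoding, which the rewritten encoders and decoders later in this patch call as VInt8.encode(value, buf) and VInt8.decode(buf). That replacement is not part of this section, so the following is only a sketch of the same 7-bit scheme over a BytesRef, inferred from those call sites:

<pre>
import org.apache.lucene.util.BytesRef;

class VInt8Sketch { // hypothetical name; the real helper is not shown in this patch

  // Appends the encoded form of value to buf, advancing buf.length. Callers
  // are assumed to have grown buf up front, as encodeChunk() below does.
  static void encode(int value, BytesRef buf) {
    int shift = 28;
    while (shift > 0 && (value >>> shift) == 0) {
      shift -= 7; // skip leading all-zero 7-bit groups
    }
    while (shift > 0) {
      buf.bytes[buf.length++] = (byte) (0x80 | ((value >>> shift) & 0x7F));
      shift -= 7;
    }
    buf.bytes[buf.length++] = (byte) (value & 0x7F); // last byte: MSB clear
  }

  // Reads one value at buf.offset and advances the offset; the decoders in
  // this patch use the offset as their position within the buffer.
  static int decode(BytesRef buf) {
    int value = 0;
    while (true) {
      byte b = buf.bytes[buf.offset++];
      if (b >= 0) {
        return value | b; // MSB clear marks the last byte
      }
      value = (value | (b & 0x7F)) << 7;
    }
  }
}
</pre>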

- * Compatibility:
- * This class has been used in products that have shipped to customers, and is needed to - * decode legacy data. Do not modify this class in ways that will break compatibility. - * - * @lucene.experimental - */ -public class Vint8 { - - /** - * Because Java lacks call-by-reference, this class boxes the decoding position, which - * is initially set by the caller, and returned after decoding, incremented by the number - * of bytes processed. - */ - public static class Position { - /** - * Creates a position value set to zero. - */ - public Position() { - // The initial position is zero by default. - } - /** - * Creates a position set to {@code initialPosition}. - * @param initialPosition The starting decoding position in the source buffer. - */ - public Position(int initialPosition) { - this.pos = initialPosition; - } - /** - * The value passed by reference. - */ - public int pos; - } - - /** - * Returns the number of bytes needed to encode {@code number}. - * @param number The number whose encoded length is needed. - * @return The number of bytes needed to encode {@code number}. - */ - public static int bytesNeeded(int number) { - if ((number & ~0x7F) == 0) { - return 1; - } else if ((number & ~0x3FFF) == 0) { - return 2; - } else if ((number & ~0x1FFFFF) == 0) { - return 3; - } else if ((number & ~0xFFFFFFF) == 0) { - return 4; - } else { - return 5; - } - } - - /** - * The maximum number of bytes needed to encode a number using {@code Vint8}. - */ - public static final int MAXIMUM_BYTES_NEEDED = 5; - - /** - * Encodes {@code number} to {@code out}. - * @param number The value to be written in encoded form, to {@code out}. - * @param out The output stream receiving the encoded bytes. - * @exception IOException If there is a problem writing to {@code out}. - */ - public static void encode(int number, OutputStream out) throws IOException { - if ((number & ~0x7F) == 0) { - out.write(number); - } else if ((number & ~0x3FFF) == 0) { - out.write(0x80 | (number >> 7)); - out.write(0x7F & number); - } else if ((number & ~0x1FFFFF) == 0) { - out.write(0x80 | (number >> 14)); - out.write(0x80 | (number >> 7)); - out.write(0x7F & number); - } else if ((number & ~0xFFFFFFF) == 0) { - out.write(0x80 | (number >> 21)); - out.write(0x80 | (number >> 14)); - out.write(0x80 | (number >> 7)); - out.write(0x7F & number); - } else { - out.write(0x80 | (number >> 28)); - out.write(0x80 | (number >> 21)); - out.write(0x80 | (number >> 14)); - out.write(0x80 | (number >> 7)); - out.write(0x7F & number); - } - } - - /** - * Encodes {@code number} into {@code dest}, starting at offset {@code start} from - * the beginning of the array. This method assumes {@code dest} is large enough to - * hold the required number of bytes. - * @param number The number to be encoded. - * @param dest The destination array. - * @param start The starting offset in the array. - * @return The number of bytes used in the array. 
- */ - public static int encode(int number, byte[] dest, int start) { - if ((number & ~0x7F) == 0) { - dest[start] = (byte) number; - return 1; - } else if ((number & ~0x3FFF) == 0) { - dest[start] = (byte) (0x80 | ((number & 0x3F80) >> 7)); - dest[start + 1] = (byte) (number & 0x7F); - return 2; - } else if ((number & ~0x1FFFFF) == 0) { - dest[start] = (byte) (0x80 | ((number & 0x1FC000) >> 14)); - dest[start + 1] = (byte) (0x80 | ((number & 0x3F80) >> 7)); - dest[start + 2] = (byte) (number & 0x7F); - return 3; - } else if ((number & ~0xFFFFFFF) == 0) { - dest[start] = (byte) (0x80 | ((number & 0xFE00000) >> 21)); - dest[start + 1] = (byte) (0x80 | ((number & 0x1FC000) >> 14)); - dest[start + 2] = (byte) (0x80 | ((number & 0x3F80) >> 7)); - dest[start + 3] = (byte) (number & 0x7F); - return 4; - } else { - dest[start] = (byte) (0x80 | ((number & 0xF0000000) >> 28)); - dest[start + 1] = (byte) (0x80 | ((number & 0xFE00000) >> 21)); - dest[start + 2] = (byte) (0x80 | ((number & 0x1FC000) >> 14)); - dest[start + 3] = (byte) (0x80 | ((number & 0x3F80) >> 7)); - dest[start + 4] = (byte) (number & 0x7F); - return 5; - } - } - - /** - * Decodes a 32-bit integer from {@code bytes}, beginning at offset {@code pos.pos}. - * The decoded value is returned, and {@code pos.pos} is incremented by the number of - * bytes processed. - * @param bytes The byte array containing an encoded value. - * @param pos On entry, the starting position in the array; on return, one greater - * than the position of the last byte decoded in the call. - * @return The decoded value. - */ - public static int decode(byte[] bytes, Position pos) { - int value = 0; - while (true) { - byte first = bytes[pos.pos]; - ++pos.pos; - value |= first & 0x7F; - if ((first & 0x80) == 0) { - return value; - } - value <<= 7; - } - } - - /** - * Decodes a 32-bit integer from bytes read from {@code in}. Bytes are read, - * one at a time, from {@code in}, and it is assumed they represent a 32-bit - * integer encoded using this class's encoding scheme. The decoded value is - * returned. - * @param in The input stream containing the encoded bytes. - * @return The decoded value. - * @exception EOFException If the stream ends before a value has been decoded. - */ - public static int decode(InputStream in) throws IOException { - int value = 0; - while (true) { - int first = in.read(); - if (first < 0) { - throw new EOFException(); - } - value |= first & 0x7F; - if ((first & 0x80) == 0) { - return value; - } - value <<= 7; - } - } - - /** - * The default ctor is made private because all methods of this class are static. - */ - private Vint8() { - // Just making it impossible to instantiate. - } - -} diff --git a/lucene/facet/src/java/org/apache/lucene/util/encoding/ChunksIntEncoder.java b/lucene/facet/src/java/org/apache/lucene/util/encoding/ChunksIntEncoder.java index c640c53b6cc..a19550c29e5 100644 --- a/lucene/facet/src/java/org/apache/lucene/util/encoding/ChunksIntEncoder.java +++ b/lucene/facet/src/java/org/apache/lucene/util/encoding/ChunksIntEncoder.java @@ -1,7 +1,7 @@ package org.apache.lucene.util.encoding; -import java.io.IOException; -import java.io.OutputStream; +import org.apache.lucene.util.BytesRef; +import org.apache.lucene.util.IntsRef; /* * Licensed to the Apache Software Foundation (ASF) under one or more @@ -27,38 +27,31 @@ import java.io.OutputStream; * read more on the two implementations {@link FourFlagsIntEncoder} and * {@link EightFlagsIntEncoder}. *

- * Extensions of this class need to implement {@link #encode(int)} in order to - * build the proper indicator (flags). When enough values were accumulated - * (typically the batch size), extensions can call {@link #encodeChunk()} to - * flush the indicator and the rest of the values. + * Extensions of this class need to implement {@link #encode(IntsRef, BytesRef)} + * in order to build the proper indicator (flags). When enough values have been + * accumulated (typically the batch size), extensions can call + * {@link #encodeChunk(BytesRef)} to flush the indicator and the rest of the + * values. *

* NOTE: flags encoders do not accept values ≤ 0 (zero) in their - * {@link #encode(int)}. For performance reasons they do not check that - * condition, however if such value is passed the result stream may be corrupt - * or an exception will be thrown. Also, these encoders perform the best when - * there are many consecutive small values (depends on the encoder + * {@link #encode(IntsRef, BytesRef)}. For performance reasons they do not check + * that condition, however if such a value is passed the result stream may be + * corrupt or an exception will be thrown. Also, these encoders perform best + * when there are many consecutive small values (depends on the encoder * implementation). If that is not the case, the encoder will occupy 1 more byte * for every batch number of integers, over whatever * {@link VInt8IntEncoder} would have occupied. Therefore make sure to check * whether your data fits into the conditions of the specific encoder. *

* For the reasons mentioned above, these encoders are usually chained with - * {@link UniqueValuesIntEncoder} and {@link DGapIntEncoder} in the following - * manner:

- * IntEncoder fourFlags = 
- *         new SortingEncoderFilter(new UniqueValuesIntEncoder(new DGapIntEncoder(new FlagsIntEncoderImpl())));
- * 
+ * {@link UniqueValuesIntEncoder} and {@link DGapIntEncoder}. * * @lucene.experimental */ public abstract class ChunksIntEncoder extends IntEncoder { /** Holds the values which must be encoded, outside the indicator. */ - protected final int[] encodeQueue; - protected int encodeQueueSize = 0; - - /** Encoder used to encode values outside the indicator. */ - protected final IntEncoder encoder = new VInt8IntEncoder(); + protected final IntsRef encodeQueue; /** Represents bits flag byte. */ protected int indicator = 0; @@ -67,39 +60,33 @@ public abstract class ChunksIntEncoder extends IntEncoder { protected byte ordinal = 0; protected ChunksIntEncoder(int chunkSize) { - encodeQueue = new int[chunkSize]; + encodeQueue = new IntsRef(chunkSize); } /** * Encodes the values of the current chunk. First it writes the indicator, and * then it encodes the values outside the indicator. */ - protected void encodeChunk() throws IOException { - out.write(indicator); - for (int i = 0; i < encodeQueueSize; ++i) { - encoder.encode(encodeQueue[i]); + protected void encodeChunk(BytesRef buf) { + // ensure there's enough room in the buffer + int maxBytesRequired = buf.length + 1 + encodeQueue.length * 4; /* indicator + at most 4 bytes per positive VInt */ + if (buf.bytes.length < maxBytesRequired) { + buf.grow(maxBytesRequired); } - encodeQueueSize = 0; - ordinal = 0; - indicator = 0; + + buf.bytes[buf.length++] = ((byte) indicator); + for (int i = 0; i < encodeQueue.length; i++) { + VInt8.encode(encodeQueue.ints[i], buf); + } + + reset(); } @Override - public void close() throws IOException { - if (ordinal != 0) { - encodeChunk(); - } - encoder.close(); - super.close(); - } - - @Override - public void reInit(OutputStream out) { - encoder.reInit(out); - super.reInit(out); + protected void reset() { ordinal = 0; indicator = 0; - encodeQueueSize = 0; + encodeQueue.length = 0; } } diff --git a/lucene/facet/src/java/org/apache/lucene/util/encoding/DGapIntDecoder.java b/lucene/facet/src/java/org/apache/lucene/util/encoding/DGapIntDecoder.java index a37a4b9f4bb..a5b2fb3c28c 100644 --- a/lucene/facet/src/java/org/apache/lucene/util/encoding/DGapIntDecoder.java +++ b/lucene/facet/src/java/org/apache/lucene/util/encoding/DGapIntDecoder.java @@ -1,7 +1,7 @@ package org.apache.lucene.util.encoding; -import java.io.IOException; -import java.io.InputStream; +import org.apache.lucene.util.BytesRef; +import org.apache.lucene.util.IntsRef; /* * Licensed to the Apache Software Foundation (ASF) under one or more @@ -21,10 +21,8 @@ import java.io.InputStream; */ /** - * An {@link IntDecoder} which wraps another {@link IntDecoder} and reverts the - * d-gap that was encoded by {@link DGapIntEncoder}. The wrapped decoder - * performs the actual decoding, while this class simply adds the decoded value - * to the previous value. + * An {@link IntDecoder} which wraps another decoder and reverts the d-gap that + * was encoded by {@link DGapIntEncoder}. 
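Under the new bulk API, the chaining recommended above is a plain round trip from an IntsRef into a BytesRef and back through the matching decoder. A sketch (the wrapping constructors of SortingIntEncoder and UniqueValuesIntEncoder are assumed to take the inner encoder, by analogy with DGapIntEncoder shown below):

<pre>
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.IntsRef;

// sort, de-dup, d-gap, then flag-encode
IntEncoder encoder = new SortingIntEncoder(
    new UniqueValuesIntEncoder(new DGapIntEncoder(new EightFlagsIntEncoder())));

IntsRef values = new IntsRef(new int[] { 7, 3, 3, 5 }, 0, 4);
BytesRef buf = new BytesRef(32);
encoder.encode(values, buf); // resets buf.offset/length, then fills buf

IntDecoder decoder = encoder.createMatchingDecoder();
IntsRef decoded = new IntsRef(4);
decoder.decode(buf, decoded); // decoded now holds 3, 5, 7
</pre>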
* * @lucene.experimental */ @@ -32,26 +30,23 @@ public class DGapIntDecoder extends IntDecoder { private final IntDecoder decoder; - private int prev = 0; - public DGapIntDecoder(IntDecoder decoder) { this.decoder = decoder; } @Override - public long decode() throws IOException { - long decode = decoder.decode(); - if (decode == EOS) { - return EOS; - } - - return prev += decode; + protected void reset() { + decoder.reset(); } - + @Override - public void reInit(InputStream in) { - decoder.reInit(in); - prev = 0; + protected void doDecode(BytesRef buf, IntsRef values, int upto) { + decoder.doDecode(buf, values, upto); + int prev = 0; + for (int i = 0; i < values.length; i++) { + values.ints[i] += prev; + prev = values.ints[i]; + } } @Override diff --git a/lucene/facet/src/java/org/apache/lucene/util/encoding/DGapIntEncoder.java b/lucene/facet/src/java/org/apache/lucene/util/encoding/DGapIntEncoder.java index 8b1a0deffaf..305f975c619 100644 --- a/lucene/facet/src/java/org/apache/lucene/util/encoding/DGapIntEncoder.java +++ b/lucene/facet/src/java/org/apache/lucene/util/encoding/DGapIntEncoder.java @@ -1,7 +1,7 @@ package org.apache.lucene.util.encoding; -import java.io.IOException; -import java.io.OutputStream; +import org.apache.lucene.util.BytesRef; +import org.apache.lucene.util.IntsRef; /* * Licensed to the Apache Software Foundation (ASF) under one or more @@ -27,7 +27,7 @@ import java.io.OutputStream; * space) if the values are 'close' to each other. *

* NOTE: this encoder assumes the values are given to - * {@link #encode(int)} in an ascending sorted manner, which ensures only + * {@link #encode(IntsRef, BytesRef)} in an ascending sorted manner, which ensures only * positive values are encoded and thus yields better performance. If you are * not sure whether the values are sorted or not, it is possible to chain this * encoder with {@link SortingIntEncoder} to ensure the values will be @@ -37,17 +37,20 @@ import java.io.OutputStream; */ public class DGapIntEncoder extends IntEncoderFilter { - private int prev = 0; - /** Initializes with the given encoder. */ public DGapIntEncoder(IntEncoder encoder) { super(encoder); } @Override - public void encode(int value) throws IOException { - encoder.encode(value - prev); - prev = value; + protected void doEncode(IntsRef values, BytesRef buf, int upto) { + int prev = 0; + for (int i = values.offset; i < upto; i++) { + int tmp = values.ints[i]; + values.ints[i] -= prev; + prev = tmp; + } + encoder.doEncode(values, buf, upto); } @Override @@ -55,12 +58,6 @@ public class DGapIntEncoder extends IntEncoderFilter { return new DGapIntDecoder(encoder.createMatchingDecoder()); } - @Override - public void reInit(OutputStream out) { - super.reInit(out); - prev = 0; - } - @Override public String toString() { return "DGap (" + encoder.toString() + ")"; diff --git a/lucene/facet/src/java/org/apache/lucene/util/encoding/EightFlagsIntDecoder.java b/lucene/facet/src/java/org/apache/lucene/util/encoding/EightFlagsIntDecoder.java index fe2b9c6d67b..270fcffca52 100644 --- a/lucene/facet/src/java/org/apache/lucene/util/encoding/EightFlagsIntDecoder.java +++ b/lucene/facet/src/java/org/apache/lucene/util/encoding/EightFlagsIntDecoder.java @@ -1,7 +1,7 @@ package org.apache.lucene.util.encoding; -import java.io.IOException; -import java.io.InputStream; +import org.apache.lucene.util.BytesRef; +import org.apache.lucene.util.IntsRef; /* * Licensed to the Apache Software Foundation (ASF) under one or more @@ -21,20 +21,17 @@ import java.io.InputStream; */ /** - * Decodes data which was encoded by {@link EightFlagsIntEncoder}. Scans - * the indicator, one flag (1-bits) at a time, and decodes extra - * data using {@link VInt8IntDecoder}. + * Decodes values encoded with {@link EightFlagsIntEncoder}. * - * @see EightFlagsIntEncoder * @lucene.experimental */ public class EightFlagsIntDecoder extends IntDecoder { - /** + /* * Holds all combinations of indicator for fast decoding (saves time * on real-time bit manipulation) */ - private static final byte[][] decodeTable = new byte[256][8]; + private static final byte[][] DECODE_TABLE = new byte[256][8]; /** Generating all combinations of indicator into separate flags. */ static { @@ -42,45 +39,36 @@ public class EightFlagsIntDecoder extends IntDecoder { --i; for (int j = 8; j != 0;) { --j; - decodeTable[i][j] = (byte) ((i >>> j) & 0x1); + DECODE_TABLE[i][j] = (byte) ((i >>> j) & 0x1); } } } - private final IntDecoder decoder = new VInt8IntDecoder(); - - /** The indicator for decoding a chunk of 8 integers. */ - private int indicator; - - /** Used as an ordinal of 0 - 7, as the decoder decodes chunks of 8 integers. */ - private int ordinal = 0; - @Override - public long decode() throws IOException { - // If we've decoded 8 integers, read the next indicator. 
- if ((ordinal & 0x7) == 0) { - indicator = in.read(); - if (indicator < 0) { - return EOS; + protected void doDecode(BytesRef buf, IntsRef values, int upto) { + while (buf.offset < upto) { + // read indicator + int indicator = buf.bytes[buf.offset++] & 0xFF; + int ordinal = 0; + + int capacityNeeded = values.length + 8; + if (values.ints.length < capacityNeeded) { + values.grow(capacityNeeded); + } + + // process indicator, until we read 8 values, or end-of-buffer + while (ordinal != 8) { + if (DECODE_TABLE[indicator][ordinal++] == 0) { + if (buf.offset == upto) { // end of buffer + return; + } + // decode the value from the stream. + values.ints[values.length++] = VInt8.decode(buf) + 2; + } else { + values.ints[values.length++] = 1; + } } - ordinal = 0; } - - if (decodeTable[indicator][ordinal++] == 0) { - // decode the value from the stream. - long decode = decoder.decode(); - return decode == EOS ? EOS : decode + 2; - } - - return 1; - } - - @Override - public void reInit(InputStream in) { - super.reInit(in); - decoder.reInit(in); - ordinal = 0; - indicator = 0; } @Override diff --git a/lucene/facet/src/java/org/apache/lucene/util/encoding/EightFlagsIntEncoder.java b/lucene/facet/src/java/org/apache/lucene/util/encoding/EightFlagsIntEncoder.java index ac427725674..143660a4742 100644 --- a/lucene/facet/src/java/org/apache/lucene/util/encoding/EightFlagsIntEncoder.java +++ b/lucene/facet/src/java/org/apache/lucene/util/encoding/EightFlagsIntEncoder.java @@ -1,6 +1,7 @@ package org.apache.lucene.util.encoding; -import java.io.IOException; +import org.apache.lucene.util.BytesRef; +import org.apache.lucene.util.IntsRef; /* * Licensed to the Apache Software Foundation (ASF) under one or more @@ -20,14 +21,15 @@ import java.io.IOException; */ /** - * A {@link ChunksIntEncoder} which encodes data in chunks of 8. Every group starts with a single - * byte (called indicator) which represents 8 - 1 bit flags, where the value: + * A {@link ChunksIntEncoder} which encodes data in chunks of 8. Every group + * starts with a single byte (called indicator) which represents 8 - 1 bit + * flags, where the value: *

    *
  • 1 means the encoded value is '1' *
  • 0 means the value is encoded using {@link VInt8IntEncoder}, and the * encoded bytes follow the indicator.
    - * Since value 0 is illegal, and 1 is encoded in the indicator, the actual - * value that is encoded is value-2, which saves some more bits. + * Since value 0 is illegal, and 1 is encoded in the indicator, the actual value + * that is encoded is value-2, which saves some more bits. *
* Encoding example: *
    @@ -46,28 +48,36 @@ import java.io.IOException; */ public class EightFlagsIntEncoder extends ChunksIntEncoder { - /** + /* * Holds all combinations of indicator flags for fast encoding (saves * time on bit manipulation at encode time) */ - private static byte[] encodeTable = new byte[] { 0x1, 0x2, 0x4, 0x8, 0x10, 0x20, 0x40, (byte) 0x80 }; + private static final byte[] ENCODE_TABLE = new byte[] { 0x1, 0x2, 0x4, 0x8, 0x10, 0x20, 0x40, (byte) 0x80 }; public EightFlagsIntEncoder() { super(8); } @Override - public void encode(int data) throws IOException { - if (data == 1) { - indicator |= encodeTable[ordinal]; - } else { - encodeQueue[encodeQueueSize++] = data - 2; + protected void doEncode(IntsRef values, BytesRef buf, int upto) { + for (int i = values.offset; i < upto; i++) { + int value = values.ints[i]; + if (value == 1) { + indicator |= ENCODE_TABLE[ordinal]; + } else { + encodeQueue.ints[encodeQueue.length++] = value - 2; + } + ++ordinal; + + // encode the chunk and the indicator + if (ordinal == 8) { + encodeChunk(buf); + } } - ++ordinal; - - // If 8 values were encoded thus far, 'flush' them including the indicator. - if ((ordinal & 0x7) == 0) { - encodeChunk(); + + // encode remaining values + if (ordinal != 0) { + encodeChunk(buf); } } @@ -78,7 +88,7 @@ public class EightFlagsIntEncoder extends ChunksIntEncoder { @Override public String toString() { - return "EightFlags (" + encoder.toString() + ")"; + return "EightFlags (VInt)"; } } diff --git a/lucene/facet/src/java/org/apache/lucene/util/encoding/FourFlagsIntDecoder.java b/lucene/facet/src/java/org/apache/lucene/util/encoding/FourFlagsIntDecoder.java index a597f9e2a38..ebc161ac9cd 100644 --- a/lucene/facet/src/java/org/apache/lucene/util/encoding/FourFlagsIntDecoder.java +++ b/lucene/facet/src/java/org/apache/lucene/util/encoding/FourFlagsIntDecoder.java @@ -1,7 +1,7 @@ package org.apache.lucene.util.encoding; -import java.io.IOException; -import java.io.InputStream; +import org.apache.lucene.util.BytesRef; +import org.apache.lucene.util.IntsRef; /* * Licensed to the Apache Software Foundation (ASF) under one or more @@ -21,11 +21,8 @@ import java.io.InputStream; */ /** - * Decodes data which was encoded by {@link FourFlagsIntEncoder}. Scans - * the indicator, one flag (1-bits) at a time, and decodes extra - * data using {@link VInt8IntDecoder}. + * Decodes values encoded with {@link FourFlagsIntEncoder}. * - * @see FourFlagsIntEncoder * @lucene.experimental */ public class FourFlagsIntDecoder extends IntDecoder { @@ -34,7 +31,7 @@ public class FourFlagsIntDecoder extends IntDecoder { * Holds all combinations of indicator for fast decoding (saves time * on real-time bit manipulation) */ - private final static byte[][] decodeTable = new byte[256][4]; + private final static byte[][] DECODE_TABLE = new byte[256][4]; /** Generating all combinations of indicator into separate flags. */ static { @@ -42,46 +39,36 @@ public class FourFlagsIntDecoder extends IntDecoder { --i; for (int j = 4; j != 0;) { --j; - decodeTable[i][j] = (byte) ((i >>> (j << 1)) & 0x3); + DECODE_TABLE[i][j] = (byte) ((i >>> (j << 1)) & 0x3); } } } - private final IntDecoder decoder = new VInt8IntDecoder(); - - /** The indicator for decoding a chunk of 4 integers. */ - private int indicator; - - /** Used as an ordinal of 0 - 3, as the decoder decodes chunks of 4 integers. */ - private int ordinal = 0; - @Override - public long decode() throws IOException { - // If we've decoded 8 integers, read the next indicator. 
- if ((ordinal & 0x3) == 0) { - indicator = in.read(); - if (indicator < 0) { - return EOS; + protected void doDecode(BytesRef buf, IntsRef values, int upto) { + while (buf.offset < upto) { + // read indicator + int indicator = buf.bytes[buf.offset++] & 0xFF; + int ordinal = 0; + + int capacityNeeded = values.length + 4; + if (values.ints.length < capacityNeeded) { + values.grow(capacityNeeded); + } + + while (ordinal != 4) { + byte decodeVal = DECODE_TABLE[indicator][ordinal++]; + if (decodeVal == 0) { + if (buf.offset == upto) { // end of buffer + return; + } + // decode the value from the stream. + values.ints[values.length++] = VInt8.decode(buf) + 4; + } else { + values.ints[values.length++] = decodeVal; + } } - ordinal = 0; } - - byte decodeVal = decodeTable[indicator][ordinal++]; - if (decodeVal == 0) { - // decode the value from the stream. - long decode = decoder.decode(); - return decode == EOS ? EOS : decode + 4; - } - - return decodeVal; - } - - @Override - public void reInit(InputStream in) { - super.reInit(in); - decoder.reInit(in); - ordinal = 0; - indicator = 0; } @Override diff --git a/lucene/facet/src/java/org/apache/lucene/util/encoding/FourFlagsIntEncoder.java b/lucene/facet/src/java/org/apache/lucene/util/encoding/FourFlagsIntEncoder.java index a1e227d5cdf..535a90fb60c 100644 --- a/lucene/facet/src/java/org/apache/lucene/util/encoding/FourFlagsIntEncoder.java +++ b/lucene/facet/src/java/org/apache/lucene/util/encoding/FourFlagsIntEncoder.java @@ -1,6 +1,7 @@ package org.apache.lucene.util.encoding; -import java.io.IOException; +import org.apache.lucene.util.BytesRef; +import org.apache.lucene.util.IntsRef; /* * Licensed to the Apache Software Foundation (ASF) under one or more @@ -48,11 +49,11 @@ import java.io.IOException; */ public class FourFlagsIntEncoder extends ChunksIntEncoder { - /** + /* * Holds all combinations of indicator flags for fast encoding (saves * time on bit manipulation @ encode time) */ - private static byte[][] encodeTable = new byte[][] { + private static final byte[][] ENCODE_TABLE = new byte[][] { new byte[] { 0x00, 0x00, 0x00, 0x00 }, new byte[] { 0x01, 0x04, 0x10, 0x40 }, new byte[] { 0x02, 0x08, 0x20, (byte) 0x80 }, @@ -63,26 +64,26 @@ public class FourFlagsIntEncoder extends ChunksIntEncoder { super(4); } - /** - * Small values (<=3) are stored in the indicator while larger - * values are saved for later encoding in the {@link #encodeQueue}. Since - * Vint8 will only encode values larger or equal to 4, the values saves for - * encoded are transformed to (value - 4).
When a chunk is ready (got 4 values), the {@link #encodeChunk()} - * takes control. - */ @Override - public void encode(int data) throws IOException { - if (data <= 3) { - indicator |= encodeTable[data][ordinal]; - } else { - encodeQueue[encodeQueueSize++] = data - 4; + protected void doEncode(IntsRef values, BytesRef buf, int upto) { + for (int i = values.offset; i < upto; i++) { + int value = values.ints[i]; + if (value <= 3) { + indicator |= ENCODE_TABLE[value][ordinal]; + } else { + encodeQueue.ints[encodeQueue.length++] = value - 4; + } + ++ordinal; + + // encode the chunk and the indicator + if (ordinal == 4) { + encodeChunk(buf); + } } - ++ordinal; - - // If 4 values were encoded thus far, 'flush' them including the indicator. - if ((ordinal & 0x3) == 0) { - encodeChunk(); + + // encode remaining values + if (ordinal != 0) { + encodeChunk(buf); } } @@ -93,7 +94,7 @@ @Override public String toString() { - return "FourFlags (" + encoder.toString() + ")"; + return "FourFlags (VInt)"; } } diff --git a/lucene/facet/src/java/org/apache/lucene/util/encoding/IntDecoder.java b/lucene/facet/src/java/org/apache/lucene/util/encoding/IntDecoder.java index faabad4c8ed..acf83cc6158 100644 --- a/lucene/facet/src/java/org/apache/lucene/util/encoding/IntDecoder.java +++ b/lucene/facet/src/java/org/apache/lucene/util/encoding/IntDecoder.java @@ -1,7 +1,7 @@ package org.apache.lucene.util.encoding; -import java.io.IOException; -import java.io.InputStream; +import org.apache.lucene.util.BytesRef; +import org.apache.lucene.util.IntsRef; /* * Licensed to the Apache Software Foundation (ASF) under one or more @@ -21,33 +21,50 @@ import java.io.InputStream; */ /** - * Decodes integers from a set {@link InputStream}. For re-usability, the - * decoder's input stream can be set by ({@link #reInit(InputStream)}). - * By design, Decoders are NOT thread-safe. + * Decodes integers from a set {@link BytesRef}. * * @lucene.experimental */ public abstract class IntDecoder { - /** A special long value which is used to indicate end-of-stream has reached. */ - public static final long EOS = 0x100000000L; - - /** Input stream from which the encoded bytes are read */ - protected InputStream in; - - /** Sets the input stream from which the encoded data is read. */ - public void reInit(InputStream in) { - this.in = in; + /** + * Performs the actual decoding. Values should be read from + * {@link BytesRef#offset} up to {@code upto}. Also, {@code values} offset and + * length are set to 0 and the decoder is expected to update + * {@link IntsRef#length}, but not {@link IntsRef#offset}. + * + *

    + * NOTE: it is ok to use the buffer's offset as the current position in + * the buffer (and modify it), it will be reset by + * {@link #decode(BytesRef, IntsRef)}. + */ + protected abstract void doDecode(BytesRef buf, IntsRef values, int upto); + + /** + * Called before {@link #doDecode(BytesRef, IntsRef, int)} so that decoders + * can reset their state. + */ + protected void reset() { + // do nothing by default } /** - * Decodes data received from the input stream, and returns one decoded - * integer. If end of stream is reached, {@link #EOS} is returned. - * - * @return one decoded integer as long or {@link #EOS} if end-of-stream - * reached. - * @throws IOException if an I/O error occurs + * Decodes the values from the buffer into the given {@link IntsRef}. Note + * that {@code values.offset} and {@code values.length} are set to 0. */ - public abstract long decode() throws IOException; + public final void decode(BytesRef buf, IntsRef values) { + values.offset = values.length = 0; // must do that because we cannot grow() them otherwise + + // some decoders may use the buffer's offset as a position index, so save + // current offset. + int bufOffset = buf.offset; + + reset(); + doDecode(buf, values, buf.offset + buf.length); + assert values.offset == 0 : "offset should not have been modified by the decoder."; + + // fix offset + buf.offset = bufOffset; + } } diff --git a/lucene/facet/src/java/org/apache/lucene/util/encoding/IntEncoder.java b/lucene/facet/src/java/org/apache/lucene/util/encoding/IntEncoder.java index 4055d750810..0a3197d6c6b 100644 --- a/lucene/facet/src/java/org/apache/lucene/util/encoding/IntEncoder.java +++ b/lucene/facet/src/java/org/apache/lucene/util/encoding/IntEncoder.java @@ -1,8 +1,7 @@ package org.apache.lucene.util.encoding; -import java.io.Closeable; -import java.io.IOException; -import java.io.OutputStream; +import org.apache.lucene.util.BytesRef; +import org.apache.lucene.util.IntsRef; /* * Licensed to the Apache Software Foundation (ASF) under one or more @@ -22,94 +21,47 @@ import java.io.OutputStream; */ /** - * Encodes integers to a set {@link OutputStream}. Extending classes need to - * override {@link #encode(int)} to encode the value using their encoding - * algorithm. The default implementation of {@link #close()} closes the set - * {@link OutputStream}. - *

    - * The default {@link #IntEncoder() constructor} is provided for convenience - * only. One must call {@link #reInit(OutputStream)} before calling - * {@link #encode(int)} or {@link #close()}. - *

    - * For convenience, each encoder implements {@link #createMatchingDecoder()} for - * easy access to the matching decoder. - *

    - * NOTE: some implementations may buffer the encoded values in memory - * (such as {@link IntEncoderFilter} implementations) and encoding will happen - * only upon calling {@link #close()}. Therefore it is important to always call - * {@link #close()} on the encoder at hand. - *

    - * NOTE: encoders are usually not thread safe, unless specifically - * documented otherwise by an implementation. + * Encodes integers to a set {@link BytesRef}. For convenience, each encoder + * implements {@link #createMatchingDecoder()} for easy access to the matching + * decoder. * * @lucene.experimental */ -public abstract class IntEncoder implements Closeable { +public abstract class IntEncoder { - protected OutputStream out = null; + public IntEncoder() {} /** - * Default constructor, provided here for robustness: if in the future a - * constructor with parameters will be added, this might break custom - * implementations of this class which call this implicit constructor. So we - * make it explicit to avoid any such issue in the future. + * Performs the actual encoding. Values should be read from + * {@link IntsRef#offset} up to {@code upto}. Also, it is guaranteed that + * {@code buf's} offset and length are set to 0 and the encoder is expected to + * update {@link BytesRef#length}, but not {@link BytesRef#offset}. */ - public IntEncoder() { + protected abstract void doEncode(IntsRef values, BytesRef buf, int upto); + + /** + * Called before {@link #doEncode(IntsRef, BytesRef, int)} so that encoders + * can reset their state. + */ + protected void reset() { + // do nothing by default } /** - * Instructs the encoder to finish the encoding process. This method closes - * the output stream which was specified by {@link #reInit(OutputStream) - * reInit}. An implementation may do here additional cleanup required to - * complete the encoding, such as flushing internal buffers, etc.
    - * Once this method was called, no further calls to {@link #encode(int) - * encode} should be made before first calling {@link #reInit(OutputStream) - * reInit}. - *

    - * NOTE: overriding classes should make sure they either call - * super.close() or close the output stream themselves. + * Encodes the values to the given buffer. Note that the buffer's offset and + * length are set to 0. */ - @Override - public void close() throws IOException { - if (out != null) { - out.close(); - } + public final void encode(IntsRef values, BytesRef buf) { + buf.offset = buf.length = 0; + reset(); + doEncode(values, buf, values.offset + values.length); + assert buf.offset == 0 : "offset should not have been modified by the encoder."; } /** - * Encodes an integer to the output stream given in - * {@link #reInit(OutputStream) reInit} - */ - public abstract void encode(int value) throws IOException; - - /** - * Returns an {@link IntDecoder} which matches this encoder. Every encoder - * must return an {@link IntDecoder} and null is not a valid - * value. If an encoder is just a filter, it should at least return its - * wrapped encoder's matching decoder. - *

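A round-trip under this new bulk contract might look like the following sketch (illustrative values; assumes org.apache.lucene.util.BytesRef and IntsRef imports, and picks the VInt8 encoder from this patch as an arbitrary choice):

IntsRef values = new IntsRef(new int[] { 5, 7, 12 }, 0, 3);
BytesRef buf = new BytesRef();
IntEncoder encoder = new VInt8IntEncoder();
encoder.encode(values, buf);   // clears buf, calls reset(), then doEncode()

IntsRef decoded = new IntsRef();
encoder.createMatchingDecoder().decode(buf, decoded);   // decoded.length is now 3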
    - * NOTE: this method should create a new instance of the matching - * decoder and leave the instance sharing to the caller. Returning the same - * instance over and over is risky because encoders and decoders are not - * thread safe. + * Returns an {@link IntDecoder} which can decode the values that were encoded + * with this encoder. */ public abstract IntDecoder createMatchingDecoder(); - /** - * Reinitializes the encoder with the give {@link OutputStream}. For - * re-usability it can be changed without the need to reconstruct a new - * object. - *

    - * NOTE: after calling {@link #close()}, one must call - * this method even if the output stream itself hasn't changed. An example - * case is that the output stream wraps a byte[], and the output stream itself - * is reset, but its instance hasn't changed. Some implementations of - * {@link IntEncoder} may write some metadata about themselves to the output - * stream, and therefore it is imperative that one calls this method before - * encoding any data. - */ - public void reInit(OutputStream out) { - this.out = out; - } - } diff --git a/lucene/facet/src/java/org/apache/lucene/util/encoding/IntEncoderFilter.java b/lucene/facet/src/java/org/apache/lucene/util/encoding/IntEncoderFilter.java index 6c7a40340d0..ee2e5db7e9e 100644 --- a/lucene/facet/src/java/org/apache/lucene/util/encoding/IntEncoderFilter.java +++ b/lucene/facet/src/java/org/apache/lucene/util/encoding/IntEncoderFilter.java @@ -1,7 +1,5 @@ package org.apache.lucene.util.encoding; -import java.io.IOException; -import java.io.OutputStream; /* * Licensed to the Apache Software Foundation (ASF) under one or more @@ -21,17 +19,7 @@ import java.io.OutputStream; */ /** - * An abstract implementation of {@link IntEncoder} which is served as a filter - * on the values to encode. An encoder filter wraps another {@link IntEncoder} - * which does the actual encoding. This allows for chaining filters and - * encoders, such as:

- * new UniqueValuesIntEncoder(new DGapIntEncoder(new VInt8IntEnoder()));
- * {@link UniqueValuesIntEncoder} followed by {@link DGapIntEncoder}
-
    - *

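Chaining itself survives this API change; only the plumbing differs. The chain from the removed javadoc above would presumably be built the same way, as in this sketch with sample values (note that SortingIntEncoder, later in this patch, sorts the caller's array in place):

IntEncoder encoder =
    new SortingIntEncoder(
        new UniqueValuesIntEncoder(
            new DGapIntEncoder(
                new VInt8IntEncoder())));
BytesRef buf = new BytesRef();
encoder.encode(new IntsRef(new int[] { 9, 3, 9, 7 }, 0, 4), buf);
// sorted: 3 7 9 9 -> unique: 3 7 9 -> d-gaps: 3 4 2 -> three VInt8 bytes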
- * The default implementation implements {@link #close()} by closing the wrapped - * encoder and {@link #reInit(OutputStream)} by re-initializing the wrapped - * encoder. + * An abstract implementation of {@link IntEncoder} which wraps another encoder. * * @lucene.experimental */ @@ -44,15 +32,8 @@ public abstract class IntEncoderFilter extends IntEncoder { } @Override - public void close() throws IOException { - // There is no need to call super.close(), since we don't pass the output - // stream to super. - encoder.close(); - } - - @Override - public void reInit(OutputStream out) { - encoder.reInit(out); + public void reset() { + encoder.reset(); } } diff --git a/lucene/facet/src/java/org/apache/lucene/util/encoding/NOnesIntDecoder.java b/lucene/facet/src/java/org/apache/lucene/util/encoding/NOnesIntDecoder.java index 6d00e049080..1cf33857280 100644 --- a/lucene/facet/src/java/org/apache/lucene/util/encoding/NOnesIntDecoder.java +++ b/lucene/facet/src/java/org/apache/lucene/util/encoding/NOnesIntDecoder.java @@ -1,7 +1,7 @@ package org.apache.lucene.util.encoding; -import java.io.IOException; -import java.io.InputStream; +import org.apache.lucene.util.BytesRef; +import org.apache.lucene.util.IntsRef; /* * Licensed to the Apache Software Foundation (ASF) under one or more @@ -21,54 +21,65 @@ import java.io.InputStream; */ /** - * Decodes data which was encoded by {@link NOnesIntEncoder}. Uses a - * {@link FourFlagsIntDecoder} to perform the actual encoding and translates the - * values back as described in {@link NOnesIntEncoder}. + * Decodes values encoded with {@link NOnesIntEncoder}. * - * @see NOnesIntEncoder * @lucene.experimental */ public class NOnesIntDecoder extends FourFlagsIntDecoder { - /** Number of consecutive '1's to generate upon decoding a '2'. */ - private int n; - - private int onesCounter; - + // Number of consecutive '1's to generate upon decoding a '2' + private final int n; + private final IntsRef internalBuffer; + /** * Constructs a decoder with a given N (Number of consecutive '1's which are * translated into a single target value '2'. */ public NOnesIntDecoder(int n) { this.n = n; + // initial size (room for 100 integers) + internalBuffer = new IntsRef(100); } @Override - public long decode() throws IOException { - // If we read '2', we should return n '1's. - if (onesCounter > 0) { - --onesCounter; - return 1; - } - - long decode = super.decode(); - if (decode == 1) { - return 1; - } - if (decode == 2) { - onesCounter = n - 1; - return 1; - } - if (decode == 3) { - return 2; - } - return decode == EOS ?
EOS : decode - 1; + protected void reset() { + internalBuffer.length = 0; + super.reset(); } - + @Override - public void reInit(InputStream in) { - super.reInit(in); - onesCounter = 0; + protected void doDecode(BytesRef buf, IntsRef values, int upto) { + super.doDecode(buf, internalBuffer, upto); + if (values.ints.length < internalBuffer.length) { + // need space for internalBuffer.length to internalBuffer.length*N, + // grow mildly at first + values.grow(internalBuffer.length * n/2); + } + + for (int i = 0; i < internalBuffer.length; i++) { + int decode = internalBuffer.ints[i]; + if (decode == 1) { + if (values.length == values.ints.length) { + values.grow(values.length + 10); // grow by few items, however not too many + } + // 1 is 1 + values.ints[values.length++] = 1; + } else if (decode == 2) { + if (values.length + n >= values.ints.length) { + values.grow(values.length + n); // grow by few items, however not too many + } + // '2' means N 1's + for (int j = 0; j < n; j++) { + values.ints[values.length++] = 1; + } + } else { + if (values.length == values.ints.length) { + values.grow(values.length + 10); // grow by few items, however not too many + } + // any other value is val-1 + values.ints[values.length++] = decode - 1; + } + } } @Override diff --git a/lucene/facet/src/java/org/apache/lucene/util/encoding/NOnesIntEncoder.java b/lucene/facet/src/java/org/apache/lucene/util/encoding/NOnesIntEncoder.java index badfe1cd31d..956eea253a3 100644 --- a/lucene/facet/src/java/org/apache/lucene/util/encoding/NOnesIntEncoder.java +++ b/lucene/facet/src/java/org/apache/lucene/util/encoding/NOnesIntEncoder.java @@ -1,7 +1,7 @@ package org.apache.lucene.util.encoding; -import java.io.IOException; -import java.io.OutputStream; +import org.apache.lucene.util.BytesRef; +import org.apache.lucene.util.IntsRef; /* * Licensed to the Apache Software Foundation (ASF) under one or more @@ -50,11 +50,10 @@ import java.io.OutputStream; */ public class NOnesIntEncoder extends FourFlagsIntEncoder { + private final IntsRef internalBuffer; + /** Number of consecutive '1's to be translated into single target value '2'. */ - private int n; - - /** Counts the number of consecutive ones seen. */ - private int onesCounter = 0; + private final int n; /** * Constructs an encoder with a given value of N (N: Number of consecutive @@ -62,38 +61,48 @@ public class NOnesIntEncoder extends FourFlagsIntEncoder { */ public NOnesIntEncoder(int n) { this.n = n; + internalBuffer = new IntsRef(n); } @Override - public void close() throws IOException { - // We might have ones in our buffer, encode them as neccesary. 
- while (onesCounter-- > 0) { - super.encode(1); + protected void reset() { + internalBuffer.length = 0; + super.reset(); + } + + @Override + protected void doEncode(IntsRef values, BytesRef buf, int upto) { + // make sure the internal buffer is large enough + if (values.length > internalBuffer.ints.length) { + internalBuffer.grow(values.length); } - - super.close(); - } - - @Override - public void encode(int value) throws IOException { - if (value == 1) { - // Increment the number of consecutive ones seen so far - if (++onesCounter == n) { - super.encode(2); - onesCounter = 0; + + int onesCounter = 0; + for (int i = values.offset; i < upto; i++) { + int value = values.ints[i]; + if (value == 1) { + // every N 1's should be encoded as '2' + if (++onesCounter == n) { + internalBuffer.ints[internalBuffer.length++] = 2; + onesCounter = 0; + } + } else { + // there might have been 1's that we need to encode + while (onesCounter > 0) { + --onesCounter; + internalBuffer.ints[internalBuffer.length++] = 1; + } + + // encode value as value+1 + internalBuffer.ints[internalBuffer.length++] = value + 1; } - return; } - - // If it's not one - there might have been ones we had to encode prior to - // this value + // there might have been 1's that we need to encode while (onesCounter > 0) { --onesCounter; - super.encode(1); + internalBuffer.ints[internalBuffer.length++] = 1; } - - // encode value + 1 --> the translation. - super.encode(value + 1); + super.doEncode(internalBuffer, buf, internalBuffer.length); } @Override @@ -101,12 +110,6 @@ public class NOnesIntEncoder extends FourFlagsIntEncoder { return new NOnesIntDecoder(n); } - @Override - public void reInit(OutputStream out) { - super.reInit(out); - onesCounter = 0; - } - @Override public String toString() { return "NOnes (" + n + ") (" + super.toString() + ")"; diff --git a/lucene/facet/src/java/org/apache/lucene/util/encoding/SimpleIntDecoder.java b/lucene/facet/src/java/org/apache/lucene/util/encoding/SimpleIntDecoder.java index 414af6effa7..af6fce26af7 100644 --- a/lucene/facet/src/java/org/apache/lucene/util/encoding/SimpleIntDecoder.java +++ b/lucene/facet/src/java/org/apache/lucene/util/encoding/SimpleIntDecoder.java @@ -1,7 +1,7 @@ package org.apache.lucene.util.encoding; -import java.io.IOException; -import java.io.StreamCorruptedException; +import org.apache.lucene.util.BytesRef; +import org.apache.lucene.util.IntsRef; /* * Licensed to the Apache Software Foundation (ASF) under one or more @@ -21,41 +21,24 @@ import java.io.StreamCorruptedException; */ /** - * A simple stream decoder which can decode values encoded with - * {@link SimpleIntEncoder}. + * Decodes values encoded with {@link SimpleIntEncoder}. 
* * @lucene.experimental */ public class SimpleIntDecoder extends IntDecoder { - /** - * reusable buffer - allocated only once as this is not a thread-safe object - */ - private byte[] buffer = new byte[4]; - @Override - public long decode() throws IOException { - - // we need exactly 4 bytes to decode an int in this decoder impl, otherwise, throw an exception - int offset = 0; - while (offset < 4) { - int nRead = in.read(buffer, offset, 4 - offset); - if (nRead == -1) { - if (offset > 0) { - throw new StreamCorruptedException( - "Need 4 bytes for decoding an int, got only " + offset); - } - return EOS; + protected void doDecode(BytesRef buf, IntsRef values, int upto) { + while (buf.offset < upto) { + if (values.length == values.ints.length) { + values.grow(values.length + 10); // grow by few items, however not too many } - offset += nRead; + values.ints[values.length++] = + ((buf.bytes[buf.offset++] & 0xFF) << 24) | + ((buf.bytes[buf.offset++] & 0xFF) << 16) | + ((buf.bytes[buf.offset++] & 0xFF) << 8) | + (buf.bytes[buf.offset++] & 0xFF); } - - int v = buffer[3] & 0xff; - v |= (buffer[2] << 8) & 0xff00; - v |= (buffer[1] << 16) & 0xff0000; - v |= (buffer[0] << 24) & 0xff000000; - - return v; } @Override diff --git a/lucene/facet/src/java/org/apache/lucene/util/encoding/SimpleIntEncoder.java b/lucene/facet/src/java/org/apache/lucene/util/encoding/SimpleIntEncoder.java index 2e17ef29640..fd6a0206117 100644 --- a/lucene/facet/src/java/org/apache/lucene/util/encoding/SimpleIntEncoder.java +++ b/lucene/facet/src/java/org/apache/lucene/util/encoding/SimpleIntEncoder.java @@ -1,6 +1,7 @@ package org.apache.lucene.util.encoding; -import java.io.IOException; +import org.apache.lucene.util.BytesRef; +import org.apache.lucene.util.IntsRef; /* * Licensed to the Apache Software Foundation (ASF) under one or more @@ -26,22 +27,21 @@ import java.io.IOException; */ public class SimpleIntEncoder extends IntEncoder { - /** - * This method makes sure the value wasn't previously encoded by checking - * against the Set. 
If the value wasn't encoded, it's added to the Set, and - * encoded with {#link Vint8#encode} - * - * @param value - * an integer to be encoded - * @throws IOException - * possibly thrown by the OutputStream - */ @Override - public void encode(int value) throws IOException { - out.write(value >>> 24); - out.write((value >> 16) & 0xFF); - out.write((value >> 8) & 0xFF); - out.write(value & 0xFF); + protected void doEncode(IntsRef values, BytesRef buf, int upto) { + // ensure there's enough room in the buffer + int bytesNeeded = values.length * 4; + if (buf.bytes.length < bytesNeeded) { + buf.grow(bytesNeeded); + } + + for (int i = values.offset; i < upto; i++) { + int value = values.ints[i]; + buf.bytes[buf.length++] = (byte) (value >>> 24); + buf.bytes[buf.length++] = (byte) ((value >> 16) & 0xFF); + buf.bytes[buf.length++] = (byte) ((value >> 8) & 0xFF); + buf.bytes[buf.length++] = (byte) (value & 0xFF); + } } @Override diff --git a/lucene/facet/src/java/org/apache/lucene/util/encoding/SortingIntEncoder.java b/lucene/facet/src/java/org/apache/lucene/util/encoding/SortingIntEncoder.java index 9f6cee635ce..0ebb06efa85 100644 --- a/lucene/facet/src/java/org/apache/lucene/util/encoding/SortingIntEncoder.java +++ b/lucene/facet/src/java/org/apache/lucene/util/encoding/SortingIntEncoder.java @@ -1,9 +1,10 @@ package org.apache.lucene.util.encoding; -import java.io.IOException; -import java.io.OutputStream; import java.util.Arrays; +import org.apache.lucene.util.BytesRef; +import org.apache.lucene.util.IntsRef; + /* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with @@ -23,47 +24,21 @@ import java.util.Arrays; /** * An {@link IntEncoderFilter} which sorts the values to encode in ascending - * order before encoding them. Encoding therefore happens upon calling - * {@link #close()}. Since this encoder is usually chained with another encoder - * that relies on sorted values, it does not offer a default constructor. + * order before encoding them. * * @lucene.experimental */ public class SortingIntEncoder extends IntEncoderFilter { - private float grow = 2.0f; - private int index = 0; - private int[] set = new int[1024]; - /** Initializes with the given encoder. 
*/ public SortingIntEncoder(IntEncoder encoder) { super(encoder); } @Override - public void close() throws IOException { - if (index == 0) { - return; - } - - Arrays.sort(set, 0, index); - for (int i = 0; i < index; i++) { - encoder.encode(set[i]); - } - encoder.close(); - index = 0; - - super.close(); - } - - @Override - public void encode(int value) throws IOException { - if (index == set.length) { - int[] newSet = new int[(int) (set.length * grow)]; - System.arraycopy(set, 0, newSet, 0, set.length); - set = newSet; - } - set[index++] = value; + protected void doEncode(IntsRef values, BytesRef buf, int upto) { + Arrays.sort(values.ints, values.offset, upto); + encoder.doEncode(values, buf, upto); } @Override @@ -71,12 +46,6 @@ public class SortingIntEncoder extends IntEncoderFilter { return encoder.createMatchingDecoder(); } - @Override - public void reInit(OutputStream out) { - super.reInit(out); - index = 0; - } - @Override public String toString() { return "Sorting (" + encoder.toString() + ")"; diff --git a/lucene/facet/src/java/org/apache/lucene/util/encoding/UniqueValuesIntEncoder.java b/lucene/facet/src/java/org/apache/lucene/util/encoding/UniqueValuesIntEncoder.java index 3583402295f..c9a6be5c848 100644 --- a/lucene/facet/src/java/org/apache/lucene/util/encoding/UniqueValuesIntEncoder.java +++ b/lucene/facet/src/java/org/apache/lucene/util/encoding/UniqueValuesIntEncoder.java @@ -1,7 +1,7 @@ package org.apache.lucene.util.encoding; -import java.io.IOException; -import java.io.OutputStream; +import org.apache.lucene.util.BytesRef; +import org.apache.lucene.util.IntsRef; /* * Licensed to the Apache Software Foundation (ASF) under one or more @@ -22,7 +22,7 @@ import java.io.OutputStream; /** * An {@link IntEncoderFilter} which ensures only unique values are encoded. The - * implementation assumes the values given to {@link #encode(int)} are sorted. + * implementation assumes the values given to {@link #encode(IntsRef, BytesRef)} are sorted. * If this is not the case, you can chain this encoder with * {@link SortingIntEncoder}. * @@ -30,26 +30,23 @@ import java.io.OutputStream; */ public final class UniqueValuesIntEncoder extends IntEncoderFilter { - /** - * Denotes an illegal value which we can use to init 'prev' to. Since all - * encoded values are integers, this value is init to MAX_INT+1 and is of type - * long. Therefore we are guaranteed not to get this value in encode. - */ - private static final long ILLEGAL_VALUE = Integer.MAX_VALUE + 1; - - private long prev = ILLEGAL_VALUE; - /** Constructs a new instance with the given encoder. 
*/ public UniqueValuesIntEncoder(IntEncoder encoder) { super(encoder); } @Override - public void encode(int value) throws IOException { - if (prev != value) { - encoder.encode(value); - prev = value; + protected void doEncode(IntsRef values, BytesRef buf, int upto) { + int prev = values.ints[values.offset]; + int idx = values.offset + 1; + for (int i = idx; i < upto; i++) { + if (values.ints[i] != prev) { + values.ints[idx++] = values.ints[i]; + prev = values.ints[i]; + } } + values.length = idx - values.offset; + encoder.doEncode(values, buf, idx); } @Override @@ -57,12 +54,6 @@ public final class UniqueValuesIntEncoder extends IntEncoderFilter { return encoder.createMatchingDecoder(); } - @Override - public void reInit(OutputStream out) { - super.reInit(out); - prev = ILLEGAL_VALUE; - } - @Override public String toString() { return "Unique (" + encoder.toString() + ")"; diff --git a/lucene/facet/src/java/org/apache/lucene/util/encoding/VInt8.java b/lucene/facet/src/java/org/apache/lucene/util/encoding/VInt8.java new file mode 100644 index 00000000000..267d52bae96 --- /dev/null +++ b/lucene/facet/src/java/org/apache/lucene/util/encoding/VInt8.java @@ -0,0 +1,138 @@ +package org.apache.lucene.util.encoding; + +import org.apache.lucene.util.BytesRef; + +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** + * Variable-length encoding of 32-bit integers, into 8-bit bytes. A number is + * encoded as follows: + *

      + *
• If it is less than 127 and non-negative (i.e., if the number uses only 7 + * bits), it is encoded as a single byte: 0bbbbbbb.
    • If its highest nonzero bit is greater than bit 6 (0x40), it is + * represented as a series of bytes, each byte's 7 LSB containing bits from the + * original value, with the MSB set for all but the last byte. The first encoded + * byte contains the highest nonzero bits from the original; the second byte + * contains the next 7 MSB; and so on, with the last byte containing the 7 LSB + * of the original. + *
    + * Examples: + *
      + *
    1. n = 117 = 1110101: This has fewer than 8 significant bits, and so is + * encoded as 01110101 = 0x75. + *
    2. n = 100000 = (binary) 11000011010100000. This has 17 significant bits, + * and so needs three Vint8 bytes. Left-zero-pad it to a multiple of 7 bits, + * then split it into chunks of 7 and add an MSB, 0 for the last byte, 1 for the + * others: 1|0000110 1|0001101 0|0100000 = 0x86 0x8D 0x20. + *
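Example 2 can be checked directly against the helpers this class adds, as in this quick sketch (assumes a context where org.apache.lucene.util.BytesRef is imported):

BytesRef bytes = new BytesRef(VInt8.MAXIMUM_BYTES_NEEDED);
VInt8.encode(100000, bytes);   // bytes.length becomes 3: 0x86, 0x8D, 0x20
int decoded = VInt8.decode(bytes);   // returns 100000 and advances bytes.offset to 3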
    + * {@link #encode(int, BytesRef)} and {@link #decode(BytesRef)} will correctly + * handle any 32-bit integer, but for negative numbers, and positive numbers + * with more than 28 significant bits, encoding requires 5 bytes; this is not an + * efficient encoding scheme for large positive numbers or any negative number. + * + * @lucene.experimental + */ +public class VInt8 { + + /** The maximum number of bytes needed to encode an integer. */ + public static final int MAXIMUM_BYTES_NEEDED = 5; + + /** + * Decodes an int from the given bytes, starting at {@link BytesRef#offset}. + * Returns the decoded bytes and updates {@link BytesRef#offset}. + */ + public static int decode(BytesRef bytes) { + /* + This is the original code of this method, but a Hotspot bug + corrupted the for-loop of DataInput.readVInt() (see LUCENE-2975) + so the loop was unwounded here too, to be on the safe side + int value = 0; + while (true) { + byte first = bytes.bytes[bytes.offset++]; + value |= first & 0x7F; + if ((first & 0x80) == 0) { + return value; + } + value <<= 7; + } + */ + + // byte 1 + byte b = bytes.bytes[bytes.offset++]; + if (b >= 0) return b; + + // byte 2 + int value = b & 0x7F; + b = bytes.bytes[bytes.offset++]; + value = (value << 7) | b & 0x7F; + if (b >= 0) return value; + + // byte 3 + b = bytes.bytes[bytes.offset++]; + value = (value << 7) | b & 0x7F; + if (b >= 0) return value; + + // byte 4 + b = bytes.bytes[bytes.offset++]; + value = (value << 7) | b & 0x7F; + if (b >= 0) return value; + + // byte 5 + b = bytes.bytes[bytes.offset++]; + return (value << 7) | b & 0x7F; + } + + /** + * Encodes the given number into bytes, starting at {@link BytesRef#length}. + * Assumes that the array is large enough. + */ + public static void encode(int value, BytesRef bytes) { + if ((value & ~0x7F) == 0) { + bytes.bytes[bytes.length] = (byte) value; + bytes.length++; + } else if ((value & ~0x3FFF) == 0) { + bytes.bytes[bytes.length] = (byte) (0x80 | ((value & 0x3F80) >> 7)); + bytes.bytes[bytes.length + 1] = (byte) (value & 0x7F); + bytes.length += 2; + } else if ((value & ~0x1FFFFF) == 0) { + bytes.bytes[bytes.length] = (byte) (0x80 | ((value & 0x1FC000) >> 14)); + bytes.bytes[bytes.length + 1] = (byte) (0x80 | ((value & 0x3F80) >> 7)); + bytes.bytes[bytes.length + 2] = (byte) (value & 0x7F); + bytes.length += 3; + } else if ((value & ~0xFFFFFFF) == 0) { + bytes.bytes[bytes.length] = (byte) (0x80 | ((value & 0xFE00000) >> 21)); + bytes.bytes[bytes.length + 1] = (byte) (0x80 | ((value & 0x1FC000) >> 14)); + bytes.bytes[bytes.length + 2] = (byte) (0x80 | ((value & 0x3F80) >> 7)); + bytes.bytes[bytes.length + 3] = (byte) (value & 0x7F); + bytes.length += 4; + } else { + bytes.bytes[bytes.length] = (byte) (0x80 | ((value & 0xF0000000) >> 28)); + bytes.bytes[bytes.length + 1] = (byte) (0x80 | ((value & 0xFE00000) >> 21)); + bytes.bytes[bytes.length + 2] = (byte) (0x80 | ((value & 0x1FC000) >> 14)); + bytes.bytes[bytes.length + 3] = (byte) (0x80 | ((value & 0x3F80) >> 7)); + bytes.bytes[bytes.length + 4] = (byte) (value & 0x7F); + bytes.length += 5; + } + } + + private VInt8() { + // Just making it impossible to instantiate. 
+ } + +} diff --git a/lucene/facet/src/java/org/apache/lucene/util/encoding/VInt8IntDecoder.java b/lucene/facet/src/java/org/apache/lucene/util/encoding/VInt8IntDecoder.java index 2beaf773d78..e9fe5600c9a 100644 --- a/lucene/facet/src/java/org/apache/lucene/util/encoding/VInt8IntDecoder.java +++ b/lucene/facet/src/java/org/apache/lucene/util/encoding/VInt8IntDecoder.java @@ -1,6 +1,7 @@ package org.apache.lucene.util.encoding; -import java.io.IOException; +import org.apache.lucene.util.BytesRef; +import org.apache.lucene.util.IntsRef; /* * Licensed to the Apache Software Foundation (ASF) under one or more @@ -20,33 +21,19 @@ import java.io.IOException; */ /** - * An {@link IntDecoder} which can decode values encoded by - * {@link VInt8IntEncoder}. + * Decodes values encoded by {@link VInt8IntEncoder}. * * @lucene.experimental */ public class VInt8IntDecoder extends IntDecoder { - private boolean legalEOS = true; - @Override - public long decode() throws IOException { - int value = 0; - while (true) { - int first = in.read(); - if (first < 0) { - if (!legalEOS) { - throw new IOException("Unexpected End-Of-Stream"); - } - return EOS; + protected void doDecode(BytesRef buf, IntsRef values, int upto) { + while (buf.offset < upto) { + if (values.length == values.ints.length) { + values.grow(values.length + 10); // grow by few items, however not too many } - value |= first & 0x7F; - if ((first & 0x80) == 0) { - legalEOS = true; - return value; - } - legalEOS = false; - value <<= 7; + values.ints[values.length++] = VInt8.decode(buf); } } diff --git a/lucene/facet/src/java/org/apache/lucene/util/encoding/VInt8IntEncoder.java b/lucene/facet/src/java/org/apache/lucene/util/encoding/VInt8IntEncoder.java index 8fe6fbc9e09..7c62bf3e035 100644 --- a/lucene/facet/src/java/org/apache/lucene/util/encoding/VInt8IntEncoder.java +++ b/lucene/facet/src/java/org/apache/lucene/util/encoding/VInt8IntEncoder.java @@ -1,6 +1,7 @@ package org.apache.lucene.util.encoding; -import java.io.IOException; +import org.apache.lucene.util.BytesRef; +import org.apache.lucene.util.IntsRef; /* * Licensed to the Apache Software Foundation (ASF) under one or more @@ -49,27 +50,14 @@ import java.io.IOException; public class VInt8IntEncoder extends IntEncoder { @Override - public void encode(int value) throws IOException { - if ((value & ~0x7F) == 0) { - out.write(value); - } else if ((value & ~0x3FFF) == 0) { - out.write(0x80 | (value >> 7)); - out.write(0x7F & value); - } else if ((value & ~0x1FFFFF) == 0) { - out.write(0x80 | (value >> 14)); - out.write(0x80 | (value >> 7)); - out.write(0x7F & value); - } else if ((value & ~0xFFFFFFF) == 0) { - out.write(0x80 | (value >> 21)); - out.write(0x80 | (value >> 14)); - out.write(0x80 | (value >> 7)); - out.write(0x7F & value); - } else { - out.write(0x80 | (value >> 28)); - out.write(0x80 | (value >> 21)); - out.write(0x80 | (value >> 14)); - out.write(0x80 | (value >> 7)); - out.write(0x7F & value); + protected void doEncode(IntsRef values, BytesRef buf, int upto) { + int maxBytesNeeded = 5 * values.length; // at most 5 bytes per VInt + if (buf.bytes.length < maxBytesNeeded) { + buf.grow(maxBytesNeeded); + } + + for (int i = values.offset; i < upto; i++) { + VInt8.encode(values.ints[i], buf); } } diff --git a/lucene/facet/src/java/org/apache/lucene/util/encoding/package.html b/lucene/facet/src/java/org/apache/lucene/util/encoding/package.html index 2e50e8f38eb..8a81b258e34 100644 --- a/lucene/facet/src/java/org/apache/lucene/util/encoding/package.html +++ 
b/lucene/facet/src/java/org/apache/lucene/util/encoding/package.html @@ -25,49 +25,8 @@ mechanisms to create new ones. The super class for all encoders is encoders there is a matching {@link org.apache.lucene.util.encoding.IntDecoder} implementation (not all encoders need a decoder). -

    An encoder encodes the integers that are passed to {@link -org.apache.lucene.util.encoding.IntEncoder#encode(int) encode} into a -set output stream (see {@link -org.apache.lucene.util.encoding.IntEncoder#reInit(OutputStream) -reInit}). One should always call {@link -org.apache.lucene.util.encoding.IntEncoder#close() close} when all -integers have been encoded, to ensure proper finish by the encoder. Some -encoders buffer values in-memory and encode in batches in order to -optimize the encoding, and not closing them may result in loss of -information or corrupt stream. -

    A proper and typical usage of an encoder looks like this: -

    
    -int[] data = <the values to encode>
    -IntEncoder encoder = new VInt8IntEncoder();
    -OutputStream out = new ByteArrayOutputStream();
    -encoder.reInit(out);
    -for (int val : data) {
    -  encoder.encode(val);
    -}
    -encoder.close();
    -
    -// Print the bytes in binary
    -byte[] bytes = out.toByteArray();
    -for (byte b : bytes) {
    -  System.out.println(Integer.toBinaryString(b));
    -}
    -
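Under the bulk API introduced by this patch, the same flow would presumably read as follows (a sketch, not text from the patch, with IntsRef/BytesRef in place of the streams and concrete sample values):

IntsRef data = new IntsRef(new int[] { 5, 7, 12 }, 0, 3);   // the values to encode
IntEncoder encoder = new VInt8IntEncoder();
BytesRef bytes = new BytesRef();
encoder.encode(data, bytes);

// Print the bytes in binary
for (int i = 0; i < bytes.length; i++) {
  System.out.println(Integer.toBinaryString(bytes.bytes[i] & 0xFF));
}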
    -Each encoder also implements {@link -org.apache.lucene.util.encoding.IntEncoder#createMatchingDecoder() -createMatchingDecoder} which returns the matching decoder for this encoder. -As mentioned above, not all encoders have a matching decoder (like some -encoder filters which are explained next), however every encoder should -return a decoder following a call to that method. To complete the -example above, one can easily iterate over the decoded values like this: -
    
    -IntDecoder d = e.createMatchingDecoder();
    -d.reInit(new ByteArrayInputStream(bytes));
    -long val;
    -while ((val = d.decode()) != IntDecoder.EOS) {
    -  System.out.println(val);
    -}
    -
    -
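Continuing that sketch, the bulk equivalent of this loop needs no EOS sentinel: the decoder fills an IntsRef and sets its length:

IntDecoder d = encoder.createMatchingDecoder();
IntsRef decoded = new IntsRef();
d.decode(bytes, decoded);
for (int i = 0; i < decoded.length; i++) {
  System.out.println(decoded.ints[i]);
}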

    Some encoders don't perform any encoding at all, or do not include an +

    +Some encoders don't perform any encoding at all, or do not include an encoding logic. Those are called {@link org.apache.lucene.util.encoding.IntEncoderFilter}s. A filter is an encoder which delegates the encoding task to a given encoder, however @@ -76,91 +35,6 @@ example is {@link org.apache.lucene.util.encoding.DGapIntEncoder} which encodes the gaps between values rather than the values themselves. Another example is {@link org.apache.lucene.util.encoding.SortingIntEncoder} which sorts all the -values in ascending order before they are sent for encoding. This -encoder aggregates the values in its {@link -org.apache.lucene.util.encoding.IntEncoder#encode(int) encode} implementation -and decoding only happens upon calling {@link -org.apache.lucene.util.encoding.IntEncoder#close() close}. -

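As a worked example of the d-gap idea, the sorted ordinals 3, 7, 17, 20 are handed down to the wrapped encoder as the deltas 3, 4, 10, 3, smaller numbers that a variable-length scheme such as VInt8 stores in fewer bytes. A sketch with sample values:

IntEncoder encoder = new DGapIntEncoder(new VInt8IntEncoder());
BytesRef buf = new BytesRef();
encoder.encode(new IntsRef(new int[] { 3, 7, 17, 20 }, 0, 4), buf);   // encodes 3, 4, 10, 3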
    Extending IntEncoder

    -Extending {@link org.apache.lucene.util.encoding.IntEncoder} is a very -easy task. One only needs to implement {@link -org.apache.lucene.util.encoding.IntEncoder#encode(int) encode} and -{@link org.apache.lucene.util.encoding.IntEncoder#createMatchingDecoder() -createMatchingDecoder} as the base implementation takes care of -re-initializing the output stream and closing it. The following example -illustrates how can one write an encoder (and a matching decoder) which -'tags' the stream with type/ID of the encoder. Such tagging is important -in scenarios where an application uses different encoders for different -streams, and wants to manage some sort of mapping between an encoder ID -to an IntEncoder/Decoder implementation, so a proper decoder will be -initialized on the fly: -
    
    -public class TaggingIntEncoder extends IntEncoderFilter {
    -  
    -  public TaggingIntEncoder(IntEncoder encoder) {
    -    super(encoder);
    -  }
    -  
    -  @Override
    -  public void encode(int value) throws IOException {
    -    encoder.encode(value);
    -  }
    -
    -  @Override
    -  public IntDecoder createMatchingDecoder() {
    -    return new TaggingIntDecoder();
    -  }
    -	
    -  @Override
    -  public void reInit(OutputStream out) {
    -    super.reInit(os);
-    // Assumes the application has a static EncodersMap class which is able to
    -    // return a unique ID for a given encoder.
    -    int encoderID = EncodersMap.getID(encoder);
    -    this.out.write(encoderID);
    -  }
    -
    -  @Override
    -  public String toString() {
    -    return "Tagging (" + encoder.toString() + ")";
    -  }
    -
    -}
    -
    -And the matching decoder: -
    
    -public class TaggingIntDecoder extends IntDecoder {
    -  
    -  // Will be initialized upon calling reInit.
    -  private IntDecoder decoder;
    -  
    -  @Override
    -  public void reInit(InputStream in) {
    -    super.reInit(in);
    -    
    -    // Read the ID of the encoder that tagged this stream.
    -    int encoderID = in.read();
    -    
    -    // Assumes EncodersMap can return the proper IntEncoder given the ID.
    -    decoder = EncodersMap.getEncoder(encoderID).createMatchingDecoder();
    -  }
    -	
    -  @Override
    -  public long decode() throws IOException {
    -    return decoder.decode();
    -  }
    -
    -  @Override
    -  public String toString() {
    -    return "Tagging (" + decoder == null ? "none" : decoder.toString() + ")";
    -  }
    -
    -}
    -
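Under the bulk API, the same tagging idea would presumably move into doEncode. A rough sketch of the encoder half only, reusing the hypothetical EncodersMap helper from the example above and assuming the class lives in the encoding package so the wrapped encoder's doEncode is accessible:

public class TaggingIntEncoder extends IntEncoderFilter {

  public TaggingIntEncoder(IntEncoder encoder) {
    super(encoder);
  }

  @Override
  protected void doEncode(IntsRef values, BytesRef buf, int upto) {
    // Write the tag byte first, then let the wrapped encoder append after it.
    if (buf.bytes.length == buf.length) {
      buf.grow(buf.length + 1);
    }
    buf.bytes[buf.length++] = (byte) EncodersMap.getID(encoder);
    encoder.doEncode(values, buf, upto);
  }

  @Override
  public IntDecoder createMatchingDecoder() {
    return new TaggingIntDecoder();   // would read the tag byte before delegating
  }

}

Because this writes into the buffer before delegating, something none of the patch's own filters do, a real implementation would also have to make sure the wrapped encoder's size estimates account for the extra byte.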
    -The example implements TaggingIntEncoder as a filter over another -encoder. Even though it does not do any filtering on the actual values, it feels -right to present it as a filter. Anyway, this is just an example code and one -can choose to implement it however it makes sense to the application. For -simplicity, error checking was omitted from the sample code. +values in ascending order before they are sent for encoding. \ No newline at end of file diff --git a/lucene/facet/src/test/org/apache/lucene/facet/example/TestAssociationExample.java b/lucene/facet/src/test/org/apache/lucene/facet/example/TestAssociationExample.java index c630fe95330..203343859e6 100644 --- a/lucene/facet/src/test/org/apache/lucene/facet/example/TestAssociationExample.java +++ b/lucene/facet/src/test/org/apache/lucene/facet/example/TestAssociationExample.java @@ -4,7 +4,7 @@ import org.junit.Test; import org.apache.lucene.util.LuceneTestCase; import org.apache.lucene.facet.example.ExampleResult; -import org.apache.lucene.facet.example.association.AssociationMain; +import org.apache.lucene.facet.example.association.CategoryAssociationsMain; import org.apache.lucene.facet.search.results.FacetResultNode; /* @@ -35,8 +35,8 @@ public class TestAssociationExample extends LuceneTestCase { @Test public void testAssociationExamples() throws Exception { - assertExampleResult(new AssociationMain().runSumIntAssociationSample(), EXPECTED_INT_SUM_RESULTS); - assertExampleResult(new AssociationMain().runSumFloatAssociationSample(), EXPECTED_FLOAT_SUM_RESULTS); + assertExampleResult(new CategoryAssociationsMain().runSumIntAssociationSample(), EXPECTED_INT_SUM_RESULTS); + assertExampleResult(new CategoryAssociationsMain().runSumFloatAssociationSample(), EXPECTED_FLOAT_SUM_RESULTS); } private void assertExampleResult(ExampleResult res, double[] expectedResults) { diff --git a/lucene/facet/src/test/org/apache/lucene/facet/search/CategoryListIteratorTest.java b/lucene/facet/src/test/org/apache/lucene/facet/search/CategoryListIteratorTest.java index ddb4ac54480..7db8056a71c 100644 --- a/lucene/facet/src/test/org/apache/lucene/facet/search/CategoryListIteratorTest.java +++ b/lucene/facet/src/test/org/apache/lucene/facet/search/CategoryListIteratorTest.java @@ -19,8 +19,8 @@ import org.apache.lucene.index.RandomIndexWriter; import org.apache.lucene.index.Term; import org.apache.lucene.store.Directory; import org.apache.lucene.util.BytesRef; +import org.apache.lucene.util.IntsRef; import org.apache.lucene.util.LuceneTestCase; -import org.apache.lucene.util.UnsafeByteArrayOutputStream; import org.apache.lucene.util.encoding.DGapIntEncoder; import org.apache.lucene.util.encoding.IntEncoder; import org.apache.lucene.util.encoding.SortingIntEncoder; @@ -49,17 +49,19 @@ public class CategoryListIteratorTest extends LuceneTestCase { private static final class DataTokenStream extends TokenStream { + private final PayloadAttribute payload = addAttribute(PayloadAttribute.class); + private final BytesRef buf; + private final IntEncoder encoder; + private final CharTermAttribute term = addAttribute(CharTermAttribute.class); + private int idx; - private PayloadAttribute payload = addAttribute(PayloadAttribute.class); - private byte[] buf = new byte[20]; - UnsafeByteArrayOutputStream ubaos = new UnsafeByteArrayOutputStream(buf); - IntEncoder encoder; private boolean exhausted = false; - private CharTermAttribute term = addAttribute(CharTermAttribute.class); public DataTokenStream(String text, IntEncoder encoder) { this.encoder = encoder; 
term.setEmpty().append(text); + buf = new BytesRef(); + payload.setPayload(buf); } public void setIdx(int idx) { @@ -73,30 +75,26 @@ public class CategoryListIteratorTest extends LuceneTestCase { return false; } - int[] values = data[idx]; - ubaos.reInit(buf); - encoder.reInit(ubaos); - for (int val : values) { - encoder.encode(val); - } - encoder.close(); - payload.setPayload(new BytesRef(buf, 0, ubaos.length())); - + // must copy because encoders may change the buffer + encoder.encode(IntsRef.deepCopyOf(data[idx]), buf); exhausted = true; return true; } } - static final int[][] data = new int[][] { - new int[] { 1, 2 }, new int[] { 3, 4 }, new int[] { 1, 3 }, new int[] { 1, 2, 3, 4 }, + static final IntsRef[] data = new IntsRef[] { + new IntsRef(new int[] { 1, 2 }, 0, 2), + new IntsRef(new int[] { 3, 4 }, 0, 2), + new IntsRef(new int[] { 1, 3 }, 0, 2), + new IntsRef(new int[] { 1, 2, 3, 4 }, 0, 4) }; @Test - public void testPayloadIntDecodingIterator() throws Exception { + public void testPayloadCategoryListIteraor() throws Exception { Directory dir = newDirectory(); - DataTokenStream dts = new DataTokenStream("1",new SortingIntEncoder( - new UniqueValuesIntEncoder(new DGapIntEncoder(new VInt8IntEncoder())))); + final IntEncoder encoder = new SortingIntEncoder(new UniqueValuesIntEncoder(new DGapIntEncoder(new VInt8IntEncoder()))); + DataTokenStream dts = new DataTokenStream("1",encoder); RandomIndexWriter writer = new RandomIndexWriter(random(), dir, newIndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(random(), MockTokenizer.KEYWORD, false)).setMergePolicy(newLogMergePolicy())); for (int i = 0; i < data.length; i++) { @@ -108,21 +106,21 @@ public class CategoryListIteratorTest extends LuceneTestCase { IndexReader reader = writer.getReader(); writer.close(); - CategoryListIterator cli = new PayloadIntDecodingIterator(reader, new Term( - "f","1"), dts.encoder.createMatchingDecoder()); + IntsRef ordinals = new IntsRef(); + CategoryListIterator cli = new PayloadCategoryListIteraor(reader, new Term("f","1"), encoder.createMatchingDecoder()); cli.init(); int totalCategories = 0; for (int i = 0; i < data.length; i++) { Set values = new HashSet(); for (int j = 0; j < data[i].length; j++) { - values.add(data[i][j]); + values.add(data[i].ints[j]); } - cli.skipTo(i); - long cat; - while ((cat = cli.nextCategory()) < Integer.MAX_VALUE) { - assertTrue("expected category not found: " + cat, values.contains((int) cat)); - totalCategories ++; + cli.getOrdinals(i, ordinals); + assertTrue("no ordinals for document " + i, ordinals.length > 0); + for (int j = 0; j < ordinals.length; j++) { + assertTrue("expected category not found: " + ordinals.ints[j], values.contains(ordinals.ints[j])); } + totalCategories += ordinals.length; } assertEquals("Missing categories!",10,totalCategories); reader.close(); @@ -135,8 +133,8 @@ public class CategoryListIteratorTest extends LuceneTestCase { @Test public void testPayloadIteratorWithInvalidDoc() throws Exception { Directory dir = newDirectory(); - DataTokenStream dts = new DataTokenStream("1",new SortingIntEncoder( - new UniqueValuesIntEncoder(new DGapIntEncoder(new VInt8IntEncoder())))); + final IntEncoder encoder = new SortingIntEncoder(new UniqueValuesIntEncoder(new DGapIntEncoder(new VInt8IntEncoder()))); + DataTokenStream dts = new DataTokenStream("1", encoder); // this test requires that no payloads ever be randomly present! 
final Analyzer noPayloadsAnalyzer = new Analyzer() { @Override @@ -162,30 +160,27 @@ public class CategoryListIteratorTest extends LuceneTestCase { IndexReader reader = writer.getReader(); writer.close(); - CategoryListIterator cli = new PayloadIntDecodingIterator(reader, new Term( - "f","1"), dts.encoder.createMatchingDecoder()); + IntsRef ordinals = new IntsRef(); + CategoryListIterator cli = new PayloadCategoryListIteraor(reader, new Term("f","1"), encoder.createMatchingDecoder()); assertTrue("Failed to initialize payload iterator", cli.init()); - int totalCats = 0; + int totalCategories = 0; for (int i = 0; i < data.length; i++) { - // doc no. i Set values = new HashSet(); for (int j = 0; j < data[i].length; j++) { - values.add(data[i][j]); + values.add(data[i].ints[j]); } - boolean hasDoc = cli.skipTo(i); - if (hasDoc) { - assertTrue("Document " + i + " must not have a payload!", i == 0); - long cat; - while ((cat = cli.nextCategory()) < Integer.MAX_VALUE) { - assertTrue("expected category not found: " + cat, values.contains((int) cat)); - ++totalCats; + cli.getOrdinals(i, ordinals); + if (i == 0) { + assertTrue("document 0 must have a payload", ordinals.length > 0); + for (int j = 0; j < ordinals.length; j++) { + assertTrue("expected category not found: " + ordinals.ints[j], values.contains(ordinals.ints[j])); } + totalCategories += ordinals.length; } else { - assertFalse("Document " + i + " must have a payload!", i == 0); + assertTrue("only document 0 should have a payload", ordinals.length == 0); } - } - assertEquals("Wrong number of total categories!", 2, totalCats); + assertEquals("Wrong number of total categories!", 2, totalCategories); reader.close(); dir.close(); diff --git a/lucene/facet/src/test/org/apache/lucene/facet/search/DrillDownTest.java b/lucene/facet/src/test/org/apache/lucene/facet/search/DrillDownTest.java index 6292ea37353..c20a98c41fe 100644 --- a/lucene/facet/src/test/org/apache/lucene/facet/search/DrillDownTest.java +++ b/lucene/facet/src/test/org/apache/lucene/facet/search/DrillDownTest.java @@ -90,7 +90,9 @@ public class DrillDownTest extends LuceneTestCase { paths.add(new CategoryPath("b")); } FacetFields facetFields = new FacetFields(taxoWriter); - facetFields.addFields(doc, paths); + if (paths.size() > 0) { + facetFields.addFields(doc, paths); + } writer.addDocument(doc); } diff --git a/lucene/facet/src/test/org/apache/lucene/facet/search/TestCategoryListCache.java b/lucene/facet/src/test/org/apache/lucene/facet/search/TestCategoryListCache.java index 08b48aad3fe..aaafc9abd1e 100644 --- a/lucene/facet/src/test/org/apache/lucene/facet/search/TestCategoryListCache.java +++ b/lucene/facet/src/test/org/apache/lucene/facet/search/TestCategoryListCache.java @@ -6,6 +6,7 @@ import java.util.List; import java.util.Map; import org.apache.lucene.search.MatchAllDocsQuery; +import org.apache.lucene.util.IntsRef; import org.junit.After; import org.junit.Before; @@ -118,18 +119,17 @@ public class TestCategoryListCache extends FacetTestBase { @Override public CategoryListIterator iterator(int partition) throws IOException { final CategoryListIterator it = cld.iterator(partition); - return new CategoryListIterator() { + return new CategoryListIterator() { @Override - public boolean skipTo(int docId) throws IOException { - return it.skipTo(docId); - } - @Override - public long nextCategory() throws IOException { - long res = it.nextCategory(); - if (res>Integer.MAX_VALUE) { - return res; + public void getOrdinals(int docID, IntsRef ints) throws IOException { + 
it.getOrdinals(docID, ints); + for (int i = 0; i < ints.length; i++) { + if (ints.ints[i] > 1) { + ints.ints[i]--; + } else { + ints.ints[i]++; + } } - return res>1 ? res-1 : res+1; } @Override public boolean init() throws IOException { diff --git a/lucene/facet/src/test/org/apache/lucene/facet/search/params/MultiCategoryListIteratorTest.java b/lucene/facet/src/test/org/apache/lucene/facet/search/params/MultiCategoryListIteratorTest.java new file mode 100644 index 00000000000..404ac5f54e8 --- /dev/null +++ b/lucene/facet/src/test/org/apache/lucene/facet/search/params/MultiCategoryListIteratorTest.java @@ -0,0 +1,126 @@ +package org.apache.lucene.facet.search.params; + +import java.util.ArrayList; +import java.util.HashMap; +import java.util.Random; + +import org.apache.lucene.document.Document; +import org.apache.lucene.facet.index.FacetFields; +import org.apache.lucene.facet.index.params.CategoryListParams; +import org.apache.lucene.facet.index.params.PerDimensionIndexingParams; +import org.apache.lucene.facet.search.CategoryListIterator; +import org.apache.lucene.facet.search.PayloadCategoryListIteraor; +import org.apache.lucene.facet.search.cache.CategoryListCache; +import org.apache.lucene.facet.taxonomy.CategoryPath; +import org.apache.lucene.facet.taxonomy.TaxonomyReader; +import org.apache.lucene.facet.taxonomy.TaxonomyWriter; +import org.apache.lucene.facet.taxonomy.directory.DirectoryTaxonomyReader; +import org.apache.lucene.facet.taxonomy.directory.DirectoryTaxonomyWriter; +import org.apache.lucene.facet.util.MultiCategoryListIterator; +import org.apache.lucene.index.DirectoryReader; +import org.apache.lucene.index.IndexWriter; +import org.apache.lucene.index.Term; +import org.apache.lucene.store.Directory; +import org.apache.lucene.util.IOUtils; +import org.apache.lucene.util.IntsRef; +import org.apache.lucene.util.LuceneTestCase; +import org.apache.lucene.util.encoding.IntDecoder; +import org.junit.Test; + +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +public class MultiCategoryListIteratorTest extends LuceneTestCase { + + @Test + public void testMultipleCategoryLists() throws Exception { + Random random = random(); + int numDimensions = atLeast(random, 2); // at least 2 dimensions + String[] dimensions = new String[numDimensions]; + for (int i = 0; i < numDimensions; i++) { + dimensions[i] = "dim" + i; + } + + // build the PerDimensionIndexingParams + HashMap clps = new HashMap(); + for (String dim : dimensions) { + CategoryPath cp = new CategoryPath(dim); + CategoryListParams clp = new CategoryListParams(new Term("$" + dim, CategoryListParams.DEFAULT_TERM.bytes())); + clps.put(cp, clp); + } + PerDimensionIndexingParams indexingParams = new PerDimensionIndexingParams(clps); + + // index some documents + Directory indexDir = newDirectory(); + Directory taxoDir = newDirectory(); + IndexWriter indexWriter = new IndexWriter(indexDir, newIndexWriterConfig(TEST_VERSION_CURRENT, null).setMaxBufferedDocs(2)); + TaxonomyWriter taxoWriter = new DirectoryTaxonomyWriter(taxoDir); + FacetFields facetFields = new FacetFields(taxoWriter, indexingParams); + int ndocs = atLeast(random, 10); + for (int i = 0; i < ndocs; i++) { + Document doc = new Document(); + int numCategories = random.nextInt(numDimensions) + 1; + ArrayList categories = new ArrayList(); + for (int j = 0; j < numCategories; j++) { + String dimension = dimensions[random.nextInt(dimensions.length)]; + categories.add(new CategoryPath(dimension, Integer.toString(i))); + } + facetFields.addFields(doc, categories); + indexWriter.addDocument(doc); + } + IOUtils.close(indexWriter, taxoWriter); + + // test the multi iterator + CategoryListCache clCache = null; + if (random.nextBoolean()) { + clCache = new CategoryListCache(); + } + + DirectoryReader indexReader = DirectoryReader.open(indexDir); + TaxonomyReader taxoReader = new DirectoryTaxonomyReader(taxoDir); + CategoryListIterator[] iterators = new CategoryListIterator[numDimensions]; + for (int i = 0; i < iterators.length; i++) { + CategoryListParams clp = indexingParams.getCategoryListParams(new CategoryPath(dimensions[i])); + IntDecoder decoder = clp.createEncoder().createMatchingDecoder(); + if (clCache != null && random.nextBoolean()) { + clCache.loadAndRegister(clp, indexReader, taxoReader, indexingParams); + iterators[i] = clCache.get(clp).iterator(0); // no partitions + } else { + iterators[i] = new PayloadCategoryListIteraor(indexReader, clp.getTerm(), decoder); + } + } + MultiCategoryListIterator cli = new MultiCategoryListIterator(iterators); + assertTrue("failed to init multi-iterator", cli.init()); + IntsRef ordinals = new IntsRef(); + int maxDoc = indexReader.maxDoc(); + for (int i = 0; i < maxDoc; i++) { + cli.getOrdinals(i, ordinals); + assertTrue("document " + i + " does not have categories", ordinals.length > 0); + for (int j = 0; j < ordinals.length; j++) { + CategoryPath cp = taxoReader.getPath(ordinals.ints[j]); + assertNotNull("ordinal " + ordinals.ints[j] + " not found in taxonomy", cp); + if (cp.length == 2) { + assertEquals("invalid category for document " + i, i, Integer.parseInt(cp.components[1])); + } + } + } + + IOUtils.close(indexReader, taxoReader); + IOUtils.close(indexDir, taxoDir); + } + +} \ No newline at end of file diff --git a/lucene/facet/src/test/org/apache/lucene/facet/search/params/MultiIteratorsPerCLParamsTest.java b/lucene/facet/src/test/org/apache/lucene/facet/search/params/MultiIteratorsPerCLParamsTest.java deleted file mode 100644 index a6d169b3284..00000000000 --- 
a/lucene/facet/src/test/org/apache/lucene/facet/search/params/MultiIteratorsPerCLParamsTest.java +++ /dev/null @@ -1,271 +0,0 @@ -package org.apache.lucene.facet.search.params; - -import java.io.IOException; -import java.util.ArrayList; -import java.util.Arrays; -import java.util.List; - -import org.apache.lucene.analysis.MockAnalyzer; -import org.apache.lucene.analysis.MockTokenizer; -import org.apache.lucene.document.Document; -import org.apache.lucene.facet.index.FacetFields; -import org.apache.lucene.facet.index.params.CategoryListParams; -import org.apache.lucene.facet.index.params.FacetIndexingParams; -import org.apache.lucene.facet.search.CategoryListIterator; -import org.apache.lucene.facet.search.FacetArrays; -import org.apache.lucene.facet.search.FacetResultsHandler; -import org.apache.lucene.facet.search.FacetsAccumulator; -import org.apache.lucene.facet.search.ScoredDocIDs; -import org.apache.lucene.facet.search.StandardFacetsAccumulator; -import org.apache.lucene.facet.search.TopKFacetResultsHandler; -import org.apache.lucene.facet.search.cache.CategoryListCache; -import org.apache.lucene.facet.search.results.FacetResult; -import org.apache.lucene.facet.search.results.FacetResultNode; -import org.apache.lucene.facet.search.results.IntermediateFacetResult; -import org.apache.lucene.facet.taxonomy.CategoryPath; -import org.apache.lucene.facet.taxonomy.TaxonomyReader; -import org.apache.lucene.facet.taxonomy.TaxonomyWriter; -import org.apache.lucene.facet.taxonomy.directory.DirectoryTaxonomyReader; -import org.apache.lucene.facet.taxonomy.directory.DirectoryTaxonomyWriter; -import org.apache.lucene.facet.util.ScoredDocIdsUtils; -import org.apache.lucene.index.DirectoryReader; -import org.apache.lucene.index.IndexReader; -import org.apache.lucene.index.RandomIndexWriter; -import org.apache.lucene.store.Directory; -import org.apache.lucene.util.LuceneTestCase; -import org.junit.Test; - -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -/** - * Test faceted search with creation of multiple category list iterators by the - * same CLP, depending on the provided facet request - */ -public class MultiIteratorsPerCLParamsTest extends LuceneTestCase { - - CategoryPath[][] perDocCategories = new CategoryPath[][] { - { new CategoryPath("author", "Mark Twain"), - new CategoryPath("date", "2010") }, - { new CategoryPath("author", "Robert Frost"), - new CategoryPath("date", "2009") }, - { new CategoryPath("author", "Artur Miller"), - new CategoryPath("date", "2010") }, - { new CategoryPath("author", "Edgar Allan Poe"), - new CategoryPath("date", "2009") }, - { new CategoryPath("author", "Henry James"), - new CategoryPath("date", "2010") } }; - - String countForbiddenDimension; - - @Test - public void testCLParamMultiIteratorsByRequest() throws Exception { - doTestCLParamMultiIteratorsByRequest(false); - } - - @Test - public void testCLParamMultiIteratorsByRequestCacheCLI() throws Exception { - doTestCLParamMultiIteratorsByRequest(true); - } - - private void doTestCLParamMultiIteratorsByRequest(boolean cacheCLI) throws Exception { - // Create a CLP which generates different CLIs according to the - // FacetRequest's dimension - CategoryListParams clp = new CategoryListParams(); - FacetIndexingParams iParams = new FacetIndexingParams(clp); - Directory indexDir = newDirectory(); - Directory taxoDir = newDirectory(); - populateIndex(iParams, indexDir, taxoDir); - - TaxonomyReader taxo = new DirectoryTaxonomyReader(taxoDir); - IndexReader reader = DirectoryReader.open(indexDir); - - CategoryListCache clCache = null; - if (cacheCLI) { - // caching the iterator, so: - // 1: create the cached iterator, using original params - clCache = new CategoryListCache(); - clCache.loadAndRegister(clp, reader, taxo, iParams); - } - - ScoredDocIDs allDocs = ScoredDocIdsUtils - .createAllDocsScoredDocIDs(reader); - - // Search index with 'author' should filter ONLY ordinals whose parent - // is 'author' - countForbiddenDimension = "date"; - validateFacetedSearch(iParams, taxo, reader, clCache, allDocs, "author", 5, 5); - - // Search index with 'date' should filter ONLY ordinals whose parent is - // 'date' - countForbiddenDimension = "author"; - validateFacetedSearch(iParams, taxo, reader, clCache, allDocs, "date", 5, 2); - - // Search index with both 'date' and 'author' - countForbiddenDimension = null; - validateFacetedSearch(iParams, taxo, reader, clCache, allDocs, new String[] { - "author", "date" }, new int[] { 5, 5 }, new int[] { 5, 2 }); - taxo.close(); - reader.close(); - indexDir.close(); - taxoDir.close(); - } - - private void validateFacetedSearch(FacetIndexingParams iParams, - TaxonomyReader taxo, IndexReader reader, CategoryListCache clCache, - ScoredDocIDs allDocs, String dimension, int expectedValue, int expectedNumDescendants) throws IOException { - validateFacetedSearch(iParams, taxo, reader, clCache, allDocs, - new String[] { dimension }, new int[] { expectedValue }, - new int[] { expectedNumDescendants }); - } - - private void validateFacetedSearch(FacetIndexingParams iParams, - TaxonomyReader taxo, IndexReader reader, final CategoryListCache clCache, - ScoredDocIDs allDocs, String[] dimension, int[] expectedValue, - int[] expectedNumDescendants) throws IOException { - List<FacetRequest> facetRequests = new ArrayList<FacetRequest>(); - for (String dim : dimension) { - facetRequests.add(new PerDimCountFacetRequest(new CategoryPath(dim), 10)); - } - FacetSearchParams sParams = new FacetSearchParams(facetRequests, iParams) { - @Override - public CategoryListCache getCategoryListCache() { - return clCache; - } - }; - FacetsAccumulator acc = new StandardFacetsAccumulator(sParams, reader, taxo); - - // no use testing this with complement since in that mode all facets are taken - acc.setComplementThreshold(FacetsAccumulator.DISABLE_COMPLEMENT); - - List<FacetResult> results = acc.accumulate(allDocs); - assertEquals("Wrong #results", dimension.length, results.size()); - - for (int i = 0; i < results.size(); i++) { - FacetResult res = results.get(i); - assertEquals("wrong num-descendants for dimension " + dimension[i], - expectedNumDescendants[i], res.getNumValidDescendants()); - FacetResultNode resNode = res.getFacetResultNode(); - assertEquals("wrong value for dimension " + dimension[i], - expectedValue[i], (int) resNode.getValue()); - } - } - - private void populateIndex(FacetIndexingParams iParams, Directory indexDir, - Directory taxoDir) throws Exception { - RandomIndexWriter writer = new RandomIndexWriter(random(), indexDir, - newIndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(random(), MockTokenizer.KEYWORD, false))); - TaxonomyWriter taxoWriter = new DirectoryTaxonomyWriter(taxoDir); - - FacetFields facetFields = new FacetFields(taxoWriter, iParams); - for (CategoryPath[] categories : perDocCategories) { - Document doc = new Document(); - facetFields.addFields(doc, Arrays.asList(categories)); - writer.addDocument(doc); - } - taxoWriter.commit(); - writer.commit(); - taxoWriter.close(); - writer.close(); - } - - private class PerDimCountFacetRequest extends CountFacetRequest { - - public PerDimCountFacetRequest(CategoryPath path, int num) { - super(path, num); - } - - @Override - public CategoryListIterator createCategoryListIterator(IndexReader reader, - TaxonomyReader taxo, FacetSearchParams sParams, int partition) throws IOException { - // categories of a certain dimension only - return new PerDimensionCLI(taxo, super.createCategoryListIterator( - reader, taxo, sParams, partition), getCategoryPath()); - } - - @Override - /** Override this method just for verifying that only specified facets are iterated. 
*/ - public FacetResultsHandler createFacetResultsHandler( - TaxonomyReader taxonomyReader) { - return new TopKFacetResultsHandler(taxonomyReader, this) { - @Override - public IntermediateFacetResult fetchPartitionResult( - FacetArrays facetArrays, int offset) throws IOException { - final IntermediateFacetResult res = super.fetchPartitionResult(facetArrays, offset); - if (countForbiddenDimension!=null) { - int ord = taxonomyReader.getOrdinal(new CategoryPath(countForbiddenDimension)); - assertEquals("Should not have accumulated for dimension '"+countForbiddenDimension+"'!",0,facetArrays.getIntArray()[ord]); - } - return res; - } - }; - } - } - - /** - * a CLI which filters another CLI for the dimension of the provided - * category-path - */ - private static class PerDimensionCLI implements CategoryListIterator { - private final CategoryListIterator superCLI; - private final int[] parentArray; - private final int parentOrdinal; - - PerDimensionCLI(TaxonomyReader taxo, CategoryListIterator superCLI, - CategoryPath requestedPath) throws IOException { - this.superCLI = superCLI; - if (requestedPath == null) { - parentOrdinal = 0; - } else { - CategoryPath cp = new CategoryPath(requestedPath.components[0]); - parentOrdinal = taxo.getOrdinal(cp); - } - parentArray = taxo.getParallelTaxonomyArrays().parents(); - } - - @Override - public boolean init() throws IOException { - return superCLI.init(); - } - - @Override - public long nextCategory() throws IOException { - long next; - while ((next = superCLI.nextCategory()) <= Integer.MAX_VALUE - && !isInDimension((int) next)) { - } - - return next; - } - - /** look for original parent ordinal, meaning same dimension */ - private boolean isInDimension(int ordinal) { - while (ordinal > 0) { - if (ordinal == parentOrdinal) { - return true; - } - ordinal = parentArray[ordinal]; - } - return false; - } - - @Override - public boolean skipTo(int docId) throws IOException { - return superCLI.skipTo(docId); - } - } -} \ No newline at end of file diff --git a/lucene/facet/src/test/org/apache/lucene/util/Vint8Test.java b/lucene/facet/src/test/org/apache/lucene/util/Vint8Test.java deleted file mode 100644 index 82141860552..00000000000 --- a/lucene/facet/src/test/org/apache/lucene/util/Vint8Test.java +++ /dev/null @@ -1,141 +0,0 @@ -package org.apache.lucene.util; - -import java.io.ByteArrayInputStream; -import java.io.ByteArrayOutputStream; -import java.io.IOException; - -import org.junit.Test; - -import org.apache.lucene.util.LuceneTestCase; -import org.apache.lucene.util.Vint8; - -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -/** - * Tests the {@link Vint8} class. - */ -public class Vint8Test extends LuceneTestCase { - - /** - * Tests the position wrapper. - * @throws Exception For any reason. 
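The PerDimensionCLI above decides dimension membership by walking the taxonomy's parent array; isolated, the check looks like this (a sketch; parents comes from taxo.getParallelTaxonomyArrays().parents(), and ordinal 0 is the taxonomy root):

    // Does `ordinal` sit under `dimensionOrdinal`? Follow parent links until we
    // hit the requested dimension's ordinal or fall off at the root (0).
    static boolean underDimension(int ordinal, int dimensionOrdinal, int[] parents) {
      while (ordinal > 0) {
        if (ordinal == dimensionOrdinal) {
          return true;
        }
        ordinal = parents[ordinal];
      }
      return false;
    }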
- */ - @Test - public void testPosition() throws Exception { - Vint8.Position pos = new Vint8.Position(); - assertEquals(0, pos.pos); - pos = new Vint8.Position(12345); - assertEquals(12345, pos.pos); - } - - private static int[] testValues = { - -1000000000, - -1, 0, (1 << 7) - 1, 1 << 7, (1 << 14) - 1, 1 << 14, - (1 << 21) - 1, 1 << 21, (1 << 28) - 1, 1 << 28 - }; - private static int[] bytesNeededTestValues = { - 5, 5, 1, 1, 2, 2, 3, 3, 4, 4, 5 - }; - - /** - * Tests the {@code bytesNeeded} method. - */ - @Test - public void testBytesNeeded() { - assertEquals(5, Vint8.MAXIMUM_BYTES_NEEDED); - for (int j = 0; j < testValues.length; j++) { - assertEquals(bytesNeededTestValues[j], Vint8.bytesNeeded(testValues[j])); - } - } - - /** - * Tests encoding and decoding to and from a stream. - */ - @Test - public void testStreamEncodingAndDecoding() throws IOException { - ByteArrayOutputStream baos = new ByteArrayOutputStream(256); - int expectedSize = 0; - for (int j = 0; j < testValues.length; j++) { - Vint8.encode(testValues[j], baos); - expectedSize += bytesNeededTestValues[j]; - } - assertEquals(expectedSize, baos.size()); - ByteArrayInputStream bais = new ByteArrayInputStream(baos.toByteArray()); - for (int j = 0; j < testValues.length; j++) { - assertEquals(testValues[j], Vint8.decode(bais)); - } - assertEquals(0, bais.available()); - } - - /** - * Tests encoding and decoding to and from an array. - */ - @Test - public void testArrayEncodingAndDecoding() throws IOException { - byte[] byteArray = new byte[256]; - int position = 0, expectedSize = 0; - for (int j = 0; j < testValues.length; j++) { - position += Vint8.encode(testValues[j], byteArray, position); - expectedSize += bytesNeededTestValues[j]; - } - assertEquals(expectedSize, position); - Vint8.Position pos = new Vint8.Position(); - for (int j = 0; j < testValues.length; j++) { - assertEquals(testValues[j], Vint8.decode(byteArray, pos)); - } - assertEquals(expectedSize, pos.pos); - } - - /** - * The result of encoding the test values with the current algorithm. If these - * values are changed to match an algorithm change, compatibility with legacy - * data will be broken. - */ - private static final byte[] encodedTestValues = { - -4, -93, -108, -20, 0, -1, -1, -1, -1, 127, 0, 127, -127, 0, -1, 127, - -127, -128, 0, -1, -1, 127, -127, -128, -128, 0, -1, -1, -1, 127, -127, - -128, -128, -128, 0 - }; - - /** - * Tests algorithm. 
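The encodedTestValues vectors above pin down the legacy layout: seven payload bits per byte, most-significant group first, with the 0x80 continuation flag on every byte but the last; arithmetic shifts make any negative value occupy the full five bytes. A sketch that reproduces those vectors (illustrative, not the shipping Vint8 code):

    // Legacy Vint8 layout, reconstructed from the test vectors above. E.g. 128
    // encodes as 0x81 0x00 and -1 as 0xFF 0xFF 0xFF 0xFF 0x7F; the return value
    // matches Vint8.bytesNeeded(value).
    static int encodeSketch(int value, byte[] dest, int start) {
      int written = 0;
      for (int shift = 28; shift > 0; shift -= 7) {
        if (written > 0 || (value >> shift) != 0) { // arithmetic shift: negatives never reach 0
          dest[start + written++] = (byte) (0x80 | ((value >> shift) & 0x7F));
        }
      }
      dest[start + written++] = (byte) (value & 0x7F);
      return written;
    }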
- */ - @Test - public void testLegacyCompatibility() throws IOException { - /* To generate the encoded test values: - byte[] byteArray = new byte[256]; - int position = 0, expectedSize = 0; - for (int j = 0; j < testValues.length; j++) { - position += Vint8.encode(testValues[j], byteArray, position); - expectedSize += bytesNeededTestValues[j]; - } - assertEquals(expectedSize, position); - Vint8.Position pos = new Vint8.Position(); - for (int j = 0; j < expectedSize; j++) { - System.out.print(byteArray[j] + ", "); - } - System.out.flush(); - pos.pos = 0; - */ - Vint8.Position pos = new Vint8.Position(); - for (int j = 0; j < testValues.length; j++) { - assertEquals(testValues[j], Vint8.decode(encodedTestValues, pos)); - } - } - -} // end class Vint8Test diff --git a/lucene/facet/src/test/org/apache/lucene/util/encoding/EncodingSpeed.java b/lucene/facet/src/test/org/apache/lucene/util/encoding/EncodingSpeed.java index 96dcea5a5ba..7b58ff4d0d7 100644 --- a/lucene/facet/src/test/org/apache/lucene/util/encoding/EncodingSpeed.java +++ b/lucene/facet/src/test/org/apache/lucene/util/encoding/EncodingSpeed.java @@ -1,21 +1,12 @@ package org.apache.lucene.util.encoding; -import java.io.ByteArrayInputStream; -import java.io.ByteArrayOutputStream; import java.io.IOException; import java.text.NumberFormat; import java.util.Arrays; import java.util.Locale; -import org.apache.lucene.util.encoding.DGapIntEncoder; -import org.apache.lucene.util.encoding.EightFlagsIntEncoder; -import org.apache.lucene.util.encoding.FourFlagsIntEncoder; -import org.apache.lucene.util.encoding.IntDecoder; -import org.apache.lucene.util.encoding.IntEncoder; -import org.apache.lucene.util.encoding.NOnesIntEncoder; -import org.apache.lucene.util.encoding.SortingIntEncoder; -import org.apache.lucene.util.encoding.UniqueValuesIntEncoder; -import org.apache.lucene.util.encoding.VInt8IntEncoder; +import org.apache.lucene.util.BytesRef; +import org.apache.lucene.util.IntsRef; /* * Licensed to the Apache Software Foundation (ASF) under one or more @@ -40,8 +31,8 @@ public class EncodingSpeed { private static int[] data9910 = null; private static int[] data501871 = null; private static int[] data10k = null; - private static String resultsFormat = "%-20s %10s %20d %26s %20d %26s"; - private static String headerFormat = "%-20s %10s %20s %26s %20s %26s"; + private static String resultsFormat = "%-60s %10s %20d %26s %20d %26s"; + private static String headerFormat = "%-60s %10s %20s %26s %20s %26s"; private static int integers = 100000000; private static NumberFormat nf; @@ -53,8 +44,14 @@ public class EncodingSpeed { testFacetIDs(data501871, 501871); } - private static void testFacetIDs(int[] facetIDs, int docID) - throws IOException { + private static IntsRef newIntsRef(int[] data) { + IntsRef res = new IntsRef(data.length); + System.arraycopy(data, 0, res.ints, 0, data.length); + res.length = data.length; + return res; + } + + private static void testFacetIDs(int[] facetIDs, int docID) throws IOException { int loopFactor = integers / facetIDs.length; System.out .println("\nEstimating ~" @@ -88,68 +85,53 @@ public class EncodingSpeed { System.out.println(); } - private static void encoderTest(IntEncoder encoder, int[] data, - int loopFactor) throws IOException { + private static void encoderTest(IntEncoder encoder, int[] values, int loopFactor) throws IOException { - long startTime, endTime; - ByteArrayOutputStream baos = new ByteArrayOutputStream(); + BytesRef bytes = new BytesRef(values.length); // at least one byte per value // -- 
Looping 100 times as a warm up -------------------------- for (int i = 100; i != 0; --i) { - baos.reset(); - encoder.reInit(baos); - for (int value : data) { - encoder.encode(value); - } - encoder.close(); + IntsRef data = newIntsRef(values); + encoder.encode(data, bytes); } // ----------------------------------------------------------- - startTime = System.currentTimeMillis(); + long encodeTime = 0; for (int factor = loopFactor; factor > 0; --factor) { - baos.reset(); - encoder.reInit(baos); - for (int value : data) { - encoder.encode(value); - } - encoder.close(); + IntsRef data = newIntsRef(values); + long start = System.currentTimeMillis(); + encoder.encode(data, bytes); + encodeTime += System.currentTimeMillis() - start; } - endTime = System.currentTimeMillis(); - long encodeTime = endTime - startTime; - - ByteArrayInputStream bais = new ByteArrayInputStream(baos.toByteArray()); + IntsRef decoded = new IntsRef(values.length); + int encodedSize = bytes.length; IntDecoder decoder = encoder.createMatchingDecoder(); - decoder.reInit(bais); // -- Looping 100 times as a warm up -------------------------- for (int i = 100; i != 0; --i) { - bais.mark(baos.size()); - while (decoder.decode() != IntDecoder.EOS) { - } - bais.reset(); - decoder.reInit(bais); + decoder.decode(bytes, decoded); } // ----------------------------------------------------------- - decoder.reInit(bais); - startTime = System.currentTimeMillis(); + long decodeTime = 0; for (int i = loopFactor; i > 0; --i) { - bais.mark(baos.size()); - while (decoder.decode() != IntDecoder.EOS) { - } - bais.reset(); - decoder.reInit(bais); + long start = System.currentTimeMillis(); + decoder.decode(bytes, decoded); + decodeTime += System.currentTimeMillis() - start; + } + + if (decoded.length != values.length) { + throw new RuntimeException("wrong num values. 
expected=" + values.length + " actual=" + decoded.length + + " decoder=" + decoder); } - endTime = System.currentTimeMillis(); - long decodeTime = endTime - startTime; - - System.out.println(String.format(Locale.ROOT, resultsFormat, encoder, nf.format(baos - .size() - * 8.0 / data.length), encodeTime, nf.format(encodeTime - * 1000000.0 / (loopFactor * data.length)), decodeTime, nf - .format(decodeTime * 1000000.0 / (loopFactor * data.length)))); + System.out.println(String.format(Locale.ROOT, resultsFormat, encoder, + nf.format(encodedSize * 8.0 / values.length), + encodeTime, + nf.format(encodeTime * 1000000.0 / (loopFactor * values.length)), + decodeTime, + nf.format(decodeTime * 1000000.0 / (loopFactor * values.length)))); } static { diff --git a/lucene/facet/src/test/org/apache/lucene/util/encoding/EncodingTest.java b/lucene/facet/src/test/org/apache/lucene/util/encoding/EncodingTest.java index 73e44fa744a..57689d5731c 100644 --- a/lucene/facet/src/test/org/apache/lucene/util/encoding/EncodingTest.java +++ b/lucene/facet/src/test/org/apache/lucene/util/encoding/EncodingTest.java @@ -1,14 +1,13 @@ package org.apache.lucene.util.encoding; -import java.io.ByteArrayInputStream; -import java.io.ByteArrayOutputStream; import java.io.IOException; -import java.util.HashSet; -import java.util.TreeSet; - -import org.junit.Test; +import java.util.Arrays; +import org.apache.lucene.util.BytesRef; +import org.apache.lucene.util.IntsRef; import org.apache.lucene.util.LuceneTestCase; +import org.junit.BeforeClass; +import org.junit.Test; /* * Licensed to the Apache Software Foundation (ASF) under one or more @@ -29,65 +28,33 @@ import org.apache.lucene.util.LuceneTestCase; public class EncodingTest extends LuceneTestCase { - static int[] data = null; - - private static TreeSet dataSet = new TreeSet(); - static { - setData(); - } - - @Test - public void testVInt8() throws Exception { - encoderTest(new VInt8IntEncoder()); + private static IntsRef uniqueSortedData, data; + + @BeforeClass + public static void beforeClassEncodingTest() throws Exception { + int capacity = atLeast(10000); + data = new IntsRef(capacity); + for (int i = 0; i < 10; i++) { + data.ints[i] = i + 1; // small values + } + for (int i = 10; i < data.ints.length; i++) { + data.ints[i] = random().nextInt(Integer.MAX_VALUE - 1) + 1; // some encoders don't allow 0 + } + data.length = data.ints.length; - // cover negative numbers; - IntEncoder enc = new VInt8IntEncoder(); - ByteArrayOutputStream baos = new ByteArrayOutputStream(); - enc.reInit(baos); - enc.encode(-1); - - IntDecoder dec = enc.createMatchingDecoder(); - dec.reInit(new ByteArrayInputStream(baos.toByteArray())); - assertEquals(-1, dec.decode()); + uniqueSortedData = IntsRef.deepCopyOf(data); + Arrays.sort(uniqueSortedData.ints); + uniqueSortedData.length = 0; + int prev = -1; + for (int i = 0; i < uniqueSortedData.ints.length; i++) { + if (uniqueSortedData.ints[i] != prev) { + uniqueSortedData.ints[uniqueSortedData.length++] = uniqueSortedData.ints[i]; + prev = uniqueSortedData.ints[i]; + } + } } - @Test - public void testSimpleInt() { - encoderTest(new SimpleIntEncoder()); - } - - @Test - public void testSortingUniqueValues() { - encoderTest(new SortingIntEncoder(new UniqueValuesIntEncoder(new VInt8IntEncoder()))); - } - - @Test - public void testSortingUniqueDGap() { - encoderTest(new SortingIntEncoder(new UniqueValuesIntEncoder(new DGapIntEncoder(new VInt8IntEncoder())))); - } - - @Test - public void testSortingUniqueDGapEightFlags() { - encoderTest(new 
SortingIntEncoder(new UniqueValuesIntEncoder(new DGapIntEncoder(new EightFlagsIntEncoder())))); - } - - @Test - public void testSortingUniqueDGapFourFlags() { - encoderTest(new SortingIntEncoder(new UniqueValuesIntEncoder(new DGapIntEncoder(new FourFlagsIntEncoder())))); - } - - @Test - public void testSortingUniqueDGapNOnes4() { - encoderTest(new SortingIntEncoder(new UniqueValuesIntEncoder(new DGapIntEncoder(new NOnesIntEncoder(4))))); - } - - @Test - public void testSortingUniqueDGapNOnes3() { - encoderTest(new SortingIntEncoder(new UniqueValuesIntEncoder(new DGapIntEncoder(new NOnesIntEncoder(3))))); - } - - private static void encoderTest(IntEncoder encoder) { - + private static void encoderTest(IntEncoder encoder, IntsRef data, IntsRef expected) throws IOException { // ensure toString is implemented String toString = encoder.toString(); assertFalse(toString.startsWith(encoder.getClass().getName() + "@")); @@ -95,320 +62,90 @@ public class EncodingTest extends LuceneTestCase { toString = decoder.toString(); assertFalse(toString.startsWith(decoder.getClass().getName() + "@")); - ByteArrayOutputStream baos = new ByteArrayOutputStream(); - - try { - encoding(encoder, baos); - decoding(baos, encoder.createMatchingDecoder()); - } catch (Exception e) { - e.printStackTrace(); - fail(e.getMessage()); - } - - baos.reset(); - - try { - encoding(encoder, baos); - decoding(baos, encoder.createMatchingDecoder()); - } catch (Exception e) { - e.printStackTrace(); - fail(e.getMessage()); - } + BytesRef bytes = new BytesRef(100); // some initial capacity - encoders should grow the byte[] + IntsRef values = new IntsRef(100); // some initial capacity - decoders should grow the int[] + encoding(encoder, data, bytes); + decoding(bytes, values, encoder.createMatchingDecoder()); + assertTrue(expected.intsEquals(values)); } - private static void encoding(IntEncoder encoder, ByteArrayOutputStream baos) throws IOException { - encoder.reInit(baos); - for (int value : data) { - encoder.encode(value); + private static void encoding(IntEncoder encoder, IntsRef data, BytesRef bytes) throws IOException { + final IntsRef values; + if (random().nextBoolean()) { // randomly set the offset + values = new IntsRef(data.length + 1); + System.arraycopy(data.ints, 0, values.ints, 1, data.length); + values.offset = 1; // ints start at index 1 + values.length = data.length; + } else { + // need to copy the array because it may be modified by encoders (e.g. 
sorting) + values = IntsRef.deepCopyOf(data); } - encoder.close(); - - baos.reset(); - encoder.reInit(baos); - for (int value : data) { - encoder.encode(value); - } - encoder.close(); + encoder.encode(values, bytes); } - private static void decoding(ByteArrayOutputStream baos, IntDecoder decoder) - throws IOException { - ByteArrayInputStream bais = new ByteArrayInputStream(baos.toByteArray()); - decoder.reInit(bais); - - HashSet set = new HashSet(); - long value = 0; - while ((value = decoder.decode()) != IntDecoder.EOS) { - set.add(value); + private static void decoding(BytesRef bytes, IntsRef values, IntDecoder decoder) throws IOException { + int offset = 0; + if (random().nextBoolean()) { // randomly set the offset and length to other than 0,0 + bytes.grow(bytes.length + 1); // ensure that we have enough capacity to shift values by 1 + bytes.offset = 1; // bytes start at index 1 (must do that after grow) + System.arraycopy(bytes.bytes, 0, bytes.bytes, 1, bytes.length); + offset = 1; } - assertEquals(dataSet.size(), set.size()); - assertTrue(set.equals(dataSet)); - - set.clear(); - bais.reset(); - decoder.reInit(bais); - value = 0; - while ((value = decoder.decode()) != IntDecoder.EOS) { - set.add(value); - } - assertEquals(dataSet.size(), set.size()); - assertTrue(set.equals(dataSet)); - + decoder.decode(bytes, values); + assertEquals(offset, bytes.offset); // decoders should not mess with offsets } - private static void setData() { - data = new int[] { 2, 4, 86133, 11, 16505, 86134, 86135, 86136, 1290, - 86137, 86138, 32473, 19346, 32474, 4922, 32475, 86139, 16914, - 86140, 86141, 86142, 86143, 32478, 86144, 86145, 32480, 4884, - 4887, 32481, 86146, 16572, 86147, 16295, 165, 86148, 3183, - 21920, 21921, 21922, 555, 4006, 32484, 21925, 21926, 13775, - 86149, 13777, 85833, 85834, 13779, 13773, 13780, 75266, 17674, - 13784, 13785, 13786, 13787, 13788, 6258, 86150, 13790, 75267, - 13793, 13794, 13795, 312, 4914, 4915, 6222, 86151, 4845, 4883, - 4918, 4894, 4919, 86152, 4921, 6223, 6224, 6225, 6226, 67909, - 6229, 18170, 6230, 5198, 25625, 6231, 6232, 6233, 1808, 6234, - 6235, 6236, 41376, 6238, 6239, 67911, 6240, 86153, 6243, 6244, - 83549, 6246, 6247, 6248, 6249, 782, 444, 6251, 6250, 19863, - 28963, 310, 2234, 144, 2236, 2309, 69437, 2311, 2325, 2241, - 69438, 69439, 2244, 2245, 2246, 23504, 2314, 69440, 36603, - 2250, 2268, 2271, 2251, 2254, 2255, 2257, 2240, 36604, 84726, - 36605, 84727, 2262, 2263, 18431, 38853, 2317, 2149, 2326, 2327, - 2329, 3980, 2275, 2277, 2258, 84728, 2260, 84729, 84730, 13766, - 36607, 2282, 2283, 84731, 2284, 2286, 2287, 2337, 7424, 2288, - 2338, 3522, 2290, 84733, 32902, 371, 37708, 2096, 3065, 3066, - 375, 377, 374, 378, 2100, 86154, 381, 382, 58795, 379, 383, - 384, 385, 4449, 387, 388, 389, 390, 9052, 391, 18358, 2107, - 394, 2111, 2108, 393, 2109, 395, 86155, 86156, 397, 2113, 398, - 399, 400, 273, 274, 275, 40980, 276, 277, 31716, 279, 280, - 31717, 281, 282, 1628, 1623, 1624, 1625, 2052, 1626, 725, 727, - 728, 729, 730, 731, 1633, 733, 734, 735, 86157, 737, 738, 739, - 1634, 3563, 3564, 3565, 1667, 12461, 76276, 3567, 5413, 77622, - 5415, 5416, 5417, 5418, 107, 86158, 7784, 15363, 153, 3723, - 2713, 7786, 3835, 7787, 86159, 7789, 7791, 7792, 7794, 86160, - 7796, 86161, 6708, 7798, 7799, 7800, 7801, 7802, 7803, 1665, - 43150, 15365, 1581, 5656, 43152, 80258, 7450, 39922, 86162, - 51587, 9059, 4606, 396, 86163, 86164, 7250, 401, 403, 2860, - 33281, 2964, 408, 9119, 409, 86165, 7669, 2861, 410, 413, - 86166, 414, 415, 33282, 405, 33283, 7498, 2865, 7230, 
33284, - 2866, 86167, 2867, 47518, 2868, 86168, 2869, 2870, 4712, 7096, - 28484, 6913, 6914, 6915, 6916, 37169, 37170, 7103, 28269, 6919, - 86169, 45431, 6922, 7104, 6923, 7108, 6924, 6925, 6926, 6927, - 6928, 86170, 86171, 86172, 6930, 6931, 6932, 6934, 6935, 6936, - 451, 6937, 6938, 4756, 3554, 5309, 8145, 3586, 16417, 9767, - 14126, 25854, 6580, 10174, 86173, 5519, 21309, 8561, 20938, - 10386, 86174, 781, 2030, 16419, 30323, 16420, 16421, 16424, - 86175, 86176, 86177, 28871, 86178, 28872, 63980, 6329, 49561, - 4271, 38778, 86179, 86180, 20126, 16245, 193, 195, 196, 197, - 56973, 199, 200, 201, 202, 203, 204, 56974, 56975, 205, 206, - 4662, 207, 208, 209, 210, 211, 212, 47901, 641, 642, 643, 1380, - 1079, 47902, 1381, 1081, 1082, 1083, 47903, 1382, 47904, 1087, - 47905, 965, 966, 1298, 968, 1387, 1300, 50288, 971, 972, 973, - 974, 23974, 22183, 1390, 23313, 1389, 1391, 902, 23029, 296, - 1304, 1395, 1303, 1309, 1308, 50289, 1312, 50290, 50291, 1315, - 1317, 9270, 19796, 3605, 1320, 1321, 44946, 1322, 1323, 50292, - 967, 1587, 1326, 1331, 17482, 633, 29115, 53858, 29118, 29119, - 62624, 44494, 6965, 6966, 6959, 6967, 71562, 6969, 23459, - 23460, 17464, 4225, 23461, 23462, 23463, 5893, 23464, 17467, - 17468, 23465, 12562, 1405, 1406, 1407, 960, 961, 962, 687, 963, - 86181, 86182, 5997, 10812, 11976, 11977, 1850, 577, 13393, - 10810, 13394, 65040, 86183, 3935, 3936, 3937, 710, 86184, 5785, - 5786, 29949, 5787, 5788, 283, 284, 2687, 285, 286, 287, 2689, - 288, 289, 8880, 290, 2690, 13899, 991, 292, 295, 42007, 35616, - 63103, 298, 299, 3520, 297, 9024, 303, 301, 302, 300, 31345, - 3719, 304, 305, 306, 307, 308, 368, 364, 85002, 9026, 63105, - 367, 39596, 25835, 19746, 293, 294, 26505, 85003, 18377, 56785, - 10122, 10123, 10124, 86185, 39863, 86186, 10125, 39865, 4066, - 4067, 24257, 4068, 4070, 86187, 4073, 4074, 86188, 4076, 7538, - 4077, 86189, 4078, 4079, 7540, 7541, 4084, 4085, 7542, 86190, - 4086, 86191, 4087, 4088, 86192, 7545, 44874, 7821, 44875, - 86193, 4286, 86194, 51470, 17609, 1408, 47486, 1411, 1412, - 47487, 1413, 1414, 1417, 1415, 47488, 1416, 1418, 1420, 470, - 1422, 1423, 1424, 5001, 5002, 47489, 1427, 1429, 1430, 31811, - 1432, 1433, 47490, 1435, 3753, 1437, 1439, 1440, 47491, 1443, - 47492, 1446, 5004, 5005, 1450, 47493, 353, 1452, 42145, 3103, - 3402, 3104, 3105, 4780, 3106, 3107, 3108, 12157, 3111, 42146, - 42147, 3114, 4782, 42148, 3116, 3117, 42149, 42150, 3407, 3121, - 3122, 18154, 3126, 3127, 3128, 3410, 3130, 3411, 3412, 3415, - 24241, 3417, 3418, 3449, 42151, 3421, 3422, 7587, 42152, 3424, - 3427, 3428, 3448, 3430, 3432, 42153, 42154, 41648, 1991, 407, - 57234, 411, 2862, 57235, 2863, 18368, 57236, 2874, 7350, 4115, - 2876, 2877, 17975, 86195, 4116, 2881, 2882, 2883, 2886, 463, - 870, 872, 873, 874, 875, 8783, 8784, 877, 1480, 1481, 459, - 2778, 881, 8785, 2779, 8786, 8787, 8788, 886, 887, 8789, 889, - 8790, 86196, 6920, 86197, 5080, 5081, 7395, 7396, 9395, 9396, - 1528, 42737, 805, 86198, 1209, 13595, 4126, 9680, 34368, 9682, - 86199, 86200, 174, 175, 176, 177, 178, 179, 180, 182, 183, - 1477, 31138, 186, 172, 187, 188, 189, 190, 191, 458, 871, - 31294, 31295, 27604, 31296, 31297, 882, 883, 884, 31298, 890, - 1089, 1488, 1489, 1092, 1093, 1094, 1095, 1096, 1097, 1490, - 1098, 1495, 1502, 1099, 1100, 1101, 1493, 2997, 12223, 1103, - 2654, 1498, 1499, 1500, 80615, 80616, 80617, 33359, 86201, - 9294, 1501, 86202, 1506, 1507, 23454, 38802, 38803, 1014, - 86203, 5583, 5584, 651, 74717, 5586, 5587, 5588, 5589, 74720, - 5590, 38808, 33527, 78330, 10930, 5119, 10931, 
1000, 10928, - 10932, 10933, 10934, 10935, 5863, 10936, 86204, 10938, 10939, - 86205, 192, 194, 38754, 38755, 198, 38756, 38757, 38758, 2842, - 640, 22780, 22781, 1080, 86206, 86207, 1084, 1086, 1088, 63916, - 9412, 970, 9413, 9414, 9415, 9416, 9417, 1310, 7168, 7169, - 1318, 9418, 1324, 39159, 1804, 1557, 24850, 41499, 1560, 41500, - 1562, 1563, 1565, 1927, 1928, 1566, 1569, 1570, 1571, 1572, - 1573, 1574, 1575, 1576, 2674, 2677, 2678, 2679, 2946, 2682, - 2676, 2683, 2947, 1156, 1157, 1158, 1467, 1160, 1468, 1469, - 1161, 1162, 1163, 4369, 1165, 1166, 1167, 12923, 2917, 1169, - 1170, 1171, 1172, 1173, 1174, 1175, 1176, 1177, 18153, 8359, - 1178, 1164, 1191, 1180, 12924, 86208, 86209, 54817, 66962, - 2476, 86210, 86211, 41820, 41821, 41822, 41824, 1130, 1131, - 1132, 32692, 1134, 34848, 1136, 1133, 1137, 1138, 1139, 1140, - 1141, 1143, 1144, 1145, 34849, 2639, 34850, 1146, 1147, 1148, - 34851, 1150, 1151, 1152, 1153, 1154, 1155, 1678, 1679, 1680, - 1681, 40870, 2059, 1685, 1686, 32686, 14970, 1688, 1689, 86212, - 1692, 1682, 1693, 1695, 1696, 1698, 12955, 8909, 41690, 1700, - 41691, 86213, 30949, 41692, 1703, 1704, 1705, 41693, 14976, - 1708, 2071, 1709, 1710, 1711, 1712, 1727, 86214, 86215, 86216, - 1715, 86217, 1714, 1717, 1690, 41697, 86218, 1720, 86219, 2073, - 41699, 1724, 2075, 1726, 1729, 1730, 1732, 2078, 2223, 1735, - 1713, 41700, 1737, 14977, 1739, 1740, 1741, 2080, 1743, 1744, - 1745, 1746, 1747, 1748, 1749, 1750, 1751, 41701, 1752, 1753, - 1909, 86220, 2085, 1754, 19548, 86221, 19551, 5733, 3856, 5190, - 4581, 25145, 86222, 86223, 4846, 86224, 4861, 86225, 86226, - 86227, 25150, 86228, 86229, 13820, 2027, 4898, 4899, 4901, - 2135, 4902, 4868, 4904, 86230, 4905, 25155, 4907, 86231, 4909, - 4910, 4911, 4912, 86232, 6220, 81357, 86233, 2589, 73877, - 29706, 6227, 6228, 86234, 6237, 86235, 6241, 6242, 1812, 13808, - 13809, 70908, 2293, 2294, 86236, 2295, 2296, 2297, 22947, - 16511, 2299, 2300, 2301, 13097, 73079, 86237, 13099, 50121, - 86238, 86239, 13101, 86240, 2424, 4725, 4726, 4727, 4728, 4729, - 4730, 86241, 26881, 10944, 4734, 4735, 4736, 26239, 26240, - 71408, 86242, 57401, 71410, 26244, 5344, 26245, 86243, 4102, - 71414, 11091, 6736, 86244, 6737, 6738, 38152, 6740, 6741, 6742, - 6298, 6743, 6745, 6746, 20867, 6749, 20616, 86245, 9801, 65297, - 20617, 65298, 20619, 5629, 65299, 20621, 20622, 8385, 20623, - 20624, 5191, 20625, 20626, 442, 443, 445, 27837, 77681, 86246, - 27839, 86247, 86248, 41435, 66511, 2478, 2479, 2480, 2481, - 2482, 2483, 2484, 2485, 2486, 2487, 2488, 2489, 2490, 2494, - 2493, 33025, 12084, 2542, 2497, 2499, 2501, 2503, 2504, 2505, - 33026, 2506, 2507, 2508, 2509, 2511, 1787, 12080, 2513, 2514, - 3988, 3176, 3989, 2518, 2521, 9285, 2522, 2524, 2525, 3990, - 2527, 2528, 27499, 2529, 2530, 3991, 2532, 2534, 2535, 18038, - 2536, 2538, 2495, 46077, 61493, 61494, 1006, 713, 4971, 4972, - 4973, 4975, 4976, 650, 170, 7549, 7550, 7551, 7552, 7553, - 86249, 7936, 956, 11169, 11170, 1249, 1244, 1245, 1247, 2544, - 1250, 2545, 1252, 2547, 1253, 1254, 2549, 39636, 1259, 1257, - 1258, 39637, 1260, 1261, 2551, 1262, 1263, 848, 86250, 86251, - 854, 74596, 856, 1957, 86252, 855, 1959, 1961, 857, 86253, 851, - 859, 860, 862, 1964, 864, 865, 866, 867, 1965, 1966, 1967, - 1968, 1969, 86254, 1971, 1972, 1973, 1974, 1975, 1976, 1977, - 841, 1954, 842, 2978, 846, 847, 849, 850, 852, 1956, 17452, - 71941, 86255, 86256, 73665, 1471, 13690, 185, 503, 504, 2342, - 505, 506, 4378, 508, 4379, 17313, 510, 511, 512, 520, 513, - 4384, 17314, 514, 515, 46158, 17317, 518, 34269, 519, 
4386, - 523, 524, 525, 46159, 528, 529, 17319, 531, 532, 533, 534, 535, - 7482, 537, 538, 5267, 536, 539, 541, 540, 19858, 17320, 17321, - 906, 907, 908, 17322, 910, 17323, 912, 15850, 913, 4398, 17324, - 86257, 278, 2948, 2949, 2950, 3007, 2951, 2952, 2953, 2954, - 2955, 3013, 35352, 3014, 3015, 2962, 3016, 33505, 39118, 3017, - 3018, 20492, 4000, 3021, 3022, 35353, 39293, 3024, 18443, 3029, - 9467, 20529, 39119, 8380, 2965, 3030, 3043, 22714, 39120, 2956, - 3035, 39121, 3037, 3038, 2688, 86258, 36675, 30894, 24505, - 8888, 13541, 49728, 27660, 9082, 27661, 365, 366, 2232, 76098, - 7233, 1494, 17391, 606, 607, 611, 610, 612, 614, 615, 613, 616, - 9117, 617, 618, 21155, 1789, 619, 620, 7636, 12019, 621, 622, - 1793, 623, 625, 624, 631, 626, 627, 21578, 21103, 628, 21579, - 629, 9122, 9123, 12189, 9289, 3168, 3169, 630, 632, 634, 21580, - 9121, 635, 636, 637, 21581, 12781, 1801, 638, 639, 1559, 24343, - 9419, 9420, 795, 796, 1611, 86259, 1612, 21551, 21552, 3741, - 1617, 3742, 1615, 1619, 1620, 6301, 3744, 1622, 67685, 8521, - 55937, 9025, 27663, 8881, 13581, 86260, 11592, 44720, 86261, - 63231, 50873, 42925, 52332, 86262, 72706, 17705, 17707, 17708, - 3401, 40217, 1248, 40218, 86263, 7098, 86264, 86265, 1264, - 86266, 1266, 1267, 1268, 1269, 86267, 1271, 1272, 1273, 1274, - 2556, 1275, 1276, 1277, 1278, 1279, 1280, 1282, 1283, 22680, - 11889, 86268, 45662, 7038, 86269, 19315, 45663, 45664, 86270, - 5855, 34002, 49245, 10447, 5663, 86271, 15429, 53877, 49249, - 86272, 86273, 86274, 60128, 60453, 60129, 5552, 31923, 43407, - 4287, 17980, 64977, 86275, 86276, 8234, 86277, 3649, 8240, - 1330, 11999, 1332, 27618, 1334, 1335, 340, 3651, 25640, 18165, - 1343, 4618, 1474, 3653, 75921, 1349, 53519, 1779, 45454, 22778, - 40153, 67677, 63826, 45455, 15128, 67678, 67679, 1792, 67680, - 3171, 47816, 45457, 9288, 59891, 67681, 25703, 35731, 35732, - 369, 35713, 35714, 35715, 34652, 35716, 31681, 35717, 12779, - 35718, 35719, 11992, 806, 807, 808, 43499, 43500, 810, 776, - 812, 813, 814, 241, 43501, 43502, 816, 755, 43503, 818, 819, - 820, 43504, 821, 822, 823, 824, 825, 826, 43505, 43506, 43507, - 828, 829, 20083, 43508, 43509, 832, 833, 834, 835, 86278, - 19984, 19985, 86279, 24125, 19986, 86280, 19988, 86281, 5414, - 86282, 85808, 5479, 5420, 5421, 5422, 5423, 63800, 86283, - 86284, 30965, 86285, 416, 1510, 5740, 5741, 81991, 86286, - 28938, 50149, 1003, 55512, 14306, 6960, 688, 86287, 14307, - 5399, 5400, 17783, 24118, 720, 86288, 44913, 24557, 667, 24876, - 6529, 24877, 24878, 24879, 24880, 31847, 20671, 4011, 171, 580, - 86289, 3863, 914, 2202, 916, 917, 918, 919, 921, 922, 923, - 7585, 925, 7586, 926, 927, 928, 7588, 929, 930, 931, 932, 933, - 934, 1875, 1876, 7589, 7590, 1878, 1879, 7591, 7592, 1882, - 1883, 1884, 2212, 7593, 1887, 1888, 1889, 1890, 1891, 1892, - 1893, 1894, 1895, 1896, 1897, 1898, 2217, 1900, 7594, 1902, - 2219, 7595, 1905, 1906, 1907, 3323, 7596, 1911, 1912, 7597, - 1914, 1915, 1916, 2226, 1919, 7598, 2227, 1920, 1921, 7599, - 7600, 4708, 1923, 355, 356, 1549, 358, 32077, 360, 32078, - 21117, 362, 19043, 71677, 5716, 86290, 49790, 86291, 86292, - 86293, 49792, 86294, 86295, 49794, 86296, 86297, 86298, 86299, - 11882, 86300, 49798, 86301, 49800, 49801, 49802, 49803, 453, - 49804, 8591, 6794, 49806, 18989, 49807, 49808, 16308, 49809, - 86302, 86303, 10105, 86304, 5285, 10106, 10107, 6557, 86305, - 23571, 10109, 38883, 10110, 5401, 86306, 67557, 16430, 67558, - 40171, 16433, 25878, 86307, 21762, 23, 86308, 86309, 21766, - 86310, 86311, 5149, 3926, 21768, 21769, 47826, 942, 46985, 
- 6588, 58867, 6589, 6590, 86312, 6592, 6006, 53855, 9565, 359, - 86313, 2845, 876, 879, 27556, 27557, 885, 27558, 888, 2847, - 27559, 2115, 2116, 2117, 53962, 57839, 315, 316, 317, 318, 319, - 86314, 321, 322, 2122, 323, 2123, 324, 325, 328, 326, 327, - 40542, 329, 330, 18079, 18080, 331, 1790, 7382, 332, 7380, - 7236, 23413, 23414, 18924, 18925, 333, 335, 336, 39750, 337, - 86315, 339, 341, 342, 343, 16264, 16265, 6615, 86316, 86317, - 86318, 86319, 16269, 10538, 33226, 86320, 16272, 5824, 16273, - 16274, 16276, 16277, 16278, 16279, 16280, 14517, 1547, 6463, - 3394, 49677, 659, 10380, 30013, 10382, 10378, 10379, 10383, - 10384, 10385, 86321, 4139, 13370, 13371, 86322, 86323, 11878, - 64509, 15141, 15142, 15143, 32737, 14183, 15144, 39101, 42768, - 5645, 32738, 801, 803, 804, 86324, 14707, 86325, 6601, 12402, - 712, 12403, 2936, 1447, 15477, 1410, 44872, 1550, 8614, 15478, - 15479, 15480, 15481, 4811, 3752, 1442, 15482, 8818, 1445, 5006, - 16304, 32277, 16305, 16306, 86326, 16307, 53691, 69305, 809, - 86327, 815, 26724, 69307, 43484, 63904, 86328, 13498, 827, - 86329, 831, 2857, 836, 86330, 86331, 837, 838, 839, 840, 228, - 229, 43722, 230, 231, 43723, 234, 235, 236, 237, 238, 239, - 2745, 2746, 240, 242, 243, 244, 43724, 19788, 246, 247, 21134, - 248, 250, 251, 252, 253, 254, 255, 256, 257, 258, 43725, 43726, - 41, 43727, 262, 43728, 2751, 264, 265, 266, 267, 268, 269, 270, - 271, 272, 1024, 1025, 1026, 1027, 1028, 1029, 1030, 1031, 1032, - 1033, 1034, 43729, 1035, 43730, 1037, 21821, 2926, 14388, - 10432, 14389, 14390, 14391, 14392, 86332, 14394, 14395, 2035, - 2169, 86333, 14397, 14398, 14399, 14400, 52, 14401, 14402, - 7077, 21822, 14405, 14406, 14396, 86334, 17356, 17357, 84679, - 84680, 76383, 17360, 17361, 86335, 38801, 2060, 30850, 12963, - 1684, 1687, 2061, 14978, 1694, 43387, 1697, 1699, 2067, 1701, - 1702, 1706, 43388, 43389, 76325, 1716, 1718, 26832, 1719, 1723, - 2081, 2063, 1728, 39059, 76326, 1731, 86336, 1736, 76327, 1738, - 19657, 6579, 6581, 6582, 6583, 6584, 6585, 29979, 1818, 28239, - 68, 69, 3391, 86337, 10266, 63528, 86338, 10269, 10270, 10271, - 10272, 86339, 86340, 63530, 63531, 63532, 63533, 10273, 63534, - 86341, 10681, 10682, 86342, 9673, 86343, 10683, 460, 461, 462, - 467, 4464, 4466, 3729, 471, 472, 468, 81634, 474, 81635, 475, - 476, 477, 479, 480, 81636, 81637, 482, 17442, 81638, 81639, - 484, 485, 486, 4473, 488, 489, 490, 493, 466, 494, 495, 496, - 497, 499, 500, 501, 502, 34376, 86344, 63836, 56281, 1707, - 20416, 61452, 56282, 1755, 56283, 56284, 18508, 53650, 63444, - 86345, 3579, 63445, 3677, 1979, 1980, 1981, 3132, 3147, 34090, - 1987, 12770, 1329, 80818, 80819, 1988, 23522, 1986, 15880, - 1985, 32975, 1992, 1993, 7165, 3141, 3143, 86346, 1982, 1984, - 3145, 86347, 78064, 55453, 2656, 2657, 35634, 35635, 2167, - 43479, - // ensure there is a representative number for any # of int bytes - 1, 1 << 8 + 1, 1 << 16 + 1, 1 << 24 + 1 }; -// data = new int[]{1, 2, 3, 4}; - for (int value : data) { - dataSet.add(new Long(value)); - } + @Test + public void testVInt8() throws Exception { + encoderTest(new VInt8IntEncoder(), data, data); + + // cover negative numbers; + BytesRef bytes = new BytesRef(5); + IntEncoder enc = new VInt8IntEncoder(); + IntsRef values = new IntsRef(1); + values.ints[values.length++] = -1; + enc.encode(values, bytes); + + IntDecoder dec = enc.createMatchingDecoder(); + values.length = 0; + dec.decode(bytes, values); + assertEquals(1, values.length); + assertEquals(-1, values.ints[0]); + } + + @Test + public void testSimpleInt() throws 
Exception { + encoderTest(new SimpleIntEncoder(), data, data); + } + + @Test + public void testSortingUniqueValues() throws Exception { + encoderTest(new SortingIntEncoder(new UniqueValuesIntEncoder(new VInt8IntEncoder())), data, uniqueSortedData); + } + + @Test + public void testSortingUniqueDGap() throws Exception { + encoderTest(new SortingIntEncoder(new UniqueValuesIntEncoder(new DGapIntEncoder(new VInt8IntEncoder()))), data, uniqueSortedData); + } + + @Test + public void testSortingUniqueDGapEightFlags() throws Exception { + encoderTest(new SortingIntEncoder(new UniqueValuesIntEncoder(new DGapIntEncoder(new EightFlagsIntEncoder()))), data, uniqueSortedData); + } + + @Test + public void testSortingUniqueDGapFourFlags() throws Exception { + encoderTest(new SortingIntEncoder(new UniqueValuesIntEncoder(new DGapIntEncoder(new FourFlagsIntEncoder()))), data, uniqueSortedData); + } + + @Test + public void testSortingUniqueDGapNOnes4() throws Exception { + encoderTest(new SortingIntEncoder(new UniqueValuesIntEncoder(new DGapIntEncoder(new NOnesIntEncoder(4)))), data, uniqueSortedData); + } + + @Test + public void testSortingUniqueDGapNOnes3() throws Exception { + encoderTest(new SortingIntEncoder(new UniqueValuesIntEncoder(new DGapIntEncoder(new NOnesIntEncoder(3)))), data, uniqueSortedData); } } diff --git a/lucene/facet/src/test/org/apache/lucene/util/encoding/Vint8Test.java b/lucene/facet/src/test/org/apache/lucene/util/encoding/Vint8Test.java new file mode 100644 index 00000000000..f0556f3c718 --- /dev/null +++ b/lucene/facet/src/test/org/apache/lucene/util/encoding/Vint8Test.java @@ -0,0 +1,54 @@ +package org.apache.lucene.util.encoding; + +import org.apache.lucene.util.BytesRef; +import org.apache.lucene.util.LuceneTestCase; +import org.junit.Test; + +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** + * Tests the {@link VInt8} class. 
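The rewritten test below drives the new BytesRef-based entry points. The intended round trip, sketched from the calls the test makes (encode appends at bytes.length, decode consumes from bytes.offset; the int return type of decode is inferred from the assertions):

    import org.apache.lucene.util.BytesRef;
    import org.apache.lucene.util.encoding.VInt8;

    class VInt8RoundTripSketch {
      // Encode two values, then decode them back. decode() advances bytes.offset,
      // which is why the test finally asserts offset == length.
      static void roundTrip() {
        BytesRef bytes = new BytesRef(16);
        VInt8.encode(300, bytes); // 2 bytes: 0x82 0x2C
        VInt8.encode(-1, bytes);  // 5 bytes: 0xFF 0xFF 0xFF 0xFF 0x7F
        assert VInt8.decode(bytes) == 300;
        assert VInt8.decode(bytes) == -1;
        assert bytes.offset == bytes.length; // everything consumed
      }
    }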
+ */ +public class Vint8Test extends LuceneTestCase { + + private static final int[] TEST_VALUES = { + -1000000000, + -1, 0, (1 << 7) - 1, 1 << 7, (1 << 14) - 1, 1 << 14, + (1 << 21) - 1, 1 << 21, (1 << 28) - 1, 1 << 28 + }; + private static int[] BYTES_NEEDED_TEST_VALUES = { + 5, 5, 1, 1, 2, 2, 3, 3, 4, 4, 5 + }; + + @Test + public void testBytesRef() throws Exception { + BytesRef bytes = new BytesRef(256); + int expectedSize = 0; + for (int j = 0; j < TEST_VALUES.length; j++) { + VInt8.encode(TEST_VALUES[j], bytes); + expectedSize += BYTES_NEEDED_TEST_VALUES[j]; + } + assertEquals(expectedSize, bytes.length); + + for (int j = 0; j < TEST_VALUES.length; j++) { + assertEquals(TEST_VALUES[j], VInt8.decode(bytes)); + } + assertEquals(bytes.offset, bytes.length); + } + +} diff --git a/lucene/misc/src/java/org/apache/lucene/store/NativeUnixDirectory.java b/lucene/misc/src/java/org/apache/lucene/store/NativeUnixDirectory.java index fe9510757c8..817b5620a89 100644 --- a/lucene/misc/src/java/org/apache/lucene/store/NativeUnixDirectory.java +++ b/lucene/misc/src/java/org/apache/lucene/store/NativeUnixDirectory.java @@ -73,7 +73,7 @@ public class NativeUnixDirectory extends FSDirectory { private final static long ALIGN = 512; private final static long ALIGN_NOT_MASK = ~(ALIGN-1); - /** Default buffer size before writing to disk (256 MB); + /** Default buffer size before writing to disk (256 KB); * larger means less IO load but more RAM and direct * buffer storage space consumed during merging. */ diff --git a/lucene/suggest/src/java/org/apache/lucene/search/suggest/fst/FSTCompletionBuilder.java b/lucene/suggest/src/java/org/apache/lucene/search/suggest/fst/FSTCompletionBuilder.java index 83ab1ce47d2..518c14234f9 100644 --- a/lucene/suggest/src/java/org/apache/lucene/search/suggest/fst/FSTCompletionBuilder.java +++ b/lucene/suggest/src/java/org/apache/lucene/search/suggest/fst/FSTCompletionBuilder.java @@ -237,7 +237,7 @@ public class FSTCompletionBuilder { final Object empty = outputs.getNoOutput(); final Builder builder = new Builder( FST.INPUT_TYPE.BYTE1, 0, 0, true, true, - shareMaxTailLength, outputs, null, false); + shareMaxTailLength, outputs, null, false, true); BytesRef scratch = new BytesRef(); BytesRef entry; diff --git a/lucene/test-framework/src/java/org/apache/lucene/codecs/asserting/AssertingCodec.java b/lucene/test-framework/src/java/org/apache/lucene/codecs/asserting/AssertingCodec.java index 7d775e3613f..6b1c101992f 100644 --- a/lucene/test-framework/src/java/org/apache/lucene/codecs/asserting/AssertingCodec.java +++ b/lucene/test-framework/src/java/org/apache/lucene/codecs/asserting/AssertingCodec.java @@ -19,6 +19,7 @@ package org.apache.lucene.codecs.asserting; import org.apache.lucene.codecs.FilterCodec; import org.apache.lucene.codecs.PostingsFormat; +import org.apache.lucene.codecs.StoredFieldsFormat; import org.apache.lucene.codecs.TermVectorsFormat; import org.apache.lucene.codecs.lucene41.Lucene41Codec; @@ -29,6 +30,7 @@ public final class AssertingCodec extends FilterCodec { private final PostingsFormat postings = new AssertingPostingsFormat(); private final TermVectorsFormat vectors = new AssertingTermVectorsFormat(); + private final StoredFieldsFormat storedFields = new AssertingStoredFieldsFormat(); public AssertingCodec() { super("Asserting", new Lucene41Codec()); @@ -44,4 +46,8 @@ public final class AssertingCodec extends FilterCodec { return vectors; } + @Override + public StoredFieldsFormat storedFieldsFormat() { + return storedFields; + } } diff --git 
a/lucene/test-framework/src/java/org/apache/lucene/codecs/asserting/AssertingStoredFieldsFormat.java b/lucene/test-framework/src/java/org/apache/lucene/codecs/asserting/AssertingStoredFieldsFormat.java new file mode 100644 index 00000000000..6fa8248630f --- /dev/null +++ b/lucene/test-framework/src/java/org/apache/lucene/codecs/asserting/AssertingStoredFieldsFormat.java @@ -0,0 +1,136 @@ +package org.apache.lucene.codecs.asserting; + +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import java.io.IOException; + +import org.apache.lucene.codecs.StoredFieldsFormat; +import org.apache.lucene.codecs.StoredFieldsReader; +import org.apache.lucene.codecs.StoredFieldsWriter; +import org.apache.lucene.codecs.lucene41.Lucene41StoredFieldsFormat; +import org.apache.lucene.index.FieldInfo; +import org.apache.lucene.index.FieldInfos; +import org.apache.lucene.index.SegmentInfo; +import org.apache.lucene.index.StorableField; +import org.apache.lucene.index.StoredFieldVisitor; +import org.apache.lucene.store.Directory; +import org.apache.lucene.store.IOContext; + +/** + * Just like {@link Lucene41StoredFieldsFormat} but with additional asserts. 
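The wrapper below only forwards to Lucene41StoredFieldsFormat while asserting the writer life cycle; the legal call order its state machine encodes is, roughly (w, info and the two fields are placeholders for this sketch):

    // Call sequence AssertingStoredFieldsWriter accepts for one document:
    // startDocument(n) -> exactly n writeField calls -> finishDocument, and a
    // final finish(fieldInfos, numDocs) where numDocs must equal the number of
    // startDocument calls seen.
    void writeOneDoc(StoredFieldsWriter w, FieldInfo info,
                     StorableField f1, StorableField f2) throws IOException {
      w.startDocument(2);     // docStatus -> STARTED, fieldCount = 2
      w.writeField(info, f1); // fieldCount 2 -> 1
      w.writeField(info, f2); // fieldCount 1 -> 0
      w.finishDocument();     // only legal once fieldCount == 0
    }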
+ */ +public class AssertingStoredFieldsFormat extends StoredFieldsFormat { + private final StoredFieldsFormat in = new Lucene41StoredFieldsFormat(); + + @Override + public StoredFieldsReader fieldsReader(Directory directory, SegmentInfo si, FieldInfos fn, IOContext context) throws IOException { + return new AssertingStoredFieldsReader(in.fieldsReader(directory, si, fn, context), si.getDocCount()); + } + + @Override + public StoredFieldsWriter fieldsWriter(Directory directory, SegmentInfo si, IOContext context) throws IOException { + return new AssertingStoredFieldsWriter(in.fieldsWriter(directory, si, context)); + } + + static class AssertingStoredFieldsReader extends StoredFieldsReader { + private final StoredFieldsReader in; + private final int maxDoc; + + AssertingStoredFieldsReader(StoredFieldsReader in, int maxDoc) { + this.in = in; + this.maxDoc = maxDoc; + } + + @Override + public void close() throws IOException { + in.close(); + } + + @Override + public void visitDocument(int n, StoredFieldVisitor visitor) throws IOException { + assert n >= 0 && n < maxDoc; + in.visitDocument(n, visitor); + } + + @Override + public StoredFieldsReader clone() { + return new AssertingStoredFieldsReader(in.clone(), maxDoc); + } + } + + enum Status { + UNDEFINED, STARTED, FINISHED; + } + + static class AssertingStoredFieldsWriter extends StoredFieldsWriter { + private final StoredFieldsWriter in; + private int numWritten; + private int fieldCount; + private Status docStatus; + + AssertingStoredFieldsWriter(StoredFieldsWriter in) { + this.in = in; + this.docStatus = Status.UNDEFINED; + } + + @Override + public void startDocument(int numStoredFields) throws IOException { + assert docStatus != Status.STARTED; + in.startDocument(numStoredFields); + assert fieldCount == 0; + fieldCount = numStoredFields; + numWritten++; + docStatus = Status.STARTED; + } + + @Override + public void finishDocument() throws IOException { + assert docStatus == Status.STARTED; + assert fieldCount == 0; + in.finishDocument(); + docStatus = Status.FINISHED; + } + + @Override + public void writeField(FieldInfo info, StorableField field) throws IOException { + assert docStatus == Status.STARTED; + in.writeField(info, field); + assert fieldCount > 0; + fieldCount--; + } + + @Override + public void abort() { + in.abort(); + } + + @Override + public void finish(FieldInfos fis, int numDocs) throws IOException { + assert docStatus == (numDocs > 0 ? 
Status.FINISHED : Status.UNDEFINED); + in.finish(fis, numDocs); + assert fieldCount == 0; + assert numDocs == numWritten; + } + + @Override + public void close() throws IOException { + in.close(); + assert docStatus != Status.STARTED; + } + } +} diff --git a/lucene/test-framework/src/java/org/apache/lucene/codecs/asserting/AssertingTermVectorsFormat.java b/lucene/test-framework/src/java/org/apache/lucene/codecs/asserting/AssertingTermVectorsFormat.java index eacb4d9995d..257dee17ae2 100644 --- a/lucene/test-framework/src/java/org/apache/lucene/codecs/asserting/AssertingTermVectorsFormat.java +++ b/lucene/test-framework/src/java/org/apache/lucene/codecs/asserting/AssertingTermVectorsFormat.java @@ -18,17 +18,20 @@ package org.apache.lucene.codecs.asserting; */ import java.io.IOException; +import java.util.Comparator; import org.apache.lucene.codecs.TermVectorsFormat; import org.apache.lucene.codecs.TermVectorsReader; import org.apache.lucene.codecs.TermVectorsWriter; import org.apache.lucene.codecs.lucene40.Lucene40TermVectorsFormat; import org.apache.lucene.index.AssertingAtomicReader; +import org.apache.lucene.index.FieldInfo; import org.apache.lucene.index.FieldInfos; import org.apache.lucene.index.Fields; import org.apache.lucene.index.SegmentInfo; import org.apache.lucene.store.Directory; import org.apache.lucene.store.IOContext; +import org.apache.lucene.util.BytesRef; /** * Just like {@link Lucene40TermVectorsFormat} but with additional asserts. @@ -43,16 +46,16 @@ public class AssertingTermVectorsFormat extends TermVectorsFormat { @Override public TermVectorsWriter vectorsWriter(Directory directory, SegmentInfo segmentInfo, IOContext context) throws IOException { - return in.vectorsWriter(directory, segmentInfo, context); + return new AssertingTermVectorsWriter(in.vectorsWriter(directory, segmentInfo, context)); } - + static class AssertingTermVectorsReader extends TermVectorsReader { private final TermVectorsReader in; - + AssertingTermVectorsReader(TermVectorsReader in) { this.in = in; } - + @Override public void close() throws IOException { in.close(); @@ -68,5 +71,120 @@ public class AssertingTermVectorsFormat extends TermVectorsFormat { public TermVectorsReader clone() { return new AssertingTermVectorsReader(in.clone()); } - } + } + + enum Status { + UNDEFINED, STARTED, FINISHED; + } + + static class AssertingTermVectorsWriter extends TermVectorsWriter { + private final TermVectorsWriter in; + private Status docStatus, fieldStatus, termStatus; + private int fieldCount, termCount, positionCount; + boolean hasPositions; + + AssertingTermVectorsWriter(TermVectorsWriter in) { + this.in = in; + docStatus = Status.UNDEFINED; + fieldStatus = Status.UNDEFINED; + termStatus = Status.UNDEFINED; + fieldCount = termCount = positionCount = 0; + } + + @Override + public void startDocument(int numVectorFields) throws IOException { + assert fieldCount == 0; + assert docStatus != Status.STARTED; + in.startDocument(numVectorFields); + docStatus = Status.STARTED; + fieldCount = numVectorFields; + } + + @Override + public void finishDocument() throws IOException { + assert fieldCount == 0; + assert docStatus == Status.STARTED; + in.finishDocument(); + docStatus = Status.FINISHED; + } + + @Override + public void startField(FieldInfo info, int numTerms, boolean positions, + boolean offsets, boolean payloads) throws IOException { + assert termCount == 0; + assert docStatus == Status.STARTED; + assert fieldStatus != Status.STARTED; + in.startField(info, numTerms, positions, offsets, payloads); + 
fieldStatus = Status.STARTED; + termCount = numTerms; + hasPositions = positions || offsets || payloads; + } + + @Override + public void finishField() throws IOException { + assert termCount == 0; + assert fieldStatus == Status.STARTED; + in.finishField(); + fieldStatus = Status.FINISHED; + --fieldCount; + } + + @Override + public void startTerm(BytesRef term, int freq) throws IOException { + assert docStatus == Status.STARTED; + assert fieldStatus == Status.STARTED; + assert termStatus != Status.STARTED; + in.startTerm(term, freq); + termStatus = Status.STARTED; + positionCount = hasPositions ? freq : 0; + } + + @Override + public void finishTerm() throws IOException { + assert positionCount == 0; + assert docStatus == Status.STARTED; + assert fieldStatus == Status.STARTED; + assert termStatus == Status.STARTED; + in.finishTerm(); + termStatus = Status.FINISHED; + --termCount; + } + + @Override + public void addPosition(int position, int startOffset, int endOffset, + BytesRef payload) throws IOException { + assert docStatus == Status.STARTED; + assert fieldStatus == Status.STARTED; + assert termStatus == Status.STARTED; + in.addPosition(position, startOffset, endOffset, payload); + --positionCount; + } + + @Override + public void abort() { + in.abort(); + } + + @Override + public void finish(FieldInfos fis, int numDocs) throws IOException { + assert docStatus == (numDocs > 0 ? Status.FINISHED : Status.UNDEFINED); + assert fieldStatus != Status.STARTED; + assert termStatus != Status.STARTED; + in.finish(fis, numDocs); + } + + @Override + public Comparator getComparator() throws IOException { + return in.getComparator(); + } + + @Override + public void close() throws IOException { + in.close(); + assert docStatus != Status.STARTED; + assert fieldStatus != Status.STARTED; + assert termStatus != Status.STARTED; + } + + } } diff --git a/lucene/test-framework/src/java/org/apache/lucene/codecs/compressing/CompressingCodec.java b/lucene/test-framework/src/java/org/apache/lucene/codecs/compressing/CompressingCodec.java index 6a67dc34e56..8d371014084 100644 --- a/lucene/test-framework/src/java/org/apache/lucene/codecs/compressing/CompressingCodec.java +++ b/lucene/test-framework/src/java/org/apache/lucene/codecs/compressing/CompressingCodec.java @@ -21,6 +21,7 @@ import java.util.Random; import org.apache.lucene.codecs.FilterCodec; import org.apache.lucene.codecs.StoredFieldsFormat; +import org.apache.lucene.codecs.compressing.dummy.DummyCompressingCodec; import org.apache.lucene.codecs.lucene41.Lucene41Codec; import com.carrotsearch.randomizedtesting.generators.RandomInts; diff --git a/lucene/test-framework/src/java/org/apache/lucene/codecs/compressing/DummyCompressingCodec.java b/lucene/test-framework/src/java/org/apache/lucene/codecs/compressing/dummy/DummyCompressingCodec.java similarity index 86% rename from lucene/test-framework/src/java/org/apache/lucene/codecs/compressing/DummyCompressingCodec.java rename to lucene/test-framework/src/java/org/apache/lucene/codecs/compressing/dummy/DummyCompressingCodec.java index a989bd87365..9e1649dc79e 100644 --- a/lucene/test-framework/src/java/org/apache/lucene/codecs/compressing/DummyCompressingCodec.java +++ b/lucene/test-framework/src/java/org/apache/lucene/codecs/compressing/dummy/DummyCompressingCodec.java @@ -1,4 +1,4 @@ -package org.apache.lucene.codecs.compressing; +package org.apache.lucene.codecs.compressing.dummy; /* * Licensed to the Apache Software Foundation (ASF) under one or more @@ -19,12 +19,18 @@ package 
org.apache.lucene.codecs.compressing; import java.io.IOException; +import org.apache.lucene.codecs.compressing.CompressingCodec; +import org.apache.lucene.codecs.compressing.CompressionMode; +import org.apache.lucene.codecs.compressing.Compressor; +import org.apache.lucene.codecs.compressing.Decompressor; import org.apache.lucene.store.DataInput; import org.apache.lucene.store.DataOutput; import org.apache.lucene.util.ArrayUtil; import org.apache.lucene.util.BytesRef; /** CompressionCodec that does not compress data, useful for testing. */ +// In its own package to make sure the oal.codecs.compressing classes are +// visible enough to let people write their own CompressionMode public class DummyCompressingCodec extends CompressingCodec { public static final CompressionMode DUMMY = new CompressionMode() { @@ -60,11 +66,6 @@ public class DummyCompressingCodec extends CompressingCodec { bytes.length = length; } - @Override - public void copyCompressedData(DataInput in, int originalLength, DataOutput out) throws IOException { - out.copyBytes(in, originalLength); - } - @Override public Decompressor clone() { return this; diff --git a/lucene/test-framework/src/java/org/apache/lucene/codecs/compressing/dummy/package.html b/lucene/test-framework/src/java/org/apache/lucene/codecs/compressing/dummy/package.html new file mode 100644 index 00000000000..4ab3a440d50 --- /dev/null +++ b/lucene/test-framework/src/java/org/apache/lucene/codecs/compressing/dummy/package.html @@ -0,0 +1,25 @@ + + + + + + + +Dummy CompressingCodec implementation used for testing. + + diff --git a/lucene/test-framework/src/java/org/apache/lucene/util/fst/FSTTester.java b/lucene/test-framework/src/java/org/apache/lucene/util/fst/FSTTester.java index be21c1a0f47..2ff3ee4491c 100644 --- a/lucene/test-framework/src/java/org/apache/lucene/util/fst/FSTTester.java +++ b/lucene/test-framework/src/java/org/apache/lucene/util/fst/FSTTester.java @@ -287,7 +287,8 @@ public class FSTTester { allowRandomSuffixSharing ? 
_TestUtil.nextInt(random, 1, 10) : Integer.MAX_VALUE, outputs, null, - willRewrite); + willRewrite, + true); for(InputOutput pair : pairs) { if (pair.output instanceof List) { diff --git a/lucene/test-framework/src/resources/META-INF/services/org.apache.lucene.codecs.Codec b/lucene/test-framework/src/resources/META-INF/services/org.apache.lucene.codecs.Codec index 5fafff60fbb..9898475e139 100644 --- a/lucene/test-framework/src/resources/META-INF/services/org.apache.lucene.codecs.Codec +++ b/lucene/test-framework/src/resources/META-INF/services/org.apache.lucene.codecs.Codec @@ -17,4 +17,4 @@ org.apache.lucene.codecs.asserting.AssertingCodec org.apache.lucene.codecs.compressing.FastCompressingCodec org.apache.lucene.codecs.compressing.FastDecompressionCompressingCodec org.apache.lucene.codecs.compressing.HighCompressionCompressingCodec -org.apache.lucene.codecs.compressing.DummyCompressingCodec +org.apache.lucene.codecs.compressing.dummy.DummyCompressingCodec diff --git a/lucene/tools/forbiddenApis/servlet-api.txt b/lucene/tools/forbiddenApis/servlet-api.txt index fb8c13c649f..4034408da00 100644 --- a/lucene/tools/forbiddenApis/servlet-api.txt +++ b/lucene/tools/forbiddenApis/servlet-api.txt @@ -24,3 +24,18 @@ javax.servlet.ServletRequest#getParameterNames() javax.servlet.ServletRequest#getParameterValues(java.lang.String) javax.servlet.ServletResponse#getWriter() +javax.servlet.ServletInputStream#readLine(byte[],int,int) +javax.servlet.ServletOutputStream#print(boolean) +javax.servlet.ServletOutputStream#print(char) +javax.servlet.ServletOutputStream#print(double) +javax.servlet.ServletOutputStream#print(float) +javax.servlet.ServletOutputStream#print(int) +javax.servlet.ServletOutputStream#print(long) +javax.servlet.ServletOutputStream#print(java.lang.String) +javax.servlet.ServletOutputStream#println(boolean) +javax.servlet.ServletOutputStream#println(char) +javax.servlet.ServletOutputStream#println(double) +javax.servlet.ServletOutputStream#println(float) +javax.servlet.ServletOutputStream#println(int) +javax.servlet.ServletOutputStream#println(long) +javax.servlet.ServletOutputStream#println(java.lang.String) diff --git a/solr/CHANGES.txt b/solr/CHANGES.txt index 555fc83f3fb..5c33e2a634d 100644 --- a/solr/CHANGES.txt +++ b/solr/CHANGES.txt @@ -195,12 +195,14 @@ New Features that can be set to false to not filter. Its useful when there is already a spatial filter query but you also need to sort or boost by distance. (David Smiley) -* SOLR-4265: Solr now parses request parameters (in URL or sent with POST using - content-type application/x-www-form-urlencoded) in its dispatcher code. It no +* SOLR-4265, SOLR-4283: Solr now parses request parameters (in URL or sent with POST + using content-type application/x-www-form-urlencoded) in its dispatcher code. It no longer relies on special configuration settings in Tomcat or other web containers - to enable UTF-8 encoding, which is mandatory for correct Solr behaviour. Also - the maximum length of x-www-form-urlencoded POST parameters can now be configured - through the requestDispatcher/requestParsers/@formdataUploadLimitInKB setting in + to enable UTF-8 encoding, which is mandatory for correct Solr behaviour. Query + strings passed in via the URL need to be properly-%-escaped, UTF-8 encoded + bytes, otherwise Solr refuses to handle the request. 
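For the URL case described above, a client-side sketch of producing properly %-escaped UTF-8 bytes with the JDK (the host, core and query value are illustrative):

    import java.io.UnsupportedEncodingException;
    import java.net.URLEncoder;

    class EscapedQuerySketch {
      // URLEncoder with "UTF-8" yields %-escaped UTF-8 bytes, the form the
      // dispatcher now insists on for query strings passed in via the URL.
      static String selectUrl(String query) throws UnsupportedEncodingException {
        // e.g. "münchen" -> "m%C3%BCnchen"
        return "http://localhost:8983/solr/select?q=" + URLEncoder.encode(query, "UTF-8");
      }
    }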
The maximum length of + x-www-form-urlencoded POST parameters can now be configured through the + requestDispatcher/requestParsers/@formdataUploadLimitInKB setting in solrconfig.xml (defaults to 2 MiB). Solr now works out of the box with e.g. Tomcat, JBoss,... (Uwe Schindler, Dawid Weiss, Alex Rocher) @@ -246,6 +248,8 @@ Optimizations * SOLR-3840: XML query response display is unreadable in Solr Admin Query UI (steffkes) +* SOLR-3982: Admin UI: Various Dataimport Improvements (steffkes) + Bug Fixes ---------------------- @@ -493,6 +497,12 @@ Bug Fixes * SOLR-4170: The 'backup' ReplicationHandler command can sometimes use a stale index directory rather than the current one. (Mark Miller, Marcin Rzewuck) +* SOLR-3876: Solr Admin UI is completely dysfunctional on IE 9 (steffkes) + +* SOLR-4112: Fixed DataImportHandler ZKAwarePropertiesWriter implementation so + import works fine with SolrCloud clusters (Deniz Durmus, James Dyer, + Erick Erickson, shalin) + Other Changes ---------------------- @@ -567,6 +577,9 @@ Other Changes * SOLR-4226: Extract fl parsing code out of ReturnFields constructor. (Ryan Ernst via Robert Muir) +* SOLR-4208: ExtendedDismaxQParserPlugin has been refactored to make + subclassing easier. (Tomás Fernández Löbbe, hossman) + ================== 4.0.0 ================== Versions of Major Components @@ -1684,6 +1697,8 @@ Bug Fixes * SOLR-1958: When using the MailEntityProcessor, import would fail if fetchMailsSince was not specified. (Max Lynch via James Dyer) +* SOLR-4289: Admin UI - JVM memory bar - dark grey "used" width is too small + (steffkes, elyograg) Other Changes ---------------------- diff --git a/solr/build.xml b/solr/build.xml index ae70a73d3ae..7110d6ad89a 100644 --- a/solr/build.xml +++ b/solr/build.xml @@ -364,7 +364,20 @@ - + + + + + + + + + + + + + diff --git a/solr/contrib/dataimporthandler/src/java/org/apache/solr/handler/dataimport/SimplePropertiesWriter.java b/solr/contrib/dataimporthandler/src/java/org/apache/solr/handler/dataimport/SimplePropertiesWriter.java index a30ede5a3e5..82a3bd1a116 100644 --- a/solr/contrib/dataimporthandler/src/java/org/apache/solr/handler/dataimport/SimplePropertiesWriter.java +++ b/solr/contrib/dataimporthandler/src/java/org/apache/solr/handler/dataimport/SimplePropertiesWriter.java @@ -84,12 +84,7 @@ public class SimplePropertiesWriter extends DIHProperties { } else { filename = "dataimport.properties"; } - if(params.get(DIRECTORY) != null) { - configDir = params.get(DIRECTORY); - } else { - SolrCore core = dataImporter.getCore(); - configDir = (core == null ? "." : core.getResourceLoader().getConfigDir()); - } + findDirectory(dataImporter, params); if(params.get(LOCALE) != null) { String localeStr = params.get(LOCALE); for (Locale l : Locale.getAvailableLocales()) { @@ -109,6 +104,14 @@ public class SimplePropertiesWriter extends DIHProperties { } else { dateFormat = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss", locale); } + } + protected void findDirectory(DataImporter dataImporter, Map params) { + if(params.get(DIRECTORY) != null) { + configDir = params.get(DIRECTORY); + } else { + SolrCore core = dataImporter.getCore(); + configDir = (core == null ? "." 
: core.getResourceLoader().getConfigDir()); + } } private File getPersistFile() { diff --git a/solr/contrib/dataimporthandler/src/java/org/apache/solr/handler/dataimport/ZKPropertiesWriter.java b/solr/contrib/dataimporthandler/src/java/org/apache/solr/handler/dataimport/ZKPropertiesWriter.java index 3838a286533..afca619f6c2 100644 --- a/solr/contrib/dataimporthandler/src/java/org/apache/solr/handler/dataimport/ZKPropertiesWriter.java +++ b/solr/contrib/dataimporthandler/src/java/org/apache/solr/handler/dataimport/ZKPropertiesWriter.java @@ -42,12 +42,16 @@ public class ZKPropertiesWriter extends SimplePropertiesWriter { @Override public void init(DataImporter dataImporter, Map params) { - super.init(dataImporter, params); + super.init(dataImporter, params); + zkClient = dataImporter.getCore().getCoreDescriptor().getCoreContainer() + .getZkController().getZkClient(); + } + + @Override + protected void findDirectory(DataImporter dataImporter, Map params) { String collection = dataImporter.getCore().getCoreDescriptor() .getCloudDescriptor().getCollectionName(); path = "/configs/" + collection + "/" + filename; - zkClient = dataImporter.getCore().getCoreDescriptor().getCoreContainer() - .getZkController().getZkClient(); } @Override diff --git a/solr/contrib/dataimporthandler/src/test-files/dih/solr/collection1/conf/dataimport-schema.xml b/solr/contrib/dataimporthandler/src/test-files/dih/solr/collection1/conf/dataimport-schema.xml index 2bf706b0f82..5c87fb8192d 100644 --- a/solr/contrib/dataimporthandler/src/test-files/dih/solr/collection1/conf/dataimport-schema.xml +++ b/solr/contrib/dataimporthandler/src/test-files/dih/solr/collection1/conf/dataimport-schema.xml @@ -41,6 +41,8 @@ + + diff --git a/solr/contrib/dataimporthandler/src/test/org/apache/solr/handler/dataimport/TestZKPropertiesWriter.java b/solr/contrib/dataimporthandler/src/test/org/apache/solr/handler/dataimport/TestZKPropertiesWriter.java new file mode 100644 index 00000000000..6f291062bd9 --- /dev/null +++ b/solr/contrib/dataimporthandler/src/test/org/apache/solr/handler/dataimport/TestZKPropertiesWriter.java @@ -0,0 +1,158 @@ +package org.apache.solr.handler.dataimport; + +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +import org.apache.solr.cloud.AbstractZkTestCase; +import org.apache.solr.cloud.ZkTestServer; +import org.apache.solr.common.params.ModifiableSolrParams; +import org.apache.solr.request.LocalSolrQueryRequest; +import org.apache.solr.request.SolrQueryRequest; +import org.junit.After; +import org.junit.AfterClass; +import org.junit.Assert; +import org.junit.Before; +import org.junit.BeforeClass; +import org.junit.Test; + +import java.io.File; +import java.text.SimpleDateFormat; +import java.util.ArrayList; +import java.util.Date; +import java.util.HashMap; +import java.util.List; +import java.util.Locale; +import java.util.Map; + +public class TestZKPropertiesWriter extends AbstractDataImportHandlerTestCase { + protected static ZkTestServer zkServer; + + protected static String zkDir; + + private String dateFormat = "yyyy-MM-dd HH:mm:ss.SSSSSS"; + + @BeforeClass + public static void dihZk_beforeClass() throws Exception { + createTempDir(); + zkDir = dataDir.getAbsolutePath() + File.separator + + "zookeeper/server1/data"; + zkServer = new ZkTestServer(zkDir); + zkServer.run(); + + System.setProperty("solrcloud.skip.autorecovery", "true"); + System.setProperty("zkHost", zkServer.getZkAddress()); + System.setProperty("jetty.port", "0000"); + + AbstractZkTestCase.buildZooKeeper(zkServer.getZkHost(), zkServer.getZkAddress(), getFile("dih/solr"), + "dataimport-solrconfig.xml", "dataimport-schema.xml"); + + initCore("dataimport-solrconfig.xml", "dataimport-schema.xml", getFile("dih/solr").getAbsolutePath()); + } + + @Before + public void beforeDihZKTest() throws Exception { + clearIndex(); + assertU(commit()); + } + + @After + public void afterDihZkTest() throws Exception { + MockDataSource.clearCache(); + } + + + @AfterClass + public static void dihZk_afterClass() throws Exception { + zkServer.shutdown(); + + zkServer = null; + zkDir = null; + + // wait just a bit for any zk client threads to outlast timeout + Thread.sleep(2000); + } + + @Test + public void testZKPropertiesWriter() throws Exception { + // test using ZooKeeper + assertTrue("Not using ZooKeeper", h.getCoreContainer().isZooKeeperAware()); + + // for the really slow/busy computer, we wait to make sure we have a leader before starting + h.getCoreContainer().getZkController().getZkStateReader().getLeaderUrl("collection1", "shard1", 30000); + + assertQ("test query on empty index", request("qlkciyopsbgzyvkylsjhchghjrdf"), + "//result[@numFound='0']"); + + SimpleDateFormat errMsgFormat = new SimpleDateFormat(dateFormat, Locale.ROOT); + + delQ("*:*"); + commit(); + SimpleDateFormat df = new SimpleDateFormat(dateFormat, Locale.ROOT); + Date oneSecondAgo = new Date(System.currentTimeMillis() - 1000); + + Map init = new HashMap(); + init.put("dateFormat", dateFormat); + ZKPropertiesWriter spw = new ZKPropertiesWriter(); + spw.init(new DataImporter(h.getCore(), "dataimport"), init); + Map props = new HashMap(); + props.put("SomeDates.last_index_time", oneSecondAgo); + props.put("last_index_time", oneSecondAgo); + spw.persist(props); + + List rows = new ArrayList(); + rows.add(createMap("id", "1", "year_s", "2013")); + MockDataSource.setIterator("select " + df.format(oneSecondAgo) + " from dummy", rows.iterator()); + + h.query("/dataimport", lrf.makeRequest("command", "full-import", "dataConfig", + generateConfig(), "clean", "true", "commit", "true", "synchronous", + "true", "indent", "true")); + props = spw.readIndexerProperties(); + Date entityDate = df.parse((String) props.get("SomeDates.last_index_time")); + Date docDate = 
df.parse((String) props.get("last_index_time")); + + Assert.assertTrue("This date: " + errMsgFormat.format(oneSecondAgo) + " should be prior to the document date: " + errMsgFormat.format(docDate), docDate.getTime() - oneSecondAgo.getTime() > 0); + Assert.assertTrue("This date: " + errMsgFormat.format(oneSecondAgo) + " should be prior to the entity date: " + errMsgFormat.format(entityDate), entityDate.getTime() - oneSecondAgo.getTime() > 0); + assertQ(request("*:*"), "//*[@numFound='1']", "//doc/str[@name=\"year_s\"]=\"2013\""); + + } + + public SolrQueryRequest request(String... q) { + LocalSolrQueryRequest req = lrf.makeRequest(q); + ModifiableSolrParams params = new ModifiableSolrParams(); + params.add(req.getParams()); + params.set("distrib", true); + req.setParams(params); + return req; + } + + protected String generateConfig() { + StringBuilder sb = new StringBuilder(); + sb.append(" \n"); + sb.append("\n"); + sb.append("\n"); + sb.append(" \n"); + sb.append("\n"); + sb.append(" \n"); + sb.append("\n"); + sb.append(" \n"); + sb.append(" \n"); + String config = sb.toString(); + log.debug(config); + return config; + } +} diff --git a/solr/core/src/java/org/apache/solr/cloud/ZkController.java b/solr/core/src/java/org/apache/solr/cloud/ZkController.java index 661dd3275af..b15e1988fee 100644 --- a/solr/core/src/java/org/apache/solr/cloud/ZkController.java +++ b/solr/core/src/java/org/apache/solr/cloud/ZkController.java @@ -748,7 +748,8 @@ public final class ZkController { // now wait until our currently cloud state contains the latest leader String clusterStateLeader = zkStateReader.getLeaderUrl(collection, - shardId, timeoutms); + shardId, timeoutms * 2); // since we found it in zk, we are willing to + // wait a while to find it in state int tries = 0; while (!leaderUrl.equals(clusterStateLeader)) { if (tries == 60) { diff --git a/solr/core/src/java/org/apache/solr/search/ExtendedDismaxQParser.java b/solr/core/src/java/org/apache/solr/search/ExtendedDismaxQParser.java new file mode 100644 index 00000000000..51929e821ab --- /dev/null +++ b/solr/core/src/java/org/apache/solr/search/ExtendedDismaxQParser.java @@ -0,0 +1,1512 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.solr.search; + +import java.util.ArrayList; +import java.util.Collections; +import java.util.HashMap; +import java.util.HashSet; +import java.util.Iterator; +import java.util.LinkedList; +import java.util.List; +import java.util.Map; +import java.util.Set; + +import org.apache.lucene.analysis.Analyzer; +import org.apache.lucene.analysis.core.StopFilterFactory; +import org.apache.lucene.analysis.util.TokenFilterFactory; +import org.apache.lucene.queries.function.BoostedQuery; +import org.apache.lucene.queries.function.FunctionQuery; +import org.apache.lucene.queries.function.ValueSource; +import org.apache.lucene.queries.function.valuesource.ProductFloatFunction; +import org.apache.lucene.queries.function.valuesource.QueryValueSource; +import org.apache.lucene.search.BooleanClause; +import org.apache.lucene.search.BooleanQuery; +import org.apache.lucene.search.DisjunctionMaxQuery; +import org.apache.lucene.search.MatchAllDocsQuery; +import org.apache.lucene.search.MultiPhraseQuery; +import org.apache.lucene.search.PhraseQuery; +import org.apache.lucene.search.Query; +import org.apache.solr.analysis.TokenizerChain; +import org.apache.solr.common.params.DisMaxParams; +import org.apache.solr.common.params.SolrParams; +import org.apache.solr.common.util.NamedList; +import org.apache.solr.parser.QueryParser; +import org.apache.solr.parser.SolrQueryParserBase.MagicFieldName; +import org.apache.solr.request.SolrQueryRequest; +import org.apache.solr.schema.FieldType; +import org.apache.solr.util.SolrPluginUtils; + +/** + * Query parser that generates DisjunctionMaxQueries based on user configuration. + * See Wiki page http://wiki.apache.org/solr/ExtendedDisMax + */ +public class ExtendedDismaxQParser extends QParser { + + + /** + * A field we can't ever find in any schema, so we can safely tell + * DisjunctionMaxQueryParser to use it as our defaultField, and + * map aliases from it to any field in our schema. + */ + private static String IMPOSSIBLE_FIELD_NAME = "\uFFFC\uFFFC\uFFFC"; + + /** shorten the class references for utilities */ + private static class U extends SolrPluginUtils { + /* :NOOP */ + } + + /** shorten the class references for utilities */ + private static interface DMP extends DisMaxParams { + /** + * User fields. The fields that can be used by the end user to create field-specific queries. + */ + public static String UF = "uf"; + + /** + * Lowercase Operators. If set to true, 'or' and 'and' will be considered OR and AND, otherwise + * lowercase operators will be considered terms to search for. + */ + public static String LOWERCASE_OPS = "lowercaseOperators"; + + /** + * Multiplicative boost. Boost functions which scores are going to be multiplied to the score + * of the main query (instead of just added, like with bf) + */ + public static String MULT_BOOST = "boost"; + + /** + * If set to true, stopwords are removed from the query. + */ + public static String STOPWORDS = "stopwords"; + } + + private ExtendedDismaxConfiguration config; + private Query parsedUserQuery; + private Query altUserQuery; + private List boostQueries; + + + public ExtendedDismaxQParser(String qstr, SolrParams localParams, SolrParams params, SolrQueryRequest req) { + super(qstr, localParams, params, req); + config = this.createConfiguration(qstr,localParams,params,req); + } + + @Override + public Query parse() throws SyntaxError { + + /* the main query we will execute. 
we disable the coord because + * this query is an artificial construct + */ + BooleanQuery query = new BooleanQuery(true); + + /* * * Main User Query * * */ + parsedUserQuery = null; + String userQuery = getString(); + altUserQuery = null; + if( userQuery == null || userQuery.trim().length() == 0 ) { + // If no query is specified, we may have an alternate + if (config.altQ != null) { + QParser altQParser = subQuery(config.altQ, null); + altUserQuery = altQParser.getQuery(); + query.add( altUserQuery , BooleanClause.Occur.MUST ); + } else { + return null; + // throw new SyntaxError("missing query string" ); + } + } else { + // There is a valid query string + ExtendedSolrQueryParser up = createEdismaxQueryParser(this, IMPOSSIBLE_FIELD_NAME); + up.addAlias(IMPOSSIBLE_FIELD_NAME, config.tiebreaker, config.queryFields); + addAliasesFromRequest(up, config.tiebreaker); + up.setPhraseSlop(config.qslop); // slop for explicit user phrase queries + up.setAllowLeadingWildcard(true); + + // defer escaping and only do if lucene parsing fails, or we need phrases + // parsing fails. Need to sloppy phrase queries anyway though. + List clauses = splitIntoClauses(userQuery, false); + + // Always rebuild mainUserQuery from clauses to catch modifications from splitIntoClauses + // This was necessary for userFields modifications to get propagated into the query. + // Convert lower or mixed case operators to uppercase if we saw them. + // only do this for the lucene query part and not for phrase query boosting + // since some fields might not be case insensitive. + // We don't use a regex for this because it might change and AND or OR in + // a phrase query in a case sensitive field. + String mainUserQuery = rebuildUserQuery(clauses, config.lowercaseOperators); + + // but always for unstructured implicit bqs created by getFieldQuery + up.minShouldMatch = config.minShouldMatch; + + parsedUserQuery = parseOriginalQuery(up, mainUserQuery, clauses, config); + + if (parsedUserQuery == null) { + parsedUserQuery = parseEscapedQuery(up, escapeUserQuery(clauses), config); + } + + query.add(parsedUserQuery, BooleanClause.Occur.MUST); + + addPhraseFieldQueries(query, clauses, config); + + } + + /* * * Boosting Query * * */ + boostQueries = getBoostQueries(); + for(Query f : boostQueries) { + query.add(f, BooleanClause.Occur.SHOULD); + } + + /* * * Boosting Functions * * */ + List boostFunctions = getBoostFunctions(); + for(Query f : boostFunctions) { + query.add(f, BooleanClause.Occur.SHOULD); + } + + // + // create a boosted query (scores multiplied by boosts) + // + Query topQuery = query; + List boosts = getMultiplicativeBoosts(); + if (boosts.size()>1) { + ValueSource prod = new ProductFloatFunction(boosts.toArray(new ValueSource[boosts.size()])); + topQuery = new BoostedQuery(query, prod); + } else if (boosts.size() == 1) { + topQuery = new BoostedQuery(query, boosts.get(0)); + } + + return topQuery; + } + + /** + * Adds shingled phrase queries to all the fields specified in the pf, pf2 anf pf3 parameters + * + */ + protected void addPhraseFieldQueries(BooleanQuery query, List clauses, + ExtendedDismaxConfiguration config) throws SyntaxError { + + // sloppy phrase queries for proximity + List allPhraseFields = config.getAllPhraseFields(); + + if (allPhraseFields.size() > 0) { + // find non-field clauses + List normalClauses = new ArrayList(clauses.size()); + for (Clause clause : clauses) { + if (clause.field != null || clause.isPhrase) continue; + // check for keywords "AND,OR,TO" + if (clause.isBareWord()) { + String 
s = clause.val.toString(); + // avoid putting explicit operators in the phrase query + if ("OR".equals(s) || "AND".equals(s) || "NOT".equals(s) || "TO".equals(s)) continue; + } + normalClauses.add(clause); + } + + // full phrase and shingles + for (FieldParams phraseField: allPhraseFields) { + Map pf = new HashMap(1); + pf.put(phraseField.getField(),phraseField.getBoost()); + addShingledPhraseQueries(query, normalClauses, pf, + phraseField.getWordGrams(),config.tiebreaker, phraseField.getSlop()); + } + + } + } + + /** + * Creates an instance of ExtendedDismaxConfiguration. It will contain all + * the necessary parameters to parse the query + */ + protected ExtendedDismaxConfiguration createConfiguration(String qstr, + SolrParams localParams, SolrParams params, SolrQueryRequest req) { + return new ExtendedDismaxConfiguration(localParams,params,req); + } + + /** + * Creates an instance of ExtendedSolrQueryParser, the query parser that's going to be used + * to parse the query. + */ + protected ExtendedSolrQueryParser createEdismaxQueryParser(QParser qParser, String field) { + return new ExtendedSolrQueryParser(qParser, field); + } + + /** + * Parses an escaped version of the user's query. This method is called + * in the event that the original query encounters exceptions during parsing. + * + * @param up parser used + * @param escapedUserQuery query that is parsed, should already be escaped so that no trivial parse errors are encountered + * @param config Configuration options for this parse request + * @return the resulting query (flattened if needed) with "min should match" rules applied as specified in the config. + * @see #parseOriginalQuery + * @see SolrPluginUtils#flattenBooleanQuery + */ + protected Query parseEscapedQuery(ExtendedSolrQueryParser up, + String escapedUserQuery, ExtendedDismaxConfiguration config) throws SyntaxError { + Query query = up.parse(escapedUserQuery); + + if (query instanceof BooleanQuery) { + BooleanQuery t = new BooleanQuery(); + SolrPluginUtils.flattenBooleanQuery(t, (BooleanQuery)query); + SolrPluginUtils.setMinShouldMatch(t, config.minShouldMatch); + query = t; + } + return query; + } + + /** + * Parses the user's original query. This method attempts to cleanly parse the specified query string using the specified parser, any Exceptions are ignored resulting in null being returned. + * + * @param up parser used + * @param mainUserQuery query string that is parsed + * @param clauses used to dictate "min should match" logic + * @param config Configuration options for this parse request + * @return the resulting query with "min should match" rules applied as specified in the config. + * @see #parseEscapedQuery + */ + protected Query parseOriginalQuery(ExtendedSolrQueryParser up, + String mainUserQuery, List clauses, ExtendedDismaxConfiguration config) { + + Query query = null; + try { + up.setRemoveStopFilter(!config.stopwords); + up.exceptions = true; + query = up.parse(mainUserQuery); + + if (shouldRemoveStopFilter(config, query)) { + // if the query was all stop words, remove none of them + up.setRemoveStopFilter(true); + query = up.parse(mainUserQuery); + } + } catch (Exception e) { + // ignore failure and reparse later after escaping reserved chars + up.exceptions = false; + } + + if(query == null) { + return null; + } + // For correct lucene queries, turn off mm processing if there + // were explicit operators (except for AND). 
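[Editor's note: a self-contained sketch of that rule, using plain whitespace-separated tokens instead of the parsed Clause objects that doMinMatched() below operates on; illustration only.]

    class MinShouldMatchRuleSketch {
      // Any explicit '+'/'-' prefix, an uppercase OR/NOT, or a lowercase 'or'
      // (when lowercaseOperators is on) switches mm processing off; bare terms
      // and AND leave it on.
      static boolean minShouldMatchApplies(String q, boolean lowercaseOperators) {
        for (String tok : q.split("\\s+")) {
          if (tok.startsWith("+") || tok.startsWith("-")) return false;
          if ("OR".equals(tok) || "NOT".equals(tok)) return false;
          if (lowercaseOperators && "or".equals(tok)) return false;
        }
        return true; // e.g. true for "wifi router" and "a AND b", false for "a OR b"
      }
    }
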
+ boolean doMinMatched = doMinMatched(clauses, config.lowercaseOperators); + if (doMinMatched && query instanceof BooleanQuery) { + SolrPluginUtils.setMinShouldMatch((BooleanQuery)query, config.minShouldMatch); + } + return query; + } + + /** + * Determines if query should be re-parsed removing the stop filter. + * @return true if there are stopwords configured and the parsed query was empty + * false in any other case. + */ + protected boolean shouldRemoveStopFilter(ExtendedDismaxConfiguration config, + Query query) { + return config.stopwords && isEmpty(query); + } + + private String escapeUserQuery(List clauses) { + StringBuilder sb = new StringBuilder(); + for (Clause clause : clauses) { + + boolean doQuote = clause.isPhrase; + + String s=clause.val; + if (!clause.isPhrase && ("OR".equals(s) || "AND".equals(s) || "NOT".equals(s))) { + doQuote=true; + } + + if (clause.must != 0) { + sb.append(clause.must); + } + if (clause.field != null) { + sb.append(clause.field); + sb.append(':'); + } + if (doQuote) { + sb.append('"'); + } + sb.append(clause.val); + if (doQuote) { + sb.append('"'); + } + if (clause.field != null) { + // Add the default user field boost, if any + Float boost = config.userFields.getBoost(clause.field); + if(boost != null) + sb.append("^").append(boost); + } + sb.append(' '); + } + return sb.toString(); + } + + /** + * Returns false if at least one of the clauses is an explicit operator (except for AND) + */ + private boolean doMinMatched(List clauses, boolean lowercaseOperators) { + for (Clause clause : clauses) { + if (clause.must == '+') return false; + if (clause.must == '-') return false; + if (clause.isBareWord()) { + String s = clause.val; + if ("OR".equals(s)) { + return false; + } else if ("NOT".equals(s)) { + return false; + } else if (lowercaseOperators && "or".equals(s)) { + return false; + } + } + } + return true; + } + + /** + * Generates a query string from the raw clauses, uppercasing + * 'and' and 'or' as needed. + * @param clauses the clauses of the query string to be rebuilt + * @param lowercaseOperators if true, lowercase 'and' and 'or' clauses will + * be recognized as operators and uppercased in the final query string. + * @return the generated query string. 
+ */ + protected String rebuildUserQuery(List clauses, boolean lowercaseOperators) { + StringBuilder sb = new StringBuilder(); + for (int i=0; i0 && i+1 getMultiplicativeBoosts() throws SyntaxError { + List boosts = new ArrayList(); + if (config.hasMultiplicativeBoosts()) { + for (String boostStr : config.multBoosts) { + if (boostStr==null || boostStr.length()==0) continue; + Query boost = subQuery(boostStr, FunctionQParserPlugin.NAME).getQuery(); + ValueSource vs; + if (boost instanceof FunctionQuery) { + vs = ((FunctionQuery)boost).getValueSource(); + } else { + vs = new QueryValueSource(boost, 1.0f); + } + boosts.add(vs); + } + } + return boosts; + } + + /** + * Parses all function queries + */ + protected List getBoostFunctions() throws SyntaxError { + List boostFunctions = new LinkedList(); + if (config.hasBoostFunctions()) { + for (String boostFunc : config.boostFuncs) { + if(null == boostFunc || "".equals(boostFunc)) continue; + Map ff = SolrPluginUtils.parseFieldBoosts(boostFunc); + for (String f : ff.keySet()) { + Query fq = subQuery(f, FunctionQParserPlugin.NAME).getQuery(); + Float b = ff.get(f); + if (null != b) { + fq.setBoost(b); + } + boostFunctions.add(fq); + } + } + } + return boostFunctions; + } + + /** + * Parses all boost queries + */ + protected List getBoostQueries() throws SyntaxError { + List boostQueries = new LinkedList(); + if (config.hasBoostParams()) { + for (String qs : config.boostParams) { + if (qs.trim().length()==0) continue; + Query q = subQuery(qs, null).getQuery(); + boostQueries.add(q); + } + } + return boostQueries; + } + + /** + * Extracts all the aliased fields from the requests and adds them to up + */ + private void addAliasesFromRequest(ExtendedSolrQueryParser up, float tiebreaker) { + Iterator it = config.solrParams.getParameterNamesIterator(); + while(it.hasNext()) { + String param = it.next(); + if(param.startsWith("f.") && param.endsWith(".qf")) { + // Add the alias + String fname = param.substring(2,param.length()-3); + String qfReplacement = config.solrParams.get(param); + Map parsedQf = SolrPluginUtils.parseFieldBoosts(qfReplacement); + if(parsedQf.size() == 0) + return; + up.addAlias(fname, tiebreaker, parsedQf); + } + } + } + + /** + * Modifies the main query by adding a new optional Query consisting + * of shingled phrase queries across the specified clauses using the + * specified field => boost mappings. + * + * @param mainQuery Where the phrase boosting queries will be added + * @param clauses Clauses that will be used to construct the phrases + * @param fields Field => boost mappings for the phrase queries + * @param shingleSize how big the phrases should be, 0 means a single phrase + * @param tiebreaker tie breaker value for the DisjunctionMaxQueries + * @param slop slop value for the constructed phrases + */ + protected void addShingledPhraseQueries(final BooleanQuery mainQuery, + final List clauses, + final Map fields, + int shingleSize, + final float tiebreaker, + final int slop) + throws SyntaxError { + + if (null == fields || fields.isEmpty() || + null == clauses || clauses.size() < shingleSize ) + return; + + if (0 == shingleSize) shingleSize = clauses.size(); + + final int goat = shingleSize-1; // :TODO: better name for var? 
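[Editor's note: a standalone sketch of what the shingling loop below produces, outside the Clause machinery; illustration only.]

    class ShingleSketch {
      // shingles(["quick", "brown", "fox"], 2)  ->  "quick brown " "brown fox "
      static String shingles(java.util.List<String> words, int wordGrams) {
        StringBuilder sb = new StringBuilder();
        for (int i = 0; i + wordGrams <= words.size(); i++) {
          sb.append('"');
          for (int j = 0; j < wordGrams; j++) {
            sb.append(words.get(i + j)).append(' ');
          }
          sb.append("\" ");
        }
        return sb.toString();
      }
    }

Each quoted group is then parsed against the pf/pf2/pf3 fields as a sloppy phrase query; with minClauseSize = 2 (set further down) a single leftover term contributes no phrase boost.
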
+ + StringBuilder userPhraseQuery = new StringBuilder(); + for (int i=0; i < clauses.size() - goat; i++) { + userPhraseQuery.append('"'); + for (int j=0; j <= goat; j++) { + userPhraseQuery.append(clauses.get(i + j).val); + userPhraseQuery.append(' '); + } + userPhraseQuery.append('"'); + userPhraseQuery.append(' '); + } + + /* for parsing sloppy phrases using DisjunctionMaxQueries */ + ExtendedSolrQueryParser pp = createEdismaxQueryParser(this, IMPOSSIBLE_FIELD_NAME); + + pp.addAlias(IMPOSSIBLE_FIELD_NAME, tiebreaker, fields); + pp.setPhraseSlop(slop); + pp.setRemoveStopFilter(true); // remove stop filter and keep stopwords + + /* :TODO: reevaluate using makeDismax=true vs false... + * + * The DismaxQueryParser always used DisjunctionMaxQueries for the + * pf boost, for the same reasons it used them for the qf fields. + * When Yonik first wrote the ExtendedDismaxQParserPlugin, he added + * the "makeDismax=false" property to use BooleanQueries instead, but + * when asked why his response was "I honestly don't recall" ... + * + * https://issues.apache.org/jira/browse/SOLR-1553?focusedCommentId=12793813#action_12793813 + * + * so for now, we continue to use dismax style queries because it + * seems the most logical and is back compatible, but we should + * try to figure out what Yonik was thinking at the time (because he + * rarely does things for no reason) + */ + pp.makeDismax = true; + + + // minClauseSize is independent of the shingleSize because of stop words + // (if they are removed from the middle, so be it, but we need at least + // two or there shouldn't be a boost) + pp.minClauseSize = 2; + + // TODO: perhaps we shouldn't use synonyms either... + + Query phrase = pp.parse(userPhraseQuery.toString()); + if (phrase != null) { + mainQuery.add(phrase, BooleanClause.Occur.SHOULD); + } + } + + + @Override + public String[] getDefaultHighlightFields() { + return config.queryFields.keySet().toArray(new String[0]); + } + + @Override + public Query getHighlightQuery() throws SyntaxError { + return parsedUserQuery == null ? altUserQuery : parsedUserQuery; + } + + @Override + public void addDebugInfo(NamedList debugInfo) { + super.addDebugInfo(debugInfo); + debugInfo.add("altquerystring", altUserQuery); + if (null != boostQueries) { + debugInfo.add("boost_queries", config.boostParams); + debugInfo.add("parsed_boost_queries", + QueryParsing.toString(boostQueries, getReq().getSchema())); + } + debugInfo.add("boostfuncs", getReq().getParams().getParams(DisMaxParams.BF)); + } + + + // FIXME: Not in use + // public static CharSequence partialEscape(CharSequence s) { + // StringBuilder sb = new StringBuilder(); + // + // int len = s.length(); + // for (int i = 0; i < len; i++) { + // char c = s.charAt(i); + // if (c == ':') { + // // look forward to make sure it's something that won't + // // cause a parse exception (something that won't be escaped... like + // // +,-,:, whitespace + // if (i+10) { + // char ch = s.charAt(i+1); + // if (!(Character.isWhitespace(ch) || ch=='+' || ch=='-' || ch==':')) { + // // OK, at this point the chars after the ':' will be fine. + // // now look back and try to determine if this is a fieldname + // // [+,-]? 
[letter,_] [letter digit,_,-,.]* + // // This won't cover *all* possible lucene fieldnames, but we should + // // only pick nice names to begin with + // int start, pos; + // for (start=i-1; start>=0; start--) { + // ch = s.charAt(start); + // if (Character.isWhitespace(ch)) break; + // } + // + // // skip whitespace + // pos = start+1; + // + // // skip leading + or - + // ch = s.charAt(pos); + // if (ch=='+' || ch=='-') { + // pos++; + // } + // + // // we don't need to explicitly check for end of string + // // since ':' will act as our sentinal + // + // // first char can't be '-' or '.' + // ch = s.charAt(pos++); + // if (Character.isJavaIdentifierPart(ch)) { + // + // for(;;) { + // ch = s.charAt(pos++); + // if (!(Character.isJavaIdentifierPart(ch) || ch=='-' || ch=='.')) { + // break; + // } + // } + // + // if (pos<=i) { + // // OK, we got to the ':' and everything looked like a valid fieldname, so + // // don't escape the ':' + // sb.append(':'); + // continue; // jump back to start of outer-most loop + // } + // + // } + // + // + // } + // } + // + // // we fell through to here, so we should escape this like other reserved chars. + // sb.append('\\'); + // } + // else if (c == '\\' || c == '!' || c == '(' || c == ')' || + // c == '^' || c == '[' || c == ']' || + // c == '{' || c == '}' || c == '~' || c == '*' || c == '?' + // ) + // { + // sb.append('\\'); + // } + // sb.append(c); + // } + // return sb; + // } + + + protected static class Clause { + + boolean isBareWord() { + return must==0 && !isPhrase; + } + + protected String field; + protected String rawField; // if the clause is +(foo:bar) then rawField=(foo + protected boolean isPhrase; + protected boolean hasWhitespace; + protected boolean hasSpecialSyntax; + protected boolean syntaxError; + protected char must; // + or - + protected String val; // the field value (minus the field name, +/-, quotes) + protected String raw; // the raw clause w/o leading/trailing whitespace + } + + public List splitIntoClauses(String s, boolean ignoreQuote) { + ArrayList lst = new ArrayList(4); + Clause clause; + + int pos=0; + int end=s.length(); + char ch=0; + int start; + boolean disallowUserField; + while (pos < end) { + clause = new Clause(); + disallowUserField = true; + + ch = s.charAt(pos); + + while (Character.isWhitespace(ch)) { + if (++pos >= end) break; + ch = s.charAt(pos); + } + + start = pos; + + if (ch=='+' || ch=='-') { + clause.must = ch; + pos++; + } + + clause.field = getFieldName(s, pos, end); + if(clause.field != null && !config.userFields.isAllowed(clause.field)) { + clause.field = null; + } + if (clause.field != null) { + disallowUserField = false; + int colon = s.indexOf(':',pos); + clause.rawField = s.substring(pos, colon); + pos += colon - pos; // skip the field name + pos++; // skip the ':' + } + + if (pos>=end) break; + + + char inString=0; + + ch = s.charAt(pos); + if (!ignoreQuote && ch=='"') { + clause.isPhrase = true; + inString = '"'; + pos++; + } + + StringBuilder sb = new StringBuilder(); + while (pos < end) { + ch = s.charAt(pos++); + if (ch=='\\') { // skip escaped chars, but leave escaped + sb.append(ch); + if (pos >= end) { + sb.append(ch); // double backslash if we are at the end of the string + break; + } + ch = s.charAt(pos++); + sb.append(ch); + continue; + } else if (inString != 0 && ch == inString) { + inString=0; + break; + } else if (Character.isWhitespace(ch)) { + clause.hasWhitespace=true; + if (inString == 0) { + // end of the token if we aren't in a string, backing + // up the position. 
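[Editor's note: a worked example of this scanner; splitIntoClauses is public on the parser, so from a subclass (hypothetical instance `parser`):]

    List<Clause> clauses = parser.splitIntoClauses("foo:bar +baz \"a b\"", false);
    // -> three clauses: {field=foo, val=bar}, {must='+', val=baz},
    //    {isPhrase=true, val=a b}
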
+ pos--; + break; + } + } + + if (inString == 0) { + switch (ch) { + case '!': + case '(': + case ')': + case ':': + case '^': + case '[': + case ']': + case '{': + case '}': + case '~': + case '*': + case '?': + case '"': + case '+': + case '-': + case '\\': + case '|': + case '&': + case '/': + clause.hasSpecialSyntax = true; + sb.append('\\'); + } + } else if (ch=='"') { + // only char we need to escape in a string is double quote + sb.append('\\'); + } + sb.append(ch); + } + clause.val = sb.toString(); + + if (clause.isPhrase) { + if (inString != 0) { + // detected bad quote balancing... retry + // parsing with quotes like any other char + return splitIntoClauses(s, true); + } + + // special syntax in a string isn't special + clause.hasSpecialSyntax = false; + } else { + // an empty clause... must be just a + or - on it's own + if (clause.val.length() == 0) { + clause.syntaxError = true; + if (clause.must != 0) { + clause.val="\\"+clause.must; + clause.must = 0; + clause.hasSpecialSyntax = true; + } else { + // uh.. this shouldn't happen. + clause=null; + } + } + } + + if (clause != null) { + if(disallowUserField) { + clause.raw = s.substring(start, pos); + // escape colons, except for "match all" query + if(!"*:*".equals(clause.raw)) { + clause.raw = clause.raw.replaceAll(":", "\\\\:"); + } + } else { + clause.raw = s.substring(start, pos); + // Add default userField boost if no explicit boost exists + if(config.userFields.isAllowed(clause.field) && !clause.raw.contains("^")) { + Float boost = config.userFields.getBoost(clause.field); + if(boost != null) + clause.raw += "^" + boost; + } + } + lst.add(clause); + } + } + + return lst; + } + + /** + * returns a field name or legal field alias from the current + * position of the string + */ + public String getFieldName(String s, int pos, int end) { + if (pos >= end) return null; + int p=pos; + int colon = s.indexOf(':',pos); + // make sure there is space after the colon, but not whitespace + if (colon<=pos || colon+1>=end || Character.isWhitespace(s.charAt(colon+1))) return null; + char ch = s.charAt(p++); + while ((ch=='(' || ch=='+' || ch=='-') && (pos split(String s, boolean ignoreQuote) { + ArrayList lst = new ArrayList(4); + int pos=0, start=0, end=s.length(); + char inString=0; + char ch=0; + while (pos < end) { + char prevChar=ch; + ch = s.charAt(pos++); + if (ch=='\\') { // skip escaped chars + pos++; + } else if (inString != 0 && ch==inString) { + inString=0; + } else if (!ignoreQuote && ch=='"') { + // If char is directly preceeded by a number or letter + // then don't treat it as the start of a string. + if (!Character.isLetterOrDigit(prevChar)) { + inString=ch; + } + } else if (Character.isWhitespace(ch) && inString==0) { + lst.add(s.substring(start,pos-1)); + start=pos; + } + } + if (start < end) { + lst.add(s.substring(start,end)); + } + + if (inString != 0) { + // unbalanced quote... ignore them + return split(s, true); + } + + return lst; + } + + enum QType { + FIELD, + PHRASE, + PREFIX, + WILDCARD, + FUZZY, + RANGE + } + + + static final RuntimeException unknownField = new RuntimeException("UnknownField"); + static { + unknownField.fillInStackTrace(); + } + + /** + * A subclass of SolrQueryParser that supports aliasing fields for + * constructing DisjunctionMaxQueries. 
+ */ + public static class ExtendedSolrQueryParser extends SolrQueryParser { + + /** A simple container for storing alias info + */ + protected class Alias { + public float tie; + public Map fields; + } + + boolean makeDismax=true; + boolean disableCoord=true; + boolean allowWildcard=true; + int minClauseSize = 0; // minimum number of clauses per phrase query... + // used when constructing boosting part of query via sloppy phrases + boolean exceptions; // allow exceptions to be thrown (for example on a missing field) + + private Map nonStopFilterAnalyzerPerField; + private boolean removeStopFilter; + String minShouldMatch; // for inner boolean queries produced from a single fieldQuery + + /** + * Where we store a map from field name we expect to see in our query + * string, to Alias object containing the fields to use in our + * DisjunctionMaxQuery and the tiebreaker to use. + */ + protected Map aliases = new HashMap(3); + + private QType type; + private String field; + private String val; + private String val2; + private boolean bool; + private boolean bool2; + private float flt; + private int slop; + + public ExtendedSolrQueryParser(QParser parser, String defaultField) { + super(parser, defaultField); + // don't trust that our parent class won't ever change it's default + setDefaultOperator(QueryParser.Operator.OR); + } + + public void setRemoveStopFilter(boolean remove) { + removeStopFilter = remove; + } + + @Override + protected Query getBooleanQuery(List clauses, boolean disableCoord) throws SyntaxError { + Query q = super.getBooleanQuery(clauses, disableCoord); + if (q != null) { + q = QueryUtils.makeQueryable(q); + } + return q; + } + + /** + * Add an alias to this query parser. + * + * @param field the field name that should trigger alias mapping + * @param fieldBoosts the mapping from fieldname to boost value that + * should be used to build up the clauses of the + * DisjunctionMaxQuery. + * @param tiebreaker to the tiebreaker to be used in the + * DisjunctionMaxQuery + * @see SolrPluginUtils#parseFieldBoosts + */ + public void addAlias(String field, float tiebreaker, + Map fieldBoosts) { + Alias a = new Alias(); + a.tie = tiebreaker; + a.fields = fieldBoosts; + aliases.put(field, a); + } + + /** + * Returns the aliases found for a field. 
+ * Returns null if there are no aliases for the field + * @return Alias + */ + protected Alias getAlias(String field) { + return aliases.get(field); + } + + @Override + protected Query getFieldQuery(String field, String val, boolean quoted) throws SyntaxError { + this.type = QType.FIELD; + this.field = field; + this.val = val; + this.slop = getPhraseSlop(); // unspecified + return getAliasedQuery(); + } + + @Override + protected Query getFieldQuery(String field, String val, int slop) throws SyntaxError { + this.type = QType.PHRASE; + this.field = field; + this.val = val; + this.slop = slop; + return getAliasedQuery(); + } + + @Override + protected Query getPrefixQuery(String field, String val) throws SyntaxError { + if (val.equals("") && field.equals("*")) { + return new MatchAllDocsQuery(); + } + this.type = QType.PREFIX; + this.field = field; + this.val = val; + return getAliasedQuery(); + } + + @Override + protected Query newFieldQuery(Analyzer analyzer, String field, String queryText, boolean quoted) throws SyntaxError { + Analyzer actualAnalyzer; + if (removeStopFilter) { + if (nonStopFilterAnalyzerPerField == null) { + nonStopFilterAnalyzerPerField = new HashMap(); + } + actualAnalyzer = nonStopFilterAnalyzerPerField.get(field); + if (actualAnalyzer == null) { + actualAnalyzer = noStopwordFilterAnalyzer(field); + } + } else { + actualAnalyzer = parser.getReq().getSchema().getFieldType(field).getQueryAnalyzer(); + } + return super.newFieldQuery(actualAnalyzer, field, queryText, quoted); + } + + @Override + protected Query getRangeQuery(String field, String a, String b, boolean startInclusive, boolean endInclusive) throws SyntaxError { + this.type = QType.RANGE; + this.field = field; + this.val = a; + this.val2 = b; + this.bool = startInclusive; + this.bool2 = endInclusive; + return getAliasedQuery(); + } + + @Override + protected Query getWildcardQuery(String field, String val) throws SyntaxError { + if (val.equals("*")) { + if (field.equals("*")) { + return new MatchAllDocsQuery(); + } else{ + return getPrefixQuery(field,""); + } + } + this.type = QType.WILDCARD; + this.field = field; + this.val = val; + return getAliasedQuery(); + } + + @Override + protected Query getFuzzyQuery(String field, String val, float minSimilarity) throws SyntaxError { + this.type = QType.FUZZY; + this.field = field; + this.val = val; + this.flt = minSimilarity; + return getAliasedQuery(); + } + + /** + * Delegates to the super class unless the field has been specified + * as an alias -- in which case we recurse on each of + * the aliased fields, and the results are composed into a + * DisjunctionMaxQuery. (so yes: aliases which point at other + * aliases should work) + */ + protected Query getAliasedQuery() throws SyntaxError { + Alias a = aliases.get(field); + this.validateCyclicAliasing(field); + if (a != null) { + List lst = getQueries(a); + if (lst == null || lst.size()==0) + return getQuery(); + // make a DisjunctionMaxQuery in this case too... it will stop + // the "mm" processing from making everything required in the case + // that the query expanded to multiple clauses. + // DisMaxQuery.rewrite() removes itself if there is just a single clause anyway. + // if (lst.size()==1) return lst.get(0); + + if (makeDismax) { + DisjunctionMaxQuery q = new DisjunctionMaxQuery(lst, a.tie); + return q; + } else { + // should we disable coord? 
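[Editor's note: a hedged sketch of the two composition strategies chosen between at this point, against the Lucene 4.x API; field names and terms are made up.]

    import java.util.Arrays;
    import org.apache.lucene.index.Term;
    import org.apache.lucene.search.*;

    class AliasCompositionSketch {
      static Query compose(boolean makeDismax, float tie) {
        Query title = new TermQuery(new Term("title", "solr"));
        Query body  = new TermQuery(new Term("body",  "solr"));
        if (makeDismax) {
          // best-scoring field wins; the others contribute tie * their score
          return new DisjunctionMaxQuery(Arrays.asList(title, body), tie);
        }
        BooleanQuery bq = new BooleanQuery(true); // coord disabled, as below
        bq.add(title, BooleanClause.Occur.SHOULD);
        bq.add(body,  BooleanClause.Occur.SHOULD);
        return bq; // scores of matching fields simply add up
      }
    }
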
+ BooleanQuery q = new BooleanQuery(disableCoord); + for (Query sub : lst) { + q.add(sub, BooleanClause.Occur.SHOULD); + } + return q; + } + } else { + + // verify that a fielded query is actually on a field that exists... if not, + // then throw an exception to get us out of here, and we'll treat it like a + // literal when we try the escape+re-parse. + if (exceptions) { + FieldType ft = schema.getFieldTypeNoEx(field); + if (ft == null && null == MagicFieldName.get(field)) { + throw unknownField; + } + } + + return getQuery(); + } + } + + /** + * Validate there is no cyclic referencing in the aliasing + */ + private void validateCyclicAliasing(String field) throws SyntaxError { + Set set = new HashSet(); + set.add(field); + if(validateField(field, set)) { + throw new SyntaxError("Field aliases lead to a cycle"); + } + } + + private boolean validateField(String field, Set set) { + if(this.getAlias(field) == null) { + return false; + } + boolean hascycle = false; + for(String referencedField:this.getAlias(field).fields.keySet()) { + if(!set.add(referencedField)) { + hascycle = true; + } else { + if(validateField(referencedField, set)) { + hascycle = true; + } + set.remove(referencedField); + } + } + return hascycle; + } + + protected List getQueries(Alias a) throws SyntaxError { + if (a == null) return null; + if (a.fields.size()==0) return null; + List lst= new ArrayList(4); + + for (String f : a.fields.keySet()) { + this.field = f; + Query sub = getAliasedQuery(); + if (sub != null) { + Float boost = a.fields.get(f); + if (boost != null) { + sub.setBoost(boost); + } + lst.add(sub); + } + } + return lst; + } + + private Query getQuery() { + try { + + switch (type) { + case FIELD: // fallthrough + case PHRASE: + Query query = super.getFieldQuery(field, val, type == QType.PHRASE); + // A BooleanQuery is only possible from getFieldQuery if it came from + // a single whitespace separated term. In this case, check the coordination + // factor on the query: if its enabled, that means we aren't a set of synonyms + // but instead multiple terms from one whitespace-separated term, we must + // apply minShouldMatch here so that it works correctly with other things + // like aliasing. + if (query instanceof BooleanQuery) { + BooleanQuery bq = (BooleanQuery) query; + if (!bq.isCoordDisabled()) { + SolrPluginUtils.setMinShouldMatch(bq, minShouldMatch); + } + } + if (query instanceof PhraseQuery) { + PhraseQuery pq = (PhraseQuery)query; + if (minClauseSize > 1 && pq.getTerms().length < minClauseSize) return null; + ((PhraseQuery)query).setSlop(slop); + } else if (query instanceof MultiPhraseQuery) { + MultiPhraseQuery pq = (MultiPhraseQuery)query; + if (minClauseSize > 1 && pq.getTermArrays().size() < minClauseSize) return null; + ((MultiPhraseQuery)query).setSlop(slop); + } else if (minClauseSize > 1) { + // if it's not a type of phrase query, it doesn't meet the minClauseSize requirements + return null; + } + return query; + case PREFIX: return super.getPrefixQuery(field, val); + case WILDCARD: return super.getWildcardQuery(field, val); + case FUZZY: return super.getFuzzyQuery(field, val, flt); + case RANGE: return super.getRangeQuery(field, val, val2, bool, bool2); + } + return null; + + } catch (Exception e) { + // an exception here is due to the field query not being compatible with the input text + // for example, passing a string to a numeric field. 
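[Editor's note: the cycle check in validateCyclicAliasing()/validateField() earlier in this class walks the alias graph depth-first with backtracking; a minimal standalone sketch of the same idea over a plain Map, names hypothetical.]

    import java.util.*;

    class AliasCycleSketch {
      // True if following 'start' through the alias graph revisits a field.
      static boolean hasAliasCycle(String start, Map<String, Set<String>> aliases) {
        return walk(start, aliases, new HashSet<String>(Collections.singleton(start)));
      }

      private static boolean walk(String field, Map<String, Set<String>> aliases,
                                  Set<String> seen) {
        Set<String> targets = aliases.get(field);
        if (targets == null) return false;   // not an alias: dead end
        for (String t : targets) {
          if (!seen.add(t)) return true;     // already on this path: cycle
          if (walk(t, aliases, seen)) return true;
          seen.remove(t);                    // backtrack
        }
        return false;
      }
    }

With aliases wired as f.who.qf=name and f.name.qf=who this returns true, which is the case where the parser throws SyntaxError("Field aliases lead to a cycle").
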
+ return null; + } + } + + private Analyzer noStopwordFilterAnalyzer(String fieldName) { + FieldType ft = parser.getReq().getSchema().getFieldType(fieldName); + Analyzer qa = ft.getQueryAnalyzer(); + if (!(qa instanceof TokenizerChain)) { + return qa; + } + + TokenizerChain tcq = (TokenizerChain) qa; + Analyzer ia = ft.getAnalyzer(); + if (ia == qa || !(ia instanceof TokenizerChain)) { + return qa; + } + TokenizerChain tci = (TokenizerChain) ia; + + // make sure that there isn't a stop filter in the indexer + for (TokenFilterFactory tf : tci.getTokenFilterFactories()) { + if (tf instanceof StopFilterFactory) { + return qa; + } + } + + // now if there is a stop filter in the query analyzer, remove it + int stopIdx = -1; + TokenFilterFactory[] facs = tcq.getTokenFilterFactories(); + + for (int i = 0; i < facs.length; i++) { + TokenFilterFactory tf = facs[i]; + if (tf instanceof StopFilterFactory) { + stopIdx = i; + break; + } + } + + if (stopIdx == -1) { + // no stop filter exists + return qa; + } + + TokenFilterFactory[] newtf = new TokenFilterFactory[facs.length - 1]; + for (int i = 0, j = 0; i < facs.length; i++) { + if (i == stopIdx) continue; + newtf[j++] = facs[i]; + } + + TokenizerChain newa = new TokenizerChain(tcq.getTokenizerFactory(), newtf); + newa.setPositionIncrementGap(tcq.getPositionIncrementGap(fieldName)); + return newa; + } + } + + static boolean isEmpty(Query q) { + if (q==null) return true; + if (q instanceof BooleanQuery && ((BooleanQuery)q).clauses().size()==0) return true; + return false; + } + + /** + * Class that encapsulates the input from userFields parameter and can answer whether + * a field allowed or disallowed as fielded query in the query string + */ + static class UserFields { + private Map userFieldsMap; + private DynamicField[] dynamicUserFields; + private DynamicField[] negativeDynamicUserFields; + + UserFields(Map ufm) { + userFieldsMap = ufm; + if (0 == userFieldsMap.size()) { + userFieldsMap.put("*", null); + } + + // Process dynamic patterns in userFields + ArrayList dynUserFields = new ArrayList(); + ArrayList negDynUserFields = new ArrayList(); + for(String f : userFieldsMap.keySet()) { + if(f.contains("*")) { + if(f.startsWith("-")) + negDynUserFields.add(new DynamicField(f.substring(1))); + else + dynUserFields.add(new DynamicField(f)); + } + } + Collections.sort(dynUserFields); + dynamicUserFields = dynUserFields.toArray(new DynamicField[dynUserFields.size()]); + Collections.sort(negDynUserFields); + negativeDynamicUserFields = negDynUserFields.toArray(new DynamicField[negDynUserFields.size()]); + } + + /** + * Is the given field name allowed according to UserFields spec given in the uf parameter? + * @param fname the field name to examine + * @return true if the fielded queries are allowed on this field + */ + public boolean isAllowed(String fname) { + boolean res = ((userFieldsMap.containsKey(fname) || isDynField(fname, false)) && + !userFieldsMap.containsKey("-"+fname) && + !isDynField(fname, true)); + return res; + } + + private boolean isDynField(String field, boolean neg) { + return getDynFieldForName(field, neg) == null ? false : true; + } + + private String getDynFieldForName(String f, boolean neg) { + for( DynamicField df : neg?negativeDynamicUserFields:dynamicUserFields ) { + if( df.matches( f ) ) return df.wildcard; + } + return null; + } + + /** + * Finds the default user field boost associated with the given field. + * This is parsed from the uf parameter, and may be specified as wildcards, e.g. 
*name^2.0 or *^3.0 + * @param field the field to find boost for + * @return the float boost value associated with the given field or a wildcard matching the field + */ + public Float getBoost(String field) { + return (userFieldsMap.containsKey(field)) ? + userFieldsMap.get(field) : // Exact field + userFieldsMap.get(getDynFieldForName(field, false)); // Dynamic field + } + } + + /* Represents a dynamic field, for easier matching, inspired by same class in IndexSchema */ + static class DynamicField implements Comparable { + final static int STARTS_WITH=1; + final static int ENDS_WITH=2; + final static int CATCHALL=3; + + final String wildcard; + final int type; + + final String str; + + protected DynamicField(String wildcard) { + this.wildcard = wildcard; + if (wildcard.equals("*")) { + type=CATCHALL; + str=null; + } + else if (wildcard.startsWith("*")) { + type=ENDS_WITH; + str=wildcard.substring(1); + } + else if (wildcard.endsWith("*")) { + type=STARTS_WITH; + str=wildcard.substring(0,wildcard.length()-1); + } + else { + throw new RuntimeException("dynamic field name must start or end with *"); + } + } + + /* + * Returns true if the regex wildcard for this DynamicField would match the input field name + */ + public boolean matches(String name) { + if (type==CATCHALL) return true; + else if (type==STARTS_WITH && name.startsWith(str)) return true; + else if (type==ENDS_WITH && name.endsWith(str)) return true; + else return false; + } + + /** + * Sort order is based on length of regex. Longest comes first. + * @param other The object to compare to. + * @return a negative integer, zero, or a positive integer + * as this object is less than, equal to, or greater than + * the specified object. + */ + @Override + public int compareTo(DynamicField other) { + return other.wildcard.length() - wildcard.length(); + } + + @Override + public String toString() { + return this.wildcard; + } + } + + /** + * Simple container for configuration information used when parsing queries + */ + public class ExtendedDismaxConfiguration { + + /** + * The field names specified by 'qf' that (most) clauses will + * be queried against + */ + protected Map queryFields; + + /** + * The field names specified by 'uf' that users are + * allowed to include literally in their query string. The Float + * boost values will be applied automatically to any clause using that + * field name. '*' will be treated as an alias for any + * field that exists in the schema. Wildcards are allowed to + * express dynamicFields. 
+ */ + protected UserFields userFields; + + protected String[] boostParams; + protected String[] multBoosts; + protected SolrParams solrParams; + protected String minShouldMatch; + + protected List allPhraseFields; + + protected float tiebreaker; + + protected int qslop; + + protected boolean stopwords; + + protected String altQ; + + protected boolean lowercaseOperators; + + protected String[] boostFuncs; + + public ExtendedDismaxConfiguration(SolrParams localParams, + SolrParams params, SolrQueryRequest req) { + solrParams = SolrParams.wrapDefaults(localParams, params); + minShouldMatch = DisMaxQParser.parseMinShouldMatch(req.getSchema(), solrParams); + userFields = new UserFields(U.parseFieldBoosts(solrParams.getParams(DMP.UF))); + try { + queryFields = DisMaxQParser.parseQueryFields(req.getSchema(), solrParams); + } catch (SyntaxError e) { + throw new RuntimeException(); + } + // Phrase slop array + int pslop[] = new int[4]; + pslop[0] = solrParams.getInt(DisMaxParams.PS, 0); + pslop[2] = solrParams.getInt(DisMaxParams.PS2, pslop[0]); + pslop[3] = solrParams.getInt(DisMaxParams.PS3, pslop[0]); + + List phraseFields = U.parseFieldBoostsAndSlop(solrParams.getParams(DMP.PF),0,pslop[0]); + List phraseFields2 = U.parseFieldBoostsAndSlop(solrParams.getParams(DMP.PF2),2,pslop[2]); + List phraseFields3 = U.parseFieldBoostsAndSlop(solrParams.getParams(DMP.PF3),3,pslop[3]); + + allPhraseFields = new ArrayList(phraseFields.size() + phraseFields2.size() + phraseFields3.size()); + allPhraseFields.addAll(phraseFields); + allPhraseFields.addAll(phraseFields2); + allPhraseFields.addAll(phraseFields3); + + tiebreaker = solrParams.getFloat(DisMaxParams.TIE, 0.0f); + + qslop = solrParams.getInt(DisMaxParams.QS, 0); + + stopwords = solrParams.getBool(DMP.STOPWORDS, true); + + altQ = solrParams.get( DisMaxParams.ALTQ ); + + lowercaseOperators = solrParams.getBool(DMP.LOWERCASE_OPS, true); + + /* * * Boosting Query * * */ + boostParams = solrParams.getParams(DisMaxParams.BQ); + + boostFuncs = solrParams.getParams(DisMaxParams.BF); + + multBoosts = solrParams.getParams(DMP.MULT_BOOST); + } + /** + * + * @return true if there are valid multiplicative boost queries + */ + public boolean hasMultiplicativeBoosts() { + return multBoosts!=null && multBoosts.length>0; + } + + /** + * + * @return true if there are valid boost functions + */ + public boolean hasBoostFunctions() { + return null != boostFuncs && 0 != boostFuncs.length; + } + /** + * + * @return true if there are valid boost params + */ + public boolean hasBoostParams() { + return boostParams!=null && boostParams.length>0; + } + + public List getAllPhraseFields() { + return allPhraseFields; + } + } + +} diff --git a/solr/core/src/java/org/apache/solr/search/ExtendedDismaxQParserPlugin.java b/solr/core/src/java/org/apache/solr/search/ExtendedDismaxQParserPlugin.java index 6255e296844..d7337138752 100755 --- a/solr/core/src/java/org/apache/solr/search/ExtendedDismaxQParserPlugin.java +++ b/solr/core/src/java/org/apache/solr/search/ExtendedDismaxQParserPlugin.java @@ -17,39 +17,13 @@ package org.apache.solr.search; -import java.util.ArrayList; -import java.util.Collections; -import java.util.HashMap; -import java.util.HashSet; -import java.util.Iterator; -import java.util.List; -import java.util.Map; -import java.util.Set; - -import org.apache.lucene.analysis.Analyzer; -import org.apache.lucene.analysis.core.StopFilterFactory; -import org.apache.lucene.analysis.util.TokenFilterFactory; -import org.apache.lucene.queries.function.BoostedQuery; -import 
diff --git a/solr/core/src/java/org/apache/solr/search/ExtendedDismaxQParserPlugin.java b/solr/core/src/java/org/apache/solr/search/ExtendedDismaxQParserPlugin.java
index 6255e296844..d7337138752 100755
--- a/solr/core/src/java/org/apache/solr/search/ExtendedDismaxQParserPlugin.java
+++ b/solr/core/src/java/org/apache/solr/search/ExtendedDismaxQParserPlugin.java
@@ -17,39 +17,13 @@
 package org.apache.solr.search;
 
-import java.util.ArrayList;
-import java.util.Collections;
-import java.util.HashMap;
-import java.util.HashSet;
-import java.util.Iterator;
-import java.util.List;
-import java.util.Map;
-import java.util.Set;
-
-import org.apache.lucene.analysis.Analyzer;
-import org.apache.lucene.analysis.core.StopFilterFactory;
-import org.apache.lucene.analysis.util.TokenFilterFactory;
-import org.apache.lucene.queries.function.BoostedQuery;
-import org.apache.lucene.queries.function.FunctionQuery;
-import org.apache.lucene.queries.function.ValueSource;
-import org.apache.lucene.queries.function.valuesource.ProductFloatFunction;
-import org.apache.lucene.queries.function.valuesource.QueryValueSource;
-import org.apache.lucene.search.*;
-import org.apache.solr.analysis.TokenizerChain;
-import org.apache.solr.parser.ParseException;
-import org.apache.solr.parser.QueryParser;
-import org.apache.solr.parser.SolrQueryParserBase.MagicFieldName;
-import org.apache.solr.common.params.DisMaxParams;
 import org.apache.solr.common.params.SolrParams;
 import org.apache.solr.common.util.NamedList;
 import org.apache.solr.request.SolrQueryRequest;
-import org.apache.solr.schema.FieldType;
-import org.apache.solr.util.SolrPluginUtils;
 
 /**
  * An advanced multi-field query parser based on the DisMax parser.
  * See Wiki page http://wiki.apache.org/solr/ExtendedDisMax
- * @lucene.experimental
  */
 public class ExtendedDismaxQParserPlugin extends QParserPlugin {
   public static final String NAME = "edismax";
@@ -62,1328 +36,4 @@ public class ExtendedDismaxQParserPlugin extends QParserPlugin {
   public QParser createParser(String qstr, SolrParams localParams, SolrParams params, SolrQueryRequest req) {
     return new ExtendedDismaxQParser(qstr, localParams, params, req);
   }
-}
-
-
-class ExtendedDismaxQParser extends QParser {
-
-  /**
-   * A field we can't ever find in any schema, so we can safely tell
-   * DisjunctionMaxQueryParser to use it as our defaultField, and
-   * map aliases from it to any field in our schema.
-   */
-  private static String IMPOSSIBLE_FIELD_NAME = "\uFFFC\uFFFC\uFFFC";
-
-  /** shorten the class references for utilities */
-  private static class U extends SolrPluginUtils {
-    /* :NOOP */
-  }
-
-  /** shorten the class references for utilities */
-  private static interface DMP extends DisMaxParams {
-    /**
-     * User fields. The fields that can be used by the end user to create field-specific queries.
-     */
-    public static String UF = "uf";
-  }
-
-
-  public ExtendedDismaxQParser(String qstr, SolrParams localParams, SolrParams params, SolrQueryRequest req) {
-    super(qstr, localParams, params, req);
-  }
-
-  /**
-   * The field names specified by 'qf' that (most) clauses will
-   * be queried against
-   */
-  private Map queryFields;
-
-  /**
-   * The field names specified by 'uf' that users are
-   * allowed to include literally in their query string. The Float
-   * boost values will be applied automaticly to any clause using that
-   * field name. '*' will be treated as an alias for any
-   * field that exists in the schema. Wildcards are allowed to
-   * express dynamicFields.
- */ - private UserFields userFields; - - private Query parsedUserQuery; - - private String[] boostParams; - private String[] multBoosts; - private List boostQueries; - private Query altUserQuery; - private QParser altQParser; - private SolrParams solrParams; - - - @Override - public Query parse() throws SyntaxError { - SolrParams localParams = getLocalParams(); - SolrParams params = getParams(); - - solrParams = SolrParams.wrapDefaults(localParams, params); - - final String minShouldMatch = - DisMaxQParser.parseMinShouldMatch(req.getSchema(), solrParams); - - userFields = new UserFields(U.parseFieldBoosts(solrParams.getParams(DMP.UF))); - - queryFields = DisMaxQParser.parseQueryFields(req.getSchema(), solrParams); - - // Phrase slop array - int pslop[] = new int[4]; - pslop[0] = solrParams.getInt(DisMaxParams.PS, 0); - pslop[2] = solrParams.getInt(DisMaxParams.PS2, pslop[0]); - pslop[3] = solrParams.getInt(DisMaxParams.PS3, pslop[0]); - - - // Boosted phrase of the full query string - List phraseFields = - U.parseFieldBoostsAndSlop(solrParams.getParams(DMP.PF),0,pslop[0]); - // Boosted Bi-Term Shingles from the query string - List phraseFields2 = - U.parseFieldBoostsAndSlop(solrParams.getParams(DMP.PF2),2,pslop[2]); - // Boosted Tri-Term Shingles from the query string - List phraseFields3 = - U.parseFieldBoostsAndSlop(solrParams.getParams(DMP.PF3),3,pslop[3]); - - float tiebreaker = solrParams.getFloat(DisMaxParams.TIE, 0.0f); - - int qslop = solrParams.getInt(DisMaxParams.QS, 0); - - // remove stopwords from mandatory "matching" component? - boolean stopwords = solrParams.getBool("stopwords", true); - - /* the main query we will execute. we disable the coord because - * this query is an artificial construct - */ - BooleanQuery query = new BooleanQuery(true); - - /* * * Main User Query * * */ - parsedUserQuery = null; - String userQuery = getString(); - altUserQuery = null; - if( userQuery == null || userQuery.trim().length() == 0 ) { - // If no query is specified, we may have an alternate - String altQ = solrParams.get( DisMaxParams.ALTQ ); - if (altQ != null) { - altQParser = subQuery(altQ, null); - altUserQuery = altQParser.getQuery(); - query.add( altUserQuery , BooleanClause.Occur.MUST ); - } else { - return null; - // throw new SyntaxError("missing query string" ); - } - } - else { - // There is a valid query string - // userQuery = partialEscape(U.stripUnbalancedQuotes(userQuery)).toString(); - - boolean lowercaseOperators = solrParams.getBool("lowercaseOperators", true); - String mainUserQuery = userQuery; - - ExtendedSolrQueryParser up = - new ExtendedSolrQueryParser(this, IMPOSSIBLE_FIELD_NAME); - up.addAlias(IMPOSSIBLE_FIELD_NAME, - tiebreaker, queryFields); - addAliasesFromRequest(up, tiebreaker); - up.setPhraseSlop(qslop); // slop for explicit user phrase queries - up.setAllowLeadingWildcard(true); - - // defer escaping and only do if lucene parsing fails, or we need phrases - // parsing fails. Need to sloppy phrase queries anyway though. 
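Both the removed parse() shown here and the new ExtendedDismaxConfiguration constructor route pf/pf2/pf3 through SolrPluginUtils.parseFieldBoostsAndSlop, with ps/ps2/ps3 supplying the default slop per shingle size. A minimal sketch of what that parsing yields; the demo class is invented, and the per-field ~slop syntax is assumed to be accepted by this utility:

import java.util.List;

import org.apache.solr.search.FieldParams;
import org.apache.solr.util.SolrPluginUtils;

// Invented demo class, not part of the patch.
public class PhraseFieldParsingDemo {
  public static void main(String[] args) {
    // pf2-style input: bigram shingles (wordGrams=2) with a default slop of 1,
    // as if ps2=1 were set; "title^5" carries a boost, "body~3" a per-field slop.
    List<FieldParams> pf2 = SolrPluginUtils.parseFieldBoostsAndSlop(
        new String[] { "title^5 body~3" }, 2, 1);
    for (FieldParams fp : pf2) {
      System.out.println(fp.getField() + " boost=" + fp.getBoost()
          + " wordGrams=" + fp.getWordGrams() + " slop=" + fp.getSlop());
    }
  }
}

Each resulting FieldParams entry then drives one addShingledPhraseQueries pass over the non-field clauses, as the loop below shows.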
- List clauses = null; - int numPluses = 0; - int numMinuses = 0; - int numOR = 0; - int numNOT = 0; - - clauses = splitIntoClauses(userQuery, false); - for (Clause clause : clauses) { - if (clause.must == '+') numPluses++; - if (clause.must == '-') numMinuses++; - if (clause.isBareWord()) { - String s = clause.val; - if ("OR".equals(s)) { - numOR++; - } else if ("NOT".equals(s)) { - numNOT++; - } else if (lowercaseOperators && "or".equals(s)) { - numOR++; - } - } - } - - // Always rebuild mainUserQuery from clauses to catch modifications from splitIntoClauses - // This was necessary for userFields modifications to get propagated into the query. - // Convert lower or mixed case operators to uppercase if we saw them. - // only do this for the lucene query part and not for phrase query boosting - // since some fields might not be case insensitive. - // We don't use a regex for this because it might change and AND or OR in - // a phrase query in a case sensitive field. - StringBuilder sb = new StringBuilder(); - for (int i=0; i0 && i+1 allPhraseFields = new ArrayList(); - allPhraseFields.addAll(phraseFields); - allPhraseFields.addAll(phraseFields2); - allPhraseFields.addAll(phraseFields3); - - if (allPhraseFields.size() > 0) { - // find non-field clauses - List normalClauses = new ArrayList(clauses.size()); - for (Clause clause : clauses) { - if (clause.field != null || clause.isPhrase) continue; - // check for keywords "AND,OR,TO" - if (clause.isBareWord()) { - String s = clause.val.toString(); - // avoid putting explict operators in the phrase query - if ("OR".equals(s) || "AND".equals(s) || "NOT".equals(s) || "TO".equals(s)) continue; - } - normalClauses.add(clause); - } - - // full phrase and shingles - for (FieldParams phraseField: allPhraseFields) { - Map pf = new HashMap(1); - pf.put(phraseField.getField(),phraseField.getBoost()); - addShingledPhraseQueries(query, normalClauses, pf, - phraseField.getWordGrams(),tiebreaker, phraseField.getSlop()); - } - - } - } - - - - /* * * Boosting Query * * */ - boostParams = solrParams.getParams(DisMaxParams.BQ); - //List boostQueries = U.parseQueryStrings(req, boostParams); - boostQueries=null; - if (boostParams!=null && boostParams.length>0) { - boostQueries = new ArrayList(); - for (String qs : boostParams) { - if (qs.trim().length()==0) continue; - Query q = subQuery(qs, null).getQuery(); - boostQueries.add(q); - } - } - if (null != boostQueries) { - for(Query f : boostQueries) { - query.add(f, BooleanClause.Occur.SHOULD); - } - } - - /* * * Boosting Functions * * */ - - String[] boostFuncs = solrParams.getParams(DisMaxParams.BF); - if (null != boostFuncs && 0 != boostFuncs.length) { - for (String boostFunc : boostFuncs) { - if(null == boostFunc || "".equals(boostFunc)) continue; - Map ff = SolrPluginUtils.parseFieldBoosts(boostFunc); - for (String f : ff.keySet()) { - Query fq = subQuery(f, FunctionQParserPlugin.NAME).getQuery(); - Float b = ff.get(f); - if (null != b) { - fq.setBoost(b); - } - query.add(fq, BooleanClause.Occur.SHOULD); - } - } - } - - - // - // create a boosted query (scores multiplied by boosts) - // - Query topQuery = query; - multBoosts = solrParams.getParams("boost"); - if (multBoosts!=null && multBoosts.length>0) { - - List boosts = new ArrayList(); - for (String boostStr : multBoosts) { - if (boostStr==null || boostStr.length()==0) continue; - Query boost = subQuery(boostStr, FunctionQParserPlugin.NAME).getQuery(); - ValueSource vs; - if (boost instanceof FunctionQuery) { - vs = ((FunctionQuery)boost).getValueSource(); 
- } else { - vs = new QueryValueSource(boost, 1.0f); - } - boosts.add(vs); - } - - if (boosts.size()>1) { - ValueSource prod = new ProductFloatFunction(boosts.toArray(new ValueSource[boosts.size()])); - topQuery = new BoostedQuery(query, prod); - } else if (boosts.size() == 1) { - topQuery = new BoostedQuery(query, boosts.get(0)); - } - } - - return topQuery; - } - - /** - * Extracts all the aliased fields from the requests and adds them to up - */ - private void addAliasesFromRequest(ExtendedSolrQueryParser up, float tiebreaker) { - Iterator it = solrParams.getParameterNamesIterator(); - while(it.hasNext()) { - String param = it.next(); - if(param.startsWith("f.") && param.endsWith(".qf")) { - // Add the alias - String fname = param.substring(2,param.length()-3); - String qfReplacement = solrParams.get(param); - Map parsedQf = SolrPluginUtils.parseFieldBoosts(qfReplacement); - if(parsedQf.size() == 0) - return; - up.addAlias(fname, tiebreaker, parsedQf); - } - } - } - - /** - * Modifies the main query by adding a new optional Query consisting - * of shingled phrase queries across the specified clauses using the - * specified field => boost mappings. - * - * @param mainQuery Where the phrase boosting queries will be added - * @param clauses Clauses that will be used to construct the phrases - * @param fields Field => boost mappings for the phrase queries - * @param shingleSize how big the phrases should be, 0 means a single phrase - * @param tiebreaker tie breker value for the DisjunctionMaxQueries - * @param slop slop value for the constructed phrases - */ - private void addShingledPhraseQueries(final BooleanQuery mainQuery, - final List clauses, - final Map fields, - int shingleSize, - final float tiebreaker, - final int slop) - throws SyntaxError { - - if (null == fields || fields.isEmpty() || - null == clauses || clauses.size() < shingleSize ) - return; - - if (0 == shingleSize) shingleSize = clauses.size(); - - final int goat = shingleSize-1; // :TODO: better name for var? - - StringBuilder userPhraseQuery = new StringBuilder(); - for (int i=0; i < clauses.size() - goat; i++) { - userPhraseQuery.append('"'); - for (int j=0; j <= goat; j++) { - userPhraseQuery.append(clauses.get(i + j).val); - userPhraseQuery.append(' '); - } - userPhraseQuery.append('"'); - userPhraseQuery.append(' '); - } - - /* for parsing sloppy phrases using DisjunctionMaxQueries */ - ExtendedSolrQueryParser pp = - new ExtendedSolrQueryParser(this, IMPOSSIBLE_FIELD_NAME); - - pp.addAlias(IMPOSSIBLE_FIELD_NAME, tiebreaker, fields); - pp.setPhraseSlop(slop); - pp.setRemoveStopFilter(true); // remove stop filter and keep stopwords - - /* :TODO: reevaluate using makeDismax=true vs false... - * - * The DismaxQueryParser always used DisjunctionMaxQueries for the - * pf boost, for the same reasons it used them for the qf fields. - * When Yonik first wrote the ExtendedDismaxQParserPlugin, he added - * the "makeDismax=false" property to use BooleanQueries instead, but - * when asked why his response was "I honestly don't recall" ... 
- * - * https://issues.apache.org/jira/browse/SOLR-1553?focusedCommentId=12793813#action_12793813 - * - * so for now, we continue to use dismax style queries becuse it - * seems the most logical and is back compatible, but we should - * try to figure out what Yonik was thinking at the time (because he - * rarely does things for no reason) - */ - pp.makeDismax = true; - - - // minClauseSize is independent of the shingleSize because of stop words - // (if they are removed from the middle, so be it, but we need at least - // two or there shouldn't be a boost) - pp.minClauseSize = 2; - - // TODO: perhaps we shouldn't use synonyms either... - - Query phrase = pp.parse(userPhraseQuery.toString()); - if (phrase != null) { - mainQuery.add(phrase, BooleanClause.Occur.SHOULD); - } - } - - - @Override - public String[] getDefaultHighlightFields() { - String[] highFields = queryFields.keySet().toArray(new String[0]); - return highFields; - } - - @Override - public Query getHighlightQuery() throws SyntaxError { - return parsedUserQuery == null ? altUserQuery : parsedUserQuery; - } - - @Override - public void addDebugInfo(NamedList debugInfo) { - super.addDebugInfo(debugInfo); - debugInfo.add("altquerystring", altUserQuery); - if (null != boostQueries) { - debugInfo.add("boost_queries", boostParams); - debugInfo.add("parsed_boost_queries", - QueryParsing.toString(boostQueries, getReq().getSchema())); - } - debugInfo.add("boostfuncs", getReq().getParams().getParams(DisMaxParams.BF)); - } - - -// FIXME: Not in use -// public static CharSequence partialEscape(CharSequence s) { -// StringBuilder sb = new StringBuilder(); -// -// int len = s.length(); -// for (int i = 0; i < len; i++) { -// char c = s.charAt(i); -// if (c == ':') { -// // look forward to make sure it's something that won't -// // cause a parse exception (something that won't be escaped... like -// // +,-,:, whitespace -// if (i+10) { -// char ch = s.charAt(i+1); -// if (!(Character.isWhitespace(ch) || ch=='+' || ch=='-' || ch==':')) { -// // OK, at this point the chars after the ':' will be fine. -// // now look back and try to determine if this is a fieldname -// // [+,-]? [letter,_] [letter digit,_,-,.]* -// // This won't cover *all* possible lucene fieldnames, but we should -// // only pick nice names to begin with -// int start, pos; -// for (start=i-1; start>=0; start--) { -// ch = s.charAt(start); -// if (Character.isWhitespace(ch)) break; -// } -// -// // skip whitespace -// pos = start+1; -// -// // skip leading + or - -// ch = s.charAt(pos); -// if (ch=='+' || ch=='-') { -// pos++; -// } -// -// // we don't need to explicitly check for end of string -// // since ':' will act as our sentinal -// -// // first char can't be '-' or '.' -// ch = s.charAt(pos++); -// if (Character.isJavaIdentifierPart(ch)) { -// -// for(;;) { -// ch = s.charAt(pos++); -// if (!(Character.isJavaIdentifierPart(ch) || ch=='-' || ch=='.')) { -// break; -// } -// } -// -// if (pos<=i) { -// // OK, we got to the ':' and everything looked like a valid fieldname, so -// // don't escape the ':' -// sb.append(':'); -// continue; // jump back to start of outer-most loop -// } -// -// } -// -// -// } -// } -// -// // we fell through to here, so we should escape this like other reserved chars. -// sb.append('\\'); -// } -// else if (c == '\\' || c == '!' || c == '(' || c == ')' || -// c == '^' || c == '[' || c == ']' || -// c == '{' || c == '}' || c == '~' || c == '*' || c == '?' 
-// ) -// { -// sb.append('\\'); -// } -// sb.append(c); -// } -// return sb; -// } - - - static class Clause { - - boolean isBareWord() { - return must==0 && !isPhrase; - } - - String field; - String rawField; // if the clause is +(foo:bar) then rawField=(foo - boolean isPhrase; - boolean hasWhitespace; - boolean hasSpecialSyntax; - boolean syntaxError; - char must; // + or - - String val; // the field value (minus the field name, +/-, quotes) - String raw; // the raw clause w/o leading/trailing whitespace - } - - - public List splitIntoClauses(String s, boolean ignoreQuote) { - ArrayList lst = new ArrayList(4); - Clause clause; - - int pos=0; - int end=s.length(); - char ch=0; - int start; - boolean disallowUserField; - outer: while (pos < end) { - clause = new Clause(); - disallowUserField = true; - - ch = s.charAt(pos); - - while (Character.isWhitespace(ch)) { - if (++pos >= end) break; - ch = s.charAt(pos); - } - - start = pos; - - if (ch=='+' || ch=='-') { - clause.must = ch; - pos++; - } - - clause.field = getFieldName(s, pos, end); - if(clause.field != null && !userFields.isAllowed(clause.field)) { - clause.field = null; - } - if (clause.field != null) { - disallowUserField = false; - int colon = s.indexOf(':',pos); - clause.rawField = s.substring(pos, colon); - pos += colon - pos; // skip the field name - pos++; // skip the ':' - } - - if (pos>=end) break; - - - char inString=0; - - ch = s.charAt(pos); - if (!ignoreQuote && ch=='"') { - clause.isPhrase = true; - inString = '"'; - pos++; - } - - StringBuilder sb = new StringBuilder(); - while (pos < end) { - ch = s.charAt(pos++); - if (ch=='\\') { // skip escaped chars, but leave escaped - sb.append(ch); - if (pos >= end) { - sb.append(ch); // double backslash if we are at the end of the string - break; - } - ch = s.charAt(pos++); - sb.append(ch); - continue; - } else if (inString != 0 && ch == inString) { - inString=0; - break; - } else if (Character.isWhitespace(ch)) { - clause.hasWhitespace=true; - if (inString == 0) { - // end of the token if we aren't in a string, backing - // up the position. - pos--; - break; - } - } - - if (inString == 0) { - switch (ch) { - case '!': - case '(': - case ')': - case ':': - case '^': - case '[': - case ']': - case '{': - case '}': - case '~': - case '*': - case '?': - case '"': - case '+': - case '-': - case '\\': - case '|': - case '&': - case '/': - clause.hasSpecialSyntax = true; - sb.append('\\'); - } - } else if (ch=='"') { - // only char we need to escape in a string is double quote - sb.append('\\'); - } - sb.append(ch); - } - clause.val = sb.toString(); - - if (clause.isPhrase) { - if (inString != 0) { - // detected bad quote balancing... retry - // parsing with quotes like any other char - return splitIntoClauses(s, true); - } - - // special syntax in a string isn't special - clause.hasSpecialSyntax = false; - } else { - // an empty clause... must be just a + or - on it's own - if (clause.val.length() == 0) { - clause.syntaxError = true; - if (clause.must != 0) { - clause.val="\\"+clause.must; - clause.must = 0; - clause.hasSpecialSyntax = true; - } else { - // uh.. this shouldn't happen. 
- clause=null; - } - } - } - - if (clause != null) { - if(disallowUserField) { - clause.raw = s.substring(start, pos); - // escape colons, except for "match all" query - if(!"*:*".equals(clause.raw)) { - clause.raw = clause.raw.replaceAll(":", "\\\\:"); - } - } else { - clause.raw = s.substring(start, pos); - // Add default userField boost if no explicit boost exists - if(userFields.isAllowed(clause.field) && !clause.raw.contains("^")) { - Float boost = userFields.getBoost(clause.field); - if(boost != null) - clause.raw += "^" + boost; - } - } - lst.add(clause); - } - } - - return lst; - } - - /** - * returns a field name or legal field alias from the current - * position of the string - */ - public String getFieldName(String s, int pos, int end) { - if (pos >= end) return null; - int p=pos; - int colon = s.indexOf(':',pos); - // make sure there is space after the colon, but not whitespace - if (colon<=pos || colon+1>=end || Character.isWhitespace(s.charAt(colon+1))) return null; - char ch = s.charAt(p++); - while ((ch=='(' || ch=='+' || ch=='-') && (pos split(String s, boolean ignoreQuote) { - ArrayList lst = new ArrayList(4); - int pos=0, start=0, end=s.length(); - char inString=0; - char ch=0; - while (pos < end) { - char prevChar=ch; - ch = s.charAt(pos++); - if (ch=='\\') { // skip escaped chars - pos++; - } else if (inString != 0 && ch==inString) { - inString=0; - } else if (!ignoreQuote && ch=='"') { - // If char is directly preceeded by a number or letter - // then don't treat it as the start of a string. - if (!Character.isLetterOrDigit(prevChar)) { - inString=ch; - } - } else if (Character.isWhitespace(ch) && inString==0) { - lst.add(s.substring(start,pos-1)); - start=pos; - } - } - if (start < end) { - lst.add(s.substring(start,end)); - } - - if (inString != 0) { - // unbalanced quote... ignore them - return split(s, true); - } - - return lst; - } - - - - - enum QType { - FIELD, - PHRASE, - PREFIX, - WILDCARD, - FUZZY, - RANGE - } - - - static final RuntimeException unknownField = new RuntimeException("UnknownField"); - static { - unknownField.fillInStackTrace(); - } - - /** - * A subclass of SolrQueryParser that supports aliasing fields for - * constructing DisjunctionMaxQueries. - */ - static class ExtendedSolrQueryParser extends SolrQueryParser { - - - /** A simple container for storing alias info - */ - protected class Alias { - public float tie; - public Map fields; - } - - boolean makeDismax=true; - boolean disableCoord=true; - boolean allowWildcard=true; - int minClauseSize = 0; // minimum number of clauses per phrase query... - // used when constructing boosting part of query via sloppy phrases - boolean exceptions; // allow exceptions to be thrown (for example on a missing field) - - private Map nonStopFilterAnalyzerPerField; - private boolean removeStopFilter; - String minShouldMatch; // for inner boolean queries produced from a single fieldQuery - - /** - * Where we store a map from field name we expect to see in our query - * string, to Alias object containing the fields to use in our - * DisjunctionMaxQuery and the tiebreaker to use. 
- */ - protected Map aliases = new HashMap(3); - - public ExtendedSolrQueryParser(QParser parser, String defaultField) { - super(parser, defaultField); - // don't trust that our parent class won't ever change it's default - setDefaultOperator(QueryParser.Operator.OR); - } - - public void setRemoveStopFilter(boolean remove) { -// analyzer.removeStopFilter = remove; - removeStopFilter = remove; - } - - @Override - protected Query getBooleanQuery(List clauses, boolean disableCoord) throws SyntaxError { - Query q = super.getBooleanQuery(clauses, disableCoord); - if (q != null) { - q = QueryUtils.makeQueryable(q); - } - return q; - } - - - //////////////////////////////////////////////////////////////////////////// - //////////////////////////////////////////////////////////////////////////// - //////////////////////////////////////////////////////////////////////////// - //////////////////////////////////////////////////////////////////////////// - - @Override - protected void addClause(List clauses, int conj, int mods, Query q) { -//System.out.println("addClause:clauses="+clauses+" conj="+conj+" mods="+mods+" q="+q); - super.addClause(clauses, conj, mods, q); - } - - /** - * Add an alias to this query parser. - * - * @param field the field name that should trigger alias mapping - * @param fieldBoosts the mapping from fieldname to boost value that - * should be used to build up the clauses of the - * DisjunctionMaxQuery. - * @param tiebreaker to the tiebreaker to be used in the - * DisjunctionMaxQuery - * @see SolrPluginUtils#parseFieldBoosts - */ - public void addAlias(String field, float tiebreaker, - Map fieldBoosts) { - - Alias a = new Alias(); - a.tie = tiebreaker; - a.fields = fieldBoosts; - aliases.put(field, a); - } - - /** - * Returns the aliases found for a field. 
- * Returns null if there are no aliases for the field - * @return Alias - */ - public Alias getAlias(String field) { - return aliases.get(field); - } - - - QType type; - String field; - String val; - String val2; - boolean bool; - boolean bool2; - float flt; - int slop; - - @Override - protected Query getFieldQuery(String field, String val, boolean quoted) throws SyntaxError { -//System.out.println("getFieldQuery: val="+val); - - this.type = QType.FIELD; - this.field = field; - this.val = val; - this.slop = getPhraseSlop(); // unspecified - return getAliasedQuery(); - } - - @Override - protected Query getFieldQuery(String field, String val, int slop) throws SyntaxError { -//System.out.println("getFieldQuery: val="+val+" slop="+slop); - - this.type = QType.PHRASE; - this.field = field; - this.val = val; - this.slop = slop; - return getAliasedQuery(); - } - - @Override - protected Query getPrefixQuery(String field, String val) throws SyntaxError { -//System.out.println("getPrefixQuery: val="+val); - if (val.equals("") && field.equals("*")) { - return new MatchAllDocsQuery(); - } - this.type = QType.PREFIX; - this.field = field; - this.val = val; - return getAliasedQuery(); - } - - @Override - protected Query newFieldQuery(Analyzer analyzer, String field, String queryText, boolean quoted) throws SyntaxError { - Analyzer actualAnalyzer; - if (removeStopFilter) { - if (nonStopFilterAnalyzerPerField == null) { - nonStopFilterAnalyzerPerField = new HashMap(); - } - actualAnalyzer = nonStopFilterAnalyzerPerField.get(field); - if (actualAnalyzer == null) { - actualAnalyzer = noStopwordFilterAnalyzer(field); - } - } else { - actualAnalyzer = parser.getReq().getSchema().getFieldType(field).getQueryAnalyzer(); - } - return super.newFieldQuery(actualAnalyzer, field, queryText, quoted); - } - - @Override - protected Query getRangeQuery(String field, String a, String b, boolean startInclusive, boolean endInclusive) throws SyntaxError { -//System.out.println("getRangeQuery:"); - - this.type = QType.RANGE; - this.field = field; - this.val = a; - this.val2 = b; - this.bool = startInclusive; - this.bool2 = endInclusive; - return getAliasedQuery(); - } - - @Override - protected Query getWildcardQuery(String field, String val) throws SyntaxError { -//System.out.println("getWildcardQuery: val="+val); - - if (val.equals("*")) { - if (field.equals("*")) { - return new MatchAllDocsQuery(); - } else{ - return getPrefixQuery(field,""); - } - } - this.type = QType.WILDCARD; - this.field = field; - this.val = val; - return getAliasedQuery(); - } - - @Override - protected Query getFuzzyQuery(String field, String val, float minSimilarity) throws SyntaxError { -//System.out.println("getFuzzyQuery: val="+val); - - this.type = QType.FUZZY; - this.field = field; - this.val = val; - this.flt = minSimilarity; - return getAliasedQuery(); - } - - /** - * Delegates to the super class unless the field has been specified - * as an alias -- in which case we recurse on each of - * the aliased fields, and the results are composed into a - * DisjunctionMaxQuery. (so yes: aliases which point at other - * aliases should work) - */ - protected Query getAliasedQuery() throws SyntaxError { - Alias a = aliases.get(field); - this.validateCyclicAliasing(field); - if (a != null) { - List lst = getQueries(a); - if (lst == null || lst.size()==0) - return getQuery(); - // make a DisjunctionMaxQuery in this case too... 
it will stop - // the "mm" processing from making everything required in the case - // that the query expanded to multiple clauses. - // DisMaxQuery.rewrite() removes itself if there is just a single clause anyway. - // if (lst.size()==1) return lst.get(0); - - if (makeDismax) { - DisjunctionMaxQuery q = new DisjunctionMaxQuery(lst, a.tie); - return q; - } else { - // should we disable coord? - BooleanQuery q = new BooleanQuery(disableCoord); - for (Query sub : lst) { - q.add(sub, BooleanClause.Occur.SHOULD); - } - return q; - } - } else { - - // verify that a fielded query is actually on a field that exists... if not, - // then throw an exception to get us out of here, and we'll treat it like a - // literal when we try the escape+re-parse. - if (exceptions) { - FieldType ft = schema.getFieldTypeNoEx(field); - if (ft == null && null == MagicFieldName.get(field)) { - throw unknownField; - } - } - - return getQuery(); - } - } - - /** - * Validate there is no cyclic referencing in the aliasing - */ - private void validateCyclicAliasing(String field) throws SyntaxError { - Set set = new HashSet(); - set.add(field); - if(validateField(field, set)) { - throw new SyntaxError("Field aliases lead to a cycle"); - } - } - - private boolean validateField(String field, Set set) { - if(this.getAlias(field) == null) { - return false; - } - boolean hascycle = false; - for(String referencedField:this.getAlias(field).fields.keySet()) { - if(!set.add(referencedField)) { - hascycle = true; - } else { - if(validateField(referencedField, set)) { - hascycle = true; - } - set.remove(referencedField); - } - } - return hascycle; - } - - protected List getQueries(Alias a) throws SyntaxError { - if (a == null) return null; - if (a.fields.size()==0) return null; - List lst= new ArrayList(4); - - for (String f : a.fields.keySet()) { - this.field = f; - Query sub = getAliasedQuery(); - if (sub != null) { - Float boost = a.fields.get(f); - if (boost != null) { - sub.setBoost(boost); - } - lst.add(sub); - } - } - return lst; - } - - private Query getQuery() { - try { - - switch (type) { - case FIELD: // fallthrough - case PHRASE: - Query query = super.getFieldQuery(field, val, type == QType.PHRASE); - // A BooleanQuery is only possible from getFieldQuery if it came from - // a single whitespace separated term. In this case, check the coordination - // factor on the query: if its enabled, that means we aren't a set of synonyms - // but instead multiple terms from one whitespace-separated term, we must - // apply minShouldMatch here so that it works correctly with other things - // like aliasing. 
- if (query instanceof BooleanQuery) { - BooleanQuery bq = (BooleanQuery) query; - if (!bq.isCoordDisabled()) { - SolrPluginUtils.setMinShouldMatch(bq, minShouldMatch); - } - } - if (query instanceof PhraseQuery) { - PhraseQuery pq = (PhraseQuery)query; - if (minClauseSize > 1 && pq.getTerms().length < minClauseSize) return null; - ((PhraseQuery)query).setSlop(slop); - } else if (query instanceof MultiPhraseQuery) { - MultiPhraseQuery pq = (MultiPhraseQuery)query; - if (minClauseSize > 1 && pq.getTermArrays().size() < minClauseSize) return null; - ((MultiPhraseQuery)query).setSlop(slop); - } else if (minClauseSize > 1) { - // if it's not a type of phrase query, it doesn't meet the minClauseSize requirements - return null; - } - return query; - case PREFIX: return super.getPrefixQuery(field, val); - case WILDCARD: return super.getWildcardQuery(field, val); - case FUZZY: return super.getFuzzyQuery(field, val, flt); - case RANGE: return super.getRangeQuery(field, val, val2, bool, bool2); - } - return null; - - } catch (Exception e) { - // an exception here is due to the field query not being compatible with the input text - // for example, passing a string to a numeric field. - return null; - } - } - - private Analyzer noStopwordFilterAnalyzer(String fieldName) { - FieldType ft = parser.getReq().getSchema().getFieldType(fieldName); - Analyzer qa = ft.getQueryAnalyzer(); - if (!(qa instanceof TokenizerChain)) { - return qa; - } - - TokenizerChain tcq = (TokenizerChain) qa; - Analyzer ia = ft.getAnalyzer(); - if (ia == qa || !(ia instanceof TokenizerChain)) { - return qa; - } - TokenizerChain tci = (TokenizerChain) ia; - - // make sure that there isn't a stop filter in the indexer - for (TokenFilterFactory tf : tci.getTokenFilterFactories()) { - if (tf instanceof StopFilterFactory) { - return qa; - } - } - - // now if there is a stop filter in the query analyzer, remove it - int stopIdx = -1; - TokenFilterFactory[] facs = tcq.getTokenFilterFactories(); - - for (int i = 0; i < facs.length; i++) { - TokenFilterFactory tf = facs[i]; - if (tf instanceof StopFilterFactory) { - stopIdx = i; - break; - } - } - - if (stopIdx == -1) { - // no stop filter exists - return qa; - } - - TokenFilterFactory[] newtf = new TokenFilterFactory[facs.length - 1]; - for (int i = 0, j = 0; i < facs.length; i++) { - if (i == stopIdx) continue; - newtf[j++] = facs[i]; - } - - TokenizerChain newa = new TokenizerChain(tcq.getTokenizerFactory(), newtf); - newa.setPositionIncrementGap(tcq.getPositionIncrementGap(fieldName)); - return newa; - } - } - - static boolean isEmpty(Query q) { - if (q==null) return true; - if (q instanceof BooleanQuery && ((BooleanQuery)q).clauses().size()==0) return true; - return false; - } - - /** - * Class that encapsulates the input from userFields parameter and can answer whether - * a field allowed or disallowed as fielded query in the query string - */ - static class UserFields { - private Map userFieldsMap; - private DynamicField[] dynamicUserFields; - private DynamicField[] negativeDynamicUserFields; - - UserFields(Map ufm) { - userFieldsMap = ufm; - if (0 == userFieldsMap.size()) { - userFieldsMap.put("*", null); - } - - // Process dynamic patterns in userFields - ArrayList dynUserFields = new ArrayList(); - ArrayList negDynUserFields = new ArrayList(); - for(String f : userFieldsMap.keySet()) { - if(f.contains("*")) { - if(f.startsWith("-")) - negDynUserFields.add(new DynamicField(f.substring(1))); - else - dynUserFields.add(new DynamicField(f)); - } - } - 
Collections.sort(dynUserFields); - dynamicUserFields = dynUserFields.toArray(new DynamicField[dynUserFields.size()]); - Collections.sort(negDynUserFields); - negativeDynamicUserFields = negDynUserFields.toArray(new DynamicField[negDynUserFields.size()]); -// System.out.println("** userF="+userFieldsMap+", dynUF="+Arrays.toString(dynamicUserFields)+", negDynUF="+Arrays.toString(negativeDynamicUserFields)); - } - - /** - * Is the given field name allowed according to UserFields spec given in the uf parameter? - * @param fname the field name to examine - * @return true if the fielded queries are allowed on this field - */ - public boolean isAllowed(String fname) { - boolean res = ((userFieldsMap.containsKey(fname) || isDynField(fname, false)) && - !userFieldsMap.containsKey("-"+fname) && - !isDynField(fname, true)); - return res; - } - - private boolean isDynField(String field, boolean neg) { - return getDynFieldForName(field, neg) == null ? false : true; - } - - private String getDynFieldForName(String f, boolean neg) { - for( DynamicField df : neg?negativeDynamicUserFields:dynamicUserFields ) { - if( df.matches( f ) ) return df.wildcard; - } - return null; - } - - /** - * Finds the default user field boost associated with the given field. - * This is parsed from the uf parameter, and may be specified as wildcards, e.g. *name^2.0 or *^3.0 - * @param field the field to find boost for - * @return the float boost value associated with the given field or a wildcard matching the field - */ - public Float getBoost(String field) { - return (userFieldsMap.containsKey(field)) ? - userFieldsMap.get(field) : // Exact field - userFieldsMap.get(getDynFieldForName(field, false)); // Dynamic field - } - } - - /* Represents a dynamic field, for easier matching, inspired by same class in IndexSchema */ - static class DynamicField implements Comparable { - final static int STARTS_WITH=1; - final static int ENDS_WITH=2; - final static int CATCHALL=3; - - final String wildcard; - final int type; - - final String str; - - protected DynamicField(String wildcard) { - this.wildcard = wildcard; - if (wildcard.equals("*")) { - type=CATCHALL; - str=null; - } - else if (wildcard.startsWith("*")) { - type=ENDS_WITH; - str=wildcard.substring(1); - } - else if (wildcard.endsWith("*")) { - type=STARTS_WITH; - str=wildcard.substring(0,wildcard.length()-1); - } - else { - throw new RuntimeException("dynamic field name must start or end with *"); - } - } - - /* - * Returns true if the regex wildcard for this DynamicField would match the input field name - */ - public boolean matches(String name) { - if (type==CATCHALL) return true; - else if (type==STARTS_WITH && name.startsWith(str)) return true; - else if (type==ENDS_WITH && name.endsWith(str)) return true; - else return false; - } - - /** - * Sort order is based on length of regex. Longest comes first. - * @param other The object to compare to. - * @return a negative integer, zero, or a positive integer - * as this object is less than, equal to, or greater than - * the specified object. 
- */ - @Override - public int compareTo(DynamicField other) { - return other.wildcard.length() - wildcard.length(); - } - - @Override - public String toString() { - return this.wildcard; - } - } -} +} \ No newline at end of file diff --git a/solr/core/src/java/org/apache/solr/servlet/SolrRequestParsers.java b/solr/core/src/java/org/apache/solr/servlet/SolrRequestParsers.java index 54783966777..3a7caf35b57 100644 --- a/solr/core/src/java/org/apache/solr/servlet/SolrRequestParsers.java +++ b/solr/core/src/java/org/apache/solr/servlet/SolrRequestParsers.java @@ -20,9 +20,13 @@ package org.apache.solr.servlet; import java.io.File; import java.io.IOException; import java.io.InputStream; -import java.io.UnsupportedEncodingException; +import java.io.ByteArrayOutputStream; +import java.nio.ByteBuffer; +import java.nio.charset.CharacterCodingException; +import java.nio.charset.Charset; +import java.nio.charset.CharsetDecoder; +import java.nio.charset.CodingErrorAction; import java.net.URL; -import java.net.URLDecoder; import java.util.ArrayList; import java.util.Collection; import java.util.HashMap; @@ -32,20 +36,20 @@ import java.util.Locale; import java.util.Map; import org.slf4j.Logger; import org.slf4j.LoggerFactory; -import org.apache.commons.io.IOUtils; -import org.apache.commons.io.input.BoundedInputStream; import javax.servlet.http.HttpServletRequest; import org.apache.commons.fileupload.FileItem; import org.apache.commons.fileupload.disk.DiskFileItemFactory; import org.apache.commons.fileupload.servlet.ServletFileUpload; +import org.apache.lucene.util.IOUtils; import org.apache.solr.common.SolrException; import org.apache.solr.common.params.CommonParams; import org.apache.solr.common.params.MultiMapSolrParams; import org.apache.solr.common.params.SolrParams; import org.apache.solr.common.util.ContentStream; import org.apache.solr.common.util.ContentStreamBase; +import org.apache.solr.common.util.FastInputStream; import org.apache.solr.core.Config; import org.apache.solr.core.SolrCore; import org.apache.solr.request.SolrQueryRequest; @@ -71,7 +75,7 @@ public class SolrRequestParsers /** Default instance for e.g. admin requests. Limits to 2 MB uploads and does not allow remote streams. */ public static final SolrRequestParsers DEFAULT = new SolrRequestParsers(); - + /** * Pass in an xml configuration. A null configuration will enable * everything with maximum values. 
@@ -197,37 +201,140 @@
    */
   public static MultiMapSolrParams parseQueryString(String queryString) {
     Map map = new HashMap();
-    parseQueryString(queryString, "UTF-8", map);
+    parseQueryString(queryString, map);
     return new MultiMapSolrParams(map);
   }
   
   /**
-   * Given a url-encoded query string, map it into the given map
+   * Given a url-encoded query string (UTF-8), map it into the given map
    * @param queryString as given from URL
-   * @param charset to be used to decode %-encoding
    * @param map place all parameters in this map
    */
-  static void parseQueryString(String queryString, String charset, Map map) {
-    if( queryString != null && queryString.length() > 0 ) {
+  static void parseQueryString(final String queryString, final Map map) {
+    if (queryString != null && queryString.length() > 0) {
       try {
-        for( String kv : queryString.split( "&" ) ) {
-          int idx = kv.indexOf( '=' );
-          if( idx >= 0 ) {
-            String name = URLDecoder.decode( kv.substring( 0, idx ), charset);
-            String value = URLDecoder.decode( kv.substring( idx+1 ), charset);
-            MultiMapSolrParams.addParam( name, value, map );
-          } else {
-            String name = URLDecoder.decode( kv, charset );
-            MultiMapSolrParams.addParam( name, "", map );
+        final int len = queryString.length();
+        // this input stream emulates reading the raw bytes of the URL as passed to the servlet
+        // container; it disallows any byte > 127 and requires such bytes to be %-escaped:
+        final InputStream in = new InputStream() {
+          int pos = 0;
+          @Override
+          public int read() {
+            if (pos < len) {
+              final char ch = queryString.charAt(pos);
+              if (ch > 127) {
+                throw new SolrException(SolrException.ErrorCode.BAD_REQUEST, "URLDecoder: The query string contains a non-%-escaped byte > 127 at position " + pos);
+              }
+              pos++;
+              return ch;
+            } else {
+              return -1;
+            }
           }
-        }
-      }
-      catch( UnsupportedEncodingException uex ) {
-        throw new SolrException( SolrException.ErrorCode.SERVER_ERROR, uex );
+        };
+        parseFormDataContent(in, Long.MAX_VALUE, IOUtils.CHARSET_UTF_8, map);
+      } catch (IOException ioe) {
+        throw new SolrException(SolrException.ErrorCode.BAD_REQUEST, ioe);
       }
     }
   }
-  
+
+  /**
+   * Given a url-encoded form from POST content (as InputStream), map it into the given map.
+   * The given InputStream should be buffered!
+ * @param postContent to be parsed + * @param charset to be used to decode resulting bytes after %-decoding + * @param map place all parameters in this map + */ + @SuppressWarnings("fallthrough") + static long parseFormDataContent(final InputStream postContent, final long maxLen, final Charset charset, final Map map) throws IOException { + final CharsetDecoder charsetDecoder = charset.newDecoder() + .onMalformedInput(CodingErrorAction.REPORT) + .onUnmappableCharacter(CodingErrorAction.REPORT); + long len = 0L, keyPos = 0L, valuePos = 0L; + final ByteArrayOutputStream2 keyStream = new ByteArrayOutputStream2(), + valueStream = new ByteArrayOutputStream2(); + ByteArrayOutputStream2 currentStream = keyStream; + for(;;) { + int b = postContent.read(); + switch (b) { + case -1: // end of stream + case '&': // separator + if (keyStream.size() > 0) { + final String key = decodeChars(keyStream, keyPos, charsetDecoder), value = decodeChars(valueStream, valuePos, charsetDecoder); + MultiMapSolrParams.addParam(key, value, map); + } else if (valueStream.size() > 0) { + throw new SolrException(SolrException.ErrorCode.BAD_REQUEST, "application/x-www-form-urlencoded invalid: missing key"); + } + keyStream.reset(); + valueStream.reset(); + keyPos = valuePos = len + 1; + currentStream = keyStream; + break; + case '+': // space replacement + currentStream.write(' '); + break; + case '%': // escape + final int upper = digit16(b = postContent.read()); + len++; + final int lower = digit16(b = postContent.read()); + len++; + currentStream.write(((upper << 4) + lower)); + break; + case '=': // kv separator + if (currentStream == keyStream) { + valuePos = len + 1; + currentStream = valueStream; + break; + } + // fall-through + default: + currentStream.write(b); + } + if (b == -1) { + break; + } + len++; + if (len > maxLen) { + throw new SolrException(SolrException.ErrorCode.BAD_REQUEST, "application/x-www-form-urlencoded content exceeds upload limit of " + (maxLen/1024L) + " KB"); + } + } + return len; + } + + private static String decodeChars(ByteArrayOutputStream2 stream, long position, CharsetDecoder charsetDecoder) { + try { + return charsetDecoder.decode(ByteBuffer.wrap(stream.buffer(), 0, stream.size())).toString(); + } catch (CharacterCodingException cce) { + throw new SolrException(SolrException.ErrorCode.BAD_REQUEST, + "URLDecoder: Invalid character encoding detected after position " + position + + " of query string / form data (while parsing as " + charsetDecoder.charset().name() + ")" + ); + } + } + + /** Makes the buffer of ByteArrayOutputStream available without copy. 
*/ + static final class ByteArrayOutputStream2 extends ByteArrayOutputStream { + byte[] buffer() { + return buf; + } + } + + private static int digit16(int b) { + if (b == -1) { + throw new SolrException(SolrException.ErrorCode.BAD_REQUEST, "URLDecoder: Incomplete trailing escape (%) pattern"); + } + if (b >= '0' && b <= '9') { + return b - '0'; + } + if (b >= 'A' && b <= 'F') { + return b - ('A' - 10); + } + if (b >= 'a' && b <= 'f') { + return b - ('a' - 10); + } + throw new SolrException(SolrException.ErrorCode.BAD_REQUEST, "URLDecoder: Invalid digit (" + ((char) b) + ") in escape (%) pattern"); + } + public boolean isHandleSelect() { return handleSelect; } @@ -404,15 +511,12 @@ class FormDataRequestParser implements SolrRequestParser throw new SolrException( SolrException.ErrorCode.BAD_REQUEST, "Not application/x-www-form-urlencoded content: "+req.getContentType() ); } - String charset = ContentStreamBase.getCharsetFromContentType(req.getContentType()); - if (charset == null) charset = "UTF-8"; - final Map map = new HashMap(); // also add possible URL parameters and include into the map (parsed using UTF-8): final String qs = req.getQueryString(); if (qs != null) { - SolrRequestParsers.parseQueryString(qs, "UTF-8", map); + SolrRequestParsers.parseQueryString(qs, map); } // may be -1, so we check again later. But if its already greater we can stop processing! @@ -424,26 +528,21 @@ class FormDataRequestParser implements SolrRequestParser } // get query String from request body, using the charset given in content-type: - final InputStream in; + final String cs = ContentStreamBase.getCharsetFromContentType(req.getContentType()); + final Charset charset = (cs == null) ? IOUtils.CHARSET_UTF_8 : Charset.forName(cs); + InputStream in = null; try { in = req.getInputStream(); - } catch (IllegalStateException ise) { - throw (SolrException) getParameterIncompatibilityException().initCause(ise); - } - try { - final String data = IOUtils.toString(new BoundedInputStream(in, maxLength), charset); - // if there is remaining data in the underlying stream, throw exception: - if (in.read() != -1) { - // read remaining data and throw away: - while (IOUtils.skip(in, 1024L) > 0); - throw new SolrException(SolrException.ErrorCode.BAD_REQUEST, "application/x-www-form-urlencoded content exceeds upload limit of " + uploadLimitKB + " KB"); - } - if (data.length() == 0 && totalLength > 0L) { + final long bytesRead = SolrRequestParsers.parseFormDataContent(FastInputStream.wrap(in), maxLength, charset, map); + if (bytesRead == 0L && totalLength > 0L) { throw getParameterIncompatibilityException(); } - SolrRequestParsers.parseQueryString(data, charset, map); + } catch (IOException ioe) { + throw new SolrException(SolrException.ErrorCode.BAD_REQUEST, ioe); + } catch (IllegalStateException ise) { + throw (SolrException) getParameterIncompatibilityException().initCause(ise); } finally { - IOUtils.closeQuietly(in); + IOUtils.closeWhileHandlingException(in); } return new MultiMapSolrParams(map); diff --git a/solr/core/src/java/org/apache/solr/update/processor/FieldMutatingUpdateProcessorFactory.java b/solr/core/src/java/org/apache/solr/update/processor/FieldMutatingUpdateProcessorFactory.java index 249cf0b28ff..01b0f536e3c 100644 --- a/solr/core/src/java/org/apache/solr/update/processor/FieldMutatingUpdateProcessorFactory.java +++ b/solr/core/src/java/org/apache/solr/update/processor/FieldMutatingUpdateProcessorFactory.java @@ -242,7 +242,7 @@ public abstract class FieldMutatingUpdateProcessorFactory /** * Removes all 
instance of the key from NamedList, returning the Set of - * Strings that key refered to. Throws an error if the key didn't refer + * Strings that key referred to. Throws an error if the key didn't refer * to one or more strings (or arrays of strings) * @exception SolrException invalid arr/str structure. */ diff --git a/solr/core/src/java/org/apache/solr/util/SolrPluginUtils.java b/solr/core/src/java/org/apache/solr/util/SolrPluginUtils.java index 1fff3440d25..7fcc6bf29a8 100644 --- a/solr/core/src/java/org/apache/solr/util/SolrPluginUtils.java +++ b/solr/core/src/java/org/apache/solr/util/SolrPluginUtils.java @@ -622,7 +622,7 @@ public class SolrPluginUtils { private final static Pattern CONSECUTIVE_OP_PATTERN = Pattern.compile( "\\s+[+-](?:\\s*[+-]+)+" ); /** - * Strips operators that are used illegally, otherwise reuturns it's + * Strips operators that are used illegally, otherwise returns its * input. Some examples of illegal user queries are: "chocolate +- * chip", "chocolate - - chip", and "chocolate chip -". */ diff --git a/solr/core/src/test-files/solr/collection1/conf/schema12.xml b/solr/core/src/test-files/solr/collection1/conf/schema12.xml index 57eeb311c5e..1ae814f2488 100755 --- a/solr/core/src/test-files/solr/collection1/conf/schema12.xml +++ b/solr/core/src/test-files/solr/collection1/conf/schema12.xml @@ -597,6 +597,7 @@ + diff --git a/solr/core/src/test/org/apache/solr/search/TestExtendedDismaxParser.java b/solr/core/src/test/org/apache/solr/search/TestExtendedDismaxParser.java index d51d6dc9d70..709eb46211e 100755 --- a/solr/core/src/test/org/apache/solr/search/TestExtendedDismaxParser.java +++ b/solr/core/src/test/org/apache/solr/search/TestExtendedDismaxParser.java @@ -17,8 +17,21 @@ package org.apache.solr.search; +import java.util.HashSet; +import java.util.Set; + +import org.apache.lucene.search.BooleanClause; +import org.apache.lucene.search.BooleanQuery; +import org.apache.lucene.search.DisjunctionMaxQuery; +import org.apache.lucene.search.FuzzyQuery; +import org.apache.lucene.search.Query; +import org.apache.lucene.search.TermQuery; import org.apache.solr.common.SolrException; +import org.apache.solr.common.params.ModifiableSolrParams; +import org.apache.solr.common.params.SolrParams; +import org.apache.solr.request.SolrQueryRequest; import org.apache.solr.util.AbstractSolrTestCase; +import org.apache.solr.util.SolrPluginUtils; import org.junit.BeforeClass; import org.junit.Test; @@ -510,37 +523,42 @@ public class TestExtendedDismaxParser extends AbstractSolrTestCase { public void testCyclicAliasing() throws Exception { try { - h.query(req("defType","edismax", "q","ignore_exception", "qf","who", "f.who.qf","name","f.name.qf","who")); - fail("Simple cyclic alising"); - } catch (SolrException e) { - assertTrue(e.getCause().getMessage().contains("Field aliases lead to a cycle")); - } - - try { - h.query(req("defType","edismax", "q","ignore_exception", "qf","who", "f.who.qf","name","f.name.qf","myalias", "f.myalias.qf","who")); - fail(); - } catch (SolrException e) { - assertTrue(e.getCause().getMessage().contains("Field aliases lead to a cycle")); - } - - try { - h.query(req("defType","edismax", "q","ignore_exception", "qf","field1", "f.field1.qf","field2 field3","f.field2.qf","field4 field5", "f.field4.qf","field5", "f.field5.qf","field6", "f.field3.qf","field6")); - } catch (SolrException e) { - fail("This is not cyclic alising"); - } - - try { - h.query(req("defType","edismax", "q","ignore_exception", "qf","field1", "f.field1.qf","field2 field3", 
"f.field2.qf","field4 field5", "f.field4.qf","field5", "f.field5.qf","field4")); - fail(); - } catch (SolrException e) { - assertTrue(e.getCause().getMessage().contains("Field aliases lead to a cycle")); - } - - try { - h.query(req("defType","edismax", "q","who:(Zapp Pig) ignore_exception", "qf","field1", "f.who.qf","name","f.name.qf","myalias", "f.myalias.qf","who")); - fail(); - } catch (SolrException e) { - assertTrue(e.getCause().getMessage().contains("Field aliases lead to a cycle")); + ignoreException(".*Field aliases lead to a cycle.*"); + try { + h.query(req("defType","edismax", "q","blarg", "qf","who", "f.who.qf","name","f.name.qf","who")); + fail("Simple cyclic alising not detected"); + } catch (SolrException e) { + assertTrue(e.getCause().getMessage().contains("Field aliases lead to a cycle")); + } + + try { + h.query(req("defType","edismax", "q","blarg", "qf","who", "f.who.qf","name","f.name.qf","myalias", "f.myalias.qf","who")); + fail("Cyclic alising not detected"); + } catch (SolrException e) { + assertTrue(e.getCause().getMessage().contains("Field aliases lead to a cycle")); + } + + try { + h.query(req("defType","edismax", "q","blarg", "qf","field1", "f.field1.qf","field2 field3","f.field2.qf","field4 field5", "f.field4.qf","field5", "f.field5.qf","field6", "f.field3.qf","field6")); + } catch (SolrException e) { + fail("This is not cyclic alising"); + } + + try { + h.query(req("defType","edismax", "q","blarg", "qf","field1", "f.field1.qf","field2 field3", "f.field2.qf","field4 field5", "f.field4.qf","field5", "f.field5.qf","field4")); + fail("Cyclic alising not detected"); + } catch (SolrException e) { + assertTrue(e.getCause().getMessage().contains("Field aliases lead to a cycle")); + } + + try { + h.query(req("defType","edismax", "q","who:(Zapp Pig)", "qf","field1", "f.who.qf","name","f.name.qf","myalias", "f.myalias.qf","who")); + fail("Cyclic alising not detected"); + } catch (SolrException e) { + assertTrue(e.getCause().getMessage().contains("Field aliases lead to a cycle")); + } + } finally { + resetExceptionIgnores(); } } @@ -930,4 +948,157 @@ public class TestExtendedDismaxParser extends AbstractSolrTestCase { "defType", "edismax") , "*[count(//doc)=1]"); } + + public void testEdismaxSimpleExtension() throws SyntaxError { + ModifiableSolrParams params = new ModifiableSolrParams(); + params.set("q", "foo bar"); + params.set("qf", "subject title^5"); + params.set("qf_fr", "subject_fr title_fr^5"); + params.set("qf_en", "subject_en title_en^5"); + params.set("qf_es", "subject_es title_es^5"); + + MultilanguageQueryParser parser = new MultilanguageQueryParser("foo bar", new ModifiableSolrParams(), params, req(params)); + Query query = parser.parse(); + assertNotNull(query); + assertTrue(containsClause(query, "title", "foo", 5, false)); + assertTrue(containsClause(query, "title", "bar", 5, false)); + assertTrue(containsClause(query, "subject", "foo", 1, false)); + assertTrue(containsClause(query, "subject", "bar", 1, false)); + + params.set("language", "es"); + parser = new MultilanguageQueryParser("foo bar", new ModifiableSolrParams(), params, req(params)); + query = parser.parse(); + assertNotNull(query); + assertTrue(containsClause(query, "title_es", "foo", 5, false)); + assertTrue(containsClause(query, "title_es", "bar", 5, false)); + assertTrue(containsClause(query, "subject_es", "foo", 1, false)); + assertTrue(containsClause(query, "subject_es", "bar", 1, false)); + + FuzzyDismaxQParser parser2 = new FuzzyDismaxQParser("foo bar absence", new ModifiableSolrParams(), 
params, req(params)); + query = parser2.parse(); + assertNotNull(query); + assertTrue(containsClause(query, "title", "foo", 5, false)); + assertTrue(containsClause(query, "title", "bar", 5, false)); + assertTrue(containsClause(query, "title", "absence", 5, true)); + + } + + private boolean containsClause(Query query, String field, String value, + int boost, boolean fuzzy) { + + if(query instanceof BooleanQuery) { + return containsClause((BooleanQuery)query, field, value, boost, fuzzy); + } + if(query instanceof DisjunctionMaxQuery) { + return containsClause((DisjunctionMaxQuery)query, field, value, boost, fuzzy); + } + if(query instanceof TermQuery && !fuzzy) { + return containsClause((TermQuery)query, field, value, boost); + } + if(query instanceof FuzzyQuery && fuzzy) { + return containsClause((FuzzyQuery)query, field, value, boost); + } + return false; + } + + private boolean containsClause(FuzzyQuery query, String field, String value, + int boost) { + if(query.getTerm().field().equals(field) && + query.getTerm().bytes().utf8ToString().equals(value) && + query.getBoost() == boost) { + return true; + } + return false; + } + + private boolean containsClause(BooleanQuery query, String field, String value, int boost, boolean fuzzy) { + for(BooleanClause clause:query.getClauses()) { + if(containsClause(clause.getQuery(), field, value, boost, fuzzy)) { + return true; + } + } + return false; + } + + private boolean containsClause(TermQuery query, String field, String value, int boost) { + if(query.getTerm().field().equals(field) && + query.getTerm().bytes().utf8ToString().equals(value) && + query.getBoost() == boost) { + return true; + } + return false; + } + + private boolean containsClause(DisjunctionMaxQuery query, String field, String value, int boost, boolean fuzzy) { + for(Query disjunct:query.getDisjuncts()) { + if(containsClause(disjunct, field, value, boost, fuzzy)) { + return true; + } + } + return false; + } + + class MultilanguageQueryParser extends ExtendedDismaxQParser { + + public MultilanguageQueryParser(String qstr, SolrParams localParams, + SolrParams params, SolrQueryRequest req) { + super(qstr, localParams, params, req); + } + + @Override + protected ExtendedDismaxConfiguration createConfiguration(String qstr, + SolrParams localParams, SolrParams params, SolrQueryRequest req) { + return new MultilanguageDismaxConfiguration(localParams, params, req); + } + + class MultilanguageDismaxConfiguration extends ExtendedDismaxConfiguration { + + public MultilanguageDismaxConfiguration(SolrParams localParams, + SolrParams params, SolrQueryRequest req) { + super(localParams, params, req); + String language = params.get("language"); + if(language != null) { + super.queryFields = SolrPluginUtils.parseFieldBoosts(solrParams.getParams("qf_" + language)); + } + } + + } + + } + + + + class FuzzyDismaxQParser extends ExtendedDismaxQParser { + + public FuzzyDismaxQParser(String qstr, SolrParams localParams, + SolrParams params, SolrQueryRequest req) { + super(qstr, localParams, params, req); + } + + @Override + protected ExtendedSolrQueryParser createEdismaxQueryParser(QParser qParser, + String field) { + return new FuzzyQueryParser(qParser, field); + } + + class FuzzyQueryParser extends ExtendedSolrQueryParser{ + + private Set frequentlyMisspelledWords; + + public FuzzyQueryParser(QParser parser, String defaultField) { + super(parser, defaultField); + frequentlyMisspelledWords = new HashSet(); + frequentlyMisspelledWords.add("absence"); + } + + @Override + protected Query 
diff --git a/solr/core/src/test/org/apache/solr/servlet/DirectSolrConnectionTest.java b/solr/core/src/test/org/apache/solr/servlet/DirectSolrConnectionTest.java
index 08c3766b593..bd566fcf592 100644
--- a/solr/core/src/test/org/apache/solr/servlet/DirectSolrConnectionTest.java
+++ b/solr/core/src/test/org/apache/solr/servlet/DirectSolrConnectionTest.java
@@ -17,6 +17,8 @@
 
 package org.apache.solr.servlet;
 
+import java.net.URLEncoder;
+
 import org.apache.solr.common.params.CommonParams;
 import org.apache.solr.util.AbstractSolrTestCase;
 import org.junit.BeforeClass;
@@ -74,7 +76,7 @@ public class DirectSolrConnectionTest extends AbstractSolrTestCase
 
     // Test using the Stream body parameter
     for( String cmd : cmds ) {
-      direct.request( "/update?"+CommonParams.STREAM_BODY+"="+cmd, null );
+      direct.request( "/update?"+CommonParams.STREAM_BODY+"="+URLEncoder.encode(cmd, "UTF-8"), null );
     }
     String got = direct.request( getIt, null );
     assertTrue( got.indexOf( value ) > 0 );
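The URLEncoder.encode() fix above matters because update commands contain characters such as '<', '>', '/' and spaces that are not valid raw inside a URL query string, and the stricter query-string parsing exercised in SolrRequestParserTest below rejects unescaped input. A minimal, self-contained illustration of the encoding step (the command string here is just an example):

    import java.net.URLEncoder;

    public class StreamBodyEncodingDemo {
      public static void main(String[] args) throws Exception {
        // An example Solr update command containing URL metacharacters:
        String cmd = "<delete><query>*:*</query></delete>";
        // Percent-encode it so it can ride in the stream.body parameter:
        String encoded = URLEncoder.encode(cmd, "UTF-8");
        // Prints: %3Cdelete%3E%3Cquery%3E*%3A*%3C%2Fquery%3E%3C%2Fdelete%3E
        System.out.println(encoded);
      }
    }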
assertEquals("", params.get("muh")); + + final String[] invalid = { + "q=h%FCllo", // non-UTF-8 + "q=h\u00FCllo", // encoded string is not pure US-ASCII + "q=hallo%", // incomplete escape + "q=hallo%1", // incomplete escape + "q=hallo%XX123", // invalid digit 'X' in escape + "=hallo" // missing key + }; + for (String s : invalid) { + try { + SolrRequestParsers.parseQueryString(s); + fail("Should throw SolrException"); + } catch (SolrException se) { + // pass + } } } @@ -172,7 +202,7 @@ public class SolrRequestParserTest extends SolrTestCaseJ4 { public void testStandardParseParamsAndFillStreams() throws Exception { final String getParams = "qt=%C3%BC&dup=foo", postParams = "q=hello&d%75p=bar"; - final byte[] postBytes = postParams.getBytes("UTF-8"); + final byte[] postBytes = postParams.getBytes("US-ASCII"); // Set up the expected behavior final String[] ct = new String[] { @@ -224,7 +254,7 @@ public class SolrRequestParserTest extends SolrTestCaseJ4 { expect(request.getContentLength()).andReturn(-1).anyTimes(); expect(request.getQueryString()).andReturn(null).anyTimes(); expect(request.getInputStream()).andReturn(new ServletInputStream() { - private final ByteArrayInputStream in = new ByteArrayInputStream(large.toString().getBytes("UTF-8")); + private final ByteArrayInputStream in = new ByteArrayInputStream(large.toString().getBytes("US-ASCII")); @Override public int read() { return in.read(); } }); replay(request); diff --git a/solr/example/solr/collection1/conf/solrconfig.xml b/solr/example/solr/collection1/conf/solrconfig.xml index f10ecfeb715..cee2e1609c3 100755 --- a/solr/example/solr/collection1/conf/solrconfig.xml +++ b/solr/example/solr/collection1/conf/solrconfig.xml @@ -1344,7 +1344,7 @@ http://wiki.apache.org/solr/ClusteringComponent - You'll need to set the solr.cluster.enabled system property + You'll need to set the solr.clustering.enabled system property when running solr to run with clustering enabled: java -Dsolr.clustering.enabled=true -jar start.jar diff --git a/solr/test-framework/src/java/org/apache/solr/cloud/AbstractZkTestCase.java b/solr/test-framework/src/java/org/apache/solr/cloud/AbstractZkTestCase.java index 87ba269983a..5d66eddc80f 100644 --- a/solr/test-framework/src/java/org/apache/solr/cloud/AbstractZkTestCase.java +++ b/solr/test-framework/src/java/org/apache/solr/cloud/AbstractZkTestCase.java @@ -84,7 +84,7 @@ public abstract class AbstractZkTestCase extends SolrTestCaseJ4 { } // static to share with distrib test - static void buildZooKeeper(String zkHost, String zkAddress, File solrhome, String config, + public static void buildZooKeeper(String zkHost, String zkAddress, File solrhome, String config, String schema) throws Exception { SolrZkClient zkClient = new SolrZkClient(zkHost, AbstractZkTestCase.TIMEOUT); zkClient.makePath("/solr", false, true); diff --git a/solr/webapp/web/admin.html b/solr/webapp/web/admin.html index 17997209568..3634baf6470 100644 --- a/solr/webapp/web/admin.html +++ b/solr/webapp/web/admin.html @@ -1,3 +1,4 @@ +