diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt
index 213a8f9401d..bdc118b95f2 100644
--- a/lucene/CHANGES.txt
+++ b/lucene/CHANGES.txt
@@ -85,6 +85,14 @@ Improvements
* LUCENE-7524: Added more detailed explanation of how IDF is computed in
ClassicSimilarity and BM25Similarity. (Adrien Grand)
+* LUCENE-7526: Enhanced UnifiedHighlighter's passage relevancy for queries with
+ wildcards and sometimes just terms. Added shouldPreferPassageRelevancyOverSpeed()
+ which can be overridden to return false to eke out more speed in some cases.
+ (Timothy M. Rodriguez, David Smiley)
+
+* LUCENE-7537: Index time sorting now supports multi-valued sorts
+ using selectors (MIN, MAX, etc.) (Jim Ferenczi via Mike McCandless)
+
Other
* LUCENE-7546: Fixed references to benchmark wikipedia data and the Jenkins line-docs file
@@ -92,6 +100,11 @@ Other
* LUCENE-7534: fix smokeTestRelease.py to run on Cygwin (Mikhail Khludnev)
+Build
+
+* LUCENE-7387: fix defaultCodec in build.xml to account for the line ending (hossman)
+
+
======================= Lucene 6.3.0 =======================
API Changes
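For context on the LUCENE-7526 entry above: a minimal sketch of opting out of the extra passage relevancy (preferring speed) might look like the following. The hook's exact signature (a protected, per-field method on UnifiedHighlighter) is an assumption based on the entry, not something shown in this patch; "searcher" and "analyzer" are assumed to exist.

  // Hypothetical sketch only; the override point is named in the changelog entry above,
  // but its exact signature is assumed here.
  UnifiedHighlighter highlighter = new UnifiedHighlighter(searcher, analyzer) {
    @Override
    protected boolean shouldPreferPassageRelevancyOverSpeed(String field) {
      return false; // prefer speed over passage relevancy
    }
  };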
diff --git a/lucene/benchmark/conf/highlighters-postings.alg b/lucene/benchmark/conf/highlighters-postings.alg
index cf9df118786..610908f5af5 100644
--- a/lucene/benchmark/conf/highlighters-postings.alg
+++ b/lucene/benchmark/conf/highlighters-postings.alg
@@ -34,7 +34,7 @@ content.source=org.apache.lucene.benchmark.byTask.feeds.EnwikiContentSource
docs.file=temp/enwiki-20070527-pages-articles.xml.bz2
query.maker=org.apache.lucene.benchmark.byTask.feeds.FileBasedQueryMaker
-file.query.maker.file=conf/query-phrases.txt
+file.query.maker.file=conf/query-terms.txt
log.queries=false
log.step.SearchTravRetHighlight=-1
@@ -55,7 +55,7 @@ highlighter=HlImpl:NONE:SH_A:UH_A:PH_P:UH_P:UH_PV
{ "Warm" SearchTravRetHighlight > : 1000
- { "HL" SearchTravRetHighlight > : 500
+ { "HL" SearchTravRetHighlight > : 2000
CloseReader
diff --git a/lucene/benchmark/conf/highlighters-tv.alg b/lucene/benchmark/conf/highlighters-tv.alg
index 1e51018e37d..26b64a352ec 100644
--- a/lucene/benchmark/conf/highlighters-tv.alg
+++ b/lucene/benchmark/conf/highlighters-tv.alg
@@ -54,7 +54,7 @@ highlighter=HlImpl:NONE:SH_V:FVH_V:UH_V
{ "Warm" SearchTravRetHighlight > : 1000
- { "HL" SearchTravRetHighlight > : 500
+ { "HL" SearchTravRetHighlight > : 2000
CloseReader
diff --git a/lucene/build.xml b/lucene/build.xml
index ca139412fc6..11f4644467d 100644
--- a/lucene/build.xml
+++ b/lucene/build.xml
@@ -213,6 +213,8 @@
+
+
diff --git a/lucene/codecs/src/java/org/apache/lucene/codecs/simpletext/SimpleTextSegmentInfoFormat.java b/lucene/codecs/src/java/org/apache/lucene/codecs/simpletext/SimpleTextSegmentInfoFormat.java
index 146e92a6a29..3d38d72385f 100644
--- a/lucene/codecs/src/java/org/apache/lucene/codecs/simpletext/SimpleTextSegmentInfoFormat.java
+++ b/lucene/codecs/src/java/org/apache/lucene/codecs/simpletext/SimpleTextSegmentInfoFormat.java
@@ -33,9 +33,14 @@ import org.apache.lucene.index.IndexFileNames;
import org.apache.lucene.index.SegmentInfo;
import org.apache.lucene.search.Sort;
import org.apache.lucene.search.SortField;
+import org.apache.lucene.search.SortedNumericSelector;
+import org.apache.lucene.search.SortedNumericSortField;
+import org.apache.lucene.search.SortedSetSelector;
+import org.apache.lucene.search.SortedSetSortField;
import org.apache.lucene.store.ChecksumIndexInput;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.IOContext;
+import org.apache.lucene.store.IndexInput;
import org.apache.lucene.store.IndexOutput;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.BytesRefBuilder;
@@ -64,6 +69,7 @@ public class SimpleTextSegmentInfoFormat extends SegmentInfoFormat {
final static BytesRef SI_SORT = new BytesRef(" sort ");
final static BytesRef SI_SORT_FIELD = new BytesRef(" field ");
final static BytesRef SI_SORT_TYPE = new BytesRef(" type ");
+ final static BytesRef SI_SELECTOR_TYPE = new BytesRef(" selector ");
final static BytesRef SI_SORT_REVERSE = new BytesRef(" reverse ");
final static BytesRef SI_SORT_MISSING = new BytesRef(" missing ");
@@ -158,6 +164,8 @@ public class SimpleTextSegmentInfoFormat extends SegmentInfoFormat {
final String typeAsString = readString(SI_SORT_TYPE.length, scratch);
final SortField.Type type;
+ SortedSetSelector.Type selectorSet = null;
+ SortedNumericSelector.Type selectorNumeric = null;
switch (typeAsString) {
case "string":
type = SortField.Type.STRING;
@@ -174,6 +182,26 @@ public class SimpleTextSegmentInfoFormat extends SegmentInfoFormat {
case "float":
type = SortField.Type.FLOAT;
break;
+ case "multi_valued_string":
+ type = SortField.Type.STRING;
+ selectorSet = readSetSelector(input, scratch);
+ break;
+ case "multi_valued_long":
+ type = SortField.Type.LONG;
+ selectorNumeric = readNumericSelector(input, scratch);
+ break;
+ case "multi_valued_int":
+ type = SortField.Type.INT;
+ selectorNumeric = readNumericSelector(input, scratch);
+ break;
+ case "multi_valued_double":
+ type = SortField.Type.DOUBLE;
+ selectorNumeric = readNumericSelector(input, scratch);
+ break;
+ case "multi_valued_float":
+ type = SortField.Type.FLOAT;
+ selectorNumeric = readNumericSelector(input, scratch);
+ break;
default:
throw new CorruptIndexException("unable to parse sort type string: " + typeAsString, input);
}
@@ -245,7 +273,13 @@ public class SimpleTextSegmentInfoFormat extends SegmentInfoFormat {
default:
throw new AssertionError();
}
- sortField[i] = new SortField(field, type, reverse);
+ if (selectorSet != null) {
+ sortField[i] = new SortedSetSortField(field, reverse);
+ } else if (selectorNumeric != null) {
+ sortField[i] = new SortedNumericSortField(field, type, reverse);
+ } else {
+ sortField[i] = new SortField(field, type, reverse);
+ }
if (missingValue != null) {
sortField[i].setMissingValue(missingValue);
}
@@ -265,6 +299,38 @@ public class SimpleTextSegmentInfoFormat extends SegmentInfoFormat {
private String readString(int offset, BytesRefBuilder scratch) {
return new String(scratch.bytes(), offset, scratch.length()-offset, StandardCharsets.UTF_8);
}
+
+ private SortedSetSelector.Type readSetSelector(IndexInput input, BytesRefBuilder scratch) throws IOException {
+ SimpleTextUtil.readLine(input, scratch);
+ assert StringHelper.startsWith(scratch.get(), SI_SELECTOR_TYPE);
+ final String selectorAsString = readString(SI_SELECTOR_TYPE.length, scratch);
+ switch (selectorAsString) {
+ case "min":
+ return SortedSetSelector.Type.MIN;
+ case "middle_min":
+ return SortedSetSelector.Type.MIDDLE_MIN;
+ case "middle_max":
+ return SortedSetSelector.Type.MIDDLE_MAX;
+ case "max":
+ return SortedSetSelector.Type.MAX;
+ default:
+ throw new CorruptIndexException("unable to parse SortedSetSelector type: " + selectorAsString, input);
+ }
+ }
+
+ private SortedNumericSelector.Type readNumericSelector(IndexInput input, BytesRefBuilder scratch) throws IOException {
+ SimpleTextUtil.readLine(input, scratch);
+ assert StringHelper.startsWith(scratch.get(), SI_SELECTOR_TYPE);
+ final String selectorAsString = readString(SI_SELECTOR_TYPE.length, scratch);
+ switch (selectorAsString) {
+ case "min":
+ return SortedNumericSelector.Type.MIN;
+ case "max":
+ return SortedNumericSelector.Type.MAX;
+ default:
+ throw new CorruptIndexException("unable to parse SortedNumericSelector type: " + selectorAsString, input);
+ }
+ }
@Override
public void write(Directory dir, SegmentInfo si, IOContext ioContext) throws IOException {
@@ -352,29 +418,93 @@ public class SimpleTextSegmentInfoFormat extends SegmentInfoFormat {
SimpleTextUtil.writeNewline(output);
SimpleTextUtil.write(output, SI_SORT_TYPE);
- final String sortType;
- switch (sortField.getType()) {
+ final String sortTypeString;
+ final SortField.Type sortType;
+ final boolean multiValued;
+ if (sortField instanceof SortedSetSortField) {
+ sortType = SortField.Type.STRING;
+ multiValued = true;
+ } else if (sortField instanceof SortedNumericSortField) {
+ sortType = ((SortedNumericSortField) sortField).getNumericType();
+ multiValued = true;
+ } else {
+ sortType = sortField.getType();
+ multiValued = false;
+ }
+ switch (sortType) {
case STRING:
- sortType = "string";
+ if (multiValued) {
+ sortTypeString = "multi_valued_string";
+ } else {
+ sortTypeString = "string";
+ }
break;
case LONG:
- sortType = "long";
+ if (multiValued) {
+ sortTypeString = "multi_valued_long";
+ } else {
+ sortTypeString = "long";
+ }
break;
case INT:
- sortType = "int";
+ if (multiValued) {
+ sortTypeString = "multi_valued_int";
+ } else {
+ sortTypeString = "int";
+ }
break;
case DOUBLE:
- sortType = "double";
+ if (multiValued) {
+ sortTypeString = "multi_valued_double";
+ } else {
+ sortTypeString = "double";
+ }
break;
case FLOAT:
- sortType = "float";
+ if (multiValued) {
+ sortTypeString = "multi_valued_float";
+ } else {
+ sortTypeString = "float";
+ }
break;
default:
throw new IllegalStateException("Unexpected sort type: " + sortField.getType());
}
- SimpleTextUtil.write(output, sortType, scratch);
+ SimpleTextUtil.write(output, sortTypeString, scratch);
SimpleTextUtil.writeNewline(output);
+ if (sortField instanceof SortedSetSortField) {
+ SortedSetSelector.Type selector = ((SortedSetSortField) sortField).getSelector();
+ final String selectorString;
+ if (selector == SortedSetSelector.Type.MIN) {
+ selectorString = "min";
+ } else if (selector == SortedSetSelector.Type.MIDDLE_MIN) {
+ selectorString = "middle_min";
+ } else if (selector == SortedSetSelector.Type.MIDDLE_MAX) {
+ selectorString = "middle_max";
+ } else if (selector == SortedSetSelector.Type.MAX) {
+ selectorString = "max";
+ } else {
+ throw new IllegalStateException("Unexpected SortedSetSelector type selector: " + selector);
+ }
+ SimpleTextUtil.write(output, SI_SELECTOR_TYPE);
+ SimpleTextUtil.write(output, selectorString, scratch);
+ SimpleTextUtil.writeNewline(output);
+ } else if (sortField instanceof SortedNumericSortField) {
+ SortedNumericSelector.Type selector = ((SortedNumericSortField) sortField).getSelector();
+ final String selectorString;
+ if (selector == SortedNumericSelector.Type.MIN) {
+ selectorString = "min";
+ } else if (selector == SortedNumericSelector.Type.MAX) {
+ selectorString = "max";
+ } else {
+ throw new IllegalStateException("Unexpected SortedNumericSelector type selector: " + selector);
+ }
+ SimpleTextUtil.write(output, SI_SELECTOR_TYPE);
+ SimpleTextUtil.write(output, selectorString, scratch);
+ SimpleTextUtil.writeNewline(output);
+ }
+
SimpleTextUtil.write(output, SI_SORT_REVERSE);
SimpleTextUtil.write(output, Boolean.toString(sortField.getReverse()), scratch);
SimpleTextUtil.writeNewline(output);
diff --git a/lucene/core/src/java/org/apache/lucene/codecs/lucene62/Lucene62SegmentInfoFormat.java b/lucene/core/src/java/org/apache/lucene/codecs/lucene62/Lucene62SegmentInfoFormat.java
index 1ee52588a55..da6e395e27a 100644
--- a/lucene/core/src/java/org/apache/lucene/codecs/lucene62/Lucene62SegmentInfoFormat.java
+++ b/lucene/core/src/java/org/apache/lucene/codecs/lucene62/Lucene62SegmentInfoFormat.java
@@ -29,6 +29,10 @@ import org.apache.lucene.index.SegmentInfo; // javadocs
import org.apache.lucene.index.SegmentInfos; // javadocs
import org.apache.lucene.search.Sort;
import org.apache.lucene.search.SortField;
+import org.apache.lucene.search.SortedNumericSelector;
+import org.apache.lucene.search.SortedNumericSortField;
+import org.apache.lucene.search.SortedSetSelector;
+import org.apache.lucene.search.SortedSetSortField;
import org.apache.lucene.store.ChecksumIndexInput;
import org.apache.lucene.store.DataOutput; // javadocs
import org.apache.lucene.store.Directory;
@@ -69,7 +73,7 @@ import org.apache.lucene.util.Version;
* addIndexes), etc.
*
Files is a list of files referred to by this segment.
*
- *
+ *
* @see SegmentInfos
* @lucene.experimental
*/
@@ -78,7 +82,7 @@ public class Lucene62SegmentInfoFormat extends SegmentInfoFormat {
/** Sole constructor. */
public Lucene62SegmentInfoFormat() {
}
-
+
@Override
public SegmentInfo read(Directory dir, String segment, byte[] segmentID, IOContext context) throws IOException {
final String fileName = IndexFileNames.segmentFileName(segment, "", Lucene62SegmentInfoFormat.SI_EXTENSION);
@@ -91,13 +95,13 @@ public class Lucene62SegmentInfoFormat extends SegmentInfoFormat {
Lucene62SegmentInfoFormat.VERSION_CURRENT,
segmentID, "");
final Version version = Version.fromBits(input.readInt(), input.readInt(), input.readInt());
-
+
final int docCount = input.readInt();
if (docCount < 0) {
throw new CorruptIndexException("invalid docCount: " + docCount, input);
}
final boolean isCompoundFile = input.readByte() == SegmentInfo.YES;
-
+
final Map<String,String> diagnostics = input.readMapOfStrings();
final Set<String> files = input.readSetOfStrings();
final Map<String,String> attributes = input.readMapOfStrings();
@@ -110,6 +114,8 @@ public class Lucene62SegmentInfoFormat extends SegmentInfoFormat {
String fieldName = input.readString();
int sortTypeID = input.readVInt();
SortField.Type sortType;
+ SortedSetSelector.Type sortedSetSelector = null;
+ SortedNumericSelector.Type sortedNumericSelector = null;
switch(sortTypeID) {
case 0:
sortType = SortField.Type.STRING;
@@ -126,6 +132,43 @@ public class Lucene62SegmentInfoFormat extends SegmentInfoFormat {
case 4:
sortType = SortField.Type.FLOAT;
break;
+ case 5:
+ sortType = SortField.Type.STRING;
+ byte selector = input.readByte();
+ if (selector == 0) {
+ sortedSetSelector = SortedSetSelector.Type.MIN;
+ } else if (selector == 1) {
+ sortedSetSelector = SortedSetSelector.Type.MAX;
+ } else if (selector == 2) {
+ sortedSetSelector = SortedSetSelector.Type.MIDDLE_MIN;
+ } else if (selector == 3) {
+ sortedSetSelector = SortedSetSelector.Type.MIDDLE_MAX;
+ } else {
+ throw new CorruptIndexException("invalid index SortedSetSelector ID: " + selector, input);
+ }
+ break;
+ case 6:
+ byte type = input.readByte();
+ if (type == 0) {
+ sortType = SortField.Type.LONG;
+ } else if (type == 1) {
+ sortType = SortField.Type.INT;
+ } else if (type == 2) {
+ sortType = SortField.Type.DOUBLE;
+ } else if (type == 3) {
+ sortType = SortField.Type.FLOAT;
+ } else {
+ throw new CorruptIndexException("invalid index SortedNumericSortField type ID: " + type, input);
+ }
+ byte numericSelector = input.readByte();
+ if (numericSelector == 0) {
+ sortedNumericSelector = SortedNumericSelector.Type.MIN;
+ } else if (numericSelector == 1) {
+ sortedNumericSelector = SortedNumericSelector.Type.MAX;
+ } else {
+ throw new CorruptIndexException("invalid index SortedNumericSelector ID: " + numericSelector, input);
+ }
+ break;
default:
throw new CorruptIndexException("invalid index sort field type ID: " + sortTypeID, input);
}
@@ -139,7 +182,13 @@ public class Lucene62SegmentInfoFormat extends SegmentInfoFormat {
throw new CorruptIndexException("invalid index sort reverse: " + b, input);
}
- sortFields[i] = new SortField(fieldName, sortType, reverse);
+ if (sortedSetSelector != null) {
+ sortFields[i] = new SortedSetSortField(fieldName, reverse, sortedSetSelector);
+ } else if (sortedNumericSelector != null) {
+ sortFields[i] = new SortedNumericSortField(fieldName, sortType, reverse, sortedNumericSelector);
+ } else {
+ sortFields[i] = new SortField(fieldName, sortType, reverse);
+ }
Object missingValue;
b = input.readByte();
@@ -194,7 +243,7 @@ public class Lucene62SegmentInfoFormat extends SegmentInfoFormat {
} else {
indexSort = null;
}
-
+
si = new SegmentInfo(dir, version, segment, docCount, isCompoundFile, null, diagnostics, segmentID, attributes, indexSort);
si.setFiles(files);
} catch (Throwable exception) {
@@ -213,8 +262,8 @@ public class Lucene62SegmentInfoFormat extends SegmentInfoFormat {
try (IndexOutput output = dir.createOutput(fileName, ioContext)) {
// Only add the file once we've successfully created it, else IFD assert can trip:
si.addFile(fileName);
- CodecUtil.writeIndexHeader(output,
- Lucene62SegmentInfoFormat.CODEC_NAME,
+ CodecUtil.writeIndexHeader(output,
+ Lucene62SegmentInfoFormat.CODEC_NAME,
Lucene62SegmentInfoFormat.VERSION_CURRENT,
si.getId(),
"");
@@ -245,6 +294,7 @@ public class Lucene62SegmentInfoFormat extends SegmentInfoFormat {
output.writeVInt(numSortFields);
for (int i = 0; i < numSortFields; ++i) {
SortField sortField = indexSort.getSort()[i];
+ SortField.Type sortType = sortField.getType();
output.writeString(sortField.getField());
int sortTypeID;
switch (sortField.getType()) {
@@ -263,10 +313,55 @@ public class Lucene62SegmentInfoFormat extends SegmentInfoFormat {
case FLOAT:
sortTypeID = 4;
break;
+ case CUSTOM:
+ if (sortField instanceof SortedSetSortField) {
+ sortTypeID = 5;
+ sortType = SortField.Type.STRING;
+ } else if (sortField instanceof SortedNumericSortField) {
+ sortTypeID = 6;
+ sortType = ((SortedNumericSortField) sortField).getNumericType();
+ } else {
+ throw new IllegalStateException("Unexpected SortedNumericSortField " + sortField);
+ }
+ break;
default:
throw new IllegalStateException("Unexpected sort type: " + sortField.getType());
}
output.writeVInt(sortTypeID);
+ if (sortTypeID == 5) {
+ SortedSetSortField ssf = (SortedSetSortField) sortField;
+ if (ssf.getSelector() == SortedSetSelector.Type.MIN) {
+ output.writeByte((byte) 0);
+ } else if (ssf.getSelector() == SortedSetSelector.Type.MAX) {
+ output.writeByte((byte) 1);
+ } else if (ssf.getSelector() == SortedSetSelector.Type.MIDDLE_MIN) {
+ output.writeByte((byte) 2);
+ } else if (ssf.getSelector() == SortedSetSelector.Type.MIDDLE_MAX) {
+ output.writeByte((byte) 3);
+ } else {
+ throw new IllegalStateException("Unexpected SortedSetSelector type: " + ssf.getSelector());
+ }
+ } else if (sortTypeID == 6) {
+ SortedNumericSortField snsf = (SortedNumericSortField) sortField;
+ if (snsf.getNumericType() == SortField.Type.LONG) {
+ output.writeByte((byte) 0);
+ } else if (snsf.getNumericType() == SortField.Type.INT) {
+ output.writeByte((byte) 1);
+ } else if (snsf.getNumericType() == SortField.Type.DOUBLE) {
+ output.writeByte((byte) 2);
+ } else if (snsf.getNumericType() == SortField.Type.FLOAT) {
+ output.writeByte((byte) 3);
+ } else {
+ throw new IllegalStateException("Unexpected SortedNumericSelector type: " + snsf.getNumericType());
+ }
+ if (snsf.getSelector() == SortedNumericSelector.Type.MIN) {
+ output.writeByte((byte) 0);
+ } else if (snsf.getSelector() == SortedNumericSelector.Type.MAX) {
+ output.writeByte((byte) 1);
+ } else {
+ throw new IllegalStateException("Unexpected sorted numeric selector type: " + snsf.getSelector());
+ }
+ }
output.writeByte((byte) (sortField.getReverse() ? 0 : 1));
// write missing value
@@ -274,7 +369,7 @@ public class Lucene62SegmentInfoFormat extends SegmentInfoFormat {
if (missingValue == null) {
output.writeByte((byte) 0);
} else {
- switch(sortField.getType()) {
+ switch(sortType) {
case STRING:
if (missingValue == SortField.STRING_LAST) {
output.writeByte((byte) 1);
@@ -305,7 +400,7 @@ public class Lucene62SegmentInfoFormat extends SegmentInfoFormat {
}
}
}
-
+
CodecUtil.writeFooter(output);
}
}
@@ -314,5 +409,6 @@ public class Lucene62SegmentInfoFormat extends SegmentInfoFormat {
public final static String SI_EXTENSION = "si";
static final String CODEC_NAME = "Lucene62SegmentInfo";
static final int VERSION_START = 0;
- static final int VERSION_CURRENT = VERSION_START;
+ static final int VERSION_MULTI_VALUED_SORT = 1;
+ static final int VERSION_CURRENT = VERSION_MULTI_VALUED_SORT;
}
diff --git a/lucene/core/src/java/org/apache/lucene/index/IndexWriterConfig.java b/lucene/core/src/java/org/apache/lucene/index/IndexWriterConfig.java
index 368259a5553..4f642eed52a 100644
--- a/lucene/core/src/java/org/apache/lucene/index/IndexWriterConfig.java
+++ b/lucene/core/src/java/org/apache/lucene/index/IndexWriterConfig.java
@@ -468,7 +468,8 @@ public final class IndexWriterConfig extends LiveIndexWriterConfig {
*/
public IndexWriterConfig setIndexSort(Sort sort) {
for(SortField sortField : sort.getSort()) {
- if (ALLOWED_INDEX_SORT_TYPES.contains(sortField.getType()) == false) {
+ final SortField.Type sortType = Sorter.getSortFieldType(sortField);
+ if (ALLOWED_INDEX_SORT_TYPES.contains(sortType) == false) {
throw new IllegalArgumentException("invalid SortField type: must be one of " + ALLOWED_INDEX_SORT_TYPES + " but got: " + sortField);
}
}
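For context on the setIndexSort() validation change above: with LUCENE-7537, a multi-valued field can participate in index-time sorting as long as a selector collapses each document to a single value. A minimal sketch, assuming an existing Directory "dir" and Analyzer "analyzer" (the field name and values are illustrative):

  // Sketch: index-time sort on a multi-valued numeric field using a MIN selector.
  IndexWriterConfig iwc = new IndexWriterConfig(analyzer);
  SortField sortField = new SortedNumericSortField("price", SortField.Type.LONG, false,
      SortedNumericSelector.Type.MIN);
  iwc.setIndexSort(new Sort(sortField));
  try (IndexWriter writer = new IndexWriter(dir, iwc)) {
    Document doc = new Document();
    doc.add(new SortedNumericDocValuesField("price", 7L));
    doc.add(new SortedNumericDocValuesField("price", 3L)); // MIN selector sorts this doc by 3
    writer.addDocument(doc);
  }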
diff --git a/lucene/core/src/java/org/apache/lucene/index/MultiSorter.java b/lucene/core/src/java/org/apache/lucene/index/MultiSorter.java
index ee969c7b6b6..5ca6b65a7bb 100644
--- a/lucene/core/src/java/org/apache/lucene/index/MultiSorter.java
+++ b/lucene/core/src/java/org/apache/lucene/index/MultiSorter.java
@@ -141,33 +141,25 @@ final class MultiSorter {
private static ComparableProvider[] getComparableProviders(List<CodecReader> readers, SortField sortField) throws IOException {
ComparableProvider[] providers = new ComparableProvider[readers.size()];
+ final int reverseMul = sortField.getReverse() ? -1 : 1;
+ final SortField.Type sortType = Sorter.getSortFieldType(sortField);
- switch(sortField.getType()) {
+ switch(sortType) {
case STRING:
{
// this uses the efficient segment-local ordinal map:
final SortedDocValues[] values = new SortedDocValues[readers.size()];
for(int i=0;i> multiTermQueryRewrite) {
- super(field, extractedTerms, phraseHelper, automata);
+ public AnalysisOffsetStrategy(String field, BytesRef[] queryTerms, PhraseHelper phraseHelper, CharacterRunAutomaton[] automata, Analyzer analyzer) {
+ super(field, queryTerms, phraseHelper, automata);
this.analyzer = analyzer;
- // Automata (Wildcards / MultiTermQuery):
- this.automata = automata;
-
- if (terms.length > 0 && !strictPhrases.hasPositionSensitivity()) {
- this.automata = convertTermsToAutomata(terms, automata);
- // clear the terms array now that we've moved them to be expressed as automata
- terms = ZERO_LEN_BYTES_REF_ARRAY;
+ if (analyzer.getOffsetGap(field) != 1) { // note: 1 is the default. It is RARELY changed.
+ throw new IllegalArgumentException(
+ "offset gap of the provided analyzer should be 1 (field " + field + ")");
}
-
- if (terms.length > 0 || strictPhrases.willRewrite()) { //needs MemoryIndex
- // init MemoryIndex
- boolean storePayloads = strictPhrases.hasPositionSensitivity(); // might be needed
- memoryIndex = new MemoryIndex(true, storePayloads);//true==store offsets
- leafReader = (LeafReader) memoryIndex.createSearcher().getIndexReader();
- // preFilter for MemoryIndex
- preMemIndexFilterAutomaton = buildCombinedAutomaton(field, terms, this.automata, strictPhrases,
- multiTermQueryRewrite);
- } else {
- memoryIndex = null;
- leafReader = null;
- preMemIndexFilterAutomaton = null;
- }
-
}
@Override
- public UnifiedHighlighter.OffsetSource getOffsetSource() {
+ public final UnifiedHighlighter.OffsetSource getOffsetSource() {
return UnifiedHighlighter.OffsetSource.ANALYSIS;
}
- @Override
- public List getOffsetsEnums(IndexReader reader, int docId, String content) throws IOException {
- // note: don't need LimitTokenOffsetFilter since content is already truncated to maxLength
- TokenStream tokenStream = tokenStream(content);
-
- if (memoryIndex != null) { // also handles automata.length > 0
- // We use a MemoryIndex and index the tokenStream so that later we have the PostingsEnum with offsets.
-
- // note: An *alternative* strategy is to get PostingsEnums without offsets from the main index
- // and then marry this up with a fake PostingsEnum backed by a TokenStream (which has the offsets) and
- // can use that to filter applicable tokens? It would have the advantage of being able to exit
- // early and save some re-analysis. This would be an additional method/offset-source approach
- // since it's still useful to highlight without any index (so we build MemoryIndex).
-
- // note: probably unwise to re-use TermsEnum on reset mem index so we don't. But we do re-use the
- // leaf reader, which is a bit more top level than in the guts.
- memoryIndex.reset();
-
- // Filter the tokenStream to applicable terms
- if (preMemIndexFilterAutomaton != null) {
- tokenStream = newKeepWordFilter(tokenStream, preMemIndexFilterAutomaton);
- }
- memoryIndex.addField(field, tokenStream);//note: calls tokenStream.reset() & close()
- tokenStream = null; // it's consumed; done.
- docId = 0;
-
- if (automata.length > 0) {
- Terms foundTerms = leafReader.terms(field);
- if (foundTerms == null) {
- return Collections.emptyList(); //No offsets for this field.
- }
- // Un-invert for the automata. Much more compact than a CachingTokenStream
- tokenStream = MultiTermHighlighting.uninvertAndFilterTerms(foundTerms, 0, automata, content.length());
- }
-
- }
-
- return createOffsetsEnums(leafReader, docId, tokenStream);
- }
-
protected TokenStream tokenStream(String content) throws IOException {
- return MultiValueTokenStream.wrap(field, analyzer, content, UnifiedHighlighter.MULTIVAL_SEP_CHAR);
- }
-
- private static CharacterRunAutomaton[] convertTermsToAutomata(BytesRef[] terms, CharacterRunAutomaton[] automata) {
- CharacterRunAutomaton[] newAutomata = new CharacterRunAutomaton[terms.length + automata.length];
- for (int i = 0; i < terms.length; i++) {
- newAutomata[i] = MultiTermHighlighting.makeStringMatchAutomata(terms[i]);
+ // If there is no splitChar in content then we needn't wrap:
+ int splitCharIdx = content.indexOf(UnifiedHighlighter.MULTIVAL_SEP_CHAR);
+ if (splitCharIdx == -1) {
+ return analyzer.tokenStream(field, content);
}
- // Append existing automata (that which is used for MTQs)
- System.arraycopy(automata, 0, newAutomata, terms.length, automata.length);
- return newAutomata;
+
+ TokenStream subTokenStream = analyzer.tokenStream(field, content.substring(0, splitCharIdx));
+
+ return new MultiValueTokenStream(subTokenStream, field, analyzer, content, UnifiedHighlighter.MULTIVAL_SEP_CHAR, splitCharIdx);
}
- private static FilteringTokenFilter newKeepWordFilter(final TokenStream tokenStream,
- final CharacterRunAutomaton charRunAutomaton) {
- // it'd be nice to use KeepWordFilter but it demands a CharArraySet. TODO File JIRA? Need a new interface?
- return new FilteringTokenFilter(tokenStream) {
- final CharTermAttribute charAtt = addAttribute(CharTermAttribute.class);
-
- @Override
- protected boolean accept() throws IOException {
- return charRunAutomaton.run(charAtt.buffer(), 0, charAtt.length());
- }
- };
- }
-
-
/**
- * Build one {@link CharacterRunAutomaton} matching any term the query might match.
+ * Wraps an {@link Analyzer} and string text that represents multiple values delimited by a specified character. This
+ * exposes a TokenStream that matches what would get indexed considering the
+ * {@link Analyzer#getPositionIncrementGap(String)}. Currently this assumes {@link Analyzer#getOffsetGap(String)} is
+ * 1; an exception will be thrown if it isn't.
+ *
+ * It would be more orthogonal for this to be an Analyzer since we're wrapping an Analyzer but doing so seems like
+ * more work. The underlying components see a Reader not a String -- and the String is easy to
+ * split up without redundant buffering.
+ *
+ * @lucene.internal
*/
- private static CharacterRunAutomaton buildCombinedAutomaton(String field, BytesRef[] terms,
- CharacterRunAutomaton[] automata,
- PhraseHelper strictPhrases,
- Function> multiTermQueryRewrite) {
- List allAutomata = new ArrayList<>();
- if (terms.length > 0) {
- allAutomata.add(new CharacterRunAutomaton(Automata.makeStringUnion(Arrays.asList(terms))));
- }
- Collections.addAll(allAutomata, automata);
- for (SpanQuery spanQuery : strictPhrases.getSpanQueries()) {
- Collections.addAll(allAutomata,
- MultiTermHighlighting.extractAutomata(spanQuery, field, true, multiTermQueryRewrite));//true==lookInSpan
+ private static final class MultiValueTokenStream extends TokenFilter {
+
+ private final String fieldName;
+ private final Analyzer indexAnalyzer;
+ private final String content;
+ private final char splitChar;
+
+ private final PositionIncrementAttribute posIncAtt = addAttribute(PositionIncrementAttribute.class);
+ private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);
+
+ private int startValIdx = 0;
+ private int endValIdx;
+ private int remainingPosInc = 0;
+
+ private MultiValueTokenStream(TokenStream subTokenStream, String fieldName, Analyzer indexAnalyzer,
+ String content, char splitChar, int splitCharIdx) {
+ super(subTokenStream); // subTokenStream is already initialized to operate on the first value
+ this.fieldName = fieldName;
+ this.indexAnalyzer = indexAnalyzer;
+ this.content = content;
+ this.splitChar = splitChar;
+ this.endValIdx = splitCharIdx;
}
- if (allAutomata.size() == 1) {
- return allAutomata.get(0);
- }
- //TODO it'd be nice if we could get at the underlying Automaton in CharacterRunAutomaton so that we
- // could union them all. But it's not exposed, and note TermRangeQuery isn't modelled as an Automaton
- // by MultiTermHighlighting.
-
- // Return an aggregate CharacterRunAutomaton of others
- return new CharacterRunAutomaton(Automata.makeEmpty()) {// the makeEmpty() is bogus; won't be used
- @Override
- public boolean run(char[] chars, int offset, int length) {
- for (int i = 0; i < allAutomata.size(); i++) {// don't use foreach to avoid Iterator allocation
- if (allAutomata.get(i).run(chars, offset, length)) {
- return true;
- }
- }
- return false;
+ @Override
+ public void reset() throws IOException {
+ if (startValIdx != 0) {
+ throw new IllegalStateException("This TokenStream wasn't developed to be re-used.");
+ // ... although we could if a need for it arises.
}
- };
- }
+ super.reset();
+ }
+ @Override
+ public boolean incrementToken() throws IOException {
+ while (true) {
+
+ if (input.incrementToken()) {
+ // Position tracking:
+ if (remainingPosInc > 0) {//usually true first token of additional values (not first val)
+ posIncAtt.setPositionIncrement(remainingPosInc + posIncAtt.getPositionIncrement());
+ remainingPosInc = 0;//reset
+ }
+ // Offset tracking:
+ offsetAtt.setOffset(
+ startValIdx + offsetAtt.startOffset(),
+ startValIdx + offsetAtt.endOffset()
+ );
+ return true;
+ }
+
+ if (endValIdx == content.length()) {//no more
+ return false;
+ }
+
+ input.end(); // might adjust position increment
+ remainingPosInc += posIncAtt.getPositionIncrement();
+ input.close();
+ remainingPosInc += indexAnalyzer.getPositionIncrementGap(fieldName);
+
+ // Get new tokenStream based on next segment divided by the splitChar
+ startValIdx = endValIdx + 1;
+ endValIdx = content.indexOf(splitChar, startValIdx);
+ if (endValIdx == -1) {//EOF
+ endValIdx = content.length();
+ }
+ TokenStream tokenStream = indexAnalyzer.tokenStream(fieldName, content.substring(startValIdx, endValIdx));
+ if (tokenStream != input) {// (input is defined in TokenFilter set in the constructor)
+ // This is a grand trick we do -- knowing that the analyzer's re-use strategy is going to produce the
+ // very same tokenStream instance and thus have the same AttributeSource as this wrapping TokenStream
+ // since we used it as our input in the constructor.
+ // Were this not the case, we'd have to copy every attribute of interest since we can't alter the
+ // AttributeSource of this wrapping TokenStream post-construction (it's all private/final).
+ // If this is a problem, we could do that instead; maybe with a custom CharTermAttribute that allows
+ // us to easily set the char[] reference without literally copying char by char.
+ throw new IllegalStateException("Require TokenStream re-use. Unsupported re-use strategy?: " +
+ indexAnalyzer.getReuseStrategy());
+ }
+ tokenStream.reset();
+ } // while loop to increment token of this new value
+ }
+
+ @Override
+ public void end() throws IOException {
+ super.end();
+ // Offset tracking:
+ offsetAtt.setOffset(
+ startValIdx + offsetAtt.startOffset(),
+ startValIdx + offsetAtt.endOffset());
+ }
+
+ }
}
diff --git a/lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/CompositeOffsetsPostingsEnum.java b/lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/CompositeOffsetsPostingsEnum.java
new file mode 100644
index 00000000000..356f553fa0b
--- /dev/null
+++ b/lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/CompositeOffsetsPostingsEnum.java
@@ -0,0 +1,145 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.lucene.search.uhighlight;
+
+import java.io.IOException;
+import java.util.List;
+
+import org.apache.lucene.index.PostingsEnum;
+import org.apache.lucene.util.BytesRef;
+import org.apache.lucene.util.PriorityQueue;
+
+/**
+ * Provides a view over several underlying PostingsEnums for the iteration of offsets on the current document only.
+ * It's not general purpose; the position returned is always -1 and it doesn't iterate the documents.
+ */
+final class CompositeOffsetsPostingsEnum extends PostingsEnum {
+
+ private final int docId;
+ private final int freq;
+ private final PriorityQueue<BoundsCheckingPostingsEnum> queue;
+ private boolean firstPositionConsumed = false;
+
+ /**
+ * This class is used to ensure we don't over iterate the underlying
+ * postings enum by keeping track of the position relative to the
+ * frequency.
+ * Ideally this would've been an implementation of a PostingsEnum
+ * but it would have to delegate most methods and it seemed easier
+ * to just wrap the tweaked method.
+ */
+ private static final class BoundsCheckingPostingsEnum {
+
+ private final PostingsEnum postingsEnum;
+ private int remainingPositions;
+
+ BoundsCheckingPostingsEnum(PostingsEnum postingsEnum) throws IOException {
+ this.postingsEnum = postingsEnum;
+ this.remainingPositions = postingsEnum.freq();
+ nextPosition();
+ }
+
+ /** Advances to the next position and returns true, or returns false if it can't. */
+ private boolean nextPosition() throws IOException {
+ if (remainingPositions-- > 0) {
+ postingsEnum.nextPosition(); // ignore the actual position; we don't care.
+ return true;
+ } else {
+ return false;
+ }
+ }
+
+ }
+
+ /** The provided {@link PostingsEnum}s must all be positioned to the same document, and must have offsets. */
+ CompositeOffsetsPostingsEnum(List<PostingsEnum> postingsEnums) throws IOException {
+ queue = new PriorityQueue<BoundsCheckingPostingsEnum>(postingsEnums.size()) {
+ @Override
+ protected boolean lessThan(BoundsCheckingPostingsEnum a, BoundsCheckingPostingsEnum b) {
+ try {
+ return a.postingsEnum.startOffset() < b.postingsEnum.startOffset();
+ } catch (IOException e) {
+ throw new RuntimeException(e);
+ }
+ }
+ };
+
+ int freqAdd = 0;
+ for (PostingsEnum postingsEnum : postingsEnums) {
+ queue.add(new BoundsCheckingPostingsEnum(postingsEnum));
+ freqAdd += postingsEnum.freq();
+ }
+ freq = freqAdd;
+ this.docId = queue.top().postingsEnum.docID();
+ }
+
+ @Override
+ public int freq() throws IOException {
+ return freq;
+ }
+
+ /** Advances to the next position. Always returns -1; the caller is assumed not to care for the highlighter. */
+ @Override
+ public int nextPosition() throws IOException {
+ if (!firstPositionConsumed) {
+ firstPositionConsumed = true;
+ } else if (queue.size() == 0) {
+ throw new IllegalStateException("nextPosition called too many times");
+ } else if (queue.top().nextPosition()) { // advance head
+ queue.updateTop(); //the new position may be behind another postingsEnum in the queue
+ } else {
+ queue.pop(); //this postingsEnum is consumed; get rid of it. Another will take its place.
+ }
+ assert queue.size() > 0;
+ return -1;
+ }
+
+ @Override
+ public int startOffset() throws IOException {
+ return queue.top().postingsEnum.startOffset();
+ }
+
+ @Override
+ public int endOffset() throws IOException {
+ return queue.top().postingsEnum.endOffset();
+ }
+
+ @Override
+ public BytesRef getPayload() throws IOException {
+ return queue.top().postingsEnum.getPayload();
+ }
+
+ @Override
+ public int docID() {
+ return docId;
+ }
+
+ @Override
+ public int nextDoc() throws IOException {
+ return NO_MORE_DOCS;
+ }
+
+ @Override
+ public int advance(int target) throws IOException {
+ return NO_MORE_DOCS;
+ }
+
+ @Override
+ public long cost() {
+ return 1L; //at most 1 doc is returned
+ }
+}
diff --git a/lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/FieldOffsetStrategy.java b/lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/FieldOffsetStrategy.java
index 04df31ea588..155f0a76fb9 100644
--- a/lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/FieldOffsetStrategy.java
+++ b/lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/FieldOffsetStrategy.java
@@ -14,16 +14,14 @@
* See the License for the specific language governing permissions and
* limitations under the License.
*/
-
package org.apache.lucene.search.uhighlight;
-import java.io.Closeable;
import java.io.IOException;
import java.util.ArrayList;
+import java.util.Collections;
import java.util.List;
import java.util.Map;
-import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.LeafReader;
import org.apache.lucene.index.PostingsEnum;
@@ -31,6 +29,7 @@ import org.apache.lucene.index.Terms;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.search.spans.Spans;
import org.apache.lucene.util.BytesRef;
+import org.apache.lucene.util.CharsRefBuilder;
import org.apache.lucene.util.automaton.CharacterRunAutomaton;
/**
@@ -42,14 +41,14 @@ import org.apache.lucene.util.automaton.CharacterRunAutomaton;
public abstract class FieldOffsetStrategy {
protected final String field;
- protected BytesRef[] terms; // Query: free-standing terms
- protected PhraseHelper strictPhrases; // Query: position-sensitive information TODO: rename
- protected CharacterRunAutomaton[] automata; // Query: free-standing wildcards (multi-term query)
+ protected final PhraseHelper phraseHelper; // Query: position-sensitive information TODO: rename
+ protected final BytesRef[] terms; // Query: free-standing terms
+ protected final CharacterRunAutomaton[] automata; // Query: free-standing wildcards (multi-term query)
public FieldOffsetStrategy(String field, BytesRef[] queryTerms, PhraseHelper phraseHelper, CharacterRunAutomaton[] automata) {
this.field = field;
this.terms = queryTerms;
- this.strictPhrases = phraseHelper;
+ this.phraseHelper = phraseHelper;
this.automata = automata;
}
@@ -65,58 +64,90 @@ public abstract class FieldOffsetStrategy {
*/
public abstract List<OffsetsEnum> getOffsetsEnums(IndexReader reader, int docId, String content) throws IOException;
- protected List createOffsetsEnums(LeafReader leafReader, int doc, TokenStream tokenStream) throws IOException {
- List offsetsEnums = createOffsetsEnumsFromReader(leafReader, doc);
- if (automata.length > 0) {
- offsetsEnums.add(createOffsetsEnumFromTokenStream(doc, tokenStream));
+ protected List<OffsetsEnum> createOffsetsEnumsFromReader(LeafReader leafReader, int doc) throws IOException {
+ final Terms termsIndex = leafReader.terms(field);
+ if (termsIndex == null) {
+ return Collections.emptyList();
}
- return offsetsEnums;
- }
- protected List createOffsetsEnumsFromReader(LeafReader atomicReader, int doc) throws IOException {
// For strict positions, get a Map of term to Spans:
// note: ScriptPhraseHelper.NONE does the right thing for these method calls
final Map<BytesRef, Spans> strictPhrasesTermToSpans =
- strictPhrases.getTermToSpans(atomicReader, doc);
+ phraseHelper.getTermToSpans(leafReader, doc);
// Usually simply wraps terms in a List; but if willRewrite() then can be expanded
final List<BytesRef> sourceTerms =
- strictPhrases.expandTermsIfRewrite(terms, strictPhrasesTermToSpans);
+ phraseHelper.expandTermsIfRewrite(terms, strictPhrasesTermToSpans);
- final List<OffsetsEnum> offsetsEnums = new ArrayList<>(sourceTerms.size() + 1);
+ final List<OffsetsEnum> offsetsEnums = new ArrayList<>(sourceTerms.size() + automata.length);
- Terms termsIndex = atomicReader == null || sourceTerms.isEmpty() ? null : atomicReader.terms(field);
- if (termsIndex != null) {
+ // Handle sourceTerms:
+ if (!sourceTerms.isEmpty()) {
TermsEnum termsEnum = termsIndex.iterator();//does not return null
for (BytesRef term : sourceTerms) {
- if (!termsEnum.seekExact(term)) {
- continue; // term not found
- }
- PostingsEnum postingsEnum = termsEnum.postings(null, PostingsEnum.OFFSETS);
- if (postingsEnum == null) {
- // no offsets or positions available
- throw new IllegalArgumentException("field '" + field + "' was indexed without offsets, cannot highlight");
- }
- if (doc != postingsEnum.advance(doc)) { // now it's positioned, although may be exhausted
- continue;
- }
- postingsEnum = strictPhrases.filterPostings(term, postingsEnum, strictPhrasesTermToSpans.get(term));
- if (postingsEnum == null) {
- continue;// completely filtered out
- }
+ if (termsEnum.seekExact(term)) {
+ PostingsEnum postingsEnum = termsEnum.postings(null, PostingsEnum.OFFSETS);
- offsetsEnums.add(new OffsetsEnum(term, postingsEnum));
+ if (postingsEnum == null) {
+ // no offsets or positions available
+ throw new IllegalArgumentException("field '" + field + "' was indexed without offsets, cannot highlight");
+ }
+
+ if (doc == postingsEnum.advance(doc)) { // now it's positioned, although may be exhausted
+ postingsEnum = phraseHelper.filterPostings(term, postingsEnum, strictPhrasesTermToSpans.get(term));
+ if (postingsEnum != null) {
+ offsetsEnums.add(new OffsetsEnum(term, postingsEnum));
+ }
+ }
+ }
}
}
+
+ // Handle automata
+ if (automata.length > 0) {
+ offsetsEnums.addAll(createAutomataOffsetsFromTerms(termsIndex, doc));
+ }
+
return offsetsEnums;
}
- protected OffsetsEnum createOffsetsEnumFromTokenStream(int doc, TokenStream tokenStream) throws IOException {
- // if there are automata (MTQ), we have to initialize the "fake" enum wrapping them.
- assert tokenStream != null;
- // TODO Opt: we sometimes evaluate the automata twice when this TS isn't the original; can we avoid?
- PostingsEnum mtqPostingsEnum = MultiTermHighlighting.getDocsEnum(tokenStream, automata);
- assert mtqPostingsEnum instanceof Closeable; // FYI we propagate close() later.
- mtqPostingsEnum.advance(doc);
- return new OffsetsEnum(null, mtqPostingsEnum);
+ protected List<OffsetsEnum> createAutomataOffsetsFromTerms(Terms termsIndex, int doc) throws IOException {
+ List<List<PostingsEnum>> automataPostings = new ArrayList<>(automata.length);
+ for (int i = 0; i < automata.length; i++) {
+ automataPostings.add(new ArrayList<>());
+ }
+
+ TermsEnum termsEnum = termsIndex.iterator();
+ BytesRef term;
+ CharsRefBuilder refBuilder = new CharsRefBuilder();
+ while ((term = termsEnum.next()) != null) {
+ for (int i = 0; i < automata.length; i++) {
+ CharacterRunAutomaton automaton = automata[i];
+ refBuilder.copyUTF8Bytes(term);
+ if (automaton.run(refBuilder.chars(), 0, refBuilder.length())) {
+ PostingsEnum postings = termsEnum.postings(null, PostingsEnum.OFFSETS);
+ if (doc == postings.advance(doc)) {
+ automataPostings.get(i).add(postings);
+ }
+ }
+ }
+ }
+
+ List<OffsetsEnum> offsetsEnums = new ArrayList<>(automata.length); //will be at most this long
+ for (int i = 0; i < automata.length; i++) {
+ CharacterRunAutomaton automaton = automata[i];
+ List<PostingsEnum> postingsEnums = automataPostings.get(i);
+ int size = postingsEnums.size();
+ if (size > 0) { //only add if we have offsets
+ BytesRef wildcardTerm = new BytesRef(automaton.toString());
+ if (size == 1) { //don't wrap in a composite if there's only one OffsetsEnum
+ offsetsEnums.add(new OffsetsEnum(wildcardTerm, postingsEnums.get(0)));
+ } else {
+ offsetsEnums.add(new OffsetsEnum(wildcardTerm, new CompositeOffsetsPostingsEnum(postingsEnums)));
+ }
+ }
+ }
+
+ return offsetsEnums;
}
+
}
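The createAutomataOffsetsFromTerms() method added above is what gives wildcard (MultiTermQuery-derived) matches real per-term postings offsets, merged through CompositeOffsetsPostingsEnum when several terms match one automaton. Nothing changes for callers; a typical invocation that exercises this path might look like the following sketch, assuming an IndexSearcher "searcher" over a "body" field indexed with offsets and the index-time "analyzer":

  // Sketch: highlighting a wildcard query; offsets for every term matching "rob*"
  // now come from postings rather than a re-analysis token stream.
  UnifiedHighlighter highlighter = new UnifiedHighlighter(searcher, analyzer);
  Query query = new WildcardQuery(new Term("body", "rob*"));
  TopDocs topDocs = searcher.search(query, 10);
  String[] snippets = highlighter.highlight("body", query, topDocs);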
diff --git a/lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/MemoryIndexOffsetStrategy.java b/lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/MemoryIndexOffsetStrategy.java
new file mode 100644
index 00000000000..4028912fcf0
--- /dev/null
+++ b/lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/MemoryIndexOffsetStrategy.java
@@ -0,0 +1,129 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.lucene.search.uhighlight;
+
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.Collection;
+import java.util.Collections;
+import java.util.List;
+import java.util.function.Function;
+
+import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.analysis.FilteringTokenFilter;
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
+import org.apache.lucene.index.IndexReader;
+import org.apache.lucene.index.LeafReader;
+import org.apache.lucene.index.memory.MemoryIndex;
+import org.apache.lucene.search.Query;
+import org.apache.lucene.search.spans.SpanQuery;
+import org.apache.lucene.util.BytesRef;
+import org.apache.lucene.util.automaton.Automata;
+import org.apache.lucene.util.automaton.CharacterRunAutomaton;
+
+
+/**
+ * Uses an {@link Analyzer} on content to get offsets and then populates a {@link MemoryIndex}.
+ *
+ * @lucene.internal
+ */
+public class MemoryIndexOffsetStrategy extends AnalysisOffsetStrategy {
+
+ private final MemoryIndex memoryIndex;
+ private final LeafReader leafReader;
+ private final CharacterRunAutomaton preMemIndexFilterAutomaton;
+
+ public MemoryIndexOffsetStrategy(String field, BytesRef[] extractedTerms, PhraseHelper phraseHelper,
+ CharacterRunAutomaton[] automata, Analyzer analyzer,
+ Function<Query, Collection<Query>> multiTermQueryRewrite) {
+ super(field, extractedTerms, phraseHelper, automata, analyzer);
+ boolean storePayloads = phraseHelper.hasPositionSensitivity(); // might be needed
+ memoryIndex = new MemoryIndex(true, storePayloads);//true==store offsets
+ leafReader = (LeafReader) memoryIndex.createSearcher().getIndexReader(); // appears to be re-usable
+ // preFilter for MemoryIndex
+ preMemIndexFilterAutomaton = buildCombinedAutomaton(field, terms, this.automata, phraseHelper, multiTermQueryRewrite);
+ }
+
+ /**
+ * Build one {@link CharacterRunAutomaton} matching any term the query might match.
+ */
+ private static CharacterRunAutomaton buildCombinedAutomaton(String field, BytesRef[] terms,
+ CharacterRunAutomaton[] automata,
+ PhraseHelper strictPhrases,
+ Function<Query, Collection<Query>> multiTermQueryRewrite) {
+ List<CharacterRunAutomaton> allAutomata = new ArrayList<>();
+ if (terms.length > 0) {
+ allAutomata.add(new CharacterRunAutomaton(Automata.makeStringUnion(Arrays.asList(terms))));
+ }
+ Collections.addAll(allAutomata, automata);
+ for (SpanQuery spanQuery : strictPhrases.getSpanQueries()) {
+ Collections.addAll(allAutomata,
+ MultiTermHighlighting.extractAutomata(spanQuery, field, true, multiTermQueryRewrite));//true==lookInSpan
+ }
+
+ if (allAutomata.size() == 1) {
+ return allAutomata.get(0);
+ }
+ //TODO it'd be nice if we could get at the underlying Automaton in CharacterRunAutomaton so that we
+ // could union them all. But it's not exposed, and note TermRangeQuery isn't modelled as an Automaton
+ // by MultiTermHighlighting.
+
+ // Return an aggregate CharacterRunAutomaton of others
+ return new CharacterRunAutomaton(Automata.makeEmpty()) {// the makeEmpty() is bogus; won't be used
+ @Override
+ public boolean run(char[] chars, int offset, int length) {
+ for (int i = 0; i < allAutomata.size(); i++) {// don't use foreach to avoid Iterator allocation
+ if (allAutomata.get(i).run(chars, offset, length)) {
+ return true;
+ }
+ }
+ return false;
+ }
+ };
+ }
+
+ @Override
+ public List<OffsetsEnum> getOffsetsEnums(IndexReader reader, int docId, String content) throws IOException {
+ // note: don't need LimitTokenOffsetFilter since content is already truncated to maxLength
+ TokenStream tokenStream = tokenStream(content);
+
+ // Filter the tokenStream to applicable terms
+ tokenStream = newKeepWordFilter(tokenStream, preMemIndexFilterAutomaton);
+ memoryIndex.reset();
+ memoryIndex.addField(field, tokenStream);//note: calls tokenStream.reset() & close()
+ docId = 0;
+
+ return createOffsetsEnumsFromReader(leafReader, docId);
+ }
+
+
+ private static FilteringTokenFilter newKeepWordFilter(final TokenStream tokenStream,
+ final CharacterRunAutomaton charRunAutomaton) {
+ // it'd be nice to use KeepWordFilter but it demands a CharArraySet. TODO File JIRA? Need a new interface?
+ return new FilteringTokenFilter(tokenStream) {
+ final CharTermAttribute charAtt = addAttribute(CharTermAttribute.class);
+
+ @Override
+ protected boolean accept() throws IOException {
+ return charRunAutomaton.run(charAtt.buffer(), 0, charAtt.length());
+ }
+ };
+ }
+
+}
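MemoryIndexOffsetStrategy above re-analyzes the document text into a MemoryIndex and then reads offsets back through the same LeafReader path as the postings-based strategies (createOffsetsEnumsFromReader). The MemoryIndex round-trip it relies on, shown in isolation as a sketch ("body", "text" and "analyzer" are illustrative assumptions):

  // Sketch: index one value with offsets stored, then read it back as a LeafReader.
  MemoryIndex memoryIndex = new MemoryIndex(true, false); // storeOffsets=true, storePayloads=false
  memoryIndex.addField("body", analyzer.tokenStream("body", text));
  LeafReader leafReader = (LeafReader) memoryIndex.createSearcher().getIndexReader();
  Terms terms = leafReader.terms("body"); // per-term offsets now available to the highlighter
  memoryIndex.reset(); // clears the in-memory index so it can be re-used for the next document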
diff --git a/lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/MultiTermHighlighting.java b/lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/MultiTermHighlighting.java
index e85fa3bffa9..fd6a26a778f 100644
--- a/lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/MultiTermHighlighting.java
+++ b/lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/MultiTermHighlighting.java
@@ -16,8 +16,6 @@
*/
package org.apache.lucene.search.uhighlight;
-import java.io.Closeable;
-import java.io.IOException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collection;
@@ -25,15 +23,7 @@ import java.util.Comparator;
import java.util.List;
import java.util.function.Function;
-import org.apache.lucene.analysis.TokenStream;
-import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
-import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
-import org.apache.lucene.index.FilterLeafReader;
-import org.apache.lucene.index.FilteredTermsEnum;
-import org.apache.lucene.index.PostingsEnum;
import org.apache.lucene.index.Term;
-import org.apache.lucene.index.Terms;
-import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.search.AutomatonQuery;
import org.apache.lucene.search.BooleanClause;
import org.apache.lucene.search.BooleanQuery;
@@ -48,9 +38,7 @@ import org.apache.lucene.search.spans.SpanNearQuery;
import org.apache.lucene.search.spans.SpanNotQuery;
import org.apache.lucene.search.spans.SpanOrQuery;
import org.apache.lucene.search.spans.SpanPositionCheckQuery;
-import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.CharsRef;
-import org.apache.lucene.util.CharsRefBuilder;
import org.apache.lucene.util.UnicodeUtil;
import org.apache.lucene.util.automaton.Automata;
import org.apache.lucene.util.automaton.Automaton;
@@ -210,182 +198,4 @@ class MultiTermHighlighting {
return list.toArray(new CharacterRunAutomaton[list.size()]);
}
- /**
- * Returns a "fake" DocsAndPositionsEnum over the tokenstream, returning offsets where {@code matchers}
- * matches tokens.
- *
- * This is solely used internally by PostingsHighlighter: DO NOT USE THIS METHOD!
- */
- public static PostingsEnum getDocsEnum(final TokenStream ts, final CharacterRunAutomaton[] matchers) throws IOException {
- return new TokenStreamPostingsEnum(ts, matchers);
- }
-
- // TODO: we could use CachingWrapperFilter, (or consume twice) to allow us to have a true freq()
- // but this would have a performance cost for likely little gain in the user experience, it
- // would only serve to make this method less bogus.
- // instead, we always return freq() = Integer.MAX_VALUE and let the highlighter terminate based on offset...
- // TODO: DWS perhaps instead OffsetsEnum could become abstract and this would be an impl?
- private static class TokenStreamPostingsEnum extends PostingsEnum implements Closeable {
- TokenStream stream; // becomes null when closed
- final CharacterRunAutomaton[] matchers;
- final CharTermAttribute charTermAtt;
- final OffsetAttribute offsetAtt;
-
- int currentDoc = -1;
- int currentMatch = -1;
- int currentStartOffset = -1;
-
- int currentEndOffset = -1;
-
- final BytesRef matchDescriptions[];
-
- TokenStreamPostingsEnum(TokenStream ts, CharacterRunAutomaton[] matchers) throws IOException {
- this.stream = ts;
- this.matchers = matchers;
- matchDescriptions = new BytesRef[matchers.length];
- charTermAtt = ts.addAttribute(CharTermAttribute.class);
- offsetAtt = ts.addAttribute(OffsetAttribute.class);
- ts.reset();
- }
-
- @Override
- public int nextPosition() throws IOException {
- if (stream != null) {
- while (stream.incrementToken()) {
- for (int i = 0; i < matchers.length; i++) {
- if (matchers[i].run(charTermAtt.buffer(), 0, charTermAtt.length())) {
- currentStartOffset = offsetAtt.startOffset();
- currentEndOffset = offsetAtt.endOffset();
- currentMatch = i;
- return 0;
- }
- }
- }
- stream.end();
- close();
- }
- // exhausted
- currentStartOffset = currentEndOffset = Integer.MAX_VALUE;
- return Integer.MAX_VALUE;
- }
-
- @Override
- public int freq() throws IOException {
- return Integer.MAX_VALUE; // lie
- }
-
- @Override
- public int startOffset() throws IOException {
- assert currentStartOffset >= 0;
- return currentStartOffset;
- }
-
- @Override
- public int endOffset() throws IOException {
- assert currentEndOffset >= 0;
- return currentEndOffset;
- }
-
- @Override
- public BytesRef getPayload() throws IOException {
- if (matchDescriptions[currentMatch] == null) {
- matchDescriptions[currentMatch] = new BytesRef(matchers[currentMatch].toString());
- }
- return matchDescriptions[currentMatch];
- }
-
- @Override
- public int docID() {
- return currentDoc;
- }
-
- @Override
- public int nextDoc() throws IOException {
- throw new UnsupportedOperationException();
- }
-
- @Override
- public int advance(int target) throws IOException {
- return currentDoc = target;
- }
-
- @Override
- public long cost() {
- return 0;
- }
-
- @Override
- public void close() throws IOException {
- if (stream != null) {
- stream.close();
- stream = null;
- }
- }
- }
-
- /**
- * Return a TokenStream un-inverted from the provided Terms, but filtered based on the automata. The
- * Terms must have exactly one doc count (e.g. term vector or MemoryIndex).
- */
- //TODO: Alternatively, produce a list of OffsetsEnums from the Terms that match the automata.
- public static TokenStream uninvertAndFilterTerms(Terms termsIndex,
- int doc,
- final CharacterRunAutomaton[] automata,
- int offsetLength)
- throws IOException {
- assert automata.length > 0;
- //Note: if automata were plain Automaton (not CharacterRunAutomaton), we might instead use
- // TermsEnum.intersect(compiledAutomaton). But probably won't help due to O(N) TV impl so whatever.
- FilterLeafReader.FilterTerms filteredTermsIndex = new FilterLeafReader.FilterTerms(termsIndex) {
- @Override
- public TermsEnum iterator() throws IOException {
- return new FilteredTermsEnum(super.iterator(), false) {//false == no seek
- CharsRefBuilder tempCharsRefBuilder = new CharsRefBuilder();//reuse only for UTF8->UTF16 call
-
- @Override
- protected AcceptStatus accept(BytesRef termBytesRef) throws IOException {
- //Grab the term (in same way as BytesRef.utf8ToString() but we don't want a String obj)
- tempCharsRefBuilder.grow(termBytesRef.length);
- final int charLen = UnicodeUtil.UTF8toUTF16(termBytesRef, tempCharsRefBuilder.chars());
- for (CharacterRunAutomaton runAutomaton : automata) {
- if (runAutomaton.run(tempCharsRefBuilder.chars(), 0, charLen)) {
- return AcceptStatus.YES;
- }
- }
- return AcceptStatus.NO;
- }
- };
- }
-
- @Override
- public long size() throws IOException {
- return -1; // unknown
- }
-
- @Override
- public long getSumTotalTermFreq() throws IOException {
- return -1; // unknown
- }
-
- @Override
- public long getSumDocFreq() throws IOException {
- return -1; // unknown
- }
- };
- float loadFactor = 1f / 64f;
- return new TokenStreamFromTermVector(filteredTermsIndex, doc, offsetLength, loadFactor);
- }
-
- /**
- * Returns a simple automata that matches the specified term.
- */
- public static CharacterRunAutomaton makeStringMatchAutomata(BytesRef term) {
- String termString = term.utf8ToString();
- return new CharacterRunAutomaton(Automata.makeString(termString)) {
- @Override
- public String toString() {
- return termString;
- }
- };
- }
}
diff --git a/lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/MultiValueTokenStream.java b/lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/MultiValueTokenStream.java
deleted file mode 100644
index 4cbf7542834..00000000000
--- a/lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/MultiValueTokenStream.java
+++ /dev/null
@@ -1,148 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.lucene.search.uhighlight;
-
-import java.io.IOException;
-
-import org.apache.lucene.analysis.Analyzer;
-import org.apache.lucene.analysis.TokenFilter;
-import org.apache.lucene.analysis.TokenStream;
-import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
-import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
-
-/**
- * Wraps an {@link Analyzer} and string text that represents multiple values delimited by a specified character. This
- * exposes a TokenStream that matches what would get indexed considering the
- * {@link Analyzer#getPositionIncrementGap(String)}. Currently this assumes {@link Analyzer#getOffsetGap(String)} is
- * 1; an exception will be thrown if it isn't.
- *
- * It would be more orthogonal for this to be an Analyzer since we're wrapping an Analyzer but doing so seems like
- * more work. The underlying components see a Reader not a String -- and the String is easy to
- * split up without redundant buffering.
- *
- * @lucene.internal
- */
-final class MultiValueTokenStream extends TokenFilter {
-
- private final String fieldName;
- private final Analyzer indexAnalyzer;
- private final String content;
- private final char splitChar;
-
- private final PositionIncrementAttribute posIncAtt = addAttribute(PositionIncrementAttribute.class);
- private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);
-
- private int startValIdx = 0;
- private int endValIdx;
- private int remainingPosInc = 0;
-
- /** note: The caller must remember to close the TokenStream eventually. */
- static TokenStream wrap(String fieldName, Analyzer indexAnalyzer, String content, char splitChar)
- throws IOException {
- if (indexAnalyzer.getOffsetGap(fieldName) != 1) { // note: 1 is the default. It is RARELY changed.
- throw new IllegalArgumentException(
- "offset gap of the provided analyzer should be 1 (field " + fieldName + ")");
- }
- // If there is no splitChar in content then we needn't wrap:
- int splitCharIdx = content.indexOf(splitChar);
- if (splitCharIdx == -1) {
- return indexAnalyzer.tokenStream(fieldName, content);
- }
-
- TokenStream subTokenStream = indexAnalyzer.tokenStream(fieldName, content.substring(0, splitCharIdx));
-
- return new MultiValueTokenStream(subTokenStream, fieldName, indexAnalyzer, content, splitChar, splitCharIdx);
- }
-
- private MultiValueTokenStream(TokenStream subTokenStream, String fieldName, Analyzer indexAnalyzer,
- String content, char splitChar, int splitCharIdx) {
- super(subTokenStream); // subTokenStream is already initialized to operate on the first value
- this.fieldName = fieldName;
- this.indexAnalyzer = indexAnalyzer;
- this.content = content;
- this.splitChar = splitChar;
- this.endValIdx = splitCharIdx;
- }
-
- @Override
- public void reset() throws IOException {
- if (startValIdx != 0) {
- throw new IllegalStateException("This TokenStream wasn't developed to be re-used.");
- // ... although we could if a need for it arises.
- }
- super.reset();
- }
-
- @Override
- public boolean incrementToken() throws IOException {
- while (true) {
-
- if (input.incrementToken()) {
- // Position tracking:
- if (remainingPosInc > 0) {//usually true first token of additional values (not first val)
- posIncAtt.setPositionIncrement(remainingPosInc + posIncAtt.getPositionIncrement());
- remainingPosInc = 0;//reset
- }
- // Offset tracking:
- offsetAtt.setOffset(
- startValIdx + offsetAtt.startOffset(),
- startValIdx + offsetAtt.endOffset()
- );
- return true;
- }
-
- if (endValIdx == content.length()) {//no more
- return false;
- }
-
- input.end(); // might adjust position increment
- remainingPosInc += posIncAtt.getPositionIncrement();
- input.close();
- remainingPosInc += indexAnalyzer.getPositionIncrementGap(fieldName);
-
- // Get new tokenStream based on next segment divided by the splitChar
- startValIdx = endValIdx + 1;
- endValIdx = content.indexOf(splitChar, startValIdx);
- if (endValIdx == -1) {//EOF
- endValIdx = content.length();
- }
- TokenStream tokenStream = indexAnalyzer.tokenStream(fieldName, content.substring(startValIdx, endValIdx));
- if (tokenStream != input) {// (input is defined in TokenFilter set in the constructor)
- // This is a grand trick we do -- knowing that the analyzer's re-use strategy is going to produce the
- // very same tokenStream instance and thus have the same AttributeSource as this wrapping TokenStream
- // since we used it as our input in the constructor.
- // Were this not the case, we'd have to copy every attribute of interest since we can't alter the
- // AttributeSource of this wrapping TokenStream post-construction (it's all private/final).
- // If this is a problem, we could do that instead; maybe with a custom CharTermAttribute that allows
- // us to easily set the char[] reference without literally copying char by char.
- throw new IllegalStateException("Require TokenStream re-use. Unsupported re-use strategy?: " +
- indexAnalyzer.getReuseStrategy());
- }
- tokenStream.reset();
- } // while loop to increment token of this new value
- }
-
- @Override
- public void end() throws IOException {
- super.end();
- // Offset tracking:
- offsetAtt.setOffset(
- startValIdx + offsetAtt.startOffset(),
- startValIdx + offsetAtt.endOffset());
- }
-
-}
diff --git a/lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/OffsetsEnum.java b/lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/OffsetsEnum.java
index af29ef18750..cbaeb90621f 100644
--- a/lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/OffsetsEnum.java
+++ b/lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/OffsetsEnum.java
@@ -76,6 +76,7 @@ public class OffsetsEnum implements Comparable<OffsetsEnum>, Closeable {
}
void nextPosition() throws IOException {
+ assert hasMorePositions();
pos++;
postingsEnum.nextPosition();
}
diff --git a/lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/Passage.java b/lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/Passage.java
index f4caaa06dc6..de37d5da3a3 100644
--- a/lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/Passage.java
+++ b/lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/Passage.java
@@ -40,7 +40,7 @@ public final class Passage {
BytesRef matchTerms[] = new BytesRef[8];
int numMatches = 0;
- void addMatch(int startOffset, int endOffset, BytesRef term) {
+ public void addMatch(int startOffset, int endOffset, BytesRef term) {
assert startOffset >= this.startOffset && startOffset <= this.endOffset;
if (numMatches == matchStarts.length) {
int newLength = ArrayUtil.oversize(numMatches+1, RamUsageEstimator.NUM_BYTES_OBJECT_REF);
diff --git a/lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/PhraseHelper.java b/lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/PhraseHelper.java
index 95d51c917da..cde17baf87a 100644
--- a/lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/PhraseHelper.java
+++ b/lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/PhraseHelper.java
@@ -266,7 +266,7 @@ public class PhraseHelper {
}
/**
- * Returns terms as a List, but expanded to any terms in strictPhrases' keySet if present. That can only
+ * Returns terms as a List, but expanded to any terms in phraseHelper's keySet if present. That can only
* happen if willRewrite() is true.
*/
List expandTermsIfRewrite(BytesRef[] terms, Map strictPhrasesTermToSpans) {
diff --git a/lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/PostingsOffsetStrategy.java b/lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/PostingsOffsetStrategy.java
index 4666906c091..975d3a1dcc1 100644
--- a/lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/PostingsOffsetStrategy.java
+++ b/lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/PostingsOffsetStrategy.java
@@ -41,7 +41,7 @@ public class PostingsOffsetStrategy extends FieldOffsetStrategy {
@Override
public List<OffsetsEnum> getOffsetsEnums(IndexReader reader, int docId, String content) throws IOException {
- LeafReader leafReader;
+ final LeafReader leafReader;
if (reader instanceof LeafReader) {
leafReader = (LeafReader) reader;
} else {
@@ -54,6 +54,7 @@ public class PostingsOffsetStrategy extends FieldOffsetStrategy {
return createOffsetsEnumsFromReader(leafReader, docId);
}
+
@Override
public UnifiedHighlighter.OffsetSource getOffsetSource() {
return UnifiedHighlighter.OffsetSource.POSTINGS;
diff --git a/lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/PostingsWithTermVectorsOffsetStrategy.java b/lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/PostingsWithTermVectorsOffsetStrategy.java
index 81de3798a65..b9086a7400a 100644
--- a/lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/PostingsWithTermVectorsOffsetStrategy.java
+++ b/lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/PostingsWithTermVectorsOffsetStrategy.java
@@ -20,7 +20,6 @@ import java.io.IOException;
import java.util.Collections;
import java.util.List;
-import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.LeafReader;
import org.apache.lucene.index.LeafReaderContext;
@@ -58,14 +57,11 @@ public class PostingsWithTermVectorsOffsetStrategy extends FieldOffsetStrategy {
}
leafReader = new TermVectorFilteredLeafReader(leafReader, docTerms);
- TokenStream tokenStream = automata.length > 0 ? MultiTermHighlighting
- .uninvertAndFilterTerms(leafReader.terms(field), docId, this.automata, content.length()) : null;
-
- return createOffsetsEnums(leafReader, docId, tokenStream);
+ return createOffsetsEnumsFromReader(leafReader, docId);
}
@Override
public UnifiedHighlighter.OffsetSource getOffsetSource() {
- return UnifiedHighlighter.OffsetSource.POSTINGS;
+ return UnifiedHighlighter.OffsetSource.POSTINGS_WITH_TERM_VECTORS;
}
}
diff --git a/lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/TermVectorOffsetStrategy.java b/lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/TermVectorOffsetStrategy.java
index 204679b7652..f6eedc41766 100644
--- a/lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/TermVectorOffsetStrategy.java
+++ b/lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/TermVectorOffsetStrategy.java
@@ -20,7 +20,6 @@ import java.io.IOException;
import java.util.Collections;
import java.util.List;
-import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.LeafReader;
import org.apache.lucene.index.Terms;
@@ -51,18 +50,10 @@ public class TermVectorOffsetStrategy extends FieldOffsetStrategy {
return Collections.emptyList();
}
- LeafReader leafReader = null;
- if ((terms.length > 0) || strictPhrases.willRewrite()) {
- leafReader = new TermVectorLeafReader(field, tvTerms);
- docId = 0;
- }
+ LeafReader leafReader = new TermVectorLeafReader(field, tvTerms);
+ docId = 0;
- TokenStream tokenStream = null;
- if (automata.length > 0) {
- tokenStream = MultiTermHighlighting.uninvertAndFilterTerms(tvTerms, 0, automata, content.length());
- }
-
- return createOffsetsEnums(leafReader, docId, tokenStream);
+ return createOffsetsEnumsFromReader(leafReader, docId);
}
}
diff --git a/lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/TokenStreamFromTermVector.java b/lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/TokenStreamFromTermVector.java
deleted file mode 100644
index 980c5662d3e..00000000000
--- a/lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/TokenStreamFromTermVector.java
+++ /dev/null
@@ -1,395 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.lucene.search.uhighlight;
-
-import java.io.IOException;
-import java.util.ArrayList;
-import java.util.Collections;
-import java.util.List;
-
-import org.apache.lucene.analysis.TokenStream;
-import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
-import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
-import org.apache.lucene.analysis.tokenattributes.PayloadAttribute;
-import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
-import org.apache.lucene.index.PostingsEnum;
-import org.apache.lucene.index.Terms;
-import org.apache.lucene.index.TermsEnum;
-import org.apache.lucene.util.BytesRef;
-import org.apache.lucene.util.BytesRefArray;
-import org.apache.lucene.util.BytesRefBuilder;
-import org.apache.lucene.util.CharsRefBuilder;
-import org.apache.lucene.util.Counter;
-import org.apache.lucene.util.UnicodeUtil;
-
-/**
- * TokenStream created from a term vector field. The term vector requires positions and/or offsets (either). If you
- * want payloads add PayloadAttributeImpl (as you would normally) but don't assume the attribute is already added just
- * because you know the term vector has payloads, since the first call to incrementToken() will observe if you asked
- * for them and if not then won't get them. This TokenStream supports an efficient {@link #reset()}, so there's
- * no need to wrap with a caching impl.
- *
- * @lucene.internal
- */
-final class TokenStreamFromTermVector extends TokenStream {
- // note: differs from similar class in the standard highlighter. This one is optimized for sparse cases.
-
- /**
- * content length divided by distinct positions; an average of dense text.
- */
- private static final double AVG_CHARS_PER_POSITION = 6;
-
- private static final int INSERTION_SORT_THRESHOLD = 16;
-
- private final Terms vector;
-
- private final int filteredDocId;
-
- private final CharTermAttribute termAttribute;
-
- private final PositionIncrementAttribute positionIncrementAttribute;
-
- private final int offsetLength;
-
- private final float loadFactor;
-
- private OffsetAttribute offsetAttribute;//maybe null
-
- private PayloadAttribute payloadAttribute;//maybe null
-
- private CharsRefBuilder termCharsBuilder;//term data here
-
- private BytesRefArray payloadsBytesRefArray;//only used when payloadAttribute is non-null
- private BytesRefBuilder spareBytesRefBuilder;//only used when payloadAttribute is non-null
-
- private TokenLL firstToken = null; // the head of a linked-list
-
- private TokenLL incrementToken = null;
-
- private boolean initialized = false;//lazy
-
- public TokenStreamFromTermVector(Terms vector, int offsetLength) throws IOException {
- this(vector, 0, offsetLength, 1f);
- }
-
- /**
- * Constructor.
- *
- * @param vector Terms that contains the data for
- * creating the TokenStream. Must have positions and/or offsets.
- * @param filteredDocId The docID we will process.
- * @param offsetLength Supply the character length of the text being uninverted, or a lower value if you don't want
- * to invert text beyond an offset (in so doing this will act as a filter). If you don't
- * know the length, pass -1. In conjunction with {@code loadFactor}, it's used to
- * determine how many buckets to create during uninversion.
- * It's also used to filter out tokens with a start offset exceeding this value.
- * @param loadFactor The percent of tokens from the original terms (by position count) that are
- * expected to be inverted. If they are filtered (e.g.
- * {@link org.apache.lucene.index.FilterLeafReader.FilterTerms})
- * then consider using less than 1.0 to avoid wasting space.
- * 1.0 means all, 1/64th would suggest 1/64th of all tokens coming from vector.
- */
- TokenStreamFromTermVector(Terms vector, int filteredDocId, int offsetLength, float loadFactor) throws IOException {
- super();
- this.filteredDocId = filteredDocId;
- this.offsetLength = offsetLength == Integer.MAX_VALUE ? -1 : offsetLength;
- if (loadFactor <= 0f || loadFactor > 1f) {
- throw new IllegalArgumentException("loadFactor should be > 0 and <= 1");
- }
- this.loadFactor = loadFactor;
- assert !hasAttribute(PayloadAttribute.class) : "AttributeFactory shouldn't have payloads *yet*";
- if (!vector.hasPositions() && !vector.hasOffsets()) {
- throw new IllegalArgumentException("The term vector needs positions and/or offsets.");
- }
- assert vector.hasFreqs();
- this.vector = vector;
- termAttribute = addAttribute(CharTermAttribute.class);
- positionIncrementAttribute = addAttribute(PositionIncrementAttribute.class);
- }
-
- public Terms getTermVectorTerms() {
- return vector;
- }
-
- @Override
- public void reset() throws IOException {
- incrementToken = null;
- super.reset();
- }
-
- //We delay initialization because we can see which attributes the consumer wants, particularly payloads
- private void init() throws IOException {
- assert !initialized;
- int dpEnumFlags = 0;
- if (vector.hasOffsets()) {
- offsetAttribute = addAttribute(OffsetAttribute.class);
- dpEnumFlags |= PostingsEnum.OFFSETS;
- }
- if (vector.hasPayloads() && hasAttribute(PayloadAttribute.class)) {
- payloadAttribute = getAttribute(PayloadAttribute.class);
- payloadsBytesRefArray = new BytesRefArray(Counter.newCounter());
- spareBytesRefBuilder = new BytesRefBuilder();
- dpEnumFlags |= PostingsEnum.PAYLOADS;
- }
-
- // We put term data here
- termCharsBuilder = new CharsRefBuilder();
- termCharsBuilder.grow(initTotalTermCharLen());
-
- // Step 1: iterate termsEnum and create a token, placing into a bucketed array (given a load factor)
-
- final TokenLL[] tokenBuckets = initTokenBucketsArray();
- final double OFFSET_TO_BUCKET_IDX = loadFactor / AVG_CHARS_PER_POSITION;
- final double POSITION_TO_BUCKET_IDX = loadFactor;
-
- final TermsEnum termsEnum = vector.iterator();
- BytesRef termBytesRef;
- PostingsEnum dpEnum = null;
- final CharsRefBuilder tempCharsRefBuilder = new CharsRefBuilder();//only for UTF8->UTF16 call
-
- TERM_LOOP:
- while ((termBytesRef = termsEnum.next()) != null) {
- //Grab the term (in same way as BytesRef.utf8ToString() but we don't want a String obj)
- // note: if term vectors supported seek by ord then we might just keep an int and seek by ord on-demand
- tempCharsRefBuilder.grow(termBytesRef.length);
- final int termCharsLen = UnicodeUtil.UTF8toUTF16(termBytesRef, tempCharsRefBuilder.chars());
- final int termCharsOff = termCharsBuilder.length();
- termCharsBuilder.append(tempCharsRefBuilder.chars(), 0, termCharsLen);
- dpEnum = termsEnum.postings(dpEnum, dpEnumFlags);
- assert dpEnum != null; // presumably checked by TokenSources.hasPositions earlier
- int currentDocId = dpEnum.advance(filteredDocId);
- if (currentDocId != filteredDocId) {
- continue; //Not expected
- }
- final int freq = dpEnum.freq();
- for (int j = 0; j < freq; j++) {
- TokenLL token = new TokenLL();
- token.position = dpEnum.nextPosition(); // can be -1 if not in the TV
- token.termCharsOff = termCharsOff;
- token.termCharsLen = (short) Math.min(termCharsLen, Short.MAX_VALUE);
- // copy offset (if it's there) and compute bucketIdx
- int bucketIdx;
- if (offsetAttribute != null) {
- token.startOffset = dpEnum.startOffset();
- if (offsetLength >= 0 && token.startOffset > offsetLength) {
- continue TERM_LOOP;//filter this token out; exceeds threshold
- }
- token.endOffsetInc = (short) Math.min(dpEnum.endOffset() - token.startOffset, Short.MAX_VALUE);
- bucketIdx = (int) (token.startOffset * OFFSET_TO_BUCKET_IDX);
- } else {
- bucketIdx = (int) (token.position * POSITION_TO_BUCKET_IDX);
- }
- if (bucketIdx >= tokenBuckets.length) {
- bucketIdx = tokenBuckets.length - 1;
- }
-
- if (payloadAttribute != null) {
- final BytesRef payload = dpEnum.getPayload();
- token.payloadIndex = payload == null ? -1 : payloadsBytesRefArray.append(payload);
- }
-
- //Add token to the head of the bucket linked list
- token.next = tokenBuckets[bucketIdx];
- tokenBuckets[bucketIdx] = token;
- }
- }
-
- // Step 2: Link all Tokens into a linked-list and sort all tokens at the same position
-
- firstToken = initLinkAndSortTokens(tokenBuckets);
-
- // If the term vector didn't have positions, synthesize them
- if (!vector.hasPositions() && firstToken != null) {
- TokenLL prevToken = firstToken;
- prevToken.position = 0;
- for (TokenLL token = prevToken.next; token != null; prevToken = token, token = token.next) {
- if (prevToken.startOffset == token.startOffset) {
- token.position = prevToken.position;
- } else {
- token.position = prevToken.position + 1;
- }
- }
- }
-
- initialized = true;
- }
-
- private static TokenLL initLinkAndSortTokens(TokenLL[] tokenBuckets) {
- TokenLL firstToken = null;
- List<TokenLL> scratchTokenArray = new ArrayList<>(); // declare here for re-use. TODO use native array
- TokenLL prevToken = null;
- for (TokenLL tokenHead : tokenBuckets) {
- if (tokenHead == null) {
- continue;
- }
- //sort tokens at this position and link them; return the first
- TokenLL tokenTail;
- // just one token
- if (tokenHead.next == null) {
- tokenTail = tokenHead;
- } else {
- // add the linked list to a temporary array
- for (TokenLL cur = tokenHead; cur != null; cur = cur.next) {
- scratchTokenArray.add(cur);
- }
- // sort; and set tokenHead & tokenTail
- if (scratchTokenArray.size() < INSERTION_SORT_THRESHOLD) {
- // insertion sort by creating a linked list (leave scratchTokenArray alone)
- tokenHead = tokenTail = scratchTokenArray.get(0);
- tokenHead.next = null;
- for (int i = 1; i < scratchTokenArray.size(); i++) {
- TokenLL insertToken = scratchTokenArray.get(i);
- if (insertToken.compareTo(tokenHead) <= 0) {
- // takes the place of tokenHead
- insertToken.next = tokenHead;
- tokenHead = insertToken;
- } else {
- // goes somewhere after tokenHead
- for (TokenLL prev = tokenHead; true; prev = prev.next) {
- if (prev.next == null || insertToken.compareTo(prev.next) <= 0) {
- if (prev.next == null) {
- tokenTail = insertToken;
- }
- insertToken.next = prev.next;
- prev.next = insertToken;
- break;
- }
- }
- }
- }
- } else {
- Collections.sort(scratchTokenArray);
- // take back out and create a linked list
- TokenLL prev = tokenHead = scratchTokenArray.get(0);
- for (int i = 1; i < scratchTokenArray.size(); i++) {
- prev.next = scratchTokenArray.get(i);
- prev = prev.next;
- }
- tokenTail = prev;
- tokenTail.next = null;
- }
- scratchTokenArray.clear();//too bad ArrayList nulls it out; we don't actually need that
- }
-
- //link to previous
- if (prevToken != null) {
- assert prevToken.next == null;
- prevToken.next = tokenHead; //concatenate linked-list
- assert prevToken.compareTo(tokenHead) < 0 : "wrong offset / position ordering expectations";
- } else {
- assert firstToken == null;
- firstToken = tokenHead;
- }
-
- prevToken = tokenTail;
- }
- return firstToken;
- }
-
- private int initTotalTermCharLen() throws IOException {
- int guessNumTerms;
- if (vector.size() != -1) {
- guessNumTerms = (int) vector.size();
- } else if (offsetLength != -1) {
- guessNumTerms = (int) (offsetLength * 0.33);//guess 1/3rd
- } else {
- return 128;
- }
- return Math.max(64, (int) (guessNumTerms * loadFactor * 7.0));//7 is over-estimate of average term len
- }
-
- private TokenLL[] initTokenBucketsArray() throws IOException {
- // Estimate the number of non-empty positions (number of tokens, excluding same-position synonyms).
- int positionsEstimate;
- if (offsetLength == -1) { // no clue what the char length is.
- // Estimate the number of position slots we need from term stats based on Wikipedia.
- int sumTotalTermFreq = (int) vector.getSumTotalTermFreq();
- if (sumTotalTermFreq == -1) {//unfortunately term vectors seem to not have this stat
- int size = (int) vector.size();
- if (size == -1) {//doesn't happen with term vectors, it seems, but pick a default any way
- size = 128;
- }
- sumTotalTermFreq = (int) (size * 2.4);
- }
- positionsEstimate = (int) (sumTotalTermFreq * 1.5);//less than 1 in 10 docs exceed this
- } else {
- // guess number of token positions by this factor.
- positionsEstimate = (int) (offsetLength / AVG_CHARS_PER_POSITION);
- }
- // apply the load factor.
- return new TokenLL[Math.max(1, (int) (positionsEstimate * loadFactor))];
- }
-
- @Override
- public boolean incrementToken() throws IOException {
- int posInc;
- if (incrementToken == null) {
- if (!initialized) {
- init();
- assert initialized;
- }
- incrementToken = firstToken;
- if (incrementToken == null) {
- return false;
- }
- posInc = incrementToken.position + 1;//first token normally has pos 0; add 1 to get posInc
- } else if (incrementToken.next != null) {
- int lastPosition = incrementToken.position;
- incrementToken = incrementToken.next;
- posInc = incrementToken.position - lastPosition;
- } else {
- return false;
- }
- clearAttributes();
- termAttribute.copyBuffer(termCharsBuilder.chars(), incrementToken.termCharsOff, incrementToken.termCharsLen);
-
- positionIncrementAttribute.setPositionIncrement(posInc);
- if (offsetAttribute != null) {
- offsetAttribute.setOffset(incrementToken.startOffset, incrementToken.startOffset + incrementToken.endOffsetInc);
- }
- if (payloadAttribute != null && incrementToken.payloadIndex >= 0) {
- payloadAttribute.setPayload(payloadsBytesRefArray.get(spareBytesRefBuilder, incrementToken.payloadIndex));
- }
- return true;
- }
-
- private static class TokenLL implements Comparable<TokenLL> {
- // This class should weigh 32 bytes, including object header
-
- int termCharsOff; // see termCharsBuilder
- short termCharsLen;
-
- int position;
- int startOffset;
- short endOffsetInc; // add to startOffset to get endOffset
- int payloadIndex;
-
- TokenLL next;
-
- @Override
- public int compareTo(TokenLL tokenB) {
- int cmp = Integer.compare(this.position, tokenB.position);
- if (cmp == 0) {
- cmp = Integer.compare(this.startOffset, tokenB.startOffset);
- if (cmp == 0) {
- cmp = Short.compare(this.endOffsetInc, tokenB.endOffsetInc);
- }
- }
- return cmp;
- }
- }
-}
diff --git a/lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/TokenStreamOffsetStrategy.java b/lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/TokenStreamOffsetStrategy.java
new file mode 100644
index 00000000000..966eeef9116
--- /dev/null
+++ b/lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/TokenStreamOffsetStrategy.java
@@ -0,0 +1,173 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.lucene.search.uhighlight;
+
+import java.io.Closeable;
+import java.io.IOException;
+import java.util.Collections;
+import java.util.List;
+
+import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
+import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
+import org.apache.lucene.index.IndexReader;
+import org.apache.lucene.index.PostingsEnum;
+import org.apache.lucene.util.BytesRef;
+import org.apache.lucene.util.automaton.Automata;
+import org.apache.lucene.util.automaton.CharacterRunAutomaton;
+
+/**
+ * Analyzes the text, producing a single {@link OffsetsEnum} wrapping the {@link TokenStream} filtered to terms
+ * in the query, including wildcards. It can't handle position-sensitive queries (phrases). Passage accuracy suffers
+ * because the freq() is unknown -- it's always {@link Integer#MAX_VALUE} instead.
+ */
+public class TokenStreamOffsetStrategy extends AnalysisOffsetStrategy {
+
+ private static final BytesRef[] ZERO_LEN_BYTES_REF_ARRAY = new BytesRef[0];
+
+ public TokenStreamOffsetStrategy(String field, BytesRef[] terms, PhraseHelper phraseHelper, CharacterRunAutomaton[] automata, Analyzer indexAnalyzer) {
+ super(field, ZERO_LEN_BYTES_REF_ARRAY, phraseHelper, convertTermsToAutomata(terms, automata), indexAnalyzer);
+ assert phraseHelper.hasPositionSensitivity() == false;
+ }
+
+ private static CharacterRunAutomaton[] convertTermsToAutomata(BytesRef[] terms, CharacterRunAutomaton[] automata) {
+ CharacterRunAutomaton[] newAutomata = new CharacterRunAutomaton[terms.length + automata.length];
+ for (int i = 0; i < terms.length; i++) {
+ String termString = terms[i].utf8ToString();
+ newAutomata[i] = new CharacterRunAutomaton(Automata.makeString(termString)) {
+ @Override
+ public String toString() {
+ return termString;
+ }
+ };
+ }
+ // Append existing automata (that which is used for MTQs)
+ System.arraycopy(automata, 0, newAutomata, terms.length, automata.length);
+ return newAutomata;
+ }
+
+ @Override
+ public List<OffsetsEnum> getOffsetsEnums(IndexReader reader, int docId, String content) throws IOException {
+ TokenStream tokenStream = tokenStream(content);
+ PostingsEnum mtqPostingsEnum = new TokenStreamPostingsEnum(tokenStream, automata);
+ mtqPostingsEnum.advance(docId);
+ return Collections.singletonList(new OffsetsEnum(null, mtqPostingsEnum));
+ }
+
+ // A true freq() could be computed, but this would have a performance cost for likely little gain
+ // in the user experience; it would only serve to make this method less bogus.
+ // Instead, we always return freq() = Integer.MAX_VALUE and let the highlighter terminate based on offset...
+ // TODO: DWS perhaps instead OffsetsEnum could become abstract and this would be an impl?
+ private static class TokenStreamPostingsEnum extends PostingsEnum implements Closeable {
+ TokenStream stream; // becomes null when closed
+ final CharacterRunAutomaton[] matchers;
+ final CharTermAttribute charTermAtt;
+ final OffsetAttribute offsetAtt;
+
+ int currentDoc = -1;
+ int currentMatch = -1;
+ int currentStartOffset = -1;
+
+ int currentEndOffset = -1;
+
+ final BytesRef matchDescriptions[];
+
+ TokenStreamPostingsEnum(TokenStream ts, CharacterRunAutomaton[] matchers) throws IOException {
+ this.stream = ts;
+ this.matchers = matchers;
+ matchDescriptions = new BytesRef[matchers.length];
+ charTermAtt = ts.addAttribute(CharTermAttribute.class);
+ offsetAtt = ts.addAttribute(OffsetAttribute.class);
+ ts.reset();
+ }
+
+ @Override
+ public int nextPosition() throws IOException {
+ if (stream != null) {
+ while (stream.incrementToken()) {
+ for (int i = 0; i < matchers.length; i++) {
+ if (matchers[i].run(charTermAtt.buffer(), 0, charTermAtt.length())) {
+ currentStartOffset = offsetAtt.startOffset();
+ currentEndOffset = offsetAtt.endOffset();
+ currentMatch = i;
+ return 0;
+ }
+ }
+ }
+ stream.end();
+ close();
+ }
+ // exhausted
+ currentStartOffset = currentEndOffset = Integer.MAX_VALUE;
+ return Integer.MAX_VALUE;
+ }
+
+ @Override
+ public int freq() throws IOException {
+ return Integer.MAX_VALUE; // lie
+ }
+
+ @Override
+ public int startOffset() throws IOException {
+ assert currentStartOffset >= 0;
+ return currentStartOffset;
+ }
+
+ @Override
+ public int endOffset() throws IOException {
+ assert currentEndOffset >= 0;
+ return currentEndOffset;
+ }
+
+ @Override
+ public BytesRef getPayload() throws IOException {
+ if (matchDescriptions[currentMatch] == null) {
+ matchDescriptions[currentMatch] = new BytesRef(matchers[currentMatch].toString());
+ }
+ return matchDescriptions[currentMatch];
+ }
+
+ @Override
+ public int docID() {
+ return currentDoc;
+ }
+
+ @Override
+ public int nextDoc() throws IOException {
+ throw new UnsupportedOperationException();
+ }
+
+ @Override
+ public int advance(int target) throws IOException {
+ return currentDoc = target;
+ }
+
+ @Override
+ public long cost() {
+ return 0;
+ }
+
+ @Override
+ public void close() throws IOException {
+ if (stream != null) {
+ stream.close();
+ stream = null;
+ }
+ }
+ }
+}
diff --git a/lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/UnifiedHighlighter.java b/lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/UnifiedHighlighter.java
index 5f09d84f033..ac5f0f69999 100644
--- a/lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/UnifiedHighlighter.java
+++ b/lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/UnifiedHighlighter.java
@@ -117,6 +117,8 @@ public class UnifiedHighlighter {
private boolean defaultHighlightPhrasesStrictly = true; // AKA "accuracy" or "query debugging"
+ private boolean defaultPassageRelevancyOverSpeed = true; //For analysis, prefer MemoryIndexOffsetStrategy
+
// private boolean defaultRequireFieldMatch = true; TODO
private int maxLength = DEFAULT_MAX_LENGTH;
@@ -213,6 +215,12 @@ public class UnifiedHighlighter {
return defaultHighlightPhrasesStrictly;
}
+
+ protected boolean shouldPreferPassageRelevancyOverSpeed(String field) {
+ return defaultPassageRelevancyOverSpeed;
+ }
+
+
/**
* The maximum content size to process. Content will be truncated to this size before highlighting. Typically
* snippets closer to the beginning of the document better summarize its content.
@@ -716,8 +724,13 @@ public class UnifiedHighlighter {
}
protected FieldHighlighter getFieldHighlighter(String field, Query query, SortedSet<Term> allTerms, int maxPassages) {
+ BytesRef[] terms = filterExtractedTerms(field, allTerms);
+ Set<HighlightFlag> highlightFlags = getFlags(field);
+ PhraseHelper phraseHelper = getPhraseHelper(field, query, highlightFlags);
+ CharacterRunAutomaton[] automata = getAutomata(field, query, highlightFlags);
+ OffsetSource offsetSource = getOptimizedOffsetSource(field, terms, phraseHelper, automata);
return new FieldHighlighter(field,
- getOffsetStrategy(field, query, allTerms),
+ getOffsetStrategy(offsetSource, field, terms, phraseHelper, automata, highlightFlags),
new SplittingBreakIterator(getBreakIterator(field), UnifiedHighlighter.MULTIVAL_SEP_CHAR),
getScorer(field),
maxPassages,
@@ -725,41 +738,7 @@ public class UnifiedHighlighter {
getFormatter(field));
}
- protected FieldOffsetStrategy getOffsetStrategy(String field, Query query, SortedSet<Term> allTerms) {
- EnumSet<HighlightFlag> highlightFlags = getFlags(field);
- BytesRef[] terms = filterExtractedTerms(field, allTerms);
- PhraseHelper phraseHelper = getPhraseHelper(field, query, highlightFlags);
- CharacterRunAutomaton[] automata = getAutomata(field, query, highlightFlags);
- OffsetSource offsetSource = getOptimizedOffsetSource(field, terms, phraseHelper, automata);
- switch (offsetSource) {
- case ANALYSIS:
- return new AnalysisOffsetStrategy(field, terms, phraseHelper, automata, getIndexAnalyzer(),
- this::preMultiTermQueryRewrite);
- case NONE_NEEDED:
- return NoOpOffsetStrategy.INSTANCE;
- case TERM_VECTORS:
- return new TermVectorOffsetStrategy(field, terms, phraseHelper, automata);
- case POSTINGS:
- return new PostingsOffsetStrategy(field, terms, phraseHelper, automata);
- case POSTINGS_WITH_TERM_VECTORS:
- return new PostingsWithTermVectorsOffsetStrategy(field, terms, phraseHelper, automata);
- default:
- throw new IllegalArgumentException("Unrecognized offset source " + offsetSource);
- }
- }
-
- protected EnumSet<HighlightFlag> getFlags(String field) {
- EnumSet<HighlightFlag> highlightFlags = EnumSet.noneOf(HighlightFlag.class);
- if (shouldHandleMultiTermQuery(field)) {
- highlightFlags.add(HighlightFlag.MULTI_TERM_QUERY);
- }
- if (shouldHighlightPhrasesStrictly(field)) {
- highlightFlags.add(HighlightFlag.PHRASES);
- }
- return highlightFlags;
- }
-
- protected BytesRef[] filterExtractedTerms(String field, SortedSet<Term> queryTerms) {
+ protected static BytesRef[] filterExtractedTerms(String field, SortedSet<Term> queryTerms) {
// TODO consider requireFieldMatch
Term floor = new Term(field, "");
Term ceiling = new Term(field, UnicodeUtil.BIG_TERM);
@@ -774,7 +753,21 @@ public class UnifiedHighlighter {
return terms;
}
- protected PhraseHelper getPhraseHelper(String field, Query query, EnumSet<HighlightFlag> highlightFlags) {
+ protected Set<HighlightFlag> getFlags(String field) {
+ Set<HighlightFlag> highlightFlags = EnumSet.noneOf(HighlightFlag.class);
+ if (shouldHandleMultiTermQuery(field)) {
+ highlightFlags.add(HighlightFlag.MULTI_TERM_QUERY);
+ }
+ if (shouldHighlightPhrasesStrictly(field)) {
+ highlightFlags.add(HighlightFlag.PHRASES);
+ }
+ if (shouldPreferPassageRelevancyOverSpeed(field)) {
+ highlightFlags.add(HighlightFlag.PASSAGE_RELEVANCY_OVER_SPEED);
+ }
+ return highlightFlags;
+ }
+
+ protected PhraseHelper getPhraseHelper(String field, Query query, Set<HighlightFlag> highlightFlags) {
boolean highlightPhrasesStrictly = highlightFlags.contains(HighlightFlag.PHRASES);
boolean handleMultiTermQuery = highlightFlags.contains(HighlightFlag.MULTI_TERM_QUERY);
return highlightPhrasesStrictly ?
@@ -782,7 +775,7 @@ public class UnifiedHighlighter {
PhraseHelper.NONE;
}
- protected CharacterRunAutomaton[] getAutomata(String field, Query query, EnumSet<HighlightFlag> highlightFlags) {
+ protected CharacterRunAutomaton[] getAutomata(String field, Query query, Set<HighlightFlag> highlightFlags) {
return highlightFlags.contains(HighlightFlag.MULTI_TERM_QUERY)
? MultiTermHighlighting.extractAutomata(query, field, !highlightFlags.contains(HighlightFlag.PHRASES),
this::preMultiTermQueryRewrite)
@@ -790,11 +783,12 @@ public class UnifiedHighlighter {
}
protected OffsetSource getOptimizedOffsetSource(String field, BytesRef[] terms, PhraseHelper phraseHelper, CharacterRunAutomaton[] automata) {
+ OffsetSource offsetSource = getOffsetSource(field);
+
if (terms.length == 0 && automata.length == 0 && !phraseHelper.willRewrite()) {
return OffsetSource.NONE_NEEDED; //nothing to highlight
}
- OffsetSource offsetSource = getOffsetSource(field);
switch (offsetSource) {
case POSTINGS:
if (phraseHelper.willRewrite()) {
@@ -822,6 +816,32 @@ public class UnifiedHighlighter {
return offsetSource;
}
+ protected FieldOffsetStrategy getOffsetStrategy(OffsetSource offsetSource, String field, BytesRef[] terms,
+ PhraseHelper phraseHelper, CharacterRunAutomaton[] automata,
+ Set<HighlightFlag> highlightFlags) {
+ switch (offsetSource) {
+ case ANALYSIS:
+ if (!phraseHelper.hasPositionSensitivity() &&
+ !highlightFlags.contains(HighlightFlag.PASSAGE_RELEVANCY_OVER_SPEED)) {
+ //skip using a memory index since it's pure term filtering
+ return new TokenStreamOffsetStrategy(field, terms, phraseHelper, automata, getIndexAnalyzer());
+ } else {
+ return new MemoryIndexOffsetStrategy(field, terms, phraseHelper, automata, getIndexAnalyzer(),
+ this::preMultiTermQueryRewrite);
+ }
+ case NONE_NEEDED:
+ return NoOpOffsetStrategy.INSTANCE;
+ case TERM_VECTORS:
+ return new TermVectorOffsetStrategy(field, terms, phraseHelper, automata);
+ case POSTINGS:
+ return new PostingsOffsetStrategy(field, terms, phraseHelper, automata);
+ case POSTINGS_WITH_TERM_VECTORS:
+ return new PostingsWithTermVectorsOffsetStrategy(field, terms, phraseHelper, automata);
+ default:
+ throw new IllegalArgumentException("Unrecognized offset source " + offsetSource);
+ }
+ }
+
/**
* When highlighting phrases accurately, we need to know which {@link SpanQuery}'s need to have
* {@link Query#rewrite(IndexReader)} called on them. It helps performance to avoid it if it's not needed.
@@ -1041,10 +1061,9 @@ public class UnifiedHighlighter {
*/
public enum HighlightFlag {
PHRASES,
- MULTI_TERM_QUERY
+ MULTI_TERM_QUERY,
+ PASSAGE_RELEVANCY_OVER_SPEED
// TODO: ignoreQueryFields
// TODO: useQueryBoosts
- // TODO: avoidMemoryIndexIfPossible
- // TODO: preferMemoryIndexForStats
}
}
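Usage note (not part of the patch): the new shouldPreferPassageRelevancyOverSpeed() hook above can be overridden to trade passage accuracy for speed when re-analysis is the offset source. A minimal sketch, assuming searcher, analyzer, query and topDocs already exist and "body" is an illustrative field without offsets or term vectors:

import java.io.IOException;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.search.uhighlight.UnifiedHighlighter;

class SpeedOverRelevancyHighlightSketch {
  // With the flag disabled and no position-sensitive (phrase) queries, the ANALYSIS offset source
  // takes the lighter TokenStreamOffsetStrategy path instead of building a MemoryIndex.
  static String[] highlight(IndexSearcher searcher, Analyzer analyzer, Query query, TopDocs topDocs)
      throws IOException {
    UnifiedHighlighter highlighter = new UnifiedHighlighter(searcher, analyzer) {
      @Override
      protected boolean shouldPreferPassageRelevancyOverSpeed(String field) {
        return false; // favor speed; freq() becomes a constant, so passage ranking is cruder
      }
    };
    return highlighter.highlight("body", query, topDocs);
  }
}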
diff --git a/lucene/highlighter/src/test/org/apache/lucene/search/uhighlight/TestUnifiedHighlighterMTQ.java b/lucene/highlighter/src/test/org/apache/lucene/search/uhighlight/TestUnifiedHighlighterMTQ.java
index ddc9507d62b..be0ff1b4948 100644
--- a/lucene/highlighter/src/test/org/apache/lucene/search/uhighlight/TestUnifiedHighlighterMTQ.java
+++ b/lucene/highlighter/src/test/org/apache/lucene/search/uhighlight/TestUnifiedHighlighterMTQ.java
@@ -773,7 +773,40 @@ public class TestUnifiedHighlighterMTQ extends LuceneTestCase {
ir.close();
}
- public void testTokenStreamIsClosed() throws IOException {
+ public void testWithMaxLenAndMultipleWildcardMatches() throws IOException {
+ RandomIndexWriter iw = new RandomIndexWriter(random(), dir, indexAnalyzer);
+
+ Field body = new Field("body", "", fieldType);
+ Document doc = new Document();
+ doc.add(body);
+
+ //tests interleaving of multiple wildcard matches with the CompositePostingsEnum
+ //In this case the CompositePostingsEnum will have an underlying PostingsEnum that jumps from pos 1 to 9 for bravo
+ //and a second with position 2 for Bravado
+ body.setStringValue("Alpha Bravo Bravado foo foo foo. Foo foo Alpha Bravo");
+ iw.addDocument(doc);
+
+ IndexReader ir = iw.getReader();
+ iw.close();
+
+ IndexSearcher searcher = newSearcher(ir);
+ UnifiedHighlighter highlighter = new UnifiedHighlighter(searcher, indexAnalyzer);
+ highlighter.setMaxLength(32);//a little past first sentence
+
+ BooleanQuery query = new BooleanQuery.Builder()
+ .add(new TermQuery(new Term("body", "alpha")), BooleanClause.Occur.MUST)
+ .add(new PrefixQuery(new Term("body", "bra")), BooleanClause.Occur.MUST)
+ .build();
+ TopDocs topDocs = searcher.search(query, 10, Sort.INDEXORDER);
+ String snippets[] = highlighter.highlight("body", query, topDocs, 2);//ask for 2 but we'll only get 1
+ assertArrayEquals(
+ new String[]{"<b>Alpha</b> <b>Bravo</b> <b>Bravado</b> foo foo foo."}, snippets
+ );
+
+ ir.close();
+ }
+
+ public void testTokenStreamIsClosed() throws Exception {
// note: test is a derivative of testWithMaxLen()
RandomIndexWriter iw = new RandomIndexWriter(random(), dir, indexAnalyzer);
@@ -828,8 +861,8 @@ public class TestUnifiedHighlighterMTQ extends LuceneTestCase {
if (fieldType == UHTestHelper.reanalysisType) {
fail("Expecting EXPECTED IOException");
}
- } catch (IOException e) {
- if (!e.getMessage().equals("EXPECTED")) {
+ } catch (Exception e) {
+ if (!e.getMessage().contains("EXPECTED")) {
throw e;
}
}
diff --git a/lucene/highlighter/src/test/org/apache/lucene/search/uhighlight/TestUnifiedHighlighterRanking.java b/lucene/highlighter/src/test/org/apache/lucene/search/uhighlight/TestUnifiedHighlighterRanking.java
index bc2a14d9f9b..64570ae17d6 100644
--- a/lucene/highlighter/src/test/org/apache/lucene/search/uhighlight/TestUnifiedHighlighterRanking.java
+++ b/lucene/highlighter/src/test/org/apache/lucene/search/uhighlight/TestUnifiedHighlighterRanking.java
@@ -50,9 +50,8 @@ public class TestUnifiedHighlighterRanking extends LuceneTestCase {
Analyzer indexAnalyzer;
- // note: don't choose reanalysis because it doesn't always know the term frequency, which is a statistic used
- // in passage ranking. Sometimes it does (e.g. when it builds a MemoryIndex) but not necessarily.
- final FieldType fieldType = UHTestHelper.randomFieldType(random(), UHTestHelper.postingsType, UHTestHelper.tvType);
+ // note: all offset sources, by default, use term freq, so it shouldn't matter which we choose.
+ final FieldType fieldType = UHTestHelper.randomFieldType(random());
/**
* indexes a bunch of gibberish, and then highlights top(n).
diff --git a/lucene/highlighter/src/test/org/apache/lucene/search/uhighlight/visibility/TestUnifiedHighlighterExtensibility.java b/lucene/highlighter/src/test/org/apache/lucene/search/uhighlight/visibility/TestUnifiedHighlighterExtensibility.java
index 641a835733e..d15094000c3 100644
--- a/lucene/highlighter/src/test/org/apache/lucene/search/uhighlight/visibility/TestUnifiedHighlighterExtensibility.java
+++ b/lucene/highlighter/src/test/org/apache/lucene/search/uhighlight/visibility/TestUnifiedHighlighterExtensibility.java
@@ -22,11 +22,13 @@ import java.text.BreakIterator;
import java.util.Collections;
import java.util.List;
import java.util.Map;
+import java.util.Set;
import java.util.SortedSet;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.MockAnalyzer;
import org.apache.lucene.index.IndexReader;
+import org.apache.lucene.index.LeafReader;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.DocIdSetIterator;
import org.apache.lucene.search.IndexSearcher;
@@ -68,6 +70,11 @@ public class TestUnifiedHighlighterExtensibility extends LuceneTestCase {
return Collections.emptyList();
}
+ @Override
+ protected List<OffsetsEnum> createOffsetsEnumsFromReader(LeafReader leafReader, int doc) throws IOException {
+ return super.createOffsetsEnumsFromReader(leafReader, doc);
+ }
+
};
assertEquals(offsetSource, strategy.getOffsetSource());
}
@@ -142,8 +149,8 @@ public class TestUnifiedHighlighterExtensibility extends LuceneTestCase {
}
@Override
- protected FieldOffsetStrategy getOffsetStrategy(String field, Query query, SortedSet<Term> allTerms) {
- return super.getOffsetStrategy(field, query, allTerms);
+ protected FieldOffsetStrategy getOffsetStrategy(OffsetSource offsetSource, String field, BytesRef[] terms, PhraseHelper phraseHelper, CharacterRunAutomaton[] automata, Set<HighlightFlag> highlightFlags) {
+ return super.getOffsetStrategy(offsetSource, field, terms, phraseHelper, automata, highlightFlags);
}
@Override
diff --git a/lucene/test-framework/src/java/org/apache/lucene/index/BaseSegmentInfoFormatTestCase.java b/lucene/test-framework/src/java/org/apache/lucene/index/BaseSegmentInfoFormatTestCase.java
index 49d19ae4322..ae5416fa479 100644
--- a/lucene/test-framework/src/java/org/apache/lucene/index/BaseSegmentInfoFormatTestCase.java
+++ b/lucene/test-framework/src/java/org/apache/lucene/index/BaseSegmentInfoFormatTestCase.java
@@ -28,6 +28,8 @@ import org.apache.lucene.document.Document;
import org.apache.lucene.document.StoredField;
import org.apache.lucene.search.Sort;
import org.apache.lucene.search.SortField;
+import org.apache.lucene.search.SortedNumericSortField;
+import org.apache.lucene.search.SortedSetSortField;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.IOContext;
import org.apache.lucene.store.MockDirectoryWrapper;
@@ -167,6 +169,78 @@ public abstract class BaseSegmentInfoFormatTestCase extends BaseIndexFileFormatT
return true;
}
+ private SortField randomIndexSortField() {
+ boolean reversed = random().nextBoolean();
+ SortField sortField;
+ switch(random().nextInt(10)) {
+ case 0:
+ sortField = new SortField(TestUtil.randomSimpleString(random()), SortField.Type.INT, reversed);
+ if (random().nextBoolean()) {
+ sortField.setMissingValue(random().nextInt());
+ }
+ break;
+ case 1:
+ sortField = new SortedNumericSortField(TestUtil.randomSimpleString(random()), SortField.Type.INT, reversed);
+ if (random().nextBoolean()) {
+ sortField.setMissingValue(random().nextInt());
+ }
+ break;
+
+ case 2:
+ sortField = new SortField(TestUtil.randomSimpleString(random()), SortField.Type.LONG, reversed);
+ if (random().nextBoolean()) {
+ sortField.setMissingValue(random().nextLong());
+ }
+ break;
+ case 3:
+ sortField = new SortedNumericSortField(TestUtil.randomSimpleString(random()), SortField.Type.LONG, reversed);
+ if (random().nextBoolean()) {
+ sortField.setMissingValue(random().nextLong());
+ }
+ break;
+ case 4:
+ sortField = new SortField(TestUtil.randomSimpleString(random()), SortField.Type.FLOAT, reversed);
+ if (random().nextBoolean()) {
+ sortField.setMissingValue(random().nextFloat());
+ }
+ break;
+ case 5:
+ sortField = new SortedNumericSortField(TestUtil.randomSimpleString(random()), SortField.Type.FLOAT, reversed);
+ if (random().nextBoolean()) {
+ sortField.setMissingValue(random().nextFloat());
+ }
+ break;
+ case 6:
+ sortField = new SortField(TestUtil.randomSimpleString(random()), SortField.Type.DOUBLE, reversed);
+ if (random().nextBoolean()) {
+ sortField.setMissingValue(random().nextDouble());
+ }
+ break;
+ case 7:
+ sortField = new SortedNumericSortField(TestUtil.randomSimpleString(random()), SortField.Type.DOUBLE, reversed);
+ if (random().nextBoolean()) {
+ sortField.setMissingValue(random().nextDouble());
+ }
+ break;
+ case 8:
+ sortField = new SortField(TestUtil.randomSimpleString(random()), SortField.Type.STRING, reversed);
+ if (random().nextBoolean()) {
+ sortField.setMissingValue(SortField.STRING_LAST);
+ }
+ break;
+ case 9:
+ sortField = new SortedSetSortField(TestUtil.randomSimpleString(random()), reversed);
+ if (random().nextBoolean()) {
+ sortField.setMissingValue(SortField.STRING_LAST);
+ }
+ break;
+ default:
+ sortField = null;
+ fail();
+ }
+ return sortField;
+ }
+
/** Test sort */
public void testSort() throws IOException {
assumeTrue("test requires a codec that can read/write index sort", supportsIndexSort());
@@ -180,22 +254,7 @@ public abstract class BaseSegmentInfoFormatTestCase extends BaseIndexFileFormatT
final int numSortFields = TestUtil.nextInt(random(), 1, 3);
SortField[] sortFields = new SortField[numSortFields];
for (int j = 0; j < numSortFields; ++j) {
- sortFields[j] = new SortField(
- TestUtil.randomSimpleString(random()),
- random().nextBoolean() ? SortField.Type.LONG : SortField.Type.STRING,
- random().nextBoolean());
- if (random().nextBoolean()) {
- switch (sortFields[j].getType()) {
- case LONG:
- sortFields[j].setMissingValue(random().nextLong());
- break;
- case STRING:
- sortFields[j].setMissingValue(random().nextBoolean() ? SortField.STRING_FIRST : SortField.STRING_LAST);
- break;
- default:
- fail();
- }
- }
+ sortFields[j] = randomIndexSortField();
}
sort = new Sort(sortFields);
}
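Usage note (not part of the patch): the randomIndexSortField() additions above exercise index-time sorting on multi-valued fields via selectors. A minimal sketch of configuring such a sort; the "price" field name is illustrative and would be indexed as a SortedNumericDocValuesField:

import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.search.Sort;
import org.apache.lucene.search.SortField;
import org.apache.lucene.search.SortedNumericSelector;
import org.apache.lucene.search.SortedNumericSortField;

class MultiValuedIndexSortSketch {
  static IndexWriterConfig configWithIndexSort() {
    // Sort segments by the minimum value of the multi-valued "price" field, ascending.
    SortField sortField =
        new SortedNumericSortField("price", SortField.Type.LONG, false, SortedNumericSelector.Type.MIN);
    sortField.setMissingValue(Long.MAX_VALUE); // documents without the field sort last
    IndexWriterConfig iwc = new IndexWriterConfig(new StandardAnalyzer());
    iwc.setIndexSort(new Sort(sortField));
    return iwc;
  }
}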
diff --git a/solr/CHANGES.txt b/solr/CHANGES.txt
index bc939b9f99c..11687bb3537 100644
--- a/solr/CHANGES.txt
+++ b/solr/CHANGES.txt
@@ -116,6 +116,10 @@ New Features
* SOLR-9633: Limit memory consumed by FastLRUCache with a new 'maxRamMB' config parameter.
(yonik, Michael Sun, shalin)
+* SOLR-9666: SolrJ LukeResponse supports dynamic fields (Fengtan via Kevin Risden)
+
+* SOLR-9077: Streaming expressions should support collection alias (Kevin Risden)
+
Optimizations
----------------------
* SOLR-9704: Facet Module / JSON Facet API: Optimize blockChildren facets that have
@@ -158,6 +162,8 @@ Bug Fixes
* SOLR-9284: The HDFS BlockDirectoryCache should not let it's keysToRelease or names maps grow indefinitely.
(Mark Miller, Michael Sun)
+
+* SOLR-9729: JDBCStream improvements (Kevin Risden)
Other Changes
----------------------
diff --git a/solr/core/src/test/org/apache/solr/store/blockcache/BlockDirectoryTest.java b/solr/core/src/test/org/apache/solr/store/blockcache/BlockDirectoryTest.java
index f21b5aae1d1..5e4f1c59a23 100644
--- a/solr/core/src/test/org/apache/solr/store/blockcache/BlockDirectoryTest.java
+++ b/solr/core/src/test/org/apache/solr/store/blockcache/BlockDirectoryTest.java
@@ -115,7 +115,7 @@ public class BlockDirectoryTest extends SolrTestCaseJ4 {
Metrics metrics = new Metrics();
int blockSize = 8192;
int slabSize = blockSize * 32768;
- long totalMemory = 2 * slabSize;
+ long totalMemory = 1 * slabSize;
BlockCache blockCache = new BlockCache(metrics, true, totalMemory, slabSize, blockSize);
BlockDirectoryCache cache = new BlockDirectoryCache(blockCache, "/collection1", metrics, true);
directory = new BlockDirectory("test", dir, cache, null, true, false);
@@ -267,7 +267,11 @@ public class BlockDirectoryTest extends SolrTestCaseJ4 {
BlockDirectory d = directory;
assertTrue(d.useReadCache("", IOContext.DEFAULT));
- assertTrue(d.useWriteCache("", IOContext.DEFAULT));
+ if (d.getCache() instanceof MapperCache) {
+ assertTrue(d.useWriteCache("", IOContext.DEFAULT));
+ } else {
+ assertFalse(d.useWriteCache("", IOContext.DEFAULT));
+ }
assertFalse(d.useWriteCache("", mergeContext));
d = new BlockDirectory("test", directory, mapperCache, null, true, false);
@@ -277,7 +281,11 @@ public class BlockDirectoryTest extends SolrTestCaseJ4 {
d = new BlockDirectory("test", directory, mapperCache, null, false, true);
assertFalse(d.useReadCache("", IOContext.DEFAULT));
- assertTrue(d.useWriteCache("", IOContext.DEFAULT));
+ if (d.getCache() instanceof MapperCache) {
+ assertTrue(d.useWriteCache("", IOContext.DEFAULT));
+ } else {
+ assertFalse(d.useWriteCache("", IOContext.DEFAULT));
+ }
assertFalse(d.useWriteCache("", mergeContext));
}
}
diff --git a/solr/solrj/src/java/org/apache/solr/client/solrj/io/sql/StatementImpl.java b/solr/solrj/src/java/org/apache/solr/client/solrj/io/sql/StatementImpl.java
index c05028deb58..a2c06d4c0e2 100644
--- a/solr/solrj/src/java/org/apache/solr/client/solrj/io/sql/StatementImpl.java
+++ b/solr/solrj/src/java/org/apache/solr/client/solrj/io/sql/StatementImpl.java
@@ -28,8 +28,8 @@ import java.util.Collections;
import java.util.List;
import java.util.Random;
+import org.apache.solr.client.solrj.io.stream.CloudSolrStream;
import org.apache.solr.client.solrj.io.stream.SolrStream;
-import org.apache.solr.common.cloud.ClusterState;
import org.apache.solr.common.cloud.Replica;
import org.apache.solr.common.cloud.Slice;
import org.apache.solr.common.cloud.ZkCoreNodeProps;
@@ -78,12 +78,7 @@ class StatementImpl implements Statement {
protected SolrStream constructStream(String sql) throws IOException {
try {
ZkStateReader zkStateReader = this.connection.getClient().getZkStateReader();
- ClusterState clusterState = zkStateReader.getClusterState();
- Collection<Slice> slices = clusterState.getActiveSlices(this.connection.getCollection());
-
- if(slices == null) {
- throw new Exception("Collection not found:"+this.connection.getCollection());
- }
+ Collection<Slice> slices = CloudSolrStream.getSlices(this.connection.getCollection(), zkStateReader, true);
List<Replica> shuffler = new ArrayList<>();
for(Slice slice : slices) {
diff --git a/solr/solrj/src/java/org/apache/solr/client/solrj/io/stream/CloudSolrStream.java b/solr/solrj/src/java/org/apache/solr/client/solrj/io/stream/CloudSolrStream.java
index 2fb56ee37b2..0580122bf38 100644
--- a/solr/solrj/src/java/org/apache/solr/client/solrj/io/stream/CloudSolrStream.java
+++ b/solr/solrj/src/java/org/apache/solr/client/solrj/io/stream/CloudSolrStream.java
@@ -49,6 +49,7 @@ import org.apache.solr.client.solrj.io.stream.expr.StreamExpression;
import org.apache.solr.client.solrj.io.stream.expr.StreamExpressionNamedParameter;
import org.apache.solr.client.solrj.io.stream.expr.StreamExpressionValue;
import org.apache.solr.client.solrj.io.stream.expr.StreamFactory;
+import org.apache.solr.common.cloud.Aliases;
import org.apache.solr.common.cloud.ClusterState;
import org.apache.solr.common.cloud.DocCollection;
import org.apache.solr.common.cloud.Replica;
@@ -60,6 +61,7 @@ import org.apache.solr.common.params.ModifiableSolrParams;
import org.apache.solr.common.params.SolrParams;
import org.apache.solr.common.util.ExecutorUtil;
import org.apache.solr.common.util.SolrjNamedThreadFactory;
+import org.apache.solr.common.util.StrUtils;
/**
* Connects to Zookeeper to pick replicas from a specific collection to send the query to.
@@ -352,37 +354,57 @@ public class CloudSolrStream extends TupleStream implements Expressible {
}
}
- public static Collection<Slice> getSlicesIgnoreCase(String name, ClusterState clusterState) {
- for (String coll : clusterState.getCollectionStates().keySet()) {
- if (coll.equalsIgnoreCase(name)) {
- DocCollection collection = clusterState.getCollectionOrNull(coll);
- if (collection != null) return collection.getActiveSlices();
+ public static Collection<Slice> getSlices(String collectionName, ZkStateReader zkStateReader, boolean checkAlias) throws IOException {
+ ClusterState clusterState = zkStateReader.getClusterState();
+
+ Map<String, DocCollection> collectionsMap = clusterState.getCollectionsMap();
+
+ // Check for the collection by its exact (case-sensitive) name first
+ if(collectionsMap.containsKey(collectionName)) {
+ return collectionsMap.get(collectionName).getActiveSlices();
+ }
+
+ // Then check for the collection name case-insensitively
+ for(String collectionMapKey : collectionsMap.keySet()) {
+ if(collectionMapKey.equalsIgnoreCase(collectionName)) {
+ return collectionsMap.get(collectionMapKey).getActiveSlices();
}
}
- return null;
+
+ if(checkAlias) {
+ // Finally, check whether the name is a collection alias
+ Aliases aliases = zkStateReader.getAliases();
+ String alias = aliases.getCollectionAlias(collectionName);
+ if (alias != null) {
+ Collection<Slice> slices = new ArrayList<>();
+
+ List<String> aliasList = StrUtils.splitSmart(alias, ",", true);
+ for (String aliasCollectionName : aliasList) {
+ // Add all active slices for this alias collection
+ slices.addAll(collectionsMap.get(aliasCollectionName).getActiveSlices());
+ }
+
+ return slices;
+ }
+ }
+
+ throw new IOException("Slices not found for " + collectionName);
}
protected void constructStreams() throws IOException {
-
try {
-
ZkStateReader zkStateReader = cloudSolrClient.getZkStateReader();
ClusterState clusterState = zkStateReader.getClusterState();
- Set<String> liveNodes = clusterState.getLiveNodes();
- //System.out.println("Connected to zk an got cluster state.");
- Collection<Slice> slices = clusterState.getActiveSlices(this.collection);
- if (slices == null) slices = getSlicesIgnoreCase(this.collection, clusterState);
- if (slices == null) {
- throw new Exception("Collection not found:" + this.collection);
- }
+ Collection<Slice> slices = CloudSolrStream.getSlices(this.collection, zkStateReader, true);
ModifiableSolrParams mParams = new ModifiableSolrParams(params);
mParams.set("distrib", "false"); // We are the aggregator.
+ Set<String> liveNodes = clusterState.getLiveNodes();
for(Slice slice : slices) {
Collection<Replica> replicas = slice.getReplicas();
- List shuffler = new ArrayList();
+ List<Replica> shuffler = new ArrayList<>();
for(Replica replica : replicas) {
if(replica.getState() == Replica.State.ACTIVE && liveNodes.contains(replica.getNodeName()))
shuffler.add(replica);
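
For illustration only, and not part of the patch itself: the new CloudSolrStream.getSlices(collectionName, zkStateReader, checkAlias) helper resolves a collection in three steps (exact name, then a case-insensitive match, then, when checkAlias is true, a collection alias that may expand to several collections) and throws an IOException instead of returning null when nothing matches. A minimal caller sketch follows; the method name, the collection name, and the already-connected CloudSolrClient are assumptions made for the example.

    import java.io.IOException;
    import java.util.Collection;
    import org.apache.solr.client.solrj.impl.CloudSolrClient;
    import org.apache.solr.client.solrj.io.stream.CloudSolrStream;
    import org.apache.solr.common.cloud.Slice;
    import org.apache.solr.common.cloud.ZkStateReader;

    class GetSlicesExample {
      static void printShards(CloudSolrClient client, String collectionName) throws IOException {
        ZkStateReader zkStateReader = client.getZkStateReader();
        // getSlices tries the exact name, then a case-insensitive match, then (checkAlias=true) an alias;
        // it throws IOException if the collection cannot be resolved at all.
        Collection<Slice> slices = CloudSolrStream.getSlices(collectionName, zkStateReader, true);
        for (Slice slice : slices) {
          System.out.println(slice.getName() + ": " + slice.getReplicas().size() + " replicas");
        }
      }
    }
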
diff --git a/solr/solrj/src/java/org/apache/solr/client/solrj/io/stream/FeaturesSelectionStream.java b/solr/solrj/src/java/org/apache/solr/client/solrj/io/stream/FeaturesSelectionStream.java
index e9949da145c..cfb3941f8ae 100644
--- a/solr/solrj/src/java/org/apache/solr/client/solrj/io/stream/FeaturesSelectionStream.java
+++ b/solr/solrj/src/java/org/apache/solr/client/solrj/io/stream/FeaturesSelectionStream.java
@@ -250,17 +250,15 @@ public class FeaturesSelectionStream extends TupleStream implements Expressible{
}
private List<String> getShardUrls() throws IOException {
-
try {
-
ZkStateReader zkStateReader = cloudSolrClient.getZkStateReader();
- ClusterState clusterState = zkStateReader.getClusterState();
- Collection<Slice> slices = clusterState.getActiveSlices(this.collection);
+ Collection<Slice> slices = CloudSolrStream.getSlices(this.collection, zkStateReader, false);
+
+ ClusterState clusterState = zkStateReader.getClusterState();
Set<String> liveNodes = clusterState.getLiveNodes();
List<String> baseUrls = new ArrayList<>();
-
for(Slice slice : slices) {
Collection<Replica> replicas = slice.getReplicas();
List<Replica> shuffler = new ArrayList<>();
diff --git a/solr/solrj/src/java/org/apache/solr/client/solrj/io/stream/JDBCStream.java b/solr/solrj/src/java/org/apache/solr/client/solrj/io/stream/JDBCStream.java
index bb0ed2c5776..143143f90be 100644
--- a/solr/solrj/src/java/org/apache/solr/client/solrj/io/stream/JDBCStream.java
+++ b/solr/solrj/src/java/org/apache/solr/client/solrj/io/stream/JDBCStream.java
@@ -67,7 +67,7 @@ public class JDBCStream extends TupleStream implements Expressible {
// These are java types that we can directly support as an Object instance. Other supported
// types will require some level of conversion (short -> long, etc...)
// We'll use a static constructor to load this set.
- private static HashSet directSupportedTypes = new HashSet();
+ private static final HashSet<String> directSupportedTypes = new HashSet<>();
static {
directSupportedTypes.add(String.class.getName());
directSupportedTypes.add(Double.class.getName());
@@ -107,7 +107,7 @@ public class JDBCStream extends TupleStream implements Expressible {
// Validate there are no unknown parameters - zkHost and alias are namedParameter so we don't need to count it twice
if(expression.getParameters().size() != namedParams.size()){
- throw new IOException(String.format(Locale.ROOT,"invalid expression %s - unknown operands found",expression));
+ throw new IOException(String.format(Locale.ROOT,"invalid expression %s - unknown operands found", expression));
}
// All named params we don't care about will be passed to the driver on connection
@@ -124,7 +124,7 @@ public class JDBCStream extends TupleStream implements Expressible {
connectionUrl = ((StreamExpressionValue)connectionUrlExpression.getParameter()).getValue();
}
if(null == connectionUrl){
- throw new IOException(String.format(Locale.ROOT,"invalid expression %s - connection not found"));
+ throw new IOException(String.format(Locale.ROOT,"invalid expression %s - connection not found", connectionUrlExpression));
}
// sql, required
@@ -133,16 +133,16 @@ public class JDBCStream extends TupleStream implements Expressible {
sqlQuery = ((StreamExpressionValue)sqlQueryExpression.getParameter()).getValue();
}
if(null == sqlQuery){
- throw new IOException(String.format(Locale.ROOT,"invalid expression %s - sql not found"));
+ throw new IOException(String.format(Locale.ROOT,"invalid expression %s - sql not found", sqlQueryExpression));
}
// definedSort, required
StreamComparator definedSort = null;
- if(null != sqlQueryExpression && sqlQueryExpression.getParameter() instanceof StreamExpressionValue){
+ if(null != definedSortExpression && definedSortExpression.getParameter() instanceof StreamExpressionValue){
definedSort = factory.constructComparator(((StreamExpressionValue)definedSortExpression.getParameter()).getValue(), FieldComparator.class);
}
if(null == definedSort){
- throw new IOException(String.format(Locale.ROOT,"invalid expression %s - sort not found"));
+ throw new IOException(String.format(Locale.ROOT,"invalid expression %s - sort not found", definedSortExpression));
}
// driverClass, optional
@@ -155,7 +155,7 @@ public class JDBCStream extends TupleStream implements Expressible {
init(connectionUrl, sqlQuery, definedSort, connectionProperties, driverClass);
}
- private void init(String connectionUrl, String sqlQuery, StreamComparator definedSort, Properties connectionProperties, String driverClassName) throws IOException {
+ private void init(String connectionUrl, String sqlQuery, StreamComparator definedSort, Properties connectionProperties, String driverClassName) {
this.connectionUrl = connectionUrl;
this.sqlQuery = sqlQuery;
this.definedSort = definedSort;
@@ -188,7 +188,9 @@ public class JDBCStream extends TupleStream implements Expressible {
throw new SQLException("DriverManager.getDriver(url) returned null");
}
} catch(SQLException e){
- throw new IOException(String.format(Locale.ROOT, "Failed to determine JDBC driver from connection url '%s'. Usually this means the driver is not loaded - you can have JDBCStream try to load it by providing the 'driverClassName' value", connectionUrl), e);
+ throw new IOException(String.format(Locale.ROOT,
+ "Failed to determine JDBC driver from connection url '%s'. Usually this means the driver is not loaded - " +
+ "you can have JDBCStream try to load it by providing the 'driverClassName' value", connectionUrl), e);
}
try {
@@ -200,20 +202,23 @@ public class JDBCStream extends TupleStream implements Expressible {
try{
statement = connection.createStatement();
} catch (SQLException e) {
- throw new IOException(String.format(Locale.ROOT, "Failed to create a statement from JDBC connection '%s'", connectionUrl), e);
+ throw new IOException(String.format(Locale.ROOT, "Failed to create a statement from JDBC connection '%s'",
+ connectionUrl), e);
}
try{
resultSet = statement.executeQuery(sqlQuery);
} catch (SQLException e) {
- throw new IOException(String.format(Locale.ROOT, "Failed to execute sqlQuery '%s' against JDBC connection '%s'.\n"+ e.getMessage(), sqlQuery, connectionUrl), e);
+ throw new IOException(String.format(Locale.ROOT, "Failed to execute sqlQuery '%s' against JDBC connection '%s'.\n"
+ + e.getMessage(), sqlQuery, connectionUrl), e);
}
try{
// using the metadata, build selectors for each column
valueSelectors = constructValueSelectors(resultSet.getMetaData());
} catch (SQLException e) {
- throw new IOException(String.format(Locale.ROOT, "Failed to generate value selectors for sqlQuery '%s' against JDBC connection '%s'", sqlQuery, connectionUrl), e);
+ throw new IOException(String.format(Locale.ROOT,
+ "Failed to generate value selectors for sqlQuery '%s' against JDBC connection '%s'", sqlQuery, connectionUrl), e);
}
}
@@ -221,8 +226,8 @@ public class JDBCStream extends TupleStream implements Expressible {
ResultSetValueSelector[] valueSelectors = new ResultSetValueSelector[metadata.getColumnCount()];
for(int columnIdx = 0; columnIdx < metadata.getColumnCount(); ++columnIdx){
-
- final int columnNumber = columnIdx + 1; // cause it starts at 1
+ final int columnNumber = columnIdx + 1; // JDBC column indexes start at 1
+ // Use getColumnLabel instead of getColumnName to make sure fields renamed with AS are picked up properly
final String columnName = metadata.getColumnLabel(columnNumber);
String className = metadata.getColumnClassName(columnNumber);
String typeName = metadata.getColumnTypeName(columnNumber);
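
For illustration only, and not part of the patch itself: the getColumnLabel comment above relies on standard JDBC behavior, where ResultSetMetaData.getColumnLabel reports the AS alias of a column while getColumnName may still report the underlying column name. A small sketch of that difference follows; the in-memory HSQLDB URL, table, and column names are hypothetical.

    import java.sql.Connection;
    import java.sql.DriverManager;
    import java.sql.ResultSet;
    import java.sql.ResultSetMetaData;
    import java.sql.SQLException;
    import java.sql.Statement;

    class ColumnLabelExample {
      public static void main(String[] args) throws SQLException {
        try (Connection con = DriverManager.getConnection("jdbc:hsqldb:mem:example");
             Statement stmt = con.createStatement()) {
          stmt.execute("CREATE TABLE PEOPLE (ID INT)");
          try (ResultSet rs = stmt.executeQuery("SELECT ID AS PERSON_ID FROM PEOPLE")) {
            ResultSetMetaData md = rs.getMetaData();
            // JDBC columns are 1-based; getColumnLabel(1) returns the AS alias "PERSON_ID",
            // which is what JDBCStream uses as the tuple field name, while getColumnName(1)
            // may still return the underlying column name "ID".
            System.out.println(md.getColumnLabel(1) + " vs " + md.getColumnName(1));
          }
        }
      }
    }
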
@@ -238,8 +243,7 @@ public class JDBCStream extends TupleStream implements Expressible {
return columnName;
}
};
- }
- else if(Short.class.getName().equals(className)) {
+ } else if(Short.class.getName().equals(className)) {
valueSelectors[columnIdx] = new ResultSetValueSelector() {
public Object selectValue(ResultSet resultSet) throws SQLException {
Short obj = resultSet.getShort(columnNumber);
@@ -250,8 +254,7 @@ public class JDBCStream extends TupleStream implements Expressible {
return columnName;
}
};
- }
- else if(Integer.class.getName().equals(className)) {
+ } else if(Integer.class.getName().equals(className)) {
valueSelectors[columnIdx] = new ResultSetValueSelector() {
public Object selectValue(ResultSet resultSet) throws SQLException {
Integer obj = resultSet.getInt(columnNumber);
@@ -262,8 +265,7 @@ public class JDBCStream extends TupleStream implements Expressible {
return columnName;
}
};
- }
- else if(Float.class.getName().equals(className)) {
+ } else if(Float.class.getName().equals(className)) {
valueSelectors[columnIdx] = new ResultSetValueSelector() {
public Object selectValue(ResultSet resultSet) throws SQLException {
Float obj = resultSet.getFloat(columnNumber);
@@ -274,9 +276,10 @@ public class JDBCStream extends TupleStream implements Expressible {
return columnName;
}
};
- }
- else{
- throw new SQLException(String.format(Locale.ROOT, "Unable to determine the valueSelector for column '%s' (col #%d) of java class '%s' and type '%s'", columnName, columnNumber, className, typeName));
+ } else {
+ throw new SQLException(String.format(Locale.ROOT,
+ "Unable to determine the valueSelector for column '%s' (col #%d) of java class '%s' and type '%s'",
+ columnName, columnNumber, className, typeName));
}
}
@@ -305,7 +308,7 @@ public class JDBCStream extends TupleStream implements Expressible {
public Tuple read() throws IOException {
try{
- Map