asList(reader1, reader2),
si, InfoStream.getDefault(), mergedDir,
- new FieldInfos.FieldNumbers(),
+ new FieldInfos.FieldNumbers(null),
newIOContext(random(), new IOContext(new MergeInfo(-1, -1, false, -1))));
MergeState mergeState = merger.merge();
int docsMerged = mergeState.segmentInfo.maxDoc();
@@ -96,7 +96,7 @@ public class TestSegmentMerger extends LuceneTestCase {
//Should be able to open a new SegmentReader against the new directory
SegmentReader mergedReader = new SegmentReader(new SegmentCommitInfo(
mergeState.segmentInfo,
- 0, -1L, -1L, -1L),
+ 0, 0, -1L, -1L, -1L),
Version.LATEST.major,
newIOContext(random()));
assertTrue(mergedReader != null);
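For reference: SegmentCommitInfo now takes a softDelCount argument between delCount and delGen, which is why the call above grew from five to six arguments. A minimal, hedged sketch of the new call shape (the directory, segment name and maxDoc are illustrative placeholders, not taken from this patch):

import java.util.Collections;
import org.apache.lucene.codecs.Codec;
import org.apache.lucene.index.SegmentCommitInfo;
import org.apache.lucene.index.SegmentInfo;
import org.apache.lucene.store.Directory;
import org.apache.lucene.util.StringHelper;
import org.apache.lucene.util.Version;

class SegmentCommitInfoSketch {
  // Sketch only: mirrors the six-argument SegmentCommitInfo constructor used in this patch.
  static SegmentCommitInfo emptyCommit(Directory dir, int maxDoc) {
    SegmentInfo si = new SegmentInfo(dir, Version.LATEST, Version.LATEST, "_0", maxDoc,
        false, Codec.getDefault(), Collections.emptyMap(), StringHelper.randomId(),
        Collections.emptyMap(), null);
    return new SegmentCommitInfo(si,
        0,    // delCount
        0,    // softDelCount -- the newly added argument
        -1L,  // delGen
        -1L,  // fieldInfosGen
        -1L); // docValuesGen
  }
}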
diff --git a/lucene/core/src/test/org/apache/lucene/index/TestSoftDeletesDirectoryReaderWrapper.java b/lucene/core/src/test/org/apache/lucene/index/TestSoftDeletesDirectoryReaderWrapper.java
index dea7bc977be..d7a79997dc1 100644
--- a/lucene/core/src/test/org/apache/lucene/index/TestSoftDeletesDirectoryReaderWrapper.java
+++ b/lucene/core/src/test/org/apache/lucene/index/TestSoftDeletesDirectoryReaderWrapper.java
@@ -104,7 +104,8 @@ public class TestSoftDeletesDirectoryReaderWrapper extends LuceneTestCase {
}
private boolean isWrapped(LeafReader reader) {
- return reader instanceof SoftDeletesDirectoryReaderWrapper.SoftDeletesFilterLeafReader;
+ return reader instanceof SoftDeletesDirectoryReaderWrapper.SoftDeletesFilterLeafReader
+ || reader instanceof SoftDeletesDirectoryReaderWrapper.SoftDeletesFilterCodecReader;
}
public void testMixSoftAndHardDeletes() throws IOException {
diff --git a/lucene/highlighter/src/java/org/apache/lucene/search/highlight/TermVectorLeafReader.java b/lucene/highlighter/src/java/org/apache/lucene/search/highlight/TermVectorLeafReader.java
index 144209dcceb..1eef95fdd6d 100644
--- a/lucene/highlighter/src/java/org/apache/lucene/search/highlight/TermVectorLeafReader.java
+++ b/lucene/highlighter/src/java/org/apache/lucene/search/highlight/TermVectorLeafReader.java
@@ -81,7 +81,7 @@ public class TermVectorLeafReader extends LeafReader {
}
FieldInfo fieldInfo = new FieldInfo(field, 0,
true, true, terms.hasPayloads(),
- indexOptions, DocValuesType.NONE, -1, Collections.emptyMap(), 0, 0);
+ indexOptions, DocValuesType.NONE, -1, Collections.emptyMap(), 0, 0, false);
fieldInfos = new FieldInfos(new FieldInfo[]{fieldInfo});
}
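Every FieldInfo construction site in this patch gains a trailing boolean marking whether the field is the index's soft-deletes field. A minimal, hedged sketch of the extended signature (field name, number and index options are placeholders, not from this patch):

import java.util.Collections;
import org.apache.lucene.index.DocValuesType;
import org.apache.lucene.index.FieldInfo;
import org.apache.lucene.index.IndexOptions;

class FieldInfoSketch {
  // Sketch only: the constructor now ends with an isSoftDeletesField flag.
  static FieldInfo plainIndexedField(String name, int number) {
    return new FieldInfo(name, number,
        false,                  // storeTermVector
        false,                  // omitNorms
        false,                  // storePayloads
        IndexOptions.DOCS,      // indexOptions
        DocValuesType.NONE,     // docValuesType
        -1,                     // dvGen
        Collections.emptyMap(), // attributes
        0, 0,                   // point dimension count / numBytes
        false);                 // isSoftDeletesField -- the newly added argument
  }
}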
diff --git a/lucene/memory/src/java/org/apache/lucene/index/memory/MemoryIndex.java b/lucene/memory/src/java/org/apache/lucene/index/memory/MemoryIndex.java
index ff248c34538..11913d1cbee 100644
--- a/lucene/memory/src/java/org/apache/lucene/index/memory/MemoryIndex.java
+++ b/lucene/memory/src/java/org/apache/lucene/index/memory/MemoryIndex.java
@@ -501,7 +501,7 @@ public class MemoryIndex {
IndexOptions indexOptions = storeOffsets ? IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS : IndexOptions.DOCS_AND_FREQS_AND_POSITIONS;
return new FieldInfo(fieldName, ord, fieldType.storeTermVectors(), fieldType.omitNorms(), storePayloads,
indexOptions, fieldType.docValuesType(), -1, Collections.emptyMap(),
- fieldType.pointDimensionCount(), fieldType.pointNumBytes());
+ fieldType.pointDimensionCount(), fieldType.pointNumBytes(), false);
}
private void storePointValues(Info info, BytesRef pointValue) {
@@ -520,7 +520,7 @@ public class MemoryIndex {
info.fieldInfo = new FieldInfo(
info.fieldInfo.name, info.fieldInfo.number, info.fieldInfo.hasVectors(), info.fieldInfo.hasPayloads(),
info.fieldInfo.hasPayloads(), info.fieldInfo.getIndexOptions(), docValuesType, -1, info.fieldInfo.attributes(),
- info.fieldInfo.getPointDimensionCount(), info.fieldInfo.getPointNumBytes()
+ info.fieldInfo.getPointDimensionCount(), info.fieldInfo.getPointNumBytes(), info.fieldInfo.isSoftDeletesField()
);
} else if (existingDocValuesType != docValuesType) {
throw new IllegalArgumentException("Can't add [" + docValuesType + "] doc values field [" + fieldName + "], because [" + existingDocValuesType + "] doc values field already exists");
diff --git a/lucene/misc/src/java/org/apache/lucene/index/IndexSplitter.java b/lucene/misc/src/java/org/apache/lucene/index/IndexSplitter.java
index 892564826f3..a586f838170 100644
--- a/lucene/misc/src/java/org/apache/lucene/index/IndexSplitter.java
+++ b/lucene/misc/src/java/org/apache/lucene/index/IndexSplitter.java
@@ -141,7 +141,7 @@ public class IndexSplitter {
// Same info just changing the dir:
SegmentInfo newInfo = new SegmentInfo(destFSDir, info.getVersion(), info.getMinVersion(), info.name, info.maxDoc(),
info.getUseCompoundFile(), info.getCodec(), info.getDiagnostics(), info.getId(), new HashMap<>(), null);
- destInfos.add(new SegmentCommitInfo(newInfo, infoPerCommit.getDelCount(),
+ destInfos.add(new SegmentCommitInfo(newInfo, infoPerCommit.getDelCount(), infoPerCommit.getSoftDelCount(),
infoPerCommit.getDelGen(), infoPerCommit.getFieldInfosGen(),
infoPerCommit.getDocValuesGen()));
// now copy files over
diff --git a/lucene/suggest/src/java/org/apache/lucene/search/suggest/analyzing/AnalyzingInfixSuggester.java b/lucene/suggest/src/java/org/apache/lucene/search/suggest/analyzing/AnalyzingInfixSuggester.java
index 2bc422972bc..85bb6d11fff 100644
--- a/lucene/suggest/src/java/org/apache/lucene/search/suggest/analyzing/AnalyzingInfixSuggester.java
+++ b/lucene/suggest/src/java/org/apache/lucene/search/suggest/analyzing/AnalyzingInfixSuggester.java
@@ -359,7 +359,7 @@ public class AnalyzingInfixSuggester extends Lookup implements Closeable {
: "no need \"textgrams\" when minPrefixChars="+minPrefixChars;
if (fieldName.equals(TEXTGRAMS_FIELD_NAME) && minPrefixChars > 0) {
// TODO: should use an EdgeNGramTokenFilterFactory here
- TokenFilter filter = new EdgeNGramTokenFilter(components.getTokenStream(), 1, minPrefixChars);
+ TokenFilter filter = new EdgeNGramTokenFilter(components.getTokenStream(), 1, minPrefixChars, false);
return new TokenStreamComponents(components.getTokenizer(), filter);
} else {
return components;
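The EdgeNGramTokenFilter constructor used above now takes a fourth preserveOriginal argument. A small, hedged sketch of a filter chain built with the new signature (the analyzer, tokenizer and gram sizes are illustrative only):

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.core.WhitespaceTokenizer;
import org.apache.lucene.analysis.ngram.EdgeNGramTokenFilter;

class EdgeNGramSketch {
  // Sketch only: (input, minGram, maxGram, preserveOriginal). Passing false emits
  // the grams only; true would additionally keep the original token when its
  // length falls outside the [minGram, maxGram] range.
  static Analyzer edgeNGrams() {
    return new Analyzer() {
      @Override
      protected TokenStreamComponents createComponents(String fieldName) {
        Tokenizer source = new WhitespaceTokenizer();
        TokenStream grams = new EdgeNGramTokenFilter(source, 1, 4, false);
        return new TokenStreamComponents(source, grams);
      }
    };
  }
}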
diff --git a/lucene/suggest/src/java/org/apache/lucene/search/suggest/document/CompletionAnalyzer.java b/lucene/suggest/src/java/org/apache/lucene/search/suggest/document/CompletionAnalyzer.java
index 13bd392aa9d..8888382a5ca 100644
--- a/lucene/suggest/src/java/org/apache/lucene/search/suggest/document/CompletionAnalyzer.java
+++ b/lucene/suggest/src/java/org/apache/lucene/search/suggest/document/CompletionAnalyzer.java
@@ -19,7 +19,7 @@ package org.apache.lucene.search.suggest.document;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.AnalyzerWrapper;
import org.apache.lucene.analysis.TokenStreamToAutomaton;
-import org.apache.lucene.util.automaton.Operations;
+import org.apache.lucene.analysis.miscellaneous.ConcatenateGraphFilter;
/**
* Wraps an {@link org.apache.lucene.analysis.Analyzer}
@@ -37,24 +37,11 @@ import org.apache.lucene.util.automaton.Operations;
*/
public final class CompletionAnalyzer extends AnalyzerWrapper {
- /**
- * Represents the separation between tokens, if
- * <code>preserveSep</code> is true
- *
- * Same label is used as a delimiter in the {@link org.apache.lucene.search.suggest.document.CompletionTokenStream}
- * payload
- */
- final static int SEP_LABEL = NRTSuggesterBuilder.PAYLOAD_SEP;
-
/**
* Represent a hole character, inserted by {@link org.apache.lucene.analysis.TokenStreamToAutomaton}
*/
final static int HOLE_CHARACTER = TokenStreamToAutomaton.HOLE;
- final static int DEFAULT_MAX_GRAPH_EXPANSIONS = Operations.DEFAULT_MAX_DETERMINIZED_STATES;
- final static boolean DEFAULT_PRESERVE_SEP = true;
- final static boolean DEFAULT_PRESERVE_POSITION_INCREMENTS = true;
-
private final Analyzer analyzer;
/**
@@ -101,7 +88,7 @@ public final class CompletionAnalyzer extends AnalyzerWrapper {
* preserving token separation, position increments and no limit on graph expansions
*/
public CompletionAnalyzer(Analyzer analyzer) {
- this(analyzer, DEFAULT_PRESERVE_SEP, DEFAULT_PRESERVE_POSITION_INCREMENTS, DEFAULT_MAX_GRAPH_EXPANSIONS);
+ this(analyzer, ConcatenateGraphFilter.DEFAULT_PRESERVE_SEP, ConcatenateGraphFilter.DEFAULT_PRESERVE_POSITION_INCREMENTS, ConcatenateGraphFilter.DEFAULT_MAX_GRAPH_EXPANSIONS);
}
/**
@@ -109,7 +96,7 @@ public final class CompletionAnalyzer extends AnalyzerWrapper {
* with no limit on graph expansions
*/
public CompletionAnalyzer(Analyzer analyzer, boolean preserveSep, boolean preservePositionIncrements) {
- this(analyzer, preserveSep, preservePositionIncrements, DEFAULT_MAX_GRAPH_EXPANSIONS);
+ this(analyzer, preserveSep, preservePositionIncrements, ConcatenateGraphFilter.DEFAULT_MAX_GRAPH_EXPANSIONS);
}
/**
@@ -117,7 +104,7 @@ public final class CompletionAnalyzer extends AnalyzerWrapper {
* preserving token separation and position increments
*/
public CompletionAnalyzer(Analyzer analyzer, int maxGraphExpansions) {
- this(analyzer, DEFAULT_PRESERVE_SEP, DEFAULT_PRESERVE_POSITION_INCREMENTS, maxGraphExpansions);
+ this(analyzer, ConcatenateGraphFilter.DEFAULT_PRESERVE_SEP, ConcatenateGraphFilter.DEFAULT_PRESERVE_POSITION_INCREMENTS, maxGraphExpansions);
}
/**
diff --git a/lucene/suggest/src/java/org/apache/lucene/search/suggest/document/CompletionQuery.java b/lucene/suggest/src/java/org/apache/lucene/search/suggest/document/CompletionQuery.java
index 49fe7d08dff..6be0c91117f 100644
--- a/lucene/suggest/src/java/org/apache/lucene/search/suggest/document/CompletionQuery.java
+++ b/lucene/suggest/src/java/org/apache/lucene/search/suggest/document/CompletionQuery.java
@@ -27,7 +27,7 @@ import org.apache.lucene.search.Query;
import org.apache.lucene.search.suggest.BitsProducer;
import static org.apache.lucene.search.suggest.document.CompletionAnalyzer.HOLE_CHARACTER;
-import static org.apache.lucene.search.suggest.document.CompletionAnalyzer.SEP_LABEL;
+import static org.apache.lucene.analysis.miscellaneous.ConcatenateGraphFilter.SEP_LABEL;
/**
* Abstract {@link Query} that match documents containing terms with a specified prefix
diff --git a/lucene/suggest/src/java/org/apache/lucene/search/suggest/document/CompletionTokenStream.java b/lucene/suggest/src/java/org/apache/lucene/search/suggest/document/CompletionTokenStream.java
index 7308e65acc9..d3bec8e50c9 100644
--- a/lucene/suggest/src/java/org/apache/lucene/search/suggest/document/CompletionTokenStream.java
+++ b/lucene/suggest/src/java/org/apache/lucene/search/suggest/document/CompletionTokenStream.java
@@ -14,71 +14,43 @@
* See the License for the specific language governing permissions and
* limitations under the License.
*/
+
package org.apache.lucene.search.suggest.document;
import java.io.IOException;
+import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
-import org.apache.lucene.analysis.TokenStreamToAutomaton;
-import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
+import org.apache.lucene.analysis.miscellaneous.ConcatenateGraphFilter;
import org.apache.lucene.analysis.tokenattributes.PayloadAttribute;
-import org.apache.lucene.analysis.tokenattributes.TermToBytesRefAttribute;
-import org.apache.lucene.util.AttributeImpl;
-import org.apache.lucene.util.AttributeReflector;
import org.apache.lucene.util.BytesRef;
-import org.apache.lucene.util.BytesRefBuilder;
-import org.apache.lucene.util.CharsRefBuilder;
-import org.apache.lucene.util.IOUtils;
-import org.apache.lucene.util.IntsRef;
import org.apache.lucene.util.automaton.Automaton;
-import org.apache.lucene.util.automaton.FiniteStringsIterator;
-import org.apache.lucene.util.automaton.LimitedFiniteStringsIterator;
-import org.apache.lucene.util.automaton.Operations;
-import org.apache.lucene.util.automaton.Transition;
-import org.apache.lucene.util.fst.Util;
-
-import static org.apache.lucene.search.suggest.document.CompletionAnalyzer.DEFAULT_MAX_GRAPH_EXPANSIONS;
-import static org.apache.lucene.search.suggest.document.CompletionAnalyzer.DEFAULT_PRESERVE_POSITION_INCREMENTS;
-import static org.apache.lucene.search.suggest.document.CompletionAnalyzer.DEFAULT_PRESERVE_SEP;
-import static org.apache.lucene.search.suggest.document.CompletionAnalyzer.SEP_LABEL;
/**
- * Token stream which converts a provided token stream to an automaton.
- * The accepted strings enumeration from the automaton are available through the
- * {@link org.apache.lucene.analysis.tokenattributes.TermToBytesRefAttribute} attribute
- * The token stream uses a {@link org.apache.lucene.analysis.tokenattributes.PayloadAttribute} to store
- * a completion's payload (see {@link CompletionTokenStream#setPayload(org.apache.lucene.util.BytesRef)})
- *
+ * A {@link ConcatenateGraphFilter} but we can set the payload and provide access to config options.
* @lucene.experimental
*/
-public final class CompletionTokenStream extends TokenStream {
+public final class CompletionTokenStream extends TokenFilter {
private final PayloadAttribute payloadAttr = addAttribute(PayloadAttribute.class);
- private final BytesRefBuilderTermAttribute bytesAtt = addAttribute(BytesRefBuilderTermAttribute.class);
+ // package accessible on purpose
final TokenStream inputTokenStream;
final boolean preserveSep;
final boolean preservePositionIncrements;
final int maxGraphExpansions;
- private FiniteStringsIterator finiteStrings;
- private BytesRef payload;
- private CharTermAttribute charTermAttribute;
+ private BytesRef payload; // note doesn't participate in TokenStream lifecycle; it's effectively constant
- /**
- * Creates a token stream to convert <code>input</code> to a token stream
- * of accepted strings by its automaton.
- *
- * The token stream <code>input</code> is converted to an automaton
- * with the default settings of {@link org.apache.lucene.search.suggest.document.CompletionAnalyzer}
- */
CompletionTokenStream(TokenStream inputTokenStream) {
- this(inputTokenStream, DEFAULT_PRESERVE_SEP, DEFAULT_PRESERVE_POSITION_INCREMENTS, DEFAULT_MAX_GRAPH_EXPANSIONS);
+ this(inputTokenStream,
+ ConcatenateGraphFilter.DEFAULT_PRESERVE_SEP,
+ ConcatenateGraphFilter.DEFAULT_PRESERVE_POSITION_INCREMENTS,
+ ConcatenateGraphFilter.DEFAULT_MAX_GRAPH_EXPANSIONS);
}
CompletionTokenStream(TokenStream inputTokenStream, boolean preserveSep, boolean preservePositionIncrements, int maxGraphExpansions) {
- // Don't call the super(input) ctor - this is a true delegate and has a new attribute source since we consume
- // the input stream entirely in the first call to incrementToken
+ super(new ConcatenateGraphFilter(inputTokenStream, preserveSep, preservePositionIncrements, maxGraphExpansions));
this.inputTokenStream = inputTokenStream;
this.preserveSep = preserveSep;
this.preservePositionIncrements = preservePositionIncrements;
@@ -94,248 +66,23 @@ public final class CompletionTokenStream extends TokenStream {
@Override
public boolean incrementToken() throws IOException {
- clearAttributes();
- if (finiteStrings == null) {
- Automaton automaton = toAutomaton();
- finiteStrings = new LimitedFiniteStringsIterator(automaton, maxGraphExpansions);
- }
-
- IntsRef string = finiteStrings.next();
- if (string == null) {
+ if (input.incrementToken()) {
+ payloadAttr.setPayload(payload);
+ return true;
+ } else {
return false;
}
-
- Util.toBytesRef(string, bytesAtt.builder()); // now we have UTF-8
- if (charTermAttribute != null) {
- charTermAttribute.setLength(0);
- charTermAttribute.append(bytesAtt.toUTF16());
- }
- if (payload != null) {
- payloadAttr.setPayload(this.payload);
- }
-
- return true;
}
- @Override
- public void end() throws IOException {
- super.end();
- if (finiteStrings == null) {
- inputTokenStream.end();
- }
- }
-
- @Override
- public void close() throws IOException {
- if (finiteStrings == null) {
- inputTokenStream.close();
- }
- }
-
- @Override
- public void reset() throws IOException {
- super.reset();
- if (hasAttribute(CharTermAttribute.class)) {
- // we only create this if we really need it to safe the UTF-8 to UTF-16 conversion
- charTermAttribute = getAttribute(CharTermAttribute.class);
- }
- finiteStrings = null;
- }
-
- /**
- * Converts the token stream to an automaton,
- * treating the transition labels as utf-8
- */
+ /** Delegates to...
+ * @see ConcatenateGraphFilter#toAutomaton() */
public Automaton toAutomaton() throws IOException {
- return toAutomaton(false);
+ return ((ConcatenateGraphFilter)input).toAutomaton();
}
- /**
- * Converts the tokenStream to an automaton
- */
+ /** Delegates to...
+ * @see ConcatenateGraphFilter#toAutomaton(boolean) */
public Automaton toAutomaton(boolean unicodeAware) throws IOException {
- // TODO refactor this
- // maybe we could hook up a modified automaton from TermAutomatonQuery here?
- Automaton automaton = null;
- try {
- // Create corresponding automaton: labels are bytes
- // from each analyzed token, with byte 0 used as
- // separator between tokens:
- final TokenStreamToAutomaton tsta;
- if (preserveSep) {
- tsta = new EscapingTokenStreamToAutomaton((char) SEP_LABEL);
- } else {
- // When we're not preserving sep, we don't steal 0xff
- // byte, so we don't need to do any escaping:
- tsta = new TokenStreamToAutomaton();
- }
- tsta.setPreservePositionIncrements(preservePositionIncrements);
- tsta.setUnicodeArcs(unicodeAware);
-
- automaton = tsta.toAutomaton(inputTokenStream);
- } finally {
- IOUtils.closeWhileHandlingException(inputTokenStream);
- }
-
- // TODO: we can optimize this somewhat by determinizing
- // while we convert
- automaton = replaceSep(automaton, preserveSep, SEP_LABEL);
- // This automaton should not blow up during determinize:
- return Operations.determinize(automaton, maxGraphExpansions);
- }
-
- /**
- * Just escapes the 0xff byte (which we still for SEP).
- */
- private static final class EscapingTokenStreamToAutomaton extends TokenStreamToAutomaton {
-
- final BytesRefBuilder spare = new BytesRefBuilder();
- private char sepLabel;
-
- public EscapingTokenStreamToAutomaton(char sepLabel) {
- this.sepLabel = sepLabel;
- }
-
- @Override
- protected BytesRef changeToken(BytesRef in) {
- int upto = 0;
- for (int i = 0; i < in.length; i++) {
- byte b = in.bytes[in.offset + i];
- if (b == (byte) sepLabel) {
- spare.grow(upto + 2);
- spare.setByteAt(upto++, (byte) sepLabel);
- spare.setByteAt(upto++, b);
- } else {
- spare.grow(upto + 1);
- spare.setByteAt(upto++, b);
- }
- }
- spare.setLength(upto);
- return spare.get();
- }
- }
-
- // Replaces SEP with epsilon or remaps them if
- // we were asked to preserve them:
- private static Automaton replaceSep(Automaton a, boolean preserveSep, int sepLabel) {
-
- Automaton result = new Automaton();
-
- // Copy all states over
- int numStates = a.getNumStates();
- for (int s = 0; s < numStates; s++) {
- result.createState();
- result.setAccept(s, a.isAccept(s));
- }
-
- // Go in reverse topo sort so we know we only have to
- // make one pass:
- Transition t = new Transition();
- int[] topoSortStates = Operations.topoSortStates(a);
- for (int i = 0; i < topoSortStates.length; i++) {
- int state = topoSortStates[topoSortStates.length - 1 - i];
- int count = a.initTransition(state, t);
- for (int j = 0; j < count; j++) {
- a.getNextTransition(t);
- if (t.min == TokenStreamToAutomaton.POS_SEP) {
- assert t.max == TokenStreamToAutomaton.POS_SEP;
- if (preserveSep) {
- // Remap to SEP_LABEL:
- result.addTransition(state, t.dest, sepLabel);
- } else {
- result.addEpsilon(state, t.dest);
- }
- } else if (t.min == TokenStreamToAutomaton.HOLE) {
- assert t.max == TokenStreamToAutomaton.HOLE;
-
- // Just remove the hole: there will then be two
- // SEP tokens next to each other, which will only
- // match another hole at search time. Note that
- // it will also match an empty-string token ... if
- // that's somehow a problem we can always map HOLE
- // to a dedicated byte (and escape it in the
- // input).
- result.addEpsilon(state, t.dest);
- } else {
- result.addTransition(state, t.dest, t.min, t.max);
- }
- }
- }
-
- result.finishState();
-
- return result;
- }
-
- /**
- * Attribute providing access to the term builder and UTF-16 conversion
- */
- public interface BytesRefBuilderTermAttribute extends TermToBytesRefAttribute {
- /**
- * Returns the builder from which the term is derived.
- */
- BytesRefBuilder builder();
-
- /**
- * Returns the term represented as UTF-16
- */
- CharSequence toUTF16();
- }
-
- /**
- * Custom attribute implementation for completion token stream
- */
- public static final class BytesRefBuilderTermAttributeImpl extends AttributeImpl implements BytesRefBuilderTermAttribute, TermToBytesRefAttribute {
- private final BytesRefBuilder bytes = new BytesRefBuilder();
- private transient CharsRefBuilder charsRef;
-
- /**
- * Sole constructor
- * no-op
- */
- public BytesRefBuilderTermAttributeImpl() {
- }
-
- @Override
- public BytesRefBuilder builder() {
- return bytes;
- }
-
- @Override
- public BytesRef getBytesRef() {
- return bytes.get();
- }
-
- @Override
- public void clear() {
- bytes.clear();
- }
-
- @Override
- public void copyTo(AttributeImpl target) {
- BytesRefBuilderTermAttributeImpl other = (BytesRefBuilderTermAttributeImpl) target;
- other.bytes.copyBytes(bytes);
- }
-
- @Override
- public AttributeImpl clone() {
- BytesRefBuilderTermAttributeImpl other = new BytesRefBuilderTermAttributeImpl();
- copyTo(other);
- return other;
- }
-
- @Override
- public void reflectWith(AttributeReflector reflector) {
- reflector.reflect(TermToBytesRefAttribute.class, "bytes", getBytesRef());
- }
-
- @Override
- public CharSequence toUTF16() {
- if (charsRef == null) {
- charsRef = new CharsRefBuilder();
- }
- charsRef.copyUTF8Bytes(getBytesRef());
- return charsRef.get();
- }
+ return ((ConcatenateGraphFilter)input).toAutomaton(unicodeAware);
}
}
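With this change CompletionTokenStream is a thin wrapper: ConcatenateGraphFilter does the graph-to-automaton work and the wrapper only stamps the payload. A hedged usage sketch of the underlying filter on its own (it assumes, as the old CompletionTokenStream did, that a CharTermAttribute added by the consumer is populated with each concatenated path; the field name and input text are illustrative):

import java.io.IOException;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.miscellaneous.ConcatenateGraphFilter;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

class ConcatenateGraphSketch {
  // Sketch only: each emitted token is one accepted path through the analyzed
  // token graph, with tokens joined by ConcatenateGraphFilter.SEP_LABEL when
  // separators are preserved (the default).
  static void printPaths(String text) throws IOException {
    try (StandardAnalyzer analyzer = new StandardAnalyzer();
         TokenStream source = analyzer.tokenStream("suggest", text)) {
      ConcatenateGraphFilter concat = new ConcatenateGraphFilter(source);
      CharTermAttribute term = concat.addAttribute(CharTermAttribute.class);
      concat.reset();
      while (concat.incrementToken()) {
        System.out.println(term);
      }
      concat.end();
      concat.close();
    }
  }
}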
diff --git a/lucene/suggest/src/java/org/apache/lucene/search/suggest/document/ContextQuery.java b/lucene/suggest/src/java/org/apache/lucene/search/suggest/document/ContextQuery.java
index 6217ca38f85..1a2680cb553 100644
--- a/lucene/suggest/src/java/org/apache/lucene/search/suggest/document/ContextQuery.java
+++ b/lucene/suggest/src/java/org/apache/lucene/search/suggest/document/ContextQuery.java
@@ -22,6 +22,7 @@ import java.util.Iterator;
import java.util.Map;
import java.util.TreeSet;
+import org.apache.lucene.analysis.miscellaneous.ConcatenateGraphFilter;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.ScoreMode;
import org.apache.lucene.search.Weight;
@@ -178,7 +179,7 @@ public class ContextQuery extends CompletionQuery {
// if separators are preserved the fst contains a SEP_LABEL
// behind each gap. To have a matching automaton, we need to
// include the SEP_LABEL in the query as well
- Automaton optionalSepLabel = Operations.optional(Automata.makeChar(CompletionAnalyzer.SEP_LABEL));
+ Automaton optionalSepLabel = Operations.optional(Automata.makeChar(ConcatenateGraphFilter.SEP_LABEL));
Automaton prefixAutomaton = Operations.concatenate(optionalSepLabel, innerAutomaton);
Automaton contextsAutomaton = Operations.concatenate(toContextAutomaton(contexts, matchAllContexts), prefixAutomaton);
contextsAutomaton = Operations.determinize(contextsAutomaton, Operations.DEFAULT_MAX_DETERMINIZED_STATES);
@@ -302,7 +303,7 @@ public class ContextQuery extends CompletionQuery {
}
ref.offset = ++i;
assert ref.offset < ref.length : "input should not end with the context separator";
- if (ref.ints[i] == CompletionAnalyzer.SEP_LABEL) {
+ if (ref.ints[i] == ConcatenateGraphFilter.SEP_LABEL) {
ref.offset++;
assert ref.offset < ref.length : "input should not end with a context separator followed by SEP_LABEL";
}
diff --git a/lucene/suggest/src/java/org/apache/lucene/search/suggest/document/ContextSuggestField.java b/lucene/suggest/src/java/org/apache/lucene/search/suggest/document/ContextSuggestField.java
index 4cb91b8053c..cf462e1dbc8 100644
--- a/lucene/suggest/src/java/org/apache/lucene/search/suggest/document/ContextSuggestField.java
+++ b/lucene/suggest/src/java/org/apache/lucene/search/suggest/document/ContextSuggestField.java
@@ -90,6 +90,7 @@ public class ContextSuggestField extends SuggestField {
}
CompletionTokenStream completionTokenStream;
if (stream instanceof CompletionTokenStream) {
+ //TODO this is awkward; is there a better way avoiding re-creating the chain?
completionTokenStream = (CompletionTokenStream) stream;
PrefixTokenFilter prefixTokenFilter = new PrefixTokenFilter(completionTokenStream.inputTokenStream, (char) CONTEXT_SEPARATOR, contexts);
completionTokenStream = new CompletionTokenStream(prefixTokenFilter,
diff --git a/lucene/suggest/src/java/org/apache/lucene/search/suggest/document/FuzzyCompletionQuery.java b/lucene/suggest/src/java/org/apache/lucene/search/suggest/document/FuzzyCompletionQuery.java
index b243f4ede83..14479fecd12 100644
--- a/lucene/suggest/src/java/org/apache/lucene/search/suggest/document/FuzzyCompletionQuery.java
+++ b/lucene/suggest/src/java/org/apache/lucene/search/suggest/document/FuzzyCompletionQuery.java
@@ -144,9 +144,12 @@ public class FuzzyCompletionQuery extends PrefixCompletionQuery {
@Override
public Weight createWeight(IndexSearcher searcher, ScoreMode scoreMode, float boost) throws IOException {
- CompletionTokenStream stream = (CompletionTokenStream) analyzer.tokenStream(getField(), getTerm().text());
+ final Automaton originalAutomata;
+ try (CompletionTokenStream stream = (CompletionTokenStream) analyzer.tokenStream(getField(), getTerm().text()) ) {
+ originalAutomata = stream.toAutomaton(unicodeAware);
+ }
Set<IntsRef> refs = new HashSet<>();
- Automaton automaton = toLevenshteinAutomata(stream.toAutomaton(unicodeAware), refs);
+ Automaton automaton = toLevenshteinAutomata(originalAutomata, refs);
if (unicodeAware) {
Automaton utf8automaton = new UTF32ToUTF8().convert(automaton);
utf8automaton = Operations.determinize(utf8automaton, maxDeterminizedStates);
diff --git a/lucene/suggest/src/java/org/apache/lucene/search/suggest/document/NRTSuggesterBuilder.java b/lucene/suggest/src/java/org/apache/lucene/search/suggest/document/NRTSuggesterBuilder.java
index 270463175d7..5ca4993396f 100644
--- a/lucene/suggest/src/java/org/apache/lucene/search/suggest/document/NRTSuggesterBuilder.java
+++ b/lucene/suggest/src/java/org/apache/lucene/search/suggest/document/NRTSuggesterBuilder.java
@@ -19,6 +19,7 @@ package org.apache.lucene.search.suggest.document;
import java.io.IOException;
import java.util.PriorityQueue;
+import org.apache.lucene.analysis.miscellaneous.ConcatenateGraphFilter;
import org.apache.lucene.store.DataOutput;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.BytesRefBuilder;
@@ -42,7 +43,7 @@ final class NRTSuggesterBuilder {
* Label used to separate surface form and docID
* in the output
*/
- public static final int PAYLOAD_SEP = '\u001F';
+ public static final int PAYLOAD_SEP = ConcatenateGraphFilter.SEP_LABEL;
/**
* Marks end of the analyzed input and start of dedup
diff --git a/lucene/suggest/src/java/org/apache/lucene/search/suggest/document/PrefixCompletionQuery.java b/lucene/suggest/src/java/org/apache/lucene/search/suggest/document/PrefixCompletionQuery.java
index 7bb75e9261c..a8da150f504 100644
--- a/lucene/suggest/src/java/org/apache/lucene/search/suggest/document/PrefixCompletionQuery.java
+++ b/lucene/suggest/src/java/org/apache/lucene/search/suggest/document/PrefixCompletionQuery.java
@@ -68,8 +68,9 @@ public class PrefixCompletionQuery extends CompletionQuery {
@Override
public Weight createWeight(IndexSearcher searcher, ScoreMode scoreMode, float boost) throws IOException {
- CompletionTokenStream stream = (CompletionTokenStream) analyzer.tokenStream(getField(), getTerm().text());
- return new CompletionWeight(this, stream.toAutomaton());
+ try (CompletionTokenStream stream = (CompletionTokenStream) analyzer.tokenStream(getField(), getTerm().text())) {
+ return new CompletionWeight(this, stream.toAutomaton());
+ }
}
/**
diff --git a/lucene/suggest/src/java/org/apache/lucene/search/suggest/document/SuggestField.java b/lucene/suggest/src/java/org/apache/lucene/search/suggest/document/SuggestField.java
index 7f06328ee1b..b2d24c2c84e 100644
--- a/lucene/suggest/src/java/org/apache/lucene/search/suggest/document/SuggestField.java
+++ b/lucene/suggest/src/java/org/apache/lucene/search/suggest/document/SuggestField.java
@@ -21,6 +21,7 @@ import java.io.IOException;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.miscellaneous.ConcatenateGraphFilter;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.FieldType;
import org.apache.lucene.index.IndexOptions;
@@ -140,7 +141,7 @@ public class SuggestField extends Field {
private boolean isReserved(char c) {
switch (c) {
- case CompletionAnalyzer.SEP_LABEL:
+ case ConcatenateGraphFilter.SEP_LABEL:
case CompletionAnalyzer.HOLE_CHARACTER:
case NRTSuggesterBuilder.END_BYTE:
return true;
diff --git a/lucene/suggest/src/test/org/apache/lucene/search/suggest/document/TestContextSuggestField.java b/lucene/suggest/src/test/org/apache/lucene/search/suggest/document/TestContextSuggestField.java
index 0c3b254c132..8beea129622 100644
--- a/lucene/suggest/src/test/org/apache/lucene/search/suggest/document/TestContextSuggestField.java
+++ b/lucene/suggest/src/test/org/apache/lucene/search/suggest/document/TestContextSuggestField.java
@@ -21,6 +21,7 @@ import java.io.ByteArrayOutputStream;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.MockAnalyzer;
import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.miscellaneous.ConcatenateGraphFilter;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.index.DirectoryReader;
@@ -109,21 +110,21 @@ public class TestContextSuggestField extends LuceneTestCase {
CharsRefBuilder builder = new CharsRefBuilder();
builder.append("context1");
builder.append(((char) ContextSuggestField.CONTEXT_SEPARATOR));
- builder.append(((char) CompletionAnalyzer.SEP_LABEL));
+ builder.append((char) ConcatenateGraphFilter.SEP_LABEL);
builder.append("input");
expectedOutputs[0] = builder.toCharsRef().toString();
builder.clear();
builder.append("context2");
builder.append(((char) ContextSuggestField.CONTEXT_SEPARATOR));
- builder.append(((char) CompletionAnalyzer.SEP_LABEL));
+ builder.append((char) ConcatenateGraphFilter.SEP_LABEL);
builder.append("input");
expectedOutputs[1] = builder.toCharsRef().toString();
- TokenStream stream = new CompletionTokenStreamTest.PayloadAttrToTypeAttrFilter(field.tokenStream(analyzer, null));
- assertTokenStreamContents(stream, expectedOutputs, null, null, new String[]{payload.utf8ToString(), payload.utf8ToString()}, new int[]{1, 1}, null, null);
+ TokenStream stream = new TestSuggestField.PayloadAttrToTypeAttrFilter(field.tokenStream(analyzer, null));
+ assertTokenStreamContents(stream, expectedOutputs, null, null, new String[]{payload.utf8ToString(), payload.utf8ToString()}, new int[]{1, 0}, null, null);
CompletionAnalyzer completionAnalyzer = new CompletionAnalyzer(analyzer);
- stream = new CompletionTokenStreamTest.PayloadAttrToTypeAttrFilter(field.tokenStream(completionAnalyzer, null));
- assertTokenStreamContents(stream, expectedOutputs, null, null, new String[]{payload.utf8ToString(), payload.utf8ToString()}, new int[]{1, 1}, null, null);
+ stream = new TestSuggestField.PayloadAttrToTypeAttrFilter(field.tokenStream(completionAnalyzer, null));
+ assertTokenStreamContents(stream, expectedOutputs, null, null, new String[]{payload.utf8ToString(), payload.utf8ToString()}, new int[]{1, 0}, null, null);
}
@Test
diff --git a/lucene/suggest/src/test/org/apache/lucene/search/suggest/document/TestSuggestField.java b/lucene/suggest/src/test/org/apache/lucene/search/suggest/document/TestSuggestField.java
index a6659e082d5..e6d7062c925 100644
--- a/lucene/suggest/src/test/org/apache/lucene/search/suggest/document/TestSuggestField.java
+++ b/lucene/suggest/src/test/org/apache/lucene/search/suggest/document/TestSuggestField.java
@@ -32,7 +32,11 @@ import java.util.concurrent.CyclicBarrier;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.MockAnalyzer;
+import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.miscellaneous.ConcatenateGraphFilter;
+import org.apache.lucene.analysis.tokenattributes.PayloadAttribute;
+import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
import org.apache.lucene.codecs.Codec;
import org.apache.lucene.codecs.PostingsFormat;
import org.apache.lucene.codecs.lucene70.Lucene70Codec;
@@ -99,7 +103,7 @@ public class TestSuggestField extends LuceneTestCase {
public void testReservedChars() throws Exception {
CharsRefBuilder charsRefBuilder = new CharsRefBuilder();
charsRefBuilder.append("sugg");
- charsRefBuilder.setCharAt(2, (char) CompletionAnalyzer.SEP_LABEL);
+ charsRefBuilder.setCharAt(2, (char) ConcatenateGraphFilter.SEP_LABEL);
IllegalArgumentException expected = expectThrows(IllegalArgumentException.class, () -> {
new SuggestField("name", charsRefBuilder.toString(), 1);
});
@@ -144,11 +148,11 @@ public class TestSuggestField extends LuceneTestCase {
output.writeByte(SuggestField.TYPE);
}
BytesRef payload = new BytesRef(byteArrayOutputStream.toByteArray());
- TokenStream stream = new CompletionTokenStreamTest.PayloadAttrToTypeAttrFilter(suggestField.tokenStream(analyzer, null));
+ TokenStream stream = new PayloadAttrToTypeAttrFilter(suggestField.tokenStream(analyzer, null));
assertTokenStreamContents(stream, new String[] {"input"}, null, null, new String[]{payload.utf8ToString()}, new int[]{1}, null, null);
CompletionAnalyzer completionAnalyzer = new CompletionAnalyzer(analyzer);
- stream = new CompletionTokenStreamTest.PayloadAttrToTypeAttrFilter(suggestField.tokenStream(completionAnalyzer, null));
+ stream = new PayloadAttrToTypeAttrFilter(suggestField.tokenStream(completionAnalyzer, null));
assertTokenStreamContents(stream, new String[] {"input"}, null, null, new String[]{payload.utf8ToString()}, new int[]{1}, null, null);
}
@@ -894,4 +898,23 @@ public class TestSuggestField extends LuceneTestCase {
iwc.setCodec(filterCodec);
return iwc;
}
+
+ public final static class PayloadAttrToTypeAttrFilter extends TokenFilter {
+ private PayloadAttribute payload = addAttribute(PayloadAttribute.class);
+ private TypeAttribute type = addAttribute(TypeAttribute.class);
+
+ protected PayloadAttrToTypeAttrFilter(TokenStream input) {
+ super(input);
+ }
+
+ @Override
+ public boolean incrementToken() throws IOException {
+ if (input.incrementToken()) {
+ // we move them over so we can assert them more easily in the tests
+ type.setType(payload.getPayload().utf8ToString());
+ return true;
+ }
+ return false;
+ }
+ }
}
diff --git a/lucene/test-framework/src/java/org/apache/lucene/index/BaseFieldInfoFormatTestCase.java b/lucene/test-framework/src/java/org/apache/lucene/index/BaseFieldInfoFormatTestCase.java
index 9363ce63fe3..3515b9a9c97 100644
--- a/lucene/test-framework/src/java/org/apache/lucene/index/BaseFieldInfoFormatTestCase.java
+++ b/lucene/test-framework/src/java/org/apache/lucene/index/BaseFieldInfoFormatTestCase.java
@@ -53,7 +53,7 @@ public abstract class BaseFieldInfoFormatTestCase extends BaseIndexFileFormatTes
Directory dir = newDirectory();
Codec codec = getCodec();
SegmentInfo segmentInfo = newSegmentInfo(dir, "_123");
- FieldInfos.Builder builder = new FieldInfos.Builder();
+ FieldInfos.Builder builder = new FieldInfos.Builder(new FieldInfos.FieldNumbers(null));
FieldInfo fi = builder.getOrAdd("field");
fi.setIndexOptions(TextField.TYPE_STORED.indexOptions());
addAttributes(fi);
@@ -75,7 +75,7 @@ public abstract class BaseFieldInfoFormatTestCase extends BaseIndexFileFormatTes
Directory dir = newDirectory();
Codec codec = getCodec();
SegmentInfo segmentInfo = newSegmentInfo(dir, "_123");
- FieldInfos.Builder builder = new FieldInfos.Builder();
+ FieldInfos.Builder builder = new FieldInfos.Builder(new FieldInfos.FieldNumbers(null));
FieldInfo fi = builder.getOrAdd("field");
fi.setIndexOptions(TextField.TYPE_STORED.indexOptions());
addAttributes(fi);
@@ -115,7 +115,7 @@ public abstract class BaseFieldInfoFormatTestCase extends BaseIndexFileFormatTes
dir.failOn(fail);
Codec codec = getCodec();
SegmentInfo segmentInfo = newSegmentInfo(dir, "_123");
- FieldInfos.Builder builder = new FieldInfos.Builder();
+ FieldInfos.Builder builder = new FieldInfos.Builder(new FieldInfos.FieldNumbers(null));
FieldInfo fi = builder.getOrAdd("field");
fi.setIndexOptions(TextField.TYPE_STORED.indexOptions());
addAttributes(fi);
@@ -150,7 +150,7 @@ public abstract class BaseFieldInfoFormatTestCase extends BaseIndexFileFormatTes
dir.failOn(fail);
Codec codec = getCodec();
SegmentInfo segmentInfo = newSegmentInfo(dir, "_123");
- FieldInfos.Builder builder = new FieldInfos.Builder();
+ FieldInfos.Builder builder = new FieldInfos.Builder(new FieldInfos.FieldNumbers(null));
FieldInfo fi = builder.getOrAdd("field");
fi.setIndexOptions(TextField.TYPE_STORED.indexOptions());
addAttributes(fi);
@@ -185,7 +185,7 @@ public abstract class BaseFieldInfoFormatTestCase extends BaseIndexFileFormatTes
dir.failOn(fail);
Codec codec = getCodec();
SegmentInfo segmentInfo = newSegmentInfo(dir, "_123");
- FieldInfos.Builder builder = new FieldInfos.Builder();
+ FieldInfos.Builder builder = new FieldInfos.Builder(new FieldInfos.FieldNumbers(null));
FieldInfo fi = builder.getOrAdd("field");
fi.setIndexOptions(TextField.TYPE_STORED.indexOptions());
addAttributes(fi);
@@ -221,7 +221,7 @@ public abstract class BaseFieldInfoFormatTestCase extends BaseIndexFileFormatTes
dir.failOn(fail);
Codec codec = getCodec();
SegmentInfo segmentInfo = newSegmentInfo(dir, "_123");
- FieldInfos.Builder builder = new FieldInfos.Builder();
+ FieldInfos.Builder builder = new FieldInfos.Builder(new FieldInfos.FieldNumbers(null));
FieldInfo fi = builder.getOrAdd("field");
fi.setIndexOptions(TextField.TYPE_STORED.indexOptions());
addAttributes(fi);
@@ -251,7 +251,7 @@ public abstract class BaseFieldInfoFormatTestCase extends BaseIndexFileFormatTes
for (int i = 0; i < numFields; i++) {
fieldNames.add(TestUtil.randomUnicodeString(random()));
}
- FieldInfos.Builder builder = new FieldInfos.Builder();
+ FieldInfos.Builder builder = new FieldInfos.Builder(new FieldInfos.FieldNumbers(null));
for (String field : fieldNames) {
IndexableFieldType fieldType = randomFieldType(random());
FieldInfo fi = builder.getOrAdd(field);
diff --git a/lucene/test-framework/src/java/org/apache/lucene/index/BaseIndexFileFormatTestCase.java b/lucene/test-framework/src/java/org/apache/lucene/index/BaseIndexFileFormatTestCase.java
index f5b52239057..83419de52e2 100644
--- a/lucene/test-framework/src/java/org/apache/lucene/index/BaseIndexFileFormatTestCase.java
+++ b/lucene/test-framework/src/java/org/apache/lucene/index/BaseIndexFileFormatTestCase.java
@@ -323,7 +323,7 @@ abstract class BaseIndexFileFormatTestCase extends LuceneTestCase {
FieldInfo proto = oneDocReader.getFieldInfos().fieldInfo("field");
FieldInfo field = new FieldInfo(proto.name, proto.number, proto.hasVectors(), proto.omitsNorms(), proto.hasPayloads(),
proto.getIndexOptions(), proto.getDocValuesType(), proto.getDocValuesGen(), new HashMap<>(),
- proto.getPointDimensionCount(), proto.getPointNumBytes());
+ proto.getPointDimensionCount(), proto.getPointNumBytes(), proto.isSoftDeletesField());
FieldInfos fieldInfos = new FieldInfos(new FieldInfo[] { field } );
diff --git a/lucene/test-framework/src/java/org/apache/lucene/index/BaseLiveDocsFormatTestCase.java b/lucene/test-framework/src/java/org/apache/lucene/index/BaseLiveDocsFormatTestCase.java
index b4799f86fdb..9c01990b195 100644
--- a/lucene/test-framework/src/java/org/apache/lucene/index/BaseLiveDocsFormatTestCase.java
+++ b/lucene/test-framework/src/java/org/apache/lucene/index/BaseLiveDocsFormatTestCase.java
@@ -125,10 +125,10 @@ public abstract class BaseLiveDocsFormatTestCase extends LuceneTestCase {
final Directory dir = newDirectory();
final SegmentInfo si = new SegmentInfo(dir, Version.LATEST, Version.LATEST, "foo", maxDoc, random().nextBoolean(),
codec, Collections.emptyMap(), StringHelper.randomId(), Collections.emptyMap(), null);
- SegmentCommitInfo sci = new SegmentCommitInfo(si, 0, 0, -1, -1);
+ SegmentCommitInfo sci = new SegmentCommitInfo(si, 0, 0, 0, -1, -1);
format.writeLiveDocs(bits, dir, sci, maxDoc - numLiveDocs, IOContext.DEFAULT);
- sci = new SegmentCommitInfo(si, maxDoc - numLiveDocs, 1, -1, -1);
+ sci = new SegmentCommitInfo(si, maxDoc - numLiveDocs, 0, 1, -1, -1);
final Bits bits2 = format.readLiveDocs(dir, sci, IOContext.READONCE);
assertEquals(maxDoc, bits2.length());
for (int i = 0; i < maxDoc; ++i) {
diff --git a/lucene/test-framework/src/java/org/apache/lucene/index/BaseMergePolicyTestCase.java b/lucene/test-framework/src/java/org/apache/lucene/index/BaseMergePolicyTestCase.java
index 8f986277f5d..477b0a3c548 100644
--- a/lucene/test-framework/src/java/org/apache/lucene/index/BaseMergePolicyTestCase.java
+++ b/lucene/test-framework/src/java/org/apache/lucene/index/BaseMergePolicyTestCase.java
@@ -116,7 +116,7 @@ public abstract class BaseMergePolicyTestCase extends LuceneTestCase {
Collections.emptyMap(), // attributes
null /* indexSort */);
info.setFiles(Collections.emptyList());
- infos.add(new SegmentCommitInfo(info, random().nextInt(1), -1, -1, -1));
+ infos.add(new SegmentCommitInfo(info, random().nextInt(1), 0, -1, -1, -1));
}
MergePolicy.MergeSpecification forcedDeletesMerges = mp.findForcedDeletesMerges(infos, context);
if (forcedDeletesMerges != null) {
diff --git a/lucene/test-framework/src/java/org/apache/lucene/index/MismatchedLeafReader.java b/lucene/test-framework/src/java/org/apache/lucene/index/MismatchedLeafReader.java
index 7dd6ba89bd0..2c746773f94 100644
--- a/lucene/test-framework/src/java/org/apache/lucene/index/MismatchedLeafReader.java
+++ b/lucene/test-framework/src/java/org/apache/lucene/index/MismatchedLeafReader.java
@@ -77,7 +77,8 @@ public class MismatchedLeafReader extends FilterLeafReader {
oldInfo.getDocValuesGen(), // dvGen
oldInfo.attributes(), // attributes
oldInfo.getPointDimensionCount(), // dimension count
- oldInfo.getPointNumBytes()); // dimension numBytes
+ oldInfo.getPointNumBytes(), // dimension numBytes
+ oldInfo.isSoftDeletesField()); // used as soft-deletes field
shuffled.set(i, newInfo);
}
diff --git a/lucene/test-framework/src/java/org/apache/lucene/index/RandomPostingsTester.java b/lucene/test-framework/src/java/org/apache/lucene/index/RandomPostingsTester.java
index 29962e609a7..9f2d9b7adc0 100644
--- a/lucene/test-framework/src/java/org/apache/lucene/index/RandomPostingsTester.java
+++ b/lucene/test-framework/src/java/org/apache/lucene/index/RandomPostingsTester.java
@@ -130,7 +130,7 @@ public class RandomPostingsTester {
fieldInfoArray[fieldUpto] = new FieldInfo(field, fieldUpto, false, false, true,
IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS,
DocValuesType.NONE, -1, new HashMap<>(),
- 0, 0);
+ 0, 0, false);
fieldUpto++;
SortedMap<BytesRef,SeedPostings> postings = new TreeMap<>();
@@ -651,7 +651,7 @@ public class RandomPostingsTester {
DocValuesType.NONE,
-1,
new HashMap<>(),
- 0, 0);
+ 0, 0, false);
}
FieldInfos newFieldInfos = new FieldInfos(newFieldInfoArray);
diff --git a/lucene/test-framework/src/java/org/apache/lucene/store/MockDirectoryWrapper.java b/lucene/test-framework/src/java/org/apache/lucene/store/MockDirectoryWrapper.java
index 60f671c25fc..019417771ed 100644
--- a/lucene/test-framework/src/java/org/apache/lucene/store/MockDirectoryWrapper.java
+++ b/lucene/test-framework/src/java/org/apache/lucene/store/MockDirectoryWrapper.java
@@ -804,7 +804,7 @@ public class MockDirectoryWrapper extends BaseDirectoryWrapper {
}
// NOTE: This is off by default; see LUCENE-5574
- private boolean assertNoUnreferencedFilesOnClose;
+ private volatile boolean assertNoUnreferencedFilesOnClose;
public void setAssertNoUnrefencedFilesOnClose(boolean v) {
assertNoUnreferencedFilesOnClose = v;
diff --git a/solr/CHANGES.txt b/solr/CHANGES.txt
index 66d885362fd..6dd4889300e 100644
--- a/solr/CHANGES.txt
+++ b/solr/CHANGES.txt
@@ -70,8 +70,8 @@ Upgrade Notes
To return the previous behavior pass false to skipCommitOnMasterVersionZero in slave section of replication
handler configuration, or pass it to the fetchindex command.
-* SOLR-11453: Configuring slowQueryThresholdMillis now logs slow requests to a separate file - solr_slow_requests.log .
- Previously they would get logged in the solr.xml file
+* SOLR-11453: Configuring slowQueryThresholdMillis now logs slow requests to a separate file - solr_slow_requests.log.
+ Previously they would get logged in the solr.log file.
New Features
----------------------
@@ -140,7 +140,7 @@ New Features
* SOLR-12328: JSON Facet API: Domain change with graph query.
(Daniel Meehl, Kevin Watters, yonik)
-* SOLR-11453: Configuring slowQueryThresholdMillis logs slow requests to a separate file - solr_slow_requests.log .
+* SOLR-11453: Configuring slowQueryThresholdMillis logs slow requests to a separate file - solr_slow_requests.log.
(Shawn Heisey, Remko Popma, Varun Thacker)
* SOLR-12401: Add getValue() and setValue() Stream Evaluators (Joel Bernstein, janhoy)
@@ -154,6 +154,9 @@ New Features
* SOLR-12389: support deeply nested json objects in clusterprops.json (noble)
+* SOLR-12376: Added the TaggerRequestHandler (AKA SolrTextTagger) for tagging text. It's used as a component of
+ NER/ERD systems including query-understanding. See the ref guide for more info. (David Smiley)
+
Bug Fixes
----------------------
@@ -283,13 +286,15 @@ Bug Fixes
* SOLR-12374: SnapShooter.getIndexCommit can forget to decref the searcher; though it's not clear in practice when.
(David Smiley)
-* SOLR-12417: velocity response writer should enforce valid function name for v.json parameter (yonik)
+* SOLR-12417: velocity response writer should enforce valid function name for v.json parameter (Mano Kovacs, yonik)
* SOLR-12271: Fixed bug in how Analytics component reads negative values from float and double fields. (Houston Putman)
* SOLR-12433: Recovering flag of a replica is set equals to leader even it failed to receive update
on recovering. (Cao Manh Dat)
+* SOLR-12354: Register the /admin/info/key end-point at the startup time to avoid 404 (noble)
+
Optimizations
----------------------
@@ -325,6 +330,12 @@ Optimizations
SolrConstantScoreQuery as well. QWF since v5.4.0 sometimes needlessly internally executed and cached the query.
Affects ExpandComponent, ChildDocTransformer, CurrencyFieldType, TermsQParser. (David Smiley)
+* SOLR-9922: Write buffering updates to another tlog. (Cao Manh Dat)
+
+* SOLR-12233: QParserPlugin's built-in static registry now holds actual QParserPlugin instances instead of class
+ references. This is consistent with other plugin registries and allows a SolrCore to load faster.
+ (Jeff Miller, David Smiley)
+
Other Changes
----------------------
@@ -1308,6 +1319,8 @@ Bug Fixes
* SOLR-11477: Disallow resolving of external entities in the XML query parser (defType=xmlparser).
(Michael Stepankin, Olga Barinova, Uwe Schindler, Christine Poerschke)
+* SOLR-12444: Updating a cluster policy fails (noble)
+
Optimizations
----------------------
diff --git a/solr/NOTICE.txt b/solr/NOTICE.txt
index fd954f4ef4f..a5b2070a39b 100644
--- a/solr/NOTICE.txt
+++ b/solr/NOTICE.txt
@@ -537,3 +537,17 @@ See http://www.restlet.org/
Protocol Buffers - Google's data interchange format
Copyright 2008 Google Inc.
http://code.google.com/apis/protocolbuffers/
+
+=========================================================================
+== SolrTextTagger Notice ==
+=========================================================================
+
+The TaggerRequestHandler and related classes in its package came from the
+OpenSextant Solr Text Tagger,
+Copyright 2013 The MITRE Corporation. All Rights Reserved.
+
+ This software was produced for the U. S. Government
+ under Contract No. W15P7T-11-C-F600, and is
+ subject to the Rights in Noncommercial Computer Software
+ and Noncommercial Computer Software Documentation
+ Clause 252.227-7014 (JUN 1995)
\ No newline at end of file
diff --git a/solr/core/src/java/org/apache/solr/cloud/RecoveryStrategy.java b/solr/core/src/java/org/apache/solr/cloud/RecoveryStrategy.java
index c8f5ae89fbe..966497b0938 100644
--- a/solr/core/src/java/org/apache/solr/cloud/RecoveryStrategy.java
+++ b/solr/core/src/java/org/apache/solr/cloud/RecoveryStrategy.java
@@ -449,7 +449,6 @@ public class RecoveryStrategy implements Runnable, Closeable {
// TODO: perhaps make this grab a new core each time through the loop to handle core reloads?
final public void doSyncOrReplicateRecovery(SolrCore core) throws Exception {
- boolean replayed = false;
boolean successfulRecovery = false;
UpdateLog ulog;
@@ -500,8 +499,7 @@ public class RecoveryStrategy implements Runnable, Closeable {
// when we went down. We may have received updates since then.
recentVersions = startingVersions;
try {
- if ((ulog.getStartingOperation() & UpdateLog.FLAG_GAP) != 0) {
- // last operation at the time of startup had the GAP flag set...
+ if (ulog.existOldBufferLog()) {
// this means we were previously doing a full index replication
// that probably didn't complete and buffering updates in the
// meantime.
@@ -542,9 +540,9 @@ public class RecoveryStrategy implements Runnable, Closeable {
}
LOG.info("Begin buffering updates. core=[{}]", coreName);
+ // recalling buffer updates will drop the old buffer tlog
ulog.bufferUpdates();
- replayed = false;
-
+
LOG.info("Publishing state of core [{}] as recovering, leader is [{}] and I am [{}]", core.getName(), leader.getCoreUrl(),
ourUrl);
zkController.publish(core.getCoreDescriptor(), Replica.State.RECOVERING);
@@ -603,8 +601,7 @@ public class RecoveryStrategy implements Runnable, Closeable {
LOG.info("Replaying updates buffered during PeerSync.");
replay(core);
- replayed = true;
-
+
// sync success
successfulRecovery = true;
return;
@@ -630,8 +627,7 @@ public class RecoveryStrategy implements Runnable, Closeable {
}
replayFuture = replay(core);
- replayed = true;
-
+
if (isClosed()) {
LOG.info("RecoveryStrategy has been closed");
break;
@@ -650,21 +646,6 @@ public class RecoveryStrategy implements Runnable, Closeable {
} catch (Exception e) {
SolrException.log(LOG, "Error while trying to recover. core=" + coreName, e);
} finally {
- if (!replayed) {
- // dropBufferedUpdate()s currently only supports returning to ACTIVE state, which risks additional updates
- // being added w/o UpdateLog.FLAG_GAP, hence losing the info on restart that we are not up-to-date.
- // For now, ulog will simply remain in BUFFERING state, and an additional call to bufferUpdates() will
- // reset our starting point for playback.
- LOG.info("Replay not started, or was not successful... still buffering updates.");
-
- /** this prev code is retained in case we want to switch strategies.
- try {
- ulog.dropBufferedUpdates();
- } catch (Exception e) {
- SolrException.log(log, "", e);
- }
- **/
- }
if (successfulRecovery) {
LOG.info("Registering as Active after recovery.");
try {
diff --git a/solr/core/src/java/org/apache/solr/cloud/ReplicateFromLeader.java b/solr/core/src/java/org/apache/solr/cloud/ReplicateFromLeader.java
index 0a742e3a5ae..aa648dd8869 100644
--- a/solr/core/src/java/org/apache/solr/cloud/ReplicateFromLeader.java
+++ b/solr/core/src/java/org/apache/solr/cloud/ReplicateFromLeader.java
@@ -97,7 +97,7 @@ public class ReplicateFromLeader {
new ModifiableSolrParams());
CommitUpdateCommand cuc = new CommitUpdateCommand(req, false);
cuc.setVersion(Long.parseLong(commitVersion));
- updateLog.copyOverOldUpdates(cuc);
+ updateLog.commitAndSwitchToNewTlog(cuc);
lastVersion = Long.parseLong(commitVersion);
}
});
diff --git a/solr/core/src/java/org/apache/solr/cloud/autoscaling/ComputePlanAction.java b/solr/core/src/java/org/apache/solr/cloud/autoscaling/ComputePlanAction.java
index 4a9c7442774..22e3ef5e77e 100644
--- a/solr/core/src/java/org/apache/solr/cloud/autoscaling/ComputePlanAction.java
+++ b/solr/core/src/java/org/apache/solr/cloud/autoscaling/ComputePlanAction.java
@@ -168,7 +168,13 @@ public class ComputePlanAction extends TriggerActionBase {
// estimate a maximum default limit that should be sufficient for most purposes:
// number of nodes * total number of replicas * 3
AtomicInteger totalRF = new AtomicInteger();
- clusterState.forEachCollection(coll -> totalRF.addAndGet(coll.getReplicationFactor() * coll.getSlices().size()));
+ clusterState.forEachCollection(coll -> {
+ Integer rf = coll.getReplicationFactor();
+ if (rf == null) {
+ rf = coll.getReplicas().size() / coll.getSlices().size();
+ }
+ totalRF.addAndGet(rf * coll.getSlices().size());
+ });
int totalMax = clusterState.getLiveNodes().size() * totalRF.get() * 3;
int maxOp = (Integer) autoScalingConfig.getProperties().getOrDefault(AutoScalingParams.MAX_COMPUTE_OPERATIONS, totalMax);
Object o = event.getProperty(AutoScalingParams.MAX_COMPUTE_OPERATIONS, maxOp);
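The guard above avoids a NullPointerException when a collection was created without an explicit replicationFactor; in that case the estimate falls back to the observed replicas per slice before the nodes × totalRF × 3 cap is computed. A tiny, hedged sketch of that fallback arithmetic (class and method names are illustrative only):

class MaxComputeOpsSketch {
  // Sketch only: effective per-collection replication factor used in the estimate.
  static int effectiveReplicationFactor(Integer declaredRf, int replicaCount, int sliceCount) {
    return declaredRf != null ? declaredRf : replicaCount / sliceCount;
  }
}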
diff --git a/solr/core/src/java/org/apache/solr/core/CoreContainer.java b/solr/core/src/java/org/apache/solr/core/CoreContainer.java
index e108ae19f89..d546dd29b9c 100644
--- a/solr/core/src/java/org/apache/solr/core/CoreContainer.java
+++ b/solr/core/src/java/org/apache/solr/core/CoreContainer.java
@@ -16,20 +16,6 @@
*/
package org.apache.solr.core;
-import static java.util.Objects.requireNonNull;
-import static org.apache.solr.common.params.CommonParams.AUTHC_PATH;
-import static org.apache.solr.common.params.CommonParams.AUTHZ_PATH;
-import static org.apache.solr.common.params.CommonParams.AUTOSCALING_HISTORY_PATH;
-import static org.apache.solr.common.params.CommonParams.COLLECTIONS_HANDLER_PATH;
-import static org.apache.solr.common.params.CommonParams.CONFIGSETS_HANDLER_PATH;
-import static org.apache.solr.common.params.CommonParams.CORES_HANDLER_PATH;
-import static org.apache.solr.common.params.CommonParams.HEALTH_CHECK_HANDLER_PATH;
-import static org.apache.solr.common.params.CommonParams.INFO_HANDLER_PATH;
-import static org.apache.solr.common.params.CommonParams.METRICS_HISTORY_PATH;
-import static org.apache.solr.common.params.CommonParams.METRICS_PATH;
-import static org.apache.solr.common.params.CommonParams.ZK_PATH;
-import static org.apache.solr.security.AuthenticationPlugin.AUTHENTICATION_PLUGIN_PROP;
-
import java.io.IOException;
import java.lang.invoke.MethodHandles;
import java.nio.file.Path;
@@ -64,15 +50,15 @@ import org.apache.solr.client.solrj.impl.SolrHttpClientContextBuilder;
import org.apache.solr.client.solrj.impl.SolrHttpClientContextBuilder.AuthSchemeRegistryProvider;
import org.apache.solr.client.solrj.impl.SolrHttpClientContextBuilder.CredentialsProviderProvider;
import org.apache.solr.client.solrj.util.SolrIdentifierValidator;
-import org.apache.solr.cloud.autoscaling.AutoScalingHandler;
import org.apache.solr.cloud.CloudDescriptor;
import org.apache.solr.cloud.Overseer;
import org.apache.solr.cloud.ZkController;
+import org.apache.solr.cloud.autoscaling.AutoScalingHandler;
import org.apache.solr.common.SolrException;
import org.apache.solr.common.SolrException.ErrorCode;
+import org.apache.solr.common.cloud.DocCollection;
import org.apache.solr.common.cloud.Replica;
import org.apache.solr.common.cloud.Replica.State;
-import org.apache.solr.common.cloud.DocCollection;
import org.apache.solr.common.util.ExecutorUtil;
import org.apache.solr.common.util.IOUtils;
import org.apache.solr.common.util.Utils;
@@ -106,6 +92,7 @@ import org.apache.solr.security.AuthenticationPlugin;
import org.apache.solr.security.AuthorizationPlugin;
import org.apache.solr.security.HttpClientBuilderPlugin;
import org.apache.solr.security.PKIAuthenticationPlugin;
+import org.apache.solr.security.PublicKeyHandler;
import org.apache.solr.security.SecurityPluginHolder;
import org.apache.solr.update.SolrCoreState;
import org.apache.solr.update.UpdateShardHandler;
@@ -116,7 +103,20 @@ import org.apache.zookeeper.KeeperException;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
+import static java.util.Objects.requireNonNull;
+import static org.apache.solr.common.params.CommonParams.AUTHC_PATH;
+import static org.apache.solr.common.params.CommonParams.AUTHZ_PATH;
+import static org.apache.solr.common.params.CommonParams.AUTOSCALING_HISTORY_PATH;
+import static org.apache.solr.common.params.CommonParams.COLLECTIONS_HANDLER_PATH;
+import static org.apache.solr.common.params.CommonParams.CONFIGSETS_HANDLER_PATH;
+import static org.apache.solr.common.params.CommonParams.CORES_HANDLER_PATH;
+import static org.apache.solr.common.params.CommonParams.HEALTH_CHECK_HANDLER_PATH;
+import static org.apache.solr.common.params.CommonParams.INFO_HANDLER_PATH;
+import static org.apache.solr.common.params.CommonParams.METRICS_HISTORY_PATH;
+import static org.apache.solr.common.params.CommonParams.METRICS_PATH;
+import static org.apache.solr.common.params.CommonParams.ZK_PATH;
import static org.apache.solr.core.CorePropertiesLocator.PROPERTIES_FILENAME;
+import static org.apache.solr.security.AuthenticationPlugin.AUTHENTICATION_PLUGIN_PROP;
/**
*
@@ -301,6 +301,7 @@ public class CoreContainer {
public CoreContainer(NodeConfig config, Properties properties, CoresLocator locator, boolean asyncSolrCoreLoad) {
this.loader = config.getSolrResourceLoader();
this.solrHome = loader.getInstancePath().toString();
+ containerHandlers.put(PublicKeyHandler.PATH, new PublicKeyHandler());
this.cfg = requireNonNull(config);
this.coresLocator = locator;
this.containerProperties = new Properties(properties);
@@ -548,7 +549,8 @@ public class CoreContainer {
hostName = cfg.getNodeName();
zkSys.initZooKeeper(this, solrHome, cfg.getCloudConfig());
- if(isZooKeeperAware()) pkiAuthenticationPlugin = new PKIAuthenticationPlugin(this, zkSys.getZkController().getNodeName());
+ if(isZooKeeperAware()) pkiAuthenticationPlugin = new PKIAuthenticationPlugin(this, zkSys.getZkController().getNodeName(),
+ (PublicKeyHandler) containerHandlers.get(PublicKeyHandler.PATH));
MDCLoggingContext.setNode(this);
@@ -592,8 +594,7 @@ public class CoreContainer {
containerHandlers.put(AUTHZ_PATH, securityConfHandler);
securityConfHandler.initializeMetrics(metricManager, SolrInfoBean.Group.node.toString(), metricTag, AUTHZ_PATH);
containerHandlers.put(AUTHC_PATH, securityConfHandler);
- if(pkiAuthenticationPlugin != null)
- containerHandlers.put(PKIAuthenticationPlugin.PATH, pkiAuthenticationPlugin.getRequestHandler());
+
PluginInfo[] metricReporters = cfg.getMetricsConfig().getMetricReporters();
metricManager.loadReporters(metricReporters, loader, this, null, null, SolrInfoBean.Group.node);
diff --git a/solr/core/src/java/org/apache/solr/core/SolrCore.java b/solr/core/src/java/org/apache/solr/core/SolrCore.java
index 99c0cca0669..feab22dce1c 100644
--- a/solr/core/src/java/org/apache/solr/core/SolrCore.java
+++ b/solr/core/src/java/org/apache/solr/core/SolrCore.java
@@ -958,7 +958,7 @@ public final class SolrCore implements SolrInfoBean, SolrMetricProducer, Closeab
initIndex(prev != null, reload);
initWriters();
- qParserPlugins.init(createInstances(QParserPlugin.standardPlugins), this);
+ qParserPlugins.init(QParserPlugin.standardPlugins, this);
valueSourceParsers.init(ValueSourceParser.standardValueSourceParsers, this);
transformerFactories.init(TransformerFactory.defaultFactories, this);
loadSearchComponents();
diff --git a/solr/core/src/java/org/apache/solr/core/SolrResourceLoader.java b/solr/core/src/java/org/apache/solr/core/SolrResourceLoader.java
index 22753dd0c6a..0ff5c7b362c 100644
--- a/solr/core/src/java/org/apache/solr/core/SolrResourceLoader.java
+++ b/solr/core/src/java/org/apache/solr/core/SolrResourceLoader.java
@@ -16,6 +16,10 @@
*/
package org.apache.solr.core;
+import javax.naming.Context;
+import javax.naming.InitialContext;
+import javax.naming.NamingException;
+import javax.naming.NoInitialContextException;
import java.io.Closeable;
import java.io.File;
import java.io.FileOutputStream;
@@ -47,10 +51,6 @@ import java.util.concurrent.ConcurrentSkipListSet;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import java.util.stream.Collectors;
-import javax.naming.Context;
-import javax.naming.InitialContext;
-import javax.naming.NamingException;
-import javax.naming.NoInitialContextException;
import org.apache.lucene.analysis.WordlistLoader;
import org.apache.lucene.analysis.util.CharFilterFactory;
@@ -88,9 +88,9 @@ public class SolrResourceLoader implements ResourceLoader,Closeable
static final String project = "solr";
static final String base = "org.apache" + "." + project;
static final String[] packages = {
- "", "analysis.", "schema.", "handler.", "search.", "update.", "core.", "response.", "request.",
+ "", "analysis.", "schema.", "handler.", "handler.tagger.", "search.", "update.", "core.", "response.", "request.",
"update.processor.", "util.", "spelling.", "handler.component.", "handler.dataimport.",
- "spelling.suggest.", "spelling.suggest.fst.", "rest.schema.analysis.", "security.","handler.admin.",
+ "spelling.suggest.", "spelling.suggest.fst.", "rest.schema.analysis.", "security.", "handler.admin.",
"cloud.autoscaling."
};
private static final java.lang.String SOLR_CORE_NAME = "solr.core.name";
diff --git a/solr/core/src/java/org/apache/solr/handler/admin/CollectionsHandler.java b/solr/core/src/java/org/apache/solr/handler/admin/CollectionsHandler.java
index 01d2fe89884..269bb50641d 100644
--- a/solr/core/src/java/org/apache/solr/handler/admin/CollectionsHandler.java
+++ b/solr/core/src/java/org/apache/solr/handler/admin/CollectionsHandler.java
@@ -206,9 +206,10 @@ public class CollectionsHandler extends RequestHandlerBase implements Permission
return this.coreContainer;
}
- protected void copyFromClusterProp(Map props, String prop) {
+ protected void copyFromClusterProp(Map props, String prop) throws IOException {
if (props.get(prop) != null) return;//if it's already specified , return
- Object defVal = coreContainer.getZkController().getZkStateReader().getClusterProperty(ImmutableList.of(COLLECTION_DEF, prop), null);
+ Object defVal = new ClusterProperties(coreContainer.getZkController().getZkStateReader().getZkClient())
+ .getClusterProperty(ImmutableList.of(COLLECTION_DEF, prop), null);
if (defVal != null) props.put(prop, String.valueOf(defVal));
}
diff --git a/solr/core/src/java/org/apache/solr/handler/component/ExpandComponent.java b/solr/core/src/java/org/apache/solr/handler/component/ExpandComponent.java
index 82a62d56d3e..9ffea4bc9c8 100644
--- a/solr/core/src/java/org/apache/solr/handler/component/ExpandComponent.java
+++ b/solr/core/src/java/org/apache/solr/handler/component/ExpandComponent.java
@@ -797,7 +797,8 @@ public class ExpandComponent extends SearchComponent implements PluginInfoInitia
fieldInfo.getDocValuesGen(),
fieldInfo.attributes(),
fieldInfo.getPointDimensionCount(),
- fieldInfo.getPointNumBytes());
+ fieldInfo.getPointNumBytes(),
+ fieldInfo.isSoftDeletesField());
newInfos.add(f);
} else {
diff --git a/solr/core/src/java/org/apache/solr/handler/tagger/OffsetCorrector.java b/solr/core/src/java/org/apache/solr/handler/tagger/OffsetCorrector.java
new file mode 100644
index 00000000000..1fb4911195d
--- /dev/null
+++ b/solr/core/src/java/org/apache/solr/handler/tagger/OffsetCorrector.java
@@ -0,0 +1,178 @@
+/*
+ * This software was produced for the U. S. Government
+ * under Contract No. W15P7T-11-C-F600, and is
+ * subject to the Rights in Noncommercial Computer Software
+ * and Noncommercial Computer Software Documentation
+ * Clause 252.227-7014 (JUN 1995)
+ *
+ * Copyright 2013 The MITRE Corporation. All Rights Reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.solr.handler.tagger;
+
+import java.util.Arrays;
+
+import com.carrotsearch.hppc.IntArrayList;
+
+public abstract class OffsetCorrector {
+
+ //TODO support a streaming style of consuming input text so that we need not take a
+ // String. Trickier because we need to keep more information as we parse to know when tags
+ // are adjacent with/without whitespace
+
+ //Data structure requirements:
+ // Given a character offset:
+ // * determine what tagId is it's parent.
+  //  * determine what tagId is its parent.
+ // * determine if it is adjacent to the parent close tag, ignoring whitespace
+ // Given a tagId:
+  //  * What is its parent tagId
+ // * What's the char offset of the start and end of the open tag
+ // * What's the char offset of the start and end of the close tag
+
+ /** Document text. */
+ protected final String docText;
+
+ /** Array of tag info comprised of 5 int fields:
+ * [int parentTag, int openStartOff, int openEndOff, int closeStartOff, int closeEndOff].
+ * Its size indicates how many tags there are. Tags are ID'ed sequentially from 0. */
+ protected final IntArrayList tagInfo;
+
+ /** offsets of parent tag id change (ascending order) */
+ protected final IntArrayList parentChangeOffsets;
+ /** tag id; parallel array to parentChangeOffsets */
+ protected final IntArrayList parentChangeIds;
+
+ protected final int[] offsetPair = new int[] { -1, -1};//non-thread-safe state
+
+ /** Disjoint start and end span offsets (inclusive) of non-taggable sections. Null if none. */
+ protected final IntArrayList nonTaggableOffsets;
+
+ /**
+ * Initialize based on the document text.
+ * @param docText non-null structured content.
+ * @param hasNonTaggable if there may be "non-taggable" tags to track
+ */
+ protected OffsetCorrector(String docText, boolean hasNonTaggable) {
+ this.docText = docText;
+ final int guessNumElements = Math.max(docText.length() / 20, 4);
+
+ tagInfo = new IntArrayList(guessNumElements * 5);
+ parentChangeOffsets = new IntArrayList(guessNumElements * 2);
+ parentChangeIds = new IntArrayList(guessNumElements * 2);
+ nonTaggableOffsets = hasNonTaggable ? new IntArrayList(guessNumElements / 5) : null;
+ }
+
+ /** Corrects the start and end offset pair. It will return null if it can't
+ * due to a failure to keep the offsets balance-able, or if it spans "non-taggable" tags.
+ * The start (left) offset is pulled left as needed over whitespace and opening tags. The end
+ * (right) offset is pulled right as needed over whitespace and closing tags. It's returned as
+ * a 2-element array.
+ * Note that the returned array is internally reused; just use it to examine the response.
+ */
+ public int[] correctPair(int leftOffset, int rightOffset) {
+ rightOffset = correctEndOffsetForCloseElement(rightOffset);
+ if (spansNonTaggable(leftOffset, rightOffset))
+ return null;
+
+ int startTag = lookupTag(leftOffset);
+ //offsetPair[0] = Math.max(offsetPair[0], getOpenStartOff(startTag));
+ int endTag = lookupTag(rightOffset-1);
+ //offsetPair[1] = Math.min(offsetPair[1], getCloseStartOff(endTag));
+
+ // Find the ancestor tag enclosing offsetPair. And bump out left offset along the way.
+ int iTag = startTag;
+ for (; !tagEnclosesOffset(iTag, rightOffset); iTag = getParentTag(iTag)) {
+ //Ensure there is nothing except whitespace thru OpenEndOff
+ int tagOpenEndOff = getOpenEndOff(iTag);
+ if (hasNonWhitespace(tagOpenEndOff, leftOffset))
+ return null;
+ leftOffset = getOpenStartOff(iTag);
+ }
+ final int ancestorTag = iTag;
+ // Bump out rightOffset until we get to ancestorTag.
+ for (iTag = endTag; iTag != ancestorTag; iTag = getParentTag(iTag)) {
+ //Ensure there is nothing except whitespace thru CloseStartOff
+ int tagCloseStartOff = getCloseStartOff(iTag);
+ if (hasNonWhitespace(rightOffset, tagCloseStartOff))
+ return null;
+ rightOffset = getCloseEndOff(iTag);
+ }
+
+ offsetPair[0] = leftOffset;
+ offsetPair[1] = rightOffset;
+ return offsetPair;
+ }
+
+ /** Correct endOffset for adjacent element at the right side. E.g. offsetPair might point to:
+ *
+ * foo</tag>
+ *
+ * and this method pulls the end offset left to the '<'. This is necessary for use with
+ * {@link org.apache.lucene.analysis.charfilter.HTMLStripCharFilter}.
+ *
+ * See https://issues.apache.org/jira/browse/LUCENE-5734 */
+ protected int correctEndOffsetForCloseElement(int endOffset) {
+ if (docText.charAt(endOffset-1) == '>') {
+ final int newEndOffset = docText.lastIndexOf('<', endOffset - 2);
+ if (newEndOffset > offsetPair[0])//just to be sure
+ return newEndOffset;
+ }
+ return endOffset;
+ }
+
+ protected boolean hasNonWhitespace(int start, int end) {
+ for (int i = start; i < end; i++) {
+ if (!Character.isWhitespace(docText.charAt(i)))
+ return true;
+ }
+ return false;
+ }
+
+ protected boolean tagEnclosesOffset(int tag, int off) {
+ return off >= getOpenStartOff(tag) && off < getCloseEndOff(tag);
+ }
+
+ protected int getParentTag(int tag) { return tagInfo.get(tag * 5 + 0); }
+ protected int getOpenStartOff(int tag) { return tagInfo.get(tag * 5 + 1); }
+ protected int getOpenEndOff(int tag) { return tagInfo.get(tag * 5 + 2); }
+ protected int getCloseStartOff(int tag) { return tagInfo.get(tag * 5 + 3); }
+ protected int getCloseEndOff(int tag) { return tagInfo.get(tag * 5 + 4); }
+
+ protected int lookupTag(int off) {
+ int idx = Arrays.binarySearch(parentChangeOffsets.buffer, 0, parentChangeOffsets.size(), off);
+ if (idx < 0)
+ idx = (-idx - 1) - 1;//round down
+ return parentChangeIds.get(idx);
+ }
+
+ protected boolean spansNonTaggable(int startOff, int endOff) {
+ if (nonTaggableOffsets == null)
+ return false;
+ int idx = Arrays.binarySearch(nonTaggableOffsets.buffer, 0, nonTaggableOffsets.size(), startOff);
+ //if tag start coincides with first or last char of non-taggable span then result is true.
+ // (probably never happens since those characters are actual element markup)
+ if (idx >= 0)
+ return true;
+ idx = -idx - 1;//modify for where we would insert
+ //if idx is odd then our span intersects a non-taggable span; return true
+ if ((idx & 1) == 1)
+ return true;
+ //it's non-taggable if the next non-taggable start span is before our endOff
+ if (idx == nonTaggableOffsets.size())
+ return false;
+ return nonTaggableOffsets.get(idx) < endOff;
+ }
+}
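
As a reading aid for the class above, a minimal hypothetical use of correctPair(); "corrector" stands for any concrete OffsetCorrector built over docText, and the inserted markup is only an example:

    int[] pair = corrector.correctPair(tagStart, tagEnd);
    if (pair != null) {   // null: offsets could not be balanced, or the span crossed a non-taggable section
      String marked = docText.substring(0, pair[0])
          + "<em>" + docText.substring(pair[0], pair[1]) + "</em>"
          + docText.substring(pair[1]);
      // 'marked' stays well-formed because both offsets were pulled out to element boundaries
    }
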
diff --git a/solr/core/src/java/org/apache/solr/handler/tagger/TagClusterReducer.java b/solr/core/src/java/org/apache/solr/handler/tagger/TagClusterReducer.java
new file mode 100644
index 00000000000..9310a0429e1
--- /dev/null
+++ b/solr/core/src/java/org/apache/solr/handler/tagger/TagClusterReducer.java
@@ -0,0 +1,103 @@
+/*
+ * This software was produced for the U. S. Government
+ * under Contract No. W15P7T-11-C-F600, and is
+ * subject to the Rights in Noncommercial Computer Software
+ * and Noncommercial Computer Software Documentation
+ * Clause 252.227-7014 (JUN 1995)
+ *
+ * Copyright 2013 The MITRE Corporation. All Rights Reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.solr.handler.tagger;
+
+public interface TagClusterReducer {
+ /**
+ * Reduces the linked-list to only those tags that should be emitted
+ * @param head not null; 1-element array to head which isn't null either
+ */
+ void reduce(TagLL[] head);
+
+ static final TagClusterReducer ALL = new TagClusterReducer() {
+ @Override
+ public void reduce(TagLL[] head) {
+ }
+ };
+
+ static final TagClusterReducer NO_SUB = new TagClusterReducer() {
+ @Override
+ public void reduce(TagLL[] head) {
+ //loop forward over all tags
+ for (TagLL tag = head[0].nextTag; tag != null; tag = tag.nextTag) {
+ //loop backwards over prev tags from this tag
+ for (TagLL tPrev = tag.prevTag; tPrev != null; tPrev = tPrev.prevTag) {
+ assert tPrev.startOffset <= tag.startOffset;
+ //if a previous tag's endOffset is >= this one's, this tag can be removed
+ if (tPrev.endOffset >= tag.endOffset) {
+ tag.removeLL();
+ break;
+ } else if (tPrev.startOffset == tag.startOffset) {
+ tPrev.removeLL();
+ //continue; 'tag' is still valid
+ }
+ }
+ }
+ }
+ };
+
+ static final TagClusterReducer LONGEST_DOMINANT_RIGHT = new TagClusterReducer() {
+ @Override
+ public void reduce(TagLL[] head) {
+
+ //--Optimize for common single-tag case
+ if (head[0].nextTag == null)
+ return;
+
+ while (true) {
+ //--Find longest not already marked
+ TagLL longest = null;
+ for (TagLL t = head[0]; t != null; t = t.nextTag) {
+ if (!t.mark && (longest == null || t.charLen() >= longest.charLen()))
+ longest = t;
+ }
+ if (longest == null)
+ break;
+ //--Mark longest (so we return it eventually)
+ longest.mark = true;
+ //--Remove tags overlapping this longest
+ for (TagLL t = head[0]; t != null; t = t.nextTag) {
+ if (t.mark)
+ continue;
+
+ if (t.overlaps(longest)) {
+ t.removeLL();
+ } else if (t.startOffset >= longest.endOffset) {
+ break;//no subsequent can possibly overlap
+ }
+ }
+ }//loop
+
+ //all-remaining should be marked
+// for (TagLL t = head; t != null; t = t.nextTag) {
+// assert t.mark;
+//// if (!t.mark) {
+//// t.removeLL();
+//// if (head == t)
+//// head = t.nextTag;
+//// }
+// }
+ assert head[0].mark;
+ }
+ };
+}
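
A small illustration of the NO_SUB reducer above; it assumes same-package access (TagLL's constructor is package-private) and made-up offsets and values:

    TagLL[] head = new TagLL[1];
    head[0] = new TagLL(head, null, 0, 8, "new york");   // covers [0,8)
    TagLL sub = new TagLL(head, null, 4, 8, "york");     // fully contained in the first tag
    head[0].addAfterLL(sub);
    TagClusterReducer.NO_SUB.reduce(head);               // removes "york"; only "new york" remains
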
diff --git a/solr/core/src/java/org/apache/solr/handler/tagger/TagLL.java b/solr/core/src/java/org/apache/solr/handler/tagger/TagLL.java
new file mode 100644
index 00000000000..e8bb0a3bc9b
--- /dev/null
+++ b/solr/core/src/java/org/apache/solr/handler/tagger/TagLL.java
@@ -0,0 +1,176 @@
+/*
+ * This software was produced for the U. S. Government
+ * under Contract No. W15P7T-11-C-F600, and is
+ * subject to the Rights in Noncommercial Computer Software
+ * and Noncommercial Computer Software Documentation
+ * Clause 252.227-7014 (JUN 1995)
+ *
+ * Copyright 2013 The MITRE Corporation. All Rights Reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.solr.handler.tagger;
+
+import java.io.IOException;
+
+import org.apache.lucene.util.BytesRef;
+
+/**
+ * This is a Tag -- a startOffset, endOffset and value.
+ *
+ * A Tag starts without a value in an
+ * "advancing" state. {@link #advance(org.apache.lucene.util.BytesRef, int)}
+ * is called with subsequent words and then eventually it won't advance any
+ * more, and value is set (could be null).
+ *
+ * A Tag is also a doubly-linked-list (hence the LL in the name). All tags share
+ * a reference to the head via a 1-element array, which is potentially modified
+ * if any of the linked-list methods are called. Tags in the list should have
+ * equal or increasing start offsets.
+ */
+public class TagLL{
+
+ private final TagLL[] head;//a shared pointer to the head; 1 element
+ TagLL prevTag, nextTag; // linked list
+
+ private TermPrefixCursor cursor;
+
+ final int startOffset;//inclusive
+ int endOffset;//exclusive
+ Object value;//null means unset
+
+ /** optional boolean used by some TagClusterReducer's */
+ boolean mark = false;
+
+ TagLL(TagLL[] head, TermPrefixCursor cursor, int startOffset, int endOffset, Object value) {
+ this.head = head;
+ this.cursor = cursor;
+ this.startOffset = startOffset;
+ this.endOffset = endOffset;
+ this.value = value;
+ }
+
+ /**
+ * Advances this tag with "word" at offset "offset". If this tag is not in
+ * an advancing state then it does nothing. If it is advancing and prior to
+ * advancing further it sees a value, then a non-advancing tag may be inserted
+ * into the LL as side-effect. If this returns false (it didn't advance) and
+ * if there is no value, then it will also be removed.
+ *
+ *
+ * @param word The next word or null if at an end
+ * @param offset The offset of the last character of word in the underlying
+ * stream. If word is null then it's meaningless.
+ *
+ * @return Whether it advanced or not.
+ */
+ boolean advance(BytesRef word, int offset) throws IOException {
+ if (!isAdvancing())
+ return false;
+
+ Object iVal = cursor.getDocIds();
+
+ if (word != null && cursor.advance(word)) {
+
+ if (iVal != null) {
+ addBeforeLL(new TagLL(head, null, startOffset, endOffset, iVal));
+ }
+
+ assert offset >= endOffset;
+ endOffset = offset;
+ return true;
+ } else {
+ this.value = iVal;
+ this.cursor = null;
+ if (iVal == null)
+ removeLL();
+ return false;
+ }
+ }
+
+ /** Removes this tag from the chain, connecting prevTag and nextTag. Does not
+ * modify "this" object's pointers, so the caller can refer to nextTag after
+ * removing it. */
+ public void removeLL() {
+ if (head[0] == this)
+ head[0] = nextTag;
+ if (prevTag != null) {
+ prevTag.nextTag = nextTag;
+ }
+ if (nextTag != null) {
+ nextTag.prevTag = prevTag;
+ }
+ }
+
+ void addBeforeLL(TagLL tag) {
+ assert tag.startOffset <= startOffset;
+ if (prevTag != null) {
+ assert prevTag.startOffset <= tag.startOffset;
+ prevTag.nextTag = tag;
+ tag.prevTag = prevTag;
+ } else {
+ assert head[0] == this;
+ head[0] = tag;
+ }
+ prevTag = tag;
+ tag.nextTag = this;
+ }
+
+ void addAfterLL(TagLL tag) {
+ assert tag.startOffset >= startOffset;
+ if (nextTag != null) {
+ assert nextTag.startOffset >= tag.startOffset;
+ nextTag.prevTag = tag;
+ tag.nextTag = nextTag;
+ }
+ nextTag = tag;
+ tag.prevTag = this;
+ }
+
+ public int charLen() {
+ return endOffset - startOffset;
+ }
+
+ public TagLL getNextTag() {
+ return nextTag;
+ }
+
+ public TagLL getPrevTag() {
+ return prevTag;
+ }
+
+ public int getStartOffset() {
+ return startOffset;
+ }
+ public int getEndOffset() {
+ return endOffset;
+ }
+ public boolean overlaps(TagLL other) {
+ //don't use >= or <= because startOffset is inclusive while endOffset is exclusive
+ if (startOffset < other.startOffset)
+ return endOffset > other.startOffset;
+ else
+ return startOffset < other.endOffset;
+ }
+
+ boolean isAdvancing() {
+ return cursor != null;
+ }
+
+ @Override
+ public String toString() {
+ return (prevTag != null ? '*' : '-') + "|" + (nextTag != null ? '*' : '-') +
+ " " + startOffset + " to " + endOffset + (isAdvancing() ? '+' : " #" + value);
+ }
+}
diff --git a/solr/core/src/java/org/apache/solr/handler/tagger/Tagger.java b/solr/core/src/java/org/apache/solr/handler/tagger/Tagger.java
new file mode 100644
index 00000000000..12a4cf0a035
--- /dev/null
+++ b/solr/core/src/java/org/apache/solr/handler/tagger/Tagger.java
@@ -0,0 +1,230 @@
+/*
+ * This software was produced for the U. S. Government
+ * under Contract No. W15P7T-11-C-F600, and is
+ * subject to the Rights in Noncommercial Computer Software
+ * and Noncommercial Computer Software Documentation
+ * Clause 252.227-7014 (JUN 1995)
+ *
+ * Copyright 2013 The MITRE Corporation. All Rights Reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.solr.handler.tagger;
+
+import java.io.IOException;
+import java.lang.invoke.MethodHandles;
+import java.util.HashMap;
+import java.util.Map;
+
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
+import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
+import org.apache.lucene.analysis.tokenattributes.TermToBytesRefAttribute;
+import org.apache.lucene.index.Terms;
+import org.apache.lucene.util.Bits;
+import org.apache.lucene.util.BytesRef;
+import org.apache.lucene.util.IntsRef;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+/**
+ * Tags the maximum string of words in a corpus. This is a callback-style API
+ * in which you implement {@link #tagCallback(int, int, Object)}.
+ *
+ * This class should be independently usable outside Solr.
+ */
+public abstract class Tagger {
+ private static final Logger log = LoggerFactory.getLogger(MethodHandles.lookup().lookupClass());
+
+ private final TokenStream tokenStream;
+ private final TermToBytesRefAttribute byteRefAtt;
+ private final PositionIncrementAttribute posIncAtt;
+ private final OffsetAttribute offsetAtt;
+ private final TaggingAttribute taggingAtt;
+
+ private final TagClusterReducer tagClusterReducer;
+ private final Terms terms;
+ private final Bits liveDocs;
+ private final boolean skipAltTokens;
+ private final boolean ignoreStopWords;
+
+ private Map<BytesRef, IntsRef> docIdsCache;
+
+ /** Whether the WARNING about skipped tokens was already logged. */
+ private boolean loggedSkippedAltTokenWarning = false;
+
+ public Tagger(Terms terms, Bits liveDocs, TokenStream tokenStream,
+ TagClusterReducer tagClusterReducer, boolean skipAltTokens,
+ boolean ignoreStopWords) throws IOException {
+ this.terms = terms;
+ this.liveDocs = liveDocs;
+ this.tokenStream = tokenStream;
+ this.skipAltTokens = skipAltTokens;
+ this.ignoreStopWords = ignoreStopWords;
+ byteRefAtt = tokenStream.addAttribute(TermToBytesRefAttribute.class);
+ posIncAtt = tokenStream.addAttribute(PositionIncrementAttribute.class);
+ offsetAtt = tokenStream.addAttribute(OffsetAttribute.class);
+ taggingAtt = tokenStream.addAttribute(TaggingAttribute.class);
+ tokenStream.reset();
+
+ this.tagClusterReducer = tagClusterReducer;
+ }
+
+ public void enableDocIdsCache(int initSize) {
+ if (initSize > 0)
+ docIdsCache = new HashMap<>(initSize);
+ }
+
+ public void process() throws IOException {
+ if (terms == null)
+ return;
+
+ //a shared pointer to the head used by this method and each Tag instance.
+ final TagLL[] head = new TagLL[1];
+
+ TermPrefixCursor cursor = null;//re-used
+
+ //boolean switch used to log warnings in case tokens were skipped during tagging.
+ boolean skippedTokens = false;
+
+ while (tokenStream.incrementToken()) {
+ if (log.isTraceEnabled()) {
+ log.trace("Token: {}, posInc: {}, offset: [{},{}]",
+ byteRefAtt, posIncAtt.getPositionIncrement(),
+ offsetAtt.startOffset(), offsetAtt.endOffset());
+ }
+ //check for posInc < 1 (alternate Tokens, such as expanded Synonyms)
+ if (posIncAtt.getPositionIncrement() < 1) {
+ //(a) Deal with this as a configuration issue and throw an exception
+ if (!skipAltTokens) {
+ //TODO throw UnsupportedTokenException when PhraseBuilder is ported
+ throw new IllegalStateException("Query Analyzer generates alternate "
+ + "Tokens (posInc == 0). Please adapt your Analyzer configuration or "
+ + "enable '" + TaggerRequestHandler.SKIP_ALT_TOKENS + "' to skip such "
+ + "tokens. NOTE: enabling '" + TaggerRequestHandler.SKIP_ALT_TOKENS
+ + "' might result in wrong tagging results if the index time analyzer "
+ + "is not configured accordingly. For detailed information see "
+ + "https://github.com/OpenSextant/SolrTextTagger/pull/11#issuecomment-24936225");
+ } else {
+ //(b) In case the index time analyser had indexed all variants (users
+ // need to ensure that) processing of alternate tokens can be skipped
+ // as anyway all alternatives will be contained in the FST.
+ skippedTokens = true;
+ log.trace(" ... ignored token");
+ continue;
+ }
+ }
+ //-- If PositionIncrement > 1 (stopwords)
+ if (!ignoreStopWords && posIncAtt.getPositionIncrement() > 1) {
+ log.trace(" - posInc > 1 ... mark cluster as done");
+ advanceTagsAndProcessClusterIfDone(head, null);
+ }
+
+ final BytesRef term;
+ //NOTE: we need to lookup tokens if
+ // * the LookupAtt is true OR
+ // * there are still advancing tags (to find the longest possible match)
+ if(taggingAtt.isTaggable() || head[0] != null){
+ //-- Lookup the term id from the next token
+ term = byteRefAtt.getBytesRef();
+ if (term.length == 0) {
+ throw new IllegalArgumentException("term: " + term.utf8ToString() + " analyzed to a zero-length token");
+ }
+ } else { //no current cluster AND lookup == false ...
+ term = null; //skip this token
+ }
+
+ //-- Process tag
+ advanceTagsAndProcessClusterIfDone(head, term);
+
+ //-- only create new Tags for Tokens we need to lookup
+ if (taggingAtt.isTaggable() && term != null) {
+
+ //determine if the terms index has a term starting with the provided term
+ // TODO create a pool of these cursors to reuse them more? could be trivial impl
+ if (cursor == null)// (else the existing cursor will be re-used)
+ cursor = new TermPrefixCursor(terms.iterator(), liveDocs, docIdsCache);
+ if (cursor.advance(term)) {
+ TagLL newTail = new TagLL(head, cursor, offsetAtt.startOffset(), offsetAtt.endOffset(), null);
+ cursor = null;//because the new tag now "owns" this instance
+ //and add it to the end
+ if (head[0] == null) {
+ head[0] = newTail;
+ } else {
+ for (TagLL t = head[0]; true; t = t.nextTag) {
+ if (t.nextTag == null) {
+ t.addAfterLL(newTail);
+ break;
+ }
+ }
+ }
+ }
+ }//if termId >= 0
+ }//end while(incrementToken())
+
+ //-- Finish all tags
+ advanceTagsAndProcessClusterIfDone(head, null);
+ assert head[0] == null;
+
+ if(!loggedSkippedAltTokenWarning && skippedTokens){
+ loggedSkippedAltTokenWarning = true; //only log once
+ log.warn("The Tagger skipped some alternate tokens (tokens with posInc == 0) "
+ + "while processing text. This may cause problems with some Analyzer "
+ + "configurations (e.g. query time synonym expansion). For details see "
+ + "https://github.com/OpenSextant/SolrTextTagger/pull/11#issuecomment-24936225");
+ }
+
+ tokenStream.end();
+ //tokenStream.close(); caller closes because caller acquired it
+ }
+
+ private void advanceTagsAndProcessClusterIfDone(TagLL[] head, BytesRef term) throws IOException {
+ //-- Advance tags
+ final int endOffset = term != null ? offsetAtt.endOffset() : -1;
+ boolean anyAdvance = false;
+ for (TagLL t = head[0]; t != null; t = t.nextTag) {
+ anyAdvance |= t.advance(term, endOffset);
+ }
+
+ //-- Process cluster if done
+ if (!anyAdvance && head[0] != null) {
+ tagClusterReducer.reduce(head);
+ for (TagLL t = head[0]; t != null; t = t.nextTag) {
+ assert t.value != null;
+ tagCallback(t.startOffset, t.endOffset, t.value);
+ }
+ head[0] = null;
+ }
+ }
+
+ /**
+ * Invoked by {@link #process()} for each tag found. endOffset is always >= the endOffset
+ * given in the previous call.
+ *
+ * @param startOffset The character offset of the original stream where the tag starts.
+ * @param endOffset One more than the character offset of the original stream where the tag ends.
+ * @param docIdsKey A reference to the matching docIds that can be resolved via {@link #lookupDocIds(Object)}.
+ */
+ protected abstract void tagCallback(int startOffset, int endOffset, Object docIdsKey);
+
+ /**
+ * Returns a sorted array of integer docIds given the corresponding key.
+ * @param docIdsKey The lookup key.
+ * @return Not null
+ */
+ protected IntsRef lookupDocIds(Object docIdsKey) {
+ return (IntsRef) docIdsKey;
+ }
+}
+
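
To make the callback contract above concrete, a sketch of a hypothetical anonymous subclass; terms, liveDocs, analyzer and text are assumed to come from the caller, the field name is arbitrary, and the enclosing method is assumed to declare throws IOException:

    List<int[]> found = new ArrayList<>();
    try (TokenStream ts = analyzer.tokenStream("name_tag", new StringReader(text))) {
      Tagger tagger = new Tagger(terms, liveDocs, ts, TagClusterReducer.NO_SUB,
          false /*skipAltTokens*/, false /*ignoreStopWords*/) {
        @Override
        protected void tagCallback(int startOffset, int endOffset, Object docIdsKey) {
          found.add(new int[]{startOffset, endOffset});  // docIdsKey can be resolved via lookupDocIds(docIdsKey)
        }
      };
      tagger.process();
    }
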
diff --git a/solr/core/src/java/org/apache/solr/handler/tagger/TaggerRequestHandler.java b/solr/core/src/java/org/apache/solr/handler/tagger/TaggerRequestHandler.java
new file mode 100644
index 00000000000..a972e47165a
--- /dev/null
+++ b/solr/core/src/java/org/apache/solr/handler/tagger/TaggerRequestHandler.java
@@ -0,0 +1,397 @@
+/*
+ * This software was produced for the U. S. Government
+ * under Contract No. W15P7T-11-C-F600, and is
+ * subject to the Rights in Noncommercial Computer Software
+ * and Noncommercial Computer Software Documentation
+ * Clause 252.227-7014 (JUN 1995)
+ *
+ * Copyright 2013 The MITRE Corporation. All Rights Reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.solr.handler.tagger;
+
+import javax.xml.stream.XMLStreamException;
+import java.io.IOException;
+import java.io.Reader;
+import java.io.StringReader;
+import java.lang.invoke.MethodHandles;
+import java.util.ArrayList;
+import java.util.HashMap;
+import java.util.Iterator;
+import java.util.List;
+import java.util.Map;
+import java.util.concurrent.Callable;
+
+import com.google.common.io.CharStreams;
+import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.core.StopFilterFactory;
+import org.apache.lucene.analysis.util.TokenFilterFactory;
+import org.apache.lucene.index.LeafReaderContext;
+import org.apache.lucene.index.ReaderUtil;
+import org.apache.lucene.index.Terms;
+import org.apache.lucene.queries.function.FunctionValues;
+import org.apache.lucene.queries.function.ValueSource;
+import org.apache.lucene.search.DocIdSetIterator;
+import org.apache.lucene.search.IndexSearcher;
+import org.apache.lucene.search.Query;
+import org.apache.lucene.util.BitSetIterator;
+import org.apache.lucene.util.Bits;
+import org.apache.lucene.util.FixedBitSet;
+import org.apache.lucene.util.IntsRef;
+import org.apache.solr.analysis.TokenizerChain;
+import org.apache.solr.common.SolrException;
+import org.apache.solr.common.params.CommonParams;
+import org.apache.solr.common.params.SolrParams;
+import org.apache.solr.common.util.ContentStream;
+import org.apache.solr.common.util.NamedList;
+import org.apache.solr.handler.RequestHandlerBase;
+import org.apache.solr.request.SolrQueryRequest;
+import org.apache.solr.response.SolrQueryResponse;
+import org.apache.solr.schema.FieldType;
+import org.apache.solr.schema.SchemaField;
+import org.apache.solr.search.BitDocSet;
+import org.apache.solr.search.DocList;
+import org.apache.solr.search.DocSet;
+import org.apache.solr.search.DocSlice;
+import org.apache.solr.search.QParser;
+import org.apache.solr.search.SolrIndexSearcher;
+import org.apache.solr.search.SolrReturnFields;
+import org.apache.solr.search.SyntaxError;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+/**
+ * Scans posted text, looking for matching strings in the Solr index.
+ * The public static final String members are request parameters.
+ * This handler is also called the "SolrTextTagger".
+ *
+ * @since 7.4.0
+ */
+public class TaggerRequestHandler extends RequestHandlerBase {
+
+ /** Request parameter. */
+ public static final String OVERLAPS = "overlaps";
+ /** Request parameter. */
+ public static final String TAGS_LIMIT = "tagsLimit";
+ /** Request parameter. */
+ public static final String MATCH_TEXT = "matchText";
+ /** Request parameter. */
+ public static final String SKIP_ALT_TOKENS = "skipAltTokens";
+ /** Request parameter. */
+ public static final String IGNORE_STOPWORDS = "ignoreStopwords";
+ /** Request parameter. */
+ public static final String XML_OFFSET_ADJUST = "xmlOffsetAdjust";
+
+ private static final Logger log = LoggerFactory.getLogger(MethodHandles.lookup().lookupClass());
+
+ @Override
+ public String getDescription() {
+ return "Processes input text to find matching tokens stored in the index.";
+ }
+
+ @Override
+ public void handleRequestBody(SolrQueryRequest req, SolrQueryResponse rsp) throws Exception {
+
+ //--Read params
+ final String indexedField = req.getParams().get("field");
+ if (indexedField == null)
+ throw new RuntimeException("required param 'field'");
+
+ final TagClusterReducer tagClusterReducer =
+ chooseTagClusterReducer(req.getParams().get(OVERLAPS));
+ final int rows = req.getParams().getInt(CommonParams.ROWS, 10000);
+ final int tagsLimit = req.getParams().getInt(TAGS_LIMIT, 1000);
+ final boolean addMatchText = req.getParams().getBool(MATCH_TEXT, false);
+ final SchemaField idSchemaField = req.getSchema().getUniqueKeyField();
+ if (idSchemaField == null) {
+ throw new SolrException(SolrException.ErrorCode.SERVER_ERROR, "The tagger requires a " +
+ "uniqueKey in the schema.");//TODO this could be relaxed
+ }
+ final boolean skipAltTokens = req.getParams().getBool(SKIP_ALT_TOKENS, false);
+ final boolean ignoreStopWords = req.getParams().getBool(IGNORE_STOPWORDS,
+ fieldHasIndexedStopFilter(indexedField, req));
+
+ //--Get posted data
+ Reader inputReader = null;
+ Iterable<ContentStream> streams = req.getContentStreams();
+ if (streams != null) {
+ Iterator<ContentStream> iter = streams.iterator();
+ if (iter.hasNext()) {
+ inputReader = iter.next().getReader();
+ }
+ if (iter.hasNext()) {
+ throw new SolrException(SolrException.ErrorCode.BAD_REQUEST,
+ getClass().getSimpleName()+" does not support multiple ContentStreams"); //TODO support bulk tagging?
+ }
+ }
+ if (inputReader == null) {
+ throw new SolrException(SolrException.ErrorCode.BAD_REQUEST,
+ getClass().getSimpleName()+" requires text to be POSTed to it");
+ }
+
+ // We may or may not need to read the input into a string
+ final InputStringLazy inputStringFuture = new InputStringLazy(inputReader);
+
+ final OffsetCorrector offsetCorrector = getOffsetCorrector(req.getParams(), inputStringFuture);
+
+ final String inputString;//only populated if needed
+ if (addMatchText || inputStringFuture.inputString != null) {
+ //Read the input fully into a String buffer that we'll need later,
+ // then replace the input with a reader wrapping the buffer.
+ inputString = inputStringFuture.call();
+ inputReader.close();
+ inputReader = new StringReader(inputString);
+ } else {
+ inputString = null;//not used
+ }
+
+ final SolrIndexSearcher searcher = req.getSearcher();
+ final FixedBitSet matchDocIdsBS = new FixedBitSet(searcher.maxDoc());
+ final List tags = new ArrayList(2000);
+
+ try {
+ Analyzer analyzer = req.getSchema().getField(indexedField).getType().getQueryAnalyzer();
+ try (TokenStream tokenStream = analyzer.tokenStream("", inputReader)) {
+ Terms terms = searcher.getSlowAtomicReader().terms(indexedField);
+ if (terms == null)
+ throw new SolrException(SolrException.ErrorCode.BAD_REQUEST,
+ "field " + indexedField + " has no indexed data");
+ Tagger tagger = new Tagger(terms, computeDocCorpus(req), tokenStream, tagClusterReducer,
+ skipAltTokens, ignoreStopWords) {
+ @SuppressWarnings("unchecked")
+ @Override
+ protected void tagCallback(int startOffset, int endOffset, Object docIdsKey) {
+ if (tags.size() >= tagsLimit)
+ return;
+ if (offsetCorrector != null) {
+ int[] offsetPair = offsetCorrector.correctPair(startOffset, endOffset);
+ if (offsetPair == null) {
+ log.debug("Discarded offsets [{}, {}] because couldn't balance XML.",
+ startOffset, endOffset);
+ return;
+ }
+ startOffset = offsetPair[0];
+ endOffset = offsetPair[1];
+ }
+
+ NamedList tag = new NamedList();
+ tag.add("startOffset", startOffset);
+ tag.add("endOffset", endOffset);
+ if (addMatchText)
+ tag.add("matchText", inputString.substring(startOffset, endOffset));
+ //below caches, and also flags matchDocIdsBS
+ tag.add("ids", lookupSchemaDocIds(docIdsKey));
+ tags.add(tag);
+ }
+
+ Map<Object, List> docIdsListCache = new HashMap<>(2000);
+
+ ValueSourceAccessor uniqueKeyCache = new ValueSourceAccessor(searcher,
+ idSchemaField.getType().getValueSource(idSchemaField, null));
+
+ @SuppressWarnings("unchecked")
+ private List lookupSchemaDocIds(Object docIdsKey) {
+ List schemaDocIds = docIdsListCache.get(docIdsKey);
+ if (schemaDocIds != null)
+ return schemaDocIds;
+ IntsRef docIds = lookupDocIds(docIdsKey);
+ //translate lucene docIds to schema ids
+ schemaDocIds = new ArrayList(docIds.length);
+ for (int i = docIds.offset; i < docIds.offset + docIds.length; i++) {
+ int docId = docIds.ints[i];
+ assert i == docIds.offset || docIds.ints[i - 1] < docId : "not sorted?";
+ matchDocIdsBS.set(docId);//also, flip docid in bitset
+ try {
+ schemaDocIds.add(uniqueKeyCache.objectVal(docId));//translates here
+ } catch (IOException e) {
+ throw new SolrException(SolrException.ErrorCode.SERVER_ERROR, e);
+ }
+ }
+ assert !schemaDocIds.isEmpty();
+
+ docIdsListCache.put(docIds, schemaDocIds);
+ return schemaDocIds;
+ }
+
+ };
+ tagger.enableDocIdsCache(2000);//TODO configurable
+ tagger.process();
+ }
+ } finally {
+ inputReader.close();
+ }
+ rsp.add("tagsCount",tags.size());
+ rsp.add("tags", tags);
+
+ rsp.setReturnFields(new SolrReturnFields( req ));
+
+ //Solr's standard name for matching docs in response
+ rsp.add("response", getDocList(rows, matchDocIdsBS));
+ }
+
+ private static class InputStringLazy implements Callable<String> {
+ final Reader inputReader;
+ String inputString;
+
+ InputStringLazy(Reader inputReader) {
+ this.inputReader = inputReader;
+ }
+
+ @Override
+ public String call() throws IOException {
+ if (inputString == null) {
+ inputString = CharStreams.toString(inputReader);
+ }
+ return inputString;
+ }
+ }
+
+ protected OffsetCorrector getOffsetCorrector(SolrParams params, Callable<String> inputStringProvider) throws Exception {
+ final boolean xmlOffsetAdjust = params.getBool(XML_OFFSET_ADJUST, false);
+ if (!xmlOffsetAdjust) {
+ return null;
+ }
+ try {
+ return new XmlOffsetCorrector(inputStringProvider.call());
+ } catch (XMLStreamException e) {
+ throw new SolrException(SolrException.ErrorCode.BAD_REQUEST,
+ "Expecting XML but wasn't: " + e, e);
+ }
+ }
+
+ private DocList getDocList(int rows, FixedBitSet matchDocIdsBS) throws IOException {
+ //Now we must supply a Solr DocList and add it to the response.
+ // Typically this is gotten via a SolrIndexSearcher.search(), but in this case we
+ // know exactly what documents to return, the order doesn't matter nor does
+ // scoring.
+ // Ideally an implementation of DocList could be directly implemented off
+ // of a BitSet, but there are way too many methods to implement for a minor
+ // payoff.
+ int matchDocs = matchDocIdsBS.cardinality();
+ int[] docIds = new int[ Math.min(rows, matchDocs) ];
+ DocIdSetIterator docIdIter = new BitSetIterator(matchDocIdsBS, 1);
+ for (int i = 0; i < docIds.length; i++) {
+ docIds[i] = docIdIter.nextDoc();
+ }
+ return new DocSlice(0, docIds.length, docIds, null, matchDocs, 1f);
+ }
+
+ private TagClusterReducer chooseTagClusterReducer(String overlaps) {
+ TagClusterReducer tagClusterReducer;
+ if (overlaps == null || overlaps.equals("NO_SUB")) {
+ tagClusterReducer = TagClusterReducer.NO_SUB;
+ } else if (overlaps.equals("ALL")) {
+ tagClusterReducer = TagClusterReducer.ALL;
+ } else if (overlaps.equals("LONGEST_DOMINANT_RIGHT")) {
+ tagClusterReducer = TagClusterReducer.LONGEST_DOMINANT_RIGHT;
+ } else {
+ throw new SolrException(SolrException.ErrorCode.BAD_REQUEST,
+ "unknown tag overlap mode: "+overlaps);
+ }
+ return tagClusterReducer;
+ }
+
+ /**
+ * The set of documents matching the provided 'fq' (filter query). Don't include deleted docs
+ * either. If null is returned, then all docs are available.
+ */
+ private Bits computeDocCorpus(SolrQueryRequest req) throws SyntaxError, IOException {
+ final String[] corpusFilterQueries = req.getParams().getParams("fq");
+ final SolrIndexSearcher searcher = req.getSearcher();
+ final Bits docBits;
+ if (corpusFilterQueries != null && corpusFilterQueries.length > 0) {
+ List<Query> filterQueries = new ArrayList<>(corpusFilterQueries.length);
+ for (String corpusFilterQuery : corpusFilterQueries) {
+ QParser qParser = QParser.getParser(corpusFilterQuery, null, req);
+ try {
+ filterQueries.add(qParser.parse());
+ } catch (SyntaxError e) {
+ throw new SolrException(SolrException.ErrorCode.BAD_REQUEST, e);
+ }
+ }
+
+ final DocSet docSet = searcher.getDocSet(filterQueries);//hopefully in the cache
+ //note: before Solr 4.7 we could call docSet.getBits() but no longer.
+ if (docSet instanceof BitDocSet) {
+ docBits = ((BitDocSet)docSet).getBits();
+ } else {
+ docBits = new Bits() {
+
+ @Override
+ public boolean get(int index) {
+ return docSet.exists(index);
+ }
+
+ @Override
+ public int length() {
+ return searcher.maxDoc();
+ }
+ };
+ }
+ } else {
+ docBits = searcher.getSlowAtomicReader().getLiveDocs();
+ }
+ return docBits;
+ }
+
+ private boolean fieldHasIndexedStopFilter(String field, SolrQueryRequest req) {
+ FieldType fieldType = req.getSchema().getFieldType(field);
+ Analyzer analyzer = fieldType.getIndexAnalyzer();//index analyzer
+ if (analyzer instanceof TokenizerChain) {
+ TokenizerChain tokenizerChain = (TokenizerChain) analyzer;
+ TokenFilterFactory[] tokenFilterFactories = tokenizerChain.getTokenFilterFactories();
+ for (TokenFilterFactory tokenFilterFactory : tokenFilterFactories) {
+ if (tokenFilterFactory instanceof StopFilterFactory)
+ return true;
+ }
+ }
+ return false;
+ }
+
+ /** See LUCENE-4541 or {@link org.apache.solr.response.transform.ValueSourceAugmenter}. */
+ static class ValueSourceAccessor {
+ private final List<LeafReaderContext> readerContexts;
+ private final ValueSource valueSource;
+ private final Map fContext;
+ private final FunctionValues[] functionValuesPerSeg;
+ private final int[] functionValuesDocIdPerSeg;
+
+ ValueSourceAccessor(IndexSearcher searcher, ValueSource valueSource) {
+ readerContexts = searcher.getIndexReader().leaves();
+ this.valueSource = valueSource;
+ fContext = ValueSource.newContext(searcher);
+ functionValuesPerSeg = new FunctionValues[readerContexts.size()];
+ functionValuesDocIdPerSeg = new int[readerContexts.size()];
+ }
+
+ Object objectVal(int topDocId) throws IOException {
+ // lookup segment level stuff:
+ int segIdx = ReaderUtil.subIndex(topDocId, readerContexts);
+ LeafReaderContext rcontext = readerContexts.get(segIdx);
+ int segDocId = topDocId - rcontext.docBase;
+ // unfortunately Lucene 7.0 requires forward only traversal (with no reset method).
+ // So we need to track our last docId (per segment) and re-fetch the FunctionValues. :-(
+ FunctionValues functionValues = functionValuesPerSeg[segIdx];
+ if (functionValues == null || segDocId < functionValuesDocIdPerSeg[segIdx]) {
+ functionValues = functionValuesPerSeg[segIdx] = valueSource.getValues(fContext, rcontext);
+ }
+ functionValuesDocIdPerSeg[segIdx] = segDocId;
+
+ // get value:
+ return functionValues.objectVal(segDocId);
+ }
+ }
+}
diff --git a/solr/core/src/java/org/apache/solr/handler/tagger/TaggingAttribute.java b/solr/core/src/java/org/apache/solr/handler/tagger/TaggingAttribute.java
new file mode 100644
index 00000000000..b7803e4f31a
--- /dev/null
+++ b/solr/core/src/java/org/apache/solr/handler/tagger/TaggingAttribute.java
@@ -0,0 +1,65 @@
+/*
+ * This software was produced for the U. S. Government
+ * under Contract No. W15P7T-11-C-F600, and is
+ * subject to the Rights in Noncommercial Computer Software
+ * and Noncommercial Computer Software Documentation
+ * Clause 252.227-7014 (JUN 1995)
+ *
+ * Copyright 2013 The MITRE Corporation. All Rights Reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.solr.handler.tagger;
+
+import org.apache.lucene.analysis.TokenFilter;
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.util.Attribute;
+
+/**
+ * Attribute used by the {@link Tagger} to decide if a token can start a
+ * new {@link TagLL tag}.
+ *
+ * By default this Attribute will return <code>true</code>, but it might be
+ * reset by some {@link TokenFilter} added to the {@link TokenStream} used
+ * to analyze the parsed text. Typically this will be done based on NLP
+ * processing results (e.g. to only lookup Named Entities).
+ *
+ * NOTE: all Tokens are used to advance existing {@link TagLL tags}.
+ */
+public interface TaggingAttribute extends Attribute {
+
+ /**
+ * By default this Attribute will be initialised with <code>true</code>.
+ * This ensures that all tokens are taggable by default (especially if
+ * the {@link TaggingAttribute} is not set by any component in the configured
+ * {@link TokenStream}).
+ */
+ public static final boolean DEFAULT_TAGGABLE = true;
+
+ /**
+ * Getter for the taggable state of the current Token
+ *
+ * @return the state
+ */
+ public boolean isTaggable();
+
+ /**
+ * Setter for the taggable state. Typically called by code within
+ * {@link TokenFilter#incrementToken()}.
+ *
+ * @param lookup the state
+ */
+ public void setTaggable(boolean lookup);
+
+}
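
As the interface notes, the attribute is normally driven by a custom TokenFilter. A hypothetical filter (not part of this patch) that only lets capitalized tokens start new tags could look like this:

    import java.io.IOException;
    import org.apache.lucene.analysis.TokenFilter;
    import org.apache.lucene.analysis.TokenStream;
    import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
    import org.apache.solr.handler.tagger.TaggingAttribute;

    public final class CapitalizedTaggableFilter extends TokenFilter {
      private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
      private final TaggingAttribute taggingAtt = addAttribute(TaggingAttribute.class);

      public CapitalizedTaggableFilter(TokenStream input) { super(input); }

      @Override
      public boolean incrementToken() throws IOException {
        if (!input.incrementToken()) return false;
        // Only capitalized tokens may start a tag; every token still advances existing tags.
        taggingAtt.setTaggable(termAtt.length() > 0 && Character.isUpperCase(termAtt.charAt(0)));
        return true;
      }
    }
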
diff --git a/solr/core/src/java/org/apache/solr/handler/tagger/TaggingAttributeImpl.java b/solr/core/src/java/org/apache/solr/handler/tagger/TaggingAttributeImpl.java
new file mode 100644
index 00000000000..55ecfbc6ef2
--- /dev/null
+++ b/solr/core/src/java/org/apache/solr/handler/tagger/TaggingAttributeImpl.java
@@ -0,0 +1,79 @@
+/*
+ * This software was produced for the U. S. Government
+ * under Contract No. W15P7T-11-C-F600, and is
+ * subject to the Rights in Noncommercial Computer Software
+ * and Noncommercial Computer Software Documentation
+ * Clause 252.227-7014 (JUN 1995)
+ *
+ * Copyright 2013 The MITRE Corporation. All Rights Reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.solr.handler.tagger;
+
+import org.apache.lucene.util.AttributeImpl;
+import org.apache.lucene.util.AttributeReflector;
+
+/**
+ * Implementation of the {@link TaggingAttribute}
+ */
+public class TaggingAttributeImpl extends AttributeImpl implements TaggingAttribute {
+
+ /**
+ * the private field initialised with {@link TaggingAttribute#DEFAULT_TAGGABLE}
+ */
+ private boolean taggable = TaggingAttribute.DEFAULT_TAGGABLE;
+
+ /*
+ * (non-Javadoc)
+ * @see org.opensextant.solrtexttagger.LookupAttribute#isLookup()
+ */
+ @Override
+ public boolean isTaggable() {
+ return taggable;
+ }
+
+ /*
+ * (non-Javadoc)
+ * @see org.opensextant.solrtexttagger.LookupAttribute#setLookup(boolean)
+ */
+ @Override
+ public void setTaggable(boolean lookup) {
+ this.taggable = lookup;
+ }
+
+ /*
+ * (non-Javadoc)
+ * @see org.apache.lucene.util.AttributeImpl#clear()
+ */
+ @Override
+ public void clear() {
+ taggable = DEFAULT_TAGGABLE;
+ }
+
+ /*
+ * (non-Javadoc)
+ * @see org.apache.lucene.util.AttributeImpl#copyTo(org.apache.lucene.util.AttributeImpl)
+ */
+ @Override
+ public void copyTo(AttributeImpl target) {
+ ((TaggingAttribute) target).setTaggable(taggable);
+ }
+
+ @Override
+ public void reflectWith(AttributeReflector reflector) {
+ reflector.reflect(TaggingAttribute.class, "taggable", isTaggable());
+ }
+
+}
diff --git a/solr/core/src/java/org/apache/solr/handler/tagger/TermPrefixCursor.java b/solr/core/src/java/org/apache/solr/handler/tagger/TermPrefixCursor.java
new file mode 100644
index 00000000000..1e82dbe4b5b
--- /dev/null
+++ b/solr/core/src/java/org/apache/solr/handler/tagger/TermPrefixCursor.java
@@ -0,0 +1,189 @@
+/*
+ * This software was produced for the U. S. Government
+ * under Contract No. W15P7T-11-C-F600, and is
+ * subject to the Rights in Noncommercial Computer Software
+ * and Noncommercial Computer Software Documentation
+ * Clause 252.227-7014 (JUN 1995)
+ *
+ * Copyright 2013 The MITRE Corporation. All Rights Reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.solr.handler.tagger;
+
+import java.io.IOException;
+import java.util.Map;
+
+import org.apache.lucene.analysis.miscellaneous.ConcatenateGraphFilter;
+import org.apache.lucene.index.PostingsEnum;
+import org.apache.lucene.index.TermsEnum;
+import org.apache.lucene.util.Bits;
+import org.apache.lucene.util.BytesRef;
+import org.apache.lucene.util.BytesRefBuilder;
+import org.apache.lucene.util.IntsRef;
+
+/**
+ * Cursor into the terms that advances by prefix.
+ */
+class TermPrefixCursor {
+
+ //Note: this could be a lot more efficient if MemoryPostingsFormat supported ordinal lookup.
+ // Maybe that could be added to Lucene.
+
+ // TODO add bloom filter of hashcode of first ~ 6 bytes to avoid lookup into terms dict?
+
+ private static final byte SEPARATOR_CHAR = ConcatenateGraphFilter.SEP_LABEL; // used to be ' '; TODO configurable?
+ private static final IntsRef EMPTY_INTSREF = new IntsRef();
+
+ private final TermsEnum termsEnum;
+ private final Bits liveDocs;
+ private final Map<BytesRef, IntsRef> docIdsCache;
+
+ private BytesRef prefixBuf;//we append to this
+ private BytesRefBuilder prefixBufBuilder = new BytesRefBuilder();
+ private boolean prefixBufOnLoan;//if true, PB is loaned; needs to be copied
+ private PostingsEnum postingsEnum;
+ private IntsRef docIds;
+
+ TermPrefixCursor(TermsEnum termsEnum, Bits liveDocs, Map<BytesRef, IntsRef> docIdsCache) {
+ this.termsEnum = termsEnum;
+ this.liveDocs = liveDocs;
+ this.docIdsCache = docIdsCache;
+ }
+
+ /** Appends the separator char (if not the first) plus the given word to the prefix buffer,
+ * then seeks to it. If the seek fails, false is returned and this cursor
+ * can be re-used as if in a new state. The {@code word} BytesRef is considered temporary,
+ * and is not saved within this class. */
+ boolean advance(BytesRef word) throws IOException {
+ if (prefixBuf == null) { // first advance
+ //set prefixBuf to word temporary. When advance() completes, we either null out or copy.
+ prefixBuf = word;
+ prefixBufOnLoan = true;
+ if (seekPrefix()) {//... and we have to
+ ensureBufIsACopy();
+ return true;
+ } else {
+ prefixBuf = null;//just to be darned sure 'word' isn't referenced here
+ return false;
+ }
+
+ } else { // subsequent advance
+ //append to existing
+ assert !prefixBufOnLoan;
+
+ prefixBufBuilder.append(SEPARATOR_CHAR);
+ prefixBufBuilder.append(word);
+ prefixBuf = prefixBufBuilder.get();
+ if (seekPrefix()) {
+ return true;
+ } else {
+ prefixBuf = null;
+ return false;
+ }
+ }
+ }
+
+ private void ensureBufIsACopy() {
+ if (!prefixBufOnLoan)
+ return;
+
+ prefixBufBuilder.clear();
+ prefixBufBuilder.copyBytes(prefixBuf);
+ prefixBuf = prefixBufBuilder.get();
+ prefixBufOnLoan = false;
+ }
+
+ /** Seeks to prefixBuf or the next term that is prefixed by prefixBuf plus the separator char.
+ * Sets docIds. **/
+ private boolean seekPrefix() throws IOException {
+ TermsEnum.SeekStatus seekStatus = termsEnum.seekCeil(prefixBuf);
+
+ docIds = null;//invalidate
+ switch (seekStatus) {
+ case END:
+ return false;
+
+ case FOUND:
+ postingsEnum = termsEnum.postings(postingsEnum, PostingsEnum.NONE);
+ docIds = postingsEnumToIntsRef(postingsEnum, liveDocs);
+ if (docIds.length > 0) {
+ return true;
+ }
+
+ //Pretend we didn't find it; go to next term
+ docIds = null;
+ if (termsEnum.next() == null) { // case END
+ return false;
+ }
+ //fall through to NOT_FOUND
+
+ case NOT_FOUND:
+ //termsEnum must start with prefixBuf to continue
+ BytesRef teTerm = termsEnum.term();
+
+ if (teTerm.length > prefixBuf.length) {
+ for (int i = 0; i < prefixBuf.length; i++) {
+ if (prefixBuf.bytes[prefixBuf.offset + i] != teTerm.bytes[teTerm.offset + i])
+ return false;
+ }
+ if (teTerm.bytes[teTerm.offset + prefixBuf.length] != SEPARATOR_CHAR)
+ return false;
+ return true;
+ }
+ return false;
+ }
+ throw new IllegalStateException(seekStatus.toString());
+ }
+
+ /** Returns an IntsRef either cached or reading postingsEnum. Not null. */
+ private IntsRef postingsEnumToIntsRef(PostingsEnum postingsEnum, Bits liveDocs) throws IOException {
+ // (The cache can have empty IntsRefs)
+
+ //lookup prefixBuf in a cache
+ if (docIdsCache != null) {
+ docIds = docIdsCache.get(prefixBuf);
+ if (docIds != null) {
+ return docIds;
+ }
+ }
+
+ //read postingsEnum
+ docIds = new IntsRef(termsEnum.docFreq());
+ int docId;
+ while ((docId = postingsEnum.nextDoc()) != PostingsEnum.NO_MORE_DOCS) {
+ if (liveDocs != null && !liveDocs.get(postingsEnum.docID())) {
+ continue;
+ }
+ docIds.ints[docIds.length++] = docId;
+ }
+ if (docIds.length == 0)
+ docIds = EMPTY_INTSREF;
+
+ //cache
+ if (docIdsCache != null) {
+ ensureBufIsACopy();
+ //clone is shallow; that's okay as the prefix isn't overwritten; it's just appended to
+ docIdsCache.put(prefixBuf.clone(), docIds);
+ }
+ return docIds;
+ }
+
+ /** The docIds of the last call to advance, if it returned true. It might be null, but
+ * its length won't be 0. Treat as immutable. */
+ IntsRef getDocIds() {
+ assert docIds == null || docIds.length != 0;
+ return docIds;
+ }
+}
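
As a usage sketch (not part of the patch), the cursor above might be driven one word at a time; the termsEnum/liveDocs/docIdsCache wiring below is assumed, and only the advance()/getDocIds() contract comes from the class itself:

    // Walk a multi-word name, appending one word per advance() call.
    TermPrefixCursor cursor = new TermPrefixCursor(termsEnum, liveDocs, docIdsCache);
    for (BytesRef word : words) {            // e.g. "new", "york", "city"
      if (!cursor.advance(word)) {
        break;                               // no indexed term starts with this prefix
      }
      IntsRef docIds = cursor.getDocIds();   // null unless the current prefix is itself a full term
      if (docIds != null) {
        // docIds.ints[docIds.offset .. docIds.offset + docIds.length) hold matching (live) docs
      }
    }
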
diff --git a/solr/core/src/java/org/apache/solr/handler/tagger/XmlOffsetCorrector.java b/solr/core/src/java/org/apache/solr/handler/tagger/XmlOffsetCorrector.java
new file mode 100644
index 00000000000..576328f65be
--- /dev/null
+++ b/solr/core/src/java/org/apache/solr/handler/tagger/XmlOffsetCorrector.java
@@ -0,0 +1,113 @@
+/*
+ * This software was produced for the U. S. Government
+ * under Contract No. W15P7T-11-C-F600, and is
+ * subject to the Rights in Noncommercial Computer Software
+ * and Noncommercial Computer Software Documentation
+ * Clause 252.227-7014 (JUN 1995)
+ *
+ * Copyright 2013 The MITRE Corporation. All Rights Reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.solr.handler.tagger;
+
+import javax.xml.stream.XMLResolver;
+import javax.xml.stream.XMLStreamException;
+import javax.xml.stream.events.XMLEvent;
+import java.io.InputStream;
+import java.io.StringReader;
+
+import com.ctc.wstx.stax.WstxInputFactory;
+import org.apache.commons.io.input.ClosedInputStream;
+import org.codehaus.stax2.LocationInfo;
+import org.codehaus.stax2.XMLInputFactory2;
+import org.codehaus.stax2.XMLStreamReader2;
+
+/**
+ * Corrects offsets to adjust for XML formatted data. The goal is such that the caller should be
+ * able to insert a start XML tag at the start offset and a corresponding end XML tag at the end
+ * offset of the tagger, and have it be valid XML. See {@link #correctPair(int, int)}.
+ *
+ * This will not work on invalid XML.
+ *
+ * Not thread-safe.
+ */
+public class XmlOffsetCorrector extends OffsetCorrector {
+
+ //TODO use StAX without hard requirement on woodstox. xmlStreamReader.getLocation().getCharacterOffset()
+
+ private static final XMLInputFactory2 XML_INPUT_FACTORY;
+ static {
+ // note: similar code in Solr's EmptyEntityResolver
+ XML_INPUT_FACTORY = new WstxInputFactory();
+ XML_INPUT_FACTORY.setXMLResolver(new XMLResolver() {
+ @Override
+ public InputStream resolveEntity(String publicId, String systemId, String baseURI, String namespace) {
+ return ClosedInputStream.CLOSED_INPUT_STREAM;
+ }
+ });
+ // TODO disable DTD?
+ // XML_INPUT_FACTORY.setProperty(XMLInputFactory.IS_VALIDATING, Boolean.FALSE)
+ XML_INPUT_FACTORY.configureForSpeed();
+ }
+
+ /**
+ * Initialize based on the document text.
+ * @param docText non-null XML content.
+ * @throws XMLStreamException If there's a problem parsing the XML.
+ */
+ public XmlOffsetCorrector(String docText) throws XMLStreamException {
+ super(docText, false);
+
+ int tagCounter = 0;
+ int thisTag = -1;
+
+ //note: we *could* add a virtual outer tag to guarantee all text is in the context of a tag,
+ // but we shouldn't need to because there is no findable text outside the top element.
+
+ final XMLStreamReader2 xmlStreamReader =
+ (XMLStreamReader2) XML_INPUT_FACTORY.createXMLStreamReader(new StringReader(docText));
+
+ while (xmlStreamReader.hasNext()) {
+ int eventType = xmlStreamReader.next();
+ switch (eventType) {
+ case XMLEvent.START_ELEMENT: {
+ tagInfo.ensureCapacity(tagInfo.size() + 5);
+ final int parentTag = thisTag;
+ final LocationInfo info = xmlStreamReader.getLocationInfo();
+ tagInfo.add(parentTag);
+ tagInfo.add((int) info.getStartingCharOffset(), (int) info.getEndingCharOffset());
+ tagInfo.add(-1, -1);//these 2 will be populated when we get to the close tag
+ thisTag = tagCounter++;
+
+ parentChangeOffsets.add((int) info.getStartingCharOffset());
+ parentChangeIds.add(thisTag);
+ break;
+ }
+ case XMLEvent.END_ELEMENT: {
+ final LocationInfo info = xmlStreamReader.getLocationInfo();
+ tagInfo.set(5 * thisTag + 3, (int) info.getStartingCharOffset());
+ tagInfo.set(5 * thisTag + 4, (int) info.getEndingCharOffset());
+ thisTag = getParentTag(thisTag);
+
+ parentChangeOffsets.add((int) info.getEndingCharOffset());
+ parentChangeIds.add(thisTag);
+ break;
+ }
+ default: //do nothing
+ }
+ }
+ }
+
+}
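
As a rough usage sketch (not part of the patch): correctPair(int, int) is inherited from the OffsetCorrector base class referenced in the javadoc above, and since its return type is not shown in this hunk the result is left uncaptured here.

    // Tag offsets are found against the raw XML text, then corrected so that wrapping
    // the span in a start/end tag still yields well-formed XML.
    String xml = "<doc><p>visit new york city</p></doc>";
    XmlOffsetCorrector corrector = new XmlOffsetCorrector(xml);
    int start = xml.indexOf("new york city");
    int end = start + "new york city".length();
    corrector.correctPair(start, end);  // returns the adjusted pair (shape defined in OffsetCorrector)
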
diff --git a/solr/core/src/java/org/apache/solr/handler/tagger/package-info.java b/solr/core/src/java/org/apache/solr/handler/tagger/package-info.java
new file mode 100644
index 00000000000..c2055b308e5
--- /dev/null
+++ b/solr/core/src/java/org/apache/solr/handler/tagger/package-info.java
@@ -0,0 +1,27 @@
+/*
+ * This software was produced for the U. S. Government
+ * under Contract No. W15P7T-11-C-F600, and is
+ * subject to the Rights in Noncommercial Computer Software
+ * and Noncommercial Computer Software Documentation
+ * Clause 252.227-7014 (JUN 1995)
+ *
+ * Copyright 2013 The MITRE Corporation. All Rights Reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * The {@link org.apache.solr.handler.tagger.TaggerRequestHandler} and supporting classes.
+ * This was formerly known as OpenSextant's SolrTextTagger.
+ */
+package org.apache.solr.handler.tagger;
\ No newline at end of file
diff --git a/solr/core/src/java/org/apache/solr/search/CollapsingQParserPlugin.java b/solr/core/src/java/org/apache/solr/search/CollapsingQParserPlugin.java
index 76a52583e32..d0f8cd4633e 100644
--- a/solr/core/src/java/org/apache/solr/search/CollapsingQParserPlugin.java
+++ b/solr/core/src/java/org/apache/solr/search/CollapsingQParserPlugin.java
@@ -425,7 +425,7 @@ public class CollapsingQParserPlugin extends QParserPlugin {
DocValuesType.NONE,
fieldInfo.getDocValuesGen(),
fieldInfo.attributes(),
- 0, 0);
+ 0, 0, fieldInfo.isSoftDeletesField());
newInfos.add(f);
} else {
diff --git a/solr/core/src/java/org/apache/solr/search/Insanity.java b/solr/core/src/java/org/apache/solr/search/Insanity.java
index aa366521e88..8fe081f947b 100644
--- a/solr/core/src/java/org/apache/solr/search/Insanity.java
+++ b/solr/core/src/java/org/apache/solr/search/Insanity.java
@@ -66,7 +66,7 @@ public class Insanity {
if (fi.name.equals(insaneField)) {
filteredInfos.add(new FieldInfo(fi.name, fi.number, fi.hasVectors(), fi.omitsNorms(),
fi.hasPayloads(), fi.getIndexOptions(), DocValuesType.NONE, -1, Collections.emptyMap(),
- fi.getPointDimensionCount(), fi.getPointNumBytes()));
+ fi.getPointDimensionCount(), fi.getPointNumBytes(), fi.isSoftDeletesField()));
} else {
filteredInfos.add(fi);
}
diff --git a/solr/core/src/java/org/apache/solr/search/QParserPlugin.java b/solr/core/src/java/org/apache/solr/search/QParserPlugin.java
index f80bc9c3cda..b20c3c87ac9 100644
--- a/solr/core/src/java/org/apache/solr/search/QParserPlugin.java
+++ b/solr/core/src/java/org/apache/solr/search/QParserPlugin.java
@@ -37,53 +37,53 @@ public abstract class QParserPlugin implements NamedListInitializedPlugin, SolrI
public static final String DEFAULT_QTYPE = LuceneQParserPlugin.NAME;
/**
- * Internal use - name to class mappings of builtin parsers.
+ * Internal use - name to parser for the builtin parsers.
* Each query parser plugin extending {@link QParserPlugin} has its own instance of standardPlugins.
* This leads to cyclic dependencies between static fields, and to cases where a NAME field is not yet initialized.
* That results in an NPE during initialization.
* For every plugin listed here, the NAME field has to be final and static.
*/
- public static final Map<String, Class<? extends QParserPlugin>> standardPlugins;
+ public static final Map<String, QParserPlugin> standardPlugins;
static {
- HashMap<String, Class<? extends QParserPlugin>> map = new HashMap<>(30, 1);
- map.put(LuceneQParserPlugin.NAME, LuceneQParserPlugin.class);
- map.put(FunctionQParserPlugin.NAME, FunctionQParserPlugin.class);
- map.put(PrefixQParserPlugin.NAME, PrefixQParserPlugin.class);
- map.put(BoostQParserPlugin.NAME, BoostQParserPlugin.class);
- map.put(DisMaxQParserPlugin.NAME, DisMaxQParserPlugin.class);
- map.put(ExtendedDismaxQParserPlugin.NAME, ExtendedDismaxQParserPlugin.class);
- map.put(FieldQParserPlugin.NAME, FieldQParserPlugin.class);
- map.put(RawQParserPlugin.NAME, RawQParserPlugin.class);
- map.put(TermQParserPlugin.NAME, TermQParserPlugin.class);
- map.put(TermsQParserPlugin.NAME, TermsQParserPlugin.class);
- map.put(NestedQParserPlugin.NAME, NestedQParserPlugin.class);
- map.put(FunctionRangeQParserPlugin.NAME, FunctionRangeQParserPlugin.class);
- map.put(SpatialFilterQParserPlugin.NAME, SpatialFilterQParserPlugin.class);
- map.put(SpatialBoxQParserPlugin.NAME, SpatialBoxQParserPlugin.class);
- map.put(JoinQParserPlugin.NAME, JoinQParserPlugin.class);
- map.put(SurroundQParserPlugin.NAME, SurroundQParserPlugin.class);
- map.put(SwitchQParserPlugin.NAME, SwitchQParserPlugin.class);
- map.put(MaxScoreQParserPlugin.NAME, MaxScoreQParserPlugin.class);
- map.put(BlockJoinParentQParserPlugin.NAME, BlockJoinParentQParserPlugin.class);
- map.put(BlockJoinChildQParserPlugin.NAME, BlockJoinChildQParserPlugin.class);
- map.put(FiltersQParserPlugin.NAME, FiltersQParserPlugin.class);
- map.put(CollapsingQParserPlugin.NAME, CollapsingQParserPlugin.class);
- map.put(SimpleQParserPlugin.NAME, SimpleQParserPlugin.class);
- map.put(ComplexPhraseQParserPlugin.NAME, ComplexPhraseQParserPlugin.class);
- map.put(ReRankQParserPlugin.NAME, ReRankQParserPlugin.class);
- map.put(ExportQParserPlugin.NAME, ExportQParserPlugin.class);
- map.put(MLTQParserPlugin.NAME, MLTQParserPlugin.class);
- map.put(HashQParserPlugin.NAME, HashQParserPlugin.class);
- map.put(GraphQParserPlugin.NAME, GraphQParserPlugin.class);
- map.put(XmlQParserPlugin.NAME, XmlQParserPlugin.class);
- map.put(GraphTermsQParserPlugin.NAME, GraphTermsQParserPlugin.class);
- map.put(IGainTermsQParserPlugin.NAME, IGainTermsQParserPlugin.class);
- map.put(TextLogisticRegressionQParserPlugin.NAME, TextLogisticRegressionQParserPlugin.class);
- map.put(SignificantTermsQParserPlugin.NAME, SignificantTermsQParserPlugin.class);
- map.put(PayloadScoreQParserPlugin.NAME, PayloadScoreQParserPlugin.class);
- map.put(PayloadCheckQParserPlugin.NAME, PayloadCheckQParserPlugin.class);
- map.put(BoolQParserPlugin.NAME, BoolQParserPlugin.class);
+ HashMap<String, QParserPlugin> map = new HashMap<>(30, 1);
+ map.put(LuceneQParserPlugin.NAME, new LuceneQParserPlugin());
+ map.put(FunctionQParserPlugin.NAME, new FunctionQParserPlugin());
+ map.put(PrefixQParserPlugin.NAME, new PrefixQParserPlugin());
+ map.put(BoostQParserPlugin.NAME, new BoostQParserPlugin());
+ map.put(DisMaxQParserPlugin.NAME, new DisMaxQParserPlugin());
+ map.put(ExtendedDismaxQParserPlugin.NAME, new ExtendedDismaxQParserPlugin());
+ map.put(FieldQParserPlugin.NAME, new FieldQParserPlugin());
+ map.put(RawQParserPlugin.NAME, new RawQParserPlugin());
+ map.put(TermQParserPlugin.NAME, new TermQParserPlugin());
+ map.put(TermsQParserPlugin.NAME, new TermsQParserPlugin());
+ map.put(NestedQParserPlugin.NAME, new NestedQParserPlugin());
+ map.put(FunctionRangeQParserPlugin.NAME, new FunctionRangeQParserPlugin());
+ map.put(SpatialFilterQParserPlugin.NAME, new SpatialFilterQParserPlugin());
+ map.put(SpatialBoxQParserPlugin.NAME, new SpatialBoxQParserPlugin());
+ map.put(JoinQParserPlugin.NAME, new JoinQParserPlugin());
+ map.put(SurroundQParserPlugin.NAME, new SurroundQParserPlugin());
+ map.put(SwitchQParserPlugin.NAME, new SwitchQParserPlugin());
+ map.put(MaxScoreQParserPlugin.NAME, new MaxScoreQParserPlugin());
+ map.put(BlockJoinParentQParserPlugin.NAME, new BlockJoinParentQParserPlugin());
+ map.put(BlockJoinChildQParserPlugin.NAME, new BlockJoinChildQParserPlugin());
+ map.put(FiltersQParserPlugin.NAME, new FiltersQParserPlugin());
+ map.put(CollapsingQParserPlugin.NAME, new CollapsingQParserPlugin());
+ map.put(SimpleQParserPlugin.NAME, new SimpleQParserPlugin());
+ map.put(ComplexPhraseQParserPlugin.NAME, new ComplexPhraseQParserPlugin());
+ map.put(ReRankQParserPlugin.NAME, new ReRankQParserPlugin());
+ map.put(ExportQParserPlugin.NAME, new ExportQParserPlugin());
+ map.put(MLTQParserPlugin.NAME, new MLTQParserPlugin());
+ map.put(HashQParserPlugin.NAME, new HashQParserPlugin());
+ map.put(GraphQParserPlugin.NAME, new GraphQParserPlugin());
+ map.put(XmlQParserPlugin.NAME, new XmlQParserPlugin());
+ map.put(GraphTermsQParserPlugin.NAME, new GraphTermsQParserPlugin());
+ map.put(IGainTermsQParserPlugin.NAME, new IGainTermsQParserPlugin());
+ map.put(TextLogisticRegressionQParserPlugin.NAME, new TextLogisticRegressionQParserPlugin());
+ map.put(SignificantTermsQParserPlugin.NAME, new SignificantTermsQParserPlugin());
+ map.put(PayloadScoreQParserPlugin.NAME, new PayloadScoreQParserPlugin());
+ map.put(PayloadCheckQParserPlugin.NAME, new PayloadCheckQParserPlugin());
+ map.put(BoolQParserPlugin.NAME, new BoolQParserPlugin());
standardPlugins = Collections.unmodifiableMap(map);
}
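
The effect of this change on lookup code elsewhere (sketched here, not part of the patch) is that callers no longer need reflective instantiation, which is what triggered the static-initialization cycle described in the javadoc above:

    // Before: the registry held classes, so resolving a parser meant something like
    // standardPlugins.get(name).newInstance(), which could run a plugin's static init
    // before its NAME field was assigned.
    // After: the registry holds ready-made instances.
    QParserPlugin parser = QParserPlugin.standardPlugins.get(LuceneQParserPlugin.NAME);
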
diff --git a/solr/core/src/java/org/apache/solr/security/PKIAuthenticationPlugin.java b/solr/core/src/java/org/apache/solr/security/PKIAuthenticationPlugin.java
index 877e4f16cd6..43dac480168 100644
--- a/solr/core/src/java/org/apache/solr/security/PKIAuthenticationPlugin.java
+++ b/solr/core/src/java/org/apache/solr/security/PKIAuthenticationPlugin.java
@@ -47,11 +47,7 @@ import org.apache.solr.common.util.StrUtils;
import org.apache.solr.common.util.SuppressForbidden;
import org.apache.solr.common.util.Utils;
import org.apache.solr.core.CoreContainer;
-import org.apache.solr.handler.RequestHandlerBase;
-import org.apache.solr.request.SolrQueryRequest;
-import org.apache.solr.request.SolrRequestHandler;
import org.apache.solr.request.SolrRequestInfo;
-import org.apache.solr.response.SolrQueryResponse;
import org.apache.solr.util.CryptoKeys;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
@@ -62,7 +58,7 @@ import static java.nio.charset.StandardCharsets.UTF_8;
public class PKIAuthenticationPlugin extends AuthenticationPlugin implements HttpClientBuilderPlugin {
private static final Logger log = LoggerFactory.getLogger(MethodHandles.lookup().lookupClass());
private final Map<String, PublicKey> keyCache = new ConcurrentHashMap<>();
- private final CryptoKeys.RSAKeyPair keyPair = new CryptoKeys.RSAKeyPair();
+ private final PublicKeyHandler publicKeyHandler;
private final CoreContainer cores;
private final int MAX_VALIDITY = Integer.parseInt(System.getProperty("pkiauth.ttl", "10000"));
private final String myNodeName;
@@ -77,7 +73,8 @@ public class PKIAuthenticationPlugin extends AuthenticationPlugin implements Htt
return interceptorRegistered;
}
- public PKIAuthenticationPlugin(CoreContainer cores, String nodeName) {
+ public PKIAuthenticationPlugin(CoreContainer cores, String nodeName, PublicKeyHandler publicKeyHandler) {
+ this.publicKeyHandler = publicKeyHandler;
this.cores = cores;
myNodeName = nodeName;
}
@@ -92,7 +89,7 @@ public class PKIAuthenticationPlugin extends AuthenticationPlugin implements Htt
public boolean doAuthenticate(ServletRequest request, ServletResponse response, FilterChain filterChain) throws Exception {
String requestURI = ((HttpServletRequest) request).getRequestURI();
- if (requestURI.endsWith(PATH)) {
+ if (requestURI.endsWith(PublicKeyHandler.PATH)) {
filterChain.doFilter(request, response);
return true;
}
@@ -198,7 +195,7 @@ public class PKIAuthenticationPlugin extends AuthenticationPlugin implements Htt
String url = cores.getZkController().getZkStateReader().getBaseUrlForNodeName(nodename);
HttpEntity entity = null;
try {
- String uri = url + PATH + "?wt=json&omitHeader=true";
+ String uri = url + PublicKeyHandler.PATH + "?wt=json&omitHeader=true";
log.debug("Fetching fresh public key from : {}",uri);
HttpResponse rsp = cores.getUpdateShardHandler().getDefaultHttpClient()
.execute(new HttpGet(uri), HttpClientUtil.createNewHttpClientRequestContext());
@@ -207,7 +204,7 @@ public class PKIAuthenticationPlugin extends AuthenticationPlugin implements Htt
Map m = (Map) Utils.fromJSON(bytes);
String key = (String) m.get("key");
if (key == null) {
- log.error("No key available from " + url + PATH);
+ log.error("No key available from " + url + PublicKeyHandler.PATH);
return null;
} else {
log.info("New Key obtained from node: {} / {}", nodename, key);
@@ -230,26 +227,6 @@ public class PKIAuthenticationPlugin extends AuthenticationPlugin implements Htt
return builder;
}
- public SolrRequestHandler getRequestHandler() {
- return new RequestHandlerBase() {
- @Override
- public void handleRequestBody(SolrQueryRequest req, SolrQueryResponse rsp) throws Exception {
- rsp.add("key", keyPair.getPublicKeyStr());
- }
-
- @Override
- public String getDescription() {
- return "Return the public key of this server";
- }
-
- @Override
- public Category getCategory() {
- return Category.ADMIN;
- }
-
- };
- }
-
public boolean needsAuthorization(HttpServletRequest req) {
return req.getUserPrincipal() != SU;
}
@@ -292,7 +269,7 @@ public class PKIAuthenticationPlugin extends AuthenticationPlugin implements Htt
String s = usr + " " + System.currentTimeMillis();
byte[] payload = s.getBytes(UTF_8);
- byte[] payloadCipher = keyPair.encrypt(ByteBuffer.wrap(payload));
+ byte[] payloadCipher = publicKeyHandler.keyPair.encrypt(ByteBuffer.wrap(payload));
String base64Cipher = Base64.byteArrayToBase64(payloadCipher);
httpRequest.setHeader(HEADER, myNodeName + " " + base64Cipher);
}
@@ -316,11 +293,10 @@ public class PKIAuthenticationPlugin extends AuthenticationPlugin implements Htt
}
public String getPublicKey() {
- return keyPair.getPublicKeyStr();
+ return publicKeyHandler.getPublicKey();
}
public static final String HEADER = "SolrAuth";
- public static final String PATH = "/admin/info/key";
public static final String NODE_IS_USER = "$";
// special principal to denote the cluster member
private static final Principal SU = new BasicUserPrincipal("$");
diff --git a/solr/core/src/java/org/apache/solr/security/PublicKeyHandler.java b/solr/core/src/java/org/apache/solr/security/PublicKeyHandler.java
new file mode 100644
index 00000000000..ad835782a74
--- /dev/null
+++ b/solr/core/src/java/org/apache/solr/security/PublicKeyHandler.java
@@ -0,0 +1,47 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.solr.security;
+
+import org.apache.solr.handler.RequestHandlerBase;
+import org.apache.solr.request.SolrQueryRequest;
+import org.apache.solr.response.SolrQueryResponse;
+import org.apache.solr.util.CryptoKeys;
+
+public class PublicKeyHandler extends RequestHandlerBase {
+ public static final String PATH = "/admin/info/key";
+ final CryptoKeys.RSAKeyPair keyPair = new CryptoKeys.RSAKeyPair();
+
+ public String getPublicKey() {
+ return keyPair.getPublicKeyStr();
+ }
+
+ @Override
+ public void handleRequestBody(SolrQueryRequest req, SolrQueryResponse rsp) throws Exception {
+ rsp.add("key", keyPair.getPublicKeyStr());
+ }
+
+ @Override
+ public String getDescription() {
+ return "Return the public key of this server";
+ }
+
+ @Override
+ public Category getCategory() {
+ return Category.ADMIN;
+ }
+}
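
Sketch of the resulting wiring (the CoreContainer side that constructs these objects is outside this hunk, so the cores/nodeName variables below are assumed):

    // The handler owns the RSA key pair; the PKI plugin now only borrows it.
    PublicKeyHandler keyHandler = new PublicKeyHandler();
    PKIAuthenticationPlugin pki = new PKIAuthenticationPlugin(cores, nodeName, keyHandler);
    String publicKey = pki.getPublicKey();   // delegates to keyHandler.getPublicKey()
    // Other nodes still fetch the key over HTTP, now via PublicKeyHandler.PATH, e.g.
    //   http://host:8983/solr/admin/info/key?wt=json&omitHeader=true
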
diff --git a/solr/core/src/java/org/apache/solr/servlet/HttpSolrCall.java b/solr/core/src/java/org/apache/solr/servlet/HttpSolrCall.java
index d1347298505..b297a4430a1 100644
--- a/solr/core/src/java/org/apache/solr/servlet/HttpSolrCall.java
+++ b/solr/core/src/java/org/apache/solr/servlet/HttpSolrCall.java
@@ -97,7 +97,7 @@ import org.apache.solr.security.AuthorizationContext;
import org.apache.solr.security.AuthorizationContext.CollectionRequest;
import org.apache.solr.security.AuthorizationContext.RequestType;
import org.apache.solr.security.AuthorizationResponse;
-import org.apache.solr.security.PKIAuthenticationPlugin;
+import org.apache.solr.security.PublicKeyHandler;
import org.apache.solr.servlet.SolrDispatchFilter.Action;
import org.apache.solr.servlet.cache.HttpCacheHeaderUtil;
import org.apache.solr.servlet.cache.Method;
@@ -547,7 +547,7 @@ public class HttpSolrCall {
}
private boolean shouldAuthorize() {
- if(PKIAuthenticationPlugin.PATH.equals(path)) return false;
+ if(PublicKeyHandler.PATH.equals(path)) return false;
// /admin/info/key is the path where the public key is exposed; it is always unsecured
if (cores.getPkiAuthenticationPlugin() != null && req.getUserPrincipal() != null) {
boolean b = cores.getPkiAuthenticationPlugin().needsAuthorization(req);
diff --git a/solr/core/src/java/org/apache/solr/servlet/SolrDispatchFilter.java b/solr/core/src/java/org/apache/solr/servlet/SolrDispatchFilter.java
index c7fdd57f90d..78e58d000aa 100644
--- a/solr/core/src/java/org/apache/solr/servlet/SolrDispatchFilter.java
+++ b/solr/core/src/java/org/apache/solr/servlet/SolrDispatchFilter.java
@@ -16,6 +16,20 @@
*/
package org.apache.solr.servlet;
+import javax.servlet.FilterChain;
+import javax.servlet.FilterConfig;
+import javax.servlet.ReadListener;
+import javax.servlet.ServletException;
+import javax.servlet.ServletInputStream;
+import javax.servlet.ServletOutputStream;
+import javax.servlet.ServletRequest;
+import javax.servlet.ServletResponse;
+import javax.servlet.UnavailableException;
+import javax.servlet.WriteListener;
+import javax.servlet.http.HttpServletRequest;
+import javax.servlet.http.HttpServletRequestWrapper;
+import javax.servlet.http.HttpServletResponse;
+import javax.servlet.http.HttpServletResponseWrapper;
import java.io.ByteArrayInputStream;
import java.io.IOException;
import java.io.InputStream;
@@ -35,21 +49,10 @@ import java.util.concurrent.atomic.AtomicReference;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
-import javax.servlet.FilterChain;
-import javax.servlet.FilterConfig;
-import javax.servlet.ReadListener;
-import javax.servlet.ServletException;
-import javax.servlet.ServletInputStream;
-import javax.servlet.ServletOutputStream;
-import javax.servlet.ServletRequest;
-import javax.servlet.ServletResponse;
-import javax.servlet.UnavailableException;
-import javax.servlet.WriteListener;
-import javax.servlet.http.HttpServletRequest;
-import javax.servlet.http.HttpServletRequestWrapper;
-import javax.servlet.http.HttpServletResponse;
-import javax.servlet.http.HttpServletResponseWrapper;
-
+import com.codahale.metrics.jvm.ClassLoadingGaugeSet;
+import com.codahale.metrics.jvm.GarbageCollectorMetricSet;
+import com.codahale.metrics.jvm.MemoryUsageGaugeSet;
+import com.codahale.metrics.jvm.ThreadStatesGaugeSet;
import org.apache.commons.io.FileCleaningTracker;
import org.apache.commons.lang.StringUtils;
import org.apache.http.client.HttpClient;
@@ -72,17 +75,13 @@ import org.apache.solr.metrics.SolrMetricManager;
import org.apache.solr.request.SolrRequestInfo;
import org.apache.solr.security.AuthenticationPlugin;
import org.apache.solr.security.PKIAuthenticationPlugin;
+import org.apache.solr.security.PublicKeyHandler;
import org.apache.solr.util.SolrFileCleaningTracker;
import org.apache.solr.util.StartupLoggingUtils;
import org.apache.solr.util.configuration.SSLConfigurationsFactory;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
-import com.codahale.metrics.jvm.ClassLoadingGaugeSet;
-import com.codahale.metrics.jvm.GarbageCollectorMetricSet;
-import com.codahale.metrics.jvm.MemoryUsageGaugeSet;
-import com.codahale.metrics.jvm.ThreadStatesGaugeSet;
-
/**
* This filter looks at the incoming URL and maps it to handlers defined in solrconfig.xml
*
@@ -441,8 +440,8 @@ public class SolrDispatchFilter extends BaseSolrFilter {
// /admin/info/key must be always open. see SOLR-9188
// tests work only w/ getPathInfo
//otherwise it's just enough to have getServletPath()
- if (PKIAuthenticationPlugin.PATH.equals(request.getServletPath()) ||
- PKIAuthenticationPlugin.PATH.equals(request.getPathInfo())) return true;
+ if (PublicKeyHandler.PATH.equals(request.getServletPath()) ||
+ PublicKeyHandler.PATH.equals(request.getPathInfo())) return true;
String header = request.getHeader(PKIAuthenticationPlugin.HEADER);
if (header != null && cores.getPkiAuthenticationPlugin() != null)
authenticationPlugin = cores.getPkiAuthenticationPlugin();
diff --git a/solr/core/src/java/org/apache/solr/uninverting/UninvertingReader.java b/solr/core/src/java/org/apache/solr/uninverting/UninvertingReader.java
index 967db541414..9f0f5271c67 100644
--- a/solr/core/src/java/org/apache/solr/uninverting/UninvertingReader.java
+++ b/solr/core/src/java/org/apache/solr/uninverting/UninvertingReader.java
@@ -282,7 +282,7 @@ public class UninvertingReader extends FilterLeafReader {
}
filteredInfos.add(new FieldInfo(fi.name, fi.number, fi.hasVectors(), fi.omitsNorms(),
fi.hasPayloads(), fi.getIndexOptions(), type, fi.getDocValuesGen(), fi.attributes(),
- fi.getPointDimensionCount(), fi.getPointNumBytes()));
+ fi.getPointDimensionCount(), fi.getPointNumBytes(), fi.isSoftDeletesField()));
}
fieldInfos = new FieldInfos(filteredInfos.toArray(new FieldInfo[filteredInfos.size()]));
}
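
All of these FieldInfo call sites follow the same pattern; as an isolated sketch (the values are illustrative, the argument order is the one used in the hunks above), the copy now threads the soft-deletes marker through:

    FieldInfo copy = new FieldInfo(fi.name, fi.number, fi.hasVectors(), fi.omitsNorms(),
        fi.hasPayloads(), fi.getIndexOptions(), fi.getDocValuesType(), fi.getDocValuesGen(),
        fi.attributes(), fi.getPointDimensionCount(), fi.getPointNumBytes(),
        fi.isSoftDeletesField());   // new trailing argument added by this patch
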
diff --git a/solr/core/src/java/org/apache/solr/update/CdcrTransactionLog.java b/solr/core/src/java/org/apache/solr/update/CdcrTransactionLog.java
index 3534f622908..f668540325e 100644
--- a/solr/core/src/java/org/apache/solr/update/CdcrTransactionLog.java
+++ b/solr/core/src/java/org/apache/solr/update/CdcrTransactionLog.java
@@ -41,7 +41,7 @@ import org.slf4j.LoggerFactory;
* methods {@link #incref()}, {@link #close()} and {@link #reopenOutputStream()}.
* encode the number of records in the tlog file in the last commit record. The number of records will be
* decoded and reused if the tlog file is reopened. This is achieved by extending the constructor, and the
- * methods {@link #writeCommit(CommitUpdateCommand, int)} and {@link #getReader(long)}.
+ * methods {@link #writeCommit(CommitUpdateCommand)} and {@link #getReader(long)}.
*
*/
public class CdcrTransactionLog extends TransactionLog {
@@ -108,7 +108,7 @@ public class CdcrTransactionLog extends TransactionLog {
}
@Override
- public long write(AddUpdateCommand cmd, long prevPointer, int flags) {
+ public long write(AddUpdateCommand cmd, long prevPointer) {
assert (-1 <= prevPointer && (cmd.isInPlaceUpdate() || (-1 == prevPointer)));
LogCodec codec = new LogCodec(resolver);
@@ -125,7 +125,7 @@ public class CdcrTransactionLog extends TransactionLog {
codec.init(out);
if (cmd.isInPlaceUpdate()) {
codec.writeTag(JavaBinCodec.ARR, 6);
- codec.writeInt(UpdateLog.UPDATE_INPLACE | flags); // should just take one byte
+ codec.writeInt(UpdateLog.UPDATE_INPLACE); // should just take one byte
codec.writeLong(cmd.getVersion());
codec.writeLong(prevPointer);
codec.writeLong(cmd.prevVersion);
@@ -141,7 +141,7 @@ public class CdcrTransactionLog extends TransactionLog {
} else {
codec.writeTag(JavaBinCodec.ARR, 4);
- codec.writeInt(UpdateLog.ADD | flags); // should just take one byte
+ codec.writeInt(UpdateLog.ADD); // should just take one byte
codec.writeLong(cmd.getVersion());
if (cmd.getReq().getParamString().contains(CdcrUpdateProcessor.CDCR_UPDATE)) {
// if the update is received via cdcr source; add extra boolean entry
@@ -179,7 +179,7 @@ public class CdcrTransactionLog extends TransactionLog {
}
@Override
- public long writeDelete(DeleteUpdateCommand cmd, int flags) {
+ public long writeDelete(DeleteUpdateCommand cmd) {
LogCodec codec = new LogCodec(resolver);
try {
@@ -190,7 +190,7 @@ public class CdcrTransactionLog extends TransactionLog {
MemOutputStream out = new MemOutputStream(new byte[20 + br.length]);
codec.init(out);
codec.writeTag(JavaBinCodec.ARR, 4);
- codec.writeInt(UpdateLog.DELETE | flags); // should just take one byte
+ codec.writeInt(UpdateLog.DELETE); // should just take one byte
codec.writeLong(cmd.getVersion());
codec.writeByteArray(br.bytes, br.offset, br.length);
if (cmd.getReq().getParamString().contains(CdcrUpdateProcessor.CDCR_UPDATE)) {
@@ -217,7 +217,7 @@ public class CdcrTransactionLog extends TransactionLog {
}
@Override
- public long writeDeleteByQuery(DeleteUpdateCommand cmd, int flags) {
+ public long writeDeleteByQuery(DeleteUpdateCommand cmd) {
LogCodec codec = new LogCodec(resolver);
try {
checkWriteHeader(codec, null);
@@ -225,7 +225,7 @@ public class CdcrTransactionLog extends TransactionLog {
MemOutputStream out = new MemOutputStream(new byte[20 + (cmd.query.length())]);
codec.init(out);
codec.writeTag(JavaBinCodec.ARR, 4);
- codec.writeInt(UpdateLog.DELETE_BY_QUERY | flags); // should just take one byte
+ codec.writeInt(UpdateLog.DELETE_BY_QUERY); // should just take one byte
codec.writeLong(cmd.getVersion());
codec.writeStr(cmd.query);
if (cmd.getReq().getParamString().contains(CdcrUpdateProcessor.CDCR_UPDATE)) {
@@ -249,7 +249,7 @@ public class CdcrTransactionLog extends TransactionLog {
}
@Override
- public long writeCommit(CommitUpdateCommand cmd, int flags) {
+ public long writeCommit(CommitUpdateCommand cmd) {
LogCodec codec = new LogCodec(resolver);
synchronized (this) {
try {
@@ -261,7 +261,7 @@ public class CdcrTransactionLog extends TransactionLog {
}
codec.init(fos);
codec.writeTag(JavaBinCodec.ARR, 4);
- codec.writeInt(UpdateLog.COMMIT | flags); // should just take one byte
+ codec.writeInt(UpdateLog.COMMIT); // should just take one byte
codec.writeLong(cmd.getVersion());
codec.writeTag(JavaBinCodec.INT); // Enforce the encoding of a plain integer, to simplify decoding
fos.writeInt(numRecords + 1); // the number of records in the file - +1 to account for the commit operation being written
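
With the int flags parameter gone from all of the write methods, the opcode at the head of each record is now just the operation constant; a small sketch of the header arithmetic (the constants are the UpdateLog ones used above):

    int written = UpdateLog.ADD;                       // no longer OR-ed with per-log flags
    // When reading, OPERATION_MASK still strips any flag bits that older tlogs may carry:
    int oper = written & UpdateLog.OPERATION_MASK;
    assert oper == UpdateLog.ADD;
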
diff --git a/solr/core/src/java/org/apache/solr/update/CdcrUpdateLog.java b/solr/core/src/java/org/apache/solr/update/CdcrUpdateLog.java
index 6b202044d76..bff16122ecf 100644
--- a/solr/core/src/java/org/apache/solr/update/CdcrUpdateLog.java
+++ b/solr/core/src/java/org/apache/solr/update/CdcrUpdateLog.java
@@ -352,7 +352,6 @@ public class CdcrUpdateLog extends UpdateLog {
long latestVersion = startingUpdates.getMaxRecentVersion();
try {
startingVersions = startingUpdates.getVersions(numRecordsToKeep);
- startingOperation = startingUpdates.getLatestOperation();
// populate recent deletes list (since we can't get that info from the index)
for (int i=startingUpdates.deleteList.size()-1; i>=0; i--) {
@@ -389,9 +388,7 @@ public class CdcrUpdateLog extends UpdateLog {
*/
private void copyBufferedUpdates(File tlogSrc, long offsetSrc, long latestVersion) {
recoveryInfo = new RecoveryInfo();
- recoveryInfo.positionOfStart = tlog == null ? 0 : tlog.snapshot();
state = State.BUFFERING;
- operationFlags |= FLAG_GAP;
ModifiableSolrParams params = new ModifiableSolrParams();
params.set(DistributingUpdateProcessorFactory.DISTRIB_UPDATE_PARAM, DistributedUpdateProcessor.DistribPhase.FROMLEADER.toString());
diff --git a/solr/core/src/java/org/apache/solr/update/HdfsTransactionLog.java b/solr/core/src/java/org/apache/solr/update/HdfsTransactionLog.java
index 0f89016a107..8ed7d7ad65a 100644
--- a/solr/core/src/java/org/apache/solr/update/HdfsTransactionLog.java
+++ b/solr/core/src/java/org/apache/solr/update/HdfsTransactionLog.java
@@ -166,20 +166,6 @@ public class HdfsTransactionLog extends TransactionLog {
}
return true;
}
-
- // This could mess with any readers or reverse readers that are open, or anything that might try to do a log lookup.
- // This should only be used to roll back buffered updates, not actually applied updates.
- @Override
- public void rollback(long pos) throws IOException {
- synchronized (this) {
- assert snapshot_size == pos;
- ensureFlushed();
- // TODO: how do we rollback with hdfs?? We need HDFS-3107
- fos.setWritten(pos);
- assert fos.size() == pos;
- numRecords = snapshot_numRecords;
- }
- }
private void readHeader(FastInputStream fis) throws IOException {
// read existing header
@@ -210,7 +196,7 @@ public class HdfsTransactionLog extends TransactionLog {
}
@Override
- public long writeCommit(CommitUpdateCommand cmd, int flags) {
+ public long writeCommit(CommitUpdateCommand cmd) {
LogCodec codec = new LogCodec(resolver);
synchronized (this) {
try {
@@ -223,7 +209,7 @@ public class HdfsTransactionLog extends TransactionLog {
codec.init(fos);
codec.writeTag(JavaBinCodec.ARR, 3);
- codec.writeInt(UpdateLog.COMMIT | flags); // should just take one byte
+ codec.writeInt(UpdateLog.COMMIT); // should just take one byte
codec.writeLong(cmd.getVersion());
codec.writeStr(END_MESSAGE); // ensure these bytes are (almost) last in the file
diff --git a/solr/core/src/java/org/apache/solr/update/HdfsUpdateLog.java b/solr/core/src/java/org/apache/solr/update/HdfsUpdateLog.java
index 7bb74d05bf9..8ca4b1cb3e5 100644
--- a/solr/core/src/java/org/apache/solr/update/HdfsUpdateLog.java
+++ b/solr/core/src/java/org/apache/solr/update/HdfsUpdateLog.java
@@ -65,37 +65,6 @@ public class HdfsUpdateLog extends UpdateLog {
this.confDir = confDir;
}
- // HACK
- // while waiting for HDFS-3107, instead of quickly
- // dropping, we slowly apply
- // This is somewhat brittle, but current usage
- // allows for it
- @Override
- public boolean dropBufferedUpdates() {
- versionInfo.blockUpdates();
- try {
- if (state != State.BUFFERING) return false;
-
- if (log.isInfoEnabled()) {
- log.info("Dropping buffered updates " + this);
- }
-
- // since we blocked updates, this synchronization shouldn't strictly be
- // necessary.
- synchronized (this) {
- if (tlog != null) {
- // tlog.rollback(recoveryInfo.positionOfStart);
- }
- }
-
- state = State.ACTIVE;
- operationFlags &= ~FLAG_GAP;
- } finally {
- versionInfo.unblockUpdates();
- }
- return true;
- }
-
@Override
public void init(PluginInfo info) {
super.init(info);
@@ -186,6 +155,11 @@ public class HdfsUpdateLog extends UpdateLog {
throw new RuntimeException("Problem creating directory: " + tlogDir, e);
}
}
+
+ String[] oldBufferTlog = getBufferLogList(fs, tlogDir);
+ if (oldBufferTlog != null && oldBufferTlog.length != 0) {
+ existOldBufferLog = true;
+ }
tlogFiles = getLogList(fs, tlogDir);
id = getLastLogId() + 1; // add 1 since we will create a new log for the
@@ -241,7 +215,6 @@ public class HdfsUpdateLog extends UpdateLog {
// non-complete tlogs.
try (RecentUpdates startingUpdates = getRecentUpdates()) {
startingVersions = startingUpdates.getVersions(getNumRecordsToKeep());
- startingOperation = startingUpdates.getLatestOperation();
// populate recent deletes list (since we can't get that info from the
// index)
@@ -269,6 +242,23 @@ public class HdfsUpdateLog extends UpdateLog {
public String getLogDir() {
return tlogDir.toUri().toString();
}
+
+ public static String[] getBufferLogList(FileSystem fs, Path tlogDir) {
+ final String prefix = BUFFER_TLOG_NAME+'.';
+ assert fs != null;
+ FileStatus[] fileStatuses;
+ try {
+ fileStatuses = fs.listStatus(tlogDir, path -> path.getName().startsWith(prefix));
+ } catch (IOException e) {
+ throw new SolrException(ErrorCode.SERVER_ERROR, "Failed on listing old buffer tlog", e);
+ }
+
+ String[] names = new String[fileStatuses.length];
+ for (int i = 0; i < fileStatuses.length; i++) {
+ names[i] = fileStatuses[i].getPath().getName();
+ }
+ return names;
+ }
public static String[] getLogList(FileSystem fs, Path tlogDir) {
final String prefix = TLOG_NAME + '.';
@@ -307,7 +297,35 @@ public class HdfsUpdateLog extends UpdateLog {
IOUtils.closeQuietly(fs);
}
}
-
+
+ @Override
+ protected void ensureBufferTlog() {
+ if (bufferTlog != null) return;
+ String newLogName = String.format(Locale.ROOT, LOG_FILENAME_PATTERN, BUFFER_TLOG_NAME, System.nanoTime());
+ bufferTlog = new HdfsTransactionLog(fs, new Path(tlogDir, newLogName),
+ globalStrings, tlogDfsReplication);
+ }
+
+ @Override
+ protected void deleteBufferLogs() {
+ // Delete old buffer logs
+ String[] oldBufferTlog = getBufferLogList(fs, tlogDir);
+ if (oldBufferTlog != null && oldBufferTlog.length != 0) {
+ for (String oldBufferLogName : oldBufferTlog) {
+ Path f = new Path(tlogDir, oldBufferLogName);
+ try {
+ boolean s = fs.delete(f, false);
+ if (!s) {
+ log.error("Could not remove old buffer tlog file:" + f);
+ }
+ } catch (IOException e) {
+ // No need to bubble up this exception, because it won't cause any problems on recovering
+ log.error("Could not remove old buffer tlog file:" + f, e);
+ }
+ }
+ }
+ }
+
@Override
protected void ensureLog() {
if (tlog == null) {
diff --git a/solr/core/src/java/org/apache/solr/update/TransactionLog.java b/solr/core/src/java/org/apache/solr/update/TransactionLog.java
index 96a928cc1a8..2a23896d491 100644
--- a/solr/core/src/java/org/apache/solr/update/TransactionLog.java
+++ b/solr/core/src/java/org/apache/solr/update/TransactionLog.java
@@ -85,9 +85,6 @@ public class TransactionLog implements Closeable {
Map<String, Integer> globalStringMap = new HashMap<>();
List<String> globalStringList = new ArrayList<>();
- long snapshot_size;
- int snapshot_numRecords;
-
// write a BytesRef as a byte array
static final JavaBinCodec.ObjectResolver resolver = new JavaBinCodec.ObjectResolver() {
@Override
@@ -153,7 +150,7 @@ public class TransactionLog implements Closeable {
// Parse tlog id from the filename
String filename = tlogFile.getName();
- id = Long.parseLong(filename.substring(filename.indexOf('.') + 1, filename.indexOf('.') + 20));
+ id = Long.parseLong(filename.substring(filename.lastIndexOf('.')+1));
this.tlogFile = tlogFile;
raf = new RandomAccessFile(this.tlogFile, "rw");
@@ -233,29 +230,6 @@ public class TransactionLog implements Closeable {
return true;
}
- /** takes a snapshot of the current position and number of records
- * for later possible rollback, and returns the position */
- public long snapshot() {
- synchronized (this) {
- snapshot_size = fos.size();
- snapshot_numRecords = numRecords;
- return snapshot_size;
- }
- }
-
- // This could mess with any readers or reverse readers that are open, or anything that might try to do a log lookup.
- // This should only be used to roll back buffered updates, not actually applied updates.
- public void rollback(long pos) throws IOException {
- synchronized (this) {
- assert snapshot_size == pos;
- fos.flush();
- raf.setLength(pos);
- fos.setWritten(pos);
- assert fos.size() == pos;
- numRecords = snapshot_numRecords;
- }
- }
-
public long writeData(Object o) {
@SuppressWarnings("resource") final LogCodec codec = new LogCodec(resolver);
try {
@@ -346,17 +320,16 @@ public class TransactionLog implements Closeable {
/**
* Writes an add update command to the transaction log. This is not applicable for
- * in-place updates; use {@link #write(AddUpdateCommand, long, int)}.
+ * in-place updates; use {@link #write(AddUpdateCommand, long)}.
* (The previous pointer (applicable for in-place updates) is set to -1 while writing
* the command to the transaction log.)
* @param cmd The add update command to be written
- * @param flags Options for writing the command to the transaction log
* @return Returns the position pointer of the written update command
*
- * @see #write(AddUpdateCommand, long, int)
+ * @see #write(AddUpdateCommand, long)
*/
- public long write(AddUpdateCommand cmd, int flags) {
- return write(cmd, -1, flags);
+ public long write(AddUpdateCommand cmd) {
+ return write(cmd, -1);
}
/**
@@ -365,10 +338,9 @@ public class TransactionLog implements Closeable {
* @param cmd The add update command to be written
* @param prevPointer The pointer in the transaction log which this update depends
* on (applicable for in-place updates)
- * @param flags Options for writing the command to the transaction log
* @return Returns the position pointer of the written update command
*/
- public long write(AddUpdateCommand cmd, long prevPointer, int flags) {
+ public long write(AddUpdateCommand cmd, long prevPointer) {
assert (-1 <= prevPointer && (cmd.isInPlaceUpdate() || (-1 == prevPointer)));
LogCodec codec = new LogCodec(resolver);
@@ -386,14 +358,14 @@ public class TransactionLog implements Closeable {
codec.init(out);
if (cmd.isInPlaceUpdate()) {
codec.writeTag(JavaBinCodec.ARR, 5);
- codec.writeInt(UpdateLog.UPDATE_INPLACE | flags); // should just take one byte
+ codec.writeInt(UpdateLog.UPDATE_INPLACE); // should just take one byte
codec.writeLong(cmd.getVersion());
codec.writeLong(prevPointer);
codec.writeLong(cmd.prevVersion);
codec.writeSolrInputDocument(cmd.getSolrInputDocument());
} else {
codec.writeTag(JavaBinCodec.ARR, 3);
- codec.writeInt(UpdateLog.ADD | flags); // should just take one byte
+ codec.writeInt(UpdateLog.ADD); // should just take one byte
codec.writeLong(cmd.getVersion());
codec.writeSolrInputDocument(cmd.getSolrInputDocument());
}
@@ -422,7 +394,7 @@ public class TransactionLog implements Closeable {
}
}
- public long writeDelete(DeleteUpdateCommand cmd, int flags) {
+ public long writeDelete(DeleteUpdateCommand cmd) {
LogCodec codec = new LogCodec(resolver);
try {
@@ -433,7 +405,7 @@ public class TransactionLog implements Closeable {
MemOutputStream out = new MemOutputStream(new byte[20 + br.length]);
codec.init(out);
codec.writeTag(JavaBinCodec.ARR, 3);
- codec.writeInt(UpdateLog.DELETE | flags); // should just take one byte
+ codec.writeInt(UpdateLog.DELETE); // should just take one byte
codec.writeLong(cmd.getVersion());
codec.writeByteArray(br.bytes, br.offset, br.length);
@@ -452,7 +424,7 @@ public class TransactionLog implements Closeable {
}
- public long writeDeleteByQuery(DeleteUpdateCommand cmd, int flags) {
+ public long writeDeleteByQuery(DeleteUpdateCommand cmd) {
LogCodec codec = new LogCodec(resolver);
try {
checkWriteHeader(codec, null);
@@ -460,7 +432,7 @@ public class TransactionLog implements Closeable {
MemOutputStream out = new MemOutputStream(new byte[20 + (cmd.query.length())]);
codec.init(out);
codec.writeTag(JavaBinCodec.ARR, 3);
- codec.writeInt(UpdateLog.DELETE_BY_QUERY | flags); // should just take one byte
+ codec.writeInt(UpdateLog.DELETE_BY_QUERY); // should just take one byte
codec.writeLong(cmd.getVersion());
codec.writeStr(cmd.query);
@@ -478,7 +450,7 @@ public class TransactionLog implements Closeable {
}
- public long writeCommit(CommitUpdateCommand cmd, int flags) {
+ public long writeCommit(CommitUpdateCommand cmd) {
LogCodec codec = new LogCodec(resolver);
synchronized (this) {
try {
@@ -490,7 +462,7 @@ public class TransactionLog implements Closeable {
}
codec.init(fos);
codec.writeTag(JavaBinCodec.ARR, 3);
- codec.writeInt(UpdateLog.COMMIT | flags); // should just take one byte
+ codec.writeInt(UpdateLog.COMMIT); // should just take one byte
codec.writeLong(cmd.getVersion());
codec.writeStr(END_MESSAGE); // ensure these bytes are (almost) last in the file
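
The switch from indexOf to lastIndexOf when parsing the tlog id matters because the buffer tlogs introduced later in this patch put an extra dot in the file name; a worked example (names derived from LOG_FILENAME_PATTERN, TLOG_NAME and BUFFER_TLOG_NAME in UpdateLog):

    String regular  = String.format(Locale.ROOT, "%s.%019d", "tlog", 42L);        // "tlog.0000000000000000042"
    String buffered = String.format(Locale.ROOT, "%s.%019d", "buffer.tlog", 42L); // "buffer.tlog.0000000000000000042"
    long id1 = Long.parseLong(regular.substring(regular.lastIndexOf('.') + 1));   // 42
    long id2 = Long.parseLong(buffered.substring(buffered.lastIndexOf('.') + 1)); // 42; the old
    // indexOf('.')-based substring would include "tlog." here and throw NumberFormatException
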
diff --git a/solr/core/src/java/org/apache/solr/update/UpdateLog.java b/solr/core/src/java/org/apache/solr/update/UpdateLog.java
index 7f821eafc0e..1bda23fc038 100644
--- a/solr/core/src/java/org/apache/solr/update/UpdateLog.java
+++ b/solr/core/src/java/org/apache/solr/update/UpdateLog.java
@@ -96,6 +96,7 @@ public class UpdateLog implements PluginInfoInitialized, SolrMetricProducer {
private static final long STATUS_TIME = TimeUnit.NANOSECONDS.convert(60, TimeUnit.SECONDS);
public static String LOG_FILENAME_PATTERN = "%s.%019d";
public static String TLOG_NAME="tlog";
+ public static String BUFFER_TLOG_NAME="buffer.tlog";
private static final Logger log = LoggerFactory.getLogger(MethodHandles.lookup().lookupClass());
private boolean debug = log.isDebugEnabled();
@@ -139,11 +140,7 @@ public class UpdateLog implements PluginInfoInitialized, SolrMetricProducer {
public static final int DELETE_BY_QUERY = 0x03;
public static final int COMMIT = 0x04;
public static final int UPDATE_INPLACE = 0x08;
- // Flag indicating that this is a buffered operation, and that a gap exists before buffering started.
- // for example, if full index replication starts and we are buffering updates, then this flag should
- // be set to indicate that replaying the log would not bring us into sync (i.e. peersync should
- // fail if this flag is set on the last update in the tlog).
- public static final int FLAG_GAP = 0x10;
+ // For backward-compatibility, we should delete this field in 9.0
public static final int OPERATION_MASK = 0x0f; // mask off flags to get the operation
/**
@@ -186,8 +183,8 @@ public class UpdateLog implements PluginInfoInitialized, SolrMetricProducer {
long id = -1;
protected State state = State.ACTIVE;
- protected int operationFlags; // flags to write in the transaction log with operations (i.e. FLAG_GAP)
+ protected TransactionLog bufferTlog;
protected TransactionLog tlog;
protected TransactionLog prevTlog;
protected final Deque<TransactionLog> logs = new LinkedList<>(); // list of recent logs, newest first
@@ -206,6 +203,7 @@ public class UpdateLog implements PluginInfoInitialized, SolrMetricProducer {
protected int maxNumLogsToKeep;
protected int numVersionBuckets; // This should only be used to initialize VersionInfo... the actual number of buckets may be rounded up to a power of two.
protected Long maxVersionFromIndex = null;
+ protected boolean existOldBufferLog = false;
// keep track of deletes only... this is not updated on an add
protected LinkedHashMap<BytesRef, LogPtr> oldDeletes = new LinkedHashMap<BytesRef, LogPtr>(numDeletesToKeep) {
@@ -244,7 +242,6 @@ public class UpdateLog implements PluginInfoInitialized, SolrMetricProducer {
volatile UpdateHandler uhandler; // a core reload can change this reference!
protected volatile boolean cancelApplyBufferUpdate;
List<Long> startingVersions;
- int startingOperation; // last operation in the logs on startup
// metrics
protected Gauge<Integer> bufferedOpsGauge;
@@ -378,6 +375,10 @@ public class UpdateLog implements PluginInfoInitialized, SolrMetricProducer {
log.debug("UpdateHandler init: tlogDir=" + tlogDir + ", existing tlogs=" + Arrays.asList(tlogFiles) + ", next id=" + id);
}
+ String[] oldBufferTlog = getBufferLogList(tlogDir);
+ if (oldBufferTlog != null && oldBufferTlog.length != 0) {
+ existOldBufferLog = true;
+ }
TransactionLog oldLog = null;
for (String oldLogName : tlogFiles) {
File f = new File(tlogDir, oldLogName);
@@ -408,7 +409,6 @@ public class UpdateLog implements PluginInfoInitialized, SolrMetricProducer {
// TODO: these startingVersions assume that we successfully recover from all non-complete tlogs.
try (RecentUpdates startingUpdates = getRecentUpdates()) {
startingVersions = startingUpdates.getVersions(numRecordsToKeep);
- startingOperation = startingUpdates.getLatestOperation();
// populate recent deletes list (since we can't get that info from the index)
for (int i = startingUpdates.deleteList.size() - 1; i >= 0; i--) {
@@ -434,14 +434,16 @@ public class UpdateLog implements PluginInfoInitialized, SolrMetricProducer {
this.metricManager = manager;
this.registryName = registry;
bufferedOpsGauge = () -> {
+ if (state == State.BUFFERING) {
+ if (bufferTlog == null) return 0;
+ // numRecords counts header as a record
+ return bufferTlog.numRecords() - 1;
+ }
if (tlog == null) {
return 0;
} else if (state == State.APPLYING_BUFFERED) {
// numRecords counts header as a record
return tlog.numRecords() - 1 - recoveryInfo.adds - recoveryInfo.deleteByQuery - recoveryInfo.deletes - recoveryInfo.errors;
- } else if (state == State.BUFFERING) {
- // numRecords counts header as a record
- return tlog.numRecords() - 1;
} else {
return 0;
}
@@ -472,8 +474,8 @@ public class UpdateLog implements PluginInfoInitialized, SolrMetricProducer {
return startingVersions;
}
- public int getStartingOperation() {
- return startingOperation;
+ public boolean existOldBufferLog() {
+ return existOldBufferLog;
}
/* Takes over ownership of the log, keeping it until no longer needed
@@ -509,6 +511,19 @@ public class UpdateLog implements PluginInfoInitialized, SolrMetricProducer {
logs.addFirst(oldLog);
}
+ public String[] getBufferLogList(File directory) {
+ final String prefix = BUFFER_TLOG_NAME+'.';
+ return directory.list((dir, name) -> name.startsWith(prefix));
+ }
+
+ /**
+ * Does this update come from the old tlogs (not from the buffer tlog)?
+ * If so, we must skip writing {@code cmd} to the current tlog
+ */
+ private boolean updateFromOldTlogs(UpdateCommand cmd) {
+ return (cmd.getFlags() & UpdateCommand.REPLAY) != 0 && state == State.REPLAYING;
+ }
+
public String[] getLogList(File directory) {
final String prefix = TLOG_NAME+'.';
String[] names = directory.list(new FilenameFilter() {
@@ -541,14 +556,19 @@ public class UpdateLog implements PluginInfoInitialized, SolrMetricProducer {
// if ((cmd.getFlags() & UpdateCommand.REPLAY) != 0) return;
synchronized (this) {
- long pos = -1;
+ if ((cmd.getFlags() & UpdateCommand.BUFFERING) != 0) {
+ ensureBufferTlog();
+ bufferTlog.write(cmd);
+ return;
+ }
+ long pos = -1;
long prevPointer = getPrevPointerForUpdate(cmd);
// don't log if we are replaying from another log
- if ((cmd.getFlags() & UpdateCommand.REPLAY) == 0) {
+ if (!updateFromOldTlogs(cmd)) {
ensureLog();
- pos = tlog.write(cmd, prevPointer, operationFlags);
+ pos = tlog.write(cmd, prevPointer);
}
if (!clearCaches) {
@@ -556,10 +576,7 @@ public class UpdateLog implements PluginInfoInitialized, SolrMetricProducer {
// Only currently would be useful for RTG while in recovery mode though.
LogPtr ptr = new LogPtr(pos, cmd.getVersion(), prevPointer);
- // only update our map if we're not buffering
- if ((cmd.getFlags() & UpdateCommand.BUFFERING) == 0) {
- map.put(cmd.getIndexedId(), ptr);
- }
+ map.put(cmd.getIndexedId(), ptr);
if (trace) {
log.trace("TLOG: added id " + cmd.getPrintableId() + " to " + tlog + " " + ptr + " map=" + System.identityHashCode(map));
@@ -606,22 +623,21 @@ public class UpdateLog implements PluginInfoInitialized, SolrMetricProducer {
BytesRef br = cmd.getIndexedId();
synchronized (this) {
- long pos = -1;
+ if ((cmd.getFlags() & UpdateCommand.BUFFERING) != 0) {
+ ensureBufferTlog();
+ bufferTlog.writeDelete(cmd);
+ return;
+ }
- // don't log if we are replaying from another log
- if ((cmd.getFlags() & UpdateCommand.REPLAY) == 0) {
+ long pos = -1;
+ if (!updateFromOldTlogs(cmd)) {
ensureLog();
- pos = tlog.writeDelete(cmd, operationFlags);
+ pos = tlog.writeDelete(cmd);
}
LogPtr ptr = new LogPtr(pos, cmd.version);
-
- // only update our map if we're not buffering
- if ((cmd.getFlags() & UpdateCommand.BUFFERING) == 0) {
- map.put(br, ptr);
-
- oldDeletes.put(br, ptr);
- }
+ map.put(br, ptr);
+ oldDeletes.put(br, ptr);
if (trace) {
log.trace("TLOG: added delete for id " + cmd.id + " to " + tlog + " " + ptr + " map=" + System.identityHashCode(map));
@@ -631,15 +647,20 @@ public class UpdateLog implements PluginInfoInitialized, SolrMetricProducer {
public void deleteByQuery(DeleteUpdateCommand cmd) {
synchronized (this) {
- long pos = -1;
- // don't log if we are replaying from another log
- if ((cmd.getFlags() & UpdateCommand.REPLAY) == 0) {
- ensureLog();
- pos = tlog.writeDeleteByQuery(cmd, operationFlags);
+ if ((cmd.getFlags() & UpdateCommand.BUFFERING) != 0) {
+ ensureBufferTlog();
+ bufferTlog.writeDeleteByQuery(cmd);
+ return;
}
- // only change our caches if we are not buffering
- if ((cmd.getFlags() & UpdateCommand.BUFFERING) == 0 && (cmd.getFlags() & UpdateCommand.IGNORE_INDEXWRITER) == 0) {
+ long pos = -1;
+ if (!updateFromOldTlogs(cmd)) {
+ ensureLog();
+ pos = tlog.writeDeleteByQuery(cmd);
+ }
+
+ // skip purging our caches in case of a tlog replica
+ if ((cmd.getFlags() & UpdateCommand.IGNORE_INDEXWRITER) == 0) {
// given that we just did a delete-by-query, we don't know what documents were
// affected and hence we must purge our caches.
openRealtimeSearcher();
@@ -802,7 +823,7 @@ public class UpdateLog implements PluginInfoInitialized, SolrMetricProducer {
if (prevTlog != null) {
// if we made it through the commit, write a commit command to the log
// TODO: check that this works to cap a tlog we were using to buffer so we don't replay on startup.
- prevTlog.writeCommit(cmd, operationFlags);
+ prevTlog.writeCommit(cmd);
addOldLog(prevTlog, true);
// the old log list will decref when no longer needed
@@ -1152,9 +1173,16 @@ public class UpdateLog implements PluginInfoInitialized, SolrMetricProducer {
public void copyOverBufferingUpdates(CommitUpdateCommand cuc) {
versionInfo.blockUpdates();
try {
- operationFlags &= ~FLAG_GAP;
- state = State.ACTIVE;
- copyAndSwitchToNewTlog(cuc);
+ synchronized (this) {
+ state = State.ACTIVE;
+ if (bufferTlog == null) {
+ return;
+ }
+ // by calling this, we won't switch to a new tlog (unlike applyBufferedUpdates());
+ // if we switched to a new tlog we could lose updates on the next fetch
+ copyOverOldUpdates(cuc.getVersion(), bufferTlog);
+ dropBufferTlog();
+ }
} finally {
versionInfo.unblockUpdates();
}
@@ -1165,33 +1193,25 @@ public class UpdateLog implements PluginInfoInitialized, SolrMetricProducer {
* So any updates which haven't made it to the index are preserved in the current tlog
* @param cuc any updates that have version larger than the version of cuc will be copied over
*/
- public void copyOverOldUpdates(CommitUpdateCommand cuc) {
+ public void commitAndSwitchToNewTlog(CommitUpdateCommand cuc) {
versionInfo.blockUpdates();
try {
- copyAndSwitchToNewTlog(cuc);
+ synchronized (this) {
+ if (tlog == null) {
+ return;
+ }
+ preCommit(cuc);
+ try {
+ copyOverOldUpdates(cuc.getVersion());
+ } finally {
+ postCommit(cuc);
+ }
+ }
} finally {
versionInfo.unblockUpdates();
}
}
- protected void copyAndSwitchToNewTlog(CommitUpdateCommand cuc) {
- synchronized (this) {
- if (tlog == null) {
- return;
- }
- preCommit(cuc);
- try {
- copyOverOldUpdates(cuc.getVersion());
- } finally {
- postCommit(cuc);
- }
- }
- }
-
- /**
- * Copy over updates from prevTlog or last tlog (in tlog folder) to a new tlog
- * @param commitVersion any updates that have version larger than the commitVersion will be copied over
- */
public void copyOverOldUpdates(long commitVersion) {
TransactionLog oldTlog = prevTlog;
if (oldTlog == null && !logs.isEmpty()) {
@@ -1207,6 +1227,14 @@ public class UpdateLog implements PluginInfoInitialized, SolrMetricProducer {
log.warn("Exception reading log", e);
return;
}
+ copyOverOldUpdates(commitVersion, oldTlog);
+ }
+
+ /**
+ * Copy over updates from prevTlog or last tlog (in tlog folder) to a new tlog
+ * @param commitVersion any updates that have version larger than the commitVersion will be copied over
+ */
+ public void copyOverOldUpdates(long commitVersion, TransactionLog oldTlog) {
copyOverOldUpdatesMeter.mark();
SolrQueryRequest req = new LocalSolrQueryRequest(uhandler.core,
@@ -1270,6 +1298,22 @@ public class UpdateLog implements PluginInfoInitialized, SolrMetricProducer {
}
}
+ protected void ensureBufferTlog() {
+ if (bufferTlog != null) return;
+ String newLogName = String.format(Locale.ROOT, LOG_FILENAME_PATTERN, BUFFER_TLOG_NAME, System.nanoTime());
+ bufferTlog = newTransactionLog(new File(tlogDir, newLogName), globalStrings, false);
+ }
+
+ // Cleanup old buffer tlogs
+ protected void deleteBufferLogs() {
+ String[] oldBufferTlog = getBufferLogList(tlogDir);
+ if (oldBufferTlog != null && oldBufferTlog.length != 0) {
+ for (String oldBufferLogName : oldBufferTlog) {
+ deleteFile(new File(tlogDir, oldBufferLogName));
+ }
+ }
+ }
+
protected void ensureLog() {
if (tlog == null) {
@@ -1285,7 +1329,7 @@ public class UpdateLog implements PluginInfoInitialized, SolrMetricProducer {
// record a commit
log.info("Recording current closed for " + uhandler.core + " log=" + theLog);
CommitUpdateCommand cmd = new CommitUpdateCommand(new LocalSolrQueryRequest(uhandler.core, new ModifiableSolrParams((SolrParams)null)), false);
- theLog.writeCommit(cmd, operationFlags);
+ theLog.writeCommit(cmd);
}
theLog.deleteOnClose = false;
@@ -1314,6 +1358,13 @@ public class UpdateLog implements PluginInfoInitialized, SolrMetricProducer {
log.forceClose();
}
+ if (bufferTlog != null) {
+ // should not delete bufferTlog on close; an existing bufferTlog is a signal to skip peerSync
+ bufferTlog.deleteOnClose = false;
+ bufferTlog.decref();
+ bufferTlog.forceClose();
+ }
+
try {
ExecutorUtil.shutdownAndAwaitTermination(recoveryExecutor);
} catch (Exception e) {
@@ -1347,7 +1398,6 @@ public class UpdateLog implements PluginInfoInitialized, SolrMetricProducer {
HashMap<Long, Update> updates;
List<Update> deleteByQueryList;
List<DeleteUpdate> deleteList;
- int latestOperation;
public RecentUpdates(Deque<TransactionLog> logList) {
this.logList = logList;
@@ -1401,11 +1451,6 @@ public class UpdateLog implements PluginInfoInitialized, SolrMetricProducer {
return result;
}
- public int getLatestOperation() {
- return latestOperation;
- }
-
-
private void update() {
int numUpdates = 0;
updateList = new ArrayList<>(logList.size());
@@ -1431,9 +1476,6 @@ public class UpdateLog implements PluginInfoInitialized, SolrMetricProducer {
// TODO: refactor this out so we get common error handling
int opAndFlags = (Integer)entry.get(UpdateLog.FLAGS_IDX);
- if (latestOperation == 0) {
- latestOperation = opAndFlags;
- }
int oper = opAndFlags & UpdateLog.OPERATION_MASK;
long version = (Long) entry.get(UpdateLog.VERSION_IDX);
@@ -1525,6 +1567,10 @@ public class UpdateLog implements PluginInfoInitialized, SolrMetricProducer {
tlog.incref();
logList.addFirst(tlog);
}
+ if (bufferTlog != null) {
+ bufferTlog.incref();
+ logList.addFirst(bufferTlog);
+ }
}
// TODO: what if I hand out a list of updates, then do an update, then hand out another list (and
@@ -1542,13 +1588,13 @@ public class UpdateLog implements PluginInfoInitialized, SolrMetricProducer {
// reading state and acting on it in the distributed update processor
versionInfo.blockUpdates();
try {
- if (state == State.BUFFERING) {
- log.info("Restarting buffering. previous=" + recoveryInfo);
- } else if (state != State.ACTIVE) {
+ if (state != State.ACTIVE && state != State.BUFFERING) {
// we don't currently have support for handling other states
log.warn("Unexpected state for bufferUpdates: " + state + ", Ignoring request.");
return;
}
+ dropBufferTlog();
+ deleteBufferLogs();
recoveryInfo = new RecoveryInfo();
@@ -1556,15 +1602,7 @@ public class UpdateLog implements PluginInfoInitialized, SolrMetricProducer {
log.info("Starting to buffer updates. " + this);
}
- // since we blocked updates, this synchronization shouldn't strictly be necessary.
- synchronized (this) {
- recoveryInfo.positionOfStart = tlog == null ? 0 : tlog.snapshot();
- }
-
state = State.BUFFERING;
-
- // currently, buffering is only called by recovery, meaning that there is most likely a gap in updates
- operationFlags |= FLAG_GAP;
} finally {
versionInfo.unblockUpdates();
}
@@ -1580,25 +1618,24 @@ public class UpdateLog implements PluginInfoInitialized, SolrMetricProducer {
log.info("Dropping buffered updates " + this);
}
- // since we blocked updates, this synchronization shouldn't strictly be necessary.
- synchronized (this) {
- if (tlog != null) {
- tlog.rollback(recoveryInfo.positionOfStart);
- }
- }
+ dropBufferTlog();
state = State.ACTIVE;
- operationFlags &= ~FLAG_GAP;
- } catch (IOException e) {
- SolrException.log(log,"Error attempting to roll back log", e);
- return false;
- }
- finally {
+ } finally {
versionInfo.unblockUpdates();
}
return true;
}
+ private void dropBufferTlog() {
+ synchronized (this) {
+ if (bufferTlog != null) {
+ bufferTlog.decref();
+ bufferTlog = null;
+ }
+ }
+ }
+
/** Returns the Future to wait on, or null if no replay was needed */
public Future<RecoveryInfo> applyBufferedUpdates() {
@@ -1612,27 +1649,30 @@ public class UpdateLog implements PluginInfoInitialized, SolrMetricProducer {
try {
cancelApplyBufferUpdate = false;
if (state != State.BUFFERING) return null;
- operationFlags &= ~FLAG_GAP;
- // handle case when no log was even created because no updates
- // were received.
- if (tlog == null) {
- state = State.ACTIVE;
- return null;
+ synchronized (this) {
+ // handle case when no updates were received.
+ if (bufferTlog == null) {
+ state = State.ACTIVE;
+ return null;
+ }
+ bufferTlog.incref();
}
- tlog.incref();
+
state = State.APPLYING_BUFFERED;
} finally {
versionInfo.unblockUpdates();
}
if (recoveryExecutor.isShutdown()) {
- tlog.decref();
throw new RuntimeException("executor is not running...");
}
ExecutorCompletionService<RecoveryInfo> cs = new ExecutorCompletionService<>(recoveryExecutor);
- LogReplayer replayer = new LogReplayer(Arrays.asList(new TransactionLog[]{tlog}), true);
- return cs.submit(replayer, recoveryInfo);
+ LogReplayer replayer = new LogReplayer(Collections.singletonList(bufferTlog), true);
+ return cs.submit(() -> {
+ replayer.run();
+ dropBufferTlog();
+ }, recoveryInfo);
}
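A hedged sketch of how a recovery caller might consume the returned Future (the ulog/log handles are illustrative and exception handling is omitted):

    Future<RecoveryInfo> future = ulog.applyBufferedUpdates();
    if (future == null) {
      // nothing was buffered; state is already ACTIVE again
    } else {
      RecoveryInfo info = future.get();  // waits for the replay task, which also drops the buffer tlog
      log.info("Replay of buffered updates finished: {}", info);
    }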
public State getState() {
@@ -1903,10 +1943,7 @@ public class UpdateLog implements PluginInfoInitialized, SolrMetricProducer {
if (!activeLog) {
// if we are replaying an old tlog file, we need to add a commit to the end
// so we don't replay it again if we restart right after.
-
- // if the last operation we replayed had FLAG_GAP set, we want to use that again so we don't lose it
- // as the flag on the last operation.
- translog.writeCommit(cmd, operationFlags | (operationAndFlags & ~OPERATION_MASK));
+ translog.writeCommit(cmd);
}
try {
@@ -2037,10 +2074,6 @@ public class UpdateLog implements PluginInfoInitialized, SolrMetricProducer {
return cmd;
}
- public void cancelApplyBufferedUpdates() {
- this.cancelApplyBufferUpdate = true;
- }
-
ThreadPoolExecutor recoveryExecutor = new ExecutorUtil.MDCAwareThreadPoolExecutor(0,
Integer.MAX_VALUE, 1, TimeUnit.SECONDS, new SynchronousQueue(),
new DefaultSolrThreadFactory("recoveryExecutor"));
diff --git a/solr/core/src/test-files/solr/collection1/conf/schema-tagger.xml b/solr/core/src/test-files/solr/collection1/conf/schema-tagger.xml
new file mode 100644
index 00000000000..051cd10c7a5
--- /dev/null
+++ b/solr/core/src/test-files/solr/collection1/conf/schema-tagger.xml
@@ -0,0 +1,187 @@
+ <!-- schema-tagger.xml (new, 187 lines): the XML markup of this test schema was lost in
+      extraction; only the text node below (presumably the uniqueKey value) survives. -->
+   id
diff --git a/solr/core/src/test-files/solr/collection1/conf/solrconfig-tagger.xml b/solr/core/src/test-files/solr/collection1/conf/solrconfig-tagger.xml
new file mode 100644
index 00000000000..e0d367731d5
--- /dev/null
+++ b/solr/core/src/test-files/solr/collection1/conf/solrconfig-tagger.xml
@@ -0,0 +1,59 @@
+ <!-- solrconfig-tagger.xml (new, 59 lines): the XML markup of this test config was lost in
+      extraction. Surviving text nodes, in document order: -->
+   ${tests.luceneMatchVersion:LUCENE_CURRENT}
+   ${solr.data.dir:}
+   name_tag:[* TO *]
+   name_tag:[* TO *]
+   name_tag
diff --git a/solr/core/src/test/org/apache/solr/cloud/RollingRestartTest.java b/solr/core/src/test/org/apache/solr/cloud/RollingRestartTest.java
index 14586664ec0..addf732a6df 100644
--- a/solr/core/src/test/org/apache/solr/cloud/RollingRestartTest.java
+++ b/solr/core/src/test/org/apache/solr/cloud/RollingRestartTest.java
@@ -16,6 +16,11 @@
*/
package org.apache.solr.cloud;
+import java.lang.invoke.MethodHandles;
+import java.util.ArrayList;
+import java.util.List;
+import java.util.concurrent.TimeUnit;
+
import org.apache.commons.collections.CollectionUtils;
import org.apache.solr.client.solrj.request.CollectionAdminRequest;
import org.apache.solr.common.cloud.SolrZkClient;
@@ -24,11 +29,6 @@ import org.junit.Test;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
-import java.lang.invoke.MethodHandles;
-import java.util.ArrayList;
-import java.util.List;
-import java.util.concurrent.TimeUnit;
-
public class RollingRestartTest extends AbstractFullDistribZkTestBase {
private static final Logger log = LoggerFactory.getLogger(MethodHandles.lookup().lookupClass());
diff --git a/solr/core/src/test/org/apache/solr/cloud/autoscaling/AutoScalingHandlerTest.java b/solr/core/src/test/org/apache/solr/cloud/autoscaling/AutoScalingHandlerTest.java
index 483b60c14ff..cf119535e12 100644
--- a/solr/core/src/test/org/apache/solr/cloud/autoscaling/AutoScalingHandlerTest.java
+++ b/solr/core/src/test/org/apache/solr/cloud/autoscaling/AutoScalingHandlerTest.java
@@ -17,6 +17,7 @@
package org.apache.solr.cloud.autoscaling;
+import java.io.IOException;
import java.lang.invoke.MethodHandles;
import java.util.List;
import java.util.Map;
@@ -26,6 +27,7 @@ import java.util.concurrent.TimeUnit;
import org.apache.solr.client.solrj.SolrClient;
import org.apache.solr.client.solrj.SolrRequest;
import org.apache.solr.client.solrj.SolrResponse;
+import org.apache.solr.client.solrj.SolrServerException;
import org.apache.solr.client.solrj.cloud.autoscaling.Policy;
import org.apache.solr.client.solrj.embedded.JettySolrRunner;
import org.apache.solr.client.solrj.impl.CloudSolrClient;
@@ -1011,6 +1013,25 @@ public class AutoScalingHandlerTest extends SolrCloudTestCase {
assertEquals(5L, properties.get(AutoScalingParams.ACTION_THROTTLE_PERIOD_SECONDS));
}
+ public void testUpdatePolicy() throws IOException, SolrServerException {
+ CloudSolrClient solrClient = cluster.getSolrClient();
+ String setPropertiesCommand = "{'set-cluster-policy': [" +
+ "{'cores': '<4','node': '#ANY'}]}";
+ solrClient.request(createAutoScalingRequest(SolrRequest.METHOD.POST, setPropertiesCommand));
+ SolrRequest req = createAutoScalingRequest(SolrRequest.METHOD.GET, null);
+ NamedList response = solrClient.request(req);
+ assertEquals("<4", Utils.getObjectByPath(response,false,"cluster-policy[0]/cores"));
+ assertEquals("#ANY", Utils.getObjectByPath(response,false,"cluster-policy[0]/node"));
+ setPropertiesCommand = "{'set-cluster-policy': [" +
+ "{'cores': '<3','node': '#ANY'}]}";
+ solrClient.request(createAutoScalingRequest(SolrRequest.METHOD.POST, setPropertiesCommand));
+ req = createAutoScalingRequest(SolrRequest.METHOD.GET, null);
+ response = solrClient.request(req);
+ System.out.println(response);
+
+
+ }
+
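The new testUpdatePolicy currently only prints the second GET response; a natural follow-up assertion, hypothetical and not part of this patch, would verify that the updated value replaced the old one:

    assertEquals("<3", Utils.getObjectByPath(response, false, "cluster-policy[0]/cores"));
    assertEquals("#ANY", Utils.getObjectByPath(response, false, "cluster-policy[0]/node"));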
static class AutoScalingRequest extends SolrRequest {
protected final String message;
diff --git a/solr/core/src/test/org/apache/solr/cloud/autoscaling/sim/SimCloudManager.java b/solr/core/src/test/org/apache/solr/cloud/autoscaling/sim/SimCloudManager.java
index c09d4a48c35..234eaea29a1 100644
--- a/solr/core/src/test/org/apache/solr/cloud/autoscaling/sim/SimCloudManager.java
+++ b/solr/core/src/test/org/apache/solr/cloud/autoscaling/sim/SimCloudManager.java
@@ -392,8 +392,8 @@ public class SimCloudManager implements SolrCloudManager {
public String simAddNode() throws Exception {
Map<String, Object> values = createNodeValues(null);
String nodeId = (String)values.get(ImplicitSnitch.NODE);
- clusterStateProvider.simAddNode(nodeId);
nodeStateProvider.simSetNodeValues(nodeId, values);
+ clusterStateProvider.simAddNode(nodeId);
LOG.trace("-- added node " + nodeId);
// initialize history handler if this is the first node
if (historyHandler == null && liveNodesSet.size() == 1) {
diff --git a/solr/core/src/test/org/apache/solr/cloud/autoscaling/sim/SimClusterStateProvider.java b/solr/core/src/test/org/apache/solr/cloud/autoscaling/sim/SimClusterStateProvider.java
index ca2dd48858d..20ffca92fe3 100644
--- a/solr/core/src/test/org/apache/solr/cloud/autoscaling/sim/SimClusterStateProvider.java
+++ b/solr/core/src/test/org/apache/solr/cloud/autoscaling/sim/SimClusterStateProvider.java
@@ -111,11 +111,11 @@ import static org.apache.solr.common.params.CommonParams.NAME;
public class SimClusterStateProvider implements ClusterStateProvider {
private static final Logger LOG = LoggerFactory.getLogger(MethodHandles.lookup().lookupClass());
- private final Map<String, List<ReplicaInfo>> nodeReplicaMap = new ConcurrentHashMap<>();
private final LiveNodesSet liveNodes;
private final SimDistribStateManager stateManager;
private final SimCloudManager cloudManager;
+ private final Map<String, List<ReplicaInfo>> nodeReplicaMap = new ConcurrentHashMap<>();
private final Map<String, Object> clusterProperties = new ConcurrentHashMap<>();
private final Map<String, Map<String, Object>> collProperties = new ConcurrentHashMap<>();
private final Map<String, Map<String, Map<String, Object>>> sliceProperties = new ConcurrentHashMap<>();
@@ -257,8 +257,8 @@ public class SimClusterStateProvider implements ClusterStateProvider {
try {
Set<String> collections = new HashSet<>();
// mark every replica on that node as down
- setReplicaStates(nodeId, Replica.State.DOWN, collections);
boolean res = liveNodes.remove(nodeId);
+ setReplicaStates(nodeId, Replica.State.DOWN, collections);
if (!collections.isEmpty()) {
collectionsStatesRef.set(null);
}
@@ -279,6 +279,20 @@ public class SimClusterStateProvider implements ClusterStateProvider {
}
}
+ /**
+ * Remove all replica information related to dead nodes.
+ */
+ public void simRemoveDeadNodes() throws Exception {
+ lock.lockInterruptibly();
+ try {
Set<String> myNodes = new HashSet<>(nodeReplicaMap.keySet());
+ myNodes.removeAll(liveNodes.get());
+ collectionsStatesRef.set(null);
+ } finally {
+ lock.unlock();
+ }
+ }
+
private synchronized void updateOverseerLeader() throws Exception {
if (overseerLeader != null && liveNodes.contains(overseerLeader)) {
return;
@@ -436,6 +450,8 @@ public class SimClusterStateProvider implements ClusterStateProvider {
opDelay(replicaInfo.getCollection(), CollectionParams.CollectionAction.ADDREPLICA.name());
+ // at this point nuke our cached DocCollection state
+ collectionsStatesRef.set(null);
List<ReplicaInfo> replicas = nodeReplicaMap.computeIfAbsent(nodeId, n -> new ArrayList<>());
// mark replica as active
replicaInfo.getVariables().put(ZkStateReader.STATE_PROP, Replica.State.ACTIVE.toString());
@@ -445,8 +461,6 @@ public class SimClusterStateProvider implements ClusterStateProvider {
replicaInfo.getVariables().put(Suggestion.coreidxsize, 1);
replicas.add(replicaInfo);
- // at this point nuke our cached DocCollection state
- collectionsStatesRef.set(null);
LOG.trace("-- simAddReplica {}", replicaInfo);
Map<String, Object> values = cloudManager.getSimNodeStateProvider().simGetAllNodeValues()
@@ -483,8 +497,8 @@ public class SimClusterStateProvider implements ClusterStateProvider {
* @param coreNodeName coreNodeName
*/
public void simRemoveReplica(String nodeId, String coreNodeName) throws Exception {
- List<ReplicaInfo> replicas = nodeReplicaMap.computeIfAbsent(nodeId, n -> new ArrayList<>());
lock.lockInterruptibly();
+ List<ReplicaInfo> replicas = nodeReplicaMap.computeIfAbsent(nodeId, n -> new ArrayList<>());
try {
for (int i = 0; i < replicas.size(); i++) {
if (coreNodeName.equals(replicas.get(i).getName())) {
@@ -572,7 +586,7 @@ public class SimClusterStateProvider implements ClusterStateProvider {
});
}
- private void simRunLeaderElection(String collection, Slice s, boolean saveClusterState) throws Exception {
+ private void simRunLeaderElection(String collection, Slice s, boolean saveState) throws Exception {
AtomicBoolean stateChanged = new AtomicBoolean(Boolean.FALSE);
Replica leader = s.getLeader();
if (leader == null || !liveNodes.contains(leader.getNodeName())) {
@@ -636,8 +650,9 @@ public class SimClusterStateProvider implements ClusterStateProvider {
} else {
LOG.trace("-- already has leader for {} / {}", collection, s.getName());
}
- if (stateChanged.get()) {
+ if (stateChanged.get() || saveState) {
collectionsStatesRef.set(null);
+ saveClusterState.set(true);
}
}
@@ -654,6 +669,8 @@ public class SimClusterStateProvider implements ClusterStateProvider {
List<String> nodeList = new ArrayList<>();
List<String> shardNames = new ArrayList<>();
final String collectionName = props.getStr(NAME);
+ // always force getting fresh state
+ collectionsStatesRef.set(null);
ClusterState clusterState = getClusterState();
ZkWriteCommand cmd = new ClusterStateMutator(cloudManager).createCollection(clusterState, props);
if (cmd.noop) {
@@ -758,12 +775,18 @@ public class SimClusterStateProvider implements ClusterStateProvider {
if (cores == 0) {
throw new RuntimeException("Unexpected value of 'cores' (" + cores + ") on node: " + n);
}
- cloudManager.getSimNodeStateProvider().simSetNodeValue(n, "cores", cores - 1);
+ try {
+ cloudManager.getSimNodeStateProvider().simSetNodeValue(n, "cores", cores - 1);
+ } catch (InterruptedException e) {
+ Thread.currentThread().interrupt();
+ throw new RuntimeException("interrupted");
+ }
}
}
}
});
collectionsStatesRef.set(null);
+ saveClusterState.set(true);
results.add("success", "");
} catch (Exception e) {
LOG.warn("Exception", e);
@@ -787,6 +810,7 @@ public class SimClusterStateProvider implements ClusterStateProvider {
values.put(ImplicitSnitch.DISK, 1000);
});
collectionsStatesRef.set(null);
+ saveClusterState.set(true);
} finally {
lock.unlock();
}
@@ -1057,7 +1081,7 @@ public class SimClusterStateProvider implements ClusterStateProvider {
}
}
- public synchronized void createSystemCollection() throws IOException {
+ public void createSystemCollection() throws IOException {
try {
if (simListCollections().contains(CollectionAdminParams.SYSTEM_COLL)) {
return;
@@ -1065,7 +1089,8 @@ public class SimClusterStateProvider implements ClusterStateProvider {
ZkNodeProps props = new ZkNodeProps(
NAME, CollectionAdminParams.SYSTEM_COLL,
REPLICATION_FACTOR, "1",
- OverseerCollectionMessageHandler.NUM_SLICES, "1"
+ OverseerCollectionMessageHandler.NUM_SLICES, "1",
+ CommonAdminParams.WAIT_FOR_FINAL_STATE, "true"
);
simCreateCollection(props, new NamedList());
} catch (Exception e) {
@@ -1389,7 +1414,7 @@ public class SimClusterStateProvider implements ClusterStateProvider {
});
});
if (infos.isEmpty()) {
- throw new SolrException(SolrException.ErrorCode.BAD_REQUEST, "Collection " + collection + " doesn't exist.");
+ throw new SolrException(SolrException.ErrorCode.BAD_REQUEST, "Collection " + collection + " doesn't exist (shard=" + shard + ").");
}
if (divide && value != null && (value instanceof Number)) {
if ((value instanceof Long) || (value instanceof Integer)) {
@@ -1455,6 +1480,9 @@ public class SimClusterStateProvider implements ClusterStateProvider {
nodeReplicaMap.forEach((n, replicas) -> {
replicas.forEach(ri -> collections.add(ri.getCollection()));
});
+ // check collProps and sliceProps too
+ collProperties.forEach((coll, props) -> collections.add(coll));
+ sliceProperties.forEach((coll, slices) -> collections.add(coll));
return new ArrayList<>(collections);
} finally {
lock.unlock();
diff --git a/solr/core/src/test/org/apache/solr/cloud/autoscaling/sim/SimNodeStateProvider.java b/solr/core/src/test/org/apache/solr/cloud/autoscaling/sim/SimNodeStateProvider.java
index b9169eb2263..cb8640c155e 100644
--- a/solr/core/src/test/org/apache/solr/cloud/autoscaling/sim/SimNodeStateProvider.java
+++ b/solr/core/src/test/org/apache/solr/cloud/autoscaling/sim/SimNodeStateProvider.java
@@ -29,6 +29,7 @@ import java.util.Set;
import java.util.TreeSet;
import java.util.concurrent.ConcurrentHashMap;
import java.util.concurrent.atomic.AtomicBoolean;
+import java.util.concurrent.locks.ReentrantLock;
import java.util.stream.Collectors;
import org.apache.solr.client.solrj.cloud.NodeStateProvider;
@@ -50,6 +51,7 @@ public class SimNodeStateProvider implements NodeStateProvider {
private final SimClusterStateProvider clusterStateProvider;
private final SimDistribStateManager stateManager;
private final LiveNodesSet liveNodesSet;
+ private final ReentrantLock lock = new ReentrantLock();
public SimNodeStateProvider(LiveNodesSet liveNodesSet, SimDistribStateManager stateManager,
SimClusterStateProvider clusterStateProvider,
@@ -84,14 +86,19 @@ public class SimNodeStateProvider implements NodeStateProvider {
* @param node node id
* @param values values.
*/
- public void simSetNodeValues(String node, Map<String, Object> values) {
- Map<String, Object> existing = nodeValues.computeIfAbsent(node, n -> new ConcurrentHashMap<>());
- existing.clear();
- if (values != null) {
- existing.putAll(values);
- }
- if (values == null || values.isEmpty() || values.containsKey("nodeRole")) {
- saveRoles();
+ public void simSetNodeValues(String node, Map<String, Object> values) throws InterruptedException {
+ lock.lockInterruptibly();
+ try {
+ Map<String, Object> existing = nodeValues.computeIfAbsent(node, n -> new ConcurrentHashMap<>());
+ existing.clear();
+ if (values != null) {
+ existing.putAll(values);
+ }
+ if (values == null || values.isEmpty() || values.containsKey("nodeRole")) {
+ saveRoles();
+ }
+ } finally {
+ lock.unlock();
}
}
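Every mutator in this class now follows the same lockInterruptibly/try/finally shape; a minimal generic sketch of the idiom (names illustrative):

    private final ReentrantLock lock = new ReentrantLock();

    void mutate() throws InterruptedException {
      lock.lockInterruptibly();  // stay responsive to interrupts while contending for the lock
      try {
        // ... update nodeValues and, if a nodeRole changed, call saveRoles() ...
      } finally {
        lock.unlock();           // always released, even if the body throws
      }
    }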
@@ -102,15 +109,20 @@ public class SimNodeStateProvider implements NodeStateProvider {
* @param key property name
* @param value property value
*/
- public void simSetNodeValue(String node, String key, Object value) {
- Map<String, Object> existing = nodeValues.computeIfAbsent(node, n -> new ConcurrentHashMap<>());
- if (value == null) {
- existing.remove(key);
- } else {
- existing.put(key, value);
- }
- if (key.equals("nodeRole")) {
- saveRoles();
+ public void simSetNodeValue(String node, String key, Object value) throws InterruptedException {
+ lock.lockInterruptibly();
+ try {
+ Map<String, Object> existing = nodeValues.computeIfAbsent(node, n -> new ConcurrentHashMap<>());
+ if (value == null) {
+ existing.remove(key);
+ } else {
+ existing.put(key, value);
+ }
+ if (key.equals("nodeRole")) {
+ saveRoles();
+ }
+ } finally {
+ lock.unlock();
}
}
@@ -121,21 +133,26 @@ public class SimNodeStateProvider implements NodeStateProvider {
* @param key property name
* @param value property value.
*/
- public void simAddNodeValue(String node, String key, Object value) {
- Map<String, Object> values = nodeValues.computeIfAbsent(node, n -> new ConcurrentHashMap<>());
- Object existing = values.get(key);
- if (existing == null) {
- values.put(key, value);
- } else if (existing instanceof Set) {
- ((Set)existing).add(value);
- } else {
- Set<Object> vals = new HashSet<>();
- vals.add(existing);
- vals.add(value);
- values.put(key, vals);
- }
- if (key.equals("nodeRole")) {
- saveRoles();
+ public void simAddNodeValue(String node, String key, Object value) throws InterruptedException {
+ lock.lockInterruptibly();
+ try {
+ Map<String, Object> values = nodeValues.computeIfAbsent(node, n -> new ConcurrentHashMap<>());
+ Object existing = values.get(key);
+ if (existing == null) {
+ values.put(key, value);
+ } else if (existing instanceof Set) {
+ ((Set)existing).add(value);
+ } else {
+ Set<Object> vals = new HashSet<>();
+ vals.add(existing);
+ vals.add(value);
+ values.put(key, vals);
+ }
+ if (key.equals("nodeRole")) {
+ saveRoles();
+ }
+ } finally {
+ lock.unlock();
}
}
@@ -144,10 +161,16 @@ public class SimNodeStateProvider implements NodeStateProvider {
* /roles.json is updated.
* @param node node id
*/
- public void simRemoveNodeValues(String node) {
- Map<String, Object> values = nodeValues.remove(node);
- if (values != null && values.containsKey("nodeRole")) {
- saveRoles();
+ public void simRemoveNodeValues(String node) throws InterruptedException {
+ LOG.debug("--removing value for " + node);
+ lock.lockInterruptibly();
+ try {
+ Map<String, Object> values = nodeValues.remove(node);
+ if (values != null && values.containsKey("nodeRole")) {
+ saveRoles();
+ }
+ } finally {
+ lock.unlock();
}
}
@@ -155,19 +178,24 @@ public class SimNodeStateProvider implements NodeStateProvider {
* Remove values that correspond to dead nodes. If values contained a 'nodeRole'
* key then /roles.json is updated.
*/
- public void simRemoveDeadNodes() {
+ public void simRemoveDeadNodes() throws InterruptedException {
Set<String> myNodes = new HashSet<>(nodeValues.keySet());
myNodes.removeAll(liveNodesSet.get());
- AtomicBoolean updateRoles = new AtomicBoolean(false);
- myNodes.forEach(n -> {
- LOG.debug("- removing dead node values: " + n);
- Map<String, Object> vals = nodeValues.remove(n);
- if (vals.containsKey("nodeRole")) {
- updateRoles.set(true);
+ lock.lockInterruptibly();
+ try {
+ AtomicBoolean updateRoles = new AtomicBoolean(false);
+ myNodes.forEach(n -> {
+ LOG.debug("- removing dead node values: " + n);
+ Map<String, Object> vals = nodeValues.remove(n);
+ if (vals.containsKey("nodeRole")) {
+ updateRoles.set(true);
+ }
+ });
+ if (updateRoles.get()) {
+ saveRoles();
}
- });
- if (updateRoles.get()) {
- saveRoles();
+ } finally {
+ lock.unlock();
}
}
@@ -187,7 +215,7 @@ public class SimNodeStateProvider implements NodeStateProvider {
return nodeValues;
}
- private synchronized void saveRoles() {
+ private void saveRoles() {
final Map<String, Set<String>> roles = new HashMap<>();
nodeValues.forEach((n, values) -> {
String nodeRole = (String)values.get("nodeRole");
@@ -211,6 +239,9 @@ public class SimNodeStateProvider implements NodeStateProvider {
* @return map of metrics names / values
*/
public Map<String, Object> getReplicaMetricsValues(String node, Collection<String> tags) {
+ if (!liveNodesSet.contains(node)) {
+ throw new RuntimeException("non-live node " + node);
+ }
List<ReplicaInfo> replicas = clusterStateProvider.simGetReplicaInfos(node);
if (replicas == null || replicas.isEmpty()) {
return Collections.emptyMap();
@@ -258,8 +289,7 @@ public class SimNodeStateProvider implements NodeStateProvider {
public Map<String, Object> getNodeValues(String node, Collection<String> tags) {
LOG.trace("-- requested values for " + node + ": " + tags);
if (!liveNodesSet.contains(node)) {
- nodeValues.remove(node);
- return Collections.emptyMap();
+ throw new RuntimeException("non-live node " + node);
}
if (tags.isEmpty()) {
return Collections.emptyMap();
diff --git a/solr/core/src/test/org/apache/solr/cloud/autoscaling/sim/SimSolrCloudTestCase.java b/solr/core/src/test/org/apache/solr/cloud/autoscaling/sim/SimSolrCloudTestCase.java
index 757e2975cd9..e83f72f5712 100644
--- a/solr/core/src/test/org/apache/solr/cloud/autoscaling/sim/SimSolrCloudTestCase.java
+++ b/solr/core/src/test/org/apache/solr/cloud/autoscaling/sim/SimSolrCloudTestCase.java
@@ -84,6 +84,10 @@ public class SimSolrCloudTestCase extends SolrTestCaseJ4 {
// clear any persisted configuration
cluster.getDistribStateManager().setData(SOLR_AUTOSCALING_CONF_PATH, Utils.toJSON(new ZkNodeProps()), -1);
cluster.getDistribStateManager().setData(ZkStateReader.ROLES, Utils.toJSON(new HashMap<>()), -1);
+ cluster.getSimClusterStateProvider().simDeleteAllCollections();
+ cluster.simClearSystemCollection();
+ cluster.getSimNodeStateProvider().simRemoveDeadNodes();
+ cluster.getSimClusterStateProvider().simRemoveDeadNodes();
// restore the expected number of nodes
int currentSize = cluster.getLiveNodesSet().size();
if (currentSize < clusterNodeCount) {
@@ -99,10 +103,6 @@ public class SimSolrCloudTestCase extends SolrTestCaseJ4 {
removeChildren(ZkStateReader.SOLR_AUTOSCALING_TRIGGER_STATE_PATH);
removeChildren(ZkStateReader.SOLR_AUTOSCALING_NODE_LOST_PATH);
removeChildren(ZkStateReader.SOLR_AUTOSCALING_NODE_ADDED_PATH);
- cluster.getSimClusterStateProvider().simDeleteAllCollections();
- cluster.simClearSystemCollection();
- // clear any dead nodes
- cluster.getSimNodeStateProvider().simRemoveDeadNodes();
cluster.getSimClusterStateProvider().simResetLeaderThrottles();
cluster.simRestartOverseer(null);
cluster.getTimeSource().sleep(5000);
diff --git a/solr/core/src/test/org/apache/solr/cloud/autoscaling/sim/TestClusterStateProvider.java b/solr/core/src/test/org/apache/solr/cloud/autoscaling/sim/TestClusterStateProvider.java
index 71106452ffb..e395985d027 100644
--- a/solr/core/src/test/org/apache/solr/cloud/autoscaling/sim/TestClusterStateProvider.java
+++ b/solr/core/src/test/org/apache/solr/cloud/autoscaling/sim/TestClusterStateProvider.java
@@ -109,7 +109,11 @@ public class TestClusterStateProvider extends SolrCloudTestCase {
simCloudManager.getSimClusterStateProvider().simSetClusterProperties(clusterProperties);
simCloudManager.getSimDistribStateManager().simSetAutoScalingConfig(autoScalingConfig);
nodeValues.forEach((n, values) -> {
- simCloudManager.getSimNodeStateProvider().simSetNodeValues(n, values);
+ try {
+ simCloudManager.getSimNodeStateProvider().simSetNodeValues(n, values);
+ } catch (InterruptedException e) {
+ fail("Interrupted:" + e);
+ }
});
simCloudManager.getSimClusterStateProvider().simSetClusterState(realState);
ClusterState simState = simCloudManager.getClusterStateProvider().getClusterState();
diff --git a/solr/core/src/test/org/apache/solr/cloud/autoscaling/sim/TestLargeCluster.java b/solr/core/src/test/org/apache/solr/cloud/autoscaling/sim/TestLargeCluster.java
index 6d53363a078..934d2ea77cb 100644
--- a/solr/core/src/test/org/apache/solr/cloud/autoscaling/sim/TestLargeCluster.java
+++ b/solr/core/src/test/org/apache/solr/cloud/autoscaling/sim/TestLargeCluster.java
@@ -33,7 +33,6 @@ import java.util.concurrent.atomic.AtomicInteger;
import com.carrotsearch.randomizedtesting.annotations.ThreadLeakLingering;
import com.carrotsearch.randomizedtesting.annotations.TimeoutSuite;
import org.apache.commons.math3.stat.descriptive.SummaryStatistics;
-import org.apache.lucene.util.LuceneTestCase;
import org.apache.solr.client.solrj.SolrClient;
import org.apache.solr.client.solrj.SolrRequest;
import org.apache.solr.client.solrj.cloud.autoscaling.AutoScalingConfig;
@@ -54,12 +53,14 @@ import org.apache.solr.cloud.autoscaling.CapturedEvent;
import org.apache.solr.cloud.autoscaling.TriggerValidationException;
import org.apache.solr.common.SolrInputDocument;
import org.apache.solr.common.cloud.Replica;
+import org.apache.solr.common.params.CollectionAdminParams;
import org.apache.solr.common.params.CollectionParams;
import org.apache.solr.common.util.NamedList;
import org.apache.solr.common.util.Pair;
import org.apache.solr.common.util.TimeSource;
import org.apache.solr.core.SolrResourceLoader;
import org.apache.solr.util.LogLevel;
+import org.apache.solr.util.TimeOut;
import org.junit.Before;
import org.junit.BeforeClass;
import org.junit.Test;
@@ -74,7 +75,7 @@ import static org.apache.solr.cloud.autoscaling.AutoScalingHandlerTest.createAut
@TimeoutSuite(millis = 4 * 3600 * 1000)
@LogLevel("org.apache.solr.cloud.autoscaling=DEBUG")
@ThreadLeakLingering(linger = 20000) // ComputePlanAction may take significant time to complete
-@LuceneTestCase.BadApple(bugUrl = "https://issues.apache.org/jira/browse/SOLR-12075")
+//@LuceneTestCase.BadApple(bugUrl = "https://issues.apache.org/jira/browse/SOLR-12075")
public class TestLargeCluster extends SimSolrCloudTestCase {
private static final Logger log = LoggerFactory.getLogger(MethodHandles.lookup().lookupClass());
@@ -83,8 +84,9 @@ public class TestLargeCluster extends SimSolrCloudTestCase {
public static final int NUM_NODES = 100;
static Map<String, List<CapturedEvent>> listenerEvents = new ConcurrentHashMap<>();
- static AtomicInteger triggerFiredCount = new AtomicInteger();
- static CountDownLatch triggerFiredLatch;
+ static AtomicInteger triggerFinishedCount = new AtomicInteger();
+ static AtomicInteger triggerStartedCount = new AtomicInteger();
+ static CountDownLatch triggerFinishedLatch;
static int waitForSeconds;
@BeforeClass
@@ -94,10 +96,10 @@ public class TestLargeCluster extends SimSolrCloudTestCase {
@Before
public void setupTest() throws Exception {
-
waitForSeconds = 5;
- triggerFiredCount.set(0);
- triggerFiredLatch = new CountDownLatch(1);
+ triggerStartedCount.set(0);
+ triggerFinishedCount.set(0);
+ triggerFinishedLatch = new CountDownLatch(1);
listenerEvents.clear();
// disable .scheduled_maintenance
String suspendTriggerCommand = "{" +
@@ -107,6 +109,13 @@ public class TestLargeCluster extends SimSolrCloudTestCase {
SolrClient solrClient = cluster.simGetSolrClient();
NamedList response = solrClient.request(req);
assertEquals(response.get("result").toString(), "success");
+
+ // do this in advance if missing
+ if (!cluster.getSimClusterStateProvider().simListCollections().contains(CollectionAdminParams.SYSTEM_COLL)) {
+ cluster.getSimClusterStateProvider().createSystemCollection();
+ CloudTestUtils.waitForState(cluster, CollectionAdminParams.SYSTEM_COLL, 120, TimeUnit.SECONDS,
+ CloudTestUtils.clusterShape(1, 1));
+ }
}
public static class TestTriggerListener extends TriggerListenerBase {
@@ -123,11 +132,18 @@ public class TestLargeCluster extends SimSolrCloudTestCase {
}
}
- public static class TestTriggerAction extends TriggerActionBase {
+ public static class FinishTriggerAction extends TriggerActionBase {
@Override
public void process(TriggerEvent event, ActionContext context) throws Exception {
- triggerFiredCount.incrementAndGet();
- triggerFiredLatch.countDown();
+ triggerFinishedCount.incrementAndGet();
+ triggerFinishedLatch.countDown();
+ }
+ }
+
+ public static class StartTriggerAction extends TriggerActionBase {
+ @Override
+ public void process(TriggerEvent event, ActionContext context) throws Exception {
+ triggerStartedCount.incrementAndGet();
}
}
@@ -136,14 +152,15 @@ public class TestLargeCluster extends SimSolrCloudTestCase {
SolrClient solrClient = cluster.simGetSolrClient();
String setTriggerCommand = "{" +
"'set-trigger' : {" +
- "'name' : 'node_lost_trigger'," +
+ "'name' : 'node_lost_trigger1'," +
"'event' : 'nodeLost'," +
"'waitFor' : '" + waitForSeconds + "s'," +
"'enabled' : true," +
"'actions' : [" +
+ "{'name':'start','class':'" + StartTriggerAction.class.getName() + "'}," +
"{'name':'compute','class':'" + ComputePlanAction.class.getName() + "'}," +
"{'name':'execute','class':'" + ExecutePlanAction.class.getName() + "'}," +
- "{'name':'test','class':'" + TestTriggerAction.class.getName() + "'}" +
+ "{'name':'test','class':'" + FinishTriggerAction.class.getName() + "'}" +
"]" +
"}}";
SolrRequest req = createAutoScalingRequest(SolrRequest.METHOD.POST, setTriggerCommand);
@@ -154,7 +171,7 @@ public class TestLargeCluster extends SimSolrCloudTestCase {
"'set-listener' : " +
"{" +
"'name' : 'foo'," +
- "'trigger' : 'node_lost_trigger'," +
+ "'trigger' : 'node_lost_trigger1'," +
"'stage' : ['STARTED','ABORTED','SUCCEEDED', 'FAILED']," +
"'beforeAction' : ['compute', 'execute']," +
"'afterAction' : ['compute', 'execute']," +
@@ -217,6 +234,19 @@ public class TestLargeCluster extends SimSolrCloudTestCase {
}
}
+ // wait until started == finished
+ TimeOut timeOut = new TimeOut(20 * waitForSeconds * NUM_NODES, TimeUnit.SECONDS, cluster.getTimeSource());
+ while (!timeOut.hasTimedOut()) {
+ if (triggerStartedCount.get() == triggerFinishedCount.get()) {
+ break;
+ }
+ timeOut.sleep(1000);
+ }
+ if (timeOut.hasTimedOut()) {
+ fail("did not finish processing all events in time: started=" + triggerStartedCount.get() + ", finished=" + triggerFinishedCount.get());
+ }
+
+
log.info("Ready after " + CloudTestUtils.waitForState(cluster, collectionName, 30 * nodes.size(), TimeUnit.SECONDS,
CloudTestUtils.clusterShape(5, 15)) + "ms");
long newMoveReplicaOps = cluster.simGetOpCount(CollectionParams.CollectionAction.MOVEREPLICA.name());
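The wait-until-started-equals-finished loop above recurs in the other tests of this class; a helper capturing it might look like this sketch (uses only the TimeOut and AtomicInteger types the test already imports):

    private static void waitForEventsProcessed(TimeOut timeOut, AtomicInteger started, AtomicInteger finished)
        throws InterruptedException {
      while (!timeOut.hasTimedOut()) {
        if (started.get() == finished.get()) {
          return;                // every trigger run that started has also finished
        }
        timeOut.sleep(1000);     // advances via the cluster's simulated TimeSource
      }
      fail("did not finish processing all events in time: started=" + started.get()
          + ", finished=" + finished.get());
    }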
@@ -232,14 +262,15 @@ public class TestLargeCluster extends SimSolrCloudTestCase {
SolrClient solrClient = cluster.simGetSolrClient();
String setTriggerCommand = "{" +
"'set-trigger' : {" +
- "'name' : 'node_added_trigger'," +
+ "'name' : 'node_added_trigger2'," +
"'event' : 'nodeAdded'," +
"'waitFor' : '" + waitForSeconds + "s'," +
"'enabled' : true," +
"'actions' : [" +
+ "{'name':'start','class':'" + StartTriggerAction.class.getName() + "'}," +
"{'name':'compute','class':'" + ComputePlanAction.class.getName() + "'}," +
"{'name':'execute','class':'" + ExecutePlanAction.class.getName() + "'}," +
- "{'name':'test','class':'" + TestTriggerAction.class.getName() + "'}" +
+ "{'name':'test','class':'" + FinishTriggerAction.class.getName() + "'}" +
"]" +
"}}";
SolrRequest req = createAutoScalingRequest(SolrRequest.METHOD.POST, setTriggerCommand);
@@ -257,20 +288,34 @@ public class TestLargeCluster extends SimSolrCloudTestCase {
log.info("Ready after " + CloudTestUtils.waitForState(cluster, collectionName, 20 * NUM_NODES, TimeUnit.SECONDS,
CloudTestUtils.clusterShape(NUM_NODES / 10, NUM_NODES / 8 * 3)) + " ms");
+ // start adding nodes
int numAddNode = NUM_NODES / 5;
List<String> addNodesList = new ArrayList<>(numAddNode);
for (int i = 0; i < numAddNode; i++) {
addNodesList.add(cluster.simAddNode());
cluster.getTimeSource().sleep(5000);
}
- boolean await = triggerFiredLatch.await(1000000 / SPEED, TimeUnit.MILLISECONDS);
+ // wait until at least one event is generated
+ boolean await = triggerFinishedLatch.await(10000 / SPEED, TimeUnit.MILLISECONDS);
assertTrue("trigger did not fire", await);
+ // wait until started == finished
+ TimeOut timeOut = new TimeOut(20 * waitForSeconds * NUM_NODES, TimeUnit.SECONDS, cluster.getTimeSource());
+ while (!timeOut.hasTimedOut()) {
+ if (triggerStartedCount.get() == triggerFinishedCount.get()) {
+ break;
+ }
+ timeOut.sleep(1000);
+ }
+ if (timeOut.hasTimedOut()) {
+ fail("did not finish processing all events in time: started=" + triggerStartedCount.get() + ", finished=" + triggerFinishedCount.get());
+ }
+
List<SolrInputDocument> systemColl = cluster.simGetSystemCollection();
int startedEventPos = -1;
for (int i = 0; i < systemColl.size(); i++) {
SolrInputDocument d = systemColl.get(i);
- if (!"node_added_trigger".equals(d.getFieldValue("event.source_s"))) {
+ if (!"node_added_trigger2".equals(d.getFieldValue("event.source_s"))) {
continue;
}
if ("NODEADDED".equals(d.getFieldValue("event.type_s")) &&
@@ -292,13 +337,13 @@ public class TestLargeCluster extends SimSolrCloudTestCase {
SolrInputDocument finishedEvent = null;
long lastNumOps = cluster.simGetOpCount("MOVEREPLICA");
while (count-- > 0) {
- cluster.getTimeSource().sleep(150000);
+ cluster.getTimeSource().sleep(10000);
long currentNumOps = cluster.simGetOpCount("MOVEREPLICA");
if (currentNumOps == lastNumOps) {
int size = systemColl.size() - 1;
for (int i = size; i > lastIgnoredPos; i--) {
SolrInputDocument d = systemColl.get(i);
- if (!"node_added_trigger".equals(d.getFieldValue("event.source_s"))) {
+ if (!"node_added_trigger2".equals(d.getFieldValue("event.source_s"))) {
continue;
}
if ("SUCCEEDED".equals(d.getFieldValue("stage_s"))) {
@@ -401,14 +446,15 @@ public class TestLargeCluster extends SimSolrCloudTestCase {
SolrClient solrClient = cluster.simGetSolrClient();
String setTriggerCommand = "{" +
"'set-trigger' : {" +
- "'name' : 'node_lost_trigger'," +
+ "'name' : 'node_lost_trigger3'," +
"'event' : 'nodeLost'," +
"'waitFor' : '" + waitFor + "s'," +
"'enabled' : true," +
"'actions' : [" +
+ "{'name':'start','class':'" + StartTriggerAction.class.getName() + "'}," +
"{'name':'compute','class':'" + ComputePlanAction.class.getName() + "'}," +
"{'name':'execute','class':'" + ExecutePlanAction.class.getName() + "'}," +
- "{'name':'test','class':'" + TestTriggerAction.class.getName() + "'}" +
+ "{'name':'test','class':'" + FinishTriggerAction.class.getName() + "'}" +
"]" +
"}}";
SolrRequest req = createAutoScalingRequest(SolrRequest.METHOD.POST, setTriggerCommand);
@@ -435,8 +481,8 @@ public class TestLargeCluster extends SimSolrCloudTestCase {
cluster.simRemoveNode(nodes.get(i), false);
cluster.getTimeSource().sleep(killDelay);
}
- // wait for the trigger to fire
- boolean await = triggerFiredLatch.await(20 * waitFor * 1000 / SPEED, TimeUnit.MILLISECONDS);
+ // wait for the trigger to fire at least once
+ boolean await = triggerFinishedLatch.await(20 * waitFor * 1000 / SPEED, TimeUnit.MILLISECONDS);
assertTrue("trigger did not fire within timeout, " +
"waitFor=" + waitFor + ", killDelay=" + killDelay + ", minIgnored=" + minIgnored,
await);
@@ -444,7 +490,7 @@ public class TestLargeCluster extends SimSolrCloudTestCase {
int startedEventPos = -1;
for (int i = 0; i < systemColl.size(); i++) {
SolrInputDocument d = systemColl.get(i);
- if (!"node_lost_trigger".equals(d.getFieldValue("event.source_s"))) {
+ if (!"node_lost_trigger3".equals(d.getFieldValue("event.source_s"))) {
continue;
}
if ("NODELOST".equals(d.getFieldValue("event.type_s")) &&
@@ -457,11 +503,22 @@ public class TestLargeCluster extends SimSolrCloudTestCase {
"waitFor=" + waitFor + ", killDelay=" + killDelay + ", minIgnored=" + minIgnored,
startedEventPos > -1);
SolrInputDocument startedEvent = systemColl.get(startedEventPos);
+ // wait until started == finished
+ TimeOut timeOut = new TimeOut(20 * waitFor * NUM_NODES, TimeUnit.SECONDS, cluster.getTimeSource());
+ while (!timeOut.hasTimedOut()) {
+ if (triggerStartedCount.get() == triggerFinishedCount.get()) {
+ break;
+ }
+ timeOut.sleep(1000);
+ }
+ if (timeOut.hasTimedOut()) {
+ fail("did not finish processing all events in time: started=" + triggerStartedCount.get() + ", finished=" + triggerFinishedCount.get());
+ }
int ignored = 0;
int lastIgnoredPos = startedEventPos;
for (int i = startedEventPos + 1; i < systemColl.size(); i++) {
SolrInputDocument d = systemColl.get(i);
- if (!"node_lost_trigger".equals(d.getFieldValue("event.source_s"))) {
+ if (!"node_lost_trigger3".equals(d.getFieldValue("event.source_s"))) {
continue;
}
if ("NODELOST".equals(d.getFieldValue("event.type_s"))) {
@@ -486,13 +543,13 @@ public class TestLargeCluster extends SimSolrCloudTestCase {
SolrInputDocument finishedEvent = null;
long lastNumOps = cluster.simGetOpCount("MOVEREPLICA");
while (count-- > 0) {
- cluster.getTimeSource().sleep(150000);
+ cluster.getTimeSource().sleep(waitFor * 10000);
long currentNumOps = cluster.simGetOpCount("MOVEREPLICA");
if (currentNumOps == lastNumOps) {
int size = systemColl.size() - 1;
for (int i = size; i > lastIgnoredPos; i--) {
SolrInputDocument d = systemColl.get(i);
- if (!"node_lost_trigger".equals(d.getFieldValue("event.source_s"))) {
+ if (!"node_lost_trigger3".equals(d.getFieldValue("event.source_s"))) {
continue;
}
if ("SUCCEEDED".equals(d.getFieldValue("stage_s"))) {
@@ -520,8 +577,7 @@ public class TestLargeCluster extends SimSolrCloudTestCase {
}
@Test
- // JIRA closed 24-Feb-2018. Still apparently a problem.
- @BadApple(bugUrl = "https://issues.apache.org/jira/browse/SOLR-11714")
+ //@BadApple(bugUrl = "https://issues.apache.org/jira/browse/SOLR-11714")
public void testSearchRate() throws Exception {
SolrClient solrClient = cluster.simGetSolrClient();
String collectionName = "testSearchRate";
@@ -555,7 +611,7 @@ public class TestLargeCluster extends SimSolrCloudTestCase {
"'actions' : [" +
"{'name':'compute','class':'" + ComputePlanAction.class.getName() + "'}," +
"{'name':'execute','class':'" + ExecutePlanAction.class.getName() + "'}," +
- "{'name':'test','class':'" + TestTriggerAction.class.getName() + "'}" +
+ "{'name':'test','class':'" + FinishTriggerAction.class.getName() + "'}" +
"]" +
"}}";
SolrRequest req = createAutoScalingRequest(SolrRequest.METHOD.POST, setTriggerCommand);
@@ -575,7 +631,7 @@ public class TestLargeCluster extends SimSolrCloudTestCase {
assertEquals(response.get("result").toString(), "success");
- boolean await = triggerFiredLatch.await(40000 / SPEED, TimeUnit.MILLISECONDS);
+ boolean await = triggerFinishedLatch.await(waitForSeconds * 20000 / SPEED, TimeUnit.MILLISECONDS);
assertTrue("The trigger did not fire at all", await);
// wait for listener to capture the SUCCEEDED stage
cluster.getTimeSource().sleep(2000);
diff --git a/solr/core/src/test/org/apache/solr/core/ResourceLoaderTest.java b/solr/core/src/test/org/apache/solr/core/ResourceLoaderTest.java
index 3d212c51c36..dc06a6f7c65 100644
--- a/solr/core/src/test/org/apache/solr/core/ResourceLoaderTest.java
+++ b/solr/core/src/test/org/apache/solr/core/ResourceLoaderTest.java
@@ -72,7 +72,8 @@ public class ResourceLoaderTest extends SolrTestCaseJ4 {
Class<?> clazz = ResourceLoaderAware.class;
// Check ResourceLoaderAware valid objects
- assertAwareCompatibility(clazz, new NGramFilterFactory(new HashMap<>()));
+ //noinspection unchecked
+ assertAwareCompatibility(clazz, new NGramFilterFactory(map("minGramSize", "1", "maxGramSize", "2")));
assertAwareCompatibility(clazz, new KeywordTokenizerFactory(new HashMap<>()));
// Make sure it throws an error for invalid objects
@@ -98,9 +99,10 @@ public class ResourceLoaderTest extends SolrTestCaseJ4 {
assertAwareCompatibility(clazz, new JSONResponseWriter());
// Make sure it throws an error for invalid objects
+ //noinspection unchecked
invalid = new Object[] {
- new NGramFilterFactory(new HashMap<>()),
- "hello", 12.3f,
+ new NGramFilterFactory(map("minGramSize", "1", "maxGramSize", "2")),
+ "hello", 12.3f ,
new KeywordTokenizerFactory(new HashMap<>())
};
for( Object obj : invalid ) {
diff --git a/solr/core/src/test/org/apache/solr/handler/tagger/EmbeddedSolrNoSerializeTest.java b/solr/core/src/test/org/apache/solr/handler/tagger/EmbeddedSolrNoSerializeTest.java
new file mode 100644
index 00000000000..8d31ad007ef
--- /dev/null
+++ b/solr/core/src/test/org/apache/solr/handler/tagger/EmbeddedSolrNoSerializeTest.java
@@ -0,0 +1,153 @@
+/*
+ * This software was produced for the U. S. Government
+ * under Contract No. W15P7T-11-C-F600, and is
+ * subject to the Rights in Noncommercial Computer Software
+ * and Noncommercial Computer Software Documentation
+ * Clause 252.227-7014 (JUN 1995)
+ *
+ * Copyright 2013 The MITRE Corporation. All Rights Reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.solr.handler.tagger;
+
+import java.io.IOException;
+import java.util.Collection;
+import java.util.Collections;
+import java.util.concurrent.atomic.AtomicReference;
+import java.util.function.BiFunction;
+
+import org.apache.lucene.document.Field;
+import org.apache.solr.SolrTestCaseJ4;
+import org.apache.solr.client.solrj.SolrServerException;
+import org.apache.solr.client.solrj.StreamingResponseCallback;
+import org.apache.solr.client.solrj.embedded.EmbeddedSolrServer;
+import org.apache.solr.client.solrj.request.QueryRequest;
+import org.apache.solr.client.solrj.response.QueryResponse;
+import org.apache.solr.common.SolrDocument;
+import org.apache.solr.common.SolrDocumentList;
+import org.apache.solr.common.params.ModifiableSolrParams;
+import org.apache.solr.common.params.SolrParams;
+import org.apache.solr.common.util.ContentStream;
+import org.apache.solr.common.util.ContentStreamBase;
+import org.junit.Before;
+import org.junit.BeforeClass;
+import org.junit.Ignore;
+import org.junit.Test;
+
+/**
+ * Tests that we can skip serialization of the documents when embedding
+ * Solr.
+ */
+public class EmbeddedSolrNoSerializeTest extends SolrTestCaseJ4 {
+
+ static EmbeddedSolrServer solrServer;
+
+ @BeforeClass
+ public static void init() throws Exception {
+ initCore("solrconfig-tagger.xml", "schema-tagger.xml");
+ solrServer = new EmbeddedSolrServer(h.getCoreContainer(), "collection1");
+ //we don't need to close the EmbeddedSolrServer because SolrTestCaseJ4 closes the core
+ }
+
+ @Before
+ public void setUp() throws Exception {
+ super.setUp();
+ clearIndex();
+ assertU(adoc("id", "9999", "name", "Boston"));
+ assertU(commit());
+ }
+
+ @Test
+ public void testTag() throws SolrServerException, IOException {
+ ModifiableSolrParams params = params();
+ String input = "foo boston bar";//just one tag;
+ QueryRequest req = new SolrTaggerRequest(params, input);
+ req.setPath("/tag");
+
+ QueryResponse rsp = req.process(solrServer);
+ SolrDocumentList results = (SolrDocumentList) rsp.getResponse().get("response");
+ assertNotNull(rsp.getResponse().get("tags"));
+ assertNotNull(results.get(0));
+ }
+
+ @SuppressWarnings("serial")
+ public static class SolrTaggerRequest extends QueryRequest {
+
+ private final String input;
+
+ public SolrTaggerRequest(SolrParams p, String input) {
+ super(p, METHOD.POST);
+ this.input = input;
+ }
+
+ // Deprecated in 7.2 but should live on until 8.x
+ @SuppressWarnings("deprecation")
+ @Override
+ public Collection<ContentStream> getContentStreams() {
+ return Collections.singleton(new ContentStreamBase.StringStream(input));
+ }
+
+ // As of 7.2. But won't work until: https://issues.apache.org/jira/browse/SOLR-12142
+// @Override
+// public RequestWriter.ContentWriter getContentWriter(String expectedType) {
+// return new RequestWriter.StringPayloadContentWriter(input, "text/plain; charset=UTF8");
+// }
+ }
+
+ @Test
+ public void testSearch() throws Exception {
+ QueryResponse rsp = solrServer.query(params("q", "name:Boston"));
+ assertNotNull(rsp.getResults().get(0));
+ }
+
+ @Test
+ public void testAssertTagStreamingWithSolrTaggerRequest() throws Exception {
+ doTestAssertTagStreaming(SolrTaggerRequest::new);
+ }
+
+ @Test
+ @Ignore("As of Solr 7, stream.body is disabled by default for security ") // DWS: dubious, IMO
+ // and it can't be enabled with EmbeddedSolrServer until SOLR-12126
+ public void testAssertTagStreamingWithStreamBodyParam() throws Exception {
+ doTestAssertTagStreaming((params, input) -> {
+ params.set("stream.body", input);
+ return new QueryRequest(params);
+ });
+ }
+
+ public void doTestAssertTagStreaming(BiFunction<ModifiableSolrParams, String, QueryRequest> newQueryRequest) throws IOException, SolrServerException {
+ ModifiableSolrParams params = params();
+ String input = "foo boston bar";//just one tag;
+ QueryRequest req = newQueryRequest.apply(params, input);
+ req.setPath("/tag");
+
+ final AtomicReference<SolrDocument> refDoc = new AtomicReference<>();
+ req.setStreamingResponseCallback(new StreamingResponseCallback() {
+ @Override
+ public void streamSolrDocument(SolrDocument doc) {
+ refDoc.set(doc);
+ }
+
+ @Override
+ public void streamDocListInfo(long numFound, long start, Float maxScore) {
+
+ }
+ });
+ QueryResponse rsp = req.process(solrServer);
+ assertNotNull(rsp.getResponse().get("tags"));
+ assertNotNull(refDoc.get());
+ assertEquals("Boston", ((Field)refDoc.get().getFieldValue("name")).stringValue());
+ }
+}
diff --git a/solr/core/src/test/org/apache/solr/handler/tagger/RandomizedTaggerTest.java b/solr/core/src/test/org/apache/solr/handler/tagger/RandomizedTaggerTest.java
new file mode 100644
index 00000000000..cb742a87a8c
--- /dev/null
+++ b/solr/core/src/test/org/apache/solr/handler/tagger/RandomizedTaggerTest.java
@@ -0,0 +1,150 @@
+/*
+ * This software was produced for the U. S. Government
+ * under Contract No. W15P7T-11-C-F600, and is
+ * subject to the Rights in Noncommercial Computer Software
+ * and Noncommercial Computer Software Documentation
+ * Clause 252.227-7014 (JUN 1995)
+ *
+ * Copyright 2013 The MITRE Corporation. All Rights Reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.solr.handler.tagger;
+
+import java.util.ArrayList;
+import java.util.HashSet;
+import java.util.List;
+import java.util.Locale;
+import java.util.Random;
+import java.util.Set;
+
+import com.carrotsearch.randomizedtesting.annotations.Repeat;
+import com.carrotsearch.randomizedtesting.generators.RandomNumbers;
+import com.carrotsearch.randomizedtesting.generators.RandomPicks;
+import com.carrotsearch.randomizedtesting.generators.RandomStrings;
+import org.junit.BeforeClass;
+import org.junit.Test;
+
+/**
+ * Randomly generate taggable text and verify via simple tag algorithm.
+ */
+@Repeat(iterations = 10)
+public class RandomizedTaggerTest extends TaggerTestCase {
+
+ @BeforeClass
+ public static void beforeClass() throws Exception {
+ initCore("solrconfig-tagger.xml", "schema-tagger.xml");
+ }
+
+ @Test
+ public void test() throws Exception {
+ final Random R = random();
+
+ Set<String> names = new HashSet<>();
+ //random list of single-word names
+ final int NUM_SINGLES = 4;//RandomInts.randomIntBetween(R, 1, 5);
+ for (int i = 0; i < NUM_SINGLES; i++) {
+ if (i == 0)//first is a big string (perhaps triggers bugs related to growing buffers)
+ names.add(randomStringOfLength(16, 32));
+ else
+ names.add(randomString());
+ }
+
+ //add random list of multi-word names, partially including existing names
+ final int NUM_MULTI = 10;
+ for (int i = 0; i < NUM_MULTI; i++) {
+ final int numWords = RandomNumbers.randomIntBetween(R, 2, 4);
+ StringBuilder buf = new StringBuilder();
+ for (int j = 0; j < numWords; j++) {
+ if (j != 0)
+ buf.append(' ');
+ if (R.nextBoolean()) {//new likely non-existent word
+ buf.append(randomString());
+ } else {//existing word (possible multi-word from prev iteration)
+ buf.append(RandomPicks.randomFrom(R, names));
+ }
+ }
+ names.add(buf.toString());
+ }
+
+ // BUILD NAMES
+ buildNames(names.toArray(new String[names.size()]));
+
+ // QUERY LOOP
+ for (int tTries = 0; tTries < 10 * RANDOM_MULTIPLIER; tTries++) {
+ // Build up random input, similar to multi-word random names above
+ StringBuilder input = new StringBuilder();
+ final int INPUT_WORD_LEN = 20;
+ input.append(' ');//must start with space based on assertBruteForce logic
+ for (int i = 0; i < INPUT_WORD_LEN; i++) {
+ if (R.nextBoolean()) {//new likely non-existent word
+ input.append(randomString());
+ } else {//existing word (possible multi-word from prev iteration)
+ input.append(RandomPicks.randomFrom(R, NAMES));
+ }
+ input.append(' ');//must end with a space
+ }
+
+ boolean madeIt = false;
+ try {
+ assertBruteForce(input.toString());
+ madeIt = true;
+ } finally {
+ if (!madeIt) {
+ System.out.println("Reproduce with:");
+ System.out.print(" buildNames(");
+ for (int i = 0; i < NAMES.size(); i++) {
+ if (i != 0)
+ System.out.print(',');
+ System.out.print('"');
+ System.out.print(NAMES.get(i));
+ System.out.print('"');
+ }
+ System.out.println(");");
+ System.out.println(" assertBruteForce(\"" + input+"\");");
+ }
+ }
+ }
+
+ }
+
+ private void assertBruteForce(String input) throws Exception {
+ assert input.matches(" .* ");
+ baseParams.set("overlaps", "ALL");
+
+ //loop through NAMES and find all tag offsets
+ List<TestTag> testTags = new ArrayList<>();
+ for (String name : NAMES) {
+ String spaceName = " "+name+" ";
+ int off = 0;
+ while (true) {
+ int idx = input.indexOf(spaceName, off);
+ if (idx < 0)
+ break;
+ testTags.add(new TestTag(idx + 1, idx + 1 + name.length(), name, name));
+ off = idx + 1;
+ }
+ }
+
+ //assert
+ assertTags(reqDoc(input), testTags.toArray(new TestTag[testTags.size()]));
+ }
+
+ private String randomString() { return randomStringOfLength(1, 1); }
+
+ private String randomStringOfLength(int min, int max) {
+ return RandomStrings.randomAsciiLettersOfLengthBetween(random(), min, max).toLowerCase(Locale.ROOT);
+ }
+
+}
diff --git a/solr/core/src/test/org/apache/solr/handler/tagger/Tagger2Test.java b/solr/core/src/test/org/apache/solr/handler/tagger/Tagger2Test.java
new file mode 100644
index 00000000000..c7580e1f729
--- /dev/null
+++ b/solr/core/src/test/org/apache/solr/handler/tagger/Tagger2Test.java
@@ -0,0 +1,175 @@
+/*
+ * This software was produced for the U. S. Government
+ * under Contract No. W15P7T-11-C-F600, and is
+ * subject to the Rights in Noncommercial Computer Software
+ * and Noncommercial Computer Software Documentation
+ * Clause 252.227-7014 (JUN 1995)
+ *
+ * Copyright 2013 The MITRE Corporation. All Rights Reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.solr.handler.tagger;
+
+import java.nio.charset.StandardCharsets;
+
+import org.junit.BeforeClass;
+import org.junit.Ignore;
+import org.junit.Test;
+
+/**
+ * Test the {@link TaggerRequestHandler}.
+ */
+public class Tagger2Test extends TaggerTestCase {
+
+ @BeforeClass
+ public static void beforeClass() throws Exception {
+ initCore("solrconfig-tagger.xml", "schema-tagger.xml");
+ }
+
+ @Override
+ public void setUp() throws Exception {
+ super.setUp();
+ baseParams.set("overlaps", "LONGEST_DOMINANT_RIGHT");
+ }
+
+ /** whole matching, no sub-tags */
+ @Test
+ public void testLongestDominantRight() throws Exception {
+ buildNames("in", "San", "in San", "Francisco", "San Francisco",
+ "San Francisco State College", "College of California",
+ "Clayton", "Clayton North", "North Carolina");
+
+ assertTags("He lived in San Francisco.",
+ "in", "San Francisco");
+
+ assertTags("He enrolled in San Francisco State College of California",
+ "in", "San Francisco State College");
+
+ assertTags("He lived in Clayton North Carolina",
+ "in", "Clayton", "North Carolina");
+
+ }
+
+ // As of Lucene/Solr 4.9, StandardTokenizer never does this anymore (reported to the Lucene dev-list,
+ // Jan 26th 2015). Honestly it's not particularly important to us, but it renders this test
+ // pointless.
+ /** Orig issue https://github.com/OpenSextant/SolrTextTagger/issues/2 related: #13 */
+ @Test
+ @Ignore
+ public void testVeryLongWord() throws Exception {
+ String SANFRAN = "San Francisco";
+ buildNames(SANFRAN);
+
+ // exceeds the default 255 max token length, which means it in effect becomes a stop-word
+ StringBuilder STOP = new StringBuilder(260);//>255
+ for (int i = 0; i < STOP.capacity(); i++) {
+ STOP.append((char) ('0' + (i % 10)));
+ }
+
+ String doc = "San " + STOP + " Francisco";
+ assertTags(doc);//no match due to default stop word handling
+ //and we find it when we ignore stop words
+ assertTags(reqDoc(doc, "ignoreStopwords", "true"), new TestTag(0, doc.length(), doc, lookupByName(SANFRAN)));
+ }
+
+ /** Support for stopwords (posInc > 1);
+ * discussion: https://github.com/OpenSextant/SolrTextTagger/issues/13 */
+ @Test
+ public void testStopWords() throws Exception {
+ baseParams.set("field", "name_tagStop");//stop filter (pos inc enabled) index & query
+
+ String SOUTHOFWALES = "South of Wales";//'of' is stop word index time & query
+ String ACITYA = "A City A";
+
+ buildNames(SOUTHOFWALES, ACITYA);
+
+ //round-trip works
+ assertTags(reqDoc(SOUTHOFWALES), new TestTag(0, SOUTHOFWALES.length(), SOUTHOFWALES,
+ lookupByName(SOUTHOFWALES)));
+ // but offsets don't include the stopword when it is leading or trailing...
+ assertTags(reqDoc(ACITYA), new TestTag(2, 6, "City",
+ lookupByName(ACITYA)));
+ //break on stop words
+ assertTags(reqDoc(SOUTHOFWALES, "ignoreStopwords", "false"));//match nothing
+ }
+
+ /** Tests WordDelimiterGraphFilter, stacked/synonymous tokens at index time (catenate options) */
+ @Test
+ public void testWDF() throws Exception {
+ baseParams.set("field", "name_tagWDF");
+
+ final String WINSTONSALEM = "City of Winston-Salem";//hyphen
+ final String BOSTONHARBOR = "Boston Harbor";//space
+ buildNames(WINSTONSALEM, BOSTONHARBOR);
+
+ //round-trip works
+ assertTags(reqDoc(WINSTONSALEM), new TestTag(0, WINSTONSALEM.length(), WINSTONSALEM,
+ lookupByName(WINSTONSALEM)));
+
+ // space separated works
+ final String WS_SPACE = WINSTONSALEM.replace('-', ' ');
+ assertTags(reqDoc(WS_SPACE),
+ new TestTag(0, WS_SPACE.length(), WS_SPACE,
+ lookupByName(WINSTONSALEM)));
+
+ //must be full match
+ assertTags(reqDoc("Winston"));//match nothing
+ assertTags(reqDoc("Salem"));//match nothing
+
+ // round-trip works
+ assertTags(reqDoc(BOSTONHARBOR), new TestTag(0, BOSTONHARBOR.length(), BOSTONHARBOR,
+ lookupByName(BOSTONHARBOR)));
+
+ // hyphen separated works
+ final String BH_HYPHEN = BOSTONHARBOR.replace(' ', '-');
+ assertTags(reqDoc(BH_HYPHEN),
+ new TestTag(0, BH_HYPHEN.length(), BH_HYPHEN,
+ lookupByName(BOSTONHARBOR)));
+ //must be full match
+ assertTags(reqDoc("Boston"));//match nothing
+ assertTags(reqDoc("Harbor"));//match nothing
+ }
+
+ /** Ensure character offsets work for multi-byte characters */
+ @Test
+ public void testMultibyteChar() throws Exception {
+ // https://unicode-table.com/en/2019/
+ // 0 1 2 3 4
+ // 01234567890123456789012345678901234567890
+ String TEXT = "He mentionned ’Obama’ in the White House";
+ assertEquals(40, TEXT.length()); // char length (in Java, UTF16)
+
+ String QUOTE = TEXT.substring(14, 15);
+ assertEquals(8217, QUOTE.codePointAt(0));
+
+ //UTF8
+ assertEquals(3, QUOTE.getBytes(StandardCharsets.UTF_8).length);
+ assertEquals(1, "a".getBytes(StandardCharsets.UTF_8).length);
+ assertEquals(40 + 2*2, TEXT.getBytes(StandardCharsets.UTF_8).length);
+
+ //UTF16 big endian (by specifying big/little endian, there is no "byte order mark")
+ assertEquals(2, QUOTE.getBytes(StandardCharsets.UTF_16BE).length);
+ assertEquals(2, "a".getBytes(StandardCharsets.UTF_16BE).length);
+ assertEquals(40 * 2, TEXT.getBytes(StandardCharsets.UTF_16BE).length);
+
+
+ buildNames("Obama");
+
+ assertTags(TEXT, "Obama");
+
+ // TODO test surrogate pairs (i.e. code points not in the BMP)
+ }
+
+}
diff --git a/solr/core/src/test/org/apache/solr/handler/tagger/TaggerTest.java b/solr/core/src/test/org/apache/solr/handler/tagger/TaggerTest.java
new file mode 100644
index 00000000000..93b11b50a28
--- /dev/null
+++ b/solr/core/src/test/org/apache/solr/handler/tagger/TaggerTest.java
@@ -0,0 +1,296 @@
+/*
+ * This software was produced for the U. S. Government
+ * under Contract No. W15P7T-11-C-F600, and is
+ * subject to the Rights in Noncommercial Computer Software
+ * and Noncommercial Computer Software Documentation
+ * Clause 252.227-7014 (JUN 1995)
+ *
+ * Copyright 2013 The MITRE Corporation. All Rights Reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.solr.handler.tagger;
+
+import java.util.Arrays;
+import java.util.stream.Collectors;
+
+import org.apache.solr.common.params.CommonParams;
+import org.apache.solr.common.params.ModifiableSolrParams;
+import org.apache.solr.request.SolrQueryRequest;
+import org.junit.BeforeClass;
+import org.junit.Ignore;
+
+/**
+ * The original test for {@link TaggerRequestHandler}.
+ */
+public class TaggerTest extends TaggerTestCase {
+
+ @BeforeClass
+ public static void beforeClass() throws Exception {
+ initCore("solrconfig-tagger.xml", "schema-tagger.xml");
+ }
+
+ private void indexAndBuild() throws Exception {
+ N[] names = N.values();
+ String[] namesStrs = new String[names.length];
+ for (int i = 0; i < names.length; i++) {
+ namesStrs[i] = names[i].getName();
+ }
+ buildNames(namesStrs);
+ }
+
+ /** Name corpus */
+ enum N {
+ //keep order to retain ord()
+ London, London_Business_School, Boston, City_of_London,
+ of, the//filtered out of the corpus by a custom query
+ ;
+
+ String getName() { return name().replace('_',' '); }
+ static N lookupByName(String name) { return N.valueOf(name.replace(' ', '_')); }
+ int getId() { return ordinal(); }
+ }
+
+ public void testFormat() throws Exception {
+ baseParams.set("overlaps", "NO_SUB");
+ indexAndBuild();
+
+ String rspStr = _testFormatRequest(false);
+ String expected = "\n" +
+ "\n" +
+ "\n" +
+ "1 \n" +
+ "\n" +
+ " \n" +
+ " 0 \n" +
+ " 22 \n" +
+ " \n" +
+ " 1 \n" +
+ " \n" +
+ " \n" +
+ " \n" +
+ "\n" +
+ " \n" +
+ " 1 \n" +
+ " London Business School \n" +
+ " \n" +
+ " \n";
+ assertEquals(expected, rspStr);
+ }
+
+ public void testFormatMatchText() throws Exception {
+ baseParams.set("overlaps", "NO_SUB");
+ indexAndBuild();
+
+ String rspStr = _testFormatRequest(true);
+ String expected = "\n" +
+ "\n" +
+ "\n" +
+ "1 \n" +
+ "\n" +
+ " \n" +
+ " 0 \n" +
+ " 22 \n" +
+ " london business school \n" +
+ " \n" +
+ " 1 \n" +
+ " \n" +
+ " \n" +
+ " \n" +
+ "\n" +
+ " \n" +
+ " 1 \n" +
+ " London Business School \n" +
+ " \n" +
+ " \n";
+ assertEquals(expected, rspStr);
+ }
+
+ private String _testFormatRequest(boolean matchText) throws Exception {
+ String doc = "london business school";//just one tag
+ SolrQueryRequest req = reqDoc(doc, "indent", "on", "omitHeader", "on", "matchText", ""+matchText);
+ String rspStr = h.query(req);
+ req.close();
+ return rspStr;
+ }
+
+ /** Partial matching, no sub-tags */
+ @Ignore //TODO ConcatenateGraphFilter uses a special separator char that we can't put into XML (invalid char)
+ public void testPartialMatching() throws Exception {
+ baseParams.set("field", "name_tagPartial");
+ baseParams.set("overlaps", "NO_SUB");
+ baseParams.set("fq", "NOT name:(of the)");//test filtering
+ indexAndBuild();
+
+ //these match nothing
+ assertTags(reqDoc("") );
+ assertTags(reqDoc(" ") );
+ assertTags(reqDoc("the") );
+
+ String doc;
+
+ //just London Business School via "school" substring
+ doc = "school";
+ assertTags(reqDoc(doc), tt(doc,"school", 0, N.London_Business_School));
+
+ doc = "a school";
+ assertTags(reqDoc(doc), tt(doc,"school", 0, N.London_Business_School));
+
+ doc = "school a";
+ assertTags(reqDoc(doc), tt(doc,"school", 0, N.London_Business_School));
+
+ //More interesting
+
+ doc = "school City";
+ assertTags(reqDoc(doc),
+ tt(doc, "school", 0, N.London_Business_School),
+ tt(doc, "City", 0, N.City_of_London) );
+
+ doc = "City of London Business School";
+ assertTags(reqDoc(doc), //no plain London (sub-tag)
+ tt(doc, "City of London", 0, N.City_of_London),
+ tt(doc, "London Business School", 0, N.London_Business_School));
+ }
+
+ /** whole matching, no sub-tags */
+ public void testWholeMatching() throws Exception {
+ baseParams.set("overlaps", "NO_SUB");
+ baseParams.set("fq", "NOT name:(of the)");//test filtering
+ indexAndBuild();
+
+ //these match nothing
+ assertTags(reqDoc(""));
+ assertTags(reqDoc(" ") );
+ assertTags(reqDoc("the") );
+
+ //partial on N.London_Business_School matches nothing
+ assertTags(reqDoc("school") );
+ assertTags(reqDoc("a school") );
+ assertTags(reqDoc("school a") );
+ assertTags(reqDoc("school City") );
+
+ String doc;
+
+ doc = "school business london";//backwards
+ assertTags(reqDoc(doc), tt(doc,"london", 0, N.London));
+
+ doc = "of London Business School";
+ assertTags(reqDoc(doc), //no plain London (sub-tag)
+ tt(doc, "London Business School", 0, N.London_Business_School));
+
+ //More interesting
+ doc = "City of London Business School";
+ assertTags(reqDoc(doc), //no plain London (sub-tag)
+ tt(doc, "City of London", 0, N.City_of_London),
+ tt(doc, "London Business School", 0, N.London_Business_School));
+
+ doc = "City of London Business";
+ assertTags(reqDoc(doc), //no plain London (sub-tag) no Business (partial-match)
+ tt(doc, "City of London", 0, N.City_of_London));
+
+ doc = "London Business magazine";
+ assertTags(reqDoc(doc), //Just London; L.B.S. fails
+ tt(doc, "London", 0, N.London));
+ }
+
+ /** whole matching, with sub-tags */
+ public void testSubTags() throws Exception {
+ baseParams.set("overlaps", "ALL");
+ baseParams.set("fq", "NOT name:(of the)");//test filtering
+ indexAndBuild();
+
+ //these match nothing
+ assertTags(reqDoc(""));
+ assertTags(reqDoc(" ") );
+ assertTags(reqDoc("the") );
+
+ //partial on N.London_Business_School matches nothing
+ assertTags(reqDoc("school") );
+ assertTags(reqDoc("a school") );
+ assertTags(reqDoc("school a") );
+ assertTags(reqDoc("school City") );
+
+ String doc;
+
+ doc = "school business london";//backwards
+ assertTags(reqDoc(doc), tt(doc,"london", 0, N.London));
+
+ //More interesting
+ doc = "City of London Business School";
+ assertTags(reqDoc(doc),
+ tt(doc, "City of London", 0, N.City_of_London),
+ tt(doc, "London", 0, N.London),
+ tt(doc, "London Business School", 0, N.London_Business_School));
+
+ doc = "City of London Business";
+ assertTags(reqDoc(doc),
+ tt(doc, "City of London", 0, N.City_of_London),
+ tt(doc, "London", 0, N.London));
+ }
+
+ public void testMultipleFilterQueries() throws Exception {
+ baseParams.set("overlaps", "ALL");
+
+ // build up the corpus with some additional fields for filtering purposes
+ deleteByQueryAndGetVersion("*:*", null);
+
+ int i = 0;
+ assertU(adoc("id", ""+i++, "name", N.London.getName(), "type", "city", "country", "UK"));
+ assertU(adoc("id", ""+i++, "name", N.London_Business_School.getName(), "type", "school", "country", "UK"));
+ assertU(adoc("id", ""+i++, "name", N.Boston.getName(), "type", "city", "country", "US"));
+ assertU(adoc("id", ""+i++, "name", N.City_of_London.getName(), "type", "org", "country", "UK"));
+ assertU(commit());
+
+ // not calling buildNames so that we can bring along extra attributes for filtering
+ NAMES = Arrays.stream(N.values()).map(N::getName).collect(Collectors.toList());
+
+ // phrase that matches everything
+ String doc = "City of London Business School in Boston";
+
+ // first do no filtering
+ ModifiableSolrParams p = new ModifiableSolrParams();
+ p.add(CommonParams.Q, "*:*");
+ assertTags(reqDoc(doc, p),
+ tt(doc, "City of London", 0, N.City_of_London),
+ tt(doc, "London", 0, N.London),
+ tt(doc, "London Business School", 0, N.London_Business_School),
+ tt(doc, "Boston", 0, N.Boston));
+
+ // add a single fq
+ p.add(CommonParams.FQ, "type:city");
+ assertTags(reqDoc(doc, p),
+ tt(doc, "London", 0, N.London),
+ tt(doc, "Boston", 0, N.Boston));
+
+ // add another fq
+ p.add(CommonParams.FQ, "country:US");
+ assertTags(reqDoc(doc, p),
+ tt(doc, "Boston", 0, N.Boston));
+ }
+
+ private TestTag tt(String doc, String substring, int substringIndex, N name) {
+ assert substringIndex == 0;
+
+ //little bit of copy-paste code from super.tt()
+ int startOffset = -1, endOffset;
+ int substringIndex1 = 0;
+ for(int i = 0; i <= substringIndex1; i++) {
+ startOffset = doc.indexOf(substring, ++startOffset);
+ assert startOffset >= 0 : "The test itself is broken";
+ }
+ endOffset = startOffset+ substring.length();//1 greater (exclusive)
+ return new TestTag(startOffset, endOffset, substring, lookupByName(name.getName()));
+ }
+
+}
diff --git a/solr/core/src/test/org/apache/solr/handler/tagger/TaggerTestCase.java b/solr/core/src/test/org/apache/solr/handler/tagger/TaggerTestCase.java
new file mode 100644
index 00000000000..e525ce9265a
--- /dev/null
+++ b/solr/core/src/test/org/apache/solr/handler/tagger/TaggerTestCase.java
@@ -0,0 +1,251 @@
+/*
+ * This software was produced for the U. S. Government
+ * under Contract No. W15P7T-11-C-F600, and is
+ * subject to the Rights in Noncommercial Computer Software
+ * and Noncommercial Computer Software Documentation
+ * Clause 252.227-7014 (JUN 1995)
+ *
+ * Copyright 2013 The MITRE Corporation. All Rights Reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.solr.handler.tagger;
+
+import java.io.IOException;
+import java.lang.invoke.MethodHandles;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.Collections;
+import java.util.HashMap;
+import java.util.List;
+import java.util.Map;
+import java.util.TreeSet;
+
+import org.apache.commons.lang.builder.CompareToBuilder;
+import org.apache.commons.lang.builder.EqualsBuilder;
+import org.apache.lucene.document.Document;
+import org.apache.solr.SolrTestCaseJ4;
+import org.apache.solr.common.params.CommonParams;
+import org.apache.solr.common.params.ModifiableSolrParams;
+import org.apache.solr.common.params.SolrParams;
+import org.apache.solr.common.util.ContentStream;
+import org.apache.solr.common.util.ContentStreamBase;
+import org.apache.solr.common.util.NamedList;
+import org.apache.solr.request.SolrQueryRequest;
+import org.apache.solr.request.SolrQueryRequestBase;
+import org.apache.solr.response.SolrQueryResponse;
+import org.apache.solr.search.DocIterator;
+import org.apache.solr.search.DocList;
+import org.apache.solr.search.SolrIndexSearcher;
+import org.junit.Rule;
+import org.junit.rules.TestWatcher;
+import org.junit.runner.Description;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+public abstract class TaggerTestCase extends SolrTestCaseJ4 {
+
+ private static final Logger log = LoggerFactory.getLogger(MethodHandles.lookup().lookupClass());
+
+ @Rule
+ public TestWatcher watchman = new TestWatcher() {
+ @Override
+ protected void starting(Description description) {
+ log.info("{} being run...", description.getDisplayName());
+ }
+ };
+
+ protected final ModifiableSolrParams baseParams = new ModifiableSolrParams();
+
+ //populated in buildNames; tested in assertTags
+ protected static List<String> NAMES;
+
+ @Override
+ public void setUp() throws Exception {
+ super.setUp();
+ baseParams.clear();
+ baseParams.set(CommonParams.QT, "/tag");
+ baseParams.set(CommonParams.WT, "xml");
+ }
+
+ protected void assertTags(String doc, String... tags) throws Exception {
+ TestTag[] tts = new TestTag[tags.length];
+ for (int i = 0; i < tags.length; i++) {
+ tts[i] = tt(doc, tags[i]);
+ }
+ assertTags(reqDoc(doc), tts);
+ }
+
+ protected static void buildNames(String... names) throws Exception {
+ deleteByQueryAndGetVersion("*:*", null);
+ NAMES = Arrays.asList(names);
+ //Collections.sort(NAMES);
+ int i = 0;
+ for (String n : NAMES) {
+ assertU(adoc("id", ""+(i++), "name", n));
+ }
+ assertU(commit());
+ }
+
+ protected String lookupByName(String name) {
+ for (String n : NAMES) {
+ if (n.equalsIgnoreCase(name))
+ return n;
+ }
+ return null;
+ }
+
+ protected TestTag tt(String doc, String substring) {
+ int startOffset = -1, endOffset;
+ int substringIndex = 0;
+ for(int i = 0; i <= substringIndex; i++) {
+ startOffset = doc.indexOf(substring,++startOffset);
+ assert startOffset >= 0 : "The test itself is broken";
+ }
+ endOffset = startOffset+substring.length();//1 greater (exclusive)
+ return new TestTag(startOffset, endOffset, substring, lookupByName(substring));
+ }
+
+ /** Asserts the tags. Will call req.close(). */
+ protected void assertTags(SolrQueryRequest req, TestTag... eTags) throws Exception {
+ try {
+ SolrQueryResponse rsp = h.queryAndResponse(req.getParams().get(CommonParams.QT), req);
+ TestTag[] aTags = pullTagsFromResponse(req, rsp);
+
+ String message;
+ if (aTags.length > 10)
+ message = null;
+ else
+ message = Arrays.asList(aTags).toString();
+ Arrays.sort(eTags);
+ assertSortedArrayEquals(message, eTags, aTags);
+
+ } finally {
+ req.close();
+ }
+ }
+
+ @SuppressWarnings("unchecked")
+ protected TestTag[] pullTagsFromResponse(SolrQueryRequest req, SolrQueryResponse rsp ) throws IOException {
+ NamedList rspValues = rsp.getValues();
+ Map<String, String> matchingNames = new HashMap<>();
+ SolrIndexSearcher searcher = req.getSearcher();
+ DocList docList = (DocList) rspValues.get("response");
+ DocIterator iter = docList.iterator();
+ while (iter.hasNext()) {
+ int docId = iter.next();
+ Document doc = searcher.doc(docId);
+ String id = doc.getField("id").stringValue();
+ String name = lookupByName(doc.get("name"));
+ assertEquals("looking for "+name, NAMES.indexOf(name)+"", id);
+ matchingNames.put(id, name);
+ }
+
+ //build TestTag[] aTags from response ('a' is actual)
+ List<NamedList> mTagsList = (List<NamedList>) rspValues.get("tags");
+ List<TestTag> aTags = new ArrayList<>();
+ for (NamedList map : mTagsList) {
+ List<String> foundIds = (List<String>) map.get("ids");
+ for (String id : foundIds) {
+ aTags.add(new TestTag(
+ ((Number)map.get("startOffset")).intValue(),
+ ((Number)map.get("endOffset")).intValue(),
+ null,
+ matchingNames.get(id)));
+ }
+ }
+ return aTags.toArray(new TestTag[0]);
+ }
+
+ /** REMEMBER to close() the result req object. */
+ protected SolrQueryRequest reqDoc(String doc, String... moreParams) {
+ return reqDoc(doc, params(moreParams));
+ }
+
+ /** REMEMBER to close() the result req object. */
+ protected SolrQueryRequest reqDoc(String doc, SolrParams moreParams) {
+ log.debug("Test doc: "+doc);
+ SolrParams params = SolrParams.wrapDefaults(moreParams, baseParams);
+ SolrQueryRequestBase req = new SolrQueryRequestBase(h.getCore(), params) {};
+ Iterable<ContentStream> stream = Collections.singleton((ContentStream)new ContentStreamBase.StringStream(doc));
+ req.setContentStreams(stream);
+ return req;
+ }
+
+ /** Asserts the sorted arrays are equals, with a helpful error message when not.*/
+ public void assertSortedArrayEquals(String message, Object[] expecteds, Object[] actuals) {
+ AssertionError error = null;
+ try {
+ assertArrayEquals(null, expecteds, actuals);
+ } catch (AssertionError e) {
+ error = e;
+ }
+ if (error == null)
+ return;
+ TreeSet<Object> expectedRemaining = new TreeSet<>(Arrays.asList(expecteds));
+ expectedRemaining.removeAll(Arrays.asList(actuals));
+ if (!expectedRemaining.isEmpty())
+ fail(message+": didn't find expected "+expectedRemaining.first()+" (of "+expectedRemaining.size()+"); "+ error);
+ TreeSet<Object> actualsRemaining = new TreeSet<>(Arrays.asList(actuals));
+ actualsRemaining.removeAll(Arrays.asList(expecteds));
+ fail(message+": didn't expect "+actualsRemaining.first()+" (of "+actualsRemaining.size()+"); "+ error);
+ }
+
+ class TestTag implements Comparable {
+ final int startOffset, endOffset;
+ final String substring;
+ final String docName;
+
+ TestTag(int startOffset, int endOffset, String substring, String docName) {
+ this.startOffset = startOffset;
+ this.endOffset = endOffset;
+ this.substring = substring;
+ this.docName = docName;
+ }
+
+ @Override
+ public String toString() {
+ return "TestTag{" +
+ "[" + startOffset + "-" + endOffset + "]" +
+ " doc=" + NAMES.indexOf(docName) + ":'" + docName + "'" +
+ (docName.equals(substring) || substring == null ? "" : " substr="+substring)+
+ '}';
+ }
+
+ @Override
+ public boolean equals(Object obj) {
+ TestTag that = (TestTag) obj;
+ return new EqualsBuilder()
+ .append(this.startOffset, that.startOffset)
+ .append(this.endOffset, that.endOffset)
+ .append(this.docName, that.docName)
+ .isEquals();
+ }
+
+ @Override
+ public int hashCode() {
+ return startOffset;//cheesy but acceptable
+ }
+
+ @Override
+ public int compareTo(Object o) {
+ TestTag that = (TestTag) o;
+ return new CompareToBuilder()
+ .append(this.startOffset, that.startOffset)
+ .append(this.endOffset, that.endOffset)
+ .append(this.docName,that.docName)
+ .toComparison();
+ }
+ }
+}
diff --git a/solr/core/src/test/org/apache/solr/handler/tagger/TaggingAttributeTest.java b/solr/core/src/test/org/apache/solr/handler/tagger/TaggingAttributeTest.java
new file mode 100644
index 00000000000..39c78286713
--- /dev/null
+++ b/solr/core/src/test/org/apache/solr/handler/tagger/TaggingAttributeTest.java
@@ -0,0 +1,73 @@
+/*
+ * This software was produced for the U. S. Government
+ * under Contract No. W15P7T-11-C-F600, and is
+ * subject to the Rights in Noncommercial Computer Software
+ * and Noncommercial Computer Software Documentation
+ * Clause 252.227-7014 (JUN 1995)
+ *
+ * Copyright 2013 The MITRE Corporation. All Rights Reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.solr.handler.tagger;
+
+import org.junit.BeforeClass;
+import org.junit.Test;
+
+/**
+ * Test the {@link TaggerRequestHandler} with
+ * an Analyzer chain that does use the {@link TaggingAttribute}. See the test
+ * configuration under 'taggingattribute'.
+ */
+public class TaggingAttributeTest extends TaggerTestCase {
+
+ @BeforeClass
+ public static void beforeClass() throws Exception {
+ initCore("solrconfig-tagger.xml", "schema-tagger.xml");
+ }
+
+ /**
+ * Whole matching, no sub-tags. Links only words with > 3 letters.
+ * Because of that "San" is not used to start tags
+ *
+ */
+ @Test
+ public void testTaggingAttribute() throws Exception {
+ baseParams.set("field", "name_tagAttribute"); // has WordLengthTaggingFilter using the TaggingAttribute
+ // this test is based on the longest dominant right test, so we use
+ // the same TagClusterReducer setting
+ baseParams.set("overlaps", "LONGEST_DOMINANT_RIGHT");
+
+ buildNames("in", "San", "in San", "Francisco", "San Francisco",
+ "San Francisco State College", "College of California",
+ "Clayton", "Clayton North", "North Carolina");
+
+ assertTags("He lived in San Francisco.",
+ //"in", "San Francisco"); //whis would be expected without taggable
+ "Francisco");// this are the expected results with taggable
+
+ assertTags("He enrolled in San Francisco State College of California",
+ //"in", "San Francisco State College"); //without taggable enabled
+ "Francisco", "College of California");// With taggable
+ //NOTE this also tests that started tags are advanced for non-taggable
+ // tokens, as otherwise 'College of California' would not be
+ // suggested.
+
+ assertTags("He lived in Clayton North Carolina",
+ //"in", "Clayton", "North Carolina");
+ "Clayton", "North Carolina");
+
+ }
+
+}
diff --git a/solr/core/src/test/org/apache/solr/handler/tagger/WordLengthTaggingFilter.java b/solr/core/src/test/org/apache/solr/handler/tagger/WordLengthTaggingFilter.java
new file mode 100644
index 00000000000..237a8b82c39
--- /dev/null
+++ b/solr/core/src/test/org/apache/solr/handler/tagger/WordLengthTaggingFilter.java
@@ -0,0 +1,110 @@
+/*
+ * This software was produced for the U. S. Government
+ * under Contract No. W15P7T-11-C-F600, and is
+ * subject to the Rights in Noncommercial Computer Software
+ * and Noncommercial Computer Software Documentation
+ * Clause 252.227-7014 (JUN 1995)
+ *
+ * Copyright 2013 The MITRE Corporation. All Rights Reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.solr.handler.tagger;
+
+import java.io.IOException;
+
+import org.apache.lucene.analysis.TokenFilter;
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
+
+/**
+ * Simple TokenFilter that marks for lookup only tokens with at least the
+ * configured number of chars.
+ * NOTE: This implementation is only intended to be used as an example
+ * and for unit testing the {@link TaggingAttribute} feature. Typically
+ * implementations will be based on NLP results (e.g. using POS tags or
+ * detected Named Entities).
+ *
+ * Example Usage:
+ * Currently, usage requires modifying the Analyzer defined for the
+ * indexedField. An alternative would be to allow the configuration
+ * of a special FieldType in the schema.xml and use this Analyzer for processing
+ * the text sent to the request.
+ * While the current solution is fine for direct API usage, defining the
+ * Analyzer in the schema.xml would be better suited to using this feature
+ * with the {@link TaggerRequestHandler}.
+ *
+ *
+ * Analyzer analyzer = req.getSchema().getField(indexedField).getType().getAnalyzer();
+ * //get the TokenStream from the Analyzer
+ * TokenStream baseStream = analyzer.tokenStream("", reader);
+ * //add a FilterStream that sets the LookupAttribute to the end
+ * TokenStream filterStream = new WordLengthLookupFilter(baseStream);
+ * //create the Tagger using the modified analyzer chain.
+ * new Tagger(corpus, filterStream, tagClusterReducer) {
+ *
+ * protected void tagCallback(int startOffset, int endOffset, long docIdsKey) {
+ * //implement the callback
+ * }
+ *
+ * }.process();
+ *
+ */
+public class WordLengthTaggingFilter extends TokenFilter {
+
+ /**
+ * The default minimum length is 3
+ */
+ public static final int DEFAULT_MIN_LENGTH = 3;
+ private final TaggingAttribute lookupAtt = addAttribute(TaggingAttribute.class);
+ private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);
+ private int minLength;
+
+ /**
+ * TokenFilter that only marks tokens with {@link #DEFAULT_MIN_LENGTH} or more
+ * characters to be looked up
+ */
+ public WordLengthTaggingFilter(TokenStream input) {
+ this(input, null);
+ }
+
+ /**
+ * TokenFilter that only marks tokens with at least the parsed minimum number
+ * of characters to be looked up.
+ *
+ * @param input the TokenStream to consume tokens from
+ * @param minLength The minimum length to look up a Token. null
+ * or <= 0 to use the {@link #DEFAULT_MIN_LENGTH}
+ */
+ public WordLengthTaggingFilter(TokenStream input, Integer minLength) {
+ super(input);
+ if (minLength == null || minLength <= 0) {
+ this.minLength = DEFAULT_MIN_LENGTH;
+ } else {
+ this.minLength = minLength;
+ }
+ }
+
+ @Override
+ public final boolean incrementToken() throws IOException {
+ if (input.incrementToken()) {
+ int size = offsetAtt.endOffset() - offsetAtt.startOffset();
+ lookupAtt.setTaggable(size >= minLength);
+ return true;
+ } else {
+ return false;
+ }
+ }
+
+}
diff --git a/solr/core/src/test/org/apache/solr/handler/tagger/WordLengthTaggingFilterFactory.java b/solr/core/src/test/org/apache/solr/handler/tagger/WordLengthTaggingFilterFactory.java
new file mode 100644
index 00000000000..dbfc5381bb6
--- /dev/null
+++ b/solr/core/src/test/org/apache/solr/handler/tagger/WordLengthTaggingFilterFactory.java
@@ -0,0 +1,67 @@
+/*
+ * This software was produced for the U. S. Government
+ * under Contract No. W15P7T-11-C-F600, and is
+ * subject to the Rights in Noncommercial Computer Software
+ * and Noncommercial Computer Software Documentation
+ * Clause 252.227-7014 (JUN 1995)
+ *
+ * Copyright 2013 The MITRE Corporation. All Rights Reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.solr.handler.tagger;
+
+import java.lang.invoke.MethodHandles;
+import java.util.Map;
+
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.util.TokenFilterFactory;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+public class WordLengthTaggingFilterFactory extends TokenFilterFactory {
+
+ private static final Logger log = LoggerFactory.getLogger(MethodHandles.lookup().lookupClass());
+
+ public static final String MIN_LENGTH = "minLength";
+
+ private final Integer minLength;
+
+ public WordLengthTaggingFilterFactory(Map<String, String> args) {
+ super(args);
+ int minLength = -1;
+ Object value = args.get(MIN_LENGTH);
+ if (value != null) {
+ try {
+ minLength = Integer.parseInt(value.toString());
+ } catch (NumberFormatException e) {
+ log.warn("Unable to parse minLength from value 'minLength=\"{}\"'", value);
+
+ }
+ }
+ if (minLength <= 0) {
+ log.info("use default minLength={}", WordLengthTaggingFilter.DEFAULT_MIN_LENGTH);
+ this.minLength = null;
+ } else {
+ log.info("set minLength={}", minLength);
+ this.minLength = minLength;
+ }
+ }
+
+ @Override
+ public TokenStream create(TokenStream input) {
+ return new WordLengthTaggingFilter(input, minLength);
+ }
+
+}
diff --git a/solr/core/src/test/org/apache/solr/handler/tagger/XmlInterpolationTest.java b/solr/core/src/test/org/apache/solr/handler/tagger/XmlInterpolationTest.java
new file mode 100644
index 00000000000..d7dd5dff213
--- /dev/null
+++ b/solr/core/src/test/org/apache/solr/handler/tagger/XmlInterpolationTest.java
@@ -0,0 +1,224 @@
+/*
+ * This software was produced for the U. S. Government
+ * under Contract No. W15P7T-11-C-F600, and is
+ * subject to the Rights in Noncommercial Computer Software
+ * and Noncommercial Computer Software Documentation
+ * Clause 252.227-7014 (JUN 1995)
+ *
+ * Copyright 2013 The MITRE Corporation. All Rights Reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.solr.handler.tagger;
+
+import javax.xml.parsers.DocumentBuilder;
+import javax.xml.parsers.DocumentBuilderFactory;
+import java.io.IOException;
+import java.io.Reader;
+import java.io.StringReader;
+import java.util.ArrayList;
+import java.util.Collections;
+import java.util.List;
+import java.util.Locale;
+
+import org.apache.commons.io.IOUtils;
+import org.apache.lucene.analysis.charfilter.HTMLStripCharFilter;
+import org.apache.lucene.analysis.core.WhitespaceTokenizer;
+import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
+import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
+import org.apache.solr.common.SolrException;
+import org.apache.solr.request.SolrQueryRequest;
+import org.apache.solr.response.SolrQueryResponse;
+import org.junit.BeforeClass;
+import org.junit.Test;
+import org.xml.sax.InputSource;
+
+public class XmlInterpolationTest extends TaggerTestCase {
+
+ private static DocumentBuilder xmlDocBuilder;
+
+
+ @BeforeClass
+ public static void beforeClass() throws Exception {
+ DocumentBuilderFactory xmlDocBuilderFactory = DocumentBuilderFactory.newInstance();
+ xmlDocBuilderFactory.setValidating(true);
+ xmlDocBuilderFactory.setNamespaceAware(true);
+ xmlDocBuilder = xmlDocBuilderFactory.newDocumentBuilder();
+
+ initCore("solrconfig-tagger.xml", "schema-tagger.xml");
+ }
+
+ @Override
+ public void setUp() throws Exception {
+ super.setUp();
+ baseParams.set("field", "name_tagXml");
+ baseParams.set("overlaps", "LONGEST_DOMINANT_RIGHT");
+ baseParams.set("xmlOffsetAdjust", "true");
+ }
+
+ @Test
+ public void test() throws Exception {
+ buildNames("start end");
+
+ assertXmlTag("before start end after ", true);
+ assertXmlTag("before start end after ", true);
+ assertXmlTag("before start end after ", true);
+ assertXmlTag("before start end after ", true);
+ assertXmlTag("before start end after ", true);
+ assertXmlTag("before start end after ", true);//adjacent tags
+ assertXmlTag("before start end after ", true);
+ assertXmlTag("before start end after ", true);
+
+ assertXmlTag("before start
end after ", false);
+ assertXmlTag("before start end after
", false);
+
+ assertXmlTag("before start end after ", true);
+ }
+
+ @Test(expected = SolrException.class)
+ public void testInvalidXml() throws Exception {
+ assertXmlTag("notXml", false);
+ }
+
+ @Test(expected = Exception.class)
+ public void testValidatingXml() throws Exception {
+ validateXml("foo");
+ }
+
+ protected void assertXmlTag(String docText, boolean expected) throws Exception {
+ final SolrQueryRequest req = reqDoc(docText);
+ try { // 5.4 and beyond we can use try-with-resources
+ final SolrQueryResponse rsp = h.queryAndResponse(req.getParams().get("qt"), req);
+ final TestTag[] testTags = pullTagsFromResponse(req, rsp);
+ if (!expected) {
+ assertEquals(0, testTags.length);
+ } else {
+ assertEquals(1, testTags.length);
+ final TestTag tag = testTags[0];
+ validateXml(insertAnchorAtOffsets(docText, tag.startOffset, tag.endOffset, tag.docName));
+ }
+ } finally {
+ req.close();
+ }
+ }
+
+ protected void validateXml(String xml) throws Exception {
+ // the "parse" method also validates XML, will throw an exception if mis-formatted
+ xmlDocBuilder.parse(new InputSource(new StringReader(xml)));
+ }
+
+
+ @Test
+ public void testLuceneHtmlFilterBehavior() {
+ String docText;
+
+ //Close tag adjacent to start & end results in end offset including the close tag. LUCENE-5734
+ docText = "start end ";
+ assertArrayEquals(tagExpect(docText, "start", "end"), analyzeTagOne(docText, "start", "end"));
+
+ //Space after "end" means offset doesn't include
+ docText = "start end ";
+ assertArrayEquals(tagExpect(docText, "start", "end"), analyzeTagOne(docText, "start", "end"));
+
+ //Matches entity at end
+ final String endStr = String.format(Locale.ROOT, "en&#x%02x;", (int) 'd');
+ docText = "start " + endStr + " ";
+ assertArrayEquals(tagExpect(docText, "start", endStr), analyzeTagOne(docText, "start", "end"));
+ //... and at start
+ final String startStr = String.format(Locale.ROOT, "&#x%02x;tart", (int) 's');
+ docText = "" + startStr + " end ";
+ assertArrayEquals(tagExpect(docText, startStr, "end"), analyzeTagOne(docText, "start", "end"));
+
+ //Test ignoring proc instructions & comments. Note: doesn't expand the entity to "start".
+ docText = ""
+ + "]> &start; ";
+ assertArrayEquals(new int[]{-1, -1}, analyzeTagOne(docText, "start", "start"));
+
+ //Test entity behavior
+ docText = " — – & &foo; a b";
+ assertArrayEquals(new String[]{"—", "–", "&", "&foo;", "\u00A0", "a", "b"},
+ analyzeReturnTokens(docText));
+
+ //Observe offset adjustment of trailing entity to end tag
+ docText = "foo bar";
+ assertArrayEquals(tagExpect(docText, "foo", "foo"), analyzeTagOne(docText, "foo", "foo"));
+ }
+
+ private String insertAnchorAtOffsets(String docText, int startOffset, int endOffset, String id) {
+ String insertStart = "";// (normally we'd escape id)
+ String insertEnd = " ";
+ return docText.substring(0, startOffset)
+ + insertStart
+ + docText.substring(startOffset, endOffset)
+ + insertEnd
+ + docText.substring(endOffset);
+ }
+
+ private int[] tagExpect(String docText, String start, String end) {
+ return new int[]{docText.indexOf(start), docText.indexOf(end) + end.length()};
+ }
+
+ private int[] analyzeTagOne(String docText, String start, String end) {
+ int[] result = {-1, -1};
+
+ Reader filter = new HTMLStripCharFilter(new StringReader(docText));
+
+ WhitespaceTokenizer ts = new WhitespaceTokenizer();
+ final CharTermAttribute termAttribute = ts.addAttribute(CharTermAttribute.class);
+ final OffsetAttribute offsetAttribute = ts.addAttribute(OffsetAttribute.class);
+ try {
+ ts.setReader(filter);
+ ts.reset();
+ while (ts.incrementToken()) {
+ final String termString = termAttribute.toString();
+ if (termString.equals(start))
+ result[0] = offsetAttribute.startOffset();
+ if (termString.equals(end)) {
+ result[1] = offsetAttribute.endOffset();
+ return result;
+ }
+ }
+ ts.end();
+ } catch (IOException e) {
+ throw new RuntimeException(e);
+ } finally {
+ IOUtils.closeQuietly(ts);
+ }
+ return result;
+ }
+
+ private String[] analyzeReturnTokens(String docText) {
+ List<String> result = new ArrayList<>();
+
+ Reader filter = new HTMLStripCharFilter(new StringReader(docText),
+ Collections.singleton("unescaped"));
+ WhitespaceTokenizer ts = new WhitespaceTokenizer();
+ final CharTermAttribute termAttribute = ts.addAttribute(CharTermAttribute.class);
+ try {
+ ts.setReader(filter);
+ ts.reset();
+ while (ts.incrementToken()) {
+ result.add(termAttribute.toString());
+ }
+ ts.end();
+ } catch (IOException e) {
+ throw new RuntimeException(e);
+ } finally {
+ IOUtils.closeQuietly(ts);
+ }
+ return result.toArray(new String[result.size()]);
+ }
+
+}
diff --git a/solr/core/src/test/org/apache/solr/search/TestRecovery.java b/solr/core/src/test/org/apache/solr/search/TestRecovery.java
index 1d622076c99..1b79cee61c1 100644
--- a/solr/core/src/test/org/apache/solr/search/TestRecovery.java
+++ b/solr/core/src/test/org/apache/solr/search/TestRecovery.java
@@ -24,7 +24,9 @@ import com.codahale.metrics.Gauge;
import com.codahale.metrics.Meter;
import com.codahale.metrics.Metric;
import com.codahale.metrics.MetricRegistry;
+import org.apache.solr.common.util.TimeSource;
import org.apache.solr.metrics.SolrMetricManager;
+import org.apache.solr.util.TimeOut;
import org.noggit.ObjectBuilder;
import org.slf4j.Logger;
@@ -820,6 +822,7 @@ public class TestRecovery extends SolrTestCaseJ4 {
+"]"
);
+ // Note that v101->v103 are dropped, therefore they are not present in RTG
assertJQ(req("qt","/get", "getVersions","6")
,"=={'versions':["+String.join(",",v206,v205,v201,v200,v105,v104)+"]}"
);
@@ -929,7 +932,6 @@ public class TestRecovery extends SolrTestCaseJ4 {
,"=={'versions':["+v105+","+v104+"]}"
);
- // this time add some docs first before buffering starts (so tlog won't be at pos 0)
updateJ(jsonAdd(sdoc("id","c100", "_version_",v200)), params(DISTRIB_UPDATE_PARAM,FROM_LEADER));
updateJ(jsonAdd(sdoc("id","c101", "_version_",v201)), params(DISTRIB_UPDATE_PARAM,FROM_LEADER));
@@ -957,10 +959,8 @@ public class TestRecovery extends SolrTestCaseJ4 {
+"" +"]"
);
- // The updates that were buffered (but never applied) still appear in recent versions!
- // This is good for some uses, but may not be good for others.
- assertJQ(req("qt","/get", "getVersions","11")
- ,"=={'versions':["+String.join(",",v206,v205,v204,v203,v201,v200,v105,v104,v103,v102,v101)+"]}"
+ assertJQ(req("qt","/get", "getVersions","6")
+ ,"=={'versions':["+String.join(",",v206,v205,v201,v200,v105,v104)+"]}"
);
assertEquals(UpdateLog.State.ACTIVE, ulog.getState()); // leave each test method in a good state
@@ -1008,13 +1008,9 @@ public class TestRecovery extends SolrTestCaseJ4 {
@Test
- public void testBufferingFlags() throws Exception {
+ public void testExistOldBufferLog() throws Exception {
DirectUpdateHandler2.commitOnClose = false;
- final Semaphore logReplayFinish = new Semaphore(0);
-
- UpdateLog.testing_logReplayFinishHook = () -> logReplayFinish.release();
-
SolrQueryRequest req = req();
UpdateHandler uhandler = req.getCore().getUpdateHandler();
@@ -1024,9 +1020,6 @@ public class TestRecovery extends SolrTestCaseJ4 {
String v101 = getNextVersion();
String v102 = getNextVersion();
String v103 = getNextVersion();
- String v114 = getNextVersion();
- String v115 = getNextVersion();
- String v116 = getNextVersion();
String v117 = getNextVersion();
clearIndex();
@@ -1049,14 +1042,10 @@ public class TestRecovery extends SolrTestCaseJ4 {
uhandler = req.getCore().getUpdateHandler();
ulog = uhandler.getUpdateLog();
- logReplayFinish.acquire(); // wait for replay to finish
-
- assertTrue((ulog.getStartingOperation() & UpdateLog.FLAG_GAP) != 0); // since we died while buffering, we should see this last
-
- //
- // Try again to ensure that the previous log replay didn't wipe out our flags
- //
+ // the core does not replay updates from the buffer tlog on startup
+ assertTrue(ulog.existOldBufferLog()); // since we died while buffering, we should see this last
+ // buffer tlog won't be removed on restart
req.close();
h.close();
createCore();
@@ -1065,26 +1054,9 @@ public class TestRecovery extends SolrTestCaseJ4 {
uhandler = req.getCore().getUpdateHandler();
ulog = uhandler.getUpdateLog();
- assertTrue((ulog.getStartingOperation() & UpdateLog.FLAG_GAP) != 0);
-
- // now do some normal non-buffered adds
- updateJ(jsonAdd(sdoc("id","Q4", "_version_",v114)), params(DISTRIB_UPDATE_PARAM,FROM_LEADER));
- updateJ(jsonAdd(sdoc("id","Q5", "_version_",v115)), params(DISTRIB_UPDATE_PARAM,FROM_LEADER));
- updateJ(jsonAdd(sdoc("id","Q6", "_version_",v116)), params(DISTRIB_UPDATE_PARAM,FROM_LEADER));
- assertU(commit());
-
- req.close();
- h.close();
- createCore();
-
- req = req();
- uhandler = req.getCore().getUpdateHandler();
- ulog = uhandler.getUpdateLog();
-
- assertTrue((ulog.getStartingOperation() & UpdateLog.FLAG_GAP) == 0);
+ assertTrue(ulog.existOldBufferLog());
ulog.bufferUpdates();
- // simulate receiving no updates
ulog.applyBufferedUpdates();
updateJ(jsonAdd(sdoc("id","Q7", "_version_",v117)), params(DISTRIB_UPDATE_PARAM,FROM_LEADER)); // do another add to make sure flags are back to normal
@@ -1096,10 +1068,12 @@ public class TestRecovery extends SolrTestCaseJ4 {
uhandler = req.getCore().getUpdateHandler();
ulog = uhandler.getUpdateLog();
- assertTrue((ulog.getStartingOperation() & UpdateLog.FLAG_GAP) == 0); // check flags on Q7
-
- logReplayFinish.acquire();
- assertEquals(UpdateLog.State.ACTIVE, ulog.getState()); // leave each test method in a good state
+ assertFalse(ulog.existOldBufferLog());
+ // Wait (with timeout) for Q7 to get replayed; it was added to the tlog, therefore it will be replayed on restart
+ TimeOut timeout = new TimeOut(10, TimeUnit.SECONDS, TimeSource.NANO_TIME);
+ timeout.waitFor("Timeout waiting for finish replay updates",
+ () -> h.getCore().getUpdateHandler().getUpdateLog().getState() == UpdateLog.State.ACTIVE);
+ assertJQ(req("qt","/get", "id", "Q7") ,"/doc/id==Q7");
} finally {
DirectUpdateHandler2.commitOnClose = true;
UpdateLog.testing_logReplayHook = null;
diff --git a/solr/core/src/test/org/apache/solr/search/TestRecoveryHdfs.java b/solr/core/src/test/org/apache/solr/search/TestRecoveryHdfs.java
index e6bb9a6edb0..1796319295d 100644
--- a/solr/core/src/test/org/apache/solr/search/TestRecoveryHdfs.java
+++ b/solr/core/src/test/org/apache/solr/search/TestRecoveryHdfs.java
@@ -44,6 +44,7 @@ import org.apache.hadoop.hdfs.MiniDFSCluster;
import org.apache.solr.SolrTestCaseJ4;
import org.apache.solr.cloud.hdfs.HdfsTestUtil;
import org.apache.solr.common.util.IOUtils;
+import org.apache.solr.common.util.TimeSource;
import org.apache.solr.request.SolrQueryRequest;
import org.apache.solr.update.DirectUpdateHandler2;
import org.apache.solr.update.HdfsUpdateLog;
@@ -51,6 +52,7 @@ import org.apache.solr.update.UpdateHandler;
import org.apache.solr.update.UpdateLog;
import org.apache.solr.update.processor.DistributedUpdateProcessor.DistribPhase;
import org.apache.solr.util.BadHdfsThreadsFilter;
+import org.apache.solr.util.TimeOut;
import org.junit.AfterClass;
import org.junit.BeforeClass;
import org.junit.Ignore;
@@ -515,13 +517,9 @@ public class TestRecoveryHdfs extends SolrTestCaseJ4 {
@Test
- public void testBufferingFlags() throws Exception {
+ public void testExistOldBufferLog() throws Exception {
DirectUpdateHandler2.commitOnClose = false;
- final Semaphore logReplayFinish = new Semaphore(0);
-
- UpdateLog.testing_logReplayFinishHook = () -> logReplayFinish.release();
-
SolrQueryRequest req = req();
UpdateHandler uhandler = req.getCore().getUpdateHandler();
@@ -548,14 +546,10 @@ public class TestRecoveryHdfs extends SolrTestCaseJ4 {
uhandler = req.getCore().getUpdateHandler();
ulog = uhandler.getUpdateLog();
- logReplayFinish.acquire(); // wait for replay to finish
-
- assertTrue((ulog.getStartingOperation() & UpdateLog.FLAG_GAP) != 0); // since we died while buffering, we should see this last
-
- //
- // Try again to ensure that the previous log replay didn't wipe out our flags
- //
+ // the core no longer replays updates from the buffer tlog on startup
+ assertTrue(ulog.existOldBufferLog()); // since we died while buffering, we should see this last
+ // buffer tlog won't be removed on restart
req.close();
h.close();
createCore();
@@ -564,23 +558,7 @@ public class TestRecoveryHdfs extends SolrTestCaseJ4 {
uhandler = req.getCore().getUpdateHandler();
ulog = uhandler.getUpdateLog();
- assertTrue((ulog.getStartingOperation() & UpdateLog.FLAG_GAP) != 0);
-
- // now do some normal non-buffered adds
- updateJ(jsonAdd(sdoc("id","Q4", "_version_","114")), params(DISTRIB_UPDATE_PARAM,FROM_LEADER));
- updateJ(jsonAdd(sdoc("id","Q5", "_version_","115")), params(DISTRIB_UPDATE_PARAM,FROM_LEADER));
- updateJ(jsonAdd(sdoc("id","Q6", "_version_","116")), params(DISTRIB_UPDATE_PARAM,FROM_LEADER));
- assertU(commit());
-
- req.close();
- h.close();
- createCore();
-
- req = req();
- uhandler = req.getCore().getUpdateHandler();
- ulog = uhandler.getUpdateLog();
-
- assertTrue((ulog.getStartingOperation() & UpdateLog.FLAG_GAP) == 0);
+ assertTrue(ulog.existOldBufferLog());
ulog.bufferUpdates();
// simulate receiving no updates
@@ -595,10 +573,12 @@ public class TestRecoveryHdfs extends SolrTestCaseJ4 {
uhandler = req.getCore().getUpdateHandler();
ulog = uhandler.getUpdateLog();
- assertTrue((ulog.getStartingOperation() & UpdateLog.FLAG_GAP) == 0); // check flags on Q7
-
- logReplayFinish.acquire();
- assertEquals(UpdateLog.State.ACTIVE, ulog.getState()); // leave each test method in a good state
+ assertFalse(ulog.existOldBufferLog());
+ // Wait (with timeout) for Q7 to get replayed; it was added to the tlog, therefore it will be replayed on restart
+ TimeOut timeout = new TimeOut(10, TimeUnit.SECONDS, TimeSource.NANO_TIME);
+ timeout.waitFor("Timeout waiting for finish replay updates",
+ () -> h.getCore().getUpdateHandler().getUpdateLog().getState() == UpdateLog.State.ACTIVE);
+ assertJQ(req("qt","/get", "id", "Q7") ,"/doc/id==Q7");
} finally {
DirectUpdateHandler2.commitOnClose = true;
UpdateLog.testing_logReplayHook = null;
diff --git a/solr/core/src/test/org/apache/solr/search/TestStandardQParsers.java b/solr/core/src/test/org/apache/solr/search/TestStandardQParsers.java
index ff9ffffcdfa..cab9026602a 100644
--- a/solr/core/src/test/org/apache/solr/search/TestStandardQParsers.java
+++ b/solr/core/src/test/org/apache/solr/search/TestStandardQParsers.java
@@ -16,15 +16,15 @@
*/
package org.apache.solr.search;
-import org.apache.lucene.util.LuceneTestCase;
-import org.junit.Test;
-
import java.lang.reflect.Field;
import java.lang.reflect.Modifier;
import java.util.ArrayList;
import java.util.List;
import java.util.Map;
+import org.apache.lucene.util.LuceneTestCase;
+import org.junit.Test;
+
/**
* Check standard query parsers for class loading problems during initialization (NAME field is final and static).
* Because every query plugin extend {@link org.apache.solr.search.QParserPlugin} and contains own instance of {@link org.apache.solr.search.QParserPlugin#standardPlugins},
@@ -50,9 +50,9 @@ public class TestStandardQParsers extends LuceneTestCase {
List<String> notFinal = new ArrayList<>(QParserPlugin.standardPlugins.size());
List<String> mismatch = new ArrayList<>(QParserPlugin.standardPlugins.size());
- for (Map.Entry<String, Class<? extends QParserPlugin>> pair : QParserPlugin.standardPlugins.entrySet()) {
+ for (Map.Entry<String, QParserPlugin> pair : QParserPlugin.standardPlugins.entrySet()) {
String regName = pair.getKey();
- Class<? extends QParserPlugin> clazz = pair.getValue();
+ Class<? extends QParserPlugin> clazz = pair.getValue().getClass();
Field nameField = clazz.getField(FIELD_NAME);
int modifiers = nameField.getModifiers();
diff --git a/solr/core/src/test/org/apache/solr/security/TestPKIAuthenticationPlugin.java b/solr/core/src/test/org/apache/solr/security/TestPKIAuthenticationPlugin.java
index a664cc04205..2d324cbd534 100644
--- a/solr/core/src/test/org/apache/solr/security/TestPKIAuthenticationPlugin.java
+++ b/solr/core/src/test/org/apache/solr/security/TestPKIAuthenticationPlugin.java
@@ -35,7 +35,10 @@ import org.apache.solr.request.LocalSolrQueryRequest;
import org.apache.solr.request.SolrRequestInfo;
import org.apache.solr.response.SolrQueryResponse;
import org.apache.solr.util.CryptoKeys;
-import static org.mockito.Mockito.*;
+
+import static org.mockito.Mockito.any;
+import static org.mockito.Mockito.mock;
+import static org.mockito.Mockito.when;
public class TestPKIAuthenticationPlugin extends SolrTestCaseJ4 {
@@ -45,7 +48,7 @@ public class TestPKIAuthenticationPlugin extends SolrTestCaseJ4 {
Map remoteKeys = new HashMap<>();
public MockPKIAuthenticationPlugin(CoreContainer cores, String node) {
- super(cores, node);
+ super(cores, node, new PublicKeyHandler());
}
@Override
diff --git a/solr/core/src/test/org/apache/solr/update/TransactionLogTest.java b/solr/core/src/test/org/apache/solr/update/TransactionLogTest.java
index 1bf4ad41978..d2b4b26df01 100644
--- a/solr/core/src/test/org/apache/solr/update/TransactionLogTest.java
+++ b/solr/core/src/test/org/apache/solr/update/TransactionLogTest.java
@@ -35,7 +35,7 @@ public class TransactionLogTest extends LuceneTestCase {
transactionLog.lastAddSize = 2000000000;
AddUpdateCommand updateCommand = new AddUpdateCommand(null);
updateCommand.solrDoc = new SolrInputDocument();
- transactionLog.write(updateCommand, 0);
+ transactionLog.write(updateCommand);
}
}
diff --git a/solr/solr-ref-guide/src/collections-api.adoc b/solr/solr-ref-guide/src/collections-api.adoc
index 1e895b287fb..53b6395d3fe 100644
--- a/solr/solr-ref-guide/src/collections-api.adoc
+++ b/solr/solr-ref-guide/src/collections-api.adoc
@@ -1085,6 +1085,48 @@ http://localhost:8983/solr/admin/collections?action=CLUSTERPROP&name=urlScheme&v
----
+=== Deeply Nested Cluster Properties
+
+==== `collectionDefaults`
+It is possible to set cluster-wide default values for certain attributes of a collection.
+
+
+*Example 1: Set/update default values*
+[source]
+----
+curl -X POST -H 'Content-type:application/json' --data-binary '
+{ "set-obj-property" : {
+ "collectionDefaults" : {
+ "numShards" : 2,
+ "nrtReplicas" : 1,
+ "tlogReplicas" : 1,
+ "pullReplicas" : 1,
+
+ }
+}' http://localhost:8983/api/cluster
+----
+
+*Example 2: Unset the value of `nrtReplicas` alone*
+[source]
+----
+curl -X POST -H 'Content-type:application/json' --data-binary '
+{ "set-obj-property" : {
+ "collectionDefaults" : {
+ "nrtReplicas" : null,
+ }
+}' http://localhost:8983/api/cluster
+----
+
+*Example 3: Unset all values in `collectionDefaults`*
+[source]
+----
+curl -X POST -H 'Content-type:application/json' --data-binary '
+{ "set-obj-property" : {
+ "collectionDefaults" : null
+}' http://localhost:8983/api/cluster
+----
+
+
[[collectionprop]]
== COLLECTIONPROP: Collection Properties
diff --git a/solr/solr-ref-guide/src/searching.adoc b/solr/solr-ref-guide/src/searching.adoc
index 145c1a4dcd5..753c2d88038 100644
--- a/solr/solr-ref-guide/src/searching.adoc
+++ b/solr/solr-ref-guide/src/searching.adoc
@@ -1,5 +1,35 @@
= Searching
-:page-children: overview-of-searching-in-solr, velocity-search-ui, relevance, query-syntax-and-parsing, json-request-api, json-facet-api, faceting, highlighting, spell-checking, query-re-ranking, transforming-result-documents, suggester, morelikethis, pagination-of-results, collapse-and-expand-results, result-grouping, result-clustering, spatial-search, the-terms-component, the-term-vector-component, the-stats-component, the-query-elevation-component, response-writers, near-real-time-searching, realtime-get, exporting-result-sets, streaming-expressions, parallel-sql-interface, analytics
+:page-children: overview-of-searching-in-solr, +
+ velocity-search-ui, +
+ relevance, +
+ query-syntax-and-parsing, +
+ json-request-api, +
+ json-facet-api, +
+ faceting, +
+ highlighting, +
+ spell-checking, +
+ query-re-ranking, +
+ transforming-result-documents, +
+ suggester, +
+ morelikethis, +
+ pagination-of-results, +
+ collapse-and-expand-results, +
+ result-grouping, +
+ result-clustering, +
+ spatial-search, +
+ the-terms-component, +
+ the-term-vector-component, +
+ the-stats-component, +
+ the-query-elevation-component, +
+ the-tagger-handler, +
+ response-writers, +
+ near-real-time-searching, +
+ realtime-get, +
+ exporting-result-sets, +
+ streaming-expressions, +
+ parallel-sql-interface, +
+ analytics
+
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
@@ -50,6 +80,7 @@ This section describes how Solr works with search requests. It covers the follow
* <>: How to get term information about specific documents.
* <>: How to return information from numeric fields within a document set.
* <>: How to force documents to the top of the results for certain queries.
+* <>: The SolrTextTagger, for basic named entity tagging in text.
* <>: Detailed information about configuring and using Solr's response writers.
* <>: How to include documents in search results nearly immediately after they are indexed.
* <>: How to get the latest version of a document without opening a searcher.
diff --git a/solr/solr-ref-guide/src/the-tagger-handler.adoc b/solr/solr-ref-guide/src/the-tagger-handler.adoc
new file mode 100644
index 00000000000..14ba8ed6f9f
--- /dev/null
+++ b/solr/solr-ref-guide/src/the-tagger-handler.adoc
@@ -0,0 +1,265 @@
+[[the-tagger-handler]]
+= The Tagger Handler
+
+The "Tagger" Request Handler, AKA the "SolrTextTagger" is a "text tagger".
+Given a dictionary (a Solr index) with a name-like field,
+ you post text to this request handler and it will return every occurrence of one of those names with offsets and other document metadata desired.
+It's used for named entity recognition (NER).
+It doesn't do any NLP (outside of Lucene text analysis) so it's said to be a "naive tagger",
+ but it's definitely useful as-is and a more complete NER or ERD (entity recognition and disambiguation)
+ system can be built with this as a key component.
+The SolrTextTagger might be used on queries for query-understanding or large documents as well.
+
+To get a sense of how to use it, jump to the tutorial below.
+
+The tagger does not yet support a sharded index.
+Tens, perhaps hundreds of millions of names (documents) are supported, mostly limited by memory.
+
+[[tagger-configuration]]
+== Configuration
+
+The Solr schema needs 2 things:
+
+* A unique key field (see ``).
+ Recommended field settings: set `docValues=true`
+* A tag field, a TextField, with `ConcatenateGraphFilterFactory` at the end of the index chain (not the query chain):
+ Set `preservePositionIncrements=false` on that filter.
+ Recommended field settings: `omitNorms=true`, `omitTermFreqAndPositions=true` and `postingsFormat=FST50`
+
+The text field's _index analysis chain_, aside from needing ConcatenateGraphFilterFactory at the end,
+ can otherwise have whatever tokenizer and filters suit your matching preferences.
+It can have multi-word synonyms and use WordDelimiterGraphFilterFactory for example.
+However, do _not_ use FlattenGraphFilterFactory as it will interfere with ConcatenateGraphFilterFactory.
+Position gaps (e.g. stop words) get ignored; it's not (yet) supported for the gap to be significant.
+
+The text field's _query analysis chain_, on the other hand, is more limited.
+There should not be tokens at the same position, thus no synonym expansion -- do that at index time instead.
+Stop words (or any other filter introducing a position gap) are supported.
+At runtime the tagger can be configured to either treat the gap as a tag break or to ignore it.
+
+The Solr config needs the `solr.TaggerRequestHandler` defined, which supports `defaults`, `invariants`, and `appends`
+sections just like the search handler.
+
+[[tagger-parameters]]
+== Tagger Parameters
+
+The tagger's execution is completely configurable with request parameters. Only `field` is required.
+
+`field`::
+ The tag field that serves as the dictionary.
+ This is required; you'll probably specify it in the request handler.
+
+`fq`::
+ You can specify some number of _filter queries_ to limit the dictionary used for tagging.
+ This parameter is the same as is used by the `solr.SearchHandler`.
+
+`rows`::
+ The maximum number of documents to return; for a tag request it defaults to 10000.
+ This parameter is the same as is used by the `solr.SearchHandler`.
+
+`fl`::
+ Solr's standard param for listing the fields to return.
+ This parameter is the same as is used by the `solr.SearchHandler`.
+
+`overlaps`::
+ Choose the algorithm to determine which tags in an overlapping set should be retained, versus being pruned away.
+ Options are:
+
+ * `ALL`: Emit all tags.
+ * `NO_SUB`: Don't emit a tag that is completely within another tag (i.e. no subtag).
+ * `LONGEST_DOMINANT_RIGHT`: Given a cluster of overlapping tags, emit the longest one (by character length).
+ If there is a tie, pick the right-most.
+ Remove any tags overlapping with this tag then repeat the algorithm to potentially find other tags
+ that can be emitted in the cluster.
+
+`matchText`::
+ A boolean indicating whether to return the matched text in the tag response.
+ This will trigger the tagger to fully buffer the input before tagging.
+
+`tagsLimit`::
+ The maximum number of tags to return in the response.
+ Tagging effectively stops after this point.
+ By default this is 1000.
+
+`skipAltTokens`::
+ A boolean flag used to suppress errors that can occur if, for example,
+ you enable synonym expansion at query time in the analyzer, which you normally shouldn't do.
+ Let this default to false unless you know that such tokens can't be avoided.
+
+`ignoreStopwords`::
+ A boolean flag that causes stopwords (or any condition causing positions to skip, like >255 char words)
+ to be ignored as if they weren't there.
+ Otherwise, the behavior is to treat them as breaks in tagging, on the presumption your indexed text-analysis
+ configuration doesn't have a StopWordFilter.
+ If unspecified, the indexed analysis chain is checked for the presence of a StopWordFilter and, if one is found,
+ `ignoreStopwords` defaults to true.
+ You probably shouldn't have a StopWordFilter configured and probably won't need to set this param either.
+
+`xmlOffsetAdjust`::
+ A boolean indicating that the input is XML and furthermore that the offsets of returned tags should be adjusted as
+ necessary to allow for the client to insert an opening and closing element at the tag offset pair.
+ If it isn't possible to do so then the tag will be omitted.
+ You are expected to configure `HTMLStripCharFilterFactory` in the schema when using this option.
+ This will trigger the tagger to fully buffer the input before tagging.
+
+Solr's parameters for controlling the response format are supported, like:
+ `echoParams`, `wt`, `indent`, etc.
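+
+As an illustration of how these parameters combine, the sketch below assumes the `geonames` collection and `/tag`
+ handler built in the tutorial that follows; it keeps only the longest dominant tags, returns the matched text,
+ and limits the dictionary to US names with a filter query:
+
+....
+curl -X POST \
+  'http://localhost:8983/solr/geonames/tag?overlaps=LONGEST_DOMINANT_RIGHT&matchText=true&fq=countrycode:US&fl=id,name,countrycode&wt=json&indent=on' \
+  -H 'Content-Type:text/plain' -d 'I flew from New York to Paris, Texas'
+....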
+
+[[tagger-tutorial-with-geonames]]
+== Tutorial with Geonames
+
+This is a tutorial that demonstrates how to configure and use the text
+tagger with the popular Geonames data set. It's more than a tutorial;
+it's a how-to with information that wasn't described above.
+
+[[tagger-create-and-configure-a-solr-collection]]
+=== Create and Configure a Solr Collection
+
+Create a Solr collection named "geonames". For the tutorial, we'll
+assume the default "data-driven" configuration. It's good for
+experimentation and getting going fast, but it is neither optimal nor
+intended for production.
+
+....
+bin/solr create -c geonames
+....
+
+[[tagger-configuring]]
+==== Configuring
+
+We need to configure the schema first. The "data driven" mode we're
+using allows us to keep this step fairly minimal -- we just need to
+declare a field type, two fields, and a copy-field. The critical part
+up-front is to define the "tag" field type. There are many ways to
+configure text analysis, and we're not going to get into those choices
+here. But an important bit is the `ConcatenateGraphFilterFactory` at the
+end of the index analyzer chain. Another important bit for performance
+is `postingsFormat=FST50`, resulting in a compact FST-based in-memory data
+structure that is especially beneficial for the text tagger.
+
+Schema configuration:
+
+....
+curl -X POST -H 'Content-type:application/json' http://localhost:8983/solr/geonames/schema -d '{
+ "add-field-type":{
+ "name":"tag",
+ "class":"solr.TextField",
+ "postingsFormat":"FST50",
+ "omitNorms":true,
+ "omitTermFreqAndPositions":true,
+ "indexAnalyzer":{
+ "tokenizer":{
+ "class":"solr.StandardTokenizerFactory" },
+ "filters":[
+ {"class":"solr.EnglishPossessiveFilterFactory"},
+ {"class":"solr.ASCIIFoldingFilterFactory"},
+ {"class":"solr.LowerCaseFilterFactory"},
+ {"class":"solr.ConcatenateGraphFilterFactory", "preservePositionIncrements":false }
+ ]},
+ "queryAnalyzer":{
+ "tokenizer":{
+ "class":"solr.StandardTokenizerFactory" },
+ "filters":[
+ {"class":"solr.EnglishPossessiveFilterFactory"},
+ {"class":"solr.ASCIIFoldingFilterFactory"},
+ {"class":"solr.LowerCaseFilterFactory"}
+ ]}
+ },
+
+ "add-field":{ "name":"name", "type":"text_general"},
+
+ "add-field":{ "name":"name_tag", "type":"tag", "stored":false },
+
+ "add-copy-field":{ "source":"name", "dest":[ "name_tag" ]}
+}'
+....
+
+Configure a custom Solr Request Handler:
+
+....
+curl -X POST -H 'Content-type:application/json' http://localhost:8983/solr/geonames/config -d '{
+ "add-requesthandler" : {
+ "name": "/tag",
+ "class":"solr.TaggerRequestHandler",
+ "defaults":{ "field":"name_tag" }
+ }
+}'
+....
+
+[[tagger-load-some-sample-data]]
+=== Load Some Sample Data
+
+We'll go with some Geonames.org data in CSV format. Solr is quite
+flexible in loading data in a variety of formats. This
+http://download.geonames.org/export/dump/cities1000.zip[cities1000.zip]
+should be an almost 7MB file that expands to a cities1000.txt file of around
+22.2MB containing 145k lines, each a city in the world with a population of
+at least 1000.
+
+Using bin/post:
+....
+bin/post -c geonames -type text/csv \
+ -params 'optimize=true&separator=%09&encapsulator=%00&fieldnames=id,name,,alternative_names,latitude,longitude,,,countrycode,,,,,,population,elevation,,timezone,lastupdate' \
+ /tmp/cities1000.txt
+....
+or using curl:
+....
+curl -X POST --data-binary @/path/to/cities1000.txt -H 'Content-type:application/csv' \
+ 'http://localhost:8983/solr/geonames/update?commit=true&optimize=true&separator=%09&encapsulator=%00&fieldnames=id,name,,alternative_names,latitude,longitude,,,countrycode,,,,,,population,elevation,,timezone,lastupdate'
+....
+
+That might take around 35 seconds; it depends. It could be a lot faster if
+the schema were tuned to hold only what we truly need (no text search if
+it's not needed).
+
+In that command we passed `optimize=true` to put the index in a state that
+will make tagging faster. The `encapsulator=%00` is a bit of a hack to
+disable the default double-quote encapsulator.
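+
+As a quick sanity check (a sketch; adjust the host and port if yours differ), a match-all query with `rows=0`
+ should report a `numFound` close to the roughly 145k lines we just indexed:
+
+....
+curl 'http://localhost:8983/solr/geonames/select?q=*:*&rows=0'
+....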
+
+[[tagger-tag-time]]
+=== Tag Time!
+
+This is a trivial example tagging a small piece of text. For more
+options, see the earlier documentation.
+
+....
+curl -X POST \
+ 'http://localhost:8983/solr/geonames/tag?overlaps=NO_SUB&tagsLimit=5000&fl=id,name,countrycode&wt=json&indent=on' \
+ -H 'Content-Type:text/plain' -d 'Hello New York City'
+....
+
+The response should be this (the QTime may vary):
+
+....
+{
+ "responseHeader":{
+ "status":0,
+ "QTime":1},
+ "tagsCount":1,
+ "tags":[[
+ "startOffset",6,
+ "endOffset",19,
+ "ids",["5128581"]]],
+ "response":{"numFound":1,"start":0,"docs":[
+ {
+ "id":"5128581",
+ "name":["New York City"],
+ "countrycode":["US"]}]
+ }}
+....
+
+[[tagger-tips]]
+== Tips
+
+Performance Tips:
+
+* Follow the recommended configuration field settings, especially `postingsFormat=FST50`.
+* "optimize" after loading your dictionary down to 1 Lucene segment, or at least to as few as possible.
+* For bulk tagging lots of documents, there are some strategies, not mutually exclusive:
+** Batch them.
+ The tagger doesn't directly support batching, but as a hack you can send a bunch of documents concatenated with
+ a nonsense word that is not in the dictionary, like "ZZYYXXAABBCC", between them.
+ You'll need to keep track of the character offsets of those separators so you can subtract them from the returned
+ tag offsets; see the sketch after this list.
+** For reducing tagging latency even further, consider embedding Solr with `EmbeddedSolrServer`.
+ See `EmbeddedSolrNoSerializeTest`.
+** Use more than one thread -- perhaps as many as there are CPU cores available to Solr.
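+
+A minimal sketch of the batching hack, assuming the tutorial's `geonames` collection and `/tag` handler
+ (the input file names and the separator token are hypothetical):
+
+....
+# Join two documents with a separator token that cannot match the dictionary.
+SEP=' ZZYYXXAABBCC '
+TEXT="$(cat doc1.txt)${SEP}$(cat doc2.txt)"
+
+# Tag both documents in one request. Any tag offset that falls beyond the first
+# document must be shifted back by the length of doc1.txt plus the separator.
+curl -X POST 'http://localhost:8983/solr/geonames/tag?fl=id,name&wt=json' \
+  -H 'Content-Type:text/plain' --data-binary "$TEXT"
+....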
\ No newline at end of file
diff --git a/solr/solrj/src/java/org/apache/solr/client/solrj/cloud/autoscaling/Clause.java b/solr/solrj/src/java/org/apache/solr/client/solrj/cloud/autoscaling/Clause.java
index c739588d354..8f198bd8bbc 100644
--- a/solr/solrj/src/java/org/apache/solr/client/solrj/cloud/autoscaling/Clause.java
+++ b/solr/solrj/src/java/org/apache/solr/client/solrj/cloud/autoscaling/Clause.java
@@ -140,7 +140,7 @@ public class Clause implements MapWriter, Comparable {
if (this == o) return true;
if (o == null || getClass() != o.getClass()) return false;
Clause that = (Clause)o;
- return compareTo(that) == 0;
+ return Objects.equals(this.original, that.original);
}
void addTags(Collection<String> params) {
diff --git a/solr/solrj/src/java/org/apache/solr/client/solrj/cloud/autoscaling/Policy.java b/solr/solrj/src/java/org/apache/solr/client/solrj/cloud/autoscaling/Policy.java
index fb01cc5e962..60ff0c929be 100644
--- a/solr/solrj/src/java/org/apache/solr/client/solrj/cloud/autoscaling/Policy.java
+++ b/solr/solrj/src/java/org/apache/solr/client/solrj/cloud/autoscaling/Policy.java
@@ -383,11 +383,10 @@ public class Policy implements MapWriter {
return p.compare(r1, r2, false);
});
} catch (Exception e) {
- LOG.error("Exception! prefs = {}, recent r1 = {}, r2 = {}, compare : {} matrix = {}",
+ LOG.error("Exception! prefs = {}, recent r1 = {}, r2 = {}, matrix = {}",
clusterPreferences,
- lastComparison[0].node,
- lastComparison[1].node,
- p.compare(lastComparison[0],lastComparison[1], false ),
+ lastComparison[0],
+ lastComparison[1],
Utils.toJSONString(Utils.getDeepCopy(tmpMatrix, 6, false)));
throw e;
}
diff --git a/solr/solrj/src/java/org/apache/solr/common/cloud/ClusterProperties.java b/solr/solrj/src/java/org/apache/solr/common/cloud/ClusterProperties.java
index 87896daad5b..446923b81de 100644
--- a/solr/solrj/src/java/org/apache/solr/common/cloud/ClusterProperties.java
+++ b/solr/solrj/src/java/org/apache/solr/common/cloud/ClusterProperties.java
@@ -21,6 +21,7 @@ import java.io.IOException;
import java.lang.invoke.MethodHandles;
import java.util.Collections;
import java.util.LinkedHashMap;
+import java.util.List;
import java.util.Map;
import org.apache.solr.common.SolrException;
@@ -67,6 +68,23 @@ public class ClusterProperties {
return value;
}
+ /**
+ * Read the value of a cluster property, returning a default if it is not set
+ *
+ * @param key the property name or the full path to the property as a list of parts.
+ * @param defaultValue the default value
+ @param <T> the type of the property
+ * @return the property value
+ * @throws IOException if there is an error reading the value from the cluster
+ */
+ @SuppressWarnings("unchecked")
+ public <T> T getClusterProperty(List<String> key, T defaultValue) throws IOException {
+ T value = (T) Utils.getObjectByPath(getClusterProperties(), false, key);
+ if (value == null)
+ return defaultValue;
+ return value;
+ }
+
/**
* Return the cluster properties
* @throws IOException if there is an error reading properties from the cluster
diff --git a/solr/solrj/src/java/org/apache/solr/common/cloud/ZkStateReader.java b/solr/solrj/src/java/org/apache/solr/common/cloud/ZkStateReader.java
index 6b65c344d4c..a86c5e28448 100644
--- a/solr/solrj/src/java/org/apache/solr/common/cloud/ZkStateReader.java
+++ b/solr/solrj/src/java/org/apache/solr/common/cloud/ZkStateReader.java
@@ -961,6 +961,12 @@ public class ZkStateReader implements Closeable {
return value;
}
+ /** Same as the above, but allows a full JSON path as a list of parts
+ *
+ * @param keyPath path to the property, e.g. ["collectionDefaults", "numShards"]
+ * @param defaultValue a default value to use if no such property exists
+ * @return the cluster property, or a default if the property is not set
+ */
public <T> T getClusterProperty(List<String> keyPath, T defaultValue) {
T value = (T) Utils.getObjectByPath( clusterProperties, false, keyPath);
if (value == null)