diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt index cd11e7dbeb5..9eecc4206b0 100644 --- a/lucene/CHANGES.txt +++ b/lucene/CHANGES.txt @@ -202,6 +202,18 @@ New Features IndexFileDeleter already accounts for that for existing files which we can now use to also take pending deletes into account which ensures that all file generations per segment always go forward. (Simon Willnauer) + +* LUCENE-7960: Add preserveOriginal option to the NGram and EdgeNGram filters. + (Ingomar Wesp, Shawn Heisey via Robert Muir) + +* LUCENE-8335: Enforce soft-deletes field up-front. Soft deletes field must be marked + as such once it's introduced and can't be changed after the fact. + (Nhat Nguyen via Simon Willnauer) + +* LUCENE-8332: New ConcatenateGraphFilter for concatenating all tokens into one (or more + in the event of a graph input). This is useful for fast analyzed exact-match lookup, + suggesters, and as a component of a named entity recognition system. This was excised + out of CompletionTokenStream in the NRT doc suggester. (David Smiley, Jim Ferenczi) Bug Fixes diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/ConcatenateGraphFilter.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/ConcatenateGraphFilter.java new file mode 100644 index 00000000000..b6c4f223eb4 --- /dev/null +++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/ConcatenateGraphFilter.java @@ -0,0 +1,375 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.lucene.analysis.miscellaneous; + +import java.io.IOException; + +import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.TokenStreamToAutomaton; +import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; +import org.apache.lucene.analysis.tokenattributes.OffsetAttribute; +import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute; +import org.apache.lucene.analysis.tokenattributes.TermToBytesRefAttribute; +import org.apache.lucene.util.AttributeImpl; +import org.apache.lucene.util.AttributeReflector; +import org.apache.lucene.util.BytesRef; +import org.apache.lucene.util.BytesRefBuilder; +import org.apache.lucene.util.CharsRefBuilder; +import org.apache.lucene.util.IntsRef; +import org.apache.lucene.util.automaton.Automaton; +import org.apache.lucene.util.automaton.LimitedFiniteStringsIterator; +import org.apache.lucene.util.automaton.Operations; +import org.apache.lucene.util.automaton.TooComplexToDeterminizeException; +import org.apache.lucene.util.automaton.Transition; +import org.apache.lucene.util.fst.Util; + +/** + * Concatenates/Joins every incoming token with a separator into one output token for every path through the + * token stream (which is a graph). In simple cases this yields one token, but in the presence of any tokens with + * a zero positionIncrement (e.g. synonyms) it will be more. This filter uses the token bytes, position increment, + * and position length of the incoming stream. Other attributes are not used or manipulated. 
+ * + * @lucene.experimental + */ +public final class ConcatenateGraphFilter extends TokenStream { + + /* + * Token stream which converts a provided token stream to an automaton. + * The accepted strings enumeration from the automaton are available through the + * {@link org.apache.lucene.analysis.tokenattributes.TermToBytesRefAttribute} attribute + * The token stream uses a {@link org.apache.lucene.analysis.tokenattributes.PayloadAttribute} to store + * a completion's payload (see {@link ConcatenateGraphFilter#setPayload(org.apache.lucene.util.BytesRef)}) + */ + + /** + * Represents the separation between tokens, if + * preserveSep is true. + */ + public final static int SEP_LABEL = TokenStreamToAutomaton.POS_SEP; + public final static int DEFAULT_MAX_GRAPH_EXPANSIONS = Operations.DEFAULT_MAX_DETERMINIZED_STATES; + public final static boolean DEFAULT_PRESERVE_SEP = true; + public final static boolean DEFAULT_PRESERVE_POSITION_INCREMENTS = true; + + private final BytesRefBuilderTermAttribute bytesAtt = addAttribute(BytesRefBuilderTermAttribute.class); + private final PositionIncrementAttribute posIncrAtt = addAttribute(PositionIncrementAttribute.class); + private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class); + + private final TokenStream inputTokenStream; + private final boolean preserveSep; + private final boolean preservePositionIncrements; + private final int maxGraphExpansions; + + private LimitedFiniteStringsIterator finiteStrings; + private CharTermAttribute charTermAttribute; + private boolean wasReset = false; + private int endOffset; + + /** + * Creates a token stream to convert input to a token stream + * of accepted strings by its token stream graph. + *

+ * This constructor uses the default settings of the constants in this class. + */ + public ConcatenateGraphFilter(TokenStream inputTokenStream) { + this(inputTokenStream, DEFAULT_PRESERVE_SEP, DEFAULT_PRESERVE_POSITION_INCREMENTS, DEFAULT_MAX_GRAPH_EXPANSIONS); + } + + /** + * Creates a token stream to convert input to a token stream + * of accepted strings by its token stream graph. + * + * @param inputTokenStream The input/incoming TokenStream + * @param preserveSep Whether {@link #SEP_LABEL} should separate the input tokens in the concatenated token + * @param preservePositionIncrements Whether to add an empty token for missing positions. + * The effect is a consecutive {@link #SEP_LABEL}. + * When false, it's as if there were no missing positions + * (we pretend the surrounding tokens were adjacent). + * @param maxGraphExpansions If the tokenStream graph has more than this many possible paths through, then we'll throw + * {@link TooComplexToDeterminizeException} to preserve the stability and memory of the + * machine. 
+ * @throws TooComplexToDeterminizeException if the tokenStream graph has more than {@code maxGraphExpansions} + * expansions + * + */ + public ConcatenateGraphFilter(TokenStream inputTokenStream, boolean preserveSep, boolean preservePositionIncrements, int maxGraphExpansions) { + // Don't call the super(input) ctor - this is a true delegate and has a new attribute source since we consume + // the input stream entirely in the first call to incrementToken + this.inputTokenStream = inputTokenStream; + this.preserveSep = preserveSep; + this.preservePositionIncrements = preservePositionIncrements; + this.maxGraphExpansions = maxGraphExpansions; + } + + @Override + public void reset() throws IOException { + super.reset(); + // we only capture this if we really need it to save the UTF-8 to UTF-16 conversion + charTermAttribute = getAttribute(CharTermAttribute.class); // may return null + wasReset = true; + } + + @Override + public boolean incrementToken() throws IOException { + if (finiteStrings == null) { + if (wasReset == false) { + throw new IllegalStateException("reset() missing before incrementToken"); + } + // lazy init/consume + Automaton automaton = toAutomaton(); // calls reset(), incrementToken() repeatedly, and end() on inputTokenStream + finiteStrings = new LimitedFiniteStringsIterator(automaton, maxGraphExpansions); + //note: would be nice to know the startOffset but toAutomaton doesn't capture it. We'll assume 0 + endOffset = inputTokenStream.getAttribute(OffsetAttribute.class).endOffset(); + } + + IntsRef string = finiteStrings.next(); + if (string == null) { + return false; + } + + clearAttributes(); + + if (finiteStrings.size() > 1) { // if number of iterated strings so far is more than one... 
+ posIncrAtt.setPositionIncrement(0); // stacked + } + + offsetAtt.setOffset(0, endOffset); + + Util.toBytesRef(string, bytesAtt.builder()); // now we have UTF-8 + if (charTermAttribute != null) { + charTermAttribute.setLength(0); + charTermAttribute.append(bytesAtt.toUTF16()); + } + + return true; + } + + @Override + public void end() throws IOException { + super.end(); + if (finiteStrings == null) { // thus inputTokenStream hasn't yet received end() + inputTokenStream.end(); // the input TS may really want to see "end()" called even if incrementToken hasn't. + } // else we already eagerly consumed inputTokenStream including end() + if (endOffset != -1) { + offsetAtt.setOffset(0, endOffset); + } + } + + @Override + public void close() throws IOException { + super.close(); + //delegate lifecycle. Note toAutomaton does not close the stream + inputTokenStream.close(); + finiteStrings = null; + wasReset = false;//reset + endOffset = -1;//reset + } + + /** + * Converts the tokenStream to an automaton, treating the transition labels as utf-8. Does *not* close it. + */ + public Automaton toAutomaton() throws IOException { + return toAutomaton(false); + } + + /** + * Converts the tokenStream to an automaton. Does *not* close it. + */ + public Automaton toAutomaton(boolean unicodeAware) throws IOException { + // TODO refactor this + // maybe we could hook up a modified automaton from TermAutomatonQuery here? 
+ + // Create corresponding automaton: labels are bytes + // from each analyzed token, with byte 0 used as + // separator between tokens: + final TokenStreamToAutomaton tsta; + if (preserveSep) { + tsta = new EscapingTokenStreamToAutomaton(SEP_LABEL); + } else { + // When we're not preserving sep, we don't steal 0xff + // byte, so we don't need to do any escaping: + tsta = new TokenStreamToAutomaton(); + } + tsta.setPreservePositionIncrements(preservePositionIncrements); + tsta.setUnicodeArcs(unicodeAware); + + Automaton automaton = tsta.toAutomaton(inputTokenStream); + + // TODO: we can optimize this somewhat by determinizing + // while we convert + automaton = replaceSep(automaton, preserveSep, SEP_LABEL); + // This automaton should not blow up during determinize: + return Operations.determinize(automaton, maxGraphExpansions); + } + + /** + * Just escapes the {@link #SEP_LABEL} byte with an extra. + */ + private static final class EscapingTokenStreamToAutomaton extends TokenStreamToAutomaton { + + final BytesRefBuilder spare = new BytesRefBuilder(); + final byte sepLabel; + + public EscapingTokenStreamToAutomaton(int sepLabel) { + assert sepLabel <= Byte.MAX_VALUE; + this.sepLabel = (byte) sepLabel; + } + + @Override + protected BytesRef changeToken(BytesRef in) { + int upto = 0; + for (int i = 0; i < in.length; i++) { + byte b = in.bytes[in.offset + i]; + if (b == sepLabel) { + spare.grow(upto + 2); + spare.setByteAt(upto++, sepLabel); + spare.setByteAt(upto++, b); + } else { + spare.grow(upto + 1); + spare.setByteAt(upto++, b); + } + } + spare.setLength(upto); + return spare.get(); + } + } + + // Replaces SEP with epsilon or remaps them if + // we were asked to preserve them: + private static Automaton replaceSep(Automaton a, boolean preserveSep, int sepLabel) { + + Automaton result = new Automaton(); + + // Copy all states over + int numStates = a.getNumStates(); + for (int s = 0; s < numStates; s++) { + result.createState(); + result.setAccept(s, 
a.isAccept(s)); + } + + // Go in reverse topo sort so we know we only have to + // make one pass: + Transition t = new Transition(); + int[] topoSortStates = Operations.topoSortStates(a); + for (int i = 0; i < topoSortStates.length; i++) { + int state = topoSortStates[topoSortStates.length - 1 - i]; + int count = a.initTransition(state, t); + for (int j = 0; j < count; j++) { + a.getNextTransition(t); + if (t.min == TokenStreamToAutomaton.POS_SEP) { + assert t.max == TokenStreamToAutomaton.POS_SEP; + if (preserveSep) { + // Remap to SEP_LABEL: + result.addTransition(state, t.dest, sepLabel); + } else { + result.addEpsilon(state, t.dest); + } + } else if (t.min == TokenStreamToAutomaton.HOLE) { + assert t.max == TokenStreamToAutomaton.HOLE; + + // Just remove the hole: there will then be two + // SEP tokens next to each other, which will only + // match another hole at search time. Note that + // it will also match an empty-string token ... if + // that's somehow a problem we can always map HOLE + // to a dedicated byte (and escape it in the + // input). + result.addEpsilon(state, t.dest); + } else { + result.addTransition(state, t.dest, t.min, t.max); + } + } + } + + result.finishState(); + + return result; + } + + /** + * Attribute providing access to the term builder and UTF-16 conversion + * @lucene.internal + */ + public interface BytesRefBuilderTermAttribute extends TermToBytesRefAttribute { + /** + * Returns the builder from which the term is derived. 
+ */ + BytesRefBuilder builder(); + + /** + * Returns the term represented as UTF-16 + */ + CharSequence toUTF16(); + } + + /** + * Implementation of {@link BytesRefBuilderTermAttribute} + * @lucene.internal + */ + public static final class BytesRefBuilderTermAttributeImpl extends AttributeImpl implements BytesRefBuilderTermAttribute, TermToBytesRefAttribute { + private final BytesRefBuilder bytes = new BytesRefBuilder(); + private transient CharsRefBuilder charsRef; + + /** + * Sole constructor + * no-op + */ + public BytesRefBuilderTermAttributeImpl() { + } + + @Override + public BytesRefBuilder builder() { + return bytes; + } + + @Override + public BytesRef getBytesRef() { + return bytes.get(); + } + + @Override + public void clear() { + bytes.clear(); + } + + @Override + public void copyTo(AttributeImpl target) { + BytesRefBuilderTermAttributeImpl other = (BytesRefBuilderTermAttributeImpl) target; + other.bytes.copyBytes(bytes); + } + + @Override + public AttributeImpl clone() { + BytesRefBuilderTermAttributeImpl other = new BytesRefBuilderTermAttributeImpl(); + copyTo(other); + return other; + } + + @Override + public void reflectWith(AttributeReflector reflector) { + reflector.reflect(TermToBytesRefAttribute.class, "bytes", getBytesRef()); + } + + @Override + public CharSequence toUTF16() { + if (charsRef == null) { + charsRef = new CharsRefBuilder(); + } + charsRef.copyUTF8Bytes(getBytesRef()); + return charsRef.get(); + } + } +} diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/ConcatenateGraphFilterFactory.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/ConcatenateGraphFilterFactory.java new file mode 100644 index 00000000000..5d8ccbacf3d --- /dev/null +++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/ConcatenateGraphFilterFactory.java @@ -0,0 +1,70 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. 
See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.lucene.analysis.miscellaneous; + +import java.util.Map; + +import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.util.TokenFilterFactory; +import org.apache.lucene.util.automaton.TooComplexToDeterminizeException; + +/** + * Factory for {@link ConcatenateGraphFilter}. + * + *

+ * @see ConcatenateGraphFilter + * @since 7.4.0 + */ +public class ConcatenateGraphFilterFactory extends TokenFilterFactory { + + private boolean preserveSep; + private boolean preservePositionIncrements; + private int maxGraphExpansions; + + public ConcatenateGraphFilterFactory(Map args) { + super(args); + + preserveSep = getBoolean(args, "preserveSep", ConcatenateGraphFilter.DEFAULT_PRESERVE_SEP); + preservePositionIncrements = getBoolean(args, "preservePositionIncrements", ConcatenateGraphFilter.DEFAULT_PRESERVE_POSITION_INCREMENTS); + maxGraphExpansions = getInt(args, "maxGraphExpansions", ConcatenateGraphFilter.DEFAULT_MAX_GRAPH_EXPANSIONS); + + if (!args.isEmpty()) { + throw new IllegalArgumentException("Unknown parameters: " + args); + } + } + + @Override + public TokenStream create(TokenStream input) { + return new ConcatenateGraphFilter(input, preserveSep, preservePositionIncrements, maxGraphExpansions); + } +} diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/FingerprintFilter.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/FingerprintFilter.java index dfe06c88fbf..71dab429191 100644 --- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/FingerprintFilter.java +++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/FingerprintFilter.java @@ -81,8 +81,7 @@ public class FingerprintFilter extends TokenFilter { @Override public final boolean incrementToken() throws IOException { - if (uniqueTerms != null) { - // We have already built the single output token - there's no more + if (inputEnded) { return false; } boolean result = buildSingleOutputToken(); @@ -177,6 +176,7 @@ public class FingerprintFilter extends TokenFilter { } }); + //TODO lets append directly to termAttribute? 
StringBuilder sb = new StringBuilder(); for (Object item : items) { if (sb.length() >= 1) { diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/ngram/EdgeNGramFilterFactory.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/ngram/EdgeNGramFilterFactory.java index 020b85bb5e9..db6a22a7cdd 100644 --- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/ngram/EdgeNGramFilterFactory.java +++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/ngram/EdgeNGramFilterFactory.java @@ -29,19 +29,21 @@ import org.apache.lucene.analysis.util.TokenFilterFactory; * <fieldType name="text_edgngrm" class="solr.TextField" positionIncrementGap="100"> * <analyzer> * <tokenizer class="solr.WhitespaceTokenizerFactory"/> - * <filter class="solr.EdgeNGramFilterFactory" minGramSize="1" maxGramSize="1"/> + * <filter class="solr.EdgeNGramFilterFactory" minGramSize="1" maxGramSize="2" preserveOriginal="true"/> * </analyzer> * </fieldType> */ public class EdgeNGramFilterFactory extends TokenFilterFactory { private final int maxGramSize; private final int minGramSize; + private final boolean preserveOriginal; /** Creates a new EdgeNGramFilterFactory */ public EdgeNGramFilterFactory(Map args) { super(args); - minGramSize = getInt(args, "minGramSize", EdgeNGramTokenFilter.DEFAULT_MIN_GRAM_SIZE); - maxGramSize = getInt(args, "maxGramSize", EdgeNGramTokenFilter.DEFAULT_MAX_GRAM_SIZE); + minGramSize = requireInt(args, "minGramSize"); + maxGramSize = requireInt(args, "maxGramSize"); + preserveOriginal = getBoolean(args, "preserveOriginal", EdgeNGramTokenFilter.DEFAULT_PRESERVE_ORIGINAL); if (!args.isEmpty()) { throw new IllegalArgumentException("Unknown parameters: " + args); } @@ -49,6 +51,6 @@ public class EdgeNGramFilterFactory extends TokenFilterFactory { @Override public TokenFilter create(TokenStream input) { - return new EdgeNGramTokenFilter(input, minGramSize, maxGramSize); + return new EdgeNGramTokenFilter(input, minGramSize, maxGramSize, 
preserveOriginal); } } diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/ngram/EdgeNGramTokenFilter.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/ngram/EdgeNGramTokenFilter.java index 56efd897d17..613f8a173ce 100644 --- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/ngram/EdgeNGramTokenFilter.java +++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/ngram/EdgeNGramTokenFilter.java @@ -32,29 +32,36 @@ import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute; * supplementary characters. */ public final class EdgeNGramTokenFilter extends TokenFilter { - public static final int DEFAULT_MAX_GRAM_SIZE = 1; - public static final int DEFAULT_MIN_GRAM_SIZE = 1; + public static final boolean DEFAULT_PRESERVE_ORIGINAL = false; private final int minGram; private final int maxGram; + private final boolean preserveOriginal; + private char[] curTermBuffer; private int curTermLength; - private int curCodePointCount; + private int curTermCodePointCount; private int curGramSize; - private int savePosIncr; + private int curPosIncr; private State state; private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class); private final PositionIncrementAttribute posIncrAtt = addAttribute(PositionIncrementAttribute.class); /** - * Creates EdgeNGramTokenFilter that can generate n-grams in the sizes of the given range - * + * Creates an EdgeNGramTokenFilter that, for a given input term, produces all + * edge n-grams with lengths >= minGram and <= maxGram. Will + * optionally preserve the original term when its length is outside of the + * defined range. 
+ * * @param input {@link TokenStream} holding the input to be tokenized - * @param minGram the smallest n-gram to generate - * @param maxGram the largest n-gram to generate + * @param minGram the minimum length of the generated n-grams + * @param maxGram the maximum length of the generated n-grams + * @param preserveOriginal Whether or not to keep the original term when it + * is outside the min/max size range. */ - public EdgeNGramTokenFilter(TokenStream input, int minGram, int maxGram) { + public EdgeNGramTokenFilter( + TokenStream input, int minGram, int maxGram, boolean preserveOriginal) { super(input); if (minGram < 1) { @@ -67,6 +74,18 @@ public final class EdgeNGramTokenFilter extends TokenFilter { this.minGram = minGram; this.maxGram = maxGram; + this.preserveOriginal = preserveOriginal; + } + + /** + * Creates an EdgeNGramTokenFilter that produces edge n-grams of the given + * size. + * + * @param input {@link TokenStream} holding the input to be tokenized + * @param gramSize the n-gram size to generate. + */ + public EdgeNGramTokenFilter(TokenStream input, int gramSize) { + this(input, gramSize, gramSize, DEFAULT_PRESERVE_ORIGINAL); } @Override @@ -75,32 +94,46 @@ public final class EdgeNGramTokenFilter extends TokenFilter { if (curTermBuffer == null) { if (!input.incrementToken()) { return false; - } else { - curTermBuffer = termAtt.buffer().clone(); - curTermLength = termAtt.length(); - curCodePointCount = Character.codePointCount(termAtt, 0, termAtt.length()); - curGramSize = minGram; - state = captureState(); - savePosIncr += posIncrAtt.getPositionIncrement(); } + state = captureState(); + + curTermLength = termAtt.length(); + curTermCodePointCount = Character.codePointCount(termAtt, 0, curTermLength); + curPosIncr += posIncrAtt.getPositionIncrement(); + + if (preserveOriginal && curTermCodePointCount < minGram) { + // Token is shorter than minGram, but we'd still like to keep it. 
+ posIncrAtt.setPositionIncrement(curPosIncr); + curPosIncr = 0; + return true; + } + + curTermBuffer = termAtt.buffer().clone(); + curGramSize = minGram; } - if (curGramSize <= maxGram) { // if we have hit the end of our n-gram size range, quit - if (curGramSize <= curCodePointCount) { // if the remaining input is too short, we can't generate any n-grams - // grab gramSize chars from front or back + + if (curGramSize <= curTermCodePointCount) { + if (curGramSize <= maxGram) { // curGramSize is between minGram and maxGram restoreState(state); // first ngram gets increment, others don't - if (curGramSize == minGram) { - posIncrAtt.setPositionIncrement(savePosIncr); - savePosIncr = 0; - } else { - posIncrAtt.setPositionIncrement(0); - } + posIncrAtt.setPositionIncrement(curPosIncr); + curPosIncr = 0; + final int charLength = Character.offsetByCodePoints(curTermBuffer, 0, curTermLength, 0, curGramSize); termAtt.copyBuffer(curTermBuffer, 0, charLength); curGramSize++; return true; } + else if (preserveOriginal) { + // Token is longer than maxGram, but we'd still like to keep it. + restoreState(state); + posIncrAtt.setPositionIncrement(0); + termAtt.copyBuffer(curTermBuffer, 0, curTermLength); + curTermBuffer = null; + return true; + } } + // Done with this input token, get next token on the next iteration. 
curTermBuffer = null; } } @@ -109,6 +142,6 @@ public final class EdgeNGramTokenFilter extends TokenFilter { public void reset() throws IOException { super.reset(); curTermBuffer = null; - savePosIncr = 0; + curPosIncr = 0; } } diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/ngram/NGramFilterFactory.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/ngram/NGramFilterFactory.java index 2064716b78b..9a681dfaf3e 100644 --- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/ngram/NGramFilterFactory.java +++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/ngram/NGramFilterFactory.java @@ -29,19 +29,21 @@ import org.apache.lucene.analysis.util.TokenFilterFactory; * <fieldType name="text_ngrm" class="solr.TextField" positionIncrementGap="100"> * <analyzer> * <tokenizer class="solr.WhitespaceTokenizerFactory"/> - * <filter class="solr.NGramFilterFactory" minGramSize="1" maxGramSize="2"/> + * <filter class="solr.NGramFilterFactory" minGramSize="1" maxGramSize="2" preserveOriginal="true"/> * </analyzer> * </fieldType> */ public class NGramFilterFactory extends TokenFilterFactory { private final int maxGramSize; private final int minGramSize; + private final boolean preserveOriginal; /** Creates a new NGramFilterFactory */ public NGramFilterFactory(Map args) { super(args); - minGramSize = getInt(args, "minGramSize", NGramTokenFilter.DEFAULT_MIN_NGRAM_SIZE); - maxGramSize = getInt(args, "maxGramSize", NGramTokenFilter.DEFAULT_MAX_NGRAM_SIZE); + minGramSize = requireInt(args, "minGramSize"); + maxGramSize = requireInt(args, "maxGramSize"); + preserveOriginal = getBoolean(args, "preserveOriginal", NGramTokenFilter.DEFAULT_PRESERVE_ORIGINAL); if (!args.isEmpty()) { throw new IllegalArgumentException("Unknown parameters: " + args); } @@ -49,6 +51,6 @@ public class NGramFilterFactory extends TokenFilterFactory { @Override public TokenFilter create(TokenStream input) { - return new NGramTokenFilter(input, minGramSize, maxGramSize); + return new NGramTokenFilter(input, minGramSize, 
maxGramSize); + return new NGramTokenFilter(input, minGramSize, maxGramSize, preserveOriginal); } } diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/ngram/NGramTokenFilter.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/ngram/NGramTokenFilter.java index a2e0aa7e588..5b6147b8ea0 100644 --- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/ngram/NGramTokenFilter.java +++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/ngram/NGramTokenFilter.java @@ -21,7 +21,6 @@ import java.io.IOException; import org.apache.lucene.analysis.TokenFilter; import org.apache.lucene.analysis.TokenStream; -import org.apache.lucene.analysis.miscellaneous.CodepointCountFilter; import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute; @@ -40,30 +39,41 @@ import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute; * override {@link NGramTokenizer#isTokenChar(int)} to perform pre-tokenization. */ public final class NGramTokenFilter extends TokenFilter { - public static final int DEFAULT_MIN_NGRAM_SIZE = 1; - public static final int DEFAULT_MAX_NGRAM_SIZE = 2; + public static final boolean DEFAULT_PRESERVE_ORIGINAL = false; - private final int minGram, maxGram; + private final int minGram; + private final int maxGram; + private final boolean preserveOriginal; private char[] curTermBuffer; private int curTermLength; - private int curCodePointCount; + private int curTermCodePointCount; private int curGramSize; private int curPos; - private int curPosInc; + private int curPosIncr; private State state; private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class); - private final PositionIncrementAttribute posIncAtt; + private final PositionIncrementAttribute posIncrAtt = addAttribute(PositionIncrementAttribute.class); /** - * Creates NGramTokenFilter with given min and max n-grams. 
+ * Creates an NGramTokenFilter that, for a given input term, produces all + * contained n-grams with lengths >= minGram and <= maxGram. Will + * optionally preserve the original term when its length is outside of the + * defined range. + * + * Note: Care must be taken when choosing minGram and maxGram; depending + * on the input token size, this filter potentially produces a huge number + * of terms. + * * @param input {@link TokenStream} holding the input to be tokenized - * @param minGram the smallest n-gram to generate - * @param maxGram the largest n-gram to generate + * @param minGram the minimum length of the generated n-grams + * @param maxGram the maximum length of the generated n-grams + * @param preserveOriginal Whether or not to keep the original term when it + * is shorter than minGram or longer than maxGram */ - public NGramTokenFilter(TokenStream input, int minGram, int maxGram) { - super(new CodepointCountFilter(input, minGram, Integer.MAX_VALUE)); + public NGramTokenFilter(TokenStream input, int minGram, int maxGram, boolean preserveOriginal) { + super(input); if (minGram < 1) { throw new IllegalArgumentException("minGram must be greater than zero"); } @@ -72,51 +82,69 @@ public final class NGramTokenFilter extends TokenFilter { } this.minGram = minGram; this.maxGram = maxGram; - - posIncAtt = addAttribute(PositionIncrementAttribute.class); + this.preserveOriginal = preserveOriginal; } - + /** - * Creates NGramTokenFilter with default min and max n-grams. + * Creates an NGramTokenFilter that produces n-grams of the indicated size. + * * @param input {@link TokenStream} holding the input to be tokenized + * @param gramSize the size of n-grams to generate. */ - public NGramTokenFilter(TokenStream input) { - this(input, DEFAULT_MIN_NGRAM_SIZE, DEFAULT_MAX_NGRAM_SIZE); + public NGramTokenFilter(TokenStream input, int gramSize) { + this(input, gramSize, gramSize, DEFAULT_PRESERVE_ORIGINAL); } - /** Returns the next token in the stream, or null at EOS. 
*/ @Override public final boolean incrementToken() throws IOException { while (true) { if (curTermBuffer == null) { if (!input.incrementToken()) { return false; - } else { - curTermBuffer = termAtt.buffer().clone(); - curTermLength = termAtt.length(); - curCodePointCount = Character.codePointCount(termAtt, 0, termAtt.length()); - curGramSize = minGram; - curPos = 0; - curPosInc = posIncAtt.getPositionIncrement(); - state = captureState(); } + state = captureState(); + + curTermLength = termAtt.length(); + curTermCodePointCount = Character.codePointCount(termAtt, 0, termAtt.length()); + curPosIncr += posIncrAtt.getPositionIncrement(); + curPos = 0; + + if (preserveOriginal && curTermCodePointCount < minGram) { + // Token is shorter than minGram, but we'd still like to keep it. + posIncrAtt.setPositionIncrement(curPosIncr); + curPosIncr = 0; + return true; + } + + curTermBuffer = termAtt.buffer().clone(); + curGramSize = minGram; } - if (curGramSize > maxGram || (curPos + curGramSize) > curCodePointCount) { + if (curGramSize > maxGram || (curPos + curGramSize) > curTermCodePointCount) { ++curPos; curGramSize = minGram; } - if ((curPos + curGramSize) <= curCodePointCount) { + if ((curPos + curGramSize) <= curTermCodePointCount) { restoreState(state); final int start = Character.offsetByCodePoints(curTermBuffer, 0, curTermLength, 0, curPos); final int end = Character.offsetByCodePoints(curTermBuffer, 0, curTermLength, start, curGramSize); termAtt.copyBuffer(curTermBuffer, start, end - start); - posIncAtt.setPositionIncrement(curPosInc); - curPosInc = 0; + posIncrAtt.setPositionIncrement(curPosIncr); + curPosIncr = 0; curGramSize++; return true; } - curTermBuffer = null; + else if (preserveOriginal && curTermCodePointCount > maxGram) { + // Token is longer than maxGram, but we'd still like to keep it. 
+ restoreState(state); + posIncrAtt.setPositionIncrement(0); + termAtt.copyBuffer(curTermBuffer, 0, curTermLength); + curTermBuffer = null; + return true; + } + + // Done with this input token, get next token on next iteration. + curTermBuffer = null; } } @@ -124,5 +152,6 @@ public final class NGramTokenFilter extends TokenFilter { public void reset() throws IOException { super.reset(); curTermBuffer = null; + curPosIncr = 0; } } diff --git a/lucene/analysis/common/src/resources/META-INF/services/org.apache.lucene.analysis.util.TokenFilterFactory b/lucene/analysis/common/src/resources/META-INF/services/org.apache.lucene.analysis.util.TokenFilterFactory index 18119202179..df868a0a1e6 100644 --- a/lucene/analysis/common/src/resources/META-INF/services/org.apache.lucene.analysis.util.TokenFilterFactory +++ b/lucene/analysis/common/src/resources/META-INF/services/org.apache.lucene.analysis.util.TokenFilterFactory @@ -64,6 +64,7 @@ org.apache.lucene.analysis.minhash.MinHashFilterFactory org.apache.lucene.analysis.miscellaneous.ASCIIFoldingFilterFactory org.apache.lucene.analysis.miscellaneous.CapitalizationFilterFactory org.apache.lucene.analysis.miscellaneous.CodepointCountFilterFactory +org.apache.lucene.analysis.miscellaneous.ConcatenateGraphFilterFactory org.apache.lucene.analysis.miscellaneous.DateRecognizerFilterFactory org.apache.lucene.analysis.miscellaneous.DelimitedTermFrequencyTokenFilterFactory org.apache.lucene.analysis.miscellaneous.FingerprintFilterFactory diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/core/TestBugInSomething.java b/lucene/analysis/common/src/test/org/apache/lucene/analysis/core/TestBugInSomething.java index 1d17237fe3f..6cdff4bd5c9 100644 --- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/core/TestBugInSomething.java +++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/core/TestBugInSomething.java @@ -236,7 +236,7 @@ public class TestBugInSomething extends BaseTokenStreamTestCase { 
//TokenStream stream = new SopTokenFilter(tokenizer); TokenStream stream = new ShingleFilter(tokenizer, 5); //stream = new SopTokenFilter(stream); - stream = new NGramTokenFilter(stream, 55, 83); + stream = new NGramTokenFilter(stream, 55, 83, false); //stream = new SopTokenFilter(stream); return new TokenStreamComponents(tokenizer, stream); } diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/core/TestRandomChains.java b/lucene/analysis/common/src/test/org/apache/lucene/analysis/core/TestRandomChains.java index 8cb159129ef..d94b39607bb 100644 --- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/core/TestRandomChains.java +++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/core/TestRandomChains.java @@ -72,6 +72,7 @@ import org.apache.lucene.analysis.compound.hyphenation.HyphenationTree; import org.apache.lucene.analysis.hunspell.Dictionary; import org.apache.lucene.analysis.hunspell.TestHunspellStemFilter; import org.apache.lucene.analysis.minhash.MinHashFilter; +import org.apache.lucene.analysis.miscellaneous.ConcatenateGraphFilter; import org.apache.lucene.analysis.miscellaneous.ConditionalTokenFilter; import org.apache.lucene.analysis.miscellaneous.DelimitedTermFrequencyTokenFilter; import org.apache.lucene.analysis.miscellaneous.FingerprintFilter; @@ -119,10 +120,10 @@ public class TestRandomChains extends BaseTokenStreamTestCase { private static final Set> avoidConditionals = new HashSet<>(); static { - // Fingerprint filter needs to consume the whole tokenstream, so conditionals don't make sense here + // These filters need to consume the whole tokenstream, so conditionals don't make sense here avoidConditionals.add(FingerprintFilter.class); - // Ditto MinHashFilter avoidConditionals.add(MinHashFilter.class); + avoidConditionals.add(ConcatenateGraphFilter.class); } private static final Map,Predicate> brokenConstructors = new HashMap<>(); @@ -156,7 +157,7 @@ public class TestRandomChains extends 
BaseTokenStreamTestCase { return !((Boolean) args[2]); // args are broken if consumeAllTokens is false }); for (Class c : Arrays.>asList( - // doesn't actual reset itself! + // doesn't actually reset itself! TODO this statement is probably obsolete as of LUCENE-6121 ? CachingTokenFilter.class, // LUCENE-8092: doesn't handle graph inputs CJKBigramFilter.class, diff --git a/lucene/suggest/src/test/org/apache/lucene/search/suggest/document/CompletionTokenStreamTest.java b/lucene/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestConcatenateGraphFilter.java similarity index 52% rename from lucene/suggest/src/test/org/apache/lucene/search/suggest/document/CompletionTokenStreamTest.java rename to lucene/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestConcatenateGraphFilter.java index 6f558d1985d..453dcbf9dab 100644 --- a/lucene/suggest/src/test/org/apache/lucene/search/suggest/document/CompletionTokenStreamTest.java +++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestConcatenateGraphFilter.java @@ -14,50 +14,42 @@ * See the License for the specific language governing permissions and * limitations under the License. 
*/ -package org.apache.lucene.search.suggest.document; +package org.apache.lucene.analysis.miscellaneous; import java.io.IOException; import java.io.StringReader; import org.apache.lucene.analysis.BaseTokenStreamTestCase; import org.apache.lucene.analysis.MockTokenizer; +import org.apache.lucene.analysis.StopFilter; import org.apache.lucene.analysis.TokenFilter; -import org.apache.lucene.analysis.TokenStream; import org.apache.lucene.analysis.Tokenizer; import org.apache.lucene.analysis.synonym.SynonymFilter; import org.apache.lucene.analysis.synonym.SynonymMap; -import org.apache.lucene.analysis.tokenattributes.PayloadAttribute; -import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute; -import org.apache.lucene.analysis.tokenattributes.TypeAttribute; -import org.apache.lucene.util.BytesRef; import org.apache.lucene.util.CharsRef; import org.apache.lucene.util.CharsRefBuilder; import org.junit.Test; -public class CompletionTokenStreamTest extends BaseTokenStreamTestCase { +public class TestConcatenateGraphFilter extends BaseTokenStreamTestCase { + private static final char SEP_LABEL = (char) ConcatenateGraphFilter.SEP_LABEL; + @Test public void testBasic() throws Exception { Tokenizer tokenStream = new MockTokenizer(MockTokenizer.WHITESPACE, true); String input = "mykeyword"; - BytesRef payload = new BytesRef("payload"); tokenStream.setReader(new StringReader(input)); - CompletionTokenStream completionTokenStream = new CompletionTokenStream(tokenStream); - completionTokenStream.setPayload(payload); - PayloadAttrToTypeAttrFilter stream = new PayloadAttrToTypeAttrFilter(completionTokenStream); - assertTokenStreamContents(stream, new String[] {input}, null, null, new String[] {payload.utf8ToString()}, new int[] { 1 }, null, null); + ConcatenateGraphFilter stream = new ConcatenateGraphFilter(tokenStream); + assertTokenStreamContents(stream, new String[] {input}, null, null, new int[] { 1 }); } @Test public void testWithNoPreserveSep() throws 
Exception { Tokenizer tokenStream = new MockTokenizer(MockTokenizer.WHITESPACE, true); String input = "mykeyword another keyword"; - BytesRef payload = new BytesRef("payload"); tokenStream.setReader(new StringReader(input)); - CompletionTokenStream completionTokenStream = new CompletionTokenStream(tokenStream, false, false, 100); - completionTokenStream.setPayload(payload); - PayloadAttrToTypeAttrFilter stream = new PayloadAttrToTypeAttrFilter(completionTokenStream); - assertTokenStreamContents(stream, new String[] {"mykeywordanotherkeyword"}, null, null, new String[] {payload.utf8ToString()}, new int[] { 1 }, null, null); + ConcatenateGraphFilter stream = new ConcatenateGraphFilter(tokenStream, false, false, 100); + assertTokenStreamContents(stream, new String[] {"mykeywordanotherkeyword"}, null, null, new int[] { 1 }); } @Test @@ -65,17 +57,14 @@ public class CompletionTokenStreamTest extends BaseTokenStreamTestCase { Tokenizer tokenStream = new MockTokenizer(MockTokenizer.WHITESPACE, true); String input = "mykeyword another keyword"; tokenStream.setReader(new StringReader(input)); - BytesRef payload = new BytesRef("payload"); - CompletionTokenStream completionTokenStream = new CompletionTokenStream(tokenStream); - completionTokenStream.setPayload(payload); - PayloadAttrToTypeAttrFilter stream = new PayloadAttrToTypeAttrFilter(completionTokenStream); + ConcatenateGraphFilter stream = new ConcatenateGraphFilter(tokenStream); CharsRefBuilder builder = new CharsRefBuilder(); builder.append("mykeyword"); - builder.append(((char) CompletionAnalyzer.SEP_LABEL)); + builder.append(SEP_LABEL); builder.append("another"); - builder.append(((char) CompletionAnalyzer.SEP_LABEL)); + builder.append(SEP_LABEL); builder.append("keyword"); - assertTokenStreamContents(stream, new String[]{builder.toCharsRef().toString()}, null, null, new String[]{payload.utf8ToString()}, new int[]{1}, null, null); + assertTokenStreamContents(stream, new String[]{builder.toCharsRef().toString()}, 
null, null, new int[]{1}); } @Test @@ -85,11 +74,8 @@ public class CompletionTokenStreamTest extends BaseTokenStreamTestCase { Tokenizer tokenizer = new MockTokenizer(MockTokenizer.WHITESPACE, true); tokenizer.setReader(new StringReader("mykeyword")); SynonymFilter filter = new SynonymFilter(tokenizer, builder.build(), true); - CompletionTokenStream completionTokenStream = new CompletionTokenStream(filter); - BytesRef payload = new BytesRef("payload"); - completionTokenStream.setPayload(payload); - PayloadAttrToTypeAttrFilter stream = new PayloadAttrToTypeAttrFilter(completionTokenStream); - assertTokenStreamContents(stream, new String[] {"mykeyword", "mysynonym"}, null, null, new String[] {payload.utf8ToString(), payload.utf8ToString()}, new int[] { 1, 1 }, null, null); + ConcatenateGraphFilter stream = new ConcatenateGraphFilter(filter); + assertTokenStreamContents(stream, new String[] {"mykeyword", "mysynonym"}, null, null, new int[] { 1, 0 }); } @Test @@ -100,26 +86,48 @@ public class CompletionTokenStreamTest extends BaseTokenStreamTestCase { String input = "mykeyword another keyword"; tokenStream.setReader(new StringReader(input)); SynonymFilter filter = new SynonymFilter(tokenStream, builder.build(), true); - BytesRef payload = new BytesRef("payload"); - CompletionTokenStream completionTokenStream = new CompletionTokenStream(filter, true, false, 100); - completionTokenStream.setPayload(payload); - PayloadAttrToTypeAttrFilter stream = new PayloadAttrToTypeAttrFilter(completionTokenStream); + ConcatenateGraphFilter stream = new ConcatenateGraphFilter(filter, true, false, 100); String[] expectedOutputs = new String[2]; CharsRefBuilder expectedOutput = new CharsRefBuilder(); expectedOutput.append("mykeyword"); - expectedOutput.append(((char) CompletionAnalyzer.SEP_LABEL)); + expectedOutput.append(SEP_LABEL); expectedOutput.append("another"); - expectedOutput.append(((char) CompletionAnalyzer.SEP_LABEL)); + expectedOutput.append(SEP_LABEL); 
expectedOutput.append("keyword"); expectedOutputs[0] = expectedOutput.toCharsRef().toString(); expectedOutput.clear(); expectedOutput.append("mysynonym"); - expectedOutput.append(((char) CompletionAnalyzer.SEP_LABEL)); + expectedOutput.append(SEP_LABEL); expectedOutput.append("another"); - expectedOutput.append(((char) CompletionAnalyzer.SEP_LABEL)); + expectedOutput.append(SEP_LABEL); expectedOutput.append("keyword"); expectedOutputs[1] = expectedOutput.toCharsRef().toString(); - assertTokenStreamContents(stream, expectedOutputs, null, null, new String[]{payload.utf8ToString(), payload.utf8ToString()}, new int[]{1, 1}, null, null); + assertTokenStreamContents(stream, expectedOutputs, null, null, new int[]{1, 0}); + } + + @Test + public void testWithStopword() throws Exception { + for (boolean preservePosInc : new boolean[]{true, false}) { + Tokenizer tokenStream = new MockTokenizer(MockTokenizer.WHITESPACE, true); + String input = "a mykeyword a keyword"; //LUCENE-8344 add "a" + tokenStream.setReader(new StringReader(input)); + TokenFilter tokenFilter = new StopFilter(tokenStream, StopFilter.makeStopSet("a")); + ConcatenateGraphFilter concatStream = new ConcatenateGraphFilter(tokenFilter, true, preservePosInc, 10); + CharsRefBuilder builder = new CharsRefBuilder(); + if (preservePosInc) { + builder.append(SEP_LABEL); + } + builder.append("mykeyword"); + builder.append(SEP_LABEL); + if (preservePosInc) { + builder.append(SEP_LABEL); + } + builder.append("keyword"); +// if (preservePosInc) { LUCENE-8344 uncomment +// builder.append(SEP_LABEL); +// } + assertTokenStreamContents(concatStream, new String[]{builder.toCharsRef().toString()}); + } } @Test @@ -137,41 +145,24 @@ public class CompletionTokenStreamTest extends BaseTokenStreamTestCase { tokenizer.setReader(new StringReader(valueBuilder.toString())); SynonymFilter filter = new SynonymFilter(tokenizer, builder.build(), true); - CompletionTokenStream completionTokenStream = new CompletionTokenStream(filter); - 
completionTokenStream.setPayload(new BytesRef()); - PayloadAttrToTypeAttrFilter stream = new PayloadAttrToTypeAttrFilter(completionTokenStream); - stream.reset(); - CompletionTokenStream.BytesRefBuilderTermAttribute attr = stream.addAttribute(CompletionTokenStream.BytesRefBuilderTermAttribute.class); - PositionIncrementAttribute posAttr = stream.addAttribute(PositionIncrementAttribute.class); - int maxPos = 0; - int count = 0; - while(stream.incrementToken()) { - count++; - assertNotNull(attr.getBytesRef()); - assertTrue(attr.getBytesRef().length > 0); - maxPos += posAttr.getPositionIncrement(); - } - stream.close(); - assertEquals(count, 256); - assertEquals(count, maxPos); - } - - public final static class PayloadAttrToTypeAttrFilter extends TokenFilter { - private PayloadAttribute payload = addAttribute(PayloadAttribute.class); - private TypeAttribute type = addAttribute(TypeAttribute.class); - - protected PayloadAttrToTypeAttrFilter(TokenStream input) { - super(input); - } - - @Override - public boolean incrementToken() throws IOException { - if (input.incrementToken()) { - // we move them over so we can assert them more easily in the tests - type.setType(payload.getPayload().utf8ToString()); - return true; + int count; + try (ConcatenateGraphFilter stream = new ConcatenateGraphFilter(filter)) { + stream.reset(); + ConcatenateGraphFilter.BytesRefBuilderTermAttribute attr = stream.addAttribute(ConcatenateGraphFilter.BytesRefBuilderTermAttribute.class); + count = 0; + while (stream.incrementToken()) { + count++; + assertNotNull(attr.getBytesRef()); + assertTrue(attr.getBytesRef().length > 0); } - return false; } + assertEquals(count, 256); } + + public void testEmpty() throws IOException { + Tokenizer tokenizer = whitespaceMockTokenizer(""); + ConcatenateGraphFilter filter = new ConcatenateGraphFilter(tokenizer); + assertTokenStreamContents(filter, new String[0]); + } + } diff --git 
a/lucene/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestConcatenateGraphFilterFactory.java b/lucene/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestConcatenateGraphFilterFactory.java new file mode 100644 index 00000000000..1e149f03b1b --- /dev/null +++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestConcatenateGraphFilterFactory.java @@ -0,0 +1,83 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.lucene.analysis.miscellaneous; + +import java.io.Reader; +import java.io.StringReader; + +import org.apache.lucene.analysis.MockTokenizer; +import org.apache.lucene.analysis.StopFilter; +import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.util.BaseTokenStreamFactoryTestCase; + +public class TestConcatenateGraphFilterFactory extends BaseTokenStreamFactoryTestCase { + public void test() throws Exception { + for (final boolean consumeAll : new boolean[]{true, false}) { + final String input = "A1 B2 A1 D4 C3"; + Reader reader = new StringReader(input); + MockTokenizer tokenizer = new MockTokenizer(MockTokenizer.WHITESPACE, false); + tokenizer.setReader(reader); + tokenizer.setEnableChecks(consumeAll); + TokenStream stream = tokenizer; + stream = tokenFilterFactory("ConcatenateGraph").create(stream); + assertTokenStreamContents(stream, new String[]{input.replace(' ', (char) ConcatenateGraphFilter.SEP_LABEL)}); + } + } + + public void testPreserveSep() throws Exception { + final String input = "A1 B2 A1 D4 C3"; + final String output = "A1A1D4C3"; + Reader reader = new StringReader(input); + MockTokenizer tokenizer = new MockTokenizer(MockTokenizer.WHITESPACE, false); + tokenizer.setReader(reader); + TokenStream stream = tokenizer; + stream = new StopFilter(stream, StopFilter.makeStopSet("B2")); + stream = tokenFilterFactory("ConcatenateGraph", + "preserveSep", "false" + ).create(stream); + assertTokenStreamContents(stream, new String[]{output}); + } + + public void testPreservePositionIncrements() throws Exception { + final String input = "A1 B2 A1 D4 C3"; + final String output = "A1 A1 D4 C3"; + Reader reader = new StringReader(input); + MockTokenizer tokenizer = new MockTokenizer(MockTokenizer.WHITESPACE, false); + tokenizer.setReader(reader); + TokenStream stream = tokenizer; + stream = new StopFilter(stream, StopFilter.makeStopSet("B2")); + stream = tokenFilterFactory("ConcatenateGraph", + 
"preservePositionIncrements", "false" + ).create(stream); + assertTokenStreamContents(stream, new String[]{output.replace(' ', (char) ConcatenateGraphFilter.SEP_LABEL)}); + } + + public void testRequired() throws Exception { + // no params are required + tokenFilterFactory("ConcatenateGraph"); + } + + /** + * Test that bogus arguments result in exception + */ + public void testBogusArguments() throws Exception { + IllegalArgumentException expected = expectThrows(IllegalArgumentException.class, () -> + tokenFilterFactory("ConcatenateGraph", "bogusArg", "bogusValue")); + assertTrue(expected.getMessage().contains("Unknown parameters")); + } +} diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestFingerprintFilter.java b/lucene/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestFingerprintFilter.java index 450447ac9ba..76bd617f408 100644 --- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestFingerprintFilter.java +++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestFingerprintFilter.java @@ -69,4 +69,13 @@ public class TestFingerprintFilter extends BaseTokenStreamTestCase { } } + public void testEmpty() throws Exception { + for (final boolean consumeAll : new boolean[] { true, false }) { + MockTokenizer tokenizer = whitespaceMockTokenizer(""); + tokenizer.setEnableChecks(consumeAll); + TokenStream stream = new FingerprintFilter(tokenizer); + assertTokenStreamContents(stream, new String[0]); + } + } + } diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/ngram/EdgeNGramTokenFilterTest.java b/lucene/analysis/common/src/test/org/apache/lucene/analysis/ngram/EdgeNGramTokenFilterTest.java index d7536e7050f..fd1949a0359 100644 --- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/ngram/EdgeNGramTokenFilterTest.java +++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/ngram/EdgeNGramTokenFilterTest.java @@ -50,49 +50,73 
@@ public class EdgeNGramTokenFilterTest extends BaseTokenStreamTestCase { public void testInvalidInput() throws Exception { expectThrows(IllegalArgumentException.class, () -> { - new EdgeNGramTokenFilter(input, 0, 0); + new EdgeNGramTokenFilter(input, 0, 0, false); }); } public void testInvalidInput2() throws Exception { expectThrows(IllegalArgumentException.class, () -> { - new EdgeNGramTokenFilter(input, 2, 1); + new EdgeNGramTokenFilter(input, 2, 1, false); }); } public void testInvalidInput3() throws Exception { expectThrows(IllegalArgumentException.class, () -> { - new EdgeNGramTokenFilter(input, -1, 2); + new EdgeNGramTokenFilter(input, -1, 2, false); }); } public void testFrontUnigram() throws Exception { - EdgeNGramTokenFilter tokenizer = new EdgeNGramTokenFilter(input, 1, 1); + EdgeNGramTokenFilter tokenizer = new EdgeNGramTokenFilter(input, 1, 1, false); assertTokenStreamContents(tokenizer, new String[]{"a"}, new int[]{0}, new int[]{5}); } public void testOversizedNgrams() throws Exception { - EdgeNGramTokenFilter tokenizer = new EdgeNGramTokenFilter(input, 6, 6); + EdgeNGramTokenFilter tokenizer = new EdgeNGramTokenFilter(input, 6, 6, false); assertTokenStreamContents(tokenizer, new String[0], new int[0], new int[0]); } + public void testOversizedNgramsPreserveOriginal() throws Exception { + EdgeNGramTokenFilter tokenizer = new EdgeNGramTokenFilter(input, 6, 6, true); + assertTokenStreamContents(tokenizer, new String[] {"abcde"}, new int[] {0}, new int[] {5}); + } + + public void testPreserveOriginal() throws Exception { + final String inputString = "a bcd efghi jk"; + + { // preserveOriginal = false + TokenStream ts = whitespaceMockTokenizer(inputString); + EdgeNGramTokenFilter filter = new EdgeNGramTokenFilter(ts, 2, 3, false); + assertTokenStreamContents(filter, + new String[] { "bc", "bcd", "ef", "efg", "jk" }, + new int[] { 2, 2, 6, 6, 12 }, + new int[] { 5, 5, 11, 11, 14 }, + new int[] { 2, 0, 1, 0, 1 }); + } + + { // preserveOriginal = true + 
TokenStream ts = whitespaceMockTokenizer(inputString); + EdgeNGramTokenFilter filter = new EdgeNGramTokenFilter(ts, 2, 3, true); + assertTokenStreamContents(filter, + new String[] { "a", "bc", "bcd", "ef", "efg", "efghi", "jk" }, + new int[] { 0, 2, 2, 6, 6, 6, 12 }, + new int[] { 1, 5, 5, 11, 11, 11, 14 }, + new int[] { 1, 1, 0, 1, 0, 0, 1 }); + } + } + public void testFrontRangeOfNgrams() throws Exception { - EdgeNGramTokenFilter tokenizer = new EdgeNGramTokenFilter(input, 1, 3); + EdgeNGramTokenFilter tokenizer = new EdgeNGramTokenFilter(input, 1, 3, false); assertTokenStreamContents(tokenizer, new String[]{"a","ab","abc"}, new int[]{0,0,0}, new int[]{5,5,5}); } public void testFilterPositions() throws Exception { TokenStream ts = whitespaceMockTokenizer("abcde vwxyz"); - EdgeNGramTokenFilter tokenizer = new EdgeNGramTokenFilter(ts, 1, 3); + EdgeNGramTokenFilter tokenizer = new EdgeNGramTokenFilter(ts, 1, 3, false); assertTokenStreamContents(tokenizer, - new String[]{"a","ab","abc","v","vw","vwx"}, - new int[]{0,0,0,6,6,6}, - new int[]{5,5,5,11,11,11}, - null, - new int[]{1,0,0,1,0,0}, - null, - null, - false); + new String[] {"a","ab","abc","v","vw","vwx"}, + new int[] {0, 0, 0, 6, 6, 6}, + new int[] {5, 5, 5, 11, 11, 11}); } private static class PositionFilter extends TokenFilter { @@ -128,7 +152,7 @@ public class EdgeNGramTokenFilterTest extends BaseTokenStreamTestCase { public void testFirstTokenPositionIncrement() throws Exception { TokenStream ts = whitespaceMockTokenizer("a abc"); ts = new PositionFilter(ts); // All but first token will get 0 position increment - EdgeNGramTokenFilter filter = new EdgeNGramTokenFilter(ts, 2, 3); + EdgeNGramTokenFilter filter = new EdgeNGramTokenFilter(ts, 2, 3, false); // The first token "a" will not be output, since it's smaller than the mingram size of 2. 
// The second token on input to EdgeNGramTokenFilter will have position increment of 0, // which should be increased to 1, since this is the first output token in the stream. @@ -142,14 +166,14 @@ public class EdgeNGramTokenFilterTest extends BaseTokenStreamTestCase { public void testSmallTokenInStream() throws Exception { input = whitespaceMockTokenizer("abc de fgh"); - EdgeNGramTokenFilter tokenizer = new EdgeNGramTokenFilter(input, 3, 3); + EdgeNGramTokenFilter tokenizer = new EdgeNGramTokenFilter(input, 3, 3, false); assertTokenStreamContents(tokenizer, new String[]{"abc","fgh"}, new int[]{0,7}, new int[]{3,10}); } public void testReset() throws Exception { WhitespaceTokenizer tokenizer = new WhitespaceTokenizer(); tokenizer.setReader(new StringReader("abcde")); - EdgeNGramTokenFilter filter = new EdgeNGramTokenFilter(tokenizer, 1, 3); + EdgeNGramTokenFilter filter = new EdgeNGramTokenFilter(tokenizer, 1, 3, false); assertTokenStreamContents(filter, new String[]{"a","ab","abc"}, new int[]{0,0,0}, new int[]{5,5,5}); tokenizer.setReader(new StringReader("abcde")); assertTokenStreamContents(filter, new String[]{"a","ab","abc"}, new int[]{0,0,0}, new int[]{5,5,5}); @@ -160,13 +184,14 @@ public class EdgeNGramTokenFilterTest extends BaseTokenStreamTestCase { for (int i = 0; i < 10; i++) { final int min = TestUtil.nextInt(random(), 2, 10); final int max = TestUtil.nextInt(random(), min, 20); + final boolean preserveOriginal = TestUtil.nextInt(random(), 0, 1) % 2 == 0; Analyzer a = new Analyzer() { @Override protected TokenStreamComponents createComponents(String fieldName) { Tokenizer tokenizer = new MockTokenizer(MockTokenizer.WHITESPACE, false); return new TokenStreamComponents(tokenizer, - new EdgeNGramTokenFilter(tokenizer, min, max)); + new EdgeNGramTokenFilter(tokenizer, min, max, preserveOriginal)); } }; checkRandomData(random(), a, 100*RANDOM_MULTIPLIER); @@ -181,7 +206,7 @@ public class EdgeNGramTokenFilterTest extends BaseTokenStreamTestCase { protected 
TokenStreamComponents createComponents(String fieldName) { Tokenizer tokenizer = new KeywordTokenizer(); return new TokenStreamComponents(tokenizer, - new EdgeNGramTokenFilter(tokenizer, 2, 15)); + new EdgeNGramTokenFilter(tokenizer, 2, 15, false)); } }; checkAnalysisConsistency(random, a, random.nextBoolean(), ""); @@ -192,7 +217,7 @@ public class EdgeNGramTokenFilterTest extends BaseTokenStreamTestCase { TokenStream tk = new LetterTokenizer(); ((Tokenizer)tk).setReader(new StringReader("abc d efgh ij klmno p q")); tk = new ShingleFilter(tk); - tk = new EdgeNGramTokenFilter(tk, 7, 10); + tk = new EdgeNGramTokenFilter(tk, 7, 10, false); assertTokenStreamContents(tk, new String[] { "efgh ij", "ij klmn", "ij klmno", "klmno p" }, new int[] { 6,11,11,14 }, @@ -204,23 +229,44 @@ public class EdgeNGramTokenFilterTest extends BaseTokenStreamTestCase { } public void testSupplementaryCharacters() throws IOException { - final String s = TestUtil.randomUnicodeString(random(), 10); - final int codePointCount = s.codePointCount(0, s.length()); - final int minGram = TestUtil.nextInt(random(), 1, 3); - final int maxGram = TestUtil.nextInt(random(), minGram, 10); - TokenStream tk = new KeywordTokenizer(); - ((Tokenizer)tk).setReader(new StringReader(s)); - tk = new EdgeNGramTokenFilter(tk, minGram, maxGram); - final CharTermAttribute termAtt = tk.addAttribute(CharTermAttribute.class); - final OffsetAttribute offsetAtt = tk.addAttribute(OffsetAttribute.class); - tk.reset(); - for (int i = minGram; i <= Math.min(codePointCount, maxGram); ++i) { - assertTrue(tk.incrementToken()); - assertEquals(0, offsetAtt.startOffset()); - assertEquals(s.length(), offsetAtt.endOffset()); - final int end = Character.offsetByCodePoints(s, 0, i); - assertEquals(s.substring(0, end), termAtt.toString()); + for (int i = 0; i < 20; i++) { + final String s = TestUtil.randomUnicodeString(random(), 10); + final int codePointCount = s.codePointCount(0, s.length()); + final int minGram = 
TestUtil.nextInt(random(), 1, 3); + final int maxGram = TestUtil.nextInt(random(), minGram, 10); + final boolean preserveOriginal = TestUtil.nextInt(random(), 0, 1) % 2 == 0; + + TokenStream tk = new KeywordTokenizer(); + ((Tokenizer)tk).setReader(new StringReader(s)); + tk = new EdgeNGramTokenFilter(tk, minGram, maxGram, preserveOriginal); + final CharTermAttribute termAtt = tk.addAttribute(CharTermAttribute.class); + final OffsetAttribute offsetAtt = tk.addAttribute(OffsetAttribute.class); + tk.reset(); + + if (codePointCount < minGram && preserveOriginal) { + assertTrue(tk.incrementToken()); + assertEquals(0, offsetAtt.startOffset()); + assertEquals(s.length(), offsetAtt.endOffset()); + assertEquals(s, termAtt.toString()); + } + + for (int j = minGram; j <= Math.min(codePointCount, maxGram); j++) { + assertTrue(tk.incrementToken()); + assertEquals(0, offsetAtt.startOffset()); + assertEquals(s.length(), offsetAtt.endOffset()); + final int end = Character.offsetByCodePoints(s, 0, j); + assertEquals(s.substring(0, end), termAtt.toString()); + } + + if (codePointCount > maxGram && preserveOriginal) { + assertTrue(tk.incrementToken()); + assertEquals(0, offsetAtt.startOffset()); + assertEquals(s.length(), offsetAtt.endOffset()); + assertEquals(s, termAtt.toString()); + } + + assertFalse(tk.incrementToken()); + tk.close(); } - assertFalse(tk.incrementToken()); } } diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/ngram/NGramTokenFilterTest.java b/lucene/analysis/common/src/test/org/apache/lucene/analysis/ngram/NGramTokenFilterTest.java index d8591a9726e..2a473961c06 100644 --- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/ngram/NGramTokenFilterTest.java +++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/ngram/NGramTokenFilterTest.java @@ -48,28 +48,28 @@ public class NGramTokenFilterTest extends BaseTokenStreamTestCase { public void testInvalidInput() throws Exception { expectThrows(IllegalArgumentException.class, () 
-> { - new NGramTokenFilter(input, 2, 1); + new NGramTokenFilter(input, 2, 1, false); }); } public void testInvalidInput2() throws Exception { expectThrows(IllegalArgumentException.class, () -> { - new NGramTokenFilter(input, 0, 1); + new NGramTokenFilter(input, 0, 1, false); }); } public void testUnigrams() throws Exception { - NGramTokenFilter filter = new NGramTokenFilter(input, 1, 1); + NGramTokenFilter filter = new NGramTokenFilter(input, 1, 1, false); assertTokenStreamContents(filter, new String[]{"a","b","c","d","e"}, new int[]{0,0,0,0,0}, new int[]{5,5,5,5,5}, new int[]{1,0,0,0,0}); } public void testBigrams() throws Exception { - NGramTokenFilter filter = new NGramTokenFilter(input, 2, 2); + NGramTokenFilter filter = new NGramTokenFilter(input, 2, 2, false); assertTokenStreamContents(filter, new String[]{"ab","bc","cd","de"}, new int[]{0,0,0,0}, new int[]{5,5,5,5}, new int[]{1,0,0,0}); } public void testNgrams() throws Exception { - NGramTokenFilter filter = new NGramTokenFilter(input, 1, 3); + NGramTokenFilter filter = new NGramTokenFilter(input, 1, 3, false); assertTokenStreamContents(filter, new String[]{"a","ab","abc","b","bc","bcd","c","cd","cde","d","de","e"}, new int[]{0,0,0,0,0,0,0,0,0,0,0,0}, @@ -81,7 +81,7 @@ public class NGramTokenFilterTest extends BaseTokenStreamTestCase { } public void testNgramsNoIncrement() throws Exception { - NGramTokenFilter filter = new NGramTokenFilter(input, 1, 3); + NGramTokenFilter filter = new NGramTokenFilter(input, 1, 3, false); assertTokenStreamContents(filter, new String[]{"a","ab","abc","b","bc","bcd","c","cd","cde","d","de","e"}, new int[]{0,0,0,0,0,0,0,0,0,0,0,0}, @@ -93,25 +93,61 @@ public class NGramTokenFilterTest extends BaseTokenStreamTestCase { } public void testOversizedNgrams() throws Exception { - NGramTokenFilter filter = new NGramTokenFilter(input, 6, 7); + NGramTokenFilter filter = new NGramTokenFilter(input, 6, 7, false); assertTokenStreamContents(filter, new String[0], new int[0], new int[0]); 
} + public void testOversizedNgramsPreserveOriginal() throws Exception { + NGramTokenFilter tokenizer = new NGramTokenFilter(input, 6, 6, true); + assertTokenStreamContents(tokenizer, new String[] {"abcde"}, new int[] {0}, new int[] {5}); + } + public void testSmallTokenInStream() throws Exception { input = whitespaceMockTokenizer("abc de fgh"); - NGramTokenFilter filter = new NGramTokenFilter(input, 3, 3); - assertTokenStreamContents(filter, new String[]{"abc","fgh"}, new int[]{0,7}, new int[]{3,10}, new int[] {1, 2}); + NGramTokenFilter tokenizer = new NGramTokenFilter(input, 3, 3, false); + assertTokenStreamContents(tokenizer, new String[]{"abc","fgh"}, new int[]{0,7}, new int[]{3,10}, new int[] {1, 2}); + } + + public void testSmallTokenInStreamPreserveOriginal() throws Exception { + input = whitespaceMockTokenizer("abc de fgh"); + NGramTokenFilter tokenizer = new NGramTokenFilter(input, 3, 3, true); + assertTokenStreamContents(tokenizer, new String[]{"abc","de","fgh"}, new int[]{0,4,7}, new int[]{3,6,10}, new int[] {1, 1, 1}); + } public void testReset() throws Exception { WhitespaceTokenizer tokenizer = new WhitespaceTokenizer(); tokenizer.setReader(new StringReader("abcde")); - NGramTokenFilter filter = new NGramTokenFilter(tokenizer, 1, 1); + NGramTokenFilter filter = new NGramTokenFilter(tokenizer, 1, 1, false); assertTokenStreamContents(filter, new String[]{"a","b","c","d","e"}, new int[]{0,0,0,0,0}, new int[]{5,5,5,5,5}, new int[]{1,0,0,0,0}); tokenizer.setReader(new StringReader("abcde")); assertTokenStreamContents(filter, new String[]{"a","b","c","d","e"}, new int[]{0,0,0,0,0}, new int[]{5,5,5,5,5}, new int[]{1,0,0,0,0}); } + public void testKeepShortTermKeepLongTerm() throws Exception { + final String inputString = "a bcd efghi jk"; + + { // preserveOriginal = false + TokenStream ts = whitespaceMockTokenizer(inputString); + NGramTokenFilter filter = new NGramTokenFilter(ts, 2, 3, false); + assertTokenStreamContents(filter, + new String[] { "bc", 
"bcd", "cd", "ef", "efg", "fg", "fgh", "gh", "ghi", "hi", "jk" }, + new int[] { 2, 2, 2, 6, 6, 6, 6, 6, 6, 6, 12 }, + new int[] { 5, 5, 5, 11, 11, 11, 11, 11, 11, 11, 14 }, + new int[] { 2, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1 }); + } + + { // preserveOriginal = true + TokenStream ts = whitespaceMockTokenizer(inputString); + NGramTokenFilter filter = new NGramTokenFilter(ts, 2, 3, true); + assertTokenStreamContents(filter, + new String[] { "a", "bc", "bcd", "cd", "ef", "efg", "fg", "fgh", "gh", "ghi", "hi", "efghi", "jk" }, + new int[] { 0, 2, 2, 2, 6, 6, 6, 6, 6, 6, 6, 6, 12 }, + new int[] { 1, 5, 5, 5, 11, 11, 11, 11, 11, 11, 11, 11, 14 }, + new int[] { 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1 }); + } + } + // LUCENE-3642 // EdgeNgram blindly adds term length to offset, but this can take things out of bounds // wrt original text if a previous filter increases the length of the word (in this case æ -> ae) @@ -122,7 +158,7 @@ public class NGramTokenFilterTest extends BaseTokenStreamTestCase { protected TokenStreamComponents createComponents(String fieldName) { Tokenizer tokenizer = new MockTokenizer(MockTokenizer.WHITESPACE, false); TokenFilter filters = new ASCIIFoldingFilter(tokenizer); - filters = new NGramTokenFilter(filters, 2, 2); + filters = new NGramTokenFilter(filters, 2, 2, false); return new TokenStreamComponents(tokenizer, filters); } }; @@ -139,12 +175,14 @@ public class NGramTokenFilterTest extends BaseTokenStreamTestCase { for (int i = 0; i < 10; i++) { final int min = TestUtil.nextInt(random(), 2, 10); final int max = TestUtil.nextInt(random(), min, 20); + final boolean preserveOriginal = TestUtil.nextInt(random(), 0, 1) % 2 == 0; + Analyzer a = new Analyzer() { @Override protected TokenStreamComponents createComponents(String fieldName) { Tokenizer tokenizer = new MockTokenizer(MockTokenizer.WHITESPACE, false); return new TokenStreamComponents(tokenizer, - new NGramTokenFilter(tokenizer, min, max)); + new NGramTokenFilter(tokenizer, min, max, 
preserveOriginal)); } }; checkRandomData(random(), a, 200*RANDOM_MULTIPLIER, 20); @@ -159,7 +197,7 @@ public class NGramTokenFilterTest extends BaseTokenStreamTestCase { protected TokenStreamComponents createComponents(String fieldName) { Tokenizer tokenizer = new KeywordTokenizer(); return new TokenStreamComponents(tokenizer, - new NGramTokenFilter(tokenizer, 2, 15)); + new NGramTokenFilter(tokenizer, 2, 15, false)); } }; checkAnalysisConsistency(random, a, random.nextBoolean(), ""); @@ -167,27 +205,47 @@ public class NGramTokenFilterTest extends BaseTokenStreamTestCase { } public void testSupplementaryCharacters() throws IOException { - final String s = TestUtil.randomUnicodeString(random(), 10); - final int codePointCount = s.codePointCount(0, s.length()); - final int minGram = TestUtil.nextInt(random(), 1, 3); - final int maxGram = TestUtil.nextInt(random(), minGram, 10); - TokenStream tk = new KeywordTokenizer(); - ((Tokenizer)tk).setReader(new StringReader(s)); - tk = new NGramTokenFilter(tk, minGram, maxGram); - final CharTermAttribute termAtt = tk.addAttribute(CharTermAttribute.class); - final OffsetAttribute offsetAtt = tk.addAttribute(OffsetAttribute.class); - tk.reset(); - for (int start = 0; start < codePointCount; ++start) { - for (int end = start + minGram; end <= Math.min(codePointCount, start + maxGram); ++end) { + for (int i = 0; i < 20; i++) { + final String s = TestUtil.randomUnicodeString(random(), 10); + final int codePointCount = s.codePointCount(0, s.length()); + final int minGram = TestUtil.nextInt(random(), 1, 3); + final int maxGram = TestUtil.nextInt(random(), minGram, 10); + final boolean preserveOriginal = TestUtil.nextInt(random(), 0, 1) % 2 == 0; + + TokenStream tk = new KeywordTokenizer(); + ((Tokenizer)tk).setReader(new StringReader(s)); + tk = new NGramTokenFilter(tk, minGram, maxGram, preserveOriginal); + final CharTermAttribute termAtt = tk.addAttribute(CharTermAttribute.class); + final OffsetAttribute offsetAtt = 
tk.addAttribute(OffsetAttribute.class); + tk.reset(); + + if (codePointCount < minGram && preserveOriginal) { assertTrue(tk.incrementToken()); assertEquals(0, offsetAtt.startOffset()); assertEquals(s.length(), offsetAtt.endOffset()); - final int startIndex = Character.offsetByCodePoints(s, 0, start); - final int endIndex = Character.offsetByCodePoints(s, 0, end); - assertEquals(s.substring(startIndex, endIndex), termAtt.toString()); + assertEquals(s, termAtt.toString()); } + + for (int start = 0; start < codePointCount; ++start) { + for (int end = start + minGram; end <= Math.min(codePointCount, start + maxGram); ++end) { + assertTrue(tk.incrementToken()); + assertEquals(0, offsetAtt.startOffset()); + assertEquals(s.length(), offsetAtt.endOffset()); + final int startIndex = Character.offsetByCodePoints(s, 0, start); + final int endIndex = Character.offsetByCodePoints(s, 0, end); + assertEquals(s.substring(startIndex, endIndex), termAtt.toString()); + } + } + + if (codePointCount > maxGram && preserveOriginal) { + assertTrue(tk.incrementToken()); + assertEquals(0, offsetAtt.startOffset()); + assertEquals(s.length(), offsetAtt.endOffset()); + assertEquals(s, termAtt.toString()); + } + + assertFalse(tk.incrementToken()); + tk.close(); } - assertFalse(tk.incrementToken()); } - } diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/ngram/TestNGramFilters.java b/lucene/analysis/common/src/test/org/apache/lucene/analysis/ngram/TestNGramFilters.java index 5de532f4c09..aa98f403644 100644 --- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/ngram/TestNGramFilters.java +++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/ngram/TestNGramFilters.java @@ -56,12 +56,14 @@ public class TestNGramFilters extends BaseTokenStreamFactoryTestCase { } /** - * Test the NGramFilterFactory + * Test the NGramFilterFactory with old defaults */ public void testNGramFilter() throws Exception { Reader reader = new StringReader("test"); TokenStream 
stream = whitespaceMockTokenizer(reader); - stream = tokenFilterFactory("NGram").create(stream); + stream = tokenFilterFactory("NGram", + "minGramSize", "1", + "maxGramSize", "2").create(stream); assertTokenStreamContents(stream, new String[] { "t", "te", "e", "es", "s", "st", "t" }); } @@ -126,12 +128,13 @@ public class TestNGramFilters extends BaseTokenStreamFactoryTestCase { } /** - * Test EdgeNGramFilterFactory + * Test EdgeNGramFilterFactory with old defaults */ public void testEdgeNGramFilter() throws Exception { Reader reader = new StringReader("test"); TokenStream stream = whitespaceMockTokenizer(reader); - stream = tokenFilterFactory("EdgeNGram").create(stream); + stream = tokenFilterFactory("EdgeNGram", "minGramSize", "1", + "maxGramSize", "1").create(stream); assertTokenStreamContents(stream, new String[] { "t" }); } @@ -173,7 +176,8 @@ public class TestNGramFilters extends BaseTokenStreamFactoryTestCase { /** Test that bogus arguments result in exception */ public void testBogusArguments() throws Exception { - IllegalArgumentException expected = expectThrows(IllegalArgumentException.class, () -> { + IllegalArgumentException expected = null; + expected = expectThrows(IllegalArgumentException.class, () -> { tokenizerFactory("NGram", "bogusArg", "bogusValue"); }); assertTrue(expected.getMessage().contains("Unknown parameters")); @@ -184,12 +188,12 @@ public class TestNGramFilters extends BaseTokenStreamFactoryTestCase { assertTrue(expected.getMessage().contains("Unknown parameters")); expected = expectThrows(IllegalArgumentException.class, () -> { - tokenFilterFactory("NGram", "bogusArg", "bogusValue"); + tokenFilterFactory("NGram", "minGramSize", "2", "maxGramSize", "5", "bogusArg", "bogusValue"); }); assertTrue(expected.getMessage().contains("Unknown parameters")); expected = expectThrows(IllegalArgumentException.class, () -> { - tokenFilterFactory("EdgeNGram", "bogusArg", "bogusValue"); + tokenFilterFactory("EdgeNGram", "minGramSize", "2", 
"maxGramSize", "5", "bogusArg", "bogusValue"); }); assertTrue(expected.getMessage().contains("Unknown parameters")); } diff --git a/lucene/classification/src/test/org/apache/lucene/classification/BM25NBClassifierTest.java b/lucene/classification/src/test/org/apache/lucene/classification/BM25NBClassifierTest.java index 237c53fd643..050073c32fe 100644 --- a/lucene/classification/src/test/org/apache/lucene/classification/BM25NBClassifierTest.java +++ b/lucene/classification/src/test/org/apache/lucene/classification/BM25NBClassifierTest.java @@ -87,7 +87,7 @@ public class BM25NBClassifierTest extends ClassificationTestBase { @Override protected TokenStreamComponents createComponents(String fieldName) { final Tokenizer tokenizer = new KeywordTokenizer(); - return new TokenStreamComponents(tokenizer, new ReverseStringFilter(new EdgeNGramTokenFilter(new ReverseStringFilter(tokenizer), 10, 20))); + return new TokenStreamComponents(tokenizer, new ReverseStringFilter(new EdgeNGramTokenFilter(new ReverseStringFilter(tokenizer), 10, 20, false))); } } diff --git a/lucene/classification/src/test/org/apache/lucene/classification/CachingNaiveBayesClassifierTest.java b/lucene/classification/src/test/org/apache/lucene/classification/CachingNaiveBayesClassifierTest.java index 00fa4fe3505..8669df4e05a 100644 --- a/lucene/classification/src/test/org/apache/lucene/classification/CachingNaiveBayesClassifierTest.java +++ b/lucene/classification/src/test/org/apache/lucene/classification/CachingNaiveBayesClassifierTest.java @@ -86,7 +86,7 @@ public class CachingNaiveBayesClassifierTest extends ClassificationTestBase onClose, SegmentCommitInfo info) throws IOException { this.rld = rld; + reader = rld.getReader(IOContext.READ); startDelCount = rld.getDelCount(); delGen = info.getBufferedDeletesGen(); this.onClose = onClose; - reader = rld.getReader(IOContext.READ); } @Override diff --git a/lucene/core/src/java/org/apache/lucene/index/CheckIndex.java 
b/lucene/core/src/java/org/apache/lucene/index/CheckIndex.java index 0f7871acd23..9a4e3e50d93 100644 --- a/lucene/core/src/java/org/apache/lucene/index/CheckIndex.java +++ b/lucene/core/src/java/org/apache/lucene/index/CheckIndex.java @@ -46,6 +46,7 @@ import org.apache.lucene.index.CheckIndex.Status.DocValuesStatus; import org.apache.lucene.index.PointValues.IntersectVisitor; import org.apache.lucene.index.PointValues.Relation; import org.apache.lucene.search.DocIdSetIterator; +import org.apache.lucene.search.DocValuesFieldExistsQuery; import org.apache.lucene.search.LeafFieldComparator; import org.apache.lucene.search.Sort; import org.apache.lucene.search.SortField; @@ -411,7 +412,7 @@ public final class CheckIndex implements Closeable { * that would otherwise be more complicated to debug if they had to close the writer * for each check. */ - public CheckIndex(Directory dir, Lock writeLock) throws IOException { + public CheckIndex(Directory dir, Lock writeLock) { this.dir = dir; this.writeLock = writeLock; this.infoStream = null; @@ -781,7 +782,10 @@ public final class CheckIndex implements Closeable { throw new RuntimeException("Points test failed"); } } - + final String softDeletesField = reader.getFieldInfos().getSoftDeletesField(); + if (softDeletesField != null) { + checkSoftDeletes(softDeletesField, info, reader, infoStream, failFast); + } msg(infoStream, ""); if (verbose) { @@ -3049,6 +3053,25 @@ public final class CheckIndex implements Closeable { } } + private static void checkSoftDeletes(String softDeletesField, SegmentCommitInfo info, SegmentReader reader, PrintStream infoStream, boolean failFast) throws IOException { + if (infoStream != null) + infoStream.print(" test: check soft deletes....."); + try { + int softDeletes = PendingSoftDeletes.countSoftDeletes(DocValuesFieldExistsQuery.getDocValuesDocIdSetIterator(softDeletesField, reader), reader.getLiveDocs()); + if (softDeletes != info.getSoftDelCount()) { + throw new RuntimeException("actual soft 
deletes: " + softDeletes + " but expected: " +info.getSoftDelCount()); + } + } catch (Exception e) { + if (failFast) { + throw IOUtils.rethrowAlways(e); + } + msg(infoStream, "ERROR [" + String.valueOf(e.getMessage()) + "]"); + if (infoStream != null) { + e.printStackTrace(infoStream); + } + } + } + private static double nsToSec(long ns) { return ns/1000000000.0; } diff --git a/lucene/core/src/java/org/apache/lucene/index/DefaultIndexingChain.java b/lucene/core/src/java/org/apache/lucene/index/DefaultIndexingChain.java index 705d7bc6de4..e55251696e9 100644 --- a/lucene/core/src/java/org/apache/lucene/index/DefaultIndexingChain.java +++ b/lucene/core/src/java/org/apache/lucene/index/DefaultIndexingChain.java @@ -36,6 +36,7 @@ import org.apache.lucene.codecs.NormsProducer; import org.apache.lucene.codecs.PointsFormat; import org.apache.lucene.codecs.PointsWriter; import org.apache.lucene.document.FieldType; +import org.apache.lucene.search.DocIdSetIterator; import org.apache.lucene.search.Sort; import org.apache.lucene.search.SortField; import org.apache.lucene.search.similarities.Similarity; @@ -841,4 +842,19 @@ final class DefaultIndexingChain extends DocConsumer { } } } + + @Override + DocIdSetIterator getHasDocValues(String field) { + PerField perField = getPerField(field); + if (perField != null) { + if (perField.docValuesWriter != null) { + if (perField.fieldInfo.getDocValuesType() == DocValuesType.NONE) { + return null; + } + + return perField.docValuesWriter.getDocIdSet(); + } + } + return null; + } } diff --git a/lucene/core/src/java/org/apache/lucene/index/DocConsumer.java b/lucene/core/src/java/org/apache/lucene/index/DocConsumer.java index a64f13c5ba2..d124434a5f9 100644 --- a/lucene/core/src/java/org/apache/lucene/index/DocConsumer.java +++ b/lucene/core/src/java/org/apache/lucene/index/DocConsumer.java @@ -19,8 +19,17 @@ package org.apache.lucene.index; import java.io.IOException; +import org.apache.lucene.search.DocIdSetIterator; + abstract class 
DocConsumer { abstract void processDocument() throws IOException; abstract Sorter.DocMap flush(final SegmentWriteState state) throws IOException; abstract void abort() throws IOException; + + /** + * Returns a {@link DocIdSetIterator} for the given field or null if the field doesn't have + * doc values. + */ + abstract DocIdSetIterator getHasDocValues(String field); + } diff --git a/lucene/core/src/java/org/apache/lucene/index/DocValuesWriter.java b/lucene/core/src/java/org/apache/lucene/index/DocValuesWriter.java index 9dde81728f2..b739b14a2a7 100644 --- a/lucene/core/src/java/org/apache/lucene/index/DocValuesWriter.java +++ b/lucene/core/src/java/org/apache/lucene/index/DocValuesWriter.java @@ -20,10 +20,13 @@ package org.apache.lucene.index; import java.io.IOException; import org.apache.lucene.codecs.DocValuesConsumer; +import org.apache.lucene.search.DocIdSetIterator; import org.apache.lucene.search.SortField; abstract class DocValuesWriter { abstract void finish(int numDoc); abstract void flush(SegmentWriteState state, Sorter.DocMap sortMap, DocValuesConsumer consumer) throws IOException; abstract Sorter.DocComparator getDocComparator(int numDoc, SortField sortField) throws IOException; + abstract DocIdSetIterator getDocIdSet(); + } diff --git a/lucene/core/src/java/org/apache/lucene/index/DocumentsWriterPerThread.java b/lucene/core/src/java/org/apache/lucene/index/DocumentsWriterPerThread.java index 094afc5a568..c8ebc4db491 100644 --- a/lucene/core/src/java/org/apache/lucene/index/DocumentsWriterPerThread.java +++ b/lucene/core/src/java/org/apache/lucene/index/DocumentsWriterPerThread.java @@ -28,6 +28,7 @@ import java.util.concurrent.atomic.AtomicLong; import org.apache.lucene.analysis.Analyzer; import org.apache.lucene.codecs.Codec; import org.apache.lucene.index.DocumentsWriterDeleteQueue.DeleteSlice; +import org.apache.lucene.search.DocIdSetIterator; import org.apache.lucene.search.similarities.Similarity; import org.apache.lucene.store.Directory; import 
org.apache.lucene.store.FlushInfo; @@ -460,14 +461,27 @@ final class DocumentsWriterPerThread { } final Sorter.DocMap sortMap; try { + DocIdSetIterator softDeletedDocs; + if (indexWriterConfig.getSoftDeletesField() != null) { + softDeletedDocs = consumer.getHasDocValues(indexWriterConfig.getSoftDeletesField()); + } else { + softDeletedDocs = null; + } sortMap = consumer.flush(flushState); + if (softDeletedDocs == null) { + flushState.softDelCountOnFlush = 0; + } else { + flushState.softDelCountOnFlush = PendingSoftDeletes.countSoftDeletes(softDeletedDocs, flushState.liveDocs); + assert flushState.segmentInfo.maxDoc() >= flushState.softDelCountOnFlush + flushState.delCountOnFlush; + } // We clear this here because we already resolved them (private to this segment) when writing postings: pendingUpdates.clearDeleteTerms(); segmentInfo.setFiles(new HashSet<>(directory.getCreatedFiles())); - final SegmentCommitInfo segmentInfoPerCommit = new SegmentCommitInfo(segmentInfo, 0, -1L, -1L, -1L); + final SegmentCommitInfo segmentInfoPerCommit = new SegmentCommitInfo(segmentInfo, 0, flushState.softDelCountOnFlush, -1L, -1L, -1L); if (infoStream.isEnabled("DWPT")) { infoStream.message("DWPT", "new segment has " + (flushState.liveDocs == null ? 0 : flushState.delCountOnFlush) + " deleted docs"); + infoStream.message("DWPT", "new segment has " + flushState.softDelCountOnFlush + " soft-deleted docs"); infoStream.message("DWPT", "new segment has " + (flushState.fieldInfos.hasVectors() ? "vectors" : "no vectors") + "; " + (flushState.fieldInfos.hasNorms() ? 
"norms" : "no norms") + "; " + @@ -497,8 +511,7 @@ final class DocumentsWriterPerThread { assert segmentInfo != null; FlushedSegment fs = new FlushedSegment(infoStream, segmentInfoPerCommit, flushState.fieldInfos, - segmentDeletes, flushState.liveDocs, flushState.delCountOnFlush, - sortMap); + segmentDeletes, flushState.liveDocs, flushState.delCountOnFlush, sortMap); sealFlushedSegment(fs, sortMap, flushNotifications); if (infoStream.isEnabled("DWPT")) { infoStream.message("DWPT", "flush time " + ((System.nanoTime() - t0) / 1000000.0) + " msec"); diff --git a/lucene/core/src/java/org/apache/lucene/index/FieldInfo.java b/lucene/core/src/java/org/apache/lucene/index/FieldInfo.java index 037fe5c1bc7..b50cb12cd5e 100644 --- a/lucene/core/src/java/org/apache/lucene/index/FieldInfo.java +++ b/lucene/core/src/java/org/apache/lucene/index/FieldInfo.java @@ -53,14 +53,17 @@ public final class FieldInfo { private int pointDimensionCount; private int pointNumBytes; + // whether this field is used as the soft-deletes field + private final boolean softDeletesField; + /** * Sole constructor. 
* * @lucene.experimental */ - public FieldInfo(String name, int number, boolean storeTermVector, boolean omitNorms, - boolean storePayloads, IndexOptions indexOptions, DocValuesType docValues, - long dvGen, Map attributes, int pointDimensionCount, int pointNumBytes) { + public FieldInfo(String name, int number, boolean storeTermVector, boolean omitNorms, boolean storePayloads, + IndexOptions indexOptions, DocValuesType docValues, long dvGen, Map attributes, + int pointDimensionCount, int pointNumBytes, boolean softDeletesField) { this.name = Objects.requireNonNull(name); this.number = number; this.docValuesType = Objects.requireNonNull(docValues, "DocValuesType must not be null (field: \"" + name + "\")"); @@ -78,6 +81,7 @@ public final class FieldInfo { this.attributes = Objects.requireNonNull(attributes); this.pointDimensionCount = pointDimensionCount; this.pointNumBytes = pointNumBytes; + this.softDeletesField = softDeletesField; assert checkConsistency(); } @@ -332,4 +336,12 @@ public final class FieldInfo { public Map attributes() { return attributes; } + + /** + * Returns true if this field is configured and used as the soft-deletes field. 
+ * See {@link IndexWriterConfig#softDeletesField} + */ + public boolean isSoftDeletesField() { + return softDeletesField; + } } diff --git a/lucene/core/src/java/org/apache/lucene/index/FieldInfos.java b/lucene/core/src/java/org/apache/lucene/index/FieldInfos.java index 4b472a55503..0a0ff5ee605 100644 --- a/lucene/core/src/java/org/apache/lucene/index/FieldInfos.java +++ b/lucene/core/src/java/org/apache/lucene/index/FieldInfos.java @@ -43,6 +43,7 @@ public class FieldInfos implements Iterable { private final boolean hasNorms; private final boolean hasDocValues; private final boolean hasPointValues; + private final String softDeletesField; // used only by fieldInfo(int) private final FieldInfo[] byNumber; @@ -62,6 +63,7 @@ public class FieldInfos implements Iterable { boolean hasNorms = false; boolean hasDocValues = false; boolean hasPointValues = false; + String softDeletesField = null; int size = 0; // number of elements in byNumberTemp, number of used array slots FieldInfo[] byNumberTemp = new FieldInfo[10]; // initial array capacity of 10 @@ -92,6 +94,12 @@ public class FieldInfos implements Iterable { hasDocValues |= info.getDocValuesType() != DocValuesType.NONE; hasPayloads |= info.hasPayloads(); hasPointValues |= (info.getPointDimensionCount() != 0); + if (info.isSoftDeletesField()) { + if (softDeletesField != null && softDeletesField.equals(info.name) == false) { + throw new IllegalArgumentException("multiple soft-deletes fields [" + info.name + ", " + softDeletesField + "]"); + } + softDeletesField = info.name; + } } this.hasVectors = hasVectors; @@ -102,6 +110,7 @@ public class FieldInfos implements Iterable { this.hasNorms = hasNorms; this.hasDocValues = hasDocValues; this.hasPointValues = hasPointValues; + this.softDeletesField = softDeletesField; List valuesTemp = new ArrayList<>(); byNumber = new FieldInfo[size]; @@ -153,6 +162,11 @@ public class FieldInfos implements Iterable { public boolean hasPointValues() { return hasPointValues; } + + /** 
Returns the soft-deletes field name if exists; otherwise returns null */ + public String getSoftDeletesField() { + return softDeletesField; + } /** Returns the number of fields */ public int size() { @@ -221,13 +235,17 @@ public class FieldInfos implements Iterable { // norms back on after they were already ommitted; today // we silently discard the norm but this is badly trappy private int lowestUnassignedFieldNumber = -1; + + // The soft-deletes field from IWC to enforce a single soft-deletes field + private final String softDeletesFieldName; - FieldNumbers() { + FieldNumbers(String softDeletesFieldName) { this.nameToNumber = new HashMap<>(); this.numberToName = new HashMap<>(); this.indexOptions = new HashMap<>(); this.docValuesType = new HashMap<>(); this.dimensions = new HashMap<>(); + this.softDeletesFieldName = softDeletesFieldName; } /** @@ -236,7 +254,7 @@ public class FieldInfos implements Iterable { * number assigned if possible otherwise the first unassigned field number * is used as the field number. 
*/ - synchronized int addOrGet(String fieldName, int preferredFieldNumber, IndexOptions indexOptions, DocValuesType dvType, int dimensionCount, int dimensionNumBytes) { + synchronized int addOrGet(String fieldName, int preferredFieldNumber, IndexOptions indexOptions, DocValuesType dvType, int dimensionCount, int dimensionNumBytes, boolean isSoftDeletesField) { if (indexOptions != IndexOptions.NONE) { IndexOptions currentOpts = this.indexOptions.get(fieldName); if (currentOpts == null) { @@ -284,6 +302,16 @@ public class FieldInfos implements Iterable { nameToNumber.put(fieldName, fieldNumber); } + if (isSoftDeletesField) { + if (softDeletesFieldName == null) { + throw new IllegalArgumentException("this index has [" + fieldName + "] as soft-deletes already but soft-deletes field is not configured in IWC"); + } else if (fieldName.equals(softDeletesFieldName) == false) { + throw new IllegalArgumentException("cannot configure [" + softDeletesFieldName + "] as soft-deletes; this index uses [" + fieldName + "] as soft-deletes already"); + } + } else if (fieldName.equals(softDeletesFieldName)) { + throw new IllegalArgumentException("cannot configure [" + softDeletesFieldName + "] as soft-deletes; this index uses [" + fieldName + "] as non-soft-deletes already"); + } + return fieldNumber.intValue(); } @@ -383,11 +411,7 @@ public class FieldInfos implements Iterable { private final HashMap byName = new HashMap<>(); final FieldNumbers globalFieldNumbers; private boolean finished; - - Builder() { - this(new FieldNumbers()); - } - + /** * Creates a new instance with the given {@link FieldNumbers}. */ @@ -413,8 +437,9 @@ public class FieldInfos implements Iterable { // number for this field. 
If the field was seen // before then we'll get the same name and number, // else we'll allocate a new one: - final int fieldNumber = globalFieldNumbers.addOrGet(name, -1, IndexOptions.NONE, DocValuesType.NONE, 0, 0); - fi = new FieldInfo(name, fieldNumber, false, false, false, IndexOptions.NONE, DocValuesType.NONE, -1, new HashMap<>(), 0, 0); + final boolean isSoftDeletesField = name.equals(globalFieldNumbers.softDeletesFieldName); + final int fieldNumber = globalFieldNumbers.addOrGet(name, -1, IndexOptions.NONE, DocValuesType.NONE, 0, 0, isSoftDeletesField); + fi = new FieldInfo(name, fieldNumber, false, false, false, IndexOptions.NONE, DocValuesType.NONE, -1, new HashMap<>(), 0, 0, isSoftDeletesField); assert !byName.containsKey(fi.name); globalFieldNumbers.verifyConsistent(Integer.valueOf(fi.number), fi.name, DocValuesType.NONE); byName.put(fi.name, fi); @@ -427,7 +452,7 @@ public class FieldInfos implements Iterable { boolean storeTermVector, boolean omitNorms, boolean storePayloads, IndexOptions indexOptions, DocValuesType docValues, long dvGen, - int dimensionCount, int dimensionNumBytes) { + int dimensionCount, int dimensionNumBytes, boolean isSoftDeletesField) { assert assertNotFinished(); if (docValues == null) { throw new NullPointerException("DocValuesType must not be null"); @@ -439,8 +464,8 @@ public class FieldInfos implements Iterable { // number for this field. 
If the field was seen // before then we'll get the same name and number, // else we'll allocate a new one: - final int fieldNumber = globalFieldNumbers.addOrGet(name, preferredFieldNumber, indexOptions, docValues, dimensionCount, dimensionNumBytes); - fi = new FieldInfo(name, fieldNumber, storeTermVector, omitNorms, storePayloads, indexOptions, docValues, dvGen, new HashMap<>(), dimensionCount, dimensionNumBytes); + final int fieldNumber = globalFieldNumbers.addOrGet(name, preferredFieldNumber, indexOptions, docValues, dimensionCount, dimensionNumBytes, isSoftDeletesField); + fi = new FieldInfo(name, fieldNumber, storeTermVector, omitNorms, storePayloads, indexOptions, docValues, dvGen, new HashMap<>(), dimensionCount, dimensionNumBytes, isSoftDeletesField); assert !byName.containsKey(fi.name); globalFieldNumbers.verifyConsistent(Integer.valueOf(fi.number), fi.name, fi.getDocValuesType()); byName.put(fi.name, fi); @@ -473,7 +498,7 @@ public class FieldInfos implements Iterable { return addOrUpdateInternal(fi.name, fi.number, fi.hasVectors(), fi.omitsNorms(), fi.hasPayloads(), fi.getIndexOptions(), fi.getDocValuesType(), dvGen, - fi.getPointDimensionCount(), fi.getPointNumBytes()); + fi.getPointDimensionCount(), fi.getPointNumBytes(), fi.isSoftDeletesField()); } public FieldInfo fieldInfo(String fieldName) { diff --git a/lucene/core/src/java/org/apache/lucene/index/IndexWriter.java b/lucene/core/src/java/org/apache/lucene/index/IndexWriter.java index bc2264b7eab..037ff7230b6 100644 --- a/lucene/core/src/java/org/apache/lucene/index/IndexWriter.java +++ b/lucene/core/src/java/org/apache/lucene/index/IndexWriter.java @@ -49,6 +49,7 @@ import org.apache.lucene.index.DocValuesUpdate.BinaryDocValuesUpdate; import org.apache.lucene.index.DocValuesUpdate.NumericDocValuesUpdate; import org.apache.lucene.index.FieldInfos.FieldNumbers; import org.apache.lucene.index.IndexWriterConfig.OpenMode; +import org.apache.lucene.search.DocValuesFieldExistsQuery; import 
org.apache.lucene.search.MatchAllDocsQuery; import org.apache.lucene.search.Query; import org.apache.lucene.search.Sort; @@ -347,6 +348,7 @@ public class IndexWriter implements Closeable, TwoPhaseCommit, Accountable, * much like how hotels place an "authorization hold" on your credit * card to make sure they can later charge you when you check out. */ final AtomicLong pendingNumDocs = new AtomicLong(); + final boolean softDeletesEnabled; private final DocumentsWriter.FlushNotifications flushNotifications = new DocumentsWriter.FlushNotifications() { @Override @@ -639,7 +641,7 @@ public class IndexWriter implements Closeable, TwoPhaseCommit, Accountable, if (rld != null) { return rld.getDelCount(); // get the full count from here since SCI might change concurrently } else { - int delCount = info.getDelCount(); + final int delCount = info.getDelCount(softDeletesEnabled); assert delCount <= info.info.maxDoc(): "delCount: " + delCount + " maxDoc: " + info.info.maxDoc(); return delCount; } @@ -703,7 +705,7 @@ public class IndexWriter implements Closeable, TwoPhaseCommit, Accountable, conf.setIndexWriter(this); // prevent reuse by other instances config = conf; infoStream = config.getInfoStream(); - + softDeletesEnabled = config.getSoftDeletesField() != null; // obtain the write.lock. If the user configured a timeout, // we wrap with a sleeper and this might take some time. 
writeLock = d.obtainLock(WRITE_LOCK_NAME); @@ -960,12 +962,12 @@ public class IndexWriter implements Closeable, TwoPhaseCommit, Accountable, * If this {@link SegmentInfos} has no global field number map the returned instance is empty */ private FieldNumbers getFieldNumberMap() throws IOException { - final FieldNumbers map = new FieldNumbers(); + final FieldNumbers map = new FieldNumbers(config.softDeletesField); for(SegmentCommitInfo info : segmentInfos) { FieldInfos fis = readFieldInfos(info); for(FieldInfo fi : fis) { - map.addOrGet(fi.name, fi.number, fi.getIndexOptions(), fi.getDocValuesType(), fi.getPointDimensionCount(), fi.getPointNumBytes()); + map.addOrGet(fi.name, fi.number, fi.getIndexOptions(), fi.getDocValuesType(), fi.getPointDimensionCount(), fi.getPointNumBytes(), fi.isSoftDeletesField()); } } @@ -1154,7 +1156,7 @@ public class IndexWriter implements Closeable, TwoPhaseCommit, Accountable, if (docWriter.anyDeletions()) { return true; } - if (readerPool.anyPendingDeletes()) { + if (readerPool.anyDeletions()) { return true; } for (final SegmentCommitInfo info : segmentInfos) { @@ -1787,7 +1789,7 @@ public class IndexWriter implements Closeable, TwoPhaseCommit, Accountable, if (globalFieldNumberMap.contains(f.name(), dvType) == false) { // if this field doesn't exists we try to add it. if it exists and the DV type doesn't match we // get a consistent error message as if you try to do that during an indexing operation. 
- globalFieldNumberMap.addOrGet(f.name(), -1, IndexOptions.NONE, dvType, 0, 0); + globalFieldNumberMap.addOrGet(f.name(), -1, IndexOptions.NONE, dvType, 0, 0, f.name().equals(config.softDeletesField)); assert globalFieldNumberMap.contains(f.name(), dvType); } if (config.getIndexSortFields().contains(f.name())) { @@ -2824,7 +2826,7 @@ public class IndexWriter implements Closeable, TwoPhaseCommit, Accountable, FieldInfos fis = readFieldInfos(info); for(FieldInfo fi : fis) { // This will throw exceptions if any of the incoming fields have an illegal schema change: - globalFieldNumberMap.addOrGet(fi.name, fi.number, fi.getIndexOptions(), fi.getDocValuesType(), fi.getPointDimensionCount(), fi.getPointNumBytes()); + globalFieldNumberMap.addOrGet(fi.name, fi.number, fi.getIndexOptions(), fi.getDocValuesType(), fi.getPointDimensionCount(), fi.getPointNumBytes(), fi.isSoftDeletesField()); } infos.add(copySegmentAsIs(info, newSegName, context)); } @@ -2939,11 +2941,7 @@ public class IndexWriter implements Closeable, TwoPhaseCommit, Accountable, // long so we can detect int overflow: long numDocs = 0; - - Sort indexSort = config.getIndexSort(); - long seqNo; - try { if (infoStream.isEnabled("IW")) { infoStream.message("IW", "flush at addIndexes(CodecReader...)"); @@ -2951,10 +2949,15 @@ public class IndexWriter implements Closeable, TwoPhaseCommit, Accountable, flush(false, true); String mergedName = newSegmentName(); - + int numSoftDeleted = 0; for (CodecReader leaf : readers) { numDocs += leaf.numDocs(); validateMergeReader(leaf); + if (softDeletesEnabled) { + Bits liveDocs = leaf.getLiveDocs(); + numSoftDeleted += PendingSoftDeletes.countSoftDeletes( + DocValuesFieldExistsQuery.getDocValuesDocIdSetIterator(config.getSoftDeletesField(), leaf), liveDocs); + } } // Best-effort up front check: @@ -2979,8 +2982,7 @@ public class IndexWriter implements Closeable, TwoPhaseCommit, Accountable, } merger.merge(); // merge 'em - - SegmentCommitInfo infoPerCommit = new 
SegmentCommitInfo(info, 0, -1L, -1L, -1L); + SegmentCommitInfo infoPerCommit = new SegmentCommitInfo(info, 0, numSoftDeleted, -1L, -1L, -1L); info.setFiles(new HashSet<>(trackingDir.getCreatedFiles())); trackingDir.clearCreatedFiles(); @@ -3057,7 +3059,7 @@ public class IndexWriter implements Closeable, TwoPhaseCommit, Accountable, SegmentInfo newInfo = new SegmentInfo(directoryOrig, info.info.getVersion(), info.info.getMinVersion(), segName, info.info.maxDoc(), info.info.getUseCompoundFile(), info.info.getCodec(), info.info.getDiagnostics(), info.info.getId(), info.info.getAttributes(), info.info.getIndexSort()); - SegmentCommitInfo newInfoPerCommit = new SegmentCommitInfo(newInfo, info.getDelCount(), info.getDelGen(), + SegmentCommitInfo newInfoPerCommit = new SegmentCommitInfo(newInfo, info.getDelCount(), info.getSoftDelCount(), info.getDelGen(), info.getFieldInfosGen(), info.getDocValuesGen()); newInfo.setFiles(info.info.files()); @@ -4249,7 +4251,7 @@ public class IndexWriter implements Closeable, TwoPhaseCommit, Accountable, details.put("mergeMaxNumSegments", "" + merge.maxNumSegments); details.put("mergeFactor", Integer.toString(merge.segments.size())); setDiagnostics(si, SOURCE_MERGE, details); - merge.setMergeInfo(new SegmentCommitInfo(si, 0, -1L, -1L, -1L)); + merge.setMergeInfo(new SegmentCommitInfo(si, 0, 0, -1L, -1L, -1L)); if (infoStream.isEnabled("IW")) { infoStream.message("IW", "merge seg=" + merge.info.info.name + " " + segString(merge.segments)); @@ -4373,16 +4375,25 @@ public class IndexWriter implements Closeable, TwoPhaseCommit, Accountable, // Let the merge wrap readers List mergeReaders = new ArrayList<>(); + int numSoftDeleted = 0; for (SegmentReader reader : merge.readers) { CodecReader wrappedReader = merge.wrapForMerge(reader); validateMergeReader(wrappedReader); mergeReaders.add(wrappedReader); + if (softDeletesEnabled) { + if (reader != wrappedReader) { // if we don't have a wrapped reader we won't preserve any soft-deletes + Bits 
liveDocs = wrappedReader.getLiveDocs(); + numSoftDeleted += PendingSoftDeletes.countSoftDeletes( + DocValuesFieldExistsQuery.getDocValuesDocIdSetIterator(config.getSoftDeletesField(), wrappedReader), + liveDocs); + } + } } final SegmentMerger merger = new SegmentMerger(mergeReaders, merge.info.info, infoStream, dirWrapper, globalFieldNumberMap, context); - + merge.info.setSoftDelCount(numSoftDeleted); merge.checkAborted(); merge.mergeStartNS = System.nanoTime(); @@ -4604,7 +4615,7 @@ public class IndexWriter implements Closeable, TwoPhaseCommit, Accountable, * * @lucene.internal */ private synchronized String segString(SegmentCommitInfo info) { - return info.toString(numDeletedDocs(info) - info.getDelCount()); + return info.toString(numDeletedDocs(info) - info.getDelCount(softDeletesEnabled)); } private synchronized void doWait() { diff --git a/lucene/core/src/java/org/apache/lucene/index/MultiFields.java b/lucene/core/src/java/org/apache/lucene/index/MultiFields.java index 1a7b15bd81c..19078a83c15 100644 --- a/lucene/core/src/java/org/apache/lucene/index/MultiFields.java +++ b/lucene/core/src/java/org/apache/lucene/index/MultiFields.java @@ -25,6 +25,7 @@ import java.util.HashSet; import java.util.Iterator; import java.util.List; import java.util.Map; +import java.util.Objects; import java.util.concurrent.ConcurrentHashMap; import org.apache.lucene.util.Bits; @@ -263,7 +264,10 @@ public final class MultiFields extends Fields { * will be unavailable. 
*/ public static FieldInfos getMergedFieldInfos(IndexReader reader) { - final FieldInfos.Builder builder = new FieldInfos.Builder(); + final String softDeletesField = reader.leaves().stream() + .map(l -> l.reader().getFieldInfos().getSoftDeletesField()) + .filter(Objects::nonNull).findAny().orElse(null); + final FieldInfos.Builder builder = new FieldInfos.Builder(new FieldInfos.FieldNumbers(softDeletesField)); for(final LeafReaderContext ctx : reader.leaves()) { builder.add(ctx.reader().getFieldInfos()); } diff --git a/lucene/core/src/java/org/apache/lucene/index/NumericDocValuesWriter.java b/lucene/core/src/java/org/apache/lucene/index/NumericDocValuesWriter.java index 0a58f0d5021..980849fb58c 100644 --- a/lucene/core/src/java/org/apache/lucene/index/NumericDocValuesWriter.java +++ b/lucene/core/src/java/org/apache/lucene/index/NumericDocValuesWriter.java @@ -82,6 +82,11 @@ class NumericDocValuesWriter extends DocValuesWriter { return Sorter.getDocComparator(maxDoc, sortField, () -> null, () -> docValues); } + @Override + DocIdSetIterator getDocIdSet() { + return docsWithField.iterator(); + } + static SortingLeafReader.CachedNumericDVs sortDocValues(int maxDoc, Sorter.DocMap sortMap, NumericDocValues oldDocValues) throws IOException { FixedBitSet docsWithField = new FixedBitSet(maxDoc); long[] values = new long[maxDoc]; diff --git a/lucene/core/src/java/org/apache/lucene/index/ParallelLeafReader.java b/lucene/core/src/java/org/apache/lucene/index/ParallelLeafReader.java index 492b6e7bc97..25f200a4243 100644 --- a/lucene/core/src/java/org/apache/lucene/index/ParallelLeafReader.java +++ b/lucene/core/src/java/org/apache/lucene/index/ParallelLeafReader.java @@ -23,6 +23,7 @@ import java.util.HashMap; import java.util.IdentityHashMap; import java.util.Iterator; import java.util.Map; +import java.util.Objects; import java.util.Set; import java.util.SortedMap; import java.util.TreeMap; @@ -101,9 +102,11 @@ public class ParallelLeafReader extends LeafReader { throw new 
IllegalArgumentException("All readers must have same maxDoc: "+maxDoc+"!="+reader.maxDoc()); } } - + final String softDeletesField = completeReaderSet.stream() + .map(r -> r.getFieldInfos().getSoftDeletesField()) + .filter(Objects::nonNull).findAny().orElse(null); // TODO: make this read-only in a cleaner way? - FieldInfos.Builder builder = new FieldInfos.Builder(); + FieldInfos.Builder builder = new FieldInfos.Builder(new FieldInfos.FieldNumbers(softDeletesField)); Sort indexSort = null; int createdVersionMajor = -1; diff --git a/lucene/core/src/java/org/apache/lucene/index/PendingDeletes.java b/lucene/core/src/java/org/apache/lucene/index/PendingDeletes.java index f19b05391d3..4ab037c9501 100644 --- a/lucene/core/src/java/org/apache/lucene/index/PendingDeletes.java +++ b/lucene/core/src/java/org/apache/lucene/index/PendingDeletes.java @@ -220,7 +220,7 @@ class PendingDeletes { * Returns true iff the segment represented by this {@link PendingDeletes} is fully deleted */ boolean isFullyDeleted(IOSupplier readerIOSupplier) throws IOException { - return info.getDelCount() + numPendingDeletes() == info.info.maxDoc(); + return getDelCount() == info.info.maxDoc(); } /** @@ -246,7 +246,8 @@ class PendingDeletes { * Returns the number of deleted docs in the segment. 
*/ final int getDelCount() { - return info.getDelCount() + numPendingDeletes(); + int delCount = info.getDelCount() + info.getSoftDelCount() + numPendingDeletes(); + return delCount; } /** @@ -270,7 +271,8 @@ class PendingDeletes { count = info.info.maxDoc(); } assert numDocs() == count: "info.maxDoc=" + info.info.maxDoc() + " info.getDelCount()=" + info.getDelCount() + - " pendingDeletes=" + toString() + " count=" + count; + " info.getSoftDelCount()=" + info.getSoftDelCount() + + " pendingDeletes=" + toString() + " count=" + count + " numDocs: " + numDocs(); assert reader.numDocs() == numDocs() : "reader.numDocs() = " + reader.numDocs() + " numDocs() " + numDocs(); assert reader.numDeletedDocs() <= info.info.maxDoc(): "delCount=" + reader.numDeletedDocs() + " info.maxDoc=" + info.info.maxDoc() + " rld.pendingDeleteCount=" + numPendingDeletes() + diff --git a/lucene/core/src/java/org/apache/lucene/index/PendingSoftDeletes.java b/lucene/core/src/java/org/apache/lucene/index/PendingSoftDeletes.java index 1c32e4fa92e..4074903a363 100644 --- a/lucene/core/src/java/org/apache/lucene/index/PendingSoftDeletes.java +++ b/lucene/core/src/java/org/apache/lucene/index/PendingSoftDeletes.java @@ -58,7 +58,7 @@ final class PendingSoftDeletes extends PendingDeletes { } else { // if it was deleted subtract the delCount pendingDeleteCount--; - assert pendingDeleteCount >= 0 : " illegal pending delete count: " + pendingDeleteCount; + assert assertPendingDeletes(); } return true; } @@ -76,11 +76,15 @@ final class PendingSoftDeletes extends PendingDeletes { hardDeletes.onNewReader(reader, info); if (dvGeneration < info.getDocValuesGen()) { // only re-calculate this if we haven't seen this generation final DocIdSetIterator iterator = DocValuesFieldExistsQuery.getDocValuesDocIdSetIterator(field, reader); + int newDelCount; if (iterator != null) { // nothing is deleted we don't have a soft deletes field in this segment assert info.info.maxDoc() > 0 : "maxDoc is 0"; - pendingDeleteCount 
+= applySoftDeletes(iterator, getMutableBits()); - assert pendingDeleteCount >= 0 : " illegal pending delete count: " + pendingDeleteCount; + newDelCount = applySoftDeletes(iterator, getMutableBits()); + assert newDelCount >= 0 : " illegal pending delete count: " + newDelCount; + } else { + newDelCount = 0; } + assert info.getSoftDelCount() == newDelCount : "softDeleteCount doesn't match " + info.getSoftDelCount() + " != " + newDelCount; dvGeneration = info.getDocValuesGen(); } assert getDelCount() <= info.info.maxDoc() : getDelCount() + " > " + info.info.maxDoc(); @@ -88,8 +92,15 @@ final class PendingSoftDeletes extends PendingDeletes { @Override boolean writeLiveDocs(Directory dir) throws IOException { + // we need to set this here to make sure our stats in SCI are up-to-date otherwise we might hit an assertion + // when the hard deletes are set since we need to account for docs that used to be only soft-delete but now hard-deleted + this.info.setSoftDelCount(this.info.getSoftDelCount() + pendingDeleteCount); + super.dropChanges(); // delegate the write to the hard deletes - it will only write if somebody used it. - return hardDeletes.writeLiveDocs(dir); + if (hardDeletes.writeLiveDocs(dir)) { + return true; + } + return false; } @Override @@ -134,13 +145,21 @@ final class PendingSoftDeletes extends PendingDeletes { void onDocValuesUpdate(FieldInfo info, DocValuesFieldUpdates.Iterator iterator) throws IOException { if (this.field.equals(info.name)) { pendingDeleteCount += applySoftDeletes(iterator, getMutableBits()); - assert pendingDeleteCount >= 0 : " illegal pending delete count: " + pendingDeleteCount; + assert assertPendingDeletes(); assert dvGeneration < info.getDocValuesGen() : "we have seen this generation update already: " + dvGeneration + " vs. 
" + info.getDocValuesGen(); assert dvGeneration != -2 : "docValues generation is still uninitialized"; dvGeneration = info.getDocValuesGen(); + this.info.setSoftDelCount(this.info.getSoftDelCount() + pendingDeleteCount); + super.dropChanges(); } } + private boolean assertPendingDeletes() { + assert pendingDeleteCount + info.getSoftDelCount() >= 0 : " illegal pending delete count: " + pendingDeleteCount + info.getSoftDelCount(); + assert info.info.maxDoc() >= getDelCount(); + return true; + } + @Override public String toString() { StringBuilder sb = new StringBuilder(); @@ -210,4 +229,17 @@ final class PendingSoftDeletes extends PendingDeletes { Bits getHardLiveDocs() { return hardDeletes.getLiveDocs(); } + + static int countSoftDeletes(DocIdSetIterator softDeletedDocs, Bits hardDeletes) throws IOException { + int count = 0; + if (softDeletedDocs != null) { + int doc; + while ((doc = softDeletedDocs.nextDoc()) != DocIdSetIterator.NO_MORE_DOCS) { + if (hardDeletes == null || hardDeletes.get(doc)) { + count++; + } + } + } + return count; + } } diff --git a/lucene/core/src/java/org/apache/lucene/index/ReaderPool.java b/lucene/core/src/java/org/apache/lucene/index/ReaderPool.java index 45f58a602ca..5f62c3724d6 100644 --- a/lucene/core/src/java/org/apache/lucene/index/ReaderPool.java +++ b/lucene/core/src/java/org/apache/lucene/index/ReaderPool.java @@ -130,9 +130,9 @@ final class ReaderPool implements Closeable { /** * Returns true iff any of the buffered readers and updates has at least one pending delete */ - synchronized boolean anyPendingDeletes() { + synchronized boolean anyDeletions() { for(ReadersAndUpdates rld : readerMap.values()) { - if (rld.anyPendingDeletes()) { + if (rld.getDelCount() > 0) { return true; } } diff --git a/lucene/core/src/java/org/apache/lucene/index/ReadersAndUpdates.java b/lucene/core/src/java/org/apache/lucene/index/ReadersAndUpdates.java index 710b74876cd..3453447ecce 100644 --- 
a/lucene/core/src/java/org/apache/lucene/index/ReadersAndUpdates.java +++ b/lucene/core/src/java/org/apache/lucene/index/ReadersAndUpdates.java @@ -406,10 +406,6 @@ final class ReadersAndUpdates { } } - synchronized boolean anyPendingDeletes() { - return pendingDeletes.numPendingDeletes() != 0; - } - /** * This class merges the current on-disk DV with an incoming update DV instance and merges the two instances * giving the incoming update precedence in terms of values, in other words the values of the update always @@ -713,8 +709,6 @@ final class ReadersAndUpdates { reader = createNewReaderWithLatestLiveDocs(reader); } assert pendingDeletes.verifyDocCounts(reader); - - return new MergeReader(reader, pendingDeletes.getHardLiveDocs()); } diff --git a/lucene/core/src/java/org/apache/lucene/index/SegmentCommitInfo.java b/lucene/core/src/java/org/apache/lucene/index/SegmentCommitInfo.java index 661283b4203..954a1382a48 100644 --- a/lucene/core/src/java/org/apache/lucene/index/SegmentCommitInfo.java +++ b/lucene/core/src/java/org/apache/lucene/index/SegmentCommitInfo.java @@ -38,6 +38,9 @@ public class SegmentCommitInfo { // How many deleted docs in the segment: private int delCount; + // How many soft-deleted docs in the segment that are not also hard-deleted: + private int softDelCount; + // Generation number of the live docs file (-1 if there // are no deletes yet): private long delGen; @@ -73,7 +76,7 @@ public class SegmentCommitInfo { // NOTE: only used in-RAM by IW to track buffered deletes; // this is never written to/read from the Directory private long bufferedDeletesGen = -1; - + /** * Sole constructor. 
* @@ -88,9 +91,10 @@ public class SegmentCommitInfo { * @param docValuesGen * DocValues generation number (used to name doc-values updates files) */ - public SegmentCommitInfo(SegmentInfo info, int delCount, long delGen, long fieldInfosGen, long docValuesGen) { + public SegmentCommitInfo(SegmentInfo info, int delCount, int softDelCount, long delGen, long fieldInfosGen, long docValuesGen) { this.info = info; this.delCount = delCount; + this.softDelCount = softDelCount; this.delGen = delGen; this.nextWriteDelGen = delGen == -1 ? 1 : delGen + 1; this.fieldInfosGen = fieldInfosGen; @@ -313,13 +317,29 @@ public class SegmentCommitInfo { return delCount; } + /** + * Returns the number of only soft-deleted docs. + */ + public int getSoftDelCount() { + return softDelCount; + } + void setDelCount(int delCount) { if (delCount < 0 || delCount > info.maxDoc()) { throw new IllegalArgumentException("invalid delCount=" + delCount + " (maxDoc=" + info.maxDoc() + ")"); } + assert softDelCount + delCount <= info.maxDoc(); this.delCount = delCount; } + void setSoftDelCount(int softDelCount) { + if (softDelCount < 0 || softDelCount > info.maxDoc()) { + throw new IllegalArgumentException("invalid softDelCount=" + softDelCount + " (maxDoc=" + info.maxDoc() + ")"); + } + assert softDelCount + delCount <= info.maxDoc(); + this.softDelCount = softDelCount; + } + /** Returns a description of this segment. 
*/ public String toString(int pendingDelCount) { String s = info.toString(delCount + pendingDelCount); @@ -332,6 +352,10 @@ public class SegmentCommitInfo { if (docValuesGen != -1) { s += ":dvGen=" + docValuesGen; } + if (softDelCount > 0) { + s += " :softDel=" + softDelCount; + } + return s; } @@ -342,7 +366,7 @@ public class SegmentCommitInfo { @Override public SegmentCommitInfo clone() { - SegmentCommitInfo other = new SegmentCommitInfo(info, delCount, delGen, fieldInfosGen, docValuesGen); + SegmentCommitInfo other = new SegmentCommitInfo(info, delCount, softDelCount, delGen, fieldInfosGen, docValuesGen); // Not clear that we need to carry over nextWriteDelGen // (i.e. do we ever clone after a failed write and // before the next successful write?), but just do it to @@ -360,4 +384,8 @@ public class SegmentCommitInfo { return other; } + + final int getDelCount(boolean includeSoftDeletes) { + return includeSoftDeletes ? getDelCount() + getSoftDelCount() : getDelCount(); + } } diff --git a/lucene/core/src/java/org/apache/lucene/index/SegmentInfos.java b/lucene/core/src/java/org/apache/lucene/index/SegmentInfos.java index ec88fef6bc2..5697eed3cd2 100644 --- a/lucene/core/src/java/org/apache/lucene/index/SegmentInfos.java +++ b/lucene/core/src/java/org/apache/lucene/index/SegmentInfos.java @@ -122,8 +122,9 @@ public final class SegmentInfos implements Cloneable, Iterable VERSION_72 ? 
input.readInt() : 0; + if (softDelCount < 0 || softDelCount > info.maxDoc()) { + throw new CorruptIndexException("invalid deletion count: " + softDelCount + " vs maxDoc=" + info.maxDoc(), input); + } + if (softDelCount + delCount > info.maxDoc()) { + throw new CorruptIndexException("invalid deletion count: " + softDelCount + delCount + " vs maxDoc=" + info.maxDoc(), input); + } + SegmentCommitInfo siPerCommit = new SegmentCommitInfo(info, delCount, softDelCount, delGen, fieldInfosGen, dvGen); siPerCommit.setFieldInfosFiles(input.readSetOfStrings()); final Map> dvUpdateFiles; final int numDVFields = input.readInt(); @@ -517,6 +525,11 @@ public final class SegmentInfos implements Cloneable, Iterable si.maxDoc()) { + throw new IllegalStateException("cannot write segment: invalid maxDoc segment=" + si.name + " maxDoc=" + si.maxDoc() + " softDelCount=" + softDelCount); + } + out.writeInt(softDelCount); out.writeSetOfStrings(siPerCommit.getFieldInfosFiles()); final Map> dvUpdatesFiles = siPerCommit.getDocValuesUpdatesFiles(); out.writeInt(dvUpdatesFiles.size()); diff --git a/lucene/core/src/java/org/apache/lucene/index/SegmentMerger.java b/lucene/core/src/java/org/apache/lucene/index/SegmentMerger.java index ad60a94298d..6554cc59da1 100644 --- a/lucene/core/src/java/org/apache/lucene/index/SegmentMerger.java +++ b/lucene/core/src/java/org/apache/lucene/index/SegmentMerger.java @@ -71,6 +71,7 @@ final class SegmentMerger { if (minVersion.onOrAfter(leafMinVersion)) { minVersion = leafMinVersion; } + } assert segmentInfo.minVersion == null : "The min version should be set by SegmentMerger for merged segments"; segmentInfo.minVersion = minVersion; diff --git a/lucene/core/src/java/org/apache/lucene/index/SegmentWriteState.java b/lucene/core/src/java/org/apache/lucene/index/SegmentWriteState.java index f56970109e8..d00a19edf52 100644 --- a/lucene/core/src/java/org/apache/lucene/index/SegmentWriteState.java +++ 
b/lucene/core/src/java/org/apache/lucene/index/SegmentWriteState.java @@ -47,7 +47,9 @@ public class SegmentWriteState { /** Number of deleted documents set while flushing the * segment. */ public int delCountOnFlush; - + /** Number of only soft deleted documents set while flushing the + * segment. */ + public int softDelCountOnFlush; /** * Deletes and updates to apply while we are flushing the segment. A Term is * enrolled in here if it was deleted/updated at one point, and it's mapped to diff --git a/lucene/core/src/java/org/apache/lucene/index/SoftDeletesDirectoryReaderWrapper.java b/lucene/core/src/java/org/apache/lucene/index/SoftDeletesDirectoryReaderWrapper.java index 36568f6d5f1..dc350115c19 100644 --- a/lucene/core/src/java/org/apache/lucene/index/SoftDeletesDirectoryReaderWrapper.java +++ b/lucene/core/src/java/org/apache/lucene/index/SoftDeletesDirectoryReaderWrapper.java @@ -62,6 +62,8 @@ public final class SoftDeletesDirectoryReaderWrapper extends FilterDirectoryRead // we try to reuse the life docs instances here if the reader cache key didn't change if (reader instanceof SoftDeletesFilterLeafReader && reader.getReaderCacheHelper() != null) { readerCache.put(((SoftDeletesFilterLeafReader) reader).reader.getReaderCacheHelper().getKey(), reader); + } else if (reader instanceof SoftDeletesFilterCodecReader && reader.getReaderCacheHelper() != null) { + readerCache.put(((SoftDeletesFilterCodecReader) reader).reader.getReaderCacheHelper().getKey(), reader); } } @@ -112,9 +114,35 @@ public final class SoftDeletesDirectoryReaderWrapper extends FilterDirectoryRead bits = new FixedBitSet(reader.maxDoc()); bits.set(0, reader.maxDoc()); } - int numDeletes = reader.numDeletedDocs() + PendingSoftDeletes.applySoftDeletes(iterator, bits); + int numSoftDeletes = PendingSoftDeletes.applySoftDeletes(iterator, bits); + int numDeletes = reader.numDeletedDocs() + numSoftDeletes; int numDocs = reader.maxDoc() - numDeletes; - return new SoftDeletesFilterLeafReader(reader, 
bits, numDocs); + assert assertDocCounts(numDocs, numSoftDeletes, reader); + return reader instanceof CodecReader ? new SoftDeletesFilterCodecReader((CodecReader) reader, bits, numDocs) + : new SoftDeletesFilterLeafReader(reader, bits, numDocs); + } + + private static boolean assertDocCounts(int expectedNumDocs, int numSoftDeletes, LeafReader reader) { + if (reader instanceof SegmentReader) { + SegmentReader segmentReader = (SegmentReader) reader; + SegmentCommitInfo segmentInfo = segmentReader.getSegmentInfo(); + if (segmentReader.isNRT == false) { + int numDocs = segmentInfo.info.maxDoc() - segmentInfo.getSoftDelCount() - segmentInfo.getDelCount(); + assert numDocs == expectedNumDocs : "numDocs: " + numDocs + " expected: " + expectedNumDocs + + " maxDoc: " + segmentInfo.info.maxDoc() + + " getDelCount: " + segmentInfo.getDelCount() + + " getSoftDelCount: " + segmentInfo.getSoftDelCount() + + " numSoftDeletes: " + numSoftDeletes + + " reader.numDeletedDocs(): " + reader.numDeletedDocs(); + } + // in the NRT case we don't have accurate numbers for getDelCount and getSoftDelCount since they might not be + // flushed to disk when this reader is opened. We don't necessarily flush deleted doc on reopen but + // we do for docValues. + + + } + + return true; } static final class SoftDeletesFilterLeafReader extends FilterLeafReader { @@ -153,6 +181,42 @@ public final class SoftDeletesDirectoryReaderWrapper extends FilterDirectoryRead } } + final static class SoftDeletesFilterCodecReader extends FilterCodecReader { + private final LeafReader reader; + private final FixedBitSet bits; + private final int numDocs; + private final CacheHelper readerCacheHelper; + + private SoftDeletesFilterCodecReader(CodecReader reader, FixedBitSet bits, int numDocs) { + super(reader); + this.reader = reader; + this.bits = bits; + this.numDocs = numDocs; + this.readerCacheHelper = reader.getReaderCacheHelper() == null ? 
null : + new DelegatingCacheHelper(reader.getReaderCacheHelper()); + } + + @Override + public Bits getLiveDocs() { + return bits; + } + + @Override + public int numDocs() { + return numDocs; + } + + @Override + public CacheHelper getCoreCacheHelper() { + return reader.getCoreCacheHelper(); + } + + @Override + public CacheHelper getReaderCacheHelper() { + return readerCacheHelper; + } + } + private static class DelegatingCacheHelper implements CacheHelper { private final CacheHelper delegate; private final CacheKey cacheKey = new CacheKey(); diff --git a/lucene/core/src/java/org/apache/lucene/index/SoftDeletesRetentionMergePolicy.java b/lucene/core/src/java/org/apache/lucene/index/SoftDeletesRetentionMergePolicy.java index ad725ff02e0..515068c207a 100644 --- a/lucene/core/src/java/org/apache/lucene/index/SoftDeletesRetentionMergePolicy.java +++ b/lucene/core/src/java/org/apache/lucene/index/SoftDeletesRetentionMergePolicy.java @@ -175,7 +175,7 @@ public final class SoftDeletesRetentionMergePolicy extends OneMergeWrappingMerge @Override public int numDeletesToMerge(SegmentCommitInfo info, int delCount, IOSupplier readerSupplier) throws IOException { final int numDeletesToMerge = super.numDeletesToMerge(info, delCount, readerSupplier); - if (numDeletesToMerge != 0) { + if (numDeletesToMerge != 0 && info.getSoftDelCount() > 0) { final CodecReader reader = readerSupplier.get(); if (reader.getLiveDocs() != null) { BooleanQuery.Builder builder = new BooleanQuery.Builder(); diff --git a/lucene/core/src/java/org/apache/lucene/index/SortedDocValuesWriter.java b/lucene/core/src/java/org/apache/lucene/index/SortedDocValuesWriter.java index be7f4886588..86d0f0bab33 100644 --- a/lucene/core/src/java/org/apache/lucene/index/SortedDocValuesWriter.java +++ b/lucene/core/src/java/org/apache/lucene/index/SortedDocValuesWriter.java @@ -244,4 +244,9 @@ class SortedDocValuesWriter extends DocValuesWriter { return valueCount; } } + + @Override + DocIdSetIterator getDocIdSet() { + return 
docsWithField.iterator(); + } } diff --git a/lucene/core/src/java/org/apache/lucene/index/SortedNumericDocValuesWriter.java b/lucene/core/src/java/org/apache/lucene/index/SortedNumericDocValuesWriter.java index 8f58014f69c..bdc65cc8057 100644 --- a/lucene/core/src/java/org/apache/lucene/index/SortedNumericDocValuesWriter.java +++ b/lucene/core/src/java/org/apache/lucene/index/SortedNumericDocValuesWriter.java @@ -231,4 +231,9 @@ class SortedNumericDocValuesWriter extends DocValuesWriter { return docsWithField.cost(); } } + + @Override + DocIdSetIterator getDocIdSet() { + return docsWithField.iterator(); + } } diff --git a/lucene/core/src/java/org/apache/lucene/index/SortedSetDocValuesWriter.java b/lucene/core/src/java/org/apache/lucene/index/SortedSetDocValuesWriter.java index 22be7e50ba9..700090a48fd 100644 --- a/lucene/core/src/java/org/apache/lucene/index/SortedSetDocValuesWriter.java +++ b/lucene/core/src/java/org/apache/lucene/index/SortedSetDocValuesWriter.java @@ -315,4 +315,8 @@ class SortedSetDocValuesWriter extends DocValuesWriter { return scratch; } } + @Override + DocIdSetIterator getDocIdSet() { + return docsWithField.iterator(); + } } diff --git a/lucene/core/src/test/org/apache/lucene/index/TestAddIndexes.java b/lucene/core/src/test/org/apache/lucene/index/TestAddIndexes.java index 0df7ac8c74a..13073186abc 100644 --- a/lucene/core/src/test/org/apache/lucene/index/TestAddIndexes.java +++ b/lucene/core/src/test/org/apache/lucene/index/TestAddIndexes.java @@ -1410,4 +1410,50 @@ public class TestAddIndexes extends LuceneTestCase { dir1.close(); dir2.close(); } + + public void testAddIndicesWithSoftDeletes() throws IOException { + Directory dir1 = newDirectory(); + IndexWriterConfig iwc1 = newIndexWriterConfig(new MockAnalyzer(random())).setSoftDeletesField("soft_delete"); + IndexWriter writer = new IndexWriter(dir1, iwc1); + for (int i = 0; i < 30; i++) { + Document doc = new Document(); + int docID = random().nextInt(5); + doc.add(new StringField("id", 
"" + docID, Field.Store.YES)); + writer.softUpdateDocument(new Term("id", "" + docID), doc, new NumericDocValuesField("soft_delete", 1)); + if (random().nextBoolean()) { + writer.flush(); + } + } + writer.commit(); + writer.close(); + DirectoryReader reader = DirectoryReader.open(dir1); + DirectoryReader wrappedReader = new SoftDeletesDirectoryReaderWrapper(reader, "soft_delete"); + Directory dir2 = newDirectory(); + int numDocs = reader.numDocs(); + int maxDoc = reader.maxDoc(); + assertEquals(numDocs, maxDoc); + iwc1 = newIndexWriterConfig(new MockAnalyzer(random())).setSoftDeletesField("soft_delete"); + writer = new IndexWriter(dir2, iwc1); + CodecReader[] readers = new CodecReader[reader.leaves().size()]; + for (int i = 0; i < readers.length; i++) { + readers[i] = (CodecReader)reader.leaves().get(i).reader(); + } + writer.addIndexes(readers); + assertEquals(wrappedReader.numDocs(), writer.numDocs()); + assertEquals(maxDoc, writer.maxDoc()); + writer.commit(); + SegmentCommitInfo commitInfo = writer.segmentInfos.asList().get(0); + assertEquals(maxDoc-wrappedReader.numDocs(), commitInfo.getSoftDelCount()); + writer.close(); + Directory dir3 = newDirectory(); + iwc1 = newIndexWriterConfig(new MockAnalyzer(random())).setSoftDeletesField("soft_delete"); + writer = new IndexWriter(dir3, iwc1); + for (int i = 0; i < readers.length; i++) { + readers[i] = (CodecReader)wrappedReader.leaves().get(i).reader(); + } + writer.addIndexes(readers); + assertEquals(wrappedReader.numDocs(), writer.numDocs()); + assertEquals(wrappedReader.numDocs(), writer.maxDoc()); + IOUtils.close(reader, writer, dir3, dir2, dir1); + } } diff --git a/lucene/core/src/test/org/apache/lucene/index/TestCodecs.java b/lucene/core/src/test/org/apache/lucene/index/TestCodecs.java index acc6506f4f6..5ff0dde4515 100644 --- a/lucene/core/src/test/org/apache/lucene/index/TestCodecs.java +++ b/lucene/core/src/test/org/apache/lucene/index/TestCodecs.java @@ -212,7 +212,7 @@ public class TestCodecs extends 
LuceneTestCase { terms[i] = new TermData(text, docs, null); } - final FieldInfos.Builder builder = new FieldInfos.Builder(); + final FieldInfos.Builder builder = new FieldInfos.Builder(new FieldInfos.FieldNumbers(null)); final FieldData field = new FieldData("field", builder, terms, true, false); final FieldData[] fields = new FieldData[] {field}; @@ -259,7 +259,7 @@ public class TestCodecs extends LuceneTestCase { } public void testRandomPostings() throws Throwable { - final FieldInfos.Builder builder = new FieldInfos.Builder(); + final FieldInfos.Builder builder = new FieldInfos.Builder(new FieldInfos.FieldNumbers(null)); final FieldData[] fields = new FieldData[NUM_FIELDS]; for(int i=0;iasList(r1, r2), si, InfoStream.getDefault(), trackingDir, - new FieldInfos.FieldNumbers(), context); + new FieldInfos.FieldNumbers(null), context); MergeState mergeState = merger.merge(); r1.close(); @@ -238,7 +238,7 @@ public class TestDoc extends LuceneTestCase { } } - return new SegmentCommitInfo(si, 0, -1L, -1L, -1L); + return new SegmentCommitInfo(si, 0, 0, -1L, -1L, -1L); } diff --git a/lucene/core/src/test/org/apache/lucene/index/TestFieldsReader.java b/lucene/core/src/test/org/apache/lucene/index/TestFieldsReader.java index 48d69ec4149..ce24b7f19a6 100644 --- a/lucene/core/src/test/org/apache/lucene/index/TestFieldsReader.java +++ b/lucene/core/src/test/org/apache/lucene/index/TestFieldsReader.java @@ -44,7 +44,7 @@ public class TestFieldsReader extends LuceneTestCase { @BeforeClass public static void beforeClass() throws Exception { testDoc = new Document(); - fieldInfos = new FieldInfos.Builder(); + fieldInfos = new FieldInfos.Builder(new FieldInfos.FieldNumbers(null)); DocHelper.setupDoc(testDoc); for (IndexableField field : testDoc.getFields()) { FieldInfo fieldInfo = fieldInfos.getOrAdd(field.name()); diff --git a/lucene/core/src/test/org/apache/lucene/index/TestIndexWriter.java b/lucene/core/src/test/org/apache/lucene/index/TestIndexWriter.java index 
5e394d560fc..967055ed852 100644 --- a/lucene/core/src/test/org/apache/lucene/index/TestIndexWriter.java +++ b/lucene/core/src/test/org/apache/lucene/index/TestIndexWriter.java @@ -3137,7 +3137,11 @@ public class TestIndexWriter extends LuceneTestCase { searcher = new IndexSearcher(reader); topDocs = searcher.search(new TermQuery(new Term("id", "1")), 10); assertEquals(0, topDocs.totalHits); - + int numSoftDeleted = 0; + for (SegmentCommitInfo info : writer.segmentInfos) { + numSoftDeleted += info.getSoftDelCount(); + } + assertEquals(writer.maxDoc() - writer.numDocs(), numSoftDeleted); writer.close(); reader.close(); dir.close(); @@ -3267,6 +3271,20 @@ public class TestIndexWriter extends LuceneTestCase { assertEquals(1, reader.docFreq(new Term("id", id))); } } + int numSoftDeleted = 0; + for (SegmentCommitInfo info : writer.segmentInfos) { + numSoftDeleted += info.getSoftDelCount() + info.getDelCount(); + } + assertEquals(writer.maxDoc() - writer.numDocs(), numSoftDeleted); + writer.commit(); + try (DirectoryReader dirReader = DirectoryReader.open(dir)) { + int delCount = 0; + for (LeafReaderContext ctx : dirReader.leaves()) { + SegmentCommitInfo segmentInfo = ((SegmentReader) ctx.reader()).getSegmentInfo(); + delCount += segmentInfo.getSoftDelCount() + segmentInfo.getDelCount(); + } + assertEquals(numSoftDeleted, delCount); + } IOUtils.close(reader, writer, dir); } @@ -3376,4 +3394,110 @@ public class TestIndexWriter extends LuceneTestCase { IOUtils.close(reader, writer, dir); } + public void testPreventChangingSoftDeletesField() throws Exception { + Directory dir = newDirectory(); + IndexWriter writer = new IndexWriter(dir, newIndexWriterConfig().setSoftDeletesField("my_deletes")); + Document v1 = new Document(); + v1.add(new StringField("id", "1", Field.Store.YES)); + v1.add(new StringField("version", "1", Field.Store.YES)); + writer.addDocument(v1); + Document v2 = new Document(); + v2.add(new StringField("id", "1", Field.Store.YES)); + v2.add(new 
StringField("version", "2", Field.Store.YES)); + writer.softUpdateDocument(new Term("id", "1"), v2, new NumericDocValuesField("my_deletes", 1)); + writer.commit(); + writer.close(); + for (SegmentCommitInfo si : SegmentInfos.readLatestCommit(dir)) { + FieldInfos fieldInfos = IndexWriter.readFieldInfos(si); + assertEquals("my_deletes", fieldInfos.getSoftDeletesField()); + assertTrue(fieldInfos.fieldInfo("my_deletes").isSoftDeletesField()); + } + + IllegalArgumentException illegalError = expectThrows(IllegalArgumentException.class, () -> { + new IndexWriter(dir, newIndexWriterConfig().setSoftDeletesField("your_deletes")); + }); + assertEquals("cannot configure [your_deletes] as soft-deletes; " + + "this index uses [my_deletes] as soft-deletes already", illegalError.getMessage()); + + IndexWriterConfig softDeleteConfig = newIndexWriterConfig().setSoftDeletesField("my_deletes") + .setMergePolicy(new SoftDeletesRetentionMergePolicy("my_deletes", () -> new MatchAllDocsQuery(), newMergePolicy())); + writer = new IndexWriter(dir, softDeleteConfig); + Document tombstone = new Document(); + tombstone.add(new StringField("id", "tombstone", Field.Store.YES)); + tombstone.add(new NumericDocValuesField("my_deletes", 1)); + writer.addDocument(tombstone); + writer.flush(); + for (SegmentCommitInfo si : writer.segmentInfos) { + FieldInfos fieldInfos = IndexWriter.readFieldInfos(si); + assertEquals("my_deletes", fieldInfos.getSoftDeletesField()); + assertTrue(fieldInfos.fieldInfo("my_deletes").isSoftDeletesField()); + } + writer.close(); + // reopen writer without soft-deletes field should be prevented + IllegalArgumentException reopenError = expectThrows(IllegalArgumentException.class, () -> { + new IndexWriter(dir, newIndexWriterConfig()); + }); + assertEquals("this index has [my_deletes] as soft-deletes already" + + " but soft-deletes field is not configured in IWC", reopenError.getMessage()); + dir.close(); + } + + public void 
testPreventAddingIndexesWithDifferentSoftDeletesField() throws Exception { + Directory dir1 = newDirectory(); + IndexWriter w1 = new IndexWriter(dir1, newIndexWriterConfig().setSoftDeletesField("soft_deletes_1")); + for (int i = 0; i < 2; i++) { + Document d = new Document(); + d.add(new StringField("id", "1", Field.Store.YES)); + d.add(new StringField("version", Integer.toString(i), Field.Store.YES)); + w1.softUpdateDocument(new Term("id", "1"), d, new NumericDocValuesField("soft_deletes_1", 1)); + } + w1.commit(); + w1.close(); + + Directory dir2 = newDirectory(); + IndexWriter w2 = new IndexWriter(dir2, newIndexWriterConfig().setSoftDeletesField("soft_deletes_2")); + IllegalArgumentException error = expectThrows(IllegalArgumentException.class, () -> w2.addIndexes(dir1)); + assertEquals("cannot configure [soft_deletes_2] as soft-deletes; this index uses [soft_deletes_1] as soft-deletes already", + error.getMessage()); + w2.close(); + + Directory dir3 = newDirectory(); + IndexWriterConfig config = newIndexWriterConfig().setSoftDeletesField("soft_deletes_1"); + IndexWriter w3 = new IndexWriter(dir3, config); + w3.addIndexes(dir1); + for (SegmentCommitInfo si : w3.segmentInfos) { + FieldInfo softDeleteField = IndexWriter.readFieldInfos(si).fieldInfo("soft_deletes_1"); + assertTrue(softDeleteField.isSoftDeletesField()); + } + w3.close(); + IOUtils.close(dir1, dir2, dir3); + } + + public void testNotAllowUsingExistingFieldAsSoftDeletes() throws Exception { + Directory dir = newDirectory(); + IndexWriter w = new IndexWriter(dir, newIndexWriterConfig()); + for (int i = 0; i < 2; i++) { + Document d = new Document(); + d.add(new StringField("id", "1", Field.Store.YES)); + if (random().nextBoolean()) { + d.add(new NumericDocValuesField("dv_field", 1)); + w.updateDocument(new Term("id", "1"), d); + } else { + w.softUpdateDocument(new Term("id", "1"), d, new NumericDocValuesField("dv_field", 1)); + } + } + w.commit(); + w.close(); + String softDeletesField = 
random().nextBoolean() ? "id" : "dv_field"; + IllegalArgumentException error = expectThrows(IllegalArgumentException.class, () -> { + IndexWriterConfig config = newIndexWriterConfig().setSoftDeletesField(softDeletesField); + new IndexWriter(dir, config); + }); + assertEquals("cannot configure [" + softDeletesField + "] as soft-deletes;" + + " this index uses [" + softDeletesField + "] as non-soft-deletes already", error.getMessage()); + IndexWriterConfig config = newIndexWriterConfig().setSoftDeletesField("non-existing-field"); + w = new IndexWriter(dir, config); + w.close(); + dir.close(); + } } diff --git a/lucene/core/src/test/org/apache/lucene/index/TestIndexWriterThreadsToSegments.java b/lucene/core/src/test/org/apache/lucene/index/TestIndexWriterThreadsToSegments.java index 359e7d02739..4339d3e5b86 100644 --- a/lucene/core/src/test/org/apache/lucene/index/TestIndexWriterThreadsToSegments.java +++ b/lucene/core/src/test/org/apache/lucene/index/TestIndexWriterThreadsToSegments.java @@ -331,7 +331,7 @@ public class TestIndexWriterThreadsToSegments extends LuceneTestCase { byte id[] = readSegmentInfoID(dir, fileName); SegmentInfo si = TestUtil.getDefaultCodec().segmentInfoFormat().read(dir, segName, id, IOContext.DEFAULT); si.setCodec(codec); - SegmentCommitInfo sci = new SegmentCommitInfo(si, 0, -1, -1, -1); + SegmentCommitInfo sci = new SegmentCommitInfo(si, 0, 0, -1, -1, -1); SegmentReader sr = new SegmentReader(sci, Version.LATEST.major, IOContext.DEFAULT); try { thread0Count += sr.docFreq(new Term("field", "threadID0")); diff --git a/lucene/core/src/test/org/apache/lucene/index/TestOneMergeWrappingMergePolicy.java b/lucene/core/src/test/org/apache/lucene/index/TestOneMergeWrappingMergePolicy.java index 219c7770d83..e240f549ecd 100644 --- a/lucene/core/src/test/org/apache/lucene/index/TestOneMergeWrappingMergePolicy.java +++ b/lucene/core/src/test/org/apache/lucene/index/TestOneMergeWrappingMergePolicy.java @@ -137,7 +137,7 @@ public class 
TestOneMergeWrappingMergePolicy extends LuceneTestCase { Collections.emptyMap(), // attributes null /* indexSort */); final List segments = new LinkedList(); - segments.add(new SegmentCommitInfo(si, 0, 0, 0, 0)); + segments.add(new SegmentCommitInfo(si, 0, 0, 0, 0, 0)); ms.add(new MergePolicy.OneMerge(segments)); } } diff --git a/lucene/core/src/test/org/apache/lucene/index/TestPendingDeletes.java b/lucene/core/src/test/org/apache/lucene/index/TestPendingDeletes.java index ecc2d4de51e..d4530344adf 100644 --- a/lucene/core/src/test/org/apache/lucene/index/TestPendingDeletes.java +++ b/lucene/core/src/test/org/apache/lucene/index/TestPendingDeletes.java @@ -40,7 +40,7 @@ public class TestPendingDeletes extends LuceneTestCase { RAMDirectory dir = new RAMDirectory(); SegmentInfo si = new SegmentInfo(dir, Version.LATEST, Version.LATEST, "test", 10, false, Codec.getDefault(), Collections.emptyMap(), StringHelper.randomId(), new HashMap<>(), null); - SegmentCommitInfo commitInfo = new SegmentCommitInfo(si, 0, -1, -1, -1); + SegmentCommitInfo commitInfo = new SegmentCommitInfo(si, 0, 0, -1, -1, -1); PendingDeletes deletes = newPendingDeletes(commitInfo); assertNull(deletes.getLiveDocs()); int docToDelete = TestUtil.nextInt(random(), 0, 7); @@ -74,7 +74,7 @@ public class TestPendingDeletes extends LuceneTestCase { RAMDirectory dir = new RAMDirectory(); SegmentInfo si = new SegmentInfo(dir, Version.LATEST, Version.LATEST, "test", 6, false, Codec.getDefault(), Collections.emptyMap(), StringHelper.randomId(), new HashMap<>(), null); - SegmentCommitInfo commitInfo = new SegmentCommitInfo(si, 0, -1, -1, -1); + SegmentCommitInfo commitInfo = new SegmentCommitInfo(si, 0, 0, -1, -1, -1); PendingDeletes deletes = newPendingDeletes(commitInfo); assertFalse(deletes.writeLiveDocs(dir)); assertEquals(0, dir.listAll().length); @@ -131,7 +131,7 @@ public class TestPendingDeletes extends LuceneTestCase { RAMDirectory dir = new RAMDirectory(); SegmentInfo si = new SegmentInfo(dir, 
Version.LATEST, Version.LATEST, "test", 3, false, Codec.getDefault(), Collections.emptyMap(), StringHelper.randomId(), new HashMap<>(), null); - SegmentCommitInfo commitInfo = new SegmentCommitInfo(si, 0, -1, -1, -1); + SegmentCommitInfo commitInfo = new SegmentCommitInfo(si, 0, 0, -1, -1, -1); FieldInfos fieldInfos = new FieldInfos(new FieldInfo[0]); si.getCodec().fieldInfosFormat().write(dir, si, "", fieldInfos, IOContext.DEFAULT); PendingDeletes deletes = newPendingDeletes(commitInfo); diff --git a/lucene/core/src/test/org/apache/lucene/index/TestPendingSoftDeletes.java b/lucene/core/src/test/org/apache/lucene/index/TestPendingSoftDeletes.java index 5fadd3f10cd..b6438552976 100644 --- a/lucene/core/src/test/org/apache/lucene/index/TestPendingSoftDeletes.java +++ b/lucene/core/src/test/org/apache/lucene/index/TestPendingSoftDeletes.java @@ -44,6 +44,45 @@ public class TestPendingSoftDeletes extends TestPendingDeletes { return new PendingSoftDeletes("_soft_deletes", commitInfo); } + public void testHardDeleteSoftDeleted() throws IOException { + Directory dir = newDirectory(); + IndexWriter writer = new IndexWriter(dir, newIndexWriterConfig() + .setSoftDeletesField("_soft_deletes") + // make sure all docs will end up in the same segment + .setMaxBufferedDocs(10) + .setRAMBufferSizeMB(IndexWriterConfig.DISABLE_AUTO_FLUSH)); + Document doc = new Document(); + doc.add(new StringField("id", "1", Field.Store.YES)); + writer.softUpdateDocument(new Term("id", "1"), doc, + new NumericDocValuesField("_soft_deletes", 1)); + doc = new Document(); + doc.add(new StringField("id", "2", Field.Store.YES)); + writer.softUpdateDocument(new Term("id", "2"), doc, + new NumericDocValuesField("_soft_deletes", 1)); + doc = new Document(); + doc.add(new StringField("id", "2", Field.Store.YES)); + writer.softUpdateDocument(new Term("id", "2"), doc, + new NumericDocValuesField("_soft_deletes", 1)); + writer.commit(); + DirectoryReader reader = writer.getReader(); + assertEquals(1, 
reader.leaves().size()); + SegmentReader segmentReader = (SegmentReader) reader.leaves().get(0).reader(); + SegmentCommitInfo segmentInfo = segmentReader.getSegmentInfo(); + PendingSoftDeletes pendingSoftDeletes = newPendingDeletes(segmentInfo); + pendingSoftDeletes.onNewReader(segmentReader, segmentInfo); + assertEquals(0, pendingSoftDeletes.numPendingDeletes()); + assertEquals(1, pendingSoftDeletes.getDelCount()); + assertTrue(pendingSoftDeletes.getLiveDocs().get(0)); + assertFalse(pendingSoftDeletes.getLiveDocs().get(1)); + assertTrue(pendingSoftDeletes.getLiveDocs().get(2)); + assertNull(pendingSoftDeletes.getHardLiveDocs()); + assertTrue(pendingSoftDeletes.delete(1)); + assertEquals(0, pendingSoftDeletes.numPendingDeletes()); + assertEquals(-1, pendingSoftDeletes.pendingDeleteCount); // transferred the delete + assertEquals(1, pendingSoftDeletes.getDelCount()); + IOUtils.close(reader, writer, dir); + } + public void testDeleteSoft() throws IOException { Directory dir = newDirectory(); IndexWriter writer = new IndexWriter(dir, newIndexWriterConfig() @@ -70,7 +109,8 @@ public class TestPendingSoftDeletes extends TestPendingDeletes { SegmentCommitInfo segmentInfo = segmentReader.getSegmentInfo(); PendingSoftDeletes pendingSoftDeletes = newPendingDeletes(segmentInfo); pendingSoftDeletes.onNewReader(segmentReader, segmentInfo); - assertEquals(1, pendingSoftDeletes.numPendingDeletes()); + assertEquals(0, pendingSoftDeletes.numPendingDeletes()); + assertEquals(1, pendingSoftDeletes.getDelCount()); assertTrue(pendingSoftDeletes.getLiveDocs().get(0)); assertFalse(pendingSoftDeletes.getLiveDocs().get(1)); assertTrue(pendingSoftDeletes.getLiveDocs().get(2)); @@ -78,7 +118,8 @@ public class TestPendingSoftDeletes extends TestPendingDeletes { // pass reader again Bits liveDocs = pendingSoftDeletes.getLiveDocs(); pendingSoftDeletes.onNewReader(segmentReader, segmentInfo); - assertEquals(1, pendingSoftDeletes.numPendingDeletes()); + assertEquals(0, 
pendingSoftDeletes.numPendingDeletes()); + assertEquals(1, pendingSoftDeletes.getDelCount()); assertSame(liveDocs, pendingSoftDeletes.getLiveDocs()); // now apply a hard delete @@ -91,7 +132,8 @@ public class TestPendingSoftDeletes extends TestPendingDeletes { segmentInfo = segmentReader.getSegmentInfo(); pendingSoftDeletes = newPendingDeletes(segmentInfo); pendingSoftDeletes.onNewReader(segmentReader, segmentInfo); - assertEquals(1, pendingSoftDeletes.numPendingDeletes()); + assertEquals(0, pendingSoftDeletes.numPendingDeletes()); + assertEquals(2, pendingSoftDeletes.getDelCount()); assertFalse(pendingSoftDeletes.getLiveDocs().get(0)); assertFalse(pendingSoftDeletes.getLiveDocs().get(1)); assertTrue(pendingSoftDeletes.getLiveDocs().get(2)); @@ -106,7 +148,7 @@ public class TestPendingSoftDeletes extends TestPendingDeletes { RAMDirectory dir = new RAMDirectory(); SegmentInfo si = new SegmentInfo(dir, Version.LATEST, Version.LATEST, "test", 10, false, Codec.getDefault(), Collections.emptyMap(), StringHelper.randomId(), new HashMap<>(), null); - SegmentCommitInfo commitInfo = new SegmentCommitInfo(si, 0, -1, -1, -1); + SegmentCommitInfo commitInfo = new SegmentCommitInfo(si, 0, 0, -1, -1, -1); IndexWriter writer = new IndexWriter(dir, newIndexWriterConfig()); for (int i = 0; i < si.maxDoc(); i++) { writer.addDocument(new Document()); @@ -120,13 +162,14 @@ public class TestPendingSoftDeletes extends TestPendingDeletes { deletes.onNewReader(segmentReader, commitInfo); reader.close(); writer.close(); - FieldInfo fieldInfo = new FieldInfo("_soft_deletes", 1, false, false, false, IndexOptions.NONE, DocValuesType.NUMERIC, 0, Collections.emptyMap(), 0, 0); + FieldInfo fieldInfo = new FieldInfo("_soft_deletes", 1, false, false, false, IndexOptions.NONE, DocValuesType.NUMERIC, 0, Collections.emptyMap(), 0, 0, true); List docsDeleted = Arrays.asList(1, 3, 7, 8, DocIdSetIterator.NO_MORE_DOCS); List updates = Arrays.asList(singleUpdate(docsDeleted, 10, true)); for 
(DocValuesFieldUpdates update : updates) { deletes.onDocValuesUpdate(fieldInfo, update.iterator()); } - assertEquals(4, deletes.numPendingDeletes()); + assertEquals(0, deletes.numPendingDeletes()); + assertEquals(4, deletes.getDelCount()); assertTrue(deletes.getLiveDocs().get(0)); assertFalse(deletes.getLiveDocs().get(1)); assertTrue(deletes.getLiveDocs().get(2)); @@ -140,11 +183,12 @@ public class TestPendingSoftDeletes extends TestPendingDeletes { docsDeleted = Arrays.asList(1, 2, DocIdSetIterator.NO_MORE_DOCS); updates = Arrays.asList(singleUpdate(docsDeleted, 10, true)); - fieldInfo = new FieldInfo("_soft_deletes", 1, false, false, false, IndexOptions.NONE, DocValuesType.NUMERIC, 1, Collections.emptyMap(), 0, 0); + fieldInfo = new FieldInfo("_soft_deletes", 1, false, false, false, IndexOptions.NONE, DocValuesType.NUMERIC, 1, Collections.emptyMap(), 0, 0, true); for (DocValuesFieldUpdates update : updates) { deletes.onDocValuesUpdate(fieldInfo, update.iterator()); } - assertEquals(5, deletes.numPendingDeletes()); + assertEquals(0, deletes.numPendingDeletes()); + assertEquals(5, deletes.getDelCount()); assertTrue(deletes.getLiveDocs().get(0)); assertFalse(deletes.getLiveDocs().get(1)); assertFalse(deletes.getLiveDocs().get(2)); @@ -182,13 +226,14 @@ public class TestPendingSoftDeletes extends TestPendingDeletes { SegmentCommitInfo segmentInfo = segmentReader.getSegmentInfo(); PendingDeletes deletes = newPendingDeletes(segmentInfo); deletes.onNewReader(segmentReader, segmentInfo); - FieldInfo fieldInfo = new FieldInfo("_soft_deletes", 1, false, false, false, IndexOptions.NONE, DocValuesType.NUMERIC, segmentInfo.getNextDocValuesGen(), Collections.emptyMap(), 0, 0); + FieldInfo fieldInfo = new FieldInfo("_soft_deletes", 1, false, false, false, IndexOptions.NONE, DocValuesType.NUMERIC, segmentInfo.getNextDocValuesGen(), Collections.emptyMap(), 0, 0, true); List docsDeleted = Arrays.asList(1, DocIdSetIterator.NO_MORE_DOCS); List updates = 
Arrays.asList(singleUpdate(docsDeleted, 3, true)); for (DocValuesFieldUpdates update : updates) { deletes.onDocValuesUpdate(fieldInfo, update.iterator()); } - assertEquals(1, deletes.numPendingDeletes()); + assertEquals(0, deletes.numPendingDeletes()); + assertEquals(1, deletes.getDelCount()); assertTrue(deletes.getLiveDocs().get(0)); assertFalse(deletes.getLiveDocs().get(1)); assertTrue(deletes.getLiveDocs().get(2)); @@ -199,7 +244,8 @@ public class TestPendingSoftDeletes extends TestPendingDeletes { assertTrue(deletes.getLiveDocs().get(0)); assertFalse(deletes.getLiveDocs().get(1)); assertTrue(deletes.getLiveDocs().get(2)); - assertEquals(1, deletes.numPendingDeletes()); + assertEquals(0, deletes.numPendingDeletes()); + assertEquals(1, deletes.getDelCount()); IOUtils.close(reader, writer, dir); } @@ -228,7 +274,7 @@ public class TestPendingSoftDeletes extends TestPendingDeletes { SegmentCommitInfo segmentInfo = segmentReader.getSegmentInfo(); PendingDeletes deletes = newPendingDeletes(segmentInfo); deletes.onNewReader(segmentReader, segmentInfo); - FieldInfo fieldInfo = new FieldInfo("_soft_deletes", 1, false, false, false, IndexOptions.NONE, DocValuesType.NUMERIC, segmentInfo.getNextDocValuesGen(), Collections.emptyMap(), 0, 0); + FieldInfo fieldInfo = new FieldInfo("_soft_deletes", 1, false, false, false, IndexOptions.NONE, DocValuesType.NUMERIC, segmentInfo.getNextDocValuesGen(), Collections.emptyMap(), 0, 0, true); List updates = Arrays.asList(singleUpdate(Arrays.asList(0, 1, DocIdSetIterator.NO_MORE_DOCS), 3, false)); for (DocValuesFieldUpdates update : updates) { deletes.onDocValuesUpdate(fieldInfo, update.iterator()); @@ -247,7 +293,7 @@ public class TestPendingSoftDeletes extends TestPendingDeletes { assertEquals(0, deletes.numPendingDeletes()); segmentInfo.advanceDocValuesGen(); - fieldInfo = new FieldInfo("_soft_deletes", 1, false, false, false, IndexOptions.NONE, DocValuesType.NUMERIC, segmentInfo.getNextDocValuesGen(), Collections.emptyMap(), 0, 0); + 
fieldInfo = new FieldInfo("_soft_deletes", 1, false, false, false, IndexOptions.NONE, DocValuesType.NUMERIC, segmentInfo.getNextDocValuesGen(), Collections.emptyMap(), 0, 0, true); updates = Arrays.asList(singleUpdate(Arrays.asList(1, DocIdSetIterator.NO_MORE_DOCS), 3, true)); for (DocValuesFieldUpdates update : updates) { deletes.onDocValuesUpdate(fieldInfo, update.iterator()); @@ -257,7 +303,8 @@ public class TestPendingSoftDeletes extends TestPendingDeletes { assertTrue(deletes.getLiveDocs().get(0)); assertFalse(deletes.getLiveDocs().get(1)); assertTrue(deletes.getLiveDocs().get(2)); - assertEquals(1, deletes.numPendingDeletes()); + assertEquals(0, deletes.numPendingDeletes()); + assertEquals(1, deletes.getDelCount()); IOUtils.close(reader, writer, dir); } diff --git a/lucene/core/src/test/org/apache/lucene/index/TestSegmentInfos.java b/lucene/core/src/test/org/apache/lucene/index/TestSegmentInfos.java index e9edf4e3ce9..de78ffc50d0 100644 --- a/lucene/core/src/test/org/apache/lucene/index/TestSegmentInfos.java +++ b/lucene/core/src/test/org/apache/lucene/index/TestSegmentInfos.java @@ -60,7 +60,7 @@ public class TestSegmentInfos extends LuceneTestCase { Collections.emptyMap(), id, Collections.emptyMap(), null); info.setFiles(Collections.emptySet()); codec.segmentInfoFormat().write(dir, info, IOContext.DEFAULT); - SegmentCommitInfo commitInfo = new SegmentCommitInfo(info, 0, -1, -1, -1); + SegmentCommitInfo commitInfo = new SegmentCommitInfo(info, 0, 0, -1, -1, -1); sis.add(commitInfo); sis.commit(dir); @@ -82,14 +82,14 @@ public class TestSegmentInfos extends LuceneTestCase { Collections.emptyMap(), id, Collections.emptyMap(), null); info.setFiles(Collections.emptySet()); codec.segmentInfoFormat().write(dir, info, IOContext.DEFAULT); - SegmentCommitInfo commitInfo = new SegmentCommitInfo(info, 0, -1, -1, -1); + SegmentCommitInfo commitInfo = new SegmentCommitInfo(info, 0, 0, -1, -1, -1); sis.add(commitInfo); info = new SegmentInfo(dir, Version.LUCENE_8_0_0, 
Version.LUCENE_8_0_0, "_1", 1, false, Codec.getDefault(), Collections.emptyMap(), id, Collections.emptyMap(), null); info.setFiles(Collections.emptySet()); codec.segmentInfoFormat().write(dir, info, IOContext.DEFAULT); - commitInfo = new SegmentCommitInfo(info, 0, -1, -1, -1); + commitInfo = new SegmentCommitInfo(info, 0, 0,-1, -1, -1); sis.add(commitInfo); sis.commit(dir); diff --git a/lucene/core/src/test/org/apache/lucene/index/TestSegmentMerger.java b/lucene/core/src/test/org/apache/lucene/index/TestSegmentMerger.java index 6d0e04bbb2c..610523a9fd4 100644 --- a/lucene/core/src/test/org/apache/lucene/index/TestSegmentMerger.java +++ b/lucene/core/src/test/org/apache/lucene/index/TestSegmentMerger.java @@ -88,7 +88,7 @@ public class TestSegmentMerger extends LuceneTestCase { SegmentMerger merger = new SegmentMerger(Arrays.asList(reader1, reader2), si, InfoStream.getDefault(), mergedDir, - new FieldInfos.FieldNumbers(), + new FieldInfos.FieldNumbers(null), newIOContext(random(), new IOContext(new MergeInfo(-1, -1, false, -1)))); MergeState mergeState = merger.merge(); int docsMerged = mergeState.segmentInfo.maxDoc(); @@ -96,7 +96,7 @@ public class TestSegmentMerger extends LuceneTestCase { //Should be able to open a new SegmentReader against the new directory SegmentReader mergedReader = new SegmentReader(new SegmentCommitInfo( mergeState.segmentInfo, - 0, -1L, -1L, -1L), + 0, 0, -1L, -1L, -1L), Version.LATEST.major, newIOContext(random())); assertTrue(mergedReader != null); diff --git a/lucene/core/src/test/org/apache/lucene/index/TestSoftDeletesDirectoryReaderWrapper.java b/lucene/core/src/test/org/apache/lucene/index/TestSoftDeletesDirectoryReaderWrapper.java index dea7bc977be..d7a79997dc1 100644 --- a/lucene/core/src/test/org/apache/lucene/index/TestSoftDeletesDirectoryReaderWrapper.java +++ b/lucene/core/src/test/org/apache/lucene/index/TestSoftDeletesDirectoryReaderWrapper.java @@ -104,7 +104,8 @@ public class TestSoftDeletesDirectoryReaderWrapper extends 
LuceneTestCase { } private boolean isWrapped(LeafReader reader) { - return reader instanceof SoftDeletesDirectoryReaderWrapper.SoftDeletesFilterLeafReader; + return reader instanceof SoftDeletesDirectoryReaderWrapper.SoftDeletesFilterLeafReader + || reader instanceof SoftDeletesDirectoryReaderWrapper.SoftDeletesFilterCodecReader; } public void testMixSoftAndHardDeletes() throws IOException { diff --git a/lucene/highlighter/src/java/org/apache/lucene/search/highlight/TermVectorLeafReader.java b/lucene/highlighter/src/java/org/apache/lucene/search/highlight/TermVectorLeafReader.java index 144209dcceb..1eef95fdd6d 100644 --- a/lucene/highlighter/src/java/org/apache/lucene/search/highlight/TermVectorLeafReader.java +++ b/lucene/highlighter/src/java/org/apache/lucene/search/highlight/TermVectorLeafReader.java @@ -81,7 +81,7 @@ public class TermVectorLeafReader extends LeafReader { } FieldInfo fieldInfo = new FieldInfo(field, 0, true, true, terms.hasPayloads(), - indexOptions, DocValuesType.NONE, -1, Collections.emptyMap(), 0, 0); + indexOptions, DocValuesType.NONE, -1, Collections.emptyMap(), 0, 0, false); fieldInfos = new FieldInfos(new FieldInfo[]{fieldInfo}); } diff --git a/lucene/memory/src/java/org/apache/lucene/index/memory/MemoryIndex.java b/lucene/memory/src/java/org/apache/lucene/index/memory/MemoryIndex.java index ff248c34538..11913d1cbee 100644 --- a/lucene/memory/src/java/org/apache/lucene/index/memory/MemoryIndex.java +++ b/lucene/memory/src/java/org/apache/lucene/index/memory/MemoryIndex.java @@ -501,7 +501,7 @@ public class MemoryIndex { IndexOptions indexOptions = storeOffsets ? 
IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS : IndexOptions.DOCS_AND_FREQS_AND_POSITIONS; return new FieldInfo(fieldName, ord, fieldType.storeTermVectors(), fieldType.omitNorms(), storePayloads, indexOptions, fieldType.docValuesType(), -1, Collections.emptyMap(), - fieldType.pointDimensionCount(), fieldType.pointNumBytes()); + fieldType.pointDimensionCount(), fieldType.pointNumBytes(), false); } private void storePointValues(Info info, BytesRef pointValue) { @@ -520,7 +520,7 @@ public class MemoryIndex { info.fieldInfo = new FieldInfo( info.fieldInfo.name, info.fieldInfo.number, info.fieldInfo.hasVectors(), info.fieldInfo.hasPayloads(), info.fieldInfo.hasPayloads(), info.fieldInfo.getIndexOptions(), docValuesType, -1, info.fieldInfo.attributes(), - info.fieldInfo.getPointDimensionCount(), info.fieldInfo.getPointNumBytes() + info.fieldInfo.getPointDimensionCount(), info.fieldInfo.getPointNumBytes(), info.fieldInfo.isSoftDeletesField() ); } else if (existingDocValuesType != docValuesType) { throw new IllegalArgumentException("Can't add [" + docValuesType + "] doc values field [" + fieldName + "], because [" + existingDocValuesType + "] doc values field already exists"); diff --git a/lucene/misc/src/java/org/apache/lucene/index/IndexSplitter.java b/lucene/misc/src/java/org/apache/lucene/index/IndexSplitter.java index 892564826f3..a586f838170 100644 --- a/lucene/misc/src/java/org/apache/lucene/index/IndexSplitter.java +++ b/lucene/misc/src/java/org/apache/lucene/index/IndexSplitter.java @@ -141,7 +141,7 @@ public class IndexSplitter { // Same info just changing the dir: SegmentInfo newInfo = new SegmentInfo(destFSDir, info.getVersion(), info.getMinVersion(), info.name, info.maxDoc(), info.getUseCompoundFile(), info.getCodec(), info.getDiagnostics(), info.getId(), new HashMap<>(), null); - destInfos.add(new SegmentCommitInfo(newInfo, infoPerCommit.getDelCount(), + destInfos.add(new SegmentCommitInfo(newInfo, infoPerCommit.getDelCount(), 
infoPerCommit.getSoftDelCount(), infoPerCommit.getDelGen(), infoPerCommit.getFieldInfosGen(), infoPerCommit.getDocValuesGen())); // now copy files over diff --git a/lucene/suggest/src/java/org/apache/lucene/search/suggest/analyzing/AnalyzingInfixSuggester.java b/lucene/suggest/src/java/org/apache/lucene/search/suggest/analyzing/AnalyzingInfixSuggester.java index 2bc422972bc..85bb6d11fff 100644 --- a/lucene/suggest/src/java/org/apache/lucene/search/suggest/analyzing/AnalyzingInfixSuggester.java +++ b/lucene/suggest/src/java/org/apache/lucene/search/suggest/analyzing/AnalyzingInfixSuggester.java @@ -359,7 +359,7 @@ public class AnalyzingInfixSuggester extends Lookup implements Closeable { : "no need \"textgrams\" when minPrefixChars="+minPrefixChars; if (fieldName.equals(TEXTGRAMS_FIELD_NAME) && minPrefixChars > 0) { // TODO: should use an EdgeNGramTokenFilterFactory here - TokenFilter filter = new EdgeNGramTokenFilter(components.getTokenStream(), 1, minPrefixChars); + TokenFilter filter = new EdgeNGramTokenFilter(components.getTokenStream(), 1, minPrefixChars, false); return new TokenStreamComponents(components.getTokenizer(), filter); } else { return components; diff --git a/lucene/suggest/src/java/org/apache/lucene/search/suggest/document/CompletionAnalyzer.java b/lucene/suggest/src/java/org/apache/lucene/search/suggest/document/CompletionAnalyzer.java index 13bd392aa9d..8888382a5ca 100644 --- a/lucene/suggest/src/java/org/apache/lucene/search/suggest/document/CompletionAnalyzer.java +++ b/lucene/suggest/src/java/org/apache/lucene/search/suggest/document/CompletionAnalyzer.java @@ -19,7 +19,7 @@ package org.apache.lucene.search.suggest.document; import org.apache.lucene.analysis.Analyzer; import org.apache.lucene.analysis.AnalyzerWrapper; import org.apache.lucene.analysis.TokenStreamToAutomaton; -import org.apache.lucene.util.automaton.Operations; +import org.apache.lucene.analysis.miscellaneous.ConcatenateGraphFilter; /** * Wraps an {@link 
org.apache.lucene.analysis.Analyzer} @@ -37,24 +37,11 @@ import org.apache.lucene.util.automaton.Operations; */ public final class CompletionAnalyzer extends AnalyzerWrapper { - /** - * Represents the separation between tokens, if - * preserveSep is true - *

- * Same label is used as a delimiter in the {@link org.apache.lucene.search.suggest.document.CompletionTokenStream} - * payload - */ - final static int SEP_LABEL = NRTSuggesterBuilder.PAYLOAD_SEP; - /** * Represent a hole character, inserted by {@link org.apache.lucene.analysis.TokenStreamToAutomaton} */ final static int HOLE_CHARACTER = TokenStreamToAutomaton.HOLE; - final static int DEFAULT_MAX_GRAPH_EXPANSIONS = Operations.DEFAULT_MAX_DETERMINIZED_STATES; - final static boolean DEFAULT_PRESERVE_SEP = true; - final static boolean DEFAULT_PRESERVE_POSITION_INCREMENTS = true; - private final Analyzer analyzer; /** @@ -101,7 +88,7 @@ public final class CompletionAnalyzer extends AnalyzerWrapper { * preserving token separation, position increments and no limit on graph expansions */ public CompletionAnalyzer(Analyzer analyzer) { - this(analyzer, DEFAULT_PRESERVE_SEP, DEFAULT_PRESERVE_POSITION_INCREMENTS, DEFAULT_MAX_GRAPH_EXPANSIONS); + this(analyzer, ConcatenateGraphFilter.DEFAULT_PRESERVE_SEP, ConcatenateGraphFilter.DEFAULT_PRESERVE_POSITION_INCREMENTS, ConcatenateGraphFilter.DEFAULT_MAX_GRAPH_EXPANSIONS); } /** @@ -109,7 +96,7 @@ public final class CompletionAnalyzer extends AnalyzerWrapper { * with no limit on graph expansions */ public CompletionAnalyzer(Analyzer analyzer, boolean preserveSep, boolean preservePositionIncrements) { - this(analyzer, preserveSep, preservePositionIncrements, DEFAULT_MAX_GRAPH_EXPANSIONS); + this(analyzer, preserveSep, preservePositionIncrements, ConcatenateGraphFilter.DEFAULT_MAX_GRAPH_EXPANSIONS); } /** @@ -117,7 +104,7 @@ public final class CompletionAnalyzer extends AnalyzerWrapper { * preserving token separation and position increments */ public CompletionAnalyzer(Analyzer analyzer, int maxGraphExpansions) { - this(analyzer, DEFAULT_PRESERVE_SEP, DEFAULT_PRESERVE_POSITION_INCREMENTS, maxGraphExpansions); + this(analyzer, ConcatenateGraphFilter.DEFAULT_PRESERVE_SEP, ConcatenateGraphFilter.DEFAULT_PRESERVE_POSITION_INCREMENTS, 
maxGraphExpansions); } /** diff --git a/lucene/suggest/src/java/org/apache/lucene/search/suggest/document/CompletionQuery.java b/lucene/suggest/src/java/org/apache/lucene/search/suggest/document/CompletionQuery.java index 49fe7d08dff..6be0c91117f 100644 --- a/lucene/suggest/src/java/org/apache/lucene/search/suggest/document/CompletionQuery.java +++ b/lucene/suggest/src/java/org/apache/lucene/search/suggest/document/CompletionQuery.java @@ -27,7 +27,7 @@ import org.apache.lucene.search.Query; import org.apache.lucene.search.suggest.BitsProducer; import static org.apache.lucene.search.suggest.document.CompletionAnalyzer.HOLE_CHARACTER; -import static org.apache.lucene.search.suggest.document.CompletionAnalyzer.SEP_LABEL; +import static org.apache.lucene.analysis.miscellaneous.ConcatenateGraphFilter.SEP_LABEL; /** * Abstract {@link Query} that match documents containing terms with a specified prefix diff --git a/lucene/suggest/src/java/org/apache/lucene/search/suggest/document/CompletionTokenStream.java b/lucene/suggest/src/java/org/apache/lucene/search/suggest/document/CompletionTokenStream.java index 7308e65acc9..d3bec8e50c9 100644 --- a/lucene/suggest/src/java/org/apache/lucene/search/suggest/document/CompletionTokenStream.java +++ b/lucene/suggest/src/java/org/apache/lucene/search/suggest/document/CompletionTokenStream.java @@ -14,71 +14,43 @@ * See the License for the specific language governing permissions and * limitations under the License. 
*/ + package org.apache.lucene.search.suggest.document; import java.io.IOException; +import org.apache.lucene.analysis.TokenFilter; import org.apache.lucene.analysis.TokenStream; -import org.apache.lucene.analysis.TokenStreamToAutomaton; -import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; +import org.apache.lucene.analysis.miscellaneous.ConcatenateGraphFilter; import org.apache.lucene.analysis.tokenattributes.PayloadAttribute; -import org.apache.lucene.analysis.tokenattributes.TermToBytesRefAttribute; -import org.apache.lucene.util.AttributeImpl; -import org.apache.lucene.util.AttributeReflector; import org.apache.lucene.util.BytesRef; -import org.apache.lucene.util.BytesRefBuilder; -import org.apache.lucene.util.CharsRefBuilder; -import org.apache.lucene.util.IOUtils; -import org.apache.lucene.util.IntsRef; import org.apache.lucene.util.automaton.Automaton; -import org.apache.lucene.util.automaton.FiniteStringsIterator; -import org.apache.lucene.util.automaton.LimitedFiniteStringsIterator; -import org.apache.lucene.util.automaton.Operations; -import org.apache.lucene.util.automaton.Transition; -import org.apache.lucene.util.fst.Util; - -import static org.apache.lucene.search.suggest.document.CompletionAnalyzer.DEFAULT_MAX_GRAPH_EXPANSIONS; -import static org.apache.lucene.search.suggest.document.CompletionAnalyzer.DEFAULT_PRESERVE_POSITION_INCREMENTS; -import static org.apache.lucene.search.suggest.document.CompletionAnalyzer.DEFAULT_PRESERVE_SEP; -import static org.apache.lucene.search.suggest.document.CompletionAnalyzer.SEP_LABEL; /** - * Token stream which converts a provided token stream to an automaton. 
- * The accepted strings enumeration from the automaton are available through the - * {@link org.apache.lucene.analysis.tokenattributes.TermToBytesRefAttribute} attribute - * The token stream uses a {@link org.apache.lucene.analysis.tokenattributes.PayloadAttribute} to store - * a completion's payload (see {@link CompletionTokenStream#setPayload(org.apache.lucene.util.BytesRef)}) - * + * A {@link ConcatenateGraphFilter} but we can set the payload and provide access to config options. * @lucene.experimental */ -public final class CompletionTokenStream extends TokenStream { +public final class CompletionTokenStream extends TokenFilter { private final PayloadAttribute payloadAttr = addAttribute(PayloadAttribute.class); - private final BytesRefBuilderTermAttribute bytesAtt = addAttribute(BytesRefBuilderTermAttribute.class); + // package accessible on purpose final TokenStream inputTokenStream; final boolean preserveSep; final boolean preservePositionIncrements; final int maxGraphExpansions; - private FiniteStringsIterator finiteStrings; - private BytesRef payload; - private CharTermAttribute charTermAttribute; + private BytesRef payload; // note doesn't participate in TokenStream lifecycle; it's effectively constant - /** - * Creates a token stream to convert input to a token stream - * of accepted strings by its automaton. - *

- * The token stream input is converted to an automaton - * with the default settings of {@link org.apache.lucene.search.suggest.document.CompletionAnalyzer} - */ CompletionTokenStream(TokenStream inputTokenStream) { - this(inputTokenStream, DEFAULT_PRESERVE_SEP, DEFAULT_PRESERVE_POSITION_INCREMENTS, DEFAULT_MAX_GRAPH_EXPANSIONS); + this(inputTokenStream, + ConcatenateGraphFilter.DEFAULT_PRESERVE_SEP, + ConcatenateGraphFilter.DEFAULT_PRESERVE_POSITION_INCREMENTS, + ConcatenateGraphFilter.DEFAULT_MAX_GRAPH_EXPANSIONS); } CompletionTokenStream(TokenStream inputTokenStream, boolean preserveSep, boolean preservePositionIncrements, int maxGraphExpansions) { - // Don't call the super(input) ctor - this is a true delegate and has a new attribute source since we consume - // the input stream entirely in the first call to incrementToken + super(new ConcatenateGraphFilter(inputTokenStream, preserveSep, preservePositionIncrements, maxGraphExpansions)); this.inputTokenStream = inputTokenStream; this.preserveSep = preserveSep; this.preservePositionIncrements = preservePositionIncrements; @@ -94,248 +66,23 @@ public final class CompletionTokenStream extends TokenStream { @Override public boolean incrementToken() throws IOException { - clearAttributes(); - if (finiteStrings == null) { - Automaton automaton = toAutomaton(); - finiteStrings = new LimitedFiniteStringsIterator(automaton, maxGraphExpansions); - } - - IntsRef string = finiteStrings.next(); - if (string == null) { + if (input.incrementToken()) { + payloadAttr.setPayload(payload); + return true; + } else { return false; } - - Util.toBytesRef(string, bytesAtt.builder()); // now we have UTF-8 - if (charTermAttribute != null) { - charTermAttribute.setLength(0); - charTermAttribute.append(bytesAtt.toUTF16()); - } - if (payload != null) { - payloadAttr.setPayload(this.payload); - } - - return true; } - @Override - public void end() throws IOException { - super.end(); - if (finiteStrings == null) { - inputTokenStream.end(); - 
} - } - - @Override - public void close() throws IOException { - if (finiteStrings == null) { - inputTokenStream.close(); - } - } - - @Override - public void reset() throws IOException { - super.reset(); - if (hasAttribute(CharTermAttribute.class)) { - // we only create this if we really need it to safe the UTF-8 to UTF-16 conversion - charTermAttribute = getAttribute(CharTermAttribute.class); - } - finiteStrings = null; - } - - /** - * Converts the token stream to an automaton, - * treating the transition labels as utf-8 - */ + /** Delegates to... + * @see ConcatenateGraphFilter#toAutomaton() */ public Automaton toAutomaton() throws IOException { - return toAutomaton(false); + return ((ConcatenateGraphFilter)input).toAutomaton(); } - /** - * Converts the tokenStream to an automaton - */ + /** Delegates to... + * @see ConcatenateGraphFilter#toAutomaton(boolean) */ public Automaton toAutomaton(boolean unicodeAware) throws IOException { - // TODO refactor this - // maybe we could hook up a modified automaton from TermAutomatonQuery here? 
- Automaton automaton = null; - try { - // Create corresponding automaton: labels are bytes - // from each analyzed token, with byte 0 used as - // separator between tokens: - final TokenStreamToAutomaton tsta; - if (preserveSep) { - tsta = new EscapingTokenStreamToAutomaton((char) SEP_LABEL); - } else { - // When we're not preserving sep, we don't steal 0xff - // byte, so we don't need to do any escaping: - tsta = new TokenStreamToAutomaton(); - } - tsta.setPreservePositionIncrements(preservePositionIncrements); - tsta.setUnicodeArcs(unicodeAware); - - automaton = tsta.toAutomaton(inputTokenStream); - } finally { - IOUtils.closeWhileHandlingException(inputTokenStream); - } - - // TODO: we can optimize this somewhat by determinizing - // while we convert - automaton = replaceSep(automaton, preserveSep, SEP_LABEL); - // This automaton should not blow up during determinize: - return Operations.determinize(automaton, maxGraphExpansions); - } - - /** - * Just escapes the 0xff byte (which we still for SEP). 
- */ - private static final class EscapingTokenStreamToAutomaton extends TokenStreamToAutomaton { - - final BytesRefBuilder spare = new BytesRefBuilder(); - private char sepLabel; - - public EscapingTokenStreamToAutomaton(char sepLabel) { - this.sepLabel = sepLabel; - } - - @Override - protected BytesRef changeToken(BytesRef in) { - int upto = 0; - for (int i = 0; i < in.length; i++) { - byte b = in.bytes[in.offset + i]; - if (b == (byte) sepLabel) { - spare.grow(upto + 2); - spare.setByteAt(upto++, (byte) sepLabel); - spare.setByteAt(upto++, b); - } else { - spare.grow(upto + 1); - spare.setByteAt(upto++, b); - } - } - spare.setLength(upto); - return spare.get(); - } - } - - // Replaces SEP with epsilon or remaps them if - // we were asked to preserve them: - private static Automaton replaceSep(Automaton a, boolean preserveSep, int sepLabel) { - - Automaton result = new Automaton(); - - // Copy all states over - int numStates = a.getNumStates(); - for (int s = 0; s < numStates; s++) { - result.createState(); - result.setAccept(s, a.isAccept(s)); - } - - // Go in reverse topo sort so we know we only have to - // make one pass: - Transition t = new Transition(); - int[] topoSortStates = Operations.topoSortStates(a); - for (int i = 0; i < topoSortStates.length; i++) { - int state = topoSortStates[topoSortStates.length - 1 - i]; - int count = a.initTransition(state, t); - for (int j = 0; j < count; j++) { - a.getNextTransition(t); - if (t.min == TokenStreamToAutomaton.POS_SEP) { - assert t.max == TokenStreamToAutomaton.POS_SEP; - if (preserveSep) { - // Remap to SEP_LABEL: - result.addTransition(state, t.dest, sepLabel); - } else { - result.addEpsilon(state, t.dest); - } - } else if (t.min == TokenStreamToAutomaton.HOLE) { - assert t.max == TokenStreamToAutomaton.HOLE; - - // Just remove the hole: there will then be two - // SEP tokens next to each other, which will only - // match another hole at search time. 
Note that - // it will also match an empty-string token ... if - // that's somehow a problem we can always map HOLE - // to a dedicated byte (and escape it in the - // input). - result.addEpsilon(state, t.dest); - } else { - result.addTransition(state, t.dest, t.min, t.max); - } - } - } - - result.finishState(); - - return result; - } - - /** - * Attribute providing access to the term builder and UTF-16 conversion - */ - public interface BytesRefBuilderTermAttribute extends TermToBytesRefAttribute { - /** - * Returns the builder from which the term is derived. - */ - BytesRefBuilder builder(); - - /** - * Returns the term represented as UTF-16 - */ - CharSequence toUTF16(); - } - - /** - * Custom attribute implementation for completion token stream - */ - public static final class BytesRefBuilderTermAttributeImpl extends AttributeImpl implements BytesRefBuilderTermAttribute, TermToBytesRefAttribute { - private final BytesRefBuilder bytes = new BytesRefBuilder(); - private transient CharsRefBuilder charsRef; - - /** - * Sole constructor - * no-op - */ - public BytesRefBuilderTermAttributeImpl() { - } - - @Override - public BytesRefBuilder builder() { - return bytes; - } - - @Override - public BytesRef getBytesRef() { - return bytes.get(); - } - - @Override - public void clear() { - bytes.clear(); - } - - @Override - public void copyTo(AttributeImpl target) { - BytesRefBuilderTermAttributeImpl other = (BytesRefBuilderTermAttributeImpl) target; - other.bytes.copyBytes(bytes); - } - - @Override - public AttributeImpl clone() { - BytesRefBuilderTermAttributeImpl other = new BytesRefBuilderTermAttributeImpl(); - copyTo(other); - return other; - } - - @Override - public void reflectWith(AttributeReflector reflector) { - reflector.reflect(TermToBytesRefAttribute.class, "bytes", getBytesRef()); - } - - @Override - public CharSequence toUTF16() { - if (charsRef == null) { - charsRef = new CharsRefBuilder(); - } - charsRef.copyUTF8Bytes(getBytesRef()); - return 
charsRef.get(); - } + return ((ConcatenateGraphFilter)input).toAutomaton(unicodeAware); } } diff --git a/lucene/suggest/src/java/org/apache/lucene/search/suggest/document/ContextQuery.java b/lucene/suggest/src/java/org/apache/lucene/search/suggest/document/ContextQuery.java index 6217ca38f85..1a2680cb553 100644 --- a/lucene/suggest/src/java/org/apache/lucene/search/suggest/document/ContextQuery.java +++ b/lucene/suggest/src/java/org/apache/lucene/search/suggest/document/ContextQuery.java @@ -22,6 +22,7 @@ import java.util.Iterator; import java.util.Map; import java.util.TreeSet; +import org.apache.lucene.analysis.miscellaneous.ConcatenateGraphFilter; import org.apache.lucene.search.IndexSearcher; import org.apache.lucene.search.ScoreMode; import org.apache.lucene.search.Weight; @@ -178,7 +179,7 @@ public class ContextQuery extends CompletionQuery { // if separators are preserved the fst contains a SEP_LABEL // behind each gap. To have a matching automaton, we need to // include the SEP_LABEL in the query as well - Automaton optionalSepLabel = Operations.optional(Automata.makeChar(CompletionAnalyzer.SEP_LABEL)); + Automaton optionalSepLabel = Operations.optional(Automata.makeChar(ConcatenateGraphFilter.SEP_LABEL)); Automaton prefixAutomaton = Operations.concatenate(optionalSepLabel, innerAutomaton); Automaton contextsAutomaton = Operations.concatenate(toContextAutomaton(contexts, matchAllContexts), prefixAutomaton); contextsAutomaton = Operations.determinize(contextsAutomaton, Operations.DEFAULT_MAX_DETERMINIZED_STATES); @@ -302,7 +303,7 @@ public class ContextQuery extends CompletionQuery { } ref.offset = ++i; assert ref.offset < ref.length : "input should not end with the context separator"; - if (ref.ints[i] == CompletionAnalyzer.SEP_LABEL) { + if (ref.ints[i] == ConcatenateGraphFilter.SEP_LABEL) { ref.offset++; assert ref.offset < ref.length : "input should not end with a context separator followed by SEP_LABEL"; } diff --git 
a/lucene/suggest/src/java/org/apache/lucene/search/suggest/document/ContextSuggestField.java b/lucene/suggest/src/java/org/apache/lucene/search/suggest/document/ContextSuggestField.java index 4cb91b8053c..cf462e1dbc8 100644 --- a/lucene/suggest/src/java/org/apache/lucene/search/suggest/document/ContextSuggestField.java +++ b/lucene/suggest/src/java/org/apache/lucene/search/suggest/document/ContextSuggestField.java @@ -90,6 +90,7 @@ public class ContextSuggestField extends SuggestField { } CompletionTokenStream completionTokenStream; if (stream instanceof CompletionTokenStream) { + //TODO this is awkward; is there a better way avoiding re-creating the chain? completionTokenStream = (CompletionTokenStream) stream; PrefixTokenFilter prefixTokenFilter = new PrefixTokenFilter(completionTokenStream.inputTokenStream, (char) CONTEXT_SEPARATOR, contexts); completionTokenStream = new CompletionTokenStream(prefixTokenFilter, diff --git a/lucene/suggest/src/java/org/apache/lucene/search/suggest/document/FuzzyCompletionQuery.java b/lucene/suggest/src/java/org/apache/lucene/search/suggest/document/FuzzyCompletionQuery.java index b243f4ede83..14479fecd12 100644 --- a/lucene/suggest/src/java/org/apache/lucene/search/suggest/document/FuzzyCompletionQuery.java +++ b/lucene/suggest/src/java/org/apache/lucene/search/suggest/document/FuzzyCompletionQuery.java @@ -144,9 +144,12 @@ public class FuzzyCompletionQuery extends PrefixCompletionQuery { @Override public Weight createWeight(IndexSearcher searcher, ScoreMode scoreMode, float boost) throws IOException { - CompletionTokenStream stream = (CompletionTokenStream) analyzer.tokenStream(getField(), getTerm().text()); + final Automaton originalAutomata; + try (CompletionTokenStream stream = (CompletionTokenStream) analyzer.tokenStream(getField(), getTerm().text()) ) { + originalAutomata = stream.toAutomaton(unicodeAware); + } Set refs = new HashSet<>(); - Automaton automaton = toLevenshteinAutomata(stream.toAutomaton(unicodeAware), refs); 
+ Automaton automaton = toLevenshteinAutomata(originalAutomata, refs); if (unicodeAware) { Automaton utf8automaton = new UTF32ToUTF8().convert(automaton); utf8automaton = Operations.determinize(utf8automaton, maxDeterminizedStates); diff --git a/lucene/suggest/src/java/org/apache/lucene/search/suggest/document/NRTSuggesterBuilder.java b/lucene/suggest/src/java/org/apache/lucene/search/suggest/document/NRTSuggesterBuilder.java index 270463175d7..5ca4993396f 100644 --- a/lucene/suggest/src/java/org/apache/lucene/search/suggest/document/NRTSuggesterBuilder.java +++ b/lucene/suggest/src/java/org/apache/lucene/search/suggest/document/NRTSuggesterBuilder.java @@ -19,6 +19,7 @@ package org.apache.lucene.search.suggest.document; import java.io.IOException; import java.util.PriorityQueue; +import org.apache.lucene.analysis.miscellaneous.ConcatenateGraphFilter; import org.apache.lucene.store.DataOutput; import org.apache.lucene.util.BytesRef; import org.apache.lucene.util.BytesRefBuilder; @@ -42,7 +43,7 @@ final class NRTSuggesterBuilder { * Label used to separate surface form and docID * in the output */ - public static final int PAYLOAD_SEP = '\u001F'; + public static final int PAYLOAD_SEP = ConcatenateGraphFilter.SEP_LABEL; /** * Marks end of the analyzed input and start of dedup diff --git a/lucene/suggest/src/java/org/apache/lucene/search/suggest/document/PrefixCompletionQuery.java b/lucene/suggest/src/java/org/apache/lucene/search/suggest/document/PrefixCompletionQuery.java index 7bb75e9261c..a8da150f504 100644 --- a/lucene/suggest/src/java/org/apache/lucene/search/suggest/document/PrefixCompletionQuery.java +++ b/lucene/suggest/src/java/org/apache/lucene/search/suggest/document/PrefixCompletionQuery.java @@ -68,8 +68,9 @@ public class PrefixCompletionQuery extends CompletionQuery { @Override public Weight createWeight(IndexSearcher searcher, ScoreMode scoreMode, float boost) throws IOException { - CompletionTokenStream stream = (CompletionTokenStream) 
analyzer.tokenStream(getField(), getTerm().text()); - return new CompletionWeight(this, stream.toAutomaton()); + try (CompletionTokenStream stream = (CompletionTokenStream) analyzer.tokenStream(getField(), getTerm().text())) { + return new CompletionWeight(this, stream.toAutomaton()); + } } /** diff --git a/lucene/suggest/src/java/org/apache/lucene/search/suggest/document/SuggestField.java b/lucene/suggest/src/java/org/apache/lucene/search/suggest/document/SuggestField.java index 7f06328ee1b..b2d24c2c84e 100644 --- a/lucene/suggest/src/java/org/apache/lucene/search/suggest/document/SuggestField.java +++ b/lucene/suggest/src/java/org/apache/lucene/search/suggest/document/SuggestField.java @@ -21,6 +21,7 @@ import java.io.IOException; import org.apache.lucene.analysis.Analyzer; import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.miscellaneous.ConcatenateGraphFilter; import org.apache.lucene.document.Field; import org.apache.lucene.document.FieldType; import org.apache.lucene.index.IndexOptions; @@ -140,7 +141,7 @@ public class SuggestField extends Field { private boolean isReserved(char c) { switch (c) { - case CompletionAnalyzer.SEP_LABEL: + case ConcatenateGraphFilter.SEP_LABEL: case CompletionAnalyzer.HOLE_CHARACTER: case NRTSuggesterBuilder.END_BYTE: return true; diff --git a/lucene/suggest/src/test/org/apache/lucene/search/suggest/document/TestContextSuggestField.java b/lucene/suggest/src/test/org/apache/lucene/search/suggest/document/TestContextSuggestField.java index 0c3b254c132..8beea129622 100644 --- a/lucene/suggest/src/test/org/apache/lucene/search/suggest/document/TestContextSuggestField.java +++ b/lucene/suggest/src/test/org/apache/lucene/search/suggest/document/TestContextSuggestField.java @@ -21,6 +21,7 @@ import java.io.ByteArrayOutputStream; import org.apache.lucene.analysis.Analyzer; import org.apache.lucene.analysis.MockAnalyzer; import org.apache.lucene.analysis.TokenStream; +import 
org.apache.lucene.analysis.miscellaneous.ConcatenateGraphFilter; import org.apache.lucene.analysis.standard.StandardAnalyzer; import org.apache.lucene.document.Document; import org.apache.lucene.index.DirectoryReader; @@ -109,21 +110,21 @@ public class TestContextSuggestField extends LuceneTestCase { CharsRefBuilder builder = new CharsRefBuilder(); builder.append("context1"); builder.append(((char) ContextSuggestField.CONTEXT_SEPARATOR)); - builder.append(((char) CompletionAnalyzer.SEP_LABEL)); + builder.append((char) ConcatenateGraphFilter.SEP_LABEL); builder.append("input"); expectedOutputs[0] = builder.toCharsRef().toString(); builder.clear(); builder.append("context2"); builder.append(((char) ContextSuggestField.CONTEXT_SEPARATOR)); - builder.append(((char) CompletionAnalyzer.SEP_LABEL)); + builder.append((char) ConcatenateGraphFilter.SEP_LABEL); builder.append("input"); expectedOutputs[1] = builder.toCharsRef().toString(); - TokenStream stream = new CompletionTokenStreamTest.PayloadAttrToTypeAttrFilter(field.tokenStream(analyzer, null)); - assertTokenStreamContents(stream, expectedOutputs, null, null, new String[]{payload.utf8ToString(), payload.utf8ToString()}, new int[]{1, 1}, null, null); + TokenStream stream = new TestSuggestField.PayloadAttrToTypeAttrFilter(field.tokenStream(analyzer, null)); + assertTokenStreamContents(stream, expectedOutputs, null, null, new String[]{payload.utf8ToString(), payload.utf8ToString()}, new int[]{1, 0}, null, null); CompletionAnalyzer completionAnalyzer = new CompletionAnalyzer(analyzer); - stream = new CompletionTokenStreamTest.PayloadAttrToTypeAttrFilter(field.tokenStream(completionAnalyzer, null)); - assertTokenStreamContents(stream, expectedOutputs, null, null, new String[]{payload.utf8ToString(), payload.utf8ToString()}, new int[]{1, 1}, null, null); + stream = new TestSuggestField.PayloadAttrToTypeAttrFilter(field.tokenStream(completionAnalyzer, null)); + assertTokenStreamContents(stream, expectedOutputs, null, null, 
new String[]{payload.utf8ToString(), payload.utf8ToString()}, new int[]{1, 0}, null, null); } @Test diff --git a/lucene/suggest/src/test/org/apache/lucene/search/suggest/document/TestSuggestField.java b/lucene/suggest/src/test/org/apache/lucene/search/suggest/document/TestSuggestField.java index a6659e082d5..e6d7062c925 100644 --- a/lucene/suggest/src/test/org/apache/lucene/search/suggest/document/TestSuggestField.java +++ b/lucene/suggest/src/test/org/apache/lucene/search/suggest/document/TestSuggestField.java @@ -32,7 +32,11 @@ import java.util.concurrent.CyclicBarrier; import org.apache.lucene.analysis.Analyzer; import org.apache.lucene.analysis.MockAnalyzer; +import org.apache.lucene.analysis.TokenFilter; import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.miscellaneous.ConcatenateGraphFilter; +import org.apache.lucene.analysis.tokenattributes.PayloadAttribute; +import org.apache.lucene.analysis.tokenattributes.TypeAttribute; import org.apache.lucene.codecs.Codec; import org.apache.lucene.codecs.PostingsFormat; import org.apache.lucene.codecs.lucene70.Lucene70Codec; @@ -99,7 +103,7 @@ public class TestSuggestField extends LuceneTestCase { public void testReservedChars() throws Exception { CharsRefBuilder charsRefBuilder = new CharsRefBuilder(); charsRefBuilder.append("sugg"); - charsRefBuilder.setCharAt(2, (char) CompletionAnalyzer.SEP_LABEL); + charsRefBuilder.setCharAt(2, (char) ConcatenateGraphFilter.SEP_LABEL); IllegalArgumentException expected = expectThrows(IllegalArgumentException.class, () -> { new SuggestField("name", charsRefBuilder.toString(), 1); }); @@ -144,11 +148,11 @@ public class TestSuggestField extends LuceneTestCase { output.writeByte(SuggestField.TYPE); } BytesRef payload = new BytesRef(byteArrayOutputStream.toByteArray()); - TokenStream stream = new CompletionTokenStreamTest.PayloadAttrToTypeAttrFilter(suggestField.tokenStream(analyzer, null)); + TokenStream stream = new 
PayloadAttrToTypeAttrFilter(suggestField.tokenStream(analyzer, null)); assertTokenStreamContents(stream, new String[] {"input"}, null, null, new String[]{payload.utf8ToString()}, new int[]{1}, null, null); CompletionAnalyzer completionAnalyzer = new CompletionAnalyzer(analyzer); - stream = new CompletionTokenStreamTest.PayloadAttrToTypeAttrFilter(suggestField.tokenStream(completionAnalyzer, null)); + stream = new PayloadAttrToTypeAttrFilter(suggestField.tokenStream(completionAnalyzer, null)); assertTokenStreamContents(stream, new String[] {"input"}, null, null, new String[]{payload.utf8ToString()}, new int[]{1}, null, null); } @@ -894,4 +898,23 @@ public class TestSuggestField extends LuceneTestCase { iwc.setCodec(filterCodec); return iwc; } + + public final static class PayloadAttrToTypeAttrFilter extends TokenFilter { + private PayloadAttribute payload = addAttribute(PayloadAttribute.class); + private TypeAttribute type = addAttribute(TypeAttribute.class); + + protected PayloadAttrToTypeAttrFilter(TokenStream input) { + super(input); + } + + @Override + public boolean incrementToken() throws IOException { + if (input.incrementToken()) { + // we move them over so we can assert them more easily in the tests + type.setType(payload.getPayload().utf8ToString()); + return true; + } + return false; + } + } } diff --git a/lucene/test-framework/src/java/org/apache/lucene/index/BaseFieldInfoFormatTestCase.java b/lucene/test-framework/src/java/org/apache/lucene/index/BaseFieldInfoFormatTestCase.java index 9363ce63fe3..3515b9a9c97 100644 --- a/lucene/test-framework/src/java/org/apache/lucene/index/BaseFieldInfoFormatTestCase.java +++ b/lucene/test-framework/src/java/org/apache/lucene/index/BaseFieldInfoFormatTestCase.java @@ -53,7 +53,7 @@ public abstract class BaseFieldInfoFormatTestCase extends BaseIndexFileFormatTes Directory dir = newDirectory(); Codec codec = getCodec(); SegmentInfo segmentInfo = newSegmentInfo(dir, "_123"); - FieldInfos.Builder builder = new 
FieldInfos.Builder(); + FieldInfos.Builder builder = new FieldInfos.Builder(new FieldInfos.FieldNumbers(null)); FieldInfo fi = builder.getOrAdd("field"); fi.setIndexOptions(TextField.TYPE_STORED.indexOptions()); addAttributes(fi); @@ -75,7 +75,7 @@ public abstract class BaseFieldInfoFormatTestCase extends BaseIndexFileFormatTes Directory dir = newDirectory(); Codec codec = getCodec(); SegmentInfo segmentInfo = newSegmentInfo(dir, "_123"); - FieldInfos.Builder builder = new FieldInfos.Builder(); + FieldInfos.Builder builder = new FieldInfos.Builder(new FieldInfos.FieldNumbers(null)); FieldInfo fi = builder.getOrAdd("field"); fi.setIndexOptions(TextField.TYPE_STORED.indexOptions()); addAttributes(fi); @@ -115,7 +115,7 @@ public abstract class BaseFieldInfoFormatTestCase extends BaseIndexFileFormatTes dir.failOn(fail); Codec codec = getCodec(); SegmentInfo segmentInfo = newSegmentInfo(dir, "_123"); - FieldInfos.Builder builder = new FieldInfos.Builder(); + FieldInfos.Builder builder = new FieldInfos.Builder(new FieldInfos.FieldNumbers(null)); FieldInfo fi = builder.getOrAdd("field"); fi.setIndexOptions(TextField.TYPE_STORED.indexOptions()); addAttributes(fi); @@ -150,7 +150,7 @@ public abstract class BaseFieldInfoFormatTestCase extends BaseIndexFileFormatTes dir.failOn(fail); Codec codec = getCodec(); SegmentInfo segmentInfo = newSegmentInfo(dir, "_123"); - FieldInfos.Builder builder = new FieldInfos.Builder(); + FieldInfos.Builder builder = new FieldInfos.Builder(new FieldInfos.FieldNumbers(null)); FieldInfo fi = builder.getOrAdd("field"); fi.setIndexOptions(TextField.TYPE_STORED.indexOptions()); addAttributes(fi); @@ -185,7 +185,7 @@ public abstract class BaseFieldInfoFormatTestCase extends BaseIndexFileFormatTes dir.failOn(fail); Codec codec = getCodec(); SegmentInfo segmentInfo = newSegmentInfo(dir, "_123"); - FieldInfos.Builder builder = new FieldInfos.Builder(); + FieldInfos.Builder builder = new FieldInfos.Builder(new FieldInfos.FieldNumbers(null)); FieldInfo 
fi = builder.getOrAdd("field"); fi.setIndexOptions(TextField.TYPE_STORED.indexOptions()); addAttributes(fi); @@ -221,7 +221,7 @@ public abstract class BaseFieldInfoFormatTestCase extends BaseIndexFileFormatTes dir.failOn(fail); Codec codec = getCodec(); SegmentInfo segmentInfo = newSegmentInfo(dir, "_123"); - FieldInfos.Builder builder = new FieldInfos.Builder(); + FieldInfos.Builder builder = new FieldInfos.Builder(new FieldInfos.FieldNumbers(null)); FieldInfo fi = builder.getOrAdd("field"); fi.setIndexOptions(TextField.TYPE_STORED.indexOptions()); addAttributes(fi); @@ -251,7 +251,7 @@ public abstract class BaseFieldInfoFormatTestCase extends BaseIndexFileFormatTes for (int i = 0; i < numFields; i++) { fieldNames.add(TestUtil.randomUnicodeString(random())); } - FieldInfos.Builder builder = new FieldInfos.Builder(); + FieldInfos.Builder builder = new FieldInfos.Builder(new FieldInfos.FieldNumbers(null)); for (String field : fieldNames) { IndexableFieldType fieldType = randomFieldType(random()); FieldInfo fi = builder.getOrAdd(field); diff --git a/lucene/test-framework/src/java/org/apache/lucene/index/BaseIndexFileFormatTestCase.java b/lucene/test-framework/src/java/org/apache/lucene/index/BaseIndexFileFormatTestCase.java index f5b52239057..83419de52e2 100644 --- a/lucene/test-framework/src/java/org/apache/lucene/index/BaseIndexFileFormatTestCase.java +++ b/lucene/test-framework/src/java/org/apache/lucene/index/BaseIndexFileFormatTestCase.java @@ -323,7 +323,7 @@ abstract class BaseIndexFileFormatTestCase extends LuceneTestCase { FieldInfo proto = oneDocReader.getFieldInfos().fieldInfo("field"); FieldInfo field = new FieldInfo(proto.name, proto.number, proto.hasVectors(), proto.omitsNorms(), proto.hasPayloads(), proto.getIndexOptions(), proto.getDocValuesType(), proto.getDocValuesGen(), new HashMap<>(), - proto.getPointDimensionCount(), proto.getPointNumBytes()); + proto.getPointDimensionCount(), proto.getPointNumBytes(), proto.isSoftDeletesField()); FieldInfos 
fieldInfos = new FieldInfos(new FieldInfo[] { field } ); diff --git a/lucene/test-framework/src/java/org/apache/lucene/index/BaseLiveDocsFormatTestCase.java b/lucene/test-framework/src/java/org/apache/lucene/index/BaseLiveDocsFormatTestCase.java index b4799f86fdb..9c01990b195 100644 --- a/lucene/test-framework/src/java/org/apache/lucene/index/BaseLiveDocsFormatTestCase.java +++ b/lucene/test-framework/src/java/org/apache/lucene/index/BaseLiveDocsFormatTestCase.java @@ -125,10 +125,10 @@ public abstract class BaseLiveDocsFormatTestCase extends LuceneTestCase { final Directory dir = newDirectory(); final SegmentInfo si = new SegmentInfo(dir, Version.LATEST, Version.LATEST, "foo", maxDoc, random().nextBoolean(), codec, Collections.emptyMap(), StringHelper.randomId(), Collections.emptyMap(), null); - SegmentCommitInfo sci = new SegmentCommitInfo(si, 0, 0, -1, -1); + SegmentCommitInfo sci = new SegmentCommitInfo(si, 0, 0, 0, -1, -1); format.writeLiveDocs(bits, dir, sci, maxDoc - numLiveDocs, IOContext.DEFAULT); - sci = new SegmentCommitInfo(si, maxDoc - numLiveDocs, 1, -1, -1); + sci = new SegmentCommitInfo(si, maxDoc - numLiveDocs, 0, 1, -1, -1); final Bits bits2 = format.readLiveDocs(dir, sci, IOContext.READONCE); assertEquals(maxDoc, bits2.length()); for (int i = 0; i < maxDoc; ++i) { diff --git a/lucene/test-framework/src/java/org/apache/lucene/index/BaseMergePolicyTestCase.java b/lucene/test-framework/src/java/org/apache/lucene/index/BaseMergePolicyTestCase.java index 8f986277f5d..477b0a3c548 100644 --- a/lucene/test-framework/src/java/org/apache/lucene/index/BaseMergePolicyTestCase.java +++ b/lucene/test-framework/src/java/org/apache/lucene/index/BaseMergePolicyTestCase.java @@ -116,7 +116,7 @@ public abstract class BaseMergePolicyTestCase extends LuceneTestCase { Collections.emptyMap(), // attributes null /* indexSort */); info.setFiles(Collections.emptyList()); - infos.add(new SegmentCommitInfo(info, random().nextInt(1), -1, -1, -1)); + infos.add(new 
SegmentCommitInfo(info, random().nextInt(1), 0, -1, -1, -1)); } MergePolicy.MergeSpecification forcedDeletesMerges = mp.findForcedDeletesMerges(infos, context); if (forcedDeletesMerges != null) { diff --git a/lucene/test-framework/src/java/org/apache/lucene/index/MismatchedLeafReader.java b/lucene/test-framework/src/java/org/apache/lucene/index/MismatchedLeafReader.java index 7dd6ba89bd0..2c746773f94 100644 --- a/lucene/test-framework/src/java/org/apache/lucene/index/MismatchedLeafReader.java +++ b/lucene/test-framework/src/java/org/apache/lucene/index/MismatchedLeafReader.java @@ -77,7 +77,8 @@ public class MismatchedLeafReader extends FilterLeafReader { oldInfo.getDocValuesGen(), // dvGen oldInfo.attributes(), // attributes oldInfo.getPointDimensionCount(), // dimension count - oldInfo.getPointNumBytes()); // dimension numBytes + oldInfo.getPointNumBytes(), // dimension numBytes + oldInfo.isSoftDeletesField()); // used as soft-deletes field shuffled.set(i, newInfo); } diff --git a/lucene/test-framework/src/java/org/apache/lucene/index/RandomPostingsTester.java b/lucene/test-framework/src/java/org/apache/lucene/index/RandomPostingsTester.java index 29962e609a7..9f2d9b7adc0 100644 --- a/lucene/test-framework/src/java/org/apache/lucene/index/RandomPostingsTester.java +++ b/lucene/test-framework/src/java/org/apache/lucene/index/RandomPostingsTester.java @@ -130,7 +130,7 @@ public class RandomPostingsTester { fieldInfoArray[fieldUpto] = new FieldInfo(field, fieldUpto, false, false, true, IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS, DocValuesType.NONE, -1, new HashMap<>(), - 0, 0); + 0, 0, false); fieldUpto++; SortedMap postings = new TreeMap<>(); @@ -651,7 +651,7 @@ public class RandomPostingsTester { DocValuesType.NONE, -1, new HashMap<>(), - 0, 0); + 0, 0, false); } FieldInfos newFieldInfos = new FieldInfos(newFieldInfoArray); diff --git a/lucene/test-framework/src/java/org/apache/lucene/store/MockDirectoryWrapper.java 
b/lucene/test-framework/src/java/org/apache/lucene/store/MockDirectoryWrapper.java index 60f671c25fc..019417771ed 100644 --- a/lucene/test-framework/src/java/org/apache/lucene/store/MockDirectoryWrapper.java +++ b/lucene/test-framework/src/java/org/apache/lucene/store/MockDirectoryWrapper.java @@ -804,7 +804,7 @@ public class MockDirectoryWrapper extends BaseDirectoryWrapper { } // NOTE: This is off by default; see LUCENE-5574 - private boolean assertNoUnreferencedFilesOnClose; + private volatile boolean assertNoUnreferencedFilesOnClose; public void setAssertNoUnrefencedFilesOnClose(boolean v) { assertNoUnreferencedFilesOnClose = v; diff --git a/solr/CHANGES.txt b/solr/CHANGES.txt index 66d885362fd..6dd4889300e 100644 --- a/solr/CHANGES.txt +++ b/solr/CHANGES.txt @@ -70,8 +70,8 @@ Upgrade Notes To return the previous behavior pass false to skipCommitOnMasterVersionZero in slave section of replication handler configuration, or pass it to the fetchindex command. -* SOLR-11453: Configuring slowQueryThresholdMillis now logs slow requests to a separate file - solr_slow_requests.log . - Previously they would get logged in the solr.xml file +* SOLR-11453: Configuring slowQueryThresholdMillis now logs slow requests to a separate file - solr_slow_requests.log. + Previously they would get logged in the solr.log file. New Features ---------------------- @@ -140,7 +140,7 @@ New Features * SOLR-12328: JSON Facet API: Domain change with graph query. (Daniel Meehl, Kevin Watters, yonik) -* SOLR-11453: Configuring slowQueryThresholdMillis logs slow requests to a separate file - solr_slow_requests.log . +* SOLR-11453: Configuring slowQueryThresholdMillis logs slow requests to a separate file - solr_slow_requests.log. 
(Shawn Heisey, Remko Popma, Varun Thacker) * SOLR-12401: Add getValue() and setValue() Stream Evaluators (Joel Bernstein, janhoy) @@ -154,6 +154,9 @@ New Features * SOLR-12389: support deeply nested json objects in clusterprops.json (noble) +* SOLR-12376: Added the TaggerRequestHandler (AKA SolrTextTagger) for tagging text. It's used as a component of + NER/ERD systems including query-understanding. See the ref guide for more info. (David Smiley) + Bug Fixes ---------------------- @@ -283,13 +286,15 @@ Bug Fixes * SOLR-12374: SnapShooter.getIndexCommit can forget to decref the searcher; though it's not clear in practice when. (David Smiley) -* SOLR-12417: velocity response writer should enforce valid function name for v.json parameter (yonik) +* SOLR-12417: velocity response writer should enforce valid function name for v.json parameter (Mano Kovacs, yonik) * SOLR-12271: Fixed bug in how Analytics component reads negative values from float and double fields. (Houston Putman) * SOLR-12433: Recovering flag of a replica is set equals to leader even it failed to receive update on recovering. (Cao Manh Dat) +* SOLR-12354: Register the /admin/info/key end-point at the startup time to avoid 404 (noble) + Optimizations ---------------------- @@ -325,6 +330,12 @@ Optimizations SolrConstantScoreQuery as well. QWF since v5.4.0 sometimes needlessly internally executed and cached the query. Affects ExpandComponent, ChildDocTransformer, CurrencyFieldType, TermsQParser. (David Smiley) +* SOLR-9922: Write buffering updates to another tlog. (Cao Manh Dat) + +* SOLR-12233: QParserPlugin's built-in static registry now holds actual QParserPlugin instances instead of class + references. This is consistent with other plugin registries and allows a SolrCore to load faster. + (Jeff Miller, David Smiley) + Other Changes ---------------------- @@ -1308,6 +1319,8 @@ Bug Fixes * SOLR-11477: Disallow resolving of external entities in the XML query parser (defType=xmlparser). 
(Michael Stepankin, Olga Barinova, Uwe Schindler, Christine Poerschke) +* SOLR-12444: Updating a cluster policy fails (noble) + Optimizations ---------------------- diff --git a/solr/NOTICE.txt b/solr/NOTICE.txt index fd954f4ef4f..a5b2070a39b 100644 --- a/solr/NOTICE.txt +++ b/solr/NOTICE.txt @@ -537,3 +537,17 @@ See http://www.restlet.org/ Protocol Buffers - Google's data interchange format Copyright 2008 Google Inc. http://code.google.com/apis/protocolbuffers/ + +========================================================================= +== SolrTextTagger Notice == +========================================================================= + +The TaggerRequestHandler and related classes in its package came from the +OpenSextant Solr Text Tagger, +Copyright 2013 The MITRE Corporation. All Rights Reserved. + + This software was produced for the U. S. Government + under Contract No. W15P7T-11-C-F600, and is + subject to the Rights in Noncommercial Computer Software + and Noncommercial Computer Software Documentation + Clause 252.227-7014 (JUN 1995) \ No newline at end of file diff --git a/solr/core/src/java/org/apache/solr/cloud/RecoveryStrategy.java b/solr/core/src/java/org/apache/solr/cloud/RecoveryStrategy.java index c8f5ae89fbe..966497b0938 100644 --- a/solr/core/src/java/org/apache/solr/cloud/RecoveryStrategy.java +++ b/solr/core/src/java/org/apache/solr/cloud/RecoveryStrategy.java @@ -449,7 +449,6 @@ public class RecoveryStrategy implements Runnable, Closeable { // TODO: perhaps make this grab a new core each time through the loop to handle core reloads? final public void doSyncOrReplicateRecovery(SolrCore core) throws Exception { - boolean replayed = false; boolean successfulRecovery = false; UpdateLog ulog; @@ -500,8 +499,7 @@ public class RecoveryStrategy implements Runnable, Closeable { // when we went down. We may have received updates since then. 
recentVersions = startingVersions; try { - if ((ulog.getStartingOperation() & UpdateLog.FLAG_GAP) != 0) { - // last operation at the time of startup had the GAP flag set... + if (ulog.existOldBufferLog()) { // this means we were previously doing a full index replication // that probably didn't complete and buffering updates in the // meantime. @@ -542,9 +540,9 @@ public class RecoveryStrategy implements Runnable, Closeable { } LOG.info("Begin buffering updates. core=[{}]", coreName); + // recalling buffer updates will drop the old buffer tlog ulog.bufferUpdates(); - replayed = false; - + LOG.info("Publishing state of core [{}] as recovering, leader is [{}] and I am [{}]", core.getName(), leader.getCoreUrl(), ourUrl); zkController.publish(core.getCoreDescriptor(), Replica.State.RECOVERING); @@ -603,8 +601,7 @@ public class RecoveryStrategy implements Runnable, Closeable { LOG.info("Replaying updates buffered during PeerSync."); replay(core); - replayed = true; - + // sync success successfulRecovery = true; return; @@ -630,8 +627,7 @@ public class RecoveryStrategy implements Runnable, Closeable { } replayFuture = replay(core); - replayed = true; - + if (isClosed()) { LOG.info("RecoveryStrategy has been closed"); break; @@ -650,21 +646,6 @@ public class RecoveryStrategy implements Runnable, Closeable { } catch (Exception e) { SolrException.log(LOG, "Error while trying to recover. core=" + coreName, e); } finally { - if (!replayed) { - // dropBufferedUpdate()s currently only supports returning to ACTIVE state, which risks additional updates - // being added w/o UpdateLog.FLAG_GAP, hence losing the info on restart that we are not up-to-date. - // For now, ulog will simply remain in BUFFERING state, and an additional call to bufferUpdates() will - // reset our starting point for playback. - LOG.info("Replay not started, or was not successful... still buffering updates."); - - /** this prev code is retained in case we want to switch strategies. 
- try { - ulog.dropBufferedUpdates(); - } catch (Exception e) { - SolrException.log(log, "", e); - } - **/ - } if (successfulRecovery) { LOG.info("Registering as Active after recovery."); try { diff --git a/solr/core/src/java/org/apache/solr/cloud/ReplicateFromLeader.java b/solr/core/src/java/org/apache/solr/cloud/ReplicateFromLeader.java index 0a742e3a5ae..aa648dd8869 100644 --- a/solr/core/src/java/org/apache/solr/cloud/ReplicateFromLeader.java +++ b/solr/core/src/java/org/apache/solr/cloud/ReplicateFromLeader.java @@ -97,7 +97,7 @@ public class ReplicateFromLeader { new ModifiableSolrParams()); CommitUpdateCommand cuc = new CommitUpdateCommand(req, false); cuc.setVersion(Long.parseLong(commitVersion)); - updateLog.copyOverOldUpdates(cuc); + updateLog.commitAndSwitchToNewTlog(cuc); lastVersion = Long.parseLong(commitVersion); } }); diff --git a/solr/core/src/java/org/apache/solr/cloud/autoscaling/ComputePlanAction.java b/solr/core/src/java/org/apache/solr/cloud/autoscaling/ComputePlanAction.java index 4a9c7442774..22e3ef5e77e 100644 --- a/solr/core/src/java/org/apache/solr/cloud/autoscaling/ComputePlanAction.java +++ b/solr/core/src/java/org/apache/solr/cloud/autoscaling/ComputePlanAction.java @@ -168,7 +168,13 @@ public class ComputePlanAction extends TriggerActionBase { // estimate a maximum default limit that should be sufficient for most purposes: // number of nodes * total number of replicas * 3 AtomicInteger totalRF = new AtomicInteger(); - clusterState.forEachCollection(coll -> totalRF.addAndGet(coll.getReplicationFactor() * coll.getSlices().size())); + clusterState.forEachCollection(coll -> { + Integer rf = coll.getReplicationFactor(); + if (rf == null) { + rf = coll.getReplicas().size() / coll.getSlices().size(); + } + totalRF.addAndGet(rf * coll.getSlices().size()); + }); int totalMax = clusterState.getLiveNodes().size() * totalRF.get() * 3; int maxOp = (Integer) autoScalingConfig.getProperties().getOrDefault(AutoScalingParams.MAX_COMPUTE_OPERATIONS, 
totalMax); Object o = event.getProperty(AutoScalingParams.MAX_COMPUTE_OPERATIONS, maxOp); diff --git a/solr/core/src/java/org/apache/solr/core/CoreContainer.java b/solr/core/src/java/org/apache/solr/core/CoreContainer.java index e108ae19f89..d546dd29b9c 100644 --- a/solr/core/src/java/org/apache/solr/core/CoreContainer.java +++ b/solr/core/src/java/org/apache/solr/core/CoreContainer.java @@ -16,20 +16,6 @@ */ package org.apache.solr.core; -import static java.util.Objects.requireNonNull; -import static org.apache.solr.common.params.CommonParams.AUTHC_PATH; -import static org.apache.solr.common.params.CommonParams.AUTHZ_PATH; -import static org.apache.solr.common.params.CommonParams.AUTOSCALING_HISTORY_PATH; -import static org.apache.solr.common.params.CommonParams.COLLECTIONS_HANDLER_PATH; -import static org.apache.solr.common.params.CommonParams.CONFIGSETS_HANDLER_PATH; -import static org.apache.solr.common.params.CommonParams.CORES_HANDLER_PATH; -import static org.apache.solr.common.params.CommonParams.HEALTH_CHECK_HANDLER_PATH; -import static org.apache.solr.common.params.CommonParams.INFO_HANDLER_PATH; -import static org.apache.solr.common.params.CommonParams.METRICS_HISTORY_PATH; -import static org.apache.solr.common.params.CommonParams.METRICS_PATH; -import static org.apache.solr.common.params.CommonParams.ZK_PATH; -import static org.apache.solr.security.AuthenticationPlugin.AUTHENTICATION_PLUGIN_PROP; - import java.io.IOException; import java.lang.invoke.MethodHandles; import java.nio.file.Path; @@ -64,15 +50,15 @@ import org.apache.solr.client.solrj.impl.SolrHttpClientContextBuilder; import org.apache.solr.client.solrj.impl.SolrHttpClientContextBuilder.AuthSchemeRegistryProvider; import org.apache.solr.client.solrj.impl.SolrHttpClientContextBuilder.CredentialsProviderProvider; import org.apache.solr.client.solrj.util.SolrIdentifierValidator; -import org.apache.solr.cloud.autoscaling.AutoScalingHandler; import org.apache.solr.cloud.CloudDescriptor; import 
org.apache.solr.cloud.Overseer; import org.apache.solr.cloud.ZkController; +import org.apache.solr.cloud.autoscaling.AutoScalingHandler; import org.apache.solr.common.SolrException; import org.apache.solr.common.SolrException.ErrorCode; +import org.apache.solr.common.cloud.DocCollection; import org.apache.solr.common.cloud.Replica; import org.apache.solr.common.cloud.Replica.State; -import org.apache.solr.common.cloud.DocCollection; import org.apache.solr.common.util.ExecutorUtil; import org.apache.solr.common.util.IOUtils; import org.apache.solr.common.util.Utils; @@ -106,6 +92,7 @@ import org.apache.solr.security.AuthenticationPlugin; import org.apache.solr.security.AuthorizationPlugin; import org.apache.solr.security.HttpClientBuilderPlugin; import org.apache.solr.security.PKIAuthenticationPlugin; +import org.apache.solr.security.PublicKeyHandler; import org.apache.solr.security.SecurityPluginHolder; import org.apache.solr.update.SolrCoreState; import org.apache.solr.update.UpdateShardHandler; @@ -116,7 +103,20 @@ import org.apache.zookeeper.KeeperException; import org.slf4j.Logger; import org.slf4j.LoggerFactory; +import static java.util.Objects.requireNonNull; +import static org.apache.solr.common.params.CommonParams.AUTHC_PATH; +import static org.apache.solr.common.params.CommonParams.AUTHZ_PATH; +import static org.apache.solr.common.params.CommonParams.AUTOSCALING_HISTORY_PATH; +import static org.apache.solr.common.params.CommonParams.COLLECTIONS_HANDLER_PATH; +import static org.apache.solr.common.params.CommonParams.CONFIGSETS_HANDLER_PATH; +import static org.apache.solr.common.params.CommonParams.CORES_HANDLER_PATH; +import static org.apache.solr.common.params.CommonParams.HEALTH_CHECK_HANDLER_PATH; +import static org.apache.solr.common.params.CommonParams.INFO_HANDLER_PATH; +import static org.apache.solr.common.params.CommonParams.METRICS_HISTORY_PATH; +import static org.apache.solr.common.params.CommonParams.METRICS_PATH; +import static 
org.apache.solr.common.params.CommonParams.ZK_PATH; import static org.apache.solr.core.CorePropertiesLocator.PROPERTIES_FILENAME; +import static org.apache.solr.security.AuthenticationPlugin.AUTHENTICATION_PLUGIN_PROP; /** * @@ -301,6 +301,7 @@ public class CoreContainer { public CoreContainer(NodeConfig config, Properties properties, CoresLocator locator, boolean asyncSolrCoreLoad) { this.loader = config.getSolrResourceLoader(); this.solrHome = loader.getInstancePath().toString(); + containerHandlers.put(PublicKeyHandler.PATH, new PublicKeyHandler()); this.cfg = requireNonNull(config); this.coresLocator = locator; this.containerProperties = new Properties(properties); @@ -548,7 +549,8 @@ public class CoreContainer { hostName = cfg.getNodeName(); zkSys.initZooKeeper(this, solrHome, cfg.getCloudConfig()); - if(isZooKeeperAware()) pkiAuthenticationPlugin = new PKIAuthenticationPlugin(this, zkSys.getZkController().getNodeName()); + if(isZooKeeperAware()) pkiAuthenticationPlugin = new PKIAuthenticationPlugin(this, zkSys.getZkController().getNodeName(), + (PublicKeyHandler) containerHandlers.get(PublicKeyHandler.PATH)); MDCLoggingContext.setNode(this); @@ -592,8 +594,7 @@ public class CoreContainer { containerHandlers.put(AUTHZ_PATH, securityConfHandler); securityConfHandler.initializeMetrics(metricManager, SolrInfoBean.Group.node.toString(), metricTag, AUTHZ_PATH); containerHandlers.put(AUTHC_PATH, securityConfHandler); - if(pkiAuthenticationPlugin != null) - containerHandlers.put(PKIAuthenticationPlugin.PATH, pkiAuthenticationPlugin.getRequestHandler()); + PluginInfo[] metricReporters = cfg.getMetricsConfig().getMetricReporters(); metricManager.loadReporters(metricReporters, loader, this, null, null, SolrInfoBean.Group.node); diff --git a/solr/core/src/java/org/apache/solr/core/SolrCore.java b/solr/core/src/java/org/apache/solr/core/SolrCore.java index 99c0cca0669..feab22dce1c 100644 --- a/solr/core/src/java/org/apache/solr/core/SolrCore.java +++ 
b/solr/core/src/java/org/apache/solr/core/SolrCore.java @@ -958,7 +958,7 @@ public final class SolrCore implements SolrInfoBean, SolrMetricProducer, Closeab initIndex(prev != null, reload); initWriters(); - qParserPlugins.init(createInstances(QParserPlugin.standardPlugins), this); + qParserPlugins.init(QParserPlugin.standardPlugins, this); valueSourceParsers.init(ValueSourceParser.standardValueSourceParsers, this); transformerFactories.init(TransformerFactory.defaultFactories, this); loadSearchComponents(); diff --git a/solr/core/src/java/org/apache/solr/core/SolrResourceLoader.java b/solr/core/src/java/org/apache/solr/core/SolrResourceLoader.java index 22753dd0c6a..0ff5c7b362c 100644 --- a/solr/core/src/java/org/apache/solr/core/SolrResourceLoader.java +++ b/solr/core/src/java/org/apache/solr/core/SolrResourceLoader.java @@ -16,6 +16,10 @@ */ package org.apache.solr.core; +import javax.naming.Context; +import javax.naming.InitialContext; +import javax.naming.NamingException; +import javax.naming.NoInitialContextException; import java.io.Closeable; import java.io.File; import java.io.FileOutputStream; @@ -47,10 +51,6 @@ import java.util.concurrent.ConcurrentSkipListSet; import java.util.regex.Matcher; import java.util.regex.Pattern; import java.util.stream.Collectors; -import javax.naming.Context; -import javax.naming.InitialContext; -import javax.naming.NamingException; -import javax.naming.NoInitialContextException; import org.apache.lucene.analysis.WordlistLoader; import org.apache.lucene.analysis.util.CharFilterFactory; @@ -88,9 +88,9 @@ public class SolrResourceLoader implements ResourceLoader,Closeable static final String project = "solr"; static final String base = "org.apache" + "." 
+ project; static final String[] packages = { - "", "analysis.", "schema.", "handler.", "search.", "update.", "core.", "response.", "request.", + "", "analysis.", "schema.", "handler.", "handler.tagger.", "search.", "update.", "core.", "response.", "request.", "update.processor.", "util.", "spelling.", "handler.component.", "handler.dataimport.", - "spelling.suggest.", "spelling.suggest.fst.", "rest.schema.analysis.", "security.","handler.admin.", + "spelling.suggest.", "spelling.suggest.fst.", "rest.schema.analysis.", "security.", "handler.admin.", "cloud.autoscaling." }; private static final java.lang.String SOLR_CORE_NAME = "solr.core.name"; diff --git a/solr/core/src/java/org/apache/solr/handler/admin/CollectionsHandler.java b/solr/core/src/java/org/apache/solr/handler/admin/CollectionsHandler.java index 01d2fe89884..269bb50641d 100644 --- a/solr/core/src/java/org/apache/solr/handler/admin/CollectionsHandler.java +++ b/solr/core/src/java/org/apache/solr/handler/admin/CollectionsHandler.java @@ -206,9 +206,10 @@ public class CollectionsHandler extends RequestHandlerBase implements Permission return this.coreContainer; } - protected void copyFromClusterProp(Map props, String prop) { + protected void copyFromClusterProp(Map props, String prop) throws IOException { if (props.get(prop) != null) return;//if it's already specified , return - Object defVal = coreContainer.getZkController().getZkStateReader().getClusterProperty(ImmutableList.of(COLLECTION_DEF, prop), null); + Object defVal = new ClusterProperties(coreContainer.getZkController().getZkStateReader().getZkClient()) + .getClusterProperty(ImmutableList.of(COLLECTION_DEF, prop), null); if (defVal != null) props.put(prop, String.valueOf(defVal)); } diff --git a/solr/core/src/java/org/apache/solr/handler/component/ExpandComponent.java b/solr/core/src/java/org/apache/solr/handler/component/ExpandComponent.java index 82a62d56d3e..9ffea4bc9c8 100644 --- 
a/solr/core/src/java/org/apache/solr/handler/component/ExpandComponent.java +++ b/solr/core/src/java/org/apache/solr/handler/component/ExpandComponent.java @@ -797,7 +797,8 @@ public class ExpandComponent extends SearchComponent implements PluginInfoInitia fieldInfo.getDocValuesGen(), fieldInfo.attributes(), fieldInfo.getPointDimensionCount(), - fieldInfo.getPointNumBytes()); + fieldInfo.getPointNumBytes(), + fieldInfo.isSoftDeletesField()); newInfos.add(f); } else { diff --git a/solr/core/src/java/org/apache/solr/handler/tagger/OffsetCorrector.java b/solr/core/src/java/org/apache/solr/handler/tagger/OffsetCorrector.java new file mode 100644 index 00000000000..1fb4911195d --- /dev/null +++ b/solr/core/src/java/org/apache/solr/handler/tagger/OffsetCorrector.java @@ -0,0 +1,178 @@ +/* + * This software was produced for the U. S. Government + * under Contract No. W15P7T-11-C-F600, and is + * subject to the Rights in Noncommercial Computer Software + * and Noncommercial Computer Software Documentation + * Clause 252.227-7014 (JUN 1995) + * + * Copyright 2013 The MITRE Corporation. All Rights Reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.solr.handler.tagger; + +import java.util.Arrays; + +import com.carrotsearch.hppc.IntArrayList; + +public abstract class OffsetCorrector { + + //TODO support a streaming style of consuming input text so that we need not take a + // String. 
Trickier because we need to keep more information as we parse to know when tags + // are adjacent with/without whitespace + + //Data structure requirements: + // Given a character offset: + // * determine what tagId is its parent. + // * determine if it is adjacent to the parent open tag, ignoring whitespace + // * determine if it is adjacent to the parent close tag, ignoring whitespace + // Given a tagId: + // * What is its parent tagId + // * What's the char offset of the start and end of the open tag + // * What's the char offset of the start and end of the close tag + + /** Document text. */ + protected final String docText; + + /** Array of tag info comprised of 5 int fields: + * [int parentTag, int openStartOff, int openEndOff, int closeStartOff, int closeEndOff]. + * Its size indicates how many tags there are. Tags are ID'ed sequentially from 0. */ + protected final IntArrayList tagInfo; + + /** offsets of parent tag id change (ascending order) */ + protected final IntArrayList parentChangeOffsets; + /** tag id; parallel array to parentChangeOffsets */ + protected final IntArrayList parentChangeIds; + + protected final int[] offsetPair = new int[] { -1, -1};//non-thread-safe state + + /** Disjoint start and end span offsets (inclusive) of non-taggable sections. Null if none. */ + protected final IntArrayList nonTaggableOffsets; + + /** + * Initialize based on the document text. + * @param docText non-null structured content. + * @param hasNonTaggable if there may be "non-taggable" tags to track + */ + protected OffsetCorrector(String docText, boolean hasNonTaggable) { + this.docText = docText; + final int guessNumElements = Math.max(docText.length() / 20, 4); + + tagInfo = new IntArrayList(guessNumElements * 5); + parentChangeOffsets = new IntArrayList(guessNumElements * 2); + parentChangeIds = new IntArrayList(guessNumElements * 2); + nonTaggableOffsets = hasNonTaggable ?
new IntArrayList(guessNumElements / 5) : null; + } + + /** Corrects the start and end offset pair. It will return null if it can't + * due to a failure to keep the offsets balance-able, or if it spans "non-taggable" tags. + * The start (left) offset is pulled left as needed over whitespace and opening tags. The end + * (right) offset is pulled right as needed over whitespace and closing tags. It's returned as + * a 2-element array. + *

Note that the returned array is internally reused; just use it to examine the response. + */ + public int[] correctPair(int leftOffset, int rightOffset) { + rightOffset = correctEndOffsetForCloseElement(rightOffset); + if (spansNonTaggable(leftOffset, rightOffset)) + return null; + + int startTag = lookupTag(leftOffset); + //offsetPair[0] = Math.max(offsetPair[0], getOpenStartOff(startTag)); + int endTag = lookupTag(rightOffset-1); + //offsetPair[1] = Math.min(offsetPair[1], getCloseStartOff(endTag)); + + // Find the ancestor tag enclosing offsetPair. And bump out left offset along the way. + int iTag = startTag; + for (; !tagEnclosesOffset(iTag, rightOffset); iTag = getParentTag(iTag)) { + //Ensure there is nothing except whitespace thru OpenEndOff + int tagOpenEndOff = getOpenEndOff(iTag); + if (hasNonWhitespace(tagOpenEndOff, leftOffset)) + return null; + leftOffset = getOpenStartOff(iTag); + } + final int ancestorTag = iTag; + // Bump out rightOffset until we get to ancestorTag. + for (iTag = endTag; iTag != ancestorTag; iTag = getParentTag(iTag)) { + //Ensure there is nothing except whitespace thru CloseStartOff + int tagCloseStartOff = getCloseStartOff(iTag); + if (hasNonWhitespace(rightOffset, tagCloseStartOff)) + return null; + rightOffset = getCloseEndOff(iTag); + } + + offsetPair[0] = leftOffset; + offsetPair[1] = rightOffset; + return offsetPair; + } + + /** Correct endOffset for adjacent element at the right side. E.g. offsetPair might point to: + *

+   *   foo</tag>
+   * 
+ * and this method pulls the end offset left to the '<'. This is necessary for use with + * {@link org.apache.lucene.analysis.charfilter.HTMLStripCharFilter}. + * + * See https://issues.apache.org/jira/browse/LUCENE-5734 */ + protected int correctEndOffsetForCloseElement(int endOffset) { + if (docText.charAt(endOffset-1) == '>') { + final int newEndOffset = docText.lastIndexOf('<', endOffset - 2); + if (newEndOffset > offsetPair[0])//just to be sure + return newEndOffset; + } + return endOffset; + } + + protected boolean hasNonWhitespace(int start, int end) { + for (int i = start; i < end; i++) { + if (!Character.isWhitespace(docText.charAt(i))) + return true; + } + return false; + } + + protected boolean tagEnclosesOffset(int tag, int off) { + return off >= getOpenStartOff(tag) && off < getCloseEndOff(tag); + } + + protected int getParentTag(int tag) { return tagInfo.get(tag * 5 + 0); } + protected int getOpenStartOff(int tag) { return tagInfo.get(tag * 5 + 1); } + protected int getOpenEndOff(int tag) { return tagInfo.get(tag * 5 + 2); } + protected int getCloseStartOff(int tag) { return tagInfo.get(tag * 5 + 3); } + protected int getCloseEndOff(int tag) { return tagInfo.get(tag * 5 + 4); } + + protected int lookupTag(int off) { + int idx = Arrays.binarySearch(parentChangeOffsets.buffer, 0, parentChangeOffsets.size(), off); + if (idx < 0) + idx = (-idx - 1) - 1;//round down + return parentChangeIds.get(idx); + } + + protected boolean spansNonTaggable(int startOff, int endOff) { + if (nonTaggableOffsets == null) + return false; + int idx = Arrays.binarySearch(nonTaggableOffsets.buffer, 0, nonTaggableOffsets.size(), startOff); + //if tag start coincides with first or last char of non-taggable span then result is true. 
+ // (probably never happens since those characters are actual element markup) + if (idx >= 0) + return true; + idx = -idx - 1;//modify for where we would insert + //if idx is odd then our span intersects a non-taggable span; return true + if ((idx & 1) == 1) + return true; + //it's non-taggable if the next non-taggable start span is before our endOff + if (idx == nonTaggableOffsets.size()) + return false; + return nonTaggableOffsets.get(idx) < endOff; + } +} diff --git a/solr/core/src/java/org/apache/solr/handler/tagger/TagClusterReducer.java b/solr/core/src/java/org/apache/solr/handler/tagger/TagClusterReducer.java new file mode 100644 index 00000000000..9310a0429e1 --- /dev/null +++ b/solr/core/src/java/org/apache/solr/handler/tagger/TagClusterReducer.java @@ -0,0 +1,103 @@ +/* + * This software was produced for the U. S. Government + * under Contract No. W15P7T-11-C-F600, and is + * subject to the Rights in Noncommercial Computer Software + * and Noncommercial Computer Software Documentation + * Clause 252.227-7014 (JUN 1995) + * + * Copyright 2013 The MITRE Corporation. All Rights Reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.solr.handler.tagger; + +public interface TagClusterReducer { + /** + * Reduces the linked-list to only those tags that should be emitted + * @param head not null; 1-element array to head which isn't null either + */ + void reduce(TagLL[] head); + + static final TagClusterReducer ALL = new TagClusterReducer() { + @Override + public void reduce(TagLL[] head) { + } + }; + + static final TagClusterReducer NO_SUB = new TagClusterReducer() { + @Override + public void reduce(TagLL[] head) { + //loop forward over all tags + for (TagLL tag = head[0].nextTag; tag != null; tag = tag.nextTag) { + //loop backwards over prev tags from this tag + for (TagLL tPrev = tag.prevTag; tPrev != null; tPrev = tPrev.prevTag) { + assert tPrev.startOffset <= tag.startOffset; + //if a previous tag's endOffset is >= this one's, 'tag' is a sub-tag and can be removed + if (tPrev.endOffset >= tag.endOffset) { + tag.removeLL(); + break; + } else if (tPrev.startOffset == tag.startOffset) { + tPrev.removeLL(); + //continue; 'tag' is still valid + } + } + } + } + }; + + static final TagClusterReducer LONGEST_DOMINANT_RIGHT = new TagClusterReducer() { + @Override + public void reduce(TagLL[] head) { + + //--Optimize for common single-tag case + if (head[0].nextTag == null) + return; + + while (true) { + //--Find longest not already marked + TagLL longest = null; + for (TagLL t = head[0]; t != null; t = t.nextTag) { + if (!t.mark && (longest == null || t.charLen() >= longest.charLen())) + longest = t; + } + if (longest == null) + break; + //--Mark longest (so we return it eventually) + longest.mark = true; + //--Remove tags overlapping this longest + for (TagLL t = head[0]; t != null; t = t.nextTag) { + if (t.mark) + continue; + + if (t.overlaps(longest)) { + t.removeLL(); + } else if (t.startOffset >= longest.endOffset) { + break;//no subsequent can possibly overlap + } + } + }//loop + + //all-remaining should be marked +// for (TagLL t = head; t != null; t = t.nextTag) { +// assert
t.mark; +//// if (!t.mark) { +//// t.removeLL(); +//// if (head == t) +//// head = t.nextTag; +//// } +// } + assert head[0].mark; + } + }; +} diff --git a/solr/core/src/java/org/apache/solr/handler/tagger/TagLL.java b/solr/core/src/java/org/apache/solr/handler/tagger/TagLL.java new file mode 100644 index 00000000000..e8bb0a3bc9b --- /dev/null +++ b/solr/core/src/java/org/apache/solr/handler/tagger/TagLL.java @@ -0,0 +1,176 @@ +/* + * This software was produced for the U. S. Government + * under Contract No. W15P7T-11-C-F600, and is + * subject to the Rights in Noncommercial Computer Software + * and Noncommercial Computer Software Documentation + * Clause 252.227-7014 (JUN 1995) + * + * Copyright 2013 The MITRE Corporation. All Rights Reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.solr.handler.tagger; + +import java.io.IOException; + +import org.apache.lucene.util.BytesRef; + +/** + * This is a Tag -- a startOffset, endOffset and value. + *

+ * A Tag starts without a value in an + * "advancing" state. {@link #advance(org.apache.lucene.util.BytesRef, int)} + * is called with subsequent words and then eventually it won't advance any + * more, and value is set (could be null). + *

+ * A Tag is also a doubly-linked-list (hence the LL in the name). All tags share + * a reference to the head via a 1-element array, which is potentially modified + * if any of the linked-list methods are called. Tags in the list should have + * equal or increasing start offsets. + */ +public class TagLL{ + + private final TagLL[] head;//a shared pointer to the head; 1 element + TagLL prevTag, nextTag; // linked list + + private TermPrefixCursor cursor; + + final int startOffset;//inclusive + int endOffset;//exclusive + Object value;//null means unset + + /** optional boolean used by some TagClusterReducer's */ + boolean mark = false; + + TagLL(TagLL[] head, TermPrefixCursor cursor, int startOffset, int endOffset, Object value) { + this.head = head; + this.cursor = cursor; + this.startOffset = startOffset; + this.endOffset = endOffset; + this.value = value; + } + + /** + * Advances this tag with "word" at offset "offset". If this tag is not in + * an advancing state then it does nothing. If it is advancing and prior to + * advancing further it sees a value, then a non-advancing tag may be inserted + * into the LL as side-effect. If this returns false (it didn't advance) and + * if there is no value, then it will also be removed. + * + * + * @param word The next word or null if at an end + * @param offset The last character in word's offset in the underlying + * stream. If word is null then it's meaningless. + * + * @return Whether it advanced or not. 
+ */ + boolean advance(BytesRef word, int offset) throws IOException { + if (!isAdvancing()) + return false; + + Object iVal = cursor.getDocIds(); + + if (word != null && cursor.advance(word)) { + + if (iVal != null) { + addBeforeLL(new TagLL(head, null, startOffset, endOffset, iVal)); + } + + assert offset >= endOffset; + endOffset = offset; + return true; + } else { + this.value = iVal; + this.cursor = null; + if (iVal == null) + removeLL(); + return false; + } + } + + /** Removes this tag from the chain, connecting prevTag and nextTag. Does not + * modify "this" object's pointers, so the caller can refer to nextTag after + * removing it. */ + public void removeLL() { + if (head[0] == this) + head[0] = nextTag; + if (prevTag != null) { + prevTag.nextTag = nextTag; + } + if (nextTag != null) { + nextTag.prevTag = prevTag; + } + } + + void addBeforeLL(TagLL tag) { + assert tag.startOffset <= startOffset; + if (prevTag != null) { + assert prevTag.startOffset <= tag.startOffset; + prevTag.nextTag = tag; + tag.prevTag = prevTag; + } else { + assert head[0] == this; + head[0] = tag; + } + prevTag = tag; + tag.nextTag = this; + } + + void addAfterLL(TagLL tag) { + assert tag.startOffset >= startOffset; + if (nextTag != null) { + assert nextTag.startOffset >= tag.startOffset; + nextTag.prevTag = tag; + tag.nextTag = nextTag; + } + nextTag = tag; + tag.prevTag = this; + } + + public int charLen() { + return endOffset - startOffset; + } + + public TagLL getNextTag() { + return nextTag; + } + + public TagLL getPrevTag() { + return prevTag; + } + + public int getStartOffset() { + return startOffset; + } + public int getEndOffset() { + return endOffset; + } + public boolean overlaps(TagLL other) { + //don't use >= or <= because startOffset is inclusive while endOffset is exclusive + if (startOffset < other.startOffset) + return endOffset > other.startOffset; + else + return startOffset < other.endOffset; + } + + boolean isAdvancing() { + return cursor != null; + } + + 
@Override + public String toString() { + return (prevTag != null ? '*' : '-') + "|" + (nextTag != null ? '*' : '-') + + " " + startOffset + " to " + endOffset + (isAdvancing() ? '+' : " #" + value); + } +} diff --git a/solr/core/src/java/org/apache/solr/handler/tagger/Tagger.java b/solr/core/src/java/org/apache/solr/handler/tagger/Tagger.java new file mode 100644 index 00000000000..12a4cf0a035 --- /dev/null +++ b/solr/core/src/java/org/apache/solr/handler/tagger/Tagger.java @@ -0,0 +1,230 @@ +/* + * This software was produced for the U. S. Government + * under Contract No. W15P7T-11-C-F600, and is + * subject to the Rights in Noncommercial Computer Software + * and Noncommercial Computer Software Documentation + * Clause 252.227-7014 (JUN 1995) + * + * Copyright 2013 The MITRE Corporation. All Rights Reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.solr.handler.tagger; + +import java.io.IOException; +import java.lang.invoke.MethodHandles; +import java.util.HashMap; +import java.util.Map; + +import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.tokenattributes.OffsetAttribute; +import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute; +import org.apache.lucene.analysis.tokenattributes.TermToBytesRefAttribute; +import org.apache.lucene.index.Terms; +import org.apache.lucene.util.Bits; +import org.apache.lucene.util.BytesRef; +import org.apache.lucene.util.IntsRef; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +/** + * Tags maximum string of words in a corpus. This is a callback-style API + * in which you implement {@link #tagCallback(int, int, Object)}. + * + * This class should be independently usable outside Solr. + */ +public abstract class Tagger { + private static final Logger log = LoggerFactory.getLogger(MethodHandles.lookup().lookupClass()); + + private final TokenStream tokenStream; + private final TermToBytesRefAttribute byteRefAtt; + private final PositionIncrementAttribute posIncAtt; + private final OffsetAttribute offsetAtt; + private final TaggingAttribute taggingAtt; + + private final TagClusterReducer tagClusterReducer; + private final Terms terms; + private final Bits liveDocs; + private final boolean skipAltTokens; + private final boolean ignoreStopWords; + + private Map docIdsCache; + + /** Whether the WARNING about skipped tokens was already logged. 
+ */ + private boolean loggedSkippedAltTokenWarning = false; + /** Wires this tagger to {@code terms} and {@code tokenStream}: registers the term, position-increment, offset and tagging attributes, then resets the stream. */ + public Tagger(Terms terms, Bits liveDocs, TokenStream tokenStream, + TagClusterReducer tagClusterReducer, boolean skipAltTokens, + boolean ignoreStopWords) throws IOException { + this.terms = terms; + this.liveDocs = liveDocs; + this.tokenStream = tokenStream; + this.skipAltTokens = skipAltTokens; + this.ignoreStopWords = ignoreStopWords; + byteRefAtt = tokenStream.addAttribute(TermToBytesRefAttribute.class); + posIncAtt = tokenStream.addAttribute(PositionIncrementAttribute.class); + offsetAtt = tokenStream.addAttribute(OffsetAttribute.class); + taggingAtt = tokenStream.addAttribute(TaggingAttribute.class); + tokenStream.reset(); + + this.tagClusterReducer = tagClusterReducer; + } + /** Turns on the docIds cache handed to each TermPrefixCursor, created with capacity {@code initSize}; does nothing unless {@code initSize > 0}. */ + public void enableDocIdsCache(int initSize) { + if (initSize > 0) + docIdsCache = new HashMap<>(initSize); + } + /** Consumes the token stream and reports tags through tagCallback; returns at once when {@code terms} is null. Calls tokenStream.end() but leaves closing to the caller. */ + public void process() throws IOException { + if (terms == null) + return; + + //a shared pointer to the head used by this method and each Tag instance. + final TagLL[] head = new TagLL[1]; + + TermPrefixCursor cursor = null;//re-used + + //boolean switch used to log a warning in case tokens were skipped during tagging. + boolean skippedTokens = false; + + while (tokenStream.incrementToken()) { + if (log.isTraceEnabled()) { + log.trace("Token: {}, posInc: {}, offset: [{},{}]", + byteRefAtt, posIncAtt.getPositionIncrement(), + offsetAtt.startOffset(), offsetAtt.endOffset()); + } + //check for posInc < 1 (alternate Tokens, such as expanded Synonyms) + if (posIncAtt.getPositionIncrement() < 1) { + //(a) Deal with this as a configuration issue and throw an exception + if (!skipAltTokens) { + //TODO throw UnsupportedTokenException when PhraseBuilder is ported + throw new IllegalStateException("Query Analyzer generates alternate " + + "Tokens (posInc == 0). Please adapt your Analyzer configuration or " + + "enable '" + TaggerRequestHandler.SKIP_ALT_TOKENS + "' to skip such " + + "tokens. 
NOTE: enabling '" + TaggerRequestHandler.SKIP_ALT_TOKENS + + "' might result in wrong tagging results if the index time analyzer " + + "is not configured accordingly. For detailed information see " + + "https://github.com/OpenSextant/SolrTextTagger/pull/11#issuecomment-24936225"); + } else { + //(b) In case the index time analyser had indexed all variants (users + // need to ensure that) processing of alternate tokens can be skipped + // as anyways all alternatives will be contained in the FST. + skippedTokens = true; + log.trace(" ... ignored token"); + continue; + } + } + //-- If PositionIncrement > 1 (stopwords) + if (!ignoreStopWords && posIncAtt.getPositionIncrement() > 1) { + log.trace(" - posInc > 1 ... mark cluster as done"); + advanceTagsAndProcessClusterIfDone(head, null); + } + + final BytesRef term; + //NOTE: we need to lookup tokens if + // * the TaggingAttribute marks the token as taggable OR + // * there are still advancing tags (to find the longest possible match) + if(taggingAtt.isTaggable() || head[0] != null){ + //-- Lookup the term id from the next token + term = byteRefAtt.getBytesRef(); + if (term.length == 0) { + throw new IllegalArgumentException("term: " + term.utf8ToString() + " analyzed to a zero-length token"); + } + } else { //no current cluster AND lookup == false ... + term = null; //skip this token + } + + //-- Process tag + advanceTagsAndProcessClusterIfDone(head, term); + + //-- only create new Tags for Tokens we need to lookup + if (taggingAtt.isTaggable() && term != null) { + + //determine if the terms index has a term starting with the provided term + // TODO create a pool of these cursors to reuse them more? 
could be trivial impl + if (cursor == null)// (else the existing cursor will be re-used) + cursor = new TermPrefixCursor(terms.iterator(), liveDocs, docIdsCache); + if (cursor.advance(term)) { + TagLL newTail = new TagLL(head, cursor, offsetAtt.startOffset(), offsetAtt.endOffset(), null); + cursor = null;//because the new tag now "owns" this instance + //and add it to the end + if (head[0] == null) { + head[0] = newTail; + } else { + for (TagLL t = head[0]; true; t = t.nextTag) { + if (t.nextTag == null) { + t.addAfterLL(newTail); + break; + } + } + } + } + }//if the token is taggable and was looked up + }//end while(incrementToken()) + + //-- Finish all tags + advanceTagsAndProcessClusterIfDone(head, null); + assert head[0] == null; + + if(!loggedSkippedAltTokenWarning && skippedTokens){ + loggedSkippedAltTokenWarning = true; //only log once + log.warn("The Tagger skipped some alternate tokens (tokens with posInc == 0) " + + "while processing text. This may cause problems with some Analyzer " + + "configurations (e.g. query time synonym expansion). For details see " + + "https://github.com/OpenSextant/SolrTextTagger/pull/11#issuecomment-24936225"); + } + + tokenStream.end(); + //tokenStream.close(); caller closes because caller acquired it + } + /** Advances every tag in the list with {@code term} (null stops them all); when none advanced and the list is non-empty, reduces the cluster, reports each remaining tag via tagCallback and empties the list. */ + private void advanceTagsAndProcessClusterIfDone(TagLL[] head, BytesRef term) throws IOException { + //-- Advance tags + final int endOffset = term != null ? offsetAtt.endOffset() : -1; + boolean anyAdvance = false; + for (TagLL t = head[0]; t != null; t = t.nextTag) { + anyAdvance |= t.advance(term, endOffset); + } + + //-- Process cluster if done + if (!anyAdvance && head[0] != null) { + tagClusterReducer.reduce(head); + for (TagLL t = head[0]; t != null; t = t.nextTag) { + assert t.value != null; + tagCallback(t.startOffset, t.endOffset, t.value); + } + head[0] = null; + } + } + + /** + * Invoked by {@link #process()} for each tag found. endOffset is always >= the endOffset + * given in the previous call. 
+ * + * @param startOffset The character offset of the original stream where the tag starts. + * @param endOffset One more than the character offset of the original stream where the tag ends. + * @param docIdsKey A reference to the matching docIds that can be resolved via {@link #lookupDocIds(Object)}. + */ + protected abstract void tagCallback(int startOffset, int endOffset, Object docIdsKey); + + /** + * Returns a sorted array of integer docIds given the corresponding key. + * @param docIdsKey The lookup key. + * @return Not null + */ + protected IntsRef lookupDocIds(Object docIdsKey) { + return (IntsRef) docIdsKey; + } +} + diff --git a/solr/core/src/java/org/apache/solr/handler/tagger/TaggerRequestHandler.java b/solr/core/src/java/org/apache/solr/handler/tagger/TaggerRequestHandler.java new file mode 100644 index 00000000000..a972e47165a --- /dev/null +++ b/solr/core/src/java/org/apache/solr/handler/tagger/TaggerRequestHandler.java @@ -0,0 +1,397 @@ +/* + * This software was produced for the U. S. Government + * under Contract No. W15P7T-11-C-F600, and is + * subject to the Rights in Noncommercial Computer Software + * and Noncommercial Computer Software Documentation + * Clause 252.227-7014 (JUN 1995) + * + * Copyright 2013 The MITRE Corporation. All Rights Reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.solr.handler.tagger; + +import javax.xml.stream.XMLStreamException; +import java.io.IOException; +import java.io.Reader; +import java.io.StringReader; +import java.lang.invoke.MethodHandles; +import java.util.ArrayList; +import java.util.HashMap; +import java.util.Iterator; +import java.util.List; +import java.util.Map; +import java.util.concurrent.Callable; + +import com.google.common.io.CharStreams; +import org.apache.lucene.analysis.Analyzer; +import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.core.StopFilterFactory; +import org.apache.lucene.analysis.util.TokenFilterFactory; +import org.apache.lucene.index.LeafReaderContext; +import org.apache.lucene.index.ReaderUtil; +import org.apache.lucene.index.Terms; +import org.apache.lucene.queries.function.FunctionValues; +import org.apache.lucene.queries.function.ValueSource; +import org.apache.lucene.search.DocIdSetIterator; +import org.apache.lucene.search.IndexSearcher; +import org.apache.lucene.search.Query; +import org.apache.lucene.util.BitSetIterator; +import org.apache.lucene.util.Bits; +import org.apache.lucene.util.FixedBitSet; +import org.apache.lucene.util.IntsRef; +import org.apache.solr.analysis.TokenizerChain; +import org.apache.solr.common.SolrException; +import org.apache.solr.common.params.CommonParams; +import org.apache.solr.common.params.SolrParams; +import org.apache.solr.common.util.ContentStream; +import org.apache.solr.common.util.NamedList; +import org.apache.solr.handler.RequestHandlerBase; +import org.apache.solr.request.SolrQueryRequest; +import org.apache.solr.response.SolrQueryResponse; +import org.apache.solr.schema.FieldType; +import org.apache.solr.schema.SchemaField; +import org.apache.solr.search.BitDocSet; +import org.apache.solr.search.DocList; +import org.apache.solr.search.DocSet; +import org.apache.solr.search.DocSlice; +import org.apache.solr.search.QParser; +import org.apache.solr.search.SolrIndexSearcher; +import 
org.apache.solr.search.SolrReturnFields; +import org.apache.solr.search.SyntaxError; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +/** + * Scans posted text, looking for matching strings in the Solr index. + * The public static final String members are request parameters. + * This handler is also called the "SolrTextTagger". + * + * @since 7.4.0 + */ +public class TaggerRequestHandler extends RequestHandlerBase { + + /** Request parameter: how overlapping tags are reduced; one of NO_SUB (the default), ALL or LONGEST_DOMINANT_RIGHT. */ + public static final String OVERLAPS = "overlaps"; + /** Request parameter: maximum number of tags to return; defaults to 1000. */ + public static final String TAGS_LIMIT = "tagsLimit"; + /** Request parameter: if true, each tag also carries the matched text; defaults to false. */ + public static final String MATCH_TEXT = "matchText"; + /** Request parameter: if true, tokens with a position increment of 0 are skipped instead of failing the request; defaults to false. */ + public static final String SKIP_ALT_TOKENS = "skipAltTokens"; + /** Request parameter: if true, position gaps larger than 1 do not end a tag cluster; defaults to whether the field's index analyzer contains a StopFilterFactory. */ + public static final String IGNORE_STOPWORDS = "ignoreStopwords"; + /** Request parameter: if true, the input is parsed as XML and tag offsets are corrected accordingly; defaults to false. */ + public static final String XML_OFFSET_ADJUST = "xmlOffsetAdjust"; + + private static final Logger log = LoggerFactory.getLogger(MethodHandles.lookup().lookupClass()); + + @Override + public String getDescription() { + return "Processes input text to find matching tokens stored in the index."; + } + + /** + * Tags the text of the single POSTed content stream against the terms of the field named by the + * required {@code field} parameter, and adds {@code tagsCount}, {@code tags} and the matching + * documents ({@code response}) to {@code rsp}. + * + * @throws SolrException BAD_REQUEST when {@code field} is missing or when zero or several content + * streams are posted; SERVER_ERROR when the schema has no uniqueKey field + */ + @Override + public void handleRequestBody(SolrQueryRequest req, SolrQueryResponse rsp) throws Exception { + + //--Read params + final String indexedField = req.getParams().get("field"); + if (indexedField == null) //a client mistake, so report it as 400 rather than as a server failure + throw new SolrException(SolrException.ErrorCode.BAD_REQUEST, "required param 'field'"); + + final TagClusterReducer tagClusterReducer = + chooseTagClusterReducer(req.getParams().get(OVERLAPS)); + final int rows = req.getParams().getInt(CommonParams.ROWS, 10000); + final int tagsLimit = req.getParams().getInt(TAGS_LIMIT, 1000); + final boolean addMatchText = req.getParams().getBool(MATCH_TEXT, false); + final SchemaField idSchemaField = req.getSchema().getUniqueKeyField(); + if (idSchemaField == null) { + throw new SolrException(SolrException.ErrorCode.SERVER_ERROR, "The tagger requires a " + + "uniqueKey in the schema.");//TODO this could be relaxed
+ } + final boolean skipAltTokens = req.getParams().getBool(SKIP_ALT_TOKENS, false); + final boolean ignoreStopWords = req.getParams().getBool(IGNORE_STOPWORDS, + fieldHasIndexedStopFilter(indexedField, req)); + + //--Get posted data + Reader inputReader = null; + Iterable<ContentStream> streams = req.getContentStreams(); + if (streams != null) { + Iterator<ContentStream> iter = streams.iterator(); + if (iter.hasNext()) { + inputReader = iter.next().getReader(); + } + if (iter.hasNext()) { + throw new SolrException(SolrException.ErrorCode.BAD_REQUEST, + getClass().getSimpleName()+" does not support multiple ContentStreams"); //TODO support bulk tagging? + } + } + if (inputReader == null) { + throw new SolrException(SolrException.ErrorCode.BAD_REQUEST, + getClass().getSimpleName()+" requires text to be POSTed to it"); + } + + // We may or may not need to read the input into a string + final InputStringLazy inputStringFuture = new InputStringLazy(inputReader); + + final OffsetCorrector offsetCorrector = getOffsetCorrector(req.getParams(), inputStringFuture); + + final String inputString;//only populated if needed + if (addMatchText || inputStringFuture.inputString != null) { + //Read the input fully into a String buffer that we'll need later, + // then replace the input with a reader wrapping the buffer. 
+ inputString = inputStringFuture.call(); + inputReader.close(); + inputReader = new StringReader(inputString); + } else { + inputString = null;//not used + } + + final SolrIndexSearcher searcher = req.getSearcher(); + final FixedBitSet matchDocIdsBS = new FixedBitSet(searcher.maxDoc()); + final List tags = new ArrayList(2000); + + try { + Analyzer analyzer = req.getSchema().getField(indexedField).getType().getQueryAnalyzer(); + try (TokenStream tokenStream = analyzer.tokenStream("", inputReader)) { + Terms terms = searcher.getSlowAtomicReader().terms(indexedField); + if (terms == null) + throw new SolrException(SolrException.ErrorCode.BAD_REQUEST, + "field " + indexedField + " has no indexed data"); + Tagger tagger = new Tagger(terms, computeDocCorpus(req), tokenStream, tagClusterReducer, + skipAltTokens, ignoreStopWords) { + @SuppressWarnings("unchecked") + @Override + protected void tagCallback(int startOffset, int endOffset, Object docIdsKey) { + if (tags.size() >= tagsLimit) + return; + if (offsetCorrector != null) { + int[] offsetPair = offsetCorrector.correctPair(startOffset, endOffset); + if (offsetPair == null) { + log.debug("Discarded offsets [{}, {}] because couldn't balance XML.", + startOffset, endOffset); + return; + } + startOffset = offsetPair[0]; + endOffset = offsetPair[1]; + } + + NamedList tag = new NamedList(); + tag.add("startOffset", startOffset); + tag.add("endOffset", endOffset); + if (addMatchText) + tag.add("matchText", inputString.substring(startOffset, endOffset)); + //below caches, and also flags matchDocIdsBS + tag.add("ids", lookupSchemaDocIds(docIdsKey)); + tags.add(tag); + } + + Map docIdsListCache = new HashMap<>(2000); + + ValueSourceAccessor uniqueKeyCache = new ValueSourceAccessor(searcher, + idSchemaField.getType().getValueSource(idSchemaField, null)); + + @SuppressWarnings("unchecked") + private List lookupSchemaDocIds(Object docIdsKey) { + List schemaDocIds = docIdsListCache.get(docIdsKey); + if (schemaDocIds != null) + 
return schemaDocIds; + IntsRef docIds = lookupDocIds(docIdsKey); + //translate lucene docIds to schema ids + schemaDocIds = new ArrayList(docIds.length); + for (int i = docIds.offset; i < docIds.offset + docIds.length; i++) { + int docId = docIds.ints[i]; + assert i == docIds.offset || docIds.ints[i - 1] < docId : "not sorted?"; + matchDocIdsBS.set(docId);//also, flip docid in bitset + try { + schemaDocIds.add(uniqueKeyCache.objectVal(docId));//translates here + } catch (IOException e) { + throw new SolrException(SolrException.ErrorCode.SERVER_ERROR, e); + } + } + assert !schemaDocIds.isEmpty(); + + docIdsListCache.put(docIds, schemaDocIds); + return schemaDocIds; + } + + }; + tagger.enableDocIdsCache(2000);//TODO configurable + tagger.process(); + } + } finally { + inputReader.close(); + } + rsp.add("tagsCount",tags.size()); + rsp.add("tags", tags); + + rsp.setReturnFields(new SolrReturnFields( req )); + + //Solr's standard name for matching docs in response + rsp.add("response", getDocList(rows, matchDocIdsBS)); + } + + private static class InputStringLazy implements Callable { + final Reader inputReader; + String inputString; + + InputStringLazy(Reader inputReader) { + this.inputReader = inputReader; + } + + @Override + public String call() throws IOException { + if (inputString == null) { + inputString = CharStreams.toString(inputReader); + } + return inputString; + } + } + + protected OffsetCorrector getOffsetCorrector(SolrParams params, Callable inputStringProvider) throws Exception { + final boolean xmlOffsetAdjust = params.getBool(XML_OFFSET_ADJUST, false); + if (!xmlOffsetAdjust) { + return null; + } + try { + return new XmlOffsetCorrector(inputStringProvider.call()); + } catch (XMLStreamException e) { + throw new SolrException(SolrException.ErrorCode.BAD_REQUEST, + "Expecting XML but wasn't: " + e, e); + } + } + + private DocList getDocList(int rows, FixedBitSet matchDocIdsBS) throws IOException { + //Now we must supply a Solr DocList and add it to the 
response. + // Typically this is gotten via a SolrIndexSearcher.search(), but in this case we + // know exactly what documents to return, the order doesn't matter nor does + // scoring. + // Ideally an implementation of DocList could be directly implemented off + // of a BitSet, but there are way too many methods to implement for a minor + // payoff. + int matchDocs = matchDocIdsBS.cardinality(); + int[] docIds = new int[ Math.min(rows, matchDocs) ]; + DocIdSetIterator docIdIter = new BitSetIterator(matchDocIdsBS, 1); + for (int i = 0; i < docIds.length; i++) { + docIds[i] = docIdIter.nextDoc(); + } + return new DocSlice(0, docIds.length, docIds, null, matchDocs, 1f); + } + + private TagClusterReducer chooseTagClusterReducer(String overlaps) { + TagClusterReducer tagClusterReducer; + if (overlaps == null || overlaps.equals("NO_SUB")) { + tagClusterReducer = TagClusterReducer.NO_SUB; + } else if (overlaps.equals("ALL")) { + tagClusterReducer = TagClusterReducer.ALL; + } else if (overlaps.equals("LONGEST_DOMINANT_RIGHT")) { + tagClusterReducer = TagClusterReducer.LONGEST_DOMINANT_RIGHT; + } else { + throw new SolrException(SolrException.ErrorCode.BAD_REQUEST, + "unknown tag overlap mode: "+overlaps); + } + return tagClusterReducer; + } + + /** + * The set of documents matching the provided 'fq' (filter query). Don't include deleted docs + * either. If null is returned, then all docs are available. 
+ */ + private Bits computeDocCorpus(SolrQueryRequest req) throws SyntaxError, IOException { + final String[] corpusFilterQueries = req.getParams().getParams("fq"); + final SolrIndexSearcher searcher = req.getSearcher(); + final Bits docBits; + if (corpusFilterQueries != null && corpusFilterQueries.length > 0) { + List filterQueries = new ArrayList(corpusFilterQueries.length); + for (String corpusFilterQuery : corpusFilterQueries) { + QParser qParser = QParser.getParser(corpusFilterQuery, null, req); + try { + filterQueries.add(qParser.parse()); + } catch (SyntaxError e) { + throw new SolrException(SolrException.ErrorCode.BAD_REQUEST, e); + } + } + + final DocSet docSet = searcher.getDocSet(filterQueries);//hopefully in the cache + //note: before Solr 4.7 we could call docSet.getBits() but no longer. + if (docSet instanceof BitDocSet) { + docBits = ((BitDocSet)docSet).getBits(); + } else { + docBits = new Bits() { + + @Override + public boolean get(int index) { + return docSet.exists(index); + } + + @Override + public int length() { + return searcher.maxDoc(); + } + }; + } + } else { + docBits = searcher.getSlowAtomicReader().getLiveDocs(); + } + return docBits; + } + + private boolean fieldHasIndexedStopFilter(String field, SolrQueryRequest req) { + FieldType fieldType = req.getSchema().getFieldType(field); + Analyzer analyzer = fieldType.getIndexAnalyzer();//index analyzer + if (analyzer instanceof TokenizerChain) { + TokenizerChain tokenizerChain = (TokenizerChain) analyzer; + TokenFilterFactory[] tokenFilterFactories = tokenizerChain.getTokenFilterFactories(); + for (TokenFilterFactory tokenFilterFactory : tokenFilterFactories) { + if (tokenFilterFactory instanceof StopFilterFactory) + return true; + } + } + return false; + } + + /** See LUCENE-4541 or {@link org.apache.solr.response.transform.ValueSourceAugmenter}. 
*/ + static class ValueSourceAccessor { + private final List readerContexts; + private final ValueSource valueSource; + private final Map fContext; + private final FunctionValues[] functionValuesPerSeg; + private final int[] functionValuesDocIdPerSeg; + + ValueSourceAccessor(IndexSearcher searcher, ValueSource valueSource) { + readerContexts = searcher.getIndexReader().leaves(); + this.valueSource = valueSource; + fContext = ValueSource.newContext(searcher); + functionValuesPerSeg = new FunctionValues[readerContexts.size()]; + functionValuesDocIdPerSeg = new int[readerContexts.size()]; + } + + Object objectVal(int topDocId) throws IOException { + // lookup segment level stuff: + int segIdx = ReaderUtil.subIndex(topDocId, readerContexts); + LeafReaderContext rcontext = readerContexts.get(segIdx); + int segDocId = topDocId - rcontext.docBase; + // unfortunately Lucene 7.0 requires forward only traversal (with no reset method). + // So we need to track our last docId (per segment) and re-fetch the FunctionValues. :-( + FunctionValues functionValues = functionValuesPerSeg[segIdx]; + if (functionValues == null || segDocId < functionValuesDocIdPerSeg[segIdx]) { + functionValues = functionValuesPerSeg[segIdx] = valueSource.getValues(fContext, rcontext); + } + functionValuesDocIdPerSeg[segIdx] = segDocId; + + // get value: + return functionValues.objectVal(segDocId); + } + } +} diff --git a/solr/core/src/java/org/apache/solr/handler/tagger/TaggingAttribute.java b/solr/core/src/java/org/apache/solr/handler/tagger/TaggingAttribute.java new file mode 100644 index 00000000000..b7803e4f31a --- /dev/null +++ b/solr/core/src/java/org/apache/solr/handler/tagger/TaggingAttribute.java @@ -0,0 +1,65 @@ +/* + * This software was produced for the U. S. Government + * under Contract No. 
W15P7T-11-C-F600, and is + * subject to the Rights in Noncommercial Computer Software + * and Noncommercial Computer Software Documentation + * Clause 252.227-7014 (JUN 1995) + * + * Copyright 2013 The MITRE Corporation. All Rights Reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.solr.handler.tagger; + +import org.apache.lucene.analysis.TokenFilter; +import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.util.Attribute; + +/** + * Attribute used by the {@link Tagger} to decide if a token can start a + * new {@link TagLL tag}. + *

+ * By default this Attribute will return true, but it might be + * reset by some {@link TokenFilter} added to the {@link TokenStream} used + * to analyze the parsed text. Typically this will be done based on NLP + * processing results (e.g. to only lookup Named Entities). + *

+ * NOTE: all tokens, taggable or not, are used to advance existing {@link TagLL tags}. + */ +public interface TaggingAttribute extends Attribute { + + /** + * By default this Attribute will be initialised with true. + * This ensures that all tokens are taggable by default (especially if + * the {@link TaggingAttribute} is not set by any component in the configured + * {@link TokenStream}). + */ + public static final boolean DEFAULT_TAGGABLE = true; + + /** + * Getter for the taggable state of the current Token. + * + * @return true if the current token may start a new tag + */ + public boolean isTaggable(); + + /** + * Setter for the taggable state. Typically called by code within + * {@link TokenFilter#incrementToken()}. + * + * @param lookup true if the current token may start a new tag + */ + public void setTaggable(boolean lookup); + +} diff --git a/solr/core/src/java/org/apache/solr/handler/tagger/TaggingAttributeImpl.java b/solr/core/src/java/org/apache/solr/handler/tagger/TaggingAttributeImpl.java new file mode 100644 index 00000000000..55ecfbc6ef2 --- /dev/null +++ b/solr/core/src/java/org/apache/solr/handler/tagger/TaggingAttributeImpl.java @@ -0,0 +1,79 @@ +/* + * This software was produced for the U. S. Government + * under Contract No. W15P7T-11-C-F600, and is + * subject to the Rights in Noncommercial Computer Software + * and Noncommercial Computer Software Documentation + * Clause 252.227-7014 (JUN 1995) + * + * Copyright 2013 The MITRE Corporation. All Rights Reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.solr.handler.tagger; + +import org.apache.lucene.util.AttributeImpl; +import org.apache.lucene.util.AttributeReflector; + +/** + * Implementation of the {@link TaggingAttribute}: a single boolean flag that + * {@link #clear()} restores to {@link TaggingAttribute#DEFAULT_TAGGABLE}. + */ +public class TaggingAttributeImpl extends AttributeImpl implements TaggingAttribute { + + /** + * the private field initialised with {@link TaggingAttribute#DEFAULT_TAGGABLE} + */ + private boolean taggable = TaggingAttribute.DEFAULT_TAGGABLE; + + /* + * (non-Javadoc) + * @see org.apache.solr.handler.tagger.TaggingAttribute#isTaggable() + */ + @Override + public boolean isTaggable() { + return taggable; + } + + /* + * (non-Javadoc) + * @see org.apache.solr.handler.tagger.TaggingAttribute#setTaggable(boolean) + */ + @Override + public void setTaggable(boolean lookup) { + this.taggable = lookup; + } + + /* + * (non-Javadoc) + * @see org.apache.lucene.util.AttributeImpl#clear() + */ + @Override + public void clear() { + taggable = DEFAULT_TAGGABLE; + } + + /* + * (non-Javadoc) + * @see org.apache.lucene.util.AttributeImpl#copyTo(org.apache.lucene.util.AttributeImpl) + */ + @Override + public void copyTo(AttributeImpl target) { + ((TaggingAttribute) target).setTaggable(taggable); + } + + @Override + public void reflectWith(AttributeReflector reflector) { + reflector.reflect(TaggingAttribute.class, "taggable", isTaggable()); + } + +} diff --git a/solr/core/src/java/org/apache/solr/handler/tagger/TermPrefixCursor.java b/solr/core/src/java/org/apache/solr/handler/tagger/TermPrefixCursor.java new file mode 100644 index 00000000000..1e82dbe4b5b --- /dev/null +++ b/solr/core/src/java/org/apache/solr/handler/tagger/TermPrefixCursor.java @@ -0,0 +1,189 @@ +/* + * This software was produced for the U. S. Government + * under Contract No. W15P7T-11-C-F600, and is + * subject to the Rights in Noncommercial Computer Software + * and Noncommercial Computer Software Documentation + * Clause 252.227-7014 (JUN 1995) + * + * Copyright 2013 The MITRE Corporation. All Rights Reserved. 
+ * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.solr.handler.tagger; + +import java.io.IOException; +import java.util.Map; + +import org.apache.lucene.analysis.miscellaneous.ConcatenateGraphFilter; +import org.apache.lucene.index.PostingsEnum; +import org.apache.lucene.index.TermsEnum; +import org.apache.lucene.util.Bits; +import org.apache.lucene.util.BytesRef; +import org.apache.lucene.util.BytesRefBuilder; +import org.apache.lucene.util.IntsRef; + +/** + * Cursor into the terms that advances by prefix. + */ +class TermPrefixCursor { + + //Note: this could be a lot more efficient if MemoryPostingsFormat supported ordinal lookup. + // Maybe that could be added to Lucene. + + // TODO add bloom filter of hashcode of first ~ 6 bytes to avoid lookup into terms dict? + + private static final byte SEPARATOR_CHAR = ConcatenateGraphFilter.SEP_LABEL; // used to be ' '; TODO configurable? 
+ private static final IntsRef EMPTY_INTSREF = new IntsRef(); + + private final TermsEnum termsEnum; + private final Bits liveDocs; + private final Map docIdsCache; + + private BytesRef prefixBuf;//we append to this + private BytesRefBuilder prefixBufBuilder = new BytesRefBuilder(); + private boolean prefixBufOnLoan;//if true, PB is loaned; needs to be copied + private PostingsEnum postingsEnum; + private IntsRef docIds; + /** {@code liveDocs} and {@code docIdsCache} may each be null; the code null-checks both before use. */ + TermPrefixCursor(TermsEnum termsEnum, Bits liveDocs, Map docIdsCache) { + this.termsEnum = termsEnum; + this.liveDocs = liveDocs; + this.docIdsCache = docIdsCache; + } + + /** Appends the separator char (if not the first) plus the given word to the prefix buffer, + * then seeks to it. If the seek fails, false is returned, the prefix is dropped and this cursor + * can be re-used as if in a new state. The {@code word} BytesRef is considered temporary, + * and is not saved within this class. */ + boolean advance(BytesRef word) throws IOException { + if (prefixBuf == null) { // first advance + //set prefixBuf to word temporarily. When advance() completes, we either null out or copy. + prefixBuf = word; + prefixBufOnLoan = true; + if (seekPrefix()) {//... and we have to + ensureBufIsACopy(); + return true; + } else { + prefixBuf = null;//just to be darned sure 'word' isn't referenced here + return false; + } + + } else { // subsequent advance + //append to existing + assert !prefixBufOnLoan; + + prefixBufBuilder.append(SEPARATOR_CHAR); + prefixBufBuilder.append(word); + prefixBuf = prefixBufBuilder.get(); + if (seekPrefix()) { + return true; + } else { + prefixBuf = null; + return false; + } + } + } + /** If prefixBuf still aliases the caller's word, copies its bytes into prefixBufBuilder and points prefixBuf at that copy. */ + private void ensureBufIsACopy() { + if (!prefixBufOnLoan) + return; + + prefixBufBuilder.clear(); + prefixBufBuilder.copyBytes(prefixBuf); + prefixBuf = prefixBufBuilder.get(); + prefixBufOnLoan = false; + } + + /** Seeks to prefixBuf or the next term that is prefixed by prefixBuf plus the separator char. + * Sets docIds. 
**/ + private boolean seekPrefix() throws IOException { + TermsEnum.SeekStatus seekStatus = termsEnum.seekCeil(prefixBuf); + + docIds = null;//invalidate + switch (seekStatus) { + case END: + return false; + + case FOUND: + postingsEnum = termsEnum.postings(postingsEnum, PostingsEnum.NONE); + docIds = postingsEnumToIntsRef(postingsEnum, liveDocs); + if (docIds.length > 0) { + return true; + } + + //Pretend we didn't find it; go to next term + docIds = null; + if (termsEnum.next() == null) { // case END + return false; + } + //fall through to NOT_FOUND + + case NOT_FOUND: + //termsEnum must start with prefixBuf to continue + BytesRef teTerm = termsEnum.term(); + + if (teTerm.length > prefixBuf.length) { + for (int i = 0; i < prefixBuf.length; i++) { + if (prefixBuf.bytes[prefixBuf.offset + i] != teTerm.bytes[teTerm.offset + i]) + return false; + } + if (teTerm.bytes[teTerm.offset + prefixBuf.length] != SEPARATOR_CHAR) + return false; + return true; + } + return false; + } + throw new IllegalStateException(seekStatus.toString()); + } + + /** Returns an IntsRef either cached or reading postingsEnum. Not null. 
*/ + private IntsRef postingsEnumToIntsRef(PostingsEnum postingsEnum, Bits liveDocs) throws IOException { + // (The cache can have empty IntsRefs) + + //lookup prefixBuf in a cache + if (docIdsCache != null) { + docIds = docIdsCache.get(prefixBuf); + if (docIds != null) { + return docIds; + } + } + + //read postingsEnum + docIds = new IntsRef(termsEnum.docFreq()); + int docId; + while ((docId = postingsEnum.nextDoc()) != PostingsEnum.NO_MORE_DOCS) { + if (liveDocs != null && !liveDocs.get(postingsEnum.docID())) { + continue; + } + docIds.ints[docIds.length++] = docId; + } + if (docIds.length == 0) + docIds = EMPTY_INTSREF; + + //cache + if (docIdsCache != null) { + ensureBufIsACopy(); + //clone is shallow; that's okay as the prefix isn't overwritten; it's just appended to + docIdsCache.put(prefixBuf.clone(), docIds); + } + return docIds; + } + + /** The docIds of the last call to advance, if it returned true. It might be null, but + * its length won't be 0. Treat as immutable. */ + IntsRef getDocIds() { + assert docIds == null || docIds.length != 0; + return docIds; + } +} diff --git a/solr/core/src/java/org/apache/solr/handler/tagger/XmlOffsetCorrector.java b/solr/core/src/java/org/apache/solr/handler/tagger/XmlOffsetCorrector.java new file mode 100644 index 00000000000..576328f65be --- /dev/null +++ b/solr/core/src/java/org/apache/solr/handler/tagger/XmlOffsetCorrector.java @@ -0,0 +1,113 @@ +/* + * This software was produced for the U. S. Government + * under Contract No. W15P7T-11-C-F600, and is + * subject to the Rights in Noncommercial Computer Software + * and Noncommercial Computer Software Documentation + * Clause 252.227-7014 (JUN 1995) + * + * Copyright 2013 The MITRE Corporation. All Rights Reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.solr.handler.tagger; + +import javax.xml.stream.XMLResolver; +import javax.xml.stream.XMLStreamException; +import javax.xml.stream.events.XMLEvent; +import java.io.InputStream; +import java.io.StringReader; + +import com.ctc.wstx.stax.WstxInputFactory; +import org.apache.commons.io.input.ClosedInputStream; +import org.codehaus.stax2.LocationInfo; +import org.codehaus.stax2.XMLInputFactory2; +import org.codehaus.stax2.XMLStreamReader2; + +/** + * Corrects offsets to adjust for XML formatted data. The goal is such that the caller should be + * able to insert a start XML tag at the start offset and a corresponding end XML tag at the end + * offset of the tagger, and have it be valid XML. See {@link #correctPair(int, int)}. + * + * This will not work on invalid XML. + * + * Not thread-safe. + */ +public class XmlOffsetCorrector extends OffsetCorrector { + + //TODO use StAX without hard requirement on woodstox. xmlStreamReader.getLocation().getCharacterOffset() + + private static final XMLInputFactory2 XML_INPUT_FACTORY; + static { + // note: similar code in Solr's EmptyEntityResolver + XML_INPUT_FACTORY = new WstxInputFactory(); + XML_INPUT_FACTORY.setXMLResolver(new XMLResolver() { + @Override + public InputStream resolveEntity(String publicId, String systemId, String baseURI, String namespace) { + return ClosedInputStream.CLOSED_INPUT_STREAM; + } + }); + // TODO disable DTD? 
+ // XML_INPUT_FACTORY.setProperty(XMLInputFactory.IS_VALIDATING, Boolean.FALSE) + XML_INPUT_FACTORY.configureForSpeed(); + } + + /** + * Initialize based on the document text. + * @param docText non-null XML content. + * @throws XMLStreamException If there's a problem parsing the XML. + */ + public XmlOffsetCorrector(String docText) throws XMLStreamException { + super(docText, false); + + int tagCounter = 0; + int thisTag = -1; + + //note: we *could* add a virtual outer tag to guarantee all text is in the context of a tag, + // but we shouldn't need to because there is no findable text outside the top element. + + final XMLStreamReader2 xmlStreamReader = + (XMLStreamReader2) XML_INPUT_FACTORY.createXMLStreamReader(new StringReader(docText)); + + while (xmlStreamReader.hasNext()) { + int eventType = xmlStreamReader.next(); + switch (eventType) { + case XMLEvent.START_ELEMENT: { + tagInfo.ensureCapacity(tagInfo.size() + 5); + final int parentTag = thisTag; + final LocationInfo info = xmlStreamReader.getLocationInfo(); + tagInfo.add(parentTag); + tagInfo.add((int) info.getStartingCharOffset(), (int) info.getEndingCharOffset()); + tagInfo.add(-1, -1);//these 2 will be populated when we get to the close tag + thisTag = tagCounter++; + + parentChangeOffsets.add((int) info.getStartingCharOffset()); + parentChangeIds.add(thisTag); + break; + } + case XMLEvent.END_ELEMENT: { + final LocationInfo info = xmlStreamReader.getLocationInfo(); + tagInfo.set(5 * thisTag + 3, (int) info.getStartingCharOffset()); + tagInfo.set(5 * thisTag + 4, (int) info.getEndingCharOffset()); + thisTag = getParentTag(thisTag); + + parentChangeOffsets.add((int) info.getEndingCharOffset()); + parentChangeIds.add(thisTag); + break; + } + default: //do nothing + } + } + } + +} diff --git a/solr/core/src/java/org/apache/solr/handler/tagger/package-info.java b/solr/core/src/java/org/apache/solr/handler/tagger/package-info.java new file mode 100644 index 00000000000..c2055b308e5 --- /dev/null +++ 
b/solr/core/src/java/org/apache/solr/handler/tagger/package-info.java @@ -0,0 +1,27 @@ +/* + * This software was produced for the U. S. Government + * under Contract No. W15P7T-11-C-F600, and is + * subject to the Rights in Noncommercial Computer Software + * and Noncommercial Computer Software Documentation + * Clause 252.227-7014 (JUN 1995) + * + * Copyright 2013 The MITRE Corporation. All Rights Reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** + * The {@link org.apache.solr.handler.tagger.TaggerRequestHandler} and supporting classes. + * This was formerly known as OpenSextant's SolrTextTagger. 
+ */ +package org.apache.solr.handler.tagger; \ No newline at end of file diff --git a/solr/core/src/java/org/apache/solr/search/CollapsingQParserPlugin.java b/solr/core/src/java/org/apache/solr/search/CollapsingQParserPlugin.java index 76a52583e32..d0f8cd4633e 100644 --- a/solr/core/src/java/org/apache/solr/search/CollapsingQParserPlugin.java +++ b/solr/core/src/java/org/apache/solr/search/CollapsingQParserPlugin.java @@ -425,7 +425,7 @@ public class CollapsingQParserPlugin extends QParserPlugin { DocValuesType.NONE, fieldInfo.getDocValuesGen(), fieldInfo.attributes(), - 0, 0); + 0, 0, fieldInfo.isSoftDeletesField()); newInfos.add(f); } else { diff --git a/solr/core/src/java/org/apache/solr/search/Insanity.java b/solr/core/src/java/org/apache/solr/search/Insanity.java index aa366521e88..8fe081f947b 100644 --- a/solr/core/src/java/org/apache/solr/search/Insanity.java +++ b/solr/core/src/java/org/apache/solr/search/Insanity.java @@ -66,7 +66,7 @@ public class Insanity { if (fi.name.equals(insaneField)) { filteredInfos.add(new FieldInfo(fi.name, fi.number, fi.hasVectors(), fi.omitsNorms(), fi.hasPayloads(), fi.getIndexOptions(), DocValuesType.NONE, -1, Collections.emptyMap(), - fi.getPointDimensionCount(), fi.getPointNumBytes())); + fi.getPointDimensionCount(), fi.getPointNumBytes(), fi.isSoftDeletesField())); } else { filteredInfos.add(fi); } diff --git a/solr/core/src/java/org/apache/solr/search/QParserPlugin.java b/solr/core/src/java/org/apache/solr/search/QParserPlugin.java index f80bc9c3cda..b20c3c87ac9 100644 --- a/solr/core/src/java/org/apache/solr/search/QParserPlugin.java +++ b/solr/core/src/java/org/apache/solr/search/QParserPlugin.java @@ -37,53 +37,53 @@ public abstract class QParserPlugin implements NamedListInitializedPlugin, SolrI public static final String DEFAULT_QTYPE = LuceneQParserPlugin.NAME; /** - * Internal use - name to class mappings of builtin parsers. + * Internal use - name to parser for the builtin parsers. 
* Each query parser plugin extending {@link QParserPlugin} has own instance of standardPlugins. * This leads to cyclic dependencies of static fields and to case when NAME field is not yet initialized. * This result to NPE during initialization. * For every plugin, listed here, NAME field has to be final and static. */ - public static final Map> standardPlugins; + public static final Map standardPlugins; static { - HashMap> map = new HashMap<>(30, 1); - map.put(LuceneQParserPlugin.NAME, LuceneQParserPlugin.class); - map.put(FunctionQParserPlugin.NAME, FunctionQParserPlugin.class); - map.put(PrefixQParserPlugin.NAME, PrefixQParserPlugin.class); - map.put(BoostQParserPlugin.NAME, BoostQParserPlugin.class); - map.put(DisMaxQParserPlugin.NAME, DisMaxQParserPlugin.class); - map.put(ExtendedDismaxQParserPlugin.NAME, ExtendedDismaxQParserPlugin.class); - map.put(FieldQParserPlugin.NAME, FieldQParserPlugin.class); - map.put(RawQParserPlugin.NAME, RawQParserPlugin.class); - map.put(TermQParserPlugin.NAME, TermQParserPlugin.class); - map.put(TermsQParserPlugin.NAME, TermsQParserPlugin.class); - map.put(NestedQParserPlugin.NAME, NestedQParserPlugin.class); - map.put(FunctionRangeQParserPlugin.NAME, FunctionRangeQParserPlugin.class); - map.put(SpatialFilterQParserPlugin.NAME, SpatialFilterQParserPlugin.class); - map.put(SpatialBoxQParserPlugin.NAME, SpatialBoxQParserPlugin.class); - map.put(JoinQParserPlugin.NAME, JoinQParserPlugin.class); - map.put(SurroundQParserPlugin.NAME, SurroundQParserPlugin.class); - map.put(SwitchQParserPlugin.NAME, SwitchQParserPlugin.class); - map.put(MaxScoreQParserPlugin.NAME, MaxScoreQParserPlugin.class); - map.put(BlockJoinParentQParserPlugin.NAME, BlockJoinParentQParserPlugin.class); - map.put(BlockJoinChildQParserPlugin.NAME, BlockJoinChildQParserPlugin.class); - map.put(FiltersQParserPlugin.NAME, FiltersQParserPlugin.class); - map.put(CollapsingQParserPlugin.NAME, CollapsingQParserPlugin.class); - map.put(SimpleQParserPlugin.NAME, 
SimpleQParserPlugin.class); - map.put(ComplexPhraseQParserPlugin.NAME, ComplexPhraseQParserPlugin.class); - map.put(ReRankQParserPlugin.NAME, ReRankQParserPlugin.class); - map.put(ExportQParserPlugin.NAME, ExportQParserPlugin.class); - map.put(MLTQParserPlugin.NAME, MLTQParserPlugin.class); - map.put(HashQParserPlugin.NAME, HashQParserPlugin.class); - map.put(GraphQParserPlugin.NAME, GraphQParserPlugin.class); - map.put(XmlQParserPlugin.NAME, XmlQParserPlugin.class); - map.put(GraphTermsQParserPlugin.NAME, GraphTermsQParserPlugin.class); - map.put(IGainTermsQParserPlugin.NAME, IGainTermsQParserPlugin.class); - map.put(TextLogisticRegressionQParserPlugin.NAME, TextLogisticRegressionQParserPlugin.class); - map.put(SignificantTermsQParserPlugin.NAME, SignificantTermsQParserPlugin.class); - map.put(PayloadScoreQParserPlugin.NAME, PayloadScoreQParserPlugin.class); - map.put(PayloadCheckQParserPlugin.NAME, PayloadCheckQParserPlugin.class); - map.put(BoolQParserPlugin.NAME, BoolQParserPlugin.class); + HashMap map = new HashMap<>(30, 1); + map.put(LuceneQParserPlugin.NAME, new LuceneQParserPlugin()); + map.put(FunctionQParserPlugin.NAME, new FunctionQParserPlugin()); + map.put(PrefixQParserPlugin.NAME, new PrefixQParserPlugin()); + map.put(BoostQParserPlugin.NAME, new BoostQParserPlugin()); + map.put(DisMaxQParserPlugin.NAME, new DisMaxQParserPlugin()); + map.put(ExtendedDismaxQParserPlugin.NAME, new ExtendedDismaxQParserPlugin()); + map.put(FieldQParserPlugin.NAME, new FieldQParserPlugin()); + map.put(RawQParserPlugin.NAME, new RawQParserPlugin()); + map.put(TermQParserPlugin.NAME, new TermQParserPlugin()); + map.put(TermsQParserPlugin.NAME, new TermsQParserPlugin()); + map.put(NestedQParserPlugin.NAME, new NestedQParserPlugin()); + map.put(FunctionRangeQParserPlugin.NAME, new FunctionRangeQParserPlugin()); + map.put(SpatialFilterQParserPlugin.NAME, new SpatialFilterQParserPlugin()); + map.put(SpatialBoxQParserPlugin.NAME, new SpatialBoxQParserPlugin()); + 
map.put(JoinQParserPlugin.NAME, new JoinQParserPlugin()); + map.put(SurroundQParserPlugin.NAME, new SurroundQParserPlugin()); + map.put(SwitchQParserPlugin.NAME, new SwitchQParserPlugin()); + map.put(MaxScoreQParserPlugin.NAME, new MaxScoreQParserPlugin()); + map.put(BlockJoinParentQParserPlugin.NAME, new BlockJoinParentQParserPlugin()); + map.put(BlockJoinChildQParserPlugin.NAME, new BlockJoinChildQParserPlugin()); + map.put(FiltersQParserPlugin.NAME, new FiltersQParserPlugin()); + map.put(CollapsingQParserPlugin.NAME, new CollapsingQParserPlugin()); + map.put(SimpleQParserPlugin.NAME, new SimpleQParserPlugin()); + map.put(ComplexPhraseQParserPlugin.NAME, new ComplexPhraseQParserPlugin()); + map.put(ReRankQParserPlugin.NAME, new ReRankQParserPlugin()); + map.put(ExportQParserPlugin.NAME, new ExportQParserPlugin()); + map.put(MLTQParserPlugin.NAME, new MLTQParserPlugin()); + map.put(HashQParserPlugin.NAME, new HashQParserPlugin()); + map.put(GraphQParserPlugin.NAME, new GraphQParserPlugin()); + map.put(XmlQParserPlugin.NAME, new XmlQParserPlugin()); + map.put(GraphTermsQParserPlugin.NAME, new GraphTermsQParserPlugin()); + map.put(IGainTermsQParserPlugin.NAME, new IGainTermsQParserPlugin()); + map.put(TextLogisticRegressionQParserPlugin.NAME, new TextLogisticRegressionQParserPlugin()); + map.put(SignificantTermsQParserPlugin.NAME, new SignificantTermsQParserPlugin()); + map.put(PayloadScoreQParserPlugin.NAME, new PayloadScoreQParserPlugin()); + map.put(PayloadCheckQParserPlugin.NAME, new PayloadCheckQParserPlugin()); + map.put(BoolQParserPlugin.NAME, new BoolQParserPlugin()); standardPlugins = Collections.unmodifiableMap(map); } diff --git a/solr/core/src/java/org/apache/solr/security/PKIAuthenticationPlugin.java b/solr/core/src/java/org/apache/solr/security/PKIAuthenticationPlugin.java index 877e4f16cd6..43dac480168 100644 --- a/solr/core/src/java/org/apache/solr/security/PKIAuthenticationPlugin.java +++ 
b/solr/core/src/java/org/apache/solr/security/PKIAuthenticationPlugin.java @@ -47,11 +47,7 @@ import org.apache.solr.common.util.StrUtils; import org.apache.solr.common.util.SuppressForbidden; import org.apache.solr.common.util.Utils; import org.apache.solr.core.CoreContainer; -import org.apache.solr.handler.RequestHandlerBase; -import org.apache.solr.request.SolrQueryRequest; -import org.apache.solr.request.SolrRequestHandler; import org.apache.solr.request.SolrRequestInfo; -import org.apache.solr.response.SolrQueryResponse; import org.apache.solr.util.CryptoKeys; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -62,7 +58,7 @@ import static java.nio.charset.StandardCharsets.UTF_8; public class PKIAuthenticationPlugin extends AuthenticationPlugin implements HttpClientBuilderPlugin { private static final Logger log = LoggerFactory.getLogger(MethodHandles.lookup().lookupClass()); private final Map keyCache = new ConcurrentHashMap<>(); - private final CryptoKeys.RSAKeyPair keyPair = new CryptoKeys.RSAKeyPair(); + private final PublicKeyHandler publicKeyHandler; private final CoreContainer cores; private final int MAX_VALIDITY = Integer.parseInt(System.getProperty("pkiauth.ttl", "10000")); private final String myNodeName; @@ -77,7 +73,8 @@ public class PKIAuthenticationPlugin extends AuthenticationPlugin implements Htt return interceptorRegistered; } - public PKIAuthenticationPlugin(CoreContainer cores, String nodeName) { + public PKIAuthenticationPlugin(CoreContainer cores, String nodeName, PublicKeyHandler publicKeyHandler) { + this.publicKeyHandler = publicKeyHandler; this.cores = cores; myNodeName = nodeName; } @@ -92,7 +89,7 @@ public class PKIAuthenticationPlugin extends AuthenticationPlugin implements Htt public boolean doAuthenticate(ServletRequest request, ServletResponse response, FilterChain filterChain) throws Exception { String requestURI = ((HttpServletRequest) request).getRequestURI(); - if (requestURI.endsWith(PATH)) { + if 
(requestURI.endsWith(PublicKeyHandler.PATH)) { filterChain.doFilter(request, response); return true; } @@ -198,7 +195,7 @@ public class PKIAuthenticationPlugin extends AuthenticationPlugin implements Htt String url = cores.getZkController().getZkStateReader().getBaseUrlForNodeName(nodename); HttpEntity entity = null; try { - String uri = url + PATH + "?wt=json&omitHeader=true"; + String uri = url + PublicKeyHandler.PATH + "?wt=json&omitHeader=true"; log.debug("Fetching fresh public key from : {}",uri); HttpResponse rsp = cores.getUpdateShardHandler().getDefaultHttpClient() .execute(new HttpGet(uri), HttpClientUtil.createNewHttpClientRequestContext()); @@ -207,7 +204,7 @@ public class PKIAuthenticationPlugin extends AuthenticationPlugin implements Htt Map m = (Map) Utils.fromJSON(bytes); String key = (String) m.get("key"); if (key == null) { - log.error("No key available from " + url + PATH); + log.error("No key available from " + url + PublicKeyHandler.PATH); return null; } else { log.info("New Key obtained from node: {} / {}", nodename, key); @@ -230,26 +227,6 @@ public class PKIAuthenticationPlugin extends AuthenticationPlugin implements Htt return builder; } - public SolrRequestHandler getRequestHandler() { - return new RequestHandlerBase() { - @Override - public void handleRequestBody(SolrQueryRequest req, SolrQueryResponse rsp) throws Exception { - rsp.add("key", keyPair.getPublicKeyStr()); - } - - @Override - public String getDescription() { - return "Return the public key of this server"; - } - - @Override - public Category getCategory() { - return Category.ADMIN; - } - - }; - } - public boolean needsAuthorization(HttpServletRequest req) { return req.getUserPrincipal() != SU; } @@ -292,7 +269,7 @@ public class PKIAuthenticationPlugin extends AuthenticationPlugin implements Htt String s = usr + " " + System.currentTimeMillis(); byte[] payload = s.getBytes(UTF_8); - byte[] payloadCipher = keyPair.encrypt(ByteBuffer.wrap(payload)); + byte[] payloadCipher = 
publicKeyHandler.keyPair.encrypt(ByteBuffer.wrap(payload)); String base64Cipher = Base64.byteArrayToBase64(payloadCipher); httpRequest.setHeader(HEADER, myNodeName + " " + base64Cipher); } @@ -316,11 +293,10 @@ public class PKIAuthenticationPlugin extends AuthenticationPlugin implements Htt } public String getPublicKey() { - return keyPair.getPublicKeyStr(); + return publicKeyHandler.getPublicKey(); } public static final String HEADER = "SolrAuth"; - public static final String PATH = "/admin/info/key"; public static final String NODE_IS_USER = "$"; // special principal to denote the cluster member private static final Principal SU = new BasicUserPrincipal("$"); diff --git a/solr/core/src/java/org/apache/solr/security/PublicKeyHandler.java b/solr/core/src/java/org/apache/solr/security/PublicKeyHandler.java new file mode 100644 index 00000000000..ad835782a74 --- /dev/null +++ b/solr/core/src/java/org/apache/solr/security/PublicKeyHandler.java @@ -0,0 +1,47 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.solr.security; + +import org.apache.solr.handler.RequestHandlerBase; +import org.apache.solr.request.SolrQueryRequest; +import org.apache.solr.response.SolrQueryResponse; +import org.apache.solr.util.CryptoKeys; + +public class PublicKeyHandler extends RequestHandlerBase { + public static final String PATH = "/admin/info/key"; + final CryptoKeys.RSAKeyPair keyPair = new CryptoKeys.RSAKeyPair(); + + public String getPublicKey() { + return keyPair.getPublicKeyStr(); + } + + @Override + public void handleRequestBody(SolrQueryRequest req, SolrQueryResponse rsp) throws Exception { + rsp.add("key", keyPair.getPublicKeyStr()); + } + + @Override + public String getDescription() { + return "Return the public key of this server"; + } + + @Override + public Category getCategory() { + return Category.ADMIN; + } +} diff --git a/solr/core/src/java/org/apache/solr/servlet/HttpSolrCall.java b/solr/core/src/java/org/apache/solr/servlet/HttpSolrCall.java index d1347298505..b297a4430a1 100644 --- a/solr/core/src/java/org/apache/solr/servlet/HttpSolrCall.java +++ b/solr/core/src/java/org/apache/solr/servlet/HttpSolrCall.java @@ -97,7 +97,7 @@ import org.apache.solr.security.AuthorizationContext; import org.apache.solr.security.AuthorizationContext.CollectionRequest; import org.apache.solr.security.AuthorizationContext.RequestType; import org.apache.solr.security.AuthorizationResponse; -import org.apache.solr.security.PKIAuthenticationPlugin; +import org.apache.solr.security.PublicKeyHandler; import org.apache.solr.servlet.SolrDispatchFilter.Action; import org.apache.solr.servlet.cache.HttpCacheHeaderUtil; import org.apache.solr.servlet.cache.Method; @@ -547,7 +547,7 @@ public class HttpSolrCall { } private boolean shouldAuthorize() { - if(PKIAuthenticationPlugin.PATH.equals(path)) return false; + if(PublicKeyHandler.PATH.equals(path)) return false; //admin/info/key is the path where public key is exposed . 
it is always unsecured if (cores.getPkiAuthenticationPlugin() != null && req.getUserPrincipal() != null) { boolean b = cores.getPkiAuthenticationPlugin().needsAuthorization(req); diff --git a/solr/core/src/java/org/apache/solr/servlet/SolrDispatchFilter.java b/solr/core/src/java/org/apache/solr/servlet/SolrDispatchFilter.java index c7fdd57f90d..78e58d000aa 100644 --- a/solr/core/src/java/org/apache/solr/servlet/SolrDispatchFilter.java +++ b/solr/core/src/java/org/apache/solr/servlet/SolrDispatchFilter.java @@ -16,6 +16,20 @@ */ package org.apache.solr.servlet; +import javax.servlet.FilterChain; +import javax.servlet.FilterConfig; +import javax.servlet.ReadListener; +import javax.servlet.ServletException; +import javax.servlet.ServletInputStream; +import javax.servlet.ServletOutputStream; +import javax.servlet.ServletRequest; +import javax.servlet.ServletResponse; +import javax.servlet.UnavailableException; +import javax.servlet.WriteListener; +import javax.servlet.http.HttpServletRequest; +import javax.servlet.http.HttpServletRequestWrapper; +import javax.servlet.http.HttpServletResponse; +import javax.servlet.http.HttpServletResponseWrapper; import java.io.ByteArrayInputStream; import java.io.IOException; import java.io.InputStream; @@ -35,21 +49,10 @@ import java.util.concurrent.atomic.AtomicReference; import java.util.regex.Matcher; import java.util.regex.Pattern; -import javax.servlet.FilterChain; -import javax.servlet.FilterConfig; -import javax.servlet.ReadListener; -import javax.servlet.ServletException; -import javax.servlet.ServletInputStream; -import javax.servlet.ServletOutputStream; -import javax.servlet.ServletRequest; -import javax.servlet.ServletResponse; -import javax.servlet.UnavailableException; -import javax.servlet.WriteListener; -import javax.servlet.http.HttpServletRequest; -import javax.servlet.http.HttpServletRequestWrapper; -import javax.servlet.http.HttpServletResponse; -import javax.servlet.http.HttpServletResponseWrapper; - +import 
com.codahale.metrics.jvm.ClassLoadingGaugeSet; +import com.codahale.metrics.jvm.GarbageCollectorMetricSet; +import com.codahale.metrics.jvm.MemoryUsageGaugeSet; +import com.codahale.metrics.jvm.ThreadStatesGaugeSet; import org.apache.commons.io.FileCleaningTracker; import org.apache.commons.lang.StringUtils; import org.apache.http.client.HttpClient; @@ -72,17 +75,13 @@ import org.apache.solr.metrics.SolrMetricManager; import org.apache.solr.request.SolrRequestInfo; import org.apache.solr.security.AuthenticationPlugin; import org.apache.solr.security.PKIAuthenticationPlugin; +import org.apache.solr.security.PublicKeyHandler; import org.apache.solr.util.SolrFileCleaningTracker; import org.apache.solr.util.StartupLoggingUtils; import org.apache.solr.util.configuration.SSLConfigurationsFactory; import org.slf4j.Logger; import org.slf4j.LoggerFactory; -import com.codahale.metrics.jvm.ClassLoadingGaugeSet; -import com.codahale.metrics.jvm.GarbageCollectorMetricSet; -import com.codahale.metrics.jvm.MemoryUsageGaugeSet; -import com.codahale.metrics.jvm.ThreadStatesGaugeSet; - /** * This filter looks at the incoming URL maps them to handlers defined in solrconfig.xml * @@ -441,8 +440,8 @@ public class SolrDispatchFilter extends BaseSolrFilter { // /admin/info/key must be always open. 
see SOLR-9188 // tests work only w/ getPathInfo //otherwise it's just enough to have getServletPath() - if (PKIAuthenticationPlugin.PATH.equals(request.getServletPath()) || - PKIAuthenticationPlugin.PATH.equals(request.getPathInfo())) return true; + if (PublicKeyHandler.PATH.equals(request.getServletPath()) || + PublicKeyHandler.PATH.equals(request.getPathInfo())) return true; String header = request.getHeader(PKIAuthenticationPlugin.HEADER); if (header != null && cores.getPkiAuthenticationPlugin() != null) authenticationPlugin = cores.getPkiAuthenticationPlugin(); diff --git a/solr/core/src/java/org/apache/solr/uninverting/UninvertingReader.java b/solr/core/src/java/org/apache/solr/uninverting/UninvertingReader.java index 967db541414..9f0f5271c67 100644 --- a/solr/core/src/java/org/apache/solr/uninverting/UninvertingReader.java +++ b/solr/core/src/java/org/apache/solr/uninverting/UninvertingReader.java @@ -282,7 +282,7 @@ public class UninvertingReader extends FilterLeafReader { } filteredInfos.add(new FieldInfo(fi.name, fi.number, fi.hasVectors(), fi.omitsNorms(), fi.hasPayloads(), fi.getIndexOptions(), type, fi.getDocValuesGen(), fi.attributes(), - fi.getPointDimensionCount(), fi.getPointNumBytes())); + fi.getPointDimensionCount(), fi.getPointNumBytes(), fi.isSoftDeletesField())); } fieldInfos = new FieldInfos(filteredInfos.toArray(new FieldInfo[filteredInfos.size()])); } diff --git a/solr/core/src/java/org/apache/solr/update/CdcrTransactionLog.java b/solr/core/src/java/org/apache/solr/update/CdcrTransactionLog.java index 3534f622908..f668540325e 100644 --- a/solr/core/src/java/org/apache/solr/update/CdcrTransactionLog.java +++ b/solr/core/src/java/org/apache/solr/update/CdcrTransactionLog.java @@ -41,7 +41,7 @@ import org.slf4j.LoggerFactory; * methods {@link #incref()}, {@link #close()} and {@link #reopenOutputStream()}. *

  • encode the number of records in the tlog file in the last commit record. The number of records will be * decoded and reuse if the tlog file is reopened. This is achieved by extending the constructor, and the - * methods {@link #writeCommit(CommitUpdateCommand, int)} and {@link #getReader(long)}.
  • + * methods {@link #writeCommit(CommitUpdateCommand)} and {@link #getReader(long)}. * */ public class CdcrTransactionLog extends TransactionLog { @@ -108,7 +108,7 @@ public class CdcrTransactionLog extends TransactionLog { } @Override - public long write(AddUpdateCommand cmd, long prevPointer, int flags) { + public long write(AddUpdateCommand cmd, long prevPointer) { assert (-1 <= prevPointer && (cmd.isInPlaceUpdate() || (-1 == prevPointer))); LogCodec codec = new LogCodec(resolver); @@ -125,7 +125,7 @@ public class CdcrTransactionLog extends TransactionLog { codec.init(out); if (cmd.isInPlaceUpdate()) { codec.writeTag(JavaBinCodec.ARR, 6); - codec.writeInt(UpdateLog.UPDATE_INPLACE | flags); // should just take one byte + codec.writeInt(UpdateLog.UPDATE_INPLACE); // should just take one byte codec.writeLong(cmd.getVersion()); codec.writeLong(prevPointer); codec.writeLong(cmd.prevVersion); @@ -141,7 +141,7 @@ public class CdcrTransactionLog extends TransactionLog { } else { codec.writeTag(JavaBinCodec.ARR, 4); - codec.writeInt(UpdateLog.ADD | flags); // should just take one byte + codec.writeInt(UpdateLog.ADD); // should just take one byte codec.writeLong(cmd.getVersion()); if (cmd.getReq().getParamString().contains(CdcrUpdateProcessor.CDCR_UPDATE)) { // if the update is received via cdcr source; add extra boolean entry @@ -179,7 +179,7 @@ public class CdcrTransactionLog extends TransactionLog { } @Override - public long writeDelete(DeleteUpdateCommand cmd, int flags) { + public long writeDelete(DeleteUpdateCommand cmd) { LogCodec codec = new LogCodec(resolver); try { @@ -190,7 +190,7 @@ public class CdcrTransactionLog extends TransactionLog { MemOutputStream out = new MemOutputStream(new byte[20 + br.length]); codec.init(out); codec.writeTag(JavaBinCodec.ARR, 4); - codec.writeInt(UpdateLog.DELETE | flags); // should just take one byte + codec.writeInt(UpdateLog.DELETE); // should just take one byte codec.writeLong(cmd.getVersion()); 
codec.writeByteArray(br.bytes, br.offset, br.length); if (cmd.getReq().getParamString().contains(CdcrUpdateProcessor.CDCR_UPDATE)) { @@ -217,7 +217,7 @@ public class CdcrTransactionLog extends TransactionLog { } @Override - public long writeDeleteByQuery(DeleteUpdateCommand cmd, int flags) { + public long writeDeleteByQuery(DeleteUpdateCommand cmd) { LogCodec codec = new LogCodec(resolver); try { checkWriteHeader(codec, null); @@ -225,7 +225,7 @@ public class CdcrTransactionLog extends TransactionLog { MemOutputStream out = new MemOutputStream(new byte[20 + (cmd.query.length())]); codec.init(out); codec.writeTag(JavaBinCodec.ARR, 4); - codec.writeInt(UpdateLog.DELETE_BY_QUERY | flags); // should just take one byte + codec.writeInt(UpdateLog.DELETE_BY_QUERY); // should just take one byte codec.writeLong(cmd.getVersion()); codec.writeStr(cmd.query); if (cmd.getReq().getParamString().contains(CdcrUpdateProcessor.CDCR_UPDATE)) { @@ -249,7 +249,7 @@ public class CdcrTransactionLog extends TransactionLog { } @Override - public long writeCommit(CommitUpdateCommand cmd, int flags) { + public long writeCommit(CommitUpdateCommand cmd) { LogCodec codec = new LogCodec(resolver); synchronized (this) { try { @@ -261,7 +261,7 @@ public class CdcrTransactionLog extends TransactionLog { } codec.init(fos); codec.writeTag(JavaBinCodec.ARR, 4); - codec.writeInt(UpdateLog.COMMIT | flags); // should just take one byte + codec.writeInt(UpdateLog.COMMIT); // should just take one byte codec.writeLong(cmd.getVersion()); codec.writeTag(JavaBinCodec.INT); // Enforce the encoding of a plain integer, to simplify decoding fos.writeInt(numRecords + 1); // the number of records in the file - +1 to account for the commit operation being written diff --git a/solr/core/src/java/org/apache/solr/update/CdcrUpdateLog.java b/solr/core/src/java/org/apache/solr/update/CdcrUpdateLog.java index 6b202044d76..bff16122ecf 100644 --- a/solr/core/src/java/org/apache/solr/update/CdcrUpdateLog.java +++ 
b/solr/core/src/java/org/apache/solr/update/CdcrUpdateLog.java @@ -352,7 +352,6 @@ public class CdcrUpdateLog extends UpdateLog { long latestVersion = startingUpdates.getMaxRecentVersion(); try { startingVersions = startingUpdates.getVersions(numRecordsToKeep); - startingOperation = startingUpdates.getLatestOperation(); // populate recent deletes list (since we can't get that info from the index) for (int i=startingUpdates.deleteList.size()-1; i>=0; i--) { @@ -389,9 +388,7 @@ public class CdcrUpdateLog extends UpdateLog { */ private void copyBufferedUpdates(File tlogSrc, long offsetSrc, long latestVersion) { recoveryInfo = new RecoveryInfo(); - recoveryInfo.positionOfStart = tlog == null ? 0 : tlog.snapshot(); state = State.BUFFERING; - operationFlags |= FLAG_GAP; ModifiableSolrParams params = new ModifiableSolrParams(); params.set(DistributingUpdateProcessorFactory.DISTRIB_UPDATE_PARAM, DistributedUpdateProcessor.DistribPhase.FROMLEADER.toString()); diff --git a/solr/core/src/java/org/apache/solr/update/HdfsTransactionLog.java b/solr/core/src/java/org/apache/solr/update/HdfsTransactionLog.java index 0f89016a107..8ed7d7ad65a 100644 --- a/solr/core/src/java/org/apache/solr/update/HdfsTransactionLog.java +++ b/solr/core/src/java/org/apache/solr/update/HdfsTransactionLog.java @@ -166,20 +166,6 @@ public class HdfsTransactionLog extends TransactionLog { } return true; } - - // This could mess with any readers or reverse readers that are open, or anything that might try to do a log lookup. - // This should only be used to roll back buffered updates, not actually applied updates. - @Override - public void rollback(long pos) throws IOException { - synchronized (this) { - assert snapshot_size == pos; - ensureFlushed(); - // TODO: how do we rollback with hdfs?? 
We need HDFS-3107 - fos.setWritten(pos); - assert fos.size() == pos; - numRecords = snapshot_numRecords; - } - } private void readHeader(FastInputStream fis) throws IOException { // read existing header @@ -210,7 +196,7 @@ public class HdfsTransactionLog extends TransactionLog { } @Override - public long writeCommit(CommitUpdateCommand cmd, int flags) { + public long writeCommit(CommitUpdateCommand cmd) { LogCodec codec = new LogCodec(resolver); synchronized (this) { try { @@ -223,7 +209,7 @@ public class HdfsTransactionLog extends TransactionLog { codec.init(fos); codec.writeTag(JavaBinCodec.ARR, 3); - codec.writeInt(UpdateLog.COMMIT | flags); // should just take one byte + codec.writeInt(UpdateLog.COMMIT); // should just take one byte codec.writeLong(cmd.getVersion()); codec.writeStr(END_MESSAGE); // ensure these bytes are (almost) last in the file diff --git a/solr/core/src/java/org/apache/solr/update/HdfsUpdateLog.java b/solr/core/src/java/org/apache/solr/update/HdfsUpdateLog.java index 7bb74d05bf9..8ca4b1cb3e5 100644 --- a/solr/core/src/java/org/apache/solr/update/HdfsUpdateLog.java +++ b/solr/core/src/java/org/apache/solr/update/HdfsUpdateLog.java @@ -65,37 +65,6 @@ public class HdfsUpdateLog extends UpdateLog { this.confDir = confDir; } - // HACK - // while waiting for HDFS-3107, instead of quickly - // dropping, we slowly apply - // This is somewhat brittle, but current usage - // allows for it - @Override - public boolean dropBufferedUpdates() { - versionInfo.blockUpdates(); - try { - if (state != State.BUFFERING) return false; - - if (log.isInfoEnabled()) { - log.info("Dropping buffered updates " + this); - } - - // since we blocked updates, this synchronization shouldn't strictly be - // necessary. 
- synchronized (this) { - if (tlog != null) { - // tlog.rollback(recoveryInfo.positionOfStart); - } - } - - state = State.ACTIVE; - operationFlags &= ~FLAG_GAP; - } finally { - versionInfo.unblockUpdates(); - } - return true; - } - @Override public void init(PluginInfo info) { super.init(info); @@ -186,6 +155,11 @@ public class HdfsUpdateLog extends UpdateLog { throw new RuntimeException("Problem creating directory: " + tlogDir, e); } } + + String[] oldBufferTlog = getBufferLogList(fs, tlogDir); + if (oldBufferTlog != null && oldBufferTlog.length != 0) { + existOldBufferLog = true; + } tlogFiles = getLogList(fs, tlogDir); id = getLastLogId() + 1; // add 1 since we will create a new log for the @@ -241,7 +215,6 @@ public class HdfsUpdateLog extends UpdateLog { // non-complete tlogs. try (RecentUpdates startingUpdates = getRecentUpdates()) { startingVersions = startingUpdates.getVersions(getNumRecordsToKeep()); - startingOperation = startingUpdates.getLatestOperation(); // populate recent deletes list (since we can't get that info from the // index) @@ -269,6 +242,23 @@ public class HdfsUpdateLog extends UpdateLog { public String getLogDir() { return tlogDir.toUri().toString(); } + + public static String[] getBufferLogList(FileSystem fs, Path tlogDir) { + final String prefix = BUFFER_TLOG_NAME+'.'; + assert fs != null; + FileStatus[] fileStatuses; + try { + fileStatuses = fs.listStatus(tlogDir, path -> path.getName().startsWith(prefix)); + } catch (IOException e) { + throw new SolrException(ErrorCode.SERVER_ERROR, "Failed on listing old buffer tlog", e); + } + + String[] names = new String[fileStatuses.length]; + for (int i = 0; i < fileStatuses.length; i++) { + names[i] = fileStatuses[i].getPath().getName(); + } + return names; + } public static String[] getLogList(FileSystem fs, Path tlogDir) { final String prefix = TLOG_NAME + '.'; @@ -307,7 +297,35 @@ public class HdfsUpdateLog extends UpdateLog { IOUtils.closeQuietly(fs); } } - + + @Override + protected void 
ensureBufferTlog() { + if (bufferTlog != null) return; + String newLogName = String.format(Locale.ROOT, LOG_FILENAME_PATTERN, BUFFER_TLOG_NAME, System.nanoTime()); + bufferTlog = new HdfsTransactionLog(fs, new Path(tlogDir, newLogName), + globalStrings, tlogDfsReplication); + } + + @Override + protected void deleteBufferLogs() { + // Delete old buffer logs + String[] oldBufferTlog = getBufferLogList(fs, tlogDir); + if (oldBufferTlog != null && oldBufferTlog.length != 0) { + for (String oldBufferLogName : oldBufferTlog) { + Path f = new Path(tlogDir, oldBufferLogName); + try { + boolean s = fs.delete(f, false); + if (!s) { + log.error("Could not remove old buffer tlog file:" + f); + } + } catch (IOException e) { + // No need to bubble up this exception, because it won't cause any problems on recovering + log.error("Could not remove old buffer tlog file:" + f, e); + } + } + } + } + @Override protected void ensureLog() { if (tlog == null) { diff --git a/solr/core/src/java/org/apache/solr/update/TransactionLog.java b/solr/core/src/java/org/apache/solr/update/TransactionLog.java index 96a928cc1a8..2a23896d491 100644 --- a/solr/core/src/java/org/apache/solr/update/TransactionLog.java +++ b/solr/core/src/java/org/apache/solr/update/TransactionLog.java @@ -85,9 +85,6 @@ public class TransactionLog implements Closeable { Map globalStringMap = new HashMap<>(); List globalStringList = new ArrayList<>(); - long snapshot_size; - int snapshot_numRecords; - // write a BytesRef as a byte array static final JavaBinCodec.ObjectResolver resolver = new JavaBinCodec.ObjectResolver() { @Override @@ -153,7 +150,7 @@ public class TransactionLog implements Closeable { // Parse tlog id from the filename String filename = tlogFile.getName(); - id = Long.parseLong(filename.substring(filename.indexOf('.') + 1, filename.indexOf('.') + 20)); + id = Long.parseLong(filename.substring(filename.lastIndexOf('.')+1)); this.tlogFile = tlogFile; raf = new RandomAccessFile(this.tlogFile, "rw"); @@ 
-233,29 +230,6 @@ public class TransactionLog implements Closeable { return true; } - /** takes a snapshot of the current position and number of records - * for later possible rollback, and returns the position */ - public long snapshot() { - synchronized (this) { - snapshot_size = fos.size(); - snapshot_numRecords = numRecords; - return snapshot_size; - } - } - - // This could mess with any readers or reverse readers that are open, or anything that might try to do a log lookup. - // This should only be used to roll back buffered updates, not actually applied updates. - public void rollback(long pos) throws IOException { - synchronized (this) { - assert snapshot_size == pos; - fos.flush(); - raf.setLength(pos); - fos.setWritten(pos); - assert fos.size() == pos; - numRecords = snapshot_numRecords; - } - } - public long writeData(Object o) { @SuppressWarnings("resource") final LogCodec codec = new LogCodec(resolver); try { @@ -346,17 +320,16 @@ public class TransactionLog implements Closeable { /** * Writes an add update command to the transaction log. This is not applicable for - * in-place updates; use {@link #write(AddUpdateCommand, long, int)}. + * in-place updates; use {@link #write(AddUpdateCommand, long)}. * (The previous pointer (applicable for in-place updates) is set to -1 while writing * the command to the transaction log.) 
* @param cmd The add update command to be written - * @param flags Options for writing the command to the transaction log * @return Returns the position pointer of the written update command * - * @see #write(AddUpdateCommand, long, int) + * @see #write(AddUpdateCommand, long) */ - public long write(AddUpdateCommand cmd, int flags) { - return write(cmd, -1, flags); + public long write(AddUpdateCommand cmd) { + return write(cmd, -1); } /** @@ -365,10 +338,9 @@ public class TransactionLog implements Closeable { * @param cmd The add update command to be written * @param prevPointer The pointer in the transaction log which this update depends * on (applicable for in-place updates) - * @param flags Options for writing the command to the transaction log * @return Returns the position pointer of the written update command */ - public long write(AddUpdateCommand cmd, long prevPointer, int flags) { + public long write(AddUpdateCommand cmd, long prevPointer) { assert (-1 <= prevPointer && (cmd.isInPlaceUpdate() || (-1 == prevPointer))); LogCodec codec = new LogCodec(resolver); @@ -386,14 +358,14 @@ public class TransactionLog implements Closeable { codec.init(out); if (cmd.isInPlaceUpdate()) { codec.writeTag(JavaBinCodec.ARR, 5); - codec.writeInt(UpdateLog.UPDATE_INPLACE | flags); // should just take one byte + codec.writeInt(UpdateLog.UPDATE_INPLACE); // should just take one byte codec.writeLong(cmd.getVersion()); codec.writeLong(prevPointer); codec.writeLong(cmd.prevVersion); codec.writeSolrInputDocument(cmd.getSolrInputDocument()); } else { codec.writeTag(JavaBinCodec.ARR, 3); - codec.writeInt(UpdateLog.ADD | flags); // should just take one byte + codec.writeInt(UpdateLog.ADD); // should just take one byte codec.writeLong(cmd.getVersion()); codec.writeSolrInputDocument(cmd.getSolrInputDocument()); } @@ -422,7 +394,7 @@ public class TransactionLog implements Closeable { } } - public long writeDelete(DeleteUpdateCommand cmd, int flags) { + public long 
writeDelete(DeleteUpdateCommand cmd) { LogCodec codec = new LogCodec(resolver); try { @@ -433,7 +405,7 @@ public class TransactionLog implements Closeable { MemOutputStream out = new MemOutputStream(new byte[20 + br.length]); codec.init(out); codec.writeTag(JavaBinCodec.ARR, 3); - codec.writeInt(UpdateLog.DELETE | flags); // should just take one byte + codec.writeInt(UpdateLog.DELETE); // should just take one byte codec.writeLong(cmd.getVersion()); codec.writeByteArray(br.bytes, br.offset, br.length); @@ -452,7 +424,7 @@ public class TransactionLog implements Closeable { } - public long writeDeleteByQuery(DeleteUpdateCommand cmd, int flags) { + public long writeDeleteByQuery(DeleteUpdateCommand cmd) { LogCodec codec = new LogCodec(resolver); try { checkWriteHeader(codec, null); @@ -460,7 +432,7 @@ public class TransactionLog implements Closeable { MemOutputStream out = new MemOutputStream(new byte[20 + (cmd.query.length())]); codec.init(out); codec.writeTag(JavaBinCodec.ARR, 3); - codec.writeInt(UpdateLog.DELETE_BY_QUERY | flags); // should just take one byte + codec.writeInt(UpdateLog.DELETE_BY_QUERY); // should just take one byte codec.writeLong(cmd.getVersion()); codec.writeStr(cmd.query); @@ -478,7 +450,7 @@ public class TransactionLog implements Closeable { } - public long writeCommit(CommitUpdateCommand cmd, int flags) { + public long writeCommit(CommitUpdateCommand cmd) { LogCodec codec = new LogCodec(resolver); synchronized (this) { try { @@ -490,7 +462,7 @@ public class TransactionLog implements Closeable { } codec.init(fos); codec.writeTag(JavaBinCodec.ARR, 3); - codec.writeInt(UpdateLog.COMMIT | flags); // should just take one byte + codec.writeInt(UpdateLog.COMMIT); // should just take one byte codec.writeLong(cmd.getVersion()); codec.writeStr(END_MESSAGE); // ensure these bytes are (almost) last in the file diff --git a/solr/core/src/java/org/apache/solr/update/UpdateLog.java b/solr/core/src/java/org/apache/solr/update/UpdateLog.java index 
7f821eafc0e..1bda23fc038 100644 --- a/solr/core/src/java/org/apache/solr/update/UpdateLog.java +++ b/solr/core/src/java/org/apache/solr/update/UpdateLog.java @@ -96,6 +96,7 @@ public class UpdateLog implements PluginInfoInitialized, SolrMetricProducer { private static final long STATUS_TIME = TimeUnit.NANOSECONDS.convert(60, TimeUnit.SECONDS); public static String LOG_FILENAME_PATTERN = "%s.%019d"; public static String TLOG_NAME="tlog"; + public static String BUFFER_TLOG_NAME="buffer.tlog"; private static final Logger log = LoggerFactory.getLogger(MethodHandles.lookup().lookupClass()); private boolean debug = log.isDebugEnabled(); @@ -139,11 +140,7 @@ public class UpdateLog implements PluginInfoInitialized, SolrMetricProducer { public static final int DELETE_BY_QUERY = 0x03; public static final int COMMIT = 0x04; public static final int UPDATE_INPLACE = 0x08; - // Flag indicating that this is a buffered operation, and that a gap exists before buffering started. - // for example, if full index replication starts and we are buffering updates, then this flag should - // be set to indicate that replaying the log would not bring us into sync (i.e. peersync should - // fail if this flag is set on the last update in the tlog). - public static final int FLAG_GAP = 0x10; + // For backward-compatibility, we should delete this field in 9.0 public static final int OPERATION_MASK = 0x0f; // mask off flags to get the operation /** @@ -186,8 +183,8 @@ public class UpdateLog implements PluginInfoInitialized, SolrMetricProducer { long id = -1; protected State state = State.ACTIVE; - protected int operationFlags; // flags to write in the transaction log with operations (i.e. 
FLAG_GAP) + protected TransactionLog bufferTlog; protected TransactionLog tlog; protected TransactionLog prevTlog; protected final Deque logs = new LinkedList<>(); // list of recent logs, newest first @@ -206,6 +203,7 @@ public class UpdateLog implements PluginInfoInitialized, SolrMetricProducer { protected int maxNumLogsToKeep; protected int numVersionBuckets; // This should only be used to initialize VersionInfo... the actual number of buckets may be rounded up to a power of two. protected Long maxVersionFromIndex = null; + protected boolean existOldBufferLog = false; // keep track of deletes only... this is not updated on an add protected LinkedHashMap oldDeletes = new LinkedHashMap(numDeletesToKeep) { @@ -244,7 +242,6 @@ public class UpdateLog implements PluginInfoInitialized, SolrMetricProducer { volatile UpdateHandler uhandler; // a core reload can change this reference! protected volatile boolean cancelApplyBufferUpdate; List startingVersions; - int startingOperation; // last operation in the logs on startup // metrics protected Gauge bufferedOpsGauge; @@ -378,6 +375,10 @@ public class UpdateLog implements PluginInfoInitialized, SolrMetricProducer { log.debug("UpdateHandler init: tlogDir=" + tlogDir + ", existing tlogs=" + Arrays.asList(tlogFiles) + ", next id=" + id); } + String[] oldBufferTlog = getBufferLogList(tlogDir); + if (oldBufferTlog != null && oldBufferTlog.length != 0) { + existOldBufferLog = true; + } TransactionLog oldLog = null; for (String oldLogName : tlogFiles) { File f = new File(tlogDir, oldLogName); @@ -408,7 +409,6 @@ public class UpdateLog implements PluginInfoInitialized, SolrMetricProducer { // TODO: these startingVersions assume that we successfully recover from all non-complete tlogs. 
try (RecentUpdates startingUpdates = getRecentUpdates()) { startingVersions = startingUpdates.getVersions(numRecordsToKeep); - startingOperation = startingUpdates.getLatestOperation(); // populate recent deletes list (since we can't get that info from the index) for (int i = startingUpdates.deleteList.size() - 1; i >= 0; i--) { @@ -434,14 +434,16 @@ public class UpdateLog implements PluginInfoInitialized, SolrMetricProducer { this.metricManager = manager; this.registryName = registry; bufferedOpsGauge = () -> { + if (state == State.BUFFERING) { + if (bufferTlog == null) return 0; + // numRecords counts header as a record + return bufferTlog.numRecords() - 1; + } if (tlog == null) { return 0; } else if (state == State.APPLYING_BUFFERED) { // numRecords counts header as a record return tlog.numRecords() - 1 - recoveryInfo.adds - recoveryInfo.deleteByQuery - recoveryInfo.deletes - recoveryInfo.errors; - } else if (state == State.BUFFERING) { - // numRecords counts header as a record - return tlog.numRecords() - 1; } else { return 0; } @@ -472,8 +474,8 @@ public class UpdateLog implements PluginInfoInitialized, SolrMetricProducer { return startingVersions; } - public int getStartingOperation() { - return startingOperation; + public boolean existOldBufferLog() { + return existOldBufferLog; } /* Takes over ownership of the log, keeping it until no longer needed @@ -509,6 +511,19 @@ public class UpdateLog implements PluginInfoInitialized, SolrMetricProducer { logs.addFirst(oldLog); } + public String[] getBufferLogList(File directory) { + final String prefix = BUFFER_TLOG_NAME+'.'; + return directory.list((dir, name) -> name.startsWith(prefix)); + } + + /** + * Does update from old tlogs (not from buffer tlog)? 
+ * If yes we must skip writing {@code cmd} to current tlog + */ + private boolean updateFromOldTlogs(UpdateCommand cmd) { + return (cmd.getFlags() & UpdateCommand.REPLAY) != 0 && state == State.REPLAYING; + } + public String[] getLogList(File directory) { final String prefix = TLOG_NAME+'.'; String[] names = directory.list(new FilenameFilter() { @@ -541,14 +556,19 @@ public class UpdateLog implements PluginInfoInitialized, SolrMetricProducer { // if ((cmd.getFlags() & UpdateCommand.REPLAY) != 0) return; synchronized (this) { - long pos = -1; + if ((cmd.getFlags() & UpdateCommand.BUFFERING) != 0) { + ensureBufferTlog(); + bufferTlog.write(cmd); + return; + } + long pos = -1; long prevPointer = getPrevPointerForUpdate(cmd); // don't log if we are replaying from another log - if ((cmd.getFlags() & UpdateCommand.REPLAY) == 0) { + if (!updateFromOldTlogs(cmd)) { ensureLog(); - pos = tlog.write(cmd, prevPointer, operationFlags); + pos = tlog.write(cmd, prevPointer); } if (!clearCaches) { @@ -556,10 +576,7 @@ public class UpdateLog implements PluginInfoInitialized, SolrMetricProducer { // Only currently would be useful for RTG while in recovery mode though. 
LogPtr ptr = new LogPtr(pos, cmd.getVersion(), prevPointer); - // only update our map if we're not buffering - if ((cmd.getFlags() & UpdateCommand.BUFFERING) == 0) { - map.put(cmd.getIndexedId(), ptr); - } + map.put(cmd.getIndexedId(), ptr); if (trace) { log.trace("TLOG: added id " + cmd.getPrintableId() + " to " + tlog + " " + ptr + " map=" + System.identityHashCode(map)); @@ -606,22 +623,21 @@ public class UpdateLog implements PluginInfoInitialized, SolrMetricProducer { BytesRef br = cmd.getIndexedId(); synchronized (this) { - long pos = -1; + if ((cmd.getFlags() & UpdateCommand.BUFFERING) != 0) { + ensureBufferTlog(); + bufferTlog.writeDelete(cmd); + return; + } - // don't log if we are replaying from another log - if ((cmd.getFlags() & UpdateCommand.REPLAY) == 0) { + long pos = -1; + if (!updateFromOldTlogs(cmd)) { ensureLog(); - pos = tlog.writeDelete(cmd, operationFlags); + pos = tlog.writeDelete(cmd); } LogPtr ptr = new LogPtr(pos, cmd.version); - - // only update our map if we're not buffering - if ((cmd.getFlags() & UpdateCommand.BUFFERING) == 0) { - map.put(br, ptr); - - oldDeletes.put(br, ptr); - } + map.put(br, ptr); + oldDeletes.put(br, ptr); if (trace) { log.trace("TLOG: added delete for id " + cmd.id + " to " + tlog + " " + ptr + " map=" + System.identityHashCode(map)); @@ -631,15 +647,20 @@ public class UpdateLog implements PluginInfoInitialized, SolrMetricProducer { public void deleteByQuery(DeleteUpdateCommand cmd) { synchronized (this) { - long pos = -1; - // don't log if we are replaying from another log - if ((cmd.getFlags() & UpdateCommand.REPLAY) == 0) { - ensureLog(); - pos = tlog.writeDeleteByQuery(cmd, operationFlags); + if ((cmd.getFlags() & UpdateCommand.BUFFERING) != 0) { + ensureBufferTlog(); + bufferTlog.writeDeleteByQuery(cmd); + return; } - // only change our caches if we are not buffering - if ((cmd.getFlags() & UpdateCommand.BUFFERING) == 0 && (cmd.getFlags() & UpdateCommand.IGNORE_INDEXWRITER) == 0) { + long pos = -1; + if 
(!updateFromOldTlogs(cmd)) { + ensureLog(); + pos = tlog.writeDeleteByQuery(cmd); + } + + // skip purge our caches in case of tlog replica + if ((cmd.getFlags() & UpdateCommand.IGNORE_INDEXWRITER) == 0) { // given that we just did a delete-by-query, we don't know what documents were // affected and hence we must purge our caches. openRealtimeSearcher(); @@ -802,7 +823,7 @@ public class UpdateLog implements PluginInfoInitialized, SolrMetricProducer { if (prevTlog != null) { // if we made it through the commit, write a commit command to the log // TODO: check that this works to cap a tlog we were using to buffer so we don't replay on startup. - prevTlog.writeCommit(cmd, operationFlags); + prevTlog.writeCommit(cmd); addOldLog(prevTlog, true); // the old log list will decref when no longer needed @@ -1152,9 +1173,16 @@ public class UpdateLog implements PluginInfoInitialized, SolrMetricProducer { public void copyOverBufferingUpdates(CommitUpdateCommand cuc) { versionInfo.blockUpdates(); try { - operationFlags &= ~FLAG_GAP; - state = State.ACTIVE; - copyAndSwitchToNewTlog(cuc); + synchronized (this) { + state = State.ACTIVE; + if (bufferTlog == null) { + return; + } + // by calling this, we won't switch to new tlog (compared to applyBufferedUpdates()) + // if we switch to new tlog we can possible lose updates on the next fetch + copyOverOldUpdates(cuc.getVersion(), bufferTlog); + dropBufferTlog(); + } } finally { versionInfo.unblockUpdates(); } @@ -1165,33 +1193,25 @@ public class UpdateLog implements PluginInfoInitialized, SolrMetricProducer { * So any updates which hasn't made it to the index is preserved in the current tlog * @param cuc any updates that have version larger than the version of cuc will be copied over */ - public void copyOverOldUpdates(CommitUpdateCommand cuc) { + public void commitAndSwitchToNewTlog(CommitUpdateCommand cuc) { versionInfo.blockUpdates(); try { - copyAndSwitchToNewTlog(cuc); + synchronized (this) { + if (tlog == null) { + return; + } + 
preCommit(cuc); + try { + copyOverOldUpdates(cuc.getVersion()); + } finally { + postCommit(cuc); + } + } } finally { versionInfo.unblockUpdates(); } } - protected void copyAndSwitchToNewTlog(CommitUpdateCommand cuc) { - synchronized (this) { - if (tlog == null) { - return; - } - preCommit(cuc); - try { - copyOverOldUpdates(cuc.getVersion()); - } finally { - postCommit(cuc); - } - } - } - - /** - * Copy over updates from prevTlog or last tlog (in tlog folder) to a new tlog - * @param commitVersion any updates that have version larger than the commitVersion will be copied over - */ public void copyOverOldUpdates(long commitVersion) { TransactionLog oldTlog = prevTlog; if (oldTlog == null && !logs.isEmpty()) { @@ -1207,6 +1227,14 @@ public class UpdateLog implements PluginInfoInitialized, SolrMetricProducer { log.warn("Exception reading log", e); return; } + copyOverOldUpdates(commitVersion, oldTlog); + } + + /** + * Copy over updates from prevTlog or last tlog (in tlog folder) to a new tlog + * @param commitVersion any updates that have version larger than the commitVersion will be copied over + */ + public void copyOverOldUpdates(long commitVersion, TransactionLog oldTlog) { copyOverOldUpdatesMeter.mark(); SolrQueryRequest req = new LocalSolrQueryRequest(uhandler.core, @@ -1270,6 +1298,22 @@ public class UpdateLog implements PluginInfoInitialized, SolrMetricProducer { } } + protected void ensureBufferTlog() { + if (bufferTlog != null) return; + String newLogName = String.format(Locale.ROOT, LOG_FILENAME_PATTERN, BUFFER_TLOG_NAME, System.nanoTime()); + bufferTlog = newTransactionLog(new File(tlogDir, newLogName), globalStrings, false); + } + + // Cleanup old buffer tlogs + protected void deleteBufferLogs() { + String[] oldBufferTlog = getBufferLogList(tlogDir); + if (oldBufferTlog != null && oldBufferTlog.length != 0) { + for (String oldBufferLogName : oldBufferTlog) { + deleteFile(new File(tlogDir, oldBufferLogName)); + } + } + } + protected void ensureLog() { if 
(tlog == null) { @@ -1285,7 +1329,7 @@ public class UpdateLog implements PluginInfoInitialized, SolrMetricProducer { // record a commit log.info("Recording current closed for " + uhandler.core + " log=" + theLog); CommitUpdateCommand cmd = new CommitUpdateCommand(new LocalSolrQueryRequest(uhandler.core, new ModifiableSolrParams((SolrParams)null)), false); - theLog.writeCommit(cmd, operationFlags); + theLog.writeCommit(cmd); } theLog.deleteOnClose = false; @@ -1314,6 +1358,13 @@ public class UpdateLog implements PluginInfoInitialized, SolrMetricProducer { log.forceClose(); } + if (bufferTlog != null) { + // should not delete bufferTlog on close, existing bufferTlog is a sign for skip peerSync + bufferTlog.deleteOnClose = false; + bufferTlog.decref(); + bufferTlog.forceClose(); + } + try { ExecutorUtil.shutdownAndAwaitTermination(recoveryExecutor); } catch (Exception e) { @@ -1347,7 +1398,6 @@ public class UpdateLog implements PluginInfoInitialized, SolrMetricProducer { HashMap updates; List deleteByQueryList; List deleteList; - int latestOperation; public RecentUpdates(Deque logList) { this.logList = logList; @@ -1401,11 +1451,6 @@ public class UpdateLog implements PluginInfoInitialized, SolrMetricProducer { return result; } - public int getLatestOperation() { - return latestOperation; - } - - private void update() { int numUpdates = 0; updateList = new ArrayList<>(logList.size()); @@ -1431,9 +1476,6 @@ public class UpdateLog implements PluginInfoInitialized, SolrMetricProducer { // TODO: refactor this out so we get common error handling int opAndFlags = (Integer)entry.get(UpdateLog.FLAGS_IDX); - if (latestOperation == 0) { - latestOperation = opAndFlags; - } int oper = opAndFlags & UpdateLog.OPERATION_MASK; long version = (Long) entry.get(UpdateLog.VERSION_IDX); @@ -1525,6 +1567,10 @@ public class UpdateLog implements PluginInfoInitialized, SolrMetricProducer { tlog.incref(); logList.addFirst(tlog); } + if (bufferTlog != null) { + bufferTlog.incref(); + 
logList.addFirst(bufferTlog); + } } // TODO: what if I hand out a list of updates, then do an update, then hand out another list (and @@ -1542,13 +1588,13 @@ public class UpdateLog implements PluginInfoInitialized, SolrMetricProducer { // reading state and acting on it in the distributed update processor versionInfo.blockUpdates(); try { - if (state == State.BUFFERING) { - log.info("Restarting buffering. previous=" + recoveryInfo); - } else if (state != State.ACTIVE) { + if (state != State.ACTIVE && state != State.BUFFERING) { // we don't currently have support for handling other states log.warn("Unexpected state for bufferUpdates: " + state + ", Ignoring request."); return; } + dropBufferTlog(); + deleteBufferLogs(); recoveryInfo = new RecoveryInfo(); @@ -1556,15 +1602,7 @@ public class UpdateLog implements PluginInfoInitialized, SolrMetricProducer { log.info("Starting to buffer updates. " + this); } - // since we blocked updates, this synchronization shouldn't strictly be necessary. - synchronized (this) { - recoveryInfo.positionOfStart = tlog == null ? 0 : tlog.snapshot(); - } - state = State.BUFFERING; - - // currently, buffering is only called by recovery, meaning that there is most likely a gap in updates - operationFlags |= FLAG_GAP; } finally { versionInfo.unblockUpdates(); } @@ -1580,25 +1618,24 @@ public class UpdateLog implements PluginInfoInitialized, SolrMetricProducer { log.info("Dropping buffered updates " + this); } - // since we blocked updates, this synchronization shouldn't strictly be necessary. 
- synchronized (this) { - if (tlog != null) { - tlog.rollback(recoveryInfo.positionOfStart); - } - } + dropBufferTlog(); state = State.ACTIVE; - operationFlags &= ~FLAG_GAP; - } catch (IOException e) { - SolrException.log(log,"Error attempting to roll back log", e); - return false; - } - finally { + } finally { versionInfo.unblockUpdates(); } return true; } + private void dropBufferTlog() { + synchronized (this) { + if (bufferTlog != null) { + bufferTlog.decref(); + bufferTlog = null; + } + } + } + /** Returns the Future to wait on, or null if no replay was needed */ public Future applyBufferedUpdates() { @@ -1612,27 +1649,30 @@ public class UpdateLog implements PluginInfoInitialized, SolrMetricProducer { try { cancelApplyBufferUpdate = false; if (state != State.BUFFERING) return null; - operationFlags &= ~FLAG_GAP; - // handle case when no log was even created because no updates - // were received. - if (tlog == null) { - state = State.ACTIVE; - return null; + synchronized (this) { + // handle case when no updates were received. 
+ if (bufferTlog == null) { + state = State.ACTIVE; + return null; + } + bufferTlog.incref(); } - tlog.incref(); + state = State.APPLYING_BUFFERED; } finally { versionInfo.unblockUpdates(); } if (recoveryExecutor.isShutdown()) { - tlog.decref(); throw new RuntimeException("executor is not running..."); } ExecutorCompletionService cs = new ExecutorCompletionService<>(recoveryExecutor); - LogReplayer replayer = new LogReplayer(Arrays.asList(new TransactionLog[]{tlog}), true); - return cs.submit(replayer, recoveryInfo); + LogReplayer replayer = new LogReplayer(Collections.singletonList(bufferTlog), true); + return cs.submit(() -> { + replayer.run(); + dropBufferTlog(); + }, recoveryInfo); } public State getState() { @@ -1903,10 +1943,7 @@ public class UpdateLog implements PluginInfoInitialized, SolrMetricProducer { if (!activeLog) { // if we are replaying an old tlog file, we need to add a commit to the end // so we don't replay it again if we restart right after. - - // if the last operation we replayed had FLAG_GAP set, we want to use that again so we don't lose it - // as the flag on the last operation. 
- translog.writeCommit(cmd, operationFlags | (operationAndFlags & ~OPERATION_MASK)); + translog.writeCommit(cmd); } try { @@ -2037,10 +2074,6 @@ public class UpdateLog implements PluginInfoInitialized, SolrMetricProducer { return cmd; } - public void cancelApplyBufferedUpdates() { - this.cancelApplyBufferUpdate = true; - } - ThreadPoolExecutor recoveryExecutor = new ExecutorUtil.MDCAwareThreadPoolExecutor(0, Integer.MAX_VALUE, 1, TimeUnit.SECONDS, new SynchronousQueue(), new DefaultSolrThreadFactory("recoveryExecutor")); diff --git a/solr/core/src/test-files/solr/collection1/conf/schema-tagger.xml b/solr/core/src/test-files/solr/collection1/conf/schema-tagger.xml new file mode 100644 index 00000000000..051cd10c7a5 --- /dev/null +++ b/solr/core/src/test-files/solr/collection1/conf/schema-tagger.xml @@ -0,0 +1,187 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + id + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/solr/core/src/test-files/solr/collection1/conf/solrconfig-tagger.xml b/solr/core/src/test-files/solr/collection1/conf/solrconfig-tagger.xml new file mode 100644 index 00000000000..e0d367731d5 --- /dev/null +++ b/solr/core/src/test-files/solr/collection1/conf/solrconfig-tagger.xml @@ -0,0 +1,59 @@ + + + + + + ${tests.luceneMatchVersion:LUCENE_CURRENT} + ${solr.data.dir:} + + + + + + + + + + + + + name_tag:[* TO *] + + + + + name_tag:[* TO *] + + + + + + + + + name_tag + + + + diff --git a/solr/core/src/test/org/apache/solr/cloud/RollingRestartTest.java b/solr/core/src/test/org/apache/solr/cloud/RollingRestartTest.java index 14586664ec0..addf732a6df 100644 --- a/solr/core/src/test/org/apache/solr/cloud/RollingRestartTest.java +++ b/solr/core/src/test/org/apache/solr/cloud/RollingRestartTest.java @@ -16,6 +16,11 @@ */ package org.apache.solr.cloud; +import 
java.lang.invoke.MethodHandles; +import java.util.ArrayList; +import java.util.List; +import java.util.concurrent.TimeUnit; + import org.apache.commons.collections.CollectionUtils; import org.apache.solr.client.solrj.request.CollectionAdminRequest; import org.apache.solr.common.cloud.SolrZkClient; @@ -24,11 +29,6 @@ import org.junit.Test; import org.slf4j.Logger; import org.slf4j.LoggerFactory; -import java.lang.invoke.MethodHandles; -import java.util.ArrayList; -import java.util.List; -import java.util.concurrent.TimeUnit; - public class RollingRestartTest extends AbstractFullDistribZkTestBase { private static final Logger log = LoggerFactory.getLogger(MethodHandles.lookup().lookupClass()); diff --git a/solr/core/src/test/org/apache/solr/cloud/autoscaling/AutoScalingHandlerTest.java b/solr/core/src/test/org/apache/solr/cloud/autoscaling/AutoScalingHandlerTest.java index 483b60c14ff..cf119535e12 100644 --- a/solr/core/src/test/org/apache/solr/cloud/autoscaling/AutoScalingHandlerTest.java +++ b/solr/core/src/test/org/apache/solr/cloud/autoscaling/AutoScalingHandlerTest.java @@ -17,6 +17,7 @@ package org.apache.solr.cloud.autoscaling; +import java.io.IOException; import java.lang.invoke.MethodHandles; import java.util.List; import java.util.Map; @@ -26,6 +27,7 @@ import java.util.concurrent.TimeUnit; import org.apache.solr.client.solrj.SolrClient; import org.apache.solr.client.solrj.SolrRequest; import org.apache.solr.client.solrj.SolrResponse; +import org.apache.solr.client.solrj.SolrServerException; import org.apache.solr.client.solrj.cloud.autoscaling.Policy; import org.apache.solr.client.solrj.embedded.JettySolrRunner; import org.apache.solr.client.solrj.impl.CloudSolrClient; @@ -1011,6 +1013,25 @@ public class AutoScalingHandlerTest extends SolrCloudTestCase { assertEquals(5L, properties.get(AutoScalingParams.ACTION_THROTTLE_PERIOD_SECONDS)); } + public void testUpdatePolicy() throws IOException, SolrServerException { + CloudSolrClient solrClient = 
cluster.getSolrClient(); + String setPropertiesCommand = "{'set-cluster-policy': [" + + "{'cores': '<4','node': '#ANY'}]}"; + solrClient.request(createAutoScalingRequest(SolrRequest.METHOD.POST, setPropertiesCommand)); + SolrRequest req = createAutoScalingRequest(SolrRequest.METHOD.GET, null); + NamedList response = solrClient.request(req); + assertEquals("<4", Utils.getObjectByPath(response,false,"cluster-policy[0]/cores")); + assertEquals("#ANY", Utils.getObjectByPath(response,false,"cluster-policy[0]/node")); + setPropertiesCommand = "{'set-cluster-policy': [" + + "{'cores': '<3','node': '#ANY'}]}"; + solrClient.request(createAutoScalingRequest(SolrRequest.METHOD.POST, setPropertiesCommand)); + req = createAutoScalingRequest(SolrRequest.METHOD.GET, null); + response = solrClient.request(req); + System.out.println(response); + + + } + static class AutoScalingRequest extends SolrRequest { protected final String message; diff --git a/solr/core/src/test/org/apache/solr/cloud/autoscaling/sim/SimCloudManager.java b/solr/core/src/test/org/apache/solr/cloud/autoscaling/sim/SimCloudManager.java index c09d4a48c35..234eaea29a1 100644 --- a/solr/core/src/test/org/apache/solr/cloud/autoscaling/sim/SimCloudManager.java +++ b/solr/core/src/test/org/apache/solr/cloud/autoscaling/sim/SimCloudManager.java @@ -392,8 +392,8 @@ public class SimCloudManager implements SolrCloudManager { public String simAddNode() throws Exception { Map values = createNodeValues(null); String nodeId = (String)values.get(ImplicitSnitch.NODE); - clusterStateProvider.simAddNode(nodeId); nodeStateProvider.simSetNodeValues(nodeId, values); + clusterStateProvider.simAddNode(nodeId); LOG.trace("-- added node " + nodeId); // initialize history handler if this is the first node if (historyHandler == null && liveNodesSet.size() == 1) { diff --git a/solr/core/src/test/org/apache/solr/cloud/autoscaling/sim/SimClusterStateProvider.java 
b/solr/core/src/test/org/apache/solr/cloud/autoscaling/sim/SimClusterStateProvider.java index ca2dd48858d..20ffca92fe3 100644 --- a/solr/core/src/test/org/apache/solr/cloud/autoscaling/sim/SimClusterStateProvider.java +++ b/solr/core/src/test/org/apache/solr/cloud/autoscaling/sim/SimClusterStateProvider.java @@ -111,11 +111,11 @@ import static org.apache.solr.common.params.CommonParams.NAME; public class SimClusterStateProvider implements ClusterStateProvider { private static final Logger LOG = LoggerFactory.getLogger(MethodHandles.lookup().lookupClass()); - private final Map> nodeReplicaMap = new ConcurrentHashMap<>(); private final LiveNodesSet liveNodes; private final SimDistribStateManager stateManager; private final SimCloudManager cloudManager; + private final Map> nodeReplicaMap = new ConcurrentHashMap<>(); private final Map clusterProperties = new ConcurrentHashMap<>(); private final Map> collProperties = new ConcurrentHashMap<>(); private final Map>> sliceProperties = new ConcurrentHashMap<>(); @@ -257,8 +257,8 @@ public class SimClusterStateProvider implements ClusterStateProvider { try { Set collections = new HashSet<>(); // mark every replica on that node as down - setReplicaStates(nodeId, Replica.State.DOWN, collections); boolean res = liveNodes.remove(nodeId); + setReplicaStates(nodeId, Replica.State.DOWN, collections); if (!collections.isEmpty()) { collectionsStatesRef.set(null); } @@ -279,6 +279,20 @@ public class SimClusterStateProvider implements ClusterStateProvider { } } + /** + * Remove all replica information related to dead nodes. 
+ */ + public void simRemoveDeadNodes() throws Exception { + lock.lockInterruptibly(); + try { + Set myNodes = new HashSet<>(nodeReplicaMap.keySet()); + myNodes.removeAll(liveNodes.get()); + collectionsStatesRef.set(null); + } finally { + lock.unlock(); + } + } + private synchronized void updateOverseerLeader() throws Exception { if (overseerLeader != null && liveNodes.contains(overseerLeader)) { return; @@ -436,6 +450,8 @@ public class SimClusterStateProvider implements ClusterStateProvider { opDelay(replicaInfo.getCollection(), CollectionParams.CollectionAction.ADDREPLICA.name()); + // at this point nuke our cached DocCollection state + collectionsStatesRef.set(null); List replicas = nodeReplicaMap.computeIfAbsent(nodeId, n -> new ArrayList<>()); // mark replica as active replicaInfo.getVariables().put(ZkStateReader.STATE_PROP, Replica.State.ACTIVE.toString()); @@ -445,8 +461,6 @@ public class SimClusterStateProvider implements ClusterStateProvider { replicaInfo.getVariables().put(Suggestion.coreidxsize, 1); replicas.add(replicaInfo); - // at this point nuke our cached DocCollection state - collectionsStatesRef.set(null); LOG.trace("-- simAddReplica {}", replicaInfo); Map values = cloudManager.getSimNodeStateProvider().simGetAllNodeValues() @@ -483,8 +497,8 @@ public class SimClusterStateProvider implements ClusterStateProvider { * @param coreNodeName coreNodeName */ public void simRemoveReplica(String nodeId, String coreNodeName) throws Exception { - List replicas = nodeReplicaMap.computeIfAbsent(nodeId, n -> new ArrayList<>()); lock.lockInterruptibly(); + List replicas = nodeReplicaMap.computeIfAbsent(nodeId, n -> new ArrayList<>()); try { for (int i = 0; i < replicas.size(); i++) { if (coreNodeName.equals(replicas.get(i).getName())) { @@ -572,7 +586,7 @@ public class SimClusterStateProvider implements ClusterStateProvider { }); } - private void simRunLeaderElection(String collection, Slice s, boolean saveClusterState) throws Exception { + private void 
simRunLeaderElection(String collection, Slice s, boolean saveState) throws Exception { AtomicBoolean stateChanged = new AtomicBoolean(Boolean.FALSE); Replica leader = s.getLeader(); if (leader == null || !liveNodes.contains(leader.getNodeName())) { @@ -636,8 +650,9 @@ public class SimClusterStateProvider implements ClusterStateProvider { } else { LOG.trace("-- already has leader for {} / {}", collection, s.getName()); } - if (stateChanged.get()) { + if (stateChanged.get() || saveState) { collectionsStatesRef.set(null); + saveClusterState.set(true); } } @@ -654,6 +669,8 @@ public class SimClusterStateProvider implements ClusterStateProvider { List nodeList = new ArrayList<>(); List shardNames = new ArrayList<>(); final String collectionName = props.getStr(NAME); + // always force getting fresh state + collectionsStatesRef.set(null); ClusterState clusterState = getClusterState(); ZkWriteCommand cmd = new ClusterStateMutator(cloudManager).createCollection(clusterState, props); if (cmd.noop) { @@ -758,12 +775,18 @@ public class SimClusterStateProvider implements ClusterStateProvider { if (cores == 0) { throw new RuntimeException("Unexpected value of 'cores' (" + cores + ") on node: " + n); } - cloudManager.getSimNodeStateProvider().simSetNodeValue(n, "cores", cores - 1); + try { + cloudManager.getSimNodeStateProvider().simSetNodeValue(n, "cores", cores - 1); + } catch (InterruptedException e) { + Thread.currentThread().interrupt(); + throw new RuntimeException("interrupted"); + } } } } }); collectionsStatesRef.set(null); + saveClusterState.set(true); results.add("success", ""); } catch (Exception e) { LOG.warn("Exception", e); @@ -787,6 +810,7 @@ public class SimClusterStateProvider implements ClusterStateProvider { values.put(ImplicitSnitch.DISK, 1000); }); collectionsStatesRef.set(null); + saveClusterState.set(true); } finally { lock.unlock(); } @@ -1057,7 +1081,7 @@ public class SimClusterStateProvider implements ClusterStateProvider { } } - public synchronized void 
createSystemCollection() throws IOException { + public void createSystemCollection() throws IOException { try { if (simListCollections().contains(CollectionAdminParams.SYSTEM_COLL)) { return; @@ -1065,7 +1089,8 @@ public class SimClusterStateProvider implements ClusterStateProvider { ZkNodeProps props = new ZkNodeProps( NAME, CollectionAdminParams.SYSTEM_COLL, REPLICATION_FACTOR, "1", - OverseerCollectionMessageHandler.NUM_SLICES, "1" + OverseerCollectionMessageHandler.NUM_SLICES, "1", + CommonAdminParams.WAIT_FOR_FINAL_STATE, "true" ); simCreateCollection(props, new NamedList()); } catch (Exception e) { @@ -1389,7 +1414,7 @@ public class SimClusterStateProvider implements ClusterStateProvider { }); }); if (infos.isEmpty()) { - throw new SolrException(SolrException.ErrorCode.BAD_REQUEST, "Collection " + collection + " doesn't exist."); + throw new SolrException(SolrException.ErrorCode.BAD_REQUEST, "Collection " + collection + " doesn't exist (shard=" + shard + ")."); } if (divide && value != null && (value instanceof Number)) { if ((value instanceof Long) || (value instanceof Integer)) { @@ -1455,6 +1480,9 @@ public class SimClusterStateProvider implements ClusterStateProvider { nodeReplicaMap.forEach((n, replicas) -> { replicas.forEach(ri -> collections.add(ri.getCollection())); }); + // check collProps and sliceProps too + collProperties.forEach((coll, props) -> collections.add(coll)); + sliceProperties.forEach((coll, slices) -> collections.add(coll)); return new ArrayList<>(collections); } finally { lock.unlock(); diff --git a/solr/core/src/test/org/apache/solr/cloud/autoscaling/sim/SimNodeStateProvider.java b/solr/core/src/test/org/apache/solr/cloud/autoscaling/sim/SimNodeStateProvider.java index b9169eb2263..cb8640c155e 100644 --- a/solr/core/src/test/org/apache/solr/cloud/autoscaling/sim/SimNodeStateProvider.java +++ b/solr/core/src/test/org/apache/solr/cloud/autoscaling/sim/SimNodeStateProvider.java @@ -29,6 +29,7 @@ import java.util.Set; import 
java.util.TreeSet; import java.util.concurrent.ConcurrentHashMap; import java.util.concurrent.atomic.AtomicBoolean; +import java.util.concurrent.locks.ReentrantLock; import java.util.stream.Collectors; import org.apache.solr.client.solrj.cloud.NodeStateProvider; @@ -50,6 +51,7 @@ public class SimNodeStateProvider implements NodeStateProvider { private final SimClusterStateProvider clusterStateProvider; private final SimDistribStateManager stateManager; private final LiveNodesSet liveNodesSet; + private final ReentrantLock lock = new ReentrantLock(); public SimNodeStateProvider(LiveNodesSet liveNodesSet, SimDistribStateManager stateManager, SimClusterStateProvider clusterStateProvider, @@ -84,14 +86,19 @@ public class SimNodeStateProvider implements NodeStateProvider { * @param node node id * @param values values. */ - public void simSetNodeValues(String node, Map values) { - Map existing = nodeValues.computeIfAbsent(node, n -> new ConcurrentHashMap<>()); - existing.clear(); - if (values != null) { - existing.putAll(values); - } - if (values == null || values.isEmpty() || values.containsKey("nodeRole")) { - saveRoles(); + public void simSetNodeValues(String node, Map values) throws InterruptedException { + lock.lockInterruptibly(); + try { + Map existing = nodeValues.computeIfAbsent(node, n -> new ConcurrentHashMap<>()); + existing.clear(); + if (values != null) { + existing.putAll(values); + } + if (values == null || values.isEmpty() || values.containsKey("nodeRole")) { + saveRoles(); + } + } finally { + lock.unlock(); } } @@ -102,15 +109,20 @@ public class SimNodeStateProvider implements NodeStateProvider { * @param key property name * @param value property value */ - public void simSetNodeValue(String node, String key, Object value) { - Map existing = nodeValues.computeIfAbsent(node, n -> new ConcurrentHashMap<>()); - if (value == null) { - existing.remove(key); - } else { - existing.put(key, value); - } - if (key.equals("nodeRole")) { - saveRoles(); + public 
void simSetNodeValue(String node, String key, Object value) throws InterruptedException { + lock.lockInterruptibly(); + try { + Map existing = nodeValues.computeIfAbsent(node, n -> new ConcurrentHashMap<>()); + if (value == null) { + existing.remove(key); + } else { + existing.put(key, value); + } + if (key.equals("nodeRole")) { + saveRoles(); + } + } finally { + lock.unlock(); } } @@ -121,21 +133,26 @@ public class SimNodeStateProvider implements NodeStateProvider { * @param key property name * @param value property value. */ - public void simAddNodeValue(String node, String key, Object value) { - Map values = nodeValues.computeIfAbsent(node, n -> new ConcurrentHashMap<>()); - Object existing = values.get(key); - if (existing == null) { - values.put(key, value); - } else if (existing instanceof Set) { - ((Set)existing).add(value); - } else { - Set vals = new HashSet<>(); - vals.add(existing); - vals.add(value); - values.put(key, vals); - } - if (key.equals("nodeRole")) { - saveRoles(); + public void simAddNodeValue(String node, String key, Object value) throws InterruptedException { + lock.lockInterruptibly(); + try { + Map values = nodeValues.computeIfAbsent(node, n -> new ConcurrentHashMap<>()); + Object existing = values.get(key); + if (existing == null) { + values.put(key, value); + } else if (existing instanceof Set) { + ((Set)existing).add(value); + } else { + Set vals = new HashSet<>(); + vals.add(existing); + vals.add(value); + values.put(key, vals); + } + if (key.equals("nodeRole")) { + saveRoles(); + } + } finally { + lock.unlock(); } } @@ -144,10 +161,16 @@ public class SimNodeStateProvider implements NodeStateProvider { * /roles.json is updated. 
* @param node node id */ - public void simRemoveNodeValues(String node) { - Map values = nodeValues.remove(node); - if (values != null && values.containsKey("nodeRole")) { - saveRoles(); + public void simRemoveNodeValues(String node) throws InterruptedException { + LOG.debug("--removing value for " + node); + lock.lockInterruptibly(); + try { + Map values = nodeValues.remove(node); + if (values != null && values.containsKey("nodeRole")) { + saveRoles(); + } + } finally { + lock.unlock(); } } @@ -155,19 +178,24 @@ public class SimNodeStateProvider implements NodeStateProvider { * Remove values that correspond to dead nodes. If values contained a 'nodeRole' * key then /roles.json is updated. */ - public void simRemoveDeadNodes() { + public void simRemoveDeadNodes() throws InterruptedException { Set myNodes = new HashSet<>(nodeValues.keySet()); myNodes.removeAll(liveNodesSet.get()); - AtomicBoolean updateRoles = new AtomicBoolean(false); - myNodes.forEach(n -> { - LOG.debug("- removing dead node values: " + n); - Map vals = nodeValues.remove(n); - if (vals.containsKey("nodeRole")) { - updateRoles.set(true); + lock.lockInterruptibly(); + try { + AtomicBoolean updateRoles = new AtomicBoolean(false); + myNodes.forEach(n -> { + LOG.debug("- removing dead node values: " + n); + Map vals = nodeValues.remove(n); + if (vals.containsKey("nodeRole")) { + updateRoles.set(true); + } + }); + if (updateRoles.get()) { + saveRoles(); } - }); - if (updateRoles.get()) { - saveRoles(); + } finally { + lock.unlock(); } } @@ -187,7 +215,7 @@ public class SimNodeStateProvider implements NodeStateProvider { return nodeValues; } - private synchronized void saveRoles() { + private void saveRoles() { final Map> roles = new HashMap<>(); nodeValues.forEach((n, values) -> { String nodeRole = (String)values.get("nodeRole"); @@ -211,6 +239,9 @@ public class SimNodeStateProvider implements NodeStateProvider { * @return map of metrics names / values */ public Map getReplicaMetricsValues(String node, 
Collection tags) { + if (!liveNodesSet.contains(node)) { + throw new RuntimeException("non-live node " + node); + } List replicas = clusterStateProvider.simGetReplicaInfos(node); if (replicas == null || replicas.isEmpty()) { return Collections.emptyMap(); @@ -258,8 +289,7 @@ public class SimNodeStateProvider implements NodeStateProvider { public Map getNodeValues(String node, Collection tags) { LOG.trace("-- requested values for " + node + ": " + tags); if (!liveNodesSet.contains(node)) { - nodeValues.remove(node); - return Collections.emptyMap(); + throw new RuntimeException("non-live node " + node); } if (tags.isEmpty()) { return Collections.emptyMap(); diff --git a/solr/core/src/test/org/apache/solr/cloud/autoscaling/sim/SimSolrCloudTestCase.java b/solr/core/src/test/org/apache/solr/cloud/autoscaling/sim/SimSolrCloudTestCase.java index 757e2975cd9..e83f72f5712 100644 --- a/solr/core/src/test/org/apache/solr/cloud/autoscaling/sim/SimSolrCloudTestCase.java +++ b/solr/core/src/test/org/apache/solr/cloud/autoscaling/sim/SimSolrCloudTestCase.java @@ -84,6 +84,10 @@ public class SimSolrCloudTestCase extends SolrTestCaseJ4 { // clear any persisted configuration cluster.getDistribStateManager().setData(SOLR_AUTOSCALING_CONF_PATH, Utils.toJSON(new ZkNodeProps()), -1); cluster.getDistribStateManager().setData(ZkStateReader.ROLES, Utils.toJSON(new HashMap<>()), -1); + cluster.getSimClusterStateProvider().simDeleteAllCollections(); + cluster.simClearSystemCollection(); + cluster.getSimNodeStateProvider().simRemoveDeadNodes(); + cluster.getSimClusterStateProvider().simRemoveDeadNodes(); // restore the expected number of nodes int currentSize = cluster.getLiveNodesSet().size(); if (currentSize < clusterNodeCount) { @@ -99,10 +103,6 @@ public class SimSolrCloudTestCase extends SolrTestCaseJ4 { removeChildren(ZkStateReader.SOLR_AUTOSCALING_TRIGGER_STATE_PATH); removeChildren(ZkStateReader.SOLR_AUTOSCALING_NODE_LOST_PATH); 
removeChildren(ZkStateReader.SOLR_AUTOSCALING_NODE_ADDED_PATH); - cluster.getSimClusterStateProvider().simDeleteAllCollections(); - cluster.simClearSystemCollection(); - // clear any dead nodes - cluster.getSimNodeStateProvider().simRemoveDeadNodes(); cluster.getSimClusterStateProvider().simResetLeaderThrottles(); cluster.simRestartOverseer(null); cluster.getTimeSource().sleep(5000); diff --git a/solr/core/src/test/org/apache/solr/cloud/autoscaling/sim/TestClusterStateProvider.java b/solr/core/src/test/org/apache/solr/cloud/autoscaling/sim/TestClusterStateProvider.java index 71106452ffb..e395985d027 100644 --- a/solr/core/src/test/org/apache/solr/cloud/autoscaling/sim/TestClusterStateProvider.java +++ b/solr/core/src/test/org/apache/solr/cloud/autoscaling/sim/TestClusterStateProvider.java @@ -109,7 +109,11 @@ public class TestClusterStateProvider extends SolrCloudTestCase { simCloudManager.getSimClusterStateProvider().simSetClusterProperties(clusterProperties); simCloudManager.getSimDistribStateManager().simSetAutoScalingConfig(autoScalingConfig); nodeValues.forEach((n, values) -> { - simCloudManager.getSimNodeStateProvider().simSetNodeValues(n, values); + try { + simCloudManager.getSimNodeStateProvider().simSetNodeValues(n, values); + } catch (InterruptedException e) { + fail("Interrupted:" + e); + } }); simCloudManager.getSimClusterStateProvider().simSetClusterState(realState); ClusterState simState = simCloudManager.getClusterStateProvider().getClusterState(); diff --git a/solr/core/src/test/org/apache/solr/cloud/autoscaling/sim/TestLargeCluster.java b/solr/core/src/test/org/apache/solr/cloud/autoscaling/sim/TestLargeCluster.java index 6d53363a078..934d2ea77cb 100644 --- a/solr/core/src/test/org/apache/solr/cloud/autoscaling/sim/TestLargeCluster.java +++ b/solr/core/src/test/org/apache/solr/cloud/autoscaling/sim/TestLargeCluster.java @@ -33,7 +33,6 @@ import java.util.concurrent.atomic.AtomicInteger; import 
com.carrotsearch.randomizedtesting.annotations.ThreadLeakLingering; import com.carrotsearch.randomizedtesting.annotations.TimeoutSuite; import org.apache.commons.math3.stat.descriptive.SummaryStatistics; -import org.apache.lucene.util.LuceneTestCase; import org.apache.solr.client.solrj.SolrClient; import org.apache.solr.client.solrj.SolrRequest; import org.apache.solr.client.solrj.cloud.autoscaling.AutoScalingConfig; @@ -54,12 +53,14 @@ import org.apache.solr.cloud.autoscaling.CapturedEvent; import org.apache.solr.cloud.autoscaling.TriggerValidationException; import org.apache.solr.common.SolrInputDocument; import org.apache.solr.common.cloud.Replica; +import org.apache.solr.common.params.CollectionAdminParams; import org.apache.solr.common.params.CollectionParams; import org.apache.solr.common.util.NamedList; import org.apache.solr.common.util.Pair; import org.apache.solr.common.util.TimeSource; import org.apache.solr.core.SolrResourceLoader; import org.apache.solr.util.LogLevel; +import org.apache.solr.util.TimeOut; import org.junit.Before; import org.junit.BeforeClass; import org.junit.Test; @@ -74,7 +75,7 @@ import static org.apache.solr.cloud.autoscaling.AutoScalingHandlerTest.createAut @TimeoutSuite(millis = 4 * 3600 * 1000) @LogLevel("org.apache.solr.cloud.autoscaling=DEBUG") @ThreadLeakLingering(linger = 20000) // ComputePlanAction may take significant time to complete -@LuceneTestCase.BadApple(bugUrl = "https://issues.apache.org/jira/browse/SOLR-12075") +//@LuceneTestCase.BadApple(bugUrl = "https://issues.apache.org/jira/browse/SOLR-12075") public class TestLargeCluster extends SimSolrCloudTestCase { private static final Logger log = LoggerFactory.getLogger(MethodHandles.lookup().lookupClass()); @@ -83,8 +84,9 @@ public class TestLargeCluster extends SimSolrCloudTestCase { public static final int NUM_NODES = 100; static Map> listenerEvents = new ConcurrentHashMap<>(); - static AtomicInteger triggerFiredCount = new AtomicInteger(); - static CountDownLatch 
triggerFiredLatch; + static AtomicInteger triggerFinishedCount = new AtomicInteger(); + static AtomicInteger triggerStartedCount = new AtomicInteger(); + static CountDownLatch triggerFinishedLatch; static int waitForSeconds; @BeforeClass @@ -94,10 +96,10 @@ public class TestLargeCluster extends SimSolrCloudTestCase { @Before public void setupTest() throws Exception { - waitForSeconds = 5; - triggerFiredCount.set(0); - triggerFiredLatch = new CountDownLatch(1); + triggerStartedCount.set(0); + triggerFinishedCount.set(0); + triggerFinishedLatch = new CountDownLatch(1); listenerEvents.clear(); // disable .scheduled_maintenance String suspendTriggerCommand = "{" + @@ -107,6 +109,13 @@ public class TestLargeCluster extends SimSolrCloudTestCase { SolrClient solrClient = cluster.simGetSolrClient(); NamedList response = solrClient.request(req); assertEquals(response.get("result").toString(), "success"); + + // do this in advance if missing + if (!cluster.getSimClusterStateProvider().simListCollections().contains(CollectionAdminParams.SYSTEM_COLL)) { + cluster.getSimClusterStateProvider().createSystemCollection(); + CloudTestUtils.waitForState(cluster, CollectionAdminParams.SYSTEM_COLL, 120, TimeUnit.SECONDS, + CloudTestUtils.clusterShape(1, 1)); + } } public static class TestTriggerListener extends TriggerListenerBase { @@ -123,11 +132,18 @@ public class TestLargeCluster extends SimSolrCloudTestCase { } } - public static class TestTriggerAction extends TriggerActionBase { + public static class FinishTriggerAction extends TriggerActionBase { @Override public void process(TriggerEvent event, ActionContext context) throws Exception { - triggerFiredCount.incrementAndGet(); - triggerFiredLatch.countDown(); + triggerFinishedCount.incrementAndGet(); + triggerFinishedLatch.countDown(); + } + } + + public static class StartTriggerAction extends TriggerActionBase { + @Override + public void process(TriggerEvent event, ActionContext context) throws Exception { + 
triggerStartedCount.incrementAndGet(); } } @@ -136,14 +152,15 @@ public class TestLargeCluster extends SimSolrCloudTestCase { SolrClient solrClient = cluster.simGetSolrClient(); String setTriggerCommand = "{" + "'set-trigger' : {" + - "'name' : 'node_lost_trigger'," + + "'name' : 'node_lost_trigger1'," + "'event' : 'nodeLost'," + "'waitFor' : '" + waitForSeconds + "s'," + "'enabled' : true," + "'actions' : [" + + "{'name':'start','class':'" + StartTriggerAction.class.getName() + "'}," + "{'name':'compute','class':'" + ComputePlanAction.class.getName() + "'}," + "{'name':'execute','class':'" + ExecutePlanAction.class.getName() + "'}," + - "{'name':'test','class':'" + TestTriggerAction.class.getName() + "'}" + + "{'name':'test','class':'" + FinishTriggerAction.class.getName() + "'}" + "]" + "}}"; SolrRequest req = createAutoScalingRequest(SolrRequest.METHOD.POST, setTriggerCommand); @@ -154,7 +171,7 @@ public class TestLargeCluster extends SimSolrCloudTestCase { "'set-listener' : " + "{" + "'name' : 'foo'," + - "'trigger' : 'node_lost_trigger'," + + "'trigger' : 'node_lost_trigger1'," + "'stage' : ['STARTED','ABORTED','SUCCEEDED', 'FAILED']," + "'beforeAction' : ['compute', 'execute']," + "'afterAction' : ['compute', 'execute']," + @@ -217,6 +234,19 @@ public class TestLargeCluster extends SimSolrCloudTestCase { } } + // wait until started == finished + TimeOut timeOut = new TimeOut(20 * waitForSeconds * NUM_NODES, TimeUnit.SECONDS, cluster.getTimeSource()); + while (!timeOut.hasTimedOut()) { + if (triggerStartedCount.get() == triggerFinishedCount.get()) { + break; + } + timeOut.sleep(1000); + } + if (timeOut.hasTimedOut()) { + fail("did not finish processing all events in time: started=" + triggerStartedCount.get() + ", finished=" + triggerFinishedCount.get()); + } + + log.info("Ready after " + CloudTestUtils.waitForState(cluster, collectionName, 30 * nodes.size(), TimeUnit.SECONDS, CloudTestUtils.clusterShape(5, 15)) + "ms"); long newMoveReplicaOps = 
cluster.simGetOpCount(CollectionParams.CollectionAction.MOVEREPLICA.name()); @@ -232,14 +262,15 @@ public class TestLargeCluster extends SimSolrCloudTestCase { SolrClient solrClient = cluster.simGetSolrClient(); String setTriggerCommand = "{" + "'set-trigger' : {" + - "'name' : 'node_added_trigger'," + + "'name' : 'node_added_trigger2'," + "'event' : 'nodeAdded'," + "'waitFor' : '" + waitForSeconds + "s'," + "'enabled' : true," + "'actions' : [" + + "{'name':'start','class':'" + StartTriggerAction.class.getName() + "'}," + "{'name':'compute','class':'" + ComputePlanAction.class.getName() + "'}," + "{'name':'execute','class':'" + ExecutePlanAction.class.getName() + "'}," + - "{'name':'test','class':'" + TestTriggerAction.class.getName() + "'}" + + "{'name':'test','class':'" + FinishTriggerAction.class.getName() + "'}" + "]" + "}}"; SolrRequest req = createAutoScalingRequest(SolrRequest.METHOD.POST, setTriggerCommand); @@ -257,20 +288,34 @@ public class TestLargeCluster extends SimSolrCloudTestCase { log.info("Ready after " + CloudTestUtils.waitForState(cluster, collectionName, 20 * NUM_NODES, TimeUnit.SECONDS, CloudTestUtils.clusterShape(NUM_NODES / 10, NUM_NODES / 8 * 3)) + " ms"); + // start adding nodes int numAddNode = NUM_NODES / 5; List addNodesList = new ArrayList<>(numAddNode); for (int i = 0; i < numAddNode; i++) { addNodesList.add(cluster.simAddNode()); cluster.getTimeSource().sleep(5000); } - boolean await = triggerFiredLatch.await(1000000 / SPEED, TimeUnit.MILLISECONDS); + // wait until at least one event is generated + boolean await = triggerFinishedLatch.await(10000 / SPEED, TimeUnit.MILLISECONDS); assertTrue("trigger did not fire", await); + // wait until started == finished + TimeOut timeOut = new TimeOut(20 * waitForSeconds * NUM_NODES, TimeUnit.SECONDS, cluster.getTimeSource()); + while (!timeOut.hasTimedOut()) { + if (triggerStartedCount.get() == triggerFinishedCount.get()) { + break; + } + timeOut.sleep(1000); + } + if (timeOut.hasTimedOut()) { + 
fail("did not finish processing all events in time: started=" + triggerStartedCount.get() + ", finished=" + triggerFinishedCount.get()); + } + List systemColl = cluster.simGetSystemCollection(); int startedEventPos = -1; for (int i = 0; i < systemColl.size(); i++) { SolrInputDocument d = systemColl.get(i); - if (!"node_added_trigger".equals(d.getFieldValue("event.source_s"))) { + if (!"node_added_trigger2".equals(d.getFieldValue("event.source_s"))) { continue; } if ("NODEADDED".equals(d.getFieldValue("event.type_s")) && @@ -292,13 +337,13 @@ public class TestLargeCluster extends SimSolrCloudTestCase { SolrInputDocument finishedEvent = null; long lastNumOps = cluster.simGetOpCount("MOVEREPLICA"); while (count-- > 0) { - cluster.getTimeSource().sleep(150000); + cluster.getTimeSource().sleep(10000); long currentNumOps = cluster.simGetOpCount("MOVEREPLICA"); if (currentNumOps == lastNumOps) { int size = systemColl.size() - 1; for (int i = size; i > lastIgnoredPos; i--) { SolrInputDocument d = systemColl.get(i); - if (!"node_added_trigger".equals(d.getFieldValue("event.source_s"))) { + if (!"node_added_trigger2".equals(d.getFieldValue("event.source_s"))) { continue; } if ("SUCCEEDED".equals(d.getFieldValue("stage_s"))) { @@ -401,14 +446,15 @@ public class TestLargeCluster extends SimSolrCloudTestCase { SolrClient solrClient = cluster.simGetSolrClient(); String setTriggerCommand = "{" + "'set-trigger' : {" + - "'name' : 'node_lost_trigger'," + + "'name' : 'node_lost_trigger3'," + "'event' : 'nodeLost'," + "'waitFor' : '" + waitFor + "s'," + "'enabled' : true," + "'actions' : [" + + "{'name':'start','class':'" + StartTriggerAction.class.getName() + "'}," + "{'name':'compute','class':'" + ComputePlanAction.class.getName() + "'}," + "{'name':'execute','class':'" + ExecutePlanAction.class.getName() + "'}," + - "{'name':'test','class':'" + TestTriggerAction.class.getName() + "'}" + + "{'name':'test','class':'" + FinishTriggerAction.class.getName() + "'}" + "]" + "}}"; 
SolrRequest req = createAutoScalingRequest(SolrRequest.METHOD.POST, setTriggerCommand); @@ -435,8 +481,8 @@ public class TestLargeCluster extends SimSolrCloudTestCase { cluster.simRemoveNode(nodes.get(i), false); cluster.getTimeSource().sleep(killDelay); } - // wait for the trigger to fire - boolean await = triggerFiredLatch.await(20 * waitFor * 1000 / SPEED, TimeUnit.MILLISECONDS); + // wait for the trigger to fire at least once + boolean await = triggerFinishedLatch.await(20 * waitFor * 1000 / SPEED, TimeUnit.MILLISECONDS); assertTrue("trigger did not fire within timeout, " + "waitFor=" + waitFor + ", killDelay=" + killDelay + ", minIgnored=" + minIgnored, await); @@ -444,7 +490,7 @@ public class TestLargeCluster extends SimSolrCloudTestCase { int startedEventPos = -1; for (int i = 0; i < systemColl.size(); i++) { SolrInputDocument d = systemColl.get(i); - if (!"node_lost_trigger".equals(d.getFieldValue("event.source_s"))) { + if (!"node_lost_trigger3".equals(d.getFieldValue("event.source_s"))) { continue; } if ("NODELOST".equals(d.getFieldValue("event.type_s")) && @@ -457,11 +503,22 @@ public class TestLargeCluster extends SimSolrCloudTestCase { "waitFor=" + waitFor + ", killDelay=" + killDelay + ", minIgnored=" + minIgnored, startedEventPos > -1); SolrInputDocument startedEvent = systemColl.get(startedEventPos); + // wait until started == finished + TimeOut timeOut = new TimeOut(20 * waitFor * NUM_NODES, TimeUnit.SECONDS, cluster.getTimeSource()); + while (!timeOut.hasTimedOut()) { + if (triggerStartedCount.get() == triggerFinishedCount.get()) { + break; + } + timeOut.sleep(1000); + } + if (timeOut.hasTimedOut()) { + fail("did not finish processing all events in time: started=" + triggerStartedCount.get() + ", finished=" + triggerFinishedCount.get()); + } int ignored = 0; int lastIgnoredPos = startedEventPos; for (int i = startedEventPos + 1; i < systemColl.size(); i++) { SolrInputDocument d = systemColl.get(i); - if 
(!"node_lost_trigger".equals(d.getFieldValue("event.source_s"))) { + if (!"node_lost_trigger3".equals(d.getFieldValue("event.source_s"))) { continue; } if ("NODELOST".equals(d.getFieldValue("event.type_s"))) { @@ -486,13 +543,13 @@ public class TestLargeCluster extends SimSolrCloudTestCase { SolrInputDocument finishedEvent = null; long lastNumOps = cluster.simGetOpCount("MOVEREPLICA"); while (count-- > 0) { - cluster.getTimeSource().sleep(150000); + cluster.getTimeSource().sleep(waitFor * 10000); long currentNumOps = cluster.simGetOpCount("MOVEREPLICA"); if (currentNumOps == lastNumOps) { int size = systemColl.size() - 1; for (int i = size; i > lastIgnoredPos; i--) { SolrInputDocument d = systemColl.get(i); - if (!"node_lost_trigger".equals(d.getFieldValue("event.source_s"))) { + if (!"node_lost_trigger3".equals(d.getFieldValue("event.source_s"))) { continue; } if ("SUCCEEDED".equals(d.getFieldValue("stage_s"))) { @@ -520,8 +577,7 @@ public class TestLargeCluster extends SimSolrCloudTestCase { } @Test - // JIRA closed 24-Feb-2018. Still apparently a problem. 
- @BadApple(bugUrl = "https://issues.apache.org/jira/browse/SOLR-11714") + //@BadApple(bugUrl = "https://issues.apache.org/jira/browse/SOLR-11714") public void testSearchRate() throws Exception { SolrClient solrClient = cluster.simGetSolrClient(); String collectionName = "testSearchRate"; @@ -555,7 +611,7 @@ public class TestLargeCluster extends SimSolrCloudTestCase { "'actions' : [" + "{'name':'compute','class':'" + ComputePlanAction.class.getName() + "'}," + "{'name':'execute','class':'" + ExecutePlanAction.class.getName() + "'}," + - "{'name':'test','class':'" + TestTriggerAction.class.getName() + "'}" + + "{'name':'test','class':'" + FinishTriggerAction.class.getName() + "'}" + "]" + "}}"; SolrRequest req = createAutoScalingRequest(SolrRequest.METHOD.POST, setTriggerCommand); @@ -575,7 +631,7 @@ public class TestLargeCluster extends SimSolrCloudTestCase { assertEquals(response.get("result").toString(), "success"); - boolean await = triggerFiredLatch.await(40000 / SPEED, TimeUnit.MILLISECONDS); + boolean await = triggerFinishedLatch.await(waitForSeconds * 20000 / SPEED, TimeUnit.MILLISECONDS); assertTrue("The trigger did not fire at all", await); // wait for listener to capture the SUCCEEDED stage cluster.getTimeSource().sleep(2000); diff --git a/solr/core/src/test/org/apache/solr/core/ResourceLoaderTest.java b/solr/core/src/test/org/apache/solr/core/ResourceLoaderTest.java index 3d212c51c36..dc06a6f7c65 100644 --- a/solr/core/src/test/org/apache/solr/core/ResourceLoaderTest.java +++ b/solr/core/src/test/org/apache/solr/core/ResourceLoaderTest.java @@ -72,7 +72,8 @@ public class ResourceLoaderTest extends SolrTestCaseJ4 { Class clazz = ResourceLoaderAware.class; // Check ResourceLoaderAware valid objects - assertAwareCompatibility(clazz, new NGramFilterFactory(new HashMap<>())); + //noinspection unchecked + assertAwareCompatibility(clazz, new NGramFilterFactory(map("minGramSize", "1", "maxGramSize", "2"))); assertAwareCompatibility(clazz, new 
KeywordTokenizerFactory(new HashMap<>())); // Make sure it throws an error for invalid objects @@ -98,9 +99,10 @@ public class ResourceLoaderTest extends SolrTestCaseJ4 { assertAwareCompatibility(clazz, new JSONResponseWriter()); // Make sure it throws an error for invalid objects + //noinspection unchecked invalid = new Object[] { - new NGramFilterFactory(new HashMap<>()), - "hello", 12.3f, + new NGramFilterFactory(map("minGramSize", "1", "maxGramSize", "2")), + "hello", 12.3f , new KeywordTokenizerFactory(new HashMap<>()) }; for( Object obj : invalid ) { diff --git a/solr/core/src/test/org/apache/solr/handler/tagger/EmbeddedSolrNoSerializeTest.java b/solr/core/src/test/org/apache/solr/handler/tagger/EmbeddedSolrNoSerializeTest.java new file mode 100644 index 00000000000..8d31ad007ef --- /dev/null +++ b/solr/core/src/test/org/apache/solr/handler/tagger/EmbeddedSolrNoSerializeTest.java @@ -0,0 +1,153 @@ +/* + * This software was produced for the U. S. Government + * under Contract No. W15P7T-11-C-F600, and is + * subject to the Rights in Noncommercial Computer Software + * and Noncommercial Computer Software Documentation + * Clause 252.227-7014 (JUN 1995) + * + * Copyright 2013 The MITRE Corporation. All Rights Reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.solr.handler.tagger; + +import java.io.IOException; +import java.util.Collection; +import java.util.Collections; +import java.util.concurrent.atomic.AtomicReference; +import java.util.function.BiFunction; + +import org.apache.lucene.document.Field; +import org.apache.solr.SolrTestCaseJ4; +import org.apache.solr.client.solrj.SolrServerException; +import org.apache.solr.client.solrj.StreamingResponseCallback; +import org.apache.solr.client.solrj.embedded.EmbeddedSolrServer; +import org.apache.solr.client.solrj.request.QueryRequest; +import org.apache.solr.client.solrj.response.QueryResponse; +import org.apache.solr.common.SolrDocument; +import org.apache.solr.common.SolrDocumentList; +import org.apache.solr.common.params.ModifiableSolrParams; +import org.apache.solr.common.params.SolrParams; +import org.apache.solr.common.util.ContentStream; +import org.apache.solr.common.util.ContentStreamBase; +import org.junit.Before; +import org.junit.BeforeClass; +import org.junit.Ignore; +import org.junit.Test; + +/** + * Tests that we can skip serialization of the documents when embedding + * Solr. 
+ */ +public class EmbeddedSolrNoSerializeTest extends SolrTestCaseJ4 { + + static EmbeddedSolrServer solrServer; + + @BeforeClass + public static void init() throws Exception { + initCore("solrconfig-tagger.xml", "schema-tagger.xml"); + solrServer = new EmbeddedSolrServer(h.getCoreContainer(), "collection1"); + //we don't need to close the EmbeddedSolrServer because SolrTestCaseJ4 closes the core + } + + @Before + public void setUp() throws Exception { + super.setUp(); + clearIndex(); + assertU(adoc("id", "9999", "name", "Boston")); + assertU(commit()); + } + + @Test + public void testTag() throws SolrServerException, IOException { + ModifiableSolrParams params = params(); + String input = "foo boston bar";//just one tag; + QueryRequest req = new SolrTaggerRequest(params, input); + req.setPath("/tag"); + + QueryResponse rsp = req.process(solrServer); + SolrDocumentList results= (SolrDocumentList) rsp.getResponse().get("response"); + assertNotNull(rsp.getResponse().get("tags")); + assertNotNull(results.get(0)); + } + + @SuppressWarnings("serial") + public static class SolrTaggerRequest extends QueryRequest { + + private final String input; + + public SolrTaggerRequest(SolrParams p, String input) { + super(p, METHOD.POST); + this.input = input; + } + + // Deprecated in 7.2 but should live on until 8.x + @SuppressWarnings("deprecation") + @Override + public Collection getContentStreams() { + return Collections.singleton(new ContentStreamBase.StringStream(input)); + } + + // As of 7.2. 
But won't work until: https://issues.apache.org/jira/browse/SOLR-12142 +// @Override +// public RequestWriter.ContentWriter getContentWriter(String expectedType) { +// return new RequestWriter.StringPayloadContentWriter(input, "text/plain; charset=UTF8"); +// } + } + + @Test + public void testSearch() throws Exception { + QueryResponse rsp = solrServer.query(params("q", "name:Boston")); + assertNotNull(rsp.getResults().get(0)); + } + + @Test + public void testAssertTagStreamingWithSolrTaggerRequest() throws Exception { + doTestAssertTagStreaming(SolrTaggerRequest::new); + } + + @Test + @Ignore("As of Solr 7, stream.body is disabled by default for security ") // DWS: dubious, IMO + // and it can't be enabled with EmbeddedSolrServer until SOLR-12126 + public void testAssertTagStreamingWithStreamBodyParam() throws Exception { + doTestAssertTagStreaming((params, input) -> { + params.set("stream.body", input); + return new QueryRequest(params); + }); + } + + public void doTestAssertTagStreaming(BiFunction newQueryRequest) throws IOException, SolrServerException { + ModifiableSolrParams params = params(); + String input = "foo boston bar";//just one tag; + QueryRequest req = newQueryRequest.apply(params, input); + req.setPath("/tag"); + + final AtomicReference refDoc = new AtomicReference<>(); + req.setStreamingResponseCallback(new StreamingResponseCallback() { + @Override + public void streamSolrDocument(SolrDocument doc) { + refDoc.set(doc); + } + + @Override + public void streamDocListInfo(long numFound, long start, Float maxScore) { + + } + }); + QueryResponse rsp = req.process(solrServer); + assertNotNull(rsp.getResponse().get("tags")); + assertNotNull(refDoc.get()); + assertEquals("Boston", ((Field)refDoc.get().getFieldValue("name")).stringValue()); + } +} diff --git a/solr/core/src/test/org/apache/solr/handler/tagger/RandomizedTaggerTest.java b/solr/core/src/test/org/apache/solr/handler/tagger/RandomizedTaggerTest.java new file mode 100644 index 
00000000000..cb742a87a8c --- /dev/null +++ b/solr/core/src/test/org/apache/solr/handler/tagger/RandomizedTaggerTest.java @@ -0,0 +1,150 @@ +/* + * This software was produced for the U. S. Government + * under Contract No. W15P7T-11-C-F600, and is + * subject to the Rights in Noncommercial Computer Software + * and Noncommercial Computer Software Documentation + * Clause 252.227-7014 (JUN 1995) + * + * Copyright 2013 The MITRE Corporation. All Rights Reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.solr.handler.tagger; + +import java.util.ArrayList; +import java.util.HashSet; +import java.util.List; +import java.util.Locale; +import java.util.Random; +import java.util.Set; + +import com.carrotsearch.randomizedtesting.annotations.Repeat; +import com.carrotsearch.randomizedtesting.generators.RandomNumbers; +import com.carrotsearch.randomizedtesting.generators.RandomPicks; +import com.carrotsearch.randomizedtesting.generators.RandomStrings; +import org.junit.BeforeClass; +import org.junit.Test; + +/** + * Randomly generate taggable text and verify via simple tag algorithm. 
+ */ +@Repeat(iterations = 10) +public class RandomizedTaggerTest extends TaggerTestCase { + + @BeforeClass + public static void beforeClass() throws Exception { + initCore("solrconfig-tagger.xml", "schema-tagger.xml"); + } + + @Test + public void test() throws Exception { + final Random R = random(); + + Set names = new HashSet<>(); + //random list of single-word names + final int NUM_SINGLES = 4;//RandomInts.randomIntBetween(R, 1, 5); + for (int i = 0; i < NUM_SINGLES; i++) { + if (i == 0)//first is a big string (perhaps triggers bugs related to growing buffers) + names.add(randomStringOfLength(16, 32)); + else + names.add(randomString()); + } + + //add random list of multi-word names, partially including existing names + final int NUM_MULTI = 10; + for (int i = 0; i < NUM_MULTI; i++) { + final int numWords = RandomNumbers.randomIntBetween(R, 2, 4); + StringBuilder buf = new StringBuilder(); + for (int j = 0; j < numWords; j++) { + if (j != 0) + buf.append(' '); + if (R.nextBoolean()) {//new likely non-existent word + buf.append(randomString()); + } else {//existing word (possible multi-word from prev iteration) + buf.append(RandomPicks.randomFrom(R, names)); + } + } + names.add(buf.toString()); + } + + // BUILD NAMES + buildNames(names.toArray(new String[names.size()])); + + // QUERY LOOP + for (int tTries = 0; tTries < 10 * RANDOM_MULTIPLIER; tTries++) { + // Build up random input, similar to multi-word random names above + StringBuilder input = new StringBuilder(); + final int INPUT_WORD_LEN = 20; + input.append(' ');//must start with space based on assertBruteForce logic + for (int i = 0; i < INPUT_WORD_LEN; i++) { + if (R.nextBoolean()) {//new likely non-existent word + input.append(randomString()); + } else {//existing word (possible multi-word from prev iteration) + input.append(RandomPicks.randomFrom(R, NAMES)); + } + input.append(' ');//must end with a space + } + + boolean madeIt = false; + try { + assertBruteForce(input.toString()); + madeIt = true; + 
} finally { + if (!madeIt) { + System.out.println("Reproduce with:"); + System.out.print(" buildNames("); + for (int i = 0; i < NAMES.size(); i++) { + if (i != 0) + System.out.print(','); + System.out.print('"'); + System.out.print(NAMES.get(i)); + System.out.print('"'); + } + System.out.println(");"); + System.out.println(" assertBruteForce(\"" + input+"\");"); + } + } + } + + } + + private void assertBruteForce(String input) throws Exception { + assert input.matches(" .* "); + baseParams.set("overlaps", "ALL"); + + //loop through NAMES and find all tag offsets + List testTags = new ArrayList<>(); + for (String name : NAMES) { + String spaceName = " "+name+" "; + int off = 0; + while (true) { + int idx = input.indexOf(spaceName, off); + if (idx < 0) + break; + testTags.add(new TestTag(idx + 1, idx + 1 + name.length(), name, name)); + off = idx + 1; + } + } + + //assert + assertTags(reqDoc(input), testTags.toArray(new TestTag[testTags.size()])); + } + + private String randomString() { return randomStringOfLength(1, 1); } + + private String randomStringOfLength(int min, int max) { + return RandomStrings.randomAsciiLettersOfLengthBetween(random(), min, max).toLowerCase(Locale.ROOT); + } + +} diff --git a/solr/core/src/test/org/apache/solr/handler/tagger/Tagger2Test.java b/solr/core/src/test/org/apache/solr/handler/tagger/Tagger2Test.java new file mode 100644 index 00000000000..c7580e1f729 --- /dev/null +++ b/solr/core/src/test/org/apache/solr/handler/tagger/Tagger2Test.java @@ -0,0 +1,175 @@ +/* + * This software was produced for the U. S. Government + * under Contract No. W15P7T-11-C-F600, and is + * subject to the Rights in Noncommercial Computer Software + * and Noncommercial Computer Software Documentation + * Clause 252.227-7014 (JUN 1995) + * + * Copyright 2013 The MITRE Corporation. All Rights Reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.solr.handler.tagger; + +import java.nio.charset.StandardCharsets; + +import org.junit.BeforeClass; +import org.junit.Ignore; +import org.junit.Test; + +/** + * Test the {@link TaggerRequestHandler}. + */ +public class Tagger2Test extends TaggerTestCase { + + @BeforeClass + public static void beforeClass() throws Exception { + initCore("solrconfig-tagger.xml", "schema-tagger.xml"); + } + + @Override + public void setUp() throws Exception { + super.setUp(); + baseParams.set("overlaps", "LONGEST_DOMINANT_RIGHT"); + } + + /** whole matching, no sub-tags */ + @Test + public void testLongestDominantRight() throws Exception { + buildNames("in", "San", "in San", "Francisco", "San Francisco", + "San Francisco State College", "College of California", + "Clayton", "Clayton North", "North Carolina"); + + assertTags("He lived in San Francisco.", + "in", "San Francisco"); + + assertTags("He enrolled in San Francisco State College of California", + "in", "San Francisco State College"); + + assertTags("He lived in Clayton North Carolina", + "in", "Clayton", "North Carolina"); + + } + + // As of Lucene/Solr 4.9, StandardTokenizer never does this anymore (reported to Lucene dev-list, + // Jan 26th 2015. Honestly it's not particularly important to us but it renders this test + // pointless. 
+ /** Orig issue https://github.com/OpenSextant/SolrTextTagger/issues/2 related: #13 */ + @Test + @Ignore + public void testVeryLongWord() throws Exception { + String SANFRAN = "San Francisco"; + buildNames(SANFRAN); + + // exceeds default 255 max token length which means it in-effect becomes a stop-word + StringBuilder STOP = new StringBuilder(260);//>255 + for (int i = 0; i < STOP.capacity(); i++) { + STOP.append((char) ('0' + (i % 10))); + } + + String doc = "San " + STOP + " Francisco"; + assertTags(doc);//no match due to default stop word handling + //and we find it when we ignore stop words + assertTags(reqDoc(doc, "ignoreStopwords", "true"), new TestTag(0, doc.length(), doc, lookupByName(SANFRAN))); + } + + /** Support for stopwords (posInc > 1); + * discussion: https://github.com/OpenSextant/SolrTextTagger/issues/13 */ + @Test + public void testStopWords() throws Exception { + baseParams.set("field", "name_tagStop");//stop filter (pos inc enabled) index & query + + String SOUTHOFWALES = "South of Wales";//'of' is stop word index time & query + String ACITYA = "A City A"; + + buildNames(SOUTHOFWALES, ACITYA); + + //round-trip works + assertTags(reqDoc(SOUTHOFWALES), new TestTag(0, SOUTHOFWALES.length(), SOUTHOFWALES, + lookupByName(SOUTHOFWALES))); + // but offsets doesn't include stopword when leading or trailing... 
+ assertTags(reqDoc(ACITYA), new TestTag(2, 6, "City", + lookupByName(ACITYA))); + //break on stop words + assertTags(reqDoc(SOUTHOFWALES, "ignoreStopwords", "false"));//match nothing + } + + /** Tests WordDelimiterGraphFilter, stacked/synonymous tokens at index time (catenate options) */ + @Test + public void testWDF() throws Exception { + baseParams.set("field", "name_tagWDF"); + + final String WINSTONSALEM = "City of Winston-Salem";//hyphen + final String BOSTONHARBOR = "Boston Harbor";//space + buildNames(WINSTONSALEM, BOSTONHARBOR); + + //round-trip works + assertTags(reqDoc(WINSTONSALEM), new TestTag(0, WINSTONSALEM.length(), WINSTONSALEM, + lookupByName(WINSTONSALEM))); + + // space separated works + final String WS_SPACE = WINSTONSALEM.replace('-', ' '); + assertTags(reqDoc(WS_SPACE), + new TestTag(0, WS_SPACE.length(), WS_SPACE, + lookupByName(WINSTONSALEM))); + + //must be full match + assertTags(reqDoc("Winston"));//match nothing + assertTags(reqDoc("Salem"));//match nothing + + // round-trip works + assertTags(reqDoc(BOSTONHARBOR), new TestTag(0, BOSTONHARBOR.length(), BOSTONHARBOR, + lookupByName(BOSTONHARBOR))); + + // hyphen separated works + final String BH_HYPHEN = BOSTONHARBOR.replace(' ', '-'); + assertTags(reqDoc(BH_HYPHEN), + new TestTag(0, BH_HYPHEN.length(), BH_HYPHEN, + lookupByName(BOSTONHARBOR))); + //must be full match + assertTags(reqDoc("Boston"));//match nothing + assertTags(reqDoc("Harbor"));//match nothing + } + + /** Ensure character offsets work for multi-byte characters */ + @Test + public void testMultibyteChar() throws Exception { + // https://unicode-table.com/en/2019/ + // 0 1 2 3 4 + // 01234567890123456789012345678901234567890 + String TEXT = "He mentionned ’Obama’ in the White House"; + assertEquals(40, TEXT.length()); // char length (in Java, UTF16) + + String QUOTE = TEXT.substring(14, 15); + assertEquals(8217, QUOTE.codePointAt(0)); + + //UTF8 + assertEquals(3, QUOTE.getBytes(StandardCharsets.UTF_8).length); + 
assertEquals(1, "a".getBytes(StandardCharsets.UTF_8).length); + assertEquals(40 + 2*2, TEXT.getBytes(StandardCharsets.UTF_8).length); + + //UTF16 big endian (by specifying big/little endian, there is no "byte order mark") + assertEquals(2, QUOTE.getBytes(StandardCharsets.UTF_16BE).length); + assertEquals(2, "a".getBytes(StandardCharsets.UTF_16BE).length); + assertEquals(40 * 2, TEXT.getBytes(StandardCharsets.UTF_16BE).length); + + + buildNames("Obama"); + + assertTags(TEXT, "Obama"); + + // TODO test surrogate pairs (i.e. code points not in the BMP) + } + +} diff --git a/solr/core/src/test/org/apache/solr/handler/tagger/TaggerTest.java b/solr/core/src/test/org/apache/solr/handler/tagger/TaggerTest.java new file mode 100644 index 00000000000..93b11b50a28 --- /dev/null +++ b/solr/core/src/test/org/apache/solr/handler/tagger/TaggerTest.java @@ -0,0 +1,296 @@ +/* + * This software was produced for the U. S. Government + * under Contract No. W15P7T-11-C-F600, and is + * subject to the Rights in Noncommercial Computer Software + * and Noncommercial Computer Software Documentation + * Clause 252.227-7014 (JUN 1995) + * + * Copyright 2013 The MITRE Corporation. All Rights Reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.solr.handler.tagger; + +import java.util.Arrays; +import java.util.stream.Collectors; + +import org.apache.solr.common.params.CommonParams; +import org.apache.solr.common.params.ModifiableSolrParams; +import org.apache.solr.request.SolrQueryRequest; +import org.junit.BeforeClass; +import org.junit.Ignore; + +/** + * The original test for {@link TaggerRequestHandler}. + */ +public class TaggerTest extends TaggerTestCase { + + @BeforeClass + public static void beforeClass() throws Exception { + initCore("solrconfig-tagger.xml", "schema-tagger.xml"); + } + + private void indexAndBuild() throws Exception { + N[] names = N.values(); + String[] namesStrs = new String[names.length]; + for (int i = 0; i < names.length; i++) { + namesStrs[i] = names[i].getName(); + } + buildNames(namesStrs); + } + + /** Name corpus */ + enum N { + //keep order to retain ord() + London, London_Business_School, Boston, City_of_London, + of, the//filtered out of the corpus by a custom query + ; + + String getName() { return name().replace('_',' '); } + static N lookupByName(String name) { return N.valueOf(name.replace(' ', '_')); } + int getId() { return ordinal(); } + } + + public void testFormat() throws Exception { + baseParams.set("overlaps", "NO_SUB"); + indexAndBuild(); + + String rspStr = _testFormatRequest(false); + String expected = "\n" + + "\n" + + "\n" + + "1\n" + + "\n" + + " \n" + + " 0\n" + + " 22\n" + + " \n" + + " 1\n" + + " \n" + + " \n" + + "\n" + + "\n" + + " \n" + + " 1\n" + + " London Business School\n" + + "\n" + + "\n"; + assertEquals(expected, rspStr); + } + + public void testFormatMatchText() throws Exception { + baseParams.set("overlaps", "NO_SUB"); + indexAndBuild(); + + String rspStr = _testFormatRequest(true); + String expected = "\n" + + "\n" + + "\n" + + "1\n" + + "\n" + + " \n" + + " 0\n" + + " 22\n" + + " london business school\n" + + " \n" + + " 1\n" + + " \n" + + " \n" + + "\n" + + "\n" + + " \n" + + " 1\n" + + " London Business 
School\n" + + "\n" + + "\n"; + assertEquals(expected, rspStr); + } + + private String _testFormatRequest(boolean matchText) throws Exception { + String doc = "london business school";//just one tag + SolrQueryRequest req = reqDoc(doc, "indent", "on", "omitHeader", "on", "matchText", ""+matchText); + String rspStr = h.query(req); + req.close(); + return rspStr; + } + + /** Partial matching, no sub-tags */ + @Ignore //TODO ConcatenateGraphFilter uses a special separator char that we can't put into XML (invalid char) + public void testPartialMatching() throws Exception { + baseParams.set("field", "name_tagPartial"); + baseParams.set("overlaps", "NO_SUB"); + baseParams.set("fq", "NOT name:(of the)");//test filtering + indexAndBuild(); + + //these match nothing + assertTags(reqDoc("") ); + assertTags(reqDoc(" ") ); + assertTags(reqDoc("the") ); + + String doc; + + //just London Business School via "school" substring + doc = "school"; + assertTags(reqDoc(doc), tt(doc,"school", 0, N.London_Business_School)); + + doc = "a school"; + assertTags(reqDoc(doc), tt(doc,"school", 0, N.London_Business_School)); + + doc = "school a"; + assertTags(reqDoc(doc), tt(doc,"school", 0, N.London_Business_School)); + + //More interesting + + doc = "school City"; + assertTags(reqDoc(doc), + tt(doc, "school", 0, N.London_Business_School), + tt(doc, "City", 0, N.City_of_London) ); + + doc = "City of London Business School"; + assertTags(reqDoc(doc), //no plain London (sub-tag) + tt(doc, "City of London", 0, N.City_of_London), + tt(doc, "London Business School", 0, N.London_Business_School)); + } + + /** whole matching, no sub-tags */ + public void testWholeMatching() throws Exception { + baseParams.set("overlaps", "NO_SUB"); + baseParams.set("fq", "NOT name:(of the)");//test filtering + indexAndBuild(); + + //these match nothing + assertTags(reqDoc("")); + assertTags(reqDoc(" ") ); + assertTags(reqDoc("the") ); + + //partial on N.London_Business_School matches nothing + 
assertTags(reqDoc("school") ); + assertTags(reqDoc("a school") ); + assertTags(reqDoc("school a") ); + assertTags(reqDoc("school City") ); + + String doc; + + doc = "school business london";//backwards + assertTags(reqDoc(doc), tt(doc,"london", 0, N.London)); + + doc = "of London Business School"; + assertTags(reqDoc(doc), //no plain London (sub-tag) + tt(doc, "London Business School", 0, N.London_Business_School)); + + //More interesting + doc = "City of London Business School"; + assertTags(reqDoc(doc), //no plain London (sub-tag) + tt(doc, "City of London", 0, N.City_of_London), + tt(doc, "London Business School", 0, N.London_Business_School)); + + doc = "City of London Business"; + assertTags(reqDoc(doc), //no plain London (sub-tag) no Business (partial-match) + tt(doc, "City of London", 0, N.City_of_London)); + + doc = "London Business magazine"; + assertTags(reqDoc(doc), //Just London; L.B.S. fails + tt(doc, "London", 0, N.London)); + } + + /** whole matching, with sub-tags */ + public void testSubTags() throws Exception { + baseParams.set("overlaps", "ALL"); + baseParams.set("fq", "NOT name:(of the)");//test filtering + indexAndBuild(); + + //these match nothing + assertTags(reqDoc("")); + assertTags(reqDoc(" ") ); + assertTags(reqDoc("the") ); + + //partial on N.London_Business_School matches nothing + assertTags(reqDoc("school") ); + assertTags(reqDoc("a school") ); + assertTags(reqDoc("school a") ); + assertTags(reqDoc("school City") ); + + String doc; + + doc = "school business london";//backwards + assertTags(reqDoc(doc), tt(doc,"london", 0, N.London)); + + //More interesting + doc = "City of London Business School"; + assertTags(reqDoc(doc), + tt(doc, "City of London", 0, N.City_of_London), + tt(doc, "London", 0, N.London), + tt(doc, "London Business School", 0, N.London_Business_School)); + + doc = "City of London Business"; + assertTags(reqDoc(doc), + tt(doc, "City of London", 0, N.City_of_London), + tt(doc, "London", 0, N.London)); + } + + public 
void testMultipleFilterQueries() throws Exception { + baseParams.set("overlaps", "ALL"); + + // build up the corpus with some additional fields for filtering purposes + deleteByQueryAndGetVersion("*:*", null); + + int i = 0; + assertU(adoc("id", ""+i++, "name", N.London.getName(), "type", "city", "country", "UK")); + assertU(adoc("id", ""+i++, "name", N.London_Business_School.getName(), "type", "school", "country", "UK")); + assertU(adoc("id", ""+i++, "name", N.Boston.getName(), "type", "city", "country", "US")); + assertU(adoc("id", ""+i++, "name", N.City_of_London.getName(), "type", "org", "country", "UK")); + assertU(commit()); + + // not calling buildNames so that we can bring along extra attributes for filtering + NAMES = Arrays.stream(N.values()).map(N::getName).collect(Collectors.toList()); + + // phrase that matches everything + String doc = "City of London Business School in Boston"; + + // first do no filtering + ModifiableSolrParams p = new ModifiableSolrParams(); + p.add(CommonParams.Q, "*:*"); + assertTags(reqDoc(doc, p), + tt(doc, "City of London", 0, N.City_of_London), + tt(doc, "London", 0, N.London), + tt(doc, "London Business School", 0, N.London_Business_School), + tt(doc, "Boston", 0, N.Boston)); + + // add a single fq + p.add(CommonParams.FQ, "type:city"); + assertTags(reqDoc(doc, p), + tt(doc, "London", 0, N.London), + tt(doc, "Boston", 0, N.Boston)); + + // add another fq + p.add(CommonParams.FQ, "country:US"); + assertTags(reqDoc(doc, p), + tt(doc, "Boston", 0, N.Boston)); + } + + private TestTag tt(String doc, String substring, int substringIndex, N name) { + assert substringIndex == 0; + + //little bit of copy-paste code from super.tt() + int startOffset = -1, endOffset; + int substringIndex1 = 0; + for(int i = 0; i <= substringIndex1; i++) { + startOffset = doc.indexOf(substring, ++startOffset); + assert startOffset >= 0 : "The test itself is broken"; + } + endOffset = startOffset+ substring.length();//1 greater (exclusive) + return new 
TestTag(startOffset, endOffset, substring, lookupByName(name.getName())); + } + +} diff --git a/solr/core/src/test/org/apache/solr/handler/tagger/TaggerTestCase.java b/solr/core/src/test/org/apache/solr/handler/tagger/TaggerTestCase.java new file mode 100644 index 00000000000..e525ce9265a --- /dev/null +++ b/solr/core/src/test/org/apache/solr/handler/tagger/TaggerTestCase.java @@ -0,0 +1,251 @@ +/* + * This software was produced for the U. S. Government + * under Contract No. W15P7T-11-C-F600, and is + * subject to the Rights in Noncommercial Computer Software + * and Noncommercial Computer Software Documentation + * Clause 252.227-7014 (JUN 1995) + * + * Copyright 2013 The MITRE Corporation. All Rights Reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.solr.handler.tagger; + +import java.io.IOException; +import java.lang.invoke.MethodHandles; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.Collections; +import java.util.HashMap; +import java.util.List; +import java.util.Map; +import java.util.TreeSet; + +import org.apache.commons.lang.builder.CompareToBuilder; +import org.apache.commons.lang.builder.EqualsBuilder; +import org.apache.lucene.document.Document; +import org.apache.solr.SolrTestCaseJ4; +import org.apache.solr.common.params.CommonParams; +import org.apache.solr.common.params.ModifiableSolrParams; +import org.apache.solr.common.params.SolrParams; +import org.apache.solr.common.util.ContentStream; +import org.apache.solr.common.util.ContentStreamBase; +import org.apache.solr.common.util.NamedList; +import org.apache.solr.request.SolrQueryRequest; +import org.apache.solr.request.SolrQueryRequestBase; +import org.apache.solr.response.SolrQueryResponse; +import org.apache.solr.search.DocIterator; +import org.apache.solr.search.DocList; +import org.apache.solr.search.SolrIndexSearcher; +import org.junit.Rule; +import org.junit.rules.TestWatcher; +import org.junit.runner.Description; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +public abstract class TaggerTestCase extends SolrTestCaseJ4 { + + private static final Logger log = LoggerFactory.getLogger(MethodHandles.lookup().lookupClass()); + + @Rule + public TestWatcher watchman = new TestWatcher() { + @Override + protected void starting(Description description) { + log.info("{} being run...", description.getDisplayName()); + } + }; + + protected final ModifiableSolrParams baseParams = new ModifiableSolrParams(); + + //populated in buildNames; tested in assertTags + protected static List NAMES; + + @Override + public void setUp() throws Exception { + super.setUp(); + baseParams.clear(); + baseParams.set(CommonParams.QT, "/tag"); + baseParams.set(CommonParams.WT, "xml"); + } + + protected void 
assertTags(String doc, String... tags) throws Exception { + TestTag[] tts = new TestTag[tags.length]; + for (int i = 0; i < tags.length; i++) { + tts[i] = tt(doc, tags[i]); + } + assertTags(reqDoc(doc), tts); + } + + protected static void buildNames(String... names) throws Exception { + deleteByQueryAndGetVersion("*:*", null); + NAMES = Arrays.asList(names); + //Collections.sort(NAMES); + int i = 0; + for (String n : NAMES) { + assertU(adoc("id", ""+(i++), "name", n)); + } + assertU(commit()); + } + + protected String lookupByName(String name) { + for (String n : NAMES) { + if (n.equalsIgnoreCase(name)) + return n; + } + return null; + } + + protected TestTag tt(String doc, String substring) { + int startOffset = -1, endOffset; + int substringIndex = 0; + for(int i = 0; i <= substringIndex; i++) { + startOffset = doc.indexOf(substring,++startOffset); + assert startOffset >= 0 : "The test itself is broken"; + } + endOffset = startOffset+substring.length();//1 greater (exclusive) + return new TestTag(startOffset, endOffset, substring, lookupByName(substring)); + } + + /** Asserts the tags. Will call req.close(). */ + protected void assertTags(SolrQueryRequest req, TestTag... 
eTags) throws Exception { + try { + SolrQueryResponse rsp = h.queryAndResponse(req.getParams().get(CommonParams.QT), req); + TestTag[] aTags = pullTagsFromResponse(req, rsp); + + String message; + if (aTags.length > 10) + message = null; + else + message = Arrays.asList(aTags).toString(); + Arrays.sort(eTags); + assertSortedArrayEquals(message, eTags, aTags); + + } finally { + req.close(); + } + } + + @SuppressWarnings("unchecked") + protected TestTag[] pullTagsFromResponse(SolrQueryRequest req, SolrQueryResponse rsp ) throws IOException { + NamedList rspValues = rsp.getValues(); + Map matchingNames = new HashMap<>(); + SolrIndexSearcher searcher = req.getSearcher(); + DocList docList = (DocList) rspValues.get("response"); + DocIterator iter = docList.iterator(); + while (iter.hasNext()) { + int docId = iter.next(); + Document doc = searcher.doc(docId); + String id = doc.getField("id").stringValue(); + String name = lookupByName(doc.get("name")); + assertEquals("looking for "+name, NAMES.indexOf(name)+"", id); + matchingNames.put(id, name); + } + + //build TestTag[] aTags from response ('a' is actual) + List mTagsList = (List) rspValues.get("tags"); + List aTags = new ArrayList<>(); + for (NamedList map : mTagsList) { + List foundIds = (List) map.get("ids"); + for (String id : foundIds) { + aTags.add(new TestTag( + ((Number)map.get("startOffset")).intValue(), + ((Number)map.get("endOffset")).intValue(), + null, + matchingNames.get(id))); + } + } + return aTags.toArray(new TestTag[0]); + } + + /** REMEMBER to close() the result req object. */ + protected SolrQueryRequest reqDoc(String doc, String... moreParams) { + return reqDoc(doc, params(moreParams)); + } + + /** REMEMBER to close() the result req object. 
*/ + protected SolrQueryRequest reqDoc(String doc, SolrParams moreParams) { + log.debug("Test doc: "+doc); + SolrParams params = SolrParams.wrapDefaults(moreParams, baseParams); + SolrQueryRequestBase req = new SolrQueryRequestBase(h.getCore(), params) {}; + Iterable stream = Collections.singleton((ContentStream)new ContentStreamBase.StringStream(doc)); + req.setContentStreams(stream); + return req; + } + + /** Asserts the sorted arrays are equals, with a helpful error message when not.*/ + public void assertSortedArrayEquals(String message, Object[] expecteds, Object[] actuals) { + AssertionError error = null; + try { + assertArrayEquals(null, expecteds, actuals); + } catch (AssertionError e) { + error = e; + } + if (error == null) + return; + TreeSet expectedRemaining = new TreeSet<>(Arrays.asList(expecteds)); + expectedRemaining.removeAll(Arrays.asList(actuals)); + if (!expectedRemaining.isEmpty()) + fail(message+": didn't find expected "+expectedRemaining.first()+" (of "+expectedRemaining.size()+"); "+ error); + TreeSet actualsRemaining = new TreeSet<>(Arrays.asList(actuals)); + actualsRemaining.removeAll(Arrays.asList(expecteds)); + fail(message+": didn't expect "+actualsRemaining.first()+" (of "+actualsRemaining.size()+"); "+ error); + } + + class TestTag implements Comparable { + final int startOffset, endOffset; + final String substring; + final String docName; + + TestTag(int startOffset, int endOffset, String substring, String docName) { + this.startOffset = startOffset; + this.endOffset = endOffset; + this.substring = substring; + this.docName = docName; + } + + @Override + public String toString() { + return "TestTag{" + + "[" + startOffset + "-" + endOffset + "]" + + " doc=" + NAMES.indexOf(docName) + ":'" + docName + "'" + + (docName.equals(substring) || substring == null ? 
"" : " substr="+substring)+ + '}'; + } + + @Override + public boolean equals(Object obj) { + TestTag that = (TestTag) obj; + return new EqualsBuilder() + .append(this.startOffset, that.startOffset) + .append(this.endOffset, that.endOffset) + .append(this.docName, that.docName) + .isEquals(); + } + + @Override + public int hashCode() { + return startOffset;//cheesy but acceptable + } + + @Override + public int compareTo(Object o) { + TestTag that = (TestTag) o; + return new CompareToBuilder() + .append(this.startOffset, that.startOffset) + .append(this.endOffset, that.endOffset) + .append(this.docName,that.docName) + .toComparison(); + } + } +} diff --git a/solr/core/src/test/org/apache/solr/handler/tagger/TaggingAttributeTest.java b/solr/core/src/test/org/apache/solr/handler/tagger/TaggingAttributeTest.java new file mode 100644 index 00000000000..39c78286713 --- /dev/null +++ b/solr/core/src/test/org/apache/solr/handler/tagger/TaggingAttributeTest.java @@ -0,0 +1,73 @@ +/* + * This software was produced for the U. S. Government + * under Contract No. W15P7T-11-C-F600, and is + * subject to the Rights in Noncommercial Computer Software + * and Noncommercial Computer Software Documentation + * Clause 252.227-7014 (JUN 1995) + * + * Copyright 2013 The MITRE Corporation. All Rights Reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.solr.handler.tagger; + +import org.junit.BeforeClass; +import org.junit.Test; + +/** + * Test the {@link TaggerRequestHandler} with + * a Analyzer chain that does use the {@link TaggingAttribute}. See the test + * configuration under 'taggingattribute'. + */ +public class TaggingAttributeTest extends TaggerTestCase { + + @BeforeClass + public static void beforeClass() throws Exception { + initCore("solrconfig-tagger.xml", "schema-tagger.xml"); + } + + /** + * Whole matching, no sub-tags. Links only words with > 3 letters. + * Because of that "San" is not used to start tags + * + */ + @Test + public void testTaggingAttribute() throws Exception { + baseParams.set("field", "name_tagAttribute"); // has WordLengthTaggingFilter using the TaggingAttribute + // this test is based on the longest dominant right test, so we use the + // the same TagClusterReducer setting + baseParams.set("overlaps", "LONGEST_DOMINANT_RIGHT"); + + buildNames("in", "San", "in San", "Francisco", "San Francisco", + "San Francisco State College", "College of California", + "Clayton", "Clayton North", "North Carolina"); + + assertTags("He lived in San Francisco.", + //"in", "San Francisco"); //whis would be expected without taggable + "Francisco");// this are the expected results with taggable + + assertTags("He enrolled in San Francisco State College of California", + //"in", "San Francisco State College"); //without taggable enabled + "Francisco", "College of California");// With taggable + //NOTE this also tests that started tags are advanced for non-taggable + // tokens, as otherwise 'College of California' would not be + // suggested. 
+ + assertTags("He lived in Clayton North Carolina", + //"in", "Clayton", "North Carolina"); + "Clayton", "North Carolina"); + + } + +} diff --git a/solr/core/src/test/org/apache/solr/handler/tagger/WordLengthTaggingFilter.java b/solr/core/src/test/org/apache/solr/handler/tagger/WordLengthTaggingFilter.java new file mode 100644 index 00000000000..237a8b82c39 --- /dev/null +++ b/solr/core/src/test/org/apache/solr/handler/tagger/WordLengthTaggingFilter.java @@ -0,0 +1,110 @@ +/* + * This software was produced for the U. S. Government + * under Contract No. W15P7T-11-C-F600, and is + * subject to the Rights in Noncommercial Computer Software + * and Noncommercial Computer Software Documentation + * Clause 252.227-7014 (JUN 1995) + * + * Copyright 2013 The MITRE Corporation. All Rights Reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.solr.handler.tagger; + +import java.io.IOException; + +import org.apache.lucene.analysis.TokenFilter; +import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.tokenattributes.OffsetAttribute; + +/** + * Simple TokenFilter that lookup only Tokens with more as the parsed number + * of chars.

    + * NOTE: This implementation is only intended to be used as an example + * and for unit testing the {@link TaggingAttribute} feature. Typically, + * implementations will be based on NLP results (e.g. using POS tags or + * detected Named Entities). + *

    + * Example Usage:

    + * Currently, usage requires modifying the Analyzer defined by the + * indexedField. An alternative would be to allow the configuration + * of a special FieldType in the schema.xml and use this Analyzer for processing + * the text sent to the request.

    + * While the current solution is fine for direct API usage, defining the + * Analyzer in the schema.xml would be better suited to using this feature + * with the {@link TaggerRequestHandler}. + * + *

    + *     Analyzer analyzer = req.getSchema().getField(indexedField).getType().getAnalyzer();
    + *     //get the TokenStream from the Analyzer
    + *     TokenStream baseStream = analyzer.tokenStream("", reader);
    + *     //add a TokenFilter to the end of the chain that sets the TaggingAttribute
    + *     TokenStream filterStream = new WordLengthTaggingFilter(baseStream);
    + *     //create the Tagger using the modified analyzer chain.
    + *     new Tagger(corpus, filterStream, tagClusterReducer) {
    + *
    + *         protected void tagCallback(int startOffset, int endOffset, long docIdsKey) {
    + *             //implement the callback
    + *         }
    + *
    + *     }.process();
    + * 
    + */ +public class WordLengthTaggingFilter extends TokenFilter { + + /** + * The default minimum length is 3 + */ + public static final int DEFAULT_MIN_LENGTH = 3; + private final TaggingAttribute lookupAtt = addAttribute(TaggingAttribute.class); + private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class); + private int minLength; + + /** + * TokenFilter only marks tokens to be looked up with equals or more as + * {@link #DEFAULT_MIN_LENGTH} characters + */ + public WordLengthTaggingFilter(TokenStream input) { + this(input, null); + } + + /** + * TokenFilter only marks tokens to be looked up with equals or more characters + * as the parsed minimum. + * + * @param input the TokenStream to consume tokens from + * @param minLength The minimum length to lookup a Token. null + * or <= 0 to use the #DEFAULT_MIN_LENGTH + */ + public WordLengthTaggingFilter(TokenStream input, Integer minLength) { + super(input); + if (minLength == null || minLength <= 0) { + this.minLength = DEFAULT_MIN_LENGTH; + } else { + this.minLength = minLength; + } + } + + @Override + public final boolean incrementToken() throws IOException { + if (input.incrementToken()) { + int size = offsetAtt.endOffset() - offsetAtt.startOffset(); + lookupAtt.setTaggable(size >= minLength); + return true; + } else { + return false; + } + } + +} diff --git a/solr/core/src/test/org/apache/solr/handler/tagger/WordLengthTaggingFilterFactory.java b/solr/core/src/test/org/apache/solr/handler/tagger/WordLengthTaggingFilterFactory.java new file mode 100644 index 00000000000..dbfc5381bb6 --- /dev/null +++ b/solr/core/src/test/org/apache/solr/handler/tagger/WordLengthTaggingFilterFactory.java @@ -0,0 +1,67 @@ +/* + * This software was produced for the U. S. Government + * under Contract No. 
W15P7T-11-C-F600, and is + * subject to the Rights in Noncommercial Computer Software + * and Noncommercial Computer Software Documentation + * Clause 252.227-7014 (JUN 1995) + * + * Copyright 2013 The MITRE Corporation. All Rights Reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.solr.handler.tagger; + +import java.lang.invoke.MethodHandles; +import java.util.Map; + +import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.util.TokenFilterFactory; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +public class WordLengthTaggingFilterFactory extends TokenFilterFactory { + + private static final Logger log = LoggerFactory.getLogger(MethodHandles.lookup().lookupClass()); + + public static final String MIN_LENGTH = "minLength"; + + private final Integer minLength; + + public WordLengthTaggingFilterFactory(Map args) { + super(args); + int minLength = -1; + Object value = args.get(MIN_LENGTH); + if (value != null) { + try { + minLength = Integer.parseInt(value.toString()); + } catch (NumberFormatException e) { + log.warn("Unable to parse minLength from value 'minLength=\"{}\"'", value); + + } + } + if (minLength <= 0) { + log.info("use default minLength={}", WordLengthTaggingFilter.DEFAULT_MIN_LENGTH); + this.minLength = null; + } else { + log.info("set minLength={}", minLength); + this.minLength = minLength; + } + } + + @Override + public TokenStream create(TokenStream input) { + return new 
WordLengthTaggingFilter(input, minLength); + } + +} diff --git a/solr/core/src/test/org/apache/solr/handler/tagger/XmlInterpolationTest.java b/solr/core/src/test/org/apache/solr/handler/tagger/XmlInterpolationTest.java new file mode 100644 index 00000000000..d7dd5dff213 --- /dev/null +++ b/solr/core/src/test/org/apache/solr/handler/tagger/XmlInterpolationTest.java @@ -0,0 +1,224 @@ +/* + * This software was produced for the U. S. Government + * under Contract No. W15P7T-11-C-F600, and is + * subject to the Rights in Noncommercial Computer Software + * and Noncommercial Computer Software Documentation + * Clause 252.227-7014 (JUN 1995) + * + * Copyright 2013 The MITRE Corporation. All Rights Reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.solr.handler.tagger; + +import javax.xml.parsers.DocumentBuilder; +import javax.xml.parsers.DocumentBuilderFactory; +import java.io.IOException; +import java.io.Reader; +import java.io.StringReader; +import java.util.ArrayList; +import java.util.Collections; +import java.util.List; +import java.util.Locale; + +import org.apache.commons.io.IOUtils; +import org.apache.lucene.analysis.charfilter.HTMLStripCharFilter; +import org.apache.lucene.analysis.core.WhitespaceTokenizer; +import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; +import org.apache.lucene.analysis.tokenattributes.OffsetAttribute; +import org.apache.solr.common.SolrException; +import org.apache.solr.request.SolrQueryRequest; +import org.apache.solr.response.SolrQueryResponse; +import org.junit.BeforeClass; +import org.junit.Test; +import org.xml.sax.InputSource; + +public class XmlInterpolationTest extends TaggerTestCase { + + private static DocumentBuilder xmlDocBuilder; + + + @BeforeClass + public static void beforeClass() throws Exception { + DocumentBuilderFactory xmlDocBuilderFactory = DocumentBuilderFactory.newInstance(); + xmlDocBuilderFactory.setValidating(true); + xmlDocBuilderFactory.setNamespaceAware(true); + xmlDocBuilder = xmlDocBuilderFactory.newDocumentBuilder(); + + initCore("solrconfig-tagger.xml", "schema-tagger.xml"); + } + + @Override + public void setUp() throws Exception { + super.setUp(); + baseParams.set("field", "name_tagXml"); + baseParams.set("overlaps", "LONGEST_DOMINANT_RIGHT"); + baseParams.set("xmlOffsetAdjust", "true"); + } + + @Test + public void test() throws Exception { + buildNames("start end"); + + assertXmlTag("before start end after", true); + assertXmlTag("before start
    end after
    ", true); + assertXmlTag("before start end after", true); + assertXmlTag("before start end after", true); + assertXmlTag("before start end after", true); + assertXmlTag("before start end after", true);//adjacent tags + assertXmlTag("before start end after", true); + assertXmlTag("before start end after", true); + + assertXmlTag("

    before start

    end after
    ", false); + assertXmlTag("before start

    end after

    ", false); + + assertXmlTag("before start end after", true); + } + + @Test(expected = SolrException.class) + public void testInvalidXml() throws Exception { + assertXmlTag("notXml", false); + } + + @Test(expected = Exception.class) + public void testValidatingXml() throws Exception { + validateXml("foo"); + } + + protected void assertXmlTag(String docText, boolean expected) throws Exception { + final SolrQueryRequest req = reqDoc(docText); + try { // 5.4 and beyond we can use try-with-resources + final SolrQueryResponse rsp = h.queryAndResponse(req.getParams().get("qt"), req); + final TestTag[] testTags = pullTagsFromResponse(req, rsp); + if (!expected) { + assertEquals(0, testTags.length); + } else { + assertEquals(1, testTags.length); + final TestTag tag = testTags[0]; + validateXml(insertAnchorAtOffsets(docText, tag.startOffset, tag.endOffset, tag.docName)); + } + } finally { + req.close(); + } + } + + protected void validateXml(String xml) throws Exception { + // the "parse" method also validates XML, will throw an exception if mis-formatted + xmlDocBuilder.parse(new InputSource(new StringReader(xml))); + } + + + @Test + public void testLuceneHtmlFilterBehavior() { + String docText; + + //Close tag adjacent to start & end results in end offset including the close tag. LUCENE-5734 + docText = "start end"; + assertArrayEquals(tagExpect(docText, "start", "end"), analyzeTagOne(docText, "start", "end")); + + //Space after "end" means offset doesn't include + docText = "start end "; + assertArrayEquals(tagExpect(docText, "start", "end"), analyzeTagOne(docText, "start", "end")); + + //Matches entity at end + final String endStr = String.format(Locale.ROOT, "en&#x%02x;", (int) 'd'); + docText = "start " + endStr + ""; + assertArrayEquals(tagExpect(docText, "start", endStr), analyzeTagOne(docText, "start", "end")); + //... 
and at start + final String startStr = String.format(Locale.ROOT, "&#x%02x;tart", (int) 's'); + docText = "" + startStr + " end"; + assertArrayEquals(tagExpect(docText, startStr, "end"), analyzeTagOne(docText, "start", "end")); + + //Test ignoring proc instructions & comments. Note: doesn't expand the entity to "start". + docText = "" + + "]>&start;"; + assertArrayEquals(new int[]{-1, -1}, analyzeTagOne(docText, "start", "start")); + + //Test entity behavior + docText = " — – & &foo;   a b"; + assertArrayEquals(new String[]{"—", "–", "&", "&foo;", "\u00A0", "a", "b"}, + analyzeReturnTokens(docText)); + + //Observe offset adjustment of trailing entity to end tag + docText = "foo bar"; + assertArrayEquals(tagExpect(docText, "foo", "foo"), analyzeTagOne(docText, "foo", "foo")); + } + + private String insertAnchorAtOffsets(String docText, int startOffset, int endOffset, String id) { + String insertStart = "";// (normally we'd escape id) + String insertEnd = ""; + return docText.substring(0, startOffset) + + insertStart + + docText.substring(startOffset, endOffset) + + insertEnd + + docText.substring(endOffset); + } + + private int[] tagExpect(String docText, String start, String end) { + return new int[]{docText.indexOf(start), docText.indexOf(end) + end.length()}; + } + + private int[] analyzeTagOne(String docText, String start, String end) { + int[] result = {-1, -1}; + + Reader filter = new HTMLStripCharFilter(new StringReader(docText)); + + WhitespaceTokenizer ts = new WhitespaceTokenizer(); + final CharTermAttribute termAttribute = ts.addAttribute(CharTermAttribute.class); + final OffsetAttribute offsetAttribute = ts.addAttribute(OffsetAttribute.class); + try { + ts.setReader(filter); + ts.reset(); + while (ts.incrementToken()) { + final String termString = termAttribute.toString(); + if (termString.equals(start)) + result[0] = offsetAttribute.startOffset(); + if (termString.equals(end)) { + result[1] = offsetAttribute.endOffset(); + return result; + } + } + 
ts.end(); + } catch (IOException e) { + throw new RuntimeException(e); + } finally { + IOUtils.closeQuietly(ts); + } + return result; + } + + private String[] analyzeReturnTokens(String docText) { + List result = new ArrayList<>(); + + Reader filter = new HTMLStripCharFilter(new StringReader(docText), + Collections.singleton("unescaped")); + WhitespaceTokenizer ts = new WhitespaceTokenizer(); + final CharTermAttribute termAttribute = ts.addAttribute(CharTermAttribute.class); + try { + ts.setReader(filter); + ts.reset(); + while (ts.incrementToken()) { + result.add(termAttribute.toString()); + } + ts.end(); + } catch (IOException e) { + throw new RuntimeException(e); + } finally { + IOUtils.closeQuietly(ts); + } + return result.toArray(new String[result.size()]); + } + +} diff --git a/solr/core/src/test/org/apache/solr/search/TestRecovery.java b/solr/core/src/test/org/apache/solr/search/TestRecovery.java index 1d622076c99..1b79cee61c1 100644 --- a/solr/core/src/test/org/apache/solr/search/TestRecovery.java +++ b/solr/core/src/test/org/apache/solr/search/TestRecovery.java @@ -24,7 +24,9 @@ import com.codahale.metrics.Gauge; import com.codahale.metrics.Meter; import com.codahale.metrics.Metric; import com.codahale.metrics.MetricRegistry; +import org.apache.solr.common.util.TimeSource; import org.apache.solr.metrics.SolrMetricManager; +import org.apache.solr.util.TimeOut; import org.noggit.ObjectBuilder; import org.slf4j.Logger; @@ -820,6 +822,7 @@ public class TestRecovery extends SolrTestCaseJ4 { +"]" ); + // Note that the v101->v103 are dropped, therefore it does not present in RTG assertJQ(req("qt","/get", "getVersions","6") ,"=={'versions':["+String.join(",",v206,v205,v201,v200,v105,v104)+"]}" ); @@ -929,7 +932,6 @@ public class TestRecovery extends SolrTestCaseJ4 { ,"=={'versions':["+v105+","+v104+"]}" ); - // this time add some docs first before buffering starts (so tlog won't be at pos 0) updateJ(jsonAdd(sdoc("id","c100", "_version_",v200)), 
params(DISTRIB_UPDATE_PARAM,FROM_LEADER)); updateJ(jsonAdd(sdoc("id","c101", "_version_",v201)), params(DISTRIB_UPDATE_PARAM,FROM_LEADER)); @@ -957,10 +959,8 @@ public class TestRecovery extends SolrTestCaseJ4 { +"" +"]" ); - // The updates that were buffered (but never applied) still appear in recent versions! - // This is good for some uses, but may not be good for others. - assertJQ(req("qt","/get", "getVersions","11") - ,"=={'versions':["+String.join(",",v206,v205,v204,v203,v201,v200,v105,v104,v103,v102,v101)+"]}" + assertJQ(req("qt","/get", "getVersions","6") + ,"=={'versions':["+String.join(",",v206,v205,v201,v200,v105,v104)+"]}" ); assertEquals(UpdateLog.State.ACTIVE, ulog.getState()); // leave each test method in a good state @@ -1008,13 +1008,9 @@ public class TestRecovery extends SolrTestCaseJ4 { @Test - public void testBufferingFlags() throws Exception { + public void testExistOldBufferLog() throws Exception { DirectUpdateHandler2.commitOnClose = false; - final Semaphore logReplayFinish = new Semaphore(0); - - UpdateLog.testing_logReplayFinishHook = () -> logReplayFinish.release(); - SolrQueryRequest req = req(); UpdateHandler uhandler = req.getCore().getUpdateHandler(); @@ -1024,9 +1020,6 @@ public class TestRecovery extends SolrTestCaseJ4 { String v101 = getNextVersion(); String v102 = getNextVersion(); String v103 = getNextVersion(); - String v114 = getNextVersion(); - String v115 = getNextVersion(); - String v116 = getNextVersion(); String v117 = getNextVersion(); clearIndex(); @@ -1049,14 +1042,10 @@ public class TestRecovery extends SolrTestCaseJ4 { uhandler = req.getCore().getUpdateHandler(); ulog = uhandler.getUpdateLog(); - logReplayFinish.acquire(); // wait for replay to finish - - assertTrue((ulog.getStartingOperation() & UpdateLog.FLAG_GAP) != 0); // since we died while buffering, we should see this last - - // - // Try again to ensure that the previous log replay didn't wipe out our flags - // + // the core does not replay updates from 
buffer tlog on startup + assertTrue(ulog.existOldBufferLog()); // since we died while buffering, we should see this last + // buffer tlog won't be removed on restart req.close(); h.close(); createCore(); @@ -1065,26 +1054,9 @@ public class TestRecovery extends SolrTestCaseJ4 { uhandler = req.getCore().getUpdateHandler(); ulog = uhandler.getUpdateLog(); - assertTrue((ulog.getStartingOperation() & UpdateLog.FLAG_GAP) != 0); - - // now do some normal non-buffered adds - updateJ(jsonAdd(sdoc("id","Q4", "_version_",v114)), params(DISTRIB_UPDATE_PARAM,FROM_LEADER)); - updateJ(jsonAdd(sdoc("id","Q5", "_version_",v115)), params(DISTRIB_UPDATE_PARAM,FROM_LEADER)); - updateJ(jsonAdd(sdoc("id","Q6", "_version_",v116)), params(DISTRIB_UPDATE_PARAM,FROM_LEADER)); - assertU(commit()); - - req.close(); - h.close(); - createCore(); - - req = req(); - uhandler = req.getCore().getUpdateHandler(); - ulog = uhandler.getUpdateLog(); - - assertTrue((ulog.getStartingOperation() & UpdateLog.FLAG_GAP) == 0); + assertTrue(ulog.existOldBufferLog()); ulog.bufferUpdates(); - // simulate receiving no updates ulog.applyBufferedUpdates(); updateJ(jsonAdd(sdoc("id","Q7", "_version_",v117)), params(DISTRIB_UPDATE_PARAM,FROM_LEADER)); // do another add to make sure flags are back to normal @@ -1096,10 +1068,12 @@ public class TestRecovery extends SolrTestCaseJ4 { uhandler = req.getCore().getUpdateHandler(); ulog = uhandler.getUpdateLog(); - assertTrue((ulog.getStartingOperation() & UpdateLog.FLAG_GAP) == 0); // check flags on Q7 - - logReplayFinish.acquire(); - assertEquals(UpdateLog.State.ACTIVE, ulog.getState()); // leave each test method in a good state + assertFalse(ulog.existOldBufferLog()); + // Timeout for Q7 get replayed, because it was added on tlog, therefore it will be replayed on restart + TimeOut timeout = new TimeOut(10, TimeUnit.SECONDS, TimeSource.NANO_TIME); + timeout.waitFor("Timeout waiting for finish replay updates", + () -> 
h.getCore().getUpdateHandler().getUpdateLog().getState() == UpdateLog.State.ACTIVE); + assertJQ(req("qt","/get", "id", "Q7") ,"/doc/id==Q7"); } finally { DirectUpdateHandler2.commitOnClose = true; UpdateLog.testing_logReplayHook = null; diff --git a/solr/core/src/test/org/apache/solr/search/TestRecoveryHdfs.java b/solr/core/src/test/org/apache/solr/search/TestRecoveryHdfs.java index e6bb9a6edb0..1796319295d 100644 --- a/solr/core/src/test/org/apache/solr/search/TestRecoveryHdfs.java +++ b/solr/core/src/test/org/apache/solr/search/TestRecoveryHdfs.java @@ -44,6 +44,7 @@ import org.apache.hadoop.hdfs.MiniDFSCluster; import org.apache.solr.SolrTestCaseJ4; import org.apache.solr.cloud.hdfs.HdfsTestUtil; import org.apache.solr.common.util.IOUtils; +import org.apache.solr.common.util.TimeSource; import org.apache.solr.request.SolrQueryRequest; import org.apache.solr.update.DirectUpdateHandler2; import org.apache.solr.update.HdfsUpdateLog; @@ -51,6 +52,7 @@ import org.apache.solr.update.UpdateHandler; import org.apache.solr.update.UpdateLog; import org.apache.solr.update.processor.DistributedUpdateProcessor.DistribPhase; import org.apache.solr.util.BadHdfsThreadsFilter; +import org.apache.solr.util.TimeOut; import org.junit.AfterClass; import org.junit.BeforeClass; import org.junit.Ignore; @@ -515,13 +517,9 @@ public class TestRecoveryHdfs extends SolrTestCaseJ4 { @Test - public void testBufferingFlags() throws Exception { + public void testExistOldBufferLog() throws Exception { DirectUpdateHandler2.commitOnClose = false; - final Semaphore logReplayFinish = new Semaphore(0); - - UpdateLog.testing_logReplayFinishHook = () -> logReplayFinish.release(); - SolrQueryRequest req = req(); UpdateHandler uhandler = req.getCore().getUpdateHandler(); @@ -548,14 +546,10 @@ public class TestRecoveryHdfs extends SolrTestCaseJ4 { uhandler = req.getCore().getUpdateHandler(); ulog = uhandler.getUpdateLog(); - logReplayFinish.acquire(); // wait for replay to finish - - 
assertTrue((ulog.getStartingOperation() & UpdateLog.FLAG_GAP) != 0); // since we died while buffering, we should see this last - - // - // Try again to ensure that the previous log replay didn't wipe out our flags - // + // the core no longer replays updates from the buffer tlog on startup + assertTrue(ulog.existOldBufferLog()); // since we died while buffering, we should see this last + // buffer tlog won't be removed on restart req.close(); h.close(); createCore(); @@ -564,23 +558,7 @@ public class TestRecoveryHdfs extends SolrTestCaseJ4 { uhandler = req.getCore().getUpdateHandler(); ulog = uhandler.getUpdateLog(); - assertTrue((ulog.getStartingOperation() & UpdateLog.FLAG_GAP) != 0); - - // now do some normal non-buffered adds - updateJ(jsonAdd(sdoc("id","Q4", "_version_","114")), params(DISTRIB_UPDATE_PARAM,FROM_LEADER)); - updateJ(jsonAdd(sdoc("id","Q5", "_version_","115")), params(DISTRIB_UPDATE_PARAM,FROM_LEADER)); - updateJ(jsonAdd(sdoc("id","Q6", "_version_","116")), params(DISTRIB_UPDATE_PARAM,FROM_LEADER)); - assertU(commit()); - - req.close(); - h.close(); - createCore(); - - req = req(); - uhandler = req.getCore().getUpdateHandler(); - ulog = uhandler.getUpdateLog(); - - assertTrue((ulog.getStartingOperation() & UpdateLog.FLAG_GAP) == 0); + assertTrue(ulog.existOldBufferLog()); ulog.bufferUpdates(); // simulate receiving no updates @@ -595,10 +573,12 @@ public class TestRecoveryHdfs extends SolrTestCaseJ4 { uhandler = req.getCore().getUpdateHandler(); ulog = uhandler.getUpdateLog(); - assertTrue((ulog.getStartingOperation() & UpdateLog.FLAG_GAP) == 0); // check flags on Q7 - - logReplayFinish.acquire(); - assertEquals(UpdateLog.State.ACTIVE, ulog.getState()); // leave each test method in a good state + assertFalse(ulog.existOldBufferLog()); + // Wait for Q7 to be replayed: it was written to the tlog, so it will be replayed on restart + TimeOut timeout = new TimeOut(10, TimeUnit.SECONDS, TimeSource.NANO_TIME); + timeout.waitFor("Timeout waiting for 
finish replay updates", + () -> h.getCore().getUpdateHandler().getUpdateLog().getState() == UpdateLog.State.ACTIVE); + assertJQ(req("qt","/get", "id", "Q7") ,"/doc/id==Q7"); } finally { DirectUpdateHandler2.commitOnClose = true; UpdateLog.testing_logReplayHook = null; diff --git a/solr/core/src/test/org/apache/solr/search/TestStandardQParsers.java b/solr/core/src/test/org/apache/solr/search/TestStandardQParsers.java index ff9ffffcdfa..cab9026602a 100644 --- a/solr/core/src/test/org/apache/solr/search/TestStandardQParsers.java +++ b/solr/core/src/test/org/apache/solr/search/TestStandardQParsers.java @@ -16,15 +16,15 @@ */ package org.apache.solr.search; -import org.apache.lucene.util.LuceneTestCase; -import org.junit.Test; - import java.lang.reflect.Field; import java.lang.reflect.Modifier; import java.util.ArrayList; import java.util.List; import java.util.Map; +import org.apache.lucene.util.LuceneTestCase; +import org.junit.Test; + /** * Check standard query parsers for class loading problems during initialization (NAME field is final and static). 
* Because every query plugin extend {@link org.apache.solr.search.QParserPlugin} and contains own instance of {@link org.apache.solr.search.QParserPlugin#standardPlugins}, @@ -50,9 +50,9 @@ public class TestStandardQParsers extends LuceneTestCase { List notFinal = new ArrayList<>(QParserPlugin.standardPlugins.size()); List mismatch = new ArrayList<>(QParserPlugin.standardPlugins.size()); - for (Map.Entry> pair : QParserPlugin.standardPlugins.entrySet()) { + for (Map.Entry pair : QParserPlugin.standardPlugins.entrySet()) { String regName = pair.getKey(); - Class clazz = pair.getValue(); + Class clazz = pair.getValue().getClass();; Field nameField = clazz.getField(FIELD_NAME); int modifiers = nameField.getModifiers(); diff --git a/solr/core/src/test/org/apache/solr/security/TestPKIAuthenticationPlugin.java b/solr/core/src/test/org/apache/solr/security/TestPKIAuthenticationPlugin.java index a664cc04205..2d324cbd534 100644 --- a/solr/core/src/test/org/apache/solr/security/TestPKIAuthenticationPlugin.java +++ b/solr/core/src/test/org/apache/solr/security/TestPKIAuthenticationPlugin.java @@ -35,7 +35,10 @@ import org.apache.solr.request.LocalSolrQueryRequest; import org.apache.solr.request.SolrRequestInfo; import org.apache.solr.response.SolrQueryResponse; import org.apache.solr.util.CryptoKeys; -import static org.mockito.Mockito.*; + +import static org.mockito.Mockito.any; +import static org.mockito.Mockito.mock; +import static org.mockito.Mockito.when; public class TestPKIAuthenticationPlugin extends SolrTestCaseJ4 { @@ -45,7 +48,7 @@ public class TestPKIAuthenticationPlugin extends SolrTestCaseJ4 { Map remoteKeys = new HashMap<>(); public MockPKIAuthenticationPlugin(CoreContainer cores, String node) { - super(cores, node); + super(cores, node, new PublicKeyHandler()); } @Override diff --git a/solr/core/src/test/org/apache/solr/update/TransactionLogTest.java b/solr/core/src/test/org/apache/solr/update/TransactionLogTest.java index 1bf4ad41978..d2b4b26df01 100644 --- 
a/solr/core/src/test/org/apache/solr/update/TransactionLogTest.java +++ b/solr/core/src/test/org/apache/solr/update/TransactionLogTest.java @@ -35,7 +35,7 @@ public class TransactionLogTest extends LuceneTestCase { transactionLog.lastAddSize = 2000000000; AddUpdateCommand updateCommand = new AddUpdateCommand(null); updateCommand.solrDoc = new SolrInputDocument(); - transactionLog.write(updateCommand, 0); + transactionLog.write(updateCommand); } } diff --git a/solr/solr-ref-guide/src/collections-api.adoc b/solr/solr-ref-guide/src/collections-api.adoc index 1e895b287fb..53b6395d3fe 100644 --- a/solr/solr-ref-guide/src/collections-api.adoc +++ b/solr/solr-ref-guide/src/collections-api.adoc @@ -1085,6 +1085,48 @@ http://localhost:8983/solr/admin/collections?action=CLUSTERPROP&name=urlScheme&v ---- +=== Deeply Nested Cluster Properties === + +==== `collectionDefaults` ==== +It is possible to set cluster-wide default values for certain attributes of a collection. + + +*Example 1: Set/update default values* +[source] +---- +curl -X POST -H 'Content-type:application/json' --data-binary ' +{ "set-obj-property" : { + "collectionDefaults" : { + "numShards" : 2, + "nrtReplicas" : 1, + "tlogReplicas" : 1, + "pullReplicas" : 1 + + } +}}' http://localhost:8983/api/cluster +---- + +*Example 2: Unset the value of `nrtReplicas` alone* +[source] +---- +curl -X POST -H 'Content-type:application/json' --data-binary ' +{ "set-obj-property" : { + "collectionDefaults" : { + "nrtReplicas" : null + } +}}' http://localhost:8983/api/cluster +---- + +*Example 3: Unset all values in `collectionDefaults`* +[source] +---- +curl -X POST -H 'Content-type:application/json' --data-binary ' +{ "set-obj-property" : { + "collectionDefaults" : null +}}' http://localhost:8983/api/cluster +---- + + [[collectionprop]] == COLLECTIONPROP: Collection Properties diff --git a/solr/solr-ref-guide/src/searching.adoc b/solr/solr-ref-guide/src/searching.adoc index 145c1a4dcd5..753c2d88038 100644 --- 
a/solr/solr-ref-guide/src/searching.adoc +++ b/solr/solr-ref-guide/src/searching.adoc @@ -1,5 +1,35 @@ = Searching -:page-children: overview-of-searching-in-solr, velocity-search-ui, relevance, query-syntax-and-parsing, json-request-api, json-facet-api, faceting, highlighting, spell-checking, query-re-ranking, transforming-result-documents, suggester, morelikethis, pagination-of-results, collapse-and-expand-results, result-grouping, result-clustering, spatial-search, the-terms-component, the-term-vector-component, the-stats-component, the-query-elevation-component, response-writers, near-real-time-searching, realtime-get, exporting-result-sets, streaming-expressions, parallel-sql-interface, analytics +:page-children: overview-of-searching-in-solr, + + velocity-search-ui, + + relevance, + + query-syntax-and-parsing, + + json-request-api, + + json-facet-api, + + faceting, + + highlighting, + + spell-checking, + + query-re-ranking, + + transforming-result-documents, + + suggester, + + morelikethis, + + pagination-of-results, + + collapse-and-expand-results, + + result-grouping, + + result-clustering, + + spatial-search, + + the-terms-component, + + the-term-vector-component, + + the-stats-component, + + the-query-elevation-component, + + the-tagger-handler, + + response-writers, + + near-real-time-searching, + + realtime-get, + + exporting-result-sets, + + streaming-expressions, + + parallel-sql-interface, + + analytics + // Licensed to the Apache Software Foundation (ASF) under one // or more contributor license agreements. See the NOTICE file // distributed with this work for additional information @@ -50,6 +80,7 @@ This section describes how Solr works with search requests. It covers the follow * <>: How to get term information about specific documents. * <>: How to return information from numeric fields within a document set. * <>: How to force documents to the top of the results for certain queries. 
+* <<the-tagger-handler.adoc#the-tagger-handler,The Tagger Handler>>: The SolrTextTagger, for basic named entity tagging in text. * <<response-writers.adoc#response-writers,Response Writers>>: Detailed information about configuring and using Solr's response writers. * <<near-real-time-searching.adoc#near-real-time-searching,Near Real Time Searching>>: How to include documents in search results nearly immediately after they are indexed. * <<realtime-get.adoc#realtime-get,RealTime Get>>: How to get the latest version of a document without opening a searcher. diff --git a/solr/solr-ref-guide/src/the-tagger-handler.adoc b/solr/solr-ref-guide/src/the-tagger-handler.adoc new file mode 100644 index 00000000000..14ba8ed6f9f --- /dev/null +++ b/solr/solr-ref-guide/src/the-tagger-handler.adoc @@ -0,0 +1,265 @@ +[[the-tagger-handler]] += The Tagger Handler + +The "Tagger" Request Handler, AKA the "SolrTextTagger" is a "text tagger". +Given a dictionary (a Solr index) with a name-like field, + you post text to this request handler and it will return every occurrence of one of those names with offsets and other document metadata desired. +It's used for named entity recognition (NER). +It doesn't do any NLP (outside of Lucene text analysis) so it's said to be a "naive tagger", + but it's definitely useful as-is and a more complete NER or ERD (entity recognition and disambiguation) + system can be built with this as a key component. +The SolrTextTagger might be used on queries for query-understanding or large documents as well. + +To get a sense of how to use it, jump to the tutorial below. + +The tagger does not yet support a sharded index. +Tens, perhaps hundreds of millions of names (documents) are supported, mostly limited by memory. + +[[tagger-configuration]] +== Configuration + +The Solr schema needs 2 things: + +* A unique key field (see `<uniqueKey>`). + Recommended field settings: set `docValues=true` +* A tag field, a TextField, with `ConcatenateGraphFilterFactory` at the end of the index chain (not the query chain): + Set `preservePositionIncrements=false` on that filter. 
+ Recommended field settings: `omitNorms=true`, `omitTermFreqAndPositions=true` and `postingsFormat=FST50` + +The text field's _index analysis chain_, aside from needing ConcatenateGraphFilterFactory at the end, + can otherwise have whatever tokenizer and filters suit your matching preferences. +It can have multi-word synonyms and use WordDelimiterGraphFilterFactory for example. +However, do _not_ use FlattenGraphFilterFactory as it will interfere with ConcatenateGraphFilterFactory. +Position gaps (e.g. stop words) get ignored; it's not (yet) supported for the gap to be significant. + +The text field's _query analysis chain_, on the other hand, is more limited. +There should not be tokens at the same position, thus no synonym expansion -- do that at index time instead. +Stop words (or any other filter introducing a position gap) are supported. +At runtime the tagger can be configured to either treat it as a tag break or to ignore it. + +The Solr config needs the `solr.TaggerRequestHandler` defined, which supports `defaults`, `invariants`, and `appends` +sections just like the search handler. + +[[tagger-parameters]] +== Tagger Parameters + +The tagger's execution is completely configurable with request parameters. Only `field` is required. + +`field`:: + The tag field that serves as the dictionary. + This is required; you'll probably specify it in the request handler. + +`fq`:: + You can specify some number of _filter queries_ to limit the dictionary used for tagging. + This parameter is the same as is used by the `solr.SearchHandler`. + +`rows`:: + The maximum number of documents to return, but defaulting to 10000 for a tag request. + This parameter is the same as is used by the `solr.SearchHandler`. + +`fl`:: + Solr's standard param for listing the fields to return. + This parameter is the same as is used by the `solr.SearchHandler`. + +`overlaps`:: + Choose the algorithm to determine which tags in an overlapping set should be retained, versus being pruned away. 
+ Options are: + + * `ALL`: Emit all tags. + * `NO_SUB`: Don't emit a tag that is completely within another tag (i.e. no subtag). + * `LONGEST_DOMINANT_RIGHT`: Given a cluster of overlapping tags, emit the longest one (by character length). + If there is a tie, pick the right-most. + Remove any tags overlapping with this tag then repeat the algorithm to potentially find other tags + that can be emitted in the cluster. + +`matchText`:: + A boolean indicating whether to return the matched text in the tag response. + This will trigger the tagger to fully buffer the input before tagging. + +`tagsLimit`:: + The maximum number of tags to return in the response. + Tagging effectively stops after this point. + By default this is 1000. + +`skipAltTokens`:: + A boolean flag used to suppress errors that can occur if, for example, + you enable synonym expansion at query time in the analyzer, which you normally shouldn't do. + Let this default to false unless you know that such tokens can't be avoided. + +`ignoreStopwords`:: + A boolean flag that causes stopwords (or any condition causing positions to skip like >255 char words) + to be ignored as if they weren't there. + Otherwise, the behavior is to treat them as breaks in tagging on the presumption your indexed text-analysis + configuration doesn't have a StopWordFilter. + By default the indexed analysis chain is checked for the presence of a StopWordFilter and if found + then `ignoreStopwords` is true if unspecified. + You probably shouldn't have a StopWordFilter configured and probably won't need to set this param either. + +`xmlOffsetAdjust`:: + A boolean indicating that the input is XML and furthermore that the offsets of returned tags should be adjusted as + necessary to allow for the client to insert an opening and closing element at the tag offset pair. + If it isn't possible to do so then the tag will be omitted. + You are expected to configure `HTMLStripCharFilterFactory` in the schema when using this option. 
+ This will trigger the tagger to fully buffer the input before tagging. + +Solr's parameters for controlling the response format are supported, like: + `echoParams`, `wt`, `indent`, etc. + +[[tagger-tutorial-with-geonames]] +== Tutorial with Geonames + +This is a tutorial that demonstrates how to configure and use the text +tagger with the popular Geonames data set. It's more than a tutorial; +it's a how-to with information that wasn't described above. + +[[tagger-create-and-configure-a-solr-collection]] +=== Create and Configure a Solr Collection + +Create a Solr collection named "geonames". For the tutorial, we'll +assume the default "data-driven" configuration. It's good for +experimentation and getting going fast but not for production or being +optimal. + +.... +bin/solr create -c geonames +.... + +[[tagger-configuring]] +==== Configuring + +We need to configure the schema first. The "data driven" mode we're +using allows us to keep this step fairly minimal -- we just need to +declare a field type, 2 fields, and a copy-field. The critical part +up-front is to define the "tag" field type. There are many many ways to +configure text analysis; and we're not going to get into those choices +here. But an important bit is the `ConcatenateGraphFilterFactory` at the +end of the index analyzer chain. Another important bit for performance +is postingsFormat=FST50 resulting in a compact FST based in-memory data +structure that is especially beneficial for the text tagger. + +Schema configuration: + +.... 
+curl -X POST -H 'Content-type:application/json' http://localhost:8983/solr/geonames/schema -d '{ + "add-field-type":{ + "name":"tag", + "class":"solr.TextField", + "postingsFormat":"FST50", + "omitNorms":true, + "omitTermFreqAndPositions":true, + "indexAnalyzer":{ + "tokenizer":{ + "class":"solr.StandardTokenizerFactory" }, + "filters":[ + {"class":"solr.EnglishPossessiveFilterFactory"}, + {"class":"solr.ASCIIFoldingFilterFactory"}, + {"class":"solr.LowerCaseFilterFactory"}, + {"class":"solr.ConcatenateGraphFilterFactory", "preservePositionIncrements":false } + ]}, + "queryAnalyzer":{ + "tokenizer":{ + "class":"solr.StandardTokenizerFactory" }, + "filters":[ + {"class":"solr.EnglishPossessiveFilterFactory"}, + {"class":"solr.ASCIIFoldingFilterFactory"}, + {"class":"solr.LowerCaseFilterFactory"} + ]} + }, + + "add-field":{ "name":"name", "type":"text_general"}, + + "add-field":{ "name":"name_tag", "type":"tag", "stored":false }, + + "add-copy-field":{ "source":"name", "dest":[ "name_tag" ]} +}' +.... + +Configure a custom Solr Request Handler: + +.... +curl -X POST -H 'Content-type:application/json' http://localhost:8983/solr/geonames/config -d '{ + "add-requesthandler" : { + "name": "/tag", + "class":"solr.TaggerRequestHandler", + "defaults":{ "field":"name_tag" } + } +}' +.... + +[[tagger-load-some-sample-data]] +=== Load Some Sample Data + +We'll go with some Geonames.org data in CSV format. Solr is quite +flexible in loading data in a variety of formats. This +http://download.geonames.org/export/dump/cities1000.zip[cities1000.zip] +should be almost 7MB file expanding to a cities1000.txt file around +22.2MB containing 145k lines, each a city in the world of at least 1000 +population. + +Using bin/post: +.... +bin/post -c geonames -type text/csv \ + -params 'optimize=true&separator=%09&encapsulator=%00&fieldnames=id,name,,alternative_names,latitude,longitude,,,countrycode,,,,,,population,elevation,,timezone,lastupdate' \ + /tmp/cities1000.txt +.... 
+or using curl: +.... +curl -X POST --data-binary @/path/to/cities1000.txt -H 'Content-type:application/csv' \ + 'http://localhost:8983/solr/geonames/update?commit=true&optimize=true&separator=%09&encapsulator=%00&fieldnames=id,name,,alternative_names,latitude,longitude,,,countrycode,,,,,,population,elevation,,timezone,lastupdate' +.... + +That might take around 35 seconds; it depends. It can be a lot faster if +the schema were tuned to only have what we truly need (no text search if +not needed). + +In that command we said optimize=true to put the index in a state that +will make tagging faster. The encapsulator=%00 is a bit of a hack to +disable the default double-quote. + +[[tagger-tag-time]] +=== Tag Time! + +This is a trivial example tagging a small piece of text. For more +options, see the earlier documentation. + +.... +curl -X POST \ + 'http://localhost:8983/solr/geonames/tag?overlaps=NO_SUB&tagsLimit=5000&fl=id,name,countrycode&wt=json&indent=on' \ + -H 'Content-Type:text/plain' -d 'Hello New York City' +.... + +The response should be this (the QTime may vary): + +.... +{ + "responseHeader":{ + "status":0, + "QTime":1}, + "tagsCount":1, + "tags":[[ + "startOffset",6, + "endOffset",19, + "ids",["5128581"]]], + "response":{"numFound":1,"start":0,"docs":[ + { + "id":"5128581", + "name":["New York City"], + "countrycode":["US"]}] + }} +.... + +[[tagger-tips]] +== Tips + +Performance Tips: + +* Follow the recommended configuration field settings, especially `postingsFormat=FST50`. +* "optimize" after loading your dictionary down to 1 Lucene segment, or at least to as few as possible. +* For bulk tagging lots of documents, there are some strategies, not mutually exclusive: +** Batch them. + The tagger doesn't directly support batching but as a hack you can send a bunch of documents concatenated with + a nonsense word that is not in the dictionary like "ZZYYXXAABBCC" between them. 
+ You'll need to keep track of the character offsets of these so you can subtract them from the results. +** For reducing tagging latency even further, consider embedding Solr with `EmbeddedSolrServer`. + See `EmbeddedSolrNoSerializeTest`. +** Use more than one thread -- perhaps as many as there are CPU cores available to Solr. \ No newline at end of file diff --git a/solr/solrj/src/java/org/apache/solr/client/solrj/cloud/autoscaling/Clause.java b/solr/solrj/src/java/org/apache/solr/client/solrj/cloud/autoscaling/Clause.java index c739588d354..8f198bd8bbc 100644 --- a/solr/solrj/src/java/org/apache/solr/client/solrj/cloud/autoscaling/Clause.java +++ b/solr/solrj/src/java/org/apache/solr/client/solrj/cloud/autoscaling/Clause.java @@ -140,7 +140,7 @@ public class Clause implements MapWriter, Comparable { if (this == o) return true; if (o == null || getClass() != o.getClass()) return false; Clause that = (Clause)o; - return compareTo(that) == 0; + return Objects.equals(this.original, that.original); } void addTags(Collection params) { diff --git a/solr/solrj/src/java/org/apache/solr/client/solrj/cloud/autoscaling/Policy.java b/solr/solrj/src/java/org/apache/solr/client/solrj/cloud/autoscaling/Policy.java index fb01cc5e962..60ff0c929be 100644 --- a/solr/solrj/src/java/org/apache/solr/client/solrj/cloud/autoscaling/Policy.java +++ b/solr/solrj/src/java/org/apache/solr/client/solrj/cloud/autoscaling/Policy.java @@ -383,11 +383,10 @@ public class Policy implements MapWriter { return p.compare(r1, r2, false); }); } catch (Exception e) { - LOG.error("Exception! prefs = {}, recent r1 = {}, r2 = {}, compare : {} matrix = {}", + LOG.error("Exception! 
prefs = {}, recent r1 = {}, r2 = {}, matrix = {}", clusterPreferences, - lastComparison[0].node, - lastComparison[1].node, - p.compare(lastComparison[0],lastComparison[1], false ), + lastComparison[0], + lastComparison[1], Utils.toJSONString(Utils.getDeepCopy(tmpMatrix, 6, false))); throw e; } diff --git a/solr/solrj/src/java/org/apache/solr/common/cloud/ClusterProperties.java b/solr/solrj/src/java/org/apache/solr/common/cloud/ClusterProperties.java index 87896daad5b..446923b81de 100644 --- a/solr/solrj/src/java/org/apache/solr/common/cloud/ClusterProperties.java +++ b/solr/solrj/src/java/org/apache/solr/common/cloud/ClusterProperties.java @@ -21,6 +21,7 @@ import java.io.IOException; import java.lang.invoke.MethodHandles; import java.util.Collections; import java.util.LinkedHashMap; +import java.util.List; import java.util.Map; import org.apache.solr.common.SolrException; @@ -67,6 +68,23 @@ public class ClusterProperties { return value; } + /** + * Read the value of a cluster property, returning a default if it is not set + * + * @param key the property name or the full path to the property as a list of parts. 
+ * @param defaultValue the default value + * @param <T> the type of the property + * @return the property value + * @throws IOException if there is an error reading the value from the cluster + */ + @SuppressWarnings("unchecked") + public T getClusterProperty(List key, T defaultValue) throws IOException { + T value = (T) Utils.getObjectByPath(getClusterProperties(), false, key); + if (value == null) + return defaultValue; + return value; + } + /** * Return the cluster properties * @throws IOException if there is an error reading properties from the cluster diff --git a/solr/solrj/src/java/org/apache/solr/common/cloud/ZkStateReader.java b/solr/solrj/src/java/org/apache/solr/common/cloud/ZkStateReader.java index 6b65c344d4c..a86c5e28448 100644 --- a/solr/solrj/src/java/org/apache/solr/common/cloud/ZkStateReader.java +++ b/solr/solrj/src/java/org/apache/solr/common/cloud/ZkStateReader.java @@ -961,6 +961,12 @@ public class ZkStateReader implements Closeable { return value; } + /** Same as the above, but allows a full JSON path as a list of parts + * + * @param keyPath path to the property, for example ["collectionDefaults", "numShards"] + * @param defaultValue a default value to use if no such property exists + * @return the cluster property, or a default if the property is not set + */ public T getClusterProperty(List keyPath, T defaultValue) { T value = (T) Utils.getObjectByPath( clusterProperties, false, keyPath); if (value == null)