From bed694ec8811c67b8ba4b4c8943e60eda281850a Mon Sep 17 00:00:00 2001 From: Alan Woodward Date: Wed, 27 Nov 2019 16:28:19 +0000 Subject: [PATCH] LUCENE-9062: QueryVisitor.consumeTermsMatching (#1037) This commit adds a consumeTermsMatching() method to QueryVisitor, allowing queries that match against a class of terms to report this back to the visitor. It also changes highlighting code to use this new method, replacing the current implementation via instanceof checks. --- lucene/CHANGES.txt | 4 + .../apache/lucene/search/AutomatonQuery.java | 4 +- .../org/apache/lucene/search/FuzzyQuery.java | 13 ++- .../apache/lucene/search/QueryVisitor.java | 15 ++- .../util/automaton/ByteRunAutomaton.java | 1 - .../util/automaton/CompiledAutomaton.java | 26 +++++- .../search/uhighlight/CharArrayMatcher.java | 51 +++++++++++ .../uhighlight/FieldOffsetStrategy.java | 13 ++- .../uhighlight/LabelledCharArrayMatcher.java | 88 ++++++++++++++++++ .../uhighlight/MemoryIndexOffsetStrategy.java | 32 +++---- .../uhighlight/MultiTermHighlighting.java | 91 +------------------ .../search/uhighlight/NoOpOffsetStrategy.java | 4 +- .../uhighlight/TokenStreamOffsetStrategy.java | 24 ++--- .../search/uhighlight/UHComponents.java | 7 +- .../search/uhighlight/UnifiedHighlighter.java | 7 +- .../TestUnifiedHighlighterExtensibility.java | 6 +- .../intervals/MultiTermIntervalsSource.java | 2 +- 17 files changed, 240 insertions(+), 148 deletions(-) create mode 100644 lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/CharArrayMatcher.java create mode 100644 lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/LabelledCharArrayMatcher.java diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt index 1c346928dd8..de611432991 100644 --- a/lucene/CHANGES.txt +++ b/lucene/CHANGES.txt @@ -92,6 +92,10 @@ Improvements * LUCENE-9036: ExitableDirectoryReader may interupt scaning over DocValues (Mikhail Khludnev) +* LUCENE-9062: QueryVisitor now has a consumeTermsMatching() method, allowing 
queries + that match a class of terms to pass a ByteRunAutomaton matching that class + back to the visitor. (Alan Woodward, David Smiley) + Optimizations * LUCENE-8928: When building a kd-tree for dimensions n > 2, compute exact bounds for an inner node every N splits diff --git a/lucene/core/src/java/org/apache/lucene/search/AutomatonQuery.java b/lucene/core/src/java/org/apache/lucene/search/AutomatonQuery.java index ed71c4a0dbe..08b9eeff6fc 100644 --- a/lucene/core/src/java/org/apache/lucene/search/AutomatonQuery.java +++ b/lucene/core/src/java/org/apache/lucene/search/AutomatonQuery.java @@ -162,8 +162,8 @@ public class AutomatonQuery extends MultiTermQuery implements Accountable { @Override public void visit(QueryVisitor visitor) { - if (visitor.acceptField(getField())) { - visitor.visitLeaf(this); + if (visitor.acceptField(field)) { + compiled.visit(visitor, this, field); } } diff --git a/lucene/core/src/java/org/apache/lucene/search/FuzzyQuery.java b/lucene/core/src/java/org/apache/lucene/search/FuzzyQuery.java index f136f7e9583..279f9e777b6 100644 --- a/lucene/core/src/java/org/apache/lucene/search/FuzzyQuery.java +++ b/lucene/core/src/java/org/apache/lucene/search/FuzzyQuery.java @@ -25,7 +25,9 @@ import org.apache.lucene.index.Terms; import org.apache.lucene.index.TermsEnum; import org.apache.lucene.util.AttributeSource; import org.apache.lucene.util.automaton.Automaton; +import org.apache.lucene.util.automaton.ByteRunAutomaton; import org.apache.lucene.util.automaton.LevenshteinAutomata; +import org.apache.lucene.util.automaton.Operations; /** Implements the fuzzy search query. 
The similarity measurement * is based on the Damerau-Levenshtein (optimal string alignment) algorithm, @@ -156,9 +158,14 @@ public class FuzzyQuery extends MultiTermQuery { @Override public void visit(QueryVisitor visitor) { - // TODO find some way of consuming Automata - if (visitor.acceptField(term.field())) { - visitor.visitLeaf(this); + if (visitor.acceptField(field)) { + if (maxEdits == 0 || prefixLength >= term.text().length()) { + visitor.consumeTerms(this, term); + } else { + // Note: we're rebuilding the automaton here, so this can be expensive + visitor.consumeTermsMatching(this, field, + new ByteRunAutomaton(toAutomaton(), false, Operations.DEFAULT_MAX_DETERMINIZED_STATES)); + } } } diff --git a/lucene/core/src/java/org/apache/lucene/search/QueryVisitor.java b/lucene/core/src/java/org/apache/lucene/search/QueryVisitor.java index 5635f7d5d20..15a0eaffffe 100644 --- a/lucene/core/src/java/org/apache/lucene/search/QueryVisitor.java +++ b/lucene/core/src/java/org/apache/lucene/search/QueryVisitor.java @@ -21,6 +21,7 @@ import java.util.Arrays; import java.util.Set; import org.apache.lucene.index.Term; +import org.apache.lucene.util.automaton.ByteRunAutomaton; /** * Allows recursion through a query tree @@ -37,8 +38,18 @@ public abstract class QueryVisitor { */ public void consumeTerms(Query query, Term... 
terms) { } - // TODO it would be nice to have a way to consume 'classes' of Terms from - // things like AutomatonQuery + /** + * Called by leaf queries that match on a class of terms + * + * @param query the leaf query + * @param field the field queried against + * @param automaton an automaton defining which terms match + * + * @lucene.experimental + */ + public void consumeTermsMatching(Query query, String field, ByteRunAutomaton automaton) { + visitLeaf(query); // default impl for backward compatibility + } /** * Called by leaf queries that do not match on terms diff --git a/lucene/core/src/java/org/apache/lucene/util/automaton/ByteRunAutomaton.java b/lucene/core/src/java/org/apache/lucene/util/automaton/ByteRunAutomaton.java index abd5109e655..abe7560f431 100644 --- a/lucene/core/src/java/org/apache/lucene/util/automaton/ByteRunAutomaton.java +++ b/lucene/core/src/java/org/apache/lucene/util/automaton/ByteRunAutomaton.java @@ -16,7 +16,6 @@ */ package org.apache.lucene.util.automaton; - /** * Automaton representation for matching UTF-8 byte[]. 
*/ diff --git a/lucene/core/src/java/org/apache/lucene/util/automaton/CompiledAutomaton.java b/lucene/core/src/java/org/apache/lucene/util/automaton/CompiledAutomaton.java index 55800dbb60f..1c9a2354b03 100644 --- a/lucene/core/src/java/org/apache/lucene/util/automaton/CompiledAutomaton.java +++ b/lucene/core/src/java/org/apache/lucene/util/automaton/CompiledAutomaton.java @@ -16,14 +16,17 @@ */ package org.apache.lucene.util.automaton; - + import java.io.IOException; import java.util.ArrayList; import java.util.List; import org.apache.lucene.index.SingleTermsEnum; +import org.apache.lucene.index.Term; import org.apache.lucene.index.Terms; import org.apache.lucene.index.TermsEnum; +import org.apache.lucene.search.Query; +import org.apache.lucene.search.QueryVisitor; import org.apache.lucene.util.Accountable; import org.apache.lucene.util.BytesRef; import org.apache.lucene.util.BytesRefBuilder; @@ -344,6 +347,27 @@ public class CompiledAutomaton implements Accountable { } } + /** + * Report back to a QueryVisitor how this automaton matches terms + */ + public void visit(QueryVisitor visitor, Query parent, String field) { + if (visitor.acceptField(field)) { + switch (type) { + case NORMAL: + visitor.consumeTermsMatching(parent, field, runAutomaton); + break; + case NONE: + break; + case ALL: + visitor.consumeTermsMatching(parent, field, new ByteRunAutomaton(Automata.makeAnyString())); + break; + case SINGLE: + visitor.consumeTerms(parent, new Term(field, term)); + break; + } + } + } + /** Finds largest term accepted by this Automaton, that's * <= the provided input term. 
The result is placed in * output; it's fine for output and input to point to diff --git a/lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/CharArrayMatcher.java b/lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/CharArrayMatcher.java new file mode 100644 index 00000000000..75d5606e049 --- /dev/null +++ b/lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/CharArrayMatcher.java @@ -0,0 +1,51 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.lucene.search.uhighlight; + +import java.util.List; + +import org.apache.lucene.util.BytesRef; +import org.apache.lucene.util.CharsRef; +import org.apache.lucene.util.automaton.Automata; +import org.apache.lucene.util.automaton.CharacterRunAutomaton; + +/** + * Matches a character array + * + * @lucene.internal + */ +public interface CharArrayMatcher { + + /** + * Return {@code true} if the passed-in character array matches + */ + boolean match(char[] s, int offset, int length); + + /** + * Return {@code true} if the passed-in CharsRef matches + */ + default boolean match(CharsRef chars) { + return match(chars.chars, chars.offset, chars.length); + } + + static CharArrayMatcher fromTerms(List<BytesRef> terms) { + CharacterRunAutomaton a = new CharacterRunAutomaton(Automata.makeStringUnion(terms)); + return a::run; + } + +} diff --git a/lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/FieldOffsetStrategy.java b/lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/FieldOffsetStrategy.java index c63896a428f..d7c936f41cc 100644 --- a/lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/FieldOffsetStrategy.java +++ b/lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/FieldOffsetStrategy.java @@ -31,7 +31,6 @@ import org.apache.lucene.search.MatchesIterator; import org.apache.lucene.search.ScoreMode; import org.apache.lucene.util.BytesRef; import org.apache.lucene.util.CharsRefBuilder; -import org.apache.lucene.util.automaton.CharacterRunAutomaton; /** * Ultimately returns an {@link OffsetsEnum} yielding potentially highlightable words in the text. 
Needs @@ -168,7 +167,7 @@ public abstract class FieldOffsetStrategy { } protected void createOffsetsEnumsForAutomata(Terms termsIndex, int doc, List results) throws IOException { - final CharacterRunAutomaton[] automata = components.getAutomata(); + final LabelledCharArrayMatcher[] automata = components.getAutomata(); List> automataPostings = new ArrayList<>(automata.length); for (int i = 0; i < automata.length; i++) { automataPostings.add(new ArrayList<>()); @@ -180,9 +179,9 @@ public abstract class FieldOffsetStrategy { CharsRefBuilder refBuilder = new CharsRefBuilder(); while ((term = termsEnum.next()) != null) { for (int i = 0; i < automata.length; i++) { - CharacterRunAutomaton automaton = automata[i]; + CharArrayMatcher automaton = automata[i]; refBuilder.copyUTF8Bytes(term); - if (automaton.run(refBuilder.chars(), 0, refBuilder.length())) { + if (automaton.match(refBuilder.get())) { PostingsEnum postings = termsEnum.postings(null, PostingsEnum.OFFSETS); if (doc == postings.advance(doc)) { automataPostings.get(i).add(postings); @@ -192,13 +191,13 @@ public abstract class FieldOffsetStrategy { } for (int i = 0; i < automata.length; i++) { - CharacterRunAutomaton automaton = automata[i]; + LabelledCharArrayMatcher automaton = automata[i]; List postingsEnums = automataPostings.get(i); if (postingsEnums.isEmpty()) { continue; } - // Build one OffsetsEnum exposing the automata.toString as the term, and the sum of freq - BytesRef wildcardTerm = new BytesRef(automaton.toString()); + // Build one OffsetsEnum exposing the automaton label as the term, and the sum of freq + BytesRef wildcardTerm = new BytesRef(automaton.getLabel()); int sumFreq = 0; for (PostingsEnum postingsEnum : postingsEnums) { sumFreq += postingsEnum.freq(); diff --git a/lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/LabelledCharArrayMatcher.java b/lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/LabelledCharArrayMatcher.java new file mode 100644 index 
00000000000..c2a50aee9d0 --- /dev/null +++ b/lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/LabelledCharArrayMatcher.java @@ -0,0 +1,88 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.lucene.search.uhighlight; + +import org.apache.lucene.util.UnicodeUtil; +import org.apache.lucene.util.automaton.ByteRunAutomaton; + +/** + * Associates a label with a CharArrayMatcher to distinguish different sources for terms in highlighting + * + * @lucene.internal + */ +public interface LabelledCharArrayMatcher extends CharArrayMatcher { + + /** + * @return the label for this matcher + */ + String getLabel(); + + /** + * Associates a label with a CharArrayMatcher + */ + static LabelledCharArrayMatcher wrap(String label, CharArrayMatcher in) { + return new LabelledCharArrayMatcher() { + @Override + public String getLabel() { + return label; + } + + @Override + public boolean match(char[] s, int offset, int length) { + return in.match(s, offset, length); + } + }; + } + + /** + * Returns a representation of the automaton that matches char[] instead of byte[] + */ + static LabelledCharArrayMatcher wrap(String label, ByteRunAutomaton runAutomaton) { + return wrap(label, (chars, offset, length) 
-> { + int state = 0; + final int maxIdx = offset + length; + for (int i = offset; i < maxIdx; i++) { + final int code = chars[i]; + int b; + // UTF16 to UTF8 (inlined logic from UnicodeUtil.UTF16toUTF8 ) + if (code < 0x80) { + state = runAutomaton.step(state, code); + if (state == -1) return false; + } else if (code < 0x800) { + b = (0xC0 | (code >> 6)); + state = runAutomaton.step(state, b); + if (state == -1) return false; + b = (0x80 | (code & 0x3F)); + state = runAutomaton.step(state, b); + if (state == -1) return false; + } else { + // more complex + byte[] utf8Bytes = new byte[4 * (maxIdx - i)]; + int utf8Len = UnicodeUtil.UTF16toUTF8(chars, i, maxIdx - i, utf8Bytes); + for (int utfIdx = 0; utfIdx < utf8Len; utfIdx++) { + state = runAutomaton.step(state, utf8Bytes[utfIdx] & 0xFF); + if (state == -1) return false; + } + break; + } + } + return runAutomaton.isAccept(state); + }); + } + +} diff --git a/lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/MemoryIndexOffsetStrategy.java b/lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/MemoryIndexOffsetStrategy.java index e53d6e48e77..e67cef3a789 100644 --- a/lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/MemoryIndexOffsetStrategy.java +++ b/lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/MemoryIndexOffsetStrategy.java @@ -29,8 +29,6 @@ import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; import org.apache.lucene.index.LeafReader; import org.apache.lucene.index.memory.MemoryIndex; import org.apache.lucene.search.spans.SpanQuery; -import org.apache.lucene.util.automaton.Automata; -import org.apache.lucene.util.automaton.CharacterRunAutomaton; /** @@ -42,7 +40,7 @@ public class MemoryIndexOffsetStrategy extends AnalysisOffsetStrategy { private final MemoryIndex memoryIndex; private final LeafReader memIndexLeafReader; - private final CharacterRunAutomaton preMemIndexFilterAutomaton; + private final CharArrayMatcher 
preMemIndexFilterAutomaton; public MemoryIndexOffsetStrategy(UHComponents components, Analyzer analyzer) { super(components, analyzer); @@ -54,17 +52,17 @@ public class MemoryIndexOffsetStrategy extends AnalysisOffsetStrategy { } /** - * Build one {@link CharacterRunAutomaton} matching any term the query might match. + * Build one {@link CharArrayMatcher} matching any term the query might match. */ - private static CharacterRunAutomaton buildCombinedAutomaton(UHComponents components) { + private static CharArrayMatcher buildCombinedAutomaton(UHComponents components) { // We don't know enough about the query to do this confidently if (components.getTerms() == null || components.getAutomata() == null) { return null; } - List allAutomata = new ArrayList<>(); + List allAutomata = new ArrayList<>(); if (components.getTerms().length > 0) { - allAutomata.add(new CharacterRunAutomaton(Automata.makeStringUnion(Arrays.asList(components.getTerms())))); + allAutomata.add(CharArrayMatcher.fromTerms(Arrays.asList(components.getTerms()))); } Collections.addAll(allAutomata, components.getAutomata()); for (SpanQuery spanQuery : components.getPhraseHelper().getSpanQueries()) { @@ -75,20 +73,18 @@ public class MemoryIndexOffsetStrategy extends AnalysisOffsetStrategy { if (allAutomata.size() == 1) { return allAutomata.get(0); } + //TODO it'd be nice if we could get at the underlying Automaton in CharacterRunAutomaton so that we // could union them all. 
But it's not exposed, and sometimes the automaton is byte (not char) oriented - // Return an aggregate CharacterRunAutomaton of others - return new CharacterRunAutomaton(Automata.makeEmpty()) {// the makeEmpty() is bogus; won't be used - @Override - public boolean run(char[] chars, int offset, int length) { - for (int i = 0; i < allAutomata.size(); i++) {// don't use foreach to avoid Iterator allocation - if (allAutomata.get(i).run(chars, offset, length)) { - return true; - } + // Return an aggregate CharArrayMatcher of others + return (chars, offset, length) -> { + for (int i = 0; i < allAutomata.size(); i++) {// don't use foreach to avoid Iterator allocation + if (allAutomata.get(i).match(chars, offset, length)) { + return true; } - return false; } + return false; }; } @@ -118,14 +114,14 @@ public class MemoryIndexOffsetStrategy extends AnalysisOffsetStrategy { } private static FilteringTokenFilter newKeepWordFilter(final TokenStream tokenStream, - final CharacterRunAutomaton charRunAutomaton) { + final CharArrayMatcher matcher) { // it'd be nice to use KeepWordFilter but it demands a CharArraySet. TODO File JIRA? Need a new interface? 
return new FilteringTokenFilter(tokenStream) { final CharTermAttribute charAtt = addAttribute(CharTermAttribute.class); @Override protected boolean accept() throws IOException { - return charRunAutomaton.run(charAtt.buffer(), 0, charAtt.length()); + return matcher.match(charAtt.buffer(), 0, charAtt.length()); } }; } diff --git a/lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/MultiTermHighlighting.java b/lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/MultiTermHighlighting.java index 8181c2613ee..ba8e85e9900 100644 --- a/lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/MultiTermHighlighting.java +++ b/lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/MultiTermHighlighting.java @@ -26,12 +26,7 @@ import org.apache.lucene.search.FuzzyQuery; import org.apache.lucene.search.Query; import org.apache.lucene.search.QueryVisitor; import org.apache.lucene.search.spans.SpanQuery; -import org.apache.lucene.util.UnicodeUtil; -import org.apache.lucene.util.automaton.Automata; -import org.apache.lucene.util.automaton.Automaton; import org.apache.lucene.util.automaton.ByteRunAutomaton; -import org.apache.lucene.util.automaton.CharacterRunAutomaton; -import org.apache.lucene.util.automaton.Operations; /** * Support for highlighting multi-term queries. @@ -46,11 +41,10 @@ final class MultiTermHighlighting { * Extracts MultiTermQueries that match the provided field predicate. * Returns equivalent automata that will match terms. 
*/ - static CharacterRunAutomaton[] extractAutomata(Query query, Predicate fieldMatcher, boolean lookInSpan) { - + static LabelledCharArrayMatcher[] extractAutomata(Query query, Predicate fieldMatcher, boolean lookInSpan) { AutomataCollector collector = new AutomataCollector(lookInSpan, fieldMatcher); query.visit(collector); - return collector.runAutomata.toArray(new CharacterRunAutomaton[0]); + return collector.runAutomata.toArray(new LabelledCharArrayMatcher[0]); } /** @@ -63,7 +57,7 @@ final class MultiTermHighlighting { private static class AutomataCollector extends QueryVisitor { - List runAutomata = new ArrayList<>(); + List runAutomata = new ArrayList<>(); final boolean lookInSpan; final Predicate fieldMatcher; @@ -86,85 +80,10 @@ final class MultiTermHighlighting { } @Override - public void visitLeaf(Query query) { - if (query instanceof AutomatonQuery) { - AutomatonQuery aq = (AutomatonQuery) query; - if (aq.isAutomatonBinary() == false) { - // WildcardQuery, RegexpQuery - runAutomata.add(new CharacterRunAutomaton(aq.getAutomaton()) { - @Override - public String toString() { - return query.toString(); - } - }); - } - else { - runAutomata.add(binaryToCharRunAutomaton(aq.getAutomaton(), query.toString())); - } - } - else if (query instanceof FuzzyQuery) { - FuzzyQuery fq = (FuzzyQuery) query; - if (fq.getMaxEdits() == 0 || fq.getPrefixLength() >= fq.getTerm().text().length()) { - consumeTerms(query, fq.getTerm()); - } - else { - runAutomata.add(new CharacterRunAutomaton(fq.toAutomaton()){ - @Override - public String toString() { - return query.toString(); - } - }); - } - } + public void consumeTermsMatching(Query query, String field, ByteRunAutomaton automaton) { + runAutomata.add(LabelledCharArrayMatcher.wrap(query.toString(), automaton)); } } - private static CharacterRunAutomaton binaryToCharRunAutomaton(Automaton binaryAutomaton, String description) { - return new CharacterRunAutomaton(Automata.makeEmpty()) { // empty here is bogus just to satisfy API - 
// TODO can we get access to the aq.compiledAutomaton.runAutomaton ? - ByteRunAutomaton byteRunAutomaton = - new ByteRunAutomaton(binaryAutomaton, true, Operations.DEFAULT_MAX_DETERMINIZED_STATES); - - @Override - public String toString() { - return description; - } - - @Override - public boolean run(char[] chars, int offset, int length) { - int state = 0; - final int maxIdx = offset + length; - for (int i = offset; i < maxIdx; i++) { - final int code = chars[i]; - int b; - // UTF16 to UTF8 (inlined logic from UnicodeUtil.UTF16toUTF8 ) - if (code < 0x80) { - state = byteRunAutomaton.step(state, code); - if (state == -1) return false; - } else if (code < 0x800) { - b = (0xC0 | (code >> 6)); - state = byteRunAutomaton.step(state, b); - if (state == -1) return false; - b = (0x80 | (code & 0x3F)); - state = byteRunAutomaton.step(state, b); - if (state == -1) return false; - } else { - // more complex - byte[] utf8Bytes = new byte[4 * (maxIdx - i)]; - int utf8Len = UnicodeUtil.UTF16toUTF8(chars, i, maxIdx - i, utf8Bytes); - for (int utfIdx = 0; utfIdx < utf8Len; utfIdx++) { - state = byteRunAutomaton.step(state, utf8Bytes[utfIdx] & 0xFF); - if (state == -1) return false; - } - break; - } - } - return byteRunAutomaton.isAccept(state); - } - }; - } - - - } diff --git a/lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/NoOpOffsetStrategy.java b/lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/NoOpOffsetStrategy.java index 08f2b128892..d69d1cb33c3 100644 --- a/lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/NoOpOffsetStrategy.java +++ b/lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/NoOpOffsetStrategy.java @@ -22,7 +22,6 @@ import java.util.Collections; import org.apache.lucene.index.LeafReader; import org.apache.lucene.search.MatchNoDocsQuery; import org.apache.lucene.util.BytesRef; -import org.apache.lucene.util.automaton.CharacterRunAutomaton; /** * Never returns offsets. 
Used when the query would highlight nothing. @@ -34,7 +33,8 @@ public class NoOpOffsetStrategy extends FieldOffsetStrategy { public static final NoOpOffsetStrategy INSTANCE = new NoOpOffsetStrategy(); private NoOpOffsetStrategy() { - super(new UHComponents("_ignored_", (s) -> false, new MatchNoDocsQuery(), new BytesRef[0], PhraseHelper.NONE, new CharacterRunAutomaton[0], false, Collections.emptySet())); + super(new UHComponents("_ignored_", (s) -> false, new MatchNoDocsQuery(), + new BytesRef[0], PhraseHelper.NONE, new LabelledCharArrayMatcher[0], false, Collections.emptySet())); } @Override diff --git a/lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/TokenStreamOffsetStrategy.java b/lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/TokenStreamOffsetStrategy.java index a7282b6b5af..c8729140296 100644 --- a/lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/TokenStreamOffsetStrategy.java +++ b/lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/TokenStreamOffsetStrategy.java @@ -34,28 +34,24 @@ import org.apache.lucene.util.automaton.CharacterRunAutomaton; */ public class TokenStreamOffsetStrategy extends AnalysisOffsetStrategy { - private final CharacterRunAutomaton[] combinedAutomata; + private final CharArrayMatcher[] combinedAutomata; public TokenStreamOffsetStrategy(UHComponents components, Analyzer indexAnalyzer) { super(components, indexAnalyzer); assert components.getPhraseHelper().hasPositionSensitivity() == false; - combinedAutomata = convertTermsToAutomata(components.getTerms(), components.getAutomata()); + combinedAutomata = convertTermsToMatchers(components.getTerms(), components.getAutomata()); } //TODO this is inefficient; instead build a union automata just for terms part. 
- private static CharacterRunAutomaton[] convertTermsToAutomata(BytesRef[] terms, CharacterRunAutomaton[] automata) { - CharacterRunAutomaton[] newAutomata = new CharacterRunAutomaton[terms.length + automata.length]; + private static CharArrayMatcher[] convertTermsToMatchers(BytesRef[] terms, CharArrayMatcher[] matchers) { + CharArrayMatcher[] newAutomata = new CharArrayMatcher[terms.length + matchers.length]; for (int i = 0; i < terms.length; i++) { String termString = terms[i].utf8ToString(); - newAutomata[i] = new CharacterRunAutomaton(Automata.makeString(termString)) { - @Override - public String toString() { - return termString; - } - }; + CharacterRunAutomaton a = new CharacterRunAutomaton(Automata.makeString(termString)); + newAutomata[i] = LabelledCharArrayMatcher.wrap(termString, a::run); } // Append existing automata (that which is used for MTQs) - System.arraycopy(automata, 0, newAutomata, terms.length, automata.length); + System.arraycopy(matchers, 0, newAutomata, terms.length, matchers.length); return newAutomata; } @@ -66,7 +62,7 @@ public class TokenStreamOffsetStrategy extends AnalysisOffsetStrategy { private static class TokenStreamOffsetsEnum extends OffsetsEnum { TokenStream stream; // becomes null when closed - final CharacterRunAutomaton[] matchers; + final CharArrayMatcher[] matchers; final CharTermAttribute charTermAtt; final OffsetAttribute offsetAtt; @@ -74,7 +70,7 @@ public class TokenStreamOffsetStrategy extends AnalysisOffsetStrategy { final BytesRef matchDescriptions[]; - TokenStreamOffsetsEnum(TokenStream ts, CharacterRunAutomaton[] matchers) throws IOException { + TokenStreamOffsetsEnum(TokenStream ts, CharArrayMatcher[] matchers) throws IOException { this.stream = ts; this.matchers = matchers; matchDescriptions = new BytesRef[matchers.length]; @@ -88,7 +84,7 @@ public class TokenStreamOffsetStrategy extends AnalysisOffsetStrategy { if (stream != null) { while (stream.incrementToken()) { for (int i = 0; i < matchers.length; i++) { - 
if (matchers[i].run(charTermAtt.buffer(), 0, charTermAtt.length())) { + if (matchers[i].match(charTermAtt.buffer(), 0, charTermAtt.length())) { currentMatch = i; return true; } diff --git a/lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/UHComponents.java b/lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/UHComponents.java index 4af6d7098c4..65dd84b5fa3 100644 --- a/lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/UHComponents.java +++ b/lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/UHComponents.java @@ -22,7 +22,6 @@ import java.util.function.Predicate; import org.apache.lucene.search.Query; import org.apache.lucene.util.BytesRef; -import org.apache.lucene.util.automaton.CharacterRunAutomaton; /** * A parameter object to hold the components a {@link FieldOffsetStrategy} needs. @@ -35,12 +34,12 @@ public class UHComponents { private final Query query; private final BytesRef[] terms; // Query: all terms we extracted (some may be position sensitive) private final PhraseHelper phraseHelper; // Query: position-sensitive information - private final CharacterRunAutomaton[] automata; // Query: wildcards (i.e. multi-term query), not position sensitive + private final LabelledCharArrayMatcher[] automata; // Query: wildcards (i.e. 
multi-term query), not position sensitive private final boolean hasUnrecognizedQueryPart; // Query: if part of the query (other than the extracted terms / automata) is a leaf we don't know private final Set highlightFlags; public UHComponents(String field, Predicate fieldMatcher, Query query, - BytesRef[] terms, PhraseHelper phraseHelper, CharacterRunAutomaton[] automata, + BytesRef[] terms, PhraseHelper phraseHelper, LabelledCharArrayMatcher[] automata, boolean hasUnrecognizedQueryPart, Set highlightFlags) { this.field = field; this.fieldMatcher = fieldMatcher; @@ -72,7 +71,7 @@ public class UHComponents { return phraseHelper; } - public CharacterRunAutomaton[] getAutomata() { + public LabelledCharArrayMatcher[] getAutomata() { return automata; } diff --git a/lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/UnifiedHighlighter.java b/lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/UnifiedHighlighter.java index e6c0742d3d1..74de2483b8a 100644 --- a/lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/UnifiedHighlighter.java +++ b/lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/UnifiedHighlighter.java @@ -62,7 +62,6 @@ import org.apache.lucene.search.Weight; import org.apache.lucene.search.spans.SpanQuery; import org.apache.lucene.util.BytesRef; import org.apache.lucene.util.InPlaceMergeSorter; -import org.apache.lucene.util.automaton.CharacterRunAutomaton; /** * A Highlighter that can get offsets from either @@ -110,7 +109,7 @@ public class UnifiedHighlighter { } } - protected static final CharacterRunAutomaton[] ZERO_LEN_AUTOMATA_ARRAY = new CharacterRunAutomaton[0]; + protected static final LabelledCharArrayMatcher[] ZERO_LEN_AUTOMATA_ARRAY = new LabelledCharArrayMatcher[0]; protected final IndexSearcher searcher; // if null, can only use highlightWithoutSearcher @@ -769,7 +768,7 @@ public class UnifiedHighlighter { PhraseHelper phraseHelper = getPhraseHelper(field, query, highlightFlags); boolean 
queryHasUnrecognizedPart = hasUnrecognizedQuery(fieldMatcher, query); BytesRef[] terms = null; - CharacterRunAutomaton[] automata = null; + LabelledCharArrayMatcher[] automata = null; if (!highlightFlags.contains(HighlightFlag.WEIGHT_MATCHES) || !queryHasUnrecognizedPart) { terms = filterExtractedTerms(fieldMatcher, allTerms); automata = getAutomata(field, query, highlightFlags); @@ -839,7 +838,7 @@ public class UnifiedHighlighter { : PhraseHelper.NONE; } - protected CharacterRunAutomaton[] getAutomata(String field, Query query, Set highlightFlags) { + protected LabelledCharArrayMatcher[] getAutomata(String field, Query query, Set highlightFlags) { // do we "eagerly" look in span queries for automata here, or do we not and let PhraseHelper handle those? // if don't highlight phrases strictly, final boolean lookInSpan = diff --git a/lucene/highlighter/src/test/org/apache/lucene/search/uhighlight/visibility/TestUnifiedHighlighterExtensibility.java b/lucene/highlighter/src/test/org/apache/lucene/search/uhighlight/visibility/TestUnifiedHighlighterExtensibility.java index def44319d63..33bc7e1d2a0 100644 --- a/lucene/highlighter/src/test/org/apache/lucene/search/uhighlight/visibility/TestUnifiedHighlighterExtensibility.java +++ b/lucene/highlighter/src/test/org/apache/lucene/search/uhighlight/visibility/TestUnifiedHighlighterExtensibility.java @@ -36,6 +36,7 @@ import org.apache.lucene.search.Query; import org.apache.lucene.search.spans.SpanQuery; import org.apache.lucene.search.uhighlight.FieldHighlighter; import org.apache.lucene.search.uhighlight.FieldOffsetStrategy; +import org.apache.lucene.search.uhighlight.LabelledCharArrayMatcher; import org.apache.lucene.search.uhighlight.OffsetsEnum; import org.apache.lucene.search.uhighlight.Passage; import org.apache.lucene.search.uhighlight.PassageFormatter; @@ -46,7 +47,6 @@ import org.apache.lucene.search.uhighlight.UHComponents; import org.apache.lucene.search.uhighlight.UnifiedHighlighter; import 
org.apache.lucene.util.BytesRef; import org.apache.lucene.util.LuceneTestCase; -import org.apache.lucene.util.automaton.CharacterRunAutomaton; import org.junit.Test; /** @@ -65,7 +65,7 @@ public class TestUnifiedHighlighterExtensibility extends LuceneTestCase { (s) -> false, new MatchAllDocsQuery(), new BytesRef[0], PhraseHelper.NONE, - new CharacterRunAutomaton[0], false, Collections.emptySet())) { + new LabelledCharArrayMatcher[0], false, Collections.emptySet())) { @Override public UnifiedHighlighter.OffsetSource getOffsetSource() { return offsetSource; @@ -180,7 +180,7 @@ public class TestUnifiedHighlighterExtensibility extends LuceneTestCase { BytesRef[] terms = filterExtractedTerms(fieldMatcher, allTerms); Set highlightFlags = getFlags(field); PhraseHelper phraseHelper = getPhraseHelper(field, query, highlightFlags); - CharacterRunAutomaton[] automata = getAutomata(field, query, highlightFlags); + LabelledCharArrayMatcher[] automata = getAutomata(field, query, highlightFlags); boolean queryHasUnrecognizedPart = false; return new UHComponents(field, fieldMatcher, query, terms, phraseHelper, automata, queryHasUnrecognizedPart, highlightFlags); } diff --git a/lucene/queries/src/java/org/apache/lucene/queries/intervals/MultiTermIntervalsSource.java b/lucene/queries/src/java/org/apache/lucene/queries/intervals/MultiTermIntervalsSource.java index 5fb6389d311..589f9c6e3fc 100644 --- a/lucene/queries/src/java/org/apache/lucene/queries/intervals/MultiTermIntervalsSource.java +++ b/lucene/queries/src/java/org/apache/lucene/queries/intervals/MultiTermIntervalsSource.java @@ -96,7 +96,7 @@ class MultiTermIntervalsSource extends IntervalsSource { @Override public void visit(String field, QueryVisitor visitor) { - visitor.visitLeaf(new IntervalQuery(field, this)); + automaton.visit(visitor, new IntervalQuery(field, this), field); } @Override