remove Lucene class copies that are not needed anymore (#26325)
parent 0dbebd9177
commit 8b8c06398e
@@ -1,107 +0,0 @@
/*
 * Licensed to Elasticsearch under one or more contributor
 * license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright
 * ownership. Elasticsearch licenses this file to you under
 * the Apache License, Version 2.0 (the "License"); you may
 * not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied. See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */

package org.apache.lucene.search;

import java.io.IOException;

// this is just one possible solution for "early termination" !

/**
 * Abstract decorator class of a DocIdSetIterator
 * implementation that provides on-demand filter/validation
 * mechanism on an underlying DocIdSetIterator.
 */
public abstract class XFilteredDocIdSetIterator extends DocIdSetIterator {
    protected DocIdSetIterator _innerIter;
    private int doc;

    /**
     * Constructor.
     * @param innerIter Underlying DocIdSetIterator.
     */
    public XFilteredDocIdSetIterator(DocIdSetIterator innerIter) {
        if (innerIter == null) {
            throw new IllegalArgumentException("null iterator");
        }
        _innerIter = innerIter;
        doc = -1;
    }

    /** Return the wrapped {@link DocIdSetIterator}. */
    public DocIdSetIterator getDelegate() {
        return _innerIter;
    }

    /**
     * Validation method to determine whether a docid should be in the result set.
     * @param doc docid to be tested
     * @return true if input docid should be in the result set, false otherwise.
     * @see #XFilteredDocIdSetIterator(DocIdSetIterator)
     * @throws CollectionTerminatedException if the underlying iterator is exhausted.
     */
    protected abstract boolean match(int doc);

    @Override
    public int docID() {
        return doc;
    }

    @Override
    public int nextDoc() throws IOException {
        try {
            while ((doc = _innerIter.nextDoc()) != NO_MORE_DOCS) {
                if (match(doc)) {
                    return doc;
                }
            }
        } catch (CollectionTerminatedException e) {
            return doc = NO_MORE_DOCS;
        }
        return doc;
    }

    @Override
    public int advance(int target) throws IOException {
        doc = _innerIter.advance(target);
        try {
            if (doc != NO_MORE_DOCS) {
                if (match(doc)) {
                    return doc;
                } else {
                    while ((doc = _innerIter.nextDoc()) != NO_MORE_DOCS) {
                        if (match(doc)) {
                            return doc;
                        }
                    }
                    return doc;
                }
            }
        } catch (CollectionTerminatedException e) {
            return doc = NO_MORE_DOCS;
        }
        return doc;
    }

    @Override
    public long cost() {
        return _innerIter.cost();
    }
}
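For context, a minimal usage sketch of the decorator removed above (not part of this commit): subclass it and implement match(int) to filter docs as the inner iterator advances. The EvenDocsIterator class, its even-ID predicate, and the drain helper are invented for illustration; only the XFilteredDocIdSetIterator API shown above is assumed.

// Illustrative sketch only, not part of this commit.
import java.io.IOException;
import org.apache.lucene.search.DocIdSetIterator;
import org.apache.lucene.search.XFilteredDocIdSetIterator;

final class EvenDocsIterator extends XFilteredDocIdSetIterator {
    EvenDocsIterator(DocIdSetIterator inner) {
        super(inner);
    }

    @Override
    protected boolean match(int doc) {
        // Keep even doc IDs only; a real subclass would test doc values or a
        // collector budget here, and may throw CollectionTerminatedException
        // to trigger the early-termination path in nextDoc()/advance().
        return (doc & 1) == 0;
    }

    // Drains any DocIdSetIterator through the filter, printing surviving docs.
    static void drain(DocIdSetIterator inner) throws IOException {
        DocIdSetIterator filtered = new EvenDocsIterator(inner);
        for (int doc = filtered.nextDoc(); doc != DocIdSetIterator.NO_MORE_DOCS; doc = filtered.nextDoc()) {
            System.out.println("matched doc " + doc);
        }
    }
}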
File diff suppressed because it is too large
@@ -1,267 +0,0 @@
/*
 * Licensed to Elasticsearch under one or more contributor
 * license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright
 * ownership. Elasticsearch licenses this file to you under
 * the Apache License, Version 2.0 (the "License"); you may
 * not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied. See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */
package org.apache.lucene.search.suggest.analyzing;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStreamToAutomaton;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.IntsRef;
import org.apache.lucene.util.UnicodeUtil;
import org.apache.lucene.util.automaton.Automata;
import org.apache.lucene.util.automaton.Automaton;
import org.apache.lucene.util.automaton.FiniteStringsIterator;
import org.apache.lucene.util.automaton.LevenshteinAutomata;
import org.apache.lucene.util.automaton.Operations;
import org.apache.lucene.util.automaton.UTF32ToUTF8;
import org.apache.lucene.util.fst.FST;
import org.apache.lucene.util.fst.PairOutputs;

import java.io.IOException;
import java.util.ArrayList;
import java.util.List;

import static org.apache.lucene.util.automaton.Operations.DEFAULT_MAX_DETERMINIZED_STATES;

/**
 * Implements a fuzzy {@link AnalyzingSuggester}. The similarity measurement is
 * based on the Damerau-Levenshtein (optimal string alignment) algorithm, though
 * you can explicitly choose classic Levenshtein by passing <code>false</code>
 * for the <code>transpositions</code> parameter.
 * <p>
 * At most, this query will match terms up to
 * {@value org.apache.lucene.util.automaton.LevenshteinAutomata#MAXIMUM_SUPPORTED_DISTANCE}
 * edits. Higher distances are not supported. Note that the
 * fuzzy distance is measured in "byte space" on the bytes
 * returned by the {@link org.apache.lucene.analysis.TokenStream}'s {@link
 * org.apache.lucene.analysis.tokenattributes.TermToBytesRefAttribute}, usually UTF8. By default
 * the analyzed bytes must be at least 3 {@link
 * #DEFAULT_MIN_FUZZY_LENGTH} bytes before any edits are
 * considered. Furthermore, the first 1 {@link
 * #DEFAULT_NON_FUZZY_PREFIX} byte is not allowed to be
 * edited. We allow up to 1 {@link
 * #DEFAULT_MAX_EDITS} edit.
 * If the {@link #unicodeAware} parameter in the constructor is set to true, maxEdits,
 * minFuzzyLength, transpositions and nonFuzzyPrefix are measured in Unicode code
 * points (actual letters) instead of bytes.
 *
 * <p>
 * NOTE: This suggester does not boost suggestions that
 * required no edits over suggestions that did require
 * edits. This is a known limitation.
 *
 * <p>
 * Note: complex query analyzers can have a significant impact on the lookup
 * performance. It's recommended to not use analyzers that drop or inject terms
 * like synonyms to keep the complexity of the prefix intersection low for good
 * lookup performance. At index time, complex analyzers can safely be used.
 * </p>
 */
public final class XFuzzySuggester extends XAnalyzingSuggester {
    private final int maxEdits;
    private final boolean transpositions;
    private final int nonFuzzyPrefix;
    private final int minFuzzyLength;
    private final boolean unicodeAware;

    /**
     * Measure maxEdits, minFuzzyLength, transpositions and nonFuzzyPrefix
     * parameters in Unicode code points (actual letters)
     * instead of bytes.
     */
    public static final boolean DEFAULT_UNICODE_AWARE = false;

    /**
     * The default minimum length of the key passed to {@link
     * #lookup} before any edits are allowed.
     */
    public static final int DEFAULT_MIN_FUZZY_LENGTH = 3;

    /**
     * The default prefix length where edits are not allowed.
     */
    public static final int DEFAULT_NON_FUZZY_PREFIX = 1;

    /**
     * The default maximum number of edits for fuzzy
     * suggestions.
     */
    public static final int DEFAULT_MAX_EDITS = 1;

    /**
     * The default transposition value passed to {@link org.apache.lucene.util.automaton.LevenshteinAutomata}
     */
    public static final boolean DEFAULT_TRANSPOSITIONS = true;

    /**
     * Creates a {@link FuzzySuggester} instance initialized with default values.
     *
     * @param analyzer the analyzer used for this suggester
     */
    public XFuzzySuggester(Analyzer analyzer) {
        this(analyzer, analyzer);
    }

    /**
     * Creates a {@link FuzzySuggester} instance with an index & a query analyzer initialized with default values.
     *
     * @param indexAnalyzer
     *           Analyzer that will be used for analyzing suggestions while building the index.
     * @param queryAnalyzer
     *           Analyzer that will be used for analyzing query text during lookup
     */
    public XFuzzySuggester(Analyzer indexAnalyzer, Analyzer queryAnalyzer) {
        this(indexAnalyzer, null, queryAnalyzer, EXACT_FIRST | PRESERVE_SEP, 256, -1,
             DEFAULT_MAX_EDITS, DEFAULT_TRANSPOSITIONS,
             DEFAULT_NON_FUZZY_PREFIX, DEFAULT_MIN_FUZZY_LENGTH, DEFAULT_UNICODE_AWARE,
             null, false, 0, SEP_LABEL, PAYLOAD_SEP, END_BYTE, HOLE_CHARACTER);
    }

    /**
     * Creates a {@link FuzzySuggester} instance.
     *
     * @param indexAnalyzer Analyzer that will be used for
     *        analyzing suggestions while building the index.
     * @param queryAnalyzer Analyzer that will be used for
     *        analyzing query text during lookup
     * @param options see {@link #EXACT_FIRST}, {@link #PRESERVE_SEP}
     * @param maxSurfaceFormsPerAnalyzedForm Maximum number of
     *        surface forms to keep for a single analyzed form.
     *        When there are too many surface forms we discard the
     *        lowest weighted ones.
     * @param maxGraphExpansions Maximum number of graph paths
     *        to expand from the analyzed form. Set this to -1 for
     *        no limit.
     * @param maxEdits must be >= 0 and <= {@link org.apache.lucene.util.automaton.LevenshteinAutomata#MAXIMUM_SUPPORTED_DISTANCE}.
     * @param transpositions <code>true</code> if transpositions should be treated as a primitive
     *        edit operation. If this is false, comparisons will implement the classic
     *        Levenshtein algorithm.
     * @param nonFuzzyPrefix length of common (non-fuzzy) prefix (see default {@link #DEFAULT_NON_FUZZY_PREFIX})
     * @param minFuzzyLength minimum length of lookup key before any edits are allowed (see default {@link #DEFAULT_MIN_FUZZY_LENGTH})
     * @param sepLabel separation label
     * @param payloadSep payload separator byte
     * @param endByte end byte marker byte
     */
    public XFuzzySuggester(Analyzer indexAnalyzer, Automaton queryPrefix, Analyzer queryAnalyzer,
                           int options, int maxSurfaceFormsPerAnalyzedForm, int maxGraphExpansions,
                           int maxEdits, boolean transpositions, int nonFuzzyPrefix, int minFuzzyLength,
                           boolean unicodeAware, FST<PairOutputs.Pair<Long, BytesRef>> fst, boolean hasPayloads,
                           int maxAnalyzedPathsForOneInput, int sepLabel, int payloadSep, int endByte, int holeCharacter) {
        super(indexAnalyzer, queryPrefix, queryAnalyzer, options, maxSurfaceFormsPerAnalyzedForm, maxGraphExpansions,
              true, fst, hasPayloads, maxAnalyzedPathsForOneInput, sepLabel, payloadSep, endByte, holeCharacter);
        if (maxEdits < 0 || maxEdits > LevenshteinAutomata.MAXIMUM_SUPPORTED_DISTANCE) {
            throw new IllegalArgumentException(
                "maxEdits must be between 0 and " + LevenshteinAutomata.MAXIMUM_SUPPORTED_DISTANCE);
        }
        if (nonFuzzyPrefix < 0) {
            throw new IllegalArgumentException("nonFuzzyPrefix must not be < 0 (got " + nonFuzzyPrefix + ")");
        }
        if (minFuzzyLength < 0) {
            throw new IllegalArgumentException("minFuzzyLength must not be < 0 (got " + minFuzzyLength + ")");
        }

        this.maxEdits = maxEdits;
        this.transpositions = transpositions;
        this.nonFuzzyPrefix = nonFuzzyPrefix;
        this.minFuzzyLength = minFuzzyLength;
        this.unicodeAware = unicodeAware;
    }

    @Override
    protected List<FSTUtil.Path<PairOutputs.Pair<Long,BytesRef>>> getFullPrefixPaths(
            List<FSTUtil.Path<PairOutputs.Pair<Long,BytesRef>>> prefixPaths, Automaton lookupAutomaton,
            FST<PairOutputs.Pair<Long,BytesRef>> fst)
            throws IOException {

        // TODO: right now there's no penalty for fuzzy/edits,
        // ie a completion whose prefix matched exactly what the
        // user typed gets no boost over completions that
        // required an edit, which get no boost over completions
        // requiring two edits. I suspect a multiplicative
        // factor is appropriate (eg, say a fuzzy match must be at
        // least 2X better weight than the non-fuzzy match to
        // "compete") ... in which case I think the wFST needs
        // to be log weights or something ...

        Automaton levA = convertAutomaton(toLevenshteinAutomata(lookupAutomaton));
        /*
          Writer w = new OutputStreamWriter(new FileOutputStream("out.dot"), "UTF-8");
          w.write(levA.toDot());
          w.close();
          System.out.println("Wrote LevA to out.dot");
        */
        return FSTUtil.intersectPrefixPaths(levA, fst);
    }

    @Override
    protected Automaton convertAutomaton(Automaton a) {
        if (unicodeAware) {
            // FLORIAN EDIT: get converted Automaton from superclass
            Automaton utf8automaton = new UTF32ToUTF8().convert(super.convertAutomaton(a));
            // This automaton should not blow up during determinize:
            utf8automaton = Operations.determinize(utf8automaton, Integer.MAX_VALUE);
            return utf8automaton;
        } else {
            return super.convertAutomaton(a);
        }
    }

    @Override
    public TokenStreamToAutomaton getTokenStreamToAutomaton() {
        final TokenStreamToAutomaton tsta = super.getTokenStreamToAutomaton();
        tsta.setUnicodeArcs(unicodeAware);
        return tsta;
    }

    Automaton toLevenshteinAutomata(Automaton automaton) {
        List<Automaton> subs = new ArrayList<>();
        FiniteStringsIterator finiteStrings = new FiniteStringsIterator(automaton);
        for (IntsRef string; (string = finiteStrings.next()) != null;) {
            if (string.length <= nonFuzzyPrefix || string.length < minFuzzyLength) {
                subs.add(Automata.makeString(string.ints, string.offset, string.length));
            } else {
                int[] ints = new int[string.length - nonFuzzyPrefix];
                System.arraycopy(string.ints, string.offset + nonFuzzyPrefix, ints, 0, ints.length);
                // TODO: maybe add alphaMin to LevenshteinAutomata,
                // and pass 1 instead of 0? We probably don't want
                // to allow the trailing dedup bytes to be
                // edited... but then 0 byte is "in general" allowed
                // on input (but not in UTF8).
                LevenshteinAutomata lev = new LevenshteinAutomata(
                    ints, unicodeAware ? Character.MAX_CODE_POINT : 255, transpositions);
                subs.add(lev.toAutomaton(maxEdits, UnicodeUtil.newString(string.ints, string.offset, nonFuzzyPrefix)));
            }
        }

        if (subs.isEmpty()) {
            // automaton is empty, there are no accepted paths through it
            return Automata.makeEmpty(); // matches nothing
        } else if (subs.size() == 1) {
            // no synonyms or anything: just a single path through the tokenstream
            return subs.get(0);
        } else {
            // multiple paths: this is really scary! is it slow?
            // maybe we should not do this and throw UOE?
            Automaton a = Operations.union(subs);
            // TODO: we could call toLevenshteinAutomata() before det?
            // this only happens if you have multiple paths anyway (e.g. synonyms)
            return Operations.determinize(a, DEFAULT_MAX_DETERMINIZED_STATES);
        }
    }
}
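To make the core of toLevenshteinAutomata() above concrete, here is a small self-contained sketch (not part of this commit) that applies the same construction to a single term: edits are only permitted past the non-fuzzy prefix, and the literal prefix is glued back on via toAutomaton(maxEdits, prefix). Only calls that appear in the removed class are used (LevenshteinAutomata, Operations.determinize); the term "search", the demo class name, and the test strings are assumptions for illustration.

// Illustrative sketch only, not part of this commit.
import java.util.Arrays;
import org.apache.lucene.util.automaton.Automaton;
import org.apache.lucene.util.automaton.LevenshteinAutomata;
import org.apache.lucene.util.automaton.Operations;

final class FuzzyAutomatonDemo {
    public static void main(String[] args) {
        String term = "search";
        int nonFuzzyPrefix = 1;   // first code point must match exactly (DEFAULT_NON_FUZZY_PREFIX)
        int maxEdits = 1;         // DEFAULT_MAX_EDITS

        // Split the term into a literal prefix and an editable suffix,
        // mirroring the per-path logic in toLevenshteinAutomata().
        int[] points = term.codePoints().toArray();
        int[] suffix = Arrays.copyOfRange(points, nonFuzzyPrefix, points.length);
        LevenshteinAutomata lev =
            new LevenshteinAutomata(suffix, Character.MAX_CODE_POINT, true);
        Automaton a = lev.toAutomaton(maxEdits, term.substring(0, nonFuzzyPrefix));
        // Determinize before running strings against it, as the removed class
        // does before intersecting with the FST.
        a = Operations.determinize(a, Operations.DEFAULT_MAX_DETERMINIZED_STATES);

        System.out.println(Operations.run(a, "search")); // true: zero edits
        System.out.println(Operations.run(a, "serch"));  // true: one deletion in the suffix
        System.out.println(Operations.run(a, "earch"));  // false: the non-fuzzy prefix was edited
    }
}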