mirror of https://github.com/apache/lucene.git
LUCENE-9006: WDGF catenateAll should come before parts

Fixes #953

(cherry picked from commit 517bfd0ab7)

commit bb3bcddeda, parent c89ec4b074
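
For context, a minimal standalone sketch (ours, not part of the commit) of the behavior this change pins down: with GENERATE_WORD_PARTS | CATENATE_ALL, the catenated token should now be emitted before the individual parts at the same position.

    import java.io.IOException;
    import java.io.StringReader;

    import org.apache.lucene.analysis.TokenStream;
    import org.apache.lucene.analysis.Tokenizer;
    import org.apache.lucene.analysis.core.WhitespaceTokenizer;
    import org.apache.lucene.analysis.miscellaneous.WordDelimiterGraphFilter;
    import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

    public class CatenateAllOrderDemo {
      public static void main(String[] args) throws IOException {
        int flags = WordDelimiterGraphFilter.GENERATE_WORD_PARTS
                  | WordDelimiterGraphFilter.CATENATE_ALL;
        Tokenizer tokenizer = new WhitespaceTokenizer();
        tokenizer.setReader(new StringReader("wi-fi"));
        TokenStream ts = new WordDelimiterGraphFilter(tokenizer, flags, null);
        CharTermAttribute term = ts.addAttribute(CharTermAttribute.class);
        ts.reset();
        while (ts.incrementToken()) {
          // with this fix, "wifi" (catenate-all) prints before the parts "wi" and "fi"
          System.out.println(term);
        }
        ts.end();
        ts.close();
      }
    }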
lucene/CHANGES.txt
@@ -14,8 +14,9 @@ New Features
 (No changes)
 
 Improvements
 ---------------------
 
-(No changes)
+* LUCENE-9006: WordDelimiterGraphFilter's catenateAll token is now ordered before any token parts, like WDF did.
+  (David Smiley)
 
 Optimizations
lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/WordDelimiterGraphFilter.java
@@ -447,7 +447,7 @@ public final class WordDelimiterGraphFilter extends TokenFilter {
   private class PositionSorter extends InPlaceMergeSorter {
     @Override
     protected int compare(int i, int j) {
-      // sort by smaller start position
+      // smaller start position
       int iPosStart = bufferedParts[4*i];
       int jPosStart = bufferedParts[4*j];
       int cmp = Integer.compare(iPosStart, jPosStart);
@@ -455,10 +455,18 @@ public final class WordDelimiterGraphFilter extends TokenFilter {
         return cmp;
       }
 
-      // tie break by longest pos length:
+      // longest pos length:
       int iPosEnd = bufferedParts[4*i+1];
       int jPosEnd = bufferedParts[4*j+1];
-      return Integer.compare(jPosEnd, iPosEnd);
+      cmp = Integer.compare(jPosEnd, iPosEnd);
+      if (cmp != 0) {
+        return cmp;
+      }
+
+      // smaller start offset
+      int iOff = bufferedParts[4*i + 2];
+      int jOff = bufferedParts[4*j + 2];
+      return Integer.compare(iOff, jOff);
     }
 
     @Override
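
In short, the sorter now orders buffered tokens by position start (ascending), then position length (descending), then start offset (ascending); the catenateAll token spans every position of the original token and starts at its first offset, so it sorts ahead of the parts. A standalone restatement of the comparator (our sketch, not library code):

    // Layout per buffered token i: parts[4*i] = posStart, parts[4*i+1] = posEnd,
    // parts[4*i+2] = startOffset, parts[4*i+3] = (presumably) endOffset, unused here.
    static int comparePart(int[] parts, int i, int j) {
      int cmp = Integer.compare(parts[4*i], parts[4*j]);       // earlier position first
      if (cmp != 0) {
        return cmp;
      }
      cmp = Integer.compare(parts[4*j + 1], parts[4*i + 1]);   // longer position span first
      if (cmp != 0) {
        return cmp;
      }
      return Integer.compare(parts[4*i + 2], parts[4*j + 2]);  // smaller start offset first
    }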
lucene/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestWordDelimiterGraphFilter.java
@@ -397,6 +397,34 @@ public class TestWordDelimiterGraphFilter extends BaseTokenStreamTestCase {
     a.close();
   }
 
+  // https://issues.apache.org/jira/browse/LUCENE-9006
+  public void testCatenateAllEmittedBeforeParts() throws Exception {
+    // no number parts
+    final int flags = PRESERVE_ORIGINAL | GENERATE_WORD_PARTS | CATENATE_ALL;
+
+    //not using getAnalyzer because we want adjustInternalOffsets=true
+    Analyzer a = new Analyzer() {
+      @Override
+      public TokenStreamComponents createComponents(String field) {
+        Tokenizer tokenizer = new MockTokenizer(MockTokenizer.WHITESPACE, false);
+        return new TokenStreamComponents(tokenizer, new WordDelimiterGraphFilter(tokenizer, true, DEFAULT_WORD_DELIM_TABLE, flags, null));
+      }
+    };
+
+    // input starts with a number, but we don't generate numbers.
+    // Nonetheless preserve-original and concatenate-all show up first.
+    assertTokenStreamContents(a.tokenStream("dummy", "8-other"),
+        new String[] { "8-other", "8other", "other" }, new int[]{0, 0, 2}, new int[]{7, 7, 7});
+
+    boolean useCharFilter = true;
+    boolean graphOffsetsAreCorrect = false; // note: could solve via always incrementing wordPos on first word ('8')
+    checkAnalysisConsistency(random(), a, useCharFilter, "8-other", graphOffsetsAreCorrect);
+
+    verify("8-other", flags); // uses getAnalyzer which uses adjustInternalOffsets=false which works
+
+    a.close();
+  }
+
   /** concat numbers + words + all */
   public void testLotsOfConcatenating() throws Exception {
     final int flags = GENERATE_WORD_PARTS | GENERATE_NUMBER_PARTS | CATENATE_WORDS | CATENATE_NUMBERS | CATENATE_ALL | SPLIT_ON_CASE_CHANGE | SPLIT_ON_NUMERICS | STEM_ENGLISH_POSSESSIVE;
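
Reading the new assertion: for input "8-other" the filter emits, in order:

    term     startOffset  endOffset  produced by
    8-other  0            7          PRESERVE_ORIGINAL
    8other   0            7          CATENATE_ALL
    other    2            7          GENERATE_WORD_PARTS ("8" is skipped: no GENERATE_NUMBER_PARTS)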
@@ -947,6 +975,9 @@ public class TestWordDelimiterGraphFilter extends BaseTokenStreamTestCase {
       fail(b.toString());
     }
+
+    boolean useCharFilter = true;
+    checkAnalysisConsistency(random(), getAnalyzer(flags), useCharFilter, text);
   }
 
   public void testOnlyNumbers() throws Exception {
lucene/test-framework/src/java/org/apache/lucene/analysis/BaseTokenStreamTestCase.java
@@ -373,26 +373,26 @@ public abstract class BaseTokenStreamTestCase extends LuceneTestCase {
   }
 
   public static void assertAnalyzesTo(Analyzer a, String input, String[] output, int startOffsets[], int endOffsets[], String types[], int posIncrements[]) throws IOException {
+    assertTokenStreamContents(a.tokenStream("dummy", input), output, startOffsets, endOffsets, types, posIncrements, null, input.length());
     checkResetException(a, input);
     checkAnalysisConsistency(random(), a, true, input);
-    assertTokenStreamContents(a.tokenStream("dummy", input), output, startOffsets, endOffsets, types, posIncrements, null, input.length());
   }
 
   public static void assertAnalyzesTo(Analyzer a, String input, String[] output, int startOffsets[], int endOffsets[], String types[], int posIncrements[], int posLengths[]) throws IOException {
+    assertTokenStreamContents(a.tokenStream("dummy", input), output, startOffsets, endOffsets, types, posIncrements, posLengths, input.length());
     checkResetException(a, input);
     checkAnalysisConsistency(random(), a, true, input);
-    assertTokenStreamContents(a.tokenStream("dummy", input), output, startOffsets, endOffsets, types, posIncrements, posLengths, input.length());
   }
 
   public static void assertAnalyzesTo(Analyzer a, String input, String[] output, int startOffsets[], int endOffsets[], String types[], int posIncrements[], int posLengths[], boolean graphOffsetsAreCorrect) throws IOException {
+    assertTokenStreamContents(a.tokenStream("dummy", input), output, startOffsets, endOffsets, types, posIncrements, posLengths, input.length(), graphOffsetsAreCorrect);
     checkResetException(a, input);
     checkAnalysisConsistency(random(), a, true, input, graphOffsetsAreCorrect);
-    assertTokenStreamContents(a.tokenStream("dummy", input), output, startOffsets, endOffsets, types, posIncrements, posLengths, input.length(), graphOffsetsAreCorrect);
   }
 
   public static void assertAnalyzesTo(Analyzer a, String input, String[] output, int startOffsets[], int endOffsets[], String types[], int posIncrements[], int posLengths[], boolean graphOffsetsAreCorrect, byte[][] payloads) throws IOException {
-    checkResetException(a, input);
     assertTokenStreamContents(a.tokenStream("dummy", input), output, startOffsets, endOffsets, types, posIncrements, posLengths, input.length(), null, null, graphOffsetsAreCorrect, payloads);
+    checkResetException(a, input);
   }
 
   public static void assertAnalyzesTo(Analyzer a, String input, String[] output) throws IOException {
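
With the reorder above, the direct assertTokenStreamContents check runs before the randomized consistency checks, so a content mismatch presumably fails with the specific expected-vs-actual message first. An illustrative call (analyzer and expected values hypothetical):

    assertAnalyzesTo(analyzer, "wi-fi",
        new String[] {"wi", "fi"},  // expected terms
        new int[] {0, 3},           // expected start offsets
        new int[] {2, 5},           // expected end offsets
        null,                       // token types: not checked
        new int[] {1, 1});          // position increments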
@@ -948,13 +948,8 @@ public abstract class BaseTokenStreamTestCase extends LuceneTestCase {
     w.close();
   }
 
-  static int[] toIntArray(List<Integer> list) {
-    int ret[] = new int[list.size()];
-    int offset = 0;
-    for (Integer i : list) {
-      ret[offset++] = i;
-    }
-    return ret;
+  private static int[] toIntArray(List<Integer> list) {
+    return list.stream().mapToInt(Integer::intValue).toArray();
   }
 
   protected static MockTokenizer whitespaceMockTokenizer(Reader input) throws IOException {
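
The hand-rolled copy loop is replaced by an equivalent stream pipeline, and the helper is narrowed to private. A quick standalone check (ours):

    import java.util.Arrays;
    import java.util.List;

    public class ToIntArrayDemo {
      private static int[] toIntArray(List<Integer> list) {
        return list.stream().mapToInt(Integer::intValue).toArray();
      }

      public static void main(String[] args) {
        System.out.println(Arrays.toString(toIntArray(Arrays.asList(3, 1, 4)))); // [3, 1, 4]
      }
    }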