diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt
index 471bc1bdbd3..3a1ae2341eb 100644
--- a/lucene/CHANGES.txt
+++ b/lucene/CHANGES.txt
@@ -75,8 +75,9 @@ New Features
 (No changes)
 
 Improvements
 ---------------------
-(No changes)
+
+* LUCENE-9006: WordDelimiterGraphFilter's catenateAll token is now ordered before any token parts, like WDF did.
+  (David Smiley)
 
 Optimizations
diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/WordDelimiterGraphFilter.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/WordDelimiterGraphFilter.java
index a04eaff8dd6..9d03c7ef495 100644
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/WordDelimiterGraphFilter.java
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/WordDelimiterGraphFilter.java
@@ -447,7 +447,7 @@ public final class WordDelimiterGraphFilter extends TokenFilter {
   private class PositionSorter extends InPlaceMergeSorter {
     @Override
     protected int compare(int i, int j) {
-      // sort by smaller start position
+      // smaller start position
       int iPosStart = bufferedParts[4*i];
       int jPosStart = bufferedParts[4*j];
       int cmp = Integer.compare(iPosStart, jPosStart);
@@ -455,10 +455,18 @@ public final class WordDelimiterGraphFilter extends TokenFilter {
         return cmp;
       }
 
-      // tie break by longest pos length:
+      // longest pos length:
       int iPosEnd = bufferedParts[4*i+1];
       int jPosEnd = bufferedParts[4*j+1];
-      return Integer.compare(jPosEnd, iPosEnd);
+      cmp = Integer.compare(jPosEnd, iPosEnd);
+      if (cmp != 0) {
+        return cmp;
+      }
+
+      // smaller start offset
+      int iOff = bufferedParts[4*i + 2];
+      int jOff = bufferedParts[4*j + 2];
+      return Integer.compare(iOff, jOff);
     }
 
     @Override
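Annotation: the patched compare() now sorts buffered tokens on three keys, smaller position start, then longest position span, then smaller start offset, which is what pushes a catenate-all token ahead of the parts it overlaps. A minimal illustration only, not the filter's actual code (WDGF sorts a flat int[] buffer in place via InPlaceMergeSorter); the {posStart, posEnd, startOffset} triples below are hypothetical:

import java.util.Arrays;
import java.util.Comparator;

public class PositionOrderSketch {
  public static void main(String[] args) {
    // Hypothetical {posStart, posEnd, startOffset} triples for three buffered tokens.
    int[][] parts = {
        {1, 2, 6}, // a part at position 1
        {0, 1, 0}, // a part at position 0
        {0, 2, 0}, // a catenate-all token spanning both positions
    };
    // Same three-key ordering as the patched compare():
    Comparator<int[]> order = (a, b) -> {
      int cmp = Integer.compare(a[0], b[0]); // smaller start position
      if (cmp != 0) return cmp;
      cmp = Integer.compare(b[1], a[1]);     // longest pos length
      if (cmp != 0) return cmp;
      return Integer.compare(a[2], b[2]);    // smaller start offset
    };
    Arrays.sort(parts, order);
    // Prints [0, 2, 0], then [0, 1, 0], then [1, 2, 6]: the spanning token sorts first.
    for (int[] p : parts) {
      System.out.println(Arrays.toString(p));
    }
  }
}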
diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestWordDelimiterGraphFilter.java b/lucene/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestWordDelimiterGraphFilter.java
index 41109b8ea9c..67b80357d79 100644
--- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestWordDelimiterGraphFilter.java
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestWordDelimiterGraphFilter.java
@@ -397,6 +397,34 @@ public class TestWordDelimiterGraphFilter extends BaseTokenStreamTestCase {
     a.close();
   }
 
+  // https://issues.apache.org/jira/browse/LUCENE-9006
+  public void testCatenateAllEmittedBeforeParts() throws Exception {
+    // no number parts
+    final int flags = PRESERVE_ORIGINAL | GENERATE_WORD_PARTS | CATENATE_ALL;
+
+    // not using getAnalyzer because we want adjustInternalOffsets=true
+    Analyzer a = new Analyzer() {
+      @Override
+      public TokenStreamComponents createComponents(String field) {
+        Tokenizer tokenizer = new MockTokenizer(MockTokenizer.WHITESPACE, false);
+        return new TokenStreamComponents(tokenizer, new WordDelimiterGraphFilter(tokenizer, true, DEFAULT_WORD_DELIM_TABLE, flags, null));
+      }
+    };
+
+    // input starts with a number, but we don't generate numbers.
+    // Nonetheless preserve-original and concatenate-all show up first.
+    assertTokenStreamContents(a.tokenStream("dummy", "8-other"),
+        new String[] { "8-other", "8other", "other" }, new int[]{0, 0, 2}, new int[]{7, 7, 7});
+
+    boolean useCharFilter = true;
+    boolean graphOffsetsAreCorrect = false; // note: could solve via always incrementing wordPos on first word ('8')
+    checkAnalysisConsistency(random(), a, useCharFilter, "8-other", graphOffsetsAreCorrect);
+
+    verify("8-other", flags); // uses getAnalyzer which uses adjustInternalOffsets=false which works
+
+    a.close();
+  }
+
   /** concat numbers + words + all */
   public void testLotsOfConcatenating() throws Exception {
     final int flags = GENERATE_WORD_PARTS | GENERATE_NUMBER_PARTS | CATENATE_WORDS | CATENATE_NUMBERS | CATENATE_ALL | SPLIT_ON_CASE_CHANGE | SPLIT_ON_NUMERICS | STEM_ENGLISH_POSSESSIVE;
@@ -947,6 +975,9 @@ public class TestWordDelimiterGraphFilter extends BaseTokenStreamTestCase {
 
       fail(b.toString());
     }
+
+    boolean useCharFilter = true;
+    checkAnalysisConsistency(random(), getAnalyzer(flags), useCharFilter, text);
   }
 
   public void testOnlyNumbers() throws Exception {
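Annotation: the new ordering can also be observed outside the test framework. A hedged standalone sketch, using a plain WhitespaceTokenizer instead of the test's MockTokenizer, and WDGF's public three-argument constructor (which, per the comments in the test above, takes the adjustInternalOffsets=false path, so offsets differ from the assertions; the emission order is the point here):

import java.io.StringReader;

import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.core.WhitespaceTokenizer;
import org.apache.lucene.analysis.miscellaneous.WordDelimiterGraphFilter;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;

public class CatenateAllOrderDemo {
  public static void main(String[] args) throws Exception {
    int flags = WordDelimiterGraphFilter.PRESERVE_ORIGINAL
        | WordDelimiterGraphFilter.GENERATE_WORD_PARTS
        | WordDelimiterGraphFilter.CATENATE_ALL;
    Tokenizer tok = new WhitespaceTokenizer();
    tok.setReader(new StringReader("8-other"));
    try (TokenStream ts = new WordDelimiterGraphFilter(tok, flags, null)) {
      CharTermAttribute term = ts.addAttribute(CharTermAttribute.class);
      PositionIncrementAttribute posInc = ts.addAttribute(PositionIncrementAttribute.class);
      ts.reset();
      // With this patch the expected order is "8-other", then "8other", then "other".
      while (ts.incrementToken()) {
        System.out.println(term + " (posInc=" + posInc.getPositionIncrement() + ")");
      }
      ts.end();
    }
  }
}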
diff --git a/lucene/test-framework/src/java/org/apache/lucene/analysis/BaseTokenStreamTestCase.java b/lucene/test-framework/src/java/org/apache/lucene/analysis/BaseTokenStreamTestCase.java
index c936a359523..6939e1f0136 100644
--- a/lucene/test-framework/src/java/org/apache/lucene/analysis/BaseTokenStreamTestCase.java
+++ b/lucene/test-framework/src/java/org/apache/lucene/analysis/BaseTokenStreamTestCase.java
@@ -373,26 +373,26 @@ public abstract class BaseTokenStreamTestCase extends LuceneTestCase {
   }
 
   public static void assertAnalyzesTo(Analyzer a, String input, String[] output, int startOffsets[], int endOffsets[], String types[], int posIncrements[]) throws IOException {
+    assertTokenStreamContents(a.tokenStream("dummy", input), output, startOffsets, endOffsets, types, posIncrements, null, input.length());
     checkResetException(a, input);
     checkAnalysisConsistency(random(), a, true, input);
-    assertTokenStreamContents(a.tokenStream("dummy", input), output, startOffsets, endOffsets, types, posIncrements, null, input.length());
   }
 
   public static void assertAnalyzesTo(Analyzer a, String input, String[] output, int startOffsets[], int endOffsets[], String types[], int posIncrements[], int posLengths[]) throws IOException {
+    assertTokenStreamContents(a.tokenStream("dummy", input), output, startOffsets, endOffsets, types, posIncrements, posLengths, input.length());
     checkResetException(a, input);
     checkAnalysisConsistency(random(), a, true, input);
-    assertTokenStreamContents(a.tokenStream("dummy", input), output, startOffsets, endOffsets, types, posIncrements, posLengths, input.length());
   }
 
   public static void assertAnalyzesTo(Analyzer a, String input, String[] output, int startOffsets[], int endOffsets[], String types[], int posIncrements[], int posLengths[], boolean graphOffsetsAreCorrect) throws IOException {
+    assertTokenStreamContents(a.tokenStream("dummy", input), output, startOffsets, endOffsets, types, posIncrements, posLengths, input.length(), graphOffsetsAreCorrect);
     checkResetException(a, input);
     checkAnalysisConsistency(random(), a, true, input, graphOffsetsAreCorrect);
-    assertTokenStreamContents(a.tokenStream("dummy", input), output, startOffsets, endOffsets, types, posIncrements, posLengths, input.length(), graphOffsetsAreCorrect);
   }
 
   public static void assertAnalyzesTo(Analyzer a, String input, String[] output, int startOffsets[], int endOffsets[], String types[], int posIncrements[], int posLengths[], boolean graphOffsetsAreCorrect, byte[][] payloads) throws IOException {
-    checkResetException(a, input);
     assertTokenStreamContents(a.tokenStream("dummy", input), output, startOffsets, endOffsets, types, posIncrements, posLengths, input.length(), null, null, graphOffsetsAreCorrect, payloads);
+    checkResetException(a, input);
   }
 
   public static void assertAnalyzesTo(Analyzer a, String input, String[] output) throws IOException {
@@ -948,13 +948,8 @@ public abstract class BaseTokenStreamTestCase extends LuceneTestCase {
     w.close();
   }
 
-  static int[] toIntArray(List<Integer> list) {
-    int ret[] = new int[list.size()];
-    int offset = 0;
-    for (Integer i : list) {
-      ret[offset++] = i;
-    }
-    return ret;
+  private static int[] toIntArray(List<Integer> list) {
+    return list.stream().mapToInt(Integer::intValue).toArray();
   }
 
   protected static MockTokenizer whitespaceMockTokenizer(Reader input) throws IOException {
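Annotation: the rewritten toIntArray is the standard IntStream unboxing idiom. A self-contained equivalent, assuming nothing beyond java.util:

import java.util.Arrays;
import java.util.List;

public class ToIntArraySketch {
  // One-pass unboxing, same as the new private helper in BaseTokenStreamTestCase.
  static int[] toIntArray(List<Integer> list) {
    return list.stream().mapToInt(Integer::intValue).toArray();
  }

  public static void main(String[] args) {
    int[] ints = toIntArray(Arrays.asList(1, 2, 3));
    System.out.println(Arrays.toString(ints)); // prints [1, 2, 3]
  }
}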