mirror of https://github.com/apache/lucene.git
Fix position increment in (Reverse)PathHierarchyTokenizer (#12875)
* Fix PathHierarchyTokenizer positions PathHierarchyTokenizer was emitting multiple tokens in the same position with changing offsets. To be consistent with EdgeNGramTokenizer (which is conceptually similar -- it's emitting multiple prefixes/suffixes off the input string), we can output every token with length 1 with positions incrementing by 1. * Fix ReversePathHierarchyTokenizer positions Making ReversePathHierarchyTokenizer consistent with recent changes in PathHierarchyTokenizer. --------- Co-authored-by: Michael Froh <froh@amazon.com>
This commit is contained in:
parent
6bb244a932
commit
5d6086e199
|
@ -77,6 +77,9 @@ API Changes
|
|||
|
||||
* GITHUB#12855: Remove deprecated DrillSideways#createDrillDownFacetsCollector extension method. (Greg Miller)
|
||||
|
||||
* GITHUB#12875: Ensure token position is always increased in PathHierarchyTokenizer and ReversePathHierarchyTokenizer
|
||||
and resulting tokens do not overlap. (Michael Froh, Lukáš Vlček)
|
||||
|
||||
New Features
|
||||
---------------------
|
||||
|
||||
|
|
|
@ -134,6 +134,11 @@ It now declares that it may throw `IOException`. This was an oversight because
|
|||
compiled expressions call `DoubleValues#doubleValue` behind the scenes, which
|
||||
may throw `IOException` on index problems, bubbling up unexpectedly to the caller.
|
||||
|
||||
### PathHierarchyTokenizer and ReversePathHierarchyTokenizer do not produce overlapping tokens
|
||||
|
||||
`(Reverse)PathHierarchyTokenizer` now produces sequential (instead of overlapping) tokens with accurate
|
||||
offsets, making positional queries and highlighters possible for fields tokenized with this tokenizer.
|
||||
|
||||
## Migration from Lucene 9.0 to Lucene 9.1
|
||||
|
||||
### Test framework package migration and module (LUCENE-10301)
|
||||
|
|
|
@ -100,7 +100,8 @@ public class PathHierarchyTokenizer extends Tokenizer {
|
|||
|
||||
private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
|
||||
private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);
|
||||
private final PositionIncrementAttribute posAtt = addAttribute(PositionIncrementAttribute.class);
|
||||
private final PositionIncrementAttribute posIncAtt =
|
||||
addAttribute(PositionIncrementAttribute.class);
|
||||
private int startPosition = 0;
|
||||
private int skipped = 0;
|
||||
private boolean endDelimiter = false;
|
||||
|
@ -112,11 +113,7 @@ public class PathHierarchyTokenizer extends Tokenizer {
|
|||
public final boolean incrementToken() throws IOException {
|
||||
clearAttributes();
|
||||
termAtt.append(resultToken);
|
||||
if (resultToken.length() == 0) {
|
||||
posAtt.setPositionIncrement(1);
|
||||
} else {
|
||||
posAtt.setPositionIncrement(0);
|
||||
}
|
||||
posIncAtt.setPositionIncrement(1);
|
||||
int length = 0;
|
||||
boolean added = false;
|
||||
if (endDelimiter) {
|
||||
|
|
|
@ -112,7 +112,8 @@ public class ReversePathHierarchyTokenizer extends Tokenizer {
|
|||
|
||||
private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
|
||||
private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);
|
||||
private final PositionIncrementAttribute posAtt = addAttribute(PositionIncrementAttribute.class);
|
||||
private final PositionIncrementAttribute posIncAtt =
|
||||
addAttribute(PositionIncrementAttribute.class);
|
||||
|
||||
private int endPosition = 0;
|
||||
private int finalOffset = 0;
|
||||
|
@ -158,10 +159,8 @@ public class ReversePathHierarchyTokenizer extends Tokenizer {
|
|||
endPosition = delimiterPositions.get(idx);
|
||||
}
|
||||
finalOffset = correctOffset(length);
|
||||
posAtt.setPositionIncrement(1);
|
||||
} else {
|
||||
posAtt.setPositionIncrement(0);
|
||||
}
|
||||
posIncAtt.setPositionIncrement(1);
|
||||
|
||||
while (skipped < delimitersCount - skip - 1) {
|
||||
int start = delimiterPositions.get(skipped);
|
||||
|
|
|
@ -19,6 +19,7 @@ package org.apache.lucene.analysis.path;
|
|||
import static org.apache.lucene.analysis.path.PathHierarchyTokenizer.DEFAULT_DELIMITER;
|
||||
import static org.apache.lucene.analysis.path.PathHierarchyTokenizer.DEFAULT_SKIP;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.io.Reader;
|
||||
import java.io.StringReader;
|
||||
import java.util.Random;
|
||||
|
@ -41,7 +42,7 @@ public class TestPathHierarchyTokenizer extends BaseTokenStreamTestCase {
|
|||
new String[] {"/a", "/a/b", "/a/b/c"},
|
||||
new int[] {0, 0, 0},
|
||||
new int[] {2, 4, 6},
|
||||
new int[] {1, 0, 0},
|
||||
new int[] {1, 1, 1},
|
||||
path.length());
|
||||
}
|
||||
|
||||
|
@ -56,7 +57,7 @@ public class TestPathHierarchyTokenizer extends BaseTokenStreamTestCase {
|
|||
new String[] {"/a", "/a/b", "/a/b/c", "/a/b/c/"},
|
||||
new int[] {0, 0, 0, 0},
|
||||
new int[] {2, 4, 6, 7},
|
||||
new int[] {1, 0, 0, 0},
|
||||
new int[] {1, 1, 1, 1},
|
||||
path.length());
|
||||
}
|
||||
|
||||
|
@ -71,7 +72,7 @@ public class TestPathHierarchyTokenizer extends BaseTokenStreamTestCase {
|
|||
new String[] {"a", "a/b", "a/b/c"},
|
||||
new int[] {0, 0, 0},
|
||||
new int[] {1, 3, 5},
|
||||
new int[] {1, 0, 0},
|
||||
new int[] {1, 1, 1},
|
||||
path.length());
|
||||
}
|
||||
|
||||
|
@ -86,7 +87,7 @@ public class TestPathHierarchyTokenizer extends BaseTokenStreamTestCase {
|
|||
new String[] {"a", "a/b", "a/b/c", "a/b/c/"},
|
||||
new int[] {0, 0, 0, 0},
|
||||
new int[] {1, 3, 5, 6},
|
||||
new int[] {1, 0, 0, 0},
|
||||
new int[] {1, 1, 1, 1},
|
||||
path.length());
|
||||
}
|
||||
|
||||
|
@ -111,7 +112,7 @@ public class TestPathHierarchyTokenizer extends BaseTokenStreamTestCase {
|
|||
new String[] {"/", "//"},
|
||||
new int[] {0, 0},
|
||||
new int[] {1, 2},
|
||||
new int[] {1, 0},
|
||||
new int[] {1, 1},
|
||||
path.length());
|
||||
}
|
||||
|
||||
|
@ -125,7 +126,7 @@ public class TestPathHierarchyTokenizer extends BaseTokenStreamTestCase {
|
|||
new String[] {"\\a", "\\a\\b", "\\a\\b\\c"},
|
||||
new int[] {0, 0, 0},
|
||||
new int[] {2, 4, 6},
|
||||
new int[] {1, 0, 0},
|
||||
new int[] {1, 1, 1},
|
||||
path.length());
|
||||
}
|
||||
|
||||
|
@ -139,7 +140,7 @@ public class TestPathHierarchyTokenizer extends BaseTokenStreamTestCase {
|
|||
new String[] {"c:", "c:\\a", "c:\\a\\b", "c:\\a\\b\\c"},
|
||||
new int[] {0, 0, 0, 0},
|
||||
new int[] {2, 4, 6, 8},
|
||||
new int[] {1, 0, 0, 0},
|
||||
new int[] {1, 1, 1, 1},
|
||||
path.length());
|
||||
}
|
||||
|
||||
|
@ -158,7 +159,7 @@ public class TestPathHierarchyTokenizer extends BaseTokenStreamTestCase {
|
|||
new String[] {"c:", "c:/a", "c:/a/b", "c:/a/b/c"},
|
||||
new int[] {0, 0, 0, 0},
|
||||
new int[] {2, 4, 6, 8},
|
||||
new int[] {1, 0, 0, 0},
|
||||
new int[] {1, 1, 1, 1},
|
||||
path.length());
|
||||
}
|
||||
|
||||
|
@ -172,7 +173,7 @@ public class TestPathHierarchyTokenizer extends BaseTokenStreamTestCase {
|
|||
new String[] {"/b", "/b/c"},
|
||||
new int[] {2, 2},
|
||||
new int[] {4, 6},
|
||||
new int[] {1, 0},
|
||||
new int[] {1, 1},
|
||||
path.length());
|
||||
}
|
||||
|
||||
|
@ -186,7 +187,7 @@ public class TestPathHierarchyTokenizer extends BaseTokenStreamTestCase {
|
|||
new String[] {"/b", "/b/c", "/b/c/"},
|
||||
new int[] {2, 2, 2},
|
||||
new int[] {4, 6, 7},
|
||||
new int[] {1, 0, 0},
|
||||
new int[] {1, 1, 1},
|
||||
path.length());
|
||||
}
|
||||
|
||||
|
@ -200,7 +201,7 @@ public class TestPathHierarchyTokenizer extends BaseTokenStreamTestCase {
|
|||
new String[] {"/b", "/b/c"},
|
||||
new int[] {1, 1},
|
||||
new int[] {3, 5},
|
||||
new int[] {1, 0},
|
||||
new int[] {1, 1},
|
||||
path.length());
|
||||
}
|
||||
|
||||
|
@ -214,7 +215,7 @@ public class TestPathHierarchyTokenizer extends BaseTokenStreamTestCase {
|
|||
new String[] {"/b", "/b/c", "/b/c/"},
|
||||
new int[] {1, 1, 1},
|
||||
new int[] {3, 5, 6},
|
||||
new int[] {1, 0, 0},
|
||||
new int[] {1, 1, 1},
|
||||
path.length());
|
||||
}
|
||||
|
||||
|
@ -270,4 +271,20 @@ public class TestPathHierarchyTokenizer extends BaseTokenStreamTestCase {
|
|||
checkRandomData(random, a, 100 * RANDOM_MULTIPLIER, 1027, false, false);
|
||||
a.close();
|
||||
}
|
||||
|
||||
private final Analyzer analyzer =
|
||||
new Analyzer() {
|
||||
@Override
|
||||
protected TokenStreamComponents createComponents(String fieldName) {
|
||||
Tokenizer tokenizer = new PathHierarchyTokenizer();
|
||||
return new TokenStreamComponents(tokenizer);
|
||||
}
|
||||
};
|
||||
|
||||
public void testTokenizerViaAnalyzerOutput() throws IOException {
|
||||
assertAnalyzesTo(analyzer, "a/b/c", new String[] {"a", "a/b", "a/b/c"});
|
||||
assertAnalyzesTo(analyzer, "a/b/c/", new String[] {"a", "a/b", "a/b/c", "a/b/c/"});
|
||||
assertAnalyzesTo(analyzer, "/a/b/c", new String[] {"/a", "/a/b", "/a/b/c"});
|
||||
assertAnalyzesTo(analyzer, "/a/b/c/", new String[] {"/a", "/a/b", "/a/b/c", "/a/b/c/"});
|
||||
}
|
||||
}
|
||||
|
|
|
@ -19,6 +19,7 @@ package org.apache.lucene.analysis.path;
|
|||
import static org.apache.lucene.analysis.path.ReversePathHierarchyTokenizer.DEFAULT_DELIMITER;
|
||||
import static org.apache.lucene.analysis.path.ReversePathHierarchyTokenizer.DEFAULT_SKIP;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.io.StringReader;
|
||||
import java.util.Random;
|
||||
import org.apache.lucene.analysis.Analyzer;
|
||||
|
@ -38,7 +39,7 @@ public class TestReversePathHierarchyTokenizer extends BaseTokenStreamTestCase {
|
|||
new String[] {"/a/b/c", "a/b/c", "b/c", "c"},
|
||||
new int[] {0, 1, 3, 5},
|
||||
new int[] {6, 6, 6, 6},
|
||||
new int[] {1, 0, 0, 0},
|
||||
new int[] {1, 1, 1, 1},
|
||||
path.length());
|
||||
}
|
||||
|
||||
|
@ -53,7 +54,7 @@ public class TestReversePathHierarchyTokenizer extends BaseTokenStreamTestCase {
|
|||
new String[] {"/a/b/c/", "a/b/c/", "b/c/", "c/"},
|
||||
new int[] {0, 1, 3, 5},
|
||||
new int[] {7, 7, 7, 7},
|
||||
new int[] {1, 0, 0, 0},
|
||||
new int[] {1, 1, 1, 1},
|
||||
path.length());
|
||||
}
|
||||
|
||||
|
@ -68,7 +69,7 @@ public class TestReversePathHierarchyTokenizer extends BaseTokenStreamTestCase {
|
|||
new String[] {"a/b/c", "b/c", "c"},
|
||||
new int[] {0, 2, 4},
|
||||
new int[] {5, 5, 5},
|
||||
new int[] {1, 0, 0},
|
||||
new int[] {1, 1, 1},
|
||||
path.length());
|
||||
}
|
||||
|
||||
|
@ -83,7 +84,7 @@ public class TestReversePathHierarchyTokenizer extends BaseTokenStreamTestCase {
|
|||
new String[] {"a/b/c/", "b/c/", "c/"},
|
||||
new int[] {0, 2, 4},
|
||||
new int[] {6, 6, 6},
|
||||
new int[] {1, 0, 0},
|
||||
new int[] {1, 1, 1},
|
||||
path.length());
|
||||
}
|
||||
|
||||
|
@ -108,7 +109,7 @@ public class TestReversePathHierarchyTokenizer extends BaseTokenStreamTestCase {
|
|||
new String[] {"//", "/"},
|
||||
new int[] {0, 1},
|
||||
new int[] {2, 2},
|
||||
new int[] {1, 0},
|
||||
new int[] {1, 1},
|
||||
path.length());
|
||||
}
|
||||
|
||||
|
@ -124,7 +125,7 @@ public class TestReversePathHierarchyTokenizer extends BaseTokenStreamTestCase {
|
|||
new String[] {"/a/b/", "a/b/", "b/"},
|
||||
new int[] {0, 1, 3},
|
||||
new int[] {5, 5, 5},
|
||||
new int[] {1, 0, 0},
|
||||
new int[] {1, 1, 1},
|
||||
path.length());
|
||||
}
|
||||
|
||||
|
@ -139,7 +140,7 @@ public class TestReversePathHierarchyTokenizer extends BaseTokenStreamTestCase {
|
|||
new String[] {"a/b/", "b/"},
|
||||
new int[] {0, 2},
|
||||
new int[] {4, 4},
|
||||
new int[] {1, 0},
|
||||
new int[] {1, 1},
|
||||
path.length());
|
||||
}
|
||||
|
||||
|
@ -154,7 +155,7 @@ public class TestReversePathHierarchyTokenizer extends BaseTokenStreamTestCase {
|
|||
new String[] {"a/b/", "b/"},
|
||||
new int[] {0, 2},
|
||||
new int[] {4, 4},
|
||||
new int[] {1, 0},
|
||||
new int[] {1, 1},
|
||||
path.length());
|
||||
}
|
||||
|
||||
|
@ -189,7 +190,7 @@ public class TestReversePathHierarchyTokenizer extends BaseTokenStreamTestCase {
|
|||
new String[] {"/a/", "a/"},
|
||||
new int[] {0, 1},
|
||||
new int[] {3, 3},
|
||||
new int[] {1, 0},
|
||||
new int[] {1, 1},
|
||||
path.length());
|
||||
}
|
||||
|
||||
|
@ -227,4 +228,20 @@ public class TestReversePathHierarchyTokenizer extends BaseTokenStreamTestCase {
|
|||
checkRandomData(random, a, 100 * RANDOM_MULTIPLIER, 1027, false, false);
|
||||
a.close();
|
||||
}
|
||||
|
||||
private final Analyzer analyzer =
|
||||
new Analyzer() {
|
||||
@Override
|
||||
protected TokenStreamComponents createComponents(String fieldName) {
|
||||
Tokenizer tokenizer = new ReversePathHierarchyTokenizer();
|
||||
return new TokenStreamComponents(tokenizer);
|
||||
}
|
||||
};
|
||||
|
||||
public void testTokenizerViaAnalyzerOutput() throws IOException {
|
||||
assertAnalyzesTo(analyzer, "a/b/c", new String[] {"a/b/c", "b/c", "c"});
|
||||
assertAnalyzesTo(analyzer, "a/b/c/", new String[] {"a/b/c/", "b/c/", "c/"});
|
||||
assertAnalyzesTo(analyzer, "/a/b/c", new String[] {"/a/b/c", "a/b/c", "b/c", "c"});
|
||||
assertAnalyzesTo(analyzer, "/a/b/c/", new String[] {"/a/b/c/", "a/b/c/", "b/c/", "c/"});
|
||||
}
|
||||
}
|
||||
|
|
Loading…
Reference in New Issue