diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt index 784b72450a2..766cc28daa3 100644 --- a/lucene/CHANGES.txt +++ b/lucene/CHANGES.txt @@ -157,6 +157,10 @@ New Features * LUCENE-5440: Add LongBitSet for managing more than 2.1B bits (otherwise use FixedBitSet). (Shai Erera) +* LUCENE-5437: ASCIIFoldingFilter now has an option to preserve the original token + and emit it on the same position as the folded token only if the actual token was + folded. (Simon Willnauer, Nik Everett) + Build * LUCENE-5217,LUCENE-5420: Maven config: get dependencies from Ant+Ivy config; diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/ASCIIFoldingFilter.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/ASCIIFoldingFilter.java index 27b396fa011..4afe735b435 100644 --- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/ASCIIFoldingFilter.java +++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/ASCIIFoldingFilter.java @@ -22,6 +22,7 @@ import java.io.IOException; import org.apache.lucene.analysis.TokenFilter; import org.apache.lucene.analysis.TokenStream; import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; +import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute; import org.apache.lucene.util.ArrayUtil; import org.apache.lucene.util.RamUsageEstimator; @@ -57,17 +58,49 @@ import org.apache.lucene.util.RamUsageEstimator; * For example, 'à' will be replaced by 'a'. */ public final class ASCIIFoldingFilter extends TokenFilter { - public ASCIIFoldingFilter(TokenStream input) - { - super(input); - } - + private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class); + private final PositionIncrementAttribute posIncAttr = addAttribute(PositionIncrementAttribute.class); + private final boolean preserveOriginal; private char[] output = new char[512]; private int outputPos; - private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class); + private State state; + + public ASCIIFoldingFilter(TokenStream input) + { + this(input, false); + } + + /** + * Create a new {@link ASCIIFoldingFilter}. + * + * @param input + * TokenStream to filter + * @param preserveOriginal + * should the original tokens be kept on the input stream with a 0 position increment + * from the folded tokens? + **/ + public ASCIIFoldingFilter(TokenStream input, boolean preserveOriginal) + { + super(input); + this.preserveOriginal = preserveOriginal; + } + + /** + * Does the filter preserve the original tokens? + */ + public boolean isPreserveOriginal() { + return preserveOriginal; + } @Override public boolean incrementToken() throws IOException { + if (state != null) { + assert preserveOriginal : "state should only be captured if preserveOriginal is true"; + restoreState(state); + posIncAttr.setPositionIncrement(0); + state = null; + return true; + } if (input.incrementToken()) { final char[] buffer = termAtt.buffer(); final int length = termAtt.length(); @@ -89,6 +122,12 @@ public final class ASCIIFoldingFilter extends TokenFilter { } } + @Override + public void reset() throws IOException { + super.reset(); + state = null; + } + /** * Converts characters above ASCII to their ASCII equivalents. For example, * accents are removed from accented characters. @@ -97,6 +136,9 @@ public final class ASCIIFoldingFilter extends TokenFilter { */ public void foldToASCII(char[] input, int length) { + if (preserveOriginal) { + state = captureState(); + } // Worst-case length required: final int maxSizeNeeded = 4 * length; if (output.length < maxSizeNeeded) { diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/ASCIIFoldingFilterFactory.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/ASCIIFoldingFilterFactory.java index 63f12fc0626..464fe614ac6 100644 --- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/ASCIIFoldingFilterFactory.java +++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/ASCIIFoldingFilterFactory.java @@ -31,15 +31,17 @@ import org.apache.lucene.analysis.TokenStream; * <fieldType name="text_ascii" class="solr.TextField" positionIncrementGap="100"> * <analyzer> * <tokenizer class="solr.WhitespaceTokenizerFactory"/> - * <filter class="solr.ASCIIFoldingFilterFactory"/> + * <filter class="solr.ASCIIFoldingFilterFactory" preserveOriginal="false"/> * </analyzer> * </fieldType> */ public class ASCIIFoldingFilterFactory extends TokenFilterFactory implements MultiTermAwareComponent { + private final boolean preserveOriginal; /** Creates a new ASCIIFoldingFilterFactory */ public ASCIIFoldingFilterFactory(Map args) { super(args); + preserveOriginal = getBoolean(args, "preserveOriginal", false); if (!args.isEmpty()) { throw new IllegalArgumentException("Unknown parameters: " + args); } @@ -47,7 +49,7 @@ public class ASCIIFoldingFilterFactory extends TokenFilterFactory implements Mul @Override public ASCIIFoldingFilter create(TokenStream input) { - return new ASCIIFoldingFilter(input); + return new ASCIIFoldingFilter(input, preserveOriginal); } @Override diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestASCIIFoldingFilter.java b/lucene/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestASCIIFoldingFilter.java index fbece77b46d..81495b14af6 100644 --- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestASCIIFoldingFilter.java +++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestASCIIFoldingFilter.java @@ -31,91 +31,103 @@ import java.util.ArrayList; import java.util.Iterator; public class TestASCIIFoldingFilter extends BaseTokenStreamTestCase { + /** + * Pop one input token's worth of tokens off the filter and verify that they are as expected. + */ + void assertNextTerms(String expectedUnfolded, String expectedFolded, ASCIIFoldingFilter filter, + CharTermAttribute termAtt) throws Exception { + assertTrue(filter.incrementToken()); + assertEquals(expectedFolded, termAtt.toString()); + if (filter.isPreserveOriginal() && !expectedUnfolded.equals(expectedFolded)) { + assertTrue(filter.incrementToken()); + assertEquals(expectedUnfolded, termAtt.toString()); + } + } // testLain1Accents() is a copy of TestLatin1AccentFilter.testU(). public void testLatin1Accents() throws Exception { TokenStream stream = whitespaceMockTokenizer("Des mot clés À LA CHAÎNE À Á Â Ã Ä Å Æ Ç È É Ê Ë Ì Í Î Ï IJ Ð Ñ" +" Ò Ó Ô Õ Ö Ø Œ Þ Ù Ú Û Ü Ý Ÿ à á â ã ä å æ ç è é ê ë ì í î ï ij" +" ð ñ ò ó ô õ ö ø œ ß þ ù ú û ü ý ÿ fi fl"); - ASCIIFoldingFilter filter = new ASCIIFoldingFilter(stream); + ASCIIFoldingFilter filter = new ASCIIFoldingFilter(stream, random().nextBoolean()); CharTermAttribute termAtt = filter.getAttribute(CharTermAttribute.class); filter.reset(); - assertTermEquals("Des", filter, termAtt); - assertTermEquals("mot", filter, termAtt); - assertTermEquals("cles", filter, termAtt); - assertTermEquals("A", filter, termAtt); - assertTermEquals("LA", filter, termAtt); - assertTermEquals("CHAINE", filter, termAtt); - assertTermEquals("A", filter, termAtt); - assertTermEquals("A", filter, termAtt); - assertTermEquals("A", filter, termAtt); - assertTermEquals("A", filter, termAtt); - assertTermEquals("A", filter, termAtt); - assertTermEquals("A", filter, termAtt); - assertTermEquals("AE", filter, termAtt); - assertTermEquals("C", filter, termAtt); - assertTermEquals("E", filter, termAtt); - assertTermEquals("E", filter, termAtt); - assertTermEquals("E", filter, termAtt); - assertTermEquals("E", filter, termAtt); - assertTermEquals("I", filter, termAtt); - assertTermEquals("I", filter, termAtt); - assertTermEquals("I", filter, termAtt); - assertTermEquals("I", filter, termAtt); - assertTermEquals("IJ", filter, termAtt); - assertTermEquals("D", filter, termAtt); - assertTermEquals("N", filter, termAtt); - assertTermEquals("O", filter, termAtt); - assertTermEquals("O", filter, termAtt); - assertTermEquals("O", filter, termAtt); - assertTermEquals("O", filter, termAtt); - assertTermEquals("O", filter, termAtt); - assertTermEquals("O", filter, termAtt); - assertTermEquals("OE", filter, termAtt); - assertTermEquals("TH", filter, termAtt); - assertTermEquals("U", filter, termAtt); - assertTermEquals("U", filter, termAtt); - assertTermEquals("U", filter, termAtt); - assertTermEquals("U", filter, termAtt); - assertTermEquals("Y", filter, termAtt); - assertTermEquals("Y", filter, termAtt); - assertTermEquals("a", filter, termAtt); - assertTermEquals("a", filter, termAtt); - assertTermEquals("a", filter, termAtt); - assertTermEquals("a", filter, termAtt); - assertTermEquals("a", filter, termAtt); - assertTermEquals("a", filter, termAtt); - assertTermEquals("ae", filter, termAtt); - assertTermEquals("c", filter, termAtt); - assertTermEquals("e", filter, termAtt); - assertTermEquals("e", filter, termAtt); - assertTermEquals("e", filter, termAtt); - assertTermEquals("e", filter, termAtt); - assertTermEquals("i", filter, termAtt); - assertTermEquals("i", filter, termAtt); - assertTermEquals("i", filter, termAtt); - assertTermEquals("i", filter, termAtt); - assertTermEquals("ij", filter, termAtt); - assertTermEquals("d", filter, termAtt); - assertTermEquals("n", filter, termAtt); - assertTermEquals("o", filter, termAtt); - assertTermEquals("o", filter, termAtt); - assertTermEquals("o", filter, termAtt); - assertTermEquals("o", filter, termAtt); - assertTermEquals("o", filter, termAtt); - assertTermEquals("o", filter, termAtt); - assertTermEquals("oe", filter, termAtt); - assertTermEquals("ss", filter, termAtt); - assertTermEquals("th", filter, termAtt); - assertTermEquals("u", filter, termAtt); - assertTermEquals("u", filter, termAtt); - assertTermEquals("u", filter, termAtt); - assertTermEquals("u", filter, termAtt); - assertTermEquals("y", filter, termAtt); - assertTermEquals("y", filter, termAtt); - assertTermEquals("fi", filter, termAtt); - assertTermEquals("fl", filter, termAtt); + assertNextTerms("Des", "Des", filter, termAtt); + assertNextTerms("mot", "mot", filter, termAtt); + assertNextTerms("clés", "cles", filter, termAtt); + assertNextTerms("À", "A", filter, termAtt); + assertNextTerms("LA", "LA", filter, termAtt); + assertNextTerms("CHAÎNE", "CHAINE", filter, termAtt); + assertNextTerms("À", "A", filter, termAtt); + assertNextTerms("Á", "A", filter, termAtt); + assertNextTerms("Â", "A", filter, termAtt); + assertNextTerms("Ã", "A", filter, termAtt); + assertNextTerms("Ä", "A", filter, termAtt); + assertNextTerms("Å", "A", filter, termAtt); + assertNextTerms("Æ", "AE", filter, termAtt); + assertNextTerms("Ç", "C", filter, termAtt); + assertNextTerms("È", "E", filter, termAtt); + assertNextTerms("É", "E", filter, termAtt); + assertNextTerms("Ê", "E", filter, termAtt); + assertNextTerms("Ë", "E", filter, termAtt); + assertNextTerms("Ì", "I", filter, termAtt); + assertNextTerms("Í", "I", filter, termAtt); + assertNextTerms("Î", "I", filter, termAtt); + assertNextTerms("Ï", "I", filter, termAtt); + assertNextTerms("IJ", "IJ", filter, termAtt); + assertNextTerms("Ð", "D", filter, termAtt); + assertNextTerms("Ñ", "N", filter, termAtt); + assertNextTerms("Ò", "O", filter, termAtt); + assertNextTerms("Ó", "O", filter, termAtt); + assertNextTerms("Ô", "O", filter, termAtt); + assertNextTerms("Õ", "O", filter, termAtt); + assertNextTerms("Ö", "O", filter, termAtt); + assertNextTerms("Ø", "O", filter, termAtt); + assertNextTerms("Œ", "OE", filter, termAtt); + assertNextTerms("Þ", "TH", filter, termAtt); + assertNextTerms("Ù", "U", filter, termAtt); + assertNextTerms("Ú", "U", filter, termAtt); + assertNextTerms("Û", "U", filter, termAtt); + assertNextTerms("Ü", "U", filter, termAtt); + assertNextTerms("Ý", "Y", filter, termAtt); + assertNextTerms("Ÿ", "Y", filter, termAtt); + assertNextTerms("à", "a", filter, termAtt); + assertNextTerms("á", "a", filter, termAtt); + assertNextTerms("â", "a", filter, termAtt); + assertNextTerms("ã", "a", filter, termAtt); + assertNextTerms("ä", "a", filter, termAtt); + assertNextTerms("å", "a", filter, termAtt); + assertNextTerms("æ", "ae", filter, termAtt); + assertNextTerms("ç", "c", filter, termAtt); + assertNextTerms("è", "e", filter, termAtt); + assertNextTerms("é", "e", filter, termAtt); + assertNextTerms("ê", "e", filter, termAtt); + assertNextTerms("ë", "e", filter, termAtt); + assertNextTerms("ì", "i", filter, termAtt); + assertNextTerms("í", "i", filter, termAtt); + assertNextTerms("î", "i", filter, termAtt); + assertNextTerms("ï", "i", filter, termAtt); + assertNextTerms("ij", "ij", filter, termAtt); + assertNextTerms("ð", "d", filter, termAtt); + assertNextTerms("ñ", "n", filter, termAtt); + assertNextTerms("ò", "o", filter, termAtt); + assertNextTerms("ó", "o", filter, termAtt); + assertNextTerms("ô", "o", filter, termAtt); + assertNextTerms("õ", "o", filter, termAtt); + assertNextTerms("ö", "o", filter, termAtt); + assertNextTerms("ø", "o", filter, termAtt); + assertNextTerms("œ", "oe", filter, termAtt); + assertNextTerms("ß", "ss", filter, termAtt); + assertNextTerms("þ", "th", filter, termAtt); + assertNextTerms("ù", "u", filter, termAtt); + assertNextTerms("ú", "u", filter, termAtt); + assertNextTerms("û", "u", filter, termAtt); + assertNextTerms("ü", "u", filter, termAtt); + assertNextTerms("ý", "y", filter, termAtt); + assertNextTerms("ÿ", "y", filter, termAtt); + assertNextTerms("fi", "fi", filter, termAtt); + assertNextTerms("fl", "fl", filter, termAtt); assertFalse(filter.incrementToken()); } @@ -1876,7 +1888,8 @@ public class TestASCIIFoldingFilter extends BaseTokenStreamTestCase { }; // Construct input text and expected output tokens - List expectedOutputTokens = new ArrayList(); + List expectedUnfoldedTokens = new ArrayList(); + List expectedFoldedTokens = new ArrayList(); StringBuilder inputText = new StringBuilder(); for (int n = 0 ; n < foldings.length ; n += 2) { if (n > 0) { @@ -1884,32 +1897,30 @@ public class TestASCIIFoldingFilter extends BaseTokenStreamTestCase { } inputText.append(foldings[n]); - // Construct the expected output token: the ASCII string to fold to, - // duplicated as many times as the number of characters in the input text. + // Construct the expected output tokens: both the unfolded and folded string, + // with the folded duplicated as many times as the number of characters in + // the input text. StringBuilder expected = new StringBuilder(); int numChars = foldings[n].length(); for (int m = 0 ; m < numChars; ++m) { expected.append(foldings[n + 1]); } - expectedOutputTokens.add(expected.toString()); + expectedUnfoldedTokens.add(foldings[n]); + expectedFoldedTokens.add(expected.toString()); } TokenStream stream = whitespaceMockTokenizer(inputText.toString()); - ASCIIFoldingFilter filter = new ASCIIFoldingFilter(stream); + ASCIIFoldingFilter filter = new ASCIIFoldingFilter(stream, random().nextBoolean()); CharTermAttribute termAtt = filter.getAttribute(CharTermAttribute.class); - Iterator expectedIter = expectedOutputTokens.iterator(); + Iterator unfoldedIter = expectedUnfoldedTokens.iterator(); + Iterator foldedIter = expectedFoldedTokens.iterator(); filter.reset(); - while (expectedIter.hasNext()) { - assertTermEquals(expectedIter.next(), filter, termAtt); + while (foldedIter.hasNext()) { + assertNextTerms(unfoldedIter.next(), foldedIter.next(), filter, termAtt); } assertFalse(filter.incrementToken()); } - void assertTermEquals(String expected, TokenStream stream, CharTermAttribute termAtt) throws Exception { - assertTrue(stream.incrementToken()); - assertEquals(expected, termAtt.toString()); - } - /** blast some random strings through the analyzer */ public void testRandomStrings() throws Exception { Analyzer a = new Analyzer() { @@ -1917,7 +1928,8 @@ public class TestASCIIFoldingFilter extends BaseTokenStreamTestCase { @Override protected TokenStreamComponents createComponents(String fieldName) { Tokenizer tokenizer = new MockTokenizer(MockTokenizer.WHITESPACE, false); - return new TokenStreamComponents(tokenizer, new ASCIIFoldingFilter(tokenizer)); + return new TokenStreamComponents(tokenizer, + new ASCIIFoldingFilter(tokenizer, random().nextBoolean())); } }; checkRandomData(random(), a, 1000*RANDOM_MULTIPLIER); @@ -1928,7 +1940,8 @@ public class TestASCIIFoldingFilter extends BaseTokenStreamTestCase { @Override protected TokenStreamComponents createComponents(String fieldName) { Tokenizer tokenizer = new KeywordTokenizer(); - return new TokenStreamComponents(tokenizer, new ASCIIFoldingFilter(tokenizer)); + return new TokenStreamComponents(tokenizer, + new ASCIIFoldingFilter(tokenizer, random().nextBoolean())); } }; checkOneTerm(a, "", "");