mirror of https://github.com/apache/lucene.git
LUCENE-5437: ASCIIFoldingFilter now has an option to preserve the original token
git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1567595 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent
cf3bef28fc
commit
531d3a5744
|
@ -157,6 +157,10 @@ New Features
|
|||
* LUCENE-5440: Add LongBitSet for managing more than 2.1B bits (otherwise use
|
||||
FixedBitSet). (Shai Erera)
|
||||
|
||||
* LUCENE-5437: ASCIIFoldingFilter now has an option to preserve the original token
|
||||
and emit it at the same position as the folded token only if the actual token was
|
||||
folded. (Simon Willnauer, Nik Everett)
|
||||
|
||||
Build
|
||||
|
||||
* LUCENE-5217,LUCENE-5420: Maven config: get dependencies from Ant+Ivy config;
|
||||
|
|
|
@ -22,6 +22,7 @@ import java.io.IOException;
|
|||
import org.apache.lucene.analysis.TokenFilter;
|
||||
import org.apache.lucene.analysis.TokenStream;
|
||||
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
|
||||
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
|
||||
import org.apache.lucene.util.ArrayUtil;
|
||||
import org.apache.lucene.util.RamUsageEstimator;
|
||||
|
||||
|
@ -57,17 +58,49 @@ import org.apache.lucene.util.RamUsageEstimator;
|
|||
* For example, 'à' will be replaced by 'a'.
|
||||
*/
|
||||
public final class ASCIIFoldingFilter extends TokenFilter {
|
||||
public ASCIIFoldingFilter(TokenStream input)
|
||||
{
|
||||
super(input);
|
||||
}
|
||||
|
||||
private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
|
||||
private final PositionIncrementAttribute posIncAttr = addAttribute(PositionIncrementAttribute.class);
|
||||
private final boolean preserveOriginal;
|
||||
private char[] output = new char[512];
|
||||
private int outputPos;
|
||||
private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
|
||||
private State state;
|
||||
|
||||
public ASCIIFoldingFilter(TokenStream input)
|
||||
{
|
||||
this(input, false);
|
||||
}
|
||||
|
||||
/**
|
||||
* Create a new {@link ASCIIFoldingFilter}.
|
||||
*
|
||||
* @param input
|
||||
* TokenStream to filter
|
||||
* @param preserveOriginal
|
||||
* should the original tokens be kept on the input stream with a 0 position increment
|
||||
* from the folded tokens?
|
||||
**/
|
||||
public ASCIIFoldingFilter(TokenStream input, boolean preserveOriginal)
|
||||
{
|
||||
super(input);
|
||||
this.preserveOriginal = preserveOriginal;
|
||||
}
|
||||
|
||||
/**
|
||||
* Does the filter preserve the original tokens?
|
||||
*/
|
||||
public boolean isPreserveOriginal() {
|
||||
return preserveOriginal;
|
||||
}
|
||||
|
||||
@Override
|
||||
public boolean incrementToken() throws IOException {
|
||||
if (state != null) {
|
||||
assert preserveOriginal : "state should only be captured if preserveOriginal is true";
|
||||
restoreState(state);
|
||||
posIncAttr.setPositionIncrement(0);
|
||||
state = null;
|
||||
return true;
|
||||
}
|
||||
if (input.incrementToken()) {
|
||||
final char[] buffer = termAtt.buffer();
|
||||
final int length = termAtt.length();
|
||||
|
@ -89,6 +122,12 @@ public final class ASCIIFoldingFilter extends TokenFilter {
|
|||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
public void reset() throws IOException {
|
||||
super.reset();
|
||||
state = null;
|
||||
}
|
||||
|
||||
/**
|
||||
* Converts characters above ASCII to their ASCII equivalents. For example,
|
||||
* accents are removed from accented characters.
|
||||
|
@ -97,6 +136,9 @@ public final class ASCIIFoldingFilter extends TokenFilter {
|
|||
*/
|
||||
public void foldToASCII(char[] input, int length)
|
||||
{
|
||||
if (preserveOriginal) {
|
||||
state = captureState();
|
||||
}
|
||||
// Worst-case length required:
|
||||
final int maxSizeNeeded = 4 * length;
|
||||
if (output.length < maxSizeNeeded) {
|
||||
|
|
|
@ -31,15 +31,17 @@ import org.apache.lucene.analysis.TokenStream;
|
|||
* <fieldType name="text_ascii" class="solr.TextField" positionIncrementGap="100">
|
||||
* <analyzer>
|
||||
* <tokenizer class="solr.WhitespaceTokenizerFactory"/>
|
||||
* <filter class="solr.ASCIIFoldingFilterFactory"/>
|
||||
* <filter class="solr.ASCIIFoldingFilterFactory" preserveOriginal="false"/>
|
||||
* </analyzer>
|
||||
* </fieldType></pre>
|
||||
*/
|
||||
public class ASCIIFoldingFilterFactory extends TokenFilterFactory implements MultiTermAwareComponent {
|
||||
private final boolean preserveOriginal;
|
||||
|
||||
/** Creates a new ASCIIFoldingFilterFactory */
|
||||
public ASCIIFoldingFilterFactory(Map<String,String> args) {
|
||||
super(args);
|
||||
preserveOriginal = getBoolean(args, "preserveOriginal", false);
|
||||
if (!args.isEmpty()) {
|
||||
throw new IllegalArgumentException("Unknown parameters: " + args);
|
||||
}
|
||||
|
@ -47,7 +49,7 @@ public class ASCIIFoldingFilterFactory extends TokenFilterFactory implements Mul
|
|||
|
||||
@Override
|
||||
public ASCIIFoldingFilter create(TokenStream input) {
|
||||
return new ASCIIFoldingFilter(input);
|
||||
return new ASCIIFoldingFilter(input, preserveOriginal);
|
||||
}
|
||||
|
||||
@Override
|
||||
|
|
|
@ -31,91 +31,103 @@ import java.util.ArrayList;
|
|||
import java.util.Iterator;
|
||||
|
||||
public class TestASCIIFoldingFilter extends BaseTokenStreamTestCase {
|
||||
/**
|
||||
* Pop one input token's worth of tokens off the filter and verify that they are as expected.
|
||||
*/
|
||||
void assertNextTerms(String expectedUnfolded, String expectedFolded, ASCIIFoldingFilter filter,
|
||||
CharTermAttribute termAtt) throws Exception {
|
||||
assertTrue(filter.incrementToken());
|
||||
assertEquals(expectedFolded, termAtt.toString());
|
||||
if (filter.isPreserveOriginal() && !expectedUnfolded.equals(expectedFolded)) {
|
||||
assertTrue(filter.incrementToken());
|
||||
assertEquals(expectedUnfolded, termAtt.toString());
|
||||
}
|
||||
}
|
||||
|
||||
// testLatin1Accents() is a copy of TestLatin1AccentFilter.testU().
|
||||
public void testLatin1Accents() throws Exception {
|
||||
TokenStream stream = whitespaceMockTokenizer("Des mot clés À LA CHAÎNE À Á Â Ã Ä Å Æ Ç È É Ê Ë Ì Í Î Ï IJ Ð Ñ"
|
||||
+" Ò Ó Ô Õ Ö Ø Œ Þ Ù Ú Û Ü Ý Ÿ à á â ã ä å æ ç è é ê ë ì í î ï ij"
|
||||
+" ð ñ ò ó ô õ ö ø œ ß þ ù ú û ü ý ÿ fi fl");
|
||||
ASCIIFoldingFilter filter = new ASCIIFoldingFilter(stream);
|
||||
ASCIIFoldingFilter filter = new ASCIIFoldingFilter(stream, random().nextBoolean());
|
||||
|
||||
CharTermAttribute termAtt = filter.getAttribute(CharTermAttribute.class);
|
||||
filter.reset();
|
||||
assertTermEquals("Des", filter, termAtt);
|
||||
assertTermEquals("mot", filter, termAtt);
|
||||
assertTermEquals("cles", filter, termAtt);
|
||||
assertTermEquals("A", filter, termAtt);
|
||||
assertTermEquals("LA", filter, termAtt);
|
||||
assertTermEquals("CHAINE", filter, termAtt);
|
||||
assertTermEquals("A", filter, termAtt);
|
||||
assertTermEquals("A", filter, termAtt);
|
||||
assertTermEquals("A", filter, termAtt);
|
||||
assertTermEquals("A", filter, termAtt);
|
||||
assertTermEquals("A", filter, termAtt);
|
||||
assertTermEquals("A", filter, termAtt);
|
||||
assertTermEquals("AE", filter, termAtt);
|
||||
assertTermEquals("C", filter, termAtt);
|
||||
assertTermEquals("E", filter, termAtt);
|
||||
assertTermEquals("E", filter, termAtt);
|
||||
assertTermEquals("E", filter, termAtt);
|
||||
assertTermEquals("E", filter, termAtt);
|
||||
assertTermEquals("I", filter, termAtt);
|
||||
assertTermEquals("I", filter, termAtt);
|
||||
assertTermEquals("I", filter, termAtt);
|
||||
assertTermEquals("I", filter, termAtt);
|
||||
assertTermEquals("IJ", filter, termAtt);
|
||||
assertTermEquals("D", filter, termAtt);
|
||||
assertTermEquals("N", filter, termAtt);
|
||||
assertTermEquals("O", filter, termAtt);
|
||||
assertTermEquals("O", filter, termAtt);
|
||||
assertTermEquals("O", filter, termAtt);
|
||||
assertTermEquals("O", filter, termAtt);
|
||||
assertTermEquals("O", filter, termAtt);
|
||||
assertTermEquals("O", filter, termAtt);
|
||||
assertTermEquals("OE", filter, termAtt);
|
||||
assertTermEquals("TH", filter, termAtt);
|
||||
assertTermEquals("U", filter, termAtt);
|
||||
assertTermEquals("U", filter, termAtt);
|
||||
assertTermEquals("U", filter, termAtt);
|
||||
assertTermEquals("U", filter, termAtt);
|
||||
assertTermEquals("Y", filter, termAtt);
|
||||
assertTermEquals("Y", filter, termAtt);
|
||||
assertTermEquals("a", filter, termAtt);
|
||||
assertTermEquals("a", filter, termAtt);
|
||||
assertTermEquals("a", filter, termAtt);
|
||||
assertTermEquals("a", filter, termAtt);
|
||||
assertTermEquals("a", filter, termAtt);
|
||||
assertTermEquals("a", filter, termAtt);
|
||||
assertTermEquals("ae", filter, termAtt);
|
||||
assertTermEquals("c", filter, termAtt);
|
||||
assertTermEquals("e", filter, termAtt);
|
||||
assertTermEquals("e", filter, termAtt);
|
||||
assertTermEquals("e", filter, termAtt);
|
||||
assertTermEquals("e", filter, termAtt);
|
||||
assertTermEquals("i", filter, termAtt);
|
||||
assertTermEquals("i", filter, termAtt);
|
||||
assertTermEquals("i", filter, termAtt);
|
||||
assertTermEquals("i", filter, termAtt);
|
||||
assertTermEquals("ij", filter, termAtt);
|
||||
assertTermEquals("d", filter, termAtt);
|
||||
assertTermEquals("n", filter, termAtt);
|
||||
assertTermEquals("o", filter, termAtt);
|
||||
assertTermEquals("o", filter, termAtt);
|
||||
assertTermEquals("o", filter, termAtt);
|
||||
assertTermEquals("o", filter, termAtt);
|
||||
assertTermEquals("o", filter, termAtt);
|
||||
assertTermEquals("o", filter, termAtt);
|
||||
assertTermEquals("oe", filter, termAtt);
|
||||
assertTermEquals("ss", filter, termAtt);
|
||||
assertTermEquals("th", filter, termAtt);
|
||||
assertTermEquals("u", filter, termAtt);
|
||||
assertTermEquals("u", filter, termAtt);
|
||||
assertTermEquals("u", filter, termAtt);
|
||||
assertTermEquals("u", filter, termAtt);
|
||||
assertTermEquals("y", filter, termAtt);
|
||||
assertTermEquals("y", filter, termAtt);
|
||||
assertTermEquals("fi", filter, termAtt);
|
||||
assertTermEquals("fl", filter, termAtt);
|
||||
assertNextTerms("Des", "Des", filter, termAtt);
|
||||
assertNextTerms("mot", "mot", filter, termAtt);
|
||||
assertNextTerms("clés", "cles", filter, termAtt);
|
||||
assertNextTerms("À", "A", filter, termAtt);
|
||||
assertNextTerms("LA", "LA", filter, termAtt);
|
||||
assertNextTerms("CHAÎNE", "CHAINE", filter, termAtt);
|
||||
assertNextTerms("À", "A", filter, termAtt);
|
||||
assertNextTerms("Á", "A", filter, termAtt);
|
||||
assertNextTerms("Â", "A", filter, termAtt);
|
||||
assertNextTerms("Ã", "A", filter, termAtt);
|
||||
assertNextTerms("Ä", "A", filter, termAtt);
|
||||
assertNextTerms("Å", "A", filter, termAtt);
|
||||
assertNextTerms("Æ", "AE", filter, termAtt);
|
||||
assertNextTerms("Ç", "C", filter, termAtt);
|
||||
assertNextTerms("È", "E", filter, termAtt);
|
||||
assertNextTerms("É", "E", filter, termAtt);
|
||||
assertNextTerms("Ê", "E", filter, termAtt);
|
||||
assertNextTerms("Ë", "E", filter, termAtt);
|
||||
assertNextTerms("Ì", "I", filter, termAtt);
|
||||
assertNextTerms("Í", "I", filter, termAtt);
|
||||
assertNextTerms("Î", "I", filter, termAtt);
|
||||
assertNextTerms("Ï", "I", filter, termAtt);
|
||||
assertNextTerms("IJ", "IJ", filter, termAtt);
|
||||
assertNextTerms("Ð", "D", filter, termAtt);
|
||||
assertNextTerms("Ñ", "N", filter, termAtt);
|
||||
assertNextTerms("Ò", "O", filter, termAtt);
|
||||
assertNextTerms("Ó", "O", filter, termAtt);
|
||||
assertNextTerms("Ô", "O", filter, termAtt);
|
||||
assertNextTerms("Õ", "O", filter, termAtt);
|
||||
assertNextTerms("Ö", "O", filter, termAtt);
|
||||
assertNextTerms("Ø", "O", filter, termAtt);
|
||||
assertNextTerms("Œ", "OE", filter, termAtt);
|
||||
assertNextTerms("Þ", "TH", filter, termAtt);
|
||||
assertNextTerms("Ù", "U", filter, termAtt);
|
||||
assertNextTerms("Ú", "U", filter, termAtt);
|
||||
assertNextTerms("Û", "U", filter, termAtt);
|
||||
assertNextTerms("Ü", "U", filter, termAtt);
|
||||
assertNextTerms("Ý", "Y", filter, termAtt);
|
||||
assertNextTerms("Ÿ", "Y", filter, termAtt);
|
||||
assertNextTerms("à", "a", filter, termAtt);
|
||||
assertNextTerms("á", "a", filter, termAtt);
|
||||
assertNextTerms("â", "a", filter, termAtt);
|
||||
assertNextTerms("ã", "a", filter, termAtt);
|
||||
assertNextTerms("ä", "a", filter, termAtt);
|
||||
assertNextTerms("å", "a", filter, termAtt);
|
||||
assertNextTerms("æ", "ae", filter, termAtt);
|
||||
assertNextTerms("ç", "c", filter, termAtt);
|
||||
assertNextTerms("è", "e", filter, termAtt);
|
||||
assertNextTerms("é", "e", filter, termAtt);
|
||||
assertNextTerms("ê", "e", filter, termAtt);
|
||||
assertNextTerms("ë", "e", filter, termAtt);
|
||||
assertNextTerms("ì", "i", filter, termAtt);
|
||||
assertNextTerms("í", "i", filter, termAtt);
|
||||
assertNextTerms("î", "i", filter, termAtt);
|
||||
assertNextTerms("ï", "i", filter, termAtt);
|
||||
assertNextTerms("ij", "ij", filter, termAtt);
|
||||
assertNextTerms("ð", "d", filter, termAtt);
|
||||
assertNextTerms("ñ", "n", filter, termAtt);
|
||||
assertNextTerms("ò", "o", filter, termAtt);
|
||||
assertNextTerms("ó", "o", filter, termAtt);
|
||||
assertNextTerms("ô", "o", filter, termAtt);
|
||||
assertNextTerms("õ", "o", filter, termAtt);
|
||||
assertNextTerms("ö", "o", filter, termAtt);
|
||||
assertNextTerms("ø", "o", filter, termAtt);
|
||||
assertNextTerms("œ", "oe", filter, termAtt);
|
||||
assertNextTerms("ß", "ss", filter, termAtt);
|
||||
assertNextTerms("þ", "th", filter, termAtt);
|
||||
assertNextTerms("ù", "u", filter, termAtt);
|
||||
assertNextTerms("ú", "u", filter, termAtt);
|
||||
assertNextTerms("û", "u", filter, termAtt);
|
||||
assertNextTerms("ü", "u", filter, termAtt);
|
||||
assertNextTerms("ý", "y", filter, termAtt);
|
||||
assertNextTerms("ÿ", "y", filter, termAtt);
|
||||
assertNextTerms("fi", "fi", filter, termAtt);
|
||||
assertNextTerms("fl", "fl", filter, termAtt);
|
||||
assertFalse(filter.incrementToken());
|
||||
}
|
||||
|
||||
|
@ -1876,7 +1888,8 @@ public class TestASCIIFoldingFilter extends BaseTokenStreamTestCase {
|
|||
};
|
||||
|
||||
// Construct input text and expected output tokens
|
||||
List<String> expectedOutputTokens = new ArrayList<String>();
|
||||
List<String> expectedUnfoldedTokens = new ArrayList<String>();
|
||||
List<String> expectedFoldedTokens = new ArrayList<String>();
|
||||
StringBuilder inputText = new StringBuilder();
|
||||
for (int n = 0 ; n < foldings.length ; n += 2) {
|
||||
if (n > 0) {
|
||||
|
@ -1884,32 +1897,30 @@ public class TestASCIIFoldingFilter extends BaseTokenStreamTestCase {
|
|||
}
|
||||
inputText.append(foldings[n]);
|
||||
|
||||
// Construct the expected output token: the ASCII string to fold to,
|
||||
// duplicated as many times as the number of characters in the input text.
|
||||
// Construct the expected output tokens: both the unfolded and folded string,
|
||||
// with the folded duplicated as many times as the number of characters in
|
||||
// the input text.
|
||||
StringBuilder expected = new StringBuilder();
|
||||
int numChars = foldings[n].length();
|
||||
for (int m = 0 ; m < numChars; ++m) {
|
||||
expected.append(foldings[n + 1]);
|
||||
}
|
||||
expectedOutputTokens.add(expected.toString());
|
||||
expectedUnfoldedTokens.add(foldings[n]);
|
||||
expectedFoldedTokens.add(expected.toString());
|
||||
}
|
||||
|
||||
TokenStream stream = whitespaceMockTokenizer(inputText.toString());
|
||||
ASCIIFoldingFilter filter = new ASCIIFoldingFilter(stream);
|
||||
ASCIIFoldingFilter filter = new ASCIIFoldingFilter(stream, random().nextBoolean());
|
||||
CharTermAttribute termAtt = filter.getAttribute(CharTermAttribute.class);
|
||||
Iterator<String> expectedIter = expectedOutputTokens.iterator();
|
||||
Iterator<String> unfoldedIter = expectedUnfoldedTokens.iterator();
|
||||
Iterator<String> foldedIter = expectedFoldedTokens.iterator();
|
||||
filter.reset();
|
||||
while (expectedIter.hasNext()) {
|
||||
assertTermEquals(expectedIter.next(), filter, termAtt);
|
||||
while (foldedIter.hasNext()) {
|
||||
assertNextTerms(unfoldedIter.next(), foldedIter.next(), filter, termAtt);
|
||||
}
|
||||
assertFalse(filter.incrementToken());
|
||||
}
|
||||
|
||||
void assertTermEquals(String expected, TokenStream stream, CharTermAttribute termAtt) throws Exception {
|
||||
assertTrue(stream.incrementToken());
|
||||
assertEquals(expected, termAtt.toString());
|
||||
}
|
||||
|
||||
/** blast some random strings through the analyzer */
|
||||
public void testRandomStrings() throws Exception {
|
||||
Analyzer a = new Analyzer() {
|
||||
|
@ -1917,7 +1928,8 @@ public class TestASCIIFoldingFilter extends BaseTokenStreamTestCase {
|
|||
@Override
|
||||
protected TokenStreamComponents createComponents(String fieldName) {
|
||||
Tokenizer tokenizer = new MockTokenizer(MockTokenizer.WHITESPACE, false);
|
||||
return new TokenStreamComponents(tokenizer, new ASCIIFoldingFilter(tokenizer));
|
||||
return new TokenStreamComponents(tokenizer,
|
||||
new ASCIIFoldingFilter(tokenizer, random().nextBoolean()));
|
||||
}
|
||||
};
|
||||
checkRandomData(random(), a, 1000*RANDOM_MULTIPLIER);
|
||||
|
@ -1928,7 +1940,8 @@ public class TestASCIIFoldingFilter extends BaseTokenStreamTestCase {
|
|||
@Override
|
||||
protected TokenStreamComponents createComponents(String fieldName) {
|
||||
Tokenizer tokenizer = new KeywordTokenizer();
|
||||
return new TokenStreamComponents(tokenizer, new ASCIIFoldingFilter(tokenizer));
|
||||
return new TokenStreamComponents(tokenizer,
|
||||
new ASCIIFoldingFilter(tokenizer, random().nextBoolean()));
|
||||
}
|
||||
};
|
||||
checkOneTerm(a, "", "");
|
||||
|
|
Loading…
Reference in New Issue