LUCENE-5437: ASCIIFoldingFilter now has an option to preserve the original token

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1567595 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Simon Willnauer 2014-02-12 10:42:54 +00:00
parent cf3bef28fc
commit 531d3a5744
4 changed files with 160 additions and 99 deletions

View File

@ -157,6 +157,10 @@ New Features
* LUCENE-5440: Add LongBitSet for managing more than 2.1B bits (otherwise use
FixedBitSet). (Shai Erera)
* LUCENE-5437: ASCIIFoldingFilter now has an option to preserve the original token
and emit it at the same position as the folded token, but only if the token was
actually folded. (Simon Willnauer, Nik Everett)
Build
* LUCENE-5217,LUCENE-5420: Maven config: get dependencies from Ant+Ivy config;

View File

@ -22,6 +22,7 @@ import java.io.IOException;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.util.ArrayUtil;
import org.apache.lucene.util.RamUsageEstimator;
@ -57,17 +58,49 @@ import org.apache.lucene.util.RamUsageEstimator;
* For example, 'à' will be replaced by 'a'.
*/
public final class ASCIIFoldingFilter extends TokenFilter {
  private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
  private final PositionIncrementAttribute posIncAttr = addAttribute(PositionIncrementAttribute.class);
  // When true, each folded token is followed by the original token at posInc 0.
  private final boolean preserveOriginal;
  // Folded output buffer, grown on demand (worst case is 4 output chars per input char).
  private char[] output = new char[512];
  private int outputPos;
  // Token state captured before folding, pending re-emission of the original token.
  private State state;

  /**
   * Creates a filter that folds characters without preserving the original tokens.
   *
   * @param input TokenStream to filter
   */
  public ASCIIFoldingFilter(TokenStream input)
  {
    this(input, false);
  }
/**
 * Creates a new {@link ASCIIFoldingFilter}.
 *
 * @param input the {@link TokenStream} to filter
 * @param preserveOriginal if {@code true}, the original (unfolded) token is also
 *        kept on the stream, at a position increment of 0 from the folded token
 */
public ASCIIFoldingFilter(TokenStream input, boolean preserveOriginal) {
  super(input);
  this.preserveOriginal = preserveOriginal;
}
/**
 * Returns {@code true} if this filter also emits the original, unfolded tokens.
 */
public boolean isPreserveOriginal() {
  return preserveOriginal;
}
@Override
public boolean incrementToken() throws IOException {
// A non-null state was captured before folding (see foldToASCII): re-emit the
// original, unfolded token at the same position as the folded one (posInc 0).
if (state != null) {
assert preserveOriginal : "state should only be captured if preserveOriginal is true";
restoreState(state);
posIncAttr.setPositionIncrement(0);
state = null;
return true;
}
// No pending original token: pull the next token from the input stream.
if (input.incrementToken()) {
final char[] buffer = termAtt.buffer();
final int length = termAtt.length();
@ -89,6 +122,12 @@ public final class ASCIIFoldingFilter extends TokenFilter {
}
}
@Override
public void reset() throws IOException {
super.reset();
// Drop any pending original token so a reused stream starts from a clean slate.
state = null;
}
/**
 * Converts characters above ASCII to their ASCII equivalents. For example,
 * accents are removed from accented characters.
@ -97,6 +136,9 @@ public final class ASCIIFoldingFilter extends TokenFilter {
 */
public void foldToASCII(char[] input, int length)
{
// Capture the pre-fold token state so incrementToken() can re-emit the original.
// NOTE(review): captured whenever preserveOriginal is set; whether it is later
// discarded when no character actually folds happens in the (not shown) remainder
// of this method — confirm against the full source.
if (preserveOriginal) {
state = captureState();
}
// Worst-case length required:
final int maxSizeNeeded = 4 * length;
if (output.length < maxSizeNeeded) {

View File

@ -31,15 +31,17 @@ import org.apache.lucene.analysis.TokenStream;
* &lt;fieldType name="text_ascii" class="solr.TextField" positionIncrementGap="100"&gt;
* &lt;analyzer&gt;
* &lt;tokenizer class="solr.WhitespaceTokenizerFactory"/&gt;
* &lt;filter class="solr.ASCIIFoldingFilterFactory"/&gt;
* &lt;filter class="solr.ASCIIFoldingFilterFactory" preserveOriginal="false"/&gt;
* &lt;/analyzer&gt;
* &lt;/fieldType&gt;</pre>
*/
public class ASCIIFoldingFilterFactory extends TokenFilterFactory implements MultiTermAwareComponent {
// Whether filters created by this factory also emit the original (unfolded) token.
private final boolean preserveOriginal;
/** Creates a new ASCIIFoldingFilterFactory */
public ASCIIFoldingFilterFactory(Map<String,String> args) {
super(args);
// Optional factory arg; defaults to false (fold only, drop originals).
preserveOriginal = getBoolean(args, "preserveOriginal", false);
// Any args left unconsumed at this point are unknown or misspelled.
if (!args.isEmpty()) {
throw new IllegalArgumentException("Unknown parameters: " + args);
}
@ -47,7 +49,7 @@ public class ASCIIFoldingFilterFactory extends TokenFilterFactory implements Mul
@Override
public ASCIIFoldingFilter create(TokenStream input) {
  // Propagate the factory's preserveOriginal setting to each created filter.
  // (The rendered diff showed both the old one-arg and the new two-arg call as
  // consecutive returns; only the two-arg form is reachable/correct.)
  return new ASCIIFoldingFilter(input, preserveOriginal);
}
@Override

View File

@ -31,91 +31,103 @@ import java.util.ArrayList;
import java.util.Iterator;
public class TestASCIIFoldingFilter extends BaseTokenStreamTestCase {
/**
 * Consumes the tokens produced from one input token and verifies them: first the
 * folded form, then — when the filter preserves originals and folding actually
 * changed the text — the original, unfolded form.
 */
void assertNextTerms(String expectedUnfolded, String expectedFolded, ASCIIFoldingFilter filter,
    CharTermAttribute termAtt) throws Exception {
  assertTrue(filter.incrementToken());
  assertEquals(expectedFolded, termAtt.toString());
  final boolean originalExpected =
      filter.isPreserveOriginal() && !expectedFolded.equals(expectedUnfolded);
  if (originalExpected) {
    assertTrue(filter.incrementToken());
    assertEquals(expectedUnfolded, termAtt.toString());
  }
}
// testLatin1Accents() is a copy of TestLatin1AccentFilter.testU().
public void testLatin1Accents() throws Exception {
  // NOTE(review): the last two input tokens are the Unicode ligatures U+FB01 (ﬁ)
  // and U+FB02 (ﬂ); they had been mangled into plain "fi fl" / empty strings by a
  // previous rendering of this file.
  TokenStream stream = whitespaceMockTokenizer("Des mot clés À LA CHAÎNE À Á Â Ã Ä Å Æ Ç È É Ê Ë Ì Í Î Ï IJ Ð Ñ"
      +" Ò Ó Ô Õ Ö Ø Œ Þ Ù Ú Û Ü Ý Ÿ à á â ã ä å æ ç è é ê ë ì í î ï ij"
      +" ð ñ ò ó ô õ ö ø œ ß þ ù ú û ü ý ÿ ﬁ ﬂ");
  // Randomly exercise both preserveOriginal modes; assertNextTerms only checks the
  // extra original token when the filter preserves originals.
  ASCIIFoldingFilter filter = new ASCIIFoldingFilter(stream, random().nextBoolean());
  CharTermAttribute termAtt = filter.getAttribute(CharTermAttribute.class);
  filter.reset();
  assertNextTerms("Des", "Des", filter, termAtt);
  assertNextTerms("mot", "mot", filter, termAtt);
  assertNextTerms("clés", "cles", filter, termAtt);
  assertNextTerms("À", "A", filter, termAtt);
  assertNextTerms("LA", "LA", filter, termAtt);
  assertNextTerms("CHAÎNE", "CHAINE", filter, termAtt);
  assertNextTerms("À", "A", filter, termAtt);
  assertNextTerms("Á", "A", filter, termAtt);
  assertNextTerms("Â", "A", filter, termAtt);
  assertNextTerms("Ã", "A", filter, termAtt);
  assertNextTerms("Ä", "A", filter, termAtt);
  assertNextTerms("Å", "A", filter, termAtt);
  assertNextTerms("Æ", "AE", filter, termAtt);
  assertNextTerms("Ç", "C", filter, termAtt);
  assertNextTerms("È", "E", filter, termAtt);
  assertNextTerms("É", "E", filter, termAtt);
  assertNextTerms("Ê", "E", filter, termAtt);
  assertNextTerms("Ë", "E", filter, termAtt);
  assertNextTerms("Ì", "I", filter, termAtt);
  assertNextTerms("Í", "I", filter, termAtt);
  assertNextTerms("Î", "I", filter, termAtt);
  assertNextTerms("Ï", "I", filter, termAtt);
  assertNextTerms("IJ", "IJ", filter, termAtt);
  assertNextTerms("Ð", "D", filter, termAtt);
  assertNextTerms("Ñ", "N", filter, termAtt);
  assertNextTerms("Ò", "O", filter, termAtt);
  assertNextTerms("Ó", "O", filter, termAtt);
  assertNextTerms("Ô", "O", filter, termAtt);
  assertNextTerms("Õ", "O", filter, termAtt);
  assertNextTerms("Ö", "O", filter, termAtt);
  assertNextTerms("Ø", "O", filter, termAtt);
  assertNextTerms("Œ", "OE", filter, termAtt);
  assertNextTerms("Þ", "TH", filter, termAtt);
  assertNextTerms("Ù", "U", filter, termAtt);
  assertNextTerms("Ú", "U", filter, termAtt);
  assertNextTerms("Û", "U", filter, termAtt);
  assertNextTerms("Ü", "U", filter, termAtt);
  assertNextTerms("Ý", "Y", filter, termAtt);
  assertNextTerms("Ÿ", "Y", filter, termAtt);
  assertNextTerms("à", "a", filter, termAtt);
  assertNextTerms("á", "a", filter, termAtt);
  assertNextTerms("â", "a", filter, termAtt);
  assertNextTerms("ã", "a", filter, termAtt);
  assertNextTerms("ä", "a", filter, termAtt);
  assertNextTerms("å", "a", filter, termAtt);
  assertNextTerms("æ", "ae", filter, termAtt);
  assertNextTerms("ç", "c", filter, termAtt);
  assertNextTerms("è", "e", filter, termAtt);
  assertNextTerms("é", "e", filter, termAtt);
  assertNextTerms("ê", "e", filter, termAtt);
  assertNextTerms("ë", "e", filter, termAtt);
  assertNextTerms("ì", "i", filter, termAtt);
  assertNextTerms("í", "i", filter, termAtt);
  assertNextTerms("î", "i", filter, termAtt);
  assertNextTerms("ï", "i", filter, termAtt);
  assertNextTerms("ij", "ij", filter, termAtt);
  assertNextTerms("ð", "d", filter, termAtt);
  assertNextTerms("ñ", "n", filter, termAtt);
  assertNextTerms("ò", "o", filter, termAtt);
  assertNextTerms("ó", "o", filter, termAtt);
  assertNextTerms("ô", "o", filter, termAtt);
  assertNextTerms("õ", "o", filter, termAtt);
  assertNextTerms("ö", "o", filter, termAtt);
  assertNextTerms("ø", "o", filter, termAtt);
  assertNextTerms("œ", "oe", filter, termAtt);
  assertNextTerms("ß", "ss", filter, termAtt);
  assertNextTerms("þ", "th", filter, termAtt);
  assertNextTerms("ù", "u", filter, termAtt);
  assertNextTerms("ú", "u", filter, termAtt);
  assertNextTerms("û", "u", filter, termAtt);
  assertNextTerms("ü", "u", filter, termAtt);
  assertNextTerms("ý", "y", filter, termAtt);
  assertNextTerms("ÿ", "y", filter, termAtt);
  assertNextTerms("ﬁ", "fi", filter, termAtt);
  assertNextTerms("ﬂ", "fl", filter, termAtt);
  assertFalse(filter.incrementToken());
}
@ -1876,7 +1888,8 @@ public class TestASCIIFoldingFilter extends BaseTokenStreamTestCase {
};
// Construct input text and expected output tokens
// NOTE(review): this hunk is a rendered diff; several statements below appear in
// both their pre-change form (expectedOutputTokens / expectedIter / assertTermEquals)
// and their post-change form (expectedUnfoldedTokens + expectedFoldedTokens /
// unfoldedIter + foldedIter / assertNextTerms). Only the post-change lines survive
// in the committed file.
List<String> expectedOutputTokens = new ArrayList<String>();
List<String> expectedUnfoldedTokens = new ArrayList<String>();
List<String> expectedFoldedTokens = new ArrayList<String>();
StringBuilder inputText = new StringBuilder();
// foldings holds (input string, expected per-char ASCII folding) pairs.
for (int n = 0 ; n < foldings.length ; n += 2) {
if (n > 0) {
@ -1884,32 +1897,30 @@
}
inputText.append(foldings[n]);
// Construct the expected output token: the ASCII string to fold to,
// duplicated as many times as the number of characters in the input text.
// Construct the expected output tokens: both the unfolded and folded string,
// with the folded duplicated as many times as the number of characters in
// the input text.
StringBuilder expected = new StringBuilder();
int numChars = foldings[n].length();
for (int m = 0 ; m < numChars; ++m) {
expected.append(foldings[n + 1]);
}
expectedOutputTokens.add(expected.toString());
expectedUnfoldedTokens.add(foldings[n]);
expectedFoldedTokens.add(expected.toString());
}
TokenStream stream = whitespaceMockTokenizer(inputText.toString());
// Exercise both preserveOriginal modes at random.
ASCIIFoldingFilter filter = new ASCIIFoldingFilter(stream);
ASCIIFoldingFilter filter = new ASCIIFoldingFilter(stream, random().nextBoolean());
CharTermAttribute termAtt = filter.getAttribute(CharTermAttribute.class);
Iterator<String> expectedIter = expectedOutputTokens.iterator();
Iterator<String> unfoldedIter = expectedUnfoldedTokens.iterator();
Iterator<String> foldedIter = expectedFoldedTokens.iterator();
filter.reset();
while (expectedIter.hasNext()) {
assertTermEquals(expectedIter.next(), filter, termAtt);
while (foldedIter.hasNext()) {
assertNextTerms(unfoldedIter.next(), foldedIter.next(), filter, termAtt);
}
assertFalse(filter.incrementToken());
}
/** Pops the next token off {@code stream} and checks that its term text equals {@code expected}. */
void assertTermEquals(String expected, TokenStream stream, CharTermAttribute termAtt) throws Exception {
  final boolean hasToken = stream.incrementToken();
  assertTrue(hasToken);
  assertEquals(expected, termAtt.toString());
}
/** blast some random strings through the analyzer */
public void testRandomStrings() throws Exception {
Analyzer a = new Analyzer() {
@ -1917,7 +1928,8 @@ public class TestASCIIFoldingFilter extends BaseTokenStreamTestCase {
@Override
protected TokenStreamComponents createComponents(String fieldName) {
Tokenizer tokenizer = new MockTokenizer(MockTokenizer.WHITESPACE, false);
// NOTE(review): rendered diff — the single-line return below is the pre-change
// form; the two-line return that follows (randomizing preserveOriginal) is the
// committed replacement.
return new TokenStreamComponents(tokenizer, new ASCIIFoldingFilter(tokenizer));
return new TokenStreamComponents(tokenizer,
new ASCIIFoldingFilter(tokenizer, random().nextBoolean()));
}
};
checkRandomData(random(), a, 1000*RANDOM_MULTIPLIER);
@ -1928,7 +1940,8 @@ public class TestASCIIFoldingFilter extends BaseTokenStreamTestCase {
@Override
protected TokenStreamComponents createComponents(String fieldName) {
Tokenizer tokenizer = new KeywordTokenizer();
return new TokenStreamComponents(tokenizer, new ASCIIFoldingFilter(tokenizer));
return new TokenStreamComponents(tokenizer,
new ASCIIFoldingFilter(tokenizer, random().nextBoolean()));
}
};
checkOneTerm(a, "", "");