LUCENE-2413: consolidate ASCIIFolding and ISOLatin1Accent to contrib/analyzers

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@940591 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Robert Muir 2010-05-03 20:02:06 +00:00
parent 59a62bcb64
commit 89c24fbe37
12 changed files with 57 additions and 4316 deletions

View File

@ -6,6 +6,8 @@ Changes in backwards compatibility policy
* LUCENE-2413: Consolidated all Lucene analyzers into contrib/analyzers.
- o.a.l.analysis.PorterStemFilter -> o.a.l.analysis.en.PorterStemFilter
- o.a.l.analysis.ASCIIFoldingFilter -> o.a.l.analysis.miscellaneous.ASCIIFoldingFilter
- o.a.l.analysis.ISOLatin1AccentFilter -> o.a.l.analysis.miscellaneous.ISOLatin1AccentFilter
... (in progress)
* LUCENE-1458, LUCENE-2111, LUCENE-2354: Changes from flexible indexing:

View File

@ -1,260 +0,0 @@
package org.apache.lucene.analysis;
import org.apache.lucene.analysis.tokenattributes.TermAttribute;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/**
 * Token filter that rewrites accented characters of the ISO Latin 1
 * (ISO-8859-1) character set to their unaccented form, leaving case untouched.
 * A handful of characters outside Latin 1 (for example the OE and IJ digraphs,
 * Y with diaeresis and several f/s ligatures) are folded as well.
 * <p>
 * For instance, '&agrave;' will be replaced by 'a'.
 * <p>
 *
 * @deprecated If you build a new index, use {@link ASCIIFoldingFilter}
 * which covers a superset of Latin 1.
 * This class is included for use with existing
 * indexes and will be removed in a future release (possibly Lucene 4.0).
 */
public final class ISOLatin1AccentFilter extends TokenFilter {

  /** Scratch buffer that receives the folded term; replaced by a larger array when needed. */
  private char[] output = new char[256];

  /** Number of valid characters currently held in {@link #output}. */
  private int outputPos;

  /** Term attribute of the wrapped stream, read and rewritten by this filter. */
  private TermAttribute termAtt;

  /**
   * Wraps the given stream and registers the term attribute on it.
   *
   * @param input the token stream whose terms are to be folded
   */
  public ISOLatin1AccentFilter(TokenStream input) {
    super(input);
    termAtt = addAttribute(TermAttribute.class);
  }

  /**
   * Advances the wrapped stream by one token and folds its term if needed.
   * The term is rewritten only when at least one of its characters lies in
   * the range U+00C0..U+FB06; otherwise the token passes through untouched.
   *
   * @return {@code true} if a token is available, {@code false} once the
   *         wrapped stream is exhausted
   */
  @Override
  public final boolean incrementToken() throws java.io.IOException {
    if (!input.incrementToken()) {
      return false;
    }

    final char[] termChars = termAtt.termBuffer();
    final int termLen = termAtt.termLength();

    // Scan for the first candidate character; a single hit triggers one
    // rewrite of the complete term, after which scanning stops.
    for (int idx = 0; idx < termLen; idx++) {
      final char ch = termChars[idx];
      if (ch >= '\u00c0' && ch <= '\uFB06') {
        removeAccents(termChars, termLen);
        termAtt.setTermBuffer(output, 0, outputPos);
        break;
      }
    }
    return true;
  }

  /**
   * Folds the first {@code length} characters of {@code input} into the
   * internal output buffer, replacing accented characters by their
   * unaccented equivalents. Nothing is returned: the result is left in the
   * internal buffer, which {@link #incrementToken()} copies into the term.
   *
   * @param input  characters to fold
   * @param length number of leading characters of {@code input} to process
   */
  public final void removeAccents(char[] input, int length) {
    // Worst case: every character expands to two output characters.
    growOutputFor(2 * length);
    outputPos = 0;

    for (int pos = 0; pos < length; pos++) {
      final char c = input[pos];

      // Fast path: characters outside the candidate range are copied as-is.
      if (c < '\u00c0' || c > '\uFB06') {
        emit(c);
        continue;
      }

      switch (c) {
        case '\u00C0': // À
        case '\u00C1': // Á
        case '\u00C2': // Â
        case '\u00C3': // Ã
        case '\u00C4': // Ä
        case '\u00C5': // Å
          emit('A');
          break;
        case '\u00C6': // Æ
          emit('A', 'E');
          break;
        case '\u00C7': // Ç
          emit('C');
          break;
        case '\u00C8': // È
        case '\u00C9': // É
        case '\u00CA': // Ê
        case '\u00CB': // Ë
          emit('E');
          break;
        case '\u00CC': // Ì
        case '\u00CD': // Í
        case '\u00CE': // Î
        case '\u00CF': // Ï
          emit('I');
          break;
        case '\u0132': // capital IJ ligature
          emit('I', 'J');
          break;
        case '\u00D0': // Ð
          emit('D');
          break;
        case '\u00D1': // Ñ
          emit('N');
          break;
        case '\u00D2': // Ò
        case '\u00D3': // Ó
        case '\u00D4': // Ô
        case '\u00D5': // Õ
        case '\u00D6': // Ö
        case '\u00D8': // Ø
          emit('O');
          break;
        case '\u0152': // Œ
          emit('O', 'E');
          break;
        case '\u00DE': // Þ
          emit('T', 'H');
          break;
        case '\u00D9': // Ù
        case '\u00DA': // Ú
        case '\u00DB': // Û
        case '\u00DC': // Ü
          emit('U');
          break;
        case '\u00DD': // Ý
        case '\u0178': // Ÿ
          emit('Y');
          break;
        case '\u00E0': // à
        case '\u00E1': // á
        case '\u00E2': // â
        case '\u00E3': // ã
        case '\u00E4': // ä
        case '\u00E5': // å
          emit('a');
          break;
        case '\u00E6': // æ
          emit('a', 'e');
          break;
        case '\u00E7': // ç
          emit('c');
          break;
        case '\u00E8': // è
        case '\u00E9': // é
        case '\u00EA': // ê
        case '\u00EB': // ë
          emit('e');
          break;
        case '\u00EC': // ì
        case '\u00ED': // í
        case '\u00EE': // î
        case '\u00EF': // ï
          emit('i');
          break;
        case '\u0133': // small ij ligature
          emit('i', 'j');
          break;
        case '\u00F0': // ð
          emit('d');
          break;
        case '\u00F1': // ñ
          emit('n');
          break;
        case '\u00F2': // ò
        case '\u00F3': // ó
        case '\u00F4': // ô
        case '\u00F5': // õ
        case '\u00F6': // ö
        case '\u00F8': // ø
          emit('o');
          break;
        case '\u0153': // œ
          emit('o', 'e');
          break;
        case '\u00DF': // ß
          emit('s', 's');
          break;
        case '\u00FE': // þ
          emit('t', 'h');
          break;
        case '\u00F9': // ù
        case '\u00FA': // ú
        case '\u00FB': // û
        case '\u00FC': // ü
          emit('u');
          break;
        case '\u00FD': // ý
        case '\u00FF': // ÿ
          emit('y');
          break;
        case '\uFB00': // ff ligature
          emit('f', 'f');
          break;
        case '\uFB01': // fi ligature
          emit('f', 'i');
          break;
        case '\uFB02': // fl ligature
          emit('f', 'l');
          break;
        // U+FB03 (ffi) and U+FB04 (ffl) are deliberately NOT folded: each
        // would expand to three characters and could exceed the 2*length
        // worst-case sizing above (sizing for 3*length was judged too
        // expensive). They fall through to the default branch unchanged.
        case '\uFB05': // long-s t ligature, folded to "ft"
          emit('f', 't');
          break;
        case '\uFB06': // st ligature
          emit('s', 't');
          break;
        default:
          // In range but without a mapping: keep the character as it is.
          emit(c);
          break;
      }
    }
  }

  /**
   * Makes sure the output buffer can hold {@code maxSizeNeeded} characters,
   * doubling the current capacity until it suffices. A new array is
   * allocated only when the capacity actually has to change; previous
   * contents are not preserved.
   */
  private void growOutputFor(int maxSizeNeeded) {
    int capacity = output.length;
    while (capacity < maxSizeNeeded) {
      capacity *= 2;
    }
    if (capacity != output.length) {
      output = new char[capacity];
    }
  }

  /** Appends one character to the output buffer. */
  private void emit(char single) {
    output[outputPos++] = single;
  }

  /** Appends two characters, in order, to the output buffer. */
  private void emit(char first, char second) {
    output[outputPos++] = first;
    output[outputPos++] = second;
  }
}

View File

@ -1,111 +0,0 @@
package org.apache.lucene.analysis;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import org.apache.lucene.analysis.tokenattributes.TermAttribute;
import java.io.StringReader;
/**
 * Checks that {@link ISOLatin1AccentFilter} folds every character it maps,
 * by running a whitespace-separated sample through the filter and comparing
 * each produced term with the expected unaccented form.
 */
public class TestISOLatin1AccentFilter extends BaseTokenStreamTestCase {

  /**
   * Tokenizes one sample line on whitespace, filters it, and asserts the
   * resulting terms one by one, in order, followed by end of stream.
   */
  public void testU() throws Exception {
    final String sample = "Des mot clés À LA CHAÎNE À Á Â Ã Ä Å Æ Ç È É Ê Ë Ì Í Î Ï IJ Ð Ñ Ò Ó Ô Õ Ö Ø Œ Þ Ù Ú Û Ü Ý Ÿ à á â ã ä å æ ç è é ê ë ì í î ï ij ð ñ ò ó ô õ ö ø œ ß þ ù ú û ü ý ÿ fi fl";

    // One entry per whitespace-separated token of the sample, in stream order.
    final String[] expectedTerms = {
      "Des", "mot", "cles", "A", "LA", "CHAINE",
      // upper-case block
      "A", "A", "A", "A", "A", "A",
      "AE",
      "C",
      "E", "E", "E", "E",
      "I", "I", "I", "I",
      "IJ",
      "D",
      "N",
      "O", "O", "O", "O", "O", "O",
      "OE",
      "TH",
      "U", "U", "U", "U",
      "Y", "Y",
      // lower-case block
      "a", "a", "a", "a", "a", "a",
      "ae",
      "c",
      "e", "e", "e", "e",
      "i", "i", "i", "i",
      "ij",
      "d",
      "n",
      "o", "o", "o", "o", "o", "o",
      "oe",
      "ss",
      "th",
      "u", "u", "u", "u",
      "y", "y",
      // ligatures
      "fi", "fl"
    };

    TokenStream stream = new WhitespaceTokenizer(new StringReader(sample));
    ISOLatin1AccentFilter filter = new ISOLatin1AccentFilter(stream);
    TermAttribute termAtt = filter.getAttribute(TermAttribute.class);

    for (String expected : expectedTerms) {
      assertTermEquals(expected, filter, termAtt);
    }

    // Nothing may be left once every expected term has been consumed.
    assertFalse(filter.incrementToken());
  }

  /**
   * Advances {@code stream} by one token and asserts that the term now
   * exposed through {@code termAtt} equals {@code expected}.
   */
  void assertTermEquals(String expected, TokenStream stream, TermAttribute termAtt) throws Exception {
    final boolean hasToken = stream.incrementToken();
    assertTrue(hasToken);
    assertEquals(expected, termAtt.term());
  }
}

View File

@ -1,4 +1,4 @@
package org.apache.lucene.analysis;
package org.apache.lucene.analysis.miscellaneous;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
@ -19,6 +19,8 @@ package org.apache.lucene.analysis;
import java.io.IOException;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.util.ArrayUtil;
import org.apache.lucene.util.RamUsageEstimator;

View File

@ -1,4 +1,4 @@
package org.apache.lucene.analysis;
package org.apache.lucene.analysis.miscellaneous;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
@ -17,6 +17,8 @@ package org.apache.lucene.analysis;
* limitations under the License.
*/
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
/**

View File

@ -1,4 +1,4 @@
package org.apache.lucene.analysis;
package org.apache.lucene.analysis.miscellaneous;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
@ -17,6 +17,9 @@ package org.apache.lucene.analysis;
* limitations under the License.
*/
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.WhitespaceTokenizer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import java.io.StringReader;
import java.util.List;

View File

@ -1,4 +1,4 @@
package org.apache.lucene.analysis;
package org.apache.lucene.analysis.miscellaneous;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
@ -17,6 +17,9 @@ package org.apache.lucene.analysis;
* limitations under the License.
*/
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.WhitespaceTokenizer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import java.io.StringReader;

View File

@ -17,14 +17,16 @@ package org.apache.lucene.queryParser.analyzing;
* limitations under the License.
*/
import java.io.IOException;
import java.io.Reader;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.ASCIIFoldingFilter;
import org.apache.lucene.analysis.LowerCaseFilter;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.standard.StandardFilter;
import org.apache.lucene.analysis.standard.StandardTokenizer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.queryParser.ParseException;
import org.apache.lucene.util.LuceneTestCase;
@ -105,6 +107,41 @@ public class TestAnalyzingQueryParser extends LuceneTestCase {
}
// TODO: Use a TestAnalyzer instead
/**
 * Small test-only folding filter: replaces a fixed handful of accented
 * characters in each term with a plain ASCII letter, in place.
 * <p>
 * Mappings: 'ü' and 'Ü' become 'u', 'ö' becomes 'o', 'é' becomes 'e' and
 * 'ï' becomes 'i'. Every other character is left untouched.
 */
final class TestFoldingFilter extends TokenFilter {
  final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);

  /**
   * @param input the token stream whose terms are folded
   */
  public TestFoldingFilter(TokenStream input) {
    super(input);
  }

  /**
   * Advances the wrapped stream and folds the current term in place.
   *
   * @return {@code true} if a token is available, {@code false} once the
   *         wrapped stream is exhausted
   */
  @Override
  public boolean incrementToken() throws IOException {
    if (!input.incrementToken()) {
      return false;
    }

    final char[] term = termAtt.buffer();
    // Only the first length() characters belong to the current term; the
    // backing array may be larger and hold stale data from earlier tokens,
    // so the loop is bounded by the term length, not the array capacity.
    final int length = termAtt.length();
    for (int i = 0; i < length; i++) {
      switch (term[i]) {
        case 'ü':
        case 'Ü':
          term[i] = 'u';
          break;
        case 'ö':
          term[i] = 'o';
          break;
        case 'é':
          term[i] = 'e';
          break;
        case 'ï':
          term[i] = 'i';
          break;
      }
    }
    return true;
  }
}
final class ASCIIAnalyzer extends org.apache.lucene.analysis.Analyzer {
public ASCIIAnalyzer() {
}
@ -113,7 +150,7 @@ final class ASCIIAnalyzer extends org.apache.lucene.analysis.Analyzer {
public TokenStream tokenStream(String fieldName, Reader reader) {
TokenStream result = new StandardTokenizer(LuceneTestCase.TEST_VERSION_CURRENT, reader);
result = new StandardFilter(result);
result = new ASCIIFoldingFilter(result);
result = new TestFoldingFilter(result);
result = new LowerCaseFilter(LuceneTestCase.TEST_VERSION_CURRENT, result);
return result;
}

View File

@ -18,7 +18,7 @@
package org.apache.solr.analysis;
import org.apache.lucene.analysis.ASCIIFoldingFilter;
import org.apache.lucene.analysis.miscellaneous.ASCIIFoldingFilter;
import org.apache.lucene.analysis.TokenStream;
/** Factory for {@link ASCIIFoldingFilter} */

View File

@ -17,7 +17,7 @@
package org.apache.solr.analysis;
import org.apache.lucene.analysis.ISOLatin1AccentFilter;
import org.apache.lucene.analysis.miscellaneous.ISOLatin1AccentFilter;
import org.apache.lucene.analysis.TokenStream;
/** Factory for ISOLatin1AccentFilter