mirror of https://github.com/apache/lucene.git
LUCENE-9929 NorwegianNormalizationFilter (#84)
This commit is contained in:
parent
6ebf959502
commit
7dd7077609
|
@ -225,6 +225,9 @@ Improvements
|
|||
|
||||
* LUCENE-9850: Switch to PFOR encoding for doc IDs (instead of FOR). (Greg Miller)
|
||||
|
||||
* LUCENE-9929: Add NorwegianNormalizationFilter, which does the same as ScandinavianNormalizationFilter except
|
||||
it does not fold oo->ø and ao->å. (janhoy, Robert Muir, Adrien Grand)
|
||||
|
||||
Bug fixes
|
||||
|
||||
|
||||
|
|
|
@ -16,11 +16,12 @@
|
|||
*/
|
||||
package org.apache.lucene.analysis.miscellaneous;
|
||||
|
||||
import static org.apache.lucene.analysis.miscellaneous.ScandinavianNormalizer.ALL_FOLDINGS;
|
||||
|
||||
import java.io.IOException;
|
||||
import org.apache.lucene.analysis.TokenFilter;
|
||||
import org.apache.lucene.analysis.TokenStream;
|
||||
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
|
||||
import org.apache.lucene.analysis.util.StemmerUtil;
|
||||
|
||||
/**
|
||||
* This filter normalize use of the interchangeable Scandinavian characters æÆäÄöÖøØ and folded
|
||||
|
@ -33,98 +34,29 @@ import org.apache.lucene.analysis.util.StemmerUtil;
|
|||
* <p>blåbærsyltetøj == blåbärsyltetöj == blaabaarsyltetoej but not blabarsyltetoj räksmörgås ==
|
||||
* ræksmørgås == ræksmörgaos == raeksmoergaas but not raksmorgas
|
||||
*
|
||||
* <p>There are also separate filters for Norwegian, Danish and Swedish with slightly differing
|
||||
* settings
|
||||
*
|
||||
* @see ScandinavianFoldingFilter
|
||||
*/
|
||||
public final class ScandinavianNormalizationFilter extends TokenFilter {
|
||||
|
||||
private final ScandinavianNormalizer normalizer;
|
||||
|
||||
public ScandinavianNormalizationFilter(TokenStream input) {
|
||||
super(input);
|
||||
this.normalizer = new ScandinavianNormalizer(ALL_FOLDINGS);
|
||||
}
|
||||
|
||||
private final CharTermAttribute charTermAttribute = addAttribute(CharTermAttribute.class);
|
||||
|
||||
private static final char AA = '\u00C5'; // Å
|
||||
private static final char aa = '\u00E5'; // å
|
||||
private static final char AE = '\u00C6'; // Æ
|
||||
private static final char ae = '\u00E6'; // æ
|
||||
private static final char AE_se = '\u00C4'; // Ä
|
||||
private static final char ae_se = '\u00E4'; // ä
|
||||
private static final char OE = '\u00D8'; // Ø
|
||||
private static final char oe = '\u00F8'; // ø
|
||||
private static final char OE_se = '\u00D6'; // Ö
|
||||
private static final char oe_se = '\u00F6'; // ö
|
||||
|
||||
@Override
|
||||
public boolean incrementToken() throws IOException {
|
||||
if (!input.incrementToken()) {
|
||||
return false;
|
||||
}
|
||||
|
||||
char[] buffer = charTermAttribute.buffer();
|
||||
int length = charTermAttribute.length();
|
||||
|
||||
int i;
|
||||
for (i = 0; i < length; i++) {
|
||||
|
||||
if (buffer[i] == ae_se) {
|
||||
buffer[i] = ae;
|
||||
|
||||
} else if (buffer[i] == AE_se) {
|
||||
buffer[i] = AE;
|
||||
|
||||
} else if (buffer[i] == oe_se) {
|
||||
buffer[i] = oe;
|
||||
|
||||
} else if (buffer[i] == OE_se) {
|
||||
buffer[i] = OE;
|
||||
|
||||
} else if (length - 1 > i) {
|
||||
|
||||
if (buffer[i] == 'a'
|
||||
&& (buffer[i + 1] == 'a'
|
||||
|| buffer[i + 1] == 'o'
|
||||
|| buffer[i + 1] == 'A'
|
||||
|| buffer[i + 1] == 'O')) {
|
||||
length = StemmerUtil.delete(buffer, i + 1, length);
|
||||
buffer[i] = aa;
|
||||
|
||||
} else if (buffer[i] == 'A'
|
||||
&& (buffer[i + 1] == 'a'
|
||||
|| buffer[i + 1] == 'A'
|
||||
|| buffer[i + 1] == 'o'
|
||||
|| buffer[i + 1] == 'O')) {
|
||||
length = StemmerUtil.delete(buffer, i + 1, length);
|
||||
buffer[i] = AA;
|
||||
|
||||
} else if (buffer[i] == 'a' && (buffer[i + 1] == 'e' || buffer[i + 1] == 'E')) {
|
||||
length = StemmerUtil.delete(buffer, i + 1, length);
|
||||
buffer[i] = ae;
|
||||
|
||||
} else if (buffer[i] == 'A' && (buffer[i + 1] == 'e' || buffer[i + 1] == 'E')) {
|
||||
length = StemmerUtil.delete(buffer, i + 1, length);
|
||||
buffer[i] = AE;
|
||||
|
||||
} else if (buffer[i] == 'o'
|
||||
&& (buffer[i + 1] == 'e'
|
||||
|| buffer[i + 1] == 'E'
|
||||
|| buffer[i + 1] == 'o'
|
||||
|| buffer[i + 1] == 'O')) {
|
||||
length = StemmerUtil.delete(buffer, i + 1, length);
|
||||
buffer[i] = oe;
|
||||
|
||||
} else if (buffer[i] == 'O'
|
||||
&& (buffer[i + 1] == 'e'
|
||||
|| buffer[i + 1] == 'E'
|
||||
|| buffer[i + 1] == 'o'
|
||||
|| buffer[i + 1] == 'O')) {
|
||||
length = StemmerUtil.delete(buffer, i + 1, length);
|
||||
buffer[i] = OE;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
charTermAttribute.setLength(length);
|
||||
|
||||
charTermAttribute.setLength(
|
||||
normalizer.processToken(charTermAttribute.buffer(), charTermAttribute.length()));
|
||||
return true;
|
||||
}
|
||||
}
|
||||
|
|
|
@ -0,0 +1,135 @@
|
|||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
package org.apache.lucene.analysis.miscellaneous;
|
||||
|
||||
import java.util.EnumSet;
|
||||
import java.util.Set;
|
||||
import org.apache.lucene.analysis.util.StemmerUtil;
|
||||
|
||||
/**
|
||||
* This Normalizer does the heavy lifting for a set of Scandinavian normalization filters,
|
||||
* normalizing use of the interchangeable Scandinavian characters æÆäÄöÖøØ and folded variants (aa,
|
||||
* ao, ae, oe and oo) by transforming them to åÅæÆøØ.
|
||||
*
|
||||
* @since 9.0
|
||||
* @lucene.internal
|
||||
*/
|
||||
public final class ScandinavianNormalizer {
|
||||
|
||||
/**
|
||||
* Create the instance, while choosing which foldings to apply. This may differ between Norwegian,
|
||||
* Danish and Swedish.
|
||||
*
|
||||
* @param foldings a Set of Foldings to apply (i.e. AE, OE, AA, AO, OO)
|
||||
*/
|
||||
public ScandinavianNormalizer(Set<Foldings> foldings) {
|
||||
this.foldings = foldings;
|
||||
}
|
||||
|
||||
/** List of possible foldings that can be used when configuring the filter */
|
||||
public enum Foldings {
|
||||
AA,
|
||||
AO,
|
||||
AE,
|
||||
OE,
|
||||
OO
|
||||
}
|
||||
|
||||
private final Set<Foldings> foldings;
|
||||
|
||||
public static final Set<Foldings> ALL_FOLDINGS = EnumSet.allOf(Foldings.class);
|
||||
|
||||
static final char AA = '\u00C5'; // Å
|
||||
static final char aa = '\u00E5'; // å
|
||||
static final char AE = '\u00C6'; // Æ
|
||||
static final char ae = '\u00E6'; // æ
|
||||
static final char AE_se = '\u00C4'; // Ä
|
||||
static final char ae_se = '\u00E4'; // ä
|
||||
static final char OE = '\u00D8'; // Ø
|
||||
static final char oe = '\u00F8'; // ø
|
||||
static final char OE_se = '\u00D6'; // Ö
|
||||
static final char oe_se = '\u00F6'; // ö
|
||||
|
||||
/**
|
||||
* Takes the original buffer and length as input. Modifies the buffer in-place and returns new
|
||||
* length
|
||||
*
|
||||
* @return new length
|
||||
*/
|
||||
public int processToken(char[] buffer, int length) {
|
||||
int i;
|
||||
for (i = 0; i < length; i++) {
|
||||
|
||||
if (buffer[i] == ae_se) {
|
||||
buffer[i] = ae;
|
||||
|
||||
} else if (buffer[i] == AE_se) {
|
||||
buffer[i] = AE;
|
||||
|
||||
} else if (buffer[i] == oe_se) {
|
||||
buffer[i] = oe;
|
||||
|
||||
} else if (buffer[i] == OE_se) {
|
||||
buffer[i] = OE;
|
||||
|
||||
} else if (length - 1 > i) {
|
||||
|
||||
if (buffer[i] == 'a'
|
||||
&& (foldings.contains(Foldings.AA) && (buffer[i + 1] == 'a' || buffer[i + 1] == 'A')
|
||||
|| foldings.contains(Foldings.AO)
|
||||
&& (buffer[i + 1] == 'o' || buffer[i + 1] == 'O'))) {
|
||||
length = StemmerUtil.delete(buffer, i + 1, length);
|
||||
buffer[i] = aa;
|
||||
|
||||
} else if (buffer[i] == 'A'
|
||||
&& (foldings.contains(Foldings.AA) && (buffer[i + 1] == 'a' || buffer[i + 1] == 'A')
|
||||
|| foldings.contains(Foldings.AO)
|
||||
&& (buffer[i + 1] == 'o' || buffer[i + 1] == 'O'))) {
|
||||
length = StemmerUtil.delete(buffer, i + 1, length);
|
||||
buffer[i] = AA;
|
||||
|
||||
} else if (buffer[i] == 'a'
|
||||
&& foldings.contains(Foldings.AE)
|
||||
&& (buffer[i + 1] == 'e' || buffer[i + 1] == 'E')) {
|
||||
length = StemmerUtil.delete(buffer, i + 1, length);
|
||||
buffer[i] = ae;
|
||||
|
||||
} else if (buffer[i] == 'A'
|
||||
&& foldings.contains(Foldings.AE)
|
||||
&& (buffer[i + 1] == 'e' || buffer[i + 1] == 'E')) {
|
||||
length = StemmerUtil.delete(buffer, i + 1, length);
|
||||
buffer[i] = AE;
|
||||
|
||||
} else if (buffer[i] == 'o'
|
||||
&& (foldings.contains(Foldings.OE) && (buffer[i + 1] == 'e' || buffer[i + 1] == 'E')
|
||||
|| foldings.contains(Foldings.OO)
|
||||
&& (buffer[i + 1] == 'o' || buffer[i + 1] == 'O'))) {
|
||||
length = StemmerUtil.delete(buffer, i + 1, length);
|
||||
buffer[i] = oe;
|
||||
|
||||
} else if (buffer[i] == 'O'
|
||||
&& (foldings.contains(Foldings.OE) && (buffer[i + 1] == 'e' || buffer[i + 1] == 'E')
|
||||
|| foldings.contains(Foldings.OO)
|
||||
&& (buffer[i + 1] == 'o' || buffer[i + 1] == 'O'))) {
|
||||
length = StemmerUtil.delete(buffer, i + 1, length);
|
||||
buffer[i] = OE;
|
||||
}
|
||||
}
|
||||
}
|
||||
return length;
|
||||
}
|
||||
}
|
|
@ -0,0 +1,56 @@
|
|||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
package org.apache.lucene.analysis.no;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.util.EnumSet;
|
||||
import org.apache.lucene.analysis.TokenFilter;
|
||||
import org.apache.lucene.analysis.TokenStream;
|
||||
import org.apache.lucene.analysis.miscellaneous.ScandinavianNormalizationFilter;
|
||||
import org.apache.lucene.analysis.miscellaneous.ScandinavianNormalizer;
|
||||
import org.apache.lucene.analysis.miscellaneous.ScandinavianNormalizer.Foldings;
|
||||
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
|
||||
|
||||
/**
|
||||
* This filter normalize use of the interchangeable Scandinavian characters æÆäÄöÖøØ and folded
|
||||
* variants (ae, oe, aa) by transforming them to åÅæÆøØ. This is similar to
|
||||
* ScandinavianNormalizationFilter, except for the folding rules customized for Norwegian.
|
||||
*
|
||||
* <p>blåbærsyltetøj == blåbärsyltetöj == blaabaersyltetoej
|
||||
*
|
||||
* @see ScandinavianNormalizationFilter
|
||||
*/
|
||||
public final class NorwegianNormalizationFilter extends TokenFilter {
|
||||
private final ScandinavianNormalizer normalizer;
|
||||
|
||||
public NorwegianNormalizationFilter(TokenStream input) {
|
||||
super(input);
|
||||
this.normalizer = new ScandinavianNormalizer(EnumSet.of(Foldings.AE, Foldings.OE, Foldings.AA));
|
||||
}
|
||||
|
||||
private final CharTermAttribute charTermAttribute = addAttribute(CharTermAttribute.class);
|
||||
|
||||
@Override
|
||||
public final boolean incrementToken() throws IOException {
|
||||
if (!input.incrementToken()) {
|
||||
return false;
|
||||
}
|
||||
charTermAttribute.setLength(
|
||||
normalizer.processToken(charTermAttribute.buffer(), charTermAttribute.length()));
|
||||
return true;
|
||||
}
|
||||
}
|
|
@ -0,0 +1,54 @@
|
|||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
package org.apache.lucene.analysis.no;
|
||||
|
||||
import java.util.Map;
|
||||
import org.apache.lucene.analysis.TokenFilterFactory;
|
||||
import org.apache.lucene.analysis.TokenStream;
|
||||
|
||||
/**
|
||||
* Factory for {@link NorwegianNormalizationFilter}.
|
||||
*
|
||||
* @lucene.spi {@value #NAME}
|
||||
*/
|
||||
public class NorwegianNormalizationFilterFactory extends TokenFilterFactory {
|
||||
|
||||
/** SPI name */
|
||||
public static final String NAME = "norwegianNormalization";
|
||||
|
||||
public NorwegianNormalizationFilterFactory(Map<String, String> args) {
|
||||
super(args);
|
||||
if (!args.isEmpty()) {
|
||||
throw new IllegalArgumentException("Unknown parameters: " + args);
|
||||
}
|
||||
}
|
||||
|
||||
/** Default ctor for compatibility with SPI */
|
||||
public NorwegianNormalizationFilterFactory() {
|
||||
throw defaultCtorException();
|
||||
}
|
||||
|
||||
@Override
|
||||
public NorwegianNormalizationFilter create(TokenStream input) {
|
||||
return new NorwegianNormalizationFilter(input);
|
||||
}
|
||||
|
||||
@Override
|
||||
public TokenStream normalize(TokenStream input) {
|
||||
return create(input);
|
||||
}
|
||||
}
|
|
@ -95,6 +95,7 @@ org.apache.lucene.analysis.ngram.EdgeNGramFilterFactory
|
|||
org.apache.lucene.analysis.ngram.NGramFilterFactory
|
||||
org.apache.lucene.analysis.no.NorwegianLightStemFilterFactory
|
||||
org.apache.lucene.analysis.no.NorwegianMinimalStemFilterFactory
|
||||
org.apache.lucene.analysis.no.NorwegianNormalizationFilterFactory
|
||||
org.apache.lucene.analysis.pattern.PatternReplaceFilterFactory
|
||||
org.apache.lucene.analysis.pattern.PatternCaptureGroupFilterFactory
|
||||
org.apache.lucene.analysis.pattern.PatternTypingFilterFactory
|
||||
|
|
|
@ -16,37 +16,12 @@
|
|||
*/
|
||||
package org.apache.lucene.analysis.miscellaneous;
|
||||
|
||||
import org.apache.lucene.analysis.Analyzer;
|
||||
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
|
||||
import org.apache.lucene.analysis.MockTokenizer;
|
||||
import org.apache.lucene.analysis.TokenStream;
|
||||
import org.apache.lucene.analysis.Tokenizer;
|
||||
import org.apache.lucene.analysis.*;
|
||||
import org.apache.lucene.analysis.core.KeywordTokenizer;
|
||||
|
||||
public class TestScandinavianNormalizationFilter extends BaseTokenStreamTestCase {
|
||||
private Analyzer analyzer;
|
||||
|
||||
@Override
|
||||
public void setUp() throws Exception {
|
||||
super.setUp();
|
||||
analyzer =
|
||||
new Analyzer() {
|
||||
@Override
|
||||
protected TokenStreamComponents createComponents(String field) {
|
||||
final Tokenizer tokenizer = new MockTokenizer(MockTokenizer.WHITESPACE, false);
|
||||
final TokenStream stream = new ScandinavianNormalizationFilter(tokenizer);
|
||||
return new TokenStreamComponents(tokenizer, stream);
|
||||
}
|
||||
};
|
||||
}
|
||||
|
||||
@Override
|
||||
public void tearDown() throws Exception {
|
||||
analyzer.close();
|
||||
super.tearDown();
|
||||
}
|
||||
|
||||
public void test() throws Exception {
|
||||
public void testDefault() throws Exception {
|
||||
Analyzer analyzer = createAnalyzer();
|
||||
|
||||
checkOneTerm(analyzer, "aeäaeeea", "æææeea"); // should not cause ArrayIndexOutOfBoundsException
|
||||
|
||||
|
@ -107,6 +82,7 @@ public class TestScandinavianNormalizationFilter extends BaseTokenStreamTestCase
|
|||
checkOneTerm(analyzer, "Oe", "Ø");
|
||||
checkOneTerm(analyzer, "OO", "Ø");
|
||||
checkOneTerm(analyzer, "OE", "Ø");
|
||||
analyzer.close();
|
||||
}
|
||||
|
||||
/** check that the empty string doesn't cause issues */
|
||||
|
@ -126,6 +102,19 @@ public class TestScandinavianNormalizationFilter extends BaseTokenStreamTestCase
|
|||
|
||||
/** blast some random strings through the analyzer */
|
||||
public void testRandomData() throws Exception {
|
||||
Analyzer analyzer = createAnalyzer();
|
||||
checkRandomData(random(), analyzer, 200 * RANDOM_MULTIPLIER);
|
||||
analyzer.close();
|
||||
}
|
||||
|
||||
private Analyzer createAnalyzer() {
|
||||
return new Analyzer() {
|
||||
@Override
|
||||
protected TokenStreamComponents createComponents(String field) {
|
||||
final Tokenizer tokenizer = new MockTokenizer(MockTokenizer.WHITESPACE, false);
|
||||
final TokenStream stream = new ScandinavianNormalizationFilter(tokenizer);
|
||||
return new TokenStreamComponents(tokenizer, stream);
|
||||
}
|
||||
};
|
||||
}
|
||||
}
|
||||
|
|
|
@ -21,10 +21,10 @@ import org.apache.lucene.analysis.TokenStream;
|
|||
|
||||
public class TestScandinavianNormalizationFilterFactory extends BaseTokenStreamFactoryTestCase {
|
||||
|
||||
public void testStemming() throws Exception {
|
||||
TokenStream stream = whitespaceMockTokenizer("räksmörgås");
|
||||
public void testDefault() throws Exception {
|
||||
TokenStream stream = whitespaceMockTokenizer("räksmörgås_ae_oe_aa_oo_ao_AE_OE_AA_OO_AO");
|
||||
stream = tokenFilterFactory("ScandinavianNormalization").create(stream);
|
||||
assertTokenStreamContents(stream, new String[] {"ræksmørgås"});
|
||||
assertTokenStreamContents(stream, new String[] {"ræksmørgås_æ_ø_å_ø_å_Æ_Ø_Å_Ø_Å"});
|
||||
}
|
||||
|
||||
/** Test that bogus arguments result in exception */
|
||||
|
@ -35,6 +35,7 @@ public class TestScandinavianNormalizationFilterFactory extends BaseTokenStreamF
|
|||
() -> {
|
||||
tokenFilterFactory("ScandinavianNormalization", "bogusArg", "bogusValue");
|
||||
});
|
||||
assertTrue(expected.getMessage().contains("Unknown parameters"));
|
||||
assertTrue(
|
||||
"Got " + expected.getMessage(), expected.getMessage().contains("Unknown parameters"));
|
||||
}
|
||||
}
|
||||
|
|
|
@ -0,0 +1,130 @@
|
|||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
package org.apache.lucene.analysis.miscellaneous;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.util.Collections;
|
||||
import java.util.Set;
|
||||
import org.apache.lucene.analysis.*;
|
||||
import org.apache.lucene.analysis.miscellaneous.ScandinavianNormalizer.Foldings;
|
||||
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
|
||||
|
||||
/** Tests low level the normalizer functionality */
public class TestScandinavianNormalizer extends BaseTokenStreamTestCase {

  // With no foldings enabled, every two-character digraph must pass through unchanged.
  public void testNoFoldings() throws Exception {
    Analyzer analyzer = createAnalyzer(Collections.emptySet());
    checkOneTerm(analyzer, "aa", "aa");
    checkOneTerm(analyzer, "ao", "ao");
    checkOneTerm(analyzer, "ae", "ae");
    checkOneTerm(analyzer, "oo", "oo");
    checkOneTerm(analyzer, "oe", "oe");
    analyzer.close();
  }

  // AE alone folds ae->æ (case-preserving on the first char); all other digraphs untouched.
  public void testAeFolding() throws Exception {
    Analyzer analyzer = createAnalyzer(Set.of(Foldings.AE));
    checkOneTerm(analyzer, "aa", "aa");
    checkOneTerm(analyzer, "ao", "ao");
    checkOneTerm(analyzer, "ae", "æ");
    checkOneTerm(analyzer, "aE", "æ");
    checkOneTerm(analyzer, "Ae", "Æ");
    checkOneTerm(analyzer, "AE", "Æ");
    checkOneTerm(analyzer, "oo", "oo");
    checkOneTerm(analyzer, "oe", "oe");
    analyzer.close();
  }

  // AA alone folds aa->å; the second character's case is ignored, the first decides å vs Å.
  public void testAaFolding() throws Exception {
    Analyzer analyzer = createAnalyzer(Set.of(Foldings.AA));
    checkOneTerm(analyzer, "aa", "å");
    checkOneTerm(analyzer, "aA", "å");
    checkOneTerm(analyzer, "Aa", "Å");
    checkOneTerm(analyzer, "AA", "Å");
    checkOneTerm(analyzer, "ao", "ao");
    checkOneTerm(analyzer, "ae", "ae");
    checkOneTerm(analyzer, "oo", "oo");
    checkOneTerm(analyzer, "oe", "oe");
    analyzer.close();
  }

  // OE alone folds oe->ø; aa/ao/ae/oo are left alone.
  public void testOeFolding() throws Exception {
    Analyzer analyzer = createAnalyzer(Set.of(Foldings.OE));
    checkOneTerm(analyzer, "aa", "aa");
    checkOneTerm(analyzer, "ao", "ao");
    checkOneTerm(analyzer, "ae", "ae");
    checkOneTerm(analyzer, "oo", "oo");
    checkOneTerm(analyzer, "oe", "ø");
    checkOneTerm(analyzer, "oE", "ø");
    checkOneTerm(analyzer, "Oe", "Ø");
    checkOneTerm(analyzer, "OE", "Ø");
    analyzer.close();
  }

  // OO alone folds oo->ø (the folding NorwegianNormalizationFilter deliberately omits).
  public void testOoFolding() throws Exception {
    Analyzer analyzer = createAnalyzer(Set.of(Foldings.OO));
    checkOneTerm(analyzer, "aa", "aa");
    checkOneTerm(analyzer, "ao", "ao");
    checkOneTerm(analyzer, "ae", "ae");
    checkOneTerm(analyzer, "oo", "ø");
    checkOneTerm(analyzer, "oO", "ø");
    checkOneTerm(analyzer, "Oo", "Ø");
    checkOneTerm(analyzer, "OO", "Ø");
    checkOneTerm(analyzer, "oe", "oe");
    analyzer.close();
  }

  // AO alone folds ao->å (the other folding omitted by the Norwegian filter).
  public void testAoFolding() throws Exception {
    Analyzer analyzer = createAnalyzer(Set.of(Foldings.AO));
    checkOneTerm(analyzer, "aa", "aa");
    checkOneTerm(analyzer, "ao", "å");
    checkOneTerm(analyzer, "aO", "å");
    checkOneTerm(analyzer, "Ao", "Å");
    checkOneTerm(analyzer, "AO", "Å");
    checkOneTerm(analyzer, "ae", "ae");
    checkOneTerm(analyzer, "oo", "oo");
    checkOneTerm(analyzer, "oe", "oe");
    analyzer.close();
  }

  // Builds a minimal analyzer wrapping ScandinavianNormalizer in an anonymous TokenFilter,
  // so each test exercises processToken directly with its chosen folding set.
  private Analyzer createAnalyzer(Set<Foldings> foldings) {
    return new Analyzer() {
      @Override
      protected TokenStreamComponents createComponents(String field) {
        final Tokenizer tokenizer = new MockTokenizer(MockTokenizer.WHITESPACE, false);
        final TokenStream stream =
            new TokenFilter(tokenizer) {
              private final CharTermAttribute charTermAttribute =
                  addAttribute(CharTermAttribute.class);
              private final ScandinavianNormalizer normalizer =
                  new ScandinavianNormalizer(foldings);

              @Override
              public boolean incrementToken() throws IOException {
                if (!input.incrementToken()) {
                  return false;
                }
                // Fold in-place and truncate the term to the normalizer's returned length.
                charTermAttribute.setLength(
                    normalizer.processToken(
                        charTermAttribute.buffer(), charTermAttribute.length()));
                return true;
              }
            };
        return new TokenStreamComponents(tokenizer, stream);
      }
    };
  }
}
|
|
@ -0,0 +1,120 @@
|
|||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
package org.apache.lucene.analysis.no;
|
||||
|
||||
import org.apache.lucene.analysis.*;
|
||||
import org.apache.lucene.analysis.core.KeywordTokenizer;
|
||||
import org.apache.lucene.analysis.miscellaneous.ScandinavianNormalizationFilter;
|
||||
|
||||
public class TestNorwegianNormalizationFilter extends BaseTokenStreamTestCase {
|
||||
public void testDefault() throws Exception {
|
||||
Analyzer analyzer = createAnalyzer();
|
||||
|
||||
checkOneTerm(analyzer, "aeäaeeea", "æææeea"); // should not cause ArrayIndexOutOfBoundsException
|
||||
|
||||
checkOneTerm(analyzer, "aeäaeeeae", "æææeeæ");
|
||||
checkOneTerm(analyzer, "aeaeeeae", "ææeeæ");
|
||||
|
||||
checkOneTerm(analyzer, "bøen", "bøen");
|
||||
checkOneTerm(analyzer, "bOEen", "bØen");
|
||||
checkOneTerm(analyzer, "åene", "åene");
|
||||
|
||||
checkOneTerm(analyzer, "blåbærsyltetøj", "blåbærsyltetøj");
|
||||
checkOneTerm(analyzer, "blaabaersyltetöj", "blåbærsyltetøj");
|
||||
checkOneTerm(analyzer, "räksmörgås", "ræksmørgås");
|
||||
checkOneTerm(analyzer, "raeksmörgaas", "ræksmørgås");
|
||||
checkOneTerm(analyzer, "raeksmoergås", "ræksmørgås");
|
||||
|
||||
checkOneTerm(analyzer, "ab", "ab");
|
||||
checkOneTerm(analyzer, "ob", "ob");
|
||||
checkOneTerm(analyzer, "Ab", "Ab");
|
||||
checkOneTerm(analyzer, "Ob", "Ob");
|
||||
|
||||
checkOneTerm(analyzer, "å", "å");
|
||||
|
||||
checkOneTerm(analyzer, "aa", "å");
|
||||
checkOneTerm(analyzer, "aA", "å");
|
||||
checkOneTerm(analyzer, "ao", "ao");
|
||||
checkOneTerm(analyzer, "aO", "aO");
|
||||
|
||||
checkOneTerm(analyzer, "AA", "Å");
|
||||
checkOneTerm(analyzer, "Aa", "Å");
|
||||
checkOneTerm(analyzer, "Ao", "Ao");
|
||||
checkOneTerm(analyzer, "AO", "AO");
|
||||
|
||||
checkOneTerm(analyzer, "æ", "æ");
|
||||
checkOneTerm(analyzer, "ä", "æ");
|
||||
|
||||
checkOneTerm(analyzer, "Æ", "Æ");
|
||||
checkOneTerm(analyzer, "Ä", "Æ");
|
||||
|
||||
checkOneTerm(analyzer, "ae", "æ");
|
||||
checkOneTerm(analyzer, "aE", "æ");
|
||||
|
||||
checkOneTerm(analyzer, "Ae", "Æ");
|
||||
checkOneTerm(analyzer, "AE", "Æ");
|
||||
|
||||
checkOneTerm(analyzer, "ö", "ø");
|
||||
checkOneTerm(analyzer, "ø", "ø");
|
||||
checkOneTerm(analyzer, "Ö", "Ø");
|
||||
checkOneTerm(analyzer, "Ø", "Ø");
|
||||
|
||||
checkOneTerm(analyzer, "oo", "oo");
|
||||
checkOneTerm(analyzer, "oe", "ø");
|
||||
checkOneTerm(analyzer, "oO", "oO");
|
||||
checkOneTerm(analyzer, "oE", "ø");
|
||||
|
||||
checkOneTerm(analyzer, "Oo", "Oo");
|
||||
checkOneTerm(analyzer, "Oe", "Ø");
|
||||
checkOneTerm(analyzer, "OO", "OO");
|
||||
checkOneTerm(analyzer, "OE", "Ø");
|
||||
analyzer.close();
|
||||
}
|
||||
|
||||
/** check that the empty string doesn't cause issues */
|
||||
public void testEmptyTerm() throws Exception {
|
||||
Analyzer a =
|
||||
new Analyzer() {
|
||||
@Override
|
||||
protected TokenStreamComponents createComponents(String fieldName) {
|
||||
Tokenizer tokenizer = new KeywordTokenizer();
|
||||
return new TokenStreamComponents(
|
||||
tokenizer, new ScandinavianNormalizationFilter(tokenizer));
|
||||
}
|
||||
};
|
||||
checkOneTerm(a, "", "");
|
||||
a.close();
|
||||
}
|
||||
|
||||
/** blast some random strings through the analyzer */
|
||||
public void testRandomData() throws Exception {
|
||||
Analyzer analyzer = createAnalyzer();
|
||||
checkRandomData(random(), analyzer, 200 * RANDOM_MULTIPLIER);
|
||||
analyzer.close();
|
||||
}
|
||||
|
||||
private Analyzer createAnalyzer() {
|
||||
return new Analyzer() {
|
||||
@Override
|
||||
protected TokenStreamComponents createComponents(String field) {
|
||||
final Tokenizer tokenizer = new MockTokenizer(MockTokenizer.WHITESPACE, false);
|
||||
final TokenStream stream = new NorwegianNormalizationFilter(tokenizer);
|
||||
return new TokenStreamComponents(tokenizer, stream);
|
||||
}
|
||||
};
|
||||
}
|
||||
}
|
|
@ -0,0 +1,41 @@
|
|||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
package org.apache.lucene.analysis.no;
|
||||
|
||||
import org.apache.lucene.analysis.BaseTokenStreamFactoryTestCase;
|
||||
import org.apache.lucene.analysis.TokenStream;
|
||||
|
||||
public class TestNorwegianNormalizationFilterFactory extends BaseTokenStreamFactoryTestCase {
|
||||
|
||||
public void testDefault() throws Exception {
|
||||
TokenStream stream = whitespaceMockTokenizer("räksmörgås_ae_oe_aa_oo_ao_AE_OE_AA_OO_AO");
|
||||
stream = tokenFilterFactory("NorwegianNormalization").create(stream);
|
||||
assertTokenStreamContents(stream, new String[] {"ræksmørgås_æ_ø_å_oo_ao_Æ_Ø_Å_OO_AO"});
|
||||
}
|
||||
|
||||
/** Test that bogus arguments result in exception */
|
||||
public void testBogusArguments() throws Exception {
|
||||
IllegalArgumentException expected =
|
||||
expectThrows(
|
||||
IllegalArgumentException.class,
|
||||
() -> {
|
||||
tokenFilterFactory("NorwegianNormalization", "bogusArg", "bogusValue");
|
||||
});
|
||||
assertTrue(
|
||||
"Got " + expected.getMessage(), expected.getMessage().contains("Unknown parameters"));
|
||||
}
|
||||
}
|
Loading…
Reference in New Issue