LUCENE-9929 NorwegianNormalizationFilter (#84)

This commit is contained in:
Jan Høydahl 2021-05-12 14:31:26 +02:00 committed by GitHub
parent 6ebf959502
commit 7dd7077609
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
11 changed files with 572 additions and 110 deletions

View File

@ -225,6 +225,9 @@ Improvements
* LUCENE-9850: Switch to PFOR encoding for doc IDs (instead of FOR). (Greg Miller) * LUCENE-9850: Switch to PFOR encoding for doc IDs (instead of FOR). (Greg Miller)
* LUCENE-9929: Add NorwegianNormalizationFilter, which does the same as ScandinavianNormalizationFilter except
it does not fold oo->ø and ao->å. (janhoy, Robert Muir, Adrien Grand)
Bug fixes Bug fixes

View File

@ -16,11 +16,12 @@
*/ */
package org.apache.lucene.analysis.miscellaneous; package org.apache.lucene.analysis.miscellaneous;
import static org.apache.lucene.analysis.miscellaneous.ScandinavianNormalizer.ALL_FOLDINGS;
import java.io.IOException; import java.io.IOException;
import org.apache.lucene.analysis.TokenFilter; import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream; import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.util.StemmerUtil;
/** /**
* This filter normalize use of the interchangeable Scandinavian characters æÆäÄöÖøØ and folded * This filter normalize use of the interchangeable Scandinavian characters æÆäÄöÖøØ and folded
@ -33,98 +34,29 @@ import org.apache.lucene.analysis.util.StemmerUtil;
* <p>blåbærsyltetøj == blåbärsyltetöj == blaabaarsyltetoej but not blabarsyltetoj räksmörgås == * <p>blåbærsyltetøj == blåbärsyltetöj == blaabaarsyltetoej but not blabarsyltetoj räksmörgås ==
* ræksmørgås == ræksmörgaos == raeksmoergaas but not raksmorgas * ræksmørgås == ræksmörgaos == raeksmoergaas but not raksmorgas
* *
* <p>There are also separate filters for Norwegian, Danish and Swedish with slightly differing
* settings
*
* @see ScandinavianFoldingFilter * @see ScandinavianFoldingFilter
*/ */
public final class ScandinavianNormalizationFilter extends TokenFilter { public final class ScandinavianNormalizationFilter extends TokenFilter {
private final ScandinavianNormalizer normalizer;
public ScandinavianNormalizationFilter(TokenStream input) { public ScandinavianNormalizationFilter(TokenStream input) {
super(input); super(input);
this.normalizer = new ScandinavianNormalizer(ALL_FOLDINGS);
} }
private final CharTermAttribute charTermAttribute = addAttribute(CharTermAttribute.class); private final CharTermAttribute charTermAttribute = addAttribute(CharTermAttribute.class);
private static final char AA = '\u00C5'; // Å
private static final char aa = '\u00E5'; // å
private static final char AE = '\u00C6'; // Æ
private static final char ae = '\u00E6'; // æ
private static final char AE_se = '\u00C4'; // Ä
private static final char ae_se = '\u00E4'; // ä
private static final char OE = '\u00D8'; // Ø
private static final char oe = '\u00F8'; // ø
private static final char OE_se = '\u00D6'; // Ö
private static final char oe_se = '\u00F6'; // ö
@Override @Override
public boolean incrementToken() throws IOException { public boolean incrementToken() throws IOException {
if (!input.incrementToken()) { if (!input.incrementToken()) {
return false; return false;
} }
charTermAttribute.setLength(
char[] buffer = charTermAttribute.buffer(); normalizer.processToken(charTermAttribute.buffer(), charTermAttribute.length()));
int length = charTermAttribute.length();
int i;
for (i = 0; i < length; i++) {
if (buffer[i] == ae_se) {
buffer[i] = ae;
} else if (buffer[i] == AE_se) {
buffer[i] = AE;
} else if (buffer[i] == oe_se) {
buffer[i] = oe;
} else if (buffer[i] == OE_se) {
buffer[i] = OE;
} else if (length - 1 > i) {
if (buffer[i] == 'a'
&& (buffer[i + 1] == 'a'
|| buffer[i + 1] == 'o'
|| buffer[i + 1] == 'A'
|| buffer[i + 1] == 'O')) {
length = StemmerUtil.delete(buffer, i + 1, length);
buffer[i] = aa;
} else if (buffer[i] == 'A'
&& (buffer[i + 1] == 'a'
|| buffer[i + 1] == 'A'
|| buffer[i + 1] == 'o'
|| buffer[i + 1] == 'O')) {
length = StemmerUtil.delete(buffer, i + 1, length);
buffer[i] = AA;
} else if (buffer[i] == 'a' && (buffer[i + 1] == 'e' || buffer[i + 1] == 'E')) {
length = StemmerUtil.delete(buffer, i + 1, length);
buffer[i] = ae;
} else if (buffer[i] == 'A' && (buffer[i + 1] == 'e' || buffer[i + 1] == 'E')) {
length = StemmerUtil.delete(buffer, i + 1, length);
buffer[i] = AE;
} else if (buffer[i] == 'o'
&& (buffer[i + 1] == 'e'
|| buffer[i + 1] == 'E'
|| buffer[i + 1] == 'o'
|| buffer[i + 1] == 'O')) {
length = StemmerUtil.delete(buffer, i + 1, length);
buffer[i] = oe;
} else if (buffer[i] == 'O'
&& (buffer[i + 1] == 'e'
|| buffer[i + 1] == 'E'
|| buffer[i + 1] == 'o'
|| buffer[i + 1] == 'O')) {
length = StemmerUtil.delete(buffer, i + 1, length);
buffer[i] = OE;
}
}
}
charTermAttribute.setLength(length);
return true; return true;
} }
} }

View File

@ -0,0 +1,135 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.analysis.miscellaneous;
import java.util.EnumSet;
import java.util.Set;
import org.apache.lucene.analysis.util.StemmerUtil;
/**
* This Normalizer does the heavy lifting for a set of Scandinavian normalization filters,
* normalizing use of the interchangeable Scandinavian characters æÆäÄöÖøØ and folded variants (aa,
* ao, ae, oe and oo) by transforming them to åÅæÆøØ.
*
* @since 9.0
* @lucene.internal
*/
public final class ScandinavianNormalizer {
/**
* Create the instance, while choosing which foldings to apply. This may differ between Norwegian,
* Danish and Swedish.
*
* @param foldings a Set of Foldings to apply (i.e. AE, OE, AA, AO, OO)
*/
public ScandinavianNormalizer(Set<Foldings> foldings) {
this.foldings = foldings;
}
/** List of possible foldings that can be used when configuring the filter */
public enum Foldings {
AA,
AO,
AE,
OE,
OO
}
private final Set<Foldings> foldings;
public static final Set<Foldings> ALL_FOLDINGS = EnumSet.allOf(Foldings.class);
static final char AA = '\u00C5'; // Å
static final char aa = '\u00E5'; // å
static final char AE = '\u00C6'; // Æ
static final char ae = '\u00E6'; // æ
static final char AE_se = '\u00C4'; // Ä
static final char ae_se = '\u00E4'; // ä
static final char OE = '\u00D8'; // Ø
static final char oe = '\u00F8'; // ø
static final char OE_se = '\u00D6'; // Ö
static final char oe_se = '\u00F6'; // ö
/**
* Takes the original buffer and length as input. Modifies the buffer in-place and returns new
* length
*
* @return new length
*/
public int processToken(char[] buffer, int length) {
int i;
for (i = 0; i < length; i++) {
if (buffer[i] == ae_se) {
buffer[i] = ae;
} else if (buffer[i] == AE_se) {
buffer[i] = AE;
} else if (buffer[i] == oe_se) {
buffer[i] = oe;
} else if (buffer[i] == OE_se) {
buffer[i] = OE;
} else if (length - 1 > i) {
if (buffer[i] == 'a'
&& (foldings.contains(Foldings.AA) && (buffer[i + 1] == 'a' || buffer[i + 1] == 'A')
|| foldings.contains(Foldings.AO)
&& (buffer[i + 1] == 'o' || buffer[i + 1] == 'O'))) {
length = StemmerUtil.delete(buffer, i + 1, length);
buffer[i] = aa;
} else if (buffer[i] == 'A'
&& (foldings.contains(Foldings.AA) && (buffer[i + 1] == 'a' || buffer[i + 1] == 'A')
|| foldings.contains(Foldings.AO)
&& (buffer[i + 1] == 'o' || buffer[i + 1] == 'O'))) {
length = StemmerUtil.delete(buffer, i + 1, length);
buffer[i] = AA;
} else if (buffer[i] == 'a'
&& foldings.contains(Foldings.AE)
&& (buffer[i + 1] == 'e' || buffer[i + 1] == 'E')) {
length = StemmerUtil.delete(buffer, i + 1, length);
buffer[i] = ae;
} else if (buffer[i] == 'A'
&& foldings.contains(Foldings.AE)
&& (buffer[i + 1] == 'e' || buffer[i + 1] == 'E')) {
length = StemmerUtil.delete(buffer, i + 1, length);
buffer[i] = AE;
} else if (buffer[i] == 'o'
&& (foldings.contains(Foldings.OE) && (buffer[i + 1] == 'e' || buffer[i + 1] == 'E')
|| foldings.contains(Foldings.OO)
&& (buffer[i + 1] == 'o' || buffer[i + 1] == 'O'))) {
length = StemmerUtil.delete(buffer, i + 1, length);
buffer[i] = oe;
} else if (buffer[i] == 'O'
&& (foldings.contains(Foldings.OE) && (buffer[i + 1] == 'e' || buffer[i + 1] == 'E')
|| foldings.contains(Foldings.OO)
&& (buffer[i + 1] == 'o' || buffer[i + 1] == 'O'))) {
length = StemmerUtil.delete(buffer, i + 1, length);
buffer[i] = OE;
}
}
}
return length;
}
}

View File

@ -0,0 +1,56 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.analysis.no;
import java.io.IOException;
import java.util.EnumSet;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.miscellaneous.ScandinavianNormalizationFilter;
import org.apache.lucene.analysis.miscellaneous.ScandinavianNormalizer;
import org.apache.lucene.analysis.miscellaneous.ScandinavianNormalizer.Foldings;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
/**
 * Token filter that normalizes the interchangeable Scandinavian characters æÆäÄöÖøØ and the folded
 * variants (ae, oe, aa) into åÅæÆøØ. It works like ScandinavianNormalizationFilter, but is
 * configured with the Norwegian subset of foldings (AE, OE and AA), so "ao" and "oo" are left
 * untouched.
 *
 * <p>blåbærsyltetøj == blåbärsyltetöj == blaabaersyltetoej
 *
 * @see ScandinavianNormalizationFilter
 */
public final class NorwegianNormalizationFilter extends TokenFilter {
  private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
  private final ScandinavianNormalizer normalizer;

  /**
   * Wraps the given stream with Norwegian normalization.
   *
   * @param input the token stream whose terms are normalized in place
   */
  public NorwegianNormalizationFilter(TokenStream input) {
    super(input);
    normalizer = new ScandinavianNormalizer(EnumSet.of(Foldings.AE, Foldings.OE, Foldings.AA));
  }

  @Override
  public final boolean incrementToken() throws IOException {
    if (input.incrementToken()) {
      // The normalizer rewrites the term buffer in place and reports the (possibly shorter) length.
      final char[] termBuffer = termAtt.buffer();
      final int normalizedLength = normalizer.processToken(termBuffer, termAtt.length());
      termAtt.setLength(normalizedLength);
      return true;
    }
    return false;
  }
}

View File

@ -0,0 +1,54 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.analysis.no;
import java.util.Map;
import org.apache.lucene.analysis.TokenFilterFactory;
import org.apache.lucene.analysis.TokenStream;
/**
 * Factory for {@link NorwegianNormalizationFilter}.
 *
 * @lucene.spi {@value #NAME}
 */
public class NorwegianNormalizationFilterFactory extends TokenFilterFactory {

  /** SPI name */
  public static final String NAME = "norwegianNormalization";

  /** Default ctor for compatibility with SPI */
  public NorwegianNormalizationFilterFactory() {
    throw defaultCtorException();
  }

  /**
   * Creates the factory from its configuration arguments.
   *
   * @param args configuration map; must be empty once the superclass constructor has run
   * @throws IllegalArgumentException if {@code args} still contains entries
   */
  public NorwegianNormalizationFilterFactory(Map<String, String> args) {
    super(args);
    final boolean hasLeftoverArgs = !args.isEmpty();
    if (hasLeftoverArgs) {
      throw new IllegalArgumentException("Unknown parameters: " + args);
    }
  }

  @Override
  public NorwegianNormalizationFilter create(TokenStream input) {
    final NorwegianNormalizationFilter filter = new NorwegianNormalizationFilter(input);
    return filter;
  }

  /** Normalization applies the very same filter as {@link #create(TokenStream)}. */
  @Override
  public TokenStream normalize(TokenStream input) {
    return create(input);
  }
}

View File

@ -95,6 +95,7 @@ org.apache.lucene.analysis.ngram.EdgeNGramFilterFactory
org.apache.lucene.analysis.ngram.NGramFilterFactory org.apache.lucene.analysis.ngram.NGramFilterFactory
org.apache.lucene.analysis.no.NorwegianLightStemFilterFactory org.apache.lucene.analysis.no.NorwegianLightStemFilterFactory
org.apache.lucene.analysis.no.NorwegianMinimalStemFilterFactory org.apache.lucene.analysis.no.NorwegianMinimalStemFilterFactory
org.apache.lucene.analysis.no.NorwegianNormalizationFilterFactory
org.apache.lucene.analysis.pattern.PatternReplaceFilterFactory org.apache.lucene.analysis.pattern.PatternReplaceFilterFactory
org.apache.lucene.analysis.pattern.PatternCaptureGroupFilterFactory org.apache.lucene.analysis.pattern.PatternCaptureGroupFilterFactory
org.apache.lucene.analysis.pattern.PatternTypingFilterFactory org.apache.lucene.analysis.pattern.PatternTypingFilterFactory

View File

@ -16,37 +16,12 @@
*/ */
package org.apache.lucene.analysis.miscellaneous; package org.apache.lucene.analysis.miscellaneous;
import org.apache.lucene.analysis.Analyzer; import org.apache.lucene.analysis.*;
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.analysis.MockTokenizer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.core.KeywordTokenizer; import org.apache.lucene.analysis.core.KeywordTokenizer;
public class TestScandinavianNormalizationFilter extends BaseTokenStreamTestCase { public class TestScandinavianNormalizationFilter extends BaseTokenStreamTestCase {
private Analyzer analyzer; public void testDefault() throws Exception {
Analyzer analyzer = createAnalyzer();
@Override
public void setUp() throws Exception {
super.setUp();
analyzer =
new Analyzer() {
@Override
protected TokenStreamComponents createComponents(String field) {
final Tokenizer tokenizer = new MockTokenizer(MockTokenizer.WHITESPACE, false);
final TokenStream stream = new ScandinavianNormalizationFilter(tokenizer);
return new TokenStreamComponents(tokenizer, stream);
}
};
}
@Override
public void tearDown() throws Exception {
analyzer.close();
super.tearDown();
}
public void test() throws Exception {
checkOneTerm(analyzer, "aeäaeeea", "æææeea"); // should not cause ArrayIndexOutOfBoundsException checkOneTerm(analyzer, "aeäaeeea", "æææeea"); // should not cause ArrayIndexOutOfBoundsException
@ -107,6 +82,7 @@ public class TestScandinavianNormalizationFilter extends BaseTokenStreamTestCase
checkOneTerm(analyzer, "Oe", "Ø"); checkOneTerm(analyzer, "Oe", "Ø");
checkOneTerm(analyzer, "OO", "Ø"); checkOneTerm(analyzer, "OO", "Ø");
checkOneTerm(analyzer, "OE", "Ø"); checkOneTerm(analyzer, "OE", "Ø");
analyzer.close();
} }
/** check that the empty string doesn't cause issues */ /** check that the empty string doesn't cause issues */
@ -126,6 +102,19 @@ public class TestScandinavianNormalizationFilter extends BaseTokenStreamTestCase
/** blast some random strings through the analyzer */ /** blast some random strings through the analyzer */
public void testRandomData() throws Exception { public void testRandomData() throws Exception {
Analyzer analyzer = createAnalyzer();
checkRandomData(random(), analyzer, 200 * RANDOM_MULTIPLIER); checkRandomData(random(), analyzer, 200 * RANDOM_MULTIPLIER);
analyzer.close();
}
private Analyzer createAnalyzer() {
return new Analyzer() {
@Override
protected TokenStreamComponents createComponents(String field) {
final Tokenizer tokenizer = new MockTokenizer(MockTokenizer.WHITESPACE, false);
final TokenStream stream = new ScandinavianNormalizationFilter(tokenizer);
return new TokenStreamComponents(tokenizer, stream);
}
};
} }
} }

View File

@ -21,10 +21,10 @@ import org.apache.lucene.analysis.TokenStream;
public class TestScandinavianNormalizationFilterFactory extends BaseTokenStreamFactoryTestCase { public class TestScandinavianNormalizationFilterFactory extends BaseTokenStreamFactoryTestCase {
public void testStemming() throws Exception { public void testDefault() throws Exception {
TokenStream stream = whitespaceMockTokenizer("räksmörgås"); TokenStream stream = whitespaceMockTokenizer("räksmörgås_ae_oe_aa_oo_ao_AE_OE_AA_OO_AO");
stream = tokenFilterFactory("ScandinavianNormalization").create(stream); stream = tokenFilterFactory("ScandinavianNormalization").create(stream);
assertTokenStreamContents(stream, new String[] {"ræksmørgås"}); assertTokenStreamContents(stream, new String[] {"ræksmørgås_æ_ø_å_ø_å_Æ_Ø_Å_Ø_Å"});
} }
/** Test that bogus arguments result in exception */ /** Test that bogus arguments result in exception */
@ -35,6 +35,7 @@ public class TestScandinavianNormalizationFilterFactory extends BaseTokenStreamF
() -> { () -> {
tokenFilterFactory("ScandinavianNormalization", "bogusArg", "bogusValue"); tokenFilterFactory("ScandinavianNormalization", "bogusArg", "bogusValue");
}); });
assertTrue(expected.getMessage().contains("Unknown parameters")); assertTrue(
"Got " + expected.getMessage(), expected.getMessage().contains("Unknown parameters"));
} }
} }

View File

@ -0,0 +1,130 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.analysis.miscellaneous;
import java.io.IOException;
import java.util.Collections;
import java.util.Set;
import org.apache.lucene.analysis.*;
import org.apache.lucene.analysis.miscellaneous.ScandinavianNormalizer.Foldings;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
/**
 * Low-level tests of {@link ScandinavianNormalizer}: each test enables a single folding (or none)
 * and verifies that only that folding is applied.
 *
 * <p>Analyzers are opened with try-with-resources so they are closed even when an assertion fails.
 */
public class TestScandinavianNormalizer extends BaseTokenStreamTestCase {

  /** With no foldings configured, every two-letter sequence must pass through unchanged. */
  public void testNoFoldings() throws Exception {
    try (Analyzer analyzer = createAnalyzer(Collections.emptySet())) {
      checkOneTerm(analyzer, "aa", "aa");
      checkOneTerm(analyzer, "ao", "ao");
      checkOneTerm(analyzer, "ae", "ae");
      checkOneTerm(analyzer, "oo", "oo");
      checkOneTerm(analyzer, "oe", "oe");
    }
  }

  /** Only ae (in any case combination) is folded. */
  public void testAeFolding() throws Exception {
    try (Analyzer analyzer = createAnalyzer(Set.of(Foldings.AE))) {
      checkOneTerm(analyzer, "aa", "aa");
      checkOneTerm(analyzer, "ao", "ao");
      checkOneTerm(analyzer, "ae", "æ");
      checkOneTerm(analyzer, "aE", "æ");
      checkOneTerm(analyzer, "Ae", "Æ");
      checkOneTerm(analyzer, "AE", "Æ");
      checkOneTerm(analyzer, "oo", "oo");
      checkOneTerm(analyzer, "oe", "oe");
    }
  }

  /** Only aa (in any case combination) is folded. */
  public void testAaFolding() throws Exception {
    try (Analyzer analyzer = createAnalyzer(Set.of(Foldings.AA))) {
      checkOneTerm(analyzer, "aa", "å");
      checkOneTerm(analyzer, "aA", "å");
      checkOneTerm(analyzer, "Aa", "Å");
      checkOneTerm(analyzer, "AA", "Å");
      checkOneTerm(analyzer, "ao", "ao");
      checkOneTerm(analyzer, "ae", "ae");
      checkOneTerm(analyzer, "oo", "oo");
      checkOneTerm(analyzer, "oe", "oe");
    }
  }

  /** Only oe (in any case combination) is folded. */
  public void testOeFolding() throws Exception {
    try (Analyzer analyzer = createAnalyzer(Set.of(Foldings.OE))) {
      checkOneTerm(analyzer, "aa", "aa");
      checkOneTerm(analyzer, "ao", "ao");
      checkOneTerm(analyzer, "ae", "ae");
      checkOneTerm(analyzer, "oo", "oo");
      checkOneTerm(analyzer, "oe", "ø");
      checkOneTerm(analyzer, "oE", "ø");
      checkOneTerm(analyzer, "Oe", "Ø");
      checkOneTerm(analyzer, "OE", "Ø");
    }
  }

  /** Only oo (in any case combination) is folded. */
  public void testOoFolding() throws Exception {
    try (Analyzer analyzer = createAnalyzer(Set.of(Foldings.OO))) {
      checkOneTerm(analyzer, "aa", "aa");
      checkOneTerm(analyzer, "ao", "ao");
      checkOneTerm(analyzer, "ae", "ae");
      checkOneTerm(analyzer, "oo", "ø");
      checkOneTerm(analyzer, "oO", "ø");
      checkOneTerm(analyzer, "Oo", "Ø");
      checkOneTerm(analyzer, "OO", "Ø");
      checkOneTerm(analyzer, "oe", "oe");
    }
  }

  /** Only ao (in any case combination) is folded. */
  public void testAoFolding() throws Exception {
    try (Analyzer analyzer = createAnalyzer(Set.of(Foldings.AO))) {
      checkOneTerm(analyzer, "aa", "aa");
      checkOneTerm(analyzer, "ao", "å");
      checkOneTerm(analyzer, "aO", "å");
      checkOneTerm(analyzer, "Ao", "Å");
      checkOneTerm(analyzer, "AO", "Å");
      checkOneTerm(analyzer, "ae", "ae");
      checkOneTerm(analyzer, "oo", "oo");
      checkOneTerm(analyzer, "oe", "oe");
    }
  }

  /**
   * Builds an analyzer whose only filter feeds each term through a {@link ScandinavianNormalizer}
   * configured with the given foldings.
   *
   * @param foldings the foldings to enable in the normalizer under test
   * @return a new analyzer; the caller is responsible for closing it
   */
  private Analyzer createAnalyzer(Set<Foldings> foldings) {
    return new Analyzer() {
      @Override
      protected TokenStreamComponents createComponents(String field) {
        final Tokenizer tokenizer = new MockTokenizer(MockTokenizer.WHITESPACE, false);
        final TokenStream stream =
            new TokenFilter(tokenizer) {
              private final CharTermAttribute charTermAttribute =
                  addAttribute(CharTermAttribute.class);
              private final ScandinavianNormalizer normalizer =
                  new ScandinavianNormalizer(foldings);

              @Override
              public boolean incrementToken() throws IOException {
                if (!input.incrementToken()) {
                  return false;
                }
                charTermAttribute.setLength(
                    normalizer.processToken(
                        charTermAttribute.buffer(), charTermAttribute.length()));
                return true;
              }
            };
        return new TokenStreamComponents(tokenizer, stream);
      }
    };
  }
}

View File

@ -0,0 +1,120 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.analysis.no;
import org.apache.lucene.analysis.*;
import org.apache.lucene.analysis.core.KeywordTokenizer;
import org.apache.lucene.analysis.miscellaneous.ScandinavianNormalizationFilter;
/**
 * Tests {@link NorwegianNormalizationFilter}, the Norwegian variant of {@link
 * ScandinavianNormalizationFilter}. The expectations below show that "ao" and "oo" are left
 * unfolded.
 *
 * <p>Analyzers are opened with try-with-resources so they are closed even when an assertion fails.
 */
public class TestNorwegianNormalizationFilter extends BaseTokenStreamTestCase {

  public void testDefault() throws Exception {
    try (Analyzer analyzer = createAnalyzer()) {
      checkOneTerm(
          analyzer, "aeäaeeea", "æææeea"); // should not cause ArrayIndexOutOfBoundsException
      checkOneTerm(analyzer, "aeäaeeeae", "æææeeæ");
      checkOneTerm(analyzer, "aeaeeeae", "ææeeæ");
      checkOneTerm(analyzer, "bøen", "bøen");
      checkOneTerm(analyzer, "bOEen", "bØen");
      checkOneTerm(analyzer, "åene", "åene");
      checkOneTerm(analyzer, "blåbærsyltetøj", "blåbærsyltetøj");
      checkOneTerm(analyzer, "blaabaersyltetöj", "blåbærsyltetøj");
      checkOneTerm(analyzer, "räksmörgås", "ræksmørgås");
      checkOneTerm(analyzer, "raeksmörgaas", "ræksmørgås");
      checkOneTerm(analyzer, "raeksmoergås", "ræksmørgås");
      checkOneTerm(analyzer, "ab", "ab");
      checkOneTerm(analyzer, "ob", "ob");
      checkOneTerm(analyzer, "Ab", "Ab");
      checkOneTerm(analyzer, "Ob", "Ob");
      checkOneTerm(analyzer, "å", "å");
      checkOneTerm(analyzer, "aa", "å");
      checkOneTerm(analyzer, "aA", "å");
      checkOneTerm(analyzer, "ao", "ao");
      checkOneTerm(analyzer, "aO", "aO");
      checkOneTerm(analyzer, "AA", "Å");
      checkOneTerm(analyzer, "Aa", "Å");
      checkOneTerm(analyzer, "Ao", "Ao");
      checkOneTerm(analyzer, "AO", "AO");
      checkOneTerm(analyzer, "æ", "æ");
      checkOneTerm(analyzer, "ä", "æ");
      checkOneTerm(analyzer, "Æ", "Æ");
      checkOneTerm(analyzer, "Ä", "Æ");
      checkOneTerm(analyzer, "ae", "æ");
      checkOneTerm(analyzer, "aE", "æ");
      checkOneTerm(analyzer, "Ae", "Æ");
      checkOneTerm(analyzer, "AE", "Æ");
      checkOneTerm(analyzer, "ö", "ø");
      checkOneTerm(analyzer, "ø", "ø");
      checkOneTerm(analyzer, "Ö", "Ø");
      checkOneTerm(analyzer, "Ø", "Ø");
      checkOneTerm(analyzer, "oo", "oo");
      checkOneTerm(analyzer, "oe", "ø");
      checkOneTerm(analyzer, "oO", "oO");
      checkOneTerm(analyzer, "oE", "ø");
      checkOneTerm(analyzer, "Oo", "Oo");
      checkOneTerm(analyzer, "Oe", "Ø");
      checkOneTerm(analyzer, "OO", "OO");
      checkOneTerm(analyzer, "OE", "Ø");
    }
  }

  /** check that the empty string doesn't cause issues */
  public void testEmptyTerm() throws Exception {
    try (Analyzer a = createKeywordAnalyzer()) {
      checkOneTerm(a, "", "");
    }
  }

  /** blast some random strings through the analyzer */
  public void testRandomData() throws Exception {
    try (Analyzer analyzer = createAnalyzer()) {
      checkRandomData(random(), analyzer, 200 * RANDOM_MULTIPLIER);
    }
  }

  /**
   * Builds a whitespace-tokenizing analyzer that applies the Norwegian normalization filter.
   *
   * @return a new analyzer; the caller is responsible for closing it
   */
  private Analyzer createAnalyzer() {
    return new Analyzer() {
      @Override
      protected TokenStreamComponents createComponents(String field) {
        final Tokenizer tokenizer = new MockTokenizer(MockTokenizer.WHITESPACE, false);
        final TokenStream stream = new NorwegianNormalizationFilter(tokenizer);
        return new TokenStreamComponents(tokenizer, stream);
      }
    };
  }

  /**
   * Builds a keyword-tokenizing analyzer (needed to produce an empty term) that applies the
   * Norwegian normalization filter, i.e. the filter this test class is about.
   *
   * @return a new analyzer; the caller is responsible for closing it
   */
  private Analyzer createKeywordAnalyzer() {
    return new Analyzer() {
      @Override
      protected TokenStreamComponents createComponents(String fieldName) {
        Tokenizer tokenizer = new KeywordTokenizer();
        return new TokenStreamComponents(tokenizer, new NorwegianNormalizationFilter(tokenizer));
      }
    };
  }
}

View File

@ -0,0 +1,41 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.analysis.no;
import org.apache.lucene.analysis.BaseTokenStreamFactoryTestCase;
import org.apache.lucene.analysis.TokenStream;
/** Checks that the Norwegian normalization filter can be obtained and configured through SPI. */
public class TestNorwegianNormalizationFilterFactory extends BaseTokenStreamFactoryTestCase {

  /** The factory-created filter folds ae, oe and aa but leaves oo and ao alone. */
  public void testDefault() throws Exception {
    final TokenStream source = whitespaceMockTokenizer("räksmörgås_ae_oe_aa_oo_ao_AE_OE_AA_OO_AO");
    final TokenStream normalized = tokenFilterFactory("NorwegianNormalization").create(source);
    final String[] expectedTerms = {"ræksmørgås_æ_ø_å_oo_ao_Æ_Ø_Å_OO_AO"};
    assertTokenStreamContents(normalized, expectedTerms);
  }

  /** Test that bogus arguments result in exception */
  public void testBogusArguments() throws Exception {
    final IllegalArgumentException expected =
        expectThrows(
            IllegalArgumentException.class,
            () -> tokenFilterFactory("NorwegianNormalization", "bogusArg", "bogusValue"));
    final String message = expected.getMessage();
    assertTrue("Got " + expected.getMessage(), message.contains("Unknown parameters"));
  }
}