mirror of https://github.com/apache/lucene.git
LUCENE-9929 NorwegianNormalizationFilter (#84)
This commit is contained in:
parent
6ebf959502
commit
7dd7077609
|
@ -225,6 +225,9 @@ Improvements
|
||||||
|
|
||||||
* LUCENE-9850: Switch to PFOR encoding for doc IDs (instead of FOR). (Greg Miller)
|
* LUCENE-9850: Switch to PFOR encoding for doc IDs (instead of FOR). (Greg Miller)
|
||||||
|
|
||||||
|
* LUCENE-9929: Add NorwegianNormalizationFilter, which does the same as ScandinavianNormalizationFilter except
|
||||||
|
it does not fold oo->ø and ao->å. (janhoy, Robert Muir, Adrien Grand)
|
||||||
|
|
||||||
Bug fixes
|
Bug fixes
|
||||||
|
|
||||||
|
|
||||||
|
|
|
@ -16,11 +16,12 @@
|
||||||
*/
|
*/
|
||||||
package org.apache.lucene.analysis.miscellaneous;
|
package org.apache.lucene.analysis.miscellaneous;
|
||||||
|
|
||||||
|
import static org.apache.lucene.analysis.miscellaneous.ScandinavianNormalizer.ALL_FOLDINGS;
|
||||||
|
|
||||||
import java.io.IOException;
|
import java.io.IOException;
|
||||||
import org.apache.lucene.analysis.TokenFilter;
|
import org.apache.lucene.analysis.TokenFilter;
|
||||||
import org.apache.lucene.analysis.TokenStream;
|
import org.apache.lucene.analysis.TokenStream;
|
||||||
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
|
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
|
||||||
import org.apache.lucene.analysis.util.StemmerUtil;
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* This filter normalizes use of the interchangeable Scandinavian characters æÆäÄöÖøØ and folded
|
* This filter normalizes use of the interchangeable Scandinavian characters æÆäÄöÖøØ and folded
|
||||||
|
@ -33,98 +34,29 @@ import org.apache.lucene.analysis.util.StemmerUtil;
|
||||||
* <p>blåbærsyltetøj == blåbärsyltetöj == blaabaarsyltetoej but not blabarsyltetoj räksmörgås ==
|
* <p>blåbærsyltetøj == blåbärsyltetöj == blaabaarsyltetoej but not blabarsyltetoj räksmörgås ==
|
||||||
* ræksmørgås == ræksmörgaos == raeksmoergaas but not raksmorgas
|
* ræksmørgås == ræksmörgaos == raeksmoergaas but not raksmorgas
|
||||||
*
|
*
|
||||||
|
* <p>There are also separate filters for Norwegian, Danish and Swedish with slightly differing
|
||||||
|
* settings
|
||||||
|
*
|
||||||
* @see ScandinavianFoldingFilter
|
* @see ScandinavianFoldingFilter
|
||||||
*/
|
*/
|
||||||
public final class ScandinavianNormalizationFilter extends TokenFilter {
|
public final class ScandinavianNormalizationFilter extends TokenFilter {
|
||||||
|
|
||||||
|
private final ScandinavianNormalizer normalizer;
|
||||||
|
|
||||||
public ScandinavianNormalizationFilter(TokenStream input) {
|
public ScandinavianNormalizationFilter(TokenStream input) {
|
||||||
super(input);
|
super(input);
|
||||||
|
this.normalizer = new ScandinavianNormalizer(ALL_FOLDINGS);
|
||||||
}
|
}
|
||||||
|
|
||||||
private final CharTermAttribute charTermAttribute = addAttribute(CharTermAttribute.class);
|
private final CharTermAttribute charTermAttribute = addAttribute(CharTermAttribute.class);
|
||||||
|
|
||||||
private static final char AA = '\u00C5'; // Å
|
|
||||||
private static final char aa = '\u00E5'; // å
|
|
||||||
private static final char AE = '\u00C6'; // Æ
|
|
||||||
private static final char ae = '\u00E6'; // æ
|
|
||||||
private static final char AE_se = '\u00C4'; // Ä
|
|
||||||
private static final char ae_se = '\u00E4'; // ä
|
|
||||||
private static final char OE = '\u00D8'; // Ø
|
|
||||||
private static final char oe = '\u00F8'; // ø
|
|
||||||
private static final char OE_se = '\u00D6'; // Ö
|
|
||||||
private static final char oe_se = '\u00F6'; // ö
|
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public boolean incrementToken() throws IOException {
|
public boolean incrementToken() throws IOException {
|
||||||
if (!input.incrementToken()) {
|
if (!input.incrementToken()) {
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
charTermAttribute.setLength(
|
||||||
char[] buffer = charTermAttribute.buffer();
|
normalizer.processToken(charTermAttribute.buffer(), charTermAttribute.length()));
|
||||||
int length = charTermAttribute.length();
|
|
||||||
|
|
||||||
int i;
|
|
||||||
for (i = 0; i < length; i++) {
|
|
||||||
|
|
||||||
if (buffer[i] == ae_se) {
|
|
||||||
buffer[i] = ae;
|
|
||||||
|
|
||||||
} else if (buffer[i] == AE_se) {
|
|
||||||
buffer[i] = AE;
|
|
||||||
|
|
||||||
} else if (buffer[i] == oe_se) {
|
|
||||||
buffer[i] = oe;
|
|
||||||
|
|
||||||
} else if (buffer[i] == OE_se) {
|
|
||||||
buffer[i] = OE;
|
|
||||||
|
|
||||||
} else if (length - 1 > i) {
|
|
||||||
|
|
||||||
if (buffer[i] == 'a'
|
|
||||||
&& (buffer[i + 1] == 'a'
|
|
||||||
|| buffer[i + 1] == 'o'
|
|
||||||
|| buffer[i + 1] == 'A'
|
|
||||||
|| buffer[i + 1] == 'O')) {
|
|
||||||
length = StemmerUtil.delete(buffer, i + 1, length);
|
|
||||||
buffer[i] = aa;
|
|
||||||
|
|
||||||
} else if (buffer[i] == 'A'
|
|
||||||
&& (buffer[i + 1] == 'a'
|
|
||||||
|| buffer[i + 1] == 'A'
|
|
||||||
|| buffer[i + 1] == 'o'
|
|
||||||
|| buffer[i + 1] == 'O')) {
|
|
||||||
length = StemmerUtil.delete(buffer, i + 1, length);
|
|
||||||
buffer[i] = AA;
|
|
||||||
|
|
||||||
} else if (buffer[i] == 'a' && (buffer[i + 1] == 'e' || buffer[i + 1] == 'E')) {
|
|
||||||
length = StemmerUtil.delete(buffer, i + 1, length);
|
|
||||||
buffer[i] = ae;
|
|
||||||
|
|
||||||
} else if (buffer[i] == 'A' && (buffer[i + 1] == 'e' || buffer[i + 1] == 'E')) {
|
|
||||||
length = StemmerUtil.delete(buffer, i + 1, length);
|
|
||||||
buffer[i] = AE;
|
|
||||||
|
|
||||||
} else if (buffer[i] == 'o'
|
|
||||||
&& (buffer[i + 1] == 'e'
|
|
||||||
|| buffer[i + 1] == 'E'
|
|
||||||
|| buffer[i + 1] == 'o'
|
|
||||||
|| buffer[i + 1] == 'O')) {
|
|
||||||
length = StemmerUtil.delete(buffer, i + 1, length);
|
|
||||||
buffer[i] = oe;
|
|
||||||
|
|
||||||
} else if (buffer[i] == 'O'
|
|
||||||
&& (buffer[i + 1] == 'e'
|
|
||||||
|| buffer[i + 1] == 'E'
|
|
||||||
|| buffer[i + 1] == 'o'
|
|
||||||
|| buffer[i + 1] == 'O')) {
|
|
||||||
length = StemmerUtil.delete(buffer, i + 1, length);
|
|
||||||
buffer[i] = OE;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
charTermAttribute.setLength(length);
|
|
||||||
|
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -0,0 +1,135 @@
|
||||||
|
/*
|
||||||
|
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||||
|
* contributor license agreements. See the NOTICE file distributed with
|
||||||
|
* this work for additional information regarding copyright ownership.
|
||||||
|
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||||
|
* (the "License"); you may not use this file except in compliance with
|
||||||
|
* the License. You may obtain a copy of the License at
|
||||||
|
*
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
* See the License for the specific language governing permissions and
|
||||||
|
* limitations under the License.
|
||||||
|
*/
|
||||||
|
package org.apache.lucene.analysis.miscellaneous;
|
||||||
|
|
||||||
|
import java.util.EnumSet;
|
||||||
|
import java.util.Set;
|
||||||
|
import org.apache.lucene.analysis.util.StemmerUtil;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* This Normalizer does the heavy lifting for a set of Scandinavian normalization filters,
|
||||||
|
* normalizing use of the interchangeable Scandinavian characters æÆäÄöÖøØ and folded variants (aa,
|
||||||
|
* ao, ae, oe and oo) by transforming them to åÅæÆøØ.
|
||||||
|
*
|
||||||
|
* @since 9.0
|
||||||
|
* @lucene.internal
|
||||||
|
*/
|
||||||
|
public final class ScandinavianNormalizer {
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Create the instance, while choosing which foldings to apply. This may differ between Norwegian,
|
||||||
|
* Danish and Swedish.
|
||||||
|
*
|
||||||
|
* @param foldings a Set of Foldings to apply (i.e. AE, OE, AA, AO, OO)
|
||||||
|
*/
|
||||||
|
public ScandinavianNormalizer(Set<Foldings> foldings) {
|
||||||
|
this.foldings = foldings;
|
||||||
|
}
|
||||||
|
|
||||||
|
/** List of possible foldings that can be used when configuring the filter */
|
||||||
|
public enum Foldings {
|
||||||
|
AA,
|
||||||
|
AO,
|
||||||
|
AE,
|
||||||
|
OE,
|
||||||
|
OO
|
||||||
|
}
|
||||||
|
|
||||||
|
private final Set<Foldings> foldings;
|
||||||
|
|
||||||
|
public static final Set<Foldings> ALL_FOLDINGS = EnumSet.allOf(Foldings.class);
|
||||||
|
|
||||||
|
static final char AA = '\u00C5'; // Å
|
||||||
|
static final char aa = '\u00E5'; // å
|
||||||
|
static final char AE = '\u00C6'; // Æ
|
||||||
|
static final char ae = '\u00E6'; // æ
|
||||||
|
static final char AE_se = '\u00C4'; // Ä
|
||||||
|
static final char ae_se = '\u00E4'; // ä
|
||||||
|
static final char OE = '\u00D8'; // Ø
|
||||||
|
static final char oe = '\u00F8'; // ø
|
||||||
|
static final char OE_se = '\u00D6'; // Ö
|
||||||
|
static final char oe_se = '\u00F6'; // ö
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Takes the original buffer and length as input. Modifies the buffer in-place and returns new
|
||||||
|
* length
|
||||||
|
*
|
||||||
|
* @return new length
|
||||||
|
*/
|
||||||
|
public int processToken(char[] buffer, int length) {
|
||||||
|
int i;
|
||||||
|
for (i = 0; i < length; i++) {
|
||||||
|
|
||||||
|
if (buffer[i] == ae_se) {
|
||||||
|
buffer[i] = ae;
|
||||||
|
|
||||||
|
} else if (buffer[i] == AE_se) {
|
||||||
|
buffer[i] = AE;
|
||||||
|
|
||||||
|
} else if (buffer[i] == oe_se) {
|
||||||
|
buffer[i] = oe;
|
||||||
|
|
||||||
|
} else if (buffer[i] == OE_se) {
|
||||||
|
buffer[i] = OE;
|
||||||
|
|
||||||
|
} else if (length - 1 > i) {
|
||||||
|
|
||||||
|
if (buffer[i] == 'a'
|
||||||
|
&& (foldings.contains(Foldings.AA) && (buffer[i + 1] == 'a' || buffer[i + 1] == 'A')
|
||||||
|
|| foldings.contains(Foldings.AO)
|
||||||
|
&& (buffer[i + 1] == 'o' || buffer[i + 1] == 'O'))) {
|
||||||
|
length = StemmerUtil.delete(buffer, i + 1, length);
|
||||||
|
buffer[i] = aa;
|
||||||
|
|
||||||
|
} else if (buffer[i] == 'A'
|
||||||
|
&& (foldings.contains(Foldings.AA) && (buffer[i + 1] == 'a' || buffer[i + 1] == 'A')
|
||||||
|
|| foldings.contains(Foldings.AO)
|
||||||
|
&& (buffer[i + 1] == 'o' || buffer[i + 1] == 'O'))) {
|
||||||
|
length = StemmerUtil.delete(buffer, i + 1, length);
|
||||||
|
buffer[i] = AA;
|
||||||
|
|
||||||
|
} else if (buffer[i] == 'a'
|
||||||
|
&& foldings.contains(Foldings.AE)
|
||||||
|
&& (buffer[i + 1] == 'e' || buffer[i + 1] == 'E')) {
|
||||||
|
length = StemmerUtil.delete(buffer, i + 1, length);
|
||||||
|
buffer[i] = ae;
|
||||||
|
|
||||||
|
} else if (buffer[i] == 'A'
|
||||||
|
&& foldings.contains(Foldings.AE)
|
||||||
|
&& (buffer[i + 1] == 'e' || buffer[i + 1] == 'E')) {
|
||||||
|
length = StemmerUtil.delete(buffer, i + 1, length);
|
||||||
|
buffer[i] = AE;
|
||||||
|
|
||||||
|
} else if (buffer[i] == 'o'
|
||||||
|
&& (foldings.contains(Foldings.OE) && (buffer[i + 1] == 'e' || buffer[i + 1] == 'E')
|
||||||
|
|| foldings.contains(Foldings.OO)
|
||||||
|
&& (buffer[i + 1] == 'o' || buffer[i + 1] == 'O'))) {
|
||||||
|
length = StemmerUtil.delete(buffer, i + 1, length);
|
||||||
|
buffer[i] = oe;
|
||||||
|
|
||||||
|
} else if (buffer[i] == 'O'
|
||||||
|
&& (foldings.contains(Foldings.OE) && (buffer[i + 1] == 'e' || buffer[i + 1] == 'E')
|
||||||
|
|| foldings.contains(Foldings.OO)
|
||||||
|
&& (buffer[i + 1] == 'o' || buffer[i + 1] == 'O'))) {
|
||||||
|
length = StemmerUtil.delete(buffer, i + 1, length);
|
||||||
|
buffer[i] = OE;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return length;
|
||||||
|
}
|
||||||
|
}
|
|
@ -0,0 +1,56 @@
|
||||||
|
/*
|
||||||
|
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||||
|
* contributor license agreements. See the NOTICE file distributed with
|
||||||
|
* this work for additional information regarding copyright ownership.
|
||||||
|
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||||
|
* (the "License"); you may not use this file except in compliance with
|
||||||
|
* the License. You may obtain a copy of the License at
|
||||||
|
*
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
* See the License for the specific language governing permissions and
|
||||||
|
* limitations under the License.
|
||||||
|
*/
|
||||||
|
package org.apache.lucene.analysis.no;
|
||||||
|
|
||||||
|
import java.io.IOException;
|
||||||
|
import java.util.EnumSet;
|
||||||
|
import org.apache.lucene.analysis.TokenFilter;
|
||||||
|
import org.apache.lucene.analysis.TokenStream;
|
||||||
|
import org.apache.lucene.analysis.miscellaneous.ScandinavianNormalizationFilter;
|
||||||
|
import org.apache.lucene.analysis.miscellaneous.ScandinavianNormalizer;
|
||||||
|
import org.apache.lucene.analysis.miscellaneous.ScandinavianNormalizer.Foldings;
|
||||||
|
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* This filter normalize use of the interchangeable Scandinavian characters æÆäÄöÖøØ and folded
|
||||||
|
* variants (ae, oe, aa) by transforming them to åÅæÆøØ. This is similar to
|
||||||
|
* ScandinavianNormalizationFilter, except for the folding rules customized for Norwegian.
|
||||||
|
*
|
||||||
|
* <p>blåbærsyltetøj == blåbärsyltetöj == blaabaersyltetoej
|
||||||
|
*
|
||||||
|
* @see ScandinavianNormalizationFilter
|
||||||
|
*/
|
||||||
|
public final class NorwegianNormalizationFilter extends TokenFilter {
|
||||||
|
private final ScandinavianNormalizer normalizer;
|
||||||
|
|
||||||
|
public NorwegianNormalizationFilter(TokenStream input) {
|
||||||
|
super(input);
|
||||||
|
this.normalizer = new ScandinavianNormalizer(EnumSet.of(Foldings.AE, Foldings.OE, Foldings.AA));
|
||||||
|
}
|
||||||
|
|
||||||
|
private final CharTermAttribute charTermAttribute = addAttribute(CharTermAttribute.class);
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public final boolean incrementToken() throws IOException {
|
||||||
|
if (!input.incrementToken()) {
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
charTermAttribute.setLength(
|
||||||
|
normalizer.processToken(charTermAttribute.buffer(), charTermAttribute.length()));
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
}
|
|
@ -0,0 +1,54 @@
|
||||||
|
/*
|
||||||
|
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||||
|
* contributor license agreements. See the NOTICE file distributed with
|
||||||
|
* this work for additional information regarding copyright ownership.
|
||||||
|
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||||
|
* (the "License"); you may not use this file except in compliance with
|
||||||
|
* the License. You may obtain a copy of the License at
|
||||||
|
*
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
* See the License for the specific language governing permissions and
|
||||||
|
* limitations under the License.
|
||||||
|
*/
|
||||||
|
package org.apache.lucene.analysis.no;
|
||||||
|
|
||||||
|
import java.util.Map;
|
||||||
|
import org.apache.lucene.analysis.TokenFilterFactory;
|
||||||
|
import org.apache.lucene.analysis.TokenStream;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Factory for {@link NorwegianNormalizationFilter}.
|
||||||
|
*
|
||||||
|
* @lucene.spi {@value #NAME}
|
||||||
|
*/
|
||||||
|
public class NorwegianNormalizationFilterFactory extends TokenFilterFactory {
|
||||||
|
|
||||||
|
/** SPI name */
|
||||||
|
public static final String NAME = "norwegianNormalization";
|
||||||
|
|
||||||
|
public NorwegianNormalizationFilterFactory(Map<String, String> args) {
|
||||||
|
super(args);
|
||||||
|
if (!args.isEmpty()) {
|
||||||
|
throw new IllegalArgumentException("Unknown parameters: " + args);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/** Default ctor for compatibility with SPI */
|
||||||
|
public NorwegianNormalizationFilterFactory() {
|
||||||
|
throw defaultCtorException();
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public NorwegianNormalizationFilter create(TokenStream input) {
|
||||||
|
return new NorwegianNormalizationFilter(input);
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public TokenStream normalize(TokenStream input) {
|
||||||
|
return create(input);
|
||||||
|
}
|
||||||
|
}
|
|
@ -95,6 +95,7 @@ org.apache.lucene.analysis.ngram.EdgeNGramFilterFactory
|
||||||
org.apache.lucene.analysis.ngram.NGramFilterFactory
|
org.apache.lucene.analysis.ngram.NGramFilterFactory
|
||||||
org.apache.lucene.analysis.no.NorwegianLightStemFilterFactory
|
org.apache.lucene.analysis.no.NorwegianLightStemFilterFactory
|
||||||
org.apache.lucene.analysis.no.NorwegianMinimalStemFilterFactory
|
org.apache.lucene.analysis.no.NorwegianMinimalStemFilterFactory
|
||||||
|
org.apache.lucene.analysis.no.NorwegianNormalizationFilterFactory
|
||||||
org.apache.lucene.analysis.pattern.PatternReplaceFilterFactory
|
org.apache.lucene.analysis.pattern.PatternReplaceFilterFactory
|
||||||
org.apache.lucene.analysis.pattern.PatternCaptureGroupFilterFactory
|
org.apache.lucene.analysis.pattern.PatternCaptureGroupFilterFactory
|
||||||
org.apache.lucene.analysis.pattern.PatternTypingFilterFactory
|
org.apache.lucene.analysis.pattern.PatternTypingFilterFactory
|
||||||
|
|
|
@ -16,37 +16,12 @@
|
||||||
*/
|
*/
|
||||||
package org.apache.lucene.analysis.miscellaneous;
|
package org.apache.lucene.analysis.miscellaneous;
|
||||||
|
|
||||||
import org.apache.lucene.analysis.Analyzer;
|
import org.apache.lucene.analysis.*;
|
||||||
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
|
|
||||||
import org.apache.lucene.analysis.MockTokenizer;
|
|
||||||
import org.apache.lucene.analysis.TokenStream;
|
|
||||||
import org.apache.lucene.analysis.Tokenizer;
|
|
||||||
import org.apache.lucene.analysis.core.KeywordTokenizer;
|
import org.apache.lucene.analysis.core.KeywordTokenizer;
|
||||||
|
|
||||||
public class TestScandinavianNormalizationFilter extends BaseTokenStreamTestCase {
|
public class TestScandinavianNormalizationFilter extends BaseTokenStreamTestCase {
|
||||||
private Analyzer analyzer;
|
public void testDefault() throws Exception {
|
||||||
|
Analyzer analyzer = createAnalyzer();
|
||||||
@Override
|
|
||||||
public void setUp() throws Exception {
|
|
||||||
super.setUp();
|
|
||||||
analyzer =
|
|
||||||
new Analyzer() {
|
|
||||||
@Override
|
|
||||||
protected TokenStreamComponents createComponents(String field) {
|
|
||||||
final Tokenizer tokenizer = new MockTokenizer(MockTokenizer.WHITESPACE, false);
|
|
||||||
final TokenStream stream = new ScandinavianNormalizationFilter(tokenizer);
|
|
||||||
return new TokenStreamComponents(tokenizer, stream);
|
|
||||||
}
|
|
||||||
};
|
|
||||||
}
|
|
||||||
|
|
||||||
@Override
|
|
||||||
public void tearDown() throws Exception {
|
|
||||||
analyzer.close();
|
|
||||||
super.tearDown();
|
|
||||||
}
|
|
||||||
|
|
||||||
public void test() throws Exception {
|
|
||||||
|
|
||||||
checkOneTerm(analyzer, "aeäaeeea", "æææeea"); // should not cause ArrayIndexOutOfBoundsException
|
checkOneTerm(analyzer, "aeäaeeea", "æææeea"); // should not cause ArrayIndexOutOfBoundsException
|
||||||
|
|
||||||
|
@ -107,6 +82,7 @@ public class TestScandinavianNormalizationFilter extends BaseTokenStreamTestCase
|
||||||
checkOneTerm(analyzer, "Oe", "Ø");
|
checkOneTerm(analyzer, "Oe", "Ø");
|
||||||
checkOneTerm(analyzer, "OO", "Ø");
|
checkOneTerm(analyzer, "OO", "Ø");
|
||||||
checkOneTerm(analyzer, "OE", "Ø");
|
checkOneTerm(analyzer, "OE", "Ø");
|
||||||
|
analyzer.close();
|
||||||
}
|
}
|
||||||
|
|
||||||
/** check that the empty string doesn't cause issues */
|
/** check that the empty string doesn't cause issues */
|
||||||
|
@ -126,6 +102,19 @@ public class TestScandinavianNormalizationFilter extends BaseTokenStreamTestCase
|
||||||
|
|
||||||
/** blast some random strings through the analyzer */
|
/** blast some random strings through the analyzer */
|
||||||
public void testRandomData() throws Exception {
|
public void testRandomData() throws Exception {
|
||||||
|
Analyzer analyzer = createAnalyzer();
|
||||||
checkRandomData(random(), analyzer, 200 * RANDOM_MULTIPLIER);
|
checkRandomData(random(), analyzer, 200 * RANDOM_MULTIPLIER);
|
||||||
|
analyzer.close();
|
||||||
|
}
|
||||||
|
|
||||||
|
private Analyzer createAnalyzer() {
|
||||||
|
return new Analyzer() {
|
||||||
|
@Override
|
||||||
|
protected TokenStreamComponents createComponents(String field) {
|
||||||
|
final Tokenizer tokenizer = new MockTokenizer(MockTokenizer.WHITESPACE, false);
|
||||||
|
final TokenStream stream = new ScandinavianNormalizationFilter(tokenizer);
|
||||||
|
return new TokenStreamComponents(tokenizer, stream);
|
||||||
|
}
|
||||||
|
};
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -21,10 +21,10 @@ import org.apache.lucene.analysis.TokenStream;
|
||||||
|
|
||||||
public class TestScandinavianNormalizationFilterFactory extends BaseTokenStreamFactoryTestCase {
|
public class TestScandinavianNormalizationFilterFactory extends BaseTokenStreamFactoryTestCase {
|
||||||
|
|
||||||
public void testStemming() throws Exception {
|
public void testDefault() throws Exception {
|
||||||
TokenStream stream = whitespaceMockTokenizer("räksmörgås");
|
TokenStream stream = whitespaceMockTokenizer("räksmörgås_ae_oe_aa_oo_ao_AE_OE_AA_OO_AO");
|
||||||
stream = tokenFilterFactory("ScandinavianNormalization").create(stream);
|
stream = tokenFilterFactory("ScandinavianNormalization").create(stream);
|
||||||
assertTokenStreamContents(stream, new String[] {"ræksmørgås"});
|
assertTokenStreamContents(stream, new String[] {"ræksmørgås_æ_ø_å_ø_å_Æ_Ø_Å_Ø_Å"});
|
||||||
}
|
}
|
||||||
|
|
||||||
/** Test that bogus arguments result in exception */
|
/** Test that bogus arguments result in exception */
|
||||||
|
@ -35,6 +35,7 @@ public class TestScandinavianNormalizationFilterFactory extends BaseTokenStreamF
|
||||||
() -> {
|
() -> {
|
||||||
tokenFilterFactory("ScandinavianNormalization", "bogusArg", "bogusValue");
|
tokenFilterFactory("ScandinavianNormalization", "bogusArg", "bogusValue");
|
||||||
});
|
});
|
||||||
assertTrue(expected.getMessage().contains("Unknown parameters"));
|
assertTrue(
|
||||||
|
"Got " + expected.getMessage(), expected.getMessage().contains("Unknown parameters"));
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -0,0 +1,130 @@
|
||||||
|
/*
|
||||||
|
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||||
|
* contributor license agreements. See the NOTICE file distributed with
|
||||||
|
* this work for additional information regarding copyright ownership.
|
||||||
|
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||||
|
* (the "License"); you may not use this file except in compliance with
|
||||||
|
* the License. You may obtain a copy of the License at
|
||||||
|
*
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
* See the License for the specific language governing permissions and
|
||||||
|
* limitations under the License.
|
||||||
|
*/
|
||||||
|
package org.apache.lucene.analysis.miscellaneous;
|
||||||
|
|
||||||
|
import java.io.IOException;
|
||||||
|
import java.util.Collections;
|
||||||
|
import java.util.Set;
|
||||||
|
import org.apache.lucene.analysis.*;
|
||||||
|
import org.apache.lucene.analysis.miscellaneous.ScandinavianNormalizer.Foldings;
|
||||||
|
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
|
||||||
|
|
||||||
|
/** Tests low level the normalizer functionality */
|
||||||
|
public class TestScandinavianNormalizer extends BaseTokenStreamTestCase {
|
||||||
|
public void testNoFoldings() throws Exception {
|
||||||
|
Analyzer analyzer = createAnalyzer(Collections.emptySet());
|
||||||
|
checkOneTerm(analyzer, "aa", "aa");
|
||||||
|
checkOneTerm(analyzer, "ao", "ao");
|
||||||
|
checkOneTerm(analyzer, "ae", "ae");
|
||||||
|
checkOneTerm(analyzer, "oo", "oo");
|
||||||
|
checkOneTerm(analyzer, "oe", "oe");
|
||||||
|
analyzer.close();
|
||||||
|
}
|
||||||
|
|
||||||
|
public void testAeFolding() throws Exception {
|
||||||
|
Analyzer analyzer = createAnalyzer(Set.of(Foldings.AE));
|
||||||
|
checkOneTerm(analyzer, "aa", "aa");
|
||||||
|
checkOneTerm(analyzer, "ao", "ao");
|
||||||
|
checkOneTerm(analyzer, "ae", "æ");
|
||||||
|
checkOneTerm(analyzer, "aE", "æ");
|
||||||
|
checkOneTerm(analyzer, "Ae", "Æ");
|
||||||
|
checkOneTerm(analyzer, "AE", "Æ");
|
||||||
|
checkOneTerm(analyzer, "oo", "oo");
|
||||||
|
checkOneTerm(analyzer, "oe", "oe");
|
||||||
|
analyzer.close();
|
||||||
|
}
|
||||||
|
|
||||||
|
public void testAaFolding() throws Exception {
|
||||||
|
Analyzer analyzer = createAnalyzer(Set.of(Foldings.AA));
|
||||||
|
checkOneTerm(analyzer, "aa", "å");
|
||||||
|
checkOneTerm(analyzer, "aA", "å");
|
||||||
|
checkOneTerm(analyzer, "Aa", "Å");
|
||||||
|
checkOneTerm(analyzer, "AA", "Å");
|
||||||
|
checkOneTerm(analyzer, "ao", "ao");
|
||||||
|
checkOneTerm(analyzer, "ae", "ae");
|
||||||
|
checkOneTerm(analyzer, "oo", "oo");
|
||||||
|
checkOneTerm(analyzer, "oe", "oe");
|
||||||
|
analyzer.close();
|
||||||
|
}
|
||||||
|
|
||||||
|
public void testOeFolding() throws Exception {
|
||||||
|
Analyzer analyzer = createAnalyzer(Set.of(Foldings.OE));
|
||||||
|
checkOneTerm(analyzer, "aa", "aa");
|
||||||
|
checkOneTerm(analyzer, "ao", "ao");
|
||||||
|
checkOneTerm(analyzer, "ae", "ae");
|
||||||
|
checkOneTerm(analyzer, "oo", "oo");
|
||||||
|
checkOneTerm(analyzer, "oe", "ø");
|
||||||
|
checkOneTerm(analyzer, "oE", "ø");
|
||||||
|
checkOneTerm(analyzer, "Oe", "Ø");
|
||||||
|
checkOneTerm(analyzer, "OE", "Ø");
|
||||||
|
analyzer.close();
|
||||||
|
}
|
||||||
|
|
||||||
|
public void testOoFolding() throws Exception {
|
||||||
|
Analyzer analyzer = createAnalyzer(Set.of(Foldings.OO));
|
||||||
|
checkOneTerm(analyzer, "aa", "aa");
|
||||||
|
checkOneTerm(analyzer, "ao", "ao");
|
||||||
|
checkOneTerm(analyzer, "ae", "ae");
|
||||||
|
checkOneTerm(analyzer, "oo", "ø");
|
||||||
|
checkOneTerm(analyzer, "oO", "ø");
|
||||||
|
checkOneTerm(analyzer, "Oo", "Ø");
|
||||||
|
checkOneTerm(analyzer, "OO", "Ø");
|
||||||
|
checkOneTerm(analyzer, "oe", "oe");
|
||||||
|
analyzer.close();
|
||||||
|
}
|
||||||
|
|
||||||
|
public void testAoFolding() throws Exception {
|
||||||
|
Analyzer analyzer = createAnalyzer(Set.of(Foldings.AO));
|
||||||
|
checkOneTerm(analyzer, "aa", "aa");
|
||||||
|
checkOneTerm(analyzer, "ao", "å");
|
||||||
|
checkOneTerm(analyzer, "aO", "å");
|
||||||
|
checkOneTerm(analyzer, "Ao", "Å");
|
||||||
|
checkOneTerm(analyzer, "AO", "Å");
|
||||||
|
checkOneTerm(analyzer, "ae", "ae");
|
||||||
|
checkOneTerm(analyzer, "oo", "oo");
|
||||||
|
checkOneTerm(analyzer, "oe", "oe");
|
||||||
|
analyzer.close();
|
||||||
|
}
|
||||||
|
|
||||||
|
private Analyzer createAnalyzer(Set<Foldings> foldings) {
|
||||||
|
return new Analyzer() {
|
||||||
|
@Override
|
||||||
|
protected TokenStreamComponents createComponents(String field) {
|
||||||
|
final Tokenizer tokenizer = new MockTokenizer(MockTokenizer.WHITESPACE, false);
|
||||||
|
final TokenStream stream =
|
||||||
|
new TokenFilter(tokenizer) {
|
||||||
|
private final CharTermAttribute charTermAttribute =
|
||||||
|
addAttribute(CharTermAttribute.class);
|
||||||
|
private final ScandinavianNormalizer normalizer =
|
||||||
|
new ScandinavianNormalizer(foldings);
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public boolean incrementToken() throws IOException {
|
||||||
|
if (!input.incrementToken()) {
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
charTermAttribute.setLength(
|
||||||
|
normalizer.processToken(
|
||||||
|
charTermAttribute.buffer(), charTermAttribute.length()));
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
};
|
||||||
|
return new TokenStreamComponents(tokenizer, stream);
|
||||||
|
}
|
||||||
|
};
|
||||||
|
}
|
||||||
|
}
|
|
@ -0,0 +1,120 @@
|
||||||
|
/*
|
||||||
|
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||||
|
* contributor license agreements. See the NOTICE file distributed with
|
||||||
|
* this work for additional information regarding copyright ownership.
|
||||||
|
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||||
|
* (the "License"); you may not use this file except in compliance with
|
||||||
|
* the License. You may obtain a copy of the License at
|
||||||
|
*
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
* See the License for the specific language governing permissions and
|
||||||
|
* limitations under the License.
|
||||||
|
*/
|
||||||
|
package org.apache.lucene.analysis.no;
|
||||||
|
|
||||||
|
import org.apache.lucene.analysis.*;
|
||||||
|
import org.apache.lucene.analysis.core.KeywordTokenizer;
|
||||||
|
import org.apache.lucene.analysis.miscellaneous.ScandinavianNormalizationFilter;
|
||||||
|
|
||||||
|
public class TestNorwegianNormalizationFilter extends BaseTokenStreamTestCase {
|
||||||
|
public void testDefault() throws Exception {
|
||||||
|
Analyzer analyzer = createAnalyzer();
|
||||||
|
|
||||||
|
checkOneTerm(analyzer, "aeäaeeea", "æææeea"); // should not cause ArrayIndexOutOfBoundsException
|
||||||
|
|
||||||
|
checkOneTerm(analyzer, "aeäaeeeae", "æææeeæ");
|
||||||
|
checkOneTerm(analyzer, "aeaeeeae", "ææeeæ");
|
||||||
|
|
||||||
|
checkOneTerm(analyzer, "bøen", "bøen");
|
||||||
|
checkOneTerm(analyzer, "bOEen", "bØen");
|
||||||
|
checkOneTerm(analyzer, "åene", "åene");
|
||||||
|
|
||||||
|
checkOneTerm(analyzer, "blåbærsyltetøj", "blåbærsyltetøj");
|
||||||
|
checkOneTerm(analyzer, "blaabaersyltetöj", "blåbærsyltetøj");
|
||||||
|
checkOneTerm(analyzer, "räksmörgås", "ræksmørgås");
|
||||||
|
checkOneTerm(analyzer, "raeksmörgaas", "ræksmørgås");
|
||||||
|
checkOneTerm(analyzer, "raeksmoergås", "ræksmørgås");
|
||||||
|
|
||||||
|
checkOneTerm(analyzer, "ab", "ab");
|
||||||
|
checkOneTerm(analyzer, "ob", "ob");
|
||||||
|
checkOneTerm(analyzer, "Ab", "Ab");
|
||||||
|
checkOneTerm(analyzer, "Ob", "Ob");
|
||||||
|
|
||||||
|
checkOneTerm(analyzer, "å", "å");
|
||||||
|
|
||||||
|
checkOneTerm(analyzer, "aa", "å");
|
||||||
|
checkOneTerm(analyzer, "aA", "å");
|
||||||
|
checkOneTerm(analyzer, "ao", "ao");
|
||||||
|
checkOneTerm(analyzer, "aO", "aO");
|
||||||
|
|
||||||
|
checkOneTerm(analyzer, "AA", "Å");
|
||||||
|
checkOneTerm(analyzer, "Aa", "Å");
|
||||||
|
checkOneTerm(analyzer, "Ao", "Ao");
|
||||||
|
checkOneTerm(analyzer, "AO", "AO");
|
||||||
|
|
||||||
|
checkOneTerm(analyzer, "æ", "æ");
|
||||||
|
checkOneTerm(analyzer, "ä", "æ");
|
||||||
|
|
||||||
|
checkOneTerm(analyzer, "Æ", "Æ");
|
||||||
|
checkOneTerm(analyzer, "Ä", "Æ");
|
||||||
|
|
||||||
|
checkOneTerm(analyzer, "ae", "æ");
|
||||||
|
checkOneTerm(analyzer, "aE", "æ");
|
||||||
|
|
||||||
|
checkOneTerm(analyzer, "Ae", "Æ");
|
||||||
|
checkOneTerm(analyzer, "AE", "Æ");
|
||||||
|
|
||||||
|
checkOneTerm(analyzer, "ö", "ø");
|
||||||
|
checkOneTerm(analyzer, "ø", "ø");
|
||||||
|
checkOneTerm(analyzer, "Ö", "Ø");
|
||||||
|
checkOneTerm(analyzer, "Ø", "Ø");
|
||||||
|
|
||||||
|
checkOneTerm(analyzer, "oo", "oo");
|
||||||
|
checkOneTerm(analyzer, "oe", "ø");
|
||||||
|
checkOneTerm(analyzer, "oO", "oO");
|
||||||
|
checkOneTerm(analyzer, "oE", "ø");
|
||||||
|
|
||||||
|
checkOneTerm(analyzer, "Oo", "Oo");
|
||||||
|
checkOneTerm(analyzer, "Oe", "Ø");
|
||||||
|
checkOneTerm(analyzer, "OO", "OO");
|
||||||
|
checkOneTerm(analyzer, "OE", "Ø");
|
||||||
|
analyzer.close();
|
||||||
|
}
|
||||||
|
|
||||||
|
/** check that the empty string doesn't cause issues */
|
||||||
|
public void testEmptyTerm() throws Exception {
|
||||||
|
Analyzer a =
|
||||||
|
new Analyzer() {
|
||||||
|
@Override
|
||||||
|
protected TokenStreamComponents createComponents(String fieldName) {
|
||||||
|
Tokenizer tokenizer = new KeywordTokenizer();
|
||||||
|
return new TokenStreamComponents(
|
||||||
|
tokenizer, new ScandinavianNormalizationFilter(tokenizer));
|
||||||
|
}
|
||||||
|
};
|
||||||
|
checkOneTerm(a, "", "");
|
||||||
|
a.close();
|
||||||
|
}
|
||||||
|
|
||||||
|
/** blast some random strings through the analyzer */
|
||||||
|
public void testRandomData() throws Exception {
|
||||||
|
Analyzer analyzer = createAnalyzer();
|
||||||
|
checkRandomData(random(), analyzer, 200 * RANDOM_MULTIPLIER);
|
||||||
|
analyzer.close();
|
||||||
|
}
|
||||||
|
|
||||||
|
private Analyzer createAnalyzer() {
|
||||||
|
return new Analyzer() {
|
||||||
|
@Override
|
||||||
|
protected TokenStreamComponents createComponents(String field) {
|
||||||
|
final Tokenizer tokenizer = new MockTokenizer(MockTokenizer.WHITESPACE, false);
|
||||||
|
final TokenStream stream = new NorwegianNormalizationFilter(tokenizer);
|
||||||
|
return new TokenStreamComponents(tokenizer, stream);
|
||||||
|
}
|
||||||
|
};
|
||||||
|
}
|
||||||
|
}
|
|
@ -0,0 +1,41 @@
|
||||||
|
/*
|
||||||
|
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||||
|
* contributor license agreements. See the NOTICE file distributed with
|
||||||
|
* this work for additional information regarding copyright ownership.
|
||||||
|
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||||
|
* (the "License"); you may not use this file except in compliance with
|
||||||
|
* the License. You may obtain a copy of the License at
|
||||||
|
*
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
* See the License for the specific language governing permissions and
|
||||||
|
* limitations under the License.
|
||||||
|
*/
|
||||||
|
package org.apache.lucene.analysis.no;
|
||||||
|
|
||||||
|
import org.apache.lucene.analysis.BaseTokenStreamFactoryTestCase;
|
||||||
|
import org.apache.lucene.analysis.TokenStream;
|
||||||
|
|
||||||
|
public class TestNorwegianNormalizationFilterFactory extends BaseTokenStreamFactoryTestCase {
|
||||||
|
|
||||||
|
public void testDefault() throws Exception {
|
||||||
|
TokenStream stream = whitespaceMockTokenizer("räksmörgås_ae_oe_aa_oo_ao_AE_OE_AA_OO_AO");
|
||||||
|
stream = tokenFilterFactory("NorwegianNormalization").create(stream);
|
||||||
|
assertTokenStreamContents(stream, new String[] {"ræksmørgås_æ_ø_å_oo_ao_Æ_Ø_Å_OO_AO"});
|
||||||
|
}
|
||||||
|
|
||||||
|
/** Test that bogus arguments result in exception */
|
||||||
|
public void testBogusArguments() throws Exception {
|
||||||
|
IllegalArgumentException expected =
|
||||||
|
expectThrows(
|
||||||
|
IllegalArgumentException.class,
|
||||||
|
() -> {
|
||||||
|
tokenFilterFactory("NorwegianNormalization", "bogusArg", "bogusValue");
|
||||||
|
});
|
||||||
|
assertTrue(
|
||||||
|
"Got " + expected.getMessage(), expected.getMessage().contains("Unknown parameters"));
|
||||||
|
}
|
||||||
|
}
|
Loading…
Reference in New Issue