LUCENE-2842: add Galician analyzer, Portuguese RSLP

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1055892 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Robert Muir 2011-01-06 14:30:37 +00:00
parent 1b22e86417
commit 61872be09d
22 changed files with 2394 additions and 85 deletions

View File

@@ -281,6 +281,9 @@ New features
BooleanModifiersQueryNodeProcessor, for example instead of GroupQueryNodeProcessor.
(Adriano Crestani via Robert Muir)
* LUCENE-2842: Add analyzer for Galician. Also adds the RSLP (Orengo) stemmer
for Portuguese. (Robert Muir)
Build
* LUCENE-2124: Moved the JDK-based collation support from contrib/collation

View File

@@ -0,0 +1,129 @@
package org.apache.lucene.analysis.gl;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.IOException;
import java.io.Reader;
import java.util.Set;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.core.LowerCaseFilter;
import org.apache.lucene.analysis.core.StopFilter;
import org.apache.lucene.analysis.miscellaneous.KeywordMarkerFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.standard.StandardFilter;
import org.apache.lucene.analysis.standard.StandardTokenizer;
import org.apache.lucene.analysis.util.CharArraySet;
import org.apache.lucene.analysis.util.StopwordAnalyzerBase;
import org.apache.lucene.analysis.util.WordlistLoader;
import org.apache.lucene.util.Version;
/**
* {@link Analyzer} for Galician.
*/
public final class GalicianAnalyzer extends StopwordAnalyzerBase {
private final Set<?> stemExclusionSet;
/** File containing default Galician stopwords. */
public final static String DEFAULT_STOPWORD_FILE = "stopwords.txt";
/**
* Returns an unmodifiable instance of the default stop words set.
* @return default stop words set.
*/
public static Set<?> getDefaultStopSet(){
return DefaultSetHolder.DEFAULT_STOP_SET;
}
/**
* Atomically loads the DEFAULT_STOP_SET in a lazy fashion once the outer class
* accesses the static final set the first time.
*/
private static class DefaultSetHolder {
static final Set<?> DEFAULT_STOP_SET;
static {
try {
DEFAULT_STOP_SET = WordlistLoader.getWordSet(GalicianAnalyzer.class,
DEFAULT_STOPWORD_FILE);
} catch (IOException ex) {
// default set should always be present as it is part of the
// distribution (JAR)
throw new RuntimeException("Unable to load default stopword set");
}
}
}
/**
* Builds an analyzer with the default stop words: {@link #DEFAULT_STOPWORD_FILE}.
*/
public GalicianAnalyzer(Version matchVersion) {
this(matchVersion, DefaultSetHolder.DEFAULT_STOP_SET);
}
/**
* Builds an analyzer with the given stop words.
*
* @param matchVersion lucene compatibility version
* @param stopwords a stopword set
*/
public GalicianAnalyzer(Version matchVersion, Set<?> stopwords) {
this(matchVersion, stopwords, CharArraySet.EMPTY_SET);
}
/**
* Builds an analyzer with the given stop words. If a non-empty stem exclusion set is
* provided this analyzer will add a {@link KeywordMarkerFilter} before
* stemming.
*
* @param matchVersion lucene compatibility version
* @param stopwords a stopword set
* @param stemExclusionSet a set of terms not to be stemmed
*/
public GalicianAnalyzer(Version matchVersion, Set<?> stopwords, Set<?> stemExclusionSet) {
super(matchVersion, stopwords);
this.stemExclusionSet = CharArraySet.unmodifiableSet(CharArraySet.copy(
matchVersion, stemExclusionSet));
}
/**
* Creates a
* {@link org.apache.lucene.analysis.util.ReusableAnalyzerBase.TokenStreamComponents}
* which tokenizes all the text in the provided {@link Reader}.
*
* @return A
* {@link org.apache.lucene.analysis.util.ReusableAnalyzerBase.TokenStreamComponents}
* built from a {@link StandardTokenizer} filtered with
* {@link StandardFilter}, {@link LowerCaseFilter}, {@link StopFilter},
* {@link KeywordMarkerFilter} if a stem exclusion set is
* provided and {@link GalicianStemFilter}.
*/
@Override
protected TokenStreamComponents createComponents(String fieldName,
Reader reader) {
final Tokenizer source = new StandardTokenizer(matchVersion, reader);
TokenStream result = new StandardFilter(matchVersion, source);
result = new LowerCaseFilter(matchVersion, result);
result = new StopFilter(matchVersion, result, stopwords);
if(!stemExclusionSet.isEmpty())
result = new KeywordMarkerFilter(result, stemExclusionSet);
result = new GalicianStemFilter(result);
return new TokenStreamComponents(source, result);
}
}
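A minimal usage sketch of the new analyzer, following the standard TokenStream consumer workflow (reset, incrementToken loop, end, close). The field name "body", the sample text, and the choice of Version.LUCENE_31 (the constant already used elsewhere in this commit) are illustrative only, not part of the patch:

import java.io.StringReader;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.gl.GalicianAnalyzer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.util.Version;

public class GalicianAnalyzerDemo {
  public static void main(String[] args) throws Exception {
    GalicianAnalyzer analyzer = new GalicianAnalyzer(Version.LUCENE_31);
    TokenStream ts = analyzer.tokenStream("body", new StringReader("as cancións galegas"));
    CharTermAttribute term = ts.addAttribute(CharTermAttribute.class);
    ts.reset();
    while (ts.incrementToken()) {
      System.out.println(term.toString()); // stopwords removed, remaining terms stemmed
    }
    ts.end();
    ts.close();
  }
}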

View File

@@ -0,0 +1,60 @@
package org.apache.lucene.analysis.gl;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.IOException;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.miscellaneous.KeywordMarkerFilter;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.KeywordAttribute;
/**
* A {@link TokenFilter} that applies {@link GalicianStemmer} to stem
* Galician words.
* <p>
* To prevent terms from being stemmed use an instance of
* {@link KeywordMarkerFilter} or a custom {@link TokenFilter} that sets
* the {@link KeywordAttribute} before this {@link TokenStream}.
* </p>
*/
public final class GalicianStemFilter extends TokenFilter {
private final GalicianStemmer stemmer = new GalicianStemmer();
private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
private final KeywordAttribute keywordAttr = addAttribute(KeywordAttribute.class);
public GalicianStemFilter(TokenStream input) {
super(input);
}
@Override
public boolean incrementToken() throws IOException {
if (input.incrementToken()) {
if (!keywordAttr.isKeyword()) {
// this stemmer increases word length by 1: worst case '*çom' -> '*ción'
final int len = termAtt.length();
final int newlen = stemmer.stem(termAtt.resizeBuffer(len+1), len);
termAtt.setLength(newlen);
}
return true;
} else {
return false;
}
}
}
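As the Javadoc above says, terms can be protected from stemming by setting the KeywordAttribute before this filter, typically with a KeywordMarkerFilter. A hedged sketch of such a chain using only components GalicianAnalyzer already wires up; the protected term "galiza" and the Version constant are arbitrary examples:

import java.io.Reader;
import java.util.Arrays;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.core.LowerCaseFilter;
import org.apache.lucene.analysis.gl.GalicianStemFilter;
import org.apache.lucene.analysis.miscellaneous.KeywordMarkerFilter;
import org.apache.lucene.analysis.standard.StandardTokenizer;
import org.apache.lucene.analysis.util.CharArraySet;
import org.apache.lucene.util.Version;

public class ProtectedGalicianChain {
  // arbitrary example of a term that should pass through unstemmed
  private static final CharArraySet PROTECTED =
      new CharArraySet(Version.LUCENE_31, Arrays.asList("galiza"), false);

  static TokenStream build(Reader reader) {
    Tokenizer source = new StandardTokenizer(Version.LUCENE_31, reader);
    TokenStream result = new LowerCaseFilter(Version.LUCENE_31, source);
    // marks matching terms with KeywordAttribute so GalicianStemFilter skips them
    result = new KeywordMarkerFilter(result, PROTECTED);
    return new GalicianStemFilter(result);
  }
}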

View File

@@ -0,0 +1,83 @@
package org.apache.lucene.analysis.gl;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.util.Map;
import org.apache.lucene.analysis.pt.RSLPStemmerBase;
/**
* Galician stemmer implementing "Regras do lematizador para o galego".
*
* @see RSLPStemmerBase
* @see <a href="http://bvg.udc.es/recursos_lingua/stemming.jsp">Description of rules</a>
*/
public class GalicianStemmer extends RSLPStemmerBase {
private static final Step plural, unification, adverb, augmentative, noun, verb, vowel;
static {
Map<String,Step> steps = parse(GalicianStemmer.class, "galician.rslp");
plural = steps.get("Plural");
unification = steps.get("Unification");
adverb = steps.get("Adverb");
augmentative = steps.get("Augmentative");
noun = steps.get("Noun");
verb = steps.get("Verb");
vowel = steps.get("Vowel");
}
/**
* @param s buffer, oversized to at least <code>len+1</code>
* @param len initial valid length of buffer
* @return new valid length, stemmed
*/
public int stem(char s[], int len) {
assert s.length >= len + 1 : "this stemmer requires an oversized array of at least 1";
len = plural.apply(s, len);
len = unification.apply(s, len);
len = adverb.apply(s, len);
int oldlen;
do {
oldlen = len;
len = augmentative.apply(s, len);
} while (len != oldlen);
oldlen = len;
len = noun.apply(s, len);
if (len == oldlen) { /* suffix not removed */
len = verb.apply(s, len);
}
len = vowel.apply(s, len);
// RSLG accent removal
for (int i = 0; i < len; i++)
switch(s[i]) {
case 'á': s[i] = 'a'; break;
case 'é':
case 'ê': s[i] = 'e'; break;
case 'í': s[i] = 'i'; break;
case 'ó': s[i] = 'o'; break;
case 'ú': s[i] = 'u'; break;
}
return len;
}
}
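Calling the stemmer directly shows the buffer contract the assert above enforces: the char array must be oversized by one because a rule can lengthen the word (the '*çom' -> '*ción' case noted in GalicianStemFilter). A minimal sketch; the input word is an arbitrary example and the exact output depends on the rules in galician.rslp:

import org.apache.lucene.analysis.gl.GalicianStemmer;

public class GalicianStemmerDemo {
  public static void main(String[] args) {
    GalicianStemmer stemmer = new GalicianStemmer();
    String word = "cancións";                     // arbitrary example input
    char[] buffer = new char[word.length() + 1];  // oversized by 1, per the stem() contract
    word.getChars(0, word.length(), buffer, 0);
    int newLen = stemmer.stem(buffer, word.length());
    System.out.println(new String(buffer, 0, newLen));
  }
}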

View File

@@ -0,0 +1,22 @@
<!doctype html public "-//w3c//dtd html 4.0 transitional//en">
<!--
Licensed to the Apache Software Foundation (ASF) under one or more
contributor license agreements. See the NOTICE file distributed with
this work for additional information regarding copyright ownership.
The ASF licenses this file to You under the Apache License, Version 2.0
(the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
-->
<html><head></head>
<body>
Analyzer for Galician.
</body>
</html>

View File

@@ -1,10 +1,5 @@
package org.apache.lucene.analysis.pt;
import java.util.Arrays;
import org.apache.lucene.analysis.util.CharArraySet;
import org.apache.lucene.util.Version;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
@@ -31,89 +26,14 @@ import org.apache.lucene.util.Version;
* which is just the plural reduction step of the RSLP
* algorithm from <i>A Stemming Algorithm for the Portuguese Language</i>,
* Orengo et al.
* @see RSLPStemmerBase
*/
public class PortugueseMinimalStemmer {
public class PortugueseMinimalStemmer extends RSLPStemmerBase {
private static final CharArraySet excIS = new CharArraySet(Version.LUCENE_31,
Arrays.asList("lápis", "cais", "mais", "crúcis", "biquínis", "pois",
"depois","dois","leis"),
false);
private static final CharArraySet excS = new CharArraySet(Version.LUCENE_31,
Arrays.asList("aliás", "pires", "lápis", "cais", "mais", "mas", "menos",
"férias", "fezes", "pêsames", "crúcis", "gás", "atrás", "moisés",
"através", "convés", "ês", "país", "após", "ambas", "ambos",
"messias", "depois"),
false);
private static final Step pluralStep =
parse(PortugueseMinimalStemmer.class, "portuguese.rslp").get("Plural");
public int stem(char s[], int len) {
if (len < 3 || s[len-1] != 's')
return len;
if (s[len-2] == 'n') {
len--;
s[len-1] = 'm';
return len;
}
if (len >= 6 && s[len-3] == 'õ' && s[len-2] == 'e') {
len--;
s[len-2] = 'ã';
s[len-1] = 'o';
return len;
}
if (len >= 4 && s[len-3] == 'ã' && s[len-2] == 'e')
if (!(len == 4 && s[0] == 'm')) {
len--;
s[len-1] = 'o';
return len;
}
if (len >= 4 && s[len-2] == 'i') {
if (s[len-3] == 'a')
if (!(len == 4 && (s[0] == 'c' || s[0] == 'm'))) {
len--;
s[len-1] = 'l';
return len;
}
if (len >= 5 && s[len-3] == 'é') {
len--;
s[len-2] = 'e';
s[len-1] = 'l';
return len;
}
if (len >= 5 && s[len-3] == 'e') {
len--;
s[len-1] = 'l';
return len;
}
if (len >= 5 && s[len-3] == 'ó') {
len--;
s[len-2] = 'o';
s[len-1] = 'l';
return len;
}
if (!excIS.contains(s, 0, len)) {
s[len-1] = 'l';
return len;
}
}
if (len >= 6 && s[len-3] == 'l' && s[len-2] == 'e')
return len - 2;
if (len >= 6 && s[len-3] == 'r' && s[len-2] == 'e')
if (!(len == 7 && s[0] == 'á' && s[1] == 'r' && s[2] == 'v' && s[3] == 'o'))
return len - 2;
if (excS.contains(s, 0, len))
return len;
else
return len-1;
return pluralStep.apply(s, len);
}
}
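After this change the minimal stemmer is just the parsed "Plural" step of portuguese.rslp. A short sketch of the caller's view; per that file's "ões" -> "ão" rule, "balões" reduces to "balão" (the plural rules never grow the word, so the buffer needs no extra slack here):

import org.apache.lucene.analysis.pt.PortugueseMinimalStemmer;

public class MinimalStemDemo {
  public static void main(String[] args) {
    PortugueseMinimalStemmer stemmer = new PortugueseMinimalStemmer();
    char[] buffer = "balões".toCharArray();
    int newLen = stemmer.stem(buffer, buffer.length);
    System.out.println(new String(buffer, 0, newLen)); // balão
  }
}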

View File

@@ -0,0 +1,60 @@
package org.apache.lucene.analysis.pt;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.IOException;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.miscellaneous.KeywordMarkerFilter;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.KeywordAttribute;
/**
* A {@link TokenFilter} that applies {@link PortugueseStemmer} to stem
* Portuguese words.
* <p>
* To prevent terms from being stemmed use an instance of
* {@link KeywordMarkerFilter} or a custom {@link TokenFilter} that sets
* the {@link KeywordAttribute} before this {@link TokenStream}.
* </p>
*/
public final class PortugueseStemFilter extends TokenFilter {
private final PortugueseStemmer stemmer = new PortugueseStemmer();
private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
private final KeywordAttribute keywordAttr = addAttribute(KeywordAttribute.class);
public PortugueseStemFilter(TokenStream input) {
super(input);
}
@Override
public boolean incrementToken() throws IOException {
if (input.incrementToken()) {
if (!keywordAttr.isKeyword()) {
// this stemmer increases word length by 1: worst case '*ã' -> '*ão'
final int len = termAtt.length();
final int newlen = stemmer.stem(termAtt.resizeBuffer(len+1), len);
termAtt.setLength(newlen);
}
return true;
} else {
return false;
}
}
}

View File

@@ -0,0 +1,102 @@
package org.apache.lucene.analysis.pt;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.util.Map;
/**
* Portuguese stemmer implementing the RSLP (Removedor de Sufixos da Lingua Portuguesa)
* algorithm. This is sometimes also referred to as the Orengo stemmer.
*
* @see RSLPStemmerBase
*/
public class PortugueseStemmer extends RSLPStemmerBase {
private static final Step plural, feminine, adverb, augmentative, noun, verb, vowel;
static {
Map<String,Step> steps = parse(PortugueseStemmer.class, "portuguese.rslp");
plural = steps.get("Plural");
feminine = steps.get("Feminine");
adverb = steps.get("Adverb");
augmentative = steps.get("Augmentative");
noun = steps.get("Noun");
verb = steps.get("Verb");
vowel = steps.get("Vowel");
}
/**
* @param s buffer, oversized to at least <code>len+1</code>
* @param len initial valid length of buffer
* @return new valid length, stemmed
*/
public int stem(char s[], int len) {
assert s.length >= len + 1 : "this stemmer requires an oversized array of at least 1";
len = plural.apply(s, len);
len = adverb.apply(s, len);
len = feminine.apply(s, len);
len = augmentative.apply(s, len);
int oldlen = len;
len = noun.apply(s, len);
if (len == oldlen) { /* suffix not removed */
oldlen = len;
len = verb.apply(s, len);
if (len == oldlen) { /* suffix not removed */
len = vowel.apply(s, len);
}
}
// rslp accent removal
for (int i = 0; i < len; i++) {
switch(s[i]) {
case 'à':
case 'á':
case 'â':
case 'ã':
case 'ä':
case 'å': s[i] = 'a'; break;
case 'ç': s[i] = 'c'; break;
case 'è':
case 'é':
case 'ê':
case 'ë': s[i] = 'e'; break;
case 'ì':
case 'í':
case 'î':
case 'ï': s[i] = 'i'; break;
case 'ñ': s[i] = 'n'; break;
case 'ò':
case 'ó':
case 'ô':
case 'õ':
case 'ö': s[i] = 'o'; break;
case 'ù':
case 'ú':
case 'û':
case 'ü': s[i] = 'u'; break;
case 'ý':
case 'ÿ': s[i] = 'y'; break;
}
}
return len;
}
}

View File

@@ -0,0 +1,345 @@
package org.apache.lucene.analysis.pt;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.LineNumberReader;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.apache.lucene.analysis.util.CharArraySet;
import org.apache.lucene.util.Version;
import static org.apache.lucene.analysis.util.StemmerUtil.*;
/**
* Base class for stemmers that use a set of RSLP-like stemming steps.
* <p>
* RSLP (Removedor de Sufixos da Lingua Portuguesa) is an algorithm designed
* originally for stemming the Portuguese language, described in the paper
* <i>A Stemming Algorithm for the Portuguese Language</i>, Orengo et al.
* <p>
* Since then, a plural-only modification (RSLP-S) as well as a modification
* for the Galician language have been implemented. This class parses a configuration
* file that describes {@link Step}s, where each Step contains a set of {@link Rule}s.
* <p>
* The general rule format is:
* <blockquote>{ "suffix", N, "replacement", { "exception1", "exception2", ...}}</blockquote>
* where:
* <ul>
* <li><code>suffix</code> is the suffix to be removed (such as "inho").
* <li><code>N</code> is the min stem size, where stem is defined as the candidate stem
* after removing the suffix (but before appending the replacement!)
* <li><code>replacement</code> is an optional string to append after removing the suffix.
* This can be the empty string.
* <li><code>exceptions</code> is an optional list of exceptions, patterns that should
* not be stemmed. These patterns can be specified as whole word or suffix (ends-with)
* patterns, depending upon the exceptions format flag in the step header.
* </ul>
* <p>
* A step is an ordered list of rules, with a structure in this format:
* <blockquote>{ "name", N, B, { "cond1", "cond2", ... }
* ... rules ... };
* </blockquote>
* where:
* <ul>
* <li><code>name</code> is a name for the step (such as "Plural").
* <li><code>N</code> is the min word size. Words that are less than this length bypass
* the step completely, as an optimization. Note: N can be zero, in this case this
* implementation will automatically calculate the appropriate value from the underlying
* rules.
* <li><code>B</code> is a "boolean" flag specifying how exceptions in the rules are matched.
* A value of 1 indicates whole-word pattern matching, a value of 0 indicates that
* exceptions are actually suffixes and should be matched with ends-with.
* <li><code>conds</code> is an optional list of conditions to enter the step at all. If
* the list is non-empty, then a word must end with one of these conditions or it will
* bypass the step completely as an optimization.
* </ul>
* <p>
* @see <a href="http://www.inf.ufrgs.br/~viviane/rslp/index.htm">RSLP description</a>
* @lucene.internal
*/
public abstract class RSLPStemmerBase {
/**
* A basic rule, with no exceptions.
*/
protected static class Rule {
protected final char suffix[];
protected final char replacement[];
protected final int min;
/**
* Create a rule.
* @param suffix suffix to remove
* @param min minimum stem length
* @param replacement replacement string
*/
public Rule(String suffix, int min, String replacement) {
this.suffix = suffix.toCharArray();
this.replacement = replacement.toCharArray();
this.min = min;
}
/**
* @return true if the word matches this rule.
*/
public boolean matches(char s[], int len) {
return (len - suffix.length >= min && endsWith(s, len, suffix));
}
/**
* @return new valid length of the string after firing this rule.
*/
public int replace(char s[], int len) {
if (replacement.length > 0) {
System.arraycopy(replacement, 0, s, len - suffix.length, replacement.length);
}
return len - suffix.length + replacement.length;
}
}
/**
* A rule with a set of whole-word exceptions.
*/
protected static class RuleWithSetExceptions extends Rule {
protected final CharArraySet exceptions;
public RuleWithSetExceptions(String suffix, int min, String replacement,
String[] exceptions) {
super(suffix, min, replacement);
for (int i = 0; i < exceptions.length; i++) {
if (!exceptions[i].endsWith(suffix))
System.err.println("warning: useless exception '" + exceptions[i] + "' does not end with '" + suffix + "'");
}
this.exceptions = new CharArraySet(Version.LUCENE_31,
Arrays.asList(exceptions), false);
}
@Override
public boolean matches(char s[], int len) {
return super.matches(s, len) && !exceptions.contains(s, 0, len);
}
}
/**
* A rule with a set of exceptional suffixes.
*/
protected static class RuleWithSuffixExceptions extends Rule {
// TODO: use a more efficient datastructure: automaton?
protected final char[][] exceptions;
public RuleWithSuffixExceptions(String suffix, int min, String replacement,
String[] exceptions) {
super(suffix, min, replacement);
for (int i = 0; i < exceptions.length; i++) {
if (!exceptions[i].endsWith(suffix))
System.err.println("warning: useless exception '" + exceptions[i] + "' does not end with '" + suffix + "'");
}
this.exceptions = new char[exceptions.length][];
for (int i = 0; i < exceptions.length; i++)
this.exceptions[i] = exceptions[i].toCharArray();
}
@Override
public boolean matches(char s[], int len) {
if (!super.matches(s, len))
return false;
for (int i = 0; i < exceptions.length; i++)
if (endsWith(s, len, exceptions[i]))
return false;
return true;
}
}
/**
* A step containing a list of rules.
*/
protected static class Step {
protected final String name;
protected final Rule rules[];
protected final int min;
protected final char[][] suffixes;
/**
* Create a new step
* @param name Step's name.
* @param rules an ordered list of rules.
* @param min minimum word size. if this is 0 it is automatically calculated.
* @param suffixes optional list of conditional suffixes. may be null.
*/
public Step(String name, Rule rules[], int min, String suffixes[]) {
this.name = name;
this.rules = rules;
if (min == 0) {
min = Integer.MAX_VALUE;
for (Rule r : rules)
min = Math.min(min, r.min + r.suffix.length);
}
this.min = min;
if (suffixes == null || suffixes.length == 0) {
this.suffixes = null;
} else {
this.suffixes = new char[suffixes.length][];
for (int i = 0; i < suffixes.length; i++)
this.suffixes[i] = suffixes[i].toCharArray();
}
}
/**
* @return new valid length of the string after applying the entire step.
*/
public int apply(char s[], int len) {
if (len < min)
return len;
if (suffixes != null) {
boolean found = false;
for (int i = 0; i < suffixes.length; i++)
if (endsWith(s, len, suffixes[i])) {
found = true;
break;
}
if (!found) return len;
}
for (int i = 0; i < rules.length; i++) {
if (rules[i].matches(s, len))
return rules[i].replace(s, len);
}
return len;
}
}
/**
* Parse a resource file into an RSLP stemmer description.
* @return a Map containing the named Steps in this description.
*/
protected static Map<String,Step> parse(Class<? extends RSLPStemmerBase> clazz, String resource) {
// TODO: this parser is ugly, but works. use a jflex grammar instead.
try {
InputStream is = clazz.getResourceAsStream(resource);
LineNumberReader r = new LineNumberReader(new InputStreamReader(is, "UTF-8"));
Map<String,Step> steps = new HashMap<String,Step>();
String step;
while ((step = readLine(r)) != null) {
Step s = parseStep(r, step);
steps.put(s.name, s);
}
r.close();
return steps;
} catch (IOException e) {
throw new RuntimeException(e);
}
}
private static final Pattern headerPattern =
Pattern.compile("^\\{\\s*\"([^\"]*)\",\\s*([0-9]+),\\s*(0|1),\\s*\\{(.*)\\},\\s*$");
private static final Pattern stripPattern =
Pattern.compile("^\\{\\s*\"([^\"]*)\",\\s*([0-9]+)\\s*\\}\\s*(,|(\\}\\s*;))$");
private static final Pattern repPattern =
Pattern.compile("^\\{\\s*\"([^\"]*)\",\\s*([0-9]+),\\s*\"([^\"]*)\"\\}\\s*(,|(\\}\\s*;))$");
private static final Pattern excPattern =
Pattern.compile("^\\{\\s*\"([^\"]*)\",\\s*([0-9]+),\\s*\"([^\"]*)\",\\s*\\{(.*)\\}\\s*\\}\\s*(,|(\\}\\s*;))$");
private static Step parseStep(LineNumberReader r, String header) throws IOException {
Matcher matcher = headerPattern.matcher(header);
if (!matcher.find()) {
throw new RuntimeException("Illegal Step header specified at line " + r.getLineNumber());
}
assert matcher.groupCount() == 4;
String name = matcher.group(1);
int min = Integer.parseInt(matcher.group(2));
int type = Integer.parseInt(matcher.group(3));
String suffixes[] = parseList(matcher.group(4));
Rule rules[] = parseRules(r, type);
return new Step(name, rules, min, suffixes);
}
private static Rule[] parseRules(LineNumberReader r, int type) throws IOException {
List<Rule> rules = new ArrayList<Rule>();
String line;
while ((line = readLine(r)) != null) {
Matcher matcher = stripPattern.matcher(line);
if (matcher.matches()) {
rules.add(new Rule(matcher.group(1), Integer.parseInt(matcher.group(2)), ""));
} else {
matcher = repPattern.matcher(line);
if (matcher.matches()) {
rules.add(new Rule(matcher.group(1), Integer.parseInt(matcher.group(2)), matcher.group(3)));
} else {
matcher = excPattern.matcher(line);
if (matcher.matches()) {
if (type == 0) {
rules.add(new RuleWithSuffixExceptions(matcher.group(1),
Integer.parseInt(matcher.group(2)),
matcher.group(3),
parseList(matcher.group(4))));
} else {
rules.add(new RuleWithSetExceptions(matcher.group(1),
Integer.parseInt(matcher.group(2)),
matcher.group(3),
parseList(matcher.group(4))));
}
} else {
throw new RuntimeException("Illegal Step rule specified at line " + r.getLineNumber());
}
}
}
if (line.endsWith(";"))
return rules.toArray(new Rule[rules.size()]);
}
return null;
}
private static String[] parseList(String s) {
if (s.isEmpty())
return null;
String list[] = s.split(",");
for (int i = 0; i < list.length; i++)
list[i] = parseString(list[i].trim());
return list;
}
private static String parseString(String s) {
return s.substring(1, s.length()-1);
}
private static String readLine(LineNumberReader r) throws IOException {
String line = null;
while ((line = r.readLine()) != null) {
line = line.trim();
if (!line.isEmpty() && line.charAt(0) != '#')
return line;
}
return line;
}
}
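GalicianStemmer and PortugueseStemmer are the two concrete subclasses in this commit; the same pattern would extend to any language whose rules fit the step/rule syntax described in the class Javadoc. A purely hypothetical sketch: the class name, the resource "example.rslp", and the assumption that it defines a step named "Plural" are not part of the patch:

import java.util.Map;
import org.apache.lucene.analysis.pt.RSLPStemmerBase;

public class ExampleRSLPStemmer extends RSLPStemmerBase {
  private static final Step plural;
  static {
    // "example.rslp" would have to sit next to this class on the classpath
    // and follow the syntax documented in RSLPStemmerBase
    Map<String,Step> steps = parse(ExampleRSLPStemmer.class, "example.rslp");
    plural = steps.get("Plural"); // name must match a step header in the resource
  }

  public int stem(char s[], int len) {
    return plural.apply(s, len);
  }
}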

View File

@@ -56,6 +56,25 @@ public class StemmerUtil {
return true;
}
/**
* Returns true if the character array ends with the suffix.
*
* @param s Input Buffer
* @param len length of input buffer
* @param suffix Suffix string to test
* @return true if <code>s</code> ends with <code>suffix</code>
*/
public static boolean endsWith(char s[], int len, char suffix[]) {
final int suffixLen = suffix.length;
if (suffixLen > len)
return false;
for (int i = suffixLen - 1; i >= 0; i--)
if (s[len -(suffixLen - i)] != suffix[i])
return false;
return true;
}
/**
* Delete a character in-place
*

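A quick sketch of the new helper in isolation, underlining that only the first len characters of the buffer count as valid; the buffer contents are illustrative:

import static org.apache.lucene.analysis.util.StemmerUtil.endsWith;

public class EndsWithDemo {
  public static void main(String[] args) {
    char[] buf = "cancionsxx".toCharArray(); // slack beyond the valid region is ignored
    int len = 8;                             // only "cancions" (8 chars) is valid
    System.out.println(endsWith(buf, len, "ns".toCharArray())); // true
    System.out.println(endsWith(buf, len, "es".toCharArray())); // false
  }
}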
View File

@@ -0,0 +1,647 @@
# Steps file for the RSLP stemmer.
# Step 1: Plural Reduction
{ "Plural", 3, 1, {"s"},
# bons -> bon
{"ns",1,"n",{"luns","furatapóns","furatapons"}},
# xamós -> xamón
{"ós",3,"ón"},
# balões -> balón
{"ões",3,"ón"},
# capitães -> capitão
{"ães",1,"ão",{"mães","magalhães"}},
# normais -> normal
{"ais",2,"al",{"cais","tais","mais","pais","ademais"}},
{"áis",2,"al",{"cáis","táis", "máis", "páis", "ademáis"}},
# papéis -> papel
{"éis",2,"el"},
# posíbeis -> posíbel
{"eis",2,"el"},
# espanhóis -> espanhol
{"óis",2,"ol",{"escornabóis"}},
# caracois -> caracol
{"ois",2,"ol",{"escornabois"}},
# cadrís -> cadril
{"ís",2,"il",{"país"}},
# cadris -> cadril
{"is",2,"il",{"menfis","pais","kinguis"}},
# males -> mal
{"les",2,"l",{"ingles","marselles","montreales","senegales","manizales","móstoles","nápoles"}},
# mares -> mar
{"res",3,"r",{"petres","henares","cáceres","baleares","linares","londres","mieres","miraflores","mércores","venres", "pires"}},
# luces -> luz
{"ces",2,"z"},
# luzes -> luz
{"zes",2,"z"},
# leises -> lei
{"ises",3,"z"},
# animás -> animal
{"ás",1,"al",{"más"}},
# gases -> gas
{"ses",2,"s"},
# casas -> casa
{"s",2,"",{"barbadés","barcelonés","cantonés","gabonés","llanés","medinés","escocés","escocês","francês","barcelonês","cantonês","macramés","reves","barcelones","cantones","gabones","llanes","magallanes","medines","escoces","frances","xoves","martes","aliás","pires","lápis","cais","mais","mas","menos","férias","pêsames","crúcis","país","cangas","atenas","asturias","canarias","filipinas","honduras","molucas","caldas","mascareñas","micenas","covarrubias","psoas","óculos","nupcias","xoves","martes","llanes"}}};
{ "Unification", 0, 0, {},
# cansadíssimo -> cansadísimo
{"íssimo",5,"ísimo"},
# cansadíssima -> cansadísima
{"íssima",5,"ísima"},
# homaço -> homazo
{"aço",4,"azo"},
# mulheraça -> mulheraza
{"aça",4,"aza"},
# xentuça -> xentuza
{"uça",4,"uza"},
# manilhar -> manillar
{"lhar",2,"llar"},
# colher -> coller
{"lher",2,"ller"},
# melhor -> mellor
{"lhor",2,"llor"},
# alho -> allo
{"lho",1,"llo"},
# linhar -> liñar
{"nhar",2,"ñar"},
# penhor -> peñor
{"nhor",2,"ñor"},
# anho -> año
{"nho",1,"ño"},
# cunha -> cuña
{"nha",1,"ña"},
# hospitalário -> hospitalario
{"ário",3,"ario"},
# bibliotecária -> bibliotecaria
{"ária",3,"aria"},
# agradable -> agradábel
{"able",2,"ábel"},
# agradável -> agradábel
{"ável",2,"ábel"},
# imposible -> imposíbel
{"ible",2,"íbel"},
# imposível -> imposíbel
{"ível",2,"íbel"},
# imposiçom -> imposición
{"çom",2,"ción"},
# garagem -> garaxe
{"agem",2,"axe"},
# garage -> garaxe
{"age",2,"axe"},
# impressão -> impressón
{"ão",3,"ón"},
# irmao -> irmán
{"ao",1,"án"},
# irmau -> irmán
{"au",1,"án"},
# garrafom -> garrafón
{"om",3,"ón"},
# cantem -> canten
{"m",2,"n"}};
{ "Adverb", 0, 0, {},
# felizmente -> feliz
{"mente",4,"",{"experimente","vehemente","sedimente"}}};
{ "Augmentative", 0, 1, {},
# cansadísimo -> cansad
{"dísimo",5},
# cansadísima -> cansad
{"dísima",5},
# amabilísimo -> ama
{"bilísimo",3},
# amabilísima -> ama
{"bilísima",3},
# fortísimo -> fort
{"ísimo",3},
# fortísima -> fort
{"ísima",3},
# centésimo -> cent
{"ésimo",3},
# centésima -> cent
{"ésima",3},
# paupérrimo -> paup
{"érrimo",4},
# paupérrima -> paup
{"érrima",4},
# charlatana -> charlat
{"ana",2,"",{"argana","banana","choupana","espadana","faciana","iguana","lantana","macana","membrana","mesana","nirvana","obsidiana","palangana","pavana","persiana","pestana","porcelana","pseudomembrana","roldana","sábana","salangana","saragana","ventana"}},
# charlatán -> charlat
{"án",3,"",{"ademán","bardán","barregán","corricán","curricán","faisán","furacán","fustán","gabán","gabián","galán","gañán","lavacán","mazán","mourán","rabadán","serán","serrán","tabán","titán","tobogán","verán","volcán","volován"}},
# homazo -> hom
{"azo",4,"",{"abrazo","espazo","andazo","bagazo","balazo","bandazo","cachazo","carazo","denazo","engazo","famazo","lampreazo","pantocazo","pedazo","preñazo","regazo","ribazo","sobrazo","terrazo","trompazo"}},
# mulleraza -> muller
{"aza",3,"",{"alcarraza","ameaza","baraza","broucaza","burgaza","cabaza","cachaza","calaza","carpaza","carraza","coiraza","colmaza","fogaza","famaza","labaza","liñaza","melaza","mordaza","paraza","pinaza","rabaza","rapaza","trancaza"}},
# cascallo -> casc
{"allo",4,"",{"traballo"}},
# xentalla -> xent
{"alla",4},
# bocarra -> boc
{"arra",3,"",{"cigarra","cinzarra"}},
# medicastro -> medic
{"astro",3,"",{"balastro","bimbastro","canastro","retropilastro"}},
# poetastra -> poet
{"astra",3,"",{"banastra","canastra","contrapilastra","piastra","pilastra"}},
# corpázio -> corp
{"ázio",3,"",{"topázio"}},
# soutelo -> sout
{"elo",4,"",{"bacelo","barrelo","bicarelo","biquelo","boquelo","botelo","bouquelo","cacarelo","cachelo","cadrelo","campelo","candelo","cantelo","carabelo","carambelo","caramelo","cercelo","cerebelo","chocarelo","coitelo","conchelo","corbelo","cotobelo","couselo","destelo","desvelo","esfácelo","fandelo","fardelo","farelo","farnelo","flabelo","ganchelo","garfelo","involucelo","mantelo","montelo","outerelo","padicelo","pesadelo","pinguelo","piquelo","rampelo","rastrelo","restelo","tornecelo","trabelo","restrelo","portelo","ourelo","zarapelo"}},
# avioneta -> avion
{"eta",3,"",{"arqueta","atleta","avoceta","baioneta","baldeta","banqueta","barraganeta","barreta","borleta","buceta","caceta","calceta","caldeta","cambeta","canaleta","caneta","carreta","cerceta","chaparreta","chapeta","chareta","chincheta","colcheta","cometa","corbeta","corveta","cuneta","desteta","espeta","espoleta","estafeta","esteta","faceta","falanxeta","frasqueta","gaceta","gabeta","galleta","garabeta","gaveta","glorieta","lagareta","lambeta","lanceta","libreta","maceta","macheta","maleta","malleta","mareta","marreta","meseta","mofeta","muleta","peseta","planeta","raqueta","regreta","saqueta","veleta","vendeta","viñeta"}},
# guapete -> guap
{"ete",3,"",{"alfinete","ariete","bacinete","banquete","barallete","barrete","billete","binguelete","birrete","bonete","bosquete","bufete","burlete","cabalete","cacahuete","cavinete","capacete","carrete","casarete","casete","chupete","clarinete","colchete","colete","capete","curupete","disquete","estilete","falsete","ferrete","filete","gallardete","gobelete","inglete","machete","miquelete","molete","mosquete","piquete","ribete","rodete","rolete","roquete","sorvete","vedete","vendete"}},
# práctica -> práct
{"ica",3,"",{"andarica","botánica","botica","dialéctica","dinámica","física","formica","gráfica","marica","túnica"}},
# práctico -> práct
{"ico",3,"",{"conico","acetifico","acidifico"}},
# trapexo -> trap
{"exo",3,"",{"arpexo","arquexo","asexo","axexo","azulexo","badexo","bafexo","bocexo","bosquexo","boubexo","cacarexo","carrexo","cascarexo","castrexo","convexo","cotexo","desexo","despexo","forcexo","gabexo","gargarexo","gorgolexo","inconexo","manexo","merexo","narnexo","padexo","patexo","sopexo","varexo"}},
{"exa",3,"",{"airexa","bandexa","carrexa","envexa","igrexa","larexa","patexa","presexa","sobexa"}},
# multidão -> mult
{"idão",3},
# pequeniño -> pequeno
{"iño",3,"o",{"camiño","cariño","comiño","golfiño","padriño","sobriño","viciño","veciño"}},
# pequeniña -> pequena
{"iña",3,"a",{"camariña","campiña","entreliña","espiña","fariña","moriña","valiña"}},
# grandito -> grand
{"ito",3,""},
# grandita -> grand
{"ita",3,""},
# anomaloide -> anomal
{"oide",3,"",{"anaroide","aneroide","asteroide","axoide","cardioide","celuloide","coronoide","discoide","espermatozoide","espiroide","esquizoide","esteroide","glenoide","linfoide","hemorroide","melaloide","sacaroide","tetraploide","varioloide"}},
# cazola -> caz
{"ola",3,"",{"aixola","ampola","argola","arola","arteríola","bandola","bítola","bractéola","cachola","carambola","carapola","carola","carrandiola","catrapola","cebola","centola","champola","chatola","cirola","cítola","consola","corola","empola","escarola","esmola","estola","fitola","florícola","garañola","gárgola","garxola","glicocola","góndola","mariola","marola","michola","pirola","rebola","rupícola","saxícola","sémola","tachola","tómbola"}},
# pedrolo -> pedr
{"olo",3,"",{"arrolo","babiolo","cacharolo","caixarolo","carolo","carramolo","cascarolo","cirolo","codrolo","correolo","cotrolo","desconsolo","rebolo","repolo","subsolo","tixolo","tómbolo","torolo","trémolo","vacúolo","xermolo","zócolo"}},
# vellote -> vell
{"ote",3,"",{"aigote","alcaiote","barbarote","balote","billote","cachote","camarote","capote","cebote","chichote","citote","cocorote","escote","gañote","garrote","gavote","lamote","lapote","larapote","lingote","lítote","magote","marrote","matalote","pandote","paparote","rebote","tagarote","zarrote"}},
# mozota -> moz
{"ota",3,"",{"asíntota","caiota","cambota","chacota","compota","creosota","curota","derrota","díspota","gamota","maniota","pelota","picota","pillota","pixota","queirota","remota"}},
# gordocho -> gord
{"cho",3,"",{"abrocho","arrocho","carocho","falucho","bombacho","borracho","mostacho"}},
# gordecha -> gord
{"cha",3,"",{"borracha","carracha","estacha","garnacha","limacha","remolacha","abrocha"}},
# baratuco -> barat
{"uco",4,"",{"caduco","estuco","fachuco","malluco","saluco","trabuco"}},
# borrachuzo -> borrach
{"uzo",3,"",{"carriñouzo","fachuzo","mañuzo","mestruzo","tapuzo"}},
# xentuza -> xent
{"uza",3,"",{"barruza","chamuza","chapuza","charamuza","conduza","deduza","desluza","entreluza","induza","reluza","seduza","traduza","trasluza"}},
# babuxa -> bab
{"uxa",3,"",{"caramuxa","carrabouxa","cartuxa","coruxa","curuxa","gaturuxa","maruxa","meruxa","miruxa","moruxa","muruxa","papuxa","rabuxa","trouxa"}},
{"uxo",3,"",{"caramuxo","carouxo","carrabouxo","curuxo","debuxo","ganduxo","influxo","negouxo","pertuxo","refluxo"}},
# grupello -> grup
{"ello",3,"",{"alborello","artello","botello","cachafello","calello","casarello","cazabello","cercello","cocerello","concello","consello","desparello","escaravello","espello","fedello","fervello","gagafello","gorrobello","nortello","pendello","troupello","trebello"}},
# pontella -> pont
{"ella",3,"",{"alborella","bertorella","bocatella","botella","calella","cercella","gadella","grosella","lentella","movella","nocella","noitevella","parella","pelella","percebella","segorella","sabella"}}};
{ "Noun", 0, 0, {},
# lealdade -> leal
{"dade",3,"",{"acridade","calidade"}},
# clarificar -> clar
{"ificar",2},
# brasileiro -> brasil
{"eiro",3,"",{"agoireiro","bardalleiro","braseiro","barreiro","canteiro","capoeiro","carneiro","carteiro","cinceiro","faroleiro","mareiro","preguiceiro","quinteiro","raposeiro","retranqueiro","regueiro","sineiro","troleiro","ventureiro"}},
# marisqueira -> marisqu
{"eira",3,"",{"cabeleira","canteira","cocheira","folleira","milleira"}},
# hospitalario -> hospital
{"ario",3,"",{"armario","calcario","lionario","salario"}},
# bibliotecaria -> bibliotec
{"aria",3,"",{"cetaria","coronaria","fumaria","linaria","lunaria","parietaria","saponaria","serpentaria"}},
# humorístico -> humor
{"ístico",3,"",{"balístico", "ensaístico"}},
# castrista -> castr
{"ista",3,"",{"batista","ciclista","fadista","operista","tenista","verista"}},
# lavado -> lav
{"ado",2,"",{"grado","agrado"}},
# decanato -> decan
{"ato",2,"",{"agnato"}},
# xemido -> xem
{"ido",3,"",{"cándido","cândido","consolido","decidido","duvido","marido","rápido"}},
# mantida -> mant
{"ida",3,"",{"bastida","dúbida","dubida","duvida","ermida","éxida","guarida","lapicida","medida","morida"}},
{"ída",3},
# mantído -> mant
{"ido",3},
# orelludo -> orell
{"udo",3,"",{"estudo","escudo"}},
# orelluda -> orell
{"uda",3},
{"ada",3,"",{"abada","alhada","allada","pitada"}},
# comedela -> come
{"dela",3,"",{"cambadela","cavadela","forcadela","erisipidela","mortadela","espadela","fondedela","picadela","arandela","candela","cordela","escudela","pardela"}},
# fontela -> font
{"ela",3,"",{"canela","capela","cotela","cubela","curupela","escarapela","esparrela","estela","fardela","flanela","fornela","franela","gabela","gamela","gavela","glumela","granicela","lamela","lapela","malvela","manela","manganela","mexarela","micela","mistela","novela","ourela","panela","parcela","pasarela","patamela","patela","paxarela","pipela","pitela","postela","pubela","restela","sabela","salmonela","secuela","sentinela","soldanela","subela","temoncela","tesela","tixela","tramela","trapela","varela","vitela","xanela","xestela"}},
# agradábel -> agrad
{"ábel",2,"",{"afábel","fiábel"}},
# combustíbel -> combust
{"íbel",2,"",{"críbel","imposíbel","posíbel","fisíbel","falíbel"}},
# fabricante -> fabrica
{"nte",3,"",{"alimente","adiante","acrescente","elefante","frequente","freqüente","gigante","instante","oriente","permanente","posante","possante","restaurante"}},
# ignorancia -> ignora
{"ncia",3},
# temperanza -> tempera
{"nza",3},
{"acia",3,"",{"acracia","audacia","falacia","farmacia"}},
# inmundicia -> inmund
{"icia",3,"",{"caricia","delicia","ledicia","malicia","milicia","noticia","pericia","presbicia","primicia","regalicia","sevicia","tiricia"}},
# xustiza -> xust
{"iza",3,"",{"alvariza","baliza","cachiza","caniza","cañiza","carbaliza","carriza","chamariza","chapiza","fraguiza","latiza","longaniza","mañiza","nabiza","peliza","preguiza","rabiza"}},
# clarexar -> clar
{"exar",3,"",{"palmexar"}},
# administración -> administr
{"ación",2,"",{"aeración"}},
# expedición -> exped
{"ición",3,"",{"condición","gornición","monición","nutrición","petición","posición","sedición","volición"}},
# excepción -> except
{"ción",3,"t"},
# comprensión -> comprens
{"sión",3,"s",{"abrasión", "alusión"}},
# doazón -> do
{"azón",2,"",{"armazón"}},
# garrafón -> garraf
{"ón",3,"",{"abalón","acordeón","alción","aldrabón","alerón","aliñón","ambón","bombón","calzón","campón","canalón","cantón","capitón","cañón","centón","ciclón","collón","colofón","copón","cotón","cupón","petón","tirón","tourón","turón","unción","versión","zubón","zurrón"}},
# lambona -> lamb
{"ona",3,"",{"abandona","acetona","aleurona","amazona","anémona","bombona","cambona","carona","chacona","charamona","cincona","condona","cortisona","cretona","cretona","detona","estona","fitohormona","fregona","gerona","hidroquinona","hormona","lesiona","madona","maratona","matrona","metadona","monótona","neurona","pamplona","peptona","poltrona","proxesterona","quinona","quinona","silicona","sulfona"}},
# bretoa -> bretón
{"oa",3,"",{"abandoa","madroa","barbacoa","estoa","airoa","eiroa","amalloa","ámboa","améndoa","anchoa","antinéboa","avéntoa","avoa","bágoa","balboa","bisavoa","boroa","canoa","caroa","comadroa","coroa","éngoa","espácoa","filloa","fírgoa","grañoa","lagoa","lanzoa","magoa","mámoa","morzoa","noiteboa","noraboa","parañoa","persoa","queiroa","rañoa","táboa","tataravoa","teiroa"}},
# demoníaco -> demoní
{"aco",3},
# demoníaca -> demoní
{"aca",3,"",{"alpaca","barraca","bullaca","buraca","carraca","casaca","cavaca","cloaca","entresaca","ervellaca","espinaca","estaca","farraca","millaca","pastinaca","pataca","resaca","urraca","purraca"}},
# carballal -> carball
{"al",4,"",{"afinal","animal","estatal","bisexual","bissexual","desleal","fiscal","formal","pessoal","persoal","liberal","postal","virtual","visual","pontual","puntual","homosexual","heterosexual"}},
# nadador -> nada
{"dor",2,"",{"abaixador"}},
# benfeitor -> benfei
{"tor",3,"",{"autor","motor","pastor","pintor"}},
# produtor -> produt
{"or",2,"",{"asesor","assessor","favor","mellor","melhor","redor","rigor","sensor","tambor","tumor"}},
# profesora -> profes
{"ora",3,"",{"albacora","anáfora","áncora","apisoadora","ardora","ascospora","aurora","avéspora","bitácora","canéfora","cantimplora","catáfora","cepilladora","demora","descalcificadora","diáspora","empacadora","epífora","ecavadora","escora","eslora","espora","fotocompoñedora","fotocopiadora","grampadora","isícora","lavadora","lixadora","macrospora","madrépora","madrágora","masora","mellora","metáfora","microspora","milépora","milpéndora","nécora","oospora","padeadora","pasiflora","pécora","píldora","pólvora","ratinadora","rémora","retroescavadora","sófora","torradora","trémbora","uredospora","víbora","víncora","zoospora"}},
# zapataría -> zapat
{"aría",3,"",{"libraría"}},
# etiquetaxe -> etiquet
{"axe",3,"",{"aluaxe","amaraxe","amperaxe","bagaxe","balaxe","barcaxe","borraxe","bescaxe","cabotaxe","carraxe","cartilaxe","chantaxe","colaxe","coraxe","carruaxe","dragaxe","embalaxe","ensilaxe","epistaxe","fagundaxe","fichaxe","fogaxe","forraxe","fretaxe","friaxe","garaxe","homenaxe","leitaxe","liñaxe","listaxe","maraxe","marcaxe","maridaxe","masaxe","miraxe","montaxe","pasaxe","peaxe","portaxe","ramaxe","rebelaxe","rodaxe","romaxe","sintaxe","sondaxe","tiraxe","vantaxe","vendaxe","viraxe"}},
# movedizo -> move
{"dizo",3},
# limpeza -> limp
{"eza",3,"",{"alteza","beleza","fereza","fineza","vasteza","vileza"}},
# rixidez -> rixid
{"ez",3,"",{"acidez","adultez","adustez","avidez","candidez","mudez","nenez","nudez","pomez"}},
# mullerengo -> muller
{"engo",3},
# chairego -> chair
{"ego",3,"",{"corego","derrego","entrego","lamego","sarego","sartego"}},
# cariñoso -> cariñ
{"oso",3,"",{"afanoso","algoso","caldoso","caloso","cocoso","ditoso","favoso","fogoso","lamoso","mecoso","mocoso","precioso","rixoso","venoso","viroso","xesoso"}},
# cariñosa -> cariñ
{"osa",3,"",{"mucosa","glicosa","baldosa","celulosa","isoglosa","nitrocelulosa","levulosa","ortosa","pectosa","preciosa","sacarosa","serosa","ventosa"}},
# negrume -> negr
{"ume",3,"",{"agrume","albume","alcume","batume","cacume","cerrume","chorume","churume","costume","curtume","estrume","gafume","legume","perfume","queixume","zarrume"}},
# altura -> alt
{"ura",3,"",{"albura","armadura","imatura","costura"}},
# cuspiñar -> cusp
{"iñar",3},
# febril -> febr
{"il",3,"",{"abril","alfil","anil","atril","badil","baril","barril","brasil","cadril","candil","cantil","carril","chamil","chancil","civil","cubil","dátil","difícil","dócil","edil","estéril","fácil","fráxil","funil","fusil","grácil","gradil","hábil","hostil","marfil"}},
# principesco -> princip
{"esco",4},
# mourisco -> mour
{"isco",4},
# esportivo -> esport
{"ivo",3,"",{"pasivo","positivo","passivo","possessivo","posesivo","pexotarivo","relativo"}}};
{ "Verb", 0, 0, {},
# amaba -> am
{"aba",2},
# andabade -> and
{"abade",2},
# andábade -> and
{"ábade",2},
# chorabamo -> chor
{"abamo",2},
# chorábamo -> chor
{"ábamo",2},
# moraban -> mor
{"aban",2},
# andache -> and
{"ache",2},
# andade -> and
{"ade",2},
{"an",2},
# cantando -> cant
{"ando",2},
# cantar -> cant
{"ar",2,"",{"azar","bazar","patamar"}},
# lembrarade -> lembra
{"arade",2},
{"aramo",2},
{"arán",2},
# cantaran -> cant
{"aran",2},
# convidárade -> convid
{"árade",2},
# convidaría -> convid
{"aría",2},
# cantariade -> cant
{"ariade",2},
# cantaríade -> cant
{"aríade",2},
# cantarian -> cant
{"arian",2},
# cantariamo -> cant
{"ariamo",2},
# pescaron -> pesc
{"aron",2},
# cantase -> cant
{"ase",2},
# cantasede -> cant
{"asede",2},
# cantásede -> cant
{"ásede",2},
# cantasemo -> cant
{"asemo",2},
# cantásemo -> cant
{"ásemo",2},
# cantasen -> cant
{"asen",2},
# loitavan -> loitav
{"avan",2},
# cantaríamo -> cant
{"aríamo",2},
# cantassen -> cant
{"assen",2},
# cantássemo -> cant
{"ássemo",2},
# beberíamo -> beb
{"eríamo",2},
# bebêssemo -> beb
{"êssemo",2},
# partiríamo -> part
{"iríamo",3},
# partíssemo -> part
{"íssemo",3},
# cantáramo -> cant
{"áramo",2},
# cantárei -> cant
{"árei",2},
# cantaren -> cant
{"aren",2},
# cantaremo -> cant
{"aremo",2},
# cantaríei -> cant
{"aríei",2},
{"ássei",2},
# cantávamo-> cant
{"ávamo",2},
# bebêramo -> beb
{"êramo",1},
# beberemo -> beb
{"eremo",1},
# beberíei -> beb
{"eríei",1},
# bebêssei -> beb
{"êssei",1},
# partiríamo -> part
{"íramo",3},
# partiremo -> part
{"iremo",3},
# partiríei -> part
{"iríei",3},
# partíssei -> part
{"íssei",3},
# partissen -> part
{"issen",3},
# bebendo -> beb
{"endo",1},
# partindo -> part
{"indo",3},
# propondo -> prop
{"ondo",3},
# cantarde -> cant
{"arde",2},
# cantarei -> cant
{"arei",2},
# cantaria -> cant
{"aria",2},
# cantarmo -> cant
{"armo",2},
# cantasse -> cant
{"asse",2},
{"aste",2},
# cantávei -> cant
{"ávei",2},
# perderão -> perd
{"erão",1},
# beberde -> beb
{"erde",1},
# beberei -> beb
{"erei",1},
# bebêrei -> beb
{"êrei",1},
# beberen -> beb
{"eren",2},
# beberia -> beb
{"eria",1},
# bebermo -> beb
{"ermo",1},
# bebeste -> beb
{"este",1,"",{"faroeste","agreste"}},
# bebíamo -> beb
{"íamo",1},
# fuxian -> fux
{"ian",2,"",{"enfian","eloxian","ensaian"}},
# partirde -> part
{"irde",2},
# partírei -> part
{"irei",3,"",{"admirei"}},
# partiren -> part
{"iren",3},
# partiria -> part
{"iria",3},
# partirmo -> part
{"irmo",3},
# partisse -> part
{"isse",3},
# partiste -> part
{"iste",4},
{"iava",1,"",{"ampliava"}},
# cantamo -> cant
{"amo",2},
# funciona -> func
{"iona",3},
# cantara -> cant
{"ara",2,"",{"arara","prepara"}},
# enviará -> envi
{"ará",2,"",{"alvará","bacará"}},
# cantare -> cant
{"are",2,"",{"prepare"}},
# cantava -> cant
{"ava",2,"",{"agrava"}},
# cantemo -> cant
{"emo",2},
# bebera -> beb
{"era",1,"",{"acelera","espera"}},
# beberá -> beb
{"erá",1},
# bebere -> beb
{"ere",1,"",{"espere"}},
# bebíei -> beb
{"íei",1},
# metin -> met
{"in",3},
# partimo -> part
{"imo",3,"",{"reprimo","intimo","íntimo","nimo","queimo","ximo"}},
# partira -> part
{"ira",3,"",{"fronteira","sátira"}},
{"ído",3},
# partirá -> part
{"irá",3},
# concretizar -> concret
{"tizar",4,"",{"alfabetizar"}},
{"izar",3,"",{"organizar"}},
# saltitar -> salt
{"itar",5,"",{"acreditar","explicitar","estreitar"}},
# partire -> part
{"ire",3,"",{"adquire"}},
# compomo -> comp
{"omo",3},
{"ai",2},
# barbear -> barb
{"ear",4,"",{"alardear","nuclear"}},
# cheguei -> cheg
{"uei",3},
{"uía",5,"u"},
# cantei -> cant
{"ei",3},
# beber -> beb
{"er",1,"",{"éter","pier"}},
# bebeu -> beb
{"eu",1,"",{"chapeu"}},
# bebia -> beb
{"ia",1,"",{"estória","fatia","acia","praia","elogia","mania","lábia","aprecia","polícia","arredia","cheia","ásia"}},
# partir -> part
{"ir",3},
# partiu -> part
{"iu",3},
# fraqueou -> fraqu
{"eou",5},
# chegou -> cheg
{"ou",3},
# bebi -> beb
{"i",1},
# varrede -> varr
{"ede",1,"",{"rede","bípede","céspede","parede","palmípede","vostede","hóspede","adrede"}},
# cantei -> cant
{"ei",3},
# anden -> and
{"en",2},
# descerade -> desc
{"erade",1},
# vivérade -> viv
{"érade",1},
# beberan -> beb
{"eran",2},
# colleramo -> coller
{"eramo",1},
# bebéramo -> beb
{"éramo",1},
# perderán -> perd
{"erán",1},
# varrería -> varr
{"ería",1},
# beberiade -> beb
{"eriade",1},
# beberíade -> beb
{"eríade",1},
# beberiamo -> beb
{"eriamo",1},
# beberian -> beb
{"erian",1},
# beberían -> beb
{"erían",1},
# perderon -> perd
{"eron",1},
# bebese -> beb
{"ese",1},
# bebesedes -> beb
{"esedes",1},
# bebésedes -> beb
{"ésedes",1},
# bebesemo -> beb
{"esemo",1},
# bebésemo -> beb
{"ésemo",1},
# bebesen -> beb
{"esen",1},
# bebêssede -> beb
{"êssede",1},
# chovía -> chov
{"ía",1},
# faciade -> fac
{"iade",1},
# facíade -> fac
{"íade",1},
# perdiamo -> perd
{"iamo",1},
# fuxían -> fux
{"ían",1},
# corriche -> corr
{"iche",1},
# partide -> part
{"ide",1},
# escribirade -> escrib
{"irade",3},
# parírade -> par
{"írade",3},
# partiramo -> part
{"iramo",3},
# fugirán -> fug
{"irán",3},
# viviría -> viv
{"iría",3},
# partiriade -> part
{"iriade",3},
# partiríade -> part
{"iríade",3},
# partiriamo -> part
{"iriamo",3},
# partirian -> part
{"irian",3},
# partirían -> part
{"irían",3},
# reflectiron -> reflect
{"iron",3},
# partise -> part
{"ise",3},
# partisede -> part
{"isede",3},
# partísede -> part
{"ísede",3},
# partisemo -> part
{"isemo",3},
# partísemo -> part
{"ísemo",3},
# partisen -> part
{"isen",3},
# partíssede -> part
{"íssede",3},
{"tizar",3,"",{"alfabetizar"}},
{"ondo",3}};
{ "Vowel", 0, 0, {},
# segue -> seg
{"gue",2,"g",{"azougue","dengue","merengue","nurague","merengue","rengue"}},
{"que",2,"c",{"alambique","albaricoque","abaroque","alcrique","almadraque","almanaque","arenque","arinque","baduloque","ballestrinque","betoque","bivaque","bloque","bodaque","bosque","breque","buque","cacique","cheque","claque","contradique","coque","croque","dique","duque","enroque","espeque","estoque","estoraque","estraloque","estrinque","milicroque","monicreque","orinque","arinque","palenque","parque","penique","picabeque","pique","psique","raque","remolque","xeque","repenique","roque","sotobosque","tabique","tanque","toque","traque","truque","vivaque","xaque"}},
{"a",3,"",{"amasadela","cerva"}},
{"e",3,"",{"marte"}},
{"o",3,"",{"barro","fado","cabo","libro","cervo"}},
{"â",3},
{"ã",3,"",{"amanhã","arapuã","fã","divã","manhã"}},
{"ê",3},
{"ô",3},
{"á",3},
{"é",3},
{"ó",3},
# munxi -> munx
{"i",3}};

View File

@@ -0,0 +1,161 @@
# galician stopwords
a
aínda
alí
aquel
aquela
aquelas
aqueles
aquilo
aquí
ao
aos
as
así
á
ben
cando
che
co
coa
comigo
con
connosco
contigo
convosco
coas
cos
cun
cuns
cunha
cunhas
da
dalgunha
dalgunhas
dalgún
dalgúns
das
de
del
dela
delas
deles
desde
deste
do
dos
dun
duns
dunha
dunhas
e
el
ela
elas
eles
en
era
eran
esa
esas
ese
eses
esta
estar
estaba
está
están
este
estes
estiven
estou
eu
é
facer
foi
foron
fun
había
hai
iso
isto
la
las
lle
lles
lo
los
mais
me
meu
meus
min
miña
miñas
moi
na
nas
neste
nin
no
non
nos
nosa
nosas
noso
nosos
nós
nun
nunha
nuns
nunhas
o
os
ou
ó
ós
para
pero
pode
pois
pola
polas
polo
polos
por
que
se
senón
ser
seu
seus
sexa
sido
sobre
súa
súas
tamén
tan
te
ten
teñen
teño
ter
teu
teus
ti
tido
tiña
tiven
túa
túas
un
unha
unhas
uns
vos
vosa
vosas
voso
vosos
vós

View File

@@ -0,0 +1,456 @@
# Steps file for the RSLP stemmer.
# Step 1: Plural Reduction
{ "Plural", 3, 1, {"s"},
# bons -> bom
{"ns",1,"m"},
# balões -> balão
{"ões",3,"ão"},
# capitães -> capitão
{"ães",1,"ão",{"mães"}},
# normais -> normal
{"ais",1,"al",{"cais","mais"}},
# papéis -> papel
{"éis",2,"el"},
# amáveis -> amável
{"eis",2,"el"},
# lençóis -> lençol
{"óis",2,"ol"},
# barris -> barril
{"is",2,"il",{"lápis","cais","mais","crúcis","biquínis","pois","depois","dois","leis"}},
# males -> mal
{"les",3,"l"},
# mares -> mar
{"res",3,"r", {"árvores"}},
# casas -> casa
{"s",2,"",{"aliás","pires","lápis","cais","mais","mas","menos","férias","fezes","pêsames","crúcis","gás","atrás","moisés","através","convés","ês","país","após","ambas","ambos","messias", "depois"}}};
# Step 2: Adverb Reduction
{ "Adverb", 0, 0, {},
# felizmente -> feliz
{"mente",4,"",{"experimente"}}};
# Step 3: Feminine Reduction
{ "Feminine", 3, 1, {"a","ã"},
# chefona -> chefão
{"ona",3,"ão",{"abandona","lona","iona","cortisona","monótona","maratona","acetona","detona","carona"}},
# vilã -> vilão
{"ã",2,"ão",{"amanhã","arapuã","fã","divã"}},
# professora -> professor
{"ora",3,"or"},
# americana -> americano
{"na",4,"no",{"carona","abandona","lona","iona","cortisona","monótona","maratona","acetona","detona","guiana","campana","grana","caravana","banana","paisana"}},
# sozinha -> sozinho
{"inha",3,"inho",{"rainha","linha","minha"}},
# inglesa -> inglês
{"esa",3,"ês",{"mesa","obesa","princesa","turquesa","ilesa","pesa","presa"}},
# famosa -> famoso
{"osa",3,"oso",{"mucosa","prosa"}},
# maníaca -> maníaco
{"íaca",3,"íaco"},
# prática -> prático
{"ica",3,"ico",{"dica"}},
# cansada -> cansado
{"ada",2,"ado",{"pitada"}},
# mantida -> mantido
{"ida",3,"ido",{"vida","dúvida"}},
{"ída",3,"ido",{"recaída","saída"}},
# prima -> primo
{"ima",3,"imo",{"vítima"}},
# passiva -> passivo
{"iva",3,"ivo",{"saliva","oliva"}},
# primeira -> primeiro
{"eira",3,"eiro",{"beira","cadeira","frigideira","bandeira","feira","capoeira","barreira","fronteira","besteira","poeira"}}};
# Step 4: Augmentative/Diminutive Reduction
{ "Augmentative", 0, 1, {},
# cansadíssimo -> cansad
{"díssimo",5},
# amabilíssimo -> ama
{"abilíssimo",5},
# fortíssimo -> fort
{"íssimo",3},
{"ésimo",3},
# chiquérrimo -> chiqu
{"érrimo",4},
# pezinho -> pe
{"zinho",2},
# maluquinho -> maluc
{"quinho",4,"c"},
# amiguinho -> amig
{"uinho",4},
# cansadinho -> cansad
{"adinho",3},
# carrinho -> carr
{"inho",3,"",{"caminho","cominho"}},
# grandalhão -> grand
{"alhão",4},
# dentuça -> dent
{"uça",4},
# ricaço -> ric
{"aço",4,"",{"antebraço"}},
{"aça",4},
# cansadão -> cans
{"adão",4},
{"idão",4},
# corpázio -> corp
{"ázio",3,"",{"topázio"}},
# pratarraz -> prat
{"arraz",4},
{"zarrão",3},
{"arrão",4},
# bocarra -> boc
{"arra",3},
# calorzão -> calor
{"zão",2,"",{"coalizão"}},
# meninão -> menin
{"ão",3,"",{"camarão","chimarrão","canção","coração","embrião","grotão","glutão","ficção","fogão","feição","furacão","gamão","lampião","leão","macacão","nação","órfão","orgão","patrão","portão","quinhão","rincão","tração","falcão","espião","mamão","folião","cordão","aptidão","campeão","colchão","limão","leilão","melão","barão","milhão","bilhão","fusão","cristão","ilusão","capitão","estação","senão"}}};
# Step 5: Noun Suffix Reduction
{ "Noun", 0, 0, {},
# existencialista -> exist
{"encialista",4},
# minimalista -> minim
{"alista",5},
# contagem -> cont
{"agem",3,"",{"coragem","chantagem","vantagem","carruagem"}},
# gerenciamento -> gerenc
{"iamento",4},
# monitoramento -> monitor
{"amento",3,"",{"firmamento","fundamento","departamento"}},
# nascimento -> nasc
{"imento",3},
{"mento",6,"",{"firmamento","elemento","complemento","instrumento","departamento"}},
# comercializado -> comerci
{"alizado",4},
# traumatizado -> traum
{"atizado",4},
{"tizado",4,"",{"alfabetizado"}},
# alfabetizado -> alfabet
{"izado",5,"",{"organizado","pulverizado"}},
# associativo -> associ
{"ativo",4,"",{"pejorativo","relativo"}},
# contraceptivo -> contracep
{"tivo",4,"",{"relativo"}},
# esportivo -> esport
{"ivo",4,"",{"passivo","possessivo","pejorativo","positivo"}},
# abalado -> abal
{"ado",2,"",{"grado"}},
# impedido -> imped
{"ido",3,"",{"cândido","consolido","rápido","decido","tímido","duvido","marido"}},
# ralador -> ral
{"ador",3},
# entendedor -> entend
{"edor",3},
# cumpridor -> cumpr
{"idor",4,"",{"ouvidor"}},
{"dor",4,"",{"ouvidor"}},
{"sor",4,"",{"assessor"}},
{"atoria",5},
{"tor",3,"",{"benfeitor","leitor","editor","pastor","produtor","promotor","consultor"}},
{"or",2,"",{"motor","melhor","redor","rigor","sensor","tambor","tumor","assessor","benfeitor","pastor","terior","favor","autor"}},
# comparabilidade -> compar
{"abilidade",5},
# abolicionista -> abol
{"icionista",4},
# intervencionista -> interven
{"cionista",5},
{"ionista",5},
{"ionar",5},
# profissional -> profiss
{"ional",4},
# referência -> refer
{"ência",3},
# repugnância -> repugn
{"ância",4,"",{"ambulância"}},
# abatedouro -> abat
{"edouro",3},
# fofoqueiro -> fofoc
{"queiro",3,"c"},
{"adeiro",4,"",{"desfiladeiro"}},
# brasileiro -> brasil
{"eiro",3,"",{"desfiladeiro","pioneiro","mosteiro"}},
{"uoso",3},
# gostoso -> gost
{"oso",3,"",{"precioso"}},
# comercializaç -> comerci
{"alizaç",5},
{"atizaç",5},
{"tizaç",5},
{"izaç",5,"",{"organizaç"}},
# alegaç -> aleg
{"aç",3,"",{"equaç","relaç"}},
# aboliç -> abol
{"iç",3,"",{"eleiç"}},
# anedotário -> anedot
{"ário",3,"",{"voluntário","salário","aniversário","diário","lionário","armário"}},
{"atório",3},
{"rio",5,"",{"voluntário","salário","aniversário","diário","compulsório","lionário","próprio","stério","armário"}},
# ministério -> minist
{"ério",6},
# chinês -> chin
{"ês",4},
# beleza -> bel
{"eza",3},
# rigidez -> rigid
{"ez",4},
# parentesco -> parent
{"esco",4},
# ocupante -> ocup
{"ante",2,"",{"gigante","elefante","adiante","possante","instante","restaurante"}},
# bombástico -> bomb
{"ástico",4,"",{"eclesiástico"}},
{"alístico",3},
{"áutico",4},
{"êutico",4},
{"tico",3,"",{"político","eclesiástico","diagnostico","prático","doméstico","diagnóstico","idêntico","alopático","artístico","autêntico","eclético","crítico","critico"}},
# polêmico -> polêm
{"ico",4,"",{"tico","público","explico"}},
# produtividade -> produt
{"ividade",5},
# profundidade -> profund
{"idade",4,"",{"autoridade","comunidade"}},
# aposentadoria -> aposentad
{"oria",4,"",{"categoria"}},
# existencial -> exist
{"encial",5},
# artista -> art
{"ista",4},
{"auta",5},
# maluquice -> maluc
{"quice",4,"c"},
# chatice -> chat
{"ice",4,"",{"cúmplice"}},
# demoníaco -> demon
{"íaco",3},
# decorrente -> decorr
{"ente",4,"",{"freqüente","alimente","acrescente","permanente","oriente","aparente"}},
{"ense",5},
# criminal -> crim
{"inal",3},
# americano -> americ
{"ano",4},
# amável -> am
{"ável",2,"",{"afável","razoável","potável","vulnerável"}},
# combustível -> combust
{"ível",3,"",{"possível"}},
{"vel",5,"",{"possível","vulnerável","solúvel"}},
{"bil",3,"vel"},
# cobertura -> cobert
{"ura",4,"",{"imatura","acupuntura","costura"}},
{"ural",4},
# consensual -> consens
{"ual",3,"",{"bissexual","virtual","visual","pontual"}},
# mundial -> mund
{"ial",3},
# experimental -> experiment
{"al",4,"",{"afinal","animal","estatal","bissexual","desleal","fiscal","formal","pessoal","liberal","postal","virtual","visual","pontual","sideral","sucursal"}},
{"alismo",4},
{"ivismo",4},
{"ismo",3,"",{"cinismo"}}};
# Step 6: Verb Suffix Reduction
{ "Verb", 0, 0, {},
# cantaríamo -> cant
{"aríamo",2},
# cantássemo -> cant
{"ássemo",2},
# beberíamo -> beb
{"eríamo",2},
# bebêssemo -> beb
{"êssemo",2},
# partiríamo -> part
{"iríamo",3},
# partíssemo -> part
{"íssemo",3},
# cantáramo -> cant
{"áramo",2},
# cantárei -> cant
{"árei",2},
# cantaremo -> cant
{"aremo",2},
# cantariam -> cant
{"ariam",2},
# cantaríei -> cant
{"aríei",2},
# cantássei -> cant
{"ássei",2},
# cantassem -> cant
{"assem",2},
# cantávamo -> cant
{"ávamo",2},
# bebêramo -> beb
{"êramo",3},
# beberemo -> beb
{"eremo",3},
# beberiam -> beb
{"eriam",3},
# beberíei -> beb
{"eríei",3},
# bebêssei -> beb
{"êssei",3},
# bebessem -> beb
{"essem",3},
# partiríamo -> part
{"íramo",3},
# partiremo -> part
{"iremo",3},
# partiriam -> part
{"iriam",3},
# partiríei -> part
{"iríei",3},
# partíssei -> part
{"íssei",3},
# partissem -> part
{"issem",3},
# cantando -> cant
{"ando",2},
# bebendo -> beb
{"endo",3},
# partindo -> part
{"indo",3},
# propondo -> prop
{"ondo",3},
# cantaram -> cant
{"aram",2},
{"arão",2},
# cantarde -> cant
{"arde",2},
# cantarei -> cant
{"arei",2},
# cantarem -> cant
{"arem",2},
# cantaria -> cant
{"aria",2},
# cantarmo -> cant
{"armo",2},
# cantasse -> cant
{"asse",2},
# cantaste -> cant
{"aste",2},
# cantavam -> cant
{"avam",2,"",{"agravam"}},
# cantávei -> cant
{"ávei",2},
# beberam -> beb
{"eram",3},
{"erão",3},
# beberde -> beb
{"erde",3},
# beberei -> beb
{"erei",3},
# bebêrei -> beb
{"êrei",3},
# beberem -> beb
{"erem",3},
# beberia -> beb
{"eria",3},
# bebermo -> beb
{"ermo",3},
# bebesse -> beb
{"esse",3},
# bebeste -> beb
{"este",3,"",{"faroeste","agreste"}},
# bebíamo -> beb
{"íamo",3},
# partiram -> part
{"iram",3},
# concluíram -> conclu
{"íram",3},
{"irão",2},
# partirde -> part
{"irde",2},
# partírei -> part
{"irei",3,"",{"admirei"}},
# partirem -> part
{"irem",3,"",{"adquirem"}},
# partiria -> part
{"iria",3},
# partirmo -> part
{"irmo",3},
# partisse -> part
{"isse",3},
# partiste -> part
{"iste",4},
{"iava",4,"",{"ampliava"}},
# cantamo -> cant
{"amo",2},
{"iona",3},
# cantara -> cant
{"ara",2,"",{"arara","prepara"}},
# cantará -> cant
{"ará",2,"",{"alvará"}},
# cantare -> cant
{"are",2,"",{"prepare"}},
# cantava -> cant
{"ava",2,"",{"agrava"}},
# cantemo -> cant
{"emo",2},
# bebera -> beb
{"era",3,"",{"acelera","espera"}},
# beberá -> beb
{"erá",3},
# bebere -> beb
{"ere",3,"",{"espere"}},
# bebiam -> beb
{"iam",3,"",{"enfiam","ampliam","elogiam","ensaiam"}},
# bebíei -> beb
{"íei",3},
# partimo -> part
{"imo",3,"",{"reprimo","intimo","íntimo","nimo","queimo","ximo"}},
# partira -> part
{"ira",3,"",{"fronteira","sátira"}},
{"ído",3},
# partirá -> part
{"irá",3},
{"tizar",4,"",{"alfabetizar"}},
{"izar",5,"",{"organizar"}},
{"itar",5,"",{"acreditar","explicitar","estreitar"}},
# partire -> part
{"ire",3,"",{"adquire"}},
# compomo -> comp
{"omo",3},
# cantai -> cant
{"ai",2},
# cantam -> cant
{"am",2},
# barbear -> barb
{"ear",4,"",{"alardear","nuclear"}},
# cantar -> cant
{"ar",2,"",{"azar","bazaar","patamar"}},
# cheguei -> cheg
{"uei",3},
{"uía",5,"u"},
# cantei -> cant
{"ei",3},
{"guem",3,"g"},
# cantem -> cant
{"em",2,"",{"alem","virgem"}},
# beber -> beb
{"er",2,"",{"éter","pier"}},
# bebeu -> beb
{"eu",3,"",{"chapeu"}},
# bebia -> beb
{"ia",3,"",{"estória","fatia","acia","praia","elogia","mania","lábia","aprecia","polícia","arredia","cheia","ásia"}},
# partir -> part
{"ir",3,"",{"freir"}},
# partiu -> part
{"iu",3},
{"eou",5},
# chegou -> cheg
{"ou",3},
# bebi -> beb
{"i",3}};
# Step 7: Vowel Removal
{ "Vowel", 0, 0, {},
{"bil",2,"vel"},
{"gue",2,"g",{"gangue","jegue"}},
{"á",3},
{"ê",3,"",{"bebê"}},
# menina -> menin
{"a",3,"",{"ásia"}},
# grande -> grand
{"e",3},
# menino -> menin
{"o",3,"",{"ão"}}};
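
The seven step headers above (Plural, Adverb, Feminine, Augmentative, Noun, Verb, Vowel) each declare a name, a minimum word length, a numeric flag, and an optional list of suffixes the word must end with before any rule in the step is tried. The sketch below illustrates only that per-step gating and the file's declared ordering; it is not the PortugueseStemmer implementation — in the original RSLP description some steps are chained conditionally (verb reduction only when no noun suffix was removed, vowel removal only when neither fired), and the numeric flag in each header is left unmodeled here.

import java.util.List;

// Illustrative only: per-step gating and in-order rule application, not the actual
// RSLPStemmerBase/PortugueseStemmer classes.
final class StepOrderSketch {

  interface Rule {
    /** Returns the stemmed term, or the input unchanged when the rule does not match. */
    String apply(String term);
  }

  static final class Step {
    final int minWordLength;        // e.g. 3 for the "Plural" step
    final String[] requiredEndings; // e.g. {"s"}; empty means the step is always attempted
    final Rule[] rules;

    Step(int minWordLength, String[] requiredEndings, Rule[] rules) {
      this.minWordLength = minWordLength;
      this.requiredEndings = requiredEndings;
      this.rules = rules;
    }

    String apply(String term) {
      if (term.length() < minWordLength) {
        return term;
      }
      if (requiredEndings.length > 0) {
        boolean matchesEnding = false;
        for (String ending : requiredEndings) {
          if (term.endsWith(ending)) {
            matchesEnding = true;
            break;
          }
        }
        if (!matchesEnding) {
          return term;
        }
      }
      for (Rule rule : rules) {
        String stemmed = rule.apply(term);
        if (!stemmed.equals(term)) {
          return stemmed; // within a step, the first rule that fires wins
        }
      }
      return term;
    }
  }

  /** Walks the steps in the order they are declared in the file. */
  static String stem(String term, List<Step> declaredOrder) {
    for (Step step : declaredOrder) {
      term = step.apply(term);
    }
    return term;
  }
}

Tracing the comments above under this simplified ordering: "cantaríamos" loses the plural "s" in the Plural step and then the verb suffix "aríamo" in the Verb step, ending at "cant".
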

View File

@ -0,0 +1,53 @@
package org.apache.lucene.analysis.gl;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.IOException;
import java.util.HashSet;
import java.util.Set;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
public class TestGalicianAnalyzer extends BaseTokenStreamTestCase {
  /** This test fails with NPE when the
   * stopwords file is missing in classpath */
  public void testResourcesAvailable() {
    new GalicianAnalyzer(TEST_VERSION_CURRENT);
  }

  /** test stopwords and stemming */
  public void testBasics() throws IOException {
    Analyzer a = new GalicianAnalyzer(TEST_VERSION_CURRENT);
    // stemming
    checkOneTermReuse(a, "correspondente", "correspond");
    checkOneTermReuse(a, "corresponderá", "correspond");
    // stopword
    assertAnalyzesTo(a, "e", new String[] {});
  }

  /** test use of exclusion set */
  public void testExclude() throws IOException {
    Set<String> exclusionSet = new HashSet<String>();
    exclusionSet.add("correspondente");
    Analyzer a = new GalicianAnalyzer(TEST_VERSION_CURRENT,
        GalicianAnalyzer.getDefaultStopSet(), exclusionSet);
    checkOneTermReuse(a, "correspondente", "correspondente");
    checkOneTermReuse(a, "corresponderá", "correspond");
  }
}

View File

@ -0,0 +1,52 @@
package org.apache.lucene.analysis.gl;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import static org.apache.lucene.analysis.util.VocabularyAssert.assertVocabulary;
import java.io.IOException;
import java.io.Reader;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.core.LowerCaseFilter;
import org.apache.lucene.analysis.standard.StandardTokenizer;
import org.apache.lucene.analysis.util.ReusableAnalyzerBase;
/**
* Simple tests for {@link GalicianStemFilter}
*/
public class TestGalicianStemFilter extends BaseTokenStreamTestCase {
  private Analyzer analyzer = new ReusableAnalyzerBase() {
    @Override
    protected TokenStreamComponents createComponents(String fieldName,
        Reader reader) {
      Tokenizer source = new StandardTokenizer(TEST_VERSION_CURRENT, reader);
      TokenStream result = new LowerCaseFilter(TEST_VERSION_CURRENT, source);
      return new TokenStreamComponents(source, new GalicianStemFilter(result));
    }
  };

  /** Test against a vocabulary from the reference impl */
  public void testVocabulary() throws IOException {
    assertVocabulary(analyzer, getDataFile("gltestdata.zip"), "gl.txt");
  }
}

View File

@ -0,0 +1,69 @@
package org.apache.lucene.analysis.pt;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import static org.apache.lucene.analysis.util.VocabularyAssert.assertVocabulary;
import java.io.IOException;
import java.io.Reader;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.core.LowerCaseFilter;
import org.apache.lucene.analysis.standard.StandardTokenizer;
import org.apache.lucene.analysis.util.ReusableAnalyzerBase;
/**
* Simple tests for {@link PortugueseStemFilter}
*/
public class TestPortugueseStemFilter extends BaseTokenStreamTestCase {
  private Analyzer analyzer = new ReusableAnalyzerBase() {
    @Override
    protected TokenStreamComponents createComponents(String fieldName,
        Reader reader) {
      Tokenizer source = new StandardTokenizer(TEST_VERSION_CURRENT, reader);
      TokenStream result = new LowerCaseFilter(TEST_VERSION_CURRENT, source);
      return new TokenStreamComponents(source, new PortugueseStemFilter(result));
    }
  };

  /**
   * Test the example from the paper "Assessing the impact of stemming accuracy
   * on information retrieval"
   */
  public void testExamples() throws IOException {
    assertAnalyzesTo(
        analyzer,
        "O debate político, pelo menos o que vem a público, parece, de modo nada "
            + "surpreendente, restrito a temas menores. Mas há, evidentemente, "
            + "grandes questões em jogo nas eleições que se aproximam.",
        new String[] {
            "o", "debat", "politic", "pel", "menos", "o", "que", "vem", "a",
            "public", "parec", "de", "mod", "nad", "surpreend", "restrit",
            "a", "tem", "men", "mas", "ha", "evid", "grand", "quest",
            "em", "jog", "na", "eleic", "que", "se", "aproxim"
        });
  }

  /** Test against a vocabulary from the reference impl */
  public void testVocabulary() throws IOException {
    assertVocabulary(analyzer, getDataFile("ptrslptestdata.zip"), "ptrslp.txt");
  }
}

View File

@ -0,0 +1,28 @@
package org.apache.solr.analysis;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.gl.GalicianStemFilter;
/** Factory for {@link GalicianStemFilter} */
public class GalicianStemFilterFactory extends BaseTokenFilterFactory {
  public TokenStream create(TokenStream input) {
    return new GalicianStemFilter(input);
  }
}

View File

@ -0,0 +1,28 @@
package org.apache.solr.analysis;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.pt.PortugueseStemFilter;
/** Factory for {@link PortugueseStemFilter} */
public class PortugueseStemFilterFactory extends BaseTokenFilterFactory {
  public TokenStream create(TokenStream input) {
    return new PortugueseStemFilter(input);
  }
}

View File

@ -0,0 +1,36 @@
package org.apache.solr.analysis;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.Reader;
import java.io.StringReader;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.core.WhitespaceTokenizer;
/**
* Simple tests to ensure the Galician stem factory is working.
*/
public class TestGalicianStemFilterFactory extends BaseTokenTestCase {
  public void testStemming() throws Exception {
    Reader reader = new StringReader("cariñosa");
    GalicianStemFilterFactory factory = new GalicianStemFilterFactory();
    TokenStream stream = factory.create(new WhitespaceTokenizer(DEFAULT_VERSION, reader));
    assertTokenStreamContents(stream, new String[] { "cariñ" });
  }
}

View File

@ -0,0 +1,36 @@
package org.apache.solr.analysis;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.Reader;
import java.io.StringReader;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.core.WhitespaceTokenizer;
/**
* Simple tests to ensure the Portuguese stem factory is working.
*/
public class TestPortugueseStemFilterFactory extends BaseTokenTestCase {
  public void testStemming() throws Exception {
    Reader reader = new StringReader("maluquice");
    PortugueseStemFilterFactory factory = new PortugueseStemFilterFactory();
    TokenStream stream = factory.create(new WhitespaceTokenizer(DEFAULT_VERSION, reader));
    assertTokenStreamContents(stream, new String[] { "maluc" });
  }
}