LUCENE-2298: Add stempel, an algorithmic stemmer with included Polish support

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@940433 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Robert Muir 2010-05-03 12:44:22 +00:00
parent 220f9ee81c
commit 98c47c57e0
46 changed files with 4243 additions and 18 deletions

View File

@ -38,6 +38,15 @@ stopword list that is BSD-licensed created by Jacques Savoy. The file resides i
contrib/analyzers/common/src/resources/org/apache/lucene/analysis/hi/stopwords.txt.
See http://members.unine.ch/jacques.savoy/clef/index.html.
The Stempel analyzer (contrib/analyzers) includes BSD-licensed software developed
by the Egothor project http://egothor.sf.net/, created by Leo Galambos, Martin Kvapil,
and Edmond Nolan.
The Polish analyzer (contrib/analyzers) comes with a default
stopword list that is BSD-licensed created by the Carrot2 project. The file resides
in contrib/analyzers/stempel/src/resources/org/apache/lucene/analysis/pl/stopwords.txt.
See http://project.carrot2.org/license.html.
Includes lib/servlet-api-2.4.jar from Apache Tomcat
The SmartChineseAnalyzer source code (under contrib/analyzers) was

View File

@ -317,6 +317,7 @@ The source distribution does not contain sources of the previous Lucene Java ver
<packageset dir="contrib/analyzers/common/src/java"/>
<packageset dir="contrib/analyzers/smartcn/src/java"/>
<packageset dir="contrib/analyzers/stempel/src/java"/>
<packageset dir="contrib/ant/src/java"/>
<packageset dir="contrib/benchmark/src/java"/>
<packageset dir="contrib/icu/src/java"/>
@ -345,7 +346,7 @@ The source distribution does not contain sources of the previous Lucene Java ver
<group title="Demo" packages="org.apache.lucene.demo*"/>
<group title="contrib: Analysis" packages="org.apache.lucene.analysis.*:org.tartarus.snowball*"/>
<group title="contrib: Analysis" packages="org.apache.lucene.analysis.*:org.tartarus.snowball*:org.egothor.stemmer*"/>
<group title="contrib: Ant" packages="org.apache.lucene.ant*"/>
<group title="contrib: Benchmark" packages="org.apache.lucene.benchmark*"/>
<group title="contrib: ICU" packages="org.apache.lucene.collation*"/>

View File

@ -137,6 +137,9 @@ New features
sensitive way, either from ICU built-in rules (such as Traditional-Simplified),
or from rules you write yourself. (Robert Muir)
* LUCENE-2298: Add analyzers/stempel, an algorithmic stemmer with support for
the Polish language. (Andrzej Bialecki via Robert Muir)
Build
* LUCENE-2124: Moved the JDK-based collation support from contrib/collation

View File

@ -23,6 +23,7 @@
Additional Analyzers
- common: Additional Analyzers
- smartcn: Smart Analyzer for Simplified Chinese Text
- stempel: Algorithmic Stemmer for Polish
</description>
<target name="common">
@ -33,23 +34,31 @@
<ant dir="smartcn" />
</target>
<target name="default" depends="common,smartcn" />
<target name="stempel">
<ant dir="stempel" />
</target>
<target name="default" depends="common,smartcn,stempel" />
<target name="clean">
<ant dir="common" target="clean" />
<ant dir="smartcn" target="clean" />
<ant dir="stempel" target="clean" />
</target>
<target name="compile-core">
<ant dir="common" target="compile-core" />
<ant dir="smartcn" target="compile-core" />
<ant dir="stempel" target="compile-core" />
</target>
<target name="compile-test">
<ant dir="common" target="compile-test" />
<ant dir="smartcn" target="compile-test" />
<ant dir="stempel" target="compile-test" />
</target>
<target name="test">
<ant dir="common" target="test" />
<ant dir="smartcn" target="test" />
<ant dir="stempel" target="test" />
</target>
<target name="build-artifacts-and-tests" depends="default,compile-test" />
@ -57,16 +66,19 @@
<target name="dist-maven" depends="default">
<ant dir="common" target="dist-maven" />
<ant dir="smartcn" target="dist-maven" />
<ant dir="stempel" target="dist-maven" />
</target>
<target name="javadocs">
<ant dir="common" target="javadocs" />
<ant dir="smartcn" target="javadocs" />
<ant dir="stempel" target="javadocs" />
</target>
<target name="javadocs-index.html">
<ant dir="common" target="javadocs-index.html" />
<ant dir="smartcn" target="javadocs-index.html" />
<ant dir="stempel" target="javadocs-index.html" />
</target>
</project>

View File

@ -0,0 +1,38 @@
<?xml version="1.0"?>
<!--
Licensed to the Apache Software Foundation (ASF) under one or more
contributor license agreements. See the NOTICE file distributed with
this work for additional information regarding copyright ownership.
The ASF licenses this file to You under the Apache License, Version 2.0
(the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
-->
<project name="stempel" default="default">
<!-- No targets are declared in this file; the "default" target is expected to be supplied by the imported contrib-build.xml. -->
<description>
Stempel Analyzer
</description>
<!-- Output locations for this module, resolved relative to this build file. -->
<property name="build.dir" location="../../../build/contrib/analyzers/stempel" />
<property name="dist.dir" location="../../../dist/contrib/analyzers/stempel" />
<property name="maven.dist.dir" location="../../../dist/maven" />
<!-- NOTE(review): the properties above are set before the import, presumably so that they win over any defaults declared in contrib-build.xml; confirm before reordering. -->
<import file="../../contrib-build.xml"/>
<!-- Test classpath: the "classpath" and "junit-path" references (not defined in this file, presumably from the import), the core test classes directory, and this module's compiled classes. -->
<path id="test.classpath">
<path refid="classpath"/>
<pathelement location="../../../build/classes/test/"/>
<path refid="junit-path"/>
<pathelement location="${build.dir}/classes/java"/>
</path>
</project>

View File

@ -0,0 +1,35 @@
<project xmlns="http://maven.apache.org/POM/4.0.0"
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/maven-v4_0_0.xsd">
<!--
Licensed to the Apache Software Foundation (ASF) under one
or more contributor license agreements. See the NOTICE file
distributed with this work for additional information
regarding copyright ownership. The ASF licenses this file
to you under the Apache License, Version 2.0 (the
"License"); you may not use this file except in compliance
with the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing,
software distributed under the License is distributed on an
"AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
KIND, either express or implied. See the License for the
specific language governing permissions and limitations
under the License.
-->
<!-- POM for the lucene-stempel jar artifact. -->
<!-- NOTE(review): @version@ is not a Maven expression; it is presumably a token substituted by the Ant build when the POM is generated. Confirm before replacing it. -->
<modelVersion>4.0.0</modelVersion>
<!-- Inherits from the shared lucene-contrib parent POM. -->
<parent>
<groupId>org.apache.lucene</groupId>
<artifactId>lucene-contrib</artifactId>
<version>@version@</version>
</parent>
<!-- Coordinates of this module; groupId and version repeat the values given for the parent. -->
<groupId>org.apache.lucene</groupId>
<artifactId>lucene-stempel</artifactId>
<name>Lucene Stempel Analyzer</name>
<version>@version@</version>
<description>Stempel Analyzer</description>
<packaging>jar</packaging>
</project>

View File

@ -0,0 +1,154 @@
package org.apache.lucene.analysis.pl;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.BufferedInputStream;
import java.io.DataInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.Reader;
import java.util.Set;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.CharArraySet;
import org.apache.lucene.analysis.KeywordMarkerFilter;
import org.apache.lucene.analysis.LowerCaseFilter;
import org.apache.lucene.analysis.StopFilter;
import org.apache.lucene.analysis.StopwordAnalyzerBase;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.WordlistLoader;
import org.apache.lucene.analysis.standard.StandardFilter;
import org.apache.lucene.analysis.standard.StandardTokenizer;
import org.apache.lucene.analysis.stempel.StempelStemmer;
import org.apache.lucene.analysis.stempel.StempelFilter;
import org.apache.lucene.util.Version;
import org.egothor.stemmer.Trie;
/**
* {@link Analyzer} for Polish.
*/
public final class PolishAnalyzer extends StopwordAnalyzerBase {
  /** Terms that are marked as keywords and therefore never stemmed. */
  private final Set<?> stemExclusionSet;
  /** Stemming table handed to every {@link StempelStemmer} created by this analyzer. */
  private final Trie stemTable;

  /** File containing default Polish stopwords. */
  public final static String DEFAULT_STOPWORD_FILE = "stopwords.txt";

  /**
   * Returns the default stop words set.
   * <p>
   * NOTE(review): the set is returned exactly as produced by
   * {@link WordlistLoader}; whether it is unmodifiable depends on that loader
   * and should be confirmed before callers rely on it.
   *
   * @return default stop words set.
   */
  public static Set<?> getDefaultStopSet() {
    return DefaultsHolder.DEFAULT_STOP_SET;
  }

  /**
   * Loads the default stop word set and the default stemming table once, when
   * this holder class is first accessed.
   */
  private static class DefaultsHolder {
    static final Set<?> DEFAULT_STOP_SET;
    static final Trie DEFAULT_TABLE;

    static {
      try {
        DEFAULT_STOP_SET = WordlistLoader.getWordSet(PolishAnalyzer.class,
            DEFAULT_STOPWORD_FILE);
      } catch (IOException ex) {
        // default set should always be present as it is part of the
        // distribution (JAR)
        throw new RuntimeException("Unable to load default stopword set", ex);
      }

      try {
        DEFAULT_TABLE = loadDefaultTable();
      } catch (IOException ex) {
        // default table should always be present as it is part of the
        // distribution (JAR)
        throw new RuntimeException("Unable to load default stemming tables", ex);
      }
    }

    /**
     * Reads the bundled stemming table from the classpath.
     *
     * @return the table, as a {@code Trie} or a {@code MultiTrie2} depending on
     *         the method string stored at the start of the resource
     * @throws IOException if the resource is missing or cannot be read
     */
    private static Trie loadDefaultTable() throws IOException {
      InputStream stream = PolishAnalyzer.class.getResourceAsStream("stemmer_20000.tbl");
      if (stream == null) {
        // fail with a readable cause instead of an NPE deep inside the stream classes
        throw new IOException("Resource stemmer_20000.tbl not found next to "
            + PolishAnalyzer.class.getName());
      }
      DataInputStream in = new DataInputStream(new BufferedInputStream(stream));
      try {
        // the header names the compilation method; an 'M' selects the multi-trie format
        String method = in.readUTF().toUpperCase();
        if (method.indexOf('M') < 0) {
          return new Trie(in);
        } else {
          return new org.egothor.stemmer.MultiTrie2(in);
        }
      } finally {
        // release the stream even when the table is truncated or corrupt
        in.close();
      }
    }
  }

  /**
   * Builds an analyzer with the default stop words: {@link #DEFAULT_STOPWORD_FILE}.
   *
   * @param matchVersion lucene compatibility version
   */
  public PolishAnalyzer(Version matchVersion) {
    this(matchVersion, DefaultsHolder.DEFAULT_STOP_SET);
  }

  /**
   * Builds an analyzer with the given stop words.
   *
   * @param matchVersion lucene compatibility version
   * @param stopwords a stopword set
   */
  public PolishAnalyzer(Version matchVersion, Set<?> stopwords) {
    this(matchVersion, stopwords, CharArraySet.EMPTY_SET);
  }

  /**
   * Builds an analyzer with the given stop words. If a non-empty stem exclusion set is
   * provided this analyzer will add a {@link KeywordMarkerFilter} before
   * stemming.
   *
   * @param matchVersion lucene compatibility version
   * @param stopwords a stopword set
   * @param stemExclusionSet a set of terms not to be stemmed
   */
  public PolishAnalyzer(Version matchVersion, Set<?> stopwords, Set<?> stemExclusionSet) {
    super(matchVersion, stopwords);
    this.stemTable = DefaultsHolder.DEFAULT_TABLE;
    // keep a private, read-only copy of the caller's set
    this.stemExclusionSet = CharArraySet.unmodifiableSet(CharArraySet.copy(
        matchVersion, stemExclusionSet));
  }

  /**
   * Creates a
   * {@link org.apache.lucene.analysis.ReusableAnalyzerBase.TokenStreamComponents}
   * which tokenizes all the text in the provided {@link Reader}.
   *
   * @return A
   *         {@link org.apache.lucene.analysis.ReusableAnalyzerBase.TokenStreamComponents}
   *         built from an {@link StandardTokenizer} filtered with
   *         {@link StandardFilter}, {@link LowerCaseFilter}, {@link StopFilter}
   *         , {@link KeywordMarkerFilter} if a stem exclusion set is
   *         provided and {@link StempelFilter}.
   */
  @Override
  protected TokenStreamComponents createComponents(String fieldName,
      Reader reader) {
    final Tokenizer source = new StandardTokenizer(matchVersion, reader);
    TokenStream result = new StandardFilter(source);
    result = new LowerCaseFilter(matchVersion, result);
    result = new StopFilter(matchVersion, result, stopwords);
    if (!stemExclusionSet.isEmpty()) {
      result = new KeywordMarkerFilter(result, stemExclusionSet);
    }
    // a new StempelStemmer is created for each set of components; the table is shared
    result = new StempelFilter(result, new StempelStemmer(stemTable));
    return new TokenStreamComponents(source, result);
  }
}

View File

@ -0,0 +1,22 @@
<!doctype html public "-//w3c//dtd html 4.0 transitional//en">
<!--
Licensed to the Apache Software Foundation (ASF) under one or more
contributor license agreements. See the NOTICE file distributed with
this work for additional information regarding copyright ownership.
The ASF licenses this file to You under the Apache License, Version 2.0
(the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
-->
<html><head></head>
<body>
Analyzer for Polish.
</body>
</html>

View File

@ -0,0 +1,83 @@
/**
* Copyright 2004 The Apache Software Foundation
*
* Licensed under the Apache License, Version 2.0 (the "License"); you may
* not use this file except in compliance with the License. You may obtain a
* copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
* License for the specific language governing permissions and limitations
* under the License.
*/
package org.apache.lucene.analysis.stempel;
import java.io.IOException;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.KeywordAttribute;
/**
 * Token filter that replaces each term with the stem produced by a
 * {@link StempelStemmer}.
 * <p>
 * Note: the terms reaching this filter must already be lower case, so a
 * LowerCaseFilter or LowerCaseTokenizer has to run earlier in the analysis
 * chain for stemming to work properly.
 */
public final class StempelFilter extends TokenFilter {
  /**
   * Default length threshold. Only terms strictly longer than the threshold
   * are stemmed; terms of this length or shorter are returned unchanged.
   */
  public static final int DEFAULT_MIN_LENGTH = 3;

  private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
  private final KeywordAttribute keywordAtt = addAttribute(KeywordAttribute.class);
  private final StempelStemmer stemmer;
  private final int minLength;

  /**
   * Creates a filter that uses {@link #DEFAULT_MIN_LENGTH} as its length
   * threshold.
   *
   * @param in input token stream
   * @param stemmer stemmer
   */
  public StempelFilter(TokenStream in, StempelStemmer stemmer) {
    this(in, stemmer, DEFAULT_MIN_LENGTH);
  }

  /**
   * Creates a filter with an explicit length threshold.
   *
   * @param in input token stream
   * @param stemmer stemmer
   * @param minLength length threshold: terms whose length does not exceed
   *          this value are passed through without being stemmed
   */
  public StempelFilter(TokenStream in, StempelStemmer stemmer, int minLength) {
    super(in);
    this.stemmer = stemmer;
    this.minLength = minLength;
  }

  /**
   * Advances to the next input token and stems its term in place when the
   * token is eligible.
   *
   * @return false once the input is exhausted, true otherwise
   */
  @Override
  public boolean incrementToken() throws IOException {
    if (!input.incrementToken()) {
      return false;
    }
    if (keywordAtt.isKeyword()) {
      // keyword tokens are never stemmed
      return true;
    }
    if (termAtt.length() <= minLength) {
      // too short to be worth stemming
      return true;
    }
    final StringBuilder stem = stemmer.stem(termAtt);
    if (stem != null) {
      termAtt.setEmpty().append(stem);
    }
    // a null stem means the stemmer had no result; the term stays as it was
    return true;
  }
}

View File

@ -0,0 +1,92 @@
/**
* Copyright 2004 The Apache Software Foundation
*
* Licensed under the Apache License, Version 2.0 (the "License"); you may not
* use this file except in compliance with the License. You may obtain a copy of
* the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
* License for the specific language governing permissions and limitations under
* the License.
*/
package org.apache.lucene.analysis.stempel;
import java.io.BufferedInputStream;
import java.io.DataInputStream;
import java.io.IOException;
import java.io.InputStream;
import org.egothor.stemmer.Diff;
import org.egothor.stemmer.Trie;
/**
 * <p>
 * Stemmer class is a convenient facade for other stemmer-related classes. The
 * core stemming algorithm and its implementation is taken verbatim from the
 * Egothor project ( <a href="http://www.egothor.org">www.egothor.org </a>).
 * </p>
 * <p>
 * Even though the stemmer tables supplied in the distribution package are built
 * for Polish language, there is nothing language-specific here.
 * </p>
 * <p>
 * An instance owns a single internal buffer that {@link #stem(CharSequence)}
 * reuses and returns on every call, so the result of one call is overwritten
 * by the next one.
 * </p>
 */
public class StempelStemmer {
  /** Stemming table; stays null when the stream constructor was given null. */
  private Trie stemmer = null;
  /** Scratch buffer reused, and returned, by every call to stem(). */
  private StringBuilder buffer = new StringBuilder();

  /**
   * Create a Stemmer using selected stemmer table.
   * <p>
   * The stream is always closed before this constructor returns, including
   * when reading the table fails. Passing null creates a stemmer without a
   * table, for which {@link #stem(CharSequence)} always returns null.
   *
   * @param stemmerTable stemmer table.
   * @throws IOException if the table cannot be read
   */
  public StempelStemmer(InputStream stemmerTable) throws IOException {
    if (stemmerTable == null) {
      return;
    }

    DataInputStream in = new DataInputStream(new BufferedInputStream(
        stemmerTable));
    try {
      // the header names the compilation method; an 'M' selects the multi-trie format
      String method = in.readUTF().toUpperCase();
      if (method.indexOf('M') < 0) {
        stemmer = new Trie(in);
      } else {
        stemmer = new org.egothor.stemmer.MultiTrie2(in);
      }
    } finally {
      // do not leak the stream when the table is truncated or corrupt
      in.close();
    }
  }

  /**
   * Create a Stemmer using pre-loaded stemmer table
   *
   * @param stemmer pre-loaded stemmer table
   */
  public StempelStemmer(Trie stemmer) {
    this.stemmer = stemmer;
  }

  /**
   * Stem a word.
   *
   * @param word input word to be stemmed.
   * @return stemmed word, or null if the stem could not be generated. The
   *         returned builder is this stemmer's internal buffer and is
   *         overwritten by the next call.
   */
  public StringBuilder stem(CharSequence word) {
    if (stemmer == null) {
      // no table loaded: report "no stem" instead of failing with an NPE
      return null;
    }
    CharSequence cmd = stemmer.getLastOnPath(word);
    if (cmd == null) {
      return null;
    }

    // apply the patch command to a copy of the word held in the shared buffer
    buffer.setLength(0);
    buffer.append(word);
    Diff.apply(buffer, cmd);

    return buffer.length() > 0 ? buffer : null;
  }
}

View File

@ -0,0 +1,22 @@
<!doctype html public "-//w3c//dtd html 4.0 transitional//en">
<!--
Licensed to the Apache Software Foundation (ASF) under one or more
contributor license agreements. See the NOTICE file distributed with
this work for additional information regarding copyright ownership.
The ASF licenses this file to You under the Apache License, Version 2.0
(the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
-->
<html>
<body>
<p>Stempel: Algorithmic Stemmer</p>
</body>
</html>

View File

@ -0,0 +1,94 @@
/*
Egothor Software License version 1.00
Copyright (C) 1997-2004 Leo Galambos.
Copyright (C) 2002-2004 "Egothor developers"
on behalf of the Egothor Project.
All rights reserved.
This software is copyrighted by the "Egothor developers". If this
license applies to a single file or document, the "Egothor developers"
are the people or entities mentioned as copyright holders in that file
or document. If this license applies to the Egothor project as a
whole, the copyright holders are the people or entities mentioned in
the file CREDITS. This file can be found in the same location as this
license in the distribution.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, the list of contributors, this list of conditions, and the
following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, the list of contributors, this list of conditions, and the
disclaimer that follows these conditions in the documentation
and/or other materials provided with the distribution.
3. The name "Egothor" must not be used to endorse or promote products
derived from this software without prior written permission. For
written permission, please contact Leo.G@seznam.cz
4. Products derived from this software may not be called "Egothor",
nor may "Egothor" appear in their name, without prior written
permission from Leo.G@seznam.cz.
In addition, we request that you include in the end-user documentation
provided with the redistribution and/or in the software itself an
acknowledgement equivalent to the following:
"This product includes software developed by the Egothor Project.
http://egothor.sf.net/"
THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
IN NO EVENT SHALL THE EGOTHOR PROJECT OR ITS CONTRIBUTORS BE LIABLE
FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE
OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN
IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
This software consists of voluntary contributions made by many
individuals on behalf of the Egothor Project and was originally
created by Leo Galambos (Leo.G@seznam.cz).
*/
package org.egothor.stemmer;
/**
* A Cell is a portion of a trie.
*/
class Cell {
  /** id of the next row in this way; initialised to -1 */
  int ref = -1;
  /** command of the cell; initialised to -1 */
  int cmd = -1;
  /** how many cmd-s were in the subtrie before pack() */
  int cnt = 0;
  /** how many chars would be discarded from the input key in this way */
  int skip = 0;

  /** Creates a Cell holding the initial values of all four fields. */
  Cell() {}

  /**
   * Creates a field-by-field copy of another Cell.
   *
   * @param a the Cell to copy from
   */
  Cell(Cell a) {
    this.ref = a.ref;
    this.cmd = a.cmd;
    this.cnt = a.cnt;
    this.skip = a.skip;
  }

  /**
   * Renders the four fields in the form
   * <code>ref(..)cmd(..)cnt(..)skp(..)</code>.
   *
   * @return a String representation of this Cell
   */
  @Override
  public String toString() {
    final StringBuilder text = new StringBuilder();
    text.append("ref(").append(ref).append(')');
    text.append("cmd(").append(cmd).append(')');
    text.append("cnt(").append(cnt).append(')');
    text.append("skp(").append(skip).append(')');
    return text.toString();
  }
}

View File

@ -0,0 +1,205 @@
/*
Egothor Software License version 1.00
Copyright (C) 1997-2004 Leo Galambos.
Copyright (C) 2002-2004 "Egothor developers"
on behalf of the Egothor Project.
All rights reserved.
This software is copyrighted by the "Egothor developers". If this
license applies to a single file or document, the "Egothor developers"
are the people or entities mentioned as copyright holders in that file
or document. If this license applies to the Egothor project as a
whole, the copyright holders are the people or entities mentioned in
the file CREDITS. This file can be found in the same location as this
license in the distribution.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, the list of contributors, this list of conditions, and the
following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, the list of contributors, this list of conditions, and the
disclaimer that follows these conditions in the documentation
and/or other materials provided with the distribution.
3. The name "Egothor" must not be used to endorse or promote products
derived from this software without prior written permission. For
written permission, please contact Leo.G@seznam.cz
4. Products derived from this software may not be called "Egothor",
nor may "Egothor" appear in their name, without prior written
permission from Leo.G@seznam.cz.
In addition, we request that you include in the end-user documentation
provided with the redistribution and/or in the software itself an
acknowledgement equivalent to the following:
"This product includes software developed by the Egothor Project.
http://egothor.sf.net/"
THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
IN NO EVENT SHALL THE EGOTHOR PROJECT OR ITS CONTRIBUTORS BE LIABLE
FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE
OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN
IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
This software consists of voluntary contributions made by many
individuals on behalf of the Egothor Project and was originally
created by Leo Galambos (Leo.G@seznam.cz).
*/
package org.egothor.stemmer;
import java.io.BufferedOutputStream;
import java.io.BufferedReader;
import java.io.DataOutputStream;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.LineNumberReader;
import java.util.StringTokenizer;
/**
* The Compile class is used to compile a stemmer table.
*/
public class Compile {

  /** Set when the method string starts with '-'. */
  static boolean backward;
  /** Set when the method string selects the multi-trie format ('M'). */
  static boolean multi;
  /** The trie currently being built; re-allocated for every input file. */
  static Trie trie;

  /**
   * Entry point to the Compile application.
   * <p>
   * This program takes any number of arguments: the first is the name of the
   * desired stemming algorithm to use (a list is available in the package
   * description) , all of the rest should be the path or paths to a file or
   * files containing a stemmer table to compile.
   * <p>
   * The method string is read as: an optional '-', an optional '0' (also
   * store each stem itself), an optional 'M' (multi-trie), followed by any
   * number of optimizer codes (G, L, E, 2, 1; other characters are ignored).
   * It is upper-cased before being interpreted and before being written to
   * the output file, matching the loaders, which upper-case the stored header
   * to decide which trie type to read. For every input file <code>f</code>
   * the compiled table is written to <code>f.out</code>. I/O errors are
   * printed and the remaining files are still processed.
   *
   * @param args the command line arguments
   */
  public static void main(java.lang.String[] args) {
    if (args.length < 1) {
      return;
    }

    // Strings are immutable: the upper-cased copy has to be kept and used.
    // A fixed locale keeps the result independent of the machine's default.
    final String method = args[0].toUpperCase(java.util.Locale.ENGLISH);

    int qq = 0;
    backward = hasFlag(method, qq, '-');
    if (backward) {
      qq++;
    }
    boolean storeorig = hasFlag(method, qq, '0');
    if (storeorig) {
      qq++;
    }
    multi = hasFlag(method, qq, 'M');
    if (multi) {
      qq++;
    }

    String charset = System.getProperty("egothor.stemmer.charset", "UTF-8");

    // everything after the flags is the sequence of optimizer codes
    char optimizer[] = method.substring(qq).toCharArray();

    for (int i = 1; i < args.length; i++) {
      try {
        compileFile(args[i], method, charset, storeorig, optimizer);
      } catch (FileNotFoundException x) {
        x.printStackTrace();
      } catch (IOException x) {
        x.printStackTrace();
      }
    }
  }

  /** Tells whether <code>s</code> has the character <code>flag</code> at <code>pos</code>, treating positions past the end as "no". */
  private static boolean hasFlag(String s, int pos, char flag) {
    return pos < s.length() && s.charAt(pos) == flag;
  }

  /** Builds, optimizes and stores the table for one input file, closing both streams even on failure. */
  private static void compileFile(String file, String method, String charset,
      boolean storeorig, char[] optimizer) throws IOException {
    Diff diff = new Diff();
    allocTrie();

    System.out.println(file);
    FileInputStream source = new FileInputStream(file);
    try {
      LineNumberReader in = new LineNumberReader(new BufferedReader(
          new InputStreamReader(source, charset)));
      for (String line = in.readLine(); line != null; line = in.readLine()) {
        try {
          line = line.toLowerCase(java.util.Locale.ENGLISH);
          StringTokenizer st = new StringTokenizer(line);
          // first token on a line is the stem, the rest are its word forms
          String stem = st.nextToken();
          if (storeorig) {
            trie.add(stem, "-a");
          }
          while (st.hasMoreTokens()) {
            String token = st.nextToken();
            if (token.equals(stem) == false) {
              trie.add(token, diff.exec(token, stem));
            }
          }
        } catch (java.util.NoSuchElementException x) {
          // no base token (stem) on a line
        }
      }
    } finally {
      // closing the underlying stream releases the file handle
      source.close();
    }

    Optimizer o = new Optimizer();
    Optimizer2 o2 = new Optimizer2();
    Lift l = new Lift(true);
    Lift e = new Lift(false);
    Gener g = new Gener();

    for (int j = 0; j < optimizer.length; j++) {
      String prefix;
      switch (optimizer[j]) {
        case 'G':
          trie = trie.reduce(g);
          prefix = "G: ";
          break;
        case 'L':
          trie = trie.reduce(l);
          prefix = "L: ";
          break;
        case 'E':
          trie = trie.reduce(e);
          prefix = "E: ";
          break;
        case '2':
          trie = trie.reduce(o2);
          prefix = "2: ";
          break;
        case '1':
          trie = trie.reduce(o);
          prefix = "1: ";
          break;
        default:
          // unknown optimizer codes are skipped
          continue;
      }
      trie.printInfo(prefix + " ");
    }

    DataOutputStream os = new DataOutputStream(new BufferedOutputStream(
        new FileOutputStream(file + ".out")));
    try {
      // header first, so loaders can tell which trie type follows
      os.writeUTF(method);
      trie.store(os);
    } finally {
      os.close();
    }
  }

  /** Replaces {@link #trie} with an empty trie of the type selected by {@link #multi}. */
  static void allocTrie() {
    if (multi) {
      trie = new MultiTrie2(!backward);
    } else {
      trie = new Trie(!backward);
    }
  }
}

View File

@ -0,0 +1,295 @@
/*
Egothor Software License version 1.00
Copyright (C) 1997-2004 Leo Galambos.
Copyright (C) 2002-2004 "Egothor developers"
on behalf of the Egothor Project.
All rights reserved.
This software is copyrighted by the "Egothor developers". If this
license applies to a single file or document, the "Egothor developers"
are the people or entities mentioned as copyright holders in that file
or document. If this license applies to the Egothor project as a
whole, the copyright holders are the people or entities mentioned in
the file CREDITS. This file can be found in the same location as this
license in the distribution.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, the list of contributors, this list of conditions, and the
following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, the list of contributors, this list of conditions, and the
disclaimer that follows these conditions in the documentation
and/or other materials provided with the distribution.
3. The name "Egothor" must not be used to endorse or promote products
derived from this software without prior written permission. For
written permission, please contact Leo.G@seznam.cz
4. Products derived from this software may not be called "Egothor",
nor may "Egothor" appear in their name, without prior written
permission from Leo.G@seznam.cz.
In addition, we request that you include in the end-user documentation
provided with the redistribution and/or in the software itself an
acknowledgement equivalent to the following:
"This product includes software developed by the Egothor Project.
http://egothor.sf.net/"
THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
IN NO EVENT SHALL THE EGOTHOR PROJECT OR ITS CONTRIBUTORS BE LIABLE
FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE
OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN
IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
This software consists of voluntary contributions made by many
individuals on behalf of the Egothor Project and was originally
created by Leo Galambos (Leo.G@seznam.cz).
*/
package org.egothor.stemmer;
/**
* The Diff object generates a patch string.
* <p>
* A patch string is actually a command to a stemmer telling it how to reduce a
* word to its root. For example, to reduce the word teacher to its root teach
* the patch string Db would be generated. This command tells the stemmer to
* delete the last 2 characters from the word teacher to reach the stem (the
* patch commands are applied starting from the last character of the word).
*/
public class Diff {
int sizex = 0;
int sizey = 0;
int net[][];
int way[][];
int INSERT;
int DELETE;
int REPLACE;
int NOOP;
/**
 * Constructor for the Diff object. Delegates to
 * {@link #Diff(int, int, int, int)} with 1 for the insert, delete and replace
 * values and 0 for the noop value.
 */
public Diff() {
this(1, 1, 1, 0);
}
/**
* Constructor for the Diff object
*
* @param ins Description of the Parameter
* @param del Description of the Parameter
* @param rep Description of the Parameter
* @param noop Description of the Parameter
*/
public Diff(int ins, int del, int rep, int noop) {
INSERT = ins;
DELETE = del;
REPLACE = rep;
NOOP = noop;
}
/**
* Apply the given patch string <tt>diff</tt> to the given string <tt>
* dest</tt>.
*
* @param dest Destination string
* @param diff Patch string
*/
public static void apply(StringBuilder dest, CharSequence diff) {
try {
if (diff == null) {
return;
}
int pos = dest.length() - 1;
if (pos < 0) {
return;
}
// orig == ""
for (int i = 0; i < diff.length() / 2; i++) {
char cmd = diff.charAt(2 * i);
char param = diff.charAt(2 * i + 1);
int par_num = (param - 'a' + 1);
switch (cmd) {
case '-':
pos = pos - par_num + 1;
break;
case 'R':
dest.setCharAt(pos, param);
break;
case 'D':
int o = pos;
pos -= par_num - 1;
/*
* delete par_num chars from index pos
*/
// String s = orig.toString();
// s = s.substring( 0, pos ) + s.substring( o + 1 );
// orig = new StringBuffer( s );
dest.delete(pos, o + 1);
break;
case 'I':
dest.insert(pos += 1, param);
break;
}
pos--;
}
} catch (StringIndexOutOfBoundsException x) {
// x.printStackTrace();
} catch (ArrayIndexOutOfBoundsException x) {
// x.printStackTrace();
}
}
/**
* Construct a patch string that transforms a to b.
*
* @param a String 1st string
* @param b String 2nd string
* @return String
*/
public synchronized String exec(String a, String b) {
if (a == null || b == null) {
return null;
}
int x;
int y;
int maxx;
int maxy;
int go[] = new int[4];
final int X = 1;
final int Y = 2;
final int R = 3;
final int D = 0;
/*
* setup memory if needed => processing speed up
*/
maxx = a.length() + 1;
maxy = b.length() + 1;
if ((maxx >= sizex) || (maxy >= sizey)) {
sizex = maxx + 8;
sizey = maxy + 8;
net = new int[sizex][sizey];
way = new int[sizex][sizey];
}
/*
* clear the network
*/
for (x = 0; x < maxx; x++) {
for (y = 0; y < maxy; y++) {
net[x][y] = 0;
}
}
/*
* set known persistent values
*/
for (x = 1; x < maxx; x++) {
net[x][0] = x;
way[x][0] = X;
}
for (y = 1; y < maxy; y++) {
net[0][y] = y;
way[0][y] = Y;
}
for (x = 1; x < maxx; x++) {
for (y = 1; y < maxy; y++) {
go[X] = net[x - 1][y] + DELETE;
// way on x costs 1 unit
go[Y] = net[x][y - 1] + INSERT;
// way on y costs 1 unit
go[R] = net[x - 1][y - 1] + REPLACE;
go[D] = net[x - 1][y - 1]
+ ((a.charAt(x - 1) == b.charAt(y - 1)) ? NOOP : 100);
// diagonal costs 0, when no change
short min = D;
if (go[min] >= go[X]) {
min = X;
}
if (go[min] > go[Y]) {
min = Y;
}
if (go[min] > go[R]) {
min = R;
}
way[x][y] = min;
net[x][y] = (short) go[min];
}
}
// read the patch string
StringBuffer result = new StringBuffer();
final char base = 'a' - 1;
char deletes = base;
char equals = base;
for (x = maxx - 1, y = maxy - 1; x + y != 0;) {
switch (way[x][y]) {
case X:
if (equals != base) {
result.append("-" + (equals));
equals = base;
}
deletes++;
x--;
break;
// delete
case Y:
if (deletes != base) {
result.append("D" + (deletes));
deletes = base;
}
if (equals != base) {
result.append("-" + (equals));
equals = base;
}
result.append('I');
result.append(b.charAt(--y));
break;
// insert
case R:
if (deletes != base) {
result.append("D" + (deletes));
deletes = base;
}
if (equals != base) {
result.append("-" + (equals));
equals = base;
}
result.append('R');
result.append(b.charAt(--y));
x--;
break;
// replace
case D:
if (deletes != base) {
result.append("D" + (deletes));
deletes = base;
}
equals++;
x--;
y--;
break;
// no change
}
}
if (deletes != base) {
result.append("D" + (deletes));
deletes = base;
}
return result.toString();
}
}

View File

@ -0,0 +1,121 @@
/*
Egothor Software License version 1.00
Copyright (C) 1997-2004 Leo Galambos.
Copyright (C) 2002-2004 "Egothor developers"
on behalf of the Egothor Project.
All rights reserved.
This software is copyrighted by the "Egothor developers". If this
license applies to a single file or document, the "Egothor developers"
are the people or entities mentioned as copyright holders in that file
or document. If this license applies to the Egothor project as a
whole, the copyright holders are the people or entities mentioned in
the file CREDITS. This file can be found in the same location as this
license in the distribution.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, the list of contributors, this list of conditions, and the
following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, the list of contributors, this list of conditions, and the
disclaimer that follows these conditions in the documentation
and/or other materials provided with the distribution.
3. The name "Egothor" must not be used to endorse or promote products
derived from this software without prior written permission. For
written permission, please contact Leo.G@seznam.cz
4. Products derived from this software may not be called "Egothor",
nor may "Egothor" appear in their name, without prior written
permission from Leo.G@seznam.cz.
In addition, we request that you include in the end-user documentation
provided with the redistribution and/or in the software itself an
acknowledgement equivalent to the following:
"This product includes software developed by the Egothor Project.
http://egothor.sf.net/"
THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
IN NO EVENT SHALL THE EGOTHOR PROJECT OR ITS CONTRIBUTORS BE LIABLE
FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE
OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN
IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
This software consists of voluntary contributions made by many
individuals on behalf of the Egothor Project and was originally
created by Leo Galambos (Leo.G@seznam.cz).
*/
package org.egothor.stemmer;
import java.io.BufferedReader;
import java.io.FileReader;
import java.io.IOException;
import java.io.LineNumberReader;
import java.util.StringTokenizer;
/**
* The DiffIt class is a means to generate patch commands from an already prepared
* stemmer table.
*/
public class DiffIt {

  /**
   * Read the decimal digit found at index <tt>i</tt> of <tt>s</tt>.
   *
   * @param i index of the digit
   * @param s the string holding the digits
   * @return the digit value, or 1 when <tt>s</tt> is <tt>null</tt>, the index
   *         is out of range or the character is not a decimal digit
   */
  static int get(int i, String s) {
    if (s == null || i < 0 || i >= s.length()) {
      return 1;
    }
    int digit = Character.digit(s.charAt(i), 10);
    return (digit < 0) ? 1 : digit;
  }

  /**
   * Entry point to the DiffIt application.
   * <p>
   * The first argument is a string whose first four characters are the digits
   * used as insert, delete, replace and no-op costs (a missing or non-digit
   * character counts as 1). Every further argument is the path to a file
   * containing a stemmer table: each line holds a stem followed by its word
   * forms. For every line the program prints the stem with the patch
   * <tt>-a</tt>, and then, for every form that differs from the stem, the stem
   * followed by the patch command that turns the form into the stem.
   *
   * @param args the cost string followed by the paths of the stemmer tables
   */
  public static void main(java.lang.String[] args) {
    if (args == null || args.length == 0) {
      System.err.println("Usage: DiffIt <ins-del-rep-nop costs> <stemmer table file>...");
      return;
    }
    int ins = get(0, args[0]);
    int del = get(1, args[0]);
    int rep = get(2, args[0]);
    int nop = get(3, args[0]);

    for (int i = 1; i < args.length; i++) {
      Diff diff = new Diff(ins, del, rep, nop);
      try {
        LineNumberReader in = new LineNumberReader(new BufferedReader(new FileReader(args[i])));
        try {
          for (String line = in.readLine(); line != null; line = in.readLine()) {
            try {
              line = line.toLowerCase();
              StringTokenizer st = new StringTokenizer(line);
              String stem = st.nextToken();
              System.out.println(stem + " -a");
              while (st.hasMoreTokens()) {
                String token = st.nextToken();
                if (token.equals(stem) == false) {
                  System.out.println(stem + " " + diff.exec(token, stem));
                }
              }
            } catch (java.util.NoSuchElementException x) {
              // no base token (stem) on a line
            }
          }
        } finally {
          // release the file handle even when reading fails part-way
          in.close();
        }
      } catch (IOException x) {
        x.printStackTrace();
      }
    }
  }
}

View File

@ -0,0 +1,132 @@
/*
Egothor Software License version 1.00
Copyright (C) 1997-2004 Leo Galambos.
Copyright (C) 2002-2004 "Egothor developers"
on behalf of the Egothor Project.
All rights reserved.
This software is copyrighted by the "Egothor developers". If this
license applies to a single file or document, the "Egothor developers"
are the people or entities mentioned as copyright holders in that file
or document. If this license applies to the Egothor project as a
whole, the copyright holders are the people or entities mentioned in
the file CREDITS. This file can be found in the same location as this
license in the distribution.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, the list of contributors, this list of conditions, and the
following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, the list of contributors, this list of conditions, and the
disclaimer that follows these conditions in the documentation
and/or other materials provided with the distribution.
3. The name "Egothor" must not be used to endorse or promote products
derived from this software without prior written permission. For
written permission, please contact Leo.G@seznam.cz
4. Products derived from this software may not be called "Egothor",
nor may "Egothor" appear in their name, without prior written
permission from Leo.G@seznam.cz.
In addition, we request that you include in the end-user documentation
provided with the redistribution and/or in the software itself an
acknowledgement equivalent to the following:
"This product includes software developed by the Egothor Project.
http://egothor.sf.net/"
THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
IN NO EVENT SHALL THE EGOTHOR PROJECT OR ITS CONTRIBUTORS BE LIABLE
FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE
OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN
IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
This software consists of voluntary contributions made by many
individuals on behalf of the Egothor Project and was originally
created by Leo Galambos (Leo.G@seznam.cz).
*/
package org.egothor.stemmer;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Iterator;
import java.util.List;
/**
* The Gener object helps in the discarding of nodes which break the reduction
* effort and defend the structure against large reductions.
*/
public class Gener extends Reduce {
  /**
   * Constructor for the Gener object.
   */
  public Gener() {}

  /**
   * Return a Trie with infrequent values occurring in the given Trie removed.
   * <p>
   * Every row is passed to {@link #eat(Row, int[])}, from the last row to the
   * first; rows reported as dead are marked so that cells of rows visited later
   * drop their reference to them. The surviving rows are then collected by
   * <tt>removeGaps</tt>, starting from the root.
   * <p>
   * Note that the cells of the rows of <tt>orig</tt> are modified in place.
   *
   * @param orig the Trie to optimize
   * @return a new optimized Trie
   */
  @Override
  public Trie optimize(Trie orig) {
    List<CharSequence> cmds = orig.cmds;
    List<Row> orows = orig.rows;
    int remap[] = new int[orows.size()];

    // during pruning: 1 = row is kept so far, 0 = row has been eaten
    Arrays.fill(remap, 1);
    for (int j = orows.size() - 1; j >= 0; j--) {
      if (eat(orows.get(j), remap)) {
        remap[j] = 0;
      }
    }

    // the array is reused and reset to -1 before being handed to removeGaps
    Arrays.fill(remap, -1);
    List<Row> rows = removeGaps(orig.root, orows, new ArrayList<Row>(), remap);

    return new Trie(orig.forward, remap[orig.root], cmds, rows);
  }

  /**
   * Prune the given Row and test whether it can be dropped from an optimized
   * Trie.
   * <p>
   * References to rows already marked as removed (<tt>remap[ref] == 0</tt>) are
   * cleared. Then every cell whose count is below one tenth (integer division)
   * of the summed count of the row loses its command and its count.
   *
   * @param in the Row to prune and test
   * @param remap per-row flags, where 0 marks a row that has been removed
   * @return <tt>true</tt> if no cell of the Row holds a command or a reference
   *         any more, i.e. the Row should be removed, <tt>false</tt> if the Row
   *         should remain
   */
  public boolean eat(Row in, int remap[]) {
    int sum = 0;
    for (Cell c : in.cells.values()) {
      sum += c.cnt;
      if (c.ref >= 0) {
        if (remap[c.ref] == 0) {
          c.ref = -1;
        }
      }
    }

    int frame = sum / 10;
    boolean live = false;
    for (Cell c : in.cells.values()) {
      if (c.cnt < frame && c.cmd >= 0) {
        c.cnt = 0;
        c.cmd = -1;
      }
      if (c.cmd >= 0 || c.ref >= 0) {
        live = true;
      }
    }
    return !live;
  }
}

View File

@ -0,0 +1,147 @@
/*
Egothor Software License version 1.00
Copyright (C) 1997-2004 Leo Galambos.
Copyright (C) 2002-2004 "Egothor developers"
on behalf of the Egothor Project.
All rights reserved.
This software is copyrighted by the "Egothor developers". If this
license applies to a single file or document, the "Egothor developers"
are the people or entities mentioned as copyright holders in that file
or document. If this license applies to the Egothor project as a
whole, the copyright holders are the people or entities mentioned in
the file CREDITS. This file can be found in the same location as this
license in the distribution.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, the list of contributors, this list of conditions, and the
following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, the list of contributors, this list of conditions, and the
disclaimer that follows these conditions in the documentation
and/or other materials provided with the distribution.
3. The name "Egothor" must not be used to endorse or promote products
derived from this software without prior written permission. For
written permission, please contact Leo.G@seznam.cz
4. Products derived from this software may not be called "Egothor",
nor may "Egothor" appear in their name, without prior written
permission from Leo.G@seznam.cz.
In addition, we request that you include in the end-user documentation
provided with the redistribution and/or in the software itself an
acknowledgement equivalent to the following:
"This product includes software developed by the Egothor Project.
http://egothor.sf.net/"
THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
IN NO EVENT SHALL THE EGOTHOR PROJECT OR ITS CONTRIBUTORS BE LIABLE
FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE
OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN
IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
This software consists of voluntary contributions made by many
individuals on behalf of the Egothor Project and was originally
created by Leo Galambos (Leo.G@seznam.cz).
*/
package org.egothor.stemmer;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Iterator;
import java.util.List;
/**
* The Lift class is a data structure that is a variation of a Patricia trie.
* <p>
* Lift's <i>raison d'etre</i> is to implement reduction of the trie via the
* Lift-Up method, which makes the data structure less liable to overstemming.
*/
public class Lift extends Reduce {
  /** Whether the skip value takes part in comparing and lifting commands. */
  boolean changeSkip;

  /**
   * Create a Lift reducer.
   *
   * @param changeSkip when set to <tt>true</tt>, comparison of two Cells takes
   *          a skip command into account
   */
  public Lift(boolean changeSkip) {
    this.changeSkip = changeSkip;
  }

  /**
   * Apply {@link #liftUp(Row, List)} to every row of the given Trie, from the
   * last row to the first, then build a new Trie from the rows that
   * <tt>removeGaps</tt> collects starting at the root.
   * <p>
   * The cells of the rows of <tt>orig</tt> are modified in place.
   *
   * @param orig the Trie to optimize
   * @return the reduced Trie
   */
  @Override
  public Trie optimize(Trie orig) {
    final List<CharSequence> commands = orig.cmds;
    final List<Row> sourceRows = orig.rows;
    final int remap[] = new int[sourceRows.size()];

    int index = sourceRows.size();
    while (--index >= 0) {
      liftUp(sourceRows.get(index), sourceRows);
    }

    Arrays.fill(remap, -1);
    final List<Row> packed = removeGaps(orig.root, sourceRows, new ArrayList<Row>(), remap);
    return new Trie(orig.forward, remap[orig.root], commands, packed);
  }

  /**
   * Reduce the trie using Lift-Up reduction.
   * <p>
   * For every cell of <tt>in</tt> that references another row, the referenced
   * row is asked for its uniform command. When there is one and it either
   * equals the command of the cell or the cell has no command, the command is
   * stored in the cell itself and the reference is dropped.
   *
   * @param in the Row whose cells are considered
   * @param nodes the rows of the trie, indexed by the reference of a cell
   */
  public void liftUp(Row in, List<Row> nodes) {
    for (Cell cell : in.cells.values()) {
      if (cell.ref < 0) {
        continue; // nothing below this cell
      }
      final Row target = nodes.get(cell.ref);
      final int uniform = target.uniformCmd(changeSkip);
      if (uniform < 0) {
        continue; // the referenced row has no uniform command
      }

      if (uniform == cell.cmd) {
        // same command on both levels: merge, unless the skips disagree
        if (changeSkip && cell.skip != target.uniformSkip + 1) {
          continue;
        }
        cell.skip = changeSkip ? target.uniformSkip + 1 : 0;
        cell.cnt += target.uniformCnt;
        cell.ref = -1;
      } else if (cell.cmd < 0) {
        // the cell has no command of its own: take over the uniform one
        cell.cnt = target.uniformCnt;
        cell.cmd = uniform;
        cell.ref = -1;
        cell.skip = changeSkip ? target.uniformSkip + 1 : 0;
      }
    }
  }
}

View File

@ -0,0 +1,208 @@
/*
Egothor Software License version 1.00
Copyright (C) 1997-2004 Leo Galambos.
Copyright (C) 2002-2004 "Egothor developers"
on behalf of the Egothor Project.
All rights reserved.
This software is copyrighted by the "Egothor developers". If this
license applies to a single file or document, the "Egothor developers"
are the people or entities mentioned as copyright holders in that file
or document. If this license applies to the Egothor project as a
whole, the copyright holders are the people or entities mentioned in
the file CREDITS. This file can be found in the same location as this
license in the distribution.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, the list of contributors, this list of conditions, and the
following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, the list of contributors, this list of conditions, and the
disclaimer that follows these conditions in the documentation
and/or other materials provided with the distribution.
3. The name "Egothor" must not be used to endorse or promote products
derived from this software without prior written permission. For
written permission, please contact Leo.G@seznam.cz
4. Products derived from this software may not be called "Egothor",
nor may "Egothor" appear in their name, without prior written
permission from Leo.G@seznam.cz.
In addition, we request that you include in the end-user documentation
provided with the redistribution and/or in the software itself an
acknowledgement equivalent to the following:
"This product includes software developed by the Egothor Project.
http://egothor.sf.net/"
THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
IN NO EVENT SHALL THE EGOTHOR PROJECT OR ITS CONTRIBUTORS BE LIABLE
FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE
OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN
IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
This software consists of voluntary contributions made by many
individuals on behalf of the Egothor Project and was originally
created by Leo Galambos (Leo.G@seznam.cz).
*/
package org.egothor.stemmer;
import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
/**
* The MultiTrie is a Trie of Tries. It stores words and their associated patch
* commands. The MultiTrie handles patch commands individually (each command by
* itself).
*/
public class MultiTrie extends Trie {
  /** Marker stored after the last piece of a patch command. */
  final char EOM = '*';
  /** {@link #EOM} as a one-character string, the value actually added to a trie. */
  final String EOM_NODE = "" + EOM;

  /** The tries; trie i holds piece i of every patch command. */
  List<Trie> tries = new ArrayList<Trie>();

  /** Number of characters of a patch command stored per trie. */
  int BY = 1;

  /**
   * Constructor for the MultiTrie object. Reads the direction flag, the piece
   * size and the tries in the format written by {@link #store(DataOutput)}.
   *
   * @param is the input stream
   * @exception IOException if an I/O error occurs
   */
  public MultiTrie(DataInput is) throws IOException {
    super(false);
    forward = is.readBoolean();
    BY = is.readInt();
    for (int i = is.readInt(); i > 0; i--) {
      tries.add(new Trie(is));
    }
  }

  /**
   * Constructor for the MultiTrie object
   *
   * @param forward set to <tt>true</tt> if the elements should be read left to
   *          right
   */
  public MultiTrie(boolean forward) {
    super(forward);
  }

  /**
   * Return the element that is stored in a cell associated with the given key.
   * <p>
   * The result is the concatenation of what each trie returns for the key, in
   * order, up to the first trie that returns <tt>null</tt> or the end marker.
   *
   * @param key the key to the cell holding the desired element
   * @return the element, empty if the first trie has nothing for the key
   */
  @Override
  public CharSequence getFully(CharSequence key) {
    StringBuilder result = new StringBuilder(tries.size() * 2);
    for (int i = 0; i < tries.size(); i++) {
      CharSequence r = tries.get(i).getFully(key);
      if (r == null || (r.length() == 1 && r.charAt(0) == EOM)) {
        return result;
      }
      result.append(r);
    }
    return result;
  }

  /**
   * Return the element that is stored as last on a path belonging to the given
   * key.
   * <p>
   * The result is the concatenation of what each trie returns for the key, in
   * order, up to the first trie that returns <tt>null</tt> or the end marker.
   *
   * @param key the key associated with the desired element
   * @return the element that is stored as last on a path
   */
  @Override
  public CharSequence getLastOnPath(CharSequence key) {
    StringBuilder result = new StringBuilder(tries.size() * 2);
    for (int i = 0; i < tries.size(); i++) {
      CharSequence r = tries.get(i).getLastOnPath(key);
      if (r == null || (r.length() == 1 && r.charAt(0) == EOM)) {
        return result;
      }
      result.append(r);
    }
    return result;
  }

  /**
   * Write this data structure to the given output stream: the direction flag,
   * the piece size, the number of tries and then every trie.
   *
   * @param os the output stream
   * @exception IOException if an I/O error occurs
   */
  @Override
  public void store(DataOutput os) throws IOException {
    os.writeBoolean(forward);
    os.writeInt(BY);
    os.writeInt(tries.size());
    for (Trie trie : tries) {
      trie.store(os);
    }
  }

  /**
   * Add an element to this structure consisting of the given key and patch
   * command.
   * <p>
   * The command is cut into pieces of {@link #BY} characters; piece i is added
   * to trie i under the key, and the end marker is added to the trie that
   * follows the last piece. Missing tries are created on demand.
   * <p>
   * This method will return without executing if the <tt>cmd</tt>
   * parameter's length is 0.
   *
   * @param key the key
   * @param cmd the patch command
   */
  @Override
  public void add(CharSequence key, CharSequence cmd) {
    if (cmd.length() == 0) {
      return;
    }
    int levels = cmd.length() / BY;
    // one trie more than pieces: the extra one receives the end marker
    while (levels >= tries.size()) {
      tries.add(new Trie(forward));
    }
    for (int i = 0; i < levels; i++) {
      tries.get(i).add(key, cmd.subSequence(BY * i, BY * i + BY));
    }
    tries.get(levels).add(key, EOM_NODE);
  }

  /**
   * Reduce every trie of this MultiTrie with the given strategy and return a
   * new MultiTrie made of the reduced tries. The direction and the piece size
   * of this MultiTrie are carried over.
   *
   * @param by the reduction strategy applied to each trie
   * @return the newly reduced Trie
   */
  @Override
  public Trie reduce(Reduce by) {
    List<Trie> h = new ArrayList<Trie>();
    for (Trie trie : tries) {
      h.add(trie.reduce(by));
    }

    MultiTrie m = new MultiTrie(forward);
    m.tries = h;
    // keep the piece size, otherwise store() on the result would write the default
    m.BY = BY;
    return m;
  }

  /**
   * Call <tt>printInfo</tt> on every trie, passing the given prefix extended
   * with the 1-based position of the trie in square brackets.
   *
   * @param prefix the text put in front of the position of each trie
   */
  @Override
  public void printInfo(CharSequence prefix) {
    int c = 0;
    for (Trie trie : tries) {
      trie.printInfo(prefix + "[" + (++c) + "] ");
    }
  }
}

View File

@ -0,0 +1,333 @@
/*
Egothor Software License version 1.00
Copyright (C) 1997-2004 Leo Galambos.
Copyright (C) 2002-2004 "Egothor developers"
on behalf of the Egothor Project.
All rights reserved.
This software is copyrighted by the "Egothor developers". If this
license applies to a single file or document, the "Egothor developers"
are the people or entities mentioned as copyright holders in that file
or document. If this license applies to the Egothor project as a
whole, the copyright holders are the people or entities mentioned in
the file CREDITS. This file can be found in the same location as this
license in the distribution.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, the list of contributors, this list of conditions, and the
following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, the list of contributors, this list of conditions, and the
disclaimer that follows these conditions in the documentation
and/or other materials provided with the distribution.
3. The name "Egothor" must not be used to endorse or promote products
derived from this software without prior written permission. For
written permission, please contact Leo.G@seznam.cz
4. Products derived from this software may not be called "Egothor",
nor may "Egothor" appear in their name, without prior written
permission from Leo.G@seznam.cz.
In addition, we request that you include in the end-user documentation
provided with the redistribution and/or in the software itself an
acknowledgement equivalent to the following:
"This product includes software developed by the Egothor Project.
http://egothor.sf.net/"
THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
IN NO EVENT SHALL THE EGOTHOR PROJECT OR ITS CONTRIBUTORS BE LIABLE
FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE
OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN
IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
This software consists of voluntary contributions made by many
individuals on behalf of the Egothor Project and was originally
created by Leo Galambos (Leo.G@seznam.cz).
*/
package org.egothor.stemmer;
import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
/**
* The MultiTrie2 is a Trie of Tries.
* <p>
* It stores words and their associated patch commands. The MultiTrie2 handles
* patch commands broken into their constituent parts, as a MultiTrie does, but
* the commands are delimited by the skip command.
*/
public class MultiTrie2 extends MultiTrie {
  /**
   * Constructor for the MultiTrie2 object.
   *
   * @param is the input stream
   * @exception IOException if an I/O error occurs
   */
  public MultiTrie2(DataInput is) throws IOException {
    super(is);
  }

  /**
   * Constructor for the MultiTrie2 object
   *
   * @param forward set to <tt>true</tt> if the elements should be read left to
   *          right
   */
  public MultiTrie2(boolean forward) {
    super(forward);
  }

  /**
   * Return the element that is stored in a cell associated with the given key.
   * <p>
   * The pieces found in the successive tries are concatenated; the key is
   * shortened along the way by the number of characters consumed by skip
   * pieces. An index error raised while doing so ends the lookup and returns
   * what has been collected so far.
   *
   * @param key the key to the cell holding the desired element
   * @return the element
   */
  @Override
  public CharSequence getFully(CharSequence key) {
    return assemblePatch(key, true);
  }

  /**
   * Return the element that is stored as last on a path belonging to the given
   * key.
   * <p>
   * The pieces found in the successive tries are concatenated; the key is
   * shortened along the way by the number of characters consumed by skip
   * pieces. An index error raised while doing so ends the lookup and returns
   * what has been collected so far.
   *
   * @param key the key associated with the desired element
   * @return the element that is stored as last on a path
   */
  @Override
  public CharSequence getLastOnPath(CharSequence key) {
    return assemblePatch(key, false);
  }

  /**
   * Shared implementation of {@link #getFully(CharSequence)} and
   * {@link #getLastOnPath(CharSequence)}: query the tries in order and
   * concatenate the pieces they return.
   *
   * @param key the key to look up
   * @param fully <tt>true</tt> to query each trie with <tt>getFully</tt>,
   *          <tt>false</tt> to query it with <tt>getLastOnPath</tt>
   * @return the concatenated pieces
   */
  private CharSequence assemblePatch(CharSequence key, boolean fully) {
    StringBuilder result = new StringBuilder(tries.size() * 2);
    try {
      CharSequence lastkey = key; // last non-empty form of the key
      CharSequence previous = null; // piece returned by the previous trie
      char lastch = ' ';
      for (int i = 0; i < tries.size(); i++) {
        Trie trie = tries.get(i);
        CharSequence r = fully ? trie.getFully(lastkey) : trie.getLastOnPath(lastkey);
        if (r == null || (r.length() == 1 && r.charAt(0) == EOM)) {
          return result;
        }
        if (cannotFollow(lastch, r.charAt(0))) {
          return result;
        }
        // command character of the last (command, parameter) pair of the piece
        lastch = r.charAt(r.length() - 2);
        if (r.charAt(0) == '-') {
          if (i > 0) {
            key = skip(key, lengthPP(previous));
          }
          key = skip(key, lengthPP(r));
        }
        result.append(r);
        if (key.length() != 0) {
          lastkey = key;
        }
        previous = r;
      }
    } catch (IndexOutOfBoundsException x) {
      // deliberate best effort: return the pieces collected so far
    }
    return result;
  }

  /**
   * Write this data structure to the given output stream.
   *
   * @param os the output stream
   * @exception IOException if an I/O error occurs
   */
  @Override
  public void store(DataOutput os) throws IOException {
    super.store(os);
  }

  /**
   * Add an element to this structure consisting of the given key and patch
   * command.
   * <p>
   * The command is split with {@link #decompose(CharSequence)}; piece i is
   * added to trie i, and the end marker is added to the trie that follows the
   * last piece. After a skip piece the key is shortened by the number of
   * characters consumed; once the key is used up, the last non-empty key is
   * used instead.
   * <p>
   * This method will return without executing if the <tt>cmd</tt>
   * parameter's length is 0.
   *
   * @param key the key
   * @param cmd the patch command
   */
  @Override
  public void add(CharSequence key, CharSequence cmd) {
    if (cmd.length() == 0) {
      return;
    }
    CharSequence p[] = decompose(cmd);
    int levels = p.length;
    // one trie more than pieces: the extra one receives the end marker
    while (levels >= tries.size()) {
      tries.add(new Trie(forward));
    }

    CharSequence lastkey = key;
    for (int i = 0; i < levels; i++) {
      if (key.length() > 0) {
        tries.get(i).add(key, p[i]);
        lastkey = key;
      } else {
        tries.get(i).add(lastkey, p[i]);
      }
      if (p[i].length() > 0 && p[i].charAt(0) == '-') {
        if (i > 0) {
          key = skip(key, lengthPP(p[i - 1]));
        }
        key = skip(key, lengthPP(p[i]));
      }
    }
    if (key.length() > 0) {
      tries.get(levels).add(key, EOM_NODE);
    } else {
      tries.get(levels).add(lastkey, EOM_NODE);
    }
  }

  /**
   * Break the given patch command into its constituent pieces. Every skip
   * command (<tt>-</tt> and its parameter) is a piece of its own, and so is
   * every run of other commands between two skip commands.
   *
   * @param cmd the patch command
   * @return an array containing the pieces of the command
   */
  public CharSequence[] decompose(CharSequence cmd) {
    List<CharSequence> parts = new ArrayList<CharSequence>();
    // dashEven returns -1 when no skip command is left, which ends the loop
    for (int i = 0; 0 <= i && i < cmd.length();) {
      int next = dashEven(cmd, i);
      if (i == next) {
        parts.add(cmd.subSequence(i, i + 2));
        i = next + 2;
      } else {
        parts.add((next < 0) ? cmd.subSequence(i, cmd.length()) : cmd.subSequence(i, next));
        i = next;
      }
    }
    return parts.toArray(new CharSequence[parts.size()]);
  }

  /**
   * Reduce every trie of this MultiTrie2 with the given strategy and return a
   * new MultiTrie2 made of the reduced tries. The direction and the piece size
   * of this MultiTrie2 are carried over.
   *
   * @param by the reduction strategy applied to each trie
   * @return the newly reduced Trie
   */
  @Override
  public Trie reduce(Reduce by) {
    List<Trie> h = new ArrayList<Trie>();
    for (Trie trie : tries) {
      h.add(trie.reduce(by));
    }

    MultiTrie2 m = new MultiTrie2(forward);
    m.tries = h;
    // keep the piece size, otherwise store() on the result would write the default
    m.BY = BY;
    return m;
  }

  /** Tell whether command <tt>goes</tt> is a skip or delete directly repeating <tt>after</tt>. */
  private boolean cannotFollow(char after, char goes) {
    switch (after) {
      case '-':
      case 'D':
        return after == goes;
    }
    return false;
  }

  /** Drop <tt>count</tt> characters from the start (forward) or the end (backward) of <tt>in</tt>. */
  private CharSequence skip(CharSequence in, int count) {
    if (forward) {
      return in.subSequence(count, in.length());
    } else {
      return in.subSequence(0, in.length() - count);
    }
  }

  /** Return the index of the first skip command at or after <tt>from</tt>, stepping by pairs, or -1. */
  private int dashEven(CharSequence in, int from) {
    while (from < in.length()) {
      if (in.charAt(from) == '-') {
        return from;
      } else {
        from += 2;
      }
    }
    return -1;
  }

  /** Return the number of key characters consumed by the given piece (inserts consume none). */
  private int lengthPP(CharSequence cmd) {
    int len = 0;
    for (int i = 0; i < cmd.length(); i += 2) {
      switch (cmd.charAt(i)) {
        case '-':
        case 'D':
          len += cmd.charAt(i + 1) - 'a' + 1;
          break;
        case 'R':
          len++;
          break;
        case 'I':
          break;
      }
    }
    return len;
  }
}

View File

@ -0,0 +1,198 @@
/*
Egothor Software License version 1.00
Copyright (C) 1997-2004 Leo Galambos.
Copyright (C) 2002-2004 "Egothor developers"
on behalf of the Egothor Project.
All rights reserved.
This software is copyrighted by the "Egothor developers". If this
license applies to a single file or document, the "Egothor developers"
are the people or entities mentioned as copyright holders in that file
or document. If this license applies to the Egothor project as a
whole, the copyright holders are the people or entities mentioned in
the file CREDITS. This file can be found in the same location as this
license in the distribution.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, the list of contributors, this list of conditions, and the
following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, the list of contributors, this list of conditions, and the
disclaimer that follows these conditions in the documentation
and/or other materials provided with the distribution.
3. The name "Egothor" must not be used to endorse or promote products
derived from this software without prior written permission. For
written permission, please contact Leo.G@seznam.cz
4. Products derived from this software may not be called "Egothor",
nor may "Egothor" appear in their name, without prior written
permission from Leo.G@seznam.cz.
In addition, we request that you include in the end-user documentation
provided with the redistribution and/or in the software itself an
acknowledgement equivalent to the following:
"This product includes software developed by the Egothor Project.
http://egothor.sf.net/"
THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
IN NO EVENT SHALL THE EGOTHOR PROJECT OR ITS CONTRIBUTORS BE LIABLE
FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE
OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN
IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
This software consists of voluntary contributions made by many
individuals on behalf of the Egothor Project and was originally
created by Leo Galambos (Leo.G@seznam.cz).
*/
package org.egothor.stemmer;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Iterator;
import java.util.List;
/**
 * The Optimizer is a {@link Reduce} that shrinks a Trie by merging rows.
 * <p>
 * Rows are visited from the last to the first. Each row is merged into the
 * first already-kept row it is compatible with (see
 * {@link #merge(Row, Row)}); rows that cannot be merged are kept as they are.
 * Finally, rows that are no longer reachable from the root are dropped.
 */
public class Optimizer extends Reduce {
  /**
   * Constructor for the Optimizer object.
   */
  public Optimizer() {}

  /**
   * Merge compatible rows of the given Trie and return the resulting Trie.
   * The given Trie's command list is reused by the result.
   *
   * @param orig the Trie to consolidate
   * @return the newly consolidated Trie
   */
  @Override
  public Trie optimize(Trie orig) {
    List<CharSequence> cmds = orig.cmds;
    List<Row> orows = orig.rows;
    List<Row> rows = new ArrayList<Row>();
    // remap[j] = index in 'rows' where original row j ended up
    int remap[] = new int[orows.size()];

    for (int j = orows.size() - 1; j >= 0; j--) {
      // Rewrite the references of row j using the mapping known so far.
      Row now = new Remap(orows.get(j), remap);

      int target = -1;
      for (int i = 0; i < rows.size() && target < 0; i++) {
        Row joined = merge(now, rows.get(i));
        if (joined != null) {
          rows.set(i, joined);
          target = i;
        }
      }
      if (target < 0) {
        target = rows.size();
        rows.add(now);
      }
      remap[j] = target;
    }

    int root = remap[orig.root];
    // The array is reused as the mapping filled in by removeGaps.
    Arrays.fill(remap, -1);
    List<Row> compact = removeGaps(root, rows, new ArrayList<Row>(), remap);

    return new Trie(orig.forward, remap[root], cmds, compact);
  }

  /**
   * Merge the given rows and return the resulting Row.
   * <p>
   * Cells present in both rows are combined with {@link #merge(Cell, Cell)};
   * cells present only in <code>master</code> are copied; cells present only
   * in <code>existing</code> are put into the result as they are.
   *
   * @param master the master Row
   * @param existing the existing Row
   * @return the resulting Row, or <tt>null</tt> if the operation cannot be
   *         realized
   */
  public Row merge(Row master, Row existing) {
    Row result = new Row();

    for (Character ch : master.cells.keySet()) {
      // XXX also must handle Cnt and Skip !!
      Cell a = master.cells.get(ch);
      Cell b = existing.cells.get(ch);

      Cell joined = (b == null) ? new Cell(a) : merge(a, b);
      if (joined == null) {
        return null;
      }
      result.cells.put(ch, joined);
    }

    for (Character ch : existing.cells.keySet()) {
      if (master.at(ch) == null) {
        result.cells.put(ch, existing.at(ch));
      }
    }
    return result;
  }

  /**
   * Merge the given Cells and return the resulting Cell.
   * <p>
   * The merge fails when the skips differ, or when both cells carry a command
   * (respectively a reference) and the two values differ. Otherwise the
   * result takes whichever command/reference is set (the master's first) and
   * the sum of both counts.
   *
   * @param m the master Cell
   * @param e the existing Cell
   * @return the resulting Cell, or <tt>null</tt> if the operation cannot be
   *         realized
   */
  public Cell merge(Cell m, Cell e) {
    Cell merged = new Cell();

    if (m.skip != e.skip) {
      return null;
    }
    if (m.cmd >= 0 && e.cmd >= 0 && m.cmd != e.cmd) {
      return null;
    }
    if (m.ref >= 0 && e.ref >= 0 && m.ref != e.ref) {
      return null;
    }

    merged.cmd = (m.cmd >= 0) ? m.cmd : e.cmd;
    merged.ref = (m.ref >= 0) ? m.ref : e.ref;
    merged.cnt = m.cnt + e.cnt;
    merged.skip = m.skip;
    return merged;
  }
}

View File

@ -0,0 +1,90 @@
/*
Egothor Software License version 1.00
Copyright (C) 1997-2004 Leo Galambos.
Copyright (C) 2002-2004 "Egothor developers"
on behalf of the Egothor Project.
All rights reserved.
This software is copyrighted by the "Egothor developers". If this
license applies to a single file or document, the "Egothor developers"
are the people or entities mentioned as copyright holders in that file
or document. If this license applies to the Egothor project as a
whole, the copyright holders are the people or entities mentioned in
the file CREDITS. This file can be found in the same location as this
license in the distribution.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, the list of contributors, this list of conditions, and the
following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, the list of contributors, this list of conditions, and the
disclaimer that follows these conditions in the documentation
and/or other materials provided with the distribution.
3. The name "Egothor" must not be used to endorse or promote products
derived from this software without prior written permission. For
written permission, please contact Leo.G@seznam.cz
4. Products derived from this software may not be called "Egothor",
nor may "Egothor" appear in their name, without prior written
permission from Leo.G@seznam.cz.
In addition, we request that you include in the end-user documentation
provided with the redistribution and/or in the software itself an
acknowledgement equivalent to the following:
"This product includes software developed by the Egothor Project.
http://egothor.sf.net/"
THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
IN NO EVENT SHALL THE EGOTHOR PROJECT OR ITS CONTRIBUTORS BE LIABLE
FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE
OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN
IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
This software consists of voluntary contributions made by many
individuals on behalf of the Egothor Project and was originally
created by Leo Galambos (Leo.G@seznam.cz).
*/
package org.egothor.stemmer;
/**
 * The Optimizer2 class is an {@link Optimizer} with a stricter rule for
 * merging Cells.
 * <p>
 * Two Cells are joined only when their command, reference and skip are all
 * equal, so a merged Cell never takes a command or reference that one of the
 * two Cells did not already have.
 */
public class Optimizer2 extends Optimizer {
  /**
   * Constructor for the Optimizer2 object.
   */
  public Optimizer2() {}

  /**
   * Merge the given Cells and return the resulting Cell.
   *
   * @param m the master Cell
   * @param e the existing Cell
   * @return a copy of <code>m</code> whose count is increased by the count of
   *         <code>e</code>, or <tt>null</tt> if the Cells differ in command,
   *         reference or skip
   */
  @Override
  public Cell merge(Cell m, Cell e) {
    boolean identical = m.cmd == e.cmd && m.ref == e.ref && m.skip == e.skip;
    if (!identical) {
      return null;
    }
    Cell joined = new Cell(m);
    joined.cnt += e.cnt;
    return joined;
  }
}

View File

@ -0,0 +1,134 @@
/*
Egothor Software License version 1.00
Copyright (C) 1997-2004 Leo Galambos.
Copyright (C) 2002-2004 "Egothor developers"
on behalf of the Egothor Project.
All rights reserved.
This software is copyrighted by the "Egothor developers". If this
license applies to a single file or document, the "Egothor developers"
are the people or entities mentioned as copyright holders in that file
or document. If this license applies to the Egothor project as a
whole, the copyright holders are the people or entities mentioned in
the file CREDITS. This file can be found in the same location as this
license in the distribution.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, the list of contributors, this list of conditions, and the
following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, the list of contributors, this list of conditions, and the
disclaimer that follows these conditions in the documentation
and/or other materials provided with the distribution.
3. The name "Egothor" must not be used to endorse or promote products
derived from this software without prior written permission. For
written permission, please contact Leo.G@seznam.cz
4. Products derived from this software may not be called "Egothor",
nor may "Egothor" appear in their name, without prior written
permission from Leo.G@seznam.cz.
In addition, we request that you include in the end-user documentation
provided with the redistribution and/or in the software itself an
acknowledgement equivalent to the following:
"This product includes software developed by the Egothor Project.
http://egothor.sf.net/"
THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
IN NO EVENT SHALL THE EGOTHOR PROJECT OR ITS CONTRIBUTORS BE LIABLE
FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE
OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN
IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
This software consists of voluntary contributions made by many
individuals on behalf of the Egothor Project and was originally
created by Leo Galambos (Leo.G@seznam.cz).
*/
package org.egothor.stemmer;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Iterator;
import java.util.List;
/**
 * The Reduce object is used to remove gaps in a Trie which stores a dictionary.
 * <p>
 * Only the rows reachable from the root are kept; they are renumbered
 * consecutively and all references between rows are rewritten accordingly.
 */
public class Reduce {

  /**
   * Constructor for the Reduce object.
   */
  public Reduce() {}

  /**
   * Optimize (remove holes in the rows) the given Trie and return the
   * restructured Trie. The given Trie's command list is reused by the result.
   *
   * @param orig the Trie to optimize
   * @return the restructured Trie
   */
  public Trie optimize(Trie orig) {
    List<CharSequence> cmds = orig.cmds;
    List<Row> orows = orig.rows;
    int remap[] = new int[orows.size()];

    Arrays.fill(remap, -1);
    // The rows to compact are the ORIGINAL rows; passing an empty list here
    // would make removeGaps fail on its first old.get(ind).
    List<Row> rows = removeGaps(orig.root, orows, new ArrayList<Row>(), remap);

    return new Trie(orig.forward, remap[orig.root], cmds, rows);
  }

  /**
   * Copy the row at index <code>ind</code> of <code>old</code>, and
   * recursively every row it references, to the end of <code>to</code>.
   *
   * @param ind index (in <code>old</code>) of the row to copy
   * @param old the rows to copy from
   * @param to the list the copied rows are appended to
   * @param remap maps an index in <code>old</code> to the index in
   *          <code>to</code>; entries below 0 mean "not copied yet"
   * @return <code>to</code>
   */
  List<Row> removeGaps(int ind, List<Row> old, List<Row> to, int remap[]) {
    // Reserve the slot first so that the row's new index is known (and the
    // row counts as visited) while its children are processed.
    remap[ind] = to.size();

    Row now = old.get(ind);
    to.add(now);
    for (Cell c : now.cells.values()) {
      if (c.ref >= 0 && remap[c.ref] < 0) {
        removeGaps(c.ref, old, to, remap);
      }
    }
    // Every referenced row now has a new index: store the rewritten row.
    to.set(remap[ind], new Remap(now, remap));
    return to;
  }

  /**
   * A Row built as a copy of another Row in which every reference to a row is
   * translated through a remap table.
   */
  class Remap extends Row {
    /**
     * Constructor for the Remap object
     *
     * @param old the Row whose cells are copied
     * @param remap table giving, for each old row index, the new row index
     */
    public Remap(Row old, int remap[]) {
      super();
      for (Character ch : old.cells.keySet()) {
        Cell c = old.at(ch);
        Cell nc = new Cell(c);
        if (c.ref >= 0) {
          nc.ref = remap[nc.ref];
        }
        cells.put(ch, nc);
      }
    }
  }
}

View File

@ -0,0 +1,309 @@
/*
Egothor Software License version 1.00
Copyright (C) 1997-2004 Leo Galambos.
Copyright (C) 2002-2004 "Egothor developers"
on behalf of the Egothor Project.
All rights reserved.
This software is copyrighted by the "Egothor developers". If this
license applies to a single file or document, the "Egothor developers"
are the people or entities mentioned as copyright holders in that file
or document. If this license applies to the Egothor project as a
whole, the copyright holders are the people or entities mentioned in
the file CREDITS. This file can be found in the same location as this
license in the distribution.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, the list of contributors, this list of conditions, and the
following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, the list of contributors, this list of conditions, and the
disclaimer that follows these conditions in the documentation
and/or other materials provided with the distribution.
3. The name "Egothor" must not be used to endorse or promote products
derived from this software without prior written permission. For
written permission, please contact Leo.G@seznam.cz
4. Products derived from this software may not be called "Egothor",
nor may "Egothor" appear in their name, without prior written
permission from Leo.G@seznam.cz.
In addition, we request that you include in the end-user documentation
provided with the redistribution and/or in the software itself an
acknowledgement equivalent to the following:
"This product includes software developed by the Egothor Project.
http://egothor.sf.net/"
THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
IN NO EVENT SHALL THE EGOTHOR PROJECT OR ITS CONTRIBUTORS BE LIABLE
FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE
OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN
IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
This software consists of voluntary contributions made by many
individuals on behalf of the Egothor Project and was originally
created by Leo Galambos (Leo.G@seznam.cz).
*/
package org.egothor.stemmer;
import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;
import java.util.Iterator;
import java.util.TreeMap;
/**
 * The Row class represents a row in a matrix representation of a trie.
 * <p>
 * A row maps a Character to a {@link Cell}. A Cell value below 0 in
 * <code>cmd</code> or <code>ref</code> means "not set".
 */
public class Row {
  TreeMap<Character,Cell> cells = new TreeMap<Character,Cell>();
  // Side results of the last uniformCmd(boolean) call.
  int uniformCnt = 0;
  int uniformSkip = 0;

  /**
   * Construct a Row object from input carried in via the given input stream.
   * The layout read is the one written by {@link #store(DataOutput)}: a cell
   * count, then per cell a char followed by cmd, cnt, ref and skip.
   *
   * @param is the input stream
   * @exception IOException if an I/O error occurs
   */
  public Row(DataInput is) throws IOException {
    for (int i = is.readInt(); i > 0; i--) {
      char ch = is.readChar();
      Cell c = new Cell();
      c.cmd = is.readInt();
      c.cnt = is.readInt();
      c.ref = is.readInt();
      c.skip = is.readInt();
      cells.put(ch, c);
    }
  }

  /**
   * The default constructor for the Row object.
   */
  public Row() {}

  /**
   * Construct a Row using the cells of the given Row. The cell map is shared
   * with the given Row, not copied.
   *
   * @param old the Row whose cells are used
   */
  public Row(Row old) {
    cells = old.cells;
  }

  /**
   * Set the command in the Cell of the given Character to the given integer,
   * creating the Cell if needed. The Cell's count becomes 1 when the command
   * is set (<code>cmd &gt;= 0</code>) and 0 otherwise.
   *
   * @param way the Character defining the Cell
   * @param cmd the new command
   */
  public void setCmd(Character way, int cmd) {
    Cell c = at(way);
    if (c == null) {
      c = new Cell();
      cells.put(way, c);
    }
    c.cmd = cmd;
    c.cnt = (cmd >= 0) ? 1 : 0;
  }

  /**
   * Set the reference to the next row in the Cell of the given Character to
   * the given integer, creating the Cell if needed.
   *
   * @param way the Character defining the Cell
   * @param ref The new ref value
   */
  public void setRef(Character way, int ref) {
    Cell c = at(way);
    if (c == null) {
      c = new Cell();
      cells.put(way, c);
    }
    c.ref = ref;
  }

  /**
   * Return the number of cells in use, i.e. cells holding a command or a
   * reference.
   *
   * @return the number of cells in use
   */
  public int getCells() {
    int size = 0;
    for (Cell e : cells.values()) {
      if (e.cmd >= 0 || e.ref >= 0) {
        size++;
      }
    }
    return size;
  }

  /**
   * Return the number of references (how many transitions) to other rows.
   *
   * @return the number of references
   */
  public int getCellsPnt() {
    int size = 0;
    for (Cell e : cells.values()) {
      if (e.ref >= 0) {
        size++;
      }
    }
    return size;
  }

  /**
   * Return the number of patch commands saved in this Row.
   *
   * @return the number of patch commands
   */
  public int getCellsVal() {
    int size = 0;
    for (Cell e : cells.values()) {
      if (e.cmd >= 0) {
        size++;
      }
    }
    return size;
  }

  /**
   * Return the command in the Cell associated with the given Character.
   *
   * @param way the Character associated with the Cell holding the desired
   *          command
   * @return the command, or -1 if there is no such Cell
   */
  public int getCmd(Character way) {
    Cell c = at(way);
    return (c == null) ? -1 : c.cmd;
  }

  /**
   * Return the number of patch commands that were in the Cell associated with
   * the given Character before the Trie containing this Row was reduced.
   *
   * @param way the Character associated with the desired Cell
   * @return the number of patch commands before reduction, or -1 if there is
   *         no such Cell
   */
  public int getCnt(Character way) {
    Cell c = at(way);
    return (c == null) ? -1 : c.cnt;
  }

  /**
   * Return the reference to the next Row in the Cell associated with the given
   * Character.
   *
   * @param way the Character associated with the desired Cell
   * @return the reference, or -1 if the Cell is <tt>null</tt>
   */
  public int getRef(Character way) {
    Cell c = at(way);
    return (c == null) ? -1 : c.ref;
  }

  /**
   * Write the contents of this Row to the given output stream. Cells holding
   * neither a command nor a reference are not written.
   *
   * @param os the output stream
   * @exception IOException if an I/O error occurs
   */
  public void store(DataOutput os) throws IOException {
    // The count must match the number of records written below, otherwise
    // Row(DataInput) reads past the end of this row. getCells() counts exactly
    // the cells that are not skipped by the loop.
    os.writeInt(getCells());
    for (Character c : cells.keySet()) {
      Cell e = at(c);
      if (e.cmd < 0 && e.ref < 0) {
        continue;
      }

      os.writeChar(c.charValue());
      os.writeInt(e.cmd);
      os.writeInt(e.cnt);
      os.writeInt(e.ref);
      os.writeInt(e.skip);
    }
  }

  /**
   * Return the patch command shared by all Cells of this Row, if there is
   * one.
   * <p>
   * As a side effect <code>uniformSkip</code> is set to the skip of the first
   * Cell holding a command, and <code>uniformCnt</code> to 1 plus the number
   * of further Cells found with the same command.
   *
   * @param eqSkip when <tt>true</tt>, Cells with the same command must also
   *          have the same skip to be considered identical
   * @return the shared command, or -1 if any Cell holds a reference, if two
   *         Cells differ, or if no Cell holds a command
   */
  public int uniformCmd(boolean eqSkip) {
    int ret = -1;
    uniformCnt = 1;
    uniformSkip = 0;
    for (Cell c : cells.values()) {
      if (c.ref >= 0) {
        return -1;
      }
      if (c.cmd >= 0) {
        if (ret < 0) {
          ret = c.cmd;
          uniformSkip = c.skip;
        } else if (ret == c.cmd) {
          if (eqSkip) {
            if (uniformSkip == c.skip) {
              uniformCnt++;
            } else {
              return -1;
            }
          } else {
            uniformCnt++;
          }
        } else {
          return -1;
        }
      }
    }
    return ret;
  }

  /**
   * Write the contents of this Row to stdout.
   */
  public void print() {
    for (Iterator<Character> i = cells.keySet().iterator(); i.hasNext();) {
      Character ch = i.next();
      Cell c = at(ch);
      System.out.print("[" + ch + ":" + c + "]");
    }
    System.out.println();
  }

  /**
   * Return the Cell stored for the given Character, or <tt>null</tt> if
   * there is none.
   */
  Cell at(Character index) {
    return cells.get(index);
  }
}

View File

@ -0,0 +1,419 @@
/*
Egothor Software License version 1.00
Copyright (C) 1997-2004 Leo Galambos.
Copyright (C) 2002-2004 "Egothor developers"
on behalf of the Egothor Project.
All rights reserved.
This software is copyrighted by the "Egothor developers". If this
license applies to a single file or document, the "Egothor developers"
are the people or entities mentioned as copyright holders in that file
or document. If this license applies to the Egothor project as a
whole, the copyright holders are the people or entities mentioned in
the file CREDITS. This file can be found in the same location as this
license in the distribution.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, the list of contributors, this list of conditions, and the
following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, the list of contributors, this list of conditions, and the
disclaimer that follows these conditions in the documentation
and/or other materials provided with the distribution.
3. The name "Egothor" must not be used to endorse or promote products
derived from this software without prior written permission. For
written permission, please contact Leo.G@seznam.cz
4. Products derived from this software may not be called "Egothor",
nor may "Egothor" appear in their name, without prior written
permission from Leo.G@seznam.cz.
In addition, we request that you include in the end-user documentation
provided with the redistribution and/or in the software itself an
acknowledgement equivalent to the following:
"This product includes software developed by the Egothor Project.
http://egothor.sf.net/"
THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
IN NO EVENT SHALL THE EGOTHOR PROJECT OR ITS CONTRIBUTORS BE LIABLE
FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE
OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN
IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
This software consists of voluntary contributions made by many
individuals on behalf of the Egothor Project and was originally
created by Leo Galambos (Leo.G@seznam.cz).
*/
package org.egothor.stemmer;
import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
/**
 * A Trie is used to store a dictionary of words and their stems.
 * <p>
 * Actually, what is stored are words with their respective patch commands. A
 * trie can be termed forward (keys read from left to right) or backward (keys
 * read from right to left). This property will vary depending on the language
 * for which a Trie is constructed.
 */
public class Trie {
  List<Row> rows = new ArrayList<Row>();
  List<CharSequence> cmds = new ArrayList<CharSequence>();
  int root;

  boolean forward = false;

  /**
   * Constructor for the Trie object. Reads the layout written by
   * {@link #store(DataOutput)}: direction, root index, the patch commands and
   * then the rows.
   *
   * @param is the input stream
   * @exception IOException if an I/O error occurs
   */
  public Trie(DataInput is) throws IOException {
    forward = is.readBoolean();
    root = is.readInt();
    for (int i = is.readInt(); i > 0; i--) {
      cmds.add(is.readUTF());
    }
    for (int i = is.readInt(); i > 0; i--) {
      rows.add(new Row(is));
    }
  }

  /**
   * Constructor for the Trie object. Creates a Trie holding a single empty
   * root row.
   *
   * @param forward <tt>true</tt> if read left to right, <tt>false</tt> if read
   *          right to left
   */
  public Trie(boolean forward) {
    rows.add(new Row());
    root = 0;
    this.forward = forward;
  }

  /**
   * Constructor for the Trie object. The given lists are used as they are,
   * not copied.
   *
   * @param forward <tt>true</tt> if read left to right, <tt>false</tt> if read
   *          right to left
   * @param root index of the row that is the root node
   * @param cmds the patch commands to store
   * @param rows the rows of this Trie; each Row is a node
   */
  public Trie(boolean forward, int root, List<CharSequence> cmds, List<Row> rows) {
    this.rows = rows;
    this.cmds = cmds;
    this.root = root;
    this.forward = forward;
  }

  /**
   * Return all distinct patch commands found on the path associated with the
   * given key, in the order they are met.
   *
   * @param key the key
   * @return the patch commands, or <tt>null</tt> if none was found (always
   *         the case for an empty key)
   */
  public CharSequence[] getAll(CharSequence key) {
    if (key.length() == 0) {
      // No character to look up; same answer as getFully gives.
      return null;
    }

    int res[] = new int[key.length()];
    int resc = 0;
    Row now = getRow(root);
    StrEnum e = new StrEnum(key, forward);
    boolean br = false;

    for (int i = 0; i < key.length() - 1; i++) {
      Character ch = Character.valueOf(e.next());
      resc = addUnique(res, resc, now.getCmd(ch));

      int w = now.getRef(ch);
      if (w >= 0) {
        now = getRow(w);
      } else {
        br = true;
        break;
      }
    }
    if (br == false) {
      // The whole path exists: the last character only contributes a command.
      resc = addUnique(res, resc, now.getCmd(Character.valueOf(e.next())));
    }

    if (resc < 1) {
      return null;
    }
    CharSequence R[] = new CharSequence[resc];
    for (int j = 0; j < resc; j++) {
      R[j] = cmds.get(res[j]);
    }
    return R;
  }

  /**
   * Append the command index to the first <code>count</code> entries of
   * <code>res</code> unless it is below 0 or already present.
   *
   * @return the new number of used entries
   */
  private static int addUnique(int res[], int count, int cmd) {
    if (cmd < 0) {
      return count;
    }
    for (int j = 0; j < count; j++) {
      if (res[j] == cmd) {
        return count;
      }
    }
    res[count] = cmd;
    return count + 1;
  }

  /**
   * Return the number of cells in this Trie object.
   *
   * @return the number of cells
   */
  public int getCells() {
    int size = 0;
    for (Row row : rows)
      size += row.getCells();
    return size;
  }

  /**
   * Return the number of cells of this Trie that reference another row.
   *
   * @return the number of referencing cells
   */
  public int getCellsPnt() {
    int size = 0;
    for (Row row : rows)
      size += row.getCellsPnt();
    return size;
  }

  /**
   * Return the number of cells of this Trie that hold a patch command.
   *
   * @return the number of cells holding a command
   */
  public int getCellsVal() {
    int size = 0;
    for (Row row : rows)
      size += row.getCellsVal();
    return size;
  }

  /**
   * Return the element that is stored in a cell associated with the given key.
   *
   * @param key the key
   * @return the associated element, or <tt>null</tt> if the key does not
   *         lead to a cell holding a command
   */
  public CharSequence getFully(CharSequence key) {
    Row now = getRow(root);
    int cmd = -1;
    StrEnum e = new StrEnum(key, forward);

    for (int i = 0; i < key.length();) {
      Character ch = Character.valueOf(e.next());
      i++;

      Cell c = now.at(ch);
      if (c == null) {
        return null;
      }

      cmd = c.cmd;

      // Consume the characters this cell says to skip; the key must be long
      // enough to provide them.
      for (int skip = c.skip; skip > 0; skip--) {
        if (i < key.length()) {
          e.next();
        } else {
          return null;
        }
        i++;
      }

      int w = now.getRef(ch);
      if (w >= 0) {
        now = getRow(w);
      } else if (i < key.length()) {
        return null;
      }
    }
    return (cmd == -1) ? null : cmds.get(cmd);
  }

  /**
   * Return the element that is stored as last on a path associated with the
   * given key.
   *
   * @param key the key associated with the desired element
   * @return the last on path element, or <tt>null</tt> if there is none
   *         (always the case for an empty key)
   */
  public CharSequence getLastOnPath(CharSequence key) {
    if (key.length() == 0) {
      // No character to look up; same answer as getFully gives.
      return null;
    }

    Row now = getRow(root);
    int w;
    CharSequence last = null;
    StrEnum e = new StrEnum(key, forward);

    for (int i = 0; i < key.length() - 1; i++) {
      Character ch = Character.valueOf(e.next());
      w = now.getCmd(ch);
      if (w >= 0) {
        last = cmds.get(w);
      }
      w = now.getRef(ch);
      if (w >= 0) {
        now = getRow(w);
      } else {
        return last;
      }
    }
    w = now.getCmd(Character.valueOf(e.next()));
    return (w >= 0) ? cmds.get(w) : last;
  }

  /**
   * Return the Row at the given index.
   *
   * @param index the index containing the desired Row
   * @return the Row, or <tt>null</tt> if the index is out of range
   */
  private Row getRow(int index) {
    if (index < 0 || index >= rows.size()) {
      return null;
    }
    return rows.get(index);
  }

  /**
   * Write this Trie to the given output stream.
   *
   * @param os the output stream
   * @exception IOException if an I/O error occurs
   */
  public void store(DataOutput os) throws IOException {
    os.writeBoolean(forward);
    os.writeInt(root);
    os.writeInt(cmds.size());
    for (CharSequence cmd : cmds)
      os.writeUTF(cmd.toString());

    os.writeInt(rows.size());
    for (Row row : rows)
      row.store(os);
  }

  /**
   * Add the given key associated with the given patch command. If either
   * parameter is null or empty this method will return without executing.
   *
   * @param key the key
   * @param cmd the patch command
   */
  public void add(CharSequence key, CharSequence cmd) {
    if (key == null || cmd == null) {
      return;
    }
    if (cmd.length() == 0) {
      return;
    }
    if (key.length() == 0) {
      // A command is stored in the cell of the key's last character, so an
      // empty key has no place to go.
      return;
    }

    int id_cmd = cmds.indexOf(cmd);
    if (id_cmd == -1) {
      id_cmd = cmds.size();
      cmds.add(cmd);
    }

    int node = root;
    Row r = getRow(node);

    StrEnum e = new StrEnum(key, forward);

    // Walk (and create where missing) one row per character except the last.
    for (int i = 0; i < e.length() - 1; i++) {
      Character ch = Character.valueOf(e.next());
      node = r.getRef(ch);
      if (node >= 0) {
        r = getRow(node);
      } else {
        node = rows.size();
        Row n = new Row();
        rows.add(n);
        r.setRef(ch, node);
        r = n;
      }
    }
    r.setCmd(Character.valueOf(e.next()), id_cmd);
  }

  /**
   * Reduce this Trie with the given Reduce object and return the result.
   *
   * @param by the Reduce object whose <code>optimize</code> method is applied
   *          to this Trie
   * @return the newly reduced Trie
   */
  public Trie reduce(Reduce by) {
    return by.optimize(this);
  }

  /**
   * Write the number of rows, commands and cells of this Trie to stdout.
   *
   * @param prefix text printed in front of the numbers
   */
  public void printInfo(CharSequence prefix) {
    System.out.println(prefix + "nds " + rows.size() + " cmds " + cmds.size()
        + " cells " + getCells() + " valcells " + getCellsVal() + " pntcells "
        + getCellsPnt());
  }

  /**
   * Enumerates the characters of a sequence one by one, from the first to the
   * last or from the last to the first.
   */
  class StrEnum {
    CharSequence s;
    int from;
    int by;

    /**
     * Constructor for the StrEnum object
     *
     * @param s the sequence to enumerate
     * @param up <tt>true</tt> to start at the first character and move
     *          right, <tt>false</tt> to start at the last character and move
     *          left
     */
    StrEnum(CharSequence s, boolean up) {
      this.s = s;
      if (up) {
        from = 0;
        by = 1;
      } else {
        from = s.length() - 1;
        by = -1;
      }
    }

    /** Return the length of the enumerated sequence. */
    int length() {
      return s.length();
    }

    /** Return the character at the current position and move on. */
    char next() {
      char ch = s.charAt(from);
      from += by;
      return ch;
    }
  }
}

View File

@ -0,0 +1,458 @@
<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN">
<html>
<head>
<meta content="text/html; charset=UTF-8" http-equiv="content-type">
<title>Stempel - Algorithmic Stemmer for Polish Language</title>
<meta content="Andrzej Bialecki" name="author">
<meta name="keywords"
content="stemming, stemmer, algorithmic stemmer, Polish stemmer">
<meta
content="This page describes a software package consisting of high-quality stemming tables for Polish, and a universal algorithmic stemmer, which operates using these tables."
name="description">
</head>
<body style="font-family: Arial,SansSerif;">
<h1><i>Stempel</i> - Algorithmic Stemmer for Polish Language</h1>
<h2>Introduction</h2>
<p>A method for conflation of different inflected word forms is an
important component of many Information Retrieval systems. It helps to
improve the system's recall and can significantly reduce the index
size. This is especially true for highly-inflectional languages like
those from the Slavic language family (Czech, Slovak, Polish, Russian,
Bulgarian, etc).</p>
<p>This page describes a software package consisting of high-quality
stemming tables for Polish, and a universal algorithmic stemmer, which
operates using these tables. The stemmer code is taken virtually
unchanged from the <a href="http://www.egothor.org">Egothor project</a>.</p>
<p>The software distribution includes stemmer
tables prepared using an extensive corpus of Polish language (see
details below).</p>
<p>This work is available under an Apache-style Open Source license - the
stemmer code is covered by the Egothor License, the tables and other
additions are covered by the Apache License 2.0. Both licenses allow the
code to be used in Open Source as well as commercial (closed source) projects.</p>
<h3>Terminology</h3>
<p>A short explanation is in order about the terminology used in this
text.</p>
<p>In the following sections I make a distinction between <b>stem</b>
and <b>lemma</b>.</p>
<p>Lemma is a base grammatical form (dictionary form, headword) of a
word. Lemma is an existing, grammatically correct word in some human
language.</p>
<p>Stem on the other hand is just a unique token, not necessarily
making any sense in any human language, but which can serve as a unique
label instead of lemma for the same set of inflected forms. Quite often
stem is referred to as a "root" of the word - which is incorrect and
misleading (stems sometimes have very little to do with the linguistic
root of a word, i.e. a pattern found in a word which is common to all
inflected forms or within a family of languages).</p>
<p>For an IR system stems are usually sufficient, for a morphological
analysis system obviously lemmas are a must. In practice, various
stemmers produce a mix of stems and lemmas, as is the case with the
stemmer described here. Additionally, for some languages, which use
suffix-based inflection rules many stemmers based on suffix-stripping
will produce a large percentage of stems equivalent to lemmas. This is
however not the case for languages with complex, irregular inflection
rules (such as Slavic languages) - here simplistic suffix-stripping
stemmers produce very poor results.</p>
<h3>Background</h3>
<p>Lemmatization is a process of finding the base, non-inflected form
of a word. The result of lemmatization is a correct existing word,
often in nominative case for nouns and infinitive form for verbs. A
given inflected form may correspond to several lemmas (e.g. "found"
-&gt; find, found) - the correct choice depends on the context.<br>
<br>
Stemming is concerned mostly with finding a unique "root" of a word,
which does not necessarily result in any existing word or lemma. The
quality of stemming is measured by the rate of collisions (overstemming
- which causes words with different lemmas to be incorrectly conflated
into one "root"), and the rate of superfluous word "roots"
(understemming - which assigns several "roots" to words with the same
lemma). <br>
<br>
Both stemmer and lemmatizer can be implemented in various ways. The two
most common approaches are:<br>
</p>
<ul>
<li>dictionary-based: where the stemmer uses an extensive dictionary
of morphological forms in order to find the corresponding stem or lemma</li>
<li>algorithmic: where the stemmer uses an algorithm, based on
general morphological properties of a given language plus a set of
heuristic rules<br>
</li>
</ul>
There are many existing and well-known implementations of stemmers for
English (Porter, Lovins, Krovetz) and other European languages
(<a href="http://snowball.tartarus.org">Snowball</a>). There are also
good quality commercial lemmatizers for Polish. However, there is only
one
freely available Polish stemmer, implemented by
<a
href="http://www.cs.put.poznan.pl/dweiss/xml/projects/lametyzator/index.xml?lang=en">Dawid
Weiss</a>, based on the "ispell" dictionary and Jan Daciuk's <a
href="http://www.eti.pg.gda.pl/%7Ejandac/">FSA package</a>. That
stemmer is dictionary-based. This means that even
though it can achieve
perfect accuracy for previously known word forms found in its
dictionary, it
completely fails in case of all other word forms. This deficiency is
somewhat mitigated by the comprehensive dictionary distributed with
this stemmer (so there is a high probability that most of the words in
the input text will be found in the dictionary), however the problem
still remains (please see the page above for more detailed description).<br>
<br>
The implementation described here uses an algorithmic method. This
method
and particular algorithm implementation are described in detail in
[1][2].
The main advantage of algorithmic stemmers is their ability to process
previously
unseen word forms with high accuracy. This particular algorithm uses a
set
of
transformation rules (patch commands), which describe how a word with a
given pattern should be transformed to its stem. These rules are first
learned from a training corpus. They don't
cover
all possible cases, so there is always some loss of precision/recall
(which
means that even the words from the training corpus are sometimes
incorrectly stemmed).<br>
<h2>Algorithm and implementation</h2>
The algorithm and its Java implementation are described in detail in the
publications cited below. Here's just a short excerpt from [2]:<br>
<br>
<center>
<div style="width: 80%;" align="justify">"The aim is separation of the
stemmer execution code from the data
structures [...]. In other words, a static algorithm configurable by
data must be developed. The word transformations that happen in the
stemmer must be then encoded to the data tables.<br>
<br>
The tacit input of our method is a sample set (a so-called dictionary)
of words (as keys) and their stems. Each record can be equivalently
stored as a key and the record of key's transformation to its
respective stem. The transformation record is termed a patch command
(P-command). It must be ensured that P-commands are universal, and that
P-commands can transform any word to its stem. Our solution[6,8] is
based on the Levenstein metric [10], which produces P-command as the
minimum cost path in a directed graph.<br>
<br>
One can imagine the P-command as an algorithm for an operator (editor)
that rewrites a string to another string. The operator can use these
instructions (PP-command's): <span style="font-weight: bold;">removal </span>-
deletes a sequence of characters starting at the current cursor
position and moves the cursor to the next character. The length of this
sequence is the parameter; <span style="font-weight: bold;">insertion </span>-
inserts a character ch, without moving the cursor. The character ch is
a parameter; <span style="font-weight: bold;">substitution&nbsp;</span>
- rewrites a character at the current cursor position to the character
ch and moves the cursor to the next character. The character ch is a
parameter; <span style="font-weight: bold;">no operation</span> (NOOP)
- skip a sequence of characters starting at the current cursor
position. The length of this sequence is the parameter.<br>
<br>
The P-commands are applied from the end of a word (right to left). This
assumption can reduce the set of P-command's, because the last NOOP,
moving the cursor to the end of a string without any changes, need not
be stored."</div>
</center>
<br>
Data structure used to keep the dictionary (words and their P-commands)
is a trie. Several optimization steps are applied in turn to reduce and
optimize the initial trie, by eliminating useless information and
shortening the paths in the trie.<br>
<br>
Finally, in order to obtain a stem from the input word, the word is
passed once through a matching path in the trie (applying at each node
the P-commands stored there). The result is a word stem.<br>
<h2>Corpus</h2>
<p><i>(to be completed...)</i></p>
<p>The following Polish corpora have been used:</p>
<ul>
<li><a
href="http://sourceforge.net/project/showfiles.php?group_id=49316&amp;package_id=65354">Polish
dictionary
from ispell distribution</a></li>
<li><a href="http://www.mimuw.edu.pl/polszczyzna/">Wzbogacony korpus
słownika frekwencyjnego</a></li>
<!--<li><a href="http://www.korpus.pl">Korpus IPI PAN</a></li>-->
<!--<li>The Bible (so called "Warsaw Bible" or "Brytyjka")</li>--><li>The
Bible (so called "Tysiąclecia") - unauthorized electronic version</li>
<li><a
href="http://www.mimuw.edu.pl/polszczyzna/Debian/sam34_3.4a.02-1_i386.deb">Analizator
morfologiczny SAM v. 3.4</a> - this was used to recover lemmas
missing from other texts</li>
</ul>
<p>This step was the most time-consuming - and it would probably be
even more tedious and difficult if not for the
help of
<a href="http://www.python.org/">Python</a>. The source texts had to be
brought to a common encoding (UTF-8) - some of them used quite ancient
encodings like Mazovia or DHN - and then scripts were written to
collect all lemmas and
inflected forms from the source texts. In cases when the source text
was not
tagged,
I used the SAM analyzer to produce lemmas. In cases of ambiguous
lemmatization I decided to put references to inflected forms from all
base forms.<br>
</p>
<p>All grammatical categories were allowed to appear in the corpus,
i.e. nouns, verbs, adjectives, numerals, and pronouns. The resulting
corpus consisted of roughly 87,000+ inflection sets, i.e. each set
consisted of one base form (lemma) and many inflected forms. However,
because of the nature of the training method I restricted these sets to
include only those where there were at least 4 inflected forms. Sets
with 3 or fewer inflected forms were removed, so that the final corpus
consisted of ~69,000 unique sets, which in turn contained ~1.5 mln
inflected forms. <br>
</p>
<h2>Testing</h2>
<p>I tested the stemmer tables produced using the implementation
described above. The following sections give some details about
the testing setup.
</p>
<h3>Testing procedure</h3>
<p>The testing procedure was as follows:
</p>
<ul>
<li>the whole corpus of ~69,000 unique sets was shuffled, so that the
input sets were in random order.</li>
<li>the corpus was split into two parts - one with 30,000 sets (Part
1), the other with ~39,000 sets (Part 2).</li>
<li>Training samples were drawn in sequential order from the Part 1.
Since the sets were already randomized, the training samples were also
randomized, but this procedure ensured that each larger training sample
contained all smaller samples.</li>
<li>Part 2 was used for testing. Note: this means that the testing
run used <em>only</em> words previously unseen during the training
phase. This is the worst scenario, because it means that the stemmer must
extrapolate the learned rules to unknown cases. This also means that in
a real-life case (where the input is a mix between known and unknown
words) the F-measure of the stemmer will be even higher than in the
table below.</li>
</ul>
<h3>Test results</h3>
<p>The following table summarizes test results for varying sizes
of training samples. The meaning of the table columns is
described below:
</p>
<ul>
<li><b>training sets:</b> the number of training sets. One set
consists of one lemma and at least 4 and up to ~80 inflected forms
(including pre- and suffixed forms).</li>
<li><b>testing forms:</b> the number of testing forms. Only inflected
forms were used in testing.</li>
<li><b>stem OK:</b> the number of cases when produced output was a
correct (unique) stem. Note: quite often correct stems were also
correct lemmas.</li>
<li><b>lemma OK:</b> the number of cases when produced output was a
correct lemma.</li>
<li><b>missing:</b> the number of cases when stemmer was unable to
provide any output.</li>
<li><b>stem bad:</b> the number of cases when produced output was a
stem, but already in use identifying a different set.</li>
<li><b>lemma bad:</b> the number of cases when produced output was an
incorrect lemma. Note: quite often in such case the output was a
correct stem.</li>
<li><b>table size:</b> the size in bytes of the stemmer table.</li>
</ul>
<div align="center">
<table border="1" cellpadding="2" cellspacing="0">
<tbody>
<tr bgcolor="#a0b0c0">
<th>Training sets</th>
<th>Testing forms</th>
<th>Stem OK</th>
<th>Lemma OK</th>
<th>Missing</th>
<th>Stem Bad</th>
<th>Lemma Bad</th>
<th>Table size [B]</th>
</tr>
<tr align="right">
<td>100</td>
<td>1022985</td>
<td>842209</td>
<td>593632</td>
<td>172711</td>
<td>22331</td>
<td>256642</td>
<td>28438</td>
</tr>
<tr align="right">
<td>200</td>
<td>1022985</td>
<td>862789</td>
<td>646488</td>
<td>153288</td>
<td>16306</td>
<td>223209</td>
<td>48660</td>
</tr>
<tr align="right">
<td>500</td>
<td>1022985</td>
<td>885786</td>
<td>685009</td>
<td>130772</td>
<td>14856</td>
<td>207204</td>
<td>108798</td>
</tr>
<tr align="right">
<td>700</td>
<td>1022985</td>
<td>909031</td>
<td>704609</td>
<td>107084</td>
<td>15442</td>
<td>211292</td>
<td>139291</td>
</tr>
<tr align="right">
<td>1000</td>
<td>1022985</td>
<td>926079</td>
<td>725720</td>
<td>90117</td>
<td>14941</td>
<td>207148</td>
<td>183677</td>
</tr>
<tr align="right">
<td>2000</td>
<td>1022985</td>
<td>942886</td>
<td>746641</td>
<td>73429</td>
<td>14903</td>
<td>202915</td>
<td>313516</td>
</tr>
<tr align="right">
<td>5000</td>
<td>1022985</td>
<td>954721</td>
<td>759930</td>
<td>61476</td>
<td>14817</td>
<td>201579</td>
<td>640969</td>
</tr>
<tr align="right">
<td>7000</td>
<td>1022985</td>
<td>956165</td>
<td>764033</td>
<td>60364</td>
<td>14620</td>
<td>198588</td>
<td>839347</td>
</tr>
<tr align="right">
<td>10000</td>
<td>1022985</td>
<td>965427</td>
<td>775507</td>
<td>50797</td>
<td>14662</td>
<td>196681</td>
<td>1144537</td>
</tr>
<tr align="right">
<td>12000</td>
<td>1022985</td>
<td>967664</td>
<td>782143</td>
<td>48722</td>
<td>14284</td>
<td>192120</td>
<td>1313508</td>
</tr>
<tr align="right">
<td>15000</td>
<td>1022985</td>
<td>973188</td>
<td>788867</td>
<td>43247</td>
<td>14349</td>
<td>190871</td>
<td>1567902</td>
</tr>
<tr align="right">
<td>17000</td>
<td>1022985</td>
<td>974203</td>
<td>791804</td>
<td>42319</td>
<td>14333</td>
<td>188862</td>
<td>1733957</td>
</tr>
<tr align="right">
<td>20000</td>
<td>1022985</td>
<td>976234</td>
<td>791554</td>
<td>40058</td>
<td>14601</td>
<td>191373</td>
<td>1977615</td>
</tr>
</tbody>
</table>
</div>
<p>I also measured the time to produce a stem (which involves
traversing a trie,
retrieving a patch command and applying the patch command to the input
string).
On a machine running Windows XP (Pentium 4, 1.7 GHz, JDK 1.4.2_03
HotSpot),
for tables ranging in size from 1,000 to 20,000 cells, the time to
produce a
single stem varies between 5-10 microseconds.<br>
</p>
<p>This means that the stemmer can process up to <span
style="font-weight: bold;">200,000 words per second</span>, an
outstanding result when compared to other stemmers (Morfeusz - ~2,000
w/s, FormAN (MS Word analyzer) - ~1,000 w/s).<br>
</p>
<p>The package contains a class <code>org.getopt.stempel.Benchmark</code>,
which you can use to produce reports
like the one below:<br>
</p>
<pre>--------- Stemmer benchmark report: -----------<br>Stemmer table: /res/tables/stemmer_2000.out<br>Input file: ../test3.txt<br>Number of runs: 3<br><br> RUN NUMBER: 1 2 3<br> Total input words 1378176 1378176 1378176<br> Missed output words 112 112 112<br> Time elapsed [ms] 6989 6940 6640<br> Hit rate percent 99.99% 99.99% 99.99%<br> Miss rate percent 00.01% 00.01% 00.01%<br> Words per second 197192 198584 207557<br> Time per word [us] 5.07 5.04 4.82<br></pre>
<h2>Summary</h2>
<p>The results of these tests are very encouraging. It seems that using
the
training corpus and the stemming algorithm described above results in a
high-quality stemmer useful for most applications. Moreover, it can
also
be used as a better than average lemmatizer.</p>
<p>Both the author of the implementation
(Leo Galambos, &lt;leo.galambos AT egothor DOT org&gt;) and the author
of this
compilation (Andrzej Bialecki &lt;ab AT getopt DOT org&gt;) would
appreciate any
feedback and suggestions for further improvements.</p>
<h2>Bibliography</h2>
<ol>
<li>Galambos, L.: Multilingual Stemmer in Web Environment, PhD
Thesis,
Faculty of Mathematics and Physics, Charles University in Prague, in
press.</li>
<li>Galambos, L.: Semi-automatic Stemmer Evaluation. International
Intelligent Information Processing and Web Mining Conference, 2004,
Zakopane, Poland.</li>
<li>Galambos, L.: Lemmatizer for Document Information Retrieval
Systems in JAVA.<span style="text-decoration: underline;"> </span><a
class="moz-txt-link-rfc2396E"
href="http://www.informatik.uni-trier.de/%7Eley/db/conf/sofsem/sofsem2001.html#Galambos01">&lt;http://www.informatik.uni-trier.de/%7Eley/db/conf/sofsem/sofsem2001.html#Galambos01&gt;</a>
SOFSEM 2001, Piestany, Slovakia. <br>
</li>
</ol>
<br>
<br>
</body>
</html>

View File

@ -0,0 +1,186 @@
# This file was created from the carrot2 project and is distributed under the BSD license.
# See http://project.carrot2.org/license.html
# Also see http://www.opensource.org/licenses/bsd-license.html
# From trunk/core/carrot2-util-text/src-resources/stopwords.pl
vol
o.o.
mgr
godz
www
pl
ul
tel
hab
prof
inż
dr
i
u
aby
albo
ale
ani
bardzo
bez
bo
bowiem
by
byli
bym
był
była
było
były
być
będzie
będą
chce
choć
co
coraz
coś
czy
czyli
często
dla
do
gdy
gdyby
gdyż
gdzie
go
ich
im
inne
ja
jak
jakie
jako
je
jednak
jednym
jedynie
jego
jej
jest
jeszcze
jeśli
jeżeli
już
kiedy
kilku
kto
która
które
którego
której
który
których
którym
którzy
lat
lecz
lub
ma
mają
mamy
mi
miał
mimo
mnie
mogą
może
można
mu
musi
na
nad
nam
nas
nawet
nic
nich
nie
niej
nim
niż
no
nowe
np
nr
o
od
ok
on
one
oraz
pan
po
pod
ponad
ponieważ
poza
przed
przede
przez
przy
raz
razie
roku
również
się
sobie
sposób
swoje
ta
tak
takich
takie
także
tam
te
tego
tej
temu
ten
teraz
też
to
trzeba
tu
tych
tylko
tym
tys
tzw
w
we
wie
więc
wszystko
wśród
właśnie
z
za
zaś
ze
że
żeby
ii
iii
iv
vi
vii
viii
ix
xi
xii
xiii
xiv
xv

View File

@ -0,0 +1,53 @@
package org.apache.lucene.analysis.pl;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.IOException;
import java.util.HashSet;
import java.util.Set;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
/**
 * Tests for {@link PolishAnalyzer}: construction with default resources,
 * stemming, stopword removal and the exclusion set.
 */
public class TestPolishAnalyzer extends BaseTokenStreamTestCase {

  /**
   * Only constructs the analyzer. Per the original note, this fails with an
   * NPE when the stopwords file is missing from the classpath.
   */
  public void testResourcesAvailable() {
    new PolishAnalyzer(TEST_VERSION_CURRENT);
  }

  /** Checks stemming of two inflected forms and removal of a stopword. */
  public void testBasics() throws IOException {
    final Analyzer analyzer = new PolishAnalyzer(TEST_VERSION_CURRENT);

    // two inflected forms are expected to come out as the same term
    checkOneTermReuse(analyzer, "studenta", "student");
    checkOneTermReuse(analyzer, "studenci", "student");

    // a stopword is expected to produce no tokens at all
    final String[] noTokens = new String[] {};
    assertAnalyzesTo(analyzer, "był", noTokens);
  }

  /**
   * Checks that a term placed in the exclusion set is returned unchanged
   * while other terms are still stemmed.
   */
  public void testExclude() throws IOException {
    final Set<String> excluded = new HashSet<String>();
    excluded.add("studenta");

    final Analyzer analyzer = new PolishAnalyzer(
        TEST_VERSION_CURRENT,
        PolishAnalyzer.getDefaultStopSet(),
        excluded);

    checkOneTermReuse(analyzer, "studenta", "studenta");
    checkOneTermReuse(analyzer, "studenci", "student");
  }
}

View File

@ -0,0 +1,153 @@
package org.egothor.stemmer;
/*
Egothor Software License version 1.00
Copyright (C) 1997-2004 Leo Galambos.
Copyright (C) 2002-2004 "Egothor developers"
on behalf of the Egothor Project.
All rights reserved.
This software is copyrighted by the "Egothor developers". If this
license applies to a single file or document, the "Egothor developers"
are the people or entities mentioned as copyright holders in that file
or document. If this license applies to the Egothor project as a
whole, the copyright holders are the people or entities mentioned in
the file CREDITS. This file can be found in the same location as this
license in the distribution.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, the list of contributors, this list of conditions, and the
following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, the list of contributors, this list of conditions, and the
disclaimer that follows these conditions in the documentation
and/or other materials provided with the distribution.
3. The name "Egothor" must not be used to endorse or promote products
derived from this software without prior written permission. For
written permission, please contact Leo.G@seznam.cz
4. Products derived from this software may not be called "Egothor",
nor may "Egothor" appear in their name, without prior written
permission from Leo.G@seznam.cz.
In addition, we request that you include in the end-user documentation
provided with the redistribution and/or in the software itself an
acknowledgement equivalent to the following:
"This product includes software developed by the Egothor Project.
http://egothor.sf.net/"
THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
IN NO EVENT SHALL THE EGOTHOR PROJECT OR ITS CONTRIBUTORS BE LIABLE
FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE
OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN
IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
This software consists of voluntary contributions made by many
individuals on behalf of the Egothor Project and was originally
created by Leo Galambos (Leo.G@seznam.cz).
*/
import java.io.BufferedInputStream;
import java.io.BufferedReader;
import java.io.DataInputStream;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileReader;
import java.io.IOException;
import java.io.LineNumberReader;
import java.net.URI;
import java.util.StringTokenizer;
import org.apache.lucene.util.LuceneTestCase;
/**
 * Runs {@link Compile} on the <code>testRules.txt</code> resource with
 * several parameter strings, loads the compiled table back and checks that
 * every word of the rules file is transformed to the first word of its line.
 */
public class TestCompile extends LuceneTestCase {

  public void testCompile() throws Exception {
    assertCompiles("test");
  }

  public void testCompileBackwards() throws Exception {
    assertCompiles("-test");
  }

  public void testCompileMulti() throws Exception {
    assertCompiles("Mtest");
  }

  /**
   * Compile the rules file with the given parameter string and verify the
   * resulting table with both lookup methods. The compiled
   * <code>.out</code> file is removed even when compilation or an assertion
   * fails, so a failing run does not leave it behind.
   *
   * @param params first argument handed to {@link Compile#main(String[])}
   */
  private void assertCompiles(String params) throws Exception {
    URI uri = getClass().getResource("testRules.txt").toURI();
    String path = uri.getPath();
    String compiled = path + ".out";
    try {
      Compile.main(new String[] {params, path});
      Trie trie = loadTrie(compiled);
      assertTrie(trie, path, true, true);
      assertTrie(trie, path, false, true);
    } finally {
      new File(compiled).delete();
    }
  }

  /**
   * Load a compiled table. The file starts with a UTF string; when that
   * string contains an 'M' (case-insensitively) the rest is read as a
   * {@link MultiTrie}, otherwise as a plain {@link Trie}.
   *
   * @param path location of the compiled table
   * @return the loaded trie
   * @throws IOException if the file cannot be read
   */
  static Trie loadTrie(String path) throws IOException {
    DataInputStream is = new DataInputStream(new BufferedInputStream(
        new FileInputStream(path)));
    try {
      String method = is.readUTF().toUpperCase();
      if (method.indexOf('M') < 0) {
        return new Trie(is);
      }
      return new MultiTrie(is);
    } finally {
      // close the stream even when reading the table fails
      is.close();
    }
  }

  /**
   * Check the trie against a rules file in which the first token of each
   * line is the stem and the remaining tokens are its other forms.
   *
   * @param trie the trie under test
   * @param file path of the rules file
   * @param usefull <code>true</code> to look commands up with
   *        <code>getFully</code>, <code>false</code> to use
   *        <code>getLastOnPath</code>
   * @param storeorig whether the stem itself must also map to the stem
   */
  private static void assertTrie(Trie trie, String file, boolean usefull,
      boolean storeorig) throws Exception {
    LineNumberReader in = new LineNumberReader(new BufferedReader(
        new FileReader(file)));
    try {
      for (String line = in.readLine(); line != null; line = in.readLine()) {
        try {
          line = line.toLowerCase();
          StringTokenizer st = new StringTokenizer(line);
          String stem = st.nextToken();
          if (storeorig) {
            assertStemsTo(trie, usefull, stem, stem);
          }
          while (st.hasMoreTokens()) {
            String token = st.nextToken();
            if (token.equals(stem)) {
              continue;
            }
            assertStemsTo(trie, usefull, token, stem);
          }
        } catch (java.util.NoSuchElementException x) {
          // no base token (stem) on a line
        }
      }
    } finally {
      // release the file handle whether or not an assertion failed
      in.close();
    }
  }

  /**
   * Look up the patch command for <code>word</code>, apply it and assert
   * that the result equals <code>stem</code>, ignoring case.
   */
  private static void assertStemsTo(Trie trie, boolean usefull, String word,
      String stem) throws Exception {
    CharSequence cmd = (usefull) ? trie.getFully(word) : trie
        .getLastOnPath(word);
    StringBuilder stm = new StringBuilder(word);
    Diff.apply(stm, cmd);
    assertEquals(stem.toLowerCase(), stm.toString().toLowerCase());
  }
}

View File

@ -0,0 +1,168 @@
package org.egothor.stemmer;
/*
Egothor Software License version 1.00
Copyright (C) 1997-2004 Leo Galambos.
Copyright (C) 2002-2004 "Egothor developers"
on behalf of the Egothor Project.
All rights reserved.
This software is copyrighted by the "Egothor developers". If this
license applies to a single file or document, the "Egothor developers"
are the people or entities mentioned as copyright holders in that file
or document. If this license applies to the Egothor project as a
whole, the copyright holders are the people or entities mentioned in
the file CREDITS. This file can be found in the same location as this
license in the distribution.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, the list of contributors, this list of conditions, and the
following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, the list of contributors, this list of conditions, and the
disclaimer that follows these conditions in the documentation
and/or other materials provided with the distribution.
3. The name "Egothor" must not be used to endorse or promote products
derived from this software without prior written permission. For
written permission, please contact Leo.G@seznam.cz
4. Products derived from this software may not be called "Egothor",
nor may "Egothor" appear in their name, without prior written
permission from Leo.G@seznam.cz.
In addition, we request that you include in the end-user documentation
provided with the redistribution and/or in the software itself an
acknowledgement equivalent to the following:
"This product includes software developed by the Egothor Project.
http://egothor.sf.net/"
THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
IN NO EVENT SHALL THE EGOTHOR PROJECT OR ITS CONTRIBUTORS BE LIABLE
FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE
OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN
IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
This software consists of voluntary contributions made by many
individuals on behalf of the Egothor Project and was originally
created by Leo Galambos (Leo.G@seznam.cz).
*/
import org.apache.lucene.util.LuceneTestCase;
/**
 * Builds small tries of each flavour (Trie, MultiTrie, MultiTrie2; forward
 * and backward) and checks that every key still yields its value, both on
 * the trie as built and after each of the available reductions.
 */
public class TestStemmer extends LuceneTestCase {
  private static final String[] KEYS = {"a", "ba", "bb", "c"};
  private static final String[] VALS = {"1", "2", "2", "4"};
  /*
   * Values used for MultiTrie2. Original note: short vals won't work, see
   * line 155 for example (presumably of MultiTrie2 - TODO confirm); the
   * IOOBE is caught (weird), but shouldn't affect patch cmds?
   */
  private static final String[] LONG_VALS = {"1111", "2222", "2223", "4444"};

  public void testTrie() {
    Trie trie = populate(new Trie(true), VALS);
    assertEquals(0, trie.root);
    assertEquals(2, trie.rows.size());
    assertEquals(3, trie.cmds.size());
    assertTrieContents(trie, KEYS, VALS);
  }

  public void testTrieBackwards() {
    assertTrieContents(populate(new Trie(false), VALS), KEYS, VALS);
  }

  public void testMultiTrie() {
    assertTrieContents(populate(new MultiTrie(true), VALS), KEYS, VALS);
  }

  public void testMultiTrieBackwards() {
    assertTrieContents(populate(new MultiTrie(false), VALS), KEYS, VALS);
  }

  public void testMultiTrie2() {
    assertTrieContents(populate(new MultiTrie2(true), LONG_VALS), KEYS,
        LONG_VALS);
  }

  public void testMultiTrie2Backwards() {
    assertTrieContents(populate(new MultiTrie2(false), LONG_VALS), KEYS,
        LONG_VALS);
  }

  /**
   * Add every entry of {@link #KEYS} to the trie, paired with the value at
   * the same index, and hand the trie back.
   */
  private static Trie populate(Trie target, String[] values) {
    for (int k = 0; k < KEYS.length; k++) {
      target.add(KEYS[k], values[k]);
    }
    return target;
  }

  /**
   * Assert that both lookup methods return the expected value for every
   * key, on the given trie and on the result of each reduction of it.
   */
  private static void assertTrieContents(Trie trie, String keys[], String vals[]) {
    // All reductions are computed up front, before anything is asserted.
    Trie[] variants = {
        trie,
        trie.reduce(new Optimizer()),
        trie.reduce(new Optimizer2()),
        trie.reduce(new Gener()),
        trie.reduce(new Lift(true)),
        trie.reduce(new Lift(false))
    };

    for (Trie variant : variants) {
      for (int k = 0; k < keys.length; k++) {
        final String key = keys[k];
        final String expected = vals[k];
        assertEquals(expected, variant.getFully(key).toString());
        assertEquals(expected, variant.getLastOnPath(key).toString());
      }
    }
  }
}

View File

@ -0,0 +1,4 @@
act acted acting actor
walk walked walking
wander wandered wanderer
want wanted wanting

View File

@ -141,6 +141,9 @@ document.write("Last Published: " + document.lastModified);
<a href="api/contrib-smartcn/index.html">Smart Chinese Analyzer</a>
</div>
<div class="menuitem">
<a href="api/contrib-stempel/index.html">Stempel Polish Analyzer</a>
</div>
<div class="menuitem">
<a href="api/contrib-ant/index.html">Ant</a>
</div>
<div class="menuitem">

View File

@ -141,6 +141,9 @@ document.write("Last Published: " + document.lastModified);
<a href="api/contrib-smartcn/index.html">Smart Chinese Analyzer</a>
</div>
<div class="menuitem">
<a href="api/contrib-stempel/index.html">Stempel Polish Analyzer</a>
</div>
<div class="menuitem">
<a href="api/contrib-ant/index.html">Ant</a>
</div>
<div class="menuitem">

View File

@ -141,6 +141,9 @@ document.write("Last Published: " + document.lastModified);
<a href="api/contrib-smartcn/index.html">Smart Chinese Analyzer</a>
</div>
<div class="menuitem">
<a href="api/contrib-stempel/index.html">Stempel Polish Analyzer</a>
</div>
<div class="menuitem">
<a href="api/contrib-ant/index.html">Ant</a>
</div>
<div class="menuitem">

View File

@ -141,6 +141,9 @@ document.write("Last Published: " + document.lastModified);
<a href="api/contrib-smartcn/index.html">Smart Chinese Analyzer</a>
</div>
<div class="menuitem">
<a href="api/contrib-stempel/index.html">Stempel Polish Analyzer</a>
</div>
<div class="menuitem">
<a href="api/contrib-ant/index.html">Ant</a>
</div>
<div class="menuitem">

View File

@ -141,6 +141,9 @@ document.write("Last Published: " + document.lastModified);
<a href="api/contrib-smartcn/index.html">Smart Chinese Analyzer</a>
</div>
<div class="menuitem">
<a href="api/contrib-stempel/index.html">Stempel Polish Analyzer</a>
</div>
<div class="menuitem">
<a href="api/contrib-ant/index.html">Ant</a>
</div>
<div class="menuitem">

View File

@ -141,6 +141,9 @@ document.write("Last Published: " + document.lastModified);
<a href="api/contrib-smartcn/index.html">Smart Chinese Analyzer</a>
</div>
<div class="menuitem">
<a href="api/contrib-stempel/index.html">Stempel Polish Analyzer</a>
</div>
<div class="menuitem">
<a href="api/contrib-ant/index.html">Ant</a>
</div>
<div class="menuitem">

View File

@ -141,6 +141,9 @@ document.write("Last Published: " + document.lastModified);
<a href="api/contrib-smartcn/index.html">Smart Chinese Analyzer</a>
</div>
<div class="menuitem">
<a href="api/contrib-stempel/index.html">Stempel Polish Analyzer</a>
</div>
<div class="menuitem">
<a href="api/contrib-ant/index.html">Ant</a>
</div>
<div class="menuitem">

View File

@ -139,6 +139,9 @@ document.write("Last Published: " + document.lastModified);
<a href="api/contrib-smartcn/index.html">Smart Chinese Analyzer</a>
</div>
<div class="menuitem">
<a href="api/contrib-stempel/index.html">Stempel Polish Analyzer</a>
</div>
<div class="menuitem">
<a href="api/contrib-ant/index.html">Ant</a>
</div>
<div class="menuitem">

View File

@ -139,6 +139,9 @@ document.write("Last Published: " + document.lastModified);
<a href="api/contrib-smartcn/index.html">Smart Chinese Analyzer</a>
</div>
<div class="menuitem">
<a href="api/contrib-stempel/index.html">Stempel Polish Analyzer</a>
</div>
<div class="menuitem">
<a href="api/contrib-ant/index.html">Ant</a>
</div>
<div class="menuitem">
@ -329,6 +332,12 @@ document.write("Last Published: " + document.lastModified);
</li>
</ul>
<ul>
<li>
<a href="api/contrib-stempel/index.html">Stempel Polish Analyzer</a>&nbsp;&nbsp;___________________&nbsp;&nbsp;<em>javadoc-contrib-stempel</em>
</li>
</ul>
<ul>
<li>
<a href="api/contrib-ant/index.html">Ant</a>&nbsp;&nbsp;___________________&nbsp;&nbsp;<em>javadoc-contrib-ant</em>

View File

@ -5,10 +5,10 @@
/Producer (FOP 0.20.5) >>
endobj
5 0 obj
<< /Length 1070 /Filter [ /ASCII85Decode /FlateDecode ]
<< /Length 1057 /Filter [ /ASCII85Decode /FlateDecode ]
>>
stream
Gau1.?#Q2d(kqGM/%BCKiqXP'9[ZpVeS1J@?*2RDJJ;8h)*"1u?U#K#YfU-GFDKA1)tf`Ap3AB!Y%R2UM2*O1hKgu1l78GlmpUr5-pPc\<?J+M&0?)^XnVMT2AbVg2og@in!r4pjjQ-A%3=cu^<l:LS)JZnPp.eKF7+JJf%8?<GmJW>L)ol6^`i+3elgiX$)[-\l.==%fb8IecEb%t=ZSD!C:buoU%L7QhMTHDEtXpA,O0\.p2p_&m)[&h64a>aq]3]CQ,+ZCVM0!qk[ChWq`$/B&qd=VpX>b0+?pIV49Pe,;mI@;*<L9c5nUoZ^n)e#%K<)ZpKP1GOiXSD.N4u`KHW75Ot(o4%#CqoL?2bMg8F,@Ia6E7PGoD`-;=04NhS-]Ld>Dl>@g\-O>bRcb6J.TVF>mZZu^pN;)BC_rJ\:eH?-Y(B.]@Aha>ZK<nKj\kn)&PZ;T3JrF@n`N8ZrY/I&^kgTdK;12@b48'b$KfVqLnM\DQqO.E.phGfV@\/%Lf4oR3JKU_Nqdl^O>/hY64(%<l>VX#>>r[MV-n?"QeO`%5a0gdO\Tmn1X>%&3b&,pb^(`(#r\S;*XocBcHC9)nS)&4g>dX#pGW2+DOQmO&pGr3%@=:)@c`U,R``NY'"ZFil#6ZA&@9^0\O@#3KEN73Q<j;WWK3?DV(f](=>4ri9T]I:b[S:l8@I_#a#:$:8p]M,3#=I"`-2<;f*pPCuRR+[ZSq<!a%T2-.e^c=7mrd`X)\1"=>_ooEG1RXN+ae4<c9ul`_i>bNrW#/\[pE/s]XH:3V%@*\r's2@0VTj3)(GA2-o!qqPGtUEWe^rp'S9_bknt?%f$g=U,KIju&&o$p&#urY7L<g1)@H8Wp`*-(0lr)o.;.6s-L;=_7kT!idPZeI[UYMR/Qg&''[[Y%j0S\BV(9PaVp$>1qSo,'o#l;>L*c5%&^s&iXD_q8Q>H2bS*'=B&3BY)@A_JjY0a9ti]T<-UGJ:+mpJS%Y(_52BR%^4-+lq@f4O['eMd7%_mFuA#IR^`rF'7nRUcn5KmGHW!c+_SO2d0GRY4Zk)J77ZV!@ip"n,~>
Gau1.d8&FM'Sc)R'Y/mTHF@<Fa^n\2#"%#j\uRP[+a$@!9tJ7pg7Eb:daiT"nIufT1j6h;J&ab9,bE:O+"kG2'**[\EWjoo2M/c_#R+%R:lqV97N_t>*6_L#pTrU@nFL%HUKHA5].'bX<P$`poCTaRmak*[4f^*L/('pn5D&^=FJ])U]sD9?UBiPEHFnH/Zq>qM4(m\uW\8Q!rLgXdY=6b0+KJT0Opbcs/^.'q/IX"8=LFmVA7F#,E*u0GG+*q,hQbsH9OA)A6+&8.5sY.khJ=rR.3o6^j.E56#=3MWLfjdG`,*OZ7Q1H^$A'CNTnPX!W"8m%c8I2`DlG/MC+*7V6BEJF2HnV`CWV*oR\3d^FeF0>nn)+qkJ:+Slkdn<W"5Ud15J6_+IhJ,ot]>@b8#R>a0nFFf5u'[.\fNSlM<O'Wggih'f+K/=o(4YERkHna;E<]GsYPgVRR]V-[Q8[Y\g)mG1rN.-%n>&i&]\kIDQKM;H]OB[Wq-rrTg><eA4!kbMh\=ZRfXqr/PK].U=,4AF9eLO%_A&pL+#cBjhuACZ'fJlJZH33\j?X:U]J?I:Oo.h_eeEk[jA9!"RP+JBCel%8<e/r;DC0K8W!+)C)M(E^Vl^[>U)'K[.0=?)<P-O76:Y<4q$7c9[17?24cQBss(BG_eo)!SEj]&[mQKX1[u]`1=lVK!3b<e.I//bG*6g$kUHK;gbJC/^9IZpMdhHRChEQ.P\FE&p>gTRUj928).s*\enbNWXhX'9E(=IIc2q[N8<l&nr4^r2dh4[@#W_J0-GER5"b$6i[M(:>Sb6+@*gT,:K>TuKN"@b:f6TIc(BlK]Tg1j%@*\r-*:a"9UhM85$mC>eQ9(qDb3d5qj..7B9@'/I+6uEYGtXE+tQX.(N3GQFfMZn:-^D+:@$qfoW((9G%W]YP:*(*l*qkO1h_-<*.8i%eaZbA6a/Wna#G9m3#s596,I;sKRcP*qlF-(DUeCEF71-H8<5HXbW%6<`aO.u:,?s=>;*s:g))W4oqko7#;KWa>PA?3)g'r=0pW?`B$1uUL]/+.45NX~>
endstream
endobj
6 0 obj
@ -20,10 +20,10 @@ endobj
>>
endobj
7 0 obj
<< /Length 995 /Filter [ /ASCII85Decode /FlateDecode ]
<< /Length 1045 /Filter [ /ASCII85Decode /FlateDecode ]
>>
stream
GatUs>u03/'Sc)R.sqX>p&#CLc6]-+dKIu@Q'aD*aeV\RRZB@H.K0!^^.=b,18Sr4%!ganf]O5%f?;j6"Tt,W"I?k4Jb9ERUjk*AeC'75Z_<@0Y`-E%M^LjD-(&2dOT*_J?'tLEMF8Z;IBM0>.[U1*r`k'4@/4)4g$_t$NFb,3'$E-..pg1Ep#gdmW%<=fa4fc`K:Q]*W76CL!:'MhAtb_)M4')T*Vt`,`aT>iF%AL24R/;5i=D(4192/I<ELD[,6h#?ZH,Eb%r&J?S0+S=IU1?^niLa$7)M#?c6+4MnNk0oP%U>=7Pu1^W**sl7VNCb[^Qu$'[_"rX>+S@U5t^$*b1_kE++)th1>Q`oJaao9Kde?]%jYQAaT>Q!L)ITpC:W;SkP#$ZlNToS=imm@uSKGCV@d`QtEBeWp,:GlmF_DqNQcT@t`=jmU.RD]Xr$976N^rQkbq9.Df\o9K`BR9[/[cVtLlre[1^)NXKZiJF&I$5NYSXD41uU[h!V6b=%mUc>h`!05WI3I#OL(=8>Ru'mg[ma^e*1kQOXR,MuHAA&Q8b^::bH9L_ld4:o,%)X7ganfj$'Hch7'MoG;jVRE8A\7ldnLO%5PO.gSNGW1#68l\6[VY-1pP[$pIB+A5!2EfSsV:k%@6VV=U_</;aq_IDPCL*6i5rE&trJ"l_H8Nl2NbadWIJr\rM%XaiFAMF9Bc'l\.BeHG:^J.Y*m`H91]8Mm(+im'%#_dG*90A-Y`0^A9mBuG)sW2VY'ts&%P;1N&ct(e^qg^oi`;+b@JP(:*.:aF2UOr1-C\+RNO>U-Nm.Mq"!]?H[53NO/n_felbk@dZ+bo3V?_>X+@m?j,RA#VZg6Ul#59j?`hT%*f7MdtRqlt[qNF[C(tG6G3]kAkSW=u=(j'U!\>9kYQ./O>4_E[L7?r23f_6c1E#TCBlo&%]a>UQ^$GRO,BqPq[l`;YcIBjL)["k3mU8SMnn%9<,IHS>e'l<X4+)rfDZi~>
GatUs>Ar4L'Z],,'R<q*nD\5Ra0lnMVH-k>UeY*k.1M#Dj8r()jkJ+l2)3="$Li-G3WG^mDYD>&np*\FLk+g8QP7tR9q^OH06@aPC6;?tQqNrplMUSPN"E[CV@ZX,rUf>l(F?>lWEYk,:Hqq+05(8Y8[&_^c96/%^_l;;J&8[6O[mkKcUo(i9Tqr>F[Alo[#EW>`5$UNdTdh_6UDl&`-$cDaL%5:;*kbSM)$\-OX+?^nm*sgQ?Wb&kX$Z$9$)r_I35s'e;'TVYo!uXWIiV7NZ#<sKFQE%!C)3>KcN&5pJ;Vl[N7ZGd)mqOK*7^d0GXIiY".pkj*H4A9NG6l4)BX>BD+m?\c&H>Y7qdAU-^/=Ea,sb2G.N*m9gTnEL;^*KF3q-'$$A.Vd\V-c&!HW<!>ap,$J&V)&muGpfs.GAKC&<J-e04Gd*78ebJj]!V;C*E.ZdlfGHd'3$Or7\]@0a8cSHDf-QG+pRR%uP<rb.rOBW.Rml'nrm<5%*[G1G;oT$WCUSuDjRmkg1u2VUD&8;aJFH6a+,WslrAGb:Fr.1\(Kl#L4,:kV74f`Ak8%h!S$W%$GnmqHcEkocfM`.hksCO#fuD+X=['HH(eob@A*;$SCStV'>b8okkbC8`AK$R>0P6m`/_:/m8n#<t*8=9r1<tWXU+Dc\f$Xb6-qgf^+(l(2TBVg+Y0?#'l-PS;Cb-WMeeQlr[C0d0)H6mQWd[YtWAV9H\)XJ%oR@p3JjWRnn!@!LFg0?uS$l;'N\.2c'4^P8Oc%HZ6&%sPO3f/0cDQY[;.CJ,`Ttp<S#32Cn,(gEqZLq!+BfjU#]PG;%q5O$h`MOip#us3C/T9bW?dG/FW,/)[<_P1kl8elRTX4eTF67;:U]pZc`[nRRR/&qUTlRZJ>a]HZ^a-ODQ8U<4N=0(%s.+4"E;]U`o'J@7tYYF\%PnDh0FY`Y*Y5Ji%sYsYX&C2=1A"f+QM!f=o770H[eL!p(h[Sem%`Id2U],0H/Y2^+B#P*3@BT?+($>D@Zr4,tW%)g;UiCC?E6e"mFk"?X2^'VZ~>
endstream
endobj
8 0 obj
@ -87,19 +87,19 @@ endobj
xref
0 14
0000000000 65535 f
0000003088 00000 n
0000003152 00000 n
0000003202 00000 n
0000003126 00000 n
0000003190 00000 n
0000003240 00000 n
0000000015 00000 n
0000000071 00000 n
0000001233 00000 n
0000001339 00000 n
0000002425 00000 n
0000002531 00000 n
0000002643 00000 n
0000002753 00000 n
0000002864 00000 n
0000002972 00000 n
0000001220 00000 n
0000001326 00000 n
0000002463 00000 n
0000002569 00000 n
0000002681 00000 n
0000002791 00000 n
0000002902 00000 n
0000003010 00000 n
trailer
<<
/Size 14
@ -107,5 +107,5 @@ trailer
/Info 4 0 R
>>
startxref
3324
3362
%%EOF

View File

@ -141,6 +141,9 @@ document.write("Last Published: " + document.lastModified);
<a href="../api/contrib-smartcn/index.html">Smart Chinese Analyzer</a>
</div>
<div class="menuitem">
<a href="../api/contrib-stempel/index.html">Stempel Polish Analyzer</a>
</div>
<div class="menuitem">
<a href="../api/contrib-ant/index.html">Ant</a>
</div>
<div class="menuitem">

View File

@ -141,6 +141,9 @@ document.write("Last Published: " + document.lastModified);
<a href="api/contrib-smartcn/index.html">Smart Chinese Analyzer</a>
</div>
<div class="menuitem">
<a href="api/contrib-stempel/index.html">Stempel Polish Analyzer</a>
</div>
<div class="menuitem">
<a href="api/contrib-ant/index.html">Ant</a>
</div>
<div class="menuitem">

View File

@ -141,6 +141,9 @@ document.write("Last Published: " + document.lastModified);
<a href="api/contrib-smartcn/index.html">Smart Chinese Analyzer</a>
</div>
<div class="menuitem">
<a href="api/contrib-stempel/index.html">Stempel Polish Analyzer</a>
</div>
<div class="menuitem">
<a href="api/contrib-ant/index.html">Ant</a>
</div>
<div class="menuitem">

View File

@ -139,6 +139,9 @@ document.write("Last Published: " + document.lastModified);
<a href="api/contrib-smartcn/index.html">Smart Chinese Analyzer</a>
</div>
<div class="menuitem">
<a href="api/contrib-stempel/index.html">Stempel Polish Analyzer</a>
</div>
<div class="menuitem">
<a href="api/contrib-ant/index.html">Ant</a>
</div>
<div class="menuitem">

View File

@ -54,6 +54,7 @@ See http://forrest.apache.org/docs/linking.html for more info
<javadoc-contrib label="Contrib">
<javadoc-contrib-analyzers label="Analyzers" href="ext:javadocs-contrib-analyzers"/>
<javadoc-contrib-smartcn label="Smart Chinese Analyzer" href="ext:javadocs-contrib-smartcn"/>
<javadoc-contrib-stempel label="Stempel Polish Analyzer" href="ext:javadocs-contrib-stempel"/>
<javadoc-contrib-ant label="Ant" href="ext:javadocs-contrib-ant"/>
<javadoc-contrib-bdb label="Bdb" href="ext:javadocs-contrib-bdb"/>
<javadoc-contrib-bdb-je label="Bdb-je" href="ext:javadocs-contrib-bdb-je"/>
@ -104,6 +105,7 @@ See http://forrest.apache.org/docs/linking.html for more info
<javadocs-demo href="api/demo/index.html"/>
<javadocs-contrib-analyzers href="api/contrib-analyzers/index.html"/>
<javadocs-contrib-smartcn href="api/contrib-smartcn/index.html"/>
<javadocs-contrib-stempel href="api/contrib-stempel/index.html"/>
<javadocs-contrib-ant href="api/contrib-ant/index.html"/>
<javadocs-contrib-bdb href="api/contrib-bdb/index.html"/>
<javadocs-contrib-bdb-je href="api/contrib-bdb-je/index.html"/>