mirror of https://github.com/apache/lucene.git
LUCENE-2298: Add stempel, an algorithmic stemmer with included Polish support
git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@940433 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent
220f9ee81c
commit
98c47c57e0
|
@ -38,6 +38,15 @@ stopword list that is BSD-licensed created by Jacques Savoy. The file resides i
|
|||
contrib/analyzers/common/src/resources/org/apache/lucene/analysis/hi/stopwords.txt.
|
||||
See http://members.unine.ch/jacques.savoy/clef/index.html.
|
||||
|
||||
The Stempel analyzer (contrib/analyzers) includes BSD-licensed software developed
|
||||
by the Egothor project http://egothor.sf.net/, created by Leo Galambos, Martin Kvapil,
|
||||
and Edmond Nolan.
|
||||
|
||||
The Polish analyzer (contrib/analyzers) comes with a default
|
||||
stopword list that is BSD-licensed created by the Carrot2 project. The file resides
|
||||
in contrib/analyzers/stempel/src/resources/org/apache/lucene/analysis/pl/stopwords.txt.
|
||||
See http://project.carrot2.org/license.html.
|
||||
|
||||
Includes lib/servlet-api-2.4.jar from Apache Tomcat
|
||||
|
||||
The SmartChineseAnalyzer source code (under contrib/analyzers) was
|
||||
|
|
|
@ -317,6 +317,7 @@ The source distribution does not contain sources of the previous Lucene Java ver
|
|||
|
||||
<packageset dir="contrib/analyzers/common/src/java"/>
|
||||
<packageset dir="contrib/analyzers/smartcn/src/java"/>
|
||||
<packageset dir="contrib/analyzers/stempel/src/java"/>
|
||||
<packageset dir="contrib/ant/src/java"/>
|
||||
<packageset dir="contrib/benchmark/src/java"/>
|
||||
<packageset dir="contrib/icu/src/java"/>
|
||||
|
@ -345,7 +346,7 @@ The source distribution does not contain sources of the previous Lucene Java ver
|
|||
|
||||
<group title="Demo" packages="org.apache.lucene.demo*"/>
|
||||
|
||||
<group title="contrib: Analysis" packages="org.apache.lucene.analysis.*:org.tartarus.snowball*"/>
|
||||
<group title="contrib: Analysis" packages="org.apache.lucene.analysis.*:org.tartarus.snowball*:org.egothor.stemmer*"/>
|
||||
<group title="contrib: Ant" packages="org.apache.lucene.ant*"/>
|
||||
<group title="contrib: Benchmark" packages="org.apache.lucene.benchmark*"/>
|
||||
<group title="contrib: ICU" packages="org.apache.lucene.collation*"/>
|
||||
|
|
|
@ -137,6 +137,9 @@ New features
|
|||
sensitive way, either from ICU built-in rules (such as Traditional-Simplified),
|
||||
or from rules you write yourself. (Robert Muir)
|
||||
|
||||
* LUCENE-2298: Add analyzers/stempel, an algorithmic stemmer with support for
|
||||
the Polish language. (Andrzej Bialecki via Robert Muir)
|
||||
|
||||
Build
|
||||
|
||||
* LUCENE-2124: Moved the JDK-based collation support from contrib/collation
|
||||
|
|
|
@ -23,6 +23,7 @@
|
|||
Additional Analyzers
|
||||
- common: Additional Analyzers
|
||||
- smartcn: Smart Analyzer for Simplified Chinese Text
|
||||
- stempel: Algorithmic Stemmer for Polish
|
||||
</description>
|
||||
|
||||
<target name="common">
|
||||
|
@ -33,23 +34,31 @@
|
|||
<ant dir="smartcn" />
|
||||
</target>
|
||||
|
||||
<target name="default" depends="common,smartcn" />
|
||||
<target name="stempel">
|
||||
<ant dir="stempel" />
|
||||
</target>
|
||||
|
||||
<target name="default" depends="common,smartcn,stempel" />
|
||||
|
||||
<target name="clean">
|
||||
<ant dir="common" target="clean" />
|
||||
<ant dir="smartcn" target="clean" />
|
||||
<ant dir="stempel" target="clean" />
|
||||
</target>
|
||||
<target name="compile-core">
|
||||
<ant dir="common" target="compile-core" />
|
||||
<ant dir="smartcn" target="compile-core" />
|
||||
<ant dir="stempel" target="compile-core" />
|
||||
</target>
|
||||
<target name="compile-test">
|
||||
<ant dir="common" target="compile-test" />
|
||||
<ant dir="smartcn" target="compile-test" />
|
||||
<ant dir="stempel" target="compile-test" />
|
||||
</target>
|
||||
<target name="test">
|
||||
<ant dir="common" target="test" />
|
||||
<ant dir="smartcn" target="test" />
|
||||
<ant dir="stempel" target="test" />
|
||||
</target>
|
||||
|
||||
<target name="build-artifacts-and-tests" depends="default,compile-test" />
|
||||
|
@ -57,16 +66,19 @@
|
|||
<target name="dist-maven" depends="default">
|
||||
<ant dir="common" target="dist-maven" />
|
||||
<ant dir="smartcn" target="dist-maven" />
|
||||
<ant dir="stempel" target="dist-maven" />
|
||||
</target>
|
||||
|
||||
<target name="javadocs">
|
||||
<ant dir="common" target="javadocs" />
|
||||
<ant dir="smartcn" target="javadocs" />
|
||||
<ant dir="stempel" target="javadocs" />
|
||||
</target>
|
||||
|
||||
<target name="javadocs-index.html">
|
||||
<ant dir="common" target="javadocs-index.html" />
|
||||
<ant dir="smartcn" target="javadocs-index.html" />
|
||||
<ant dir="stempel" target="javadocs-index.html" />
|
||||
</target>
|
||||
|
||||
</project>
|
||||
|
|
|
@ -0,0 +1,38 @@
|
|||
<?xml version="1.0"?>
|
||||
|
||||
<!--
|
||||
Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
contributor license agreements. See the NOTICE file distributed with
|
||||
this work for additional information regarding copyright ownership.
|
||||
The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
(the "License"); you may not use this file except in compliance with
|
||||
the License. You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software
|
||||
distributed under the License is distributed on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
See the License for the specific language governing permissions and
|
||||
limitations under the License.
|
||||
-->
|
||||
|
||||
<project name="stempel" default="default">
|
||||
|
||||
<description>
|
||||
Stempel Analyzer
|
||||
</description>
|
||||
|
||||
<property name="build.dir" location="../../../build/contrib/analyzers/stempel" />
|
||||
<property name="dist.dir" location="../../../dist/contrib/analyzers/stempel" />
|
||||
<property name="maven.dist.dir" location="../../../dist/maven" />
|
||||
|
||||
<import file="../../contrib-build.xml"/>
|
||||
|
||||
<path id="test.classpath">
|
||||
<path refid="classpath"/>
|
||||
<pathelement location="../../../build/classes/test/"/>
|
||||
<path refid="junit-path"/>
|
||||
<pathelement location="${build.dir}/classes/java"/>
|
||||
</path>
|
||||
</project>
|
|
@ -0,0 +1,35 @@
|
|||
<project xmlns="http://maven.apache.org/POM/4.0.0"
|
||||
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
|
||||
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/maven-v4_0_0.xsd">
|
||||
|
||||
<!--
|
||||
Licensed to the Apache Software Foundation (ASF) under one
|
||||
or more contributor license agreements. See the NOTICE file
|
||||
distributed with this work for additional information
|
||||
regarding copyright ownership. The ASF licenses this file
|
||||
to you under the Apache License, Version 2.0 (the
|
||||
"License"); you may not use this file except in compliance
|
||||
with the License. You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing,
|
||||
software distributed under the License is distributed on an
|
||||
"AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
KIND, either express or implied. See the License for the
|
||||
specific language governing permissions and limitations
|
||||
under the License.
|
||||
-->
|
||||
<modelVersion>4.0.0</modelVersion>
|
||||
<parent>
|
||||
<groupId>org.apache.lucene</groupId>
|
||||
<artifactId>lucene-contrib</artifactId>
|
||||
<version>@version@</version>
|
||||
</parent>
|
||||
<groupId>org.apache.lucene</groupId>
|
||||
<artifactId>lucene-stempel</artifactId>
|
||||
<name>Lucene Stempel Analyzer</name>
|
||||
<version>@version@</version>
|
||||
<description>Stempel Analyzer</description>
|
||||
<packaging>jar</packaging>
|
||||
</project>
|
|
@ -0,0 +1,154 @@
|
|||
package org.apache.lucene.analysis.pl;
|
||||
|
||||
/**
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
import java.io.BufferedInputStream;
|
||||
import java.io.DataInputStream;
|
||||
import java.io.IOException;
|
||||
import java.io.InputStream;
|
||||
import java.io.Reader;
|
||||
import java.util.Set;
|
||||
|
||||
import org.apache.lucene.analysis.Analyzer;
|
||||
import org.apache.lucene.analysis.CharArraySet;
|
||||
import org.apache.lucene.analysis.KeywordMarkerFilter;
|
||||
import org.apache.lucene.analysis.LowerCaseFilter;
|
||||
import org.apache.lucene.analysis.StopFilter;
|
||||
import org.apache.lucene.analysis.StopwordAnalyzerBase;
|
||||
import org.apache.lucene.analysis.TokenStream;
|
||||
import org.apache.lucene.analysis.Tokenizer;
|
||||
import org.apache.lucene.analysis.WordlistLoader;
|
||||
import org.apache.lucene.analysis.standard.StandardFilter;
|
||||
import org.apache.lucene.analysis.standard.StandardTokenizer;
|
||||
import org.apache.lucene.analysis.stempel.StempelStemmer;
|
||||
import org.apache.lucene.analysis.stempel.StempelFilter;
|
||||
import org.apache.lucene.util.Version;
|
||||
import org.egothor.stemmer.Trie;
|
||||
|
||||
/**
|
||||
* {@link Analyzer} for Polish.
|
||||
*/
|
||||
public final class PolishAnalyzer extends StopwordAnalyzerBase {
|
||||
private final Set<?> stemExclusionSet;
|
||||
private final Trie stemTable;
|
||||
|
||||
/** File containing default Polish stopwords. */
|
||||
public final static String DEFAULT_STOPWORD_FILE = "stopwords.txt";
|
||||
|
||||
/**
|
||||
* Returns an unmodifiable instance of the default stop words set.
|
||||
* @return default stop words set.
|
||||
*/
|
||||
public static Set<?> getDefaultStopSet(){
|
||||
return DefaultsHolder.DEFAULT_STOP_SET;
|
||||
}
|
||||
|
||||
/**
|
||||
* Atomically loads the DEFAULT_STOP_SET in a lazy fashion once the outer class
|
||||
* accesses the static final set the first time.;
|
||||
*/
|
||||
private static class DefaultsHolder {
|
||||
static final Set<?> DEFAULT_STOP_SET;
|
||||
static final Trie DEFAULT_TABLE;
|
||||
|
||||
static {
|
||||
try {
|
||||
DEFAULT_STOP_SET = WordlistLoader.getWordSet(PolishAnalyzer.class,
|
||||
DEFAULT_STOPWORD_FILE);
|
||||
} catch (IOException ex) {
|
||||
// default set should always be present as it is part of the
|
||||
// distribution (JAR)
|
||||
throw new RuntimeException("Unable to load default stopword set", ex);
|
||||
}
|
||||
|
||||
InputStream stream = PolishAnalyzer.class.getResourceAsStream("stemmer_20000.tbl");
|
||||
try {
|
||||
DataInputStream in = new DataInputStream(new BufferedInputStream(stream));
|
||||
String method = in.readUTF().toUpperCase();
|
||||
if (method.indexOf('M') < 0) {
|
||||
DEFAULT_TABLE = new org.egothor.stemmer.Trie(in);
|
||||
} else {
|
||||
DEFAULT_TABLE = new org.egothor.stemmer.MultiTrie2(in);
|
||||
}
|
||||
in.close();
|
||||
} catch (IOException ex) {
|
||||
// default set should always be present as it is part of the
|
||||
// distribution (JAR)
|
||||
throw new RuntimeException("Unable to load default stemming tables", ex);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Builds an analyzer with the default stop words: {@link #DEFAULT_STOPWORD_FILE}.
|
||||
*/
|
||||
public PolishAnalyzer(Version matchVersion) {
|
||||
this(matchVersion, DefaultsHolder.DEFAULT_STOP_SET);
|
||||
}
|
||||
|
||||
/**
|
||||
* Builds an analyzer with the given stop words.
|
||||
*
|
||||
* @param matchVersion lucene compatibility version
|
||||
* @param stopwords a stopword set
|
||||
*/
|
||||
public PolishAnalyzer(Version matchVersion, Set<?> stopwords) {
|
||||
this(matchVersion, stopwords, CharArraySet.EMPTY_SET);
|
||||
}
|
||||
|
||||
/**
|
||||
* Builds an analyzer with the given stop words. If a non-empty stem exclusion set is
|
||||
* provided this analyzer will add a {@link KeywordMarkerTokenFilter} before
|
||||
* stemming.
|
||||
*
|
||||
* @param matchVersion lucene compatibility version
|
||||
* @param stopwords a stopword set
|
||||
* @param stemExclusionSet a set of terms not to be stemmed
|
||||
*/
|
||||
public PolishAnalyzer(Version matchVersion, Set<?> stopwords, Set<?> stemExclusionSet) {
|
||||
super(matchVersion, stopwords);
|
||||
this.stemTable = DefaultsHolder.DEFAULT_TABLE;
|
||||
this.stemExclusionSet = CharArraySet.unmodifiableSet(CharArraySet.copy(
|
||||
matchVersion, stemExclusionSet));
|
||||
}
|
||||
|
||||
/**
|
||||
* Creates a
|
||||
* {@link org.apache.lucene.analysis.ReusableAnalyzerBase.TokenStreamComponents}
|
||||
* which tokenizes all the text in the provided {@link Reader}.
|
||||
*
|
||||
* @return A
|
||||
* {@link org.apache.lucene.analysis.ReusableAnalyzerBase.TokenStreamComponents}
|
||||
* built from an {@link StandardTokenizer} filtered with
|
||||
* {@link StandardFilter}, {@link LowerCaseFilter}, {@link StopFilter}
|
||||
* , {@link KeywordMarkerFilter} if a stem exclusion set is
|
||||
* provided and {@link StempelFilter}.
|
||||
*/
|
||||
@Override
|
||||
protected TokenStreamComponents createComponents(String fieldName,
|
||||
Reader reader) {
|
||||
final Tokenizer source = new StandardTokenizer(matchVersion, reader);
|
||||
TokenStream result = new StandardFilter(source);
|
||||
result = new LowerCaseFilter(matchVersion, result);
|
||||
result = new StopFilter(matchVersion, result, stopwords);
|
||||
if(!stemExclusionSet.isEmpty())
|
||||
result = new KeywordMarkerFilter(result, stemExclusionSet);
|
||||
result = new StempelFilter(result, new StempelStemmer(stemTable));
|
||||
return new TokenStreamComponents(source, result);
|
||||
}
|
||||
}
|
|
@ -0,0 +1,22 @@
|
|||
<!doctype html public "-//w3c//dtd html 4.0 transitional//en">
|
||||
<!--
|
||||
Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
contributor license agreements. See the NOTICE file distributed with
|
||||
this work for additional information regarding copyright ownership.
|
||||
The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
(the "License"); you may not use this file except in compliance with
|
||||
the License. You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software
|
||||
distributed under the License is distributed on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
See the License for the specific language governing permissions and
|
||||
limitations under the License.
|
||||
-->
|
||||
<html><head></head>
|
||||
<body>
|
||||
Analyzer for Polish.
|
||||
</body>
|
||||
</html>
|
|
@ -0,0 +1,83 @@
|
|||
/**
|
||||
* Copyright 2004 The Apache Software Foundation
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License"); you may
|
||||
* not use this file except in compliance with the License. You may obtain a
|
||||
* copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
|
||||
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
|
||||
* License for the specific language governing permissions and limitations
|
||||
* under the License.
|
||||
*/
|
||||
|
||||
package org.apache.lucene.analysis.stempel;
|
||||
|
||||
import java.io.IOException;
|
||||
|
||||
import org.apache.lucene.analysis.TokenFilter;
|
||||
import org.apache.lucene.analysis.TokenStream;
|
||||
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
|
||||
import org.apache.lucene.analysis.tokenattributes.KeywordAttribute;
|
||||
|
||||
/**
|
||||
* Transforms the token stream as per the stemming algorithm.
|
||||
* <p>
|
||||
* Note: the input to the stemming filter must already be in lower case, so you
|
||||
* will need to use LowerCaseFilter or LowerCaseTokenizer farther down the
|
||||
* Tokenizer chain in order for this to work properly!
|
||||
*/
|
||||
public final class StempelFilter extends TokenFilter {
|
||||
private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
|
||||
private final KeywordAttribute keywordAtt = addAttribute(KeywordAttribute.class);
|
||||
private final StempelStemmer stemmer;
|
||||
private final int minLength;
|
||||
|
||||
/**
|
||||
* Minimum length of input words to be processed. Shorter words are returned
|
||||
* unchanged.
|
||||
*/
|
||||
public static final int DEFAULT_MIN_LENGTH = 3;
|
||||
|
||||
/**
|
||||
* Create filter using the supplied stemming table.
|
||||
*
|
||||
* @param in input token stream
|
||||
* @param stemmer stemmer
|
||||
*/
|
||||
public StempelFilter(TokenStream in, StempelStemmer stemmer) {
|
||||
this(in, stemmer, DEFAULT_MIN_LENGTH);
|
||||
}
|
||||
|
||||
/**
|
||||
* Create filter using the supplied stemming table.
|
||||
*
|
||||
* @param in input token stream
|
||||
* @param stemmer stemmer
|
||||
* @param minLength For performance reasons words shorter than minLength
|
||||
* characters are not processed, but simply returned.
|
||||
*/
|
||||
public StempelFilter(TokenStream in, StempelStemmer stemmer, int minLength) {
|
||||
super(in);
|
||||
this.stemmer = stemmer;
|
||||
this.minLength = minLength;
|
||||
}
|
||||
|
||||
/** Returns the next input Token, after being stemmed */
|
||||
@Override
|
||||
public boolean incrementToken() throws IOException {
|
||||
if (input.incrementToken()) {
|
||||
if (!keywordAtt.isKeyword() && termAtt.length() > minLength) {
|
||||
StringBuilder sb = stemmer.stem(termAtt);
|
||||
if (sb != null) // if we can't stem it, return unchanged
|
||||
termAtt.setEmpty().append(sb);
|
||||
}
|
||||
return true;
|
||||
} else {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
}
|
|
@ -0,0 +1,92 @@
|
|||
/**
|
||||
* Copyright 2004 The Apache Software Foundation
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License"); you may not
|
||||
* use this file except in compliance with the License. You may obtain a copy of
|
||||
* the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
|
||||
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
|
||||
* License for the specific language governing permissions and limitations under
|
||||
* the License.
|
||||
*/
|
||||
|
||||
package org.apache.lucene.analysis.stempel;
|
||||
|
||||
import java.io.BufferedInputStream;
|
||||
import java.io.DataInputStream;
|
||||
import java.io.IOException;
|
||||
import java.io.InputStream;
|
||||
|
||||
import org.egothor.stemmer.Diff;
|
||||
import org.egothor.stemmer.Trie;
|
||||
|
||||
/**
|
||||
* <p>
|
||||
* Stemmer class is a convenient facade for other stemmer-related classes. The
|
||||
* core stemming algorithm and its implementation is taken verbatim from the
|
||||
* Egothor project ( <a href="http://www.egothor.org">www.egothor.org </a>).
|
||||
* </p>
|
||||
* <p>
|
||||
* Even though the stemmer tables supplied in the distribution package are built
|
||||
* for Polish language, there is nothing language-specific here.
|
||||
* </p>
|
||||
*/
|
||||
public class StempelStemmer {
|
||||
private Trie stemmer = null;
|
||||
private StringBuilder buffer = new StringBuilder();
|
||||
|
||||
/**
|
||||
* Create a Stemmer using selected stemmer table
|
||||
*
|
||||
* @param stemmerTable stemmer table.
|
||||
*/
|
||||
public StempelStemmer(InputStream stemmerTable) throws IOException {
|
||||
if (stemmerTable == null) return;
|
||||
|
||||
DataInputStream in = new DataInputStream(new BufferedInputStream(
|
||||
stemmerTable));
|
||||
String method = in.readUTF().toUpperCase();
|
||||
if (method.indexOf('M') < 0) {
|
||||
stemmer = new org.egothor.stemmer.Trie(in);
|
||||
} else {
|
||||
stemmer = new org.egothor.stemmer.MultiTrie2(in);
|
||||
}
|
||||
in.close();
|
||||
}
|
||||
|
||||
/**
|
||||
* Create a Stemmer using pre-loaded stemmer table
|
||||
*
|
||||
* @param stemmer pre-loaded stemmer table
|
||||
*/
|
||||
public StempelStemmer(Trie stemmer) {
|
||||
this.stemmer = stemmer;
|
||||
}
|
||||
|
||||
/**
|
||||
* Stem a word.
|
||||
*
|
||||
* @param word input word to be stemmed.
|
||||
* @return stemmed word, or null if the stem could not be generated.
|
||||
*/
|
||||
public StringBuilder stem(CharSequence word) {
|
||||
CharSequence cmd = stemmer.getLastOnPath(word);
|
||||
|
||||
if (cmd == null)
|
||||
return null;
|
||||
|
||||
buffer.setLength(0);
|
||||
buffer.append(word);
|
||||
|
||||
Diff.apply(buffer, cmd);
|
||||
|
||||
if (buffer.length() > 0)
|
||||
return buffer;
|
||||
else
|
||||
return null;
|
||||
}
|
||||
}
|
|
@ -0,0 +1,22 @@
|
|||
<!doctype html public "-//w3c//dtd html 4.0 transitional//en">
|
||||
<!--
|
||||
Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
contributor license agreements. See the NOTICE file distributed with
|
||||
this work for additional information regarding copyright ownership.
|
||||
The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
(the "License"); you may not use this file except in compliance with
|
||||
the License. You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software
|
||||
distributed under the License is distributed on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
See the License for the specific language governing permissions and
|
||||
limitations under the License.
|
||||
-->
|
||||
<html>
|
||||
<body>
|
||||
<p>Stempel: Algorithmic Stemmer</p>
|
||||
</body>
|
||||
</html>
|
|
@ -0,0 +1,94 @@
|
|||
/*
|
||||
Egothor Software License version 1.00
|
||||
Copyright (C) 1997-2004 Leo Galambos.
|
||||
Copyright (C) 2002-2004 "Egothor developers"
|
||||
on behalf of the Egothor Project.
|
||||
All rights reserved.
|
||||
|
||||
This software is copyrighted by the "Egothor developers". If this
|
||||
license applies to a single file or document, the "Egothor developers"
|
||||
are the people or entities mentioned as copyright holders in that file
|
||||
or document. If this license applies to the Egothor project as a
|
||||
whole, the copyright holders are the people or entities mentioned in
|
||||
the file CREDITS. This file can be found in the same location as this
|
||||
license in the distribution.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
1. Redistributions of source code must retain the above copyright
|
||||
notice, the list of contributors, this list of conditions, and the
|
||||
following disclaimer.
|
||||
2. Redistributions in binary form must reproduce the above copyright
|
||||
notice, the list of contributors, this list of conditions, and the
|
||||
disclaimer that follows these conditions in the documentation
|
||||
and/or other materials provided with the distribution.
|
||||
3. The name "Egothor" must not be used to endorse or promote products
|
||||
derived from this software without prior written permission. For
|
||||
written permission, please contact Leo.G@seznam.cz
|
||||
4. Products derived from this software may not be called "Egothor",
|
||||
nor may "Egothor" appear in their name, without prior written
|
||||
permission from Leo.G@seznam.cz.
|
||||
|
||||
In addition, we request that you include in the end-user documentation
|
||||
provided with the redistribution and/or in the software itself an
|
||||
acknowledgement equivalent to the following:
|
||||
"This product includes software developed by the Egothor Project.
|
||||
http://egothor.sf.net/"
|
||||
|
||||
THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
|
||||
WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
|
||||
MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
|
||||
IN NO EVENT SHALL THE EGOTHOR PROJECT OR ITS CONTRIBUTORS BE LIABLE
|
||||
FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
||||
CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
||||
SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
|
||||
BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
|
||||
WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE
|
||||
OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN
|
||||
IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
This software consists of voluntary contributions made by many
|
||||
individuals on behalf of the Egothor Project and was originally
|
||||
created by Leo Galambos (Leo.G@seznam.cz).
|
||||
*/
|
||||
package org.egothor.stemmer;
|
||||
|
||||
/**
 * A Cell is a portion of a trie: a plain holder for four integer attributes.
 */
class Cell {
  /** next row id in this way */
  int ref;
  /** command of the cell */
  int cmd;
  /** how many cmd-s was in subtrie before pack() */
  int cnt;
  /** how many chars would be discarded from input key in this way */
  int skip;

  /**
   * Creates a Cell with no reference and no command ({@code ref} and
   * {@code cmd} are -1) and zero {@code cnt} and {@code skip}.
   */
  Cell() {
    this.ref = -1;
    this.cmd = -1;
    this.cnt = 0;
    this.skip = 0;
  }

  /**
   * Creates a Cell carrying the same four attribute values as another Cell.
   *
   * @param a the Cell whose properties will be used
   */
  Cell(Cell a) {
    this.ref = a.ref;
    this.cmd = a.cmd;
    this.cnt = a.cnt;
    this.skip = a.skip;
  }

  /**
   * Renders the four attributes as {@code ref(..)cmd(..)cnt(..)skp(..)}.
   *
   * @return a String representation of this Cell
   */
  @Override
  public String toString() {
    StringBuilder text = new StringBuilder();
    text.append("ref(").append(ref).append(')');
    text.append("cmd(").append(cmd).append(')');
    text.append("cnt(").append(cnt).append(')');
    text.append("skp(").append(skip).append(')');
    return text.toString();
  }
}
|
|
@ -0,0 +1,205 @@
|
|||
/*
|
||||
Egothor Software License version 1.00
|
||||
Copyright (C) 1997-2004 Leo Galambos.
|
||||
Copyright (C) 2002-2004 "Egothor developers"
|
||||
on behalf of the Egothor Project.
|
||||
All rights reserved.
|
||||
|
||||
This software is copyrighted by the "Egothor developers". If this
|
||||
license applies to a single file or document, the "Egothor developers"
|
||||
are the people or entities mentioned as copyright holders in that file
|
||||
or document. If this license applies to the Egothor project as a
|
||||
whole, the copyright holders are the people or entities mentioned in
|
||||
the file CREDITS. This file can be found in the same location as this
|
||||
license in the distribution.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
1. Redistributions of source code must retain the above copyright
|
||||
notice, the list of contributors, this list of conditions, and the
|
||||
following disclaimer.
|
||||
2. Redistributions in binary form must reproduce the above copyright
|
||||
notice, the list of contributors, this list of conditions, and the
|
||||
disclaimer that follows these conditions in the documentation
|
||||
and/or other materials provided with the distribution.
|
||||
3. The name "Egothor" must not be used to endorse or promote products
|
||||
derived from this software without prior written permission. For
|
||||
written permission, please contact Leo.G@seznam.cz
|
||||
4. Products derived from this software may not be called "Egothor",
|
||||
nor may "Egothor" appear in their name, without prior written
|
||||
permission from Leo.G@seznam.cz.
|
||||
|
||||
In addition, we request that you include in the end-user documentation
|
||||
provided with the redistribution and/or in the software itself an
|
||||
acknowledgement equivalent to the following:
|
||||
"This product includes software developed by the Egothor Project.
|
||||
http://egothor.sf.net/"
|
||||
|
||||
THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
|
||||
WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
|
||||
MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
|
||||
IN NO EVENT SHALL THE EGOTHOR PROJECT OR ITS CONTRIBUTORS BE LIABLE
|
||||
FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
||||
CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
||||
SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
|
||||
BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
|
||||
WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE
|
||||
OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN
|
||||
IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
This software consists of voluntary contributions made by many
|
||||
individuals on behalf of the Egothor Project and was originally
|
||||
created by Leo Galambos (Leo.G@seznam.cz).
|
||||
*/
|
||||
package org.egothor.stemmer;
|
||||
|
||||
import java.io.BufferedOutputStream;
|
||||
import java.io.BufferedReader;
|
||||
import java.io.DataOutputStream;
|
||||
import java.io.FileInputStream;
|
||||
import java.io.FileNotFoundException;
|
||||
import java.io.FileOutputStream;
|
||||
import java.io.IOException;
|
||||
import java.io.InputStreamReader;
|
||||
import java.io.LineNumberReader;
|
||||
import java.util.StringTokenizer;
|
||||
|
||||
/**
|
||||
* The Compile class is used to compile a stemmer table.
|
||||
*/
|
||||
public class Compile {
|
||||
|
||||
static boolean backward;
|
||||
static boolean multi;
|
||||
static Trie trie;
|
||||
|
||||
/**
|
||||
* Entry point to the Compile application.
|
||||
* <p>
|
||||
* This program takes any number of arguments: the first is the name of the
|
||||
* desired stemming algorithm to use (a list is available in the package
|
||||
* description) , all of the rest should be the path or paths to a file or
|
||||
* files containing a stemmer table to compile.
|
||||
*
|
||||
* @param args the command line arguments
|
||||
*/
|
||||
public static void main(java.lang.String[] args) {
|
||||
if (args.length < 1) {
|
||||
return;
|
||||
}
|
||||
|
||||
args[0].toUpperCase();
|
||||
|
||||
backward = args[0].charAt(0) == '-';
|
||||
int qq = (backward) ? 1 : 0;
|
||||
boolean storeorig = false;
|
||||
|
||||
if (args[0].charAt(qq) == '0') {
|
||||
storeorig = true;
|
||||
qq++;
|
||||
}
|
||||
|
||||
multi = args[0].charAt(qq) == 'M';
|
||||
if (multi) {
|
||||
qq++;
|
||||
}
|
||||
|
||||
String charset = System.getProperty("egothor.stemmer.charset", "UTF-8");
|
||||
|
||||
char optimizer[] = new char[args[0].length() - qq];
|
||||
for (int i = 0; i < optimizer.length; i++) {
|
||||
optimizer[i] = args[0].charAt(qq + i);
|
||||
}
|
||||
|
||||
for (int i = 1; i < args.length; i++) {
|
||||
LineNumberReader in;
|
||||
// System.out.println("[" + args[i] + "]");
|
||||
Diff diff = new Diff();
|
||||
try {
|
||||
int stems = 0;
|
||||
int words = 0;
|
||||
|
||||
allocTrie();
|
||||
|
||||
System.out.println(args[i]);
|
||||
in = new LineNumberReader(new BufferedReader(new InputStreamReader(
|
||||
new FileInputStream(args[i]), charset)));
|
||||
for (String line = in.readLine(); line != null; line = in.readLine()) {
|
||||
try {
|
||||
line = line.toLowerCase();
|
||||
StringTokenizer st = new StringTokenizer(line);
|
||||
String stem = st.nextToken();
|
||||
if (storeorig) {
|
||||
trie.add(stem, "-a");
|
||||
words++;
|
||||
}
|
||||
while (st.hasMoreTokens()) {
|
||||
String token = st.nextToken();
|
||||
if (token.equals(stem) == false) {
|
||||
trie.add(token, diff.exec(token, stem));
|
||||
words++;
|
||||
}
|
||||
}
|
||||
} catch (java.util.NoSuchElementException x) {
|
||||
// no base token (stem) on a line
|
||||
}
|
||||
}
|
||||
|
||||
Optimizer o = new Optimizer();
|
||||
Optimizer2 o2 = new Optimizer2();
|
||||
Lift l = new Lift(true);
|
||||
Lift e = new Lift(false);
|
||||
Gener g = new Gener();
|
||||
|
||||
for (int j = 0; j < optimizer.length; j++) {
|
||||
String prefix;
|
||||
switch (optimizer[j]) {
|
||||
case 'G':
|
||||
trie = trie.reduce(g);
|
||||
prefix = "G: ";
|
||||
break;
|
||||
case 'L':
|
||||
trie = trie.reduce(l);
|
||||
prefix = "L: ";
|
||||
break;
|
||||
case 'E':
|
||||
trie = trie.reduce(e);
|
||||
prefix = "E: ";
|
||||
break;
|
||||
case '2':
|
||||
trie = trie.reduce(o2);
|
||||
prefix = "2: ";
|
||||
break;
|
||||
case '1':
|
||||
trie = trie.reduce(o);
|
||||
prefix = "1: ";
|
||||
break;
|
||||
default:
|
||||
continue;
|
||||
}
|
||||
trie.printInfo(prefix + " ");
|
||||
}
|
||||
|
||||
DataOutputStream os = new DataOutputStream(new BufferedOutputStream(
|
||||
new FileOutputStream(args[i] + ".out")));
|
||||
os.writeUTF(args[0]);
|
||||
trie.store(os);
|
||||
os.close();
|
||||
|
||||
} catch (FileNotFoundException x) {
|
||||
x.printStackTrace();
|
||||
} catch (IOException x) {
|
||||
x.printStackTrace();
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
static void allocTrie() {
|
||||
if (multi) {
|
||||
trie = new MultiTrie2(!backward);
|
||||
} else {
|
||||
trie = new Trie(!backward);
|
||||
}
|
||||
}
|
||||
}
|
|
@ -0,0 +1,295 @@
|
|||
/*
|
||||
Egothor Software License version 1.00
|
||||
Copyright (C) 1997-2004 Leo Galambos.
|
||||
Copyright (C) 2002-2004 "Egothor developers"
|
||||
on behalf of the Egothor Project.
|
||||
All rights reserved.
|
||||
|
||||
This software is copyrighted by the "Egothor developers". If this
|
||||
license applies to a single file or document, the "Egothor developers"
|
||||
are the people or entities mentioned as copyright holders in that file
|
||||
or document. If this license applies to the Egothor project as a
|
||||
whole, the copyright holders are the people or entities mentioned in
|
||||
the file CREDITS. This file can be found in the same location as this
|
||||
license in the distribution.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
1. Redistributions of source code must retain the above copyright
|
||||
notice, the list of contributors, this list of conditions, and the
|
||||
following disclaimer.
|
||||
2. Redistributions in binary form must reproduce the above copyright
|
||||
notice, the list of contributors, this list of conditions, and the
|
||||
disclaimer that follows these conditions in the documentation
|
||||
and/or other materials provided with the distribution.
|
||||
3. The name "Egothor" must not be used to endorse or promote products
|
||||
derived from this software without prior written permission. For
|
||||
written permission, please contact Leo.G@seznam.cz
|
||||
4. Products derived from this software may not be called "Egothor",
|
||||
nor may "Egothor" appear in their name, without prior written
|
||||
permission from Leo.G@seznam.cz.
|
||||
|
||||
In addition, we request that you include in the end-user documentation
|
||||
provided with the redistribution and/or in the software itself an
|
||||
acknowledgement equivalent to the following:
|
||||
"This product includes software developed by the Egothor Project.
|
||||
http://egothor.sf.net/"
|
||||
|
||||
THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
|
||||
WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
|
||||
MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
|
||||
IN NO EVENT SHALL THE EGOTHOR PROJECT OR ITS CONTRIBUTORS BE LIABLE
|
||||
FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
||||
CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
||||
SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
|
||||
BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
|
||||
WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE
|
||||
OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN
|
||||
IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
This software consists of voluntary contributions made by many
|
||||
individuals on behalf of the Egothor Project and was originally
|
||||
created by Leo Galambos (Leo.G@seznam.cz).
|
||||
*/
|
||||
package org.egothor.stemmer;
|
||||
|
||||
/**
 * The Diff object generates a patch string.
 * <p>
 * A patch string is actually a command to a stemmer telling it how to reduce a
 * word to its root. For example, to reduce the word teacher to its root teach
 * the patch string Db would be generated. This command tells the stemmer to
 * delete the last 2 characters from the word teacher to reach the stem (the
 * patch commands are applied starting from the last character of the word).
 * <p>
 * A patch string is a sequence of two-character commands. The first character
 * is the operation and the second its parameter:
 * <ul>
 * <li><tt>-n</tt> skip over characters; <tt>n</tt> is a count encoded as
 * 'a' = 1, 'b' = 2, ...</li>
 * <li><tt>Dn</tt> delete characters; <tt>n</tt> is a count encoded the same
 * way</li>
 * <li><tt>Rc</tt> replace the current character with <tt>c</tt></li>
 * <li><tt>Ic</tt> insert <tt>c</tt> after the current character</li>
 * </ul>
 */
public class Diff {
  /** First dimension of the currently allocated work matrices. */
  int sizex = 0;
  /** Second dimension of the currently allocated work matrices. */
  int sizey = 0;
  /** Cost matrix: <tt>net[x][y]</tt> is the cost of turning the first x chars of a into the first y chars of b. */
  int net[][];
  /** Move matrix: the step that was chosen to reach <tt>net[x][y]</tt>. */
  int way[][];

  /** Cost added for inserting a character. */
  int INSERT;
  /** Cost added for deleting a character. */
  int DELETE;
  /** Cost added for replacing a character. */
  int REPLACE;
  /** Cost added for keeping a character that is equal in both strings. */
  int NOOP;

  /**
   * Constructor for the Diff object. Insert, delete and replace cost 1 each,
   * keeping an equal character costs 0.
   */
  public Diff() {
    this(1, 1, 1, 0);
  }

  /**
   * Constructor for the Diff object
   *
   * @param ins cost of inserting a character
   * @param del cost of deleting a character
   * @param rep cost of replacing a character
   * @param noop cost of keeping a character that is equal in both strings
   */
  public Diff(int ins, int del, int rep, int noop) {
    INSERT = ins;
    DELETE = del;
    REPLACE = rep;
    NOOP = noop;
  }

  /**
   * Apply the given patch string <tt>diff</tt> to the given string <tt>
   * dest</tt>.
   * <p>
   * The patch is applied on a best-effort basis: nothing happens for a
   * <tt>null</tt> patch or an empty destination, and a command that points
   * outside of <tt>dest</tt> ends the processing silently, leaving the
   * changes made so far in place.
   *
   * @param dest Destination string
   * @param diff Patch string
   */
  public static void apply(StringBuilder dest, CharSequence diff) {
    try {

      if (diff == null) {
        return;
      }

      // commands work backward from the last character
      int pos = dest.length() - 1;
      if (pos < 0) {
        return;
      }
      // orig == ""
      for (int i = 0; i < diff.length() / 2; i++) {
        char cmd = diff.charAt(2 * i);
        char param = diff.charAt(2 * i + 1);
        // counts are encoded as letters: 'a' = 1, 'b' = 2, ...
        int par_num = (param - 'a' + 1);
        switch (cmd) {
          case '-':
            // together with the pos-- below this skips par_num characters
            pos = pos - par_num + 1;
            break;
          case 'R':
            dest.setCharAt(pos, param);
            break;
          case 'D':
            int o = pos;
            pos -= par_num - 1;
            /*
             * delete par_num chars from index pos
             */
            dest.delete(pos, o + 1);
            break;
          case 'I':
            // insert behind the current character; the pos-- below moves
            // back onto the character that was current before
            dest.insert(pos += 1, param);
            break;
        }
        pos--;
      }
    } catch (IndexOutOfBoundsException x) {
      // the patch does not fit this word: keep what was applied so far
      // x.printStackTrace();
    }
  }

  /**
   * Construct a patch string that transforms a to b.
   * <p>
   * The work matrices are kept between calls and only reallocated when a
   * longer input arrives, which is why the method is synchronized.
   *
   * @param a String 1st string
   * @param b String 2nd string
   * @return the patch string, or <tt>null</tt> if either argument is
   *         <tt>null</tt>
   */
  public synchronized String exec(String a, String b) {
    if (a == null || b == null) {
      return null;
    }

    int x;
    int y;
    int maxx;
    int maxy;
    int go[] = new int[4];
    final int X = 1;
    final int Y = 2;
    final int R = 3;
    final int D = 0;

    /*
     * setup memory if needed => processing speed up
     */
    maxx = a.length() + 1;
    maxy = b.length() + 1;
    if ((maxx >= sizex) || (maxy >= sizey)) {
      sizex = maxx + 8;
      sizey = maxy + 8;
      net = new int[sizex][sizey];
      way = new int[sizex][sizey];
    }

    /*
     * clear the network
     */
    for (x = 0; x < maxx; x++) {
      for (y = 0; y < maxy; y++) {
        net[x][y] = 0;
      }
    }

    /*
     * set known persistent values
     */
    for (x = 1; x < maxx; x++) {
      net[x][0] = x;
      way[x][0] = X;
    }
    for (y = 1; y < maxy; y++) {
      net[0][y] = y;
      way[0][y] = Y;
    }

    for (x = 1; x < maxx; x++) {
      for (y = 1; y < maxy; y++) {
        go[X] = net[x - 1][y] + DELETE;
        // way on x costs 1 unit
        go[Y] = net[x][y - 1] + INSERT;
        // way on y costs 1 unit
        go[R] = net[x - 1][y - 1] + REPLACE;
        go[D] = net[x - 1][y - 1]
            + ((a.charAt(x - 1) == b.charAt(y - 1)) ? NOOP : 100);
        // diagonal costs 0, when no change
        int min = D;
        // ties between the diagonal and a delete go to the delete; insert
        // and replace only win when strictly cheaper
        if (go[min] >= go[X]) {
          min = X;
        }
        if (go[min] > go[Y]) {
          min = Y;
        }
        if (go[min] > go[R]) {
          min = R;
        }
        way[x][y] = min;
        // stored as int: narrowing to short would corrupt large costs
        net[x][y] = go[min];
      }
    }

    // read the patch string
    StringBuilder result = new StringBuilder();
    final char base = 'a' - 1;
    // pending run lengths, flushed as soon as a different command follows
    char deletes = base;
    char equals = base;
    for (x = maxx - 1, y = maxy - 1; x + y != 0;) {
      switch (way[x][y]) {
        case X:
          if (equals != base) {
            result.append('-').append(equals);
            equals = base;
          }
          deletes++;
          x--;
          break;
        // delete
        case Y:
          if (deletes != base) {
            result.append('D').append(deletes);
            deletes = base;
          }
          if (equals != base) {
            result.append('-').append(equals);
            equals = base;
          }
          result.append('I');
          result.append(b.charAt(--y));
          break;
        // insert
        case R:
          if (deletes != base) {
            result.append('D').append(deletes);
            deletes = base;
          }
          if (equals != base) {
            result.append('-').append(equals);
            equals = base;
          }
          result.append('R');
          result.append(b.charAt(--y));
          x--;
          break;
        // replace
        case D:
          if (deletes != base) {
            result.append('D').append(deletes);
            deletes = base;
          }
          equals++;
          x--;
          y--;
          break;
        // no change
      }
    }
    // a pending run of deletes still has to be written; a pending run of
    // equal characters is not, since nothing follows it
    if (deletes != base) {
      result.append('D').append(deletes);
    }

    return result.toString();
  }
}
|
|
@ -0,0 +1,121 @@
|
|||
/*
|
||||
Egothor Software License version 1.00
|
||||
Copyright (C) 1997-2004 Leo Galambos.
|
||||
Copyright (C) 2002-2004 "Egothor developers"
|
||||
on behalf of the Egothor Project.
|
||||
All rights reserved.
|
||||
|
||||
This software is copyrighted by the "Egothor developers". If this
|
||||
license applies to a single file or document, the "Egothor developers"
|
||||
are the people or entities mentioned as copyright holders in that file
|
||||
or document. If this license applies to the Egothor project as a
|
||||
whole, the copyright holders are the people or entities mentioned in
|
||||
the file CREDITS. This file can be found in the same location as this
|
||||
license in the distribution.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
1. Redistributions of source code must retain the above copyright
|
||||
notice, the list of contributors, this list of conditions, and the
|
||||
following disclaimer.
|
||||
2. Redistributions in binary form must reproduce the above copyright
|
||||
notice, the list of contributors, this list of conditions, and the
|
||||
disclaimer that follows these conditions in the documentation
|
||||
and/or other materials provided with the distribution.
|
||||
3. The name "Egothor" must not be used to endorse or promote products
|
||||
derived from this software without prior written permission. For
|
||||
written permission, please contact Leo.G@seznam.cz
|
||||
4. Products derived from this software may not be called "Egothor",
|
||||
nor may "Egothor" appear in their name, without prior written
|
||||
permission from Leo.G@seznam.cz.
|
||||
|
||||
In addition, we request that you include in the end-user documentation
|
||||
provided with the redistribution and/or in the software itself an
|
||||
acknowledgement equivalent to the following:
|
||||
"This product includes software developed by the Egothor Project.
|
||||
http://egothor.sf.net/"
|
||||
|
||||
THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
|
||||
WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
|
||||
MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
|
||||
IN NO EVENT SHALL THE EGOTHOR PROJECT OR ITS CONTRIBUTORS BE LIABLE
|
||||
FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
||||
CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
||||
SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
|
||||
BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
|
||||
WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE
|
||||
OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN
|
||||
IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
This software consists of voluntary contributions made by many
|
||||
individuals on behalf of the Egothor Project and was originally
|
||||
created by Leo Galambos (Leo.G@seznam.cz).
|
||||
*/
|
||||
package org.egothor.stemmer;
|
||||
|
||||
import java.io.BufferedReader;
|
||||
import java.io.FileReader;
|
||||
import java.io.IOException;
|
||||
import java.io.LineNumberReader;
|
||||
import java.util.StringTokenizer;
|
||||
|
||||
/**
|
||||
* The DiffIt class is a means generate patch commands from an already prepared
|
||||
* stemmer table.
|
||||
*/
|
||||
public class DiffIt {
|
||||
|
||||
static int get(int i, String s) {
|
||||
try {
|
||||
return Integer.parseInt(s.substring(i, i + 1));
|
||||
} catch (Throwable x) {
|
||||
return 1;
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Entry point to the DiffIt application.
|
||||
* <p>
|
||||
* This application takes one argument, the path to a file containing a
|
||||
* stemmer table. The program reads the file and generates the patch commands
|
||||
* for the stems.
|
||||
*
|
||||
* @param args the path to a file containing a stemmer table
|
||||
*/
|
||||
public static void main(java.lang.String[] args) {
|
||||
|
||||
int ins = get(0, args[0]);
|
||||
int del = get(1, args[0]);
|
||||
int rep = get(2, args[0]);
|
||||
int nop = get(3, args[0]);
|
||||
|
||||
for (int i = 1; i < args.length; i++) {
|
||||
LineNumberReader in;
|
||||
// System.out.println("[" + args[i] + "]");
|
||||
Diff diff = new Diff(ins, del, rep, nop);
|
||||
try {
|
||||
in = new LineNumberReader(new BufferedReader(new FileReader(args[i])));
|
||||
for (String line = in.readLine(); line != null; line = in.readLine()) {
|
||||
try {
|
||||
line = line.toLowerCase();
|
||||
StringTokenizer st = new StringTokenizer(line);
|
||||
String stem = st.nextToken();
|
||||
System.out.println(stem + " -a");
|
||||
while (st.hasMoreTokens()) {
|
||||
String token = st.nextToken();
|
||||
if (token.equals(stem) == false) {
|
||||
System.out.println(stem + " " + diff.exec(token, stem));
|
||||
}
|
||||
}
|
||||
} catch (java.util.NoSuchElementException x) {
|
||||
// no base token (stem) on a line
|
||||
}
|
||||
}
|
||||
|
||||
} catch (IOException x) {
|
||||
x.printStackTrace();
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
|
@ -0,0 +1,132 @@
|
|||
/*
|
||||
Egothor Software License version 1.00
|
||||
Copyright (C) 1997-2004 Leo Galambos.
|
||||
Copyright (C) 2002-2004 "Egothor developers"
|
||||
on behalf of the Egothor Project.
|
||||
All rights reserved.
|
||||
|
||||
This software is copyrighted by the "Egothor developers". If this
|
||||
license applies to a single file or document, the "Egothor developers"
|
||||
are the people or entities mentioned as copyright holders in that file
|
||||
or document. If this license applies to the Egothor project as a
|
||||
whole, the copyright holders are the people or entities mentioned in
|
||||
the file CREDITS. This file can be found in the same location as this
|
||||
license in the distribution.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
1. Redistributions of source code must retain the above copyright
|
||||
notice, the list of contributors, this list of conditions, and the
|
||||
following disclaimer.
|
||||
2. Redistributions in binary form must reproduce the above copyright
|
||||
notice, the list of contributors, this list of conditions, and the
|
||||
disclaimer that follows these conditions in the documentation
|
||||
and/or other materials provided with the distribution.
|
||||
3. The name "Egothor" must not be used to endorse or promote products
|
||||
derived from this software without prior written permission. For
|
||||
written permission, please contact Leo.G@seznam.cz
|
||||
4. Products derived from this software may not be called "Egothor",
|
||||
nor may "Egothor" appear in their name, without prior written
|
||||
permission from Leo.G@seznam.cz.
|
||||
|
||||
In addition, we request that you include in the end-user documentation
|
||||
provided with the redistribution and/or in the software itself an
|
||||
acknowledgement equivalent to the following:
|
||||
"This product includes software developed by the Egothor Project.
|
||||
http://egothor.sf.net/"
|
||||
|
||||
THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
|
||||
WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
|
||||
MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
|
||||
IN NO EVENT SHALL THE EGOTHOR PROJECT OR ITS CONTRIBUTORS BE LIABLE
|
||||
FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
||||
CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
||||
SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
|
||||
BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
|
||||
WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE
|
||||
OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN
|
||||
IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
This software consists of voluntary contributions made by many
|
||||
individuals on behalf of the Egothor Project and was originally
|
||||
created by Leo Galambos (Leo.G@seznam.cz).
|
||||
*/
|
||||
package org.egothor.stemmer;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.Arrays;
|
||||
import java.util.Iterator;
|
||||
import java.util.List;
|
||||
|
||||
/**
|
||||
* The Gener object helps in the discarding of nodes which break the reduction
|
||||
* effort and defend the structure against large reductions.
|
||||
*/
|
||||
public class Gener extends Reduce {
|
||||
/**
|
||||
* Constructor for the Gener object.
|
||||
*/
|
||||
public Gener() {}
|
||||
|
||||
/**
|
||||
* Return a Trie with infrequent values occurring in the given Trie removed.
|
||||
*
|
||||
* @param orig the Trie to optimize
|
||||
* @return a new optimized Trie
|
||||
*/
|
||||
@Override
|
||||
public Trie optimize(Trie orig) {
|
||||
List<CharSequence> cmds = orig.cmds;
|
||||
List<Row> rows = new ArrayList<Row>();
|
||||
List<Row> orows = orig.rows;
|
||||
int remap[] = new int[orows.size()];
|
||||
|
||||
Arrays.fill(remap, 1);
|
||||
for (int j = orows.size() - 1; j >= 0; j--) {
|
||||
if (eat(orows.get(j), remap)) {
|
||||
remap[j] = 0;
|
||||
}
|
||||
}
|
||||
|
||||
Arrays.fill(remap, -1);
|
||||
rows = removeGaps(orig.root, orows, new ArrayList<Row>(), remap);
|
||||
|
||||
return new Trie(orig.forward, remap[orig.root], cmds, rows);
|
||||
}
|
||||
|
||||
/**
|
||||
* Test whether the given Row of Cells in a Trie should be included in an
|
||||
* optimized Trie.
|
||||
*
|
||||
* @param in the Row to test
|
||||
* @param remap Description of the Parameter
|
||||
* @return <tt>true</tt> if the Row should remain, <tt>false
|
||||
* </tt> otherwise
|
||||
*/
|
||||
public boolean eat(Row in, int remap[]) {
|
||||
int sum = 0;
|
||||
for (Iterator<Cell> i = in.cells.values().iterator(); i.hasNext();) {
|
||||
Cell c = i.next();
|
||||
sum += c.cnt;
|
||||
if (c.ref >= 0) {
|
||||
if (remap[c.ref] == 0) {
|
||||
c.ref = -1;
|
||||
}
|
||||
}
|
||||
}
|
||||
int frame = sum / 10;
|
||||
boolean live = false;
|
||||
for (Iterator<Cell> i = in.cells.values().iterator(); i.hasNext();) {
|
||||
Cell c = i.next();
|
||||
if (c.cnt < frame && c.cmd >= 0) {
|
||||
c.cnt = 0;
|
||||
c.cmd = -1;
|
||||
}
|
||||
if (c.cmd >= 0 || c.ref >= 0) {
|
||||
live |= true;
|
||||
}
|
||||
}
|
||||
return !live;
|
||||
}
|
||||
}
|
|
@ -0,0 +1,147 @@
|
|||
/*
|
||||
Egothor Software License version 1.00
|
||||
Copyright (C) 1997-2004 Leo Galambos.
|
||||
Copyright (C) 2002-2004 "Egothor developers"
|
||||
on behalf of the Egothor Project.
|
||||
All rights reserved.
|
||||
|
||||
This software is copyrighted by the "Egothor developers". If this
|
||||
license applies to a single file or document, the "Egothor developers"
|
||||
are the people or entities mentioned as copyright holders in that file
|
||||
or document. If this license applies to the Egothor project as a
|
||||
whole, the copyright holders are the people or entities mentioned in
|
||||
the file CREDITS. This file can be found in the same location as this
|
||||
license in the distribution.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
1. Redistributions of source code must retain the above copyright
|
||||
notice, the list of contributors, this list of conditions, and the
|
||||
following disclaimer.
|
||||
2. Redistributions in binary form must reproduce the above copyright
|
||||
notice, the list of contributors, this list of conditions, and the
|
||||
disclaimer that follows these conditions in the documentation
|
||||
and/or other materials provided with the distribution.
|
||||
3. The name "Egothor" must not be used to endorse or promote products
|
||||
derived from this software without prior written permission. For
|
||||
written permission, please contact Leo.G@seznam.cz
|
||||
4. Products derived from this software may not be called "Egothor",
|
||||
nor may "Egothor" appear in their name, without prior written
|
||||
permission from Leo.G@seznam.cz.
|
||||
|
||||
In addition, we request that you include in the end-user documentation
|
||||
provided with the redistribution and/or in the software itself an
|
||||
acknowledgement equivalent to the following:
|
||||
"This product includes software developed by the Egothor Project.
|
||||
http://egothor.sf.net/"
|
||||
|
||||
THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
|
||||
WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
|
||||
MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
|
||||
IN NO EVENT SHALL THE EGOTHOR PROJECT OR ITS CONTRIBUTORS BE LIABLE
|
||||
FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
||||
CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
||||
SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
|
||||
BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
|
||||
WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE
|
||||
OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN
|
||||
IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
This software consists of voluntary contributions made by many
|
||||
individuals on behalf of the Egothor Project and was originally
|
||||
created by Leo Galambos (Leo.G@seznam.cz).
|
||||
*/
|
||||
package org.egothor.stemmer;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.Arrays;
|
||||
import java.util.Iterator;
|
||||
import java.util.List;
|
||||
|
||||
/**
|
||||
* The Lift class is a data structure that is a variation of a Patricia trie.
|
||||
* <p>
|
||||
* Lift's <i>raison d'etre</i> is to implement reduction of the trie via the
|
||||
* Lift-Up method., which makes the data structure less liable to overstemming.
|
||||
*/
|
||||
public class Lift extends Reduce {
|
||||
boolean changeSkip;
|
||||
|
||||
/**
|
||||
* Constructor for the Lift object.
|
||||
*
|
||||
* @param changeSkip when set to <tt>true</tt>, comparison of two Cells takes
|
||||
* a skip command into account
|
||||
*/
|
||||
public Lift(boolean changeSkip) {
|
||||
this.changeSkip = changeSkip;
|
||||
}
|
||||
|
||||
/**
|
||||
* Optimize (eliminate rows with no content) the given Trie and return the
|
||||
* reduced Trie.
|
||||
*
|
||||
* @param orig the Trie to optimized
|
||||
* @return the reduced Trie
|
||||
*/
|
||||
@Override
|
||||
public Trie optimize(Trie orig) {
|
||||
List<CharSequence> cmds = orig.cmds;
|
||||
List<Row> rows = new ArrayList<Row>();
|
||||
List<Row> orows = orig.rows;
|
||||
int remap[] = new int[orows.size()];
|
||||
|
||||
for (int j = orows.size() - 1; j >= 0; j--) {
|
||||
liftUp(orows.get(j), orows);
|
||||
}
|
||||
|
||||
Arrays.fill(remap, -1);
|
||||
rows = removeGaps(orig.root, orows, new ArrayList<Row>(), remap);
|
||||
|
||||
return new Trie(orig.forward, remap[orig.root], cmds, rows);
|
||||
}
|
||||
|
||||
/**
|
||||
* Reduce the trie using Lift-Up reduction.
|
||||
* <p>
|
||||
* The Lift-Up reduction propagates all leaf-values (patch commands), where
|
||||
* possible, to higher levels which are closer to the root of the trie.
|
||||
*
|
||||
* @param in the Row to consider when optimizing
|
||||
* @param nodes contains the patch commands
|
||||
*/
|
||||
public void liftUp(Row in, List<Row> nodes) {
|
||||
Iterator<Cell> i = in.cells.values().iterator();
|
||||
for (; i.hasNext();) {
|
||||
Cell c = i.next();
|
||||
if (c.ref >= 0) {
|
||||
Row to = nodes.get(c.ref);
|
||||
int sum = to.uniformCmd(changeSkip);
|
||||
if (sum >= 0) {
|
||||
if (sum == c.cmd) {
|
||||
if (changeSkip) {
|
||||
if (c.skip != to.uniformSkip + 1) {
|
||||
continue;
|
||||
}
|
||||
c.skip = to.uniformSkip + 1;
|
||||
} else {
|
||||
c.skip = 0;
|
||||
}
|
||||
c.cnt += to.uniformCnt;
|
||||
c.ref = -1;
|
||||
} else if (c.cmd < 0) {
|
||||
c.cnt = to.uniformCnt;
|
||||
c.cmd = sum;
|
||||
c.ref = -1;
|
||||
if (changeSkip) {
|
||||
c.skip = to.uniformSkip + 1;
|
||||
} else {
|
||||
c.skip = 0;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
|
@ -0,0 +1,208 @@
|
|||
/*
|
||||
Egothor Software License version 1.00
|
||||
Copyright (C) 1997-2004 Leo Galambos.
|
||||
Copyright (C) 2002-2004 "Egothor developers"
|
||||
on behalf of the Egothor Project.
|
||||
All rights reserved.
|
||||
|
||||
This software is copyrighted by the "Egothor developers". If this
|
||||
license applies to a single file or document, the "Egothor developers"
|
||||
are the people or entities mentioned as copyright holders in that file
|
||||
or document. If this license applies to the Egothor project as a
|
||||
whole, the copyright holders are the people or entities mentioned in
|
||||
the file CREDITS. This file can be found in the same location as this
|
||||
license in the distribution.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
1. Redistributions of source code must retain the above copyright
|
||||
notice, the list of contributors, this list of conditions, and the
|
||||
following disclaimer.
|
||||
2. Redistributions in binary form must reproduce the above copyright
|
||||
notice, the list of contributors, this list of conditions, and the
|
||||
disclaimer that follows these conditions in the documentation
|
||||
and/or other materials provided with the distribution.
|
||||
3. The name "Egothor" must not be used to endorse or promote products
|
||||
derived from this software without prior written permission. For
|
||||
written permission, please contact Leo.G@seznam.cz
|
||||
4. Products derived from this software may not be called "Egothor",
|
||||
nor may "Egothor" appear in their name, without prior written
|
||||
permission from Leo.G@seznam.cz.
|
||||
|
||||
In addition, we request that you include in the end-user documentation
|
||||
provided with the redistribution and/or in the software itself an
|
||||
acknowledgement equivalent to the following:
|
||||
"This product includes software developed by the Egothor Project.
|
||||
http://egothor.sf.net/"
|
||||
|
||||
THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
|
||||
WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
|
||||
MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
|
||||
IN NO EVENT SHALL THE EGOTHOR PROJECT OR ITS CONTRIBUTORS BE LIABLE
|
||||
FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
||||
CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
||||
SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
|
||||
BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
|
||||
WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE
|
||||
OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN
|
||||
IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
This software consists of voluntary contributions made by many
|
||||
individuals on behalf of the Egothor Project and was originally
|
||||
created by Leo Galambos (Leo.G@seznam.cz).
|
||||
*/
|
||||
package org.egothor.stemmer;
|
||||
|
||||
import java.io.DataInput;
|
||||
import java.io.DataOutput;
|
||||
import java.io.IOException;
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
|
||||
/**
|
||||
* The MultiTrie is a Trie of Tries. It stores words and their associated patch
|
||||
* commands. The MultiTrie handles patch commands individually (each command by
|
||||
* itself).
|
||||
*/
|
||||
public class MultiTrie extends Trie {
|
||||
final char EOM = '*';
|
||||
final String EOM_NODE = "" + EOM;
|
||||
|
||||
List<Trie> tries = new ArrayList<Trie>();
|
||||
|
||||
int BY = 1;
|
||||
|
||||
/**
|
||||
* Constructor for the MultiTrie object.
|
||||
*
|
||||
* @param is the input stream
|
||||
* @exception IOException if an I/O error occurs
|
||||
*/
|
||||
public MultiTrie(DataInput is) throws IOException {
|
||||
super(false);
|
||||
forward = is.readBoolean();
|
||||
BY = is.readInt();
|
||||
for (int i = is.readInt(); i > 0; i--) {
|
||||
tries.add(new Trie(is));
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Constructor for the MultiTrie object
|
||||
*
|
||||
* @param forward set to <tt>true</tt> if the elements should be read left to
|
||||
* right
|
||||
*/
|
||||
public MultiTrie(boolean forward) {
|
||||
super(forward);
|
||||
}
|
||||
|
||||
/**
|
||||
* Return the element that is stored in a cell associated with the given key.
|
||||
*
|
||||
* @param key the key to the cell holding the desired element
|
||||
* @return the element
|
||||
*/
|
||||
@Override
|
||||
public CharSequence getFully(CharSequence key) {
|
||||
StringBuilder result = new StringBuilder(tries.size() * 2);
|
||||
for (int i = 0; i < tries.size(); i++) {
|
||||
CharSequence r = tries.get(i).getFully(key);
|
||||
if (r == null || (r.length() == 1 && r.charAt(0) == EOM)) {
|
||||
return result;
|
||||
}
|
||||
result.append(r);
|
||||
}
|
||||
return result;
|
||||
}
|
||||
|
||||
/**
|
||||
* Return the element that is stored as last on a path belonging to the given
|
||||
* key.
|
||||
*
|
||||
* @param key the key associated with the desired element
|
||||
* @return the element that is stored as last on a path
|
||||
*/
|
||||
@Override
|
||||
public CharSequence getLastOnPath(CharSequence key) {
|
||||
StringBuilder result = new StringBuilder(tries.size() * 2);
|
||||
for (int i = 0; i < tries.size(); i++) {
|
||||
CharSequence r = tries.get(i).getLastOnPath(key);
|
||||
if (r == null || (r.length() == 1 && r.charAt(0) == EOM)) {
|
||||
return result;
|
||||
}
|
||||
result.append(r);
|
||||
}
|
||||
return result;
|
||||
}
|
||||
|
||||
/**
|
||||
* Write this data structure to the given output stream.
|
||||
*
|
||||
* @param os the output stream
|
||||
* @exception IOException if an I/O error occurs
|
||||
*/
|
||||
@Override
|
||||
public void store(DataOutput os) throws IOException {
|
||||
os.writeBoolean(forward);
|
||||
os.writeInt(BY);
|
||||
os.writeInt(tries.size());
|
||||
for (Trie trie : tries)
|
||||
trie.store(os);
|
||||
}
|
||||
|
||||
/**
|
||||
* Add an element to this structure consisting of the given key and patch
|
||||
* command.
|
||||
* <p>
|
||||
* This method will return without executing if the <tt>cmd</tt>
|
||||
* parameter's length is 0.
|
||||
*
|
||||
* @param key the key
|
||||
* @param cmd the patch command
|
||||
*/
|
||||
@Override
|
||||
public void add(CharSequence key, CharSequence cmd) {
|
||||
if (cmd.length() == 0) {
|
||||
return;
|
||||
}
|
||||
int levels = cmd.length() / BY;
|
||||
while (levels >= tries.size()) {
|
||||
tries.add(new Trie(forward));
|
||||
}
|
||||
for (int i = 0; i < levels; i++) {
|
||||
tries.get(i).add(key, cmd.subSequence(BY * i, BY * i + BY));
|
||||
}
|
||||
tries.get(levels).add(key, EOM_NODE);
|
||||
}
|
||||
|
||||
/**
|
||||
* Remove empty rows from the given Trie and return the newly reduced Trie.
|
||||
*
|
||||
* @param by the Trie to reduce
|
||||
* @return the newly reduced Trie
|
||||
*/
|
||||
@Override
|
||||
public Trie reduce(Reduce by) {
|
||||
List<Trie> h = new ArrayList<Trie>();
|
||||
for (Trie trie : tries)
|
||||
h.add(trie.reduce(by));
|
||||
|
||||
MultiTrie m = new MultiTrie(forward);
|
||||
m.tries = h;
|
||||
return m;
|
||||
}
|
||||
|
||||
/**
|
||||
* Print the given prefix and the position(s) in the Trie where it appears.
|
||||
*
|
||||
* @param prefix the desired prefix
|
||||
*/
|
||||
@Override
|
||||
public void printInfo(CharSequence prefix) {
|
||||
int c = 0;
|
||||
for (Trie trie : tries)
|
||||
trie.printInfo(prefix + "[" + (++c) + "] ");
|
||||
}
|
||||
}
|
|
@ -0,0 +1,333 @@
|
|||
/*
|
||||
Egothor Software License version 1.00
|
||||
Copyright (C) 1997-2004 Leo Galambos.
|
||||
Copyright (C) 2002-2004 "Egothor developers"
|
||||
on behalf of the Egothor Project.
|
||||
All rights reserved.
|
||||
|
||||
This software is copyrighted by the "Egothor developers". If this
|
||||
license applies to a single file or document, the "Egothor developers"
|
||||
are the people or entities mentioned as copyright holders in that file
|
||||
or document. If this license applies to the Egothor project as a
|
||||
whole, the copyright holders are the people or entities mentioned in
|
||||
the file CREDITS. This file can be found in the same location as this
|
||||
license in the distribution.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
1. Redistributions of source code must retain the above copyright
|
||||
notice, the list of contributors, this list of conditions, and the
|
||||
following disclaimer.
|
||||
2. Redistributions in binary form must reproduce the above copyright
|
||||
notice, the list of contributors, this list of conditions, and the
|
||||
disclaimer that follows these conditions in the documentation
|
||||
and/or other materials provided with the distribution.
|
||||
3. The name "Egothor" must not be used to endorse or promote products
|
||||
derived from this software without prior written permission. For
|
||||
written permission, please contact Leo.G@seznam.cz
|
||||
4. Products derived from this software may not be called "Egothor",
|
||||
nor may "Egothor" appear in their name, without prior written
|
||||
permission from Leo.G@seznam.cz.
|
||||
|
||||
In addition, we request that you include in the end-user documentation
|
||||
provided with the redistribution and/or in the software itself an
|
||||
acknowledgement equivalent to the following:
|
||||
"This product includes software developed by the Egothor Project.
|
||||
http://egothor.sf.net/"
|
||||
|
||||
THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
|
||||
WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
|
||||
MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
|
||||
IN NO EVENT SHALL THE EGOTHOR PROJECT OR ITS CONTRIBUTORS BE LIABLE
|
||||
FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
||||
CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
||||
SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
|
||||
BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
|
||||
WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE
|
||||
OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN
|
||||
IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
This software consists of voluntary contributions made by many
|
||||
individuals on behalf of the Egothor Project and was originally
|
||||
created by Leo Galambos (Leo.G@seznam.cz).
|
||||
*/
|
||||
package org.egothor.stemmer;
|
||||
|
||||
import java.io.DataInput;
|
||||
import java.io.DataOutput;
|
||||
import java.io.IOException;
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
|
||||
/**
|
||||
* The MultiTrie is a Trie of Tries.
|
||||
* <p>
|
||||
* It stores words and their associated patch commands. The MultiTrie handles
|
||||
* patch commmands broken into their constituent parts, as a MultiTrie does, but
|
||||
* the commands are delimited by the skip command.
|
||||
*/
|
||||
public class MultiTrie2 extends MultiTrie {
|
||||
/**
|
||||
* Constructor for the MultiTrie object.
|
||||
*
|
||||
* @param is the input stream
|
||||
* @exception IOException if an I/O error occurs
|
||||
*/
|
||||
public MultiTrie2(DataInput is) throws IOException {
|
||||
super(is);
|
||||
}
|
||||
|
||||
/**
|
||||
* Constructor for the MultiTrie2 object
|
||||
*
|
||||
* @param forward set to <tt>true</tt> if the elements should be read left to
|
||||
* right
|
||||
*/
|
||||
public MultiTrie2(boolean forward) {
|
||||
super(forward);
|
||||
}
|
||||
|
||||
/**
|
||||
* Return the element that is stored in a cell associated with the given key.
|
||||
*
|
||||
* @param key the key to the cell holding the desired element
|
||||
* @return the element
|
||||
*/
|
||||
@Override
|
||||
public CharSequence getFully(CharSequence key) {
|
||||
StringBuilder result = new StringBuilder(tries.size() * 2);
|
||||
try {
|
||||
CharSequence lastkey = key;
|
||||
CharSequence p[] = new CharSequence[tries.size()];
|
||||
char lastch = ' ';
|
||||
for (int i = 0; i < tries.size(); i++) {
|
||||
CharSequence r = tries.get(i).getFully(lastkey);
|
||||
if (r == null || (r.length() == 1 && r.charAt(0) == EOM)) {
|
||||
return result;
|
||||
}
|
||||
if (cannotFollow(lastch, r.charAt(0))) {
|
||||
return result;
|
||||
} else {
|
||||
lastch = r.charAt(r.length() - 2);
|
||||
}
|
||||
// key=key.substring(lengthPP(r));
|
||||
p[i] = r;
|
||||
if (p[i].charAt(0) == '-') {
|
||||
if (i > 0) {
|
||||
key = skip(key, lengthPP(p[i - 1]));
|
||||
}
|
||||
key = skip(key, lengthPP(p[i]));
|
||||
}
|
||||
// key = skip(key, lengthPP(r));
|
||||
result.append(r);
|
||||
if (key.length() != 0) {
|
||||
lastkey = key;
|
||||
}
|
||||
}
|
||||
} catch (IndexOutOfBoundsException x) {}
|
||||
return result;
|
||||
}
|
||||
|
||||
/**
|
||||
* Return the element that is stored as last on a path belonging to the given
|
||||
* key.
|
||||
*
|
||||
* @param key the key associated with the desired element
|
||||
* @return the element that is stored as last on a path
|
||||
*/
|
||||
@Override
|
||||
public CharSequence getLastOnPath(CharSequence key) {
|
||||
StringBuilder result = new StringBuilder(tries.size() * 2);
|
||||
try {
|
||||
CharSequence lastkey = key;
|
||||
CharSequence p[] = new CharSequence[tries.size()];
|
||||
char lastch = ' ';
|
||||
for (int i = 0; i < tries.size(); i++) {
|
||||
CharSequence r = tries.get(i).getLastOnPath(lastkey);
|
||||
if (r == null || (r.length() == 1 && r.charAt(0) == EOM)) {
|
||||
return result;
|
||||
}
|
||||
// System.err.println("LP:"+key+" last:"+lastch+" new:"+r);
|
||||
if (cannotFollow(lastch, r.charAt(0))) {
|
||||
return result;
|
||||
} else {
|
||||
lastch = r.charAt(r.length() - 2);
|
||||
}
|
||||
// key=key.substring(lengthPP(r));
|
||||
p[i] = r;
|
||||
if (p[i].charAt(0) == '-') {
|
||||
if (i > 0) {
|
||||
key = skip(key, lengthPP(p[i - 1]));
|
||||
}
|
||||
key = skip(key, lengthPP(p[i]));
|
||||
}
|
||||
// key = skip(key, lengthPP(r));
|
||||
result.append(r);
|
||||
if (key.length() != 0) {
|
||||
lastkey = key;
|
||||
}
|
||||
}
|
||||
} catch (IndexOutOfBoundsException x) {}
|
||||
return result;
|
||||
}
|
||||
|
||||
/**
|
||||
* Write this data structure to the given output stream.
|
||||
*
|
||||
* @param os the output stream
|
||||
* @exception IOException if an I/O error occurs
|
||||
*/
|
||||
@Override
|
||||
public void store(DataOutput os) throws IOException {
|
||||
super.store(os);
|
||||
}
|
||||
|
||||
/**
|
||||
* Add an element to this structure consisting of the given key and patch
|
||||
* command.
|
||||
* <p>
|
||||
* This method will return without executing if the <tt>cmd</tt>
|
||||
* parameter's length is 0.
|
||||
*
|
||||
* @param key the key
|
||||
* @param cmd the patch command
|
||||
*/
|
||||
@Override
|
||||
public void add(CharSequence key, CharSequence cmd) {
|
||||
if (cmd.length() == 0) {
|
||||
return;
|
||||
}
|
||||
// System.err.println( cmd );
|
||||
CharSequence p[] = decompose(cmd);
|
||||
int levels = p.length;
|
||||
// System.err.println("levels "+key+" cmd "+cmd+"|"+levels);
|
||||
while (levels >= tries.size()) {
|
||||
tries.add(new Trie(forward));
|
||||
}
|
||||
CharSequence lastkey = key;
|
||||
for (int i = 0; i < levels; i++) {
|
||||
if (key.length() > 0) {
|
||||
tries.get(i).add(key, p[i]);
|
||||
lastkey = key;
|
||||
} else {
|
||||
tries.get(i).add(lastkey, p[i]);
|
||||
}
|
||||
// System.err.println("-"+key+" "+p[i]+"|"+key.length());
|
||||
/*
|
||||
* key=key.substring(lengthPP(p[i]));
|
||||
*/
|
||||
if (p[i].length() > 0 && p[i].charAt(0) == '-') {
|
||||
if (i > 0) {
|
||||
key = skip(key, lengthPP(p[i - 1]));
|
||||
}
|
||||
key = skip(key, lengthPP(p[i]));
|
||||
}
|
||||
// System.err.println("--->"+key);
|
||||
}
|
||||
if (key.length() > 0) {
|
||||
tries.get(levels).add(key, EOM_NODE);
|
||||
} else {
|
||||
tries.get(levels).add(lastkey, EOM_NODE);
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Break the given patch command into its constituent pieces. The pieces are
|
||||
* delimited by NOOP commands.
|
||||
*
|
||||
* @param cmd the patch command
|
||||
* @return an array containing the pieces of the command
|
||||
*/
|
||||
public CharSequence[] decompose(CharSequence cmd) {
|
||||
int parts = 0;
|
||||
|
||||
for (int i = 0; 0 <= i && i < cmd.length();) {
|
||||
int next = dashEven(cmd, i);
|
||||
if (i == next) {
|
||||
parts++;
|
||||
i = next + 2;
|
||||
} else {
|
||||
parts++;
|
||||
i = next;
|
||||
}
|
||||
}
|
||||
|
||||
CharSequence part[] = new CharSequence[parts];
|
||||
int x = 0;
|
||||
|
||||
for (int i = 0; 0 <= i && i < cmd.length();) {
|
||||
int next = dashEven(cmd, i);
|
||||
if (i == next) {
|
||||
part[x++] = cmd.subSequence(i, i + 2);
|
||||
i = next + 2;
|
||||
} else {
|
||||
part[x++] = (next < 0) ? cmd.subSequence(i, cmd.length()) : cmd.subSequence(i, next);
|
||||
i = next;
|
||||
}
|
||||
}
|
||||
return part;
|
||||
}
|
||||
|
||||
/**
|
||||
* Remove empty rows from the given Trie and return the newly reduced Trie.
|
||||
*
|
||||
* @param by the Trie to reduce
|
||||
* @return the newly reduced Trie
|
||||
*/
|
||||
@Override
|
||||
public Trie reduce(Reduce by) {
|
||||
List<Trie> h = new ArrayList<Trie>();
|
||||
for (Trie trie : tries)
|
||||
h.add(trie.reduce(by));
|
||||
|
||||
MultiTrie2 m = new MultiTrie2(forward);
|
||||
m.tries = h;
|
||||
return m;
|
||||
}
|
||||
|
||||
private boolean cannotFollow(char after, char goes) {
|
||||
switch (after) {
|
||||
case '-':
|
||||
case 'D':
|
||||
return after == goes;
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
private CharSequence skip(CharSequence in, int count) {
|
||||
if (forward) {
|
||||
return in.subSequence(count, in.length());
|
||||
} else {
|
||||
return in.subSequence(0, in.length() - count);
|
||||
}
|
||||
}
|
||||
|
||||
private int dashEven(CharSequence in, int from) {
|
||||
while (from < in.length()) {
|
||||
if (in.charAt(from) == '-') {
|
||||
return from;
|
||||
} else {
|
||||
from += 2;
|
||||
}
|
||||
}
|
||||
return -1;
|
||||
}
|
||||
|
||||
private int lengthPP(CharSequence cmd) {
|
||||
int len = 0;
|
||||
for (int i = 0; i < cmd.length(); i++) {
|
||||
switch (cmd.charAt(i++)) {
|
||||
case '-':
|
||||
case 'D':
|
||||
len += cmd.charAt(i) - 'a' + 1;
|
||||
break;
|
||||
case 'R':
|
||||
len++;
|
||||
case 'I':
|
||||
break;
|
||||
}
|
||||
}
|
||||
return len;
|
||||
}
|
||||
}
|
|
@ -0,0 +1,198 @@
|
|||
/*
|
||||
Egothor Software License version 1.00
|
||||
Copyright (C) 1997-2004 Leo Galambos.
|
||||
Copyright (C) 2002-2004 "Egothor developers"
|
||||
on behalf of the Egothor Project.
|
||||
All rights reserved.
|
||||
|
||||
This software is copyrighted by the "Egothor developers". If this
|
||||
license applies to a single file or document, the "Egothor developers"
|
||||
are the people or entities mentioned as copyright holders in that file
|
||||
or document. If this license applies to the Egothor project as a
|
||||
whole, the copyright holders are the people or entities mentioned in
|
||||
the file CREDITS. This file can be found in the same location as this
|
||||
license in the distribution.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
1. Redistributions of source code must retain the above copyright
|
||||
notice, the list of contributors, this list of conditions, and the
|
||||
following disclaimer.
|
||||
2. Redistributions in binary form must reproduce the above copyright
|
||||
notice, the list of contributors, this list of conditions, and the
|
||||
disclaimer that follows these conditions in the documentation
|
||||
and/or other materials provided with the distribution.
|
||||
3. The name "Egothor" must not be used to endorse or promote products
|
||||
derived from this software without prior written permission. For
|
||||
written permission, please contact Leo.G@seznam.cz
|
||||
4. Products derived from this software may not be called "Egothor",
|
||||
nor may "Egothor" appear in their name, without prior written
|
||||
permission from Leo.G@seznam.cz.
|
||||
|
||||
In addition, we request that you include in the end-user documentation
|
||||
provided with the redistribution and/or in the software itself an
|
||||
acknowledgement equivalent to the following:
|
||||
"This product includes software developed by the Egothor Project.
|
||||
http://egothor.sf.net/"
|
||||
|
||||
THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
|
||||
WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
|
||||
MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
|
||||
IN NO EVENT SHALL THE EGOTHOR PROJECT OR ITS CONTRIBUTORS BE LIABLE
|
||||
FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
||||
CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
||||
SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
|
||||
BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
|
||||
WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE
|
||||
OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN
|
||||
IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
This software consists of voluntary contributions made by many
|
||||
individuals on behalf of the Egothor Project and was originally
|
||||
created by Leo Galambos (Leo.G@seznam.cz).
|
||||
*/
|
||||
package org.egothor.stemmer;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.Arrays;
|
||||
import java.util.Iterator;
|
||||
import java.util.List;
|
||||
|
||||
/**
|
||||
* The Optimizer class is a Trie that will be reduced (have empty rows removed).
|
||||
* <p>
|
||||
* The reduction will be made by joining two rows where the first is a subset of
|
||||
* the second.
|
||||
*/
|
||||
public class Optimizer extends Reduce {
|
||||
/**
|
||||
* Constructor for the Optimizer object.
|
||||
*/
|
||||
public Optimizer() {}
|
||||
|
||||
/**
|
||||
* Optimize (remove empty rows) from the given Trie and return the resulting
|
||||
* Trie.
|
||||
*
|
||||
* @param orig the Trie to consolidate
|
||||
* @return the newly consolidated Trie
|
||||
*/
|
||||
@Override
|
||||
public Trie optimize(Trie orig) {
|
||||
List<CharSequence> cmds = orig.cmds;
|
||||
List<Row> rows = new ArrayList<Row>();
|
||||
List<Row> orows = orig.rows;
|
||||
int remap[] = new int[orows.size()];
|
||||
|
||||
for (int j = orows.size() - 1; j >= 0; j--) {
|
||||
Row now = new Remap(orows.get(j), remap);
|
||||
boolean merged = false;
|
||||
|
||||
for (int i = 0; i < rows.size(); i++) {
|
||||
Row q = merge(now, rows.get(i));
|
||||
if (q != null) {
|
||||
rows.set(i, q);
|
||||
merged = true;
|
||||
remap[j] = i;
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
if (merged == false) {
|
||||
remap[j] = rows.size();
|
||||
rows.add(now);
|
||||
}
|
||||
}
|
||||
|
||||
int root = remap[orig.root];
|
||||
Arrays.fill(remap, -1);
|
||||
rows = removeGaps(root, rows, new ArrayList<Row>(), remap);
|
||||
|
||||
return new Trie(orig.forward, remap[root], cmds, rows);
|
||||
}
|
||||
|
||||
/**
|
||||
* Merge the given rows and return the resulting Row.
|
||||
*
|
||||
* @param master the master Row
|
||||
* @param existing the existing Row
|
||||
* @return the resulting Row, or <tt>null</tt> if the operation cannot be
|
||||
* realized
|
||||
*/
|
||||
public Row merge(Row master, Row existing) {
|
||||
Iterator<Character> i = master.cells.keySet().iterator();
|
||||
Row n = new Row();
|
||||
for (; i.hasNext();) {
|
||||
Character ch = i.next();
|
||||
// XXX also must handle Cnt and Skip !!
|
||||
Cell a = master.cells.get(ch);
|
||||
Cell b = existing.cells.get(ch);
|
||||
|
||||
Cell s = (b == null) ? new Cell(a) : merge(a, b);
|
||||
if (s == null) {
|
||||
return null;
|
||||
}
|
||||
n.cells.put(ch, s);
|
||||
}
|
||||
i = existing.cells.keySet().iterator();
|
||||
for (; i.hasNext();) {
|
||||
Character ch = i.next();
|
||||
if (master.at(ch) != null) {
|
||||
continue;
|
||||
}
|
||||
n.cells.put(ch, existing.at(ch));
|
||||
}
|
||||
return n;
|
||||
}
|
||||
|
||||
/**
|
||||
* Merge the given Cells and return the resulting Cell.
|
||||
*
|
||||
* @param m the master Cell
|
||||
* @param e the existing Cell
|
||||
* @return the resulting Cell, or <tt>null</tt> if the operation cannot be
|
||||
* realized
|
||||
*/
|
||||
public Cell merge(Cell m, Cell e) {
|
||||
Cell n = new Cell();
|
||||
|
||||
if (m.skip != e.skip) {
|
||||
return null;
|
||||
}
|
||||
|
||||
if (m.cmd >= 0) {
|
||||
if (e.cmd >= 0) {
|
||||
if (m.cmd == e.cmd) {
|
||||
n.cmd = m.cmd;
|
||||
} else {
|
||||
return null;
|
||||
}
|
||||
} else {
|
||||
n.cmd = m.cmd;
|
||||
}
|
||||
} else {
|
||||
n.cmd = e.cmd;
|
||||
}
|
||||
if (m.ref >= 0) {
|
||||
if (e.ref >= 0) {
|
||||
if (m.ref == e.ref) {
|
||||
if (m.skip == e.skip) {
|
||||
n.ref = m.ref;
|
||||
} else {
|
||||
return null;
|
||||
}
|
||||
} else {
|
||||
return null;
|
||||
}
|
||||
} else {
|
||||
n.ref = m.ref;
|
||||
}
|
||||
} else {
|
||||
n.ref = e.ref;
|
||||
}
|
||||
n.cnt = m.cnt + e.cnt;
|
||||
n.skip = m.skip;
|
||||
return n;
|
||||
}
|
||||
}
|
|
@ -0,0 +1,90 @@
|
|||
/*
|
||||
Egothor Software License version 1.00
|
||||
Copyright (C) 1997-2004 Leo Galambos.
|
||||
Copyright (C) 2002-2004 "Egothor developers"
|
||||
on behalf of the Egothor Project.
|
||||
All rights reserved.
|
||||
|
||||
This software is copyrighted by the "Egothor developers". If this
|
||||
license applies to a single file or document, the "Egothor developers"
|
||||
are the people or entities mentioned as copyright holders in that file
|
||||
or document. If this license applies to the Egothor project as a
|
||||
whole, the copyright holders are the people or entities mentioned in
|
||||
the file CREDITS. This file can be found in the same location as this
|
||||
license in the distribution.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
1. Redistributions of source code must retain the above copyright
|
||||
notice, the list of contributors, this list of conditions, and the
|
||||
following disclaimer.
|
||||
2. Redistributions in binary form must reproduce the above copyright
|
||||
notice, the list of contributors, this list of conditions, and the
|
||||
disclaimer that follows these conditions in the documentation
|
||||
and/or other materials provided with the distribution.
|
||||
3. The name "Egothor" must not be used to endorse or promote products
|
||||
derived from this software without prior written permission. For
|
||||
written permission, please contact Leo.G@seznam.cz
|
||||
4. Products derived from this software may not be called "Egothor",
|
||||
nor may "Egothor" appear in their name, without prior written
|
||||
permission from Leo.G@seznam.cz.
|
||||
|
||||
In addition, we request that you include in the end-user documentation
|
||||
provided with the redistribution and/or in the software itself an
|
||||
acknowledgement equivalent to the following:
|
||||
"This product includes software developed by the Egothor Project.
|
||||
http://egothor.sf.net/"
|
||||
|
||||
THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
|
||||
WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
|
||||
MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
|
||||
IN NO EVENT SHALL THE EGOTHOR PROJECT OR ITS CONTRIBUTORS BE LIABLE
|
||||
FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
||||
CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
||||
SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
|
||||
BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
|
||||
WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE
|
||||
OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN
|
||||
IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
This software consists of voluntary contributions made by many
|
||||
individuals on behalf of the Egothor Project and was originally
|
||||
created by Leo Galambos (Leo.G@seznam.cz).
|
||||
*/
|
||||
package org.egothor.stemmer;
|
||||
|
||||
/**
|
||||
* The Optimizer class is a Trie that will be reduced (have empty rows removed).
|
||||
* <p>
|
||||
* This is the result of allowing a joining of rows when there is no collision
|
||||
* between non-<tt>null</tt> values in the rows. Information loss, resulting in
|
||||
* the stemmer not being able to recognize words (as in Optimizer), is
|
||||
* curtailed, allowing the stemmer to recognize words for which the original
|
||||
* trie was built. Use of this class allows the stemmer to be self-teaching.
|
||||
*/
|
||||
public class Optimizer2 extends Optimizer {
|
||||
/**
|
||||
* Constructor for the Optimizer2 object.
|
||||
*/
|
||||
public Optimizer2() {}
|
||||
|
||||
/**
|
||||
* Merge the given Cells and return the resulting Cell.
|
||||
*
|
||||
* @param m the master Cell
|
||||
* @param e the existing Cell
|
||||
* @return the resulting Cell, or <tt>null</tt> if the operation cannot be
|
||||
* realized
|
||||
*/
|
||||
@Override
|
||||
public Cell merge(Cell m, Cell e) {
|
||||
if (m.cmd == e.cmd && m.ref == e.ref && m.skip == e.skip) {
|
||||
Cell c = new Cell(m);
|
||||
c.cnt += e.cnt;
|
||||
return c;
|
||||
} else {
|
||||
return null;
|
||||
}
|
||||
}
|
||||
}
|
|
@ -0,0 +1,134 @@
|
|||
/*
|
||||
Egothor Software License version 1.00
|
||||
Copyright (C) 1997-2004 Leo Galambos.
|
||||
Copyright (C) 2002-2004 "Egothor developers"
|
||||
on behalf of the Egothor Project.
|
||||
All rights reserved.
|
||||
|
||||
This software is copyrighted by the "Egothor developers". If this
|
||||
license applies to a single file or document, the "Egothor developers"
|
||||
are the people or entities mentioned as copyright holders in that file
|
||||
or document. If this license applies to the Egothor project as a
|
||||
whole, the copyright holders are the people or entities mentioned in
|
||||
the file CREDITS. This file can be found in the same location as this
|
||||
license in the distribution.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
1. Redistributions of source code must retain the above copyright
|
||||
notice, the list of contributors, this list of conditions, and the
|
||||
following disclaimer.
|
||||
2. Redistributions in binary form must reproduce the above copyright
|
||||
notice, the list of contributors, this list of conditions, and the
|
||||
disclaimer that follows these conditions in the documentation
|
||||
and/or other materials provided with the distribution.
|
||||
3. The name "Egothor" must not be used to endorse or promote products
|
||||
derived from this software without prior written permission. For
|
||||
written permission, please contact Leo.G@seznam.cz
|
||||
4. Products derived from this software may not be called "Egothor",
|
||||
nor may "Egothor" appear in their name, without prior written
|
||||
permission from Leo.G@seznam.cz.
|
||||
|
||||
In addition, we request that you include in the end-user documentation
|
||||
provided with the redistribution and/or in the software itself an
|
||||
acknowledgement equivalent to the following:
|
||||
"This product includes software developed by the Egothor Project.
|
||||
http://egothor.sf.net/"
|
||||
|
||||
THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
|
||||
WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
|
||||
MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
|
||||
IN NO EVENT SHALL THE EGOTHOR PROJECT OR ITS CONTRIBUTORS BE LIABLE
|
||||
FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
||||
CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
||||
SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
|
||||
BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
|
||||
WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE
|
||||
OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN
|
||||
IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
This software consists of voluntary contributions made by many
|
||||
individuals on behalf of the Egothor Project and was originally
|
||||
created by Leo Galambos (Leo.G@seznam.cz).
|
||||
*/
|
||||
package org.egothor.stemmer;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.Arrays;
|
||||
import java.util.Iterator;
|
||||
import java.util.List;
|
||||
|
||||
/**
|
||||
* The Reduce object is used to remove gaps in a Trie which stores a dictionary.
|
||||
*/
|
||||
public class Reduce {
|
||||
|
||||
/**
|
||||
* Constructor for the Reduce object.
|
||||
*/
|
||||
public Reduce() {}
|
||||
|
||||
/**
|
||||
* Optimize (remove holes in the rows) the given Trie and return the
|
||||
* restructured Trie.
|
||||
*
|
||||
* @param orig the Trie to optimize
|
||||
* @return the restructured Trie
|
||||
*/
|
||||
public Trie optimize(Trie orig) {
|
||||
List<CharSequence> cmds = orig.cmds;
|
||||
List<Row> rows = new ArrayList<Row>();
|
||||
List<Row> orows = orig.rows;
|
||||
int remap[] = new int[orows.size()];
|
||||
|
||||
Arrays.fill(remap, -1);
|
||||
rows = removeGaps(orig.root, rows, new ArrayList<Row>(), remap);
|
||||
|
||||
return new Trie(orig.forward, remap[orig.root], cmds, rows);
|
||||
}
|
||||
|
||||
List<Row> removeGaps(int ind, List<Row> old, List<Row> to, int remap[]) {
|
||||
remap[ind] = to.size();
|
||||
|
||||
Row now = old.get(ind);
|
||||
to.add(now);
|
||||
Iterator<Cell> i = now.cells.values().iterator();
|
||||
for (; i.hasNext();) {
|
||||
Cell c = i.next();
|
||||
if (c.ref >= 0 && remap[c.ref] < 0) {
|
||||
removeGaps(c.ref, old, to, remap);
|
||||
}
|
||||
}
|
||||
to.set(remap[ind], new Remap(now, remap));
|
||||
return to;
|
||||
}
|
||||
|
||||
/**
|
||||
* This class is part of the Egothor Project
|
||||
*/
|
||||
class Remap extends Row {
|
||||
/**
|
||||
* Constructor for the Remap object
|
||||
*
|
||||
* @param old Description of the Parameter
|
||||
* @param remap Description of the Parameter
|
||||
*/
|
||||
public Remap(Row old, int remap[]) {
|
||||
super();
|
||||
Iterator<Character> i = old.cells.keySet().iterator();
|
||||
for (; i.hasNext();) {
|
||||
Character ch = i.next();
|
||||
Cell c = old.at(ch);
|
||||
Cell nc;
|
||||
if (c.ref >= 0) {
|
||||
nc = new Cell(c);
|
||||
nc.ref = remap[nc.ref];
|
||||
} else {
|
||||
nc = new Cell(c);
|
||||
}
|
||||
cells.put(ch, nc);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
|
@ -0,0 +1,309 @@
|
|||
/*
|
||||
Egothor Software License version 1.00
|
||||
Copyright (C) 1997-2004 Leo Galambos.
|
||||
Copyright (C) 2002-2004 "Egothor developers"
|
||||
on behalf of the Egothor Project.
|
||||
All rights reserved.
|
||||
|
||||
This software is copyrighted by the "Egothor developers". If this
|
||||
license applies to a single file or document, the "Egothor developers"
|
||||
are the people or entities mentioned as copyright holders in that file
|
||||
or document. If this license applies to the Egothor project as a
|
||||
whole, the copyright holders are the people or entities mentioned in
|
||||
the file CREDITS. This file can be found in the same location as this
|
||||
license in the distribution.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
1. Redistributions of source code must retain the above copyright
|
||||
notice, the list of contributors, this list of conditions, and the
|
||||
following disclaimer.
|
||||
2. Redistributions in binary form must reproduce the above copyright
|
||||
notice, the list of contributors, this list of conditions, and the
|
||||
disclaimer that follows these conditions in the documentation
|
||||
and/or other materials provided with the distribution.
|
||||
3. The name "Egothor" must not be used to endorse or promote products
|
||||
derived from this software without prior written permission. For
|
||||
written permission, please contact Leo.G@seznam.cz
|
||||
4. Products derived from this software may not be called "Egothor",
|
||||
nor may "Egothor" appear in their name, without prior written
|
||||
permission from Leo.G@seznam.cz.
|
||||
|
||||
In addition, we request that you include in the end-user documentation
|
||||
provided with the redistribution and/or in the software itself an
|
||||
acknowledgement equivalent to the following:
|
||||
"This product includes software developed by the Egothor Project.
|
||||
http://egothor.sf.net/"
|
||||
|
||||
THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
|
||||
WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
|
||||
MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
|
||||
IN NO EVENT SHALL THE EGOTHOR PROJECT OR ITS CONTRIBUTORS BE LIABLE
|
||||
FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
||||
CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
||||
SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
|
||||
BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
|
||||
WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE
|
||||
OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN
|
||||
IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
This software consists of voluntary contributions made by many
|
||||
individuals on behalf of the Egothor Project and was originally
|
||||
created by Leo Galambos (Leo.G@seznam.cz).
|
||||
*/
|
||||
package org.egothor.stemmer;
|
||||
|
||||
import java.io.DataInput;
|
||||
import java.io.DataOutput;
|
||||
import java.io.IOException;
|
||||
import java.util.Iterator;
|
||||
import java.util.TreeMap;
|
||||
|
||||
/**
|
||||
* The Row class represents a row in a matrix representation of a trie.
|
||||
*/
|
||||
public class Row {
|
||||
TreeMap<Character,Cell> cells = new TreeMap<Character,Cell>();
|
||||
int uniformCnt = 0;
|
||||
int uniformSkip = 0;
|
||||
|
||||
/**
|
||||
* Construct a Row object from input carried in via the given input stream.
|
||||
*
|
||||
* @param is the input stream
|
||||
* @exception IOException if an I/O error occurs
|
||||
*/
|
||||
public Row(DataInput is) throws IOException {
|
||||
for (int i = is.readInt(); i > 0; i--) {
|
||||
char ch = is.readChar();
|
||||
Cell c = new Cell();
|
||||
c.cmd = is.readInt();
|
||||
c.cnt = is.readInt();
|
||||
c.ref = is.readInt();
|
||||
c.skip = is.readInt();
|
||||
cells.put(ch, c);
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* The default constructor for the Row object.
|
||||
*/
|
||||
public Row() {}
|
||||
|
||||
/**
|
||||
* Construct a Row using the cells of the given Row.
|
||||
*
|
||||
* @param old the Row to copy
|
||||
*/
|
||||
public Row(Row old) {
|
||||
cells = old.cells;
|
||||
}
|
||||
|
||||
/**
|
||||
* Set the command in the Cell of the given Character to the given integer.
|
||||
*
|
||||
* @param way the Character defining the Cell
|
||||
* @param cmd the new command
|
||||
*/
|
||||
public void setCmd(Character way, int cmd) {
|
||||
Cell c = at(way);
|
||||
if (c == null) {
|
||||
c = new Cell();
|
||||
c.cmd = cmd;
|
||||
cells.put(way, c);
|
||||
} else {
|
||||
c.cmd = cmd;
|
||||
}
|
||||
c.cnt = (cmd >= 0) ? 1 : 0;
|
||||
}
|
||||
|
||||
/**
|
||||
* Set the reference to the next row in the Cell of the given Character to the
|
||||
* given integer.
|
||||
*
|
||||
* @param way the Character defining the Cell
|
||||
* @param ref The new ref value
|
||||
*/
|
||||
public void setRef(Character way, int ref) {
|
||||
Cell c = at(way);
|
||||
if (c == null) {
|
||||
c = new Cell();
|
||||
c.ref = ref;
|
||||
cells.put(way, c);
|
||||
} else {
|
||||
c.ref = ref;
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Return the number of cells in use.
|
||||
*
|
||||
* @return the number of cells in use
|
||||
*/
|
||||
public int getCells() {
|
||||
Iterator<Character> i = cells.keySet().iterator();
|
||||
int size = 0;
|
||||
for (; i.hasNext();) {
|
||||
Character c = i.next();
|
||||
Cell e = at(c);
|
||||
if (e.cmd >= 0 || e.ref >= 0) {
|
||||
size++;
|
||||
}
|
||||
}
|
||||
return size;
|
||||
}
|
||||
|
||||
/**
|
||||
* Return the number of references (how many transitions) to other rows.
|
||||
*
|
||||
* @return the number of references
|
||||
*/
|
||||
public int getCellsPnt() {
|
||||
Iterator<Character> i = cells.keySet().iterator();
|
||||
int size = 0;
|
||||
for (; i.hasNext();) {
|
||||
Character c = i.next();
|
||||
Cell e = at(c);
|
||||
if (e.ref >= 0) {
|
||||
size++;
|
||||
}
|
||||
}
|
||||
return size;
|
||||
}
|
||||
|
||||
/**
|
||||
* Return the number of patch commands saved in this Row.
|
||||
*
|
||||
* @return the number of patch commands
|
||||
*/
|
||||
public int getCellsVal() {
|
||||
Iterator<Character> i = cells.keySet().iterator();
|
||||
int size = 0;
|
||||
for (; i.hasNext();) {
|
||||
Character c = i.next();
|
||||
Cell e = at(c);
|
||||
if (e.cmd >= 0) {
|
||||
size++;
|
||||
}
|
||||
}
|
||||
return size;
|
||||
}
|
||||
|
||||
/**
|
||||
* Return the command in the Cell associated with the given Character.
|
||||
*
|
||||
* @param way the Character associated with the Cell holding the desired
|
||||
* command
|
||||
* @return the command
|
||||
*/
|
||||
public int getCmd(Character way) {
|
||||
Cell c = at(way);
|
||||
return (c == null) ? -1 : c.cmd;
|
||||
}
|
||||
|
||||
/**
|
||||
* Return the number of patch commands were in the Cell associated with the
|
||||
* given Character before the Trie containing this Row was reduced.
|
||||
*
|
||||
* @param way the Character associated with the desired Cell
|
||||
* @return the number of patch commands before reduction
|
||||
*/
|
||||
public int getCnt(Character way) {
|
||||
Cell c = at(way);
|
||||
return (c == null) ? -1 : c.cnt;
|
||||
}
|
||||
|
||||
/**
|
||||
* Return the reference to the next Row in the Cell associated with the given
|
||||
* Character.
|
||||
*
|
||||
* @param way the Character associated with the desired Cell
|
||||
* @return the reference, or -1 if the Cell is <tt>null,/tt>
|
||||
*/
|
||||
public int getRef(Character way) {
|
||||
Cell c = at(way);
|
||||
return (c == null) ? -1 : c.ref;
|
||||
}
|
||||
|
||||
/**
|
||||
* Write the contents of this Row to the given output stream.
|
||||
*
|
||||
* @param os the output stream
|
||||
* @exception IOException if an I/O error occurs
|
||||
*/
|
||||
public void store(DataOutput os) throws IOException {
|
||||
os.writeInt(cells.size());
|
||||
Iterator<Character> i = cells.keySet().iterator();
|
||||
for (; i.hasNext();) {
|
||||
Character c = i.next();
|
||||
Cell e = at(c);
|
||||
if (e.cmd < 0 && e.ref < 0) {
|
||||
continue;
|
||||
}
|
||||
|
||||
os.writeChar(c.charValue());
|
||||
os.writeInt(e.cmd);
|
||||
os.writeInt(e.cnt);
|
||||
os.writeInt(e.ref);
|
||||
os.writeInt(e.skip);
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Return the number of identical Cells (containing patch commands) in this
|
||||
* Row.
|
||||
*
|
||||
* @param eqSkip when set to <tt>false</tt> the removed patch commands are
|
||||
* considered
|
||||
* @return the number of identical Cells, or -1 if there are (at least) two
|
||||
* different cells
|
||||
*/
|
||||
public int uniformCmd(boolean eqSkip) {
|
||||
Iterator<Cell> i = cells.values().iterator();
|
||||
int ret = -1;
|
||||
uniformCnt = 1;
|
||||
uniformSkip = 0;
|
||||
for (; i.hasNext();) {
|
||||
Cell c = i.next();
|
||||
if (c.ref >= 0) {
|
||||
return -1;
|
||||
}
|
||||
if (c.cmd >= 0) {
|
||||
if (ret < 0) {
|
||||
ret = c.cmd;
|
||||
uniformSkip = c.skip;
|
||||
} else if (ret == c.cmd) {
|
||||
if (eqSkip) {
|
||||
if (uniformSkip == c.skip) {
|
||||
uniformCnt++;
|
||||
} else {
|
||||
return -1;
|
||||
}
|
||||
} else {
|
||||
uniformCnt++;
|
||||
}
|
||||
} else {
|
||||
return -1;
|
||||
}
|
||||
}
|
||||
}
|
||||
return ret;
|
||||
}
|
||||
|
||||
/**
|
||||
* Write the contents of this Row to stdout.
|
||||
*/
|
||||
public void print() {
|
||||
for (Iterator<Character> i = cells.keySet().iterator(); i.hasNext();) {
|
||||
Character ch = i.next();
|
||||
Cell c = at(ch);
|
||||
System.out.print("[" + ch + ":" + c + "]");
|
||||
}
|
||||
System.out.println();
|
||||
}
|
||||
|
||||
Cell at(Character index) {
|
||||
return cells.get(index);
|
||||
}
|
||||
}
|
|
@ -0,0 +1,419 @@
|
|||
/*
|
||||
Egothor Software License version 1.00
|
||||
Copyright (C) 1997-2004 Leo Galambos.
|
||||
Copyright (C) 2002-2004 "Egothor developers"
|
||||
on behalf of the Egothor Project.
|
||||
All rights reserved.
|
||||
|
||||
This software is copyrighted by the "Egothor developers". If this
|
||||
license applies to a single file or document, the "Egothor developers"
|
||||
are the people or entities mentioned as copyright holders in that file
|
||||
or document. If this license applies to the Egothor project as a
|
||||
whole, the copyright holders are the people or entities mentioned in
|
||||
the file CREDITS. This file can be found in the same location as this
|
||||
license in the distribution.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
1. Redistributions of source code must retain the above copyright
|
||||
notice, the list of contributors, this list of conditions, and the
|
||||
following disclaimer.
|
||||
2. Redistributions in binary form must reproduce the above copyright
|
||||
notice, the list of contributors, this list of conditions, and the
|
||||
disclaimer that follows these conditions in the documentation
|
||||
and/or other materials provided with the distribution.
|
||||
3. The name "Egothor" must not be used to endorse or promote products
|
||||
derived from this software without prior written permission. For
|
||||
written permission, please contact Leo.G@seznam.cz
|
||||
4. Products derived from this software may not be called "Egothor",
|
||||
nor may "Egothor" appear in their name, without prior written
|
||||
permission from Leo.G@seznam.cz.
|
||||
|
||||
In addition, we request that you include in the end-user documentation
|
||||
provided with the redistribution and/or in the software itself an
|
||||
acknowledgement equivalent to the following:
|
||||
"This product includes software developed by the Egothor Project.
|
||||
http://egothor.sf.net/"
|
||||
|
||||
THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
|
||||
WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
|
||||
MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
|
||||
IN NO EVENT SHALL THE EGOTHOR PROJECT OR ITS CONTRIBUTORS BE LIABLE
|
||||
FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
||||
CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
||||
SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
|
||||
BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
|
||||
WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE
|
||||
OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN
|
||||
IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
This software consists of voluntary contributions made by many
|
||||
individuals on behalf of the Egothor Project and was originally
|
||||
created by Leo Galambos (Leo.G@seznam.cz).
|
||||
*/
|
||||
package org.egothor.stemmer;
|
||||
|
||||
import java.io.DataInput;
|
||||
import java.io.DataOutput;
|
||||
import java.io.IOException;
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
|
||||
/**
|
||||
* A Trie is used to store a dictionary of words and their stems.
|
||||
* <p>
|
||||
* Actually, what is stored are words with their respective patch commands. A
|
||||
* trie can be termed forward (keys read from left to right) or backward (keys
|
||||
* read from right to left). This property will vary depending on the language
|
||||
* for which a Trie is constructed.
|
||||
*/
|
||||
public class Trie {
|
||||
List<Row> rows = new ArrayList<Row>();
|
||||
List<CharSequence> cmds = new ArrayList<CharSequence>();
|
||||
int root;
|
||||
|
||||
boolean forward = false;
|
||||
|
||||
/**
|
||||
* Constructor for the Trie object.
|
||||
*
|
||||
* @param is the input stream
|
||||
* @exception IOException if an I/O error occurs
|
||||
*/
|
||||
public Trie(DataInput is) throws IOException {
|
||||
forward = is.readBoolean();
|
||||
root = is.readInt();
|
||||
for (int i = is.readInt(); i > 0; i--) {
|
||||
cmds.add(is.readUTF());
|
||||
}
|
||||
for (int i = is.readInt(); i > 0; i--) {
|
||||
rows.add(new Row(is));
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Constructor for the Trie object.
|
||||
*
|
||||
* @param forward set to <tt>true</tt>
|
||||
*/
|
||||
public Trie(boolean forward) {
|
||||
rows.add(new Row());
|
||||
root = 0;
|
||||
this.forward = forward;
|
||||
}
|
||||
|
||||
/**
|
||||
* Constructor for the Trie object.
|
||||
*
|
||||
* @param forward <tt>true</tt> if read left to right, <tt>false</tt> if read
|
||||
* right to left
|
||||
* @param root index of the row that is the root node
|
||||
* @param cmds the patch commands to store
|
||||
* @param rows a Vector of Vectors. Each inner Vector is a node of this Trie
|
||||
*/
|
||||
public Trie(boolean forward, int root, List<CharSequence> cmds, List<Row> rows) {
|
||||
this.rows = rows;
|
||||
this.cmds = cmds;
|
||||
this.root = root;
|
||||
this.forward = forward;
|
||||
}
|
||||
|
||||
/**
|
||||
* Gets the all attribute of the Trie object
|
||||
*
|
||||
* @param key Description of the Parameter
|
||||
* @return The all value
|
||||
*/
|
||||
public CharSequence[] getAll(CharSequence key) {
|
||||
int res[] = new int[key.length()];
|
||||
int resc = 0;
|
||||
Row now = getRow(root);
|
||||
int w;
|
||||
StrEnum e = new StrEnum(key, forward);
|
||||
boolean br = false;
|
||||
|
||||
for (int i = 0; i < key.length() - 1; i++) {
|
||||
Character ch = new Character(e.next());
|
||||
w = now.getCmd(ch);
|
||||
if (w >= 0) {
|
||||
int n = w;
|
||||
for (int j = 0; j < resc; j++) {
|
||||
if (n == res[j]) {
|
||||
n = -1;
|
||||
break;
|
||||
}
|
||||
}
|
||||
if (n >= 0) {
|
||||
res[resc++] = n;
|
||||
}
|
||||
}
|
||||
w = now.getRef(ch);
|
||||
if (w >= 0) {
|
||||
now = getRow(w);
|
||||
} else {
|
||||
br = true;
|
||||
break;
|
||||
}
|
||||
}
|
||||
if (br == false) {
|
||||
w = now.getCmd(new Character(e.next()));
|
||||
if (w >= 0) {
|
||||
int n = w;
|
||||
for (int j = 0; j < resc; j++) {
|
||||
if (n == res[j]) {
|
||||
n = -1;
|
||||
break;
|
||||
}
|
||||
}
|
||||
if (n >= 0) {
|
||||
res[resc++] = n;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if (resc < 1) {
|
||||
return null;
|
||||
}
|
||||
CharSequence R[] = new CharSequence[resc];
|
||||
for (int j = 0; j < resc; j++) {
|
||||
R[j] = cmds.get(res[j]);
|
||||
}
|
||||
return R;
|
||||
}
|
||||
|
||||
/**
|
||||
* Return the number of cells in this Trie object.
|
||||
*
|
||||
* @return the number of cells
|
||||
*/
|
||||
public int getCells() {
|
||||
int size = 0;
|
||||
for (Row row : rows)
|
||||
size += row.getCells();
|
||||
return size;
|
||||
}
|
||||
|
||||
/**
|
||||
* Gets the cellsPnt attribute of the Trie object
|
||||
*
|
||||
* @return The cellsPnt value
|
||||
*/
|
||||
public int getCellsPnt() {
|
||||
int size = 0;
|
||||
for (Row row : rows)
|
||||
size += row.getCellsPnt();
|
||||
return size;
|
||||
}
|
||||
|
||||
/**
|
||||
* Gets the cellsVal attribute of the Trie object
|
||||
*
|
||||
* @return The cellsVal value
|
||||
*/
|
||||
public int getCellsVal() {
|
||||
int size = 0;
|
||||
for (Row row : rows)
|
||||
size += row.getCellsVal();
|
||||
return size;
|
||||
}
|
||||
|
||||
/**
|
||||
* Return the element that is stored in a cell associated with the given key.
|
||||
*
|
||||
* @param key the key
|
||||
* @return the associated element
|
||||
*/
|
||||
public CharSequence getFully(CharSequence key) {
|
||||
Row now = getRow(root);
|
||||
int w;
|
||||
Cell c;
|
||||
int cmd = -1;
|
||||
StrEnum e = new StrEnum(key, forward);
|
||||
Character ch = null;
|
||||
Character aux = null;
|
||||
|
||||
for (int i = 0; i < key.length();) {
|
||||
ch = new Character(e.next());
|
||||
i++;
|
||||
|
||||
c = now.at(ch);
|
||||
if (c == null) {
|
||||
return null;
|
||||
}
|
||||
|
||||
cmd = c.cmd;
|
||||
|
||||
for (int skip = c.skip; skip > 0; skip--) {
|
||||
if (i < key.length()) {
|
||||
aux = new Character(e.next());
|
||||
} else {
|
||||
return null;
|
||||
}
|
||||
i++;
|
||||
}
|
||||
|
||||
w = now.getRef(ch);
|
||||
if (w >= 0) {
|
||||
now = getRow(w);
|
||||
} else if (i < key.length()) {
|
||||
return null;
|
||||
}
|
||||
}
|
||||
return (cmd == -1) ? null : cmds.get(cmd);
|
||||
}
|
||||
|
||||
/**
|
||||
* Return the element that is stored as last on a path associated with the
|
||||
* given key.
|
||||
*
|
||||
* @param key the key associated with the desired element
|
||||
* @return the last on path element
|
||||
*/
|
||||
public CharSequence getLastOnPath(CharSequence key) {
|
||||
Row now = getRow(root);
|
||||
int w;
|
||||
CharSequence last = null;
|
||||
StrEnum e = new StrEnum(key, forward);
|
||||
|
||||
for (int i = 0; i < key.length() - 1; i++) {
|
||||
Character ch = new Character(e.next());
|
||||
w = now.getCmd(ch);
|
||||
if (w >= 0) {
|
||||
last = cmds.get(w);
|
||||
}
|
||||
w = now.getRef(ch);
|
||||
if (w >= 0) {
|
||||
now = getRow(w);
|
||||
} else {
|
||||
return last;
|
||||
}
|
||||
}
|
||||
w = now.getCmd(new Character(e.next()));
|
||||
return (w >= 0) ? cmds.get(w) : last;
|
||||
}
|
||||
|
||||
/**
|
||||
* Return the Row at the given index.
|
||||
*
|
||||
* @param index the index containing the desired Row
|
||||
* @return the Row
|
||||
*/
|
||||
private Row getRow(int index) {
|
||||
if (index < 0 || index >= rows.size()) {
|
||||
return null;
|
||||
}
|
||||
return rows.get(index);
|
||||
}
|
||||
|
||||
/**
|
||||
* Write this Trie to the given output stream.
|
||||
*
|
||||
* @param os the output stream
|
||||
* @exception IOException if an I/O error occurs
|
||||
*/
|
||||
public void store(DataOutput os) throws IOException {
|
||||
os.writeBoolean(forward);
|
||||
os.writeInt(root);
|
||||
os.writeInt(cmds.size());
|
||||
for (CharSequence cmd : cmds)
|
||||
os.writeUTF(cmd.toString());
|
||||
|
||||
os.writeInt(rows.size());
|
||||
for (Row row : rows)
|
||||
row.store(os);
|
||||
}
|
||||
|
||||
/**
|
||||
* Add the given key associated with the given patch command. If either
|
||||
* parameter is null this method will return without executing.
|
||||
*
|
||||
* @param key the key
|
||||
* @param cmd the patch command
|
||||
*/
|
||||
public void add(CharSequence key, CharSequence cmd) {
|
||||
if (key == null || cmd == null) {
|
||||
return;
|
||||
}
|
||||
if (cmd.length() == 0) {
|
||||
return;
|
||||
}
|
||||
int id_cmd = cmds.indexOf(cmd);
|
||||
if (id_cmd == -1) {
|
||||
id_cmd = cmds.size();
|
||||
cmds.add(cmd);
|
||||
}
|
||||
|
||||
int node = root;
|
||||
Row r = getRow(node);
|
||||
|
||||
StrEnum e = new StrEnum(key, forward);
|
||||
|
||||
for (int i = 0; i < e.length() - 1; i++) {
|
||||
Character ch = new Character(e.next());
|
||||
node = r.getRef(ch);
|
||||
if (node >= 0) {
|
||||
r = getRow(node);
|
||||
} else {
|
||||
node = rows.size();
|
||||
Row n;
|
||||
rows.add(n = new Row());
|
||||
r.setRef(ch, node);
|
||||
r = n;
|
||||
}
|
||||
}
|
||||
r.setCmd(new Character(e.next()), id_cmd);
|
||||
}
|
||||
|
||||
/**
|
||||
* Remove empty rows from the given Trie and return the newly reduced Trie.
|
||||
*
|
||||
* @param by the Trie to reduce
|
||||
* @return the newly reduced Trie
|
||||
*/
|
||||
public Trie reduce(Reduce by) {
|
||||
return by.optimize(this);
|
||||
}
|
||||
|
||||
public void printInfo(CharSequence prefix) {
|
||||
System.out.println(prefix + "nds " + rows.size() + " cmds " + cmds.size()
|
||||
+ " cells " + getCells() + " valcells " + getCellsVal() + " pntcells "
|
||||
+ getCellsPnt());
|
||||
}
|
||||
|
||||
/**
|
||||
* This class is part of the Egothor Project
|
||||
*/
|
||||
class StrEnum {
|
||||
CharSequence s;
|
||||
int from;
|
||||
int by;
|
||||
|
||||
/**
|
||||
* Constructor for the StrEnum object
|
||||
*
|
||||
* @param s Description of the Parameter
|
||||
* @param up Description of the Parameter
|
||||
*/
|
||||
StrEnum(CharSequence s, boolean up) {
|
||||
this.s = s;
|
||||
if (up) {
|
||||
from = 0;
|
||||
by = 1;
|
||||
} else {
|
||||
from = s.length() - 1;
|
||||
by = -1;
|
||||
}
|
||||
}
|
||||
|
||||
int length() {
|
||||
return s.length();
|
||||
}
|
||||
|
||||
char next() {
|
||||
char ch = s.charAt(from);
|
||||
from += by;
|
||||
return ch;
|
||||
}
|
||||
}
|
||||
}
|
|
@ -0,0 +1,458 @@
|
|||
<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN">
|
||||
<html>
|
||||
<head>
|
||||
<meta content="text/html; charset=UTF-8" http-equiv="content-type">
|
||||
<title>Stempel - Algorithmic Stemmer for Polish Language</title>
|
||||
<meta content="Andrzej Bialecki" name="author">
|
||||
<meta name="keywords"
|
||||
content="stemming, stemmer, algorithmic stemmer, Polish stemmer">
|
||||
<meta
|
||||
content="This page describes a software package consisting of high-quality stemming tables for Polish, and a universal algorithmic stemmer, which operates using these tables."
|
||||
name="description">
|
||||
</head>
|
||||
<body style="font-family: Arial,SansSerif;">
|
||||
<h1><i>Stempel</i> - Algorithmic Stemmer for Polish Language</h1>
|
||||
<h2>Introduction</h2>
|
||||
<p>A method for conflation of different inflected word forms is an
|
||||
important component of many Information Retrieval systems. It helps to
|
||||
improve the system's recall and can significantly reduce the index
|
||||
size. This is especially true for highly-inflectional languages like
|
||||
those from the Slavic language family (Czech, Slovak, Polish, Russian,
|
||||
Bulgarian, etc).</p>
|
||||
<p>This page describes a software package consisting of high-quality
|
||||
stemming tables for Polish, and a universal algorithmic stemmer, which
|
||||
operates using these tables. The stemmer code is taken virtually
|
||||
unchanged from the <a href="http://www.egothor.org">Egothor project</a>.</p>
|
||||
<p>The software distribution includes stemmer
|
||||
tables prepared using an extensive corpus of Polish language (see
|
||||
details below).</p>
|
||||
<p>This work is available under Apache-style Open Source license - the
|
||||
stemmer code is covered by Egothor License, the tables and other
|
||||
additions are covered by Apache License 2.0. Both licenses allow use of
|
||||
the code in Open Source as well as commercial (closed source) projects.</p>
|
||||
<h3>Terminology</h3>
|
||||
<p>A short explanation is in order about the terminology used in this
|
||||
text.</p>
|
||||
<p>In the following sections I make a distinction between <b>stem</b>
|
||||
and <b>lemma</b>.</p>
|
||||
<p>Lemma is a base grammatical form (dictionary form, headword) of a
|
||||
word. Lemma is an existing, grammatically correct word in some human
|
||||
language.</p>
|
||||
<p>Stem on the other hand is just a unique token, not necessarily
|
||||
making any sense in any human language, but which can serve as a unique
|
||||
label instead of lemma for the same set of inflected forms. Quite often
|
||||
stem is referred to as a "root" of the word - which is incorrect and
|
||||
misleading (stems sometimes have very little to do with the linguistic
|
||||
root of a word, i.e. a pattern found in a word which is common to all
|
||||
inflected forms or within a family of languages).</p>
|
||||
<p>For an IR system stems are usually sufficient, for a morphological
|
||||
analysis system obviously lemmas are a must. In practice, various
|
||||
stemmers produce a mix of stems and lemmas, as is the case with the
|
||||
stemmer described here. Additionally, for some languages, which use
|
||||
suffix-based inflection rules many stemmers based on suffix-stripping
|
||||
will produce a large percentage of stems equivalent to lemmas. This is
|
||||
however not the case for languages with complex, irregular inflection
|
||||
rules (such as Slavic languages) - here simplistic suffix-stripping
|
||||
stemmers produce very poor results.</p>
|
||||
<h3>Background</h3>
|
||||
<p>Lemmatization is a process of finding the base, non-inflected form
|
||||
of a word. The result of lemmatization is a correct existing word,
|
||||
often in nominative case for nouns and infinitive form for verbs. A
|
||||
given inflected form may correspond to several lemmas (e.g. "found"
|
||||
-> find, found) - the correct choice depends on the context.<br>
|
||||
<br>
|
||||
Stemming is concerned mostly with finding a unique "root" of a word,
|
||||
which does not necessarily result in any existing word or lemma. The
|
||||
quality of stemming is measured by the rate of collisions (overstemming
|
||||
- which causes words with different lemmas to be incorrectly conflated
|
||||
into one "root"), and the rate of superfluous word "roots"
|
||||
(understemming - which assigns several "roots" to words with the same
|
||||
lemma). <br>
|
||||
<br>
|
||||
Both stemmer and lemmatizer can be implemented in various ways. The two
|
||||
most common approaches are:<br>
|
||||
</p>
|
||||
<ul>
|
||||
<li>dictionary-based: where the stemmer uses an extensive dictionary
|
||||
of morphological forms in order to find the corresponding stem or lemma</li>
|
||||
<li>algorithmic: where the stemmer uses an algorithm, based on
|
||||
general morphological properties of a given language plus a set of
|
||||
heuristic rules<br>
|
||||
</li>
|
||||
</ul>
|
||||
There are many existing and well-known implementations of stemmers for
|
||||
English (Porter, Lovins, Krovetz) and other European languages
|
||||
(<a href="http://snowball.tartarus.org">Snowball</a>). There are also
|
||||
good quality commercial lemmatizers for Polish. However, there is only
|
||||
one
|
||||
freely available Polish stemmer, implemented by
|
||||
<a
|
||||
href="http://www.cs.put.poznan.pl/dweiss/xml/projects/lametyzator/index.xml?lang=en">Dawid
|
||||
Weiss</a>, based on the "ispell" dictionary and Jan Daciuk's <a
|
||||
href="http://www.eti.pg.gda.pl/%7Ejandac/">FSA package</a>. That
|
||||
stemmer is dictionary-based. This means that even
|
||||
though it can achieve
|
||||
perfect accuracy for previously known word forms found in its
|
||||
dictionary, it
|
||||
completely fails in case of all other word forms. This deficiency is
|
||||
somewhat mitigated by the comprehensive dictionary distributed with
|
||||
this stemmer (so there is a high probability that most of the words in
|
||||
the input text will be found in the dictionary), however the problem
|
||||
still remains (please see the page above for more detailed description).<br>
|
||||
<br>
|
||||
The implementation described here uses an algorithmic method. This
|
||||
method
|
||||
and particular algorithm implementation are described in detail in
|
||||
[1][2].
|
||||
The main advantage of algorithmic stemmers is their ability to process
|
||||
previously
|
||||
unseen word forms with high accuracy. This particular algorithm uses a
|
||||
set
|
||||
of
|
||||
transformation rules (patch commands), which describe how a word with a
|
||||
given pattern should be transformed to its stem. These rules are first
|
||||
learned from a training corpus. They don't
|
||||
cover
|
||||
all possible cases, so there is always some loss of precision/recall
|
||||
(which
|
||||
means that even the words from the training corpus are sometimes
|
||||
incorrectly stemmed).<br>
|
||||
<h2>Algorithm and implementation<span style="font-style: italic;"></span></h2>
|
||||
The algorithm and its Java implementation are described in detail in the
|
||||
publications cited below. Here's just a short excerpt from [2]:<br>
|
||||
<br>
|
||||
<center>
|
||||
<div style="width: 80%;" align="justify">"The aim is separation of the
|
||||
stemmer execution code from the data
|
||||
structures [...]. In other words, a static algorithm configurable by
|
||||
data must be developed. The word transformations that happen in the
|
||||
stemmer must be then encoded to the data tables.<br>
|
||||
<br>
|
||||
The tacit input of our method is a sample set (a so-called dictionary)
|
||||
of words (as keys) and their stems. Each record can be equivalently
|
||||
stored as a key and the record of key's transformation to its
|
||||
respective stem. The transformation record is termed a patch command
|
||||
(P-command). It must be ensured that P-commands are universal, and that
|
||||
P-commands can transform any word to its stem. Our solution[6,8] is
|
||||
based on the Levenstein metric [10], which produces P-command as the
|
||||
minimum cost path in a directed graph.<br>
|
||||
<br>
|
||||
One can imagine the P-command as an algorithm for an operator (editor)
|
||||
that rewrites a string to another string. The operator can use these
|
||||
instructions (PP-command's): <span style="font-weight: bold;">removal </span>-
|
||||
deletes a sequence of characters starting at the current cursor
|
||||
position and moves the cursor to the next character. The length of this
|
||||
sequence is the parameter; <span style="font-weight: bold;">insertion </span>-
|
||||
inserts a character ch, without moving the cursor. The character ch is
|
||||
a parameter; <span style="font-weight: bold;">substitution </span>
|
||||
- rewrites a character at the current cursor position to the character
|
||||
ch and moves the cursor to the next character. The character ch is a
|
||||
parameter; <span style="font-weight: bold;">no operation</span> (NOOP)
|
||||
- skip a sequence of characters starting at the current cursor
|
||||
position. The length of this sequence is the parameter.<br>
|
||||
<br>
|
||||
The P-commands are applied from the end of a word (right to left). This
|
||||
assumption can reduce the set of P-command's, because the last NOOP,
|
||||
moving the cursor to the end of a string without any changes, need not
|
||||
be stored."</div>
|
||||
</center>
|
||||
<br>
|
||||
Data structure used to keep the dictionary (words and their P-commands)
|
||||
is a trie. Several optimization steps are applied in turn to reduce and
|
||||
optimize the initial trie, by eliminating useless information and
|
||||
shortening the paths in the trie.<br>
|
||||
<br>
|
||||
Finally, in order to obtain a stem from the input word, the word is
|
||||
passed once through a matching path in the trie (applying at each node
|
||||
the P-commands stored there). The result is a word stem.<br>
|
||||
<h2>Corpus</h2>
|
||||
<p><i>(to be completed...)</i></p>
|
||||
<p>The following Polish corpora have been used:</p>
|
||||
<ul>
|
||||
<li><a
|
||||
href="http://sourceforge.net/project/showfiles.php?group_id=49316&amp;package_id=65354">Polish
|
||||
dictionary
|
||||
from ispell distribution</a></li>
|
||||
<li><a href="http://www.mimuw.edu.pl/polszczyzna/">Wzbogacony korpus
|
||||
słownika frekwencyjnego</a></li>
|
||||
<!--<li><a href="http://www.korpus.pl">Korpus IPI PAN</a></li>-->
|
||||
<!--<li>The Bible (so called "Warsaw Bible" or "Brytyjka")</li>--><li>The
|
||||
Bible (so called "Tysiąclecia") - unauthorized electronic version</li>
|
||||
<li><a
|
||||
href="http://www.mimuw.edu.pl/polszczyzna/Debian/sam34_3.4a.02-1_i386.deb">Analizator
|
||||
morfologiczny SAM v. 3.4</a> - this was used to recover lemmas
|
||||
missing from other texts</li>
|
||||
</ul>
|
||||
<p>This step was the most time-consuming - and it would probably be
|
||||
even more tedious and difficult if not for the
|
||||
help of
|
||||
<a href="http://www.python.org/">Python</a>. The source texts had to be
|
||||
brought to a common encoding (UTF-8) - some of them used quite ancient
|
||||
encodings like Mazovia or DHN - and then scripts were written to
|
||||
collect all lemmas and
|
||||
inflected forms from the source texts. In cases when the source text
|
||||
was not
|
||||
tagged,
|
||||
I used the SAM analyzer to produce lemmas. In cases of ambiguous
|
||||
lemmatization I decided to put references to inflected forms from all
|
||||
base forms.<br>
|
||||
</p>
|
||||
<p>All grammatical categories were allowed to appear in the corpus,
|
||||
i.e. nouns, verbs, adjectives, numerals, and pronouns. The resulting
|
||||
corpus consisted of roughly 87,000+ inflection sets, i.e. each set
|
||||
consisted of one base form (lemma) and many inflected forms. However,
|
||||
because of the nature of the training method I restricted these sets to
|
||||
include only those where there were at least 4 inflected forms. Sets
|
||||
with 3 or fewer inflected forms were removed, so that the final corpus
|
||||
consisted of ~69,000 unique sets, which in turn contained ~1.5 mln
|
||||
inflected forms. <br>
|
||||
</p>
|
||||
<h2>Testing</h2>
|
||||
<p>I tested the stemmer tables produced using the implementation
|
||||
described above. The following sections give some details about
|
||||
the testing setup.
|
||||
</p>
|
||||
<h3>Testing procedure</h3>
|
||||
<p>The testing procedure was as follows:
|
||||
</p>
|
||||
<ul>
|
||||
<li>the whole corpus of ~69,000 unique sets was shuffled, so that the
|
||||
input sets were in random order.</li>
|
||||
<li>the corpus was split into two parts - one with 30,000 sets (Part
|
||||
1), the other with ~39,000 sets (Part 2).</li>
|
||||
<li>Training samples were drawn in sequential order from Part 1.
|
||||
Since the sets were already randomized, the training samples were also
|
||||
randomized, but this procedure ensured that each larger training sample
|
||||
contained all smaller samples.</li>
|
||||
<li>Part 2 was used for testing. Note: this means that the testing
|
||||
run used <em>only</em> words previously unseen during the training
|
||||
phase. This is the worst-case scenario, because it means that the stemmer must
|
||||
extrapolate the learned rules to unknown cases. This also means that in
|
||||
a real-life case (where the input is a mix between known and unknown
|
||||
words) the F-measure of the stemmer will be even higher than in the
|
||||
table below.</li>
|
||||
</ul>
|
||||
<h3>Test results</h3>
|
||||
<p>The following table summarizes test results for varying sizes
|
||||
of training samples. The meaning of the table columns is
|
||||
described below:
|
||||
</p>
|
||||
<ul>
|
||||
<li><b>training sets:</b> the number of training sets. One set
|
||||
consists of one lemma and at least 4 and up to ~80 inflected forms
|
||||
(including pre- and suffixed forms).</li>
|
||||
<li><b>testing forms:</b> the number of testing forms. Only inflected
|
||||
forms were used in testing.</li>
|
||||
<li><b>stem OK:</b> the number of cases when produced output was a
|
||||
correct (unique) stem. Note: quite often correct stems were also
|
||||
correct lemmas.</li>
|
||||
<li><b>lemma OK:</b> the number of cases when produced output was a
|
||||
correct lemma.</li>
|
||||
<li><b>missing:</b> the number of cases when the stemmer was unable to
|
||||
provide any output.</li>
|
||||
<li><b>stem bad:</b> the number of cases when produced output was a
|
||||
stem, but already in use identifying a different set.</li>
|
||||
<li><b>lemma bad:</b> the number of cases when produced output was an
|
||||
incorrect lemma. Note: quite often in such cases the output was a
|
||||
correct stem.</li>
|
||||
<li><b>table size:</b> the size in bytes of the stemmer table.</li>
|
||||
</ul>
|
||||
<div align="center">
|
||||
<table border="1" cellpadding="2" cellspacing="0">
|
||||
<tbody>
|
||||
<tr bgcolor="#a0b0c0">
|
||||
<th>Training sets</th>
|
||||
<th>Testing forms</th>
|
||||
<th>Stem OK</th>
|
||||
<th>Lemma OK</th>
|
||||
<th>Missing</th>
|
||||
<th>Stem Bad</th>
|
||||
<th>Lemma Bad</th>
|
||||
<th>Table size [B]</th>
|
||||
</tr>
|
||||
<tr align="right">
|
||||
<td>100</td>
|
||||
<td>1022985</td>
|
||||
<td>842209</td>
|
||||
<td>593632</td>
|
||||
<td>172711</td>
|
||||
<td>22331</td>
|
||||
<td>256642</td>
|
||||
<td>28438</td>
|
||||
</tr>
|
||||
<tr align="right">
|
||||
<td>200</td>
|
||||
<td>1022985</td>
|
||||
<td>862789</td>
|
||||
<td>646488</td>
|
||||
<td>153288</td>
|
||||
<td>16306</td>
|
||||
<td>223209</td>
|
||||
<td>48660</td>
|
||||
</tr>
|
||||
<tr align="right">
|
||||
<td>500</td>
|
||||
<td>1022985</td>
|
||||
<td>885786</td>
|
||||
<td>685009</td>
|
||||
<td>130772</td>
|
||||
<td>14856</td>
|
||||
<td>207204</td>
|
||||
<td>108798</td>
|
||||
</tr>
|
||||
<tr align="right">
|
||||
<td>700</td>
|
||||
<td>1022985</td>
|
||||
<td>909031</td>
|
||||
<td>704609</td>
|
||||
<td>107084</td>
|
||||
<td>15442</td>
|
||||
<td>211292</td>
|
||||
<td>139291</td>
|
||||
</tr>
|
||||
<tr align="right">
|
||||
<td>1000</td>
|
||||
<td>1022985</td>
|
||||
<td>926079</td>
|
||||
<td>725720</td>
|
||||
<td>90117</td>
|
||||
<td>14941</td>
|
||||
<td>207148</td>
|
||||
<td>183677</td>
|
||||
</tr>
|
||||
<tr align="right">
|
||||
<td>2000</td>
|
||||
<td>1022985</td>
|
||||
<td>942886</td>
|
||||
<td>746641</td>
|
||||
<td>73429</td>
|
||||
<td>14903</td>
|
||||
<td>202915</td>
|
||||
<td>313516</td>
|
||||
</tr>
|
||||
<tr align="right">
|
||||
<td>5000</td>
|
||||
<td>1022985</td>
|
||||
<td>954721</td>
|
||||
<td>759930</td>
|
||||
<td>61476</td>
|
||||
<td>14817</td>
|
||||
<td>201579</td>
|
||||
<td>640969</td>
|
||||
</tr>
|
||||
<tr align="right">
|
||||
<td>7000</td>
|
||||
<td>1022985</td>
|
||||
<td>956165</td>
|
||||
<td>764033</td>
|
||||
<td>60364</td>
|
||||
<td>14620</td>
|
||||
<td>198588</td>
|
||||
<td>839347</td>
|
||||
</tr>
|
||||
<tr align="right">
|
||||
<td>10000</td>
|
||||
<td>1022985</td>
|
||||
<td>965427</td>
|
||||
<td>775507</td>
|
||||
<td>50797</td>
|
||||
<td>14662</td>
|
||||
<td>196681</td>
|
||||
<td>1144537</td>
|
||||
</tr>
|
||||
<tr align="right">
|
||||
<td>12000</td>
|
||||
<td>1022985</td>
|
||||
<td>967664</td>
|
||||
<td>782143</td>
|
||||
<td>48722</td>
|
||||
<td>14284</td>
|
||||
<td>192120</td>
|
||||
<td>1313508</td>
|
||||
</tr>
|
||||
<tr align="right">
|
||||
<td>15000</td>
|
||||
<td>1022985</td>
|
||||
<td>973188</td>
|
||||
<td>788867</td>
|
||||
<td>43247</td>
|
||||
<td>14349</td>
|
||||
<td>190871</td>
|
||||
<td>1567902</td>
|
||||
</tr>
|
||||
<tr align="right">
|
||||
<td>17000</td>
|
||||
<td>1022985</td>
|
||||
<td>974203</td>
|
||||
<td>791804</td>
|
||||
<td>42319</td>
|
||||
<td>14333</td>
|
||||
<td>188862</td>
|
||||
<td>1733957</td>
|
||||
</tr>
|
||||
<tr align="right">
|
||||
<td>20000</td>
|
||||
<td>1022985</td>
|
||||
<td>976234</td>
|
||||
<td>791554</td>
|
||||
<td>40058</td>
|
||||
<td>14601</td>
|
||||
<td>191373</td>
|
||||
<td>1977615</td>
|
||||
</tr>
|
||||
</tbody>
|
||||
</table>
|
||||
</div>
|
||||
<p>I also measured the time to produce a stem (which involves
|
||||
traversing a trie,
|
||||
retrieving a patch command and applying the patch command to the input
|
||||
string).
|
||||
On a machine running Windows XP (Pentium 4, 1.7 GHz, JDK 1.4.2_03
|
||||
HotSpot),
|
||||
for tables ranging in size from 1,000 to 20,000 cells, the time to
|
||||
produce a
|
||||
single stem varies between 5-10 microseconds.<br>
|
||||
</p>
|
||||
<p>This means that the stemmer can process up to <span
|
||||
style="font-weight: bold;">200,000 words per second</span>, an
|
||||
outstanding result when compared to other stemmers (Morfeusz - ~2,000
|
||||
w/s, FormAN (MS Word analyzer) - ~1,000 w/s).<br>
|
||||
</p>
|
||||
<p>The package contains a class <code>org.getopt.stempel.Benchmark</code>,
|
||||
which you can use to produce reports
|
||||
like the one below:<br>
|
||||
</p>
|
||||
<pre>--------- Stemmer benchmark report: -----------<br>Stemmer table: /res/tables/stemmer_2000.out<br>Input file: ../test3.txt<br>Number of runs: 3<br><br> RUN NUMBER: 1 2 3<br> Total input words 1378176 1378176 1378176<br> Missed output words 112 112 112<br> Time elapsed [ms] 6989 6940 6640<br> Hit rate percent 99.99% 99.99% 99.99%<br> Miss rate percent 00.01% 00.01% 00.01%<br> Words per second 197192 198584 207557<br> Time per word [us] 5.07 5.04 4.82<br></pre>
|
||||
<h2>Summary</h2>
|
||||
<p>The results of these tests are very encouraging. It seems that using
|
||||
the
|
||||
training corpus and the stemming algorithm described above results in a
|
||||
high-quality stemmer useful for most applications. Moreover, it can
|
||||
also
|
||||
be used as a better than average lemmatizer.</p>
|
||||
<p>Both the author of the implementation
|
||||
(Leo Galambos, &lt;leo.galambos AT egothor DOT org&gt;) and the author
|
||||
of this
|
||||
compilation (Andrzej Bialecki &lt;ab AT getopt DOT org&gt;) would
|
||||
appreciate any
|
||||
feedback and suggestions for further improvements.</p>
|
||||
<h2>Bibliography</h2>
|
||||
<ol>
|
||||
<li>Galambos, L.: Multilingual Stemmer in Web Environment, PhD
|
||||
Thesis,
|
||||
Faculty of Mathematics and Physics, Charles University in Prague, in
|
||||
press.</li>
|
||||
<li>Galambos, L.: Semi-automatic Stemmer Evaluation. International
|
||||
Intelligent Information Processing and Web Mining Conference, 2004,
|
||||
Zakopane, Poland.</li>
|
||||
<li>Galambos, L.: Lemmatizer for Document Information Retrieval
|
||||
Systems in JAVA.<span style="text-decoration: underline;"> </span><a
|
||||
class="moz-txt-link-rfc2396E"
|
||||
href="http://www.informatik.uni-trier.de/%7Eley/db/conf/sofsem/sofsem2001.html#Galambos01">&lt;http://www.informatik.uni-trier.de/%7Eley/db/conf/sofsem/sofsem2001.html#Galambos01&gt;</a>
|
||||
SOFSEM 2001, Piestany, Slovakia. <br>
|
||||
</li>
|
||||
</ol>
|
||||
<br>
|
||||
<br>
|
||||
</body>
|
||||
</html>
|
Binary file not shown.
|
@ -0,0 +1,186 @@
|
|||
# This file was created from the carrot2 project and is distributed under the BSD license.
|
||||
# See http://project.carrot2.org/license.html
|
||||
# Also see http://www.opensource.org/licenses/bsd-license.html
|
||||
# From trunk/core/carrot2-util-text/src-resources/stopwords.pl
|
||||
vol
|
||||
o.o.
|
||||
mgr
|
||||
godz
|
||||
zł
|
||||
www
|
||||
pl
|
||||
ul
|
||||
tel
|
||||
hab
|
||||
prof
|
||||
inż
|
||||
dr
|
||||
i
|
||||
u
|
||||
aby
|
||||
albo
|
||||
ale
|
||||
ani
|
||||
aż
|
||||
bardzo
|
||||
bez
|
||||
bo
|
||||
bowiem
|
||||
by
|
||||
byli
|
||||
bym
|
||||
był
|
||||
była
|
||||
było
|
||||
były
|
||||
być
|
||||
będzie
|
||||
będą
|
||||
chce
|
||||
choć
|
||||
co
|
||||
coraz
|
||||
coś
|
||||
czy
|
||||
czyli
|
||||
często
|
||||
dla
|
||||
do
|
||||
gdy
|
||||
gdyby
|
||||
gdyż
|
||||
gdzie
|
||||
go
|
||||
ich
|
||||
im
|
||||
inne
|
||||
iż
|
||||
ja
|
||||
jak
|
||||
jakie
|
||||
jako
|
||||
je
|
||||
jednak
|
||||
jednym
|
||||
jedynie
|
||||
jego
|
||||
jej
|
||||
jest
|
||||
jeszcze
|
||||
jeśli
|
||||
jeżeli
|
||||
już
|
||||
ją
|
||||
kiedy
|
||||
kilku
|
||||
kto
|
||||
która
|
||||
które
|
||||
którego
|
||||
której
|
||||
który
|
||||
których
|
||||
którym
|
||||
którzy
|
||||
lat
|
||||
lecz
|
||||
lub
|
||||
ma
|
||||
mają
|
||||
mamy
|
||||
mi
|
||||
miał
|
||||
mimo
|
||||
mnie
|
||||
mogą
|
||||
może
|
||||
można
|
||||
mu
|
||||
musi
|
||||
na
|
||||
nad
|
||||
nam
|
||||
nas
|
||||
nawet
|
||||
nic
|
||||
nich
|
||||
nie
|
||||
niej
|
||||
nim
|
||||
niż
|
||||
no
|
||||
nowe
|
||||
np
|
||||
nr
|
||||
o
|
||||
od
|
||||
ok
|
||||
on
|
||||
one
|
||||
oraz
|
||||
pan
|
||||
po
|
||||
pod
|
||||
ponad
|
||||
ponieważ
|
||||
poza
|
||||
przed
|
||||
przede
|
||||
przez
|
||||
przy
|
||||
raz
|
||||
razie
|
||||
roku
|
||||
również
|
||||
się
|
||||
sobie
|
||||
sposób
|
||||
swoje
|
||||
są
|
||||
ta
|
||||
tak
|
||||
takich
|
||||
takie
|
||||
także
|
||||
tam
|
||||
te
|
||||
tego
|
||||
tej
|
||||
temu
|
||||
ten
|
||||
teraz
|
||||
też
|
||||
to
|
||||
trzeba
|
||||
tu
|
||||
tych
|
||||
tylko
|
||||
tym
|
||||
tys
|
||||
tzw
|
||||
tę
|
||||
w
|
||||
we
|
||||
wie
|
||||
więc
|
||||
wszystko
|
||||
wśród
|
||||
właśnie
|
||||
z
|
||||
za
|
||||
zaś
|
||||
ze
|
||||
że
|
||||
żeby
|
||||
ii
|
||||
iii
|
||||
iv
|
||||
vi
|
||||
vii
|
||||
viii
|
||||
ix
|
||||
xi
|
||||
xii
|
||||
xiii
|
||||
xiv
|
||||
xv
|
|
@ -0,0 +1,53 @@
|
|||
package org.apache.lucene.analysis.pl;
|
||||
|
||||
/**
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
import java.io.IOException;
|
||||
import java.util.HashSet;
|
||||
import java.util.Set;
|
||||
|
||||
import org.apache.lucene.analysis.Analyzer;
|
||||
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
|
||||
|
||||
public class TestPolishAnalyzer extends BaseTokenStreamTestCase {
|
||||
/** This test fails with NPE when the
|
||||
* stopwords file is missing in classpath */
|
||||
public void testResourcesAvailable() {
|
||||
new PolishAnalyzer(TEST_VERSION_CURRENT);
|
||||
}
|
||||
|
||||
/** test stopwords and stemming */
|
||||
public void testBasics() throws IOException {
|
||||
Analyzer a = new PolishAnalyzer(TEST_VERSION_CURRENT);
|
||||
// stemming
|
||||
checkOneTermReuse(a, "studenta", "student");
|
||||
checkOneTermReuse(a, "studenci", "student");
|
||||
// stopword
|
||||
assertAnalyzesTo(a, "był", new String[] {});
|
||||
}
|
||||
|
||||
/** test use of exclusion set */
|
||||
public void testExclude() throws IOException {
|
||||
Set<String> exclusionSet = new HashSet<String>();
|
||||
exclusionSet.add("studenta");
|
||||
Analyzer a = new PolishAnalyzer(TEST_VERSION_CURRENT,
|
||||
PolishAnalyzer.getDefaultStopSet(), exclusionSet);
|
||||
checkOneTermReuse(a, "studenta", "studenta");
|
||||
checkOneTermReuse(a, "studenci", "student");
|
||||
}
|
||||
}
|
|
@ -0,0 +1,153 @@
|
|||
package org.egothor.stemmer;
|
||||
|
||||
/*
|
||||
Egothor Software License version 1.00
|
||||
Copyright (C) 1997-2004 Leo Galambos.
|
||||
Copyright (C) 2002-2004 "Egothor developers"
|
||||
on behalf of the Egothor Project.
|
||||
All rights reserved.
|
||||
|
||||
This software is copyrighted by the "Egothor developers". If this
|
||||
license applies to a single file or document, the "Egothor developers"
|
||||
are the people or entities mentioned as copyright holders in that file
|
||||
or document. If this license applies to the Egothor project as a
|
||||
whole, the copyright holders are the people or entities mentioned in
|
||||
the file CREDITS. This file can be found in the same location as this
|
||||
license in the distribution.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
1. Redistributions of source code must retain the above copyright
|
||||
notice, the list of contributors, this list of conditions, and the
|
||||
following disclaimer.
|
||||
2. Redistributions in binary form must reproduce the above copyright
|
||||
notice, the list of contributors, this list of conditions, and the
|
||||
disclaimer that follows these conditions in the documentation
|
||||
and/or other materials provided with the distribution.
|
||||
3. The name "Egothor" must not be used to endorse or promote products
|
||||
derived from this software without prior written permission. For
|
||||
written permission, please contact Leo.G@seznam.cz
|
||||
4. Products derived from this software may not be called "Egothor",
|
||||
nor may "Egothor" appear in their name, without prior written
|
||||
permission from Leo.G@seznam.cz.
|
||||
|
||||
In addition, we request that you include in the end-user documentation
|
||||
provided with the redistribution and/or in the software itself an
|
||||
acknowledgement equivalent to the following:
|
||||
"This product includes software developed by the Egothor Project.
|
||||
http://egothor.sf.net/"
|
||||
|
||||
THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
|
||||
WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
|
||||
MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
|
||||
IN NO EVENT SHALL THE EGOTHOR PROJECT OR ITS CONTRIBUTORS BE LIABLE
|
||||
FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
||||
CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
||||
SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
|
||||
BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
|
||||
WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE
|
||||
OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN
|
||||
IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
This software consists of voluntary contributions made by many
|
||||
individuals on behalf of the Egothor Project and was originally
|
||||
created by Leo Galambos (Leo.G@seznam.cz).
|
||||
*/
|
||||
|
||||
import java.io.BufferedInputStream;
|
||||
import java.io.BufferedReader;
|
||||
import java.io.DataInputStream;
|
||||
import java.io.File;
|
||||
import java.io.FileInputStream;
|
||||
import java.io.FileReader;
|
||||
import java.io.IOException;
|
||||
import java.io.LineNumberReader;
|
||||
import java.net.URI;
|
||||
import java.util.StringTokenizer;
|
||||
|
||||
import org.apache.lucene.util.LuceneTestCase;
|
||||
|
||||
public class TestCompile extends LuceneTestCase {
|
||||
|
||||
public void testCompile() throws Exception {
|
||||
URI uri = getClass().getResource("testRules.txt").toURI();
|
||||
String path = uri.getPath();
|
||||
Compile.main(new String[] {"test", path});
|
||||
String compiled = path + ".out";
|
||||
Trie trie = loadTrie(compiled);
|
||||
assertTrie(trie, path, true, true);
|
||||
assertTrie(trie, path, false, true);
|
||||
new File(compiled).delete();
|
||||
}
|
||||
|
||||
public void testCompileBackwards() throws Exception {
|
||||
URI uri = getClass().getResource("testRules.txt").toURI();
|
||||
String path = uri.getPath();
|
||||
Compile.main(new String[] {"-test", path});
|
||||
String compiled = path + ".out";
|
||||
Trie trie = loadTrie(compiled);
|
||||
assertTrie(trie, path, true, true);
|
||||
assertTrie(trie, path, false, true);
|
||||
new File(compiled).delete();
|
||||
}
|
||||
|
||||
public void testCompileMulti() throws Exception {
|
||||
URI uri = getClass().getResource("testRules.txt").toURI();
|
||||
String path = uri.getPath();
|
||||
Compile.main(new String[] {"Mtest", path});
|
||||
String compiled = path + ".out";
|
||||
Trie trie = loadTrie(compiled);
|
||||
assertTrie(trie, path, true, true);
|
||||
assertTrie(trie, path, false, true);
|
||||
new File(compiled).delete();
|
||||
}
|
||||
|
||||
static Trie loadTrie(String path) throws IOException {
|
||||
Trie trie;
|
||||
DataInputStream is = new DataInputStream(new BufferedInputStream(
|
||||
new FileInputStream(path)));
|
||||
String method = is.readUTF().toUpperCase();
|
||||
if (method.indexOf('M') < 0) {
|
||||
trie = new Trie(is);
|
||||
} else {
|
||||
trie = new MultiTrie(is);
|
||||
}
|
||||
is.close();
|
||||
return trie;
|
||||
}
|
||||
|
||||
private static void assertTrie(Trie trie, String file, boolean usefull,
|
||||
boolean storeorig) throws Exception {
|
||||
LineNumberReader in = new LineNumberReader(new BufferedReader(
|
||||
new FileReader(file)));
|
||||
|
||||
for (String line = in.readLine(); line != null; line = in.readLine()) {
|
||||
try {
|
||||
line = line.toLowerCase();
|
||||
StringTokenizer st = new StringTokenizer(line);
|
||||
String stem = st.nextToken();
|
||||
if (storeorig) {
|
||||
CharSequence cmd = (usefull) ? trie.getFully(stem) : trie
|
||||
.getLastOnPath(stem);
|
||||
StringBuilder stm = new StringBuilder(stem);
|
||||
Diff.apply(stm, cmd);
|
||||
assertEquals(stem.toLowerCase(), stm.toString().toLowerCase());
|
||||
}
|
||||
while (st.hasMoreTokens()) {
|
||||
String token = st.nextToken();
|
||||
if (token.equals(stem)) {
|
||||
continue;
|
||||
}
|
||||
CharSequence cmd = (usefull) ? trie.getFully(token) : trie
|
||||
.getLastOnPath(token);
|
||||
StringBuilder stm = new StringBuilder(token);
|
||||
Diff.apply(stm, cmd);
|
||||
assertEquals(stem.toLowerCase(), stm.toString().toLowerCase());
|
||||
}
|
||||
} catch (java.util.NoSuchElementException x) {
|
||||
// no base token (stem) on a line
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
|
@ -0,0 +1,168 @@
|
|||
package org.egothor.stemmer;
|
||||
|
||||
/*
|
||||
Egothor Software License version 1.00
|
||||
Copyright (C) 1997-2004 Leo Galambos.
|
||||
Copyright (C) 2002-2004 "Egothor developers"
|
||||
on behalf of the Egothor Project.
|
||||
All rights reserved.
|
||||
|
||||
This software is copyrighted by the "Egothor developers". If this
|
||||
license applies to a single file or document, the "Egothor developers"
|
||||
are the people or entities mentioned as copyright holders in that file
|
||||
or document. If this license applies to the Egothor project as a
|
||||
whole, the copyright holders are the people or entities mentioned in
|
||||
the file CREDITS. This file can be found in the same location as this
|
||||
license in the distribution.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
1. Redistributions of source code must retain the above copyright
|
||||
notice, the list of contributors, this list of conditions, and the
|
||||
following disclaimer.
|
||||
2. Redistributions in binary form must reproduce the above copyright
|
||||
notice, the list of contributors, this list of conditions, and the
|
||||
disclaimer that follows these conditions in the documentation
|
||||
and/or other materials provided with the distribution.
|
||||
3. The name "Egothor" must not be used to endorse or promote products
|
||||
derived from this software without prior written permission. For
|
||||
written permission, please contact Leo.G@seznam.cz
|
||||
4. Products derived from this software may not be called "Egothor",
|
||||
nor may "Egothor" appear in their name, without prior written
|
||||
permission from Leo.G@seznam.cz.
|
||||
|
||||
In addition, we request that you include in the end-user documentation
|
||||
provided with the redistribution and/or in the software itself an
|
||||
acknowledgement equivalent to the following:
|
||||
"This product includes software developed by the Egothor Project.
|
||||
http://egothor.sf.net/"
|
||||
|
||||
THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
|
||||
WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
|
||||
MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
|
||||
IN NO EVENT SHALL THE EGOTHOR PROJECT OR ITS CONTRIBUTORS BE LIABLE
|
||||
FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
||||
CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
||||
SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
|
||||
BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
|
||||
WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE
|
||||
OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN
|
||||
IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
This software consists of voluntary contributions made by many
|
||||
individuals on behalf of the Egothor Project and was originally
|
||||
created by Leo Galambos (Leo.G@seznam.cz).
|
||||
*/
|
||||
|
||||
import org.apache.lucene.util.LuceneTestCase;
|
||||
|
||||
public class TestStemmer extends LuceneTestCase {
|
||||
|
||||
public void testTrie() {
|
||||
Trie t = new Trie(true);
|
||||
|
||||
String keys[] = {"a", "ba", "bb", "c"};
|
||||
String vals[] = {"1", "2", "2", "4"};
|
||||
|
||||
for (int i = 0; i < keys.length; i++) {
|
||||
t.add(keys[i], vals[i]);
|
||||
}
|
||||
|
||||
assertEquals(0, t.root);
|
||||
assertEquals(2, t.rows.size());
|
||||
assertEquals(3, t.cmds.size());
|
||||
assertTrieContents(t, keys, vals);
|
||||
}
|
||||
|
||||
public void testTrieBackwards() {
|
||||
Trie t = new Trie(false);
|
||||
|
||||
String keys[] = {"a", "ba", "bb", "c"};
|
||||
String vals[] = {"1", "2", "2", "4"};
|
||||
|
||||
for (int i = 0; i < keys.length; i++) {
|
||||
t.add(keys[i], vals[i]);
|
||||
}
|
||||
|
||||
assertTrieContents(t, keys, vals);
|
||||
}
|
||||
|
||||
public void testMultiTrie() {
|
||||
Trie t = new MultiTrie(true);
|
||||
|
||||
String keys[] = {"a", "ba", "bb", "c"};
|
||||
String vals[] = {"1", "2", "2", "4"};
|
||||
|
||||
for (int i = 0; i < keys.length; i++) {
|
||||
t.add(keys[i], vals[i]);
|
||||
}
|
||||
|
||||
assertTrieContents(t, keys, vals);
|
||||
}
|
||||
|
||||
public void testMultiTrieBackwards() {
|
||||
Trie t = new MultiTrie(false);
|
||||
|
||||
String keys[] = {"a", "ba", "bb", "c"};
|
||||
String vals[] = {"1", "2", "2", "4"};
|
||||
|
||||
for (int i = 0; i < keys.length; i++) {
|
||||
t.add(keys[i], vals[i]);
|
||||
}
|
||||
|
||||
assertTrieContents(t, keys, vals);
|
||||
}
|
||||
|
||||
public void testMultiTrie2() {
|
||||
Trie t = new MultiTrie2(true);
|
||||
|
||||
String keys[] = {"a", "ba", "bb", "c"};
|
||||
/*
|
||||
* short vals won't work, see line 155 for example
|
||||
* the IOOBE is caught (wierd), but shouldnt affect patch cmds?
|
||||
*/
|
||||
String vals[] = {"1111", "2222", "2223", "4444"};
|
||||
|
||||
for (int i = 0; i < keys.length; i++) {
|
||||
t.add(keys[i], vals[i]);
|
||||
}
|
||||
|
||||
assertTrieContents(t, keys, vals);
|
||||
}
|
||||
|
||||
public void testMultiTrie2Backwards() {
|
||||
Trie t = new MultiTrie2(false);
|
||||
|
||||
String keys[] = {"a", "ba", "bb", "c"};
|
||||
/*
|
||||
* short vals won't work, see line 155 for example
|
||||
* the IOOBE is caught (wierd), but shouldnt affect patch cmds?
|
||||
*/
|
||||
String vals[] = {"1111", "2222", "2223", "4444"};
|
||||
|
||||
for (int i = 0; i < keys.length; i++) {
|
||||
t.add(keys[i], vals[i]);
|
||||
}
|
||||
|
||||
assertTrieContents(t, keys, vals);
|
||||
}
|
||||
|
||||
private static void assertTrieContents(Trie trie, String keys[], String vals[]) {
|
||||
Trie[] tries = new Trie[] {
|
||||
trie,
|
||||
trie.reduce(new Optimizer()),
|
||||
trie.reduce(new Optimizer2()),
|
||||
trie.reduce(new Gener()),
|
||||
trie.reduce(new Lift(true)),
|
||||
trie.reduce(new Lift(false))
|
||||
};
|
||||
|
||||
for (Trie t : tries) {
|
||||
for (int i = 0; i < keys.length; i++) {
|
||||
assertEquals(vals[i], t.getFully(keys[i]).toString());
|
||||
assertEquals(vals[i], t.getLastOnPath(keys[i]).toString());
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
|
@ -0,0 +1,4 @@
|
|||
act acted acting actor
|
||||
walk walked walking
|
||||
wander wandered wanderer
|
||||
want wanted wanting
|
|
@ -141,6 +141,9 @@ document.write("Last Published: " + document.lastModified);
|
|||
<a href="api/contrib-smartcn/index.html">Smart Chinese Analyzer</a>
|
||||
</div>
|
||||
<div class="menuitem">
|
||||
<a href="api/contrib-stempel/index.html">Stempel Polish Analyzer</a>
|
||||
</div>
|
||||
<div class="menuitem">
|
||||
<a href="api/contrib-ant/index.html">Ant</a>
|
||||
</div>
|
||||
<div class="menuitem">
|
||||
|
|
|
@ -141,6 +141,9 @@ document.write("Last Published: " + document.lastModified);
|
|||
<a href="api/contrib-smartcn/index.html">Smart Chinese Analyzer</a>
|
||||
</div>
|
||||
<div class="menuitem">
|
||||
<a href="api/contrib-stempel/index.html">Stempel Polish Analyzer</a>
|
||||
</div>
|
||||
<div class="menuitem">
|
||||
<a href="api/contrib-ant/index.html">Ant</a>
|
||||
</div>
|
||||
<div class="menuitem">
|
||||
|
|
|
@ -141,6 +141,9 @@ document.write("Last Published: " + document.lastModified);
|
|||
<a href="api/contrib-smartcn/index.html">Smart Chinese Analyzer</a>
|
||||
</div>
|
||||
<div class="menuitem">
|
||||
<a href="api/contrib-stempel/index.html">Stempel Polish Analyzer</a>
|
||||
</div>
|
||||
<div class="menuitem">
|
||||
<a href="api/contrib-ant/index.html">Ant</a>
|
||||
</div>
|
||||
<div class="menuitem">
|
||||
|
|
|
@ -141,6 +141,9 @@ document.write("Last Published: " + document.lastModified);
|
|||
<a href="api/contrib-smartcn/index.html">Smart Chinese Analyzer</a>
|
||||
</div>
|
||||
<div class="menuitem">
|
||||
<a href="api/contrib-stempel/index.html">Stempel Polish Analyzer</a>
|
||||
</div>
|
||||
<div class="menuitem">
|
||||
<a href="api/contrib-ant/index.html">Ant</a>
|
||||
</div>
|
||||
<div class="menuitem">
|
||||
|
|
|
@ -141,6 +141,9 @@ document.write("Last Published: " + document.lastModified);
|
|||
<a href="api/contrib-smartcn/index.html">Smart Chinese Analyzer</a>
|
||||
</div>
|
||||
<div class="menuitem">
|
||||
<a href="api/contrib-stempel/index.html">Stempel Polish Analyzer</a>
|
||||
</div>
|
||||
<div class="menuitem">
|
||||
<a href="api/contrib-ant/index.html">Ant</a>
|
||||
</div>
|
||||
<div class="menuitem">
|
||||
|
|
|
@ -141,6 +141,9 @@ document.write("Last Published: " + document.lastModified);
|
|||
<a href="api/contrib-smartcn/index.html">Smart Chinese Analyzer</a>
|
||||
</div>
|
||||
<div class="menuitem">
|
||||
<a href="api/contrib-stempel/index.html">Stempel Polish Analyzer</a>
|
||||
</div>
|
||||
<div class="menuitem">
|
||||
<a href="api/contrib-ant/index.html">Ant</a>
|
||||
</div>
|
||||
<div class="menuitem">
|
||||
|
|
|
@ -141,6 +141,9 @@ document.write("Last Published: " + document.lastModified);
|
|||
<a href="api/contrib-smartcn/index.html">Smart Chinese Analyzer</a>
|
||||
</div>
|
||||
<div class="menuitem">
|
||||
<a href="api/contrib-stempel/index.html">Stempel Polish Analyzer</a>
|
||||
</div>
|
||||
<div class="menuitem">
|
||||
<a href="api/contrib-ant/index.html">Ant</a>
|
||||
</div>
|
||||
<div class="menuitem">
|
||||
|
|
|
@ -139,6 +139,9 @@ document.write("Last Published: " + document.lastModified);
|
|||
<a href="api/contrib-smartcn/index.html">Smart Chinese Analyzer</a>
|
||||
</div>
|
||||
<div class="menuitem">
|
||||
<a href="api/contrib-stempel/index.html">Stempel Polish Analyzer</a>
|
||||
</div>
|
||||
<div class="menuitem">
|
||||
<a href="api/contrib-ant/index.html">Ant</a>
|
||||
</div>
|
||||
<div class="menuitem">
|
||||
|
|
|
@ -139,6 +139,9 @@ document.write("Last Published: " + document.lastModified);
|
|||
<a href="api/contrib-smartcn/index.html">Smart Chinese Analyzer</a>
|
||||
</div>
|
||||
<div class="menuitem">
|
||||
<a href="api/contrib-stempel/index.html">Stempel Polish Analyzer</a>
|
||||
</div>
|
||||
<div class="menuitem">
|
||||
<a href="api/contrib-ant/index.html">Ant</a>
|
||||
</div>
|
||||
<div class="menuitem">
|
||||
|
@ -329,6 +332,12 @@ document.write("Last Published: " + document.lastModified);
|
|||
</li>
|
||||
</ul>
|
||||
|
||||
<ul>
|
||||
<li>
|
||||
<a href="api/contrib-stempel/index.html">Stempel Polish Analyzer</a> ___________________ <em>javadoc-contrib-stempel</em>
|
||||
</li>
|
||||
</ul>
|
||||
|
||||
<ul>
|
||||
<li>
|
||||
<a href="api/contrib-ant/index.html">Ant</a> ___________________ <em>javadoc-contrib-ant</em>
|
||||
|
|
|
@ -5,10 +5,10 @@
|
|||
/Producer (FOP 0.20.5) >>
|
||||
endobj
|
||||
5 0 obj
|
||||
<< /Length 1070 /Filter [ /ASCII85Decode /FlateDecode ]
|
||||
<< /Length 1057 /Filter [ /ASCII85Decode /FlateDecode ]
|
||||
>>
|
||||
stream
|
||||
Gau1.?#Q2d(kqGM/%BCKiqXP'9[ZpVeS1J@?*2RDJJ;8h)*"1u?U#K#YfU-GFDKA1)tf`Ap3AB!Y%R2UM2*O1hKgu1l78GlmpUr5-pPc\<?J+M&0?)^XnVMT2AbVg2og@in!r4pjjQ-A%3=cu^<l:LS)JZnPp.eKF7+JJf%8?<GmJW>L)ol6^`i+3elgiX$)[-\l.==%fb8IecEb%t=ZSD!C:buoU%L7QhMTHDEtXpA,O0\.p2p_&m)[&h64a>aq]3]CQ,+ZCVM0!qk[ChWq`$/B&qd=VpX>b0+?pIV49Pe,;mI@;*<L9c5nUoZ^n)e#%K<)ZpKP1GOiXSD.N4u`KHW75Ot(o4%#CqoL?2bMg8F,@Ia6E7PGoD`-;=04NhS-]Ld>Dl>@g\-O>bRcb6J.TVF>mZZu^pN;)BC_rJ\:eH?-Y(B.]@Aha>ZK<nKj\kn)&PZ;T3JrF@n`N8ZrY/I&^kgTdK;12@b48'b$KfVqLnM\DQqO.E.phGfV@\/%Lf4oR3JKU_Nqdl^O>/hY64(%<l>VX#>>r[MV-n?"QeO`%5a0gdO\Tmn1X>%&3b&,pb^(`(#r\S;*XocBcHC9)nS)&4g>dX#pGW2+DOQmO&pGr3%@=:)@c`U,R``NY'"ZFil#6ZA&@9^0\O@#3KEN73Q<j;WWK3?DV(f](=>4ri9T]I:b[S:l8@I_#a#:$:8p]M,3#=I"`-2<;f*pPCuRR+[ZSq<!a%T2-.e^c=7mrd`X)\1"=>_ooEG1RXN+ae4<c9ul`_i>bNrW#/\[pE/s]XH:3V%@*\r's2@0VTj3)(GA2-o!qqPGtUEWe^rp'S9_bknt?%f$g=U,KIju&&o$p&#urY7L<g1)@H8Wp`*-(0lr)o.;.6s-L;=_7kT!idPZeI[UYMR/Qg&''[[Y%j0S\BV(9PaVp$>1qSo,'o#l;>L*c5%&^s&iXD_q8Q>H2bS*'=B&3BY)@A_JjY0a9ti]T<-UGJ:+mpJS%Y(_52BR%^4-+lq@f4O['eMd7%_mFuA#IR^`rF'7nRUcn5KmGHW!c+_SO2d0GRY4Zk)J77ZV!@ip"n,~>
|
||||
Gau1.d8&FM'Sc)R'Y/mTHF@<Fa^n\2#"%#j\uRP[+a$@!9tJ7pg7Eb:daiT"nIufT1j6h;J&ab9,bE:O+"kG2'**[\EWjoo2M/c_#R+%R:lqV97N_t>*6_L#pTrU@nFL%HUKHA5].'bX<P$`poCTaRmak*[4f^*L/('pn5D&^=FJ])U]sD9?UBiPEHFnH/Zq>qM4(m\uW\8Q!rLgXdY=6b0+KJT0Opbcs/^.'q/IX"8=LFmVA7F#,E*u0GG+*q,hQbsH9OA)A6+&8.5sY.khJ=rR.3o6^j.E56#=3MWLfjdG`,*OZ7Q1H^$A'CNTnPX!W"8m%c8I2`DlG/MC+*7V6BEJF2HnV`CWV*oR\3d^FeF0>nn)+qkJ:+Slkdn<W"5Ud15J6_+IhJ,ot]>@b8#R>a0nFFf5u'[.\fNSlM<O'Wggih'f+K/=o(4YERkHna;E<]GsYPgVRR]V-[Q8[Y\g)mG1rN.-%n>&i&]\kIDQKM;H]OB[Wq-rrTg><eA4!kbMh\=ZRfXqr/PK].U=,4AF9eLO%_A&pL+#cBjhuACZ'fJlJZH33\j?X:U]J?I:Oo.h_eeEk[jA9!"RP+JBCel%8<e/r;DC0K8W!+)C)M(E^Vl^[>U)'K[.0=?)<P-O76:Y<4q$7c9[17?24cQBss(BG_eo)!SEj]&[mQKX1[u]`1=lVK!3b<e.I//bG*6g$kUHK;gbJC/^9IZpMdhHRChEQ.P\FE&p>gTRUj928).s*\enbNWXhX'9E(=IIc2q[N8<l&nr4^r2dh4[@#W_J0-GER5"b$6i[M(:>Sb6+@*gT,:K>TuKN"@b:f6TIc(BlK]Tg1j%@*\r-*:a"9UhM85$mC>eQ9(qDb3d5qj..7B9@'/I+6uEYGtXE+tQX.(N3GQFfMZn:-^D+:@$qfoW((9G%W]YP:*(*l*qkO1h_-<*.8i%eaZbA6a/Wna#G9m3#s596,I;sKRcP*qlF-(DUeCEF71-H8<5HXbW%6<`aO.u:,?s=>;*s:g))W4oqko7#;KWa>PA?3)g'r=0pW?`B$1uUL]/+.45NX~>
|
||||
endstream
|
||||
endobj
|
||||
6 0 obj
|
||||
|
@ -20,10 +20,10 @@ endobj
|
|||
>>
|
||||
endobj
|
||||
7 0 obj
|
||||
<< /Length 995 /Filter [ /ASCII85Decode /FlateDecode ]
|
||||
<< /Length 1045 /Filter [ /ASCII85Decode /FlateDecode ]
|
||||
>>
|
||||
stream
|
||||
GatUs>u03/'Sc)R.sqX>p&#CLc6]-+dKIu@Q'aD*aeV\RRZB@H.K0!^^.=b,18Sr4%!ganf]O5%f?;j6"Tt,W"I?k4Jb9ERUjk*AeC'75Z_<@0Y`-E%M^LjD-(&2dOT*_J?'tLEMF8Z;IBM0>.[U1*r`k'4@/4)4g$_t$NFb,3'$E-..pg1Ep#gdmW%<=fa4fc`K:Q]*W76CL!:'MhAtb_)M4')T*Vt`,`aT>iF%AL24R/;5i=D(4192/I<ELD[,6h#?ZH,Eb%r&J?S0+S=IU1?^niLa$7)M#?c6+4MnNk0oP%U>=7Pu1^W**sl7VNCb[^Qu$'[_"rX>+S@U5t^$*b1_kE++)th1>Q`oJaao9Kde?]%jYQAaT>Q!L)ITpC:W;SkP#$ZlNToS=imm@uSKGCV@d`QtEBeWp,:GlmF_DqNQcT@t`=jmU.RD]Xr$976N^rQkbq9.Df\o9K`BR9[/[cVtLlre[1^)NXKZiJF&I$5NYSXD41uU[h!V6b=%mUc>h`!05WI3I#OL(=8>Ru'mg[ma^e*1kQOXR,MuHAA&Q8b^::bH9L_ld4:o,%)X7ganfj$'Hch7'MoG;jVRE8A\7ldnLO%5PO.gSNGW1#68l\6[VY-1pP[$pIB+A5!2EfSsV:k%@6VV=U_</;aq_IDPCL*6i5rE&trJ"l_H8Nl2NbadWIJr\rM%XaiFAMF9Bc'l\.BeHG:^J.Y*m`H91]8Mm(+im'%#_dG*90A-Y`0^A9mBuG)sW2VY'ts&%P;1N&ct(e^qg^oi`;+b@JP(:*.:aF2UOr1-C\+RNO>U-Nm.Mq"!]?H[53NO/n_felbk@dZ+bo3V?_>X+@m?j,RA#VZg6Ul#59j?`hT%*f7MdtRqlt[qNF[C(tG6G3]kAkSW=u=(j'U!\>9kYQ./O>4_E[L7?r23f_6c1E#TCBlo&%]a>UQ^$GRO,BqPq[l`;YcIBjL)["k3mU8SMnn%9<,IHS>e'l<X4+)rfDZi~>
|
||||
GatUs>Ar4L'Z],,'R<q*nD\5Ra0lnMVH-k>UeY*k.1M#Dj8r()jkJ+l2)3="$Li-G3WG^mDYD>&np*\FLk+g8QP7tR9q^OH06@aPC6;?tQqNrplMUSPN"E[CV@ZX,rUf>l(F?>lWEYk,:Hqq+05(8Y8[&_^c96/%^_l;;J&8[6O[mkKcUo(i9Tqr>F[Alo[#EW>`5$UNdTdh_6UDl&`-$cDaL%5:;*kbSM)$\-OX+?^nm*sgQ?Wb&kX$Z$9$)r_I35s'e;'TVYo!uXWIiV7NZ#<sKFQE%!C)3>KcN&5pJ;Vl[N7ZGd)mqOK*7^d0GXIiY".pkj*H4A9NG6l4)BX>BD+m?\c&H>Y7qdAU-^/=Ea,sb2G.N*m9gTnEL;^*KF3q-'$$A.Vd\V-c&!HW<!>ap,$J&V)&muGpfs.GAKC&<J-e04Gd*78ebJj]!V;C*E.ZdlfGHd'3$Or7\]@0a8cSHDf-QG+pRR%uP<rb.rOBW.Rml'nrm<5%*[G1G;oT$WCUSuDjRmkg1u2VUD&8;aJFH6a+,WslrAGb:Fr.1\(Kl#L4,:kV74f`Ak8%h!S$W%$GnmqHcEkocfM`.hksCO#fuD+X=['HH(eob@A*;$SCStV'>b8okkbC8`AK$R>0P6m`/_:/m8n#<t*8=9r1<tWXU+Dc\f$Xb6-qgf^+(l(2TBVg+Y0?#'l-PS;Cb-WMeeQlr[C0d0)H6mQWd[YtWAV9H\)XJ%oR@p3JjWRnn!@!LFg0?uS$l;'N\.2c'4^P8Oc%HZ6&%sPO3f/0cDQY[;.CJ,`Ttp<S#32Cn,(gEqZLq!+BfjU#]PG;%q5O$h`MOip#us3C/T9bW?dG/FW,/)[<_P1kl8elRTX4eTF67;:U]pZc`[nRRR/&qUTlRZJ>a]HZ^a-ODQ8U<4N=0(%s.+4"E;]U`o'J@7tYYF\%PnDh0FY`Y*Y5Ji%sYsYX&C2=1A"f+QM!f=o770H[eL!p(h[Sem%`Id2U],0H/Y2^+B#P*3@BT?+($>D@Zr4,tW%)g;UiCC?E6e"mFk"?X2^'VZ~>
|
||||
endstream
|
||||
endobj
|
||||
8 0 obj
|
||||
|
@ -87,19 +87,19 @@ endobj
|
|||
xref
|
||||
0 14
|
||||
0000000000 65535 f
|
||||
0000003088 00000 n
|
||||
0000003152 00000 n
|
||||
0000003202 00000 n
|
||||
0000003126 00000 n
|
||||
0000003190 00000 n
|
||||
0000003240 00000 n
|
||||
0000000015 00000 n
|
||||
0000000071 00000 n
|
||||
0000001233 00000 n
|
||||
0000001339 00000 n
|
||||
0000002425 00000 n
|
||||
0000002531 00000 n
|
||||
0000002643 00000 n
|
||||
0000002753 00000 n
|
||||
0000002864 00000 n
|
||||
0000002972 00000 n
|
||||
0000001220 00000 n
|
||||
0000001326 00000 n
|
||||
0000002463 00000 n
|
||||
0000002569 00000 n
|
||||
0000002681 00000 n
|
||||
0000002791 00000 n
|
||||
0000002902 00000 n
|
||||
0000003010 00000 n
|
||||
trailer
|
||||
<<
|
||||
/Size 14
|
||||
|
@ -107,5 +107,5 @@ trailer
|
|||
/Info 4 0 R
|
||||
>>
|
||||
startxref
|
||||
3324
|
||||
3362
|
||||
%%EOF
|
||||
|
|
|
@ -141,6 +141,9 @@ document.write("Last Published: " + document.lastModified);
|
|||
<a href="../api/contrib-smartcn/index.html">Smart Chinese Analyzer</a>
|
||||
</div>
|
||||
<div class="menuitem">
|
||||
<a href="../api/contrib-stempel/index.html">Stempel Polish Analyzer</a>
|
||||
</div>
|
||||
<div class="menuitem">
|
||||
<a href="../api/contrib-ant/index.html">Ant</a>
|
||||
</div>
|
||||
<div class="menuitem">
|
||||
|
|
|
@ -141,6 +141,9 @@ document.write("Last Published: " + document.lastModified);
|
|||
<a href="api/contrib-smartcn/index.html">Smart Chinese Analyzer</a>
|
||||
</div>
|
||||
<div class="menuitem">
|
||||
<a href="api/contrib-stempel/index.html">Stempel Polish Analyzer</a>
|
||||
</div>
|
||||
<div class="menuitem">
|
||||
<a href="api/contrib-ant/index.html">Ant</a>
|
||||
</div>
|
||||
<div class="menuitem">
|
||||
|
|
|
@ -141,6 +141,9 @@ document.write("Last Published: " + document.lastModified);
|
|||
<a href="api/contrib-smartcn/index.html">Smart Chinese Analyzer</a>
|
||||
</div>
|
||||
<div class="menuitem">
|
||||
<a href="api/contrib-stempel/index.html">Stempel Polish Analyzer</a>
|
||||
</div>
|
||||
<div class="menuitem">
|
||||
<a href="api/contrib-ant/index.html">Ant</a>
|
||||
</div>
|
||||
<div class="menuitem">
|
||||
|
|
|
@ -139,6 +139,9 @@ document.write("Last Published: " + document.lastModified);
|
|||
<a href="api/contrib-smartcn/index.html">Smart Chinese Analyzer</a>
|
||||
</div>
|
||||
<div class="menuitem">
|
||||
<a href="api/contrib-stempel/index.html">Stempel Polish Analyzer</a>
|
||||
</div>
|
||||
<div class="menuitem">
|
||||
<a href="api/contrib-ant/index.html">Ant</a>
|
||||
</div>
|
||||
<div class="menuitem">
|
||||
|
|
|
@ -54,6 +54,7 @@ See http://forrest.apache.org/docs/linking.html for more info
|
|||
<javadoc-contrib label="Contrib">
|
||||
<javadoc-contrib-analyzers label="Analyzers" href="ext:javadocs-contrib-analyzers"/>
|
||||
<javadoc-contrib-smartcn label="Smart Chinese Analyzer" href="ext:javadocs-contrib-smartcn"/>
|
||||
<javadoc-contrib-stempel label="Stempel Polish Analyzer" href="ext:javadocs-contrib-stempel"/>
|
||||
<javadoc-contrib-ant label="Ant" href="ext:javadocs-contrib-ant"/>
|
||||
<javadoc-contrib-bdb label="Bdb" href="ext:javadocs-contrib-bdb"/>
|
||||
<javadoc-contrib-bdb-je label="Bdb-je" href="ext:javadocs-contrib-bdb-je"/>
|
||||
|
@ -104,6 +105,7 @@ See http://forrest.apache.org/docs/linking.html for more info
|
|||
<javadocs-demo href="api/demo/index.html"/>
|
||||
<javadocs-contrib-analyzers href="api/contrib-analyzers/index.html"/>
|
||||
<javadocs-contrib-smartcn href="api/contrib-smartcn/index.html"/>
|
||||
<javadocs-contrib-stempel href="api/contrib-stempel/index.html"/>
|
||||
<javadocs-contrib-ant href="api/contrib-ant/index.html"/>
|
||||
<javadocs-contrib-bdb href="api/contrib-bdb/index.html"/>
|
||||
<javadocs-contrib-bdb-je href="api/contrib-bdb-je/index.html"/>
|
||||
|
|
Loading…
Reference in New Issue