LUCENE-2213: rename ArrayUtil.getNextSize -> oversize; tweak how it picks the next size

git-svn-id: https://svn.apache.org/repos/asf/lucene/java/trunk@901662 13f79535-47bb-0310-9956-ffa450edef68
Michael McCandless 2010-01-21 11:54:50 +00:00
parent 94826b348c
commit 9b3b890f45
71 changed files with 26485 additions and 59 deletions


@ -35,6 +35,7 @@ package org.tartarus.snowball;
import java.lang.reflect.InvocationTargetException;
import org.apache.lucene.util.ArrayUtil;
+import org.apache.lucene.util.RamUsageEstimator;
/**
* This is the rev 502 of the Snowball SVN trunk,
@ -432,7 +433,7 @@ public abstract class SnowballProgram {
final int newLength = limit + adjustment;
//resize if necessary
if (newLength > current.length) {
-            char newBuffer[] = new char[ArrayUtil.getNextSize(newLength)];
+            char newBuffer[] = new char[ArrayUtil.oversize(newLength, RamUsageEstimator.NUM_BYTES_CHAR)];
System.arraycopy(current, 0, newBuffer, 0, limit);
current = newBuffer;
}
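For reference, a minimal sketch of the grow-on-demand pattern the renamed ArrayUtil.oversize encourages. It assumes only the ArrayUtil.oversize(int, int) and RamUsageEstimator.NUM_BYTES_CHAR calls visible in the hunk above; the helper class and method names are hypothetical.

import org.apache.lucene.util.ArrayUtil;
import org.apache.lucene.util.RamUsageEstimator;

final class CharBufferGrowth {
  // Hypothetical helper: returns a buffer with room for at least minLength chars,
  // copying over the first 'used' chars when a reallocation is needed.
  static char[] ensureCapacity(char[] buffer, int used, int minLength) {
    if (minLength <= buffer.length) {
      return buffer; // already large enough, no reallocation
    }
    // oversize() picks the next allocation size from the requested minimum and the
    // per-element width, so repeated small appends amortize to roughly linear cost.
    char[] newBuffer = new char[ArrayUtil.oversize(minLength, RamUsageEstimator.NUM_BYTES_CHAR)];
    System.arraycopy(buffer, 0, newBuffer, 0, used);
    return newBuffer;
  }
}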


@ -0,0 +1,2 @@
build
snowball


@ -0,0 +1,16 @@
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/


@ -0,0 +1,25 @@
Lucene Snowball README file
This project provides a pre-compiled version of the Snowball stemmers
based on revision 500 of the Tartarus Snowball repository,
together with classes integrating them with the Lucene search engine.
A few changes have been made to the static Snowball code and the compiled stemmers:
* The SnowballProgram class is made abstract and gains a new abstract method stem(), so the Lucene filter class SnowballFilter can avoid reflection.
* All uses of StringBuffer have been refactored to StringBuilder for speed.
* The Snowball BSD license header has been added to the Java classes to keep RAT from adding new ASL headers.
IMPORTANT NOTICE ON BACKWARDS COMPATIBILITY!
An index created using the Snowball module in Lucene 2.3.2 and below
might not be compatible with the Snowball module in Lucene 2.4 or greater.
For more information about this issue see:
https://issues.apache.org/jira/browse/LUCENE-1142
For more information on Snowball, see:
http://snowball.tartarus.org/
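As a rough sketch of the first change listed above (calling the abstract stem() directly on a generated stemmer, no reflection involved); the demo class name and sample word are arbitrary:

import org.tartarus.snowball.SnowballProgram;
import org.tartarus.snowball.ext.EnglishStemmer;

public class StemOneWord {
  public static void main(String[] args) {
    SnowballProgram stemmer = new EnglishStemmer(); // any generated stemmer works the same way
    stemmer.setCurrent("integrations");             // load the word to stem
    stemmer.stem();                                 // abstract method, resolved statically
    System.out.println(stemmer.getCurrent());       // prints the stemmed form
  }
}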


@ -0,0 +1,26 @@
Copyright (c) 2001, Dr Martin Porter
Copyright (c) 2002, Richard Boulton
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
* Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* Neither the name of the copyright holders nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE
FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.


@ -0,0 +1 @@
snowball


@ -0,0 +1,7 @@
#!/bin/csh -f
set infile = $1
set outdir = $2
set name = $infile:h:t:uStemmer
exec $0:h/snowball $infile -o $outdir/$name -n $name -java

contrib/snowball/build.xml Normal file

@ -0,0 +1,155 @@
<?xml version="1.0"?>
<!--
Licensed to the Apache Software Foundation (ASF) under one or more
contributor license agreements. See the NOTICE file distributed with
this work for additional information regarding copyright ownership.
The ASF licenses this file to You under the Apache License, Version 2.0
the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
-->
<project name="snowball" default="default">
<description>
Snowball Analyzers
</description>
<import file="../contrib-build.xml"/>
<property name="snowball.cvsroot" value=":pserver:cvsuser@cvs.tartarus.org:/home/cvs"/>
<property name="snowball.cvs.password" value="anonymous"/>
<property name="snowball.root" value="snowball/website"/>
<property name="bin.dir" location="bin"/>
<property name="analyzers.jar" location="${common.dir}/build/contrib/analyzers/common/lucene-analyzers-${version}.jar"/>
<available property="analyzers.jar.present" type="file" file="${analyzers.jar}"/>
<path id="classpath">
<pathelement path="${lucene.jar}"/>
<pathelement path="${analyzers.jar}"/>
<pathelement path="${project.classpath}"/>
</path>
<target name="jar" depends="compile" description="Create JAR">
<jarify>
<metainf-includes>
<metainf dir=".">
<include name="SNOWBALL-LICENSE.txt"/>
</metainf>
</metainf-includes>
</jarify>
</target>
<target name="jar-src" depends="init"
description="Packages the sources as JAR file">
<jarify basedir="${src.dir}" destfile="${build.dir}/${final.name}-src.jar">
<metainf-includes>
<metainf dir=".">
<include name="SNOWBALL-LICENSE.txt"/>
</metainf>
</metainf-includes>
</jarify>
</target>
<!-- ====================================================== -->
<!-- Download Snowball code -->
<!-- ====================================================== -->
<target name="download" depends="init">
<cvs cvsRoot="${snowball.cvsroot}"
package="${snowball.root}"
passfile="snowball.cvspass"/>
</target>
<target name="create-passfile">
<cvspass cvsroot="${snowball.cvsroot}"
password="${snowball.cvs.password}"
passfile="snowball.cvspass"
/>
</target>
<!-- ====================================================== -->
<!-- Compile Snowball C code -->
<!-- ====================================================== -->
<target name="compile-compiler" depends="download">
<apply failonerror="true" executable="gcc" parallel="true">
<arg value="-O"/>
<arg value="-o"/>
<arg value="${bin.dir}/snowball"/>
<fileset dir="${snowball.root}/p" includes="*.c"/>
</apply>
</target>
<!-- ====================================================== -->
<!-- Generate Java code -->
<!-- ====================================================== -->
<target name="generate" depends="compile-compiler">
<apply failonerror="true" executable="${bin.dir}/snowball.sh">
<srcfile/>
<arg value="${src.dir}/net/sf/snowball/ext"/>
<fileset dir="${snowball.root}" includes="**/stem.sbl"/>
</apply>
<copy todir="${src.dir}/net">
<fileset dir="${snowball.root}/net">
<include name="**/*.java"/>
</fileset>
</copy>
</target>
<target name="docs">
<taskdef
name="anakia"
classname="org.apache.velocity.anakia.AnakiaTask"
>
<classpath refid="anakia.classpath"/>
</taskdef>
<anakia
basedir="${docs.src}"
destdir="${docs.dest}/"
extension=".html" style="./site.vsl"
projectFile="stylesheets/project.xml"
excludes="**/stylesheets/** empty.xml"
includes="**/*.xml"
lastModifiedCheck="true"
templatePath="${jakarta.site2.home}/xdocs/stylesheets"
>
</anakia>
</target>
<target name="compile-core" depends="build-analyzers, common.compile-core" />
<target name="compile-test" depends="download-vocab-tests, common.compile-test" />
<target name="build-analyzers" unless="analyzers.jar.present">
<echo>Snowball building dependency ${analyzers.jar}</echo>
<ant antfile="../analyzers/build.xml" target="default" inheritall="false" dir="../analyzers" />
</target>
<property name="snowball.vocab.rev" value="500"/>
<property name="snowball.vocab.url"
value="svn://svn.tartarus.org/snowball/trunk/data"/>
<property name="vocab.dir" value="src/test/org/apache/lucene/analysis/snowball"/>
<target name="download-vocab-tests" depends="compile-core"
description="Downloads Snowball vocabulary tests">
<sequential>
<mkdir dir="${vocab.dir}"/>
<exec dir="${vocab.dir}" executable="${svn.exe}" failifexecutionfails="false" failonerror="true">
<arg line="checkout --trust-server-cert --non-interactive -r ${snowball.vocab.rev} ${snowball.vocab.url}"/>
</exec>
</sequential>
</target>
</project>


@ -0,0 +1,148 @@
<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN" "http://www.w3.org/TR/html4/loose.dtd">
<!-- Content Stylesheet for Site -->
<!-- start the processing -->
<!-- ====================================================================== -->
<!-- GENERATED FILE, DO NOT EDIT, EDIT THE XML FILE IN xdocs INSTEAD! -->
<!-- Main Page Section -->
<!-- ====================================================================== -->
<html>
<head>
<meta http-equiv="Content-Type" content="text/html; charset=iso-8859-1"/>
<meta name="author" value="Doug Cutting">
<meta name="email" value="cutting@apache.org">
<title>Snowball Stemmers for Lucene - Overview - Snowball Stemmers for Lucene</title>
</head>
<body bgcolor="#ffffff" text="#000000" link="#525D76">
<table border="0" width="100%" cellspacing="0">
<!-- TOP IMAGE -->
<tr>
<td colspan="2">
<a href="http://jakarta.apache.org"><img src="http://jakarta.apache.org/images/jakarta-logo.gif" align="left" border="0"/></a>
</td>
</tr>
</table>
<table border="0" width="100%" cellspacing="4">
<tr><td colspan="2">
<hr noshade="" size="1"/>
</td></tr>
<tr>
<!-- LEFT SIDE NAVIGATION -->
<td width="20%" valign="top" nowrap="true">
<!-- ============================================================ -->
<p><strong>Documentation</strong></p>
<ul>
<li> <a href="./api/index.html">Javadoc</a>
</li>
</ul>
<p><strong>Download</strong></p>
<ul>
<li> <a href="http://jakarta.apache.org/builds/jakarta-lucene-sandbox/snowball/">Releases</a>
</li>
<li> <a href="http://jakarta.apache.org/site/cvsindex.html">CVS Repository</a>
</li>
</ul>
<p><strong>Links</strong></p>
<ul>
<li> <a href="http://snowball.tartarus.org/">Snowball Home</a>
</li>
<li> <a href="http://jakarta.apache.org/lucene/">Lucene Home</a>
</li>
<li> <a href="http://jakarta.apache.org/lucene/docs/lucene-sandbox/">Lucene Sandbox</a>
</li>
</ul>
<p><strong>Jakarta</strong></p>
<ul>
<li> <a href="http://jakarta.apache.org/site/getinvolved.html">Get Involved</a>
</li>
<li> <a href="http://jakarta.apache.org/site/acknowledgements.html">Acknowledgements</a>
</li>
<li> <a href="http://jakarta.apache.org/site/contact.html">Contact</a>
</li>
<li> <a href="http://jakarta.apache.org/site/legal.html">Legal</a>
</li>
</ul>
</td>
<td width="80%" align="left" valign="top">
<table border="0" cellspacing="0" cellpadding="2" width="100%">
<tr><td bgcolor="#525D76">
<font color="#ffffff" face="arial,helvetica,sanserif">
<a name="Snowball Stemmers for Lucene"><strong>Snowball Stemmers for Lucene</strong></a>
</font>
</td></tr>
<tr><td>
<blockquote>
<p>
This project provides pre-compiled version of the Snowball stemmers
together with classes integrating them with the Lucene search engine.
</p>
</blockquote>
</p>
</td></tr>
<tr><td><br/></td></tr>
</table>
<table border="0" cellspacing="0" cellpadding="2" width="100%">
<tr><td bgcolor="#525D76">
<font color="#ffffff" face="arial,helvetica,sanserif">
<a name="Download"><strong>Download</strong></a>
</font>
</td></tr>
<tr><td>
<blockquote>
<p>
Releases of the stemmers are available
<a href="http://jakarta.apache.org/builds/jakarta-lucene-sandbox/snowball/">
here</a>
</p>
</blockquote>
</p>
</td></tr>
<tr><td><br/></td></tr>
</table>
</td>
</tr>
<!-- FOOTER -->
<tr><td colspan="2">
<hr noshade="" size="1"/>
</td></tr>
<tr><td colspan="2">
<div align="center"><font color="#525D76" size="-1"><em>
Copyright &#169; 1999-2004, The Apache Software Foundation
</em></font></div>
</td></tr>
</table>
</body>
</html>
<!-- end the processing -->


@ -0,0 +1,43 @@
<project xmlns="http://maven.apache.org/POM/4.0.0"
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/maven-v4_0_0.xsd">
<!--
Licensed to the Apache Software Foundation (ASF) under one
or more contributor license agreements. See the NOTICE file
distributed with this work for additional information
regarding copyright ownership. The ASF licenses this file
to you under the Apache License, Version 2.0 (the
"License"); you may not use this file except in compliance
with the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing,
software distributed under the License is distributed on an
"AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
KIND, either express or implied. See the License for the
specific language governing permissions and limitations
under the License.
-->
<modelVersion>4.0.0</modelVersion>
<parent>
<groupId>org.apache.lucene</groupId>
<artifactId>lucene-contrib</artifactId>
<version>@version@</version>
</parent>
<groupId>org.apache.lucene</groupId>
<artifactId>lucene-snowball</artifactId>
<name>Lucene Snowball</name>
<version>@version@</version>
<description>Snowball Analyzers</description>
<packaging>jar</packaging>
<dependencies>
<dependency>
<groupId>org.apache.lucene</groupId>
<artifactId>lucene-analyzers</artifactId>
<version>@version@</version>
</dependency>
</dependencies>
</project>


@ -0,0 +1 @@
:pserver:cvsuser@cvs.tartarus.org:/home/cvs Ay=0=a%0bZ


@ -0,0 +1,121 @@
package org.apache.lucene.analysis.snowball;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import org.apache.lucene.analysis.*;
import org.apache.lucene.analysis.standard.*;
import org.apache.lucene.analysis.tr.TurkishLowerCaseFilter;
import org.apache.lucene.util.Version;
import java.io.IOException;
import java.io.Reader;
import java.util.Set;
/** Filters {@link StandardTokenizer} with {@link StandardFilter}, {@link
* LowerCaseFilter}, {@link StopFilter} and {@link SnowballFilter}.
*
* Available stemmers are listed in org.tartarus.snowball.ext. The name of a
* stemmer is the part of the class name before "Stemmer", e.g., the stemmer in
* {@link org.tartarus.snowball.ext.EnglishStemmer} is named "English".
*
* <p><b>NOTE</b>: This class uses the same {@link Version}
* dependent settings as {@link StandardAnalyzer}, with the following addition:
* <ul>
* <li> As of 3.1, uses {@link TurkishLowerCaseFilter} for the Turkish language.
* </ul>
* </p>
*/
public final class SnowballAnalyzer extends Analyzer {
private String name;
private Set<?> stopSet;
private final Version matchVersion;
/** Builds the named analyzer with no stop words. */
public SnowballAnalyzer(Version matchVersion, String name) {
this.name = name;
this.matchVersion = matchVersion;
}
/**
* Builds the named analyzer with the given stop words.
* @deprecated Use {@link #SnowballAnalyzer(Version, String, Set)} instead.
*/
@Deprecated
public SnowballAnalyzer(Version matchVersion, String name, String[] stopWords) {
this(matchVersion, name);
stopSet = StopFilter.makeStopSet(matchVersion, stopWords);
}
/** Builds the named analyzer with the given stop words. */
public SnowballAnalyzer(Version matchVersion, String name, Set<?> stopWords) {
this(matchVersion, name);
stopSet = CharArraySet.unmodifiableSet(CharArraySet.copy(matchVersion,
stopWords));
}
/** Constructs a {@link StandardTokenizer} filtered by a {@link
StandardFilter}, a {@link LowerCaseFilter}, a {@link StopFilter},
and a {@link SnowballFilter} */
@Override
public TokenStream tokenStream(String fieldName, Reader reader) {
TokenStream result = new StandardTokenizer(matchVersion, reader);
result = new StandardFilter(result);
// Use a special lowercase filter for turkish, the stemmer expects it.
if (matchVersion.onOrAfter(Version.LUCENE_31) && name.equals("Turkish"))
result = new TurkishLowerCaseFilter(result);
else
result = new LowerCaseFilter(matchVersion, result);
if (stopSet != null)
result = new StopFilter(matchVersion,
result, stopSet);
result = new SnowballFilter(result, name);
return result;
}
private class SavedStreams {
Tokenizer source;
TokenStream result;
}
/** Returns a (possibly reused) {@link StandardTokenizer} filtered by a
* {@link StandardFilter}, a {@link LowerCaseFilter},
* a {@link StopFilter}, and a {@link SnowballFilter} */
@Override
public TokenStream reusableTokenStream(String fieldName, Reader reader)
throws IOException {
SavedStreams streams = (SavedStreams) getPreviousTokenStream();
if (streams == null) {
streams = new SavedStreams();
streams.source = new StandardTokenizer(matchVersion, reader);
streams.result = new StandardFilter(streams.source);
// Use a special lowercase filter for turkish, the stemmer expects it.
if (matchVersion.onOrAfter(Version.LUCENE_31) && name.equals("Turkish"))
streams.result = new TurkishLowerCaseFilter(streams.result);
else
streams.result = new LowerCaseFilter(matchVersion, streams.result);
if (stopSet != null)
streams.result = new StopFilter(matchVersion,
streams.result, stopSet);
streams.result = new SnowballFilter(streams.result, name);
setPreviousTokenStream(streams);
} else {
streams.source.reset(reader);
}
return streams.result;
}
}
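A hedged usage sketch for this analyzer; the field name and sample text are arbitrary, and it assumes the TermAttribute consumption pattern used elsewhere in this commit:

import java.io.StringReader;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.snowball.SnowballAnalyzer;
import org.apache.lucene.analysis.tokenattributes.TermAttribute;
import org.apache.lucene.util.Version;

public class SnowballAnalyzerDemo {
  public static void main(String[] args) throws Exception {
    SnowballAnalyzer analyzer = new SnowballAnalyzer(Version.LUCENE_31, "English");
    TokenStream ts = analyzer.tokenStream("body", new StringReader("stemming searches quickly"));
    TermAttribute term = ts.addAttribute(TermAttribute.class);
    while (ts.incrementToken()) {     // one (possibly stemmed) token per iteration
      System.out.println(term.term());
    }
    ts.close();
  }
}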


@ -0,0 +1,92 @@
package org.apache.lucene.analysis.snowball;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.IOException;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.TermAttribute;
import org.apache.lucene.analysis.tr.TurkishLowerCaseFilter; // javadoc @link
import org.apache.lucene.analysis.LowerCaseFilter; // javadoc @link
import org.tartarus.snowball.SnowballProgram;
/**
* A filter that stems words using a Snowball-generated stemmer.
*
* Available stemmers are listed in {@link org.tartarus.snowball.ext}.
* <p><b>NOTE</b>: SnowballFilter expects lowercased text.
* <ul>
* <li>For the Turkish language, see {@link TurkishLowerCaseFilter}.
* <li>For other languages, see {@link LowerCaseFilter}.
* </ul>
* </p>
*/
public final class SnowballFilter extends TokenFilter {
private SnowballProgram stemmer;
private TermAttribute termAtt;
public SnowballFilter(TokenStream input, SnowballProgram stemmer) {
super(input);
this.stemmer = stemmer;
termAtt = addAttribute(TermAttribute.class);
}
/**
* Construct the named stemming filter.
*
* Available stemmers are listed in {@link org.tartarus.snowball.ext}.
* The name of a stemmer is the part of the class name before "Stemmer",
* e.g., the stemmer in {@link org.tartarus.snowball.ext.EnglishStemmer} is named "English".
*
* @param in the input tokens to stem
* @param name the name of a stemmer
*/
public SnowballFilter(TokenStream in, String name) {
super(in);
try {
Class<?> stemClass = Class.forName("org.tartarus.snowball.ext." + name + "Stemmer");
stemmer = (SnowballProgram) stemClass.newInstance();
} catch (Exception e) {
throw new RuntimeException(e.toString());
}
termAtt = addAttribute(TermAttribute.class);
}
/** Returns the next input Token, after being stemmed */
@Override
public final boolean incrementToken() throws IOException {
if (input.incrementToken()) {
char termBuffer[] = termAtt.termBuffer();
final int length = termAtt.termLength();
stemmer.setCurrent(termBuffer, length);
stemmer.stem();
final char finalTerm[] = stemmer.getCurrentBuffer();
final int newLength = stemmer.getCurrentBufferLength();
if (finalTerm != termBuffer)
termAtt.setTermBuffer(finalTerm, 0, newLength);
else
termAtt.setTermLength(newLength);
return true;
} else {
return false;
}
}
}
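A short sketch of the non-reflective constructor path, building the chain the class-level NOTE asks for (lowercasing before stemming). It reuses only classes referenced in this commit; the demo class name is hypothetical:

import java.io.StringReader;
import org.apache.lucene.analysis.LowerCaseFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.snowball.SnowballFilter;
import org.apache.lucene.analysis.standard.StandardFilter;
import org.apache.lucene.analysis.standard.StandardTokenizer;
import org.apache.lucene.analysis.tokenattributes.TermAttribute;
import org.apache.lucene.util.Version;
import org.tartarus.snowball.ext.EnglishStemmer;

public class SnowballFilterDemo {
  public static void main(String[] args) throws Exception {
    Version matchVersion = Version.LUCENE_31;
    TokenStream ts = new StandardTokenizer(matchVersion, new StringReader("Stemming Filters"));
    ts = new StandardFilter(ts);
    ts = new LowerCaseFilter(matchVersion, ts);        // SnowballFilter expects lowercased text
    ts = new SnowballFilter(ts, new EnglishStemmer()); // concrete stemmer instance, no reflection
    TermAttribute term = ts.addAttribute(TermAttribute.class);
    while (ts.incrementToken()) {
      System.out.println(term.term());
    }
    ts.close();
  }
}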


@ -0,0 +1,24 @@
<!doctype html public "-//w3c//dtd html 4.0 transitional//en">
<!--
Licensed to the Apache Software Foundation (ASF) under one or more
contributor license agreements. See the NOTICE file distributed with
this work for additional information regarding copyright ownership.
The ASF licenses this file to You under the Apache License, Version 2.0
(the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
-->
<html>
<body>
{@link org.apache.lucene.analysis.TokenFilter} and {@link
org.apache.lucene.analysis.Analyzer} implementations that use Snowball
stemmers.
</body>
</html>


@ -0,0 +1,63 @@
/*
Copyright (c) 2001, Dr Martin Porter
Copyright (c) 2002, Richard Boulton
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
* Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* Neither the name of the copyright holders nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE
FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
package org.tartarus.snowball;
import java.lang.reflect.Method;
public class Among {
public Among (String s, int substring_i, int result,
String methodname, SnowballProgram methodobject) {
this.s_size = s.length();
this.s = s.toCharArray();
this.substring_i = substring_i;
this.result = result;
this.methodobject = methodobject;
if (methodname.length() == 0) {
this.method = null;
} else {
try {
this.method = methodobject.getClass().
getDeclaredMethod(methodname, new Class[0]);
} catch (NoSuchMethodException e) {
throw new RuntimeException(e);
}
}
}
public final int s_size; /* search string */
public final char[] s; /* search string */
public final int substring_i; /* index to longest matching substring */
public final int result; /* result of the lookup */
public final Method method; /* method to use if substring matches */
public final SnowballProgram methodobject; /* object to invoke method on */
};


@ -0,0 +1,566 @@
/*
Copyright (c) 2001, Dr Martin Porter
Copyright (c) 2002, Richard Boulton
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
* Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* Neither the name of the copyright holders nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE
FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
package org.tartarus.snowball;
import java.lang.reflect.InvocationTargetException;
import org.apache.lucene.util.ArrayUtil;
import org.apache.lucene.util.RamUsageEstimator;
/**
* This is the rev 502 of the Snowball SVN trunk,
* but modified:
* made abstract and introduced abstract method stem to avoid expensive reflection in filter class.
* refactored StringBuffers to StringBuilder
* uses char[] as buffer instead of StringBuffer/StringBuilder
* eq_s,eq_s_b,insert,replace_s take CharSequence like eq_v and eq_v_b
*/
public abstract class SnowballProgram {
protected SnowballProgram()
{
current = new char[8];
setCurrent("");
}
public abstract boolean stem();
/**
* Set the current string.
*/
public void setCurrent(String value)
{
current = value.toCharArray();
cursor = 0;
limit = value.length();
limit_backward = 0;
bra = cursor;
ket = limit;
}
/**
* Get the current string.
*/
public String getCurrent()
{
return new String(current, 0, limit);
}
/**
* Set the current string.
* @param text character array containing input
* @param length valid length of text.
*/
public void setCurrent(char text[], int length) {
current = text;
cursor = 0;
limit = length;
limit_backward = 0;
bra = cursor;
ket = limit;
}
/**
* Get the current buffer containing the stem.
* <p>
* NOTE: this may be a reference to a different character array than the
* one originally provided with setCurrent, in the exceptional case that
* stemming produced a longer intermediate or result string.
* </p>
* <p>
* It is necessary to use {@link #getCurrentBufferLength()} to determine
* the valid length of the returned buffer. For example, many words are
* stemmed simply by subtracting from the length to remove suffixes.
* </p>
* @see #getCurrentBufferLength()
*/
public char[] getCurrentBuffer() {
return current;
}
/**
* Get the valid length of the character array in
* {@link #getCurrentBuffer()}.
* @return valid length of the array.
*/
public int getCurrentBufferLength() {
return limit;
}
// current string
private char current[];
protected int cursor;
protected int limit;
protected int limit_backward;
protected int bra;
protected int ket;
protected void copy_from(SnowballProgram other)
{
current = other.current;
cursor = other.cursor;
limit = other.limit;
limit_backward = other.limit_backward;
bra = other.bra;
ket = other.ket;
}
protected boolean in_grouping(char [] s, int min, int max)
{
if (cursor >= limit) return false;
char ch = current[cursor];
if (ch > max || ch < min) return false;
ch -= min;
if ((s[ch >> 3] & (0X1 << (ch & 0X7))) == 0) return false;
cursor++;
return true;
}
protected boolean in_grouping_b(char [] s, int min, int max)
{
if (cursor <= limit_backward) return false;
char ch = current[cursor - 1];
if (ch > max || ch < min) return false;
ch -= min;
if ((s[ch >> 3] & (0X1 << (ch & 0X7))) == 0) return false;
cursor--;
return true;
}
protected boolean out_grouping(char [] s, int min, int max)
{
if (cursor >= limit) return false;
char ch = current[cursor];
if (ch > max || ch < min) {
cursor++;
return true;
}
ch -= min;
if ((s[ch >> 3] & (0X1 << (ch & 0X7))) == 0) {
cursor ++;
return true;
}
return false;
}
protected boolean out_grouping_b(char [] s, int min, int max)
{
if (cursor <= limit_backward) return false;
char ch = current[cursor - 1];
if (ch > max || ch < min) {
cursor--;
return true;
}
ch -= min;
if ((s[ch >> 3] & (0X1 << (ch & 0X7))) == 0) {
cursor--;
return true;
}
return false;
}
protected boolean in_range(int min, int max)
{
if (cursor >= limit) return false;
char ch = current[cursor];
if (ch > max || ch < min) return false;
cursor++;
return true;
}
protected boolean in_range_b(int min, int max)
{
if (cursor <= limit_backward) return false;
char ch = current[cursor - 1];
if (ch > max || ch < min) return false;
cursor--;
return true;
}
protected boolean out_range(int min, int max)
{
if (cursor >= limit) return false;
char ch = current[cursor];
if (!(ch > max || ch < min)) return false;
cursor++;
return true;
}
protected boolean out_range_b(int min, int max)
{
if (cursor <= limit_backward) return false;
char ch = current[cursor - 1];
if(!(ch > max || ch < min)) return false;
cursor--;
return true;
}
protected boolean eq_s(int s_size, CharSequence s)
{
if (limit - cursor < s_size) return false;
int i;
for (i = 0; i != s_size; i++) {
if (current[cursor + i] != s.charAt(i)) return false;
}
cursor += s_size;
return true;
}
/** @deprecated for binary back compat. Will be removed in Lucene 4.0 */
@Deprecated
protected boolean eq_s(int s_size, String s)
{
return eq_s(s_size, (CharSequence)s);
}
protected boolean eq_s_b(int s_size, CharSequence s)
{
if (cursor - limit_backward < s_size) return false;
int i;
for (i = 0; i != s_size; i++) {
if (current[cursor - s_size + i] != s.charAt(i)) return false;
}
cursor -= s_size;
return true;
}
/** @deprecated for binary back compat. Will be removed in Lucene 4.0 */
@Deprecated
protected boolean eq_s_b(int s_size, String s)
{
return eq_s_b(s_size, (CharSequence)s);
}
protected boolean eq_v(CharSequence s)
{
return eq_s(s.length(), s);
}
/** @deprecated for binary back compat. Will be removed in Lucene 4.0 */
@Deprecated
protected boolean eq_v(StringBuilder s)
{
return eq_s(s.length(), (CharSequence)s);
}
protected boolean eq_v_b(CharSequence s)
{ return eq_s_b(s.length(), s);
}
/** @deprecated for binary back compat. Will be removed in Lucene 4.0 */
@Deprecated
protected boolean eq_v_b(StringBuilder s)
{ return eq_s_b(s.length(), (CharSequence)s);
}
protected int find_among(Among v[], int v_size)
{
int i = 0;
int j = v_size;
int c = cursor;
int l = limit;
int common_i = 0;
int common_j = 0;
boolean first_key_inspected = false;
while(true) {
int k = i + ((j - i) >> 1);
int diff = 0;
int common = common_i < common_j ? common_i : common_j; // smaller
Among w = v[k];
int i2;
for (i2 = common; i2 < w.s_size; i2++) {
if (c + common == l) {
diff = -1;
break;
}
diff = current[c + common] - w.s[i2];
if (diff != 0) break;
common++;
}
if (diff < 0) {
j = k;
common_j = common;
} else {
i = k;
common_i = common;
}
if (j - i <= 1) {
if (i > 0) break; // v->s has been inspected
if (j == i) break; // only one item in v
// - but now we need to go round once more to get
// v->s inspected. This looks messy, but is actually
// the optimal approach.
if (first_key_inspected) break;
first_key_inspected = true;
}
}
while(true) {
Among w = v[i];
if (common_i >= w.s_size) {
cursor = c + w.s_size;
if (w.method == null) return w.result;
boolean res;
try {
Object resobj = w.method.invoke(w.methodobject,
new Object[0]);
res = resobj.toString().equals("true");
} catch (InvocationTargetException e) {
res = false;
// FIXME - debug message
} catch (IllegalAccessException e) {
res = false;
// FIXME - debug message
}
cursor = c + w.s_size;
if (res) return w.result;
}
i = w.substring_i;
if (i < 0) return 0;
}
}
// find_among_b is for backwards processing. Same comments apply
protected int find_among_b(Among v[], int v_size)
{
int i = 0;
int j = v_size;
int c = cursor;
int lb = limit_backward;
int common_i = 0;
int common_j = 0;
boolean first_key_inspected = false;
while(true) {
int k = i + ((j - i) >> 1);
int diff = 0;
int common = common_i < common_j ? common_i : common_j;
Among w = v[k];
int i2;
for (i2 = w.s_size - 1 - common; i2 >= 0; i2--) {
if (c - common == lb) {
diff = -1;
break;
}
diff = current[c - 1 - common] - w.s[i2];
if (diff != 0) break;
common++;
}
if (diff < 0) {
j = k;
common_j = common;
} else {
i = k;
common_i = common;
}
if (j - i <= 1) {
if (i > 0) break;
if (j == i) break;
if (first_key_inspected) break;
first_key_inspected = true;
}
}
while(true) {
Among w = v[i];
if (common_i >= w.s_size) {
cursor = c - w.s_size;
if (w.method == null) return w.result;
boolean res;
try {
Object resobj = w.method.invoke(w.methodobject,
new Object[0]);
res = resobj.toString().equals("true");
} catch (InvocationTargetException e) {
res = false;
// FIXME - debug message
} catch (IllegalAccessException e) {
res = false;
// FIXME - debug message
}
cursor = c - w.s_size;
if (res) return w.result;
}
i = w.substring_i;
if (i < 0) return 0;
}
}
/* to replace chars between c_bra and c_ket in current by the
* chars in s.
*/
protected int replace_s(int c_bra, int c_ket, CharSequence s)
{
final int adjustment = s.length() - (c_ket - c_bra);
final int newLength = limit + adjustment;
//resize if necessary
if (newLength > current.length) {
char newBuffer[] = new char[ArrayUtil.oversize(newLength, RamUsageEstimator.NUM_BYTES_CHAR)];
System.arraycopy(current, 0, newBuffer, 0, limit);
current = newBuffer;
}
// if the substring being replaced is longer or shorter than the
// replacement, need to shift things around
if (adjustment != 0 && c_ket < limit) {
System.arraycopy(current, c_ket, current, c_bra + s.length(),
limit - c_ket);
}
// insert the replacement text
// Note, faster is s.getChars(0, s.length(), current, c_bra);
// but would have to duplicate this method for both String and StringBuilder
for (int i = 0; i < s.length(); i++)
current[c_bra + i] = s.charAt(i);
limit += adjustment;
if (cursor >= c_ket) cursor += adjustment;
else if (cursor > c_bra) cursor = c_bra;
return adjustment;
}
/** @deprecated for binary back compat. Will be removed in Lucene 4.0 */
@Deprecated
protected int replace_s(int c_bra, int c_ket, String s) {
return replace_s(c_bra, c_ket, (CharSequence)s);
}
protected void slice_check()
{
if (bra < 0 ||
bra > ket ||
ket > limit)
{
System.err.println("faulty slice operation");
// FIXME: report error somehow.
/*
fprintf(stderr, "faulty slice operation:\n");
debug(z, -1, 0);
exit(1);
*/
}
}
protected void slice_from(CharSequence s)
{
slice_check();
replace_s(bra, ket, s);
}
/** @deprecated for binary back compat. Will be removed in Lucene 4.0 */
@Deprecated
protected void slice_from(String s)
{
slice_from((CharSequence)s);
}
/** @deprecated for binary back compat. Will be removed in Lucene 4.0 */
@Deprecated
protected void slice_from(StringBuilder s)
{
slice_from((CharSequence)s);
}
protected void slice_del()
{
slice_from((CharSequence)"");
}
protected void insert(int c_bra, int c_ket, CharSequence s)
{
int adjustment = replace_s(c_bra, c_ket, s);
if (c_bra <= bra) bra += adjustment;
if (c_bra <= ket) ket += adjustment;
}
/** @deprecated for binary back compat. Will be removed in Lucene 4.0 */
@Deprecated
protected void insert(int c_bra, int c_ket, String s)
{
insert(c_bra, c_ket, (CharSequence)s);
}
/** @deprecated for binary back compat. Will be removed in Lucene 4.0 */
@Deprecated
protected void insert(int c_bra, int c_ket, StringBuilder s)
{
insert(c_bra, c_ket, (CharSequence)s);
}
/* Copy the slice into the supplied StringBuffer */
protected StringBuilder slice_to(StringBuilder s)
{
slice_check();
int len = ket - bra;
s.setLength(0);
s.append(current, bra, len);
return s;
}
protected StringBuilder assign_to(StringBuilder s)
{
s.setLength(0);
s.append(current, 0, limit);
return s;
}
/*
extern void debug(struct SN_env * z, int number, int line_count)
{ int i;
int limit = SIZE(z->p);
//if (number >= 0) printf("%3d (line %4d): '", number, line_count);
if (number >= 0) printf("%3d (line %4d): [%d]'", number, line_count,limit);
for (i = 0; i <= limit; i++)
{ if (z->lb == i) printf("{");
if (z->bra == i) printf("[");
if (z->c == i) printf("|");
if (z->ket == i) printf("]");
if (z->l == i) printf("}");
if (i < limit)
{ int ch = z->p[i];
if (ch == 0) ch = '#';
printf("%c", ch);
}
}
printf("'\n");
}
*/
};
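A compact sketch of the char[] contract documented at getCurrentBuffer(): the returned array may differ from the one passed to setCurrent, and only the first getCurrentBufferLength() chars are valid. The stemmer choice and class name here are arbitrary:

import org.tartarus.snowball.SnowballProgram;
import org.tartarus.snowball.ext.EnglishStemmer;

public class BufferStemDemo {
  public static void main(String[] args) {
    char[] buffer = "stemming".toCharArray();
    SnowballProgram stemmer = new EnglishStemmer();
    stemmer.setCurrent(buffer, buffer.length);    // hand over the raw buffer and its valid length
    stemmer.stem();
    char[] result = stemmer.getCurrentBuffer();   // may or may not be the same array as 'buffer'
    int length = stemmer.getCurrentBufferLength();
    System.out.println(new String(result, 0, length));
  }
}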


@ -0,0 +1,108 @@
/*
Copyright (c) 2001, Dr Martin Porter
Copyright (c) 2002, Richard Boulton
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
* Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* Neither the name of the copyright holders nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE
FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
package org.tartarus.snowball;
import java.lang.reflect.Method;
import java.io.Reader;
import java.io.Writer;
import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.FileInputStream;
import java.io.InputStreamReader;
import java.io.OutputStreamWriter;
import java.io.OutputStream;
import java.io.FileOutputStream;
public class TestApp {
private static void usage()
{
System.err.println("Usage: TestApp <algorithm> <input file> [-o <output file>]");
}
public static void main(String [] args) throws Throwable {
if (args.length < 2) {
usage();
return;
}
Class stemClass = Class.forName("org.tartarus.snowball.ext." +
args[0] + "Stemmer");
SnowballProgram stemmer = (SnowballProgram) stemClass.newInstance();
Method stemMethod = stemClass.getMethod("stem", new Class[0]);
Reader reader;
reader = new InputStreamReader(new FileInputStream(args[1]));
reader = new BufferedReader(reader);
StringBuffer input = new StringBuffer();
OutputStream outstream;
if (args.length > 2) {
if (args.length == 4 && args[2].equals("-o")) {
outstream = new FileOutputStream(args[3]);
} else {
usage();
return;
}
} else {
outstream = System.out;
}
Writer output = new OutputStreamWriter(outstream);
output = new BufferedWriter(output);
int repeat = 1;
if (args.length > 4) {
repeat = Integer.parseInt(args[4]);
}
Object [] emptyArgs = new Object[0];
int character;
while ((character = reader.read()) != -1) {
char ch = (char) character;
if (Character.isWhitespace((char) ch)) {
if (input.length() > 0) {
stemmer.setCurrent(input.toString());
for (int i = repeat; i != 0; i--) {
stemMethod.invoke(stemmer, emptyArgs);
}
output.write(stemmer.getCurrent());
output.write('\n');
input.delete(0, input.length());
}
} else {
input.append(Character.toLowerCase(ch));
}
}
output.flush();
}
}


@ -0,0 +1,423 @@
// This file was generated automatically by the Snowball to Java compiler
package org.tartarus.snowball.ext;
import org.tartarus.snowball.SnowballProgram;
import org.tartarus.snowball.Among;
/**
* Generated class implementing code defined by a snowball script.
*/
public class DanishStemmer extends SnowballProgram {
private Among a_0[] = {
new Among ( "hed", -1, 1, "", this),
new Among ( "ethed", 0, 1, "", this),
new Among ( "ered", -1, 1, "", this),
new Among ( "e", -1, 1, "", this),
new Among ( "erede", 3, 1, "", this),
new Among ( "ende", 3, 1, "", this),
new Among ( "erende", 5, 1, "", this),
new Among ( "ene", 3, 1, "", this),
new Among ( "erne", 3, 1, "", this),
new Among ( "ere", 3, 1, "", this),
new Among ( "en", -1, 1, "", this),
new Among ( "heden", 10, 1, "", this),
new Among ( "eren", 10, 1, "", this),
new Among ( "er", -1, 1, "", this),
new Among ( "heder", 13, 1, "", this),
new Among ( "erer", 13, 1, "", this),
new Among ( "s", -1, 2, "", this),
new Among ( "heds", 16, 1, "", this),
new Among ( "es", 16, 1, "", this),
new Among ( "endes", 18, 1, "", this),
new Among ( "erendes", 19, 1, "", this),
new Among ( "enes", 18, 1, "", this),
new Among ( "ernes", 18, 1, "", this),
new Among ( "eres", 18, 1, "", this),
new Among ( "ens", 16, 1, "", this),
new Among ( "hedens", 24, 1, "", this),
new Among ( "erens", 24, 1, "", this),
new Among ( "ers", 16, 1, "", this),
new Among ( "ets", 16, 1, "", this),
new Among ( "erets", 28, 1, "", this),
new Among ( "et", -1, 1, "", this),
new Among ( "eret", 30, 1, "", this)
};
private Among a_1[] = {
new Among ( "gd", -1, -1, "", this),
new Among ( "dt", -1, -1, "", this),
new Among ( "gt", -1, -1, "", this),
new Among ( "kt", -1, -1, "", this)
};
private Among a_2[] = {
new Among ( "ig", -1, 1, "", this),
new Among ( "lig", 0, 1, "", this),
new Among ( "elig", 1, 1, "", this),
new Among ( "els", -1, 1, "", this),
new Among ( "l\u00F8st", -1, 2, "", this)
};
private static final char g_v[] = {17, 65, 16, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 48, 0, 128 };
private static final char g_s_ending[] = {239, 254, 42, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 16 };
private int I_x;
private int I_p1;
private StringBuilder S_ch = new StringBuilder();
private void copy_from(DanishStemmer other) {
I_x = other.I_x;
I_p1 = other.I_p1;
S_ch = other.S_ch;
super.copy_from(other);
}
private boolean r_mark_regions() {
int v_1;
int v_2;
// (, line 29
I_p1 = limit;
// test, line 33
v_1 = cursor;
// (, line 33
// hop, line 33
{
int c = cursor + 3;
if (0 > c || c > limit)
{
return false;
}
cursor = c;
}
// setmark x, line 33
I_x = cursor;
cursor = v_1;
// goto, line 34
golab0: while(true)
{
v_2 = cursor;
lab1: do {
if (!(in_grouping(g_v, 97, 248)))
{
break lab1;
}
cursor = v_2;
break golab0;
} while (false);
cursor = v_2;
if (cursor >= limit)
{
return false;
}
cursor++;
}
// gopast, line 34
golab2: while(true)
{
lab3: do {
if (!(out_grouping(g_v, 97, 248)))
{
break lab3;
}
break golab2;
} while (false);
if (cursor >= limit)
{
return false;
}
cursor++;
}
// setmark p1, line 34
I_p1 = cursor;
// try, line 35
lab4: do {
// (, line 35
if (!(I_p1 < I_x))
{
break lab4;
}
I_p1 = I_x;
} while (false);
return true;
}
private boolean r_main_suffix() {
int among_var;
int v_1;
int v_2;
// (, line 40
// setlimit, line 41
v_1 = limit - cursor;
// tomark, line 41
if (cursor < I_p1)
{
return false;
}
cursor = I_p1;
v_2 = limit_backward;
limit_backward = cursor;
cursor = limit - v_1;
// (, line 41
// [, line 41
ket = cursor;
// substring, line 41
among_var = find_among_b(a_0, 32);
if (among_var == 0)
{
limit_backward = v_2;
return false;
}
// ], line 41
bra = cursor;
limit_backward = v_2;
switch(among_var) {
case 0:
return false;
case 1:
// (, line 48
// delete, line 48
slice_del();
break;
case 2:
// (, line 50
if (!(in_grouping_b(g_s_ending, 97, 229)))
{
return false;
}
// delete, line 50
slice_del();
break;
}
return true;
}
private boolean r_consonant_pair() {
int v_1;
int v_2;
int v_3;
// (, line 54
// test, line 55
v_1 = limit - cursor;
// (, line 55
// setlimit, line 56
v_2 = limit - cursor;
// tomark, line 56
if (cursor < I_p1)
{
return false;
}
cursor = I_p1;
v_3 = limit_backward;
limit_backward = cursor;
cursor = limit - v_2;
// (, line 56
// [, line 56
ket = cursor;
// substring, line 56
if (find_among_b(a_1, 4) == 0)
{
limit_backward = v_3;
return false;
}
// ], line 56
bra = cursor;
limit_backward = v_3;
cursor = limit - v_1;
// next, line 62
if (cursor <= limit_backward)
{
return false;
}
cursor--;
// ], line 62
bra = cursor;
// delete, line 62
slice_del();
return true;
}
private boolean r_other_suffix() {
int among_var;
int v_1;
int v_2;
int v_3;
int v_4;
// (, line 65
// do, line 66
v_1 = limit - cursor;
lab0: do {
// (, line 66
// [, line 66
ket = cursor;
// literal, line 66
if (!(eq_s_b(2, "st")))
{
break lab0;
}
// ], line 66
bra = cursor;
// literal, line 66
if (!(eq_s_b(2, "ig")))
{
break lab0;
}
// delete, line 66
slice_del();
} while (false);
cursor = limit - v_1;
// setlimit, line 67
v_2 = limit - cursor;
// tomark, line 67
if (cursor < I_p1)
{
return false;
}
cursor = I_p1;
v_3 = limit_backward;
limit_backward = cursor;
cursor = limit - v_2;
// (, line 67
// [, line 67
ket = cursor;
// substring, line 67
among_var = find_among_b(a_2, 5);
if (among_var == 0)
{
limit_backward = v_3;
return false;
}
// ], line 67
bra = cursor;
limit_backward = v_3;
switch(among_var) {
case 0:
return false;
case 1:
// (, line 70
// delete, line 70
slice_del();
// do, line 70
v_4 = limit - cursor;
lab1: do {
// call consonant_pair, line 70
if (!r_consonant_pair())
{
break lab1;
}
} while (false);
cursor = limit - v_4;
break;
case 2:
// (, line 72
// <-, line 72
slice_from("l\u00F8s");
break;
}
return true;
}
private boolean r_undouble() {
int v_1;
int v_2;
// (, line 75
// setlimit, line 76
v_1 = limit - cursor;
// tomark, line 76
if (cursor < I_p1)
{
return false;
}
cursor = I_p1;
v_2 = limit_backward;
limit_backward = cursor;
cursor = limit - v_1;
// (, line 76
// [, line 76
ket = cursor;
if (!(out_grouping_b(g_v, 97, 248)))
{
limit_backward = v_2;
return false;
}
// ], line 76
bra = cursor;
// -> ch, line 76
S_ch = slice_to(S_ch);
limit_backward = v_2;
// name ch, line 77
if (!(eq_v_b(S_ch)))
{
return false;
}
// delete, line 78
slice_del();
return true;
}
public boolean stem() {
int v_1;
int v_2;
int v_3;
int v_4;
int v_5;
// (, line 82
// do, line 84
v_1 = cursor;
lab0: do {
// call mark_regions, line 84
if (!r_mark_regions())
{
break lab0;
}
} while (false);
cursor = v_1;
// backwards, line 85
limit_backward = cursor; cursor = limit;
// (, line 85
// do, line 86
v_2 = limit - cursor;
lab1: do {
// call main_suffix, line 86
if (!r_main_suffix())
{
break lab1;
}
} while (false);
cursor = limit - v_2;
// do, line 87
v_3 = limit - cursor;
lab2: do {
// call consonant_pair, line 87
if (!r_consonant_pair())
{
break lab2;
}
} while (false);
cursor = limit - v_3;
// do, line 88
v_4 = limit - cursor;
lab3: do {
// call other_suffix, line 88
if (!r_other_suffix())
{
break lab3;
}
} while (false);
cursor = limit - v_4;
// do, line 89
v_5 = limit - cursor;
lab4: do {
// call undouble, line 89
if (!r_undouble())
{
break lab4;
}
} while (false);
cursor = limit - v_5;
cursor = limit_backward; return true;
}
}


@ -0,0 +1,837 @@
// This file was generated automatically by the Snowball to Java compiler
package org.tartarus.snowball.ext;
import org.tartarus.snowball.SnowballProgram;
import org.tartarus.snowball.Among;
/**
* Generated class implementing code defined by a snowball script.
*/
public class DutchStemmer extends SnowballProgram {
private Among a_0[] = {
new Among ( "", -1, 6, "", this),
new Among ( "\u00E1", 0, 1, "", this),
new Among ( "\u00E4", 0, 1, "", this),
new Among ( "\u00E9", 0, 2, "", this),
new Among ( "\u00EB", 0, 2, "", this),
new Among ( "\u00ED", 0, 3, "", this),
new Among ( "\u00EF", 0, 3, "", this),
new Among ( "\u00F3", 0, 4, "", this),
new Among ( "\u00F6", 0, 4, "", this),
new Among ( "\u00FA", 0, 5, "", this),
new Among ( "\u00FC", 0, 5, "", this)
};
private Among a_1[] = {
new Among ( "", -1, 3, "", this),
new Among ( "I", 0, 2, "", this),
new Among ( "Y", 0, 1, "", this)
};
private Among a_2[] = {
new Among ( "dd", -1, -1, "", this),
new Among ( "kk", -1, -1, "", this),
new Among ( "tt", -1, -1, "", this)
};
private Among a_3[] = {
new Among ( "ene", -1, 2, "", this),
new Among ( "se", -1, 3, "", this),
new Among ( "en", -1, 2, "", this),
new Among ( "heden", 2, 1, "", this),
new Among ( "s", -1, 3, "", this)
};
private Among a_4[] = {
new Among ( "end", -1, 1, "", this),
new Among ( "ig", -1, 2, "", this),
new Among ( "ing", -1, 1, "", this),
new Among ( "lijk", -1, 3, "", this),
new Among ( "baar", -1, 4, "", this),
new Among ( "bar", -1, 5, "", this)
};
private Among a_5[] = {
new Among ( "aa", -1, -1, "", this),
new Among ( "ee", -1, -1, "", this),
new Among ( "oo", -1, -1, "", this),
new Among ( "uu", -1, -1, "", this)
};
private static final char g_v[] = {17, 65, 16, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 128 };
private static final char g_v_I[] = {1, 0, 0, 17, 65, 16, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 128 };
private static final char g_v_j[] = {17, 67, 16, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 128 };
private int I_p2;
private int I_p1;
private boolean B_e_found;
private void copy_from(DutchStemmer other) {
I_p2 = other.I_p2;
I_p1 = other.I_p1;
B_e_found = other.B_e_found;
super.copy_from(other);
}
private boolean r_prelude() {
int among_var;
int v_1;
int v_2;
int v_3;
int v_4;
int v_5;
int v_6;
// (, line 41
// test, line 42
v_1 = cursor;
// repeat, line 42
replab0: while(true)
{
v_2 = cursor;
lab1: do {
// (, line 42
// [, line 43
bra = cursor;
// substring, line 43
among_var = find_among(a_0, 11);
if (among_var == 0)
{
break lab1;
}
// ], line 43
ket = cursor;
switch(among_var) {
case 0:
break lab1;
case 1:
// (, line 45
// <-, line 45
slice_from("a");
break;
case 2:
// (, line 47
// <-, line 47
slice_from("e");
break;
case 3:
// (, line 49
// <-, line 49
slice_from("i");
break;
case 4:
// (, line 51
// <-, line 51
slice_from("o");
break;
case 5:
// (, line 53
// <-, line 53
slice_from("u");
break;
case 6:
// (, line 54
// next, line 54
if (cursor >= limit)
{
break lab1;
}
cursor++;
break;
}
continue replab0;
} while (false);
cursor = v_2;
break replab0;
}
cursor = v_1;
// try, line 57
v_3 = cursor;
lab2: do {
// (, line 57
// [, line 57
bra = cursor;
// literal, line 57
if (!(eq_s(1, "y")))
{
cursor = v_3;
break lab2;
}
// ], line 57
ket = cursor;
// <-, line 57
slice_from("Y");
} while (false);
// repeat, line 58
replab3: while(true)
{
v_4 = cursor;
lab4: do {
// goto, line 58
golab5: while(true)
{
v_5 = cursor;
lab6: do {
// (, line 58
if (!(in_grouping(g_v, 97, 232)))
{
break lab6;
}
// [, line 59
bra = cursor;
// or, line 59
lab7: do {
v_6 = cursor;
lab8: do {
// (, line 59
// literal, line 59
if (!(eq_s(1, "i")))
{
break lab8;
}
// ], line 59
ket = cursor;
if (!(in_grouping(g_v, 97, 232)))
{
break lab8;
}
// <-, line 59
slice_from("I");
break lab7;
} while (false);
cursor = v_6;
// (, line 60
// literal, line 60
if (!(eq_s(1, "y")))
{
break lab6;
}
// ], line 60
ket = cursor;
// <-, line 60
slice_from("Y");
} while (false);
cursor = v_5;
break golab5;
} while (false);
cursor = v_5;
if (cursor >= limit)
{
break lab4;
}
cursor++;
}
continue replab3;
} while (false);
cursor = v_4;
break replab3;
}
return true;
}
private boolean r_mark_regions() {
// (, line 64
I_p1 = limit;
I_p2 = limit;
// gopast, line 69
golab0: while(true)
{
lab1: do {
if (!(in_grouping(g_v, 97, 232)))
{
break lab1;
}
break golab0;
} while (false);
if (cursor >= limit)
{
return false;
}
cursor++;
}
// gopast, line 69
golab2: while(true)
{
lab3: do {
if (!(out_grouping(g_v, 97, 232)))
{
break lab3;
}
break golab2;
} while (false);
if (cursor >= limit)
{
return false;
}
cursor++;
}
// setmark p1, line 69
I_p1 = cursor;
// try, line 70
lab4: do {
// (, line 70
if (!(I_p1 < 3))
{
break lab4;
}
I_p1 = 3;
} while (false);
// gopast, line 71
golab5: while(true)
{
lab6: do {
if (!(in_grouping(g_v, 97, 232)))
{
break lab6;
}
break golab5;
} while (false);
if (cursor >= limit)
{
return false;
}
cursor++;
}
// gopast, line 71
golab7: while(true)
{
lab8: do {
if (!(out_grouping(g_v, 97, 232)))
{
break lab8;
}
break golab7;
} while (false);
if (cursor >= limit)
{
return false;
}
cursor++;
}
// setmark p2, line 71
I_p2 = cursor;
return true;
}
private boolean r_postlude() {
int among_var;
int v_1;
// repeat, line 75
replab0: while(true)
{
v_1 = cursor;
lab1: do {
// (, line 75
// [, line 77
bra = cursor;
// substring, line 77
among_var = find_among(a_1, 3);
if (among_var == 0)
{
break lab1;
}
// ], line 77
ket = cursor;
switch(among_var) {
case 0:
break lab1;
case 1:
// (, line 78
// <-, line 78
slice_from("y");
break;
case 2:
// (, line 79
// <-, line 79
slice_from("i");
break;
case 3:
// (, line 80
// next, line 80
if (cursor >= limit)
{
break lab1;
}
cursor++;
break;
}
continue replab0;
} while (false);
cursor = v_1;
break replab0;
}
return true;
}
private boolean r_R1() {
if (!(I_p1 <= cursor))
{
return false;
}
return true;
}
private boolean r_R2() {
if (!(I_p2 <= cursor))
{
return false;
}
return true;
}
private boolean r_undouble() {
int v_1;
// (, line 90
// test, line 91
v_1 = limit - cursor;
// among, line 91
if (find_among_b(a_2, 3) == 0)
{
return false;
}
cursor = limit - v_1;
// [, line 91
ket = cursor;
// next, line 91
if (cursor <= limit_backward)
{
return false;
}
cursor--;
// ], line 91
bra = cursor;
// delete, line 91
slice_del();
return true;
}
private boolean r_e_ending() {
int v_1;
// (, line 94
// unset e_found, line 95
B_e_found = false;
// [, line 96
ket = cursor;
// literal, line 96
if (!(eq_s_b(1, "e")))
{
return false;
}
// ], line 96
bra = cursor;
// call R1, line 96
if (!r_R1())
{
return false;
}
// test, line 96
v_1 = limit - cursor;
if (!(out_grouping_b(g_v, 97, 232)))
{
return false;
}
cursor = limit - v_1;
// delete, line 96
slice_del();
// set e_found, line 97
B_e_found = true;
// call undouble, line 98
if (!r_undouble())
{
return false;
}
return true;
}
private boolean r_en_ending() {
int v_1;
int v_2;
// (, line 101
// call R1, line 102
if (!r_R1())
{
return false;
}
// and, line 102
v_1 = limit - cursor;
if (!(out_grouping_b(g_v, 97, 232)))
{
return false;
}
cursor = limit - v_1;
// not, line 102
{
v_2 = limit - cursor;
lab0: do {
// literal, line 102
if (!(eq_s_b(3, "gem")))
{
break lab0;
}
return false;
} while (false);
cursor = limit - v_2;
}
// delete, line 102
slice_del();
// call undouble, line 103
if (!r_undouble())
{
return false;
}
return true;
}
private boolean r_standard_suffix() {
int among_var;
int v_1;
int v_2;
int v_3;
int v_4;
int v_5;
int v_6;
int v_7;
int v_8;
int v_9;
int v_10;
// (, line 106
// do, line 107
v_1 = limit - cursor;
lab0: do {
// (, line 107
// [, line 108
ket = cursor;
// substring, line 108
among_var = find_among_b(a_3, 5);
if (among_var == 0)
{
break lab0;
}
// ], line 108
bra = cursor;
switch(among_var) {
case 0:
break lab0;
case 1:
// (, line 110
// call R1, line 110
if (!r_R1())
{
break lab0;
}
// <-, line 110
slice_from("heid");
break;
case 2:
// (, line 113
// call en_ending, line 113
if (!r_en_ending())
{
break lab0;
}
break;
case 3:
// (, line 116
// call R1, line 116
if (!r_R1())
{
break lab0;
}
if (!(out_grouping_b(g_v_j, 97, 232)))
{
break lab0;
}
// delete, line 116
slice_del();
break;
}
} while (false);
cursor = limit - v_1;
// do, line 120
v_2 = limit - cursor;
lab1: do {
// call e_ending, line 120
if (!r_e_ending())
{
break lab1;
}
} while (false);
cursor = limit - v_2;
// do, line 122
v_3 = limit - cursor;
lab2: do {
// (, line 122
// [, line 122
ket = cursor;
// literal, line 122
if (!(eq_s_b(4, "heid")))
{
break lab2;
}
// ], line 122
bra = cursor;
// call R2, line 122
if (!r_R2())
{
break lab2;
}
// not, line 122
{
v_4 = limit - cursor;
lab3: do {
// literal, line 122
if (!(eq_s_b(1, "c")))
{
break lab3;
}
break lab2;
} while (false);
cursor = limit - v_4;
}
// delete, line 122
slice_del();
// [, line 123
ket = cursor;
// literal, line 123
if (!(eq_s_b(2, "en")))
{
break lab2;
}
// ], line 123
bra = cursor;
// call en_ending, line 123
if (!r_en_ending())
{
break lab2;
}
} while (false);
cursor = limit - v_3;
// do, line 126
v_5 = limit - cursor;
lab4: do {
// (, line 126
// [, line 127
ket = cursor;
// substring, line 127
among_var = find_among_b(a_4, 6);
if (among_var == 0)
{
break lab4;
}
// ], line 127
bra = cursor;
switch(among_var) {
case 0:
break lab4;
case 1:
// (, line 129
// call R2, line 129
if (!r_R2())
{
break lab4;
}
// delete, line 129
slice_del();
// or, line 130
lab5: do {
v_6 = limit - cursor;
lab6: do {
// (, line 130
// [, line 130
ket = cursor;
// literal, line 130
if (!(eq_s_b(2, "ig")))
{
break lab6;
}
// ], line 130
bra = cursor;
// call R2, line 130
if (!r_R2())
{
break lab6;
}
// not, line 130
{
v_7 = limit - cursor;
lab7: do {
// literal, line 130
if (!(eq_s_b(1, "e")))
{
break lab7;
}
break lab6;
} while (false);
cursor = limit - v_7;
}
// delete, line 130
slice_del();
break lab5;
} while (false);
cursor = limit - v_6;
// call undouble, line 130
if (!r_undouble())
{
break lab4;
}
} while (false);
break;
case 2:
// (, line 133
// call R2, line 133
if (!r_R2())
{
break lab4;
}
// not, line 133
{
v_8 = limit - cursor;
lab8: do {
// literal, line 133
if (!(eq_s_b(1, "e")))
{
break lab8;
}
break lab4;
} while (false);
cursor = limit - v_8;
}
// delete, line 133
slice_del();
break;
case 3:
// (, line 136
// call R2, line 136
if (!r_R2())
{
break lab4;
}
// delete, line 136
slice_del();
// call e_ending, line 136
if (!r_e_ending())
{
break lab4;
}
break;
case 4:
// (, line 139
// call R2, line 139
if (!r_R2())
{
break lab4;
}
// delete, line 139
slice_del();
break;
case 5:
// (, line 142
// call R2, line 142
if (!r_R2())
{
break lab4;
}
// Boolean test e_found, line 142
if (!(B_e_found))
{
break lab4;
}
// delete, line 142
slice_del();
break;
}
} while (false);
cursor = limit - v_5;
// do, line 146
v_9 = limit - cursor;
lab9: do {
// (, line 146
if (!(out_grouping_b(g_v_I, 73, 232)))
{
break lab9;
}
// test, line 148
v_10 = limit - cursor;
// (, line 148
// among, line 149
if (find_among_b(a_5, 4) == 0)
{
break lab9;
}
if (!(out_grouping_b(g_v, 97, 232)))
{
break lab9;
}
cursor = limit - v_10;
// [, line 152
ket = cursor;
// next, line 152
if (cursor <= limit_backward)
{
break lab9;
}
cursor--;
// ], line 152
bra = cursor;
// delete, line 152
slice_del();
} while (false);
cursor = limit - v_9;
return true;
}
public boolean stem() {
int v_1;
int v_2;
int v_3;
int v_4;
// (, line 157
// do, line 159
v_1 = cursor;
lab0: do {
// call prelude, line 159
if (!r_prelude())
{
break lab0;
}
} while (false);
cursor = v_1;
// do, line 160
v_2 = cursor;
lab1: do {
// call mark_regions, line 160
if (!r_mark_regions())
{
break lab1;
}
} while (false);
cursor = v_2;
// backwards, line 161
limit_backward = cursor; cursor = limit;
// do, line 162
v_3 = limit - cursor;
lab2: do {
// call standard_suffix, line 162
if (!r_standard_suffix())
{
break lab2;
}
} while (false);
cursor = limit - v_3;
cursor = limit_backward; // do, line 163
v_4 = cursor;
lab3: do {
// call postlude, line 163
if (!r_postlude())
{
break lab3;
}
} while (false);
cursor = v_4;
return true;
}
}
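The control flow in these generated classes leans entirely on labeled do { ... } while (false) blocks: a failed test breaks out of the label, and the cursor saved in a v_N variable is restored afterwards, which is how Snowball's do/try/or backtracking is expressed in Java. A minimal, self-contained illustration of that idiom (the word and helper below are made up for the sketch, not taken from this commit):
public class SnowballControlFlowDemo {
    private static int cursor = 0;
    // Hypothetical helper: advance the cursor past `prefix` if it matches at the current position.
    private static boolean tryConsume(String s, String prefix) {
        if (!s.startsWith(prefix, cursor)) {
            return false;
        }
        cursor += prefix.length();
        return true;
    }
    public static void main(String[] args) {
        String word = "gelukkig";
        int v_1 = cursor;              // save the position before the attempt
        lab0: do {
            if (!tryConsume(word, "on")) {
                break lab0;            // attempt failed: skip the rest of this block
            }
            System.out.println("matched optional prefix");
        } while (false);
        cursor = v_1;                  // restore unconditionally, like Snowball's "do"
        System.out.println("cursor after attempt: " + cursor);
    }
}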

File diff suppressed because it is too large

File diff suppressed because it is too large

File diff suppressed because it is too large

View File

@ -0,0 +1,726 @@
// This file was generated automatically by the Snowball to Java compiler
package org.tartarus.snowball.ext;
import org.tartarus.snowball.SnowballProgram;
import org.tartarus.snowball.Among;
/**
* Generated class implementing code defined by a snowball script.
*/
public class German2Stemmer extends SnowballProgram {
private Among a_0[] = {
new Among ( "", -1, 6, "", this),
new Among ( "ae", 0, 2, "", this),
new Among ( "oe", 0, 3, "", this),
new Among ( "qu", 0, 5, "", this),
new Among ( "ue", 0, 4, "", this),
new Among ( "\u00DF", 0, 1, "", this)
};
private Among a_1[] = {
new Among ( "", -1, 6, "", this),
new Among ( "U", 0, 2, "", this),
new Among ( "Y", 0, 1, "", this),
new Among ( "\u00E4", 0, 3, "", this),
new Among ( "\u00F6", 0, 4, "", this),
new Among ( "\u00FC", 0, 5, "", this)
};
private Among a_2[] = {
new Among ( "e", -1, 1, "", this),
new Among ( "em", -1, 1, "", this),
new Among ( "en", -1, 1, "", this),
new Among ( "ern", -1, 1, "", this),
new Among ( "er", -1, 1, "", this),
new Among ( "s", -1, 2, "", this),
new Among ( "es", 5, 1, "", this)
};
private Among a_3[] = {
new Among ( "en", -1, 1, "", this),
new Among ( "er", -1, 1, "", this),
new Among ( "st", -1, 2, "", this),
new Among ( "est", 2, 1, "", this)
};
private Among a_4[] = {
new Among ( "ig", -1, 1, "", this),
new Among ( "lich", -1, 1, "", this)
};
private Among a_5[] = {
new Among ( "end", -1, 1, "", this),
new Among ( "ig", -1, 2, "", this),
new Among ( "ung", -1, 1, "", this),
new Among ( "lich", -1, 3, "", this),
new Among ( "isch", -1, 2, "", this),
new Among ( "ik", -1, 2, "", this),
new Among ( "heit", -1, 3, "", this),
new Among ( "keit", -1, 4, "", this)
};
private static final char g_v[] = {17, 65, 16, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 8, 0, 32, 8 };
private static final char g_s_ending[] = {117, 30, 5 };
private static final char g_st_ending[] = {117, 30, 4 };
private int I_x;
private int I_p2;
private int I_p1;
private void copy_from(German2Stemmer other) {
I_x = other.I_x;
I_p2 = other.I_p2;
I_p1 = other.I_p1;
super.copy_from(other);
}
private boolean r_prelude() {
int among_var;
int v_1;
int v_2;
int v_3;
int v_4;
int v_5;
// (, line 28
// test, line 30
v_1 = cursor;
// repeat, line 30
replab0: while(true)
{
v_2 = cursor;
lab1: do {
// goto, line 30
golab2: while(true)
{
v_3 = cursor;
lab3: do {
// (, line 30
if (!(in_grouping(g_v, 97, 252)))
{
break lab3;
}
// [, line 31
bra = cursor;
// or, line 31
lab4: do {
v_4 = cursor;
lab5: do {
// (, line 31
// literal, line 31
if (!(eq_s(1, "u")))
{
break lab5;
}
// ], line 31
ket = cursor;
if (!(in_grouping(g_v, 97, 252)))
{
break lab5;
}
// <-, line 31
slice_from("U");
break lab4;
} while (false);
cursor = v_4;
// (, line 32
// literal, line 32
if (!(eq_s(1, "y")))
{
break lab3;
}
// ], line 32
ket = cursor;
if (!(in_grouping(g_v, 97, 252)))
{
break lab3;
}
// <-, line 32
slice_from("Y");
} while (false);
cursor = v_3;
break golab2;
} while (false);
cursor = v_3;
if (cursor >= limit)
{
break lab1;
}
cursor++;
}
continue replab0;
} while (false);
cursor = v_2;
break replab0;
}
cursor = v_1;
// repeat, line 35
replab6: while(true)
{
v_5 = cursor;
lab7: do {
// (, line 35
// [, line 36
bra = cursor;
// substring, line 36
among_var = find_among(a_0, 6);
if (among_var == 0)
{
break lab7;
}
// ], line 36
ket = cursor;
switch(among_var) {
case 0:
break lab7;
case 1:
// (, line 37
// <-, line 37
slice_from("ss");
break;
case 2:
// (, line 38
// <-, line 38
slice_from("\u00E4");
break;
case 3:
// (, line 39
// <-, line 39
slice_from("\u00F6");
break;
case 4:
// (, line 40
// <-, line 40
slice_from("\u00FC");
break;
case 5:
// (, line 41
// hop, line 41
{
int c = cursor + 2;
if (0 > c || c > limit)
{
break lab7;
}
cursor = c;
}
break;
case 6:
// (, line 42
// next, line 42
if (cursor >= limit)
{
break lab7;
}
cursor++;
break;
}
continue replab6;
} while (false);
cursor = v_5;
break replab6;
}
return true;
}
private boolean r_mark_regions() {
int v_1;
// (, line 48
I_p1 = limit;
I_p2 = limit;
// test, line 53
v_1 = cursor;
// (, line 53
// hop, line 53
{
int c = cursor + 3;
if (0 > c || c > limit)
{
return false;
}
cursor = c;
}
// setmark x, line 53
I_x = cursor;
cursor = v_1;
// gopast, line 55
golab0: while(true)
{
lab1: do {
if (!(in_grouping(g_v, 97, 252)))
{
break lab1;
}
break golab0;
} while (false);
if (cursor >= limit)
{
return false;
}
cursor++;
}
// gopast, line 55
golab2: while(true)
{
lab3: do {
if (!(out_grouping(g_v, 97, 252)))
{
break lab3;
}
break golab2;
} while (false);
if (cursor >= limit)
{
return false;
}
cursor++;
}
// setmark p1, line 55
I_p1 = cursor;
// try, line 56
lab4: do {
// (, line 56
if (!(I_p1 < I_x))
{
break lab4;
}
I_p1 = I_x;
} while (false);
// gopast, line 57
golab5: while(true)
{
lab6: do {
if (!(in_grouping(g_v, 97, 252)))
{
break lab6;
}
break golab5;
} while (false);
if (cursor >= limit)
{
return false;
}
cursor++;
}
// gopast, line 57
golab7: while(true)
{
lab8: do {
if (!(out_grouping(g_v, 97, 252)))
{
break lab8;
}
break golab7;
} while (false);
if (cursor >= limit)
{
return false;
}
cursor++;
}
// setmark p2, line 57
I_p2 = cursor;
return true;
}
private boolean r_postlude() {
int among_var;
int v_1;
// repeat, line 61
replab0: while(true)
{
v_1 = cursor;
lab1: do {
// (, line 61
// [, line 63
bra = cursor;
// substring, line 63
among_var = find_among(a_1, 6);
if (among_var == 0)
{
break lab1;
}
// ], line 63
ket = cursor;
switch(among_var) {
case 0:
break lab1;
case 1:
// (, line 64
// <-, line 64
slice_from("y");
break;
case 2:
// (, line 65
// <-, line 65
slice_from("u");
break;
case 3:
// (, line 66
// <-, line 66
slice_from("a");
break;
case 4:
// (, line 67
// <-, line 67
slice_from("o");
break;
case 5:
// (, line 68
// <-, line 68
slice_from("u");
break;
case 6:
// (, line 69
// next, line 69
if (cursor >= limit)
{
break lab1;
}
cursor++;
break;
}
continue replab0;
} while (false);
cursor = v_1;
break replab0;
}
return true;
}
private boolean r_R1() {
if (!(I_p1 <= cursor))
{
return false;
}
return true;
}
private boolean r_R2() {
if (!(I_p2 <= cursor))
{
return false;
}
return true;
}
private boolean r_standard_suffix() {
int among_var;
int v_1;
int v_2;
int v_3;
int v_4;
int v_5;
int v_6;
int v_7;
int v_8;
int v_9;
// (, line 79
// do, line 80
v_1 = limit - cursor;
lab0: do {
// (, line 80
// [, line 81
ket = cursor;
// substring, line 81
among_var = find_among_b(a_2, 7);
if (among_var == 0)
{
break lab0;
}
// ], line 81
bra = cursor;
// call R1, line 81
if (!r_R1())
{
break lab0;
}
switch(among_var) {
case 0:
break lab0;
case 1:
// (, line 83
// delete, line 83
slice_del();
break;
case 2:
// (, line 86
if (!(in_grouping_b(g_s_ending, 98, 116)))
{
break lab0;
}
// delete, line 86
slice_del();
break;
}
} while (false);
cursor = limit - v_1;
// do, line 90
v_2 = limit - cursor;
lab1: do {
// (, line 90
// [, line 91
ket = cursor;
// substring, line 91
among_var = find_among_b(a_3, 4);
if (among_var == 0)
{
break lab1;
}
// ], line 91
bra = cursor;
// call R1, line 91
if (!r_R1())
{
break lab1;
}
switch(among_var) {
case 0:
break lab1;
case 1:
// (, line 93
// delete, line 93
slice_del();
break;
case 2:
// (, line 96
if (!(in_grouping_b(g_st_ending, 98, 116)))
{
break lab1;
}
// hop, line 96
{
int c = cursor - 3;
if (limit_backward > c || c > limit)
{
break lab1;
}
cursor = c;
}
// delete, line 96
slice_del();
break;
}
} while (false);
cursor = limit - v_2;
// do, line 100
v_3 = limit - cursor;
lab2: do {
// (, line 100
// [, line 101
ket = cursor;
// substring, line 101
among_var = find_among_b(a_5, 8);
if (among_var == 0)
{
break lab2;
}
// ], line 101
bra = cursor;
// call R2, line 101
if (!r_R2())
{
break lab2;
}
switch(among_var) {
case 0:
break lab2;
case 1:
// (, line 103
// delete, line 103
slice_del();
// try, line 104
v_4 = limit - cursor;
lab3: do {
// (, line 104
// [, line 104
ket = cursor;
// literal, line 104
if (!(eq_s_b(2, "ig")))
{
cursor = limit - v_4;
break lab3;
}
// ], line 104
bra = cursor;
// not, line 104
{
v_5 = limit - cursor;
lab4: do {
// literal, line 104
if (!(eq_s_b(1, "e")))
{
break lab4;
}
cursor = limit - v_4;
break lab3;
} while (false);
cursor = limit - v_5;
}
// call R2, line 104
if (!r_R2())
{
cursor = limit - v_4;
break lab3;
}
// delete, line 104
slice_del();
} while (false);
break;
case 2:
// (, line 107
// not, line 107
{
v_6 = limit - cursor;
lab5: do {
// literal, line 107
if (!(eq_s_b(1, "e")))
{
break lab5;
}
break lab2;
} while (false);
cursor = limit - v_6;
}
// delete, line 107
slice_del();
break;
case 3:
// (, line 110
// delete, line 110
slice_del();
// try, line 111
v_7 = limit - cursor;
lab6: do {
// (, line 111
// [, line 112
ket = cursor;
// or, line 112
lab7: do {
v_8 = limit - cursor;
lab8: do {
// literal, line 112
if (!(eq_s_b(2, "er")))
{
break lab8;
}
break lab7;
} while (false);
cursor = limit - v_8;
// literal, line 112
if (!(eq_s_b(2, "en")))
{
cursor = limit - v_7;
break lab6;
}
} while (false);
// ], line 112
bra = cursor;
// call R1, line 112
if (!r_R1())
{
cursor = limit - v_7;
break lab6;
}
// delete, line 112
slice_del();
} while (false);
break;
case 4:
// (, line 116
// delete, line 116
slice_del();
// try, line 117
v_9 = limit - cursor;
lab9: do {
// (, line 117
// [, line 118
ket = cursor;
// substring, line 118
among_var = find_among_b(a_4, 2);
if (among_var == 0)
{
cursor = limit - v_9;
break lab9;
}
// ], line 118
bra = cursor;
// call R2, line 118
if (!r_R2())
{
cursor = limit - v_9;
break lab9;
}
switch(among_var) {
case 0:
cursor = limit - v_9;
break lab9;
case 1:
// (, line 120
// delete, line 120
slice_del();
break;
}
} while (false);
break;
}
} while (false);
cursor = limit - v_3;
return true;
}
public boolean stem() {
int v_1;
int v_2;
int v_3;
int v_4;
// (, line 130
// do, line 131
v_1 = cursor;
lab0: do {
// call prelude, line 131
if (!r_prelude())
{
break lab0;
}
} while (false);
cursor = v_1;
// do, line 132
v_2 = cursor;
lab1: do {
// call mark_regions, line 132
if (!r_mark_regions())
{
break lab1;
}
} while (false);
cursor = v_2;
// backwards, line 133
limit_backward = cursor; cursor = limit;
// do, line 134
v_3 = limit - cursor;
lab2: do {
// call standard_suffix, line 134
if (!r_standard_suffix())
{
break lab2;
}
} while (false);
cursor = limit - v_3;
cursor = limit_backward; // do, line 135
v_4 = cursor;
lab3: do {
// call postlude, line 135
if (!r_postlude())
{
break lab3;
}
} while (false);
cursor = v_4;
return true;
}
}

View File

@ -0,0 +1,688 @@
// This file was generated automatically by the Snowball to Java compiler
package org.tartarus.snowball.ext;
import org.tartarus.snowball.SnowballProgram;
import org.tartarus.snowball.Among;
/**
* Generated class implementing code defined by a snowball script.
*/
public class GermanStemmer extends SnowballProgram {
private Among a_0[] = {
new Among ( "", -1, 6, "", this),
new Among ( "U", 0, 2, "", this),
new Among ( "Y", 0, 1, "", this),
new Among ( "\u00E4", 0, 3, "", this),
new Among ( "\u00F6", 0, 4, "", this),
new Among ( "\u00FC", 0, 5, "", this)
};
private Among a_1[] = {
new Among ( "e", -1, 1, "", this),
new Among ( "em", -1, 1, "", this),
new Among ( "en", -1, 1, "", this),
new Among ( "ern", -1, 1, "", this),
new Among ( "er", -1, 1, "", this),
new Among ( "s", -1, 2, "", this),
new Among ( "es", 5, 1, "", this)
};
private Among a_2[] = {
new Among ( "en", -1, 1, "", this),
new Among ( "er", -1, 1, "", this),
new Among ( "st", -1, 2, "", this),
new Among ( "est", 2, 1, "", this)
};
private Among a_3[] = {
new Among ( "ig", -1, 1, "", this),
new Among ( "lich", -1, 1, "", this)
};
private Among a_4[] = {
new Among ( "end", -1, 1, "", this),
new Among ( "ig", -1, 2, "", this),
new Among ( "ung", -1, 1, "", this),
new Among ( "lich", -1, 3, "", this),
new Among ( "isch", -1, 2, "", this),
new Among ( "ik", -1, 2, "", this),
new Among ( "heit", -1, 3, "", this),
new Among ( "keit", -1, 4, "", this)
};
private static final char g_v[] = {17, 65, 16, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 8, 0, 32, 8 };
private static final char g_s_ending[] = {117, 30, 5 };
private static final char g_st_ending[] = {117, 30, 4 };
private int I_x;
private int I_p2;
private int I_p1;
private void copy_from(GermanStemmer other) {
I_x = other.I_x;
I_p2 = other.I_p2;
I_p1 = other.I_p1;
super.copy_from(other);
}
private boolean r_prelude() {
int v_1;
int v_2;
int v_3;
int v_4;
int v_5;
int v_6;
// (, line 28
// test, line 30
v_1 = cursor;
// repeat, line 30
replab0: while(true)
{
v_2 = cursor;
lab1: do {
// (, line 30
// or, line 33
lab2: do {
v_3 = cursor;
lab3: do {
// (, line 31
// [, line 32
bra = cursor;
// literal, line 32
if (!(eq_s(1, "\u00DF")))
{
break lab3;
}
// ], line 32
ket = cursor;
// <-, line 32
slice_from("ss");
break lab2;
} while (false);
cursor = v_3;
// next, line 33
if (cursor >= limit)
{
break lab1;
}
cursor++;
} while (false);
continue replab0;
} while (false);
cursor = v_2;
break replab0;
}
cursor = v_1;
// repeat, line 36
replab4: while(true)
{
v_4 = cursor;
lab5: do {
// goto, line 36
golab6: while(true)
{
v_5 = cursor;
lab7: do {
// (, line 36
if (!(in_grouping(g_v, 97, 252)))
{
break lab7;
}
// [, line 37
bra = cursor;
// or, line 37
lab8: do {
v_6 = cursor;
lab9: do {
// (, line 37
// literal, line 37
if (!(eq_s(1, "u")))
{
break lab9;
}
// ], line 37
ket = cursor;
if (!(in_grouping(g_v, 97, 252)))
{
break lab9;
}
// <-, line 37
slice_from("U");
break lab8;
} while (false);
cursor = v_6;
// (, line 38
// literal, line 38
if (!(eq_s(1, "y")))
{
break lab7;
}
// ], line 38
ket = cursor;
if (!(in_grouping(g_v, 97, 252)))
{
break lab7;
}
// <-, line 38
slice_from("Y");
} while (false);
cursor = v_5;
break golab6;
} while (false);
cursor = v_5;
if (cursor >= limit)
{
break lab5;
}
cursor++;
}
continue replab4;
} while (false);
cursor = v_4;
break replab4;
}
return true;
}
private boolean r_mark_regions() {
int v_1;
// (, line 42
I_p1 = limit;
I_p2 = limit;
// test, line 47
v_1 = cursor;
// (, line 47
// hop, line 47
{
int c = cursor + 3;
if (0 > c || c > limit)
{
return false;
}
cursor = c;
}
// setmark x, line 47
I_x = cursor;
cursor = v_1;
// gopast, line 49
golab0: while(true)
{
lab1: do {
if (!(in_grouping(g_v, 97, 252)))
{
break lab1;
}
break golab0;
} while (false);
if (cursor >= limit)
{
return false;
}
cursor++;
}
// gopast, line 49
golab2: while(true)
{
lab3: do {
if (!(out_grouping(g_v, 97, 252)))
{
break lab3;
}
break golab2;
} while (false);
if (cursor >= limit)
{
return false;
}
cursor++;
}
// setmark p1, line 49
I_p1 = cursor;
// try, line 50
lab4: do {
// (, line 50
if (!(I_p1 < I_x))
{
break lab4;
}
I_p1 = I_x;
} while (false);
// gopast, line 51
golab5: while(true)
{
lab6: do {
if (!(in_grouping(g_v, 97, 252)))
{
break lab6;
}
break golab5;
} while (false);
if (cursor >= limit)
{
return false;
}
cursor++;
}
// gopast, line 51
golab7: while(true)
{
lab8: do {
if (!(out_grouping(g_v, 97, 252)))
{
break lab8;
}
break golab7;
} while (false);
if (cursor >= limit)
{
return false;
}
cursor++;
}
// setmark p2, line 51
I_p2 = cursor;
return true;
}
private boolean r_postlude() {
int among_var;
int v_1;
// repeat, line 55
replab0: while(true)
{
v_1 = cursor;
lab1: do {
// (, line 55
// [, line 57
bra = cursor;
// substring, line 57
among_var = find_among(a_0, 6);
if (among_var == 0)
{
break lab1;
}
// ], line 57
ket = cursor;
switch(among_var) {
case 0:
break lab1;
case 1:
// (, line 58
// <-, line 58
slice_from("y");
break;
case 2:
// (, line 59
// <-, line 59
slice_from("u");
break;
case 3:
// (, line 60
// <-, line 60
slice_from("a");
break;
case 4:
// (, line 61
// <-, line 61
slice_from("o");
break;
case 5:
// (, line 62
// <-, line 62
slice_from("u");
break;
case 6:
// (, line 63
// next, line 63
if (cursor >= limit)
{
break lab1;
}
cursor++;
break;
}
continue replab0;
} while (false);
cursor = v_1;
break replab0;
}
return true;
}
private boolean r_R1() {
if (!(I_p1 <= cursor))
{
return false;
}
return true;
}
private boolean r_R2() {
if (!(I_p2 <= cursor))
{
return false;
}
return true;
}
private boolean r_standard_suffix() {
int among_var;
int v_1;
int v_2;
int v_3;
int v_4;
int v_5;
int v_6;
int v_7;
int v_8;
int v_9;
// (, line 73
// do, line 74
v_1 = limit - cursor;
lab0: do {
// (, line 74
// [, line 75
ket = cursor;
// substring, line 75
among_var = find_among_b(a_1, 7);
if (among_var == 0)
{
break lab0;
}
// ], line 75
bra = cursor;
// call R1, line 75
if (!r_R1())
{
break lab0;
}
switch(among_var) {
case 0:
break lab0;
case 1:
// (, line 77
// delete, line 77
slice_del();
break;
case 2:
// (, line 80
if (!(in_grouping_b(g_s_ending, 98, 116)))
{
break lab0;
}
// delete, line 80
slice_del();
break;
}
} while (false);
cursor = limit - v_1;
// do, line 84
v_2 = limit - cursor;
lab1: do {
// (, line 84
// [, line 85
ket = cursor;
// substring, line 85
among_var = find_among_b(a_2, 4);
if (among_var == 0)
{
break lab1;
}
// ], line 85
bra = cursor;
// call R1, line 85
if (!r_R1())
{
break lab1;
}
switch(among_var) {
case 0:
break lab1;
case 1:
// (, line 87
// delete, line 87
slice_del();
break;
case 2:
// (, line 90
if (!(in_grouping_b(g_st_ending, 98, 116)))
{
break lab1;
}
// hop, line 90
{
int c = cursor - 3;
if (limit_backward > c || c > limit)
{
break lab1;
}
cursor = c;
}
// delete, line 90
slice_del();
break;
}
} while (false);
cursor = limit - v_2;
// do, line 94
v_3 = limit - cursor;
lab2: do {
// (, line 94
// [, line 95
ket = cursor;
// substring, line 95
among_var = find_among_b(a_4, 8);
if (among_var == 0)
{
break lab2;
}
// ], line 95
bra = cursor;
// call R2, line 95
if (!r_R2())
{
break lab2;
}
switch(among_var) {
case 0:
break lab2;
case 1:
// (, line 97
// delete, line 97
slice_del();
// try, line 98
v_4 = limit - cursor;
lab3: do {
// (, line 98
// [, line 98
ket = cursor;
// literal, line 98
if (!(eq_s_b(2, "ig")))
{
cursor = limit - v_4;
break lab3;
}
// ], line 98
bra = cursor;
// not, line 98
{
v_5 = limit - cursor;
lab4: do {
// literal, line 98
if (!(eq_s_b(1, "e")))
{
break lab4;
}
cursor = limit - v_4;
break lab3;
} while (false);
cursor = limit - v_5;
}
// call R2, line 98
if (!r_R2())
{
cursor = limit - v_4;
break lab3;
}
// delete, line 98
slice_del();
} while (false);
break;
case 2:
// (, line 101
// not, line 101
{
v_6 = limit - cursor;
lab5: do {
// literal, line 101
if (!(eq_s_b(1, "e")))
{
break lab5;
}
break lab2;
} while (false);
cursor = limit - v_6;
}
// delete, line 101
slice_del();
break;
case 3:
// (, line 104
// delete, line 104
slice_del();
// try, line 105
v_7 = limit - cursor;
lab6: do {
// (, line 105
// [, line 106
ket = cursor;
// or, line 106
lab7: do {
v_8 = limit - cursor;
lab8: do {
// literal, line 106
if (!(eq_s_b(2, "er")))
{
break lab8;
}
break lab7;
} while (false);
cursor = limit - v_8;
// literal, line 106
if (!(eq_s_b(2, "en")))
{
cursor = limit - v_7;
break lab6;
}
} while (false);
// ], line 106
bra = cursor;
// call R1, line 106
if (!r_R1())
{
cursor = limit - v_7;
break lab6;
}
// delete, line 106
slice_del();
} while (false);
break;
case 4:
// (, line 110
// delete, line 110
slice_del();
// try, line 111
v_9 = limit - cursor;
lab9: do {
// (, line 111
// [, line 112
ket = cursor;
// substring, line 112
among_var = find_among_b(a_3, 2);
if (among_var == 0)
{
cursor = limit - v_9;
break lab9;
}
// ], line 112
bra = cursor;
// call R2, line 112
if (!r_R2())
{
cursor = limit - v_9;
break lab9;
}
switch(among_var) {
case 0:
cursor = limit - v_9;
break lab9;
case 1:
// (, line 114
// delete, line 114
slice_del();
break;
}
} while (false);
break;
}
} while (false);
cursor = limit - v_3;
return true;
}
public boolean stem() {
int v_1;
int v_2;
int v_3;
int v_4;
// (, line 124
// do, line 125
v_1 = cursor;
lab0: do {
// call prelude, line 125
if (!r_prelude())
{
break lab0;
}
} while (false);
cursor = v_1;
// do, line 126
v_2 = cursor;
lab1: do {
// call mark_regions, line 126
if (!r_mark_regions())
{
break lab1;
}
} while (false);
cursor = v_2;
// backwards, line 127
limit_backward = cursor; cursor = limit;
// do, line 128
v_3 = limit - cursor;
lab2: do {
// call standard_suffix, line 128
if (!r_standard_suffix())
{
break lab2;
}
} while (false);
cursor = limit - v_3;
cursor = limit_backward; // do, line 129
v_4 = cursor;
lab3: do {
// call postlude, line 129
if (!r_postlude())
{
break lab3;
}
} while (false);
cursor = v_4;
return true;
}
}
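The char arrays such as g_v, g_s_ending and g_st_ending above are bit sets over a range of code points: each array element covers eight consecutive characters starting at the min argument passed to in_grouping/out_grouping (97 here). The decoder below is a sketch based on the conventional Snowball runtime layout, an assumption rather than code from this tree; run against the German g_v it should print the vowels a e i o u y plus the three umlauts:
public class GroupingDecodeDemo {
    // Same bit set as g_v in the German stemmers above (min = 97, max = 252).
    private static final char[] G_V = {17, 65, 16, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 8, 0, 32, 8};
    // Assumed membership test: bit (ch - min) of the array, eight bits per element.
    private static boolean member(char[] bits, int min, int max, int ch) {
        if (ch < min || ch > max) {
            return false;
        }
        int off = ch - min;
        return (bits[off >> 3] & (1 << (off & 7))) != 0;
    }
    public static void main(String[] args) {
        StringBuilder group = new StringBuilder();
        for (int ch = 97; ch <= 252; ch++) {
            if (member(G_V, 97, 252, ch)) {
                group.append((char) ch);
            }
        }
        System.out.println(group);  // expected: aeiouy followed by \u00E4 \u00F6 \u00FC
    }
}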

File diff suppressed because it is too large

File diff suppressed because it is too large

File diff suppressed because it is too large

File diff suppressed because it is too large

View File

@ -0,0 +1,358 @@
// This file was generated automatically by the Snowball to Java compiler
package org.tartarus.snowball.ext;
import org.tartarus.snowball.SnowballProgram;
import org.tartarus.snowball.Among;
/**
* Generated class implementing code defined by a snowball script.
*/
public class NorwegianStemmer extends SnowballProgram {
private Among a_0[] = {
new Among ( "a", -1, 1, "", this),
new Among ( "e", -1, 1, "", this),
new Among ( "ede", 1, 1, "", this),
new Among ( "ande", 1, 1, "", this),
new Among ( "ende", 1, 1, "", this),
new Among ( "ane", 1, 1, "", this),
new Among ( "ene", 1, 1, "", this),
new Among ( "hetene", 6, 1, "", this),
new Among ( "erte", 1, 3, "", this),
new Among ( "en", -1, 1, "", this),
new Among ( "heten", 9, 1, "", this),
new Among ( "ar", -1, 1, "", this),
new Among ( "er", -1, 1, "", this),
new Among ( "heter", 12, 1, "", this),
new Among ( "s", -1, 2, "", this),
new Among ( "as", 14, 1, "", this),
new Among ( "es", 14, 1, "", this),
new Among ( "edes", 16, 1, "", this),
new Among ( "endes", 16, 1, "", this),
new Among ( "enes", 16, 1, "", this),
new Among ( "hetenes", 19, 1, "", this),
new Among ( "ens", 14, 1, "", this),
new Among ( "hetens", 21, 1, "", this),
new Among ( "ers", 14, 1, "", this),
new Among ( "ets", 14, 1, "", this),
new Among ( "et", -1, 1, "", this),
new Among ( "het", 25, 1, "", this),
new Among ( "ert", -1, 3, "", this),
new Among ( "ast", -1, 1, "", this)
};
private Among a_1[] = {
new Among ( "dt", -1, -1, "", this),
new Among ( "vt", -1, -1, "", this)
};
private Among a_2[] = {
new Among ( "leg", -1, 1, "", this),
new Among ( "eleg", 0, 1, "", this),
new Among ( "ig", -1, 1, "", this),
new Among ( "eig", 2, 1, "", this),
new Among ( "lig", 2, 1, "", this),
new Among ( "elig", 4, 1, "", this),
new Among ( "els", -1, 1, "", this),
new Among ( "lov", -1, 1, "", this),
new Among ( "elov", 7, 1, "", this),
new Among ( "slov", 7, 1, "", this),
new Among ( "hetslov", 9, 1, "", this)
};
private static final char g_v[] = {17, 65, 16, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 48, 0, 128 };
private static final char g_s_ending[] = {119, 125, 149, 1 };
private int I_x;
private int I_p1;
private void copy_from(NorwegianStemmer other) {
I_x = other.I_x;
I_p1 = other.I_p1;
super.copy_from(other);
}
private boolean r_mark_regions() {
int v_1;
int v_2;
// (, line 26
I_p1 = limit;
// test, line 30
v_1 = cursor;
// (, line 30
// hop, line 30
{
int c = cursor + 3;
if (0 > c || c > limit)
{
return false;
}
cursor = c;
}
// setmark x, line 30
I_x = cursor;
cursor = v_1;
// goto, line 31
golab0: while(true)
{
v_2 = cursor;
lab1: do {
if (!(in_grouping(g_v, 97, 248)))
{
break lab1;
}
cursor = v_2;
break golab0;
} while (false);
cursor = v_2;
if (cursor >= limit)
{
return false;
}
cursor++;
}
// gopast, line 31
golab2: while(true)
{
lab3: do {
if (!(out_grouping(g_v, 97, 248)))
{
break lab3;
}
break golab2;
} while (false);
if (cursor >= limit)
{
return false;
}
cursor++;
}
// setmark p1, line 31
I_p1 = cursor;
// try, line 32
lab4: do {
// (, line 32
if (!(I_p1 < I_x))
{
break lab4;
}
I_p1 = I_x;
} while (false);
return true;
}
private boolean r_main_suffix() {
int among_var;
int v_1;
int v_2;
int v_3;
// (, line 37
// setlimit, line 38
v_1 = limit - cursor;
// tomark, line 38
if (cursor < I_p1)
{
return false;
}
cursor = I_p1;
v_2 = limit_backward;
limit_backward = cursor;
cursor = limit - v_1;
// (, line 38
// [, line 38
ket = cursor;
// substring, line 38
among_var = find_among_b(a_0, 29);
if (among_var == 0)
{
limit_backward = v_2;
return false;
}
// ], line 38
bra = cursor;
limit_backward = v_2;
switch(among_var) {
case 0:
return false;
case 1:
// (, line 44
// delete, line 44
slice_del();
break;
case 2:
// (, line 46
// or, line 46
lab0: do {
v_3 = limit - cursor;
lab1: do {
if (!(in_grouping_b(g_s_ending, 98, 122)))
{
break lab1;
}
break lab0;
} while (false);
cursor = limit - v_3;
// (, line 46
// literal, line 46
if (!(eq_s_b(1, "k")))
{
return false;
}
if (!(out_grouping_b(g_v, 97, 248)))
{
return false;
}
} while (false);
// delete, line 46
slice_del();
break;
case 3:
// (, line 48
// <-, line 48
slice_from("er");
break;
}
return true;
}
private boolean r_consonant_pair() {
int v_1;
int v_2;
int v_3;
// (, line 52
// test, line 53
v_1 = limit - cursor;
// (, line 53
// setlimit, line 54
v_2 = limit - cursor;
// tomark, line 54
if (cursor < I_p1)
{
return false;
}
cursor = I_p1;
v_3 = limit_backward;
limit_backward = cursor;
cursor = limit - v_2;
// (, line 54
// [, line 54
ket = cursor;
// substring, line 54
if (find_among_b(a_1, 2) == 0)
{
limit_backward = v_3;
return false;
}
// ], line 54
bra = cursor;
limit_backward = v_3;
cursor = limit - v_1;
// next, line 59
if (cursor <= limit_backward)
{
return false;
}
cursor--;
// ], line 59
bra = cursor;
// delete, line 59
slice_del();
return true;
}
private boolean r_other_suffix() {
int among_var;
int v_1;
int v_2;
// (, line 62
// setlimit, line 63
v_1 = limit - cursor;
// tomark, line 63
if (cursor < I_p1)
{
return false;
}
cursor = I_p1;
v_2 = limit_backward;
limit_backward = cursor;
cursor = limit - v_1;
// (, line 63
// [, line 63
ket = cursor;
// substring, line 63
among_var = find_among_b(a_2, 11);
if (among_var == 0)
{
limit_backward = v_2;
return false;
}
// ], line 63
bra = cursor;
limit_backward = v_2;
switch(among_var) {
case 0:
return false;
case 1:
// (, line 67
// delete, line 67
slice_del();
break;
}
return true;
}
public boolean stem() {
int v_1;
int v_2;
int v_3;
int v_4;
// (, line 72
// do, line 74
v_1 = cursor;
lab0: do {
// call mark_regions, line 74
if (!r_mark_regions())
{
break lab0;
}
} while (false);
cursor = v_1;
// backwards, line 75
limit_backward = cursor; cursor = limit;
// (, line 75
// do, line 76
v_2 = limit - cursor;
lab1: do {
// call main_suffix, line 76
if (!r_main_suffix())
{
break lab1;
}
} while (false);
cursor = limit - v_2;
// do, line 77
v_3 = limit - cursor;
lab2: do {
// call consonant_pair, line 77
if (!r_consonant_pair())
{
break lab2;
}
} while (false);
cursor = limit - v_3;
// do, line 78
v_4 = limit - cursor;
lab3: do {
// call other_suffix, line 78
if (!r_other_suffix())
{
break lab3;
}
} while (false);
cursor = limit - v_4;
cursor = limit_backward; return true;
}
}
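Tables such as a_0 above feed find_among_b, which matches the longest listed suffix ending at the cursor and hands the entry's result code (the third constructor argument) to the switch that follows. The sketch below only mimics that observable behaviour with a linear scan; the real runtime is assumed to use a sorted table with prefix links, so treat the helper as illustrative rather than the actual implementation:
public class AmongLookupDemo {
    // A few (suffix, result) pairs echoing entries of a_0 in the Norwegian stemmer above.
    private static final String[] SUFFIXES = {"a", "ene", "hetene", "erte", "s", "ast"};
    private static final int[] RESULTS = {1, 1, 1, 3, 2, 1};
    // Return the result code of the longest matching suffix, or 0 when nothing matches.
    private static int findAmongB(String word) {
        int best = 0;
        int bestLen = -1;
        for (int i = 0; i < SUFFIXES.length; i++) {
            if (word.endsWith(SUFFIXES[i]) && SUFFIXES[i].length() > bestLen) {
                best = RESULTS[i];
                bestLen = SUFFIXES[i].length();
            }
        }
        return best;
    }
    public static void main(String[] args) {
        System.out.println(findAmongB("hetene"));  // longest match "hetene" -> 1
        System.out.println(findAmongB("huset"));   // no listed suffix matches -> 0
    }
}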

View File

@ -0,0 +1,906 @@
// This file was generated automatically by the Snowball to Java compiler
package org.tartarus.snowball.ext;
import org.tartarus.snowball.SnowballProgram;
import org.tartarus.snowball.Among;
/**
* Generated class implementing code defined by a snowball script.
*/
public class PorterStemmer extends SnowballProgram {
private Among a_0[] = {
new Among ( "s", -1, 3, "", this),
new Among ( "ies", 0, 2, "", this),
new Among ( "sses", 0, 1, "", this),
new Among ( "ss", 0, -1, "", this)
};
private Among a_1[] = {
new Among ( "", -1, 3, "", this),
new Among ( "bb", 0, 2, "", this),
new Among ( "dd", 0, 2, "", this),
new Among ( "ff", 0, 2, "", this),
new Among ( "gg", 0, 2, "", this),
new Among ( "bl", 0, 1, "", this),
new Among ( "mm", 0, 2, "", this),
new Among ( "nn", 0, 2, "", this),
new Among ( "pp", 0, 2, "", this),
new Among ( "rr", 0, 2, "", this),
new Among ( "at", 0, 1, "", this),
new Among ( "tt", 0, 2, "", this),
new Among ( "iz", 0, 1, "", this)
};
private Among a_2[] = {
new Among ( "ed", -1, 2, "", this),
new Among ( "eed", 0, 1, "", this),
new Among ( "ing", -1, 2, "", this)
};
private Among a_3[] = {
new Among ( "anci", -1, 3, "", this),
new Among ( "enci", -1, 2, "", this),
new Among ( "abli", -1, 4, "", this),
new Among ( "eli", -1, 6, "", this),
new Among ( "alli", -1, 9, "", this),
new Among ( "ousli", -1, 12, "", this),
new Among ( "entli", -1, 5, "", this),
new Among ( "aliti", -1, 10, "", this),
new Among ( "biliti", -1, 14, "", this),
new Among ( "iviti", -1, 13, "", this),
new Among ( "tional", -1, 1, "", this),
new Among ( "ational", 10, 8, "", this),
new Among ( "alism", -1, 10, "", this),
new Among ( "ation", -1, 8, "", this),
new Among ( "ization", 13, 7, "", this),
new Among ( "izer", -1, 7, "", this),
new Among ( "ator", -1, 8, "", this),
new Among ( "iveness", -1, 13, "", this),
new Among ( "fulness", -1, 11, "", this),
new Among ( "ousness", -1, 12, "", this)
};
private Among a_4[] = {
new Among ( "icate", -1, 2, "", this),
new Among ( "ative", -1, 3, "", this),
new Among ( "alize", -1, 1, "", this),
new Among ( "iciti", -1, 2, "", this),
new Among ( "ical", -1, 2, "", this),
new Among ( "ful", -1, 3, "", this),
new Among ( "ness", -1, 3, "", this)
};
private Among a_5[] = {
new Among ( "ic", -1, 1, "", this),
new Among ( "ance", -1, 1, "", this),
new Among ( "ence", -1, 1, "", this),
new Among ( "able", -1, 1, "", this),
new Among ( "ible", -1, 1, "", this),
new Among ( "ate", -1, 1, "", this),
new Among ( "ive", -1, 1, "", this),
new Among ( "ize", -1, 1, "", this),
new Among ( "iti", -1, 1, "", this),
new Among ( "al", -1, 1, "", this),
new Among ( "ism", -1, 1, "", this),
new Among ( "ion", -1, 2, "", this),
new Among ( "er", -1, 1, "", this),
new Among ( "ous", -1, 1, "", this),
new Among ( "ant", -1, 1, "", this),
new Among ( "ent", -1, 1, "", this),
new Among ( "ment", 15, 1, "", this),
new Among ( "ement", 16, 1, "", this),
new Among ( "ou", -1, 1, "", this)
};
private static final char g_v[] = {17, 65, 16, 1 };
private static final char g_v_WXY[] = {1, 17, 65, 208, 1 };
private boolean B_Y_found;
private int I_p2;
private int I_p1;
private void copy_from(PorterStemmer other) {
B_Y_found = other.B_Y_found;
I_p2 = other.I_p2;
I_p1 = other.I_p1;
super.copy_from(other);
}
private boolean r_shortv() {
// (, line 19
if (!(out_grouping_b(g_v_WXY, 89, 121)))
{
return false;
}
if (!(in_grouping_b(g_v, 97, 121)))
{
return false;
}
if (!(out_grouping_b(g_v, 97, 121)))
{
return false;
}
return true;
}
private boolean r_R1() {
if (!(I_p1 <= cursor))
{
return false;
}
return true;
}
private boolean r_R2() {
if (!(I_p2 <= cursor))
{
return false;
}
return true;
}
private boolean r_Step_1a() {
int among_var;
// (, line 24
// [, line 25
ket = cursor;
// substring, line 25
among_var = find_among_b(a_0, 4);
if (among_var == 0)
{
return false;
}
// ], line 25
bra = cursor;
switch(among_var) {
case 0:
return false;
case 1:
// (, line 26
// <-, line 26
slice_from("ss");
break;
case 2:
// (, line 27
// <-, line 27
slice_from("i");
break;
case 3:
// (, line 29
// delete, line 29
slice_del();
break;
}
return true;
}
private boolean r_Step_1b() {
int among_var;
int v_1;
int v_3;
int v_4;
// (, line 33
// [, line 34
ket = cursor;
// substring, line 34
among_var = find_among_b(a_2, 3);
if (among_var == 0)
{
return false;
}
// ], line 34
bra = cursor;
switch(among_var) {
case 0:
return false;
case 1:
// (, line 35
// call R1, line 35
if (!r_R1())
{
return false;
}
// <-, line 35
slice_from("ee");
break;
case 2:
// (, line 37
// test, line 38
v_1 = limit - cursor;
// gopast, line 38
golab0: while(true)
{
lab1: do {
if (!(in_grouping_b(g_v, 97, 121)))
{
break lab1;
}
break golab0;
} while (false);
if (cursor <= limit_backward)
{
return false;
}
cursor--;
}
cursor = limit - v_1;
// delete, line 38
slice_del();
// test, line 39
v_3 = limit - cursor;
// substring, line 39
among_var = find_among_b(a_1, 13);
if (among_var == 0)
{
return false;
}
cursor = limit - v_3;
switch(among_var) {
case 0:
return false;
case 1:
// (, line 41
// <+, line 41
{
int c = cursor;
insert(cursor, cursor, "e");
cursor = c;
}
break;
case 2:
// (, line 44
// [, line 44
ket = cursor;
// next, line 44
if (cursor <= limit_backward)
{
return false;
}
cursor--;
// ], line 44
bra = cursor;
// delete, line 44
slice_del();
break;
case 3:
// (, line 45
// atmark, line 45
if (cursor != I_p1)
{
return false;
}
// test, line 45
v_4 = limit - cursor;
// call shortv, line 45
if (!r_shortv())
{
return false;
}
cursor = limit - v_4;
// <+, line 45
{
int c = cursor;
insert(cursor, cursor, "e");
cursor = c;
}
break;
}
break;
}
return true;
}
private boolean r_Step_1c() {
int v_1;
// (, line 51
// [, line 52
ket = cursor;
// or, line 52
lab0: do {
v_1 = limit - cursor;
lab1: do {
// literal, line 52
if (!(eq_s_b(1, "y")))
{
break lab1;
}
break lab0;
} while (false);
cursor = limit - v_1;
// literal, line 52
if (!(eq_s_b(1, "Y")))
{
return false;
}
} while (false);
// ], line 52
bra = cursor;
// gopast, line 53
golab2: while(true)
{
lab3: do {
if (!(in_grouping_b(g_v, 97, 121)))
{
break lab3;
}
break golab2;
} while (false);
if (cursor <= limit_backward)
{
return false;
}
cursor--;
}
// <-, line 54
slice_from("i");
return true;
}
private boolean r_Step_2() {
int among_var;
// (, line 57
// [, line 58
ket = cursor;
// substring, line 58
among_var = find_among_b(a_3, 20);
if (among_var == 0)
{
return false;
}
// ], line 58
bra = cursor;
// call R1, line 58
if (!r_R1())
{
return false;
}
switch(among_var) {
case 0:
return false;
case 1:
// (, line 59
// <-, line 59
slice_from("tion");
break;
case 2:
// (, line 60
// <-, line 60
slice_from("ence");
break;
case 3:
// (, line 61
// <-, line 61
slice_from("ance");
break;
case 4:
// (, line 62
// <-, line 62
slice_from("able");
break;
case 5:
// (, line 63
// <-, line 63
slice_from("ent");
break;
case 6:
// (, line 64
// <-, line 64
slice_from("e");
break;
case 7:
// (, line 66
// <-, line 66
slice_from("ize");
break;
case 8:
// (, line 68
// <-, line 68
slice_from("ate");
break;
case 9:
// (, line 69
// <-, line 69
slice_from("al");
break;
case 10:
// (, line 71
// <-, line 71
slice_from("al");
break;
case 11:
// (, line 72
// <-, line 72
slice_from("ful");
break;
case 12:
// (, line 74
// <-, line 74
slice_from("ous");
break;
case 13:
// (, line 76
// <-, line 76
slice_from("ive");
break;
case 14:
// (, line 77
// <-, line 77
slice_from("ble");
break;
}
return true;
}
private boolean r_Step_3() {
int among_var;
// (, line 81
// [, line 82
ket = cursor;
// substring, line 82
among_var = find_among_b(a_4, 7);
if (among_var == 0)
{
return false;
}
// ], line 82
bra = cursor;
// call R1, line 82
if (!r_R1())
{
return false;
}
switch(among_var) {
case 0:
return false;
case 1:
// (, line 83
// <-, line 83
slice_from("al");
break;
case 2:
// (, line 85
// <-, line 85
slice_from("ic");
break;
case 3:
// (, line 87
// delete, line 87
slice_del();
break;
}
return true;
}
private boolean r_Step_4() {
int among_var;
int v_1;
// (, line 91
// [, line 92
ket = cursor;
// substring, line 92
among_var = find_among_b(a_5, 19);
if (among_var == 0)
{
return false;
}
// ], line 92
bra = cursor;
// call R2, line 92
if (!r_R2())
{
return false;
}
switch(among_var) {
case 0:
return false;
case 1:
// (, line 95
// delete, line 95
slice_del();
break;
case 2:
// (, line 96
// or, line 96
lab0: do {
v_1 = limit - cursor;
lab1: do {
// literal, line 96
if (!(eq_s_b(1, "s")))
{
break lab1;
}
break lab0;
} while (false);
cursor = limit - v_1;
// literal, line 96
if (!(eq_s_b(1, "t")))
{
return false;
}
} while (false);
// delete, line 96
slice_del();
break;
}
return true;
}
private boolean r_Step_5a() {
int v_1;
int v_2;
// (, line 100
// [, line 101
ket = cursor;
// literal, line 101
if (!(eq_s_b(1, "e")))
{
return false;
}
// ], line 101
bra = cursor;
// or, line 102
lab0: do {
v_1 = limit - cursor;
lab1: do {
// call R2, line 102
if (!r_R2())
{
break lab1;
}
break lab0;
} while (false);
cursor = limit - v_1;
// (, line 102
// call R1, line 102
if (!r_R1())
{
return false;
}
// not, line 102
{
v_2 = limit - cursor;
lab2: do {
// call shortv, line 102
if (!r_shortv())
{
break lab2;
}
return false;
} while (false);
cursor = limit - v_2;
}
} while (false);
// delete, line 103
slice_del();
return true;
}
private boolean r_Step_5b() {
// (, line 106
// [, line 107
ket = cursor;
// literal, line 107
if (!(eq_s_b(1, "l")))
{
return false;
}
// ], line 107
bra = cursor;
// call R2, line 108
if (!r_R2())
{
return false;
}
// literal, line 108
if (!(eq_s_b(1, "l")))
{
return false;
}
// delete, line 109
slice_del();
return true;
}
public boolean stem() {
int v_1;
int v_2;
int v_3;
int v_4;
int v_5;
int v_10;
int v_11;
int v_12;
int v_13;
int v_14;
int v_15;
int v_16;
int v_17;
int v_18;
int v_19;
int v_20;
// (, line 113
// unset Y_found, line 115
B_Y_found = false;
// do, line 116
v_1 = cursor;
lab0: do {
// (, line 116
// [, line 116
bra = cursor;
// literal, line 116
if (!(eq_s(1, "y")))
{
break lab0;
}
// ], line 116
ket = cursor;
// <-, line 116
slice_from("Y");
// set Y_found, line 116
B_Y_found = true;
} while (false);
cursor = v_1;
// do, line 117
v_2 = cursor;
lab1: do {
// repeat, line 117
replab2: while(true)
{
v_3 = cursor;
lab3: do {
// (, line 117
// goto, line 117
golab4: while(true)
{
v_4 = cursor;
lab5: do {
// (, line 117
if (!(in_grouping(g_v, 97, 121)))
{
break lab5;
}
// [, line 117
bra = cursor;
// literal, line 117
if (!(eq_s(1, "y")))
{
break lab5;
}
// ], line 117
ket = cursor;
cursor = v_4;
break golab4;
} while (false);
cursor = v_4;
if (cursor >= limit)
{
break lab3;
}
cursor++;
}
// <-, line 117
slice_from("Y");
// set Y_found, line 117
B_Y_found = true;
continue replab2;
} while (false);
cursor = v_3;
break replab2;
}
} while (false);
cursor = v_2;
I_p1 = limit;
I_p2 = limit;
// do, line 121
v_5 = cursor;
lab6: do {
// (, line 121
// gopast, line 122
golab7: while(true)
{
lab8: do {
if (!(in_grouping(g_v, 97, 121)))
{
break lab8;
}
break golab7;
} while (false);
if (cursor >= limit)
{
break lab6;
}
cursor++;
}
// gopast, line 122
golab9: while(true)
{
lab10: do {
if (!(out_grouping(g_v, 97, 121)))
{
break lab10;
}
break golab9;
} while (false);
if (cursor >= limit)
{
break lab6;
}
cursor++;
}
// setmark p1, line 122
I_p1 = cursor;
// gopast, line 123
golab11: while(true)
{
lab12: do {
if (!(in_grouping(g_v, 97, 121)))
{
break lab12;
}
break golab11;
} while (false);
if (cursor >= limit)
{
break lab6;
}
cursor++;
}
// gopast, line 123
golab13: while(true)
{
lab14: do {
if (!(out_grouping(g_v, 97, 121)))
{
break lab14;
}
break golab13;
} while (false);
if (cursor >= limit)
{
break lab6;
}
cursor++;
}
// setmark p2, line 123
I_p2 = cursor;
} while (false);
cursor = v_5;
// backwards, line 126
limit_backward = cursor; cursor = limit;
// (, line 126
// do, line 127
v_10 = limit - cursor;
lab15: do {
// call Step_1a, line 127
if (!r_Step_1a())
{
break lab15;
}
} while (false);
cursor = limit - v_10;
// do, line 128
v_11 = limit - cursor;
lab16: do {
// call Step_1b, line 128
if (!r_Step_1b())
{
break lab16;
}
} while (false);
cursor = limit - v_11;
// do, line 129
v_12 = limit - cursor;
lab17: do {
// call Step_1c, line 129
if (!r_Step_1c())
{
break lab17;
}
} while (false);
cursor = limit - v_12;
// do, line 130
v_13 = limit - cursor;
lab18: do {
// call Step_2, line 130
if (!r_Step_2())
{
break lab18;
}
} while (false);
cursor = limit - v_13;
// do, line 131
v_14 = limit - cursor;
lab19: do {
// call Step_3, line 131
if (!r_Step_3())
{
break lab19;
}
} while (false);
cursor = limit - v_14;
// do, line 132
v_15 = limit - cursor;
lab20: do {
// call Step_4, line 132
if (!r_Step_4())
{
break lab20;
}
} while (false);
cursor = limit - v_15;
// do, line 133
v_16 = limit - cursor;
lab21: do {
// call Step_5a, line 133
if (!r_Step_5a())
{
break lab21;
}
} while (false);
cursor = limit - v_16;
// do, line 134
v_17 = limit - cursor;
lab22: do {
// call Step_5b, line 134
if (!r_Step_5b())
{
break lab22;
}
} while (false);
cursor = limit - v_17;
cursor = limit_backward; // do, line 137
v_18 = cursor;
lab23: do {
// (, line 137
// Boolean test Y_found, line 137
if (!(B_Y_found))
{
break lab23;
}
// repeat, line 137
replab24: while(true)
{
v_19 = cursor;
lab25: do {
// (, line 137
// goto, line 137
golab26: while(true)
{
v_20 = cursor;
lab27: do {
// (, line 137
// [, line 137
bra = cursor;
// literal, line 137
if (!(eq_s(1, "Y")))
{
break lab27;
}
// ], line 137
ket = cursor;
cursor = v_20;
break golab26;
} while (false);
cursor = v_20;
if (cursor >= limit)
{
break lab25;
}
cursor++;
}
// <-, line 137
slice_from("y");
continue replab24;
} while (false);
cursor = v_19;
break replab24;
}
} while (false);
cursor = v_18;
return true;
}
}
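Each generated class is driven through its SnowballProgram base: set the current word, call stem(), read the result back. The sketch below assumes the conventional Snowball runtime accessors setCurrent(String) and getCurrent() on the base class (only stem() is visible in the code above), so those method names are an assumption rather than something shown in this commit:
import org.tartarus.snowball.ext.PorterStemmer;
public class PorterStemmerExample {
    public static void main(String[] args) {
        PorterStemmer stemmer = new PorterStemmer();
        for (String word : new String[] {"running", "flies", "happiness"}) {
            stemmer.setCurrent(word);   // assumed SnowballProgram setter
            stemmer.stem();             // stem() as generated above
            System.out.println(word + " -> " + stemmer.getCurrent());  // assumed getter
        }
    }
}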

File diff suppressed because it is too large

File diff suppressed because it is too large

View File

@ -0,0 +1,727 @@
// This file was generated automatically by the Snowball to Java compiler
package org.tartarus.snowball.ext;
import org.tartarus.snowball.SnowballProgram;
import org.tartarus.snowball.Among;
/**
* Generated class implementing code defined by a snowball script.
*/
public class RussianStemmer extends SnowballProgram {
private Among a_0[] = {
new Among ( "\u0432", -1, 1, "", this),
new Among ( "\u0438\u0432", 0, 2, "", this),
new Among ( "\u044B\u0432", 0, 2, "", this),
new Among ( "\u0432\u0448\u0438", -1, 1, "", this),
new Among ( "\u0438\u0432\u0448\u0438", 3, 2, "", this),
new Among ( "\u044B\u0432\u0448\u0438", 3, 2, "", this),
new Among ( "\u0432\u0448\u0438\u0441\u044C", -1, 1, "", this),
new Among ( "\u0438\u0432\u0448\u0438\u0441\u044C", 6, 2, "", this),
new Among ( "\u044B\u0432\u0448\u0438\u0441\u044C", 6, 2, "", this)
};
private Among a_1[] = {
new Among ( "\u0435\u0435", -1, 1, "", this),
new Among ( "\u0438\u0435", -1, 1, "", this),
new Among ( "\u043E\u0435", -1, 1, "", this),
new Among ( "\u044B\u0435", -1, 1, "", this),
new Among ( "\u0438\u043C\u0438", -1, 1, "", this),
new Among ( "\u044B\u043C\u0438", -1, 1, "", this),
new Among ( "\u0435\u0439", -1, 1, "", this),
new Among ( "\u0438\u0439", -1, 1, "", this),
new Among ( "\u043E\u0439", -1, 1, "", this),
new Among ( "\u044B\u0439", -1, 1, "", this),
new Among ( "\u0435\u043C", -1, 1, "", this),
new Among ( "\u0438\u043C", -1, 1, "", this),
new Among ( "\u043E\u043C", -1, 1, "", this),
new Among ( "\u044B\u043C", -1, 1, "", this),
new Among ( "\u0435\u0433\u043E", -1, 1, "", this),
new Among ( "\u043E\u0433\u043E", -1, 1, "", this),
new Among ( "\u0435\u043C\u0443", -1, 1, "", this),
new Among ( "\u043E\u043C\u0443", -1, 1, "", this),
new Among ( "\u0438\u0445", -1, 1, "", this),
new Among ( "\u044B\u0445", -1, 1, "", this),
new Among ( "\u0435\u044E", -1, 1, "", this),
new Among ( "\u043E\u044E", -1, 1, "", this),
new Among ( "\u0443\u044E", -1, 1, "", this),
new Among ( "\u044E\u044E", -1, 1, "", this),
new Among ( "\u0430\u044F", -1, 1, "", this),
new Among ( "\u044F\u044F", -1, 1, "", this)
};
private Among a_2[] = {
new Among ( "\u0435\u043C", -1, 1, "", this),
new Among ( "\u043D\u043D", -1, 1, "", this),
new Among ( "\u0432\u0448", -1, 1, "", this),
new Among ( "\u0438\u0432\u0448", 2, 2, "", this),
new Among ( "\u044B\u0432\u0448", 2, 2, "", this),
new Among ( "\u0449", -1, 1, "", this),
new Among ( "\u044E\u0449", 5, 1, "", this),
new Among ( "\u0443\u044E\u0449", 6, 2, "", this)
};
private Among a_3[] = {
new Among ( "\u0441\u044C", -1, 1, "", this),
new Among ( "\u0441\u044F", -1, 1, "", this)
};
private Among a_4[] = {
new Among ( "\u043B\u0430", -1, 1, "", this),
new Among ( "\u0438\u043B\u0430", 0, 2, "", this),
new Among ( "\u044B\u043B\u0430", 0, 2, "", this),
new Among ( "\u043D\u0430", -1, 1, "", this),
new Among ( "\u0435\u043D\u0430", 3, 2, "", this),
new Among ( "\u0435\u0442\u0435", -1, 1, "", this),
new Among ( "\u0438\u0442\u0435", -1, 2, "", this),
new Among ( "\u0439\u0442\u0435", -1, 1, "", this),
new Among ( "\u0435\u0439\u0442\u0435", 7, 2, "", this),
new Among ( "\u0443\u0439\u0442\u0435", 7, 2, "", this),
new Among ( "\u043B\u0438", -1, 1, "", this),
new Among ( "\u0438\u043B\u0438", 10, 2, "", this),
new Among ( "\u044B\u043B\u0438", 10, 2, "", this),
new Among ( "\u0439", -1, 1, "", this),
new Among ( "\u0435\u0439", 13, 2, "", this),
new Among ( "\u0443\u0439", 13, 2, "", this),
new Among ( "\u043B", -1, 1, "", this),
new Among ( "\u0438\u043B", 16, 2, "", this),
new Among ( "\u044B\u043B", 16, 2, "", this),
new Among ( "\u0435\u043C", -1, 1, "", this),
new Among ( "\u0438\u043C", -1, 2, "", this),
new Among ( "\u044B\u043C", -1, 2, "", this),
new Among ( "\u043D", -1, 1, "", this),
new Among ( "\u0435\u043D", 22, 2, "", this),
new Among ( "\u043B\u043E", -1, 1, "", this),
new Among ( "\u0438\u043B\u043E", 24, 2, "", this),
new Among ( "\u044B\u043B\u043E", 24, 2, "", this),
new Among ( "\u043D\u043E", -1, 1, "", this),
new Among ( "\u0435\u043D\u043E", 27, 2, "", this),
new Among ( "\u043D\u043D\u043E", 27, 1, "", this),
new Among ( "\u0435\u0442", -1, 1, "", this),
new Among ( "\u0443\u0435\u0442", 30, 2, "", this),
new Among ( "\u0438\u0442", -1, 2, "", this),
new Among ( "\u044B\u0442", -1, 2, "", this),
new Among ( "\u044E\u0442", -1, 1, "", this),
new Among ( "\u0443\u044E\u0442", 34, 2, "", this),
new Among ( "\u044F\u0442", -1, 2, "", this),
new Among ( "\u043D\u044B", -1, 1, "", this),
new Among ( "\u0435\u043D\u044B", 37, 2, "", this),
new Among ( "\u0442\u044C", -1, 1, "", this),
new Among ( "\u0438\u0442\u044C", 39, 2, "", this),
new Among ( "\u044B\u0442\u044C", 39, 2, "", this),
new Among ( "\u0435\u0448\u044C", -1, 1, "", this),
new Among ( "\u0438\u0448\u044C", -1, 2, "", this),
new Among ( "\u044E", -1, 2, "", this),
new Among ( "\u0443\u044E", 44, 2, "", this)
};
private Among a_5[] = {
new Among ( "\u0430", -1, 1, "", this),
new Among ( "\u0435\u0432", -1, 1, "", this),
new Among ( "\u043E\u0432", -1, 1, "", this),
new Among ( "\u0435", -1, 1, "", this),
new Among ( "\u0438\u0435", 3, 1, "", this),
new Among ( "\u044C\u0435", 3, 1, "", this),
new Among ( "\u0438", -1, 1, "", this),
new Among ( "\u0435\u0438", 6, 1, "", this),
new Among ( "\u0438\u0438", 6, 1, "", this),
new Among ( "\u0430\u043C\u0438", 6, 1, "", this),
new Among ( "\u044F\u043C\u0438", 6, 1, "", this),
new Among ( "\u0438\u044F\u043C\u0438", 10, 1, "", this),
new Among ( "\u0439", -1, 1, "", this),
new Among ( "\u0435\u0439", 12, 1, "", this),
new Among ( "\u0438\u0435\u0439", 13, 1, "", this),
new Among ( "\u0438\u0439", 12, 1, "", this),
new Among ( "\u043E\u0439", 12, 1, "", this),
new Among ( "\u0430\u043C", -1, 1, "", this),
new Among ( "\u0435\u043C", -1, 1, "", this),
new Among ( "\u0438\u0435\u043C", 18, 1, "", this),
new Among ( "\u043E\u043C", -1, 1, "", this),
new Among ( "\u044F\u043C", -1, 1, "", this),
new Among ( "\u0438\u044F\u043C", 21, 1, "", this),
new Among ( "\u043E", -1, 1, "", this),
new Among ( "\u0443", -1, 1, "", this),
new Among ( "\u0430\u0445", -1, 1, "", this),
new Among ( "\u044F\u0445", -1, 1, "", this),
new Among ( "\u0438\u044F\u0445", 26, 1, "", this),
new Among ( "\u044B", -1, 1, "", this),
new Among ( "\u044C", -1, 1, "", this),
new Among ( "\u044E", -1, 1, "", this),
new Among ( "\u0438\u044E", 30, 1, "", this),
new Among ( "\u044C\u044E", 30, 1, "", this),
new Among ( "\u044F", -1, 1, "", this),
new Among ( "\u0438\u044F", 33, 1, "", this),
new Among ( "\u044C\u044F", 33, 1, "", this)
};
private Among a_6[] = {
new Among ( "\u043E\u0441\u0442", -1, 1, "", this),
new Among ( "\u043E\u0441\u0442\u044C", -1, 1, "", this)
};
private Among a_7[] = {
new Among ( "\u0435\u0439\u0448\u0435", -1, 1, "", this),
new Among ( "\u043D", -1, 2, "", this),
new Among ( "\u0435\u0439\u0448", -1, 1, "", this),
new Among ( "\u044C", -1, 3, "", this)
};
private static final char g_v[] = {33, 65, 8, 232 };
private int I_p2;
private int I_pV;
private void copy_from(RussianStemmer other) {
I_p2 = other.I_p2;
I_pV = other.I_pV;
super.copy_from(other);
}
private boolean r_mark_regions() {
int v_1;
// (, line 57
I_pV = limit;
I_p2 = limit;
// do, line 61
v_1 = cursor;
lab0: do {
// (, line 61
// gopast, line 62
golab1: while(true)
{
lab2: do {
if (!(in_grouping(g_v, 1072, 1103)))
{
break lab2;
}
break golab1;
} while (false);
if (cursor >= limit)
{
break lab0;
}
cursor++;
}
// setmark pV, line 62
I_pV = cursor;
// gopast, line 62
golab3: while(true)
{
lab4: do {
if (!(out_grouping(g_v, 1072, 1103)))
{
break lab4;
}
break golab3;
} while (false);
if (cursor >= limit)
{
break lab0;
}
cursor++;
}
// gopast, line 63
golab5: while(true)
{
lab6: do {
if (!(in_grouping(g_v, 1072, 1103)))
{
break lab6;
}
break golab5;
} while (false);
if (cursor >= limit)
{
break lab0;
}
cursor++;
}
// gopast, line 63
golab7: while(true)
{
lab8: do {
if (!(out_grouping(g_v, 1072, 1103)))
{
break lab8;
}
break golab7;
} while (false);
if (cursor >= limit)
{
break lab0;
}
cursor++;
}
// setmark p2, line 63
I_p2 = cursor;
} while (false);
cursor = v_1;
return true;
}
private boolean r_R2() {
if (!(I_p2 <= cursor))
{
return false;
}
return true;
}
private boolean r_perfective_gerund() {
int among_var;
int v_1;
// (, line 71
// [, line 72
ket = cursor;
// substring, line 72
among_var = find_among_b(a_0, 9);
if (among_var == 0)
{
return false;
}
// ], line 72
bra = cursor;
switch(among_var) {
case 0:
return false;
case 1:
// (, line 76
// or, line 76
lab0: do {
v_1 = limit - cursor;
lab1: do {
// literal, line 76
if (!(eq_s_b(1, "\u0430")))
{
break lab1;
}
break lab0;
} while (false);
cursor = limit - v_1;
// literal, line 76
if (!(eq_s_b(1, "\u044F")))
{
return false;
}
} while (false);
// delete, line 76
slice_del();
break;
case 2:
// (, line 83
// delete, line 83
slice_del();
break;
}
return true;
}
private boolean r_adjective() {
int among_var;
// (, line 87
// [, line 88
ket = cursor;
// substring, line 88
among_var = find_among_b(a_1, 26);
if (among_var == 0)
{
return false;
}
// ], line 88
bra = cursor;
switch(among_var) {
case 0:
return false;
case 1:
// (, line 97
// delete, line 97
slice_del();
break;
}
return true;
}
private boolean r_adjectival() {
int among_var;
int v_1;
int v_2;
// (, line 101
// call adjective, line 102
if (!r_adjective())
{
return false;
}
// try, line 109
v_1 = limit - cursor;
lab0: do {
// (, line 109
// [, line 110
ket = cursor;
// substring, line 110
among_var = find_among_b(a_2, 8);
if (among_var == 0)
{
cursor = limit - v_1;
break lab0;
}
// ], line 110
bra = cursor;
switch(among_var) {
case 0:
cursor = limit - v_1;
break lab0;
case 1:
// (, line 115
// or, line 115
lab1: do {
v_2 = limit - cursor;
lab2: do {
// literal, line 115
if (!(eq_s_b(1, "\u0430")))
{
break lab2;
}
break lab1;
} while (false);
cursor = limit - v_2;
// literal, line 115
if (!(eq_s_b(1, "\u044F")))
{
cursor = limit - v_1;
break lab0;
}
} while (false);
// delete, line 115
slice_del();
break;
case 2:
// (, line 122
// delete, line 122
slice_del();
break;
}
} while (false);
return true;
}
private boolean r_reflexive() {
int among_var;
// (, line 128
// [, line 129
ket = cursor;
// substring, line 129
among_var = find_among_b(a_3, 2);
if (among_var == 0)
{
return false;
}
// ], line 129
bra = cursor;
switch(among_var) {
case 0:
return false;
case 1:
// (, line 132
// delete, line 132
slice_del();
break;
}
return true;
}
private boolean r_verb() {
int among_var;
int v_1;
// (, line 136
// [, line 137
ket = cursor;
// substring, line 137
among_var = find_among_b(a_4, 46);
if (among_var == 0)
{
return false;
}
// ], line 137
bra = cursor;
switch(among_var) {
case 0:
return false;
case 1:
// (, line 143
// or, line 143
lab0: do {
v_1 = limit - cursor;
lab1: do {
// literal, line 143
if (!(eq_s_b(1, "\u0430")))
{
break lab1;
}
break lab0;
} while (false);
cursor = limit - v_1;
// literal, line 143
if (!(eq_s_b(1, "\u044F")))
{
return false;
}
} while (false);
// delete, line 143
slice_del();
break;
case 2:
// (, line 151
// delete, line 151
slice_del();
break;
}
return true;
}
private boolean r_noun() {
int among_var;
// (, line 159
// [, line 160
ket = cursor;
// substring, line 160
among_var = find_among_b(a_5, 36);
if (among_var == 0)
{
return false;
}
// ], line 160
bra = cursor;
switch(among_var) {
case 0:
return false;
case 1:
// (, line 167
// delete, line 167
slice_del();
break;
}
return true;
}
private boolean r_derivational() {
int among_var;
// (, line 175
// [, line 176
ket = cursor;
// substring, line 176
among_var = find_among_b(a_6, 2);
if (among_var == 0)
{
return false;
}
// ], line 176
bra = cursor;
// call R2, line 176
if (!r_R2())
{
return false;
}
switch(among_var) {
case 0:
return false;
case 1:
// (, line 179
// delete, line 179
slice_del();
break;
}
return true;
}
private boolean r_tidy_up() {
int among_var;
// (, line 183
// [, line 184
ket = cursor;
// substring, line 184
among_var = find_among_b(a_7, 4);
if (among_var == 0)
{
return false;
}
// ], line 184
bra = cursor;
switch(among_var) {
case 0:
return false;
case 1:
// (, line 188
// delete, line 188
slice_del();
// [, line 189
ket = cursor;
// literal, line 189
if (!(eq_s_b(1, "\u043D")))
{
return false;
}
// ], line 189
bra = cursor;
// literal, line 189
if (!(eq_s_b(1, "\u043D")))
{
return false;
}
// delete, line 189
slice_del();
break;
case 2:
// (, line 192
// literal, line 192
if (!(eq_s_b(1, "\u043D")))
{
return false;
}
// delete, line 192
slice_del();
break;
case 3:
// (, line 194
// delete, line 194
slice_del();
break;
}
return true;
}
public boolean stem() {
int v_1;
int v_2;
int v_3;
int v_4;
int v_5;
int v_6;
int v_7;
int v_8;
int v_9;
int v_10;
// (, line 199
// do, line 201
v_1 = cursor;
lab0: do {
// call mark_regions, line 201
if (!r_mark_regions())
{
break lab0;
}
} while (false);
cursor = v_1;
// backwards, line 202
limit_backward = cursor; cursor = limit;
// setlimit, line 202
v_2 = limit - cursor;
// tomark, line 202
if (cursor < I_pV)
{
return false;
}
cursor = I_pV;
v_3 = limit_backward;
limit_backward = cursor;
cursor = limit - v_2;
// (, line 202
// do, line 203
v_4 = limit - cursor;
lab1: do {
// (, line 203
// or, line 204
lab2: do {
v_5 = limit - cursor;
lab3: do {
// call perfective_gerund, line 204
if (!r_perfective_gerund())
{
break lab3;
}
break lab2;
} while (false);
cursor = limit - v_5;
// (, line 205
// try, line 205
v_6 = limit - cursor;
lab4: do {
// call reflexive, line 205
if (!r_reflexive())
{
cursor = limit - v_6;
break lab4;
}
} while (false);
// or, line 206
lab5: do {
v_7 = limit - cursor;
lab6: do {
// call adjectival, line 206
if (!r_adjectival())
{
break lab6;
}
break lab5;
} while (false);
cursor = limit - v_7;
lab7: do {
// call verb, line 206
if (!r_verb())
{
break lab7;
}
break lab5;
} while (false);
cursor = limit - v_7;
// call noun, line 206
if (!r_noun())
{
break lab1;
}
} while (false);
} while (false);
} while (false);
cursor = limit - v_4;
// try, line 209
v_8 = limit - cursor;
lab8: do {
// (, line 209
// [, line 209
ket = cursor;
// literal, line 209
if (!(eq_s_b(1, "\u0438")))
{
cursor = limit - v_8;
break lab8;
}
// ], line 209
bra = cursor;
// delete, line 209
slice_del();
} while (false);
// do, line 212
v_9 = limit - cursor;
lab9: do {
// call derivational, line 212
if (!r_derivational())
{
break lab9;
}
} while (false);
cursor = limit - v_9;
// do, line 213
v_10 = limit - cursor;
lab10: do {
// call tidy_up, line 213
if (!r_tidy_up())
{
break lab10;
}
} while (false);
cursor = limit - v_10;
limit_backward = v_3;
cursor = limit_backward; return true;
}
}

File diff suppressed because it is too large

View File

@ -0,0 +1,349 @@
// This file was generated automatically by the Snowball to Java compiler
package org.tartarus.snowball.ext;
import org.tartarus.snowball.SnowballProgram;
import org.tartarus.snowball.Among;
/**
* Generated class implementing code defined by a snowball script.
*/
public class SwedishStemmer extends SnowballProgram {
private Among a_0[] = {
new Among ( "a", -1, 1, "", this),
new Among ( "arna", 0, 1, "", this),
new Among ( "erna", 0, 1, "", this),
new Among ( "heterna", 2, 1, "", this),
new Among ( "orna", 0, 1, "", this),
new Among ( "ad", -1, 1, "", this),
new Among ( "e", -1, 1, "", this),
new Among ( "ade", 6, 1, "", this),
new Among ( "ande", 6, 1, "", this),
new Among ( "arne", 6, 1, "", this),
new Among ( "are", 6, 1, "", this),
new Among ( "aste", 6, 1, "", this),
new Among ( "en", -1, 1, "", this),
new Among ( "anden", 12, 1, "", this),
new Among ( "aren", 12, 1, "", this),
new Among ( "heten", 12, 1, "", this),
new Among ( "ern", -1, 1, "", this),
new Among ( "ar", -1, 1, "", this),
new Among ( "er", -1, 1, "", this),
new Among ( "heter", 18, 1, "", this),
new Among ( "or", -1, 1, "", this),
new Among ( "s", -1, 2, "", this),
new Among ( "as", 21, 1, "", this),
new Among ( "arnas", 22, 1, "", this),
new Among ( "ernas", 22, 1, "", this),
new Among ( "ornas", 22, 1, "", this),
new Among ( "es", 21, 1, "", this),
new Among ( "ades", 26, 1, "", this),
new Among ( "andes", 26, 1, "", this),
new Among ( "ens", 21, 1, "", this),
new Among ( "arens", 29, 1, "", this),
new Among ( "hetens", 29, 1, "", this),
new Among ( "erns", 21, 1, "", this),
new Among ( "at", -1, 1, "", this),
new Among ( "andet", -1, 1, "", this),
new Among ( "het", -1, 1, "", this),
new Among ( "ast", -1, 1, "", this)
};
private Among a_1[] = {
new Among ( "dd", -1, -1, "", this),
new Among ( "gd", -1, -1, "", this),
new Among ( "nn", -1, -1, "", this),
new Among ( "dt", -1, -1, "", this),
new Among ( "gt", -1, -1, "", this),
new Among ( "kt", -1, -1, "", this),
new Among ( "tt", -1, -1, "", this)
};
private Among a_2[] = {
new Among ( "ig", -1, 1, "", this),
new Among ( "lig", 0, 1, "", this),
new Among ( "els", -1, 1, "", this),
new Among ( "fullt", -1, 3, "", this),
new Among ( "l\u00F6st", -1, 2, "", this)
};
private static final char g_v[] = {17, 65, 16, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 24, 0, 32 };
private static final char g_s_ending[] = {119, 127, 149 };
private int I_x;
private int I_p1;
private void copy_from(SwedishStemmer other) {
I_x = other.I_x;
I_p1 = other.I_p1;
super.copy_from(other);
}
private boolean r_mark_regions() {
int v_1;
int v_2;
// (, line 26
I_p1 = limit;
// test, line 29
v_1 = cursor;
// (, line 29
// hop, line 29
{
int c = cursor + 3;
if (0 > c || c > limit)
{
return false;
}
cursor = c;
}
// setmark x, line 29
I_x = cursor;
cursor = v_1;
// goto, line 30
golab0: while(true)
{
v_2 = cursor;
lab1: do {
if (!(in_grouping(g_v, 97, 246)))
{
break lab1;
}
cursor = v_2;
break golab0;
} while (false);
cursor = v_2;
if (cursor >= limit)
{
return false;
}
cursor++;
}
// gopast, line 30
golab2: while(true)
{
lab3: do {
if (!(out_grouping(g_v, 97, 246)))
{
break lab3;
}
break golab2;
} while (false);
if (cursor >= limit)
{
return false;
}
cursor++;
}
// setmark p1, line 30
I_p1 = cursor;
// try, line 31
lab4: do {
// (, line 31
if (!(I_p1 < I_x))
{
break lab4;
}
I_p1 = I_x;
} while (false);
return true;
}
private boolean r_main_suffix() {
int among_var;
int v_1;
int v_2;
// (, line 36
// setlimit, line 37
v_1 = limit - cursor;
// tomark, line 37
if (cursor < I_p1)
{
return false;
}
cursor = I_p1;
v_2 = limit_backward;
limit_backward = cursor;
cursor = limit - v_1;
// (, line 37
// [, line 37
ket = cursor;
// substring, line 37
among_var = find_among_b(a_0, 37);
if (among_var == 0)
{
limit_backward = v_2;
return false;
}
// ], line 37
bra = cursor;
limit_backward = v_2;
switch(among_var) {
case 0:
return false;
case 1:
// (, line 44
// delete, line 44
slice_del();
break;
case 2:
// (, line 46
if (!(in_grouping_b(g_s_ending, 98, 121)))
{
return false;
}
// delete, line 46
slice_del();
break;
}
return true;
}
private boolean r_consonant_pair() {
int v_1;
int v_2;
int v_3;
// setlimit, line 50
v_1 = limit - cursor;
// tomark, line 50
if (cursor < I_p1)
{
return false;
}
cursor = I_p1;
v_2 = limit_backward;
limit_backward = cursor;
cursor = limit - v_1;
// (, line 50
// and, line 52
v_3 = limit - cursor;
// among, line 51
if (find_among_b(a_1, 7) == 0)
{
limit_backward = v_2;
return false;
}
cursor = limit - v_3;
// (, line 52
// [, line 52
ket = cursor;
// next, line 52
if (cursor <= limit_backward)
{
limit_backward = v_2;
return false;
}
cursor--;
// ], line 52
bra = cursor;
// delete, line 52
slice_del();
limit_backward = v_2;
return true;
}
private boolean r_other_suffix() {
int among_var;
int v_1;
int v_2;
// setlimit, line 55
v_1 = limit - cursor;
// tomark, line 55
if (cursor < I_p1)
{
return false;
}
cursor = I_p1;
v_2 = limit_backward;
limit_backward = cursor;
cursor = limit - v_1;
// (, line 55
// [, line 56
ket = cursor;
// substring, line 56
among_var = find_among_b(a_2, 5);
if (among_var == 0)
{
limit_backward = v_2;
return false;
}
// ], line 56
bra = cursor;
switch(among_var) {
case 0:
limit_backward = v_2;
return false;
case 1:
// (, line 57
// delete, line 57
slice_del();
break;
case 2:
// (, line 58
// <-, line 58
slice_from("l\u00F6s");
break;
case 3:
// (, line 59
// <-, line 59
slice_from("full");
break;
}
limit_backward = v_2;
return true;
}
public boolean stem() {
int v_1;
int v_2;
int v_3;
int v_4;
// (, line 64
// do, line 66
v_1 = cursor;
lab0: do {
// call mark_regions, line 66
if (!r_mark_regions())
{
break lab0;
}
} while (false);
cursor = v_1;
// backwards, line 67
limit_backward = cursor; cursor = limit;
// (, line 67
// do, line 68
v_2 = limit - cursor;
lab1: do {
// call main_suffix, line 68
if (!r_main_suffix())
{
break lab1;
}
} while (false);
cursor = limit - v_2;
// do, line 69
v_3 = limit - cursor;
lab2: do {
// call consonant_pair, line 69
if (!r_consonant_pair())
{
break lab2;
}
} while (false);
cursor = limit - v_3;
// do, line 70
v_4 = limit - cursor;
lab3: do {
// call other_suffix, line 70
if (!r_other_suffix())
{
break lab3;
}
} while (false);
cursor = limit - v_4;
cursor = limit_backward; return true;
}
}

File diff suppressed because it is too large

View File

@ -0,0 +1,53 @@
<!--
Licensed to the Apache Software Foundation (ASF) under one or more
contributor license agreements. See the NOTICE file distributed with
this work for additional information regarding copyright ownership.
The ASF licenses this file to You under the Apache License, Version 2.0
(the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
-->
<html>
<body>
<p>
Lucene Snowball README file
</p>
<p>
This project provides pre-compiled versions of the Snowball stemmers
based on revision 500 of the Tartarus Snowball repository,
together with classes integrating them with the Lucene search engine.
</p>
<p>
A few changes have been made to the static Snowball code and the compiled stemmers:
</p>
<ul>
<li>Class SnowballProgram has been made abstract and now declares the abstract method stem(), so the Lucene filter class SnowballFilter can invoke a stemmer directly instead of through reflection (see the sketch below).</li>
<li>All use of StringBuffer has been replaced with StringBuilder for speed.</li>
<li>The Snowball BSD license header has been added to the Java classes to keep RAT from adding ASL headers.</li>
</ul>
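<p>
Because stem() is declared on the abstract base class, a stemmer can be driven with a plain
virtual call. The sketch below is illustrative only (the class name StemOneWord is made up and
this is not the SnowballFilter source); it assumes the generated EnglishStemmer class from this
package and reuses a sample word from the unit tests in this commit:
</p>
<pre>
import org.tartarus.snowball.SnowballProgram;
import org.tartarus.snowball.ext.EnglishStemmer;

// Illustrative class name only, not part of the Snowball package.
public class StemOneWord {
  public static void main(String[] args) {
    // Direct call on the abstract stem() method -- no per-token
    // java.lang.reflect.Method lookup is required.
    SnowballProgram stemmer = new EnglishStemmer();
    stemmer.setCurrent("abhorred");           // feed one term
    stemmer.stem();                           // run the generated algorithm
    System.out.println(stemmer.getCurrent()); // prints "abhor", matching the test expectations
  }
}
</pre>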
<p>
See the Snowball <a href ="http://snowball.tartarus.org/">home page</a> for more information about the algorithms.
</p>
<p>
<b>IMPORTANT NOTICE ON BACKWARDS COMPATIBILITY!</b>
</p>
<p>
An index created using the Snowball module in Lucene 2.3.2 and below
might not be compatible with the Snowball module in Lucene 2.4 or greater.
</p>
<p>
For more information about this issue see:
https://issues.apache.org/jira/browse/LUCENE-1142
</p>
</body>
</html>

View File

@ -0,0 +1,108 @@
| From svn.tartarus.org/snowball/trunk/website/algorithms/danish/stop.txt
| This file is distributed under the BSD License.
| See http://snowball.tartarus.org/license.php
| Also see http://www.opensource.org/licenses/bsd-license.html
| - Encoding was converted to UTF-8.
| - This notice was added.
| A Danish stop word list. Comments begin with vertical bar. Each stop
| word is at the start of a line.
| This is a ranked list (commonest to rarest) of stopwords derived from
| a large text sample.
og | and
i | in
jeg | I
det | that (dem. pronoun)/it (pers. pronoun)
at | that (in front of a sentence)/to (with infinitive)
en | a/an
den | it (pers. pronoun)/that (dem. pronoun)
til | to/at/for/until/against/by/of/into, more
er | present tense of "to be"
som | who, as
på | on/upon/in/on/at/to/after/of/with/for, on
de | they
med | with/by/in, along
han | he
af | of/by/from/off/for/in/with/on, off
for | at/for/to/from/by/of/ago, in front/before, because
ikke | not
der | who/which, there/those
var | past tense of "to be"
mig | me/myself
sig | oneself/himself/herself/itself/themselves
men | but
et | a/an/one, one (number), someone/somebody/one
har | present tense of "to have"
om | round/about/for/in/a, about/around/down, if
vi | we
min | my
havde | past tense of "to have"
ham | him
hun | she
nu | now
over | over/above/across/by/beyond/past/on/about, over/past
da | then, when/as/since
fra | from/off/since, off, since
du | you
ud | out
sin | his/her/its/one's
dem | them
os | us/ourselves
op | up
man | you/one
hans | his
hvor | where
eller | or
hvad | what
skal | must/shall etc.
selv | myself/youself/herself/ourselves etc., even
her | here
alle | all/everyone/everybody etc.
vil | will (verb)
blev | past tense of "to stay/to remain/to get/to become"
kunne | could
ind | in
når | when
være | present tense of "to be"
dog | however/yet/after all
noget | something
ville | would
jo | you know/you see (adv), yes
deres | their/theirs
efter | after/behind/according to/for/by/from, later/afterwards
ned | down
skulle | should
denne | this
end | than
dette | this
mit | my/mine
også | also
under | under/beneath/below/during, below/underneath
have | have
dig | you
anden | other
hende | her
mine | my
alt | everything
meget | much/very, plenty of
sit | his, her, its, one's
sine | his, her, its, one's
vor | our
mod | against
disse | these
hvis | if
din | your/yours
nogle | some
hos | by/at
blive | be/become
mange | many
ad | by/through
bliver | present tense of "to be/to become"
hendes | her/hers
været | be
thi | for (conj)
jer | you
sådan | such, like this/like that

View File

@ -0,0 +1,117 @@
| From svn.tartarus.org/snowball/trunk/website/algorithms/dutch/stop.txt
| This file is distributed under the BSD License.
| See http://snowball.tartarus.org/license.php
| Also see http://www.opensource.org/licenses/bsd-license.html
| - Encoding was converted to UTF-8.
| - This notice was added.
| A Dutch stop word list. Comments begin with vertical bar. Each stop
| word is at the start of a line.
| This is a ranked list (commonest to rarest) of stopwords derived from
| a large sample of Dutch text.
| Dutch stop words frequently exhibit homonym clashes. These are indicated
| clearly below.
de | the
en | and
van | of, from
ik | I, the ego
te | (1) chez, at etc, (2) to, (3) too
dat | that, which
die | that, those, who, which
in | in, inside
een | a, an, one
hij | he
het | the, it
niet | not, nothing, naught
zijn | (1) to be, being, (2) his, one's, its
is | is
was | (1) was, past tense of all persons sing. of 'zijn' (to be) (2) wax, (3) the washing, (4) rise of river
op | on, upon, at, in, up, used up
aan | on, upon, to (as dative)
met | with, by
als | like, such as, when
voor | (1) before, in front of, (2) furrow
had | had, past tense all persons sing. of 'hebben' (have)
er | there
maar | but, only
om | round, about, for etc
hem | him
dan | then
zou | should/would, past tense all persons sing. of 'zullen'
of | or, whether, if
wat | what, something, anything
mijn | possessive and noun 'mine'
men | people, 'one'
dit | this
zo | so, thus, in this way
door | through by
over | over, across
ze | she, her, they, them
zich | oneself
bij | (1) a bee, (2) by, near, at
ook | also, too
tot | till, until
je | you
mij | me
uit | out of, from
der | Old Dutch form of 'van der' still found in surnames
daar | (1) there, (2) because
haar | (1) her, their, them, (2) hair
naar | (1) unpleasant, unwell etc, (2) towards, (3) as
heb | present first person sing. of 'to have'
hoe | how, why
heeft | present third person sing. of 'to have'
hebben | 'to have' and various parts thereof
deze | this
u | you
want | (1) for, (2) mitten, (3) rigging
nog | yet, still
zal | 'shall', first and third person sing. of verb 'zullen' (will)
me | me
zij | she, they
nu | now
ge | 'thou', still used in Belgium and south Netherlands
geen | none
omdat | because
iets | something, somewhat
worden | to become, grow, get
toch | yet, still
al | all, every, each
waren | (1) 'were' (2) to wander, (3) wares, (3)
veel | much, many
meer | (1) more, (2) lake
doen | to do, to make
toen | then, when
moet | noun 'spot/mote' and present form of 'to must'
ben | (1) am, (2) 'are' in interrogative second person singular of 'to be'
zonder | without
kan | noun 'can' and present form of 'to be able'
hun | their, them
dus | so, consequently
alles | all, everything, anything
onder | under, beneath
ja | yes, of course
eens | once, one day
hier | here
wie | who
werd | imperfect third person sing. of 'become'
altijd | always
doch | yet, but etc
wordt | present third person sing. of 'become'
wezen | (1) to be, (2) 'been' as in 'been fishing', (3) orphans
kunnen | to be able
ons | us/our
zelf | self
tegen | against, towards, at
na | after, near
reeds | already
wil | (1) present tense of 'want', (2) 'will', noun, (3) fender
kon | could; past tense of 'to be able'
niets | nothing
uw | your
iemand | somebody
geweest | been; past participle of 'be'
andere | other

View File

@ -0,0 +1,317 @@
| From svn.tartarus.org/snowball/trunk/website/algorithms/english/stop.txt
| This file is distributed under the BSD License.
| See http://snowball.tartarus.org/license.php
| Also see http://www.opensource.org/licenses/bsd-license.html
| - Encoding was converted to UTF-8.
| - This notice was added.
| An English stop word list. Comments begin with vertical bar. Each stop
| word is at the start of a line.
| Many of the forms below are quite rare (e.g. "yourselves") but included for
| completeness.
| PRONOUNS FORMS
| 1st person sing
i | subject, always in upper case of course
me | object
my | possessive adjective
| the possessive pronoun `mine' is best suppressed, because of the
| sense of coal-mine etc.
myself | reflexive
| 1st person plural
we | subject
| us | object
| care is required here because US = United States. It is usually
| safe to remove it if it is in lower case.
our | possessive adjective
ours | possessive pronoun
ourselves | reflexive
| second person (archaic `thou' forms not included)
you | subject and object
your | possessive adjective
yours | possessive pronoun
yourself | reflexive (singular)
yourselves | reflexive (plural)
| third person singular
he | subject
him | object
his | possessive adjective and pronoun
himself | reflexive
she | subject
her | object and possessive adjective
hers | possessive pronoun
herself | reflexive
it | subject and object
its | possessive adjective
itself | reflexive
| third person plural
they | subject
them | object
their | possessive adjective
theirs | possessive pronoun
themselves | reflexive
| other forms (demonstratives, interrogatives)
what
which
who
whom
this
that
these
those
| VERB FORMS (using F.R. Palmer's nomenclature)
| BE
am | 1st person, present
is | -s form (3rd person, present)
are | present
was | 1st person, past
were | past
be | infinitive
been | past participle
being | -ing form
| HAVE
have | simple
has | -s form
had | past
having | -ing form
| DO
do | simple
does | -s form
did | past
doing | -ing form
| The forms below are, I believe, best omitted, because of the significant
| homonym forms:
| He made a WILL
| old tin CAN
| merry month of MAY
| a smell of MUST
| fight the good fight with all thy MIGHT
| would, could, should, ought might however be included
| | AUXILIARIES
| | WILL
|will
would
| | SHALL
|shall
should
| | CAN
|can
could
| | MAY
|may
|might
| | MUST
|must
| | OUGHT
ought
| COMPOUND FORMS, increasingly encountered nowadays in 'formal' writing
| pronoun + verb
i'm
you're
he's
she's
it's
we're
they're
i've
you've
we've
they've
i'd
you'd
he'd
she'd
we'd
they'd
i'll
you'll
he'll
she'll
we'll
they'll
| verb + negation
isn't
aren't
wasn't
weren't
hasn't
haven't
hadn't
doesn't
don't
didn't
| auxiliary + negation
won't
wouldn't
shan't
shouldn't
can't
cannot
couldn't
mustn't
| miscellaneous forms
let's
that's
who's
what's
here's
there's
when's
where's
why's
how's
| rarer forms
| daren't needn't
| doubtful forms
| oughtn't mightn't
| ARTICLES
a
an
the
| THE REST (Overlap among prepositions, conjunctions, adverbs etc is so
| high, that classification is pointless.)
and
but
if
or
because
as
until
while
of
at
by
for
with
about
against
between
into
through
during
before
after
above
below
to
from
up
down
in
out
on
off
over
under
again
further
then
once
here
there
when
where
why
how
all
any
both
each
few
more
most
other
some
such
no
nor
not
only
own
same
so
than
too
very
| Just for the record, the following words are among the commonest in English
| one
| every
| least
| less
| many
| now
| ever
| never
| say
| says
| said
| also
| get
| go
| goes
| just
| made
| make
| put
| see
| seen
| whether
| like
| well
| back
| even
| still
| way
| take
| since
| another
| however
| two
| three
| four
| five
| first
| second
| new
| old
| high
| long

View File

@ -0,0 +1,95 @@
| From svn.tartarus.org/snowball/trunk/website/algorithms/finnish/stop.txt
| This file is distributed under the BSD License.
| See http://snowball.tartarus.org/license.php
| Also see http://www.opensource.org/licenses/bsd-license.html
| - Encoding was converted to UTF-8.
| - This notice was added.
| forms of BE
olla
olen
olet
on
olemme
olette
ovat
ole | negative form
oli
olisi
olisit
olisin
olisimme
olisitte
olisivat
olit
olin
olimme
olitte
olivat
ollut
olleet
en | negation
et
ei
emme
ette
eivät
|Nom Gen Acc Part Iness Elat Illat Adess Ablat Allat Ess Trans
minä minun minut minua minussa minusta minuun minulla minulta minulle | I
sinä sinun sinut sinua sinussa sinusta sinuun sinulla sinulta sinulle | you
hän hänen hänet häntä hänessä hänestä häneen hänellä häneltä hänelle | he she
me meidän meidät meitä meissä meistä meihin meillä meiltä meille | we
te teidän teidät teitä teissä teistä teihin teillä teiltä teille | you
he heidän heidät heitä heissä heistä heihin heillä heiltä heille | they
tämä tämän tätä tässä tästä tähän tallä tältä tälle tänä täksi | this
tuo tuon tuotä tuossa tuosta tuohon tuolla tuolta tuolle tuona tuoksi | that
se sen sitä siinä siitä siihen sillä siltä sille sinä siksi | it
nämä näiden näitä näissä näistä näihin näillä näiltä näille näinä näiksi | these
nuo noiden noita noissa noista noihin noilla noilta noille noina noiksi | those
ne niiden niitä niissä niistä niihin niillä niiltä niille niinä niiksi | they
kuka kenen kenet ketä kenessä kenestä keneen kenellä keneltä kenelle kenenä keneksi| who
ketkä keiden ketkä keitä keissä keistä keihin keillä keiltä keille keinä keiksi | (pl)
mikä minkä minkä mitä missä mistä mihin millä miltä mille minä miksi | which what
mitkä | (pl)
joka jonka jota jossa josta johon jolla jolta jolle jona joksi | who which
jotka joiden joita joissa joista joihin joilla joilta joille joina joiksi | (pl)
| conjunctions
että | that
ja | and
jos | if
koska | because
kuin | than
mutta | but
niin | so
sekä | and
sillä | for
tai | or
vaan | but
vai | or
vaikka | although
| prepositions
kanssa | with
mukaan | according to
noin | about
poikki | across
yli | over, across
| other
kun | when
niin | so
nyt | now
itse | self

View File

@ -0,0 +1,183 @@
| From svn.tartarus.org/snowball/trunk/website/algorithms/french/stop.txt
| This file is distributed under the BSD License.
| See http://snowball.tartarus.org/license.php
| Also see http://www.opensource.org/licenses/bsd-license.html
| - Encoding was converted to UTF-8.
| - This notice was added.
| A French stop word list. Comments begin with vertical bar. Each stop
| word is at the start of a line.
au | a + le
aux | a + les
avec | with
ce | this
ces | these
dans | with
de | of
des | de + les
du | de + le
elle | she
en | `of them' etc
et | and
eux | them
il | he
je | I
la | the
le | the
leur | their
lui | him
ma | my (fem)
mais | but
me | me
même | same; as in moi-même (myself) etc
mes | me (pl)
moi | me
mon | my (masc)
ne | not
nos | our (pl)
notre | our
nous | we
on | one
ou | where
par | by
pas | not
pour | for
qu | que before vowel
que | that
qui | who
sa | his, her (fem)
se | oneself
ses | his (pl)
son | his, her (masc)
sur | on
ta | thy (fem)
te | thee
tes | thy (pl)
toi | thee
ton | thy (masc)
tu | thou
un | a
une | a
vos | your (pl)
votre | your
vous | you
| single letter forms
c | c'
d | d'
j | j'
l | l'
à | to, at
m | m'
n | n'
s | s'
t | t'
y | there
| forms of être (not including the infinitive):
été
étée
étées
étés
étant
suis
es
est
sommes
êtes
sont
serai
seras
sera
serons
serez
seront
serais
serait
serions
seriez
seraient
étais
était
étions
étiez
étaient
fus
fut
fûmes
fûtes
furent
sois
soit
soyons
soyez
soient
fusse
fusses
fût
fussions
fussiez
fussent
| forms of avoir (not including the infinitive):
ayant
eu
eue
eues
eus
ai
as
avons
avez
ont
aurai
auras
aura
aurons
aurez
auront
aurais
aurait
aurions
auriez
auraient
avais
avait
avions
aviez
avaient
eut
eûmes
eûtes
eurent
aie
aies
ait
ayons
ayez
aient
eusse
eusses
eût
eussions
eussiez
eussent
| Later additions (from Jean-Christophe Deschamps)
ceci | this
celà  | that
cet | this
cette | this
ici | here
ils | they
les | the (pl)
leurs | their (pl)
quel | which
quels | which
quelle | which
quelles | which
sans | without
soi | oneself

View File

@ -0,0 +1,292 @@
| From svn.tartarus.org/snowball/trunk/website/algorithms/german/stop.txt
| This file is distributed under the BSD License.
| See http://snowball.tartarus.org/license.php
| Also see http://www.opensource.org/licenses/bsd-license.html
| - Encoding was converted to UTF-8.
| - This notice was added.
| A German stop word list. Comments begin with vertical bar. Each stop
| word is at the start of a line.
| The number of forms in this list is reduced significantly by passing it
| through the German stemmer.
aber | but
alle | all
allem
allen
aller
alles
als | than, as
also | so
am | an + dem
an | at
ander | other
andere
anderem
anderen
anderer
anderes
anderm
andern
anderr
anders
auch | also
auf | on
aus | out of
bei | by
bin | am
bis | until
bist | art
da | there
damit | with it
dann | then
der | the
den
des
dem
die
das
daß | that
derselbe | the same
derselben
denselben
desselben
demselben
dieselbe
dieselben
dasselbe
dazu | to that
dein | thy
deine
deinem
deinen
deiner
deines
denn | because
derer | of those
dessen | of him
dich | thee
dir | to thee
du | thou
dies | this
diese
diesem
diesen
dieser
dieses
doch | (several meanings)
dort | (over) there
durch | through
ein | a
eine
einem
einen
einer
eines
einig | some
einige
einigem
einigen
einiger
einiges
einmal | once
er | he
ihn | him
ihm | to him
es | it
etwas | something
euer | your
eure
eurem
euren
eurer
eures
für | for
gegen | towards
gewesen | p.p. of sein
hab | have
habe | have
haben | have
hat | has
hatte | had
hatten | had
hier | here
hin | there
hinter | behind
ich | I
mich | me
mir | to me
ihr | you, to her
ihre
ihrem
ihren
ihrer
ihres
euch | to you
im | in + dem
in | in
indem | while
ins | in + das
ist | is
jede | each, every
jedem
jeden
jeder
jedes
jene | that
jenem
jenen
jener
jenes
jetzt | now
kann | can
kein | no
keine
keinem
keinen
keiner
keines
können | can
könnte | could
machen | do
man | one
manche | some, many a
manchem
manchen
mancher
manches
mein | my
meine
meinem
meinen
meiner
meines
mit | with
muss | must
musste | had to
nach | to(wards)
nicht | not
nichts | nothing
noch | still, yet
nun | now
nur | only
ob | whether
oder | or
ohne | without
sehr | very
sein | his
seine
seinem
seinen
seiner
seines
selbst | self
sich | herself
sie | they, she
ihnen | to them
sind | are
so | so
solche | such
solchem
solchen
solcher
solches
soll | shall
sollte | should
sondern | but
sonst | else
über | over
um | about, around
und | and
uns | us
unse
unsem
unsen
unser
unses
unter | under
viel | much
vom | von + dem
von | from
vor | before
während | while
war | was
waren | were
warst | wast
was | what
weg | away, off
weil | because
weiter | further
welche | which
welchem
welchen
welcher
welches
wenn | when
werde | will
werden | will
wie | how
wieder | again
will | want
wir | we
wird | will
wirst | willst
wo | where
wollen | want
wollte | wanted
würde | would
würden | would
zu | to
zum | zu + dem
zur | zu + der
zwar | indeed
zwischen | between

View File

@ -0,0 +1,209 @@
| From svn.tartarus.org/snowball/trunk/website/algorithms/hungarian/stop.txt
| This file is distributed under the BSD License.
| See http://snowball.tartarus.org/license.php
| Also see http://www.opensource.org/licenses/bsd-license.html
| - Encoding was converted to UTF-8.
| - This notice was added.
| Hungarian stop word list
| prepared by Anna Tordai
a
ahogy
ahol
aki
akik
akkor
alatt
által
általában
amely
amelyek
amelyekben
amelyeket
amelyet
amelynek
ami
amit
amolyan
amíg
amikor
át
abban
ahhoz
annak
arra
arról
az
azok
azon
azt
azzal
azért
aztán
azután
azonban
bár
be
belül
benne
cikk
cikkek
cikkeket
csak
de
e
eddig
egész
egy
egyes
egyetlen
egyéb
egyik
egyre
ekkor
el
elég
ellen
elő
először
előtt
első
én
éppen
ebben
ehhez
emilyen
ennek
erre
ez
ezt
ezek
ezen
ezzel
ezért
és
fel
felé
hanem
hiszen
hogy
hogyan
igen
így
illetve
ill.
ill
ilyen
ilyenkor
ison
ismét
itt
jól
jobban
kell
kellett
keresztül
keressünk
ki
kívül
között
közül
legalább
lehet
lehetett
legyen
lenne
lenni
lesz
lett
maga
magát
majd
majd
már
más
másik
meg
még
mellett
mert
mely
melyek
mi
mit
míg
miért
milyen
mikor
minden
mindent
mindenki
mindig
mint
mintha
mivel
most
nagy
nagyobb
nagyon
ne
néha
nekem
neki
nem
néhány
nélkül
nincs
olyan
ott
össze
ő
ők
őket
pedig
persze
s
saját
sem
semmi
sok
sokat
sokkal
számára
szemben
szerint
szinte
talán
tehát
teljes
tovább
továbbá
több
úgy
ugyanis
új
újabb
újra
után
utána
utolsó
vagy
vagyis
valaki
valami
valamint
való
vagyok
van
vannak
volt
voltam
voltak
voltunk
vissza
vele
viszont
volna

View File

@ -0,0 +1,301 @@
| From svn.tartarus.org/snowball/trunk/website/algorithms/italian/stop.txt
| This file is distributed under the BSD License.
| See http://snowball.tartarus.org/license.php
| Also see http://www.opensource.org/licenses/bsd-license.html
| - Encoding was converted to UTF-8.
| - This notice was added.
| An Italian stop word list. Comments begin with vertical bar. Each stop
| word is at the start of a line.
ad | a (to) before vowel
al | a + il
allo | a + lo
ai | a + i
agli | a + gli
all | a + l'
agl | a + gl'
alla | a + la
alle | a + le
con | with
col | con + il
coi | con + i (forms collo, cogli etc are now very rare)
da | from
dal | da + il
dallo | da + lo
dai | da + i
dagli | da + gli
dall | da + l'
dagl | da + gll'
dalla | da + la
dalle | da + le
di | of
del | di + il
dello | di + lo
dei | di + i
degli | di + gli
dell | di + l'
degl | di + gl'
della | di + la
delle | di + le
in | in
nel | in + el
nello | in + lo
nei | in + i
negli | in + gli
nell | in + l'
negl | in + gl'
nella | in + la
nelle | in + le
su | on
sul | su + il
sullo | su + lo
sui | su + i
sugli | su + gli
sull | su + l'
sugl | su + gl'
sulla | su + la
sulle | su + le
per | through, by
tra | among
contro | against
io | I
tu | thou
lui | he
lei | she
noi | we
voi | you
loro | they
mio | my
mia |
miei |
mie |
tuo |
tua |
tuoi | thy
tue |
suo |
sua |
suoi | his, her
sue |
nostro | our
nostra |
nostri |
nostre |
vostro | your
vostra |
vostri |
vostre |
mi | me
ti | thee
ci | us, there
vi | you, there
lo | him, the
la | her, the
li | them
le | them, the
gli | to him, the
ne | from there etc
il | the
un | a
uno | a
una | a
ma | but
ed | and
se | if
perché | why, because
anche | also
come | how
dov | where (as dov')
dove | where
che | who, that
chi | who
cui | whom
non | not
più | more
quale | who, that
quanto | how much
quanti |
quanta |
quante |
quello | that
quelli |
quella |
quelle |
questo | this
questi |
questa |
queste |
si | yes
tutto | all
tutti | all
| single letter forms:
a | at
c | as c' for ce or ci
e | and
i | the
l | as l'
o | or
| forms of avere, to have (not including the infinitive):
ho
hai
ha
abbiamo
avete
hanno
abbia
abbiate
abbiano
avrò
avrai
avrà
avremo
avrete
avranno
avrei
avresti
avrebbe
avremmo
avreste
avrebbero
avevo
avevi
aveva
avevamo
avevate
avevano
ebbi
avesti
ebbe
avemmo
aveste
ebbero
avessi
avesse
avessimo
avessero
avendo
avuto
avuta
avuti
avute
| forms of essere, to be (not including the infinitive):
sono
sei
è
siamo
siete
sia
siate
siano
sarò
sarai
sarà
saremo
sarete
saranno
sarei
saresti
sarebbe
saremmo
sareste
sarebbero
ero
eri
era
eravamo
eravate
erano
fui
fosti
fu
fummo
foste
furono
fossi
fosse
fossimo
fossero
essendo
| forms of fare, to do (not including the infinitive, fa, fat-):
faccio
fai
facciamo
fanno
faccia
facciate
facciano
farò
farai
farà
faremo
farete
faranno
farei
faresti
farebbe
faremmo
fareste
farebbero
facevo
facevi
faceva
facevamo
facevate
facevano
feci
facesti
fece
facemmo
faceste
fecero
facessi
facesse
facessimo
facessero
facendo
| forms of stare, to be (not including the infinitive):
sto
stai
sta
stiamo
stanno
stia
stiate
stiano
starò
starai
starà
staremo
starete
staranno
starei
staresti
starebbe
staremmo
stareste
starebbero
stavo
stavi
stava
stavamo
stavate
stavano
stetti
stesti
stette
stemmo
steste
stettero
stessi
stesse
stessimo
stessero
stando

View File

@ -0,0 +1,192 @@
| From svn.tartarus.org/snowball/trunk/website/algorithms/norwegian/stop.txt
| This file is distributed under the BSD License.
| See http://snowball.tartarus.org/license.php
| Also see http://www.opensource.org/licenses/bsd-license.html
| - Encoding was converted to UTF-8.
| - This notice was added.
| A Norwegian stop word list. Comments begin with vertical bar. Each stop
| word is at the start of a line.
| This stop word list is for the dominant bokmål dialect. Words unique
| to nynorsk are marked *.
| Revised by Jan Bruusgaard <Jan.Bruusgaard@ssb.no>, Jan 2005
og | and
i | in
jeg | I
det | it/this/that
at | to (w. inf.)
en | a/an
et | a/an
den | it/this/that
til | to
er | is/am/are
som | who/that
på | on
de | they / you(formal)
med | with
han | he
av | of
ikke | not
ikkje | not *
der | there
så | so
var | was/were
meg | me
seg | you
men | but
ett | one
har | have
om | about
vi | we
min | my
mitt | my
ha | have
hadde | had
hun | she
nå | now
over | over
da | when/as
ved | by/know
fra | from
du | you
ut | out
sin | your
dem | them
oss | us
opp | up
man | you/one
kan | can
hans | his
hvor | where
eller | or
hva | what
skal | shall/must
selv | self (reflective)
sjøl | self (reflective)
her | here
alle | all
vil | will
bli | become
ble | became
blei | became *
blitt | have become
kunne | could
inn | in
når | when
være | be
kom | come
noen | some
noe | some
ville | would
dere | you
som | who/which/that
deres | their/theirs
kun | only/just
ja | yes
etter | after
ned | down
skulle | should
denne | this
for | for/because
deg | you
si | hers/his
sine | hers/his
sitt | hers/his
mot | against
å | to
meget | much
hvorfor | why
dette | this
disse | these/those
uten | without
hvordan | how
ingen | none
din | your
ditt | your
blir | become
samme | same
hvilken | which
hvilke | which (plural)
sånn | such a
inni | inside/within
mellom | between
vår | our
hver | each
hvem | who
vors | us/ours
hvis | whose
både | both
bare | only/just
enn | than
fordi | as/because
før | before
mange | many
også | also
slik | just
vært | been
være | to be
båe | both *
begge | both
siden | since
dykk | your *
dykkar | yours *
dei | they *
deira | them *
deires | theirs *
deim | them *
di | your (fem.) *
då | as/when *
eg | I *
ein | a/an *
eit | a/an *
eitt | a/an *
elles | or *
honom | he *
hjå | at *
ho | she *
hoe | she *
henne | her
hennar | her/hers
hennes | hers
hoss | how *
hossen | how *
ikkje | not *
ingi | noone *
inkje | noone *
korleis | how *
korso | how *
kva | what/which *
kvar | where *
kvarhelst | where *
kven | who/whom *
kvi | why *
kvifor | why *
me | we *
medan | while *
mi | my *
mine | my *
mykje | much *
no | now *
nokon | some (masc./neut.) *
noka | some (fem.) *
nokor | some *
noko | some *
nokre | some *
si | his/hers *
sia | since *
sidan | since *
so | so *
somt | some *
somme | some *
um | about*
upp | up *
vere | be *
vore | was *
verte | become *
vort | become *
varte | became *
vart | became *

View File

@ -0,0 +1,251 @@
| From svn.tartarus.org/snowball/trunk/website/algorithms/portuguese/stop.txt
| This file is distributed under the BSD License.
| See http://snowball.tartarus.org/license.php
| Also see http://www.opensource.org/licenses/bsd-license.html
| - Encoding was converted to UTF-8.
| - This notice was added.
| A Portuguese stop word list. Comments begin with vertical bar. Each stop
| word is at the start of a line.
| The following is a ranked list (commonest to rarest) of stopwords
| deriving from a large sample of text.
| Extra words have been added at the end.
de | of, from
a | the; to, at; her
o | the; him
que | who, that
e | and
do | de + o
da | de + a
em | in
um | a
para | for
| é from SER
com | with
não | not, no
uma | a
os | the; them
no | em + o
se | himself etc
na | em + a
por | for
mais | more
as | the; them
dos | de + os
como | as, like
mas | but
| foi from SER
ao | a + o
ele | he
das | de + as
| tem from TER
à | a + a
seu | his
sua | her
ou | or
| ser from SER
quando | when
muito | much
| há from HAV
nos | em + os; us
já | already, now
| está from EST
eu | I
também | also
só | only, just
pelo | per + o
pela | per + a
até | up to
isso | that
ela | he
entre | between
| era from SER
depois | after
sem | without
mesmo | same
aos | a + os
| ter from TER
seus | his
quem | whom
nas | em + as
me | me
esse | that
eles | they
| estão from EST
você | you
| tinha from TER
| foram from SER
essa | that
num | em + um
nem | nor
suas | her
meu | my
às | a + as
minha | my
| têm from TER
numa | em + uma
pelos | per + os
elas | they
| havia from HAV
| seja from SER
qual | which
| será from SER
nós | we
| tenho from TER
lhe | to him, her
deles | of them
essas | those
esses | those
pelas | per + as
este | this
| fosse from SER
dele | of him
| other words. There are many contractions such as naquele = em+aquele,
| mo = me+o, but they are rare.
| Indefinite article plural forms are also rare.
tu | thou
te | thee
vocês | you (plural)
vos | you
lhes | to them
meus | my
minhas
teu | thy
tua
teus
tuas
nosso | our
nossa
nossos
nossas
dela | of her
delas | of them
esta | this
estes | these
estas | these
aquele | that
aquela | that
aqueles | those
aquelas | those
isto | this
aquilo | that
| forms of estar, to be (not including the infinitive):
estou
está
estamos
estão
estive
esteve
estivemos
estiveram
estava
estávamos
estavam
estivera
estivéramos
esteja
estejamos
estejam
estivesse
estivéssemos
estivessem
estiver
estivermos
estiverem
| forms of haver, to have (not including the infinitive):
hei
havemos
hão
houve
houvemos
houveram
houvera
houvéramos
haja
hajamos
hajam
houvesse
houvéssemos
houvessem
houver
houvermos
houverem
houverei
houverá
houveremos
houverão
houveria
houveríamos
houveriam
| forms of ser, to be (not including the infinitive):
sou
somos
são
era
éramos
eram
fui
foi
fomos
foram
fora
fôramos
seja
sejamos
sejam
fosse
fôssemos
fossem
for
formos
forem
serei
será
seremos
serão
seria
seríamos
seriam
| forms of ter, to have (not including the infinitive):
tenho
tem
temos
tém
tinha
tínhamos
tinham
tive
teve
tivemos
tiveram
tivera
tivéramos
tenha
tenhamos
tenham
tivesse
tivéssemos
tivessem
tiver
tivermos
tiverem
terei
terá
teremos
terão
teria
teríamos
teriam

View File

@ -0,0 +1,241 @@
| From svn.tartarus.org/snowball/trunk/website/algorithms/russian/stop.txt
| This file is distributed under the BSD License.
| See http://snowball.tartarus.org/license.php
| Also see http://www.opensource.org/licenses/bsd-license.html
| - Encoding was converted to UTF-8.
| - This notice was added.
| a russian stop word list. comments begin with vertical bar. each stop
| word is at the start of a line.
| this is a ranked list (commonest to rarest) of stopwords derived from
| a large text sample.
| letter `ё' is translated to `е'.
и | and
в | in/into
во | alternative form
не | not
что | what/that
он | he
на | on/onto
я | i
с | from
со | alternative form
как | how
а | milder form of `no' (but)
то | conjunction and form of `that'
все | all
она | she
так | so, thus
его | him
но | but
да | yes/and
ты | thou
к | towards, by
у | around, chez
же | intensifier particle
вы | you
за | beyond, behind
бы | conditional/subj. particle
по | up to, along
только | only
ее | her
мне | to me
было | it was
вот | here is/are, particle
от | away from
меня | me
еще | still, yet, more
нет | no, there isnt/arent
о | about
из | out of
ему | to him
теперь | now
когда | when
даже | even
ну | so, well
вдруг | suddenly
ли | interrogative particle
если | if
уже | already, but homonym of `narrower'
или | or
ни | neither
быть | to be
был | he was
него | prepositional form of его
до | up to
вас | you accusative
нибудь | indef. suffix preceded by hyphen
опять | again
уж | already, but homonym of `adder'
вам | to you
сказал | he said
ведь | particle `after all'
там | there
потом | then
себя | oneself
ничего | nothing
ей | to her
может | usually with `быть' as `maybe'
они | they
тут | here
где | where
есть | there is/are
надо | got to, must
ней | prepositional form of ей
для | for
мы | we
тебя | thee
их | them, their
чем | than
была | she was
сам | self
чтоб | in order to
без | without
будто | as if
человек | man, person, one
чего | genitive form of `what'
раз | once
тоже | also
себе | to oneself
под | beneath
жизнь | life
будет | will be
ж | short form of intensifer particle `же'
тогда | then
кто | who
этот | this
говорил | was saying
того | genitive form of `that'
потому | for that reason
этого | genitive form of `this'
какой | which
совсем | altogether
ним | prepositional form of `его', `они'
здесь | here
этом | prepositional form of `этот'
один | one
почти | almost
мой | my
тем | instrumental/dative plural of `тот', `то'
чтобы | full form of `in order that'
нее | her (acc.)
кажется | it seems
сейчас | now
были | they were
куда | where to
зачем | why
сказать | to say
всех | all (acc., gen. preposn. plural)
никогда | never
сегодня | today
можно | possible, one can
при | by
наконец | finally
два | two
об | alternative form of `о', about
другой | another
хоть | even
после | after
над | above
больше | more
тот | that one (masc.)
через | across, in
эти | these
нас | us
про | about
всего | in all, only, of all
них | prepositional form of `они' (they)
какая | which, feminine
много | lots
разве | interrogative particle
сказала | she said
три | three
эту | this, acc. fem. sing.
моя | my, feminine
впрочем | moreover, besides
хорошо | good
свою | ones own, acc. fem. sing.
этой | oblique form of `эта', fem. `this'
перед | in front of
иногда | sometimes
лучше | better
чуть | a little
том | preposn. form of `that one'
нельзя | one must not
такой | such a one
им | to them
более | more
всегда | always
конечно | of course
всю | acc. fem. sing of `all'
между | between
| b: some paradigms
|
| personal pronouns
|
| я меня мне мной [мною]
| ты тебя тебе тобой [тобою]
| он его ему им [него, нему, ним]
| она ее эи ею [нее, нэи, нею]
| оно его ему им [него, нему, ним]
|
| мы нас нам нами
| вы вас вам вами
| они их им ими [них, ним, ними]
|
| себя себе собой [собою]
|
| demonstrative pronouns: этот (this), тот (that)
|
| этот эта это эти
| этого эты это эти
| этого этой этого этих
| этому этой этому этим
| этим этой этим [этою] этими
| этом этой этом этих
|
| тот та то те
| того ту то те
| того той того тех
| тому той тому тем
| тем той тем [тою] теми
| том той том тех
|
| determinative pronouns
|
| (a) весь (all)
|
| весь вся все все
| всего всю все все
| всего всей всего всех
| всему всей всему всем
| всем всей всем [всею] всеми
| всем всей всем всех
|
| (b) сам (himself etc)
|
| сам сама само сами
| самого саму само самих
| самого самой самого самих
| самому самой самому самим
| самим самой самим [самою] самими
| самом самой самом самих
|
| stems of verbs `to be', `to have', `to do' and modal
|
| быть бы буд быв есть суть
| име
| дел
| мог мож мочь
| уме
| хоч хот
| долж
| можн
| нужн
| нельзя

View File

@ -0,0 +1,354 @@
| From svn.tartarus.org/snowball/trunk/website/algorithms/spanish/stop.txt
| This file is distributed under the BSD License.
| See http://snowball.tartarus.org/license.php
| Also see http://www.opensource.org/licenses/bsd-license.html
| - Encoding was converted to UTF-8.
| - This notice was added.
| A Spanish stop word list. Comments begin with vertical bar. Each stop
| word is at the start of a line.
| The following is a ranked list (commonest to rarest) of stopwords
| deriving from a large sample of text.
| Extra words have been added at the end.
de | from, of
la | the, her
que | who, that
el | the
en | in
y | and
a | to
los | the, them
del | de + el
se | himself, from him etc
las | the, them
por | for, by, etc
un | a
para | for
con | with
no | no
una | a
su | his, her
al | a + el
| es from SER
lo | him
como | how
más | more
pero | pero
sus | su plural
le | to him, her
ya | already
o | or
| fue from SER
este | this
| ha from HABER
sí | himself etc
porque | because
esta | this
| son from SER
entre | between
| está from ESTAR
cuando | when
muy | very
sin | without
sobre | on
| ser from SER
| tiene from TENER
también | also
me | me
hasta | until
hay | there is/are
donde | where
| han from HABER
quien | whom, that
| están from ESTAR
| estado from ESTAR
desde | from
todo | all
nos | us
durante | during
| estados from ESTAR
todos | all
uno | a
les | to them
ni | nor
contra | against
otros | other
| fueron from SER
ese | that
eso | that
| había from HABER
ante | before
ellos | they
e | and (variant of y)
esto | this
mí | me
antes | before
algunos | some
qué | what?
unos | a
yo | I
otro | other
otras | other
otra | other
él | he
tanto | so much, many
esa | that
estos | these
mucho | much, many
quienes | who
nada | nothing
muchos | many
cual | who
| sea from SER
poco | few
ella | she
estar | to be
| haber from HABER
estas | these
| estaba from ESTAR
| estamos from ESTAR
algunas | some
algo | something
nosotros | we
| other forms
mi | me
mis | mi plural
tú | thou
te | thee
ti | thee
tu | thy
tus | tu plural
ellas | they
nosotras | we
vosotros | you
vosotras | you
os | you
mío | mine
mía |
míos |
mías |
tuyo | thine
tuya |
tuyos |
tuyas |
suyo | his, hers, theirs
suya |
suyos |
suyas |
nuestro | ours
nuestra |
nuestros |
nuestras |
vuestro | yours
vuestra |
vuestros |
vuestras |
esos | those
esas | those
| forms of estar, to be (not including the infinitive):
estoy
estás
está
estamos
estáis
están
esté
estés
estemos
estéis
estén
estaré
estarás
estará
estaremos
estaréis
estarán
estaría
estarías
estaríamos
estaríais
estarían
estaba
estabas
estábamos
estabais
estaban
estuve
estuviste
estuvo
estuvimos
estuvisteis
estuvieron
estuviera
estuvieras
estuviéramos
estuvierais
estuvieran
estuviese
estuvieses
estuviésemos
estuvieseis
estuviesen
estando
estado
estada
estados
estadas
estad
| forms of haber, to have (not including the infinitive):
he
has
ha
hemos
habéis
han
haya
hayas
hayamos
hayáis
hayan
habré
habrás
habrá
habremos
habréis
habrán
habría
habrías
habríamos
habríais
habrían
había
habías
habíamos
habíais
habían
hube
hubiste
hubo
hubimos
hubisteis
hubieron
hubiera
hubieras
hubiéramos
hubierais
hubieran
hubiese
hubieses
hubiésemos
hubieseis
hubiesen
habiendo
habido
habida
habidos
habidas
| forms of ser, to be (not including the infinitive):
soy
eres
es
somos
sois
son
sea
seas
seamos
seáis
sean
seré
serás
será
seremos
seréis
serán
sería
serías
seríamos
seríais
serían
era
eras
éramos
erais
eran
fui
fuiste
fue
fuimos
fuisteis
fueron
fuera
fueras
fuéramos
fuerais
fueran
fuese
fueses
fuésemos
fueseis
fuesen
siendo
sido
| sed also means 'thirst'
| forms of tener, to have (not including the infinitive):
tengo
tienes
tiene
tenemos
tenéis
tienen
tenga
tengas
tengamos
tengáis
tengan
tendré
tendrás
tendrá
tendremos
tendréis
tendrán
tendría
tendrías
tendríamos
tendríais
tendrían
tenía
tenías
teníamos
teníais
tenían
tuve
tuviste
tuvo
tuvimos
tuvisteis
tuvieron
tuviera
tuvieras
tuviéramos
tuvierais
tuvieran
tuviese
tuvieses
tuviésemos
tuvieseis
tuviesen
teniendo
tenido
tenida
tenidos
tenidas
tened

View File

@ -0,0 +1,131 @@
| From svn.tartarus.org/snowball/trunk/website/algorithms/swedish/stop.txt
| This file is distributed under the BSD License.
| See http://snowball.tartarus.org/license.php
| Also see http://www.opensource.org/licenses/bsd-license.html
| - Encoding was converted to UTF-8.
| - This notice was added.
| A Swedish stop word list. Comments begin with vertical bar. Each stop
| word is at the start of a line.
| This is a ranked list (commonest to rarest) of stopwords derived from
| a large text sample.
| Swedish stop words occasionally exhibit homonym clashes. For example
| så = so, but also seed. These are indicated clearly below.
och | and
det | it, this/that
att | to (with infinitive)
i | in, at
en | a
jag | I
hon | she
som | who, that
han | he
på | on
den | it, this/that
med | with
var | where, each
sig | him(self) etc
för | for
så | so (also: seed)
till | to
är | is
men | but
ett | a
om | if; around, about
hade | had
de | they, these/those
av | of
icke | not, no
mig | me
du | you
henne | her
då | then, when
sin | his
nu | now
har | have
inte | inte någon = no one
hans | his
honom | him
skulle | 'sake'
hennes | her
där | there
min | my
man | one (pronoun)
ej | nor
vid | at, by, on (also: vast)
kunde | could
något | some etc
från | from, off
ut | out
när | when
efter | after, behind
upp | up
vi | we
dem | them
vara | be
vad | what
över | over
än | than
dig | you
kan | can
sina | his
här | here
ha | have
mot | towards
alla | all
under | under (also: wonder)
någon | some etc
eller | or (else)
allt | all
mycket | much
sedan | since
ju | why
denna | this/that
själv | myself, yourself etc
detta | this/that
åt | to
utan | without
varit | was
hur | how
ingen | no
mitt | my
ni | you
bli | to be, become
blev | from bli
oss | us
din | thy
dessa | these/those
några | some etc
deras | their
blir | from bli
mina | my
samma | (the) same
vilken | who, that
er | you, your
sådan | such a
vår | our
blivit | from bli
dess | its
inom | within
mellan | between
sådant | such a
varför | why
varje | each
vilka | who, that
ditt | thy
vem | who
vilket | who, that
sitta | his
sådana | such a
vart | each
dina | thy
vars | whose
vårt | our
våra | our
ert | your
era | your
vilkas | whose

View File

@ -0,0 +1,144 @@
package org.apache.lucene.analysis.snowball;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.index.Payload;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.analysis.tokenattributes.FlagsAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.tokenattributes.PayloadAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.analysis.tokenattributes.TermAttribute;
import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
import org.apache.lucene.util.Version;
public class TestSnowball extends BaseTokenStreamTestCase {
public void testEnglish() throws Exception {
Analyzer a = new SnowballAnalyzer(Version.LUCENE_CURRENT, "English");
assertAnalyzesTo(a, "he abhorred accents",
new String[]{"he", "abhor", "accent"});
}
public void testStopwords() throws Exception {
Analyzer a = new SnowballAnalyzer(Version.LUCENE_CURRENT, "English",
StandardAnalyzer.STOP_WORDS_SET);
assertAnalyzesTo(a, "the quick brown fox jumped",
new String[]{"quick", "brown", "fox", "jump"});
}
/**
* Test English lowercasing. Test both cases (pre-3.1 and post-3.1) to ensure
* we lowercase I correctly for non-Turkish languages in either case.
*/
public void testEnglishLowerCase() throws Exception {
Analyzer a = new SnowballAnalyzer(Version.LUCENE_CURRENT, "English");
assertAnalyzesTo(a, "cryogenic", new String[] { "cryogen" });
assertAnalyzesTo(a, "CRYOGENIC", new String[] { "cryogen" });
Analyzer b = new SnowballAnalyzer(Version.LUCENE_30, "English");
assertAnalyzesTo(b, "cryogenic", new String[] { "cryogen" });
assertAnalyzesTo(b, "CRYOGENIC", new String[] { "cryogen" });
}
/**
* Test turkish lowercasing
*/
public void testTurkish() throws Exception {
Analyzer a = new SnowballAnalyzer(Version.LUCENE_CURRENT, "Turkish");
assertAnalyzesTo(a, "ağacı", new String[] { "ağaç" });
assertAnalyzesTo(a, "AĞACI", new String[] { "ağaç" });
}
/**
* Test turkish lowercasing (old buggy behavior)
* @deprecated Remove this when support for 3.0 indexes is no longer required
*/
public void testTurkishBWComp() throws Exception {
Analyzer a = new SnowballAnalyzer(Version.LUCENE_30, "Turkish");
// AĞACI in turkish lowercases to ağacı, but with lowercase filter ağaci.
// this fails due to wrong casing, because the stemmer
// will only remove -ı, not -i
assertAnalyzesTo(a, "ağacı", new String[] { "ağaç" });
assertAnalyzesTo(a, "AĞACI", new String[] { "ağaci" });
}
public void testReusableTokenStream() throws Exception {
Analyzer a = new SnowballAnalyzer(Version.LUCENE_CURRENT, "English");
assertAnalyzesToReuse(a, "he abhorred accents",
new String[]{"he", "abhor", "accent"});
assertAnalyzesToReuse(a, "she abhorred him",
new String[]{"she", "abhor", "him"});
}
public void testFilterTokens() throws Exception {
SnowballFilter filter = new SnowballFilter(new TestTokenStream(), "English");
TermAttribute termAtt = filter.getAttribute(TermAttribute.class);
OffsetAttribute offsetAtt = filter.getAttribute(OffsetAttribute.class);
TypeAttribute typeAtt = filter.getAttribute(TypeAttribute.class);
PayloadAttribute payloadAtt = filter.getAttribute(PayloadAttribute.class);
PositionIncrementAttribute posIncAtt = filter.getAttribute(PositionIncrementAttribute.class);
FlagsAttribute flagsAtt = filter.getAttribute(FlagsAttribute.class);
filter.incrementToken();
assertEquals("accent", termAtt.term());
assertEquals(2, offsetAtt.startOffset());
assertEquals(7, offsetAtt.endOffset());
assertEquals("wrd", typeAtt.type());
assertEquals(3, posIncAtt.getPositionIncrement());
assertEquals(77, flagsAtt.getFlags());
assertEquals(new Payload(new byte[]{0,1,2,3}), payloadAtt.getPayload());
}
private final class TestTokenStream extends TokenStream {
private TermAttribute termAtt;
private OffsetAttribute offsetAtt;
private TypeAttribute typeAtt;
private PayloadAttribute payloadAtt;
private PositionIncrementAttribute posIncAtt;
private FlagsAttribute flagsAtt;
TestTokenStream() {
super();
termAtt = addAttribute(TermAttribute.class);
offsetAtt = addAttribute(OffsetAttribute.class);
typeAtt = addAttribute(TypeAttribute.class);
payloadAtt = addAttribute(PayloadAttribute.class);
posIncAtt = addAttribute(PositionIncrementAttribute.class);
flagsAtt = addAttribute(FlagsAttribute.class);
}
@Override
public boolean incrementToken() {
clearAttributes();
termAtt.setTermBuffer("accents");
offsetAtt.setOffset(2, 7);
typeAtt.setType("wrd");
posIncAtt.setPositionIncrement(3);
payloadAtt.setPayload(new Payload(new byte[]{0,1,2,3}));
flagsAtt.setFlags(77);
return true;
}
}
}

View File

@ -0,0 +1,102 @@
package org.apache.lucene.analysis.snowball;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.StringReader;
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.analysis.KeywordTokenizer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
/**
* Test the snowball filters against the snowball data tests
*/
public class TestSnowballVocab extends BaseTokenStreamTestCase {
private Tokenizer tokenizer = new KeywordTokenizer(new StringReader(""));
static final File dataDir = new File(System.getProperty("dataDir", "./bin"));
static final File dataRoot = new File(dataDir,
"org/apache/lucene/analysis/snowball/data");
/**
* Run all languages against their snowball vocabulary tests.
*/
public void testStemmers() throws IOException {
if (!dataRoot.exists()) {
System.err.println("WARN: This test was disabled, as the svn checkout of snowball test files is not supported on your system!");
return;
}
assertCorrectOutput("Danish", "danish");
assertCorrectOutput("Dutch", "dutch");
assertCorrectOutput("English", "english");
// disabled due to snowball java code generation bug:
// see http://article.gmane.org/gmane.comp.search.snowball/1139
// assertCorrectOutput("Finnish", "finnish");
assertCorrectOutput("French", "french");
assertCorrectOutput("German", "german");
assertCorrectOutput("German2", "german2");
assertCorrectOutput("Hungarian", "hungarian");
assertCorrectOutput("Italian", "italian");
assertCorrectOutput("Kp", "kraaij_pohlmann");
// disabled due to snowball java code generation bug:
// see http://article.gmane.org/gmane.comp.search.snowball/1139
// assertCorrectOutput("Lovins", "lovins");
assertCorrectOutput("Norwegian", "norwegian");
assertCorrectOutput("Porter", "porter");
assertCorrectOutput("Portuguese", "portuguese");
assertCorrectOutput("Romanian", "romanian");
assertCorrectOutput("Russian", "russian");
assertCorrectOutput("Spanish", "spanish");
assertCorrectOutput("Swedish", "swedish");
assertCorrectOutput("Turkish", "turkish");
}
/**
* For the supplied language, run the stemmer against all strings in voc.txt
* The output should be the same as the string in output.txt
*/
private void assertCorrectOutput(String snowballLanguage, String dataDirectory)
throws IOException {
System.err.println("checking snowball language: " + snowballLanguage);
TokenStream filter = new SnowballFilter(tokenizer, snowballLanguage);
InputStream vocFile = new FileInputStream(new File(dataRoot,
dataDirectory + "/voc.txt"));
InputStream outputFile = new FileInputStream(new File(dataRoot,
dataDirectory + "/output.txt"));
BufferedReader vocReader = new BufferedReader(new InputStreamReader(
vocFile, "UTF-8"));
BufferedReader outputReader = new BufferedReader(new InputStreamReader(
outputFile, "UTF-8"));
String inputWord = null;
while ((inputWord = vocReader.readLine()) != null) {
String expectedWord = outputReader.readLine();
assertNotNull(expectedWord);
tokenizer.reset(new StringReader(inputWord));
filter.reset();
assertTokenStreamContents(filter, new String[] {expectedWord});
}
vocReader.close();
outputReader.close();
}
}
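For reference, the Snowball test data pairs files line by line: each line of voc.txt is an input word and the same line of output.txt is its expected stem, so the English data would, for instance, pair a voc.txt entry like "running" with an output.txt entry "run" (illustrative values, not taken from this commit).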

View File

@ -0,0 +1,17 @@
<?xml version="1.0"?>
<document>
<properties>
<title>Overview - Snowball Stemmers for Lucene</title>
</properties>
<body>
<section name="Snowball Stemmers for Lucene">
<p>
This project provides pre-compiled versions of the Snowball stemmers
together with classes integrating them with the Lucene search engine.
</p>
</section>
</body>
</document>
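As a minimal sketch of that integration (not part of this commit; the input word and the printed result are only illustrative, chosen to match the English analyzer tests above), a token stream can be stemmed by wrapping it in the SnowballFilter:

import java.io.StringReader;

import org.apache.lucene.analysis.KeywordTokenizer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.snowball.SnowballFilter;
import org.apache.lucene.analysis.tokenattributes.TermAttribute;

public class SnowballSketch {
  public static void main(String[] args) throws Exception {
    // Stem one keyword token with the English Snowball stemmer.
    TokenStream ts = new SnowballFilter(
        new KeywordTokenizer(new StringReader("abhorred")), "English");
    TermAttribute termAtt = ts.getAttribute(TermAttribute.class);
    while (ts.incrementToken()) {
      System.out.println(termAtt.term()); // expected: "abhor", as in the reuse test above
    }
    ts.close();
  }
}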

View File

@ -0,0 +1,32 @@
<?xml version="1.0" encoding="ISO-8859-1"?>
<project name="Snowball Stemers for Lucene"
href="http://jakarta.apache.org/lucene-sandbox/snowball/">
<title>Snowball Stemmers for Lucene</title>
<body>
<menu name="Documentation">
<item name="Javadoc" href="/api/index.html"/>
</menu>
<menu name="Download">
<item name="Releases"
href="http://jakarta.apache.org/builds/jakarta-lucene-sandbox/snowball/"/>
<item name="CVS Repository" href="/site/cvsindex.html"/>
</menu>
<menu name="Links">
<item name="Snowball Home" href="http://snowball.tartarus.org/"/>
<item name="Lucene Home" href="http://jakarta.apache.org/lucene/"/>
<item name="Lucene Sandbox"
href="http://jakarta.apache.org/lucene/docs/lucene-sandbox/"/>
</menu>
<menu name="Jakarta">
<item name="Get Involved" href="/site/getinvolved.html"/>
<item name="Acknowledgements" href="/site/acknowledgements.html"/>
<item name="Contact" href="/site/contact.html"/>
<item name="Legal" href="/site/legal.html"/>
</menu>
</body>
</project>

View File

@ -1,10 +1,5 @@
package org.apache.lucene.analysis;
import java.io.IOException;
import org.apache.lucene.analysis.tokenattributes.TermAttribute;
import org.apache.lucene.util.ArrayUtil;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
@ -22,6 +17,12 @@ import org.apache.lucene.util.ArrayUtil;
* limitations under the License.
*/
import java.io.IOException;
import org.apache.lucene.analysis.tokenattributes.TermAttribute;
import org.apache.lucene.util.ArrayUtil;
import org.apache.lucene.util.RamUsageEstimator;
/**
* This class converts alphabetic, numeric, and symbolic Unicode characters
* which are not in the first 127 ASCII characters (the "Basic Latin" Unicode
@ -101,7 +102,7 @@ public final class ASCIIFoldingFilter extends TokenFilter {
// Worst-case length required:
final int maxSizeNeeded = 4 * length;
if (output.length < maxSizeNeeded) {
output = new char[ArrayUtil.getNextSize(maxSizeNeeded)];
output = new char[ArrayUtil.oversize(maxSizeNeeded, RamUsageEstimator.NUM_BYTES_CHAR)];
}
outputPos = 0;

View File

@ -29,6 +29,7 @@ import org.apache.lucene.util.ArrayUtil;
import org.apache.lucene.util.Attribute;
import org.apache.lucene.util.AttributeSource;
import org.apache.lucene.util.AttributeImpl;
import org.apache.lucene.util.RamUsageEstimator;
/**
A Token is an occurrence of a term from the text of a field. It consists of
@ -347,12 +348,12 @@ public class Token extends AttributeImpl
public char[] resizeTermBuffer(int newSize) {
if (termBuffer == null) {
// The buffer is always at least MIN_BUFFER_SIZE
termBuffer = new char[ArrayUtil.getNextSize(newSize < MIN_BUFFER_SIZE ? MIN_BUFFER_SIZE : newSize)];
termBuffer = new char[ArrayUtil.oversize(newSize < MIN_BUFFER_SIZE ? MIN_BUFFER_SIZE : newSize, RamUsageEstimator.NUM_BYTES_CHAR)];
} else {
if(termBuffer.length < newSize){
// Not big enough; create a new array with slight
// over allocation and preserve content
final char[] newCharBuffer = new char[ArrayUtil.getNextSize(newSize)];
final char[] newCharBuffer = new char[ArrayUtil.oversize(newSize, RamUsageEstimator.NUM_BYTES_CHAR)];
System.arraycopy(termBuffer, 0, newCharBuffer, 0, termBuffer.length);
termBuffer = newCharBuffer;
}
@ -367,19 +368,19 @@ public class Token extends AttributeImpl
private void growTermBuffer(int newSize) {
if (termBuffer == null) {
// The buffer is always at least MIN_BUFFER_SIZE
termBuffer = new char[ArrayUtil.getNextSize(newSize < MIN_BUFFER_SIZE ? MIN_BUFFER_SIZE : newSize)];
termBuffer = new char[ArrayUtil.oversize(newSize < MIN_BUFFER_SIZE ? MIN_BUFFER_SIZE : newSize, RamUsageEstimator.NUM_BYTES_CHAR)];
} else {
if(termBuffer.length < newSize){
// Not big enough; create a new array with slight
// over allocation:
termBuffer = new char[ArrayUtil.getNextSize(newSize)];
termBuffer = new char[ArrayUtil.oversize(newSize, RamUsageEstimator.NUM_BYTES_CHAR)];
}
}
}
private void initTermBuffer() {
if (termBuffer == null) {
termBuffer = new char[ArrayUtil.getNextSize(MIN_BUFFER_SIZE)];
termBuffer = new char[ArrayUtil.oversize(MIN_BUFFER_SIZE, RamUsageEstimator.NUM_BYTES_CHAR)];
termLength = 0;
}
}

View File

@ -21,6 +21,7 @@ import java.io.Serializable;
import org.apache.lucene.util.ArrayUtil;
import org.apache.lucene.util.AttributeImpl;
import org.apache.lucene.util.RamUsageEstimator;
/**
* The term text of a Token.
@ -106,12 +107,12 @@ public class TermAttributeImpl extends AttributeImpl implements TermAttribute, C
public char[] resizeTermBuffer(int newSize) {
if (termBuffer == null) {
// The buffer is always at least MIN_BUFFER_SIZE
termBuffer = new char[ArrayUtil.getNextSize(newSize < MIN_BUFFER_SIZE ? MIN_BUFFER_SIZE : newSize)];
termBuffer = new char[ArrayUtil.oversize(newSize < MIN_BUFFER_SIZE ? MIN_BUFFER_SIZE : newSize, RamUsageEstimator.NUM_BYTES_CHAR)];
} else {
if(termBuffer.length < newSize){
// Not big enough; create a new array with slight
// over allocation and preserve content
final char[] newCharBuffer = new char[ArrayUtil.getNextSize(newSize)];
final char[] newCharBuffer = new char[ArrayUtil.oversize(newSize, RamUsageEstimator.NUM_BYTES_CHAR)];
System.arraycopy(termBuffer, 0, newCharBuffer, 0, termBuffer.length);
termBuffer = newCharBuffer;
}
@ -127,19 +128,19 @@ public class TermAttributeImpl extends AttributeImpl implements TermAttribute, C
private void growTermBuffer(int newSize) {
if (termBuffer == null) {
// The buffer is always at least MIN_BUFFER_SIZE
termBuffer = new char[ArrayUtil.getNextSize(newSize < MIN_BUFFER_SIZE ? MIN_BUFFER_SIZE : newSize)];
termBuffer = new char[ArrayUtil.oversize(newSize < MIN_BUFFER_SIZE ? MIN_BUFFER_SIZE : newSize, RamUsageEstimator.NUM_BYTES_CHAR)];
} else {
if(termBuffer.length < newSize){
// Not big enough; create a new array with slight
// over allocation:
termBuffer = new char[ArrayUtil.getNextSize(newSize)];
termBuffer = new char[ArrayUtil.oversize(newSize, RamUsageEstimator.NUM_BYTES_CHAR)];
}
}
}
private void initTermBuffer() {
if (termBuffer == null) {
termBuffer = new char[ArrayUtil.getNextSize(MIN_BUFFER_SIZE)];
termBuffer = new char[ArrayUtil.oversize(MIN_BUFFER_SIZE, RamUsageEstimator.NUM_BYTES_CHAR)];
termLength = 0;
}
}

View File

@ -25,6 +25,7 @@ import java.util.HashSet;
import java.io.IOException;
import org.apache.lucene.util.ArrayUtil;
import org.apache.lucene.util.RamUsageEstimator;
/** This is just a "splitter" class: it lets you wrap two
* DocFieldConsumer instances as a single consumer. */
@ -117,7 +118,7 @@ final class DocFieldConsumers extends DocFieldConsumer {
// enough space to recycle all outstanding PerDoc
// instances
assert allocCount == 1+docFreeList.length;
docFreeList = new PerDoc[ArrayUtil.getNextSize(allocCount)];
docFreeList = new PerDoc[ArrayUtil.oversize(allocCount, RamUsageEstimator.NUM_BYTES_OBJECT_REF)];
}
return new PerDoc();
} else

View File

@ -24,6 +24,7 @@ import java.io.IOException;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Fieldable;
import org.apache.lucene.util.ArrayUtil;
import org.apache.lucene.util.RamUsageEstimator;
/**
* Gathers all Fieldables for a document under the same
@ -340,7 +341,7 @@ final class DocFieldProcessorPerThread extends DocConsumerPerThread {
// enough space to recycle all outstanding PerDoc
// instances
assert allocCount == 1+docFreeList.length;
docFreeList = new PerDoc[ArrayUtil.getNextSize(allocCount)];
docFreeList = new PerDoc[ArrayUtil.oversize(allocCount, RamUsageEstimator.NUM_BYTES_OBJECT_REF)];
}
return new PerDoc();
} else

View File

@ -40,6 +40,7 @@ import org.apache.lucene.store.Directory;
import org.apache.lucene.util.ArrayUtil;
import org.apache.lucene.util.Constants;
import org.apache.lucene.util.ThreadInterruptedException;
import org.apache.lucene.util.RamUsageEstimator;
/**
* This class accepts multiple added documents and directly
@ -1503,7 +1504,7 @@ final class DocumentsWriter {
int gap = doc.docID - nextWriteDocID;
if (gap >= waiting.length) {
// Grow queue
DocWriter[] newArray = new DocWriter[ArrayUtil.getNextSize(gap)];
DocWriter[] newArray = new DocWriter[ArrayUtil.oversize(gap, RamUsageEstimator.NUM_BYTES_OBJECT_REF)];
assert nextWriteLoc >= 0;
System.arraycopy(waiting, nextWriteLoc, newArray, 0, waiting.length-nextWriteLoc);
System.arraycopy(waiting, 0, newArray, waiting.length-nextWriteLoc, nextWriteLoc);

View File

@ -20,6 +20,7 @@ package org.apache.lucene.index;
import java.io.IOException;
import org.apache.lucene.util.ArrayUtil;
import org.apache.lucene.util.RamUsageEstimator;
/**
* NOTE: this API is experimental and will likely change
@ -35,7 +36,7 @@ abstract class FormatPostingsTermsConsumer {
FormatPostingsDocsConsumer addTerm(String text) throws IOException {
final int len = text.length();
if (termBuffer == null || termBuffer.length < 1+len)
termBuffer = new char[ArrayUtil.getNextSize(1+len)];
termBuffer = new char[ArrayUtil.oversize(1+len, RamUsageEstimator.NUM_BYTES_CHAR)];
text.getChars(0, len, termBuffer, 0);
termBuffer[len] = 0xffff;
return addTerm(termBuffer, 0);

View File

@ -63,11 +63,13 @@ final class NormsWriterPerField extends InvertedDocEndConsumerPerField implement
@Override
void finish() {
assert docIDs.length == norms.length;
if (fieldInfo.isIndexed && !fieldInfo.omitNorms) {
if (docIDs.length <= upto) {
assert docIDs.length == upto;
docIDs = ArrayUtil.grow(docIDs, 1+upto);
}
if (norms.length <= upto) {
assert norms.length == upto;
norms = ArrayUtil.grow(norms, 1+upto);
}
final float norm = docState.similarity.computeNorm(fieldInfo.name, fieldState);

View File

@ -20,6 +20,7 @@ package org.apache.lucene.index;
import java.io.IOException;
import org.apache.lucene.store.RAMOutputStream;
import org.apache.lucene.util.ArrayUtil;
import org.apache.lucene.util.RamUsageEstimator;
/** This is a DocFieldConsumer that writes stored fields. */
final class StoredFieldsWriter {
@ -108,7 +109,7 @@ final class StoredFieldsWriter {
// enough space to recycle all outstanding PerDoc
// instances
assert allocCount == 1+docFreeList.length;
docFreeList = new PerDoc[ArrayUtil.getNextSize(allocCount)];
docFreeList = new PerDoc[ArrayUtil.oversize(allocCount, RamUsageEstimator.NUM_BYTES_OBJECT_REF)];
}
return new PerDoc();
} else

View File

@ -20,6 +20,7 @@ package org.apache.lucene.index;
import org.apache.lucene.store.IndexOutput;
import org.apache.lucene.store.RAMOutputStream;
import org.apache.lucene.util.ArrayUtil;
import org.apache.lucene.util.RamUsageEstimator;
import java.io.IOException;
import java.util.Collection;
@ -117,7 +118,7 @@ final class TermVectorsTermsWriter extends TermsHashConsumer {
// enough space to recycle all outstanding PerDoc
// instances
assert allocCount == 1+docFreeList.length;
docFreeList = new PerDoc[ArrayUtil.getNextSize(allocCount)];
docFreeList = new PerDoc[ArrayUtil.oversize(allocCount, RamUsageEstimator.NUM_BYTES_OBJECT_REF)];
}
return new PerDoc();
} else
@ -266,6 +267,8 @@ final class TermVectorsTermsWriter extends TermsHashConsumer {
void addField(final int fieldNumber) {
if (numVectorFields == fieldNumbers.length) {
fieldNumbers = ArrayUtil.grow(fieldNumbers);
}
if (numVectorFields == fieldPointers.length) {
fieldPointers = ArrayUtil.grow(fieldPointers);
}
fieldNumbers[numVectorFields] = fieldNumber;

View File

@ -26,6 +26,7 @@ import java.util.Arrays;
import java.io.IOException;
import org.apache.lucene.util.ArrayUtil;
import org.apache.lucene.util.RamUsageEstimator;
/** This class implements {@link InvertedDocConsumer}, which
* is passed each token produced by the analyzer on each
@ -89,7 +90,7 @@ final class TermsHash extends InvertedDocConsumer {
assert postingsFreeCount == postingsAllocCount: Thread.currentThread().getName() + ": postingsFreeCount=" + postingsFreeCount + " postingsAllocCount=" + postingsAllocCount + " consumer=" + consumer;
final int newSize = ArrayUtil.getShrinkSize(postingsFreeList.length, postingsAllocCount);
final int newSize = ArrayUtil.getShrinkSize(postingsFreeList.length, postingsAllocCount, RamUsageEstimator.NUM_BYTES_OBJECT_REF);
if (newSize != postingsFreeList.length) {
RawPostingList[] newArray = new RawPostingList[newSize];
System.arraycopy(postingsFreeList, 0, newArray, 0, postingsFreeCount);
@ -222,7 +223,7 @@ final class TermsHash extends InvertedDocConsumer {
if (newPostingsAllocCount > postingsFreeList.length)
// Pre-allocate the postingsFreeList so it's large
// enough to hold all postings we've given out
postingsFreeList = new RawPostingList[ArrayUtil.getNextSize(newPostingsAllocCount)];
postingsFreeList = new RawPostingList[ArrayUtil.oversize(newPostingsAllocCount, RamUsageEstimator.NUM_BYTES_OBJECT_REF)];
}
postingsFreeCount -= numToCopy;

View File

@ -7,9 +7,9 @@ package org.apache.lucene.util;
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
* <p/>
* http://www.apache.org/licenses/LICENSE-2.0
* <p/>
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@ -122,20 +122,95 @@ public final class ArrayUtil {
END APACHE HARMONY CODE
*/
/** Returns an array size >= minTargetSize, generally
* over-allocating exponentially to achieve amortized
* linear-time cost as the array grows.
*
* NOTE: this was originally borrowed from Python 2.4.2
* listobject.c sources (attribution in LICENSE.txt), but
* has now been substantially changed based on
* discussions from java-dev thread with subject "Dynamic
* array reallocation algorithms", started on Jan 12
* 2010.
*
* @param minTargetSize Minimum required value to be returned.
* @param bytesPerElement Bytes used by each element of
* the array. See constants in {@link RamUsageEstimator}.
*
* @lucene.internal
*/
public static int getNextSize(int targetSize) {
/* This over-allocates proportional to the list size, making room
* for additional growth. The over-allocation is mild, but is
* enough to give linear-time amortized behavior over a long
* sequence of appends() in the presence of a poorly-performing
* system realloc().
* The growth pattern is: 0, 4, 8, 16, 25, 35, 46, 58, 72, 88, ...
*/
return (targetSize >> 3) + (targetSize < 9 ? 3 : 6) + targetSize;
public static int oversize(int minTargetSize, int bytesPerElement) {
if (minTargetSize < 0) {
// catch usage that accidentally overflows int
throw new IllegalArgumentException("invalid array size " + minTargetSize);
}
if (minTargetSize == 0) {
// wait until at least one element is requested
return 0;
}
// asymptotic exponential growth by 1/8th, favors
// spending a bit more CPU to not tie up too much wasted
// RAM:
int extra = minTargetSize >> 3;
if (extra < 3) {
// for very small arrays, where constant overhead of
// realloc is presumably relatively high, we grow
// faster
extra = 3;
}
int newSize = minTargetSize + extra;
// add 7 to allow for worst case byte alignment addition below:
if (newSize+7 < 0) {
// int overflowed -- return max allowed array size
return Integer.MAX_VALUE;
}
if (Constants.JRE_IS_64BIT) {
// round up to 8 byte alignment in 64bit env
switch(bytesPerElement) {
case 4:
// round up to multiple of 2
return (newSize + 1) & 0x7ffffffe;
case 2:
// round up to multiple of 4
return (newSize + 3) & 0x7ffffffc;
case 1:
// round up to multiple of 8
return (newSize + 7) & 0x7ffffff8;
case 8:
// no rounding
default:
// odd (invalid?) size
return newSize;
}
} else {
// round up to 4 byte alignment in 32bit env
switch(bytesPerElement) {
case 2:
// round up to multiple of 2
return (newSize + 1) & 0x7ffffffe;
case 1:
// round up to multiple of 4
return (newSize + 3) & 0x7ffffffc;
case 4:
case 8:
// no rounding
default:
// odd (invalid?) size
return newSize;
}
}
}
public static int getShrinkSize(int currentSize, int targetSize) {
final int newSize = getNextSize(targetSize);
public static int getShrinkSize(int currentSize, int targetSize, int bytesPerElement) {
final int newSize = oversize(targetSize, bytesPerElement);
// Only reallocate if we are "substantially" smaller.
// This saves us from "running hot" (constantly making a
// bit bigger then a bit smaller, over and over):
@ -147,7 +222,7 @@ public final class ArrayUtil {
public static int[] grow(int[] array, int minSize) {
if (array.length < minSize) {
int[] newArray = new int[getNextSize(minSize)];
int[] newArray = new int[oversize(minSize, RamUsageEstimator.NUM_BYTES_INT)];
System.arraycopy(array, 0, newArray, 0, array.length);
return newArray;
} else
@ -159,7 +234,7 @@ public final class ArrayUtil {
}
public static int[] shrink(int[] array, int targetSize) {
final int newSize = getShrinkSize(array.length, targetSize);
final int newSize = getShrinkSize(array.length, targetSize, RamUsageEstimator.NUM_BYTES_INT);
if (newSize != array.length) {
int[] newArray = new int[newSize];
System.arraycopy(array, 0, newArray, 0, newSize);
@ -170,7 +245,7 @@ public final class ArrayUtil {
public static long[] grow(long[] array, int minSize) {
if (array.length < minSize) {
long[] newArray = new long[getNextSize(minSize)];
long[] newArray = new long[oversize(minSize, RamUsageEstimator.NUM_BYTES_LONG)];
System.arraycopy(array, 0, newArray, 0, array.length);
return newArray;
} else
@ -182,7 +257,7 @@ public final class ArrayUtil {
}
public static long[] shrink(long[] array, int targetSize) {
final int newSize = getShrinkSize(array.length, targetSize);
final int newSize = getShrinkSize(array.length, targetSize, RamUsageEstimator.NUM_BYTES_LONG);
if (newSize != array.length) {
long[] newArray = new long[newSize];
System.arraycopy(array, 0, newArray, 0, newSize);
@ -193,7 +268,7 @@ public final class ArrayUtil {
public static byte[] grow(byte[] array, int minSize) {
if (array.length < minSize) {
byte[] newArray = new byte[getNextSize(minSize)];
byte[] newArray = new byte[oversize(minSize, 1)];
System.arraycopy(array, 0, newArray, 0, array.length);
return newArray;
} else
@ -205,7 +280,7 @@ public final class ArrayUtil {
}
public static byte[] shrink(byte[] array, int targetSize) {
final int newSize = getShrinkSize(array.length, targetSize);
final int newSize = getShrinkSize(array.length, targetSize, 1);
if (newSize != array.length) {
byte[] newArray = new byte[newSize];
System.arraycopy(array, 0, newArray, 0, newSize);
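A minimal sketch of the caller pattern the rest of this commit migrates to (the helper class and method names below are hypothetical; ArrayUtil.oversize and the RamUsageEstimator constants are the ones introduced in this change):

import org.apache.lucene.util.ArrayUtil;
import org.apache.lucene.util.RamUsageEstimator;

class OversizeSketch {
  // Hypothetical grow-on-demand helper mirroring the pattern used throughout this commit.
  static char[] ensureCapacity(char[] current, int minLength) {
    if (current.length < minLength) {
      // Over-allocate so a long run of appends stays amortized linear-time; the
      // bytesPerElement argument lets oversize() round the new length to the JVM's alignment.
      char[] next = new char[ArrayUtil.oversize(minLength, RamUsageEstimator.NUM_BYTES_CHAR)];
      System.arraycopy(current, 0, next, 0, current.length);
      return next;
    }
    return current;
  }
}

On a 64-bit JRE, for example, oversize(100, NUM_BYTES_CHAR) adds 100 >> 3 = 12 elements and then rounds 112 up to a multiple of four chars (eight bytes), returning 112; oversize(0, ...) returns 0, and requests near Integer.MAX_VALUE are capped at Integer.MAX_VALUE, as the new TestArrayUtil below verifies.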

View File

@ -43,6 +43,14 @@ public final class RamUsageEstimator {
private int arraySize;
private int classSize;
public final static int NUM_BYTES_OBJECT_REF = Constants.JRE_IS_64BIT ? 8 : 4;
public final static int NUM_BYTES_CHAR = 2;
public final static int NUM_BYTES_SHORT = 2;
public final static int NUM_BYTES_INT = 4;
public final static int NUM_BYTES_LONG = 8;
public final static int NUM_BYTES_FLOAT = 4;
public final static int NUM_BYTES_DOUBLE = 8;
private boolean checkInterned;
/**

View File

@ -85,7 +85,6 @@ public class TestToken extends LuceneTestCase {
buf.append(buf.toString());
}
assertEquals(1048576, t.termLength());
assertEquals(1179654, t.termBuffer().length);
// now as a string, first variant
t = new Token();
@ -99,7 +98,6 @@ public class TestToken extends LuceneTestCase {
buf.append(content);
}
assertEquals(1048576, t.termLength());
assertEquals(1179654, t.termBuffer().length);
// now as a string, second variant
t = new Token();
@ -113,7 +111,6 @@ public class TestToken extends LuceneTestCase {
buf.append(content);
}
assertEquals(1048576, t.termLength());
assertEquals(1179654, t.termBuffer().length);
// Test for slow growth to a long term
t = new Token();
@ -127,7 +124,6 @@ public class TestToken extends LuceneTestCase {
buf.append("a");
}
assertEquals(20000, t.termLength());
assertEquals(20167, t.termBuffer().length);
// Test for slow growth to a long term
t = new Token();
@ -141,7 +137,6 @@ public class TestToken extends LuceneTestCase {
buf.append("a");
}
assertEquals(20000, t.termLength());
assertEquals(20167, t.termBuffer().length);
}
public void testToString() throws Exception {

View File

@ -49,7 +49,6 @@ public class TestTermAttributeImpl extends LuceneTestCase {
buf.append(buf.toString());
}
assertEquals(1048576, t.termLength());
assertEquals(1179654, t.termBuffer().length);
// now as a string, first variant
t = new TermAttributeImpl();
@ -63,7 +62,6 @@ public class TestTermAttributeImpl extends LuceneTestCase {
buf.append(content);
}
assertEquals(1048576, t.termLength());
assertEquals(1179654, t.termBuffer().length);
// now as a string, second variant
t = new TermAttributeImpl();
@ -77,7 +75,6 @@ public class TestTermAttributeImpl extends LuceneTestCase {
buf.append(content);
}
assertEquals(1048576, t.termLength());
assertEquals(1179654, t.termBuffer().length);
// Test for slow growth to a long term
t = new TermAttributeImpl();
@ -91,7 +88,6 @@ public class TestTermAttributeImpl extends LuceneTestCase {
buf.append("a");
}
assertEquals(20000, t.termLength());
assertEquals(20167, t.termBuffer().length);
// Test for slow growth to a long term
t = new TermAttributeImpl();
@ -105,7 +101,6 @@ public class TestTermAttributeImpl extends LuceneTestCase {
buf.append("a");
}
assertEquals(20000, t.termLength());
assertEquals(20167, t.termBuffer().length);
}
public void testToString() throws Exception {

View File

@ -0,0 +1,59 @@
package org.apache.lucene.util;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.util.Random;
public class TestArrayUtil extends LuceneTestCase {
// Ensure ArrayUtil.oversize gives linear amortized cost of realloc/copy
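// With each reallocation at least 1/8th larger, the previous sizes form a
// geometric series summing to roughly 8x the current size, which keeps the
// per-element copy cost asserted below well under the 10.0 bound.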
public void testGrowth() {
int currentSize = 0;
long copyCost = 0;
// Make sure ArrayUtil hits Integer.MAX_VALUE, if we insist:
while(currentSize != Integer.MAX_VALUE) {
int nextSize = ArrayUtil.oversize(1+currentSize, RamUsageEstimator.NUM_BYTES_OBJECT_REF);
assertTrue(nextSize > currentSize);
if (currentSize > 0) {
copyCost += currentSize;
double copyCostPerElement = ((double) copyCost)/currentSize;
assertTrue("cost " + copyCostPerElement, copyCostPerElement < 10.0);
}
currentSize = nextSize;
}
}
public void testMaxSize() {
// intentionally pass invalid elemSizes:
for(int elemSize=0;elemSize<10;elemSize++) {
assertEquals(Integer.MAX_VALUE, ArrayUtil.oversize(Integer.MAX_VALUE, elemSize));
assertEquals(Integer.MAX_VALUE, ArrayUtil.oversize(Integer.MAX_VALUE-1, elemSize));
}
}
public void testInvalidElementSizes() {
final Random r = newRandom();
for(int iter=0;iter<10000;iter++) {
final int minTargetSize = r.nextInt(Integer.MAX_VALUE);
final int elemSize = r.nextInt(11);
final int v = ArrayUtil.oversize(minTargetSize, elemSize);
assertTrue(v >= minTargetSize);
}
}
}

View File

@ -164,14 +164,14 @@ public class TestIndexableBinaryStringTools extends LuceneTestCase {
int encodedLen1 = IndexableBinaryStringTools.getEncodedLength(
originalArray1, 0, numBytes1);
if (encodedLen1 > encoded1.length)
encoded1 = new char[ArrayUtil.getNextSize(encodedLen1)];
encoded1 = new char[ArrayUtil.oversize(encodedLen1, RamUsageEstimator.NUM_BYTES_CHAR)];
IndexableBinaryStringTools.encode(originalArray1, 0, numBytes1, encoded1,
0, encodedLen1);
int encodedLen2 = IndexableBinaryStringTools.getEncodedLength(original2,
0, numBytes2);
if (encodedLen2 > encoded2.length)
encoded2 = new char[ArrayUtil.getNextSize(encodedLen2)];
encoded2 = new char[ArrayUtil.oversize(encodedLen2, RamUsageEstimator.NUM_BYTES_CHAR)];
IndexableBinaryStringTools.encode(original2, 0, numBytes2, encoded2, 0,
encodedLen2);
@ -308,7 +308,7 @@ public class TestIndexableBinaryStringTools extends LuceneTestCase {
int encodedLen = IndexableBinaryStringTools.getEncodedLength(binary, 0,
numBytes);
if (encoded.length < encodedLen)
encoded = new char[ArrayUtil.getNextSize(encodedLen)];
encoded = new char[ArrayUtil.oversize(encodedLen, RamUsageEstimator.NUM_BYTES_CHAR)];
IndexableBinaryStringTools.encode(binary, 0, numBytes, encoded, 0,
encodedLen);