mirror of https://github.com/apache/lucene.git
LUCENE-2213: rename ArrayUtil.getNextSize -> oversize; tweak how it picks the next size
git-svn-id: https://svn.apache.org/repos/asf/lucene/java/trunk@901662 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent
94826b348c
commit
9b3b890f45
|
@ -35,6 +35,7 @@ package org.tartarus.snowball;
|
|||
import java.lang.reflect.InvocationTargetException;
|
||||
|
||||
import org.apache.lucene.util.ArrayUtil;
|
||||
import org.apache.lucene.util.RamUsageEstimator;
|
||||
|
||||
/**
|
||||
* This is the rev 502 of the Snowball SVN trunk,
|
||||
|
@ -432,7 +433,7 @@ public abstract class SnowballProgram {
|
|||
final int newLength = limit + adjustment;
|
||||
//resize if necessary
|
||||
if (newLength > current.length) {
|
||||
char newBuffer[] = new char[ArrayUtil.getNextSize(newLength)];
|
||||
char newBuffer[] = new char[ArrayUtil.oversize(newLength, RamUsageEstimator.NUM_BYTES_CHAR)];
|
||||
System.arraycopy(current, 0, newBuffer, 0, limit);
|
||||
current = newBuffer;
|
||||
}
|
||||
|
|
|
@ -0,0 +1,2 @@
|
|||
build
|
||||
snowball
|
|
@ -0,0 +1,16 @@
|
|||
/**
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
|
@ -0,0 +1,25 @@
|
|||
Lucene Snowball README file
|
||||
|
||||
This project provides pre-compiled versions of the Snowball stemmers
|
||||
based on revision 500 of the Tartarus Snowball repository,
|
||||
together with classes integrating them with the Lucene search engine.
|
||||
|
||||
A few changes have been made to the static Snowball code and compiled stemmers:
|
||||
|
||||
* Class SnowballProgram is made abstract and contains new abstract method stem() to avoid reflection in Lucene filter class SnowballFilter.
|
||||
* All use of StringBuffers has been refactored to StringBuilder for speed.
|
||||
* Snowball BSD license header has been added to the Java classes to avoid having RAT adding new ASL headers.
|
||||
|
||||
|
||||
IMPORTANT NOTICE ON BACKWARDS COMPATIBILITY!
|
||||
|
||||
An index created using the Snowball module in Lucene 2.3.2 and below
|
||||
might not be compatible with the Snowball module in Lucene 2.4 or greater.
|
||||
|
||||
For more information about this issue see:
|
||||
https://issues.apache.org/jira/browse/LUCENE-1142
|
||||
|
||||
|
||||
For more information on Snowball, see:
|
||||
http://snowball.tartarus.org/
|
||||
|
|
@ -0,0 +1,26 @@
|
|||
Copyright (c) 2001, Dr Martin Porter
|
||||
Copyright (c) 2002, Richard Boulton
|
||||
All rights reserved.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are met:
|
||||
|
||||
* Redistributions of source code must retain the above copyright notice,
|
||||
* this list of conditions and the following disclaimer.
|
||||
* Redistributions in binary form must reproduce the above copyright
|
||||
* notice, this list of conditions and the following disclaimer in the
|
||||
* documentation and/or other materials provided with the distribution.
|
||||
* Neither the name of the copyright holders nor the names of its contributors
|
||||
* may be used to endorse or promote products derived from this software
|
||||
* without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
|
||||
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE
|
||||
FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|
@ -0,0 +1 @@
|
|||
snowball
|
|
@ -0,0 +1,7 @@
|
|||
#!/bin/csh -f
|
||||
set infile = $1
|
||||
set outdir = $2
|
||||
|
||||
set name = $infile:h:t:uStemmer
|
||||
|
||||
exec $0:h/snowball $infile -o $outdir/$name -n $name -java
|
|
@ -0,0 +1,155 @@
|
|||
<?xml version="1.0"?>
|
||||
|
||||
<!--
|
||||
Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
contributor license agreements. See the NOTICE file distributed with
|
||||
this work for additional information regarding copyright ownership.
|
||||
The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
(the "License"); you may not use this file except in compliance with
|
||||
the License. You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software
|
||||
distributed under the License is distributed on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
See the License for the specific language governing permissions and
|
||||
limitations under the License.
|
||||
-->
|
||||
|
||||
<project name="snowball" default="default">
|
||||
|
||||
<description>
|
||||
Snowball Analyzers
|
||||
</description>
|
||||
|
||||
<import file="../contrib-build.xml"/>
|
||||
|
||||
<property name="snowball.cvsroot" value=":pserver:cvsuser@cvs.tartarus.org:/home/cvs"/>
|
||||
<property name="snowball.cvs.password" value="anonymous"/>
|
||||
<property name="snowball.root" value="snowball/website"/>
|
||||
<property name="bin.dir" location="bin"/>
|
||||
|
||||
<property name="analyzers.jar" location="${common.dir}/build/contrib/analyzers/common/lucene-analyzers-${version}.jar"/>
|
||||
<available property="analyzers.jar.present" type="file" file="${analyzers.jar}"/>
|
||||
|
||||
<path id="classpath">
|
||||
<pathelement path="${lucene.jar}"/>
|
||||
<pathelement path="${analyzers.jar}"/>
|
||||
<pathelement path="${project.classpath}"/>
|
||||
</path>
|
||||
|
||||
<target name="jar" depends="compile" description="Create JAR">
|
||||
<jarify>
|
||||
<metainf-includes>
|
||||
<metainf dir=".">
|
||||
<include name="SNOWBALL-LICENSE.txt"/>
|
||||
</metainf>
|
||||
</metainf-includes>
|
||||
</jarify>
|
||||
</target>
|
||||
|
||||
<target name="jar-src" depends="init"
|
||||
description="Packages the sources as JAR file">
|
||||
<jarify basedir="${src.dir}" destfile="${build.dir}/${final.name}-src.jar">
|
||||
<metainf-includes>
|
||||
<metainf dir=".">
|
||||
<include name="SNOWBALL-LICENSE.txt"/>
|
||||
</metainf>
|
||||
</metainf-includes>
|
||||
</jarify>
|
||||
</target>
|
||||
|
||||
|
||||
<!-- ====================================================== -->
|
||||
<!-- Download Snowball code -->
|
||||
<!-- ====================================================== -->
|
||||
<target name="download" depends="init">
|
||||
<cvs cvsRoot="${snowball.cvsroot}"
|
||||
package="${snowball.root}"
|
||||
passfile="snowball.cvspass"/>
|
||||
</target>
|
||||
|
||||
<target name="create-passfile">
|
||||
<cvspass cvsroot="${snowball.cvsroot}"
|
||||
password="${snowball.cvs.password}"
|
||||
passfile="snowball.cvspass"
|
||||
/>
|
||||
</target>
|
||||
|
||||
<!-- ====================================================== -->
|
||||
<!-- Compile Snowball C code -->
|
||||
<!-- ====================================================== -->
|
||||
<target name="compile-compiler" depends="download">
|
||||
<apply failonerror="true" executable="gcc" parallel="true">
|
||||
<arg value="-O"/>
|
||||
<arg value="-o"/>
|
||||
<arg value="${bin.dir}/snowball"/>
|
||||
<fileset dir="${snowball.root}/p" includes="*.c"/>
|
||||
</apply>
|
||||
</target>
|
||||
|
||||
<!-- ====================================================== -->
|
||||
<!-- Generate Java code -->
|
||||
<!-- ====================================================== -->
|
||||
<target name="generate" depends="compile-compiler">
|
||||
<apply failonerror="true" executable="${bin.dir}/snowball.sh">
|
||||
<srcfile/>
|
||||
<arg value="${src.dir}/net/sf/snowball/ext"/>
|
||||
<fileset dir="${snowball.root}" includes="**/stem.sbl"/>
|
||||
</apply>
|
||||
|
||||
<copy todir="${src.dir}/net">
|
||||
<fileset dir="${snowball.root}/net">
|
||||
<include name="**/*.java"/>
|
||||
</fileset>
|
||||
</copy>
|
||||
|
||||
</target>
|
||||
|
||||
|
||||
<target name="docs">
|
||||
<taskdef
|
||||
name="anakia"
|
||||
classname="org.apache.velocity.anakia.AnakiaTask"
|
||||
>
|
||||
<classpath refid="anakia.classpath"/>
|
||||
</taskdef>
|
||||
|
||||
<anakia
|
||||
basedir="${docs.src}"
|
||||
destdir="${docs.dest}/"
|
||||
extension=".html" style="./site.vsl"
|
||||
projectFile="stylesheets/project.xml"
|
||||
excludes="**/stylesheets/** empty.xml"
|
||||
includes="**/*.xml"
|
||||
lastModifiedCheck="true"
|
||||
templatePath="${jakarta.site2.home}/xdocs/stylesheets"
|
||||
>
|
||||
</anakia>
|
||||
|
||||
</target>
|
||||
|
||||
<target name="compile-core" depends="build-analyzers, common.compile-core" />
|
||||
<target name="compile-test" depends="download-vocab-tests, common.compile-test" />
|
||||
|
||||
<target name="build-analyzers" unless="analyzers.jar.present">
|
||||
<echo>Snowball building dependency ${analyzers.jar}</echo>
|
||||
<ant antfile="../analyzers/build.xml" target="default" inheritall="false" dir="../analyzers" />
|
||||
</target>
|
||||
|
||||
<property name="snowball.vocab.rev" value="500"/>
|
||||
<property name="snowball.vocab.url"
|
||||
value="svn://svn.tartarus.org/snowball/trunk/data"/>
|
||||
<property name="vocab.dir" value="src/test/org/apache/lucene/analysis/snowball"/>
|
||||
|
||||
<target name="download-vocab-tests" depends="compile-core"
|
||||
description="Downloads Snowball vocabulary tests">
|
||||
<sequential>
|
||||
<mkdir dir="${vocab.dir}"/>
|
||||
<exec dir="${vocab.dir}" executable="${svn.exe}" failifexecutionfails="false" failonerror="true">
|
||||
<arg line="checkout --trust-server-cert --non-interactive -r ${snowball.vocab.rev} ${snowball.vocab.url}"/>
|
||||
</exec>
|
||||
</sequential>
|
||||
</target>
|
||||
</project>
|
|
@ -0,0 +1,148 @@
|
|||
<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN" "http://www.w3.org/TR/html4/loose.dtd">
|
||||
|
||||
<!-- Content Stylesheet for Site -->
|
||||
|
||||
|
||||
<!-- start the processing -->
|
||||
<!-- ====================================================================== -->
|
||||
<!-- GENERATED FILE, DO NOT EDIT, EDIT THE XML FILE IN xdocs INSTEAD! -->
|
||||
<!-- Main Page Section -->
|
||||
<!-- ====================================================================== -->
|
||||
<html>
|
||||
<head>
|
||||
<meta http-equiv="Content-Type" content="text/html; charset=iso-8859-1"/>
|
||||
|
||||
<meta name="author" value="Doug Cutting">
|
||||
<meta name="email" value="cutting@apache.org">
|
||||
|
||||
|
||||
|
||||
|
||||
<title>Snowball Stemmers for Lucene - Overview - Snowball Stemmers for Lucene</title>
|
||||
</head>
|
||||
|
||||
<body bgcolor="#ffffff" text="#000000" link="#525D76">
|
||||
<table border="0" width="100%" cellspacing="0">
|
||||
<!-- TOP IMAGE -->
|
||||
<tr>
|
||||
<td colspan="2">
|
||||
<a href="http://jakarta.apache.org"><img src="http://jakarta.apache.org/images/jakarta-logo.gif" align="left" border="0"/></a>
|
||||
</td>
|
||||
</tr>
|
||||
</table>
|
||||
<table border="0" width="100%" cellspacing="4">
|
||||
<tr><td colspan="2">
|
||||
<hr noshade="" size="1"/>
|
||||
</td></tr>
|
||||
|
||||
<tr>
|
||||
<!-- LEFT SIDE NAVIGATION -->
|
||||
<td width="20%" valign="top" nowrap="true">
|
||||
|
||||
<!-- ============================================================ -->
|
||||
|
||||
<p><strong>Documentation</strong></p>
|
||||
<ul>
|
||||
<li> <a href="./api/index.html">Javadoc</a>
|
||||
</li>
|
||||
</ul>
|
||||
<p><strong>Download</strong></p>
|
||||
<ul>
|
||||
<li> <a href="http://jakarta.apache.org/builds/jakarta-lucene-sandbox/snowball/">Releases</a>
|
||||
</li>
|
||||
<li> <a href="http://jakarta.apache.org/site/cvsindex.html">CVS Repository</a>
|
||||
</li>
|
||||
</ul>
|
||||
<p><strong>Links</strong></p>
|
||||
<ul>
|
||||
<li> <a href="http://snowball.tartarus.org/">Snowball Home</a>
|
||||
</li>
|
||||
<li> <a href="http://jakarta.apache.org/lucene/">Lucene Home</a>
|
||||
</li>
|
||||
<li> <a href="http://jakarta.apache.org/lucene/docs/lucene-sandbox/">Lucene Sandbox</a>
|
||||
</li>
|
||||
</ul>
|
||||
<p><strong>Jakarta</strong></p>
|
||||
<ul>
|
||||
<li> <a href="http://jakarta.apache.org/site/getinvolved.html">Get Involved</a>
|
||||
</li>
|
||||
<li> <a href="http://jakarta.apache.org/site/acknowledgements.html">Acknowledgements</a>
|
||||
</li>
|
||||
<li> <a href="http://jakarta.apache.org/site/contact.html">Contact</a>
|
||||
</li>
|
||||
<li> <a href="http://jakarta.apache.org/site/legal.html">Legal</a>
|
||||
</li>
|
||||
</ul>
|
||||
</td>
|
||||
<td width="80%" align="left" valign="top">
|
||||
<table border="0" cellspacing="0" cellpadding="2" width="100%">
|
||||
<tr><td bgcolor="#525D76">
|
||||
<font color="#ffffff" face="arial,helvetica,sanserif">
|
||||
<a name="Snowball Stemmers for Lucene"><strong>Snowball Stemmers for Lucene</strong></a>
|
||||
</font>
|
||||
</td></tr>
|
||||
<tr><td>
|
||||
<blockquote>
|
||||
<p>
|
||||
This project provides pre-compiled version of the Snowball stemmers
|
||||
together with classes integrating them with the Lucene search engine.
|
||||
</p>
|
||||
</blockquote>
|
||||
</p>
|
||||
</td></tr>
|
||||
<tr><td><br/></td></tr>
|
||||
</table>
|
||||
<table border="0" cellspacing="0" cellpadding="2" width="100%">
|
||||
<tr><td bgcolor="#525D76">
|
||||
<font color="#ffffff" face="arial,helvetica,sanserif">
|
||||
<a name="Download"><strong>Download</strong></a>
|
||||
</font>
|
||||
</td></tr>
|
||||
<tr><td>
|
||||
<blockquote>
|
||||
<p>
|
||||
Releases of the stemmers are available
|
||||
<a href="http://jakarta.apache.org/builds/jakarta-lucene-sandbox/snowball/">
|
||||
here</a>
|
||||
</p>
|
||||
</blockquote>
|
||||
</p>
|
||||
</td></tr>
|
||||
<tr><td><br/></td></tr>
|
||||
</table>
|
||||
</td>
|
||||
</tr>
|
||||
|
||||
<!-- FOOTER -->
|
||||
<tr><td colspan="2">
|
||||
<hr noshade="" size="1"/>
|
||||
</td></tr>
|
||||
<tr><td colspan="2">
|
||||
<div align="center"><font color="#525D76" size="-1"><em>
|
||||
Copyright © 1999-2004, The Apache Software Foundation
|
||||
</em></font></div>
|
||||
</td></tr>
|
||||
</table>
|
||||
</body>
|
||||
</html>
|
||||
<!-- end the processing -->
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
|
@ -0,0 +1,43 @@
|
|||
<project xmlns="http://maven.apache.org/POM/4.0.0"
|
||||
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
|
||||
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/maven-v4_0_0.xsd">
|
||||
|
||||
<!--
|
||||
Licensed to the Apache Software Foundation (ASF) under one
|
||||
or more contributor license agreements. See the NOTICE file
|
||||
distributed with this work for additional information
|
||||
regarding copyright ownership. The ASF licenses this file
|
||||
to you under the Apache License, Version 2.0 (the
|
||||
"License"); you may not use this file except in compliance
|
||||
with the License. You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing,
|
||||
software distributed under the License is distributed on an
|
||||
"AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
KIND, either express or implied. See the License for the
|
||||
specific language governing permissions and limitations
|
||||
under the License.
|
||||
-->
|
||||
|
||||
<modelVersion>4.0.0</modelVersion>
|
||||
<parent>
|
||||
<groupId>org.apache.lucene</groupId>
|
||||
<artifactId>lucene-contrib</artifactId>
|
||||
<version>@version@</version>
|
||||
</parent>
|
||||
<groupId>org.apache.lucene</groupId>
|
||||
<artifactId>lucene-snowball</artifactId>
|
||||
<name>Lucene Snowball</name>
|
||||
<version>@version@</version>
|
||||
<description>Snowball Analyzers</description>
|
||||
<packaging>jar</packaging>
|
||||
<dependencies>
|
||||
<dependency>
|
||||
<groupId>org.apache.lucene</groupId>
|
||||
<artifactId>lucene-analyzers</artifactId>
|
||||
<version>@version@</version>
|
||||
</dependency>
|
||||
</dependencies>
|
||||
</project>
|
|
@ -0,0 +1 @@
|
|||
:pserver:cvsuser@cvs.tartarus.org:/home/cvs Ay=0=a%0bZ
|
|
@ -0,0 +1,121 @@
|
|||
package org.apache.lucene.analysis.snowball;
|
||||
|
||||
/**
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
import org.apache.lucene.analysis.*;
|
||||
import org.apache.lucene.analysis.standard.*;
|
||||
import org.apache.lucene.analysis.tr.TurkishLowerCaseFilter;
|
||||
import org.apache.lucene.util.Version;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.io.Reader;
|
||||
import java.util.Set;
|
||||
|
||||
/** Filters {@link StandardTokenizer} with {@link StandardFilter}, {@link
|
||||
* LowerCaseFilter}, {@link StopFilter} and {@link SnowballFilter}.
|
||||
*
|
||||
* Available stemmers are listed in org.tartarus.snowball.ext. The name of a
|
||||
* stemmer is the part of the class name before "Stemmer", e.g., the stemmer in
|
||||
* {@link org.tartarus.snowball.ext.EnglishStemmer} is named "English".
|
||||
*
|
||||
* <p><b>NOTE</b>: This class uses the same {@link Version}
|
||||
* dependent settings as {@link StandardAnalyzer}, with the following addition:
|
||||
* <ul>
|
||||
* <li> As of 3.1, uses {@link TurkishLowerCaseFilter} for Turkish language.
|
||||
* </ul>
|
||||
* </p>
|
||||
*/
|
||||
public final class SnowballAnalyzer extends Analyzer {
|
||||
private String name;
|
||||
private Set<?> stopSet;
|
||||
private final Version matchVersion;
|
||||
|
||||
/** Builds the named analyzer with no stop words. */
|
||||
public SnowballAnalyzer(Version matchVersion, String name) {
|
||||
this.name = name;
|
||||
this.matchVersion = matchVersion;
|
||||
}
|
||||
|
||||
/**
|
||||
* Builds the named analyzer with the given stop words.
|
||||
* @deprecated Use {@link #SnowballAnalyzer(Version, String, Set)} instead.
|
||||
*/
|
||||
@Deprecated
|
||||
public SnowballAnalyzer(Version matchVersion, String name, String[] stopWords) {
|
||||
this(matchVersion, name);
|
||||
stopSet = StopFilter.makeStopSet(matchVersion, stopWords);
|
||||
}
|
||||
|
||||
/** Builds the named analyzer with the given stop words. */
|
||||
public SnowballAnalyzer(Version matchVersion, String name, Set<?> stopWords) {
|
||||
this(matchVersion, name);
|
||||
stopSet = CharArraySet.unmodifiableSet(CharArraySet.copy(matchVersion,
|
||||
stopWords));
|
||||
}
|
||||
|
||||
/** Constructs a {@link StandardTokenizer} filtered by a {@link
|
||||
StandardFilter}, a {@link LowerCaseFilter}, a {@link StopFilter},
|
||||
and a {@link SnowballFilter} */
|
||||
@Override
|
||||
public TokenStream tokenStream(String fieldName, Reader reader) {
|
||||
TokenStream result = new StandardTokenizer(matchVersion, reader);
|
||||
result = new StandardFilter(result);
|
||||
// Use a special lowercase filter for turkish, the stemmer expects it.
|
||||
if (matchVersion.onOrAfter(Version.LUCENE_31) && name.equals("Turkish"))
|
||||
result = new TurkishLowerCaseFilter(result);
|
||||
else
|
||||
result = new LowerCaseFilter(matchVersion, result);
|
||||
if (stopSet != null)
|
||||
result = new StopFilter(matchVersion,
|
||||
result, stopSet);
|
||||
result = new SnowballFilter(result, name);
|
||||
return result;
|
||||
}
|
||||
|
||||
private class SavedStreams {
|
||||
Tokenizer source;
|
||||
TokenStream result;
|
||||
}
|
||||
|
||||
/** Returns a (possibly reused) {@link StandardTokenizer} filtered by a
|
||||
* {@link StandardFilter}, a {@link LowerCaseFilter},
|
||||
* a {@link StopFilter}, and a {@link SnowballFilter} */
|
||||
@Override
|
||||
public TokenStream reusableTokenStream(String fieldName, Reader reader)
|
||||
throws IOException {
|
||||
SavedStreams streams = (SavedStreams) getPreviousTokenStream();
|
||||
if (streams == null) {
|
||||
streams = new SavedStreams();
|
||||
streams.source = new StandardTokenizer(matchVersion, reader);
|
||||
streams.result = new StandardFilter(streams.source);
|
||||
// Use a special lowercase filter for turkish, the stemmer expects it.
|
||||
if (matchVersion.onOrAfter(Version.LUCENE_31) && name.equals("Turkish"))
|
||||
streams.result = new TurkishLowerCaseFilter(streams.result);
|
||||
else
|
||||
streams.result = new LowerCaseFilter(matchVersion, streams.result);
|
||||
if (stopSet != null)
|
||||
streams.result = new StopFilter(matchVersion,
|
||||
streams.result, stopSet);
|
||||
streams.result = new SnowballFilter(streams.result, name);
|
||||
setPreviousTokenStream(streams);
|
||||
} else {
|
||||
streams.source.reset(reader);
|
||||
}
|
||||
return streams.result;
|
||||
}
|
||||
}
|
|
@ -0,0 +1,92 @@
|
|||
package org.apache.lucene.analysis.snowball;
|
||||
|
||||
/**
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
import java.io.IOException;
|
||||
|
||||
import org.apache.lucene.analysis.TokenFilter;
|
||||
import org.apache.lucene.analysis.TokenStream;
|
||||
import org.apache.lucene.analysis.tokenattributes.TermAttribute;
|
||||
import org.apache.lucene.analysis.tr.TurkishLowerCaseFilter; // javadoc @link
|
||||
import org.apache.lucene.analysis.LowerCaseFilter; // javadoc @link
|
||||
import org.tartarus.snowball.SnowballProgram;
|
||||
|
||||
/**
|
||||
* A filter that stems words using a Snowball-generated stemmer.
|
||||
*
|
||||
* Available stemmers are listed in {@link org.tartarus.snowball.ext}.
|
||||
* <p><b>NOTE</b>: SnowballFilter expects lowercased text.
|
||||
* <ul>
|
||||
* <li>For the Turkish language, see {@link TurkishLowerCaseFilter}.
|
||||
* <li>For other languages, see {@link LowerCaseFilter}.
|
||||
* </ul>
|
||||
* </p>
|
||||
*/
|
||||
public final class SnowballFilter extends TokenFilter {
|
||||
|
||||
private SnowballProgram stemmer;
|
||||
|
||||
private TermAttribute termAtt;
|
||||
|
||||
public SnowballFilter(TokenStream input, SnowballProgram stemmer) {
|
||||
super(input);
|
||||
this.stemmer = stemmer;
|
||||
termAtt = addAttribute(TermAttribute.class);
|
||||
}
|
||||
|
||||
/**
|
||||
* Construct the named stemming filter.
|
||||
*
|
||||
* Available stemmers are listed in {@link org.tartarus.snowball.ext}.
|
||||
* The name of a stemmer is the part of the class name before "Stemmer",
|
||||
* e.g., the stemmer in {@link org.tartarus.snowball.ext.EnglishStemmer} is named "English".
|
||||
*
|
||||
* @param in the input tokens to stem
|
||||
* @param name the name of a stemmer
|
||||
*/
|
||||
public SnowballFilter(TokenStream in, String name) {
|
||||
super(in);
|
||||
try {
|
||||
Class<?> stemClass = Class.forName("org.tartarus.snowball.ext." + name + "Stemmer");
|
||||
stemmer = (SnowballProgram) stemClass.newInstance();
|
||||
} catch (Exception e) {
|
||||
throw new RuntimeException(e.toString());
|
||||
}
|
||||
termAtt = addAttribute(TermAttribute.class);
|
||||
}
|
||||
|
||||
/** Returns the next input Token, after being stemmed */
|
||||
@Override
|
||||
public final boolean incrementToken() throws IOException {
|
||||
if (input.incrementToken()) {
|
||||
char termBuffer[] = termAtt.termBuffer();
|
||||
final int length = termAtt.termLength();
|
||||
stemmer.setCurrent(termBuffer, length);
|
||||
stemmer.stem();
|
||||
final char finalTerm[] = stemmer.getCurrentBuffer();
|
||||
final int newLength = stemmer.getCurrentBufferLength();
|
||||
if (finalTerm != termBuffer)
|
||||
termAtt.setTermBuffer(finalTerm, 0, newLength);
|
||||
else
|
||||
termAtt.setTermLength(newLength);
|
||||
return true;
|
||||
} else {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
}
|
|
@ -0,0 +1,24 @@
|
|||
<!doctype html public "-//w3c//dtd html 4.0 transitional//en">
|
||||
<!--
|
||||
Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
contributor license agreements. See the NOTICE file distributed with
|
||||
this work for additional information regarding copyright ownership.
|
||||
The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
(the "License"); you may not use this file except in compliance with
|
||||
the License. You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software
|
||||
distributed under the License is distributed on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
See the License for the specific language governing permissions and
|
||||
limitations under the License.
|
||||
-->
|
||||
<html>
|
||||
<body>
|
||||
{@link org.apache.lucene.analysis.TokenFilter} and {@link
|
||||
org.apache.lucene.analysis.Analyzer} implementations that use Snowball
|
||||
stemmers.
|
||||
</body>
|
||||
</html>
|
|
@ -0,0 +1,63 @@
|
|||
/*
|
||||
|
||||
Copyright (c) 2001, Dr Martin Porter
|
||||
Copyright (c) 2002, Richard Boulton
|
||||
All rights reserved.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are met:
|
||||
|
||||
* Redistributions of source code must retain the above copyright notice,
|
||||
* this list of conditions and the following disclaimer.
|
||||
* Redistributions in binary form must reproduce the above copyright
|
||||
* notice, this list of conditions and the following disclaimer in the
|
||||
* documentation and/or other materials provided with the distribution.
|
||||
* Neither the name of the copyright holders nor the names of its contributors
|
||||
* may be used to endorse or promote products derived from this software
|
||||
* without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
|
||||
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE
|
||||
FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
*/
|
||||
|
||||
package org.tartarus.snowball;
|
||||
|
||||
import java.lang.reflect.Method;
|
||||
|
||||
public class Among {
|
||||
public Among (String s, int substring_i, int result,
|
||||
String methodname, SnowballProgram methodobject) {
|
||||
this.s_size = s.length();
|
||||
this.s = s.toCharArray();
|
||||
this.substring_i = substring_i;
|
||||
this.result = result;
|
||||
this.methodobject = methodobject;
|
||||
if (methodname.length() == 0) {
|
||||
this.method = null;
|
||||
} else {
|
||||
try {
|
||||
this.method = methodobject.getClass().
|
||||
getDeclaredMethod(methodname, new Class[0]);
|
||||
} catch (NoSuchMethodException e) {
|
||||
throw new RuntimeException(e);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
public final int s_size; /* search string */
|
||||
public final char[] s; /* search string */
|
||||
public final int substring_i; /* index to longest matching substring */
|
||||
public final int result; /* result of the lookup */
|
||||
public final Method method; /* method to use if substring matches */
|
||||
public final SnowballProgram methodobject; /* object to invoke method on */
|
||||
|
||||
};
|
|
@ -0,0 +1,566 @@
|
|||
/*
|
||||
|
||||
Copyright (c) 2001, Dr Martin Porter
|
||||
Copyright (c) 2002, Richard Boulton
|
||||
All rights reserved.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are met:
|
||||
|
||||
* Redistributions of source code must retain the above copyright notice,
|
||||
* this list of conditions and the following disclaimer.
|
||||
* Redistributions in binary form must reproduce the above copyright
|
||||
* notice, this list of conditions and the following disclaimer in the
|
||||
* documentation and/or other materials provided with the distribution.
|
||||
* Neither the name of the copyright holders nor the names of its contributors
|
||||
* may be used to endorse or promote products derived from this software
|
||||
* without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
|
||||
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE
|
||||
FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
*/
|
||||
|
||||
|
||||
package org.tartarus.snowball;
|
||||
|
||||
import java.lang.reflect.InvocationTargetException;
|
||||
|
||||
import org.apache.lucene.util.ArrayUtil;
|
||||
import org.apache.lucene.util.RamUsageEstimator;
|
||||
|
||||
/**
|
||||
* This is the rev 502 of the Snowball SVN trunk,
|
||||
* but modified:
|
||||
* made abstract and introduced abstract method stem to avoid expensive reflection in filter class.
|
||||
* refactored StringBuffers to StringBuilder
|
||||
* uses char[] as buffer instead of StringBuffer/StringBuilder
|
||||
* eq_s,eq_s_b,insert,replace_s take CharSequence like eq_v and eq_v_b
|
||||
*/
|
||||
public abstract class SnowballProgram {
|
||||
protected SnowballProgram()
|
||||
{
|
||||
current = new char[8];
|
||||
setCurrent("");
|
||||
}
|
||||
|
||||
public abstract boolean stem();
|
||||
|
||||
/**
|
||||
* Set the current string.
|
||||
*/
|
||||
public void setCurrent(String value)
|
||||
{
|
||||
current = value.toCharArray();
|
||||
cursor = 0;
|
||||
limit = value.length();
|
||||
limit_backward = 0;
|
||||
bra = cursor;
|
||||
ket = limit;
|
||||
}
|
||||
|
||||
/**
|
||||
* Get the current string.
|
||||
*/
|
||||
public String getCurrent()
|
||||
{
|
||||
return new String(current, 0, limit);
|
||||
}
|
||||
|
||||
/**
|
||||
* Set the current string.
|
||||
* @param text character array containing input
|
||||
* @param length valid length of text.
|
||||
*/
|
||||
public void setCurrent(char text[], int length) {
|
||||
current = text;
|
||||
cursor = 0;
|
||||
limit = length;
|
||||
limit_backward = 0;
|
||||
bra = cursor;
|
||||
ket = limit;
|
||||
}
|
||||
|
||||
/**
|
||||
* Get the current buffer containing the stem.
|
||||
* <p>
|
||||
* NOTE: this may be a reference to a different character array than the
|
||||
* one originally provided with setCurrent, in the exceptional case that
|
||||
* stemming produced a longer intermediate or result string.
|
||||
* </p>
|
||||
* <p>
|
||||
* It is necessary to use {@link #getCurrentBufferLength()} to determine
|
||||
* the valid length of the returned buffer. For example, many words are
|
||||
* stemmed simply by subtracting from the length to remove suffixes.
|
||||
* </p>
|
||||
* @see #getCurrentBufferLength()
|
||||
*/
|
||||
public char[] getCurrentBuffer() {
|
||||
return current;
|
||||
}
|
||||
|
||||
/**
|
||||
* Get the valid length of the character array in
|
||||
* {@link #getCurrentBuffer()}.
|
||||
* @return valid length of the array.
|
||||
*/
|
||||
public int getCurrentBufferLength() {
|
||||
return limit;
|
||||
}
|
||||
|
||||
// current string
|
||||
private char current[];
|
||||
|
||||
protected int cursor;
|
||||
protected int limit;
|
||||
protected int limit_backward;
|
||||
protected int bra;
|
||||
protected int ket;
|
||||
|
||||
protected void copy_from(SnowballProgram other)
|
||||
{
|
||||
current = other.current;
|
||||
cursor = other.cursor;
|
||||
limit = other.limit;
|
||||
limit_backward = other.limit_backward;
|
||||
bra = other.bra;
|
||||
ket = other.ket;
|
||||
}
|
||||
|
||||
protected boolean in_grouping(char [] s, int min, int max)
|
||||
{
|
||||
if (cursor >= limit) return false;
|
||||
char ch = current[cursor];
|
||||
if (ch > max || ch < min) return false;
|
||||
ch -= min;
|
||||
if ((s[ch >> 3] & (0X1 << (ch & 0X7))) == 0) return false;
|
||||
cursor++;
|
||||
return true;
|
||||
}
|
||||
|
||||
protected boolean in_grouping_b(char [] s, int min, int max)
|
||||
{
|
||||
if (cursor <= limit_backward) return false;
|
||||
char ch = current[cursor - 1];
|
||||
if (ch > max || ch < min) return false;
|
||||
ch -= min;
|
||||
if ((s[ch >> 3] & (0X1 << (ch & 0X7))) == 0) return false;
|
||||
cursor--;
|
||||
return true;
|
||||
}
|
||||
|
||||
protected boolean out_grouping(char [] s, int min, int max)
|
||||
{
|
||||
if (cursor >= limit) return false;
|
||||
char ch = current[cursor];
|
||||
if (ch > max || ch < min) {
|
||||
cursor++;
|
||||
return true;
|
||||
}
|
||||
ch -= min;
|
||||
if ((s[ch >> 3] & (0X1 << (ch & 0X7))) == 0) {
|
||||
cursor ++;
|
||||
return true;
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
protected boolean out_grouping_b(char [] s, int min, int max)
|
||||
{
|
||||
if (cursor <= limit_backward) return false;
|
||||
char ch = current[cursor - 1];
|
||||
if (ch > max || ch < min) {
|
||||
cursor--;
|
||||
return true;
|
||||
}
|
||||
ch -= min;
|
||||
if ((s[ch >> 3] & (0X1 << (ch & 0X7))) == 0) {
|
||||
cursor--;
|
||||
return true;
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
protected boolean in_range(int min, int max)
|
||||
{
|
||||
if (cursor >= limit) return false;
|
||||
char ch = current[cursor];
|
||||
if (ch > max || ch < min) return false;
|
||||
cursor++;
|
||||
return true;
|
||||
}
|
||||
|
||||
protected boolean in_range_b(int min, int max)
|
||||
{
|
||||
if (cursor <= limit_backward) return false;
|
||||
char ch = current[cursor - 1];
|
||||
if (ch > max || ch < min) return false;
|
||||
cursor--;
|
||||
return true;
|
||||
}
|
||||
|
||||
protected boolean out_range(int min, int max)
|
||||
{
|
||||
if (cursor >= limit) return false;
|
||||
char ch = current[cursor];
|
||||
if (!(ch > max || ch < min)) return false;
|
||||
cursor++;
|
||||
return true;
|
||||
}
|
||||
|
||||
protected boolean out_range_b(int min, int max)
|
||||
{
|
||||
if (cursor <= limit_backward) return false;
|
||||
char ch = current[cursor - 1];
|
||||
if(!(ch > max || ch < min)) return false;
|
||||
cursor--;
|
||||
return true;
|
||||
}
|
||||
|
||||
protected boolean eq_s(int s_size, CharSequence s)
|
||||
{
|
||||
if (limit - cursor < s_size) return false;
|
||||
int i;
|
||||
for (i = 0; i != s_size; i++) {
|
||||
if (current[cursor + i] != s.charAt(i)) return false;
|
||||
}
|
||||
cursor += s_size;
|
||||
return true;
|
||||
}
|
||||
|
||||
/** @deprecated for binary back compat. Will be removed in Lucene 4.0 */
|
||||
@Deprecated
|
||||
protected boolean eq_s(int s_size, String s)
|
||||
{
|
||||
return eq_s(s_size, (CharSequence)s);
|
||||
}
|
||||
|
||||
protected boolean eq_s_b(int s_size, CharSequence s)
|
||||
{
|
||||
if (cursor - limit_backward < s_size) return false;
|
||||
int i;
|
||||
for (i = 0; i != s_size; i++) {
|
||||
if (current[cursor - s_size + i] != s.charAt(i)) return false;
|
||||
}
|
||||
cursor -= s_size;
|
||||
return true;
|
||||
}
|
||||
|
||||
/** @deprecated for binary back compat. Will be removed in Lucene 4.0 */
|
||||
@Deprecated
|
||||
protected boolean eq_s_b(int s_size, String s)
|
||||
{
|
||||
return eq_s_b(s_size, (CharSequence)s);
|
||||
}
|
||||
|
||||
protected boolean eq_v(CharSequence s)
|
||||
{
|
||||
return eq_s(s.length(), s);
|
||||
}
|
||||
|
||||
/** @deprecated for binary back compat. Will be removed in Lucene 4.0 */
|
||||
@Deprecated
|
||||
protected boolean eq_v(StringBuilder s)
|
||||
{
|
||||
return eq_s(s.length(), (CharSequence)s);
|
||||
}
|
||||
|
||||
protected boolean eq_v_b(CharSequence s)
|
||||
{ return eq_s_b(s.length(), s);
|
||||
}
|
||||
|
||||
/** @deprecated for binary back compat. Will be removed in Lucene 4.0 */
|
||||
@Deprecated
|
||||
protected boolean eq_v_b(StringBuilder s)
|
||||
{ return eq_s_b(s.length(), (CharSequence)s);
|
||||
}
|
||||
|
||||
protected int find_among(Among v[], int v_size)
|
||||
{
|
||||
int i = 0;
|
||||
int j = v_size;
|
||||
|
||||
int c = cursor;
|
||||
int l = limit;
|
||||
|
||||
int common_i = 0;
|
||||
int common_j = 0;
|
||||
|
||||
boolean first_key_inspected = false;
|
||||
|
||||
while(true) {
|
||||
int k = i + ((j - i) >> 1);
|
||||
int diff = 0;
|
||||
int common = common_i < common_j ? common_i : common_j; // smaller
|
||||
Among w = v[k];
|
||||
int i2;
|
||||
for (i2 = common; i2 < w.s_size; i2++) {
|
||||
if (c + common == l) {
|
||||
diff = -1;
|
||||
break;
|
||||
}
|
||||
diff = current[c + common] - w.s[i2];
|
||||
if (diff != 0) break;
|
||||
common++;
|
||||
}
|
||||
if (diff < 0) {
|
||||
j = k;
|
||||
common_j = common;
|
||||
} else {
|
||||
i = k;
|
||||
common_i = common;
|
||||
}
|
||||
if (j - i <= 1) {
|
||||
if (i > 0) break; // v->s has been inspected
|
||||
if (j == i) break; // only one item in v
|
||||
|
||||
// - but now we need to go round once more to get
|
||||
// v->s inspected. This looks messy, but is actually
|
||||
// the optimal approach.
|
||||
|
||||
if (first_key_inspected) break;
|
||||
first_key_inspected = true;
|
||||
}
|
||||
}
|
||||
while(true) {
|
||||
Among w = v[i];
|
||||
if (common_i >= w.s_size) {
|
||||
cursor = c + w.s_size;
|
||||
if (w.method == null) return w.result;
|
||||
boolean res;
|
||||
try {
|
||||
Object resobj = w.method.invoke(w.methodobject,
|
||||
new Object[0]);
|
||||
res = resobj.toString().equals("true");
|
||||
} catch (InvocationTargetException e) {
|
||||
res = false;
|
||||
// FIXME - debug message
|
||||
} catch (IllegalAccessException e) {
|
||||
res = false;
|
||||
// FIXME - debug message
|
||||
}
|
||||
cursor = c + w.s_size;
|
||||
if (res) return w.result;
|
||||
}
|
||||
i = w.substring_i;
|
||||
if (i < 0) return 0;
|
||||
}
|
||||
}
|
||||
|
||||
// find_among_b is for backwards processing. Same comments apply
|
||||
protected int find_among_b(Among v[], int v_size)
|
||||
{
|
||||
int i = 0;
|
||||
int j = v_size;
|
||||
|
||||
int c = cursor;
|
||||
int lb = limit_backward;
|
||||
|
||||
int common_i = 0;
|
||||
int common_j = 0;
|
||||
|
||||
boolean first_key_inspected = false;
|
||||
|
||||
while(true) {
|
||||
int k = i + ((j - i) >> 1);
|
||||
int diff = 0;
|
||||
int common = common_i < common_j ? common_i : common_j;
|
||||
Among w = v[k];
|
||||
int i2;
|
||||
for (i2 = w.s_size - 1 - common; i2 >= 0; i2--) {
|
||||
if (c - common == lb) {
|
||||
diff = -1;
|
||||
break;
|
||||
}
|
||||
diff = current[c - 1 - common] - w.s[i2];
|
||||
if (diff != 0) break;
|
||||
common++;
|
||||
}
|
||||
if (diff < 0) {
|
||||
j = k;
|
||||
common_j = common;
|
||||
} else {
|
||||
i = k;
|
||||
common_i = common;
|
||||
}
|
||||
if (j - i <= 1) {
|
||||
if (i > 0) break;
|
||||
if (j == i) break;
|
||||
if (first_key_inspected) break;
|
||||
first_key_inspected = true;
|
||||
}
|
||||
}
|
||||
while(true) {
|
||||
Among w = v[i];
|
||||
if (common_i >= w.s_size) {
|
||||
cursor = c - w.s_size;
|
||||
if (w.method == null) return w.result;
|
||||
|
||||
boolean res;
|
||||
try {
|
||||
Object resobj = w.method.invoke(w.methodobject,
|
||||
new Object[0]);
|
||||
res = resobj.toString().equals("true");
|
||||
} catch (InvocationTargetException e) {
|
||||
res = false;
|
||||
// FIXME - debug message
|
||||
} catch (IllegalAccessException e) {
|
||||
res = false;
|
||||
// FIXME - debug message
|
||||
}
|
||||
cursor = c - w.s_size;
|
||||
if (res) return w.result;
|
||||
}
|
||||
i = w.substring_i;
|
||||
if (i < 0) return 0;
|
||||
}
|
||||
}
|
||||
|
||||
/* to replace chars between c_bra and c_ket in current by the
|
||||
* chars in s.
|
||||
*/
|
||||
protected int replace_s(int c_bra, int c_ket, CharSequence s)
|
||||
{
|
||||
final int adjustment = s.length() - (c_ket - c_bra);
|
||||
final int newLength = limit + adjustment;
|
||||
//resize if necessary
|
||||
if (newLength > current.length) {
|
||||
char newBuffer[] = new char[ArrayUtil.oversize(newLength, RamUsageEstimator.NUM_BYTES_CHAR)];
|
||||
System.arraycopy(current, 0, newBuffer, 0, limit);
|
||||
current = newBuffer;
|
||||
}
|
||||
// if the substring being replaced is longer or shorter than the
|
||||
// replacement, need to shift things around
|
||||
if (adjustment != 0 && c_ket < limit) {
|
||||
System.arraycopy(current, c_ket, current, c_bra + s.length(),
|
||||
limit - c_ket);
|
||||
}
|
||||
// insert the replacement text
|
||||
// Note, faster is s.getChars(0, s.length(), current, c_bra);
|
||||
// but would have to duplicate this method for both String and StringBuilder
|
||||
for (int i = 0; i < s.length(); i++)
|
||||
current[c_bra + i] = s.charAt(i);
|
||||
|
||||
limit += adjustment;
|
||||
if (cursor >= c_ket) cursor += adjustment;
|
||||
else if (cursor > c_bra) cursor = c_bra;
|
||||
return adjustment;
|
||||
}
|
||||
|
||||
/** @deprecated for binary back compat. Will be removed in Lucene 4.0 */
|
||||
@Deprecated
|
||||
protected int replace_s(int c_bra, int c_ket, String s) {
|
||||
return replace_s(c_bra, c_ket, (CharSequence)s);
|
||||
}
|
||||
|
||||
protected void slice_check()
|
||||
{
|
||||
if (bra < 0 ||
|
||||
bra > ket ||
|
||||
ket > limit)
|
||||
{
|
||||
System.err.println("faulty slice operation");
|
||||
// FIXME: report error somehow.
|
||||
/*
|
||||
fprintf(stderr, "faulty slice operation:\n");
|
||||
debug(z, -1, 0);
|
||||
exit(1);
|
||||
*/
|
||||
}
|
||||
}
|
||||
|
||||
protected void slice_from(CharSequence s)
|
||||
{
|
||||
slice_check();
|
||||
replace_s(bra, ket, s);
|
||||
}
|
||||
|
||||
/** @deprecated for binary back compat. Will be removed in Lucene 4.0 */
|
||||
@Deprecated
|
||||
protected void slice_from(String s)
|
||||
{
|
||||
slice_from((CharSequence)s);
|
||||
}
|
||||
|
||||
/** @deprecated for binary back compat. Will be removed in Lucene 4.0 */
|
||||
@Deprecated
|
||||
protected void slice_from(StringBuilder s)
|
||||
{
|
||||
slice_from((CharSequence)s);
|
||||
}
|
||||
|
||||
protected void slice_del()
|
||||
{
|
||||
slice_from((CharSequence)"");
|
||||
}
|
||||
|
||||
protected void insert(int c_bra, int c_ket, CharSequence s)
|
||||
{
|
||||
int adjustment = replace_s(c_bra, c_ket, s);
|
||||
if (c_bra <= bra) bra += adjustment;
|
||||
if (c_bra <= ket) ket += adjustment;
|
||||
}
|
||||
|
||||
/** @deprecated for binary back compat. Will be removed in Lucene 4.0 */
|
||||
@Deprecated
|
||||
protected void insert(int c_bra, int c_ket, String s)
|
||||
{
|
||||
insert(c_bra, c_ket, (CharSequence)s);
|
||||
}
|
||||
|
||||
/** @deprecated for binary back compat. Will be removed in Lucene 4.0 */
|
||||
@Deprecated
|
||||
protected void insert(int c_bra, int c_ket, StringBuilder s)
|
||||
{
|
||||
insert(c_bra, c_ket, (CharSequence)s);
|
||||
}
|
||||
|
||||
/* Copy the slice into the supplied StringBuffer */
|
||||
protected StringBuilder slice_to(StringBuilder s)
|
||||
{
|
||||
slice_check();
|
||||
int len = ket - bra;
|
||||
s.setLength(0);
|
||||
s.append(current, bra, len);
|
||||
return s;
|
||||
}
|
||||
|
||||
protected StringBuilder assign_to(StringBuilder s)
|
||||
{
|
||||
s.setLength(0);
|
||||
s.append(current, 0, limit);
|
||||
return s;
|
||||
}
|
||||
|
||||
/*
|
||||
extern void debug(struct SN_env * z, int number, int line_count)
|
||||
{ int i;
|
||||
int limit = SIZE(z->p);
|
||||
//if (number >= 0) printf("%3d (line %4d): '", number, line_count);
|
||||
if (number >= 0) printf("%3d (line %4d): [%d]'", number, line_count,limit);
|
||||
for (i = 0; i <= limit; i++)
|
||||
{ if (z->lb == i) printf("{");
|
||||
if (z->bra == i) printf("[");
|
||||
if (z->c == i) printf("|");
|
||||
if (z->ket == i) printf("]");
|
||||
if (z->l == i) printf("}");
|
||||
if (i < limit)
|
||||
{ int ch = z->p[i];
|
||||
if (ch == 0) ch = '#';
|
||||
printf("%c", ch);
|
||||
}
|
||||
}
|
||||
printf("'\n");
|
||||
}
|
||||
*/
|
||||
|
||||
};
|
||||
|
|
@ -0,0 +1,108 @@
|
|||
/*
|
||||
|
||||
Copyright (c) 2001, Dr Martin Porter
|
||||
Copyright (c) 2002, Richard Boulton
|
||||
All rights reserved.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are met:
|
||||
|
||||
* Redistributions of source code must retain the above copyright notice,
|
||||
* this list of conditions and the following disclaimer.
|
||||
* Redistributions in binary form must reproduce the above copyright
|
||||
* notice, this list of conditions and the following disclaimer in the
|
||||
* documentation and/or other materials provided with the distribution.
|
||||
* Neither the name of the copyright holders nor the names of its contributors
|
||||
* may be used to endorse or promote products derived from this software
|
||||
* without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
|
||||
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE
|
||||
FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
*/
|
||||
|
||||
package org.tartarus.snowball;
|
||||
|
||||
import java.lang.reflect.Method;
|
||||
import java.io.Reader;
|
||||
import java.io.Writer;
|
||||
import java.io.BufferedReader;
|
||||
import java.io.BufferedWriter;
|
||||
import java.io.FileInputStream;
|
||||
import java.io.InputStreamReader;
|
||||
import java.io.OutputStreamWriter;
|
||||
import java.io.OutputStream;
|
||||
import java.io.FileOutputStream;
|
||||
|
||||
public class TestApp {
|
||||
private static void usage()
|
||||
{
|
||||
System.err.println("Usage: TestApp <algorithm> <input file> [-o <output file>]");
|
||||
}
|
||||
|
||||
public static void main(String [] args) throws Throwable {
|
||||
if (args.length < 2) {
|
||||
usage();
|
||||
return;
|
||||
}
|
||||
|
||||
Class stemClass = Class.forName("org.tartarus.snowball.ext." +
|
||||
args[0] + "Stemmer");
|
||||
SnowballProgram stemmer = (SnowballProgram) stemClass.newInstance();
|
||||
Method stemMethod = stemClass.getMethod("stem", new Class[0]);
|
||||
|
||||
Reader reader;
|
||||
reader = new InputStreamReader(new FileInputStream(args[1]));
|
||||
reader = new BufferedReader(reader);
|
||||
|
||||
StringBuffer input = new StringBuffer();
|
||||
|
||||
OutputStream outstream;
|
||||
|
||||
if (args.length > 2) {
|
||||
if (args.length == 4 && args[2].equals("-o")) {
|
||||
outstream = new FileOutputStream(args[3]);
|
||||
} else {
|
||||
usage();
|
||||
return;
|
||||
}
|
||||
} else {
|
||||
outstream = System.out;
|
||||
}
|
||||
Writer output = new OutputStreamWriter(outstream);
|
||||
output = new BufferedWriter(output);
|
||||
|
||||
int repeat = 1;
|
||||
if (args.length > 4) {
|
||||
repeat = Integer.parseInt(args[4]);
|
||||
}
|
||||
|
||||
Object [] emptyArgs = new Object[0];
|
||||
int character;
|
||||
while ((character = reader.read()) != -1) {
|
||||
char ch = (char) character;
|
||||
if (Character.isWhitespace((char) ch)) {
|
||||
if (input.length() > 0) {
|
||||
stemmer.setCurrent(input.toString());
|
||||
for (int i = repeat; i != 0; i--) {
|
||||
stemMethod.invoke(stemmer, emptyArgs);
|
||||
}
|
||||
output.write(stemmer.getCurrent());
|
||||
output.write('\n');
|
||||
input.delete(0, input.length());
|
||||
}
|
||||
} else {
|
||||
input.append(Character.toLowerCase(ch));
|
||||
}
|
||||
}
|
||||
output.flush();
|
||||
}
|
||||
}
|
|
@ -0,0 +1,423 @@
|
|||
// This file was generated automatically by the Snowball to Java compiler
|
||||
|
||||
package org.tartarus.snowball.ext;
|
||||
import org.tartarus.snowball.SnowballProgram;
|
||||
import org.tartarus.snowball.Among;
|
||||
|
||||
/**
|
||||
* Generated class implementing code defined by a snowball script.
|
||||
*/
|
||||
public class DanishStemmer extends SnowballProgram {
|
||||
|
||||
private Among a_0[] = {
|
||||
new Among ( "hed", -1, 1, "", this),
|
||||
new Among ( "ethed", 0, 1, "", this),
|
||||
new Among ( "ered", -1, 1, "", this),
|
||||
new Among ( "e", -1, 1, "", this),
|
||||
new Among ( "erede", 3, 1, "", this),
|
||||
new Among ( "ende", 3, 1, "", this),
|
||||
new Among ( "erende", 5, 1, "", this),
|
||||
new Among ( "ene", 3, 1, "", this),
|
||||
new Among ( "erne", 3, 1, "", this),
|
||||
new Among ( "ere", 3, 1, "", this),
|
||||
new Among ( "en", -1, 1, "", this),
|
||||
new Among ( "heden", 10, 1, "", this),
|
||||
new Among ( "eren", 10, 1, "", this),
|
||||
new Among ( "er", -1, 1, "", this),
|
||||
new Among ( "heder", 13, 1, "", this),
|
||||
new Among ( "erer", 13, 1, "", this),
|
||||
new Among ( "s", -1, 2, "", this),
|
||||
new Among ( "heds", 16, 1, "", this),
|
||||
new Among ( "es", 16, 1, "", this),
|
||||
new Among ( "endes", 18, 1, "", this),
|
||||
new Among ( "erendes", 19, 1, "", this),
|
||||
new Among ( "enes", 18, 1, "", this),
|
||||
new Among ( "ernes", 18, 1, "", this),
|
||||
new Among ( "eres", 18, 1, "", this),
|
||||
new Among ( "ens", 16, 1, "", this),
|
||||
new Among ( "hedens", 24, 1, "", this),
|
||||
new Among ( "erens", 24, 1, "", this),
|
||||
new Among ( "ers", 16, 1, "", this),
|
||||
new Among ( "ets", 16, 1, "", this),
|
||||
new Among ( "erets", 28, 1, "", this),
|
||||
new Among ( "et", -1, 1, "", this),
|
||||
new Among ( "eret", 30, 1, "", this)
|
||||
};
|
||||
|
||||
private Among a_1[] = {
|
||||
new Among ( "gd", -1, -1, "", this),
|
||||
new Among ( "dt", -1, -1, "", this),
|
||||
new Among ( "gt", -1, -1, "", this),
|
||||
new Among ( "kt", -1, -1, "", this)
|
||||
};
|
||||
|
||||
private Among a_2[] = {
|
||||
new Among ( "ig", -1, 1, "", this),
|
||||
new Among ( "lig", 0, 1, "", this),
|
||||
new Among ( "elig", 1, 1, "", this),
|
||||
new Among ( "els", -1, 1, "", this),
|
||||
new Among ( "l\u00F8st", -1, 2, "", this)
|
||||
};
|
||||
|
||||
private static final char g_v[] = {17, 65, 16, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 48, 0, 128 };
|
||||
|
||||
private static final char g_s_ending[] = {239, 254, 42, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 16 };
|
||||
|
||||
private int I_x;
|
||||
private int I_p1;
|
||||
private StringBuilder S_ch = new StringBuilder();
|
||||
|
||||
private void copy_from(DanishStemmer other) {
|
||||
I_x = other.I_x;
|
||||
I_p1 = other.I_p1;
|
||||
S_ch = other.S_ch;
|
||||
super.copy_from(other);
|
||||
}
|
||||
|
||||
private boolean r_mark_regions() {
|
||||
int v_1;
|
||||
int v_2;
|
||||
// (, line 29
|
||||
I_p1 = limit;
|
||||
// test, line 33
|
||||
v_1 = cursor;
|
||||
// (, line 33
|
||||
// hop, line 33
|
||||
{
|
||||
int c = cursor + 3;
|
||||
if (0 > c || c > limit)
|
||||
{
|
||||
return false;
|
||||
}
|
||||
cursor = c;
|
||||
}
|
||||
// setmark x, line 33
|
||||
I_x = cursor;
|
||||
cursor = v_1;
|
||||
// goto, line 34
|
||||
golab0: while(true)
|
||||
{
|
||||
v_2 = cursor;
|
||||
lab1: do {
|
||||
if (!(in_grouping(g_v, 97, 248)))
|
||||
{
|
||||
break lab1;
|
||||
}
|
||||
cursor = v_2;
|
||||
break golab0;
|
||||
} while (false);
|
||||
cursor = v_2;
|
||||
if (cursor >= limit)
|
||||
{
|
||||
return false;
|
||||
}
|
||||
cursor++;
|
||||
}
|
||||
// gopast, line 34
|
||||
golab2: while(true)
|
||||
{
|
||||
lab3: do {
|
||||
if (!(out_grouping(g_v, 97, 248)))
|
||||
{
|
||||
break lab3;
|
||||
}
|
||||
break golab2;
|
||||
} while (false);
|
||||
if (cursor >= limit)
|
||||
{
|
||||
return false;
|
||||
}
|
||||
cursor++;
|
||||
}
|
||||
// setmark p1, line 34
|
||||
I_p1 = cursor;
|
||||
// try, line 35
|
||||
lab4: do {
|
||||
// (, line 35
|
||||
if (!(I_p1 < I_x))
|
||||
{
|
||||
break lab4;
|
||||
}
|
||||
I_p1 = I_x;
|
||||
} while (false);
|
||||
return true;
|
||||
}
|
||||
|
||||
private boolean r_main_suffix() {
|
||||
int among_var;
|
||||
int v_1;
|
||||
int v_2;
|
||||
// (, line 40
|
||||
// setlimit, line 41
|
||||
v_1 = limit - cursor;
|
||||
// tomark, line 41
|
||||
if (cursor < I_p1)
|
||||
{
|
||||
return false;
|
||||
}
|
||||
cursor = I_p1;
|
||||
v_2 = limit_backward;
|
||||
limit_backward = cursor;
|
||||
cursor = limit - v_1;
|
||||
// (, line 41
|
||||
// [, line 41
|
||||
ket = cursor;
|
||||
// substring, line 41
|
||||
among_var = find_among_b(a_0, 32);
|
||||
if (among_var == 0)
|
||||
{
|
||||
limit_backward = v_2;
|
||||
return false;
|
||||
}
|
||||
// ], line 41
|
||||
bra = cursor;
|
||||
limit_backward = v_2;
|
||||
switch(among_var) {
|
||||
case 0:
|
||||
return false;
|
||||
case 1:
|
||||
// (, line 48
|
||||
// delete, line 48
|
||||
slice_del();
|
||||
break;
|
||||
case 2:
|
||||
// (, line 50
|
||||
if (!(in_grouping_b(g_s_ending, 97, 229)))
|
||||
{
|
||||
return false;
|
||||
}
|
||||
// delete, line 50
|
||||
slice_del();
|
||||
break;
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
private boolean r_consonant_pair() {
|
||||
int v_1;
|
||||
int v_2;
|
||||
int v_3;
|
||||
// (, line 54
|
||||
// test, line 55
|
||||
v_1 = limit - cursor;
|
||||
// (, line 55
|
||||
// setlimit, line 56
|
||||
v_2 = limit - cursor;
|
||||
// tomark, line 56
|
||||
if (cursor < I_p1)
|
||||
{
|
||||
return false;
|
||||
}
|
||||
cursor = I_p1;
|
||||
v_3 = limit_backward;
|
||||
limit_backward = cursor;
|
||||
cursor = limit - v_2;
|
||||
// (, line 56
|
||||
// [, line 56
|
||||
ket = cursor;
|
||||
// substring, line 56
|
||||
if (find_among_b(a_1, 4) == 0)
|
||||
{
|
||||
limit_backward = v_3;
|
||||
return false;
|
||||
}
|
||||
// ], line 56
|
||||
bra = cursor;
|
||||
limit_backward = v_3;
|
||||
cursor = limit - v_1;
|
||||
// next, line 62
|
||||
if (cursor <= limit_backward)
|
||||
{
|
||||
return false;
|
||||
}
|
||||
cursor--;
|
||||
// ], line 62
|
||||
bra = cursor;
|
||||
// delete, line 62
|
||||
slice_del();
|
||||
return true;
|
||||
}
|
||||
|
||||
private boolean r_other_suffix() {
|
||||
int among_var;
|
||||
int v_1;
|
||||
int v_2;
|
||||
int v_3;
|
||||
int v_4;
|
||||
// (, line 65
|
||||
// do, line 66
|
||||
v_1 = limit - cursor;
|
||||
lab0: do {
|
||||
// (, line 66
|
||||
// [, line 66
|
||||
ket = cursor;
|
||||
// literal, line 66
|
||||
if (!(eq_s_b(2, "st")))
|
||||
{
|
||||
break lab0;
|
||||
}
|
||||
// ], line 66
|
||||
bra = cursor;
|
||||
// literal, line 66
|
||||
if (!(eq_s_b(2, "ig")))
|
||||
{
|
||||
break lab0;
|
||||
}
|
||||
// delete, line 66
|
||||
slice_del();
|
||||
} while (false);
|
||||
cursor = limit - v_1;
|
||||
// setlimit, line 67
|
||||
v_2 = limit - cursor;
|
||||
// tomark, line 67
|
||||
if (cursor < I_p1)
|
||||
{
|
||||
return false;
|
||||
}
|
||||
cursor = I_p1;
|
||||
v_3 = limit_backward;
|
||||
limit_backward = cursor;
|
||||
cursor = limit - v_2;
|
||||
// (, line 67
|
||||
// [, line 67
|
||||
ket = cursor;
|
||||
// substring, line 67
|
||||
among_var = find_among_b(a_2, 5);
|
||||
if (among_var == 0)
|
||||
{
|
||||
limit_backward = v_3;
|
||||
return false;
|
||||
}
|
||||
// ], line 67
|
||||
bra = cursor;
|
||||
limit_backward = v_3;
|
||||
switch(among_var) {
|
||||
case 0:
|
||||
return false;
|
||||
case 1:
|
||||
// (, line 70
|
||||
// delete, line 70
|
||||
slice_del();
|
||||
// do, line 70
|
||||
v_4 = limit - cursor;
|
||||
lab1: do {
|
||||
// call consonant_pair, line 70
|
||||
if (!r_consonant_pair())
|
||||
{
|
||||
break lab1;
|
||||
}
|
||||
} while (false);
|
||||
cursor = limit - v_4;
|
||||
break;
|
||||
case 2:
|
||||
// (, line 72
|
||||
// <-, line 72
|
||||
slice_from("l\u00F8s");
|
||||
break;
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
private boolean r_undouble() {
|
||||
int v_1;
|
||||
int v_2;
|
||||
// (, line 75
|
||||
// setlimit, line 76
|
||||
v_1 = limit - cursor;
|
||||
// tomark, line 76
|
||||
if (cursor < I_p1)
|
||||
{
|
||||
return false;
|
||||
}
|
||||
cursor = I_p1;
|
||||
v_2 = limit_backward;
|
||||
limit_backward = cursor;
|
||||
cursor = limit - v_1;
|
||||
// (, line 76
|
||||
// [, line 76
|
||||
ket = cursor;
|
||||
if (!(out_grouping_b(g_v, 97, 248)))
|
||||
{
|
||||
limit_backward = v_2;
|
||||
return false;
|
||||
}
|
||||
// ], line 76
|
||||
bra = cursor;
|
||||
// -> ch, line 76
|
||||
S_ch = slice_to(S_ch);
|
||||
limit_backward = v_2;
|
||||
// name ch, line 77
|
||||
if (!(eq_v_b(S_ch)))
|
||||
{
|
||||
return false;
|
||||
}
|
||||
// delete, line 78
|
||||
slice_del();
|
||||
return true;
|
||||
}
|
||||
|
||||
public boolean stem() {
|
||||
int v_1;
|
||||
int v_2;
|
||||
int v_3;
|
||||
int v_4;
|
||||
int v_5;
|
||||
// (, line 82
|
||||
// do, line 84
|
||||
v_1 = cursor;
|
||||
lab0: do {
|
||||
// call mark_regions, line 84
|
||||
if (!r_mark_regions())
|
||||
{
|
||||
break lab0;
|
||||
}
|
||||
} while (false);
|
||||
cursor = v_1;
|
||||
// backwards, line 85
|
||||
limit_backward = cursor; cursor = limit;
|
||||
// (, line 85
|
||||
// do, line 86
|
||||
v_2 = limit - cursor;
|
||||
lab1: do {
|
||||
// call main_suffix, line 86
|
||||
if (!r_main_suffix())
|
||||
{
|
||||
break lab1;
|
||||
}
|
||||
} while (false);
|
||||
cursor = limit - v_2;
|
||||
// do, line 87
|
||||
v_3 = limit - cursor;
|
||||
lab2: do {
|
||||
// call consonant_pair, line 87
|
||||
if (!r_consonant_pair())
|
||||
{
|
||||
break lab2;
|
||||
}
|
||||
} while (false);
|
||||
cursor = limit - v_3;
|
||||
// do, line 88
|
||||
v_4 = limit - cursor;
|
||||
lab3: do {
|
||||
// call other_suffix, line 88
|
||||
if (!r_other_suffix())
|
||||
{
|
||||
break lab3;
|
||||
}
|
||||
} while (false);
|
||||
cursor = limit - v_4;
|
||||
// do, line 89
|
||||
v_5 = limit - cursor;
|
||||
lab4: do {
|
||||
// call undouble, line 89
|
||||
if (!r_undouble())
|
||||
{
|
||||
break lab4;
|
||||
}
|
||||
} while (false);
|
||||
cursor = limit - v_5;
|
||||
cursor = limit_backward; return true;
|
||||
}
|
||||
|
||||
}
|
||||
|
|
@ -0,0 +1,837 @@
|
|||
// This file was generated automatically by the Snowball to Java compiler
|
||||
|
||||
package org.tartarus.snowball.ext;
|
||||
import org.tartarus.snowball.SnowballProgram;
|
||||
import org.tartarus.snowball.Among;
|
||||
|
||||
/**
|
||||
* Generated class implementing code defined by a snowball script.
|
||||
*/
|
||||
public class DutchStemmer extends SnowballProgram {
|
||||
|
||||
private Among a_0[] = {
|
||||
new Among ( "", -1, 6, "", this),
|
||||
new Among ( "\u00E1", 0, 1, "", this),
|
||||
new Among ( "\u00E4", 0, 1, "", this),
|
||||
new Among ( "\u00E9", 0, 2, "", this),
|
||||
new Among ( "\u00EB", 0, 2, "", this),
|
||||
new Among ( "\u00ED", 0, 3, "", this),
|
||||
new Among ( "\u00EF", 0, 3, "", this),
|
||||
new Among ( "\u00F3", 0, 4, "", this),
|
||||
new Among ( "\u00F6", 0, 4, "", this),
|
||||
new Among ( "\u00FA", 0, 5, "", this),
|
||||
new Among ( "\u00FC", 0, 5, "", this)
|
||||
};
|
||||
|
||||
private Among a_1[] = {
|
||||
new Among ( "", -1, 3, "", this),
|
||||
new Among ( "I", 0, 2, "", this),
|
||||
new Among ( "Y", 0, 1, "", this)
|
||||
};
|
||||
|
||||
private Among a_2[] = {
|
||||
new Among ( "dd", -1, -1, "", this),
|
||||
new Among ( "kk", -1, -1, "", this),
|
||||
new Among ( "tt", -1, -1, "", this)
|
||||
};
|
||||
|
||||
private Among a_3[] = {
|
||||
new Among ( "ene", -1, 2, "", this),
|
||||
new Among ( "se", -1, 3, "", this),
|
||||
new Among ( "en", -1, 2, "", this),
|
||||
new Among ( "heden", 2, 1, "", this),
|
||||
new Among ( "s", -1, 3, "", this)
|
||||
};
|
||||
|
||||
private Among a_4[] = {
|
||||
new Among ( "end", -1, 1, "", this),
|
||||
new Among ( "ig", -1, 2, "", this),
|
||||
new Among ( "ing", -1, 1, "", this),
|
||||
new Among ( "lijk", -1, 3, "", this),
|
||||
new Among ( "baar", -1, 4, "", this),
|
||||
new Among ( "bar", -1, 5, "", this)
|
||||
};
|
||||
|
||||
private Among a_5[] = {
|
||||
new Among ( "aa", -1, -1, "", this),
|
||||
new Among ( "ee", -1, -1, "", this),
|
||||
new Among ( "oo", -1, -1, "", this),
|
||||
new Among ( "uu", -1, -1, "", this)
|
||||
};
|
||||
|
||||
private static final char g_v[] = {17, 65, 16, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 128 };
|
||||
|
||||
private static final char g_v_I[] = {1, 0, 0, 17, 65, 16, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 128 };
|
||||
|
||||
private static final char g_v_j[] = {17, 67, 16, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 128 };
|
||||
|
||||
private int I_p2;
|
||||
private int I_p1;
|
||||
private boolean B_e_found;
|
||||
|
||||
private void copy_from(DutchStemmer other) {
|
||||
I_p2 = other.I_p2;
|
||||
I_p1 = other.I_p1;
|
||||
B_e_found = other.B_e_found;
|
||||
super.copy_from(other);
|
||||
}
|
||||
|
||||
private boolean r_prelude() {
|
||||
int among_var;
|
||||
int v_1;
|
||||
int v_2;
|
||||
int v_3;
|
||||
int v_4;
|
||||
int v_5;
|
||||
int v_6;
|
||||
// (, line 41
|
||||
// test, line 42
|
||||
v_1 = cursor;
|
||||
// repeat, line 42
|
||||
replab0: while(true)
|
||||
{
|
||||
v_2 = cursor;
|
||||
lab1: do {
|
||||
// (, line 42
|
||||
// [, line 43
|
||||
bra = cursor;
|
||||
// substring, line 43
|
||||
among_var = find_among(a_0, 11);
|
||||
if (among_var == 0)
|
||||
{
|
||||
break lab1;
|
||||
}
|
||||
// ], line 43
|
||||
ket = cursor;
|
||||
switch(among_var) {
|
||||
case 0:
|
||||
break lab1;
|
||||
case 1:
|
||||
// (, line 45
|
||||
// <-, line 45
|
||||
slice_from("a");
|
||||
break;
|
||||
case 2:
|
||||
// (, line 47
|
||||
// <-, line 47
|
||||
slice_from("e");
|
||||
break;
|
||||
case 3:
|
||||
// (, line 49
|
||||
// <-, line 49
|
||||
slice_from("i");
|
||||
break;
|
||||
case 4:
|
||||
// (, line 51
|
||||
// <-, line 51
|
||||
slice_from("o");
|
||||
break;
|
||||
case 5:
|
||||
// (, line 53
|
||||
// <-, line 53
|
||||
slice_from("u");
|
||||
break;
|
||||
case 6:
|
||||
// (, line 54
|
||||
// next, line 54
|
||||
if (cursor >= limit)
|
||||
{
|
||||
break lab1;
|
||||
}
|
||||
cursor++;
|
||||
break;
|
||||
}
|
||||
continue replab0;
|
||||
} while (false);
|
||||
cursor = v_2;
|
||||
break replab0;
|
||||
}
|
||||
cursor = v_1;
|
||||
// try, line 57
|
||||
v_3 = cursor;
|
||||
lab2: do {
|
||||
// (, line 57
|
||||
// [, line 57
|
||||
bra = cursor;
|
||||
// literal, line 57
|
||||
if (!(eq_s(1, "y")))
|
||||
{
|
||||
cursor = v_3;
|
||||
break lab2;
|
||||
}
|
||||
// ], line 57
|
||||
ket = cursor;
|
||||
// <-, line 57
|
||||
slice_from("Y");
|
||||
} while (false);
|
||||
// repeat, line 58
|
||||
replab3: while(true)
|
||||
{
|
||||
v_4 = cursor;
|
||||
lab4: do {
|
||||
// goto, line 58
|
||||
golab5: while(true)
|
||||
{
|
||||
v_5 = cursor;
|
||||
lab6: do {
|
||||
// (, line 58
|
||||
if (!(in_grouping(g_v, 97, 232)))
|
||||
{
|
||||
break lab6;
|
||||
}
|
||||
// [, line 59
|
||||
bra = cursor;
|
||||
// or, line 59
|
||||
lab7: do {
|
||||
v_6 = cursor;
|
||||
lab8: do {
|
||||
// (, line 59
|
||||
// literal, line 59
|
||||
if (!(eq_s(1, "i")))
|
||||
{
|
||||
break lab8;
|
||||
}
|
||||
// ], line 59
|
||||
ket = cursor;
|
||||
if (!(in_grouping(g_v, 97, 232)))
|
||||
{
|
||||
break lab8;
|
||||
}
|
||||
// <-, line 59
|
||||
slice_from("I");
|
||||
break lab7;
|
||||
} while (false);
|
||||
cursor = v_6;
|
||||
// (, line 60
|
||||
// literal, line 60
|
||||
if (!(eq_s(1, "y")))
|
||||
{
|
||||
break lab6;
|
||||
}
|
||||
// ], line 60
|
||||
ket = cursor;
|
||||
// <-, line 60
|
||||
slice_from("Y");
|
||||
} while (false);
|
||||
cursor = v_5;
|
||||
break golab5;
|
||||
} while (false);
|
||||
cursor = v_5;
|
||||
if (cursor >= limit)
|
||||
{
|
||||
break lab4;
|
||||
}
|
||||
cursor++;
|
||||
}
|
||||
continue replab3;
|
||||
} while (false);
|
||||
cursor = v_4;
|
||||
break replab3;
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
private boolean r_mark_regions() {
|
||||
// (, line 64
|
||||
I_p1 = limit;
|
||||
I_p2 = limit;
|
||||
// gopast, line 69
|
||||
golab0: while(true)
|
||||
{
|
||||
lab1: do {
|
||||
if (!(in_grouping(g_v, 97, 232)))
|
||||
{
|
||||
break lab1;
|
||||
}
|
||||
break golab0;
|
||||
} while (false);
|
||||
if (cursor >= limit)
|
||||
{
|
||||
return false;
|
||||
}
|
||||
cursor++;
|
||||
}
|
||||
// gopast, line 69
|
||||
golab2: while(true)
|
||||
{
|
||||
lab3: do {
|
||||
if (!(out_grouping(g_v, 97, 232)))
|
||||
{
|
||||
break lab3;
|
||||
}
|
||||
break golab2;
|
||||
} while (false);
|
||||
if (cursor >= limit)
|
||||
{
|
||||
return false;
|
||||
}
|
||||
cursor++;
|
||||
}
|
||||
// setmark p1, line 69
|
||||
I_p1 = cursor;
|
||||
// try, line 70
|
||||
lab4: do {
|
||||
// (, line 70
|
||||
if (!(I_p1 < 3))
|
||||
{
|
||||
break lab4;
|
||||
}
|
||||
I_p1 = 3;
|
||||
} while (false);
|
||||
// gopast, line 71
|
||||
golab5: while(true)
|
||||
{
|
||||
lab6: do {
|
||||
if (!(in_grouping(g_v, 97, 232)))
|
||||
{
|
||||
break lab6;
|
||||
}
|
||||
break golab5;
|
||||
} while (false);
|
||||
if (cursor >= limit)
|
||||
{
|
||||
return false;
|
||||
}
|
||||
cursor++;
|
||||
}
|
||||
// gopast, line 71
|
||||
golab7: while(true)
|
||||
{
|
||||
lab8: do {
|
||||
if (!(out_grouping(g_v, 97, 232)))
|
||||
{
|
||||
break lab8;
|
||||
}
|
||||
break golab7;
|
||||
} while (false);
|
||||
if (cursor >= limit)
|
||||
{
|
||||
return false;
|
||||
}
|
||||
cursor++;
|
||||
}
|
||||
// setmark p2, line 71
|
||||
I_p2 = cursor;
|
||||
return true;
|
||||
}
|
||||
|
||||
private boolean r_postlude() {
|
||||
int among_var;
|
||||
int v_1;
|
||||
// repeat, line 75
|
||||
replab0: while(true)
|
||||
{
|
||||
v_1 = cursor;
|
||||
lab1: do {
|
||||
// (, line 75
|
||||
// [, line 77
|
||||
bra = cursor;
|
||||
// substring, line 77
|
||||
among_var = find_among(a_1, 3);
|
||||
if (among_var == 0)
|
||||
{
|
||||
break lab1;
|
||||
}
|
||||
// ], line 77
|
||||
ket = cursor;
|
||||
switch(among_var) {
|
||||
case 0:
|
||||
break lab1;
|
||||
case 1:
|
||||
// (, line 78
|
||||
// <-, line 78
|
||||
slice_from("y");
|
||||
break;
|
||||
case 2:
|
||||
// (, line 79
|
||||
// <-, line 79
|
||||
slice_from("i");
|
||||
break;
|
||||
case 3:
|
||||
// (, line 80
|
||||
// next, line 80
|
||||
if (cursor >= limit)
|
||||
{
|
||||
break lab1;
|
||||
}
|
||||
cursor++;
|
||||
break;
|
||||
}
|
||||
continue replab0;
|
||||
} while (false);
|
||||
cursor = v_1;
|
||||
break replab0;
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
private boolean r_R1() {
|
||||
if (!(I_p1 <= cursor))
|
||||
{
|
||||
return false;
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
private boolean r_R2() {
|
||||
if (!(I_p2 <= cursor))
|
||||
{
|
||||
return false;
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
private boolean r_undouble() {
|
||||
int v_1;
|
||||
// (, line 90
|
||||
// test, line 91
|
||||
v_1 = limit - cursor;
|
||||
// among, line 91
|
||||
if (find_among_b(a_2, 3) == 0)
|
||||
{
|
||||
return false;
|
||||
}
|
||||
cursor = limit - v_1;
|
||||
// [, line 91
|
||||
ket = cursor;
|
||||
// next, line 91
|
||||
if (cursor <= limit_backward)
|
||||
{
|
||||
return false;
|
||||
}
|
||||
cursor--;
|
||||
// ], line 91
|
||||
bra = cursor;
|
||||
// delete, line 91
|
||||
slice_del();
|
||||
return true;
|
||||
}
|
||||
|
||||
private boolean r_e_ending() {
|
||||
int v_1;
|
||||
// (, line 94
|
||||
// unset e_found, line 95
|
||||
B_e_found = false;
|
||||
// [, line 96
|
||||
ket = cursor;
|
||||
// literal, line 96
|
||||
if (!(eq_s_b(1, "e")))
|
||||
{
|
||||
return false;
|
||||
}
|
||||
// ], line 96
|
||||
bra = cursor;
|
||||
// call R1, line 96
|
||||
if (!r_R1())
|
||||
{
|
||||
return false;
|
||||
}
|
||||
// test, line 96
|
||||
v_1 = limit - cursor;
|
||||
if (!(out_grouping_b(g_v, 97, 232)))
|
||||
{
|
||||
return false;
|
||||
}
|
||||
cursor = limit - v_1;
|
||||
// delete, line 96
|
||||
slice_del();
|
||||
// set e_found, line 97
|
||||
B_e_found = true;
|
||||
// call undouble, line 98
|
||||
if (!r_undouble())
|
||||
{
|
||||
return false;
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
private boolean r_en_ending() {
|
||||
int v_1;
|
||||
int v_2;
|
||||
// (, line 101
|
||||
// call R1, line 102
|
||||
if (!r_R1())
|
||||
{
|
||||
return false;
|
||||
}
|
||||
// and, line 102
|
||||
v_1 = limit - cursor;
|
||||
if (!(out_grouping_b(g_v, 97, 232)))
|
||||
{
|
||||
return false;
|
||||
}
|
||||
cursor = limit - v_1;
|
||||
// not, line 102
|
||||
{
|
||||
v_2 = limit - cursor;
|
||||
lab0: do {
|
||||
// literal, line 102
|
||||
if (!(eq_s_b(3, "gem")))
|
||||
{
|
||||
break lab0;
|
||||
}
|
||||
return false;
|
||||
} while (false);
|
||||
cursor = limit - v_2;
|
||||
}
|
||||
// delete, line 102
|
||||
slice_del();
|
||||
// call undouble, line 103
|
||||
if (!r_undouble())
|
||||
{
|
||||
return false;
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
private boolean r_standard_suffix() {
|
||||
int among_var;
|
||||
int v_1;
|
||||
int v_2;
|
||||
int v_3;
|
||||
int v_4;
|
||||
int v_5;
|
||||
int v_6;
|
||||
int v_7;
|
||||
int v_8;
|
||||
int v_9;
|
||||
int v_10;
|
||||
// (, line 106
|
||||
// do, line 107
|
||||
v_1 = limit - cursor;
|
||||
lab0: do {
|
||||
// (, line 107
|
||||
// [, line 108
|
||||
ket = cursor;
|
||||
// substring, line 108
|
||||
among_var = find_among_b(a_3, 5);
|
||||
if (among_var == 0)
|
||||
{
|
||||
break lab0;
|
||||
}
|
||||
// ], line 108
|
||||
bra = cursor;
|
||||
switch(among_var) {
|
||||
case 0:
|
||||
break lab0;
|
||||
case 1:
|
||||
// (, line 110
|
||||
// call R1, line 110
|
||||
if (!r_R1())
|
||||
{
|
||||
break lab0;
|
||||
}
|
||||
// <-, line 110
|
||||
slice_from("heid");
|
||||
break;
|
||||
case 2:
|
||||
// (, line 113
|
||||
// call en_ending, line 113
|
||||
if (!r_en_ending())
|
||||
{
|
||||
break lab0;
|
||||
}
|
||||
break;
|
||||
case 3:
|
||||
// (, line 116
|
||||
// call R1, line 116
|
||||
if (!r_R1())
|
||||
{
|
||||
break lab0;
|
||||
}
|
||||
if (!(out_grouping_b(g_v_j, 97, 232)))
|
||||
{
|
||||
break lab0;
|
||||
}
|
||||
// delete, line 116
|
||||
slice_del();
|
||||
break;
|
||||
}
|
||||
} while (false);
|
||||
cursor = limit - v_1;
|
||||
// do, line 120
|
||||
v_2 = limit - cursor;
|
||||
lab1: do {
|
||||
// call e_ending, line 120
|
||||
if (!r_e_ending())
|
||||
{
|
||||
break lab1;
|
||||
}
|
||||
} while (false);
|
||||
cursor = limit - v_2;
|
||||
// do, line 122
|
||||
v_3 = limit - cursor;
|
||||
lab2: do {
|
||||
// (, line 122
|
||||
// [, line 122
|
||||
ket = cursor;
|
||||
// literal, line 122
|
||||
if (!(eq_s_b(4, "heid")))
|
||||
{
|
||||
break lab2;
|
||||
}
|
||||
// ], line 122
|
||||
bra = cursor;
|
||||
// call R2, line 122
|
||||
if (!r_R2())
|
||||
{
|
||||
break lab2;
|
||||
}
|
||||
// not, line 122
|
||||
{
|
||||
v_4 = limit - cursor;
|
||||
lab3: do {
|
||||
// literal, line 122
|
||||
if (!(eq_s_b(1, "c")))
|
||||
{
|
||||
break lab3;
|
||||
}
|
||||
break lab2;
|
||||
} while (false);
|
||||
cursor = limit - v_4;
|
||||
}
|
||||
// delete, line 122
|
||||
slice_del();
|
||||
// [, line 123
|
||||
ket = cursor;
|
||||
// literal, line 123
|
||||
if (!(eq_s_b(2, "en")))
|
||||
{
|
||||
break lab2;
|
||||
}
|
||||
// ], line 123
|
||||
bra = cursor;
|
||||
// call en_ending, line 123
|
||||
if (!r_en_ending())
|
||||
{
|
||||
break lab2;
|
||||
}
|
||||
} while (false);
|
||||
cursor = limit - v_3;
|
||||
// do, line 126
|
||||
v_5 = limit - cursor;
|
||||
lab4: do {
|
||||
// (, line 126
|
||||
// [, line 127
|
||||
ket = cursor;
|
||||
// substring, line 127
|
||||
among_var = find_among_b(a_4, 6);
|
||||
if (among_var == 0)
|
||||
{
|
||||
break lab4;
|
||||
}
|
||||
// ], line 127
|
||||
bra = cursor;
|
||||
switch(among_var) {
|
||||
case 0:
|
||||
break lab4;
|
||||
case 1:
|
||||
// (, line 129
|
||||
// call R2, line 129
|
||||
if (!r_R2())
|
||||
{
|
||||
break lab4;
|
||||
}
|
||||
// delete, line 129
|
||||
slice_del();
|
||||
// or, line 130
|
||||
lab5: do {
|
||||
v_6 = limit - cursor;
|
||||
lab6: do {
|
||||
// (, line 130
|
||||
// [, line 130
|
||||
ket = cursor;
|
||||
// literal, line 130
|
||||
if (!(eq_s_b(2, "ig")))
|
||||
{
|
||||
break lab6;
|
||||
}
|
||||
// ], line 130
|
||||
bra = cursor;
|
||||
// call R2, line 130
|
||||
if (!r_R2())
|
||||
{
|
||||
break lab6;
|
||||
}
|
||||
// not, line 130
|
||||
{
|
||||
v_7 = limit - cursor;
|
||||
lab7: do {
|
||||
// literal, line 130
|
||||
if (!(eq_s_b(1, "e")))
|
||||
{
|
||||
break lab7;
|
||||
}
|
||||
break lab6;
|
||||
} while (false);
|
||||
cursor = limit - v_7;
|
||||
}
|
||||
// delete, line 130
|
||||
slice_del();
|
||||
break lab5;
|
||||
} while (false);
|
||||
cursor = limit - v_6;
|
||||
// call undouble, line 130
|
||||
if (!r_undouble())
|
||||
{
|
||||
break lab4;
|
||||
}
|
||||
} while (false);
|
||||
break;
|
||||
case 2:
|
||||
// (, line 133
|
||||
// call R2, line 133
|
||||
if (!r_R2())
|
||||
{
|
||||
break lab4;
|
||||
}
|
||||
// not, line 133
|
||||
{
|
||||
v_8 = limit - cursor;
|
||||
lab8: do {
|
||||
// literal, line 133
|
||||
if (!(eq_s_b(1, "e")))
|
||||
{
|
||||
break lab8;
|
||||
}
|
||||
break lab4;
|
||||
} while (false);
|
||||
cursor = limit - v_8;
|
||||
}
|
||||
// delete, line 133
|
||||
slice_del();
|
||||
break;
|
||||
case 3:
|
||||
// (, line 136
|
||||
// call R2, line 136
|
||||
if (!r_R2())
|
||||
{
|
||||
break lab4;
|
||||
}
|
||||
// delete, line 136
|
||||
slice_del();
|
||||
// call e_ending, line 136
|
||||
if (!r_e_ending())
|
||||
{
|
||||
break lab4;
|
||||
}
|
||||
break;
|
||||
case 4:
|
||||
// (, line 139
|
||||
// call R2, line 139
|
||||
if (!r_R2())
|
||||
{
|
||||
break lab4;
|
||||
}
|
||||
// delete, line 139
|
||||
slice_del();
|
||||
break;
|
||||
case 5:
|
||||
// (, line 142
|
||||
// call R2, line 142
|
||||
if (!r_R2())
|
||||
{
|
||||
break lab4;
|
||||
}
|
||||
// Boolean test e_found, line 142
|
||||
if (!(B_e_found))
|
||||
{
|
||||
break lab4;
|
||||
}
|
||||
// delete, line 142
|
||||
slice_del();
|
||||
break;
|
||||
}
|
||||
} while (false);
|
||||
cursor = limit - v_5;
|
||||
// do, line 146
|
||||
v_9 = limit - cursor;
|
||||
lab9: do {
|
||||
// (, line 146
|
||||
if (!(out_grouping_b(g_v_I, 73, 232)))
|
||||
{
|
||||
break lab9;
|
||||
}
|
||||
// test, line 148
|
||||
v_10 = limit - cursor;
|
||||
// (, line 148
|
||||
// among, line 149
|
||||
if (find_among_b(a_5, 4) == 0)
|
||||
{
|
||||
break lab9;
|
||||
}
|
||||
if (!(out_grouping_b(g_v, 97, 232)))
|
||||
{
|
||||
break lab9;
|
||||
}
|
||||
cursor = limit - v_10;
|
||||
// [, line 152
|
||||
ket = cursor;
|
||||
// next, line 152
|
||||
if (cursor <= limit_backward)
|
||||
{
|
||||
break lab9;
|
||||
}
|
||||
cursor--;
|
||||
// ], line 152
|
||||
bra = cursor;
|
||||
// delete, line 152
|
||||
slice_del();
|
||||
} while (false);
|
||||
cursor = limit - v_9;
|
||||
return true;
|
||||
}
|
||||
|
||||
public boolean stem() {
|
||||
int v_1;
|
||||
int v_2;
|
||||
int v_3;
|
||||
int v_4;
|
||||
// (, line 157
|
||||
// do, line 159
|
||||
v_1 = cursor;
|
||||
lab0: do {
|
||||
// call prelude, line 159
|
||||
if (!r_prelude())
|
||||
{
|
||||
break lab0;
|
||||
}
|
||||
} while (false);
|
||||
cursor = v_1;
|
||||
// do, line 160
|
||||
v_2 = cursor;
|
||||
lab1: do {
|
||||
// call mark_regions, line 160
|
||||
if (!r_mark_regions())
|
||||
{
|
||||
break lab1;
|
||||
}
|
||||
} while (false);
|
||||
cursor = v_2;
|
||||
// backwards, line 161
|
||||
limit_backward = cursor; cursor = limit;
|
||||
// do, line 162
|
||||
v_3 = limit - cursor;
|
||||
lab2: do {
|
||||
// call standard_suffix, line 162
|
||||
if (!r_standard_suffix())
|
||||
{
|
||||
break lab2;
|
||||
}
|
||||
} while (false);
|
||||
cursor = limit - v_3;
|
||||
cursor = limit_backward; // do, line 163
|
||||
v_4 = cursor;
|
||||
lab3: do {
|
||||
// call postlude, line 163
|
||||
if (!r_postlude())
|
||||
{
|
||||
break lab3;
|
||||
}
|
||||
} while (false);
|
||||
cursor = v_4;
|
||||
return true;
|
||||
}
|
||||
|
||||
}
|
||||
|
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
|
@ -0,0 +1,726 @@
|
|||
// This file was generated automatically by the Snowball to Java compiler
|
||||
|
||||
package org.tartarus.snowball.ext;
|
||||
import org.tartarus.snowball.SnowballProgram;
|
||||
import org.tartarus.snowball.Among;
|
||||
|
||||
/**
|
||||
* Generated class implementing code defined by a snowball script.
|
||||
*/
|
||||
public class German2Stemmer extends SnowballProgram {
|
||||
|
||||
private Among a_0[] = {
|
||||
new Among ( "", -1, 6, "", this),
|
||||
new Among ( "ae", 0, 2, "", this),
|
||||
new Among ( "oe", 0, 3, "", this),
|
||||
new Among ( "qu", 0, 5, "", this),
|
||||
new Among ( "ue", 0, 4, "", this),
|
||||
new Among ( "\u00DF", 0, 1, "", this)
|
||||
};
|
||||
|
||||
private Among a_1[] = {
|
||||
new Among ( "", -1, 6, "", this),
|
||||
new Among ( "U", 0, 2, "", this),
|
||||
new Among ( "Y", 0, 1, "", this),
|
||||
new Among ( "\u00E4", 0, 3, "", this),
|
||||
new Among ( "\u00F6", 0, 4, "", this),
|
||||
new Among ( "\u00FC", 0, 5, "", this)
|
||||
};
|
||||
|
||||
private Among a_2[] = {
|
||||
new Among ( "e", -1, 1, "", this),
|
||||
new Among ( "em", -1, 1, "", this),
|
||||
new Among ( "en", -1, 1, "", this),
|
||||
new Among ( "ern", -1, 1, "", this),
|
||||
new Among ( "er", -1, 1, "", this),
|
||||
new Among ( "s", -1, 2, "", this),
|
||||
new Among ( "es", 5, 1, "", this)
|
||||
};
|
||||
|
||||
private Among a_3[] = {
|
||||
new Among ( "en", -1, 1, "", this),
|
||||
new Among ( "er", -1, 1, "", this),
|
||||
new Among ( "st", -1, 2, "", this),
|
||||
new Among ( "est", 2, 1, "", this)
|
||||
};
|
||||
|
||||
private Among a_4[] = {
|
||||
new Among ( "ig", -1, 1, "", this),
|
||||
new Among ( "lich", -1, 1, "", this)
|
||||
};
|
||||
|
||||
private Among a_5[] = {
|
||||
new Among ( "end", -1, 1, "", this),
|
||||
new Among ( "ig", -1, 2, "", this),
|
||||
new Among ( "ung", -1, 1, "", this),
|
||||
new Among ( "lich", -1, 3, "", this),
|
||||
new Among ( "isch", -1, 2, "", this),
|
||||
new Among ( "ik", -1, 2, "", this),
|
||||
new Among ( "heit", -1, 3, "", this),
|
||||
new Among ( "keit", -1, 4, "", this)
|
||||
};
|
||||
|
||||
private static final char g_v[] = {17, 65, 16, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 8, 0, 32, 8 };
|
||||
|
||||
private static final char g_s_ending[] = {117, 30, 5 };
|
||||
|
||||
private static final char g_st_ending[] = {117, 30, 4 };
|
||||
|
||||
private int I_x;
|
||||
private int I_p2;
|
||||
private int I_p1;
|
||||
|
||||
private void copy_from(German2Stemmer other) {
|
||||
I_x = other.I_x;
|
||||
I_p2 = other.I_p2;
|
||||
I_p1 = other.I_p1;
|
||||
super.copy_from(other);
|
||||
}
|
||||
|
||||
private boolean r_prelude() {
|
||||
int among_var;
|
||||
int v_1;
|
||||
int v_2;
|
||||
int v_3;
|
||||
int v_4;
|
||||
int v_5;
|
||||
// (, line 28
|
||||
// test, line 30
|
||||
v_1 = cursor;
|
||||
// repeat, line 30
|
||||
replab0: while(true)
|
||||
{
|
||||
v_2 = cursor;
|
||||
lab1: do {
|
||||
// goto, line 30
|
||||
golab2: while(true)
|
||||
{
|
||||
v_3 = cursor;
|
||||
lab3: do {
|
||||
// (, line 30
|
||||
if (!(in_grouping(g_v, 97, 252)))
|
||||
{
|
||||
break lab3;
|
||||
}
|
||||
// [, line 31
|
||||
bra = cursor;
|
||||
// or, line 31
|
||||
lab4: do {
|
||||
v_4 = cursor;
|
||||
lab5: do {
|
||||
// (, line 31
|
||||
// literal, line 31
|
||||
if (!(eq_s(1, "u")))
|
||||
{
|
||||
break lab5;
|
||||
}
|
||||
// ], line 31
|
||||
ket = cursor;
|
||||
if (!(in_grouping(g_v, 97, 252)))
|
||||
{
|
||||
break lab5;
|
||||
}
|
||||
// <-, line 31
|
||||
slice_from("U");
|
||||
break lab4;
|
||||
} while (false);
|
||||
cursor = v_4;
|
||||
// (, line 32
|
||||
// literal, line 32
|
||||
if (!(eq_s(1, "y")))
|
||||
{
|
||||
break lab3;
|
||||
}
|
||||
// ], line 32
|
||||
ket = cursor;
|
||||
if (!(in_grouping(g_v, 97, 252)))
|
||||
{
|
||||
break lab3;
|
||||
}
|
||||
// <-, line 32
|
||||
slice_from("Y");
|
||||
} while (false);
|
||||
cursor = v_3;
|
||||
break golab2;
|
||||
} while (false);
|
||||
cursor = v_3;
|
||||
if (cursor >= limit)
|
||||
{
|
||||
break lab1;
|
||||
}
|
||||
cursor++;
|
||||
}
|
||||
continue replab0;
|
||||
} while (false);
|
||||
cursor = v_2;
|
||||
break replab0;
|
||||
}
|
||||
cursor = v_1;
|
||||
// repeat, line 35
|
||||
replab6: while(true)
|
||||
{
|
||||
v_5 = cursor;
|
||||
lab7: do {
|
||||
// (, line 35
|
||||
// [, line 36
|
||||
bra = cursor;
|
||||
// substring, line 36
|
||||
among_var = find_among(a_0, 6);
|
||||
if (among_var == 0)
|
||||
{
|
||||
break lab7;
|
||||
}
|
||||
// ], line 36
|
||||
ket = cursor;
|
||||
switch(among_var) {
|
||||
case 0:
|
||||
break lab7;
|
||||
case 1:
|
||||
// (, line 37
|
||||
// <-, line 37
|
||||
slice_from("ss");
|
||||
break;
|
||||
case 2:
|
||||
// (, line 38
|
||||
// <-, line 38
|
||||
slice_from("\u00E4");
|
||||
break;
|
||||
case 3:
|
||||
// (, line 39
|
||||
// <-, line 39
|
||||
slice_from("\u00F6");
|
||||
break;
|
||||
case 4:
|
||||
// (, line 40
|
||||
// <-, line 40
|
||||
slice_from("\u00FC");
|
||||
break;
|
||||
case 5:
|
||||
// (, line 41
|
||||
// hop, line 41
|
||||
{
|
||||
int c = cursor + 2;
|
||||
if (0 > c || c > limit)
|
||||
{
|
||||
break lab7;
|
||||
}
|
||||
cursor = c;
|
||||
}
|
||||
break;
|
||||
case 6:
|
||||
// (, line 42
|
||||
// next, line 42
|
||||
if (cursor >= limit)
|
||||
{
|
||||
break lab7;
|
||||
}
|
||||
cursor++;
|
||||
break;
|
||||
}
|
||||
continue replab6;
|
||||
} while (false);
|
||||
cursor = v_5;
|
||||
break replab6;
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
private boolean r_mark_regions() {
|
||||
int v_1;
|
||||
// (, line 48
|
||||
I_p1 = limit;
|
||||
I_p2 = limit;
|
||||
// test, line 53
|
||||
v_1 = cursor;
|
||||
// (, line 53
|
||||
// hop, line 53
|
||||
{
|
||||
int c = cursor + 3;
|
||||
if (0 > c || c > limit)
|
||||
{
|
||||
return false;
|
||||
}
|
||||
cursor = c;
|
||||
}
|
||||
// setmark x, line 53
|
||||
I_x = cursor;
|
||||
cursor = v_1;
|
||||
// gopast, line 55
|
||||
golab0: while(true)
|
||||
{
|
||||
lab1: do {
|
||||
if (!(in_grouping(g_v, 97, 252)))
|
||||
{
|
||||
break lab1;
|
||||
}
|
||||
break golab0;
|
||||
} while (false);
|
||||
if (cursor >= limit)
|
||||
{
|
||||
return false;
|
||||
}
|
||||
cursor++;
|
||||
}
|
||||
// gopast, line 55
|
||||
golab2: while(true)
|
||||
{
|
||||
lab3: do {
|
||||
if (!(out_grouping(g_v, 97, 252)))
|
||||
{
|
||||
break lab3;
|
||||
}
|
||||
break golab2;
|
||||
} while (false);
|
||||
if (cursor >= limit)
|
||||
{
|
||||
return false;
|
||||
}
|
||||
cursor++;
|
||||
}
|
||||
// setmark p1, line 55
|
||||
I_p1 = cursor;
|
||||
// try, line 56
|
||||
lab4: do {
|
||||
// (, line 56
|
||||
if (!(I_p1 < I_x))
|
||||
{
|
||||
break lab4;
|
||||
}
|
||||
I_p1 = I_x;
|
||||
} while (false);
|
||||
// gopast, line 57
|
||||
golab5: while(true)
|
||||
{
|
||||
lab6: do {
|
||||
if (!(in_grouping(g_v, 97, 252)))
|
||||
{
|
||||
break lab6;
|
||||
}
|
||||
break golab5;
|
||||
} while (false);
|
||||
if (cursor >= limit)
|
||||
{
|
||||
return false;
|
||||
}
|
||||
cursor++;
|
||||
}
|
||||
// gopast, line 57
|
||||
golab7: while(true)
|
||||
{
|
||||
lab8: do {
|
||||
if (!(out_grouping(g_v, 97, 252)))
|
||||
{
|
||||
break lab8;
|
||||
}
|
||||
break golab7;
|
||||
} while (false);
|
||||
if (cursor >= limit)
|
||||
{
|
||||
return false;
|
||||
}
|
||||
cursor++;
|
||||
}
|
||||
// setmark p2, line 57
|
||||
I_p2 = cursor;
|
||||
return true;
|
||||
}
|
||||
|
||||
private boolean r_postlude() {
|
||||
int among_var;
|
||||
int v_1;
|
||||
// repeat, line 61
|
||||
replab0: while(true)
|
||||
{
|
||||
v_1 = cursor;
|
||||
lab1: do {
|
||||
// (, line 61
|
||||
// [, line 63
|
||||
bra = cursor;
|
||||
// substring, line 63
|
||||
among_var = find_among(a_1, 6);
|
||||
if (among_var == 0)
|
||||
{
|
||||
break lab1;
|
||||
}
|
||||
// ], line 63
|
||||
ket = cursor;
|
||||
switch(among_var) {
|
||||
case 0:
|
||||
break lab1;
|
||||
case 1:
|
||||
// (, line 64
|
||||
// <-, line 64
|
||||
slice_from("y");
|
||||
break;
|
||||
case 2:
|
||||
// (, line 65
|
||||
// <-, line 65
|
||||
slice_from("u");
|
||||
break;
|
||||
case 3:
|
||||
// (, line 66
|
||||
// <-, line 66
|
||||
slice_from("a");
|
||||
break;
|
||||
case 4:
|
||||
// (, line 67
|
||||
// <-, line 67
|
||||
slice_from("o");
|
||||
break;
|
||||
case 5:
|
||||
// (, line 68
|
||||
// <-, line 68
|
||||
slice_from("u");
|
||||
break;
|
||||
case 6:
|
||||
// (, line 69
|
||||
// next, line 69
|
||||
if (cursor >= limit)
|
||||
{
|
||||
break lab1;
|
||||
}
|
||||
cursor++;
|
||||
break;
|
||||
}
|
||||
continue replab0;
|
||||
} while (false);
|
||||
cursor = v_1;
|
||||
break replab0;
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
private boolean r_R1() {
|
||||
if (!(I_p1 <= cursor))
|
||||
{
|
||||
return false;
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
private boolean r_R2() {
|
||||
if (!(I_p2 <= cursor))
|
||||
{
|
||||
return false;
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
private boolean r_standard_suffix() {
|
||||
int among_var;
|
||||
int v_1;
|
||||
int v_2;
|
||||
int v_3;
|
||||
int v_4;
|
||||
int v_5;
|
||||
int v_6;
|
||||
int v_7;
|
||||
int v_8;
|
||||
int v_9;
|
||||
// (, line 79
|
||||
// do, line 80
|
||||
v_1 = limit - cursor;
|
||||
lab0: do {
|
||||
// (, line 80
|
||||
// [, line 81
|
||||
ket = cursor;
|
||||
// substring, line 81
|
||||
among_var = find_among_b(a_2, 7);
|
||||
if (among_var == 0)
|
||||
{
|
||||
break lab0;
|
||||
}
|
||||
// ], line 81
|
||||
bra = cursor;
|
||||
// call R1, line 81
|
||||
if (!r_R1())
|
||||
{
|
||||
break lab0;
|
||||
}
|
||||
switch(among_var) {
|
||||
case 0:
|
||||
break lab0;
|
||||
case 1:
|
||||
// (, line 83
|
||||
// delete, line 83
|
||||
slice_del();
|
||||
break;
|
||||
case 2:
|
||||
// (, line 86
|
||||
if (!(in_grouping_b(g_s_ending, 98, 116)))
|
||||
{
|
||||
break lab0;
|
||||
}
|
||||
// delete, line 86
|
||||
slice_del();
|
||||
break;
|
||||
}
|
||||
} while (false);
|
||||
cursor = limit - v_1;
|
||||
// do, line 90
|
||||
v_2 = limit - cursor;
|
||||
lab1: do {
|
||||
// (, line 90
|
||||
// [, line 91
|
||||
ket = cursor;
|
||||
// substring, line 91
|
||||
among_var = find_among_b(a_3, 4);
|
||||
if (among_var == 0)
|
||||
{
|
||||
break lab1;
|
||||
}
|
||||
// ], line 91
|
||||
bra = cursor;
|
||||
// call R1, line 91
|
||||
if (!r_R1())
|
||||
{
|
||||
break lab1;
|
||||
}
|
||||
switch(among_var) {
|
||||
case 0:
|
||||
break lab1;
|
||||
case 1:
|
||||
// (, line 93
|
||||
// delete, line 93
|
||||
slice_del();
|
||||
break;
|
||||
case 2:
|
||||
// (, line 96
|
||||
if (!(in_grouping_b(g_st_ending, 98, 116)))
|
||||
{
|
||||
break lab1;
|
||||
}
|
||||
// hop, line 96
|
||||
{
|
||||
int c = cursor - 3;
|
||||
if (limit_backward > c || c > limit)
|
||||
{
|
||||
break lab1;
|
||||
}
|
||||
cursor = c;
|
||||
}
|
||||
// delete, line 96
|
||||
slice_del();
|
||||
break;
|
||||
}
|
||||
} while (false);
|
||||
cursor = limit - v_2;
|
||||
// do, line 100
|
||||
v_3 = limit - cursor;
|
||||
lab2: do {
|
||||
// (, line 100
|
||||
// [, line 101
|
||||
ket = cursor;
|
||||
// substring, line 101
|
||||
among_var = find_among_b(a_5, 8);
|
||||
if (among_var == 0)
|
||||
{
|
||||
break lab2;
|
||||
}
|
||||
// ], line 101
|
||||
bra = cursor;
|
||||
// call R2, line 101
|
||||
if (!r_R2())
|
||||
{
|
||||
break lab2;
|
||||
}
|
||||
switch(among_var) {
|
||||
case 0:
|
||||
break lab2;
|
||||
case 1:
|
||||
// (, line 103
|
||||
// delete, line 103
|
||||
slice_del();
|
||||
// try, line 104
|
||||
v_4 = limit - cursor;
|
||||
lab3: do {
|
||||
// (, line 104
|
||||
// [, line 104
|
||||
ket = cursor;
|
||||
// literal, line 104
|
||||
if (!(eq_s_b(2, "ig")))
|
||||
{
|
||||
cursor = limit - v_4;
|
||||
break lab3;
|
||||
}
|
||||
// ], line 104
|
||||
bra = cursor;
|
||||
// not, line 104
|
||||
{
|
||||
v_5 = limit - cursor;
|
||||
lab4: do {
|
||||
// literal, line 104
|
||||
if (!(eq_s_b(1, "e")))
|
||||
{
|
||||
break lab4;
|
||||
}
|
||||
cursor = limit - v_4;
|
||||
break lab3;
|
||||
} while (false);
|
||||
cursor = limit - v_5;
|
||||
}
|
||||
// call R2, line 104
|
||||
if (!r_R2())
|
||||
{
|
||||
cursor = limit - v_4;
|
||||
break lab3;
|
||||
}
|
||||
// delete, line 104
|
||||
slice_del();
|
||||
} while (false);
|
||||
break;
|
||||
case 2:
|
||||
// (, line 107
|
||||
// not, line 107
|
||||
{
|
||||
v_6 = limit - cursor;
|
||||
lab5: do {
|
||||
// literal, line 107
|
||||
if (!(eq_s_b(1, "e")))
|
||||
{
|
||||
break lab5;
|
||||
}
|
||||
break lab2;
|
||||
} while (false);
|
||||
cursor = limit - v_6;
|
||||
}
|
||||
// delete, line 107
|
||||
slice_del();
|
||||
break;
|
||||
case 3:
|
||||
// (, line 110
|
||||
// delete, line 110
|
||||
slice_del();
|
||||
// try, line 111
|
||||
v_7 = limit - cursor;
|
||||
lab6: do {
|
||||
// (, line 111
|
||||
// [, line 112
|
||||
ket = cursor;
|
||||
// or, line 112
|
||||
lab7: do {
|
||||
v_8 = limit - cursor;
|
||||
lab8: do {
|
||||
// literal, line 112
|
||||
if (!(eq_s_b(2, "er")))
|
||||
{
|
||||
break lab8;
|
||||
}
|
||||
break lab7;
|
||||
} while (false);
|
||||
cursor = limit - v_8;
|
||||
// literal, line 112
|
||||
if (!(eq_s_b(2, "en")))
|
||||
{
|
||||
cursor = limit - v_7;
|
||||
break lab6;
|
||||
}
|
||||
} while (false);
|
||||
// ], line 112
|
||||
bra = cursor;
|
||||
// call R1, line 112
|
||||
if (!r_R1())
|
||||
{
|
||||
cursor = limit - v_7;
|
||||
break lab6;
|
||||
}
|
||||
// delete, line 112
|
||||
slice_del();
|
||||
} while (false);
|
||||
break;
|
||||
case 4:
|
||||
// (, line 116
|
||||
// delete, line 116
|
||||
slice_del();
|
||||
// try, line 117
|
||||
v_9 = limit - cursor;
|
||||
lab9: do {
|
||||
// (, line 117
|
||||
// [, line 118
|
||||
ket = cursor;
|
||||
// substring, line 118
|
||||
among_var = find_among_b(a_4, 2);
|
||||
if (among_var == 0)
|
||||
{
|
||||
cursor = limit - v_9;
|
||||
break lab9;
|
||||
}
|
||||
// ], line 118
|
||||
bra = cursor;
|
||||
// call R2, line 118
|
||||
if (!r_R2())
|
||||
{
|
||||
cursor = limit - v_9;
|
||||
break lab9;
|
||||
}
|
||||
switch(among_var) {
|
||||
case 0:
|
||||
cursor = limit - v_9;
|
||||
break lab9;
|
||||
case 1:
|
||||
// (, line 120
|
||||
// delete, line 120
|
||||
slice_del();
|
||||
break;
|
||||
}
|
||||
} while (false);
|
||||
break;
|
||||
}
|
||||
} while (false);
|
||||
cursor = limit - v_3;
|
||||
return true;
|
||||
}
|
||||
|
||||
public boolean stem() {
|
||||
int v_1;
|
||||
int v_2;
|
||||
int v_3;
|
||||
int v_4;
|
||||
// (, line 130
|
||||
// do, line 131
|
||||
v_1 = cursor;
|
||||
lab0: do {
|
||||
// call prelude, line 131
|
||||
if (!r_prelude())
|
||||
{
|
||||
break lab0;
|
||||
}
|
||||
} while (false);
|
||||
cursor = v_1;
|
||||
// do, line 132
|
||||
v_2 = cursor;
|
||||
lab1: do {
|
||||
// call mark_regions, line 132
|
||||
if (!r_mark_regions())
|
||||
{
|
||||
break lab1;
|
||||
}
|
||||
} while (false);
|
||||
cursor = v_2;
|
||||
// backwards, line 133
|
||||
limit_backward = cursor; cursor = limit;
|
||||
// do, line 134
|
||||
v_3 = limit - cursor;
|
||||
lab2: do {
|
||||
// call standard_suffix, line 134
|
||||
if (!r_standard_suffix())
|
||||
{
|
||||
break lab2;
|
||||
}
|
||||
} while (false);
|
||||
cursor = limit - v_3;
|
||||
cursor = limit_backward; // do, line 135
|
||||
v_4 = cursor;
|
||||
lab3: do {
|
||||
// call postlude, line 135
|
||||
if (!r_postlude())
|
||||
{
|
||||
break lab3;
|
||||
}
|
||||
} while (false);
|
||||
cursor = v_4;
|
||||
return true;
|
||||
}
|
||||
|
||||
}
|
||||
|
|
@ -0,0 +1,688 @@
|
|||
// This file was generated automatically by the Snowball to Java compiler
|
||||
|
||||
package org.tartarus.snowball.ext;
|
||||
import org.tartarus.snowball.SnowballProgram;
|
||||
import org.tartarus.snowball.Among;
|
||||
|
||||
/**
|
||||
* Generated class implementing code defined by a snowball script.
|
||||
*/
|
||||
public class GermanStemmer extends SnowballProgram {
|
||||
|
||||
private Among a_0[] = {
|
||||
new Among ( "", -1, 6, "", this),
|
||||
new Among ( "U", 0, 2, "", this),
|
||||
new Among ( "Y", 0, 1, "", this),
|
||||
new Among ( "\u00E4", 0, 3, "", this),
|
||||
new Among ( "\u00F6", 0, 4, "", this),
|
||||
new Among ( "\u00FC", 0, 5, "", this)
|
||||
};
|
||||
|
||||
private Among a_1[] = {
|
||||
new Among ( "e", -1, 1, "", this),
|
||||
new Among ( "em", -1, 1, "", this),
|
||||
new Among ( "en", -1, 1, "", this),
|
||||
new Among ( "ern", -1, 1, "", this),
|
||||
new Among ( "er", -1, 1, "", this),
|
||||
new Among ( "s", -1, 2, "", this),
|
||||
new Among ( "es", 5, 1, "", this)
|
||||
};
|
||||
|
||||
private Among a_2[] = {
|
||||
new Among ( "en", -1, 1, "", this),
|
||||
new Among ( "er", -1, 1, "", this),
|
||||
new Among ( "st", -1, 2, "", this),
|
||||
new Among ( "est", 2, 1, "", this)
|
||||
};
|
||||
|
||||
private Among a_3[] = {
|
||||
new Among ( "ig", -1, 1, "", this),
|
||||
new Among ( "lich", -1, 1, "", this)
|
||||
};
|
||||
|
||||
private Among a_4[] = {
|
||||
new Among ( "end", -1, 1, "", this),
|
||||
new Among ( "ig", -1, 2, "", this),
|
||||
new Among ( "ung", -1, 1, "", this),
|
||||
new Among ( "lich", -1, 3, "", this),
|
||||
new Among ( "isch", -1, 2, "", this),
|
||||
new Among ( "ik", -1, 2, "", this),
|
||||
new Among ( "heit", -1, 3, "", this),
|
||||
new Among ( "keit", -1, 4, "", this)
|
||||
};
|
||||
|
||||
private static final char g_v[] = {17, 65, 16, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 8, 0, 32, 8 };
|
||||
|
||||
private static final char g_s_ending[] = {117, 30, 5 };
|
||||
|
||||
private static final char g_st_ending[] = {117, 30, 4 };
|
||||
|
||||
private int I_x;
|
||||
private int I_p2;
|
||||
private int I_p1;
|
||||
|
||||
private void copy_from(GermanStemmer other) {
|
||||
I_x = other.I_x;
|
||||
I_p2 = other.I_p2;
|
||||
I_p1 = other.I_p1;
|
||||
super.copy_from(other);
|
||||
}
|
||||
|
||||
private boolean r_prelude() {
|
||||
int v_1;
|
||||
int v_2;
|
||||
int v_3;
|
||||
int v_4;
|
||||
int v_5;
|
||||
int v_6;
|
||||
// (, line 28
|
||||
// test, line 30
|
||||
v_1 = cursor;
|
||||
// repeat, line 30
|
||||
replab0: while(true)
|
||||
{
|
||||
v_2 = cursor;
|
||||
lab1: do {
|
||||
// (, line 30
|
||||
// or, line 33
|
||||
lab2: do {
|
||||
v_3 = cursor;
|
||||
lab3: do {
|
||||
// (, line 31
|
||||
// [, line 32
|
||||
bra = cursor;
|
||||
// literal, line 32
|
||||
if (!(eq_s(1, "\u00DF")))
|
||||
{
|
||||
break lab3;
|
||||
}
|
||||
// ], line 32
|
||||
ket = cursor;
|
||||
// <-, line 32
|
||||
slice_from("ss");
|
||||
break lab2;
|
||||
} while (false);
|
||||
cursor = v_3;
|
||||
// next, line 33
|
||||
if (cursor >= limit)
|
||||
{
|
||||
break lab1;
|
||||
}
|
||||
cursor++;
|
||||
} while (false);
|
||||
continue replab0;
|
||||
} while (false);
|
||||
cursor = v_2;
|
||||
break replab0;
|
||||
}
|
||||
cursor = v_1;
|
||||
// repeat, line 36
|
||||
replab4: while(true)
|
||||
{
|
||||
v_4 = cursor;
|
||||
lab5: do {
|
||||
// goto, line 36
|
||||
golab6: while(true)
|
||||
{
|
||||
v_5 = cursor;
|
||||
lab7: do {
|
||||
// (, line 36
|
||||
if (!(in_grouping(g_v, 97, 252)))
|
||||
{
|
||||
break lab7;
|
||||
}
|
||||
// [, line 37
|
||||
bra = cursor;
|
||||
// or, line 37
|
||||
lab8: do {
|
||||
v_6 = cursor;
|
||||
lab9: do {
|
||||
// (, line 37
|
||||
// literal, line 37
|
||||
if (!(eq_s(1, "u")))
|
||||
{
|
||||
break lab9;
|
||||
}
|
||||
// ], line 37
|
||||
ket = cursor;
|
||||
if (!(in_grouping(g_v, 97, 252)))
|
||||
{
|
||||
break lab9;
|
||||
}
|
||||
// <-, line 37
|
||||
slice_from("U");
|
||||
break lab8;
|
||||
} while (false);
|
||||
cursor = v_6;
|
||||
// (, line 38
|
||||
// literal, line 38
|
||||
if (!(eq_s(1, "y")))
|
||||
{
|
||||
break lab7;
|
||||
}
|
||||
// ], line 38
|
||||
ket = cursor;
|
||||
if (!(in_grouping(g_v, 97, 252)))
|
||||
{
|
||||
break lab7;
|
||||
}
|
||||
// <-, line 38
|
||||
slice_from("Y");
|
||||
} while (false);
|
||||
cursor = v_5;
|
||||
break golab6;
|
||||
} while (false);
|
||||
cursor = v_5;
|
||||
if (cursor >= limit)
|
||||
{
|
||||
break lab5;
|
||||
}
|
||||
cursor++;
|
||||
}
|
||||
continue replab4;
|
||||
} while (false);
|
||||
cursor = v_4;
|
||||
break replab4;
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
private boolean r_mark_regions() {
|
||||
int v_1;
|
||||
// (, line 42
|
||||
I_p1 = limit;
|
||||
I_p2 = limit;
|
||||
// test, line 47
|
||||
v_1 = cursor;
|
||||
// (, line 47
|
||||
// hop, line 47
|
||||
{
|
||||
int c = cursor + 3;
|
||||
if (0 > c || c > limit)
|
||||
{
|
||||
return false;
|
||||
}
|
||||
cursor = c;
|
||||
}
|
||||
// setmark x, line 47
|
||||
I_x = cursor;
|
||||
cursor = v_1;
|
||||
// gopast, line 49
|
||||
golab0: while(true)
|
||||
{
|
||||
lab1: do {
|
||||
if (!(in_grouping(g_v, 97, 252)))
|
||||
{
|
||||
break lab1;
|
||||
}
|
||||
break golab0;
|
||||
} while (false);
|
||||
if (cursor >= limit)
|
||||
{
|
||||
return false;
|
||||
}
|
||||
cursor++;
|
||||
}
|
||||
// gopast, line 49
|
||||
golab2: while(true)
|
||||
{
|
||||
lab3: do {
|
||||
if (!(out_grouping(g_v, 97, 252)))
|
||||
{
|
||||
break lab3;
|
||||
}
|
||||
break golab2;
|
||||
} while (false);
|
||||
if (cursor >= limit)
|
||||
{
|
||||
return false;
|
||||
}
|
||||
cursor++;
|
||||
}
|
||||
// setmark p1, line 49
|
||||
I_p1 = cursor;
|
||||
// try, line 50
|
||||
lab4: do {
|
||||
// (, line 50
|
||||
if (!(I_p1 < I_x))
|
||||
{
|
||||
break lab4;
|
||||
}
|
||||
I_p1 = I_x;
|
||||
} while (false);
|
||||
// gopast, line 51
|
||||
golab5: while(true)
|
||||
{
|
||||
lab6: do {
|
||||
if (!(in_grouping(g_v, 97, 252)))
|
||||
{
|
||||
break lab6;
|
||||
}
|
||||
break golab5;
|
||||
} while (false);
|
||||
if (cursor >= limit)
|
||||
{
|
||||
return false;
|
||||
}
|
||||
cursor++;
|
||||
}
|
||||
// gopast, line 51
|
||||
golab7: while(true)
|
||||
{
|
||||
lab8: do {
|
||||
if (!(out_grouping(g_v, 97, 252)))
|
||||
{
|
||||
break lab8;
|
||||
}
|
||||
break golab7;
|
||||
} while (false);
|
||||
if (cursor >= limit)
|
||||
{
|
||||
return false;
|
||||
}
|
||||
cursor++;
|
||||
}
|
||||
// setmark p2, line 51
|
||||
I_p2 = cursor;
|
||||
return true;
|
||||
}
|
||||
|
||||
private boolean r_postlude() {
|
||||
int among_var;
|
||||
int v_1;
|
||||
// repeat, line 55
|
||||
replab0: while(true)
|
||||
{
|
||||
v_1 = cursor;
|
||||
lab1: do {
|
||||
// (, line 55
|
||||
// [, line 57
|
||||
bra = cursor;
|
||||
// substring, line 57
|
||||
among_var = find_among(a_0, 6);
|
||||
if (among_var == 0)
|
||||
{
|
||||
break lab1;
|
||||
}
|
||||
// ], line 57
|
||||
ket = cursor;
|
||||
switch(among_var) {
|
||||
case 0:
|
||||
break lab1;
|
||||
case 1:
|
||||
// (, line 58
|
||||
// <-, line 58
|
||||
slice_from("y");
|
||||
break;
|
||||
case 2:
|
||||
// (, line 59
|
||||
// <-, line 59
|
||||
slice_from("u");
|
||||
break;
|
||||
case 3:
|
||||
// (, line 60
|
||||
// <-, line 60
|
||||
slice_from("a");
|
||||
break;
|
||||
case 4:
|
||||
// (, line 61
|
||||
// <-, line 61
|
||||
slice_from("o");
|
||||
break;
|
||||
case 5:
|
||||
// (, line 62
|
||||
// <-, line 62
|
||||
slice_from("u");
|
||||
break;
|
||||
case 6:
|
||||
// (, line 63
|
||||
// next, line 63
|
||||
if (cursor >= limit)
|
||||
{
|
||||
break lab1;
|
||||
}
|
||||
cursor++;
|
||||
break;
|
||||
}
|
||||
continue replab0;
|
||||
} while (false);
|
||||
cursor = v_1;
|
||||
break replab0;
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
private boolean r_R1() {
|
||||
if (!(I_p1 <= cursor))
|
||||
{
|
||||
return false;
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
private boolean r_R2() {
|
||||
if (!(I_p2 <= cursor))
|
||||
{
|
||||
return false;
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
private boolean r_standard_suffix() {
|
||||
int among_var;
|
||||
int v_1;
|
||||
int v_2;
|
||||
int v_3;
|
||||
int v_4;
|
||||
int v_5;
|
||||
int v_6;
|
||||
int v_7;
|
||||
int v_8;
|
||||
int v_9;
|
||||
// (, line 73
|
||||
// do, line 74
|
||||
v_1 = limit - cursor;
|
||||
lab0: do {
|
||||
// (, line 74
|
||||
// [, line 75
|
||||
ket = cursor;
|
||||
// substring, line 75
|
||||
among_var = find_among_b(a_1, 7);
|
||||
if (among_var == 0)
|
||||
{
|
||||
break lab0;
|
||||
}
|
||||
// ], line 75
|
||||
bra = cursor;
|
||||
// call R1, line 75
|
||||
if (!r_R1())
|
||||
{
|
||||
break lab0;
|
||||
}
|
||||
switch(among_var) {
|
||||
case 0:
|
||||
break lab0;
|
||||
case 1:
|
||||
// (, line 77
|
||||
// delete, line 77
|
||||
slice_del();
|
||||
break;
|
||||
case 2:
|
||||
// (, line 80
|
||||
if (!(in_grouping_b(g_s_ending, 98, 116)))
|
||||
{
|
||||
break lab0;
|
||||
}
|
||||
// delete, line 80
|
||||
slice_del();
|
||||
break;
|
||||
}
|
||||
} while (false);
|
||||
cursor = limit - v_1;
|
||||
// do, line 84
|
||||
v_2 = limit - cursor;
|
||||
lab1: do {
|
||||
// (, line 84
|
||||
// [, line 85
|
||||
ket = cursor;
|
||||
// substring, line 85
|
||||
among_var = find_among_b(a_2, 4);
|
||||
if (among_var == 0)
|
||||
{
|
||||
break lab1;
|
||||
}
|
||||
// ], line 85
|
||||
bra = cursor;
|
||||
// call R1, line 85
|
||||
if (!r_R1())
|
||||
{
|
||||
break lab1;
|
||||
}
|
||||
switch(among_var) {
|
||||
case 0:
|
||||
break lab1;
|
||||
case 1:
|
||||
// (, line 87
|
||||
// delete, line 87
|
||||
slice_del();
|
||||
break;
|
||||
case 2:
|
||||
// (, line 90
|
||||
if (!(in_grouping_b(g_st_ending, 98, 116)))
|
||||
{
|
||||
break lab1;
|
||||
}
|
||||
// hop, line 90
|
||||
{
|
||||
int c = cursor - 3;
|
||||
if (limit_backward > c || c > limit)
|
||||
{
|
||||
break lab1;
|
||||
}
|
||||
cursor = c;
|
||||
}
|
||||
// delete, line 90
|
||||
slice_del();
|
||||
break;
|
||||
}
|
||||
} while (false);
|
||||
cursor = limit - v_2;
|
||||
// do, line 94
|
||||
v_3 = limit - cursor;
|
||||
lab2: do {
|
||||
// (, line 94
|
||||
// [, line 95
|
||||
ket = cursor;
|
||||
// substring, line 95
|
||||
among_var = find_among_b(a_4, 8);
|
||||
if (among_var == 0)
|
||||
{
|
||||
break lab2;
|
||||
}
|
||||
// ], line 95
|
||||
bra = cursor;
|
||||
// call R2, line 95
|
||||
if (!r_R2())
|
||||
{
|
||||
break lab2;
|
||||
}
|
||||
switch(among_var) {
|
||||
case 0:
|
||||
break lab2;
|
||||
case 1:
|
||||
// (, line 97
|
||||
// delete, line 97
|
||||
slice_del();
|
||||
// try, line 98
|
||||
v_4 = limit - cursor;
|
||||
lab3: do {
|
||||
// (, line 98
|
||||
// [, line 98
|
||||
ket = cursor;
|
||||
// literal, line 98
|
||||
if (!(eq_s_b(2, "ig")))
|
||||
{
|
||||
cursor = limit - v_4;
|
||||
break lab3;
|
||||
}
|
||||
// ], line 98
|
||||
bra = cursor;
|
||||
// not, line 98
|
||||
{
|
||||
v_5 = limit - cursor;
|
||||
lab4: do {
|
||||
// literal, line 98
|
||||
if (!(eq_s_b(1, "e")))
|
||||
{
|
||||
break lab4;
|
||||
}
|
||||
cursor = limit - v_4;
|
||||
break lab3;
|
||||
} while (false);
|
||||
cursor = limit - v_5;
|
||||
}
|
||||
// call R2, line 98
|
||||
if (!r_R2())
|
||||
{
|
||||
cursor = limit - v_4;
|
||||
break lab3;
|
||||
}
|
||||
// delete, line 98
|
||||
slice_del();
|
||||
} while (false);
|
||||
break;
|
||||
case 2:
|
||||
// (, line 101
|
||||
// not, line 101
|
||||
{
|
||||
v_6 = limit - cursor;
|
||||
lab5: do {
|
||||
// literal, line 101
|
||||
if (!(eq_s_b(1, "e")))
|
||||
{
|
||||
break lab5;
|
||||
}
|
||||
break lab2;
|
||||
} while (false);
|
||||
cursor = limit - v_6;
|
||||
}
|
||||
// delete, line 101
|
||||
slice_del();
|
||||
break;
|
||||
case 3:
|
||||
// (, line 104
|
||||
// delete, line 104
|
||||
slice_del();
|
||||
// try, line 105
|
||||
v_7 = limit - cursor;
|
||||
lab6: do {
|
||||
// (, line 105
|
||||
// [, line 106
|
||||
ket = cursor;
|
||||
// or, line 106
|
||||
lab7: do {
|
||||
v_8 = limit - cursor;
|
||||
lab8: do {
|
||||
// literal, line 106
|
||||
if (!(eq_s_b(2, "er")))
|
||||
{
|
||||
break lab8;
|
||||
}
|
||||
break lab7;
|
||||
} while (false);
|
||||
cursor = limit - v_8;
|
||||
// literal, line 106
|
||||
if (!(eq_s_b(2, "en")))
|
||||
{
|
||||
cursor = limit - v_7;
|
||||
break lab6;
|
||||
}
|
||||
} while (false);
|
||||
// ], line 106
|
||||
bra = cursor;
|
||||
// call R1, line 106
|
||||
if (!r_R1())
|
||||
{
|
||||
cursor = limit - v_7;
|
||||
break lab6;
|
||||
}
|
||||
// delete, line 106
|
||||
slice_del();
|
||||
} while (false);
|
||||
break;
|
||||
case 4:
|
||||
// (, line 110
|
||||
// delete, line 110
|
||||
slice_del();
|
||||
// try, line 111
|
||||
v_9 = limit - cursor;
|
||||
lab9: do {
|
||||
// (, line 111
|
||||
// [, line 112
|
||||
ket = cursor;
|
||||
// substring, line 112
|
||||
among_var = find_among_b(a_3, 2);
|
||||
if (among_var == 0)
|
||||
{
|
||||
cursor = limit - v_9;
|
||||
break lab9;
|
||||
}
|
||||
// ], line 112
|
||||
bra = cursor;
|
||||
// call R2, line 112
|
||||
if (!r_R2())
|
||||
{
|
||||
cursor = limit - v_9;
|
||||
break lab9;
|
||||
}
|
||||
switch(among_var) {
|
||||
case 0:
|
||||
cursor = limit - v_9;
|
||||
break lab9;
|
||||
case 1:
|
||||
// (, line 114
|
||||
// delete, line 114
|
||||
slice_del();
|
||||
break;
|
||||
}
|
||||
} while (false);
|
||||
break;
|
||||
}
|
||||
} while (false);
|
||||
cursor = limit - v_3;
|
||||
return true;
|
||||
}
|
||||
|
||||
public boolean stem() {
|
||||
int v_1;
|
||||
int v_2;
|
||||
int v_3;
|
||||
int v_4;
|
||||
// (, line 124
|
||||
// do, line 125
|
||||
v_1 = cursor;
|
||||
lab0: do {
|
||||
// call prelude, line 125
|
||||
if (!r_prelude())
|
||||
{
|
||||
break lab0;
|
||||
}
|
||||
} while (false);
|
||||
cursor = v_1;
|
||||
// do, line 126
|
||||
v_2 = cursor;
|
||||
lab1: do {
|
||||
// call mark_regions, line 126
|
||||
if (!r_mark_regions())
|
||||
{
|
||||
break lab1;
|
||||
}
|
||||
} while (false);
|
||||
cursor = v_2;
|
||||
// backwards, line 127
|
||||
limit_backward = cursor; cursor = limit;
|
||||
// do, line 128
|
||||
v_3 = limit - cursor;
|
||||
lab2: do {
|
||||
// call standard_suffix, line 128
|
||||
if (!r_standard_suffix())
|
||||
{
|
||||
break lab2;
|
||||
}
|
||||
} while (false);
|
||||
cursor = limit - v_3;
|
||||
cursor = limit_backward; // do, line 129
|
||||
v_4 = cursor;
|
||||
lab3: do {
|
||||
// call postlude, line 129
|
||||
if (!r_postlude())
|
||||
{
|
||||
break lab3;
|
||||
}
|
||||
} while (false);
|
||||
cursor = v_4;
|
||||
return true;
|
||||
}
|
||||
|
||||
}
|
||||
|
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
|
@ -0,0 +1,358 @@
|
|||
// This file was generated automatically by the Snowball to Java compiler
|
||||
|
||||
package org.tartarus.snowball.ext;
|
||||
import org.tartarus.snowball.SnowballProgram;
|
||||
import org.tartarus.snowball.Among;
|
||||
|
||||
/**
|
||||
* Generated class implementing code defined by a snowball script.
|
||||
*/
|
||||
public class NorwegianStemmer extends SnowballProgram {
|
||||
|
||||
private Among a_0[] = {
|
||||
new Among ( "a", -1, 1, "", this),
|
||||
new Among ( "e", -1, 1, "", this),
|
||||
new Among ( "ede", 1, 1, "", this),
|
||||
new Among ( "ande", 1, 1, "", this),
|
||||
new Among ( "ende", 1, 1, "", this),
|
||||
new Among ( "ane", 1, 1, "", this),
|
||||
new Among ( "ene", 1, 1, "", this),
|
||||
new Among ( "hetene", 6, 1, "", this),
|
||||
new Among ( "erte", 1, 3, "", this),
|
||||
new Among ( "en", -1, 1, "", this),
|
||||
new Among ( "heten", 9, 1, "", this),
|
||||
new Among ( "ar", -1, 1, "", this),
|
||||
new Among ( "er", -1, 1, "", this),
|
||||
new Among ( "heter", 12, 1, "", this),
|
||||
new Among ( "s", -1, 2, "", this),
|
||||
new Among ( "as", 14, 1, "", this),
|
||||
new Among ( "es", 14, 1, "", this),
|
||||
new Among ( "edes", 16, 1, "", this),
|
||||
new Among ( "endes", 16, 1, "", this),
|
||||
new Among ( "enes", 16, 1, "", this),
|
||||
new Among ( "hetenes", 19, 1, "", this),
|
||||
new Among ( "ens", 14, 1, "", this),
|
||||
new Among ( "hetens", 21, 1, "", this),
|
||||
new Among ( "ers", 14, 1, "", this),
|
||||
new Among ( "ets", 14, 1, "", this),
|
||||
new Among ( "et", -1, 1, "", this),
|
||||
new Among ( "het", 25, 1, "", this),
|
||||
new Among ( "ert", -1, 3, "", this),
|
||||
new Among ( "ast", -1, 1, "", this)
|
||||
};
|
||||
|
||||
private Among a_1[] = {
|
||||
new Among ( "dt", -1, -1, "", this),
|
||||
new Among ( "vt", -1, -1, "", this)
|
||||
};
|
||||
|
||||
private Among a_2[] = {
|
||||
new Among ( "leg", -1, 1, "", this),
|
||||
new Among ( "eleg", 0, 1, "", this),
|
||||
new Among ( "ig", -1, 1, "", this),
|
||||
new Among ( "eig", 2, 1, "", this),
|
||||
new Among ( "lig", 2, 1, "", this),
|
||||
new Among ( "elig", 4, 1, "", this),
|
||||
new Among ( "els", -1, 1, "", this),
|
||||
new Among ( "lov", -1, 1, "", this),
|
||||
new Among ( "elov", 7, 1, "", this),
|
||||
new Among ( "slov", 7, 1, "", this),
|
||||
new Among ( "hetslov", 9, 1, "", this)
|
||||
};
|
||||
|
||||
private static final char g_v[] = {17, 65, 16, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 48, 0, 128 };
|
||||
|
||||
private static final char g_s_ending[] = {119, 125, 149, 1 };
|
||||
|
||||
private int I_x;
|
||||
private int I_p1;
|
||||
|
||||
private void copy_from(NorwegianStemmer other) {
|
||||
I_x = other.I_x;
|
||||
I_p1 = other.I_p1;
|
||||
super.copy_from(other);
|
||||
}
|
||||
|
||||
private boolean r_mark_regions() {
|
||||
int v_1;
|
||||
int v_2;
|
||||
// (, line 26
|
||||
I_p1 = limit;
|
||||
// test, line 30
|
||||
v_1 = cursor;
|
||||
// (, line 30
|
||||
// hop, line 30
|
||||
{
|
||||
int c = cursor + 3;
|
||||
if (0 > c || c > limit)
|
||||
{
|
||||
return false;
|
||||
}
|
||||
cursor = c;
|
||||
}
|
||||
// setmark x, line 30
|
||||
I_x = cursor;
|
||||
cursor = v_1;
|
||||
// goto, line 31
|
||||
golab0: while(true)
|
||||
{
|
||||
v_2 = cursor;
|
||||
lab1: do {
|
||||
if (!(in_grouping(g_v, 97, 248)))
|
||||
{
|
||||
break lab1;
|
||||
}
|
||||
cursor = v_2;
|
||||
break golab0;
|
||||
} while (false);
|
||||
cursor = v_2;
|
||||
if (cursor >= limit)
|
||||
{
|
||||
return false;
|
||||
}
|
||||
cursor++;
|
||||
}
|
||||
// gopast, line 31
|
||||
golab2: while(true)
|
||||
{
|
||||
lab3: do {
|
||||
if (!(out_grouping(g_v, 97, 248)))
|
||||
{
|
||||
break lab3;
|
||||
}
|
||||
break golab2;
|
||||
} while (false);
|
||||
if (cursor >= limit)
|
||||
{
|
||||
return false;
|
||||
}
|
||||
cursor++;
|
||||
}
|
||||
// setmark p1, line 31
|
||||
I_p1 = cursor;
|
||||
// try, line 32
|
||||
lab4: do {
|
||||
// (, line 32
|
||||
if (!(I_p1 < I_x))
|
||||
{
|
||||
break lab4;
|
||||
}
|
||||
I_p1 = I_x;
|
||||
} while (false);
|
||||
return true;
|
||||
}
|
||||
|
||||
private boolean r_main_suffix() {
|
||||
int among_var;
|
||||
int v_1;
|
||||
int v_2;
|
||||
int v_3;
|
||||
// (, line 37
|
||||
// setlimit, line 38
|
||||
v_1 = limit - cursor;
|
||||
// tomark, line 38
|
||||
if (cursor < I_p1)
|
||||
{
|
||||
return false;
|
||||
}
|
||||
cursor = I_p1;
|
||||
v_2 = limit_backward;
|
||||
limit_backward = cursor;
|
||||
cursor = limit - v_1;
|
||||
// (, line 38
|
||||
// [, line 38
|
||||
ket = cursor;
|
||||
// substring, line 38
|
||||
among_var = find_among_b(a_0, 29);
|
||||
if (among_var == 0)
|
||||
{
|
||||
limit_backward = v_2;
|
||||
return false;
|
||||
}
|
||||
// ], line 38
|
||||
bra = cursor;
|
||||
limit_backward = v_2;
|
||||
switch(among_var) {
|
||||
case 0:
|
||||
return false;
|
||||
case 1:
|
||||
// (, line 44
|
||||
// delete, line 44
|
||||
slice_del();
|
||||
break;
|
||||
case 2:
|
||||
// (, line 46
|
||||
// or, line 46
|
||||
lab0: do {
|
||||
v_3 = limit - cursor;
|
||||
lab1: do {
|
||||
if (!(in_grouping_b(g_s_ending, 98, 122)))
|
||||
{
|
||||
break lab1;
|
||||
}
|
||||
break lab0;
|
||||
} while (false);
|
||||
cursor = limit - v_3;
|
||||
// (, line 46
|
||||
// literal, line 46
|
||||
if (!(eq_s_b(1, "k")))
|
||||
{
|
||||
return false;
|
||||
}
|
||||
if (!(out_grouping_b(g_v, 97, 248)))
|
||||
{
|
||||
return false;
|
||||
}
|
||||
} while (false);
|
||||
// delete, line 46
|
||||
slice_del();
|
||||
break;
|
||||
case 3:
|
||||
// (, line 48
|
||||
// <-, line 48
|
||||
slice_from("er");
|
||||
break;
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
private boolean r_consonant_pair() {
|
||||
int v_1;
|
||||
int v_2;
|
||||
int v_3;
|
||||
// (, line 52
|
||||
// test, line 53
|
||||
v_1 = limit - cursor;
|
||||
// (, line 53
|
||||
// setlimit, line 54
|
||||
v_2 = limit - cursor;
|
||||
// tomark, line 54
|
||||
if (cursor < I_p1)
|
||||
{
|
||||
return false;
|
||||
}
|
||||
cursor = I_p1;
|
||||
v_3 = limit_backward;
|
||||
limit_backward = cursor;
|
||||
cursor = limit - v_2;
|
||||
// (, line 54
|
||||
// [, line 54
|
||||
ket = cursor;
|
||||
// substring, line 54
|
||||
if (find_among_b(a_1, 2) == 0)
|
||||
{
|
||||
limit_backward = v_3;
|
||||
return false;
|
||||
}
|
||||
// ], line 54
|
||||
bra = cursor;
|
||||
limit_backward = v_3;
|
||||
cursor = limit - v_1;
|
||||
// next, line 59
|
||||
if (cursor <= limit_backward)
|
||||
{
|
||||
return false;
|
||||
}
|
||||
cursor--;
|
||||
// ], line 59
|
||||
bra = cursor;
|
||||
// delete, line 59
|
||||
slice_del();
|
||||
return true;
|
||||
}
|
||||
|
||||
private boolean r_other_suffix() {
|
||||
int among_var;
|
||||
int v_1;
|
||||
int v_2;
|
||||
// (, line 62
|
||||
// setlimit, line 63
|
||||
v_1 = limit - cursor;
|
||||
// tomark, line 63
|
||||
if (cursor < I_p1)
|
||||
{
|
||||
return false;
|
||||
}
|
||||
cursor = I_p1;
|
||||
v_2 = limit_backward;
|
||||
limit_backward = cursor;
|
||||
cursor = limit - v_1;
|
||||
// (, line 63
|
||||
// [, line 63
|
||||
ket = cursor;
|
||||
// substring, line 63
|
||||
among_var = find_among_b(a_2, 11);
|
||||
if (among_var == 0)
|
||||
{
|
||||
limit_backward = v_2;
|
||||
return false;
|
||||
}
|
||||
// ], line 63
|
||||
bra = cursor;
|
||||
limit_backward = v_2;
|
||||
switch(among_var) {
|
||||
case 0:
|
||||
return false;
|
||||
case 1:
|
||||
// (, line 67
|
||||
// delete, line 67
|
||||
slice_del();
|
||||
break;
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
public boolean stem() {
|
||||
int v_1;
|
||||
int v_2;
|
||||
int v_3;
|
||||
int v_4;
|
||||
// (, line 72
|
||||
// do, line 74
|
||||
v_1 = cursor;
|
||||
lab0: do {
|
||||
// call mark_regions, line 74
|
||||
if (!r_mark_regions())
|
||||
{
|
||||
break lab0;
|
||||
}
|
||||
} while (false);
|
||||
cursor = v_1;
|
||||
// backwards, line 75
|
||||
limit_backward = cursor; cursor = limit;
|
||||
// (, line 75
|
||||
// do, line 76
|
||||
v_2 = limit - cursor;
|
||||
lab1: do {
|
||||
// call main_suffix, line 76
|
||||
if (!r_main_suffix())
|
||||
{
|
||||
break lab1;
|
||||
}
|
||||
} while (false);
|
||||
cursor = limit - v_2;
|
||||
// do, line 77
|
||||
v_3 = limit - cursor;
|
||||
lab2: do {
|
||||
// call consonant_pair, line 77
|
||||
if (!r_consonant_pair())
|
||||
{
|
||||
break lab2;
|
||||
}
|
||||
} while (false);
|
||||
cursor = limit - v_3;
|
||||
// do, line 78
|
||||
v_4 = limit - cursor;
|
||||
lab3: do {
|
||||
// call other_suffix, line 78
|
||||
if (!r_other_suffix())
|
||||
{
|
||||
break lab3;
|
||||
}
|
||||
} while (false);
|
||||
cursor = limit - v_4;
|
||||
cursor = limit_backward; return true;
|
||||
}
|
||||
|
||||
}
|
||||
|
|
@ -0,0 +1,906 @@
|
|||
// This file was generated automatically by the Snowball to Java compiler
|
||||
|
||||
package org.tartarus.snowball.ext;
|
||||
import org.tartarus.snowball.SnowballProgram;
|
||||
import org.tartarus.snowball.Among;
|
||||
|
||||
/**
|
||||
* Generated class implementing code defined by a snowball script.
|
||||
*/
|
||||
public class PorterStemmer extends SnowballProgram {
|
||||
|
||||
private Among a_0[] = {
|
||||
new Among ( "s", -1, 3, "", this),
|
||||
new Among ( "ies", 0, 2, "", this),
|
||||
new Among ( "sses", 0, 1, "", this),
|
||||
new Among ( "ss", 0, -1, "", this)
|
||||
};
|
||||
|
||||
private Among a_1[] = {
|
||||
new Among ( "", -1, 3, "", this),
|
||||
new Among ( "bb", 0, 2, "", this),
|
||||
new Among ( "dd", 0, 2, "", this),
|
||||
new Among ( "ff", 0, 2, "", this),
|
||||
new Among ( "gg", 0, 2, "", this),
|
||||
new Among ( "bl", 0, 1, "", this),
|
||||
new Among ( "mm", 0, 2, "", this),
|
||||
new Among ( "nn", 0, 2, "", this),
|
||||
new Among ( "pp", 0, 2, "", this),
|
||||
new Among ( "rr", 0, 2, "", this),
|
||||
new Among ( "at", 0, 1, "", this),
|
||||
new Among ( "tt", 0, 2, "", this),
|
||||
new Among ( "iz", 0, 1, "", this)
|
||||
};
|
||||
|
||||
private Among a_2[] = {
|
||||
new Among ( "ed", -1, 2, "", this),
|
||||
new Among ( "eed", 0, 1, "", this),
|
||||
new Among ( "ing", -1, 2, "", this)
|
||||
};
|
||||
|
||||
private Among a_3[] = {
|
||||
new Among ( "anci", -1, 3, "", this),
|
||||
new Among ( "enci", -1, 2, "", this),
|
||||
new Among ( "abli", -1, 4, "", this),
|
||||
new Among ( "eli", -1, 6, "", this),
|
||||
new Among ( "alli", -1, 9, "", this),
|
||||
new Among ( "ousli", -1, 12, "", this),
|
||||
new Among ( "entli", -1, 5, "", this),
|
||||
new Among ( "aliti", -1, 10, "", this),
|
||||
new Among ( "biliti", -1, 14, "", this),
|
||||
new Among ( "iviti", -1, 13, "", this),
|
||||
new Among ( "tional", -1, 1, "", this),
|
||||
new Among ( "ational", 10, 8, "", this),
|
||||
new Among ( "alism", -1, 10, "", this),
|
||||
new Among ( "ation", -1, 8, "", this),
|
||||
new Among ( "ization", 13, 7, "", this),
|
||||
new Among ( "izer", -1, 7, "", this),
|
||||
new Among ( "ator", -1, 8, "", this),
|
||||
new Among ( "iveness", -1, 13, "", this),
|
||||
new Among ( "fulness", -1, 11, "", this),
|
||||
new Among ( "ousness", -1, 12, "", this)
|
||||
};
|
||||
|
||||
private Among a_4[] = {
|
||||
new Among ( "icate", -1, 2, "", this),
|
||||
new Among ( "ative", -1, 3, "", this),
|
||||
new Among ( "alize", -1, 1, "", this),
|
||||
new Among ( "iciti", -1, 2, "", this),
|
||||
new Among ( "ical", -1, 2, "", this),
|
||||
new Among ( "ful", -1, 3, "", this),
|
||||
new Among ( "ness", -1, 3, "", this)
|
||||
};
|
||||
|
||||
private Among a_5[] = {
|
||||
new Among ( "ic", -1, 1, "", this),
|
||||
new Among ( "ance", -1, 1, "", this),
|
||||
new Among ( "ence", -1, 1, "", this),
|
||||
new Among ( "able", -1, 1, "", this),
|
||||
new Among ( "ible", -1, 1, "", this),
|
||||
new Among ( "ate", -1, 1, "", this),
|
||||
new Among ( "ive", -1, 1, "", this),
|
||||
new Among ( "ize", -1, 1, "", this),
|
||||
new Among ( "iti", -1, 1, "", this),
|
||||
new Among ( "al", -1, 1, "", this),
|
||||
new Among ( "ism", -1, 1, "", this),
|
||||
new Among ( "ion", -1, 2, "", this),
|
||||
new Among ( "er", -1, 1, "", this),
|
||||
new Among ( "ous", -1, 1, "", this),
|
||||
new Among ( "ant", -1, 1, "", this),
|
||||
new Among ( "ent", -1, 1, "", this),
|
||||
new Among ( "ment", 15, 1, "", this),
|
||||
new Among ( "ement", 16, 1, "", this),
|
||||
new Among ( "ou", -1, 1, "", this)
|
||||
};
|
||||
|
||||
private static final char g_v[] = {17, 65, 16, 1 };
|
||||
|
||||
private static final char g_v_WXY[] = {1, 17, 65, 208, 1 };
|
||||
|
||||
private boolean B_Y_found;
|
||||
private int I_p2;
|
||||
private int I_p1;
|
||||
|
||||
private void copy_from(PorterStemmer other) {
|
||||
B_Y_found = other.B_Y_found;
|
||||
I_p2 = other.I_p2;
|
||||
I_p1 = other.I_p1;
|
||||
super.copy_from(other);
|
||||
}
|
||||
|
||||
private boolean r_shortv() {
|
||||
// (, line 19
|
||||
if (!(out_grouping_b(g_v_WXY, 89, 121)))
|
||||
{
|
||||
return false;
|
||||
}
|
||||
if (!(in_grouping_b(g_v, 97, 121)))
|
||||
{
|
||||
return false;
|
||||
}
|
||||
if (!(out_grouping_b(g_v, 97, 121)))
|
||||
{
|
||||
return false;
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
private boolean r_R1() {
|
||||
if (!(I_p1 <= cursor))
|
||||
{
|
||||
return false;
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
private boolean r_R2() {
|
||||
if (!(I_p2 <= cursor))
|
||||
{
|
||||
return false;
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
private boolean r_Step_1a() {
|
||||
int among_var;
|
||||
// (, line 24
|
||||
// [, line 25
|
||||
ket = cursor;
|
||||
// substring, line 25
|
||||
among_var = find_among_b(a_0, 4);
|
||||
if (among_var == 0)
|
||||
{
|
||||
return false;
|
||||
}
|
||||
// ], line 25
|
||||
bra = cursor;
|
||||
switch(among_var) {
|
||||
case 0:
|
||||
return false;
|
||||
case 1:
|
||||
// (, line 26
|
||||
// <-, line 26
|
||||
slice_from("ss");
|
||||
break;
|
||||
case 2:
|
||||
// (, line 27
|
||||
// <-, line 27
|
||||
slice_from("i");
|
||||
break;
|
||||
case 3:
|
||||
// (, line 29
|
||||
// delete, line 29
|
||||
slice_del();
|
||||
break;
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
private boolean r_Step_1b() {
|
||||
int among_var;
|
||||
int v_1;
|
||||
int v_3;
|
||||
int v_4;
|
||||
// (, line 33
|
||||
// [, line 34
|
||||
ket = cursor;
|
||||
// substring, line 34
|
||||
among_var = find_among_b(a_2, 3);
|
||||
if (among_var == 0)
|
||||
{
|
||||
return false;
|
||||
}
|
||||
// ], line 34
|
||||
bra = cursor;
|
||||
switch(among_var) {
|
||||
case 0:
|
||||
return false;
|
||||
case 1:
|
||||
// (, line 35
|
||||
// call R1, line 35
|
||||
if (!r_R1())
|
||||
{
|
||||
return false;
|
||||
}
|
||||
// <-, line 35
|
||||
slice_from("ee");
|
||||
break;
|
||||
case 2:
|
||||
// (, line 37
|
||||
// test, line 38
|
||||
v_1 = limit - cursor;
|
||||
// gopast, line 38
|
||||
golab0: while(true)
|
||||
{
|
||||
lab1: do {
|
||||
if (!(in_grouping_b(g_v, 97, 121)))
|
||||
{
|
||||
break lab1;
|
||||
}
|
||||
break golab0;
|
||||
} while (false);
|
||||
if (cursor <= limit_backward)
|
||||
{
|
||||
return false;
|
||||
}
|
||||
cursor--;
|
||||
}
|
||||
cursor = limit - v_1;
|
||||
// delete, line 38
|
||||
slice_del();
|
||||
// test, line 39
|
||||
v_3 = limit - cursor;
|
||||
// substring, line 39
|
||||
among_var = find_among_b(a_1, 13);
|
||||
if (among_var == 0)
|
||||
{
|
||||
return false;
|
||||
}
|
||||
cursor = limit - v_3;
|
||||
switch(among_var) {
|
||||
case 0:
|
||||
return false;
|
||||
case 1:
|
||||
// (, line 41
|
||||
// <+, line 41
|
||||
{
|
||||
int c = cursor;
|
||||
insert(cursor, cursor, "e");
|
||||
cursor = c;
|
||||
}
|
||||
break;
|
||||
case 2:
|
||||
// (, line 44
|
||||
// [, line 44
|
||||
ket = cursor;
|
||||
// next, line 44
|
||||
if (cursor <= limit_backward)
|
||||
{
|
||||
return false;
|
||||
}
|
||||
cursor--;
|
||||
// ], line 44
|
||||
bra = cursor;
|
||||
// delete, line 44
|
||||
slice_del();
|
||||
break;
|
||||
case 3:
|
||||
// (, line 45
|
||||
// atmark, line 45
|
||||
if (cursor != I_p1)
|
||||
{
|
||||
return false;
|
||||
}
|
||||
// test, line 45
|
||||
v_4 = limit - cursor;
|
||||
// call shortv, line 45
|
||||
if (!r_shortv())
|
||||
{
|
||||
return false;
|
||||
}
|
||||
cursor = limit - v_4;
|
||||
// <+, line 45
|
||||
{
|
||||
int c = cursor;
|
||||
insert(cursor, cursor, "e");
|
||||
cursor = c;
|
||||
}
|
||||
break;
|
||||
}
|
||||
break;
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
private boolean r_Step_1c() {
|
||||
int v_1;
|
||||
// (, line 51
|
||||
// [, line 52
|
||||
ket = cursor;
|
||||
// or, line 52
|
||||
lab0: do {
|
||||
v_1 = limit - cursor;
|
||||
lab1: do {
|
||||
// literal, line 52
|
||||
if (!(eq_s_b(1, "y")))
|
||||
{
|
||||
break lab1;
|
||||
}
|
||||
break lab0;
|
||||
} while (false);
|
||||
cursor = limit - v_1;
|
||||
// literal, line 52
|
||||
if (!(eq_s_b(1, "Y")))
|
||||
{
|
||||
return false;
|
||||
}
|
||||
} while (false);
|
||||
// ], line 52
|
||||
bra = cursor;
|
||||
// gopast, line 53
|
||||
golab2: while(true)
|
||||
{
|
||||
lab3: do {
|
||||
if (!(in_grouping_b(g_v, 97, 121)))
|
||||
{
|
||||
break lab3;
|
||||
}
|
||||
break golab2;
|
||||
} while (false);
|
||||
if (cursor <= limit_backward)
|
||||
{
|
||||
return false;
|
||||
}
|
||||
cursor--;
|
||||
}
|
||||
// <-, line 54
|
||||
slice_from("i");
|
||||
return true;
|
||||
}
|
||||
|
||||
private boolean r_Step_2() {
|
||||
int among_var;
|
||||
// (, line 57
|
||||
// [, line 58
|
||||
ket = cursor;
|
||||
// substring, line 58
|
||||
among_var = find_among_b(a_3, 20);
|
||||
if (among_var == 0)
|
||||
{
|
||||
return false;
|
||||
}
|
||||
// ], line 58
|
||||
bra = cursor;
|
||||
// call R1, line 58
|
||||
if (!r_R1())
|
||||
{
|
||||
return false;
|
||||
}
|
||||
switch(among_var) {
|
||||
case 0:
|
||||
return false;
|
||||
case 1:
|
||||
// (, line 59
|
||||
// <-, line 59
|
||||
slice_from("tion");
|
||||
break;
|
||||
case 2:
|
||||
// (, line 60
|
||||
// <-, line 60
|
||||
slice_from("ence");
|
||||
break;
|
||||
case 3:
|
||||
// (, line 61
|
||||
// <-, line 61
|
||||
slice_from("ance");
|
||||
break;
|
||||
case 4:
|
||||
// (, line 62
|
||||
// <-, line 62
|
||||
slice_from("able");
|
||||
break;
|
||||
case 5:
|
||||
// (, line 63
|
||||
// <-, line 63
|
||||
slice_from("ent");
|
||||
break;
|
||||
case 6:
|
||||
// (, line 64
|
||||
// <-, line 64
|
||||
slice_from("e");
|
||||
break;
|
||||
case 7:
|
||||
// (, line 66
|
||||
// <-, line 66
|
||||
slice_from("ize");
|
||||
break;
|
||||
case 8:
|
||||
// (, line 68
|
||||
// <-, line 68
|
||||
slice_from("ate");
|
||||
break;
|
||||
case 9:
|
||||
// (, line 69
|
||||
// <-, line 69
|
||||
slice_from("al");
|
||||
break;
|
||||
case 10:
|
||||
// (, line 71
|
||||
// <-, line 71
|
||||
slice_from("al");
|
||||
break;
|
||||
case 11:
|
||||
// (, line 72
|
||||
// <-, line 72
|
||||
slice_from("ful");
|
||||
break;
|
||||
case 12:
|
||||
// (, line 74
|
||||
// <-, line 74
|
||||
slice_from("ous");
|
||||
break;
|
||||
case 13:
|
||||
// (, line 76
|
||||
// <-, line 76
|
||||
slice_from("ive");
|
||||
break;
|
||||
case 14:
|
||||
// (, line 77
|
||||
// <-, line 77
|
||||
slice_from("ble");
|
||||
break;
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
private boolean r_Step_3() {
|
||||
int among_var;
|
||||
// (, line 81
|
||||
// [, line 82
|
||||
ket = cursor;
|
||||
// substring, line 82
|
||||
among_var = find_among_b(a_4, 7);
|
||||
if (among_var == 0)
|
||||
{
|
||||
return false;
|
||||
}
|
||||
// ], line 82
|
||||
bra = cursor;
|
||||
// call R1, line 82
|
||||
if (!r_R1())
|
||||
{
|
||||
return false;
|
||||
}
|
||||
switch(among_var) {
|
||||
case 0:
|
||||
return false;
|
||||
case 1:
|
||||
// (, line 83
|
||||
// <-, line 83
|
||||
slice_from("al");
|
||||
break;
|
||||
case 2:
|
||||
// (, line 85
|
||||
// <-, line 85
|
||||
slice_from("ic");
|
||||
break;
|
||||
case 3:
|
||||
// (, line 87
|
||||
// delete, line 87
|
||||
slice_del();
|
||||
break;
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
private boolean r_Step_4() {
|
||||
int among_var;
|
||||
int v_1;
|
||||
// (, line 91
|
||||
// [, line 92
|
||||
ket = cursor;
|
||||
// substring, line 92
|
||||
among_var = find_among_b(a_5, 19);
|
||||
if (among_var == 0)
|
||||
{
|
||||
return false;
|
||||
}
|
||||
// ], line 92
|
||||
bra = cursor;
|
||||
// call R2, line 92
|
||||
if (!r_R2())
|
||||
{
|
||||
return false;
|
||||
}
|
||||
switch(among_var) {
|
||||
case 0:
|
||||
return false;
|
||||
case 1:
|
||||
// (, line 95
|
||||
// delete, line 95
|
||||
slice_del();
|
||||
break;
|
||||
case 2:
|
||||
// (, line 96
|
||||
// or, line 96
|
||||
lab0: do {
|
||||
v_1 = limit - cursor;
|
||||
lab1: do {
|
||||
// literal, line 96
|
||||
if (!(eq_s_b(1, "s")))
|
||||
{
|
||||
break lab1;
|
||||
}
|
||||
break lab0;
|
||||
} while (false);
|
||||
cursor = limit - v_1;
|
||||
// literal, line 96
|
||||
if (!(eq_s_b(1, "t")))
|
||||
{
|
||||
return false;
|
||||
}
|
||||
} while (false);
|
||||
// delete, line 96
|
||||
slice_del();
|
||||
break;
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
private boolean r_Step_5a() {
|
||||
int v_1;
|
||||
int v_2;
|
||||
// (, line 100
|
||||
// [, line 101
|
||||
ket = cursor;
|
||||
// literal, line 101
|
||||
if (!(eq_s_b(1, "e")))
|
||||
{
|
||||
return false;
|
||||
}
|
||||
// ], line 101
|
||||
bra = cursor;
|
||||
// or, line 102
|
||||
lab0: do {
|
||||
v_1 = limit - cursor;
|
||||
lab1: do {
|
||||
// call R2, line 102
|
||||
if (!r_R2())
|
||||
{
|
||||
break lab1;
|
||||
}
|
||||
break lab0;
|
||||
} while (false);
|
||||
cursor = limit - v_1;
|
||||
// (, line 102
|
||||
// call R1, line 102
|
||||
if (!r_R1())
|
||||
{
|
||||
return false;
|
||||
}
|
||||
// not, line 102
|
||||
{
|
||||
v_2 = limit - cursor;
|
||||
lab2: do {
|
||||
// call shortv, line 102
|
||||
if (!r_shortv())
|
||||
{
|
||||
break lab2;
|
||||
}
|
||||
return false;
|
||||
} while (false);
|
||||
cursor = limit - v_2;
|
||||
}
|
||||
} while (false);
|
||||
// delete, line 103
|
||||
slice_del();
|
||||
return true;
|
||||
}
|
||||
|
||||
private boolean r_Step_5b() {
|
||||
// (, line 106
|
||||
// [, line 107
|
||||
ket = cursor;
|
||||
// literal, line 107
|
||||
if (!(eq_s_b(1, "l")))
|
||||
{
|
||||
return false;
|
||||
}
|
||||
// ], line 107
|
||||
bra = cursor;
|
||||
// call R2, line 108
|
||||
if (!r_R2())
|
||||
{
|
||||
return false;
|
||||
}
|
||||
// literal, line 108
|
||||
if (!(eq_s_b(1, "l")))
|
||||
{
|
||||
return false;
|
||||
}
|
||||
// delete, line 109
|
||||
slice_del();
|
||||
return true;
|
||||
}
|
||||
|
||||
public boolean stem() {
|
||||
int v_1;
|
||||
int v_2;
|
||||
int v_3;
|
||||
int v_4;
|
||||
int v_5;
|
||||
int v_10;
|
||||
int v_11;
|
||||
int v_12;
|
||||
int v_13;
|
||||
int v_14;
|
||||
int v_15;
|
||||
int v_16;
|
||||
int v_17;
|
||||
int v_18;
|
||||
int v_19;
|
||||
int v_20;
|
||||
// (, line 113
|
||||
// unset Y_found, line 115
|
||||
B_Y_found = false;
|
||||
// do, line 116
|
||||
v_1 = cursor;
|
||||
lab0: do {
|
||||
// (, line 116
|
||||
// [, line 116
|
||||
bra = cursor;
|
||||
// literal, line 116
|
||||
if (!(eq_s(1, "y")))
|
||||
{
|
||||
break lab0;
|
||||
}
|
||||
// ], line 116
|
||||
ket = cursor;
|
||||
// <-, line 116
|
||||
slice_from("Y");
|
||||
// set Y_found, line 116
|
||||
B_Y_found = true;
|
||||
} while (false);
|
||||
cursor = v_1;
|
||||
// do, line 117
|
||||
v_2 = cursor;
|
||||
lab1: do {
|
||||
// repeat, line 117
|
||||
replab2: while(true)
|
||||
{
|
||||
v_3 = cursor;
|
||||
lab3: do {
|
||||
// (, line 117
|
||||
// goto, line 117
|
||||
golab4: while(true)
|
||||
{
|
||||
v_4 = cursor;
|
||||
lab5: do {
|
||||
// (, line 117
|
||||
if (!(in_grouping(g_v, 97, 121)))
|
||||
{
|
||||
break lab5;
|
||||
}
|
||||
// [, line 117
|
||||
bra = cursor;
|
||||
// literal, line 117
|
||||
if (!(eq_s(1, "y")))
|
||||
{
|
||||
break lab5;
|
||||
}
|
||||
// ], line 117
|
||||
ket = cursor;
|
||||
cursor = v_4;
|
||||
break golab4;
|
||||
} while (false);
|
||||
cursor = v_4;
|
||||
if (cursor >= limit)
|
||||
{
|
||||
break lab3;
|
||||
}
|
||||
cursor++;
|
||||
}
|
||||
// <-, line 117
|
||||
slice_from("Y");
|
||||
// set Y_found, line 117
|
||||
B_Y_found = true;
|
||||
continue replab2;
|
||||
} while (false);
|
||||
cursor = v_3;
|
||||
break replab2;
|
||||
}
|
||||
} while (false);
|
||||
cursor = v_2;
|
||||
I_p1 = limit;
|
||||
I_p2 = limit;
|
||||
// do, line 121
|
||||
v_5 = cursor;
|
||||
lab6: do {
|
||||
// (, line 121
|
||||
// gopast, line 122
|
||||
golab7: while(true)
|
||||
{
|
||||
lab8: do {
|
||||
if (!(in_grouping(g_v, 97, 121)))
|
||||
{
|
||||
break lab8;
|
||||
}
|
||||
break golab7;
|
||||
} while (false);
|
||||
if (cursor >= limit)
|
||||
{
|
||||
break lab6;
|
||||
}
|
||||
cursor++;
|
||||
}
|
||||
// gopast, line 122
|
||||
golab9: while(true)
|
||||
{
|
||||
lab10: do {
|
||||
if (!(out_grouping(g_v, 97, 121)))
|
||||
{
|
||||
break lab10;
|
||||
}
|
||||
break golab9;
|
||||
} while (false);
|
||||
if (cursor >= limit)
|
||||
{
|
||||
break lab6;
|
||||
}
|
||||
cursor++;
|
||||
}
|
||||
// setmark p1, line 122
|
||||
I_p1 = cursor;
|
||||
// gopast, line 123
|
||||
golab11: while(true)
|
||||
{
|
||||
lab12: do {
|
||||
if (!(in_grouping(g_v, 97, 121)))
|
||||
{
|
||||
break lab12;
|
||||
}
|
||||
break golab11;
|
||||
} while (false);
|
||||
if (cursor >= limit)
|
||||
{
|
||||
break lab6;
|
||||
}
|
||||
cursor++;
|
||||
}
|
||||
// gopast, line 123
|
||||
golab13: while(true)
|
||||
{
|
||||
lab14: do {
|
||||
if (!(out_grouping(g_v, 97, 121)))
|
||||
{
|
||||
break lab14;
|
||||
}
|
||||
break golab13;
|
||||
} while (false);
|
||||
if (cursor >= limit)
|
||||
{
|
||||
break lab6;
|
||||
}
|
||||
cursor++;
|
||||
}
|
||||
// setmark p2, line 123
|
||||
I_p2 = cursor;
|
||||
} while (false);
|
||||
cursor = v_5;
|
||||
// backwards, line 126
|
||||
limit_backward = cursor; cursor = limit;
|
||||
// (, line 126
|
||||
// do, line 127
|
||||
v_10 = limit - cursor;
|
||||
lab15: do {
|
||||
// call Step_1a, line 127
|
||||
if (!r_Step_1a())
|
||||
{
|
||||
break lab15;
|
||||
}
|
||||
} while (false);
|
||||
cursor = limit - v_10;
|
||||
// do, line 128
|
||||
v_11 = limit - cursor;
|
||||
lab16: do {
|
||||
// call Step_1b, line 128
|
||||
if (!r_Step_1b())
|
||||
{
|
||||
break lab16;
|
||||
}
|
||||
} while (false);
|
||||
cursor = limit - v_11;
|
||||
// do, line 129
|
||||
v_12 = limit - cursor;
|
||||
lab17: do {
|
||||
// call Step_1c, line 129
|
||||
if (!r_Step_1c())
|
||||
{
|
||||
break lab17;
|
||||
}
|
||||
} while (false);
|
||||
cursor = limit - v_12;
|
||||
// do, line 130
|
||||
v_13 = limit - cursor;
|
||||
lab18: do {
|
||||
// call Step_2, line 130
|
||||
if (!r_Step_2())
|
||||
{
|
||||
break lab18;
|
||||
}
|
||||
} while (false);
|
||||
cursor = limit - v_13;
|
||||
// do, line 131
|
||||
v_14 = limit - cursor;
|
||||
lab19: do {
|
||||
// call Step_3, line 131
|
||||
if (!r_Step_3())
|
||||
{
|
||||
break lab19;
|
||||
}
|
||||
} while (false);
|
||||
cursor = limit - v_14;
|
||||
// do, line 132
|
||||
v_15 = limit - cursor;
|
||||
lab20: do {
|
||||
// call Step_4, line 132
|
||||
if (!r_Step_4())
|
||||
{
|
||||
break lab20;
|
||||
}
|
||||
} while (false);
|
||||
cursor = limit - v_15;
|
||||
// do, line 133
|
||||
v_16 = limit - cursor;
|
||||
lab21: do {
|
||||
// call Step_5a, line 133
|
||||
if (!r_Step_5a())
|
||||
{
|
||||
break lab21;
|
||||
}
|
||||
} while (false);
|
||||
cursor = limit - v_16;
|
||||
// do, line 134
|
||||
v_17 = limit - cursor;
|
||||
lab22: do {
|
||||
// call Step_5b, line 134
|
||||
if (!r_Step_5b())
|
||||
{
|
||||
break lab22;
|
||||
}
|
||||
} while (false);
|
||||
cursor = limit - v_17;
|
||||
cursor = limit_backward; // do, line 137
|
||||
v_18 = cursor;
|
||||
lab23: do {
|
||||
// (, line 137
|
||||
// Boolean test Y_found, line 137
|
||||
if (!(B_Y_found))
|
||||
{
|
||||
break lab23;
|
||||
}
|
||||
// repeat, line 137
|
||||
replab24: while(true)
|
||||
{
|
||||
v_19 = cursor;
|
||||
lab25: do {
|
||||
// (, line 137
|
||||
// goto, line 137
|
||||
golab26: while(true)
|
||||
{
|
||||
v_20 = cursor;
|
||||
lab27: do {
|
||||
// (, line 137
|
||||
// [, line 137
|
||||
bra = cursor;
|
||||
// literal, line 137
|
||||
if (!(eq_s(1, "Y")))
|
||||
{
|
||||
break lab27;
|
||||
}
|
||||
// ], line 137
|
||||
ket = cursor;
|
||||
cursor = v_20;
|
||||
break golab26;
|
||||
} while (false);
|
||||
cursor = v_20;
|
||||
if (cursor >= limit)
|
||||
{
|
||||
break lab25;
|
||||
}
|
||||
cursor++;
|
||||
}
|
||||
// <-, line 137
|
||||
slice_from("y");
|
||||
continue replab24;
|
||||
} while (false);
|
||||
cursor = v_19;
|
||||
break replab24;
|
||||
}
|
||||
} while (false);
|
||||
cursor = v_18;
|
||||
return true;
|
||||
}
|
||||
|
||||
}
|
||||
|
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
|
@ -0,0 +1,727 @@
|
|||
// This file was generated automatically by the Snowball to Java compiler
|
||||
|
||||
package org.tartarus.snowball.ext;
|
||||
import org.tartarus.snowball.SnowballProgram;
|
||||
import org.tartarus.snowball.Among;
|
||||
|
||||
/**
|
||||
* Generated class implementing code defined by a snowball script.
|
||||
*/
|
||||
public class RussianStemmer extends SnowballProgram {
|
||||
|
||||
private Among a_0[] = {
|
||||
new Among ( "\u0432", -1, 1, "", this),
|
||||
new Among ( "\u0438\u0432", 0, 2, "", this),
|
||||
new Among ( "\u044B\u0432", 0, 2, "", this),
|
||||
new Among ( "\u0432\u0448\u0438", -1, 1, "", this),
|
||||
new Among ( "\u0438\u0432\u0448\u0438", 3, 2, "", this),
|
||||
new Among ( "\u044B\u0432\u0448\u0438", 3, 2, "", this),
|
||||
new Among ( "\u0432\u0448\u0438\u0441\u044C", -1, 1, "", this),
|
||||
new Among ( "\u0438\u0432\u0448\u0438\u0441\u044C", 6, 2, "", this),
|
||||
new Among ( "\u044B\u0432\u0448\u0438\u0441\u044C", 6, 2, "", this)
|
||||
};
|
||||
|
||||
private Among a_1[] = {
|
||||
new Among ( "\u0435\u0435", -1, 1, "", this),
|
||||
new Among ( "\u0438\u0435", -1, 1, "", this),
|
||||
new Among ( "\u043E\u0435", -1, 1, "", this),
|
||||
new Among ( "\u044B\u0435", -1, 1, "", this),
|
||||
new Among ( "\u0438\u043C\u0438", -1, 1, "", this),
|
||||
new Among ( "\u044B\u043C\u0438", -1, 1, "", this),
|
||||
new Among ( "\u0435\u0439", -1, 1, "", this),
|
||||
new Among ( "\u0438\u0439", -1, 1, "", this),
|
||||
new Among ( "\u043E\u0439", -1, 1, "", this),
|
||||
new Among ( "\u044B\u0439", -1, 1, "", this),
|
||||
new Among ( "\u0435\u043C", -1, 1, "", this),
|
||||
new Among ( "\u0438\u043C", -1, 1, "", this),
|
||||
new Among ( "\u043E\u043C", -1, 1, "", this),
|
||||
new Among ( "\u044B\u043C", -1, 1, "", this),
|
||||
new Among ( "\u0435\u0433\u043E", -1, 1, "", this),
|
||||
new Among ( "\u043E\u0433\u043E", -1, 1, "", this),
|
||||
new Among ( "\u0435\u043C\u0443", -1, 1, "", this),
|
||||
new Among ( "\u043E\u043C\u0443", -1, 1, "", this),
|
||||
new Among ( "\u0438\u0445", -1, 1, "", this),
|
||||
new Among ( "\u044B\u0445", -1, 1, "", this),
|
||||
new Among ( "\u0435\u044E", -1, 1, "", this),
|
||||
new Among ( "\u043E\u044E", -1, 1, "", this),
|
||||
new Among ( "\u0443\u044E", -1, 1, "", this),
|
||||
new Among ( "\u044E\u044E", -1, 1, "", this),
|
||||
new Among ( "\u0430\u044F", -1, 1, "", this),
|
||||
new Among ( "\u044F\u044F", -1, 1, "", this)
|
||||
};
|
||||
|
||||
private Among a_2[] = {
|
||||
new Among ( "\u0435\u043C", -1, 1, "", this),
|
||||
new Among ( "\u043D\u043D", -1, 1, "", this),
|
||||
new Among ( "\u0432\u0448", -1, 1, "", this),
|
||||
new Among ( "\u0438\u0432\u0448", 2, 2, "", this),
|
||||
new Among ( "\u044B\u0432\u0448", 2, 2, "", this),
|
||||
new Among ( "\u0449", -1, 1, "", this),
|
||||
new Among ( "\u044E\u0449", 5, 1, "", this),
|
||||
new Among ( "\u0443\u044E\u0449", 6, 2, "", this)
|
||||
};
|
||||
|
||||
private Among a_3[] = {
|
||||
new Among ( "\u0441\u044C", -1, 1, "", this),
|
||||
new Among ( "\u0441\u044F", -1, 1, "", this)
|
||||
};
|
||||
|
||||
private Among a_4[] = {
|
||||
new Among ( "\u043B\u0430", -1, 1, "", this),
|
||||
new Among ( "\u0438\u043B\u0430", 0, 2, "", this),
|
||||
new Among ( "\u044B\u043B\u0430", 0, 2, "", this),
|
||||
new Among ( "\u043D\u0430", -1, 1, "", this),
|
||||
new Among ( "\u0435\u043D\u0430", 3, 2, "", this),
|
||||
new Among ( "\u0435\u0442\u0435", -1, 1, "", this),
|
||||
new Among ( "\u0438\u0442\u0435", -1, 2, "", this),
|
||||
new Among ( "\u0439\u0442\u0435", -1, 1, "", this),
|
||||
new Among ( "\u0435\u0439\u0442\u0435", 7, 2, "", this),
|
||||
new Among ( "\u0443\u0439\u0442\u0435", 7, 2, "", this),
|
||||
new Among ( "\u043B\u0438", -1, 1, "", this),
|
||||
new Among ( "\u0438\u043B\u0438", 10, 2, "", this),
|
||||
new Among ( "\u044B\u043B\u0438", 10, 2, "", this),
|
||||
new Among ( "\u0439", -1, 1, "", this),
|
||||
new Among ( "\u0435\u0439", 13, 2, "", this),
|
||||
new Among ( "\u0443\u0439", 13, 2, "", this),
|
||||
new Among ( "\u043B", -1, 1, "", this),
|
||||
new Among ( "\u0438\u043B", 16, 2, "", this),
|
||||
new Among ( "\u044B\u043B", 16, 2, "", this),
|
||||
new Among ( "\u0435\u043C", -1, 1, "", this),
|
||||
new Among ( "\u0438\u043C", -1, 2, "", this),
|
||||
new Among ( "\u044B\u043C", -1, 2, "", this),
|
||||
new Among ( "\u043D", -1, 1, "", this),
|
||||
new Among ( "\u0435\u043D", 22, 2, "", this),
|
||||
new Among ( "\u043B\u043E", -1, 1, "", this),
|
||||
new Among ( "\u0438\u043B\u043E", 24, 2, "", this),
|
||||
new Among ( "\u044B\u043B\u043E", 24, 2, "", this),
|
||||
new Among ( "\u043D\u043E", -1, 1, "", this),
|
||||
new Among ( "\u0435\u043D\u043E", 27, 2, "", this),
|
||||
new Among ( "\u043D\u043D\u043E", 27, 1, "", this),
|
||||
new Among ( "\u0435\u0442", -1, 1, "", this),
|
||||
new Among ( "\u0443\u0435\u0442", 30, 2, "", this),
|
||||
new Among ( "\u0438\u0442", -1, 2, "", this),
|
||||
new Among ( "\u044B\u0442", -1, 2, "", this),
|
||||
new Among ( "\u044E\u0442", -1, 1, "", this),
|
||||
new Among ( "\u0443\u044E\u0442", 34, 2, "", this),
|
||||
new Among ( "\u044F\u0442", -1, 2, "", this),
|
||||
new Among ( "\u043D\u044B", -1, 1, "", this),
|
||||
new Among ( "\u0435\u043D\u044B", 37, 2, "", this),
|
||||
new Among ( "\u0442\u044C", -1, 1, "", this),
|
||||
new Among ( "\u0438\u0442\u044C", 39, 2, "", this),
|
||||
new Among ( "\u044B\u0442\u044C", 39, 2, "", this),
|
||||
new Among ( "\u0435\u0448\u044C", -1, 1, "", this),
|
||||
new Among ( "\u0438\u0448\u044C", -1, 2, "", this),
|
||||
new Among ( "\u044E", -1, 2, "", this),
|
||||
new Among ( "\u0443\u044E", 44, 2, "", this)
|
||||
};
|
||||
|
||||
private Among a_5[] = {
|
||||
new Among ( "\u0430", -1, 1, "", this),
|
||||
new Among ( "\u0435\u0432", -1, 1, "", this),
|
||||
new Among ( "\u043E\u0432", -1, 1, "", this),
|
||||
new Among ( "\u0435", -1, 1, "", this),
|
||||
new Among ( "\u0438\u0435", 3, 1, "", this),
|
||||
new Among ( "\u044C\u0435", 3, 1, "", this),
|
||||
new Among ( "\u0438", -1, 1, "", this),
|
||||
new Among ( "\u0435\u0438", 6, 1, "", this),
|
||||
new Among ( "\u0438\u0438", 6, 1, "", this),
|
||||
new Among ( "\u0430\u043C\u0438", 6, 1, "", this),
|
||||
new Among ( "\u044F\u043C\u0438", 6, 1, "", this),
|
||||
new Among ( "\u0438\u044F\u043C\u0438", 10, 1, "", this),
|
||||
new Among ( "\u0439", -1, 1, "", this),
|
||||
new Among ( "\u0435\u0439", 12, 1, "", this),
|
||||
new Among ( "\u0438\u0435\u0439", 13, 1, "", this),
|
||||
new Among ( "\u0438\u0439", 12, 1, "", this),
|
||||
new Among ( "\u043E\u0439", 12, 1, "", this),
|
||||
new Among ( "\u0430\u043C", -1, 1, "", this),
|
||||
new Among ( "\u0435\u043C", -1, 1, "", this),
|
||||
new Among ( "\u0438\u0435\u043C", 18, 1, "", this),
|
||||
new Among ( "\u043E\u043C", -1, 1, "", this),
|
||||
new Among ( "\u044F\u043C", -1, 1, "", this),
|
||||
new Among ( "\u0438\u044F\u043C", 21, 1, "", this),
|
||||
new Among ( "\u043E", -1, 1, "", this),
|
||||
new Among ( "\u0443", -1, 1, "", this),
|
||||
new Among ( "\u0430\u0445", -1, 1, "", this),
|
||||
new Among ( "\u044F\u0445", -1, 1, "", this),
|
||||
new Among ( "\u0438\u044F\u0445", 26, 1, "", this),
|
||||
new Among ( "\u044B", -1, 1, "", this),
|
||||
new Among ( "\u044C", -1, 1, "", this),
|
||||
new Among ( "\u044E", -1, 1, "", this),
|
||||
new Among ( "\u0438\u044E", 30, 1, "", this),
|
||||
new Among ( "\u044C\u044E", 30, 1, "", this),
|
||||
new Among ( "\u044F", -1, 1, "", this),
|
||||
new Among ( "\u0438\u044F", 33, 1, "", this),
|
||||
new Among ( "\u044C\u044F", 33, 1, "", this)
|
||||
};
|
||||
|
||||
private Among a_6[] = {
|
||||
new Among ( "\u043E\u0441\u0442", -1, 1, "", this),
|
||||
new Among ( "\u043E\u0441\u0442\u044C", -1, 1, "", this)
|
||||
};
|
||||
|
||||
private Among a_7[] = {
|
||||
new Among ( "\u0435\u0439\u0448\u0435", -1, 1, "", this),
|
||||
new Among ( "\u043D", -1, 2, "", this),
|
||||
new Among ( "\u0435\u0439\u0448", -1, 1, "", this),
|
||||
new Among ( "\u044C", -1, 3, "", this)
|
||||
};
|
||||
|
||||
private static final char g_v[] = {33, 65, 8, 232 };
|
||||
|
||||
private int I_p2;
|
||||
private int I_pV;
|
||||
|
||||
private void copy_from(RussianStemmer other) {
|
||||
I_p2 = other.I_p2;
|
||||
I_pV = other.I_pV;
|
||||
super.copy_from(other);
|
||||
}
|
||||
|
||||
private boolean r_mark_regions() {
|
||||
int v_1;
|
||||
// (, line 57
|
||||
I_pV = limit;
|
||||
I_p2 = limit;
|
||||
// do, line 61
|
||||
v_1 = cursor;
|
||||
lab0: do {
|
||||
// (, line 61
|
||||
// gopast, line 62
|
||||
golab1: while(true)
|
||||
{
|
||||
lab2: do {
|
||||
if (!(in_grouping(g_v, 1072, 1103)))
|
||||
{
|
||||
break lab2;
|
||||
}
|
||||
break golab1;
|
||||
} while (false);
|
||||
if (cursor >= limit)
|
||||
{
|
||||
break lab0;
|
||||
}
|
||||
cursor++;
|
||||
}
|
||||
// setmark pV, line 62
|
||||
I_pV = cursor;
|
||||
// gopast, line 62
|
||||
golab3: while(true)
|
||||
{
|
||||
lab4: do {
|
||||
if (!(out_grouping(g_v, 1072, 1103)))
|
||||
{
|
||||
break lab4;
|
||||
}
|
||||
break golab3;
|
||||
} while (false);
|
||||
if (cursor >= limit)
|
||||
{
|
||||
break lab0;
|
||||
}
|
||||
cursor++;
|
||||
}
|
||||
// gopast, line 63
|
||||
golab5: while(true)
|
||||
{
|
||||
lab6: do {
|
||||
if (!(in_grouping(g_v, 1072, 1103)))
|
||||
{
|
||||
break lab6;
|
||||
}
|
||||
break golab5;
|
||||
} while (false);
|
||||
if (cursor >= limit)
|
||||
{
|
||||
break lab0;
|
||||
}
|
||||
cursor++;
|
||||
}
|
||||
// gopast, line 63
|
||||
golab7: while(true)
|
||||
{
|
||||
lab8: do {
|
||||
if (!(out_grouping(g_v, 1072, 1103)))
|
||||
{
|
||||
break lab8;
|
||||
}
|
||||
break golab7;
|
||||
} while (false);
|
||||
if (cursor >= limit)
|
||||
{
|
||||
break lab0;
|
||||
}
|
||||
cursor++;
|
||||
}
|
||||
// setmark p2, line 63
|
||||
I_p2 = cursor;
|
||||
} while (false);
|
||||
cursor = v_1;
|
||||
return true;
|
||||
}
|
||||
|
||||
private boolean r_R2() {
|
||||
if (!(I_p2 <= cursor))
|
||||
{
|
||||
return false;
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
private boolean r_perfective_gerund() {
|
||||
int among_var;
|
||||
int v_1;
|
||||
// (, line 71
|
||||
// [, line 72
|
||||
ket = cursor;
|
||||
// substring, line 72
|
||||
among_var = find_among_b(a_0, 9);
|
||||
if (among_var == 0)
|
||||
{
|
||||
return false;
|
||||
}
|
||||
// ], line 72
|
||||
bra = cursor;
|
||||
switch(among_var) {
|
||||
case 0:
|
||||
return false;
|
||||
case 1:
|
||||
// (, line 76
|
||||
// or, line 76
|
||||
lab0: do {
|
||||
v_1 = limit - cursor;
|
||||
lab1: do {
|
||||
// literal, line 76
|
||||
if (!(eq_s_b(1, "\u0430")))
|
||||
{
|
||||
break lab1;
|
||||
}
|
||||
break lab0;
|
||||
} while (false);
|
||||
cursor = limit - v_1;
|
||||
// literal, line 76
|
||||
if (!(eq_s_b(1, "\u044F")))
|
||||
{
|
||||
return false;
|
||||
}
|
||||
} while (false);
|
||||
// delete, line 76
|
||||
slice_del();
|
||||
break;
|
||||
case 2:
|
||||
// (, line 83
|
||||
// delete, line 83
|
||||
slice_del();
|
||||
break;
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
private boolean r_adjective() {
|
||||
int among_var;
|
||||
// (, line 87
|
||||
// [, line 88
|
||||
ket = cursor;
|
||||
// substring, line 88
|
||||
among_var = find_among_b(a_1, 26);
|
||||
if (among_var == 0)
|
||||
{
|
||||
return false;
|
||||
}
|
||||
// ], line 88
|
||||
bra = cursor;
|
||||
switch(among_var) {
|
||||
case 0:
|
||||
return false;
|
||||
case 1:
|
||||
// (, line 97
|
||||
// delete, line 97
|
||||
slice_del();
|
||||
break;
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
private boolean r_adjectival() {
|
||||
int among_var;
|
||||
int v_1;
|
||||
int v_2;
|
||||
// (, line 101
|
||||
// call adjective, line 102
|
||||
if (!r_adjective())
|
||||
{
|
||||
return false;
|
||||
}
|
||||
// try, line 109
|
||||
v_1 = limit - cursor;
|
||||
lab0: do {
|
||||
// (, line 109
|
||||
// [, line 110
|
||||
ket = cursor;
|
||||
// substring, line 110
|
||||
among_var = find_among_b(a_2, 8);
|
||||
if (among_var == 0)
|
||||
{
|
||||
cursor = limit - v_1;
|
||||
break lab0;
|
||||
}
|
||||
// ], line 110
|
||||
bra = cursor;
|
||||
switch(among_var) {
|
||||
case 0:
|
||||
cursor = limit - v_1;
|
||||
break lab0;
|
||||
case 1:
|
||||
// (, line 115
|
||||
// or, line 115
|
||||
lab1: do {
|
||||
v_2 = limit - cursor;
|
||||
lab2: do {
|
||||
// literal, line 115
|
||||
if (!(eq_s_b(1, "\u0430")))
|
||||
{
|
||||
break lab2;
|
||||
}
|
||||
break lab1;
|
||||
} while (false);
|
||||
cursor = limit - v_2;
|
||||
// literal, line 115
|
||||
if (!(eq_s_b(1, "\u044F")))
|
||||
{
|
||||
cursor = limit - v_1;
|
||||
break lab0;
|
||||
}
|
||||
} while (false);
|
||||
// delete, line 115
|
||||
slice_del();
|
||||
break;
|
||||
case 2:
|
||||
// (, line 122
|
||||
// delete, line 122
|
||||
slice_del();
|
||||
break;
|
||||
}
|
||||
} while (false);
|
||||
return true;
|
||||
}
|
||||
|
||||
private boolean r_reflexive() {
|
||||
int among_var;
|
||||
// (, line 128
|
||||
// [, line 129
|
||||
ket = cursor;
|
||||
// substring, line 129
|
||||
among_var = find_among_b(a_3, 2);
|
||||
if (among_var == 0)
|
||||
{
|
||||
return false;
|
||||
}
|
||||
// ], line 129
|
||||
bra = cursor;
|
||||
switch(among_var) {
|
||||
case 0:
|
||||
return false;
|
||||
case 1:
|
||||
// (, line 132
|
||||
// delete, line 132
|
||||
slice_del();
|
||||
break;
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
private boolean r_verb() {
|
||||
int among_var;
|
||||
int v_1;
|
||||
// (, line 136
|
||||
// [, line 137
|
||||
ket = cursor;
|
||||
// substring, line 137
|
||||
among_var = find_among_b(a_4, 46);
|
||||
if (among_var == 0)
|
||||
{
|
||||
return false;
|
||||
}
|
||||
// ], line 137
|
||||
bra = cursor;
|
||||
switch(among_var) {
|
||||
case 0:
|
||||
return false;
|
||||
case 1:
|
||||
// (, line 143
|
||||
// or, line 143
|
||||
lab0: do {
|
||||
v_1 = limit - cursor;
|
||||
lab1: do {
|
||||
// literal, line 143
|
||||
if (!(eq_s_b(1, "\u0430")))
|
||||
{
|
||||
break lab1;
|
||||
}
|
||||
break lab0;
|
||||
} while (false);
|
||||
cursor = limit - v_1;
|
||||
// literal, line 143
|
||||
if (!(eq_s_b(1, "\u044F")))
|
||||
{
|
||||
return false;
|
||||
}
|
||||
} while (false);
|
||||
// delete, line 143
|
||||
slice_del();
|
||||
break;
|
||||
case 2:
|
||||
// (, line 151
|
||||
// delete, line 151
|
||||
slice_del();
|
||||
break;
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
private boolean r_noun() {
|
||||
int among_var;
|
||||
// (, line 159
|
||||
// [, line 160
|
||||
ket = cursor;
|
||||
// substring, line 160
|
||||
among_var = find_among_b(a_5, 36);
|
||||
if (among_var == 0)
|
||||
{
|
||||
return false;
|
||||
}
|
||||
// ], line 160
|
||||
bra = cursor;
|
||||
switch(among_var) {
|
||||
case 0:
|
||||
return false;
|
||||
case 1:
|
||||
// (, line 167
|
||||
// delete, line 167
|
||||
slice_del();
|
||||
break;
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
private boolean r_derivational() {
|
||||
int among_var;
|
||||
// (, line 175
|
||||
// [, line 176
|
||||
ket = cursor;
|
||||
// substring, line 176
|
||||
among_var = find_among_b(a_6, 2);
|
||||
if (among_var == 0)
|
||||
{
|
||||
return false;
|
||||
}
|
||||
// ], line 176
|
||||
bra = cursor;
|
||||
// call R2, line 176
|
||||
if (!r_R2())
|
||||
{
|
||||
return false;
|
||||
}
|
||||
switch(among_var) {
|
||||
case 0:
|
||||
return false;
|
||||
case 1:
|
||||
// (, line 179
|
||||
// delete, line 179
|
||||
slice_del();
|
||||
break;
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
private boolean r_tidy_up() {
|
||||
int among_var;
|
||||
// (, line 183
|
||||
// [, line 184
|
||||
ket = cursor;
|
||||
// substring, line 184
|
||||
among_var = find_among_b(a_7, 4);
|
||||
if (among_var == 0)
|
||||
{
|
||||
return false;
|
||||
}
|
||||
// ], line 184
|
||||
bra = cursor;
|
||||
switch(among_var) {
|
||||
case 0:
|
||||
return false;
|
||||
case 1:
|
||||
// (, line 188
|
||||
// delete, line 188
|
||||
slice_del();
|
||||
// [, line 189
|
||||
ket = cursor;
|
||||
// literal, line 189
|
||||
if (!(eq_s_b(1, "\u043D")))
|
||||
{
|
||||
return false;
|
||||
}
|
||||
// ], line 189
|
||||
bra = cursor;
|
||||
// literal, line 189
|
||||
if (!(eq_s_b(1, "\u043D")))
|
||||
{
|
||||
return false;
|
||||
}
|
||||
// delete, line 189
|
||||
slice_del();
|
||||
break;
|
||||
case 2:
|
||||
// (, line 192
|
||||
// literal, line 192
|
||||
if (!(eq_s_b(1, "\u043D")))
|
||||
{
|
||||
return false;
|
||||
}
|
||||
// delete, line 192
|
||||
slice_del();
|
||||
break;
|
||||
case 3:
|
||||
// (, line 194
|
||||
// delete, line 194
|
||||
slice_del();
|
||||
break;
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
public boolean stem() {
|
||||
int v_1;
|
||||
int v_2;
|
||||
int v_3;
|
||||
int v_4;
|
||||
int v_5;
|
||||
int v_6;
|
||||
int v_7;
|
||||
int v_8;
|
||||
int v_9;
|
||||
int v_10;
|
||||
// (, line 199
|
||||
// do, line 201
|
||||
v_1 = cursor;
|
||||
lab0: do {
|
||||
// call mark_regions, line 201
|
||||
if (!r_mark_regions())
|
||||
{
|
||||
break lab0;
|
||||
}
|
||||
} while (false);
|
||||
cursor = v_1;
|
||||
// backwards, line 202
|
||||
limit_backward = cursor; cursor = limit;
|
||||
// setlimit, line 202
|
||||
v_2 = limit - cursor;
|
||||
// tomark, line 202
|
||||
if (cursor < I_pV)
|
||||
{
|
||||
return false;
|
||||
}
|
||||
cursor = I_pV;
|
||||
v_3 = limit_backward;
|
||||
limit_backward = cursor;
|
||||
cursor = limit - v_2;
|
||||
// (, line 202
|
||||
// do, line 203
|
||||
v_4 = limit - cursor;
|
||||
lab1: do {
|
||||
// (, line 203
|
||||
// or, line 204
|
||||
lab2: do {
|
||||
v_5 = limit - cursor;
|
||||
lab3: do {
|
||||
// call perfective_gerund, line 204
|
||||
if (!r_perfective_gerund())
|
||||
{
|
||||
break lab3;
|
||||
}
|
||||
break lab2;
|
||||
} while (false);
|
||||
cursor = limit - v_5;
|
||||
// (, line 205
|
||||
// try, line 205
|
||||
v_6 = limit - cursor;
|
||||
lab4: do {
|
||||
// call reflexive, line 205
|
||||
if (!r_reflexive())
|
||||
{
|
||||
cursor = limit - v_6;
|
||||
break lab4;
|
||||
}
|
||||
} while (false);
|
||||
// or, line 206
|
||||
lab5: do {
|
||||
v_7 = limit - cursor;
|
||||
lab6: do {
|
||||
// call adjectival, line 206
|
||||
if (!r_adjectival())
|
||||
{
|
||||
break lab6;
|
||||
}
|
||||
break lab5;
|
||||
} while (false);
|
||||
cursor = limit - v_7;
|
||||
lab7: do {
|
||||
// call verb, line 206
|
||||
if (!r_verb())
|
||||
{
|
||||
break lab7;
|
||||
}
|
||||
break lab5;
|
||||
} while (false);
|
||||
cursor = limit - v_7;
|
||||
// call noun, line 206
|
||||
if (!r_noun())
|
||||
{
|
||||
break lab1;
|
||||
}
|
||||
} while (false);
|
||||
} while (false);
|
||||
} while (false);
|
||||
cursor = limit - v_4;
|
||||
// try, line 209
|
||||
v_8 = limit - cursor;
|
||||
lab8: do {
|
||||
// (, line 209
|
||||
// [, line 209
|
||||
ket = cursor;
|
||||
// literal, line 209
|
||||
if (!(eq_s_b(1, "\u0438")))
|
||||
{
|
||||
cursor = limit - v_8;
|
||||
break lab8;
|
||||
}
|
||||
// ], line 209
|
||||
bra = cursor;
|
||||
// delete, line 209
|
||||
slice_del();
|
||||
} while (false);
|
||||
// do, line 212
|
||||
v_9 = limit - cursor;
|
||||
lab9: do {
|
||||
// call derivational, line 212
|
||||
if (!r_derivational())
|
||||
{
|
||||
break lab9;
|
||||
}
|
||||
} while (false);
|
||||
cursor = limit - v_9;
|
||||
// do, line 213
|
||||
v_10 = limit - cursor;
|
||||
lab10: do {
|
||||
// call tidy_up, line 213
|
||||
if (!r_tidy_up())
|
||||
{
|
||||
break lab10;
|
||||
}
|
||||
} while (false);
|
||||
cursor = limit - v_10;
|
||||
limit_backward = v_3;
|
||||
cursor = limit_backward; return true;
|
||||
}
|
||||
|
||||
}
|
||||
|
File diff suppressed because it is too large
Load Diff
|
@ -0,0 +1,349 @@
|
|||
// This file was generated automatically by the Snowball to Java compiler
|
||||
|
||||
package org.tartarus.snowball.ext;
|
||||
import org.tartarus.snowball.SnowballProgram;
|
||||
import org.tartarus.snowball.Among;
|
||||
|
||||
/**
|
||||
* Generated class implementing code defined by a snowball script.
|
||||
*/
|
||||
public class SwedishStemmer extends SnowballProgram {
|
||||
|
||||
private Among a_0[] = {
|
||||
new Among ( "a", -1, 1, "", this),
|
||||
new Among ( "arna", 0, 1, "", this),
|
||||
new Among ( "erna", 0, 1, "", this),
|
||||
new Among ( "heterna", 2, 1, "", this),
|
||||
new Among ( "orna", 0, 1, "", this),
|
||||
new Among ( "ad", -1, 1, "", this),
|
||||
new Among ( "e", -1, 1, "", this),
|
||||
new Among ( "ade", 6, 1, "", this),
|
||||
new Among ( "ande", 6, 1, "", this),
|
||||
new Among ( "arne", 6, 1, "", this),
|
||||
new Among ( "are", 6, 1, "", this),
|
||||
new Among ( "aste", 6, 1, "", this),
|
||||
new Among ( "en", -1, 1, "", this),
|
||||
new Among ( "anden", 12, 1, "", this),
|
||||
new Among ( "aren", 12, 1, "", this),
|
||||
new Among ( "heten", 12, 1, "", this),
|
||||
new Among ( "ern", -1, 1, "", this),
|
||||
new Among ( "ar", -1, 1, "", this),
|
||||
new Among ( "er", -1, 1, "", this),
|
||||
new Among ( "heter", 18, 1, "", this),
|
||||
new Among ( "or", -1, 1, "", this),
|
||||
new Among ( "s", -1, 2, "", this),
|
||||
new Among ( "as", 21, 1, "", this),
|
||||
new Among ( "arnas", 22, 1, "", this),
|
||||
new Among ( "ernas", 22, 1, "", this),
|
||||
new Among ( "ornas", 22, 1, "", this),
|
||||
new Among ( "es", 21, 1, "", this),
|
||||
new Among ( "ades", 26, 1, "", this),
|
||||
new Among ( "andes", 26, 1, "", this),
|
||||
new Among ( "ens", 21, 1, "", this),
|
||||
new Among ( "arens", 29, 1, "", this),
|
||||
new Among ( "hetens", 29, 1, "", this),
|
||||
new Among ( "erns", 21, 1, "", this),
|
||||
new Among ( "at", -1, 1, "", this),
|
||||
new Among ( "andet", -1, 1, "", this),
|
||||
new Among ( "het", -1, 1, "", this),
|
||||
new Among ( "ast", -1, 1, "", this)
|
||||
};
|
||||
|
||||
private Among a_1[] = {
|
||||
new Among ( "dd", -1, -1, "", this),
|
||||
new Among ( "gd", -1, -1, "", this),
|
||||
new Among ( "nn", -1, -1, "", this),
|
||||
new Among ( "dt", -1, -1, "", this),
|
||||
new Among ( "gt", -1, -1, "", this),
|
||||
new Among ( "kt", -1, -1, "", this),
|
||||
new Among ( "tt", -1, -1, "", this)
|
||||
};
|
||||
|
||||
private Among a_2[] = {
|
||||
new Among ( "ig", -1, 1, "", this),
|
||||
new Among ( "lig", 0, 1, "", this),
|
||||
new Among ( "els", -1, 1, "", this),
|
||||
new Among ( "fullt", -1, 3, "", this),
|
||||
new Among ( "l\u00F6st", -1, 2, "", this)
|
||||
};
|
||||
|
||||
private static final char g_v[] = {17, 65, 16, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 24, 0, 32 };
|
||||
|
||||
private static final char g_s_ending[] = {119, 127, 149 };
|
||||
|
||||
private int I_x;
|
||||
private int I_p1;
|
||||
|
||||
private void copy_from(SwedishStemmer other) {
|
||||
I_x = other.I_x;
|
||||
I_p1 = other.I_p1;
|
||||
super.copy_from(other);
|
||||
}
|
||||
|
||||
private boolean r_mark_regions() {
|
||||
int v_1;
|
||||
int v_2;
|
||||
// (, line 26
|
||||
I_p1 = limit;
|
||||
// test, line 29
|
||||
v_1 = cursor;
|
||||
// (, line 29
|
||||
// hop, line 29
|
||||
{
|
||||
int c = cursor + 3;
|
||||
if (0 > c || c > limit)
|
||||
{
|
||||
return false;
|
||||
}
|
||||
cursor = c;
|
||||
}
|
||||
// setmark x, line 29
|
||||
I_x = cursor;
|
||||
cursor = v_1;
|
||||
// goto, line 30
|
||||
golab0: while(true)
|
||||
{
|
||||
v_2 = cursor;
|
||||
lab1: do {
|
||||
if (!(in_grouping(g_v, 97, 246)))
|
||||
{
|
||||
break lab1;
|
||||
}
|
||||
cursor = v_2;
|
||||
break golab0;
|
||||
} while (false);
|
||||
cursor = v_2;
|
||||
if (cursor >= limit)
|
||||
{
|
||||
return false;
|
||||
}
|
||||
cursor++;
|
||||
}
|
||||
// gopast, line 30
|
||||
golab2: while(true)
|
||||
{
|
||||
lab3: do {
|
||||
if (!(out_grouping(g_v, 97, 246)))
|
||||
{
|
||||
break lab3;
|
||||
}
|
||||
break golab2;
|
||||
} while (false);
|
||||
if (cursor >= limit)
|
||||
{
|
||||
return false;
|
||||
}
|
||||
cursor++;
|
||||
}
|
||||
// setmark p1, line 30
|
||||
I_p1 = cursor;
|
||||
// try, line 31
|
||||
lab4: do {
|
||||
// (, line 31
|
||||
if (!(I_p1 < I_x))
|
||||
{
|
||||
break lab4;
|
||||
}
|
||||
I_p1 = I_x;
|
||||
} while (false);
|
||||
return true;
|
||||
}
|
||||
|
||||
private boolean r_main_suffix() {
|
||||
int among_var;
|
||||
int v_1;
|
||||
int v_2;
|
||||
// (, line 36
|
||||
// setlimit, line 37
|
||||
v_1 = limit - cursor;
|
||||
// tomark, line 37
|
||||
if (cursor < I_p1)
|
||||
{
|
||||
return false;
|
||||
}
|
||||
cursor = I_p1;
|
||||
v_2 = limit_backward;
|
||||
limit_backward = cursor;
|
||||
cursor = limit - v_1;
|
||||
// (, line 37
|
||||
// [, line 37
|
||||
ket = cursor;
|
||||
// substring, line 37
|
||||
among_var = find_among_b(a_0, 37);
|
||||
if (among_var == 0)
|
||||
{
|
||||
limit_backward = v_2;
|
||||
return false;
|
||||
}
|
||||
// ], line 37
|
||||
bra = cursor;
|
||||
limit_backward = v_2;
|
||||
switch(among_var) {
|
||||
case 0:
|
||||
return false;
|
||||
case 1:
|
||||
// (, line 44
|
||||
// delete, line 44
|
||||
slice_del();
|
||||
break;
|
||||
case 2:
|
||||
// (, line 46
|
||||
if (!(in_grouping_b(g_s_ending, 98, 121)))
|
||||
{
|
||||
return false;
|
||||
}
|
||||
// delete, line 46
|
||||
slice_del();
|
||||
break;
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
private boolean r_consonant_pair() {
|
||||
int v_1;
|
||||
int v_2;
|
||||
int v_3;
|
||||
// setlimit, line 50
|
||||
v_1 = limit - cursor;
|
||||
// tomark, line 50
|
||||
if (cursor < I_p1)
|
||||
{
|
||||
return false;
|
||||
}
|
||||
cursor = I_p1;
|
||||
v_2 = limit_backward;
|
||||
limit_backward = cursor;
|
||||
cursor = limit - v_1;
|
||||
// (, line 50
|
||||
// and, line 52
|
||||
v_3 = limit - cursor;
|
||||
// among, line 51
|
||||
if (find_among_b(a_1, 7) == 0)
|
||||
{
|
||||
limit_backward = v_2;
|
||||
return false;
|
||||
}
|
||||
cursor = limit - v_3;
|
||||
// (, line 52
|
||||
// [, line 52
|
||||
ket = cursor;
|
||||
// next, line 52
|
||||
if (cursor <= limit_backward)
|
||||
{
|
||||
limit_backward = v_2;
|
||||
return false;
|
||||
}
|
||||
cursor--;
|
||||
// ], line 52
|
||||
bra = cursor;
|
||||
// delete, line 52
|
||||
slice_del();
|
||||
limit_backward = v_2;
|
||||
return true;
|
||||
}
|
||||
|
||||
private boolean r_other_suffix() {
|
||||
int among_var;
|
||||
int v_1;
|
||||
int v_2;
|
||||
// setlimit, line 55
|
||||
v_1 = limit - cursor;
|
||||
// tomark, line 55
|
||||
if (cursor < I_p1)
|
||||
{
|
||||
return false;
|
||||
}
|
||||
cursor = I_p1;
|
||||
v_2 = limit_backward;
|
||||
limit_backward = cursor;
|
||||
cursor = limit - v_1;
|
||||
// (, line 55
|
||||
// [, line 56
|
||||
ket = cursor;
|
||||
// substring, line 56
|
||||
among_var = find_among_b(a_2, 5);
|
||||
if (among_var == 0)
|
||||
{
|
||||
limit_backward = v_2;
|
||||
return false;
|
||||
}
|
||||
// ], line 56
|
||||
bra = cursor;
|
||||
switch(among_var) {
|
||||
case 0:
|
||||
limit_backward = v_2;
|
||||
return false;
|
||||
case 1:
|
||||
// (, line 57
|
||||
// delete, line 57
|
||||
slice_del();
|
||||
break;
|
||||
case 2:
|
||||
// (, line 58
|
||||
// <-, line 58
|
||||
slice_from("l\u00F6s");
|
||||
break;
|
||||
case 3:
|
||||
// (, line 59
|
||||
// <-, line 59
|
||||
slice_from("full");
|
||||
break;
|
||||
}
|
||||
limit_backward = v_2;
|
||||
return true;
|
||||
}
|
||||
|
||||
public boolean stem() {
|
||||
int v_1;
|
||||
int v_2;
|
||||
int v_3;
|
||||
int v_4;
|
||||
// (, line 64
|
||||
// do, line 66
|
||||
v_1 = cursor;
|
||||
lab0: do {
|
||||
// call mark_regions, line 66
|
||||
if (!r_mark_regions())
|
||||
{
|
||||
break lab0;
|
||||
}
|
||||
} while (false);
|
||||
cursor = v_1;
|
||||
// backwards, line 67
|
||||
limit_backward = cursor; cursor = limit;
|
||||
// (, line 67
|
||||
// do, line 68
|
||||
v_2 = limit - cursor;
|
||||
lab1: do {
|
||||
// call main_suffix, line 68
|
||||
if (!r_main_suffix())
|
||||
{
|
||||
break lab1;
|
||||
}
|
||||
} while (false);
|
||||
cursor = limit - v_2;
|
||||
// do, line 69
|
||||
v_3 = limit - cursor;
|
||||
lab2: do {
|
||||
// call consonant_pair, line 69
|
||||
if (!r_consonant_pair())
|
||||
{
|
||||
break lab2;
|
||||
}
|
||||
} while (false);
|
||||
cursor = limit - v_3;
|
||||
// do, line 70
|
||||
v_4 = limit - cursor;
|
||||
lab3: do {
|
||||
// call other_suffix, line 70
|
||||
if (!r_other_suffix())
|
||||
{
|
||||
break lab3;
|
||||
}
|
||||
} while (false);
|
||||
cursor = limit - v_4;
|
||||
cursor = limit_backward; return true;
|
||||
}
|
||||
|
||||
}
|
||||
|
File diff suppressed because it is too large
Load Diff
|
@ -0,0 +1,53 @@
|
|||
<!--
|
||||
Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
contributor license agreements. See the NOTICE file distributed with
|
||||
this work for additional information regarding copyright ownership.
|
||||
The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
(the "License"); you may not use this file except in compliance with
|
||||
the License. You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software
|
||||
distributed under the License is distributed on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
See the License for the specific language governing permissions and
|
||||
limitations under the License.
|
||||
-->
|
||||
<html>
|
||||
<body>
|
||||
<p>
|
||||
Lucene Snowball README file
|
||||
</p>
|
||||
|
||||
<p>
|
||||
This project provides a pre-compiled version of the Snowball stemmers
|
||||
based on revision 500 of the Tartarus Snowball repository,
|
||||
together with classes integrating them with the Lucene search engine.
|
||||
</p>
|
||||
<p>
|
||||
A few changes have been made to the static Snowball code and compiled stemmers:
|
||||
</p>
|
||||
<ul>
|
||||
<li>Class SnowballProgram is made abstract and contains new abstract method stem() to avoid reflection in Lucene filter class SnowballFilter.</li>
|
||||
<li>All use of StringBuffers has been refactored to StringBuilder for speed.</li>
|
||||
<li>Snowball BSD license header has been added to the Java classes to avoid having RAT adding ASL headers.</li>
|
||||
</ul>
|
||||
<p>
|
||||
See the Snowball <a href ="http://snowball.tartarus.org/">home page</a> for more information about the algorithms.
|
||||
</p>
|
||||
|
||||
<p>
|
||||
<b>IMPORTANT NOTICE ON BACKWARDS COMPATIBILITY!</b>
|
||||
</p>
|
||||
<p>
|
||||
An index created using the Snowball module in Lucene 2.3.2 and below
|
||||
might not be compatible with the Snowball module in Lucene 2.4 or greater.
|
||||
</p>
|
||||
<p>
|
||||
For more information about this issue see:
|
||||
https://issues.apache.org/jira/browse/LUCENE-1142
|
||||
</p>
|
||||
|
||||
</body>
|
||||
</html>
|
|
@ -0,0 +1,108 @@
|
|||
| From svn.tartarus.org/snowball/trunk/website/algorithms/danish/stop.txt
|
||||
| This file is distributed under the BSD License.
|
||||
| See http://snowball.tartarus.org/license.php
|
||||
| Also see http://www.opensource.org/licenses/bsd-license.html
|
||||
| - Encoding was converted to UTF-8.
|
||||
| - This notice was added.
|
||||
|
||||
| A Danish stop word list. Comments begin with vertical bar. Each stop
|
||||
| word is at the start of a line.
|
||||
|
||||
| This is a ranked list (commonest to rarest) of stopwords derived from
|
||||
| a large text sample.
|
||||
|
||||
|
||||
og | and
|
||||
i | in
|
||||
jeg | I
|
||||
det | that (dem. pronoun)/it (pers. pronoun)
|
||||
at | that (in front of a sentence)/to (with infinitive)
|
||||
en | a/an
|
||||
den | it (pers. pronoun)/that (dem. pronoun)
|
||||
til | to/at/for/until/against/by/of/into, more
|
||||
er | present tense of "to be"
|
||||
som | who, as
|
||||
på | on/upon/in/on/at/to/after/of/with/for, on
|
||||
de | they
|
||||
med | with/by/in, along
|
||||
han | he
|
||||
af | of/by/from/off/for/in/with/on, off
|
||||
for | at/for/to/from/by/of/ago, in front/before, because
|
||||
ikke | not
|
||||
der | who/which, there/those
|
||||
var | past tense of "to be"
|
||||
mig | me/myself
|
||||
sig | oneself/himself/herself/itself/themselves
|
||||
men | but
|
||||
et | a/an/one, one (number), someone/somebody/one
|
||||
har | present tense of "to have"
|
||||
om | round/about/for/in/a, about/around/down, if
|
||||
vi | we
|
||||
min | my
|
||||
havde | past tense of "to have"
|
||||
ham | him
|
||||
hun | she
|
||||
nu | now
|
||||
over | over/above/across/by/beyond/past/on/about, over/past
|
||||
da | then, when/as/since
|
||||
fra | from/off/since, off, since
|
||||
du | you
|
||||
ud | out
|
||||
sin | his/her/its/one's
|
||||
dem | them
|
||||
os | us/ourselves
|
||||
op | up
|
||||
man | you/one
|
||||
hans | his
|
||||
hvor | where
|
||||
eller | or
|
||||
hvad | what
|
||||
skal | must/shall etc.
|
||||
selv | myself/yourself/herself/ourselves etc., even
|
||||
her | here
|
||||
alle | all/everyone/everybody etc.
|
||||
vil | will (verb)
|
||||
blev | past tense of "to stay/to remain/to get/to become"
|
||||
kunne | could
|
||||
ind | in
|
||||
når | when
|
||||
være | present tense of "to be"
|
||||
dog | however/yet/after all
|
||||
noget | something
|
||||
ville | would
|
||||
jo | you know/you see (adv), yes
|
||||
deres | their/theirs
|
||||
efter | after/behind/according to/for/by/from, later/afterwards
|
||||
ned | down
|
||||
skulle | should
|
||||
denne | this
|
||||
end | than
|
||||
dette | this
|
||||
mit | my/mine
|
||||
også | also
|
||||
under | under/beneath/below/during, below/underneath
|
||||
have | have
|
||||
dig | you
|
||||
anden | other
|
||||
hende | her
|
||||
mine | my
|
||||
alt | everything
|
||||
meget | much/very, plenty of
|
||||
sit | his, her, its, one's
|
||||
sine | his, her, its, one's
|
||||
vor | our
|
||||
mod | against
|
||||
disse | these
|
||||
hvis | if
|
||||
din | your/yours
|
||||
nogle | some
|
||||
hos | by/at
|
||||
blive | be/become
|
||||
mange | many
|
||||
ad | by/through
|
||||
bliver | present tense of "to be/to become"
|
||||
hendes | her/hers
|
||||
været | be
|
||||
thi | for (conj)
|
||||
jer | you
|
||||
sådan | such, like this/like that
|
|
@ -0,0 +1,117 @@
|
|||
| From svn.tartarus.org/snowball/trunk/website/algorithms/dutch/stop.txt
|
||||
| This file is distributed under the BSD License.
|
||||
| See http://snowball.tartarus.org/license.php
|
||||
| Also see http://www.opensource.org/licenses/bsd-license.html
|
||||
| - Encoding was converted to UTF-8.
|
||||
| - This notice was added.
|
||||
|
||||
| A Dutch stop word list. Comments begin with vertical bar. Each stop
|
||||
| word is at the start of a line.
|
||||
|
||||
| This is a ranked list (commonest to rarest) of stopwords derived from
|
||||
| a large sample of Dutch text.
|
||||
|
||||
| Dutch stop words frequently exhibit homonym clashes. These are indicated
|
||||
| clearly below.
|
||||
|
||||
de | the
|
||||
en | and
|
||||
van | of, from
|
||||
ik | I, the ego
|
||||
te | (1) chez, at etc, (2) to, (3) too
|
||||
dat | that, which
|
||||
die | that, those, who, which
|
||||
in | in, inside
|
||||
een | a, an, one
|
||||
hij | he
|
||||
het | the, it
|
||||
niet | not, nothing, naught
|
||||
zijn | (1) to be, being, (2) his, one's, its
|
||||
is | is
|
||||
was | (1) was, past tense of all persons sing. of 'zijn' (to be) (2) wax, (3) the washing, (4) rise of river
|
||||
op | on, upon, at, in, up, used up
|
||||
aan | on, upon, to (as dative)
|
||||
met | with, by
|
||||
als | like, such as, when
|
||||
voor | (1) before, in front of, (2) furrow
|
||||
had | had, past tense all persons sing. of 'hebben' (have)
|
||||
er | there
|
||||
maar | but, only
|
||||
om | round, about, for etc
|
||||
hem | him
|
||||
dan | then
|
||||
zou | should/would, past tense all persons sing. of 'zullen'
|
||||
of | or, whether, if
|
||||
wat | what, something, anything
|
||||
mijn | possessive and noun 'mine'
|
||||
men | people, 'one'
|
||||
dit | this
|
||||
zo | so, thus, in this way
|
||||
door | through by
|
||||
over | over, across
|
||||
ze | she, her, they, them
|
||||
zich | oneself
|
||||
bij | (1) a bee, (2) by, near, at
|
||||
ook | also, too
|
||||
tot | till, until
|
||||
je | you
|
||||
mij | me
|
||||
uit | out of, from
|
||||
der | Old Dutch form of 'van der' still found in surnames
|
||||
daar | (1) there, (2) because
|
||||
haar | (1) her, their, them, (2) hair
|
||||
naar | (1) unpleasant, unwell etc, (2) towards, (3) as
|
||||
heb | present first person sing. of 'to have'
|
||||
hoe | how, why
|
||||
heeft | present third person sing. of 'to have'
|
||||
hebben | 'to have' and various parts thereof
|
||||
deze | this
|
||||
u | you
|
||||
want | (1) for, (2) mitten, (3) rigging
|
||||
nog | yet, still
|
||||
zal | 'shall', first and third person sing. of verb 'zullen' (will)
|
||||
me | me
|
||||
zij | she, they
|
||||
nu | now
|
||||
ge | 'thou', still used in Belgium and south Netherlands
|
||||
geen | none
|
||||
omdat | because
|
||||
iets | something, somewhat
|
||||
worden | to become, grow, get
|
||||
toch | yet, still
|
||||
al | all, every, each
|
||||
waren | (1) 'were', (2) to wander, (3) wares
|
||||
veel | much, many
|
||||
meer | (1) more, (2) lake
|
||||
doen | to do, to make
|
||||
toen | then, when
|
||||
moet | noun 'spot/mote' and present form of 'to must'
|
||||
ben | (1) am, (2) 'are' in interrogative second person singular of 'to be'
|
||||
zonder | without
|
||||
kan | noun 'can' and present form of 'to be able'
|
||||
hun | their, them
|
||||
dus | so, consequently
|
||||
alles | all, everything, anything
|
||||
onder | under, beneath
|
||||
ja | yes, of course
|
||||
eens | once, one day
|
||||
hier | here
|
||||
wie | who
|
||||
werd | imperfect third person sing. of 'become'
|
||||
altijd | always
|
||||
doch | yet, but etc
|
||||
wordt | present third person sing. of 'become'
|
||||
wezen | (1) to be, (2) 'been' as in 'been fishing', (3) orphans
|
||||
kunnen | to be able
|
||||
ons | us/our
|
||||
zelf | self
|
||||
tegen | against, towards, at
|
||||
na | after, near
|
||||
reeds | already
|
||||
wil | (1) present tense of 'want', (2) 'will', noun, (3) fender
|
||||
kon | could; past tense of 'to be able'
|
||||
niets | nothing
|
||||
uw | your
|
||||
iemand | somebody
|
||||
geweest | been; past participle of 'be'
|
||||
andere | other
|
|
@ -0,0 +1,317 @@
|
|||
| From svn.tartarus.org/snowball/trunk/website/algorithms/english/stop.txt
|
||||
| This file is distributed under the BSD License.
|
||||
| See http://snowball.tartarus.org/license.php
|
||||
| Also see http://www.opensource.org/licenses/bsd-license.html
|
||||
| - Encoding was converted to UTF-8.
|
||||
| - This notice was added.
|
||||
|
||||
| An English stop word list. Comments begin with vertical bar. Each stop
|
||||
| word is at the start of a line.
|
||||
|
||||
| Many of the forms below are quite rare (e.g. "yourselves") but included for
|
||||
| completeness.
|
||||
|
||||
| PRONOUNS FORMS
|
||||
| 1st person sing
|
||||
|
||||
i | subject, always in upper case of course
|
||||
|
||||
me | object
|
||||
my | possessive adjective
|
||||
| the possessive pronoun `mine' is best suppressed, because of the
|
||||
| sense of coal-mine etc.
|
||||
myself | reflexive
|
||||
| 1st person plural
|
||||
we | subject
|
||||
|
||||
| us | object
|
||||
| care is required here because US = United States. It is usually
|
||||
| safe to remove it if it is in lower case.
|
||||
our | possessive adjective
|
||||
ours | possessive pronoun
|
||||
ourselves | reflexive
|
||||
| second person (archaic `thou' forms not included)
|
||||
you | subject and object
|
||||
your | possessive adjective
|
||||
yours | possessive pronoun
|
||||
yourself | reflexive (singular)
|
||||
yourselves | reflexive (plural)
|
||||
| third person singular
|
||||
he | subject
|
||||
him | object
|
||||
his | possessive adjective and pronoun
|
||||
himself | reflexive
|
||||
|
||||
she | subject
|
||||
her | object and possessive adjective
|
||||
hers | possessive pronoun
|
||||
herself | reflexive
|
||||
|
||||
it | subject and object
|
||||
its | possessive adjective
|
||||
itself | reflexive
|
||||
| third person plural
|
||||
they | subject
|
||||
them | object
|
||||
their | possessive adjective
|
||||
theirs | possessive pronoun
|
||||
themselves | reflexive
|
||||
| other forms (demonstratives, interrogatives)
|
||||
what
|
||||
which
|
||||
who
|
||||
whom
|
||||
this
|
||||
that
|
||||
these
|
||||
those
|
||||
|
||||
| VERB FORMS (using F.R. Palmer's nomenclature)
|
||||
| BE
|
||||
am | 1st person, present
|
||||
is | -s form (3rd person, present)
|
||||
are | present
|
||||
was | 1st person, past
|
||||
were | past
|
||||
be | infinitive
|
||||
been | past participle
|
||||
being | -ing form
|
||||
| HAVE
|
||||
have | simple
|
||||
has | -s form
|
||||
had | past
|
||||
having | -ing form
|
||||
| DO
|
||||
do | simple
|
||||
does | -s form
|
||||
did | past
|
||||
doing | -ing form
|
||||
|
||||
| The forms below are, I believe, best omitted, because of the significant
|
||||
| homonym forms:
|
||||
|
||||
| He made a WILL
|
||||
| old tin CAN
|
||||
| merry month of MAY
|
||||
| a smell of MUST
|
||||
| fight the good fight with all thy MIGHT
|
||||
|
||||
| would, could, should, ought might however be included
|
||||
|
||||
| | AUXILIARIES
|
||||
| | WILL
|
||||
|will
|
||||
|
||||
would
|
||||
|
||||
| | SHALL
|
||||
|shall
|
||||
|
||||
should
|
||||
|
||||
| | CAN
|
||||
|can
|
||||
|
||||
could
|
||||
|
||||
| | MAY
|
||||
|may
|
||||
|might
|
||||
| | MUST
|
||||
|must
|
||||
| | OUGHT
|
||||
|
||||
ought
|
||||
|
||||
| COMPOUND FORMS, increasingly encountered nowadays in 'formal' writing
|
||||
| pronoun + verb
|
||||
|
||||
i'm
|
||||
you're
|
||||
he's
|
||||
she's
|
||||
it's
|
||||
we're
|
||||
they're
|
||||
i've
|
||||
you've
|
||||
we've
|
||||
they've
|
||||
i'd
|
||||
you'd
|
||||
he'd
|
||||
she'd
|
||||
we'd
|
||||
they'd
|
||||
i'll
|
||||
you'll
|
||||
he'll
|
||||
she'll
|
||||
we'll
|
||||
they'll
|
||||
|
||||
| verb + negation
|
||||
|
||||
isn't
|
||||
aren't
|
||||
wasn't
|
||||
weren't
|
||||
hasn't
|
||||
haven't
|
||||
hadn't
|
||||
doesn't
|
||||
don't
|
||||
didn't
|
||||
|
||||
| auxiliary + negation
|
||||
|
||||
won't
|
||||
wouldn't
|
||||
shan't
|
||||
shouldn't
|
||||
can't
|
||||
cannot
|
||||
couldn't
|
||||
mustn't
|
||||
|
||||
| miscellaneous forms
|
||||
|
||||
let's
|
||||
that's
|
||||
who's
|
||||
what's
|
||||
here's
|
||||
there's
|
||||
when's
|
||||
where's
|
||||
why's
|
||||
how's
|
||||
|
||||
| rarer forms
|
||||
|
||||
| daren't needn't
|
||||
|
||||
| doubtful forms
|
||||
|
||||
| oughtn't mightn't
|
||||
|
||||
| ARTICLES
|
||||
a
|
||||
an
|
||||
the
|
||||
|
||||
| THE REST (Overlap among prepositions, conjunctions, adverbs etc is so
|
||||
| high, that classification is pointless.)
|
||||
and
|
||||
but
|
||||
if
|
||||
or
|
||||
because
|
||||
as
|
||||
until
|
||||
while
|
||||
|
||||
of
|
||||
at
|
||||
by
|
||||
for
|
||||
with
|
||||
about
|
||||
against
|
||||
between
|
||||
into
|
||||
through
|
||||
during
|
||||
before
|
||||
after
|
||||
above
|
||||
below
|
||||
to
|
||||
from
|
||||
up
|
||||
down
|
||||
in
|
||||
out
|
||||
on
|
||||
off
|
||||
over
|
||||
under
|
||||
|
||||
again
|
||||
further
|
||||
then
|
||||
once
|
||||
|
||||
here
|
||||
there
|
||||
when
|
||||
where
|
||||
why
|
||||
how
|
||||
|
||||
all
|
||||
any
|
||||
both
|
||||
each
|
||||
few
|
||||
more
|
||||
most
|
||||
other
|
||||
some
|
||||
such
|
||||
|
||||
no
|
||||
nor
|
||||
not
|
||||
only
|
||||
own
|
||||
same
|
||||
so
|
||||
than
|
||||
too
|
||||
very
|
||||
|
||||
| Just for the record, the following words are among the commonest in English
|
||||
|
||||
| one
|
||||
| every
|
||||
| least
|
||||
| less
|
||||
| many
|
||||
| now
|
||||
| ever
|
||||
| never
|
||||
| say
|
||||
| says
|
||||
| said
|
||||
| also
|
||||
| get
|
||||
| go
|
||||
| goes
|
||||
| just
|
||||
| made
|
||||
| make
|
||||
| put
|
||||
| see
|
||||
| seen
|
||||
| whether
|
||||
| like
|
||||
| well
|
||||
| back
|
||||
| even
|
||||
| still
|
||||
| way
|
||||
| take
|
||||
| since
|
||||
| another
|
||||
| however
|
||||
| two
|
||||
| three
|
||||
| four
|
||||
| five
|
||||
| first
|
||||
| second
|
||||
| new
|
||||
| old
|
||||
| high
|
||||
| long
|
|
@ -0,0 +1,95 @@
|
|||
| From svn.tartarus.org/snowball/trunk/website/algorithms/finnish/stop.txt
|
||||
| This file is distributed under the BSD License.
|
||||
| See http://snowball.tartarus.org/license.php
|
||||
| Also see http://www.opensource.org/licenses/bsd-license.html
|
||||
| - Encoding was converted to UTF-8.
|
||||
| - This notice was added.
|
||||
|
||||
| forms of BE
|
||||
|
||||
olla
|
||||
olen
|
||||
olet
|
||||
on
|
||||
olemme
|
||||
olette
|
||||
ovat
|
||||
ole | negative form
|
||||
|
||||
oli
|
||||
olisi
|
||||
olisit
|
||||
olisin
|
||||
olisimme
|
||||
olisitte
|
||||
olisivat
|
||||
olit
|
||||
olin
|
||||
olimme
|
||||
olitte
|
||||
olivat
|
||||
ollut
|
||||
olleet
|
||||
|
||||
en | negation
|
||||
et
|
||||
ei
|
||||
emme
|
||||
ette
|
||||
eivät
|
||||
|
||||
|Nom Gen Acc Part Iness Elat Illat Adess Ablat Allat Ess Trans
|
||||
minä minun minut minua minussa minusta minuun minulla minulta minulle | I
|
||||
sinä sinun sinut sinua sinussa sinusta sinuun sinulla sinulta sinulle | you
|
||||
hän hänen hänet häntä hänessä hänestä häneen hänellä häneltä hänelle | he she
|
||||
me meidän meidät meitä meissä meistä meihin meillä meiltä meille | we
|
||||
te teidän teidät teitä teissä teistä teihin teillä teiltä teille | you
|
||||
he heidän heidät heitä heissä heistä heihin heillä heiltä heille | they
|
||||
|
||||
tämä tämän tätä tässä tästä tähän tallä tältä tälle tänä täksi | this
|
||||
tuo tuon tuotä tuossa tuosta tuohon tuolla tuolta tuolle tuona tuoksi | that
|
||||
se sen sitä siinä siitä siihen sillä siltä sille sinä siksi | it
|
||||
nämä näiden näitä näissä näistä näihin näillä näiltä näille näinä näiksi | these
|
||||
nuo noiden noita noissa noista noihin noilla noilta noille noina noiksi | those
|
||||
ne niiden niitä niissä niistä niihin niillä niiltä niille niinä niiksi | they
|
||||
|
||||
kuka kenen kenet ketä kenessä kenestä keneen kenellä keneltä kenelle kenenä keneksi| who
|
||||
ketkä keiden ketkä keitä keissä keistä keihin keillä keiltä keille keinä keiksi | (pl)
|
||||
mikä minkä minkä mitä missä mistä mihin millä miltä mille minä miksi | which what
|
||||
mitkä | (pl)
|
||||
|
||||
joka jonka jota jossa josta johon jolla jolta jolle jona joksi | who which
|
||||
jotka joiden joita joissa joista joihin joilla joilta joille joina joiksi | (pl)
|
||||
|
||||
| conjunctions
|
||||
|
||||
että | that
|
||||
ja | and
|
||||
jos | if
|
||||
koska | because
|
||||
kuin | than
|
||||
mutta | but
|
||||
niin | so
|
||||
sekä | and
|
||||
sillä | for
|
||||
tai | or
|
||||
vaan | but
|
||||
vai | or
|
||||
vaikka | although
|
||||
|
||||
|
||||
| prepositions
|
||||
|
||||
kanssa | with
|
||||
mukaan | according to
|
||||
noin | about
|
||||
poikki | across
|
||||
yli | over, across
|
||||
|
||||
| other
|
||||
|
||||
kun | when
|
||||
niin | so
|
||||
nyt | now
|
||||
itse | self
|
||||
|
|
@ -0,0 +1,183 @@
|
|||
| From svn.tartarus.org/snowball/trunk/website/algorithms/french/stop.txt
|
||||
| This file is distributed under the BSD License.
|
||||
| See http://snowball.tartarus.org/license.php
|
||||
| Also see http://www.opensource.org/licenses/bsd-license.html
|
||||
| - Encoding was converted to UTF-8.
|
||||
| - This notice was added.
|
||||
|
||||
| A French stop word list. Comments begin with vertical bar. Each stop
|
||||
| word is at the start of a line.
|
||||
|
||||
au | a + le
|
||||
aux | a + les
|
||||
avec | with
|
||||
ce | this
|
||||
ces | these
|
||||
dans | in
|
||||
de | of
|
||||
des | de + les
|
||||
du | de + le
|
||||
elle | she
|
||||
en | `of them' etc
|
||||
et | and
|
||||
eux | them
|
||||
il | he
|
||||
je | I
|
||||
la | the
|
||||
le | the
|
||||
leur | their
|
||||
lui | him
|
||||
ma | my (fem)
|
||||
mais | but
|
||||
me | me
|
||||
même | same; as in moi-même (myself) etc
|
||||
mes | my (pl)
|
||||
moi | me
|
||||
mon | my (masc)
|
||||
ne | not
|
||||
nos | our (pl)
|
||||
notre | our
|
||||
nous | we
|
||||
on | one
|
||||
ou | where
|
||||
par | by
|
||||
pas | not
|
||||
pour | for
|
||||
qu | que before vowel
|
||||
que | that
|
||||
qui | who
|
||||
sa | his, her (fem)
|
||||
se | oneself
|
||||
ses | his (pl)
|
||||
son | his, her (masc)
|
||||
sur | on
|
||||
ta | thy (fem)
|
||||
te | thee
|
||||
tes | thy (pl)
|
||||
toi | thee
|
||||
ton | thy (masc)
|
||||
tu | thou
|
||||
un | a
|
||||
une | a
|
||||
vos | your (pl)
|
||||
votre | your
|
||||
vous | you
|
||||
|
||||
| single letter forms
|
||||
|
||||
c | c'
|
||||
d | d'
|
||||
j | j'
|
||||
l | l'
|
||||
à | to, at
|
||||
m | m'
|
||||
n | n'
|
||||
s | s'
|
||||
t | t'
|
||||
y | there
|
||||
|
||||
| forms of être (not including the infinitive):
|
||||
été
|
||||
étée
|
||||
étées
|
||||
étés
|
||||
étant
|
||||
suis
|
||||
es
|
||||
est
|
||||
sommes
|
||||
êtes
|
||||
sont
|
||||
serai
|
||||
seras
|
||||
sera
|
||||
serons
|
||||
serez
|
||||
seront
|
||||
serais
|
||||
serait
|
||||
serions
|
||||
seriez
|
||||
seraient
|
||||
étais
|
||||
était
|
||||
étions
|
||||
étiez
|
||||
étaient
|
||||
fus
|
||||
fut
|
||||
fûmes
|
||||
fûtes
|
||||
furent
|
||||
sois
|
||||
soit
|
||||
soyons
|
||||
soyez
|
||||
soient
|
||||
fusse
|
||||
fusses
|
||||
fût
|
||||
fussions
|
||||
fussiez
|
||||
fussent
|
||||
|
||||
| forms of avoir (not including the infinitive):
|
||||
ayant
|
||||
eu
|
||||
eue
|
||||
eues
|
||||
eus
|
||||
ai
|
||||
as
|
||||
avons
|
||||
avez
|
||||
ont
|
||||
aurai
|
||||
auras
|
||||
aura
|
||||
aurons
|
||||
aurez
|
||||
auront
|
||||
aurais
|
||||
aurait
|
||||
aurions
|
||||
auriez
|
||||
auraient
|
||||
avais
|
||||
avait
|
||||
avions
|
||||
aviez
|
||||
avaient
|
||||
eut
|
||||
eûmes
|
||||
eûtes
|
||||
eurent
|
||||
aie
|
||||
aies
|
||||
ait
|
||||
ayons
|
||||
ayez
|
||||
aient
|
||||
eusse
|
||||
eusses
|
||||
eût
|
||||
eussions
|
||||
eussiez
|
||||
eussent
|
||||
|
||||
| Later additions (from Jean-Christophe Deschamps)
|
||||
ceci | this
|
||||
celà | that
|
||||
cet | this
|
||||
cette | this
|
||||
ici | here
|
||||
ils | they
|
||||
les | the (pl)
|
||||
leurs | their (pl)
|
||||
quel | which
|
||||
quels | which
|
||||
quelle | which
|
||||
quelles | which
|
||||
sans | without
|
||||
soi | oneself
|
||||
|
|
@ -0,0 +1,292 @@
|
|||
| From svn.tartarus.org/snowball/trunk/website/algorithms/german/stop.txt
|
||||
| This file is distributed under the BSD License.
|
||||
| See http://snowball.tartarus.org/license.php
|
||||
| Also see http://www.opensource.org/licenses/bsd-license.html
|
||||
| - Encoding was converted to UTF-8.
|
||||
| - This notice was added.
|
||||
|
||||
| A German stop word list. Comments begin with vertical bar. Each stop
|
||||
| word is at the start of a line.
|
||||
|
||||
| The number of forms in this list is reduced significantly by passing it
|
||||
| through the German stemmer.
|
||||
|
||||
|
||||
aber | but
|
||||
|
||||
alle | all
|
||||
allem
|
||||
allen
|
||||
aller
|
||||
alles
|
||||
|
||||
als | than, as
|
||||
also | so
|
||||
am | an + dem
|
||||
an | at
|
||||
|
||||
ander | other
|
||||
andere
|
||||
anderem
|
||||
anderen
|
||||
anderer
|
||||
anderes
|
||||
anderm
|
||||
andern
|
||||
anderr
|
||||
anders
|
||||
|
||||
auch | also
|
||||
auf | on
|
||||
aus | out of
|
||||
bei | by
|
||||
bin | am
|
||||
bis | until
|
||||
bist | art
|
||||
da | there
|
||||
damit | with it
|
||||
dann | then
|
||||
|
||||
der | the
|
||||
den
|
||||
des
|
||||
dem
|
||||
die
|
||||
das
|
||||
|
||||
daß | that
|
||||
|
||||
derselbe | the same
|
||||
derselben
|
||||
denselben
|
||||
desselben
|
||||
demselben
|
||||
dieselbe
|
||||
dieselben
|
||||
dasselbe
|
||||
|
||||
dazu | to that
|
||||
|
||||
dein | thy
|
||||
deine
|
||||
deinem
|
||||
deinen
|
||||
deiner
|
||||
deines
|
||||
|
||||
denn | because
|
||||
|
||||
derer | of those
|
||||
dessen | of him
|
||||
|
||||
dich | thee
|
||||
dir | to thee
|
||||
du | thou
|
||||
|
||||
dies | this
|
||||
diese
|
||||
diesem
|
||||
diesen
|
||||
dieser
|
||||
dieses
|
||||
|
||||
|
||||
doch | (several meanings)
|
||||
dort | (over) there
|
||||
|
||||
|
||||
durch | through
|
||||
|
||||
ein | a
|
||||
eine
|
||||
einem
|
||||
einen
|
||||
einer
|
||||
eines
|
||||
|
||||
einig | some
|
||||
einige
|
||||
einigem
|
||||
einigen
|
||||
einiger
|
||||
einiges
|
||||
|
||||
einmal | once
|
||||
|
||||
er | he
|
||||
ihn | him
|
||||
ihm | to him
|
||||
|
||||
es | it
|
||||
etwas | something
|
||||
|
||||
euer | your
|
||||
eure
|
||||
eurem
|
||||
euren
|
||||
eurer
|
||||
eures
|
||||
|
||||
für | for
|
||||
gegen | towards
|
||||
gewesen | p.p. of sein
|
||||
hab | have
|
||||
habe | have
|
||||
haben | have
|
||||
hat | has
|
||||
hatte | had
|
||||
hatten | had
|
||||
hier | here
|
||||
hin | there
|
||||
hinter | behind
|
||||
|
||||
ich | I
|
||||
mich | me
|
||||
mir | to me
|
||||
|
||||
|
||||
ihr | you, to her
|
||||
ihre
|
||||
ihrem
|
||||
ihren
|
||||
ihrer
|
||||
ihres
|
||||
euch | to you
|
||||
|
||||
im | in + dem
|
||||
in | in
|
||||
indem | while
|
||||
ins | in + das
|
||||
ist | is
|
||||
|
||||
jede | each, every
|
||||
jedem
|
||||
jeden
|
||||
jeder
|
||||
jedes
|
||||
|
||||
jene | that
|
||||
jenem
|
||||
jenen
|
||||
jener
|
||||
jenes
|
||||
|
||||
jetzt | now
|
||||
kann | can
|
||||
|
||||
kein | no
|
||||
keine
|
||||
keinem
|
||||
keinen
|
||||
keiner
|
||||
keines
|
||||
|
||||
können | can
|
||||
könnte | could
|
||||
machen | do
|
||||
man | one
|
||||
|
||||
manche | some, many a
|
||||
manchem
|
||||
manchen
|
||||
mancher
|
||||
manches
|
||||
|
||||
mein | my
|
||||
meine
|
||||
meinem
|
||||
meinen
|
||||
meiner
|
||||
meines
|
||||
|
||||
mit | with
|
||||
muss | must
|
||||
musste | had to
|
||||
nach | to(wards)
|
||||
nicht | not
|
||||
nichts | nothing
|
||||
noch | still, yet
|
||||
nun | now
|
||||
nur | only
|
||||
ob | whether
|
||||
oder | or
|
||||
ohne | without
|
||||
sehr | very
|
||||
|
||||
sein | his
|
||||
seine
|
||||
seinem
|
||||
seinen
|
||||
seiner
|
||||
seines
|
||||
|
||||
selbst | self
|
||||
sich | herself
|
||||
|
||||
sie | they, she
|
||||
ihnen | to them
|
||||
|
||||
sind | are
|
||||
so | so
|
||||
|
||||
solche | such
|
||||
solchem
|
||||
solchen
|
||||
solcher
|
||||
solches
|
||||
|
||||
soll | shall
|
||||
sollte | should
|
||||
sondern | but
|
||||
sonst | else
|
||||
über | over
|
||||
um | about, around
|
||||
und | and
|
||||
|
||||
uns | us
|
||||
unse
|
||||
unsem
|
||||
unsen
|
||||
unser
|
||||
unses
|
||||
|
||||
unter | under
|
||||
viel | much
|
||||
vom | von + dem
|
||||
von | from
|
||||
vor | before
|
||||
während | while
|
||||
war | was
|
||||
waren | were
|
||||
warst | wast
|
||||
was | what
|
||||
weg | away, off
|
||||
weil | because
|
||||
weiter | further
|
||||
|
||||
welche | which
|
||||
welchem
|
||||
welchen
|
||||
welcher
|
||||
welches
|
||||
|
||||
wenn | when
|
||||
werde | will
|
||||
werden | will
|
||||
wie | how
|
||||
wieder | again
|
||||
will | want
|
||||
wir | we
|
||||
wird | will
|
||||
wirst | willst
|
||||
wo | where
|
||||
wollen | want
|
||||
wollte | wanted
|
||||
würde | would
|
||||
würden | would
|
||||
zu | to
|
||||
zum | zu + dem
|
||||
zur | zu + der
|
||||
zwar | indeed
|
||||
zwischen | between
|
||||
|
|
@ -0,0 +1,209 @@
|
|||
| From svn.tartarus.org/snowball/trunk/website/algorithms/hungarian/stop.txt
|
||||
| This file is distributed under the BSD License.
|
||||
| See http://snowball.tartarus.org/license.php
|
||||
| Also see http://www.opensource.org/licenses/bsd-license.html
|
||||
| - Encoding was converted to UTF-8.
|
||||
| - This notice was added.
|
||||
|
||||
| Hungarian stop word list
|
||||
| prepared by Anna Tordai
|
||||
|
||||
a
|
||||
ahogy
|
||||
ahol
|
||||
aki
|
||||
akik
|
||||
akkor
|
||||
alatt
|
||||
által
|
||||
általában
|
||||
amely
|
||||
amelyek
|
||||
amelyekben
|
||||
amelyeket
|
||||
amelyet
|
||||
amelynek
|
||||
ami
|
||||
amit
|
||||
amolyan
|
||||
amíg
|
||||
amikor
|
||||
át
|
||||
abban
|
||||
ahhoz
|
||||
annak
|
||||
arra
|
||||
arról
|
||||
az
|
||||
azok
|
||||
azon
|
||||
azt
|
||||
azzal
|
||||
azért
|
||||
aztán
|
||||
azután
|
||||
azonban
|
||||
bár
|
||||
be
|
||||
belül
|
||||
benne
|
||||
cikk
|
||||
cikkek
|
||||
cikkeket
|
||||
csak
|
||||
de
|
||||
e
|
||||
eddig
|
||||
egész
|
||||
egy
|
||||
egyes
|
||||
egyetlen
|
||||
egyéb
|
||||
egyik
|
||||
egyre
|
||||
ekkor
|
||||
el
|
||||
elég
|
||||
ellen
|
||||
elő
|
||||
először
|
||||
előtt
|
||||
első
|
||||
én
|
||||
éppen
|
||||
ebben
|
||||
ehhez
|
||||
emilyen
|
||||
ennek
|
||||
erre
|
||||
ez
|
||||
ezt
|
||||
ezek
|
||||
ezen
|
||||
ezzel
|
||||
ezért
|
||||
és
|
||||
fel
|
||||
felé
|
||||
hanem
|
||||
hiszen
|
||||
hogy
|
||||
hogyan
|
||||
igen
|
||||
így
|
||||
illetve
|
||||
ill.
|
||||
ill
|
||||
ilyen
|
||||
ilyenkor
|
||||
ison
|
||||
ismét
|
||||
itt
|
||||
jó
|
||||
jól
|
||||
jobban
|
||||
kell
|
||||
kellett
|
||||
keresztül
|
||||
keressünk
|
||||
ki
|
||||
kívül
|
||||
között
|
||||
közül
|
||||
legalább
|
||||
lehet
|
||||
lehetett
|
||||
legyen
|
||||
lenne
|
||||
lenni
|
||||
lesz
|
||||
lett
|
||||
maga
|
||||
magát
|
||||
majd
|
||||
majd
|
||||
már
|
||||
más
|
||||
másik
|
||||
meg
|
||||
még
|
||||
mellett
|
||||
mert
|
||||
mely
|
||||
melyek
|
||||
mi
|
||||
mit
|
||||
míg
|
||||
miért
|
||||
milyen
|
||||
mikor
|
||||
minden
|
||||
mindent
|
||||
mindenki
|
||||
mindig
|
||||
mint
|
||||
mintha
|
||||
mivel
|
||||
most
|
||||
nagy
|
||||
nagyobb
|
||||
nagyon
|
||||
ne
|
||||
néha
|
||||
nekem
|
||||
neki
|
||||
nem
|
||||
néhány
|
||||
nélkül
|
||||
nincs
|
||||
olyan
|
||||
ott
|
||||
össze
|
||||
ő
|
||||
ők
|
||||
őket
|
||||
pedig
|
||||
persze
|
||||
rá
|
||||
s
|
||||
saját
|
||||
sem
|
||||
semmi
|
||||
sok
|
||||
sokat
|
||||
sokkal
|
||||
számára
|
||||
szemben
|
||||
szerint
|
||||
szinte
|
||||
talán
|
||||
tehát
|
||||
teljes
|
||||
tovább
|
||||
továbbá
|
||||
több
|
||||
úgy
|
||||
ugyanis
|
||||
új
|
||||
újabb
|
||||
újra
|
||||
után
|
||||
utána
|
||||
utolsó
|
||||
vagy
|
||||
vagyis
|
||||
valaki
|
||||
valami
|
||||
valamint
|
||||
való
|
||||
vagyok
|
||||
van
|
||||
vannak
|
||||
volt
|
||||
voltam
|
||||
voltak
|
||||
voltunk
|
||||
vissza
|
||||
vele
|
||||
viszont
|
||||
volna
|
|
@ -0,0 +1,301 @@
|
|||
| From svn.tartarus.org/snowball/trunk/website/algorithms/italian/stop.txt
|
||||
| This file is distributed under the BSD License.
|
||||
| See http://snowball.tartarus.org/license.php
|
||||
| Also see http://www.opensource.org/licenses/bsd-license.html
|
||||
| - Encoding was converted to UTF-8.
|
||||
| - This notice was added.
|
||||
|
||||
| An Italian stop word list. Comments begin with vertical bar. Each stop
|
||||
| word is at the start of a line.
|
||||
|
||||
ad | a (to) before vowel
|
||||
al | a + il
|
||||
allo | a + lo
|
||||
ai | a + i
|
||||
agli | a + gli
|
||||
all | a + l'
|
||||
agl | a + gl'
|
||||
alla | a + la
|
||||
alle | a + le
|
||||
con | with
|
||||
col | con + il
|
||||
coi | con + i (forms collo, cogli etc are now very rare)
|
||||
da | from
|
||||
dal | da + il
|
||||
dallo | da + lo
|
||||
dai | da + i
|
||||
dagli | da + gli
|
||||
dall | da + l'
|
||||
dagl | da + gl'
|
||||
dalla | da + la
|
||||
dalle | da + le
|
||||
di | of
|
||||
del | di + il
|
||||
dello | di + lo
|
||||
dei | di + i
|
||||
degli | di + gli
|
||||
dell | di + l'
|
||||
degl | di + gl'
|
||||
della | di + la
|
||||
delle | di + le
|
||||
in | in
|
||||
nel | in + il
|
||||
nello | in + lo
|
||||
nei | in + i
|
||||
negli | in + gli
|
||||
nell | in + l'
|
||||
negl | in + gl'
|
||||
nella | in + la
|
||||
nelle | in + le
|
||||
su | on
|
||||
sul | su + il
|
||||
sullo | su + lo
|
||||
sui | su + i
|
||||
sugli | su + gli
|
||||
sull | su + l'
|
||||
sugl | su + gl'
|
||||
sulla | su + la
|
||||
sulle | su + le
|
||||
per | through, by
|
||||
tra | among
|
||||
contro | against
|
||||
io | I
|
||||
tu | thou
|
||||
lui | he
|
||||
lei | she
|
||||
noi | we
|
||||
voi | you
|
||||
loro | they
|
||||
mio | my
|
||||
mia |
|
||||
miei |
|
||||
mie |
|
||||
tuo |
|
||||
tua |
|
||||
tuoi | thy
|
||||
tue |
|
||||
suo |
|
||||
sua |
|
||||
suoi | his, her
|
||||
sue |
|
||||
nostro | our
|
||||
nostra |
|
||||
nostri |
|
||||
nostre |
|
||||
vostro | your
|
||||
vostra |
|
||||
vostri |
|
||||
vostre |
|
||||
mi | me
|
||||
ti | thee
|
||||
ci | us, there
|
||||
vi | you, there
|
||||
lo | him, the
|
||||
la | her, the
|
||||
li | them
|
||||
le | them, the
|
||||
gli | to him, the
|
||||
ne | from there etc
|
||||
il | the
|
||||
un | a
|
||||
uno | a
|
||||
una | a
|
||||
ma | but
|
||||
ed | and
|
||||
se | if
|
||||
perché | why, because
|
||||
anche | also
|
||||
come | how
|
||||
dov | where (as dov')
|
||||
dove | where
|
||||
che | who, that
|
||||
chi | who
|
||||
cui | whom
|
||||
non | not
|
||||
più | more
|
||||
quale | who, that
|
||||
quanto | how much
|
||||
quanti |
|
||||
quanta |
|
||||
quante |
|
||||
quello | that
|
||||
quelli |
|
||||
quella |
|
||||
quelle |
|
||||
questo | this
|
||||
questi |
|
||||
questa |
|
||||
queste |
|
||||
si | yes
|
||||
tutto | all
|
||||
tutti | all
|
||||
|
||||
| single letter forms:
|
||||
|
||||
a | at
|
||||
c | as c' for ce or ci
|
||||
e | and
|
||||
i | the
|
||||
l | as l'
|
||||
o | or
|
||||
|
||||
| forms of avere, to have (not including the infinitive):
|
||||
|
||||
ho
|
||||
hai
|
||||
ha
|
||||
abbiamo
|
||||
avete
|
||||
hanno
|
||||
abbia
|
||||
abbiate
|
||||
abbiano
|
||||
avrò
|
||||
avrai
|
||||
avrà
|
||||
avremo
|
||||
avrete
|
||||
avranno
|
||||
avrei
|
||||
avresti
|
||||
avrebbe
|
||||
avremmo
|
||||
avreste
|
||||
avrebbero
|
||||
avevo
|
||||
avevi
|
||||
aveva
|
||||
avevamo
|
||||
avevate
|
||||
avevano
|
||||
ebbi
|
||||
avesti
|
||||
ebbe
|
||||
avemmo
|
||||
aveste
|
||||
ebbero
|
||||
avessi
|
||||
avesse
|
||||
avessimo
|
||||
avessero
|
||||
avendo
|
||||
avuto
|
||||
avuta
|
||||
avuti
|
||||
avute
|
||||
|
||||
| forms of essere, to be (not including the infinitive):
|
||||
sono
|
||||
sei
|
||||
è
|
||||
siamo
|
||||
siete
|
||||
sia
|
||||
siate
|
||||
siano
|
||||
sarò
|
||||
sarai
|
||||
sarà
|
||||
saremo
|
||||
sarete
|
||||
saranno
|
||||
sarei
|
||||
saresti
|
||||
sarebbe
|
||||
saremmo
|
||||
sareste
|
||||
sarebbero
|
||||
ero
|
||||
eri
|
||||
era
|
||||
eravamo
|
||||
eravate
|
||||
erano
|
||||
fui
|
||||
fosti
|
||||
fu
|
||||
fummo
|
||||
foste
|
||||
furono
|
||||
fossi
|
||||
fosse
|
||||
fossimo
|
||||
fossero
|
||||
essendo
|
||||
|
||||
| forms of fare, to do (not including the infinitive, fa, fat-):
|
||||
faccio
|
||||
fai
|
||||
facciamo
|
||||
fanno
|
||||
faccia
|
||||
facciate
|
||||
facciano
|
||||
farò
|
||||
farai
|
||||
farà
|
||||
faremo
|
||||
farete
|
||||
faranno
|
||||
farei
|
||||
faresti
|
||||
farebbe
|
||||
faremmo
|
||||
fareste
|
||||
farebbero
|
||||
facevo
|
||||
facevi
|
||||
faceva
|
||||
facevamo
|
||||
facevate
|
||||
facevano
|
||||
feci
|
||||
facesti
|
||||
fece
|
||||
facemmo
|
||||
faceste
|
||||
fecero
|
||||
facessi
|
||||
facesse
|
||||
facessimo
|
||||
facessero
|
||||
facendo
|
||||
|
||||
| forms of stare, to be (not including the infinitive):
|
||||
sto
|
||||
stai
|
||||
sta
|
||||
stiamo
|
||||
stanno
|
||||
stia
|
||||
stiate
|
||||
stiano
|
||||
starò
|
||||
starai
|
||||
starà
|
||||
staremo
|
||||
starete
|
||||
staranno
|
||||
starei
|
||||
staresti
|
||||
starebbe
|
||||
staremmo
|
||||
stareste
|
||||
starebbero
|
||||
stavo
|
||||
stavi
|
||||
stava
|
||||
stavamo
|
||||
stavate
|
||||
stavano
|
||||
stetti
|
||||
stesti
|
||||
stette
|
||||
stemmo
|
||||
steste
|
||||
stettero
|
||||
stessi
|
||||
stesse
|
||||
stessimo
|
||||
stessero
|
||||
stando
|
|
@ -0,0 +1,192 @@
|
|||
| From svn.tartarus.org/snowball/trunk/website/algorithms/norwegian/stop.txt
|
||||
| This file is distributed under the BSD License.
|
||||
| See http://snowball.tartarus.org/license.php
|
||||
| Also see http://www.opensource.org/licenses/bsd-license.html
|
||||
| - Encoding was converted to UTF-8.
|
||||
| - This notice was added.
|
||||
|
||||
| A Norwegian stop word list. Comments begin with vertical bar. Each stop
|
||||
| word is at the start of a line.
|
||||
|
||||
| This stop word list is for the dominant bokmål dialect. Words unique
|
||||
| to nynorsk are marked *.
|
||||
|
||||
| Revised by Jan Bruusgaard <Jan.Bruusgaard@ssb.no>, Jan 2005
|
||||
|
||||
og | and
|
||||
i | in
|
||||
jeg | I
|
||||
det | it/this/that
|
||||
at | to (w. inf.)
|
||||
en | a/an
|
||||
et | a/an
|
||||
den | it/this/that
|
||||
til | to
|
||||
er | is/am/are
|
||||
som | who/that
|
||||
på | on
|
||||
de | they / you(formal)
|
||||
med | with
|
||||
han | he
|
||||
av | of
|
||||
ikke | not
|
||||
ikkje | not *
|
||||
der | there
|
||||
så | so
|
||||
var | was/were
|
||||
meg | me
|
||||
seg | oneself (reflexive)
|
||||
men | but
|
||||
ett | one
|
||||
har | have
|
||||
om | about
|
||||
vi | we
|
||||
min | my
|
||||
mitt | my
|
||||
ha | have
|
||||
hadde | had
|
||||
hun | she
|
||||
nå | now
|
||||
over | over
|
||||
da | when/as
|
||||
ved | by/know
|
||||
fra | from
|
||||
du | you
|
||||
ut | out
|
||||
sin | your
|
||||
dem | them
|
||||
oss | us
|
||||
opp | up
|
||||
man | you/one
|
||||
kan | can
|
||||
hans | his
|
||||
hvor | where
|
||||
eller | or
|
||||
hva | what
|
||||
skal | shall/must
|
||||
selv | self (reflective)
|
||||
sjøl | self (reflective)
|
||||
her | here
|
||||
alle | all
|
||||
vil | will
|
||||
bli | become
|
||||
ble | became
|
||||
blei | became *
|
||||
blitt | have become
|
||||
kunne | could
|
||||
inn | in
|
||||
når | when
|
||||
være | be
|
||||
kom | come
|
||||
noen | some
|
||||
noe | some
|
||||
ville | would
|
||||
dere | you
|
||||
som | who/which/that
|
||||
deres | their/theirs
|
||||
kun | only/just
|
||||
ja | yes
|
||||
etter | after
|
||||
ned | down
|
||||
skulle | should
|
||||
denne | this
|
||||
for | for/because
|
||||
deg | you
|
||||
si | hers/his
|
||||
sine | hers/his
|
||||
sitt | hers/his
|
||||
mot | against
|
||||
å | to
|
||||
meget | much
|
||||
hvorfor | why
|
||||
dette | this
|
||||
disse | these/those
|
||||
uten | without
|
||||
hvordan | how
|
||||
ingen | none
|
||||
din | your
|
||||
ditt | your
|
||||
blir | become
|
||||
samme | same
|
||||
hvilken | which
|
||||
hvilke | which (plural)
|
||||
sånn | such a
|
||||
inni | inside/within
|
||||
mellom | between
|
||||
vår | our
|
||||
hver | each
|
||||
hvem | who
|
||||
vors | us/ours
|
||||
hvis | whose
|
||||
både | both
|
||||
bare | only/just
|
||||
enn | than
|
||||
fordi | as/because
|
||||
før | before
|
||||
mange | many
|
||||
også | also
|
||||
slik | just
|
||||
vært | been
|
||||
være | to be
|
||||
båe | both *
|
||||
begge | both
|
||||
siden | since
|
||||
dykk | your *
|
||||
dykkar | yours *
|
||||
dei | they *
|
||||
deira | them *
|
||||
deires | theirs *
|
||||
deim | them *
|
||||
di | your (fem.) *
|
||||
då | as/when *
|
||||
eg | I *
|
||||
ein | a/an *
|
||||
eit | a/an *
|
||||
eitt | a/an *
|
||||
elles | or *
|
||||
honom | he *
|
||||
hjå | at *
|
||||
ho | she *
|
||||
hoe | she *
|
||||
henne | her
|
||||
hennar | her/hers
|
||||
hennes | hers
|
||||
hoss | how *
|
||||
hossen | how *
|
||||
ikkje | not *
|
||||
ingi | noone *
|
||||
inkje | noone *
|
||||
korleis | how *
|
||||
korso | how *
|
||||
kva | what/which *
|
||||
kvar | where *
|
||||
kvarhelst | where *
|
||||
kven | who/whom *
|
||||
kvi | why *
|
||||
kvifor | why *
|
||||
me | we *
|
||||
medan | while *
|
||||
mi | my *
|
||||
mine | my *
|
||||
mykje | much *
|
||||
no | now *
|
||||
nokon | some (masc./neut.) *
|
||||
noka | some (fem.) *
|
||||
nokor | some *
|
||||
noko | some *
|
||||
nokre | some *
|
||||
si | his/hers *
|
||||
sia | since *
|
||||
sidan | since *
|
||||
so | so *
|
||||
somt | some *
|
||||
somme | some *
|
||||
um | about *
|
||||
upp | up *
|
||||
vere | be *
|
||||
vore | was *
|
||||
verte | become *
|
||||
vort | become *
|
||||
varte | became *
|
||||
vart | became *
|
||||
|
|
@ -0,0 +1,251 @@
|
|||
| From svn.tartarus.org/snowball/trunk/website/algorithms/portuguese/stop.txt
|
||||
| This file is distributed under the BSD License.
|
||||
| See http://snowball.tartarus.org/license.php
|
||||
| Also see http://www.opensource.org/licenses/bsd-license.html
|
||||
| - Encoding was converted to UTF-8.
|
||||
| - This notice was added.
|
||||
|
||||
| A Portuguese stop word list. Comments begin with vertical bar. Each stop
|
||||
| word is at the start of a line.
|
||||
|
||||
|
||||
| The following is a ranked list (commonest to rarest) of stopwords
|
||||
| deriving from a large sample of text.
|
||||
|
||||
| Extra words have been added at the end.
|
||||
|
||||
de | of, from
|
||||
a | the; to, at; her
|
||||
o | the; him
|
||||
que | who, that
|
||||
e | and
|
||||
do | de + o
|
||||
da | de + a
|
||||
em | in
|
||||
um | a
|
||||
para | for
|
||||
| é from SER
|
||||
com | with
|
||||
não | not, no
|
||||
uma | a
|
||||
os | the; them
|
||||
no | em + o
|
||||
se | himself etc
|
||||
na | em + a
|
||||
por | for
|
||||
mais | more
|
||||
as | the; them
|
||||
dos | de + os
|
||||
como | as, like
|
||||
mas | but
|
||||
| foi from SER
|
||||
ao | a + o
|
||||
ele | he
|
||||
das | de + as
|
||||
| tem from TER
|
||||
à | a + a
|
||||
seu | his
|
||||
sua | her
|
||||
ou | or
|
||||
| ser from SER
|
||||
quando | when
|
||||
muito | much
|
||||
| há from HAV
|
||||
nos | em + os; us
|
||||
já | already, now
|
||||
| está from EST
|
||||
eu | I
|
||||
também | also
|
||||
só | only, just
|
||||
pelo | per + o
|
||||
pela | per + a
|
||||
até | up to
|
||||
isso | that
|
||||
ela | she
|
||||
entre | between
|
||||
| era from SER
|
||||
depois | after
|
||||
sem | without
|
||||
mesmo | same
|
||||
aos | a + os
|
||||
| ter from TER
|
||||
seus | his
|
||||
quem | whom
|
||||
nas | em + as
|
||||
me | me
|
||||
esse | that
|
||||
eles | they
|
||||
| estão from EST
|
||||
você | you
|
||||
| tinha from TER
|
||||
| foram from SER
|
||||
essa | that
|
||||
num | em + um
|
||||
nem | nor
|
||||
suas | her
|
||||
meu | my
|
||||
às | a + as
|
||||
minha | my
|
||||
| têm from TER
|
||||
numa | em + uma
|
||||
pelos | per + os
|
||||
elas | they
|
||||
| havia from HAV
|
||||
| seja from SER
|
||||
qual | which
|
||||
| será from SER
|
||||
nós | we
|
||||
| tenho from TER
|
||||
lhe | to him, her
|
||||
deles | of them
|
||||
essas | those
|
||||
esses | those
|
||||
pelas | per + as
|
||||
este | this
|
||||
| fosse from SER
|
||||
dele | of him
|
||||
|
||||
| other words. There are many contractions such as naquele = em+aquele,
|
||||
| mo = me+o, but they are rare.
|
||||
| Indefinite article plural forms are also rare.
|
||||
|
||||
tu | thou
|
||||
te | thee
|
||||
vocês | you (plural)
|
||||
vos | you
|
||||
lhes | to them
|
||||
meus | my
|
||||
minhas
|
||||
teu | thy
|
||||
tua
|
||||
teus
|
||||
tuas
|
||||
nosso | our
|
||||
nossa
|
||||
nossos
|
||||
nossas
|
||||
|
||||
dela | of her
|
||||
delas | of them
|
||||
|
||||
esta | this
|
||||
estes | these
|
||||
estas | these
|
||||
aquele | that
|
||||
aquela | that
|
||||
aqueles | those
|
||||
aquelas | those
|
||||
isto | this
|
||||
aquilo | that
|
||||
|
||||
| forms of estar, to be (not including the infinitive):
|
||||
estou
|
||||
está
|
||||
estamos
|
||||
estão
|
||||
estive
|
||||
esteve
|
||||
estivemos
|
||||
estiveram
|
||||
estava
|
||||
estávamos
|
||||
estavam
|
||||
estivera
|
||||
estivéramos
|
||||
esteja
|
||||
estejamos
|
||||
estejam
|
||||
estivesse
|
||||
estivéssemos
|
||||
estivessem
|
||||
estiver
|
||||
estivermos
|
||||
estiverem
|
||||
|
||||
| forms of haver, to have (not including the infinitive):
|
||||
hei
|
||||
há
|
||||
havemos
|
||||
hão
|
||||
houve
|
||||
houvemos
|
||||
houveram
|
||||
houvera
|
||||
houvéramos
|
||||
haja
|
||||
hajamos
|
||||
hajam
|
||||
houvesse
|
||||
houvéssemos
|
||||
houvessem
|
||||
houver
|
||||
houvermos
|
||||
houverem
|
||||
houverei
|
||||
houverá
|
||||
houveremos
|
||||
houverão
|
||||
houveria
|
||||
houveríamos
|
||||
houveriam
|
||||
|
||||
| forms of ser, to be (not including the infinitive):
|
||||
sou
|
||||
somos
|
||||
são
|
||||
era
|
||||
éramos
|
||||
eram
|
||||
fui
|
||||
foi
|
||||
fomos
|
||||
foram
|
||||
fora
|
||||
fôramos
|
||||
seja
|
||||
sejamos
|
||||
sejam
|
||||
fosse
|
||||
fôssemos
|
||||
fossem
|
||||
for
|
||||
formos
|
||||
forem
|
||||
serei
|
||||
será
|
||||
seremos
|
||||
serão
|
||||
seria
|
||||
seríamos
|
||||
seriam
|
||||
|
||||
| forms of ter, to have (not including the infinitive):
|
||||
tenho
|
||||
tem
|
||||
temos
|
||||
tém
|
||||
tinha
|
||||
tínhamos
|
||||
tinham
|
||||
tive
|
||||
teve
|
||||
tivemos
|
||||
tiveram
|
||||
tivera
|
||||
tivéramos
|
||||
tenha
|
||||
tenhamos
|
||||
tenham
|
||||
tivesse
|
||||
tivéssemos
|
||||
tivessem
|
||||
tiver
|
||||
tivermos
|
||||
tiverem
|
||||
terei
|
||||
terá
|
||||
teremos
|
||||
terão
|
||||
teria
|
||||
teríamos
|
||||
teriam
|
|
@ -0,0 +1,241 @@
|
|||
| From svn.tartarus.org/snowball/trunk/website/algorithms/russian/stop.txt
|
||||
| This file is distributed under the BSD License.
|
||||
| See http://snowball.tartarus.org/license.php
|
||||
| Also see http://www.opensource.org/licenses/bsd-license.html
|
||||
| - Encoding was converted to UTF-8.
|
||||
| - This notice was added.
|
||||
|
||||
| a russian stop word list. comments begin with vertical bar. each stop
|
||||
| word is at the start of a line.
|
||||
|
||||
| this is a ranked list (commonest to rarest) of stopwords derived from
|
||||
| a large text sample.
|
||||
|
||||
| letter `ё' is translated to `е'.
|
||||
|
||||
и | and
|
||||
в | in/into
|
||||
во | alternative form
|
||||
не | not
|
||||
что | what/that
|
||||
он | he
|
||||
на | on/onto
|
||||
я | i
|
||||
с | from
|
||||
со | alternative form
|
||||
как | how
|
||||
а | milder form of `no' (but)
|
||||
то | conjunction and form of `that'
|
||||
все | all
|
||||
она | she
|
||||
так | so, thus
|
||||
его | him
|
||||
но | but
|
||||
да | yes/and
|
||||
ты | thou
|
||||
к | towards, by
|
||||
у | around, chez
|
||||
же | intensifier particle
|
||||
вы | you
|
||||
за | beyond, behind
|
||||
бы | conditional/subj. particle
|
||||
по | up to, along
|
||||
только | only
|
||||
ее | her
|
||||
мне | to me
|
||||
было | it was
|
||||
вот | here is/are, particle
|
||||
от | away from
|
||||
меня | me
|
||||
еще | still, yet, more
|
||||
нет | no, there isnt/arent
|
||||
о | about
|
||||
из | out of
|
||||
ему | to him
|
||||
теперь | now
|
||||
когда | when
|
||||
даже | even
|
||||
ну | so, well
|
||||
вдруг | suddenly
|
||||
ли | interrogative particle
|
||||
если | if
|
||||
уже | already, but homonym of `narrower'
|
||||
или | or
|
||||
ни | neither
|
||||
быть | to be
|
||||
был | he was
|
||||
него | prepositional form of его
|
||||
до | up to
|
||||
вас | you accusative
|
||||
нибудь | indef. suffix preceded by hyphen
|
||||
опять | again
|
||||
уж | already, but homonym of `adder'
|
||||
вам | to you
|
||||
сказал | he said
|
||||
ведь | particle `after all'
|
||||
там | there
|
||||
потом | then
|
||||
себя | oneself
|
||||
ничего | nothing
|
||||
ей | to her
|
||||
может | usually with `быть' as `maybe'
|
||||
они | they
|
||||
тут | here
|
||||
где | where
|
||||
есть | there is/are
|
||||
надо | got to, must
|
||||
ней | prepositional form of ей
|
||||
для | for
|
||||
мы | we
|
||||
тебя | thee
|
||||
их | them, their
|
||||
чем | than
|
||||
была | she was
|
||||
сам | self
|
||||
чтоб | in order to
|
||||
без | without
|
||||
будто | as if
|
||||
человек | man, person, one
|
||||
чего | genitive form of `what'
|
||||
раз | once
|
||||
тоже | also
|
||||
себе | to oneself
|
||||
под | beneath
|
||||
жизнь | life
|
||||
будет | will be
|
||||
ж | short form of intensifier particle `же'
|
||||
тогда | then
|
||||
кто | who
|
||||
этот | this
|
||||
говорил | was saying
|
||||
того | genitive form of `that'
|
||||
потому | for that reason
|
||||
этого | genitive form of `this'
|
||||
какой | which
|
||||
совсем | altogether
|
||||
ним | prepositional form of `его', `они'
|
||||
здесь | here
|
||||
этом | prepositional form of `этот'
|
||||
один | one
|
||||
почти | almost
|
||||
мой | my
|
||||
тем | instrumental/dative plural of `тот', `то'
|
||||
чтобы | full form of `in order that'
|
||||
нее | her (acc.)
|
||||
кажется | it seems
|
||||
сейчас | now
|
||||
были | they were
|
||||
куда | where to
|
||||
зачем | why
|
||||
сказать | to say
|
||||
всех | all (acc., gen. preposn. plural)
|
||||
никогда | never
|
||||
сегодня | today
|
||||
можно | possible, one can
|
||||
при | by
|
||||
наконец | finally
|
||||
два | two
|
||||
об | alternative form of `о', about
|
||||
другой | another
|
||||
хоть | even
|
||||
после | after
|
||||
над | above
|
||||
больше | more
|
||||
тот | that one (masc.)
|
||||
через | across, in
|
||||
эти | these
|
||||
нас | us
|
||||
про | about
|
||||
всего | in all, only, of all
|
||||
них | prepositional form of `они' (they)
|
||||
какая | which, feminine
|
||||
много | lots
|
||||
разве | interrogative particle
|
||||
сказала | she said
|
||||
три | three
|
||||
эту | this, acc. fem. sing.
|
||||
моя | my, feminine
|
||||
впрочем | moreover, besides
|
||||
хорошо | good
|
||||
свою | ones own, acc. fem. sing.
|
||||
этой | oblique form of `эта', fem. `this'
|
||||
перед | in front of
|
||||
иногда | sometimes
|
||||
лучше | better
|
||||
чуть | a little
|
||||
том | preposn. form of `that one'
|
||||
нельзя | one must not
|
||||
такой | such a one
|
||||
им | to them
|
||||
более | more
|
||||
всегда | always
|
||||
конечно | of course
|
||||
всю | acc. fem. sing of `all'
|
||||
между | between
|
||||
|
||||
|
||||
| b: some paradigms
|
||||
|
|
||||
| personal pronouns
|
||||
|
|
||||
| я меня мне мной [мною]
|
||||
| ты тебя тебе тобой [тобою]
|
||||
| он его ему им [него, нему, ним]
|
||||
| она ее ей ею [нее, ней, нею]
|
||||
| оно его ему им [него, нему, ним]
|
||||
|
|
||||
| мы нас нам нами
|
||||
| вы вас вам вами
|
||||
| они их им ими [них, ним, ними]
|
||||
|
|
||||
| себя себе собой [собою]
|
||||
|
|
||||
| demonstrative pronouns: этот (this), тот (that)
|
||||
|
|
||||
| этот эта это эти
|
||||
| этого эту это эти
|
||||
| этого этой этого этих
|
||||
| этому этой этому этим
|
||||
| этим этой этим [этою] этими
|
||||
| этом этой этом этих
|
||||
|
|
||||
| тот та то те
|
||||
| того ту то те
|
||||
| того той того тех
|
||||
| тому той тому тем
|
||||
| тем той тем [тою] теми
|
||||
| том той том тех
|
||||
|
|
||||
| determinative pronouns
|
||||
|
|
||||
| (a) весь (all)
|
||||
|
|
||||
| весь вся все все
|
||||
| всего всю все все
|
||||
| всего всей всего всех
|
||||
| всему всей всему всем
|
||||
| всем всей всем [всею] всеми
|
||||
| всем всей всем всех
|
||||
|
|
||||
| (b) сам (himself etc)
|
||||
|
|
||||
| сам сама само сами
|
||||
| самого саму само самих
|
||||
| самого самой самого самих
|
||||
| самому самой самому самим
|
||||
| самим самой самим [самою] самими
|
||||
| самом самой самом самих
|
||||
|
|
||||
| stems of verbs `to be', `to have', `to do' and modal
|
||||
|
|
||||
| быть бы буд быв есть суть
|
||||
| име
|
||||
| дел
|
||||
| мог мож мочь
|
||||
| уме
|
||||
| хоч хот
|
||||
| долж
|
||||
| можн
|
||||
| нужн
|
||||
| нельзя
|
||||
|
|
@ -0,0 +1,354 @@
|
|||
| From svn.tartarus.org/snowball/trunk/website/algorithms/spanish/stop.txt
|
||||
| This file is distributed under the BSD License.
|
||||
| See http://snowball.tartarus.org/license.php
|
||||
| Also see http://www.opensource.org/licenses/bsd-license.html
|
||||
| - Encoding was converted to UTF-8.
|
||||
| - This notice was added.
|
||||
|
||||
| A Spanish stop word list. Comments begin with vertical bar. Each stop
|
||||
| word is at the start of a line.
|
||||
|
||||
|
||||
| The following is a ranked list (commonest to rarest) of stopwords
|
||||
| deriving from a large sample of text.
|
||||
|
||||
| Extra words have been added at the end.
|
||||
|
||||
de | from, of
|
||||
la | the, her
|
||||
que | who, that
|
||||
el | the
|
||||
en | in
|
||||
y | and
|
||||
a | to
|
||||
los | the, them
|
||||
del | de + el
|
||||
se | himself, from him etc
|
||||
las | the, them
|
||||
por | for, by, etc
|
||||
un | a
|
||||
para | for
|
||||
con | with
|
||||
no | no
|
||||
una | a
|
||||
su | his, her
|
||||
al | a + el
|
||||
| es from SER
|
||||
lo | him
|
||||
como | how
|
||||
más | more
|
||||
pero | but
|
||||
sus | su plural
|
||||
le | to him, her
|
||||
ya | already
|
||||
o | or
|
||||
| fue from SER
|
||||
este | this
|
||||
| ha from HABER
|
||||
sí | himself etc
|
||||
porque | because
|
||||
esta | this
|
||||
| son from SER
|
||||
entre | between
|
||||
| está from ESTAR
|
||||
cuando | when
|
||||
muy | very
|
||||
sin | without
|
||||
sobre | on
|
||||
| ser from SER
|
||||
| tiene from TENER
|
||||
también | also
|
||||
me | me
|
||||
hasta | until
|
||||
hay | there is/are
|
||||
donde | where
|
||||
| han from HABER
|
||||
quien | whom, that
|
||||
| están from ESTAR
|
||||
| estado from ESTAR
|
||||
desde | from
|
||||
todo | all
|
||||
nos | us
|
||||
durante | during
|
||||
| estados from ESTAR
|
||||
todos | all
|
||||
uno | a
|
||||
les | to them
|
||||
ni | nor
|
||||
contra | against
|
||||
otros | other
|
||||
| fueron from SER
|
||||
ese | that
|
||||
eso | that
|
||||
| había from HABER
|
||||
ante | before
|
||||
ellos | they
|
||||
e | and (variant of y)
|
||||
esto | this
|
||||
mí | me
|
||||
antes | before
|
||||
algunos | some
|
||||
qué | what?
|
||||
unos | a
|
||||
yo | I
|
||||
otro | other
|
||||
otras | other
|
||||
otra | other
|
||||
él | he
|
||||
tanto | so much, many
|
||||
esa | that
|
||||
estos | these
|
||||
mucho | much, many
|
||||
quienes | who
|
||||
nada | nothing
|
||||
muchos | many
|
||||
cual | who
|
||||
| sea from SER
|
||||
poco | few
|
||||
ella | she
|
||||
estar | to be
|
||||
| haber from HABER
|
||||
estas | these
|
||||
| estaba from ESTAR
|
||||
| estamos from ESTAR
|
||||
algunas | some
|
||||
algo | something
|
||||
nosotros | we
|
||||
|
||||
| other forms
|
||||
|
||||
mi | my
|
||||
mis | mi plural
|
||||
tú | thou
|
||||
te | thee
|
||||
ti | thee
|
||||
tu | thy
|
||||
tus | tu plural
|
||||
ellas | they
|
||||
nosotras | we
|
||||
vosotros | you
|
||||
vosotras | you
|
||||
os | you
|
||||
mío | mine
|
||||
mía |
|
||||
míos |
|
||||
mías |
|
||||
tuyo | thine
|
||||
tuya |
|
||||
tuyos |
|
||||
tuyas |
|
||||
suyo | his, hers, theirs
|
||||
suya |
|
||||
suyos |
|
||||
suyas |
|
||||
nuestro | ours
|
||||
nuestra |
|
||||
nuestros |
|
||||
nuestras |
|
||||
vuestro | yours
|
||||
vuestra |
|
||||
vuestros |
|
||||
vuestras |
|
||||
esos | those
|
||||
esas | those
|
||||
|
||||
| forms of estar, to be (not including the infinitive):
|
||||
estoy
|
||||
estás
|
||||
está
|
||||
estamos
|
||||
estáis
|
||||
están
|
||||
esté
|
||||
estés
|
||||
estemos
|
||||
estéis
|
||||
estén
|
||||
estaré
|
||||
estarás
|
||||
estará
|
||||
estaremos
|
||||
estaréis
|
||||
estarán
|
||||
estaría
|
||||
estarías
|
||||
estaríamos
|
||||
estaríais
|
||||
estarían
|
||||
estaba
|
||||
estabas
|
||||
estábamos
|
||||
estabais
|
||||
estaban
|
||||
estuve
|
||||
estuviste
|
||||
estuvo
|
||||
estuvimos
|
||||
estuvisteis
|
||||
estuvieron
|
||||
estuviera
|
||||
estuvieras
|
||||
estuviéramos
|
||||
estuvierais
|
||||
estuvieran
|
||||
estuviese
|
||||
estuvieses
|
||||
estuviésemos
|
||||
estuvieseis
|
||||
estuviesen
|
||||
estando
|
||||
estado
|
||||
estada
|
||||
estados
|
||||
estadas
|
||||
estad
|
||||
|
||||
| forms of haber, to have (not including the infinitive):
|
||||
he
|
||||
has
|
||||
ha
|
||||
hemos
|
||||
habéis
|
||||
han
|
||||
haya
|
||||
hayas
|
||||
hayamos
|
||||
hayáis
|
||||
hayan
|
||||
habré
|
||||
habrás
|
||||
habrá
|
||||
habremos
|
||||
habréis
|
||||
habrán
|
||||
habría
|
||||
habrías
|
||||
habríamos
|
||||
habríais
|
||||
habrían
|
||||
había
|
||||
habías
|
||||
habíamos
|
||||
habíais
|
||||
habían
|
||||
hube
|
||||
hubiste
|
||||
hubo
|
||||
hubimos
|
||||
hubisteis
|
||||
hubieron
|
||||
hubiera
|
||||
hubieras
|
||||
hubiéramos
|
||||
hubierais
|
||||
hubieran
|
||||
hubiese
|
||||
hubieses
|
||||
hubiésemos
|
||||
hubieseis
|
||||
hubiesen
|
||||
habiendo
|
||||
habido
|
||||
habida
|
||||
habidos
|
||||
habidas
|
||||
|
||||
| forms of ser, to be (not including the infinitive):
|
||||
soy
|
||||
eres
|
||||
es
|
||||
somos
|
||||
sois
|
||||
son
|
||||
sea
|
||||
seas
|
||||
seamos
|
||||
seáis
|
||||
sean
|
||||
seré
|
||||
serás
|
||||
será
|
||||
seremos
|
||||
seréis
|
||||
serán
|
||||
sería
|
||||
serías
|
||||
seríamos
|
||||
seríais
|
||||
serían
|
||||
era
|
||||
eras
|
||||
éramos
|
||||
erais
|
||||
eran
|
||||
fui
|
||||
fuiste
|
||||
fue
|
||||
fuimos
|
||||
fuisteis
|
||||
fueron
|
||||
fuera
|
||||
fueras
|
||||
fuéramos
|
||||
fuerais
|
||||
fueran
|
||||
fuese
|
||||
fueses
|
||||
fuésemos
|
||||
fueseis
|
||||
fuesen
|
||||
siendo
|
||||
sido
|
||||
| sed also means 'thirst'
|
||||
|
||||
| forms of tener, to have (not including the infinitive):
|
||||
tengo
|
||||
tienes
|
||||
tiene
|
||||
tenemos
|
||||
tenéis
|
||||
tienen
|
||||
tenga
|
||||
tengas
|
||||
tengamos
|
||||
tengáis
|
||||
tengan
|
||||
tendré
|
||||
tendrás
|
||||
tendrá
|
||||
tendremos
|
||||
tendréis
|
||||
tendrán
|
||||
tendría
|
||||
tendrías
|
||||
tendríamos
|
||||
tendríais
|
||||
tendrían
|
||||
tenía
|
||||
tenías
|
||||
teníamos
|
||||
teníais
|
||||
tenían
|
||||
tuve
|
||||
tuviste
|
||||
tuvo
|
||||
tuvimos
|
||||
tuvisteis
|
||||
tuvieron
|
||||
tuviera
|
||||
tuvieras
|
||||
tuviéramos
|
||||
tuvierais
|
||||
tuvieran
|
||||
tuviese
|
||||
tuvieses
|
||||
tuviésemos
|
||||
tuvieseis
|
||||
tuviesen
|
||||
teniendo
|
||||
tenido
|
||||
tenida
|
||||
tenidos
|
||||
tenidas
|
||||
tened
|
||||
|
|
@ -0,0 +1,131 @@
|
|||
| From svn.tartarus.org/snowball/trunk/website/algorithms/swedish/stop.txt
|
||||
| This file is distributed under the BSD License.
|
||||
| See http://snowball.tartarus.org/license.php
|
||||
| Also see http://www.opensource.org/licenses/bsd-license.html
|
||||
| - Encoding was converted to UTF-8.
|
||||
| - This notice was added.
|
||||
|
||||
| A Swedish stop word list. Comments begin with vertical bar. Each stop
|
||||
| word is at the start of a line.
|
||||
|
||||
| This is a ranked list (commonest to rarest) of stopwords derived from
|
||||
| a large text sample.
|
||||
|
||||
| Swedish stop words occasionally exhibit homonym clashes. For example
|
||||
| så = so, but also seed. These are indicated clearly below.
|
||||
|
||||
och | and
|
||||
det | it, this/that
|
||||
att | to (with infinitive)
|
||||
i | in, at
|
||||
en | a
|
||||
jag | I
|
||||
hon | she
|
||||
som | who, that
|
||||
han | he
|
||||
på | on
|
||||
den | it, this/that
|
||||
med | with
|
||||
var | where, each
|
||||
sig | him(self) etc
|
||||
för | for
|
||||
så | so (also: seed)
|
||||
till | to
|
||||
är | is
|
||||
men | but
|
||||
ett | a
|
||||
om | if; around, about
|
||||
hade | had
|
||||
de | they, these/those
|
||||
av | of
|
||||
icke | not, no
|
||||
mig | me
|
||||
du | you
|
||||
henne | her
|
||||
då | then, when
|
||||
sin | his
|
||||
nu | now
|
||||
har | have
|
||||
inte | inte någon = no one
|
||||
hans | his
|
||||
honom | him
|
||||
skulle | 'sake'
|
||||
hennes | her
|
||||
där | there
|
||||
min | my
|
||||
man | one (pronoun)
|
||||
ej | nor
|
||||
vid | at, by, on (also: vast)
|
||||
kunde | could
|
||||
något | some etc
|
||||
från | from, off
|
||||
ut | out
|
||||
när | when
|
||||
efter | after, behind
|
||||
upp | up
|
||||
vi | we
|
||||
dem | them
|
||||
vara | be
|
||||
vad | what
|
||||
över | over
|
||||
än | than
|
||||
dig | you
|
||||
kan | can
|
||||
sina | his
|
||||
här | here
|
||||
ha | have
|
||||
mot | towards
|
||||
alla | all
|
||||
under | under (also: wonder)
|
||||
någon | some etc
|
||||
eller | or (else)
|
||||
allt | all
|
||||
mycket | much
|
||||
sedan | since
|
||||
ju | why
|
||||
denna | this/that
|
||||
själv | myself, yourself etc
|
||||
detta | this/that
|
||||
åt | to
|
||||
utan | without
|
||||
varit | was
|
||||
hur | how
|
||||
ingen | no
|
||||
mitt | my
|
||||
ni | you
|
||||
bli | to be, become
|
||||
blev | from bli
|
||||
oss | us
|
||||
din | thy
|
||||
dessa | these/those
|
||||
några | some etc
|
||||
deras | their
|
||||
blir | from bli
|
||||
mina | my
|
||||
samma | (the) same
|
||||
vilken | who, that
|
||||
er | you, your
|
||||
sådan | such a
|
||||
vår | our
|
||||
blivit | from bli
|
||||
dess | its
|
||||
inom | within
|
||||
mellan | between
|
||||
sådant | such a
|
||||
varför | why
|
||||
varje | each
|
||||
vilka | who, that
|
||||
ditt | thy
|
||||
vem | who
|
||||
vilket | who, that
|
||||
sitta | his
|
||||
sådana | such a
|
||||
vart | each
|
||||
dina | thy
|
||||
vars | whose
|
||||
vårt | our
|
||||
våra | our
|
||||
ert | your
|
||||
era | your
|
||||
vilkas | whose
|
||||
|
|
@ -0,0 +1,144 @@
|
|||
package org.apache.lucene.analysis.snowball;
|
||||
|
||||
/**
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
|
||||
import org.apache.lucene.analysis.Analyzer;
|
||||
import org.apache.lucene.index.Payload;
|
||||
import org.apache.lucene.analysis.TokenStream;
|
||||
import org.apache.lucene.analysis.standard.StandardAnalyzer;
|
||||
import org.apache.lucene.analysis.tokenattributes.FlagsAttribute;
|
||||
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
|
||||
import org.apache.lucene.analysis.tokenattributes.PayloadAttribute;
|
||||
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
|
||||
import org.apache.lucene.analysis.tokenattributes.TermAttribute;
|
||||
import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
|
||||
import org.apache.lucene.util.Version;
|
||||
|
||||
public class TestSnowball extends BaseTokenStreamTestCase {
|
||||
|
||||
public void testEnglish() throws Exception {
|
||||
Analyzer a = new SnowballAnalyzer(Version.LUCENE_CURRENT, "English");
|
||||
assertAnalyzesTo(a, "he abhorred accents",
|
||||
new String[]{"he", "abhor", "accent"});
|
||||
}
|
||||
|
||||
public void testStopwords() throws Exception {
|
||||
Analyzer a = new SnowballAnalyzer(Version.LUCENE_CURRENT, "English",
|
||||
StandardAnalyzer.STOP_WORDS_SET);
|
||||
assertAnalyzesTo(a, "the quick brown fox jumped",
|
||||
new String[]{"quick", "brown", "fox", "jump"});
|
||||
}
|
||||
|
||||
/**
|
||||
* Test english lowercasing. Test both cases (pre-3.1 and post-3.1) to ensure
|
||||
* we lowercase I correct for non-Turkish languages in either case.
|
||||
*/
|
||||
public void testEnglishLowerCase() throws Exception {
|
||||
Analyzer a = new SnowballAnalyzer(Version.LUCENE_CURRENT, "English");
|
||||
assertAnalyzesTo(a, "cryogenic", new String[] { "cryogen" });
|
||||
assertAnalyzesTo(a, "CRYOGENIC", new String[] { "cryogen" });
|
||||
|
||||
Analyzer b = new SnowballAnalyzer(Version.LUCENE_30, "English");
|
||||
assertAnalyzesTo(b, "cryogenic", new String[] { "cryogen" });
|
||||
assertAnalyzesTo(b, "CRYOGENIC", new String[] { "cryogen" });
|
||||
}
|
||||
|
||||
/**
|
||||
* Test turkish lowercasing
|
||||
*/
|
||||
public void testTurkish() throws Exception {
|
||||
Analyzer a = new SnowballAnalyzer(Version.LUCENE_CURRENT, "Turkish");
|
||||
|
||||
assertAnalyzesTo(a, "ağacı", new String[] { "ağaç" });
|
||||
assertAnalyzesTo(a, "AĞACI", new String[] { "ağaç" });
|
||||
}
|
||||
|
||||
/**
|
||||
* Test turkish lowercasing (old buggy behavior)
|
||||
* @deprecated Remove this when support for 3.0 indexes is no longer required
|
||||
*/
|
||||
public void testTurkishBWComp() throws Exception {
|
||||
Analyzer a = new SnowballAnalyzer(Version.LUCENE_30, "Turkish");
|
||||
// AĞACI in turkish lowercases to ağacı, but with lowercase filter ağaci.
|
||||
// this fails due to wrong casing, because the stemmer
|
||||
// will only remove -ı, not -i
|
||||
assertAnalyzesTo(a, "ağacı", new String[] { "ağaç" });
|
||||
assertAnalyzesTo(a, "AĞACI", new String[] { "ağaci" });
|
||||
}
|
||||
|
||||
|
||||
public void testReusableTokenStream() throws Exception {
|
||||
Analyzer a = new SnowballAnalyzer(Version.LUCENE_CURRENT, "English");
|
||||
assertAnalyzesToReuse(a, "he abhorred accents",
|
||||
new String[]{"he", "abhor", "accent"});
|
||||
assertAnalyzesToReuse(a, "she abhorred him",
|
||||
new String[]{"she", "abhor", "him"});
|
||||
}
|
||||
|
||||
public void testFilterTokens() throws Exception {
|
||||
SnowballFilter filter = new SnowballFilter(new TestTokenStream(), "English");
|
||||
TermAttribute termAtt = filter.getAttribute(TermAttribute.class);
|
||||
OffsetAttribute offsetAtt = filter.getAttribute(OffsetAttribute.class);
|
||||
TypeAttribute typeAtt = filter.getAttribute(TypeAttribute.class);
|
||||
PayloadAttribute payloadAtt = filter.getAttribute(PayloadAttribute.class);
|
||||
PositionIncrementAttribute posIncAtt = filter.getAttribute(PositionIncrementAttribute.class);
|
||||
FlagsAttribute flagsAtt = filter.getAttribute(FlagsAttribute.class);
|
||||
|
||||
filter.incrementToken();
|
||||
|
||||
assertEquals("accent", termAtt.term());
|
||||
assertEquals(2, offsetAtt.startOffset());
|
||||
assertEquals(7, offsetAtt.endOffset());
|
||||
assertEquals("wrd", typeAtt.type());
|
||||
assertEquals(3, posIncAtt.getPositionIncrement());
|
||||
assertEquals(77, flagsAtt.getFlags());
|
||||
assertEquals(new Payload(new byte[]{0,1,2,3}), payloadAtt.getPayload());
|
||||
}
|
||||
|
||||
private final class TestTokenStream extends TokenStream {
|
||||
private TermAttribute termAtt;
|
||||
private OffsetAttribute offsetAtt;
|
||||
private TypeAttribute typeAtt;
|
||||
private PayloadAttribute payloadAtt;
|
||||
private PositionIncrementAttribute posIncAtt;
|
||||
private FlagsAttribute flagsAtt;
|
||||
|
||||
TestTokenStream() {
|
||||
super();
|
||||
termAtt = addAttribute(TermAttribute.class);
|
||||
offsetAtt = addAttribute(OffsetAttribute.class);
|
||||
typeAtt = addAttribute(TypeAttribute.class);
|
||||
payloadAtt = addAttribute(PayloadAttribute.class);
|
||||
posIncAtt = addAttribute(PositionIncrementAttribute.class);
|
||||
flagsAtt = addAttribute(FlagsAttribute.class);
|
||||
}
|
||||
|
||||
@Override
|
||||
public boolean incrementToken() {
|
||||
clearAttributes();
|
||||
termAtt.setTermBuffer("accents");
|
||||
offsetAtt.setOffset(2, 7);
|
||||
typeAtt.setType("wrd");
|
||||
posIncAtt.setPositionIncrement(3);
|
||||
payloadAtt.setPayload(new Payload(new byte[]{0,1,2,3}));
|
||||
flagsAtt.setFlags(77);
|
||||
return true;
|
||||
}
|
||||
}
|
||||
}
|
|
@ -0,0 +1,102 @@
|
|||
package org.apache.lucene.analysis.snowball;
|
||||
|
||||
/**
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
import java.io.BufferedReader;
|
||||
import java.io.File;
|
||||
import java.io.FileInputStream;
|
||||
import java.io.IOException;
|
||||
import java.io.InputStream;
|
||||
import java.io.InputStreamReader;
|
||||
import java.io.StringReader;
|
||||
|
||||
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
|
||||
import org.apache.lucene.analysis.KeywordTokenizer;
|
||||
import org.apache.lucene.analysis.TokenStream;
|
||||
import org.apache.lucene.analysis.Tokenizer;
|
||||
|
||||
/**
|
||||
* Test the snowball filters against the snowball data tests
|
||||
*/
|
||||
public class TestSnowballVocab extends BaseTokenStreamTestCase {
|
||||
private Tokenizer tokenizer = new KeywordTokenizer(new StringReader(""));
|
||||
static final File dataDir = new File(System.getProperty("dataDir", "./bin"));
|
||||
static final File dataRoot = new File(dataDir,
|
||||
"org/apache/lucene/analysis/snowball/data");
|
||||
|
||||
/**
|
||||
* Run all languages against their snowball vocabulary tests.
|
||||
*/
|
||||
public void testStemmers() throws IOException {
|
||||
if (!dataRoot.exists()) {
|
||||
System.err.println("WARN: This test was disabled, as the svn checkout of snowball test files is not supported on your system!");
|
||||
return;
|
||||
}
|
||||
assertCorrectOutput("Danish", "danish");
|
||||
assertCorrectOutput("Dutch", "dutch");
|
||||
assertCorrectOutput("English", "english");
|
||||
// disabled due to snowball java code generation bug:
|
||||
// see http://article.gmane.org/gmane.comp.search.snowball/1139
|
||||
// assertCorrectOutput("Finnish", "finnish");
|
||||
assertCorrectOutput("French", "french");
|
||||
assertCorrectOutput("German", "german");
|
||||
assertCorrectOutput("German2", "german2");
|
||||
assertCorrectOutput("Hungarian", "hungarian");
|
||||
assertCorrectOutput("Italian", "italian");
|
||||
assertCorrectOutput("Kp", "kraaij_pohlmann");
|
||||
// disabled due to snowball java code generation bug:
|
||||
// see http://article.gmane.org/gmane.comp.search.snowball/1139
|
||||
// assertCorrectOutput("Lovins", "lovins");
|
||||
assertCorrectOutput("Norwegian", "norwegian");
|
||||
assertCorrectOutput("Porter", "porter");
|
||||
assertCorrectOutput("Portuguese", "portuguese");
|
||||
assertCorrectOutput("Romanian", "romanian");
|
||||
assertCorrectOutput("Russian", "russian");
|
||||
assertCorrectOutput("Spanish", "spanish");
|
||||
assertCorrectOutput("Swedish", "swedish");
|
||||
assertCorrectOutput("Turkish", "turkish");
|
||||
}
|
||||
|
||||
/**
|
||||
* For the supplied language, run the stemmer against all strings in voc.txt
|
||||
* The output should be the same as the string in output.txt
|
||||
*/
|
||||
private void assertCorrectOutput(String snowballLanguage, String dataDirectory)
|
||||
throws IOException {
|
||||
System.err.println("checking snowball language: " + snowballLanguage);
|
||||
TokenStream filter = new SnowballFilter(tokenizer, snowballLanguage);
|
||||
InputStream vocFile = new FileInputStream(new File(dataRoot,
|
||||
dataDirectory + "/voc.txt"));
|
||||
InputStream outputFile = new FileInputStream(new File(dataRoot,
|
||||
dataDirectory + "/output.txt"));
|
||||
BufferedReader vocReader = new BufferedReader(new InputStreamReader(
|
||||
vocFile, "UTF-8"));
|
||||
BufferedReader outputReader = new BufferedReader(new InputStreamReader(
|
||||
outputFile, "UTF-8"));
|
||||
String inputWord = null;
|
||||
while ((inputWord = vocReader.readLine()) != null) {
|
||||
String expectedWord = outputReader.readLine();
|
||||
assertNotNull(expectedWord);
|
||||
tokenizer.reset(new StringReader(inputWord));
|
||||
filter.reset();
|
||||
assertTokenStreamContents(filter, new String[] {expectedWord});
|
||||
}
|
||||
vocReader.close();
|
||||
outputReader.close();
|
||||
}
|
||||
}
|
|
@ -0,0 +1,17 @@
|
|||
<?xml version="1.0"?>
|
||||
<document>
|
||||
<properties>
|
||||
<title>Overview - Snowball Stemmers for Lucene</title>
|
||||
</properties>
|
||||
<body>
|
||||
|
||||
<section name="Snowball Stemmers for Lucene">
|
||||
<p>
|
||||
This project provides pre-compiled versions of the Snowball stemmers
|
||||
together with classes integrating them with the Lucene search engine.
|
||||
</p>
|
||||
</section>
|
||||
|
||||
</body>
|
||||
</document>
|
||||
|
|
@ -0,0 +1,32 @@
|
|||
<?xml version="1.0" encoding="ISO-8859-1"?>
|
||||
<project name="Snowball Stemmers for Lucene"
|
||||
href="http://jakarta.apache.org/lucene-sandbox/snowball/">
|
||||
|
||||
<title>Snowball Stemmers for Lucene</title>
|
||||
|
||||
<body>
|
||||
<menu name="Documentation">
|
||||
<item name="Javadoc" href="/api/index.html"/>
|
||||
</menu>
|
||||
|
||||
<menu name="Download">
|
||||
<item name="Releases"
|
||||
href="http://jakarta.apache.org/builds/jakarta-lucene-sandbox/snowball/"/>
|
||||
<item name="CVS Repository" href="/site/cvsindex.html"/>
|
||||
</menu>
|
||||
|
||||
<menu name="Links">
|
||||
<item name="Snowball Home" href="http://snowball.tartarus.org/"/>
|
||||
<item name="Lucene Home" href="http://jakarta.apache.org/lucene/"/>
|
||||
<item name="Lucene Sandbox"
|
||||
href="http://jakarta.apache.org/lucene/docs/lucene-sandbox/"/>
|
||||
</menu>
|
||||
|
||||
<menu name="Jakarta">
|
||||
<item name="Get Involved" href="/site/getinvolved.html"/>
|
||||
<item name="Acknowledgements" href="/site/acknowledgements.html"/>
|
||||
<item name="Contact" href="/site/contact.html"/>
|
||||
<item name="Legal" href="/site/legal.html"/>
|
||||
</menu>
|
||||
</body>
|
||||
</project>
|
|
@ -1,10 +1,5 @@
|
|||
package org.apache.lucene.analysis;
|
||||
|
||||
import java.io.IOException;
|
||||
|
||||
import org.apache.lucene.analysis.tokenattributes.TermAttribute;
|
||||
import org.apache.lucene.util.ArrayUtil;
|
||||
|
||||
/**
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
|
@ -22,6 +17,12 @@ import org.apache.lucene.util.ArrayUtil;
|
|||
* limitations under the License.
|
||||
*/
|
||||
|
||||
import java.io.IOException;
|
||||
|
||||
import org.apache.lucene.analysis.tokenattributes.TermAttribute;
|
||||
import org.apache.lucene.util.ArrayUtil;
|
||||
import org.apache.lucene.util.RamUsageEstimator;
|
||||
|
||||
/**
|
||||
* This class converts alphabetic, numeric, and symbolic Unicode characters
|
||||
* which are not in the first 127 ASCII characters (the "Basic Latin" Unicode
|
||||
|
@ -101,7 +102,7 @@ public final class ASCIIFoldingFilter extends TokenFilter {
|
|||
// Worst-case length required:
|
||||
final int maxSizeNeeded = 4 * length;
|
||||
if (output.length < maxSizeNeeded) {
|
||||
output = new char[ArrayUtil.getNextSize(maxSizeNeeded)];
|
||||
output = new char[ArrayUtil.oversize(maxSizeNeeded, RamUsageEstimator.NUM_BYTES_CHAR)];
|
||||
}
|
||||
|
||||
outputPos = 0;
|
||||
|
|
|
@ -29,6 +29,7 @@ import org.apache.lucene.util.ArrayUtil;
|
|||
import org.apache.lucene.util.Attribute;
|
||||
import org.apache.lucene.util.AttributeSource;
|
||||
import org.apache.lucene.util.AttributeImpl;
|
||||
import org.apache.lucene.util.RamUsageEstimator;
|
||||
|
||||
/**
|
||||
A Token is an occurrence of a term from the text of a field. It consists of
|
||||
|
@ -347,12 +348,12 @@ public class Token extends AttributeImpl
|
|||
public char[] resizeTermBuffer(int newSize) {
|
||||
if (termBuffer == null) {
|
||||
// The buffer is always at least MIN_BUFFER_SIZE
|
||||
termBuffer = new char[ArrayUtil.getNextSize(newSize < MIN_BUFFER_SIZE ? MIN_BUFFER_SIZE : newSize)];
|
||||
termBuffer = new char[ArrayUtil.oversize(newSize < MIN_BUFFER_SIZE ? MIN_BUFFER_SIZE : newSize, RamUsageEstimator.NUM_BYTES_CHAR)];
|
||||
} else {
|
||||
if(termBuffer.length < newSize){
|
||||
// Not big enough; create a new array with slight
|
||||
// over allocation and preserve content
|
||||
final char[] newCharBuffer = new char[ArrayUtil.getNextSize(newSize)];
|
||||
final char[] newCharBuffer = new char[ArrayUtil.oversize(newSize, RamUsageEstimator.NUM_BYTES_CHAR)];
|
||||
System.arraycopy(termBuffer, 0, newCharBuffer, 0, termBuffer.length);
|
||||
termBuffer = newCharBuffer;
|
||||
}
|
||||
|
@ -367,19 +368,19 @@ public class Token extends AttributeImpl
|
|||
private void growTermBuffer(int newSize) {
|
||||
if (termBuffer == null) {
|
||||
// The buffer is always at least MIN_BUFFER_SIZE
|
||||
termBuffer = new char[ArrayUtil.getNextSize(newSize < MIN_BUFFER_SIZE ? MIN_BUFFER_SIZE : newSize)];
|
||||
termBuffer = new char[ArrayUtil.oversize(newSize < MIN_BUFFER_SIZE ? MIN_BUFFER_SIZE : newSize, RamUsageEstimator.NUM_BYTES_CHAR)];
|
||||
} else {
|
||||
if(termBuffer.length < newSize){
|
||||
// Not big enough; create a new array with slight
|
||||
// over allocation:
|
||||
termBuffer = new char[ArrayUtil.getNextSize(newSize)];
|
||||
termBuffer = new char[ArrayUtil.oversize(newSize, RamUsageEstimator.NUM_BYTES_CHAR)];
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
private void initTermBuffer() {
|
||||
if (termBuffer == null) {
|
||||
termBuffer = new char[ArrayUtil.getNextSize(MIN_BUFFER_SIZE)];
|
||||
termBuffer = new char[ArrayUtil.oversize(MIN_BUFFER_SIZE, RamUsageEstimator.NUM_BYTES_CHAR)];
|
||||
termLength = 0;
|
||||
}
|
||||
}
|
||||
|
|
|
@ -21,6 +21,7 @@ import java.io.Serializable;
|
|||
|
||||
import org.apache.lucene.util.ArrayUtil;
|
||||
import org.apache.lucene.util.AttributeImpl;
|
||||
import org.apache.lucene.util.RamUsageEstimator;
|
||||
|
||||
/**
|
||||
* The term text of a Token.
|
||||
|
@ -106,12 +107,12 @@ public class TermAttributeImpl extends AttributeImpl implements TermAttribute, C
|
|||
public char[] resizeTermBuffer(int newSize) {
|
||||
if (termBuffer == null) {
|
||||
// The buffer is always at least MIN_BUFFER_SIZE
|
||||
termBuffer = new char[ArrayUtil.getNextSize(newSize < MIN_BUFFER_SIZE ? MIN_BUFFER_SIZE : newSize)];
|
||||
termBuffer = new char[ArrayUtil.oversize(newSize < MIN_BUFFER_SIZE ? MIN_BUFFER_SIZE : newSize, RamUsageEstimator.NUM_BYTES_CHAR)];
|
||||
} else {
|
||||
if(termBuffer.length < newSize){
|
||||
// Not big enough; create a new array with slight
|
||||
// over allocation and preserve content
|
||||
final char[] newCharBuffer = new char[ArrayUtil.getNextSize(newSize)];
|
||||
final char[] newCharBuffer = new char[ArrayUtil.oversize(newSize, RamUsageEstimator.NUM_BYTES_CHAR)];
|
||||
System.arraycopy(termBuffer, 0, newCharBuffer, 0, termBuffer.length);
|
||||
termBuffer = newCharBuffer;
|
||||
}
|
||||
|
@ -127,19 +128,19 @@ public class TermAttributeImpl extends AttributeImpl implements TermAttribute, C
|
|||
private void growTermBuffer(int newSize) {
|
||||
if (termBuffer == null) {
|
||||
// The buffer is always at least MIN_BUFFER_SIZE
|
||||
termBuffer = new char[ArrayUtil.getNextSize(newSize < MIN_BUFFER_SIZE ? MIN_BUFFER_SIZE : newSize)];
|
||||
termBuffer = new char[ArrayUtil.oversize(newSize < MIN_BUFFER_SIZE ? MIN_BUFFER_SIZE : newSize, RamUsageEstimator.NUM_BYTES_CHAR)];
|
||||
} else {
|
||||
if(termBuffer.length < newSize){
|
||||
// Not big enough; create a new array with slight
|
||||
// over allocation:
|
||||
termBuffer = new char[ArrayUtil.getNextSize(newSize)];
|
||||
termBuffer = new char[ArrayUtil.oversize(newSize, RamUsageEstimator.NUM_BYTES_CHAR)];
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
private void initTermBuffer() {
|
||||
if (termBuffer == null) {
|
||||
termBuffer = new char[ArrayUtil.getNextSize(MIN_BUFFER_SIZE)];
|
||||
termBuffer = new char[ArrayUtil.oversize(MIN_BUFFER_SIZE, RamUsageEstimator.NUM_BYTES_CHAR)];
|
||||
termLength = 0;
|
||||
}
|
||||
}
|
||||
|
|
|
@ -25,6 +25,7 @@ import java.util.HashSet;
|
|||
import java.io.IOException;
|
||||
|
||||
import org.apache.lucene.util.ArrayUtil;
|
||||
import org.apache.lucene.util.RamUsageEstimator;
|
||||
|
||||
/** This is just a "splitter" class: it lets you wrap two
|
||||
* DocFieldConsumer instances as a single consumer. */
|
||||
|
@ -117,7 +118,7 @@ final class DocFieldConsumers extends DocFieldConsumer {
|
|||
// enough space to recycle all outstanding PerDoc
|
||||
// instances
|
||||
assert allocCount == 1+docFreeList.length;
|
||||
docFreeList = new PerDoc[ArrayUtil.getNextSize(allocCount)];
|
||||
docFreeList = new PerDoc[ArrayUtil.oversize(allocCount, RamUsageEstimator.NUM_BYTES_OBJECT_REF)];
|
||||
}
|
||||
return new PerDoc();
|
||||
} else
|
||||
|
|
|
@ -24,6 +24,7 @@ import java.io.IOException;
|
|||
import org.apache.lucene.document.Document;
|
||||
import org.apache.lucene.document.Fieldable;
|
||||
import org.apache.lucene.util.ArrayUtil;
|
||||
import org.apache.lucene.util.RamUsageEstimator;
|
||||
|
||||
/**
|
||||
* Gathers all Fieldables for a document under the same
|
||||
|
@ -340,7 +341,7 @@ final class DocFieldProcessorPerThread extends DocConsumerPerThread {
|
|||
// enough space to recycle all outstanding PerDoc
|
||||
// instances
|
||||
assert allocCount == 1+docFreeList.length;
|
||||
docFreeList = new PerDoc[ArrayUtil.getNextSize(allocCount)];
|
||||
docFreeList = new PerDoc[ArrayUtil.oversize(allocCount, RamUsageEstimator.NUM_BYTES_OBJECT_REF)];
|
||||
}
|
||||
return new PerDoc();
|
||||
} else
|
||||
|
|
|
@ -40,6 +40,7 @@ import org.apache.lucene.store.Directory;
|
|||
import org.apache.lucene.util.ArrayUtil;
|
||||
import org.apache.lucene.util.Constants;
|
||||
import org.apache.lucene.util.ThreadInterruptedException;
|
||||
import org.apache.lucene.util.RamUsageEstimator;
|
||||
|
||||
/**
|
||||
* This class accepts multiple added documents and directly
|
||||
|
@ -1503,7 +1504,7 @@ final class DocumentsWriter {
|
|||
int gap = doc.docID - nextWriteDocID;
|
||||
if (gap >= waiting.length) {
|
||||
// Grow queue
|
||||
DocWriter[] newArray = new DocWriter[ArrayUtil.getNextSize(gap)];
|
||||
DocWriter[] newArray = new DocWriter[ArrayUtil.oversize(gap, RamUsageEstimator.NUM_BYTES_OBJECT_REF)];
|
||||
assert nextWriteLoc >= 0;
|
||||
System.arraycopy(waiting, nextWriteLoc, newArray, 0, waiting.length-nextWriteLoc);
|
||||
System.arraycopy(waiting, 0, newArray, waiting.length-nextWriteLoc, nextWriteLoc);
|
||||
|
|
|
@ -20,6 +20,7 @@ package org.apache.lucene.index;
|
|||
import java.io.IOException;
|
||||
|
||||
import org.apache.lucene.util.ArrayUtil;
|
||||
import org.apache.lucene.util.RamUsageEstimator;
|
||||
|
||||
/**
|
||||
* NOTE: this API is experimental and will likely change
|
||||
|
@ -35,7 +36,7 @@ abstract class FormatPostingsTermsConsumer {
|
|||
FormatPostingsDocsConsumer addTerm(String text) throws IOException {
|
||||
final int len = text.length();
|
||||
if (termBuffer == null || termBuffer.length < 1+len)
|
||||
termBuffer = new char[ArrayUtil.getNextSize(1+len)];
|
||||
termBuffer = new char[ArrayUtil.oversize(1+len, RamUsageEstimator.NUM_BYTES_CHAR)];
|
||||
text.getChars(0, len, termBuffer, 0);
|
||||
termBuffer[len] = 0xffff;
|
||||
return addTerm(termBuffer, 0);
|
||||
|
|
|
@ -63,11 +63,13 @@ final class NormsWriterPerField extends InvertedDocEndConsumerPerField implement
|
|||
|
||||
@Override
|
||||
void finish() {
|
||||
assert docIDs.length == norms.length;
|
||||
if (fieldInfo.isIndexed && !fieldInfo.omitNorms) {
|
||||
if (docIDs.length <= upto) {
|
||||
assert docIDs.length == upto;
|
||||
docIDs = ArrayUtil.grow(docIDs, 1+upto);
|
||||
}
|
||||
if (norms.length <= upto) {
|
||||
assert norms.length == upto;
|
||||
norms = ArrayUtil.grow(norms, 1+upto);
|
||||
}
|
||||
final float norm = docState.similarity.computeNorm(fieldInfo.name, fieldState);
|
||||
|
|
|
@ -20,6 +20,7 @@ package org.apache.lucene.index;
|
|||
import java.io.IOException;
|
||||
import org.apache.lucene.store.RAMOutputStream;
|
||||
import org.apache.lucene.util.ArrayUtil;
|
||||
import org.apache.lucene.util.RamUsageEstimator;
|
||||
|
||||
/** This is a DocFieldConsumer that writes stored fields. */
|
||||
final class StoredFieldsWriter {
|
||||
|
@ -108,7 +109,7 @@ final class StoredFieldsWriter {
|
|||
// enough space to recycle all outstanding PerDoc
|
||||
// instances
|
||||
assert allocCount == 1+docFreeList.length;
|
||||
docFreeList = new PerDoc[ArrayUtil.getNextSize(allocCount)];
|
||||
docFreeList = new PerDoc[ArrayUtil.oversize(allocCount, RamUsageEstimator.NUM_BYTES_OBJECT_REF)];
|
||||
}
|
||||
return new PerDoc();
|
||||
} else
|
||||
|
|
|
@ -20,6 +20,7 @@ package org.apache.lucene.index;
|
|||
import org.apache.lucene.store.IndexOutput;
|
||||
import org.apache.lucene.store.RAMOutputStream;
|
||||
import org.apache.lucene.util.ArrayUtil;
|
||||
import org.apache.lucene.util.RamUsageEstimator;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.util.Collection;
|
||||
|
@ -117,7 +118,7 @@ final class TermVectorsTermsWriter extends TermsHashConsumer {
|
|||
// enough space to recycle all outstanding PerDoc
|
||||
// instances
|
||||
assert allocCount == 1+docFreeList.length;
|
||||
docFreeList = new PerDoc[ArrayUtil.getNextSize(allocCount)];
|
||||
docFreeList = new PerDoc[ArrayUtil.oversize(allocCount, RamUsageEstimator.NUM_BYTES_OBJECT_REF)];
|
||||
}
|
||||
return new PerDoc();
|
||||
} else
|
||||
|
@ -266,6 +267,8 @@ final class TermVectorsTermsWriter extends TermsHashConsumer {
|
|||
void addField(final int fieldNumber) {
|
||||
if (numVectorFields == fieldNumbers.length) {
|
||||
fieldNumbers = ArrayUtil.grow(fieldNumbers);
|
||||
}
|
||||
if (numVectorFields == fieldPointers.length) {
|
||||
fieldPointers = ArrayUtil.grow(fieldPointers);
|
||||
}
|
||||
fieldNumbers[numVectorFields] = fieldNumber;
|
||||
|
|
|
@ -26,6 +26,7 @@ import java.util.Arrays;
|
|||
import java.io.IOException;
|
||||
|
||||
import org.apache.lucene.util.ArrayUtil;
|
||||
import org.apache.lucene.util.RamUsageEstimator;
|
||||
|
||||
/** This class implements {@link InvertedDocConsumer}, which
|
||||
* is passed each token produced by the analyzer on each
|
||||
|
@ -89,7 +90,7 @@ final class TermsHash extends InvertedDocConsumer {
|
|||
|
||||
assert postingsFreeCount == postingsAllocCount: Thread.currentThread().getName() + ": postingsFreeCount=" + postingsFreeCount + " postingsAllocCount=" + postingsAllocCount + " consumer=" + consumer;
|
||||
|
||||
final int newSize = ArrayUtil.getShrinkSize(postingsFreeList.length, postingsAllocCount);
|
||||
final int newSize = ArrayUtil.getShrinkSize(postingsFreeList.length, postingsAllocCount, RamUsageEstimator.NUM_BYTES_OBJECT_REF);
|
||||
if (newSize != postingsFreeList.length) {
|
||||
RawPostingList[] newArray = new RawPostingList[newSize];
|
||||
System.arraycopy(postingsFreeList, 0, newArray, 0, postingsFreeCount);
|
||||
|
@ -222,7 +223,7 @@ final class TermsHash extends InvertedDocConsumer {
|
|||
if (newPostingsAllocCount > postingsFreeList.length)
|
||||
// Pre-allocate the postingsFreeList so it's large
|
||||
// enough to hold all postings we've given out
|
||||
postingsFreeList = new RawPostingList[ArrayUtil.getNextSize(newPostingsAllocCount)];
|
||||
postingsFreeList = new RawPostingList[ArrayUtil.oversize(newPostingsAllocCount, RamUsageEstimator.NUM_BYTES_OBJECT_REF)];
|
||||
}
|
||||
|
||||
postingsFreeCount -= numToCopy;
|
||||
|
|
|
@ -7,9 +7,9 @@ package org.apache.lucene.util;
|
|||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
* <p/>
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
* <p/>
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
|
@ -122,20 +122,95 @@ public final class ArrayUtil {
|
|||
END APACHE HARMONY CODE
|
||||
*/
|
||||
|
||||
/** Returns an array size >= minTargetSize, generally
|
||||
* over-allocating exponentially to achieve amortized
|
||||
* linear-time cost as the array grows.
|
||||
*
|
||||
* NOTE: this was originally borrowed from Python 2.4.2
|
||||
* listobject.c sources (attribution in LICENSE.txt), but
|
||||
* has now been substantially changed based on
|
||||
* discussions from java-dev thread with subject "Dynamic
|
||||
* array reallocation algorithms", started on Jan 12
|
||||
* 2010.
|
||||
*
|
||||
* @param minTargetSize Minimum required value to be returned.
|
||||
* @param bytesPerElement Bytes used by each element of
|
||||
* the array. See constants in {@link RamUsageEstimator}.
|
||||
*
|
||||
* @lucene.internal
|
||||
*/
|
||||
|
||||
public static int getNextSize(int targetSize) {
|
||||
/* This over-allocates proportional to the list size, making room
|
||||
* for additional growth. The over-allocation is mild, but is
|
||||
* enough to give linear-time amortized behavior over a long
|
||||
* sequence of appends() in the presence of a poorly-performing
|
||||
* system realloc().
|
||||
* The growth pattern is: 0, 4, 8, 16, 25, 35, 46, 58, 72, 88, ...
|
||||
*/
|
||||
return (targetSize >> 3) + (targetSize < 9 ? 3 : 6) + targetSize;
|
||||
public static int oversize(int minTargetSize, int bytesPerElement) {
|
||||
|
||||
if (minTargetSize < 0) {
|
||||
// catch usage that accidentally overflows int
|
||||
throw new IllegalArgumentException("invalid array size " + minTargetSize);
|
||||
}
|
||||
|
||||
if (minTargetSize == 0) {
|
||||
// wait until at least one element is requested
|
||||
return 0;
|
||||
}
|
||||
|
||||
// asymptotic exponential growth by 1/8th, favors
|
||||
// spending a bit more CPU to not tye up too much wasted
|
||||
// RAM:
|
||||
int extra = minTargetSize >> 3;
|
||||
|
||||
if (extra < 3) {
|
||||
// for very small arrays, where constant overhead of
|
||||
// realloc is presumably relatively high, we grow
|
||||
// faster
|
||||
extra = 3;
|
||||
}
|
||||
|
||||
int newSize = minTargetSize + extra;
|
||||
|
||||
// add 7 to allow for worst case byte alignment addition below:
|
||||
if (newSize+7 < 0) {
|
||||
// int overflowed -- return max allowed array size
|
||||
return Integer.MAX_VALUE;
|
||||
}
|
||||
|
||||
if (Constants.JRE_IS_64BIT) {
|
||||
// round up to 8 byte alignment in 64bit env
|
||||
switch(bytesPerElement) {
|
||||
case 4:
|
||||
// round up to multiple of 2
|
||||
return (newSize + 1) & 0x7ffffffe;
|
||||
case 2:
|
||||
// round up to multiple of 4
|
||||
return (newSize + 3) & 0x7ffffffc;
|
||||
case 1:
|
||||
// round up to multiple of 8
|
||||
return (newSize + 7) & 0x7ffffff8;
|
||||
case 8:
|
||||
// no rounding
|
||||
default:
|
||||
// odd (invalid?) size
|
||||
return newSize;
|
||||
}
|
||||
} else {
|
||||
// round up to 4 byte alignment in 32bit env
|
||||
switch(bytesPerElement) {
|
||||
case 2:
|
||||
// round up to multiple of 2
|
||||
return (newSize + 1) & 0x7ffffffe;
|
||||
case 1:
|
||||
// round up to multiple of 4
|
||||
return (newSize + 3) & 0x7ffffffc;
|
||||
case 4:
|
||||
case 8:
|
||||
// no rounding
|
||||
default:
|
||||
// odd (invalid?) size
|
||||
return newSize;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
public static int getShrinkSize(int currentSize, int targetSize) {
|
||||
final int newSize = getNextSize(targetSize);
|
||||
public static int getShrinkSize(int currentSize, int targetSize, int bytesPerElement) {
|
||||
final int newSize = oversize(targetSize, bytesPerElement);
|
||||
// Only reallocate if we are "substantially" smaller.
|
||||
// This saves us from "running hot" (constantly making a
|
||||
// bit bigger then a bit smaller, over and over):
|
||||
|
@ -147,7 +222,7 @@ public final class ArrayUtil {
|
|||
|
||||
public static int[] grow(int[] array, int minSize) {
|
||||
if (array.length < minSize) {
|
||||
int[] newArray = new int[getNextSize(minSize)];
|
||||
int[] newArray = new int[oversize(minSize, RamUsageEstimator.NUM_BYTES_INT)];
|
||||
System.arraycopy(array, 0, newArray, 0, array.length);
|
||||
return newArray;
|
||||
} else
|
||||
|
@ -159,7 +234,7 @@ public final class ArrayUtil {
|
|||
}
|
||||
|
||||
public static int[] shrink(int[] array, int targetSize) {
|
||||
final int newSize = getShrinkSize(array.length, targetSize);
|
||||
final int newSize = getShrinkSize(array.length, targetSize, RamUsageEstimator.NUM_BYTES_INT);
|
||||
if (newSize != array.length) {
|
||||
int[] newArray = new int[newSize];
|
||||
System.arraycopy(array, 0, newArray, 0, newSize);
|
||||
|
@ -170,7 +245,7 @@ public final class ArrayUtil {
|
|||
|
||||
public static long[] grow(long[] array, int minSize) {
|
||||
if (array.length < minSize) {
|
||||
long[] newArray = new long[getNextSize(minSize)];
|
||||
long[] newArray = new long[oversize(minSize, RamUsageEstimator.NUM_BYTES_LONG)];
|
||||
System.arraycopy(array, 0, newArray, 0, array.length);
|
||||
return newArray;
|
||||
} else
|
||||
|
@ -182,7 +257,7 @@ public final class ArrayUtil {
|
|||
}
|
||||
|
||||
public static long[] shrink(long[] array, int targetSize) {
|
||||
final int newSize = getShrinkSize(array.length, targetSize);
|
||||
final int newSize = getShrinkSize(array.length, targetSize, RamUsageEstimator.NUM_BYTES_LONG);
|
||||
if (newSize != array.length) {
|
||||
long[] newArray = new long[newSize];
|
||||
System.arraycopy(array, 0, newArray, 0, newSize);
|
||||
|
@ -193,7 +268,7 @@ public final class ArrayUtil {
|
|||
|
||||
public static byte[] grow(byte[] array, int minSize) {
|
||||
if (array.length < minSize) {
|
||||
byte[] newArray = new byte[getNextSize(minSize)];
|
||||
byte[] newArray = new byte[oversize(minSize, 1)];
|
||||
System.arraycopy(array, 0, newArray, 0, array.length);
|
||||
return newArray;
|
||||
} else
|
||||
|
@ -205,7 +280,7 @@ public final class ArrayUtil {
|
|||
}
|
||||
|
||||
public static byte[] shrink(byte[] array, int targetSize) {
|
||||
final int newSize = getShrinkSize(array.length, targetSize);
|
||||
final int newSize = getShrinkSize(array.length, targetSize, 1);
|
||||
if (newSize != array.length) {
|
||||
byte[] newArray = new byte[newSize];
|
||||
System.arraycopy(array, 0, newArray, 0, newSize);
|
||||
|
|
|
@ -43,6 +43,14 @@ public final class RamUsageEstimator {
|
|||
private int arraySize;
|
||||
private int classSize;
|
||||
|
||||
public final static int NUM_BYTES_OBJECT_REF = Constants.JRE_IS_64BIT ? 8 : 4;
|
||||
public final static int NUM_BYTES_CHAR = 2;
|
||||
public final static int NUM_BYTES_SHORT = 2;
|
||||
public final static int NUM_BYTES_INT = 4;
|
||||
public final static int NUM_BYTES_LONG = 8;
|
||||
public final static int NUM_BYTES_FLOAT = 4;
|
||||
public final static int NUM_BYTES_DOUBLE = 8;
|
||||
|
||||
private boolean checkInterned;
|
||||
|
||||
/**
|
||||
|
|
|
@ -85,7 +85,6 @@ public class TestToken extends LuceneTestCase {
|
|||
buf.append(buf.toString());
|
||||
}
|
||||
assertEquals(1048576, t.termLength());
|
||||
assertEquals(1179654, t.termBuffer().length);
|
||||
|
||||
// now as a string, first variant
|
||||
t = new Token();
|
||||
|
@ -99,7 +98,6 @@ public class TestToken extends LuceneTestCase {
|
|||
buf.append(content);
|
||||
}
|
||||
assertEquals(1048576, t.termLength());
|
||||
assertEquals(1179654, t.termBuffer().length);
|
||||
|
||||
// now as a string, second variant
|
||||
t = new Token();
|
||||
|
@ -113,7 +111,6 @@ public class TestToken extends LuceneTestCase {
|
|||
buf.append(content);
|
||||
}
|
||||
assertEquals(1048576, t.termLength());
|
||||
assertEquals(1179654, t.termBuffer().length);
|
||||
|
||||
// Test for slow growth to a long term
|
||||
t = new Token();
|
||||
|
@ -127,7 +124,6 @@ public class TestToken extends LuceneTestCase {
|
|||
buf.append("a");
|
||||
}
|
||||
assertEquals(20000, t.termLength());
|
||||
assertEquals(20167, t.termBuffer().length);
|
||||
|
||||
// Test for slow growth to a long term
|
||||
t = new Token();
|
||||
|
@ -141,7 +137,6 @@ public class TestToken extends LuceneTestCase {
|
|||
buf.append("a");
|
||||
}
|
||||
assertEquals(20000, t.termLength());
|
||||
assertEquals(20167, t.termBuffer().length);
|
||||
}
|
||||
|
||||
public void testToString() throws Exception {
|
||||
|
|
|
@ -49,7 +49,6 @@ public class TestTermAttributeImpl extends LuceneTestCase {
|
|||
buf.append(buf.toString());
|
||||
}
|
||||
assertEquals(1048576, t.termLength());
|
||||
assertEquals(1179654, t.termBuffer().length);
|
||||
|
||||
// now as a string, first variant
|
||||
t = new TermAttributeImpl();
|
||||
|
@ -63,7 +62,6 @@ public class TestTermAttributeImpl extends LuceneTestCase {
|
|||
buf.append(content);
|
||||
}
|
||||
assertEquals(1048576, t.termLength());
|
||||
assertEquals(1179654, t.termBuffer().length);
|
||||
|
||||
// now as a string, second variant
|
||||
t = new TermAttributeImpl();
|
||||
|
@ -77,7 +75,6 @@ public class TestTermAttributeImpl extends LuceneTestCase {
|
|||
buf.append(content);
|
||||
}
|
||||
assertEquals(1048576, t.termLength());
|
||||
assertEquals(1179654, t.termBuffer().length);
|
||||
|
||||
// Test for slow growth to a long term
|
||||
t = new TermAttributeImpl();
|
||||
|
@ -91,7 +88,6 @@ public class TestTermAttributeImpl extends LuceneTestCase {
|
|||
buf.append("a");
|
||||
}
|
||||
assertEquals(20000, t.termLength());
|
||||
assertEquals(20167, t.termBuffer().length);
|
||||
|
||||
// Test for slow growth to a long term
|
||||
t = new TermAttributeImpl();
|
||||
|
@ -105,7 +101,6 @@ public class TestTermAttributeImpl extends LuceneTestCase {
|
|||
buf.append("a");
|
||||
}
|
||||
assertEquals(20000, t.termLength());
|
||||
assertEquals(20167, t.termBuffer().length);
|
||||
}
|
||||
|
||||
public void testToString() throws Exception {
|
||||
|
|
|
@ -0,0 +1,59 @@
|
|||
package org.apache.lucene.util;
|
||||
|
||||
/**
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
import java.util.Random;
|
||||
|
||||
public class TestArrayUtil extends LuceneTestCase {
|
||||
|
||||
// Ensure ArrayUtil.oversize gives linear amortized cost of realloc/copy
|
||||
public void testGrowth() {
|
||||
int currentSize = 0;
|
||||
long copyCost = 0;
|
||||
|
||||
// Make sure ArrayUtil hits Integer.MAX_VALUE, if we insist:
|
||||
while(currentSize != Integer.MAX_VALUE) {
|
||||
int nextSize = ArrayUtil.oversize(1+currentSize, RamUsageEstimator.NUM_BYTES_OBJECT_REF);
|
||||
assertTrue(nextSize > currentSize);
|
||||
if (currentSize > 0) {
|
||||
copyCost += currentSize;
|
||||
double copyCostPerElement = ((double) copyCost)/currentSize;
|
||||
assertTrue("cost " + copyCostPerElement, copyCostPerElement < 10.0);
|
||||
}
|
||||
currentSize = nextSize;
|
||||
}
|
||||
}
|
||||
|
||||
public void testMaxSize() {
|
||||
// intentionally pass invalid elemSizes:
|
||||
for(int elemSize=0;elemSize<10;elemSize++) {
|
||||
assertEquals(Integer.MAX_VALUE, ArrayUtil.oversize(Integer.MAX_VALUE, elemSize));
|
||||
assertEquals(Integer.MAX_VALUE, ArrayUtil.oversize(Integer.MAX_VALUE-1, elemSize));
|
||||
}
|
||||
}
|
||||
|
||||
public void testInvalidElementSizes() {
|
||||
final Random r = newRandom();
|
||||
for(int iter=0;iter<10000;iter++) {
|
||||
final int minTargetSize = r.nextInt(Integer.MAX_VALUE);
|
||||
final int elemSize = r.nextInt(11);
|
||||
final int v = ArrayUtil.oversize(minTargetSize, elemSize);
|
||||
assertTrue(v >= minTargetSize);
|
||||
}
|
||||
}
|
||||
}
|
|
@ -164,14 +164,14 @@ public class TestIndexableBinaryStringTools extends LuceneTestCase {
|
|||
int encodedLen1 = IndexableBinaryStringTools.getEncodedLength(
|
||||
originalArray1, 0, numBytes1);
|
||||
if (encodedLen1 > encoded1.length)
|
||||
encoded1 = new char[ArrayUtil.getNextSize(encodedLen1)];
|
||||
encoded1 = new char[ArrayUtil.oversize(encodedLen1, RamUsageEstimator.NUM_BYTES_CHAR)];
|
||||
IndexableBinaryStringTools.encode(originalArray1, 0, numBytes1, encoded1,
|
||||
0, encodedLen1);
|
||||
|
||||
int encodedLen2 = IndexableBinaryStringTools.getEncodedLength(original2,
|
||||
0, numBytes2);
|
||||
if (encodedLen2 > encoded2.length)
|
||||
encoded2 = new char[ArrayUtil.getNextSize(encodedLen2)];
|
||||
encoded2 = new char[ArrayUtil.oversize(encodedLen2, RamUsageEstimator.NUM_BYTES_CHAR)];
|
||||
IndexableBinaryStringTools.encode(original2, 0, numBytes2, encoded2, 0,
|
||||
encodedLen2);
|
||||
|
||||
|
@ -308,7 +308,7 @@ public class TestIndexableBinaryStringTools extends LuceneTestCase {
|
|||
int encodedLen = IndexableBinaryStringTools.getEncodedLength(binary, 0,
|
||||
numBytes);
|
||||
if (encoded.length < encodedLen)
|
||||
encoded = new char[ArrayUtil.getNextSize(encodedLen)];
|
||||
encoded = new char[ArrayUtil.oversize(encodedLen, RamUsageEstimator.NUM_BYTES_CHAR)];
|
||||
IndexableBinaryStringTools.encode(binary, 0, numBytes, encoded, 0,
|
||||
encodedLen);
|
||||
|
||||
|
|
Loading…
Reference in New Issue