mirror of https://github.com/apache/lucene.git
LUCENE-6694: Add LithuanianAnalyzer and LithuanianStemmer
git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1692544 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent
068549d8a8
commit
8f58afc41a
|
@ -141,6 +141,9 @@ New Features
|
|||
the specified distance from the center point. Fix
|
||||
GeoPointInBBoxQuery to handle dateline crossing.
|
||||
|
||||
* LUCENE-6694: Add LithuanianAnalyzer and LithuanianStemmer.
|
||||
(Dainius Jocas via Robert Muir)
|
||||
|
||||
API Changes
|
||||
|
||||
* LUCENE-6508: Simplify Lock api, there is now just
|
||||
|
|
|
@ -0,0 +1,124 @@
|
|||
package org.apache.lucene.analysis.lt;
|
||||
|
||||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
import java.io.IOException;
|
||||
import java.io.Reader;
|
||||
|
||||
import org.apache.lucene.analysis.Analyzer;
|
||||
import org.apache.lucene.analysis.core.LowerCaseFilter;
|
||||
import org.apache.lucene.analysis.core.StopFilter;
|
||||
import org.apache.lucene.analysis.miscellaneous.SetKeywordMarkerFilter;
|
||||
import org.apache.lucene.analysis.TokenStream;
|
||||
import org.apache.lucene.analysis.Tokenizer;
|
||||
import org.apache.lucene.analysis.snowball.SnowballFilter;
|
||||
import org.apache.lucene.analysis.standard.StandardFilter;
|
||||
import org.apache.lucene.analysis.standard.StandardTokenizer;
|
||||
import org.apache.lucene.analysis.util.CharArraySet;
|
||||
import org.apache.lucene.analysis.util.StopwordAnalyzerBase;
|
||||
import org.tartarus.snowball.ext.LithuanianStemmer;
|
||||
|
||||
/**
|
||||
* {@link Analyzer} for Lithuanian.
|
||||
*/
|
||||
public final class LithuanianAnalyzer extends StopwordAnalyzerBase {
|
||||
private final CharArraySet stemExclusionSet;
|
||||
|
||||
/** File containing default Lithuanian stopwords. */
|
||||
public final static String DEFAULT_STOPWORD_FILE = "stopwords.txt";
|
||||
|
||||
/**
|
||||
* Returns an unmodifiable instance of the default stop words set.
|
||||
* @return default stop words set.
|
||||
*/
|
||||
public static CharArraySet getDefaultStopSet(){
|
||||
return DefaultSetHolder.DEFAULT_STOP_SET;
|
||||
}
|
||||
|
||||
/**
|
||||
* Atomically loads the DEFAULT_STOP_SET in a lazy fashion once the outer class
|
||||
* accesses the static final set the first time.;
|
||||
*/
|
||||
private static class DefaultSetHolder {
|
||||
static final CharArraySet DEFAULT_STOP_SET;
|
||||
|
||||
static {
|
||||
try {
|
||||
DEFAULT_STOP_SET = loadStopwordSet(false,
|
||||
LithuanianAnalyzer.class, DEFAULT_STOPWORD_FILE, "#");
|
||||
} catch (IOException ex) {
|
||||
// default set should always be present as it is part of the
|
||||
// distribution (JAR)
|
||||
throw new RuntimeException("Unable to load default stopword set");
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Builds an analyzer with the default stop words: {@link #DEFAULT_STOPWORD_FILE}.
|
||||
*/
|
||||
public LithuanianAnalyzer() {
|
||||
this(DefaultSetHolder.DEFAULT_STOP_SET);
|
||||
}
|
||||
|
||||
/**
|
||||
* Builds an analyzer with the given stop words.
|
||||
*
|
||||
* @param stopwords a stopword set
|
||||
*/
|
||||
public LithuanianAnalyzer(CharArraySet stopwords) {
|
||||
this(stopwords, CharArraySet.EMPTY_SET);
|
||||
}
|
||||
|
||||
/**
|
||||
* Builds an analyzer with the given stop words. If a non-empty stem exclusion set is
|
||||
* provided this analyzer will add a {@link SetKeywordMarkerFilter} before
|
||||
* stemming.
|
||||
*
|
||||
* @param stopwords a stopword set
|
||||
* @param stemExclusionSet a set of terms not to be stemmed
|
||||
*/
|
||||
public LithuanianAnalyzer(CharArraySet stopwords, CharArraySet stemExclusionSet) {
|
||||
super(stopwords);
|
||||
this.stemExclusionSet = CharArraySet.unmodifiableSet(CharArraySet.copy(stemExclusionSet));
|
||||
}
|
||||
|
||||
/**
|
||||
* Creates a
|
||||
* {@link org.apache.lucene.analysis.Analyzer.TokenStreamComponents}
|
||||
* which tokenizes all the text in the provided {@link Reader}.
|
||||
*
|
||||
* @return A
|
||||
* {@link org.apache.lucene.analysis.Analyzer.TokenStreamComponents}
|
||||
* built from an {@link StandardTokenizer} filtered with
|
||||
* {@link StandardFilter}, {@link LowerCaseFilter}, {@link StopFilter}
|
||||
* , {@link SetKeywordMarkerFilter} if a stem exclusion set is
|
||||
* provided and {@link SnowballFilter}.
|
||||
*/
|
||||
@Override
|
||||
protected TokenStreamComponents createComponents(String fieldName) {
|
||||
final Tokenizer source = new StandardTokenizer();
|
||||
TokenStream result = new StandardFilter(source);
|
||||
result = new LowerCaseFilter(result);
|
||||
result = new StopFilter(result, stopwords);
|
||||
if(!stemExclusionSet.isEmpty())
|
||||
result = new SetKeywordMarkerFilter(result, stemExclusionSet);
|
||||
result = new SnowballFilter(result, new LithuanianStemmer());
|
||||
return new TokenStreamComponents(source, result);
|
||||
}
|
||||
}
|
|
@ -0,0 +1,21 @@
|
|||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
/**
|
||||
* Analyzer for Lithuanian.
|
||||
*/
|
||||
package org.apache.lucene.analysis.lt;
|
|
@ -0,0 +1,396 @@
|
|||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
// Entry point exported by this stemmer.
externals ( stem )

/* Special characters in Unicode Latin-1 and Latin Extended-A */

// ' = ogonek
stringdef a'   decimal '261'  // ą  a + ogonek
stringdef e'   decimal '281'  // ę  e + ogonek
stringdef i'   decimal '303'  // į  i + ogonek
stringdef u'   decimal '371'  // ų  u + ogonek

// . = dot above
stringdef e.   decimal '279'  // ė  e + dot

// - = macron (long vowel)
stringdef u-   decimal '363'  // ū  u + macron

// * = caron (haček)
stringdef c*   decimal '269'  // č  c + caron
stringdef s*   decimal '353'  // š  s + caron
stringdef z*   decimal '382'  // ž  z + caron

// Word shape: [C](VC)^m[V|C]
//   p1 - mark for m = 0
//   p2 - mark for m = 1
//   s  - assigned the value of `size` at the start of stem
integers ( p1 p2 s )

// CHANGE is set by the fix_* routines whenever they rewrite an ending.
booleans ( CHANGE )

// Escape delimiters used to embed the stringdef'd Lithuanian letters in string literals.
stringescapes { }

// v - the set of all Lithuanian vowels
groupings ( v )

define v 'aeiyou{a'}{e'}{i'}{u'}{e.}{u-}'

// All internal routines of the Lithuanian stemmer.
routines (
  step2 R1 step1 fix_chdz fix_gd fix_conflicts
)
|
||||
|
||||
backwardmode(
|
||||
|
||||
// R1 succeeds while the cursor is at or beyond the p1 mark.
define R1 as ( $p1 <= cursor )
|
||||
// step1: with the search limited to the region marked by p1, find the longest
// listed inflectional ending, require R1, and delete it.
//
// Every non-ASCII letter inside a string literal is written with its
// stringdef escape ({e.}, {a'}, ...) rather than as a raw character, and each
// ending is listed exactly once.
define step1 as (
  setlimit tomark p1 for ([substring]) R1 among(
    // Nouns
    // 1st declension
    'as' 'ias' 'is' 'ys'          // vyras, kelias, brolis, gaidys
    'o' 'io'                      // vyro, kelio
    'ui' 'iui'                    // vyrui, keliui
    '{a'}' 'i{a'}' '{i'}'         // vyrą, kelią, brolį
    'u' 'iu'                      // vyru, keliu
    'e' 'yje'                     // vyre, kelyje
    'y' 'au' 'i'                  // kely, brolau, broli,
    'an'                          // nusižengiman

    'ai' 'iai'                    // vyrai, keliai
    '{u'}' 'i{u'}'                // vyrų, kelių
    'ams' 'am'                    // vyrams, vyram
    'iams' 'iam'                  // broliams, broliam
    'us' 'ius'                    // vyrus, brolius
    'ais' 'iais'                  // vyrais, keliais
    'uose' 'iuose' 'uos' 'iuos'   // vyruose, keliuose, vyruos, keliuos
    'uosna' 'iuosna'              // vyruosna, keliuosna
    'ysna'                        // žutysna

    'asis' 'aisi'                 // sukimasis, sukimaisi
    'osi' '{u'}si'                // sukimosi, sukimųsi
    'uisi'                        // sukimuisi
    '{a'}si'                      // sukimąsi
    'usi'                         // sukimusi
    'esi'                         // sukimesi

    'uo'                          // mėnuo

    // 2nd declension
    'a' 'ia'                      // galva, vysnios
    'os' 'ios'                    // galvos, vysnios
    'oj' 'oje' 'ioje'             // galvoje, vysnioje
    'osna' 'iosna'                // galvosna, vyšniosna
    'om' 'oms' 'ioms'             // galvoms, vysnioms
    'omis' 'iomis'                // galvomis, vysniomis
    'ose' 'iose'                  // galvose, vysniose
    'on' 'ion'                    // galvon, vyšnion

    // 3rd declension
    '{e.}'                        // gervė
    '{e.}s'                       // gervės
    'ei'                          // gervei
    '{e'}'                        // gervę
    '{e.}j' '{e.}je'              // gervėj, gervėje
    '{e.}ms'                      // gervėms
    'es'                          // gerves
    '{e.}mis'                     // gervėmis
    '{e.}se'                      // gervėse
    '{e.}sna'                     // gervėsna
    '{e.}n'                       // žydaitėn

    // 4th declension
    'aus' 'iaus'                  // sūnaus, skaičiaus
    'umi' 'iumi'                  // sūnumi, skaičiumi
    'uje' 'iuje'                  // sūnuje, skaičiuje
    'iau'                         // skaičiau

    '{u-}s'                       // sūnūs
    'ums'                         // sūnums
    'umis'                        // sūnumis
    'un' 'iun'                    // sūnun, administratoriun

    // 5th declension
    'ies' 'ens' 'enio' 'ers'      // avies, vandens, sesers
    'eniui' 'eriai'               // vandeniui, eriai
    'en{i'}' 'er{i'}'             // vandenį, seserį
    'imi' 'eniu' 'erimi' 'eria'   // avimi, vandeniu, seserimi, seseria
    'enyje' 'eryje'               // vandenyje, seseryje
    'ie' 'enie' 'erie'            // avie, vandenie, seserie

    'enys' 'erys'                 // vandenys, seserys
    // 'en{u'}' omitted: conflicts with 'žandenų', 'antenų'
    'er{u'}'                      // seserų
    'ims' 'enims' 'erims'         // avims, vandemins, seserims
    'enis'                        // vandenis
    'imis'                        // žebenkštimis
    'enimis'                      // vandenimis
    'yse' 'enyse' 'eryse'         // avyse, vandenyse, seseryse

    // Adjectives
    // (i)a declension
    'iem' 'iems'                  // geriem, geriems
    'ame' 'iame'                  // naujame, mediniame

    // Verbs
    // Indicative mood
    // present tense
    // (i)a conjugation
    'uosi' 'iuosi'                // dirbuosi, traukiuosi
    'iesi'                        // dirbiesi
    'asi' 'iasi'                  // dirbasi, traukiasi
    'am{e.}s' 'iam{e.}s'          // dirbamės, traukiamės
    'at' 'ate' 'iat' 'iate'       // dirbat, dirbate, ariat, traukiate
    'at{e.}s' 'iat{e.}s'          // dirbatės, traukiatės

    // i conjugation
    'isi'                         // tikisi
    'im'                          // mylim
    // 'ime' omitted: conflicts with the noun locative, e.g. 'gėrime'
    'im{e.}s'                     // tikimės
    'it' 'ite'                    // mylit, mylite, tikitės
    // 'it{e.}s' omitted: conflicts with the suffix plus plural nominative
    // ending -ait-ės, e.g. žydaitės

    // o conjugation
    'ome' 'om{e.}s'               // mokome, bijomės
    'ot' 'ote' 'ot{e.}s'          // mokot, mokote, bijotės

    // past tense
    // o conjugation
    '{e.}jo' '{e.}josi'           // tikėjo, tikėjosi
    // 'ot{e.}s' (tikėjotės) is already listed under the present-tense o conjugation

    // ė conjugation
    'eisi'                        // mokeisi
    '{e.}si'                      // mokėsi
    '{e.}m' '{e.}me'              // mokėm, mokėme
    '{e.}m{e.}s'                  // mokėmės
    '{e.}t' '{e.}te'              // mokėt, mokėte
    '{e.}t{e.}s'                  // mokėtės

    // past frequentative tense
    'ausi'                        // mokydavausi
    // 'om{e.}s' (mokydavomės) is already listed under the present-tense o conjugation

    // future tense
    'siu' 'siuosi'                // dirbsiu, mokysiuosi
    'si' 'siesi'                  // dirbsi, dirbsiesi
    's' 'ysis'                    // dirbs, mokysis
    'sim' 'sime'                  // dirbsim, dirbsime
    'sit' 'site'                  // gersit, gersite

    // subjunctive mood
    '{c*}iau' '{c*}iausi'         // dirbčiau
    'tum' 'tumei'                 // dirbtum, dirbtumei
    'tumeis' 'tumeisi'            // mokytumeis, mokytumeisi
    // 't{u'}' omitted: it wrongly stems batutų -> batų
    't{u'}si'                     // mokytųsi
    // 'tume' omitted: conflicts with 'šventume'
    'tum{e.}m'                    // dirbtumėm
    'tum{e.}me'                   // dirbtumėme
    'tum{e.}m{e.}s'               // mokytumėmės
    'tute' 'tum{e.}t'             // dirbtute, dirbtumėt
    'tum{e.}te'                   // dirbtumėte
    'tum{e.}t{e.}s'               // mokytumėtės

    // imperative mood
    'k' 'ki'                      // dirbk, dirbki, mokykis
    // 'kis' omitted: conflicts with viln-išk-is
    // 'kime' omitted: conflicts with pirkime
    'kim{e.}s'                    // mokykimės

    // infinitive
    'uoti' 'iuoti'                // meluoti, dygsniuoti
    'auti' 'iauti'                // draugauti, girtuokliauti
    'oti' 'ioti'                  // dovanoti, meškerioti
    '{e.}ti'                      // auklėti
    'yti'                         // akyti
    'inti'                        // auginti
    'in{e.}ti'                    // blusinėti
    'enti'                        // gyventi
    'tel{e.}ti'                   // bumbtelėti
    'ter{e.}ti'                   // bumbterėti

    'ti'                          // skalbti
    // 'tis' omitted: conflicts, because rytme-tis -> rytme

    // participles
    '{a'}s' 'i{a'}s' '{i'}s'      // dirbąs, žaidžiąs, gulįs
    't{u'}s'                      // suktųs -> suk
    'sim{e.}s'                    // suksimės
    'sit{e.}s'                    // suksitės
    'kite'                        // supkite
  )

  delete
)
|
||||
|
||||
// step2: repeatedly, with the search limited to the region marked by p1,
// find the longest listed derivational suffix and delete it. Unlike step1
// there is no R1 test before the among.
define step2 as repeat (
  setlimit tomark p1 for ([substring]) among(
    // noun suffixes (none listed)

    // adjective suffixes
    // 'in' omitted: conflicts with 'augintinis' and 'akiniais'   // lauk-in-is
    'ing'                   // tvark-ing-as
    'i{s*}k'                // lenk-išk-as
    '{e.}t'                 // dem-ėt-as
    'ot'                    // garban-ot-as
    'uot' 'iuot'            // lang-uot-as, akin-iuot-as
    // 'tin' omitted because of augintinis                        // dirb-tin-is
    // 'ut' omitted because of batutas, degutas etc.              // maž-ut-is
    'yt'                    // maž-yt-is
    'iuk'                   // maž-iuk-as
    'iul'                   // maž-ul-is
    '{e.}l'                 // maž-ėl-is
    'yl'                    // maž-yl-is
    'u{c*}iuk'              // maž-učiuk-as
    'uliuk'                 // maž-uliuk-as
    // NOTE(review): the example below contains 'utėlait' while the listed
    // string is 'utėait' - confirm which spelling is intended.
    'ut{e.}ait'             // maž-utėlait-is
    'ok'                    // did-ok-as
    'iok'                   // višč-iok-as
    'sv' '{s*}v' 'zgan'     // sal-sv-as, pilk-šv-as, bal-zgan-as
    'op' 'iop'              // dvej-op-as, viener-iop-as
    'ain'                   // apval-ain-as
    'yk{s*}t' 'yk{s*}{c*}'  // ten-ykšt-is, vakar-ykšč-ias

    // degrees of comparison
    'esn'                   // did-esn-is
    'aus' 'iaus'            // nauj-aus-ias, ger-iaus-ias

    // pronominal adjectives
    // masculine gender
    'ias'                   // žaliasis
    'oj' 'ioj'              // gerojo, žaliojo
    'aj' 'iaj'              // gerajam, žaliajam
    '{a'}j' 'i{a'}j'        // garąjį, žaliąjį
    'uoj' 'iuoj'            // geruoju, žaliuoju
    'iej'                   // gerieji
    '{u'}j' 'i{u'}j'        // gerųjų, žaliųjų
    'ies'                   // geriesiems
    'uos' 'iuos'            // geruosius, žaliuosius
    'ais' 'iais'            // geraisiais, žaliaisiais

    // feminine gender
    'os' 'ios'              // gerosios, žaliosios
    '{a'}s' 'i{a'}s'        // gerąsios, žaliąsias

    // past frequentative tense
    'dav'                   // ei-dav-o

    // participle suffixes
    'ant' 'iant'
    'int'                   // tur-int-is
    '{e.}j'                 // tur-ėj-o
    '{e'}'
    '{e.}j{e'}'
    '{e'}s'                 // dirb-ęs-is

    'siant'                 // dirb-siant

    // half-participles
    'dam'                   // bėg-dam-as

    'auj'                   // ūkinink-auj-a
    'jam'
    'iau'
    'am'                    // baiminim-ams-i
  )

  delete
)
|
||||
|
||||
// fix_conflicts: rewrites a few endings to a fixed form and sets CHANGE.
// Each rule names the ending it would otherwise be confused with.
// Endings that share one replacement are grouped in front of a single action.
define fix_conflicts as (
  [substring] among (
    // 'lietuvaite'  -> 'lietuvaitė', conflicts with 'myl-ite'
    // 'lietuvaitės' -> 'lietuvaitė', conflicts with 'myl-itės'
    'aite' 'ait{e.}s'
      ( <-'ait{e.}' set CHANGE )

    // 'ūs-uotės' -> 'ūs-uotė', conflicts with 'mokotės'
    // 'ūs-uote'  -> 'ūs-uotė', conflicts with 'mokote'
    'uot{e.}s' 'uote'
      ( <-'uot{e.}' set CHANGE )

    // 'žerėjime' -> 'žėrėjimas', conflicts with 'žais-ime'
    '{e.}jime'
      ( <-'{e.}jimas' set CHANGE )

    // 'žvilgesiu' -> 'žvilgesys', conflicts with 'dirb-siu'
    'esiu'
      ( <-'esys' set CHANGE )

    // 'duobkasiu' -> 'duobkasys', conflicts with 'pakasiu'
    // NOTE(review): the listed ending is 'asius' although the example form
    // ends in 'asiu' - confirm which is intended.
    'asius'
      ( <-'asys' set CHANGE )

    // 'žioravime' -> 'žioravimas', conflicts with 'myl-ime'
    'avime'
      ( <-'avimas' set CHANGE )
    'ojime'
      ( <-'ojimas' set CHANGE )

    // 'advokatės' -> 'advokatė', conflicts with 'dirb-atės'
    // 'advokate'  -> 'advokatė', conflicts with 'dirb-ate'
    'okat{e.}s' 'okate'
      ( <-'okat{e.}' set CHANGE )
  )
)
|
||||
|
||||
// fix_chdz: replaces a final 'č' with 't' and a final 'dž' with 'd',
// setting CHANGE whenever a replacement is made.
define fix_chdz as (
  [substring] among (
    '{c*}'
      ( <-'t' set CHANGE )
    'd{z*}'
      ( <-'d' set CHANGE )
  )
)
|
||||
|
||||
// fix_gd: replaces a final 'gd' with 'g' and sets CHANGE.
define fix_gd as (
  [substring] among (
    'gd'
      ( <-'g' set CHANGE )
    // disabled rule: '{e.}k' (<-'{e.}g' set CHANGE)
  )
)
|
||||
|
||||
)
|
||||
|
||||
// stem: entry point. Sets up the p1/p2 marks scanning forwards, then applies
// the suffix routines scanning backwards.
define stem as (

  // Both marks default to the end of the word; s holds the word size.
  $p1 = limit
  $p2 = limit
  $s = size

  do (
    // Prefix 'a' in words longer than 6 letters, e.g. 'a-liejus':
    // skip over it before locating the marks.
    try (
      test 'a'
      $s > 6
      hop 1
    )

    // p1: just past the first non-vowel that follows a vowel.
    gopast v  gopast non-v  setmark p1
    // p2: the same scan repeated, starting from p1.
    gopast v  gopast non-v  setmark p2
  )

  backwards (
    do fix_conflicts
    do step1
    do fix_chdz
    do step2
    do fix_chdz
    do fix_gd
  )

)
|
|
@ -0,0 +1,769 @@
|
|||
// This file was generated automatically by the Snowball to Java compiler
|
||||
|
||||
package org.tartarus.snowball.ext;
|
||||
|
||||
import org.tartarus.snowball.Among;
|
||||
|
||||
/**
|
||||
* This class was automatically generated by a Snowball to Java compiler
|
||||
* It implements the stemming algorithm defined by a snowball script.
|
||||
*/
|
||||
|
||||
public class LithuanianStemmer extends org.tartarus.snowball.SnowballProgram {
|
||||
|
||||
private static final long serialVersionUID = 1L;
|
||||
|
||||
private final static LithuanianStemmer methodObject = new LithuanianStemmer ();
|
||||
|
||||
private final static Among a_0[] = {
|
||||
new Among ( "a", -1, -1, "", methodObject ),
|
||||
new Among ( "ia", 0, -1, "", methodObject ),
|
||||
new Among ( "eria", 1, -1, "", methodObject ),
|
||||
new Among ( "osna", 0, -1, "", methodObject ),
|
||||
new Among ( "iosna", 3, -1, "", methodObject ),
|
||||
new Among ( "uosna", 3, -1, "", methodObject ),
|
||||
new Among ( "iuosna", 5, -1, "", methodObject ),
|
||||
new Among ( "ysna", 0, -1, "", methodObject ),
|
||||
new Among ( "\u0117sna", 0, -1, "", methodObject ),
|
||||
new Among ( "e", -1, -1, "", methodObject ),
|
||||
new Among ( "ie", 9, -1, "", methodObject ),
|
||||
new Among ( "enie", 10, -1, "", methodObject ),
|
||||
new Among ( "erie", 10, -1, "", methodObject ),
|
||||
new Among ( "oje", 9, -1, "", methodObject ),
|
||||
new Among ( "ioje", 13, -1, "", methodObject ),
|
||||
new Among ( "uje", 9, -1, "", methodObject ),
|
||||
new Among ( "iuje", 15, -1, "", methodObject ),
|
||||
new Among ( "yje", 9, -1, "", methodObject ),
|
||||
new Among ( "enyje", 17, -1, "", methodObject ),
|
||||
new Among ( "eryje", 17, -1, "", methodObject ),
|
||||
new Among ( "\u0117je", 9, -1, "", methodObject ),
|
||||
new Among ( "ame", 9, -1, "", methodObject ),
|
||||
new Among ( "iame", 21, -1, "", methodObject ),
|
||||
new Among ( "sime", 9, -1, "", methodObject ),
|
||||
new Among ( "ome", 9, -1, "", methodObject ),
|
||||
new Among ( "\u0117me", 9, -1, "", methodObject ),
|
||||
new Among ( "tum\u0117me", 25, -1, "", methodObject ),
|
||||
new Among ( "ose", 9, -1, "", methodObject ),
|
||||
new Among ( "iose", 27, -1, "", methodObject ),
|
||||
new Among ( "uose", 27, -1, "", methodObject ),
|
||||
new Among ( "iuose", 29, -1, "", methodObject ),
|
||||
new Among ( "yse", 9, -1, "", methodObject ),
|
||||
new Among ( "enyse", 31, -1, "", methodObject ),
|
||||
new Among ( "eryse", 31, -1, "", methodObject ),
|
||||
new Among ( "\u0117se", 9, -1, "", methodObject ),
|
||||
new Among ( "ate", 9, -1, "", methodObject ),
|
||||
new Among ( "iate", 35, -1, "", methodObject ),
|
||||
new Among ( "ite", 9, -1, "", methodObject ),
|
||||
new Among ( "kite", 37, -1, "", methodObject ),
|
||||
new Among ( "site", 37, -1, "", methodObject ),
|
||||
new Among ( "ote", 9, -1, "", methodObject ),
|
||||
new Among ( "tute", 9, -1, "", methodObject ),
|
||||
new Among ( "\u0117te", 9, -1, "", methodObject ),
|
||||
new Among ( "tum\u0117te", 42, -1, "", methodObject ),
|
||||
new Among ( "i", -1, -1, "", methodObject ),
|
||||
new Among ( "ai", 44, -1, "", methodObject ),
|
||||
new Among ( "iai", 45, -1, "", methodObject ),
|
||||
new Among ( "eriai", 46, -1, "", methodObject ),
|
||||
new Among ( "ei", 44, -1, "", methodObject ),
|
||||
new Among ( "tumei", 48, -1, "", methodObject ),
|
||||
new Among ( "ki", 44, -1, "", methodObject ),
|
||||
new Among ( "imi", 44, -1, "", methodObject ),
|
||||
new Among ( "erimi", 51, -1, "", methodObject ),
|
||||
new Among ( "umi", 44, -1, "", methodObject ),
|
||||
new Among ( "iumi", 53, -1, "", methodObject ),
|
||||
new Among ( "si", 44, -1, "", methodObject ),
|
||||
new Among ( "asi", 55, -1, "", methodObject ),
|
||||
new Among ( "iasi", 56, -1, "", methodObject ),
|
||||
new Among ( "esi", 55, -1, "", methodObject ),
|
||||
new Among ( "iesi", 58, -1, "", methodObject ),
|
||||
new Among ( "siesi", 59, -1, "", methodObject ),
|
||||
new Among ( "isi", 55, -1, "", methodObject ),
|
||||
new Among ( "aisi", 61, -1, "", methodObject ),
|
||||
new Among ( "eisi", 61, -1, "", methodObject ),
|
||||
new Among ( "tumeisi", 63, -1, "", methodObject ),
|
||||
new Among ( "uisi", 61, -1, "", methodObject ),
|
||||
new Among ( "osi", 55, -1, "", methodObject ),
|
||||
new Among ( "\u0117josi", 66, -1, "", methodObject ),
|
||||
new Among ( "uosi", 66, -1, "", methodObject ),
|
||||
new Among ( "iuosi", 68, -1, "", methodObject ),
|
||||
new Among ( "siuosi", 69, -1, "", methodObject ),
|
||||
new Among ( "usi", 55, -1, "", methodObject ),
|
||||
new Among ( "ausi", 71, -1, "", methodObject ),
|
||||
new Among ( "\u010Diausi", 72, -1, "", methodObject ),
|
||||
new Among ( "\u0105si", 55, -1, "", methodObject ),
|
||||
new Among ( "\u0117si", 55, -1, "", methodObject ),
|
||||
new Among ( "\u0173si", 55, -1, "", methodObject ),
|
||||
new Among ( "t\u0173si", 76, -1, "", methodObject ),
|
||||
new Among ( "ti", 44, -1, "", methodObject ),
|
||||
new Among ( "enti", 78, -1, "", methodObject ),
|
||||
new Among ( "inti", 78, -1, "", methodObject ),
|
||||
new Among ( "oti", 78, -1, "", methodObject ),
|
||||
new Among ( "ioti", 81, -1, "", methodObject ),
|
||||
new Among ( "uoti", 81, -1, "", methodObject ),
|
||||
new Among ( "iuoti", 83, -1, "", methodObject ),
|
||||
new Among ( "auti", 78, -1, "", methodObject ),
|
||||
new Among ( "iauti", 85, -1, "", methodObject ),
|
||||
new Among ( "yti", 78, -1, "", methodObject ),
|
||||
new Among ( "\u0117ti", 78, -1, "", methodObject ),
|
||||
new Among ( "tel\u0117ti", 88, -1, "", methodObject ),
|
||||
new Among ( "in\u0117ti", 88, -1, "", methodObject ),
|
||||
new Among ( "ter\u0117ti", 88, -1, "", methodObject ),
|
||||
new Among ( "ui", 44, -1, "", methodObject ),
|
||||
new Among ( "iui", 92, -1, "", methodObject ),
|
||||
new Among ( "eniui", 93, -1, "", methodObject ),
|
||||
new Among ( "oj", -1, -1, "", methodObject ),
|
||||
new Among ( "\u0117j", -1, -1, "", methodObject ),
|
||||
new Among ( "k", -1, -1, "", methodObject ),
|
||||
new Among ( "am", -1, -1, "", methodObject ),
|
||||
new Among ( "iam", 98, -1, "", methodObject ),
|
||||
new Among ( "iem", -1, -1, "", methodObject ),
|
||||
new Among ( "im", -1, -1, "", methodObject ),
|
||||
new Among ( "sim", 101, -1, "", methodObject ),
|
||||
new Among ( "om", -1, -1, "", methodObject ),
|
||||
new Among ( "tum", -1, -1, "", methodObject ),
|
||||
new Among ( "\u0117m", -1, -1, "", methodObject ),
|
||||
new Among ( "tum\u0117m", 105, -1, "", methodObject ),
|
||||
new Among ( "an", -1, -1, "", methodObject ),
|
||||
new Among ( "on", -1, -1, "", methodObject ),
|
||||
new Among ( "ion", 108, -1, "", methodObject ),
|
||||
new Among ( "un", -1, -1, "", methodObject ),
|
||||
new Among ( "iun", 110, -1, "", methodObject ),
|
||||
new Among ( "\u0117n", -1, -1, "", methodObject ),
|
||||
new Among ( "o", -1, -1, "", methodObject ),
|
||||
new Among ( "io", 113, -1, "", methodObject ),
|
||||
new Among ( "enio", 114, -1, "", methodObject ),
|
||||
new Among ( "\u0117jo", 113, -1, "", methodObject ),
|
||||
new Among ( "uo", 113, -1, "", methodObject ),
|
||||
new Among ( "s", -1, -1, "", methodObject ),
|
||||
new Among ( "as", 118, -1, "", methodObject ),
|
||||
new Among ( "ias", 119, -1, "", methodObject ),
|
||||
new Among ( "es", 118, -1, "", methodObject ),
|
||||
new Among ( "ies", 121, -1, "", methodObject ),
|
||||
new Among ( "is", 118, -1, "", methodObject ),
|
||||
new Among ( "ais", 123, -1, "", methodObject ),
|
||||
new Among ( "iais", 124, -1, "", methodObject ),
|
||||
new Among ( "tumeis", 123, -1, "", methodObject ),
|
||||
new Among ( "imis", 123, -1, "", methodObject ),
|
||||
new Among ( "enimis", 127, -1, "", methodObject ),
|
||||
new Among ( "omis", 123, -1, "", methodObject ),
|
||||
new Among ( "iomis", 129, -1, "", methodObject ),
|
||||
new Among ( "umis", 123, -1, "", methodObject ),
|
||||
new Among ( "\u0117mis", 123, -1, "", methodObject ),
|
||||
new Among ( "enis", 123, -1, "", methodObject ),
|
||||
new Among ( "asis", 123, -1, "", methodObject ),
|
||||
new Among ( "ysis", 123, -1, "", methodObject ),
|
||||
new Among ( "ams", 118, -1, "", methodObject ),
|
||||
new Among ( "iams", 136, -1, "", methodObject ),
|
||||
new Among ( "iems", 118, -1, "", methodObject ),
|
||||
new Among ( "ims", 118, -1, "", methodObject ),
|
||||
new Among ( "enims", 139, -1, "", methodObject ),
|
||||
new Among ( "erims", 139, -1, "", methodObject ),
|
||||
new Among ( "oms", 118, -1, "", methodObject ),
|
||||
new Among ( "ioms", 142, -1, "", methodObject ),
|
||||
new Among ( "ums", 118, -1, "", methodObject ),
|
||||
new Among ( "\u0117ms", 118, -1, "", methodObject ),
|
||||
new Among ( "ens", 118, -1, "", methodObject ),
|
||||
new Among ( "os", 118, -1, "", methodObject ),
|
||||
new Among ( "ios", 147, -1, "", methodObject ),
|
||||
new Among ( "uos", 147, -1, "", methodObject ),
|
||||
new Among ( "iuos", 149, -1, "", methodObject ),
|
||||
new Among ( "ers", 118, -1, "", methodObject ),
|
||||
new Among ( "us", 118, -1, "", methodObject ),
|
||||
new Among ( "aus", 152, -1, "", methodObject ),
|
||||
new Among ( "iaus", 153, -1, "", methodObject ),
|
||||
new Among ( "ius", 152, -1, "", methodObject ),
|
||||
new Among ( "ys", 118, -1, "", methodObject ),
|
||||
new Among ( "enys", 156, -1, "", methodObject ),
|
||||
new Among ( "erys", 156, -1, "", methodObject ),
|
||||
new Among ( "om\u00C4\u0097s", 118, -1, "", methodObject ),
|
||||
new Among ( "ot\u00C4\u0097s", 118, -1, "", methodObject ),
|
||||
new Among ( "\u0105s", 118, -1, "", methodObject ),
|
||||
new Among ( "i\u0105s", 161, -1, "", methodObject ),
|
||||
new Among ( "\u0117s", 118, -1, "", methodObject ),
|
||||
new Among ( "am\u0117s", 163, -1, "", methodObject ),
|
||||
new Among ( "iam\u0117s", 164, -1, "", methodObject ),
|
||||
new Among ( "im\u0117s", 163, -1, "", methodObject ),
|
||||
new Among ( "kim\u0117s", 166, -1, "", methodObject ),
|
||||
new Among ( "sim\u0117s", 166, -1, "", methodObject ),
|
||||
new Among ( "om\u0117s", 163, -1, "", methodObject ),
|
||||
new Among ( "\u0117m\u0117s", 163, -1, "", methodObject ),
|
||||
new Among ( "tum\u0117m\u0117s", 170, -1, "", methodObject ),
|
||||
new Among ( "at\u0117s", 163, -1, "", methodObject ),
|
||||
new Among ( "iat\u0117s", 172, -1, "", methodObject ),
|
||||
new Among ( "sit\u0117s", 163, -1, "", methodObject ),
|
||||
new Among ( "ot\u0117s", 163, -1, "", methodObject ),
|
||||
new Among ( "\u0117t\u0117s", 163, -1, "", methodObject ),
|
||||
new Among ( "tum\u0117t\u0117s", 176, -1, "", methodObject ),
|
||||
new Among ( "\u012Fs", 118, -1, "", methodObject ),
|
||||
new Among ( "\u016Bs", 118, -1, "", methodObject ),
|
||||
new Among ( "t\u0173s", 118, -1, "", methodObject ),
|
||||
new Among ( "at", -1, -1, "", methodObject ),
|
||||
new Among ( "iat", 181, -1, "", methodObject ),
|
||||
new Among ( "it", -1, -1, "", methodObject ),
|
||||
new Among ( "sit", 183, -1, "", methodObject ),
|
||||
new Among ( "ot", -1, -1, "", methodObject ),
|
||||
new Among ( "\u0117t", -1, -1, "", methodObject ),
|
||||
new Among ( "tum\u0117t", 186, -1, "", methodObject ),
|
||||
new Among ( "u", -1, -1, "", methodObject ),
|
||||
new Among ( "au", 188, -1, "", methodObject ),
|
||||
new Among ( "iau", 189, -1, "", methodObject ),
|
||||
new Among ( "\u010Diau", 190, -1, "", methodObject ),
|
||||
new Among ( "iu", 188, -1, "", methodObject ),
|
||||
new Among ( "eniu", 192, -1, "", methodObject ),
|
||||
new Among ( "siu", 192, -1, "", methodObject ),
|
||||
new Among ( "y", -1, -1, "", methodObject ),
|
||||
new Among ( "\u0105", -1, -1, "", methodObject ),
|
||||
new Among ( "i\u0105", 196, -1, "", methodObject ),
|
||||
new Among ( "\u0117", -1, -1, "", methodObject ),
|
||||
new Among ( "\u0119", -1, -1, "", methodObject ),
|
||||
new Among ( "\u012F", -1, -1, "", methodObject ),
|
||||
new Among ( "en\u012F", 200, -1, "", methodObject ),
|
||||
new Among ( "er\u012F", 200, -1, "", methodObject ),
|
||||
new Among ( "\u0173", -1, -1, "", methodObject ),
|
||||
new Among ( "i\u0173", 203, -1, "", methodObject ),
|
||||
new Among ( "er\u0173", 203, -1, "", methodObject )
|
||||
};
|
||||
|
||||
private final static Among a_1[] = {
|
||||
new Among ( "ing", -1, -1, "", methodObject ),
|
||||
new Among ( "aj", -1, -1, "", methodObject ),
|
||||
new Among ( "iaj", 1, -1, "", methodObject ),
|
||||
new Among ( "iej", -1, -1, "", methodObject ),
|
||||
new Among ( "oj", -1, -1, "", methodObject ),
|
||||
new Among ( "ioj", 4, -1, "", methodObject ),
|
||||
new Among ( "uoj", 4, -1, "", methodObject ),
|
||||
new Among ( "iuoj", 6, -1, "", methodObject ),
|
||||
new Among ( "auj", -1, -1, "", methodObject ),
|
||||
new Among ( "\u0105j", -1, -1, "", methodObject ),
|
||||
new Among ( "i\u0105j", 9, -1, "", methodObject ),
|
||||
new Among ( "\u0117j", -1, -1, "", methodObject ),
|
||||
new Among ( "\u0173j", -1, -1, "", methodObject ),
|
||||
new Among ( "i\u0173j", 12, -1, "", methodObject ),
|
||||
new Among ( "ok", -1, -1, "", methodObject ),
|
||||
new Among ( "iok", 14, -1, "", methodObject ),
|
||||
new Among ( "iuk", -1, -1, "", methodObject ),
|
||||
new Among ( "uliuk", 16, -1, "", methodObject ),
|
||||
new Among ( "u\u010Diuk", 16, -1, "", methodObject ),
|
||||
new Among ( "i\u0161k", -1, -1, "", methodObject ),
|
||||
new Among ( "iul", -1, -1, "", methodObject ),
|
||||
new Among ( "yl", -1, -1, "", methodObject ),
|
||||
new Among ( "\u0117l", -1, -1, "", methodObject ),
|
||||
new Among ( "am", -1, -1, "", methodObject ),
|
||||
new Among ( "dam", 23, -1, "", methodObject ),
|
||||
new Among ( "jam", 23, -1, "", methodObject ),
|
||||
new Among ( "zgan", -1, -1, "", methodObject ),
|
||||
new Among ( "ain", -1, -1, "", methodObject ),
|
||||
new Among ( "esn", -1, -1, "", methodObject ),
|
||||
new Among ( "op", -1, -1, "", methodObject ),
|
||||
new Among ( "iop", 29, -1, "", methodObject ),
|
||||
new Among ( "ias", -1, -1, "", methodObject ),
|
||||
new Among ( "ies", -1, -1, "", methodObject ),
|
||||
new Among ( "ais", -1, -1, "", methodObject ),
|
||||
new Among ( "iais", 33, -1, "", methodObject ),
|
||||
new Among ( "os", -1, -1, "", methodObject ),
|
||||
new Among ( "ios", 35, -1, "", methodObject ),
|
||||
new Among ( "uos", 35, -1, "", methodObject ),
|
||||
new Among ( "iuos", 37, -1, "", methodObject ),
|
||||
new Among ( "aus", -1, -1, "", methodObject ),
|
||||
new Among ( "iaus", 39, -1, "", methodObject ),
|
||||
new Among ( "\u0105s", -1, -1, "", methodObject ),
|
||||
new Among ( "i\u0105s", 41, -1, "", methodObject ),
|
||||
new Among ( "\u0119s", -1, -1, "", methodObject ),
|
||||
new Among ( "ut\u0117ait", -1, -1, "", methodObject ),
|
||||
new Among ( "ant", -1, -1, "", methodObject ),
|
||||
new Among ( "iant", 45, -1, "", methodObject ),
|
||||
new Among ( "siant", 46, -1, "", methodObject ),
|
||||
new Among ( "int", -1, -1, "", methodObject ),
|
||||
new Among ( "ot", -1, -1, "", methodObject ),
|
||||
new Among ( "uot", 49, -1, "", methodObject ),
|
||||
new Among ( "iuot", 50, -1, "", methodObject ),
|
||||
new Among ( "yt", -1, -1, "", methodObject ),
|
||||
new Among ( "\u0117t", -1, -1, "", methodObject ),
|
||||
new Among ( "yk\u0161t", -1, -1, "", methodObject ),
|
||||
new Among ( "iau", -1, -1, "", methodObject ),
|
||||
new Among ( "dav", -1, -1, "", methodObject ),
|
||||
new Among ( "sv", -1, -1, "", methodObject ),
|
||||
new Among ( "\u0161v", -1, -1, "", methodObject ),
|
||||
new Among ( "yk\u0161\u010D", -1, -1, "", methodObject ),
|
||||
new Among ( "\u0119", -1, -1, "", methodObject ),
|
||||
new Among ( "\u0117j\u0119", 60, -1, "", methodObject )
|
||||
};
|
||||
|
||||
private final static Among a_2[] = {
|
||||
new Among ( "ojime", -1, 9, "", methodObject ),
|
||||
new Among ( "\u0117jime", -1, 5, "", methodObject ),
|
||||
new Among ( "avime", -1, 8, "", methodObject ),
|
||||
new Among ( "okate", -1, 11, "", methodObject ),
|
||||
new Among ( "aite", -1, 1, "", methodObject ),
|
||||
new Among ( "uote", -1, 4, "", methodObject ),
|
||||
new Among ( "asius", -1, 7, "", methodObject ),
|
||||
new Among ( "okat\u0117s", -1, 10, "", methodObject ),
|
||||
new Among ( "ait\u0117s", -1, 2, "", methodObject ),
|
||||
new Among ( "uot\u0117s", -1, 3, "", methodObject ),
|
||||
new Among ( "esiu", -1, 6, "", methodObject )
|
||||
};
|
||||
|
||||
private final static Among a_3[] = {
|
||||
new Among ( "\u010D", -1, 1, "", methodObject ),
|
||||
new Among ( "d\u017E", -1, 2, "", methodObject )
|
||||
};
|
||||
|
||||
private final static Among a_4[] = {
|
||||
new Among ( "gd", -1, 1, "", methodObject )
|
||||
};
|
||||
|
||||
// Bit table handed to in_grouping()/out_grouping() in stem(), always together
// with the bounds 97 and 371.
// NOTE(review): presumably the Snowball encoding of the Lithuanian vowel
// grouping for code points 97..371 -- confirm against the .sbl source.
private static final char g_v[] = {17, 65, 16, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 16, 0, 64, 1, 0, 64, 0, 0, 0, 0, 0, 0, 0, 4, 4 };
|
||||
|
||||
private boolean B_CHANGE;
|
||||
private int I_s;
|
||||
private int I_p2;
|
||||
private int I_p1;
|
||||
|
||||
private void copy_from(LithuanianStemmer other) {
|
||||
B_CHANGE = other.B_CHANGE;
|
||||
I_s = other.I_s;
|
||||
I_p2 = other.I_p2;
|
||||
I_p1 = other.I_p1;
|
||||
super.copy_from(other);
|
||||
}
|
||||
|
||||
private boolean r_R1() {
|
||||
if (!(I_p1 <= cursor))
|
||||
{
|
||||
return false;
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
private boolean r_step1() {
|
||||
int v_1;
|
||||
int v_2;
|
||||
// (, line 48
|
||||
// setlimit, line 49
|
||||
v_1 = limit - cursor;
|
||||
// tomark, line 49
|
||||
if (cursor < I_p1)
|
||||
{
|
||||
return false;
|
||||
}
|
||||
cursor = I_p1;
|
||||
v_2 = limit_backward;
|
||||
limit_backward = cursor;
|
||||
cursor = limit - v_1;
|
||||
// (, line 49
|
||||
// [, line 49
|
||||
ket = cursor;
|
||||
// substring, line 49
|
||||
if (find_among_b(a_0, 206) == 0)
|
||||
{
|
||||
limit_backward = v_2;
|
||||
return false;
|
||||
}
|
||||
// ], line 49
|
||||
bra = cursor;
|
||||
limit_backward = v_2;
|
||||
// call R1, line 49
|
||||
if (!r_R1())
|
||||
{
|
||||
return false;
|
||||
}
|
||||
// delete, line 235
|
||||
slice_del();
|
||||
return true;
|
||||
}
|
||||
|
||||
private boolean r_step2() {
|
||||
int v_1;
|
||||
int v_2;
|
||||
int v_3;
|
||||
// repeat, line 238
|
||||
replab0: while(true)
|
||||
{
|
||||
v_1 = limit - cursor;
|
||||
lab1: do {
|
||||
// (, line 238
|
||||
// setlimit, line 239
|
||||
v_2 = limit - cursor;
|
||||
// tomark, line 239
|
||||
if (cursor < I_p1)
|
||||
{
|
||||
break lab1;
|
||||
}
|
||||
cursor = I_p1;
|
||||
v_3 = limit_backward;
|
||||
limit_backward = cursor;
|
||||
cursor = limit - v_2;
|
||||
// (, line 239
|
||||
// [, line 239
|
||||
ket = cursor;
|
||||
// substring, line 239
|
||||
if (find_among_b(a_1, 62) == 0)
|
||||
{
|
||||
limit_backward = v_3;
|
||||
break lab1;
|
||||
}
|
||||
// ], line 239
|
||||
bra = cursor;
|
||||
limit_backward = v_3;
|
||||
// delete, line 309
|
||||
slice_del();
|
||||
continue replab0;
|
||||
} while (false);
|
||||
cursor = limit - v_1;
|
||||
break replab0;
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
private boolean r_fix_conflicts() {
|
||||
int among_var;
|
||||
// (, line 312
|
||||
// [, line 313
|
||||
ket = cursor;
|
||||
// substring, line 313
|
||||
among_var = find_among_b(a_2, 11);
|
||||
if (among_var == 0)
|
||||
{
|
||||
return false;
|
||||
}
|
||||
// ], line 313
|
||||
bra = cursor;
|
||||
switch(among_var) {
|
||||
case 0:
|
||||
return false;
|
||||
case 1:
|
||||
// (, line 315
|
||||
// <-, line 315
|
||||
slice_from("ait\u0117");
|
||||
// set CHANGE, line 315
|
||||
B_CHANGE = true;
|
||||
break;
|
||||
case 2:
|
||||
// (, line 317
|
||||
// <-, line 317
|
||||
slice_from("ait\u0117");
|
||||
// set CHANGE, line 317
|
||||
B_CHANGE = true;
|
||||
break;
|
||||
case 3:
|
||||
// (, line 320
|
||||
// <-, line 320
|
||||
slice_from("uot\u0117");
|
||||
// set CHANGE, line 320
|
||||
B_CHANGE = true;
|
||||
break;
|
||||
case 4:
|
||||
// (, line 322
|
||||
// <-, line 322
|
||||
slice_from("uot\u0117");
|
||||
// set CHANGE, line 322
|
||||
B_CHANGE = true;
|
||||
break;
|
||||
case 5:
|
||||
// (, line 325
|
||||
// <-, line 325
|
||||
slice_from("\u0117jimas");
|
||||
// set CHANGE, line 325
|
||||
B_CHANGE = true;
|
||||
break;
|
||||
case 6:
|
||||
// (, line 328
|
||||
// <-, line 328
|
||||
slice_from("esys");
|
||||
// set CHANGE, line 328
|
||||
B_CHANGE = true;
|
||||
break;
|
||||
case 7:
|
||||
// (, line 330
|
||||
// <-, line 330
|
||||
slice_from("asys");
|
||||
// set CHANGE, line 330
|
||||
B_CHANGE = true;
|
||||
break;
|
||||
case 8:
|
||||
// (, line 334
|
||||
// <-, line 334
|
||||
slice_from("avimas");
|
||||
// set CHANGE, line 334
|
||||
B_CHANGE = true;
|
||||
break;
|
||||
case 9:
|
||||
// (, line 335
|
||||
// <-, line 335
|
||||
slice_from("ojimas");
|
||||
// set CHANGE, line 335
|
||||
B_CHANGE = true;
|
||||
break;
|
||||
case 10:
|
||||
// (, line 338
|
||||
// <-, line 338
|
||||
slice_from("okat\u0117");
|
||||
// set CHANGE, line 338
|
||||
B_CHANGE = true;
|
||||
break;
|
||||
case 11:
|
||||
// (, line 340
|
||||
// <-, line 340
|
||||
slice_from("okat\u0117");
|
||||
// set CHANGE, line 340
|
||||
B_CHANGE = true;
|
||||
break;
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
private boolean r_fix_chdz() {
|
||||
int among_var;
|
||||
// (, line 346
|
||||
// [, line 347
|
||||
ket = cursor;
|
||||
// substring, line 347
|
||||
among_var = find_among_b(a_3, 2);
|
||||
if (among_var == 0)
|
||||
{
|
||||
return false;
|
||||
}
|
||||
// ], line 347
|
||||
bra = cursor;
|
||||
switch(among_var) {
|
||||
case 0:
|
||||
return false;
|
||||
case 1:
|
||||
// (, line 348
|
||||
// <-, line 348
|
||||
slice_from("t");
|
||||
// set CHANGE, line 348
|
||||
B_CHANGE = true;
|
||||
break;
|
||||
case 2:
|
||||
// (, line 349
|
||||
// <-, line 349
|
||||
slice_from("d");
|
||||
// set CHANGE, line 349
|
||||
B_CHANGE = true;
|
||||
break;
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
private boolean r_fix_gd() {
|
||||
int among_var;
|
||||
// (, line 353
|
||||
// [, line 354
|
||||
ket = cursor;
|
||||
// substring, line 354
|
||||
among_var = find_among_b(a_4, 1);
|
||||
if (among_var == 0)
|
||||
{
|
||||
return false;
|
||||
}
|
||||
// ], line 354
|
||||
bra = cursor;
|
||||
switch(among_var) {
|
||||
case 0:
|
||||
return false;
|
||||
case 1:
|
||||
// (, line 355
|
||||
// <-, line 355
|
||||
slice_from("g");
|
||||
// set CHANGE, line 355
|
||||
B_CHANGE = true;
|
||||
break;
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
public boolean stem() {
|
||||
int v_1;
|
||||
int v_2;
|
||||
int v_3;
|
||||
int v_8;
|
||||
int v_9;
|
||||
int v_10;
|
||||
int v_11;
|
||||
int v_12;
|
||||
int v_13;
|
||||
// (, line 362
|
||||
I_p1 = limit;
|
||||
I_p2 = limit;
|
||||
I_s = (getCurrent().length());
|
||||
// do, line 368
|
||||
v_1 = cursor;
|
||||
lab0: do {
|
||||
// (, line 368
|
||||
// try, line 370
|
||||
v_2 = cursor;
|
||||
lab1: do {
|
||||
// (, line 370
|
||||
// test, line 370
|
||||
v_3 = cursor;
|
||||
// literal, line 370
|
||||
if (!(eq_s(1, "a")))
|
||||
{
|
||||
cursor = v_2;
|
||||
break lab1;
|
||||
}
|
||||
cursor = v_3;
|
||||
if (!(I_s > 6))
|
||||
{
|
||||
cursor = v_2;
|
||||
break lab1;
|
||||
}
|
||||
// hop, line 370
|
||||
{
|
||||
int c = cursor + 1;
|
||||
if (0 > c || c > limit)
|
||||
{
|
||||
cursor = v_2;
|
||||
break lab1;
|
||||
}
|
||||
cursor = c;
|
||||
}
|
||||
} while (false);
|
||||
// gopast, line 372
|
||||
golab2: while(true)
|
||||
{
|
||||
lab3: do {
|
||||
if (!(in_grouping(g_v, 97, 371)))
|
||||
{
|
||||
break lab3;
|
||||
}
|
||||
break golab2;
|
||||
} while (false);
|
||||
if (cursor >= limit)
|
||||
{
|
||||
break lab0;
|
||||
}
|
||||
cursor++;
|
||||
}
|
||||
// gopast, line 372
|
||||
golab4: while(true)
|
||||
{
|
||||
lab5: do {
|
||||
if (!(out_grouping(g_v, 97, 371)))
|
||||
{
|
||||
break lab5;
|
||||
}
|
||||
break golab4;
|
||||
} while (false);
|
||||
if (cursor >= limit)
|
||||
{
|
||||
break lab0;
|
||||
}
|
||||
cursor++;
|
||||
}
|
||||
// setmark p1, line 372
|
||||
I_p1 = cursor;
|
||||
// gopast, line 373
|
||||
golab6: while(true)
|
||||
{
|
||||
lab7: do {
|
||||
if (!(in_grouping(g_v, 97, 371)))
|
||||
{
|
||||
break lab7;
|
||||
}
|
||||
break golab6;
|
||||
} while (false);
|
||||
if (cursor >= limit)
|
||||
{
|
||||
break lab0;
|
||||
}
|
||||
cursor++;
|
||||
}
|
||||
// gopast, line 373
|
||||
golab8: while(true)
|
||||
{
|
||||
lab9: do {
|
||||
if (!(out_grouping(g_v, 97, 371)))
|
||||
{
|
||||
break lab9;
|
||||
}
|
||||
break golab8;
|
||||
} while (false);
|
||||
if (cursor >= limit)
|
||||
{
|
||||
break lab0;
|
||||
}
|
||||
cursor++;
|
||||
}
|
||||
// setmark p2, line 373
|
||||
I_p2 = cursor;
|
||||
} while (false);
|
||||
cursor = v_1;
|
||||
// backwards, line 377
|
||||
limit_backward = cursor; cursor = limit;
|
||||
// (, line 377
|
||||
// do, line 378
|
||||
v_8 = limit - cursor;
|
||||
lab10: do {
|
||||
// call fix_conflicts, line 378
|
||||
if (!r_fix_conflicts())
|
||||
{
|
||||
break lab10;
|
||||
}
|
||||
} while (false);
|
||||
cursor = limit - v_8;
|
||||
// do, line 379
|
||||
v_9 = limit - cursor;
|
||||
lab11: do {
|
||||
// call step1, line 379
|
||||
if (!r_step1())
|
||||
{
|
||||
break lab11;
|
||||
}
|
||||
} while (false);
|
||||
cursor = limit - v_9;
|
||||
// do, line 380
|
||||
v_10 = limit - cursor;
|
||||
lab12: do {
|
||||
// call fix_chdz, line 380
|
||||
if (!r_fix_chdz())
|
||||
{
|
||||
break lab12;
|
||||
}
|
||||
} while (false);
|
||||
cursor = limit - v_10;
|
||||
// do, line 381
|
||||
v_11 = limit - cursor;
|
||||
lab13: do {
|
||||
// call step2, line 381
|
||||
if (!r_step2())
|
||||
{
|
||||
break lab13;
|
||||
}
|
||||
} while (false);
|
||||
cursor = limit - v_11;
|
||||
// do, line 382
|
||||
v_12 = limit - cursor;
|
||||
lab14: do {
|
||||
// call fix_chdz, line 382
|
||||
if (!r_fix_chdz())
|
||||
{
|
||||
break lab14;
|
||||
}
|
||||
} while (false);
|
||||
cursor = limit - v_12;
|
||||
// do, line 383
|
||||
v_13 = limit - cursor;
|
||||
lab15: do {
|
||||
// call fix_gd, line 383
|
||||
if (!r_fix_gd())
|
||||
{
|
||||
break lab15;
|
||||
}
|
||||
} while (false);
|
||||
cursor = limit - v_13;
|
||||
cursor = limit_backward; return true;
|
||||
}
|
||||
|
||||
public boolean equals( Object o ) {
|
||||
return o instanceof LithuanianStemmer;
|
||||
}
|
||||
|
||||
public int hashCode() {
|
||||
return LithuanianStemmer.class.getName().hashCode();
|
||||
}
|
||||
|
||||
|
||||
|
||||
}
|
|
@ -0,0 +1,126 @@
|
|||
# Lithuanian stopwords list
|
||||
ant
|
||||
apie
|
||||
ar
|
||||
arba
|
||||
aš
|
||||
be
|
||||
bei
|
||||
bet
|
||||
bus
|
||||
būti
|
||||
būtų
|
||||
buvo
|
||||
dėl
|
||||
gali
|
||||
į
|
||||
iki
|
||||
ir
|
||||
iš
|
||||
ja
|
||||
ją
|
||||
jai
|
||||
jais
|
||||
jam
|
||||
jame
|
||||
jas
|
||||
jei
|
||||
ji
|
||||
jį
|
||||
jie
|
||||
jiedu
|
||||
jiedvi
|
||||
jiedviem
|
||||
jiedviese
|
||||
jiems
|
||||
jis
|
||||
jo
|
||||
jodviem
|
||||
jog
|
||||
joje
|
||||
jomis
|
||||
joms
|
||||
jos
|
||||
jose
|
||||
jų
|
||||
judu
|
||||
judvi
|
||||
judviejų
|
||||
jųdviejų
|
||||
judviem
|
||||
judviese
|
||||
jumis
|
||||
jums
|
||||
jumyse
|
||||
juo
|
||||
juodu
|
||||
juodviese
|
||||
juos
|
||||
juose
|
||||
jus
|
||||
jūs
|
||||
jūsų
|
||||
ką
|
||||
kad
|
||||
kai
|
||||
kaip
|
||||
kas
|
||||
kiek
|
||||
kol
|
||||
kur
|
||||
kurie
|
||||
kuris
|
||||
man
|
||||
mane
|
||||
manęs
|
||||
manimi
|
||||
mano
|
||||
manyje
|
||||
mes
|
||||
metu
|
||||
mudu
|
||||
mudvi
|
||||
mudviejų
|
||||
mudviem
|
||||
mudviese
|
||||
mumis
|
||||
mums
|
||||
mumyse
|
||||
mus
|
||||
mūsų
|
||||
nei
|
||||
nes
|
||||
net
|
||||
nors
|
||||
nuo
|
||||
o
|
||||
pat
|
||||
per
|
||||
po
|
||||
prie
|
||||
prieš
|
||||
sau
|
||||
save
|
||||
savęs
|
||||
savimi
|
||||
savo
|
||||
savyje
|
||||
su
|
||||
tačiau
|
||||
tada
|
||||
tai
|
||||
taip
|
||||
tas
|
||||
tau
|
||||
tave
|
||||
tavęs
|
||||
tavimi
|
||||
tavyje
|
||||
ten
|
||||
to
|
||||
todėl
|
||||
tu
|
||||
tuo
|
||||
už
|
||||
visi
|
||||
yra
|
|
@ -0,0 +1,53 @@
|
|||
package org.apache.lucene.analysis.lt;
|
||||
|
||||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
import java.io.IOException;
|
||||
|
||||
import org.apache.lucene.analysis.Analyzer;
|
||||
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
|
||||
import org.apache.lucene.analysis.util.CharArraySet;
|
||||
|
||||
public class TestLithuanianAnalyzer extends BaseTokenStreamTestCase {
|
||||
|
||||
/** This test fails with NPE when the
|
||||
* stopwords file is missing in classpath */
|
||||
public void testResourcesAvailable() {
|
||||
new LithuanianAnalyzer().close();
|
||||
}
|
||||
|
||||
/** Test stopword removal */
|
||||
public void testStopWord() throws Exception {
|
||||
Analyzer a = new LithuanianAnalyzer();
|
||||
assertAnalyzesTo(a, "man",
|
||||
new String[] { });
|
||||
}
|
||||
|
||||
/** Test stemmer exceptions */
|
||||
public void testStemExclusion() throws IOException{
|
||||
CharArraySet set = new CharArraySet(1, true);
|
||||
set.add("vaikų");
|
||||
Analyzer a = new LithuanianAnalyzer(CharArraySet.EMPTY_SET, set);
|
||||
assertAnalyzesTo(a, "vaikų", new String[] {"vaikų"});
|
||||
}
|
||||
|
||||
/** blast some random strings through the analyzer */
|
||||
public void testRandomStrings() throws Exception {
|
||||
checkRandomData(random(), new LithuanianAnalyzer(), 1000*RANDOM_MULTIPLIER);
|
||||
}
|
||||
}
|
|
@ -0,0 +1,481 @@
|
|||
package org.apache.lucene.analysis.lt;
|
||||
|
||||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
import java.io.IOException;
|
||||
|
||||
import org.apache.lucene.analysis.Analyzer;
|
||||
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
|
||||
import org.apache.lucene.analysis.MockTokenizer;
|
||||
import org.apache.lucene.analysis.Tokenizer;
|
||||
import org.apache.lucene.analysis.snowball.SnowballFilter;
|
||||
import org.tartarus.snowball.ext.LithuanianStemmer;
|
||||
|
||||
/**
|
||||
* Basic tests for {@link LithuanianStemmer}.
|
||||
* We test some n/adj templates from wikipedia and some high frequency
|
||||
* terms from mixed corpora.
|
||||
*/
|
||||
public class TestLithuanianStemming extends BaseTokenStreamTestCase {
|
||||
private Analyzer a;
|
||||
|
||||
@Override
|
||||
public void setUp() throws Exception {
|
||||
super.setUp();
|
||||
a = new Analyzer() {
|
||||
@Override
|
||||
protected TokenStreamComponents createComponents(String fieldName) {
|
||||
Tokenizer tokenizer = new MockTokenizer(MockTokenizer.WHITESPACE, false);
|
||||
return new TokenStreamComponents(tokenizer, new SnowballFilter(tokenizer, new LithuanianStemmer()));
|
||||
}
|
||||
};
|
||||
}
|
||||
|
||||
@Override
|
||||
public void tearDown() throws Exception {
|
||||
a.close();
|
||||
super.tearDown();
|
||||
}
|
||||
|
||||
public void testNounsI() throws IOException {
|
||||
// n. decl. I (-as)
|
||||
checkOneTerm(a, "vaikas", "vaik"); // nom. sing.
|
||||
checkOneTerm(a, "vaikai", "vaik"); // nom. pl.
|
||||
checkOneTerm(a, "vaiko", "vaik"); // gen. sg.
|
||||
checkOneTerm(a, "vaikų", "vaik"); // gen. pl.
|
||||
checkOneTerm(a, "vaikui", "vaik"); // dat. sg.
|
||||
checkOneTerm(a, "vaikams", "vaik"); // dat. pl.
|
||||
checkOneTerm(a, "vaiką", "vaik"); // acc. sg.
|
||||
checkOneTerm(a, "vaikus", "vaik"); // acc. pl.
|
||||
checkOneTerm(a, "vaiku", "vaik"); // ins. sg.
|
||||
checkOneTerm(a, "vaikais", "vaik"); // ins. pl.
|
||||
checkOneTerm(a, "vaike", "vaik"); // loc. sg.
|
||||
checkOneTerm(a, "vaikuose", "vaik"); // loc. pl.
|
||||
checkOneTerm(a, "vaike", "vaik"); // voc. sg.
|
||||
checkOneTerm(a, "vaikai", "vaik"); // voc. pl.
|
||||
|
||||
// n. decl. I (-is)
|
||||
checkOneTerm(a, "brolis", "brol"); // nom. sing.
|
||||
checkOneTerm(a, "broliai", "brol"); // nom. pl.
|
||||
checkOneTerm(a, "brolio", "brol"); // gen. sg.
|
||||
checkOneTerm(a, "brolių", "brol"); // gen. pl.
|
||||
checkOneTerm(a, "broliui", "brol"); // dat. sg.
|
||||
checkOneTerm(a, "broliams", "brol"); // dat. pl.
|
||||
checkOneTerm(a, "brolį", "brol"); // acc. sg.
|
||||
checkOneTerm(a, "brolius", "brol"); // acc. pl.
|
||||
checkOneTerm(a, "broliu", "brol"); // ins. sg.
|
||||
checkOneTerm(a, "broliais", "brol"); // ins. pl.
|
||||
checkOneTerm(a, "brolyje", "brol"); // loc. sg.
|
||||
checkOneTerm(a, "broliuose", "brol"); // loc. pl.
|
||||
checkOneTerm(a, "broli", "brol"); // voc. sg.
|
||||
checkOneTerm(a, "broliai", "brol"); // voc. pl.
|
||||
|
||||
// n. decl. I (-ys)
|
||||
// note: some forms don't conflate
|
||||
checkOneTerm(a, "arklys", "arkl"); // nom. sing.
|
||||
checkOneTerm(a, "arkliai", "arkliai"); // nom. pl.
|
||||
checkOneTerm(a, "arklio", "arkl"); // gen. sg.
|
||||
checkOneTerm(a, "arklių", "arkl"); // gen. pl.
|
||||
checkOneTerm(a, "arkliui", "arkliui"); // dat. sg.
|
||||
checkOneTerm(a, "arkliams", "arkliam"); // dat. pl.
|
||||
checkOneTerm(a, "arklį", "arkl"); // acc. sg.
|
||||
checkOneTerm(a, "arklius", "arklius"); // acc. pl.
|
||||
checkOneTerm(a, "arkliu", "arkl"); // ins. sg.
|
||||
checkOneTerm(a, "arkliais", "arkliais"); // ins. pl.
|
||||
checkOneTerm(a, "arklyje", "arklyj"); // loc. sg.
|
||||
checkOneTerm(a, "arkliuose", "arkliuos"); // loc. pl.
|
||||
checkOneTerm(a, "arkly", "arkl"); // voc. sg.
|
||||
checkOneTerm(a, "arkliai", "arkliai"); // voc. pl.
|
||||
}
|
||||
|
||||
public void testNounsII() throws IOException {
|
||||
// n. decl II (-a)
|
||||
checkOneTerm(a, "motina", "motin"); // nom. sing.
|
||||
checkOneTerm(a, "motinos", "motin"); // nom. pl.
|
||||
checkOneTerm(a, "motinos", "motin"); // gen. sg.
|
||||
checkOneTerm(a, "motinų", "motin"); // gen. pl.
|
||||
checkOneTerm(a, "motinai", "motin"); // dat. sg.
|
||||
checkOneTerm(a, "motinoms", "motin"); // dat. pl.
|
||||
checkOneTerm(a, "motiną", "motin"); // acc. sg.
|
||||
checkOneTerm(a, "motinas", "motin"); // acc. pl.
|
||||
checkOneTerm(a, "motina", "motin"); // ins. sg.
|
||||
checkOneTerm(a, "motinomis", "motin"); // ins. pl.
|
||||
checkOneTerm(a, "motinoje", "motin"); // loc. sg.
|
||||
checkOneTerm(a, "motinose", "motin"); // loc. pl.
|
||||
checkOneTerm(a, "motina", "motin"); // voc. sg.
|
||||
checkOneTerm(a, "motinos", "motin"); // voc. pl.
|
||||
|
||||
// n. decl II (-ė)
|
||||
checkOneTerm(a, "katė", "kat"); // nom. sing.
|
||||
checkOneTerm(a, "katės", "kat"); // nom. pl.
|
||||
checkOneTerm(a, "katės", "kat"); // gen. sg.
|
||||
checkOneTerm(a, "kačių", "kat"); // gen. pl.
|
||||
checkOneTerm(a, "katei", "kat"); // dat. sg.
|
||||
checkOneTerm(a, "katėms", "kat"); // dat. pl.
|
||||
checkOneTerm(a, "katę", "kat"); // acc. sg.
|
||||
checkOneTerm(a, "kates", "kat"); // acc. pl.
|
||||
checkOneTerm(a, "kate", "kat"); // ins. sg.
|
||||
checkOneTerm(a, "katėmis", "kat"); // ins. pl.
|
||||
checkOneTerm(a, "katėje", "kat"); // loc. sg.
|
||||
checkOneTerm(a, "katėse", "kat"); // loc. pl.
|
||||
checkOneTerm(a, "kate", "kat"); // voc. sg.
|
||||
checkOneTerm(a, "katės", "kat"); // voc. pl.
|
||||
|
||||
// n. decl II (-ti)
|
||||
checkOneTerm(a, "pati", "pat"); // nom. sing.
|
||||
checkOneTerm(a, "pačios", "pat"); // nom. pl.
|
||||
checkOneTerm(a, "pačios", "pat"); // gen. sg.
|
||||
checkOneTerm(a, "pačių", "pat"); // gen. pl.
|
||||
checkOneTerm(a, "pačiai", "pat"); // dat. sg.
|
||||
checkOneTerm(a, "pačioms", "pat"); // dat. pl.
|
||||
checkOneTerm(a, "pačią", "pat"); // acc. sg.
|
||||
checkOneTerm(a, "pačias", "pat"); // acc. pl.
|
||||
checkOneTerm(a, "pačia", "pat"); // ins. sg.
|
||||
checkOneTerm(a, "pačiomis", "pat"); // ins. pl.
|
||||
checkOneTerm(a, "pačioje", "pat"); // loc. sg.
|
||||
checkOneTerm(a, "pačiose", "pat"); // loc. pl.
|
||||
checkOneTerm(a, "pati", "pat"); // voc. sg.
|
||||
checkOneTerm(a, "pačios", "pat"); // voc. pl.
|
||||
}
|
||||
|
||||
public void testNounsIII() throws IOException {
|
||||
// n. decl III-m
|
||||
checkOneTerm(a, "vagis", "vag"); // nom. sing.
|
||||
checkOneTerm(a, "vagys", "vag"); // nom. pl.
|
||||
checkOneTerm(a, "vagies", "vag"); // gen. sg.
|
||||
checkOneTerm(a, "vagių", "vag"); // gen. pl.
|
||||
checkOneTerm(a, "vagiui", "vag"); // dat. sg.
|
||||
checkOneTerm(a, "vagims", "vag"); // dat. pl.
|
||||
checkOneTerm(a, "vagį", "vag"); // acc. sg.
|
||||
checkOneTerm(a, "vagis", "vag"); // acc. pl.
|
||||
checkOneTerm(a, "vagimi", "vag"); // ins. sg.
|
||||
checkOneTerm(a, "vagimis", "vag"); // ins. pl.
|
||||
checkOneTerm(a, "vagyje", "vag"); // loc. sg.
|
||||
checkOneTerm(a, "vagyse", "vag"); // loc. pl.
|
||||
checkOneTerm(a, "vagie", "vag"); // voc. sg.
|
||||
checkOneTerm(a, "vagys", "vag"); // voc. pl.
|
||||
|
||||
// n. decl III-f
|
||||
checkOneTerm(a, "akis", "ak"); // nom. sing.
|
||||
checkOneTerm(a, "akys", "ak"); // nom. pl.
|
||||
checkOneTerm(a, "akies", "ak"); // gen. sg.
|
||||
checkOneTerm(a, "akių", "ak"); // gen. pl.
|
||||
checkOneTerm(a, "akiai", "ak"); // dat. sg.
|
||||
checkOneTerm(a, "akims", "ak"); // dat. pl.
|
||||
checkOneTerm(a, "akį", "ak"); // acc. sg.
|
||||
checkOneTerm(a, "akis", "ak"); // acc. pl.
|
||||
checkOneTerm(a, "akimi", "ak"); // ins. sg.
|
||||
checkOneTerm(a, "akimis", "ak"); // ins. pl.
|
||||
checkOneTerm(a, "akyje", "ak"); // loc. sg.
|
||||
checkOneTerm(a, "akyse", "ak"); // loc. pl.
|
||||
checkOneTerm(a, "akie", "ak"); // voc. sg.
|
||||
checkOneTerm(a, "akys", "ak"); // voc. pl.
|
||||
}
|
||||
|
||||
public void testNounsIV() throws IOException {
|
||||
// n. decl IV (-us)
|
||||
checkOneTerm(a, "sūnus", "sūn"); // nom. sing.
|
||||
checkOneTerm(a, "sūnūs", "sūn"); // nom. pl.
|
||||
checkOneTerm(a, "sūnaus", "sūn"); // gen. sg.
|
||||
checkOneTerm(a, "sūnų", "sūn"); // gen. pl.
|
||||
checkOneTerm(a, "sūnui", "sūn"); // dat. sg.
|
||||
checkOneTerm(a, "sūnums", "sūn"); // dat. pl.
|
||||
checkOneTerm(a, "sūnų", "sūn"); // acc. sg.
|
||||
checkOneTerm(a, "sūnus", "sūn"); // acc. pl.
|
||||
checkOneTerm(a, "sūnumi", "sūn"); // ins. sg.
|
||||
checkOneTerm(a, "sūnumis", "sūn"); // ins. pl.
|
||||
checkOneTerm(a, "sūnuje", "sūn"); // loc. sg.
|
||||
checkOneTerm(a, "sūnuose", "sūn"); // loc. pl.
|
||||
checkOneTerm(a, "sūnau", "sūn"); // voc. sg.
|
||||
checkOneTerm(a, "sūnūs", "sūn"); // voc. pl.
|
||||
|
||||
// n. decl IV (-ius)
|
||||
checkOneTerm(a, "profesorius", "profesor"); // nom. sing.
|
||||
checkOneTerm(a, "profesoriai", "profesor"); // nom. pl.
|
||||
checkOneTerm(a, "profesoriaus", "profesor"); // gen. sg.
|
||||
checkOneTerm(a, "profesorių", "profesor"); // gen. pl.
|
||||
checkOneTerm(a, "profesoriui", "profesor"); // dat. sg.
|
||||
checkOneTerm(a, "profesoriams", "profesor"); // dat. pl.
|
||||
checkOneTerm(a, "profesorių", "profesor"); // acc. sg.
|
||||
checkOneTerm(a, "profesorius", "profesor"); // acc. pl.
|
||||
checkOneTerm(a, "profesoriumi", "profesor"); // ins. sg.
|
||||
checkOneTerm(a, "profesoriais", "profesor"); // ins. pl.
|
||||
checkOneTerm(a, "profesoriuje", "profesor"); // loc. sg.
|
||||
checkOneTerm(a, "profesoriuose", "profesor"); // loc. pl.
|
||||
checkOneTerm(a, "profesoriau", "profesor"); // voc. sg.
|
||||
checkOneTerm(a, "profesoriai", "profesor"); // voc. pl.
|
||||
}
|
||||
|
||||
public void testNounsV() throws IOException {
|
||||
// n. decl V
|
||||
// note: gen.pl. doesn't conflate
|
||||
checkOneTerm(a, "vanduo", "vand"); // nom. sing.
|
||||
checkOneTerm(a, "vandenys", "vand"); // nom. pl.
|
||||
checkOneTerm(a, "vandens", "vand"); // gen. sg.
|
||||
checkOneTerm(a, "vandenų", "vanden"); // gen. pl.
|
||||
checkOneTerm(a, "vandeniui", "vand"); // dat. sg.
|
||||
checkOneTerm(a, "vandenims", "vand"); // dat. pl.
|
||||
checkOneTerm(a, "vandenį", "vand"); // acc. sg.
|
||||
checkOneTerm(a, "vandenis", "vand"); // acc. pl.
|
||||
checkOneTerm(a, "vandeniu", "vand"); // ins. sg.
|
||||
checkOneTerm(a, "vandenimis", "vand"); // ins. pl.
|
||||
checkOneTerm(a, "vandenyje", "vand"); // loc. sg.
|
||||
checkOneTerm(a, "vandenyse", "vand"); // loc. pl.
|
||||
checkOneTerm(a, "vandenie", "vand"); // voc. sg.
|
||||
checkOneTerm(a, "vandenys", "vand"); // voc. pl.
|
||||
}
|
||||
|
||||
public void testAdjI() throws IOException {
|
||||
// adj. decl I
|
||||
checkOneTerm(a, "geras", "ger"); // m. nom. sing.
|
||||
checkOneTerm(a, "geri", "ger"); // m. nom. pl.
|
||||
checkOneTerm(a, "gero", "ger"); // m. gen. sg.
|
||||
checkOneTerm(a, "gerų", "ger"); // m. gen. pl.
|
||||
checkOneTerm(a, "geram", "ger"); // m. dat. sg.
|
||||
checkOneTerm(a, "geriems", "ger"); // m. dat. pl.
|
||||
checkOneTerm(a, "gerą", "ger"); // m. acc. sg.
|
||||
checkOneTerm(a, "gerus", "ger"); // m. acc. pl.
|
||||
checkOneTerm(a, "geru", "ger"); // m. ins. sg.
|
||||
checkOneTerm(a, "gerais", "ger"); // m. ins. pl.
|
||||
checkOneTerm(a, "gerame", "ger"); // m. loc. sg.
|
||||
checkOneTerm(a, "geruose", "ger"); // m. loc. pl.
|
||||
|
||||
checkOneTerm(a, "gera", "ger"); // f. nom. sing.
|
||||
checkOneTerm(a, "geros", "ger"); // f. nom. pl.
|
||||
checkOneTerm(a, "geros", "ger"); // f. gen. sg.
|
||||
checkOneTerm(a, "gerų", "ger"); // f. gen. pl.
|
||||
checkOneTerm(a, "gerai", "ger"); // f. dat. sg.
|
||||
checkOneTerm(a, "geroms", "ger"); // f. dat. pl.
|
||||
checkOneTerm(a, "gerą", "ger"); // f. acc. sg.
|
||||
checkOneTerm(a, "geras", "ger"); // f. acc. pl.
|
||||
checkOneTerm(a, "gera", "ger"); // f. ins. sg.
|
||||
checkOneTerm(a, "geromis", "ger"); // f. ins. pl.
|
||||
checkOneTerm(a, "geroje", "ger"); // f. loc. sg.
|
||||
checkOneTerm(a, "gerose", "ger"); // f. loc. pl.
|
||||
}
|
||||
|
||||
public void testAdjII() throws IOException {
|
||||
// adj. decl II
|
||||
checkOneTerm(a, "gražus", "graž"); // m. nom. sing.
|
||||
checkOneTerm(a, "gražūs", "graž"); // m. nom. pl.
|
||||
checkOneTerm(a, "gražaus", "graž"); // m. gen. sg.
|
||||
checkOneTerm(a, "gražių", "graž"); // m. gen. pl.
|
||||
checkOneTerm(a, "gražiam", "graž"); // m. dat. sg.
|
||||
checkOneTerm(a, "gražiems", "graž"); // m. dat. pl.
|
||||
checkOneTerm(a, "gražų", "graž"); // m. acc. sg.
|
||||
checkOneTerm(a, "gražius", "graž"); // m. acc. pl.
|
||||
checkOneTerm(a, "gražiu", "graž"); // m. ins. sg.
|
||||
checkOneTerm(a, "gražiais", "graž"); // m. ins. pl.
|
||||
checkOneTerm(a, "gražiame", "graž"); // m. loc. sg.
|
||||
checkOneTerm(a, "gražiuose", "graž"); // m. loc. pl.
|
||||
|
||||
checkOneTerm(a, "graži", "graž"); // f. nom. sing.
|
||||
checkOneTerm(a, "gražios", "graž"); // f. nom. pl.
|
||||
checkOneTerm(a, "gražios", "graž"); // f. gen. sg.
|
||||
checkOneTerm(a, "gražių", "graž"); // f. gen. pl.
|
||||
checkOneTerm(a, "gražiai", "graž"); // f. dat. sg.
|
||||
checkOneTerm(a, "gražioms", "graž"); // f. dat. pl.
|
||||
checkOneTerm(a, "gražią", "graž"); // f. acc. sg.
|
||||
checkOneTerm(a, "gražias", "graž"); // f. acc. pl.
|
||||
checkOneTerm(a, "gražia", "graž"); // f. ins. sg.
|
||||
checkOneTerm(a, "gražiomis", "graž"); // f. ins. pl.
|
||||
checkOneTerm(a, "gražioje", "graž"); // f. loc. sg.
|
||||
checkOneTerm(a, "gražiose", "graž"); // f. loc. pl.
|
||||
}
|
||||
|
||||
public void testAdjIII() throws IOException {
|
||||
// adj. decl III
|
||||
checkOneTerm(a, "vidutinis", "vidutin"); // m. nom. sing.
|
||||
checkOneTerm(a, "vidutiniai", "vidutin"); // m. nom. pl.
|
||||
checkOneTerm(a, "vidutinio", "vidutin"); // m. gen. sg.
|
||||
checkOneTerm(a, "vidutinių", "vidutin"); // m. gen. pl.
|
||||
checkOneTerm(a, "vidutiniam", "vidutin"); // m. dat. sg.
|
||||
checkOneTerm(a, "vidutiniams", "vidutin"); // m. dat. pl.
|
||||
checkOneTerm(a, "vidutinį", "vidutin"); // m. acc. sg.
|
||||
checkOneTerm(a, "vidutinius", "vidutin"); // m. acc. pl.
|
||||
checkOneTerm(a, "vidutiniu", "vidutin"); // m. ins. sg.
|
||||
checkOneTerm(a, "vidutiniais", "vidutin"); // m. ins. pl.
|
||||
checkOneTerm(a, "vidutiniame", "vidutin"); // m. loc. sg.
|
||||
checkOneTerm(a, "vidutiniuose", "vidutin"); // m. loc. pl.
|
||||
|
||||
checkOneTerm(a, "vidutinė", "vidutin"); // f. nom. sing.
|
||||
checkOneTerm(a, "vidutinės", "vidutin"); // f. nom. pl.
|
||||
checkOneTerm(a, "vidutinės", "vidutin"); // f. gen. sg.
|
||||
checkOneTerm(a, "vidutinių", "vidutin"); // f. gen. pl.
|
||||
checkOneTerm(a, "vidutinei", "vidutin"); // f. dat. sg.
|
||||
checkOneTerm(a, "vidutinėms", "vidutin"); // f. dat. pl.
|
||||
checkOneTerm(a, "vidutinę", "vidutin"); // f. acc. sg.
|
||||
checkOneTerm(a, "vidutines", "vidutin"); // f. acc. pl.
|
||||
checkOneTerm(a, "vidutine", "vidutin"); // f. ins. sg.
|
||||
checkOneTerm(a, "vidutinėmis", "vidutin"); // f. ins. pl.
|
||||
checkOneTerm(a, "vidutinėje", "vidutin"); // f. loc. sg.
|
||||
checkOneTerm(a, "vidutinėse", "vidutin"); // f. loc. pl.
|
||||
}
|
||||
|
||||
/**
|
||||
* test some high frequency terms from corpora to look for anything crazy
|
||||
*/
|
||||
public void testHighFrequencyTerms() throws IOException {
|
||||
checkOneTerm(a, "ir", "ir");
|
||||
checkOneTerm(a, "kad", "kad");
|
||||
checkOneTerm(a, "į", "į");
|
||||
checkOneTerm(a, "tai", "tai");
|
||||
checkOneTerm(a, "su", "su");
|
||||
checkOneTerm(a, "o", "o");
|
||||
checkOneTerm(a, "iš", "iš");
|
||||
checkOneTerm(a, "kaip", "kaip");
|
||||
checkOneTerm(a, "bet", "bet");
|
||||
checkOneTerm(a, "yra", "yr");
|
||||
checkOneTerm(a, "buvo", "buv");
|
||||
checkOneTerm(a, "tik", "tik");
|
||||
checkOneTerm(a, "ne", "ne");
|
||||
checkOneTerm(a, "taip", "taip");
|
||||
checkOneTerm(a, "ar", "ar");
|
||||
checkOneTerm(a, "dar", "dar");
|
||||
checkOneTerm(a, "jau", "jau");
|
||||
checkOneTerm(a, "savo", "sav");
|
||||
checkOneTerm(a, "apie", "ap");
|
||||
checkOneTerm(a, "kai", "kai");
|
||||
checkOneTerm(a, "aš", "aš");
|
||||
checkOneTerm(a, "per", "per");
|
||||
checkOneTerm(a, "nuo", "nuo");
|
||||
checkOneTerm(a, "po", "po");
|
||||
checkOneTerm(a, "jis", "jis");
|
||||
checkOneTerm(a, "kas", "kas");
|
||||
checkOneTerm(a, "d", "d");
|
||||
checkOneTerm(a, "labai", "lab");
|
||||
checkOneTerm(a, "man", "man");
|
||||
checkOneTerm(a, "dėl", "dėl");
|
||||
checkOneTerm(a, "tačiau", "tat");
|
||||
checkOneTerm(a, "nes", "nes");
|
||||
checkOneTerm(a, "už", "už");
|
||||
checkOneTerm(a, "to", "to");
|
||||
checkOneTerm(a, "jo", "jo");
|
||||
checkOneTerm(a, "iki", "ik");
|
||||
checkOneTerm(a, "ką", "ką");
|
||||
checkOneTerm(a, "mano", "man");
|
||||
checkOneTerm(a, "metų", "met");
|
||||
checkOneTerm(a, "nors", "nor");
|
||||
checkOneTerm(a, "jei", "jei");
|
||||
checkOneTerm(a, "bus", "bus");
|
||||
checkOneTerm(a, "jų", "jų");
|
||||
checkOneTerm(a, "čia", "čia");
|
||||
checkOneTerm(a, "dabar", "dabar");
|
||||
checkOneTerm(a, "Lietuvos", "Lietuv");
|
||||
checkOneTerm(a, "net", "net");
|
||||
checkOneTerm(a, "nei", "nei");
|
||||
checkOneTerm(a, "gali", "gal");
|
||||
checkOneTerm(a, "daug", "daug");
|
||||
checkOneTerm(a, "prie", "prie");
|
||||
checkOneTerm(a, "ji", "ji");
|
||||
checkOneTerm(a, "jos", "jos");
|
||||
checkOneTerm(a, "pat", "pat");
|
||||
checkOneTerm(a, "jie", "jie");
|
||||
checkOneTerm(a, "kur", "kur");
|
||||
checkOneTerm(a, "gal", "gal");
|
||||
checkOneTerm(a, "ant", "ant");
|
||||
checkOneTerm(a, "tiek", "tiek");
|
||||
checkOneTerm(a, "be", "be");
|
||||
checkOneTerm(a, "būti", "būt");
|
||||
checkOneTerm(a, "bei", "bei");
|
||||
checkOneTerm(a, "daugiau", "daug");
|
||||
checkOneTerm(a, "turi", "tur");
|
||||
checkOneTerm(a, "prieš", "prieš");
|
||||
checkOneTerm(a, "vis", "vis");
|
||||
checkOneTerm(a, "būtų", "būt");
|
||||
checkOneTerm(a, "jog", "jog");
|
||||
checkOneTerm(a, "reikia", "reik");
|
||||
checkOneTerm(a, "mūsų", "mūs");
|
||||
checkOneTerm(a, "metu", "met");
|
||||
checkOneTerm(a, "galima", "galim");
|
||||
checkOneTerm(a, "nėra", "nėr");
|
||||
checkOneTerm(a, "arba", "arb");
|
||||
checkOneTerm(a, "mes", "mes");
|
||||
checkOneTerm(a, "kurie", "kur");
|
||||
checkOneTerm(a, "tikrai", "tikr");
|
||||
checkOneTerm(a, "todėl", "tod");
|
||||
checkOneTerm(a, "ten", "ten");
|
||||
checkOneTerm(a, "šiandien", "šiandien");
|
||||
checkOneTerm(a, "vienas", "vien");
|
||||
checkOneTerm(a, "visi", "vis");
|
||||
checkOneTerm(a, "kuris", "kur");
|
||||
checkOneTerm(a, "tada", "tad");
|
||||
checkOneTerm(a, "kiek", "kiek");
|
||||
checkOneTerm(a, "tuo", "tuo");
|
||||
checkOneTerm(a, "gerai", "ger");
|
||||
checkOneTerm(a, "nieko", "niek");
|
||||
checkOneTerm(a, "jį", "jį");
|
||||
checkOneTerm(a, "kol", "kol");
|
||||
checkOneTerm(a, "viskas", "visk");
|
||||
checkOneTerm(a, "mane", "man");
|
||||
checkOneTerm(a, "kartą", "kart");
|
||||
checkOneTerm(a, "m", "m");
|
||||
checkOneTerm(a, "tas", "tas");
|
||||
checkOneTerm(a, "sakė", "sak");
|
||||
checkOneTerm(a, "žmonių", "žmon");
|
||||
checkOneTerm(a, "tu", "tu");
|
||||
checkOneTerm(a, "dieną", "dien");
|
||||
checkOneTerm(a, "žmonės", "žmon");
|
||||
checkOneTerm(a, "metais", "met");
|
||||
checkOneTerm(a, "vieną", "vien");
|
||||
checkOneTerm(a, "vėl", "vėl");
|
||||
checkOneTerm(a, "na", "na");
|
||||
checkOneTerm(a, "tą", "tą");
|
||||
checkOneTerm(a, "tiesiog", "tiesiog");
|
||||
checkOneTerm(a, "toks", "tok");
|
||||
checkOneTerm(a, "pats", "pat");
|
||||
checkOneTerm(a, "ko", "ko");
|
||||
checkOneTerm(a, "Lietuvoje", "Lietuv");
|
||||
checkOneTerm(a, "pagal", "pagal");
|
||||
checkOneTerm(a, "jeigu", "jeig");
|
||||
checkOneTerm(a, "visai", "vis");
|
||||
checkOneTerm(a, "viena", "vien");
|
||||
checkOneTerm(a, "šį", "šį");
|
||||
checkOneTerm(a, "metus", "met");
|
||||
checkOneTerm(a, "jam", "jam");
|
||||
checkOneTerm(a, "kodėl", "kod");
|
||||
checkOneTerm(a, "litų", "lit");
|
||||
checkOneTerm(a, "ją", "ją");
|
||||
checkOneTerm(a, "kuri", "kur");
|
||||
checkOneTerm(a, "darbo", "darb");
|
||||
checkOneTerm(a, "tarp", "tarp");
|
||||
checkOneTerm(a, "juk", "juk");
|
||||
checkOneTerm(a, "laiko", "laik");
|
||||
checkOneTerm(a, "juos", "juos");
|
||||
checkOneTerm(a, "visą", "vis");
|
||||
checkOneTerm(a, "kurios", "kur");
|
||||
checkOneTerm(a, "tam", "tam");
|
||||
checkOneTerm(a, "pas", "pas");
|
||||
checkOneTerm(a, "viską", "visk");
|
||||
checkOneTerm(a, "Europos", "Eur");
|
||||
checkOneTerm(a, "atrodo", "atrod");
|
||||
checkOneTerm(a, "tad", "tad");
|
||||
checkOneTerm(a, "bent", "bent");
|
||||
checkOneTerm(a, "kitų", "kit");
|
||||
checkOneTerm(a, "šis", "šis");
|
||||
checkOneTerm(a, "Vilniaus", "Viln");
|
||||
checkOneTerm(a, "beveik", "bevei");
|
||||
checkOneTerm(a, "proc", "proc");
|
||||
checkOneTerm(a, "tokia", "tok");
|
||||
checkOneTerm(a, "šiuo", "šiuo");
|
||||
checkOneTerm(a, "du", "du");
|
||||
checkOneTerm(a, "kartu", "kart");
|
||||
checkOneTerm(a, "visada", "visad");
|
||||
checkOneTerm(a, "kuo", "kuo");
|
||||
}
|
||||
}
|
Loading…
Reference in New Issue