diff --git a/src/test/org/apache/lucene/analysis/de/TestGermanStemFilter.java b/src/test/org/apache/lucene/analysis/de/TestGermanStemFilter.java deleted file mode 100644 index adaddc92ce6..00000000000 --- a/src/test/org/apache/lucene/analysis/de/TestGermanStemFilter.java +++ /dev/null @@ -1,78 +0,0 @@ -package org.apache.lucene.analysis.de; - -/** - * Copyright 2004 The Apache Software Foundation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -import java.io.BufferedReader; -import java.io.File; -import java.io.FileInputStream; -import java.io.IOException; -import java.io.InputStreamReader; -import java.io.StringReader; - -import junit.framework.TestCase; - -import org.apache.lucene.analysis.Token; -import org.apache.lucene.analysis.standard.StandardTokenizer; - -/** - * Test the German stemmer. The stemming algorithm is known to work less - * than perfect, as it doesn't use any word lists with exceptions. We - * also check some of the cases where the algorithm is wrong. - * - * @author Daniel Naber - */ -public class TestGermanStemFilter extends TestCase { - - public void testStemming() { - try { - // read test cases from external file: - File dataDir = new File(System.getProperty("dataDir", "./bin")); - File testFile = new File(dataDir, "org/apache/lucene/analysis/de/data.txt"); - FileInputStream fis = new FileInputStream(testFile); - InputStreamReader isr = new InputStreamReader(fis, "iso-8859-1"); - BufferedReader breader = new BufferedReader(isr); - while(true) { - String line = breader.readLine(); - if (line == null) - break; - line = line.trim(); - if (line.startsWith("#") || line.equals("")) - continue; // ignore comments and empty lines - String[] parts = line.split(";"); - //System.out.println(parts[0] + " -- " + parts[1]); - check(parts[0], parts[1]); - } - breader.close(); - isr.close(); - fis.close(); - } catch (IOException e) { - e.printStackTrace(); - fail(); - } - } - - private void check(final String input, final String expected) throws IOException { - StandardTokenizer tokenStream = new StandardTokenizer(new StringReader(input)); - GermanStemFilter filter = new GermanStemFilter(tokenStream); - Token t = filter.next(); - if (t == null) - fail(); - assertEquals(expected, t.termText()); - filter.close(); - } - -} diff --git a/src/test/org/apache/lucene/analysis/de/data.txt b/src/test/org/apache/lucene/analysis/de/data.txt deleted file mode 100644 index 520c18a1df6..00000000000 --- a/src/test/org/apache/lucene/analysis/de/data.txt +++ /dev/null @@ -1,48 +0,0 @@ -# German special characters are replaced: -häufig;haufig - -# here the stemmer works okay, it maps related words to the same stem: -abschließen;abschliess -abschließender;abschliess -abschließendes;abschliess -abschließenden;abschliess - -Tisch;tisch -Tische;tisch -Tischen;tisch - -Haus;hau -Hauses;hau -Häuser;hau -Häusern;hau -# here's a case where overstemming occurs, i.e. a word is -# mapped to the same stem as unrelated words: -hauen;hau - -# here's a case where understemming occurs, i.e. two related words -# are not mapped to the same stem. This is the case with basically -# all irregular forms: -Drama;drama -Dramen;dram - -# replace "ß" with 'ss': -Ausmaß;ausmass - -# fake words to test if suffixes are cut off: -xxxxxe;xxxxx -xxxxxs;xxxxx -xxxxxn;xxxxx -xxxxxt;xxxxx -xxxxxem;xxxxx -xxxxxer;xxxxx -xxxxxnd;xxxxx -# the suffixes are also removed when combined: -xxxxxetende;xxxxx - -# words that are shorter than four charcters are not changed: -xxe;xxe -# -em and -er are not removed from words shorter than five characters: -xxem;xxem -xxer;xxer -# -nd is not removed from words shorter than six characters: -xxxnd;xxxnd diff --git a/src/test/org/apache/lucene/analysis/ru/TestRussianAnalyzer.java b/src/test/org/apache/lucene/analysis/ru/TestRussianAnalyzer.java deleted file mode 100644 index 1294d5e577b..00000000000 --- a/src/test/org/apache/lucene/analysis/ru/TestRussianAnalyzer.java +++ /dev/null @@ -1,170 +0,0 @@ -package org.apache.lucene.analysis.ru; - -/** - * Copyright 2004 The Apache Software Foundation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -import junit.framework.TestCase; - -import java.io.*; - -import org.apache.lucene.analysis.TokenStream; -import org.apache.lucene.analysis.Token; - -/** - * Test case for RussianAnalyzer. - * - * @author Boris Okner - * @version $Id$ - */ - -public class TestRussianAnalyzer extends TestCase -{ - private InputStreamReader inWords; - - private InputStreamReader sampleUnicode; - - private Reader inWordsKOI8; - - private Reader sampleKOI8; - - private Reader inWords1251; - - private Reader sample1251; - - private File dataDir; - - protected void setUp() throws Exception - { - dataDir = new File(System.getProperty("dataDir", "./bin")); - } - - public void testUnicode() throws IOException - { - RussianAnalyzer ra = new RussianAnalyzer(RussianCharsets.UnicodeRussian); - inWords = - new InputStreamReader( - new FileInputStream(new File(dataDir, "/org/apache/lucene/analysis/ru/testUnicode.txt")), - "Unicode"); - - sampleUnicode = - new InputStreamReader( - new FileInputStream(new File(dataDir, "/org/apache/lucene/analysis/ru/resUnicode.htm")), - "Unicode"); - - TokenStream in = ra.tokenStream("all", inWords); - - RussianLetterTokenizer sample = - new RussianLetterTokenizer( - sampleUnicode, - RussianCharsets.UnicodeRussian); - - for (;;) - { - Token token = in.next(); - - if (token == null) - { - break; - } - - Token sampleToken = sample.next(); - assertEquals( - "Unicode", - token.termText(), - sampleToken == null - ? null - : sampleToken.termText()); - } - - inWords.close(); - sampleUnicode.close(); - } - - public void testKOI8() throws IOException - { - //System.out.println(new java.util.Date()); - RussianAnalyzer ra = new RussianAnalyzer(RussianCharsets.KOI8); - // KOI8 - inWordsKOI8 = new InputStreamReader(new FileInputStream(new File(dataDir, "/org/apache/lucene/analysis/ru/testKOI8.txt")), "iso-8859-1"); - - sampleKOI8 = new InputStreamReader(new FileInputStream(new File(dataDir, "/org/apache/lucene/analysis/ru/resKOI8.htm")), "iso-8859-1"); - - TokenStream in = ra.tokenStream("all", inWordsKOI8); - RussianLetterTokenizer sample = - new RussianLetterTokenizer( - sampleKOI8, - RussianCharsets.KOI8); - - for (;;) - { - Token token = in.next(); - - if (token == null) - { - break; - } - - Token sampleToken = sample.next(); - assertEquals( - "KOI8", - token.termText(), - sampleToken == null - ? null - : sampleToken.termText()); - - } - - inWordsKOI8.close(); - sampleKOI8.close(); - } - - public void test1251() throws IOException - { - // 1251 - inWords1251 = new InputStreamReader(new FileInputStream(new File(dataDir, "/org/apache/lucene/analysis/ru/test1251.txt")), "iso-8859-1"); - - sample1251 = new InputStreamReader(new FileInputStream(new File(dataDir, "/org/apache/lucene/analysis/ru/res1251.htm")), "iso-8859-1"); - - RussianAnalyzer ra = new RussianAnalyzer(RussianCharsets.CP1251); - TokenStream in = ra.tokenStream("", inWords1251); - RussianLetterTokenizer sample = - new RussianLetterTokenizer( - sample1251, - RussianCharsets.CP1251); - - for (;;) - { - Token token = in.next(); - - if (token == null) - { - break; - } - - Token sampleToken = sample.next(); - assertEquals( - "1251", - token.termText(), - sampleToken == null - ? null - : sampleToken.termText()); - - } - - inWords1251.close(); - sample1251.close(); - } -} diff --git a/src/test/org/apache/lucene/analysis/ru/TestRussianStem.java b/src/test/org/apache/lucene/analysis/ru/TestRussianStem.java deleted file mode 100644 index d85c5ba1116..00000000000 --- a/src/test/org/apache/lucene/analysis/ru/TestRussianStem.java +++ /dev/null @@ -1,94 +0,0 @@ -package org.apache.lucene.analysis.ru; - -/** - * Copyright 2004 The Apache Software Foundation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -import junit.framework.TestCase; - -import java.io.BufferedReader; -import java.io.File; -import java.io.InputStreamReader; -import java.io.FileInputStream; -import java.util.ArrayList; - -public class TestRussianStem extends TestCase -{ - private ArrayList words = new ArrayList(); - private ArrayList stems = new ArrayList(); - - public TestRussianStem(String name) - { - super(name); - } - - /** - * @see TestCase#setUp() - */ - protected void setUp() throws Exception - { - super.setUp(); - //System.out.println(new java.util.Date()); - String str; - - File dataDir = new File(System.getProperty("dataDir")); - - // open and read words into an array list - BufferedReader inWords = - new BufferedReader( - new InputStreamReader( - new FileInputStream(new File(dataDir, "/org/apache/lucene/analysis/ru/wordsUnicode.txt")), - "Unicode")); - while ((str = inWords.readLine()) != null) - { - words.add(str); - } - inWords.close(); - - // open and read stems into an array list - BufferedReader inStems = - new BufferedReader( - new InputStreamReader( - new FileInputStream(new File(dataDir, "/org/apache/lucene/analysis/ru/stemsUnicode.txt")), - "Unicode")); - while ((str = inStems.readLine()) != null) - { - stems.add(str); - } - inStems.close(); - } - - /** - * @see TestCase#tearDown() - */ - protected void tearDown() throws Exception - { - super.tearDown(); - } - - public void testStem() - { - for (int i = 0; i < words.size(); i++) - { - //if ( (i % 100) == 0 ) System.err.println(i); - String realStem = - RussianStemmer.stem( - (String) words.get(i), - RussianCharsets.UnicodeRussian); - assertEquals("unicode", stems.get(i), realStem); - } - } - -} diff --git a/src/test/org/apache/lucene/analysis/ru/res1251.htm b/src/test/org/apache/lucene/analysis/ru/res1251.htm deleted file mode 100644 index d3d2e2badad..00000000000 --- a/src/test/org/apache/lucene/analysis/ru/res1251.htm +++ /dev/null @@ -1 +0,0 @@ -[âìåñò][ñèë][ýëåêòðîìàãíèòí][ýíåðã][èìåë][ïðåäñòàâëåí][ñêàæ][æðåö][äðåâí][åãèïò][çíàí][õðàí][òàéí][óçê][êðóã][ïîñâÿùåí][âñÿê][âðåìåí][âèòîê][ïðèí][ñîá][íîâ][òåõíîëîã][ñàì][äåë][ðàñêðûâà][ïîòàåí][çíàí][ïðåæí][âåê][ãîâîð][íîâ][èíôîðìàö][ñòàíîâ][äîñòóïí][øèðîê][êðóã][ïîëüçîâàòåë][òåõ][ñëó÷à][ñîçíàí][îáùåñòâ][ãîòîâ][âîñïðèíÿ][âîñïîëüçîâà] \ No newline at end of file diff --git a/src/test/org/apache/lucene/analysis/ru/resKOI8.htm b/src/test/org/apache/lucene/analysis/ru/resKOI8.htm deleted file mode 100644 index 7cfab861990..00000000000 --- a/src/test/org/apache/lucene/analysis/ru/resKOI8.htm +++ /dev/null @@ -1 +0,0 @@ -[×ÍÅÓÔ][ÓÉÌ][ÜÌÅËÔÒÏÍÁÇÎÉÔÎ][ÜÎÅÒÇ][ÉÍÅÌ][ÐÒÅÄÓÔÁ×ÌÅÎ][ÓËÁÖ][ÖÒÅÃ][ÄÒÅ×Î][ÅÇÉÐÔ][ÚÎÁÎ][ÈÒÁÎ][ÔÁÊÎ][ÕÚË][ËÒÕÇ][ÐÏÓ×ÑÝÅÎ][×ÓÑË][×ÒÅÍÅÎ][×ÉÔÏË][ÐÒÉÎ][ÓÏÂ][ÎÏ×][ÔÅÈÎÏÌÏÇ][ÓÁÍ][ÄÅÌ][ÒÁÓËÒÙ×Á][ÐÏÔÁÅÎ][ÚÎÁÎ][ÐÒÅÖÎ][×ÅË][ÇÏ×ÏÒ][ÎÏ×][ÉÎÆÏÒÍÁÃ][ÓÔÁÎÏ×][ÄÏÓÔÕÐÎ][ÛÉÒÏË][ËÒÕÇ][ÐÏÌØÚÏ×ÁÔÅÌ][ÔÅÈ][ÓÌÕÞÁ][ÓÏÚÎÁÎ][ÏÂÝÅÓÔ×][ÇÏÔÏ×][×ÏÓÐÒÉÎÑ][×ÏÓÐÏÌØÚÏ×Á] \ No newline at end of file diff --git a/src/test/org/apache/lucene/analysis/ru/resUnicode.htm b/src/test/org/apache/lucene/analysis/ru/resUnicode.htm deleted file mode 100644 index ea71882a505..00000000000 Binary files a/src/test/org/apache/lucene/analysis/ru/resUnicode.htm and /dev/null differ diff --git a/src/test/org/apache/lucene/analysis/ru/stemsUnicode.txt b/src/test/org/apache/lucene/analysis/ru/stemsUnicode.txt deleted file mode 100644 index 504c6fd79f1..00000000000 Binary files a/src/test/org/apache/lucene/analysis/ru/stemsUnicode.txt and /dev/null differ diff --git a/src/test/org/apache/lucene/analysis/ru/test1251.txt b/src/test/org/apache/lucene/analysis/ru/test1251.txt deleted file mode 100644 index c386b743a2e..00000000000 --- a/src/test/org/apache/lucene/analysis/ru/test1251.txt +++ /dev/null @@ -1,2 +0,0 @@ -Âìåñòå ñ òåì î ñèëå ýëåêòðîìàãíèòíîé ýíåðãèè èìåëè ïðåäñòàâëåíèå åùå, ñêàæåì, æðåöû Äðåâíåãî Åãèïòà. Íî çíàíèå ýòî õðàíèëîñü â òàéíå, â -óçêîì êðóãó ïîñâÿùåííûõ. Âñÿêèé âðåìåííîé âèòîê, ïðèíîñÿ ñ ñîáîé íîâûå òåõíîëîãèè, íà ñàìîì äåëå ðàñêðûâàåò ïîòàåííîå çíàíèå ïðåæíèõ âåêîâ. Ìû óæå ãîâîðèëè, ÷òî íîâàÿ èíôîðìàöèÿ ñòàíîâèòñÿ äîñòóïíîé øèðîêîìó êðóãó ïîëüçîâàòåëåé òîëüêî â òåõ ñëó÷àÿõ, êîãäà ñîçíàíèå îáùåñòâà ãîòîâî åå âîñïðèíÿòü è âîñïîëüçîâàòüñÿ åþ. diff --git a/src/test/org/apache/lucene/analysis/ru/testKOI8.txt b/src/test/org/apache/lucene/analysis/ru/testKOI8.txt deleted file mode 100644 index b5923f18434..00000000000 --- a/src/test/org/apache/lucene/analysis/ru/testKOI8.txt +++ /dev/null @@ -1,2 +0,0 @@ -÷ÍÅÓÔÅ Ó ÔÅÍ Ï ÓÉÌÅ ÜÌÅËÔÒÏÍÁÇÎÉÔÎÏÊ ÜÎÅÒÇÉÉ ÉÍÅÌÉ ÐÒÅÄÓÔÁ×ÌÅÎÉÅ ÅÝÅ, ÓËÁÖÅÍ, ÖÒÅÃÙ äÒÅ×ÎÅÇÏ åÇÉÐÔÁ. îÏ ÚÎÁÎÉÅ ÜÔÏ ÈÒÁÎÉÌÏÓØ × ÔÁÊÎÅ, × -ÕÚËÏÍ ËÒÕÇÕ ÐÏÓ×ÑÝÅÎÎÙÈ. ÷ÓÑËÉÊ ×ÒÅÍÅÎÎÏÊ ×ÉÔÏË, ÐÒÉÎÏÓÑ Ó ÓÏÂÏÊ ÎÏ×ÙÅ ÔÅÈÎÏÌÏÇÉÉ, ÎÁ ÓÁÍÏÍ ÄÅÌÅ ÒÁÓËÒÙ×ÁÅÔ ÐÏÔÁÅÎÎÏÅ ÚÎÁÎÉÅ ÐÒÅÖÎÉÈ ×ÅËÏ×. íÙ ÕÖÅ ÇÏ×ÏÒÉÌÉ, ÞÔÏ ÎÏ×ÁÑ ÉÎÆÏÒÍÁÃÉÑ ÓÔÁÎÏ×ÉÔÓÑ ÄÏÓÔÕÐÎÏÊ ÛÉÒÏËÏÍÕ ËÒÕÇÕ ÐÏÌØÚÏ×ÁÔÅÌÅÊ ÔÏÌØËÏ × ÔÅÈ ÓÌÕÞÁÑÈ, ËÏÇÄÁ ÓÏÚÎÁÎÉÅ ÏÂÝÅÓÔ×Á ÇÏÔÏ×Ï ÅÅ ×ÏÓÐÒÉÎÑÔØ É ×ÏÓÐÏÌØÚÏ×ÁÔØÓÑ ÅÀ. diff --git a/src/test/org/apache/lucene/analysis/ru/testUnicode.txt b/src/test/org/apache/lucene/analysis/ru/testUnicode.txt deleted file mode 100644 index 7348a60a155..00000000000 Binary files a/src/test/org/apache/lucene/analysis/ru/testUnicode.txt and /dev/null differ diff --git a/src/test/org/apache/lucene/analysis/ru/wordsUnicode.txt b/src/test/org/apache/lucene/analysis/ru/wordsUnicode.txt deleted file mode 100644 index 79f8d48e3e9..00000000000 Binary files a/src/test/org/apache/lucene/analysis/ru/wordsUnicode.txt and /dev/null differ