mirror of https://github.com/apache/lucene.git
LUCENE-1786: improve performance of TestCompoundWordTokenFilter
git-svn-id: https://svn.apache.org/repos/asf/lucene/java/trunk@892355 13f79535-47bb-0310-9956-ffa450edef68
commit 16eaa6198f
parent aff4f4a464
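The substance of the change: instead of downloading offo-hyphenation.zip from a SourceForge mirror in setUp() and unzipping a pattern file in memory, the test now reads a hyphenation grammar (da_UTF8.xml) checked in next to the test, so it runs fast and offline. Below is a minimal usage sketch of the pattern-loading path exercised by the new test, using only the APIs visible in the diff that follows; the class name, file path, and sample text are illustrative, not part of the commit.

import java.io.FileInputStream;
import java.io.InputStreamReader;
import java.io.Reader;
import java.io.StringReader;

import org.apache.lucene.analysis.WhitespaceTokenizer;
import org.apache.lucene.analysis.compound.CompoundWordTokenFilterBase;
import org.apache.lucene.analysis.compound.HyphenationCompoundWordTokenFilter;
import org.apache.lucene.analysis.compound.hyphenation.HyphenationTree;

public class HyphenationFilterSketch {
  public static void main(String[] args) throws Exception {
    // Load the hyphenation grammar from a local file -- no network access needed.
    Reader patterns = new InputStreamReader(
        new FileInputStream("da_UTF8.xml"), "UTF-8");
    HyphenationTree hyphenator =
        HyphenationCompoundWordTokenFilter.getHyphenationTree(patterns);

    // Decompose Danish compounds against a small dictionary, as the new test does.
    String[] dict = { "læse", "hest" };
    HyphenationCompoundWordTokenFilter tf = new HyphenationCompoundWordTokenFilter(
        new WhitespaceTokenizer(new StringReader("en læsehest")), hyphenator,
        dict, CompoundWordTokenFilterBase.DEFAULT_MIN_WORD_SIZE,
        CompoundWordTokenFilterBase.DEFAULT_MIN_SUBWORD_SIZE,
        CompoundWordTokenFilterBase.DEFAULT_MAX_SUBWORD_SIZE, false);
    // tf now emits "læsehest" plus the subwords "læse" and "hest" at the same position.
  }
}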
@@ -17,94 +17,65 @@ package org.apache.lucene.analysis.compound;
  * limitations under the License.
  */

-import java.io.ByteArrayInputStream;
-import java.io.ByteArrayOutputStream;
-import java.io.IOException;
-import java.io.InputStream;
+import java.io.File;
+import java.io.FileInputStream;
+import java.io.InputStreamReader;
 import java.io.Reader;
 import java.io.StringReader;
-import java.net.URL;
-import java.util.Arrays;
-import java.util.Collections;
-import java.util.LinkedList;
-import java.util.List;
-import java.util.zip.ZipEntry;
-import java.util.zip.ZipInputStream;

 import org.apache.lucene.analysis.BaseTokenStreamTestCase;
 import org.apache.lucene.analysis.TokenFilter;
 import org.apache.lucene.analysis.Tokenizer;
 import org.apache.lucene.analysis.WhitespaceTokenizer;
 import org.apache.lucene.analysis.compound.hyphenation.HyphenationTree;
 import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
 import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
 import org.apache.lucene.analysis.tokenattributes.TermAttribute;

 public class TestCompoundWordTokenFilter extends BaseTokenStreamTestCase {
-  private static String[] locations = {
-    "http://dfn.dl.sourceforge.net/sourceforge/offo/offo-hyphenation.zip",
-    "http://surfnet.dl.sourceforge.net/sourceforge/offo/offo-hyphenation.zip",
-    "http://superb-west.dl.sourceforge.net/sourceforge/offo/offo-hyphenation.zip",
-    "http://voxel.dl.sourceforge.net/sourceforge/offo/offo-hyphenation.zip"};
-    // too slow:
-    //"http://superb-east.dl.sourceforge.net/sourceforge/offo/offo-hyphenation.zip"};
-
-  private static byte[] patternsFileContent;
+  static final File dataDir = new File(System.getProperty("dataDir", "./bin"));
+  static final File testFile = new File(dataDir, "org/apache/lucene/analysis/compound/da_UTF8.xml");

-  @Override
-  protected void setUp() throws Exception {
-    super.setUp();
-    getHyphenationPatternFileContents();
-  }
-
-  public void testHyphenationCompoundWordsDE() throws Exception {
-    String[] dict = { "Rind", "Fleisch", "Draht", "Schere", "Gesetz",
-        "Aufgabe", "Überwachung" };
+  public void testHyphenationCompoundWordsDA() throws Exception {
+    String[] dict = { "læse", "hest" };

-    Reader reader = getHyphenationReader("de_DR.xml");
-    if (reader == null) {
-      // we gracefully die if we have no reader
-      return;
-    }
+    Reader reader = getHyphenationReader();

     HyphenationTree hyphenator = HyphenationCompoundWordTokenFilter
         .getHyphenationTree(reader);

     HyphenationCompoundWordTokenFilter tf = new HyphenationCompoundWordTokenFilter(
         new WhitespaceTokenizer(new StringReader(
-            "Rindfleischüberwachungsgesetz Drahtschere abba")), hyphenator,
+            "min veninde som er lidt af en læsehest")), hyphenator,
         dict, CompoundWordTokenFilterBase.DEFAULT_MIN_WORD_SIZE,
         CompoundWordTokenFilterBase.DEFAULT_MIN_SUBWORD_SIZE,
         CompoundWordTokenFilterBase.DEFAULT_MAX_SUBWORD_SIZE, false);
-    assertTokenStreamContents(tf, new String[] { "Rindfleischüberwachungsgesetz", "Rind",
-        "fleisch", "überwachung", "gesetz", "Drahtschere", "Draht", "schere",
-        "abba" }, new int[] { 0, 0, 4, 11, 23, 30, 30, 35, 42 }, new int[] {
-        29, 4, 11, 22, 29, 41, 35, 41, 46 }, new int[] { 1, 0, 0, 0, 0, 1, 0,
-        0, 1 });
+    assertTokenStreamContents(tf,
+        new String[] { "min", "veninde", "som", "er", "lidt", "af", "en", "læsehest", "læse", "hest" },
+        new int[] { 1, 1, 1, 1, 1, 1, 1, 1, 0, 0 }
+    );
   }

   public void testHyphenationCompoundWordsDELongestMatch() throws Exception {
-    String[] dict = { "Rind", "Fleisch", "Draht", "Schere", "Gesetz",
-        "Aufgabe", "Überwachung", "Rindfleisch", "Überwachungsgesetz" };
-
-    Reader reader = getHyphenationReader("de_DR.xml");
-    if (reader == null) {
-      // we gracefully die if we have no reader
-      return;
-    }
+    String[] dict = { "basketball", "basket", "ball", "kurv" };
+    Reader reader = getHyphenationReader();

     HyphenationTree hyphenator = HyphenationCompoundWordTokenFilter
         .getHyphenationTree(reader);

     // the word basket will not be added due to the longest match option
     HyphenationCompoundWordTokenFilter tf = new HyphenationCompoundWordTokenFilter(
         new WhitespaceTokenizer(new StringReader(
-            "Rindfleischüberwachungsgesetz")), hyphenator, dict,
+            "basketballkurv")), hyphenator, dict,
         CompoundWordTokenFilterBase.DEFAULT_MIN_WORD_SIZE,
         CompoundWordTokenFilterBase.DEFAULT_MIN_SUBWORD_SIZE, 40, true);
-    assertTokenStreamContents(tf, new String[] { "Rindfleischüberwachungsgesetz",
-        "Rindfleisch", "fleisch", "überwachungsgesetz", "gesetz" }, new int[] {
-        0, 0, 4, 11, 23 }, new int[] { 29, 11, 11, 29, 29 }, new int[] { 1, 0,
-        0, 0, 0 });
+    assertTokenStreamContents(tf,
+        new String[] { "basketballkurv", "basketball", "ball", "kurv" },
+        new int[] { 1, 0, 0, 0 }
+    );

   }

   public void testDumbCompoundWordsSE() throws Exception {
@@ -157,19 +128,10 @@ public class TestCompoundWordTokenFilter extends BaseTokenStreamTestCase {
     String[] dict = { "Rind", "Fleisch", "Draht", "Schere", "Gesetz",
         "Aufgabe", "Überwachung" };

-    Reader reader = getHyphenationReader("de_DR.xml");
-    if (reader == null) {
-      // we gracefully die if we have no reader
-      return;
-    }
-
-    HyphenationTree hyphenator = HyphenationCompoundWordTokenFilter
-        .getHyphenationTree(reader);
-
     Tokenizer wsTokenizer = new WhitespaceTokenizer(new StringReader(
         "Rindfleischüberwachungsgesetz"));
-    HyphenationCompoundWordTokenFilter tf = new HyphenationCompoundWordTokenFilter(
-        wsTokenizer, hyphenator, dict,
+    DictionaryCompoundWordTokenFilter tf = new DictionaryCompoundWordTokenFilter(
+        wsTokenizer, dict,
         CompoundWordTokenFilterBase.DEFAULT_MIN_WORD_SIZE,
         CompoundWordTokenFilterBase.DEFAULT_MIN_SUBWORD_SIZE,
         CompoundWordTokenFilterBase.DEFAULT_MAX_SUBWORD_SIZE, false);
@@ -185,53 +147,7 @@ public class TestCompoundWordTokenFilter extends BaseTokenStreamTestCase {
     assertEquals("Rindfleischüberwachungsgesetz", termAtt.term());
   }

-  private void getHyphenationPatternFileContents() {
-    if (patternsFileContent == null) {
-      try {
-        List urls = new LinkedList(Arrays.asList(locations));
-        Collections.shuffle(urls);
-        URL url = new URL((String)urls.get(0));
-        InputStream in = url.openStream();
-        byte[] buffer = new byte[1024];
-        ByteArrayOutputStream out = new ByteArrayOutputStream();
-        int count;
-
-        while ((count = in.read(buffer)) != -1) {
-          out.write(buffer, 0, count);
-        }
-        in.close();
-        out.close();
-        patternsFileContent = out.toByteArray();
-      } catch (IOException e) {
-        // we swallow all exceptions - the user might have no internet connection
-      }
-    }
-  }
-
-  private Reader getHyphenationReader(String filename) throws Exception {
-    if (patternsFileContent == null) {
-      return null;
-    }
-
-    ZipInputStream zipstream = new ZipInputStream(new ByteArrayInputStream(
-        patternsFileContent));
-
-    ZipEntry entry;
-    while ((entry = zipstream.getNextEntry()) != null) {
-      if (entry.getName().equals("offo-hyphenation/hyph/" + filename)) {
-        byte[] buffer = new byte[1024];
-        ByteArrayOutputStream outstream = new ByteArrayOutputStream();
-        int count;
-        while ((count = zipstream.read(buffer)) != -1) {
-          outstream.write(buffer, 0, count);
-        }
-        outstream.close();
-        zipstream.close();
-        return new StringReader(new String(outstream.toByteArray(),
-            "ISO-8859-1"));
-      }
-    }
-    // we never should get here
-    return null;
+  private Reader getHyphenationReader() throws Exception {
+    return new InputStreamReader(new FileInputStream(testFile), "UTF-8");
   }
 }
File diff suppressed because it is too large (the new da_UTF8.xml hyphenation grammar that the test above now reads).
New file: hyphenation.dtd
@@ -0,0 +1,68 @@
+<?xml version="1.0" encoding="utf-8"?>
+<!--
+  Copyright 1999-2004 The Apache Software Foundation
+
+  Licensed under the Apache License, Version 2.0 (the "License");
+  you may not use this file except in compliance with the License.
+  You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+
+  Unless required by applicable law or agreed to in writing, software
+  distributed under the License is distributed on an "AS IS" BASIS,
+  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+  See the License for the specific language governing permissions and
+  limitations under the License.
+-->
+<!-- $Id: hyphenation.dtd,v 1.3 2004/02/27 18:34:59 jeremias Exp $ -->
+
+<!ELEMENT hyphenation-info (hyphen-char?, hyphen-min?,
+                           classes, exceptions?, patterns)>
+
+<!-- Hyphen character to be used in the exception list as shortcut for
+     <hyphen pre-break="-"/>. Defaults to '-'
+-->
+<!ELEMENT hyphen-char EMPTY>
+<!ATTLIST hyphen-char value CDATA #REQUIRED>
+
+<!-- Default minimun length in characters of hyphenated word fragments
+     before and after the line break. For some languages this is not
+     only for aesthetic purposes, wrong hyphens may be generated if this
+     is not accounted for.
+-->
+<!ELEMENT hyphen-min EMPTY>
+<!ATTLIST hyphen-min before CDATA #REQUIRED>
+<!ATTLIST hyphen-min after CDATA #REQUIRED>
+
+<!-- Character equivalent classes: space separated list of character groups, all
+     characters in a group are to be treated equivalent as far as
+     the hyphenation algorithm is concerned. The first character in a group
+     is the group's equivalent character. Patterns should only contain
+     first characters. It also defines word characters, i.e. a word that
+     contains characters not present in any of the classes is not hyphenated.
+-->
+<!ELEMENT classes (#PCDATA)>
+
+<!-- Hyphenation exceptions: space separated list of hyphenated words.
+     A hyphen is indicated by the hyphen tag, but you can use the
+     hyphen-char defined previously as shortcut. This is in cases
+     when the algorithm procedure finds wrong hyphens or you want
+     to provide your own hyphenation for some words.
+-->
+<!ELEMENT exceptions (#PCDATA|hyphen)* >
+
+<!-- The hyphenation patterns, space separated. A pattern is made of 'equivalent'
+     characters as described before, between any two word characters a digit
+     in the range 0 to 9 may be specified. The absence of a digit is equivalent
+     to zero. The '.' character is reserved to indicate begining or ending
+     of words. -->
+<!ELEMENT patterns (#PCDATA)>
+
+<!-- A "full hyphen" equivalent to TeX's \discretionary
+     with pre-break, post-break and no-break attributes.
+     To be used in the exceptions list, the hyphen character is not
+     automatically added -->
+<!ELEMENT hyphen EMPTY>
+<!ATTLIST hyphen pre CDATA #IMPLIED>
+<!ATTLIST hyphen no CDATA #IMPLIED>
+<!ATTLIST hyphen post CDATA #IMPLIED>
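For orientation, a tiny grammar conforming to this DTD could look like the sketch below. The element values are invented for illustration and are not taken from da_UTF8.xml; real grammars contain thousands of patterns.

<?xml version="1.0" encoding="utf-8"?>
<hyphenation-info>
  <hyphen-char value="-"/>
  <hyphen-min before="2" after="2"/>
  <!-- groups of equivalent characters; patterns use only the first character of each group -->
  <classes>
    aA bB cC dD eE
  </classes>
  <!-- pre-hyphenated exception words, written with the hyphen-char as a shortcut for <hyphen/> -->
  <exceptions>
    as-so-ciate
  </exceptions>
  <!-- digits between pattern characters weight potential break points; '.' marks a word boundary -->
  <patterns>
    .ab4 a1bc 4b1d
  </patterns>
</hyphenation-info>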