LUCENE-1786: improve performance of TestCompoundWordTokenFilter

git-svn-id: https://svn.apache.org/repos/asf/lucene/java/trunk@892355 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Robert Muir 2009-12-18 19:25:24 +00:00
parent aff4f4a464
commit 16eaa6198f
3 changed files with 1302 additions and 110 deletions

View File

@ -17,94 +17,65 @@ package org.apache.lucene.analysis.compound;
* limitations under the License.
*/
import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.File;
import java.io.FileInputStream;
import java.io.InputStreamReader;
import java.io.Reader;
import java.io.StringReader;
import java.net.URL;
import java.util.Arrays;
import java.util.Collections;
import java.util.LinkedList;
import java.util.List;
import java.util.zip.ZipEntry;
import java.util.zip.ZipInputStream;
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.WhitespaceTokenizer;
import org.apache.lucene.analysis.compound.hyphenation.HyphenationTree;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.analysis.tokenattributes.TermAttribute;
public class TestCompoundWordTokenFilter extends BaseTokenStreamTestCase {
private static String[] locations = {
"http://dfn.dl.sourceforge.net/sourceforge/offo/offo-hyphenation.zip",
"http://surfnet.dl.sourceforge.net/sourceforge/offo/offo-hyphenation.zip",
"http://superb-west.dl.sourceforge.net/sourceforge/offo/offo-hyphenation.zip",
"http://voxel.dl.sourceforge.net/sourceforge/offo/offo-hyphenation.zip"};
// too slow:
//"http://superb-east.dl.sourceforge.net/sourceforge/offo/offo-hyphenation.zip"};
private static byte[] patternsFileContent;
static final File dataDir = new File(System.getProperty("dataDir", "./bin"));
static final File testFile = new File(dataDir, "org/apache/lucene/analysis/compound/da_UTF8.xml");
/**
 * Runs the superclass set-up and then makes sure the hyphenation pattern
 * archive has been fetched.
 *
 * @throws Exception if the superclass set-up fails
 */
@Override
protected void setUp() throws Exception {
super.setUp();
// Downloads the pattern archive unless its bytes are already cached in patternsFileContent.
getHyphenationPatternFileContents();
}
public void testHyphenationCompoundWordsDE() throws Exception {
String[] dict = { "Rind", "Fleisch", "Draht", "Schere", "Gesetz",
"Aufgabe", "Überwachung" };
public void testHyphenationCompoundWordsDA() throws Exception {
String[] dict = { "læse", "hest" };
Reader reader = getHyphenationReader("de_DR.xml");
if (reader == null) {
// we gracefully die if we have no reader
return;
}
Reader reader = getHyphenationReader();
HyphenationTree hyphenator = HyphenationCompoundWordTokenFilter
.getHyphenationTree(reader);
HyphenationCompoundWordTokenFilter tf = new HyphenationCompoundWordTokenFilter(
new WhitespaceTokenizer(new StringReader(
"Rindfleischüberwachungsgesetz Drahtschere abba")), hyphenator,
"min veninde som er lidt af en læsehest")), hyphenator,
dict, CompoundWordTokenFilterBase.DEFAULT_MIN_WORD_SIZE,
CompoundWordTokenFilterBase.DEFAULT_MIN_SUBWORD_SIZE,
CompoundWordTokenFilterBase.DEFAULT_MAX_SUBWORD_SIZE, false);
assertTokenStreamContents(tf, new String[] { "Rindfleischüberwachungsgesetz", "Rind",
"fleisch", "überwachung", "gesetz", "Drahtschere", "Draht", "schere",
"abba" }, new int[] { 0, 0, 4, 11, 23, 30, 30, 35, 42 }, new int[] {
29, 4, 11, 22, 29, 41, 35, 41, 46 }, new int[] { 1, 0, 0, 0, 0, 1, 0,
0, 1 });
assertTokenStreamContents(tf,
new String[] { "min", "veninde", "som", "er", "lidt", "af", "en", "læsehest", "læse", "hest" },
new int[] { 1, 1, 1, 1, 1, 1, 1, 1, 0, 0 }
);
}
public void testHyphenationCompoundWordsDELongestMatch() throws Exception {
String[] dict = { "Rind", "Fleisch", "Draht", "Schere", "Gesetz",
"Aufgabe", "Überwachung", "Rindfleisch", "Überwachungsgesetz" };
Reader reader = getHyphenationReader("de_DR.xml");
if (reader == null) {
// we gracefully die if we have no reader
return;
}
String[] dict = { "basketball", "basket", "ball", "kurv" };
Reader reader = getHyphenationReader();
HyphenationTree hyphenator = HyphenationCompoundWordTokenFilter
.getHyphenationTree(reader);
// the word basket will not be added due to the longest match option
HyphenationCompoundWordTokenFilter tf = new HyphenationCompoundWordTokenFilter(
new WhitespaceTokenizer(new StringReader(
"Rindfleischüberwachungsgesetz")), hyphenator, dict,
"basketballkurv")), hyphenator, dict,
CompoundWordTokenFilterBase.DEFAULT_MIN_WORD_SIZE,
CompoundWordTokenFilterBase.DEFAULT_MIN_SUBWORD_SIZE, 40, true);
assertTokenStreamContents(tf, new String[] { "Rindfleischüberwachungsgesetz",
"Rindfleisch", "fleisch", "überwachungsgesetz", "gesetz" }, new int[] {
0, 0, 4, 11, 23 }, new int[] { 29, 11, 11, 29, 29 }, new int[] { 1, 0,
0, 0, 0 });
assertTokenStreamContents(tf,
new String[] { "basketballkurv", "basketball", "ball", "kurv" },
new int[] { 1, 0, 0, 0 }
);
}
public void testDumbCompoundWordsSE() throws Exception {
@ -157,19 +128,10 @@ public class TestCompoundWordTokenFilter extends BaseTokenStreamTestCase {
String[] dict = { "Rind", "Fleisch", "Draht", "Schere", "Gesetz",
"Aufgabe", "Überwachung" };
Reader reader = getHyphenationReader("de_DR.xml");
if (reader == null) {
// we gracefully die if we have no reader
return;
}
HyphenationTree hyphenator = HyphenationCompoundWordTokenFilter
.getHyphenationTree(reader);
Tokenizer wsTokenizer = new WhitespaceTokenizer(new StringReader(
"Rindfleischüberwachungsgesetz"));
HyphenationCompoundWordTokenFilter tf = new HyphenationCompoundWordTokenFilter(
wsTokenizer, hyphenator, dict,
DictionaryCompoundWordTokenFilter tf = new DictionaryCompoundWordTokenFilter(
wsTokenizer, dict,
CompoundWordTokenFilterBase.DEFAULT_MIN_WORD_SIZE,
CompoundWordTokenFilterBase.DEFAULT_MIN_SUBWORD_SIZE,
CompoundWordTokenFilterBase.DEFAULT_MAX_SUBWORD_SIZE, false);
@ -185,53 +147,7 @@ public class TestCompoundWordTokenFilter extends BaseTokenStreamTestCase {
assertEquals("Rindfleischüberwachungsgesetz", termAtt.term());
}
/**
 * Downloads the hyphenation pattern archive from one of the mirrors in
 * {@code locations} and caches its raw bytes in {@code patternsFileContent}.
 *
 * <p>Does nothing when the bytes are already cached. The download is best
 * effort: any {@link IOException} is swallowed and the cache stays
 * {@code null}, so that callers can skip their work when the archive is not
 * available (for example without an internet connection).
 */
private void getHyphenationPatternFileContents() {
  if (patternsFileContent != null) {
    return; // already downloaded by an earlier call
  }
  try {
    // Pick one mirror at random.
    List<String> urls = new LinkedList<String>(Arrays.asList(locations));
    Collections.shuffle(urls);
    URL url = new URL(urls.get(0));

    ByteArrayOutputStream out = new ByteArrayOutputStream();
    InputStream in = url.openStream();
    try {
      byte[] buffer = new byte[1024];
      int count;
      while ((count = in.read(buffer)) != -1) {
        out.write(buffer, 0, count);
      }
    } finally {
      // Always release the connection, even when reading fails half way.
      in.close();
    }
    // Only publish the bytes after the stream was read and closed cleanly.
    patternsFileContent = out.toByteArray();
  } catch (IOException e) {
    // we swallow all exceptions - the user might have no internet connection
  }
}
private Reader getHyphenationReader(String filename) throws Exception {
if (patternsFileContent == null) {
return null;
}
ZipInputStream zipstream = new ZipInputStream(new ByteArrayInputStream(
patternsFileContent));
ZipEntry entry;
while ((entry = zipstream.getNextEntry()) != null) {
if (entry.getName().equals("offo-hyphenation/hyph/" + filename)) {
byte[] buffer = new byte[1024];
ByteArrayOutputStream outstream = new ByteArrayOutputStream();
int count;
while ((count = zipstream.read(buffer)) != -1) {
outstream.write(buffer, 0, count);
}
outstream.close();
zipstream.close();
return new StringReader(new String(outstream.toByteArray(),
"ISO-8859-1"));
}
}
// we never should get here
return null;
/**
 * Opens the hyphenation grammar file referenced by {@code testFile}.
 *
 * @return a reader that decodes the file's bytes as UTF-8
 * @throws Exception if the file cannot be opened
 */
private Reader getHyphenationReader() throws Exception {
  final FileInputStream grammarStream = new FileInputStream(testFile);
  return new InputStreamReader(grammarStream, "UTF-8");
}
}

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1,68 @@
<?xml version="1.0" encoding="utf-8"?>
<!--
Copyright 1999-2004 The Apache Software Foundation
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
-->
<!-- $Id: hyphenation.dtd,v 1.3 2004/02/27 18:34:59 jeremias Exp $ -->
<!ELEMENT hyphenation-info (hyphen-char?, hyphen-min?,
classes, exceptions?, patterns)>
<!-- Hyphen character to be used in the exception list as shortcut for
<hyphen pre-break="-"/>. Defaults to '-'
-->
<!ELEMENT hyphen-char EMPTY>
<!ATTLIST hyphen-char value CDATA #REQUIRED>
<!-- Default minimum length in characters of hyphenated word fragments
before and after the line break. For some languages this is not
only for aesthetic purposes, wrong hyphens may be generated if this
is not accounted for.
-->
<!ELEMENT hyphen-min EMPTY>
<!ATTLIST hyphen-min before CDATA #REQUIRED>
<!ATTLIST hyphen-min after CDATA #REQUIRED>
<!-- Character equivalent classes: space separated list of character groups, all
characters in a group are to be treated equivalent as far as
the hyphenation algorithm is concerned. The first character in a group
is the group's equivalent character. Patterns should only contain
first characters. It also defines word characters, i.e. a word that
contains characters not present in any of the classes is not hyphenated.
-->
<!ELEMENT classes (#PCDATA)>
<!-- Hyphenation exceptions: space separated list of hyphenated words.
A hyphen is indicated by the hyphen tag, but you can use the
hyphen-char defined previously as shortcut. This is in cases
when the algorithm procedure finds wrong hyphens or you want
to provide your own hyphenation for some words.
-->
<!ELEMENT exceptions (#PCDATA|hyphen)* >
<!-- The hyphenation patterns, space separated. A pattern is made of 'equivalent'
characters as described before, between any two word characters a digit
in the range 0 to 9 may be specified. The absence of a digit is equivalent
to zero. The '.' character is reserved to indicate beginning or ending
of words. -->
<!ELEMENT patterns (#PCDATA)>
<!-- A "full hyphen" equivalent to TeX's \discretionary
with pre-break, post-break and no-break attributes.
To be used in the exceptions list, the hyphen character is not
automatically added -->
<!ELEMENT hyphen EMPTY>
<!ATTLIST hyphen pre CDATA #IMPLIED>
<!ATTLIST hyphen no CDATA #IMPLIED>
<!ATTLIST hyphen post CDATA #IMPLIED>