LUCENE-1786: improve performance of TestCompoundWordTokenFilter

git-svn-id: https://svn.apache.org/repos/asf/lucene/java/trunk@892355 13f79535-47bb-0310-9956-ffa450edef68
2009-12-18 19:25:24 +00:00 · 2009-12-18 19:25:24 +00:00 · 16eaa6198f
parent aff4f4a464
commit 16eaa6198f
3 changed files with 1302 additions and 110 deletions
--- a/contrib/analyzers/common/src/test/org/apache/lucene/analysis/compound/TestCompoundWordTokenFilter.java
+++ b/contrib/analyzers/common/src/test/org/apache/lucene/analysis/compound/TestCompoundWordTokenFilter.java
@ -17,94 +17,65 @@ package org.apache.lucene.analysis.compound;
 * limitations under the License.
 */

-import java.io.ByteArrayInputStream;
-import java.io.ByteArrayOutputStream;
-import java.io.IOException;
-import java.io.InputStream;
+import java.io.File;
+import java.io.FileInputStream;
+import java.io.InputStreamReader;
 import java.io.Reader;
 import java.io.StringReader;
-import java.net.URL;
-import java.util.Arrays;
-import java.util.Collections;
-import java.util.LinkedList;
-import java.util.List;
-import java.util.zip.ZipEntry;
-import java.util.zip.ZipInputStream;

 import org.apache.lucene.analysis.BaseTokenStreamTestCase;
-import org.apache.lucene.analysis.TokenFilter;
 import org.apache.lucene.analysis.Tokenizer;
 import org.apache.lucene.analysis.WhitespaceTokenizer;
 import org.apache.lucene.analysis.compound.hyphenation.HyphenationTree;
-import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
-import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
 import org.apache.lucene.analysis.tokenattributes.TermAttribute;

 public class TestCompoundWordTokenFilter extends BaseTokenStreamTestCase {
-  private static String[] locations = {
-      "http://dfn.dl.sourceforge.net/sourceforge/offo/offo-hyphenation.zip",
-      "http://surfnet.dl.sourceforge.net/sourceforge/offo/offo-hyphenation.zip",
-      "http://superb-west.dl.sourceforge.net/sourceforge/offo/offo-hyphenation.zip",
-      "http://voxel.dl.sourceforge.net/sourceforge/offo/offo-hyphenation.zip"};
-      // too slow:
-      //"http://superb-east.dl.sourceforge.net/sourceforge/offo/offo-hyphenation.zip"};
-
-  private static byte[] patternsFileContent;
+  static final File dataDir = new File(System.getProperty("dataDir", "./bin"));
+  static final File testFile = new File(dataDir, "org/apache/lucene/analysis/compound/da_UTF8.xml");

  @Override
  protected void setUp() throws Exception {
    super.setUp();
-    getHyphenationPatternFileContents();
  }

-  public void testHyphenationCompoundWordsDE() throws Exception {
-    String[] dict = { "Rind", "Fleisch", "Draht", "Schere", "Gesetz",
-        "Aufgabe", "Überwachung" };
+  public void testHyphenationCompoundWordsDA() throws Exception {
+    String[] dict = { "læse", "hest" };

-    Reader reader = getHyphenationReader("de_DR.xml");
-    if (reader == null) {
-      // we gracefully die if we have no reader
-      return;
-    }
+    Reader reader = getHyphenationReader();

    HyphenationTree hyphenator = HyphenationCompoundWordTokenFilter
        .getHyphenationTree(reader);

    HyphenationCompoundWordTokenFilter tf = new HyphenationCompoundWordTokenFilter(
        new WhitespaceTokenizer(new StringReader(
-            "Rindfleischüberwachungsgesetz Drahtschere abba")), hyphenator,
+            "min veninde som er lidt af en læsehest")), hyphenator,
        dict, CompoundWordTokenFilterBase.DEFAULT_MIN_WORD_SIZE,
        CompoundWordTokenFilterBase.DEFAULT_MIN_SUBWORD_SIZE,
        CompoundWordTokenFilterBase.DEFAULT_MAX_SUBWORD_SIZE, false);
-    assertTokenStreamContents(tf, new String[] { "Rindfleischüberwachungsgesetz", "Rind",
-        "fleisch", "überwachung", "gesetz", "Drahtschere", "Draht", "schere",
-        "abba" }, new int[] { 0, 0, 4, 11, 23, 30, 30, 35, 42 }, new int[] {
-        29, 4, 11, 22, 29, 41, 35, 41, 46 }, new int[] { 1, 0, 0, 0, 0, 1, 0,
-        0, 1 });
+    assertTokenStreamContents(tf, 
+        new String[] { "min", "veninde", "som", "er", "lidt", "af", "en", "læsehest", "læse", "hest" },
+        new int[] { 1, 1, 1, 1, 1, 1, 1, 1, 0, 0 }
+    );
  }

  public void testHyphenationCompoundWordsDELongestMatch() throws Exception {
-    String[] dict = { "Rind", "Fleisch", "Draht", "Schere", "Gesetz",
-        "Aufgabe", "Überwachung", "Rindfleisch", "Überwachungsgesetz" };
-
-    Reader reader = getHyphenationReader("de_DR.xml");
-    if (reader == null) {
-      // we gracefully die if we have no reader
-      return;
-    }
+    String[] dict = { "basketball", "basket", "ball", "kurv" };
+    Reader reader = getHyphenationReader();

    HyphenationTree hyphenator = HyphenationCompoundWordTokenFilter
        .getHyphenationTree(reader);

+    // the word basket will not be added due to the longest match option
    HyphenationCompoundWordTokenFilter tf = new HyphenationCompoundWordTokenFilter(
        new WhitespaceTokenizer(new StringReader(
-            "Rindfleischüberwachungsgesetz")), hyphenator, dict,
+            "basketballkurv")), hyphenator, dict,
        CompoundWordTokenFilterBase.DEFAULT_MIN_WORD_SIZE,
        CompoundWordTokenFilterBase.DEFAULT_MIN_SUBWORD_SIZE, 40, true);
-    assertTokenStreamContents(tf, new String[] { "Rindfleischüberwachungsgesetz",
-        "Rindfleisch", "fleisch", "überwachungsgesetz", "gesetz" }, new int[] {
-        0, 0, 4, 11, 23 }, new int[] { 29, 11, 11, 29, 29 }, new int[] { 1, 0,
-        0, 0, 0 });
+    assertTokenStreamContents(tf, 
+        new String[] { "basketballkurv", "basketball", "ball", "kurv" },
+        new int[] { 1, 0, 0, 0 }
+    );
+
  }

  public void testDumbCompoundWordsSE() throws Exception {
@ -157,19 +128,10 @@ public class TestCompoundWordTokenFilter extends BaseTokenStreamTestCase {
    String[] dict = { "Rind", "Fleisch", "Draht", "Schere", "Gesetz",
        "Aufgabe", "Überwachung" };

-    Reader reader = getHyphenationReader("de_DR.xml");
-    if (reader == null) {
-      // we gracefully die if we have no reader
-      return;
-    }
-
-    HyphenationTree hyphenator = HyphenationCompoundWordTokenFilter
-        .getHyphenationTree(reader);
-
    Tokenizer wsTokenizer = new WhitespaceTokenizer(new StringReader(
        "Rindfleischüberwachungsgesetz"));
-    HyphenationCompoundWordTokenFilter tf = new HyphenationCompoundWordTokenFilter(
-        wsTokenizer, hyphenator, dict,
+    DictionaryCompoundWordTokenFilter tf = new DictionaryCompoundWordTokenFilter(
+        wsTokenizer, dict,
        CompoundWordTokenFilterBase.DEFAULT_MIN_WORD_SIZE,
        CompoundWordTokenFilterBase.DEFAULT_MIN_SUBWORD_SIZE,
        CompoundWordTokenFilterBase.DEFAULT_MAX_SUBWORD_SIZE, false);
@ -185,53 +147,7 @@ public class TestCompoundWordTokenFilter extends BaseTokenStreamTestCase {
    assertEquals("Rindfleischüberwachungsgesetz", termAtt.term());
  }

-  private void getHyphenationPatternFileContents() {
-    if (patternsFileContent == null) {
-      try {
-        List urls = new LinkedList(Arrays.asList(locations));
-        Collections.shuffle(urls);
-        URL url = new URL((String)urls.get(0));
-        InputStream in = url.openStream();
-        byte[] buffer = new byte[1024];
-        ByteArrayOutputStream out = new ByteArrayOutputStream();
-        int count;
-
-        while ((count = in.read(buffer)) != -1) {
-          out.write(buffer, 0, count);
-        }
-        in.close();
-        out.close();
-        patternsFileContent = out.toByteArray();
-      } catch (IOException e) {
-        // we swallow all exceptions - the user might have no internet connection
-      }
-    }
-  }
-
-  private Reader getHyphenationReader(String filename) throws Exception {
-    if (patternsFileContent == null) {
-      return null;
-    }
-
-    ZipInputStream zipstream = new ZipInputStream(new ByteArrayInputStream(
-        patternsFileContent));
-
-    ZipEntry entry;
-    while ((entry = zipstream.getNextEntry()) != null) {
-      if (entry.getName().equals("offo-hyphenation/hyph/" + filename)) {
-        byte[] buffer = new byte[1024];
-        ByteArrayOutputStream outstream = new ByteArrayOutputStream();
-        int count;
-        while ((count = zipstream.read(buffer)) != -1) {
-          outstream.write(buffer, 0, count);
-        }
-        outstream.close();
-        zipstream.close();
-        return new StringReader(new String(outstream.toByteArray(),
-            "ISO-8859-1"));
-      }
-    }
-    // we never should get here
-    return null;
+  private Reader getHyphenationReader() throws Exception {
+    return new InputStreamReader(new FileInputStream(testFile), "UTF-8");
  }
 }
--- a/contrib/analyzers/common/src/test/org/apache/lucene/analysis/compound/da_UTF8.xml
+++ b/contrib/analyzers/common/src/test/org/apache/lucene/analysis/compound/da_UTF8.xml
--- a/contrib/analyzers/common/src/test/org/apache/lucene/analysis/compound/hyphenation.dtd
+++ b/contrib/analyzers/common/src/test/org/apache/lucene/analysis/compound/hyphenation.dtd
@ -0,0 +1,68 @@
+<?xml version="1.0" encoding="utf-8"?>
+<!--
+  Copyright 1999-2004 The Apache Software Foundation
+
+  Licensed under the Apache License, Version 2.0 (the "License");
+  you may not use this file except in compliance with the License.
+  You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+
+  Unless required by applicable law or agreed to in writing, software
+  distributed under the License is distributed on an "AS IS" BASIS,
+  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+  See the License for the specific language governing permissions and
+  limitations under the License.
+-->
+<!-- $Id: hyphenation.dtd,v 1.3 2004/02/27 18:34:59 jeremias Exp $ -->
+
+<!ELEMENT hyphenation-info (hyphen-char?, hyphen-min?,
+                           classes, exceptions?, patterns)>
+
+<!-- Hyphen character to be used in the exception list as a shortcut for
+     <hyphen pre="-"/>. Defaults to '-'.
+-->
+<!ELEMENT hyphen-char EMPTY>
+<!ATTLIST hyphen-char value CDATA #REQUIRED>
+
+<!-- Default minimum length in characters of hyphenated word fragments
+     before and after the line break. For some languages this is not
+     only for aesthetic purposes; wrong hyphens may be generated if this
+     is not accounted for.
+-->
+<!ELEMENT hyphen-min EMPTY>
+<!ATTLIST hyphen-min before CDATA #REQUIRED>
+<!ATTLIST hyphen-min after CDATA #REQUIRED>
+
+<!-- Character equivalent classes: space-separated list of character groups; all
+     characters in a group are to be treated as equivalent as far as
+     the hyphenation algorithm is concerned. The first character in a group
+     is the group's equivalent character. Patterns should only contain
+     first characters. It also defines word characters, i.e. a word that
+     contains characters not present in any of the classes is not hyphenated.
+-->
+<!ELEMENT classes (#PCDATA)>
+
+<!-- Hyphenation exceptions: space-separated list of hyphenated words.
+     A hyphen is indicated by the hyphen tag, but you can use the
+     hyphen-char defined previously as a shortcut. Use this in cases
+     where the algorithm finds wrong hyphens or you want
+     to provide your own hyphenation for some words.
+-->
+<!ELEMENT exceptions (#PCDATA|hyphen)* >
+
+<!-- The hyphenation patterns, space-separated. A pattern is made of 'equivalent'
+     characters as described before; between any two word characters a digit
+     in the range 0 to 9 may be specified. The absence of a digit is equivalent
+     to zero. The '.' character is reserved to indicate the beginning or ending
+     of words. -->
+<!ELEMENT patterns (#PCDATA)>
+
+<!-- A "full hyphen" equivalent to TeX's \discretionary
+     with pre-break, post-break and no-break attributes.
+     To be used in the exceptions list; the hyphen character is not
+     automatically added. -->
+<!ELEMENT hyphen EMPTY>
+<!ATTLIST hyphen pre CDATA #IMPLIED>
+<!ATTLIST hyphen no CDATA #IMPLIED>
+<!ATTLIST hyphen post CDATA #IMPLIED>