From 8b7f6e4ef670511a124d60f8cf42121566a333a0 Mon Sep 17 00:00:00 2001 From: Otis Gospodnetic Date: Fri, 22 Dec 2006 23:43:17 +0000 Subject: [PATCH] - LUCENE-759: New n-gram-capable tokenizers and their unit tests. git-svn-id: https://svn.apache.org/repos/asf/lucene/java/trunk@489802 13f79535-47bb-0310-9956-ffa450edef68 --- CHANGES.txt | 4 + .../analysis/ngram/EdgeNGramTokenizer.java | 95 ++++++++++++ .../lucene/analysis/ngram/NGramTokenizer.java | 90 ++++++++++++ .../ngram/EdgeNGramTokenizerTest.java | 81 +++++++++++ .../analysis/ngram/NGramTokenizerTest.java | 137 ++++++++++++++++++ 5 files changed, 407 insertions(+) create mode 100644 contrib/analyzers/src/java/org/apache/lucene/analysis/ngram/EdgeNGramTokenizer.java create mode 100644 contrib/analyzers/src/java/org/apache/lucene/analysis/ngram/NGramTokenizer.java create mode 100644 contrib/analyzers/src/test/org/apache/lucene/analysis/ngram/EdgeNGramTokenizerTest.java create mode 100644 contrib/analyzers/src/test/org/apache/lucene/analysis/ngram/NGramTokenizerTest.java diff --git a/CHANGES.txt b/CHANGES.txt index 92a01318f30..4dbd52e7cf8 100644 --- a/CHANGES.txt +++ b/CHANGES.txt @@ -91,6 +91,10 @@ New features and lives in contrib/miscellaneous. (Chris Hostetter, Otis Gospodnetic) +12. LUCENE-759: Added NGramTokenizer and EdgeNGramTokenizer classes and + their passing unit tests. + (Otis Gospodnetic) + API Changes 1. LUCENE-438: Remove "final" from Token, implement Cloneable, allow diff --git a/contrib/analyzers/src/java/org/apache/lucene/analysis/ngram/EdgeNGramTokenizer.java b/contrib/analyzers/src/java/org/apache/lucene/analysis/ngram/EdgeNGramTokenizer.java new file mode 100644 index 00000000000..eaf85807c63 --- /dev/null +++ b/contrib/analyzers/src/java/org/apache/lucene/analysis/ngram/EdgeNGramTokenizer.java @@ -0,0 +1,95 @@ +package org.apache.lucene.analysis.ngram; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import org.apache.lucene.analysis.Token; +import org.apache.lucene.analysis.Tokenizer; + +import java.io.IOException; +import java.io.Reader; + +/** + * Tokenizes the input into n-grams of given size(s). + * @author Otis Gospodnetic + */ +public class EdgeNGramTokenizer extends Tokenizer { + // which side to get the n-gram from + // TODO: switch to using this enum when we move to 1.5+ +// public enum Side { +// FRONT (), +// BACK (); +// } + public static class Side { + public static Side FRONT = new Side("front"); + public static Side BACK = new Side("back"); + private Side(String label) {} + } + private int gramSize; + private Side side; + private int inLen; + private String inStr; + private boolean started = false; + + /** + * Creates EdgeNGramTokenizer with given min and max n-grams. + * @param input Reader holding the input to be tokenized + * @param side the {@link Side} from which to chop off an n-gram + * @param gramSize the n-gram size to generate + */ + public EdgeNGramTokenizer(Reader input, Side side, int gramSize) { + super(input); + if (gramSize < 1) { + throw new IllegalArgumentException("gramSize must be greater than zero"); + } + this.gramSize = gramSize; + this.side = side; + } + public EdgeNGramTokenizer(Reader input, String side, int gramSize) { + + } + + /** Returns the next token in the stream, or null at EOS. */ + public final Token next() throws IOException { + // if we already returned the edge n-gram, we are done + if (started) + return null; + if (!started) { + started = true; + char[] chars = new char[1024]; + input.read(chars); + inStr = new String(chars).trim(); // remove any trailing empty strings + inLen = inStr.length(); + } + // if the input is too short, we can't generate any n-grams + if (gramSize > inLen) + return null; + if (side == Side.FRONT) + return new Token(inStr.substring(0, gramSize), 0, gramSize); + else + return new Token(inStr.substring(inLen-gramSize), inLen-gramSize, inLen); + } + + static Side side(String label) { + if (label == null || label.isEmpty()) + throw new IllegalArgumentException("Label must be either 'front' or 'back'"); + if (label.equals("front")) + return Side.FRONT; + else + return Side.BACK; + } +} diff --git a/contrib/analyzers/src/java/org/apache/lucene/analysis/ngram/NGramTokenizer.java b/contrib/analyzers/src/java/org/apache/lucene/analysis/ngram/NGramTokenizer.java new file mode 100644 index 00000000000..8864b6b4539 --- /dev/null +++ b/contrib/analyzers/src/java/org/apache/lucene/analysis/ngram/NGramTokenizer.java @@ -0,0 +1,90 @@ +package org.apache.lucene.analysis.ngram; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import org.apache.lucene.analysis.Token; +import org.apache.lucene.analysis.Tokenizer; + +import java.io.IOException; +import java.io.Reader; + +/** + * Tokenizes the input into n-grams of given size(s). + * @author Otis Gospodnetic + */ +public class NGramTokenizer extends Tokenizer { + public static final int DEFAULT_MIN_NGRAM_SIZE = 1; + public static final int DEFAULT_MAX_NGRAM_SIZE = 2; + + private int minGram, maxGram; + private int gramSize; + private int pos = 0; + private int inLen; + private String inStr; + private boolean started = false; + + /** + * Creates NGramTokenizer with given min and max n-grams. + * @param input Reader holding the input to be tokenized + * @param minGram the smallest n-gram to generate + * @param maxGram the largest n-gram to generate + */ + public NGramTokenizer(Reader input, int minGram, int maxGram) { + super(input); + if (minGram < 1) { + throw new IllegalArgumentException("minGram must be greater than zero"); + } + if (minGram > maxGram) { + throw new IllegalArgumentException("minGram must not be greater than maxGram"); + } + this.minGram = minGram; + this.maxGram = maxGram; + } + /** + * Creates NGramTokenizer with default min and max n-grams. + * @param input Reader holding the input to be tokenized + */ + public NGramTokenizer(Reader input) { + this(input, DEFAULT_MIN_NGRAM_SIZE, DEFAULT_MAX_NGRAM_SIZE); + } + + /** Returns the next token in the stream, or null at EOS. */ + public final Token next() throws IOException { + if (!started) { + started = true; + gramSize = minGram; + char[] chars = new char[1024]; + input.read(chars); + inStr = new String(chars).trim(); // remove any trailing empty strings + inLen = inStr.length(); + } + + if (pos+gramSize > inLen) { // if we hit the end of the string + pos = 0; // reset to beginning of string + gramSize++; // increase n-gram size + if (gramSize > maxGram) // we are done + return null; + if (pos+gramSize > inLen) + return null; + } + String gram = inStr.substring(pos, pos+gramSize); + int oldPos = pos; + pos++; + return new Token(gram, oldPos, oldPos+gramSize); + } +} diff --git a/contrib/analyzers/src/test/org/apache/lucene/analysis/ngram/EdgeNGramTokenizerTest.java b/contrib/analyzers/src/test/org/apache/lucene/analysis/ngram/EdgeNGramTokenizerTest.java new file mode 100644 index 00000000000..dd50c0a4ad0 --- /dev/null +++ b/contrib/analyzers/src/test/org/apache/lucene/analysis/ngram/EdgeNGramTokenizerTest.java @@ -0,0 +1,81 @@ +package org.apache.lucene.analysis.ngram; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import org.apache.lucene.analysis.Token; + +import java.io.StringReader; + +import junit.framework.TestCase; + +/** + * Tests {@link EdgeNGramTokenizer} for correctness. + * @author Otis Gospodnetic + */ +public class EdgeNGramTokenizerTest extends TestCase { + private StringReader input; + + public void setUp() { + input = new StringReader("abcde"); + } + + public void testInvalidInput() throws Exception { + boolean gotException = false; + try { + new EdgeNGramTokenizer(input, EdgeNGramTokenizer.Side.FRONT, 0); + } catch (IllegalArgumentException e) { + gotException = true; + } + assertTrue(gotException); + } + + public void testInvalidInput2() throws Exception { + boolean gotException = false; + try { + new EdgeNGramTokenizer(input, EdgeNGramTokenizer.Side.FRONT, -1); + } catch (IllegalArgumentException e) { + gotException = true; + } + assertTrue(gotException); + } + + public void testFrontUnigram() throws Exception { + EdgeNGramTokenizer tokenizer = new EdgeNGramTokenizer(input, EdgeNGramTokenizer.Side.FRONT, 1); + Token token = null; + token = tokenizer.next(); + assertEquals("(a,0,1)", token.toString()); + token = tokenizer.next(); + assertNull(token); + } + + public void testBackUnigram() throws Exception { + EdgeNGramTokenizer tokenizer = new EdgeNGramTokenizer(input, EdgeNGramTokenizer.Side.BACK, 1); + Token token = null; + token = tokenizer.next(); + assertEquals("(e,4,5)", token.toString()); + token = tokenizer.next(); + assertNull(token); + } + + public void testOversizedNgrams() throws Exception { + EdgeNGramTokenizer tokenizer = new EdgeNGramTokenizer(input, EdgeNGramTokenizer.Side.FRONT, 6); + Token token = null; + token = tokenizer.next(); + assertNull(token); + } +} diff --git a/contrib/analyzers/src/test/org/apache/lucene/analysis/ngram/NGramTokenizerTest.java b/contrib/analyzers/src/test/org/apache/lucene/analysis/ngram/NGramTokenizerTest.java new file mode 100644 index 00000000000..ae412050ae5 --- /dev/null +++ b/contrib/analyzers/src/test/org/apache/lucene/analysis/ngram/NGramTokenizerTest.java @@ -0,0 +1,137 @@ +package org.apache.lucene.analysis.ngram; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import org.apache.lucene.analysis.Token; + +import java.io.StringReader; +import java.util.ArrayList; + +import junit.framework.TestCase; + +/** + * Tests {@link NGramTokenizer} for correctness. + * @author Otis Gospodnetic + */ +public class NGramTokenizerTest extends TestCase { + private StringReader input; + private ArrayList tokens = new ArrayList(); + + public void setUp() { + input = new StringReader("abcde"); + } + + public void testInvalidInput() throws Exception { + boolean gotException = false; + try { + new NGramTokenizer(input, 2, 1); + } catch (IllegalArgumentException e) { + gotException = true; + } + assertTrue(gotException); + } + + public void testInvalidInput2() throws Exception { + boolean gotException = false; + try { + new NGramTokenizer(input, 0, 1); + } catch (IllegalArgumentException e) { + gotException = true; + } + assertTrue(gotException); + } + + public void testUnigrams() throws Exception { + NGramTokenizer tokenizer = new NGramTokenizer(input, 1, 1); + + Token token = null; + do { + token = tokenizer.next(); + if (token != null) { + tokens.add(token.toString()); +// System.out.println(token.termText()); +// System.out.println(token); +// Thread.sleep(1000); + } + } while (token != null); + + assertEquals(5, tokens.size()); + ArrayList exp = new ArrayList(); + exp.add("(a,0,1)"); exp.add("(b,1,2)"); exp.add("(c,2,3)"); exp.add("(d,3,4)"); exp.add("(e,4,5)"); + assertEquals(exp, tokens); + } + + public void testBigrams() throws Exception { + NGramTokenizer tokenizer = new NGramTokenizer(input, 2, 2); + + Token token = null; + do { + token = tokenizer.next(); + if (token != null) { + tokens.add(token.toString()); +// System.out.println(token.termText()); +// System.out.println(token); +// Thread.sleep(1000); + } + } while (token != null); + + assertEquals(4, tokens.size()); + ArrayList exp = new ArrayList(); + exp.add("(ab,0,2)"); exp.add("(bc,1,3)"); exp.add("(cd,2,4)"); exp.add("(de,3,5)"); + assertEquals(exp, tokens); + } + + public void testNgrams() throws Exception { + NGramTokenizer tokenizer = new NGramTokenizer(input, 1, 3); + + Token token = null; + do { + token = tokenizer.next(); + if (token != null) { + tokens.add(token.toString()); +// System.out.println(token.termText()); +// System.out.println(token); +// Thread.sleep(1000); + } + } while (token != null); + + assertEquals(12, tokens.size()); + ArrayList exp = new ArrayList(); + exp.add("(a,0,1)"); exp.add("(b,1,2)"); exp.add("(c,2,3)"); exp.add("(d,3,4)"); exp.add("(e,4,5)"); + exp.add("(ab,0,2)"); exp.add("(bc,1,3)"); exp.add("(cd,2,4)"); exp.add("(de,3,5)"); + exp.add("(abc,0,3)"); exp.add("(bcd,1,4)"); exp.add("(cde,2,5)"); + assertEquals(exp, tokens); + } + + public void testOversizedNgrams() throws Exception { + NGramTokenizer tokenizer = new NGramTokenizer(input, 6, 7); + + Token token = null; + do { + token = tokenizer.next(); + if (token != null) { + tokens.add(token.toString()); +// System.out.println(token.termText()); +// System.out.println(token); +// Thread.sleep(1000); + } + } while (token != null); + + assertTrue(tokens.isEmpty()); + } +}