From 7b570fc8b2c8754900e8823731b4fa5e7a0d67f1 Mon Sep 17 00:00:00 2001
From: Otis Gospodnetic
Date: Thu, 1 Mar 2007 14:22:57 +0000
Subject: [PATCH] - LUCENE-759: Made the tokenizer capable of creating n-grams
 of varying sizes - from min to max characters per n-gram. Patch from Adam
 Hiatt.

git-svn-id: https://svn.apache.org/repos/asf/lucene/java/trunk@513344 13f79535-47bb-0310-9956-ffa450edef68
---
 .../analysis/ngram/EdgeNGramTokenizer.java    | 122 ++++++++++++------
 .../ngram/EdgeNGramTokenizerTest.java         | 120 +++++++++++------
 2 files changed, 163 insertions(+), 79 deletions(-)

diff --git a/contrib/analyzers/src/java/org/apache/lucene/analysis/ngram/EdgeNGramTokenizer.java b/contrib/analyzers/src/java/org/apache/lucene/analysis/ngram/EdgeNGramTokenizer.java
index b1018f7b8df..f284547a0e5 100644
--- a/contrib/analyzers/src/java/org/apache/lucene/analysis/ngram/EdgeNGramTokenizer.java
+++ b/contrib/analyzers/src/java/org/apache/lucene/analysis/ngram/EdgeNGramTokenizer.java
@@ -24,75 +24,123 @@ import java.io.IOException;
 import java.io.Reader;
 
 /**
- * Tokenizes the input into n-grams of the given size.
+ * Tokenizes the input from an edge into n-grams of given size(s).
  * @author Otis Gospodnetic
+ * @author Adam Hiatt
  */
 public class EdgeNGramTokenizer extends Tokenizer {
-  // which side to get the n-gram from
-  // TODO: switch to using this enum when we move to 1.5+
-//  public enum Side {
-//    FRONT (),
-//    BACK ();
-//  }
+  public static final Side DEFAULT_SIDE = Side.FRONT;
+  public static final int DEFAULT_MAX_GRAM_SIZE = 1;
+  public static final int DEFAULT_MIN_GRAM_SIZE = 1;
+
+  // Replace this with an enum when the Java 1.5 upgrade is made, the impl will be simplified
   /** Specifies which side of the input the n-gram should be generated from */
   public static class Side {
+    private String label;
+
     /** Get the n-gram from the front of the input */
     public static Side FRONT = new Side("front");
+
     /** Get the n-gram from the end of the input */
     public static Side BACK = new Side("back");
-    private Side(String label) {}
+
+    // Private ctor
+    private Side(String label) { this.label = label; }
+
+    public String getLabel() { return label; }
+
+    // Get the appropriate Side from a string
+    public static Side getSide(String sideName) {
+      if (FRONT.getLabel().equals(sideName)) {
+        return FRONT;
+      }
+      else if (BACK.getLabel().equals(sideName)) {
+        return BACK;
+      }
+      return null;
+    }
   }
 
+  private int minGram;
+  private int maxGram;
   private int gramSize;
   private Side side;
+  private boolean started = false;
   private int inLen;
   private String inStr;
-  private boolean started = false;
+
   /**
-   * Creates EdgeNGramTokenizer that can generate an n-gram of the given size.
+   * Creates EdgeNGramTokenizer that can generate n-grams in the sizes of the given range
+   *
    * @param input Reader holding the input to be tokenized
-   * @param side the {@link Side} from which to chop off an n-gram
-   * @param gramSize the size of the n-gram to generate
+   * @param side the {@link Side} from which to chop off an n-gram
+   * @param minGram the smallest n-gram to generate
+   * @param maxGram the largest n-gram to generate
    */
-  public EdgeNGramTokenizer(Reader input, Side side, int gramSize) {
+  public EdgeNGramTokenizer(Reader input, Side side, int minGram, int maxGram) {
     super(input);
-    if (gramSize < 1) {
-      throw new IllegalArgumentException("gramSize must be greater than zero");
+
+    if (side == null) {
+      throw new IllegalArgumentException("sideLabel must be either front or back");
     }
-    this.gramSize = gramSize;
+
+    if (minGram < 1) {
+      throw new IllegalArgumentException("minGram must be greater than zero");
+    }
+
+    if (minGram > maxGram) {
+      throw new IllegalArgumentException("minGram must not be greater than maxGram");
+    }
+
+    this.minGram = minGram;
+    this.maxGram = maxGram;
     this.side = side;
   }
 
-  public EdgeNGramTokenizer(Reader input, String side, int gramSize) {
-
+  /**
+   * Creates EdgeNGramTokenizer that can generate n-grams in the sizes of the given range
+   *
+   * @param input Reader holding the input to be tokenized
+   * @param sideLabel the name of the {@link Side} from which to chop off an n-gram
+   * @param minGram the smallest n-gram to generate
+   * @param maxGram the largest n-gram to generate
+   */
+  public EdgeNGramTokenizer(Reader input, String sideLabel, int minGram, int maxGram) {
+    this(input, Side.getSide(sideLabel), minGram, maxGram);
   }
 
   /** Returns the next token in the stream, or null at EOS. */
   public final Token next() throws IOException {
-    // if we already returned the edge n-gram, we are done
-    if (started)
-      return null;
+    // if we are just starting, read the whole input
     if (!started) {
       started = true;
       char[] chars = new char[1024];
       input.read(chars);
       inStr = new String(chars).trim();  // remove any trailing empty strings
       inLen = inStr.length();
+      gramSize = minGram;
     }
 
-    // if the input is too short, we can't generate any n-grams
-    if (gramSize > inLen)
-      return null;
-    if (side == Side.FRONT)
-      return new Token(inStr.substring(0, gramSize), 0, gramSize);
-    else
-      return new Token(inStr.substring(inLen-gramSize), inLen-gramSize, inLen);
-  }
-  static Side side(String label) {
-    if (label == null || label.trim().length() == 0)
-      throw new IllegalArgumentException("Label must be either 'front' or 'back'");
-    if (label.equals("front"))
-      return Side.FRONT;
-    else
-      return Side.BACK;
+    // if the remaining input is too short, we can't generate any n-grams
+    if (gramSize > inLen) {
+      return null;
+    }
+
+    // if we have hit the end of our n-gram size range, quit
+    if (gramSize > maxGram) {
+      return null;
+    }
+
+    Token tok;
+    if (side == Side.FRONT) {
+      tok = new Token(inStr.substring(0, gramSize), 0, gramSize);
+    }
+    else {
+      tok = new Token(inStr.substring(inLen-gramSize), inLen-gramSize, inLen);
+    }
+
+    gramSize++;
+    return tok;
   }
 }
diff --git a/contrib/analyzers/src/test/org/apache/lucene/analysis/ngram/EdgeNGramTokenizerTest.java b/contrib/analyzers/src/test/org/apache/lucene/analysis/ngram/EdgeNGramTokenizerTest.java
index dd50c0a4ad0..a140aa2dc30 100644
--- a/contrib/analyzers/src/test/org/apache/lucene/analysis/ngram/EdgeNGramTokenizerTest.java
+++ b/contrib/analyzers/src/test/org/apache/lucene/analysis/ngram/EdgeNGramTokenizerTest.java
@@ -28,54 +28,90 @@ import junit.framework.TestCase;
  * @author Otis Gospodnetic
  */
 public class EdgeNGramTokenizerTest extends TestCase {
-    private StringReader input;
-
-    public void setUp() {
-        input = new StringReader("abcde");
-    }
+  private StringReader input;
 
-    public void testInvalidInput() throws Exception {
-        boolean gotException = false;
-        try {
-            new EdgeNGramTokenizer(input, EdgeNGramTokenizer.Side.FRONT, 0);
-        } catch (IllegalArgumentException e) {
-            gotException = true;
-        }
-        assertTrue(gotException);
-    }
+  public void setUp() {
+    input = new StringReader("abcde");
+  }
 
-    public void testInvalidInput2() throws Exception {
-        boolean gotException = false;
-        try {
-            new EdgeNGramTokenizer(input, EdgeNGramTokenizer.Side.FRONT, -1);
-        } catch (IllegalArgumentException e) {
-            gotException = true;
+  public void testInvalidInput() throws Exception {
+    boolean gotException = false;
+    try {
+      new EdgeNGramTokenizer(input, EdgeNGramTokenizer.Side.FRONT, 0, 0);
+    } catch (IllegalArgumentException e) {
+      gotException = true;
     }
+    assertTrue(gotException);
+  }
 
-    public void testFrontUnigram() throws Exception {
-        EdgeNGramTokenizer tokenizer = new EdgeNGramTokenizer(input, EdgeNGramTokenizer.Side.FRONT, 1);
-        Token token = null;
-        token = tokenizer.next();
-        assertEquals("(a,0,1)", token.toString());
-        token = tokenizer.next();
-        assertNull(token);
+  public void testInvalidInput2() throws Exception {
+    boolean gotException = false;
+    try {
+      new EdgeNGramTokenizer(input, EdgeNGramTokenizer.Side.FRONT, 2, 1);
+    } catch (IllegalArgumentException e) {
+      gotException = true;
     }
+    assertTrue(gotException);
+  }
 
-    public void testBackUnigram() throws Exception {
-        EdgeNGramTokenizer tokenizer = new EdgeNGramTokenizer(input, EdgeNGramTokenizer.Side.BACK, 1);
-        Token token = null;
-        token = tokenizer.next();
-        assertEquals("(e,4,5)", token.toString());
-        token = tokenizer.next();
-        assertNull(token);
+  public void testInvalidInput3() throws Exception {
+    boolean gotException = false;
+    try {
+      new EdgeNGramTokenizer(input, EdgeNGramTokenizer.Side.FRONT, -1, 2);
+    } catch (IllegalArgumentException e) {
+      gotException = true;
     }
+    assertTrue(gotException);
+  }
 
-    public void testOversizedNgrams() throws Exception {
-        EdgeNGramTokenizer tokenizer = new EdgeNGramTokenizer(input, EdgeNGramTokenizer.Side.FRONT, 6);
-        Token token = null;
-        token = tokenizer.next();
-        assertNull(token);
-    }
+  public void testFrontUnigram() throws Exception {
+    EdgeNGramTokenizer tokenizer = new EdgeNGramTokenizer(input, EdgeNGramTokenizer.Side.FRONT, 1, 1);
+    Token token = null;
+    token = tokenizer.next();
+    assertEquals("(a,0,1)", token.toString());
+    token = tokenizer.next();
+    assertNull(token);
+  }
+
+  public void testBackUnigram() throws Exception {
+    EdgeNGramTokenizer tokenizer = new EdgeNGramTokenizer(input, EdgeNGramTokenizer.Side.BACK, 1, 1);
+    Token token = null;
+    token = tokenizer.next();
+    assertEquals("(e,4,5)", token.toString());
+    token = tokenizer.next();
+    assertNull(token);
+  }
+
+  public void testOversizedNgrams() throws Exception {
+    EdgeNGramTokenizer tokenizer = new EdgeNGramTokenizer(input, EdgeNGramTokenizer.Side.FRONT, 6, 6);
+    Token token = null;
+    token = tokenizer.next();
+    assertNull(token);
+  }
+
+  public void testFrontRangeOfNgrams() throws Exception {
+    EdgeNGramTokenizer tokenizer = new EdgeNGramTokenizer(input, EdgeNGramTokenizer.Side.FRONT, 1, 3);
+    Token token = null;
+    token = tokenizer.next();
+    assertEquals("(a,0,1)", token.toString());
+    token = tokenizer.next();
+    assertEquals("(ab,0,2)", token.toString());
+    token = tokenizer.next();
+    assertEquals("(abc,0,3)", token.toString());
+    token = tokenizer.next();
+    assertNull(token);
+  }
+
+  public void testBackRangeOfNgrams() throws Exception {
+    EdgeNGramTokenizer tokenizer = new EdgeNGramTokenizer(input, EdgeNGramTokenizer.Side.BACK, 1, 3);
+    Token token = null;
+    token = tokenizer.next();
+    assertEquals("(e,4,5)", token.toString());
+    token = tokenizer.next();
+    assertEquals("(de,3,5)", token.toString());
+    token = tokenizer.next();
+    assertEquals("(cde,2,5)", token.toString());
+    token = tokenizer.next();
+    assertNull(token);
+  }
 }
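
Below is a minimal usage sketch of the new min/max API, mirroring what the updated tests assert. EdgeNGramDemo is a hypothetical driver class, not part of this patch; the printed token strings follow Token.toString() exactly as asserted in EdgeNGramTokenizerTest.

import java.io.StringReader;

import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.ngram.EdgeNGramTokenizer;

// Hypothetical driver, not part of the patch.
public class EdgeNGramDemo {
  public static void main(String[] args) throws Exception {
    // Front edge, gram sizes 1 through 3: "a", "ab", "abc".
    EdgeNGramTokenizer front = new EdgeNGramTokenizer(
        new StringReader("abcde"), EdgeNGramTokenizer.Side.FRONT, 1, 3);
    for (Token token = front.next(); token != null; token = front.next()) {
      System.out.println(token);  // prints (a,0,1) then (ab,0,2) then (abc,0,3)
    }

    // Back edge, gram sizes 1 through 3: "e", "de", "cde".
    // Offsets stay anchored to the end of the input.
    // The String side label form exercises the new delegating constructor.
    EdgeNGramTokenizer back = new EdgeNGramTokenizer(
        new StringReader("abcde"), "back", 1, 3);
    for (Token token = back.next(); token != null; token = back.next()) {
      System.out.println(token);  // prints (e,4,5) then (de,3,5) then (cde,2,5)
    }
  }
}

Each call to next() emits one gram and then increments gramSize, so the stream ends once gramSize exceeds maxGram or the length of the trimmed input, of which at most the first 1024 characters are read.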