From 3743037bc1d1b7e5822e203b240ef4def9bd651d Mon Sep 17 00:00:00 2001
From: Yonik Seeley <yonik@apache.org>
Date: Tue, 22 Sep 2009 21:55:57 +0000
Subject: [PATCH] SOLR-908: add CommonGramsFilterFactory
 CommonGramsQueryFilterFactory

git-svn-id: https://svn.apache.org/repos/asf/lucene/solr/trunk@817859 13f79535-47bb-0310-9956-ffa450edef68
---
 CHANGES.txt                                   |   5 +
 .../solr/analysis/CommonGramsFilter.java      | 223 +++++++++++++++
 .../analysis/CommonGramsFilterFactory.java    |  89 ++++++
 .../solr/analysis/CommonGramsQueryFilter.java | 138 +++++++++
 .../CommonGramsQueryFilterFactory.java        |  98 +++++++
 .../CommonGramsFilterFactoryTest.java         |  69 +++++
 .../solr/analysis/CommonGramsFilterTest.java  | 265 ++++++++++++++++++
 .../CommonGramsQueryFilterFactoryTest.java    |  68 +++++
 8 files changed, 955 insertions(+)
 create mode 100644 src/java/org/apache/solr/analysis/CommonGramsFilter.java
 create mode 100644 src/java/org/apache/solr/analysis/CommonGramsFilterFactory.java
 create mode 100644 src/java/org/apache/solr/analysis/CommonGramsQueryFilter.java
 create mode 100644 src/java/org/apache/solr/analysis/CommonGramsQueryFilterFactory.java
 create mode 100644 src/test/org/apache/solr/analysis/CommonGramsFilterFactoryTest.java
 create mode 100644 src/test/org/apache/solr/analysis/CommonGramsFilterTest.java
 create mode 100644 src/test/org/apache/solr/analysis/CommonGramsQueryFilterFactoryTest.java
diff --git a/CHANGES.txt b/CHANGES.txt
index cf96c0244cd..846c40f668b 100644
--- a/CHANGES.txt
+++ b/CHANGES.txt
@@ -321,6 +321,11 @@ New Features
 
 80. SOLR-1447 : Simple property injection. <mergePolicy> & <mergeSceduler> syntaxes are deprecated ( Jason rutherglen noble)
 
+82. SOLR-908 : CommonGramsFilterFactory/CommonGramsQueryFilterFactory for
+    speeding up phrase queries containing common words by indexing
+    n-grams and using them at query time. 
+    (Tom Burton-West, Jason Rutherglen via yonik)
+
 Optimizations
 ----------------------
  1. SOLR-374: Use IndexReader.reopen to save resources by re-using parts of the
diff --git a/src/java/org/apache/solr/analysis/CommonGramsFilter.java b/src/java/org/apache/solr/analysis/CommonGramsFilter.java
new file mode 100644
index 00000000000..b31ebc206e4
--- /dev/null
+++ b/src/java/org/apache/solr/analysis/CommonGramsFilter.java
@@ -0,0 +1,223 @@
+/*
+ * Licensed under the Apache License, 
+ * Version 2.0 (the "License"); you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0
+ * Unless required by applicable law or agreed to in writing, software distributed under the License 
+ * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and limitations under the License. 
+ */
+
+package org.apache.solr.analysis;
+
+import java.io.IOException;
+import java.util.Arrays;
+import java.util.Set;
+
+import org.apache.lucene.analysis.CharArraySet;
+import org.apache.lucene.analysis.Token;
+import org.apache.lucene.analysis.TokenStream;
+
+/*
+ * TODO: Rewrite to use new TokenStream api from lucene 2.9 when BufferedTokenStream uses it.
+ * TODO: Consider implementing https://issues.apache.org/jira/browse/LUCENE-1688 changes to stop list and
+ * associated constructors 
+ */
+
+/**
+ * Construct bigrams for frequently occurring terms while indexing. Single terms
+ * are still indexed too, with bigrams overlaid. This is achieved through the
+ * use of {@link Token#Set<String>PositionIncrement(int)}. Bigrams have a type
+ * of "gram" Example
+ * <ul>
+ * <li>input:"the quick brown fox"</li>
+ * <li>output:|"the","the-quick"|"brown"|"fox"|</li>
+ * <li>"the-quick" has a position increment of 0 so it is in the same position
+ * as "the" "the-quick" has a term.type() of "gram"</li>
+ * 
+ * </ul>
+ */
+
+/*
+ * Constructors and makeCommonSet based on similar code in StopFilter
+ */
+
+public class CommonGramsFilter extends BufferedTokenStream {
+
+  private static final char SEPARATOR = '_';
+
+  private final CharArraySet commonWords;
+
+  private StringBuilder buffer = new StringBuilder();
+
+  /**
+   * Construct a token stream filtering the given input using a Set of common
+   * words to create bigrams. Outputs both unigrams with position increment and
+   * bigrams with position increment 0 type=gram where one or both of the words
+   * in a potential bigram are in the set of common words .
+   * 
+   * @param input TokenStream input in filter chain
+   * @param commonWords The set of common words.
+   * 
+   */
+  public CommonGramsFilter(TokenStream input, Set commonWords) {
+    this(input, commonWords, false);
+  }
+
+  /**
+   * Construct a token stream filtering the given input using a Set of common
+   * words to create bigrams, case-sensitive if ignoreCase is false (unless Set
+   * is CharArraySet). If <code>commonWords</code> is an instance of
+   * {@link CharArraySet} (true if <code>makeCommonSet()</code> was used to
+   * construct the set) it will be directly used and <code>ignoreCase</code>
+   * will be ignored since <code>CharArraySet</code> directly controls case
+   * sensitivity.
+   * <p/>
+   * If <code>commonWords</code> is not an instance of {@link CharArraySet}, a
+   * new CharArraySet will be constructed and <code>ignoreCase</code> will be
+   * used to specify the case sensitivity of that set.
+   * 
+   * @param input TokenStream input in filter chain.
+   * @param commonWords The set of common words.
+   * @param ignoreCase -Ignore case when constructing bigrams for common words.
+   */
+  public CommonGramsFilter(TokenStream input, Set commonWords,
+      boolean ignoreCase) {
+    super(input);
+    if (commonWords instanceof CharArraySet) {
+      this.commonWords = (CharArraySet) commonWords;
+    } else {
+      this.commonWords = new CharArraySet(commonWords.size(), ignoreCase);
+      this.commonWords.addAll(commonWords);
+    }
+    init();
+  }
+
+  /**
+   * Construct a token stream filtering the given input using an Array of common
+   * words to create bigrams.
+   * 
+   * @param input Tokenstream in filter chain
+   * @param commonWords words to be used in constructing bigrams
+   */
+  public CommonGramsFilter(TokenStream input, String[] commonWords) {
+    this(input, commonWords, false);
+    init();
+  }
+
+  /**
+   * Construct a token stream filtering the given input using an Array of common
+   * words to create bigrams and is case-sensitive if ignoreCase is false.
+   * 
+   * @param input Tokenstream in filter chain
+   * @param commonWords words to be used in constructing bigrams
+   * @param ignoreCase -Ignore case when constructing bigrams for common words.
+   */
+  public CommonGramsFilter(TokenStream input, String[] commonWords,
+      boolean ignoreCase) {
+    super(input);
+    this.commonWords = (CharArraySet) makeCommonSet(commonWords, ignoreCase);
+    init();
+  }
+
+  // Here for future moving to 2.9 api See StopFilter code
+
+  public void init() {
+    /**
+     * termAtt = (TermAttribute) addAttribute(TermAttribute.class); posIncrAtt
+     * =(PositionIncrementAttribute)
+     * addAttribute(PositionIncrementAttribute.class); typeAdd =(TypeAttribute)
+     * addAttribute(TypeAttribute.class);
+     */
+  }
+
+  /**
+   * Build a CharArraySet from an array of common words, appropriate for passing
+   * into the CommonGramsFilter constructor. This permits this commonWords
+   * construction to be cached once when an Analyzer is constructed.
+   * 
+   * @see #makeCommonSet(java.lang.String[], boolean) passing false to
+   *      ignoreCase
+   */
+  public static final CharArraySet makeCommonSet(String[] commonWords) {
+    return makeCommonSet(commonWords, false);
+  }
+
+  /**
+   * Build a CharArraySet from an array of common words, appropriate for passing
+   * into the CommonGramsFilter constructor,case-sensitive if ignoreCase is
+   * false.
+   * 
+   * @param commonWords
+   * @param ignoreCase If true, all words are lower cased first.
+   * @return a Set containing the words
+   */
+  public static final CharArraySet makeCommonSet(String[] commonWords,
+      boolean ignoreCase) {
+    CharArraySet commonSet = new CharArraySet(commonWords.length, ignoreCase);
+    commonSet.addAll(Arrays.asList(commonWords));
+    return commonSet;
+  }
+
+  /**
+   * Inserts bigrams for common words into a token stream. For each input token,
+   * output the token. If the token and/or the following token are in the list
+   * of common words also output a bigram with position increment 0 and
+   * type="gram"
+   */
+  /*
+   * TODO: implement new lucene 2.9 API incrementToken() instead of deprecated
+   * Token.next() TODO:Consider adding an option to not emit unigram stopwords
+   * as in CDL XTF BigramStopFilter, CommonGramsQueryFilter would need to be
+   * changed to work with this. TODO: Consider optimizing for the case of three
+   * commongrams i.e "man of the year" normally produces 3 bigrams: "man-of",
+   * "of-the", "the-year" but with proper management of positions we could
+   * eliminate the middle bigram "of-the"and save a disk seek and a whole set of
+   * position lookups.
+   */
+  public Token process(Token token) throws IOException {
+    Token next = peek(1);
+    // if this is the last token just spit it out. Any commongram would have
+    // been output in the previous call
+    if (next == null) {
+      return token;
+    }
+
+    /**
+     * if this token or next are common then construct a bigram with type="gram"
+     * position increment = 0, and put it in the output queue. It will be
+     * returned when super.next() is called, before this method gets called with
+     * a new token from the input stream See implementation of next() in
+     * BufferedTokenStream
+     */
+
+    if (isCommon(token) || isCommon(next)) {
+      Token gram = gramToken(token, next);
+      write(gram);
+    }
+    // we always return the unigram token
+    return token;
+  }
+
+  /** True if token is for a common term. */
+  private boolean isCommon(Token token) {
+    return commonWords != null
+        && commonWords.contains(token.termBuffer(), 0, token.termLength());
+  }
+
+  /** Construct a compound token. */
+  private Token gramToken(Token first, Token second) {
+    buffer.setLength(0);
+    buffer.append(first.termText());
+    buffer.append(SEPARATOR);
+    buffer.append(second.termText());
+    Token result = new Token(buffer.toString(), first.startOffset(), second
+        .endOffset(), "gram");
+    result.setPositionIncrement(0);
+    return result;
+  }
+  
+  public void reset() throws IOException {
+    super.reset();
+    buffer.setLength(0);
+  }
+}
diff --git a/src/java/org/apache/solr/analysis/CommonGramsFilterFactory.java b/src/java/org/apache/solr/analysis/CommonGramsFilterFactory.java
new file mode 100644
index 00000000000..f7820b6c140
--- /dev/null
+++ b/src/java/org/apache/solr/analysis/CommonGramsFilterFactory.java
@@ -0,0 +1,89 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.solr.analysis;
+
+import java.io.IOException;
+import java.util.List;
+import java.util.Set;
+
+import org.apache.lucene.analysis.CharArraySet;
+import org.apache.lucene.analysis.StopAnalyzer;
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.solr.common.ResourceLoader;
+import org.apache.solr.common.util.StrUtils;
+import org.apache.solr.util.plugin.ResourceLoaderAware;
+
+/**
+ * Constructs a CommonGramsFilter
+ */
+
+/*
+ * This is pretty close to a straight copy from StopFilterFactory
+ */
+public class CommonGramsFilterFactory extends BaseTokenFilterFactory implements
+    ResourceLoaderAware {
+
+  public void inform(ResourceLoader loader) {
+    String commonWordFiles = args.get("words");
+    ignoreCase = getBoolean("ignoreCase", false);
+    enablePositionIncrements = getBoolean("enablePositionIncrements", false);
+
+    if (commonWordFiles != null) {
+      try {
+        List<String> files = StrUtils.splitFileNames(commonWordFiles);
+          if (commonWords == null && files.size() > 0){
+            //default stopwords list has 35 or so words, but maybe don't make it that big to start
+            commonWords = new CharArraySet(files.size() * 10, ignoreCase);
+          }
+          for (String file : files) {
+            List<String> wlist = loader.getLines(file.trim());
+            //TODO: once StopFilter.makeStopSet(List) method is available, switch to using that so we can avoid a toArray() call
+            commonWords.addAll(CommonGramsFilter.makeCommonSet((String[])wlist.toArray(new String[0]), ignoreCase));
+          }
+      } catch (IOException e) {
+        throw new RuntimeException(e);
+      }
+    } else {
+      commonWords = (CharArraySet) CommonGramsFilter.makeCommonSet(StopAnalyzer.ENGLISH_STOP_WORDS, ignoreCase);
+    }
+  }
+      
+    //Force the use of a char array set, as it is the most performant, although this may break things if Lucene ever goes away from it.  See SOLR-1095
+    private CharArraySet commonWords;
+    private boolean ignoreCase;
+    private boolean enablePositionIncrements;
+  
+  public boolean isEnablePositionIncrements() {
+    return enablePositionIncrements;
+  }
+
+  public boolean isIgnoreCase() {
+    return ignoreCase;
+  }
+
+  public Set getCommonWords() {
+    return commonWords;
+  }
+
+  public CommonGramsFilter create(TokenStream input) {
+    CommonGramsFilter commonGrams = new CommonGramsFilter(input, commonWords, ignoreCase);
+    return commonGrams;
+  }
+}
+ 
+  
+  
\ No newline at end of file
diff --git a/src/java/org/apache/solr/analysis/CommonGramsQueryFilter.java b/src/java/org/apache/solr/analysis/CommonGramsQueryFilter.java
new file mode 100644
index 00000000000..65c3c9b5238
--- /dev/null
+++ b/src/java/org/apache/solr/analysis/CommonGramsQueryFilter.java
@@ -0,0 +1,138 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.solr.analysis;
+
+import java.io.IOException;
+
+import org.apache.lucene.analysis.CharArraySet;
+import org.apache.lucene.analysis.Token;
+
+/**
+ * Wrap a CommonGramsFilter optimizing phrase queries by only returning single
+ * words when they are not a member of a bigram.
+ * 
+ * Example:
+ * <ul>
+ * <li>query input to CommonGramsFilter: "the rain in spain falls mainly"
+ * <li>output of CommomGramsFilter/input to CommonGramsQueryFilter:
+ * |"the, "the-rain"|"rain" "rain-in"|"in, "in-spain"|"spain"|"falls"|"mainly"
+ * <li>output of CommonGramsQueryFilter:"the-rain", "rain-in" ,"in-spain",
+ * "falls", "mainly"
+ * </ul>
+ */
+
+/*
+ * TODO: When org.apache.solr.analysis.BufferedTokenStream is changed to use the
+ * 2.9 lucene TokenStream api, make necessary changes here.
+ * See:http://hudson.zones
+ * .apache.org/hudson/job/Lucene-trunk/javadoc//all/org/apache
+ * /lucene/analysis/TokenStream.html and
+ * http://svn.apache.org/viewvc/lucene/java
+ * /trunk/src/java/org/apache/lucene/analysis/package.html?revision=718798
+ */
+public class CommonGramsQueryFilter extends BufferedTokenStream {
+  //private CharArraySet commonWords;
+  private Token prev;
+
+  /**
+   * Constructor
+   * 
+   * @param input must be a CommonGramsFilter!
+   * 
+   */
+
+  public CommonGramsQueryFilter(CommonGramsFilter input) {
+    super(input);
+    prev = new Token();
+  }
+  
+  public void reset() throws IOException {
+    super.reset();
+    prev = new Token();
+  }
+  
+  /**
+   * Output bigrams whenever possible to optimize queries. Only output unigrams
+   * when they are not a member of a bigram. Example:
+   * <ul>
+   * <li>input: "the rain in spain falls mainly"
+   * <li>output:"the-rain", "rain-in" ,"in-spain", "falls", "mainly"
+   */
+
+  public Token process(Token token) throws IOException {
+    Token next = peek(1);
+    /*
+     * Deal with last token (next=null when current token is the last word) Last
+     * token will be a unigram. If previous token was a bigram, then we already
+     * output the last token as part of the unigram and should not additionally
+     * output the unigram. <p> Example: If the end of the input to the
+     * CommonGramsFilter is "...the plain" <ul> <li>current token = "plain"</li>
+     * <li>next token = null</li> <li>previous token = "the-plain" (bigram)</li>
+     * <li> Since the word "plain" was already output as part of the bigram we
+     * don't output it.</li> </ul> Example: If the end of the input to the
+     * CommonGramsFilter is "falls mainly" <ul> <li>current token =
+     * "mainly"</li> <li>next token = null</li> <li>previous token = "falls"
+     * (unigram)</li> <li>Since we haven't yet output the current token, we
+     * output it</li> </ul>
+     */
+
+    // Deal with special case of last token
+    if (next == null) {
+      if (prev == null) {
+        // This is the first and only token i.e. one word query
+        return token;
+      }
+      if (prev != null && prev.type() != "gram") {
+        // If previous token was a unigram, output the current token
+        return token;
+      } else {
+        // If previous token was a bigram, we already output it and this token
+        // was output as part of the bigram so we are done.
+        return null;
+      }
+    }
+
+    /*
+     * Possible cases are: |token |next 1|word |gram 2|word |word The
+     * CommonGramsFilter we are wrapping always outputs the unigram word prior
+     * to outputting an optional bigram: "the sound of" gets output as |"the",
+     * "the_sound"|"sound", "sound_of" For case 1 we consume the gram from the
+     * input stream and output it rather than the current token This means that
+     * the call to super.next() which reads a token from input and passes it on
+     * to this process method will always get a token of type word
+     */
+    if (next != null && next.type() == "gram") {
+      // consume "next" token from list and output it
+      token = read();
+      // use this to clone the token because clone requires all these args but
+      // won't take the token.type
+      // see
+      // http://hudson.zones.apache.org/hudson/job/Lucene-trunk/javadoc//all/org/apache/lucene/analysis/Token.html
+      prev.reinit(token.termBuffer(), 0, token.termLength(), token
+          .startOffset(), token.endOffset(), token.type());
+      token.setPositionIncrement(1);
+      return token;
+    }
+
+    // if the next token is not a bigram, then output the token
+    // see note above regarding this method of copying token to prev
+    prev.reinit(token.termBuffer(), 0, token.termLength(), token.startOffset(),
+        token.endOffset(), token.type());
+    assert token.type() == "word";
+    return token;
+  }
+}
diff --git a/src/java/org/apache/solr/analysis/CommonGramsQueryFilterFactory.java b/src/java/org/apache/solr/analysis/CommonGramsQueryFilterFactory.java
new file mode 100644
index 00000000000..fbf053f8279
--- /dev/null
+++ b/src/java/org/apache/solr/analysis/CommonGramsQueryFilterFactory.java
@@ -0,0 +1,98 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.solr.analysis;
+
+import java.io.IOException;
+import java.util.List;
+import java.util.Set;
+
+import org.apache.lucene.analysis.CharArraySet;
+import org.apache.lucene.analysis.StopAnalyzer;
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.solr.common.ResourceLoader;
+import org.apache.solr.common.util.StrUtils;
+import org.apache.solr.util.plugin.ResourceLoaderAware;
+
+/**
+ * Construct CommonGramsQueryFilter
+ * 
+ * This is pretty close to a straight copy from StopFilterFactory
+ * 
+ */
+public class CommonGramsQueryFilterFactory extends BaseTokenFilterFactory
+    implements ResourceLoaderAware {
+
+  public void inform(ResourceLoader loader) {
+    String commonWordFiles = args.get("words");
+    ignoreCase = getBoolean("ignoreCase", false);
+    enablePositionIncrements = getBoolean("enablePositionIncrements", false);
+
+    if (commonWordFiles != null) {
+      try {
+        List<String> files = StrUtils.splitFileNames(commonWordFiles);
+        if (commonWords == null && files.size() > 0) {
+          // default stopwords list has 35 or so words, but maybe don't make it
+          // that big to start
+          commonWords = new CharArraySet(files.size() * 10, ignoreCase);
+        }
+        for (String file : files) {
+          List<String> wlist = loader.getLines(file.trim());
+          // TODO: once StopFilter.makeStopSet(List) method is available, switch
+          // to using that so we can avoid a toArray() call
+          commonWords.addAll(CommonGramsFilter.makeCommonSet((String[]) wlist
+              .toArray(new String[0]), ignoreCase));
+        }
+      } catch (IOException e) {
+        throw new RuntimeException(e);
+      }
+    } else {
+      commonWords = (CharArraySet) CommonGramsFilter.makeCommonSet(
+          StopAnalyzer.ENGLISH_STOP_WORDS, ignoreCase);
+    }
+  }
+
+  // Force the use of a char array set, as it is the most performant, although
+  // this may break things if Lucene ever goes away from it. See SOLR-1095
+  private CharArraySet commonWords;
+
+  private boolean ignoreCase;
+
+  private boolean enablePositionIncrements;
+
+  public boolean isEnablePositionIncrements() {
+    return enablePositionIncrements;
+  }
+
+  public boolean isIgnoreCase() {
+    return ignoreCase;
+  }
+
+  public Set getCommonWords() {
+    return commonWords;
+  }
+
+  /**
+   * Create a CommonGramsFilter and wrap it with a CommonGramsQueryFilter
+   */
+  public CommonGramsQueryFilter create(TokenStream input) {
+    CommonGramsFilter commonGrams = new CommonGramsFilter(input, commonWords,
+        ignoreCase);
+    CommonGramsQueryFilter commonGramsQuery = new CommonGramsQueryFilter(
+        commonGrams);
+    return commonGramsQuery;
+  }
+}
diff --git a/src/test/org/apache/solr/analysis/CommonGramsFilterFactoryTest.java b/src/test/org/apache/solr/analysis/CommonGramsFilterFactoryTest.java
new file mode 100644
index 00000000000..1eb46174d0e
--- /dev/null
+++ b/src/test/org/apache/solr/analysis/CommonGramsFilterFactoryTest.java
@@ -0,0 +1,69 @@
+package org.apache.solr.analysis;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import org.apache.solr.util.AbstractSolrTestCase;
+import org.apache.solr.common.ResourceLoader;
+
+import java.util.Set;
+import java.util.Map;
+import java.util.HashMap;
+
+/**
+ * Tests pretty much copied from StopFilterFactoryTest We use the test files
+ * used by the StopFilterFactoryTest TODO: consider creating separate test files
+ * so this won't break if stop filter test files change
+ **/
+public class CommonGramsFilterFactoryTest extends AbstractSolrTestCase {
+  public String getSchemaFile() {
+    return "schema-stop-keep.xml";
+  }
+
+  public String getSolrConfigFile() {
+    return "solrconfig.xml";
+  }
+
+  public void testInform() throws Exception {
+    ResourceLoader loader = solrConfig.getResourceLoader();
+    assertTrue("loader is null and it shouldn't be", loader != null);
+    CommonGramsFilterFactory factory = new CommonGramsFilterFactory();
+    Map<String, String> args = new HashMap<String, String>();
+    args.put("words", "stop-1.txt");
+    args.put("ignoreCase", "true");
+    factory.init(args);
+    factory.inform(loader);
+    Set words = factory.getCommonWords();
+    assertTrue("words is null and it shouldn't be", words != null);
+    assertTrue("words Size: " + words.size() + " is not: " + 2,
+        words.size() == 2);
+    assertTrue(factory.isIgnoreCase() + " does not equal: " + true, factory
+        .isIgnoreCase() == true);
+
+    factory = new CommonGramsFilterFactory();
+    args.put("words", "stop-1.txt, stop-2.txt");
+    factory.init(args);
+    factory.inform(loader);
+    words = factory.getCommonWords();
+    assertTrue("words is null and it shouldn't be", words != null);
+    assertTrue("words Size: " + words.size() + " is not: " + 4,
+        words.size() == 4);
+    assertTrue(factory.isIgnoreCase() + " does not equal: " + true, factory
+        .isIgnoreCase() == true);
+
+  }
+}
diff --git a/src/test/org/apache/solr/analysis/CommonGramsFilterTest.java b/src/test/org/apache/solr/analysis/CommonGramsFilterTest.java
new file mode 100644
index 00000000000..1ab2ba7b6f1
--- /dev/null
+++ b/src/test/org/apache/solr/analysis/CommonGramsFilterTest.java
@@ -0,0 +1,265 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.solr.analysis;
+
+import java.io.IOException;
+import java.io.StringReader;
+import java.util.HashMap;
+import java.util.Iterator;
+import java.util.LinkedHashMap;
+import java.util.Map;
+import java.util.Set;
+import java.util.StringTokenizer;
+import java.util.Map.Entry;
+
+import junit.framework.TestCase;
+
+import org.apache.lucene.analysis.Token;
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.Tokenizer;
+import org.apache.lucene.analysis.WhitespaceTokenizer;
+import org.apache.lucene.analysis.tokenattributes.TermAttribute;
+import org.apache.solr.analysis.TestBufferedTokenStream.AB_AAB_Stream;
+
+/**
+ * Tests CommonGramsQueryFilter
+ */
+public class CommonGramsFilterTest extends TestCase {
+  private static final String[] commonWords = { "s", "a", "b", "c", "d", "the",
+      "of" };
+  
+  public void testReset() throws Exception {
+    final String input = "How the s a brown s cow d like A B thing?";
+    WhitespaceTokenizer wt = new WhitespaceTokenizer(new StringReader(input));
+    CommonGramsFilter cgf = new CommonGramsFilter(wt, commonWords);
+    
+    TermAttribute term = (TermAttribute) cgf.addAttribute(TermAttribute.class);
+    assertTrue(cgf.incrementToken());
+    assertEquals("How", term.term());
+    assertTrue(cgf.incrementToken());
+    assertEquals("How_the", term.term());
+    assertTrue(cgf.incrementToken());
+    assertEquals("the", term.term());
+    assertTrue(cgf.incrementToken());
+    assertEquals("the_s", term.term());
+    
+    wt.reset(new StringReader(input));
+    cgf.reset();
+    assertTrue(cgf.incrementToken());
+    assertEquals("How", term.term());
+  }
+  
+  public void testCommonGramsQueryFilter() throws Exception {
+    Set<Map.Entry<String, String>> input2expectedSet = initQueryMap().entrySet();
+    for (Iterator<Entry<String, String>> i = input2expectedSet.iterator(); i
+        .hasNext();) {
+      Map.Entry<String, String> me = i.next();
+      String input = me.getKey();
+      String expected = me.getValue();
+      String message = "message: input value is: " + input;
+      assertEquals(message, expected, testFilter(input, "query"));
+    }
+  }
+  
+  public void testQueryReset() throws Exception {
+    final String input = "How the s a brown s cow d like A B thing?";
+    WhitespaceTokenizer wt = new WhitespaceTokenizer(new StringReader(input));
+    CommonGramsFilter cgf = new CommonGramsFilter(wt, commonWords);
+    CommonGramsQueryFilter nsf = new CommonGramsQueryFilter(cgf);
+    
+    TermAttribute term = (TermAttribute) wt.addAttribute(TermAttribute.class);
+    assertTrue(nsf.incrementToken());
+    assertEquals("How_the", term.term());
+    assertTrue(nsf.incrementToken());
+    assertEquals("the_s", term.term());
+    
+    wt.reset(new StringReader(input));
+    nsf.reset();
+    assertTrue(nsf.incrementToken());
+    assertEquals("How_the", term.term());
+  }
+  
+  public void testCommonGramsFilter() throws Exception {
+    Set<Map.Entry<String, String>> input2expectedSet = initMap().entrySet();
+    for (Iterator<Entry<String, String>> i = input2expectedSet.iterator(); i
+        .hasNext();) {
+      Map.Entry<String, String> me = i.next();
+      String input = me.getKey();
+      String expected = me.getValue();
+      String message = "message: input value is: " + input;
+      assertEquals(message, expected, testFilter(input, "common"));
+    }
+  }
+  
+  /**
+   * This is for testing CommonGramsQueryFilter which outputs a set of tokens
+   * optimized for querying with only one token at each position, either a
+   * unigram or a bigram It also will not return a token for the final position
+   * if the final word is already in the preceding bigram Example:(three
+   * tokens/positions in)
+   * "foo bar the"=>"foo:1|bar:2,bar-the:2|the:3=> "foo" "bar-the" (2 tokens
+   * out)
+   * 
+   * @return Map<String,String>
+   */
+  private static Map<String, String> initQueryMap() {
+    Map<String, String> input2expected = new LinkedHashMap<String, String>();
+
+    // Stop words used below are "of" "the" and "s"
+    
+    // two word queries
+    input2expected.put("brown fox", "/brown/fox");
+    input2expected.put("the fox", "/the_fox");
+    input2expected.put("fox of", "/fox_of");
+    input2expected.put("of the", "/of_the");
+    
+    // one word queries
+    input2expected.put("the", "/the");
+    input2expected.put("foo", "/foo");
+
+    // 3 word combinations s=stopword/common word n=not a stop word
+    input2expected.put("n n n", "/n/n/n");
+    input2expected.put("quick brown fox", "/quick/brown/fox");
+
+    input2expected.put("n n s", "/n/n_s");
+    input2expected.put("quick brown the", "/quick/brown_the");
+
+    input2expected.put("n s n", "/n_s/s_n");
+    input2expected.put("quick the brown", "/quick_the/the_brown");
+
+    input2expected.put("n s s", "/n_s/s_s");
+    input2expected.put("fox of the", "/fox_of/of_the");
+
+    input2expected.put("s n n", "/s_n/n/n");
+    input2expected.put("the quick brown", "/the_quick/quick/brown");
+
+    input2expected.put("s n s", "/s_n/n_s");
+    input2expected.put("the fox of", "/the_fox/fox_of");
+
+    input2expected.put("s s n", "/s_s/s_n");
+    input2expected.put("of the fox", "/of_the/the_fox");
+
+    input2expected.put("s s s", "/s_s/s_s");
+    input2expected.put("of the of", "/of_the/the_of");
+
+    return input2expected;
+  }
+  
+  private static Map<String, String> initMap() {
+    Map<String, String> input2expected = new HashMap<String, String>();
+
+    // Stop words used below are "of" "the" and "s"
+    // one word queries
+    input2expected.put("the", "/the");
+    input2expected.put("foo", "/foo");
+
+    // two word queries
+    input2expected.put("brown fox", "/brown/fox");
+    input2expected.put("the fox", "/the,the_fox/fox");
+    input2expected.put("fox of", "/fox,fox_of/of");
+    input2expected.put("of the", "/of,of_the/the");
+
+    // 3 word combinations s=stopword/common word n=not a stop word
+    input2expected.put("n n n", "/n/n/n");
+    input2expected.put("quick brown fox", "/quick/brown/fox");
+
+    input2expected.put("n n s", "/n/n,n_s/s");
+    input2expected.put("quick brown the", "/quick/brown,brown_the/the");
+
+    input2expected.put("n s n", "/n,n_s/s,s_n/n");
+    input2expected.put("quick the fox", "/quick,quick_the/the,the_fox/fox");
+
+    input2expected.put("n s s", "/n,n_s/s,s_s/s");
+    input2expected.put("fox of the", "/fox,fox_of/of,of_the/the");
+
+    input2expected.put("s n n", "/s,s_n/n/n");
+    input2expected.put("the quick brown", "/the,the_quick/quick/brown");
+
+    input2expected.put("s n s", "/s,s_n/n,n_s/s");
+    input2expected.put("the fox of", "/the,the_fox/fox,fox_of/of");
+
+    input2expected.put("s s n", "/s,s_s/s,s_n/n");
+    input2expected.put("of the fox", "/of,of_the/the,the_fox/fox");
+
+    input2expected.put("s s s", "/s,s_s/s,s_s/s");
+    input2expected.put("of the of", "/of,of_the/the,the_of/of");
+
+    return input2expected;
+  }
+  
+  /*
+   * Helper methodsCopied and from CDL XTF BigramsStopFilter.java and slightly
+   * modified to use with CommonGrams http://xtf.wiki.sourceforge.net/
+   */
+  /**
+   * Very simple tokenizer that breaks up a string into a series of Lucene
+   * {@link Token Token}s.
+   */
+  static class StringTokenStream extends TokenStream {
+    private String str;
+
+    private int prevEnd = 0;
+
+    private StringTokenizer tok;
+
+    private int count = 0;
+
+    public StringTokenStream(String str, String delim) {
+      this.str = str;
+      tok = new StringTokenizer(str, delim);
+    }
+
+    public Token next() {
+      if (!tok.hasMoreTokens())
+        return null;
+      count++;
+      String term = tok.nextToken();
+      Token t = new Token(term, str.indexOf(term, prevEnd), str.indexOf(term,
+          prevEnd)
+          + term.length(), "word");
+      prevEnd = t.endOffset();
+      return t;
+    }
+  }
+  
+  public static String testFilter(String in, String type) throws IOException {
+    TokenStream nsf;
+    StringTokenStream ts = new StringTokenStream(in, " .");
+    if (type.equals("query")) {
+      CommonGramsFilter cgf = new CommonGramsFilter(ts, commonWords);
+      nsf = new CommonGramsQueryFilter(cgf);
+    } else {
+      nsf = new CommonGramsFilter(ts, commonWords);
+    }
+
+    StringBuffer outBuf = new StringBuffer();
+    while (true) {
+      Token t = nsf.next();
+      if (t == null)
+        break;
+      for (int i = 0; i < t.getPositionIncrement(); i++)
+        outBuf.append('/');
+      if (t.getPositionIncrement() == 0)
+        outBuf.append(',');
+      outBuf.append(t.term());
+    }
+
+    String out = outBuf.toString();
+    out = out.replaceAll(" ", "");
+    return out;
+  }
+}
diff --git a/src/test/org/apache/solr/analysis/CommonGramsQueryFilterFactoryTest.java b/src/test/org/apache/solr/analysis/CommonGramsQueryFilterFactoryTest.java
new file mode 100644
index 00000000000..b6eca9d6b23
--- /dev/null
+++ b/src/test/org/apache/solr/analysis/CommonGramsQueryFilterFactoryTest.java
@@ -0,0 +1,68 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.solr.analysis;
+
+import org.apache.solr.util.AbstractSolrTestCase;
+import org.apache.solr.common.ResourceLoader;
+
+import java.util.Set;
+import java.util.Map;
+import java.util.HashMap;
+
+/**
+ * Tests pretty much copied from StopFilterFactoryTest We use the test files
+ * used by the StopFilterFactoryTest TODO: consider creating separate test files
+ * so this won't break if stop filter test files change
+ **/
+public class CommonGramsQueryFilterFactoryTest extends AbstractSolrTestCase {
+  public String getSchemaFile() {
+    return "schema-stop-keep.xml";
+  }
+
+  public String getSolrConfigFile() {
+    return "solrconfig.xml";
+  }
+
+  public void testInform() throws Exception {
+    ResourceLoader loader = solrConfig.getResourceLoader();
+    assertTrue("loader is null and it shouldn't be", loader != null);
+    CommonGramsQueryFilterFactory factory = new CommonGramsQueryFilterFactory();
+    Map<String, String> args = new HashMap<String, String>();
+    args.put("words", "stop-1.txt");
+    args.put("ignoreCase", "true");
+    factory.init(args);
+    factory.inform(loader);
+    Set words = factory.getCommonWords();
+    assertTrue("words is null and it shouldn't be", words != null);
+    assertTrue("words Size: " + words.size() + " is not: " + 2,
+        words.size() == 2);
+    assertTrue(factory.isIgnoreCase() + " does not equal: " + true, factory
+        .isIgnoreCase() == true);
+
+    factory = new CommonGramsQueryFilterFactory();
+    args.put("words", "stop-1.txt, stop-2.txt");
+    factory.init(args);
+    factory.inform(loader);
+    words = factory.getCommonWords();
+    assertTrue("words is null and it shouldn't be", words != null);
+    assertTrue("words Size: " + words.size() + " is not: " + 4,
+        words.size() == 4);
+    assertTrue(factory.isIgnoreCase() + " does not equal: " + true, factory
+        .isIgnoreCase() == true);
+
+  }
+}