LUCENE-4176: fix AnalyzingQueryParser to analyze range endpoints as bytes

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1355001 13f79535-47bb-0310-9956-ffa450edef68
Author: Robert Muir
Date:   2012-06-28 13:20:15 +00:00
Parent: 032cad944a
Commit: 93462313c3

6 changed files with 153 additions and 69 deletions

File: lucene/CHANGES.txt

@@ -16,6 +16,11 @@ API Changes
has a different API (carries a list of tags instead of a compound tag). Upgrade
of embedded morfologik dictionaries to version 1.9. (Dawid Weiss)
Bug Fixes
* LUCENE-4176: Fix AnalyzingQueryParser to analyze range endpoints as bytes,
so that it works correctly with Analyzers that produce binary non-UTF-8 terms
such as CollationAnalyzer. (Nattapong Sirilappanich via Robert Muir)
======================= Lucene 4.0.0-ALPHA =======================
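For context, a hedged sketch of the scenario this entry fixes. The entry's "CollationAnalyzer" refers to Lucene's collation analyzers; CollationKeyAnalyzer and its (Version, Collator) constructor shape are assumptions here and may differ by version. Collation terms are raw sort-key bytes rather than UTF-8 text, so round-tripping them through String corrupts them:

import java.text.Collator;
import java.util.Locale;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.collation.CollationKeyAnalyzer;
import org.apache.lucene.queryparser.analyzing.AnalyzingQueryParser;
import org.apache.lucene.search.Query;
import org.apache.lucene.util.Version;

// Range endpoints are analyzed into binary collation keys and handed to
// the range query as bytes, not re-encoded through String.
Analyzer analyzer = new CollationKeyAnalyzer(Version.LUCENE_40, Collator.getInstance(Locale.FRENCH));
AnalyzingQueryParser qp = new AnalyzingQueryParser(Version.LUCENE_40, "content", analyzer);
Query q = qp.parse("[abac TO abad]");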

File: org/apache/lucene/queryparser/analyzing/AnalyzingQueryParser.java

@@ -50,6 +50,7 @@ public class AnalyzingQueryParser extends org.apache.lucene.queryparser.classic.
   */
  public AnalyzingQueryParser(Version matchVersion, String field, Analyzer analyzer) {
    super(matchVersion, field, analyzer);
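    // Let the superclass handle range endpoints: with analyzeRangeTerms
    // enabled it analyzes each endpoint down to its binary term bytes
    // (a BytesRef), so non-UTF-8 terms survive intact.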
    setAnalyzeRangeTerms(true);
  }

  /**
@@ -278,72 +279,4 @@ public class AnalyzingQueryParser extends org.apache.lucene.queryparser.classic.
    return (nextToken == null) ? null : super.getFuzzyQuery(field, nextToken, minSimilarity);
  }

  /**
   * Overrides super class, by passing terms through analyzer.
   * @exception ParseException
   */
  @Override
  protected Query getRangeQuery(String field, String part1, String part2, boolean startInclusive, boolean endInclusive)
      throws ParseException {
    // get Analyzer from superclass and tokenize the terms
    TokenStream source = null;
    CharTermAttribute termAtt = null;
    boolean multipleTokens = false;

    if (part1 != null) {
      // part1
      try {
        source = getAnalyzer().tokenStream(field, new StringReader(part1));
        termAtt = source.addAttribute(CharTermAttribute.class);
        source.reset();
        multipleTokens = false;

        if (source.incrementToken()) {
          part1 = termAtt.toString();
        }
        multipleTokens = source.incrementToken();
      } catch (IOException e) {
        // ignore
      }
      try {
        source.end();
        source.close();
      } catch (IOException e) {
        // ignore
      }

      if (multipleTokens) {
        throw new ParseException("Cannot build RangeQuery with analyzer " + getAnalyzer().getClass()
            + " - tokens were added to part1");
      }
    }

    if (part2 != null) {
      try {
        // part2
        source = getAnalyzer().tokenStream(field, new StringReader(part2));
        termAtt = source.addAttribute(CharTermAttribute.class);
        source.reset();

        if (source.incrementToken()) {
          part2 = termAtt.toString();
        }
        multipleTokens = source.incrementToken();
      } catch (IOException e) {
        // ignore
      }
      try {
        source.end();
        source.close();
      } catch (IOException e) {
        // ignore
      }

      if (multipleTokens) {
        throw new ParseException("Cannot build RangeQuery with analyzer " + getAnalyzer().getClass()
            + " - tokens were added to part2");
      }
    }
    return super.getRangeQuery(field, part1, part2, startInclusive, endInclusive);
  }
}
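Note the hunk header above: 72 lines collapse to 4, i.e. this entire String-based getRangeQuery override is deleted. It reduced each endpoint to termAtt.toString(), which mangles analyzers whose terms are binary rather than UTF-8 text. Below is a minimal sketch of the byte-based alternative the superclass path now provides; analyzeEndpoint is a hypothetical helper name, not the actual QueryParserBase code, and it assumes the stream's term attribute implements TermToBytesRefAttribute (true for CharTermAttributeImpl):

import java.io.IOException;
import java.io.StringReader;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.TermToBytesRefAttribute;
import org.apache.lucene.util.BytesRef;

// Analyze one range endpoint into its single binary term.
static BytesRef analyzeEndpoint(Analyzer analyzer, String field, String part) throws IOException {
  TokenStream source = analyzer.tokenStream(field, new StringReader(part));
  try {
    TermToBytesRefAttribute bytesAtt = source.getAttribute(TermToBytesRefAttribute.class);
    source.reset();
    if (!source.incrementToken()) {
      throw new IllegalArgumentException("analyzer produced no term for: " + part);
    }
    bytesAtt.fillBytesRef();                                     // encode the term into the attribute's BytesRef
    BytesRef term = BytesRef.deepCopyOf(bytesAtt.getBytesRef()); // copy: the stream reuses its buffer
    if (source.incrementToken()) {
      throw new IllegalArgumentException("analyzer produced multiple terms for: " + part);
    }
    source.end();
    return term;
  } finally {
    source.close();
  }
}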

File: org/apache/lucene/queryparser/analyzing/TestAnalyzingQueryParser.java

@@ -22,7 +22,16 @@ import java.io.Reader;
import org.apache.lucene.analysis.*;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.FieldType;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.RandomIndexWriter;
import org.apache.lucene.queryparser.classic.ParseException;
import org.apache.lucene.queryparser.classic.QueryParser;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.store.Directory;
import org.apache.lucene.util.LuceneTestCase;
/**
@@ -138,5 +147,28 @@ public class TestAnalyzingQueryParser extends LuceneTestCase {
      Tokenizer result = new MockTokenizer(reader, MockTokenizer.SIMPLE, true);
      return new TokenStreamComponents(result, new FoldingFilter(result));
    }
  }

  // LUCENE-4176
  public void testByteTerms() throws Exception {
    Directory ramDir = newDirectory();
    Analyzer analyzer = new MockBytesAnalyzer();
    RandomIndexWriter writer = new RandomIndexWriter(random(), ramDir, analyzer);
    Document doc = new Document();
    FieldType fieldType = new FieldType();
    fieldType.setIndexed(true);
    fieldType.setTokenized(true);
    fieldType.setStored(true);
    Field field = new Field("content", "เข", fieldType);
    doc.add(field);
    writer.addDocument(doc);
    writer.close();
    DirectoryReader ir = DirectoryReader.open(ramDir);
    IndexSearcher is = new IndexSearcher(ir);
    QueryParser qp = new AnalyzingQueryParser(TEST_VERSION_CURRENT, "content", analyzer);
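    // Both endpoints of the range below are analyzed by MockBytesAnalyzer
    // into the same UTF-16 term bytes that were indexed above, so the
    // single document falls inside the range.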
    Query q = qp.parse("[เข TO เข]");
    assertEquals(1, is.search(q, 10).totalHits);
    ir.close();
    ramDir.close();
  }
}

File: org/apache/lucene/analysis/MockBytesAnalyzer.java (new)

@@ -0,0 +1,33 @@
package org.apache.lucene.analysis;
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.Reader;
/**
 * Analyzer for testing that encodes terms as UTF-16 bytes.
 */
public class MockBytesAnalyzer extends Analyzer {
  private final MockBytesAttributeFactory factory = new MockBytesAttributeFactory();

  @Override
  protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
    Tokenizer t = new MockTokenizer(factory, reader, MockTokenizer.KEYWORD, false,
        MockTokenizer.DEFAULT_MAX_TOKEN_LENGTH);
    return new TokenStreamComponents(t);
  }
}

File: org/apache/lucene/analysis/MockBytesAttributeFactory.java (new)

@@ -0,0 +1,40 @@
package org.apache.lucene.analysis;
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import org.apache.lucene.util.Attribute;
import org.apache.lucene.util.AttributeImpl;
import org.apache.lucene.util.AttributeSource;
/**
 * Attribute factory that implements CharTermAttribute with
 * {@link MockUTF16TermAttributeImpl}.
 */
public class MockBytesAttributeFactory extends AttributeSource.AttributeFactory {
  private final AttributeSource.AttributeFactory delegate =
      AttributeSource.AttributeFactory.DEFAULT_ATTRIBUTE_FACTORY;

  @Override
  public AttributeImpl createAttributeInstance(Class<? extends Attribute> attClass) {
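    // Any attribute interface that MockUTF16TermAttributeImpl implements
    // (CharTermAttribute, TermToBytesRefAttribute, ...) is served by the
    // UTF-16 impl; all other attributes fall back to the default factory.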
    return attClass.isAssignableFrom(MockUTF16TermAttributeImpl.class)
        ? new MockUTF16TermAttributeImpl()
        : delegate.createAttributeInstance(attClass);
  }
}

File: org/apache/lucene/analysis/MockUTF16TermAttributeImpl.java (new)

@@ -0,0 +1,41 @@
package org.apache.lucene.analysis;
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.nio.charset.Charset;
import org.apache.lucene.analysis.tokenattributes.CharTermAttributeImpl;
import org.apache.lucene.util.BytesRef;
/**
 * Extension of {@link CharTermAttributeImpl} that encodes the term
 * text as UTF-16 bytes instead of UTF-8 bytes.
 */
public class MockUTF16TermAttributeImpl extends CharTermAttributeImpl {
  static final Charset charset = Charset.forName("UTF-16LE");

  @Override
  public int fillBytesRef() {
    BytesRef bytes = getBytesRef();
    byte[] utf16 = toString().getBytes(charset);
    bytes.bytes = utf16;
    bytes.offset = 0;
    bytes.length = utf16.length;
    return bytes.hashCode();
  }
}
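For illustration (not part of the commit): the Thai term "เข" from testByteTerms is two UTF-16 code units, so this attribute indexes it as four bytes, whereas UTF-8 would produce six. The two byte sequences do not match, which is why the parser must analyze range endpoints as bytes rather than re-encode them through String:

import java.nio.charset.Charset;

byte[] utf16 = "\u0E40\u0E02".getBytes(Charset.forName("UTF-16LE")); // {0x40, 0x0E, 0x02, 0x0E} - 4 bytes
byte[] utf8  = "\u0E40\u0E02".getBytes(Charset.forName("UTF-8"));    // {0xE0, 0xB9, 0x80, 0xE0, 0xB8, 0x82} - 6 bytes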