From f9575d47405ec1699e6c76131d14752f85dcf99a Mon Sep 17 00:00:00 2001
From: Shalin Shekhar Mangar <shalin@apache.org>
Date: Wed, 16 Sep 2009 14:28:40 +0000
Subject: [PATCH] SOLR-1407 -- SpellingQueryConverter disallows underscores and
 digits in field names

git-svn-id: https://svn.apache.org/repos/asf/lucene/solr/trunk@815801 13f79535-47bb-0310-9956-ffa450edef68
---
 CHANGES.txt                                   |  4 +-
 .../solr/spelling/SpellingQueryConverter.java | 44 ++++++++++++-
 .../spelling/SpellingQueryConverterTest.java  | 61 +++++++++++++++++++
 3 files changed, 107 insertions(+), 2 deletions(-)

diff --git a/CHANGES.txt b/CHANGES.txt
index 4352a9e887d..be2e9ad78a7 100644
--- a/CHANGES.txt
+++ b/CHANGES.txt
@@ -254,7 +254,9 @@ New Features
 
 57. SOLR-1152: Snapshoot on ReplicationHandler should accept location as a request parameter (shalin)
 
-58. SOLR-1204: Enhance SpellingQueryConverter to handle UTF-8 instead of ASCII only (Michael Ludwig via shalin)
+58. SOLR-1204: Enhance SpellingQueryConverter to handle UTF-8 instead of ASCII only.
+    Use the NMTOKEN syntax for matching field names.
+    (Michael Ludwig, shalin)
 
 59. SOLR-1189: Support providing username and password for basic HTTP authentication in Java replication
     (Matthew Gregg, shalin)
diff --git a/src/java/org/apache/solr/spelling/SpellingQueryConverter.java b/src/java/org/apache/solr/spelling/SpellingQueryConverter.java
index 6723c721323..c42cc9e2af2 100644
--- a/src/java/org/apache/solr/spelling/SpellingQueryConverter.java
+++ b/src/java/org/apache/solr/spelling/SpellingQueryConverter.java
@@ -37,7 +37,49 @@ import org.apache.lucene.analysis.TokenStream;
  **/
 public class SpellingQueryConverter extends QueryConverter  {
 
-  protected Pattern QUERY_REGEX = Pattern.compile("(?:(?!(\\p{L}+:|\\d+)))\\p{L}+");
+  /*
+  * The following builds up a regular expression that matches productions
+  * of the syntax for NMTOKEN as per the W3C XML Recommendation - with one
+  * important exception (see below).
+  *
+  * http://www.w3.org/TR/2008/REC-xml-20081126/ - version used as reference
+  *
+  * http://www.w3.org/TR/REC-xml/#NT-Nmtoken
+  *
+  * An NMTOKEN is a series of one or more NAMECHAR characters, which is an
+  * extension of the NAMESTARTCHAR character class.
+  *
+  * The EXCEPTION referred to above concerns the colon, which is legal in an
+  * NMTOKEN, but cannot currently be used as a valid field name within Solr,
+  * as it is used to delimit the field name from the query string.
+  */
+
+  final static String[] NAMESTARTCHAR_PARTS = {
+          "A-Z_a-z", "\\xc0-\\xd6", "\\xd8-\\xf6", "\\xf8-\\u02ff",
+          "\\u0370-\\u037d", "\\u037f-\\u1fff",
+          "\\u200c-\\u200d", "\\u2070-\\u218f",
+          "\\u2c00-\\u2fef", "\\u2001-\\ud7ff",
+          "\\uf900-\\ufdcf", "\\ufdf0-\\ufffd"
+  };
+  final static String[] ADDITIONAL_NAMECHAR_PARTS = {
+          "\\-.0-9\\xb7", "\\u0300-\\u036f", "\\u203f-\\u2040"
+  };
+  final static String SURROGATE_PAIR = "\\p{Cs}{2}";
+  final static String NMTOKEN;
+
+  static {
+    StringBuilder sb = new StringBuilder();
+    for (String part : NAMESTARTCHAR_PARTS)
+      sb.append(part);
+    for (String part : ADDITIONAL_NAMECHAR_PARTS)
+      sb.append(part);
+    NMTOKEN = "([" + sb.toString() + "]|" + SURROGATE_PAIR + ")+";
+  }
+
+  final static String PATTERN = "(?:(?!(" + NMTOKEN + ":|\\d+)))[^\\s]+";
+  // previous version: Pattern.compile("(?:(?!(\\w+:|\\d+)))\\w+");
+  protected Pattern QUERY_REGEX = Pattern.compile(PATTERN);
+
 
   /**
    * Converts the original query string to a collection of Lucene Tokens.
diff --git a/src/test/org/apache/solr/spelling/SpellingQueryConverterTest.java b/src/test/org/apache/solr/spelling/SpellingQueryConverterTest.java
index 5d0aa4dd226..b7e7514a432 100644
--- a/src/test/org/apache/solr/spelling/SpellingQueryConverterTest.java
+++ b/src/test/org/apache/solr/spelling/SpellingQueryConverterTest.java
@@ -22,6 +22,7 @@ import org.apache.lucene.analysis.WhitespaceAnalyzer;
 import org.apache.solr.common.util.NamedList;
 import static org.junit.Assert.assertTrue;
 import org.junit.Test;
+import org.junit.Assert;
 
 import java.util.Collection;
 
@@ -43,4 +44,64 @@ public class SpellingQueryConverterTest {
     assertTrue("tokens is null and it shouldn't be", tokens != null);
     assertTrue("tokens Size: " + tokens.size() + " is not: " + 1, tokens.size() == 1);
   }
+
+  @Test
+  public void testSpecialChars()  {
+    SpellingQueryConverter converter = new SpellingQueryConverter();
+    converter.init(new NamedList());
+    converter.setAnalyzer(new WhitespaceAnalyzer());
+    Collection<Token> tokens = converter.convert("field_with_underscore:value_with_underscore");
+    assertTrue("tokens is null and it shouldn't be", tokens != null);
+    Assert.assertEquals("tokens Size: " + tokens.size() + " is not 1", 1, tokens.size());
+
+    tokens = converter.convert("field_with_digits123:value_with_digits123");
+    assertTrue("tokens is null and it shouldn't be", tokens != null);
+    Assert.assertEquals("tokens Size: " + tokens.size() + " is not 1", 1, tokens.size());
+
+    tokens = converter.convert("field-with-hyphens:value-with-hyphens");
+    assertTrue("tokens is null and it shouldn't be", tokens != null);
+    Assert.assertEquals("tokens Size: " + tokens.size() + " is not 1", 1, tokens.size());
+
+    // mix 'em up and add some to the value
+    tokens = converter.convert("field_with-123s:value_,.|with-hyphens");
+    assertTrue("tokens is null and it shouldn't be", tokens != null);
+    Assert.assertEquals("tokens Size: " + tokens.size() + " is not 1", 1, tokens.size());
+  }
+
+  @Test
+  public void testUnicode() {
+    SpellingQueryConverter converter = new SpellingQueryConverter();
+    converter.init(new NamedList());
+    converter.setAnalyzer(new WhitespaceAnalyzer());
+    
+    // chinese text value
+    Collection<Token> tokens = converter.convert("text_field:我购买了道具和服装。");
+    assertTrue("tokens is null and it shouldn't be", tokens != null);
+    Assert.assertEquals("tokens Size: " + tokens.size() + " is not 1", 1, tokens.size());
+
+    tokens = converter.convert("text_购field:我购买了道具和服装。");
+    assertTrue("tokens is null and it shouldn't be", tokens != null);
+    Assert.assertEquals("tokens Size: " + tokens.size() + " is not 1", 1, tokens.size());
+
+    tokens = converter.convert("text_field:我购xyz买了道具和服装。");
+    assertTrue("tokens is null and it shouldn't be", tokens != null);
+    Assert.assertEquals("tokens Size: " + tokens.size() + " is not 1", 1, tokens.size());
+  }
+
+  @Test
+  public void testMultipleClauses() {
+    SpellingQueryConverter converter = new SpellingQueryConverter();
+    converter.init(new NamedList());
+    converter.setAnalyzer(new WhitespaceAnalyzer());
+
+    // two field:value pairs should give two tokens
+    Collection<Token> tokens = converter.convert("买text_field:我购买了道具和服装。 field2:bar");
+    assertTrue("tokens is null and it shouldn't be", tokens != null);
+    Assert.assertEquals("tokens Size: " + tokens.size() + " is not 2", 2, tokens.size());
+
+    // a field:value pair and a search term should give two tokens
+    tokens = converter.convert("text_field:我购买了道具和服装。 bar");
+    assertTrue("tokens is null and it shouldn't be", tokens != null);
+    Assert.assertEquals("tokens Size: " + tokens.size() + " is not 2", 2, tokens.size());
+  }
 }