diff --git a/CHANGES.txt b/CHANGES.txt index 4352a9e887d..be2e9ad78a7 100644 --- a/CHANGES.txt +++ b/CHANGES.txt @@ -254,7 +254,9 @@ New Features 57. SOLR-1152: Snapshoot on ReplicationHandler should accept location as a request parameter (shalin) -58. SOLR-1204: Enhance SpellingQueryConverter to handle UTF-8 instead of ASCII only (Michael Ludwig via shalin) +58. SOLR-1204: Enhance SpellingQueryConverter to handle UTF-8 instead of ASCII only. + Use the NMTOKEN syntax for matching field names. + (Michael Ludwig, shalin) 59. SOLR-1189: Support providing username and password for basic HTTP authentication in Java replication (Matthew Gregg, shalin) diff --git a/src/java/org/apache/solr/spelling/SpellingQueryConverter.java b/src/java/org/apache/solr/spelling/SpellingQueryConverter.java index 6723c721323..c42cc9e2af2 100644 --- a/src/java/org/apache/solr/spelling/SpellingQueryConverter.java +++ b/src/java/org/apache/solr/spelling/SpellingQueryConverter.java @@ -37,7 +37,49 @@ import org.apache.lucene.analysis.TokenStream; **/ public class SpellingQueryConverter extends QueryConverter { - protected Pattern QUERY_REGEX = Pattern.compile("(?:(?!(\\p{L}+:|\\d+)))\\p{L}+"); + /* + * The following builds up a regular expression that matches productions + * of the syntax for NMTOKEN as per the W3C XML Recommendation - with one + * important exception (see below). + * + * http://www.w3.org/TR/2008/REC-xml-20081126/ - version used as reference + * + * http://www.w3.org/TR/REC-xml/#NT-Nmtoken + * + * An NMTOKEN is a series of one or more NAMECHAR characters, which is an + * extension of the NAMESTARTCHAR character class. + * + * The EXCEPTION referred to above concerns the colon, which is legal in an + * NMTOKEN, but cannot currently be used as a valid field name within Solr, + * as it is used to delimit the field name from the query string. + */ + + final static String[] NAMESTARTCHAR_PARTS = { + "A-Z_a-z", "\\xc0-\\xd6", "\\xd8-\\xf6", "\\xf8-\\u02ff", + "\\u0370-\\u037d", "\\u037f-\\u1fff", + "\\u200c-\\u200d", "\\u2070-\\u218f", + "\\u2c00-\\u2fef", "\\u2001-\\ud7ff", + "\\uf900-\\ufdcf", "\\ufdf0-\\ufffd" + }; + final static String[] ADDITIONAL_NAMECHAR_PARTS = { + "\\-.0-9\\xb7", "\\u0300-\\u036f", "\\u203f-\\u2040" + }; + final static String SURROGATE_PAIR = "\\p{Cs}{2}"; + final static String NMTOKEN; + + static { + StringBuilder sb = new StringBuilder(); + for (String part : NAMESTARTCHAR_PARTS) + sb.append(part); + for (String part : ADDITIONAL_NAMECHAR_PARTS) + sb.append(part); + NMTOKEN = "([" + sb.toString() + "]|" + SURROGATE_PAIR + ")+"; + } + + final static String PATTERN = "(?:(?!(" + NMTOKEN + ":|\\d+)))[^\\s]+"; + // previous version: Pattern.compile("(?:(?!(\\w+:|\\d+)))\\w+"); + protected Pattern QUERY_REGEX = Pattern.compile(PATTERN); + /** * Converts the original query string to a collection of Lucene Tokens. diff --git a/src/test/org/apache/solr/spelling/SpellingQueryConverterTest.java b/src/test/org/apache/solr/spelling/SpellingQueryConverterTest.java index 5d0aa4dd226..b7e7514a432 100644 --- a/src/test/org/apache/solr/spelling/SpellingQueryConverterTest.java +++ b/src/test/org/apache/solr/spelling/SpellingQueryConverterTest.java @@ -22,6 +22,7 @@ import org.apache.lucene.analysis.WhitespaceAnalyzer; import org.apache.solr.common.util.NamedList; import static org.junit.Assert.assertTrue; import org.junit.Test; +import org.junit.Assert; import java.util.Collection; @@ -43,4 +44,64 @@ public class SpellingQueryConverterTest { assertTrue("tokens is null and it shouldn't be", tokens != null); assertTrue("tokens Size: " + tokens.size() + " is not: " + 1, tokens.size() == 1); } + + @Test + public void testSpecialChars() { + SpellingQueryConverter converter = new SpellingQueryConverter(); + converter.init(new NamedList()); + converter.setAnalyzer(new WhitespaceAnalyzer()); + Collection tokens = converter.convert("field_with_underscore:value_with_underscore"); + assertTrue("tokens is null and it shouldn't be", tokens != null); + Assert.assertEquals("tokens Size: " + tokens.size() + " is not 1", 1, tokens.size()); + + tokens = converter.convert("field_with_digits123:value_with_digits123"); + assertTrue("tokens is null and it shouldn't be", tokens != null); + Assert.assertEquals("tokens Size: " + tokens.size() + " is not 1", 1, tokens.size()); + + tokens = converter.convert("field-with-hyphens:value-with-hyphens"); + assertTrue("tokens is null and it shouldn't be", tokens != null); + Assert.assertEquals("tokens Size: " + tokens.size() + " is not 1", 1, tokens.size()); + + // mix 'em up and add some to the value + tokens = converter.convert("field_with-123s:value_,.|with-hyphens"); + assertTrue("tokens is null and it shouldn't be", tokens != null); + Assert.assertEquals("tokens Size: " + tokens.size() + " is not 1", 1, tokens.size()); + } + + @Test + public void testUnicode() { + SpellingQueryConverter converter = new SpellingQueryConverter(); + converter.init(new NamedList()); + converter.setAnalyzer(new WhitespaceAnalyzer()); + + // chinese text value + Collection tokens = converter.convert("text_field:我购买了道具和服装。"); + assertTrue("tokens is null and it shouldn't be", tokens != null); + Assert.assertEquals("tokens Size: " + tokens.size() + " is not 1", 1, tokens.size()); + + tokens = converter.convert("text_购field:我购买了道具和服装。"); + assertTrue("tokens is null and it shouldn't be", tokens != null); + Assert.assertEquals("tokens Size: " + tokens.size() + " is not 1", 1, tokens.size()); + + tokens = converter.convert("text_field:我购xyz买了道具和服装。"); + assertTrue("tokens is null and it shouldn't be", tokens != null); + Assert.assertEquals("tokens Size: " + tokens.size() + " is not 1", 1, tokens.size()); + } + + @Test + public void testMultipleClauses() { + SpellingQueryConverter converter = new SpellingQueryConverter(); + converter.init(new NamedList()); + converter.setAnalyzer(new WhitespaceAnalyzer()); + + // two field:value pairs should give two tokens + Collection tokens = converter.convert("买text_field:我购买了道具和服装。 field2:bar"); + assertTrue("tokens is null and it shouldn't be", tokens != null); + Assert.assertEquals("tokens Size: " + tokens.size() + " is not 2", 2, tokens.size()); + + // a field:value pair and a search term should give two tokens + tokens = converter.convert("text_field:我购买了道具和服装。 bar"); + assertTrue("tokens is null and it shouldn't be", tokens != null); + Assert.assertEquals("tokens Size: " + tokens.size() + " is not 2", 2, tokens.size()); + } }