mirror of https://github.com/apache/lucene.git
SOLR-1407 -- SpellingQueryConverter disallows underscores and digits in field names
git-svn-id: https://svn.apache.org/repos/asf/lucene/solr/trunk@815801 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent
804160cd13
commit
f9575d4740
|
@ -254,7 +254,9 @@ New Features
|
||||||
|
|
||||||
57. SOLR-1152: Snapshoot on ReplicationHandler should accept location as a request parameter (shalin)
|
57. SOLR-1152: Snapshoot on ReplicationHandler should accept location as a request parameter (shalin)
|
||||||
|
|
||||||
58. SOLR-1204: Enhance SpellingQueryConverter to handle UTF-8 instead of ASCII only (Michael Ludwig via shalin)
|
58. SOLR-1204: Enhance SpellingQueryConverter to handle UTF-8 instead of ASCII only.
|
||||||
|
Use the NMTOKEN syntax for matching field names.
|
||||||
|
(Michael Ludwig, shalin)
|
||||||
|
|
||||||
59. SOLR-1189: Support providing username and password for basic HTTP authentication in Java replication
|
59. SOLR-1189: Support providing username and password for basic HTTP authentication in Java replication
|
||||||
(Matthew Gregg, shalin)
|
(Matthew Gregg, shalin)
|
||||||
|
|
|
@ -37,7 +37,49 @@ import org.apache.lucene.analysis.TokenStream;
|
||||||
**/
|
**/
|
||||||
public class SpellingQueryConverter extends QueryConverter {
|
public class SpellingQueryConverter extends QueryConverter {
|
||||||
|
|
||||||
protected Pattern QUERY_REGEX = Pattern.compile("(?:(?!(\\p{L}+:|\\d+)))\\p{L}+");
|
/*
|
||||||
|
* The following builds up a regular expression that matches productions
|
||||||
|
* of the syntax for NMTOKEN as per the W3C XML Recommendation - with one
|
||||||
|
* important exception (see below).
|
||||||
|
*
|
||||||
|
* http://www.w3.org/TR/2008/REC-xml-20081126/ - version used as reference
|
||||||
|
*
|
||||||
|
* http://www.w3.org/TR/REC-xml/#NT-Nmtoken
|
||||||
|
*
|
||||||
|
* An NMTOKEN is a series of one or more NAMECHAR characters, which is an
|
||||||
|
* extension of the NAMESTARTCHAR character class.
|
||||||
|
*
|
||||||
|
* The EXCEPTION referred to above concerns the colon, which is legal in an
|
||||||
|
* NMTOKEN, but cannot currently be used as a valid field name within Solr,
|
||||||
|
* as it is used to delimit the field name from the query string.
|
||||||
|
*/
|
||||||
|
|
||||||
|
final static String[] NAMESTARTCHAR_PARTS = {
|
||||||
|
"A-Z_a-z", "\\xc0-\\xd6", "\\xd8-\\xf6", "\\xf8-\\u02ff",
|
||||||
|
"\\u0370-\\u037d", "\\u037f-\\u1fff",
|
||||||
|
"\\u200c-\\u200d", "\\u2070-\\u218f",
|
||||||
|
"\\u2c00-\\u2fef", "\\u2001-\\ud7ff",
|
||||||
|
"\\uf900-\\ufdcf", "\\ufdf0-\\ufffd"
|
||||||
|
};
|
||||||
|
final static String[] ADDITIONAL_NAMECHAR_PARTS = {
|
||||||
|
"\\-.0-9\\xb7", "\\u0300-\\u036f", "\\u203f-\\u2040"
|
||||||
|
};
|
||||||
|
final static String SURROGATE_PAIR = "\\p{Cs}{2}";
|
||||||
|
final static String NMTOKEN;
|
||||||
|
|
||||||
|
static {
|
||||||
|
StringBuilder sb = new StringBuilder();
|
||||||
|
for (String part : NAMESTARTCHAR_PARTS)
|
||||||
|
sb.append(part);
|
||||||
|
for (String part : ADDITIONAL_NAMECHAR_PARTS)
|
||||||
|
sb.append(part);
|
||||||
|
NMTOKEN = "([" + sb.toString() + "]|" + SURROGATE_PAIR + ")+";
|
||||||
|
}
|
||||||
|
|
||||||
|
final static String PATTERN = "(?:(?!(" + NMTOKEN + ":|\\d+)))[^\\s]+";
|
||||||
|
// previous version: Pattern.compile("(?:(?!(\\w+:|\\d+)))\\w+");
|
||||||
|
protected Pattern QUERY_REGEX = Pattern.compile(PATTERN);
|
||||||
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Converts the original query string to a collection of Lucene Tokens.
|
* Converts the original query string to a collection of Lucene Tokens.
|
||||||
|
|
|
@ -22,6 +22,7 @@ import org.apache.lucene.analysis.WhitespaceAnalyzer;
|
||||||
import org.apache.solr.common.util.NamedList;
|
import org.apache.solr.common.util.NamedList;
|
||||||
import static org.junit.Assert.assertTrue;
|
import static org.junit.Assert.assertTrue;
|
||||||
import org.junit.Test;
|
import org.junit.Test;
|
||||||
|
import org.junit.Assert;
|
||||||
|
|
||||||
import java.util.Collection;
|
import java.util.Collection;
|
||||||
|
|
||||||
|
@ -43,4 +44,64 @@ public class SpellingQueryConverterTest {
|
||||||
assertTrue("tokens is null and it shouldn't be", tokens != null);
|
assertTrue("tokens is null and it shouldn't be", tokens != null);
|
||||||
assertTrue("tokens Size: " + tokens.size() + " is not: " + 1, tokens.size() == 1);
|
assertTrue("tokens Size: " + tokens.size() + " is not: " + 1, tokens.size() == 1);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
public void testSpecialChars() {
|
||||||
|
SpellingQueryConverter converter = new SpellingQueryConverter();
|
||||||
|
converter.init(new NamedList());
|
||||||
|
converter.setAnalyzer(new WhitespaceAnalyzer());
|
||||||
|
Collection<Token> tokens = converter.convert("field_with_underscore:value_with_underscore");
|
||||||
|
assertTrue("tokens is null and it shouldn't be", tokens != null);
|
||||||
|
Assert.assertEquals("tokens Size: " + tokens.size() + " is not 1", 1, tokens.size());
|
||||||
|
|
||||||
|
tokens = converter.convert("field_with_digits123:value_with_digits123");
|
||||||
|
assertTrue("tokens is null and it shouldn't be", tokens != null);
|
||||||
|
Assert.assertEquals("tokens Size: " + tokens.size() + " is not 1", 1, tokens.size());
|
||||||
|
|
||||||
|
tokens = converter.convert("field-with-hyphens:value-with-hyphens");
|
||||||
|
assertTrue("tokens is null and it shouldn't be", tokens != null);
|
||||||
|
Assert.assertEquals("tokens Size: " + tokens.size() + " is not 1", 1, tokens.size());
|
||||||
|
|
||||||
|
// mix 'em up and add some to the value
|
||||||
|
tokens = converter.convert("field_with-123s:value_,.|with-hyphens");
|
||||||
|
assertTrue("tokens is null and it shouldn't be", tokens != null);
|
||||||
|
Assert.assertEquals("tokens Size: " + tokens.size() + " is not 1", 1, tokens.size());
|
||||||
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
public void testUnicode() {
|
||||||
|
SpellingQueryConverter converter = new SpellingQueryConverter();
|
||||||
|
converter.init(new NamedList());
|
||||||
|
converter.setAnalyzer(new WhitespaceAnalyzer());
|
||||||
|
|
||||||
|
// chinese text value
|
||||||
|
Collection<Token> tokens = converter.convert("text_field:我购买了道具和服装。");
|
||||||
|
assertTrue("tokens is null and it shouldn't be", tokens != null);
|
||||||
|
Assert.assertEquals("tokens Size: " + tokens.size() + " is not 1", 1, tokens.size());
|
||||||
|
|
||||||
|
tokens = converter.convert("text_购field:我购买了道具和服装。");
|
||||||
|
assertTrue("tokens is null and it shouldn't be", tokens != null);
|
||||||
|
Assert.assertEquals("tokens Size: " + tokens.size() + " is not 1", 1, tokens.size());
|
||||||
|
|
||||||
|
tokens = converter.convert("text_field:我购xyz买了道具和服装。");
|
||||||
|
assertTrue("tokens is null and it shouldn't be", tokens != null);
|
||||||
|
Assert.assertEquals("tokens Size: " + tokens.size() + " is not 1", 1, tokens.size());
|
||||||
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
public void testMultipleClauses() {
|
||||||
|
SpellingQueryConverter converter = new SpellingQueryConverter();
|
||||||
|
converter.init(new NamedList());
|
||||||
|
converter.setAnalyzer(new WhitespaceAnalyzer());
|
||||||
|
|
||||||
|
// two field:value pairs should give two tokens
|
||||||
|
Collection<Token> tokens = converter.convert("买text_field:我购买了道具和服装。 field2:bar");
|
||||||
|
assertTrue("tokens is null and it shouldn't be", tokens != null);
|
||||||
|
Assert.assertEquals("tokens Size: " + tokens.size() + " is not 2", 2, tokens.size());
|
||||||
|
|
||||||
|
// a field:value pair and a search term should give two tokens
|
||||||
|
tokens = converter.convert("text_field:我购买了道具和服装。 bar");
|
||||||
|
assertTrue("tokens is null and it shouldn't be", tokens != null);
|
||||||
|
Assert.assertEquals("tokens Size: " + tokens.size() + " is not 2", 2, tokens.size());
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
Loading…
Reference in New Issue