WordDelimiterFilter could lose position info

git-svn-id: https://svn.apache.org/repos/asf/incubator/solr/trunk@419321 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Yonik Seeley 2006-07-05 19:36:08 +00:00
parent b5919a7dc1
commit a35e30cb35
4 changed files with 58 additions and 26 deletions

View File

@ -43,6 +43,7 @@ Bug Fixes
2. Added escaping of attribute values in the XML response (Erik Hatcher) 2. Added escaping of attribute values in the XML response (Erik Hatcher)
3. Added empty extractTerms() to FunctionQuery to enable use in 3. Added empty extractTerms() to FunctionQuery to enable use in
a MultiSearcher (Yonik) a MultiSearcher (Yonik)
4. WordDelimiterFilter sometimes lost token positionIncrement information
Other Changes Other Changes
1. Upgrade to Lucene 2.0 nightly build 2006-06-22, lucene SVN revision 416224, 1. Upgrade to Lucene 2.0 nightly build 2006-06-22, lucene SVN revision 416224,

View File

@ -123,10 +123,6 @@ final class WordDelimiterFilter extends TokenFilter {
} }
} }
// Convenience overload: looks up the character at index pos of s and
// classifies it via the other charType overload (defined outside this view).
private int charType(String s, int pos) {
  final char ch = s.charAt(pos);
  return charType(ch);
}
// use the type of the first char as the type // use the type of the first char as the type
// of the token. // of the token.
private int tokType(Token t) { private int tokType(Token t) {
@ -170,16 +166,18 @@ final class WordDelimiterFilter extends TokenFilter {
// Would it actually be faster to check for the common form // Would it actually be faster to check for the common form
// of isLetter() isLower()*, and then backtrack if it doesn't match? // of isLetter() isLower()*, and then backtrack if it doesn't match?
int origPosIncrement;
while(true) { while(true) {
Token t = input.next(); Token t = input.next();
if (t == null) return null; if (t == null) return null;
String s = t.termText(); String s = t.termText();
int off=t.startOffset();
int start=0; int start=0;
int end=s.length(); int end=s.length();
if (end==0) continue; if (end==0) continue;
origPosIncrement = t.getPositionIncrement();
// Avoid calling charType more than once for each char (basically // Avoid calling charType more than once for each char (basically
// avoid any backtracking). // avoid any backtracking).
// makes code slightly more difficult, but faster. // makes code slightly more difficult, but faster.
@ -273,6 +271,7 @@ final class WordDelimiterFilter extends TokenFilter {
// optimization... if this is the only token, // optimization... if this is the only token,
// return it immediately. // return it immediately.
if (queue.size()==0) { if (queue.size()==0) {
newtok.setPositionIncrement(origPosIncrement);
return newtok; return newtok;
} }
@ -376,7 +375,9 @@ final class WordDelimiterFilter extends TokenFilter {
// System.out.println("##########AFTER COMBINATIONS:"+ str(queue)); // System.out.println("##########AFTER COMBINATIONS:"+ str(queue));
queuePos=1; queuePos=1;
return queue.get(0); Token tok = queue.get(0);
tok.setPositionIncrement(origPosIncrement);
return tok;
} }
@ -416,29 +417,10 @@ final class WordDelimiterFilter extends TokenFilter {
} }
} }
// Debugging aid: renders a token list as
//   {("term",increment=N),("term",increment=N),}
// i.e. one parenthesised entry per token, each followed by a comma
// (including the last one), all wrapped in braces.
private String str(List<Token> lst) {
  StringBuilder out = new StringBuilder();
  out.append('{');
  for (Token tok : lst) {
    out.append("(\"")
       .append(tok.termText())
       .append("\",increment=")
       .append(tok.getPositionIncrement())
       .append("),");
  }
  return out.append('}').toString();
}
// questions: // questions:
// negative numbers? -42 indexed as just 42? // negative numbers? -42 indexed as just 42?
// dollar sign? $42 // dollar sign? $42
// percent sign? 33% // percent sign? 33%
// downsides: if source text is "powershot" then a query of "PowerShot" won't match! // downsides: if source text is "powershot" then a query of "PowerShot" won't match!
} }

View File

@ -0,0 +1,49 @@
package org.apache.solr.analysis;
import org.apache.solr.util.AbstractSolrTestCase;
import org.apache.solr.util.TestHarness;
import org.apache.solr.request.SolrQueryRequest;
/**
 * Additional WordDelimiterFilter tests; most of the existing coverage lives in
 * ConvertedLegacyTest.
 */
public class TestWordDelimiterFilter extends AbstractSolrTestCase {

  public String getSchemaFile() {
    return "solr/conf/schema.xml";
  }

  public String getSolrConfigFile() {
    return "solr/conf/solrconfig.xml";
  }

  /**
   * Indexes document 42 with two values for the "subword" field and checks
   * that the gap between the two values survives analysis.
   *
   * The schema declares a positionIncrementGap of 100 between field values,
   * so a phrase query for s1 followed by s2 must not match with a slop of 90
   * and must match with a slop of 110.
   *
   * @param v1 first value indexed into "subword"
   * @param v2 second value indexed into "subword"
   * @param s1 first term of the phrase query
   * @param s2 second term of the phrase query
   */
  public void posTst(String v1, String v2, String s1, String s2) {
    assertU(adoc("id", "42", "subword", v1, "subword", v2));
    assertU(commit());

    // Shared phrase-query prefix; only the slop suffix differs below.
    String phrase = "+id:42 +subword:\"" + s1 + ' ' + s2 + "\"";

    // Slop smaller than the gap: the two values must be too far apart.
    assertQ("position increment lost",
            req(phrase + "~90"),
            "//result[@numFound=0]");

    // Slop larger than the gap: the phrase must now be found.
    assertQ("position increment lost",
            req(phrase + "~110"),
            "//result[@numFound=1]");
  }

  /**
   * Runs posTst over plain, delimiter-wrapped, numeric and multi-part inputs.
   */
  public void testRetainPositionIncrement() {
    // Each row: { indexed value 1, indexed value 2, query term 1, query term 2 }
    String[][] cases = {
      {"foo", "bar", "foo", "bar"},
      {"-foo-", "-bar-", "foo", "bar"},
      {"foo", "bar", "-foo-", "-bar-"},
      {"123", "456", "123", "456"},
      {"/123/", "/456/", "123", "456"},
      {"/123/abc", "qwe/456/", "abc", "qwe"},
      {"zoo-foo", "bar-baz", "foo", "bar"},
      {"zoo-foo-123", "456-bar-baz", "foo", "bar"},
    };
    for (String[] c : cases) {
      posTst(c[0], c[1], c[2], c[3]);
    }
  }
}

View File

@ -174,7 +174,7 @@
</analyzer> </analyzer>
</fieldtype> </fieldtype>
<fieldtype name="subword" class="solr.TextField"> <fieldtype name="subword" class="solr.TextField" multiValued="true" positionIncrementGap="100">
<analyzer type="index"> <analyzer type="index">
<tokenizer class="solr.WhitespaceTokenizerFactory"/> <tokenizer class="solr.WhitespaceTokenizerFactory"/>
<filter class="solr.WordDelimiterFilterFactory" generateWordParts="1" generateNumberParts="1" catenateWords="1" catenateNumbers="1" catenateAll="0"/> <filter class="solr.WordDelimiterFilterFactory" generateWordParts="1" generateNumberParts="1" catenateWords="1" catenateNumbers="1" catenateAll="0"/>