WordDelimiterFilter could loose position info

git-svn-id: https://svn.apache.org/repos/asf/incubator/solr/trunk@419321 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Yonik Seeley 2006-07-05 19:36:08 +00:00
parent b5919a7dc1
commit a35e30cb35
4 changed files with 58 additions and 26 deletions

View File

@ -43,6 +43,7 @@ Bug Fixes
2. Added escaping of attribute values in the XML response (Erik Hatcher)
3. Added empty extractTerms() to FunctionQuery to enable use in
a MultiSearcher (Yonik)
4. WordDelimiterFilter sometimes lost token positionIncrement information
Other Changes
1. Upgrade to Lucene 2.0 nightly build 2006-06-22, lucene SVN revision 416224,

View File

@ -123,10 +123,6 @@ final class WordDelimiterFilter extends TokenFilter {
}
}
private int charType(String s, int pos) {
return charType(s.charAt(pos));
}
// use the type of the first char as the type
// of the token.
private int tokType(Token t) {
@ -170,16 +166,18 @@ final class WordDelimiterFilter extends TokenFilter {
// Would it actually be faster to check for the common form
// of isLetter() isLower()*, and then backtrack if it doesn't match?
int origPosIncrement;
while(true) {
Token t = input.next();
if (t == null) return null;
String s = t.termText();
int off=t.startOffset();
int start=0;
int end=s.length();
if (end==0) continue;
origPosIncrement = t.getPositionIncrement();
// Avoid calling charType more than once for each char (basically
// avoid any backtracking).
// makes code slightly more difficult, but faster.
@ -273,6 +271,7 @@ final class WordDelimiterFilter extends TokenFilter {
// optimization... if this is the only token,
// return it immediately.
if (queue.size()==0) {
newtok.setPositionIncrement(origPosIncrement);
return newtok;
}
@ -376,7 +375,9 @@ final class WordDelimiterFilter extends TokenFilter {
// System.out.println("##########AFTER COMBINATIONS:"+ str(queue));
queuePos=1;
return queue.get(0);
Token tok = queue.get(0);
tok.setPositionIncrement(origPosIncrement);
return tok;
}
@ -416,29 +417,10 @@ final class WordDelimiterFilter extends TokenFilter {
}
}
private String str(List<Token> lst) {
StringBuilder sb = new StringBuilder();
sb.append('{');
for (Token t : lst) {
sb.append('(');
sb.append('"');
sb.append(t.termText());
sb.append("\",increment=");
sb.append(Integer.toString(t.getPositionIncrement()));
sb.append(')');
sb.append(',');
}
sb.append('}');
return sb.toString();
}
// questions:
// negative numbers? -42 indexed as just 42?
// dollar sign? $42
// percent sign? 33%
// downsides: if source text is "powershot" then a query of "PowerShot" won't match!
}

View File

@ -0,0 +1,49 @@
package org.apache.solr.analysis;
import org.apache.solr.util.AbstractSolrTestCase;
import org.apache.solr.util.TestHarness;
import org.apache.solr.request.SolrQueryRequest;
/**
* New WordDelimiterFilter tests... most of the tests are in ConvertedLegacyTest
*/
public class TestWordDelimiterFilter extends AbstractSolrTestCase {
public String getSchemaFile() { return "solr/conf/schema.xml"; }
public String getSolrConfigFile() { return "solr/conf/solrconfig.xml"; }
public void posTst(String v1, String v2, String s1, String s2) {
assertU(adoc("id", "42",
"subword", v1,
"subword", v2));
assertU(commit());
// there is a positionIncrementGap of 100 between field values, so
// we test if that was maintained.
assertQ("position increment lost",
req("+id:42 +subword:\"" + s1 + ' ' + s2 + "\"~90")
,"//result[@numFound=0]"
);
assertQ("position increment lost",
req("+id:42 +subword:\"" + s1 + ' ' + s2 + "\"~110")
,"//result[@numFound=1]"
);
}
public void testRetainPositionIncrement() {
posTst("foo","bar","foo","bar");
posTst("-foo-","-bar-","foo","bar");
posTst("foo","bar","-foo-","-bar-");
posTst("123","456","123","456");
posTst("/123/","/456/","123","456");
posTst("/123/abc","qwe/456/","abc","qwe");
posTst("zoo-foo","bar-baz","foo","bar");
posTst("zoo-foo-123","456-bar-baz","foo","bar");
}
}

View File

@ -174,7 +174,7 @@
</analyzer>
</fieldtype>
<fieldtype name="subword" class="solr.TextField">
<fieldtype name="subword" class="solr.TextField" multiValued="true" positionIncrementGap="100">
<analyzer type="index">
<tokenizer class="solr.WhitespaceTokenizerFactory"/>
<filter class="solr.WordDelimiterFilterFactory" generateWordParts="1" generateNumberParts="1" catenateWords="1" catenateNumbers="1" catenateAll="0"/>