mirror of https://github.com/apache/lucene.git
WordDelimiterFilter could loose position info
git-svn-id: https://svn.apache.org/repos/asf/incubator/solr/trunk@419321 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent
b5919a7dc1
commit
a35e30cb35
|
@ -43,6 +43,7 @@ Bug Fixes
|
|||
2. Added escaping of attribute values in the XML response (Erik Hatcher)
|
||||
3. Added empty extractTerms() to FunctionQuery to enable use in
|
||||
a MultiSearcher (Yonik)
|
||||
4. WordDelimiterFilter sometimes lost token positionIncrement information
|
||||
|
||||
Other Changes
|
||||
1. Upgrade to Lucene 2.0 nightly build 2006-06-22, lucene SVN revision 416224,
|
||||
|
|
|
@ -123,10 +123,6 @@ final class WordDelimiterFilter extends TokenFilter {
|
|||
}
|
||||
}
|
||||
|
||||
private int charType(String s, int pos) {
|
||||
return charType(s.charAt(pos));
|
||||
}
|
||||
|
||||
// use the type of the first char as the type
|
||||
// of the token.
|
||||
private int tokType(Token t) {
|
||||
|
@ -170,16 +166,18 @@ final class WordDelimiterFilter extends TokenFilter {
|
|||
// Would it actually be faster to check for the common form
|
||||
// of isLetter() isLower()*, and then backtrack if it doesn't match?
|
||||
|
||||
int origPosIncrement;
|
||||
while(true) {
|
||||
Token t = input.next();
|
||||
if (t == null) return null;
|
||||
|
||||
String s = t.termText();
|
||||
int off=t.startOffset();
|
||||
int start=0;
|
||||
int end=s.length();
|
||||
if (end==0) continue;
|
||||
|
||||
origPosIncrement = t.getPositionIncrement();
|
||||
|
||||
// Avoid calling charType more than once for each char (basically
|
||||
// avoid any backtracking).
|
||||
// makes code slightly more difficult, but faster.
|
||||
|
@ -273,6 +271,7 @@ final class WordDelimiterFilter extends TokenFilter {
|
|||
// optimization... if this is the only token,
|
||||
// return it immediately.
|
||||
if (queue.size()==0) {
|
||||
newtok.setPositionIncrement(origPosIncrement);
|
||||
return newtok;
|
||||
}
|
||||
|
||||
|
@ -376,7 +375,9 @@ final class WordDelimiterFilter extends TokenFilter {
|
|||
// System.out.println("##########AFTER COMBINATIONS:"+ str(queue));
|
||||
|
||||
queuePos=1;
|
||||
return queue.get(0);
|
||||
Token tok = queue.get(0);
|
||||
tok.setPositionIncrement(origPosIncrement);
|
||||
return tok;
|
||||
}
|
||||
|
||||
|
||||
|
@ -416,29 +417,10 @@ final class WordDelimiterFilter extends TokenFilter {
|
|||
}
|
||||
}
|
||||
|
||||
private String str(List<Token> lst) {
|
||||
StringBuilder sb = new StringBuilder();
|
||||
sb.append('{');
|
||||
for (Token t : lst) {
|
||||
sb.append('(');
|
||||
sb.append('"');
|
||||
sb.append(t.termText());
|
||||
sb.append("\",increment=");
|
||||
sb.append(Integer.toString(t.getPositionIncrement()));
|
||||
sb.append(')');
|
||||
|
||||
sb.append(',');
|
||||
}
|
||||
sb.append('}');
|
||||
return sb.toString();
|
||||
}
|
||||
|
||||
|
||||
|
||||
// questions:
|
||||
// negative numbers? -42 indexed as just 42?
|
||||
// dollar sign? $42
|
||||
// percent sign? 33%
|
||||
// downsides: if source text is "powershot" then a query of "PowerShot" won't match!
|
||||
|
||||
}
|
||||
|
|
|
@ -0,0 +1,49 @@
|
|||
package org.apache.solr.analysis;
|
||||
|
||||
import org.apache.solr.util.AbstractSolrTestCase;
|
||||
import org.apache.solr.util.TestHarness;
|
||||
import org.apache.solr.request.SolrQueryRequest;
|
||||
|
||||
/**
|
||||
* New WordDelimiterFilter tests... most of the tests are in ConvertedLegacyTest
|
||||
*/
|
||||
public class TestWordDelimiterFilter extends AbstractSolrTestCase {
|
||||
public String getSchemaFile() { return "solr/conf/schema.xml"; }
|
||||
public String getSolrConfigFile() { return "solr/conf/solrconfig.xml"; }
|
||||
|
||||
|
||||
public void posTst(String v1, String v2, String s1, String s2) {
|
||||
assertU(adoc("id", "42",
|
||||
"subword", v1,
|
||||
"subword", v2));
|
||||
assertU(commit());
|
||||
|
||||
// there is a positionIncrementGap of 100 between field values, so
|
||||
// we test if that was maintained.
|
||||
assertQ("position increment lost",
|
||||
req("+id:42 +subword:\"" + s1 + ' ' + s2 + "\"~90")
|
||||
,"//result[@numFound=0]"
|
||||
);
|
||||
assertQ("position increment lost",
|
||||
req("+id:42 +subword:\"" + s1 + ' ' + s2 + "\"~110")
|
||||
,"//result[@numFound=1]"
|
||||
);
|
||||
}
|
||||
|
||||
|
||||
public void testRetainPositionIncrement() {
|
||||
posTst("foo","bar","foo","bar");
|
||||
posTst("-foo-","-bar-","foo","bar");
|
||||
posTst("foo","bar","-foo-","-bar-");
|
||||
|
||||
posTst("123","456","123","456");
|
||||
posTst("/123/","/456/","123","456");
|
||||
|
||||
posTst("/123/abc","qwe/456/","abc","qwe");
|
||||
|
||||
posTst("zoo-foo","bar-baz","foo","bar");
|
||||
posTst("zoo-foo-123","456-bar-baz","foo","bar");
|
||||
}
|
||||
|
||||
|
||||
}
|
|
@ -174,7 +174,7 @@
|
|||
</analyzer>
|
||||
</fieldtype>
|
||||
|
||||
<fieldtype name="subword" class="solr.TextField">
|
||||
<fieldtype name="subword" class="solr.TextField" multiValued="true" positionIncrementGap="100">
|
||||
<analyzer type="index">
|
||||
<tokenizer class="solr.WhitespaceTokenizerFactory"/>
|
||||
<filter class="solr.WordDelimiterFilterFactory" generateWordParts="1" generateNumberParts="1" catenateWords="1" catenateNumbers="1" catenateAll="0"/>
|
||||
|
|
Loading…
Reference in New Issue