mirror of https://github.com/apache/lucene.git
WordDelimiterFilter could lose position info
git-svn-id: https://svn.apache.org/repos/asf/incubator/solr/trunk@419321 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent
b5919a7dc1
commit
a35e30cb35
|
@ -43,6 +43,7 @@ Bug Fixes
|
||||||
2. Added escaping of attribute values in the XML response (Erik Hatcher)
|
2. Added escaping of attribute values in the XML response (Erik Hatcher)
|
||||||
3. Added empty extractTerms() to FunctionQuery to enable use in
|
3. Added empty extractTerms() to FunctionQuery to enable use in
|
||||||
a MultiSearcher (Yonik)
|
a MultiSearcher (Yonik)
|
||||||
|
4. WordDelimiterFilter sometimes lost token positionIncrement information
|
||||||
|
|
||||||
Other Changes
|
Other Changes
|
||||||
1. Upgrade to Lucene 2.0 nightly build 2006-06-22, lucene SVN revision 416224,
|
1. Upgrade to Lucene 2.0 nightly build 2006-06-22, lucene SVN revision 416224,
|
||||||
|
|
|
@ -123,10 +123,6 @@ final class WordDelimiterFilter extends TokenFilter {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
private int charType(String s, int pos) {
|
|
||||||
return charType(s.charAt(pos));
|
|
||||||
}
|
|
||||||
|
|
||||||
// use the type of the first char as the type
|
// use the type of the first char as the type
|
||||||
// of the token.
|
// of the token.
|
||||||
private int tokType(Token t) {
|
private int tokType(Token t) {
|
||||||
|
@ -170,16 +166,18 @@ final class WordDelimiterFilter extends TokenFilter {
|
||||||
// Would it actually be faster to check for the common form
|
// Would it actually be faster to check for the common form
|
||||||
// of isLetter() isLower()*, and then backtrack if it doesn't match?
|
// of isLetter() isLower()*, and then backtrack if it doesn't match?
|
||||||
|
|
||||||
|
int origPosIncrement;
|
||||||
while(true) {
|
while(true) {
|
||||||
Token t = input.next();
|
Token t = input.next();
|
||||||
if (t == null) return null;
|
if (t == null) return null;
|
||||||
|
|
||||||
String s = t.termText();
|
String s = t.termText();
|
||||||
int off=t.startOffset();
|
|
||||||
int start=0;
|
int start=0;
|
||||||
int end=s.length();
|
int end=s.length();
|
||||||
if (end==0) continue;
|
if (end==0) continue;
|
||||||
|
|
||||||
|
origPosIncrement = t.getPositionIncrement();
|
||||||
|
|
||||||
// Avoid calling charType more than once for each char (basically
|
// Avoid calling charType more than once for each char (basically
|
||||||
// avoid any backtracking).
|
// avoid any backtracking).
|
||||||
// makes code slightly more difficult, but faster.
|
// makes code slightly more difficult, but faster.
|
||||||
|
@ -273,6 +271,7 @@ final class WordDelimiterFilter extends TokenFilter {
|
||||||
// optimization... if this is the only token,
|
// optimization... if this is the only token,
|
||||||
// return it immediately.
|
// return it immediately.
|
||||||
if (queue.size()==0) {
|
if (queue.size()==0) {
|
||||||
|
newtok.setPositionIncrement(origPosIncrement);
|
||||||
return newtok;
|
return newtok;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -376,7 +375,9 @@ final class WordDelimiterFilter extends TokenFilter {
|
||||||
// System.out.println("##########AFTER COMBINATIONS:"+ str(queue));
|
// System.out.println("##########AFTER COMBINATIONS:"+ str(queue));
|
||||||
|
|
||||||
queuePos=1;
|
queuePos=1;
|
||||||
return queue.get(0);
|
Token tok = queue.get(0);
|
||||||
|
tok.setPositionIncrement(origPosIncrement);
|
||||||
|
return tok;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
@ -416,29 +417,10 @@ final class WordDelimiterFilter extends TokenFilter {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
private String str(List<Token> lst) {
|
|
||||||
StringBuilder sb = new StringBuilder();
|
|
||||||
sb.append('{');
|
|
||||||
for (Token t : lst) {
|
|
||||||
sb.append('(');
|
|
||||||
sb.append('"');
|
|
||||||
sb.append(t.termText());
|
|
||||||
sb.append("\",increment=");
|
|
||||||
sb.append(Integer.toString(t.getPositionIncrement()));
|
|
||||||
sb.append(')');
|
|
||||||
|
|
||||||
sb.append(',');
|
|
||||||
}
|
|
||||||
sb.append('}');
|
|
||||||
return sb.toString();
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
// questions:
|
// questions:
|
||||||
// negative numbers? -42 indexed as just 42?
|
// negative numbers? -42 indexed as just 42?
|
||||||
// dollar sign? $42
|
// dollar sign? $42
|
||||||
// percent sign? 33%
|
// percent sign? 33%
|
||||||
// downsides: if source text is "powershot" then a query of "PowerShot" won't match!
|
// downsides: if source text is "powershot" then a query of "PowerShot" won't match!
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
|
@ -0,0 +1,49 @@
|
||||||
|
package org.apache.solr.analysis;
|
||||||
|
|
||||||
|
import org.apache.solr.util.AbstractSolrTestCase;
|
||||||
|
import org.apache.solr.util.TestHarness;
|
||||||
|
import org.apache.solr.request.SolrQueryRequest;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* New WordDelimiterFilter tests... most of the tests are in ConvertedLegacyTest
|
||||||
|
*/
|
||||||
|
public class TestWordDelimiterFilter extends AbstractSolrTestCase {
|
||||||
|
public String getSchemaFile() { return "solr/conf/schema.xml"; }
|
||||||
|
public String getSolrConfigFile() { return "solr/conf/solrconfig.xml"; }
|
||||||
|
|
||||||
|
|
||||||
|
public void posTst(String v1, String v2, String s1, String s2) {
|
||||||
|
assertU(adoc("id", "42",
|
||||||
|
"subword", v1,
|
||||||
|
"subword", v2));
|
||||||
|
assertU(commit());
|
||||||
|
|
||||||
|
// there is a positionIncrementGap of 100 between field values, so
|
||||||
|
// we test if that was maintained.
|
||||||
|
assertQ("position increment lost",
|
||||||
|
req("+id:42 +subword:\"" + s1 + ' ' + s2 + "\"~90")
|
||||||
|
,"//result[@numFound=0]"
|
||||||
|
);
|
||||||
|
assertQ("position increment lost",
|
||||||
|
req("+id:42 +subword:\"" + s1 + ' ' + s2 + "\"~110")
|
||||||
|
,"//result[@numFound=1]"
|
||||||
|
);
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
public void testRetainPositionIncrement() {
|
||||||
|
posTst("foo","bar","foo","bar");
|
||||||
|
posTst("-foo-","-bar-","foo","bar");
|
||||||
|
posTst("foo","bar","-foo-","-bar-");
|
||||||
|
|
||||||
|
posTst("123","456","123","456");
|
||||||
|
posTst("/123/","/456/","123","456");
|
||||||
|
|
||||||
|
posTst("/123/abc","qwe/456/","abc","qwe");
|
||||||
|
|
||||||
|
posTst("zoo-foo","bar-baz","foo","bar");
|
||||||
|
posTst("zoo-foo-123","456-bar-baz","foo","bar");
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
}
|
|
@ -174,7 +174,7 @@
|
||||||
</analyzer>
|
</analyzer>
|
||||||
</fieldtype>
|
</fieldtype>
|
||||||
|
|
||||||
<fieldtype name="subword" class="solr.TextField">
|
<fieldtype name="subword" class="solr.TextField" multiValued="true" positionIncrementGap="100">
|
||||||
<analyzer type="index">
|
<analyzer type="index">
|
||||||
<tokenizer class="solr.WhitespaceTokenizerFactory"/>
|
<tokenizer class="solr.WhitespaceTokenizerFactory"/>
|
||||||
<filter class="solr.WordDelimiterFilterFactory" generateWordParts="1" generateNumberParts="1" catenateWords="1" catenateNumbers="1" catenateAll="0"/>
|
<filter class="solr.WordDelimiterFilterFactory" generateWordParts="1" generateNumberParts="1" catenateWords="1" catenateNumbers="1" catenateAll="0"/>
|
||||||
|
|
Loading…
Reference in New Issue