From a35e30cb353e2464cb35dfc88b1fedb1773bb3d9 Mon Sep 17 00:00:00 2001 From: Yonik Seeley Date: Wed, 5 Jul 2006 19:36:08 +0000 Subject: [PATCH] WordDelimiterFilter could loose position info git-svn-id: https://svn.apache.org/repos/asf/incubator/solr/trunk@419321 13f79535-47bb-0310-9956-ffa450edef68 --- CHANGES.txt | 1 + .../solr/analysis/WordDelimiterFilter.java | 32 +++--------- .../analysis/TestWordDelimiterFilter.java | 49 +++++++++++++++++++ src/test/test-files/solr/conf/schema.xml | 2 +- 4 files changed, 58 insertions(+), 26 deletions(-) create mode 100644 src/test/org/apache/solr/analysis/TestWordDelimiterFilter.java diff --git a/CHANGES.txt b/CHANGES.txt index 1ab2af761d2..79134109f54 100644 --- a/CHANGES.txt +++ b/CHANGES.txt @@ -43,6 +43,7 @@ Bug Fixes 2. Added escaping of attribute values in the XML response (Erik Hatcher) 3. Added empty extractTerms() to FunctionQuery to enable use in a MultiSearcher (Yonik) + 4. WordDelimiterFilter sometimes lost token positionIncrement information Other Changes 1. Upgrade to Lucene 2.0 nightly build 2006-06-22, lucene SVN revision 416224, diff --git a/src/java/org/apache/solr/analysis/WordDelimiterFilter.java b/src/java/org/apache/solr/analysis/WordDelimiterFilter.java index 47ff9bd3265..47e0788a063 100644 --- a/src/java/org/apache/solr/analysis/WordDelimiterFilter.java +++ b/src/java/org/apache/solr/analysis/WordDelimiterFilter.java @@ -123,10 +123,6 @@ final class WordDelimiterFilter extends TokenFilter { } } - private int charType(String s, int pos) { - return charType(s.charAt(pos)); - } - // use the type of the first char as the type // of the token. private int tokType(Token t) { @@ -170,16 +166,18 @@ final class WordDelimiterFilter extends TokenFilter { // Would it actually be faster to check for the common form // of isLetter() isLower()*, and then backtrack if it doesn't match? + int origPosIncrement; while(true) { Token t = input.next(); if (t == null) return null; String s = t.termText(); - int off=t.startOffset(); int start=0; int end=s.length(); if (end==0) continue; + origPosIncrement = t.getPositionIncrement(); + // Avoid calling charType more than once for each char (basically // avoid any backtracking). // makes code slightly more difficult, but faster. @@ -273,6 +271,7 @@ final class WordDelimiterFilter extends TokenFilter { // optimization... if this is the only token, // return it immediately. if (queue.size()==0) { + newtok.setPositionIncrement(origPosIncrement); return newtok; } @@ -376,7 +375,9 @@ final class WordDelimiterFilter extends TokenFilter { // System.out.println("##########AFTER COMBINATIONS:"+ str(queue)); queuePos=1; - return queue.get(0); + Token tok = queue.get(0); + tok.setPositionIncrement(origPosIncrement); + return tok; } @@ -416,29 +417,10 @@ final class WordDelimiterFilter extends TokenFilter { } } - private String str(List lst) { - StringBuilder sb = new StringBuilder(); - sb.append('{'); - for (Token t : lst) { - sb.append('('); - sb.append('"'); - sb.append(t.termText()); - sb.append("\",increment="); - sb.append(Integer.toString(t.getPositionIncrement())); - sb.append(')'); - - sb.append(','); - } - sb.append('}'); - return sb.toString(); - } - - // questions: // negative numbers? -42 indexed as just 42? // dollar sign? $42 // percent sign? 33% // downsides: if source text is "powershot" then a query of "PowerShot" won't match! - } diff --git a/src/test/org/apache/solr/analysis/TestWordDelimiterFilter.java b/src/test/org/apache/solr/analysis/TestWordDelimiterFilter.java new file mode 100644 index 00000000000..77d07351b56 --- /dev/null +++ b/src/test/org/apache/solr/analysis/TestWordDelimiterFilter.java @@ -0,0 +1,49 @@ +package org.apache.solr.analysis; + +import org.apache.solr.util.AbstractSolrTestCase; +import org.apache.solr.util.TestHarness; +import org.apache.solr.request.SolrQueryRequest; + +/** + * New WordDelimiterFilter tests... most of the tests are in ConvertedLegacyTest + */ +public class TestWordDelimiterFilter extends AbstractSolrTestCase { + public String getSchemaFile() { return "solr/conf/schema.xml"; } + public String getSolrConfigFile() { return "solr/conf/solrconfig.xml"; } + + + public void posTst(String v1, String v2, String s1, String s2) { + assertU(adoc("id", "42", + "subword", v1, + "subword", v2)); + assertU(commit()); + + // there is a positionIncrementGap of 100 between field values, so + // we test if that was maintained. + assertQ("position increment lost", + req("+id:42 +subword:\"" + s1 + ' ' + s2 + "\"~90") + ,"//result[@numFound=0]" + ); + assertQ("position increment lost", + req("+id:42 +subword:\"" + s1 + ' ' + s2 + "\"~110") + ,"//result[@numFound=1]" + ); + } + + + public void testRetainPositionIncrement() { + posTst("foo","bar","foo","bar"); + posTst("-foo-","-bar-","foo","bar"); + posTst("foo","bar","-foo-","-bar-"); + + posTst("123","456","123","456"); + posTst("/123/","/456/","123","456"); + + posTst("/123/abc","qwe/456/","abc","qwe"); + + posTst("zoo-foo","bar-baz","foo","bar"); + posTst("zoo-foo-123","456-bar-baz","foo","bar"); + } + + +} diff --git a/src/test/test-files/solr/conf/schema.xml b/src/test/test-files/solr/conf/schema.xml index 7216014adfe..f0c962a10cc 100644 --- a/src/test/test-files/solr/conf/schema.xml +++ b/src/test/test-files/solr/conf/schema.xml @@ -174,7 +174,7 @@ - +