diff --git a/CHANGES.txt b/CHANGES.txt index 0d87320110f..4af29f03bf3 100644 --- a/CHANGES.txt +++ b/CHANGES.txt @@ -302,6 +302,9 @@ New Features 55. SOLR-603: Added ability to partially optimize. (gsingers) 56. SOLR-483: Add byte/short sorting support (gsingers) + +57. SOLR-14: Add preserveOriginal flag to WordDelimiterFilter + (Geoffrey Young, Trey Hyde, Ankur Madnani, yonik) Changes in runtime behavior 1. SOLR-559: use Lucene updateDocument, deleteDocuments methods. This diff --git a/src/java/org/apache/solr/analysis/WordDelimiterFilter.java b/src/java/org/apache/solr/analysis/WordDelimiterFilter.java index c6057f37783..f4278d12e25 100644 --- a/src/java/org/apache/solr/analysis/WordDelimiterFilter.java +++ b/src/java/org/apache/solr/analysis/WordDelimiterFilter.java @@ -133,6 +133,13 @@ final class WordDelimiterFilter extends TokenFilter { */ final int splitOnCaseChange; + /** + * If 1, original words are preserved and added to the subword list (Defaults to 0) + *

+ * "500-42" => "500" "42" "500-42" + */ + final int preserveOriginal; + /** * * @param in Token stream to be filtered. @@ -143,8 +150,9 @@ final class WordDelimiterFilter extends TokenFilter { * @param catenateNumbers If 1, causes maximum runs of number parts to be catenated: "500-42" => "50042" * @param catenateAll If 1, causes all subword parts to be catenated: "wi-fi-4000" => "wifi4000" * @param splitOnCaseChange 1, causes "PowerShot" to be two tokens; ("Power-Shot" remains two parts regards) + * @param preserveOriginal If 1, includes original words in subwords: "500-42" => "500" "42" "500-42" */ - public WordDelimiterFilter(TokenStream in, byte[] charTypeTable, int generateWordParts, int generateNumberParts, int catenateWords, int catenateNumbers, int catenateAll, int splitOnCaseChange) { + public WordDelimiterFilter(TokenStream in, byte[] charTypeTable, int generateWordParts, int generateNumberParts, int catenateWords, int catenateNumbers, int catenateAll, int splitOnCaseChange, int preserveOriginal) { super(in); this.generateWordParts = generateWordParts; this.generateNumberParts = generateNumberParts; @@ -152,6 +160,7 @@ final class WordDelimiterFilter extends TokenFilter { this.catenateNumbers = catenateNumbers; this.catenateAll = catenateAll; this.splitOnCaseChange = splitOnCaseChange; + this.preserveOriginal = preserveOriginal; this.charTypeTable = charTypeTable; } /** @@ -162,19 +171,20 @@ final class WordDelimiterFilter extends TokenFilter { * @param catenateNumbers If 1, causes maximum runs of number parts to be catenated: "500-42" => "50042" * @param catenateAll If 1, causes all subword parts to be catenated: "wi-fi-4000" => "wifi4000" * @param splitOnCaseChange 1, causes "PowerShot" to be two tokens; ("Power-Shot" remains two parts regards) + * @param preserveOriginal If 1, includes original words in subwords: "500-42" => "500" "42" "500-42" */ - public WordDelimiterFilter(TokenStream in, int generateWordParts, int generateNumberParts, int catenateWords, int catenateNumbers, int catenateAll, int splitOnCaseChange) { - this(in, defaultWordDelimTable, generateWordParts, generateNumberParts, catenateWords, catenateNumbers, catenateAll, splitOnCaseChange); + public WordDelimiterFilter(TokenStream in, int generateWordParts, int generateNumberParts, int catenateWords, int catenateNumbers, int catenateAll, int splitOnCaseChange, int preserveOriginal) { + this(in, defaultWordDelimTable, generateWordParts, generateNumberParts, catenateWords, catenateNumbers, catenateAll, splitOnCaseChange, preserveOriginal); } /** Compatibility constructor */ @Deprecated public WordDelimiterFilter(TokenStream in, byte[] charTypeTable, int generateWordParts, int generateNumberParts, int catenateWords, int catenateNumbers, int catenateAll) { - this(in, charTypeTable, generateWordParts, generateNumberParts, catenateWords, catenateNumbers, catenateAll, 1); + this(in, charTypeTable, generateWordParts, generateNumberParts, catenateWords, catenateNumbers, catenateAll, 1, 0); } /** Compatibility constructor */ @Deprecated public WordDelimiterFilter(TokenStream in, int generateWordParts, int generateNumberParts, int catenateWords, int catenateNumbers, int catenateAll) { - this(in, defaultWordDelimTable, generateWordParts, generateNumberParts, catenateWords, catenateNumbers, catenateAll, 1); + this(in, defaultWordDelimTable, generateWordParts, generateNumberParts, catenateWords, catenateNumbers, catenateAll, 1, 0); } int charType(int ch) { @@ -242,11 +252,12 @@ final class WordDelimiterFilter extends TokenFilter { // Would it actually be faster to check for the common form // of isLetter() isLower()*, and then backtrack if it doesn't match? - int origPosIncrement; + int origPosIncrement = 0; + Token t; while(true) { // t is either returned, or a new token is made from it, so it should // be safe to use the next(Token) method. - Token t = input.next(in); + t = input.next(in); if (t == null) return null; char [] termBuffer = t.termBuffer(); @@ -254,7 +265,7 @@ final class WordDelimiterFilter extends TokenFilter { int start=0; if (len ==0) continue; - origPosIncrement = t.getPositionIncrement(); + origPosIncrement += t.getPositionIncrement(); // Avoid calling charType more than once for each char (basically // avoid any backtracking). @@ -348,15 +359,17 @@ final class WordDelimiterFilter extends TokenFilter { return t; } - Token newtok = newTok(t,start,pos); - // optimization... if this is the only token, // return it immediately. - if (queue.size()==0) { - newtok.setPositionIncrement(origPosIncrement); - return newtok; + if (queue.size()==0 && preserveOriginal == 0) { + // just adjust the text w/o changing the rest + // of the original token + t.setTermBuffer(termBuffer, start, len-start); + return t; } + Token newtok = newTok(t,start,pos); + queue.add(newtok); if ((firstType & ALPHA)!=0) numWords++; break; @@ -379,14 +392,20 @@ final class WordDelimiterFilter extends TokenFilter { // If the queue is empty, we should continue by reading // the next token if (numtok==0) { + // the token might have been all delimiters, in which + // case return it if we're meant to preserve it + if (preserveOriginal != 0) { + return t; + } continue; } - // if number of tokens is 1, always return the single tok + // if number of tokens is 1, there are no catenations to be done. if (numtok==1) { break; } + final int numNumbers = numtok - numWords; // check conditions under which the current token @@ -411,16 +430,16 @@ final class WordDelimiterFilter extends TokenFilter { if (numWords==0) { // all numbers addCombos(tlist,0,numtok,generateNumberParts!=0,catenateNumbers!=0 || catenateAll!=0, 1); - if (queue.size() > 0) break; else continue; + if (queue.size() > 0 || preserveOriginal!=0) break; else continue; } else if (numNumbers==0) { // all words addCombos(tlist,0,numtok,generateWordParts!=0,catenateWords!=0 || catenateAll!=0, 1); - if (queue.size() > 0) break; else continue; + if (queue.size() > 0 || preserveOriginal!=0) break; else continue; } else if (generateNumberParts==0 && generateWordParts==0 && catenateNumbers==0 && catenateWords==0) { // catenate all *only* // OPT:could be optimized to add to current queue... addCombos(tlist,0,numtok,false,catenateAll!=0, 1); - if (queue.size() > 0) break; else continue; + if (queue.size() > 0 || preserveOriginal!=0) break; else continue; } // @@ -454,15 +473,24 @@ final class WordDelimiterFilter extends TokenFilter { // NOTE: in certain cases, queue may be empty (for instance, if catenate // and generate are both set to false). Only exit the loop if the queue // is not empty. - if (queue.size() > 0) break; + if (queue.size() > 0 || preserveOriginal!=0) break; } // System.out.println("##########AFTER COMBINATIONS:"+ str(queue)); - queuePos=1; - Token tok = queue.get(0); - tok.setPositionIncrement(origPosIncrement); - return tok; + if (preserveOriginal != 0) { + queuePos = 0; + if (queue.size() > 0) { + // overlap first token with the original + queue.get(0).setPositionIncrement(0); + } + return t; // return the original + } else { + queuePos=1; + Token tok = queue.get(0); + tok.setPositionIncrement(origPosIncrement); + return tok; + } } diff --git a/src/java/org/apache/solr/analysis/WordDelimiterFilterFactory.java b/src/java/org/apache/solr/analysis/WordDelimiterFilterFactory.java index b294075f9e3..132ae993c8d 100644 --- a/src/java/org/apache/solr/analysis/WordDelimiterFilterFactory.java +++ b/src/java/org/apache/solr/analysis/WordDelimiterFilterFactory.java @@ -30,6 +30,7 @@ public class WordDelimiterFilterFactory extends BaseTokenFilterFactory { int catenateNumbers=0; int catenateAll=0; int splitOnCaseChange=0; + int preserveOriginal=0; @Override public void init(Map args) { @@ -40,12 +41,13 @@ public class WordDelimiterFilterFactory extends BaseTokenFilterFactory { catenateNumbers = getInt("catenateNumbers", 0); catenateAll = getInt("catenateAll", 0); splitOnCaseChange = getInt("splitOnCaseChange", 1); + preserveOriginal = getInt("preserveOriginal", 0); } public WordDelimiterFilter create(TokenStream input) { return new WordDelimiterFilter(input, generateWordParts, generateNumberParts, catenateWords, catenateNumbers, catenateAll, - splitOnCaseChange); + splitOnCaseChange, preserveOriginal); } } diff --git a/src/test/org/apache/solr/analysis/TestWordDelimiterFilter.java b/src/test/org/apache/solr/analysis/TestWordDelimiterFilter.java index 07d0b6a0047..0b34fdc700a 100644 --- a/src/test/org/apache/solr/analysis/TestWordDelimiterFilter.java +++ b/src/test/org/apache/solr/analysis/TestWordDelimiterFilter.java @@ -20,8 +20,10 @@ package org.apache.solr.analysis; import org.apache.solr.util.AbstractSolrTestCase; import org.apache.lucene.analysis.TokenStream; import org.apache.lucene.analysis.Token; +import org.apache.lucene.analysis.WhitespaceTokenizer; import java.io.IOException; +import java.io.StringReader; /** * New WordDelimiterFilter tests... most of the tests are in ConvertedLegacyTest @@ -85,6 +87,49 @@ public class TestWordDelimiterFilter extends AbstractSolrTestCase { ); } + + public void testPreserveOrignalTrue() { + + assertU(adoc("id", "144", + "wdf_preserve", "404-123")); + assertU(commit()); + + assertQ("preserving original word", + req("wdf_preserve:404") + ,"//result[@numFound=1]" + ); + + assertQ("preserving original word", + req("wdf_preserve:123") + ,"//result[@numFound=1]" + ); + + assertQ("preserving original word", + req("wdf_preserve:404-123*") + ,"//result[@numFound=1]" + ); + + } + + /*** + public void testPerformance() throws IOException { + String s = "now is the time-for all good men to come to-the aid of their country."; + Token tok = new Token(); + long start = System.currentTimeMillis(); + int ret=0; + for (int i=0; i<1000000; i++) { + StringReader r = new StringReader(s); + TokenStream ts = new WhitespaceTokenizer(r); + ts = new WordDelimiterFilter(ts, 1,1,1,1,0); + + while (ts.next(tok) != null) ret++; + } + + System.out.println("ret="+ret+" time="+(System.currentTimeMillis()-start)); + } + ***/ + + public void testOffsets() throws IOException { // test that subwords and catenated subwords have @@ -98,7 +143,7 @@ public class TestWordDelimiterFilter extends AbstractSolrTestCase { return t; } }, - 1,1,0,0,1,1); + 1,1,0,0,1,1,0); int i=0; for(Token t; (t=wdf.next())!=null;) { @@ -131,7 +176,7 @@ public class TestWordDelimiterFilter extends AbstractSolrTestCase { return t; } }, - 1,1,0,0,1,1); + 1,1,0,0,1,1,0); for(Token t; (t=wdf.next())!=null;) { assertEquals(5, t.startOffset()); assertEquals(6, t.endOffset()); diff --git a/src/test/test-files/solr/conf/schema.xml b/src/test/test-files/solr/conf/schema.xml index f0c6aaccd1c..89e82b42bd0 100644 --- a/src/test/test-files/solr/conf/schema.xml +++ b/src/test/test-files/solr/conf/schema.xml @@ -86,7 +86,15 @@ - + + + + + + + + + @@ -369,6 +377,7 @@ +