SOLR-14: Add preserveOriginal flag to WordDelimiterFilter

git-svn-id: https://svn.apache.org/repos/asf/lucene/solr/trunk@673715 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Yonik Seeley 2008-07-03 15:40:14 +00:00
parent de0dc7ecf7
commit 7db89131f8
5 changed files with 113 additions and 26 deletions

View File

@ -302,6 +302,9 @@ New Features
55. SOLR-603: Added ability to partially optimize. (gsingers)
56. SOLR-483: Add byte/short sorting support (gsingers)
57. SOLR-14: Add preserveOriginal flag to WordDelimiterFilter
(Geoffrey Young, Trey Hyde, Ankur Madnani, yonik)
Changes in runtime behavior
1. SOLR-559: use Lucene updateDocument, deleteDocuments methods. This

View File

@ -133,6 +133,13 @@ final class WordDelimiterFilter extends TokenFilter {
*/
final int splitOnCaseChange;
/**
* If 1, original words are preserved and added to the subword list (Defaults to 0)
* <p/>
* "500-42" => "500" "42" "500-42"
*/
final int preserveOriginal;
/**
*
* @param in Token stream to be filtered.
@ -143,8 +150,9 @@ final class WordDelimiterFilter extends TokenFilter {
* @param catenateNumbers If 1, causes maximum runs of number parts to be catenated: "500-42" => "50042"
* @param catenateAll If 1, causes all subword parts to be catenated: "wi-fi-4000" => "wifi4000"
* @param splitOnCaseChange If 1, causes "PowerShot" to be two tokens; ("Power-Shot" remains two parts regardless)
* @param preserveOriginal If 1, includes original words in subwords: "500-42" => "500" "42" "500-42"
*/
public WordDelimiterFilter(TokenStream in, byte[] charTypeTable, int generateWordParts, int generateNumberParts, int catenateWords, int catenateNumbers, int catenateAll, int splitOnCaseChange) {
public WordDelimiterFilter(TokenStream in, byte[] charTypeTable, int generateWordParts, int generateNumberParts, int catenateWords, int catenateNumbers, int catenateAll, int splitOnCaseChange, int preserveOriginal) {
super(in);
this.generateWordParts = generateWordParts;
this.generateNumberParts = generateNumberParts;
@ -152,6 +160,7 @@ final class WordDelimiterFilter extends TokenFilter {
this.catenateNumbers = catenateNumbers;
this.catenateAll = catenateAll;
this.splitOnCaseChange = splitOnCaseChange;
this.preserveOriginal = preserveOriginal;
this.charTypeTable = charTypeTable;
}
/**
@ -162,19 +171,20 @@ final class WordDelimiterFilter extends TokenFilter {
* @param catenateNumbers If 1, causes maximum runs of number parts to be catenated: "500-42" => "50042"
* @param catenateAll If 1, causes all subword parts to be catenated: "wi-fi-4000" => "wifi4000"
* @param splitOnCaseChange If 1, causes "PowerShot" to be two tokens; ("Power-Shot" remains two parts regardless)
* @param preserveOriginal If 1, includes original words in subwords: "500-42" => "500" "42" "500-42"
*/
public WordDelimiterFilter(TokenStream in, int generateWordParts, int generateNumberParts, int catenateWords, int catenateNumbers, int catenateAll, int splitOnCaseChange) {
this(in, defaultWordDelimTable, generateWordParts, generateNumberParts, catenateWords, catenateNumbers, catenateAll, splitOnCaseChange);
// Convenience constructor: delegates to the full constructor, supplying the
// default character-type table for classifying delimiter characters.
public WordDelimiterFilter(TokenStream in, int generateWordParts, int generateNumberParts, int catenateWords, int catenateNumbers, int catenateAll, int splitOnCaseChange, int preserveOriginal) {
this(in, defaultWordDelimTable, generateWordParts, generateNumberParts, catenateWords, catenateNumbers, catenateAll, splitOnCaseChange, preserveOriginal);
}
/** Compatibility constructor */
@Deprecated
public WordDelimiterFilter(TokenStream in, byte[] charTypeTable, int generateWordParts, int generateNumberParts, int catenateWords, int catenateNumbers, int catenateAll) {
this(in, charTypeTable, generateWordParts, generateNumberParts, catenateWords, catenateNumbers, catenateAll, 1);
this(in, charTypeTable, generateWordParts, generateNumberParts, catenateWords, catenateNumbers, catenateAll, 1, 0);
}
/** Compatibility constructor */
@Deprecated
public WordDelimiterFilter(TokenStream in, int generateWordParts, int generateNumberParts, int catenateWords, int catenateNumbers, int catenateAll) {
this(in, defaultWordDelimTable, generateWordParts, generateNumberParts, catenateWords, catenateNumbers, catenateAll, 1);
this(in, defaultWordDelimTable, generateWordParts, generateNumberParts, catenateWords, catenateNumbers, catenateAll, 1, 0);
}
int charType(int ch) {
@ -242,11 +252,12 @@ final class WordDelimiterFilter extends TokenFilter {
// Would it actually be faster to check for the common form
// of isLetter() isLower()*, and then backtrack if it doesn't match?
int origPosIncrement;
int origPosIncrement = 0;
Token t;
while(true) {
// t is either returned, or a new token is made from it, so it should
// be safe to use the next(Token) method.
Token t = input.next(in);
t = input.next(in);
if (t == null) return null;
char [] termBuffer = t.termBuffer();
@ -254,7 +265,7 @@ final class WordDelimiterFilter extends TokenFilter {
int start=0;
if (len ==0) continue;
origPosIncrement = t.getPositionIncrement();
origPosIncrement += t.getPositionIncrement();
// Avoid calling charType more than once for each char (basically
// avoid any backtracking).
@ -348,15 +359,17 @@ final class WordDelimiterFilter extends TokenFilter {
return t;
}
Token newtok = newTok(t,start,pos);
// optimization... if this is the only token,
// return it immediately.
if (queue.size()==0) {
newtok.setPositionIncrement(origPosIncrement);
return newtok;
if (queue.size()==0 && preserveOriginal == 0) {
// just adjust the text w/o changing the rest
// of the original token
t.setTermBuffer(termBuffer, start, len-start);
return t;
}
Token newtok = newTok(t,start,pos);
queue.add(newtok);
if ((firstType & ALPHA)!=0) numWords++;
break;
@ -379,14 +392,20 @@ final class WordDelimiterFilter extends TokenFilter {
// If the queue is empty, we should continue by reading
// the next token
if (numtok==0) {
// the token might have been all delimiters, in which
// case return it if we're meant to preserve it
if (preserveOriginal != 0) {
return t;
}
continue;
}
// if number of tokens is 1, always return the single tok
// if number of tokens is 1, there are no catenations to be done.
if (numtok==1) {
break;
}
final int numNumbers = numtok - numWords;
// check conditions under which the current token
@ -411,16 +430,16 @@ final class WordDelimiterFilter extends TokenFilter {
if (numWords==0) {
// all numbers
addCombos(tlist,0,numtok,generateNumberParts!=0,catenateNumbers!=0 || catenateAll!=0, 1);
if (queue.size() > 0) break; else continue;
if (queue.size() > 0 || preserveOriginal!=0) break; else continue;
} else if (numNumbers==0) {
// all words
addCombos(tlist,0,numtok,generateWordParts!=0,catenateWords!=0 || catenateAll!=0, 1);
if (queue.size() > 0) break; else continue;
if (queue.size() > 0 || preserveOriginal!=0) break; else continue;
} else if (generateNumberParts==0 && generateWordParts==0 && catenateNumbers==0 && catenateWords==0) {
// catenate all *only*
// OPT:could be optimized to add to current queue...
addCombos(tlist,0,numtok,false,catenateAll!=0, 1);
if (queue.size() > 0) break; else continue;
if (queue.size() > 0 || preserveOriginal!=0) break; else continue;
}
//
@ -454,15 +473,24 @@ final class WordDelimiterFilter extends TokenFilter {
// NOTE: in certain cases, queue may be empty (for instance, if catenate
// and generate are both set to false). Only exit the loop if the queue
// is not empty.
if (queue.size() > 0) break;
if (queue.size() > 0 || preserveOriginal!=0) break;
}
// System.out.println("##########AFTER COMBINATIONS:"+ str(queue));
queuePos=1;
Token tok = queue.get(0);
tok.setPositionIncrement(origPosIncrement);
return tok;
if (preserveOriginal != 0) {
queuePos = 0;
if (queue.size() > 0) {
// overlap first token with the original
queue.get(0).setPositionIncrement(0);
}
return t; // return the original
} else {
queuePos=1;
Token tok = queue.get(0);
tok.setPositionIncrement(origPosIncrement);
return tok;
}
}

View File

@ -30,6 +30,7 @@ public class WordDelimiterFilterFactory extends BaseTokenFilterFactory {
int catenateNumbers=0;
int catenateAll=0;
int splitOnCaseChange=0;
int preserveOriginal=0;
@Override
public void init(Map<String, String> args) {
@ -40,12 +41,13 @@ public class WordDelimiterFilterFactory extends BaseTokenFilterFactory {
catenateNumbers = getInt("catenateNumbers", 0);
catenateAll = getInt("catenateAll", 0);
splitOnCaseChange = getInt("splitOnCaseChange", 1);
preserveOriginal = getInt("preserveOriginal", 0);
}
public WordDelimiterFilter create(TokenStream input) {
return new WordDelimiterFilter(input,
generateWordParts, generateNumberParts,
catenateWords, catenateNumbers, catenateAll,
splitOnCaseChange);
splitOnCaseChange, preserveOriginal);
}
}

View File

@ -20,8 +20,10 @@ package org.apache.solr.analysis;
import org.apache.solr.util.AbstractSolrTestCase;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.WhitespaceTokenizer;
import java.io.IOException;
import java.io.StringReader;
/**
* New WordDelimiterFilter tests... most of the tests are in ConvertedLegacyTest
@ -85,6 +87,49 @@ public class TestWordDelimiterFilter extends AbstractSolrTestCase {
);
}
public void testPreserveOrignalTrue() {
  // Index a single document whose field value contains a delimiter.
  // The wdf_preserve field type sets preserveOriginal="1", so the
  // original token "404-123" should be kept alongside its subwords.
  assertU(adoc("id", "144",
               "wdf_preserve", "404-123"));
  assertU(commit());

  // Each of the generated subwords, and a prefix query against the
  // preserved original token, must match the indexed document.
  final String[] queries = {
      "wdf_preserve:404",
      "wdf_preserve:123",
      "wdf_preserve:404-123*"
  };
  for (String q : queries) {
    assertQ("preserving original word",
            req(q),
            "//result[@numFound=1]");
  }
}
/***
public void testPerformance() throws IOException {
String s = "now is the time-for all good men to come to-the aid of their country.";
Token tok = new Token();
long start = System.currentTimeMillis();
int ret=0;
for (int i=0; i<1000000; i++) {
StringReader r = new StringReader(s);
TokenStream ts = new WhitespaceTokenizer(r);
ts = new WordDelimiterFilter(ts, 1,1,1,1,0);
while (ts.next(tok) != null) ret++;
}
System.out.println("ret="+ret+" time="+(System.currentTimeMillis()-start));
}
***/
public void testOffsets() throws IOException {
// test that subwords and catenated subwords have
@ -98,7 +143,7 @@ public class TestWordDelimiterFilter extends AbstractSolrTestCase {
return t;
}
},
1,1,0,0,1,1);
1,1,0,0,1,1,0);
int i=0;
for(Token t; (t=wdf.next())!=null;) {
@ -131,7 +176,7 @@ public class TestWordDelimiterFilter extends AbstractSolrTestCase {
return t;
}
},
1,1,0,0,1,1);
1,1,0,0,1,1,0);
for(Token t; (t=wdf.next())!=null;) {
assertEquals(5, t.startOffset());
assertEquals(6, t.endOffset());

View File

@ -86,7 +86,15 @@
<fieldtype name="wdf_nocase" class="solr.TextField">
<analyzer>
<tokenizer class="solr.WhitespaceTokenizerFactory"/>
<filter class="solr.WordDelimiterFilterFactory" generateWordParts="1" generateNumberParts="0" catenateWords="0" catenateNumbers="0" catenateAll="0" splitOnCaseChange="0"/>
<filter class="solr.WordDelimiterFilterFactory" generateWordParts="1" generateNumberParts="0" catenateWords="0" catenateNumbers="0" catenateAll="0" splitOnCaseChange="0" preserveOriginal="0"/>
<filter class="solr.LowerCaseFilterFactory"/>
</analyzer>
</fieldtype>
<fieldtype name="wdf_preserve" class="solr.TextField">
<analyzer>
<tokenizer class="solr.WhitespaceTokenizerFactory"/>
<filter class="solr.WordDelimiterFilterFactory" generateWordParts="0" generateNumberParts="1" catenateWords="0" catenateNumbers="0" catenateAll="0" splitOnCaseChange="0" preserveOriginal="1"/>
<filter class="solr.LowerCaseFilterFactory"/>
</analyzer>
</fieldtype>
@ -369,6 +377,7 @@
<field name="lengthfilt" type="lengthfilt" indexed="true" stored="true"/>
<field name="dedup" type="dedup" indexed="true" stored="true"/>
<field name="wdf_nocase" type="wdf_nocase" indexed="true" stored="true"/>
<field name="wdf_preserve" type="wdf_preserve" indexed="true" stored="true"/>
<field name="numberpartfail" type="failtype1" indexed="true" stored="true"/>