mirror of https://github.com/apache/lucene.git

SOLR-14: Add preserveOriginal flag to WordDelimiterFilter

git-svn-id: https://svn.apache.org/repos/asf/lucene/solr/trunk@673715 13f79535-47bb-0310-9956-ffa450edef68

This commit is contained in:
parent de0dc7ecf7
commit 7db89131f8
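
For context, the new flag makes the filter emit the unsplit token alongside its subwords. A minimal sketch of the intended behavior (not part of this commit; the class name is illustrative, and it must live in org.apache.solr.analysis because WordDelimiterFilter is package-private; the expected output is inferred from the javadoc example and the position-increment handling in the patch below):

package org.apache.solr.analysis;

import java.io.StringReader;

import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.WhitespaceTokenizer;

public class PreserveOriginalSketch {
  public static void main(String[] args) throws Exception {
    TokenStream ts = new WhitespaceTokenizer(new StringReader("500-42"));
    // 7 int flags: generateWordParts, generateNumberParts, catenateWords,
    // catenateNumbers, catenateAll, splitOnCaseChange, preserveOriginal
    ts = new WordDelimiterFilter(ts, 1, 1, 0, 0, 0, 1, 1);
    for (Token t = ts.next(); t != null; t = ts.next()) {
      // expected: "500-42" first (the preserved original), then "500"
      // overlapping it (position increment 0), then "42"
      System.out.println(new String(t.termBuffer(), 0, t.termLength())
                         + " posInc=" + t.getPositionIncrement());
    }
  }
}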
CHANGES.txt
@@ -302,6 +302,9 @@ New Features
 55. SOLR-603: Added ability to partially optimize. (gsingers)
 
 56. SOLR-483: Add byte/short sorting support (gsingers)
 
+57. SOLR-14: Add preserveOriginal flag to WordDelimiterFilter
+    (Geoffrey Young, Trey Hyde, Ankur Madnani, yonik)
+
 Changes in runtime behavior
 
 1. SOLR-559: use Lucene updateDocument, deleteDocuments methods. This
WordDelimiterFilter.java
@@ -133,6 +133,13 @@ final class WordDelimiterFilter extends TokenFilter {
    */
   final int splitOnCaseChange;
 
+  /**
+   * If 1, original words are preserved and added to the subword list (Defaults to 0)
+   * <p/>
+   * "500-42" => "500" "42" "500-42"
+   */
+  final int preserveOriginal;
+
   /**
    *
    * @param in Token stream to be filtered.
@@ -143,8 +150,9 @@ final class WordDelimiterFilter extends TokenFilter {
    * @param catenateNumbers If 1, causes maximum runs of number parts to be catenated: "500-42" => "50042"
    * @param catenateAll If 1, causes all subword parts to be catenated: "wi-fi-4000" => "wifi4000"
    * @param splitOnCaseChange If 1, causes "PowerShot" to be two tokens; ("Power-Shot" remains two parts regardless)
+   * @param preserveOriginal If 1, includes original words in subwords: "500-42" => "500" "42" "500-42"
    */
-  public WordDelimiterFilter(TokenStream in, byte[] charTypeTable, int generateWordParts, int generateNumberParts, int catenateWords, int catenateNumbers, int catenateAll, int splitOnCaseChange) {
+  public WordDelimiterFilter(TokenStream in, byte[] charTypeTable, int generateWordParts, int generateNumberParts, int catenateWords, int catenateNumbers, int catenateAll, int splitOnCaseChange, int preserveOriginal) {
     super(in);
     this.generateWordParts = generateWordParts;
     this.generateNumberParts = generateNumberParts;
@@ -152,6 +160,7 @@ final class WordDelimiterFilter extends TokenFilter {
     this.catenateNumbers = catenateNumbers;
     this.catenateAll = catenateAll;
     this.splitOnCaseChange = splitOnCaseChange;
+    this.preserveOriginal = preserveOriginal;
     this.charTypeTable = charTypeTable;
   }
 
   /**
@@ -162,19 +171,20 @@ final class WordDelimiterFilter extends TokenFilter {
    * @param catenateNumbers If 1, causes maximum runs of number parts to be catenated: "500-42" => "50042"
    * @param catenateAll If 1, causes all subword parts to be catenated: "wi-fi-4000" => "wifi4000"
    * @param splitOnCaseChange If 1, causes "PowerShot" to be two tokens; ("Power-Shot" remains two parts regardless)
+   * @param preserveOriginal If 1, includes original words in subwords: "500-42" => "500" "42" "500-42"
    */
-  public WordDelimiterFilter(TokenStream in, int generateWordParts, int generateNumberParts, int catenateWords, int catenateNumbers, int catenateAll, int splitOnCaseChange) {
-    this(in, defaultWordDelimTable, generateWordParts, generateNumberParts, catenateWords, catenateNumbers, catenateAll, splitOnCaseChange);
+  public WordDelimiterFilter(TokenStream in, int generateWordParts, int generateNumberParts, int catenateWords, int catenateNumbers, int catenateAll, int splitOnCaseChange, int preserveOriginal) {
+    this(in, defaultWordDelimTable, generateWordParts, generateNumberParts, catenateWords, catenateNumbers, catenateAll, splitOnCaseChange, preserveOriginal);
   }
 
   /** Compatibility constructor */
   @Deprecated
   public WordDelimiterFilter(TokenStream in, byte[] charTypeTable, int generateWordParts, int generateNumberParts, int catenateWords, int catenateNumbers, int catenateAll) {
-    this(in, charTypeTable, generateWordParts, generateNumberParts, catenateWords, catenateNumbers, catenateAll, 1);
+    this(in, charTypeTable, generateWordParts, generateNumberParts, catenateWords, catenateNumbers, catenateAll, 1, 0);
   }
 
   /** Compatibility constructor */
   @Deprecated
   public WordDelimiterFilter(TokenStream in, int generateWordParts, int generateNumberParts, int catenateWords, int catenateNumbers, int catenateAll) {
-    this(in, defaultWordDelimTable, generateWordParts, generateNumberParts, catenateWords, catenateNumbers, catenateAll, 1);
+    this(in, defaultWordDelimTable, generateWordParts, generateNumberParts, catenateWords, catenateNumbers, catenateAll, 1, 0);
   }
 
   int charType(int ch) {
@@ -242,11 +252,12 @@ final class WordDelimiterFilter extends TokenFilter {
     // Would it actually be faster to check for the common form
     // of isLetter() isLower()*, and then backtrack if it doesn't match?
 
-    int origPosIncrement;
+    int origPosIncrement = 0;
+    Token t;
     while(true) {
       // t is either returned, or a new token is made from it, so it should
       // be safe to use the next(Token) method.
-      Token t = input.next(in);
+      t = input.next(in);
       if (t == null) return null;
 
       char [] termBuffer = t.termBuffer();
@@ -254,7 +265,7 @@ final class WordDelimiterFilter extends TokenFilter {
       int start=0;
       if (len ==0) continue;
 
-      origPosIncrement = t.getPositionIncrement();
+      origPosIncrement += t.getPositionIncrement();
 
       // Avoid calling charType more than once for each char (basically
       // avoid any backtracking).
@@ -348,15 +359,17 @@ final class WordDelimiterFilter extends TokenFilter {
           return t;
         }
 
-        Token newtok = newTok(t,start,pos);
-
         // optimization... if this is the only token,
         // return it immediately.
-        if (queue.size()==0) {
-          newtok.setPositionIncrement(origPosIncrement);
-          return newtok;
+        if (queue.size()==0 && preserveOriginal == 0) {
+          // just adjust the text w/o changing the rest
+          // of the original token
+          t.setTermBuffer(termBuffer, start, len-start);
+          return t;
         }
 
+        Token newtok = newTok(t,start,pos);
+
         queue.add(newtok);
         if ((firstType & ALPHA)!=0) numWords++;
         break;
@@ -379,14 +392,20 @@ final class WordDelimiterFilter extends TokenFilter {
       // If the queue is empty, we should continue by reading
       // the next token
       if (numtok==0) {
+        // the token might have been all delimiters, in which
+        // case return it if we're meant to preserve it
+        if (preserveOriginal != 0) {
+          return t;
+        }
         continue;
       }
 
-      // if number of tokens is 1, always return the single tok
+      // if number of tokens is 1, there are no catenations to be done.
       if (numtok==1) {
         break;
       }
 
       final int numNumbers = numtok - numWords;
 
       // check conditions under which the current token
@@ -411,16 +430,16 @@ final class WordDelimiterFilter extends TokenFilter {
       if (numWords==0) {
         // all numbers
         addCombos(tlist,0,numtok,generateNumberParts!=0,catenateNumbers!=0 || catenateAll!=0, 1);
-        if (queue.size() > 0) break; else continue;
+        if (queue.size() > 0 || preserveOriginal!=0) break; else continue;
       } else if (numNumbers==0) {
         // all words
         addCombos(tlist,0,numtok,generateWordParts!=0,catenateWords!=0 || catenateAll!=0, 1);
-        if (queue.size() > 0) break; else continue;
+        if (queue.size() > 0 || preserveOriginal!=0) break; else continue;
       } else if (generateNumberParts==0 && generateWordParts==0 && catenateNumbers==0 && catenateWords==0) {
         // catenate all *only*
         // OPT:could be optimized to add to current queue...
         addCombos(tlist,0,numtok,false,catenateAll!=0, 1);
-        if (queue.size() > 0) break; else continue;
+        if (queue.size() > 0 || preserveOriginal!=0) break; else continue;
       }
 
       //
@@ -454,15 +473,24 @@ final class WordDelimiterFilter extends TokenFilter {
       // NOTE: in certain cases, queue may be empty (for instance, if catenate
       // and generate are both set to false). Only exit the loop if the queue
       // is not empty.
-      if (queue.size() > 0) break;
+      if (queue.size() > 0 || preserveOriginal!=0) break;
     }
 
     // System.out.println("##########AFTER COMBINATIONS:"+ str(queue));
 
-    queuePos=1;
-    Token tok = queue.get(0);
-    tok.setPositionIncrement(origPosIncrement);
-    return tok;
+    if (preserveOriginal != 0) {
+      queuePos = 0;
+      if (queue.size() > 0) {
+        // overlap first token with the original
+        queue.get(0).setPositionIncrement(0);
+      }
+      return t; // return the original
+    } else {
+      queuePos=1;
+      Token tok = queue.get(0);
+      tok.setPositionIncrement(origPosIncrement);
+      return tok;
+    }
   }
WordDelimiterFilterFactory.java
@@ -30,6 +30,7 @@ public class WordDelimiterFilterFactory extends BaseTokenFilterFactory {
   int catenateNumbers=0;
   int catenateAll=0;
   int splitOnCaseChange=0;
+  int preserveOriginal=0;
 
   @Override
   public void init(Map<String, String> args) {

@@ -40,12 +41,13 @@ public class WordDelimiterFilterFactory extends BaseTokenFilterFactory {
     catenateNumbers = getInt("catenateNumbers", 0);
     catenateAll = getInt("catenateAll", 0);
     splitOnCaseChange = getInt("splitOnCaseChange", 1);
+    preserveOriginal = getInt("preserveOriginal", 0);
   }
 
   public WordDelimiterFilter create(TokenStream input) {
     return new WordDelimiterFilter(input,
                                    generateWordParts, generateNumberParts,
                                    catenateWords, catenateNumbers, catenateAll,
-                                   splitOnCaseChange);
+                                   splitOnCaseChange, preserveOriginal);
   }
 }
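
A hedged sketch of driving the factory programmatically (illustrative only; the attribute names mirror the getInt() calls above, and preserveOriginal falls back to 0 when the attribute is absent):

import java.io.StringReader;
import java.util.HashMap;
import java.util.Map;

import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.WhitespaceTokenizer;
import org.apache.solr.analysis.WordDelimiterFilterFactory;

public class FactorySketch {
  public static void main(String[] args) {
    WordDelimiterFilterFactory factory = new WordDelimiterFilterFactory();
    Map<String, String> params = new HashMap<String, String>();
    params.put("generateWordParts", "1");
    params.put("generateNumberParts", "1");
    params.put("preserveOriginal", "1"); // read via getInt("preserveOriginal", 0)
    factory.init(params);
    TokenStream ts = factory.create(
        new WhitespaceTokenizer(new StringReader("wi-fi-4000")));
    // per this patch, ts should emit "wi-fi-4000" (the original),
    // then "wi", "fi", "4000"
  }
}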
TestWordDelimiterFilter.java
@@ -20,8 +20,10 @@ package org.apache.solr.analysis;
 import org.apache.solr.util.AbstractSolrTestCase;
 import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.Token;
+import org.apache.lucene.analysis.WhitespaceTokenizer;
 
 import java.io.IOException;
 import java.io.StringReader;
 
 /**
  * New WordDelimiterFilter tests... most of the tests are in ConvertedLegacyTest
@@ -85,6 +87,49 @@ public class TestWordDelimiterFilter extends AbstractSolrTestCase {
     );
   }
 
+
+  public void testPreserveOrignalTrue() {
+
+    assertU(adoc("id", "144",
+                 "wdf_preserve", "404-123"));
+    assertU(commit());
+
+    assertQ("preserving original word",
+            req("wdf_preserve:404")
+            ,"//result[@numFound=1]"
+    );
+
+    assertQ("preserving original word",
+            req("wdf_preserve:123")
+            ,"//result[@numFound=1]"
+    );
+
+    assertQ("preserving original word",
+            req("wdf_preserve:404-123*")
+            ,"//result[@numFound=1]"
+    );
+
+  }
+
+  /***
+  public void testPerformance() throws IOException {
+    String s = "now is the time-for all good men to come to-the aid of their country.";
+    Token tok = new Token();
+    long start = System.currentTimeMillis();
+    int ret=0;
+    for (int i=0; i<1000000; i++) {
+      StringReader r = new StringReader(s);
+      TokenStream ts = new WhitespaceTokenizer(r);
+      ts = new WordDelimiterFilter(ts, 1,1,1,1,0);
+
+      while (ts.next(tok) != null) ret++;
+    }
+
+    System.out.println("ret="+ret+" time="+(System.currentTimeMillis()-start));
+  }
+  ***/
+
+
   public void testOffsets() throws IOException {
 
     // test that subwords and catenated subwords have
@@ -98,7 +143,7 @@ public class TestWordDelimiterFilter extends AbstractSolrTestCase {
             return t;
           }
         },
-        1,1,0,0,1,1);
+        1,1,0,0,1,1,0);
 
     int i=0;
     for(Token t; (t=wdf.next())!=null;) {
@@ -131,7 +176,7 @@ public class TestWordDelimiterFilter extends AbstractSolrTestCase {
             return t;
           }
         },
-        1,1,0,0,1,1);
+        1,1,0,0,1,1,0);
     for(Token t; (t=wdf.next())!=null;) {
       assertEquals(5, t.startOffset());
       assertEquals(6, t.endOffset());
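
The testPreserveOrignalTrue case above exercises the flag through the full index/query path; a lower-level companion check of the overlap semantics might look like the following (hypothetical, not part of this commit; it would sit in the same test class and use the package-private constructor the way testOffsets does):

  public void testPreserveOriginalPositions() throws IOException {
    TokenStream ts = new WordDelimiterFilter(
        new WhitespaceTokenizer(new StringReader("404-123")),
        1, 1, 0, 0, 0, 1, 1); // last flag: preserveOriginal=1
    Token t = ts.next();
    assertEquals("404-123", new String(t.termBuffer(), 0, t.termLength()));
    t = ts.next();
    assertEquals("404", new String(t.termBuffer(), 0, t.termLength()));
    assertEquals(0, t.getPositionIncrement()); // overlaps the preserved original
    t = ts.next();
    assertEquals("123", new String(t.termBuffer(), 0, t.termLength()));
    assertEquals(1, t.getPositionIncrement());
  }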
schema.xml
@@ -86,7 +86,15 @@
   <fieldtype name="wdf_nocase" class="solr.TextField">
     <analyzer>
       <tokenizer class="solr.WhitespaceTokenizerFactory"/>
-      <filter class="solr.WordDelimiterFilterFactory" generateWordParts="1" generateNumberParts="0" catenateWords="0" catenateNumbers="0" catenateAll="0" splitOnCaseChange="0"/>
+      <filter class="solr.WordDelimiterFilterFactory" generateWordParts="1" generateNumberParts="0" catenateWords="0" catenateNumbers="0" catenateAll="0" splitOnCaseChange="0" preserveOriginal="0"/>
       <filter class="solr.LowerCaseFilterFactory"/>
     </analyzer>
   </fieldtype>
 
+  <fieldtype name="wdf_preserve" class="solr.TextField">
+    <analyzer>
+      <tokenizer class="solr.WhitespaceTokenizerFactory"/>
+      <filter class="solr.WordDelimiterFilterFactory" generateWordParts="0" generateNumberParts="1" catenateWords="0" catenateNumbers="0" catenateAll="0" splitOnCaseChange="0" preserveOriginal="1"/>
+      <filter class="solr.LowerCaseFilterFactory"/>
+    </analyzer>
+  </fieldtype>
+
@@ -369,6 +377,7 @@
   <field name="lengthfilt" type="lengthfilt" indexed="true" stored="true"/>
   <field name="dedup" type="dedup" indexed="true" stored="true"/>
   <field name="wdf_nocase" type="wdf_nocase" indexed="true" stored="true"/>
+  <field name="wdf_preserve" type="wdf_preserve" indexed="true" stored="true"/>
 
   <field name="numberpartfail" type="failtype1" indexed="true" stored="true"/>