HyphenatedWordsFilter: SOLR-41

git-svn-id: https://svn.apache.org/repos/asf/incubator/solr/trunk@428512 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Yonik Seeley 2006-08-03 20:19:45 +00:00
parent bc40671140
commit 75770514c9
4 changed files with 164 additions and 0 deletions

View File

@ -40,6 +40,8 @@ New Features
The default operator remains "OR". The default operator remains "OR".
21. JAVA API: new version of SolrIndexSearcher.getDocListAndSet() which takes 21. JAVA API: new version of SolrIndexSearcher.getDocListAndSet() which takes
flags (Greg Ludington via yonik, SOLR-39) flags (Greg Ludington via yonik, SOLR-39)
22. A HyphenatedWordsFilter, a text analysis filter used during indexing to rejoin
words that were hyphenated and split by a newline. (Boris Vitez via yonik, SOLR-41)
Changes in runtime behavior Changes in runtime behavior
1. classes reorganized into different packages, package names changed to Apache 1. classes reorganized into different packages, package names changed to Apache

View File

@ -0,0 +1,94 @@
package org.apache.solr.analysis;
/**
* Copyright 2006 The Apache Software Foundation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.IOException;
import org.apache.lucene.analysis.*;
/**
* When the plain text is extracted from documents, we will often have many words hyphenated and broken into
* two lines. This is often the case with documents where narrow text columns are used, such as newsletters.
* In order to increase search efficiency, this filter puts hyphenated words broken into two lines back together.
* This filter should be used on indexing time only.
* Example field definition in schema.xml:
* <pre>
* <fieldtype name="text" class="solr.TextField" positionIncrementGap="100">
* <analyzer type="index">
* <tokenizer class="solr.WhitespaceTokenizerFactory"/>
* <filter class="solr.SynonymFilterFactory" synonyms="index_synonyms.txt" ignoreCase="true" expand="false"/>
* <filter class="solr.StopFilterFactory" ignoreCase="true"/>
* <filter class="solr.HyphenatedWordsFilterFactory"/>
* <filter class="solr.WordDelimiterFilterFactory" generateWordParts="1" generateNumberParts="1" catenateWords="1" catenateNumbers="1" catenateAll="0"/>
* <filter class="solr.LowerCaseFilterFactory"/>
* <filter class="solr.RemoveDuplicatesTokenFilterFactory"/>
* </analyzer>
* <analyzer type="query">
* <tokenizer class="solr.WhitespaceTokenizerFactory"/>
* <filter class="solr.SynonymFilterFactory" synonyms="synonyms.txt" ignoreCase="true" expand="true"/>
* <filter class="solr.StopFilterFactory" ignoreCase="true"/>
* <filter class="solr.WordDelimiterFilterFactory" generateWordParts="1" generateNumberParts="1" catenateWords="0" catenateNumbers="0" catenateAll="0"/>
* <filter class="solr.LowerCaseFilterFactory"/>
* <filter class="solr.RemoveDuplicatesTokenFilterFactory"/>
* </analyzer>
* </fieldtype>
*
* @author Boris Vitez
*/
public final class HyphenatedWordsFilter extends TokenFilter {
public HyphenatedWordsFilter(TokenStream in) {
super(in);
}
/**
* @inheritDoc
* @see org.apache.lucene.analysis.TokenStream#next()
*/
public final Token next() throws IOException {
StringBuffer termText = new StringBuffer(25);
int startOffset = -1, firstPositionIncrement = -1, wordsMerged = 0;
Token lastToken = null;
for (Token token = input.next(); token != null; token = input.next()) {
termText.append(token.termText());
//current token ends with hyphen -> grab the next token and glue them together
if (termText.charAt(termText.length() - 1) == '-') {
wordsMerged++;
//remove the hyphen
termText.setLength(termText.length()-1);
if (startOffset == -1) {
startOffset = token.startOffset();
firstPositionIncrement = token.getPositionIncrement();
}
lastToken = token;
} else {
//shortcut returns token
if (wordsMerged == 0)
return token;
Token mergedToken = new Token(termText.toString(), startOffset, token.endOffset(), token.type());
mergedToken.setPositionIncrement(firstPositionIncrement);
return mergedToken;
}
}
//last token ending with hyphen? - we know that we have only one token in
//this situation, so we can safely return firstToken
if (startOffset != -1)
return lastToken;
else
return null; //end of token stream
}
}

View File

@ -0,0 +1,30 @@
package org.apache.solr.analysis;
/**
* Copyright 2006 The Apache Software Foundation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import org.apache.lucene.analysis.TokenStream;
import org.apache.solr.analysis.BaseTokenFilterFactory;
/**
* Factory for HyphenatedWordsFilter
* @author Boris Vitez
*/
public class HyphenatedWordsFilterFactory extends BaseTokenFilterFactory {
public TokenStream create(TokenStream input) {
return new HyphenatedWordsFilter(input);
}
}

View File

@ -0,0 +1,38 @@
package org.apache.solr.analysis;
import java.io.IOException;
import java.io.StringReader;
import junit.framework.TestCase;
import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.WhitespaceTokenizer;
/**
* HyphenatedWordsFilter test
*/
public class TestHyphenatedWordsFilter extends TestCase {
public void testHyphenatedWords() throws Exception {
String input = "ecologi-\r\ncal devel-\r\n\r\nop compre-\u0009hensive-hands-on";
String outputAfterHyphenatedWordsFilter = "ecological develop comprehensive-hands-on";
// first test
TokenStream ts = new WhitespaceTokenizer(new StringReader(input));
ts = new HyphenatedWordsFilter(ts);
String actual = tsToString(ts);
assertEquals("Testing HyphenatedWordsFilter",
outputAfterHyphenatedWordsFilter, actual);
}
public static String tsToString(TokenStream in) throws IOException {
StringBuffer out = new StringBuffer();
Token t = in.next();
if (null != t)
out.append(t.termText());
for (t = in.next(); null != t; t = in.next()) {
out.append(" ").append(t.termText());
}
return out.toString();
}
}