From e2c2a8d240c61465b59c8ce4df61a37acba8a335 Mon Sep 17 00:00:00 2001 From: Grant Ingersoll Date: Tue, 1 Apr 2008 16:10:19 +0000 Subject: [PATCH] SOLR-330: Converted Solr tokenstreams to use Lucene's char[] capabilities git-svn-id: https://svn.apache.org/repos/asf/lucene/solr/trunk@643465 13f79535-47bb-0310-9956-ffa450edef68 --- CHANGES.txt | 1 + .../solr/analysis/BufferedTokenStream.java | 2 +- .../analysis/EnglishPorterFilterFactory.java | 101 ++++++++-------- .../solr/analysis/HyphenatedWordsFilter.java | 51 +++++---- .../apache/solr/analysis/KeepWordFilter.java | 18 ++- .../apache/solr/analysis/LengthFilter.java | 4 +- .../solr/analysis/LengthFilterFactory.java | 8 +- .../solr/analysis/PatternReplaceFilter.java | 9 +- .../apache/solr/analysis/PhoneticFilter.java | 22 ++-- .../analysis/RemoveDuplicatesTokenFilter.java | 17 ++- .../org/apache/solr/analysis/TrimFilter.java | 72 ++++++------ .../solr/analysis/WordDelimiterFilter.java | 43 ++++--- .../org/apache/solr/util/ArraysUtils.java | 35 ++++++ .../EnglishPorterFilterFactoryTest.java | 96 ++++++++++++++++ .../solr/analysis/LengthFilterTest.java | 36 ++++++ .../analysis/TestHyphenatedWordsFilter.java | 4 +- .../analysis/TestPatternReplaceFilter.java | 108 +++++++++++------- .../solr/analysis/TestPhoneticFilter.java | 4 +- .../apache/solr/analysis/TestTrimFilter.java | 15 ++- .../org/apache/solr/util/ArraysUtilsTest.java | 48 ++++++++ 20 files changed, 469 insertions(+), 225 deletions(-) create mode 100644 src/java/org/apache/solr/util/ArraysUtils.java create mode 100644 src/test/org/apache/solr/analysis/EnglishPorterFilterFactoryTest.java create mode 100644 src/test/org/apache/solr/analysis/LengthFilterTest.java create mode 100644 src/test/org/apache/solr/util/ArraysUtilsTest.java diff --git a/CHANGES.txt b/CHANGES.txt index 1107ec2ec6e..c9f08a2ac0e 100644 --- a/CHANGES.txt +++ b/CHANGES.txt @@ -867,6 +867,7 @@ Optimizations a single token per document (not multiValued & not tokenized) by using the Lucene FieldCache entry for that field to tally term counts. The first request utilizing the FieldCache will take longer than subsequent ones. + 7. Converted TokenStreams to use Lucene's new char array based capabilities. (SOLR-330, gsingers) Bug Fixes 1. 
Fixed delete-by-id for field types who's indexed form is different diff --git a/src/java/org/apache/solr/analysis/BufferedTokenStream.java b/src/java/org/apache/solr/analysis/BufferedTokenStream.java index 45332dfd0d2..e320f7ebd4f 100644 --- a/src/java/org/apache/solr/analysis/BufferedTokenStream.java +++ b/src/java/org/apache/solr/analysis/BufferedTokenStream.java @@ -55,7 +55,7 @@ import java.util.LinkedList; * @version $Id$ */ public abstract class BufferedTokenStream extends TokenStream { - // in the futute, might be faster if we implemented as an array based CircularQueue + // in the future, might be faster if we implemented as an array based CircularQueue private final LinkedList inQueue = new LinkedList(); private final LinkedList outQueue = new LinkedList(); private final TokenStream input; diff --git a/src/java/org/apache/solr/analysis/EnglishPorterFilterFactory.java b/src/java/org/apache/solr/analysis/EnglishPorterFilterFactory.java index 5dcd998deca..c61b56c289a 100644 --- a/src/java/org/apache/solr/analysis/EnglishPorterFilterFactory.java +++ b/src/java/org/apache/solr/analysis/EnglishPorterFilterFactory.java @@ -17,97 +17,92 @@ package org.apache.solr.analysis; +import org.apache.lucene.analysis.CharArraySet; +import org.apache.lucene.analysis.Token; +import org.apache.lucene.analysis.TokenFilter; +import org.apache.lucene.analysis.TokenStream; import org.apache.solr.common.ResourceLoader; import org.apache.solr.util.plugin.ResourceLoaderAware; -import org.apache.lucene.analysis.StopFilter; -import org.apache.lucene.analysis.TokenStream; -import org.apache.lucene.analysis.TokenFilter; -import org.apache.lucene.analysis.Token; -import java.util.List; -import java.util.Set; import java.io.IOException; +import java.util.List; /** * @version $Id$ */ public class EnglishPorterFilterFactory extends BaseTokenFilterFactory implements ResourceLoaderAware { - + public static final String PROTECTED_TOKENS = "protected"; + public void inform(ResourceLoader loader) { - String wordFile = args.get("protected"); + String wordFile = args.get(PROTECTED_TOKENS); if (wordFile != null) { try { List wlist = loader.getLines(wordFile); - protectedWords = StopFilter.makeStopSet((String[])wlist.toArray(new String[0])); + //This cast is safe in Lucene + protectedWords = new CharArraySet(wlist, false);//No need to go through StopFilter as before, since it just uses a List internally } catch (IOException e) { throw new RuntimeException(e); } } } - private Set protectedWords = null; + private CharArraySet protectedWords = null; public EnglishPorterFilter create(TokenStream input) { - return new EnglishPorterFilter(input,protectedWords); + return new EnglishPorterFilter(input, protectedWords); } } -/** English Porter2 filter that doesn't use reflection to -/* adapt lucene to the snowball stemmer code. +/** + * English Porter2 filter that doesn't use reflection to + * adapt lucene to the snowball stemmer code. 
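 *
 * (Sketch, not part of this patch: given the List of protected words, the
 * factory above now does, in effect,
 *   CharArraySet protect = new CharArraySet(wlist, false);
 *   boolean isProtected = protect.contains(token.termBuffer(), 0, token.termLength());
 * so membership is tested against the char[] term buffer directly and no
 * String is allocated per token.)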
*/ class EnglishPorterFilter extends TokenFilter { - private final Set protWords; + private final CharArraySet protWords; private net.sf.snowball.ext.EnglishStemmer stemmer; - public EnglishPorterFilter(TokenStream source, Set protWords) { + public EnglishPorterFilter(TokenStream source, CharArraySet protWords) { super(source); - this.protWords=protWords; + this.protWords = protWords; stemmer = new net.sf.snowball.ext.EnglishStemmer(); } - /** the original code from lucene sandbox - public final Token next() throws IOException { - Token token = input.next(); - if (token == null) - return null; - stemmer.setCurrent(token.termText()); - try { - stemMethod.invoke(stemmer, EMPTY_ARGS); - } catch (Exception e) { - throw new RuntimeException(e.toString()); - } - return new Token(stemmer.getCurrent(), - token.startOffset(), token.endOffset(), token.type()); - } - **/ + /** + * the original code from lucene sandbox + * public final Token next() throws IOException { + * Token token = input.next(); + * if (token == null) + * return null; + * stemmer.setCurrent(token.termText()); + * try { + * stemMethod.invoke(stemmer, EMPTY_ARGS); + * } catch (Exception e) { + * throw new RuntimeException(e.toString()); + * } + * return new Token(stemmer.getCurrent(), + * token.startOffset(), token.endOffset(), token.type()); + * } + */ @Override - public Token next() throws IOException { - Token tok = input.next(); - if (tok==null) return null; - String tokstr = tok.termText(); - - // if protected, don't stem. use this to avoid stemming collisions. - if (protWords != null && protWords.contains(tokstr)) { - return tok; + public Token next(Token token) throws IOException { + Token result = input.next(token); + if (result != null) { + char[] termBuffer = result.termBuffer(); + int len = result.termLength(); + // if protected, don't stem. use this to avoid stemming collisions. + if (protWords != null && protWords.contains(termBuffer, 0, len)) { + return result; + } + stemmer.setCurrent(new String(termBuffer, 0, len));//ugh, wish the Stemmer took a char array + stemmer.stem(); + String newstr = stemmer.getCurrent(); + result.setTermBuffer(newstr.toCharArray(), 0, newstr.length()); } - - stemmer.setCurrent(tokstr); - stemmer.stem(); - String newstr = stemmer.getCurrent(); - if (tokstr.equals(newstr)) { - return tok; - } else { - // TODO: it would be nice if I could just set termText directly like - // lucene packages can. - Token newtok = new Token(newstr, tok.startOffset(), tok.endOffset(), tok.type()); - newtok.setPositionIncrement(tok.getPositionIncrement()); - return newtok; - } - + return result; } } diff --git a/src/java/org/apache/solr/analysis/HyphenatedWordsFilter.java b/src/java/org/apache/solr/analysis/HyphenatedWordsFilter.java index 9d620423fc9..c681621244a 100755 --- a/src/java/org/apache/solr/analysis/HyphenatedWordsFilter.java +++ b/src/java/org/apache/solr/analysis/HyphenatedWordsFilter.java @@ -28,25 +28,26 @@ import org.apache.lucene.analysis.*; * This filter should be used on indexing time only. * Example field definition in schema.xml: *
- * <fieldtype name="text" class="solr.TextField" positionIncrementGap="100">
- * 	<analyzer type="index">
- * 		<tokenizer class="solr.WhitespaceTokenizerFactory"/>
- *      <filter class="solr.SynonymFilterFactory" synonyms="index_synonyms.txt" ignoreCase="true" expand="false"/>
- *      <filter class="solr.StopFilterFactory" ignoreCase="true"/>
- *      <filter class="solr.HyphenatedWordsFilterFactory"/>
- *      <filter class="solr.WordDelimiterFilterFactory" generateWordParts="1" generateNumberParts="1" catenateWords="1" catenateNumbers="1" catenateAll="0"/>
- *      <filter class="solr.LowerCaseFilterFactory"/>
- *      <filter class="solr.RemoveDuplicatesTokenFilterFactory"/>
- *  </analyzer>
- *  <analyzer type="query">
- *      <tokenizer class="solr.WhitespaceTokenizerFactory"/>
- *      <filter class="solr.SynonymFilterFactory" synonyms="synonyms.txt" ignoreCase="true" expand="true"/>
- *      <filter class="solr.StopFilterFactory" ignoreCase="true"/>
- *      <filter class="solr.WordDelimiterFilterFactory" generateWordParts="1" generateNumberParts="1" catenateWords="0" catenateNumbers="0" catenateAll="0"/>
- *      <filter class="solr.LowerCaseFilterFactory"/>
- *      <filter class="solr.RemoveDuplicatesTokenFilterFactory"/>
- *  </analyzer>
- * </fieldtype>
+ * <fieldtype name="text" class="solr.TextField" positionIncrementGap="100">
+ * 	<analyzer type="index">
+ * 		<tokenizer class="solr.WhitespaceTokenizerFactory"/>
+ *      <filter class="solr.SynonymFilterFactory" synonyms="index_synonyms.txt" ignoreCase="true" expand="false"/>
+ *      <filter class="solr.StopFilterFactory" ignoreCase="true"/>
+ *      <filter class="solr.HyphenatedWordsFilterFactory"/>
+ *      <filter class="solr.WordDelimiterFilterFactory" generateWordParts="1" generateNumberParts="1" catenateWords="1" catenateNumbers="1" catenateAll="0"/>
+ *      <filter class="solr.LowerCaseFilterFactory"/>
+ *      <filter class="solr.RemoveDuplicatesTokenFilterFactory"/>
+ *  </analyzer>
+ *  <analyzer type="query">
+ *      <tokenizer class="solr.WhitespaceTokenizerFactory"/>
+ *      <filter class="solr.SynonymFilterFactory" synonyms="synonyms.txt" ignoreCase="true" expand="true"/>
+ *      <filter class="solr.StopFilterFactory" ignoreCase="true"/>
+ *      <filter class="solr.WordDelimiterFilterFactory" generateWordParts="1" generateNumberParts="1" catenateWords="0" catenateNumbers="0" catenateAll="0"/>
+ *      <filter class="solr.LowerCaseFilterFactory"/>
+ *      <filter class="solr.RemoveDuplicatesTokenFilterFactory"/>
+ *  </analyzer>
+ * </fieldtype>
+ * 
* */ public final class HyphenatedWordsFilter extends TokenFilter { @@ -55,16 +56,18 @@ public final class HyphenatedWordsFilter extends TokenFilter { super(in); } - /** + + + /** * @inheritDoc * @see org.apache.lucene.analysis.TokenStream#next() */ - public final Token next() throws IOException { - StringBuffer termText = new StringBuffer(25); + public final Token next(Token in) throws IOException { + StringBuilder termText = new StringBuilder(25); int startOffset = -1, firstPositionIncrement = -1, wordsMerged = 0; Token lastToken = null; - for (Token token = input.next(); token != null; token = input.next()) { - termText.append(token.termText()); + for (Token token = input.next(in); token != null; token = input.next()) { + termText.append(token.termBuffer(), 0, token.termLength()); //current token ends with hyphen -> grab the next token and glue them together if (termText.charAt(termText.length() - 1) == '-') { wordsMerged++; diff --git a/src/java/org/apache/solr/analysis/KeepWordFilter.java b/src/java/org/apache/solr/analysis/KeepWordFilter.java index f731a9b2477..5ff2881c92d 100644 --- a/src/java/org/apache/solr/analysis/KeepWordFilter.java +++ b/src/java/org/apache/solr/analysis/KeepWordFilter.java @@ -20,6 +20,7 @@ package org.apache.solr.analysis; import org.apache.lucene.analysis.TokenFilter; import org.apache.lucene.analysis.TokenStream; import org.apache.lucene.analysis.Token; +import org.apache.lucene.analysis.CharArraySet; import java.io.IOException; import java.util.Set; @@ -32,23 +33,18 @@ import java.util.Set; * @since solr 1.3 */ public final class KeepWordFilter extends TokenFilter { - final Set words; - final boolean ignoreCase; + final CharArraySet words; + public KeepWordFilter(TokenStream in, Set words, boolean ignoreCase ) { super(in); - this.words=words; - this.ignoreCase=ignoreCase; + this.words = new CharArraySet(words, ignoreCase); } @Override - public final Token next() throws IOException { - for (Token token=input.next(); token!=null; token=input.next()) { - String txt = ignoreCase - ? 
token.termText().toLowerCase() - : token.termText(); - - if( words.contains( txt ) ) { + public final Token next(Token in) throws IOException { + for (Token token=input.next(in); token!=null; token=input.next()) { + if( words.contains( token.termBuffer(), 0, token.termLength() ) ) { return token; } } diff --git a/src/java/org/apache/solr/analysis/LengthFilter.java b/src/java/org/apache/solr/analysis/LengthFilter.java index 1fb511b0746..1a78352d035 100644 --- a/src/java/org/apache/solr/analysis/LengthFilter.java +++ b/src/java/org/apache/solr/analysis/LengthFilter.java @@ -36,8 +36,8 @@ public final class LengthFilter extends TokenFilter { //System.out.println("min="+min+" max="+max); } - public final Token next() throws IOException { - for (Token token=input.next(); token!=null; token=input.next()) { + public final Token next(Token in) throws IOException { + for (Token token=input.next(in); token!=null; token=input.next(in)) { final int len = token.endOffset() - token.startOffset(); if (lenmax) continue; return token; diff --git a/src/java/org/apache/solr/analysis/LengthFilterFactory.java b/src/java/org/apache/solr/analysis/LengthFilterFactory.java index ac2ee1785f6..25ef94fb7f0 100644 --- a/src/java/org/apache/solr/analysis/LengthFilterFactory.java +++ b/src/java/org/apache/solr/analysis/LengthFilterFactory.java @@ -27,12 +27,14 @@ import java.util.Map; */ public class LengthFilterFactory extends BaseTokenFilterFactory { int min,max; - + public static final String MIN_KEY = "min"; + public static final String MAX_KEY = "max"; + @Override public void init(Map args) { super.init(args); - min=Integer.parseInt(args.get("min")); - max=Integer.parseInt(args.get("max")); + min=Integer.parseInt(args.get(MIN_KEY)); + max=Integer.parseInt(args.get(MAX_KEY)); } public LengthFilter create(TokenStream input) { return new LengthFilter(input,min,max); diff --git a/src/java/org/apache/solr/analysis/PatternReplaceFilter.java b/src/java/org/apache/solr/analysis/PatternReplaceFilter.java index b9477148303..3598018c616 100644 --- a/src/java/org/apache/solr/analysis/PatternReplaceFilter.java +++ b/src/java/org/apache/solr/analysis/PatternReplaceFilter.java @@ -24,6 +24,7 @@ import org.apache.lucene.analysis.Token; import java.util.regex.Pattern; import java.util.regex.Matcher; import java.io.IOException; +import java.nio.CharBuffer; /** * A TokenFilter which applies a Pattern to each token in the stream, @@ -64,12 +65,12 @@ public final class PatternReplaceFilter extends TokenFilter { this.all=all; } - public final Token next() throws IOException { - Token t = input.next(); + public final Token next(Token in) throws IOException { + Token t = input.next(in); if (t == null) return null; - - Matcher m = p.matcher(t.termText()); + CharSequence text = CharBuffer.wrap(t.termBuffer(), 0, t.termLength()); + Matcher m = p.matcher(text); if (all) { t.setTermText(m.replaceAll(replacement)); } else { diff --git a/src/java/org/apache/solr/analysis/PhoneticFilter.java b/src/java/org/apache/solr/analysis/PhoneticFilter.java index 50c38ae5f21..f0d68819ece 100644 --- a/src/java/org/apache/solr/analysis/PhoneticFilter.java +++ b/src/java/org/apache/solr/analysis/PhoneticFilter.java @@ -46,29 +46,27 @@ public class PhoneticFilter extends TokenFilter } @Override - public final Token next() throws IOException { + public final Token next(Token in) throws IOException { if( save != null ) { Token temp = save; save = null; return temp; } - Token t = input.next(); + Token t = input.next(in); if( t != null ) { - String value = 
t.termText(); + String value = new String(t.termBuffer(), 0, t.termLength()); try { - value = encoder.encode(t.termText()).toString(); + value = encoder.encode(value).toString(); } catch (Exception ignored) {} // just use the direct text - - Token m = new Token(value, t.startOffset(), t.endOffset(), name ); + //Token m = new Token(value, t.startOffset(), t.endOffset(), name ); if( inject ) { - m.setPositionIncrement(0); - save = m; - } - else { - // replace the token rather then add it too the stream - return m; + save = (Token) t.clone(); + save.setPositionIncrement(0); + save.setTermBuffer(value.toCharArray(), 0, value.length()); + } else { + t.setTermBuffer(value.toCharArray(), 0, value.length()); } } return t; diff --git a/src/java/org/apache/solr/analysis/RemoveDuplicatesTokenFilter.java b/src/java/org/apache/solr/analysis/RemoveDuplicatesTokenFilter.java index 26a8e64f657..ccfa54ef0c9 100644 --- a/src/java/org/apache/solr/analysis/RemoveDuplicatesTokenFilter.java +++ b/src/java/org/apache/solr/analysis/RemoveDuplicatesTokenFilter.java @@ -19,6 +19,7 @@ package org.apache.solr.analysis; import org.apache.lucene.analysis.Token; import org.apache.lucene.analysis.TokenStream; +import org.apache.solr.util.ArraysUtils; import java.io.IOException; @@ -30,23 +31,27 @@ public class RemoveDuplicatesTokenFilter extends BufferedTokenStream { public RemoveDuplicatesTokenFilter(TokenStream input) {super(input);} protected Token process(Token t) throws IOException { Token tok = read(); - OUT: while (tok != null && tok.getPositionIncrement()==0) { + while (tok != null && tok.getPositionIncrement()==0) { if (null != t) { write(t); t = null; } boolean dup=false; - IN: for (Token outTok : output()) { - if (outTok.termText().equals(tok.termText())) { + for (Token outTok : output()) { + int tokLen = tok.termLength(); + if (outTok.termLength() == tokLen && ArraysUtils.equals(outTok.termBuffer(), 0, tok.termBuffer(), 0, tokLen)) { dup=true; - break IN; + //continue;; } } - if (!dup) + if (!dup){ write(tok); + } tok = read(); } - if (tok != null) pushBack(tok); + if (tok != null) { + pushBack(tok); + } return t; } } diff --git a/src/java/org/apache/solr/analysis/TrimFilter.java b/src/java/org/apache/solr/analysis/TrimFilter.java index a4ff190b0f0..4378d42c26b 100644 --- a/src/java/org/apache/solr/analysis/TrimFilter.java +++ b/src/java/org/apache/solr/analysis/TrimFilter.java @@ -17,9 +17,9 @@ package org.apache.solr.analysis; +import org.apache.lucene.analysis.Token; import org.apache.lucene.analysis.TokenFilter; import org.apache.lucene.analysis.TokenStream; -import org.apache.lucene.analysis.Token; import java.io.IOException; @@ -29,50 +29,54 @@ import java.io.IOException; * @version $Id:$ */ public final class TrimFilter extends TokenFilter { - + final boolean updateOffsets; - public TrimFilter(TokenStream in, boolean updateOffsets ) { + public TrimFilter(TokenStream in, boolean updateOffsets) { super(in); this.updateOffsets = updateOffsets; } @Override - public final Token next() throws IOException { - Token t = input.next(); - if (null == t || null == t.termText()) + public final Token next(Token in) throws IOException { + Token t = input.next(in); + if (null == t || null == t.termBuffer() || t.termLength() == 0){ return t; + } + char[] termBuffer = t.termBuffer(); + int len = t.termLength(); + int start = 0; + int end = 0; + int endOff = 0; - if( updateOffsets ) { - String txt = t.termText(); - int start = 0; - int end = txt.length(); - int endOff = 0; - - // eat the first characters - while ((start 
< end) && (txt.charAt(start) <= ' ')) { - start++; - } - - // eat the end characters - while ((start < end) && (txt.charAt(end-1) <= ' ')) { - end--; - endOff++; - } - - if( start > 0 || end < txt.length() ) { - int incr = t.getPositionIncrement(); - t = new Token( t.termText().substring( start, end ), - t.startOffset()+start, - t.endOffset()-endOff, - t.type() ); - - t.setPositionIncrement( incr ); //+ start ); TODO? what should happen with the offset - } + // eat the first characters + //QUESTION: Should we use Character.isWhitespace() instead? + for (start = 0; start < len && termBuffer[start] <= ' '; start++) { } - else { - t.setTermText( t.termText().trim() ); + // eat the end characters + for (end = len; end >= start && termBuffer[end - 1] <= ' '; end--) { + endOff++; } + if (start > 0 || end < len) { + if (start < end) { + t.setTermBuffer(t.termBuffer(), start, (end - start)); + } else { + t.setTermLength(0); + } + if (updateOffsets) { + t.setStartOffset(t.startOffset() + start); + if (start < end) { + t.setEndOffset(t.endOffset() - endOff); + } //else if end is less than, start, then the term length is 0, so, no need to bother w/ the end offset + } + /*t = new Token( t.termText().substring( start, end ), + t.startOffset()+start, + t.endOffset()-endOff, + t.type() );*/ + + + } + return t; } } diff --git a/src/java/org/apache/solr/analysis/WordDelimiterFilter.java b/src/java/org/apache/solr/analysis/WordDelimiterFilter.java index 810aa6393af..af33531788a 100644 --- a/src/java/org/apache/solr/analysis/WordDelimiterFilter.java +++ b/src/java/org/apache/solr/analysis/WordDelimiterFilter.java @@ -192,7 +192,7 @@ final class WordDelimiterFilter extends TokenFilter { // use the type of the first char as the type // of the token. private int tokType(Token t) { - return charType(t.termText().charAt(0)); + return charType(t.termBuffer()[0]); } // There isn't really an efficient queue class, so we will @@ -207,23 +207,22 @@ final class WordDelimiterFilter extends TokenFilter { private Token newTok(Token orig, int start, int end) { int startOff = orig.startOffset(); int endOff = orig.endOffset(); - String origStr = orig.termText(); - // if length by start + end offsets doesn't match the term text then assume // this is a synonym and don't adjust the offsets. - if (origStr.length() == endOff-startOff) { + if (orig.termLength() == endOff-startOff) { endOff = startOff + end; startOff += start; } - return new Token(orig.termText().substring(start,end), - startOff, + Token newTok = new Token(startOff, endOff, orig.type()); + newTok.setTermBuffer(orig.termBuffer(), start, (end - start)); + return newTok; } - public final Token next() throws IOException { + public final Token next(Token in) throws IOException { // check the queue first if (queuePos=end) { + if (pos+2>= len) { // end of string detected after "'s" pos+=2; } else { // make sure that a delimiter follows "'s" - int ch2 = s.charAt(pos+2); + int ch2 = termBuffer[pos+2]; int type2 = charType(ch2); if ((type2 & SUBWORD_DELIM)!=0) { // if delimiter, move position pointer @@ -340,7 +339,7 @@ final class WordDelimiterFilter extends TokenFilter { } } - if (++pos >= end) { + if (++pos >= len) { if (start==0) { // the subword is the whole original token, so // return it unchanged. 
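All of the next() rewrites in this patch follow the same Lucene 2.3 reusable-Token contract: next(Token) passes a scratch Token down the chain and mutates its char[] term buffer in place, instead of allocating a new Token and String per term. A minimal self-contained sketch of that contract, assuming the same Lucene 2.3 API used above (the UpperCaseExampleFilter class is invented for illustration, not part of this patch):

    import java.io.IOException;

    import org.apache.lucene.analysis.Token;
    import org.apache.lucene.analysis.TokenFilter;
    import org.apache.lucene.analysis.TokenStream;

    // Illustration only: upper-cases each token in place, reusing the Token
    // instance handed in by the consumer rather than allocating a new one.
    final class UpperCaseExampleFilter extends TokenFilter {
      UpperCaseExampleFilter(TokenStream in) {
        super(in);
      }

      public Token next(Token in) throws IOException {
        Token t = input.next(in);   // upstream may fill and return our scratch Token
        if (t == null || t.termBuffer() == null) {
          return t;                 // end of stream, or a legacy String-only token
        }
        char[] buf = t.termBuffer();
        int len = t.termLength();
        for (int i = 0; i < len; i++) {
          buf[i] = Character.toUpperCase(buf[i]);  // edit the chars in place
        }
        return t;                   // no new Token or String allocated
      }
    }

Filters that buffer or reinject tokens, as PhoneticFilter does above, must clone() the reused Token before saving it, since the producer is free to overwrite it on the next call.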
@@ -362,7 +361,7 @@ final class WordDelimiterFilter extends TokenFilter { } lastType = type; - ch = s.charAt(pos); + ch = termBuffer[pos]; type = charType(ch); } @@ -482,7 +481,7 @@ final class WordDelimiterFilter extends TokenFilter { tok = lst.get(i); if (catenateSubwords) { if (i==start) firstTok=tok; - sb.append(tok.termText()); + sb.append(tok.termBuffer(), 0, tok.termLength()); } if (generateSubwords) { queue.add(tok); diff --git a/src/java/org/apache/solr/util/ArraysUtils.java b/src/java/org/apache/solr/util/ArraysUtils.java new file mode 100644 index 00000000000..9be432bbcda --- /dev/null +++ b/src/java/org/apache/solr/util/ArraysUtils.java @@ -0,0 +1,35 @@ +package org.apache.solr.util; + + +/** + * + * + **/ +//Since Arrays.equals doesn't implement offsets for equals +public class ArraysUtils { + + /** + * See if two array slices are the same. + * + * @param left The left array to compare + * @param offsetLeft The offset into the array. Must be positive + * @param right The right array to compare + * @param offsetRight the offset into the right array. Must be positive + * @param length The length of the section of the array to compare + * @return true if the two arrays, starting at their respective offsets, are equal + * + * @see java.util.Arrays#equals(char[], char[]) + */ + public static boolean equals(char[] left, int offsetLeft, char[] right, int offsetRight, int length) { + if ((offsetLeft + length <= left.length) && (offsetRight + length <= right.length)) { + for (int i = 0; i < length; i++) { + if (left[offsetLeft + i] != right[offsetRight + i]) { + return false; + } + + } + return true; + } + return false; + } +} diff --git a/src/test/org/apache/solr/analysis/EnglishPorterFilterFactoryTest.java b/src/test/org/apache/solr/analysis/EnglishPorterFilterFactoryTest.java new file mode 100644 index 00000000000..da6407596d2 --- /dev/null +++ b/src/test/org/apache/solr/analysis/EnglishPorterFilterFactoryTest.java @@ -0,0 +1,96 @@ +package org.apache.solr.analysis; + +/** + * Copyright 2004 The Apache Software Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +import net.sf.snowball.ext.EnglishStemmer; +import org.apache.solr.common.ResourceLoader; + +import java.io.IOException; +import java.io.InputStream; +import java.util.HashMap; +import java.util.List; +import java.util.Map; +import java.util.ArrayList; +import java.util.Collections; + +public class EnglishPorterFilterFactoryTest extends BaseTokenTestCase { + + public void test() throws IOException { + EnglishStemmer stemmer = new net.sf.snowball.ext.EnglishStemmer(); + String[] test = {"The", "fledgling", "banks", "were", "counting", "on", "a", "big", "boom", "in", "banking"}; + StringBuilder gold = new StringBuilder(); + for (int i = 0; i < test.length; i++) { + stemmer.setCurrent(test[i]); + stemmer.stem(); + gold.append(stemmer.getCurrent()).append(' '); + } + + EnglishPorterFilterFactory factory = new EnglishPorterFilterFactory(); + Map args = new HashMap(); + + factory.init(args); + factory.inform(new LinesMockSolrResourceLoader(new ArrayList())); + String out = tsToString(factory.create(new IterTokenStream(test))); + assertEquals(gold.toString().trim(), out); + } + + public void testProtected() throws Exception { + EnglishStemmer stemmer = new net.sf.snowball.ext.EnglishStemmer(); + String[] test = {"The", "fledgling", "banks", "were", "counting", "on", "a", "big", "boom", "in", "banking"}; + StringBuilder gold = new StringBuilder(); + for (int i = 0; i < test.length; i++) { + if (test[i].equals("fledgling") == false && test[i].equals("banks") == false) { + stemmer.setCurrent(test[i]); + stemmer.stem(); + gold.append(stemmer.getCurrent()).append(' '); + } else { + gold.append(test[i]).append(' '); + } + } + + EnglishPorterFilterFactory factory = new EnglishPorterFilterFactory(); + Map args = new HashMap(); + args.put(EnglishPorterFilterFactory.PROTECTED_TOKENS, "who-cares.txt"); + factory.init(args); + List lines = new ArrayList(); + Collections.addAll(lines, "banks", "fledgling"); + factory.inform(new LinesMockSolrResourceLoader(lines)); + String out = tsToString(factory.create(new IterTokenStream(test))); + assertEquals(gold.toString().trim(), out); + } + + class LinesMockSolrResourceLoader implements ResourceLoader { + List lines; + + LinesMockSolrResourceLoader(List lines) { + this.lines = lines; + } + + public List getLines(String resource) throws IOException { + return lines; + } + + public Object newInstance(String cname, String... subpackages) { + return null; + } + + public InputStream openResource(String resource) throws IOException { + return null; + } + } +} + diff --git a/src/test/org/apache/solr/analysis/LengthFilterTest.java b/src/test/org/apache/solr/analysis/LengthFilterTest.java new file mode 100644 index 00000000000..96c729360f4 --- /dev/null +++ b/src/test/org/apache/solr/analysis/LengthFilterTest.java @@ -0,0 +1,36 @@ +package org.apache.solr.analysis; + +/** + * Copyright 2004 The Apache Software Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +import java.io.IOException; +import java.util.HashMap; +import java.util.Map; + +public class LengthFilterTest extends BaseTokenTestCase { + + public void test() throws IOException { + LengthFilterFactory factory = new LengthFilterFactory(); + Map args = new HashMap(); + args.put(LengthFilterFactory.MIN_KEY, String.valueOf(4)); + args.put(LengthFilterFactory.MAX_KEY, String.valueOf(10)); + factory.init(args); + String[] test = {"foo", "foobar", "super-duper-trooper"}; + String gold = "foobar"; + String out = tsToString(factory.create(new IterTokenStream(test))); + assertEquals(gold.toString(), out); + } +} \ No newline at end of file diff --git a/src/test/org/apache/solr/analysis/TestHyphenatedWordsFilter.java b/src/test/org/apache/solr/analysis/TestHyphenatedWordsFilter.java index 2ec70003eeb..13e9c353745 100755 --- a/src/test/org/apache/solr/analysis/TestHyphenatedWordsFilter.java +++ b/src/test/org/apache/solr/analysis/TestHyphenatedWordsFilter.java @@ -27,8 +27,8 @@ import org.apache.lucene.analysis.WhitespaceTokenizer; */ public class TestHyphenatedWordsFilter extends BaseTokenTestCase { public void testHyphenatedWords() throws Exception { - String input = "ecologi-\r\ncal devel-\r\n\r\nop compre-\u0009hensive-hands-on"; - String outputAfterHyphenatedWordsFilter = "ecological develop comprehensive-hands-on"; + String input = "ecologi-\r\ncal devel-\r\n\r\nop compre-\u0009hensive-hands-on and ecologi-\ncal"; + String outputAfterHyphenatedWordsFilter = "ecological develop comprehensive-hands-on and ecological"; // first test TokenStream ts = new WhitespaceTokenizer(new StringReader(input)); ts = new HyphenatedWordsFilter(ts); diff --git a/src/test/org/apache/solr/analysis/TestPatternReplaceFilter.java b/src/test/org/apache/solr/analysis/TestPatternReplaceFilter.java index 1d91837b3b5..c9147f0bac6 100644 --- a/src/test/org/apache/solr/analysis/TestPatternReplaceFilter.java +++ b/src/test/org/apache/solr/analysis/TestPatternReplaceFilter.java @@ -17,76 +17,96 @@ package org.apache.solr.analysis; -import java.io.StringReader; -import java.util.regex.Pattern; -import junit.framework.TestCase; - +import org.apache.lucene.analysis.Token; import org.apache.lucene.analysis.TokenStream; import org.apache.lucene.analysis.WhitespaceTokenizer; +import java.io.StringReader; +import java.util.regex.Pattern; + /** * @version $Id:$ */ public class TestPatternReplaceFilter extends AnalysisTestCase { - + public void testReplaceAll() throws Exception { String input = "aabfooaabfooabfoob ab caaaaaaaaab"; TokenStream ts = new PatternReplaceFilter - (new WhitespaceTokenizer(new StringReader(input)), - Pattern.compile("a*b"), - "-", true); - assertEquals("-foo-foo-foo-", ts.next().termText()); - assertEquals("-", ts.next().termText()); - assertEquals("c-", ts.next().termText()); - assertNull(ts.next()); + (new WhitespaceTokenizer(new StringReader(input)), + Pattern.compile("a*b"), + "-", true); + Token token = ts.next(); + assertEquals("-foo-foo-foo-", new String(token.termBuffer(), 0, token.termLength())); + token = ts.next(); + assertEquals("-", new String(token.termBuffer(), 0, token.termLength())); + token = ts.next(); + assertEquals("c-", new String(token.termBuffer(), 0, token.termLength())); + token = ts.next(); + assertNull(token); } - + public void testReplaceFirst() throws Exception { String input = "aabfooaabfooabfoob ab caaaaaaaaab"; TokenStream ts = new PatternReplaceFilter - (new WhitespaceTokenizer(new StringReader(input)), - Pattern.compile("a*b"), - "-", false); - 
assertEquals("-fooaabfooabfoob", ts.next().termText()); - assertEquals("-", ts.next().termText()); - assertEquals("c-", ts.next().termText()); - assertNull(ts.next()); + (new WhitespaceTokenizer(new StringReader(input)), + Pattern.compile("a*b"), + "-", false); + Token token = ts.next(); + assertEquals("-fooaabfooabfoob", new String(token.termBuffer(), 0, token.termLength())); + token = ts.next(); + assertEquals("-", new String(token.termBuffer(), 0, token.termLength())); + token = ts.next(); + assertEquals("c-", new String(token.termBuffer(), 0, token.termLength())); + token = ts.next(); + assertNull(token); } - + public void testStripFirst() throws Exception { String input = "aabfooaabfooabfoob ab caaaaaaaaab"; TokenStream ts = new PatternReplaceFilter - (new WhitespaceTokenizer(new StringReader(input)), - Pattern.compile("a*b"), - null, false); - assertEquals("fooaabfooabfoob", ts.next().termText()); - assertEquals("", ts.next().termText()); - assertEquals("c", ts.next().termText()); - assertNull(ts.next()); + (new WhitespaceTokenizer(new StringReader(input)), + Pattern.compile("a*b"), + null, false); + Token token = ts.next(); + assertEquals("fooaabfooabfoob", new String(token.termBuffer(), 0, token.termLength())); + token = ts.next(); + assertEquals("", new String(token.termBuffer(), 0, token.termLength())); + token = ts.next(); + assertEquals("c", new String(token.termBuffer(), 0, token.termLength())); + token = ts.next(); + assertNull(token); } - + public void testStripAll() throws Exception { String input = "aabfooaabfooabfoob ab caaaaaaaaab"; TokenStream ts = new PatternReplaceFilter - (new WhitespaceTokenizer(new StringReader(input)), - Pattern.compile("a*b"), - null, true); - assertEquals("foofoofoo", ts.next().termText()); - assertEquals("", ts.next().termText()); - assertEquals("c", ts.next().termText()); - assertNull(ts.next()); + (new WhitespaceTokenizer(new StringReader(input)), + Pattern.compile("a*b"), + null, true); + Token token = ts.next(); + assertEquals("foofoofoo", new String(token.termBuffer(), 0, token.termLength())); + token = ts.next(); + assertEquals("", new String(token.termBuffer(), 0, token.termLength())); + token = ts.next(); + assertEquals("c", new String(token.termBuffer(), 0, token.termLength())); + token = ts.next(); + assertNull(token); } - + public void testReplaceAllWithBackRef() throws Exception { String input = "aabfooaabfooabfoob ab caaaaaaaaab"; TokenStream ts = new PatternReplaceFilter - (new WhitespaceTokenizer(new StringReader(input)), - Pattern.compile("(a*)b"), - "$1\\$", true); - assertEquals("aa$fooaa$fooa$foo$", ts.next().termText()); - assertEquals("a$", ts.next().termText()); - assertEquals("caaaaaaaaa$", ts.next().termText()); - assertNull(ts.next()); + (new WhitespaceTokenizer(new StringReader(input)), + Pattern.compile("(a*)b"), + "$1\\$", true); + Token token = ts.next(); + assertEquals("aa$fooaa$fooa$foo$", new String(token.termBuffer(), 0, token.termLength())); + token = ts.next(); + assertEquals("a$", new String(token.termBuffer(), 0, token.termLength())); + token = ts.next(); + assertEquals("caaaaaaaaa$", new String(token.termBuffer(), 0, token.termLength())); + token = ts.next(); + assertNull(token); } } diff --git a/src/test/org/apache/solr/analysis/TestPhoneticFilter.java b/src/test/org/apache/solr/analysis/TestPhoneticFilter.java index 55aca6670fd..494d0a95445 100644 --- a/src/test/org/apache/solr/analysis/TestPhoneticFilter.java +++ b/src/test/org/apache/solr/analysis/TestPhoneticFilter.java @@ -81,8 +81,8 @@ public class 
TestPhoneticFilter extends BaseTokenTestCase { new IterTokenStream(stream.iterator()), enc, "text", inject ); for( Token t : output ) { - Token got = filter.next(); - assertEquals( t.termText(), got.termText()); + Token got = filter.next(t); + assertEquals( new String(t.termBuffer(), 0, t.termLength()), new String(got.termBuffer(), 0, got.termLength())); } assertNull( filter.next() ); // no more tokens } diff --git a/src/test/org/apache/solr/analysis/TestTrimFilter.java b/src/test/org/apache/solr/analysis/TestTrimFilter.java index 5f4f31c2e8f..a40bf2dfde3 100644 --- a/src/test/org/apache/solr/analysis/TestTrimFilter.java +++ b/src/test/org/apache/solr/analysis/TestTrimFilter.java @@ -35,11 +35,16 @@ public class TestTrimFilter extends BaseTokenTestCase { new Token("cCc",11,15), new Token(" ",16,20)), false ); - assertEquals("a", ts.next().termText()); - assertEquals("b", ts.next().termText()); - assertEquals("cCc", ts.next().termText()); - assertEquals("", ts.next().termText()); - assertNull(ts.next()); + Token token = ts.next(); + assertEquals("a", new String(token.termBuffer(), 0, token.termLength())); + token = ts.next(); + assertEquals("b", new String(token.termBuffer(), 0, token.termLength())); + token = ts.next(); + assertEquals("cCc", new String(token.termBuffer(), 0, token.termLength())); + token = ts.next(); + assertEquals("", new String(token.termBuffer(), 0, token.termLength())); + token = ts.next(); + assertNull(token); ts = new TrimFilter( new IterTokenStream( new Token(" a", 0,2), diff --git a/src/test/org/apache/solr/util/ArraysUtilsTest.java b/src/test/org/apache/solr/util/ArraysUtilsTest.java new file mode 100644 index 00000000000..59eb420ee1a --- /dev/null +++ b/src/test/org/apache/solr/util/ArraysUtilsTest.java @@ -0,0 +1,48 @@ +package org.apache.solr.util; + +/** + * Copyright 2004 The Apache Software Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import junit.framework.TestCase; + +public class ArraysUtilsTest extends TestCase { + + + public ArraysUtilsTest(String s) { + super(s); + } + + protected void setUp() { + } + + protected void tearDown() { + + } + + public void test() { + String left = "this is equal"; + String right = left; + char[] leftChars = left.toCharArray(); + char[] rightChars = right.toCharArray(); + assertTrue(left + " does not equal: " + right, ArraysUtils.equals(leftChars, 0, rightChars, 0, left.length())); + + assertFalse(left + " does not equal: " + right, ArraysUtils.equals(leftChars, 1, rightChars, 0, left.length())); + assertFalse(left + " does not equal: " + right, ArraysUtils.equals(leftChars, 1, rightChars, 2, left.length())); + + assertFalse(left + " does not equal: " + right, ArraysUtils.equals(leftChars, 25, rightChars, 0, left.length())); + assertFalse(left + " does not equal: " + right, ArraysUtils.equals(leftChars, 12, rightChars, 0, left.length())); + } +} \ No newline at end of file
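For reference, a short usage sketch of the new ArraysUtils.equals slice comparison that RemoveDuplicatesTokenFilter now uses in place of termText().equals(); the demo class and sample words are invented for illustration:

    import org.apache.solr.util.ArraysUtils;

    public class ArraysUtilsDemo {
      public static void main(String[] args) {
        char[] left = "banking".toCharArray();
        char[] right = "overbanking".toCharArray();
        // Compare all of "banking" against the 7-char slice of "overbanking"
        // starting at offset 4 -- both slices spell "banking".
        System.out.println(ArraysUtils.equals(left, 0, right, 4, 7));   // true
        // Out-of-range slices are reported as unequal rather than throwing.
        System.out.println(ArraysUtils.equals(left, 5, right, 0, 7));   // false
      }
    }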