mirror of https://github.com/apache/lucene.git
a hacky commit of the changes needed to get onto Lucene 3.0.1
git-svn-id: https://svn.apache.org/repos/asf/lucene/solr/branches/solr@922957 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent 5b0c6919e0
commit 65a21459a2
@@ -114,7 +114,7 @@
     The version suffix of the Lucene artifacts checked into "lib"
     IF YOU CHANGE THIS, SANITY CHECK "javadoc.link.lucene"
   -->
-  <property name="lucene_version" value="2.9.2"/>
+  <property name="lucene_version" value="3.0.1"/>
   <!-- The version number to assign to the Maven artifacts. -->
   <property name="maven_version" value="1.5-SNAPSHOT"/>
@@ -0,0 +1,2 @@
Eleven binary artifact placeholders (each a two-line file, hence the repeated "@@ -0,0 +1,2 @@" hunks) were added; each notes that the binary object was removed in git history and that Apache SVN contains the full history. The referenced objects are:
AnyObjectId[9117ad96a4d5290e0731e2fc2fb326899a4999fd]
AnyObjectId[876ca004312baada28f235c96ad74c9ee467045a]
AnyObjectId[34b447a890e395c06906a75a7567f6fe8197b147]
AnyObjectId[c156dab2c44abc562f7d061581aeb1aaa1f28a72]
AnyObjectId[f897531d6823d0717f65a06e9f3cc648547c2cfe]
AnyObjectId[f39bb741c2563c55fe9185f1c32615d75be056be]
AnyObjectId[9139afc9ede79205a745d831b24a4316406710d2]
AnyObjectId[9d9508a2199ff767f7853a0663d62896c60f0654]
AnyObjectId[017161b212f274b87e3d8ef0809fbdee0c2099ce]
AnyObjectId[445a216d3341a569cc6f38480fdda9a3c2ee1d10]
AnyObjectId[6f12da563f7f852877998443d9e772579bfcf076]
@@ -366,12 +366,12 @@ public class ConcurrentLRUCache<K,V> {
     // necessary because maxSize is private in base class
     public Object myInsertWithOverflow(Object element) {
       if (size() < myMaxSize) {
-        put(element);
+        add(element);
         return null;
       } else if (size() > 0 && !lessThan(element, heap[1])) {
         Object ret = heap[1];
         heap[1] = element;
-        adjustTop();
+        updateTop();
         return ret;
       } else {
         return element;
|
|
|
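For context, the two renames in the hunk above (put -> add, adjustTop -> updateTop) track the Lucene 3.x org.apache.lucene.util.PriorityQueue API, which also ships an insertWithOverflow() that bundles the same pattern. A minimal, hedged sketch against that API; the subclass name, element type, and max size are invented for illustration and are not part of this patch:

import org.apache.lucene.util.PriorityQueue;

public class PQueueSketch {
  // Illustrative subclass; the real code lives inside ConcurrentLRUCache.
  static final class SmallestAtTop extends PriorityQueue<Long> {
    SmallestAtTop(int maxSize) {
      initialize(maxSize);                 // sizes the internal heap
    }
    @Override
    protected boolean lessThan(Long a, Long b) {
      return a.longValue() < b.longValue();
    }
  }

  public static void main(String[] args) {
    // add() and updateTop() are the 3.x names for put() and adjustTop();
    // insertWithOverflow() wraps the whole keep-the-largest pattern.
    SmallestAtTop q = new SmallestAtTop(3);
    for (long v : new long[] {5, 1, 9, 7}) {
      q.insertWithOverflow(v);             // keeps the 3 largest values
    }
    System.out.println(q.size() + " values kept, smallest is " + q.top());
  }
}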
@ -20,6 +20,13 @@ package org.apache.solr.analysis;
|
|||
import org.apache.lucene.analysis.Token;
|
||||
import org.apache.lucene.analysis.TokenStream;
|
||||
import org.apache.lucene.analysis.TokenFilter;
|
||||
import org.apache.lucene.analysis.tokenattributes.FlagsAttribute;
|
||||
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
|
||||
import org.apache.lucene.analysis.tokenattributes.PayloadAttribute;
|
||||
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
|
||||
import org.apache.lucene.analysis.tokenattributes.TermAttribute;
|
||||
import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
|
||||
import org.apache.lucene.util.AttributeSource; // javadoc @link
|
||||
|
||||
import java.io.IOException;
|
||||
import java.util.LinkedList;
|
||||
|
@ -56,13 +63,23 @@ import java.util.LinkedList;
|
|||
* responsibility of the implementing subclass. In the "A" "B" => "A" "A" "B"
|
||||
* example above, the subclass must clone the additional "A" it creates.
|
||||
*
|
||||
* @version $Id$
|
||||
* @deprecated This class does not support custom attributes. Extend TokenFilter instead,
|
||||
* using {@link AttributeSource#captureState()} and {@link AttributeSource#restoreState()}
|
||||
* which support all attributes.
|
||||
*/
|
||||
@Deprecated
|
||||
public abstract class BufferedTokenStream extends TokenFilter {
|
||||
// in the future, might be faster if we implemented as an array based CircularQueue
|
||||
private final LinkedList<Token> inQueue = new LinkedList<Token>();
|
||||
private final LinkedList<Token> outQueue = new LinkedList<Token>();
|
||||
|
||||
private final TermAttribute termAtt = (TermAttribute) addAttribute(TermAttribute.class);
|
||||
private final OffsetAttribute offsetAtt = (OffsetAttribute) addAttribute(OffsetAttribute.class);
|
||||
private final TypeAttribute typeAtt = (TypeAttribute) addAttribute(TypeAttribute.class);
|
||||
private final FlagsAttribute flagsAtt = (FlagsAttribute) addAttribute(FlagsAttribute.class);
|
||||
private final PayloadAttribute payloadAtt = (PayloadAttribute) addAttribute(PayloadAttribute.class);
|
||||
private final PositionIncrementAttribute posIncAtt = (PositionIncrementAttribute) addAttribute(PositionIncrementAttribute.class);
|
||||
|
||||
public BufferedTokenStream(TokenStream input) {
|
||||
super(input);
|
||||
}
|
||||
|
@ -77,13 +94,13 @@ public abstract class BufferedTokenStream extends TokenFilter {
|
|||
*/
|
||||
protected abstract Token process(Token t) throws IOException;
|
||||
|
||||
public final Token next() throws IOException {
|
||||
public final boolean incrementToken() throws IOException {
|
||||
while (true) {
|
||||
if (!outQueue.isEmpty()) return outQueue.removeFirst();
|
||||
if (!outQueue.isEmpty()) return writeToken(outQueue.removeFirst());
|
||||
Token t = read();
|
||||
if (null == t) return null;
|
||||
if (null == t) return false;
|
||||
Token out = process(t);
|
||||
if (null != out) return out;
|
||||
if (null != out) return writeToken(out);
|
||||
// loop back to top in case process() put something on the output queue
|
||||
}
|
||||
}
|
||||
|
@ -94,7 +111,7 @@ public abstract class BufferedTokenStream extends TokenFilter {
|
|||
*/
|
||||
protected Token read() throws IOException {
|
||||
if (inQueue.isEmpty()) {
|
||||
Token t = input.next();
|
||||
Token t = readToken();
|
||||
return t;
|
||||
}
|
||||
return inQueue.removeFirst();
|
||||
|
@ -120,13 +137,41 @@ public abstract class BufferedTokenStream extends TokenFilter {
|
|||
protected Token peek(int n) throws IOException {
|
||||
int fillCount = n-inQueue.size();
|
||||
for (int i=0; i < fillCount; i++) {
|
||||
Token t = input.next();
|
||||
Token t = readToken();
|
||||
if (null==t) return null;
|
||||
inQueue.addLast(t);
|
||||
}
|
||||
return inQueue.get(n-1);
|
||||
}
|
||||
|
||||
/** old api emulation for back compat */
|
||||
private Token readToken() throws IOException {
|
||||
if (!input.incrementToken()) {
|
||||
return null;
|
||||
} else {
|
||||
Token token = new Token();
|
||||
token.setTermBuffer(termAtt.termBuffer(), 0, termAtt.termLength());
|
||||
token.setOffset(offsetAtt.startOffset(), offsetAtt.endOffset());
|
||||
token.setType(typeAtt.type());
|
||||
token.setFlags(flagsAtt.getFlags());
|
||||
token.setPositionIncrement(posIncAtt.getPositionIncrement());
|
||||
token.setPayload(payloadAtt.getPayload());
|
||||
return token;
|
||||
}
|
||||
}
|
||||
|
||||
/** old api emulation for back compat */
|
||||
private boolean writeToken(Token token) throws IOException {
|
||||
clearAttributes();
|
||||
termAtt.setTermBuffer(token.termBuffer(), 0, token.termLength());
|
||||
offsetAtt.setOffset(token.startOffset(), token.endOffset());
|
||||
typeAtt.setType(token.type());
|
||||
flagsAtt.setFlags(token.getFlags());
|
||||
posIncAtt.setPositionIncrement(token.getPositionIncrement());
|
||||
payloadAtt.setPayload(token.getPayload());
|
||||
return true;
|
||||
}
|
||||
|
||||
/**
|
||||
* Write a token to the buffered output stream
|
||||
*/
|
||||
|
|
|
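The deprecation note above points at extending TokenFilter directly, using AttributeSource#captureState() and #restoreState(), as the replacement for BufferedTokenStream. A hedged sketch of that pattern; the filter name and its "emit every token twice" behaviour are invented purely to show the buffering idiom:

import java.io.IOException;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;

// Hypothetical filter: repeats each token once at the same position,
// buffering whole attribute states instead of Token objects.
public final class RepeatOnceFilter extends TokenFilter {
  private final PositionIncrementAttribute posIncAtt =
      (PositionIncrementAttribute) addAttribute(PositionIncrementAttribute.class);
  private State pending;

  public RepeatOnceFilter(TokenStream input) {
    super(input);
  }

  @Override
  public boolean incrementToken() throws IOException {
    if (pending != null) {
      restoreState(pending);               // replay the buffered token
      pending = null;
      posIncAtt.setPositionIncrement(0);   // stack it on the original
      return true;
    }
    if (!input.incrementToken()) {
      return false;
    }
    pending = captureState();              // buffer a copy for the next call
    return true;
  }

  @Override
  public void reset() throws IOException {
    super.reset();
    pending = null;
  }
}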
@ -14,20 +14,22 @@ import java.util.Arrays;
|
|||
import java.util.Set;
|
||||
|
||||
import org.apache.lucene.analysis.CharArraySet;
|
||||
import org.apache.lucene.analysis.Token;
|
||||
import org.apache.lucene.analysis.TokenFilter;
|
||||
import org.apache.lucene.analysis.TokenStream;
|
||||
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
|
||||
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
|
||||
import org.apache.lucene.analysis.tokenattributes.TermAttribute;
|
||||
import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
|
||||
|
||||
/*
|
||||
* TODO: Rewrite to use new TokenStream api from lucene 2.9 when BufferedTokenStream uses it.
|
||||
* TODO: Consider implementing https://issues.apache.org/jira/browse/LUCENE-1688 changes to stop list and
|
||||
* associated constructors
|
||||
* TODO: Consider implementing https://issues.apache.org/jira/browse/LUCENE-1688 changes to stop list and associated constructors
|
||||
*/
|
||||
|
||||
/**
|
||||
* Construct bigrams for frequently occurring terms while indexing. Single terms
|
||||
* are still indexed too, with bigrams overlaid. This is achieved through the
|
||||
* use of {@link Token#setPositionIncrement(int)}. Bigrams have a type
|
||||
* of "gram" Example
|
||||
* use of {@link PositionIncrementAttribute#setPositionIncrement(int)}. Bigrams have a type
|
||||
* of {@link #GRAM_TYPE} Example:
|
||||
* <ul>
|
||||
* <li>input:"the quick brown fox"</li>
|
||||
* <li>output:|"the","the-quick"|"brown"|"fox"|</li>
|
||||
|
@ -40,14 +42,23 @@ import org.apache.lucene.analysis.TokenStream;
|
|||
/*
|
||||
* Constructors and makeCommonSet based on similar code in StopFilter
|
||||
*/
|
||||
public final class CommonGramsFilter extends TokenFilter {
|
||||
|
||||
public class CommonGramsFilter extends BufferedTokenStream {
|
||||
|
||||
static final String GRAM_TYPE = "gram";
|
||||
private static final char SEPARATOR = '_';
|
||||
|
||||
private final CharArraySet commonWords;
|
||||
|
||||
private StringBuilder buffer = new StringBuilder();
|
||||
private final StringBuilder buffer = new StringBuilder();
|
||||
|
||||
private final TermAttribute termAttribute = (TermAttribute) addAttribute(TermAttribute.class);
|
||||
private final OffsetAttribute offsetAttribute = (OffsetAttribute) addAttribute(OffsetAttribute.class);
|
||||
private final TypeAttribute typeAttribute = (TypeAttribute) addAttribute(TypeAttribute.class);
|
||||
private final PositionIncrementAttribute posIncAttribute = (PositionIncrementAttribute) addAttribute(PositionIncrementAttribute.class);
|
||||
|
||||
private int lastStartOffset;
|
||||
private boolean lastWasCommon;
|
||||
private State savedState;
|
||||
|
||||
/**
|
||||
* Construct a token stream filtering the given input using a Set of common
|
||||
|
@ -57,7 +68,6 @@ public class CommonGramsFilter extends BufferedTokenStream {
|
|||
*
|
||||
* @param input TokenStream input in filter chain
|
||||
* @param commonWords The set of common words.
|
||||
*
|
||||
*/
|
||||
public CommonGramsFilter(TokenStream input, Set commonWords) {
|
||||
this(input, commonWords, false);
|
||||
|
@ -80,8 +90,7 @@ public class CommonGramsFilter extends BufferedTokenStream {
|
|||
* @param commonWords The set of common words.
|
||||
* @param ignoreCase -Ignore case when constructing bigrams for common words.
|
||||
*/
|
||||
public CommonGramsFilter(TokenStream input, Set commonWords,
|
||||
boolean ignoreCase) {
|
||||
public CommonGramsFilter(TokenStream input, Set commonWords, boolean ignoreCase) {
|
||||
super(input);
|
||||
if (commonWords instanceof CharArraySet) {
|
||||
this.commonWords = (CharArraySet) commonWords;
|
||||
|
@ -89,7 +98,6 @@ public class CommonGramsFilter extends BufferedTokenStream {
|
|||
this.commonWords = new CharArraySet(commonWords.size(), ignoreCase);
|
||||
this.commonWords.addAll(commonWords);
|
||||
}
|
||||
init();
|
||||
}
|
||||
|
||||
/**
|
||||
|
@ -101,7 +109,6 @@ public class CommonGramsFilter extends BufferedTokenStream {
|
|||
*/
|
||||
public CommonGramsFilter(TokenStream input, String[] commonWords) {
|
||||
this(input, commonWords, false);
|
||||
init();
|
||||
}
|
||||
|
||||
/**
|
||||
|
@ -112,33 +119,21 @@ public class CommonGramsFilter extends BufferedTokenStream {
|
|||
* @param commonWords words to be used in constructing bigrams
|
||||
* @param ignoreCase -Ignore case when constructing bigrams for common words.
|
||||
*/
|
||||
public CommonGramsFilter(TokenStream input, String[] commonWords,
|
||||
boolean ignoreCase) {
|
||||
public CommonGramsFilter(TokenStream input, String[] commonWords, boolean ignoreCase) {
|
||||
super(input);
|
||||
this.commonWords = (CharArraySet) makeCommonSet(commonWords, ignoreCase);
|
||||
init();
|
||||
}
|
||||
|
||||
// Here for future moving to 2.9 api See StopFilter code
|
||||
|
||||
public void init() {
|
||||
/**
|
||||
* termAtt = (TermAttribute) addAttribute(TermAttribute.class); posIncrAtt
|
||||
* =(PositionIncrementAttribute)
|
||||
* addAttribute(PositionIncrementAttribute.class); typeAdd =(TypeAttribute)
|
||||
* addAttribute(TypeAttribute.class);
|
||||
*/
|
||||
this.commonWords = makeCommonSet(commonWords, ignoreCase);
|
||||
}
|
||||
|
||||
/**
|
||||
* Build a CharArraySet from an array of common words, appropriate for passing
|
||||
* into the CommonGramsFilter constructor. This permits this commonWords
|
||||
* construction to be cached once when an Analyzer is constructed.
|
||||
*
|
||||
* @see #makeCommonSet(java.lang.String[], boolean) passing false to
|
||||
* ignoreCase
|
||||
*
|
||||
* @param commonWords Array of common words which will be converted into the CharArraySet
|
||||
* @return CharArraySet of the given words, appropriate for passing into the CommonGramFilter constructor
|
||||
* @see #makeCommonSet(java.lang.String[], boolean) passing false to ignoreCase
|
||||
*/
|
||||
public static final CharArraySet makeCommonSet(String[] commonWords) {
|
||||
public static CharArraySet makeCommonSet(String[] commonWords) {
|
||||
return makeCommonSet(commonWords, false);
|
||||
}
|
||||
|
||||
|
@ -147,12 +142,11 @@ public class CommonGramsFilter extends BufferedTokenStream {
|
|||
* into the CommonGramsFilter constructor,case-sensitive if ignoreCase is
|
||||
* false.
|
||||
*
|
||||
* @param commonWords
|
||||
* @param commonWords Array of common words which will be converted into the CharArraySet
|
||||
* @param ignoreCase If true, all words are lower cased first.
|
||||
* @return a Set containing the words
|
||||
*/
|
||||
public static final CharArraySet makeCommonSet(String[] commonWords,
|
||||
boolean ignoreCase) {
|
||||
public static CharArraySet makeCommonSet(String[] commonWords, boolean ignoreCase) {
|
||||
CharArraySet commonSet = new CharArraySet(commonWords.length, ignoreCase);
|
||||
commonSet.addAll(Arrays.asList(commonWords));
|
||||
return commonSet;
|
||||
|
@ -163,61 +157,95 @@ public class CommonGramsFilter extends BufferedTokenStream {
|
|||
* output the token. If the token and/or the following token are in the list
|
||||
* of common words also output a bigram with position increment 0 and
|
||||
* type="gram"
|
||||
*/
|
||||
/*
|
||||
* TODO: implement new lucene 2.9 API incrementToken() instead of deprecated
|
||||
* Token.next() TODO:Consider adding an option to not emit unigram stopwords
|
||||
*
|
||||
* TODO:Consider adding an option to not emit unigram stopwords
|
||||
* as in CDL XTF BigramStopFilter, CommonGramsQueryFilter would need to be
|
||||
* changed to work with this. TODO: Consider optimizing for the case of three
|
||||
* changed to work with this.
|
||||
*
|
||||
* TODO: Consider optimizing for the case of three
|
||||
* commongrams i.e "man of the year" normally produces 3 bigrams: "man-of",
|
||||
* "of-the", "the-year" but with proper management of positions we could
|
||||
* eliminate the middle bigram "of-the"and save a disk seek and a whole set of
|
||||
* position lookups.
|
||||
*/
|
||||
public Token process(Token token) throws IOException {
|
||||
Token next = peek(1);
|
||||
// if this is the last token just spit it out. Any commongram would have
|
||||
// been output in the previous call
|
||||
if (next == null) {
|
||||
return token;
|
||||
public boolean incrementToken() throws IOException {
|
||||
// get the next piece of input
|
||||
if (savedState != null) {
|
||||
restoreState(savedState);
|
||||
savedState = null;
|
||||
saveTermBuffer();
|
||||
return true;
|
||||
} else if (!input.incrementToken()) {
|
||||
return false;
|
||||
}
|
||||
|
||||
/**
|
||||
* if this token or next are common then construct a bigram with type="gram"
|
||||
* position increment = 0, and put it in the output queue. It will be
|
||||
* returned when super.next() is called, before this method gets called with
|
||||
* a new token from the input stream See implementation of next() in
|
||||
* BufferedTokenStream
|
||||
|
||||
/* We build n-grams before and after stopwords.
|
||||
* When valid, the buffer always contains at least the separator.
|
||||
* If it's empty, there is nothing before this stopword.
|
||||
*/
|
||||
|
||||
if (isCommon(token) || isCommon(next)) {
|
||||
Token gram = gramToken(token, next);
|
||||
write(gram);
|
||||
if (lastWasCommon || (isCommon() && buffer.length() > 0)) {
|
||||
savedState = captureState();
|
||||
gramToken();
|
||||
return true;
|
||||
}
|
||||
// we always return the unigram token
|
||||
return token;
|
||||
|
||||
saveTermBuffer();
|
||||
return true;
|
||||
}
|
||||
|
||||
/** True if token is for a common term. */
|
||||
private boolean isCommon(Token token) {
|
||||
return commonWords != null
|
||||
&& commonWords.contains(token.termBuffer(), 0, token.termLength());
|
||||
}
|
||||
|
||||
/** Construct a compound token. */
|
||||
private Token gramToken(Token first, Token second) {
|
||||
buffer.setLength(0);
|
||||
buffer.append(first.termText());
|
||||
buffer.append(SEPARATOR);
|
||||
buffer.append(second.termText());
|
||||
Token result = new Token(buffer.toString(), first.startOffset(), second
|
||||
.endOffset(), "gram");
|
||||
result.setPositionIncrement(0);
|
||||
return result;
|
||||
}
|
||||
|
||||
/**
|
||||
* {@inheritDoc}
|
||||
*/
|
||||
@Override
|
||||
public void reset() throws IOException {
|
||||
super.reset();
|
||||
lastWasCommon = false;
|
||||
savedState = null;
|
||||
buffer.setLength(0);
|
||||
}
|
||||
|
||||
// ================================================= Helper Methods ================================================
|
||||
|
||||
/**
|
||||
* Determines if the current token is a common term
|
||||
*
|
||||
* @return {@code true} if the current token is a common term, {@code false} otherwise
|
||||
*/
|
||||
private boolean isCommon() {
|
||||
return commonWords != null && commonWords.contains(termAttribute.termBuffer(), 0, termAttribute.termLength());
|
||||
}
|
||||
|
||||
/**
|
||||
* Saves this information to form the left part of a gram
|
||||
*/
|
||||
private void saveTermBuffer() {
|
||||
buffer.setLength(0);
|
||||
buffer.append(termAttribute.termBuffer(), 0, termAttribute.termLength());
|
||||
buffer.append(SEPARATOR);
|
||||
lastStartOffset = offsetAttribute.startOffset();
|
||||
lastWasCommon = isCommon();
|
||||
}
|
||||
|
||||
/**
|
||||
* Constructs a compound token.
|
||||
*/
|
||||
private void gramToken() {
|
||||
buffer.append(termAttribute.termBuffer(), 0, termAttribute.termLength());
|
||||
int endOffset = offsetAttribute.endOffset();
|
||||
|
||||
clearAttributes();
|
||||
|
||||
int length = buffer.length();
|
||||
char termText[] = termAttribute.termBuffer();
|
||||
if (length > termText.length) {
|
||||
termText = termAttribute.resizeTermBuffer(length);
|
||||
}
|
||||
|
||||
buffer.getChars(0, length, termText, 0);
|
||||
termAttribute.setTermLength(length);
|
||||
posIncAttribute.setPositionIncrement(0);
|
||||
offsetAttribute.setOffset(lastStartOffset, endOffset);
|
||||
typeAttribute.setType(GRAM_TYPE);
|
||||
buffer.setLength(0);
|
||||
}
|
||||
}
|
||||
|
|
|
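A small, hedged usage sketch of the rewritten CommonGramsFilter. The constructor and makeCommonSet() are taken from the code above; the tokenizer choice, common-word list, and sample text are assumptions for the example:

import java.io.StringReader;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.WhitespaceTokenizer;
import org.apache.lucene.analysis.tokenattributes.TermAttribute;
import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
import org.apache.solr.analysis.CommonGramsFilter;

public class CommonGramsSketch {
  public static void main(String[] args) throws Exception {
    TokenStream ts = new CommonGramsFilter(
        new WhitespaceTokenizer(new StringReader("the quick brown fox")),
        CommonGramsFilter.makeCommonSet(new String[] {"the", "of"}));

    TermAttribute termAtt = (TermAttribute) ts.addAttribute(TermAttribute.class);
    TypeAttribute typeAtt = (TypeAttribute) ts.addAttribute(TypeAttribute.class);
    while (ts.incrementToken()) {
      // prints each term with its type; pairs containing a common word come
      // out as extra "gram" tokens stacked at the same position as the unigram
      System.out.println(termAtt.term() + " (" + typeAtt.type() + ")");
    }
    ts.close();
  }
}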
@ -57,7 +57,7 @@ public class CommonGramsFilterFactory extends BaseTokenFilterFactory implements
|
|||
throw new RuntimeException(e);
|
||||
}
|
||||
} else {
|
||||
commonWords = (CharArraySet) CommonGramsFilter.makeCommonSet(StopAnalyzer.ENGLISH_STOP_WORDS, ignoreCase);
|
||||
commonWords = (CharArraySet) StopAnalyzer.ENGLISH_STOP_WORDS_SET;
|
||||
}
|
||||
}
|
||||
|
||||
|
|
|
@ -18,8 +18,11 @@ package org.apache.solr.analysis;
|
|||
|
||||
import java.io.IOException;
|
||||
|
||||
import org.apache.lucene.analysis.CharArraySet;
|
||||
import org.apache.lucene.analysis.Token;
|
||||
import org.apache.lucene.analysis.TokenFilter;
|
||||
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
|
||||
import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
|
||||
|
||||
import static org.apache.solr.analysis.CommonGramsFilter.GRAM_TYPE;
|
||||
|
||||
/**
|
||||
* Wrap a CommonGramsFilter optimizing phrase queries by only returning single
|
||||
|
@ -36,33 +39,36 @@ import org.apache.lucene.analysis.Token;
|
|||
*/
|
||||
|
||||
/*
|
||||
* TODO: When org.apache.solr.analysis.BufferedTokenStream is changed to use the
|
||||
* 2.9 lucene TokenStream api, make necessary changes here.
|
||||
* See:http://hudson.zones
|
||||
* .apache.org/hudson/job/Lucene-trunk/javadoc//all/org/apache
|
||||
* /lucene/analysis/TokenStream.html and
|
||||
* http://svn.apache.org/viewvc/lucene/java
|
||||
* /trunk/src/java/org/apache/lucene/analysis/package.html?revision=718798
|
||||
*/
|
||||
public class CommonGramsQueryFilter extends BufferedTokenStream {
|
||||
//private CharArraySet commonWords;
|
||||
private Token prev;
|
||||
public final class CommonGramsQueryFilter extends TokenFilter {
|
||||
|
||||
private final TypeAttribute typeAttribute = (TypeAttribute) addAttribute(TypeAttribute.class);
|
||||
private final PositionIncrementAttribute posIncAttribute = (PositionIncrementAttribute) addAttribute(PositionIncrementAttribute.class);
|
||||
|
||||
private State previous;
|
||||
private String previousType;
|
||||
|
||||
/**
|
||||
* Constructor
|
||||
*
|
||||
* @param input must be a CommonGramsFilter!
|
||||
* Constructs a new CommonGramsQueryFilter based on the provided CommonGramsFilter
|
||||
*
|
||||
* @param input CommonGramsFilter the QueryFilter will use
|
||||
*/
|
||||
|
||||
public CommonGramsQueryFilter(CommonGramsFilter input) {
|
||||
super(input);
|
||||
prev = new Token();
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* {@inheritDoc}
|
||||
*/
|
||||
public void reset() throws IOException {
|
||||
super.reset();
|
||||
prev = new Token();
|
||||
previous = null;
|
||||
previousType = null;
|
||||
}
|
||||
|
||||
/**
|
||||
|
@ -71,68 +77,47 @@ public class CommonGramsQueryFilter extends BufferedTokenStream {
|
|||
* <ul>
|
||||
* <li>input: "the rain in spain falls mainly"
|
||||
* <li>output:"the-rain", "rain-in" ,"in-spain", "falls", "mainly"
|
||||
* </ul>
|
||||
*/
|
||||
public boolean incrementToken() throws IOException {
|
||||
while (input.incrementToken()) {
|
||||
State current = captureState();
|
||||
|
||||
public Token process(Token token) throws IOException {
|
||||
Token next = peek(1);
|
||||
/*
|
||||
* Deal with last token (next=null when current token is the last word) Last
|
||||
* token will be a unigram. If previous token was a bigram, then we already
|
||||
* output the last token as part of the unigram and should not additionally
|
||||
* output the unigram. <p> Example: If the end of the input to the
|
||||
* CommonGramsFilter is "...the plain" <ul> <li>current token = "plain"</li>
|
||||
* <li>next token = null</li> <li>previous token = "the-plain" (bigram)</li>
|
||||
* <li> Since the word "plain" was already output as part of the bigram we
|
||||
* don't output it.</li> </ul> Example: If the end of the input to the
|
||||
* CommonGramsFilter is "falls mainly" <ul> <li>current token =
|
||||
* "mainly"</li> <li>next token = null</li> <li>previous token = "falls"
|
||||
* (unigram)</li> <li>Since we haven't yet output the current token, we
|
||||
* output it</li> </ul>
|
||||
*/
|
||||
if (previous != null && !isGramType()) {
|
||||
restoreState(previous);
|
||||
previous = current;
|
||||
previousType = typeAttribute.type();
|
||||
|
||||
if (isGramType()) {
|
||||
posIncAttribute.setPositionIncrement(1);
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
// Deal with special case of last token
|
||||
if (next == null) {
|
||||
if (prev == null) {
|
||||
// This is the first and only token i.e. one word query
|
||||
return token;
|
||||
}
|
||||
if (prev != null && prev.type() != "gram") {
|
||||
// If previous token was a unigram, output the current token
|
||||
return token;
|
||||
} else {
|
||||
// If previous token was a bigram, we already output it and this token
|
||||
// was output as part of the bigram so we are done.
|
||||
return null;
|
||||
}
|
||||
previous = current;
|
||||
}
|
||||
|
||||
/*
|
||||
* Possible cases are: |token |next 1|word |gram 2|word |word The
|
||||
* CommonGramsFilter we are wrapping always outputs the unigram word prior
|
||||
* to outputting an optional bigram: "the sound of" gets output as |"the",
|
||||
* "the_sound"|"sound", "sound_of" For case 1 we consume the gram from the
|
||||
* input stream and output it rather than the current token This means that
|
||||
* the call to super.next() which reads a token from input and passes it on
|
||||
* to this process method will always get a token of type word
|
||||
*/
|
||||
if (next != null && next.type() == "gram") {
|
||||
// consume "next" token from list and output it
|
||||
token = read();
|
||||
// use this to clone the token because clone requires all these args but
|
||||
// won't take the token.type
|
||||
// see
|
||||
// http://hudson.zones.apache.org/hudson/job/Lucene-trunk/javadoc//all/org/apache/lucene/analysis/Token.html
|
||||
prev.reinit(token.termBuffer(), 0, token.termLength(), token
|
||||
.startOffset(), token.endOffset(), token.type());
|
||||
token.setPositionIncrement(1);
|
||||
return token;
|
||||
if (previous == null || GRAM_TYPE.equals(previousType)) {
|
||||
return false;
|
||||
}
|
||||
|
||||
restoreState(previous);
|
||||
previous = null;
|
||||
|
||||
if (isGramType()) {
|
||||
posIncAttribute.setPositionIncrement(1);
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
// if the next token is not a bigram, then output the token
|
||||
// see note above regarding this method of copying token to prev
|
||||
prev.reinit(token.termBuffer(), 0, token.termLength(), token.startOffset(),
|
||||
token.endOffset(), token.type());
|
||||
assert token.type() == "word";
|
||||
return token;
|
||||
// ================================================= Helper Methods ================================================
|
||||
|
||||
/**
|
||||
* Convenience method to check if the current type is a gram type
|
||||
*
|
||||
* @return {@code true} if the current type is a gram type, {@code false} otherwise
|
||||
*/
|
||||
public boolean isGramType() {
|
||||
return GRAM_TYPE.equals(typeAttribute.type());
|
||||
}
|
||||
}
|
||||
|
|
|
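The query-time counterpart is meant to wrap a CommonGramsFilter so phrase queries see only the bigrams where possible. A hedged sketch of the chain; the common-word list and sample text are assumptions, and the expected output follows the javadoc example above (the separator is '_'):

import java.io.StringReader;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.WhitespaceTokenizer;
import org.apache.lucene.analysis.tokenattributes.TermAttribute;
import org.apache.solr.analysis.CommonGramsFilter;
import org.apache.solr.analysis.CommonGramsQueryFilter;

public class CommonGramsQuerySketch {
  public static void main(String[] args) throws Exception {
    CommonGramsFilter grams = new CommonGramsFilter(
        new WhitespaceTokenizer(new StringReader("the rain in spain falls mainly")),
        CommonGramsFilter.makeCommonSet(new String[] {"the", "in"}));
    TokenStream ts = new CommonGramsQueryFilter(grams);

    TermAttribute termAtt = (TermAttribute) ts.addAttribute(TermAttribute.class);
    while (ts.incrementToken()) {
      // expected: the_rain rain_in in_spain falls mainly
      System.out.println(termAtt.term());
    }
    ts.close();
  }
}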
@ -59,8 +59,7 @@ public class CommonGramsQueryFilterFactory extends BaseTokenFilterFactory
|
|||
throw new RuntimeException(e);
|
||||
}
|
||||
} else {
|
||||
commonWords = (CharArraySet) CommonGramsFilter.makeCommonSet(
|
||||
StopAnalyzer.ENGLISH_STOP_WORDS, ignoreCase);
|
||||
commonWords = (CharArraySet) StopAnalyzer.ENGLISH_STOP_WORDS_SET;
|
||||
}
|
||||
}
|
||||
|
||||
|
|
|
@ -23,7 +23,6 @@ import java.util.HashMap;
|
|||
import java.util.Map;
|
||||
|
||||
import org.apache.lucene.analysis.TokenStream;
|
||||
import org.apache.lucene.analysis.el.GreekCharsets;
|
||||
import org.apache.lucene.analysis.el.GreekLowerCaseFilter;
|
||||
import org.apache.solr.common.SolrException;
|
||||
import org.apache.solr.common.SolrException.ErrorCode;
|
||||
|
@ -32,40 +31,16 @@ import org.slf4j.LoggerFactory;
|
|||
|
||||
public class GreekLowerCaseFilterFactory extends BaseTokenFilterFactory
|
||||
{
|
||||
@Deprecated
|
||||
private static Map<String,char[]> CHARSETS = new HashMap<String,char[]>();
|
||||
static {
|
||||
CHARSETS.put("UnicodeGreek",GreekCharsets.UnicodeGreek);
|
||||
CHARSETS.put("ISO",GreekCharsets.ISO);
|
||||
CHARSETS.put("CP1253",GreekCharsets.CP1253);
|
||||
}
|
||||
|
||||
private char[] charset = GreekCharsets.UnicodeGreek;
|
||||
|
||||
private static Logger logger = LoggerFactory.getLogger(GreekLowerCaseFilterFactory.class);
|
||||
|
||||
@Override
|
||||
public void init(Map<String, String> args) {
|
||||
super.init(args);
|
||||
String charsetName = args.get("charset");
|
||||
if (null != charsetName) {
|
||||
charset = CHARSETS.get(charsetName);
|
||||
if (charset.equals(GreekCharsets.UnicodeGreek))
|
||||
logger.warn("Specifying UnicodeGreek is no longer required (default). "
|
||||
+ "Use of the charset parameter will cause an error in Solr 1.5");
|
||||
else
|
||||
logger.warn("Support for this custom encoding is deprecated. "
|
||||
+ "Use of the charset parameter will cause an error in Solr 1.5");
|
||||
} else {
|
||||
charset = GreekCharsets.UnicodeGreek; /* default to unicode */
|
||||
}
|
||||
if (null == charset) {
|
||||
throw new SolrException(ErrorCode.SERVER_ERROR,
|
||||
"Don't understand charset: " + charsetName);
|
||||
}
|
||||
|
||||
}
|
||||
public GreekLowerCaseFilter create(TokenStream in) {
|
||||
return new GreekLowerCaseFilter(in,charset);
|
||||
return new GreekLowerCaseFilter(in);
|
||||
}
|
||||
}
|
||||
|
||||
|
|
|
@ -20,6 +20,7 @@ package org.apache.solr.analysis;
|
|||
import org.apache.lucene.analysis.TokenStream;
|
||||
import org.apache.lucene.analysis.Tokenizer;
|
||||
import org.apache.lucene.analysis.standard.StandardTokenizer;
|
||||
import org.apache.lucene.util.Version;
|
||||
|
||||
import java.io.Reader;
|
||||
import java.io.IOException;
|
||||
|
@ -31,11 +32,6 @@ import java.io.IOException;
|
|||
@Deprecated
|
||||
public class HTMLStripStandardTokenizerFactory extends BaseTokenizerFactory {
|
||||
public Tokenizer create(Reader input) {
|
||||
return new StandardTokenizer(new HTMLStripReader(input)) {
|
||||
@Override
|
||||
public void reset(Reader reader) throws IOException {
|
||||
super.reset(new HTMLStripReader(reader));
|
||||
}
|
||||
};
|
||||
return new StandardTokenizer(Version.LUCENE_24, new HTMLStripReader(input));
|
||||
}
|
||||
}
|
||||
|
|
|
@ -20,6 +20,8 @@ package org.apache.solr.analysis;
|
|||
import java.io.IOException;
|
||||
|
||||
import org.apache.lucene.analysis.*;
|
||||
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
|
||||
import org.apache.lucene.analysis.tokenattributes.TermAttribute;
|
||||
|
||||
/**
|
||||
* When the plain text is extracted from documents, we will often have many words hyphenated and broken into
|
||||
|
@ -52,46 +54,89 @@ import org.apache.lucene.analysis.*;
|
|||
*/
|
||||
public final class HyphenatedWordsFilter extends TokenFilter {
|
||||
|
||||
public HyphenatedWordsFilter(TokenStream in) {
|
||||
super(in);
|
||||
}
|
||||
|
||||
|
||||
private final TermAttribute termAttribute = (TermAttribute) addAttribute(TermAttribute.class);
|
||||
private final OffsetAttribute offsetAttribute = (OffsetAttribute) addAttribute(OffsetAttribute.class);
|
||||
|
||||
private final StringBuilder hyphenated = new StringBuilder();
|
||||
private State savedState;
|
||||
|
||||
/**
|
||||
* @inheritDoc
|
||||
* @see org.apache.lucene.analysis.TokenStream#next()
|
||||
*/
|
||||
public final Token next(Token in) throws IOException {
|
||||
StringBuilder termText = new StringBuilder(25);
|
||||
int startOffset = -1, firstPositionIncrement = -1, wordsMerged = 0;
|
||||
Token lastToken = null;
|
||||
for (Token token = input.next(in); token != null; token = input.next()) {
|
||||
termText.append(token.termBuffer(), 0, token.termLength());
|
||||
//current token ends with hyphen -> grab the next token and glue them together
|
||||
if (termText.charAt(termText.length() - 1) == '-') {
|
||||
wordsMerged++;
|
||||
//remove the hyphen
|
||||
termText.setLength(termText.length()-1);
|
||||
if (startOffset == -1) {
|
||||
startOffset = token.startOffset();
|
||||
firstPositionIncrement = token.getPositionIncrement();
|
||||
}
|
||||
lastToken = token;
|
||||
} else {
|
||||
//shortcut returns token
|
||||
if (wordsMerged == 0)
|
||||
return token;
|
||||
Token mergedToken = new Token(termText.toString(), startOffset, token.endOffset(), token.type());
|
||||
mergedToken.setPositionIncrement(firstPositionIncrement);
|
||||
return mergedToken;
|
||||
}
|
||||
}
|
||||
//last token ending with hyphen? - we know that we have only one token in
|
||||
//this situation, so we can safely return firstToken
|
||||
if (startOffset != -1)
|
||||
return lastToken;
|
||||
else
|
||||
return null; //end of token stream
|
||||
}
|
||||
* Creates a new HyphenatedWordsFilter
|
||||
*
|
||||
* @param in TokenStream that will be filtered
|
||||
*/
|
||||
public HyphenatedWordsFilter(TokenStream in) {
|
||||
super(in);
|
||||
}
|
||||
|
||||
/**
|
||||
* {@inheritDoc}
|
||||
*/
|
||||
@Override
|
||||
public boolean incrementToken() throws IOException {
|
||||
while (input.incrementToken()) {
|
||||
char[] term = termAttribute.termBuffer();
|
||||
int termLength = termAttribute.termLength();
|
||||
|
||||
if (termLength > 0 && term[termLength - 1] == '-') {
|
||||
// a hyphenated word
|
||||
// capture the state of the first token only
|
||||
if (savedState == null) {
|
||||
savedState = captureState();
|
||||
}
|
||||
hyphenated.append(term, 0, termLength - 1);
|
||||
} else if (savedState == null) {
|
||||
// not part of a hyphenated word.
|
||||
return true;
|
||||
} else {
|
||||
// the final portion of a hyphenated word
|
||||
hyphenated.append(term, 0, termLength);
|
||||
unhyphenate();
|
||||
return true;
|
||||
}
|
||||
}
|
||||
|
||||
if (savedState != null) {
|
||||
// the final term ends with a hyphen
|
||||
// add back the hyphen, for backwards compatibility.
|
||||
hyphenated.append('-');
|
||||
unhyphenate();
|
||||
return true;
|
||||
}
|
||||
|
||||
return false;
|
||||
}
|
||||
|
||||
/**
|
||||
* {@inheritDoc}
|
||||
*/
|
||||
@Override
|
||||
public void reset() throws IOException {
|
||||
super.reset();
|
||||
hyphenated.setLength(0);
|
||||
savedState = null;
|
||||
}
|
||||
|
||||
// ================================================= Helper Methods ================================================
|
||||
|
||||
/**
|
||||
* Writes the joined unhyphenated term
|
||||
*/
|
||||
private void unhyphenate() {
|
||||
int endOffset = offsetAttribute.endOffset();
|
||||
|
||||
restoreState(savedState);
|
||||
savedState = null;
|
||||
|
||||
char term[] = termAttribute.termBuffer();
|
||||
int length = hyphenated.length();
|
||||
if (length > termAttribute.termLength()) {
|
||||
term = termAttribute.resizeTermBuffer(length);
|
||||
}
|
||||
|
||||
hyphenated.getChars(0, length, term, 0);
|
||||
termAttribute.setTermLength(length);
|
||||
offsetAttribute.setOffset(offsetAttribute.startOffset(), endOffset);
|
||||
hyphenated.setLength(0);
|
||||
}
|
||||
}
|
||||
|
|
|
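A hedged usage sketch of the rewritten HyphenatedWordsFilter: terms split by a trailing hyphen across whitespace get glued back together. The tokenizer and sample text are assumptions for the example:

import java.io.StringReader;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.WhitespaceTokenizer;
import org.apache.lucene.analysis.tokenattributes.TermAttribute;
import org.apache.solr.analysis.HyphenatedWordsFilter;

public class HyphenatedWordsSketch {
  public static void main(String[] args) throws Exception {
    // "ecologi- cal" is the kind of soft line break the filter repairs
    TokenStream ts = new HyphenatedWordsFilter(
        new WhitespaceTokenizer(new StringReader("ecologi- cal develop- ment")));
    TermAttribute termAtt = (TermAttribute) ts.addAttribute(TermAttribute.class);
    while (ts.incrementToken()) {
      System.out.println(termAtt.term());   // prints: ecological, development
    }
    ts.close();
  }
}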
@ -75,7 +75,7 @@ public class KeepWordFilterFactory extends BaseTokenFilterFactory implements Res
|
|||
}
|
||||
|
||||
public KeepWordFilter create(TokenStream input) {
|
||||
return new KeepWordFilter(input, words, ignoreCase);
|
||||
return new KeepWordFilter(input, (Set)words, ignoreCase);
|
||||
}
|
||||
|
||||
public CharArraySet getWords() {
|
||||
|
|
|
@ -1,49 +0,0 @@
|
|||
/**
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
package org.apache.solr.analysis;
|
||||
|
||||
import org.apache.lucene.analysis.TokenFilter;
|
||||
import org.apache.lucene.analysis.TokenStream;
|
||||
import org.apache.lucene.analysis.Token;
|
||||
|
||||
import java.io.IOException;
|
||||
|
||||
/**
|
||||
* @version $Id$
|
||||
* @deprecated use {@link org.apache.lucene.analysis.LengthFilter}
|
||||
*/
|
||||
@Deprecated
|
||||
public final class LengthFilter extends TokenFilter {
|
||||
final int min,max;
|
||||
|
||||
public LengthFilter(TokenStream in, int min, int max) {
|
||||
super(in);
|
||||
this.min=min;
|
||||
this.max=max;
|
||||
//System.out.println("min="+min+" max="+max);
|
||||
}
|
||||
|
||||
public final Token next(Token in) throws IOException {
|
||||
for (Token token=input.next(in); token!=null; token=input.next(in)) {
|
||||
final int len = token.endOffset() - token.startOffset();
|
||||
if (len<min || len>max) continue;
|
||||
return token;
|
||||
}
|
||||
return null;
|
||||
}
|
||||
}
|
|
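The deleted Solr LengthFilter above is deprecated in favour of Lucene's own org.apache.lucene.analysis.LengthFilter. A hedged sketch of the drop-in replacement; the min/max bounds, tokenizer, and sample text are assumptions:

import java.io.StringReader;
import org.apache.lucene.analysis.LengthFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.WhitespaceTokenizer;
import org.apache.lucene.analysis.tokenattributes.TermAttribute;

public class LengthFilterSketch {
  public static void main(String[] args) throws Exception {
    // keep only terms whose length is between 3 and 6 characters
    TokenStream ts = new LengthFilter(
        new WhitespaceTokenizer(new StringReader("a verylongword fox jumps")), 3, 6);
    TermAttribute termAtt = (TermAttribute) ts.addAttribute(TermAttribute.class);
    while (ts.incrementToken()) {
      System.out.println(termAtt.term());   // prints: fox, jumps
    }
    ts.close();
  }
}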
@ -17,41 +17,69 @@
|
|||
|
||||
package org.apache.solr.analysis;
|
||||
|
||||
import org.apache.lucene.analysis.Token;
|
||||
import org.apache.lucene.analysis.TokenFilter;
|
||||
import org.apache.lucene.analysis.TokenStream;
|
||||
import org.apache.solr.util.ArraysUtils;
|
||||
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
|
||||
import org.apache.lucene.analysis.tokenattributes.TermAttribute;
|
||||
import org.apache.solr.util.CharArrayMap;
|
||||
|
||||
import java.io.IOException;
|
||||
|
||||
/**
|
||||
* A TokenFilter which filters out Tokens at the same position and Term
|
||||
* text as the previous token in the stream.
|
||||
* A TokenFilter which filters out Tokens at the same position and Term text as the previous token in the stream.
|
||||
*/
|
||||
public class RemoveDuplicatesTokenFilter extends BufferedTokenStream {
|
||||
public RemoveDuplicatesTokenFilter(TokenStream input) {super(input);}
|
||||
protected Token process(Token t) throws IOException {
|
||||
Token tok = read();
|
||||
while (tok != null && tok.getPositionIncrement()==0) {
|
||||
if (null != t) {
|
||||
write(t);
|
||||
t = null;
|
||||
public final class RemoveDuplicatesTokenFilter extends TokenFilter {
|
||||
|
||||
private final TermAttribute termAttribute = (TermAttribute) addAttribute(TermAttribute.class);
|
||||
private final PositionIncrementAttribute posIncAttribute = (PositionIncrementAttribute) addAttribute(PositionIncrementAttribute.class);
|
||||
|
||||
// keep a seen 'set' after each term with posInc > 0
|
||||
// for now use CharArrayMap vs CharArraySet, as it has clear()
|
||||
private final CharArrayMap<Boolean> previous = new CharArrayMap<Boolean>(8, false);
|
||||
|
||||
/**
|
||||
* Creates a new RemoveDuplicatesTokenFilter
|
||||
*
|
||||
* @param in TokenStream that will be filtered
|
||||
*/
|
||||
public RemoveDuplicatesTokenFilter(TokenStream in) {
|
||||
super(in);
|
||||
}
|
||||
|
||||
/**
|
||||
* {@inheritDoc}
|
||||
*/
|
||||
@Override
|
||||
public boolean incrementToken() throws IOException {
|
||||
while (input.incrementToken()) {
|
||||
final char term[] = termAttribute.termBuffer();
|
||||
final int length = termAttribute.termLength();
|
||||
final int posIncrement = posIncAttribute.getPositionIncrement();
|
||||
|
||||
if (posIncrement > 0) {
|
||||
previous.clear();
|
||||
}
|
||||
boolean dup=false;
|
||||
for (Token outTok : output()) {
|
||||
int tokLen = tok.termLength();
|
||||
if (outTok.termLength() == tokLen && ArraysUtils.equals(outTok.termBuffer(), 0, tok.termBuffer(), 0, tokLen)) {
|
||||
dup=true;
|
||||
//continue;;
|
||||
}
|
||||
|
||||
boolean duplicate = (posIncrement == 0 && previous.get(term, 0, length) != null);
|
||||
|
||||
// clone the term, and add to the set of seen terms.
|
||||
char saved[] = new char[length];
|
||||
System.arraycopy(term, 0, saved, 0, length);
|
||||
previous.put(saved, Boolean.TRUE);
|
||||
|
||||
if (!duplicate) {
|
||||
return true;
|
||||
}
|
||||
if (!dup){
|
||||
write(tok);
|
||||
}
|
||||
tok = read();
|
||||
}
|
||||
if (tok != null) {
|
||||
pushBack(tok);
|
||||
}
|
||||
return t;
|
||||
return false;
|
||||
}
|
||||
|
||||
/**
|
||||
* {@inheritDoc}
|
||||
*/
|
||||
@Override
|
||||
public void reset() throws IOException {
|
||||
super.reset();
|
||||
previous.clear();
|
||||
}
|
||||
}
|
||||
|
|
|
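The rewritten RemoveDuplicatesTokenFilter keys its per-position "seen" set on CharArrayMap so it can test and insert char[] term slices without building Strings. A standalone, hedged sketch of that idiom, mirroring the calls used above (the sample terms are invented):

import org.apache.solr.util.CharArrayMap;

public class SeenTermsSketch {
  public static void main(String[] args) {
    // same construction as in the filter: capacity 8, case-sensitive
    CharArrayMap<Boolean> seen = new CharArrayMap<Boolean>(8, false);

    char[] first = "foo".toCharArray();
    char[] again = "foo".toCharArray();
    System.out.println(seen.get(first, 0, first.length) != null);  // false: not seen yet
    seen.put(first, Boolean.TRUE);                                  // remember the term
    System.out.println(seen.get(again, 0, again.length) != null);  // true: duplicate
    seen.clear();  // the filter does this whenever the position increment is > 0
  }
}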
@ -16,46 +16,46 @@
|
|||
* limitations under the License.
|
||||
*/
|
||||
|
||||
package org.apache.solr.analysis;
|
||||
import org.apache.lucene.analysis.ru.*;
|
||||
import java.util.Map;
|
||||
import java.util.HashMap;
|
||||
import org.apache.solr.core.SolrConfig;
|
||||
import org.apache.solr.common.SolrException;
|
||||
import org.apache.solr.common.SolrException.ErrorCode;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
||||
@Deprecated
|
||||
public class RussianCommon {
|
||||
|
||||
private static Logger logger = LoggerFactory.getLogger(RussianCommon.class);
|
||||
|
||||
private static Map<String,char[]> CHARSETS = new HashMap<String,char[]>();
|
||||
static {
|
||||
CHARSETS.put("UnicodeRussian",RussianCharsets.UnicodeRussian);
|
||||
CHARSETS.put("KOI8",RussianCharsets.KOI8);
|
||||
CHARSETS.put("CP1251",RussianCharsets.CP1251);
|
||||
}
|
||||
|
||||
public static char[] getCharset(String name) {
|
||||
if (null == name)
|
||||
return RussianCharsets.UnicodeRussian;
|
||||
|
||||
char[] charset = CHARSETS.get(name);
|
||||
|
||||
if (charset.equals(RussianCharsets.UnicodeRussian))
|
||||
logger.warn("Specifying UnicodeRussian is no longer required (default). "
|
||||
+ "Use of the charset parameter will cause an error in Solr 1.5");
|
||||
else
|
||||
logger.warn("Support for this custom encoding is deprecated. "
|
||||
+ "Use of the charset parameter will cause an error in Solr 1.5");
|
||||
|
||||
if (null == charset) {
|
||||
throw new SolrException(ErrorCode.SERVER_ERROR,
|
||||
"Don't understand charset: " + name);
|
||||
}
|
||||
return charset;
|
||||
}
|
||||
}
|
||||
//package org.apache.solr.analysis;
|
||||
//import org.apache.lucene.analysis.ru.*;
|
||||
//import java.util.Map;
|
||||
//import java.util.HashMap;
|
||||
//import org.apache.solr.core.SolrConfig;
|
||||
//import org.apache.solr.common.SolrException;
|
||||
//import org.apache.solr.common.SolrException.ErrorCode;
|
||||
//import org.slf4j.Logger;
|
||||
//import org.slf4j.LoggerFactory;
|
||||
//
|
||||
//@Deprecated
|
||||
//public class RussianCommon {
|
||||
//
|
||||
// private static Logger logger = LoggerFactory.getLogger(RussianCommon.class);
|
||||
//
|
||||
// private static Map<String,char[]> CHARSETS = new HashMap<String,char[]>();
|
||||
// static {
|
||||
// CHARSETS.put("UnicodeRussian",RussianCharsets.UnicodeRussian);
|
||||
// CHARSETS.put("KOI8",RussianCharsets.KOI8);
|
||||
// CHARSETS.put("CP1251",RussianCharsets.CP1251);
|
||||
// }
|
||||
//
|
||||
// public static char[] getCharset(String name) {
|
||||
// if (null == name)
|
||||
// return RussianCharsets.UnicodeRussian;
|
||||
//
|
||||
// char[] charset = CHARSETS.get(name);
|
||||
//
|
||||
// if (charset.equals(RussianCharsets.UnicodeRussian))
|
||||
// logger.warn("Specifying UnicodeRussian is no longer required (default). "
|
||||
// + "Use of the charset parameter will cause an error in Solr 1.5");
|
||||
// else
|
||||
// logger.warn("Support for this custom encoding is deprecated. "
|
||||
// + "Use of the charset parameter will cause an error in Solr 1.5");
|
||||
//
|
||||
// if (null == charset) {
|
||||
// throw new SolrException(ErrorCode.SERVER_ERROR,
|
||||
// "Don't understand charset: " + name);
|
||||
// }
|
||||
// return charset;
|
||||
// }
|
||||
//}
|
||||
|
||||
|
|
|
@ -23,17 +23,10 @@ import java.util.Map;
|
|||
import org.apache.lucene.analysis.ru.RussianLetterTokenizer;
|
||||
|
||||
public class RussianLetterTokenizerFactory extends BaseTokenizerFactory {
|
||||
@Deprecated
|
||||
private char[] charset;
|
||||
|
||||
@Override
|
||||
public void init(Map<String, String> args) {
|
||||
super.init(args);
|
||||
charset = RussianCommon.getCharset(args.get("charset"));
|
||||
}
|
||||
|
||||
|
||||
public RussianLetterTokenizer create(Reader in) {
|
||||
return new RussianLetterTokenizer(in,charset);
|
||||
return new RussianLetterTokenizer(in);
|
||||
}
|
||||
}
|
||||
|
||||
|
|
|
@ -23,17 +23,9 @@ import org.apache.lucene.analysis.TokenStream;
|
|||
import org.apache.lucene.analysis.ru.RussianLowerCaseFilter;
|
||||
|
||||
public class RussianLowerCaseFilterFactory extends BaseTokenFilterFactory {
|
||||
@Deprecated
|
||||
private char[] charset;
|
||||
|
||||
@Override
|
||||
public void init(Map<String, String> args) {
|
||||
super.init(args);
|
||||
charset = RussianCommon.getCharset(args.get("charset"));
|
||||
}
|
||||
|
||||
public RussianLowerCaseFilter create(TokenStream in) {
|
||||
return new RussianLowerCaseFilter(in,charset);
|
||||
return new RussianLowerCaseFilter(in);
|
||||
}
|
||||
}
|
||||
|
||||
|
|
|
@ -25,16 +25,10 @@ import org.apache.lucene.analysis.TokenStream;
|
|||
import org.apache.lucene.analysis.ru.RussianStemFilter;
|
||||
|
||||
public class RussianStemFilterFactory extends BaseTokenFilterFactory {
|
||||
@Deprecated
|
||||
private char[] charset;
|
||||
|
||||
public void init(Map<String, String> args) {
|
||||
super.init(args);
|
||||
charset = RussianCommon.getCharset(args.get("charset"));
|
||||
}
|
||||
|
||||
|
||||
public RussianStemFilter create(TokenStream in) {
|
||||
return new RussianStemFilter(in,charset);
|
||||
return new RussianStemFilter(in);
|
||||
}
|
||||
}
|
||||
|
||||
|
|
|
@ -19,6 +19,7 @@ package org.apache.solr.analysis;
|
|||
|
||||
import org.apache.lucene.analysis.TokenStream;
|
||||
import org.apache.lucene.analysis.standard.StandardTokenizer;
|
||||
import org.apache.lucene.util.Version;
|
||||
|
||||
import java.io.Reader;
|
||||
|
||||
|
@ -28,6 +29,6 @@ import java.io.Reader;
|
|||
|
||||
public class StandardTokenizerFactory extends BaseTokenizerFactory {
|
||||
public StandardTokenizer create(Reader input) {
|
||||
return new StandardTokenizer(input);
|
||||
return new StandardTokenizer(Version.LUCENE_24, input);
|
||||
}
|
||||
}
|
||||
|
|
|
@ -58,7 +58,7 @@ public class StopFilterFactory extends BaseTokenFilterFactory implements Resourc
|
|||
throw new RuntimeException(e);
|
||||
}
|
||||
} else {
|
||||
stopWords = (CharArraySet) StopFilter.makeStopSet(StopAnalyzer.ENGLISH_STOP_WORDS, ignoreCase);
|
||||
stopWords = (CharArraySet) StopAnalyzer.ENGLISH_STOP_WORDS_SET;
|
||||
}
|
||||
}
|
||||
//Force the use of a char array set, as it is the most performant, although this may break things if Lucene ever goes away from it. See SOLR-1095
|
||||
|
@ -79,8 +79,7 @@ public class StopFilterFactory extends BaseTokenFilterFactory implements Resourc
|
|||
}
|
||||
|
||||
public StopFilter create(TokenStream input) {
|
||||
StopFilter stopFilter = new StopFilter(input,stopWords,ignoreCase);
|
||||
stopFilter.setEnablePositionIncrements(enablePositionIncrements);
|
||||
StopFilter stopFilter = new StopFilter(enablePositionIncrements, input,stopWords,ignoreCase);
|
||||
return stopFilter;
|
||||
}
|
||||
}
|
||||
|
|
|
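The StopFilterFactory change above moves enablePositionIncrements from a post-construction setter into the StopFilter constructor. A hedged sketch of the call; the constructor form is the one used in the factory, while the tokenizer, stop set, and sample text are assumptions:

import java.io.StringReader;
import org.apache.lucene.analysis.StopAnalyzer;
import org.apache.lucene.analysis.StopFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.WhitespaceTokenizer;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.analysis.tokenattributes.TermAttribute;

public class StopFilterSketch {
  public static void main(String[] args) throws Exception {
    // enablePositionIncrements is now the first constructor argument
    TokenStream ts = new StopFilter(true,
        new WhitespaceTokenizer(new StringReader("the quick brown fox")),
        StopAnalyzer.ENGLISH_STOP_WORDS_SET, true);
    TermAttribute termAtt = (TermAttribute) ts.addAttribute(TermAttribute.class);
    PositionIncrementAttribute posIncAtt =
        (PositionIncrementAttribute) ts.addAttribute(PositionIncrementAttribute.class);
    while (ts.incrementToken()) {
      // "quick" reports a position increment of 2 because "the" was dropped
      System.out.println(termAtt.term() + " +" + posIncAtt.getPositionIncrement());
    }
    ts.close();
  }
}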
@ -20,6 +20,12 @@ package org.apache.solr.analysis;
|
|||
import org.apache.lucene.analysis.Token;
|
||||
import org.apache.lucene.analysis.TokenFilter;
|
||||
import org.apache.lucene.analysis.TokenStream;
|
||||
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
|
||||
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
|
||||
import org.apache.lucene.analysis.tokenattributes.TermAttribute;
|
||||
import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
|
||||
import org.apache.lucene.util.AttributeImpl;
|
||||
import org.apache.lucene.util.AttributeSource;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.util.ArrayList;
|
||||
|
@ -39,11 +45,16 @@ import java.util.LinkedList;
|
|||
public class SynonymFilter extends TokenFilter {
|
||||
|
||||
private final SynonymMap map; // Map<String, SynonymMap>
|
||||
private Iterator<Token> replacement; // iterator over generated tokens
|
||||
private Iterator<AttributeSource> replacement; // iterator over generated tokens
|
||||
|
||||
public SynonymFilter(TokenStream in, SynonymMap map) {
|
||||
super(in);
|
||||
this.map = map;
|
||||
// just ensuring these attributes exist...
|
||||
addAttribute(TermAttribute.class);
|
||||
addAttribute(PositionIncrementAttribute.class);
|
||||
addAttribute(OffsetAttribute.class);
|
||||
addAttribute(TypeAttribute.class);
|
||||
}
|
||||
|
||||
|
||||
|
@ -65,74 +76,100 @@ public class SynonymFilter extends TokenFilter {
|
|||
* - preserve original positionIncrement of first matched token
|
||||
*/
|
||||
@Override
|
||||
public Token next(Token target) throws IOException {
|
||||
public boolean incrementToken() throws IOException {
|
||||
while (true) {
|
||||
// if there are any generated tokens, return them... don't try any
|
||||
// matches against them, as we specifically don't want recursion.
|
||||
if (replacement!=null && replacement.hasNext()) {
|
||||
return replacement.next();
|
||||
copy(this, replacement.next());
|
||||
return true;
|
||||
}
|
||||
|
||||
// common case fast-path of first token not matching anything
|
||||
Token firstTok = nextTok(target);
|
||||
if (firstTok == null) return null;
|
||||
SynonymMap result = map.submap!=null ? map.submap.get(firstTok.termBuffer(), 0, firstTok.termLength()) : null;
|
||||
if (result == null) return firstTok;
|
||||
AttributeSource firstTok = nextTok();
|
||||
if (firstTok == null) return false;
|
||||
TermAttribute termAtt = (TermAttribute) firstTok.addAttribute(TermAttribute.class);
|
||||
SynonymMap result = map.submap!=null ? map.submap.get(termAtt.termBuffer(), 0, termAtt.termLength()) : null;
|
||||
if (result == null) {
|
||||
copy(this, firstTok);
|
||||
return true;
|
||||
}
|
||||
|
||||
// fast-path failed, clone ourselves if needed
|
||||
if (firstTok == this)
|
||||
firstTok = cloneAttributes();
|
||||
// OK, we matched a token, so find the longest match.
|
||||
|
||||
matched = new LinkedList<Token>();
|
||||
matched = new LinkedList<AttributeSource>();
|
||||
|
||||
result = match(result);
|
||||
|
||||
if (result==null) {
|
||||
// no match, simply return the first token read.
|
||||
return firstTok;
|
||||
copy(this, firstTok);
|
||||
return true;
|
||||
}
|
||||
|
||||
// reuse, or create new one each time?
|
||||
ArrayList<Token> generated = new ArrayList<Token>(result.synonyms.length + matched.size() + 1);
|
||||
ArrayList<AttributeSource> generated = new ArrayList<AttributeSource>(result.synonyms.length + matched.size() + 1);
|
||||
|
||||
//
|
||||
// there was a match... let's generate the new tokens, merging
|
||||
// in the matched tokens (position increments need adjusting)
|
||||
//
|
||||
Token lastTok = matched.isEmpty() ? firstTok : matched.getLast();
|
||||
AttributeSource lastTok = matched.isEmpty() ? firstTok : matched.getLast();
|
||||
boolean includeOrig = result.includeOrig();
|
||||
|
||||
Token origTok = includeOrig ? firstTok : null;
|
||||
int origPos = firstTok.getPositionIncrement(); // position of origTok in the original stream
|
||||
AttributeSource origTok = includeOrig ? firstTok : null;
|
||||
PositionIncrementAttribute firstPosIncAtt = (PositionIncrementAttribute) firstTok.addAttribute(PositionIncrementAttribute.class);
|
||||
int origPos = firstPosIncAtt.getPositionIncrement(); // position of origTok in the original stream
|
||||
int repPos=0; // curr position in replacement token stream
|
||||
int pos=0; // current position in merged token stream
|
||||
|
||||
for (int i=0; i<result.synonyms.length; i++) {
|
||||
Token repTok = result.synonyms[i];
|
||||
Token newTok = new Token(firstTok.startOffset(), lastTok.endOffset(), firstTok.type());
|
||||
newTok.setTermBuffer(repTok.termBuffer(), 0, repTok.termLength());
|
||||
AttributeSource newTok = firstTok.cloneAttributes();
|
||||
TermAttribute newTermAtt = (TermAttribute) newTok.addAttribute(TermAttribute.class);
|
||||
OffsetAttribute newOffsetAtt = (OffsetAttribute) newTok.addAttribute(OffsetAttribute.class);
|
||||
TypeAttribute newTypeAtt = (TypeAttribute) newTok.addAttribute(TypeAttribute.class);
|
||||
PositionIncrementAttribute newPosIncAtt = (PositionIncrementAttribute) newTok.addAttribute(PositionIncrementAttribute.class);
|
||||
|
||||
OffsetAttribute lastOffsetAtt = (OffsetAttribute) lastTok.addAttribute(OffsetAttribute.class);
|
||||
|
||||
newOffsetAtt.setOffset(newOffsetAtt.startOffset(), lastOffsetAtt.endOffset());
|
||||
newTermAtt.setTermBuffer(repTok.termBuffer(), 0, repTok.termLength());
|
||||
repPos += repTok.getPositionIncrement();
|
||||
if (i==0) repPos=origPos; // make position of first token equal to original
|
||||
|
||||
// if necessary, insert original tokens and adjust position increment
|
||||
while (origTok != null && origPos <= repPos) {
|
||||
origTok.setPositionIncrement(origPos-pos);
|
||||
PositionIncrementAttribute origPosInc = (PositionIncrementAttribute) origTok.addAttribute(PositionIncrementAttribute.class);
|
||||
origPosInc.setPositionIncrement(origPos-pos);
|
||||
generated.add(origTok);
|
||||
pos += origTok.getPositionIncrement();
|
||||
pos += origPosInc.getPositionIncrement();
|
||||
origTok = matched.isEmpty() ? null : matched.removeFirst();
|
||||
if (origTok != null) origPos += origTok.getPositionIncrement();
|
||||
if (origTok != null) {
|
||||
origPosInc = (PositionIncrementAttribute) origTok.addAttribute(PositionIncrementAttribute.class);
|
||||
origPos += origPosInc.getPositionIncrement();
|
||||
}
|
||||
}
|
||||
|
||||
newTok.setPositionIncrement(repPos - pos);
|
||||
newPosIncAtt.setPositionIncrement(repPos - pos);
|
||||
generated.add(newTok);
|
||||
pos += newTok.getPositionIncrement();
|
||||
pos += newPosIncAtt.getPositionIncrement();
|
||||
}
|
||||
|
||||
// finish up any leftover original tokens
|
||||
while (origTok!=null) {
|
||||
origTok.setPositionIncrement(origPos-pos);
|
||||
PositionIncrementAttribute origPosInc = (PositionIncrementAttribute) origTok.addAttribute(PositionIncrementAttribute.class);
|
||||
origPosInc.setPositionIncrement(origPos-pos);
|
||||
generated.add(origTok);
|
||||
pos += origTok.getPositionIncrement();
|
||||
pos += origPosInc.getPositionIncrement();
|
||||
origTok = matched.isEmpty() ? null : matched.removeFirst();
|
||||
if (origTok != null) origPos += origTok.getPositionIncrement();
|
||||
if (origTok != null) {
|
||||
origPosInc = (PositionIncrementAttribute) origTok.addAttribute(PositionIncrementAttribute.class);
|
||||
origPos += origPosInc.getPositionIncrement();
|
||||
}
|
||||
}
|
||||
|
||||
// what if we replaced a longer sequence with a shorter one?
|
||||
|
@ -151,27 +188,22 @@ public class SynonymFilter extends TokenFilter {
|
|||
// Defer creation of the buffer until the first time it is used to
|
||||
// optimize short fields with no matches.
|
||||
//
|
||||
private LinkedList<Token> buffer;
|
||||
private LinkedList<Token> matched;
|
||||
private LinkedList<AttributeSource> buffer;
|
||||
private LinkedList<AttributeSource> matched;
|
||||
|
||||
private Token nextTok() throws IOException {
|
||||
private AttributeSource nextTok() throws IOException {
|
||||
if (buffer!=null && !buffer.isEmpty()) {
|
||||
return buffer.removeFirst();
|
||||
} else {
|
||||
return input.next();
|
||||
if (input.incrementToken()) {
|
||||
return this;
|
||||
} else
|
||||
return null;
|
||||
}
|
||||
}
|
||||
|
||||
private Token nextTok(Token target) throws IOException {
|
||||
if (buffer!=null && !buffer.isEmpty()) {
|
||||
return buffer.removeFirst();
|
||||
} else {
|
||||
return input.next(target);
|
||||
}
|
||||
}
|
||||
|
||||
private void pushTok(Token t) {
|
||||
if (buffer==null) buffer=new LinkedList<Token>();
|
||||
private void pushTok(AttributeSource t) {
|
||||
if (buffer==null) buffer=new LinkedList<AttributeSource>();
|
||||
buffer.addFirst(t);
|
||||
}
|
||||
|
||||
|
@@ -179,15 +211,20 @@ public class SynonymFilter extends TokenFilter {
    SynonymMap result = null;

    if (map.submap != null) {
      Token tok = nextTok();
      AttributeSource tok = nextTok();
      if (tok != null) {
        // clone ourselves.
        if (tok == this)
          tok = cloneAttributes();
        // check for positionIncrement!=1? if>1, should not match, if==0, check multiple at this level?
        SynonymMap subMap = map.submap.get(tok.termBuffer(), 0, tok.termLength());
        TermAttribute termAtt = (TermAttribute) tok.getAttribute(TermAttribute.class);
        SynonymMap subMap = map.submap.get(termAtt.termBuffer(), 0, termAtt.termLength());

        if (subMap != null) {
          // recurse
          result = match(subMap);
        }

        if (result != null) {
          matched.addFirst(tok);
        } else {
@@ -205,6 +242,15 @@ public class SynonymFilter extends TokenFilter {
    return result;
  }

  private void copy(AttributeSource target, AttributeSource source) {
    if (target == source)
      return;
    for (Iterator<AttributeImpl> sourceIt = source.getAttributeImplsIterator(), targetIt = target.getAttributeImplsIterator();
         sourceIt.hasNext();) {
      sourceIt.next().copyTo(targetIt.next());
    }
  }

  @Override
  public void reset() throws IOException {
    input.reset();

@@ -19,6 +19,7 @@ package org.apache.solr.analysis;

import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.TermAttribute;
import org.apache.solr.common.ResourceLoader;
import org.apache.solr.common.util.StrUtils;
import org.apache.solr.util.plugin.ResourceLoaderAware;

@@ -135,8 +136,9 @@ public class SynonymFilterFactory extends BaseTokenFilterFactory implements Reso
      TokenStream ts = loadTokenizer(tokFactory, reader);
      List<String> tokList = new ArrayList<String>();
      try {
        for( Token token = ts.next(); token != null; token = ts.next() ){
          String text = new String(token.termBuffer(), 0, token.termLength());
        TermAttribute termAtt = (TermAttribute) ts.addAttribute(TermAttribute.class);
        while (ts.incrementToken()){
          String text = new String(termAtt.termBuffer(), 0, termAtt.termLength());
          if( text.length() > 0 )
            tokList.add( text );
        }

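
The recurring change in the hunks above is the move from the deprecated one-Token-at-a-time API (ts.next()) to the attribute-based API (addAttribute(...) plus incrementToken()) that Lucene 2.9 introduced and 3.0 makes mandatory. A minimal, self-contained sketch of the new consumption pattern; the analyzer, field name and sample text are illustrative and not part of this patch:

    import java.io.IOException;
    import java.io.StringReader;

    import org.apache.lucene.analysis.TokenStream;
    import org.apache.lucene.analysis.WhitespaceAnalyzer;
    import org.apache.lucene.analysis.tokenattributes.TermAttribute;

    public class TokenDumper {
      public static void main(String[] args) throws IOException {
        TokenStream ts = new WhitespaceAnalyzer()
            .tokenStream("text", new StringReader("hello token stream"));
        // Attributes are requested once, up front; incrementToken() refills them in place.
        TermAttribute termAtt = (TermAttribute) ts.addAttribute(TermAttribute.class);
        ts.reset();
        while (ts.incrementToken()) {
          System.out.println(new String(termAtt.termBuffer(), 0, termAtt.termLength()));
        }
        ts.end();
        ts.close();
      }
    }
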
File diff suppressed because it is too large
@ -0,0 +1,315 @@
|
|||
package org.apache.solr.analysis;
|
||||
|
||||
/**
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
import static org.apache.solr.analysis.WordDelimiterFilter.*;
|
||||
|
||||
/**
|
||||
* A BreakIterator-like API for iterating over subwords in text, according to WordDelimiterFilter rules.
|
||||
*/
|
||||
final class WordDelimiterIterator {
|
||||
|
||||
/** Indicates the end of iteration */
|
||||
public static final int DONE = -1;
|
||||
|
||||
public static final byte[] DEFAULT_WORD_DELIM_TABLE;
|
||||
|
||||
char text[];
|
||||
int length;
|
||||
|
||||
/** start position of text, excluding leading delimiters */
|
||||
int startBounds;
|
||||
/** end position of text, excluding trailing delimiters */
|
||||
int endBounds;
|
||||
|
||||
/** Beginning of subword */
|
||||
int current;
|
||||
/** End of subword */
|
||||
int end;
|
||||
|
||||
/* does this string end with a possessive such as 's */
|
||||
private boolean hasFinalPossessive = false;
|
||||
|
||||
/**
|
||||
* If false, causes case changes to be ignored (subwords will only be generated
|
||||
* given SUBWORD_DELIM tokens). (Defaults to true)
|
||||
*/
|
||||
final boolean splitOnCaseChange;
|
||||
|
||||
/**
|
||||
* If false, causes numeric changes to be ignored (subwords will only be generated
|
||||
* given SUBWORD_DELIM tokens). (Defaults to true)
|
||||
*/
|
||||
final boolean splitOnNumerics;
|
||||
|
||||
/**
|
||||
* If true, causes trailing "'s" to be removed for each subword. (Defaults to true)
|
||||
* <p/>
|
||||
* "O'Neil's" => "O", "Neil"
|
||||
*/
|
||||
final boolean stemEnglishPossessive;
|
||||
|
||||
private final byte[] charTypeTable;
|
||||
|
||||
/** if true, need to skip over a possessive found in the last call to next() */
|
||||
private boolean skipPossessive = false;
|
||||
|
||||
// TODO: should there be a WORD_DELIM category for chars that only separate words (no catenation of subwords will be
|
||||
// done if separated by these chars?) "," would be an obvious candidate...
|
||||
static {
|
||||
byte[] tab = new byte[256];
|
||||
for (int i = 0; i < 256; i++) {
|
||||
byte code = 0;
|
||||
if (Character.isLowerCase(i)) {
|
||||
code |= LOWER;
|
||||
}
|
||||
else if (Character.isUpperCase(i)) {
|
||||
code |= UPPER;
|
||||
}
|
||||
else if (Character.isDigit(i)) {
|
||||
code |= DIGIT;
|
||||
}
|
||||
if (code == 0) {
|
||||
code = SUBWORD_DELIM;
|
||||
}
|
||||
tab[i] = code;
|
||||
}
|
||||
DEFAULT_WORD_DELIM_TABLE = tab;
|
||||
}
|
||||
|
||||
/**
|
||||
* Create a new WordDelimiterIterator operating with the supplied rules.
|
||||
*
|
||||
* @param charTypeTable table containing character types
|
||||
* @param splitOnCaseChange if true, causes "PowerShot" to be two tokens; ("Power-Shot" remains two parts regardless)
|
||||
* @param splitOnNumerics if true, causes "j2se" to be three tokens; "j" "2" "se"
|
||||
* @param stemEnglishPossessive if true, causes trailing "'s" to be removed for each subword: "O'Neil's" => "O", "Neil"
|
||||
*/
|
||||
WordDelimiterIterator(byte[] charTypeTable, boolean splitOnCaseChange, boolean splitOnNumerics, boolean stemEnglishPossessive) {
|
||||
this.charTypeTable = charTypeTable;
|
||||
this.splitOnCaseChange = splitOnCaseChange;
|
||||
this.splitOnNumerics = splitOnNumerics;
|
||||
this.stemEnglishPossessive = stemEnglishPossessive;
|
||||
}
|
||||
|
||||
/**
|
||||
* Advance to the next subword in the string.
|
||||
*
|
||||
* @return index of the next subword, or {@link #DONE} if all subwords have been returned
|
||||
*/
|
||||
int next() {
|
||||
current = end;
|
||||
if (current == DONE) {
|
||||
return DONE;
|
||||
}
|
||||
|
||||
if (skipPossessive) {
|
||||
current += 2;
|
||||
skipPossessive = false;
|
||||
}
|
||||
|
||||
int lastType = 0;
|
||||
|
||||
while (current < endBounds && (isSubwordDelim(lastType = charType(text[current])))) {
|
||||
current++;
|
||||
}
|
||||
|
||||
if (current >= endBounds) {
|
||||
return end = DONE;
|
||||
}
|
||||
|
||||
for (end = current + 1; end < endBounds; end++) {
|
||||
int type = charType(text[end]);
|
||||
if (isBreak(lastType, type)) {
|
||||
break;
|
||||
}
|
||||
lastType = type;
|
||||
}
|
||||
|
||||
if (end < endBounds - 1 && endsWithPossessive(end + 2)) {
|
||||
skipPossessive = true;
|
||||
}
|
||||
|
||||
return end;
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* Return the type of the current subword.
|
||||
* This currently uses the type of the first character in the subword.
|
||||
*
|
||||
* @return type of the current word
|
||||
*/
|
||||
int type() {
|
||||
if (end == DONE) {
|
||||
return 0;
|
||||
}
|
||||
|
||||
int type = charType(text[current]);
|
||||
switch (type) {
|
||||
// return ALPHA word type for both lower and upper
|
||||
case LOWER:
|
||||
case UPPER:
|
||||
return ALPHA;
|
||||
default:
|
||||
return type;
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Reset the text to a new value, and reset all state
|
||||
*
|
||||
* @param text New text
|
||||
* @param length length of the text
|
||||
*/
|
||||
void setText(char text[], int length) {
|
||||
this.text = text;
|
||||
this.length = this.endBounds = length;
|
||||
current = startBounds = end = 0;
|
||||
skipPossessive = hasFinalPossessive = false;
|
||||
setBounds();
|
||||
}
|
||||
|
||||
// ================================================= Helper Methods ================================================
|
||||
|
||||
/**
|
||||
* Determines whether the transition from lastType to type indicates a break
|
||||
*
|
||||
* @param lastType Last subword type
|
||||
* @param type Current subword type
|
||||
* @return {@code true} if the transition indicates a break, {@code false} otherwise
|
||||
*/
|
||||
private boolean isBreak(int lastType, int type) {
|
||||
if ((type & lastType) != 0) {
|
||||
return false;
|
||||
}
|
||||
|
||||
if (!splitOnCaseChange && isAlpha(lastType) && isAlpha(type)) {
|
||||
// ALPHA->ALPHA: always ignore if case isn't considered.
|
||||
return false;
|
||||
} else if (isUpper(lastType) && isAlpha(type)) {
|
||||
// UPPER->letter: Don't split
|
||||
return false;
|
||||
} else if (!splitOnNumerics && ((isAlpha(lastType) && isDigit(type)) || (isDigit(lastType) && isAlpha(type)))) {
|
||||
// ALPHA->NUMERIC, NUMERIC->ALPHA :Don't split
|
||||
return false;
|
||||
}
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
/**
|
||||
* Determines if the current word contains only one subword. Note, it could be potentially surrounded by delimiters
|
||||
*
|
||||
* @return {@code true} if the current word contains only one subword, {@code false} otherwise
|
||||
*/
|
||||
boolean isSingleWord() {
|
||||
if (hasFinalPossessive) {
|
||||
return current == startBounds && end == endBounds - 2;
|
||||
}
|
||||
else {
|
||||
return current == startBounds && end == endBounds;
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Set the internal word bounds (remove leading and trailing delimiters). Note, if a possessive is found, don't remove
|
||||
* it yet, simply note it.
|
||||
*/
|
||||
private void setBounds() {
|
||||
while (startBounds < length && (isSubwordDelim(charType(text[startBounds])))) {
|
||||
startBounds++;
|
||||
}
|
||||
|
||||
while (endBounds > startBounds && (isSubwordDelim(charType(text[endBounds - 1])))) {
|
||||
endBounds--;
|
||||
}
|
||||
if (endsWithPossessive(endBounds)) {
|
||||
hasFinalPossessive = true;
|
||||
}
|
||||
current = startBounds;
|
||||
}
|
||||
|
||||
/**
|
||||
* Determines if the text at the given position indicates an English possessive which should be removed
|
||||
*
|
||||
* @param pos Position in the text to check if it indicates an English possessive
|
||||
* @return {@code true} if the text at the position indicates an English possessive, {@code false} otherwise
|
||||
*/
|
||||
private boolean endsWithPossessive(int pos) {
|
||||
return (stemEnglishPossessive &&
|
||||
pos > 2 &&
|
||||
text[pos - 2] == '\'' &&
|
||||
(text[pos - 1] == 's' || text[pos - 1] == 'S') &&
|
||||
isAlpha(charType(text[pos - 3])) &&
|
||||
(pos == endBounds || isSubwordDelim(charType(text[pos]))));
|
||||
}
|
||||
|
||||
/**
|
||||
* Determines the type of the given character
|
||||
*
|
||||
* @param ch Character whose type is to be determined
|
||||
* @return Type of the character
|
||||
*/
|
||||
private int charType(int ch) {
|
||||
if (ch < charTypeTable.length) {
|
||||
return charTypeTable[ch];
|
||||
}
|
||||
switch (Character.getType(ch)) {
|
||||
case Character.UPPERCASE_LETTER: return UPPER;
|
||||
case Character.LOWERCASE_LETTER: return LOWER;
|
||||
|
||||
case Character.TITLECASE_LETTER:
|
||||
case Character.MODIFIER_LETTER:
|
||||
case Character.OTHER_LETTER:
|
||||
case Character.NON_SPACING_MARK:
|
||||
case Character.ENCLOSING_MARK: // depends what it encloses?
|
||||
case Character.COMBINING_SPACING_MARK:
|
||||
return ALPHA;
|
||||
|
||||
case Character.DECIMAL_DIGIT_NUMBER:
|
||||
case Character.LETTER_NUMBER:
|
||||
case Character.OTHER_NUMBER:
|
||||
return DIGIT;
|
||||
|
||||
// case Character.SPACE_SEPARATOR:
|
||||
// case Character.LINE_SEPARATOR:
|
||||
// case Character.PARAGRAPH_SEPARATOR:
|
||||
// case Character.CONTROL:
|
||||
// case Character.FORMAT:
|
||||
// case Character.PRIVATE_USE:
|
||||
|
||||
case Character.SURROGATE: // prevent splitting
|
||||
return ALPHA|DIGIT;
|
||||
|
||||
// case Character.DASH_PUNCTUATION:
|
||||
// case Character.START_PUNCTUATION:
|
||||
// case Character.END_PUNCTUATION:
|
||||
// case Character.CONNECTOR_PUNCTUATION:
|
||||
// case Character.OTHER_PUNCTUATION:
|
||||
// case Character.MATH_SYMBOL:
|
||||
// case Character.CURRENCY_SYMBOL:
|
||||
// case Character.MODIFIER_SYMBOL:
|
||||
// case Character.OTHER_SYMBOL:
|
||||
// case Character.INITIAL_QUOTE_PUNCTUATION:
|
||||
// case Character.FINAL_QUOTE_PUNCTUATION:
|
||||
|
||||
default: return SUBWORD_DELIM;
|
||||
}
|
||||
}
|
||||
}
|
|
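
WordDelimiterIterator above is package-private and exposes its cursor through the current/end fields, so it can only be driven from code in org.apache.solr.analysis. A rough usage sketch under that assumption; the input string is made up for illustration:

    // Assumes this code lives in package org.apache.solr.analysis.
    char[] text = "PowerShot".toCharArray();
    WordDelimiterIterator iter = new WordDelimiterIterator(
        WordDelimiterIterator.DEFAULT_WORD_DELIM_TABLE, true, true, true);
    iter.setText(text, text.length);
    // next() advances to the following subword and returns its end, or DONE when exhausted.
    while (iter.next() != WordDelimiterIterator.DONE) {
      // the current subword spans [current, end)
      String subword = new String(text, iter.current, iter.end - iter.current);
      System.out.println(subword + " / type=" + iter.type());
    }

With splitOnCaseChange enabled this would yield "Power" and "Shot".
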
@ -20,6 +20,12 @@ import org.apache.commons.io.IOUtils;
|
|||
import org.apache.lucene.analysis.Analyzer;
|
||||
import org.apache.lucene.analysis.Token;
|
||||
import org.apache.lucene.analysis.TokenStream;
|
||||
import org.apache.lucene.analysis.tokenattributes.FlagsAttribute;
|
||||
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
|
||||
import org.apache.lucene.analysis.tokenattributes.PayloadAttribute;
|
||||
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
|
||||
import org.apache.lucene.analysis.tokenattributes.TermAttribute;
|
||||
import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
|
||||
import org.apache.solr.common.SolrException;
|
||||
import org.apache.solr.common.SolrInputDocument;
|
||||
import org.apache.solr.common.params.SolrParams;
|
||||
|
@ -132,15 +138,20 @@ public class AnalysisRequestHandler extends RequestHandlerBase {
|
|||
static NamedList<NamedList<Object>> getTokens(TokenStream tstream) throws IOException {
|
||||
// outer is namedList since order of tokens is important
|
||||
NamedList<NamedList<Object>> tokens = new NamedList<NamedList<Object>>();
|
||||
Token t = null;
|
||||
while (((t = tstream.next()) != null)) {
|
||||
// TODO: support custom attributes
|
||||
TermAttribute termAtt = (TermAttribute) tstream.addAttribute(TermAttribute.class);
|
||||
OffsetAttribute offsetAtt = (OffsetAttribute) tstream.addAttribute(OffsetAttribute.class);
|
||||
TypeAttribute typeAtt = (TypeAttribute) tstream.addAttribute(TypeAttribute.class);
|
||||
PositionIncrementAttribute posIncAtt = (PositionIncrementAttribute) tstream.addAttribute(PositionIncrementAttribute.class);
|
||||
|
||||
while (tstream.incrementToken()) {
|
||||
NamedList<Object> token = new SimpleOrderedMap<Object>();
|
||||
tokens.add("token", token);
|
||||
token.add("value", new String(t.termBuffer(), 0, t.termLength()));
|
||||
token.add("start", t.startOffset());
|
||||
token.add("end", t.endOffset());
|
||||
token.add("posInc", t.getPositionIncrement());
|
||||
token.add("type", t.type());
|
||||
token.add("value", new String(termAtt.termBuffer(), 0, termAtt.termLength()));
|
||||
token.add("start", offsetAtt.startOffset());
|
||||
token.add("end", offsetAtt.endOffset());
|
||||
token.add("posInc", posIncAtt.getPositionIncrement());
|
||||
token.add("type", typeAtt.type());
|
||||
//TODO: handle payloads
|
||||
}
|
||||
return tokens;
|
||||
|
|
|
@ -22,6 +22,12 @@ import org.apache.lucene.analysis.CharReader;
|
|||
import org.apache.lucene.analysis.CharStream;
|
||||
import org.apache.lucene.analysis.Token;
|
||||
import org.apache.lucene.analysis.TokenStream;
|
||||
import org.apache.lucene.analysis.tokenattributes.FlagsAttribute;
|
||||
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
|
||||
import org.apache.lucene.analysis.tokenattributes.PayloadAttribute;
|
||||
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
|
||||
import org.apache.lucene.analysis.tokenattributes.TermAttribute;
|
||||
import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
|
||||
import org.apache.solr.analysis.CharFilterFactory;
|
||||
import org.apache.solr.analysis.TokenFilterFactory;
|
||||
import org.apache.solr.analysis.TokenizerChain;
|
||||
|
@ -141,11 +147,30 @@ public abstract class AnalysisRequestHandlerBase extends RequestHandlerBase {
|
|||
*/
|
||||
private List<Token> analyzeTokenStream(TokenStream tokenStream) {
|
||||
List<Token> tokens = new ArrayList<Token>();
|
||||
Token reusableToken = new Token();
|
||||
Token token = null;
|
||||
|
||||
// TODO change this API to support custom attributes
|
||||
TermAttribute termAtt = (TermAttribute)
|
||||
tokenStream.addAttribute(TermAttribute.class);
|
||||
OffsetAttribute offsetAtt = (OffsetAttribute)
|
||||
tokenStream.addAttribute(OffsetAttribute.class);
|
||||
TypeAttribute typeAtt = (TypeAttribute)
|
||||
tokenStream.addAttribute(TypeAttribute.class);
|
||||
FlagsAttribute flagsAtt = (FlagsAttribute)
|
||||
tokenStream.addAttribute(FlagsAttribute.class);
|
||||
PayloadAttribute payloadAtt = (PayloadAttribute)
|
||||
tokenStream.addAttribute(PayloadAttribute.class);
|
||||
PositionIncrementAttribute posIncAtt = (PositionIncrementAttribute)
|
||||
tokenStream.addAttribute(PositionIncrementAttribute.class);
|
||||
|
||||
try {
|
||||
while ((token = tokenStream.next(reusableToken)) != null) {
|
||||
while (tokenStream.incrementToken()) {
|
||||
Token token = new Token();
|
||||
token.setTermBuffer(termAtt.termBuffer(), 0, termAtt.termLength());
|
||||
token.setOffset(offsetAtt.startOffset(), offsetAtt.endOffset());
|
||||
token.setType(typeAtt.type());
|
||||
token.setFlags(flagsAtt.getFlags());
|
||||
token.setPayload(payloadAtt.getPayload());
|
||||
token.setPositionIncrement(posIncAtt.getPositionIncrement());
|
||||
tokens.add((Token) token.clone());
|
||||
}
|
||||
} catch (IOException ioe) {
|
||||
|
@ -229,16 +254,30 @@ public abstract class AnalysisRequestHandlerBase extends RequestHandlerBase {
|
|||
/**
|
||||
* TokenStream that iterates over a list of pre-existing Tokens
|
||||
*/
|
||||
// TODO refactor to support custom attributes
|
||||
protected static class ListBasedTokenStream extends TokenStream {
|
||||
private final List<Token> tokens;
|
||||
private Iterator<Token> tokenIterator;
|
||||
|
||||
private final Iterator<Token> tokenIterator;
|
||||
|
||||
private final TermAttribute termAtt = (TermAttribute)
|
||||
addAttribute(TermAttribute.class);
|
||||
private final OffsetAttribute offsetAtt = (OffsetAttribute)
|
||||
addAttribute(OffsetAttribute.class);
|
||||
private final TypeAttribute typeAtt = (TypeAttribute)
|
||||
addAttribute(TypeAttribute.class);
|
||||
private final FlagsAttribute flagsAtt = (FlagsAttribute)
|
||||
addAttribute(FlagsAttribute.class);
|
||||
private final PayloadAttribute payloadAtt = (PayloadAttribute)
|
||||
addAttribute(PayloadAttribute.class);
|
||||
private final PositionIncrementAttribute posIncAtt = (PositionIncrementAttribute)
|
||||
addAttribute(PositionIncrementAttribute.class);
|
||||
/**
|
||||
* Creates a new ListBasedTokenStream which uses the given tokens as its token source.
|
||||
*
|
||||
* @param tokens Source of tokens to be used
|
||||
*/
|
||||
ListBasedTokenStream(List<Token> tokens) {
|
||||
this.tokens = tokens;
|
||||
tokenIterator = tokens.iterator();
|
||||
}
|
||||
|
||||
|
@ -246,8 +285,25 @@ public abstract class AnalysisRequestHandlerBase extends RequestHandlerBase {
|
|||
* {@inheritDoc}
|
||||
*/
|
||||
@Override
|
||||
public Token next(Token token) throws IOException {
|
||||
return (tokenIterator.hasNext()) ? tokenIterator.next() : null;
|
||||
public boolean incrementToken() throws IOException {
|
||||
if (tokenIterator.hasNext()) {
|
||||
Token next = tokenIterator.next();
|
||||
termAtt.setTermBuffer(next.termBuffer(), 0, next.termLength());
|
||||
typeAtt.setType(next.type());
|
||||
offsetAtt.setOffset(next.startOffset(), next.endOffset());
|
||||
flagsAtt.setFlags(next.getFlags());
|
||||
payloadAtt.setPayload(next.getPayload());
|
||||
posIncAtt.setPositionIncrement(next.getPositionIncrement());
|
||||
return true;
|
||||
} else {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
public void reset() throws IOException {
|
||||
super.reset();
|
||||
tokenIterator = tokens.iterator();
|
||||
}
|
||||
}
|
||||
|
||||
|
|
|
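
The rewritten ListBasedTokenStream above shows the producer side of the same migration: a TokenStream now declares its attributes as fields and fills them inside incrementToken(). A stripped-down sketch of that shape, with a hypothetical stream that emits one token per supplied string (the class and its names are illustrative):

    import java.io.IOException;
    import org.apache.lucene.analysis.TokenStream;
    import org.apache.lucene.analysis.tokenattributes.TermAttribute;

    // Hypothetical: emits one token per supplied string.
    final class StringListTokenStream extends TokenStream {
      private final String[] values;
      private int index = 0;
      private final TermAttribute termAtt = (TermAttribute) addAttribute(TermAttribute.class);

      StringListTokenStream(String... values) {
        this.values = values;
      }

      @Override
      public boolean incrementToken() throws IOException {
        if (index >= values.length) return false;
        clearAttributes();                       // reset all attributes before filling them
        termAtt.setTermBuffer(values[index++]);  // copies the string into the term buffer
        return true;
      }

      @Override
      public void reset() throws IOException {
        index = 0;
      }
    }
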
@@ -258,7 +258,7 @@ public class SpellCheckerRequestHandler extends RequestHandlerBase implements So
      }
      dirDescription = f.getAbsolutePath();
      log.info("using spell directory: " + dirDescription);
      spellcheckerIndexDir = FSDirectory.getDirectory(f);
      spellcheckerIndexDir = FSDirectory.open(f);
    } else {
      log.info("using RAM based spell directory");
    }

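
FSDirectory.getDirectory(...) is gone in Lucene 3.0; FSDirectory.open(File) picks a platform-appropriate FSDirectory implementation instead, as the replacement line above does. A tiny standalone sketch (the helper name is made up):

    import java.io.File;
    import java.io.IOException;
    import org.apache.lucene.store.Directory;
    import org.apache.lucene.store.FSDirectory;

    class SpellDirFactory {
      static Directory openSpellDir(File f) throws IOException {
        return FSDirectory.open(f);   // chooses a suitable FSDirectory subclass for the platform
      }
    }
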
@ -40,7 +40,7 @@ import org.apache.lucene.index.Term;
|
|||
import org.apache.lucene.index.TermEnum;
|
||||
import org.apache.lucene.index.TermFreqVector;
|
||||
import org.apache.lucene.search.Query;
|
||||
import org.apache.lucene.search.ConstantScoreRangeQuery;
|
||||
import org.apache.lucene.search.TermRangeQuery;
|
||||
import org.apache.lucene.search.TopDocs;
|
||||
import org.apache.lucene.store.Directory;
|
||||
import org.apache.lucene.util.PriorityQueue;
|
||||
|
@ -172,7 +172,8 @@ public class LukeRequestHandler extends RequestHandlerBase
|
|||
flags.append( (f != null && f.getOmitNorms()) ? FieldFlag.OMIT_NORMS.getAbbreviation() : '-' );
|
||||
flags.append( (f != null && f.isLazy()) ? FieldFlag.LAZY.getAbbreviation() : '-' );
|
||||
flags.append( (f != null && f.isBinary()) ? FieldFlag.BINARY.getAbbreviation() : '-' );
|
||||
flags.append( (f != null && f.isCompressed()) ? FieldFlag.COMPRESSED.getAbbreviation() : '-' );
|
||||
//nocommit: handle compressed
|
||||
//flags.append( (f != null && f.isCompressed()) ? FieldFlag.COMPRESSED.getAbbreviation() : '-' );
|
||||
flags.append( (false) ? FieldFlag.SORT_MISSING_FIRST.getAbbreviation() : '-' ); // SchemaField Specific
|
||||
flags.append( (false) ? FieldFlag.SORT_MISSING_LAST.getAbbreviation() : '-' ); // SchemaField Specific
|
||||
return flags.toString();
|
||||
|
@@ -312,7 +313,7 @@ public class LukeRequestHandler extends RequestHandlerBase

      // If numTerms==0, the call is just asking for a quick field list
      if( ttinfo != null && sfield != null && sfield.indexed() ) {
        Query q = new ConstantScoreRangeQuery(fieldName,null,null,false,false);
        Query q = new TermRangeQuery(fieldName,null,null,false,false);
        TopDocs top = searcher.search( q, 1 );
        if( top.totalHits > 0 ) {
          // Find a document with this field
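
ConstantScoreRangeQuery was folded into TermRangeQuery in Lucene 2.9/3.0; the constant-score behaviour is now a rewrite method on the query rather than a separate class. A sketch of the equivalent construction (the factory class is illustrative):

    import org.apache.lucene.search.MultiTermQuery;
    import org.apache.lucene.search.TermRangeQuery;

    class OpenRangeQueryFactory {
      static TermRangeQuery anyValueIn(String fieldName) {
        TermRangeQuery q = new TermRangeQuery(fieldName, null, null, false, false);
        // Optional: recover the old ConstantScoreRangeQuery semantics (filter rewrite, constant score).
        q.setRewriteMethod(MultiTermQuery.CONSTANT_SCORE_FILTER_REWRITE);
        return q;
      }
    }
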
@@ -652,7 +653,7 @@ public class LukeRequestHandler extends RequestHandlerBase
      }

      if( terms.docFreq() > tiq.minFreq ) {
        tiq.put(new TopTermQueue.TermInfo(terms.term(), terms.docFreq()));
        tiq.add(new TopTermQueue.TermInfo(terms.term(), terms.docFreq()));
        if (tiq.size() > numTerms) { // if tiq full
          tiq.pop(); // remove lowest in tiq
          tiq.minFreq = ((TopTermQueue.TermInfo)tiq.top()).docFreq; // reset minFreq

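
The old PriorityQueue calls (put, insert, adjustTop) are gone in Lucene 3.0; the hunk above switches to add, and the QueryComponent change further down switches to insertWithOverflow. A minimal generic subclass showing the new calls; the element type and names are arbitrary:

    import org.apache.lucene.util.PriorityQueue;

    // Keeps the N largest integers seen; the smallest kept value sits at the top of the heap.
    class TopNQueue extends PriorityQueue<Integer> {
      TopNQueue(int maxSize) {
        initialize(maxSize);   // sizes the backing heap
      }

      @Override
      protected boolean lessThan(Integer a, Integer b) {
        return a.intValue() < b.intValue();
      }
    }

    // usage:
    //   TopNQueue q = new TopNQueue(10);
    //   q.add(7);                    // old put(...), only safe while the queue is not full
    //   q.insertWithOverflow(42);    // old insert(...); returns the evicted element, if any
    //   q.updateTop();               // old adjustTop(), after mutating the element at top()
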
@ -33,6 +33,7 @@ import org.apache.solr.common.params.ShardParams;
|
|||
import org.apache.solr.common.params.SolrParams;
|
||||
import org.apache.solr.common.util.NamedList;
|
||||
import org.apache.solr.common.util.StrUtils;
|
||||
import org.apache.solr.common.SolrException.ErrorCode;
|
||||
import org.apache.solr.request.SolrQueryRequest;
|
||||
import org.apache.solr.response.SolrQueryResponse;
|
||||
import org.apache.solr.schema.FieldType;
|
||||
|
@ -305,7 +306,6 @@ public class QueryComponent extends SearchComponent
|
|||
public void handleResponses(ResponseBuilder rb, ShardRequest sreq) {
|
||||
if ((sreq.purpose & ShardRequest.PURPOSE_GET_TOP_IDS) != 0) {
|
||||
mergeIds(rb, sreq);
|
||||
return;
|
||||
}
|
||||
|
||||
if ((sreq.purpose & ShardRequest.PURPOSE_GET_FIELDS) != 0) {
|
||||
|
@ -399,7 +399,8 @@ public class QueryComponent extends SearchComponent
|
|||
|
||||
// Merge the docs via a priority queue so we don't have to sort *all* of the
|
||||
// documents... we only need to order the top (rows+start)
|
||||
ShardFieldSortedHitQueue queue = new ShardFieldSortedHitQueue(sortFields, ss.getOffset() + ss.getCount());
|
||||
ShardFieldSortedHitQueue queue;
|
||||
queue = new ShardFieldSortedHitQueue(sortFields, ss.getOffset() + ss.getCount());
|
||||
|
||||
long numFound = 0;
|
||||
Float maxScore=null;
|
||||
|
@ -451,7 +452,7 @@ public class QueryComponent extends SearchComponent
|
|||
|
||||
shardDoc.sortFieldValues = sortFieldValues;
|
||||
|
||||
queue.insert(shardDoc);
|
||||
queue.insertWithOverflow(shardDoc);
|
||||
} // end for-each-doc-in-response
|
||||
} // end for-each-response
|
||||
|
||||
|
|
|
@ -38,8 +38,8 @@ import javax.xml.xpath.XPathExpressionException;
|
|||
import javax.xml.xpath.XPathFactory;
|
||||
|
||||
import org.apache.lucene.analysis.Analyzer;
|
||||
import org.apache.lucene.analysis.Token;
|
||||
import org.apache.lucene.analysis.TokenStream;
|
||||
import org.apache.lucene.analysis.tokenattributes.TermAttribute;
|
||||
import org.apache.lucene.index.IndexReader;
|
||||
import org.apache.lucene.index.Term;
|
||||
import org.apache.lucene.search.*;
|
||||
|
@ -298,10 +298,9 @@ public class QueryElevationComponent extends SearchComponent implements SolrCore
|
|||
TokenStream tokens = analyzer.reusableTokenStream( "", new StringReader( query ) );
|
||||
tokens.reset();
|
||||
|
||||
Token token = tokens.next();
|
||||
while( token != null ) {
|
||||
norm.append( new String(token.termBuffer(), 0, token.termLength()) );
|
||||
token = tokens.next();
|
||||
TermAttribute termAtt = (TermAttribute) tokens.addAttribute(TermAttribute.class);
|
||||
while( tokens.incrementToken() ) {
|
||||
norm.append( termAtt.termBuffer(), 0, termAtt.termLength() );
|
||||
}
|
||||
return norm.toString();
|
||||
}
|
||||
|
|
|
@ -33,6 +33,12 @@ import org.apache.lucene.analysis.Analyzer;
|
|||
import org.apache.lucene.analysis.Token;
|
||||
import org.apache.lucene.analysis.TokenStream;
|
||||
import org.apache.lucene.analysis.WhitespaceAnalyzer;
|
||||
import org.apache.lucene.analysis.tokenattributes.FlagsAttribute;
|
||||
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
|
||||
import org.apache.lucene.analysis.tokenattributes.PayloadAttribute;
|
||||
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
|
||||
import org.apache.lucene.analysis.tokenattributes.TermAttribute;
|
||||
import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
|
||||
import org.apache.lucene.index.IndexReader;
|
||||
import org.apache.solr.common.SolrException;
|
||||
import org.apache.solr.common.params.CommonParams;
|
||||
|
@ -332,7 +338,7 @@ public class SpellCheckComponent extends SearchComponent implements SolrCoreAwar
|
|||
// create token
|
||||
SpellCheckResponse.Suggestion suggestion = origVsSuggestion.get(original);
|
||||
Token token = new Token();
|
||||
token.setTermText(original);
|
||||
token.setTermBuffer(original);
|
||||
token.setStartOffset(suggestion.getStartOffset());
|
||||
token.setEndOffset(suggestion.getEndOffset());
|
||||
|
||||
|
@ -364,10 +370,24 @@ public class SpellCheckComponent extends SearchComponent implements SolrCoreAwar
|
|||
|
||||
private Collection<Token> getTokens(String q, Analyzer analyzer) throws IOException {
|
||||
Collection<Token> result = new ArrayList<Token>();
|
||||
Token token = null;
|
||||
TokenStream ts = analyzer.reusableTokenStream("", new StringReader(q));
|
||||
ts.reset();
|
||||
while ((token = ts.next()) != null){
|
||||
// TODO: support custom attributes
|
||||
TermAttribute termAtt = (TermAttribute) ts.addAttribute(TermAttribute.class);
|
||||
OffsetAttribute offsetAtt = (OffsetAttribute) ts.addAttribute(OffsetAttribute.class);
|
||||
TypeAttribute typeAtt = (TypeAttribute) ts.addAttribute(TypeAttribute.class);
|
||||
FlagsAttribute flagsAtt = (FlagsAttribute) ts.addAttribute(FlagsAttribute.class);
|
||||
PayloadAttribute payloadAtt = (PayloadAttribute) ts.addAttribute(PayloadAttribute.class);
|
||||
PositionIncrementAttribute posIncAtt = (PositionIncrementAttribute) ts.addAttribute(PositionIncrementAttribute.class);
|
||||
|
||||
while (ts.incrementToken()){
|
||||
Token token = new Token();
|
||||
token.setTermBuffer(termAtt.termBuffer(), 0, termAtt.termLength());
|
||||
token.setOffset(offsetAtt.startOffset(), offsetAtt.endOffset());
|
||||
token.setType(typeAtt.type());
|
||||
token.setFlags(flagsAtt.getFlags());
|
||||
token.setPayload(payloadAtt.getPayload());
|
||||
token.setPositionIncrement(posIncAtt.getPositionIncrement());
|
||||
result.add(token);
|
||||
}
|
||||
return result;
|
||||
|
|
|
@ -113,7 +113,7 @@ public class TermVectorComponent extends SearchComponent implements SolrCoreAwar
|
|||
IndexSchema schema = rb.req.getSchema();
|
||||
String uniqFieldName = schema.getUniqueKeyField().getName();
|
||||
//Only load the id field
|
||||
SetBasedFieldSelector fieldSelector = new SetBasedFieldSelector(Collections.singleton(uniqFieldName), Collections.emptySet());
|
||||
SetBasedFieldSelector fieldSelector = new SetBasedFieldSelector(Collections.singleton(uniqFieldName), Collections.<String>emptySet());
|
||||
while (iter.hasNext()) {
|
||||
Integer docId = iter.next();
|
||||
NamedList docNL = new NamedList();
|
||||
|
|
|
@ -32,6 +32,7 @@ import org.apache.lucene.analysis.CachingTokenFilter;
|
|||
import org.apache.lucene.analysis.Token;
|
||||
import org.apache.lucene.analysis.TokenFilter;
|
||||
import org.apache.lucene.analysis.TokenStream;
|
||||
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
|
||||
import org.apache.lucene.document.Document;
|
||||
import org.apache.lucene.search.Query;
|
||||
import org.apache.lucene.search.highlight.*;
|
||||
|
@ -39,6 +40,7 @@ import org.apache.lucene.search.vectorhighlight.FastVectorHighlighter;
|
|||
import org.apache.lucene.search.vectorhighlight.FieldQuery;
|
||||
import org.apache.lucene.search.vectorhighlight.FragListBuilder;
|
||||
import org.apache.lucene.search.vectorhighlight.FragmentsBuilder;
|
||||
import org.apache.lucene.util.AttributeSource.State;
|
||||
import org.apache.solr.common.SolrException;
|
||||
import org.apache.solr.common.params.HighlightParams;
|
||||
import org.apache.solr.common.params.SolrParams;
|
||||
|
@ -512,28 +514,28 @@ public class DefaultSolrHighlighter extends SolrHighlighter implements PluginInf
|
|||
*/
|
||||
class TokenOrderingFilter extends TokenFilter {
|
||||
private final int windowSize;
|
||||
private final LinkedList<Token> queue = new LinkedList<Token>();
|
||||
private final LinkedList<OrderedToken> queue = new LinkedList<OrderedToken>();
|
||||
private boolean done=false;
|
||||
|
||||
private final OffsetAttribute offsetAtt = (OffsetAttribute) addAttribute(OffsetAttribute.class);
|
||||
|
||||
protected TokenOrderingFilter(TokenStream input, int windowSize) {
|
||||
super(input);
|
||||
this.windowSize = windowSize;
|
||||
}
|
||||
|
||||
@Override
|
||||
public Token next() throws IOException {
|
||||
public boolean incrementToken() throws IOException {
|
||||
while (!done && queue.size() < windowSize) {
|
||||
Token newTok = input.next();
|
||||
if (newTok==null) {
|
||||
done=true;
|
||||
if (!input.incrementToken()) {
|
||||
done = true;
|
||||
break;
|
||||
}
|
||||
|
||||
// reverse iterating for better efficiency since we know the
|
||||
// list is already sorted, and most token start offsets will be too.
|
||||
ListIterator<Token> iter = queue.listIterator(queue.size());
|
||||
ListIterator<OrderedToken> iter = queue.listIterator(queue.size());
|
||||
while(iter.hasPrevious()) {
|
||||
if (newTok.startOffset() >= iter.previous().startOffset()) {
|
||||
if (offsetAtt.startOffset() >= iter.previous().startOffset) {
|
||||
// insertion will be before what next() would return (what
|
||||
// we just compared against), so move back one so the insertion
|
||||
// will be after.
|
||||
|
@ -541,50 +543,82 @@ class TokenOrderingFilter extends TokenFilter {
|
|||
break;
|
||||
}
|
||||
}
|
||||
iter.add(newTok);
|
||||
OrderedToken ot = new OrderedToken();
|
||||
ot.state = captureState();
|
||||
ot.startOffset = offsetAtt.startOffset();
|
||||
iter.add(ot);
|
||||
}
|
||||
|
||||
return queue.isEmpty() ? null : queue.removeFirst();
|
||||
if (queue.isEmpty()) {
|
||||
return false;
|
||||
} else {
|
||||
restoreState(queue.removeFirst().state);
|
||||
return true;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// for TokenOrderingFilter, so it can easily sort by startOffset
|
||||
class OrderedToken {
|
||||
State state;
|
||||
int startOffset;
|
||||
}
|
||||
|
||||
class TermOffsetsTokenStream {
|
||||
|
||||
TokenStream bufferedTokenStream = null;
|
||||
Token bufferedToken;
|
||||
OffsetAttribute bufferedOffsetAtt;
|
||||
State bufferedToken;
|
||||
int bufferedStartOffset;
|
||||
int bufferedEndOffset;
|
||||
int startOffset;
|
||||
int endOffset;
|
||||
|
||||
public TermOffsetsTokenStream( TokenStream tstream ){
|
||||
bufferedTokenStream = tstream;
|
||||
bufferedOffsetAtt = (OffsetAttribute) bufferedTokenStream.addAttribute(OffsetAttribute.class);
|
||||
startOffset = 0;
|
||||
bufferedToken = null;
|
||||
}
|
||||
|
||||
public TokenStream getMultiValuedTokenStream( final int length ){
|
||||
endOffset = startOffset + length;
|
||||
return new TokenStream(){
|
||||
Token token;
|
||||
public Token next() throws IOException {
|
||||
return new MultiValuedStream(length);
|
||||
}
|
||||
|
||||
class MultiValuedStream extends TokenStream {
|
||||
private final int length;
|
||||
OffsetAttribute offsetAtt = (OffsetAttribute) addAttribute(OffsetAttribute.class);
|
||||
|
||||
MultiValuedStream(int length) {
|
||||
super(bufferedTokenStream.cloneAttributes());
|
||||
this.length = length;
|
||||
}
|
||||
|
||||
public boolean incrementToken() throws IOException {
|
||||
while( true ){
|
||||
if( bufferedToken == null )
|
||||
bufferedToken = bufferedTokenStream.next();
|
||||
if( bufferedToken == null ) return null;
|
||||
if( startOffset <= bufferedToken.startOffset() &&
|
||||
bufferedToken.endOffset() <= endOffset ){
|
||||
token = bufferedToken;
|
||||
bufferedToken = null;
|
||||
token.setStartOffset( token.startOffset() - startOffset );
|
||||
token.setEndOffset( token.endOffset() - startOffset );
|
||||
return token;
|
||||
if( bufferedToken == null ) {
|
||||
if (!bufferedTokenStream.incrementToken())
|
||||
return false;
|
||||
bufferedToken = bufferedTokenStream.captureState();
|
||||
bufferedStartOffset = bufferedOffsetAtt.startOffset();
|
||||
bufferedEndOffset = bufferedOffsetAtt.endOffset();
|
||||
}
|
||||
else if( bufferedToken.endOffset() > endOffset ){
|
||||
|
||||
if( startOffset <= bufferedStartOffset &&
|
||||
bufferedEndOffset <= endOffset ){
|
||||
restoreState(bufferedToken);
|
||||
bufferedToken = null;
|
||||
offsetAtt.setOffset( offsetAtt.startOffset() - startOffset, offsetAtt.endOffset() - startOffset );
|
||||
return true;
|
||||
}
|
||||
else if( bufferedEndOffset > endOffset ){
|
||||
startOffset += length + 1;
|
||||
return null;
|
||||
return false;
|
||||
}
|
||||
bufferedToken = null;
|
||||
}
|
||||
}
|
||||
};
|
||||
}
|
||||
}
|
||||
|
||||
};
|
||||
};
|
||||
|
|
|
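
Both TokenOrderingFilter and TermOffsetsTokenStream above now buffer whole attribute states instead of Token objects, using AttributeSource.captureState()/restoreState(). A self-contained sketch of that buffering idiom, with a made-up filter that simply emits every token twice:

    import java.io.IOException;
    import org.apache.lucene.analysis.TokenFilter;
    import org.apache.lucene.analysis.TokenStream;
    import org.apache.lucene.util.AttributeSource.State;

    // Hypothetical example: emit each token twice by capturing and replaying its attribute state.
    final class RepeatTwiceFilter extends TokenFilter {
      private State pending;   // saved copy of the last token's attributes

      RepeatTwiceFilter(TokenStream input) {
        super(input);
      }

      @Override
      public boolean incrementToken() throws IOException {
        if (pending != null) {           // replay the saved token
          restoreState(pending);
          pending = null;
          return true;
        }
        if (!input.incrementToken()) {
          return false;
        }
        pending = captureState();        // remember the current attributes for the repeat
        return true;
      }
    }
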
@ -176,7 +176,7 @@ public abstract class BaseResponseWriter {
|
|||
Object val = null;
|
||||
if (ft == null) { // handle fields not in the schema
|
||||
if (f.isBinary())
|
||||
val = f.binaryValue();
|
||||
val = f.getBinaryValue();
|
||||
else
|
||||
val = f.stringValue();
|
||||
} else {
|
||||
|
|
|
@ -140,7 +140,7 @@ public class BinaryResponseWriter implements BinaryQueryResponseWriter {
|
|||
if(sf != null) ft =sf.getType();
|
||||
Object val;
|
||||
if (ft == null) { // handle fields not in the schema
|
||||
if (f.isBinary()) val = f.binaryValue();
|
||||
if (f.isBinary()) val = f.getBinaryValue();
|
||||
else val = f.stringValue();
|
||||
} else {
|
||||
try {
|
||||
|
|
|
@ -58,8 +58,10 @@ public abstract class CompressableField extends FieldType {
|
|||
String internalVal) {
|
||||
/* compress field if length exceeds threshold */
|
||||
if(field.isCompressed()) {
|
||||
return internalVal.length() >= compressThreshold ?
|
||||
Field.Store.COMPRESS : Field.Store.YES;
|
||||
// nocommit: handle compression
|
||||
//return internalVal.length() >= compressThreshold ?
|
||||
// Field.Store.COMPRESS : Field.Store.YES;
|
||||
return Field.Store.YES;
|
||||
} else
|
||||
return super.getFieldStore(field, internalVal);
|
||||
}
|
||||
|
|
|
@ -302,8 +302,8 @@ public abstract class FieldType extends FieldProperties {
|
|||
}
|
||||
protected Field.Index getFieldIndex(SchemaField field,
|
||||
String internalVal) {
|
||||
return field.indexed() ? (isTokenized() ? Field.Index.TOKENIZED :
|
||||
Field.Index.UN_TOKENIZED) : Field.Index.NO;
|
||||
return field.indexed() ? (isTokenized() ? Field.Index.ANALYZED :
|
||||
Field.Index.NOT_ANALYZED) : Field.Index.NO;
|
||||
}
|
||||
|
||||
/**
|
||||
|
|
|
@ -63,7 +63,7 @@ public class TrieDateField extends DateField {
|
|||
|
||||
@Override
|
||||
public Date toObject(Fieldable f) {
|
||||
byte[] arr = f.binaryValue();
|
||||
byte[] arr = f.getBinaryValue();
|
||||
if (arr==null) throw new SolrException(SolrException.ErrorCode.SERVER_ERROR,TrieField.badFieldString(f));
|
||||
return new Date(TrieField.toLong(arr));
|
||||
}
|
||||
|
@ -85,7 +85,7 @@ public class TrieDateField extends DateField {
|
|||
|
||||
@Override
|
||||
public void write(XMLWriter xmlWriter, String name, Fieldable f) throws IOException {
|
||||
byte[] arr = f.binaryValue();
|
||||
byte[] arr = f.getBinaryValue();
|
||||
if (arr==null) {
|
||||
xmlWriter.writeStr(name, TrieField.badFieldString(f));
|
||||
return;
|
||||
|
@ -96,7 +96,7 @@ public class TrieDateField extends DateField {
|
|||
|
||||
@Override
|
||||
public void write(TextResponseWriter writer, String name, Fieldable f) throws IOException {
|
||||
byte[] arr = f.binaryValue();
|
||||
byte[] arr = f.getBinaryValue();
|
||||
if (arr==null) {
|
||||
writer.writeStr(name, TrieField.badFieldString(f),true);
|
||||
return;
|
||||
|
@ -136,7 +136,7 @@ public class TrieDateField extends DateField {
|
|||
|
||||
@Override
|
||||
public String toExternal(Fieldable f) {
|
||||
byte[] arr = f.binaryValue();
|
||||
byte[] arr = f.getBinaryValue();
|
||||
if (arr==null) return TrieField.badFieldString(f);
|
||||
return super.toExternal(new Date(TrieField.toLong(arr)));
|
||||
}
|
||||
|
|
|
@ -93,7 +93,7 @@ public class TrieField extends FieldType {
|
|||
|
||||
@Override
|
||||
public Object toObject(Fieldable f) {
|
||||
byte[] arr = f.binaryValue();
|
||||
byte[] arr = f.getBinaryValue();
|
||||
if (arr==null) return badFieldString(f);
|
||||
switch (type) {
|
||||
case INTEGER:
|
||||
|
@ -145,7 +145,7 @@ public class TrieField extends FieldType {
|
|||
}
|
||||
|
||||
public void write(XMLWriter xmlWriter, String name, Fieldable f) throws IOException {
|
||||
byte[] arr = f.binaryValue();
|
||||
byte[] arr = f.getBinaryValue();
|
||||
if (arr==null) {
|
||||
xmlWriter.writeStr(name, badFieldString(f));
|
||||
return;
|
||||
|
@ -173,7 +173,7 @@ public class TrieField extends FieldType {
|
|||
}
|
||||
|
||||
public void write(TextResponseWriter writer, String name, Fieldable f) throws IOException {
|
||||
byte[] arr = f.binaryValue();
|
||||
byte[] arr = f.getBinaryValue();
|
||||
if (arr==null) {
|
||||
writer.writeStr(name, badFieldString(f),true);
|
||||
return;
|
||||
|
@ -352,7 +352,7 @@ public class TrieField extends FieldType {
|
|||
|
||||
@Override
|
||||
public String toExternal(Fieldable f) {
|
||||
byte[] arr = f.binaryValue();
|
||||
byte[] arr = f.getBinaryValue();
|
||||
if (arr==null) return badFieldString(f);
|
||||
switch (type) {
|
||||
case INTEGER:
|
||||
|
|
|
@ -17,7 +17,6 @@
|
|||
|
||||
package org.apache.solr.search;
|
||||
|
||||
import org.apache.lucene.search.HitCollector;
|
||||
import org.apache.lucene.search.Collector;
|
||||
import org.apache.lucene.search.Scorer;
|
||||
import org.apache.lucene.util.OpenBitSet;
|
||||
|
|
|
@ -25,7 +25,6 @@ import org.apache.lucene.index.TermEnum;
|
|||
import org.apache.lucene.index.TermDocs;
|
||||
import org.apache.lucene.util.OpenBitSet;
|
||||
|
||||
import java.util.BitSet;
|
||||
import java.io.IOException;
|
||||
|
||||
/**
|
||||
|
@ -40,17 +39,6 @@ public class PrefixFilter extends Filter {
|
|||
|
||||
Term getPrefix() { return prefix; }
|
||||
|
||||
@Override
|
||||
public BitSet bits(IndexReader reader) throws IOException {
|
||||
final BitSet bitSet = new BitSet(reader.maxDoc());
|
||||
new PrefixGenerator(prefix) {
|
||||
public void handleDoc(int doc) {
|
||||
bitSet.set(doc);
|
||||
}
|
||||
}.generate(reader);
|
||||
return bitSet;
|
||||
}
|
||||
|
||||
@Override
|
||||
public DocIdSet getDocIdSet(IndexReader reader) throws IOException {
|
||||
final OpenBitSet bitSet = new OpenBitSet(reader.maxDoc());
|
||||
|
|
|
@ -17,10 +17,14 @@
|
|||
|
||||
package org.apache.solr.search;
|
||||
|
||||
import org.apache.lucene.search.FieldComparator;
|
||||
import org.apache.lucene.search.Query;
|
||||
import org.apache.lucene.search.Sort;
|
||||
import org.apache.lucene.search.SortField;
|
||||
import org.apache.solr.common.SolrException.ErrorCode;
|
||||
import org.apache.solr.common.SolrException;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.util.List;
|
||||
|
||||
/** A hash key encapsulating a query, a list of filters, and a sort
|
||||
|
@ -38,7 +42,7 @@ public final class QueryResultKey {
|
|||
private static SortField[] defaultSort = new SortField[0];
|
||||
|
||||
|
||||
public QueryResultKey(Query query, List<Query> filters, Sort sort, int nc_flags) {
|
||||
public QueryResultKey(Query query, List<Query> filters, Sort sort, int nc_flags) throws IOException {
|
||||
this.query = query;
|
||||
this.sort = sort;
|
||||
this.filters = filters;
|
||||
|
|
|
@ -133,20 +133,10 @@ public class SolrConstantScoreQuery extends ConstantScoreQuery {
|
|||
}
|
||||
}
|
||||
|
||||
/** @deprecated use {@link #nextDoc()} instead. */
|
||||
public boolean next() throws IOException {
|
||||
return docIdSetIterator.nextDoc() != NO_MORE_DOCS;
|
||||
}
|
||||
|
||||
public int nextDoc() throws IOException {
|
||||
return docIdSetIterator.nextDoc();
|
||||
}
|
||||
|
||||
/** @deprecated use {@link #docID()} instead. */
|
||||
public int doc() {
|
||||
return docIdSetIterator.doc();
|
||||
}
|
||||
|
||||
public int docID() {
|
||||
return docIdSetIterator.docID();
|
||||
}
|
||||
|
@ -155,11 +145,6 @@ public class SolrConstantScoreQuery extends ConstantScoreQuery {
|
|||
return theScore;
|
||||
}
|
||||
|
||||
/** @deprecated use {@link #advance(int)} instead. */
|
||||
public boolean skipTo(int target) throws IOException {
|
||||
return docIdSetIterator.advance(target) != NO_MORE_DOCS;
|
||||
}
|
||||
|
||||
public int advance(int target) throws IOException {
|
||||
return docIdSetIterator.advance(target);
|
||||
}
|
||||
|
|
|
@ -454,11 +454,6 @@ public class SolrIndexReader extends FilterIndexReader {
|
|||
return in.getIndexCommit();
|
||||
}
|
||||
|
||||
@Override
|
||||
public int getTermInfosIndexDivisor() {
|
||||
return in.getTermInfosIndexDivisor();
|
||||
}
|
||||
|
||||
@Override
|
||||
public void incRef() {
|
||||
in.incRef();
|
||||
|
@ -479,11 +474,6 @@ public class SolrIndexReader extends FilterIndexReader {
|
|||
in.setNorm(doc, field, value);
|
||||
}
|
||||
|
||||
@Override
|
||||
public void setTermInfosIndexDivisor(int indexDivisor) throws IllegalStateException {
|
||||
in.setTermInfosIndexDivisor(indexDivisor);
|
||||
}
|
||||
|
||||
@Override
|
||||
public TermPositions termPositions(Term term) throws IOException {
|
||||
return in.termPositions(term);
|
||||
|
@ -498,16 +488,6 @@ public class SolrIndexReader extends FilterIndexReader {
|
|||
public Object getFieldCacheKey() {
|
||||
return in.getFieldCacheKey();
|
||||
}
|
||||
|
||||
@Override
|
||||
public boolean getDisableFakeNorms() {
|
||||
return in.getDisableFakeNorms();
|
||||
}
|
||||
|
||||
@Override
|
||||
public void setDisableFakeNorms(boolean disableFakeNorms) {
|
||||
in.setDisableFakeNorms(disableFakeNorms);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
|
|
|
@ -342,22 +342,22 @@ public class SolrIndexSearcher extends IndexSearcher implements SolrInfoMBean {
|
|||
return qr;
|
||||
}
|
||||
|
||||
public Hits search(Query query, Filter filter, Sort sort) throws IOException {
|
||||
// todo - when Solr starts accepting filters, need to
|
||||
// change this conditional check (filter!=null) and create a new filter
|
||||
// that ANDs them together if it already exists.
|
||||
|
||||
if (optimizer==null || filter!=null || !(query instanceof BooleanQuery)
|
||||
) {
|
||||
return super.search(query,filter,sort);
|
||||
} else {
|
||||
Query[] newQuery = new Query[1];
|
||||
Filter[] newFilter = new Filter[1];
|
||||
optimizer.optimize((BooleanQuery)query, this, 0, newQuery, newFilter);
|
||||
|
||||
return super.search(newQuery[0], newFilter[0], sort);
|
||||
}
|
||||
}
|
||||
// public Hits search(Query query, Filter filter, Sort sort) throws IOException {
|
||||
// // todo - when Solr starts accepting filters, need to
|
||||
// // change this conditional check (filter!=null) and create a new filter
|
||||
// // that ANDs them together if it already exists.
|
||||
//
|
||||
// if (optimizer==null || filter!=null || !(query instanceof BooleanQuery)
|
||||
// ) {
|
||||
// return super.search(query,filter,sort);
|
||||
// } else {
|
||||
// Query[] newQuery = new Query[1];
|
||||
// Filter[] newFilter = new Filter[1];
|
||||
// optimizer.optimize((BooleanQuery)query, this, 0, newQuery, newFilter);
|
||||
//
|
||||
// return super.search(newQuery[0], newFilter[0], sort);
|
||||
// }
|
||||
// }
|
||||
|
||||
/**
|
||||
* @return the indexDir on which this searcher is opened
|
||||
|
@@ -697,10 +697,12 @@ public class SolrIndexSearcher extends IndexSearcher implements SolrInfoMBean {
   * This method is not cache-aware and no caches are checked.
   */
  public DocSet convertFilter(Filter lfilter) throws IOException {
    BitSet bs = lfilter.bits(this.reader);
    OpenBitSet obs = new OpenBitSet(bs.size());
    for(int i=bs.nextSetBit(0); i>=0; i=bs.nextSetBit(i+1)) {
      obs.fastSet(i);
    DocIdSet docSet = lfilter.getDocIdSet(this.reader);
    OpenBitSet obs = new OpenBitSet();
    DocIdSetIterator it = docSet.iterator();
    int doc;
    while((doc = it.nextDoc()) != DocIdSetIterator.NO_MORE_DOCS) {
      obs.fastSet(doc);
    }
    return new BitDocSet(obs);
  }

@ -25,15 +25,13 @@ import org.apache.lucene.index.Term;
|
|||
import org.apache.lucene.queryParser.ParseException;
|
||||
import org.apache.lucene.queryParser.QueryParser;
|
||||
import org.apache.lucene.search.*;
|
||||
import org.apache.lucene.util.Version;
|
||||
import org.apache.lucene.analysis.Analyzer;
|
||||
import org.apache.lucene.analysis.reverse.ReverseStringFilter;
|
||||
import org.apache.solr.analysis.*;
|
||||
import org.apache.solr.common.SolrException;
|
||||
import org.apache.solr.schema.FieldType;
|
||||
import org.apache.solr.schema.IndexSchema;
|
||||
import org.apache.solr.schema.SchemaField;
|
||||
import org.apache.solr.schema.TrieField;
|
||||
import org.apache.solr.schema.SchemaField;
|
||||
import org.apache.solr.schema.TextField;
|
||||
|
||||
// TODO: implement the analysis of simple fields with
|
||||
|
@ -77,7 +75,7 @@ public class SolrQueryParser extends QueryParser {
|
|||
* @see IndexSchema#getDefaultSearchFieldName()
|
||||
*/
|
||||
public SolrQueryParser(IndexSchema schema, String defaultField) {
|
||||
super(defaultField == null ? schema.getDefaultSearchFieldName() : defaultField, schema.getQueryAnalyzer());
|
||||
super(Version.LUCENE_24, defaultField == null ? schema.getDefaultSearchFieldName() : defaultField, schema.getQueryAnalyzer());
|
||||
this.schema = schema;
|
||||
this.parser = null;
|
||||
this.defaultField = defaultField;
|
||||
|
@ -91,7 +89,7 @@ public class SolrQueryParser extends QueryParser {
|
|||
}
|
||||
|
||||
public SolrQueryParser(QParser parser, String defaultField, Analyzer analyzer) {
|
||||
super(defaultField, analyzer);
|
||||
super(Version.LUCENE_24, defaultField, analyzer);
|
||||
this.schema = parser.getReq().getSchema();
|
||||
this.parser = parser;
|
||||
this.defaultField = defaultField;
|
||||
|
|
|
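
QueryParser now requires an explicit Version argument; the patch pins Version.LUCENE_24 to keep the older parsing behaviour. A small sketch of constructing and using such a parser; the field name, analyzer and query string are placeholders:

    import org.apache.lucene.analysis.WhitespaceAnalyzer;
    import org.apache.lucene.queryParser.ParseException;
    import org.apache.lucene.queryParser.QueryParser;
    import org.apache.lucene.search.Query;
    import org.apache.lucene.util.Version;

    class ParserExample {
      static Query parse(String queryString) throws ParseException {
        QueryParser qp = new QueryParser(Version.LUCENE_24, "text", new WhitespaceAnalyzer());
        return qp.parse(queryString);
      }
    }
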
@ -26,7 +26,6 @@ import org.apache.lucene.index.TermEnum;
|
|||
import org.apache.lucene.index.TermDocs;
|
||||
import org.apache.lucene.util.OpenBitSet;
|
||||
|
||||
import java.util.BitSet;
|
||||
import java.io.IOException;
|
||||
|
||||
|
||||
|
@ -43,20 +42,6 @@ public class WildcardFilter extends Filter {
|
|||
|
||||
public Term getTerm() { return term; }
|
||||
|
||||
/**
|
||||
* @deprecated Use {@link #getDocIdSet(IndexReader)} instead.
|
||||
*/
|
||||
@Override
|
||||
public BitSet bits(IndexReader reader) throws IOException {
|
||||
final BitSet bitSet = new BitSet(reader.maxDoc());
|
||||
new WildcardGenerator(term) {
|
||||
public void handleDoc(int doc) {
|
||||
bitSet.set(doc);
|
||||
}
|
||||
}.generate(reader);
|
||||
return bitSet;
|
||||
}
|
||||
|
||||
@Override
|
||||
public DocIdSet getDocIdSet(IndexReader reader) throws IOException {
|
||||
final OpenBitSet bitSet = new OpenBitSet(reader.maxDoc());
|
||||
|
|
|
@ -100,7 +100,7 @@ public class FunctionQuery extends Query {
|
|||
int[] offsets = topReader.getLeafOffsets();
|
||||
int readerPos = SolrIndexReader.readerIndex(doc, offsets);
|
||||
int readerBase = offsets[readerPos];
|
||||
return scorer(subReaders[readerPos], true, true).explain(doc-readerBase);
|
||||
return ((AllScorer)scorer(subReaders[readerPos], true, true)).explain(doc-readerBase);
|
||||
}
|
||||
}
|
||||
|
||||
|
|
|
@ -33,7 +33,6 @@ import org.apache.lucene.index.Term;
|
|||
import org.apache.lucene.search.spell.Dictionary;
|
||||
import org.apache.lucene.search.spell.LevensteinDistance;
|
||||
import org.apache.lucene.search.spell.SpellChecker;
|
||||
import org.apache.lucene.search.spell.StringDistance;
|
||||
import org.apache.lucene.store.Directory;
|
||||
import org.apache.lucene.store.FSDirectory;
|
||||
import org.apache.lucene.store.RAMDirectory;
|
||||
|
@ -184,7 +183,7 @@ public abstract class AbstractLuceneSpellChecker extends SolrSpellChecker {
|
|||
*/
|
||||
protected void initIndex() throws IOException {
|
||||
if (indexDir != null) {
|
||||
index = FSDirectory.getDirectory(indexDir);
|
||||
index = FSDirectory.open(new File(indexDir));
|
||||
} else {
|
||||
index = new RAMDirectory();
|
||||
}
|
||||
|
|
|
@ -22,12 +22,10 @@ import java.util.List;
|
|||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
||||
import org.apache.lucene.analysis.WhitespaceAnalyzer;
|
||||
import org.apache.lucene.document.Document;
|
||||
import org.apache.lucene.document.Field;
|
||||
import org.apache.lucene.index.IndexReader;
|
||||
import org.apache.lucene.index.IndexWriter;
|
||||
import org.apache.lucene.search.IndexSearcher;
|
||||
import org.apache.lucene.search.spell.PlainTextDictionary;
|
||||
import org.apache.lucene.store.RAMDirectory;
|
||||
import org.apache.solr.common.util.NamedList;
|
||||
|
@ -98,7 +96,7 @@ public class FileBasedSpellChecker extends AbstractLuceneSpellChecker {
|
|||
|
||||
for (String s : lines) {
|
||||
Document d = new Document();
|
||||
d.add(new Field(WORD_FIELD_NAME, s, Field.Store.NO, Field.Index.TOKENIZED));
|
||||
d.add(new Field(WORD_FIELD_NAME, s, Field.Store.NO, Field.Index.ANALYZED));
|
||||
writer.addDocument(d);
|
||||
}
|
||||
writer.optimize();
|
||||
|
|
|
@ -16,17 +16,14 @@ package org.apache.solr.spelling;
|
|||
* limitations under the License.
|
||||
*/
|
||||
|
||||
import org.apache.lucene.analysis.WhitespaceAnalyzer;
|
||||
import org.apache.lucene.index.IndexReader;
|
||||
import org.apache.lucene.store.FSDirectory;
|
||||
import org.apache.solr.common.util.NamedList;
|
||||
import org.apache.solr.core.SolrCore;
|
||||
import org.apache.solr.core.SolrResourceLoader;
|
||||
import org.apache.solr.schema.FieldType;
|
||||
import org.apache.solr.schema.IndexSchema;
|
||||
import org.apache.solr.search.SolrIndexSearcher;
|
||||
import org.apache.solr.util.HighFrequencyDictionary;
|
||||
|
||||
import java.io.File;
|
||||
import java.io.IOException;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
@ -63,7 +60,7 @@ public class IndexBasedSpellChecker extends AbstractLuceneSpellChecker {
|
|||
private void initSourceReader() {
|
||||
if (sourceLocation != null) {
|
||||
try {
|
||||
FSDirectory luceneIndexDir = FSDirectory.getDirectory(sourceLocation);
|
||||
FSDirectory luceneIndexDir = FSDirectory.open(new File(sourceLocation));
|
||||
this.reader = IndexReader.open(luceneIndexDir);
|
||||
} catch (IOException e) {
|
||||
throw new RuntimeException(e);
|
||||
|
|
|
@ -27,6 +27,11 @@ import java.util.regex.Pattern;
|
|||
|
||||
import org.apache.lucene.analysis.Token;
|
||||
import org.apache.lucene.analysis.TokenStream;
|
||||
import org.apache.lucene.analysis.tokenattributes.FlagsAttribute;
|
||||
import org.apache.lucene.analysis.tokenattributes.PayloadAttribute;
|
||||
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
|
||||
import org.apache.lucene.analysis.tokenattributes.TermAttribute;
|
||||
import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
|
||||
|
||||
|
||||
/**
|
||||
|
@ -99,10 +104,22 @@ public class SpellingQueryConverter extends QueryConverter {
|
|||
if (word.equals("AND") == false && word.equals("OR") == false) {
|
||||
try {
|
||||
stream = analyzer.reusableTokenStream("", new StringReader(word));
|
||||
Token token;
|
||||
while ((token = stream.next()) != null) {
|
||||
// TODO: support custom attributes
|
||||
TermAttribute termAtt = (TermAttribute) stream.addAttribute(TermAttribute.class);
|
||||
FlagsAttribute flagsAtt = (FlagsAttribute) stream.addAttribute(FlagsAttribute.class);
|
||||
TypeAttribute typeAtt = (TypeAttribute) stream.addAttribute(TypeAttribute.class);
|
||||
PayloadAttribute payloadAtt = (PayloadAttribute) stream.addAttribute(PayloadAttribute.class);
|
||||
PositionIncrementAttribute posIncAtt = (PositionIncrementAttribute) stream.addAttribute(PositionIncrementAttribute.class);
|
||||
stream.reset();
|
||||
while (stream.incrementToken()) {
|
||||
Token token = new Token();
|
||||
token.setTermBuffer(termAtt.termBuffer(), 0, termAtt.termLength());
|
||||
token.setStartOffset(matcher.start());
|
||||
token.setEndOffset(matcher.end());
|
||||
token.setFlags(flagsAtt.getFlags());
|
||||
token.setType(typeAtt.type());
|
||||
token.setPayload(payloadAtt.getPayload());
|
||||
token.setPositionIncrement(posIncAtt.getPositionIncrement());
|
||||
result.add(token);
|
||||
}
|
||||
} catch (IOException e) {
|
||||
|
|
|
@@ -66,12 +66,11 @@ public class OldRequestHandler implements SolrRequestHandler {
      sort = QueryParsing.parseSort(commands.get(1), req.getSchema());
    }

    Hits hits=null;

    try {
      hits = req.getSearcher().search(query,filter,sort);
      TopFieldDocs hits = req.getSearcher().search(query,filter, req.getStart()+req.getLimit(), sort);

      int numHits = hits.length();
      int numHits = hits.totalHits;
      int startRow = Math.min(numHits, req.getStart());
      int endRow = Math.min(numHits,req.getStart()+req.getLimit());
      int numRows = endRow-startRow;

@@ -79,8 +78,8 @@ public class OldRequestHandler implements SolrRequestHandler {
      int[] ids = new int[numRows];
      Document[] data = new Document[numRows];
      for (int i=startRow; i<endRow; i++) {
        ids[i] = hits.id(i);
        data[i] = hits.doc(i);
        ids[i] = hits.scoreDocs[i].doc;
        data[i] = req.getSearcher().doc(ids[i]);
      }

      rsp.add(null, new DocSlice(0,numRows,ids,null,numHits,0.0f));

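
With Hits removed in Lucene 3.0, a search asks for an explicit number of results and walks scoreDocs, as the hunk above does. A compact standalone sketch of the same pattern; the index path, field and value are placeholders, not taken from the patch:

    import java.io.File;
    import java.io.IOException;

    import org.apache.lucene.document.Document;
    import org.apache.lucene.index.Term;
    import org.apache.lucene.search.IndexSearcher;
    import org.apache.lucene.search.ScoreDoc;
    import org.apache.lucene.search.TermQuery;
    import org.apache.lucene.search.TopDocs;
    import org.apache.lucene.store.FSDirectory;

    class SearchExample {
      static void run() throws IOException {
        IndexSearcher searcher = new IndexSearcher(FSDirectory.open(new File("/path/to/index")), true);
        TopDocs top = searcher.search(new TermQuery(new Term("id", "55")), 10);
        for (ScoreDoc sd : top.scoreDocs) {
          Document doc = searcher.doc(sd.doc);   // fetch stored fields per hit
          System.out.println(doc.get("id"));
        }
        searcher.close();
      }
    }
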
@ -144,7 +144,7 @@ public class TestRequestHandler implements SolrRequestHandler {
|
|||
nl.add("myLong",999999999999L);
|
||||
|
||||
Document doc = new Document();
|
||||
doc.add(new Field("id","55",Field.Store.YES, Field.Index.UN_TOKENIZED));
|
||||
doc.add(new Field("id","55",Field.Store.YES, Field.Index.NOT_ANALYZED));
|
||||
nl.add("myDoc",doc);
|
||||
|
||||
nl.add("myResult",results);
|
||||
|
@ -172,8 +172,8 @@ public class TestRequestHandler implements SolrRequestHandler {
|
|||
//
|
||||
// test against hits
|
||||
//
|
||||
Hits hits = searcher.search(query, lfilter, sort);
|
||||
test(hits.length() == results.matches());
|
||||
TopFieldDocs hits = searcher.search(query, lfilter, 1000, sort);
|
||||
test(hits.totalHits == results.matches());
|
||||
|
||||
|
||||
DocList rrr2 = results.subset(start,limit);
|
||||
|
@@ -189,7 +189,7 @@ public class TestRequestHandler implements SolrRequestHandler {
     ***/
 
     for (int i=0; i<results.size(); i++) {
-      test( iter.nextDoc() == hits.id(i+results.offset()) );
+      test( iter.nextDoc() == hits.scoreDocs[i].doc);
 
       // Document doesn't implement equals()
       // test( searcher.document(i).equals(hits.doc(i)));
@@ -161,29 +161,32 @@ public class SolrIndexWriter extends IndexWriter {
    *
    */
   public SolrIndexWriter(String name, String path, DirectoryFactory dirFactory, boolean create, IndexSchema schema) throws IOException {
-    super(getDirectory(path, dirFactory, null), false, schema.getAnalyzer(), create);
+    super(getDirectory(path, dirFactory, null), schema.getAnalyzer(), create, MaxFieldLength.LIMITED);
     init(name, schema, null);
   }
 
   @Deprecated
+  // nocommit: remove?
   public SolrIndexWriter(String name, String path, DirectoryFactory dirFactory, boolean create, IndexSchema schema, SolrIndexConfig config) throws IOException {
-    super(getDirectory(path, dirFactory, null), config.luceneAutoCommit, schema.getAnalyzer(), create);
+    super(getDirectory(path, dirFactory, null), schema.getAnalyzer(), create, MaxFieldLength.LIMITED);
     init(name, schema, config);
   }
 
   /**
    * @deprecated
    */
+  // nocommit: remove?
   public SolrIndexWriter(String name, String path, boolean create, IndexSchema schema) throws IOException {
-    super(getDirectory(path, null), false, schema.getAnalyzer(), create);
+    super(getDirectory(path, null), schema.getAnalyzer(), create, MaxFieldLength.LIMITED);
     init(name, schema, null);
   }
 
   /**
    * @deprecated
    */
+  // nocommit: remove?
   public SolrIndexWriter(String name, String path, boolean create, IndexSchema schema, SolrIndexConfig config) throws IOException {
-    super(getDirectory(path, config), config.luceneAutoCommit, schema.getAnalyzer(), create);
+    super(getDirectory(path, config), schema.getAnalyzer(), create, MaxFieldLength.LIMITED);
     init(name, schema, config);
   }
 
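All four constructors change the same way: the autoCommit boolean was removed from IndexWriter in Lucene 3.0, and a MaxFieldLength argument is now required. A sketch of the surviving constructor form, with placeholder arguments rather than the patch's getDirectory()/schema plumbing:

    import java.io.IOException;
    import org.apache.lucene.analysis.Analyzer;
    import org.apache.lucene.index.IndexWriter;
    import org.apache.lucene.store.Directory;

    class WriterCtorSketch {
      static IndexWriter open(Directory dir, Analyzer analyzer, boolean create) throws IOException {
        // 2.x form was new IndexWriter(dir, autoCommit, analyzer, create); autoCommit no longer exists.
        return new IndexWriter(dir, analyzer, create, IndexWriter.MaxFieldLength.LIMITED);
      }
    }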
@@ -18,11 +18,13 @@
 package org.apache.solr.update;
 
 import org.apache.lucene.index.IndexReader;
 import org.apache.lucene.index.Term;
 import org.apache.lucene.document.Document;
 import org.apache.lucene.document.Field;
 import org.apache.lucene.document.Fieldable;
-import org.apache.lucene.search.HitCollector;
+import org.apache.lucene.search.Collector;
+import org.apache.lucene.search.Scorer;
 
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
@@ -135,17 +137,19 @@ public abstract class UpdateHandler implements SolrInfoMBean {
   public abstract void close() throws IOException;
 
 
-  static class DeleteHitCollector extends HitCollector {
+  static class DeleteHitCollector extends Collector {
     public int deleted=0;
     public final SolrIndexSearcher searcher;
+    private int docBase;
 
     public DeleteHitCollector(SolrIndexSearcher searcher) {
       this.searcher = searcher;
     }
 
-    public void collect(int doc, float score) {
+    @Override
+    public void collect(int doc) {
       try {
-        searcher.getReader().deleteDocument(doc);
+        searcher.getReader().deleteDocument(doc + docBase);
         deleted++;
       } catch (IOException e) {
         // don't try to close the searcher on failure for now...
@@ -153,6 +157,21 @@ public abstract class UpdateHandler implements SolrInfoMBean {
         throw new SolrException( SolrException.ErrorCode.SERVER_ERROR,"Error deleting doc# "+doc,e,false);
       }
     }
 
+    @Override
+    public boolean acceptsDocsOutOfOrder() {
+      return false;
+    }
+
+    @Override
+    public void setNextReader(IndexReader arg0, int docBase) throws IOException {
+      this.docBase = docBase;
+    }
+
+    @Override
+    public void setScorer(Scorer scorer) throws IOException {
+
+    }
   }
 
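The two hunks above port DeleteHitCollector from HitCollector to the per-segment Collector API: collect() now receives segment-relative ids, and setNextReader() supplies the docBase that must be added back to get a global id. A minimal counting collector as an illustration of the contract (this is not the patch's delete collector, just a sketch):

    import java.io.IOException;
    import org.apache.lucene.index.IndexReader;
    import org.apache.lucene.search.Collector;
    import org.apache.lucene.search.Scorer;

    class CountingCollector extends Collector {
      int count;
      private int docBase;

      @Override
      public void setScorer(Scorer scorer) throws IOException {
        // scores are not needed for counting
      }

      @Override
      public void setNextReader(IndexReader reader, int docBase) throws IOException {
        this.docBase = docBase; // collect() ids are relative to this segment; global id = docBase + doc
      }

      @Override
      public void collect(int doc) throws IOException {
        count++;
      }

      @Override
      public boolean acceptsDocsOutOfOrder() {
        return true; // order does not matter for a simple count
      }
    }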
@@ -538,21 +538,21 @@ public class BasicFunctionalityTest extends AbstractSolrTestCase {
     SchemaField f; // Solr field type
     Field luf; // Lucene field
 
-    f = ischema.getField("test_hlt");
-    luf = f.createField("test", 0f);
-    assertFalse(luf.isCompressed());
-    assertTrue(luf.isStored());
-
-    f = ischema.getField("test_hlt");
-    luf = f.createField(mkstr(345), 0f);
-    assertTrue(luf.isCompressed());
-    assertTrue(luf.isStored());
-
-    f = ischema.getField("test_hlt_off");
-    luf = f.createField(mkstr(400), 0f);
-    assertFalse(luf.isCompressed());
-    assertTrue(luf.isStored());
+//    f = ischema.getField("test_hlt");
+//    luf = f.createField("test", 0f);
+//    assertFalse(luf.isCompressed());
+//    assertTrue(luf.isStored());
+//
+//    f = ischema.getField("test_hlt");
+//    luf = f.createField(mkstr(345), 0f);
+//    assertTrue(luf.isCompressed());
+//    assertTrue(luf.isStored());
+//
+//    f = ischema.getField("test_hlt_off");
+//    luf = f.createField(mkstr(400), 0f);
+//    assertFalse(luf.isCompressed());
+//    assertTrue(luf.isStored());
+//
   }
 
   public void testNotLazyField() throws IOException {
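The assertions are commented out because compressed stored fields (Field.Store.COMPRESS, Field.isCompressed()) were removed in Lucene 3.0. If compression is still wanted, one hedged option is to compress on the application side with CompressionTools and store the bytes; a sketch under that assumption (class and field names are illustrative):

    import org.apache.lucene.document.CompressionTools;
    import org.apache.lucene.document.Field;

    class CompressedStoredFieldSketch {
      static Field compressed(String name, String value) {
        // Store the compressed bytes; read them back with CompressionTools.decompressString().
        return new Field(name, CompressionTools.compressString(value), Field.Store.YES);
      }
    }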
@@ -37,7 +37,7 @@ public class TestBufferedTokenStream extends BaseTokenTestCase {
   protected Token process(Token t) throws IOException {
     if ("A".equals(new String(t.termBuffer(), 0, t.termLength()))) {
       Token t2 = read();
-      if (t2!=null && "B".equals(new String(t2.termBuffer(), 0, t2.termLength()))) t.setTermText("Q");
+      if (t2!=null && "B".equals(new String(t2.termBuffer(), 0, t2.termLength()))) t.setTermBuffer("Q");
       if (t2!=null) pushBack(t2);
     }
     return t;
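Token.setTermText(String) no longer exists in Lucene 3.0; the char-buffer setters remain, and setTermBuffer(String) copies the string into the token's buffer. A tiny sketch (helper name is made up):

    import org.apache.lucene.analysis.Token;

    class TokenTermSketch {
      static void overwriteTerm(Token t, String newTerm) {
        // setTermText(String) was removed; setTermBuffer(String) copies into the token's char buffer.
        t.setTermBuffer(newTerm);
      }
    }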
@@ -16,6 +16,7 @@
  */
 package org.apache.solr.core;
 
+import java.io.File;
 import java.io.IOException;
 import org.apache.lucene.store.FSDirectory;
 import org.apache.solr.util.AbstractSolrTestCase;
@@ -44,7 +45,7 @@ public class AlternateDirectoryTest extends AbstractSolrTestCase {
 
     public FSDirectory open(String path) throws IOException {
       openCalled = true;
-      return FSDirectory.getDirectory(path);
+      return FSDirectory.open(new File(path));
     }
 
   }
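FSDirectory.getDirectory(String) was removed; Lucene 3.0 opens directories from a java.io.File via FSDirectory.open(), which picks a concrete implementation per platform. Sketch (class name is illustrative):

    import java.io.File;
    import java.io.IOException;
    import org.apache.lucene.store.FSDirectory;

    class OpenDirectorySketch {
      static FSDirectory open(String path) throws IOException {
        // FSDirectory.getDirectory(path) -> FSDirectory.open(new File(path))
        return FSDirectory.open(new File(path));
      }
    }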
@@ -30,11 +30,11 @@ import org.apache.lucene.index.IndexWriter;
 import org.apache.lucene.index.IndexWriter.MaxFieldLength;
 import org.apache.lucene.queryParser.ParseException;
 import org.apache.lucene.queryParser.QueryParser;
-import org.apache.lucene.search.Hits;
 import org.apache.lucene.search.IndexSearcher;
 import org.apache.lucene.search.Query;
 import org.apache.lucene.store.Directory;
 import org.apache.lucene.store.FSDirectory;
+import org.apache.lucene.util.Version;
 import org.apache.solr.common.SolrException;
 import org.apache.solr.util.AbstractSolrTestCase;
 import org.apache.solr.util.TestHarness;
@@ -97,11 +97,11 @@ public class TestArbitraryIndexDir extends AbstractSolrTestCase{
     }
 
     //add a doc in the new index dir
-    Directory dir = FSDirectory.getDirectory(newDir);
-    IndexWriter iw = new IndexWriter(dir, new StandardAnalyzer(), new MaxFieldLength(1000));
+    Directory dir = FSDirectory.open(newDir);
+    IndexWriter iw = new IndexWriter(dir, new StandardAnalyzer(Version.LUCENE_24), new MaxFieldLength(1000));
     Document doc = new Document();
-    doc.add(new Field("id", "2", Field.Store.YES, Field.Index.TOKENIZED));
-    doc.add(new Field("name", "name2", Field.Store.YES, Field.Index.TOKENIZED));
+    doc.add(new Field("id", "2", Field.Store.YES, Field.Index.ANALYZED));
+    doc.add(new Field("name", "name2", Field.Store.YES, Field.Index.ANALYZED));
    iw.addDocument(doc);
    iw.commit();
    iw.close();
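StandardAnalyzer now takes an explicit Version so its behavior stays pinned across upgrades, and the writer is opened over a Directory rather than a path. A sketch of opening a writer this way (the paths, field-length limit, and the LUCENE_24 constant simply mirror the hunk above and are not a recommendation):

    import java.io.File;
    import java.io.IOException;
    import org.apache.lucene.analysis.standard.StandardAnalyzer;
    import org.apache.lucene.index.IndexWriter;
    import org.apache.lucene.store.Directory;
    import org.apache.lucene.store.FSDirectory;
    import org.apache.lucene.util.Version;

    class VersionedWriterSketch {
      static IndexWriter open(File indexDir) throws IOException {
        Directory dir = FSDirectory.open(indexDir);
        // The Version argument pins analyzer behavior for index compatibility.
        return new IndexWriter(dir, new StandardAnalyzer(Version.LUCENE_24),
            new IndexWriter.MaxFieldLength(1000));
      }
    }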
@@ -158,12 +158,11 @@ public class HighlighterTest extends AbstractSolrTestCase {
       TokenStream ts1 = tots.getMultiValuedTokenStream( v.length() );
       Analyzer a2 = new WhitespaceAnalyzer();
       TokenStream ts2 = a2.tokenStream( "", new StringReader( v ) );
-      Token t1 = new Token();
-      Token t2 = new Token();
-      for( t1 = ts1.next( t1 ); t1 != null; t1 = ts1.next( t1 ) ){
-        t2 = ts2.next( t2 );
-        assertEquals( t2, t1 );
+      while (ts1.incrementToken()) {
+        assertTrue(ts2.incrementToken());
+        assertEquals(ts1, ts2);
       }
+      assertFalse(ts2.incrementToken());
     }
   }
 
@@ -25,6 +25,7 @@ import org.apache.lucene.index.IndexWriter;
 import org.apache.lucene.search.spell.JaroWinklerDistance;
 import org.apache.lucene.search.spell.SpellChecker;
 import org.apache.lucene.search.spell.StringDistance;
+import org.apache.lucene.store.FSDirectory;
 import org.apache.solr.common.util.NamedList;
 import org.apache.solr.core.SolrCore;
 import org.apache.solr.util.AbstractSolrTestCase;
@@ -254,10 +255,10 @@ public class IndexBasedSpellCheckerTest extends AbstractSolrTestCase {
     File indexDir = new File(tmpDir, "spellingIdx" + new Date().getTime());
     //create a standalone index
     File altIndexDir = new File(tmpDir, "alternateIdx" + new Date().getTime());
-    IndexWriter iw = new IndexWriter(altIndexDir, new WhitespaceAnalyzer(), IndexWriter.MaxFieldLength.LIMITED);
+    IndexWriter iw = new IndexWriter(FSDirectory.open(altIndexDir), new WhitespaceAnalyzer(), IndexWriter.MaxFieldLength.LIMITED);
     for (int i = 0; i < ALT_DOCS.length; i++) {
       Document doc = new Document();
-      doc.add(new Field("title", ALT_DOCS[i], Field.Store.YES, Field.Index.TOKENIZED));
+      doc.add(new Field("title", ALT_DOCS[i], Field.Store.YES, Field.Index.ANALYZED));
       iw.addDocument(doc);
     }
     iw.optimize();
@@ -19,6 +19,12 @@ package org.apache.solr.spelling;
 import org.apache.lucene.analysis.Token;
 import org.apache.lucene.analysis.WhitespaceAnalyzer;
 import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.tokenattributes.FlagsAttribute;
+import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
+import org.apache.lucene.analysis.tokenattributes.PayloadAttribute;
+import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
+import org.apache.lucene.analysis.tokenattributes.TermAttribute;
+import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
 
 import java.util.Collection;
 import java.util.HashSet;
@@ -36,9 +42,24 @@ class SimpleQueryConverter extends SpellingQueryConverter{
     Collection<Token> result = new HashSet<Token>();
     WhitespaceAnalyzer analyzer = new WhitespaceAnalyzer();
     TokenStream ts = analyzer.tokenStream("", new StringReader(origQuery));
-    Token tok = null;
+    // TODO: support custom attributes
+    TermAttribute termAtt = (TermAttribute) ts.addAttribute(TermAttribute.class);
+    OffsetAttribute offsetAtt = (OffsetAttribute) ts.addAttribute(OffsetAttribute.class);
+    TypeAttribute typeAtt = (TypeAttribute) ts.addAttribute(TypeAttribute.class);
+    FlagsAttribute flagsAtt = (FlagsAttribute) ts.addAttribute(FlagsAttribute.class);
+    PayloadAttribute payloadAtt = (PayloadAttribute) ts.addAttribute(PayloadAttribute.class);
+    PositionIncrementAttribute posIncAtt = (PositionIncrementAttribute) ts.addAttribute(PositionIncrementAttribute.class);
+
     try {
-      while ((tok = ts.next()) != null){
+      ts.reset();
+      while (ts.incrementToken()){
+        Token tok = new Token();
+        tok.setTermBuffer(termAtt.termBuffer(), 0, termAtt.termLength());
+        tok.setOffset(offsetAtt.startOffset(), offsetAtt.endOffset());
+        tok.setFlags(flagsAtt.getFlags());
+        tok.setPayload(payloadAtt.getPayload());
+        tok.setPositionIncrement(posIncAtt.getPositionIncrement());
+        tok.setType(typeAtt.type());
         result.add(tok);
       }
     } catch (IOException e) {
@@ -53,8 +53,8 @@ public class DirectUpdateHandlerOptimizeTest extends AbstractSolrTestCase {
     for (int i = 0; i < 99; i++) {
       // Add a valid document
       cmd.doc = new Document();
-      cmd.doc.add(new Field("id", "id_" + i, Field.Store.YES, Field.Index.UN_TOKENIZED));
-      cmd.doc.add(new Field("subject", "subject_" + i, Field.Store.NO, Field.Index.TOKENIZED));
+      cmd.doc.add(new Field("id", "id_" + i, Field.Store.YES, Field.Index.NOT_ANALYZED));
+      cmd.doc.add(new Field("subject", "subject_" + i, Field.Store.NO, Field.Index.ANALYZED));
       updater.addDoc(cmd);
     }
 
@@ -67,16 +67,16 @@ public class DirectUpdateHandlerTest extends AbstractSolrTestCase {
 
     // Add a valid document
     cmd.doc = new Document();
-    cmd.doc.add( new Field( "id", "AAA", Store.YES, Index.UN_TOKENIZED ) );
-    cmd.doc.add( new Field( "subject", "xxxxx", Store.YES, Index.UN_TOKENIZED ) );
+    cmd.doc.add( new Field( "id", "AAA", Store.YES, Index.NOT_ANALYZED ) );
+    cmd.doc.add( new Field( "subject", "xxxxx", Store.YES, Index.NOT_ANALYZED ) );
     updater.addDoc( cmd );
 
     // Add a document with multiple ids
     cmd.indexedId = null; // reset the id for this add
     cmd.doc = new Document();
-    cmd.doc.add( new Field( "id", "AAA", Store.YES, Index.UN_TOKENIZED ) );
-    cmd.doc.add( new Field( "id", "BBB", Store.YES, Index.UN_TOKENIZED ) );
-    cmd.doc.add( new Field( "subject", "xxxxx", Store.YES, Index.UN_TOKENIZED ) );
+    cmd.doc.add( new Field( "id", "AAA", Store.YES, Index.NOT_ANALYZED ) );
+    cmd.doc.add( new Field( "id", "BBB", Store.YES, Index.NOT_ANALYZED ) );
+    cmd.doc.add( new Field( "subject", "xxxxx", Store.YES, Index.NOT_ANALYZED ) );
     try {
       updater.addDoc( cmd );
       fail( "added a document with multiple ids" );
@@ -86,7 +86,7 @@ public class DirectUpdateHandlerTest extends AbstractSolrTestCase {
     // Add a document without an id
     cmd.indexedId = null; // reset the id for this add
     cmd.doc = new Document();
-    cmd.doc.add( new Field( "subject", "xxxxx", Store.YES, Index.UN_TOKENIZED ) );
+    cmd.doc.add( new Field( "subject", "xxxxx", Store.YES, Index.NOT_ANALYZED ) );
     try {
       updater.addDoc( cmd );
       fail( "added a document without an ids" );
@@ -325,7 +325,7 @@ public class DirectUpdateHandlerTest extends AbstractSolrTestCase {
 
     // Add a document
     cmd.doc = new Document();
-    cmd.doc.add( new Field( "id", id, Store.YES, Index.UN_TOKENIZED ) );
+    cmd.doc.add( new Field( "id", id, Store.YES, Index.NOT_ANALYZED ) );
     updater.addDoc( cmd );
   }
 
@@ -126,11 +126,12 @@ public class TestCharArrayMap extends TestCase {
 
     int ret=0;
     long start = System.currentTimeMillis();
-    String[] stopwords = StopAnalyzer.ENGLISH_STOP_WORDS;
+    Set<String> stopwords = (Set<String>) StopAnalyzer.ENGLISH_STOP_WORDS_SET;
     // words = "this is a different test to see what is really going on here... I hope it works well but I'm not sure it will".split(" ");
-    char[][] stopwordschars = new char[stopwords.length][];
-    for (int i=0; i<stopwords.length; i++) {
-      stopwordschars[i] = stopwords[i].toCharArray();
+    char[][] stopwordschars = new char[stopwords.size()][];
+    Iterator<String> it = stopwords.iterator();
+    for (int i=0; i<stopwords.size(); i++) {
+      stopwordschars[i] = it.next().toCharArray();
     }
 
     String[] testwords = "now is the time for all good men to come to the aid of their country".split(" ");
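StopAnalyzer.ENGLISH_STOP_WORDS (a String[]) became ENGLISH_STOP_WORDS_SET, an unmodifiable Set, so callers iterate instead of indexing. A compact sketch of the same conversion (class name is made up for illustration):

    import java.util.Set;
    import org.apache.lucene.analysis.StopAnalyzer;

    class StopwordsSketch {
      static char[][] asCharArrays() {
        Set<?> stopwords = StopAnalyzer.ENGLISH_STOP_WORDS_SET;
        char[][] out = new char[stopwords.size()][];
        int i = 0;
        for (Object word : stopwords) { // iteration replaces the old String[] indexing
          out[i++] = word.toString().toCharArray();
        }
        return out;
      }
    }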
@@ -57,11 +57,11 @@ public class TestOpenBitSet extends TestCase {
     do {
       aa = a.nextSetBit(aa+1);
       if (rand.nextBoolean()) {
-        iterator.next();
-        bb = iterator.doc();
+        iterator.nextDoc();
+        bb = iterator.docID();
       } else {
-        iterator.skipTo(bb+1);
-        bb = iterator.doc();
+        iterator.advance(bb+1);
+        bb = iterator.docID();
       }
       assertEquals(aa == -1 ? DocIdSetIterator.NO_MORE_DOCS : aa, bb);
     } while (aa>=0);
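DocIdSetIterator dropped next()/skipTo()/doc() in favor of nextDoc()/advance()/docID(), which return the new doc id directly and signal exhaustion with NO_MORE_DOCS rather than a boolean. A small sketch of draining an iterator the 3.0 way (helper name is illustrative):

    import java.io.IOException;
    import org.apache.lucene.search.DocIdSetIterator;

    class DocIdIterationSketch {
      // Counts every doc id the iterator produces.
      static int count(DocIdSetIterator it) throws IOException {
        int n = 0;
        // nextDoc()/advance() return the new doc id, or NO_MORE_DOCS when exhausted.
        for (int doc = it.nextDoc(); doc != DocIdSetIterator.NO_MORE_DOCS; doc = it.nextDoc()) {
          n++;
        }
        return n;
      }
    }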