a hacky commit of the changes needed to get onto Lucene 3.0.1

git-svn-id: https://svn.apache.org/repos/asf/lucene/solr/branches/solr@922957 13f79535-47bb-0310-9956-ffa450edef68
Mark Robert Miller 2010-03-14 20:58:32 +00:00
parent 5b0c6919e0
commit 65a21459a2
77 changed files with 1678 additions and 1134 deletions

View File

@ -114,7 +114,7 @@
The version suffix of the Lucene artifacts checked into "lib"
IF YOU CHANGE THIS, SANITY CHECK "javadoc.link.lucene"
-->
<property name="lucene_version" value="2.9.2"/>
<property name="lucene_version" value="3.0.1"/>
<!-- The version number to assign to the Maven artifacts. -->
<property name="maven_version" value="1.5-SNAPSHOT"/>

View File

@ -0,0 +1,2 @@
AnyObjectId[9117ad96a4d5290e0731e2fc2fb326899a4999fd] was removed in git history.
Apache SVN contains full history.

View File

@ -0,0 +1,2 @@
AnyObjectId[876ca004312baada28f235c96ad74c9ee467045a] was removed in git history.
Apache SVN contains full history.

View File

@ -0,0 +1,2 @@
AnyObjectId[34b447a890e395c06906a75a7567f6fe8197b147] was removed in git history.
Apache SVN contains full history.

View File

@ -0,0 +1,2 @@
AnyObjectId[c156dab2c44abc562f7d061581aeb1aaa1f28a72] was removed in git history.
Apache SVN contains full history.

View File

@ -0,0 +1,2 @@
AnyObjectId[f897531d6823d0717f65a06e9f3cc648547c2cfe] was removed in git history.
Apache SVN contains full history.

View File

@ -0,0 +1,2 @@
AnyObjectId[f39bb741c2563c55fe9185f1c32615d75be056be] was removed in git history.
Apache SVN contains full history.

View File

@ -0,0 +1,2 @@
AnyObjectId[9139afc9ede79205a745d831b24a4316406710d2] was removed in git history.
Apache SVN contains full history.

View File

@ -0,0 +1,2 @@
AnyObjectId[9d9508a2199ff767f7853a0663d62896c60f0654] was removed in git history.
Apache SVN contains full history.

View File

@ -0,0 +1,2 @@
AnyObjectId[017161b212f274b87e3d8ef0809fbdee0c2099ce] was removed in git history.
Apache SVN contains full history.

View File

@ -0,0 +1,2 @@
AnyObjectId[445a216d3341a569cc6f38480fdda9a3c2ee1d10] was removed in git history.
Apache SVN contains full history.

View File

@ -0,0 +1,2 @@
AnyObjectId[6f12da563f7f852877998443d9e772579bfcf076] was removed in git history.
Apache SVN contains full history.

View File

@ -366,12 +366,12 @@ public class ConcurrentLRUCache<K,V> {
// necessary because maxSize is private in base class
public Object myInsertWithOverflow(Object element) {
if (size() < myMaxSize) {
put(element);
add(element);
return null;
} else if (size() > 0 && !lessThan(element, heap[1])) {
Object ret = heap[1];
heap[1] = element;
adjustTop();
updateTop();
return ret;
} else {
return element;
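
Editor's note: the renames in this hunk track the Lucene 3.0 PriorityQueue API, where put() became add() and adjustTop() became updateTop(). A minimal sketch of the 3.0-style calls; the IntQueue subclass and values are made up for illustration and are not part of this commit:

import org.apache.lucene.util.PriorityQueue;

// Hypothetical demo of the renamed PriorityQueue methods (illustrative only).
class IntQueueDemo {
  static class IntQueue extends PriorityQueue<Integer> {
    IntQueue(int maxSize) { initialize(maxSize); }
    @Override
    protected boolean lessThan(Integer a, Integer b) { return a.intValue() < b.intValue(); }
  }

  public static void main(String[] args) {
    IntQueue q = new IntQueue(8);
    q.add(42);                  // was q.put(42) before Lucene 3.0
    q.add(7);
    Integer smallest = q.top(); // peek at the head of the heap
    q.updateTop();              // was q.adjustTop() before Lucene 3.0
    System.out.println(smallest);
  }
}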

View File

@ -20,6 +20,13 @@ package org.apache.solr.analysis;
import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.tokenattributes.FlagsAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.tokenattributes.PayloadAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.analysis.tokenattributes.TermAttribute;
import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
import org.apache.lucene.util.AttributeSource; // javadoc @link
import java.io.IOException;
import java.util.LinkedList;
@ -56,13 +63,23 @@ import java.util.LinkedList;
* responsibility of the implementing subclass. In the "A" "B" => "A" "A" "B"
* example above, the subclass must clone the additional "A" it creates.
*
* @version $Id$
* @deprecated This class does not support custom attributes. Extend TokenFilter instead,
* using {@link AttributeSource#captureState()} and {@link AttributeSource#restoreState()}
* which support all attributes.
*/
@Deprecated
public abstract class BufferedTokenStream extends TokenFilter {
// in the future, might be faster if we implemented as an array based CircularQueue
private final LinkedList<Token> inQueue = new LinkedList<Token>();
private final LinkedList<Token> outQueue = new LinkedList<Token>();
private final TermAttribute termAtt = (TermAttribute) addAttribute(TermAttribute.class);
private final OffsetAttribute offsetAtt = (OffsetAttribute) addAttribute(OffsetAttribute.class);
private final TypeAttribute typeAtt = (TypeAttribute) addAttribute(TypeAttribute.class);
private final FlagsAttribute flagsAtt = (FlagsAttribute) addAttribute(FlagsAttribute.class);
private final PayloadAttribute payloadAtt = (PayloadAttribute) addAttribute(PayloadAttribute.class);
private final PositionIncrementAttribute posIncAtt = (PositionIncrementAttribute) addAttribute(PositionIncrementAttribute.class);
public BufferedTokenStream(TokenStream input) {
super(input);
}
@ -77,13 +94,13 @@ public abstract class BufferedTokenStream extends TokenFilter {
*/
protected abstract Token process(Token t) throws IOException;
public final Token next() throws IOException {
public final boolean incrementToken() throws IOException {
while (true) {
if (!outQueue.isEmpty()) return outQueue.removeFirst();
if (!outQueue.isEmpty()) return writeToken(outQueue.removeFirst());
Token t = read();
if (null == t) return null;
if (null == t) return false;
Token out = process(t);
if (null != out) return out;
if (null != out) return writeToken(out);
// loop back to top in case process() put something on the output queue
}
}
@ -94,7 +111,7 @@ public abstract class BufferedTokenStream extends TokenFilter {
*/
protected Token read() throws IOException {
if (inQueue.isEmpty()) {
Token t = input.next();
Token t = readToken();
return t;
}
return inQueue.removeFirst();
@ -120,13 +137,41 @@ public abstract class BufferedTokenStream extends TokenFilter {
protected Token peek(int n) throws IOException {
int fillCount = n-inQueue.size();
for (int i=0; i < fillCount; i++) {
Token t = input.next();
Token t = readToken();
if (null==t) return null;
inQueue.addLast(t);
}
return inQueue.get(n-1);
}
/** old api emulation for back compat */
private Token readToken() throws IOException {
if (!input.incrementToken()) {
return null;
} else {
Token token = new Token();
token.setTermBuffer(termAtt.termBuffer(), 0, termAtt.termLength());
token.setOffset(offsetAtt.startOffset(), offsetAtt.endOffset());
token.setType(typeAtt.type());
token.setFlags(flagsAtt.getFlags());
token.setPositionIncrement(posIncAtt.getPositionIncrement());
token.setPayload(payloadAtt.getPayload());
return token;
}
}
/** old api emulation for back compat */
private boolean writeToken(Token token) throws IOException {
clearAttributes();
termAtt.setTermBuffer(token.termBuffer(), 0, token.termLength());
offsetAtt.setOffset(token.startOffset(), token.endOffset());
typeAtt.setType(token.type());
flagsAtt.setFlags(token.getFlags());
posIncAtt.setPositionIncrement(token.getPositionIncrement());
payloadAtt.setPayload(token.getPayload());
return true;
}
/**
* Write a token to the buffered output stream
*/
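
Editor's note: the readToken()/writeToken() pair above bridges the old Token-based contract to the attribute-based API that Lucene 3.0 makes mandatory. For orientation, consuming any TokenStream now looks roughly like this sketch (the analyzer choice, field name and text are placeholders, not taken from this commit):

import java.io.IOException;
import java.io.StringReader;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.WhitespaceAnalyzer;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.tokenattributes.TermAttribute;

class AttributeApiDemo {
  public static void main(String[] args) throws IOException {
    TokenStream ts = new WhitespaceAnalyzer().tokenStream("body", new StringReader("hello world"));
    // attributes are registered once and updated in place on every incrementToken()
    TermAttribute termAtt = (TermAttribute) ts.addAttribute(TermAttribute.class);
    OffsetAttribute offsetAtt = (OffsetAttribute) ts.addAttribute(OffsetAttribute.class);
    while (ts.incrementToken()) { // replaces the old "for (Token t = ts.next(); ...)" loop
      String term = new String(termAtt.termBuffer(), 0, termAtt.termLength());
      System.out.println(term + " " + offsetAtt.startOffset() + "-" + offsetAtt.endOffset());
    }
    ts.close();
  }
}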

View File

@ -14,20 +14,22 @@ import java.util.Arrays;
import java.util.Set;
import org.apache.lucene.analysis.CharArraySet;
import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.analysis.tokenattributes.TermAttribute;
import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
/*
* TODO: Rewrite to use new TokenStream api from lucene 2.9 when BufferedTokenStream uses it.
* TODO: Consider implementing https://issues.apache.org/jira/browse/LUCENE-1688 changes to stop list and
* associated constructors
* TODO: Consider implementing https://issues.apache.org/jira/browse/LUCENE-1688 changes to stop list and associated constructors
*/
/**
* Construct bigrams for frequently occurring terms while indexing. Single terms
* are still indexed too, with bigrams overlaid. This is achieved through the
* use of {@link Token#setPositionIncrement(int)}. Bigrams have a type
* of "gram" Example
* use of {@link PositionIncrementAttribute#setPositionIncrement(int)}. Bigrams have a type
* of {@link #GRAM_TYPE} Example:
* <ul>
* <li>input:"the quick brown fox"</li>
* <li>output:|"the","the-quick"|"brown"|"fox"|</li>
@ -40,14 +42,23 @@ import org.apache.lucene.analysis.TokenStream;
/*
* Constructors and makeCommonSet based on similar code in StopFilter
*/
public final class CommonGramsFilter extends TokenFilter {
public class CommonGramsFilter extends BufferedTokenStream {
static final String GRAM_TYPE = "gram";
private static final char SEPARATOR = '_';
private final CharArraySet commonWords;
private StringBuilder buffer = new StringBuilder();
private final StringBuilder buffer = new StringBuilder();
private final TermAttribute termAttribute = (TermAttribute) addAttribute(TermAttribute.class);
private final OffsetAttribute offsetAttribute = (OffsetAttribute) addAttribute(OffsetAttribute.class);
private final TypeAttribute typeAttribute = (TypeAttribute) addAttribute(TypeAttribute.class);
private final PositionIncrementAttribute posIncAttribute = (PositionIncrementAttribute) addAttribute(PositionIncrementAttribute.class);
private int lastStartOffset;
private boolean lastWasCommon;
private State savedState;
/**
* Construct a token stream filtering the given input using a Set of common
@ -57,7 +68,6 @@ public class CommonGramsFilter extends BufferedTokenStream {
*
* @param input TokenStream input in filter chain
* @param commonWords The set of common words.
*
*/
public CommonGramsFilter(TokenStream input, Set commonWords) {
this(input, commonWords, false);
@ -80,8 +90,7 @@ public class CommonGramsFilter extends BufferedTokenStream {
* @param commonWords The set of common words.
* @param ignoreCase -Ignore case when constructing bigrams for common words.
*/
public CommonGramsFilter(TokenStream input, Set commonWords,
boolean ignoreCase) {
public CommonGramsFilter(TokenStream input, Set commonWords, boolean ignoreCase) {
super(input);
if (commonWords instanceof CharArraySet) {
this.commonWords = (CharArraySet) commonWords;
@ -89,7 +98,6 @@ public class CommonGramsFilter extends BufferedTokenStream {
this.commonWords = new CharArraySet(commonWords.size(), ignoreCase);
this.commonWords.addAll(commonWords);
}
init();
}
/**
@ -101,7 +109,6 @@ public class CommonGramsFilter extends BufferedTokenStream {
*/
public CommonGramsFilter(TokenStream input, String[] commonWords) {
this(input, commonWords, false);
init();
}
/**
@ -112,33 +119,21 @@ public class CommonGramsFilter extends BufferedTokenStream {
* @param commonWords words to be used in constructing bigrams
* @param ignoreCase -Ignore case when constructing bigrams for common words.
*/
public CommonGramsFilter(TokenStream input, String[] commonWords,
boolean ignoreCase) {
public CommonGramsFilter(TokenStream input, String[] commonWords, boolean ignoreCase) {
super(input);
this.commonWords = (CharArraySet) makeCommonSet(commonWords, ignoreCase);
init();
}
// Here for future moving to 2.9 api See StopFilter code
public void init() {
/**
* termAtt = (TermAttribute) addAttribute(TermAttribute.class); posIncrAtt
* =(PositionIncrementAttribute)
* addAttribute(PositionIncrementAttribute.class); typeAdd =(TypeAttribute)
* addAttribute(TypeAttribute.class);
*/
this.commonWords = makeCommonSet(commonWords, ignoreCase);
}
/**
* Build a CharArraySet from an array of common words, appropriate for passing
* into the CommonGramsFilter constructor. This permits this commonWords
* construction to be cached once when an Analyzer is constructed.
*
* @see #makeCommonSet(java.lang.String[], boolean) passing false to
* ignoreCase
*
* @param commonWords Array of common words which will be converted into the CharArraySet
* @return CharArraySet of the given words, appropriate for passing into the CommonGramsFilter constructor
* @see #makeCommonSet(java.lang.String[], boolean) passing false to ignoreCase
*/
public static final CharArraySet makeCommonSet(String[] commonWords) {
public static CharArraySet makeCommonSet(String[] commonWords) {
return makeCommonSet(commonWords, false);
}
@ -147,12 +142,11 @@ public class CommonGramsFilter extends BufferedTokenStream {
* into the CommonGramsFilter constructor,case-sensitive if ignoreCase is
* false.
*
* @param commonWords
* @param commonWords Array of common words which will be converted into the CharArraySet
* @param ignoreCase If true, all words are lower cased first.
* @return a Set containing the words
*/
public static final CharArraySet makeCommonSet(String[] commonWords,
boolean ignoreCase) {
public static CharArraySet makeCommonSet(String[] commonWords, boolean ignoreCase) {
CharArraySet commonSet = new CharArraySet(commonWords.length, ignoreCase);
commonSet.addAll(Arrays.asList(commonWords));
return commonSet;
@ -163,61 +157,95 @@ public class CommonGramsFilter extends BufferedTokenStream {
* output the token. If the token and/or the following token are in the list
* of common words also output a bigram with position increment 0 and
* type="gram"
*/
/*
* TODO: implement new lucene 2.9 API incrementToken() instead of deprecated
* Token.next() TODO:Consider adding an option to not emit unigram stopwords
*
* TODO:Consider adding an option to not emit unigram stopwords
* as in CDL XTF BigramStopFilter, CommonGramsQueryFilter would need to be
* changed to work with this. TODO: Consider optimizing for the case of three
* changed to work with this.
*
* TODO: Consider optimizing for the case of three
* commongrams i.e "man of the year" normally produces 3 bigrams: "man-of",
* "of-the", "the-year" but with proper management of positions we could
* eliminate the middle bigram "of-the"and save a disk seek and a whole set of
* position lookups.
*/
public Token process(Token token) throws IOException {
Token next = peek(1);
// if this is the last token just spit it out. Any commongram would have
// been output in the previous call
if (next == null) {
return token;
public boolean incrementToken() throws IOException {
// get the next piece of input
if (savedState != null) {
restoreState(savedState);
savedState = null;
saveTermBuffer();
return true;
} else if (!input.incrementToken()) {
return false;
}
/**
* if this token or next are common then construct a bigram with type="gram"
* position increment = 0, and put it in the output queue. It will be
* returned when super.next() is called, before this method gets called with
* a new token from the input stream See implementation of next() in
* BufferedTokenStream
/* We build n-grams before and after stopwords.
* When valid, the buffer always contains at least the separator.
* If it's empty, there is nothing before this stopword.
*/
if (isCommon(token) || isCommon(next)) {
Token gram = gramToken(token, next);
write(gram);
if (lastWasCommon || (isCommon() && buffer.length() > 0)) {
savedState = captureState();
gramToken();
return true;
}
// we always return the unigram token
return token;
saveTermBuffer();
return true;
}
/** True if token is for a common term. */
private boolean isCommon(Token token) {
return commonWords != null
&& commonWords.contains(token.termBuffer(), 0, token.termLength());
}
/** Construct a compound token. */
private Token gramToken(Token first, Token second) {
buffer.setLength(0);
buffer.append(first.termText());
buffer.append(SEPARATOR);
buffer.append(second.termText());
Token result = new Token(buffer.toString(), first.startOffset(), second
.endOffset(), "gram");
result.setPositionIncrement(0);
return result;
}
/**
* {@inheritDoc}
*/
@Override
public void reset() throws IOException {
super.reset();
lastWasCommon = false;
savedState = null;
buffer.setLength(0);
}
// ================================================= Helper Methods ================================================
/**
* Determines if the current token is a common term
*
* @return {@code true} if the current token is a common term, {@code false} otherwise
*/
private boolean isCommon() {
return commonWords != null && commonWords.contains(termAttribute.termBuffer(), 0, termAttribute.termLength());
}
/**
* Saves this information to form the left part of a gram
*/
private void saveTermBuffer() {
buffer.setLength(0);
buffer.append(termAttribute.termBuffer(), 0, termAttribute.termLength());
buffer.append(SEPARATOR);
lastStartOffset = offsetAttribute.startOffset();
lastWasCommon = isCommon();
}
/**
* Constructs a compound token.
*/
private void gramToken() {
buffer.append(termAttribute.termBuffer(), 0, termAttribute.termLength());
int endOffset = offsetAttribute.endOffset();
clearAttributes();
int length = buffer.length();
char termText[] = termAttribute.termBuffer();
if (length > termText.length) {
termText = termAttribute.resizeTermBuffer(length);
}
buffer.getChars(0, length, termText, 0);
termAttribute.setTermLength(length);
posIncAttribute.setPositionIncrement(0);
offsetAttribute.setOffset(lastStartOffset, endOffset);
typeAttribute.setType(GRAM_TYPE);
buffer.setLength(0);
}
}
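
Editor's note: a small usage sketch of the rewritten filter, showing the interleaved unigram/bigram output. The input text, the common-word list and the demo class are invented for the example:

import java.io.StringReader;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.WhitespaceTokenizer;
import org.apache.lucene.analysis.tokenattributes.TermAttribute;
import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
import org.apache.solr.analysis.CommonGramsFilter;

class CommonGramsDemo {
  public static void main(String[] args) throws Exception {
    TokenStream ts = new WhitespaceTokenizer(new StringReader("the quick brown fox"));
    ts = new CommonGramsFilter(ts, new String[] { "the" });
    TermAttribute term = (TermAttribute) ts.addAttribute(TermAttribute.class);
    TypeAttribute type = (TypeAttribute) ts.addAttribute(TypeAttribute.class);
    while (ts.incrementToken()) {
      // expected, roughly: the, the_quick (type "gram", posInc 0), quick, brown, fox
      System.out.println(term.term() + " [" + type.type() + "]");
    }
  }
}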

View File

@ -57,7 +57,7 @@ public class CommonGramsFilterFactory extends BaseTokenFilterFactory implements
throw new RuntimeException(e);
}
} else {
commonWords = (CharArraySet) CommonGramsFilter.makeCommonSet(StopAnalyzer.ENGLISH_STOP_WORDS, ignoreCase);
commonWords = (CharArraySet) StopAnalyzer.ENGLISH_STOP_WORDS_SET;
}
}

View File

@ -18,8 +18,11 @@ package org.apache.solr.analysis;
import java.io.IOException;
import org.apache.lucene.analysis.CharArraySet;
import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
import static org.apache.solr.analysis.CommonGramsFilter.GRAM_TYPE;
/**
* Wrap a CommonGramsFilter optimizing phrase queries by only returning single
@ -36,33 +39,36 @@ import org.apache.lucene.analysis.Token;
*/
/*
* TODO: When org.apache.solr.analysis.BufferedTokenStream is changed to use the
* 2.9 lucene TokenStream api, make necessary changes here.
* See:http://hudson.zones
* .apache.org/hudson/job/Lucene-trunk/javadoc//all/org/apache
* /lucene/analysis/TokenStream.html and
* http://svn.apache.org/viewvc/lucene/java
* /trunk/src/java/org/apache/lucene/analysis/package.html?revision=718798
*/
public class CommonGramsQueryFilter extends BufferedTokenStream {
//private CharArraySet commonWords;
private Token prev;
public final class CommonGramsQueryFilter extends TokenFilter {
private final TypeAttribute typeAttribute = (TypeAttribute) addAttribute(TypeAttribute.class);
private final PositionIncrementAttribute posIncAttribute = (PositionIncrementAttribute) addAttribute(PositionIncrementAttribute.class);
private State previous;
private String previousType;
/**
* Constructor
*
* @param input must be a CommonGramsFilter!
* Constructs a new CommonGramsQueryFilter based on the provided CommonGramsFilter
*
* @param input CommonGramsFilter the QueryFilter will use
*/
public CommonGramsQueryFilter(CommonGramsFilter input) {
super(input);
prev = new Token();
}
/**
* {@inheritDoc}
*/
public void reset() throws IOException {
super.reset();
prev = new Token();
previous = null;
previousType = null;
}
/**
@ -71,68 +77,47 @@ public class CommonGramsQueryFilter extends BufferedTokenStream {
* <ul>
* <li>input: "the rain in spain falls mainly"
* <li>output:"the-rain", "rain-in" ,"in-spain", "falls", "mainly"
* </ul>
*/
public boolean incrementToken() throws IOException {
while (input.incrementToken()) {
State current = captureState();
public Token process(Token token) throws IOException {
Token next = peek(1);
/*
* Deal with last token (next=null when current token is the last word) Last
* token will be a unigram. If previous token was a bigram, then we already
* output the last token as part of the unigram and should not additionally
* output the unigram. <p> Example: If the end of the input to the
* CommonGramsFilter is "...the plain" <ul> <li>current token = "plain"</li>
* <li>next token = null</li> <li>previous token = "the-plain" (bigram)</li>
* <li> Since the word "plain" was already output as part of the bigram we
* don't output it.</li> </ul> Example: If the end of the input to the
* CommonGramsFilter is "falls mainly" <ul> <li>current token =
* "mainly"</li> <li>next token = null</li> <li>previous token = "falls"
* (unigram)</li> <li>Since we haven't yet output the current token, we
* output it</li> </ul>
*/
if (previous != null && !isGramType()) {
restoreState(previous);
previous = current;
previousType = typeAttribute.type();
if (isGramType()) {
posIncAttribute.setPositionIncrement(1);
}
return true;
}
// Deal with special case of last token
if (next == null) {
if (prev == null) {
// This is the first and only token i.e. one word query
return token;
}
if (prev != null && prev.type() != "gram") {
// If previous token was a unigram, output the current token
return token;
} else {
// If previous token was a bigram, we already output it and this token
// was output as part of the bigram so we are done.
return null;
}
previous = current;
}
/*
* Possible cases are: |token |next 1|word |gram 2|word |word The
* CommonGramsFilter we are wrapping always outputs the unigram word prior
* to outputting an optional bigram: "the sound of" gets output as |"the",
* "the_sound"|"sound", "sound_of" For case 1 we consume the gram from the
* input stream and output it rather than the current token This means that
* the call to super.next() which reads a token from input and passes it on
* to this process method will always get a token of type word
*/
if (next != null && next.type() == "gram") {
// consume "next" token from list and output it
token = read();
// use this to clone the token because clone requires all these args but
// won't take the token.type
// see
// http://hudson.zones.apache.org/hudson/job/Lucene-trunk/javadoc//all/org/apache/lucene/analysis/Token.html
prev.reinit(token.termBuffer(), 0, token.termLength(), token
.startOffset(), token.endOffset(), token.type());
token.setPositionIncrement(1);
return token;
if (previous == null || GRAM_TYPE.equals(previousType)) {
return false;
}
restoreState(previous);
previous = null;
if (isGramType()) {
posIncAttribute.setPositionIncrement(1);
}
return true;
}
// if the next token is not a bigram, then output the token
// see note above regarding this method of copying token to prev
prev.reinit(token.termBuffer(), 0, token.termLength(), token.startOffset(),
token.endOffset(), token.type());
assert token.type() == "word";
return token;
// ================================================= Helper Methods ================================================
/**
* Convenience method to check if the current type is a gram type
*
* @return {@code true} if the current type is a gram type, {@code false} otherwise
*/
public boolean isGramType() {
return GRAM_TYPE.equals(typeAttribute.type());
}
}
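
Editor's note: the rewrite replaces the old peek(1) lookahead with a one-token delay, where each token is held as a captured State and only emitted once the type of the following token is known. A stripped-down sketch of that delay pattern; the filter below is a hypothetical illustration, not the actual class:

import java.io.IOException;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;

// Hypothetical: emits every token one step late; the skeleton CommonGramsQueryFilter builds on.
final class DelayByOneFilter extends TokenFilter {
  private State previous;

  DelayByOneFilter(TokenStream input) { super(input); }

  @Override
  public boolean incrementToken() throws IOException {
    while (input.incrementToken()) {
      State current = captureState(); // snapshot the token just read
      if (previous != null) {
        restoreState(previous);       // emit the token read on the previous call
        previous = current;
        return true;
      }
      previous = current;             // first token: nothing to emit yet
    }
    if (previous == null) return false;
    restoreState(previous);           // flush the final held token
    previous = null;
    return true;
  }

  @Override
  public void reset() throws IOException {
    super.reset();
    previous = null;
  }
}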

View File

@ -59,8 +59,7 @@ public class CommonGramsQueryFilterFactory extends BaseTokenFilterFactory
throw new RuntimeException(e);
}
} else {
commonWords = (CharArraySet) CommonGramsFilter.makeCommonSet(
StopAnalyzer.ENGLISH_STOP_WORDS, ignoreCase);
commonWords = (CharArraySet) StopAnalyzer.ENGLISH_STOP_WORDS_SET;
}
}

View File

@ -23,7 +23,6 @@ import java.util.HashMap;
import java.util.Map;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.el.GreekCharsets;
import org.apache.lucene.analysis.el.GreekLowerCaseFilter;
import org.apache.solr.common.SolrException;
import org.apache.solr.common.SolrException.ErrorCode;
@ -32,40 +31,16 @@ import org.slf4j.LoggerFactory;
public class GreekLowerCaseFilterFactory extends BaseTokenFilterFactory
{
@Deprecated
private static Map<String,char[]> CHARSETS = new HashMap<String,char[]>();
static {
CHARSETS.put("UnicodeGreek",GreekCharsets.UnicodeGreek);
CHARSETS.put("ISO",GreekCharsets.ISO);
CHARSETS.put("CP1253",GreekCharsets.CP1253);
}
private char[] charset = GreekCharsets.UnicodeGreek;
private static Logger logger = LoggerFactory.getLogger(GreekLowerCaseFilterFactory.class);
@Override
public void init(Map<String, String> args) {
super.init(args);
String charsetName = args.get("charset");
if (null != charsetName) {
charset = CHARSETS.get(charsetName);
if (charset.equals(GreekCharsets.UnicodeGreek))
logger.warn("Specifying UnicodeGreek is no longer required (default). "
+ "Use of the charset parameter will cause an error in Solr 1.5");
else
logger.warn("Support for this custom encoding is deprecated. "
+ "Use of the charset parameter will cause an error in Solr 1.5");
} else {
charset = GreekCharsets.UnicodeGreek; /* default to unicode */
}
if (null == charset) {
throw new SolrException(ErrorCode.SERVER_ERROR,
"Don't understand charset: " + charsetName);
}
}
public GreekLowerCaseFilter create(TokenStream in) {
return new GreekLowerCaseFilter(in,charset);
return new GreekLowerCaseFilter(in);
}
}

View File

@ -20,6 +20,7 @@ package org.apache.solr.analysis;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.standard.StandardTokenizer;
import org.apache.lucene.util.Version;
import java.io.Reader;
import java.io.IOException;
@ -31,11 +32,6 @@ import java.io.IOException;
@Deprecated
public class HTMLStripStandardTokenizerFactory extends BaseTokenizerFactory {
public Tokenizer create(Reader input) {
return new StandardTokenizer(new HTMLStripReader(input)) {
@Override
public void reset(Reader reader) throws IOException {
super.reset(new HTMLStripReader(reader));
}
};
return new StandardTokenizer(Version.LUCENE_24, new HTMLStripReader(input));
}
}

View File

@ -20,6 +20,8 @@ package org.apache.solr.analysis;
import java.io.IOException;
import org.apache.lucene.analysis.*;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.tokenattributes.TermAttribute;
/**
* When the plain text is extracted from documents, we will often have many words hyphenated and broken into
@ -52,46 +54,89 @@ import org.apache.lucene.analysis.*;
*/
public final class HyphenatedWordsFilter extends TokenFilter {
public HyphenatedWordsFilter(TokenStream in) {
super(in);
}
private final TermAttribute termAttribute = (TermAttribute) addAttribute(TermAttribute.class);
private final OffsetAttribute offsetAttribute = (OffsetAttribute) addAttribute(OffsetAttribute.class);
private final StringBuilder hyphenated = new StringBuilder();
private State savedState;
/**
* @inheritDoc
* @see org.apache.lucene.analysis.TokenStream#next()
*/
public final Token next(Token in) throws IOException {
StringBuilder termText = new StringBuilder(25);
int startOffset = -1, firstPositionIncrement = -1, wordsMerged = 0;
Token lastToken = null;
for (Token token = input.next(in); token != null; token = input.next()) {
termText.append(token.termBuffer(), 0, token.termLength());
//current token ends with hyphen -> grab the next token and glue them together
if (termText.charAt(termText.length() - 1) == '-') {
wordsMerged++;
//remove the hyphen
termText.setLength(termText.length()-1);
if (startOffset == -1) {
startOffset = token.startOffset();
firstPositionIncrement = token.getPositionIncrement();
}
lastToken = token;
} else {
//shortcut returns token
if (wordsMerged == 0)
return token;
Token mergedToken = new Token(termText.toString(), startOffset, token.endOffset(), token.type());
mergedToken.setPositionIncrement(firstPositionIncrement);
return mergedToken;
}
}
//last token ending with hyphen? - we know that we have only one token in
//this situation, so we can safely return firstToken
if (startOffset != -1)
return lastToken;
else
return null; //end of token stream
}
* Creates a new HyphenatedWordsFilter
*
* @param in TokenStream that will be filtered
*/
public HyphenatedWordsFilter(TokenStream in) {
super(in);
}
/**
* {@inheritDoc}
*/
@Override
public boolean incrementToken() throws IOException {
while (input.incrementToken()) {
char[] term = termAttribute.termBuffer();
int termLength = termAttribute.termLength();
if (termLength > 0 && term[termLength - 1] == '-') {
// a hyphenated word
// capture the state of the first token only
if (savedState == null) {
savedState = captureState();
}
hyphenated.append(term, 0, termLength - 1);
} else if (savedState == null) {
// not part of a hyphenated word.
return true;
} else {
// the final portion of a hyphenated word
hyphenated.append(term, 0, termLength);
unhyphenate();
return true;
}
}
if (savedState != null) {
// the final term ends with a hyphen
// add back the hyphen, for backwards compatibility.
hyphenated.append('-');
unhyphenate();
return true;
}
return false;
}
/**
* {@inheritDoc}
*/
@Override
public void reset() throws IOException {
super.reset();
hyphenated.setLength(0);
savedState = null;
}
// ================================================= Helper Methods ================================================
/**
* Writes the joined unhyphenated term
*/
private void unhyphenate() {
int endOffset = offsetAttribute.endOffset();
restoreState(savedState);
savedState = null;
char term[] = termAttribute.termBuffer();
int length = hyphenated.length();
if (length > termAttribute.termLength()) {
term = termAttribute.resizeTermBuffer(length);
}
hyphenated.getChars(0, length, term, 0);
termAttribute.setTermLength(length);
offsetAttribute.setOffset(offsetAttribute.startOffset(), endOffset);
hyphenated.setLength(0);
}
}
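
Editor's note: a quick illustrative run of the rewritten filter, showing how a trailing hyphen glues the next token on. The input text and demo class are invented:

import java.io.StringReader;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.WhitespaceTokenizer;
import org.apache.lucene.analysis.tokenattributes.TermAttribute;
import org.apache.solr.analysis.HyphenatedWordsFilter;

class HyphenatedWordsDemo {
  public static void main(String[] args) throws Exception {
    TokenStream ts = new HyphenatedWordsFilter(
        new WhitespaceTokenizer(new StringReader("ecologi- cal devel- opment")));
    TermAttribute term = (TermAttribute) ts.addAttribute(TermAttribute.class);
    while (ts.incrementToken()) {
      System.out.println(term.term()); // expected: ecological, development
    }
  }
}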

View File

@ -75,7 +75,7 @@ public class KeepWordFilterFactory extends BaseTokenFilterFactory implements Res
}
public KeepWordFilter create(TokenStream input) {
return new KeepWordFilter(input, words, ignoreCase);
return new KeepWordFilter(input, (Set)words, ignoreCase);
}
public CharArraySet getWords() {

View File

@ -1,49 +0,0 @@
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.solr.analysis;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Token;
import java.io.IOException;
/**
* @version $Id$
* @deprecated use {@link org.apache.lucene.analysis.LengthFilter}
*/
@Deprecated
public final class LengthFilter extends TokenFilter {
final int min,max;
public LengthFilter(TokenStream in, int min, int max) {
super(in);
this.min=min;
this.max=max;
//System.out.println("min="+min+" max="+max);
}
public final Token next(Token in) throws IOException {
for (Token token=input.next(in); token!=null; token=input.next(in)) {
final int len = token.endOffset() - token.startOffset();
if (len<min || len>max) continue;
return token;
}
return null;
}
}
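
Editor's note: with Solr's deprecated copy removed, length filtering falls back to Lucene's own org.apache.lucene.analysis.LengthFilter, which takes (input, min, max) and keeps tokens whose term length is within that range. A hedged sketch of the replacement usage; the thresholds and input are example values only:

import java.io.StringReader;
import org.apache.lucene.analysis.LengthFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.WhitespaceTokenizer;
import org.apache.lucene.analysis.tokenattributes.TermAttribute;

class LengthFilterDemo {
  public static void main(String[] args) throws Exception {
    // keep only tokens between 3 and 5 characters long
    TokenStream ts = new LengthFilter(
        new WhitespaceTokenizer(new StringReader("a ab abc abcd abcdef")), 3, 5);
    TermAttribute term = (TermAttribute) ts.addAttribute(TermAttribute.class);
    while (ts.incrementToken()) {
      System.out.println(term.term()); // expected: abc, abcd
    }
  }
}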

View File

@ -17,41 +17,69 @@
package org.apache.solr.analysis;
import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.solr.util.ArraysUtils;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.analysis.tokenattributes.TermAttribute;
import org.apache.solr.util.CharArrayMap;
import java.io.IOException;
/**
* A TokenFilter which filters out Tokens at the same position and Term
* text as the previous token in the stream.
* A TokenFilter which filters out Tokens at the same position and Term text as the previous token in the stream.
*/
public class RemoveDuplicatesTokenFilter extends BufferedTokenStream {
public RemoveDuplicatesTokenFilter(TokenStream input) {super(input);}
protected Token process(Token t) throws IOException {
Token tok = read();
while (tok != null && tok.getPositionIncrement()==0) {
if (null != t) {
write(t);
t = null;
public final class RemoveDuplicatesTokenFilter extends TokenFilter {
private final TermAttribute termAttribute = (TermAttribute) addAttribute(TermAttribute.class);
private final PositionIncrementAttribute posIncAttribute = (PositionIncrementAttribute) addAttribute(PositionIncrementAttribute.class);
// keep a seen 'set' after each term with posInc > 0
// for now use CharArrayMap vs CharArraySet, as it has clear()
private final CharArrayMap<Boolean> previous = new CharArrayMap<Boolean>(8, false);
/**
* Creates a new RemoveDuplicatesTokenFilter
*
* @param in TokenStream that will be filtered
*/
public RemoveDuplicatesTokenFilter(TokenStream in) {
super(in);
}
/**
* {@inheritDoc}
*/
@Override
public boolean incrementToken() throws IOException {
while (input.incrementToken()) {
final char term[] = termAttribute.termBuffer();
final int length = termAttribute.termLength();
final int posIncrement = posIncAttribute.getPositionIncrement();
if (posIncrement > 0) {
previous.clear();
}
boolean dup=false;
for (Token outTok : output()) {
int tokLen = tok.termLength();
if (outTok.termLength() == tokLen && ArraysUtils.equals(outTok.termBuffer(), 0, tok.termBuffer(), 0, tokLen)) {
dup=true;
//continue;;
}
boolean duplicate = (posIncrement == 0 && previous.get(term, 0, length) != null);
// clone the term, and add to the set of seen terms.
char saved[] = new char[length];
System.arraycopy(term, 0, saved, 0, length);
previous.put(saved, Boolean.TRUE);
if (!duplicate) {
return true;
}
if (!dup){
write(tok);
}
tok = read();
}
if (tok != null) {
pushBack(tok);
}
return t;
return false;
}
/**
* {@inheritDoc}
*/
@Override
public void reset() throws IOException {
super.reset();
previous.clear();
}
}

View File

@ -16,46 +16,46 @@
* limitations under the License.
*/
package org.apache.solr.analysis;
import org.apache.lucene.analysis.ru.*;
import java.util.Map;
import java.util.HashMap;
import org.apache.solr.core.SolrConfig;
import org.apache.solr.common.SolrException;
import org.apache.solr.common.SolrException.ErrorCode;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
@Deprecated
public class RussianCommon {
private static Logger logger = LoggerFactory.getLogger(RussianCommon.class);
private static Map<String,char[]> CHARSETS = new HashMap<String,char[]>();
static {
CHARSETS.put("UnicodeRussian",RussianCharsets.UnicodeRussian);
CHARSETS.put("KOI8",RussianCharsets.KOI8);
CHARSETS.put("CP1251",RussianCharsets.CP1251);
}
public static char[] getCharset(String name) {
if (null == name)
return RussianCharsets.UnicodeRussian;
char[] charset = CHARSETS.get(name);
if (charset.equals(RussianCharsets.UnicodeRussian))
logger.warn("Specifying UnicodeRussian is no longer required (default). "
+ "Use of the charset parameter will cause an error in Solr 1.5");
else
logger.warn("Support for this custom encoding is deprecated. "
+ "Use of the charset parameter will cause an error in Solr 1.5");
if (null == charset) {
throw new SolrException(ErrorCode.SERVER_ERROR,
"Don't understand charset: " + name);
}
return charset;
}
}
//package org.apache.solr.analysis;
//import org.apache.lucene.analysis.ru.*;
//import java.util.Map;
//import java.util.HashMap;
//import org.apache.solr.core.SolrConfig;
//import org.apache.solr.common.SolrException;
//import org.apache.solr.common.SolrException.ErrorCode;
//import org.slf4j.Logger;
//import org.slf4j.LoggerFactory;
//
//@Deprecated
//public class RussianCommon {
//
// private static Logger logger = LoggerFactory.getLogger(RussianCommon.class);
//
// private static Map<String,char[]> CHARSETS = new HashMap<String,char[]>();
// static {
// CHARSETS.put("UnicodeRussian",RussianCharsets.UnicodeRussian);
// CHARSETS.put("KOI8",RussianCharsets.KOI8);
// CHARSETS.put("CP1251",RussianCharsets.CP1251);
// }
//
// public static char[] getCharset(String name) {
// if (null == name)
// return RussianCharsets.UnicodeRussian;
//
// char[] charset = CHARSETS.get(name);
//
// if (charset.equals(RussianCharsets.UnicodeRussian))
// logger.warn("Specifying UnicodeRussian is no longer required (default). "
// + "Use of the charset parameter will cause an error in Solr 1.5");
// else
// logger.warn("Support for this custom encoding is deprecated. "
// + "Use of the charset parameter will cause an error in Solr 1.5");
//
// if (null == charset) {
// throw new SolrException(ErrorCode.SERVER_ERROR,
// "Don't understand charset: " + name);
// }
// return charset;
// }
//}

View File

@ -23,17 +23,10 @@ import java.util.Map;
import org.apache.lucene.analysis.ru.RussianLetterTokenizer;
public class RussianLetterTokenizerFactory extends BaseTokenizerFactory {
@Deprecated
private char[] charset;
@Override
public void init(Map<String, String> args) {
super.init(args);
charset = RussianCommon.getCharset(args.get("charset"));
}
public RussianLetterTokenizer create(Reader in) {
return new RussianLetterTokenizer(in,charset);
return new RussianLetterTokenizer(in);
}
}

View File

@ -23,17 +23,9 @@ import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.ru.RussianLowerCaseFilter;
public class RussianLowerCaseFilterFactory extends BaseTokenFilterFactory {
@Deprecated
private char[] charset;
@Override
public void init(Map<String, String> args) {
super.init(args);
charset = RussianCommon.getCharset(args.get("charset"));
}
public RussianLowerCaseFilter create(TokenStream in) {
return new RussianLowerCaseFilter(in,charset);
return new RussianLowerCaseFilter(in);
}
}

View File

@ -25,16 +25,10 @@ import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.ru.RussianStemFilter;
public class RussianStemFilterFactory extends BaseTokenFilterFactory {
@Deprecated
private char[] charset;
public void init(Map<String, String> args) {
super.init(args);
charset = RussianCommon.getCharset(args.get("charset"));
}
public RussianStemFilter create(TokenStream in) {
return new RussianStemFilter(in,charset);
return new RussianStemFilter(in);
}
}

View File

@ -19,6 +19,7 @@ package org.apache.solr.analysis;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.standard.StandardTokenizer;
import org.apache.lucene.util.Version;
import java.io.Reader;
@ -28,6 +29,6 @@ import java.io.Reader;
public class StandardTokenizerFactory extends BaseTokenizerFactory {
public StandardTokenizer create(Reader input) {
return new StandardTokenizer(input);
return new StandardTokenizer(Version.LUCENE_24, input);
}
}
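
Editor's note: StandardTokenizer now requires an explicit Version argument so tokenization behavior stays stable across upgrades; the factory passes LUCENE_24 here, presumably to keep the pre-2.9 behavior for existing indexes. A brief sketch of the version-pinned construction (the reader contents and demo class are illustrative):

import java.io.StringReader;
import org.apache.lucene.analysis.standard.StandardTokenizer;
import org.apache.lucene.analysis.tokenattributes.TermAttribute;
import org.apache.lucene.util.Version;

class StandardTokenizerVersionDemo {
  public static void main(String[] args) throws Exception {
    // Version selects back-compat behavior; this commit pins LUCENE_24
    StandardTokenizer tok =
        new StandardTokenizer(Version.LUCENE_24, new StringReader("Solr on Lucene 3.0.1"));
    TermAttribute term = (TermAttribute) tok.addAttribute(TermAttribute.class);
    while (tok.incrementToken()) {
      System.out.println(term.term());
    }
    tok.close();
  }
}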

View File

@ -58,7 +58,7 @@ public class StopFilterFactory extends BaseTokenFilterFactory implements Resourc
throw new RuntimeException(e);
}
} else {
stopWords = (CharArraySet) StopFilter.makeStopSet(StopAnalyzer.ENGLISH_STOP_WORDS, ignoreCase);
stopWords = (CharArraySet) StopAnalyzer.ENGLISH_STOP_WORDS_SET;
}
}
//Force the use of a char array set, as it is the most performant, although this may break things if Lucene ever goes away from it. See SOLR-1095
@ -79,8 +79,7 @@ public class StopFilterFactory extends BaseTokenFilterFactory implements Resourc
}
public StopFilter create(TokenStream input) {
StopFilter stopFilter = new StopFilter(input,stopWords,ignoreCase);
stopFilter.setEnablePositionIncrements(enablePositionIncrements);
StopFilter stopFilter = new StopFilter(enablePositionIncrements, input,stopWords,ignoreCase);
return stopFilter;
}
}
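
Editor's note: the StopFilter constructors now take the enablePositionIncrements flag directly (the setter-based form used before this hunk went away), and the bundled English stop words are exposed as the ENGLISH_STOP_WORDS_SET constant. A small sketch of the new construction; the stream contents and flag values are examples:

import java.io.StringReader;
import org.apache.lucene.analysis.StopAnalyzer;
import org.apache.lucene.analysis.StopFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.WhitespaceTokenizer;
import org.apache.lucene.analysis.tokenattributes.TermAttribute;

class StopFilterDemo {
  public static void main(String[] args) throws Exception {
    TokenStream input = new WhitespaceTokenizer(new StringReader("now is the winter of our discontent"));
    boolean enablePositionIncrements = true;
    // 3.0 style: the flag is a constructor argument rather than a setter call
    TokenStream ts = new StopFilter(enablePositionIncrements, input,
        StopAnalyzer.ENGLISH_STOP_WORDS_SET, true /* ignoreCase */);
    TermAttribute term = (TermAttribute) ts.addAttribute(TermAttribute.class);
    while (ts.incrementToken()) {
      // stop words like "is", "the", "of" are dropped; position gaps are preserved
      System.out.println(term.term()); // expected, roughly: now, winter, our, discontent
    }
  }
}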

View File

@ -20,6 +20,12 @@ package org.apache.solr.analysis;
import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.analysis.tokenattributes.TermAttribute;
import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
import org.apache.lucene.util.AttributeImpl;
import org.apache.lucene.util.AttributeSource;
import java.io.IOException;
import java.util.ArrayList;
@ -39,11 +45,16 @@ import java.util.LinkedList;
public class SynonymFilter extends TokenFilter {
private final SynonymMap map; // Map<String, SynonymMap>
private Iterator<Token> replacement; // iterator over generated tokens
private Iterator<AttributeSource> replacement; // iterator over generated tokens
public SynonymFilter(TokenStream in, SynonymMap map) {
super(in);
this.map = map;
// just ensuring these attributes exist...
addAttribute(TermAttribute.class);
addAttribute(PositionIncrementAttribute.class);
addAttribute(OffsetAttribute.class);
addAttribute(TypeAttribute.class);
}
@ -65,74 +76,100 @@ public class SynonymFilter extends TokenFilter {
* - preserve original positionIncrement of first matched token
*/
@Override
public Token next(Token target) throws IOException {
public boolean incrementToken() throws IOException {
while (true) {
// if there are any generated tokens, return them... don't try any
// matches against them, as we specifically don't want recursion.
if (replacement!=null && replacement.hasNext()) {
return replacement.next();
copy(this, replacement.next());
return true;
}
// common case fast-path of first token not matching anything
Token firstTok = nextTok(target);
if (firstTok == null) return null;
SynonymMap result = map.submap!=null ? map.submap.get(firstTok.termBuffer(), 0, firstTok.termLength()) : null;
if (result == null) return firstTok;
AttributeSource firstTok = nextTok();
if (firstTok == null) return false;
TermAttribute termAtt = (TermAttribute) firstTok.addAttribute(TermAttribute.class);
SynonymMap result = map.submap!=null ? map.submap.get(termAtt.termBuffer(), 0, termAtt.termLength()) : null;
if (result == null) {
copy(this, firstTok);
return true;
}
// fast-path failed, clone ourselves if needed
if (firstTok == this)
firstTok = cloneAttributes();
// OK, we matched a token, so find the longest match.
matched = new LinkedList<Token>();
matched = new LinkedList<AttributeSource>();
result = match(result);
if (result==null) {
// no match, simply return the first token read.
return firstTok;
copy(this, firstTok);
return true;
}
// reuse, or create new one each time?
ArrayList<Token> generated = new ArrayList<Token>(result.synonyms.length + matched.size() + 1);
ArrayList<AttributeSource> generated = new ArrayList<AttributeSource>(result.synonyms.length + matched.size() + 1);
//
// there was a match... let's generate the new tokens, merging
// in the matched tokens (position increments need adjusting)
//
Token lastTok = matched.isEmpty() ? firstTok : matched.getLast();
AttributeSource lastTok = matched.isEmpty() ? firstTok : matched.getLast();
boolean includeOrig = result.includeOrig();
Token origTok = includeOrig ? firstTok : null;
int origPos = firstTok.getPositionIncrement(); // position of origTok in the original stream
AttributeSource origTok = includeOrig ? firstTok : null;
PositionIncrementAttribute firstPosIncAtt = (PositionIncrementAttribute) firstTok.addAttribute(PositionIncrementAttribute.class);
int origPos = firstPosIncAtt.getPositionIncrement(); // position of origTok in the original stream
int repPos=0; // curr position in replacement token stream
int pos=0; // current position in merged token stream
for (int i=0; i<result.synonyms.length; i++) {
Token repTok = result.synonyms[i];
Token newTok = new Token(firstTok.startOffset(), lastTok.endOffset(), firstTok.type());
newTok.setTermBuffer(repTok.termBuffer(), 0, repTok.termLength());
AttributeSource newTok = firstTok.cloneAttributes();
TermAttribute newTermAtt = (TermAttribute) newTok.addAttribute(TermAttribute.class);
OffsetAttribute newOffsetAtt = (OffsetAttribute) newTok.addAttribute(OffsetAttribute.class);
TypeAttribute newTypeAtt = (TypeAttribute) newTok.addAttribute(TypeAttribute.class);
PositionIncrementAttribute newPosIncAtt = (PositionIncrementAttribute) newTok.addAttribute(PositionIncrementAttribute.class);
OffsetAttribute lastOffsetAtt = (OffsetAttribute) lastTok.addAttribute(OffsetAttribute.class);
newOffsetAtt.setOffset(newOffsetAtt.startOffset(), lastOffsetAtt.endOffset());
newTermAtt.setTermBuffer(repTok.termBuffer(), 0, repTok.termLength());
repPos += repTok.getPositionIncrement();
if (i==0) repPos=origPos; // make position of first token equal to original
// if necessary, insert original tokens and adjust position increment
while (origTok != null && origPos <= repPos) {
origTok.setPositionIncrement(origPos-pos);
PositionIncrementAttribute origPosInc = (PositionIncrementAttribute) origTok.addAttribute(PositionIncrementAttribute.class);
origPosInc.setPositionIncrement(origPos-pos);
generated.add(origTok);
pos += origTok.getPositionIncrement();
pos += origPosInc.getPositionIncrement();
origTok = matched.isEmpty() ? null : matched.removeFirst();
if (origTok != null) origPos += origTok.getPositionIncrement();
if (origTok != null) {
origPosInc = (PositionIncrementAttribute) origTok.addAttribute(PositionIncrementAttribute.class);
origPos += origPosInc.getPositionIncrement();
}
}
newTok.setPositionIncrement(repPos - pos);
newPosIncAtt.setPositionIncrement(repPos - pos);
generated.add(newTok);
pos += newTok.getPositionIncrement();
pos += newPosIncAtt.getPositionIncrement();
}
// finish up any leftover original tokens
while (origTok!=null) {
origTok.setPositionIncrement(origPos-pos);
PositionIncrementAttribute origPosInc = (PositionIncrementAttribute) origTok.addAttribute(PositionIncrementAttribute.class);
origPosInc.setPositionIncrement(origPos-pos);
generated.add(origTok);
pos += origTok.getPositionIncrement();
pos += origPosInc.getPositionIncrement();
origTok = matched.isEmpty() ? null : matched.removeFirst();
if (origTok != null) origPos += origTok.getPositionIncrement();
if (origTok != null) {
origPosInc = (PositionIncrementAttribute) origTok.addAttribute(PositionIncrementAttribute.class);
origPos += origPosInc.getPositionIncrement();
}
}
// what if we replaced a longer sequence with a shorter one?
@ -151,27 +188,22 @@ public class SynonymFilter extends TokenFilter {
// Defer creation of the buffer until the first time it is used to
// optimize short fields with no matches.
//
private LinkedList<Token> buffer;
private LinkedList<Token> matched;
private LinkedList<AttributeSource> buffer;
private LinkedList<AttributeSource> matched;
private Token nextTok() throws IOException {
private AttributeSource nextTok() throws IOException {
if (buffer!=null && !buffer.isEmpty()) {
return buffer.removeFirst();
} else {
return input.next();
if (input.incrementToken()) {
return this;
} else
return null;
}
}
private Token nextTok(Token target) throws IOException {
if (buffer!=null && !buffer.isEmpty()) {
return buffer.removeFirst();
} else {
return input.next(target);
}
}
private void pushTok(Token t) {
if (buffer==null) buffer=new LinkedList<Token>();
private void pushTok(AttributeSource t) {
if (buffer==null) buffer=new LinkedList<AttributeSource>();
buffer.addFirst(t);
}
@ -179,15 +211,20 @@ public class SynonymFilter extends TokenFilter {
SynonymMap result = null;
if (map.submap != null) {
Token tok = nextTok();
AttributeSource tok = nextTok();
if (tok != null) {
// clone ourselves.
if (tok == this)
tok = cloneAttributes();
// check for positionIncrement!=1? if>1, should not match, if==0, check multiple at this level?
SynonymMap subMap = map.submap.get(tok.termBuffer(), 0, tok.termLength());
TermAttribute termAtt = (TermAttribute) tok.getAttribute(TermAttribute.class);
SynonymMap subMap = map.submap.get(termAtt.termBuffer(), 0, termAtt.termLength());
if (subMap != null) {
// recurse
result = match(subMap);
}
;
if (result != null) {
matched.addFirst(tok);
} else {
@ -205,6 +242,15 @@ public class SynonymFilter extends TokenFilter {
return result;
}
private void copy(AttributeSource target, AttributeSource source) {
if (target == source)
return;
for (Iterator<AttributeImpl> sourceIt = source.getAttributeImplsIterator(), targetIt=target.getAttributeImplsIterator();
sourceIt.hasNext();) {
sourceIt.next().copyTo(targetIt.next());
}
}
@Override
public void reset() throws IOException {
input.reset();
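
Editor's note: the synonym buffering now stores AttributeSource snapshots instead of Token objects; cloneAttributes() makes a detached copy of the current token state and copyTo() writes one attribute implementation's values onto another, which is what the private copy() helper above does. A toy sketch of that mechanism, with a placeholder tokenizer and text:

import java.io.StringReader;
import java.util.Iterator;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.WhitespaceTokenizer;
import org.apache.lucene.util.AttributeImpl;
import org.apache.lucene.util.AttributeSource;

class AttributeCopyDemo {
  public static void main(String[] args) throws Exception {
    TokenStream ts = new WhitespaceTokenizer(new StringReader("one two"));
    ts.incrementToken();
    // snapshot the first token's attributes as an independent AttributeSource
    AttributeSource saved = ts.cloneAttributes();
    ts.incrementToken();
    // copy the saved values back onto the live stream, attribute by attribute
    for (Iterator<AttributeImpl> src = saved.getAttributeImplsIterator(),
                                 dst = ts.getAttributeImplsIterator(); src.hasNext();) {
      src.next().copyTo(dst.next());
    }
  }
}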

View File

@ -19,6 +19,7 @@ package org.apache.solr.analysis;
import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.TermAttribute;
import org.apache.solr.common.ResourceLoader;
import org.apache.solr.common.util.StrUtils;
import org.apache.solr.util.plugin.ResourceLoaderAware;
@ -135,8 +136,9 @@ public class SynonymFilterFactory extends BaseTokenFilterFactory implements Reso
TokenStream ts = loadTokenizer(tokFactory, reader);
List<String> tokList = new ArrayList<String>();
try {
for( Token token = ts.next(); token != null; token = ts.next() ){
String text = new String(token.termBuffer(), 0, token.termLength());
TermAttribute termAtt = (TermAttribute) ts.addAttribute(TermAttribute.class);
while (ts.incrementToken()){
String text = new String(termAtt.termBuffer(), 0, termAtt.termLength());
if( text.length() > 0 )
tokList.add( text );
}

File diff suppressed because it is too large.

View File

@ -0,0 +1,315 @@
package org.apache.solr.analysis;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import static org.apache.solr.analysis.WordDelimiterFilter.*;
/**
* A BreakIterator-like API for iterating over subwords in text, according to WordDelimiterFilter rules.
*/
final class WordDelimiterIterator {
/** Indicates the end of iteration */
public static final int DONE = -1;
public static final byte[] DEFAULT_WORD_DELIM_TABLE;
char text[];
int length;
/** start position of text, excluding leading delimiters */
int startBounds;
/** end position of text, excluding trailing delimiters */
int endBounds;
/** Beginning of subword */
int current;
/** End of subword */
int end;
/* does this string end with a possessive such as 's */
private boolean hasFinalPossessive = false;
/**
* If false, causes case changes to be ignored (subwords will only be generated
* given SUBWORD_DELIM tokens). (Defaults to true)
*/
final boolean splitOnCaseChange;
/**
* If false, causes numeric changes to be ignored (subwords will only be generated
* given SUBWORD_DELIM tokens). (Defaults to true)
*/
final boolean splitOnNumerics;
/**
* If true, causes trailing "'s" to be removed for each subword. (Defaults to true)
* <p/>
* "O'Neil's" => "O", "Neil"
*/
final boolean stemEnglishPossessive;
private final byte[] charTypeTable;
/** if true, need to skip over a possessive found in the last call to next() */
private boolean skipPossessive = false;
// TODO: should there be a WORD_DELIM category for chars that only separate words (no catenation of subwords will be
// done if separated by these chars?) "," would be an obvious candidate...
static {
byte[] tab = new byte[256];
for (int i = 0; i < 256; i++) {
byte code = 0;
if (Character.isLowerCase(i)) {
code |= LOWER;
}
else if (Character.isUpperCase(i)) {
code |= UPPER;
}
else if (Character.isDigit(i)) {
code |= DIGIT;
}
if (code == 0) {
code = SUBWORD_DELIM;
}
tab[i] = code;
}
DEFAULT_WORD_DELIM_TABLE = tab;
}
/**
* Create a new WordDelimiterIterator operating with the supplied rules.
*
* @param charTypeTable table containing character types
* @param splitOnCaseChange if true, causes "PowerShot" to be two tokens; ("Power-Shot" remains two parts regardless)
* @param splitOnNumerics if true, causes "j2se" to be three tokens; "j" "2" "se"
* @param stemEnglishPossessive if true, causes trailing "'s" to be removed for each subword: "O'Neil's" => "O", "Neil"
*/
WordDelimiterIterator(byte[] charTypeTable, boolean splitOnCaseChange, boolean splitOnNumerics, boolean stemEnglishPossessive) {
this.charTypeTable = charTypeTable;
this.splitOnCaseChange = splitOnCaseChange;
this.splitOnNumerics = splitOnNumerics;
this.stemEnglishPossessive = stemEnglishPossessive;
}
/**
* Advance to the next subword in the string.
*
* @return index of the next subword, or {@link #DONE} if all subwords have been returned
*/
int next() {
current = end;
if (current == DONE) {
return DONE;
}
if (skipPossessive) {
current += 2;
skipPossessive = false;
}
int lastType = 0;
while (current < endBounds && (isSubwordDelim(lastType = charType(text[current])))) {
current++;
}
if (current >= endBounds) {
return end = DONE;
}
for (end = current + 1; end < endBounds; end++) {
int type = charType(text[end]);
if (isBreak(lastType, type)) {
break;
}
lastType = type;
}
if (end < endBounds - 1 && endsWithPossessive(end + 2)) {
skipPossessive = true;
}
return end;
}
/**
* Return the type of the current subword.
* This currently uses the type of the first character in the subword.
*
* @return type of the current word
*/
int type() {
if (end == DONE) {
return 0;
}
int type = charType(text[current]);
switch (type) {
// return ALPHA word type for both lower and upper
case LOWER:
case UPPER:
return ALPHA;
default:
return type;
}
}
/**
* Reset the text to a new value, and reset all state
*
* @param text New text
* @param length length of the text
*/
void setText(char text[], int length) {
this.text = text;
this.length = this.endBounds = length;
current = startBounds = end = 0;
skipPossessive = hasFinalPossessive = false;
setBounds();
}
// ================================================= Helper Methods ================================================
/**
* Determines whether the transition from lastType to type indicates a break
*
* @param lastType Last subword type
* @param type Current subword type
* @return {@code true} if the transition indicates a break, {@code false} otherwise
*/
private boolean isBreak(int lastType, int type) {
if ((type & lastType) != 0) {
return false;
}
if (!splitOnCaseChange && isAlpha(lastType) && isAlpha(type)) {
// ALPHA->ALPHA: always ignore if case isn't considered.
return false;
} else if (isUpper(lastType) && isAlpha(type)) {
// UPPER->letter: Don't split
return false;
} else if (!splitOnNumerics && ((isAlpha(lastType) && isDigit(type)) || (isDigit(lastType) && isAlpha(type)))) {
// ALPHA->NUMERIC, NUMERIC->ALPHA :Don't split
return false;
}
return true;
}
/**
* Determines if the current word contains only one subword. Note, it could be potentially surrounded by delimiters
*
* @return {@code true} if the current word contains only one subword, {@code false} otherwise
*/
boolean isSingleWord() {
if (hasFinalPossessive) {
return current == startBounds && end == endBounds - 2;
}
else {
return current == startBounds && end == endBounds;
}
}
/**
* Set the internal word bounds (remove leading and trailing delimiters). Note, if a possessive is found, don't remove
* it yet, simply note it.
*/
private void setBounds() {
while (startBounds < length && (isSubwordDelim(charType(text[startBounds])))) {
startBounds++;
}
while (endBounds > startBounds && (isSubwordDelim(charType(text[endBounds - 1])))) {
endBounds--;
}
if (endsWithPossessive(endBounds)) {
hasFinalPossessive = true;
}
current = startBounds;
}
/**
* Determines if the text at the given position indicates an English possessive which should be removed
*
* @param pos Position in the text to check if it indicates an English possessive
* @return {@code true} if the text at the position indicates an English possessive, {@code false} otherwise
*/
private boolean endsWithPossessive(int pos) {
return (stemEnglishPossessive &&
pos > 2 &&
text[pos - 2] == '\'' &&
(text[pos - 1] == 's' || text[pos - 1] == 'S') &&
isAlpha(charType(text[pos - 3])) &&
(pos == endBounds || isSubwordDelim(charType(text[pos]))));
}
/**
* Determines the type of the given character
*
* @param ch Character whose type is to be determined
* @return Type of the character
*/
private int charType(int ch) {
if (ch < charTypeTable.length) {
return charTypeTable[ch];
}
switch (Character.getType(ch)) {
case Character.UPPERCASE_LETTER: return UPPER;
case Character.LOWERCASE_LETTER: return LOWER;
case Character.TITLECASE_LETTER:
case Character.MODIFIER_LETTER:
case Character.OTHER_LETTER:
case Character.NON_SPACING_MARK:
case Character.ENCLOSING_MARK: // depends on what it encloses?
case Character.COMBINING_SPACING_MARK:
return ALPHA;
case Character.DECIMAL_DIGIT_NUMBER:
case Character.LETTER_NUMBER:
case Character.OTHER_NUMBER:
return DIGIT;
// case Character.SPACE_SEPARATOR:
// case Character.LINE_SEPARATOR:
// case Character.PARAGRAPH_SEPARATOR:
// case Character.CONTROL:
// case Character.FORMAT:
// case Character.PRIVATE_USE:
case Character.SURROGATE: // prevent splitting
return ALPHA|DIGIT;
// case Character.DASH_PUNCTUATION:
// case Character.START_PUNCTUATION:
// case Character.END_PUNCTUATION:
// case Character.CONNECTOR_PUNCTUATION:
// case Character.OTHER_PUNCTUATION:
// case Character.MATH_SYMBOL:
// case Character.CURRENCY_SYMBOL:
// case Character.MODIFIER_SYMBOL:
// case Character.OTHER_SYMBOL:
// case Character.INITIAL_QUOTE_PUNCTUATION:
// case Character.FINAL_QUOTE_PUNCTUATION:
default: return SUBWORD_DELIM;
}
}
}

View File

@ -20,6 +20,12 @@ import org.apache.commons.io.IOUtils;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.FlagsAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.tokenattributes.PayloadAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.analysis.tokenattributes.TermAttribute;
import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
import org.apache.solr.common.SolrException;
import org.apache.solr.common.SolrInputDocument;
import org.apache.solr.common.params.SolrParams;
@ -132,15 +138,20 @@ public class AnalysisRequestHandler extends RequestHandlerBase {
static NamedList<NamedList<Object>> getTokens(TokenStream tstream) throws IOException {
// outer is namedList since order of tokens is important
NamedList<NamedList<Object>> tokens = new NamedList<NamedList<Object>>();
Token t = null;
while (((t = tstream.next()) != null)) {
// TODO: support custom attributes
TermAttribute termAtt = (TermAttribute) tstream.addAttribute(TermAttribute.class);
OffsetAttribute offsetAtt = (OffsetAttribute) tstream.addAttribute(OffsetAttribute.class);
TypeAttribute typeAtt = (TypeAttribute) tstream.addAttribute(TypeAttribute.class);
PositionIncrementAttribute posIncAtt = (PositionIncrementAttribute) tstream.addAttribute(PositionIncrementAttribute.class);
while (tstream.incrementToken()) {
NamedList<Object> token = new SimpleOrderedMap<Object>();
tokens.add("token", token);
token.add("value", new String(t.termBuffer(), 0, t.termLength()));
token.add("start", t.startOffset());
token.add("end", t.endOffset());
token.add("posInc", t.getPositionIncrement());
token.add("type", t.type());
token.add("value", new String(termAtt.termBuffer(), 0, termAtt.termLength()));
token.add("start", offsetAtt.startOffset());
token.add("end", offsetAtt.endOffset());
token.add("posInc", posIncAtt.getPositionIncrement());
token.add("type", typeAtt.type());
//TODO: handle payloads
}
return tokens;
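For reference, the attribute-based consumption pattern that this hunk (and most of the commit) switches to looks like the following minimal sketch. It is a fragment, not code from the patch: the analyzer, field name and input string are illustrative, and the imports are the tokenattributes classes already added above.

    TokenStream ts = analyzer.reusableTokenStream("content", new StringReader("some example text"));
    TermAttribute termAtt = (TermAttribute) ts.addAttribute(TermAttribute.class);
    OffsetAttribute offsetAtt = (OffsetAttribute) ts.addAttribute(OffsetAttribute.class);
    ts.reset();
    while (ts.incrementToken()) {
      // attribute instances are reused across calls; copy values out if they must survive the loop
      String term = new String(termAtt.termBuffer(), 0, termAtt.termLength());
      int start = offsetAtt.startOffset();
      int end = offsetAtt.endOffset();
    }
    ts.end();
    ts.close();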

View File

@ -22,6 +22,12 @@ import org.apache.lucene.analysis.CharReader;
import org.apache.lucene.analysis.CharStream;
import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.FlagsAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.tokenattributes.PayloadAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.analysis.tokenattributes.TermAttribute;
import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
import org.apache.solr.analysis.CharFilterFactory;
import org.apache.solr.analysis.TokenFilterFactory;
import org.apache.solr.analysis.TokenizerChain;
@ -141,11 +147,30 @@ public abstract class AnalysisRequestHandlerBase extends RequestHandlerBase {
*/
private List<Token> analyzeTokenStream(TokenStream tokenStream) {
List<Token> tokens = new ArrayList<Token>();
Token reusableToken = new Token();
Token token = null;
// TODO change this API to support custom attributes
TermAttribute termAtt = (TermAttribute)
tokenStream.addAttribute(TermAttribute.class);
OffsetAttribute offsetAtt = (OffsetAttribute)
tokenStream.addAttribute(OffsetAttribute.class);
TypeAttribute typeAtt = (TypeAttribute)
tokenStream.addAttribute(TypeAttribute.class);
FlagsAttribute flagsAtt = (FlagsAttribute)
tokenStream.addAttribute(FlagsAttribute.class);
PayloadAttribute payloadAtt = (PayloadAttribute)
tokenStream.addAttribute(PayloadAttribute.class);
PositionIncrementAttribute posIncAtt = (PositionIncrementAttribute)
tokenStream.addAttribute(PositionIncrementAttribute.class);
try {
while ((token = tokenStream.next(reusableToken)) != null) {
while (tokenStream.incrementToken()) {
Token token = new Token();
token.setTermBuffer(termAtt.termBuffer(), 0, termAtt.termLength());
token.setOffset(offsetAtt.startOffset(), offsetAtt.endOffset());
token.setType(typeAtt.type());
token.setFlags(flagsAtt.getFlags());
token.setPayload(payloadAtt.getPayload());
token.setPositionIncrement(posIncAtt.getPositionIncrement());
tokens.add((Token) token.clone());
}
} catch (IOException ioe) {
@ -229,16 +254,30 @@ public abstract class AnalysisRequestHandlerBase extends RequestHandlerBase {
/**
* TokenStream that iterates over a list of pre-existing Tokens
*/
// TODO refactor to support custom attributes
protected static class ListBasedTokenStream extends TokenStream {
private final List<Token> tokens;
private Iterator<Token> tokenIterator;
private final Iterator<Token> tokenIterator;
private final TermAttribute termAtt = (TermAttribute)
addAttribute(TermAttribute.class);
private final OffsetAttribute offsetAtt = (OffsetAttribute)
addAttribute(OffsetAttribute.class);
private final TypeAttribute typeAtt = (TypeAttribute)
addAttribute(TypeAttribute.class);
private final FlagsAttribute flagsAtt = (FlagsAttribute)
addAttribute(FlagsAttribute.class);
private final PayloadAttribute payloadAtt = (PayloadAttribute)
addAttribute(PayloadAttribute.class);
private final PositionIncrementAttribute posIncAtt = (PositionIncrementAttribute)
addAttribute(PositionIncrementAttribute.class);
/**
* Creates a new ListBasedTokenStream which uses the given tokens as its token source.
*
* @param tokens Source of tokens to be used
*/
ListBasedTokenStream(List<Token> tokens) {
this.tokens = tokens;
tokenIterator = tokens.iterator();
}
@ -246,8 +285,25 @@ public abstract class AnalysisRequestHandlerBase extends RequestHandlerBase {
* {@inheritDoc}
*/
@Override
public Token next(Token token) throws IOException {
return (tokenIterator.hasNext()) ? tokenIterator.next() : null;
public boolean incrementToken() throws IOException {
if (tokenIterator.hasNext()) {
Token next = tokenIterator.next();
termAtt.setTermBuffer(next.termBuffer(), 0, next.termLength());
typeAtt.setType(next.type());
offsetAtt.setOffset(next.startOffset(), next.endOffset());
flagsAtt.setFlags(next.getFlags());
payloadAtt.setPayload(next.getPayload());
posIncAtt.setPositionIncrement(next.getPositionIncrement());
return true;
} else {
return false;
}
}
@Override
public void reset() throws IOException {
super.reset();
tokenIterator = tokens.iterator();
}
}

View File

@ -258,7 +258,7 @@ public class SpellCheckerRequestHandler extends RequestHandlerBase implements So
}
dirDescription = f.getAbsolutePath();
log.info("using spell directory: " + dirDescription);
spellcheckerIndexDir = FSDirectory.getDirectory(f);
spellcheckerIndexDir = FSDirectory.open(f);
} else {
log.info("using RAM based spell directory");
}
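FSDirectory.getDirectory(...) was removed in Lucene 3.0; the replacement, used above, takes a java.io.File. A hedged sketch with an illustrative path:

    File spellDir = new File("/path/to/spellchecker");   // illustrative path, not from the patch
    Directory dir = FSDirectory.open(spellDir);           // replaces FSDirectory.getDirectory(String)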

View File

@ -40,7 +40,7 @@ import org.apache.lucene.index.Term;
import org.apache.lucene.index.TermEnum;
import org.apache.lucene.index.TermFreqVector;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.ConstantScoreRangeQuery;
import org.apache.lucene.search.TermRangeQuery;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.store.Directory;
import org.apache.lucene.util.PriorityQueue;
@ -172,7 +172,8 @@ public class LukeRequestHandler extends RequestHandlerBase
flags.append( (f != null && f.getOmitNorms()) ? FieldFlag.OMIT_NORMS.getAbbreviation() : '-' );
flags.append( (f != null && f.isLazy()) ? FieldFlag.LAZY.getAbbreviation() : '-' );
flags.append( (f != null && f.isBinary()) ? FieldFlag.BINARY.getAbbreviation() : '-' );
flags.append( (f != null && f.isCompressed()) ? FieldFlag.COMPRESSED.getAbbreviation() : '-' );
//nocommit: handle compressed
//flags.append( (f != null && f.isCompressed()) ? FieldFlag.COMPRESSED.getAbbreviation() : '-' );
flags.append( (false) ? FieldFlag.SORT_MISSING_FIRST.getAbbreviation() : '-' ); // SchemaField Specific
flags.append( (false) ? FieldFlag.SORT_MISSING_LAST.getAbbreviation() : '-' ); // SchemaField Specific
return flags.toString();
@ -312,7 +313,7 @@ public class LukeRequestHandler extends RequestHandlerBase
// If numTerms==0, the call is just asking for a quick field list
if( ttinfo != null && sfield != null && sfield.indexed() ) {
Query q = new ConstantScoreRangeQuery(fieldName,null,null,false,false);
Query q = new TermRangeQuery(fieldName,null,null,false,false);
TopDocs top = searcher.search( q, 1 );
if( top.totalHits > 0 ) {
// Find a document with this field
@ -652,7 +653,7 @@ public class LukeRequestHandler extends RequestHandlerBase
}
if( terms.docFreq() > tiq.minFreq ) {
tiq.put(new TopTermQueue.TermInfo(terms.term(), terms.docFreq()));
tiq.add(new TopTermQueue.TermInfo(terms.term(), terms.docFreq()));
if (tiq.size() > numTerms) { // if tiq full
tiq.pop(); // remove lowest in tiq
tiq.minFreq = ((TopTermQueue.TermInfo)tiq.top()).docFreq; // reset minFreq
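ConstantScoreRangeQuery no longer exists in Lucene 3.0; the hunk above probes whether any document carries the field by issuing an unbounded TermRangeQuery. A small sketch of the same idea, with an illustrative field name and an in-scope searcher assumed:

    // null lower/upper bounds make the range unbounded; both ends are exclusive here
    Query q = new TermRangeQuery("title", null, null, false, false);
    TopDocs top = searcher.search(q, 1);
    boolean fieldHasDocs = top.totalHits > 0;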

View File

@ -33,6 +33,7 @@ import org.apache.solr.common.params.ShardParams;
import org.apache.solr.common.params.SolrParams;
import org.apache.solr.common.util.NamedList;
import org.apache.solr.common.util.StrUtils;
import org.apache.solr.common.SolrException.ErrorCode;
import org.apache.solr.request.SolrQueryRequest;
import org.apache.solr.response.SolrQueryResponse;
import org.apache.solr.schema.FieldType;
@ -305,7 +306,6 @@ public class QueryComponent extends SearchComponent
public void handleResponses(ResponseBuilder rb, ShardRequest sreq) {
if ((sreq.purpose & ShardRequest.PURPOSE_GET_TOP_IDS) != 0) {
mergeIds(rb, sreq);
return;
}
if ((sreq.purpose & ShardRequest.PURPOSE_GET_FIELDS) != 0) {
@ -399,7 +399,8 @@ public class QueryComponent extends SearchComponent
// Merge the docs via a priority queue so we don't have to sort *all* of the
// documents... we only need to order the top (rows+start)
ShardFieldSortedHitQueue queue = new ShardFieldSortedHitQueue(sortFields, ss.getOffset() + ss.getCount());
ShardFieldSortedHitQueue queue;
queue = new ShardFieldSortedHitQueue(sortFields, ss.getOffset() + ss.getCount());
long numFound = 0;
Float maxScore=null;
@ -451,7 +452,7 @@ public class QueryComponent extends SearchComponent
shardDoc.sortFieldValues = sortFieldValues;
queue.insert(shardDoc);
queue.insertWithOverflow(shardDoc);
} // end for-each-doc-in-response
} // end for-each-response

View File

@ -38,8 +38,8 @@ import javax.xml.xpath.XPathExpressionException;
import javax.xml.xpath.XPathFactory;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.TermAttribute;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.*;
@ -298,10 +298,9 @@ public class QueryElevationComponent extends SearchComponent implements SolrCore
TokenStream tokens = analyzer.reusableTokenStream( "", new StringReader( query ) );
tokens.reset();
Token token = tokens.next();
while( token != null ) {
norm.append( new String(token.termBuffer(), 0, token.termLength()) );
token = tokens.next();
TermAttribute termAtt = (TermAttribute) tokens.addAttribute(TermAttribute.class);
while( tokens.incrementToken() ) {
norm.append( termAtt.termBuffer(), 0, termAtt.termLength() );
}
return norm.toString();
}

View File

@ -33,6 +33,12 @@ import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.WhitespaceAnalyzer;
import org.apache.lucene.analysis.tokenattributes.FlagsAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.tokenattributes.PayloadAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.analysis.tokenattributes.TermAttribute;
import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
import org.apache.lucene.index.IndexReader;
import org.apache.solr.common.SolrException;
import org.apache.solr.common.params.CommonParams;
@ -332,7 +338,7 @@ public class SpellCheckComponent extends SearchComponent implements SolrCoreAwar
// create token
SpellCheckResponse.Suggestion suggestion = origVsSuggestion.get(original);
Token token = new Token();
token.setTermText(original);
token.setTermBuffer(original);
token.setStartOffset(suggestion.getStartOffset());
token.setEndOffset(suggestion.getEndOffset());
@ -364,10 +370,24 @@ public class SpellCheckComponent extends SearchComponent implements SolrCoreAwar
private Collection<Token> getTokens(String q, Analyzer analyzer) throws IOException {
Collection<Token> result = new ArrayList<Token>();
Token token = null;
TokenStream ts = analyzer.reusableTokenStream("", new StringReader(q));
ts.reset();
while ((token = ts.next()) != null){
// TODO: support custom attributes
TermAttribute termAtt = (TermAttribute) ts.addAttribute(TermAttribute.class);
OffsetAttribute offsetAtt = (OffsetAttribute) ts.addAttribute(OffsetAttribute.class);
TypeAttribute typeAtt = (TypeAttribute) ts.addAttribute(TypeAttribute.class);
FlagsAttribute flagsAtt = (FlagsAttribute) ts.addAttribute(FlagsAttribute.class);
PayloadAttribute payloadAtt = (PayloadAttribute) ts.addAttribute(PayloadAttribute.class);
PositionIncrementAttribute posIncAtt = (PositionIncrementAttribute) ts.addAttribute(PositionIncrementAttribute.class);
while (ts.incrementToken()){
Token token = new Token();
token.setTermBuffer(termAtt.termBuffer(), 0, termAtt.termLength());
token.setOffset(offsetAtt.startOffset(), offsetAtt.endOffset());
token.setType(typeAtt.type());
token.setFlags(flagsAtt.getFlags());
token.setPayload(payloadAtt.getPayload());
token.setPositionIncrement(posIncAtt.getPositionIncrement());
result.add(token);
}
return result;

View File

@ -113,7 +113,7 @@ public class TermVectorComponent extends SearchComponent implements SolrCoreAwar
IndexSchema schema = rb.req.getSchema();
String uniqFieldName = schema.getUniqueKeyField().getName();
//Only load the id field
SetBasedFieldSelector fieldSelector = new SetBasedFieldSelector(Collections.singleton(uniqFieldName), Collections.emptySet());
SetBasedFieldSelector fieldSelector = new SetBasedFieldSelector(Collections.singleton(uniqFieldName), Collections.<String>emptySet());
while (iter.hasNext()) {
Integer docId = iter.next();
NamedList docNL = new NamedList();

View File

@ -32,6 +32,7 @@ import org.apache.lucene.analysis.CachingTokenFilter;
import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.document.Document;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.highlight.*;
@ -39,6 +40,7 @@ import org.apache.lucene.search.vectorhighlight.FastVectorHighlighter;
import org.apache.lucene.search.vectorhighlight.FieldQuery;
import org.apache.lucene.search.vectorhighlight.FragListBuilder;
import org.apache.lucene.search.vectorhighlight.FragmentsBuilder;
import org.apache.lucene.util.AttributeSource.State;
import org.apache.solr.common.SolrException;
import org.apache.solr.common.params.HighlightParams;
import org.apache.solr.common.params.SolrParams;
@ -512,28 +514,28 @@ public class DefaultSolrHighlighter extends SolrHighlighter implements PluginInf
*/
class TokenOrderingFilter extends TokenFilter {
private final int windowSize;
private final LinkedList<Token> queue = new LinkedList<Token>();
private final LinkedList<OrderedToken> queue = new LinkedList<OrderedToken>();
private boolean done=false;
private final OffsetAttribute offsetAtt = (OffsetAttribute) addAttribute(OffsetAttribute.class);
protected TokenOrderingFilter(TokenStream input, int windowSize) {
super(input);
this.windowSize = windowSize;
}
@Override
public Token next() throws IOException {
public boolean incrementToken() throws IOException {
while (!done && queue.size() < windowSize) {
Token newTok = input.next();
if (newTok==null) {
done=true;
if (!input.incrementToken()) {
done = true;
break;
}
// reverse iterating for better efficiency since we know the
// list is already sorted, and most token start offsets will be too.
ListIterator<Token> iter = queue.listIterator(queue.size());
ListIterator<OrderedToken> iter = queue.listIterator(queue.size());
while(iter.hasPrevious()) {
if (newTok.startOffset() >= iter.previous().startOffset()) {
if (offsetAtt.startOffset() >= iter.previous().startOffset) {
// insertion will be before what next() would return (what
// we just compared against), so move back one so the insertion
// will be after.
@ -541,50 +543,82 @@ class TokenOrderingFilter extends TokenFilter {
break;
}
}
iter.add(newTok);
OrderedToken ot = new OrderedToken();
ot.state = captureState();
ot.startOffset = offsetAtt.startOffset();
iter.add(ot);
}
return queue.isEmpty() ? null : queue.removeFirst();
if (queue.isEmpty()) {
return false;
} else {
restoreState(queue.removeFirst().state);
return true;
}
}
}
// for TokenOrderingFilter, so it can easily sort by startOffset
class OrderedToken {
State state;
int startOffset;
}
class TermOffsetsTokenStream {
TokenStream bufferedTokenStream = null;
Token bufferedToken;
OffsetAttribute bufferedOffsetAtt;
State bufferedToken;
int bufferedStartOffset;
int bufferedEndOffset;
int startOffset;
int endOffset;
public TermOffsetsTokenStream( TokenStream tstream ){
bufferedTokenStream = tstream;
bufferedOffsetAtt = (OffsetAttribute) bufferedTokenStream.addAttribute(OffsetAttribute.class);
startOffset = 0;
bufferedToken = null;
}
public TokenStream getMultiValuedTokenStream( final int length ){
endOffset = startOffset + length;
return new TokenStream(){
Token token;
public Token next() throws IOException {
return new MultiValuedStream(length);
}
class MultiValuedStream extends TokenStream {
private final int length;
OffsetAttribute offsetAtt = (OffsetAttribute) addAttribute(OffsetAttribute.class);
MultiValuedStream(int length) {
super(bufferedTokenStream.cloneAttributes());
this.length = length;
}
public boolean incrementToken() throws IOException {
while( true ){
if( bufferedToken == null )
bufferedToken = bufferedTokenStream.next();
if( bufferedToken == null ) return null;
if( startOffset <= bufferedToken.startOffset() &&
bufferedToken.endOffset() <= endOffset ){
token = bufferedToken;
bufferedToken = null;
token.setStartOffset( token.startOffset() - startOffset );
token.setEndOffset( token.endOffset() - startOffset );
return token;
if( bufferedToken == null ) {
if (!bufferedTokenStream.incrementToken())
return false;
bufferedToken = bufferedTokenStream.captureState();
bufferedStartOffset = bufferedOffsetAtt.startOffset();
bufferedEndOffset = bufferedOffsetAtt.endOffset();
}
else if( bufferedToken.endOffset() > endOffset ){
if( startOffset <= bufferedStartOffset &&
bufferedEndOffset <= endOffset ){
restoreState(bufferedToken);
bufferedToken = null;
offsetAtt.setOffset( offsetAtt.startOffset() - startOffset, offsetAtt.endOffset() - startOffset );
return true;
}
else if( bufferedEndOffset > endOffset ){
startOffset += length + 1;
return null;
return false;
}
bufferedToken = null;
}
}
};
}
}
};
};

View File

@ -176,7 +176,7 @@ public abstract class BaseResponseWriter {
Object val = null;
if (ft == null) { // handle fields not in the schema
if (f.isBinary())
val = f.binaryValue();
val = f.getBinaryValue();
else
val = f.stringValue();
} else {

View File

@ -140,7 +140,7 @@ public class BinaryResponseWriter implements BinaryQueryResponseWriter {
if(sf != null) ft =sf.getType();
Object val;
if (ft == null) { // handle fields not in the schema
if (f.isBinary()) val = f.binaryValue();
if (f.isBinary()) val = f.getBinaryValue();
else val = f.stringValue();
} else {
try {

View File

@ -58,8 +58,10 @@ public abstract class CompressableField extends FieldType {
String internalVal) {
/* compress field if length exceeds threshold */
if(field.isCompressed()) {
return internalVal.length() >= compressThreshold ?
Field.Store.COMPRESS : Field.Store.YES;
// nocommit: handle compression
//return internalVal.length() >= compressThreshold ?
// Field.Store.COMPRESS : Field.Store.YES;
return Field.Store.YES;
} else
return super.getFieldStore(field, internalVal);
}

View File

@ -302,8 +302,8 @@ public abstract class FieldType extends FieldProperties {
}
protected Field.Index getFieldIndex(SchemaField field,
String internalVal) {
return field.indexed() ? (isTokenized() ? Field.Index.TOKENIZED :
Field.Index.UN_TOKENIZED) : Field.Index.NO;
return field.indexed() ? (isTokenized() ? Field.Index.ANALYZED :
Field.Index.NOT_ANALYZED) : Field.Index.NO;
}
/**

View File

@ -63,7 +63,7 @@ public class TrieDateField extends DateField {
@Override
public Date toObject(Fieldable f) {
byte[] arr = f.binaryValue();
byte[] arr = f.getBinaryValue();
if (arr==null) throw new SolrException(SolrException.ErrorCode.SERVER_ERROR,TrieField.badFieldString(f));
return new Date(TrieField.toLong(arr));
}
@ -85,7 +85,7 @@ public class TrieDateField extends DateField {
@Override
public void write(XMLWriter xmlWriter, String name, Fieldable f) throws IOException {
byte[] arr = f.binaryValue();
byte[] arr = f.getBinaryValue();
if (arr==null) {
xmlWriter.writeStr(name, TrieField.badFieldString(f));
return;
@ -96,7 +96,7 @@ public class TrieDateField extends DateField {
@Override
public void write(TextResponseWriter writer, String name, Fieldable f) throws IOException {
byte[] arr = f.binaryValue();
byte[] arr = f.getBinaryValue();
if (arr==null) {
writer.writeStr(name, TrieField.badFieldString(f),true);
return;
@ -136,7 +136,7 @@ public class TrieDateField extends DateField {
@Override
public String toExternal(Fieldable f) {
byte[] arr = f.binaryValue();
byte[] arr = f.getBinaryValue();
if (arr==null) return TrieField.badFieldString(f);
return super.toExternal(new Date(TrieField.toLong(arr)));
}

View File

@ -93,7 +93,7 @@ public class TrieField extends FieldType {
@Override
public Object toObject(Fieldable f) {
byte[] arr = f.binaryValue();
byte[] arr = f.getBinaryValue();
if (arr==null) return badFieldString(f);
switch (type) {
case INTEGER:
@ -145,7 +145,7 @@ public class TrieField extends FieldType {
}
public void write(XMLWriter xmlWriter, String name, Fieldable f) throws IOException {
byte[] arr = f.binaryValue();
byte[] arr = f.getBinaryValue();
if (arr==null) {
xmlWriter.writeStr(name, badFieldString(f));
return;
@ -173,7 +173,7 @@ public class TrieField extends FieldType {
}
public void write(TextResponseWriter writer, String name, Fieldable f) throws IOException {
byte[] arr = f.binaryValue();
byte[] arr = f.getBinaryValue();
if (arr==null) {
writer.writeStr(name, badFieldString(f),true);
return;
@ -352,7 +352,7 @@ public class TrieField extends FieldType {
@Override
public String toExternal(Fieldable f) {
byte[] arr = f.binaryValue();
byte[] arr = f.getBinaryValue();
if (arr==null) return badFieldString(f);
switch (type) {
case INTEGER:

View File

@ -17,7 +17,6 @@
package org.apache.solr.search;
import org.apache.lucene.search.HitCollector;
import org.apache.lucene.search.Collector;
import org.apache.lucene.search.Scorer;
import org.apache.lucene.util.OpenBitSet;

View File

@ -25,7 +25,6 @@ import org.apache.lucene.index.TermEnum;
import org.apache.lucene.index.TermDocs;
import org.apache.lucene.util.OpenBitSet;
import java.util.BitSet;
import java.io.IOException;
/**
@ -40,17 +39,6 @@ public class PrefixFilter extends Filter {
Term getPrefix() { return prefix; }
@Override
public BitSet bits(IndexReader reader) throws IOException {
final BitSet bitSet = new BitSet(reader.maxDoc());
new PrefixGenerator(prefix) {
public void handleDoc(int doc) {
bitSet.set(doc);
}
}.generate(reader);
return bitSet;
}
@Override
public DocIdSet getDocIdSet(IndexReader reader) throws IOException {
final OpenBitSet bitSet = new OpenBitSet(reader.maxDoc());

View File

@ -17,10 +17,14 @@
package org.apache.solr.search;
import org.apache.lucene.search.FieldComparator;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.Sort;
import org.apache.lucene.search.SortField;
import org.apache.solr.common.SolrException.ErrorCode;
import org.apache.solr.common.SolrException;
import java.io.IOException;
import java.util.List;
/** A hash key encapsulating a query, a list of filters, and a sort
@ -38,7 +42,7 @@ public final class QueryResultKey {
private static SortField[] defaultSort = new SortField[0];
public QueryResultKey(Query query, List<Query> filters, Sort sort, int nc_flags) {
public QueryResultKey(Query query, List<Query> filters, Sort sort, int nc_flags) throws IOException {
this.query = query;
this.sort = sort;
this.filters = filters;

View File

@ -133,20 +133,10 @@ public class SolrConstantScoreQuery extends ConstantScoreQuery {
}
}
/** @deprecated use {@link #nextDoc()} instead. */
public boolean next() throws IOException {
return docIdSetIterator.nextDoc() != NO_MORE_DOCS;
}
public int nextDoc() throws IOException {
return docIdSetIterator.nextDoc();
}
/** @deprecated use {@link #docID()} instead. */
public int doc() {
return docIdSetIterator.doc();
}
public int docID() {
return docIdSetIterator.docID();
}
@ -155,11 +145,6 @@ public class SolrConstantScoreQuery extends ConstantScoreQuery {
return theScore;
}
/** @deprecated use {@link #advance(int)} instead. */
public boolean skipTo(int target) throws IOException {
return docIdSetIterator.advance(target) != NO_MORE_DOCS;
}
public int advance(int target) throws IOException {
return docIdSetIterator.advance(target);
}

View File

@ -454,11 +454,6 @@ public class SolrIndexReader extends FilterIndexReader {
return in.getIndexCommit();
}
@Override
public int getTermInfosIndexDivisor() {
return in.getTermInfosIndexDivisor();
}
@Override
public void incRef() {
in.incRef();
@ -479,11 +474,6 @@ public class SolrIndexReader extends FilterIndexReader {
in.setNorm(doc, field, value);
}
@Override
public void setTermInfosIndexDivisor(int indexDivisor) throws IllegalStateException {
in.setTermInfosIndexDivisor(indexDivisor);
}
@Override
public TermPositions termPositions(Term term) throws IOException {
return in.termPositions(term);
@ -498,16 +488,6 @@ public class SolrIndexReader extends FilterIndexReader {
public Object getFieldCacheKey() {
return in.getFieldCacheKey();
}
@Override
public boolean getDisableFakeNorms() {
return in.getDisableFakeNorms();
}
@Override
public void setDisableFakeNorms(boolean disableFakeNorms) {
in.setDisableFakeNorms(disableFakeNorms);
}
}

View File

@ -342,22 +342,22 @@ public class SolrIndexSearcher extends IndexSearcher implements SolrInfoMBean {
return qr;
}
public Hits search(Query query, Filter filter, Sort sort) throws IOException {
// todo - when Solr starts accepting filters, need to
// change this conditional check (filter!=null) and create a new filter
// that ANDs them together if it already exists.
if (optimizer==null || filter!=null || !(query instanceof BooleanQuery)
) {
return super.search(query,filter,sort);
} else {
Query[] newQuery = new Query[1];
Filter[] newFilter = new Filter[1];
optimizer.optimize((BooleanQuery)query, this, 0, newQuery, newFilter);
return super.search(newQuery[0], newFilter[0], sort);
}
}
// public Hits search(Query query, Filter filter, Sort sort) throws IOException {
// // todo - when Solr starts accepting filters, need to
// // change this conditional check (filter!=null) and create a new filter
// // that ANDs them together if it already exists.
//
// if (optimizer==null || filter!=null || !(query instanceof BooleanQuery)
// ) {
// return super.search(query,filter,sort);
// } else {
// Query[] newQuery = new Query[1];
// Filter[] newFilter = new Filter[1];
// optimizer.optimize((BooleanQuery)query, this, 0, newQuery, newFilter);
//
// return super.search(newQuery[0], newFilter[0], sort);
// }
// }
/**
* @return the indexDir on which this searcher is opened
@ -697,10 +697,12 @@ public class SolrIndexSearcher extends IndexSearcher implements SolrInfoMBean {
* This method is not cache-aware and no caches are checked.
*/
public DocSet convertFilter(Filter lfilter) throws IOException {
BitSet bs = lfilter.bits(this.reader);
OpenBitSet obs = new OpenBitSet(bs.size());
for(int i=bs.nextSetBit(0); i>=0; i=bs.nextSetBit(i+1)) {
obs.fastSet(i);
DocIdSet docSet = lfilter.getDocIdSet(this.reader);
OpenBitSet obs = new OpenBitSet();
DocIdSetIterator it = docSet.iterator();
int doc;
while((doc = it.nextDoc()) != DocIdSetIterator.NO_MORE_DOCS) {
obs.fastSet(doc);
}
return new BitDocSet(obs);
}
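Filter.bits(IndexReader) was removed in Lucene 3.0; the getDocIdSet/DocIdSetIterator loop above is the general replacement. The same pattern as a standalone fragment, assuming a filter and reader in scope:

    DocIdSet docSet = filter.getDocIdSet(reader);
    OpenBitSet bits = new OpenBitSet(reader.maxDoc());
    DocIdSetIterator it = docSet.iterator();
    int doc;
    while ((doc = it.nextDoc()) != DocIdSetIterator.NO_MORE_DOCS) {
      bits.fastSet(doc);
    }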

View File

@ -25,15 +25,13 @@ import org.apache.lucene.index.Term;
import org.apache.lucene.queryParser.ParseException;
import org.apache.lucene.queryParser.QueryParser;
import org.apache.lucene.search.*;
import org.apache.lucene.util.Version;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.reverse.ReverseStringFilter;
import org.apache.solr.analysis.*;
import org.apache.solr.common.SolrException;
import org.apache.solr.schema.FieldType;
import org.apache.solr.schema.IndexSchema;
import org.apache.solr.schema.SchemaField;
import org.apache.solr.schema.TrieField;
import org.apache.solr.schema.SchemaField;
import org.apache.solr.schema.TextField;
// TODO: implement the analysis of simple fields with
@ -77,7 +75,7 @@ public class SolrQueryParser extends QueryParser {
* @see IndexSchema#getDefaultSearchFieldName()
*/
public SolrQueryParser(IndexSchema schema, String defaultField) {
super(defaultField == null ? schema.getDefaultSearchFieldName() : defaultField, schema.getQueryAnalyzer());
super(Version.LUCENE_24, defaultField == null ? schema.getDefaultSearchFieldName() : defaultField, schema.getQueryAnalyzer());
this.schema = schema;
this.parser = null;
this.defaultField = defaultField;
@ -91,7 +89,7 @@ public class SolrQueryParser extends QueryParser {
}
public SolrQueryParser(QParser parser, String defaultField, Analyzer analyzer) {
super(defaultField, analyzer);
super(Version.LUCENE_24, defaultField, analyzer);
this.schema = parser.getReq().getSchema();
this.parser = parser;
this.defaultField = defaultField;
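The QueryParser constructors without a Version argument are gone in Lucene 3.0; the Version chosen here (LUCENE_24) pins the parser to 2.4-era behaviour. A hedged construction sketch, with an illustrative default field and an analyzer assumed to be in scope:

    QueryParser qp = new QueryParser(Version.LUCENE_24, "text", analyzer);
    Query q = qp.parse("solr AND lucene");   // parse(...) throws ParseException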

View File

@ -26,7 +26,6 @@ import org.apache.lucene.index.TermEnum;
import org.apache.lucene.index.TermDocs;
import org.apache.lucene.util.OpenBitSet;
import java.util.BitSet;
import java.io.IOException;
@ -43,20 +42,6 @@ public class WildcardFilter extends Filter {
public Term getTerm() { return term; }
/**
* @deprecated Use {@link #getDocIdSet(IndexReader)} instead.
*/
@Override
public BitSet bits(IndexReader reader) throws IOException {
final BitSet bitSet = new BitSet(reader.maxDoc());
new WildcardGenerator(term) {
public void handleDoc(int doc) {
bitSet.set(doc);
}
}.generate(reader);
return bitSet;
}
@Override
public DocIdSet getDocIdSet(IndexReader reader) throws IOException {
final OpenBitSet bitSet = new OpenBitSet(reader.maxDoc());

View File

@ -100,7 +100,7 @@ public class FunctionQuery extends Query {
int[] offsets = topReader.getLeafOffsets();
int readerPos = SolrIndexReader.readerIndex(doc, offsets);
int readerBase = offsets[readerPos];
return scorer(subReaders[readerPos], true, true).explain(doc-readerBase);
return ((AllScorer)scorer(subReaders[readerPos], true, true)).explain(doc-readerBase);
}
}

View File

@ -33,7 +33,6 @@ import org.apache.lucene.index.Term;
import org.apache.lucene.search.spell.Dictionary;
import org.apache.lucene.search.spell.LevensteinDistance;
import org.apache.lucene.search.spell.SpellChecker;
import org.apache.lucene.search.spell.StringDistance;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.store.RAMDirectory;
@ -184,7 +183,7 @@ public abstract class AbstractLuceneSpellChecker extends SolrSpellChecker {
*/
protected void initIndex() throws IOException {
if (indexDir != null) {
index = FSDirectory.getDirectory(indexDir);
index = FSDirectory.open(new File(indexDir));
} else {
index = new RAMDirectory();
}

View File

@ -22,12 +22,10 @@ import java.util.List;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.apache.lucene.analysis.WhitespaceAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.spell.PlainTextDictionary;
import org.apache.lucene.store.RAMDirectory;
import org.apache.solr.common.util.NamedList;
@ -98,7 +96,7 @@ public class FileBasedSpellChecker extends AbstractLuceneSpellChecker {
for (String s : lines) {
Document d = new Document();
d.add(new Field(WORD_FIELD_NAME, s, Field.Store.NO, Field.Index.TOKENIZED));
d.add(new Field(WORD_FIELD_NAME, s, Field.Store.NO, Field.Index.ANALYZED));
writer.addDocument(d);
}
writer.optimize();
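Field.Index.TOKENIZED and UN_TOKENIZED were renamed to ANALYZED and NOT_ANALYZED, which is the only change in several of the hunks below. A small sketch of building a document with both, using illustrative field names and an IndexWriter assumed to be in scope:

    Document d = new Document();
    d.add(new Field("id", "42", Field.Store.YES, Field.Index.NOT_ANALYZED));      // was UN_TOKENIZED
    d.add(new Field("body", "some text", Field.Store.NO, Field.Index.ANALYZED));  // was TOKENIZED
    writer.addDocument(d);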

View File

@ -16,17 +16,14 @@ package org.apache.solr.spelling;
* limitations under the License.
*/
import org.apache.lucene.analysis.WhitespaceAnalyzer;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.store.FSDirectory;
import org.apache.solr.common.util.NamedList;
import org.apache.solr.core.SolrCore;
import org.apache.solr.core.SolrResourceLoader;
import org.apache.solr.schema.FieldType;
import org.apache.solr.schema.IndexSchema;
import org.apache.solr.search.SolrIndexSearcher;
import org.apache.solr.util.HighFrequencyDictionary;
import java.io.File;
import java.io.IOException;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
@ -63,7 +60,7 @@ public class IndexBasedSpellChecker extends AbstractLuceneSpellChecker {
private void initSourceReader() {
if (sourceLocation != null) {
try {
FSDirectory luceneIndexDir = FSDirectory.getDirectory(sourceLocation);
FSDirectory luceneIndexDir = FSDirectory.open(new File(sourceLocation));
this.reader = IndexReader.open(luceneIndexDir);
} catch (IOException e) {
throw new RuntimeException(e);

View File

@ -27,6 +27,11 @@ import java.util.regex.Pattern;
import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.FlagsAttribute;
import org.apache.lucene.analysis.tokenattributes.PayloadAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.analysis.tokenattributes.TermAttribute;
import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
/**
@ -99,10 +104,22 @@ public class SpellingQueryConverter extends QueryConverter {
if (word.equals("AND") == false && word.equals("OR") == false) {
try {
stream = analyzer.reusableTokenStream("", new StringReader(word));
Token token;
while ((token = stream.next()) != null) {
// TODO: support custom attributes
TermAttribute termAtt = (TermAttribute) stream.addAttribute(TermAttribute.class);
FlagsAttribute flagsAtt = (FlagsAttribute) stream.addAttribute(FlagsAttribute.class);
TypeAttribute typeAtt = (TypeAttribute) stream.addAttribute(TypeAttribute.class);
PayloadAttribute payloadAtt = (PayloadAttribute) stream.addAttribute(PayloadAttribute.class);
PositionIncrementAttribute posIncAtt = (PositionIncrementAttribute) stream.addAttribute(PositionIncrementAttribute.class);
stream.reset();
while (stream.incrementToken()) {
Token token = new Token();
token.setTermBuffer(termAtt.termBuffer(), 0, termAtt.termLength());
token.setStartOffset(matcher.start());
token.setEndOffset(matcher.end());
token.setFlags(flagsAtt.getFlags());
token.setType(typeAtt.type());
token.setPayload(payloadAtt.getPayload());
token.setPositionIncrement(posIncAtt.getPositionIncrement());
result.add(token);
}
} catch (IOException e) {

View File

@ -66,12 +66,11 @@ public class OldRequestHandler implements SolrRequestHandler {
sort = QueryParsing.parseSort(commands.get(1), req.getSchema());
}
Hits hits=null;
try {
hits = req.getSearcher().search(query,filter,sort);
TopFieldDocs hits = req.getSearcher().search(query,filter, req.getStart()+req.getLimit(), sort);
int numHits = hits.length();
int numHits = hits.totalHits;
int startRow = Math.min(numHits, req.getStart());
int endRow = Math.min(numHits,req.getStart()+req.getLimit());
int numRows = endRow-startRow;
@ -79,8 +78,8 @@ public class OldRequestHandler implements SolrRequestHandler {
int[] ids = new int[numRows];
Document[] data = new Document[numRows];
for (int i=startRow; i<endRow; i++) {
ids[i] = hits.id(i);
data[i] = hits.doc(i);
ids[i] = hits.scoreDocs[i].doc;
data[i] = req.getSearcher().doc(ids[i]);
}
rsp.add(null, new DocSlice(0,numRows,ids,null,numHits,0.0f));
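The Hits class was removed in Lucene 3.0; searches now return TopDocs/TopFieldDocs and stored documents are fetched from the searcher by doc id, as the rewrite above does. A minimal sketch, with query, filter, sort, start and rows assumed to be in scope:

    TopFieldDocs hits = searcher.search(query, filter, start + rows, sort);
    int numHits = hits.totalHits;
    for (int i = start; i < Math.min(numHits, start + rows); i++) {
      int id = hits.scoreDocs[i].doc;        // replaces hits.id(i)
      Document doc = searcher.doc(id);       // replaces hits.doc(i)
    }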

View File

@ -144,7 +144,7 @@ public class TestRequestHandler implements SolrRequestHandler {
nl.add("myLong",999999999999L);
Document doc = new Document();
doc.add(new Field("id","55",Field.Store.YES, Field.Index.UN_TOKENIZED));
doc.add(new Field("id","55",Field.Store.YES, Field.Index.NOT_ANALYZED));
nl.add("myDoc",doc);
nl.add("myResult",results);
@ -172,8 +172,8 @@ public class TestRequestHandler implements SolrRequestHandler {
//
// test against hits
//
Hits hits = searcher.search(query, lfilter, sort);
test(hits.length() == results.matches());
TopFieldDocs hits = searcher.search(query, lfilter, 1000, sort);
test(hits.totalHits == results.matches());
DocList rrr2 = results.subset(start,limit);
@ -189,7 +189,7 @@ public class TestRequestHandler implements SolrRequestHandler {
***/
for (int i=0; i<results.size(); i++) {
test( iter.nextDoc() == hits.id(i+results.offset()) );
test( iter.nextDoc() == hits.scoreDocs[i].doc);
// Document doesn't implement equals()
// test( searcher.document(i).equals(hits.doc(i)));

View File

@ -161,29 +161,32 @@ public class SolrIndexWriter extends IndexWriter {
*
*/
public SolrIndexWriter(String name, String path, DirectoryFactory dirFactory, boolean create, IndexSchema schema) throws IOException {
super(getDirectory(path, dirFactory, null), false, schema.getAnalyzer(), create);
super(getDirectory(path, dirFactory, null), schema.getAnalyzer(), create, MaxFieldLength.LIMITED);
init(name, schema, null);
}
@Deprecated
// nocommit: remove?
public SolrIndexWriter(String name, String path, DirectoryFactory dirFactory, boolean create, IndexSchema schema, SolrIndexConfig config) throws IOException {
super(getDirectory(path, dirFactory, null), config.luceneAutoCommit, schema.getAnalyzer(), create);
super(getDirectory(path, dirFactory, null), schema.getAnalyzer(), create, MaxFieldLength.LIMITED);
init(name, schema, config);
}
/**
* @deprecated
*/
// nocommit: remove?
public SolrIndexWriter(String name, String path, boolean create, IndexSchema schema) throws IOException {
super(getDirectory(path, null), false, schema.getAnalyzer(), create);
super(getDirectory(path, null), schema.getAnalyzer(), create, MaxFieldLength.LIMITED);
init(name, schema, null);
}
/**
* @deprecated
*/
// nocommit: remove?
public SolrIndexWriter(String name, String path, boolean create, IndexSchema schema, SolrIndexConfig config) throws IOException {
super(getDirectory(path, config), config.luceneAutoCommit, schema.getAnalyzer(), create);
super(getDirectory(path, config), schema.getAnalyzer(), create, MaxFieldLength.LIMITED);
init(name, schema, config);
}
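The IndexWriter constructors taking a boolean autoCommit flag were removed in Lucene 3.0; writers are now built with a MaxFieldLength, as above. A hedged sketch, with the directory and analyzer assumed to be in scope:

    IndexWriter writer = new IndexWriter(dir, analyzer, /*create=*/true, IndexWriter.MaxFieldLength.LIMITED);
    // ... add documents ...
    writer.commit();
    writer.close();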

View File

@ -18,11 +18,13 @@
package org.apache.solr.update;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.Term;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.Fieldable;
import org.apache.lucene.search.HitCollector;
import org.apache.lucene.search.Collector;
import org.apache.lucene.search.Scorer;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
@ -135,17 +137,19 @@ public abstract class UpdateHandler implements SolrInfoMBean {
public abstract void close() throws IOException;
static class DeleteHitCollector extends HitCollector {
static class DeleteHitCollector extends Collector {
public int deleted=0;
public final SolrIndexSearcher searcher;
private int docBase;
public DeleteHitCollector(SolrIndexSearcher searcher) {
this.searcher = searcher;
}
public void collect(int doc, float score) {
@Override
public void collect(int doc) {
try {
searcher.getReader().deleteDocument(doc);
searcher.getReader().deleteDocument(doc + docBase);
deleted++;
} catch (IOException e) {
// don't try to close the searcher on failure for now...
@ -153,6 +157,21 @@ public abstract class UpdateHandler implements SolrInfoMBean {
throw new SolrException( SolrException.ErrorCode.SERVER_ERROR,"Error deleting doc# "+doc,e,false);
}
}
@Override
public boolean acceptsDocsOutOfOrder() {
return false;
}
@Override
public void setNextReader(IndexReader arg0, int docBase) throws IOException {
this.docBase = docBase;
}
@Override
public void setScorer(Scorer scorer) throws IOException {
}
}
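HitCollector was replaced by Collector, which is segment-relative: collect(int) receives a doc id local to the current reader, and setNextReader supplies the docBase needed to map it back to a top-level id, which is why the deletion collector above adds docBase before deleting. A minimal counting collector under the same 3.0 API (hypothetical class, not part of the patch):

    class CountingCollector extends Collector {
      int count;
      private int docBase;
      @Override public void setScorer(Scorer scorer) { /* scores are not needed here */ }
      @Override public void setNextReader(IndexReader reader, int docBase) { this.docBase = docBase; }
      @Override public void collect(int doc) {
        // doc is relative to the current segment; docBase + doc is the top-level doc id
        count++;
      }
      @Override public boolean acceptsDocsOutOfOrder() { return true; }
    }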

View File

@ -538,21 +538,21 @@ public class BasicFunctionalityTest extends AbstractSolrTestCase {
SchemaField f; // Solr field type
Field luf; // Lucene field
f = ischema.getField("test_hlt");
luf = f.createField("test", 0f);
assertFalse(luf.isCompressed());
assertTrue(luf.isStored());
f = ischema.getField("test_hlt");
luf = f.createField(mkstr(345), 0f);
assertTrue(luf.isCompressed());
assertTrue(luf.isStored());
f = ischema.getField("test_hlt_off");
luf = f.createField(mkstr(400), 0f);
assertFalse(luf.isCompressed());
assertTrue(luf.isStored());
// f = ischema.getField("test_hlt");
// luf = f.createField("test", 0f);
// assertFalse(luf.isCompressed());
// assertTrue(luf.isStored());
//
// f = ischema.getField("test_hlt");
// luf = f.createField(mkstr(345), 0f);
// assertTrue(luf.isCompressed());
// assertTrue(luf.isStored());
//
// f = ischema.getField("test_hlt_off");
// luf = f.createField(mkstr(400), 0f);
// assertFalse(luf.isCompressed());
// assertTrue(luf.isStored());
//
}
public void testNotLazyField() throws IOException {

View File

@ -37,7 +37,7 @@ public class TestBufferedTokenStream extends BaseTokenTestCase {
protected Token process(Token t) throws IOException {
if ("A".equals(new String(t.termBuffer(), 0, t.termLength()))) {
Token t2 = read();
if (t2!=null && "B".equals(new String(t2.termBuffer(), 0, t2.termLength()))) t.setTermText("Q");
if (t2!=null && "B".equals(new String(t2.termBuffer(), 0, t2.termLength()))) t.setTermBuffer("Q");
if (t2!=null) pushBack(t2);
}
return t;

View File

@ -16,6 +16,7 @@
*/
package org.apache.solr.core;
import java.io.File;
import java.io.IOException;
import org.apache.lucene.store.FSDirectory;
import org.apache.solr.util.AbstractSolrTestCase;
@ -44,7 +45,7 @@ public class AlternateDirectoryTest extends AbstractSolrTestCase {
public FSDirectory open(String path) throws IOException {
openCalled = true;
return FSDirectory.getDirectory(path);
return FSDirectory.open(new File(path));
}
}

View File

@ -30,11 +30,11 @@ import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriter.MaxFieldLength;
import org.apache.lucene.queryParser.ParseException;
import org.apache.lucene.queryParser.QueryParser;
import org.apache.lucene.search.Hits;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.util.Version;
import org.apache.solr.common.SolrException;
import org.apache.solr.util.AbstractSolrTestCase;
import org.apache.solr.util.TestHarness;
@ -97,11 +97,11 @@ public class TestArbitraryIndexDir extends AbstractSolrTestCase{
}
//add a doc in the new index dir
Directory dir = FSDirectory.getDirectory(newDir);
IndexWriter iw = new IndexWriter(dir, new StandardAnalyzer(), new MaxFieldLength(1000));
Directory dir = FSDirectory.open(newDir);
IndexWriter iw = new IndexWriter(dir, new StandardAnalyzer(Version.LUCENE_24), new MaxFieldLength(1000));
Document doc = new Document();
doc.add(new Field("id", "2", Field.Store.YES, Field.Index.TOKENIZED));
doc.add(new Field("name", "name2", Field.Store.YES, Field.Index.TOKENIZED));
doc.add(new Field("id", "2", Field.Store.YES, Field.Index.ANALYZED));
doc.add(new Field("name", "name2", Field.Store.YES, Field.Index.ANALYZED));
iw.addDocument(doc);
iw.commit();
iw.close();

View File

@ -158,12 +158,11 @@ public class HighlighterTest extends AbstractSolrTestCase {
TokenStream ts1 = tots.getMultiValuedTokenStream( v.length() );
Analyzer a2 = new WhitespaceAnalyzer();
TokenStream ts2 = a2.tokenStream( "", new StringReader( v ) );
Token t1 = new Token();
Token t2 = new Token();
for( t1 = ts1.next( t1 ); t1 != null; t1 = ts1.next( t1 ) ){
t2 = ts2.next( t2 );
assertEquals( t2, t1 );
while (ts1.incrementToken()) {
assertTrue(ts2.incrementToken());
assertEquals(ts1, ts2);
}
assertFalse(ts2.incrementToken());
}
}

View File

@ -25,6 +25,7 @@ import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.search.spell.JaroWinklerDistance;
import org.apache.lucene.search.spell.SpellChecker;
import org.apache.lucene.search.spell.StringDistance;
import org.apache.lucene.store.FSDirectory;
import org.apache.solr.common.util.NamedList;
import org.apache.solr.core.SolrCore;
import org.apache.solr.util.AbstractSolrTestCase;
@ -254,10 +255,10 @@ public class IndexBasedSpellCheckerTest extends AbstractSolrTestCase {
File indexDir = new File(tmpDir, "spellingIdx" + new Date().getTime());
//create a standalone index
File altIndexDir = new File(tmpDir, "alternateIdx" + new Date().getTime());
IndexWriter iw = new IndexWriter(altIndexDir, new WhitespaceAnalyzer(), IndexWriter.MaxFieldLength.LIMITED);
IndexWriter iw = new IndexWriter(FSDirectory.open(altIndexDir), new WhitespaceAnalyzer(), IndexWriter.MaxFieldLength.LIMITED);
for (int i = 0; i < ALT_DOCS.length; i++) {
Document doc = new Document();
doc.add(new Field("title", ALT_DOCS[i], Field.Store.YES, Field.Index.TOKENIZED));
doc.add(new Field("title", ALT_DOCS[i], Field.Store.YES, Field.Index.ANALYZED));
iw.addDocument(doc);
}
iw.optimize();

View File

@ -19,6 +19,12 @@ package org.apache.solr.spelling;
import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.WhitespaceAnalyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.FlagsAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.tokenattributes.PayloadAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.analysis.tokenattributes.TermAttribute;
import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
import java.util.Collection;
import java.util.HashSet;
@ -36,9 +42,24 @@ class SimpleQueryConverter extends SpellingQueryConverter{
Collection<Token> result = new HashSet<Token>();
WhitespaceAnalyzer analyzer = new WhitespaceAnalyzer();
TokenStream ts = analyzer.tokenStream("", new StringReader(origQuery));
Token tok = null;
// TODO: support custom attributes
TermAttribute termAtt = (TermAttribute) ts.addAttribute(TermAttribute.class);
OffsetAttribute offsetAtt = (OffsetAttribute) ts.addAttribute(OffsetAttribute.class);
TypeAttribute typeAtt = (TypeAttribute) ts.addAttribute(TypeAttribute.class);
FlagsAttribute flagsAtt = (FlagsAttribute) ts.addAttribute(FlagsAttribute.class);
PayloadAttribute payloadAtt = (PayloadAttribute) ts.addAttribute(PayloadAttribute.class);
PositionIncrementAttribute posIncAtt = (PositionIncrementAttribute) ts.addAttribute(PositionIncrementAttribute.class);
try {
while ((tok = ts.next()) != null){
ts.reset();
while (ts.incrementToken()){
Token tok = new Token();
tok.setTermBuffer(termAtt.termBuffer(), 0, termAtt.termLength());
tok.setOffset(offsetAtt.startOffset(), offsetAtt.endOffset());
tok.setFlags(flagsAtt.getFlags());
tok.setPayload(payloadAtt.getPayload());
tok.setPositionIncrement(posIncAtt.getPositionIncrement());
tok.setType(typeAtt.type());
result.add(tok);
}
} catch (IOException e) {

View File

@ -53,8 +53,8 @@ public class DirectUpdateHandlerOptimizeTest extends AbstractSolrTestCase {
for (int i = 0; i < 99; i++) {
// Add a valid document
cmd.doc = new Document();
cmd.doc.add(new Field("id", "id_" + i, Field.Store.YES, Field.Index.UN_TOKENIZED));
cmd.doc.add(new Field("subject", "subject_" + i, Field.Store.NO, Field.Index.TOKENIZED));
cmd.doc.add(new Field("id", "id_" + i, Field.Store.YES, Field.Index.NOT_ANALYZED));
cmd.doc.add(new Field("subject", "subject_" + i, Field.Store.NO, Field.Index.ANALYZED));
updater.addDoc(cmd);
}

View File

@ -67,16 +67,16 @@ public class DirectUpdateHandlerTest extends AbstractSolrTestCase {
// Add a valid document
cmd.doc = new Document();
cmd.doc.add( new Field( "id", "AAA", Store.YES, Index.UN_TOKENIZED ) );
cmd.doc.add( new Field( "subject", "xxxxx", Store.YES, Index.UN_TOKENIZED ) );
cmd.doc.add( new Field( "id", "AAA", Store.YES, Index.NOT_ANALYZED ) );
cmd.doc.add( new Field( "subject", "xxxxx", Store.YES, Index.NOT_ANALYZED ) );
updater.addDoc( cmd );
// Add a document with multiple ids
cmd.indexedId = null; // reset the id for this add
cmd.doc = new Document();
cmd.doc.add( new Field( "id", "AAA", Store.YES, Index.UN_TOKENIZED ) );
cmd.doc.add( new Field( "id", "BBB", Store.YES, Index.UN_TOKENIZED ) );
cmd.doc.add( new Field( "subject", "xxxxx", Store.YES, Index.UN_TOKENIZED ) );
cmd.doc.add( new Field( "id", "AAA", Store.YES, Index.NOT_ANALYZED ) );
cmd.doc.add( new Field( "id", "BBB", Store.YES, Index.NOT_ANALYZED ) );
cmd.doc.add( new Field( "subject", "xxxxx", Store.YES, Index.NOT_ANALYZED ) );
try {
updater.addDoc( cmd );
fail( "added a document with multiple ids" );
@ -86,7 +86,7 @@ public class DirectUpdateHandlerTest extends AbstractSolrTestCase {
// Add a document without an id
cmd.indexedId = null; // reset the id for this add
cmd.doc = new Document();
cmd.doc.add( new Field( "subject", "xxxxx", Store.YES, Index.UN_TOKENIZED ) );
cmd.doc.add( new Field( "subject", "xxxxx", Store.YES, Index.NOT_ANALYZED ) );
try {
updater.addDoc( cmd );
fail( "added a document without an ids" );
@ -325,7 +325,7 @@ public class DirectUpdateHandlerTest extends AbstractSolrTestCase {
// Add a document
cmd.doc = new Document();
cmd.doc.add( new Field( "id", id, Store.YES, Index.UN_TOKENIZED ) );
cmd.doc.add( new Field( "id", id, Store.YES, Index.NOT_ANALYZED ) );
updater.addDoc( cmd );
}

View File

@ -126,11 +126,12 @@ public class TestCharArrayMap extends TestCase {
int ret=0;
long start = System.currentTimeMillis();
String[] stopwords = StopAnalyzer.ENGLISH_STOP_WORDS;
Set<String> stopwords = (Set<String>) StopAnalyzer.ENGLISH_STOP_WORDS_SET;
// words = "this is a different test to see what is really going on here... I hope it works well but I'm not sure it will".split(" ");
char[][] stopwordschars = new char[stopwords.length][];
for (int i=0; i<stopwords.length; i++) {
stopwordschars[i] = stopwords[i].toCharArray();
char[][] stopwordschars = new char[stopwords.size()][];
Iterator<String> it = stopwords.iterator();
for (int i=0; i<stopwords.size(); i++) {
stopwordschars[i] = it.next().toCharArray();
}
String[] testwords = "now is the time for all good men to come to the aid of their country".split(" ");

View File

@ -57,11 +57,11 @@ public class TestOpenBitSet extends TestCase {
do {
aa = a.nextSetBit(aa+1);
if (rand.nextBoolean()) {
iterator.next();
bb = iterator.doc();
iterator.nextDoc();
bb = iterator.docID();
} else {
iterator.skipTo(bb+1);
bb = iterator.doc();
iterator.advance(bb+1);
bb = iterator.docID();
}
assertEquals(aa == -1 ? DocIdSetIterator.NO_MORE_DOCS : aa, bb);
} while (aa>=0);