merge lucene_4

commit 6ede42bc37

 pom.xml | 17

@@ -30,7 +30,7 @@
     </parent>

     <properties>
-        <lucene.version>3.6.1</lucene.version>
+        <lucene.version>4.0.0</lucene.version>
     </properties>

     <repositories>
@@ -51,7 +51,13 @@
         </dependency>
         <dependency>
             <groupId>org.apache.lucene</groupId>
-            <artifactId>lucene-analyzers</artifactId>
+            <artifactId>lucene-analyzers-common</artifactId>
             <version>${lucene.version}</version>
             <scope>compile</scope>
         </dependency>
+        <dependency>
+            <groupId>org.apache.lucene</groupId>
+            <artifactId>lucene-codecs</artifactId>
+            <version>${lucene.version}</version>
+            <scope>compile</scope>
+        </dependency>
@@ -79,6 +85,13 @@
             <version>${lucene.version}</version>
             <scope>compile</scope>
         </dependency>
+        <dependency>
+            <groupId>org.apache.lucene</groupId>
+            <artifactId>lucene-queryparser</artifactId>
+            <version>${lucene.version}</version>
+            <scope>compile</scope>
+        </dependency>
+

         <!-- START: dependencies that are shaded -->
         <dependency>
@@ -0,0 +1,85 @@
/*
 * Licensed to ElasticSearch and Shay Banon under one
 * or more contributor license agreements. See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership. ElasticSearch licenses this
 * file to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License. You may obtain a copy of the License at
 *
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied. See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */

package org.apache.lucene.analysis;

import java.io.Reader;

/**
 * Extension to {@link Analyzer} suitable for Analyzers which wrap
 * other Analyzers.
 * <p/>
 * {@link #getWrappedAnalyzer(String)} allows the Analyzer
 * to wrap multiple Analyzers which are selected on a per-field basis.
 * <p/>
 * {@link #wrapComponents(String, Analyzer.TokenStreamComponents)} allows the
 * TokenStreamComponents of the wrapped Analyzer to then be wrapped
 * (such as adding a new {@link TokenFilter}) to form new TokenStreamComponents.
 */
public abstract class CustomAnalyzerWrapper extends Analyzer {

    /**
     * Creates a new CustomAnalyzerWrapper. Since the {@link Analyzer.ReuseStrategy} of
     * the wrapped Analyzers is unknown, {@link Analyzer.PerFieldReuseStrategy} is assumed.
     */
    protected CustomAnalyzerWrapper() {
        super(new PerFieldReuseStrategy());
    }

    /**
     * Retrieves the wrapped Analyzer appropriate for analyzing the field with
     * the given name.
     *
     * @param fieldName Name of the field which is to be analyzed
     * @return Analyzer for the field with the given name. Assumed to be non-null.
     */
    protected abstract Analyzer getWrappedAnalyzer(String fieldName);

    /**
     * Wraps / alters the given TokenStreamComponents, taken from the wrapped
     * Analyzer, to form new components. It is through this method that new
     * TokenFilters can be added by AnalyzerWrappers.
     *
     * @param fieldName  Name of the field which is to be analyzed
     * @param components TokenStreamComponents taken from the wrapped Analyzer
     * @return Wrapped / altered TokenStreamComponents.
     */
    protected abstract TokenStreamComponents wrapComponents(String fieldName, TokenStreamComponents components);

    @Override
    protected final TokenStreamComponents createComponents(String fieldName, Reader aReader) {
        return wrapComponents(fieldName, getWrappedAnalyzer(fieldName).createComponents(fieldName, aReader));
    }

    @Override
    public int getPositionIncrementGap(String fieldName) {
        return getWrappedAnalyzer(fieldName).getPositionIncrementGap(fieldName);
    }

    @Override
    public int getOffsetGap(String fieldName) {
        return getWrappedAnalyzer(fieldName).getOffsetGap(fieldName);
    }

    @Override
    public final Reader initReader(String fieldName, Reader reader) {
        return getWrappedAnalyzer(fieldName).initReader(fieldName, reader);
    }
}
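Below is a minimal usage sketch, not part of this commit: a subclass that picks a delegate Analyzer per field and returns the delegate's components unchanged. The field name "raw", the delegate analyzers, Version.LUCENE_40, and the package placement are illustrative assumptions.

package org.apache.lucene.analysis; // placed here only for the example

import org.apache.lucene.analysis.core.KeywordAnalyzer;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.util.Version;

public class PerFieldWrapper extends CustomAnalyzerWrapper {

    private final Analyzer defaultAnalyzer = new StandardAnalyzer(Version.LUCENE_40);
    private final Analyzer rawAnalyzer = new KeywordAnalyzer();

    @Override
    protected Analyzer getWrappedAnalyzer(String fieldName) {
        // "raw" is an arbitrary example field name
        return "raw".equals(fieldName) ? rawAnalyzer : defaultAnalyzer;
    }

    @Override
    protected TokenStreamComponents wrapComponents(String fieldName, TokenStreamComponents components) {
        // nothing extra to add here; a richer wrapper could append TokenFilters
        // to components before returning new TokenStreamComponents
        return components;
    }
}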
@@ -1,63 +0,0 @@
package org.apache.lucene.analysis.miscellaneous;

import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;

import java.io.IOException;

/**
 */
// LUCENE MONITOR: Next version of Lucene (4.0) will have this as part of the analyzers module
public final class TrimFilter extends TokenFilter {

    final boolean updateOffsets;
    private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
    private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);

    public TrimFilter(TokenStream in, boolean updateOffsets) {
        super(in);
        this.updateOffsets = updateOffsets;
    }

    @Override
    public boolean incrementToken() throws IOException {
        if (!input.incrementToken()) return false;

        char[] termBuffer = termAtt.buffer();
        int len = termAtt.length();
        //TODO: Is this the right behavior or should we return false? Currently, " ", returns true, so I think this should
        //also return true
        if (len == 0) {
            return true;
        }
        int start = 0;
        int end = 0;
        int endOff = 0;

        // eat the first characters
        //QUESTION: Should we use Character.isWhitespace() instead?
        for (start = 0; start < len && termBuffer[start] <= ' '; start++) {
        }
        // eat the end characters
        for (end = len; end >= start && termBuffer[end - 1] <= ' '; end--) {
            endOff++;
        }
        if (start > 0 || end < len) {
            if (start < end) {
                termAtt.copyBuffer(termBuffer, start, (end - start));
            } else {
                termAtt.setEmpty();
            }
            if (updateOffsets) {
                int newStart = offsetAtt.startOffset() + start;
                int newEnd = offsetAtt.endOffset() - (start < end ? endOff : 0);
                offsetAtt.setOffset(newStart, newEnd);
            }
        }

        return true;
    }
}
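A minimal sketch of how this removed copy was typically assembled, not part of the commit. It assumes the Lucene 3.6 KeywordTokenizer (the pre-upgrade dependency shown in the pom) and an arbitrary input string; KeywordTokenizer emits the whole input as one token, so the surrounding whitespace survives until TrimFilter removes it.

import org.apache.lucene.analysis.KeywordTokenizer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.miscellaneous.TrimFilter;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

import java.io.StringReader;

public class TrimFilterExample {
    public static void main(String[] args) throws Exception {
        TokenStream stream = new TrimFilter(
                new KeywordTokenizer(new StringReader("  hello world  ")), true);
        CharTermAttribute term = stream.addAttribute(CharTermAttribute.class);
        stream.reset();
        while (stream.incrementToken()) {
            // prints [hello world] with the leading/trailing blanks trimmed
            System.out.println("[" + term.toString() + "]");
        }
        stream.end();
        stream.close();
    }
}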
@@ -19,11 +19,11 @@

package org.apache.lucene.analysis.miscellaneous;

import org.apache.lucene.analysis.CharArraySet;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.analysis.util.CharArraySet;
import org.apache.lucene.util.Version;

import java.io.IOException;
@@ -1,574 +0,0 @@
|
|||
/*
|
||||
* Licensed to ElasticSearch and Shay Banon under one
|
||||
* or more contributor license agreements. See the NOTICE file
|
||||
* distributed with this work for additional information
|
||||
* regarding copyright ownership. ElasticSearch licenses this
|
||||
* file to you under the Apache License, Version 2.0 (the
|
||||
* "License"); you may not use this file except in compliance
|
||||
* with the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing,
|
||||
* software distributed under the License is distributed on an
|
||||
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
* KIND, either express or implied. See the License for the
|
||||
* specific language governing permissions and limitations
|
||||
* under the License.
|
||||
*/
|
||||
|
||||
package org.apache.lucene.analysis.miscellaneous;
|
||||
|
||||
import org.apache.lucene.analysis.CharArraySet;
|
||||
import org.apache.lucene.analysis.TokenFilter;
|
||||
import org.apache.lucene.analysis.TokenStream;
|
||||
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
|
||||
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
|
||||
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
|
||||
import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
|
||||
import org.apache.lucene.util.ArrayUtil;
|
||||
import org.apache.lucene.util.RamUsageEstimator;
|
||||
|
||||
import java.io.IOException;
|
||||
|
||||
/**
|
||||
* Splits words into subwords and performs optional transformations on subword groups.
|
||||
* Words are split into subwords with the following rules:
|
||||
* - split on intra-word delimiters (by default, all non alpha-numeric characters).
|
||||
* - "Wi-Fi" -> "Wi", "Fi"
|
||||
* - split on case transitions
|
||||
* - "PowerShot" -> "Power", "Shot"
|
||||
* - split on letter-number transitions
|
||||
* - "SD500" -> "SD", "500"
|
||||
* - leading and trailing intra-word delimiters on each subword are ignored
|
||||
* - "//hello---there, 'dude'" -> "hello", "there", "dude"
|
||||
* - trailing "'s" are removed for each subword
|
||||
* - "O'Neil's" -> "O", "Neil"
|
||||
* - Note: this step isn't performed in a separate filter because of possible subword combinations.
|
||||
* <p/>
|
||||
* The <b>combinations</b> parameter affects how subwords are combined:
|
||||
* - combinations="0" causes no subword combinations.
|
||||
* - "PowerShot" -> 0:"Power", 1:"Shot" (0 and 1 are the token positions)
|
||||
* - combinations="1" means that in addition to the subwords, maximum runs of non-numeric subwords are catenated and produced at the same position of the last subword in the run.
|
||||
* - "PowerShot" -> 0:"Power", 1:"Shot" 1:"PowerShot"
|
||||
* - "A's+B's&C's" -> 0:"A", 1:"B", 2:"C", 2:"ABC"
|
||||
* - "Super-Duper-XL500-42-AutoCoder!" -> 0:"Super", 1:"Duper", 2:"XL", 2:"SuperDuperXL", 3:"500" 4:"42", 5:"Auto", 6:"Coder", 6:"AutoCoder"
|
||||
* <p/>
|
||||
* One use for WordDelimiterFilter is to help match words with different subword delimiters.
|
||||
* For example, if the source text contained "wi-fi" one may want "wifi" "WiFi" "wi-fi" "wi+fi" queries to all match.
|
||||
* One way of doing so is to specify combinations="1" in the analyzer used for indexing, and combinations="0" (the default)
|
||||
* in the analyzer used for querying. Given that the current StandardTokenizer immediately removes many intra-word
|
||||
* delimiters, it is recommended that this filter be used after a tokenizer that does not do this (such as WhitespaceTokenizer).
|
||||
*/
|
||||
// LUCENE MONITOR: Part of Lucene 4.0, once we upgrade, remove it
|
||||
public final class WordDelimiterFilter extends TokenFilter {
|
||||
|
||||
public static final int LOWER = 0x01;
|
||||
public static final int UPPER = 0x02;
|
||||
public static final int DIGIT = 0x04;
|
||||
public static final int SUBWORD_DELIM = 0x08;
|
||||
|
||||
// combinations: for testing, not for setting bits
|
||||
public static final int ALPHA = 0x03;
|
||||
public static final int ALPHANUM = 0x07;
|
||||
|
||||
/**
|
||||
* If true, causes parts of words to be generated:
|
||||
* <p/>
|
||||
* "PowerShot" => "Power" "Shot"
|
||||
*/
|
||||
final boolean generateWordParts;
|
||||
|
||||
/**
|
||||
* If true, causes number subwords to be generated:
|
||||
* <p/>
|
||||
* "500-42" => "500" "42"
|
||||
*/
|
||||
final boolean generateNumberParts;
|
||||
|
||||
/**
|
||||
* If true, causes maximum runs of word parts to be catenated:
|
||||
* <p/>
|
||||
* "wi-fi" => "wifi"
|
||||
*/
|
||||
final boolean catenateWords;
|
||||
|
||||
/**
|
||||
* If true, causes maximum runs of number parts to be catenated:
|
||||
* <p/>
|
||||
* "500-42" => "50042"
|
||||
*/
|
||||
final boolean catenateNumbers;
|
||||
|
||||
/**
|
||||
* If true, causes all subword parts to be catenated:
|
||||
* <p/>
|
||||
* "wi-fi-4000" => "wifi4000"
|
||||
*/
|
||||
final boolean catenateAll;
|
||||
|
||||
/**
|
||||
* If true, original words are preserved and added to the subword list (Defaults to false)
|
||||
* <p/>
|
||||
* "500-42" => "500" "42" "500-42"
|
||||
*/
|
||||
final boolean preserveOriginal;
|
||||
|
||||
/**
|
||||
* If not null is the set of tokens to protect from being delimited
|
||||
*/
|
||||
final CharArraySet protWords;
|
||||
|
||||
private final CharTermAttribute termAttribute = addAttribute(CharTermAttribute.class);
|
||||
private final OffsetAttribute offsetAttribute = addAttribute(OffsetAttribute.class);
|
||||
private final PositionIncrementAttribute posIncAttribute = addAttribute(PositionIncrementAttribute.class);
|
||||
private final TypeAttribute typeAttribute = addAttribute(TypeAttribute.class);
|
||||
|
||||
// used for iterating word delimiter breaks
|
||||
private final WordDelimiterIterator iterator;
|
||||
|
||||
// used for concatenating runs of similar typed subwords (word,number)
|
||||
private final WordDelimiterConcatenation concat = new WordDelimiterConcatenation();
|
||||
// number of subwords last output by concat.
|
||||
private int lastConcatCount = 0;
|
||||
|
||||
// used for catenate all
|
||||
private final WordDelimiterConcatenation concatAll = new WordDelimiterConcatenation();
|
||||
|
||||
// used for accumulating position increment gaps
|
||||
private int accumPosInc = 0;
|
||||
|
||||
private char savedBuffer[] = new char[1024];
|
||||
private int savedStartOffset;
|
||||
private int savedEndOffset;
|
||||
private String savedType;
|
||||
private boolean hasSavedState = false;
|
||||
// if the length implied by the start and end offsets doesn't match the term text,
// assume this is a synonym and don't adjust the offsets.
|
||||
private boolean hasIllegalOffsets = false;
|
||||
|
||||
// for a run of the same subword type within a word, have we output anything?
|
||||
private boolean hasOutputToken = false;
|
||||
// when preserve original is on, have we output any token following it?
|
||||
// this token must have posInc=0!
|
||||
private boolean hasOutputFollowingOriginal = false;
|
||||
|
||||
/**
|
||||
* @param in Token stream to be filtered.
|
||||
* @param charTypeTable         table containing character types
|
||||
* @param generateWordParts If 1, causes parts of words to be generated: "PowerShot" => "Power" "Shot"
|
||||
* @param generateNumberParts If 1, causes number subwords to be generated: "500-42" => "500" "42"
|
||||
* @param catenateWords         If 1, causes maximum runs of word parts to be catenated: "wi-fi" => "wifi"
|
||||
* @param catenateNumbers If 1, causes maximum runs of number parts to be catenated: "500-42" => "50042"
|
||||
* @param catenateAll If 1, causes all subword parts to be catenated: "wi-fi-4000" => "wifi4000"
|
||||
* @param splitOnCaseChange     If 1, causes "PowerShot" to be two tokens ("Power-Shot" remains two parts regardless)
|
||||
* @param preserveOriginal If 1, includes original words in subwords: "500-42" => "500" "42" "500-42"
|
||||
* @param splitOnNumerics       If 1, causes "j2se" to be three tokens: "j" "2" "se"
|
||||
* @param stemEnglishPossessive If 1, causes trailing "'s" to be removed for each subword: "O'Neil's" => "O", "Neil"
|
||||
* @param protWords If not null is the set of tokens to protect from being delimited
|
||||
*/
|
||||
public WordDelimiterFilter(TokenStream in,
|
||||
byte[] charTypeTable,
|
||||
int generateWordParts,
|
||||
int generateNumberParts,
|
||||
int catenateWords,
|
||||
int catenateNumbers,
|
||||
int catenateAll,
|
||||
int splitOnCaseChange,
|
||||
int preserveOriginal,
|
||||
int splitOnNumerics,
|
||||
int stemEnglishPossessive,
|
||||
CharArraySet protWords) {
|
||||
super(in);
|
||||
this.generateWordParts = generateWordParts != 0;
|
||||
this.generateNumberParts = generateNumberParts != 0;
|
||||
this.catenateWords = catenateWords != 0;
|
||||
this.catenateNumbers = catenateNumbers != 0;
|
||||
this.catenateAll = catenateAll != 0;
|
||||
this.preserveOriginal = preserveOriginal != 0;
|
||||
this.protWords = protWords;
|
||||
this.iterator = new WordDelimiterIterator(charTypeTable, splitOnCaseChange != 0, splitOnNumerics != 0, stemEnglishPossessive != 0);
|
||||
}
|
||||
|
||||
/**
|
||||
* @param in Token stream to be filtered.
|
||||
* @param generateWordParts If 1, causes parts of words to be generated: "PowerShot", "Power-Shot" => "Power" "Shot"
|
||||
* @param generateNumberParts If 1, causes number subwords to be generated: "500-42" => "500" "42"
|
||||
* @param catenateWords         If 1, causes maximum runs of word parts to be catenated: "wi-fi" => "wifi"
|
||||
* @param catenateNumbers If 1, causes maximum runs of number parts to be catenated: "500-42" => "50042"
|
||||
* @param catenateAll If 1, causes all subword parts to be catenated: "wi-fi-4000" => "wifi4000"
|
||||
* @param splitOnCaseChange     If 1, causes "PowerShot" to be two tokens ("Power-Shot" remains two parts regardless)
|
||||
* @param preserveOriginal If 1, includes original words in subwords: "500-42" => "500" "42" "500-42"
|
||||
* @param splitOnNumerics       If 1, causes "j2se" to be three tokens: "j" "2" "se"
|
||||
* @param stemEnglishPossessive If 1, causes trailing "'s" to be removed for each subword: "O'Neil's" => "O", "Neil"
|
||||
* @param protWords If not null is the set of tokens to protect from being delimited
|
||||
*/
|
||||
public WordDelimiterFilter(TokenStream in,
|
||||
int generateWordParts,
|
||||
int generateNumberParts,
|
||||
int catenateWords,
|
||||
int catenateNumbers,
|
||||
int catenateAll,
|
||||
int splitOnCaseChange,
|
||||
int preserveOriginal,
|
||||
int splitOnNumerics,
|
||||
int stemEnglishPossessive,
|
||||
CharArraySet protWords) {
|
||||
this(in, WordDelimiterIterator.DEFAULT_WORD_DELIM_TABLE, generateWordParts, generateNumberParts, catenateWords, catenateNumbers, catenateAll, splitOnCaseChange, preserveOriginal, splitOnNumerics, stemEnglishPossessive, protWords);
|
||||
}
|
||||
|
||||
public boolean incrementToken() throws IOException {
|
||||
while (true) {
|
||||
if (!hasSavedState) {
|
||||
// process a new input word
|
||||
if (!input.incrementToken()) {
|
||||
return false;
|
||||
}
|
||||
|
||||
int termLength = termAttribute.length();
|
||||
char[] termBuffer = termAttribute.buffer();
|
||||
|
||||
accumPosInc += posIncAttribute.getPositionIncrement();
|
||||
|
||||
iterator.setText(termBuffer, termLength);
|
||||
iterator.next();
|
||||
|
||||
// word of no delimiters, or protected word: just return it
|
||||
if ((iterator.current == 0 && iterator.end == termLength) ||
|
||||
(protWords != null && protWords.contains(termBuffer, 0, termLength))) {
|
||||
posIncAttribute.setPositionIncrement(accumPosInc);
|
||||
accumPosInc = 0;
|
||||
return true;
|
||||
}
|
||||
|
||||
// word consisting only of delimiters
|
||||
if (iterator.end == WordDelimiterIterator.DONE && !preserveOriginal) {
|
||||
// if the posInc is 1, simply ignore it in the accumulation
|
||||
if (posIncAttribute.getPositionIncrement() == 1) {
|
||||
accumPosInc--;
|
||||
}
|
||||
continue;
|
||||
}
|
||||
|
||||
saveState();
|
||||
|
||||
hasOutputToken = false;
|
||||
hasOutputFollowingOriginal = !preserveOriginal;
|
||||
lastConcatCount = 0;
|
||||
|
||||
if (preserveOriginal) {
|
||||
posIncAttribute.setPositionIncrement(accumPosInc);
|
||||
accumPosInc = 0;
|
||||
return true;
|
||||
}
|
||||
}
|
||||
|
||||
// at the end of the string, output any concatenations
|
||||
if (iterator.end == WordDelimiterIterator.DONE) {
|
||||
if (!concat.isEmpty()) {
|
||||
if (flushConcatenation(concat)) {
|
||||
return true;
|
||||
}
|
||||
}
|
||||
|
||||
if (!concatAll.isEmpty()) {
|
||||
// only if we haven't output this same combo above!
|
||||
if (concatAll.subwordCount > lastConcatCount) {
|
||||
concatAll.writeAndClear();
|
||||
return true;
|
||||
}
|
||||
concatAll.clear();
|
||||
}
|
||||
|
||||
// no saved concatenations, on to the next input word
|
||||
hasSavedState = false;
|
||||
continue;
|
||||
}
|
||||
|
||||
// word surrounded by delimiters: always output
|
||||
if (iterator.isSingleWord()) {
|
||||
generatePart(true);
|
||||
iterator.next();
|
||||
return true;
|
||||
}
|
||||
|
||||
int wordType = iterator.type();
|
||||
|
||||
// do we already have queued up incompatible concatenations?
|
||||
if (!concat.isEmpty() && (concat.type & wordType) == 0) {
|
||||
if (flushConcatenation(concat)) {
|
||||
hasOutputToken = false;
|
||||
return true;
|
||||
}
|
||||
hasOutputToken = false;
|
||||
}
|
||||
|
||||
// add subwords depending upon options
|
||||
if (shouldConcatenate(wordType)) {
|
||||
if (concat.isEmpty()) {
|
||||
concat.type = wordType;
|
||||
}
|
||||
concatenate(concat);
|
||||
}
|
||||
|
||||
// add all subwords (catenateAll)
|
||||
if (catenateAll) {
|
||||
concatenate(concatAll);
|
||||
}
|
||||
|
||||
// if we should output the word or number part
|
||||
if (shouldGenerateParts(wordType)) {
|
||||
generatePart(false);
|
||||
iterator.next();
|
||||
return true;
|
||||
}
|
||||
|
||||
iterator.next();
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* {@inheritDoc}
|
||||
*/
|
||||
@Override
|
||||
public void reset() throws IOException {
|
||||
super.reset();
|
||||
hasSavedState = false;
|
||||
concat.clear();
|
||||
concatAll.clear();
|
||||
accumPosInc = 0;
|
||||
}
|
||||
|
||||
// ================================================= Helper Methods ================================================
|
||||
|
||||
/**
|
||||
* Saves the existing attribute states
|
||||
*/
|
||||
private void saveState() {
|
||||
// otherwise, we have delimiters, save state
|
||||
savedStartOffset = offsetAttribute.startOffset();
|
||||
savedEndOffset = offsetAttribute.endOffset();
|
||||
// if the length implied by the start and end offsets doesn't match the term text, assume this is a synonym and don't adjust the offsets.
|
||||
hasIllegalOffsets = (savedEndOffset - savedStartOffset != termAttribute.length());
|
||||
savedType = typeAttribute.type();
|
||||
|
||||
if (savedBuffer.length < termAttribute.length()) {
|
||||
savedBuffer = new char[ArrayUtil.oversize(termAttribute.length(), RamUsageEstimator.NUM_BYTES_CHAR)];
|
||||
}
|
||||
|
||||
System.arraycopy(termAttribute.buffer(), 0, savedBuffer, 0, termAttribute.length());
|
||||
iterator.text = savedBuffer;
|
||||
|
||||
hasSavedState = true;
|
||||
}
|
||||
|
||||
/**
|
||||
* Flushes the given WordDelimiterConcatenation by either writing its concat and then clearing, or just clearing.
|
||||
*
|
||||
* @param concatenation WordDelimiterConcatenation that will be flushed
|
||||
* @return {@code true} if the concatenation was written before it was cleared, {@code false} otherwise
|
||||
*/
|
||||
private boolean flushConcatenation(WordDelimiterConcatenation concatenation) {
|
||||
lastConcatCount = concatenation.subwordCount;
|
||||
if (concatenation.subwordCount != 1 || !shouldGenerateParts(concatenation.type)) {
|
||||
concatenation.writeAndClear();
|
||||
return true;
|
||||
}
|
||||
concatenation.clear();
|
||||
return false;
|
||||
}
|
||||
|
||||
/**
|
||||
* Determines whether to concatenate a word or number if the current word is the given type
|
||||
*
|
||||
* @param wordType Type of the current word used to determine if it should be concatenated
|
||||
* @return {@code true} if concatenation should occur, {@code false} otherwise
|
||||
*/
|
||||
private boolean shouldConcatenate(int wordType) {
|
||||
return (catenateWords && isAlpha(wordType)) || (catenateNumbers && isDigit(wordType));
|
||||
}
|
||||
|
||||
/**
|
||||
* Determines whether a word/number part should be generated for a word of the given type
|
||||
*
|
||||
* @param wordType Type of the word used to determine if a word/number part should be generated
|
||||
* @return {@code true} if a word/number part should be generated, {@code false} otherwise
|
||||
*/
|
||||
private boolean shouldGenerateParts(int wordType) {
|
||||
return (generateWordParts && isAlpha(wordType)) || (generateNumberParts && isDigit(wordType));
|
||||
}
|
||||
|
||||
/**
|
||||
* Concatenates the saved buffer to the given WordDelimiterConcatenation
|
||||
*
|
||||
* @param concatenation WordDelimiterConcatenation to concatenate the buffer to
|
||||
*/
|
||||
private void concatenate(WordDelimiterConcatenation concatenation) {
|
||||
if (concatenation.isEmpty()) {
|
||||
concatenation.startOffset = savedStartOffset + iterator.current;
|
||||
}
|
||||
concatenation.append(savedBuffer, iterator.current, iterator.end - iterator.current);
|
||||
concatenation.endOffset = savedStartOffset + iterator.end;
|
||||
}
|
||||
|
||||
/**
|
||||
* Generates a word/number part, updating the appropriate attributes
|
||||
*
|
||||
* @param isSingleWord {@code true} if the generation is occurring from a single word, {@code false} otherwise
|
||||
*/
|
||||
private void generatePart(boolean isSingleWord) {
|
||||
clearAttributes();
|
||||
termAttribute.copyBuffer(savedBuffer, iterator.current, iterator.end - iterator.current);
|
||||
|
||||
int startOffSet = (isSingleWord || !hasIllegalOffsets) ? savedStartOffset + iterator.current : savedStartOffset;
|
||||
int endOffSet = (hasIllegalOffsets) ? savedEndOffset : savedStartOffset + iterator.end;
|
||||
|
||||
offsetAttribute.setOffset(startOffSet, endOffSet);
|
||||
posIncAttribute.setPositionIncrement(position(false));
|
||||
typeAttribute.setType(savedType);
|
||||
}
|
||||
|
||||
/**
|
||||
* Get the position increment gap for a subword or concatenation
|
||||
*
|
||||
* @param inject true if this token wants to be injected
|
||||
* @return position increment gap
|
||||
*/
|
||||
private int position(boolean inject) {
|
||||
int posInc = accumPosInc;
|
||||
|
||||
if (hasOutputToken) {
|
||||
accumPosInc = 0;
|
||||
return inject ? 0 : Math.max(1, posInc);
|
||||
}
|
||||
|
||||
hasOutputToken = true;
|
||||
|
||||
if (!hasOutputFollowingOriginal) {
|
||||
// the first token following the original is 0 regardless
|
||||
hasOutputFollowingOriginal = true;
|
||||
return 0;
|
||||
}
|
||||
// clear the accumulated position increment
|
||||
accumPosInc = 0;
|
||||
return Math.max(1, posInc);
|
||||
}
|
||||
|
||||
/**
|
||||
* Checks if the given word type includes {@link #ALPHA}
|
||||
*
|
||||
* @param type Word type to check
|
||||
* @return {@code true} if the type contains ALPHA, {@code false} otherwise
|
||||
*/
|
||||
static boolean isAlpha(int type) {
|
||||
return (type & ALPHA) != 0;
|
||||
}
|
||||
|
||||
/**
|
||||
* Checks if the given word type includes {@link #DIGIT}
|
||||
*
|
||||
* @param type Word type to check
|
||||
* @return {@code true} if the type contains DIGIT, {@code false} otherwise
|
||||
*/
|
||||
static boolean isDigit(int type) {
|
||||
return (type & DIGIT) != 0;
|
||||
}
|
||||
|
||||
/**
|
||||
* Checks if the given word type includes {@link #SUBWORD_DELIM}
|
||||
*
|
||||
* @param type Word type to check
|
||||
* @return {@code true} if the type contains SUBWORD_DELIM, {@code false} otherwise
|
||||
*/
|
||||
static boolean isSubwordDelim(int type) {
|
||||
return (type & SUBWORD_DELIM) != 0;
|
||||
}
|
||||
|
||||
/**
|
||||
* Checks if the given word type includes {@link #UPPER}
|
||||
*
|
||||
* @param type Word type to check
|
||||
* @return {@code true} if the type contains UPPER, {@code false} otherwise
|
||||
*/
|
||||
static boolean isUpper(int type) {
|
||||
return (type & UPPER) != 0;
|
||||
}
|
||||
|
||||
// ================================================= Inner Classes =================================================
|
||||
|
||||
/**
|
||||
* A WDF concatenated 'run'
|
||||
*/
|
||||
final class WordDelimiterConcatenation {
|
||||
final StringBuilder buffer = new StringBuilder();
|
||||
int startOffset;
|
||||
int endOffset;
|
||||
int type;
|
||||
int subwordCount;
|
||||
|
||||
/**
|
||||
* Appends the given text of the given length to the concatenation at the given offset
|
||||
*
|
||||
* @param text Text to append
|
||||
* @param offset Offset in the concatenation at which to add the text
|
||||
* @param length Length of the text to append
|
||||
*/
|
||||
void append(char text[], int offset, int length) {
|
||||
buffer.append(text, offset, length);
|
||||
subwordCount++;
|
||||
}
|
||||
|
||||
/**
|
||||
* Writes the concatenation to the attributes
|
||||
*/
|
||||
void write() {
|
||||
clearAttributes();
|
||||
if (termAttribute.length() < buffer.length()) {
|
||||
termAttribute.resizeBuffer(buffer.length());
|
||||
}
|
||||
char termbuffer[] = termAttribute.buffer();
|
||||
|
||||
buffer.getChars(0, buffer.length(), termbuffer, 0);
|
||||
termAttribute.setLength(buffer.length());
|
||||
|
||||
if (hasIllegalOffsets) {
|
||||
offsetAttribute.setOffset(savedStartOffset, savedEndOffset);
|
||||
} else {
|
||||
offsetAttribute.setOffset(startOffset, endOffset);
|
||||
}
|
||||
posIncAttribute.setPositionIncrement(position(true));
|
||||
typeAttribute.setType(savedType);
|
||||
accumPosInc = 0;
|
||||
}
|
||||
|
||||
/**
|
||||
* Determines if the concatenation is empty
|
||||
*
|
||||
* @return {@code true} if the concatenation is empty, {@code false} otherwise
|
||||
*/
|
||||
boolean isEmpty() {
|
||||
return buffer.length() == 0;
|
||||
}
|
||||
|
||||
/**
|
||||
* Clears the concatenation and resets its state
|
||||
*/
|
||||
void clear() {
|
||||
buffer.setLength(0);
|
||||
startOffset = endOffset = type = subwordCount = 0;
|
||||
}
|
||||
|
||||
/**
|
||||
* Convenience method for the common scenario of having to write the concatenation and then clearing its state
|
||||
*/
|
||||
void writeAndClear() {
|
||||
write();
|
||||
clear();
|
||||
}
|
||||
}
|
||||
// questions:
|
||||
// negative numbers? -42 indexed as just 42?
|
||||
// dollar sign? $42
|
||||
// percent sign? 33%
|
||||
// downsides: if source text is "powershot" then a query of "PowerShot" won't match!
|
||||
}
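A minimal sketch of how this removed copy was typically wired up, not part of the commit. The int flags mirror the constructor documented above; the Lucene 3.6 WhitespaceTokenizer, Version constant, and input text are illustrative assumptions (the javadoc itself recommends a tokenizer that keeps intra-word delimiters, such as WhitespaceTokenizer).

import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.WhitespaceTokenizer;
import org.apache.lucene.analysis.miscellaneous.WordDelimiterFilter;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.util.Version;

import java.io.StringReader;

public class WordDelimiterFilterExample {
    public static void main(String[] args) throws Exception {
        TokenStream stream = new WordDelimiterFilter(
                new WhitespaceTokenizer(Version.LUCENE_36, new StringReader("PowerShot SD500 wi-fi")),
                1,     // generateWordParts
                1,     // generateNumberParts
                1,     // catenateWords     -> also emits "wifi" for "wi-fi"
                0,     // catenateNumbers
                0,     // catenateAll
                1,     // splitOnCaseChange -> "PowerShot" becomes "Power", "Shot"
                0,     // preserveOriginal
                1,     // splitOnNumerics   -> "SD500" becomes "SD", "500"
                1,     // stemEnglishPossessive
                null); // protWords: no protected terms
        CharTermAttribute term = stream.addAttribute(CharTermAttribute.class);
        stream.reset();
        while (stream.incrementToken()) {
            System.out.println(term.toString());
        }
        stream.end();
        stream.close();
    }
}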
|
|
@@ -1,341 +0,0 @@
|
|||
/*
|
||||
* Licensed to ElasticSearch and Shay Banon under one
|
||||
* or more contributor license agreements. See the NOTICE file
|
||||
* distributed with this work for additional information
|
||||
* regarding copyright ownership. ElasticSearch licenses this
|
||||
* file to you under the Apache License, Version 2.0 (the
|
||||
* "License"); you may not use this file except in compliance
|
||||
* with the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing,
|
||||
* software distributed under the License is distributed on an
|
||||
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
* KIND, either express or implied. See the License for the
|
||||
* specific language governing permissions and limitations
|
||||
* under the License.
|
||||
*/
|
||||
|
||||
package org.apache.lucene.analysis.miscellaneous;
|
||||
|
||||
import static org.apache.lucene.analysis.miscellaneous.WordDelimiterFilter.*;
|
||||
|
||||
/**
|
||||
* A BreakIterator-like API for iterating over subwords in text, according to WordDelimiterFilter rules.
|
||||
*
|
||||
* @lucene.internal
|
||||
*/
|
||||
public final class WordDelimiterIterator {
|
||||
|
||||
/**
|
||||
* Indicates the end of iteration
|
||||
*/
|
||||
public static final int DONE = -1;
|
||||
|
||||
public static final byte[] DEFAULT_WORD_DELIM_TABLE;
|
||||
|
||||
char text[];
|
||||
int length;
|
||||
|
||||
/**
|
||||
* start position of text, excluding leading delimiters
|
||||
*/
|
||||
int startBounds;
|
||||
/**
|
||||
* end position of text, excluding trailing delimiters
|
||||
*/
|
||||
int endBounds;
|
||||
|
||||
/**
|
||||
* Beginning of subword
|
||||
*/
|
||||
int current;
|
||||
/**
|
||||
* End of subword
|
||||
*/
|
||||
int end;
|
||||
|
||||
/* does this string end with a possessive such as 's */
|
||||
private boolean hasFinalPossessive = false;
|
||||
|
||||
/**
|
||||
* If false, causes case changes to be ignored (subwords will only be generated
|
||||
* given SUBWORD_DELIM tokens). (Defaults to true)
|
||||
*/
|
||||
final boolean splitOnCaseChange;
|
||||
|
||||
/**
|
||||
* If false, causes numeric changes to be ignored (subwords will only be generated
|
||||
* given SUBWORD_DELIM tokens). (Defaults to true)
|
||||
*/
|
||||
final boolean splitOnNumerics;
|
||||
|
||||
/**
|
||||
* If true, causes trailing "'s" to be removed for each subword. (Defaults to true)
|
||||
* <p/>
|
||||
* "O'Neil's" => "O", "Neil"
|
||||
*/
|
||||
final boolean stemEnglishPossessive;
|
||||
|
||||
private final byte[] charTypeTable;
|
||||
|
||||
/**
|
||||
* if true, need to skip over a possessive found in the last call to next()
|
||||
*/
|
||||
private boolean skipPossessive = false;
|
||||
|
||||
// TODO: should there be a WORD_DELIM category for chars that only separate words (no catenation of subwords will be
|
||||
// done if separated by these chars?) "," would be an obvious candidate...
|
||||
static {
|
||||
byte[] tab = new byte[256];
|
||||
for (int i = 0; i < 256; i++) {
|
||||
byte code = 0;
|
||||
if (Character.isLowerCase(i)) {
|
||||
code |= LOWER;
|
||||
} else if (Character.isUpperCase(i)) {
|
||||
code |= UPPER;
|
||||
} else if (Character.isDigit(i)) {
|
||||
code |= DIGIT;
|
||||
}
|
||||
if (code == 0) {
|
||||
code = SUBWORD_DELIM;
|
||||
}
|
||||
tab[i] = code;
|
||||
}
|
||||
DEFAULT_WORD_DELIM_TABLE = tab;
|
||||
}
|
||||
|
||||
/**
|
||||
* Create a new WordDelimiterIterator operating with the supplied rules.
|
||||
*
|
||||
* @param charTypeTable table containing character types
|
||||
* @param splitOnCaseChange     if true, causes "PowerShot" to be two tokens ("Power-Shot" remains two parts regardless)
|
||||
* @param splitOnNumerics if true, causes "j2se" to be three tokens; "j" "2" "se"
|
||||
* @param stemEnglishPossessive if true, causes trailing "'s" to be removed for each subword: "O'Neil's" => "O", "Neil"
|
||||
*/
|
||||
WordDelimiterIterator(byte[] charTypeTable, boolean splitOnCaseChange, boolean splitOnNumerics, boolean stemEnglishPossessive) {
|
||||
this.charTypeTable = charTypeTable;
|
||||
this.splitOnCaseChange = splitOnCaseChange;
|
||||
this.splitOnNumerics = splitOnNumerics;
|
||||
this.stemEnglishPossessive = stemEnglishPossessive;
|
||||
}
|
||||
|
||||
/**
|
||||
* Advance to the next subword in the string.
|
||||
*
|
||||
* @return index of the next subword, or {@link #DONE} if all subwords have been returned
|
||||
*/
|
||||
int next() {
|
||||
current = end;
|
||||
if (current == DONE) {
|
||||
return DONE;
|
||||
}
|
||||
|
||||
if (skipPossessive) {
|
||||
current += 2;
|
||||
skipPossessive = false;
|
||||
}
|
||||
|
||||
int lastType = 0;
|
||||
|
||||
while (current < endBounds && (isSubwordDelim(lastType = charType(text[current])))) {
|
||||
current++;
|
||||
}
|
||||
|
||||
if (current >= endBounds) {
|
||||
return end = DONE;
|
||||
}
|
||||
|
||||
for (end = current + 1; end < endBounds; end++) {
|
||||
int type = charType(text[end]);
|
||||
if (isBreak(lastType, type)) {
|
||||
break;
|
||||
}
|
||||
lastType = type;
|
||||
}
|
||||
|
||||
if (end < endBounds - 1 && endsWithPossessive(end + 2)) {
|
||||
skipPossessive = true;
|
||||
}
|
||||
|
||||
return end;
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* Return the type of the current subword.
|
||||
* This currently uses the type of the first character in the subword.
|
||||
*
|
||||
* @return type of the current word
|
||||
*/
|
||||
int type() {
|
||||
if (end == DONE) {
|
||||
return 0;
|
||||
}
|
||||
|
||||
int type = charType(text[current]);
|
||||
switch (type) {
|
||||
// return ALPHA word type for both lower and upper
|
||||
case LOWER:
|
||||
case UPPER:
|
||||
return ALPHA;
|
||||
default:
|
||||
return type;
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Reset the text to a new value, and reset all state
|
||||
*
|
||||
* @param text New text
|
||||
* @param length length of the text
|
||||
*/
|
||||
void setText(char text[], int length) {
|
||||
this.text = text;
|
||||
this.length = this.endBounds = length;
|
||||
current = startBounds = end = 0;
|
||||
skipPossessive = hasFinalPossessive = false;
|
||||
setBounds();
|
||||
}
|
||||
|
||||
// ================================================= Helper Methods ================================================
|
||||
|
||||
/**
|
||||
* Determines whether the transition from lastType to type indicates a break
|
||||
*
|
||||
* @param lastType Last subword type
|
||||
* @param type Current subword type
|
||||
* @return {@code true} if the transition indicates a break, {@code false} otherwise
|
||||
*/
|
||||
private boolean isBreak(int lastType, int type) {
|
||||
if ((type & lastType) != 0) {
|
||||
return false;
|
||||
}
|
||||
|
||||
if (!splitOnCaseChange && isAlpha(lastType) && isAlpha(type)) {
|
||||
// ALPHA->ALPHA: always ignore if case isn't considered.
|
||||
return false;
|
||||
} else if (isUpper(lastType) && isAlpha(type)) {
|
||||
// UPPER->letter: Don't split
|
||||
return false;
|
||||
} else if (!splitOnNumerics && ((isAlpha(lastType) && isDigit(type)) || (isDigit(lastType) && isAlpha(type)))) {
|
||||
// ALPHA->NUMERIC, NUMERIC->ALPHA :Don't split
|
||||
return false;
|
||||
}
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
/**
|
||||
* Determines if the current word contains only one subword. Note, it could be potentially surrounded by delimiters
|
||||
*
|
||||
* @return {@code true} if the current word contains only one subword, {@code false} otherwise
|
||||
*/
|
||||
boolean isSingleWord() {
|
||||
if (hasFinalPossessive) {
|
||||
return current == startBounds && end == endBounds - 2;
|
||||
} else {
|
||||
return current == startBounds && end == endBounds;
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Set the internal word bounds (remove leading and trailing delimiters). Note, if a possessive is found, don't remove
|
||||
* it yet, simply note it.
|
||||
*/
|
||||
private void setBounds() {
|
||||
while (startBounds < length && (isSubwordDelim(charType(text[startBounds])))) {
|
||||
startBounds++;
|
||||
}
|
||||
|
||||
while (endBounds > startBounds && (isSubwordDelim(charType(text[endBounds - 1])))) {
|
||||
endBounds--;
|
||||
}
|
||||
if (endsWithPossessive(endBounds)) {
|
||||
hasFinalPossessive = true;
|
||||
}
|
||||
current = startBounds;
|
||||
}
|
||||
|
||||
/**
|
||||
* Determines if the text at the given position indicates an English possessive which should be removed
|
||||
*
|
||||
* @param pos Position in the text to check if it indicates an English possessive
|
||||
* @return {@code true} if the text at the position indicates an English possessive, {@code false} otherwise
|
||||
*/
|
||||
private boolean endsWithPossessive(int pos) {
|
||||
return (stemEnglishPossessive &&
|
||||
pos > 2 &&
|
||||
text[pos - 2] == '\'' &&
|
||||
(text[pos - 1] == 's' || text[pos - 1] == 'S') &&
|
||||
isAlpha(charType(text[pos - 3])) &&
|
||||
(pos == endBounds || isSubwordDelim(charType(text[pos]))));
|
||||
}
|
||||
|
||||
/**
|
||||
* Determines the type of the given character
|
||||
*
|
||||
* @param ch Character whose type is to be determined
|
||||
* @return Type of the character
|
||||
*/
|
||||
private int charType(int ch) {
|
||||
if (ch < charTypeTable.length) {
|
||||
return charTypeTable[ch];
|
||||
}
|
||||
return getType(ch);
|
||||
}
|
||||
|
||||
/**
|
||||
* Computes the type of the given character
|
||||
*
|
||||
* @param ch Character whose type is to be determined
|
||||
* @return Type of the character
|
||||
*/
|
||||
public static byte getType(int ch) {
|
||||
switch (Character.getType(ch)) {
|
||||
case Character.UPPERCASE_LETTER:
|
||||
return UPPER;
|
||||
case Character.LOWERCASE_LETTER:
|
||||
return LOWER;
|
||||
|
||||
case Character.TITLECASE_LETTER:
|
||||
case Character.MODIFIER_LETTER:
|
||||
case Character.OTHER_LETTER:
|
||||
case Character.NON_SPACING_MARK:
|
||||
case Character.ENCLOSING_MARK: // depends what it encloses?
|
||||
case Character.COMBINING_SPACING_MARK:
|
||||
return ALPHA;
|
||||
|
||||
case Character.DECIMAL_DIGIT_NUMBER:
|
||||
case Character.LETTER_NUMBER:
|
||||
case Character.OTHER_NUMBER:
|
||||
return DIGIT;
|
||||
|
||||
// case Character.SPACE_SEPARATOR:
|
||||
// case Character.LINE_SEPARATOR:
|
||||
// case Character.PARAGRAPH_SEPARATOR:
|
||||
// case Character.CONTROL:
|
||||
// case Character.FORMAT:
|
||||
// case Character.PRIVATE_USE:
|
||||
|
||||
case Character.SURROGATE: // prevent splitting
|
||||
return ALPHA | DIGIT;
|
||||
|
||||
// case Character.DASH_PUNCTUATION:
|
||||
// case Character.START_PUNCTUATION:
|
||||
// case Character.END_PUNCTUATION:
|
||||
// case Character.CONNECTOR_PUNCTUATION:
|
||||
// case Character.OTHER_PUNCTUATION:
|
||||
// case Character.MATH_SYMBOL:
|
||||
// case Character.CURRENCY_SYMBOL:
|
||||
// case Character.MODIFIER_SYMBOL:
|
||||
// case Character.OTHER_SYMBOL:
|
||||
// case Character.INITIAL_QUOTE_PUNCTUATION:
|
||||
// case Character.FINAL_QUOTE_PUNCTUATION:
|
||||
|
||||
default:
|
||||
return SUBWORD_DELIM;
|
||||
}
|
||||
}
|
||||
}
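A minimal sketch of the iteration contract described above, not part of the commit. The constructor and the current/end fields are package-private, so this assumes example code living in the same org.apache.lucene.analysis.miscellaneous package; the input text is arbitrary.

package org.apache.lucene.analysis.miscellaneous;

public class WordDelimiterIteratorExample {
    public static void main(String[] args) {
        WordDelimiterIterator it = new WordDelimiterIterator(
                WordDelimiterIterator.DEFAULT_WORD_DELIM_TABLE,
                true,   // splitOnCaseChange
                true,   // splitOnNumerics
                true);  // stemEnglishPossessive

        char[] text = "Super-Duper-XL500".toCharArray();
        it.setText(text, text.length);

        // next() returns the end index of the next subword, or DONE when exhausted;
        // the subword itself is text[it.current .. it.end)
        while (it.next() != WordDelimiterIterator.DONE) {
            System.out.println(new String(text, it.current, it.end - it.current));
        }
        // expected subwords: Super, Duper, XL, 500
    }
}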
|
|
@@ -1,85 +0,0 @@
|
|||
/*
|
||||
* Licensed to ElasticSearch and Shay Banon under one
|
||||
* or more contributor license agreements. See the NOTICE file
|
||||
* distributed with this work for additional information
|
||||
* regarding copyright ownership. ElasticSearch licenses this
|
||||
* file to you under the Apache License, Version 2.0 (the
|
||||
* "License"); you may not use this file except in compliance
|
||||
* with the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing,
|
||||
* software distributed under the License is distributed on an
|
||||
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
* KIND, either express or implied. See the License for the
|
||||
* specific language governing permissions and limitations
|
||||
* under the License.
|
||||
*/
|
||||
|
||||
package org.apache.lucene.analysis.pattern;
|
||||
|
||||
import org.apache.lucene.analysis.TokenFilter;
|
||||
import org.apache.lucene.analysis.TokenStream;
|
||||
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.util.regex.Matcher;
|
||||
import java.util.regex.Pattern;
|
||||
|
||||
/**
|
||||
* A TokenFilter which applies a Pattern to each token in the stream,
|
||||
* replacing match occurrences with the specified replacement string.
|
||||
* <p/>
|
||||
* <p>
|
||||
* <b>Note:</b> Depending on the input and the pattern used and the input
|
||||
* TokenStream, this TokenFilter may produce Tokens whose text is the empty
|
||||
* string.
|
||||
* </p>
|
||||
*
|
||||
* @see Pattern
|
||||
*/
|
||||
public final class PatternReplaceFilter extends TokenFilter {
|
||||
private final Pattern p;
|
||||
private final String replacement;
|
||||
private final boolean all;
|
||||
private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
|
||||
private final Matcher m;
|
||||
|
||||
/**
|
||||
* Constructs an instance to replace either the first or all occurrences
|
||||
*
|
||||
* @param in the TokenStream to process
|
||||
* @param p           the pattern to apply to each Token
|
||||
* @param replacement the "replacement string" to substitute, if null a
|
||||
* blank string will be used. Note that this is not the literal
|
||||
* string that will be used, '$' and '\' have special meaning.
|
||||
* @param all if true, all matches will be replaced otherwise just the first match.
|
||||
* @see Matcher#quoteReplacement
|
||||
*/
|
||||
public PatternReplaceFilter(TokenStream in,
|
||||
Pattern p,
|
||||
String replacement,
|
||||
boolean all) {
|
||||
super(in);
|
||||
this.p = p;
|
||||
this.replacement = (null == replacement) ? "" : replacement;
|
||||
this.all = all;
|
||||
this.m = p.matcher(termAtt);
|
||||
}
|
||||
|
||||
@Override
|
||||
public boolean incrementToken() throws IOException {
|
||||
if (!input.incrementToken()) return false;
|
||||
|
||||
m.reset();
|
||||
if (m.find()) {
|
||||
// replaceAll/replaceFirst will reset() this previous find.
|
||||
String transformed = all ? m.replaceAll(replacement) : m.replaceFirst(replacement);
|
||||
termAtt.setEmpty().append(transformed);
|
||||
}
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
}
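A minimal sketch of the replacement behaviour described above, not part of the commit. It assumes the Lucene 3.6 WhitespaceTokenizer this removed copy was used alongside; the pattern, replacement, and input are arbitrary.

import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.WhitespaceTokenizer;
import org.apache.lucene.analysis.pattern.PatternReplaceFilter;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.util.Version;

import java.io.StringReader;
import java.util.regex.Pattern;

public class PatternReplaceFilterExample {
    public static void main(String[] args) throws Exception {
        // collapse every run of digits inside each token down to a single '#'
        TokenStream stream = new PatternReplaceFilter(
                new WhitespaceTokenizer(Version.LUCENE_36, new StringReader("foo123 bar4567baz")),
                Pattern.compile("\\d+"), "#", true);
        CharTermAttribute term = stream.addAttribute(CharTermAttribute.class);
        stream.reset();
        while (stream.incrementToken()) {
            System.out.println(term.toString());   // foo#  then  bar#baz
        }
        stream.end();
        stream.close();
    }
}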
|
|
@@ -1,153 +0,0 @@
|
|||
/*
|
||||
* Licensed to ElasticSearch and Shay Banon under one
|
||||
* or more contributor license agreements. See the NOTICE file
|
||||
* distributed with this work for additional information
|
||||
* regarding copyright ownership. ElasticSearch licenses this
|
||||
* file to you under the Apache License, Version 2.0 (the
|
||||
* "License"); you may not use this file except in compliance
|
||||
* with the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing,
|
||||
* software distributed under the License is distributed on an
|
||||
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
* KIND, either express or implied. See the License for the
|
||||
* specific language governing permissions and limitations
|
||||
* under the License.
|
||||
*/
|
||||
|
||||
package org.apache.lucene.analysis.pattern;
|
||||
|
||||
import org.apache.lucene.analysis.Tokenizer;
|
||||
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
|
||||
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.io.Reader;
|
||||
import java.util.regex.Matcher;
|
||||
import java.util.regex.Pattern;
|
||||
|
||||
/**
|
||||
* This tokenizer uses regex pattern matching to construct distinct tokens
|
||||
* for the input stream. It takes two arguments: "pattern" and "group".
|
||||
* <p/>
|
||||
* <ul>
|
||||
* <li>"pattern" is the regular expression.</li>
|
||||
* <li>"group" says which group to extract into tokens.</li>
|
||||
* </ul>
|
||||
* <p>
|
||||
* group=-1 (the default) is equivalent to "split". In this case, the tokens will
|
||||
* be equivalent to the output from (without empty tokens):
|
||||
* {@link String#split(java.lang.String)}
|
||||
* </p>
|
||||
* <p>
|
||||
* Using group >= 0 selects the matching group as the token. For example, if you have:<br/>
|
||||
* <pre>
|
||||
* pattern = \'([^\']+)\'
|
||||
* group = 0
|
||||
* input = aaa 'bbb' 'ccc'
|
||||
* </pre>
|
||||
* the output will be two tokens: 'bbb' and 'ccc' (including the ' marks). With the same input
|
||||
* but using group=1, the output would be: bbb and ccc (no ' marks)
|
||||
* </p>
|
||||
* <p>NOTE: This Tokenizer does not output tokens that are of zero length.</p>
|
||||
*
|
||||
* @see Pattern
|
||||
*/
|
||||
public final class PatternTokenizer extends Tokenizer {
|
||||
|
||||
private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
|
||||
private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);
|
||||
|
||||
private final StringBuilder str = new StringBuilder();
|
||||
private int index;
|
||||
|
||||
private final Pattern pattern;
|
||||
private final int group;
|
||||
private final Matcher matcher;
|
||||
|
||||
/**
|
||||
* creates a new PatternTokenizer returning tokens from group (-1 for split functionality)
|
||||
*/
|
||||
public PatternTokenizer(Reader input, Pattern pattern, int group) throws IOException {
|
||||
super(input);
|
||||
this.pattern = pattern;
|
||||
this.group = group;
|
||||
fillBuffer(str, input);
|
||||
matcher = pattern.matcher(str);
|
||||
index = 0;
|
||||
}
|
||||
|
||||
@Override
|
||||
public boolean incrementToken() throws IOException {
|
||||
if (index >= str.length()) return false;
|
||||
clearAttributes();
|
||||
if (group >= 0) {
|
||||
|
||||
// match a specific group
|
||||
while (matcher.find()) {
|
||||
index = matcher.start(group);
|
||||
final int endIndex = matcher.end(group);
|
||||
if (index == endIndex) continue;
|
||||
termAtt.setEmpty().append(str, index, endIndex);
|
||||
offsetAtt.setOffset(correctOffset(index), correctOffset(endIndex));
|
||||
return true;
|
||||
}
|
||||
|
||||
index = Integer.MAX_VALUE; // mark exhausted
|
||||
return false;
|
||||
|
||||
} else {
|
||||
|
||||
// String.split() functionality
|
||||
while (matcher.find()) {
|
||||
if (matcher.start() - index > 0) {
|
||||
// found a non-zero-length token
|
||||
termAtt.setEmpty().append(str, index, matcher.start());
|
||||
offsetAtt.setOffset(correctOffset(index), correctOffset(matcher.start()));
|
||||
index = matcher.end();
|
||||
return true;
|
||||
}
|
||||
|
||||
index = matcher.end();
|
||||
}
|
||||
|
||||
if (str.length() - index == 0) {
|
||||
index = Integer.MAX_VALUE; // mark exhausted
|
||||
return false;
|
||||
}
|
||||
|
||||
termAtt.setEmpty().append(str, index, str.length());
|
||||
offsetAtt.setOffset(correctOffset(index), correctOffset(str.length()));
|
||||
index = Integer.MAX_VALUE; // mark exhausted
|
||||
return true;
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
public void end() throws IOException {
|
||||
final int ofs = correctOffset(str.length());
|
||||
offsetAtt.setOffset(ofs, ofs);
|
||||
}
|
||||
|
||||
@Override
|
||||
public void reset(Reader input) throws IOException {
|
||||
super.reset(input);
|
||||
fillBuffer(str, input);
|
||||
matcher.reset(str);
|
||||
index = 0;
|
||||
}
|
||||
|
||||
// TODO: we should see if we can make this tokenizer work without reading
|
||||
// the entire document into RAM, perhaps with Matcher.hitEnd/requireEnd ?
|
||||
final char[] buffer = new char[8192];
|
||||
|
||||
private void fillBuffer(StringBuilder sb, Reader input) throws IOException {
|
||||
int len;
|
||||
sb.setLength(0);
|
||||
while ((len = input.read(buffer)) > 0) {
|
||||
sb.append(buffer, 0, len);
|
||||
}
|
||||
}
|
||||
}
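A minimal sketch, not part of the commit, that exercises the group semantics from the javadoc above using the javadoc's own example input and pattern; only the constructor and attributes shown in this file are used.

import org.apache.lucene.analysis.pattern.PatternTokenizer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

import java.io.StringReader;
import java.util.regex.Pattern;

public class PatternTokenizerExample {
    public static void main(String[] args) throws Exception {
        // group = 1 extracts the text between the quotes, as in the javadoc example
        PatternTokenizer tokenizer = new PatternTokenizer(
                new StringReader("aaa 'bbb' 'ccc'"), Pattern.compile("'([^']+)'"), 1);
        CharTermAttribute term = tokenizer.addAttribute(CharTermAttribute.class);
        tokenizer.reset();
        while (tokenizer.incrementToken()) {
            System.out.println(term.toString());   // bbb  then  ccc
        }
        tokenizer.end();
        tokenizer.close();
    }
}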
|
|
@@ -1,451 +0,0 @@
|
|||
package org.apache.lucene.index;
|
||||
|
||||
/**
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
import org.apache.lucene.search.DocIdSet;
|
||||
import org.apache.lucene.search.DocIdSetIterator;
|
||||
import org.apache.lucene.search.Query;
|
||||
import org.apache.lucene.search.QueryWrapperFilter;
|
||||
import org.apache.lucene.util.UnicodeUtil;
|
||||
import org.elasticsearch.common.Unicode;
|
||||
import org.elasticsearch.common.bloom.BloomFilter;
|
||||
import org.elasticsearch.index.cache.bloom.BloomCache;
|
||||
import org.elasticsearch.index.mapper.internal.UidFieldMapper;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.io.PrintStream;
|
||||
import java.util.*;
|
||||
import java.util.concurrent.atomic.AtomicInteger;
|
||||
import java.util.concurrent.atomic.AtomicLong;
|
||||
|
||||
/* Tracks the stream of {@link BufferedDeletes}.
|
||||
* When DocumentsWriter flushes, its buffered
|
||||
* deletes are appended to this stream. We later
|
||||
* apply these deletes (resolve them to the actual
|
||||
* docIDs, per segment) when a merge is started
|
||||
* (only to the to-be-merged segments). We
|
||||
* also apply to all segments when NRT reader is pulled,
|
||||
* commit/close is called, or when too many deletes are
|
||||
* buffered and must be flushed (by RAM usage or by count).
|
||||
*
|
||||
* Each packet is assigned a generation, and each flushed or
|
||||
* merged segment is also assigned a generation, so we can
|
||||
* track which BufferedDeletes packets to apply to any given
|
||||
* segment. */
|
||||
|
||||
// LUCENE MONITOR: We copied this class from Lucene, effectively overriding it with our implementation
|
||||
// if it comes first in the classpath, allowing for faster apply deletes based on terms
|
||||
class BufferedDeletesStream implements XIndexWriter.XBufferedDeletesStream {
|
||||
|
||||
// TODO: maybe linked list?
|
||||
private final List<FrozenBufferedDeletes> deletes = new ArrayList<FrozenBufferedDeletes>();
|
||||
|
||||
// Starts at 1 so that SegmentInfos that have never had
|
||||
// deletes applied (whose bufferedDelGen defaults to 0)
|
||||
// will be correct:
|
||||
private long nextGen = 1;
|
||||
|
||||
// used only by assert
|
||||
private Term lastDeleteTerm;
|
||||
|
||||
private PrintStream infoStream;
|
||||
private final AtomicLong bytesUsed = new AtomicLong();
|
||||
private final AtomicInteger numTerms = new AtomicInteger();
|
||||
private final int messageID;
|
||||
|
||||
private BloomCache bloomCache;
|
||||
|
||||
public BufferedDeletesStream(int messageID) {
|
||||
this.messageID = messageID;
|
||||
}
|
||||
|
||||
private synchronized void message(String message) {
|
||||
if (infoStream != null) {
|
||||
infoStream.println("BD " + messageID + " [" + new Date() + "; " + Thread.currentThread().getName() + "]: " + message);
|
||||
}
|
||||
}
|
||||
|
||||
public synchronized void setInfoStream(PrintStream infoStream) {
|
||||
this.infoStream = infoStream;
|
||||
}
|
||||
|
||||
public void setBloomCache(BloomCache bloomCache) {
|
||||
this.bloomCache = bloomCache;
|
||||
}
|
||||
|
||||
// Appends a new packet of buffered deletes to the stream,
|
||||
// setting its generation:
|
||||
public synchronized void push(FrozenBufferedDeletes packet) {
|
||||
assert packet.any();
|
||||
assert checkDeleteStats();
|
||||
assert packet.gen < nextGen;
|
||||
deletes.add(packet);
|
||||
numTerms.addAndGet(packet.numTermDeletes);
|
||||
bytesUsed.addAndGet(packet.bytesUsed);
|
||||
if (infoStream != null) {
|
||||
message("push deletes " + packet + " delGen=" + packet.gen + " packetCount=" + deletes.size());
|
||||
}
|
||||
assert checkDeleteStats();
|
||||
}
|
||||
|
||||
public synchronized void clear() {
|
||||
deletes.clear();
|
||||
nextGen = 1;
|
||||
numTerms.set(0);
|
||||
bytesUsed.set(0);
|
||||
}
|
||||
|
||||
public boolean any() {
|
||||
return bytesUsed.get() != 0;
|
||||
}
|
||||
|
||||
public int numTerms() {
|
||||
return numTerms.get();
|
||||
}
|
||||
|
||||
public long bytesUsed() {
|
||||
return bytesUsed.get();
|
||||
}
|
||||
|
||||
public static class ApplyDeletesResult {
|
||||
// True if any actual deletes took place:
|
||||
public final boolean anyDeletes;
|
||||
|
||||
// Current gen, for the merged segment:
|
||||
public final long gen;
|
||||
|
||||
// If non-null, contains segments that are 100% deleted
|
||||
public final List<SegmentInfo> allDeleted;
|
||||
|
||||
ApplyDeletesResult(boolean anyDeletes, long gen, List<SegmentInfo> allDeleted) {
|
||||
this.anyDeletes = anyDeletes;
|
||||
this.gen = gen;
|
||||
this.allDeleted = allDeleted;
|
||||
}
|
||||
}
|
||||
|
||||
// Sorts SegmentInfos from smallest to biggest bufferedDelGen:
|
||||
private static final Comparator<SegmentInfo> sortByDelGen = new Comparator<SegmentInfo>() {
|
||||
// @Override -- not until Java 1.6
|
||||
public int compare(SegmentInfo si1, SegmentInfo si2) {
|
||||
final long cmp = si1.getBufferedDeletesGen() - si2.getBufferedDeletesGen();
|
||||
if (cmp > 0) {
|
||||
return 1;
|
||||
} else if (cmp < 0) {
|
||||
return -1;
|
||||
} else {
|
||||
return 0;
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
/**
|
||||
* Resolves the buffered deleted Term/Query/docIDs, into
|
||||
* actual deleted docIDs in the deletedDocs BitVector for
|
||||
* each SegmentReader.
|
||||
*/
|
||||
public synchronized ApplyDeletesResult applyDeletes(IndexWriter.ReaderPool readerPool, List<SegmentInfo> infos) throws IOException {
|
||||
final long t0 = System.currentTimeMillis();
|
||||
|
||||
if (infos.size() == 0) {
|
||||
return new ApplyDeletesResult(false, nextGen++, null);
|
||||
}
|
||||
|
||||
assert checkDeleteStats();
|
||||
|
||||
if (!any()) {
|
||||
message("applyDeletes: no deletes; skipping");
|
||||
return new ApplyDeletesResult(false, nextGen++, null);
|
||||
}
|
||||
|
||||
if (infoStream != null) {
|
||||
message("applyDeletes: infos=" + infos + " packetCount=" + deletes.size());
|
||||
}
|
||||
|
||||
List<SegmentInfo> infos2 = new ArrayList<SegmentInfo>();
|
||||
infos2.addAll(infos);
|
||||
Collections.sort(infos2, sortByDelGen);
|
||||
|
||||
CoalescedDeletes coalescedDeletes = null;
|
||||
boolean anyNewDeletes = false;
|
||||
|
||||
int infosIDX = infos2.size() - 1;
|
||||
int delIDX = deletes.size() - 1;
|
||||
|
||||
List<SegmentInfo> allDeleted = null;
|
||||
|
||||
while (infosIDX >= 0) {
|
||||
//System.out.println("BD: cycle delIDX=" + delIDX + " infoIDX=" + infosIDX);
|
||||
|
||||
final FrozenBufferedDeletes packet = delIDX >= 0 ? deletes.get(delIDX) : null;
|
||||
final SegmentInfo info = infos2.get(infosIDX);
|
||||
final long segGen = info.getBufferedDeletesGen();
|
||||
|
||||
if (packet != null && segGen < packet.gen) {
|
||||
//System.out.println(" coalesce");
|
||||
if (coalescedDeletes == null) {
|
||||
coalescedDeletes = new CoalescedDeletes();
|
||||
}
|
||||
coalescedDeletes.update(packet);
|
||||
delIDX--;
|
||||
} else if (packet != null && segGen == packet.gen) {
|
||||
//System.out.println(" eq");
|
||||
|
||||
// Lock order: IW -> BD -> RP
|
||||
assert readerPool.infoIsLive(info);
|
||||
SegmentReader reader = readerPool.get(info, false);
|
||||
int delCount = 0;
|
||||
final boolean segAllDeletes;
|
||||
try {
|
||||
if (coalescedDeletes != null) {
|
||||
//System.out.println(" del coalesced");
|
||||
delCount += applyTermDeletes(coalescedDeletes.termsIterable(), reader);
|
||||
delCount += applyQueryDeletes(coalescedDeletes.queriesIterable(), reader);
|
||||
}
|
||||
//System.out.println(" del exact");
|
||||
// Don't delete by Term here; DocumentsWriter
|
||||
// already did that on flush:
|
||||
delCount += applyQueryDeletes(packet.queriesIterable(), reader);
|
||||
segAllDeletes = reader.numDocs() == 0;
|
||||
} finally {
|
||||
readerPool.release(reader);
|
||||
}
|
||||
anyNewDeletes |= delCount > 0;
|
||||
|
||||
if (segAllDeletes) {
|
||||
if (allDeleted == null) {
|
||||
allDeleted = new ArrayList<SegmentInfo>();
|
||||
}
|
||||
allDeleted.add(info);
|
||||
}
|
||||
|
||||
if (infoStream != null) {
|
||||
message("seg=" + info + " segGen=" + segGen + " segDeletes=[" + packet + "]; coalesced deletes=[" + (coalescedDeletes == null ? "null" : coalescedDeletes) + "] delCount=" + delCount + (segAllDeletes ? " 100% deleted" : ""));
|
||||
}
|
||||
|
||||
if (coalescedDeletes == null) {
|
||||
coalescedDeletes = new CoalescedDeletes();
|
||||
}
|
||||
coalescedDeletes.update(packet);
|
||||
delIDX--;
|
||||
infosIDX--;
|
||||
info.setBufferedDeletesGen(nextGen);
|
||||
|
||||
} else {
|
||||
//System.out.println(" gt");
|
||||
|
||||
if (coalescedDeletes != null) {
|
||||
// Lock order: IW -> BD -> RP
|
||||
assert readerPool.infoIsLive(info);
|
||||
SegmentReader reader = readerPool.get(info, false);
|
||||
int delCount = 0;
|
||||
final boolean segAllDeletes;
|
||||
try {
|
||||
delCount += applyTermDeletes(coalescedDeletes.termsIterable(), reader);
|
||||
delCount += applyQueryDeletes(coalescedDeletes.queriesIterable(), reader);
|
||||
segAllDeletes = reader.numDocs() == 0;
|
||||
} finally {
|
||||
readerPool.release(reader);
|
||||
}
|
||||
anyNewDeletes |= delCount > 0;
|
||||
|
||||
if (segAllDeletes) {
|
||||
if (allDeleted == null) {
|
||||
allDeleted = new ArrayList<SegmentInfo>();
|
||||
}
|
||||
allDeleted.add(info);
|
||||
}
|
||||
|
||||
if (infoStream != null) {
|
||||
message("seg=" + info + " segGen=" + segGen + " coalesced deletes=[" + (coalescedDeletes == null ? "null" : coalescedDeletes) + "] delCount=" + delCount + (segAllDeletes ? " 100% deleted" : ""));
|
||||
}
|
||||
}
|
||||
info.setBufferedDeletesGen(nextGen);
|
||||
|
||||
infosIDX--;
|
||||
}
|
||||
}
|
||||
|
||||
assert checkDeleteStats();
|
||||
if (infoStream != null) {
|
||||
message("applyDeletes took " + (System.currentTimeMillis() - t0) + " msec");
|
||||
}
|
||||
// assert infos != segmentInfos || !any() : "infos=" + infos + " segmentInfos=" + segmentInfos + " any=" + any;
|
||||
|
||||
return new ApplyDeletesResult(anyNewDeletes, nextGen++, allDeleted);
|
||||
}
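// Worked example (illustrative, not from the original source): suppose the sorted segments carry
// bufferedDelGens {2, 5, 7} and the pushed packets have gens {3, 5, 6}. Walking both lists from the
// newest end: the gen-7 segment is newer than every packet, so nothing needs to be applied to it;
// the gen-5 segment first coalesces the gen-6 packet, then matches the gen-5 packet, so it receives
// the coalesced term and query deletes plus only the query deletes of its own packet (its term
// deletes were already applied on flush); the gen-2 segment finally receives the coalesced deletes
// of all three packets in full.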
|
||||
|
||||
public synchronized long getNextGen() {
|
||||
return nextGen++;
|
||||
}
|
||||
|
||||
// Lock order IW -> BD
|
||||
/* Removes any BufferedDeletes that we no longer need to
|
||||
* store because all segments in the index have had the
|
||||
* deletes applied. */
|
||||
public synchronized void prune(SegmentInfos segmentInfos) {
|
||||
assert checkDeleteStats();
|
||||
long minGen = Long.MAX_VALUE;
|
||||
for (SegmentInfo info : segmentInfos) {
|
||||
minGen = Math.min(info.getBufferedDeletesGen(), minGen);
|
||||
}
|
||||
|
||||
if (infoStream != null) {
|
||||
message("prune sis=" + segmentInfos + " minGen=" + minGen + " packetCount=" + deletes.size());
|
||||
}
|
||||
|
||||
final int limit = deletes.size();
|
||||
for (int delIDX = 0; delIDX < limit; delIDX++) {
|
||||
if (deletes.get(delIDX).gen >= minGen) {
|
||||
prune(delIDX);
|
||||
assert checkDeleteStats();
|
||||
return;
|
||||
}
|
||||
}
|
||||
|
||||
// All deletes pruned
|
||||
prune(limit);
|
||||
assert !any();
|
||||
assert checkDeleteStats();
|
||||
}
|
||||
|
||||
private synchronized void prune(int count) {
|
||||
if (count > 0) {
|
||||
if (infoStream != null) {
|
||||
message("pruneDeletes: prune " + count + " packets; " + (deletes.size() - count) + " packets remain");
|
||||
}
|
||||
for (int delIDX = 0; delIDX < count; delIDX++) {
|
||||
final FrozenBufferedDeletes packet = deletes.get(delIDX);
|
||||
numTerms.addAndGet(-packet.numTermDeletes);
|
||||
assert numTerms.get() >= 0;
|
||||
bytesUsed.addAndGet(-packet.bytesUsed);
|
||||
assert bytesUsed.get() >= 0;
|
||||
}
|
||||
deletes.subList(0, count).clear();
|
||||
}
|
||||
}
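// Worked example (illustrative): if the remaining segments carry bufferedDelGens {4, 6, 9}, then
// minGen == 4, every packet whose gen < 4 has already been applied to all of them, and
// prune(SegmentInfos) drops exactly those leading packets, keeping the first packet with
// gen >= 4 and everything after it.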
|
||||
|
||||
// ES CHANGE: Add bloom filter usage
|
||||
// Delete by Term
|
||||
private synchronized long applyTermDeletes(Iterable<Term> termsIter, SegmentReader reader) throws IOException {
|
||||
long delCount = 0;
|
||||
|
||||
assert checkDeleteTerm(null);
|
||||
|
||||
BloomFilter filter = bloomCache == null ? BloomFilter.NONE : bloomCache.filter(reader, UidFieldMapper.NAME, true);
|
||||
UnicodeUtil.UTF8Result utf8 = new UnicodeUtil.UTF8Result();
|
||||
|
||||
TermDocs docs = reader.termDocs();
|
||||
|
||||
for (Term term : termsIter) {
|
||||
|
||||
if (term.field() == UidFieldMapper.NAME) {
|
||||
Unicode.fromStringAsUtf8(term.text(), utf8);
|
||||
if (!filter.isPresent(utf8.result, 0, utf8.length)) {
|
||||
continue;
|
||||
}
|
||||
}
|
||||
if (docs == null) {
|
||||
docs = reader.termDocs();
|
||||
}
|
||||
|
||||
// Since we visit terms sorted, we gain performance
|
||||
// by re-using the same TermDocs and seeking only
|
||||
// forwards
|
||||
assert checkDeleteTerm(term);
|
||||
docs.seek(term);
|
||||
|
||||
while (docs.next()) {
|
||||
final int docID = docs.doc();
|
||||
reader.deleteDocument(docID);
|
||||
// TODO: we could/should change
|
||||
// reader.deleteDocument to return boolean
|
||||
// true if it did in fact delete, because here
|
||||
// we could be deleting an already-deleted doc
|
||||
// which makes this an upper bound:
|
||||
delCount++;
|
||||
}
|
||||
}
|
||||
|
||||
return delCount;
|
||||
}
|
||||
|
||||
public static class QueryAndLimit {
|
||||
public final Query query;
|
||||
public final int limit;
|
||||
|
||||
public QueryAndLimit(Query query, int limit) {
|
||||
this.query = query;
|
||||
this.limit = limit;
|
||||
}
|
||||
}
|
||||
|
||||
// Delete by query
|
||||
private synchronized long applyQueryDeletes(Iterable<QueryAndLimit> queriesIter, SegmentReader reader) throws IOException {
|
||||
long delCount = 0;
|
||||
|
||||
for (QueryAndLimit ent : queriesIter) {
|
||||
Query query = ent.query;
|
||||
int limit = ent.limit;
|
||||
final DocIdSet docs = new QueryWrapperFilter(query).getDocIdSet(reader);
|
||||
if (docs != null) {
|
||||
final DocIdSetIterator it = docs.iterator();
|
||||
if (it != null) {
|
||||
while (true) {
|
||||
int doc = it.nextDoc();
|
||||
if (doc >= limit)
|
||||
break;
|
||||
|
||||
reader.deleteDocument(doc);
|
||||
// TODO: we could/should change
|
||||
// reader.deleteDocument to return boolean
|
||||
// true if it did in fact delete, because here
|
||||
// we could be deleting an already-deleted doc
|
||||
// which makes this an upper bound:
|
||||
delCount++;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return delCount;
|
||||
}
|
||||
|
||||
// used only by assert
|
||||
private boolean checkDeleteTerm(Term term) {
|
||||
if (term != null) {
|
||||
assert lastDeleteTerm == null || term.compareTo(lastDeleteTerm) > 0 : "lastTerm=" + lastDeleteTerm + " vs term=" + term;
|
||||
}
|
||||
// TODO: we re-use term now in our merged iterable, but we shouldn't clone, instead copy for this assert
|
||||
lastDeleteTerm = term == null ? null : new Term(term.field(), term.text());
|
||||
return true;
|
||||
}
|
||||
|
||||
// only for assert
|
||||
private boolean checkDeleteStats() {
|
||||
int numTerms2 = 0;
|
||||
long bytesUsed2 = 0;
|
||||
for (FrozenBufferedDeletes packet : deletes) {
|
||||
numTerms2 += packet.numTermDeletes;
|
||||
bytesUsed2 += packet.bytesUsed;
|
||||
}
|
||||
assert numTerms2 == numTerms.get() : "numTerms2=" + numTerms2 + " vs " + numTerms.get();
|
||||
assert bytesUsed2 == bytesUsed.get() : "bytesUsed2=" + bytesUsed2 + " vs " + bytesUsed;
|
||||
return true;
|
||||
}
|
||||
}
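A minimal, self-contained sketch of the bloom-filter pre-check that applyTermDeletes relies on above; the 3-hash BitSet filter below is purely illustrative and is not the Elasticsearch BloomCache/BloomFilter implementation.

import java.nio.charset.StandardCharsets;
import java.util.BitSet;

class UidBloomSketch {
    private static final int SIZE = 1 << 20; // number of bits in the toy filter
    private final BitSet bits = new BitSet(SIZE);

    void add(String uid) {
        for (int seed = 1; seed <= 3; seed++) {
            bits.set(hash(uid, seed));
        }
    }

    // false means "definitely not indexed", so the caller can skip the costly term seek;
    // true means "possibly indexed", so the caller still has to look the term up.
    boolean mightContain(String uid) {
        for (int seed = 1; seed <= 3; seed++) {
            if (!bits.get(hash(uid, seed))) {
                return false;
            }
        }
        return true;
    }

    private static int hash(String uid, int seed) {
        int h = seed;
        for (byte b : uid.getBytes(StandardCharsets.UTF_8)) {
            h = 31 * h + (b & 0xff);
        }
        return (h & 0x7fffffff) % SIZE;
    }
}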
|
|
@ -1,49 +0,0 @@
|
|||
/*
|
||||
* Licensed to ElasticSearch and Shay Banon under one
|
||||
* or more contributor license agreements. See the NOTICE file
|
||||
* distributed with this work for additional information
|
||||
* regarding copyright ownership. ElasticSearch licenses this
|
||||
* file to you under the Apache License, Version 2.0 (the
|
||||
* "License"); you may not use this file except in compliance
|
||||
* with the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing,
|
||||
* software distributed under the License is distributed on an
|
||||
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
* KIND, either express or implied. See the License for the
|
||||
* specific language governing permissions and limitations
|
||||
* under the License.
|
||||
*/
|
||||
|
||||
package org.apache.lucene.index;
|
||||
|
||||
import org.apache.lucene.search.IndexSearcher;
|
||||
|
||||
/**
|
||||
*
|
||||
*/
|
||||
public class ExtendedIndexSearcher extends IndexSearcher {
|
||||
|
||||
public ExtendedIndexSearcher(ExtendedIndexSearcher searcher) {
|
||||
super(searcher.getIndexReader(), searcher.subReaders(), searcher.docStarts());
|
||||
setSimilarity(searcher.getSimilarity());
|
||||
}
|
||||
|
||||
public ExtendedIndexSearcher(IndexReader r) {
|
||||
super(r);
|
||||
}
|
||||
|
||||
public IndexReader[] subReaders() {
|
||||
return this.subReaders;
|
||||
}
|
||||
|
||||
public int[] docStarts() {
|
||||
return this.docStarts;
|
||||
}
|
||||
|
||||
public int readerIndex(int doc) {
|
||||
return DirectoryReader.readerIndex(doc, docStarts, subReaders.length);
|
||||
}
|
||||
}
|
|
@ -84,7 +84,7 @@ public class TrackingConcurrentMergeScheduler extends ConcurrentMergeScheduler {
|
|||
currentMergesNumDocs.inc(totalNumDocs);
|
||||
currentMergesSizeInBytes.inc(totalSizeInBytes);
|
||||
if (logger.isTraceEnabled()) {
|
||||
logger.trace("merge [{}] starting..., merging [{}] segments, [{}] docs, [{}] size, into [{}] estimated_size", merge.info == null ? "_na_" : merge.info.name, merge.segments.size(), totalNumDocs, new ByteSizeValue(totalSizeInBytes), new ByteSizeValue(merge.estimatedMergeBytes));
|
||||
logger.trace("merge [{}] starting..., merging [{}] segments, [{}] docs, [{}] size, into [{}] estimated_size", merge.info == null ? "_na_" : merge.info.info.name, merge.segments.size(), totalNumDocs, new ByteSizeValue(totalSizeInBytes), new ByteSizeValue(merge.estimatedMergeBytes));
|
||||
}
|
||||
try {
|
||||
TrackingMergeScheduler.setCurrentMerge(merge);
|
||||
|
@ -101,9 +101,9 @@ public class TrackingConcurrentMergeScheduler extends ConcurrentMergeScheduler {
|
|||
totalMergesSizeInBytes.inc(totalSizeInBytes);
|
||||
totalMerges.inc(took);
|
||||
if (took > 20000) { // if more than 20 seconds, DEBUG log it
|
||||
logger.debug("merge [{}] done, took [{}]", merge.info == null ? "_na_" : merge.info.name, TimeValue.timeValueMillis(took));
|
||||
logger.debug("merge [{}] done, took [{}]", merge.info == null ? "_na_" : merge.info.info.name, TimeValue.timeValueMillis(took));
|
||||
} else if (logger.isTraceEnabled()) {
|
||||
logger.trace("merge [{}] done, took [{}]", merge.info == null ? "_na_" : merge.info.name, TimeValue.timeValueMillis(took));
|
||||
logger.trace("merge [{}] done, took [{}]", merge.info == null ? "_na_" : merge.info.info.name, TimeValue.timeValueMillis(took));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
|
@ -96,7 +96,7 @@ public class TrackingSerialMergeScheduler extends MergeScheduler {
|
|||
|
||||
// sadly, segment name is not available since mergeInit is called from merge itself...
|
||||
if (logger.isTraceEnabled()) {
|
||||
logger.trace("merge [{}] starting..., merging [{}] segments, [{}] docs, [{}] size, into [{}] estimated_size", merge.info == null ? "_na_" : merge.info.name, merge.segments.size(), totalNumDocs, new ByteSizeValue(totalSizeInBytes), new ByteSizeValue(merge.estimatedMergeBytes));
|
||||
logger.trace("merge [{}] starting..., merging [{}] segments, [{}] docs, [{}] size, into [{}] estimated_size", merge.info == null ? "_na_" : merge.info.info.name, merge.segments.size(), totalNumDocs, new ByteSizeValue(totalSizeInBytes), new ByteSizeValue(merge.estimatedMergeBytes));
|
||||
}
|
||||
try {
|
||||
TrackingMergeScheduler.setCurrentMerge(merge);
|
||||
|
@ -113,9 +113,9 @@ public class TrackingSerialMergeScheduler extends MergeScheduler {
|
|||
totalMergesSizeInBytes.inc(totalSizeInBytes);
|
||||
totalMerges.inc(took);
|
||||
if (took > 20000) { // if more than 20 seconds, DEBUG log it
|
||||
logger.debug("merge [{}] done, took [{}]", merge.info == null ? "_na_" : merge.info.name, TimeValue.timeValueMillis(took));
|
||||
logger.debug("merge [{}] done, took [{}]", merge.info == null ? "_na_" : merge.info.info.name, TimeValue.timeValueMillis(took));
|
||||
} else if (logger.isTraceEnabled()) {
|
||||
logger.trace("merge [{}] done, took [{}]", merge.info == null ? "_na_" : merge.info.name, TimeValue.timeValueMillis(took));
|
||||
logger.trace("merge [{}] done, took [{}]", merge.info == null ? "_na_" : merge.info.info.name, TimeValue.timeValueMillis(took));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
|
@ -1,29 +0,0 @@
|
|||
package org.apache.lucene.index;
|
||||
|
||||
import org.apache.lucene.store.Directory;
|
||||
import org.apache.lucene.store.LockObtainFailedException;
|
||||
import org.elasticsearch.common.logging.ESLogger;
|
||||
import org.elasticsearch.index.cache.bloom.BloomCache;
|
||||
|
||||
import java.io.IOException;
|
||||
|
||||
/**
|
||||
*/
|
||||
public class XIndexWriter extends IndexWriter {
|
||||
|
||||
private final ESLogger logger;
|
||||
|
||||
public XIndexWriter(Directory d, IndexWriterConfig conf, ESLogger logger, BloomCache bloomCache) throws CorruptIndexException, LockObtainFailedException, IOException {
|
||||
super(d, conf);
|
||||
this.logger = logger;
|
||||
if (bufferedDeletesStream instanceof XBufferedDeletesStream) {
|
||||
logger.debug("using bloom filter enhanced delete handling");
|
||||
((XBufferedDeletesStream) bufferedDeletesStream).setBloomCache(bloomCache);
|
||||
}
|
||||
}
|
||||
|
||||
public static interface XBufferedDeletesStream {
|
||||
|
||||
void setBloomCache(BloomCache bloomCache);
|
||||
}
|
||||
}
|
File diff suppressed because it is too large
|
@ -17,7 +17,7 @@
|
|||
* under the License.
|
||||
*/
|
||||
|
||||
package org.apache.lucene.queryParser;
|
||||
package org.apache.lucene.queryparser.classic;
|
||||
|
||||
import org.apache.lucene.search.DeletionAwareConstantScoreQuery;
|
||||
import org.apache.lucene.search.Filter;
|
|
@ -17,7 +17,7 @@
|
|||
* under the License.
|
||||
*/
|
||||
|
||||
package org.apache.lucene.queryParser;
|
||||
package org.apache.lucene.queryparser.classic;
|
||||
|
||||
import org.apache.lucene.search.Query;
|
||||
import org.elasticsearch.index.query.QueryParseContext;
|
|
@ -17,7 +17,7 @@
|
|||
* under the License.
|
||||
*/
|
||||
|
||||
package org.apache.lucene.queryParser;
|
||||
package org.apache.lucene.queryparser.classic;
|
||||
|
||||
import com.google.common.base.Objects;
|
||||
import com.google.common.collect.ImmutableMap;
|
||||
|
@ -287,7 +287,7 @@ public class MapperQueryParser extends QueryParser {
|
|||
}
|
||||
|
||||
@Override
|
||||
protected Query getRangeQuery(String field, String part1, String part2, boolean inclusive) throws ParseException {
|
||||
protected Query getRangeQuery(String field, String part1, String part2, boolean startInclusive, boolean endInclusive) throws ParseException {
|
||||
if ("*".equals(part1)) {
|
||||
part1 = null;
|
||||
}
|
||||
|
@ -297,13 +297,13 @@ public class MapperQueryParser extends QueryParser {
|
|||
Collection<String> fields = extractMultiFields(field);
|
||||
if (fields != null) {
|
||||
if (fields.size() == 1) {
|
||||
return getRangeQuerySingle(fields.iterator().next(), part1, part2, inclusive);
|
||||
return getRangeQuerySingle(fields.iterator().next(), part1, part2, startInclusive, endInclusive);
|
||||
}
|
||||
if (settings.useDisMax()) {
|
||||
DisjunctionMaxQuery disMaxQuery = new DisjunctionMaxQuery(settings.tieBreaker());
|
||||
boolean added = false;
|
||||
for (String mField : fields) {
|
||||
Query q = getRangeQuerySingle(mField, part1, part2, inclusive);
|
||||
Query q = getRangeQuerySingle(mField, part1, part2, startInclusive, endInclusive);
|
||||
if (q != null) {
|
||||
added = true;
|
||||
applyBoost(mField, q);
|
||||
|
@ -317,7 +317,7 @@ public class MapperQueryParser extends QueryParser {
|
|||
} else {
|
||||
List<BooleanClause> clauses = new ArrayList<BooleanClause>();
|
||||
for (String mField : fields) {
|
||||
Query q = getRangeQuerySingle(mField, part1, part2, inclusive);
|
||||
Query q = getRangeQuerySingle(mField, part1, part2, startInclusive, endInclusive);
|
||||
if (q != null) {
|
||||
applyBoost(mField, q);
|
||||
clauses.add(new BooleanClause(q, BooleanClause.Occur.SHOULD));
|
||||
|
@ -328,18 +328,18 @@ public class MapperQueryParser extends QueryParser {
|
|||
return getBooleanQuery(clauses, true);
|
||||
}
|
||||
} else {
|
||||
return getRangeQuerySingle(field, part1, part2, inclusive);
|
||||
return getRangeQuerySingle(field, part1, part2, startInclusive, endInclusive);
|
||||
}
|
||||
}
|
||||
|
||||
private Query getRangeQuerySingle(String field, String part1, String part2, boolean inclusive) {
|
||||
private Query getRangeQuerySingle(String field, String part1, String part2, boolean startInclusive, boolean endInclusive) {
|
||||
currentMapper = null;
|
||||
MapperService.SmartNameFieldMappers fieldMappers = parseContext.smartFieldMappers(field);
|
||||
if (fieldMappers != null) {
|
||||
currentMapper = fieldMappers.fieldMappers().mapper();
|
||||
if (currentMapper != null) {
|
||||
try {
|
||||
Query rangeQuery = currentMapper.rangeQuery(part1, part2, inclusive, inclusive, parseContext);
|
||||
Query rangeQuery = currentMapper.rangeQuery(part1, part2, startInclusive, endInclusive, parseContext);
|
||||
return wrapSmartNameQuery(rangeQuery, fieldMappers, parseContext);
|
||||
} catch (RuntimeException e) {
|
||||
if (settings.lenient()) {
|
||||
|
@ -349,7 +349,7 @@ public class MapperQueryParser extends QueryParser {
|
|||
}
|
||||
}
|
||||
}
|
||||
return newRangeQuery(field, part1, part2, inclusive);
|
||||
return newRangeQuery(field, part1, part2, startInclusive, endInclusive);
|
||||
}
|
||||
|
||||
@Override
|
||||
|
@ -395,7 +395,8 @@ public class MapperQueryParser extends QueryParser {
|
|||
currentMapper = fieldMappers.fieldMappers().mapper();
|
||||
if (currentMapper != null) {
|
||||
try {
|
||||
Query fuzzyQuery = currentMapper.fuzzyQuery(termStr, minSimilarity, fuzzyPrefixLength, settings.fuzzyMaxExpansions());
|
||||
//LUCENE 4 UPGRADE I disabled transpositions here by default - maybe this needs to be changed
|
||||
Query fuzzyQuery = currentMapper.fuzzyQuery(termStr, minSimilarity, fuzzyPrefixLength, settings.fuzzyMaxExpansions(), false);
|
||||
return wrapSmartNameQuery(fuzzyQuery, fieldMappers, parseContext);
|
||||
} catch (RuntimeException e) {
|
||||
if (settings.lenient()) {
|
||||
|
@ -410,7 +411,10 @@ public class MapperQueryParser extends QueryParser {
|
|||
|
||||
@Override
|
||||
protected Query newFuzzyQuery(Term term, float minimumSimilarity, int prefixLength) {
|
||||
FuzzyQuery query = new FuzzyQuery(term, minimumSimilarity, prefixLength, settings.fuzzyMaxExpansions());
|
||||
String text = term.text();
|
||||
int numEdits = FuzzyQuery.floatToEdits(minimumSimilarity, text.codePointCount(0, text.length()));
|
||||
//LUCENE 4 UPGRADE I disabled transpositions here by default - maybe this needs to be changed
|
||||
FuzzyQuery query = new FuzzyQuery(term, numEdits, prefixLength, settings.fuzzyMaxExpansions(), false);
|
||||
QueryParsers.setRewriteMethod(query, settings.fuzzyRewriteMethod());
|
||||
return query;
|
||||
}
|
||||
|
@ -503,7 +507,7 @@ public class MapperQueryParser extends QueryParser {
|
|||
// get Analyzer from superclass and tokenize the term
|
||||
TokenStream source;
|
||||
try {
|
||||
source = getAnalyzer().reusableTokenStream(field, new StringReader(termStr));
|
||||
source = getAnalyzer().tokenStream(field, new StringReader(termStr));
|
||||
} catch (IOException e) {
|
||||
return super.getPrefixQuery(field, termStr);
|
||||
}
|
||||
|
@ -631,7 +635,7 @@ public class MapperQueryParser extends QueryParser {
|
|||
if (c == '?' || c == '*') {
|
||||
if (isWithinToken) {
|
||||
try {
|
||||
TokenStream source = getAnalyzer().reusableTokenStream(field, new FastStringReader(tmp.toString()));
|
||||
TokenStream source = getAnalyzer().tokenStream(field, new FastStringReader(tmp.toString()));
|
||||
CharTermAttribute termAtt = source.addAttribute(CharTermAttribute.class);
|
||||
if (source.incrementToken()) {
|
||||
String term = termAtt.toString();
|
||||
|
@ -660,7 +664,7 @@ public class MapperQueryParser extends QueryParser {
|
|||
}
|
||||
if (isWithinToken) {
|
||||
try {
|
||||
TokenStream source = getAnalyzer().reusableTokenStream(field, new FastStringReader(tmp.toString()));
|
||||
TokenStream source = getAnalyzer().tokenStream(field, new FastStringReader(tmp.toString()));
|
||||
CharTermAttribute termAtt = source.addAttribute(CharTermAttribute.class);
|
||||
if (source.incrementToken()) {
|
||||
String term = termAtt.toString();
|
|
@ -17,7 +17,7 @@
|
|||
* under the License.
|
||||
*/
|
||||
|
||||
package org.apache.lucene.queryParser;
|
||||
package org.apache.lucene.queryparser.classic;
|
||||
|
||||
import org.apache.lucene.search.DeletionAwareConstantScoreQuery;
|
||||
import org.apache.lucene.search.Filter;
|
|
@ -17,7 +17,7 @@
|
|||
* under the License.
|
||||
*/
|
||||
|
||||
package org.apache.lucene.queryParser;
|
||||
package org.apache.lucene.queryparser.classic;
|
||||
|
||||
import gnu.trove.map.hash.TObjectFloatHashMap;
|
||||
import org.apache.lucene.analysis.Analyzer;
|
|
@ -28,6 +28,7 @@ import org.elasticsearch.common.lucene.search.NotDeletedFilter;
|
|||
// So it can basically be cached safely even with a reader that changes deletions but remains with the same cache key
|
||||
// See more: https://issues.apache.org/jira/browse/LUCENE-2468
|
||||
// TODO Lucene 4.0 won't need this, since live docs are "and'ed" while scoring
|
||||
// LUCENE 4 UPGRADE: we probably don't need this anymore, because of acceptDocs
|
||||
public class DeletionAwareConstantScoreQuery extends ConstantScoreQuery {
|
||||
|
||||
private final Filter actualFilter;
|
||||
|
|
|
@ -1,116 +0,0 @@
|
|||
/*
|
||||
* Licensed to ElasticSearch and Shay Banon under one
|
||||
* or more contributor license agreements. See the NOTICE file
|
||||
* distributed with this work for additional information
|
||||
* regarding copyright ownership. ElasticSearch licenses this
|
||||
* file to you under the Apache License, Version 2.0 (the
|
||||
* "License"); you may not use this file except in compliance
|
||||
* with the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing,
|
||||
* software distributed under the License is distributed on an
|
||||
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
* KIND, either express or implied. See the License for the
|
||||
* specific language governing permissions and limitations
|
||||
* under the License.
|
||||
*/
|
||||
|
||||
package org.apache.lucene.search;
|
||||
|
||||
import org.apache.lucene.index.IndexReader;
|
||||
import org.apache.lucene.index.Term;
|
||||
import org.apache.lucene.index.TermDocs;
|
||||
import org.apache.lucene.util.FixedBitSet;
|
||||
import org.elasticsearch.common.lucene.Lucene;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.util.Iterator;
|
||||
import java.util.Set;
|
||||
import java.util.TreeSet;
|
||||
|
||||
/**
|
||||
*
|
||||
*/
|
||||
// LUCENE MONITOR: Against TermsFilter
|
||||
public class PublicTermsFilter extends Filter {
|
||||
|
||||
Set<Term> terms = new TreeSet<Term>();
|
||||
|
||||
/**
|
||||
* Adds a term to the list of acceptable terms
|
||||
*
|
||||
* @param term
|
||||
*/
|
||||
public void addTerm(Term term) {
|
||||
terms.add(term);
|
||||
}
|
||||
|
||||
public Set<Term> getTerms() {
|
||||
return terms;
|
||||
}
|
||||
|
||||
@Override
|
||||
public boolean equals(Object obj) {
|
||||
if (this == obj)
|
||||
return true;
|
||||
if ((obj == null) || (obj.getClass() != this.getClass()))
|
||||
return false;
|
||||
PublicTermsFilter test = (PublicTermsFilter) obj;
|
||||
return (terms == test.terms ||
|
||||
(terms != null && terms.equals(test.terms)));
|
||||
}
|
||||
|
||||
@Override
|
||||
public int hashCode() {
|
||||
int hash = 9;
|
||||
for (Iterator<Term> iter = terms.iterator(); iter.hasNext(); ) {
|
||||
Term term = iter.next();
|
||||
hash = 31 * hash + term.hashCode();
|
||||
}
|
||||
return hash;
|
||||
}
|
||||
|
||||
@Override
|
||||
public DocIdSet getDocIdSet(IndexReader reader) throws IOException {
|
||||
FixedBitSet result = null;
|
||||
TermDocs td = reader.termDocs();
|
||||
try {
|
||||
// batch read; in Lucene 4.0 it's no longer needed
|
||||
int[] docs = new int[Lucene.BATCH_ENUM_DOCS];
|
||||
int[] freqs = new int[Lucene.BATCH_ENUM_DOCS];
|
||||
for (Term term : terms) {
|
||||
td.seek(term);
|
||||
int number = td.read(docs, freqs);
|
||||
if (number > 0) {
|
||||
if (result == null) {
|
||||
result = new FixedBitSet(reader.maxDoc());
|
||||
}
|
||||
while (number > 0) {
|
||||
for (int i = 0; i < number; i++) {
|
||||
result.set(docs[i]);
|
||||
}
|
||||
number = td.read(docs, freqs);
|
||||
}
|
||||
}
|
||||
}
|
||||
} finally {
|
||||
td.close();
|
||||
}
|
||||
return result;
|
||||
}
|
||||
|
||||
@Override
|
||||
public String toString() {
|
||||
StringBuilder builder = new StringBuilder();
|
||||
for (Term term : terms) {
|
||||
if (builder.length() > 0) {
|
||||
builder.append(' ');
|
||||
}
|
||||
builder.append(term);
|
||||
}
|
||||
return builder.toString();
|
||||
}
|
||||
|
||||
}
|
|
@ -19,13 +19,12 @@
|
|||
|
||||
package org.apache.lucene.search;
|
||||
|
||||
import org.apache.lucene.util.BytesRef;
|
||||
import org.apache.lucene.util.PriorityQueue;
|
||||
import org.elasticsearch.ElasticSearchIllegalStateException;
|
||||
import org.elasticsearch.search.controller.ShardFieldDoc;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.text.Collator;
|
||||
import java.util.Locale;
|
||||
|
||||
/**
|
||||
*
|
||||
|
@ -48,7 +47,7 @@ public class ShardFieldDocSortedHitQueue extends PriorityQueue<ShardFieldDoc> {
|
|||
* @param size The number of hits to retain. Must be greater than zero.
|
||||
*/
|
||||
public ShardFieldDocSortedHitQueue(SortField[] fields, int size) {
|
||||
initialize(size);
|
||||
super(size);
|
||||
setFields(fields);
|
||||
}
|
||||
|
||||
|
@ -83,31 +82,11 @@ public class ShardFieldDocSortedHitQueue extends PriorityQueue<ShardFieldDoc> {
|
|||
return fields;
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* Returns an array of collators, possibly <code>null</code>. The collators
|
||||
* correspond to any SortFields which were given a specific locale.
|
||||
*
|
||||
* @param fields Array of sort fields.
|
||||
* @return Array, possibly <code>null</code>.
|
||||
*/
|
||||
private Collator[] hasCollators(final SortField[] fields) {
|
||||
if (fields == null) return null;
|
||||
Collator[] ret = new Collator[fields.length];
|
||||
for (int i = 0; i < fields.length; ++i) {
|
||||
Locale locale = fields[i].getLocale();
|
||||
if (locale != null)
|
||||
ret[i] = Collator.getInstance(locale);
|
||||
}
|
||||
return ret;
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* Returns whether <code>a</code> is less relevant than <code>b</code>.
|
||||
*
|
||||
* @param a ScoreDoc
|
||||
* @param b ScoreDoc
|
||||
* @param docA ScoreDoc
|
||||
* @param docB ScoreDoc
|
||||
* @return <code>true</code> if document <code>a</code> should be sorted after document <code>b</code>.
|
||||
*/
|
||||
@SuppressWarnings("unchecked")
|
||||
|
@ -116,10 +95,10 @@ public class ShardFieldDocSortedHitQueue extends PriorityQueue<ShardFieldDoc> {
|
|||
final int n = fields.length;
|
||||
int c = 0;
|
||||
for (int i = 0; i < n && c == 0; ++i) {
|
||||
final int type = fields[i].getType();
|
||||
if (type == SortField.STRING) {
|
||||
final String s1 = (String) docA.fields[i];
|
||||
final String s2 = (String) docB.fields[i];
|
||||
final SortField.Type type = fields[i].getType();
|
||||
if (type == SortField.Type.STRING) {
|
||||
final BytesRef s1 = (BytesRef) docA.fields[i];
|
||||
final BytesRef s2 = (BytesRef) docB.fields[i];
|
||||
// null values need to be sorted first, because of how FieldCache.getStringIndex()
|
||||
// works - in that routine, any documents without a value in the given field are
|
||||
// put first. If both are null, the next SortField is used
|
||||
|
|
|
@ -19,91 +19,168 @@
|
|||
|
||||
package org.apache.lucene.search;
|
||||
|
||||
import org.apache.lucene.index.IndexReader;
|
||||
import org.apache.lucene.index.Term;
|
||||
import org.apache.lucene.index.TermDocs;
|
||||
import org.apache.lucene.index.*;
|
||||
import org.apache.lucene.queries.TermsFilter;
|
||||
import org.apache.lucene.util.Bits;
|
||||
import org.apache.lucene.util.BytesRef;
|
||||
import org.apache.lucene.util.FixedBitSet;
|
||||
import org.elasticsearch.common.lucene.Lucene;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.util.Arrays;
|
||||
import java.util.Collection;
|
||||
|
||||
/**
|
||||
* Similar to {@link TermsFilter} but stores the terms in an array for better memory usage
|
||||
* when cached, and also uses bulk read
|
||||
*/
|
||||
// LUCENE MONITOR: Against TermsFilter
|
||||
// LUCENE 4 UPGRADE: Make sure to sync this against latest 4.1
|
||||
// LUCENE 4.1: once its out, we can use TermsFilter from it
|
||||
public class XTermsFilter extends Filter {
|
||||
|
||||
private final Term[] terms;
|
||||
private final Term[] filterTerms;
|
||||
private final boolean[] resetTermsEnum;// true if the enum must be reset when building the bitset
|
||||
private final int length;
|
||||
|
||||
public XTermsFilter(Term term) {
|
||||
this.terms = new Term[]{term};
|
||||
/**
|
||||
* Creates a new {@link XTermsFilter} from the given collection. The collection
|
||||
* can contain duplicate terms and multiple fields.
|
||||
*/
|
||||
public XTermsFilter(Collection<Term> terms) {
|
||||
this(terms.toArray(new Term[terms.size()]));
|
||||
}
|
||||
|
||||
public XTermsFilter(Term[] terms) {
|
||||
/**
|
||||
* Creates a new {@link XTermsFilter} from the given array. The array can
|
||||
* contain duplicate terms and multiple fields.
|
||||
*/
|
||||
public XTermsFilter(Term... terms) {
|
||||
if (terms == null || terms.length == 0) {
|
||||
throw new IllegalArgumentException("TermsFilter requires at least one term");
|
||||
}
|
||||
Arrays.sort(terms);
|
||||
this.terms = terms;
|
||||
this.filterTerms = new Term[terms.length];
|
||||
this.resetTermsEnum = new boolean[terms.length];
|
||||
int index = 0;
|
||||
for (int i = 0; i < terms.length; i++) {
|
||||
Term currentTerm = terms[i];
|
||||
boolean fieldChanged = true;
|
||||
if (index > 0) {
|
||||
// deduplicate
|
||||
if (filterTerms[index - 1].field().equals(currentTerm.field())) {
|
||||
fieldChanged = false;
|
||||
if (filterTerms[index - 1].bytes().bytesEquals(currentTerm.bytes())) {
|
||||
continue;
|
||||
}
|
||||
}
|
||||
}
|
||||
this.filterTerms[index] = currentTerm;
|
||||
this.resetTermsEnum[index] = index == 0 || fieldChanged; // mark index 0 so we have a clear path in the iteration
|
||||
|
||||
index++;
|
||||
}
|
||||
length = index;
|
||||
}
|
||||
|
||||
public Term[] getTerms() {
|
||||
return terms;
|
||||
return filterTerms;
|
||||
}
|
||||
|
||||
@Override
|
||||
public DocIdSet getDocIdSet(AtomicReaderContext context, Bits acceptDocs) throws IOException {
|
||||
AtomicReader reader = context.reader();
|
||||
FixedBitSet result = null; // lazy init if needed - no need to create a big bitset ahead of time
|
||||
Fields fields = reader.fields();
|
||||
if (fields == null) {
|
||||
return result;
|
||||
}
|
||||
final BytesRef br = new BytesRef();
|
||||
Terms terms = null;
|
||||
TermsEnum termsEnum = null;
|
||||
DocsEnum docs = null;
|
||||
assert resetTermsEnum[0];
|
||||
for (int i = 0; i < length; i++) {
|
||||
Term term = this.filterTerms[i];
|
||||
if (resetTermsEnum[i]) {
|
||||
terms = fields.terms(term.field());
|
||||
if (terms == null) {
|
||||
i = skipToNextField(i + 1, length); // skip to the next field since this field is not indexed
|
||||
continue;
|
||||
}
|
||||
}
|
||||
if ((termsEnum = terms.iterator(termsEnum)) != null) {
|
||||
br.copyBytes(term.bytes());
|
||||
assert termsEnum != null;
|
||||
if (termsEnum.seekExact(br, true)) {
|
||||
docs = termsEnum.docs(acceptDocs, docs, 0);
|
||||
if (result == null) {
|
||||
if (docs.nextDoc() != DocIdSetIterator.NO_MORE_DOCS) {
|
||||
result = new FixedBitSet(reader.maxDoc());
|
||||
// lazy init but don't do it in the hot loop since we could read many docs
|
||||
result.set(docs.docID());
|
||||
}
|
||||
}
|
||||
while (docs.nextDoc() != DocIdSetIterator.NO_MORE_DOCS) {
|
||||
result.set(docs.docID());
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
return result;
|
||||
}
|
||||
|
||||
private final int skipToNextField(int index, int length) {
|
||||
for (int i = index; i < length; i++) {
|
||||
if (resetTermsEnum[i]) {
|
||||
return i - 1;
|
||||
}
|
||||
}
|
||||
return length;
|
||||
}
|
||||
|
||||
|
||||
@Override
|
||||
public boolean equals(Object obj) {
|
||||
if (this == obj)
|
||||
if (this == obj) {
|
||||
return true;
|
||||
if ((obj == null) || (obj.getClass() != this.getClass()))
|
||||
}
|
||||
if ((obj == null) || (obj.getClass() != this.getClass())) {
|
||||
return false;
|
||||
}
|
||||
XTermsFilter test = (XTermsFilter) obj;
|
||||
return Arrays.equals(terms, test.terms);
|
||||
if (filterTerms != test.filterTerms) {
|
||||
if (length == test.length) {
|
||||
for (int i = 0; i < length; i++) {
|
||||
// can not be null!
|
||||
if (!filterTerms[i].equals(test.filterTerms[i])) {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
} else {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
return true;
|
||||
|
||||
}
|
||||
|
||||
@Override
|
||||
public int hashCode() {
|
||||
return Arrays.hashCode(terms);
|
||||
}
|
||||
|
||||
@Override
|
||||
public DocIdSet getDocIdSet(IndexReader reader) throws IOException {
|
||||
FixedBitSet result = null;
|
||||
TermDocs td = reader.termDocs();
|
||||
try {
|
||||
// batch read; in Lucene 4.0 it's no longer needed
|
||||
int[] docs = new int[Lucene.BATCH_ENUM_DOCS];
|
||||
int[] freqs = new int[Lucene.BATCH_ENUM_DOCS];
|
||||
for (Term term : terms) {
|
||||
td.seek(term);
|
||||
int number = td.read(docs, freqs);
|
||||
if (number > 0) {
|
||||
if (result == null) {
|
||||
result = new FixedBitSet(reader.maxDoc());
|
||||
}
|
||||
while (number > 0) {
|
||||
for (int i = 0; i < number; i++) {
|
||||
result.set(docs[i]);
|
||||
}
|
||||
number = td.read(docs, freqs);
|
||||
}
|
||||
}
|
||||
}
|
||||
} finally {
|
||||
td.close();
|
||||
int hash = 9;
|
||||
for (int i = 0; i < length; i++) {
|
||||
hash = 31 * hash + filterTerms[i].hashCode();
|
||||
}
|
||||
return result;
|
||||
return hash;
|
||||
}
|
||||
|
||||
@Override
|
||||
public String toString() {
|
||||
StringBuilder builder = new StringBuilder();
|
||||
for (Term term : terms) {
|
||||
for (int i = 0; i < length; i++) {
|
||||
if (builder.length() > 0) {
|
||||
builder.append(' ');
|
||||
}
|
||||
builder.append(term);
|
||||
builder.append(filterTerms[i]);
|
||||
}
|
||||
return builder.toString();
|
||||
}
|
||||
|
||||
}
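A small plain-Java sketch (no Lucene types; the names are made up) of the constructor's dedup-and-mark pass above: sort (field, text) pairs, drop exact duplicates, and flag the positions where the field changes, which is what resetTermsEnum records so the iteration knows when to switch to a new field's terms.

import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;

class SortedTermDedupSketch {
    // Each entry is {field, text}; prints which kept entries start a new field.
    static List<String[]> dedup(String[][] terms) {
        String[][] sorted = terms.clone();
        Arrays.sort(sorted, (a, b) -> {
            int c = a[0].compareTo(b[0]);
            return c != 0 ? c : a[1].compareTo(b[1]);
        });
        List<String[]> kept = new ArrayList<>();
        for (String[] t : sorted) {
            String[] prev = kept.isEmpty() ? null : kept.get(kept.size() - 1);
            if (prev != null && prev[0].equals(t[0]) && prev[1].equals(t[1])) {
                continue; // exact duplicate, drop it
            }
            boolean fieldChanged = prev == null || !prev[0].equals(t[0]);
            System.out.println((fieldChanged ? "reset enum at " : "reuse enum at ") + t[0] + ":" + t[1]);
            kept.add(t);
        }
        return kept;
    }
}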
|
||||
|
|
|
@ -144,7 +144,7 @@ public abstract class AbstractFragmentsBuilder extends BaseFragmentsBuilder {
|
|||
}
|
||||
}
|
||||
if (!toffsList.isEmpty()) {
|
||||
subInfos.add(new FieldFragList.WeightedFragInfo.SubInfo(subInfo.text, toffsList, subInfo.getSeqnum()));
|
||||
subInfos.add(new FieldFragList.WeightedFragInfo.SubInfo(subInfo.getText(), toffsList, subInfo.getSeqnum()));
|
||||
}
|
||||
|
||||
if (subInfo.getTermsOffsets().isEmpty()) {
|
||||
|
@ -175,9 +175,7 @@ public abstract class AbstractFragmentsBuilder extends BaseFragmentsBuilder {
|
|||
private final static List<FieldPhraseList.WeightedPhraseInfo> EMPTY = Collections.emptyList();
|
||||
|
||||
private WeightedFragInfo(int startOffset, int endOffset, float totalBoost, List<FieldFragList.WeightedFragInfo.SubInfo> subInfos) {
|
||||
super(startOffset, endOffset, EMPTY);
|
||||
this.subInfos = subInfos;
|
||||
this.totalBoost = totalBoost;
|
||||
super(startOffset, endOffset, subInfos, totalBoost);
|
||||
}
|
||||
}
|
||||
|
||||
|
|
|
@ -65,12 +65,12 @@ public class XScoreOrderFragmentsBuilder extends AbstractFragmentsBuilder {
|
|||
public static class ScoreComparator implements Comparator<WeightedFragInfo> {
|
||||
|
||||
public int compare(WeightedFragInfo o1, WeightedFragInfo o2) {
|
||||
if (o1.totalBoost > o2.totalBoost) return -1;
|
||||
else if (o1.totalBoost < o2.totalBoost) return 1;
|
||||
if (o1.getTotalBoost() > o2.getTotalBoost()) return -1;
|
||||
else if (o1.getTotalBoost() < o2.getTotalBoost()) return 1;
|
||||
// if same score then check startOffset
|
||||
else {
|
||||
if (o1.startOffset < o2.startOffset) return -1;
|
||||
else if (o1.startOffset > o2.startOffset) return 1;
|
||||
if (o1.getStartOffset() < o2.getStartOffset()) return -1;
|
||||
else if (o1.getStartOffset() > o2.getStartOffset()) return 1;
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
|
|
|
@ -13,7 +13,7 @@ class XFSIndexOutput extends FSDirectory.FSIndexOutput {
|
|||
private final StoreRateLimiting.Listener rateListener;
|
||||
|
||||
XFSIndexOutput(FSDirectory parent, String name, RateLimiter rateLimiter, StoreRateLimiting.Listener rateListener) throws IOException {
|
||||
super(parent, name);
|
||||
super(parent, name, null /* we have our own rate limiter */);
|
||||
this.rateLimiter = rateLimiter;
|
||||
this.rateListener = rateListener;
|
||||
}
|
||||
|
|
|
@ -40,12 +40,12 @@ public class XMMapFSDirectory extends MMapDirectory {
|
|||
}
|
||||
|
||||
@Override
|
||||
public IndexOutput createOutput(String name) throws IOException {
|
||||
public IndexOutput createOutput(String name, IOContext context) throws IOException {
|
||||
StoreRateLimiting rateLimiting = rateLimitingProvider.rateLimiting();
|
||||
StoreRateLimiting.Type type = rateLimiting.getType();
|
||||
RateLimiter limiter = rateLimiting.getRateLimiter();
|
||||
if (type == StoreRateLimiting.Type.NONE || limiter == null) {
|
||||
return super.createOutput(name);
|
||||
return super.createOutput(name, context);
|
||||
}
|
||||
if (TrackingMergeScheduler.getCurrentMerge() != null) {
|
||||
// we are merging, and type is either MERGE or ALL, rate limit...
|
||||
|
@ -59,6 +59,6 @@ public class XMMapFSDirectory extends MMapDirectory {
|
|||
return new XFSIndexOutput(this, name, limiter, rateListener);
|
||||
}
|
||||
// we shouldn't really get here...
|
||||
return super.createOutput(name);
|
||||
return super.createOutput(name, context);
|
||||
}
|
||||
}
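For reference, a simple pause-based byte rate limiter in the spirit of what the directory wrappers above delegate to during merges; this is an illustrative stand-in, not Lucene's RateLimiter class.

class SimpleByteRateLimiter {
    private final double bytesPerSecond;
    private long nextFreeNanos = System.nanoTime();

    SimpleByteRateLimiter(double mbPerSec) {
        this.bytesPerSecond = mbPerSec * 1024 * 1024;
    }

    // Called after writing `bytes`: sleeps just long enough to keep the average write rate under the cap.
    synchronized void pause(long bytes) throws InterruptedException {
        long now = System.nanoTime();
        if (nextFreeNanos < now) {
            nextFreeNanos = now;
        }
        nextFreeNanos += (long) (bytes / bytesPerSecond * 1_000_000_000L);
        long sleepNanos = nextFreeNanos - now;
        if (sleepNanos > 0) {
            Thread.sleep(sleepNanos / 1_000_000L, (int) (sleepNanos % 1_000_000L));
        }
    }
}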
|
||||
|
|
|
@ -40,12 +40,12 @@ public class XNIOFSDirectory extends NIOFSDirectory {
|
|||
}
|
||||
|
||||
@Override
|
||||
public IndexOutput createOutput(String name) throws IOException {
|
||||
public IndexOutput createOutput(String name, IOContext context) throws IOException {
|
||||
StoreRateLimiting rateLimiting = rateLimitingProvider.rateLimiting();
|
||||
StoreRateLimiting.Type type = rateLimiting.getType();
|
||||
RateLimiter limiter = rateLimiting.getRateLimiter();
|
||||
if (type == StoreRateLimiting.Type.NONE || limiter == null) {
|
||||
return super.createOutput(name);
|
||||
return super.createOutput(name, context);
|
||||
}
|
||||
if (TrackingMergeScheduler.getCurrentMerge() != null) {
|
||||
// we are merging, and type is either MERGE or ALL, rate limit...
|
||||
|
@ -59,6 +59,6 @@ public class XNIOFSDirectory extends NIOFSDirectory {
|
|||
return new XFSIndexOutput(this, name, limiter, rateListener);
|
||||
}
|
||||
// we shouldn't really get here...
|
||||
return super.createOutput(name);
|
||||
return super.createOutput(name, context);
|
||||
}
|
||||
}
|
||||
|
|
|
@ -40,12 +40,12 @@ public class XSimpleFSDirectory extends SimpleFSDirectory {
|
|||
}
|
||||
|
||||
@Override
|
||||
public IndexOutput createOutput(String name) throws IOException {
|
||||
public IndexOutput createOutput(String name, IOContext context) throws IOException {
|
||||
StoreRateLimiting rateLimiting = rateLimitingProvider.rateLimiting();
|
||||
StoreRateLimiting.Type type = rateLimiting.getType();
|
||||
RateLimiter limiter = rateLimiting.getRateLimiter();
|
||||
if (type == StoreRateLimiting.Type.NONE || limiter == null) {
|
||||
return super.createOutput(name);
|
||||
return super.createOutput(name, context);
|
||||
}
|
||||
if (TrackingMergeScheduler.getCurrentMerge() != null) {
|
||||
// we are merging, and type is either MERGE or ALL, rate limit...
|
||||
|
@ -59,6 +59,6 @@ public class XSimpleFSDirectory extends SimpleFSDirectory {
|
|||
return new XFSIndexOutput(this, name, limiter, rateListener);
|
||||
}
|
||||
// we shouldn't really get here...
|
||||
return super.createOutput(name);
|
||||
return super.createOutput(name, context);
|
||||
}
|
||||
}
|
||||
|
|
|
@ -17,10 +17,7 @@ package org.apache.lucene.store.bytebuffer;
|
|||
* limitations under the License.
|
||||
*/
|
||||
|
||||
import org.apache.lucene.store.Directory;
|
||||
import org.apache.lucene.store.IndexInput;
|
||||
import org.apache.lucene.store.IndexOutput;
|
||||
import org.apache.lucene.store.SingleInstanceLockFactory;
|
||||
import org.apache.lucene.store.*;
|
||||
|
||||
import java.io.FileNotFoundException;
|
||||
import java.io.IOException;
|
||||
|
@ -98,36 +95,6 @@ public class ByteBufferDirectory extends Directory {
|
|||
return files.containsKey(name);
|
||||
}
|
||||
|
||||
@Override
|
||||
public long fileModified(String name) throws IOException {
|
||||
ByteBufferFile file = files.get(name);
|
||||
if (file == null)
|
||||
throw new FileNotFoundException(name);
|
||||
return file.getLastModified();
|
||||
}
|
||||
|
||||
@Override
|
||||
public void touchFile(String name) throws IOException {
|
||||
ByteBufferFile file = files.get(name);
|
||||
if (file == null)
|
||||
throw new FileNotFoundException(name);
|
||||
|
||||
long ts2, ts1 = System.currentTimeMillis();
|
||||
do {
|
||||
try {
|
||||
Thread.sleep(0, 1);
|
||||
} catch (java.lang.InterruptedException ie) {
|
||||
// In 3.0 we will change this to throw
|
||||
// InterruptedException instead
|
||||
Thread.currentThread().interrupt();
|
||||
throw new RuntimeException(ie);
|
||||
}
|
||||
ts2 = System.currentTimeMillis();
|
||||
} while (ts1 == ts2);
|
||||
|
||||
file.setLastModified(ts2);
|
||||
}
|
||||
|
||||
@Override
|
||||
public void deleteFile(String name) throws IOException {
|
||||
ByteBufferFile file = files.remove(name);
|
||||
|
@ -146,7 +113,7 @@ public class ByteBufferDirectory extends Directory {
|
|||
}
|
||||
|
||||
@Override
|
||||
public IndexOutput createOutput(String name) throws IOException {
|
||||
public IndexOutput createOutput(String name, IOContext context) throws IOException {
|
||||
ByteBufferAllocator.Type allocatorType = ByteBufferAllocator.Type.LARGE;
|
||||
if (name.contains("segments") || name.endsWith(".del")) {
|
||||
allocatorType = ByteBufferAllocator.Type.SMALL;
|
||||
|
@ -166,7 +133,7 @@ public class ByteBufferDirectory extends Directory {
|
|||
}
|
||||
|
||||
@Override
|
||||
public IndexInput openInput(String name) throws IOException {
|
||||
public IndexInput openInput(String name, IOContext context) throws IOException {
|
||||
ByteBufferFile file = files.get(name);
|
||||
if (file == null)
|
||||
throw new FileNotFoundException(name);
|
||||
|
|
|
@ -186,7 +186,7 @@ public class ByteBufferIndexInput extends IndexInput {
|
|||
}
|
||||
|
||||
@Override
|
||||
public Object clone() {
|
||||
public IndexInput clone() {
|
||||
ByteBufferIndexInput cloned = (ByteBufferIndexInput) super.clone();
|
||||
cloned.file.incRef(); // inc ref on cloned one
|
||||
if (currentBuffer != EMPTY_BUFFER) {
|
||||
|
|
|
@ -198,7 +198,7 @@ public class TransportAnalyzeAction extends TransportSingleCustomOperationAction
|
|||
List<AnalyzeResponse.AnalyzeToken> tokens = Lists.newArrayList();
|
||||
TokenStream stream = null;
|
||||
try {
|
||||
stream = analyzer.reusableTokenStream(field, new FastStringReader(request.text()));
|
||||
stream = analyzer.tokenStream(field, new FastStringReader(request.text()));
|
||||
stream.reset();
|
||||
CharTermAttribute term = stream.addAttribute(CharTermAttribute.class);
|
||||
PositionIncrementAttribute posIncr = stream.addAttribute(PositionIncrementAttribute.class);
|
||||
|
|
|
@ -34,7 +34,6 @@ public class ClearIndicesCacheRequest extends BroadcastOperationRequest<ClearInd
|
|||
private boolean filterCache = false;
|
||||
private boolean fieldDataCache = false;
|
||||
private boolean idCache = false;
|
||||
private boolean bloomCache = false;
|
||||
private String[] fields = null;
|
||||
|
||||
ClearIndicesCacheRequest() {
|
||||
|
@ -82,26 +81,16 @@ public class ClearIndicesCacheRequest extends BroadcastOperationRequest<ClearInd
|
|||
return this;
|
||||
}
|
||||
|
||||
public boolean bloomCache() {
|
||||
return this.bloomCache;
|
||||
}
|
||||
|
||||
public ClearIndicesCacheRequest bloomCache(boolean bloomCache) {
|
||||
this.bloomCache = bloomCache;
|
||||
return this;
|
||||
}
|
||||
|
||||
public void readFrom(StreamInput in) throws IOException {
|
||||
super.readFrom(in);
|
||||
filterCache = in.readBoolean();
|
||||
fieldDataCache = in.readBoolean();
|
||||
idCache = in.readBoolean();
|
||||
bloomCache = in.readBoolean();
|
||||
int size = in.readVInt();
|
||||
if (size > 0) {
|
||||
fields = new String[size];
|
||||
for (int i = 0; i < size; i++) {
|
||||
fields[i] = in.readUTF();
|
||||
fields[i] = in.readString();
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@ -111,13 +100,12 @@ public class ClearIndicesCacheRequest extends BroadcastOperationRequest<ClearInd
|
|||
out.writeBoolean(filterCache);
|
||||
out.writeBoolean(fieldDataCache);
|
||||
out.writeBoolean(idCache);
|
||||
out.writeBoolean(bloomCache);
|
||||
if (fields == null) {
|
||||
out.writeVInt(0);
|
||||
} else {
|
||||
out.writeVInt(fields.length);
|
||||
for (String field : fields) {
|
||||
out.writeUTF(field);
|
||||
out.writeString(field);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
|
@ -53,11 +53,6 @@ public class ClearIndicesCacheRequestBuilder extends BroadcastOperationRequestBu
|
|||
return this;
|
||||
}
|
||||
|
||||
public ClearIndicesCacheRequestBuilder setBloomCache(boolean bloomCache) {
|
||||
request.bloomCache(bloomCache);
|
||||
return this;
|
||||
}
|
||||
|
||||
@Override
|
||||
protected void doExecute(ActionListener<ClearIndicesCacheResponse> listener) {
|
||||
((IndicesAdminClient) client).clearCache(request, listener);
|
||||
|
|
|
@ -33,7 +33,6 @@ class ShardClearIndicesCacheRequest extends BroadcastShardOperationRequest {
|
|||
private boolean filterCache = false;
|
||||
private boolean fieldDataCache = false;
|
||||
private boolean idCache = false;
|
||||
private boolean bloomCache = false;
|
||||
private String[] fields = null;
|
||||
|
||||
ShardClearIndicesCacheRequest() {
|
||||
|
@ -44,7 +43,6 @@ class ShardClearIndicesCacheRequest extends BroadcastShardOperationRequest {
|
|||
filterCache = request.filterCache();
|
||||
fieldDataCache = request.fieldDataCache();
|
||||
idCache = request.idCache();
|
||||
bloomCache = request.bloomCache();
|
||||
fields = request.fields();
|
||||
}
|
||||
|
||||
|
@ -60,10 +58,6 @@ class ShardClearIndicesCacheRequest extends BroadcastShardOperationRequest {
|
|||
return this.idCache;
|
||||
}
|
||||
|
||||
public boolean bloomCache() {
|
||||
return this.bloomCache;
|
||||
}
|
||||
|
||||
public String[] fields() {
|
||||
return this.fields;
|
||||
}
|
||||
|
@ -79,7 +73,6 @@ class ShardClearIndicesCacheRequest extends BroadcastShardOperationRequest {
|
|||
filterCache = in.readBoolean();
|
||||
fieldDataCache = in.readBoolean();
|
||||
idCache = in.readBoolean();
|
||||
bloomCache = in.readBoolean();
|
||||
int size = in.readVInt();
|
||||
if (size > 0) {
|
||||
fields = new String[size];
|
||||
|
@ -95,7 +88,6 @@ class ShardClearIndicesCacheRequest extends BroadcastShardOperationRequest {
|
|||
out.writeBoolean(filterCache);
|
||||
out.writeBoolean(fieldDataCache);
|
||||
out.writeBoolean(idCache);
|
||||
out.writeBoolean(bloomCache);
|
||||
if (fields == null) {
|
||||
out.writeVInt(0);
|
||||
} else {
|
||||
|
|
|
@ -138,10 +138,6 @@ public class TransportClearIndicesCacheAction extends TransportBroadcastOperatio
|
|||
clearedAtLeastOne = true;
|
||||
service.cache().idCache().clear();
|
||||
}
|
||||
if (request.bloomCache()) {
|
||||
clearedAtLeastOne = true;
|
||||
service.cache().bloomCache().clear();
|
||||
}
|
||||
if (!clearedAtLeastOne) {
|
||||
if (request.fields() != null && request.fields().length > 0) {
|
||||
// only clear caches relating to the specified fields
|
||||
|
|
|
@ -86,7 +86,7 @@ public class TransportExplainAction extends TransportShardSingleOperationAction<
|
|||
protected ExplainResponse shardOperation(ExplainRequest request, int shardId) throws ElasticSearchException {
|
||||
IndexService indexService = indicesService.indexService(request.index());
|
||||
IndexShard indexShard = indexService.shardSafe(shardId);
|
||||
Term uidTerm = UidFieldMapper.TERM_FACTORY.createTerm(Uid.createUid(request.type(), request.id()));
|
||||
Term uidTerm = new Term(UidFieldMapper.NAME, Uid.createUid(request.type(), request.id()));
|
||||
Engine.GetResult result = indexShard.get(new Engine.Get(false, uidTerm));
|
||||
if (!result.exists()) {
|
||||
return new ExplainResponse(false);
|
||||
|
@ -104,7 +104,7 @@ public class TransportExplainAction extends TransportShardSingleOperationAction<
|
|||
try {
|
||||
context.parsedQuery(parseQuery(request, indexService));
|
||||
context.preProcess();
|
||||
int topLevelDocId = result.docIdAndVersion().docId + result.docIdAndVersion().docStart;
|
||||
int topLevelDocId = result.docIdAndVersion().docId + result.docIdAndVersion().reader.docBase;
|
||||
|
||||
Explanation explanation = context.searcher().explain(context.query(), topLevelDocId);
|
||||
if (request.fields() != null) {
|
||||
|
|
|
@ -19,7 +19,7 @@
|
|||
|
||||
package org.elasticsearch.action.mlt;
|
||||
|
||||
import org.apache.lucene.document.Fieldable;
|
||||
import org.apache.lucene.document.Field;
|
||||
import org.apache.lucene.index.Term;
|
||||
import org.elasticsearch.ElasticSearchException;
|
||||
import org.elasticsearch.action.ActionListener;
|
||||
|
@ -263,7 +263,7 @@ public class TransportMoreLikeThisAction extends TransportAction<MoreLikeThisReq
|
|||
}
|
||||
docMapper.parse(SourceToParse.source(getResponse.sourceRef()).type(request.type()).id(request.id()), new DocumentMapper.ParseListenerAdapter() {
|
||||
@Override
|
||||
public boolean beforeFieldAdded(FieldMapper fieldMapper, Fieldable field, Object parseContext) {
|
||||
public boolean beforeFieldAdded(FieldMapper fieldMapper, Field field, Object parseContext) {
|
||||
if (fieldMapper instanceof InternalMapper) {
|
||||
return true;
|
||||
}
|
||||
|
@ -281,7 +281,7 @@ public class TransportMoreLikeThisAction extends TransportAction<MoreLikeThisReq
|
|||
});
|
||||
}
|
||||
|
||||
private void addMoreLikeThis(MoreLikeThisRequest request, BoolQueryBuilder boolBuilder, FieldMapper fieldMapper, Fieldable field) {
|
||||
private void addMoreLikeThis(MoreLikeThisRequest request, BoolQueryBuilder boolBuilder, FieldMapper fieldMapper, Field field) {
|
||||
addMoreLikeThis(request, boolBuilder, field.name(), fieldMapper.valueAsString(field));
|
||||
}
|
||||
|
||||
|
|
|
@ -19,6 +19,7 @@
|
|||
|
||||
package org.elasticsearch.common;
|
||||
|
||||
import org.apache.lucene.util.BytesRef;
|
||||
import org.apache.lucene.util.UnicodeUtil;
|
||||
import org.elasticsearch.common.util.concurrent.ThreadLocals;
|
||||
|
||||
|
@ -29,10 +30,10 @@ import java.util.Arrays;
|
|||
*/
|
||||
public class Unicode {
|
||||
|
||||
private static ThreadLocal<ThreadLocals.CleanableValue<UnicodeUtil.UTF8Result>> cachedUtf8Result = new ThreadLocal<ThreadLocals.CleanableValue<UnicodeUtil.UTF8Result>>() {
|
||||
private static ThreadLocal<ThreadLocals.CleanableValue<BytesRef>> cachedUtf8Result = new ThreadLocal<ThreadLocals.CleanableValue<BytesRef>>() {
|
||||
@Override
|
||||
protected ThreadLocals.CleanableValue<UnicodeUtil.UTF8Result> initialValue() {
|
||||
return new ThreadLocals.CleanableValue<UnicodeUtil.UTF8Result>(new UnicodeUtil.UTF8Result());
|
||||
protected ThreadLocals.CleanableValue<BytesRef> initialValue() {
|
||||
return new ThreadLocals.CleanableValue<BytesRef>(new BytesRef());
|
||||
}
|
||||
};
|
||||
|
||||
|
@ -47,20 +48,20 @@ public class Unicode {
|
|||
if (source == null) {
|
||||
return null;
|
||||
}
|
||||
UnicodeUtil.UTF8Result result = unsafeFromStringAsUtf8(source);
|
||||
return Arrays.copyOfRange(result.result, 0, result.length);
|
||||
BytesRef result = unsafeFromStringAsUtf8(source);
|
||||
return Arrays.copyOfRange(result.bytes, result.offset, result.length);
|
||||
}
|
||||
|
||||
public static UnicodeUtil.UTF8Result fromStringAsUtf8(String source) {
|
||||
public static BytesRef fromStringAsUtf8(String source) {
|
||||
if (source == null) {
|
||||
return null;
|
||||
}
|
||||
UnicodeUtil.UTF8Result result = new UnicodeUtil.UTF8Result();
|
||||
BytesRef result = new BytesRef();
|
||||
UnicodeUtil.UTF16toUTF8(source, 0, source.length(), result);
|
||||
return result;
|
||||
}
|
||||
|
||||
public static void fromStringAsUtf8(String source, UnicodeUtil.UTF8Result result) {
|
||||
public static void fromStringAsUtf8(String source, BytesRef result) {
|
||||
if (source == null) {
|
||||
result.length = 0;
|
||||
return;
|
||||
|
@ -68,11 +69,11 @@ public class Unicode {
|
|||
UnicodeUtil.UTF16toUTF8(source, 0, source.length(), result);
|
||||
}
|
||||
|
||||
public static UnicodeUtil.UTF8Result unsafeFromStringAsUtf8(String source) {
|
||||
public static BytesRef unsafeFromStringAsUtf8(String source) {
|
||||
if (source == null) {
|
||||
return null;
|
||||
}
|
||||
UnicodeUtil.UTF8Result result = cachedUtf8Result.get().get();
|
||||
BytesRef result = cachedUtf8Result.get().get();
|
||||
UnicodeUtil.UTF16toUTF8(source, 0, source.length(), result);
|
||||
return result;
|
||||
}
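// JDK-only reference (an aside, not part of the class above): the conversion these helpers perform is
// essentially `byte[] utf8 = source.getBytes(java.nio.charset.StandardCharsets.UTF_8)`. A BytesRef is
// such a byte[] plus an (offset, length) window over the backing array, which is why the
// array-returning variant above copies that window out with Arrays.copyOfRange.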
|
||||
|
|
|
@ -1,172 +0,0 @@
|
|||
/*
|
||||
* Licensed to Elastic Search and Shay Banon under one
|
||||
* or more contributor license agreements. See the NOTICE file
|
||||
* distributed with this work for additional information
|
||||
* regarding copyright ownership. Elastic Search licenses this
|
||||
* file to you under the Apache License, Version 2.0 (the
|
||||
* "License"); you may not use this file except in compliance
|
||||
* with the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing,
|
||||
* software distributed under the License is distributed on an
|
||||
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
* KIND, either express or implied. See the License for the
|
||||
* specific language governing permissions and limitations
|
||||
* under the License.
|
||||
*/
|
||||
|
||||
package org.elasticsearch.common.bloom;
|
||||
|
||||
/**
|
||||
* The following calculations are taken from:
|
||||
* http://www.cs.wisc.edu/~cao/papers/summary-cache/node8.html
|
||||
* "Bloom Filters - the math"
|
||||
* <p/>
|
||||
* This class's static methods are meant to facilitate the use of the Bloom
|
||||
* Filter class by helping to choose correct values of 'bits per element' and
|
||||
* 'number of hash functions, k'.
|
||||
*/
|
||||
class BloomCalculations {
|
||||
|
||||
private static final int minBuckets = 2;
|
||||
private static final int minK = 1;
|
||||
|
||||
private static final int EXCESS = 20;
|
||||
|
||||
/**
|
||||
* In the following table, the row 'i' shows false positive rates if i buckets
|
||||
* per element are used. Column 'j' shows false positive rates if j hash
|
||||
* functions are used. The first row is 'i=0', the first column is 'j=0'.
|
||||
* Each cell (i,j) the false positive rate determined by using i buckets per
|
||||
* element and j hash functions.
|
||||
*/
|
||||
static final double[][] probs = new double[][]{
|
||||
{1.0}, // dummy row representing 0 buckets per element
|
||||
{1.0, 1.0}, // dummy row representing 1 buckets per element
|
||||
{1.0, 0.393, 0.400},
|
||||
{1.0, 0.283, 0.237, 0.253},
|
||||
{1.0, 0.221, 0.155, 0.147, 0.160},
|
||||
{1.0, 0.181, 0.109, 0.092, 0.092, 0.101}, // 5
|
||||
{1.0, 0.154, 0.0804, 0.0609, 0.0561, 0.0578, 0.0638},
|
||||
{1.0, 0.133, 0.0618, 0.0423, 0.0359, 0.0347, 0.0364},
|
||||
{1.0, 0.118, 0.0489, 0.0306, 0.024, 0.0217, 0.0216, 0.0229},
|
||||
{1.0, 0.105, 0.0397, 0.0228, 0.0166, 0.0141, 0.0133, 0.0135, 0.0145},
|
||||
{1.0, 0.0952, 0.0329, 0.0174, 0.0118, 0.00943, 0.00844, 0.00819, 0.00846}, // 10
|
||||
{1.0, 0.0869, 0.0276, 0.0136, 0.00864, 0.0065, 0.00552, 0.00513, 0.00509},
|
||||
{1.0, 0.08, 0.0236, 0.0108, 0.00646, 0.00459, 0.00371, 0.00329, 0.00314},
|
||||
{1.0, 0.074, 0.0203, 0.00875, 0.00492, 0.00332, 0.00255, 0.00217, 0.00199, 0.00194},
|
||||
{1.0, 0.0689, 0.0177, 0.00718, 0.00381, 0.00244, 0.00179, 0.00146, 0.00129, 0.00121, 0.0012},
|
||||
{1.0, 0.0645, 0.0156, 0.00596, 0.003, 0.00183, 0.00128, 0.001, 0.000852, 0.000775, 0.000744}, // 15
|
||||
{1.0, 0.0606, 0.0138, 0.005, 0.00239, 0.00139, 0.000935, 0.000702, 0.000574, 0.000505, 0.00047, 0.000459},
|
||||
{1.0, 0.0571, 0.0123, 0.00423, 0.00193, 0.00107, 0.000692, 0.000499, 0.000394, 0.000335, 0.000302, 0.000287, 0.000284},
|
||||
{1.0, 0.054, 0.0111, 0.00362, 0.00158, 0.000839, 0.000519, 0.00036, 0.000275, 0.000226, 0.000198, 0.000183, 0.000176},
|
||||
{1.0, 0.0513, 0.00998, 0.00312, 0.0013, 0.000663, 0.000394, 0.000264, 0.000194, 0.000155, 0.000132, 0.000118, 0.000111, 0.000109},
|
||||
{1.0, 0.0488, 0.00906, 0.0027, 0.00108, 0.00053, 0.000303, 0.000196, 0.00014, 0.000108, 8.89e-05, 7.77e-05, 7.12e-05, 6.79e-05, 6.71e-05} // 20
|
||||
}; // the first column is a dummy column representing K=0.
|
||||
|
||||
/**
|
||||
* The optimal number of hashes for a given number of bits per element.
|
||||
* These values are automatically calculated from the data above.
|
||||
*/
|
||||
private static final int[] optKPerBuckets = new int[probs.length];
|
||||
|
||||
static {
|
||||
for (int i = 0; i < probs.length; i++) {
|
||||
double min = Double.MAX_VALUE;
|
||||
double[] prob = probs[i];
|
||||
for (int j = 0; j < prob.length; j++) {
|
||||
if (prob[j] < min) {
|
||||
min = prob[j];
|
||||
optKPerBuckets[i] = Math.max(minK, j);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Given the number of buckets that can be used per element, return a
|
||||
* specification that minimizes the false positive rate.
|
||||
*
|
||||
* @param bucketsPerElement The number of buckets per element for the filter.
|
||||
* @return A spec that minimizes the false positive rate.
|
||||
*/
|
||||
public static BloomSpecification computeBloomSpec(int bucketsPerElement) {
|
||||
assert bucketsPerElement >= 1;
|
||||
assert bucketsPerElement <= probs.length - 1;
|
||||
return new BloomSpecification(optKPerBuckets[bucketsPerElement], bucketsPerElement);
|
||||
}
|
||||
|
||||
/**
|
||||
* A wrapper class that holds two key parameters for a Bloom Filter: the
|
||||
* number of hash functions used, and the number of buckets per element used.
|
||||
*/
|
||||
public static class BloomSpecification {
|
||||
final int K; // number of hash functions.
|
||||
final int bucketsPerElement;
|
||||
|
||||
public BloomSpecification(int k, int bucketsPerElement) {
|
||||
K = k;
|
||||
this.bucketsPerElement = bucketsPerElement;
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Given a maximum tolerable false positive probability, compute a Bloom
|
||||
* specification which will give less than the specified false positive rate,
|
||||
* but minimize the number of buckets per element and the number of hash
|
||||
* functions used. Because bandwidth (and therefore total bitvector size)
|
||||
* is considered more expensive than computing power, preference is given
|
||||
* to minimizing buckets per element rather than number of hash functions.
|
||||
*
|
||||
* @param maxBucketsPerElement The maximum number of buckets available for the filter.
|
||||
* @param maxFalsePosProb The maximum tolerable false positive rate.
|
||||
* @return A Bloom Specification which would result in a false positive rate
|
||||
* less than specified by the function call
|
||||
* @throws UnsupportedOperationException if a filter satisfying the parameters cannot be met
|
||||
*/
|
||||
public static BloomSpecification computeBloomSpec(int maxBucketsPerElement, double maxFalsePosProb) {
|
||||
assert maxBucketsPerElement >= 1;
|
||||
assert maxBucketsPerElement <= probs.length - 1;
|
||||
int maxK = probs[maxBucketsPerElement].length - 1;
|
||||
|
||||
// Handle the trivial cases
|
||||
if (maxFalsePosProb >= probs[minBuckets][minK]) {
|
||||
return new BloomSpecification(2, optKPerBuckets[2]);
|
||||
}
|
||||
if (maxFalsePosProb < probs[maxBucketsPerElement][maxK]) {
|
||||
throw new UnsupportedOperationException(String.format("Unable to satisfy %s with %s buckets per element",
|
||||
maxFalsePosProb, maxBucketsPerElement));
|
||||
}
|
||||
|
||||
// First find the minimal required number of buckets:
|
||||
int bucketsPerElement = 2;
|
||||
int K = optKPerBuckets[2];
|
||||
while (probs[bucketsPerElement][K] > maxFalsePosProb) {
|
||||
bucketsPerElement++;
|
||||
K = optKPerBuckets[bucketsPerElement];
|
||||
}
|
||||
// Now that the number of buckets is sufficient, see if we can relax K
|
||||
// without losing too much precision.
|
||||
while (probs[bucketsPerElement][K - 1] <= maxFalsePosProb) {
|
||||
K--;
|
||||
}
|
||||
|
||||
return new BloomSpecification(K, bucketsPerElement);
|
||||
}
|
||||
|
||||
/**
|
||||
* Calculates the maximum number of buckets per element that this implementation
|
||||
* can support. Crucially, it will lower the bucket count if necessary to meet
|
||||
* BitSet's size restrictions.
|
||||
*/
|
||||
public static int maxBucketsPerElement(long numElements) {
|
||||
numElements = Math.max(1, numElements);
|
||||
double v = (Long.MAX_VALUE - EXCESS) / (double) numElements;
|
||||
if (v < 1.0) {
|
||||
throw new UnsupportedOperationException("Cannot compute probabilities for " + numElements + " elements.");
|
||||
}
|
||||
return Math.min(BloomCalculations.probs.length - 1, (int) v);
|
||||
}
|
||||
}
|
|
@ -1,64 +0,0 @@
|
|||
/*
|
||||
* Licensed to Elastic Search and Shay Banon under one
|
||||
* or more contributor license agreements. See the NOTICE file
|
||||
* distributed with this work for additional information
|
||||
* regarding copyright ownership. Elastic Search licenses this
|
||||
* file to you under the Apache License, Version 2.0 (the
|
||||
* "License"); you may not use this file except in compliance
|
||||
* with the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing,
|
||||
* software distributed under the License is distributed on an
|
||||
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
* KIND, either express or implied. See the License for the
|
||||
* specific language governing permissions and limitations
|
||||
* under the License.
|
||||
*/
|
||||
|
||||
package org.elasticsearch.common.bloom;
|
||||
|
||||
/**
|
||||
*
|
||||
*/
|
||||
public interface BloomFilter {
|
||||
|
||||
public static final BloomFilter NONE = new BloomFilter() {
|
||||
@Override
|
||||
public void add(byte[] key, int offset, int length) {
|
||||
}
|
||||
|
||||
@Override
|
||||
public boolean isPresent(byte[] key, int offset, int length) {
|
||||
return true;
|
||||
}
|
||||
|
||||
@Override
|
||||
public long sizeInBytes() {
|
||||
return 0;
|
||||
}
|
||||
};
|
||||
|
||||
public static final BloomFilter EMPTY = new BloomFilter() {
|
||||
@Override
|
||||
public void add(byte[] key, int offset, int length) {
|
||||
}
|
||||
|
||||
@Override
|
||||
public boolean isPresent(byte[] key, int offset, int length) {
|
||||
return false;
|
||||
}
|
||||
|
||||
@Override
|
||||
public long sizeInBytes() {
|
||||
return 0;
|
||||
}
|
||||
};
|
||||
|
||||
void add(byte[] key, int offset, int length);
|
||||
|
||||
boolean isPresent(byte[] key, int offset, int length);
|
||||
|
||||
long sizeInBytes();
|
||||
}
|
|
@ -1,98 +0,0 @@
|
|||
/*
|
||||
* Licensed to Elastic Search and Shay Banon under one
|
||||
* or more contributor license agreements. See the NOTICE file
|
||||
* distributed with this work for additional information
|
||||
* regarding copyright ownership. Elastic Search licenses this
|
||||
* file to you under the Apache License, Version 2.0 (the
|
||||
* "License"); you may not use this file except in compliance
|
||||
* with the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing,
|
||||
* software distributed under the License is distributed on an
|
||||
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
* KIND, either express or implied. See the License for the
|
||||
* specific language governing permissions and limitations
|
||||
* under the License.
|
||||
*/
|
||||
|
||||
package org.elasticsearch.common.bloom;
|
||||
|
||||
import org.elasticsearch.common.UUID;
|
||||
import org.elasticsearch.common.logging.ESLogger;
|
||||
import org.elasticsearch.common.logging.ESLoggerFactory;
|
||||
import org.elasticsearch.common.unit.ByteSizeValue;
|
||||
import org.elasticsearch.common.unit.SizeValue;
|
||||
import org.elasticsearch.common.unit.TimeValue;
|
||||
|
||||
import java.io.UnsupportedEncodingException;
|
||||
|
||||
/**
|
||||
*
|
||||
*/
|
||||
public class BloomFilterFactory {
|
||||
|
||||
private static ESLogger logger = ESLoggerFactory.getLogger(BloomFilterFactory.class.getName());
|
||||
|
||||
private static final int EXCESS = 20;
|
||||
|
||||
/**
|
||||
* @return A BloomFilter with the lowest practical false positive probability
|
||||
* for the given number of elements.
|
||||
*/
|
||||
public static BloomFilter getFilter(long numElements, int targetBucketsPerElem) {
|
||||
int maxBucketsPerElement = Math.max(1, BloomCalculations.maxBucketsPerElement(numElements));
|
||||
int bucketsPerElement = Math.min(targetBucketsPerElem, maxBucketsPerElement);
|
||||
if (bucketsPerElement < targetBucketsPerElem) {
|
||||
logger.warn(String.format("Cannot provide an optimal BloomFilter for %d elements (%d/%d buckets per element).",
|
||||
numElements, bucketsPerElement, targetBucketsPerElem));
|
||||
}
|
||||
BloomCalculations.BloomSpecification spec = BloomCalculations.computeBloomSpec(bucketsPerElement);
|
||||
return new ObsBloomFilter(spec.K, bucketsFor(numElements, spec.bucketsPerElement));
|
||||
}
|
||||
|
||||
/**
|
||||
* @return The smallest BloomFilter that can provide the given false positive
|
||||
* probability rate for the given number of elements.
|
||||
* <p/>
|
||||
* Asserts that the given probability can be satisfied using this filter.
|
||||
*/
|
||||
public static BloomFilter getFilter(long numElements, double maxFalsePosProbability) {
|
||||
assert maxFalsePosProbability <= 1.0 : "Invalid probability";
|
||||
int bucketsPerElement = BloomCalculations.maxBucketsPerElement(numElements);
|
||||
BloomCalculations.BloomSpecification spec = BloomCalculations.computeBloomSpec(bucketsPerElement, maxFalsePosProbability);
|
||||
return new ObsBloomFilter(spec.K, bucketsFor(numElements, spec.bucketsPerElement));
|
||||
}
|
||||
|
||||
private static long bucketsFor(long numElements, int bucketsPer) {
|
||||
return numElements * bucketsPer + EXCESS;
|
||||
}
|
||||
|
||||
public static void main(String[] args) throws UnsupportedEncodingException {
|
||||
long elements = SizeValue.parseSizeValue("100m").singles();
|
||||
BloomFilter filter = BloomFilterFactory.getFilter(elements, 15);
|
||||
System.out.println("Filter size: " + new ByteSizeValue(filter.sizeInBytes()));
|
||||
for (long i = 0; i < elements; i++) {
|
||||
byte[] utf8s = UUID.randomBase64UUID().getBytes("UTF8");
|
||||
filter.add(utf8s, 0, utf8s.length);
|
||||
}
|
||||
long falsePositives = 0;
|
||||
for (long i = 0; i < elements; i++) {
|
||||
byte[] utf8s = UUID.randomBase64UUID().getBytes("UTF8");
|
||||
if (filter.isPresent(utf8s, 0, utf8s.length)) {
|
||||
falsePositives++;
|
||||
}
|
||||
}
|
||||
System.out.println("false positives: " + falsePositives);
|
||||
|
||||
byte[] utf8s = UUID.randomBase64UUID().getBytes("UTF8");
|
||||
long time = System.currentTimeMillis();
|
||||
for (long i = 0; i < elements; i++) {
|
||||
if (filter.isPresent(utf8s, 0, utf8s.length)) {
|
||||
}
|
||||
}
|
||||
long timeSize = System.currentTimeMillis() - time;
|
||||
System.out.println("Indexed in " + new TimeValue(timeSize) + ", TPS: " + (elements / timeSize) + " per millisecond");
|
||||
}
|
||||
}
|
|
@ -1,97 +0,0 @@
|
|||
/*
|
||||
* Licensed to Elastic Search and Shay Banon under one
|
||||
* or more contributor license agreements. See the NOTICE file
|
||||
* distributed with this work for additional information
|
||||
* regarding copyright ownership. Elastic Search licenses this
|
||||
* file to you under the Apache License, Version 2.0 (the
|
||||
* "License"); you may not use this file except in compliance
|
||||
* with the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing,
|
||||
* software distributed under the License is distributed on an
|
||||
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
* KIND, either express or implied. See the License for the
|
||||
* specific language governing permissions and limitations
|
||||
* under the License.
|
||||
*/
|
||||
|
||||
package org.elasticsearch.common.bloom;
|
||||
|
||||
import org.apache.lucene.util.OpenBitSet;
|
||||
import org.elasticsearch.common.MurmurHash;
|
||||
import org.elasticsearch.common.RamUsage;
|
||||
|
||||
public class ObsBloomFilter implements BloomFilter {
|
||||
|
||||
private final int hashCount;
|
||||
|
||||
private final OpenBitSet bitset;
|
||||
private final long size;
|
||||
|
||||
ObsBloomFilter(int hashCount, long size) {
|
||||
this.hashCount = hashCount;
|
||||
this.bitset = new OpenBitSet(size);
|
||||
this.size = size;
|
||||
}
|
||||
|
||||
long emptyBuckets() {
|
||||
long n = 0;
|
||||
for (long i = 0; i < buckets(); i++) {
|
||||
if (!bitset.get(i)) {
|
||||
n++;
|
||||
}
|
||||
}
|
||||
return n;
|
||||
}
|
||||
|
||||
private long buckets() {
|
||||
return size;
|
||||
}
|
||||
|
||||
private long[] getHashBuckets(byte[] key, int offset, int length) {
|
||||
return getHashBuckets(key, offset, length, hashCount, buckets());
|
||||
}
|
||||
|
||||
static long[] getHashBuckets(byte[] b, int offset, int length, int hashCount, long max) {
|
||||
long[] result = new long[hashCount];
|
||||
long[] hash = MurmurHash.hash3_x64_128(b, offset, length, 0L);
|
||||
for (int i = 0; i < hashCount; ++i) {
|
||||
result[i] = Math.abs((hash[0] + (long) i * hash[1]) % max);
|
||||
}
|
||||
return result;
|
||||
}
|
||||
|
||||
@Override
|
||||
public void add(byte[] key, int offset, int length) {
|
||||
// inline the hash buckets so we don't have to create the int[] each time...
|
||||
long[] hash = MurmurHash.hash3_x64_128(key, offset, length, 0L);
|
||||
for (int i = 0; i < hashCount; ++i) {
|
||||
long bucketIndex = Math.abs((hash[0] + (long) i * hash[1]) % size);
|
||||
bitset.fastSet(bucketIndex);
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
public boolean isPresent(byte[] key, int offset, int length) {
|
||||
// inline the hash buckets so we don't have to create the int[] each time...
|
||||
long[] hash = MurmurHash.hash3_x64_128(key, offset, length, 0L);
|
||||
for (int i = 0; i < hashCount; ++i) {
|
||||
long bucketIndex = Math.abs((hash[0] + (long) i * hash[1]) % size);
|
||||
if (!bitset.fastGet(bucketIndex)) {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
public void clear() {
|
||||
bitset.clear(0, bitset.size());
|
||||
}
|
||||
|
||||
@Override
|
||||
public long sizeInBytes() {
|
||||
return bitset.getBits().length * RamUsage.NUM_BYTES_LONG + RamUsage.NUM_BYTES_ARRAY_HEADER + RamUsage.NUM_BYTES_INT /* wlen */;
|
||||
}
|
||||
}
|
|
@ -125,6 +125,16 @@ public class ByteBufferBytesReference implements BytesReference {
|
|||
return buffer.arrayOffset() + buffer.position();
|
||||
}
|
||||
|
||||
@Override
|
||||
public int hashCode() {
|
||||
return Helper.bytesHashCode(this);
|
||||
}
|
||||
|
||||
@Override
|
||||
public boolean equals(Object obj) {
|
||||
return Helper.bytesEqual(this, (BytesReference) obj);
|
||||
}
|
||||
|
||||
@Override
|
||||
public String toUtf8() {
|
||||
if (!buffer.hasRemaining()) {
|
||||
|
|
|
@@ -20,6 +20,7 @@

package org.elasticsearch.common.bytes;

import com.google.common.base.Charsets;
import org.apache.lucene.util.BytesRef;
import org.elasticsearch.ElasticSearchIllegalArgumentException;
import org.elasticsearch.common.Bytes;
import org.elasticsearch.common.io.stream.BytesStreamInput;

@@ -43,6 +44,23 @@ public class BytesArray implements BytesReference {
        this(bytes.getBytes(Charsets.UTF_8));
    }

    public BytesArray(BytesRef bytesRef) {
        this(bytesRef, false);
    }

    public BytesArray(BytesRef bytesRef, boolean deepCopy) {
        if (deepCopy) {
            BytesRef copy = BytesRef.deepCopyOf(bytesRef);
            bytes = copy.bytes;
            offset = copy.offset;
            length = copy.length;
        } else {
            bytes = bytesRef.bytes;
            offset = bytesRef.offset;
            length = bytesRef.length;
        }
    }

    public BytesArray(byte[] bytes) {
        this.bytes = bytes;
        this.offset = 0;
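A short usage sketch of the two wrapping modes added above (variable names are illustrative only): the single-argument constructor shares the BytesRef's backing array, while deepCopy = true goes through BytesRef.deepCopyOf and detaches from it.

import org.apache.lucene.util.BytesRef;
import org.elasticsearch.common.bytes.BytesArray;

public class BytesArraySketch {
    public static void main(String[] args) {
        BytesRef ref = new BytesRef("field-value");

        BytesArray shared = new BytesArray(ref);        // views ref.bytes directly
        BytesArray copied = new BytesArray(ref, true);  // private copy via BytesRef.deepCopyOf

        ref.bytes[ref.offset] = 'X';                    // mutates what 'shared' sees...
        System.out.println(shared.toUtf8());
        System.out.println(copied.toUtf8());            // ...but not 'copied'
    }
}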
@@ -130,33 +148,12 @@ public class BytesArray implements BytesReference {
    }

    @Override
    public boolean equals(Object obj) {
        return bytesEquals((BytesArray) obj);
    }

    public boolean bytesEquals(BytesArray other) {
        if (length == other.length) {
            int otherUpto = other.offset;
            final byte[] otherBytes = other.bytes;
            final int end = offset + length;
            for (int upto = offset; upto < end; upto++, otherUpto++) {
                if (bytes[upto] != otherBytes[otherUpto]) {
                    return false;
                }
            }
            return true;
        } else {
            return false;
        }
    public int hashCode() {
        return Helper.bytesHashCode(this);
    }

    @Override
    public int hashCode() {
        int result = 0;
        final int end = offset + length;
        for (int i = offset; i < end; i++) {
            result = 31 * result + bytes[i];
        }
        return result;
    public boolean equals(Object obj) {
        return Helper.bytesEqual(this, (BytesReference) obj);
    }
}
@@ -24,12 +24,53 @@ import org.jboss.netty.buffer.ChannelBuffer;

import java.io.IOException;
import java.io.OutputStream;
import java.util.Comparator;

/**
 * A reference to bytes.
 */
public interface BytesReference {

    public static class Helper {

        public static boolean bytesEqual(BytesReference a, BytesReference b) {
            if (a == b) {
                return true;
            }
            if (a.length() != b.length()) {
                return false;
            }
            if (!a.hasArray()) {
                a = a.toBytesArray();
            }
            if (!b.hasArray()) {
                b = b.toBytesArray();
            }
            int bUpTo = b.arrayOffset();
            final byte[] aArray = a.array();
            final byte[] bArray = b.array();
            final int end = a.arrayOffset() + a.length();
            for (int aUpTo = a.arrayOffset(); aUpTo < end; aUpTo++, bUpTo++) {
                if (aArray[aUpTo] != bArray[bUpTo]) {
                    return false;
                }
            }
            return true;
        }

        public static int bytesHashCode(BytesReference a) {
            if (!a.hasArray()) {
                a = a.toBytesArray();
            }
            int result = 0;
            final int end = a.arrayOffset() + a.length();
            for (int i = a.arrayOffset(); i < end; i++) {
                result = 31 * result + a.array()[i];
            }
            return result;
        }
    }

    /**
     * Returns the byte at the specified index. Need to be between 0 and length.
     */

@@ -94,4 +135,6 @@ public interface BytesReference {
     * Converts to a string based on utf8.
     */
    String toUtf8();

}
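Centralizing equals/hashCode in Helper means any two BytesReference implementations compare by content, regardless of how they are backed. A hedged sketch of the intended behaviour (it assumes the String and BytesRef constructors of BytesArray shown in this commit):

import org.apache.lucene.util.BytesRef;
import org.elasticsearch.common.bytes.BytesArray;
import org.elasticsearch.common.bytes.BytesReference;

public class BytesEqualitySketch {
    public static void main(String[] args) {
        BytesReference a = new BytesArray("abc");                 // from a String
        BytesReference b = new BytesArray(new BytesRef("abc"));   // wrapping a Lucene BytesRef

        // Both implementations route through BytesReference.Helper,
        // so content equality implies equal hash codes across types.
        System.out.println(a.equals(b));
        System.out.println(a.hashCode() == b.hashCode());
    }
}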
|
|
@ -106,4 +106,14 @@ public class ChannelBufferBytesReference implements BytesReference {
|
|||
public String toUtf8() {
|
||||
return buffer.toString(Charsets.UTF_8);
|
||||
}
|
||||
|
||||
@Override
|
||||
public int hashCode() {
|
||||
return Helper.bytesHashCode(this);
|
||||
}
|
||||
|
||||
@Override
|
||||
public boolean equals(Object obj) {
|
||||
return Helper.bytesEqual(this, (BytesReference) obj);
|
||||
}
|
||||
}
|
||||
|
|
|
@ -124,14 +124,12 @@ public class HashedBytesArray implements BytesReference {
|
|||
}
|
||||
|
||||
@Override
|
||||
public boolean equals(Object o) {
|
||||
if (this == o) return true;
|
||||
HashedBytesArray bytesWrap = (HashedBytesArray) o;
|
||||
return Arrays.equals(bytes, bytesWrap.bytes);
|
||||
public int hashCode() {
|
||||
return Helper.bytesHashCode(this);
|
||||
}
|
||||
|
||||
@Override
|
||||
public int hashCode() {
|
||||
return hashCode;
|
||||
public boolean equals(Object obj) {
|
||||
return Helper.bytesEqual(this, (BytesReference) obj);
|
||||
}
|
||||
}
|
||||
|
|
|
@ -1,7 +1,9 @@
|
|||
package org.elasticsearch.common.compress;
|
||||
|
||||
import com.google.common.collect.ImmutableList;
|
||||
import com.google.common.collect.ImmutableSet;
|
||||
import org.apache.lucene.store.*;
|
||||
import org.apache.lucene.util.IOUtils;
|
||||
import org.elasticsearch.index.store.support.ForceSyncDirectory;
|
||||
|
||||
import java.io.IOException;
|
||||
|
@ -60,16 +62,6 @@ public class CompressedDirectory extends Directory implements ForceSyncDirectory
|
|||
return dir.fileExists(name);
|
||||
}
|
||||
|
||||
@Override
|
||||
public long fileModified(String name) throws IOException {
|
||||
return dir.fileModified(name);
|
||||
}
|
||||
|
||||
@Override
|
||||
public void touchFile(String name) throws IOException {
|
||||
dir.touchFile(name);
|
||||
}
|
||||
|
||||
@Override
|
||||
public void deleteFile(String name) throws IOException {
|
||||
dir.deleteFile(name);
|
||||
|
@ -82,11 +74,12 @@ public class CompressedDirectory extends Directory implements ForceSyncDirectory
|
|||
@Override
|
||||
public long fileLength(String name) throws IOException {
|
||||
if (actualLength && decompressExtensions.contains(getExtension(name))) {
|
||||
IndexInput in = openInput(name);
|
||||
// LUCENE 4 UPGRADE: Is this the right IOContext?
|
||||
IndexInput in = openInput(name, IOContext.READONCE);
|
||||
try {
|
||||
return in.length();
|
||||
} catch (Exception e) {
|
||||
in.close();
|
||||
} finally {
|
||||
IOUtils.close(in);
|
||||
}
|
||||
}
|
||||
return dir.fileLength(name);
|
||||
|
@ -97,24 +90,19 @@ public class CompressedDirectory extends Directory implements ForceSyncDirectory
|
|||
dir.sync(names);
|
||||
}
|
||||
|
||||
@Override
|
||||
public void sync(String name) throws IOException {
|
||||
dir.sync(name);
|
||||
}
|
||||
|
||||
@Override
|
||||
public void forceSync(String name) throws IOException {
|
||||
if (dir instanceof ForceSyncDirectory) {
|
||||
((ForceSyncDirectory) dir).forceSync(name);
|
||||
} else {
|
||||
dir.sync(name);
|
||||
dir.sync(ImmutableList.of(name));
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
public IndexInput openInput(String name) throws IOException {
|
||||
public IndexInput openInput(String name, IOContext context) throws IOException {
|
||||
if (decompressExtensions.contains(getExtension(name))) {
|
||||
IndexInput in = dir.openInput(name);
|
||||
IndexInput in = dir.openInput(name, context);
|
||||
Compressor compressor1 = CompressorFactory.compressor(in);
|
||||
if (compressor1 != null) {
|
||||
return compressor1.indexInput(in);
|
||||
|
@ -122,29 +110,15 @@ public class CompressedDirectory extends Directory implements ForceSyncDirectory
|
|||
return in;
|
||||
}
|
||||
}
|
||||
return dir.openInput(name);
|
||||
return dir.openInput(name, context);
|
||||
}
|
||||
|
||||
@Override
|
||||
public IndexInput openInput(String name, int bufferSize) throws IOException {
|
||||
if (decompressExtensions.contains(getExtension(name))) {
|
||||
IndexInput in = dir.openInput(name, bufferSize);
|
||||
Compressor compressor1 = CompressorFactory.compressor(in);
|
||||
if (compressor1 != null) {
|
||||
return compressor1.indexInput(in);
|
||||
} else {
|
||||
return in;
|
||||
}
|
||||
}
|
||||
return dir.openInput(name, bufferSize);
|
||||
}
|
||||
|
||||
@Override
|
||||
public IndexOutput createOutput(String name) throws IOException {
|
||||
public IndexOutput createOutput(String name, IOContext context) throws IOException {
|
||||
if (compress && compressExtensions.contains(getExtension(name))) {
|
||||
return compressor.indexOutput(dir.createOutput(name));
|
||||
return compressor.indexOutput(dir.createOutput(name, context));
|
||||
}
|
||||
return dir.createOutput(name);
|
||||
return dir.createOutput(name, context);
|
||||
}
|
||||
|
||||
// can't override this one, we need to open the correct compression
|
||||
|
|
|
@ -203,7 +203,7 @@ public abstract class CompressedIndexInput<T extends CompressorContext> extends
|
|||
protected abstract int uncompress(IndexInput in, byte[] out) throws IOException;
|
||||
|
||||
@Override
|
||||
public Object clone() {
|
||||
public IndexInput clone() {
|
||||
// we clone and we need to make sure we keep the same positions!
|
||||
CompressedIndexInput cloned = (CompressedIndexInput) super.clone();
|
||||
cloned.uncompressed = new byte[uncompressedLength];
|
||||
|
|
|
@@ -19,7 +19,7 @@

package org.elasticsearch.common.compress;

import org.apache.lucene.util.UnicodeUtil;
import org.apache.lucene.util.BytesRef;
import org.elasticsearch.common.Unicode;
import org.elasticsearch.common.bytes.BytesArray;
import org.elasticsearch.common.bytes.BytesReference;

@@ -79,8 +79,8 @@ public class CompressedString implements Streamable {
    }

    public CompressedString(String str) throws IOException {
        UnicodeUtil.UTF8Result result = Unicode.unsafeFromStringAsUtf8(str);
        this.bytes = CompressorFactory.defaultCompressor().compress(result.result, 0, result.length);
        BytesRef result = Unicode.unsafeFromStringAsUtf8(str);
        this.bytes = CompressorFactory.defaultCompressor().compress(result.bytes, result.offset, result.length);
    }

    public byte[] compressed() {
|
@ -65,7 +65,7 @@ public class LZFCompressedIndexInput extends CompressedIndexInput<LZFCompressorC
|
|||
}
|
||||
|
||||
@Override
|
||||
public Object clone() {
|
||||
public IndexInput clone() {
|
||||
LZFCompressedIndexInput cloned = (LZFCompressedIndexInput) super.clone();
|
||||
cloned.inputBuffer = new byte[LZFChunk.MAX_CHUNK_LEN];
|
||||
return cloned;
|
||||
|
|
|
@ -59,7 +59,7 @@ public abstract class SnappyCompressedIndexInput extends CompressedIndexInput<Sn
|
|||
}
|
||||
|
||||
@Override
|
||||
public Object clone() {
|
||||
public IndexInput clone() {
|
||||
SnappyCompressedIndexInput cloned = (SnappyCompressedIndexInput) super.clone();
|
||||
cloned.inputBuffer = new byte[inputBuffer.length];
|
||||
return cloned;
|
||||
|
|
|
@ -19,6 +19,7 @@
|
|||
|
||||
package org.elasticsearch.common.io.stream;
|
||||
|
||||
import org.apache.lucene.util.BytesRef;
|
||||
import org.elasticsearch.common.bytes.BytesArray;
|
||||
import org.elasticsearch.common.bytes.BytesReference;
|
||||
|
||||
|
@ -69,6 +70,16 @@ public class BytesStreamInput extends StreamInput {
|
|||
return bytes;
|
||||
}
|
||||
|
||||
@Override
|
||||
public BytesRef readBytesRef(int length) throws IOException {
|
||||
if (unsafe) {
|
||||
return super.readBytesRef(length);
|
||||
}
|
||||
BytesRef bytes = new BytesRef(buf, pos, length);
|
||||
pos += length;
|
||||
return bytes;
|
||||
}
|
||||
|
||||
@Override
|
||||
public long skip(long n) throws IOException {
|
||||
if (pos + n > count) {
|
||||
|
|
|
@@ -19,6 +19,7 @@

package org.elasticsearch.common.io.stream;

import org.apache.lucene.util.BytesRef;
import org.elasticsearch.Version;
import org.elasticsearch.common.Nullable;
import org.elasticsearch.common.Strings;

@@ -84,6 +85,20 @@ public abstract class StreamInput extends InputStream {
        return new BytesArray(bytes, 0, length);
    }

    public BytesRef readBytesRef() throws IOException {
        int length = readVInt();
        return readBytesRef(length);
    }

    public BytesRef readBytesRef(int length) throws IOException {
        if (length == 0) {
            return new BytesRef();
        }
        byte[] bytes = new byte[length];
        readBytes(bytes, 0, length);
        return new BytesRef(bytes, 0, length);
    }

    public void readFully(byte[] b) throws IOException {
        readBytes(b, 0, b.length);
    }

@@ -347,6 +362,8 @@ public abstract class StreamInput extends InputStream {
                return readBytesReference();
            case 15:
                return readText();
            case 16:
                return readShort();
            default:
                throw new IOException("Can't read unknown type [" + type + "]");
        }
|
@@ -19,6 +19,7 @@

package org.elasticsearch.common.io.stream;

import org.apache.lucene.util.BytesRef;
import org.elasticsearch.Version;
import org.elasticsearch.common.Nullable;
import org.elasticsearch.common.bytes.BytesReference;

@@ -106,6 +107,15 @@ public abstract class StreamOutput extends OutputStream {
        bytes.writeTo(this);
    }

    public void writeBytesRef(BytesRef bytes) throws IOException {
        if (bytes == null) {
            writeVInt(0);
            return;
        }
        writeVInt(bytes.length);
        write(bytes.bytes, bytes.offset, bytes.length);
    }

    public final void writeShort(short v) throws IOException {
        writeByte((byte) (v >> 8));
        writeByte((byte) v);

@@ -358,6 +368,9 @@ public abstract class StreamOutput extends OutputStream {
        } else if (value instanceof Text) {
            writeByte((byte) 15);
            writeText((Text) value);
        } else if (type == Short.class) {
            writeByte((byte) 16);
            writeShort((Short) value);
        } else {
            throw new IOException("Can't write type [" + type + "]");
        }
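The two new methods form a matching pair on the wire: a vInt length followed by the raw bytes. A minimal sketch of the round trip, written against the abstract StreamOutput/StreamInput so no concrete implementation is assumed (the helper class is illustrative, not part of the patch):

import org.apache.lucene.util.BytesRef;
import org.elasticsearch.common.io.stream.StreamInput;
import org.elasticsearch.common.io.stream.StreamOutput;

import java.io.IOException;

public class BytesRefStreamSketch {

    // vInt length + raw bytes; a null ref is written as length 0
    public static void write(StreamOutput out, BytesRef value) throws IOException {
        out.writeBytesRef(value);
    }

    // reads the length first, then the payload; length 0 yields an empty BytesRef
    public static BytesRef read(StreamInput in) throws IOException {
        return in.readBytesRef();
    }
}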
@ -1,151 +0,0 @@
|
|||
/*
|
||||
* Licensed to ElasticSearch and Shay Banon under one
|
||||
* or more contributor license agreements. See the NOTICE file
|
||||
* distributed with this work for additional information
|
||||
* regarding copyright ownership. ElasticSearch licenses this
|
||||
* file to you under the Apache License, Version 2.0 (the
|
||||
* "License"); you may not use this file except in compliance
|
||||
* with the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing,
|
||||
* software distributed under the License is distributed on an
|
||||
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
* KIND, either express or implied. See the License for the
|
||||
* specific language governing permissions and limitations
|
||||
* under the License.
|
||||
*/
|
||||
|
||||
package org.elasticsearch.common.io.stream;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.util.ArrayList;
|
||||
import java.util.HashMap;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
|
||||
/**
|
||||
*
|
||||
*/
|
||||
public class Streamables {
|
||||
|
||||
public static Map<String, Object> readMap(StreamInput in) throws IOException {
|
||||
int size = in.readVInt();
|
||||
Map<String, Object> map = new HashMap<String, Object>(size);
|
||||
for (int i = 0; i < size; i++) {
|
||||
map.put(in.readUTF(), readMapValue(in));
|
||||
}
|
||||
return map;
|
||||
}
|
||||
|
||||
public static Object readMapValue(StreamInput in) throws IOException {
|
||||
byte type = in.readByte();
|
||||
if (type == -1) {
|
||||
return null;
|
||||
} else if (type == 0) {
|
||||
return in.readUTF();
|
||||
} else if (type == 1) {
|
||||
return in.readInt();
|
||||
} else if (type == 2) {
|
||||
return in.readLong();
|
||||
} else if (type == 3) {
|
||||
return in.readFloat();
|
||||
} else if (type == 4) {
|
||||
return in.readDouble();
|
||||
} else if (type == 5) {
|
||||
return in.readBoolean();
|
||||
} else if (type == 6) {
|
||||
int bytesSize = in.readVInt();
|
||||
byte[] value = new byte[bytesSize];
|
||||
in.readFully(value);
|
||||
return value;
|
||||
} else if (type == 7) {
|
||||
int size = in.readVInt();
|
||||
List list = new ArrayList(size);
|
||||
for (int i = 0; i < size; i++) {
|
||||
list.add(readMapValue(in));
|
||||
}
|
||||
return list;
|
||||
} else if (type == 8) {
|
||||
int size = in.readVInt();
|
||||
Object[] list = new Object[size];
|
||||
for (int i = 0; i < size; i++) {
|
||||
list[i] = readMapValue(in);
|
||||
}
|
||||
return list;
|
||||
} else if (type == 9) {
|
||||
int size = in.readVInt();
|
||||
Map map = new HashMap(size);
|
||||
for (int i = 0; i < size; i++) {
|
||||
map.put(in.readUTF(), readMapValue(in));
|
||||
}
|
||||
return map;
|
||||
} else {
|
||||
throw new IOException("Can't read unknown type [" + type + "]");
|
||||
}
|
||||
}
|
||||
|
||||
public static void writeMap(StreamOutput out, Map<String, Object> map) throws IOException {
|
||||
out.writeVInt(map.size());
|
||||
for (Map.Entry<String, Object> entry : map.entrySet()) {
|
||||
out.writeUTF(entry.getKey());
|
||||
writeMapValue(out, entry.getValue());
|
||||
}
|
||||
}
|
||||
|
||||
private static void writeMapValue(StreamOutput out, Object value) throws IOException {
|
||||
if (value == null) {
|
||||
out.writeByte((byte) -1);
|
||||
return;
|
||||
}
|
||||
Class type = value.getClass();
|
||||
if (type == String.class) {
|
||||
out.writeByte((byte) 0);
|
||||
out.writeUTF((String) value);
|
||||
} else if (type == Integer.class) {
|
||||
out.writeByte((byte) 1);
|
||||
out.writeInt((Integer) value);
|
||||
} else if (type == Long.class) {
|
||||
out.writeByte((byte) 2);
|
||||
out.writeLong((Long) value);
|
||||
} else if (type == Float.class) {
|
||||
out.writeByte((byte) 3);
|
||||
out.writeFloat((Float) value);
|
||||
} else if (type == Double.class) {
|
||||
out.writeByte((byte) 4);
|
||||
out.writeDouble((Double) value);
|
||||
} else if (type == Boolean.class) {
|
||||
out.writeByte((byte) 5);
|
||||
out.writeBoolean((Boolean) value);
|
||||
} else if (type == byte[].class) {
|
||||
out.writeByte((byte) 6);
|
||||
out.writeVInt(((byte[]) value).length);
|
||||
out.writeBytes(((byte[]) value));
|
||||
} else if (value instanceof List) {
|
||||
out.writeByte((byte) 7);
|
||||
List list = (List) value;
|
||||
out.writeVInt(list.size());
|
||||
for (Object o : list) {
|
||||
writeMapValue(out, o);
|
||||
}
|
||||
} else if (value instanceof Object[]) {
|
||||
out.writeByte((byte) 8);
|
||||
Object[] list = (Object[]) value;
|
||||
out.writeVInt(list.length);
|
||||
for (Object o : list) {
|
||||
writeMapValue(out, o);
|
||||
}
|
||||
} else if (value instanceof Map) {
|
||||
out.writeByte((byte) 9);
|
||||
Map<String, Object> map = (Map<String, Object>) value;
|
||||
out.writeVInt(map.size());
|
||||
for (Map.Entry<String, Object> entry : map.entrySet()) {
|
||||
out.writeUTF(entry.getKey());
|
||||
writeMapValue(out, entry.getValue());
|
||||
}
|
||||
} else {
|
||||
throw new IOException("Can't write type [" + type + "]");
|
||||
}
|
||||
}
|
||||
}
|
|
@ -1,90 +0,0 @@
|
|||
/*
|
||||
* Licensed to ElasticSearch and Shay Banon under one
|
||||
* or more contributor license agreements. See the NOTICE file
|
||||
* distributed with this work for additional information
|
||||
* regarding copyright ownership. ElasticSearch licenses this
|
||||
* file to you under the Apache License, Version 2.0 (the
|
||||
* "License"); you may not use this file except in compliance
|
||||
* with the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing,
|
||||
* software distributed under the License is distributed on an
|
||||
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
* KIND, either express or implied. See the License for the
|
||||
* specific language governing permissions and limitations
|
||||
* under the License.
|
||||
*/
|
||||
|
||||
package org.elasticsearch.common.lucene;
|
||||
|
||||
import org.apache.lucene.document.Document;
|
||||
import org.apache.lucene.document.Field;
|
||||
import org.apache.lucene.document.Fieldable;
|
||||
import org.elasticsearch.common.lucene.uid.UidField;
|
||||
|
||||
/**
|
||||
*
|
||||
*/
|
||||
public class DocumentBuilder {
|
||||
|
||||
public static final Document EMPTY = new Document();
|
||||
|
||||
public static DocumentBuilder doc() {
|
||||
return new DocumentBuilder();
|
||||
}
|
||||
|
||||
public static Fieldable uidField(String value) {
|
||||
return uidField(value, 0);
|
||||
}
|
||||
|
||||
public static Fieldable uidField(String value, long version) {
|
||||
return new UidField("_uid", value, version);
|
||||
}
|
||||
|
||||
public static FieldBuilder field(String name, String value) {
|
||||
return field(name, value, Field.Store.YES, Field.Index.ANALYZED);
|
||||
}
|
||||
|
||||
public static FieldBuilder field(String name, String value, Field.Store store, Field.Index index) {
|
||||
return new FieldBuilder(name, value, store, index);
|
||||
}
|
||||
|
||||
public static FieldBuilder field(String name, String value, Field.Store store, Field.Index index, Field.TermVector termVector) {
|
||||
return new FieldBuilder(name, value, store, index, termVector);
|
||||
}
|
||||
|
||||
public static FieldBuilder field(String name, byte[] value, Field.Store store) {
|
||||
return new FieldBuilder(name, value, store);
|
||||
}
|
||||
|
||||
public static FieldBuilder field(String name, byte[] value, int offset, int length, Field.Store store) {
|
||||
return new FieldBuilder(name, value, offset, length, store);
|
||||
}
|
||||
|
||||
private final Document document;
|
||||
|
||||
private DocumentBuilder() {
|
||||
this.document = new Document();
|
||||
}
|
||||
|
||||
public DocumentBuilder boost(float boost) {
|
||||
document.setBoost(boost);
|
||||
return this;
|
||||
}
|
||||
|
||||
public DocumentBuilder add(Fieldable field) {
|
||||
document.add(field);
|
||||
return this;
|
||||
}
|
||||
|
||||
public DocumentBuilder add(FieldBuilder fieldBuilder) {
|
||||
document.add(fieldBuilder.build());
|
||||
return this;
|
||||
}
|
||||
|
||||
public Document build() {
|
||||
return document;
|
||||
}
|
||||
}
|
|
@ -1,65 +0,0 @@
|
|||
/*
|
||||
* Licensed to ElasticSearch and Shay Banon under one
|
||||
* or more contributor license agreements. See the NOTICE file
|
||||
* distributed with this work for additional information
|
||||
* regarding copyright ownership. ElasticSearch licenses this
|
||||
* file to you under the Apache License, Version 2.0 (the
|
||||
* "License"); you may not use this file except in compliance
|
||||
* with the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing,
|
||||
* software distributed under the License is distributed on an
|
||||
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
* KIND, either express or implied. See the License for the
|
||||
* specific language governing permissions and limitations
|
||||
* under the License.
|
||||
*/
|
||||
|
||||
package org.elasticsearch.common.lucene;
|
||||
|
||||
import org.apache.lucene.document.Field;
|
||||
|
||||
/**
|
||||
*
|
||||
*/
|
||||
public class FieldBuilder {
|
||||
|
||||
private final Field field;
|
||||
|
||||
FieldBuilder(String name, String value, Field.Store store, Field.Index index) {
|
||||
field = new Field(name, value, store, index);
|
||||
}
|
||||
|
||||
FieldBuilder(String name, String value, Field.Store store, Field.Index index, Field.TermVector termVector) {
|
||||
field = new Field(name, value, store, index, termVector);
|
||||
}
|
||||
|
||||
FieldBuilder(String name, byte[] value, Field.Store store) {
|
||||
field = new Field(name, value, store);
|
||||
}
|
||||
|
||||
FieldBuilder(String name, byte[] value, int offset, int length, Field.Store store) {
|
||||
field = new Field(name, value, offset, length, store);
|
||||
}
|
||||
|
||||
public FieldBuilder boost(float boost) {
|
||||
field.setBoost(boost);
|
||||
return this;
|
||||
}
|
||||
|
||||
public FieldBuilder omitNorms(boolean omitNorms) {
|
||||
field.setOmitNorms(omitNorms);
|
||||
return this;
|
||||
}
|
||||
|
||||
public FieldBuilder omitTermFreqAndPositions(boolean omitTermFreqAndPositions) {
|
||||
field.setOmitTermFreqAndPositions(omitTermFreqAndPositions);
|
||||
return this;
|
||||
}
|
||||
|
||||
public Field build() {
|
||||
return field;
|
||||
}
|
||||
}
|
|
@ -86,21 +86,11 @@ public abstract class IndexCommitDelegate extends IndexCommit {
|
|||
return delegate.hashCode();
|
||||
}
|
||||
|
||||
@Override
|
||||
public long getVersion() {
|
||||
return delegate.getVersion();
|
||||
}
|
||||
|
||||
@Override
|
||||
public long getGeneration() {
|
||||
return delegate.getGeneration();
|
||||
}
|
||||
|
||||
@Override
|
||||
public long getTimestamp() throws IOException {
|
||||
return delegate.getTimestamp();
|
||||
}
|
||||
|
||||
@Override
|
||||
public Map<String, String> getUserData() throws IOException {
|
||||
return delegate.getUserData();
|
||||
|
|
|
@@ -19,10 +19,12 @@

package org.elasticsearch.common.lucene;

import org.apache.lucene.analysis.KeywordAnalyzer;
import org.apache.lucene.analysis.core.KeywordAnalyzer;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.index.*;
import org.apache.lucene.search.*;
import org.apache.lucene.store.Directory;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.Version;
import org.elasticsearch.common.Nullable;
import org.elasticsearch.common.io.stream.StreamInput;

@@ -40,7 +42,7 @@ import java.lang.reflect.Field;
 */
public class Lucene {

    public static final Version VERSION = Version.LUCENE_36;
    public static final Version VERSION = Version.LUCENE_40;
    public static final Version ANALYZER_VERSION = VERSION;
    public static final Version QUERYPARSER_VERSION = VERSION;

@@ -57,6 +59,9 @@ public class Lucene {
        if (version == null) {
            return defaultVersion;
        }
        if ("4.0".equals(version)) {
            return Version.LUCENE_40;
        }
        if ("3.6".equals(version)) {
            return Version.LUCENE_36;
        }
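One practical consequence of bumping Lucene.VERSION: version-sensitive components such as analyzers should be constructed against the same constant so index-time and search-time behaviour agree. A small, hedged sketch using stock Lucene classes only (the wrapper class is illustrative):

import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.util.Version;

public class AnalyzerVersionSketch {
    public static StandardAnalyzer defaultAnalyzer() {
        // Version.LUCENE_40 matches the new Lucene.VERSION constant above
        return new StandardAnalyzer(Version.LUCENE_40);
    }
}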
@ -82,6 +87,27 @@ public class Lucene {
|
|||
return defaultVersion;
|
||||
}
|
||||
|
||||
/**
|
||||
* Reads the segments infos, returning null if it doesn't exists
|
||||
*/
|
||||
@Nullable
|
||||
public static SegmentInfos readSegmentInfosIfExists(Directory directory) {
|
||||
try {
|
||||
return readSegmentInfos(directory);
|
||||
} catch (IOException e) {
|
||||
return null;
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Reads the segments infos, failing if it fails to load
|
||||
*/
|
||||
public static SegmentInfos readSegmentInfos(Directory directory) throws IOException {
|
||||
final SegmentInfos sis = new SegmentInfos();
|
||||
sis.read(directory);
|
||||
return sis;
|
||||
}
|
||||
|
||||
public static long count(IndexSearcher searcher, Query query) throws IOException {
|
||||
TotalHitCountCollector countCollector = new TotalHitCountCollector();
|
||||
// we don't need scores, so wrap it in a constant score query
|
||||
|
@ -92,18 +118,6 @@ public class Lucene {
|
|||
return countCollector.getTotalHits();
|
||||
}
|
||||
|
||||
public static int docId(IndexReader reader, Term term) throws IOException {
|
||||
TermDocs termDocs = reader.termDocs(term);
|
||||
try {
|
||||
if (termDocs.next()) {
|
||||
return termDocs.doc();
|
||||
}
|
||||
return NO_DOC;
|
||||
} finally {
|
||||
termDocs.close();
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Closes the index writer, returning <tt>false</tt> if it failed to close.
|
||||
*/
|
||||
|
@ -134,7 +148,7 @@ public class Lucene {
|
|||
if (in.readBoolean()) {
|
||||
field = in.readUTF();
|
||||
}
|
||||
fields[i] = new SortField(field, in.readVInt(), in.readBoolean());
|
||||
fields[i] = new SortField(field, readSortType(in), in.readBoolean());
|
||||
}
|
||||
|
||||
FieldDoc[] fieldDocs = new FieldDoc[in.readVInt()];
|
||||
|
@ -160,6 +174,8 @@ public class Lucene {
|
|||
cFields[j] = in.readShort();
|
||||
} else if (type == 8) {
|
||||
cFields[j] = in.readBoolean();
|
||||
} else if (type == 9) {
|
||||
cFields[j] = in.readBytesRef();
|
||||
} else {
|
||||
throw new IOException("Can't match type [" + type + "]");
|
||||
}
|
||||
|
@ -201,9 +217,9 @@ public class Lucene {
|
|||
out.writeUTF(sortField.getField());
|
||||
}
|
||||
if (sortField.getComparatorSource() != null) {
|
||||
out.writeVInt(((FieldDataType.ExtendedFieldComparatorSource) sortField.getComparatorSource()).reducedType());
|
||||
writeSortType(out, ((FieldDataType.ExtendedFieldComparatorSource) sortField.getComparatorSource()).reducedType());
|
||||
} else {
|
||||
out.writeVInt(sortField.getType());
|
||||
writeSortType(out, sortField.getType());
|
||||
}
|
||||
out.writeBoolean(sortField.getReverse());
|
||||
}
|
||||
|
@ -245,6 +261,9 @@ public class Lucene {
|
|||
} else if (type == Boolean.class) {
|
||||
out.writeByte((byte) 8);
|
||||
out.writeBoolean((Boolean) field);
|
||||
} else if (type == BytesRef.class) {
|
||||
out.writeByte((byte) 9);
|
||||
out.writeBytesRef((BytesRef) field);
|
||||
} else {
|
||||
throw new IOException("Can't handle sort field value of type [" + type + "]");
|
||||
}
|
||||
|
@@ -271,6 +290,15 @@
        }
    }

    // LUCENE 4 UPGRADE: We might want to maintain our own ordinal, instead of Lucene's ordinal
    public static SortField.Type readSortType(StreamInput in) throws IOException {
        return SortField.Type.values()[in.readVInt()];
    }

    public static void writeSortType(StreamOutput out, SortField.Type sortType) throws IOException {
        out.writeVInt(sortType.ordinal());
    }

    public static Explanation readExplanation(StreamInput in) throws IOException {
        float value = in.readFloat();
        String description = in.readUTF();
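The "LUCENE 4 UPGRADE" note above flags a real risk: ordinal() ties the wire format to the declaration order of SortField.Type. A hedged sketch of the alternative it hints at, pinning explicit ids per type; this is an illustration, not what the patch does, and only a subset of the enum is mapped:

import org.apache.lucene.search.SortField;
import org.elasticsearch.common.io.stream.StreamOutput;

import java.io.IOException;

public class StableSortTypeSketch {

    // Explicit ids survive a reordering of the enum in a future Lucene release.
    public static void writeSortTypeStable(StreamOutput out, SortField.Type type) throws IOException {
        switch (type) {
            case SCORE:  out.writeVInt(0); break;
            case DOC:    out.writeVInt(1); break;
            case STRING: out.writeVInt(2); break;
            case INT:    out.writeVInt(3); break;
            case LONG:   out.writeVInt(4); break;
            case FLOAT:  out.writeVInt(5); break;
            case DOUBLE: out.writeVInt(6); break;
            default:
                throw new IOException("no stable id for sort type [" + type + "]");
        }
    }
}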
@ -312,9 +340,9 @@ public class Lucene {
|
|||
segmentReaderSegmentInfoField = segmentReaderSegmentInfoFieldX;
|
||||
}
|
||||
|
||||
public static SegmentInfo getSegmentInfo(SegmentReader reader) {
|
||||
public static SegmentInfoPerCommit getSegmentInfo(SegmentReader reader) {
|
||||
try {
|
||||
return (SegmentInfo) segmentReaderSegmentInfoField.get(reader);
|
||||
return (SegmentInfoPerCommit) segmentReaderSegmentInfoField.get(reader);
|
||||
} catch (IllegalAccessException e) {
|
||||
return null;
|
||||
}
|
||||
|
@ -343,7 +371,7 @@ public class Lucene {
|
|||
}
|
||||
|
||||
@Override
|
||||
public void setNextReader(IndexReader reader, int docBase) throws IOException {
|
||||
public void setNextReader(AtomicReaderContext context) throws IOException {
|
||||
}
|
||||
|
||||
@Override
|
||||
|
|
|
@@ -19,6 +19,7 @@

package org.elasticsearch.common.lucene;

import org.apache.lucene.index.AtomicReaderContext;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.search.Collector;
import org.apache.lucene.search.ScoreCachingWrappingScorer;

@@ -59,8 +60,8 @@ public class MinimumScoreCollector extends Collector {
    }

    @Override
    public void setNextReader(IndexReader reader, int docBase) throws IOException {
        collector.setNextReader(reader, docBase);
    public void setNextReader(AtomicReaderContext context) throws IOException {
        collector.setNextReader(context);
    }

    @Override
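For context, the shape of a Lucene 4 Collector: per-segment state now arrives as an AtomicReaderContext instead of the old (IndexReader, int docBase) pair. A minimal, self-contained sketch against the stock API (not part of the patch):

import org.apache.lucene.index.AtomicReaderContext;
import org.apache.lucene.search.Collector;
import org.apache.lucene.search.Scorer;

import java.io.IOException;

public class CountingCollector extends Collector {
    private int count;
    private int docBase;

    @Override
    public void setScorer(Scorer scorer) throws IOException {
        // scores are not needed for counting
    }

    @Override
    public void collect(int doc) throws IOException {
        count++;                    // doc is segment-relative; add docBase for a global id
    }

    @Override
    public void setNextReader(AtomicReaderContext context) throws IOException {
        docBase = context.docBase;  // replaces the old explicit docBase argument
    }

    @Override
    public boolean acceptsDocsOutOfOrder() {
        return true;
    }

    public int count() {
        return count;
    }
}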
|
|
@ -19,7 +19,7 @@
|
|||
|
||||
package org.elasticsearch.common.lucene;
|
||||
|
||||
import org.apache.lucene.index.IndexReader;
|
||||
import org.apache.lucene.index.AtomicReaderContext;
|
||||
import org.apache.lucene.search.Collector;
|
||||
import org.apache.lucene.search.ScoreCachingWrappingScorer;
|
||||
import org.apache.lucene.search.Scorer;
|
||||
|
@ -61,10 +61,10 @@ public class MultiCollector extends Collector {
|
|||
}
|
||||
|
||||
@Override
|
||||
public void setNextReader(IndexReader reader, int docBase) throws IOException {
|
||||
collector.setNextReader(reader, docBase);
|
||||
public void setNextReader(AtomicReaderContext context) throws IOException {
|
||||
collector.setNextReader(context);
|
||||
for (Collector collector : collectors) {
|
||||
collector.setNextReader(reader, docBase);
|
||||
collector.setNextReader(context);
|
||||
}
|
||||
}
|
||||
|
||||
|
|
|
@ -21,8 +21,8 @@ package org.elasticsearch.common.lucene.all;
|
|||
|
||||
import org.apache.lucene.analysis.Analyzer;
|
||||
import org.apache.lucene.analysis.TokenStream;
|
||||
import org.apache.lucene.document.AbstractField;
|
||||
import org.apache.lucene.document.Field;
|
||||
import org.apache.lucene.document.FieldType;
|
||||
import org.elasticsearch.ElasticSearchException;
|
||||
|
||||
import java.io.IOException;
|
||||
|
@ -31,22 +31,21 @@ import java.io.Reader;
|
|||
/**
|
||||
*
|
||||
*/
|
||||
public class AllField extends AbstractField {
|
||||
public class AllField extends Field {
|
||||
|
||||
private final AllEntries allEntries;
|
||||
|
||||
private final Analyzer analyzer;
|
||||
|
||||
public AllField(String name, Field.Store store, Field.TermVector termVector, AllEntries allEntries, Analyzer analyzer) {
|
||||
super(name, store, Field.Index.ANALYZED, termVector);
|
||||
|
||||
public AllField(String name, AllEntries allEntries, Analyzer analyzer, FieldType fieldType) {
|
||||
super(name, fieldType);
|
||||
this.allEntries = allEntries;
|
||||
this.analyzer = analyzer;
|
||||
}
|
||||
|
||||
@Override
|
||||
public String stringValue() {
|
||||
if (isStored()) {
|
||||
if (fieldType().stored()) {
|
||||
return allEntries.buildText();
|
||||
}
|
||||
return null;
|
||||
|
@ -58,7 +57,7 @@ public class AllField extends AbstractField {
|
|||
}
|
||||
|
||||
@Override
|
||||
public TokenStream tokenStreamValue() {
|
||||
public TokenStream tokenStream(Analyzer analyzer) throws IOException {
|
||||
try {
|
||||
allEntries.reset(); // reset the all entries, just in case it was read already
|
||||
return AllTokenStream.allTokenStream(name, allEntries, analyzer);
|
||||
|
|
|
@ -19,14 +19,21 @@
|
|||
|
||||
package org.elasticsearch.common.lucene.all;
|
||||
|
||||
import org.apache.lucene.index.AtomicReader;
|
||||
import org.apache.lucene.index.AtomicReaderContext;
|
||||
import org.apache.lucene.index.DocsAndPositionsEnum;
|
||||
import org.apache.lucene.index.IndexReader;
|
||||
import org.apache.lucene.index.IndexReaderContext;
|
||||
import org.apache.lucene.index.Term;
|
||||
import org.apache.lucene.index.TermPositions;
|
||||
import org.apache.lucene.search.*;
|
||||
import org.apache.lucene.search.similarities.Similarity;
|
||||
import org.apache.lucene.search.similarities.Similarity.SloppySimScorer;
|
||||
import org.apache.lucene.search.spans.SpanScorer;
|
||||
import org.apache.lucene.search.spans.SpanTermQuery;
|
||||
import org.apache.lucene.search.spans.SpanWeight;
|
||||
import org.apache.lucene.search.spans.TermSpans;
|
||||
import org.apache.lucene.util.Bits;
|
||||
import org.apache.lucene.util.BytesRef;
|
||||
|
||||
import java.io.IOException;
|
||||
|
||||
|
@ -51,32 +58,35 @@ public class AllTermQuery extends SpanTermQuery {
|
|||
}
|
||||
|
||||
@Override
|
||||
public Weight createWeight(Searcher searcher) throws IOException {
|
||||
public Weight createWeight(IndexSearcher searcher) throws IOException {
|
||||
return new AllTermWeight(this, searcher);
|
||||
}
|
||||
|
||||
protected class AllTermWeight extends SpanWeight {
|
||||
|
||||
public AllTermWeight(AllTermQuery query, Searcher searcher) throws IOException {
|
||||
public AllTermWeight(AllTermQuery query, IndexSearcher searcher) throws IOException {
|
||||
super(query, searcher);
|
||||
}
|
||||
|
||||
@Override
|
||||
public Scorer scorer(IndexReader reader, boolean scoreDocsInOrder,
|
||||
boolean topScorer) throws IOException {
|
||||
return new AllTermSpanScorer((TermSpans) query.getSpans(reader), this, similarity, reader.norms(query.getField()));
|
||||
public Scorer scorer(AtomicReaderContext context, boolean scoreDocsInOrder,
|
||||
boolean topScorer, Bits acceptDocs) throws IOException {
|
||||
if (this.stats == null) {
|
||||
return null;
|
||||
}
|
||||
AtomicReader reader = context.reader();
|
||||
SloppySimScorer sloppySimScorer = similarity.sloppySimScorer(stats, context);
|
||||
return new AllTermSpanScorer((TermSpans) query.getSpans(context, acceptDocs, termContexts), this, sloppySimScorer);
|
||||
}
|
||||
|
||||
protected class AllTermSpanScorer extends SpanScorer {
|
||||
// TODO: is this the best way to allocate this?
|
||||
protected byte[] payload = new byte[4];
|
||||
protected TermPositions positions;
|
||||
protected DocsAndPositionsEnum positions;
|
||||
protected float payloadScore;
|
||||
protected int payloadsSeen;
|
||||
|
||||
public AllTermSpanScorer(TermSpans spans, Weight weight, Similarity similarity, byte[] norms) throws IOException {
|
||||
super(spans, weight, similarity, norms);
|
||||
positions = spans.getPositions();
|
||||
public AllTermSpanScorer(TermSpans spans, Weight weight, Similarity.SloppySimScorer docScorer) throws IOException {
|
||||
super(spans, weight, docScorer);
|
||||
positions = spans.getPostings();
|
||||
}
|
||||
|
||||
@Override
|
||||
|
@ -88,12 +98,11 @@ public class AllTermQuery extends SpanTermQuery {
|
|||
freq = 0.0f;
|
||||
payloadScore = 0;
|
||||
payloadsSeen = 0;
|
||||
Similarity similarity1 = getSimilarity();
|
||||
while (more && doc == spans.doc()) {
|
||||
int matchLength = spans.end() - spans.start();
|
||||
|
||||
freq += similarity1.sloppyFreq(matchLength);
|
||||
processPayload(similarity1);
|
||||
freq += docScorer.computeSlopFactor(matchLength);
|
||||
processPayload();
|
||||
|
||||
more = spans.next();// this moves positions to the next match in this
|
||||
// document
|
||||
|
@ -101,10 +110,10 @@ public class AllTermQuery extends SpanTermQuery {
|
|||
return more || (freq != 0);
|
||||
}
|
||||
|
||||
protected void processPayload(Similarity similarity) throws IOException {
|
||||
if (positions.isPayloadAvailable()) {
|
||||
payload = positions.getPayload(payload, 0);
|
||||
payloadScore += decodeFloat(payload);
|
||||
protected void processPayload() throws IOException {
|
||||
final BytesRef payload;
|
||||
if ((payload = positions.getPayload()) != null) {
|
||||
payloadScore += decodeFloat(payload.bytes, payload.offset);
|
||||
payloadsSeen++;
|
||||
|
||||
} else {
|
||||
|
@ -141,27 +150,40 @@ public class AllTermQuery extends SpanTermQuery {
|
|||
return payloadsSeen > 0 ? (payloadScore / payloadsSeen) : 1;
|
||||
}
|
||||
|
||||
@Override
|
||||
protected Explanation explain(final int doc) throws IOException {
|
||||
}
|
||||
|
||||
@Override
|
||||
public Explanation explain(AtomicReaderContext context, int doc) throws IOException{
|
||||
AllTermSpanScorer scorer = (AllTermSpanScorer) scorer(context, true, false, context.reader().getLiveDocs());
|
||||
if (scorer != null) {
|
||||
int newDoc = scorer.advance(doc);
|
||||
if (newDoc == doc) {
|
||||
float freq = scorer.freq();
|
||||
SloppySimScorer docScorer = similarity.sloppySimScorer(stats, context);
|
||||
ComplexExplanation inner = new ComplexExplanation();
|
||||
inner.setDescription("weight("+getQuery()+" in "+doc+") [" + similarity.getClass().getSimpleName() + "], result of:");
|
||||
Explanation scoreExplanation = docScorer.explain(doc, new Explanation(freq, "phraseFreq=" + freq));
|
||||
inner.addDetail(scoreExplanation);
|
||||
inner.setValue(scoreExplanation.getValue());
|
||||
inner.setMatch(true);
|
||||
ComplexExplanation result = new ComplexExplanation();
|
||||
Explanation nonPayloadExpl = super.explain(doc);
|
||||
result.addDetail(nonPayloadExpl);
|
||||
// QUESTION: Is there a way to avoid this skipTo call? We need to know
|
||||
// whether to load the payload or not
|
||||
result.addDetail(inner);
|
||||
Explanation payloadBoost = new Explanation();
|
||||
result.addDetail(payloadBoost);
|
||||
|
||||
float payloadScore = getPayloadScore();
|
||||
final float payloadScore = scorer.getPayloadScore();
|
||||
payloadBoost.setValue(payloadScore);
|
||||
// GSI: I suppose we could toString the payload, but I don't think that
|
||||
// would be a good idea
|
||||
payloadBoost.setDescription("allPayload(...)");
|
||||
result.setValue(nonPayloadExpl.getValue() * payloadScore);
|
||||
result.setValue(inner.getValue() * payloadScore);
|
||||
result.setDescription("btq, product of:");
|
||||
result.setMatch(nonPayloadExpl.getValue() == 0 ? Boolean.FALSE : Boolean.TRUE); // LUCENE-1303
|
||||
return result;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
return new ComplexExplanation(false, 0.0f, "no matching term");
|
||||
|
||||
|
||||
}
|
||||
}
|
||||
|
||||
|
|
|
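For reference, a minimal consumer-side sketch of the payload change above (not part of the commit; the class and method names are made up): in Lucene 4.0 a position's payload arrives as a BytesRef on DocsAndPositionsEnum, so the boost is decoded with PayloadHelper.decodeFloat(bytes, offset) instead of from a copied byte[4].

import org.apache.lucene.analysis.payloads.PayloadHelper;
import org.apache.lucene.index.DocsAndPositionsEnum;
import org.apache.lucene.util.BytesRef;

import java.io.IOException;

class PayloadAverager {
    /** Averages the float payloads stored at the positions of the current document. */
    static float averagePayload(DocsAndPositionsEnum positions) throws IOException {
        final int freq = positions.freq();
        float sum = 0;
        int seen = 0;
        for (int i = 0; i < freq; i++) {
            positions.nextPosition();                  // must advance before asking for the payload
            BytesRef payload = positions.getPayload(); // null when this position carries no payload
            if (payload != null) {
                sum += PayloadHelper.decodeFloat(payload.bytes, payload.offset);
                seen++;
            }
        }
        return seen > 0 ? sum / seen : 1.0f;
    }
}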
@@ -23,7 +23,7 @@ import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.PayloadAttribute;
import org.apache.lucene.index.Payload;
import org.apache.lucene.util.BytesRef;

import java.io.IOException;

@@ -35,8 +35,10 @@ import static org.apache.lucene.analysis.payloads.PayloadHelper.encodeFloat;
public final class AllTokenStream extends TokenFilter {

    public static TokenStream allTokenStream(String allFieldName, AllEntries allEntries, Analyzer analyzer) throws IOException {
        return new AllTokenStream(analyzer.reusableTokenStream(allFieldName, allEntries), allEntries);
        return new AllTokenStream(analyzer.tokenStream(allFieldName, allEntries), allEntries);
    }

    private final BytesRef payloadSpare = new BytesRef(new byte[4]);

    private final AllEntries allEntries;

@@ -60,7 +62,8 @@ public final class AllTokenStream extends TokenFilter {
        if (allEntries.current() != null) {
            float boost = allEntries.current().boost();
            if (boost != 1.0f) {
                payloadAttribute.setPayload(new Payload(encodeFloat(boost)));
                encodeFloat(boost, payloadSpare.bytes, payloadSpare.offset);
                payloadAttribute.setPayload(payloadSpare);
            } else {
                payloadAttribute.setPayload(null);
            }

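The producer side is the mirror image: PayloadAttribute.setPayload now takes a BytesRef, so one 4-byte buffer can be reused for every token, as payloadSpare does above. A hedged sketch of that pattern with a fixed per-stream boost standing in for AllEntries (the class name and constructor are illustrative only):

import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.payloads.PayloadHelper;
import org.apache.lucene.analysis.tokenattributes.PayloadAttribute;
import org.apache.lucene.util.BytesRef;

import java.io.IOException;

final class BoostPayloadFilter extends TokenFilter {
    private final BytesRef spare = new BytesRef(new byte[4]);   // reused across tokens
    private final PayloadAttribute payloadAtt = addAttribute(PayloadAttribute.class);
    private final float boost;

    BoostPayloadFilter(TokenStream in, float boost) {
        super(in);
        this.boost = boost;
    }

    @Override
    public boolean incrementToken() throws IOException {
        if (!input.incrementToken()) {
            return false;
        }
        if (boost != 1.0f) {
            PayloadHelper.encodeFloat(boost, spare.bytes, spare.offset);  // write into the shared buffer
            payloadAtt.setPayload(spare);
        } else {
            payloadAtt.setPayload(null);
        }
        return true;
    }
}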
@@ -19,6 +19,7 @@

package org.elasticsearch.common.lucene.docset;

import org.apache.lucene.index.AtomicReaderContext;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.search.Collector;
import org.apache.lucene.search.Scorer;

@@ -58,9 +59,9 @@ public class DocIdSetCollector extends Collector {
    }

    @Override
    public void setNextReader(IndexReader reader, int docBase) throws IOException {
        base = docBase;
        collector.setNextReader(reader, docBase);
    public void setNextReader(AtomicReaderContext ctx) throws IOException {
        base = ctx.docBase;
        collector.setNextReader(ctx);
    }

    @Override

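Since setNextReader now receives an AtomicReaderContext, the docBase that used to be passed as a second argument is read from the context instead. A small, self-contained collector sketch (illustrative, not from the commit) showing the segment-to-top-level doc id conversion:

import org.apache.lucene.index.AtomicReaderContext;
import org.apache.lucene.search.Collector;
import org.apache.lucene.search.Scorer;

import java.io.IOException;
import java.util.ArrayList;
import java.util.List;

/** Illustrative collector: records top-level doc ids by adding the per-segment docBase. */
class GlobalDocIdCollector extends Collector {

    final List<Integer> globalDocs = new ArrayList<Integer>();
    private int docBase;

    @Override
    public void setScorer(Scorer scorer) throws IOException {
        // scores are not needed for this example
    }

    @Override
    public void setNextReader(AtomicReaderContext context) throws IOException {
        docBase = context.docBase;   // docBase is no longer passed as a separate argument
    }

    @Override
    public void collect(int doc) throws IOException {
        globalDocs.add(docBase + doc);   // segment-relative id -> top-level id
    }

    @Override
    public boolean acceptsDocsOutOfOrder() {
        return true;
    }
}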
@ -0,0 +1,63 @@
|
|||
package org.elasticsearch.common.lucene.document;
|
||||
|
||||
import org.apache.lucene.document.*;
|
||||
import org.apache.lucene.index.FieldInfo;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.util.HashSet;
|
||||
import java.util.Set;
|
||||
|
||||
/**
|
||||
*
|
||||
*/
|
||||
public abstract class AbstractMultipleFieldsVisitor extends BaseFieldVisitor {
|
||||
|
||||
protected Document doc = new Document();
|
||||
|
||||
@Override
|
||||
public void binaryField(FieldInfo fieldInfo, byte[] value) throws IOException {
|
||||
doc.add(new StoredField(fieldInfo.name, value));
|
||||
}
|
||||
|
||||
@Override
|
||||
public void stringField(FieldInfo fieldInfo, String value) throws IOException {
|
||||
final FieldType ft = new FieldType(TextField.TYPE_STORED);
|
||||
ft.setStoreTermVectors(fieldInfo.hasVectors());
|
||||
ft.setIndexed(fieldInfo.isIndexed());
|
||||
ft.setOmitNorms(fieldInfo.omitsNorms());
|
||||
ft.setIndexOptions(fieldInfo.getIndexOptions());
|
||||
doc.add(new Field(fieldInfo.name, value, ft));
|
||||
}
|
||||
|
||||
@Override
|
||||
public void intField(FieldInfo fieldInfo, int value) {
|
||||
doc.add(new StoredField(fieldInfo.name, value));
|
||||
}
|
||||
|
||||
@Override
|
||||
public void longField(FieldInfo fieldInfo, long value) {
|
||||
doc.add(new StoredField(fieldInfo.name, value));
|
||||
}
|
||||
|
||||
@Override
|
||||
public void floatField(FieldInfo fieldInfo, float value) {
|
||||
doc.add(new StoredField(fieldInfo.name, value));
|
||||
}
|
||||
|
||||
@Override
|
||||
public void doubleField(FieldInfo fieldInfo, double value) {
|
||||
doc.add(new StoredField(fieldInfo.name, value));
|
||||
}
|
||||
|
||||
@Override
|
||||
public Document createDocument() {
|
||||
return doc;
|
||||
}
|
||||
|
||||
@Override
|
||||
public void reset() {
|
||||
if (!doc.getFields().isEmpty()) {
|
||||
doc = new Document();
|
||||
}
|
||||
}
|
||||
}
|
|
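To show how such a visitor is driven (a sketch, not part of the commit): IndexReader.document(docID, visitor) pushes each stored field through needsField and the typed callbacks, replacing the 3.x document(docID, fieldSelector) call. AllFieldsVisitor below is a hypothetical subclass that simply accepts every field:

import org.apache.lucene.document.Document;
import org.apache.lucene.index.FieldInfo;
import org.apache.lucene.index.IndexReader;
import org.elasticsearch.common.lucene.document.AbstractMultipleFieldsVisitor;

import java.io.IOException;

class AllFieldsVisitor extends AbstractMultipleFieldsVisitor {
    @Override
    public Status needsField(FieldInfo fieldInfo) throws IOException {
        return Status.YES;   // keep every stored field
    }
}

class StoredFieldsExample {
    static Document load(IndexReader reader, int docId) throws IOException {
        AllFieldsVisitor visitor = new AllFieldsVisitor();
        reader.document(docId, visitor);   // the reader calls back into the visitor per stored field
        return visitor.createDocument();
    }
}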
@@ -0,0 +1,14 @@
package org.elasticsearch.common.lucene.document;

import org.apache.lucene.document.Document;
import org.apache.lucene.index.StoredFieldVisitor;

public abstract class BaseFieldVisitor extends StoredFieldVisitor {

    // LUCENE 4 UPGRADE: Added for now to make everything work. Want to make use of Document as less as possible.
    public abstract Document createDocument();

    // LUCENE 4 UPGRADE: Added for now for compatibility with Selectors
    public abstract void reset();

}

@ -1,29 +0,0 @@
|
|||
/*
|
||||
* Licensed to ElasticSearch and Shay Banon under one
|
||||
* or more contributor license agreements. See the NOTICE file
|
||||
* distributed with this work for additional information
|
||||
* regarding copyright ownership. ElasticSearch licenses this
|
||||
* file to you under the Apache License, Version 2.0 (the
|
||||
* "License"); you may not use this file except in compliance
|
||||
* with the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing,
|
||||
* software distributed under the License is distributed on an
|
||||
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
* KIND, either express or implied. See the License for the
|
||||
* specific language governing permissions and limitations
|
||||
* under the License.
|
||||
*/
|
||||
|
||||
package org.elasticsearch.common.lucene.document;
|
||||
|
||||
import org.apache.lucene.document.FieldSelector;
|
||||
|
||||
/**
|
||||
*/
|
||||
public interface ResetFieldSelector extends FieldSelector {
|
||||
|
||||
void reset();
|
||||
}
|
|
@@ -19,19 +19,25 @@

package org.elasticsearch.common.lucene.document;

import org.apache.lucene.document.FieldSelectorResult;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.StoredField;
import org.apache.lucene.index.FieldInfo;

import java.io.IOException;
import java.util.ArrayList;
import java.util.List;

/**
 *
 */
public class SingleFieldSelector implements ResetFieldSelector {
public class SingleFieldVisitor extends AbstractMultipleFieldsVisitor {

    private String name;

    public SingleFieldSelector() {
    public SingleFieldVisitor() {
    }

    public SingleFieldSelector(String name) {
    public SingleFieldVisitor(String name) {
        this.name = name;
    }

@@ -40,14 +46,10 @@ public class SingleFieldSelector implements ResetFieldSelector {
    }

    @Override
    public FieldSelectorResult accept(String fieldName) {
        if (name.equals(fieldName)) {
            return FieldSelectorResult.LOAD;
    public Status needsField(FieldInfo fieldInfo) throws IOException {
        if (name.equals(fieldInfo.name)) {
            return Status.YES;
        }
        return FieldSelectorResult.NO_LOAD;
    }

    @Override
    public void reset() {
        return Status.NO;
    }
}

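StoredFieldVisitor.Status also has a STOP value, which lets a visitor abort the stored-fields scan once it has what it needs; the commit keeps the plain YES/NO form above. A hypothetical stand-alone visitor using STOP:

import org.apache.lucene.index.FieldInfo;
import org.apache.lucene.index.StoredFieldVisitor;

import java.io.IOException;

/** Illustrative only: collects one field's string value and then tells the reader to stop. */
class FirstValueVisitor extends StoredFieldVisitor {
    private final String field;
    private String value;

    FirstValueVisitor(String field) {
        this.field = field;
    }

    @Override
    public Status needsField(FieldInfo fieldInfo) throws IOException {
        if (value != null) {
            return Status.STOP;          // already have it, skip the remaining stored fields
        }
        return field.equals(fieldInfo.name) ? Status.YES : Status.NO;
    }

    @Override
    public void stringField(FieldInfo fieldInfo, String v) throws IOException {
        value = v;
    }

    String value() {
        return value;
    }
}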
@@ -20,9 +20,11 @@
package org.elasticsearch.common.lucene.search;

import com.google.common.collect.Lists;
import org.apache.lucene.index.AtomicReaderContext;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.search.DocIdSet;
import org.apache.lucene.search.Filter;
import org.apache.lucene.util.Bits;
import org.elasticsearch.common.lucene.docset.AndDocIdSet;
import org.elasticsearch.common.lucene.docset.AndDocSet;
import org.elasticsearch.common.lucene.docset.DocSet;

@@ -46,14 +48,16 @@ public class AndFilter extends Filter {
    }

    @Override
    public DocIdSet getDocIdSet(IndexReader reader) throws IOException {
    public DocIdSet getDocIdSet(AtomicReaderContext context, Bits acceptDocs) throws IOException {
        if (filters.size() == 1) {
            return filters.get(0).getDocIdSet(reader);
            // LUCENE 4 UPGRADE: For leave this null, until we figure out how to deal with deleted docs...
            return filters.get(0).getDocIdSet(context, null);
        }
        List sets = Lists.newArrayListWithExpectedSize(filters.size());
        boolean allAreDocSet = true;
        for (Filter filter : filters) {
            DocIdSet set = filter.getDocIdSet(reader);
            // LUCENE 4 UPGRADE: For leave this null, until we figure out how to deal with deleted docs...
            DocIdSet set = filter.getDocIdSet(context, null);
            if (set == null) { // none matching for this filter, we AND, so return EMPTY
                return DocSet.EMPTY_DOC_SET;
            }

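The same signature change recurs in every filter in this commit: getDocIdSet is now invoked once per segment with an AtomicReaderContext plus an optional acceptDocs Bits. A minimal illustrative filter (not from the commit, arbitrary match criterion) showing the intended contract:

import org.apache.lucene.index.AtomicReaderContext;
import org.apache.lucene.search.DocIdSet;
import org.apache.lucene.search.Filter;
import org.apache.lucene.util.Bits;
import org.apache.lucene.util.FixedBitSet;

import java.io.IOException;

/** Sketch of the per-segment contract: one DocIdSet per AtomicReaderContext, honouring acceptDocs. */
class EvenDocsFilter extends Filter {
    @Override
    public DocIdSet getDocIdSet(AtomicReaderContext context, Bits acceptDocs) throws IOException {
        int maxDoc = context.reader().maxDoc();
        FixedBitSet bits = new FixedBitSet(maxDoc);          // FixedBitSet is itself a DocIdSet
        for (int doc = 0; doc < maxDoc; doc += 2) {           // arbitrary demo criterion
            if (acceptDocs == null || acceptDocs.get(doc)) {  // acceptDocs carries deletions/other filters
                bits.set(doc);
            }
        }
        return bits;
    }
}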
@@ -20,7 +20,7 @@
package org.elasticsearch.common.lucene.search;

import org.apache.lucene.search.Scorer;
import org.apache.lucene.search.Similarity;
import org.apache.lucene.search.Weight;

import java.io.IOException;

@@ -29,8 +29,8 @@ import java.io.IOException;
 */
public class EmptyScorer extends Scorer {

    public EmptyScorer(Similarity similarity) {
        super(similarity);
    public EmptyScorer(Weight weight) {
        super(weight);
    }

    @Override

@@ -38,6 +38,11 @@ public class EmptyScorer extends Scorer {
        return 0;
    }

    @Override
    public float freq() throws IOException {
        return 0;
    }

    @Override
    public int docID() {
        return NO_MORE_DOCS;

@@ -19,7 +19,7 @@

package org.elasticsearch.common.lucene.search;

import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.AtomicReaderContext;
import org.apache.lucene.search.Collector;
import org.apache.lucene.search.Filter;
import org.apache.lucene.search.Scorer;

@@ -57,9 +57,9 @@ public class FilteredCollector extends Collector {
    }

    @Override
    public void setNextReader(IndexReader reader, int docBase) throws IOException {
        collector.setNextReader(reader, docBase);
        docSet = DocSets.convert(reader, filter.getDocIdSet(reader));
    public void setNextReader(AtomicReaderContext context) throws IOException {
        collector.setNextReader(context);
        docSet = DocSets.convert(context.reader(), filter.getDocIdSet(context, null));
    }

    @Override

@@ -19,8 +19,10 @@

package org.elasticsearch.common.lucene.search;

import org.apache.lucene.index.AtomicReaderContext;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.search.DocIdSet;
import org.apache.lucene.util.Bits;
import org.elasticsearch.common.lucene.docset.GetDocSet;

import java.io.IOException;

@@ -39,11 +41,11 @@ public class LimitFilter extends NoCacheFilter {
    }

    @Override
    public DocIdSet getDocIdSet(IndexReader reader) throws IOException {
    public DocIdSet getDocIdSet(AtomicReaderContext context, Bits acceptDocs) throws IOException {
        if (counter > limit) {
            return null;
        }
        return new LimitDocSet(reader.maxDoc(), limit);
        return new LimitDocSet(context.reader().maxDoc(), limit);
    }

    public class LimitDocSet extends GetDocSet {

@@ -19,9 +19,11 @@

package org.elasticsearch.common.lucene.search;

import org.apache.lucene.index.AtomicReaderContext;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.search.DocIdSet;
import org.apache.lucene.search.Filter;
import org.apache.lucene.util.Bits;
import org.elasticsearch.common.lucene.docset.AllDocSet;

import java.io.IOException;

@@ -34,8 +36,8 @@ import java.io.IOException;
public class MatchAllDocsFilter extends Filter {

    @Override
    public DocIdSet getDocIdSet(IndexReader reader) throws IOException {
        return new AllDocSet(reader.maxDoc());
    public DocIdSet getDocIdSet(AtomicReaderContext context, Bits acceptDocs) throws IOException {
        return new AllDocSet(context.reader().maxDoc());
    }

    @Override

@@ -19,9 +19,11 @@

package org.elasticsearch.common.lucene.search;

import org.apache.lucene.index.AtomicReaderContext;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.search.DocIdSet;
import org.apache.lucene.search.Filter;
import org.apache.lucene.util.Bits;

import java.io.IOException;

@@ -31,7 +33,7 @@ import java.io.IOException;
public class MatchNoDocsFilter extends Filter {

    @Override
    public DocIdSet getDocIdSet(IndexReader reader) throws IOException {
    public DocIdSet getDocIdSet(AtomicReaderContext context, Bits acceptDocs) throws IOException {
        return null;
    }

@ -19,9 +19,11 @@
|
|||
|
||||
package org.elasticsearch.common.lucene.search;
|
||||
|
||||
import org.apache.lucene.index.AtomicReaderContext;
|
||||
import org.apache.lucene.index.IndexReader;
|
||||
import org.apache.lucene.index.Term;
|
||||
import org.apache.lucene.search.*;
|
||||
import org.apache.lucene.util.Bits;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.util.Set;
|
||||
|
@ -43,20 +45,6 @@ public final class MatchNoDocsQuery extends Query {
|
|||
* Weight implementation that matches no documents.
|
||||
*/
|
||||
private class MatchNoDocsWeight extends Weight {
|
||||
/**
|
||||
* The similarity implementation.
|
||||
*/
|
||||
private final Similarity similarity;
|
||||
|
||||
|
||||
/**
|
||||
* Creates a new weight that matches nothing.
|
||||
*
|
||||
* @param searcher the search to match for
|
||||
*/
|
||||
public MatchNoDocsWeight(final Searcher searcher) {
|
||||
this.similarity = searcher.getSimilarity();
|
||||
}
|
||||
|
||||
@Override
|
||||
public String toString() {
|
||||
|
@ -69,36 +57,29 @@ public final class MatchNoDocsQuery extends Query {
|
|||
}
|
||||
|
||||
@Override
|
||||
public float getValue() {
|
||||
public float getValueForNormalization() throws IOException {
|
||||
return 0;
|
||||
}
|
||||
|
||||
@Override
|
||||
public float sumOfSquaredWeights() {
|
||||
return 0;
|
||||
public void normalize(float norm, float topLevelBoost) {
|
||||
}
|
||||
|
||||
@Override
|
||||
public void normalize(final float queryNorm) {
|
||||
}
|
||||
|
||||
@Override
|
||||
public Scorer scorer(final IndexReader reader,
|
||||
final boolean scoreDocsInOrder,
|
||||
final boolean topScorer) throws IOException {
|
||||
public Scorer scorer(AtomicReaderContext context, boolean scoreDocsInOrder, boolean topScorer, Bits acceptDocs) throws IOException {
|
||||
return null;
|
||||
}
|
||||
|
||||
@Override
|
||||
public Explanation explain(final IndexReader reader,
|
||||
public Explanation explain(final AtomicReaderContext context,
|
||||
final int doc) {
|
||||
return new ComplexExplanation(false, 0, "MatchNoDocs matches nothing");
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
public Weight createWeight(final Searcher searcher) {
|
||||
return new MatchNoDocsWeight(searcher);
|
||||
public Weight createWeight(IndexSearcher searcher) throws IOException {
|
||||
return new MatchNoDocsWeight();
|
||||
}
|
||||
|
||||
@Override
|
||||
|
|
|
@ -22,7 +22,10 @@ package org.elasticsearch.common.lucene.search;
|
|||
import org.apache.lucene.analysis.Analyzer;
|
||||
import org.apache.lucene.index.IndexReader;
|
||||
import org.apache.lucene.search.*;
|
||||
import org.apache.lucene.search.similar.MoreLikeThis;
|
||||
import org.apache.lucene.search.similarities.DefaultSimilarity;
|
||||
import org.apache.lucene.search.similarities.Similarity;
|
||||
import org.apache.lucene.search.similarities.TFIDFSimilarity;
|
||||
import org.apache.lucene.queries.mlt.MoreLikeThis;
|
||||
import org.elasticsearch.common.io.FastStringReader;
|
||||
|
||||
import java.io.IOException;
|
||||
|
@ -35,7 +38,7 @@ public class MoreLikeThisQuery extends Query {
|
|||
|
||||
public static final float DEFAULT_PERCENT_TERMS_TO_MATCH = 0.3f;
|
||||
|
||||
private Similarity similarity;
|
||||
private TFIDFSimilarity similarity;
|
||||
|
||||
private String likeText;
|
||||
private String[] moreLikeFields;
|
||||
|
@ -77,7 +80,8 @@ public class MoreLikeThisQuery extends Query {
|
|||
mlt.setStopWords(stopWords);
|
||||
mlt.setBoost(boostTerms);
|
||||
mlt.setBoostFactor(boostTermsFactor);
|
||||
BooleanQuery bq = (BooleanQuery) mlt.like(new FastStringReader(likeText));
|
||||
//LUCENE 4 UPGRADE this mapps the 3.6 behavior (only use the first field)
|
||||
BooleanQuery bq = (BooleanQuery) mlt.like(new FastStringReader(likeText), moreLikeFields[0]);
|
||||
BooleanClause[] clauses = bq.getClauses();
|
||||
|
||||
bq.setMinimumNumberShouldMatch((int) (clauses.length * percentTermsToMatch));
|
||||
|
@ -112,7 +116,10 @@ public class MoreLikeThisQuery extends Query {
|
|||
}
|
||||
|
||||
public void setSimilarity(Similarity similarity) {
|
||||
this.similarity = similarity;
|
||||
if (similarity == null || similarity instanceof TFIDFSimilarity) {
|
||||
//LUCENE 4 UPGRADE we need TFIDF similarity here so I only set it if it is an instance of it
|
||||
this.similarity = (TFIDFSimilarity) similarity;
|
||||
}
|
||||
}
|
||||
|
||||
public Analyzer getAnalyzer() {
|
||||
|
|
|
@ -19,11 +19,12 @@
|
|||
|
||||
package org.elasticsearch.common.lucene.search;
|
||||
|
||||
import org.apache.lucene.index.IndexReader;
|
||||
import org.apache.lucene.index.Term;
|
||||
import org.apache.lucene.index.TermEnum;
|
||||
import gnu.trove.set.hash.THashSet;
|
||||
import org.apache.lucene.index.*;
|
||||
import org.apache.lucene.search.MultiPhraseQuery;
|
||||
import org.apache.lucene.search.Query;
|
||||
import org.apache.lucene.util.BytesRef;
|
||||
import org.apache.lucene.util.StringHelper;
|
||||
import org.apache.lucene.util.ToStringUtils;
|
||||
|
||||
import java.io.IOException;
|
||||
|
@ -137,7 +138,7 @@ public class MultiPhrasePrefixQuery extends Query {
|
|||
}
|
||||
Term[] suffixTerms = termArrays.get(sizeMinus1);
|
||||
int position = positions.get(sizeMinus1);
|
||||
List<Term> terms = new ArrayList<Term>();
|
||||
Set<Term> terms = new THashSet<Term>();
|
||||
for (Term term : suffixTerms) {
|
||||
getPrefixTerms(terms, term, reader);
|
||||
if (terms.size() > maxExpansions) {
|
||||
|
@ -151,24 +152,33 @@ public class MultiPhrasePrefixQuery extends Query {
|
|||
return query.rewrite(reader);
|
||||
}
|
||||
|
||||
private void getPrefixTerms(List<Term> terms, final Term prefix, final IndexReader reader) throws IOException {
|
||||
TermEnum enumerator = reader.terms(prefix);
|
||||
try {
|
||||
do {
|
||||
Term term = enumerator.term();
|
||||
if (term != null
|
||||
&& term.text().startsWith(prefix.text())
|
||||
&& term.field().equals(field)) {
|
||||
terms.add(term);
|
||||
} else {
|
||||
private void getPrefixTerms(Set<Term> terms, final Term prefix, final IndexReader reader) throws IOException {
|
||||
// SlowCompositeReaderWrapper could be used... but this would merge all terms from each segment into one terms
|
||||
// instance, which is very expensive. Therefore I think it is better to iterate over each leaf individually.
|
||||
TermsEnum termsEnum = null;
|
||||
List<AtomicReaderContext> leaves = reader.leaves();
|
||||
for (AtomicReaderContext leaf : leaves) {
|
||||
Terms _terms = leaf.reader().terms(field);
|
||||
if (_terms == null) {
|
||||
continue;
|
||||
}
|
||||
|
||||
termsEnum = _terms.iterator(termsEnum);
|
||||
TermsEnum.SeekStatus seekStatus = termsEnum.seekCeil(prefix.bytes());
|
||||
if (TermsEnum.SeekStatus.END == seekStatus) {
|
||||
continue;
|
||||
}
|
||||
|
||||
for (BytesRef term = termsEnum.term(); term != null; term = termsEnum.next()) {
|
||||
if (!StringHelper.startsWith(term, prefix.bytes())) {
|
||||
break;
|
||||
}
|
||||
|
||||
terms.add(new Term(field, BytesRef.deepCopyOf(term)));
|
||||
if (terms.size() >= maxExpansions) {
|
||||
break;
|
||||
return;
|
||||
}
|
||||
} while (enumerator.next());
|
||||
} finally {
|
||||
enumerator.close();
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
|
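The per-leaf enumeration above is the general Lucene 4.0 way to expand a prefix into concrete terms. A self-contained restatement of the pattern (illustrative class name, no reuse of Elasticsearch types) that could be applied outside MultiPhrasePrefixQuery:

import org.apache.lucene.index.AtomicReaderContext;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.Term;
import org.apache.lucene.index.Terms;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.StringHelper;

import java.io.IOException;
import java.util.HashSet;
import java.util.Set;

class PrefixTermCollector {
    /** Collects up to maxExpansions terms starting with the prefix, walking each segment separately. */
    static Set<Term> collect(IndexReader reader, Term prefix, int maxExpansions) throws IOException {
        Set<Term> result = new HashSet<Term>();
        TermsEnum termsEnum = null;                              // reused across segments
        for (AtomicReaderContext leaf : reader.leaves()) {
            Terms terms = leaf.reader().terms(prefix.field());
            if (terms == null) {
                continue;                                        // field absent in this segment
            }
            termsEnum = terms.iterator(termsEnum);
            if (termsEnum.seekCeil(prefix.bytes()) == TermsEnum.SeekStatus.END) {
                continue;                                        // everything sorts before the prefix
            }
            for (BytesRef term = termsEnum.term(); term != null; term = termsEnum.next()) {
                if (!StringHelper.startsWith(term, prefix.bytes())) {
                    break;                                       // left the prefix range
                }
                result.add(new Term(prefix.field(), BytesRef.deepCopyOf(term)));
                if (result.size() >= maxExpansions) {
                    return result;
                }
            }
        }
        return result;
    }
}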
|
@@ -19,7 +19,7 @@

package org.elasticsearch.common.lucene.search;

import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.AtomicReaderContext;
import org.apache.lucene.search.Collector;
import org.apache.lucene.search.Scorer;

@@ -41,7 +41,7 @@ public class NoopCollector extends Collector {
    }

    @Override
    public void setNextReader(IndexReader reader, int docBase) throws IOException {
    public void setNextReader(AtomicReaderContext context) throws IOException {
    }

    @Override

@ -19,11 +19,13 @@
|
|||
|
||||
package org.elasticsearch.common.lucene.search;
|
||||
|
||||
import org.apache.lucene.index.AtomicReaderContext;
|
||||
import org.apache.lucene.index.IndexReader;
|
||||
import org.apache.lucene.search.DocIdSet;
|
||||
import org.apache.lucene.search.DocIdSetIterator;
|
||||
import org.apache.lucene.search.Filter;
|
||||
import org.apache.lucene.search.FilteredDocIdSetIterator;
|
||||
import org.apache.lucene.util.Bits;
|
||||
|
||||
import java.io.IOException;
|
||||
|
||||
|
@ -39,15 +41,15 @@ public class NotDeletedFilter extends Filter {
|
|||
}
|
||||
|
||||
@Override
|
||||
public DocIdSet getDocIdSet(IndexReader reader) throws IOException {
|
||||
DocIdSet docIdSet = filter.getDocIdSet(reader);
|
||||
public DocIdSet getDocIdSet(AtomicReaderContext context, Bits acceptDocs) throws IOException {
|
||||
DocIdSet docIdSet = filter.getDocIdSet(context, acceptDocs);
|
||||
if (docIdSet == null) {
|
||||
return null;
|
||||
}
|
||||
if (!reader.hasDeletions()) {
|
||||
if (!context.reader().hasDeletions()) {
|
||||
return docIdSet;
|
||||
}
|
||||
return new NotDeletedDocIdSet(docIdSet, reader);
|
||||
return new NotDeletedDocIdSet(docIdSet, context.reader().getLiveDocs());
|
||||
}
|
||||
|
||||
public Filter filter() {
|
||||
|
@ -63,11 +65,11 @@ public class NotDeletedFilter extends Filter {
|
|||
|
||||
private final DocIdSet innerSet;
|
||||
|
||||
private final IndexReader reader;
|
||||
private final Bits liveDocs;
|
||||
|
||||
NotDeletedDocIdSet(DocIdSet innerSet, IndexReader reader) {
|
||||
NotDeletedDocIdSet(DocIdSet innerSet, Bits liveDocs) {
|
||||
this.innerSet = innerSet;
|
||||
this.reader = reader;
|
||||
this.liveDocs = liveDocs;
|
||||
}
|
||||
|
||||
@Override
|
||||
|
@ -76,22 +78,22 @@ public class NotDeletedFilter extends Filter {
|
|||
if (iterator == null) {
|
||||
return null;
|
||||
}
|
||||
return new NotDeletedDocIdSetIterator(iterator, reader);
|
||||
return new NotDeletedDocIdSetIterator(iterator, liveDocs);
|
||||
}
|
||||
}
|
||||
|
||||
static class NotDeletedDocIdSetIterator extends FilteredDocIdSetIterator {
|
||||
|
||||
private final IndexReader reader;
|
||||
private final Bits liveDocs;
|
||||
|
||||
NotDeletedDocIdSetIterator(DocIdSetIterator innerIter, IndexReader reader) {
|
||||
NotDeletedDocIdSetIterator(DocIdSetIterator innerIter, Bits liveDocs) {
|
||||
super(innerIter);
|
||||
this.reader = reader;
|
||||
this.liveDocs = liveDocs;
|
||||
}
|
||||
|
||||
@Override
|
||||
protected boolean match(int doc) throws IOException {
|
||||
return !reader.isDeleted(doc);
|
||||
protected boolean match(int doc) {
|
||||
return liveDocs == null || liveDocs.get(doc);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
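reader.isDeleted(doc) is gone in Lucene 4.0; deletions are exposed the other way round as a Bits of live documents, where null means nothing is deleted. The equivalent check, as a one-method sketch:

import org.apache.lucene.index.AtomicReader;
import org.apache.lucene.util.Bits;

class LiveDocsCheck {
    /** Lucene 3.x: !reader.isDeleted(doc). Lucene 4.0: consult the segment's liveDocs. */
    static boolean isLive(AtomicReader reader, int doc) {
        Bits liveDocs = reader.getLiveDocs();
        return liveDocs == null || liveDocs.get(doc);   // null liveDocs means "no deletions"
    }
}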
|
@ -19,9 +19,11 @@
|
|||
|
||||
package org.elasticsearch.common.lucene.search;
|
||||
|
||||
import org.apache.lucene.index.AtomicReaderContext;
|
||||
import org.apache.lucene.index.IndexReader;
|
||||
import org.apache.lucene.search.DocIdSet;
|
||||
import org.apache.lucene.search.Filter;
|
||||
import org.apache.lucene.util.Bits;
|
||||
import org.elasticsearch.common.lucene.docset.AllDocSet;
|
||||
import org.elasticsearch.common.lucene.docset.DocSet;
|
||||
import org.elasticsearch.common.lucene.docset.NotDocIdSet;
|
||||
|
@ -45,15 +47,16 @@ public class NotFilter extends Filter {
|
|||
}
|
||||
|
||||
@Override
|
||||
public DocIdSet getDocIdSet(IndexReader reader) throws IOException {
|
||||
DocIdSet set = filter.getDocIdSet(reader);
|
||||
public DocIdSet getDocIdSet(AtomicReaderContext context, Bits acceptDocs) throws IOException {
|
||||
// LUCENE 4 UPGRADE: For leave acceptedDocs null, until we figure out how to deal with deleted docs...
|
||||
DocIdSet set = filter.getDocIdSet(context, null);
|
||||
if (set == null) {
|
||||
return new AllDocSet(reader.maxDoc());
|
||||
return new AllDocSet(context.reader().maxDoc());
|
||||
}
|
||||
if (set instanceof DocSet) {
|
||||
return new NotDocSet((DocSet) set, reader.maxDoc());
|
||||
return new NotDocSet((DocSet) set, context.reader().maxDoc());
|
||||
}
|
||||
return new NotDocIdSet(set, reader.maxDoc());
|
||||
return new NotDocIdSet(set, context.reader().maxDoc());
|
||||
}
|
||||
|
||||
@Override
|
||||
|
|
|
@ -20,9 +20,11 @@
|
|||
package org.elasticsearch.common.lucene.search;
|
||||
|
||||
import com.google.common.collect.Lists;
|
||||
import org.apache.lucene.index.AtomicReaderContext;
|
||||
import org.apache.lucene.index.IndexReader;
|
||||
import org.apache.lucene.search.DocIdSet;
|
||||
import org.apache.lucene.search.Filter;
|
||||
import org.apache.lucene.util.Bits;
|
||||
import org.elasticsearch.common.lucene.docset.DocSet;
|
||||
import org.elasticsearch.common.lucene.docset.OrDocIdSet;
|
||||
import org.elasticsearch.common.lucene.docset.OrDocSet;
|
||||
|
@ -46,14 +48,16 @@ public class OrFilter extends Filter {
|
|||
}
|
||||
|
||||
@Override
|
||||
public DocIdSet getDocIdSet(IndexReader reader) throws IOException {
|
||||
public DocIdSet getDocIdSet(AtomicReaderContext context, Bits acceptDocs) throws IOException {
|
||||
if (filters.size() == 1) {
|
||||
return filters.get(0).getDocIdSet(reader);
|
||||
// LUCENE 4 UPGRADE: For leave acceptedDocs null, until we figure out how to deal with deleted docs...
|
||||
return filters.get(0).getDocIdSet(context, null);
|
||||
}
|
||||
List sets = Lists.newArrayListWithExpectedSize(filters.size());
|
||||
boolean allAreDocSet = true;
|
||||
for (Filter filter : filters) {
|
||||
DocIdSet set = filter.getDocIdSet(reader);
|
||||
// LUCENE 4 UPGRADE: For leave acceptedDocs null, until we figure out how to deal with deleted docs...
|
||||
DocIdSet set = filter.getDocIdSet(context, null);
|
||||
if (set == null) { // none matching for this filter, continue
|
||||
continue;
|
||||
}
|
||||
|
|
|
@@ -19,13 +19,11 @@

package org.elasticsearch.common.lucene.search;

import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.Term;
import org.apache.lucene.index.TermDocs;
import org.apache.lucene.index.*;
import org.apache.lucene.search.DocIdSet;
import org.apache.lucene.search.Filter;
import org.apache.lucene.util.Bits;
import org.apache.lucene.util.FixedBitSet;
import org.elasticsearch.common.lucene.Lucene;

import java.io.IOException;

@@ -45,26 +43,25 @@ public class TermFilter extends Filter {
    }

    @Override
    public DocIdSet getDocIdSet(IndexReader reader) throws IOException {
        FixedBitSet result = null;
        TermDocs td = reader.termDocs();
        try {
            td.seek(term);
            // batch read, in Lucene 4.0 its no longer needed
            int[] docs = new int[Lucene.BATCH_ENUM_DOCS];
            int[] freqs = new int[Lucene.BATCH_ENUM_DOCS];
            int number = td.read(docs, freqs);
            if (number > 0) {
                result = new FixedBitSet(reader.maxDoc());
                while (number > 0) {
                    for (int i = 0; i < number; i++) {
                        result.set(docs[i]);
                    }
                    number = td.read(docs, freqs);
                }
            }
        } finally {
            td.close();
    public DocIdSet getDocIdSet(AtomicReaderContext context, Bits acceptDocs) throws IOException {
        Terms terms = context.reader().terms(term.field());
        if (terms == null) {
            return null;
        }

        TermsEnum termsEnum = terms.iterator(null);
        if (!termsEnum.seekExact(term.bytes(), false)) {
            return null;
        }
        DocsEnum docsEnum = termsEnum.docs(acceptDocs, null);
        int docId = docsEnum.nextDoc();
        if (docId == DocsEnum.NO_MORE_DOCS) {
            return null;
        }

        final FixedBitSet result = new FixedBitSet(context.reader().maxDoc());
        for (; docId < DocsEnum.NO_MORE_DOCS; docId = docsEnum.nextDoc()) {
            result.set(docId);
        }
        return result;
    }

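The seekExact/docs sequence above is the general Lucene 4.0 replacement for TermDocs. As a usage sketch (illustrative names, simplified), the same steps answer "how many live documents contain this term in a segment" by passing the segment's liveDocs as acceptDocs:

import org.apache.lucene.index.AtomicReader;
import org.apache.lucene.index.DocsEnum;
import org.apache.lucene.index.Term;
import org.apache.lucene.index.Terms;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.search.DocIdSetIterator;

import java.io.IOException;

class TermDocCounter {
    /** Counts documents containing the term in one segment, skipping deleted docs via liveDocs. */
    static int count(AtomicReader reader, Term term) throws IOException {
        Terms terms = reader.terms(term.field());
        if (terms == null) {
            return 0;                                      // field not indexed in this segment
        }
        TermsEnum termsEnum = terms.iterator(null);
        if (!termsEnum.seekExact(term.bytes(), false)) {
            return 0;                                      // term absent
        }
        DocsEnum docsEnum = termsEnum.docs(reader.getLiveDocs(), null);
        int count = 0;
        for (int doc = docsEnum.nextDoc(); doc != DocIdSetIterator.NO_MORE_DOCS; doc = docsEnum.nextDoc()) {
            count++;
        }
        return count;
    }
}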
@ -19,11 +19,12 @@
|
|||
|
||||
package org.elasticsearch.common.lucene.search;
|
||||
|
||||
import org.apache.lucene.index.IndexReader;
|
||||
import org.apache.lucene.index.AtomicReaderContext;
|
||||
import org.apache.lucene.queries.FilterClause;
|
||||
import org.apache.lucene.search.BooleanClause;
|
||||
import org.apache.lucene.search.DocIdSet;
|
||||
import org.apache.lucene.search.Filter;
|
||||
import org.apache.lucene.search.FilterClause;
|
||||
import org.apache.lucene.util.Bits;
|
||||
import org.apache.lucene.util.FixedBitSet;
|
||||
import org.elasticsearch.common.lucene.docset.DocSet;
|
||||
import org.elasticsearch.common.lucene.docset.DocSets;
|
||||
|
@ -41,9 +42,9 @@ public class XBooleanFilter extends Filter {
|
|||
ArrayList<Filter> notFilters = null;
|
||||
ArrayList<Filter> mustFilters = null;
|
||||
|
||||
private DocIdSet getDISI(ArrayList<Filter> filters, int index, IndexReader reader)
|
||||
private DocIdSet getDISI(ArrayList<Filter> filters, int index, AtomicReaderContext context, Bits acceptedDocs)
|
||||
throws IOException {
|
||||
DocIdSet docIdSet = filters.get(index).getDocIdSet(reader);
|
||||
DocIdSet docIdSet = filters.get(index).getDocIdSet(context, acceptedDocs);
|
||||
if (docIdSet == DocIdSet.EMPTY_DOCIDSET || docIdSet == DocSet.EMPTY_DOC_SET) {
|
||||
return null;
|
||||
}
|
||||
|
@ -67,23 +68,26 @@ public class XBooleanFilter extends Filter {
|
|||
* of the filters that have been added.
|
||||
*/
|
||||
@Override
|
||||
public DocIdSet getDocIdSet(IndexReader reader) throws IOException {
|
||||
public DocIdSet getDocIdSet(AtomicReaderContext context, Bits acceptedDocs) throws IOException {
|
||||
FixedBitSet res = null;
|
||||
|
||||
if (mustFilters == null && notFilters == null && shouldFilters != null && shouldFilters.size() == 1) {
|
||||
return shouldFilters.get(0).getDocIdSet(reader);
|
||||
// LUCENE 4 UPGRADE: For leave acceptedDocs null, until we figure out how to deal with deleted docs...
|
||||
return shouldFilters.get(0).getDocIdSet(context, null);
|
||||
}
|
||||
|
||||
if (shouldFilters == null && notFilters == null && mustFilters != null && mustFilters.size() == 1) {
|
||||
return mustFilters.get(0).getDocIdSet(reader);
|
||||
// LUCENE 4 UPGRADE: For leave acceptedDocs null, until we figure out how to deal with deleted docs...
|
||||
return mustFilters.get(0).getDocIdSet(context, null);
|
||||
}
|
||||
|
||||
if (shouldFilters != null) {
|
||||
for (int i = 0; i < shouldFilters.size(); i++) {
|
||||
final DocIdSet disi = getDISI(shouldFilters, i, reader);
|
||||
// LUCENE 4 UPGRADE: For leave acceptedDocs null, until we figure out how to deal with deleted docs...
|
||||
final DocIdSet disi = getDISI(shouldFilters, i, context, null);
|
||||
if (disi == null) continue;
|
||||
if (res == null) {
|
||||
res = new FixedBitSet(reader.maxDoc());
|
||||
res = new FixedBitSet(context.reader().maxDoc());
|
||||
}
|
||||
DocSets.or(res, disi);
|
||||
}
|
||||
|
@ -98,10 +102,11 @@ public class XBooleanFilter extends Filter {
|
|||
if (notFilters != null) {
|
||||
for (int i = 0; i < notFilters.size(); i++) {
|
||||
if (res == null) {
|
||||
res = new FixedBitSet(reader.maxDoc());
|
||||
res.set(0, reader.maxDoc()); // NOTE: may set bits on deleted docs
|
||||
res = new FixedBitSet(context.reader().maxDoc());
|
||||
res.set(0, context.reader().maxDoc()); // NOTE: may set bits on deleted docs
|
||||
}
|
||||
final DocIdSet disi = getDISI(notFilters, i, reader);
|
||||
// LUCENE 4 UPGRADE: For leave acceptedDocs null, until we figure out how to deal with deleted docs...
|
||||
final DocIdSet disi = getDISI(notFilters, i, context, null);
|
||||
if (disi != null) {
|
||||
DocSets.andNot(res, disi);
|
||||
}
|
||||
|
@ -110,12 +115,13 @@ public class XBooleanFilter extends Filter {
|
|||
|
||||
if (mustFilters != null) {
|
||||
for (int i = 0; i < mustFilters.size(); i++) {
|
||||
final DocIdSet disi = getDISI(mustFilters, i, reader);
|
||||
// LUCENE 4 UPGRADE: For leave acceptedDocs null, until we figure out how to deal with deleted docs...
|
||||
final DocIdSet disi = getDISI(mustFilters, i, context, null);
|
||||
if (disi == null) {
|
||||
return null;
|
||||
}
|
||||
if (res == null) {
|
||||
res = new FixedBitSet(reader.maxDoc());
|
||||
res = new FixedBitSet(context.reader().maxDoc());
|
||||
DocSets.or(res, disi);
|
||||
} else {
|
||||
DocSets.and(res, disi);
|
||||
|
@ -219,10 +225,10 @@ public class XBooleanFilter extends Filter {
|
|||
|
||||
private void appendFilters(ArrayList<Filter> filters, String occurString, StringBuilder buffer) {
|
||||
if (filters != null) {
|
||||
for (int i = 0; i < filters.size(); i++) {
|
||||
for (Filter filter : filters) {
|
||||
buffer.append(' ');
|
||||
buffer.append(occurString);
|
||||
buffer.append(filters.get(i).toString());
|
||||
buffer.append(filter.toString());
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
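DocSets.or/and/andNot used above are Elasticsearch helpers; FixedBitSet itself offers the same in-place operations over DocIdSetIterators in Lucene 4.0. An illustrative combiner, not XBooleanFilter's exact logic:

import org.apache.lucene.search.DocIdSet;
import org.apache.lucene.search.DocIdSetIterator;
import org.apache.lucene.util.FixedBitSet;

import java.io.IOException;

class BitSetCombinerSketch {
    /** OR the optional clause in, AND the required one, then knock out the negated one. */
    static FixedBitSet combine(int maxDoc, DocIdSet should, DocIdSet must, DocIdSet not) throws IOException {
        FixedBitSet bits = new FixedBitSet(maxDoc);
        DocIdSetIterator shouldIt = should.iterator();
        if (shouldIt != null) {
            bits.or(shouldIt);                 // in-place OR over the iterator
        }
        DocIdSetIterator mustIt = must.iterator();
        if (mustIt == null) {
            return new FixedBitSet(maxDoc);    // required clause matches nothing -> empty result
        }
        bits.and(mustIt);                      // in-place AND
        DocIdSetIterator notIt = not.iterator();
        if (notIt != null) {
            bits.andNot(notIt);                // in-place AND NOT
        }
        return bits;
    }
}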
|
@@ -19,7 +19,7 @@

package org.elasticsearch.common.lucene.search.function;

import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.AtomicReaderContext;
import org.apache.lucene.search.Explanation;

/**

@@ -39,7 +39,7 @@ public class BoostScoreFunction implements ScoreFunction {
    }

    @Override
    public void setNextReader(IndexReader reader) {
    public void setNextReader(AtomicReaderContext context) {
        // nothing to do here...
    }

@ -19,9 +19,11 @@
|
|||
|
||||
package org.elasticsearch.common.lucene.search.function;
|
||||
|
||||
import org.apache.lucene.index.AtomicReaderContext;
|
||||
import org.apache.lucene.index.IndexReader;
|
||||
import org.apache.lucene.index.Term;
|
||||
import org.apache.lucene.search.*;
|
||||
import org.apache.lucene.util.Bits;
|
||||
import org.apache.lucene.util.ToStringUtils;
|
||||
import org.elasticsearch.common.lucene.docset.DocSet;
|
||||
import org.elasticsearch.common.lucene.docset.DocSets;
|
||||
|
@ -73,13 +75,11 @@ public class FiltersFunctionScoreQuery extends Query {
|
|||
final FilterFunction[] filterFunctions;
|
||||
final ScoreMode scoreMode;
|
||||
final float maxBoost;
|
||||
DocSet[] docSets;
|
||||
|
||||
public FiltersFunctionScoreQuery(Query subQuery, ScoreMode scoreMode, FilterFunction[] filterFunctions, float maxBoost) {
|
||||
this.subQuery = subQuery;
|
||||
this.scoreMode = scoreMode;
|
||||
this.filterFunctions = filterFunctions;
|
||||
this.docSets = new DocSet[filterFunctions.length];
|
||||
this.maxBoost = maxBoost;
|
||||
}
|
||||
|
||||
|
@ -106,72 +106,69 @@ public class FiltersFunctionScoreQuery extends Query {
|
|||
}
|
||||
|
||||
@Override
|
||||
public Weight createWeight(Searcher searcher) throws IOException {
|
||||
return new CustomBoostFactorWeight(searcher);
|
||||
public Weight createWeight(IndexSearcher searcher) throws IOException {
|
||||
Weight subQueryWeight = subQuery.createWeight(searcher);
|
||||
return new CustomBoostFactorWeight(subQueryWeight, filterFunctions.length);
|
||||
}
|
||||
|
||||
class CustomBoostFactorWeight extends Weight {
|
||||
Searcher searcher;
|
||||
Weight subQueryWeight;
|
||||
|
||||
public CustomBoostFactorWeight(Searcher searcher) throws IOException {
|
||||
this.searcher = searcher;
|
||||
this.subQueryWeight = subQuery.weight(searcher);
|
||||
final Weight subQueryWeight;
|
||||
final DocSet[] docSets;
|
||||
|
||||
public CustomBoostFactorWeight(Weight subQueryWeight, int filterFunctionLength) throws IOException {
|
||||
this.subQueryWeight = subQueryWeight;
|
||||
this.docSets = new DocSet[filterFunctionLength];
|
||||
}
|
||||
|
||||
public Query getQuery() {
|
||||
return FiltersFunctionScoreQuery.this;
|
||||
}
|
||||
|
||||
public float getValue() {
|
||||
return getBoost();
|
||||
}
|
||||
|
||||
@Override
|
||||
public float sumOfSquaredWeights() throws IOException {
|
||||
float sum = subQueryWeight.sumOfSquaredWeights();
|
||||
public float getValueForNormalization() throws IOException {
|
||||
float sum = subQueryWeight.getValueForNormalization();
|
||||
sum *= getBoost() * getBoost();
|
||||
return sum;
|
||||
}
|
||||
|
||||
@Override
|
||||
public void normalize(float norm) {
|
||||
norm *= getBoost();
|
||||
subQueryWeight.normalize(norm);
|
||||
public void normalize(float norm, float topLevelBoost) {
|
||||
subQueryWeight.normalize(norm, topLevelBoost * getBoost());
|
||||
}
|
||||
|
||||
@Override
|
||||
public Scorer scorer(IndexReader reader, boolean scoreDocsInOrder, boolean topScorer) throws IOException {
|
||||
Scorer subQueryScorer = subQueryWeight.scorer(reader, scoreDocsInOrder, false);
|
||||
public Scorer scorer(AtomicReaderContext context, boolean scoreDocsInOrder, boolean topScorer, Bits acceptDocs) throws IOException {
|
||||
Scorer subQueryScorer = subQueryWeight.scorer(context, scoreDocsInOrder, topScorer, acceptDocs);
|
||||
if (subQueryScorer == null) {
|
||||
return null;
|
||||
}
|
||||
for (int i = 0; i < filterFunctions.length; i++) {
|
||||
FilterFunction filterFunction = filterFunctions[i];
|
||||
filterFunction.function.setNextReader(reader);
|
||||
docSets[i] = DocSets.convert(reader, filterFunction.filter.getDocIdSet(reader));
|
||||
filterFunction.function.setNextReader(context);
|
||||
docSets[i] = DocSets.convert(context.reader(), filterFunction.filter.getDocIdSet(context, acceptDocs));
|
||||
}
|
||||
return new CustomBoostFactorScorer(getSimilarity(searcher), this, subQueryScorer, scoreMode, filterFunctions, maxBoost, docSets);
|
||||
return new CustomBoostFactorScorer(this, subQueryScorer, scoreMode, filterFunctions, maxBoost, docSets);
|
||||
}
|
||||
|
||||
@Override
|
||||
public Explanation explain(IndexReader reader, int doc) throws IOException {
|
||||
Explanation subQueryExpl = subQueryWeight.explain(reader, doc);
|
||||
public Explanation explain(AtomicReaderContext context, int doc) throws IOException {
|
||||
Explanation subQueryExpl = subQueryWeight.explain(context, doc);
|
||||
if (!subQueryExpl.isMatch()) {
|
||||
return subQueryExpl;
|
||||
}
|
||||
|
||||
if (scoreMode == ScoreMode.First) {
|
||||
for (FilterFunction filterFunction : filterFunctions) {
|
||||
DocSet docSet = DocSets.convert(reader, filterFunction.filter.getDocIdSet(reader));
|
||||
DocSet docSet = DocSets.convert(context.reader(), filterFunction.filter.getDocIdSet(context, context.reader().getLiveDocs()));
|
||||
if (docSet.get(doc)) {
|
||||
filterFunction.function.setNextReader(reader);
|
||||
filterFunction.function.setNextReader(context);
|
||||
Explanation functionExplanation = filterFunction.function.explainFactor(doc);
|
||||
float sc = getValue() * subQueryExpl.getValue() * functionExplanation.getValue();
|
||||
float sc = getBoost() * subQueryExpl.getValue() * functionExplanation.getValue();
|
||||
Explanation filterExplanation = new ComplexExplanation(true, sc, "custom score, product of:");
|
||||
filterExplanation.addDetail(new Explanation(1.0f, "match filter: " + filterFunction.filter.toString()));
|
||||
filterExplanation.addDetail(functionExplanation);
|
||||
filterExplanation.addDetail(new Explanation(getValue(), "queryBoost"));
|
||||
filterExplanation.addDetail(new Explanation(getBoost(), "queryBoost"));
|
||||
|
||||
// top level score = subquery.score * filter.score (this already has the query boost)
|
||||
float topLevelScore = subQueryExpl.getValue() * sc;
|
||||
|
@ -189,9 +186,9 @@ public class FiltersFunctionScoreQuery extends Query {
|
|||
float min = Float.POSITIVE_INFINITY;
|
||||
ArrayList<Explanation> filtersExplanations = new ArrayList<Explanation>();
|
||||
for (FilterFunction filterFunction : filterFunctions) {
|
||||
DocSet docSet = DocSets.convert(reader, filterFunction.filter.getDocIdSet(reader));
|
||||
DocSet docSet = DocSets.convert(context.reader(), filterFunction.filter.getDocIdSet(context, context.reader().getLiveDocs()));
|
||||
if (docSet.get(doc)) {
|
||||
filterFunction.function.setNextReader(reader);
|
||||
filterFunction.function.setNextReader(context);
|
||||
Explanation functionExplanation = filterFunction.function.explainFactor(doc);
|
||||
float factor = functionExplanation.getValue();
|
||||
count++;
|
||||
|
@ -202,7 +199,7 @@ public class FiltersFunctionScoreQuery extends Query {
|
|||
Explanation res = new ComplexExplanation(true, factor, "custom score, product of:");
|
||||
res.addDetail(new Explanation(1.0f, "match filter: " + filterFunction.filter.toString()));
|
||||
res.addDetail(functionExplanation);
|
||||
res.addDetail(new Explanation(getValue(), "queryBoost"));
|
||||
res.addDetail(new Explanation(getBoost(), "queryBoost"));
|
||||
filtersExplanations.add(res);
|
||||
}
|
||||
}
|
||||
|
@ -229,7 +226,7 @@ public class FiltersFunctionScoreQuery extends Query {
|
|||
if (factor > maxBoost) {
|
||||
factor = maxBoost;
|
||||
}
|
||||
float sc = factor * subQueryExpl.getValue() * getValue();
|
||||
float sc = factor * subQueryExpl.getValue() * getBoost();
|
||||
Explanation res = new ComplexExplanation(true, sc, "custom score, score mode [" + scoreMode.toString().toLowerCase() + "]");
|
||||
res.addDetail(subQueryExpl);
|
||||
for (Explanation explanation : filtersExplanations) {
|
||||
|
@ -239,27 +236,28 @@ public class FiltersFunctionScoreQuery extends Query {
|
|||
}
|
||||
}
|
||||
|
||||
float sc = getValue() * subQueryExpl.getValue();
|
||||
float sc = getBoost() * subQueryExpl.getValue();
|
||||
Explanation res = new ComplexExplanation(true, sc, "custom score, no filter match, product of:");
|
||||
res.addDetail(subQueryExpl);
|
||||
res.addDetail(new Explanation(getValue(), "queryBoost"));
|
||||
res.addDetail(new Explanation(getBoost(), "queryBoost"));
|
||||
return res;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
static class CustomBoostFactorScorer extends Scorer {
|
||||
private final float subQueryWeight;
|
||||
|
||||
private final float subQueryBoost;
|
||||
private final Scorer scorer;
|
||||
private final FilterFunction[] filterFunctions;
|
||||
private final ScoreMode scoreMode;
|
||||
private final float maxBoost;
|
||||
private final DocSet[] docSets;
|
||||
|
||||
private CustomBoostFactorScorer(Similarity similarity, CustomBoostFactorWeight w, Scorer scorer,
|
||||
ScoreMode scoreMode, FilterFunction[] filterFunctions, float maxBoost, DocSet[] docSets) throws IOException {
|
||||
super(similarity);
|
||||
this.subQueryWeight = w.getValue();
|
||||
private CustomBoostFactorScorer(CustomBoostFactorWeight w, Scorer scorer, ScoreMode scoreMode,
|
||||
FilterFunction[] filterFunctions, float maxBoost, DocSet[] docSets) throws IOException {
|
||||
super(w);
|
||||
this.subQueryBoost = w.getQuery().getBoost();
|
||||
this.scorer = scorer;
|
||||
this.scoreMode = scoreMode;
|
||||
this.filterFunctions = filterFunctions;
|
||||
|
@ -339,7 +337,12 @@ public class FiltersFunctionScoreQuery extends Query {
|
|||
factor = maxBoost;
|
||||
}
|
||||
float score = scorer.score();
|
||||
return subQueryWeight * score * factor;
|
||||
return subQueryBoost * score * factor;
|
||||
}
|
||||
|
||||
@Override
|
||||
public float freq() throws IOException {
|
||||
return scorer.freq();
|
||||
}
|
||||
}
|
||||
|
||||
|
|
|
@ -19,9 +19,11 @@
|
|||
|
||||
package org.elasticsearch.common.lucene.search.function;
|
||||
|
||||
import org.apache.lucene.index.AtomicReaderContext;
|
||||
import org.apache.lucene.index.IndexReader;
|
||||
import org.apache.lucene.index.Term;
|
||||
import org.apache.lucene.search.*;
|
||||
import org.apache.lucene.util.Bits;
|
||||
import org.apache.lucene.util.ToStringUtils;
|
||||
|
||||
import java.io.IOException;
|
||||
|
@ -63,76 +65,72 @@ public class FunctionScoreQuery extends Query {
|
|||
}
|
||||
|
||||
@Override
|
||||
public Weight createWeight(Searcher searcher) throws IOException {
|
||||
return new CustomBoostFactorWeight(searcher);
|
||||
public Weight createWeight(IndexSearcher searcher) throws IOException {
|
||||
Weight subQueryWeight = subQuery.createWeight(searcher);
|
||||
return new CustomBoostFactorWeight(subQueryWeight);
|
||||
}
|
||||
|
||||
class CustomBoostFactorWeight extends Weight {
|
||||
Searcher searcher;
|
||||
Weight subQueryWeight;
|
||||
|
||||
public CustomBoostFactorWeight(Searcher searcher) throws IOException {
|
||||
this.searcher = searcher;
|
||||
this.subQueryWeight = subQuery.weight(searcher);
|
||||
final Weight subQueryWeight;
|
||||
|
||||
public CustomBoostFactorWeight(Weight subQueryWeight) throws IOException {
|
||||
this.subQueryWeight = subQueryWeight;
|
||||
}
|
||||
|
||||
public Query getQuery() {
|
||||
return FunctionScoreQuery.this;
|
||||
}
|
||||
|
||||
public float getValue() {
|
||||
return getBoost();
|
||||
}
|
||||
|
||||
@Override
|
||||
public float sumOfSquaredWeights() throws IOException {
|
||||
float sum = subQueryWeight.sumOfSquaredWeights();
|
||||
public float getValueForNormalization() throws IOException {
|
||||
float sum = subQueryWeight.getValueForNormalization();
|
||||
sum *= getBoost() * getBoost();
|
||||
return sum;
|
||||
}
|
||||
|
||||
@Override
|
||||
public void normalize(float norm) {
|
||||
norm *= getBoost();
|
||||
subQueryWeight.normalize(norm);
|
||||
public void normalize(float norm, float topLevelBoost) {
|
||||
subQueryWeight.normalize(norm, topLevelBoost * getBoost());
|
||||
}
|
||||
|
||||
@Override
|
||||
public Scorer scorer(IndexReader reader, boolean scoreDocsInOrder, boolean topScorer) throws IOException {
|
||||
Scorer subQueryScorer = subQueryWeight.scorer(reader, scoreDocsInOrder, false);
|
||||
public Scorer scorer(AtomicReaderContext context, boolean scoreDocsInOrder, boolean topScorer, Bits acceptDocs) throws IOException {
|
||||
Scorer subQueryScorer = subQueryWeight.scorer(context, scoreDocsInOrder, topScorer, acceptDocs);
|
||||
if (subQueryScorer == null) {
|
||||
return null;
|
||||
}
|
||||
function.setNextReader(reader);
|
||||
return new CustomBoostFactorScorer(getSimilarity(searcher), this, subQueryScorer, function);
|
||||
function.setNextReader(context);
|
||||
return new CustomBoostFactorScorer(this, subQueryScorer, function);
|
||||
}
|
||||
|
||||
@Override
|
||||
public Explanation explain(IndexReader reader, int doc) throws IOException {
|
||||
Explanation subQueryExpl = subQueryWeight.explain(reader, doc);
|
||||
public Explanation explain(AtomicReaderContext context, int doc) throws IOException {
|
||||
Explanation subQueryExpl = subQueryWeight.explain(context, doc);
|
||||
if (!subQueryExpl.isMatch()) {
|
||||
return subQueryExpl;
|
||||
}
|
||||
|
||||
function.setNextReader(reader);
|
||||
function.setNextReader(context);
|
||||
Explanation functionExplanation = function.explainScore(doc, subQueryExpl);
|
||||
float sc = getValue() * functionExplanation.getValue();
|
||||
float sc = getBoost() * functionExplanation.getValue();
|
||||
Explanation res = new ComplexExplanation(true, sc, "custom score, product of:");
|
||||
res.addDetail(functionExplanation);
|
||||
res.addDetail(new Explanation(getValue(), "queryBoost"));
|
||||
res.addDetail(new Explanation(getBoost(), "queryBoost"));
|
||||
return res;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
static class CustomBoostFactorScorer extends Scorer {
|
||||
private final float subQueryWeight;
|
||||
|
||||
private final float subQueryBoost;
|
||||
private final Scorer scorer;
|
||||
private final ScoreFunction function;
|
||||
|
||||
private CustomBoostFactorScorer(Similarity similarity, CustomBoostFactorWeight w, Scorer scorer, ScoreFunction function) throws IOException {
|
||||
super(similarity);
|
||||
this.subQueryWeight = w.getValue();
|
||||
private CustomBoostFactorScorer(CustomBoostFactorWeight w, Scorer scorer, ScoreFunction function) throws IOException {
|
||||
super(w);
|
||||
this.subQueryBoost = w.getQuery().getBoost();
|
||||
this.scorer = scorer;
|
||||
this.function = function;
|
||||
}
|
||||
|
@ -154,7 +152,12 @@ public class FunctionScoreQuery extends Query {
|
|||
|
||||
@Override
|
||||
public float score() throws IOException {
|
||||
return subQueryWeight * function.score(scorer.docID(), scorer.score());
|
||||
return subQueryBoost * function.score(scorer.docID(), scorer.score());
|
||||
}
|
||||
|
||||
@Override
|
||||
public float freq() throws IOException {
|
||||
return scorer.freq();
|
||||
}
|
||||
}
|
||||
|
||||
|
|
|
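Both custom-score weights in this commit follow the new normalization contract: getValueForNormalization replaces sumOfSquaredWeights, and normalize now also receives the accumulated top-level boost. A simplified sketch of how a searcher drives that contract, roughly what IndexSearcher.createNormalizedWeight does (for illustration only):

import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.Weight;

import java.io.IOException;

class WeightNormalizationSketch {
    static Weight normalizedWeight(IndexSearcher searcher, Query query) throws IOException {
        Weight weight = query.createWeight(searcher);
        float v = weight.getValueForNormalization();            // formerly sumOfSquaredWeights()
        float norm = searcher.getSimilarity().queryNorm(v);
        if (Float.isInfinite(norm) || Float.isNaN(norm)) {
            norm = 1.0f;
        }
        weight.normalize(norm, 1.0f);                           // top-level boost starts at 1.0
        return weight;
    }
}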
@@ -19,7 +19,7 @@

package org.elasticsearch.common.lucene.search.function;

import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.AtomicReaderContext;
import org.apache.lucene.search.Explanation;

/**

@@ -27,7 +27,7 @@ import org.apache.lucene.search.Explanation;
 */
public interface ScoreFunction {

    void setNextReader(IndexReader reader);
    void setNextReader(AtomicReaderContext context);

    float score(int docId, float subQueryScore);

@@ -5,7 +5,6 @@ import com.spatial4j.core.shape.Point;
import com.spatial4j.core.shape.Rectangle;
import com.spatial4j.core.shape.Shape;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.Fieldable;
import org.apache.lucene.search.Filter;
import org.apache.lucene.search.Query;
import org.elasticsearch.common.geo.GeoShapeConstants;

@@ -56,12 +55,13 @@ public abstract class SpatialStrategy {
     * @param shape Shape to convert ints its indexable format
     * @return Fieldable for indexing the Shape
     */
    public Fieldable createField(Shape shape) {
    public Field createField(Shape shape) {
        int detailLevel = prefixTree.getLevelForDistance(
                calcDistanceFromErrPct(shape, distanceErrorPct, GeoShapeConstants.SPATIAL_CONTEXT));
        List<Node> nodes = prefixTree.getNodes(shape, detailLevel, true);
        NodeTokenStream tokenStream = nodeTokenStream.get();
        tokenStream.setNodes(nodes);
        // LUCENE 4 Upgrade: We should pass in the FieldType and use it here
        return new Field(fieldName.indexName(), tokenStream);
    }

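The "pass in the FieldType" TODO above maps onto the Lucene 4.0 Field(String, TokenStream, FieldType) constructor. A hedged sketch of such a factory; the concrete flags are assumptions for an indexed-only spatial field, not what the commit ships:

import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.FieldType;

class TokenStreamFieldFactory {
    /** Builds an indexed, un-stored field around a pre-built TokenStream. */
    static Field create(String name, TokenStream tokenStream) {
        FieldType type = new FieldType();
        type.setIndexed(true);
        type.setTokenized(true);
        type.setStored(false);
        type.setOmitNorms(true);   // assumption: spatial prefix tokens do not need norms
        type.freeze();             // make the type immutable before use
        return new Field(name, tokenStream, type);
    }
}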
|
@@ -1,185 +0,0 @@
/*
 * Licensed to ElasticSearch and Shay Banon under one
 * or more contributor license agreements. See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership. ElasticSearch licenses this
 * file to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License. You may obtain a copy of the License at
 *
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied. See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */

package org.elasticsearch.common.lucene.store;

import com.google.common.collect.ImmutableSet;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.IndexInput;
import org.apache.lucene.store.IndexOutput;
import org.elasticsearch.index.store.support.ForceSyncDirectory;

import java.io.IOException;
import java.util.*;

/**
 * A Directory instance that switches files between
 * two other Directory instances.
 * <p/>
 * <p>Files with the specified extensions are placed in the
 * primary directory; others are placed in the secondary
 * directory.
 */
public class SwitchDirectory extends Directory implements ForceSyncDirectory {

    private final Directory secondaryDir;

    private final Directory primaryDir;

    private final ImmutableSet<String> primaryExtensions;

    private boolean doClose;

    public SwitchDirectory(Set<String> primaryExtensions, Directory primaryDir, Directory secondaryDir, boolean doClose) {
        this.primaryExtensions = ImmutableSet.copyOf(primaryExtensions);
        this.primaryDir = primaryDir;
        this.secondaryDir = secondaryDir;
        this.doClose = doClose;
        this.lockFactory = primaryDir.getLockFactory();
    }

    public ImmutableSet<String> primaryExtensions() {
        return primaryExtensions;
    }

    /**
     * Return the primary directory
     */
    public Directory primaryDir() {
        return primaryDir;
    }

    /**
     * Return the secondary directory
     */
    public Directory secondaryDir() {
        return secondaryDir;
    }

    @Override
    public void close() throws IOException {
        if (doClose) {
            try {
                secondaryDir.close();
            } finally {
                primaryDir.close();
            }
            doClose = false;
        }
    }

    @Override
    public String[] listAll() throws IOException {
        Set<String> files = new HashSet<String>();
        for (String f : primaryDir.listAll()) {
            files.add(f);
        }
        for (String f : secondaryDir.listAll()) {
            files.add(f);
        }
        return files.toArray(new String[files.size()]);
    }

    /**
     * Utility method to return a file's extension.
     */
    public static String getExtension(String name) {
        int i = name.lastIndexOf('.');
        if (i == -1) {
            return "";
        }
        return name.substring(i + 1, name.length());
    }

    private Directory getDirectory(String name) {
        String ext = getExtension(name);
        if (primaryExtensions.contains(ext)) {
            return primaryDir;
        } else {
            return secondaryDir;
        }
    }

    @Override
    public boolean fileExists(String name) throws IOException {
        return getDirectory(name).fileExists(name);
    }

    @Override
    public long fileModified(String name) throws IOException {
        return getDirectory(name).fileModified(name);
    }

    @Override
    public void touchFile(String name) throws IOException {
        getDirectory(name).touchFile(name);
    }

    @Override
    public void deleteFile(String name) throws IOException {
        getDirectory(name).deleteFile(name);
    }

    @Override
    public long fileLength(String name) throws IOException {
        return getDirectory(name).fileLength(name);
    }

    @Override
    public IndexOutput createOutput(String name) throws IOException {
        return getDirectory(name).createOutput(name);
    }

    @Override
    public void sync(Collection<String> names) throws IOException {
        List<String> primaryNames = new ArrayList<String>();
        List<String> secondaryNames = new ArrayList<String>();

        for (String name : names)
            if (primaryExtensions.contains(getExtension(name)))
                primaryNames.add(name);
            else
                secondaryNames.add(name);

        primaryDir.sync(primaryNames);
        secondaryDir.sync(secondaryNames);
    }

    @Override
    public void sync(String name) throws IOException {
        getDirectory(name).sync(name);
    }

    @Override
    public void forceSync(String name) throws IOException {
        Directory dir = getDirectory(name);
        if (dir instanceof ForceSyncDirectory) {
            ((ForceSyncDirectory) dir).forceSync(name);
        } else {
            dir.sync(name);
        }
    }

    @Override
    public IndexInput openInput(String name) throws IOException {
        return getDirectory(name).openInput(name);
    }
}

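For context, the removed SwitchDirectory routed files between two wrapped directories purely by file extension. A minimal usage sketch, assuming illustrative extensions and an illustrative on-disk path (neither appears in this commit):

    import com.google.common.collect.ImmutableSet;
    import org.apache.lucene.store.Directory;
    import org.apache.lucene.store.FSDirectory;
    import org.apache.lucene.store.RAMDirectory;
    import org.elasticsearch.common.lucene.store.SwitchDirectory;

    import java.io.File;
    import java.io.IOException;

    public class SwitchDirectoryExample {
        public static void main(String[] args) throws IOException {
            Directory primary = new RAMDirectory();                          // small files kept in memory
            Directory secondary = FSDirectory.open(new File("/tmp/index"));  // everything else on disk
            SwitchDirectory dir = new SwitchDirectory(ImmutableSet.of("del", "gen"), primary, secondary, true);
            System.out.println(dir.primaryExtensions());  // [del, gen] -> routed to the RAM directory
            dir.close();                                  // doClose == true, so both wrapped directories are closed
        }
    }
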
@@ -19,14 +19,19 @@

package org.elasticsearch.common.lucene.uid;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.PayloadAttribute;
import org.apache.lucene.document.AbstractField;
import org.apache.lucene.document.Field;
import org.apache.lucene.index.*;
import org.apache.lucene.index.AtomicReaderContext;
import org.apache.lucene.index.DocsAndPositionsEnum;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.DocIdSetIterator;
import org.apache.lucene.util.BytesRef;
import org.elasticsearch.common.Numbers;
import org.elasticsearch.common.lucene.Lucene;
import org.elasticsearch.index.mapper.internal.UidFieldMapper;

import java.io.IOException;
import java.io.Reader;

@@ -34,57 +39,49 @@ import java.io.Reader;
/**
 *
 */
public class UidField extends AbstractField {
// TODO: LUCENE 4 UPGRADE: Store version as doc values instead of as a payload.
public class UidField extends Field {

    public static class DocIdAndVersion {
        public final int docId;
        public final int docStart;
        public final long version;
        public final IndexReader reader;
        public final AtomicReaderContext reader;

        public DocIdAndVersion(int docId, long version, IndexReader reader, int docStart) {
        public DocIdAndVersion(int docId, long version, AtomicReaderContext reader) {
            this.docId = docId;
            this.version = version;
            this.reader = reader;
            this.docStart = docStart;
        }
    }

    // this works fine for nested docs since they don't have the payload which has the version
    // so we iterate till we find the one with the payload
    public static DocIdAndVersion loadDocIdAndVersion(IndexReader subReader, int docStart, Term term) {
    // LUCENE 4 UPGRADE: We can get rid of the do while loop, since there is only one _uid value (live docs are taken into account)
    public static DocIdAndVersion loadDocIdAndVersion(AtomicReaderContext context, Term term) {
        int docId = Lucene.NO_DOC;
        TermPositions uid = null;
        try {
            uid = subReader.termPositions(term);
            if (!uid.next()) {
            DocsAndPositionsEnum uid = context.reader().termPositionsEnum(term);
            if (uid == null || uid.nextDoc() == DocIdSetIterator.NO_MORE_DOCS) {
                return null; // no doc
            }
            // Note, only master docs uid have version payload, so we can use that info to not
            // take them into account
            do {
                docId = uid.doc();
                docId = uid.docID();
                uid.nextPosition();
                if (!uid.isPayloadAvailable()) {
                if (uid.getPayload() == null) {
                    continue;
                }
                if (uid.getPayloadLength() < 8) {
                if (uid.getPayload().length < 8) {
                    continue;
                }
                byte[] payload = uid.getPayload(new byte[8], 0);
                return new DocIdAndVersion(docId, Numbers.bytesToLong(payload), subReader, docStart);
            } while (uid.next());
            return new DocIdAndVersion(docId, -2, subReader, docStart);
                byte[] payload = new byte[uid.getPayload().length];
                System.arraycopy(uid.getPayload().bytes, uid.getPayload().offset, payload, 0, uid.getPayload().length);
                return new DocIdAndVersion(docId, Numbers.bytesToLong(payload), context);
            } while (uid.nextDoc() != DocIdSetIterator.NO_MORE_DOCS);
            return new DocIdAndVersion(docId, -2, context);
        } catch (Exception e) {
            return new DocIdAndVersion(docId, -2, subReader, docStart);
        } finally {
            if (uid != null) {
                try {
                    uid.close();
                } catch (IOException e) {
                    // nothing to do here...
                }
            }
            return new DocIdAndVersion(docId, -2, context);
        }
    }

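The new code path replaces the 3.x TermPositions loop with Lucene 4's DocsAndPositionsEnum. A condensed, hedged sketch of that access pattern, relying only on the calls already used in the hunk above (the helper class and method names here are made up):

    import org.apache.lucene.index.AtomicReaderContext;
    import org.apache.lucene.index.DocsAndPositionsEnum;
    import org.apache.lucene.index.Term;
    import org.apache.lucene.search.DocIdSetIterator;
    import org.apache.lucene.util.BytesRef;

    import java.io.IOException;

    final class PayloadReadSketch {
        // Returns the payload stored at the first position of the first matching doc, or null.
        static BytesRef firstPayload(AtomicReaderContext context, Term term) throws IOException {
            DocsAndPositionsEnum positions = context.reader().termPositionsEnum(term);
            if (positions == null || positions.nextDoc() == DocIdSetIterator.NO_MORE_DOCS) {
                return null;               // the term does not occur in this segment
            }
            positions.nextPosition();      // a position must be consumed before getPayload() is valid
            return positions.getPayload(); // may still be null when the position carries no payload
        }
    }
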
@@ -92,37 +89,30 @@ public class UidField extends AbstractField {
     * Load the version for the uid from the reader, returning -1 if no doc exists, or -2 if
     * no version is available (for backward comp.)
     */
    public static long loadVersion(IndexReader reader, Term term) {
        TermPositions uid = null;
    // LUCENE 4 UPGRADE: We can get rid of the do while loop, since there is only one _uid value (live docs are taken into account)
    public static long loadVersion(AtomicReaderContext context, Term term) {
        try {
            uid = reader.termPositions(term);
            if (!uid.next()) {
            DocsAndPositionsEnum uid = context.reader().termPositionsEnum(term);
            if (uid == null || uid.nextDoc() == DocIdSetIterator.NO_MORE_DOCS) {
                return -1;
            }
            // Note, only master docs uid have version payload, so we can use that info to not
            // take them into account
            do {
                uid.nextPosition();
                if (!uid.isPayloadAvailable()) {
                if (uid.getPayload() == null) {
                    continue;
                }
                if (uid.getPayloadLength() < 8) {
                if (uid.getPayload().length < 8) {
                    continue;
                }
                byte[] payload = uid.getPayload(new byte[8], 0);
                byte[] payload = new byte[uid.getPayload().length];
                System.arraycopy(uid.getPayload().bytes, uid.getPayload().offset, payload, 0, uid.getPayload().length);
                return Numbers.bytesToLong(payload);
            } while (uid.next());
            } while (uid.nextDoc() != DocIdSetIterator.NO_MORE_DOCS);
            return -2;
        } catch (Exception e) {
            return -2;
        } finally {
            if (uid != null) {
                try {
                    uid.close();
                } catch (IOException e) {
                    // nothing to do here...
                }
            }
        }
    }

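Both helpers above decode an 8-byte payload with Numbers.bytesToLong. A self-contained sketch of that round trip using only the JDK, assuming the usual big-endian layout (which is ByteBuffer's default):

    import java.nio.ByteBuffer;

    final class VersionPayloadSketch {
        static byte[] encode(long version) {                    // what the indexing side puts into the payload
            return ByteBuffer.allocate(8).putLong(version).array();
        }

        static long decode(byte[] payload) {                    // what loadVersion()/loadDocIdAndVersion() read back
            return ByteBuffer.wrap(payload, 0, 8).getLong();
        }

        public static void main(String[] args) {
            byte[] payload = encode(42L);
            System.out.println(decode(payload));                // 42
        }
    }
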
@@ -130,26 +120,13 @@

    private long version;

    private final UidPayloadTokenStream tokenStream;

    public UidField(String name, String uid, long version) {
        super(name, Field.Store.YES, Field.Index.ANALYZED, Field.TermVector.NO);
        super(name, UidFieldMapper.Defaults.UID_FIELD_TYPE);
        this.uid = uid;
        this.version = version;
        this.indexOptions = FieldInfo.IndexOptions.DOCS_AND_FREQS_AND_POSITIONS;
        this.tokenStream = new UidPayloadTokenStream(this);
    }

    @Override
    public void setIndexOptions(FieldInfo.IndexOptions indexOptions) {
        // never allow to set this, since we want payload!
    }

    @Override
    public void setOmitTermFreqAndPositions(boolean omitTermFreqAndPositions) {
        // never allow to set this, since we want payload!
    }

    public String uid() {
        return this.uid;
    }

@@ -177,7 +154,7 @@ public class UidField extends AbstractField {
    }

    @Override
    public TokenStream tokenStreamValue() {
    public TokenStream tokenStream(Analyzer analyzer) throws IOException {
        return tokenStream;
    }

@@ -206,7 +183,7 @@ public class UidField extends AbstractField {
            }
            termAtt.setLength(0);
            termAtt.append(field.uid);
            payloadAttribute.setPayload(new Payload(Numbers.longToBytes(field.version())));
            payloadAttribute.setPayload(new BytesRef(Numbers.longToBytes(field.version())));
            added = true;
            return true;
        }

@@ -61,4 +61,19 @@ public class BytesText implements Text {
    public String toString() {
        return string();
    }
}

    @Override
    public int hashCode() {
        return bytes().hashCode();
    }

    @Override
    public boolean equals(Object obj) {
        return bytes().equals(((Text) obj).bytes());
    }

    @Override
    public int compareTo(Text text) {
        return UTF8SortedAsUnicodeComparator.utf8SortedAsUnicodeSortOrder.compare(bytes(), text.bytes());
    }
}

@@ -87,4 +87,19 @@ public class StringAndBytesText implements Text {
    public String toString() {
        return string();
    }

    @Override
    public int hashCode() {
        return bytes().hashCode();
    }

    @Override
    public boolean equals(Object obj) {
        return bytes().equals(((Text) obj).bytes());
    }

    @Override
    public int compareTo(Text text) {
        return UTF8SortedAsUnicodeComparator.utf8SortedAsUnicodeSortOrder.compare(bytes(), text.bytes());
    }
}

@@ -71,4 +71,21 @@ public class StringText implements Text {
    public String toString() {
        return string();
    }

    @Override
    public int hashCode() {
        // we use bytes here so we can be consistent with other text implementations
        return bytes().hashCode();
    }

    @Override
    public boolean equals(Object obj) {
        // we use bytes here so we can be consistent with other text implementations
        return bytes().equals(((Text) obj).bytes());
    }

    @Override
    public int compareTo(Text text) {
        return UTF8SortedAsUnicodeComparator.utf8SortedAsUnicodeSortOrder.compare(bytes(), text.bytes());
    }
}

@@ -26,7 +26,7 @@ import org.elasticsearch.common.bytes.BytesReference;
 * so we can represent it in a more optimized manner in memory as well as serializing it over the
 * network as well as converting it to json format.
 */
public interface Text {
public interface Text extends Comparable<Text> {

    /**
     * Are bytes available without the need to be converted into bytes when calling {@link #bytes()}.

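With Text now extending Comparable<Text>, and the hashCode/equals/compareTo additions above, Text values can be sorted and de-duplicated directly. A small sketch, assuming StringText exposes a String constructor as the implementations above suggest:

    import org.elasticsearch.common.text.StringText;
    import org.elasticsearch.common.text.Text;

    import java.util.ArrayList;
    import java.util.Collections;
    import java.util.List;

    public class TextSortExample {
        public static void main(String[] args) {
            List<Text> values = new ArrayList<Text>();
            values.add(new StringText("banana"));
            values.add(new StringText("apple"));
            Collections.sort(values);                    // uses compareTo(), i.e. UTF-8 byte order
            System.out.println(values.get(0).string());  // apple
        }
    }
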
@@ -0,0 +1,58 @@
package org.elasticsearch.common.text;

import org.elasticsearch.common.bytes.BytesReference;

import java.util.Comparator;

// LUCENE 4 UPGRADE: Is this the right way of comparing bytesreferences inside Text instances?
// Copied from Lucene's BytesRef comparator
public class UTF8SortedAsUnicodeComparator implements Comparator<BytesReference> {

    public final static Comparator<BytesReference> utf8SortedAsUnicodeSortOrder = new UTF8SortedAsUnicodeComparator();

    // Only singleton
    private UTF8SortedAsUnicodeComparator() {
    }

    public int compare(BytesReference a, BytesReference b) {
        if (a.hasArray() && b.hasArray()) {
            final byte[] aBytes = a.array();
            int aUpto = a.arrayOffset();
            final byte[] bBytes = b.array();
            int bUpto = b.arrayOffset();

            final int aStop = aUpto + Math.min(a.length(), b.length());
            while (aUpto < aStop) {
                int aByte = aBytes[aUpto++] & 0xff;
                int bByte = bBytes[bUpto++] & 0xff;

                int diff = aByte - bByte;
                if (diff != 0) {
                    return diff;
                }
            }

            // One is a prefix of the other, or, they are equal:
            return a.length() - b.length();
        } else {
            final byte[] aBytes = a.toBytes();
            int aUpto = 0;
            final byte[] bBytes = b.toBytes();
            int bUpto = 0;

            final int aStop = aUpto + Math.min(a.length(), b.length());
            while (aUpto < aStop) {
                int aByte = aBytes[aUpto++] & 0xff;
                int bByte = bBytes[bUpto++] & 0xff;

                int diff = aByte - bByte;
                if (diff != 0) {
                    return diff;
                }
            }

            // One is a prefix of the other, or, they are equal:
            return a.length() - b.length();
        }
    }
}

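The comparator can also be applied to BytesReference values directly; because it compares UTF-8 bytes as unsigned values, the result matches Unicode code point order. A hedged sketch, assuming BytesArray's String constructor wraps the UTF-8 bytes of the string:

    import org.elasticsearch.common.bytes.BytesArray;
    import org.elasticsearch.common.text.UTF8SortedAsUnicodeComparator;

    public class Utf8CompareExample {
        public static void main(String[] args) {
            int cmp = UTF8SortedAsUnicodeComparator.utf8SortedAsUnicodeSortOrder
                    .compare(new BytesArray("apple"), new BytesArray("banana"));
            System.out.println(cmp < 0);  // true: "apple" sorts before "banana" in UTF-8 byte order
        }
    }
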
@@ -20,6 +20,7 @@
package org.elasticsearch.common.xcontent;

import com.google.common.base.Charsets;
import org.apache.lucene.util.BytesRef;
import org.elasticsearch.common.Nullable;
import org.elasticsearch.common.Strings;
import org.elasticsearch.common.bytes.BytesArray;

@@ -526,6 +527,12 @@ public final class XContentBuilder implements BytesStream {
        return this;
    }

    public XContentBuilder field(XContentBuilderString name, BytesRef value) throws IOException {
        field(name);
        generator.writeUTF8String(value.bytes, value.offset, value.length);
        return this;
    }

    public XContentBuilder field(String name, Text value) throws IOException {
        field(name);
        if (value.hasBytes() && value.bytes().hasArray()) {

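The new field(XContentBuilderString, BytesRef) overload writes the raw UTF-8 bytes straight into the generator, avoiding a String round trip. A hedged usage sketch (the field name and value are illustrative, and a public XContentBuilderString(String) constructor is assumed):

    import org.apache.lucene.util.BytesRef;
    import org.elasticsearch.common.xcontent.XContentBuilder;
    import org.elasticsearch.common.xcontent.XContentBuilderString;
    import org.elasticsearch.common.xcontent.XContentFactory;

    import java.io.IOException;

    public class BytesRefFieldExample {
        public static void main(String[] args) throws IOException {
            XContentBuilder builder = XContentFactory.jsonBuilder();
            builder.startObject();
            builder.field(new XContentBuilderString("uid"), new BytesRef("type#1"));  // bytes written as-is
            builder.endObject();
            System.out.println(builder.string());  // {"uid":"type#1"}
        }
    }
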
@@ -19,6 +19,8 @@

package org.elasticsearch.common.xcontent;

import org.apache.lucene.util.BytesRef;

import java.io.Closeable;
import java.io.IOException;
import java.util.Map;

@@ -129,6 +131,10 @@ public interface XContentParser extends Closeable {

    String textOrNull() throws IOException;

    BytesRef bytesOrNull() throws IOException;

    BytesRef bytes() throws IOException;

    boolean hasTextCharacters();

    char[] textCharacters() throws IOException;

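The new bytesOrNull()/bytes() accessors expose the current value as a Lucene BytesRef instead of a String. A hedged parsing sketch for a single-field JSON object; the token-stepping mirrors the usual XContent flow, and createParser(String) is assumed to be available on the JSON XContent of this era:

    import org.apache.lucene.util.BytesRef;
    import org.elasticsearch.common.xcontent.XContentFactory;
    import org.elasticsearch.common.xcontent.XContentParser;
    import org.elasticsearch.common.xcontent.XContentType;

    import java.io.IOException;

    public class BytesOrNullExample {
        public static void main(String[] args) throws IOException {
            XContentParser parser = XContentFactory.xContent(XContentType.JSON).createParser("{\"uid\":\"type#1\"}");
            parser.nextToken();                      // START_OBJECT
            parser.nextToken();                      // FIELD_NAME "uid"
            parser.nextToken();                      // VALUE_STRING
            BytesRef raw = parser.bytesOrNull();     // UTF-8 bytes of the value, null for a JSON null
            System.out.println(raw.utf8ToString());  // type#1
        }
    }
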
Some files were not shown because too many files have changed in this diff.