Merge pull request #13870 from mikemccand/close_tokenstream

Close TokenStream in finally clause
This commit is contained in:
Michael McCandless 2015-10-02 14:43:26 +01:00
commit 5278cf0d5e
12 changed files with 113 additions and 94 deletions

View File

@ -26,6 +26,7 @@ import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.*;
import org.apache.lucene.util.automaton.RegExp;
import org.apache.lucene.util.IOUtils;
import org.elasticsearch.common.lucene.search.Queries;
import org.elasticsearch.common.unit.Fuzziness;
import org.elasticsearch.index.mapper.MappedFieldType;
@ -484,30 +485,31 @@ public class MapperQueryParser extends QueryParser {
if (!settings.analyzeWildcard()) {
return super.getPrefixQuery(field, termStr);
}
List<String> tlist;
// get Analyzer from superclass and tokenize the term
TokenStream source;
TokenStream source = null;
try {
source = getAnalyzer().tokenStream(field, termStr);
source.reset();
} catch (IOException e) {
return super.getPrefixQuery(field, termStr);
}
List<String> tlist = new ArrayList<>();
CharTermAttribute termAtt = source.addAttribute(CharTermAttribute.class);
while (true) {
try {
if (!source.incrementToken()) break;
source = getAnalyzer().tokenStream(field, termStr);
source.reset();
} catch (IOException e) {
break;
return super.getPrefixQuery(field, termStr);
}
tlist.add(termAtt.toString());
}
tlist = new ArrayList<>();
CharTermAttribute termAtt = source.addAttribute(CharTermAttribute.class);
try {
source.close();
} catch (IOException e) {
// ignore
while (true) {
try {
if (!source.incrementToken()) break;
} catch (IOException e) {
break;
}
tlist.add(termAtt.toString());
}
} finally {
if (source != null) {
IOUtils.closeWhileHandlingException(source);
}
}
if (tlist.size() == 1) {
@ -617,8 +619,7 @@ public class MapperQueryParser extends QueryParser {
char c = termStr.charAt(i);
if (c == '?' || c == '*') {
if (isWithinToken) {
try {
TokenStream source = getAnalyzer().tokenStream(field, tmp.toString());
try (TokenStream source = getAnalyzer().tokenStream(field, tmp.toString())) {
source.reset();
CharTermAttribute termAtt = source.addAttribute(CharTermAttribute.class);
if (source.incrementToken()) {
@ -633,7 +634,6 @@ public class MapperQueryParser extends QueryParser {
// no tokens, just use what we have now
aggStr.append(tmp);
}
source.close();
} catch (IOException e) {
aggStr.append(tmp);
}
@ -648,22 +648,22 @@ public class MapperQueryParser extends QueryParser {
}
if (isWithinToken) {
try {
TokenStream source = getAnalyzer().tokenStream(field, tmp.toString());
source.reset();
CharTermAttribute termAtt = source.addAttribute(CharTermAttribute.class);
if (source.incrementToken()) {
String term = termAtt.toString();
if (term.length() == 0) {
try (TokenStream source = getAnalyzer().tokenStream(field, tmp.toString())) {
source.reset();
CharTermAttribute termAtt = source.addAttribute(CharTermAttribute.class);
if (source.incrementToken()) {
String term = termAtt.toString();
if (term.length() == 0) {
// no tokens, just use what we have now
aggStr.append(tmp);
} else {
aggStr.append(term);
}
} else {
// no tokens, just use what we have now
aggStr.append(tmp);
} else {
aggStr.append(term);
}
} else {
// no tokens, just use what we have now
aggStr.append(tmp);
}
source.close();
} catch (IOException e) {
aggStr.append(tmp);
}

View File

@ -959,11 +959,9 @@ public long ramBytesUsed() {
// TODO: is there a Reader from a CharSequence?
// Turn tokenstream into automaton:
Automaton automaton = null;
TokenStream ts = queryAnalyzer.tokenStream("", key.toString());
try {
try (TokenStream ts = queryAnalyzer.tokenStream("", key.toString())) {
automaton = getTokenStreamToAutomaton().toAutomaton(ts);
} finally {
IOUtils.closeWhileHandlingException(ts);
}
automaton = replaceSep(automaton);

View File

@ -217,12 +217,10 @@ public class TransportAnalyzeAction extends TransportSingleShardAction<AnalyzeRe
}
List<AnalyzeResponse.AnalyzeToken> tokens = new ArrayList<>();
TokenStream stream = null;
int lastPosition = -1;
int lastOffset = 0;
for (String text : request.text()) {
try {
stream = analyzer.tokenStream(field, text);
try (TokenStream stream = analyzer.tokenStream(field, text)) {
stream.reset();
CharTermAttribute term = stream.addAttribute(CharTermAttribute.class);
PositionIncrementAttribute posIncr = stream.addAttribute(PositionIncrementAttribute.class);
@ -243,11 +241,8 @@ public class TransportAnalyzeAction extends TransportSingleShardAction<AnalyzeRe
lastPosition += analyzer.getPositionIncrementGap(field);
lastOffset += analyzer.getOffsetGap(field);
} catch (IOException e) {
throw new ElasticsearchException("failed to analyze", e);
} finally {
IOUtils.closeWhileHandlingException(stream);
}
}

View File

@ -314,7 +314,9 @@ public class Analysis {
* @see #isCharacterTokenStream(TokenStream)
*/
public static boolean generatesCharacterTokenStream(Analyzer analyzer, String fieldName) throws IOException {
return isCharacterTokenStream(analyzer.tokenStream(fieldName, ""));
try (TokenStream ts = analyzer.tokenStream(fieldName, "")) {
return isCharacterTokenStream(ts);
}
}
}

View File

@ -19,6 +19,7 @@
package org.elasticsearch.index.mapper.core;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.document.Field;
@ -145,7 +146,7 @@ public class TokenCountFieldMapper extends IntegerFieldMapper {
if (valueAndBoost.value() == null) {
count = fieldType().nullValue();
} else {
count = countPositions(analyzer.analyzer().tokenStream(simpleName(), valueAndBoost.value()));
count = countPositions(analyzer, simpleName(), valueAndBoost.value());
}
addIntegerFields(context, fields, count, valueAndBoost.boost());
}
@ -156,12 +157,14 @@ public class TokenCountFieldMapper extends IntegerFieldMapper {
/**
* Count position increments in a token stream. Package private for testing.
* @param tokenStream token stream to count
* @param analyzer analyzer to create token stream
* @param fieldName field name to pass to analyzer
* @param fieldValue field value to pass to analyzer
* @return number of position increments in a token stream
* @throws IOException if tokenStream throws it
*/
static int countPositions(TokenStream tokenStream) throws IOException {
try {
static int countPositions(Analyzer analyzer, String fieldName, String fieldValue) throws IOException {
try (TokenStream tokenStream = analyzer.tokenStream(fieldName, fieldValue)) {
int count = 0;
PositionIncrementAttribute position = tokenStream.addAttribute(PositionIncrementAttribute.class);
tokenStream.reset();
@ -171,8 +174,6 @@ public class TokenCountFieldMapper extends IntegerFieldMapper {
tokenStream.end();
count += position.getPositionIncrement();
return count;
} finally {
tokenStream.close();
}
}

View File

@ -88,10 +88,11 @@ class MultiDocumentPercolatorIndex implements PercolatorIndex {
try {
// TODO: instead of passing null here, we can have a CTL<Map<String,TokenStream>> and pass previous,
// like the indexer does
TokenStream tokenStream = field.tokenStream(analyzer, null);
if (tokenStream != null) {
memoryIndex.addField(field.name(), tokenStream, field.boost());
}
try (TokenStream tokenStream = field.tokenStream(analyzer, null)) {
if (tokenStream != null) {
memoryIndex.addField(field.name(), tokenStream, field.boost());
}
}
} catch (IOException e) {
throw new ElasticsearchException("Failed to create token stream", e);
}

View File

@ -56,10 +56,11 @@ class SingleDocumentPercolatorIndex implements PercolatorIndex {
Analyzer analyzer = context.mapperService().documentMapper(parsedDocument.type()).mappers().indexAnalyzer();
// TODO: instead of passing null here, we can have a CTL<Map<String,TokenStream>> and pass previous,
// like the indexer does
TokenStream tokenStream = field.tokenStream(analyzer, null);
if (tokenStream != null) {
memoryIndex.addField(field.name(), tokenStream, field.boost());
}
try (TokenStream tokenStream = field.tokenStream(analyzer, null)) {
if (tokenStream != null) {
memoryIndex.addField(field.name(), tokenStream, field.boost());
}
}
} catch (Exception e) {
throw new ElasticsearchException("Failed to create token stream for [" + field.name() + "]", e);
}

View File

@ -33,6 +33,7 @@ import org.apache.lucene.search.highlight.SimpleSpanFragmenter;
import org.apache.lucene.search.highlight.TextFragment;
import org.apache.lucene.util.BytesRefHash;
import org.apache.lucene.util.CollectionUtil;
import org.apache.lucene.util.IOUtils;
import org.elasticsearch.ExceptionsHelper;
import org.elasticsearch.common.text.StringText;
import org.elasticsearch.common.text.Text;
@ -109,15 +110,16 @@ public class PlainHighlighter implements Highlighter {
for (Object textToHighlight : textsToHighlight) {
String text = textToHighlight.toString();
TokenStream tokenStream = analyzer.tokenStream(mapper.fieldType().names().indexName(), text);
if (!tokenStream.hasAttribute(CharTermAttribute.class) || !tokenStream.hasAttribute(OffsetAttribute.class)) {
// can't perform highlighting if the stream has no terms (binary token stream) or no offsets
continue;
}
TextFragment[] bestTextFragments = entry.getBestTextFragments(tokenStream, text, false, numberOfFragments);
for (TextFragment bestTextFragment : bestTextFragments) {
if (bestTextFragment != null && bestTextFragment.getScore() > 0) {
fragsList.add(bestTextFragment);
try (TokenStream tokenStream = analyzer.tokenStream(mapper.fieldType().names().indexName(), text)) {
if (!tokenStream.hasAttribute(CharTermAttribute.class) || !tokenStream.hasAttribute(OffsetAttribute.class)) {
// can't perform highlighting if the stream has no terms (binary token stream) or no offsets
continue;
}
TextFragment[] bestTextFragments = entry.getBestTextFragments(tokenStream, text, false, numberOfFragments);
for (TextFragment bestTextFragment : bestTextFragments) {
if (bestTextFragment != null && bestTextFragment.getScore() > 0) {
fragsList.add(bestTextFragment);
}
}
}
}
@ -165,7 +167,7 @@ public class PlainHighlighter implements Highlighter {
String fieldContents = textsToHighlight.get(0).toString();
int end;
try {
end = findGoodEndForNoHighlightExcerpt(noMatchSize, analyzer.tokenStream(mapper.fieldType().names().indexName(), fieldContents));
end = findGoodEndForNoHighlightExcerpt(noMatchSize, analyzer, mapper.fieldType().names().indexName(), fieldContents);
} catch (Exception e) {
throw new FetchPhaseExecutionException(context, "Failed to highlight field [" + highlighterContext.fieldName + "]", e);
}
@ -181,8 +183,8 @@ public class PlainHighlighter implements Highlighter {
return true;
}
private static int findGoodEndForNoHighlightExcerpt(int noMatchSize, TokenStream tokenStream) throws IOException {
try {
private static int findGoodEndForNoHighlightExcerpt(int noMatchSize, Analyzer analyzer, String fieldName, String contents) throws IOException {
try (TokenStream tokenStream = analyzer.tokenStream(fieldName, contents)) {
if (!tokenStream.hasAttribute(OffsetAttribute.class)) {
// Can't split on term boundaries without offsets
return -1;
@ -200,11 +202,9 @@ public class PlainHighlighter implements Highlighter {
}
end = attr.endOffset();
}
tokenStream.end();
// We've exhausted the token stream so we should just highlight everything.
return end;
} finally {
tokenStream.end();
tokenStream.close();
}
}
}

View File

@ -28,6 +28,7 @@ import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.BytesRefBuilder;
import org.apache.lucene.util.CharsRef;
import org.apache.lucene.util.CharsRefBuilder;
import org.apache.lucene.util.IOUtils;
import org.apache.lucene.util.automaton.LevenshteinAutomata;
import org.elasticsearch.common.ParseField;
import org.elasticsearch.common.ParseFieldMatcher;
@ -116,22 +117,34 @@ public final class SuggestUtils {
}
public static int analyze(Analyzer analyzer, CharsRef toAnalyze, String field, TokenConsumer consumer) throws IOException {
TokenStream ts = analyzer.tokenStream(
field, new FastCharArrayReader(toAnalyze.chars, toAnalyze.offset, toAnalyze.length)
);
return analyze(ts, consumer);
try (TokenStream ts = analyzer.tokenStream(
field, new FastCharArrayReader(toAnalyze.chars, toAnalyze.offset, toAnalyze.length))) {
return analyze(ts, consumer);
}
}
/** NOTE: this method closes the TokenStream, even on exception, which is awkward
* because really the caller who called {@link Analyzer#tokenStream} should close it,
* but when trying that there are recursion issues when we try to use the same
* TokenStrem twice in the same recursion... */
public static int analyze(TokenStream stream, TokenConsumer consumer) throws IOException {
stream.reset();
consumer.reset(stream);
int numTokens = 0;
while (stream.incrementToken()) {
consumer.nextToken();
numTokens++;
boolean success = false;
try {
stream.reset();
consumer.reset(stream);
while (stream.incrementToken()) {
consumer.nextToken();
numTokens++;
}
consumer.end();
} finally {
if (success) {
stream.close();
} else {
IOUtils.closeWhileHandlingException(stream);
}
}
consumer.end();
stream.close();
return numTokens;
}

View File

@ -100,9 +100,7 @@ public final class CompletionTokenStream extends TokenStream {
@Override
public void close() throws IOException {
if (posInc == -1) {
input.close();
}
input.close();
}
public static interface ToFiniteStrings {

View File

@ -92,12 +92,13 @@ public final class PhraseSuggester extends Suggester<PhraseSuggestionContext> {
if (gens.size() > 0 && suggestTerms != null) {
final NoisyChannelSpellChecker checker = new NoisyChannelSpellChecker(realWordErrorLikelihood, suggestion.getRequireUnigram(), suggestion.getTokenLimit());
final BytesRef separator = suggestion.separator();
TokenStream stream = checker.tokenStream(suggestion.getAnalyzer(), suggestion.getText(), spare, suggestion.getField());
WordScorer wordScorer = suggestion.model().newScorer(indexReader, suggestTerms, suggestField, realWordErrorLikelihood, separator);
Result checkerResult = checker.getCorrections(stream, new MultiCandidateGeneratorWrapper(suggestion.getShardSize(),
gens.toArray(new CandidateGenerator[gens.size()])), suggestion.maxErrors(),
suggestion.getShardSize(), wordScorer, suggestion.confidence(), suggestion.gramSize());
Result checkerResult;
try (TokenStream stream = checker.tokenStream(suggestion.getAnalyzer(), suggestion.getText(), spare, suggestion.getField())) {
checkerResult = checker.getCorrections(stream, new MultiCandidateGeneratorWrapper(suggestion.getShardSize(),
gens.toArray(new CandidateGenerator[gens.size()])), suggestion.maxErrors(),
suggestion.getShardSize(), wordScorer, suggestion.confidence(), suggestion.gramSize());
}
PhraseSuggestion.Entry resultEntry = buildResultEntry(suggestion, spare, checkerResult.cutoffScore);
response.addTerm(resultEntry);

View File

@ -19,7 +19,9 @@
package org.elasticsearch.index.mapper.core;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.CannedTokenStream;
import org.apache.lucene.analysis.MockTokenizer;
import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.TokenStream;
import org.elasticsearch.common.xcontent.XContentFactory;
@ -87,7 +89,14 @@ public class TokenCountFieldMapperTests extends ESSingleNodeTestCase {
int finalTokenIncrement = 4; // Count the final token increment on the rare token streams that have them
Token[] tokens = new Token[] {t1, t2, t3};
Collections.shuffle(Arrays.asList(tokens), getRandom());
TokenStream tokenStream = new CannedTokenStream(finalTokenIncrement, 0, tokens);
assertThat(TokenCountFieldMapper.countPositions(tokenStream), equalTo(7));
final TokenStream tokenStream = new CannedTokenStream(finalTokenIncrement, 0, tokens);
// TODO: we have no CannedAnalyzer?
Analyzer analyzer = new Analyzer() {
@Override
public TokenStreamComponents createComponents(String fieldName) {
return new TokenStreamComponents(new MockTokenizer(), tokenStream);
}
};
assertThat(TokenCountFieldMapper.countPositions(analyzer, "", ""), equalTo(7));
}
}