mirror of https://github.com/apache/lucene.git
LUCENE-4955: Fix NGramTokenizer and NGramTokenFilter, and remove them from TestRandomChains' exclusion list.
git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1476135 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent
7cfcb26b92
commit
a03e38d5d0
|
@ -37,6 +37,16 @@ Optimizations
|
||||||
|
|
||||||
======================= Lucene 4.4.0 =======================
|
======================= Lucene 4.4.0 =======================
|
||||||
|
|
||||||
|
Changes in backwards compatibility policy
|
||||||
|
|
||||||
|
* LUCENE-4955: NGramTokenFilter now emits all n-grams for the same token at the
|
||||||
|
same position and preserves the position length and the offsets of the
|
||||||
|
original token. (Simon Willnauer, Adrien Grand)
|
||||||
|
|
||||||
|
* LUCENE-4955: NGramTokenizer now emits n-grams in a different order
|
||||||
|
(a, ab, b, bc, c) instead of (a, b, c, ab, bc) and doesn't trim trailing
|
||||||
|
whitespace. (Adrien Grand)
|
||||||
|
|
||||||
Bug Fixes
|
Bug Fixes
|
||||||
|
|
||||||
* LUCENE-4935: CustomScoreQuery wrongly applied its query boost twice
|
* LUCENE-4935: CustomScoreQuery wrongly applied its query boost twice
|
||||||
|
@ -46,6 +56,9 @@ Bug Fixes
|
||||||
if you had a 64-bit JVM without compressed OOPS: IBM J9, or Oracle with
|
if you had a 64-bit JVM without compressed OOPS: IBM J9, or Oracle with
|
||||||
large heap/explicitly disabled. (Mike McCandless, Uwe Schindler, Robert Muir)
|
large heap/explicitly disabled. (Mike McCandless, Uwe Schindler, Robert Muir)
|
||||||
|
|
||||||
|
* LUCENE-4955: NGramTokenizer now supports inputs larger than 1024 chars.
|
||||||
|
(Adrien Grand)
|
||||||
|
|
||||||
Optimizations
|
Optimizations
|
||||||
|
|
||||||
* LUCENE-4938: Don't use an unnecessarily large priority queue in IndexSearcher
|
* LUCENE-4938: Don't use an unnecessarily large priority queue in IndexSearcher
|
||||||
|
|
|
@ -0,0 +1,155 @@
|
||||||
|
package org.apache.lucene.analysis.ngram;
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||||
|
* contributor license agreements. See the NOTICE file distributed with
|
||||||
|
* this work for additional information regarding copyright ownership.
|
||||||
|
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||||
|
* (the "License"); you may not use this file except in compliance with
|
||||||
|
* the License. You may obtain a copy of the License at
|
||||||
|
*
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
* See the License for the specific language governing permissions and
|
||||||
|
* limitations under the License.
|
||||||
|
*/
|
||||||
|
|
||||||
|
import java.io.IOException;
|
||||||
|
import java.io.Reader;
|
||||||
|
|
||||||
|
import org.apache.lucene.analysis.Tokenizer;
|
||||||
|
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
|
||||||
|
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Old broken version of {@link NGramTokenizer}.
|
||||||
|
*/
|
||||||
|
@Deprecated
|
||||||
|
public final class Lucene43NGramTokenizer extends Tokenizer {
|
||||||
|
public static final int DEFAULT_MIN_NGRAM_SIZE = 1;
|
||||||
|
public static final int DEFAULT_MAX_NGRAM_SIZE = 2;
|
||||||
|
|
||||||
|
private int minGram, maxGram;
|
||||||
|
private int gramSize;
|
||||||
|
private int pos;
|
||||||
|
private int inLen; // length of the input AFTER trim()
|
||||||
|
private int charsRead; // length of the input
|
||||||
|
private String inStr;
|
||||||
|
private boolean started;
|
||||||
|
|
||||||
|
private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
|
||||||
|
private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Creates Lucene43NGramTokenizer with given min and max n-gram sizes.
|
||||||
|
* @param input {@link Reader} holding the input to be tokenized
|
||||||
|
* @param minGram the smallest n-gram to generate
|
||||||
|
* @param maxGram the largest n-gram to generate
|
||||||
|
*/
|
||||||
|
public Lucene43NGramTokenizer(Reader input, int minGram, int maxGram) {
|
||||||
|
super(input);
|
||||||
|
init(minGram, maxGram);
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Creates Lucene43NGramTokenizer with given min and max n-gram sizes.
|
||||||
|
* @param factory {@link org.apache.lucene.util.AttributeSource.AttributeFactory} to use
|
||||||
|
* @param input {@link Reader} holding the input to be tokenized
|
||||||
|
* @param minGram the smallest n-gram to generate
|
||||||
|
* @param maxGram the largest n-gram to generate
|
||||||
|
*/
|
||||||
|
public Lucene43NGramTokenizer(AttributeFactory factory, Reader input, int minGram, int maxGram) {
|
||||||
|
super(factory, input);
|
||||||
|
init(minGram, maxGram);
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Creates Lucene43NGramTokenizer with default min and max n-gram sizes.
|
||||||
|
* @param input {@link Reader} holding the input to be tokenized
|
||||||
|
*/
|
||||||
|
public Lucene43NGramTokenizer(Reader input) {
|
||||||
|
this(input, DEFAULT_MIN_NGRAM_SIZE, DEFAULT_MAX_NGRAM_SIZE);
|
||||||
|
}
|
||||||
|
|
||||||
|
private void init(int minGram, int maxGram) {
|
||||||
|
if (minGram < 1) {
|
||||||
|
throw new IllegalArgumentException("minGram must be greater than zero");
|
||||||
|
}
|
||||||
|
if (minGram > maxGram) {
|
||||||
|
throw new IllegalArgumentException("minGram must not be greater than maxGram");
|
||||||
|
}
|
||||||
|
this.minGram = minGram;
|
||||||
|
this.maxGram = maxGram;
|
||||||
|
}
|
||||||
|
|
||||||
|
/** Advances to the next n-gram; returns {@code false} once no more n-grams can be produced. */
|
||||||
|
@Override
|
||||||
|
public boolean incrementToken() throws IOException {
|
||||||
|
clearAttributes();
|
||||||
|
if (!started) {
|
||||||
|
started = true;
|
||||||
|
gramSize = minGram;
|
||||||
|
char[] chars = new char[1024];
|
||||||
|
charsRead = 0;
|
||||||
|
// TODO: refactor to a shared readFully somewhere:
|
||||||
|
while (charsRead < chars.length) {
|
||||||
|
int inc = input.read(chars, charsRead, chars.length-charsRead);
|
||||||
|
if (inc == -1) {
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
charsRead += inc;
|
||||||
|
}
|
||||||
|
inStr = new String(chars, 0, charsRead).trim(); // strip leading and trailing whitespace
|
||||||
|
|
||||||
|
if (charsRead == chars.length) {
|
||||||
|
// Read extra throwaway chars so that on end() we
|
||||||
|
// report the correct offset:
|
||||||
|
char[] throwaway = new char[1024];
|
||||||
|
while(true) {
|
||||||
|
final int inc = input.read(throwaway, 0, throwaway.length);
|
||||||
|
if (inc == -1) {
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
charsRead += inc;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
inLen = inStr.length();
|
||||||
|
if (inLen == 0) {
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if (pos+gramSize > inLen) { // if we hit the end of the string
|
||||||
|
pos = 0; // reset to beginning of string
|
||||||
|
gramSize++; // increase n-gram size
|
||||||
|
if (gramSize > maxGram) // we are done
|
||||||
|
return false;
|
||||||
|
if (pos+gramSize > inLen)
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
|
int oldPos = pos;
|
||||||
|
pos++;
|
||||||
|
termAtt.setEmpty().append(inStr, oldPos, oldPos+gramSize);
|
||||||
|
offsetAtt.setOffset(correctOffset(oldPos), correctOffset(oldPos+gramSize));
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public void end() {
|
||||||
|
// set final offset
|
||||||
|
final int finalOffset = correctOffset(charsRead);
|
||||||
|
this.offsetAtt.setOffset(finalOffset, finalOffset);
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public void reset() throws IOException {
|
||||||
|
super.reset();
|
||||||
|
started = false;
|
||||||
|
pos = 0;
|
||||||
|
}
|
||||||
|
}
|
|
@ -47,6 +47,6 @@ public class NGramFilterFactory extends TokenFilterFactory {
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public NGramTokenFilter create(TokenStream input) {
|
public NGramTokenFilter create(TokenStream input) {
|
||||||
return new NGramTokenFilter(input, minGramSize, maxGramSize);
|
return new NGramTokenFilter(luceneMatchVersion, input, minGramSize, maxGramSize);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -21,37 +21,60 @@ import java.io.IOException;
|
||||||
|
|
||||||
import org.apache.lucene.analysis.TokenFilter;
|
import org.apache.lucene.analysis.TokenFilter;
|
||||||
import org.apache.lucene.analysis.TokenStream;
|
import org.apache.lucene.analysis.TokenStream;
|
||||||
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
|
import org.apache.lucene.analysis.miscellaneous.LengthFilter;
|
||||||
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
|
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
|
||||||
|
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
|
||||||
|
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
|
||||||
|
import org.apache.lucene.analysis.tokenattributes.PositionLengthAttribute;
|
||||||
|
import org.apache.lucene.util.Version;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Tokenizes the input into n-grams of the given size(s).
|
* Tokenizes the input into n-grams of the given size(s).
|
||||||
|
* <a name="version"></a>
|
||||||
|
* <p>You must specify the required {@link Version} compatibility when
|
||||||
|
* creating a {@link NGramTokenFilter}. As of Lucene 4.4, this token filter:<ul>
|
||||||
|
* <li>emits all n-grams for the same token at the same position,</li>
|
||||||
|
* <li>does not modify offsets,</li>
|
||||||
|
* <li>sorts n-grams by their offset in the original token first, then
|
||||||
|
* increasing length (meaning that "abc" will give "a", "ab", "abc", "b", "bc",
|
||||||
|
* "c").</li></ul>
|
||||||
|
* <p>You can make this filter use the old behavior by providing a version &lt;
|
||||||
|
* {@link Version#LUCENE_44} in the constructor but this is not recommended as
|
||||||
|
* it will lead to broken {@link TokenStream}s that will cause highlighting
|
||||||
|
* bugs.
|
||||||
*/
|
*/
|
||||||
public final class NGramTokenFilter extends TokenFilter {
|
public final class NGramTokenFilter extends TokenFilter {
|
||||||
public static final int DEFAULT_MIN_NGRAM_SIZE = 1;
|
public static final int DEFAULT_MIN_NGRAM_SIZE = 1;
|
||||||
public static final int DEFAULT_MAX_NGRAM_SIZE = 2;
|
public static final int DEFAULT_MAX_NGRAM_SIZE = 2;
|
||||||
|
|
||||||
private int minGram, maxGram;
|
private final int minGram, maxGram;
|
||||||
|
|
||||||
private char[] curTermBuffer;
|
private char[] curTermBuffer;
|
||||||
private int curTermLength;
|
private int curTermLength;
|
||||||
private int curGramSize;
|
private int curGramSize;
|
||||||
private int curPos;
|
private int curPos;
|
||||||
|
private int curPosInc, curPosLen;
|
||||||
private int tokStart;
|
private int tokStart;
|
||||||
private int tokEnd; // only used if the length changed before this filter
|
private int tokEnd;
|
||||||
private boolean hasIllegalOffsets; // only if the length changed before this filter
|
private boolean hasIllegalOffsets; // only if the length changed before this filter
|
||||||
|
|
||||||
|
private final Version version;
|
||||||
private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
|
private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
|
||||||
|
private final PositionIncrementAttribute posIncAtt;
|
||||||
|
private final PositionLengthAttribute posLenAtt;
|
||||||
private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);
|
private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Creates NGramTokenFilter with given min and max n-grams.
|
* Creates NGramTokenFilter with given min and max n-grams.
|
||||||
|
* @param version Lucene version to enable correct position increments.
|
||||||
|
* See <a href="#version">above</a> for details.
|
||||||
* @param input {@link TokenStream} holding the input to be tokenized
|
* @param input {@link TokenStream} holding the input to be tokenized
|
||||||
* @param minGram the smallest n-gram to generate
|
* @param minGram the smallest n-gram to generate
|
||||||
* @param maxGram the largest n-gram to generate
|
* @param maxGram the largest n-gram to generate
|
||||||
*/
|
*/
|
||||||
public NGramTokenFilter(TokenStream input, int minGram, int maxGram) {
|
public NGramTokenFilter(Version version, TokenStream input, int minGram, int maxGram) {
|
||||||
super(input);
|
super(new LengthFilter(true, input, minGram, Integer.MAX_VALUE));
|
||||||
|
this.version = version;
|
||||||
if (minGram < 1) {
|
if (minGram < 1) {
|
||||||
throw new IllegalArgumentException("minGram must be greater than zero");
|
throw new IllegalArgumentException("minGram must be greater than zero");
|
||||||
}
|
}
|
||||||
|
@ -60,14 +83,37 @@ public final class NGramTokenFilter extends TokenFilter {
|
||||||
}
|
}
|
||||||
this.minGram = minGram;
|
this.minGram = minGram;
|
||||||
this.maxGram = maxGram;
|
this.maxGram = maxGram;
|
||||||
|
if (version.onOrAfter(Version.LUCENE_44)) {
|
||||||
|
posIncAtt = addAttribute(PositionIncrementAttribute.class);
|
||||||
|
posLenAtt = addAttribute(PositionLengthAttribute.class);
|
||||||
|
} else {
|
||||||
|
posIncAtt = new PositionIncrementAttribute() {
|
||||||
|
@Override
|
||||||
|
public void setPositionIncrement(int positionIncrement) {}
|
||||||
|
@Override
|
||||||
|
public int getPositionIncrement() {
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
};
|
||||||
|
posLenAtt = new PositionLengthAttribute() {
|
||||||
|
@Override
|
||||||
|
public void setPositionLength(int positionLength) {}
|
||||||
|
@Override
|
||||||
|
public int getPositionLength() {
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
};
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Creates NGramTokenFilter with default min and max n-grams.
|
* Creates NGramTokenFilter with default min and max n-grams.
|
||||||
|
* @param version Lucene version to enable correct position increments.
|
||||||
|
* See <a href="#version">above</a> for details.
|
||||||
* @param input {@link TokenStream} holding the input to be tokenized
|
* @param input {@link TokenStream} holding the input to be tokenized
|
||||||
*/
|
*/
|
||||||
public NGramTokenFilter(TokenStream input) {
|
public NGramTokenFilter(Version version, TokenStream input) {
|
||||||
this(input, DEFAULT_MIN_NGRAM_SIZE, DEFAULT_MAX_NGRAM_SIZE);
|
this(version, input, DEFAULT_MIN_NGRAM_SIZE, DEFAULT_MAX_NGRAM_SIZE);
|
||||||
}
|
}
|
||||||
|
|
||||||
/** Returns the next token in the stream, or null at EOS. */
|
/** Returns the next token in the stream, or null at EOS. */
|
||||||
|
@ -82,6 +128,8 @@ public final class NGramTokenFilter extends TokenFilter {
|
||||||
curTermLength = termAtt.length();
|
curTermLength = termAtt.length();
|
||||||
curGramSize = minGram;
|
curGramSize = minGram;
|
||||||
curPos = 0;
|
curPos = 0;
|
||||||
|
curPosInc = posIncAtt.getPositionIncrement();
|
||||||
|
curPosLen = posLenAtt.getPositionLength();
|
||||||
tokStart = offsetAtt.startOffset();
|
tokStart = offsetAtt.startOffset();
|
||||||
tokEnd = offsetAtt.endOffset();
|
tokEnd = offsetAtt.endOffset();
|
||||||
// if length by start + end offsets doesn't match the term text then assume
|
// if length by start + end offsets doesn't match the term text then assume
|
||||||
|
@ -89,6 +137,22 @@ public final class NGramTokenFilter extends TokenFilter {
|
||||||
hasIllegalOffsets = (tokStart + curTermLength) != tokEnd;
|
hasIllegalOffsets = (tokStart + curTermLength) != tokEnd;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
if (version.onOrAfter(Version.LUCENE_44)) {
|
||||||
|
if (curGramSize > maxGram || curPos + curGramSize > curTermLength) {
|
||||||
|
++curPos;
|
||||||
|
curGramSize = minGram;
|
||||||
|
}
|
||||||
|
if (curPos + curGramSize <= curTermLength) {
|
||||||
|
clearAttributes();
|
||||||
|
termAtt.copyBuffer(curTermBuffer, curPos, curGramSize);
|
||||||
|
posIncAtt.setPositionIncrement(curPosInc);
|
||||||
|
curPosInc = 0;
|
||||||
|
posLenAtt.setPositionLength(curPosLen);
|
||||||
|
offsetAtt.setOffset(tokStart, tokEnd);
|
||||||
|
curGramSize++;
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
} else {
|
||||||
while (curGramSize <= maxGram) {
|
while (curGramSize <= maxGram) {
|
||||||
while (curPos+curGramSize <= curTermLength) { // while there is input
|
while (curPos+curGramSize <= curTermLength) { // while there is input
|
||||||
clearAttributes();
|
clearAttributes();
|
||||||
|
@ -104,6 +168,7 @@ public final class NGramTokenFilter extends TokenFilter {
|
||||||
curGramSize++; // increase n-gram size
|
curGramSize++; // increase n-gram size
|
||||||
curPos = 0;
|
curPos = 0;
|
||||||
}
|
}
|
||||||
|
}
|
||||||
curTermBuffer = null;
|
curTermBuffer = null;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -17,64 +17,90 @@ package org.apache.lucene.analysis.ngram;
|
||||||
* limitations under the License.
|
* limitations under the License.
|
||||||
*/
|
*/
|
||||||
|
|
||||||
import org.apache.lucene.analysis.Tokenizer;
|
|
||||||
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
|
|
||||||
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
|
|
||||||
import org.apache.lucene.util.AttributeSource;
|
|
||||||
|
|
||||||
import java.io.IOException;
|
import java.io.IOException;
|
||||||
import java.io.Reader;
|
import java.io.Reader;
|
||||||
|
|
||||||
|
import org.apache.lucene.analysis.Tokenizer;
|
||||||
|
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
|
||||||
|
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
|
||||||
|
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
|
||||||
|
import org.apache.lucene.analysis.tokenattributes.PositionLengthAttribute;
|
||||||
|
import org.apache.lucene.util.Version;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Tokenizes the input into n-grams of the given size(s).
|
* Tokenizes the input into n-grams of the given size(s).
|
||||||
|
* <p>Unlike {@link NGramTokenFilter}, this class sets offsets so
|
||||||
|
* that characters between startOffset and endOffset in the original stream are
|
||||||
|
* the same as the term chars.
|
||||||
|
* <p>For example, "abcde" would be tokenized as (minGram=2, maxGram=3):
|
||||||
|
* <table>
|
||||||
|
* <tr><th>Term</th><td>ab</td><td>abc</td><td>bc</td><td>bcd</td><td>cd</td><td>cde</td><td>de</td></tr>
|
||||||
|
* <tr><th>Position increment</th><td>1</td><td>1</td><td>1</td><td>1</td><td>1</td><td>1</td><td>1</td></tr>
|
||||||
|
* <tr><th>Position length</th><td>1</td><td>1</td><td>1</td><td>1</td><td>1</td><td>1</td><td>1</td></tr>
|
||||||
|
* <tr><th>Offsets</th><td>[0,2[</td><td>[0,3[</td><td>[1,3[</td><td>[1,4[</td><td>[2,4[</td><td>[2,5[</td><td>[3,5[</td></tr>
|
||||||
|
* </table>
|
||||||
|
* <a name="version"></a>
|
||||||
|
* <p>Before Lucene 4.4, this class had a different behavior:<ul>
|
||||||
|
* <li>It didn't support more than 1024 chars of input; the rest was discarded.</li>
|
||||||
|
* <li>Leading and trailing whitespace of the 1024-char block was trimmed.</li>
|
||||||
|
* <li>Tokens were emitted in a different order (by increasing lengths).</li></ul>
|
||||||
|
* <p>Although highly discouraged, it is still possible to use the old behavior
|
||||||
|
* through {@link Lucene43NGramTokenizer}.
|
||||||
*/
|
*/
|
||||||
public final class NGramTokenizer extends Tokenizer {
|
public final class NGramTokenizer extends Tokenizer {
|
||||||
public static final int DEFAULT_MIN_NGRAM_SIZE = 1;
|
public static final int DEFAULT_MIN_NGRAM_SIZE = 1;
|
||||||
public static final int DEFAULT_MAX_NGRAM_SIZE = 2;
|
public static final int DEFAULT_MAX_NGRAM_SIZE = 2;
|
||||||
|
|
||||||
private int minGram, maxGram;
|
private char[] buffer;
|
||||||
|
private int bufferStart, bufferEnd; // remaining slice of the buffer
|
||||||
|
private int offset;
|
||||||
private int gramSize;
|
private int gramSize;
|
||||||
private int pos;
|
private int minGram, maxGram;
|
||||||
private int inLen; // length of the input AFTER trim()
|
private boolean exhausted;
|
||||||
private int charsRead; // length of the input
|
|
||||||
private String inStr;
|
|
||||||
private boolean started;
|
|
||||||
|
|
||||||
private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
|
private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
|
||||||
|
private final PositionIncrementAttribute posIncAtt = addAttribute(PositionIncrementAttribute.class);
|
||||||
|
private final PositionLengthAttribute posLenAtt = addAttribute(PositionLengthAttribute.class);
|
||||||
private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);
|
private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Creates NGramTokenizer with given min and max n-grams.
|
* Creates NGramTokenizer with given min and max n-grams.
|
||||||
|
* @param version the Lucene compatibility <a href="#version">version</a>
|
||||||
* @param input {@link Reader} holding the input to be tokenized
|
* @param input {@link Reader} holding the input to be tokenized
|
||||||
* @param minGram the smallest n-gram to generate
|
* @param minGram the smallest n-gram to generate
|
||||||
* @param maxGram the largest n-gram to generate
|
* @param maxGram the largest n-gram to generate
|
||||||
*/
|
*/
|
||||||
public NGramTokenizer(Reader input, int minGram, int maxGram) {
|
public NGramTokenizer(Version version, Reader input, int minGram, int maxGram) {
|
||||||
super(input);
|
super(input);
|
||||||
init(minGram, maxGram);
|
init(version, minGram, maxGram);
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Creates NGramTokenizer with given min and max n-grams.
|
* Creates NGramTokenizer with given min and max n-grams.
|
||||||
|
* @param version the Lucene compatibility <a href="#version">version</a>
|
||||||
* @param factory {@link org.apache.lucene.util.AttributeSource.AttributeFactory} to use
|
* @param factory {@link org.apache.lucene.util.AttributeSource.AttributeFactory} to use
|
||||||
* @param input {@link Reader} holding the input to be tokenized
|
* @param input {@link Reader} holding the input to be tokenized
|
||||||
* @param minGram the smallest n-gram to generate
|
* @param minGram the smallest n-gram to generate
|
||||||
* @param maxGram the largest n-gram to generate
|
* @param maxGram the largest n-gram to generate
|
||||||
*/
|
*/
|
||||||
public NGramTokenizer(AttributeFactory factory, Reader input, int minGram, int maxGram) {
|
public NGramTokenizer(Version version, AttributeFactory factory, Reader input, int minGram, int maxGram) {
|
||||||
super(factory, input);
|
super(factory, input);
|
||||||
init(minGram, maxGram);
|
init(version, minGram, maxGram);
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Creates NGramTokenizer with default min and max n-grams.
|
* Creates NGramTokenizer with default min and max n-grams.
|
||||||
|
* @param version the Lucene compatibility <a href="#version">version</a>
|
||||||
* @param input {@link Reader} holding the input to be tokenized
|
* @param input {@link Reader} holding the input to be tokenized
|
||||||
*/
|
*/
|
||||||
public NGramTokenizer(Reader input) {
|
public NGramTokenizer(Version version, Reader input) {
|
||||||
this(input, DEFAULT_MIN_NGRAM_SIZE, DEFAULT_MAX_NGRAM_SIZE);
|
this(version, input, DEFAULT_MIN_NGRAM_SIZE, DEFAULT_MAX_NGRAM_SIZE);
|
||||||
}
|
}
|
||||||
|
|
||||||
private void init(int minGram, int maxGram) {
|
private void init(Version version, int minGram, int maxGram) {
|
||||||
|
if (!version.onOrAfter(Version.LUCENE_44)) {
|
||||||
|
throw new IllegalArgumentException("This class only works with Lucene 4.4+. To emulate the old (broken) behavior of NGramTokenizer, use Lucene43NGramTokenizer");
|
||||||
|
}
|
||||||
if (minGram < 1) {
|
if (minGram < 1) {
|
||||||
throw new IllegalArgumentException("minGram must be greater than zero");
|
throw new IllegalArgumentException("minGram must be greater than zero");
|
||||||
}
|
}
|
||||||
|
@ -83,73 +109,66 @@ public final class NGramTokenizer extends Tokenizer {
|
||||||
}
|
}
|
||||||
this.minGram = minGram;
|
this.minGram = minGram;
|
||||||
this.maxGram = maxGram;
|
this.maxGram = maxGram;
|
||||||
|
buffer = new char[maxGram + 1024];
|
||||||
}
|
}
|
||||||
|
|
||||||
/** Returns the next token in the stream, or null at EOS. */
|
/** Returns the next token in the stream, or null at EOS. */
|
||||||
@Override
|
@Override
|
||||||
public boolean incrementToken() throws IOException {
|
public boolean incrementToken() throws IOException {
|
||||||
clearAttributes();
|
clearAttributes();
|
||||||
if (!started) {
|
|
||||||
started = true;
|
// compact
|
||||||
|
if (bufferStart >= buffer.length - maxGram) {
|
||||||
|
System.arraycopy(buffer, bufferStart, buffer, 0, bufferEnd - bufferStart);
|
||||||
|
bufferEnd -= bufferStart;
|
||||||
|
bufferStart = 0;
|
||||||
|
|
||||||
|
// fill in remaining space
|
||||||
|
if (!exhausted) {
|
||||||
|
// TODO: refactor to a shared readFully
|
||||||
|
while (bufferEnd < buffer.length) {
|
||||||
|
final int read = input.read(buffer, bufferEnd, buffer.length - bufferEnd);
|
||||||
|
if (read == -1) {
|
||||||
|
exhausted = true;
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
bufferEnd += read;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// should we go to the next offset?
|
||||||
|
if (gramSize > maxGram || bufferStart + gramSize > bufferEnd) {
|
||||||
|
bufferStart++;
|
||||||
|
offset++;
|
||||||
gramSize = minGram;
|
gramSize = minGram;
|
||||||
char[] chars = new char[1024];
|
|
||||||
charsRead = 0;
|
|
||||||
// TODO: refactor to a shared readFully somewhere:
|
|
||||||
while (charsRead < chars.length) {
|
|
||||||
int inc = input.read(chars, charsRead, chars.length-charsRead);
|
|
||||||
if (inc == -1) {
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
charsRead += inc;
|
|
||||||
}
|
|
||||||
inStr = new String(chars, 0, charsRead).trim(); // remove any trailing empty strings
|
|
||||||
|
|
||||||
if (charsRead == chars.length) {
|
|
||||||
// Read extra throwaway chars so that on end() we
|
|
||||||
// report the correct offset:
|
|
||||||
char[] throwaway = new char[1024];
|
|
||||||
while(true) {
|
|
||||||
final int inc = input.read(throwaway, 0, throwaway.length);
|
|
||||||
if (inc == -1) {
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
charsRead += inc;
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
inLen = inStr.length();
|
// are there enough chars remaining?
|
||||||
if (inLen == 0) {
|
if (bufferStart + gramSize > bufferEnd) {
|
||||||
return false;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
if (pos+gramSize > inLen) { // if we hit the end of the string
|
|
||||||
pos = 0; // reset to beginning of string
|
|
||||||
gramSize++; // increase n-gram size
|
|
||||||
if (gramSize > maxGram) // we are done
|
|
||||||
return false;
|
|
||||||
if (pos+gramSize > inLen)
|
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
|
||||||
int oldPos = pos;
|
termAtt.copyBuffer(buffer, bufferStart, gramSize);
|
||||||
pos++;
|
posIncAtt.setPositionIncrement(1);
|
||||||
termAtt.setEmpty().append(inStr, oldPos, oldPos+gramSize);
|
posLenAtt.setPositionLength(1);
|
||||||
offsetAtt.setOffset(correctOffset(oldPos), correctOffset(oldPos+gramSize));
|
offsetAtt.setOffset(correctOffset(offset), correctOffset(offset + gramSize));
|
||||||
|
++gramSize;
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public void end() {
|
public void end() {
|
||||||
// set final offset
|
final int endOffset = correctOffset(offset + bufferEnd - bufferStart);
|
||||||
final int finalOffset = correctOffset(charsRead);
|
offsetAtt.setOffset(endOffset, endOffset);
|
||||||
this.offsetAtt.setOffset(finalOffset, finalOffset);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public void reset() throws IOException {
|
public void reset() throws IOException {
|
||||||
super.reset();
|
super.reset();
|
||||||
started = false;
|
bufferStart = bufferEnd = buffer.length;
|
||||||
pos = 0;
|
offset = 0;
|
||||||
|
gramSize = minGram;
|
||||||
|
exhausted = false;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -18,8 +18,10 @@ package org.apache.lucene.analysis.ngram;
|
||||||
*/
|
*/
|
||||||
|
|
||||||
import org.apache.lucene.analysis.TokenStream;
|
import org.apache.lucene.analysis.TokenStream;
|
||||||
|
import org.apache.lucene.analysis.Tokenizer;
|
||||||
import org.apache.lucene.analysis.util.TokenizerFactory;
|
import org.apache.lucene.analysis.util.TokenizerFactory;
|
||||||
import org.apache.lucene.util.AttributeSource.AttributeFactory;
|
import org.apache.lucene.util.AttributeSource.AttributeFactory;
|
||||||
|
import org.apache.lucene.util.Version;
|
||||||
|
|
||||||
import java.io.Reader;
|
import java.io.Reader;
|
||||||
import java.util.Map;
|
import java.util.Map;
|
||||||
|
@ -49,7 +51,11 @@ public class NGramTokenizerFactory extends TokenizerFactory {
|
||||||
|
|
||||||
/** Creates the {@link TokenStream} of n-grams from the given {@link Reader} and {@link AttributeFactory}. */
|
/** Creates the {@link TokenStream} of n-grams from the given {@link Reader} and {@link AttributeFactory}. */
|
||||||
@Override
|
@Override
|
||||||
public NGramTokenizer create(AttributeFactory factory, Reader input) {
|
public Tokenizer create(AttributeFactory factory, Reader input) {
|
||||||
return new NGramTokenizer(factory, input, minGramSize, maxGramSize);
|
if (luceneMatchVersion.onOrAfter(Version.LUCENE_44)) {
|
||||||
|
return new NGramTokenizer(luceneMatchVersion, factory, input, minGramSize, maxGramSize);
|
||||||
|
} else {
|
||||||
|
return new Lucene43NGramTokenizer(factory, input, minGramSize, maxGramSize);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -54,8 +54,6 @@ import org.apache.lucene.analysis.MockTokenizer;
|
||||||
import org.apache.lucene.analysis.TokenFilter;
|
import org.apache.lucene.analysis.TokenFilter;
|
||||||
import org.apache.lucene.analysis.TokenStream;
|
import org.apache.lucene.analysis.TokenStream;
|
||||||
import org.apache.lucene.analysis.Tokenizer;
|
import org.apache.lucene.analysis.Tokenizer;
|
||||||
import org.apache.lucene.analysis.miscellaneous.LimitTokenPositionFilter;
|
|
||||||
import org.apache.lucene.analysis.wikipedia.WikipediaTokenizer;
|
|
||||||
import org.apache.lucene.analysis.ValidatingTokenFilter;
|
import org.apache.lucene.analysis.ValidatingTokenFilter;
|
||||||
import org.apache.lucene.analysis.charfilter.NormalizeCharMap;
|
import org.apache.lucene.analysis.charfilter.NormalizeCharMap;
|
||||||
import org.apache.lucene.analysis.cjk.CJKBigramFilter;
|
import org.apache.lucene.analysis.cjk.CJKBigramFilter;
|
||||||
|
@ -71,14 +69,14 @@ import org.apache.lucene.analysis.miscellaneous.HyphenatedWordsFilter;
|
||||||
import org.apache.lucene.analysis.miscellaneous.KeepWordFilter;
|
import org.apache.lucene.analysis.miscellaneous.KeepWordFilter;
|
||||||
import org.apache.lucene.analysis.miscellaneous.LengthFilter;
|
import org.apache.lucene.analysis.miscellaneous.LengthFilter;
|
||||||
import org.apache.lucene.analysis.miscellaneous.LimitTokenCountFilter;
|
import org.apache.lucene.analysis.miscellaneous.LimitTokenCountFilter;
|
||||||
|
import org.apache.lucene.analysis.miscellaneous.LimitTokenPositionFilter;
|
||||||
import org.apache.lucene.analysis.miscellaneous.StemmerOverrideFilter;
|
import org.apache.lucene.analysis.miscellaneous.StemmerOverrideFilter;
|
||||||
|
import org.apache.lucene.analysis.miscellaneous.StemmerOverrideFilter.StemmerOverrideMap;
|
||||||
import org.apache.lucene.analysis.miscellaneous.TrimFilter;
|
import org.apache.lucene.analysis.miscellaneous.TrimFilter;
|
||||||
import org.apache.lucene.analysis.miscellaneous.WordDelimiterFilter;
|
import org.apache.lucene.analysis.miscellaneous.WordDelimiterFilter;
|
||||||
import org.apache.lucene.analysis.miscellaneous.StemmerOverrideFilter.StemmerOverrideMap;
|
|
||||||
import org.apache.lucene.analysis.ngram.EdgeNGramTokenFilter;
|
import org.apache.lucene.analysis.ngram.EdgeNGramTokenFilter;
|
||||||
import org.apache.lucene.analysis.ngram.EdgeNGramTokenizer;
|
import org.apache.lucene.analysis.ngram.EdgeNGramTokenizer;
|
||||||
import org.apache.lucene.analysis.ngram.NGramTokenFilter;
|
import org.apache.lucene.analysis.ngram.Lucene43NGramTokenizer;
|
||||||
import org.apache.lucene.analysis.ngram.NGramTokenizer;
|
|
||||||
import org.apache.lucene.analysis.path.PathHierarchyTokenizer;
|
import org.apache.lucene.analysis.path.PathHierarchyTokenizer;
|
||||||
import org.apache.lucene.analysis.path.ReversePathHierarchyTokenizer;
|
import org.apache.lucene.analysis.path.ReversePathHierarchyTokenizer;
|
||||||
import org.apache.lucene.analysis.payloads.IdentityEncoder;
|
import org.apache.lucene.analysis.payloads.IdentityEncoder;
|
||||||
|
@ -90,8 +88,9 @@ import org.apache.lucene.analysis.synonym.SynonymMap;
|
||||||
import org.apache.lucene.analysis.th.ThaiWordFilter;
|
import org.apache.lucene.analysis.th.ThaiWordFilter;
|
||||||
import org.apache.lucene.analysis.util.CharArrayMap;
|
import org.apache.lucene.analysis.util.CharArrayMap;
|
||||||
import org.apache.lucene.analysis.util.CharArraySet;
|
import org.apache.lucene.analysis.util.CharArraySet;
|
||||||
import org.apache.lucene.util.AttributeSource.AttributeFactory;
|
import org.apache.lucene.analysis.wikipedia.WikipediaTokenizer;
|
||||||
import org.apache.lucene.util.AttributeSource;
|
import org.apache.lucene.util.AttributeSource;
|
||||||
|
import org.apache.lucene.util.AttributeSource.AttributeFactory;
|
||||||
import org.apache.lucene.util.CharsRef;
|
import org.apache.lucene.util.CharsRef;
|
||||||
import org.apache.lucene.util.Rethrow;
|
import org.apache.lucene.util.Rethrow;
|
||||||
import org.apache.lucene.util.Version;
|
import org.apache.lucene.util.Version;
|
||||||
|
@ -162,9 +161,7 @@ public class TestRandomChains extends BaseTokenStreamTestCase {
|
||||||
// startOffset thats > its endOffset
|
// startOffset thats > its endOffset
|
||||||
// (see LUCENE-3738 for a list of other offenders here)
|
// (see LUCENE-3738 for a list of other offenders here)
|
||||||
// broken!
|
// broken!
|
||||||
NGramTokenizer.class,
|
Lucene43NGramTokenizer.class,
|
||||||
// broken!
|
|
||||||
NGramTokenFilter.class,
|
|
||||||
// broken!
|
// broken!
|
||||||
EdgeNGramTokenizer.class,
|
EdgeNGramTokenizer.class,
|
||||||
// broken!
|
// broken!
|
||||||
|
|
|
@ -26,7 +26,9 @@ import org.apache.lucene.analysis.Tokenizer;
|
||||||
import org.apache.lucene.analysis.core.KeywordTokenizer;
|
import org.apache.lucene.analysis.core.KeywordTokenizer;
|
||||||
import org.apache.lucene.analysis.core.WhitespaceTokenizer;
|
import org.apache.lucene.analysis.core.WhitespaceTokenizer;
|
||||||
import org.apache.lucene.analysis.miscellaneous.ASCIIFoldingFilter;
|
import org.apache.lucene.analysis.miscellaneous.ASCIIFoldingFilter;
|
||||||
|
import org.apache.lucene.util.Version;
|
||||||
|
|
||||||
|
import java.io.IOException;
|
||||||
import java.io.Reader;
|
import java.io.Reader;
|
||||||
import java.io.StringReader;
|
import java.io.StringReader;
|
||||||
import java.util.Random;
|
import java.util.Random;
|
||||||
|
@ -46,7 +48,7 @@ public class NGramTokenFilterTest extends BaseTokenStreamTestCase {
|
||||||
public void testInvalidInput() throws Exception {
|
public void testInvalidInput() throws Exception {
|
||||||
boolean gotException = false;
|
boolean gotException = false;
|
||||||
try {
|
try {
|
||||||
new NGramTokenFilter(input, 2, 1);
|
new NGramTokenFilter(TEST_VERSION_CURRENT, input, 2, 1);
|
||||||
} catch (IllegalArgumentException e) {
|
} catch (IllegalArgumentException e) {
|
||||||
gotException = true;
|
gotException = true;
|
||||||
}
|
}
|
||||||
|
@ -56,7 +58,7 @@ public class NGramTokenFilterTest extends BaseTokenStreamTestCase {
|
||||||
public void testInvalidInput2() throws Exception {
|
public void testInvalidInput2() throws Exception {
|
||||||
boolean gotException = false;
|
boolean gotException = false;
|
||||||
try {
|
try {
|
||||||
new NGramTokenFilter(input, 0, 1);
|
new NGramTokenFilter(TEST_VERSION_CURRENT, input, 0, 1);
|
||||||
} catch (IllegalArgumentException e) {
|
} catch (IllegalArgumentException e) {
|
||||||
gotException = true;
|
gotException = true;
|
||||||
}
|
}
|
||||||
|
@ -64,42 +66,56 @@ public class NGramTokenFilterTest extends BaseTokenStreamTestCase {
|
||||||
}
|
}
|
||||||
|
|
||||||
public void testUnigrams() throws Exception {
|
public void testUnigrams() throws Exception {
|
||||||
NGramTokenFilter filter = new NGramTokenFilter(input, 1, 1);
|
NGramTokenFilter filter = new NGramTokenFilter(TEST_VERSION_CURRENT, input, 1, 1);
|
||||||
assertTokenStreamContents(filter, new String[]{"a","b","c","d","e"}, new int[]{0,1,2,3,4}, new int[]{1,2,3,4,5});
|
assertTokenStreamContents(filter, new String[]{"a","b","c","d","e"}, new int[]{0,0,0,0,0}, new int[]{5,5,5,5,5}, new int[]{1,0,0,0,0});
|
||||||
}
|
}
|
||||||
|
|
||||||
public void testBigrams() throws Exception {
|
public void testBigrams() throws Exception {
|
||||||
NGramTokenFilter filter = new NGramTokenFilter(input, 2, 2);
|
NGramTokenFilter filter = new NGramTokenFilter(TEST_VERSION_CURRENT, input, 2, 2);
|
||||||
assertTokenStreamContents(filter, new String[]{"ab","bc","cd","de"}, new int[]{0,1,2,3}, new int[]{2,3,4,5});
|
assertTokenStreamContents(filter, new String[]{"ab","bc","cd","de"}, new int[]{0,0,0,0}, new int[]{5,5,5,5}, new int[]{1,0,0,0});
|
||||||
}
|
}
|
||||||
|
|
||||||
public void testNgrams() throws Exception {
|
public void testNgrams() throws Exception {
|
||||||
NGramTokenFilter filter = new NGramTokenFilter(input, 1, 3);
|
NGramTokenFilter filter = new NGramTokenFilter(TEST_VERSION_CURRENT, input, 1, 3);
|
||||||
assertTokenStreamContents(filter,
|
assertTokenStreamContents(filter,
|
||||||
new String[]{"a","b","c","d","e", "ab","bc","cd","de", "abc","bcd","cde"},
|
new String[]{"a","ab","abc","b","bc","bcd","c","cd","cde","d","de","e"},
|
||||||
new int[]{0,1,2,3,4, 0,1,2,3, 0,1,2},
|
new int[]{0,0,0,0,0,0,0,0,0,0,0,0},
|
||||||
new int[]{1,2,3,4,5, 2,3,4,5, 3,4,5},
|
new int[]{5,5,5,5,5,5,5,5,5,5,5,5},
|
||||||
null, null, null, null, false
|
null,
|
||||||
|
new int[]{1,0,0,0,0,0,0,0,0,0,0,0},
|
||||||
|
null, null, false
|
||||||
|
);
|
||||||
|
}
|
||||||
|
|
||||||
|
public void testNgramsNoIncrement() throws Exception {
|
||||||
|
NGramTokenFilter filter = new NGramTokenFilter(TEST_VERSION_CURRENT, input, 1, 3);
|
||||||
|
assertTokenStreamContents(filter,
|
||||||
|
new String[]{"a","ab","abc","b","bc","bcd","c","cd","cde","d","de","e"},
|
||||||
|
new int[]{0,0,0,0,0,0,0,0,0,0,0,0},
|
||||||
|
new int[]{5,5,5,5,5,5,5,5,5,5,5,5},
|
||||||
|
null,
|
||||||
|
new int[]{1,0,0,0,0,0,0,0,0,0,0,0},
|
||||||
|
null, null, false
|
||||||
);
|
);
|
||||||
}
|
}
|
||||||
|
|
||||||
public void testOversizedNgrams() throws Exception {
|
public void testOversizedNgrams() throws Exception {
|
||||||
NGramTokenFilter filter = new NGramTokenFilter(input, 6, 7);
|
NGramTokenFilter filter = new NGramTokenFilter(TEST_VERSION_CURRENT, input, 6, 7);
|
||||||
assertTokenStreamContents(filter, new String[0], new int[0], new int[0]);
|
assertTokenStreamContents(filter, new String[0], new int[0], new int[0]);
|
||||||
}
|
}
|
||||||
|
|
||||||
public void testSmallTokenInStream() throws Exception {
|
public void testSmallTokenInStream() throws Exception {
|
||||||
input = new MockTokenizer(new StringReader("abc de fgh"), MockTokenizer.WHITESPACE, false);
|
input = new MockTokenizer(new StringReader("abc de fgh"), MockTokenizer.WHITESPACE, false);
|
||||||
NGramTokenFilter filter = new NGramTokenFilter(input, 3, 3);
|
NGramTokenFilter filter = new NGramTokenFilter(TEST_VERSION_CURRENT, input, 3, 3);
|
||||||
assertTokenStreamContents(filter, new String[]{"abc","fgh"}, new int[]{0,7}, new int[]{3,10});
|
assertTokenStreamContents(filter, new String[]{"abc","fgh"}, new int[]{0,7}, new int[]{3,10}, new int[] {1, 2});
|
||||||
}
|
}
|
||||||
|
|
||||||
public void testReset() throws Exception {
|
public void testReset() throws Exception {
|
||||||
WhitespaceTokenizer tokenizer = new WhitespaceTokenizer(TEST_VERSION_CURRENT, new StringReader("abcde"));
|
WhitespaceTokenizer tokenizer = new WhitespaceTokenizer(TEST_VERSION_CURRENT, new StringReader("abcde"));
|
||||||
NGramTokenFilter filter = new NGramTokenFilter(tokenizer, 1, 1);
|
NGramTokenFilter filter = new NGramTokenFilter(TEST_VERSION_CURRENT, tokenizer, 1, 1);
|
||||||
assertTokenStreamContents(filter, new String[]{"a","b","c","d","e"}, new int[]{0,1,2,3,4}, new int[]{1,2,3,4,5});
|
assertTokenStreamContents(filter, new String[]{"a","b","c","d","e"}, new int[]{0,0,0,0,0}, new int[]{5,5,5,5,5}, new int[]{1,0,0,0,0});
|
||||||
tokenizer.setReader(new StringReader("abcde"));
|
tokenizer.setReader(new StringReader("abcde"));
|
||||||
assertTokenStreamContents(filter, new String[]{"a","b","c","d","e"}, new int[]{0,1,2,3,4}, new int[]{1,2,3,4,5});
|
assertTokenStreamContents(filter, new String[]{"a","b","c","d","e"}, new int[]{0,0,0,0,0}, new int[]{5,5,5,5,5}, new int[]{1,0,0,0,0});
|
||||||
}
|
}
|
||||||
|
|
||||||
// LUCENE-3642
|
// LUCENE-3642
|
||||||
|
@ -112,14 +128,15 @@ public class NGramTokenFilterTest extends BaseTokenStreamTestCase {
|
||||||
protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
|
protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
|
||||||
Tokenizer tokenizer = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false);
|
Tokenizer tokenizer = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false);
|
||||||
TokenFilter filters = new ASCIIFoldingFilter(tokenizer);
|
TokenFilter filters = new ASCIIFoldingFilter(tokenizer);
|
||||||
filters = new NGramTokenFilter(filters, 2, 2);
|
filters = new NGramTokenFilter(TEST_VERSION_CURRENT, filters, 2, 2);
|
||||||
return new TokenStreamComponents(tokenizer, filters);
|
return new TokenStreamComponents(tokenizer, filters);
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
assertAnalyzesTo(analyzer, "mosfellsbær",
|
assertAnalyzesTo(analyzer, "mosfellsbær",
|
||||||
new String[] { "mo", "os", "sf", "fe", "el", "ll", "ls", "sb", "ba", "ae", "er" },
|
new String[] { "mo", "os", "sf", "fe", "el", "ll", "ls", "sb", "ba", "ae", "er" },
|
||||||
new int[] { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
|
new int[] { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
|
||||||
new int[] { 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11 });
|
new int[] { 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11 },
|
||||||
|
new int[] { 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 });
|
||||||
}
|
}
|
||||||
|
|
||||||
/** blast some random strings through the analyzer */
|
/** blast some random strings through the analyzer */
|
||||||
|
@ -129,7 +146,7 @@ public class NGramTokenFilterTest extends BaseTokenStreamTestCase {
|
||||||
protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
|
protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
|
||||||
Tokenizer tokenizer = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false);
|
Tokenizer tokenizer = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false);
|
||||||
return new TokenStreamComponents(tokenizer,
|
return new TokenStreamComponents(tokenizer,
|
||||||
new NGramTokenFilter(tokenizer, 2, 4));
|
new NGramTokenFilter(TEST_VERSION_CURRENT, tokenizer, 2, 4));
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
checkRandomData(random(), a, 1000*RANDOM_MULTIPLIER, 20, false, false);
|
checkRandomData(random(), a, 1000*RANDOM_MULTIPLIER, 20, false, false);
|
||||||
|
@ -142,9 +159,22 @@ public class NGramTokenFilterTest extends BaseTokenStreamTestCase {
|
||||||
protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
|
protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
|
||||||
Tokenizer tokenizer = new KeywordTokenizer(reader);
|
Tokenizer tokenizer = new KeywordTokenizer(reader);
|
||||||
return new TokenStreamComponents(tokenizer,
|
return new TokenStreamComponents(tokenizer,
|
||||||
new NGramTokenFilter(tokenizer, 2, 15));
|
new NGramTokenFilter(TEST_VERSION_CURRENT, tokenizer, 2, 15));
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
checkAnalysisConsistency(random, a, random.nextBoolean(), "");
|
checkAnalysisConsistency(random, a, random.nextBoolean(), "");
|
||||||
}
|
}
|
||||||
|
|
||||||
|
public void testLucene43() throws IOException {
|
||||||
|
NGramTokenFilter filter = new NGramTokenFilter(Version.LUCENE_43, input, 2, 3);
|
||||||
|
assertTokenStreamContents(filter,
|
||||||
|
new String[]{"ab","bc","cd","de","abc","bcd","cde"},
|
||||||
|
new int[]{0,1,2,3,0,1,2},
|
||||||
|
new int[]{2,3,4,5,3,4,5},
|
||||||
|
null,
|
||||||
|
new int[]{1,1,1,1,1,1,1},
|
||||||
|
null, null, false
|
||||||
|
);
|
||||||
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
|
@ -18,13 +18,21 @@ package org.apache.lucene.analysis.ngram;
|
||||||
*/
|
*/
|
||||||
|
|
||||||
|
|
||||||
|
import java.io.IOException;
|
||||||
import java.io.Reader;
|
import java.io.Reader;
|
||||||
import java.io.StringReader;
|
import java.io.StringReader;
|
||||||
|
|
||||||
import org.apache.lucene.analysis.Analyzer;
|
import org.apache.lucene.analysis.Analyzer;
|
||||||
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
|
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
|
||||||
|
import org.apache.lucene.analysis.TokenStream;
|
||||||
import org.apache.lucene.analysis.Tokenizer;
|
import org.apache.lucene.analysis.Tokenizer;
|
||||||
import org.apache.lucene.util.LuceneTestCase.Slow;
|
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
|
||||||
|
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
|
||||||
|
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
|
||||||
|
import org.apache.lucene.analysis.tokenattributes.PositionLengthAttribute;
|
||||||
|
import org.apache.lucene.util._TestUtil;
|
||||||
|
|
||||||
|
import com.carrotsearch.randomizedtesting.generators.RandomStrings;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Tests {@link NGramTokenizer} for correctness.
|
* Tests {@link NGramTokenizer} for correctness.
|
||||||
|
@ -41,7 +49,7 @@ public class NGramTokenizerTest extends BaseTokenStreamTestCase {
|
||||||
public void testInvalidInput() throws Exception {
|
public void testInvalidInput() throws Exception {
|
||||||
boolean gotException = false;
|
boolean gotException = false;
|
||||||
try {
|
try {
|
||||||
new NGramTokenizer(input, 2, 1);
|
new NGramTokenizer(TEST_VERSION_CURRENT, input, 2, 1);
|
||||||
} catch (IllegalArgumentException e) {
|
} catch (IllegalArgumentException e) {
|
||||||
gotException = true;
|
gotException = true;
|
||||||
}
|
}
|
||||||
|
@ -51,7 +59,7 @@ public class NGramTokenizerTest extends BaseTokenStreamTestCase {
|
||||||
public void testInvalidInput2() throws Exception {
|
public void testInvalidInput2() throws Exception {
|
||||||
boolean gotException = false;
|
boolean gotException = false;
|
||||||
try {
|
try {
|
||||||
new NGramTokenizer(input, 0, 1);
|
new NGramTokenizer(TEST_VERSION_CURRENT, input, 0, 1);
|
||||||
} catch (IllegalArgumentException e) {
|
} catch (IllegalArgumentException e) {
|
||||||
gotException = true;
|
gotException = true;
|
||||||
}
|
}
|
||||||
|
@ -59,21 +67,21 @@ public class NGramTokenizerTest extends BaseTokenStreamTestCase {
|
||||||
}
|
}
|
||||||
|
|
||||||
public void testUnigrams() throws Exception {
|
public void testUnigrams() throws Exception {
|
||||||
NGramTokenizer tokenizer = new NGramTokenizer(input, 1, 1);
|
NGramTokenizer tokenizer = new NGramTokenizer(TEST_VERSION_CURRENT, input, 1, 1);
|
||||||
assertTokenStreamContents(tokenizer, new String[]{"a","b","c","d","e"}, new int[]{0,1,2,3,4}, new int[]{1,2,3,4,5}, 5 /* abcde */);
|
assertTokenStreamContents(tokenizer, new String[]{"a","b","c","d","e"}, new int[]{0,1,2,3,4}, new int[]{1,2,3,4,5}, 5 /* abcde */);
|
||||||
}
|
}
|
||||||
|
|
||||||
public void testBigrams() throws Exception {
|
public void testBigrams() throws Exception {
|
||||||
NGramTokenizer tokenizer = new NGramTokenizer(input, 2, 2);
|
NGramTokenizer tokenizer = new NGramTokenizer(TEST_VERSION_CURRENT, input, 2, 2);
|
||||||
assertTokenStreamContents(tokenizer, new String[]{"ab","bc","cd","de"}, new int[]{0,1,2,3}, new int[]{2,3,4,5}, 5 /* abcde */);
|
assertTokenStreamContents(tokenizer, new String[]{"ab","bc","cd","de"}, new int[]{0,1,2,3}, new int[]{2,3,4,5}, 5 /* abcde */);
|
||||||
}
|
}
|
||||||
|
|
||||||
public void testNgrams() throws Exception {
|
public void testNgrams() throws Exception {
|
||||||
NGramTokenizer tokenizer = new NGramTokenizer(input, 1, 3);
|
NGramTokenizer tokenizer = new NGramTokenizer(TEST_VERSION_CURRENT, input, 1, 3);
|
||||||
assertTokenStreamContents(tokenizer,
|
assertTokenStreamContents(tokenizer,
|
||||||
new String[]{"a","b","c","d","e", "ab","bc","cd","de", "abc","bcd","cde"},
|
new String[]{"a","ab", "abc", "b", "bc", "bcd", "c", "cd", "cde", "d", "de", "e"},
|
||||||
new int[]{0,1,2,3,4, 0,1,2,3, 0,1,2},
|
new int[]{0,0,0,1,1,1,2,2,2,3,3,4},
|
||||||
new int[]{1,2,3,4,5, 2,3,4,5, 3,4,5},
|
new int[]{1,2,3,2,3,4,3,4,5,4,5,5},
|
||||||
null,
|
null,
|
||||||
null,
|
null,
|
||||||
null,
|
null,
|
||||||
|
@ -83,12 +91,12 @@ public class NGramTokenizerTest extends BaseTokenStreamTestCase {
|
||||||
}
|
}
|
||||||
|
|
||||||
public void testOversizedNgrams() throws Exception {
|
public void testOversizedNgrams() throws Exception {
|
||||||
NGramTokenizer tokenizer = new NGramTokenizer(input, 6, 7);
|
NGramTokenizer tokenizer = new NGramTokenizer(TEST_VERSION_CURRENT, input, 6, 7);
|
||||||
assertTokenStreamContents(tokenizer, new String[0], new int[0], new int[0], 5 /* abcde */);
|
assertTokenStreamContents(tokenizer, new String[0], new int[0], new int[0], 5 /* abcde */);
|
||||||
}
|
}
|
||||||
|
|
||||||
public void testReset() throws Exception {
|
public void testReset() throws Exception {
|
||||||
NGramTokenizer tokenizer = new NGramTokenizer(input, 1, 1);
|
NGramTokenizer tokenizer = new NGramTokenizer(TEST_VERSION_CURRENT, input, 1, 1);
|
||||||
assertTokenStreamContents(tokenizer, new String[]{"a","b","c","d","e"}, new int[]{0,1,2,3,4}, new int[]{1,2,3,4,5}, 5 /* abcde */);
|
assertTokenStreamContents(tokenizer, new String[]{"a","b","c","d","e"}, new int[]{0,1,2,3,4}, new int[]{1,2,3,4,5}, 5 /* abcde */);
|
||||||
tokenizer.setReader(new StringReader("abcde"));
|
tokenizer.setReader(new StringReader("abcde"));
|
||||||
assertTokenStreamContents(tokenizer, new String[]{"a","b","c","d","e"}, new int[]{0,1,2,3,4}, new int[]{1,2,3,4,5}, 5 /* abcde */);
|
assertTokenStreamContents(tokenizer, new String[]{"a","b","c","d","e"}, new int[]{0,1,2,3,4}, new int[]{1,2,3,4,5}, 5 /* abcde */);
|
||||||
|
@ -99,11 +107,48 @@ public class NGramTokenizerTest extends BaseTokenStreamTestCase {
|
||||||
Analyzer a = new Analyzer() {
|
Analyzer a = new Analyzer() {
|
||||||
@Override
|
@Override
|
||||||
protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
|
protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
|
||||||
Tokenizer tokenizer = new NGramTokenizer(reader, 2, 4);
|
Tokenizer tokenizer = new NGramTokenizer(TEST_VERSION_CURRENT, reader, 2, 4);
|
||||||
return new TokenStreamComponents(tokenizer, tokenizer);
|
return new TokenStreamComponents(tokenizer, tokenizer);
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
checkRandomData(random(), a, 1000*RANDOM_MULTIPLIER, 20, false, false);
|
checkRandomData(random(), a, 1000*RANDOM_MULTIPLIER, 20, false, false);
|
||||||
checkRandomData(random(), a, 50*RANDOM_MULTIPLIER, 1027, false, false);
|
checkRandomData(random(), a, 50*RANDOM_MULTIPLIER, 1027, false, false);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
private void testNGrams(int minGram, int maxGram, int length) throws IOException {
|
||||||
|
final String s = RandomStrings.randomAsciiOfLength(random(), length);
|
||||||
|
final TokenStream grams = new NGramTokenizer(TEST_VERSION_CURRENT, new StringReader(s), minGram, maxGram);
|
||||||
|
final CharTermAttribute termAtt = grams.addAttribute(CharTermAttribute.class);
|
||||||
|
final PositionIncrementAttribute posIncAtt = grams.addAttribute(PositionIncrementAttribute.class);
|
||||||
|
final PositionLengthAttribute posLenAtt = grams.addAttribute(PositionLengthAttribute.class);
|
||||||
|
final OffsetAttribute offsetAtt = grams.addAttribute(OffsetAttribute.class);
|
||||||
|
grams.reset();
|
||||||
|
for (int start = 0; start < s.length(); ++start) {
|
||||||
|
for (int end = start + minGram; end <= start + maxGram && end <= s.length(); ++end) {
|
||||||
|
assertTrue(grams.incrementToken());
|
||||||
|
assertEquals(s.substring(start, end), termAtt.toString());
|
||||||
|
assertEquals(1, posIncAtt.getPositionIncrement());
|
||||||
|
assertEquals(start, offsetAtt.startOffset());
|
||||||
|
assertEquals(end, offsetAtt.endOffset());
|
||||||
|
}
|
||||||
|
}
|
||||||
|
grams.end();
|
||||||
|
assertEquals(s.length(), offsetAtt.startOffset());
|
||||||
|
assertEquals(s.length(), offsetAtt.endOffset());
|
||||||
|
}
|
||||||
|
|
||||||
|
public void testLargeInput() throws IOException {
|
||||||
|
// test sliding
|
||||||
|
final int minGram = _TestUtil.nextInt(random(), 1, 100);
|
||||||
|
final int maxGram = _TestUtil.nextInt(random(), minGram, 100);
|
||||||
|
testNGrams(minGram, maxGram, _TestUtil.nextInt(random(), 3 * 1024, 4 * 1024));
|
||||||
|
}
|
||||||
|
|
||||||
|
public void testLargeMaxGram() throws IOException {
|
||||||
|
// test sliding with maxGram > 1024
|
||||||
|
final int minGram = _TestUtil.nextInt(random(), 1200, 1300);
|
||||||
|
final int maxGram = _TestUtil.nextInt(random(), minGram, 1300);
|
||||||
|
testNGrams(minGram, maxGram, _TestUtil.nextInt(random(), 3 * 1024, 4 * 1024));
|
||||||
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
|
@ -35,7 +35,7 @@ public class TestNGramFilters extends BaseTokenStreamFactoryTestCase {
|
||||||
Reader reader = new StringReader("test");
|
Reader reader = new StringReader("test");
|
||||||
TokenStream stream = tokenizerFactory("NGram").create(reader);
|
TokenStream stream = tokenizerFactory("NGram").create(reader);
|
||||||
assertTokenStreamContents(stream,
|
assertTokenStreamContents(stream,
|
||||||
new String[] { "t", "e", "s", "t", "te", "es", "st" });
|
new String[] { "t", "te", "e", "es", "s", "st", "t" });
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
@ -47,7 +47,7 @@ public class TestNGramFilters extends BaseTokenStreamFactoryTestCase {
|
||||||
"minGramSize", "2",
|
"minGramSize", "2",
|
||||||
"maxGramSize", "3").create(reader);
|
"maxGramSize", "3").create(reader);
|
||||||
assertTokenStreamContents(stream,
|
assertTokenStreamContents(stream,
|
||||||
new String[] { "te", "es", "st", "tes", "est" });
|
new String[] { "te", "tes", "es", "est", "st" });
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
@ -58,7 +58,7 @@ public class TestNGramFilters extends BaseTokenStreamFactoryTestCase {
|
||||||
TokenStream stream = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false);
|
TokenStream stream = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false);
|
||||||
stream = tokenFilterFactory("NGram").create(stream);
|
stream = tokenFilterFactory("NGram").create(stream);
|
||||||
assertTokenStreamContents(stream,
|
assertTokenStreamContents(stream,
|
||||||
new String[] { "t", "e", "s", "t", "te", "es", "st" });
|
new String[] { "t", "te", "e", "es", "s", "st", "t" });
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
@ -71,7 +71,7 @@ public class TestNGramFilters extends BaseTokenStreamFactoryTestCase {
|
||||||
"minGramSize", "2",
|
"minGramSize", "2",
|
||||||
"maxGramSize", "3").create(stream);
|
"maxGramSize", "3").create(stream);
|
||||||
assertTokenStreamContents(stream,
|
assertTokenStreamContents(stream,
|
||||||
new String[] { "te", "es", "st", "tes", "est" });
|
new String[] { "te", "tes", "es", "est", "st" });
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
|
Loading…
Reference in New Issue