Import the new n-gram tokenizers and filters from Lucene.
Lucene 4.4 will feature new n-gram tokenizers and filters that no longer generate broken offsets (which caused highlighting bugs). They also handle supplementary characters correctly, and the tokenizers can work in a streaming fashion (they are no longer limited to the first 1024 chars of the stream).
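As a quick orientation (not part of the commit itself), the following is a minimal sketch of how the backported tokenizer can be consumed once this change is in. The class and constructor come from the code added below; the input string and gram sizes are arbitrary example values.

import java.io.StringReader;
import org.apache.lucene.analysis.ngram.XNGramTokenizer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.util.Version;

public class XNGramTokenizerExample {
    public static void main(String[] args) throws Exception {
        // Streams the input instead of only reading the first 1024 chars.
        XNGramTokenizer tokenizer = new XNGramTokenizer(Version.LUCENE_43, new StringReader("abcde"), 1, 2);
        CharTermAttribute term = tokenizer.addAttribute(CharTermAttribute.class);
        OffsetAttribute offset = tokenizer.addAttribute(OffsetAttribute.class);
        tokenizer.reset();
        while (tokenizer.incrementToken()) {
            // Offsets always point into the original input, so highlighting stays consistent.
            System.out.println(term.toString() + " [" + offset.startOffset() + "," + offset.endOffset() + ")");
        }
        tokenizer.end();
        tokenizer.close();
    }
}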
parent a388588b1f
commit fccbe9c185
@@ -0,0 +1,214 @@
package org.apache.lucene.analysis.ngram;

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import org.elasticsearch.common.lucene.Lucene;

import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.reverse.ReverseStringFilter;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionLengthAttribute;
import org.apache.lucene.analysis.util.XCharacterUtils;
import org.apache.lucene.util.Version;

import java.io.IOException;

/**
 * Tokenizes the given token into n-grams of given size(s).
 * <p>
 * This {@link TokenFilter} creates n-grams from the beginning edge or ending edge of an input token.
 * <p><a name="version"/>As of Lucene 4.4, this filter does not support
 * {@link Side#BACK} (you can use {@link ReverseStringFilter} up-front and
 * afterward to get the same behavior), handles supplementary characters
 * correctly and does not update offsets anymore.
 */
public final class XEdgeNGramTokenFilter extends TokenFilter {

    static {
        // LUCENE MONITOR: this should be in Lucene 4.4 copied from Revision: 1492640.
        assert Lucene.VERSION == Version.LUCENE_43 : "Elasticsearch has upgraded to Lucene Version: [" + Lucene.VERSION + "] this class should be removed";
    }

    public static final Side DEFAULT_SIDE = Side.FRONT;
    public static final int DEFAULT_MAX_GRAM_SIZE = 1;
    public static final int DEFAULT_MIN_GRAM_SIZE = 1;

    /** Specifies which side of the input the n-gram should be generated from */
    public static enum Side {

        /** Get the n-gram from the front of the input */
        FRONT {
            @Override
            public String getLabel() { return "front"; }
        },

        /** Get the n-gram from the end of the input */
        @Deprecated
        BACK {
            @Override
            public String getLabel() { return "back"; }
        };

        public abstract String getLabel();

        // Get the appropriate Side from a string
        public static Side getSide(String sideName) {
            if (FRONT.getLabel().equals(sideName)) {
                return FRONT;
            }
            if (BACK.getLabel().equals(sideName)) {
                return BACK;
            }
            return null;
        }
    }

    private final XCharacterUtils charUtils;
    private final int minGram;
    private final int maxGram;
    private Side side;
    private char[] curTermBuffer;
    private int curTermLength;
    private int curCodePointCount;
    private int curGramSize;
    private int tokStart;
    private int tokEnd;
    private int savePosIncr;
    private int savePosLen;

    private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
    private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);
    private final PositionIncrementAttribute posIncrAtt = addAttribute(PositionIncrementAttribute.class);
    private final PositionLengthAttribute posLenAtt = addAttribute(PositionLengthAttribute.class);

    /**
     * Creates XEdgeNGramTokenFilter that can generate n-grams in the sizes of the given range
     *
     * @param version the <a href="#version">Lucene match version</a>
     * @param input {@link TokenStream} holding the input to be tokenized
     * @param side the {@link Side} from which to chop off an n-gram
     * @param minGram the smallest n-gram to generate
     * @param maxGram the largest n-gram to generate
     */
    @Deprecated
    public XEdgeNGramTokenFilter(Version version, TokenStream input, Side side, int minGram, int maxGram) {
        super(input);

        if (version == null) {
            throw new IllegalArgumentException("version must not be null");
        }

        if (side == Side.BACK) {
            throw new IllegalArgumentException("Side.BACK is not supported anymore as of Lucene 4.4, use ReverseStringFilter up-front and afterward");
        }

        if (side == null) {
            throw new IllegalArgumentException("sideLabel must be either front or back");
        }

        if (minGram < 1) {
            throw new IllegalArgumentException("minGram must be greater than zero");
        }

        if (minGram > maxGram) {
            throw new IllegalArgumentException("minGram must not be greater than maxGram");
        }

        this.charUtils = XCharacterUtils.getInstance(version);
        this.minGram = minGram;
        this.maxGram = maxGram;
        this.side = side;
    }

    /**
     * Creates XEdgeNGramTokenFilter that can generate n-grams in the sizes of the given range
     *
     * @param version the <a href="#version">Lucene match version</a>
     * @param input {@link TokenStream} holding the input to be tokenized
     * @param sideLabel the name of the {@link Side} from which to chop off an n-gram
     * @param minGram the smallest n-gram to generate
     * @param maxGram the largest n-gram to generate
     */
    @Deprecated
    public XEdgeNGramTokenFilter(Version version, TokenStream input, String sideLabel, int minGram, int maxGram) {
        this(version, input, Side.getSide(sideLabel), minGram, maxGram);
    }

    /**
     * Creates XEdgeNGramTokenFilter that can generate n-grams in the sizes of the given range
     *
     * @param version the <a href="#version">Lucene match version</a>
     * @param input {@link TokenStream} holding the input to be tokenized
     * @param minGram the smallest n-gram to generate
     * @param maxGram the largest n-gram to generate
     */
    public XEdgeNGramTokenFilter(Version version, TokenStream input, int minGram, int maxGram) {
        this(version, input, Side.FRONT, minGram, maxGram);
    }

    @Override
    public final boolean incrementToken() throws IOException {
        while (true) {
            if (curTermBuffer == null) {
                if (!input.incrementToken()) {
                    return false;
                } else {
                    curTermBuffer = termAtt.buffer().clone();
                    curTermLength = termAtt.length();
                    curCodePointCount = charUtils.codePointCount(termAtt);
                    curGramSize = minGram;
                    tokStart = offsetAtt.startOffset();
                    tokEnd = offsetAtt.endOffset();
                    savePosIncr += posIncrAtt.getPositionIncrement();
                    savePosLen = posLenAtt.getPositionLength();
                }
            }
            if (curGramSize <= maxGram) {         // if we have hit the end of our n-gram size range, quit
                if (curGramSize <= curCodePointCount) { // if the remaining input is too short, we can't generate any n-grams
                    // grab gramSize chars from front or back
                    final int start = side == Side.FRONT ? 0 : charUtils.offsetByCodePoints(curTermBuffer, 0, curTermLength, curTermLength, -curGramSize);
                    final int end = charUtils.offsetByCodePoints(curTermBuffer, 0, curTermLength, start, curGramSize);
                    clearAttributes();
                    offsetAtt.setOffset(tokStart, tokEnd);
                    // first ngram gets increment, others don't
                    if (curGramSize == minGram) {
                        posIncrAtt.setPositionIncrement(savePosIncr);
                        savePosIncr = 0;
                    } else {
                        posIncrAtt.setPositionIncrement(0);
                    }
                    posLenAtt.setPositionLength(savePosLen);
                    termAtt.copyBuffer(curTermBuffer, start, end - start);
                    curGramSize++;
                    return true;
                }
            }
            curTermBuffer = null;
        }
    }

    @Override
    public void reset() throws IOException {
        super.reset();
        curTermBuffer = null;
        savePosIncr = 0;
    }
}
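For illustration only (this snippet is not part of the commit), the filter above can be dropped behind any tokenizer; with minGram=1 and maxGram=3 the token "elastic" yields "e", "el", "ela", all sharing the original token's offsets and position. The sketch assumes Lucene 4.3 on the classpath; the WhitespaceTokenizer and the gram sizes are arbitrary example choices.

import java.io.StringReader;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.core.WhitespaceTokenizer;
import org.apache.lucene.analysis.ngram.XEdgeNGramTokenFilter;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.util.Version;

public class XEdgeNGramTokenFilterExample {
    public static void main(String[] args) throws Exception {
        TokenStream ts = new WhitespaceTokenizer(Version.LUCENE_43, new StringReader("elastic"));
        // Front edge n-grams of sizes 1 to 3; offsets are left untouched.
        ts = new XEdgeNGramTokenFilter(Version.LUCENE_43, ts, 1, 3);
        CharTermAttribute term = ts.addAttribute(CharTermAttribute.class);
        ts.reset();
        while (ts.incrementToken()) {
            System.out.println(term.toString()); // prints "e", "el", "ela"
        }
        ts.end();
        ts.close();
    }
}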
@@ -0,0 +1,77 @@
package org.apache.lucene.analysis.ngram;

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import org.elasticsearch.common.lucene.Lucene;

import java.io.Reader;

import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.util.AttributeSource.AttributeFactory;
import org.apache.lucene.util.Version;

/**
 * Tokenizes the input from an edge into n-grams of given size(s).
 * <p>
 * This {@link Tokenizer} creates n-grams from the beginning edge or ending edge of an input token.
 * <p><a name="version" /> As of Lucene 4.4, this tokenizer<ul>
 * <li>can handle <code>maxGram</code> larger than 1024 chars, but beware that this will result in increased memory usage
 * <li>doesn't trim the input,
 * <li>sets position increments equal to 1 instead of 1 for the first token and 0 for all other ones
 * <li>doesn't support backward n-grams anymore.
 * <li>supports {@link #isTokenChar(int) pre-tokenization},
 * <li>correctly handles supplementary characters.
 * </ul>
 * <p>Although <b style="color:red">highly</b> discouraged, it is still possible
 * to use the old behavior through {@link XLucene43EdgeNGramTokenizer}.
 */
public class XEdgeNGramTokenizer extends XNGramTokenizer {

    static {
        // LUCENE MONITOR: this should be in Lucene 4.4 copied from Revision: 1492640.
        assert Lucene.VERSION == Version.LUCENE_43 : "Elasticsearch has upgraded to Lucene Version: [" + Lucene.VERSION + "] this class should be removed";
    }

    public static final int DEFAULT_MAX_GRAM_SIZE = 1;
    public static final int DEFAULT_MIN_GRAM_SIZE = 1;

    /**
     * Creates XEdgeNGramTokenizer that can generate n-grams in the sizes of the given range
     *
     * @param version the <a href="#version">Lucene match version</a>
     * @param input {@link Reader} holding the input to be tokenized
     * @param minGram the smallest n-gram to generate
     * @param maxGram the largest n-gram to generate
     */
    public XEdgeNGramTokenizer(Version version, Reader input, int minGram, int maxGram) {
        super(version, input, minGram, maxGram, true);
    }

    /**
     * Creates XEdgeNGramTokenizer that can generate n-grams in the sizes of the given range
     *
     * @param version the <a href="#version">Lucene match version</a>
     * @param factory {@link org.apache.lucene.util.AttributeSource.AttributeFactory} to use
     * @param input {@link Reader} holding the input to be tokenized
     * @param minGram the smallest n-gram to generate
     * @param maxGram the largest n-gram to generate
     */
    public XEdgeNGramTokenizer(Version version, AttributeFactory factory, Reader input, int minGram, int maxGram) {
        super(version, factory, input, minGram, maxGram, true);
    }

}
@@ -0,0 +1,281 @@
package org.apache.lucene.analysis.ngram;

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import org.elasticsearch.common.lucene.Lucene;

import java.io.IOException;
import java.io.Reader;

import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.util.ArrayUtil;
import org.apache.lucene.util.AttributeSource.AttributeFactory;
import org.apache.lucene.util.Version;

/**
 * Old version of {@link EdgeNGramTokenizer} which doesn't correctly handle
 * supplementary characters.
 */
@Deprecated
public final class XLucene43EdgeNGramTokenizer extends Tokenizer {

    static {
        // LUCENE MONITOR: this should be in Lucene 4.4 copied from Revision: 1492640.
        assert Lucene.VERSION == Version.LUCENE_43 : "Elasticsearch has upgraded to Lucene Version: [" + Lucene.VERSION + "] this class should be removed";
    }

    public static final Side DEFAULT_SIDE = Side.FRONT;
    public static final int DEFAULT_MAX_GRAM_SIZE = 1;
    public static final int DEFAULT_MIN_GRAM_SIZE = 1;

    private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
    private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);
    private final PositionIncrementAttribute posIncrAtt = addAttribute(PositionIncrementAttribute.class);

    /** Specifies which side of the input the n-gram should be generated from */
    public static enum Side {

        /** Get the n-gram from the front of the input */
        FRONT {
            @Override
            public String getLabel() { return "front"; }
        },

        /** Get the n-gram from the end of the input */
        BACK {
            @Override
            public String getLabel() { return "back"; }
        };

        public abstract String getLabel();

        // Get the appropriate Side from a string
        public static Side getSide(String sideName) {
            if (FRONT.getLabel().equals(sideName)) {
                return FRONT;
            }
            if (BACK.getLabel().equals(sideName)) {
                return BACK;
            }
            return null;
        }
    }

    private int minGram;
    private int maxGram;
    private int gramSize;
    private Side side;
    private boolean started;
    private int inLen; // length of the input AFTER trim()
    private int charsRead; // length of the input
    private String inStr;

    /**
     * Creates EdgeNGramTokenizer that can generate n-grams in the sizes of the given range
     *
     * @param version the <a href="#version">Lucene match version</a>
     * @param input {@link Reader} holding the input to be tokenized
     * @param side the {@link Side} from which to chop off an n-gram
     * @param minGram the smallest n-gram to generate
     * @param maxGram the largest n-gram to generate
     */
    @Deprecated
    public XLucene43EdgeNGramTokenizer(Version version, Reader input, Side side, int minGram, int maxGram) {
        super(input);
        init(version, side, minGram, maxGram);
    }

    /**
     * Creates EdgeNGramTokenizer that can generate n-grams in the sizes of the given range
     *
     * @param version the <a href="#version">Lucene match version</a>
     * @param factory {@link org.apache.lucene.util.AttributeSource.AttributeFactory} to use
     * @param input {@link Reader} holding the input to be tokenized
     * @param side the {@link Side} from which to chop off an n-gram
     * @param minGram the smallest n-gram to generate
     * @param maxGram the largest n-gram to generate
     */
    @Deprecated
    public XLucene43EdgeNGramTokenizer(Version version, AttributeFactory factory, Reader input, Side side, int minGram, int maxGram) {
        super(factory, input);
        init(version, side, minGram, maxGram);
    }

    /**
     * Creates EdgeNGramTokenizer that can generate n-grams in the sizes of the given range
     *
     * @param version the <a href="#version">Lucene match version</a>
     * @param input {@link Reader} holding the input to be tokenized
     * @param sideLabel the name of the {@link Side} from which to chop off an n-gram
     * @param minGram the smallest n-gram to generate
     * @param maxGram the largest n-gram to generate
     */
    @Deprecated
    public XLucene43EdgeNGramTokenizer(Version version, Reader input, String sideLabel, int minGram, int maxGram) {
        this(version, input, Side.getSide(sideLabel), minGram, maxGram);
    }

    /**
     * Creates EdgeNGramTokenizer that can generate n-grams in the sizes of the given range
     *
     * @param version the <a href="#version">Lucene match version</a>
     * @param factory {@link org.apache.lucene.util.AttributeSource.AttributeFactory} to use
     * @param input {@link Reader} holding the input to be tokenized
     * @param sideLabel the name of the {@link Side} from which to chop off an n-gram
     * @param minGram the smallest n-gram to generate
     * @param maxGram the largest n-gram to generate
     */
    @Deprecated
    public XLucene43EdgeNGramTokenizer(Version version, AttributeFactory factory, Reader input, String sideLabel, int minGram, int maxGram) {
        this(version, factory, input, Side.getSide(sideLabel), minGram, maxGram);
    }

    /**
     * Creates EdgeNGramTokenizer that can generate n-grams in the sizes of the given range
     *
     * @param version the <a href="#version">Lucene match version</a>
     * @param input {@link Reader} holding the input to be tokenized
     * @param minGram the smallest n-gram to generate
     * @param maxGram the largest n-gram to generate
     */
    public XLucene43EdgeNGramTokenizer(Version version, Reader input, int minGram, int maxGram) {
        this(version, input, Side.FRONT, minGram, maxGram);
    }

    /**
     * Creates EdgeNGramTokenizer that can generate n-grams in the sizes of the given range
     *
     * @param version the <a href="#version">Lucene match version</a>
     * @param factory {@link org.apache.lucene.util.AttributeSource.AttributeFactory} to use
     * @param input {@link Reader} holding the input to be tokenized
     * @param minGram the smallest n-gram to generate
     * @param maxGram the largest n-gram to generate
     */
    public XLucene43EdgeNGramTokenizer(Version version, AttributeFactory factory, Reader input, int minGram, int maxGram) {
        this(version, factory, input, Side.FRONT, minGram, maxGram);
    }

    private void init(Version version, Side side, int minGram, int maxGram) {
        if (version == null) {
            throw new IllegalArgumentException("version must not be null");
        }

        if (side == null) {
            throw new IllegalArgumentException("sideLabel must be either front or back");
        }

        if (minGram < 1) {
            throw new IllegalArgumentException("minGram must be greater than zero");
        }

        if (minGram > maxGram) {
            throw new IllegalArgumentException("minGram must not be greater than maxGram");
        }

        maxGram = Math.min(maxGram, 1024);

        this.minGram = minGram;
        this.maxGram = maxGram;
        this.side = side;
    }

    /** Returns the next token in the stream, or null at EOS. */
    @Override
    public boolean incrementToken() throws IOException {
        clearAttributes();
        // if we are just starting, read the whole input
        if (!started) {
            started = true;
            gramSize = minGram;
            final int limit = side == Side.FRONT ? maxGram : 1024;
            char[] chars = new char[Math.min(1024, limit)];
            charsRead = 0;
            // TODO: refactor to a shared readFully somewhere:
            boolean exhausted = false;
            while (charsRead < limit) {
                final int inc = input.read(chars, charsRead, chars.length-charsRead);
                if (inc == -1) {
                    exhausted = true;
                    break;
                }
                charsRead += inc;
                if (charsRead == chars.length && charsRead < limit) {
                    chars = ArrayUtil.grow(chars);
                }
            }

            inStr = new String(chars, 0, charsRead);
            inStr = inStr.trim();

            if (!exhausted) {
                // Read extra throwaway chars so that on end() we
                // report the correct offset:
                char[] throwaway = new char[1024];
                while(true) {
                    final int inc = input.read(throwaway, 0, throwaway.length);
                    if (inc == -1) {
                        break;
                    }
                    charsRead += inc;
                }
            }

            inLen = inStr.length();
            if (inLen == 0) {
                return false;
            }
            posIncrAtt.setPositionIncrement(1);
        } else {
            posIncrAtt.setPositionIncrement(0);
        }

        // if the remaining input is too short, we can't generate any n-grams
        if (gramSize > inLen) {
            return false;
        }

        // if we have hit the end of our n-gram size range, quit
        if (gramSize > maxGram || gramSize > inLen) {
            return false;
        }

        // grab gramSize chars from front or back
        int start = side == Side.FRONT ? 0 : inLen - gramSize;
        int end = start + gramSize;
        termAtt.setEmpty().append(inStr, start, end);
        offsetAtt.setOffset(correctOffset(start), correctOffset(end));
        gramSize++;
        return true;
    }

    @Override
    public void end() {
        // set final offset
        final int finalOffset = correctOffset(charsRead);
        this.offsetAtt.setOffset(finalOffset, finalOffset);
    }

    @Override
    public void reset() throws IOException {
        super.reset();
        started = false;
    }
}
@@ -0,0 +1,164 @@
package org.apache.lucene.analysis.ngram;

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import org.apache.lucene.util.Version;
import org.elasticsearch.common.lucene.Lucene;

import java.io.IOException;
import java.io.Reader;

import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.util.AttributeSource.AttributeFactory;

/**
 * Old broken version of {@link NGramTokenizer}.
 */
@Deprecated
public final class XLucene43NGramTokenizer extends Tokenizer {

    static {
        // LUCENE MONITOR: this should be in Lucene 4.4 copied from Revision: 1492640.
        assert Lucene.VERSION == Version.LUCENE_43 : "Elasticsearch has upgraded to Lucene Version: [" + Lucene.VERSION + "] this class should be removed";
    }

    public static final int DEFAULT_MIN_NGRAM_SIZE = 1;
    public static final int DEFAULT_MAX_NGRAM_SIZE = 2;

    private int minGram, maxGram;
    private int gramSize;
    private int pos;
    private int inLen; // length of the input AFTER trim()
    private int charsRead; // length of the input
    private String inStr;
    private boolean started;

    private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
    private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);

    /**
     * Creates NGramTokenizer with given min and max n-grams.
     * @param input {@link Reader} holding the input to be tokenized
     * @param minGram the smallest n-gram to generate
     * @param maxGram the largest n-gram to generate
     */
    public XLucene43NGramTokenizer(Reader input, int minGram, int maxGram) {
        super(input);
        init(minGram, maxGram);
    }

    /**
     * Creates NGramTokenizer with given min and max n-grams.
     * @param factory {@link org.apache.lucene.util.AttributeSource.AttributeFactory} to use
     * @param input {@link Reader} holding the input to be tokenized
     * @param minGram the smallest n-gram to generate
     * @param maxGram the largest n-gram to generate
     */
    public XLucene43NGramTokenizer(AttributeFactory factory, Reader input, int minGram, int maxGram) {
        super(factory, input);
        init(minGram, maxGram);
    }

    /**
     * Creates NGramTokenizer with default min and max n-grams.
     * @param input {@link Reader} holding the input to be tokenized
     */
    public XLucene43NGramTokenizer(Reader input) {
        this(input, DEFAULT_MIN_NGRAM_SIZE, DEFAULT_MAX_NGRAM_SIZE);
    }

    private void init(int minGram, int maxGram) {
        if (minGram < 1) {
            throw new IllegalArgumentException("minGram must be greater than zero");
        }
        if (minGram > maxGram) {
            throw new IllegalArgumentException("minGram must not be greater than maxGram");
        }
        this.minGram = minGram;
        this.maxGram = maxGram;
    }

    /** Returns the next token in the stream, or null at EOS. */
    @Override
    public boolean incrementToken() throws IOException {
        clearAttributes();
        if (!started) {
            started = true;
            gramSize = minGram;
            char[] chars = new char[1024];
            charsRead = 0;
            // TODO: refactor to a shared readFully somewhere:
            while (charsRead < chars.length) {
                int inc = input.read(chars, charsRead, chars.length-charsRead);
                if (inc == -1) {
                    break;
                }
                charsRead += inc;
            }
            inStr = new String(chars, 0, charsRead).trim(); // remove any trailing empty strings

            if (charsRead == chars.length) {
                // Read extra throwaway chars so that on end() we
                // report the correct offset:
                char[] throwaway = new char[1024];
                while(true) {
                    final int inc = input.read(throwaway, 0, throwaway.length);
                    if (inc == -1) {
                        break;
                    }
                    charsRead += inc;
                }
            }

            inLen = inStr.length();
            if (inLen == 0) {
                return false;
            }
        }

        if (pos+gramSize > inLen) { // if we hit the end of the string
            pos = 0; // reset to beginning of string
            gramSize++; // increase n-gram size
            if (gramSize > maxGram) // we are done
                return false;
            if (pos+gramSize > inLen)
                return false;
        }

        int oldPos = pos;
        pos++;
        termAtt.setEmpty().append(inStr, oldPos, oldPos+gramSize);
        offsetAtt.setOffset(correctOffset(oldPos), correctOffset(oldPos+gramSize));
        return true;
    }

    @Override
    public void end() {
        // set final offset
        final int finalOffset = correctOffset(charsRead);
        this.offsetAtt.setOffset(finalOffset, finalOffset);
    }

    @Override
    public void reset() throws IOException {
        super.reset();
        started = false;
        pos = 0;
    }
}
@@ -17,6 +17,8 @@ package org.apache.lucene.analysis.ngram;
 * limitations under the License.
 */

import org.elasticsearch.common.lucene.Lucene;

import java.io.IOException;

import org.apache.lucene.analysis.TokenFilter;

@@ -26,14 +28,15 @@ import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionLengthAttribute;
import org.apache.lucene.analysis.util.XCharacterUtils;
import org.apache.lucene.util.Version;
import org.elasticsearch.common.lucene.Lucene;

/**
 * Tokenizes the input into n-grams of the given size(s).
 * <a name="version"/>
 * <p>You must specify the required {@link Version} compatibility when
 * creating a {@link NGramTokenFilter}. As of Lucene 4.4, this token filters:<ul>
 * creating a {@link XNGramTokenFilter}. As of Lucene 4.4, this token filters:<ul>
 * <li>handles supplementary characters correctly,</li>
 * <li>emits all n-grams for the same token at the same position,</li>
 * <li>does not modify offsets,</li>
 * <li>sorts n-grams by their offset in the original token first, then

@@ -43,13 +46,18 @@ import org.elasticsearch.common.lucene.Lucene;
 * {@link Version#LUCENE_44} in the constructor but this is not recommended as
 * it will lead to broken {@link TokenStream}s that will cause highlighting
 * bugs.
 * <p>If you were using this {@link TokenFilter} to perform partial highlighting,
 * this won't work anymore since this filter doesn't update offsets. You should
 * modify your analysis chain to use {@link NGramTokenizer}, and potentially
 * override {@link NGramTokenizer#isTokenChar(int)} to perform pre-tokenization.
 */
public final class XNGramTokenFilter extends TokenFilter {

    static {
        // LUCENE MONITOR: this should be in Lucene 4.4 copied from Revision: 1476563
        assert Lucene.VERSION.ordinal() < Version.LUCENE_42.ordinal()+2 : "Elasticsearch has upgraded to Lucene Version: [" + Lucene.VERSION + "] this should can be removed";
        // LUCENE MONITOR: this should be in Lucene 4.4 copied from Revision: 1492640.
        assert Lucene.VERSION == Version.LUCENE_43 : "Elasticsearch has upgraded to Lucene Version: [" + Lucene.VERSION + "] this class should be removed";
    }

    public static final int DEFAULT_MIN_NGRAM_SIZE = 1;
    public static final int DEFAULT_MAX_NGRAM_SIZE = 2;

@@ -57,21 +65,21 @@ public final class XNGramTokenFilter extends TokenFilter {

    private char[] curTermBuffer;
    private int curTermLength;
    private int curCodePointCount;
    private int curGramSize;
    private int curPos;
    private int curPosInc, curPosLen;
    private int tokStart;
    private int tokEnd;
    private boolean hasIllegalOffsets; // only if the length changed before this filter

    private final Version version;
    private final XCharacterUtils charUtils;
    private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
    private final PositionIncrementAttribute posIncAtt;
    private final PositionLengthAttribute posLenAtt;
    private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);

    /**
     * Creates NGramTokenFilter with given min and max n-grams.
     * Creates XNGramTokenFilter with given min and max n-grams.
     * @param version Lucene version to enable correct position increments.
     * See <a href="#version">above</a> for details.
     * @param input {@link TokenStream} holding the input to be tokenized

@@ -80,7 +88,7 @@ public final class XNGramTokenFilter extends TokenFilter {
     */
    public XNGramTokenFilter(Version version, TokenStream input, int minGram, int maxGram) {
        super(new LengthFilter(true, input, minGram, Integer.MAX_VALUE));
        this.version = version;
        this.charUtils = XCharacterUtils.getInstance(version);
        if (minGram < 1) {
            throw new IllegalArgumentException("minGram must be greater than zero");
        }

@@ -89,31 +97,12 @@ public final class XNGramTokenFilter extends TokenFilter {
        }
        this.minGram = minGram;
        this.maxGram = maxGram;
        if (version.onOrAfter(Version.LUCENE_42)) {
            posIncAtt = addAttribute(PositionIncrementAttribute.class);
            posLenAtt = addAttribute(PositionLengthAttribute.class);
        } else {
            posIncAtt = new PositionIncrementAttribute() {
                @Override
                public void setPositionIncrement(int positionIncrement) {}
                @Override
                public int getPositionIncrement() {
                    return 0;
                }
            };
            posLenAtt = new PositionLengthAttribute() {
                @Override
                public void setPositionLength(int positionLength) {}
                @Override
                public int getPositionLength() {
                    return 0;
                }
            };
        }
        posIncAtt = addAttribute(PositionIncrementAttribute.class);
        posLenAtt = addAttribute(PositionLengthAttribute.class);
    }

    /**
     * Creates NGramTokenFilter with default min and max n-grams.
     * Creates XNGramTokenFilter with default min and max n-grams.
     * @param version Lucene version to enable correct position increments.
     * See <a href="#version">above</a> for details.
     * @param input {@link TokenStream} holding the input to be tokenized

@@ -132,25 +121,24 @@ public final class XNGramTokenFilter extends TokenFilter {
                } else {
                    curTermBuffer = termAtt.buffer().clone();
                    curTermLength = termAtt.length();
                    curCodePointCount = charUtils.codePointCount(termAtt);
                    curGramSize = minGram;
                    curPos = 0;
                    curPosInc = posIncAtt.getPositionIncrement();
                    curPosLen = posLenAtt.getPositionLength();
                    tokStart = offsetAtt.startOffset();
                    tokEnd = offsetAtt.endOffset();
                    // if length by start + end offsets doesn't match the term text then assume
                    // this is a synonym and don't adjust the offsets.
                    hasIllegalOffsets = (tokStart + curTermLength) != tokEnd;
                }
            }
            if (version.onOrAfter(Version.LUCENE_42)) {
                if (curGramSize > maxGram || curPos + curGramSize > curTermLength) {
                if (curGramSize > maxGram || (curPos + curGramSize) > curCodePointCount) {
                    ++curPos;
                    curGramSize = minGram;
                }
                if (curPos + curGramSize <= curTermLength) {
                if ((curPos + curGramSize) <= curCodePointCount) {
                    clearAttributes();
                    termAtt.copyBuffer(curTermBuffer, curPos, curGramSize);
                    final int start = charUtils.offsetByCodePoints(curTermBuffer, 0, curTermLength, 0, curPos);
                    final int end = charUtils.offsetByCodePoints(curTermBuffer, 0, curTermLength, start, curGramSize);
                    termAtt.copyBuffer(curTermBuffer, start, end - start);
                    posIncAtt.setPositionIncrement(curPosInc);
                    curPosInc = 0;
                    posLenAtt.setPositionLength(curPosLen);

@@ -158,23 +146,6 @@ public final class XNGramTokenFilter extends TokenFilter {
                    curGramSize++;
                    return true;
                }
            } else {
                while (curGramSize <= maxGram) {
                    while (curPos+curGramSize <= curTermLength) { // while there is input
                        clearAttributes();
                        termAtt.copyBuffer(curTermBuffer, curPos, curGramSize);
                        if (hasIllegalOffsets) {
                            offsetAtt.setOffset(tokStart, tokEnd);
                        } else {
                            offsetAtt.setOffset(tokStart + curPos, tokStart + curPos + curGramSize);
                        }
                        curPos++;
                        return true;
                    }
                    curGramSize++; // increase n-gram size
                    curPos = 0;
                }
            }
            curTermBuffer = null;
        }
    }
@@ -17,6 +17,8 @@ package org.apache.lucene.analysis.ngram;
 * limitations under the License.
 */

import org.elasticsearch.common.lucene.Lucene;

import java.io.IOException;
import java.io.Reader;

@@ -25,8 +27,8 @@ import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionLengthAttribute;
import org.apache.lucene.analysis.util.XCharacterUtils;
import org.apache.lucene.util.Version;
import org.elasticsearch.common.lucene.Lucene;

/**
 * Tokenizes the input into n-grams of the given size(s).

@@ -41,34 +43,53 @@ import org.elasticsearch.common.lucene.Lucene;
 * <tr><th>Offsets</th><td>[0,2[</td><td>[0,3[</td><td>[1,3[</td><td>[1,4[</td><td>[2,4[</td><td>[2,5[</td><td>[3,5[</td></tr>
 * </table>
 * <a name="version"/>
 * <p>Before Lucene 4.4, this class had a different behavior:<ul>
 * <li>It didn't support more than 1024 chars of input, the rest was trashed.</li>
 * <li>The last whitespaces of the 1024 chars block were trimmed.</li>
 * <li>Tokens were emitted in a different order (by increasing lengths).</li></ul>
 * <p>Although highly discouraged, it is still possible to use the old behavior
 * through {@link Lucene43NGramTokenizer}.
 * <p>This tokenizer changed a lot in Lucene 4.4 in order to:<ul>
 * <li>tokenize in a streaming fashion to support streams which are larger
 * than 1024 chars (limit of the previous version),
 * <li>count grams based on unicode code points instead of java chars (and
 * never split in the middle of surrogate pairs),
 * <li>give the ability to {@link #isTokenChar(int) pre-tokenize} the stream
 * before computing n-grams.</ul>
 * <p>Additionally, this class doesn't trim trailing whitespaces and emits
 * tokens in a different order, tokens are now emitted by increasing start
 * offsets while they used to be emitted by increasing lengths (which prevented
 * from supporting large input streams).
 * <p>Although <b style="color:red">highly</b> discouraged, it is still possible
 * to use the old behavior through {@link Lucene43NGramTokenizer}.
 */
public final class XNGramTokenizer extends Tokenizer {
// non-final to allow for overriding isTokenChar, but all other methods should be final
public class XNGramTokenizer extends Tokenizer {

    static {
        // LUCENE MONITOR: this should be in Lucene 4.4 copied from Revision: 1492640.
        assert Lucene.VERSION == Version.LUCENE_43 : "Elasticsearch has upgraded to Lucene Version: [" + Lucene.VERSION + "] this class should be removed";
    }

    public static final int DEFAULT_MIN_NGRAM_SIZE = 1;
    public static final int DEFAULT_MAX_NGRAM_SIZE = 2;

    static {
        // LUCENE MONITOR: this should be in Lucene 4.4 copied from Revision: 1476563
        assert Lucene.VERSION.ordinal() < Version.LUCENE_42.ordinal()+2 : "Elasticsearch has upgraded to Lucene Version: [" + Lucene.VERSION + "] this should can be removed";
    }

    private char[] buffer;
    private int bufferStart, bufferEnd; // remaining slice of the buffer
    private XCharacterUtils charUtils;
    private XCharacterUtils.CharacterBuffer charBuffer;
    private int[] buffer; // like charBuffer, but converted to code points
    private int bufferStart, bufferEnd; // remaining slice in buffer
    private int offset;
    private int gramSize;
    private int minGram, maxGram;
    private boolean exhausted;
    private int lastCheckedChar; // last offset in the buffer that we checked
    private int lastNonTokenChar; // last offset that we found to not be a token char
    private boolean edgesOnly; // leading edges n-grams only

    private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
    private final PositionIncrementAttribute posIncAtt = addAttribute(PositionIncrementAttribute.class);
    private final PositionLengthAttribute posLenAtt = addAttribute(PositionLengthAttribute.class);
    private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);

    XNGramTokenizer(Version version, Reader input, int minGram, int maxGram, boolean edgesOnly) {
        super(input);
        init(version, minGram, maxGram, edgesOnly);
    }

    /**
     * Creates NGramTokenizer with given min and max n-grams.
     * @param version the lucene compatibility <a href="#version">version</a>

@@ -77,8 +98,12 @@ public final class XNGramTokenizer extends Tokenizer {
     * @param maxGram the largest n-gram to generate
     */
    public XNGramTokenizer(Version version, Reader input, int minGram, int maxGram) {
        super(input);
        init(version, minGram, maxGram);
        this(version, input, minGram, maxGram, false);
    }

    XNGramTokenizer(Version version, AttributeFactory factory, Reader input, int minGram, int maxGram, boolean edgesOnly) {
        super(factory, input);
        init(version, minGram, maxGram, edgesOnly);
    }

    /**

@@ -90,8 +115,7 @@ public final class XNGramTokenizer extends Tokenizer {
     * @param maxGram the largest n-gram to generate
     */
    public XNGramTokenizer(Version version, AttributeFactory factory, Reader input, int minGram, int maxGram) {
        super(factory, input);
        init(version, minGram, maxGram);
        this(version, factory, input, minGram, maxGram, false);
    }

    /**

@@ -103,10 +127,13 @@ public final class XNGramTokenizer extends Tokenizer {
        this(version, input, DEFAULT_MIN_NGRAM_SIZE, DEFAULT_MAX_NGRAM_SIZE);
    }

    private void init(Version version, int minGram, int maxGram) {
        if (!version.onOrAfter(Version.LUCENE_42)) {
            throw new IllegalArgumentException("This class only works with Lucene 4.4+. To emulate the old (broken) behavior of NGramTokenizer, use Lucene43NGramTokenizer");
    private void init(Version version, int minGram, int maxGram, boolean edgesOnly) {
        if (!version.onOrAfter(Version.LUCENE_43)) {
            throw new IllegalArgumentException("This class only works with Lucene 4.4+. To emulate the old (broken) behavior of NGramTokenizer, use Lucene43NGramTokenizer/Lucene43EdgeNGramTokenizer");
        }
        charUtils = version.onOrAfter(Version.LUCENE_43)
            ? XCharacterUtils.getInstance(version)
            : XCharacterUtils.getJava4Instance();
        if (minGram < 1) {
            throw new IllegalArgumentException("minGram must be greater than zero");
        }

@@ -115,66 +142,107 @@ public final class XNGramTokenizer extends Tokenizer {
        }
        this.minGram = minGram;
        this.maxGram = maxGram;
        buffer = new char[maxGram + 1024];
        this.edgesOnly = edgesOnly;
        charBuffer = XCharacterUtils.newCharacterBuffer(2 * maxGram + 1024); // 2 * maxGram in case all code points require 2 chars and + 1024 for buffering to not keep polling the Reader
        buffer = new int[charBuffer.getBuffer().length];
        // Make the term att large enough
        termAtt.resizeBuffer(2 * maxGram);
    }

    /** Returns the next token in the stream, or null at EOS. */
    @Override
    public boolean incrementToken() throws IOException {
    public final boolean incrementToken() throws IOException {
        clearAttributes();

        // compact
        if (bufferStart >= buffer.length - maxGram) {
            System.arraycopy(buffer, bufferStart, buffer, 0, bufferEnd - bufferStart);
            bufferEnd -= bufferStart;
            bufferStart = 0;
        // termination of this loop is guaranteed by the fact that every iteration
        // either advances the buffer (calls consumes()) or increases gramSize
        while (true) {
            // compact
            if (bufferStart >= bufferEnd - maxGram - 1 && !exhausted) {
                System.arraycopy(buffer, bufferStart, buffer, 0, bufferEnd - bufferStart);
                bufferEnd -= bufferStart;
                lastCheckedChar -= bufferStart;
                lastNonTokenChar -= bufferStart;
                bufferStart = 0;

            // fill in remaining space
            if (!exhausted) {
                // TODO: refactor to a shared readFully
                while (bufferEnd < buffer.length) {
                    final int read = input.read(buffer, bufferEnd, buffer.length - bufferEnd);
                    if (read == -1) {
                        exhausted = true;
                        break;
                    }
                    bufferEnd += read;
                // fill in remaining space
                exhausted = !charUtils.fill(charBuffer, input, buffer.length - bufferEnd);
                // convert to code points
                bufferEnd += charUtils.toCodePoints(charBuffer.getBuffer(), 0, charBuffer.getLength(), buffer, bufferEnd);
            }

            // should we go to the next offset?
            if (gramSize > maxGram || (bufferStart + gramSize) > bufferEnd) {
                if (bufferStart + 1 + minGram > bufferEnd) {
                    assert exhausted;
                    return false;
                }
                consume();
                gramSize = minGram;
            }

            updateLastNonTokenChar();

            // retry if the token to be emitted was going to not only contain token chars
            final boolean termContainsNonTokenChar = lastNonTokenChar >= bufferStart && lastNonTokenChar < (bufferStart + gramSize);
            final boolean isEdgeAndPreviousCharIsTokenChar = edgesOnly && lastNonTokenChar != bufferStart - 1;
            if (termContainsNonTokenChar || isEdgeAndPreviousCharIsTokenChar) {
                consume();
                gramSize = minGram;
                continue;
            }

            final int length = charUtils.toChars(buffer, bufferStart, gramSize, termAtt.buffer(), 0);
            termAtt.setLength(length);
            posIncAtt.setPositionIncrement(1);
            posLenAtt.setPositionLength(1);
            offsetAtt.setOffset(correctOffset(offset), correctOffset(offset + length));
            ++gramSize;
            return true;
        }
    }

    private void updateLastNonTokenChar() {
        final int termEnd = bufferStart + gramSize - 1;
        if (termEnd > lastCheckedChar) {
            for (int i = termEnd; i > lastCheckedChar; --i) {
                if (!isTokenChar(buffer[i])) {
                    lastNonTokenChar = i;
                    break;
                }
            }
            lastCheckedChar = termEnd;
        }
    }

        // should we go to the next offset?
        if (gramSize > maxGram || bufferStart + gramSize > bufferEnd) {
            bufferStart++;
            offset++;
            gramSize = minGram;
        }
    /** Consume one code point. */
    private void consume() {
        offset += Character.charCount(buffer[bufferStart++]);
    }

        // are there enough chars remaining?
        if (bufferStart + gramSize > bufferEnd) {
            return false;
        }

        termAtt.copyBuffer(buffer, bufferStart, gramSize);
        posIncAtt.setPositionIncrement(1);
        posLenAtt.setPositionLength(1);
        offsetAtt.setOffset(correctOffset(offset), correctOffset(offset + gramSize));
        ++gramSize;
    /** Only collect characters which satisfy this condition. */
    protected boolean isTokenChar(int chr) {
        return true;
    }

    @Override
    public void end() {
        final int endOffset = correctOffset(offset + bufferEnd - bufferStart);
    public final void end() {
        assert bufferStart <= bufferEnd;
        int endOffset = offset;
        for (int i = bufferStart; i < bufferEnd; ++i) {
            endOffset += Character.charCount(buffer[i]);
        }
        endOffset = correctOffset(endOffset);
        offsetAtt.setOffset(endOffset, endOffset);
    }

    @Override
    public void reset() throws IOException {
    public final void reset() throws IOException {
        super.reset();
        bufferStart = bufferEnd = buffer.length;
        lastNonTokenChar = lastCheckedChar = bufferStart - 1;
        offset = 0;
        gramSize = minGram;
        exhausted = false;
        charBuffer.reset();
    }
}
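Because the new tokenizer counts grams in Unicode code points, surrogate pairs are never split. The following sketch (not part of the commit) illustrates this with the class added above; the sample string is an arbitrary example containing the supplementary character U+1D11E.

import java.io.StringReader;
import org.apache.lucene.analysis.ngram.XNGramTokenizer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.util.Version;

public class SupplementaryCharExample {
    public static void main(String[] args) throws Exception {
        // U+1D11E (musical G clef) is encoded as two Java chars (a surrogate pair).
        String input = "a\uD834\uDD1Eb";
        XNGramTokenizer tokenizer = new XNGramTokenizer(Version.LUCENE_43, new StringReader(input), 1, 1);
        CharTermAttribute term = tokenizer.addAttribute(CharTermAttribute.class);
        OffsetAttribute offset = tokenizer.addAttribute(OffsetAttribute.class);
        tokenizer.reset();
        while (tokenizer.incrementToken()) {
            // Emits "a", the full surrogate pair, then "b" -- the pair is never split in half.
            System.out.println(term.toString() + " [" + offset.startOffset() + "," + offset.endOffset() + ")");
        }
        tokenizer.end();
        tokenizer.close();
    }
}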
@ -0,0 +1,394 @@
|
|||
package org.apache.lucene.analysis.util;
|
||||
|
||||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
import org.elasticsearch.common.lucene.Lucene;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.io.Reader;
|
||||
|
||||
import org.apache.lucene.util.Version;
|
||||
|
||||
/**
|
||||
* {@link XCharacterUtils} provides a unified interface to Character-related
|
||||
* operations to implement backwards compatible character operations based on a
|
||||
* {@link Version} instance.
|
||||
*
|
||||
* @lucene.internal
|
||||
*/
|
||||
public abstract class XCharacterUtils {
|
||||
|
||||
static {
|
||||
// LUCENE MONITOR: this should be in Lucene 4.4 copied from Revision: 1492640.
|
||||
assert Lucene.VERSION == Version.LUCENE_43 : "Elasticsearch has upgraded to Lucene Version: [" + Lucene.VERSION + "] this class should be removed";
|
||||
}
|
||||
|
||||
private static final Java4XCharacterUtils JAVA_4 = new Java4XCharacterUtils();
|
||||
private static final Java5XCharacterUtils JAVA_5 = new Java5XCharacterUtils();
|
||||
|
||||
/**
|
||||
* Returns a {@link XCharacterUtils} implementation according to the given
|
||||
* {@link Version} instance.
|
||||
*
|
||||
* @param matchVersion
|
||||
* a version instance
|
||||
* @return a {@link XCharacterUtils} implementation according to the given
|
||||
* {@link Version} instance.
|
||||
*/
|
||||
public static XCharacterUtils getInstance(final Version matchVersion) {
|
||||
return matchVersion.onOrAfter(Version.LUCENE_31) ? JAVA_5 : JAVA_4;
|
||||
}
|
||||
|
||||
/** Return a {@link XCharacterUtils} instance compatible with Java 1.4. */
|
||||
public static XCharacterUtils getJava4Instance() {
|
||||
return JAVA_4;
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns the code point at the given index of the {@link CharSequence}.
|
||||
* Depending on the {@link Version} passed to
|
||||
* {@link XCharacterUtils#getInstance(Version)} this method mimics the behavior
|
||||
* of {@link Character#codePointAt(char[], int)} as it would have been
|
||||
* available on a Java 1.4 JVM or on a later virtual machine version.
|
||||
*
|
||||
* @param seq
|
||||
* a character sequence
|
||||
* @param offset
|
||||
* the offset to the char values in the chars array to be converted
|
||||
*
|
||||
* @return the Unicode code point at the given index
|
||||
* @throws NullPointerException
|
||||
* - if the sequence is null.
|
||||
* @throws IndexOutOfBoundsException
|
||||
* - if the value offset is negative or not less than the length of
|
||||
* the character sequence.
|
||||
*/
|
||||
public abstract int codePointAt(final CharSequence seq, final int offset);
|
||||
|
||||
/**
|
||||
* Returns the code point at the given index of the char array where only elements
|
||||
* with index less than the limit are used.
|
||||
* Depending on the {@link Version} passed to
|
||||
* {@link XCharacterUtils#getInstance(Version)} this method mimics the behavior
|
||||
* of {@link Character#codePointAt(char[], int)} as it would have been
|
||||
* available on a Java 1.4 JVM or on a later virtual machine version.
|
||||
*
|
||||
* @param chars
|
||||
* a character array
|
||||
* @param offset
|
||||
* the offset to the char values in the chars array to be converted
|
||||
* @param limit the index afer the last element that should be used to calculate
|
||||
* codepoint.
|
||||
*
|
||||
* @return the Unicode code point at the given index
|
||||
* @throws NullPointerException
|
||||
* - if the array is null.
|
||||
* @throws IndexOutOfBoundsException
|
||||
* - if the value offset is negative or not less than the length of
|
||||
* the char array.
|
||||
*/
|
||||
public abstract int codePointAt(final char[] chars, final int offset, final int limit);
|
||||
|
||||
/** Return the number of characters in <code>seq</code>. */
|
||||
public abstract int codePointCount(CharSequence seq);
|
||||
|
||||
/**
|
||||
* Creates a new {@link CharacterBuffer} and allocates a <code>char[]</code>
|
||||
* of the given bufferSize.
|
||||
*
|
||||
* @param bufferSize
|
||||
* the internal char buffer size, must be <code>>= 2</code>
|
||||
* @return a new {@link CharacterBuffer} instance.
|
||||
*/
|
||||
public static CharacterBuffer newCharacterBuffer(final int bufferSize) {
|
||||
if (bufferSize < 2) {
|
||||
throw new IllegalArgumentException("buffersize must be >= 2");
|
||||
}
|
||||
return new CharacterBuffer(new char[bufferSize], 0, 0);
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* Converts each unicode codepoint to lowerCase via {@link Character#toLowerCase(int)} starting
|
||||
* at the given offset.
|
||||
* @param buffer the char buffer to lowercase
|
||||
* @param offset the offset to start at
|
||||
* @param limit the max char in the buffer to lower case
|
||||
*/
|
||||
public final void toLowerCase(final char[] buffer, final int offset, final int limit) {
|
||||
assert buffer.length >= limit;
|
||||
assert offset <=0 && offset <= buffer.length;
|
||||
for (int i = offset; i < limit;) {
|
||||
i += Character.toChars(
|
||||
Character.toLowerCase(
|
||||
codePointAt(buffer, i, limit)), buffer, i);
|
||||
}
|
||||
}
|
||||
|
||||
/** Converts a sequence of Java characters to a sequence of unicode code points.
|
||||
* @return the number of code points written to the destination buffer */
|
||||
public final int toCodePoints(char[] src, int srcOff, int srcLen, int[] dest, int destOff) {
|
||||
if (srcLen < 0) {
|
||||
throw new IllegalArgumentException("srcLen must be >= 0");
|
||||
}
|
||||
int codePointCount = 0;
|
||||
for (int i = 0; i < srcLen; ) {
|
||||
final int cp = codePointAt(src, srcOff + i, srcOff + srcLen);
|
||||
final int charCount = Character.charCount(cp);
|
||||
dest[destOff + codePointCount++] = cp;
|
||||
i += charCount;
|
||||
}
|
||||
return codePointCount;
|
||||
}
|
||||
|
||||
/** Converts a sequence of unicode code points to a sequence of Java characters.
|
||||
* @return the number of chars written to the destination buffer */
|
||||
public final int toChars(int[] src, int srcOff, int srcLen, char[] dest, int destOff) {
|
||||
if (srcLen < 0) {
|
||||
throw new IllegalArgumentException("srcLen must be >= 0");
|
||||
}
|
||||
int written = 0;
|
||||
for (int i = 0; i < srcLen; ++i) {
|
||||
written += Character.toChars(src[srcOff + i], dest, destOff + written);
|
||||
}
|
||||
return written;
|
||||
}
|
||||
|
||||
/**
|
||||
* Fills the {@link CharacterBuffer} with characters read from the given
|
||||
* {@link Reader}. This method tries to read <code>numChars</code>
|
||||
* characters into the {@link CharacterBuffer}, each call to fill will start
|
||||
* filling the buffer from offset <code>0</code> up to <code>numChars</code>.
|
||||
* In case code points can span across 2 java characters, this method may
|
||||
* only fill <code>numChars - 1</code> characters in order not to split in
|
||||
* the middle of a surrogate pair, even if there are remaining characters in
|
||||
* the {@link Reader}.
|
||||
* <p>
|
||||
* Depending on the {@link Version} passed to
|
||||
* {@link XCharacterUtils#getInstance(Version)} this method implements
|
||||
* supplementary character awareness when filling the given buffer. For all
|
||||
* {@link Version} > 3.0 {@link #fill(CharacterBuffer, Reader, int)} guarantees
|
||||
* that the given {@link CharacterBuffer} will never contain a high surrogate
|
||||
* character as the last element in the buffer unless it is the last available
|
||||
* character in the reader. In other words, high and low surrogate pairs will
|
||||
* always be preserved across buffer borders.
|
||||
* </p>
|
||||
* <p>
|
||||
* A return value of <code>false</code> means that this method call exhausted
|
||||
* the reader, but there may be some characters which have been read, which can be
|
||||
* verified by checking whether <code>buffer.getLength() > 0</code>.
|
||||
* </p>
|
||||
*
|
||||
* @param buffer
|
||||
* the buffer to fill.
|
||||
* @param reader
|
||||
* the reader to read characters from.
|
||||
* @param numChars
|
||||
* the number of chars to read
|
||||
* @return <code>false</code> if and only if reader.read returned -1 while trying to fill the buffer
|
||||
* @throws IOException
|
||||
* if the reader throws an {@link IOException}.
|
||||
*/
|
||||
public abstract boolean fill(CharacterBuffer buffer, Reader reader, int numChars) throws IOException;
|
||||
|
||||
/** Convenience method which calls <code>fill(buffer, reader, buffer.buffer.length)</code>. */
|
||||
public final boolean fill(CharacterBuffer buffer, Reader reader) throws IOException {
|
||||
return fill(buffer, reader, buffer.buffer.length);
|
||||
}
|
||||
|
||||
/** Return the index within <code>buf[start:start+count]</code> which is by <code>offset</code>
|
||||
* code points from <code>index</code>. */
|
||||
public abstract int offsetByCodePoints(char[] buf, int start, int count, int index, int offset);
|
||||
|
||||
static int readFully(Reader reader, char[] dest, int offset, int len) throws IOException {
|
||||
int read = 0;
|
||||
while (read < len) {
|
||||
final int r = reader.read(dest, offset + read, len - read);
|
||||
if (r == -1) {
|
||||
break;
|
||||
}
|
||||
read += r;
|
||||
}
|
||||
return read;
|
||||
}
|
||||
|
||||
private static final class Java5XCharacterUtils extends XCharacterUtils {
|
||||
Java5XCharacterUtils() {
|
||||
}
|
||||
|
||||
@Override
|
||||
public int codePointAt(final CharSequence seq, final int offset) {
|
||||
return Character.codePointAt(seq, offset);
|
||||
}
|
||||
|
||||
@Override
|
||||
public int codePointAt(final char[] chars, final int offset, final int limit) {
|
||||
return Character.codePointAt(chars, offset, limit);
|
||||
}
|
||||
|
||||
@Override
|
||||
public boolean fill(final CharacterBuffer buffer, final Reader reader, int numChars) throws IOException {
|
||||
assert buffer.buffer.length >= 2;
|
||||
if (numChars < 2 || numChars > buffer.buffer.length) {
|
||||
throw new IllegalArgumentException("numChars must be >= 2 and <= the buffer size");
|
||||
}
|
||||
final char[] charBuffer = buffer.buffer;
|
||||
buffer.offset = 0;
|
||||
final int offset;
|
||||
|
||||
// Install the previously saved ending high surrogate:
|
||||
if (buffer.lastTrailingHighSurrogate != 0) {
|
||||
charBuffer[0] = buffer.lastTrailingHighSurrogate;
|
||||
buffer.lastTrailingHighSurrogate = 0;
|
||||
offset = 1;
|
||||
} else {
|
||||
offset = 0;
|
||||
}
|
||||
|
||||
final int read = readFully(reader, charBuffer, offset, numChars - offset);
|
||||
|
||||
buffer.length = offset + read;
|
||||
final boolean result = buffer.length == numChars;
|
||||
if (buffer.length < numChars) {
|
||||
// We failed to fill the buffer. Even if the last char is a high
|
||||
// surrogate, there is nothing we can do
|
||||
return result;
|
||||
}
|
||||
|
||||
if (Character.isHighSurrogate(charBuffer[buffer.length - 1])) {
|
||||
buffer.lastTrailingHighSurrogate = charBuffer[--buffer.length];
|
||||
}
|
||||
return result;
|
||||
}
|
||||
|
||||
@Override
|
||||
public int codePointCount(CharSequence seq) {
|
||||
return Character.codePointCount(seq, 0, seq.length());
|
||||
}
|
||||
|
||||
@Override
|
||||
public int offsetByCodePoints(char[] buf, int start, int count, int index, int offset) {
|
||||
return Character.offsetByCodePoints(buf, start, count, index, offset);
|
||||
}
|
||||
}
|
||||
|
||||
private static final class Java4XCharacterUtils extends XCharacterUtils {
|
||||
Java4XCharacterUtils() {
|
||||
}
|
||||
|
||||
@Override
|
||||
public int codePointAt(final CharSequence seq, final int offset) {
|
||||
return seq.charAt(offset);
|
||||
}
|
||||
|
||||
@Override
|
||||
public int codePointAt(final char[] chars, final int offset, final int limit) {
|
||||
if (offset >= limit)
|
||||
throw new IndexOutOfBoundsException("offset must be less than limit");
|
||||
return chars[offset];
|
||||
}
|
||||
|
||||
@Override
|
||||
public boolean fill(CharacterBuffer buffer, Reader reader, int numChars)
|
||||
throws IOException {
|
||||
assert buffer.buffer.length >= 1;
|
||||
if (numChars < 1 || numChars > buffer.buffer.length) {
|
||||
throw new IllegalArgumentException("numChars must be >= 1 and <= the buffer size");
|
||||
}
|
||||
buffer.offset = 0;
|
||||
final int read = readFully(reader, buffer.buffer, 0, numChars);
|
||||
buffer.length = read;
|
||||
buffer.lastTrailingHighSurrogate = 0;
|
||||
return read == numChars;
|
||||
}
|
||||
|
||||
@Override
|
||||
public int codePointCount(CharSequence seq) {
|
||||
return seq.length();
|
||||
}
|
||||
|
||||
@Override
|
||||
public int offsetByCodePoints(char[] buf, int start, int count, int index, int offset) {
|
||||
final int result = index + offset;
|
||||
if (result < 0 || result > count) {
|
||||
throw new IndexOutOfBoundsException();
|
||||
}
|
||||
return result;
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
/**
|
||||
* A simple IO buffer to use with
|
||||
* {@link XCharacterUtils#fill(CharacterBuffer, Reader)}.
|
||||
*/
|
||||
public static final class CharacterBuffer {
|
||||
|
||||
private final char[] buffer;
|
||||
private int offset;
|
||||
private int length;
|
||||
// NOTE: not private so outer class can access without
|
||||
// $access methods:
|
||||
char lastTrailingHighSurrogate;
|
||||
|
||||
CharacterBuffer(char[] buffer, int offset, int length) {
|
||||
this.buffer = buffer;
|
||||
this.offset = offset;
|
||||
this.length = length;
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns the internal buffer
|
||||
*
|
||||
* @return the buffer
|
||||
*/
|
||||
public char[] getBuffer() {
|
||||
return buffer;
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns the data offset in the internal buffer.
|
||||
*
|
||||
* @return the offset
|
||||
*/
|
||||
public int getOffset() {
|
||||
return offset;
|
||||
}
|
||||
|
||||
/**
|
||||
* Return the length of the data in the internal buffer starting at
|
||||
* {@link #getOffset()}
|
||||
*
|
||||
* @return the length
|
||||
*/
|
||||
public int getLength() {
|
||||
return length;
|
||||
}
|
||||
|
||||
/**
|
||||
* Resets the CharacterBuffer. All internals are reset to their default
|
||||
* values.
|
||||
*/
|
||||
public void reset() {
|
||||
offset = 0;
|
||||
length = 0;
|
||||
lastTrailingHighSurrogate = 0;
|
||||
}
|
||||
}
|
||||
|
||||
}
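A minimal usage sketch (not part of the patch) of the buffer-filling API above: fill() keeps surrogate pairs intact across buffer borders, so each chunk can be processed code point by code point. XCharacterUtils.getInstance(Version) is assumed here exactly as referenced in the Javadoc; the class name of the wrapper and the sample input are illustrative only.

import java.io.IOException;
import java.io.Reader;
import java.io.StringReader;
import org.apache.lucene.analysis.util.XCharacterUtils;
import org.apache.lucene.analysis.util.XCharacterUtils.CharacterBuffer;
import org.apache.lucene.util.Version;

public class XCharacterUtilsExample {
    public static void main(String[] args) throws IOException {
        final XCharacterUtils utils = XCharacterUtils.getInstance(Version.LUCENE_43);
        final CharacterBuffer buf = XCharacterUtils.newCharacterBuffer(1024);
        final Reader reader = new StringReader("Åbc g\uD801\uDC00f");
        int codePoints = 0;
        // fill() returns false once the reader is exhausted; the buffer may still hold a tail
        while (utils.fill(buf, reader) || buf.getLength() > 0) {
            // lower-case the chunk in place, then count its code points
            utils.toLowerCase(buf.getBuffer(), 0, buf.getLength());
            codePoints += utils.codePointCount(
                    new String(buf.getBuffer(), buf.getOffset(), buf.getLength()));
        }
        System.out.println(codePoints); // 7 code points (9 Java chars) for the sample input
    }
}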
|
|
@ -0,0 +1,137 @@
|
|||
/*
|
||||
* Licensed to ElasticSearch and Shay Banon under one
|
||||
* or more contributor license agreements. See the NOTICE file
|
||||
* distributed with this work for additional information
|
||||
* regarding copyright ownership. ElasticSearch licenses this
|
||||
* file to you under the Apache License, Version 2.0 (the
|
||||
* "License"); you may not use this file except in compliance
|
||||
* with the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing,
|
||||
* software distributed under the License is distributed on an
|
||||
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
* KIND, either express or implied. See the License for the
|
||||
* specific language governing permissions and limitations
|
||||
* under the License.
|
||||
*/
|
||||
|
||||
package org.elasticsearch.index.analysis;
|
||||
|
||||
import java.util.HashSet;
|
||||
import java.util.Set;
|
||||
|
||||
/**
|
||||
* A class to match character code points.
|
||||
*/
|
||||
public interface CharMatcher {
|
||||
|
||||
public static class ByUnicodeCategory implements CharMatcher {
|
||||
|
||||
public static CharMatcher of(byte unicodeCategory) {
|
||||
return new ByUnicodeCategory(unicodeCategory);
|
||||
}
|
||||
|
||||
private final byte unicodeType;
|
||||
|
||||
ByUnicodeCategory(byte unicodeType) {
|
||||
this.unicodeType = unicodeType;
|
||||
}
|
||||
|
||||
@Override
|
||||
public boolean isTokenChar(int c) {
|
||||
return Character.getType(c) == unicodeType;
|
||||
}
|
||||
}
|
||||
|
||||
public enum Basic implements CharMatcher {
|
||||
LETTER {
|
||||
@Override
|
||||
public boolean isTokenChar(int c) {
|
||||
return Character.isLetter(c);
|
||||
}
|
||||
},
|
||||
DIGIT {
|
||||
@Override
|
||||
public boolean isTokenChar(int c) {
|
||||
return Character.isDigit(c);
|
||||
}
|
||||
},
|
||||
WHITESPACE {
|
||||
@Override
|
||||
public boolean isTokenChar(int c) {
|
||||
return Character.isWhitespace(c);
|
||||
}
|
||||
},
|
||||
PUNCTUATION {
|
||||
@Override
|
||||
public boolean isTokenChar(int c) {
|
||||
switch (Character.getType(c)) {
|
||||
case Character.START_PUNCTUATION:
|
||||
case Character.END_PUNCTUATION:
|
||||
case Character.OTHER_PUNCTUATION:
|
||||
case Character.CONNECTOR_PUNCTUATION:
|
||||
case Character.DASH_PUNCTUATION:
|
||||
case Character.INITIAL_QUOTE_PUNCTUATION:
|
||||
case Character.FINAL_QUOTE_PUNCTUATION:
|
||||
return true;
|
||||
default:
|
||||
return false;
|
||||
}
|
||||
}
|
||||
},
|
||||
SYMBOL {
|
||||
@Override
|
||||
public boolean isTokenChar(int c) {
|
||||
switch (Character.getType(c)) {
|
||||
case Character.CURRENCY_SYMBOL:
|
||||
case Character.MATH_SYMBOL:
|
||||
case Character.OTHER_SYMBOL:
|
||||
return true;
|
||||
default:
|
||||
return false;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
public final class Builder {
|
||||
private final Set<CharMatcher> matchers;
|
||||
Builder() {
|
||||
matchers = new HashSet<CharMatcher>();
|
||||
}
|
||||
public Builder or(CharMatcher matcher) {
|
||||
matchers.add(matcher);
|
||||
return this;
|
||||
}
|
||||
public CharMatcher build() {
|
||||
switch (matchers.size()) {
|
||||
case 0:
|
||||
return new CharMatcher() {
|
||||
@Override
|
||||
public boolean isTokenChar(int c) {
|
||||
return false;
|
||||
}
|
||||
};
|
||||
case 1:
|
||||
return matchers.iterator().next();
|
||||
default:
|
||||
return new CharMatcher() {
|
||||
@Override
|
||||
public boolean isTokenChar(int c) {
|
||||
for (CharMatcher matcher : matchers) {
|
||||
if (matcher.isTokenChar(c)) {
|
||||
return true;
|
||||
}
|
||||
}
|
||||
return false;
|
||||
}
|
||||
};
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/** Returns true if, and only if, the provided character matches this character class. */
|
||||
public boolean isTokenChar(int c);
|
||||
}
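A quick sketch (not part of the patch) of how these pieces compose: Builder.or(...) unions character classes into a single matcher. The Builder constructor is package-private, so this assumes code living in org.elasticsearch.index.analysis.

CharMatcher letterOrDigit = new CharMatcher.Builder()
        .or(CharMatcher.Basic.LETTER)
        .or(CharMatcher.Basic.DIGIT)
        .build();
letterOrDigit.isTokenChar('a');      // true
letterOrDigit.isTokenChar('7');      // true
letterOrDigit.isTokenChar('!');      // false
letterOrDigit.isTokenChar(0x10400);  // true: a supplementary (Deseret) letter is still a letter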
|
|
@ -20,9 +20,10 @@
|
|||
package org.elasticsearch.index.analysis;
|
||||
|
||||
import org.apache.lucene.analysis.TokenStream;
|
||||
import org.apache.lucene.analysis.ngram.EdgeNGramTokenFilter;
|
||||
import org.apache.lucene.analysis.ngram.EdgeNGramTokenizer;
|
||||
import org.apache.lucene.analysis.ngram.NGramTokenFilter;
|
||||
import org.apache.lucene.analysis.ngram.*;
|
||||
import org.apache.lucene.analysis.ngram.EdgeNGramTokenFilter.Side;
|
||||
import org.apache.lucene.analysis.reverse.ReverseStringFilter;
|
||||
import org.apache.lucene.util.Version;
|
||||
import org.elasticsearch.common.inject.Inject;
|
||||
import org.elasticsearch.common.inject.assistedinject.Assisted;
|
||||
import org.elasticsearch.common.settings.Settings;
|
||||
|
@ -51,6 +52,19 @@ public class EdgeNGramTokenFilterFactory extends AbstractTokenFilterFactory {
|
|||
|
||||
@Override
|
||||
public TokenStream create(TokenStream tokenStream) {
|
||||
return new EdgeNGramTokenFilter(tokenStream, side, minGram, maxGram);
|
||||
if (version.onOrAfter(Version.LUCENE_43)) {
|
||||
TokenStream result = tokenStream;
|
||||
// side=BACK is not supported anymore but applying ReverseStringFilter up-front and after the token filter has the same effect
|
||||
if (side == Side.BACK) {
|
||||
result = new ReverseStringFilter(version, result);
|
||||
}
|
||||
result = new XEdgeNGramTokenFilter(version, result, minGram, maxGram);
|
||||
if (side == Side.BACK) {
|
||||
result = new ReverseStringFilter(version, result);
|
||||
}
|
||||
return result;
|
||||
} else {
|
||||
return new EdgeNGramTokenFilter(tokenStream, side, minGram, maxGram);
|
||||
}
|
||||
}
|
||||
}
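The change above keeps side=back working on 4.3+ indices by wrapping the FRONT-only XEdgeNGramTokenFilter between two ReverseStringFilters. Spelled out on a single token (a sketch, not part of the patch; assumes a whitespace-tokenized input):

// back edge n-grams of "elastic", emulated as reverse -> front n-grams -> reverse
Tokenizer source = new WhitespaceTokenizer(Version.LUCENE_43, new StringReader("elastic"));
TokenStream chain = new ReverseStringFilter(Version.LUCENE_43, source);  // "citsale"
chain = new XEdgeNGramTokenFilter(Version.LUCENE_43, chain, 2, 3);       // "ci", "cit"
chain = new ReverseStringFilter(Version.LUCENE_43, chain);               // "ic", "tic", i.e. the last 2 and 3 chars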
|
|
@ -19,9 +19,13 @@
|
|||
|
||||
package org.elasticsearch.index.analysis;
|
||||
|
||||
import org.elasticsearch.ElasticSearchIllegalArgumentException;
|
||||
|
||||
import org.apache.lucene.analysis.Tokenizer;
|
||||
import org.apache.lucene.analysis.ngram.EdgeNGramTokenizer;
|
||||
import org.apache.lucene.analysis.ngram.NGramTokenizer;
|
||||
import org.apache.lucene.analysis.ngram.XEdgeNGramTokenizer;
|
||||
import org.apache.lucene.util.Version;
|
||||
import org.elasticsearch.common.inject.Inject;
|
||||
import org.elasticsearch.common.inject.assistedinject.Assisted;
|
||||
import org.elasticsearch.common.settings.Settings;
|
||||
|
@ -30,6 +34,8 @@ import org.elasticsearch.index.settings.IndexSettings;
|
|||
|
||||
import java.io.Reader;
|
||||
|
||||
import static org.elasticsearch.index.analysis.NGramTokenizerFactory.parseTokenChars;
|
||||
|
||||
/**
|
||||
*
|
||||
*/
|
||||
|
@ -41,16 +47,37 @@ public class EdgeNGramTokenizerFactory extends AbstractTokenizerFactory {
|
|||
|
||||
private final EdgeNGramTokenizer.Side side;
|
||||
|
||||
private final CharMatcher matcher;
|
||||
|
||||
@Inject
|
||||
public EdgeNGramTokenizerFactory(Index index, @IndexSettings Settings indexSettings, @Assisted String name, @Assisted Settings settings) {
|
||||
super(index, indexSettings, name, settings);
|
||||
this.minGram = settings.getAsInt("min_gram", NGramTokenizer.DEFAULT_MIN_NGRAM_SIZE);
|
||||
this.maxGram = settings.getAsInt("max_gram", NGramTokenizer.DEFAULT_MAX_NGRAM_SIZE);
|
||||
this.side = EdgeNGramTokenizer.Side.getSide(settings.get("side", EdgeNGramTokenizer.DEFAULT_SIDE.getLabel()));
|
||||
this.matcher = parseTokenChars(settings.getAsArray("token_chars"));
|
||||
}
|
||||
|
||||
@Override
|
||||
public Tokenizer create(Reader reader) {
|
||||
return new EdgeNGramTokenizer(reader, side, minGram, maxGram);
|
||||
if (version.onOrAfter(Version.LUCENE_43)) {
|
||||
if (side == EdgeNGramTokenizer.Side.BACK) {
|
||||
throw new ElasticSearchIllegalArgumentException("side=BACK is not supported anymore. Please fix your analysis chain or use"
|
||||
+ " an older compatibility version (<=4.2) but beware that it might cause highlighting bugs.");
|
||||
}
|
||||
// LUCENE MONITOR: this tokenizer is a copy from lucene trunk and should go away once we upgrade to lucene 4.4
|
||||
if (matcher == null) {
|
||||
return new XEdgeNGramTokenizer(version, reader, minGram, maxGram);
|
||||
} else {
|
||||
return new XEdgeNGramTokenizer(version, reader, minGram, maxGram) {
|
||||
@Override
|
||||
protected boolean isTokenChar(int chr) {
|
||||
return matcher.isTokenChar(chr);
|
||||
}
|
||||
};
|
||||
}
|
||||
} else {
|
||||
return new EdgeNGramTokenizer(reader, side, minGram, maxGram);
|
||||
}
|
||||
}
|
||||
}
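For the edge n-gram tokenizer there is no such workaround, so side=back now fails fast on 4.3+ indices. Roughly (a sketch, not part of the patch; it assumes the factory falls back to the current 4.3 analyzer version when the index settings specify none):

Settings settings = ImmutableSettings.builder()
        .put("min_gram", 2).put("max_gram", 3)
        .put("side", "back")
        .build();
new EdgeNGramTokenizerFactory(new Index("test"), ImmutableSettings.EMPTY, "edge", settings)
        .create(new StringReader("elastic"));
// -> ElasticSearchIllegalArgumentException: side=BACK is not supported anymore...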
|
|
@ -49,7 +49,7 @@ public class NGramTokenFilterFactory extends AbstractTokenFilterFactory {
|
|||
|
||||
@Override
|
||||
public TokenStream create(TokenStream tokenStream) {
|
||||
if (this.version.onOrAfter(Version.LUCENE_42)) {
|
||||
if (this.version.onOrAfter(Version.LUCENE_43)) {
|
||||
// LUCENE MONITOR: this token filter is a copy from lucene trunk and should go away once we upgrade to lucene 4.4
|
||||
return new XNGramTokenFilter(version, tokenStream, minGram, maxGram);
|
||||
}
|
||||
|
|
|
@ -19,10 +19,12 @@
|
|||
|
||||
package org.elasticsearch.index.analysis;
|
||||
|
||||
import com.google.common.collect.ImmutableMap;
|
||||
import org.apache.lucene.analysis.Tokenizer;
|
||||
import org.apache.lucene.analysis.ngram.NGramTokenizer;
|
||||
import org.apache.lucene.analysis.ngram.XNGramTokenizer;
|
||||
import org.apache.lucene.util.Version;
|
||||
import org.elasticsearch.ElasticSearchIllegalArgumentException;
|
||||
import org.elasticsearch.common.inject.Inject;
|
||||
import org.elasticsearch.common.inject.assistedinject.Assisted;
|
||||
import org.elasticsearch.common.settings.Settings;
|
||||
|
@ -30,6 +32,10 @@ import org.elasticsearch.index.Index;
|
|||
import org.elasticsearch.index.settings.IndexSettings;
|
||||
|
||||
import java.io.Reader;
|
||||
import java.lang.reflect.Field;
|
||||
import java.lang.reflect.Modifier;
|
||||
import java.util.Locale;
|
||||
import java.util.Map;
|
||||
|
||||
/**
|
||||
*
|
||||
|
@ -37,22 +43,75 @@ import java.io.Reader;
|
|||
public class NGramTokenizerFactory extends AbstractTokenizerFactory {
|
||||
|
||||
private final int minGram;
|
||||
|
||||
private final int maxGram;
|
||||
private final CharMatcher matcher;
|
||||
|
||||
static final Map<String, CharMatcher> MATCHERS;
|
||||
|
||||
static {
|
||||
ImmutableMap.Builder<String, CharMatcher> builder = ImmutableMap.builder();
|
||||
builder.put("letter", CharMatcher.Basic.LETTER);
|
||||
builder.put("digit", CharMatcher.Basic.DIGIT);
|
||||
builder.put("whitespace", CharMatcher.Basic.WHITESPACE);
|
||||
builder.put("punctuation", CharMatcher.Basic.PUNCTUATION);
|
||||
builder.put("symbol", CharMatcher.Basic.SYMBOL);
|
||||
// Populate with unicode categories from java.lang.Character
|
||||
for (Field field : Character.class.getFields()) {
|
||||
if (!field.getName().startsWith("DIRECTIONALITY")
|
||||
&& Modifier.isPublic(field.getModifiers())
|
||||
&& Modifier.isStatic(field.getModifiers())
|
||||
&& field.getType() == byte.class) {
|
||||
try {
|
||||
builder.put(field.getName().toLowerCase(Locale.ROOT), CharMatcher.ByUnicodeCategory.of(field.getByte(null)));
|
||||
} catch (Exception e) {
|
||||
// just ignore
|
||||
continue;
|
||||
}
|
||||
}
|
||||
}
|
||||
MATCHERS = builder.build();
|
||||
}
|
||||
|
||||
static CharMatcher parseTokenChars(String[] characterClasses) {
|
||||
if (characterClasses == null || characterClasses.length == 0) {
|
||||
return null;
|
||||
}
|
||||
CharMatcher.Builder builder = new CharMatcher.Builder();
|
||||
for (String characterClass : characterClasses) {
|
||||
characterClass = characterClass.toLowerCase(Locale.ROOT).trim();
|
||||
CharMatcher matcher = MATCHERS.get(characterClass);
|
||||
if (matcher == null) {
|
||||
throw new ElasticSearchIllegalArgumentException("Unknown token type: '" + characterClass + "', must be one of " + MATCHERS.keySet());
|
||||
}
|
||||
builder.or(matcher);
|
||||
}
|
||||
return builder.build();
|
||||
}
|
||||
|
||||
@Inject
|
||||
public NGramTokenizerFactory(Index index, @IndexSettings Settings indexSettings, @Assisted String name, @Assisted Settings settings) {
|
||||
super(index, indexSettings, name, settings);
|
||||
this.minGram = settings.getAsInt("min_gram", NGramTokenizer.DEFAULT_MIN_NGRAM_SIZE);
|
||||
this.maxGram = settings.getAsInt("max_gram", NGramTokenizer.DEFAULT_MAX_NGRAM_SIZE);
|
||||
this.matcher = parseTokenChars(settings.getAsArray("token_chars"));
|
||||
}
|
||||
|
||||
@Override
|
||||
public Tokenizer create(Reader reader) {
|
||||
if (this.version.onOrAfter(Version.LUCENE_42)) {
|
||||
// LUCENE MONITOR: this token filter is a copy from lucene trunk and should go away once we upgrade to lucene 4.4
|
||||
return new XNGramTokenizer(version, reader, minGram, maxGram);
|
||||
if (this.version.onOrAfter(Version.LUCENE_43)) {
|
||||
// LUCENE MONITOR: this tokenizer is a copy from lucene trunk and should go away once we upgrade to lucene 4.4
|
||||
if (matcher == null) {
|
||||
return new XNGramTokenizer(version, reader, minGram, maxGram);
|
||||
} else {
|
||||
return new XNGramTokenizer(version, reader, minGram, maxGram) {
|
||||
@Override
|
||||
protected boolean isTokenChar(int chr) {
|
||||
return matcher.isTokenChar(chr);
|
||||
}
|
||||
};
|
||||
}
|
||||
}
|
||||
return new NGramTokenizer(reader, minGram, maxGram);
|
||||
}
|
||||
|
||||
}
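parseTokenChars above accepts the five predefined classes plus any java.lang.Character category constant, matched by its lower-cased field name. A small sketch (not part of the patch; parseTokenChars is package-private, so this assumes same-package code, while the tests below exercise the same path through the "token_chars" setting):

CharMatcher letterOrDigit = NGramTokenizerFactory.parseTokenChars(new String[] { "letter", "digit" });
letterOrDigit.isTokenChar('é');  // true (letter)
letterOrDigit.isTokenChar('9');  // true (digit)
letterOrDigit.isTokenChar('-');  // false
// Unicode category names from java.lang.Character work as well:
CharMatcher dash = NGramTokenizerFactory.parseTokenChars(new String[] { "dash_punctuation" });
dash.isTokenChar('-');  // true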
|
|
@ -94,6 +94,7 @@ public class HighlighterSearchTests extends AbstractSharedClusterTest {
|
|||
.put("index.number_of_shards", 1)
|
||||
.put("analysis.tokenizer.autocomplete.max_gram", 20)
|
||||
.put("analysis.tokenizer.autocomplete.min_gram", 1)
|
||||
.put("analysis.tokenizer.autocomplete.token_chars", "letter,digit")
|
||||
.put("analysis.tokenizer.autocomplete.type", "nGram")
|
||||
.put("analysis.filter.wordDelimiter.type", "word_delimiter")
|
||||
.putArray("analysis.filter.wordDelimiter.type_table",
|
||||
|
@ -283,6 +284,7 @@ public class HighlighterSearchTests extends AbstractSharedClusterTest {
|
|||
.put("analysis.filter.my_ngram.type", "ngram")
|
||||
.put("analysis.tokenizer.my_ngramt.max_gram", 20)
|
||||
.put("analysis.tokenizer.my_ngramt.min_gram", 1)
|
||||
.put("analysis.tokenizer.my_ngramt.token_chars", "letter,digit")
|
||||
.put("analysis.tokenizer.my_ngramt.type", "ngram")
|
||||
.put("analysis.analyzer.name_index_analyzer.tokenizer", "my_ngramt")
|
||||
.put("analysis.analyzer.name2_index_analyzer.tokenizer", "whitespace")
|
||||
|
|
|
@ -0,0 +1,87 @@
|
|||
/*
|
||||
* Licensed to ElasticSearch and Shay Banon under one
|
||||
* or more contributor license agreements. See the NOTICE file
|
||||
* distributed with this work for additional information
|
||||
* regarding copyright ownership. ElasticSearch licenses this
|
||||
* file to you under the Apache License, Version 2.0 (the
|
||||
* "License"); you may not use this file except in compliance
|
||||
* with the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing,
|
||||
* software distributed under the License is distributed on an
|
||||
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
* KIND, either express or implied. See the License for the
|
||||
* specific language governing permissions and limitations
|
||||
* under the License.
|
||||
*/
|
||||
|
||||
package org.elasticsearch.test.unit.index.analysis;
|
||||
|
||||
import org.elasticsearch.ElasticSearchIllegalArgumentException;
|
||||
import org.elasticsearch.common.settings.ImmutableSettings;
|
||||
import org.elasticsearch.common.settings.Settings;
|
||||
import org.elasticsearch.index.Index;
|
||||
import org.elasticsearch.index.analysis.EdgeNGramTokenizerFactory;
|
||||
import org.elasticsearch.index.analysis.NGramTokenizerFactory;
|
||||
import org.testng.annotations.Test;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.io.StringReader;
|
||||
import java.util.Arrays;
|
||||
|
||||
import static org.testng.Assert.fail;
|
||||
|
||||
public class NGramTokenizerFactoryTests {
|
||||
|
||||
@Test
|
||||
public void testParseTokenChars() {
|
||||
final Index index = new Index("test");
|
||||
final String name = "ngr";
|
||||
final Settings indexSettings = ImmutableSettings.EMPTY;
|
||||
for (String tokenChars : Arrays.asList("letters", "number", "DIRECTIONALITY_UNDEFINED")) {
|
||||
final Settings settings = ImmutableSettings.builder().put("min_gram", 2).put("max_gram", 3).put("token_chars", tokenChars).build();
|
||||
try {
|
||||
new NGramTokenizerFactory(index, indexSettings, name, settings).create(new StringReader(""));
|
||||
fail();
|
||||
} catch (ElasticSearchIllegalArgumentException expected) {
|
||||
// OK
|
||||
}
|
||||
}
|
||||
for (String tokenChars : Arrays.asList("letter", " digit ", "punctuation", "DIGIT", "CoNtRoL", "dash_punctuation")) {
|
||||
final Settings settings = ImmutableSettings.builder().put("min_gram", 2).put("max_gram", 3).put("token_chars", tokenChars).build();
|
||||
new NGramTokenizerFactory(index, indexSettings, name, settings).create(new StringReader(""));
|
||||
// no exception
|
||||
}
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testPreTokenization() throws IOException {
|
||||
// Make sure that pretokenization works well and that it can be used even with token chars which are supplementary characters
|
||||
final Index index = new Index("test");
|
||||
final String name = "ngr";
|
||||
final Settings indexSettings = ImmutableSettings.EMPTY;
|
||||
Settings settings = ImmutableSettings.builder().put("min_gram", 2).put("max_gram", 3).put("token_chars", "letter,digit").build();
|
||||
AnalysisTestsHelper.assertSimpleTSOutput(new NGramTokenizerFactory(index, indexSettings, name, settings).create(new StringReader("Åbc déf g\uD801\uDC00f ")),
|
||||
new String[] {"Åb", "Åbc", "bc", "dé", "déf", "éf", "g\uD801\uDC00", "g\uD801\uDC00f", "\uD801\uDC00f"});
|
||||
settings = ImmutableSettings.builder().put("min_gram", 2).put("max_gram", 3).put("token_chars", "letter,digit,punctuation,whitespace,symbol").build();
|
||||
AnalysisTestsHelper.assertSimpleTSOutput(new NGramTokenizerFactory(index, indexSettings, name, settings).create(new StringReader(" a!$ 9")),
|
||||
new String[] {" a", " a!", "a!", "a!$", "!$", "!$ ", "$ ", "$ 9", " 9"});
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testPreTokenizationEdge() throws IOException {
|
||||
// Make sure that pretokenization works well and that it can be used even with token chars which are supplementary characters
|
||||
final Index index = new Index("test");
|
||||
final String name = "ngr";
|
||||
final Settings indexSettings = ImmutableSettings.EMPTY;
|
||||
Settings settings = ImmutableSettings.builder().put("min_gram", 2).put("max_gram", 3).put("token_chars", "letter,digit").build();
|
||||
AnalysisTestsHelper.assertSimpleTSOutput(new EdgeNGramTokenizerFactory(index, indexSettings, name, settings).create(new StringReader("Åbc déf g\uD801\uDC00f ")),
|
||||
new String[] {"Åb", "Åbc", "dé", "déf", "g\uD801\uDC00", "g\uD801\uDC00f"});
|
||||
settings = ImmutableSettings.builder().put("min_gram", 2).put("max_gram", 3).put("token_chars", "letter,digit,punctuation,whitespace,symbol").build();
|
||||
AnalysisTestsHelper.assertSimpleTSOutput(new EdgeNGramTokenizerFactory(index, indexSettings, name, settings).create(new StringReader(" a!$ 9")),
|
||||
new String[] {" a", " a!"});
|
||||
}
|
||||
|
||||
}
|