From fccbe9c185889a981c9172d848e373b40826d10b Mon Sep 17 00:00:00 2001 From: Adrien Grand Date: Thu, 13 Jun 2013 23:28:06 +0200 Subject: [PATCH] Import the new n-gram tokenizers and filters from Lucene. Lucene 4.4 will feature new n-gram tokenizers and filters that should not generate broken offsets (that cause highlighting bugs) anymore. They also correctly handle supplementary characters and the tokenizers can work in a streaming fashion (they are not limited to the first 1024 chars of the stream anymore). --- .../analysis/ngram/XEdgeNGramTokenFilter.java | 214 ++++++++++ .../analysis/ngram/XEdgeNGramTokenizer.java | 77 ++++ .../ngram/XLucene43EdgeNGramTokenizer.java | 281 +++++++++++++ .../ngram/XLucene43NGramTokenizer.java | 164 ++++++++ .../analysis/ngram/XNGramTokenFilter.java | 81 ++-- .../analysis/ngram/XNGramTokenizer.java | 186 ++++++--- .../lucene/analysis/util/XCharacterUtils.java | 394 ++++++++++++++++++ .../index/analysis/CharMatcher.java | 137 ++++++ .../analysis/EdgeNGramTokenFilterFactory.java | 22 +- .../analysis/EdgeNGramTokenizerFactory.java | 29 +- .../analysis/NGramTokenFilterFactory.java | 2 +- .../index/analysis/NGramTokenizerFactory.java | 67 ++- .../highlight/HighlighterSearchTests.java | 2 + .../analysis/NGramTokenizerFactoryTests.java | 87 ++++ 14 files changed, 1619 insertions(+), 124 deletions(-) create mode 100644 src/main/java/org/apache/lucene/analysis/ngram/XEdgeNGramTokenFilter.java create mode 100644 src/main/java/org/apache/lucene/analysis/ngram/XEdgeNGramTokenizer.java create mode 100644 src/main/java/org/apache/lucene/analysis/ngram/XLucene43EdgeNGramTokenizer.java create mode 100644 src/main/java/org/apache/lucene/analysis/ngram/XLucene43NGramTokenizer.java create mode 100644 src/main/java/org/apache/lucene/analysis/util/XCharacterUtils.java create mode 100644 src/main/java/org/elasticsearch/index/analysis/CharMatcher.java create mode 100644 src/test/java/org/elasticsearch/test/unit/index/analysis/NGramTokenizerFactoryTests.java diff --git a/src/main/java/org/apache/lucene/analysis/ngram/XEdgeNGramTokenFilter.java b/src/main/java/org/apache/lucene/analysis/ngram/XEdgeNGramTokenFilter.java new file mode 100644 index 00000000000..0e327253b05 --- /dev/null +++ b/src/main/java/org/apache/lucene/analysis/ngram/XEdgeNGramTokenFilter.java @@ -0,0 +1,214 @@ +package org.apache.lucene.analysis.ngram; + +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +import org.elasticsearch.common.lucene.Lucene; + +import org.apache.lucene.analysis.TokenFilter; +import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.reverse.ReverseStringFilter; +import org.apache.lucene.analysis.tokenattributes.OffsetAttribute; +import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; +import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute; +import org.apache.lucene.analysis.tokenattributes.PositionLengthAttribute; +import org.apache.lucene.analysis.util.XCharacterUtils; +import org.apache.lucene.util.Version; + +import java.io.IOException; + +/** + * Tokenizes the given token into n-grams of given size(s). + *
<p>
+ * This {@link TokenFilter} creates n-grams from the beginning edge or ending edge of an input token. + *
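For example, with minGram=1 and maxGram=3, the token "elastic" produces "e", "el" and "ela", all with the position and offsets of the original token. + *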
<p>
As of Lucene 4.4, this filter does not support + * {@link Side#BACK} (you can use {@link ReverseStringFilter} up-front and + * afterward to get the same behavior), handles supplementary characters + * correctly and does not update offsets anymore. + */ +public final class XEdgeNGramTokenFilter extends TokenFilter { + + static { + // LUCENE MONITOR: this should be in Lucene 4.4 copied from Revision: 1492640. + assert Lucene.VERSION == Version.LUCENE_43 : "Elasticsearch has upgraded to Lucene Version: [" + Lucene.VERSION + "] this class should be removed"; + } + + public static final Side DEFAULT_SIDE = Side.FRONT; + public static final int DEFAULT_MAX_GRAM_SIZE = 1; + public static final int DEFAULT_MIN_GRAM_SIZE = 1; + + /** Specifies which side of the input the n-gram should be generated from */ + public static enum Side { + + /** Get the n-gram from the front of the input */ + FRONT { + @Override + public String getLabel() { return "front"; } + }, + + /** Get the n-gram from the end of the input */ + @Deprecated + BACK { + @Override + public String getLabel() { return "back"; } + }; + + public abstract String getLabel(); + + // Get the appropriate Side from a string + public static Side getSide(String sideName) { + if (FRONT.getLabel().equals(sideName)) { + return FRONT; + } + if (BACK.getLabel().equals(sideName)) { + return BACK; + } + return null; + } + } + + private final XCharacterUtils charUtils; + private final int minGram; + private final int maxGram; + private Side side; + private char[] curTermBuffer; + private int curTermLength; + private int curCodePointCount; + private int curGramSize; + private int tokStart; + private int tokEnd; + private int savePosIncr; + private int savePosLen; + + private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class); + private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class); + private final PositionIncrementAttribute posIncrAtt = addAttribute(PositionIncrementAttribute.class); + private final PositionLengthAttribute posLenAtt = addAttribute(PositionLengthAttribute.class); + + /** + * Creates XEdgeNGramTokenFilter that can generate n-grams in the sizes of the given range + * + * @param version the Lucene match version + * @param input {@link TokenStream} holding the input to be tokenized + * @param side the {@link Side} from which to chop off an n-gram + * @param minGram the smallest n-gram to generate + * @param maxGram the largest n-gram to generate + */ + @Deprecated + public XEdgeNGramTokenFilter(Version version, TokenStream input, Side side, int minGram, int maxGram) { + super(input); + + if (version == null) { + throw new IllegalArgumentException("version must not be null"); + } + + if (side == Side.BACK) { + throw new IllegalArgumentException("Side.BACK is not supported anymore as of Lucene 4.4, use ReverseStringFilter up-front and afterward"); + } + + if (side == null) { + throw new IllegalArgumentException("sideLabel must be either front or back"); + } + + if (minGram < 1) { + throw new IllegalArgumentException("minGram must be greater than zero"); + } + + if (minGram > maxGram) { + throw new IllegalArgumentException("minGram must not be greater than maxGram"); + } + + this.charUtils = XCharacterUtils.getInstance(version); + this.minGram = minGram; + this.maxGram = maxGram; + this.side = side; + } + + /** + * Creates XEdgeNGramTokenFilter that can generate n-grams in the sizes of the given range + * + * @param version the Lucene match version + * @param input {@link TokenStream} holding the input 
to be tokenized + * @param sideLabel the name of the {@link Side} from which to chop off an n-gram + * @param minGram the smallest n-gram to generate + * @param maxGram the largest n-gram to generate + */ + @Deprecated + public XEdgeNGramTokenFilter(Version version, TokenStream input, String sideLabel, int minGram, int maxGram) { + this(version, input, Side.getSide(sideLabel), minGram, maxGram); + } + + /** + * Creates XEdgeNGramTokenFilter that can generate n-grams in the sizes of the given range + * + * @param version the Lucene match version + * @param input {@link TokenStream} holding the input to be tokenized + * @param minGram the smallest n-gram to generate + * @param maxGram the largest n-gram to generate + */ + public XEdgeNGramTokenFilter(Version version, TokenStream input, int minGram, int maxGram) { + this(version, input, Side.FRONT, minGram, maxGram); + } + + @Override + public final boolean incrementToken() throws IOException { + while (true) { + if (curTermBuffer == null) { + if (!input.incrementToken()) { + return false; + } else { + curTermBuffer = termAtt.buffer().clone(); + curTermLength = termAtt.length(); + curCodePointCount = charUtils.codePointCount(termAtt); + curGramSize = minGram; + tokStart = offsetAtt.startOffset(); + tokEnd = offsetAtt.endOffset(); + savePosIncr += posIncrAtt.getPositionIncrement(); + savePosLen = posLenAtt.getPositionLength(); + } + } + if (curGramSize <= maxGram) { // if we have hit the end of our n-gram size range, quit + if (curGramSize <= curCodePointCount) { // if the remaining input is too short, we can't generate any n-grams + // grab gramSize chars from front or back + final int start = side == Side.FRONT ? 0 : charUtils.offsetByCodePoints(curTermBuffer, 0, curTermLength, curTermLength, -curGramSize); + final int end = charUtils.offsetByCodePoints(curTermBuffer, 0, curTermLength, start, curGramSize); + clearAttributes(); + offsetAtt.setOffset(tokStart, tokEnd); + // first ngram gets increment, others don't + if (curGramSize == minGram) { + posIncrAtt.setPositionIncrement(savePosIncr); + savePosIncr = 0; + } else { + posIncrAtt.setPositionIncrement(0); + } + posLenAtt.setPositionLength(savePosLen); + termAtt.copyBuffer(curTermBuffer, start, end - start); + curGramSize++; + return true; + } + } + curTermBuffer = null; + } + } + + @Override + public void reset() throws IOException { + super.reset(); + curTermBuffer = null; + savePosIncr = 0; + } +} diff --git a/src/main/java/org/apache/lucene/analysis/ngram/XEdgeNGramTokenizer.java b/src/main/java/org/apache/lucene/analysis/ngram/XEdgeNGramTokenizer.java new file mode 100644 index 00000000000..5639d29d464 --- /dev/null +++ b/src/main/java/org/apache/lucene/analysis/ngram/XEdgeNGramTokenizer.java @@ -0,0 +1,77 @@ +package org.apache.lucene.analysis.ngram; + +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import org.elasticsearch.common.lucene.Lucene; + +import java.io.Reader; + +import org.apache.lucene.analysis.Tokenizer; +import org.apache.lucene.util.Version; + +/** + * Tokenizes the input from an edge into n-grams of given size(s). + *
<p>
+ * This {@link Tokenizer} creates n-grams from the beginning edge or ending edge of an input token. + *
<p>
As of Lucene 4.4, this tokenizer correctly handles supplementary characters,
+ * works in a streaming fashion (it is no longer limited to the first 1024
+ * chars of the input) and does not support backward n-grams anymore.
+ * <p>
Although highly discouraged, it is still possible + * to use the old behavior through {@link XLucene43EdgeNGramTokenizer}. + */ +public class XEdgeNGramTokenizer extends XNGramTokenizer { + + static { + // LUCENE MONITOR: this should be in Lucene 4.4 copied from Revision: 1492640. + assert Lucene.VERSION == Version.LUCENE_43 : "Elasticsearch has upgraded to Lucene Version: [" + Lucene.VERSION + "] this class should be removed"; + } + + public static final int DEFAULT_MAX_GRAM_SIZE = 1; + public static final int DEFAULT_MIN_GRAM_SIZE = 1; + + /** + * Creates XEdgeNGramTokenizer that can generate n-grams in the sizes of the given range + * + * @param version the Lucene match version + * @param input {@link Reader} holding the input to be tokenized + * @param minGram the smallest n-gram to generate + * @param maxGram the largest n-gram to generate + */ + public XEdgeNGramTokenizer(Version version, Reader input, int minGram, int maxGram) { + super(version, input, minGram, maxGram, true); + } + + /** + * Creates XEdgeNGramTokenizer that can generate n-grams in the sizes of the given range + * + * @param version the Lucene match version + * @param factory {@link org.apache.lucene.util.AttributeSource.AttributeFactory} to use + * @param input {@link Reader} holding the input to be tokenized + * @param minGram the smallest n-gram to generate + * @param maxGram the largest n-gram to generate + */ + public XEdgeNGramTokenizer(Version version, AttributeFactory factory, Reader input, int minGram, int maxGram) { + super(version, factory, input, minGram, maxGram, true); + } + +} diff --git a/src/main/java/org/apache/lucene/analysis/ngram/XLucene43EdgeNGramTokenizer.java b/src/main/java/org/apache/lucene/analysis/ngram/XLucene43EdgeNGramTokenizer.java new file mode 100644 index 00000000000..86138bef711 --- /dev/null +++ b/src/main/java/org/apache/lucene/analysis/ngram/XLucene43EdgeNGramTokenizer.java @@ -0,0 +1,281 @@ +package org.apache.lucene.analysis.ngram; + +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import org.elasticsearch.common.lucene.Lucene; + +import java.io.IOException; +import java.io.Reader; + +import org.apache.lucene.analysis.Tokenizer; +import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; +import org.apache.lucene.analysis.tokenattributes.OffsetAttribute; +import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute; +import org.apache.lucene.util.ArrayUtil; +import org.apache.lucene.util.Version; + +/** + * Old version of {@link EdgeNGramTokenizer} which doesn't correctly handle + * supplementary characters. + */ +@Deprecated +public final class XLucene43EdgeNGramTokenizer extends Tokenizer { + + static { + // LUCENE MONITOR: this should be in Lucene 4.4 copied from Revision: 1492640.
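+ // The assert below fails as soon as Elasticsearch upgrades past Lucene 4.3, as a reminder to delete this backported class in favor of the upstream one.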
+ assert Lucene.VERSION == Version.LUCENE_43 : "Elasticsearch has upgraded to Lucene Version: [" + Lucene.VERSION + "] this class should be removed"; + } + + public static final Side DEFAULT_SIDE = Side.FRONT; + public static final int DEFAULT_MAX_GRAM_SIZE = 1; + public static final int DEFAULT_MIN_GRAM_SIZE = 1; + + private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class); + private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class); + private final PositionIncrementAttribute posIncrAtt = addAttribute(PositionIncrementAttribute.class); + + /** Specifies which side of the input the n-gram should be generated from */ + public static enum Side { + + /** Get the n-gram from the front of the input */ + FRONT { + @Override + public String getLabel() { return "front"; } + }, + + /** Get the n-gram from the end of the input */ + BACK { + @Override + public String getLabel() { return "back"; } + }; + + public abstract String getLabel(); + + // Get the appropriate Side from a string + public static Side getSide(String sideName) { + if (FRONT.getLabel().equals(sideName)) { + return FRONT; + } + if (BACK.getLabel().equals(sideName)) { + return BACK; + } + return null; + } + } + + private int minGram; + private int maxGram; + private int gramSize; + private Side side; + private boolean started; + private int inLen; // length of the input AFTER trim() + private int charsRead; // length of the input + private String inStr; + + + /** + * Creates EdgeNGramTokenizer that can generate n-grams in the sizes of the given range + * + * @param version the Lucene match version + * @param input {@link Reader} holding the input to be tokenized + * @param side the {@link Side} from which to chop off an n-gram + * @param minGram the smallest n-gram to generate + * @param maxGram the largest n-gram to generate + */ + @Deprecated + public XLucene43EdgeNGramTokenizer(Version version, Reader input, Side side, int minGram, int maxGram) { + super(input); + init(version, side, minGram, maxGram); + } + + /** + * Creates EdgeNGramTokenizer that can generate n-grams in the sizes of the given range + * + * @param version the Lucene match version + * @param factory {@link org.apache.lucene.util.AttributeSource.AttributeFactory} to use + * @param input {@link Reader} holding the input to be tokenized + * @param side the {@link Side} from which to chop off an n-gram + * @param minGram the smallest n-gram to generate + * @param maxGram the largest n-gram to generate + */ + @Deprecated + public XLucene43EdgeNGramTokenizer(Version version, AttributeFactory factory, Reader input, Side side, int minGram, int maxGram) { + super(factory, input); + init(version, side, minGram, maxGram); + } + + /** + * Creates EdgeNGramTokenizer that can generate n-grams in the sizes of the given range + * + * @param version the Lucene match version + * @param input {@link Reader} holding the input to be tokenized + * @param sideLabel the name of the {@link Side} from which to chop off an n-gram + * @param minGram the smallest n-gram to generate + * @param maxGram the largest n-gram to generate + */ + @Deprecated + public XLucene43EdgeNGramTokenizer(Version version, Reader input, String sideLabel, int minGram, int maxGram) { + this(version, input, Side.getSide(sideLabel), minGram, maxGram); + } + + /** + * Creates EdgeNGramTokenizer that can generate n-grams in the sizes of the given range + * + * @param version the Lucene match version + * @param factory {@link 
org.apache.lucene.util.AttributeSource.AttributeFactory} to use + * @param input {@link Reader} holding the input to be tokenized + * @param sideLabel the name of the {@link Side} from which to chop off an n-gram + * @param minGram the smallest n-gram to generate + * @param maxGram the largest n-gram to generate + */ + @Deprecated + public XLucene43EdgeNGramTokenizer(Version version, AttributeFactory factory, Reader input, String sideLabel, int minGram, int maxGram) { + this(version, factory, input, Side.getSide(sideLabel), minGram, maxGram); + } + + /** + * Creates EdgeNGramTokenizer that can generate n-grams in the sizes of the given range + * + * @param version the Lucene match version + * @param input {@link Reader} holding the input to be tokenized + * @param minGram the smallest n-gram to generate + * @param maxGram the largest n-gram to generate + */ + public XLucene43EdgeNGramTokenizer(Version version, Reader input, int minGram, int maxGram) { + this(version, input, Side.FRONT, minGram, maxGram); + } + + /** + * Creates EdgeNGramTokenizer that can generate n-grams in the sizes of the given range + * + * @param version the Lucene match version + * @param factory {@link org.apache.lucene.util.AttributeSource.AttributeFactory} to use + * @param input {@link Reader} holding the input to be tokenized + * @param minGram the smallest n-gram to generate + * @param maxGram the largest n-gram to generate + */ + public XLucene43EdgeNGramTokenizer(Version version, AttributeFactory factory, Reader input, int minGram, int maxGram) { + this(version, factory, input, Side.FRONT, minGram, maxGram); + } + + private void init(Version version, Side side, int minGram, int maxGram) { + if (version == null) { + throw new IllegalArgumentException("version must not be null"); + } + + if (side == null) { + throw new IllegalArgumentException("sideLabel must be either front or back"); + } + + if (minGram < 1) { + throw new IllegalArgumentException("minGram must be greater than zero"); + } + + if (minGram > maxGram) { + throw new IllegalArgumentException("minGram must not be greater than maxGram"); + } + + maxGram = Math.min(maxGram, 1024); + + this.minGram = minGram; + this.maxGram = maxGram; + this.side = side; + } + + /** Returns the next token in the stream, or null at EOS. */ + @Override + public boolean incrementToken() throws IOException { + clearAttributes(); + // if we are just starting, read the whole input + if (!started) { + started = true; + gramSize = minGram; + final int limit = side == Side.FRONT ? 
maxGram : 1024; + char[] chars = new char[Math.min(1024, limit)]; + charsRead = 0; + // TODO: refactor to a shared readFully somewhere: + boolean exhausted = false; + while (charsRead < limit) { + final int inc = input.read(chars, charsRead, chars.length-charsRead); + if (inc == -1) { + exhausted = true; + break; + } + charsRead += inc; + if (charsRead == chars.length && charsRead < limit) { + chars = ArrayUtil.grow(chars); + } + } + + inStr = new String(chars, 0, charsRead); + inStr = inStr.trim(); + + if (!exhausted) { + // Read extra throwaway chars so that on end() we + // report the correct offset: + char[] throwaway = new char[1024]; + while(true) { + final int inc = input.read(throwaway, 0, throwaway.length); + if (inc == -1) { + break; + } + charsRead += inc; + } + } + + inLen = inStr.length(); + if (inLen == 0) { + return false; + } + posIncrAtt.setPositionIncrement(1); + } else { + posIncrAtt.setPositionIncrement(0); + } + + // if the remaining input is too short, we can't generate any n-grams + if (gramSize > inLen) { + return false; + } + + // if we have hit the end of our n-gram size range, quit + if (gramSize > maxGram || gramSize > inLen) { + return false; + } + + // grab gramSize chars from front or back + int start = side == Side.FRONT ? 0 : inLen - gramSize; + int end = start + gramSize; + termAtt.setEmpty().append(inStr, start, end); + offsetAtt.setOffset(correctOffset(start), correctOffset(end)); + gramSize++; + return true; + } + + @Override + public void end() { + // set final offset + final int finalOffset = correctOffset(charsRead); + this.offsetAtt.setOffset(finalOffset, finalOffset); + } + + @Override + public void reset() throws IOException { + super.reset(); + started = false; + } +} diff --git a/src/main/java/org/apache/lucene/analysis/ngram/XLucene43NGramTokenizer.java b/src/main/java/org/apache/lucene/analysis/ngram/XLucene43NGramTokenizer.java new file mode 100644 index 00000000000..fbc680000a4 --- /dev/null +++ b/src/main/java/org/apache/lucene/analysis/ngram/XLucene43NGramTokenizer.java @@ -0,0 +1,164 @@ +package org.apache.lucene.analysis.ngram; + +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import org.apache.lucene.util.Version; +import org.elasticsearch.common.lucene.Lucene; + +import java.io.IOException; +import java.io.Reader; + +import org.apache.lucene.analysis.Tokenizer; +import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; +import org.apache.lucene.analysis.tokenattributes.OffsetAttribute; + +/** + * Old broken version of {@link NGramTokenizer}. + */ +@Deprecated +public final class XLucene43NGramTokenizer extends Tokenizer { + + static { + // LUCENE MONITOR: this should be in Lucene 4.4 copied from Revision: 1492640. 
+ assert Lucene.VERSION == Version.LUCENE_43 : "Elasticsearch has upgraded to Lucene Version: [" + Lucene.VERSION + "] this class should be removed"; + } + + public static final int DEFAULT_MIN_NGRAM_SIZE = 1; + public static final int DEFAULT_MAX_NGRAM_SIZE = 2; + + private int minGram, maxGram; + private int gramSize; + private int pos; + private int inLen; // length of the input AFTER trim() + private int charsRead; // length of the input + private String inStr; + private boolean started; + + private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class); + private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class); + + /** + * Creates NGramTokenizer with given min and max n-grams. + * @param input {@link Reader} holding the input to be tokenized + * @param minGram the smallest n-gram to generate + * @param maxGram the largest n-gram to generate + */ + public XLucene43NGramTokenizer(Reader input, int minGram, int maxGram) { + super(input); + init(minGram, maxGram); + } + + /** + * Creates NGramTokenizer with given min and max n-grams. + * @param factory {@link org.apache.lucene.util.AttributeSource.AttributeFactory} to use + * @param input {@link Reader} holding the input to be tokenized + * @param minGram the smallest n-gram to generate + * @param maxGram the largest n-gram to generate + */ + public XLucene43NGramTokenizer(AttributeFactory factory, Reader input, int minGram, int maxGram) { + super(factory, input); + init(minGram, maxGram); + } + + /** + * Creates NGramTokenizer with default min and max n-grams. + * @param input {@link Reader} holding the input to be tokenized + */ + public XLucene43NGramTokenizer(Reader input) { + this(input, DEFAULT_MIN_NGRAM_SIZE, DEFAULT_MAX_NGRAM_SIZE); + } + + private void init(int minGram, int maxGram) { + if (minGram < 1) { + throw new IllegalArgumentException("minGram must be greater than zero"); + } + if (minGram > maxGram) { + throw new IllegalArgumentException("minGram must not be greater than maxGram"); + } + this.minGram = minGram; + this.maxGram = maxGram; + } + + /** Returns the next token in the stream, or null at EOS. 
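+ * (With the attribute-based API this means {@link #incrementToken()} returns {@code false} at the end of the stream, rather than a null token.)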
*/ + @Override + public boolean incrementToken() throws IOException { + clearAttributes(); + if (!started) { + started = true; + gramSize = minGram; + char[] chars = new char[1024]; + charsRead = 0; + // TODO: refactor to a shared readFully somewhere: + while (charsRead < chars.length) { + int inc = input.read(chars, charsRead, chars.length-charsRead); + if (inc == -1) { + break; + } + charsRead += inc; + } + inStr = new String(chars, 0, charsRead).trim(); // remove any trailing empty strings + + if (charsRead == chars.length) { + // Read extra throwaway chars so that on end() we + // report the correct offset: + char[] throwaway = new char[1024]; + while(true) { + final int inc = input.read(throwaway, 0, throwaway.length); + if (inc == -1) { + break; + } + charsRead += inc; + } + } + + inLen = inStr.length(); + if (inLen == 0) { + return false; + } + } + + if (pos+gramSize > inLen) { // if we hit the end of the string + pos = 0; // reset to beginning of string + gramSize++; // increase n-gram size + if (gramSize > maxGram) // we are done + return false; + if (pos+gramSize > inLen) + return false; + } + + int oldPos = pos; + pos++; + termAtt.setEmpty().append(inStr, oldPos, oldPos+gramSize); + offsetAtt.setOffset(correctOffset(oldPos), correctOffset(oldPos+gramSize)); + return true; + } + + @Override + public void end() { + // set final offset + final int finalOffset = correctOffset(charsRead); + this.offsetAtt.setOffset(finalOffset, finalOffset); + } + + @Override + public void reset() throws IOException { + super.reset(); + started = false; + pos = 0; + } +} diff --git a/src/main/java/org/apache/lucene/analysis/ngram/XNGramTokenFilter.java b/src/main/java/org/apache/lucene/analysis/ngram/XNGramTokenFilter.java index 81888c96ba1..9c9701093b5 100644 --- a/src/main/java/org/apache/lucene/analysis/ngram/XNGramTokenFilter.java +++ b/src/main/java/org/apache/lucene/analysis/ngram/XNGramTokenFilter.java @@ -17,6 +17,8 @@ package org.apache.lucene.analysis.ngram; * limitations under the License. */ +import org.elasticsearch.common.lucene.Lucene; + import java.io.IOException; import org.apache.lucene.analysis.TokenFilter; @@ -26,14 +28,15 @@ import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; import org.apache.lucene.analysis.tokenattributes.OffsetAttribute; import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute; import org.apache.lucene.analysis.tokenattributes.PositionLengthAttribute; +import org.apache.lucene.analysis.util.XCharacterUtils; import org.apache.lucene.util.Version; -import org.elasticsearch.common.lucene.Lucene; /** * Tokenizes the input into n-grams of the given size(s). * *
<p>
You must specify the required {@link Version} compatibility when - * creating a {@link NGramTokenFilter}. As of Lucene 4.4, this token filters:
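For reference, a minimal sketch of consuming the backported tokenizer directly. It assumes that XNGramTokenizer keeps the upstream public (Version, Reader, minGram, maxGram) constructor (its hunk is truncated above) and that the backport accepts the Lucene 4.3 match version Elasticsearch is currently on; the example class name is hypothetical.

import java.io.StringReader;

import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.ngram.XNGramTokenizer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.util.Version;

public class XNGramTokenizerExample {
    public static void main(String[] args) throws Exception {
        // Stream 2- and 3-grams of "abcde". Offsets point into the original
        // input, which is what avoids the broken-offset highlighting bugs
        // described in the commit message.
        Tokenizer tokenizer = new XNGramTokenizer(Version.LUCENE_43, new StringReader("abcde"), 2, 3);
        CharTermAttribute term = tokenizer.addAttribute(CharTermAttribute.class);
        OffsetAttribute offset = tokenizer.addAttribute(OffsetAttribute.class);
        tokenizer.reset();
        while (tokenizer.incrementToken()) {
            // Expected output starts: ab [0,2], abc [0,3], bc [1,3], ...
            System.out.println(term + " [" + offset.startOffset() + "," + offset.endOffset() + "]");
        }
        tokenizer.end();
        tokenizer.close();
    }
}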