diff --git a/contrib/CHANGES.txt b/contrib/CHANGES.txt index 9f280e60dc6..f878aa8680a 100644 --- a/contrib/CHANGES.txt +++ b/contrib/CHANGES.txt @@ -99,6 +99,10 @@ New features * LUCENE-2243: Add DisjunctionMaxQuery support for FastVectorHighlighter. (Koji Sekiguchi) + * LUCENE-2218: ShingleFilter supports minimum shingle size, and the separator + character is now configurable. Its also up to 20% faster. + (Steven Rowe via Robert Muir) + Build * LUCENE-2124: Moved the JDK-based collation support from contrib/collation diff --git a/contrib/analyzers/common/src/java/org/apache/lucene/analysis/shingle/ShingleAnalyzerWrapper.java b/contrib/analyzers/common/src/java/org/apache/lucene/analysis/shingle/ShingleAnalyzerWrapper.java index 2aa6cd4ea2a..8349eeb98c4 100644 --- a/contrib/analyzers/common/src/java/org/apache/lucene/analysis/shingle/ShingleAnalyzerWrapper.java +++ b/contrib/analyzers/common/src/java/org/apache/lucene/analysis/shingle/ShingleAnalyzerWrapper.java @@ -34,7 +34,9 @@ import org.apache.lucene.util.Version; public final class ShingleAnalyzerWrapper extends Analyzer { private final Analyzer defaultAnalyzer; - private int maxShingleSize = 2; + private int maxShingleSize = ShingleFilter.DEFAULT_MAX_SHINGLE_SIZE; + private int minShingleSize = ShingleFilter.DEFAULT_MIN_SHINGLE_SIZE; + private String tokenSeparator = ShingleFilter.TOKEN_SEPARATOR; private boolean outputUnigrams = true; public ShingleAnalyzerWrapper(Analyzer defaultAnalyzer) { @@ -44,7 +46,13 @@ public final class ShingleAnalyzerWrapper extends Analyzer { public ShingleAnalyzerWrapper(Analyzer defaultAnalyzer, int maxShingleSize) { this(defaultAnalyzer); - this.maxShingleSize = maxShingleSize; + setMaxShingleSize(maxShingleSize); + } + + public ShingleAnalyzerWrapper(Analyzer defaultAnalyzer, int minShingleSize, int maxShingleSize) { + this(defaultAnalyzer); + setMaxShingleSize(maxShingleSize); + setMinShingleSize(minShingleSize); } /** @@ -58,29 +66,73 @@ public final class 
ShingleAnalyzerWrapper extends Analyzer { /** * Wraps {@link StandardAnalyzer}. */ - public ShingleAnalyzerWrapper(Version matchVersion, int nGramSize) { + public ShingleAnalyzerWrapper(Version matchVersion, int minShingleSize, int maxShingleSize) { this(matchVersion); - this.maxShingleSize = nGramSize; + setMaxShingleSize(maxShingleSize); + setMinShingleSize(minShingleSize); } /** - * The max shingle (ngram) size + * The max shingle (token ngram) size * - * @return The max shingle (ngram) size + * @return The max shingle (token ngram) size */ public int getMaxShingleSize() { return maxShingleSize; } /** - * Set the maximum size of output shingles - * + * Set the maximum size of output shingles (default: 2) + * * @param maxShingleSize max shingle size */ public void setMaxShingleSize(int maxShingleSize) { + if (maxShingleSize < 2) { + throw new IllegalArgumentException("Max shingle size must be >= 2"); + } this.maxShingleSize = maxShingleSize; } + /** + * The min shingle (token ngram) size + * + * @return The min shingle (token ngram) size + */ + public int getMinShingleSize() { + return minShingleSize; + } + + /** + *
Set the min shingle size (default: 2). + *
This method requires that the passed in minShingleSize is not greater + * than maxShingleSize, so make sure that maxShingleSize is set before + * calling this method. + * + * @param minShingleSize min size of output shingles + */ + public void setMinShingleSize(int minShingleSize) { + if (minShingleSize < 2) { + throw new IllegalArgumentException("Min shingle size must be >= 2"); + } + if (minShingleSize > maxShingleSize) { + throw new IllegalArgumentException + ("Min shingle size must be <= max shingle size"); + } + this.minShingleSize = minShingleSize; + } + + public String getTokenSeparator() { + return tokenSeparator; + } + + /** + * Sets the string to use when joining adjacent tokens to form a shingle + * @param tokenSeparator used to separate input stream tokens in output shingles + */ + public void setTokenSeparator(String tokenSeparator) { + this.tokenSeparator = (tokenSeparator == null ? "" : tokenSeparator); + } + public boolean isOutputUnigrams() { return outputUnigrams; } @@ -104,8 +156,10 @@ public final class ShingleAnalyzerWrapper extends Analyzer { } catch (IOException e) { wrapped = defaultAnalyzer.tokenStream(fieldName, reader); } - ShingleFilter filter = new ShingleFilter(wrapped); + ShingleFilter filter = new ShingleFilter(wrapped, minShingleSize, maxShingleSize); + filter.setMinShingleSize(minShingleSize); filter.setMaxShingleSize(maxShingleSize); + filter.setTokenSeparator(tokenSeparator); filter.setOutputUnigrams(outputUnigrams); return filter; } @@ -113,7 +167,7 @@ public final class ShingleAnalyzerWrapper extends Analyzer { private class SavedStreams { TokenStream wrapped; ShingleFilter shingle; - }; + } @Override public TokenStream reusableTokenStream(String fieldName, Reader reader) throws IOException { @@ -135,6 +189,8 @@ public final class ShingleAnalyzerWrapper extends Analyzer { } } streams.shingle.setMaxShingleSize(maxShingleSize); + streams.shingle.setMinShingleSize(minShingleSize); + 
streams.shingle.setTokenSeparator(tokenSeparator); streams.shingle.setOutputUnigrams(outputUnigrams); return streams.shingle; } diff --git a/contrib/analyzers/common/src/java/org/apache/lucene/analysis/shingle/ShingleFilter.java b/contrib/analyzers/common/src/java/org/apache/lucene/analysis/shingle/ShingleFilter.java index 2a74357695a..ebf789b4d4b 100644 --- a/contrib/analyzers/common/src/java/org/apache/lucene/analysis/shingle/ShingleFilter.java +++ b/contrib/analyzers/common/src/java/org/apache/lucene/analysis/shingle/ShingleFilter.java @@ -18,18 +18,15 @@ package org.apache.lucene.analysis.shingle; */ import java.io.IOException; -import java.util.Iterator; import java.util.LinkedList; -import org.apache.lucene.analysis.Token; import org.apache.lucene.analysis.TokenFilter; import org.apache.lucene.analysis.TokenStream; import org.apache.lucene.analysis.tokenattributes.OffsetAttribute; import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute; import org.apache.lucene.analysis.tokenattributes.TermAttribute; import org.apache.lucene.analysis.tokenattributes.TypeAttribute; -import org.apache.lucene.util.AttributeSource; -import org.apache.lucene.util.AttributeSource.State; + /** *
A ShingleFilter constructs shingles (token n-grams) from a token stream.
@@ -44,26 +41,59 @@ import org.apache.lucene.util.AttributeSource.State;
*/
public final class ShingleFilter extends TokenFilter {
- private LinkedList Set the min shingle size (default: 2).
+ * This method requires that the passed in minShingleSize is not greater
+ * than maxShingleSize, so make sure that maxShingleSize is set before
+ * calling this method.
+ * The unigram output option is independent of the min shingle size.
+ *
+ * @param minShingleSize min size of output shingles
*/
- private void clearShingles() {
- for (int i = 0; i < shingles.length; i++) {
- shingles[i].setLength(0);
+ public void setMinShingleSize(int minShingleSize) {
+ if (minShingleSize < 2) { // a shingle joins at least two tokens
+ throw new IllegalArgumentException("Min shingle size must be >= 2");
}
+ if (minShingleSize > maxShingleSize) { // checked against the max currently set, so set the max first
+ throw new IllegalArgumentException
+ ("Min shingle size must be <= max shingle size");
+ }
+ this.minShingleSize = minShingleSize;
+ gramSize = new CircularSequence(); // rebuild the size sequence so it reflects the new minimum
+ }
+
+ /**
+ * Sets the string to use when joining adjacent tokens to form a shingle
+ * @param tokenSeparator used to separate input stream tokens in output shingles
+ */
+ public void setTokenSeparator(String tokenSeparator) {
+ this.tokenSeparator = null == tokenSeparator ? "" : tokenSeparator; // null is stored as the empty string, i.e. tokens are joined with nothing between them
}
-
- private AttributeSource.State nextToken;
- private int shingleBufferPosition;
- private int[] endOffsets;
/* (non-Javadoc)
* @see org.apache.lucene.analysis.TokenStream#next()
*/
@Override
public final boolean incrementToken() throws IOException {
- while (true) {
- if (nextToken == null) {
- if (!fillShingleBuffer()) {
- return false;
- }
- }
-
- nextToken = shingleBuf.getFirst();
-
- if (outputUnigrams) {
- if (shingleBufferPosition == 0) {
- restoreState(nextToken);
- posIncrAtt.setPositionIncrement(1);
- shingleBufferPosition++;
- return true;
- }
- } else if (shingleBufferPosition % this.maxShingleSize == 0){
- shingleBufferPosition++;
- }
-
- if (shingleBufferPosition < shingleBuf.size()) {
- restoreState(nextToken);
- typeAtt.setType(tokenType);
- offsetAtt.setOffset(offsetAtt.startOffset(), endOffsets[shingleBufferPosition]);
- StringBuilder buf = shingles[shingleBufferPosition];
- int termLength = buf.length();
- char[] termBuffer = termAtt.termBuffer();
- if (termBuffer.length < termLength)
- termBuffer = termAtt.resizeTermBuffer(termLength);
- buf.getChars(0, termLength, termBuffer, 0);
- termAtt.setTermLength(termLength);
- if ((! outputUnigrams) && shingleBufferPosition % this.maxShingleSize == 1) {
- posIncrAtt.setPositionIncrement(1);
- } else {
- posIncrAtt.setPositionIncrement(0);
- }
- shingleBufferPosition++;
- if (shingleBufferPosition == shingleBuf.size()) {
- nextToken = null;
- shingleBufferPosition = 0;
- }
- return true;
- } else {
- nextToken = null;
- shingleBufferPosition = 0;
+ boolean tokenAvailable = false; // stays false when nothing can be emitted, which ends the stream
+ if (gramSize.atMinValue() || inputWindow.size() < gramSize.getValue()) { // starting a new position, or too few buffered tokens for the next size: slide the window
+ shiftInputWindow();
+ }
+ if ( ! inputWindow.isEmpty()) {
+ restoreState(inputWindow.getFirst()); // every output at this position starts from the first window token
+ if (1 == gramSize.getValue()) { // unigram: emit the first window token unchanged
+ posIncrAtt.setPositionIncrement(1);
+ gramSize.advance();
+ tokenAvailable = true;
+ } else if (inputWindow.size() >= gramSize.getValue()) { // enough tokens buffered to build a shingle of this size
+ getNextShingle();
+ gramSize.advance();
+ tokenAvailable = true;
}
}
+ return tokenAvailable;
+ }
+
+ /**
+ * Makes the next token a shingle of length {@link #gramSize},
+ * composed of tokens taken from {@link #inputWindow}.
+ * Callers of this method must first ensure that there are at least
+ * Get the next token from the input stream.
+ * If the next token has Fills {@link #inputWindow} with input stream tokens, if available,
+ * shifting to the right if the window was previously full.
+ * Resets {@link #gramSize} to its minimum value.
*
* @throws IOException if there's a problem getting the next token
*/
- private boolean fillShingleBuffer() throws IOException {
- boolean addedToken = false;
- /*
- * Try to fill the shingle buffer.
- */
- do {
- if (getNextToken()) {
- shingleBuf.add(captureState());
- if (shingleBuf.size() > maxShingleSize)
- {
- shingleBuf.removeFirst();
- }
- addedToken = true;
- } else {
+ private void shiftInputWindow() throws IOException {
+ if (inputWindow.size() > 0) {
+ inputWindow.removeFirst();
+ }
+ while (getNextToken()) {
+ inputWindow.add(captureState());
+ if (inputWindow.size() == maxShingleSize) {
break;
}
- } while (shingleBuf.size() < maxShingleSize);
-
- if (shingleBuf.isEmpty()) {
- return false;
}
-
- /*
- * If no new token could be added to the shingle buffer, we have reached
- * the end of the input stream and have to discard the least recent token.
- */
- if (! addedToken) {
- shingleBuf.removeFirst();
- }
-
- if (shingleBuf.isEmpty()) {
- return false;
- }
-
- clearShingles();
-
- endOffsets = new int[shingleBuf.size()];
- for (int i = 0; i < endOffsets.length; i++) {
- endOffsets[i] = 0;
- }
-
- int i = 0;
- for (Iterator An instance of this class is used to maintain the number of input
+ * stream tokens that will be used to compose the next unigram or shingle:
+ * {@link #gramSize}.
+ * 1 is included in the circular sequence only if
+ * {@link #outputUnigrams} = true.
+ */
+ private class CircularSequence {
+ private int value; // current member of the sequence
+ private int minValue; // first member of the sequence, fixed by the constructor
+
+ public CircularSequence() {
+ minValue = outputUnigrams ? 1 : minShingleSize; // read once here; a later change to either field needs a new instance
+ reset();
+ }
+
+ /**
+ * Returns the current value without changing it; see {@link #advance()}.
+ * @return the current value.
+ */
+ public int getValue() {
+ return value;
+ }
+
+ /**
+ * Increments this circular number's value to the next member in the
+ * circular sequence.
+ * 1 is included in the circular sequence only if
+ * {@link #outputUnigrams} = true.
+ *
+ * @return the next member in the circular sequence
+ */
+ public int advance() {
+ if (value == 1) { // after the unigram, jump straight to the smallest shingle size
+ value = minShingleSize;
+ } else if (value == maxShingleSize) { // largest size reached: wrap around to the first member
+ reset();
+ } else {
+ ++value;
+ }
+ return value;
+ }
+
+ /**
+ * Sets this circular number's value to the first member of the
+ * circular sequence.
+ * 1 is included in the circular sequence only if
+ * {@link #outputUnigrams} = true.
+ */
+ public void reset() {
+ value = minValue;
+ }
+
+ /**
+ * Returns true if the current value is the first member of the circular
+ * sequence.
+ * If {@link #outputUnigrams} = true, the first member of the circular
+ * sequence will be 1; otherwise, it will be {@link #minShingleSize}.
+ *
+ * @return true if the current value is the first member of the circular
+ * sequence; false otherwise
+ */
+ public boolean atMinValue() {
+ return value == minValue;
+ }
}
}
diff --git a/contrib/analyzers/common/src/test/org/apache/lucene/analysis/shingle/ShingleAnalyzerWrapperTest.java b/contrib/analyzers/common/src/test/org/apache/lucene/analysis/shingle/ShingleAnalyzerWrapperTest.java
index 1991af0505d..9258aba003f 100644
--- a/contrib/analyzers/common/src/test/org/apache/lucene/analysis/shingle/ShingleAnalyzerWrapperTest.java
+++ b/contrib/analyzers/common/src/test/org/apache/lucene/analysis/shingle/ShingleAnalyzerWrapperTest.java
@@ -246,4 +246,117 @@ public class ShingleAnalyzerWrapperTest extends BaseTokenStreamTestCase {
new int[] { 6, 13, 13, 18, 18, 27, 27 },
new int[] { 1, 0, 1, 0, 1, 0, 1 });
}
+
+ public void testNonDefaultMinShingleSize() throws Exception { // min=3, max=4: expects unigrams plus 3- and 4-token shingles, then shingles only
+ ShingleAnalyzerWrapper analyzer
+ = new ShingleAnalyzerWrapper(new WhitespaceAnalyzer(), 3, 4);
+ assertAnalyzesToReuse(analyzer, "please divide this sentence into shingles",
+ new String[] { "please", "please divide this", "please divide this sentence",
+ "divide", "divide this sentence", "divide this sentence into",
+ "this", "this sentence into", "this sentence into shingles",
+ "sentence", "sentence into shingles",
+ "into",
+ "shingles" },
+ new int[] { 0, 0, 0, 7, 7, 7, 14, 14, 14, 19, 19, 28, 33 },
+ new int[] { 6, 18, 27, 13, 27, 32, 18, 32, 41, 27, 41, 32, 41 },
+ new int[] { 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1, 1 });
+ analyzer.setOutputUnigrams(false); // same analyzer and input again, now without unigrams
+ assertAnalyzesToReuse(analyzer, "please divide this sentence into shingles",
+ new String[] { "please divide this", "please divide this sentence",
+ "divide this sentence", "divide this sentence into",
+ "this sentence into", "this sentence into shingles",
+ "sentence into shingles" },
+ new int[] { 0, 0, 7, 7, 14, 14, 19 },
+ new int[] { 18, 27, 27, 32, 32, 41, 41 },
+ new int[] { 1, 0, 1, 0, 1, 0, 1 });
+ }
+
+ public void testNonDefaultMinAndSameMaxShingleSize() throws Exception { // min=max=3: exactly one shingle size, with and then without unigrams
+ ShingleAnalyzerWrapper analyzer
+ = new ShingleAnalyzerWrapper(new WhitespaceAnalyzer(), 3, 3);
+ assertAnalyzesToReuse(analyzer, "please divide this sentence into shingles",
+ new String[] { "please", "please divide this",
+ "divide", "divide this sentence",
+ "this", "this sentence into",
+ "sentence", "sentence into shingles",
+ "into",
+ "shingles" },
+ new int[] { 0, 0, 7, 7, 14, 14, 19, 19, 28, 33 },
+ new int[] { 6, 18, 13, 27, 18, 32, 27, 41, 32, 41 },
+ new int[] { 1, 0, 1, 0, 1, 0, 1, 0, 1, 1 });
+ analyzer.setOutputUnigrams(false); // same analyzer and input again, now without unigrams
+ assertAnalyzesToReuse(analyzer, "please divide this sentence into shingles",
+ new String[] { "please divide this",
+ "divide this sentence",
+ "this sentence into",
+ "sentence into shingles" },
+ new int[] { 0, 7, 14, 19 },
+ new int[] { 18, 27, 32, 41 },
+ new int[] { 1, 1, 1, 1 });
+ }
+
+ public void testNoTokenSeparator() throws Exception { // empty separator: shingle tokens are concatenated directly
+ ShingleAnalyzerWrapper analyzer
+ = new ShingleAnalyzerWrapper(new WhitespaceAnalyzer());
+ analyzer.setTokenSeparator("");
+ assertAnalyzesToReuse(analyzer, "please divide into shingles",
+ new String[] { "please", "pleasedivide",
+ "divide", "divideinto",
+ "into", "intoshingles",
+ "shingles" },
+ new int[] { 0, 0, 7, 7, 14, 14, 19 },
+ new int[] { 6, 13, 13, 18, 18, 27, 27 },
+ new int[] { 1, 0, 1, 0, 1, 0, 1 });
+ analyzer.setOutputUnigrams(false); // same analyzer and input again, now without unigrams
+ assertAnalyzesToReuse(analyzer, "please divide into shingles",
+ new String[] { "pleasedivide",
+ "divideinto",
+ "intoshingles" },
+ new int[] { 0, 7, 14 },
+ new int[] { 13, 18, 27 },
+ new int[] { 1, 1, 1 });
+ }
+
+ public void testNullTokenSeparator() throws Exception { // null separator: expected output is the same as for the empty separator
+ ShingleAnalyzerWrapper analyzer
+ = new ShingleAnalyzerWrapper(new WhitespaceAnalyzer());
+ analyzer.setTokenSeparator(null);
+ assertAnalyzesToReuse(analyzer, "please divide into shingles",
+ new String[] { "please", "pleasedivide",
+ "divide", "divideinto",
+ "into", "intoshingles",
+ "shingles" },
+ new int[] { 0, 0, 7, 7, 14, 14, 19 },
+ new int[] { 6, 13, 13, 18, 18, 27, 27 },
+ new int[] { 1, 0, 1, 0, 1, 0, 1 });
+ analyzer.setOutputUnigrams(false); // same analyzer and input again, now without unigrams
+ assertAnalyzesToReuse(analyzer, "please divide into shingles",
+ new String[] { "pleasedivide",
+ "divideinto",
+ "intoshingles" },
+ new int[] { 0, 7, 14 },
+ new int[] { 13, 18, 27 },
+ new int[] { 1, 1, 1 });
+ }
+ public void testAltTokenSeparator() throws Exception {
+ ShingleAnalyzerWrapper analyzer
+ = new ShingleAnalyzerWrapper(new WhitespaceAnalyzer());
+ analyzer.setTokenSeparator("input
*
* @param input input stream
+ * @param minShingleSize minimum shingle size produced by the filter.
* @param maxShingleSize maximum shingle size produced by the filter.
*/
- public ShingleFilter(TokenStream input, int maxShingleSize) {
+ public ShingleFilter(TokenStream input, int minShingleSize, int maxShingleSize) {
super(input);
setMaxShingleSize(maxShingleSize);
+ setMinShingleSize(minShingleSize);
this.termAtt = addAttribute(TermAttribute.class);
this.offsetAtt = addAttribute(OffsetAttribute.class);
this.posIncrAtt = addAttribute(PositionIncrementAttribute.class);
@@ -92,22 +147,34 @@ public final class ShingleFilter extends TokenFilter {
}
/**
- * Construct a ShingleFilter with default shingle size.
+ * Constructs a ShingleFilter with the specified shingle size from the
+ * {@link TokenStream} input
+ *
+ * @param input input stream
+ * @param maxShingleSize maximum shingle size produced by the filter.
+ */
+ public ShingleFilter(TokenStream input, int maxShingleSize) {
+ this(input, DEFAULT_MIN_SHINGLE_SIZE, maxShingleSize); // only the maximum is caller-supplied; the minimum stays at its default
+ }
+
+ /**
+ * Construct a ShingleFilter with default shingle size: 2.
*
* @param input input stream
*/
public ShingleFilter(TokenStream input) {
- this(input, DEFAULT_MAX_SHINGLE_SIZE);
+ this(input, DEFAULT_MIN_SHINGLE_SIZE, DEFAULT_MAX_SHINGLE_SIZE); // both bounds take their defaults
}
/**
- * Construct a ShingleFilter with the specified token type for shingle tokens.
+ * Construct a ShingleFilter with the specified token type for shingle tokens
+ * and the default shingle size: 2
*
* @param input input stream
* @param tokenType token type for shingle tokens
*/
public ShingleFilter(TokenStream input, String tokenType) {
- this(input, DEFAULT_MAX_SHINGLE_SIZE);
+ this(input, DEFAULT_MIN_SHINGLE_SIZE, DEFAULT_MAX_SHINGLE_SIZE); // default size bounds; only the token type is customised below
setTokenType(tokenType);
}
@@ -130,6 +197,7 @@ public final class ShingleFilter extends TokenFilter {
*/
public void setOutputUnigrams(boolean outputUnigrams) {
this.outputUnigrams = outputUnigrams;
+ gramSize = new CircularSequence(); // rebuild the size sequence, whose first member depends on this flag
}
/**
@@ -141,203 +209,239 @@ public final class ShingleFilter extends TokenFilter {
if (maxShingleSize < 2) {
throw new IllegalArgumentException("Max shingle size must be >= 2");
}
- shingles = new StringBuilder[maxShingleSize];
- for (int i = 0; i < shingles.length; i++) {
- shingles[i] = new StringBuilder();
- }
this.maxShingleSize = maxShingleSize;
}
/**
- * Clear the StringBuilders that are used for storing the output shingles.
+ * gramSize
tokens available in inputWindow
.
+ */
+ private void getNextShingle() { // writes the shingle of the current gramSize into the term, offset, type and position-increment attributes
+ int startOffset = offsetAtt.startOffset(); // start offset of the state restored on entry (incrementToken restores the first window token before calling)
+
+ int minTokNum = gramSize.getValue() - 1; // zero-based inputWindow position; by default only the newest token is appended to the text already built
+ if (gramSize.getValue() == minShingleSize) {
+ // Clear the shingle text buffer if this is the first shingle
+ // at the current position in the input stream.
+ shingleBuilder.setLength(0);
+ minTokNum = 0; // buffer is empty, so build from the first window token
+ }
+ for (int tokNum = minTokNum ; tokNum < gramSize.getValue() ; ++tokNum) {
+ if (tokNum > 0) { // separator goes between tokens, never before the first
+ shingleBuilder.append(tokenSeparator);
+ }
+ restoreState(inputWindow.get(tokNum)); // load token tokNum's captured state so termAtt can be read below
+ shingleBuilder.append(termAtt.termBuffer(), 0, termAtt.termLength());
+ }
+ char[] termBuffer = termAtt.termBuffer();
+ int termLength = shingleBuilder.length();
+ if (termBuffer.length < termLength) { // grow the term buffer when the shingle text does not fit
+ termBuffer = termAtt.resizeTermBuffer(termLength);
+ }
+ shingleBuilder.getChars(0, termLength, termBuffer, 0);
+ termAtt.setTermLength(termLength);
+ posIncrAtt.setPositionIncrement(gramSize.atMinValue() ? 1 : 0); // increment 1 only when this is the first size in the sequence; otherwise 0
+ typeAtt.setType(tokenType);
+ offsetAtt.setOffset(startOffset, offsetAtt.endOffset()); // end offset is that of the last token restored in the loop
}
- private int numFillerTokensToInsert;
- private AttributeSource.State currentToken;
- private boolean hasCurrentToken;
-
- private TermAttribute termAtt;
- private OffsetAttribute offsetAtt;
- private PositionIncrementAttribute posIncrAtt;
- private TypeAttribute typeAtt;
-
/**
- * Get the next token from the input stream and push it on the token buffer.
- * If we encounter a token with position increment > 1, we put filler tokens
- * on the token buffer.
- * positionIncrement > 1
,
+ * positionIncrement - 1
{@link #FILLER_TOKEN}s are
+ * inserted first.
+ * @return false for end of stream; true otherwise
* @throws IOException if the input stream has a problem
*/
private boolean getNextToken() throws IOException {
-
- while (true) {
- if (numFillerTokensToInsert > 0) {
- if (currentToken == null) {
- currentToken = captureState();
- } else {
- restoreState(currentToken);
- }
- numFillerTokensToInsert--;
- // A filler token occupies no space
- offsetAtt.setOffset(offsetAtt.startOffset(), offsetAtt.startOffset());
- termAtt.setTermBuffer(FILLER_TOKEN, 0, FILLER_TOKEN.length);
- return true;
- }
-
- if (hasCurrentToken) {
- if (currentToken != null) {
- restoreState(currentToken);
- currentToken = null;
- }
- hasCurrentToken = false;
- return true;
- }
-
- if (!input.incrementToken()) return false;
- hasCurrentToken = true;
-
- if (posIncrAtt.getPositionIncrement() > 1) {
- numFillerTokensToInsert = posIncrAtt.getPositionIncrement() - 1;
- }
+ boolean success = false; // stays false only when the input stream is exhausted
+ if (numFillerTokensToInsert > 0) { // fillers still owed for a position gap seen earlier
+ insertFillerToken();
+ success = true;
+ } else if (null != nextInputStreamToken) { // fillers done: hand out the real token that was stashed
+ restoreState(nextInputStreamToken);
+ nextInputStreamToken = null;
+ success = true;
+ } else if (input.incrementToken()) {
+ if (posIncrAtt.getPositionIncrement() > 1) { // position gap: emit fillers before this token
+ numFillerTokensToInsert = posIncrAtt.getPositionIncrement() - 1;
+ insertFillerToken(); // stashes the real token and yields the first filler in its place
+ }
+ success = true;
}
+ return success;
}
/**
- * Fill the output buffer with new shingles.
+ * Inserts a {@link #FILLER_TOKEN} and decrements
+ * {@link #numFillerTokensToInsert}.
+ */
+ private void insertFillerToken() {
+ if (null == nextInputStreamToken) { // first filler for this gap: stash the real token's state for later
+ nextInputStreamToken = captureState();
+ } else { // later fillers: start again from the stashed token's state
+ restoreState(nextInputStreamToken);
+ }
+ --numFillerTokensToInsert;
+ // A filler token occupies no space
+ offsetAtt.setOffset(offsetAtt.startOffset(), offsetAtt.startOffset()); // end offset set equal to start offset
+ termAtt.setTermBuffer(FILLER_TOKEN, 0, FILLER_TOKEN.length); // term text becomes the filler marker
+ }
+
+ /**
+ * gramSize
will take on values from the circular sequence
+ * { [ 1, ] {@link #minShingleSize} [ , ... , {@link #maxShingleSize} ] }.
+ * gramSize
will take on values from the circular sequence
+ * { [ 1, ] {@link #minShingleSize} [ , ... , {@link #maxShingleSize} ] }.
+ * gramSize
will take on values from the circular sequence
+ * { [ 1, ] {@link #minShingleSize} [ , ... , {@link #maxShingleSize} ] }.
+ *