diff --git a/CHANGES.txt b/CHANGES.txt index a99aa2c5bea..75d82afbad8 100644 --- a/CHANGES.txt +++ b/CHANGES.txt @@ -199,6 +199,10 @@ Optimizations * LUCENE-2285: Code cleanup. (Shai Erera via Uwe Schindler) +* LUCENE-2303: Remove code duplication in Token class by subclassing + TermAttributeImpl, move DEFAULT_TYPE constant to TypeInterface, improve + null-handling for TypeAttribute. (Uwe Schindler) + Build * LUCENE-2124: Moved the JDK-based collation support from contrib/collation diff --git a/src/java/org/apache/lucene/analysis/Token.java b/src/java/org/apache/lucene/analysis/Token.java index 5ccf5e289ee..32242ae9e0f 100644 --- a/src/java/org/apache/lucene/analysis/Token.java +++ b/src/java/org/apache/lucene/analysis/Token.java @@ -17,6 +17,7 @@ package org.apache.lucene.analysis; * limitations under the License. */ +import org.apache.lucene.analysis.tokenattributes.TermAttributeImpl; import org.apache.lucene.analysis.tokenattributes.OffsetAttribute; import org.apache.lucene.analysis.tokenattributes.FlagsAttribute; import org.apache.lucene.analysis.tokenattributes.PayloadAttribute; @@ -25,11 +26,9 @@ import org.apache.lucene.analysis.tokenattributes.TermAttribute; import org.apache.lucene.analysis.tokenattributes.TypeAttribute; import org.apache.lucene.index.Payload; import org.apache.lucene.index.TermPositions; // for javadoc -import org.apache.lucene.util.ArrayUtil; import org.apache.lucene.util.Attribute; import org.apache.lucene.util.AttributeSource; import org.apache.lucene.util.AttributeImpl; -import org.apache.lucene.util.RamUsageEstimator; /** A Token is an occurrence of a term from the text of a field. It consists of @@ -116,16 +115,10 @@ import org.apache.lucene.util.RamUsageEstimator; @see org.apache.lucene.index.Payload */ -public class Token extends AttributeImpl - implements Cloneable, TermAttribute, TypeAttribute, PositionIncrementAttribute, +public class Token extends TermAttributeImpl + implements TypeAttribute, PositionIncrementAttribute, FlagsAttribute, OffsetAttribute, PayloadAttribute { - public static final String DEFAULT_TYPE = "word"; - - private static int MIN_BUFFER_SIZE = 10; - - private char[] termBuffer; - private int termLength; private int startOffset,endOffset; private String type = DEFAULT_TYPE; private int flags; @@ -273,139 +266,6 @@ public class Token extends AttributeImpl return positionIncrement; } - /** Returns the Token's term text. - * - * This method has a performance penalty - * because the text is stored internally in a char[]. If - * possible, use {@link #termBuffer()} and {@link - * #termLength()} directly instead. If you really need a - * String, use this method, which is nothing more than - * a convenience call to new String(token.termBuffer(), 0, token.termLength()) - */ - public final String term() { - initTermBuffer(); - return new String(termBuffer, 0, termLength); - } - - /** Copies the contents of buffer, starting at offset for - * length characters, into the termBuffer array. - * @param buffer the buffer to copy - * @param offset the index in the buffer of the first character to copy - * @param length the number of characters to copy - */ - public final void setTermBuffer(char[] buffer, int offset, int length) { - growTermBuffer(length); - System.arraycopy(buffer, offset, termBuffer, 0, length); - termLength = length; - } - - /** Copies the contents of buffer into the termBuffer array. - * @param buffer the buffer to copy - */ - public final void setTermBuffer(String buffer) { - final int length = buffer.length(); - growTermBuffer(length); - buffer.getChars(0, length, termBuffer, 0); - termLength = length; - } - - /** Copies the contents of buffer, starting at offset and continuing - * for length characters, into the termBuffer array. - * @param buffer the buffer to copy - * @param offset the index in the buffer of the first character to copy - * @param length the number of characters to copy - */ - public final void setTermBuffer(String buffer, int offset, int length) { - assert offset <= buffer.length(); - assert offset + length <= buffer.length(); - growTermBuffer(length); - buffer.getChars(offset, offset + length, termBuffer, 0); - termLength = length; - } - - /** Returns the internal termBuffer character array which - * you can then directly alter. If the array is too - * small for your token, use {@link - * #resizeTermBuffer(int)} to increase it. After - * altering the buffer be sure to call {@link - * #setTermLength} to record the number of valid - * characters that were placed into the termBuffer. */ - public final char[] termBuffer() { - initTermBuffer(); - return termBuffer; - } - - /** Grows the termBuffer to at least size newSize, preserving the - * existing content. Note: If the next operation is to change - * the contents of the term buffer use - * {@link #setTermBuffer(char[], int, int)}, - * {@link #setTermBuffer(String)}, or - * {@link #setTermBuffer(String, int, int)} - * to optimally combine the resize with the setting of the termBuffer. - * @param newSize minimum size of the new termBuffer - * @return newly created termBuffer with length >= newSize - */ - public char[] resizeTermBuffer(int newSize) { - if (termBuffer == null) { - // The buffer is always at least MIN_BUFFER_SIZE - termBuffer = new char[ArrayUtil.oversize(newSize < MIN_BUFFER_SIZE ? MIN_BUFFER_SIZE : newSize, RamUsageEstimator.NUM_BYTES_CHAR)]; - } else { - if(termBuffer.length < newSize){ - // Not big enough; create a new array with slight - // over allocation and preserve content - final char[] newCharBuffer = new char[ArrayUtil.oversize(newSize, RamUsageEstimator.NUM_BYTES_CHAR)]; - System.arraycopy(termBuffer, 0, newCharBuffer, 0, termBuffer.length); - termBuffer = newCharBuffer; - } - } - return termBuffer; - } - - /** Allocates a buffer char[] of at least newSize, without preserving the existing content. - * its always used in places that set the content - * @param newSize minimum size of the buffer - */ - private void growTermBuffer(int newSize) { - if (termBuffer == null) { - // The buffer is always at least MIN_BUFFER_SIZE - termBuffer = new char[ArrayUtil.oversize(newSize < MIN_BUFFER_SIZE ? MIN_BUFFER_SIZE : newSize, RamUsageEstimator.NUM_BYTES_CHAR)]; - } else { - if(termBuffer.length < newSize){ - // Not big enough; create a new array with slight - // over allocation: - termBuffer = new char[ArrayUtil.oversize(newSize, RamUsageEstimator.NUM_BYTES_CHAR)]; - } - } - } - - private void initTermBuffer() { - if (termBuffer == null) { - termBuffer = new char[ArrayUtil.oversize(MIN_BUFFER_SIZE, RamUsageEstimator.NUM_BYTES_CHAR)]; - termLength = 0; - } - } - - /** Return number of valid characters (length of the term) - * in the termBuffer array. */ - public final int termLength() { - initTermBuffer(); - return termLength; - } - - /** Set number of valid characters (length of the term) in - * the termBuffer array. Use this to truncate the termBuffer - * or to synchronize with external manipulation of the termBuffer. - * Note: to grow the size of the array, - * use {@link #resizeTermBuffer(int)} first. - * @param length the truncated length - */ - public final void setTermLength(int length) { - initTermBuffer(); - if (length > termBuffer.length) - throw new IllegalArgumentException("length " + length + " exceeds the size of the termBuffer (" + termBuffer.length + ")"); - termLength = length; - } - /** Returns this Token's starting offset, the position of the first character corresponding to this token in the source text. @@ -490,15 +350,10 @@ public class Token extends AttributeImpl @Override public String toString() { - StringBuilder sb = new StringBuilder(); - sb.append('('); - initTermBuffer(); - if (termBuffer == null) - sb.append("null"); - else - sb.append(termBuffer, 0, termLength); - sb.append(',').append(startOffset).append(',').append(endOffset); - if (!type.equals("word")) + final StringBuilder sb = new StringBuilder(); + sb.append('(').append(term()).append(',') + .append(startOffset).append(',').append(endOffset); + if (!"word".equals(type)) sb.append(",type=").append(type); if (positionIncrement != 1) sb.append(",posIncr=").append(positionIncrement); @@ -511,9 +366,8 @@ public class Token extends AttributeImpl */ @Override public void clear() { + super.clear(); payload = null; - // Leave termBuffer to allow re-use - termLength = 0; positionIncrement = 1; flags = 0; startOffset = endOffset = 0; @@ -524,9 +378,6 @@ public class Token extends AttributeImpl public Object clone() { Token t = (Token)super.clone(); // Do a deep clone - if (termBuffer != null) { - t.termBuffer = termBuffer.clone(); - } if (payload != null) { t.payload = (Payload) payload.clone(); } @@ -554,46 +405,30 @@ public class Token extends AttributeImpl return true; if (obj instanceof Token) { - Token other = (Token) obj; - - initTermBuffer(); - other.initTermBuffer(); - - if (termLength == other.termLength && - startOffset == other.startOffset && + final Token other = (Token) obj; + return (startOffset == other.startOffset && endOffset == other.endOffset && flags == other.flags && positionIncrement == other.positionIncrement && - subEqual(type, other.type) && - subEqual(payload, other.payload)) { - for(int i=0;i