diff --git a/lucene/MIGRATE.txt b/lucene/MIGRATE.txt
index 424fcacb788..542759777da 100644
--- a/lucene/MIGRATE.txt
+++ b/lucene/MIGRATE.txt
@@ -128,7 +128,7 @@ LUCENE-1458, LUCENE-2111: Flexible Indexing
   do this:

-    if ((doc=td.skipTo(target)) != DocsEnum.NO_MORE_DOCS) {
+    if ((doc=td.advance(target)) != DocsEnum.NO_MORE_DOCS) {
       ...
     }

diff --git a/lucene/src/java/org/apache/lucene/analysis/Token.java b/lucene/src/java/org/apache/lucene/analysis/Token.java
index 6be3a764218..a50b934377c 100644
--- a/lucene/src/java/org/apache/lucene/analysis/Token.java
+++ b/lucene/src/java/org/apache/lucene/analysis/Token.java
@@ -45,8 +45,8 @@ import org.apache.lucene.util.AttributeImpl;
   with type "eos".  The default token type is "word".
   A Token can optionally have metadata (a.k.a. Payload) in the form of a variable
-  length byte array. Use {@link DocsAndPositionsEnum#getPayloadLength()} and
-  {@link DocsAndPositionsEnum#getPayload(byte[], int)} to retrieve the payloads from the index.
+  length byte array. Use {@link DocsAndPositionsEnum#getPayload()} to retrieve the
+  payloads from the index.
@@ -253,7 +253,7 @@ public class Token extends CharTermAttributeImpl
*
*
* @param positionIncrement the distance from the prior term
- * @see org.apache.lucene.index.TermPositions
+ * @see org.apache.lucene.index.DocsAndPositionsEnum
*/
public void setPositionIncrement(int positionIncrement) {
if (positionIncrement < 0)
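Editorial note: a minimal sketch of the replacement read path the hunks above
describe, assuming a Lucene 4.0-era flex API. The MultiFields helper, the
"body" field, and the "lucene" term are illustrative assumptions, and exact
method signatures may differ between snapshots:

    import java.io.IOException;
    import org.apache.lucene.index.DocsAndPositionsEnum;
    import org.apache.lucene.index.DocsEnum;
    import org.apache.lucene.index.IndexReader;
    import org.apache.lucene.index.MultiFields;
    import org.apache.lucene.util.BytesRef;

    public class AdvanceAndPayloadSketch {
      // Visit every matching doc at or after target, reading positions and payloads.
      static void scan(IndexReader reader, int target) throws IOException {
        DocsAndPositionsEnum postings = MultiFields.getTermPositionsEnum(
            reader, null, "body", new BytesRef("lucene"));
        if (postings == null) {
          return; // term absent, or field indexed without positions
        }
        int doc = postings.advance(target); // advance() replaces the old skipTo()
        while (doc != DocsEnum.NO_MORE_DOCS) {
          for (int i = 0; i < postings.freq(); i++) {
            postings.nextPosition();
            // getPayload() replaces getPayloadLength()/getPayload(byte[], int);
            // it may return null when the current position carries no payload
            BytesRef payload = postings.getPayload();
          }
          doc = postings.nextDoc();
        }
      }
    }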
diff --git a/lucene/src/java/org/apache/lucene/analysis/TokenStream.java b/lucene/src/java/org/apache/lucene/analysis/TokenStream.java
index 6fb7e8cc4bb..606bdef4aae 100644
--- a/lucene/src/java/org/apache/lucene/analysis/TokenStream.java
+++ b/lucene/src/java/org/apache/lucene/analysis/TokenStream.java
@@ -74,7 +74,7 @@ import org.apache.lucene.util.AttributeSource;
*
 * Sometimes it is desirable to capture a current state of a TokenStream,
* e.g., for buffering purposes (see {@link CachingTokenFilter},
- * {@link TeeSinkTokenFilter}). For this use case
+ * TeeSinkTokenFilter). For this use case
* {@link AttributeSource#captureState} and {@link AttributeSource#restoreState}
* can be used.
*
 * The {@code TokenStream}-API in Lucene is based on the decorator pattern.

diff --git a/lucene/src/java/org/apache/lucene/analysis/package.html b/lucene/src/java/org/apache/lucene/analysis/package.html
index eb23fc7d1ac..d98f84f5d66 100644
--- a/lucene/src/java/org/apache/lucene/analysis/package.html
+++ b/lucene/src/java/org/apache/lucene/analysis/package.html
@@ -98,27 +98,22 @@ There are many post tokenization steps that can be done, including (but not limited to):
-  Lucene Java provides a number of analysis capabilities, the most commonly used one being the {@link
-  org.apache.lucene.analysis.standard.StandardAnalyzer}. Many applications will have a long and industrious life with nothing more
+  Lucene Java provides a number of analysis capabilities, the most commonly used one being the StandardAnalyzer.
+  Many applications will have a long and industrious life with nothing more
   than the StandardAnalyzer. However, there are a few other classes/packages that are worth mentioning:
   Analysis is one of the main causes of performance degradation during indexing. Simply put, the more you analyze the slower the indexing (in most cases).
-  Perhaps your application would be just fine using the simple {@link org.apache.lucene.analysis.WhitespaceTokenizer} combined with a
-  {@link org.apache.lucene.analysis.StopFilter}. The contrib/benchmark library can be useful for testing out the speed of the analysis process.
+  Perhaps your application would be just fine using the simple WhitespaceTokenizer combined with a StopFilter. The contrib/benchmark library can be useful
+  for testing out the speed of the analysis process.
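Editorial note: a minimal sketch of the whitespace-plus-stopwords chain the
paragraph above suggests, assuming the 3.x/4.0-era Analyzer.tokenStream
override, the Version.LUCENE_40 constant, and the bundled English stop set;
the class name SimpleStopAnalyzer is made up:

    import java.io.Reader;
    import org.apache.lucene.analysis.Analyzer;
    import org.apache.lucene.analysis.StopAnalyzer;
    import org.apache.lucene.analysis.StopFilter;
    import org.apache.lucene.analysis.TokenStream;
    import org.apache.lucene.analysis.WhitespaceTokenizer;
    import org.apache.lucene.util.Version;

    public final class SimpleStopAnalyzer extends Analyzer {
      @Override
      public TokenStream tokenStream(String fieldName, Reader reader) {
        // Split on whitespace only, then drop common English stop words.
        TokenStream ts = new WhitespaceTokenizer(Version.LUCENE_40, reader);
        return new StopFilter(Version.LUCENE_40, ts, StopAnalyzer.ENGLISH_STOP_WORDS_SET);
      }
    }

Timing this against StandardAnalyzer with contrib/benchmark shows whether the
cheaper chain is good enough for a given corpus.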
@@ -229,7 +224,7 @@ the source code of any one of the many samples located in this package.
   public TokenStream tokenStream(final String fieldName, Reader reader) {
     final TokenStream ts = someAnalyzer.tokenStream(fieldName, reader);
     TokenStream res = new TokenStream() {
-      TermAttribute termAtt = addAttribute(TermAttribute.class);
+      CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
       PositionIncrementAttribute posIncrAtt = addAttribute(PositionIncrementAttribute.class);

       public boolean incrementToken() throws IOException {
@@ -237,7 +232,7 @@ the source code of any one of the many samples located in this package.
         while (true) {
           boolean hasNext = ts.incrementToken();
           if (hasNext) {
-            if (stopWords.contains(termAtt.term())) {
+            if (stopWords.contains(termAtt.toString())) {
               extraIncrement++; // filter this word
               continue;
             }
@@ -282,7 +277,7 @@ is necessary that can transport custom types of data from the documents to the index.
   Lucene now provides six Attributes out of the box, which replace the variables the Token class has:
-  TermAttribute: The term text of a token.
+  CharTermAttribute: The term text of a token.
   OffsetAttribute: The start and end offset of token in characters.
   PositionIncrementAttribute: See above for detailed information about position increment.
   PayloadAttribute: The payload that a Token can optionally have.
   This
@@ -409,7 +404,7 @@ public final class LengthFilter extends TokenFilter {

   final int min;
   final int max;

-  private TermAttribute termAtt;
+  private CharTermAttribute termAtt;

   /**
    * Build a filter that removes words that are too long or too
@@ -420,7 +415,7 @@ {
     super(in);
     this.min = min;
     this.max = max;
-    termAtt = addAttribute(TermAttribute.class);
+    termAtt = addAttribute(CharTermAttribute.class);
   }

   /**
@@ -431,7 +426,7 @@ {
     assert termAtt != null;
     // return the first non-stop word found
     while (input.incrementToken()) {
-      int len = termAtt.termLength();
+      int len = termAtt.length();
       if (len >= min && len <= max) {
         return true;
       }
@@ -442,11 +437,11 @@ }
 }
-The TermAttribute is added in the constructor and stored in the instance variable termAtt.
-Remember that there can only be a single instance of TermAttribute in the chain, so in our example the
+The CharTermAttribute is added in the constructor and stored in the instance variable termAtt.
+Remember that there can only be a single instance of CharTermAttribute in the chain, so in our example the
 addAttribute() call in LengthFilter returns the TermAttribute that the WhitespaceTokenizer already added. The tokens
 are retrieved from the input stream in the incrementToken() method. By looking at the term text
-in the TermAttribute the length of the term can be determined and too short or too long tokens are skipped.
+in the CharTermAttribute the length of the term can be determined and too short or too long tokens are skipped.
 Note how incrementToken() can efficiently access the instance variable; no attribute lookup
 is necessary. The same is true for the consumer, which can simply use local references to the Attributes.
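Editorial note: the single-instance guarantee described above is easy to
observe directly; a sketch, assuming the Version-taking WhitespaceTokenizer
constructor, with SharedAttributeDemo as a made-up name:

    import java.io.StringReader;
    import org.apache.lucene.analysis.WhitespaceTokenizer;
    import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
    import org.apache.lucene.util.Version;

    public class SharedAttributeDemo {
      public static void main(String[] args) {
        WhitespaceTokenizer tok = new WhitespaceTokenizer(
            Version.LUCENE_40, new StringReader("hello world"));
        // Asking twice for the same attribute class yields the same instance,
        // so tokenizer and filters share one CharTermAttribute per chain.
        CharTermAttribute first = tok.addAttribute(CharTermAttribute.class);
        CharTermAttribute second = tok.addAttribute(CharTermAttribute.class);
        System.out.println(first == second); // prints true
      }
    }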
@@ -521,17 +516,17 @@ that tags every word with a leading upper-case letter as a 'Noun' and all other
 public static class PartOfSpeechTaggingFilter extends TokenFilter {
   PartOfSpeechAttribute posAtt;
-  TermAttribute termAtt;
+  CharTermAttribute termAtt;

   protected PartOfSpeechTaggingFilter(TokenStream input) {
     super(input);
     posAtt = addAttribute(PartOfSpeechAttribute.class);
-    termAtt = addAttribute(TermAttribute.class);
+    termAtt = addAttribute(CharTermAttribute.class);
   }

   public boolean incrementToken() throws IOException {
     if (!input.incrementToken()) {return false;}
-    posAtt.setPartOfSpeech(determinePOS(termAtt.termBuffer(), 0, termAtt.termLength()));
+    posAtt.setPartOfSpeech(determinePOS(termAtt.buffer(), 0, termAtt.length()));
     return true;
   }

@@ -577,8 +572,8 @@ to make use of the new PartOfSpeechAttribute and print it out:
   MyAnalyzer analyzer = new MyAnalyzer();
   TokenStream stream = analyzer.tokenStream("field", new StringReader(text));

-  // get the TermAttribute from the TokenStream
-  TermAttribute termAtt = stream.addAttribute(TermAttribute.class);
+  // get the CharTermAttribute from the TokenStream
+  CharTermAttribute termAtt = stream.addAttribute(CharTermAttribute.class);

   // get the PartOfSpeechAttribute from the TokenStream
   PartOfSpeechAttribute posAtt = stream.addAttribute(PartOfSpeechAttribute.class);
@@ -587,7 +582,7 @@ to make use of the new PartOfSpeechAttribute and print it out:

   // print all tokens until stream is exhausted
   while (stream.incrementToken()) {
-    System.out.println(termAtt.term() + ": " + posAtt.getPartOfSpeech());
+    System.out.println(termAtt.toString() + ": " + posAtt.getPartOfSpeech());
   }

   stream.end();

diff --git a/lucene/src/java/org/apache/lucene/analysis/tokenattributes/PositionIncrementAttribute.java b/lucene/src/java/org/apache/lucene/analysis/tokenattributes/PositionIncrementAttribute.java
index bbb8cf24b7b..20999aa151a 100644
--- a/lucene/src/java/org/apache/lucene/analysis/tokenattributes/PositionIncrementAttribute.java
+++ b/lucene/src/java/org/apache/lucene/analysis/tokenattributes/PositionIncrementAttribute.java
@@ -43,7 +43,7 @@ import org.apache.lucene.util.Attribute;
  *
  *
  *
- * @see org.apache.lucene.index.TermPositions
+ * @see org.apache.lucene.index.DocsAndPositionsEnum
  */
 public interface PositionIncrementAttribute extends Attribute {
   /** Set the position increment. The default value is one.

diff --git a/lucene/src/java/org/apache/lucene/index/Payload.java b/lucene/src/java/org/apache/lucene/index/Payload.java
index 50a99129d69..f7639a3830f 100644
--- a/lucene/src/java/org/apache/lucene/index/Payload.java
+++ b/lucene/src/java/org/apache/lucene/index/Payload.java
@@ -30,7 +30,7 @@ import org.apache.lucene.util.ArrayUtil;
  * To store payloads in the index a {@link TokenStream} has to be used that
  * produces payload data.
  *
- * Use {@link TermPositions#getPayloadLength()} and {@link TermPositions#getPayload(byte[], int)}
+ * Use {@link DocsAndPositionsEnum#getPayload()}
  * to retrieve the payloads from the index.
  *
  */

diff --git a/lucene/src/java/org/apache/lucene/index/SegmentWriteState.java b/lucene/src/java/org/apache/lucene/index/SegmentWriteState.java
index f94d6bc9bac..a03bb0b0934 100644
--- a/lucene/src/java/org/apache/lucene/index/SegmentWriteState.java
+++ b/lucene/src/java/org/apache/lucene/index/SegmentWriteState.java
@@ -48,8 +48,8 @@ public class SegmentWriteState {
    *  tweaking this is rarely useful.*/
   public final int termIndexInterval;

-  /** Expert: The fraction of {@link TermDocs} entries stored in skip tables,
-   *  used to accelerate {@link TermDocs#skipTo(int)}.  Larger values result in
+  /** Expert: The fraction of TermDocs entries stored in skip tables,
+   *  used to accelerate {@link DocsEnum#advance(int)}.  Larger values result in
    *  smaller indexes, greater acceleration, but fewer accelerable cases, while
    *  smaller values result in bigger indexes, less acceleration and more
    *  accelerable cases. More detailed experiments would be useful here. */

diff --git a/lucene/src/java/org/apache/lucene/index/Term.java b/lucene/src/java/org/apache/lucene/index/Term.java
index d69c4811db5..3903255c58a 100644
--- a/lucene/src/java/org/apache/lucene/index/Term.java
+++ b/lucene/src/java/org/apache/lucene/index/Term.java
@@ -101,7 +101,7 @@ public final class Term implements Comparable, java.io.Serializable {
    * Therefore the bytes should not be modified after construction, for
    * example, you should clone a copy rather than pass reused bytes from
    * a TermsEnum.
-   * @param text The bytes of the new term (field is implicitly same as this Term instance)
+   * @param bytes The bytes of the new term (field is implicitly same as this Term instance)
    * @return A new Term
    */
   public Term createTerm(BytesRef bytes)

diff --git a/lucene/src/java/org/apache/lucene/index/TermVectorMapper.java b/lucene/src/java/org/apache/lucene/index/TermVectorMapper.java
index c0da5bb25ec..bdca6327923 100644
--- a/lucene/src/java/org/apache/lucene/index/TermVectorMapper.java
+++ b/lucene/src/java/org/apache/lucene/index/TermVectorMapper.java
@@ -51,7 +51,7 @@ public abstract class TermVectorMapper {
    * Tell the mapper what to expect in regards to field, number of terms, offset and position storage.
    * This method will be called once before retrieving the vector for a field.
    *
-   * This method will be called before {@link #map(String,int,TermVectorOffsetInfo[],int[])}.
+   * This method will be called before {@link #map(BytesRef,int,TermVectorOffsetInfo[],int[])}.
    * @param field The field the vector is for
    * @param numTerms The number of terms that need to be mapped
    * @param storeOffsets true if the mapper should expect offset information

diff --git a/lucene/src/java/org/apache/lucene/search/FieldCacheRangeFilter.java b/lucene/src/java/org/apache/lucene/search/FieldCacheRangeFilter.java
index 13f12505f6a..43ccb0d55ea 100644
--- a/lucene/src/java/org/apache/lucene/search/FieldCacheRangeFilter.java
+++ b/lucene/src/java/org/apache/lucene/search/FieldCacheRangeFilter.java
@@ -76,7 +76,7 @@ public abstract class FieldCacheRangeFilter extends Filter {
   public abstract DocIdSet getDocIdSet(IndexReader reader) throws IOException;

   /**
-   * Creates a string range filter using {@link FieldCache#getStringIndex}. This works with all
+   * Creates a string range filter using {@link FieldCache#getTermsIndex}. This works with all
    * fields containing zero or one term in the field. The range can be half-open by setting one
    * of the values to null.
    */

diff --git a/lucene/src/java/org/apache/lucene/search/FieldCacheTermsFilter.java b/lucene/src/java/org/apache/lucene/search/FieldCacheTermsFilter.java
index f95a51775e9..57f8be754a4 100644
--- a/lucene/src/java/org/apache/lucene/search/FieldCacheTermsFilter.java
+++ b/lucene/src/java/org/apache/lucene/search/FieldCacheTermsFilter.java
@@ -19,6 +19,7 @@ package org.apache.lucene.search;

 import java.io.IOException;

+import org.apache.lucene.index.DocsEnum; // javadoc @link
 import org.apache.lucene.index.IndexReader;
 import org.apache.lucene.util.OpenBitSet;
 import org.apache.lucene.util.BytesRef;
@@ -40,7 +41,7 @@ import org.apache.lucene.util.BytesRef;
  *
  *
  * The first invocation of this filter on a given field will
- * be slower, since a {@link FieldCache.StringIndex} must be
+ * be slower, since a {@link FieldCache.DocTermsIndex} must be
  * created.  Subsequent invocations using the same field
  * will re-use this cache.  However, as with all
  * functionality based on {@link FieldCache}, persistent RAM

diff --git a/lucene/src/java/org/apache/lucene/search/FieldComparator.java b/lucene/src/java/org/apache/lucene/search/FieldComparator.java
index 358d5d32f23..9b140fff64b 100644
--- a/lucene/src/java/org/apache/lucene/search/FieldComparator.java
+++ b/lucene/src/java/org/apache/lucene/search/FieldComparator.java
@@ -699,7 +699,7 @@ public abstract class FieldComparator {
    * ordinals.  This is functionally equivalent to {@link
    * TermValComparator}, but it first resolves the string
    * to their relative ordinal positions (using the index
-   * returned by {@link FieldCache#getStringIndex}), and
+   * returned by {@link FieldCache#getTermsIndex}), and
    * does most comparisons using the ordinals.  For medium
    * to large results, this comparator will be much faster
    * than {@link TermValComparator}.  For very small

diff --git a/lucene/src/java/org/apache/lucene/store/MMapDirectory.java b/lucene/src/java/org/apache/lucene/store/MMapDirectory.java
index 6a394c4509e..58be3678ccb 100644
--- a/lucene/src/java/org/apache/lucene/store/MMapDirectory.java
+++ b/lucene/src/java/org/apache/lucene/store/MMapDirectory.java
@@ -22,6 +22,7 @@ import java.io.File;
 import java.io.RandomAccessFile;
 import java.nio.ByteBuffer;
 import java.nio.BufferUnderflowException;
+import java.nio.channels.ClosedChannelException; // javadoc @link
 import java.nio.channels.FileChannel;
 import java.nio.channels.FileChannel.MapMode;

diff --git a/lucene/src/java/org/apache/lucene/store/NIOFSDirectory.java b/lucene/src/java/org/apache/lucene/store/NIOFSDirectory.java
index 5e28cf501aa..2a18262a2ab 100644
--- a/lucene/src/java/org/apache/lucene/store/NIOFSDirectory.java
+++ b/lucene/src/java/org/apache/lucene/store/NIOFSDirectory.java
@@ -20,6 +20,7 @@ package org.apache.lucene.store;

 import java.io.File;
 import java.io.IOException;
 import java.nio.ByteBuffer;
+import java.nio.channels.ClosedChannelException; // javadoc @link
 import java.nio.channels.FileChannel;
 import java.util.concurrent.Future; // javadoc

@@ -29,7 +30,7 @@ import java.util.concurrent.Future; // javadoc
  * without synchronizing.
  *
  * This class only uses FileChannel when reading; writing is achieved with
- * {@link SimpleFSDirectory.SimpleFSIndexOutput}.
+ * {@link FSDirectory.FSIndexOutput}.
  *
 * NOTE: NIOFSDirectory is not recommended on Windows because of a bug in
 * how FileChannel.read is implemented in Sun's JRE. Inside of the

diff --git a/lucene/src/java/org/apache/lucene/util/automaton/Automaton.java b/lucene/src/java/org/apache/lucene/util/automaton/Automaton.java
index ff5558995e9..aed2e52dabb 100644
--- a/lucene/src/java/org/apache/lucene/util/automaton/Automaton.java
+++ b/lucene/src/java/org/apache/lucene/util/automaton/Automaton.java
@@ -115,7 +115,6 @@ public class Automaton implements Serializable, Cloneable {
  * constructor, automata can be constructed manually from {@link State} and
  * {@link Transition} objects.
  *
- * @see #setInitialState(State)
  * @see State
  * @see Transition
  */

diff --git a/lucene/src/java/org/apache/lucene/util/automaton/State.java b/lucene/src/java/org/apache/lucene/util/automaton/State.java
index ab833db9b73..b19c7c9b27e 100644
--- a/lucene/src/java/org/apache/lucene/util/automaton/State.java
+++ b/lucene/src/java/org/apache/lucene/util/automaton/State.java
@@ -238,7 +238,7 @@ public class State implements Serializable, Comparable<State> {
   /**
    * Return this state's number.
    *
-   * Expert: Will be useless unless {@link Automaton#setStateNumbers(Set)}
+   * Expert: Will be useless unless {@link Automaton#getNumberedStates}
    * has been called first to number the states.
    * @return the number
    */
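Editorial note: the Payload.java and PositionIncrementAttribute.java hunks
above only show the read side of payloads. For completeness, a sketch of the
write side, assuming the pre-4.0 Payload class (edited above) and the
PayloadAttribute interface; ConstantPayloadFilter is a made-up name:

    import java.io.IOException;
    import org.apache.lucene.analysis.TokenFilter;
    import org.apache.lucene.analysis.TokenStream;
    import org.apache.lucene.analysis.tokenattributes.PayloadAttribute;
    import org.apache.lucene.index.Payload;

    public final class ConstantPayloadFilter extends TokenFilter {
      private final PayloadAttribute payAtt = addAttribute(PayloadAttribute.class);
      private final byte marker;

      public ConstantPayloadFilter(TokenStream in, byte marker) {
        super(in);
        this.marker = marker;
      }

      @Override
      public boolean incrementToken() throws IOException {
        if (!input.incrementToken()) {
          return false;
        }
        // Store a one-byte payload; DocsAndPositionsEnum#getPayload() reads it back.
        payAtt.setPayload(new Payload(new byte[] { marker }));
        return true;
      }
    }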