-
-
-
-  <modelVersion>4.0.0</modelVersion>
-  <parent>
-    <groupId>org.apache.lucene</groupId>
-    <artifactId>lucene-contrib</artifactId>
-    <version>@version@</version>
-  </parent>
-  <groupId>org.apache.lucene</groupId>
-  <artifactId>lucene-spellchecker</artifactId>
-  <name>Lucene Spellchecker</name>
-  <version>@version@</version>
-  <description>Spell Checker</description>
-  <packaging>jar</packaging>
-</project>
diff --git a/lucene/contrib/swing/pom.xml.template b/lucene/contrib/swing/pom.xml.template
deleted file mode 100644
index 2ebf3bc9d03..00000000000
--- a/lucene/contrib/swing/pom.xml.template
+++ /dev/null
@@ -1,36 +0,0 @@
-
-
-
-
-<project>
-  <modelVersion>4.0.0</modelVersion>
-  <parent>
-    <groupId>org.apache.lucene</groupId>
-    <artifactId>lucene-contrib</artifactId>
-    <version>@version@</version>
-  </parent>
-  <groupId>org.apache.lucene</groupId>
-  <artifactId>lucene-swing</artifactId>
-  <name>Lucene Swing</name>
-  <version>@version@</version>
-  <description>Swing Models</description>
-  <packaging>jar</packaging>
-</project>
diff --git a/lucene/contrib/swing/src/java/org/apache/lucene/swing/models/ListSearcher.java b/lucene/contrib/swing/src/java/org/apache/lucene/swing/models/ListSearcher.java
index 611b063baef..e8d2b7765c0 100644
--- a/lucene/contrib/swing/src/java/org/apache/lucene/swing/models/ListSearcher.java
+++ b/lucene/contrib/swing/src/java/org/apache/lucene/swing/models/ListSearcher.java
@@ -32,6 +32,7 @@ import org.apache.lucene.document.Fieldable;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
+import org.apache.lucene.index.IndexReader.AtomicReaderContext;
import org.apache.lucene.queryParser.MultiFieldQueryParser;
import org.apache.lucene.search.Collector;
import org.apache.lucene.search.IndexSearcher;
@@ -192,7 +193,7 @@ public class ListSearcher extends AbstractListModel {
}
@Override
- public void setNextReader(IndexReader reader, int docBase) {}
+ public void setNextReader(AtomicReaderContext context) {}
@Override
public boolean acceptsDocsOutOfOrder() {
return true;
diff --git a/lucene/contrib/wordnet/pom.xml.template b/lucene/contrib/wordnet/pom.xml.template
deleted file mode 100644
index e9518229364..00000000000
--- a/lucene/contrib/wordnet/pom.xml.template
+++ /dev/null
@@ -1,37 +0,0 @@
-
-
-
-
-<project>
-  <modelVersion>4.0.0</modelVersion>
-  <parent>
-    <groupId>org.apache.lucene</groupId>
-    <artifactId>lucene-contrib</artifactId>
-    <version>@version@</version>
-  </parent>
-  <groupId>org.apache.lucene</groupId>
-  <artifactId>lucene-wordnet</artifactId>
-  <name>Lucene Wordnet</name>
-  <version>@version@</version>
-  <description>WordNet</description>
-  <packaging>jar</packaging>
-</project>
diff --git a/lucene/contrib/wordnet/src/java/org/apache/lucene/wordnet/SynExpand.java b/lucene/contrib/wordnet/src/java/org/apache/lucene/wordnet/SynExpand.java
index 908cfd66eff..646abf73dbd 100755
--- a/lucene/contrib/wordnet/src/java/org/apache/lucene/wordnet/SynExpand.java
+++ b/lucene/contrib/wordnet/src/java/org/apache/lucene/wordnet/SynExpand.java
@@ -33,6 +33,7 @@ import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.document.Document;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.Term;
+import org.apache.lucene.index.IndexReader.AtomicReaderContext;
import org.apache.lucene.search.BooleanClause;
import org.apache.lucene.search.BooleanQuery;
import org.apache.lucene.search.Collector;
@@ -160,9 +161,9 @@ public final class SynExpand {
}
@Override
- public void setNextReader(IndexReader reader, int docBase)
+ public void setNextReader(AtomicReaderContext context)
throws IOException {
- this.reader = reader;
+ this.reader = context.reader;
}
@Override
diff --git a/lucene/contrib/wordnet/src/java/org/apache/lucene/wordnet/SynLookup.java b/lucene/contrib/wordnet/src/java/org/apache/lucene/wordnet/SynLookup.java
index 066df71ba02..4cc4836cc5b 100644
--- a/lucene/contrib/wordnet/src/java/org/apache/lucene/wordnet/SynLookup.java
+++ b/lucene/contrib/wordnet/src/java/org/apache/lucene/wordnet/SynLookup.java
@@ -32,6 +32,7 @@ import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.document.Document;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.Term;
+import org.apache.lucene.index.IndexReader.AtomicReaderContext;
import org.apache.lucene.search.BooleanClause;
import org.apache.lucene.search.BooleanQuery;
import org.apache.lucene.search.Collector;
@@ -59,7 +60,7 @@ public class SynLookup {
}
@Override
- public void setNextReader(IndexReader reader, int docBase) {}
+ public void setNextReader(AtomicReaderContext context) {}
@Override
public boolean acceptsDocsOutOfOrder() {
return true;
@@ -169,9 +170,9 @@ public class SynLookup {
}
@Override
- public void setNextReader(IndexReader reader, int docBase)
+ public void setNextReader(AtomicReaderContext context)
throws IOException {
- this.reader = reader;
+ this.reader = context.reader;
}
@Override
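
The Collector changes above all follow one pattern: setNextReader now receives an IndexReader.AtomicReaderContext instead of a (reader, docBase) pair. For reference, a minimal stand-alone Collector against that per-segment API might look like the following sketch; class and field names are illustrative, not taken from this patch, and context.docBase is assumed to expose the segment's doc offset the old parameter used to carry.

import java.io.IOException;

import org.apache.lucene.index.IndexReader.AtomicReaderContext;
import org.apache.lucene.search.Collector;
import org.apache.lucene.search.Scorer;

// Counts hits per segment; the doc base now comes from the context object
// rather than being passed as a separate argument.
final class HitCountCollector extends Collector {
  private int docBase;
  private int totalHits;

  @Override
  public void setScorer(Scorer scorer) {
    // scores are not needed for counting
  }

  @Override
  public void setNextReader(AtomicReaderContext context) throws IOException {
    // context.reader is the per-segment reader; docBase (assumed field) is its offset
    docBase = context.docBase;
  }

  @Override
  public void collect(int doc) {
    // doc is segment-relative; docBase + doc would be the top-level docID
    totalHits++;
  }

  @Override
  public boolean acceptsDocsOutOfOrder() {
    return true;
  }

  public int getTotalHits() {
    return totalHits;
  }
}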
diff --git a/lucene/contrib/wordnet/src/java/org/apache/lucene/wordnet/SynonymMap.java b/lucene/contrib/wordnet/src/java/org/apache/lucene/wordnet/SynonymMap.java
index 455c8118c5a..099d653bef1 100644
--- a/lucene/contrib/wordnet/src/java/org/apache/lucene/wordnet/SynonymMap.java
+++ b/lucene/contrib/wordnet/src/java/org/apache/lucene/wordnet/SynonymMap.java
@@ -52,15 +52,17 @@ import java.util.TreeSet;
* high-frequency lookups of medium size synonym tables.
*
* Example Usage:
- *
+ *
* String[] words = new String[] { "hard", "woods", "forest", "wolfish", "xxxx"};
* SynonymMap map = new SynonymMap(new FileInputStream("samples/fulltext/wn_s.pl"));
* for (int i = 0; i < words.length; i++) {
* String[] synonyms = map.getSynonyms(words[i]);
* System.out.println(words[i] + ":" + java.util.Arrays.asList(synonyms).toString());
* }
- *
+ *
+ *
* Example output:
+ *
* hard:[arduous, backbreaking, difficult, fermented, firmly, grueling, gruelling, heavily, heavy, intemperately, knockout, laborious, punishing, severe, severely, strong, toilsome, tough]
* woods:[forest, wood]
* forest:[afforest, timber, timberland, wood, woodland, woods]
@@ -161,7 +163,7 @@ public class SynonymMap {
return word.toLowerCase();
}
- private static boolean isValid(String str) {
+ protected boolean isValid(String str) {
for (int i=str.length(); --i >= 0; ) {
if (!Character.isLetter(str.charAt(i))) return false;
}
@@ -395,4 +397,4 @@ public class SynonymMap {
}
}
-}
\ No newline at end of file
+}
diff --git a/lucene/contrib/xml-query-parser/pom.xml.template b/lucene/contrib/xml-query-parser/pom.xml.template
deleted file mode 100644
index 4c1e7f8a5a5..00000000000
--- a/lucene/contrib/xml-query-parser/pom.xml.template
+++ /dev/null
@@ -1,43 +0,0 @@
-
-
-
-
-<project>
-  <modelVersion>4.0.0</modelVersion>
-  <parent>
-    <groupId>org.apache.lucene</groupId>
-    <artifactId>lucene-contrib</artifactId>
-    <version>@version@</version>
-  </parent>
-  <groupId>org.apache.lucene</groupId>
-  <artifactId>lucene-xml-query-parser</artifactId>
-  <name>Lucene XML Query Parser</name>
-  <version>@version@</version>
-  <description>XML query parser</description>
-  <packaging>jar</packaging>
-  <dependencies>
-    <dependency>
-      <groupId>org.apache.lucene</groupId>
-      <artifactId>lucene-queries</artifactId>
-      <version>@version@</version>
-    </dependency>
-  </dependencies>
-</project>
diff --git a/lucene/contrib/xml-query-parser/src/java/org/apache/lucene/xmlparser/builders/NumericRangeFilterBuilder.java b/lucene/contrib/xml-query-parser/src/java/org/apache/lucene/xmlparser/builders/NumericRangeFilterBuilder.java
index c834f8e4d98..ea5f5741c34 100644
--- a/lucene/contrib/xml-query-parser/src/java/org/apache/lucene/xmlparser/builders/NumericRangeFilterBuilder.java
+++ b/lucene/contrib/xml-query-parser/src/java/org/apache/lucene/xmlparser/builders/NumericRangeFilterBuilder.java
@@ -19,7 +19,7 @@ package org.apache.lucene.xmlparser.builders;
import java.io.IOException;
-import org.apache.lucene.index.IndexReader;
+import org.apache.lucene.index.IndexReader.AtomicReaderContext;
import org.apache.lucene.search.DocIdSet;
import org.apache.lucene.search.Filter;
import org.apache.lucene.search.NumericRangeFilter;
@@ -157,7 +157,7 @@ public class NumericRangeFilterBuilder implements FilterBuilder {
private static final long serialVersionUID = 1L;
@Override
- public DocIdSet getDocIdSet(IndexReader reader) throws IOException {
+ public DocIdSet getDocIdSet(AtomicReaderContext context) throws IOException {
return null;
}
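
The same per-segment migration applies to Filter.getDocIdSet, as the hunk above shows. A hedged sketch of a filter written against the new signature follows; using OpenBitSet as the DocIdSet implementation is an assumption of this sketch, not something the patch prescribes.

import java.io.IOException;

import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexReader.AtomicReaderContext;
import org.apache.lucene.search.DocIdSet;
import org.apache.lucene.search.Filter;
import org.apache.lucene.util.OpenBitSet;

// Illustrative filter that accepts every document of the segment it is
// handed; a real filter would set bits selectively.
class AcceptAllFilter extends Filter {
  @Override
  public DocIdSet getDocIdSet(AtomicReaderContext context) throws IOException {
    final IndexReader reader = context.reader;   // per-segment reader
    final OpenBitSet bits = new OpenBitSet(reader.maxDoc());
    bits.set(0, reader.maxDoc());                // mark all docIDs of this segment
    return bits;                                 // OpenBitSet extends DocIdSet
  }
}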
diff --git a/lucene/contrib/xml-query-parser/src/test/org/apache/lucene/xmlparser/TestParser.java b/lucene/contrib/xml-query-parser/src/test/org/apache/lucene/xmlparser/TestParser.java
index ad266b52f19..6122b8aab6a 100644
--- a/lucene/contrib/xml-query-parser/src/test/org/apache/lucene/xmlparser/TestParser.java
+++ b/lucene/contrib/xml-query-parser/src/test/org/apache/lucene/xmlparser/TestParser.java
@@ -73,7 +73,7 @@ public class TestParser extends LuceneTestCase {
d.close();
writer.close();
reader=IndexReader.open(dir, true);
- searcher=new IndexSearcher(reader);
+ searcher=newSearcher(reader);
}
@@ -215,7 +215,10 @@ public class TestParser extends LuceneTestCase {
}
private void dumpResults(String qType,Query q, int numDocs) throws IOException
{
- TopDocs hits = searcher.search(q, null, numDocs);
+ if (VERBOSE) {
+ System.out.println("TEST: query=" + q);
+ }
+ TopDocs hits = searcher.search(q, null, numDocs);
assertTrue(qType +" should produce results ", hits.totalHits>0);
if(VERBOSE)
{
diff --git a/lucene/contrib/xml-query-parser/src/test/org/apache/lucene/xmlparser/builders/TestNumericRangeFilterBuilder.java b/lucene/contrib/xml-query-parser/src/test/org/apache/lucene/xmlparser/builders/TestNumericRangeFilterBuilder.java
index dca574dd0bf..028cc752b85 100644
--- a/lucene/contrib/xml-query-parser/src/test/org/apache/lucene/xmlparser/builders/TestNumericRangeFilterBuilder.java
+++ b/lucene/contrib/xml-query-parser/src/test/org/apache/lucene/xmlparser/builders/TestNumericRangeFilterBuilder.java
@@ -28,7 +28,9 @@ import javax.xml.parsers.ParserConfigurationException;
import org.apache.lucene.util.LuceneTestCase;
import org.apache.lucene.index.IndexReader;
+import org.apache.lucene.index.IndexReader.AtomicReaderContext;
import org.apache.lucene.index.IndexWriter;
+import org.apache.lucene.index.SlowMultiReaderWrapper;
import org.apache.lucene.search.Filter;
import org.apache.lucene.search.NumericRangeFilter;
import org.apache.lucene.store.Directory;
@@ -64,10 +66,10 @@ public class TestNumericRangeFilterBuilder extends LuceneTestCase {
writer.commit();
try
{
- IndexReader reader = IndexReader.open(ramDir, true);
+ IndexReader reader = new SlowMultiReaderWrapper(IndexReader.open(ramDir, true));
try
{
- assertNull(filter.getDocIdSet(reader));
+ assertNull(filter.getDocIdSet((AtomicReaderContext) reader.getTopReaderContext()));
}
finally
{
diff --git a/lucene/lucene-contrib-pom.xml.template b/lucene/lucene-contrib-pom.xml.template
deleted file mode 100644
index 9c33fb836c1..00000000000
--- a/lucene/lucene-contrib-pom.xml.template
+++ /dev/null
@@ -1,50 +0,0 @@
-
-
-
-
-<project>
-  <modelVersion>4.0.0</modelVersion>
-  <parent>
-    <groupId>org.apache.lucene</groupId>
-    <artifactId>lucene-parent</artifactId>
-    <version>@version@</version>
-  </parent>
-  <artifactId>lucene-contrib</artifactId>
-  <name>Lucene Java Contrib POM</name>
-  <version>@version@</version>
-  <packaging>pom</packaging>
-  <dependencies>
-    <dependency>
-      <groupId>org.apache.lucene</groupId>
-      <artifactId>lucene-core</artifactId>
-      <version>@version@</version>
-    </dependency>
-  </dependencies>
-  <properties>
-    1.0.4
-    1.7
-    3.1
-    1.7.0
-    1.4
-    3.3.93
-    4.0
-  </properties>
-</project>
diff --git a/lucene/lucene-core-pom.xml.template b/lucene/lucene-core-pom.xml.template
deleted file mode 100644
index da3a14e32a5..00000000000
--- a/lucene/lucene-core-pom.xml.template
+++ /dev/null
@@ -1,36 +0,0 @@
-
-
-
-
-<project>
-  <parent>
-    <groupId>org.apache.lucene</groupId>
-    <artifactId>lucene-parent</artifactId>
-    <version>@version@</version>
-  </parent>
-  <modelVersion>4.0.0</modelVersion>
-  <groupId>org.apache.lucene</groupId>
-  <artifactId>lucene-core</artifactId>
-  <name>Lucene Core</name>
-  <version>@version@</version>
-  <description>Apache Lucene Java Core</description>
-  <packaging>jar</packaging>
-</project>
diff --git a/lucene/lucene-parent-pom.xml.template b/lucene/lucene-parent-pom.xml.template
deleted file mode 100644
index 3f01c87cd3c..00000000000
--- a/lucene/lucene-parent-pom.xml.template
+++ /dev/null
@@ -1,89 +0,0 @@
-
-
-
-
-<project>
-  <parent>
-    <groupId>org.apache</groupId>
-    <artifactId>apache</artifactId>
-    <version>4</version>
-  </parent>
-  <modelVersion>4.0.0</modelVersion>
-  <groupId>org.apache.lucene</groupId>
-  <artifactId>lucene-parent</artifactId>
-  <name>Lucene Java POM</name>
-  <version>@version@</version>
-  <description>Apache Lucene Java POM</description>
-  <url>http://lucene.apache.org/java</url>
-  <packaging>pom</packaging>
-  <issueManagement>
-    <system>JIRA</system>
-    <url>http://issues.apache.org/jira/browse/LUCENE</url>
-  </issueManagement>
-  <ciManagement>
-    <system>Hudson</system>
-    <url>http://lucene.zones.apache.org:8080/hudson/job/Lucene-Nightly/</url>
-  </ciManagement>
-  <mailingLists>
-    <mailingList>
-      <name>Java User List</name>
-      <subscribe>java-user-subscribe@lucene.apache.org</subscribe>
-      <unsubscribe>java-user-unsubscribe@lucene.apache.org</unsubscribe>
-      <archive>http://mail-archives.apache.org/mod_mbox/java-user/</archive>
-    </mailingList>
-    <mailingList>
-      <name>Java Developer List</name>
-      <subscribe>java-dev-subscribe@lucene.apache.org</subscribe>
-      <unsubscribe>java-dev-unsubscribe@lucene.apache.org</unsubscribe>
-      <archive>http://mail-archives.apache.org/mod_mbox/java-dev/</archive>
-    </mailingList>
-    <mailingList>
-      <name>Java Commits List</name>
-      <subscribe>java-commits-subscribe@lucene.apache.org</subscribe>
-      <unsubscribe>java-commits-unsubscribe@lucene.apache.org</unsubscribe>
-      <archive>http://mail-archives.apache.org/mod_mbox/java-commits/</archive>
-    </mailingList>
-  </mailingLists>
-  <inceptionYear>2000</inceptionYear>
-  <licenses>
-    <license>
-      <name>Apache 2</name>
-      <url>http://www.apache.org/licenses/LICENSE-2.0.txt</url>
-    </license>
-  </licenses>
-  <scm>
-    <connection>scm:svn:http://svn.apache.org/repos/asf/lucene/dev</connection>
-    <developerConnection>scm:svn:https://svn.apache.org/repos/asf/lucene/dev</developerConnection>
-  </scm>
-</project>
diff --git a/lucene/src/java/org/apache/lucene/analysis/NumericTokenStream.java b/lucene/src/java/org/apache/lucene/analysis/NumericTokenStream.java
index bed4c06c1a6..b98a24646ca 100644
--- a/lucene/src/java/org/apache/lucene/analysis/NumericTokenStream.java
+++ b/lucene/src/java/org/apache/lucene/analysis/NumericTokenStream.java
@@ -19,6 +19,7 @@ package org.apache.lucene.analysis;
import org.apache.lucene.util.Attribute;
import org.apache.lucene.util.AttributeImpl;
+import org.apache.lucene.util.AttributeReflector;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.NumericUtils;
import org.apache.lucene.document.NumericField; // for javadocs
@@ -83,8 +84,6 @@ import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
* href="../search/NumericRangeQuery.html#precisionStepDesc">precisionStep
* parameter as well as how numeric fields work under the hood.
*
- * @lucene.experimental
- *
* @since 2.9
*/
public final class NumericTokenStream extends TokenStream {
@@ -95,22 +94,34 @@ public final class NumericTokenStream extends TokenStream {
/** The lower precision tokens gets this token type assigned. */
public static final String TOKEN_TYPE_LOWER_PREC = "lowerPrecNumeric";
- /** Expert: Use this attribute to get the details of the currently generated token
+ /** Expert: Use this attribute to get the details of the currently generated token.
* @lucene.experimental
* @since 4.0
*/
public interface NumericTermAttribute extends Attribute {
/** Returns current shift value, undefined before first token */
int getShift();
- /** Returns {@link NumericTokenStream}'s raw value as {@code long} */
+ /** Returns current token's raw value as {@code long} with all {@link #getShift} applied, undefined before first token */
long getRawValue();
/** Returns value size in bits (32 for {@code float}, {@code int}; 64 for {@code double}, {@code long}) */
int getValueSize();
+
+ /** Don't call this method!
+ * @lucene.internal */
+ void init(long value, int valSize, int precisionStep, int shift);
+
+ /** Don't call this method!
+ * @lucene.internal */
+ void setShift(int shift);
+
+ /** Don't call this method!
+ * @lucene.internal */
+ int incShift();
}
+ // just a wrapper to prevent adding CTA
private static final class NumericAttributeFactory extends AttributeFactory {
private final AttributeFactory delegate;
- private NumericTokenStream ts = null;
NumericAttributeFactory(AttributeFactory delegate) {
this.delegate = delegate;
@@ -118,72 +129,79 @@ public final class NumericTokenStream extends TokenStream {
@Override
public AttributeImpl createAttributeInstance(Class<? extends Attribute> attClass) {
- if (attClass == NumericTermAttribute.class)
- return new NumericTermAttributeImpl(ts);
if (CharTermAttribute.class.isAssignableFrom(attClass))
throw new IllegalArgumentException("NumericTokenStream does not support CharTermAttribute.");
return delegate.createAttributeInstance(attClass);
}
}
- private static final class NumericTermAttributeImpl extends AttributeImpl implements NumericTermAttribute,TermToBytesRefAttribute {
- private final NumericTokenStream ts;
+ /** Implementation of {@link NumericTermAttribute}.
+ * @lucene.internal
+ * @since 4.0
+ */
+ public static final class NumericTermAttributeImpl extends AttributeImpl implements NumericTermAttribute,TermToBytesRefAttribute {
+ private long value = 0L;
+ private int valueSize = 0, shift = 0, precisionStep = 0;
- public NumericTermAttributeImpl(NumericTokenStream ts) {
- this.ts = ts;
- }
-
public int toBytesRef(BytesRef bytes) {
try {
- assert ts.valSize == 64 || ts.valSize == 32;
- return (ts.valSize == 64) ?
- NumericUtils.longToPrefixCoded(ts.value, ts.shift, bytes) :
- NumericUtils.intToPrefixCoded((int) ts.value, ts.shift, bytes);
+ assert valueSize == 64 || valueSize == 32;
+ return (valueSize == 64) ?
+ NumericUtils.longToPrefixCoded(value, shift, bytes) :
+ NumericUtils.intToPrefixCoded((int) value, shift, bytes);
} catch (IllegalArgumentException iae) {
- // return empty token before first
+ // return empty token before first or after last
bytes.length = 0;
return 0;
}
}
- public int getShift() { return ts.shift; }
- public long getRawValue() { return ts.value; }
- public int getValueSize() { return ts.valSize; }
+ public int getShift() { return shift; }
+ public void setShift(int shift) { this.shift = shift; }
+ public int incShift() {
+ return (shift += precisionStep);
+ }
+
+ public long getRawValue() { return value & ~((1L << shift) - 1L); }
+ public int getValueSize() { return valueSize; }
+
+ public void init(long value, int valueSize, int precisionStep, int shift) {
+ this.value = value;
+ this.valueSize = valueSize;
+ this.precisionStep = precisionStep;
+ this.shift = shift;
+ }
@Override
public void clear() {
- // this attribute has no contents to clear
- }
-
- @Override
- public boolean equals(Object other) {
- return other == this;
- }
-
- @Override
- public int hashCode() {
- return System.identityHashCode(this);
+ // this attribute has no contents to clear!
+ // we keep it untouched as it's fully controlled by outer class.
}
+ @Override
+ public void reflectWith(AttributeReflector reflector) {
+ final BytesRef bytes = new BytesRef();
+ toBytesRef(bytes);
+ reflector.reflect(TermToBytesRefAttribute.class, "bytes", bytes);
+ reflector.reflect(NumericTermAttribute.class, "shift", shift);
+ reflector.reflect(NumericTermAttribute.class, "rawValue", getRawValue());
+ reflector.reflect(NumericTermAttribute.class, "valueSize", valueSize);
+ }
+
@Override
public void copyTo(AttributeImpl target) {
- // this attribute has no contents to copy
- }
-
- @Override
- public Object clone() {
- // cannot throw CloneNotSupportedException (checked)
- throw new UnsupportedOperationException();
+ final NumericTermAttribute a = (NumericTermAttribute) target;
+ a.init(value, valueSize, precisionStep, shift);
}
}
-
+
/**
* Creates a token stream for numeric values using the default precisionStep
* {@link NumericUtils#PRECISION_STEP_DEFAULT} (4). The stream is not yet initialized,
* before using set a value using the various set???Value() methods.
*/
public NumericTokenStream() {
- this(NumericUtils.PRECISION_STEP_DEFAULT);
+ this(AttributeFactory.DEFAULT_ATTRIBUTE_FACTORY, NumericUtils.PRECISION_STEP_DEFAULT);
}
/**
@@ -192,15 +210,7 @@ public final class NumericTokenStream extends TokenStream {
* before using set a value using the various set???Value() methods.
*/
public NumericTokenStream(final int precisionStep) {
- super(new NumericAttributeFactory(AttributeFactory.DEFAULT_ATTRIBUTE_FACTORY));
- // we must do this after the super call :(
- ((NumericAttributeFactory) getAttributeFactory()).ts = this;
- addAttribute(NumericTermAttribute.class);
-
- this.precisionStep = precisionStep;
- if (precisionStep < 1)
- throw new IllegalArgumentException("precisionStep must be >=1");
- shift = -precisionStep;
+ this(AttributeFactory.DEFAULT_ATTRIBUTE_FACTORY, precisionStep);
}
/**
@@ -212,14 +222,10 @@ public final class NumericTokenStream extends TokenStream {
*/
public NumericTokenStream(AttributeFactory factory, final int precisionStep) {
super(new NumericAttributeFactory(factory));
- // we must do this after the super call :(
- ((NumericAttributeFactory) getAttributeFactory()).ts = this;
- addAttribute(NumericTermAttribute.class);
-
- this.precisionStep = precisionStep;
if (precisionStep < 1)
throw new IllegalArgumentException("precisionStep must be >=1");
- shift = -precisionStep;
+ this.precisionStep = precisionStep;
+ numericAtt.setShift(-precisionStep);
}
/**
@@ -229,9 +235,7 @@ public final class NumericTokenStream extends TokenStream {
* new Field(name, new NumericTokenStream(precisionStep).setLongValue(value))
*/
public NumericTokenStream setLongValue(final long value) {
- this.value = value;
- valSize = 64;
- shift = -precisionStep;
+ numericAtt.init(value, valSize = 64, precisionStep, -precisionStep);
return this;
}
@@ -242,9 +246,7 @@ public final class NumericTokenStream extends TokenStream {
* new Field(name, new NumericTokenStream(precisionStep).setIntValue(value))
*/
public NumericTokenStream setIntValue(final int value) {
- this.value = value;
- valSize = 32;
- shift = -precisionStep;
+ numericAtt.init(value, valSize = 32, precisionStep, -precisionStep);
return this;
}
@@ -255,9 +257,7 @@ public final class NumericTokenStream extends TokenStream {
* new Field(name, new NumericTokenStream(precisionStep).setDoubleValue(value))
*/
public NumericTokenStream setDoubleValue(final double value) {
- this.value = NumericUtils.doubleToSortableLong(value);
- valSize = 64;
- shift = -precisionStep;
+ numericAtt.init(NumericUtils.doubleToSortableLong(value), valSize = 64, precisionStep, -precisionStep);
return this;
}
@@ -268,9 +268,7 @@ public final class NumericTokenStream extends TokenStream {
* new Field(name, new NumericTokenStream(precisionStep).setFloatValue(value))
*/
public NumericTokenStream setFloatValue(final float value) {
- this.value = NumericUtils.floatToSortableInt(value);
- valSize = 32;
- shift = -precisionStep;
+ numericAtt.init(NumericUtils.floatToSortableInt(value), valSize = 32, precisionStep, -precisionStep);
return this;
}
@@ -278,40 +276,28 @@ public final class NumericTokenStream extends TokenStream {
public void reset() {
if (valSize == 0)
throw new IllegalStateException("call set???Value() before usage");
- shift = -precisionStep;
+ numericAtt.setShift(-precisionStep);
}
@Override
public boolean incrementToken() {
if (valSize == 0)
throw new IllegalStateException("call set???Value() before usage");
- shift += precisionStep;
- if (shift >= valSize) {
- // reset so the attribute still works after exhausted stream
- shift -= precisionStep;
- return false;
- }
-
+
+ // this will only clear all other attributes in this TokenStream
clearAttributes();
- // the TermToBytesRefAttribute is directly accessing shift & value.
+
+ final int shift = numericAtt.incShift();
typeAtt.setType((shift == 0) ? TOKEN_TYPE_FULL_PREC : TOKEN_TYPE_LOWER_PREC);
posIncrAtt.setPositionIncrement((shift == 0) ? 1 : 0);
- return true;
- }
-
- @Override
- public String toString() {
- final StringBuilder sb = new StringBuilder("(numeric,valSize=").append(valSize);
- sb.append(",precisionStep=").append(precisionStep).append(')');
- return sb.toString();
+ return (shift < valSize);
}
// members
+ private final NumericTermAttribute numericAtt = addAttribute(NumericTermAttribute.class);
private final TypeAttribute typeAtt = addAttribute(TypeAttribute.class);
private final PositionIncrementAttribute posIncrAtt = addAttribute(PositionIncrementAttribute.class);
- int shift, valSize = 0; // valSize==0 means not initialized
+ private int valSize = 0; // valSize==0 means not initialized
private final int precisionStep;
-
- long value = 0L;
}
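
Since NumericTermAttribute now carries shift, raw value and value size itself instead of peeking into the owning stream, a consumer can inspect the prefix-coded tokens directly. A small sketch, assuming the attribute is obtained through the usual addAttribute call:

import org.apache.lucene.analysis.NumericTokenStream;
import org.apache.lucene.analysis.NumericTokenStream.NumericTermAttribute;
import org.apache.lucene.analysis.tokenattributes.TypeAttribute;

public class NumericTokenDump {
  public static void main(String[] args) throws Exception {
    NumericTokenStream stream =
        new NumericTokenStream(4).setLongValue(1234567890L);
    NumericTermAttribute numericAtt = stream.addAttribute(NumericTermAttribute.class);
    TypeAttribute typeAtt = stream.addAttribute(TypeAttribute.class);
    stream.reset();
    while (stream.incrementToken()) {
      // shift grows by precisionStep per token; getRawValue() masks away
      // the bits below the current shift, as in the implementation above.
      System.out.println("shift=" + numericAtt.getShift()
          + " rawValue=" + numericAtt.getRawValue()
          + " type=" + typeAtt.type());
    }
  }
}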
diff --git a/lucene/src/java/org/apache/lucene/analysis/Token.java b/lucene/src/java/org/apache/lucene/analysis/Token.java
index a50b934377c..80c31ec4189 100644
--- a/lucene/src/java/org/apache/lucene/analysis/Token.java
+++ b/lucene/src/java/org/apache/lucene/analysis/Token.java
@@ -28,6 +28,7 @@ import org.apache.lucene.index.DocsAndPositionsEnum; // for javadoc
import org.apache.lucene.util.Attribute;
import org.apache.lucene.util.AttributeSource;
import org.apache.lucene.util.AttributeImpl;
+import org.apache.lucene.util.AttributeReflector;
/**
A Token is an occurrence of a term from the text of a field. It consists of
@@ -588,6 +589,17 @@ public class Token extends CharTermAttributeImpl
}
}
+ @Override
+ public void reflectWith(AttributeReflector reflector) {
+ super.reflectWith(reflector);
+ reflector.reflect(OffsetAttribute.class, "startOffset", startOffset);
+ reflector.reflect(OffsetAttribute.class, "endOffset", endOffset);
+ reflector.reflect(PositionIncrementAttribute.class, "positionIncrement", positionIncrement);
+ reflector.reflect(PayloadAttribute.class, "payload", payload);
+ reflector.reflect(FlagsAttribute.class, "flags", flags);
+ reflector.reflect(TypeAttribute.class, "type", type);
+ }
+
/** Convenience factory that returns <code>Token</code> as implementation for the basic
* attributes and return the default impl (with "Impl" appended) for all other
* attributes.
diff --git a/lucene/src/java/org/apache/lucene/analysis/package.html b/lucene/src/java/org/apache/lucene/analysis/package.html
index d98f84f5d66..28569e483ba 100644
--- a/lucene/src/java/org/apache/lucene/analysis/package.html
+++ b/lucene/src/java/org/apache/lucene/analysis/package.html
@@ -305,7 +305,7 @@ with the TokenStream.
Attribute instances are reused for all tokens of a document. Thus, a TokenStream/-Filter needs to update
the appropriate Attribute(s) in incrementToken(). The consumer, commonly the Lucene indexer, consumes the data in the
-Attributes and then calls incrementToken() again until it retuns false, which indicates that the end of the stream
+Attributes and then calls incrementToken() again until it returns false, which indicates that the end of the stream
was reached. This means that in each call of incrementToken() a TokenStream/-Filter can safely overwrite the data in
the Attribute instances.
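
The consumer workflow described in this paragraph, with the same attribute instances being refilled on every call until incrementToken() returns false, looks roughly like this in client code (a sketch only; WhitespaceAnalyzer and Version.LUCENE_40 are convenient stand-ins, not mandated by the patch):

import java.io.IOException;
import java.io.StringReader;

import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.WhitespaceAnalyzer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.util.Version;

public class ConsumeTokens {
  public static void main(String[] args) throws IOException {
    TokenStream stream = new WhitespaceAnalyzer(Version.LUCENE_40)
        .tokenStream("body", new StringReader("some text to tokenize"));
    // Attribute instances are added once and reused for every token.
    CharTermAttribute termAtt = stream.addAttribute(CharTermAttribute.class);
    stream.reset();
    while (stream.incrementToken()) {
      // incrementToken() has overwritten termAtt with the next token's data.
      System.out.println(termAtt.toString());
    }
    stream.end();
    stream.close();
  }
}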
diff --git a/lucene/src/java/org/apache/lucene/analysis/tokenattributes/CharTermAttributeImpl.java b/lucene/src/java/org/apache/lucene/analysis/tokenattributes/CharTermAttributeImpl.java
index 4268abc0db6..d45d280f73c 100644
--- a/lucene/src/java/org/apache/lucene/analysis/tokenattributes/CharTermAttributeImpl.java
+++ b/lucene/src/java/org/apache/lucene/analysis/tokenattributes/CharTermAttributeImpl.java
@@ -23,6 +23,7 @@ import java.nio.CharBuffer;
import org.apache.lucene.util.ArrayUtil;
import org.apache.lucene.util.AttributeImpl;
import org.apache.lucene.util.BytesRef;
+import org.apache.lucene.util.AttributeReflector;
import org.apache.lucene.util.RamUsageEstimator;
import org.apache.lucene.util.UnicodeUtil;
@@ -243,6 +244,14 @@ public class CharTermAttributeImpl extends AttributeImpl implements CharTermAttr
return new String(termBuffer, 0, termLength);
}
+ @Override
+ public void reflectWith(AttributeReflector reflector) {
+ reflector.reflect(CharTermAttribute.class, "term", toString());
+ final BytesRef bytes = new BytesRef();
+ toBytesRef(bytes);
+ reflector.reflect(TermToBytesRefAttribute.class, "bytes", bytes);
+ }
+
@Override
public void copyTo(AttributeImpl target) {
CharTermAttribute t = (CharTermAttribute) target;
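
The new reflectWith implementations above push (attribute interface, key, value) triples into an AttributeReflector callback. A sketch of driving that from user code, under the assumption that AttributeReflector is the single-method callback these calls imply:

import org.apache.lucene.analysis.tokenattributes.CharTermAttributeImpl;
import org.apache.lucene.util.Attribute;
import org.apache.lucene.util.AttributeReflector;

public class DumpAttributeContents {
  public static void main(String[] args) {
    CharTermAttributeImpl termAtt = new CharTermAttributeImpl();
    termAtt.append("example");
    // Each reflect() call reports one key/value pair of the attribute,
    // e.g. the term text and its byte-encoded form.
    termAtt.reflectWith(new AttributeReflector() {
      public void reflect(Class<? extends Attribute> attClass, String key, Object value) {
        System.out.println(attClass.getSimpleName() + "#" + key + " = " + value);
      }
    });
  }
}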
diff --git a/lucene/src/java/org/apache/lucene/document/AbstractField.java b/lucene/src/java/org/apache/lucene/document/AbstractField.java
index 54ea023ba28..0fb6f8795f2 100755
--- a/lucene/src/java/org/apache/lucene/document/AbstractField.java
+++ b/lucene/src/java/org/apache/lucene/document/AbstractField.java
@@ -81,7 +81,7 @@ public abstract class AbstractField implements Fieldable {
* used to compute the norm factor for the field. By
* default, in the {@link
* org.apache.lucene.search.Similarity#computeNorm(String,
- * FieldInvertState)} method, the boost value is multipled
+ * FieldInvertState)} method, the boost value is multiplied
* by the {@link
* org.apache.lucene.search.Similarity#lengthNorm(String,
* int)} and then
@@ -103,7 +103,7 @@ public abstract class AbstractField implements Fieldable {
*
* Note: this value is not stored directly with the document in the index.
* Documents returned from {@link org.apache.lucene.index.IndexReader#document(int)} and
- * {@link org.apache.lucene.search.Searcher#doc(int)} may thus not have the same value present as when
+ * {@link org.apache.lucene.search.IndexSearcher#doc(int)} may thus not have the same value present as when
* this field was indexed.
*
* @see #setBoost(float)
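
A brief sketch of the boost behaviour this javadoc describes (illustrative only; the Field constructor and flags used here are standard 3.x/4.0-era API, not part of this change):

import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;

public class BoostExample {
  public static void main(String[] args) {
    Document doc = new Document();
    Field title = new Field("title", "Lucene in Action",
        Field.Store.YES, Field.Index.ANALYZED);
    // The boost set here is folded into the field's norm at index time
    // (multiplied inside Similarity.computeNorm) rather than stored as-is,
    // which is why documents read back do not report the original value.
    title.setBoost(2.0f);
    doc.add(title);
    System.out.println("boost=" + title.getBoost());
  }
}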
diff --git a/lucene/src/java/org/apache/lucene/document/DateTools.java b/lucene/src/java/org/apache/lucene/document/DateTools.java
index 68cb2dfdf25..0e5199c6247 100644
--- a/lucene/src/java/org/apache/lucene/document/DateTools.java
+++ b/lucene/src/java/org/apache/lucene/document/DateTools.java
@@ -47,28 +47,37 @@ import org.apache.lucene.util.NumericUtils; // for javadocs
*/
public class DateTools {
- private final static TimeZone GMT = TimeZone.getTimeZone("GMT");
+ private static final class DateFormats {
+ final static TimeZone GMT = TimeZone.getTimeZone("GMT");
- private static final SimpleDateFormat YEAR_FORMAT = new SimpleDateFormat("yyyy", Locale.US);
- private static final SimpleDateFormat MONTH_FORMAT = new SimpleDateFormat("yyyyMM", Locale.US);
- private static final SimpleDateFormat DAY_FORMAT = new SimpleDateFormat("yyyyMMdd", Locale.US);
- private static final SimpleDateFormat HOUR_FORMAT = new SimpleDateFormat("yyyyMMddHH", Locale.US);
- private static final SimpleDateFormat MINUTE_FORMAT = new SimpleDateFormat("yyyyMMddHHmm", Locale.US);
- private static final SimpleDateFormat SECOND_FORMAT = new SimpleDateFormat("yyyyMMddHHmmss", Locale.US);
- private static final SimpleDateFormat MILLISECOND_FORMAT = new SimpleDateFormat("yyyyMMddHHmmssSSS", Locale.US);
- static {
- // times need to be normalized so the value doesn't depend on the
- // location the index is created/used:
- YEAR_FORMAT.setTimeZone(GMT);
- MONTH_FORMAT.setTimeZone(GMT);
- DAY_FORMAT.setTimeZone(GMT);
- HOUR_FORMAT.setTimeZone(GMT);
- MINUTE_FORMAT.setTimeZone(GMT);
- SECOND_FORMAT.setTimeZone(GMT);
- MILLISECOND_FORMAT.setTimeZone(GMT);
+ final SimpleDateFormat YEAR_FORMAT = new SimpleDateFormat("yyyy", Locale.US);
+ final SimpleDateFormat MONTH_FORMAT = new SimpleDateFormat("yyyyMM", Locale.US);
+ final SimpleDateFormat DAY_FORMAT = new SimpleDateFormat("yyyyMMdd", Locale.US);
+ final SimpleDateFormat HOUR_FORMAT = new SimpleDateFormat("yyyyMMddHH", Locale.US);
+ final SimpleDateFormat MINUTE_FORMAT = new SimpleDateFormat("yyyyMMddHHmm", Locale.US);
+ final SimpleDateFormat SECOND_FORMAT = new SimpleDateFormat("yyyyMMddHHmmss", Locale.US);
+ final SimpleDateFormat MILLISECOND_FORMAT = new SimpleDateFormat("yyyyMMddHHmmssSSS", Locale.US);
+ {
+ // times need to be normalized so the value doesn't depend on the
+ // location the index is created/used:
+ YEAR_FORMAT.setTimeZone(GMT);
+ MONTH_FORMAT.setTimeZone(GMT);
+ DAY_FORMAT.setTimeZone(GMT);
+ HOUR_FORMAT.setTimeZone(GMT);
+ MINUTE_FORMAT.setTimeZone(GMT);
+ SECOND_FORMAT.setTimeZone(GMT);
+ MILLISECOND_FORMAT.setTimeZone(GMT);
+ }
+
+ final Calendar calInstance = Calendar.getInstance(GMT, Locale.US);
}
-
- private static final Calendar calInstance = Calendar.getInstance(GMT);
+
+ private static final ThreadLocal<DateFormats> FORMATS = new ThreadLocal<DateFormats>() {
+ @Override
+ protected DateFormats initialValue() {
+ return new DateFormats();
+ }
+ };
// cannot create, the class has static methods only
private DateTools() {}
@@ -82,7 +91,7 @@ public class DateTools {
* @return a string in format <code>yyyyMMddHHmmssSSS</code> or shorter,
* depending on <code>resolution</code>; using GMT as timezone
*/
- public static synchronized String dateToString(Date date, Resolution resolution) {
+ public static String dateToString(Date date, Resolution resolution) {
return timeToString(date.getTime(), resolution);
}
@@ -95,24 +104,20 @@ public class DateTools {
* @return a string in format <code>yyyyMMddHHmmssSSS</code> or shorter,
* depending on <code>resolution</code>; using GMT as timezone
*/
- public static synchronized String timeToString(long time, Resolution resolution) {
- calInstance.setTimeInMillis(round(time, resolution));
- Date date = calInstance.getTime();
+ public static String timeToString(long time, Resolution resolution) {
+ final DateFormats formats = FORMATS.get();
- if (resolution == Resolution.YEAR) {
- return YEAR_FORMAT.format(date);
- } else if (resolution == Resolution.MONTH) {
- return MONTH_FORMAT.format(date);
- } else if (resolution == Resolution.DAY) {
- return DAY_FORMAT.format(date);
- } else if (resolution == Resolution.HOUR) {
- return HOUR_FORMAT.format(date);
- } else if (resolution == Resolution.MINUTE) {
- return MINUTE_FORMAT.format(date);
- } else if (resolution == Resolution.SECOND) {
- return SECOND_FORMAT.format(date);
- } else if (resolution == Resolution.MILLISECOND) {
- return MILLISECOND_FORMAT.format(date);
+ formats.calInstance.setTimeInMillis(round(time, resolution));
+ final Date date = formats.calInstance.getTime();
+
+ switch (resolution) {
+ case YEAR: return formats.YEAR_FORMAT.format(date);
+ case MONTH:return formats.MONTH_FORMAT.format(date);
+ case DAY: return formats.DAY_FORMAT.format(date);
+ case HOUR: return formats.HOUR_FORMAT.format(date);
+ case MINUTE: return formats.MINUTE_FORMAT.format(date);
+ case SECOND: return formats.SECOND_FORMAT.format(date);
+ case MILLISECOND: return formats.MILLISECOND_FORMAT.format(date);
}
throw new IllegalArgumentException("unknown resolution " + resolution);
@@ -128,7 +133,7 @@ public class DateTools {
* @throws ParseException if <code>dateString</code> is not in the
* expected format
*/
- public static synchronized long stringToTime(String dateString) throws ParseException {
+ public static long stringToTime(String dateString) throws ParseException {
return stringToDate(dateString).getTime();
}
@@ -142,21 +147,23 @@ public class DateTools {
* @throws ParseException if <code>dateString</code> is not in the
* expected format
*/
- public static synchronized Date stringToDate(String dateString) throws ParseException {
+ public static Date stringToDate(String dateString) throws ParseException {
+ final DateFormats formats = FORMATS.get();
+
if (dateString.length() == 4) {
- return YEAR_FORMAT.parse(dateString);
+ return formats.YEAR_FORMAT.parse(dateString);
} else if (dateString.length() == 6) {
- return MONTH_FORMAT.parse(dateString);
+ return formats.MONTH_FORMAT.parse(dateString);
} else if (dateString.length() == 8) {
- return DAY_FORMAT.parse(dateString);
+ return formats.DAY_FORMAT.parse(dateString);
} else if (dateString.length() == 10) {
- return HOUR_FORMAT.parse(dateString);
+ return formats.HOUR_FORMAT.parse(dateString);
} else if (dateString.length() == 12) {
- return MINUTE_FORMAT.parse(dateString);
+ return formats.MINUTE_FORMAT.parse(dateString);
} else if (dateString.length() == 14) {
- return SECOND_FORMAT.parse(dateString);
+ return formats.SECOND_FORMAT.parse(dateString);
} else if (dateString.length() == 17) {
- return MILLISECOND_FORMAT.parse(dateString);
+ return formats.MILLISECOND_FORMAT.parse(dateString);
}
throw new ParseException("Input is not valid date string: " + dateString, 0);
}
@@ -170,7 +177,7 @@ public class DateTools {
* @return the date with all values more precise than resolution
* set to 0 or 1
*/
- public static synchronized Date round(Date date, Resolution resolution) {
+ public static Date round(Date date, Resolution resolution) {
return new Date(round(date.getTime(), resolution));
}
@@ -184,67 +191,63 @@ public class DateTools {
* @return the date with all values more precise than resolution
* set to 0 or 1, expressed as milliseconds since January 1, 1970, 00:00:00 GMT
*/
- public static synchronized long round(long time, Resolution resolution) {
+ public static long round(long time, Resolution resolution) {
+ final Calendar calInstance = FORMATS.get().calInstance;
calInstance.setTimeInMillis(time);
- if (resolution == Resolution.YEAR) {
- calInstance.set(Calendar.MONTH, 0);
- calInstance.set(Calendar.DAY_OF_MONTH, 1);
- calInstance.set(Calendar.HOUR_OF_DAY, 0);
- calInstance.set(Calendar.MINUTE, 0);
- calInstance.set(Calendar.SECOND, 0);
- calInstance.set(Calendar.MILLISECOND, 0);
- } else if (resolution == Resolution.MONTH) {
- calInstance.set(Calendar.DAY_OF_MONTH, 1);
- calInstance.set(Calendar.HOUR_OF_DAY, 0);
- calInstance.set(Calendar.MINUTE, 0);
- calInstance.set(Calendar.SECOND, 0);
- calInstance.set(Calendar.MILLISECOND, 0);
- } else if (resolution == Resolution.DAY) {
- calInstance.set(Calendar.HOUR_OF_DAY, 0);
- calInstance.set(Calendar.MINUTE, 0);
- calInstance.set(Calendar.SECOND, 0);
- calInstance.set(Calendar.MILLISECOND, 0);
- } else if (resolution == Resolution.HOUR) {
- calInstance.set(Calendar.MINUTE, 0);
- calInstance.set(Calendar.SECOND, 0);
- calInstance.set(Calendar.MILLISECOND, 0);
- } else if (resolution == Resolution.MINUTE) {
- calInstance.set(Calendar.SECOND, 0);
- calInstance.set(Calendar.MILLISECOND, 0);
- } else if (resolution == Resolution.SECOND) {
- calInstance.set(Calendar.MILLISECOND, 0);
- } else if (resolution == Resolution.MILLISECOND) {
- // don't cut off anything
- } else {
- throw new IllegalArgumentException("unknown resolution " + resolution);
+ switch (resolution) {
+ case YEAR:
+ calInstance.set(Calendar.MONTH, 0);
+ calInstance.set(Calendar.DAY_OF_MONTH, 1);
+ calInstance.set(Calendar.HOUR_OF_DAY, 0);
+ calInstance.set(Calendar.MINUTE, 0);
+ calInstance.set(Calendar.SECOND, 0);
+ calInstance.set(Calendar.MILLISECOND, 0);
+ break;
+ case MONTH:
+ calInstance.set(Calendar.DAY_OF_MONTH, 1);
+ calInstance.set(Calendar.HOUR_OF_DAY, 0);
+ calInstance.set(Calendar.MINUTE, 0);
+ calInstance.set(Calendar.SECOND, 0);
+ calInstance.set(Calendar.MILLISECOND, 0);
+ break;
+ case DAY:
+ calInstance.set(Calendar.HOUR_OF_DAY, 0);
+ calInstance.set(Calendar.MINUTE, 0);
+ calInstance.set(Calendar.SECOND, 0);
+ calInstance.set(Calendar.MILLISECOND, 0);
+ break;
+ case HOUR:
+ calInstance.set(Calendar.MINUTE, 0);
+ calInstance.set(Calendar.SECOND, 0);
+ calInstance.set(Calendar.MILLISECOND, 0);
+ break;
+ case MINUTE:
+ calInstance.set(Calendar.SECOND, 0);
+ calInstance.set(Calendar.MILLISECOND, 0);
+ break;
+ case SECOND:
+ calInstance.set(Calendar.MILLISECOND, 0);
+ break;
+ case MILLISECOND:
+ // don't cut off anything
+ break;
+ default:
+ throw new IllegalArgumentException("unknown resolution " + resolution);
}
return calInstance.getTimeInMillis();
}
/** Specifies the time granularity. */
- public static class Resolution {
+ public static enum Resolution {
- public static final Resolution YEAR = new Resolution("year");
- public static final Resolution MONTH = new Resolution("month");
- public static final Resolution DAY = new Resolution("day");
- public static final Resolution HOUR = new Resolution("hour");
- public static final Resolution MINUTE = new Resolution("minute");
- public static final Resolution SECOND = new Resolution("second");
- public static final Resolution MILLISECOND = new Resolution("millisecond");
+ YEAR, MONTH, DAY, HOUR, MINUTE, SECOND, MILLISECOND;
- private String resolution;
-
- private Resolution() {
- }
-
- private Resolution(String resolution) {
- this.resolution = resolution;
- }
-
+ /** this method returns the name of the resolution
+ * in lowercase (for backwards compatibility) */
@Override
public String toString() {
- return resolution;
+ return super.toString().toLowerCase(Locale.ENGLISH);
}
}
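
The DateTools rewrite above trades the class-wide synchronized methods for one DateFormats bundle per thread, so concurrent indexing threads can format dates without contending on a lock. A usage sketch (illustrative only):

import java.util.Date;

import org.apache.lucene.document.DateTools;
import org.apache.lucene.document.DateTools.Resolution;

public class ConcurrentDateFormatting {
  public static void main(String[] args) throws InterruptedException {
    Runnable format = new Runnable() {
      public void run() {
        for (int i = 0; i < 1000; i++) {
          // Each thread works on its own ThreadLocal DateFormats instance,
          // so no external synchronization is required anymore.
          DateTools.dateToString(new Date(), Resolution.SECOND);
        }
      }
    };
    Thread t1 = new Thread(format);
    Thread t2 = new Thread(format);
    t1.start(); t2.start();
    t1.join(); t2.join();
    System.out.println(DateTools.dateToString(new Date(), Resolution.DAY));
  }
}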
diff --git a/lucene/src/java/org/apache/lucene/document/Document.java b/lucene/src/java/org/apache/lucene/document/Document.java
index 1dea49d465c..58e7a7ee3c3 100644
--- a/lucene/src/java/org/apache/lucene/document/Document.java
+++ b/lucene/src/java/org/apache/lucene/document/Document.java
@@ -18,6 +18,7 @@ package org.apache.lucene.document;
*/
import java.util.*; // for javadoc
+import org.apache.lucene.search.IndexSearcher; // for javadoc
import org.apache.lucene.search.ScoreDoc; // for javadoc
import org.apache.lucene.index.IndexReader; // for javadoc
@@ -165,7 +166,7 @@ public final class Document implements java.io.Serializable {
/** Returns a List of all the fields in a document.
* Note that fields which are not {@link Fieldable#isStored() stored} are
* not available in documents retrieved from the
- * index, e.g. {@link Searcher#doc(int)} or {@link
+ * index, e.g. {@link IndexSearcher#doc(int)} or {@link
* IndexReader#document(int)}.
*/
public final List<Fieldable> getFields() {
diff --git a/lucene/src/java/org/apache/lucene/document/Fieldable.java b/lucene/src/java/org/apache/lucene/document/Fieldable.java
index 35d2d06d611..75529bc2819 100755
--- a/lucene/src/java/org/apache/lucene/document/Fieldable.java
+++ b/lucene/src/java/org/apache/lucene/document/Fieldable.java
@@ -67,7 +67,7 @@ public interface Fieldable extends Serializable {
*
* Note: this value is not stored directly with the document in the index.
* Documents returned from {@link org.apache.lucene.index.IndexReader#document(int)} and
- * {@link org.apache.lucene.search.Searcher#doc(int)} may thus not have the same value present as when
+ * {@link org.apache.lucene.search.IndexSearcher#doc(int)} may thus not have the same value present as when
* this field was indexed.
*
* @see #setBoost(float)
diff --git a/lucene/src/java/org/apache/lucene/document/NumericField.java b/lucene/src/java/org/apache/lucene/document/NumericField.java
index b1ccf52c8bc..4d008e0169f 100644
--- a/lucene/src/java/org/apache/lucene/document/NumericField.java
+++ b/lucene/src/java/org/apache/lucene/document/NumericField.java
@@ -134,8 +134,6 @@ import org.apache.lucene.search.FieldCache; // javadocs
* values are returned as {@link String}s (according to
* toString(value)
of the used data type).
*
- * @lucene.experimental
- *
* @since 2.9
*/
public final class NumericField extends AbstractField {
diff --git a/lucene/src/java/org/apache/lucene/index/BufferedDeletes.java b/lucene/src/java/org/apache/lucene/index/BufferedDeletes.java
index 3b144aada2c..c72a1f6b0a3 100644
--- a/lucene/src/java/org/apache/lucene/index/BufferedDeletes.java
+++ b/lucene/src/java/org/apache/lucene/index/BufferedDeletes.java
@@ -17,435 +17,228 @@ package org.apache.lucene.index;
* limitations under the License.
*/
-import java.io.IOException;
-import java.io.PrintStream;
+import java.util.ArrayList;
+import java.util.Iterator;
import java.util.HashMap;
-import java.util.Date;
-import java.util.Map.Entry;
+import java.util.List;
import java.util.Map;
-import java.util.concurrent.atomic.AtomicInteger;
+import java.util.TreeMap;
import java.util.concurrent.atomic.AtomicLong;
+import java.util.concurrent.atomic.AtomicInteger;
-import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
-import org.apache.lucene.search.Scorer;
-import org.apache.lucene.search.Weight;
+import org.apache.lucene.util.RamUsageEstimator;
+import org.apache.lucene.index.BufferedDeletesStream.QueryAndLimit;
-/** Holds a {@link SegmentDeletes} for each segment in the
- * index. */
+/* Holds buffered deletes, by docID, term or query for a
+ * single segment. This is used to hold buffered pending
+ * deletes against the to-be-flushed segment. Once the
+ * deletes are pushed (on flush in DocumentsWriter), these
+ * deletes are converted to a FrozenDeletes instance. */
+
+// NOTE: we are sync'd by BufferedDeletes, ie, all access to
+// instances of this class is via sync'd methods on
+// BufferedDeletes
class BufferedDeletes {
- // Deletes for all flushed/merged segments:
- private final Map<SegmentInfo,SegmentDeletes> deletesMap = new HashMap<SegmentInfo,SegmentDeletes>();
+ /* Rough logic: HashMap has an array[Entry] w/ varying
+ load factor (say 2 * POINTER). Entry is object w/ Term
+ key, Integer val, int hash, Entry next
+ (OBJ_HEADER + 3*POINTER + INT). Term is object w/
+ String field and String text (OBJ_HEADER + 2*POINTER).
+ We don't count Term's field since it's interned.
+ Term's text is String (OBJ_HEADER + 4*INT + POINTER +
+ OBJ_HEADER + string.length*CHAR). Integer is
+ OBJ_HEADER + INT. */
+ final static int BYTES_PER_DEL_TERM = 8*RamUsageEstimator.NUM_BYTES_OBJECT_REF + 5*RamUsageEstimator.NUM_BYTES_OBJECT_HEADER + 6*RamUsageEstimator.NUM_BYTES_INT;
- // used only by assert
- private Term lastDeleteTerm;
-
- private PrintStream infoStream;
- private final AtomicLong bytesUsed = new AtomicLong();
- private final AtomicInteger numTerms = new AtomicInteger();
- private final int messageID;
+ /* Rough logic: del docIDs are List<Integer>. Say list
+ allocates ~2X size (2*POINTER). Integer is OBJ_HEADER
+ + int */
+ final static int BYTES_PER_DEL_DOCID = 2*RamUsageEstimator.NUM_BYTES_OBJECT_REF + RamUsageEstimator.NUM_BYTES_OBJECT_HEADER + RamUsageEstimator.NUM_BYTES_INT;
- public BufferedDeletes(int messageID) {
- this.messageID = messageID;
- }
+ /* Rough logic: HashMap has an array[Entry] w/ varying
+ load factor (say 2 * POINTER). Entry is object w/
+ Query key, Integer val, int hash, Entry next
+ (OBJ_HEADER + 3*POINTER + INT). Query we often
+ undercount (say 24 bytes). Integer is OBJ_HEADER + INT. */
+ final static int BYTES_PER_DEL_QUERY = 5*RamUsageEstimator.NUM_BYTES_OBJECT_REF + 2*RamUsageEstimator.NUM_BYTES_OBJECT_HEADER + 2*RamUsageEstimator.NUM_BYTES_INT + 24;
- private synchronized void message(String message) {
- if (infoStream != null) {
- infoStream.println("BD " + messageID + " [" + new Date() + "; " + Thread.currentThread().getName() + "]: BD " + message);
- }
- }
-
- public synchronized void setInfoStream(PrintStream infoStream) {
- this.infoStream = infoStream;
- }
+ final AtomicInteger numTermDeletes = new AtomicInteger();
+ final Map<Term,Integer> terms;
+ final Map<Query,Integer> queries = new HashMap<Query,Integer>();
+ final List<Integer> docIDs = new ArrayList<Integer>();
- public synchronized void pushDeletes(SegmentDeletes newDeletes, SegmentInfo info) {
- pushDeletes(newDeletes, info, false);
- }
+ public static final Integer MAX_INT = Integer.valueOf(Integer.MAX_VALUE);
- // Moves all pending deletes onto the provided segment,
- // then clears the pending deletes
- public synchronized void pushDeletes(SegmentDeletes newDeletes, SegmentInfo info, boolean noLimit) {
- assert newDeletes.any();
- numTerms.addAndGet(newDeletes.numTermDeletes.get());
+ final AtomicLong bytesUsed = new AtomicLong();
- if (!noLimit) {
- assert !deletesMap.containsKey(info);
- assert info != null;
- deletesMap.put(info, newDeletes);
- bytesUsed.addAndGet(newDeletes.bytesUsed.get());
+ private final static boolean VERBOSE_DELETES = false;
+
+ long gen;
+
+ public BufferedDeletes(boolean sortTerms) {
+ if (sortTerms) {
+ terms = new TreeMap<Term,Integer>();
} else {
- final SegmentDeletes deletes = getDeletes(info);
- bytesUsed.addAndGet(-deletes.bytesUsed.get());
- deletes.update(newDeletes, noLimit);
- bytesUsed.addAndGet(deletes.bytesUsed.get());
- }
- if (infoStream != null) {
- message("push deletes seg=" + info + " dels=" + getDeletes(info));
+ terms = new HashMap<Term,Integer>();
}
- assert checkDeleteStats();
}
- public synchronized void clear() {
- deletesMap.clear();
- numTerms.set(0);
+ @Override
+ public String toString() {
+ if (VERBOSE_DELETES) {
+ return "gen=" + gen + " numTerms=" + numTermDeletes + ", terms=" + terms
+ + ", queries=" + queries + ", docIDs=" + docIDs + ", bytesUsed="
+ + bytesUsed;
+ } else {
+ String s = "gen=" + gen;
+ if (numTermDeletes.get() != 0) {
+ s += " " + numTermDeletes.get() + " deleted terms (unique count=" + terms.size() + ")";
+ }
+ if (queries.size() != 0) {
+ s += " " + queries.size() + " deleted queries";
+ }
+ if (docIDs.size() != 0) {
+ s += " " + docIDs.size() + " deleted docIDs";
+ }
+ if (bytesUsed.get() != 0) {
+ s += " bytesUsed=" + bytesUsed.get();
+ }
+
+ return s;
+ }
+ }
+
+ void update(BufferedDeletes in) {
+ numTermDeletes.addAndGet(in.numTermDeletes.get());
+ for (Map.Entry<Term,Integer> ent : in.terms.entrySet()) {
+ final Term term = ent.getKey();
+ if (!terms.containsKey(term)) {
+ // only incr bytesUsed if this term wasn't already buffered:
+ bytesUsed.addAndGet(BYTES_PER_DEL_TERM);
+ }
+ terms.put(term, MAX_INT);
+ }
+
+ for (Map.Entry<Query,Integer> ent : in.queries.entrySet()) {
+ final Query query = ent.getKey();
+ if (!queries.containsKey(query)) {
+ // only incr bytesUsed if this query wasn't already buffered:
+ bytesUsed.addAndGet(BYTES_PER_DEL_QUERY);
+ }
+ queries.put(query, MAX_INT);
+ }
+
+ // docIDs never move across segments and the docIDs
+ // should already be cleared
+ }
+
+ void update(FrozenBufferedDeletes in) {
+ numTermDeletes.addAndGet(in.numTermDeletes);
+ for(Term term : in.terms) {
+ if (!terms.containsKey(term)) {
+ // only incr bytesUsed if this term wasn't already buffered:
+ bytesUsed.addAndGet(BYTES_PER_DEL_TERM);
+ }
+ terms.put(term, MAX_INT);
+ }
+
+ for(int queryIdx=0;queryIdx<in.queries.length;queryIdx++) {
+ final Query query = in.queries[queryIdx];
+ if (!queries.containsKey(query)) {
+ // only incr bytesUsed if this query wasn't already buffered:
+ bytesUsed.addAndGet(BYTES_PER_DEL_QUERY);
+ }
+ queries.put(query, MAX_INT);
+ }
+ }
+
+ public Iterable<Term> termsIterable() {
+ return new Iterable<Term>() {
+ // @Override -- not until Java 1.6
+ public Iterator<Term> iterator() {
+ return terms.keySet().iterator();
+ }
+ };
+ }
+
+ public Iterable<QueryAndLimit> queriesIterable() {
+ return new Iterable<QueryAndLimit>() {
+
+ // @Override -- not until Java 1.6
+ public Iterator<QueryAndLimit> iterator() {
+ return new Iterator<QueryAndLimit>() {
+ private final Iterator<Map.Entry<Query,Integer>> iter = queries.entrySet().iterator();
+
+ // @Override -- not until Java 1.6
+ public boolean hasNext() {
+ return iter.hasNext();
+ }
+
+ // @Override -- not until Java 1.6
+ public QueryAndLimit next() {
+ final Map.Entry ent = iter.next();
+ return new QueryAndLimit(ent.getKey(), ent.getValue());
+ }
+
+ // @Override -- not until Java 1.6
+ public void remove() {
+ throw new UnsupportedOperationException();
+ }
+ };
+ }
+ };
+ }
+
+ void clear() {
+ terms.clear();
+ queries.clear();
+ docIDs.clear();
+ numTermDeletes.set(0);
bytesUsed.set(0);
}
-
- synchronized boolean any() {
- return bytesUsed.get() != 0;
- }
-
- public int numTerms() {
- return numTerms.get();
- }
-
- public long bytesUsed() {
- return bytesUsed.get();
- }
-
- // IW calls this on finishing a merge. While the merge
- // was running, it's possible new deletes were pushed onto
- // our last (and only our last) segment. In this case we
- // must carry forward those deletes onto the merged
- // segment.
- synchronized void commitMerge(MergePolicy.OneMerge merge) {
- assert checkDeleteStats();
- if (infoStream != null) {
- message("commitMerge merge.info=" + merge.info + " merge.segments=" + merge.segments);
- }
- final SegmentInfo lastInfo = merge.segments.lastElement();
- final SegmentDeletes lastDeletes = deletesMap.get(lastInfo);
- if (lastDeletes != null) {
- deletesMap.remove(lastInfo);
- assert !deletesMap.containsKey(merge.info);
- deletesMap.put(merge.info, lastDeletes);
- // don't need to update numTerms/bytesUsed since we
- // are just moving the deletes from one info to
- // another
- if (infoStream != null) {
- message("commitMerge done: new deletions=" + lastDeletes);
- }
- } else if (infoStream != null) {
- message("commitMerge done: no new deletions");
- }
- assert !anyDeletes(merge.segments.range(0, merge.segments.size()-1));
- assert checkDeleteStats();
- }
-
- synchronized void clear(SegmentDeletes deletes) {
- deletes.clear();
+
+ void clearDocIDs() {
+ bytesUsed.addAndGet(-docIDs.size()*BYTES_PER_DEL_DOCID);
+ docIDs.clear();
}
- public synchronized boolean applyDeletes(IndexWriter.ReaderPool readerPool, SegmentInfos segmentInfos, SegmentInfos applyInfos) throws IOException {
- if (!any()) {
- return false;
- }
- final long t0 = System.currentTimeMillis();
-
- if (infoStream != null) {
- message("applyDeletes: applyInfos=" + applyInfos + "; index=" + segmentInfos);
- }
-
- assert checkDeleteStats();
-
- assert applyInfos.size() > 0;
-
- boolean any = false;
-
- final SegmentInfo lastApplyInfo = applyInfos.lastElement();
- final int lastIdx = segmentInfos.indexOf(lastApplyInfo);
-
- final SegmentInfo firstInfo = applyInfos.firstElement();
- final int firstIdx = segmentInfos.indexOf(firstInfo);
-
- // applyInfos must be a slice of segmentInfos
- assert lastIdx - firstIdx + 1 == applyInfos.size();
-
- // iterate over all segment infos backwards
- // coalesceing deletes along the way
- // when we're at or below the last of the
- // segments to apply to, start applying the deletes
- // we traverse up to the first apply infos
- SegmentDeletes coalescedDeletes = null;
- boolean hasDeletes = false;
- for (int segIdx=segmentInfos.size()-1; segIdx >= firstIdx; segIdx--) {
- final SegmentInfo info = segmentInfos.info(segIdx);
- final SegmentDeletes deletes = deletesMap.get(info);
- assert deletes == null || deletes.any();
-
- if (deletes == null && coalescedDeletes == null) {
- continue;
- }
-
- if (infoStream != null) {
- message("applyDeletes: seg=" + info + " segment's deletes=[" + (deletes == null ? "null" : deletes) + "]; coalesced deletes=[" + (coalescedDeletes == null ? "null" : coalescedDeletes) + "]");
- }
-
- hasDeletes |= deletes != null;
-
- if (segIdx <= lastIdx && hasDeletes) {
-
- final long delCountInc = applyDeletes(readerPool, info, coalescedDeletes, deletes);
-
- if (delCountInc != 0) {
- any = true;
- }
- if (infoStream != null) {
- message("deletes touched " + delCountInc + " docIDs");
- }
-
- if (deletes != null) {
- // we've applied doc ids, and they're only applied
- // on the current segment
- bytesUsed.addAndGet(-deletes.docIDs.size() * SegmentDeletes.BYTES_PER_DEL_DOCID);
- deletes.clearDocIDs();
- }
- }
-
- // now coalesce at the max limit
- if (deletes != null) {
- if (coalescedDeletes == null) {
- coalescedDeletes = new SegmentDeletes();
- }
- // TODO: we could make this single pass (coalesce as
- // we apply the deletes
- coalescedDeletes.update(deletes, true);
- }
- }
-
- // move all deletes to segment just before our merge.
- if (firstIdx > 0) {
-
- SegmentDeletes mergedDeletes = null;
- // TODO: we could also make this single pass
- for (SegmentInfo info : applyInfos) {
- final SegmentDeletes deletes = deletesMap.get(info);
- if (deletes != null) {
- assert deletes.any();
- if (mergedDeletes == null) {
- mergedDeletes = getDeletes(segmentInfos.info(firstIdx-1));
- numTerms.addAndGet(-mergedDeletes.numTermDeletes.get());
- bytesUsed.addAndGet(-mergedDeletes.bytesUsed.get());
- }
-
- mergedDeletes.update(deletes, true);
- }
- }
-
- if (mergedDeletes != null) {
- numTerms.addAndGet(mergedDeletes.numTermDeletes.get());
- bytesUsed.addAndGet(mergedDeletes.bytesUsed.get());
- }
-
- if (infoStream != null) {
- if (mergedDeletes != null) {
- message("applyDeletes: merge all deletes into seg=" + segmentInfos.info(firstIdx-1) + ": " + mergedDeletes);
- } else {
- message("applyDeletes: no deletes to merge");
- }
- }
- } else {
- // We drop the deletes in this case, because we've
- // applied them to segment infos starting w/ the first
- // segment. There are no prior segments so there's no
- // reason to keep them around. When the applyInfos ==
- // segmentInfos this means all deletes have been
- // removed:
- }
- remove(applyInfos);
-
- assert checkDeleteStats();
- assert applyInfos != segmentInfos || !any();
-
- if (infoStream != null) {
- message("applyDeletes took " + (System.currentTimeMillis()-t0) + " msec");
- }
- return any;
- }
-
- private synchronized long applyDeletes(IndexWriter.ReaderPool readerPool,
- SegmentInfo info,
- SegmentDeletes coalescedDeletes,
- SegmentDeletes segmentDeletes) throws IOException {
- assert readerPool.infoIsLive(info);
-
- assert coalescedDeletes == null || coalescedDeletes.docIDs.size() == 0;
-
- long delCount = 0;
-
- // Lock order: IW -> BD -> RP
- SegmentReader reader = readerPool.get(info, false);
- try {
- if (coalescedDeletes != null) {
- delCount += applyDeletes(coalescedDeletes, reader);
- }
- if (segmentDeletes != null) {
- delCount += applyDeletes(segmentDeletes, reader);
- }
- } finally {
- readerPool.release(reader);
- }
- return delCount;
- }
-
- private synchronized long applyDeletes(SegmentDeletes deletes, SegmentReader reader) throws IOException {
-
- long delCount = 0;
-
- assert checkDeleteTerm(null);
-
- if (deletes.terms.size() > 0) {
- Fields fields = reader.fields();
- if (fields == null) {
- // This reader has no postings
- return 0;
- }
-
- TermsEnum termsEnum = null;
-
- String currentField = null;
- DocsEnum docs = null;
-
- for (Entry<Term,Integer> entry: deletes.terms.entrySet()) {
- Term term = entry.getKey();
- // Since we visit terms sorted, we gain performance
- // by re-using the same TermsEnum and seeking only
- // forwards
- if (term.field() != currentField) {
- assert currentField == null || currentField.compareTo(term.field()) < 0;
- currentField = term.field();
- Terms terms = fields.terms(currentField);
- if (terms != null) {
- termsEnum = terms.iterator();
- } else {
- termsEnum = null;
- }
- }
-
- if (termsEnum == null) {
- continue;
- }
- assert checkDeleteTerm(term);
-
- if (termsEnum.seek(term.bytes(), false) == TermsEnum.SeekStatus.FOUND) {
- DocsEnum docsEnum = termsEnum.docs(reader.getDeletedDocs(), docs);
-
- if (docsEnum != null) {
- docs = docsEnum;
- final int limit = entry.getValue();
- while (true) {
- final int docID = docs.nextDoc();
- if (docID == DocsEnum.NO_MORE_DOCS || docID >= limit) {
- break;
- }
- reader.deleteDocument(docID);
- // TODO: we could/should change
- // reader.deleteDocument to return boolean
- // true if it did in fact delete, because here
- // we could be deleting an already-deleted doc
- // which makes this an upper bound:
- delCount++;
- }
- }
- }
- }
- }
-
- // Delete by docID
- for (Integer docIdInt : deletes.docIDs) {
- int docID = docIdInt.intValue();
- reader.deleteDocument(docID);
- delCount++;
- }
-
- // Delete by query
- if (deletes.queries.size() > 0) {
- IndexSearcher searcher = new IndexSearcher(reader);
- try {
- for (Entry<Query,Integer> entry : deletes.queries.entrySet()) {
- Query query = entry.getKey();
- int limit = entry.getValue().intValue();
- Weight weight = query.weight(searcher);
- Scorer scorer = weight.scorer(reader, true, false);
- if (scorer != null) {
- while(true) {
- int doc = scorer.nextDoc();
- if (doc >= limit)
- break;
-
- reader.deleteDocument(doc);
- // TODO: we could/should change
- // reader.deleteDocument to return boolean
- // true if it did in fact delete, because here
- // we could be deleting an already-deleted doc
- // which makes this an upper bound:
- delCount++;
- }
- }
- }
- } finally {
- searcher.close();
- }
- }
-
- return delCount;
- }
-
- public synchronized SegmentDeletes getDeletes(SegmentInfo info) {
- SegmentDeletes deletes = deletesMap.get(info);
- if (deletes == null) {
- deletes = new SegmentDeletes();
- deletesMap.put(info, deletes);
- }
- return deletes;
- }
-
- public synchronized void remove(SegmentInfos infos) {
- assert infos.size() > 0;
- for (SegmentInfo info : infos) {
- SegmentDeletes deletes = deletesMap.get(info);
- if (deletes != null) {
- bytesUsed.addAndGet(-deletes.bytesUsed.get());
- assert bytesUsed.get() >= 0: "bytesUsed=" + bytesUsed;
- numTerms.addAndGet(-deletes.numTermDeletes.get());
- assert numTerms.get() >= 0: "numTerms=" + numTerms;
- deletesMap.remove(info);
- }
- }
- }
-
- // used only by assert
- private boolean anyDeletes(SegmentInfos infos) {
- for(SegmentInfo info : infos) {
- if (deletesMap.containsKey(info)) {
- return true;
- }
- }
- return false;
- }
-
- // used only by assert
- private boolean checkDeleteTerm(Term term) {
- if (term != null) {
- assert lastDeleteTerm == null || term.compareTo(lastDeleteTerm) > 0: "lastTerm=" + lastDeleteTerm + " vs term=" + term;
- }
- lastDeleteTerm = term;
- return true;
- }
-
- // only for assert
- private boolean checkDeleteStats() {
- int numTerms2 = 0;
- long bytesUsed2 = 0;
- for(SegmentDeletes deletes : deletesMap.values()) {
- numTerms2 += deletes.numTermDeletes.get();
- bytesUsed2 += deletes.bytesUsed.get();
- }
- assert numTerms2 == numTerms.get(): "numTerms2=" + numTerms2 + " vs " + numTerms.get();
- assert bytesUsed2 == bytesUsed.get(): "bytesUsed2=" + bytesUsed2 + " vs " + bytesUsed;
- return true;
+ boolean any() {
+ return terms.size() > 0 || docIDs.size() > 0 || queries.size() > 0;
}
}
diff --git a/lucene/src/java/org/apache/lucene/index/BufferedDeletesStream.java b/lucene/src/java/org/apache/lucene/index/BufferedDeletesStream.java
new file mode 100644
index 00000000000..de3046db5dd
--- /dev/null
+++ b/lucene/src/java/org/apache/lucene/index/BufferedDeletesStream.java
@@ -0,0 +1,441 @@
+package org.apache.lucene.index;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.IOException;
+import java.io.PrintStream;
+import java.util.List;
+import java.util.ArrayList;
+import java.util.Date;
+import java.util.Comparator;
+import java.util.Collections;
+import java.util.concurrent.atomic.AtomicInteger;
+import java.util.concurrent.atomic.AtomicLong;
+
+import org.apache.lucene.index.IndexReader.AtomicReaderContext;
+import org.apache.lucene.search.IndexSearcher;
+import org.apache.lucene.search.Query;
+import org.apache.lucene.search.Scorer;
+import org.apache.lucene.search.Weight;
+
+/* Tracks the stream of {@link BufferedDeletes}.
+ * When DocumentsWriter flushes, its buffered
+ * deletes are appended to this stream. We later
+ * apply these deletes (resolve them to the actual
+ * docIDs, per segment) when a merge is started
+ * (only to the to-be-merged segments). We
+ * also apply to all segments when NRT reader is pulled,
+ * commit/close is called, or when too many deletes are
+ * buffered and must be flushed (by RAM usage or by count).
+ *
+ * Each packet is assigned a generation, and each flushed or
+ * merged segment is also assigned a generation, so we can
+ * track which BufferedDeletes packets to apply to any given
+ * segment. */
+
+class BufferedDeletesStream {
+
+ // TODO: maybe linked list?
+ private final List<FrozenBufferedDeletes> deletes = new ArrayList<FrozenBufferedDeletes>();
+
+ // Starts at 1 so that SegmentInfos that have never had
+ // deletes applied (whose bufferedDelGen defaults to 0)
+ // will be correct:
+ private long nextGen = 1;
+
+ // used only by assert
+ private Term lastDeleteTerm;
+
+ private PrintStream infoStream;
+ private final AtomicLong bytesUsed = new AtomicLong();
+ private final AtomicInteger numTerms = new AtomicInteger();
+ private final int messageID;
+
+ public BufferedDeletesStream(int messageID) {
+ this.messageID = messageID;
+ }
+
+ private synchronized void message(String message) {
+ if (infoStream != null) {
+ infoStream.println("BD " + messageID + " [" + new Date() + "; " + Thread.currentThread().getName() + "]: " + message);
+ }
+ }
+
+ public synchronized void setInfoStream(PrintStream infoStream) {
+ this.infoStream = infoStream;
+ }
+
+ // Appends a new packet of buffered deletes to the stream,
+ // setting its generation:
+ public synchronized void push(FrozenBufferedDeletes packet) {
+ assert packet.any();
+ assert checkDeleteStats();
+ assert packet.gen < nextGen;
+ deletes.add(packet);
+ numTerms.addAndGet(packet.numTermDeletes);
+ bytesUsed.addAndGet(packet.bytesUsed);
+ if (infoStream != null) {
+ message("push deletes " + packet + " delGen=" + packet.gen + " packetCount=" + deletes.size());
+ }
+ assert checkDeleteStats();
+ }
+
+ public synchronized void clear() {
+ deletes.clear();
+ nextGen = 1;
+ numTerms.set(0);
+ bytesUsed.set(0);
+ }
+
+ public boolean any() {
+ return bytesUsed.get() != 0;
+ }
+
+ public int numTerms() {
+ return numTerms.get();
+ }
+
+ public long bytesUsed() {
+ return bytesUsed.get();
+ }
+
+ public static class ApplyDeletesResult {
+ // True if any actual deletes took place:
+ public final boolean anyDeletes;
+
+ // Current gen, for the merged segment:
+ public final long gen;
+
+ ApplyDeletesResult(boolean anyDeletes, long gen) {
+ this.anyDeletes = anyDeletes;
+ this.gen = gen;
+ }
+ }
+
+ // Sorts SegmentInfos from smallest to biggest bufferedDelGen:
+ private static final Comparator<SegmentInfo> sortByDelGen = new Comparator<SegmentInfo>() {
+ // @Override -- not until Java 1.6
+ public int compare(SegmentInfo si1, SegmentInfo si2) {
+ final long cmp = si1.getBufferedDeletesGen() - si2.getBufferedDeletesGen();
+ if (cmp > 0) {
+ return 1;
+ } else if (cmp < 0) {
+ return -1;
+ } else {
+ return 0;
+ }
+ }
+
+ @Override
+ public boolean equals(Object other) {
+ return sortByDelGen == other;
+ }
+ };
+
+ /** Resolves the buffered deleted Term/Query/docIDs, into
+ * actual deleted docIDs in the deletedDocs BitVector for
+ * each SegmentReader. */
+ public synchronized ApplyDeletesResult applyDeletes(IndexWriter.ReaderPool readerPool, SegmentInfos infos) throws IOException {
+ final long t0 = System.currentTimeMillis();
+
+ if (infos.size() == 0) {
+ return new ApplyDeletesResult(false, nextGen++);
+ }
+
+ assert checkDeleteStats();
+
+ if (!any()) {
+ message("applyDeletes: no deletes; skipping");
+ return new ApplyDeletesResult(false, nextGen++);
+ }
+
+ if (infoStream != null) {
+ message("applyDeletes: infos=" + infos + " packetCount=" + deletes.size());
+ }
+
+ SegmentInfos infos2 = new SegmentInfos();
+ infos2.addAll(infos);
+ Collections.sort(infos2, sortByDelGen);
+
+ BufferedDeletes coalescedDeletes = null;
+ boolean anyNewDeletes = false;
+
+ int infosIDX = infos2.size()-1;
+ int delIDX = deletes.size()-1;
+
+ while (infosIDX >= 0) {
+ //System.out.println("BD: cycle delIDX=" + delIDX + " infoIDX=" + infosIDX);
+
+ final FrozenBufferedDeletes packet = delIDX >= 0 ? deletes.get(delIDX) : null;
+ final SegmentInfo info = infos2.get(infosIDX);
+ final long segGen = info.getBufferedDeletesGen();
+
+ if (packet != null && segGen < packet.gen) {
+ //System.out.println(" coalesce");
+ if (coalescedDeletes == null) {
+ coalescedDeletes = new BufferedDeletes(true);
+ }
+ coalescedDeletes.update(packet);
+ delIDX--;
+ } else if (packet != null && segGen == packet.gen) {
+ //System.out.println(" eq");
+
+ // Lock order: IW -> BD -> RP
+ assert readerPool.infoIsLive(info);
+ SegmentReader reader = readerPool.get(info, false);
+ int delCount = 0;
+ try {
+ if (coalescedDeletes != null) {
+ //System.out.println(" del coalesced");
+ delCount += applyTermDeletes(coalescedDeletes.termsIterable(), reader);
+ delCount += applyQueryDeletes(coalescedDeletes.queriesIterable(), reader);
+ }
+ //System.out.println(" del exact");
+ // Don't delete by Term here; DocumentsWriter
+ // already did that on flush:
+ delCount += applyQueryDeletes(packet.queriesIterable(), reader);
+ } finally {
+ readerPool.release(reader);
+ }
+ anyNewDeletes |= delCount > 0;
+
+ if (infoStream != null) {
+ message("seg=" + info + " segGen=" + segGen + " segDeletes=[" + packet + "]; coalesced deletes=[" + (coalescedDeletes == null ? "null" : coalescedDeletes) + "] delCount=" + delCount);
+ }
+
+ if (coalescedDeletes == null) {
+ coalescedDeletes = new BufferedDeletes(true);
+ }
+ coalescedDeletes.update(packet);
+ delIDX--;
+ infosIDX--;
+ info.setBufferedDeletesGen(nextGen);
+
+ } else {
+ //System.out.println(" gt");
+
+ if (coalescedDeletes != null) {
+ // Lock order: IW -> BD -> RP
+ assert readerPool.infoIsLive(info);
+ SegmentReader reader = readerPool.get(info, false);
+ int delCount = 0;
+ try {
+ delCount += applyTermDeletes(coalescedDeletes.termsIterable(), reader);
+ delCount += applyQueryDeletes(coalescedDeletes.queriesIterable(), reader);
+ } finally {
+ readerPool.release(reader);
+ }
+ anyNewDeletes |= delCount > 0;
+
+ if (infoStream != null) {
+ message("seg=" + info + " segGen=" + segGen + " coalesced deletes=[" + (coalescedDeletes == null ? "null" : coalescedDeletes) + "] delCount=" + delCount);
+ }
+ }
+ info.setBufferedDeletesGen(nextGen);
+
+ infosIDX--;
+ }
+ }
+
+ assert checkDeleteStats();
+ if (infoStream != null) {
+ message("applyDeletes took " + (System.currentTimeMillis()-t0) + " msec");
+ }
+ // assert infos != segmentInfos || !any() : "infos=" + infos + " segmentInfos=" + segmentInfos + " any=" + any;
+
+ return new ApplyDeletesResult(anyNewDeletes, nextGen++);
+ }
+
+ public synchronized long getNextGen() {
+ return nextGen++;
+ }
+
+ // Lock order IW -> BD
+ /* Removes any BufferedDeletes that we no longer need to
+ * store because all segments in the index have had the
+ * deletes applied. */
+ public synchronized void prune(SegmentInfos segmentInfos) {
+ assert checkDeleteStats();
+ long minGen = Long.MAX_VALUE;
+ for(SegmentInfo info : segmentInfos) {
+ minGen = Math.min(info.getBufferedDeletesGen(), minGen);
+ }
+
+ if (infoStream != null) {
+ message("prune sis=" + segmentInfos + " minGen=" + minGen + " packetCount=" + deletes.size());
+ }
+
+ final int limit = deletes.size();
+ for(int delIDX=0;delIDX<limit;delIDX++) {
+ if (deletes.get(delIDX).gen >= minGen) {
+ prune(delIDX);
+ assert checkDeleteStats();
+ return;
+ }
+ }
+
+ // All deletes pruned
+ prune(limit);
+ assert !any();
+ assert checkDeleteStats();
+ }
+
+ private synchronized void prune(int count) {
+ if (count > 0) {
+ if (infoStream != null) {
+ message("pruneDeletes: prune " + count + " packets; " + (deletes.size() - count) + " packets remain");
+ }
+ for(int delIDX=0;delIDX<count;delIDX++) {
+ final FrozenBufferedDeletes packet = deletes.get(delIDX);
+ numTerms.addAndGet(-packet.numTermDeletes);
+ assert numTerms.get() >= 0;
+ bytesUsed.addAndGet(-packet.bytesUsed);
+ assert bytesUsed.get() >= 0;
+ }
+ deletes.subList(0, count).clear();
+ }
+ }
+
+ // Delete by Term
+ private synchronized long applyTermDeletes(Iterable<Term> termsIter, SegmentReader reader) throws IOException {
+ long delCount = 0;
+ Fields fields = reader.fields();
+ if (fields == null) {
+ // This reader has no postings
+ return 0;
+ }
+
+ TermsEnum termsEnum = null;
+
+ String currentField = null;
+ DocsEnum docs = null;
+
+ assert checkDeleteTerm(null);
+
+ for (Term term : termsIter) {
+ // Since we visit terms sorted, we gain performance
+ // by re-using the same TermsEnum and seeking only
+ // forwards
+ if (term.field() != currentField) {
+ assert currentField == null || currentField.compareTo(term.field()) < 0;
+ currentField = term.field();
+ Terms terms = fields.terms(currentField);
+ if (terms != null) {
+ termsEnum = terms.iterator();
+ } else {
+ termsEnum = null;
+ }
+ }
+
+ if (termsEnum == null) {
+ continue;
+ }
+ assert checkDeleteTerm(term);
+
+ // System.out.println(" term=" + term);
+
+ if (termsEnum.seek(term.bytes(), false) == TermsEnum.SeekStatus.FOUND) {
+ DocsEnum docsEnum = termsEnum.docs(reader.getDeletedDocs(), docs);
+
+ if (docsEnum != null) {
+ while (true) {
+ final int docID = docsEnum.nextDoc();
+ if (docID == DocsEnum.NO_MORE_DOCS) {
+ break;
+ }
+ reader.deleteDocument(docID);
+ // TODO: we could/should change
+ // reader.deleteDocument to return boolean
+ // true if it did in fact delete, because here
+ // we could be deleting an already-deleted doc
+ // which makes this an upper bound:
+ delCount++;
+ }
+ }
+ }
+ }
+
+ return delCount;
+ }
+
+ public static class QueryAndLimit {
+ public final Query query;
+ public final int limit;
+ public QueryAndLimit(Query query, int limit) {
+ this.query = query;
+ this.limit = limit;
+ }
+ }
+
+ // Delete by query
+ private synchronized long applyQueryDeletes(Iterable<QueryAndLimit> queriesIter, SegmentReader reader) throws IOException {
+ long delCount = 0;
+ IndexSearcher searcher = new IndexSearcher(reader);
+ assert searcher.getTopReaderContext().isAtomic;
+ final AtomicReaderContext readerContext = (AtomicReaderContext) searcher.getTopReaderContext();
+ try {
+ for (QueryAndLimit ent : queriesIter) {
+ Query query = ent.query;
+ int limit = ent.limit;
+ Weight weight = query.weight(searcher);
+ Scorer scorer = weight.scorer(readerContext, Weight.ScorerContext.def());
+ if (scorer != null) {
+ while(true) {
+ int doc = scorer.nextDoc();
+ if (doc >= limit)
+ break;
+
+ reader.deleteDocument(doc);
+ // TODO: we could/should change
+ // reader.deleteDocument to return boolean
+ // true if it did in fact delete, because here
+ // we could be deleting an already-deleted doc
+ // which makes this an upper bound:
+ delCount++;
+ }
+ }
+ }
+ } finally {
+ searcher.close();
+ }
+
+ return delCount;
+ }
+
+ // used only by assert
+ private boolean checkDeleteTerm(Term term) {
+ if (term != null) {
+ assert lastDeleteTerm == null || term.compareTo(lastDeleteTerm) > 0: "lastTerm=" + lastDeleteTerm + " vs term=" + term;
+ }
+ lastDeleteTerm = term;
+ return true;
+ }
+
+ // only for assert
+ private boolean checkDeleteStats() {
+ int numTerms2 = 0;
+ long bytesUsed2 = 0;
+ for(FrozenBufferedDeletes packet : deletes) {
+ numTerms2 += packet.numTermDeletes;
+ bytesUsed2 += packet.bytesUsed;
+ }
+ assert numTerms2 == numTerms.get(): "numTerms2=" + numTerms2 + " vs " + numTerms.get();
+ assert bytesUsed2 == bytesUsed.get(): "bytesUsed2=" + bytesUsed2 + " vs " + bytesUsed;
+ return true;
+ }
+}
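
The applyDeletes loop in the new BufferedDeletesStream decides, purely by comparing a segment's bufferedDelGen with a packet's gen, whether a delete packet is coalesced, applied exactly, or skipped for that segment. The following is an illustrative standalone sketch of that three-way comparison only; Packet and Segment are simplified stand-ins (not the real FrozenBufferedDeletes and SegmentInfo), and the lists are assumed to already be ordered by generation as in the code above.

import java.util.ArrayList;
import java.util.List;

// Sketch only: mirrors the gen comparison in BufferedDeletesStream.applyDeletes.
public class GenMatchSketch {
  static class Packet { final long gen; Packet(long gen) { this.gen = gen; } }
  static class Segment { final String name; final long delGen;
    Segment(String name, long delGen) { this.name = name; this.delGen = delGen; } }

  public static void main(String[] args) {
    // Delete packets in the order they were pushed (increasing gen).
    List<Packet> packets = new ArrayList<Packet>();
    packets.add(new Packet(1)); packets.add(new Packet(3));
    // Segments already sorted by bufferedDelGen, smallest first (see sortByDelGen above).
    List<Segment> segs = new ArrayList<Segment>();
    segs.add(new Segment("_0", 1)); segs.add(new Segment("_1", 3)); segs.add(new Segment("_2", 4));

    int segIDX = segs.size() - 1, delIDX = packets.size() - 1;
    while (segIDX >= 0) {
      Packet packet = delIDX >= 0 ? packets.get(delIDX) : null;
      Segment seg = segs.get(segIDX);
      if (packet != null && seg.delGen < packet.gen) {
        // Packet is newer than this segment: coalesce it so it also reaches older segments.
        System.out.println("coalesce packet gen=" + packet.gen);
        delIDX--;
      } else if (packet != null && seg.delGen == packet.gen) {
        // Packet came from the same flush that produced this segment: apply the
        // coalesced deletes plus this packet's query deletes, then coalesce it too.
        System.out.println("apply exact + coalesced to " + seg.name);
        delIDX--; segIDX--;
      } else {
        // Segment is newer than every remaining packet: only coalesced deletes apply.
        System.out.println("apply coalesced (if any) to " + seg.name);
        segIDX--;
      }
    }
  }
}
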
diff --git a/lucene/src/java/org/apache/lucene/index/CheckIndex.java b/lucene/src/java/org/apache/lucene/index/CheckIndex.java
index 392ab635249..a109afdb3de 100644
--- a/lucene/src/java/org/apache/lucene/index/CheckIndex.java
+++ b/lucene/src/java/org/apache/lucene/index/CheckIndex.java
@@ -548,10 +548,10 @@ public class CheckIndex {
if (infoStream != null) {
infoStream.print(" test: field norms.........");
}
- final byte[] b = new byte[reader.maxDoc()];
+ byte[] b;
for (final String fieldName : fieldNames) {
if (reader.hasNorms(fieldName)) {
- reader.norms(fieldName, b, 0);
+ b = reader.norms(fieldName);
++status.totFields;
}
}
@@ -610,6 +610,8 @@ public class CheckIndex {
Comparator<BytesRef> termComp = terms.getComparator();
+ long sumTotalTermFreq = 0;
+
while(true) {
final BytesRef term = terms.next();
@@ -660,6 +662,8 @@ public class CheckIndex {
}
int lastDoc = -1;
+ int docCount = 0;
+ long totalTermFreq = 0;
while(true) {
final int doc = docs2.nextDoc();
if (doc == DocIdSetIterator.NO_MORE_DOCS) {
@@ -667,6 +671,8 @@ public class CheckIndex {
}
final int freq = docs2.freq();
status.totPos += freq;
+ totalTermFreq += freq;
+ docCount++;
if (doc <= lastDoc) {
throw new RuntimeException("term " + term + ": doc " + doc + " <= lastDoc " + lastDoc);
@@ -697,22 +703,39 @@ public class CheckIndex {
}
}
}
+
+ final long totalTermFreq2 = terms.totalTermFreq();
+ final boolean hasTotalTermFreq = postings != null && totalTermFreq2 != -1;
- // Now count how many deleted docs occurred in
- // this term:
-
+ // Re-count if there are deleted docs:
if (reader.hasDeletions()) {
final DocsEnum docsNoDel = terms.docs(null, docs);
- int count = 0;
+ docCount = 0;
+ totalTermFreq = 0;
while(docsNoDel.nextDoc() != DocIdSetIterator.NO_MORE_DOCS) {
- count++;
+ docCount++;
+ totalTermFreq += docsNoDel.freq();
}
- if (count != docFreq) {
- throw new RuntimeException("term " + term + " docFreq=" + docFreq + " != tot docs w/o deletions " + count);
+ }
+
+ if (docCount != docFreq) {
+ throw new RuntimeException("term " + term + " docFreq=" + docFreq + " != tot docs w/o deletions " + docCount);
+ }
+ if (hasTotalTermFreq) {
+ sumTotalTermFreq += totalTermFreq;
+ if (totalTermFreq != totalTermFreq2) {
+ throw new RuntimeException("term " + term + " totalTermFreq=" + totalTermFreq2 + " != recomputed totalTermFreq=" + totalTermFreq);
}
}
}
+ if (sumTotalTermFreq != 0) {
+ final long v = fields.terms(field).getSumTotalTermFreq();
+ if (v != -1 && sumTotalTermFreq != v) {
+ throw new RuntimeException("sumTotalTermFreq for field " + field + "=" + v + " != recomputed sumTotalTermFreq=" + sumTotalTermFreq);
+ }
+ }
+
// Test seek to last term:
if (lastTerm != null) {
if (terms.seek(lastTerm) != TermsEnum.SeekStatus.FOUND) {
@@ -779,7 +802,7 @@ public class CheckIndex {
msg("OK [" + status.termCount + " terms; " + status.totFreq + " terms/docs pairs; " + status.totPos + " tokens]");
} catch (Throwable e) {
- msg("ERROR [" + String.valueOf(e.getMessage()) + "]");
+ msg("ERROR: " + e);
status.error = e;
if (infoStream != null) {
e.printStackTrace(infoStream);
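
The new CheckIndex logic recomputes docFreq and totalTermFreq by walking the postings (skipping deleted docs) and compares them with the stats the codec reports, treating -1 as "not stored". A hedged, self-contained sketch of the same cross-check, with a plain int array standing in for the DocsEnum walk:

// Illustrative cross-check in the spirit of the CheckIndex change above:
// recompute per-term stats from the postings and compare with the stored stats.
public class TermStatsCheck {
  /** freqs[i] is the term's frequency in the i-th live doc; stand-in for a DocsEnum walk. */
  static void check(String term, int[] freqs, int storedDocFreq, long storedTotalTermFreq) {
    int docCount = 0;
    long totalTermFreq = 0;
    for (int freq : freqs) {
      docCount++;
      totalTermFreq += freq;
    }
    if (docCount != storedDocFreq) {
      throw new RuntimeException("term " + term + " docFreq=" + storedDocFreq
          + " != recomputed docCount=" + docCount);
    }
    // -1 means the codec does not store totalTermFreq, so there is nothing to verify.
    if (storedTotalTermFreq != -1 && totalTermFreq != storedTotalTermFreq) {
      throw new RuntimeException("term " + term + " totalTermFreq=" + storedTotalTermFreq
          + " != recomputed totalTermFreq=" + totalTermFreq);
    }
  }

  public static void main(String[] args) {
    check("lucene", new int[] {2, 1, 3}, 3, 6L);  // consistent stats: no exception
  }
}
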
diff --git a/lucene/src/java/org/apache/lucene/index/CompoundFileWriter.java b/lucene/src/java/org/apache/lucene/index/CompoundFileWriter.java
index a11dab49d03..c80a8343b16 100644
--- a/lucene/src/java/org/apache/lucene/index/CompoundFileWriter.java
+++ b/lucene/src/java/org/apache/lucene/index/CompoundFileWriter.java
@@ -17,15 +17,15 @@ package org.apache.lucene.index;
* limitations under the License.
*/
-import org.apache.lucene.store.Directory;
-import org.apache.lucene.store.IndexOutput;
-import org.apache.lucene.store.IndexInput;
-import org.apache.lucene.util.IOUtils;
-
-import java.util.LinkedList;
-import java.util.HashSet;
-
import java.io.IOException;
+import java.util.HashSet;
+import java.util.LinkedList;
+
+import org.apache.lucene.index.codecs.MergeState;
+import org.apache.lucene.store.Directory;
+import org.apache.lucene.store.IndexInput;
+import org.apache.lucene.store.IndexOutput;
+import org.apache.lucene.util.IOUtils;
/**
* Combines multiple files into a single compound file.
@@ -80,7 +80,7 @@ final class CompoundFileWriter {
private HashSet<String> ids;
private LinkedList<FileEntry> entries;
private boolean merged = false;
- private SegmentMerger.CheckAbort checkAbort;
+ private MergeState.CheckAbort checkAbort;
/** Create the compound stream in the specified file. The file name is the
* entire name (no extensions are added).
@@ -90,7 +90,7 @@ final class CompoundFileWriter {
this(dir, name, null);
}
- CompoundFileWriter(Directory dir, String name, SegmentMerger.CheckAbort checkAbort) {
+ CompoundFileWriter(Directory dir, String name, MergeState.CheckAbort checkAbort) {
if (dir == null)
throw new NullPointerException("directory cannot be null");
if (name == null)
diff --git a/lucene/src/java/org/apache/lucene/index/ConcurrentMergeScheduler.java b/lucene/src/java/org/apache/lucene/index/ConcurrentMergeScheduler.java
index b33aa6c1098..b9cafc7c5c2 100644
--- a/lucene/src/java/org/apache/lucene/index/ConcurrentMergeScheduler.java
+++ b/lucene/src/java/org/apache/lucene/index/ConcurrentMergeScheduler.java
@@ -142,8 +142,12 @@ public class ConcurrentMergeScheduler extends MergeScheduler {
}
};
- /** Called whenever the running merges have changed, to
- * pause & unpause threads. */
+ /**
+ * Called whenever the running merges have changed, to pause & unpause
+ * threads. This method sorts the merge threads by their merge size in
+ * descending order and then pauses/unpauses threads from first to last --
+ * that way, smaller merges are guaranteed to run before larger ones.
+ */
protected synchronized void updateMergeThreads() {
// Only look at threads that are alive & not in the
@@ -164,6 +168,7 @@ public class ConcurrentMergeScheduler extends MergeScheduler {
threadIdx++;
}
+ // Sort the merge threads in descending order.
CollectionUtil.mergeSort(activeMerges, compareByMergeDocCount);
int pri = mergeThreadPriority;
@@ -175,12 +180,8 @@ public class ConcurrentMergeScheduler extends MergeScheduler {
continue;
}
- final boolean doPause;
- if (threadIdx < activeMergeCount-maxThreadCount) {
- doPause = true;
- } else {
- doPause = false;
- }
+ // pause the thread if maxThreadCount is smaller than the number of merge threads.
+ final boolean doPause = threadIdx < activeMergeCount - maxThreadCount;
if (verbose()) {
if (doPause != merge.getPause()) {
@@ -205,13 +206,26 @@ public class ConcurrentMergeScheduler extends MergeScheduler {
}
}
- private boolean verbose() {
+ /**
+ * Returns true if verbosing is enabled. This method is usually used in
+ * conjunction with {@link #message(String)}, like that:
+ *
+ * <pre>
+ * if (verbose()) {
+ * message("your message");
+ * }
+ * </pre>
+ */
+ protected boolean verbose() {
return writer != null && writer.verbose();
}
- private void message(String message) {
- if (verbose())
- writer.message("CMS: " + message);
+ /**
+ * Outputs the given message - this method assumes {@link #verbose()} was
+ * called and returned true.
+ */
+ protected void message(String message) {
+ writer.message("CMS: " + message);
}
private synchronized void initMergeThreadPriority() {
@@ -231,10 +245,10 @@ public class ConcurrentMergeScheduler extends MergeScheduler {
/** Wait for any running merge threads to finish */
public void sync() {
- while(true) {
+ while (true) {
MergeThread toSync = null;
- synchronized(this) {
- for(MergeThread t : mergeThreads) {
+ synchronized (this) {
+ for (MergeThread t : mergeThreads) {
if (t.isAlive()) {
toSync = t;
break;
@@ -253,12 +267,14 @@ public class ConcurrentMergeScheduler extends MergeScheduler {
}
}
- private synchronized int mergeThreadCount() {
+ /**
+ * Returns the number of merge threads that are alive. Note that this number
+ * is ≤ {@link #mergeThreads} size.
+ */
+ protected synchronized int mergeThreadCount() {
int count = 0;
- final int numThreads = mergeThreads.size();
- for(int i=0;i= 1+maxMergeCount) {
+ startStallTime = System.currentTimeMillis();
+ if (verbose()) {
+ message(" too many merges; stalling...");
+ }
+ try {
+ wait();
+ } catch (InterruptedException ie) {
+ throw new ThreadInterruptedException(ie);
+ }
+ }
+
+ if (verbose()) {
+ if (startStallTime != 0) {
+ message(" stalled for " + (System.currentTimeMillis()-startStallTime) + " msec");
+ }
+ }
+ }
+
// TODO: we could be careful about which merges to do in
// the BG (eg maybe the "biggest" ones) vs FG, which
// merges to do first (the easiest ones?), etc.
-
MergePolicy.OneMerge merge = writer.getNextMerge();
if (merge == null) {
if (verbose())
@@ -311,32 +347,11 @@ public class ConcurrentMergeScheduler extends MergeScheduler {
boolean success = false;
try {
synchronized(this) {
- final MergeThread merger;
- long startStallTime = 0;
- while (mergeThreadCount() >= maxMergeCount) {
- startStallTime = System.currentTimeMillis();
- if (verbose()) {
- message(" too many merges; stalling...");
- }
- try {
- wait();
- } catch (InterruptedException ie) {
- throw new ThreadInterruptedException(ie);
- }
- }
-
- if (verbose()) {
- if (startStallTime != 0) {
- message(" stalled for " + (System.currentTimeMillis()-startStallTime) + " msec");
- }
- message(" consider merge " + merge.segString(dir));
- }
-
- assert mergeThreadCount() < maxMergeCount;
+ message(" consider merge " + merge.segString(dir));
// OK to spawn a new merge thread to handle this
// merge:
- merger = getMergeThread(writer, merge);
+ final MergeThread merger = getMergeThread(writer, merge);
mergeThreads.add(merger);
if (verbose()) {
message(" launch new thread [" + merger.getName() + "]");
@@ -360,8 +375,7 @@ public class ConcurrentMergeScheduler extends MergeScheduler {
}
/** Does the actual merge, by calling {@link IndexWriter#merge} */
- protected void doMerge(MergePolicy.OneMerge merge)
- throws IOException {
+ protected void doMerge(MergePolicy.OneMerge merge) throws IOException {
writer.merge(merge);
}
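
The refactored updateMergeThreads boils the pause decision down to one comparison: after the live merge threads are sorted by merge size in descending order (per the new doc comment), a thread is paused when its index falls below activeMergeCount - maxThreadCount, so when there are more merges than maxThreadCount the largest ones at the front are paused and the smallest keep running. A small sketch of just that arithmetic, with hypothetical doc counts:

// Illustrative only: the pause rule from updateMergeThreads, applied to a sorted list.
public class MergePauseSketch {
  public static void main(String[] args) {
    int maxThreadCount = 2;
    // Hypothetical doc counts of the running merges, already sorted descending
    // (largest merge first), as updateMergeThreads does.
    int[] mergeDocCounts = {500000, 120000, 40000, 10000};
    int activeMergeCount = mergeDocCounts.length;
    for (int threadIdx = 0; threadIdx < activeMergeCount; threadIdx++) {
      // Pause the earliest (largest) merges when there are more merges than
      // maxThreadCount, so the smaller merges finish first.
      boolean doPause = threadIdx < activeMergeCount - maxThreadCount;
      System.out.println("merge of " + mergeDocCounts[threadIdx] + " docs -> "
          + (doPause ? "paused" : "running"));
    }
  }
}
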
diff --git a/lucene/src/java/org/apache/lucene/index/DirectoryReader.java b/lucene/src/java/org/apache/lucene/index/DirectoryReader.java
index d9571dddafb..600ed1e7508 100644
--- a/lucene/src/java/org/apache/lucene/index/DirectoryReader.java
+++ b/lucene/src/java/org/apache/lucene/index/DirectoryReader.java
@@ -27,6 +27,7 @@ import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;
+import java.util.concurrent.ConcurrentHashMap;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.FieldSelector;
@@ -35,10 +36,8 @@ import org.apache.lucene.store.Lock;
import org.apache.lucene.store.LockObtainFailedException;
import org.apache.lucene.index.codecs.CodecProvider;
import org.apache.lucene.util.Bits;
-import org.apache.lucene.util.ReaderUtil;
import org.apache.lucene.util.BytesRef;
-
-import org.apache.lucene.search.FieldCache; // not great (circular); used only to purge FieldCache entry on close
+import org.apache.lucene.util.MapBackedSet;
/**
* An IndexReader which reads indexes with multiple segments.
@@ -60,8 +59,8 @@ class DirectoryReader extends IndexReader implements Cloneable {
private boolean rollbackHasChanges;
private SegmentReader[] subReaders;
+ private ReaderContext topLevelReaderContext;
private int[] starts; // 1st docno for each segment
- private final Map<SegmentReader,ReaderUtil.Slice> subReaderToSlice = new HashMap<SegmentReader,ReaderUtil.Slice>();
private int maxDoc = 0;
private int numDocs = -1;
private boolean hasDeletions = false;
@@ -71,6 +70,8 @@ class DirectoryReader extends IndexReader implements Cloneable {
// opened on a past IndexCommit:
private long maxIndexVersion;
+ private final boolean applyAllDeletes;
+
// static IndexReader open(final Directory directory, final IndexDeletionPolicy deletionPolicy, final IndexCommit commit, final boolean readOnly,
// final int termInfosIndexDivisor) throws CorruptIndexException, IOException {
// return open(directory, deletionPolicy, commit, readOnly, termInfosIndexDivisor, null);
@@ -107,6 +108,8 @@ class DirectoryReader extends IndexReader implements Cloneable {
} else {
this.codecs = codecs;
}
+ readerFinishedListeners = new MapBackedSet<ReaderFinishedListener>(new ConcurrentHashMap<ReaderFinishedListener,Boolean>());
+ applyAllDeletes = false;
// To reduce the chance of hitting FileNotFound
// (and having to retry), we open segments in
@@ -118,6 +121,7 @@ class DirectoryReader extends IndexReader implements Cloneable {
boolean success = false;
try {
readers[i] = SegmentReader.get(readOnly, sis.info(i), termInfosIndexDivisor);
+ readers[i].readerFinishedListeners = readerFinishedListeners;
success = true;
} finally {
if (!success) {
@@ -137,9 +141,11 @@ class DirectoryReader extends IndexReader implements Cloneable {
}
// Used by near real-time search
- DirectoryReader(IndexWriter writer, SegmentInfos infos, int termInfosIndexDivisor, CodecProvider codecs) throws IOException {
+ DirectoryReader(IndexWriter writer, SegmentInfos infos, int termInfosIndexDivisor, CodecProvider codecs, boolean applyAllDeletes) throws IOException {
this.directory = writer.getDirectory();
this.readOnly = true;
+ this.applyAllDeletes = applyAllDeletes; // saved for reopen
+
segmentInfos = (SegmentInfos) infos.clone();// make sure we clone otherwise we share mutable state with IW
this.termInfosIndexDivisor = termInfosIndexDivisor;
if (codecs == null) {
@@ -147,6 +153,7 @@ class DirectoryReader extends IndexReader implements Cloneable {
} else {
this.codecs = codecs;
}
+ readerFinishedListeners = writer.getReaderFinishedListeners();
// IndexWriter synchronizes externally before calling
// us, which ensures infos will not change; so there's
@@ -161,6 +168,7 @@ class DirectoryReader extends IndexReader implements Cloneable {
final SegmentInfo info = infos.info(i);
assert info.dir == dir;
readers[i] = writer.readerPool.getReadOnlyClone(info, true, termInfosIndexDivisor);
+ readers[i].readerFinishedListeners = readerFinishedListeners;
success = true;
} finally {
if (!success) {
@@ -183,11 +191,15 @@ class DirectoryReader extends IndexReader implements Cloneable {
/** This constructor is only used for {@link #reopen()} */
DirectoryReader(Directory directory, SegmentInfos infos, SegmentReader[] oldReaders, int[] oldStarts,
- boolean readOnly, boolean doClone, int termInfosIndexDivisor, CodecProvider codecs) throws IOException {
+ boolean readOnly, boolean doClone, int termInfosIndexDivisor, CodecProvider codecs,
+ Collection<ReaderFinishedListener> readerFinishedListeners) throws IOException {
this.directory = directory;
this.readOnly = readOnly;
this.segmentInfos = infos;
this.termInfosIndexDivisor = termInfosIndexDivisor;
+ this.readerFinishedListeners = readerFinishedListeners;
+ applyAllDeletes = false;
+
if (codecs == null) {
this.codecs = CodecProvider.getDefault();
} else {
@@ -233,8 +245,10 @@ class DirectoryReader extends IndexReader implements Cloneable {
// this is a new reader; in case we hit an exception we can close it safely
newReader = SegmentReader.get(readOnly, infos.info(i), termInfosIndexDivisor);
+ newReader.readerFinishedListeners = readerFinishedListeners;
} else {
newReader = newReaders[i].reopenSegment(infos.info(i), doClone, readOnly);
+ assert newReader.readerFinishedListeners == readerFinishedListeners;
}
if (newReader == newReaders[i]) {
// this reader will be shared between the old and the new one,
@@ -300,25 +314,22 @@ class DirectoryReader extends IndexReader implements Cloneable {
private void initialize(SegmentReader[] subReaders) throws IOException {
this.subReaders = subReaders;
starts = new int[subReaders.length + 1]; // build starts array
-
+ final AtomicReaderContext[] subReaderCtx = new AtomicReaderContext[subReaders.length];
+ topLevelReaderContext = new CompositeReaderContext(this, subReaderCtx, subReaderCtx);
final List<Fields> subFields = new ArrayList<Fields>();
- final List<ReaderUtil.Slice> fieldSlices = new ArrayList<ReaderUtil.Slice>();
-
+
for (int i = 0; i < subReaders.length; i++) {
starts[i] = maxDoc;
+ subReaderCtx[i] = new AtomicReaderContext(topLevelReaderContext, subReaders[i], i, maxDoc, i, maxDoc);
maxDoc += subReaders[i].maxDoc(); // compute maxDocs
if (subReaders[i].hasDeletions()) {
hasDeletions = true;
}
-
- final ReaderUtil.Slice slice = new ReaderUtil.Slice(starts[i], subReaders[i].maxDoc(), i);
- subReaderToSlice.put(subReaders[i], slice);
-
+
final Fields f = subReaders[i].fields();
if (f != null) {
subFields.add(f);
- fieldSlices.add(slice);
}
}
starts[subReaders.length] = maxDoc;
@@ -361,6 +372,7 @@ class DirectoryReader extends IndexReader implements Cloneable {
writeLock = null;
hasChanges = false;
}
+ assert newReader.readerFinishedListeners != null;
return newReader;
}
@@ -395,7 +407,9 @@ class DirectoryReader extends IndexReader implements Cloneable {
// TODO: right now we *always* make a new reader; in
// the future we could have write make some effort to
// detect that no changes have occurred
- return writer.getReader();
+ IndexReader reader = writer.getReader(applyAllDeletes);
+ reader.readerFinishedListeners = readerFinishedListeners;
+ return reader;
}
private IndexReader doReopen(final boolean openReadOnly, IndexCommit commit) throws CorruptIndexException, IOException {
@@ -462,7 +476,7 @@ class DirectoryReader extends IndexReader implements Cloneable {
private synchronized DirectoryReader doReopen(SegmentInfos infos, boolean doClone, boolean openReadOnly) throws CorruptIndexException, IOException {
DirectoryReader reader;
- reader = new DirectoryReader(directory, infos, subReaders, starts, openReadOnly, doClone, termInfosIndexDivisor, codecs);
+ reader = new DirectoryReader(directory, infos, subReaders, starts, openReadOnly, doClone, termInfosIndexDivisor, codecs, readerFinishedListeners);
return reader;
}
@@ -605,12 +619,6 @@ class DirectoryReader extends IndexReader implements Cloneable {
throw new UnsupportedOperationException("please use MultiNorms.norms, or wrap your IndexReader with SlowMultiReaderWrapper, if you really need a top level norms");
}
- @Override
- public synchronized void norms(String field, byte[] result, int offset)
- throws IOException {
- throw new UnsupportedOperationException("please use MultiNorms.norms, or wrap your IndexReader with SlowMultiReaderWrapper, if you really need a top level norms");
- }
-
@Override
protected void doSetNorm(int n, String field, byte value)
throws CorruptIndexException, IOException {
@@ -715,11 +723,18 @@ class DirectoryReader extends IndexReader implements Cloneable {
// case we have to roll back:
startCommit();
+ final SegmentInfos rollbackSegmentInfos = new SegmentInfos();
+ rollbackSegmentInfos.addAll(segmentInfos);
+
boolean success = false;
try {
for (int i = 0; i < subReaders.length; i++)
subReaders[i].commit();
+ // Remove segments that contain only 100% deleted
+ // docs:
+ segmentInfos.pruneDeletedSegments();
+
// Sync all files we just wrote
directory.sync(segmentInfos.files(directory, false));
segmentInfos.commit(directory);
@@ -739,6 +754,10 @@ class DirectoryReader extends IndexReader implements Cloneable {
// partially written .del files, etc, are
// removed):
deleter.refresh();
+
+ // Restore all SegmentInfos (in case we pruned some)
+ segmentInfos.clear();
+ segmentInfos.addAll(rollbackSegmentInfos);
}
}
@@ -815,11 +834,6 @@ class DirectoryReader extends IndexReader implements Cloneable {
}
}
- // NOTE: only needed in case someone had asked for
- // FieldCache for top-level reader (which is generally
- // not a good idea):
- FieldCache.DEFAULT.purge(this);
-
if (writer != null) {
// Since we just closed, writer may now be able to
// delete unused files:
@@ -844,18 +858,18 @@ class DirectoryReader extends IndexReader implements Cloneable {
fieldSet.addAll(names);
}
return fieldSet;
- }
+ }
+
+ @Override
+ public ReaderContext getTopReaderContext() {
+ return topLevelReaderContext;
+ }
@Override
public IndexReader[] getSequentialSubReaders() {
return subReaders;
}
- @Override
- public int getSubReaderDocBase(IndexReader subReader) {
- return subReaderToSlice.get(subReader).start;
- }
-
/** Returns the directory this index resides in. */
@Override
public Directory directory() {
diff --git a/lucene/src/java/org/apache/lucene/index/DocInverterPerField.java b/lucene/src/java/org/apache/lucene/index/DocInverterPerField.java
index 95d64c44136..d360fbfb230 100644
--- a/lucene/src/java/org/apache/lucene/index/DocInverterPerField.java
+++ b/lucene/src/java/org/apache/lucene/index/DocInverterPerField.java
@@ -63,8 +63,6 @@ final class DocInverterPerField extends DocFieldConsumerPerField {
fieldState.reset(docState.doc.getBoost());
- final int maxFieldLength = docState.maxFieldLength;
-
final boolean doInvert = consumer.start(fields, count);
for(int i=0;i= maxFieldLength) {
- if (docState.infoStream != null)
- docState.infoStream.println("maxFieldLength " +maxFieldLength+ " reached for field " + fieldInfo.name + ", ignoring following tokens");
- break;
- }
hasMoreTokens = stream.incrementToken();
}
diff --git a/lucene/src/java/org/apache/lucene/index/DocumentsWriter.java b/lucene/src/java/org/apache/lucene/index/DocumentsWriter.java
index 999fdb117e9..2462803f94d 100644
--- a/lucene/src/java/org/apache/lucene/index/DocumentsWriter.java
+++ b/lucene/src/java/org/apache/lucene/index/DocumentsWriter.java
@@ -30,14 +30,16 @@ import java.util.concurrent.atomic.AtomicLong;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.search.Query;
-import org.apache.lucene.search.Similarity;
+import org.apache.lucene.search.SimilarityProvider;
import org.apache.lucene.store.AlreadyClosedException;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.RAMFile;
import org.apache.lucene.util.ArrayUtil;
+import org.apache.lucene.util.BitVector;
+import org.apache.lucene.util.RamUsageEstimator;
import org.apache.lucene.util.RecyclingByteBlockAllocator;
import org.apache.lucene.util.ThreadInterruptedException;
-import org.apache.lucene.util.RamUsageEstimator;
+
import static org.apache.lucene.util.ByteBlockPool.BYTE_BLOCK_MASK;
import static org.apache.lucene.util.ByteBlockPool.BYTE_BLOCK_SIZE;
@@ -127,22 +129,21 @@ final class DocumentsWriter {
private boolean aborting; // True if an abort is pending
PrintStream infoStream;
- int maxFieldLength = IndexWriterConfig.UNLIMITED_FIELD_LENGTH;
- Similarity similarity;
+ SimilarityProvider similarityProvider;
// max # simultaneous threads; if there are more than
// this, they wait for others to finish first
private final int maxThreadStates;
+ // TODO: cutover to BytesRefHash
// Deletes for our still-in-RAM (to be flushed next) segment
- private SegmentDeletes pendingDeletes = new SegmentDeletes();
+ private BufferedDeletes pendingDeletes = new BufferedDeletes(false);
static class DocState {
DocumentsWriter docWriter;
Analyzer analyzer;
- int maxFieldLength;
PrintStream infoStream;
- Similarity similarity;
+ SimilarityProvider similarityProvider;
int docID;
Document doc;
String maxTermPrefix;
@@ -191,6 +192,7 @@ final class DocumentsWriter {
/**
* Allocate bytes used from shared pool.
*/
+ @Override
protected byte[] newBuffer(int size) {
assert size == PER_DOC_BLOCK_SIZE;
return perDocAllocator.getByteBlock();
@@ -279,16 +281,16 @@ final class DocumentsWriter {
private boolean closed;
private final FieldInfos fieldInfos;
- private final BufferedDeletes bufferedDeletes;
+ private final BufferedDeletesStream bufferedDeletesStream;
private final IndexWriter.FlushControl flushControl;
- DocumentsWriter(Directory directory, IndexWriter writer, IndexingChain indexingChain, int maxThreadStates, FieldInfos fieldInfos, BufferedDeletes bufferedDeletes) throws IOException {
+ DocumentsWriter(Directory directory, IndexWriter writer, IndexingChain indexingChain, int maxThreadStates, FieldInfos fieldInfos, BufferedDeletesStream bufferedDeletesStream) throws IOException {
this.directory = directory;
this.writer = writer;
- this.similarity = writer.getConfig().getSimilarity();
+ this.similarityProvider = writer.getConfig().getSimilarityProvider();
this.maxThreadStates = maxThreadStates;
this.fieldInfos = fieldInfos;
- this.bufferedDeletes = bufferedDeletes;
+ this.bufferedDeletesStream = bufferedDeletesStream;
flushControl = writer.flushControl;
consumer = indexingChain.getChain(this);
@@ -337,6 +339,9 @@ final class DocumentsWriter {
return doFlush;
}
+ // TODO: we could check w/ FreqProxTermsWriter: if the
+ // term doesn't exist, don't bother buffering into the
+ // per-DWPT map (but still must go into the global map)
boolean deleteTerm(Term term, boolean skipWait) {
final boolean doFlush = flushControl.waitUpdate(0, 1, skipWait);
synchronized(this) {
@@ -358,17 +363,10 @@ final class DocumentsWriter {
}
}
- synchronized void setMaxFieldLength(int maxFieldLength) {
- this.maxFieldLength = maxFieldLength;
+ synchronized void setSimilarityProvider(SimilarityProvider similarity) {
+ this.similarityProvider = similarity;
for(int i=0;i BD
+ final long delGen = bufferedDeletesStream.getNextGen();
if (pendingDeletes.any()) {
- if (newSegment != null) {
+ if (segmentInfos.size() > 0 || newSegment != null) {
+ final FrozenBufferedDeletes packet = new FrozenBufferedDeletes(pendingDeletes, delGen);
if (infoStream != null) {
- message("flush: push buffered deletes to newSegment");
+ message("flush: push buffered deletes");
}
- bufferedDeletes.pushDeletes(pendingDeletes, newSegment);
- } else if (segmentInfos.size() > 0) {
+ bufferedDeletesStream.push(packet);
if (infoStream != null) {
- message("flush: push buffered deletes to previously flushed segment " + segmentInfos.lastElement());
+ message("flush: delGen=" + packet.gen);
+ }
+ if (newSegment != null) {
+ newSegment.setBufferedDeletesGen(packet.gen);
}
- bufferedDeletes.pushDeletes(pendingDeletes, segmentInfos.lastElement(), true);
} else {
if (infoStream != null) {
message("flush: drop buffered deletes: no segments");
@@ -534,7 +535,9 @@ final class DocumentsWriter {
// there are no segments, the deletions cannot
// affect anything.
}
- pendingDeletes = new SegmentDeletes();
+ pendingDeletes.clear();
+ } else if (newSegment != null) {
+ newSegment.setBufferedDeletesGen(delGen);
}
}
@@ -546,6 +549,8 @@ final class DocumentsWriter {
// Lock order: IW -> DW
synchronized SegmentInfo flush(IndexWriter writer, IndexFileDeleter deleter, MergePolicy mergePolicy, SegmentInfos segmentInfos) throws IOException {
+ final long startTime = System.currentTimeMillis();
+
// We change writer's segmentInfos:
assert Thread.holdsLock(writer);
@@ -583,6 +588,18 @@ final class DocumentsWriter {
final SegmentWriteState flushState = segWriteState();
+ // Apply delete-by-docID now (delete-byDocID only
+ // happens when an exception is hit processing that
+ // doc, eg if analyzer has some problem w/ the text):
+ if (pendingDeletes.docIDs.size() > 0) {
+ flushState.deletedDocs = new BitVector(numDocs);
+ for(int delDocID : pendingDeletes.docIDs) {
+ flushState.deletedDocs.set(delDocID);
+ }
+ pendingDeletes.bytesUsed.addAndGet(-pendingDeletes.docIDs.size() * BufferedDeletes.BYTES_PER_DEL_DOCID);
+ pendingDeletes.docIDs.clear();
+ }
+
newSegment = new SegmentInfo(segment, numDocs, directory, false, fieldInfos.hasProx(), flushState.segmentCodecs, false);
Collection<DocConsumerPerThread> threads = new HashSet<DocConsumerPerThread>();
@@ -593,10 +610,14 @@ final class DocumentsWriter {
double startMBUsed = bytesUsed()/1024./1024.;
consumer.flush(threads, flushState);
+
newSegment.setHasVectors(flushState.hasVectors);
if (infoStream != null) {
message("new segment has " + (flushState.hasVectors ? "vectors" : "no vectors"));
+ if (flushState.deletedDocs != null) {
+ message("new segment has " + flushState.deletedDocs.count() + " deleted docs");
+ }
message("flushedFiles=" + newSegment.files());
message("flushed codecs=" + newSegment.getSegmentCodecs());
}
@@ -617,6 +638,30 @@ final class DocumentsWriter {
newSegment.setUseCompoundFile(true);
}
+ // Must write deleted docs after the CFS so we don't
+ // slurp the del file into CFS:
+ if (flushState.deletedDocs != null) {
+ final int delCount = flushState.deletedDocs.count();
+ assert delCount > 0;
+ newSegment.setDelCount(delCount);
+ newSegment.advanceDelGen();
+ final String delFileName = newSegment.getDelFileName();
+ boolean success2 = false;
+ try {
+ flushState.deletedDocs.write(directory, delFileName);
+ success2 = true;
+ } finally {
+ if (!success2) {
+ try {
+ directory.deleteFile(delFileName);
+ } catch (Throwable t) {
+ // suppress this so we keep throwing the
+ // original exception
+ }
+ }
+ }
+ }
+
if (infoStream != null) {
message("flush: segment=" + newSegment);
final double newSegmentSizeNoStore = newSegment.sizeInBytes(false)/1024./1024.;
@@ -643,6 +688,9 @@ final class DocumentsWriter {
// Lock order: IW -> DW -> BD
pushDeletes(newSegment, segmentInfos);
+ if (infoStream != null) {
+ message("flush time " + (System.currentTimeMillis()-startTime) + " msec");
+ }
return newSegment;
}
@@ -650,7 +698,7 @@ final class DocumentsWriter {
SegmentWriteState segWriteState() {
return new SegmentWriteState(infoStream, directory, segment, fieldInfos,
numDocs, writer.getConfig().getTermIndexInterval(),
- SegmentCodecs.build(fieldInfos, writer.codecs), bytesUsed);
+ SegmentCodecs.build(fieldInfos, writer.codecs), pendingDeletes, bytesUsed);
}
synchronized void close() {
@@ -909,8 +957,7 @@ final class DocumentsWriter {
final static int BYTE_BLOCK_NOT_MASK = ~BYTE_BLOCK_MASK;
/* if you increase this, you must fix field cache impl for
- * getTerms/getTermsIndex requires <= 32768. Also fix
- * DeltaBytesWriter's TERM_EOF if necessary. */
+ * getTerms/getTermsIndex requires <= 32768. */
final static int MAX_TERM_LENGTH_UTF8 = BYTE_BLOCK_SIZE-2;
/* Initial chunks size of the shared int[] blocks used to
@@ -971,7 +1018,7 @@ final class DocumentsWriter {
final boolean doBalance;
final long deletesRAMUsed;
- deletesRAMUsed = bufferedDeletes.bytesUsed();
+ deletesRAMUsed = bufferedDeletesStream.bytesUsed();
synchronized(this) {
if (ramBufferSize == IndexWriterConfig.DISABLE_AUTO_FLUSH || bufferIsFull) {
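
One effect of the DocumentsWriter changes above is that delete-by-docID is resolved at flush time: the buffered docIDs become set bits in a deleted-docs bit vector sized to the new segment, and the RAM accounting for those IDs is credited back before the IDs are cleared. A hedged sketch of that translation, using java.util.BitSet in place of Lucene's BitVector and a made-up per-docID byte constant:

import java.util.ArrayList;
import java.util.BitSet;
import java.util.List;

// Illustrative only: mirrors the flush-time conversion of buffered docIDs into
// a per-segment deleted-docs bit set (java.util.BitSet stands in for BitVector).
public class FlushDocIdDeletesSketch {
  static final int BYTES_PER_DEL_DOCID = 32;  // hypothetical accounting constant

  public static void main(String[] args) {
    int numDocs = 8;                           // docs in the segment being flushed
    List<Integer> pendingDocIDs = new ArrayList<Integer>();
    pendingDocIDs.add(2);
    pendingDocIDs.add(5);

    long bytesUsed = pendingDocIDs.size() * BYTES_PER_DEL_DOCID;

    BitSet deletedDocs = new BitSet(numDocs);
    for (int delDocID : pendingDocIDs) {
      deletedDocs.set(delDocID);
    }
    // Credit back the RAM that was tracking the buffered docIDs, then clear them.
    bytesUsed -= pendingDocIDs.size() * BYTES_PER_DEL_DOCID;
    pendingDocIDs.clear();

    System.out.println("deleted docs: " + deletedDocs + ", bytesUsed now " + bytesUsed);
  }
}
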
diff --git a/lucene/src/java/org/apache/lucene/index/DocumentsWriterThreadState.java b/lucene/src/java/org/apache/lucene/index/DocumentsWriterThreadState.java
index c9ab3828f3c..611098a64bc 100644
--- a/lucene/src/java/org/apache/lucene/index/DocumentsWriterThreadState.java
+++ b/lucene/src/java/org/apache/lucene/index/DocumentsWriterThreadState.java
@@ -35,9 +35,8 @@ final class DocumentsWriterThreadState {
public DocumentsWriterThreadState(DocumentsWriter docWriter) throws IOException {
this.docWriter = docWriter;
docState = new DocumentsWriter.DocState();
- docState.maxFieldLength = docWriter.maxFieldLength;
docState.infoStream = docWriter.infoStream;
- docState.similarity = docWriter.similarity;
+ docState.similarityProvider = docWriter.similarityProvider;
docState.docWriter = docWriter;
consumer = docWriter.consumer.addThread(this);
}
diff --git a/lucene/src/java/org/apache/lucene/index/FieldInfo.java b/lucene/src/java/org/apache/lucene/index/FieldInfo.java
index 96ace5f1f1b..bfb74209df4 100644
--- a/lucene/src/java/org/apache/lucene/index/FieldInfo.java
+++ b/lucene/src/java/org/apache/lucene/index/FieldInfo.java
@@ -56,7 +56,7 @@ public final class FieldInfo {
this.storeOffsetWithTermVector = false;
this.storePositionWithTermVector = false;
this.storePayloads = false;
- this.omitNorms = true;
+ this.omitNorms = false;
this.omitTermFreqAndPositions = false;
}
}
@@ -86,7 +86,7 @@ public final class FieldInfo {
this.storePayloads = true;
}
if (this.omitNorms != omitNorms) {
- this.omitNorms = false; // once norms are stored, always store
+ this.omitNorms = true; // if one require omitNorms at least once, it remains off for life
}
if (this.omitTermFreqAndPositions != omitTermFreqAndPositions) {
this.omitTermFreqAndPositions = true; // if one require omitTermFreqAndPositions at least once, it remains off for life
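
The two corrected assignments above restore the intended "sticky" behavior: omitNorms starts out false, and once any addition of the field asks to omit norms it stays true for the life of the field, mirroring how omitTermFreqAndPositions already behaved. A minimal stand-alone sketch of that merge rule (a simplified stand-in, not the real FieldInfo):

// Illustrative: once any addition omits norms, the field omits norms for good.
public class OmitNormsSketch {
  private boolean omitNorms = false;  // default: norms are kept

  void update(boolean omitNormsForThisDoc) {
    if (this.omitNorms != omitNormsForThisDoc) {
      // Asymmetric merge: a single "omit" wins permanently; a later "keep" cannot undo it.
      this.omitNorms = true;
    }
  }

  public static void main(String[] args) {
    OmitNormsSketch f = new OmitNormsSketch();
    f.update(false);  // still false
    f.update(true);   // flips to true
    f.update(false);  // stays true
    System.out.println("omitNorms=" + f.omitNorms);
  }
}
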
diff --git a/lucene/src/java/org/apache/lucene/index/FieldInfos.java b/lucene/src/java/org/apache/lucene/index/FieldInfos.java
index 47d21177f58..39a3cbd90c6 100644
--- a/lucene/src/java/org/apache/lucene/index/FieldInfos.java
+++ b/lucene/src/java/org/apache/lucene/index/FieldInfos.java
@@ -284,14 +284,21 @@ public final class FieldInfos {
}
public boolean hasVectors() {
- boolean hasVectors = false;
for (int i = 0; i < size(); i++) {
if (fieldInfo(i).storeTermVector) {
- hasVectors = true;
- break;
+ return true;
}
}
- return hasVectors;
+ return false;
+ }
+
+ public boolean hasNorms() {
+ for (int i = 0; i < size(); i++) {
+ if (!fieldInfo(i).omitNorms) {
+ return true;
+ }
+ }
+ return false;
}
public void write(Directory d, String name) throws IOException {
diff --git a/lucene/src/java/org/apache/lucene/index/FieldInvertState.java b/lucene/src/java/org/apache/lucene/index/FieldInvertState.java
index 9dc9ffcc8b4..8c4e92ad4ea 100644
--- a/lucene/src/java/org/apache/lucene/index/FieldInvertState.java
+++ b/lucene/src/java/org/apache/lucene/index/FieldInvertState.java
@@ -30,6 +30,7 @@ public final class FieldInvertState {
int length;
int numOverlap;
int offset;
+ int maxTermFrequency;
float boost;
AttributeSource attributeSource;
@@ -53,6 +54,7 @@ public final class FieldInvertState {
length = 0;
numOverlap = 0;
offset = 0;
+ maxTermFrequency = 0;
boost = docBoost;
attributeSource = null;
}
@@ -73,6 +75,10 @@ public final class FieldInvertState {
return length;
}
+ public void setLength(int length) {
+ this.length = length;
+ }
+
/**
* Get the number of terms with <code>positionIncrement == 0</code>.
* @return the numOverlap
@@ -81,6 +87,10 @@ public final class FieldInvertState {
return numOverlap;
}
+ public void setNumOverlap(int numOverlap) {
+ this.numOverlap = numOverlap;
+ }
+
/**
* Get end offset of the last processed term.
* @return the offset
@@ -99,6 +109,19 @@ public final class FieldInvertState {
return boost;
}
+ public void setBoost(float boost) {
+ this.boost = boost;
+ }
+
+ /**
+ * Get the maximum term-frequency encountered for any term in the field. A
+ * field containing "the quick brown fox jumps over the lazy dog" would have
+ * a value of 2, because "the" appears twice.
+ */
+ public int getMaxTermFrequency() {
+ return maxTermFrequency;
+ }
+
public AttributeSource getAttributeSource() {
return attributeSource;
}
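
getMaxTermFrequency is documented with the "quick brown fox" sentence: the statistic is simply the largest per-term count within one field of one document. A small illustrative sketch of computing it outside the indexing chain:

import java.util.HashMap;
import java.util.Map;

// Illustrative only: max within-document term frequency for one field's tokens.
public class MaxTermFrequencySketch {
  static int maxTermFrequency(String[] tokens) {
    Map<String, Integer> counts = new HashMap<String, Integer>();
    int max = 0;
    for (String token : tokens) {
      Integer prev = counts.get(token);
      int freq = (prev == null ? 0 : prev.intValue()) + 1;
      counts.put(token, freq);
      if (freq > max) {
        max = freq;
      }
    }
    return max;
  }

  public static void main(String[] args) {
    String[] tokens = "the quick brown fox jumps over the lazy dog".split(" ");
    System.out.println(maxTermFrequency(tokens));  // 2, because "the" appears twice
  }
}
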
diff --git a/lucene/src/java/org/apache/lucene/index/Fields.java b/lucene/src/java/org/apache/lucene/index/Fields.java
index f3fe6542775..20e7176f4ec 100644
--- a/lucene/src/java/org/apache/lucene/index/Fields.java
+++ b/lucene/src/java/org/apache/lucene/index/Fields.java
@@ -30,7 +30,7 @@ public abstract class Fields {
* names. This will not return null. */
public abstract FieldsEnum iterator() throws IOException;
- /** Get the {@link Terms} for this field. This may return
+ /** Get the {@link Terms} for this field. This will return
* null if the field does not exist. */
public abstract Terms terms(String field) throws IOException;
diff --git a/lucene/src/java/org/apache/lucene/index/FieldsReader.java b/lucene/src/java/org/apache/lucene/index/FieldsReader.java
index 96b58120e50..76c0ed23552 100644
--- a/lucene/src/java/org/apache/lucene/index/FieldsReader.java
+++ b/lucene/src/java/org/apache/lucene/index/FieldsReader.java
@@ -37,8 +37,10 @@ import java.io.Reader;
* Class responsible for access to stored document fields.
*
* It uses <segment>.fdt and <segment>.fdx; files.
+ *
+ * @lucene.internal
*/
-final class FieldsReader implements Cloneable {
+public final class FieldsReader implements Cloneable {
private final static int FORMAT_SIZE = 4;
private final FieldInfos fieldInfos;
@@ -74,6 +76,23 @@ final class FieldsReader implements Cloneable {
ensureOpen();
return new FieldsReader(fieldInfos, numTotalDocs, size, format, docStoreOffset, cloneableFieldsStream, cloneableIndexStream);
}
+
+ /** Verifies that the code version which wrote the segment is supported. */
+ public static void checkCodeVersion(Directory dir, String segment) throws IOException {
+ final String indexStreamFN = IndexFileNames.segmentFileName(segment, "", IndexFileNames.FIELDS_INDEX_EXTENSION);
+ IndexInput idxStream = dir.openInput(indexStreamFN, 1024);
+
+ try {
+ int format = idxStream.readInt();
+ if (format < FieldsWriter.FORMAT_MINIMUM)
+ throw new IndexFormatTooOldException(indexStreamFN, format, FieldsWriter.FORMAT_MINIMUM, FieldsWriter.FORMAT_CURRENT);
+ if (format > FieldsWriter.FORMAT_CURRENT)
+ throw new IndexFormatTooNewException(indexStreamFN, format, FieldsWriter.FORMAT_MINIMUM, FieldsWriter.FORMAT_CURRENT);
+ } finally {
+ idxStream.close();
+ }
+
+ }
// Used only by clone
private FieldsReader(FieldInfos fieldInfos, int numTotalDocs, int size, int format, int docStoreOffset,
@@ -89,11 +108,11 @@ final class FieldsReader implements Cloneable {
indexStream = (IndexInput) cloneableIndexStream.clone();
}
- FieldsReader(Directory d, String segment, FieldInfos fn) throws IOException {
+ public FieldsReader(Directory d, String segment, FieldInfos fn) throws IOException {
this(d, segment, fn, BufferedIndexInput.BUFFER_SIZE, -1, 0);
}
- FieldsReader(Directory d, String segment, FieldInfos fn, int readBufferSize, int docStoreOffset, int size) throws IOException {
+ public FieldsReader(Directory d, String segment, FieldInfos fn, int readBufferSize, int docStoreOffset, int size) throws IOException {
boolean success = false;
isOriginal = true;
try {
@@ -157,7 +176,7 @@ final class FieldsReader implements Cloneable {
*
* @throws IOException
*/
- final void close() throws IOException {
+ public final void close() throws IOException {
if (!closed) {
if (fieldsStream != null) {
fieldsStream.close();
@@ -178,7 +197,7 @@ final class FieldsReader implements Cloneable {
}
}
- final int size() {
+ public final int size() {
return size;
}
@@ -186,7 +205,7 @@ final class FieldsReader implements Cloneable {
indexStream.seek(FORMAT_SIZE + (docID + docStoreOffset) * 8L);
}
- final Document doc(int n, FieldSelector fieldSelector) throws CorruptIndexException, IOException {
+ public final Document doc(int n, FieldSelector fieldSelector) throws CorruptIndexException, IOException {
seekIndex(n);
long position = indexStream.readLong();
fieldsStream.seek(position);
@@ -237,7 +256,7 @@ final class FieldsReader implements Cloneable {
* contiguous range of length numDocs starting with
* startDocID. Returns the IndexInput (the fieldStream),
* already seeked to the starting point for startDocID.*/
- final IndexInput rawDocs(int[] lengths, int startDocID, int numDocs) throws IOException {
+ public final IndexInput rawDocs(int[] lengths, int startDocID, int numDocs) throws IOException {
seekIndex(startDocID);
long startOffset = indexStream.readLong();
long lastOffset = startOffset;
diff --git a/lucene/src/java/org/apache/lucene/index/FilterIndexReader.java b/lucene/src/java/org/apache/lucene/index/FilterIndexReader.java
index 3393a71fdfb..4dc7cfee89e 100644
--- a/lucene/src/java/org/apache/lucene/index/FilterIndexReader.java
+++ b/lucene/src/java/org/apache/lucene/index/FilterIndexReader.java
@@ -19,17 +19,19 @@ package org.apache.lucene.index;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.FieldSelector;
+import org.apache.lucene.index.IndexReader.ReaderContext;
import org.apache.lucene.index.values.DocValues;
import org.apache.lucene.index.values.DocValuesEnum;
import org.apache.lucene.store.Directory;
import org.apache.lucene.util.Bits;
-import org.apache.lucene.search.FieldCache; // not great (circular); used only to purge FieldCache entry on close
import org.apache.lucene.util.BytesRef;
+import org.apache.lucene.util.MapBackedSet;
import java.io.IOException;
import java.util.Collection;
import java.util.Map;
import java.util.Comparator;
+import java.util.concurrent.ConcurrentHashMap;
/** A FilterIndexReader
contains another IndexReader, which it
* uses as its basic source of data, possibly transforming the data along the
@@ -105,6 +107,11 @@ public class FilterIndexReader extends IndexReader {
public long getUniqueTermCount() throws IOException {
return in.getUniqueTermCount();
}
+
+ @Override
+ public long getSumTotalTermFreq() throws IOException {
+ return in.getSumTotalTermFreq();
+ }
}
/** Base class for filtering {@link TermsEnum} implementations. */
@@ -141,11 +148,6 @@ public class FilterIndexReader extends IndexReader {
return in.seek(text, useCache);
}
- @Override
- public void cacheCurrentTerm() throws IOException {
- in.cacheCurrentTerm();
- }
-
@Override
public SeekStatus seek(long ord) throws IOException {
return in.seek(ord);
@@ -167,10 +169,15 @@ public class FilterIndexReader extends IndexReader {
}
@Override
- public int docFreq() {
+ public int docFreq() throws IOException {
return in.docFreq();
}
+ @Override
+ public long totalTermFreq() throws IOException {
+ return in.totalTermFreq();
+ }
+
@Override
public DocsEnum docs(Bits skipDocs, DocsEnum reuse) throws IOException {
return in.docs(skipDocs, reuse);
@@ -185,6 +192,16 @@ public class FilterIndexReader extends IndexReader {
public Comparator<BytesRef> getComparator() throws IOException {
return in.getComparator();
}
+
+ @Override
+ public void seek(BytesRef term, TermState state) throws IOException {
+ in.seek(term, state);
+ }
+
+ @Override
+ public TermState termState() throws IOException {
+ return in.termState();
+ }
}
/** Base class for filtering {@link DocsEnum} implementations. */
@@ -282,6 +299,7 @@ public class FilterIndexReader extends IndexReader {
public FilterIndexReader(IndexReader in) {
super();
this.in = in;
+ readerFinishedListeners = new MapBackedSet<ReaderFinishedListener>(new ConcurrentHashMap<ReaderFinishedListener,Boolean>());
}
@Override
@@ -361,12 +379,6 @@ public class FilterIndexReader extends IndexReader {
return in.norms(f);
}
- @Override
- public void norms(String f, byte[] bytes, int offset) throws IOException {
- ensureOpen();
- in.norms(f, bytes, offset);
- }
-
@Override
protected void doSetNorm(int d, String f, byte b) throws CorruptIndexException, IOException {
in.setNorm(d, f, b);
@@ -393,11 +405,6 @@ public class FilterIndexReader extends IndexReader {
@Override
protected void doClose() throws IOException {
in.close();
-
- // NOTE: only needed in case someone had asked for
- // FieldCache for top-level reader (which is generally
- // not a good idea):
- FieldCache.DEFAULT.purge(this);
}
@@ -429,6 +436,11 @@ public class FilterIndexReader extends IndexReader {
public IndexReader[] getSequentialSubReaders() {
return in.getSequentialSubReaders();
}
+
+ @Override
+ public ReaderContext getTopReaderContext() {
+ return in.getTopReaderContext();
+ }
@Override
public Fields fields() throws IOException {
@@ -451,4 +463,16 @@ public class FilterIndexReader extends IndexReader {
buffer.append(')');
return buffer.toString();
}
-}
\ No newline at end of file
+
+ @Override
+ public void addReaderFinishedListener(ReaderFinishedListener listener) {
+ super.addReaderFinishedListener(listener);
+ in.addReaderFinishedListener(listener);
+ }
+
+ @Override
+ public void removeReaderFinishedListener(ReaderFinishedListener listener) {
+ super.removeReaderFinishedListener(listener);
+ in.removeReaderFinishedListener(listener);
+ }
+}
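A minimal sketch, assuming the FilterIndexReader API as shown in this hunk: a delegating subclass only needs to call super(in) to pick up the new readerFinishedListeners wiring and the forwarded listener registration. The class name and logging below are illustrative only, not part of the patch.

import java.io.IOException;
import org.apache.lucene.index.FilterIndexReader;
import org.apache.lucene.index.IndexReader;

class TracingIndexReader extends FilterIndexReader {
  public TracingIndexReader(IndexReader in) {
    super(in); // wires readerFinishedListeners and forwards add/remove to "in"
  }

  @Override
  protected void doClose() throws IOException {
    System.out.println("closing wrapped reader " + in);
    super.doClose(); // delegates the actual close to the wrapped reader
  }
}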
diff --git a/lucene/src/java/org/apache/lucene/index/FreqProxTermsWriter.java b/lucene/src/java/org/apache/lucene/index/FreqProxTermsWriter.java
index d749d24b555..d342cb47249 100644
--- a/lucene/src/java/org/apache/lucene/index/FreqProxTermsWriter.java
+++ b/lucene/src/java/org/apache/lucene/index/FreqProxTermsWriter.java
@@ -20,13 +20,15 @@ package org.apache.lucene.index;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Collection;
+import java.util.Comparator;
import java.util.List;
import java.util.Map;
-import java.util.Comparator;
-import org.apache.lucene.index.codecs.PostingsConsumer;
import org.apache.lucene.index.codecs.FieldsConsumer;
+import org.apache.lucene.index.codecs.PostingsConsumer;
+import org.apache.lucene.index.codecs.TermStats;
import org.apache.lucene.index.codecs.TermsConsumer;
+import org.apache.lucene.util.BitVector;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.CollectionUtil;
@@ -107,7 +109,7 @@ final class FreqProxTermsWriter extends TermsHashConsumer {
// If this field has postings then add them to the
// segment
- appendPostings(fields, consumer);
+ appendPostings(fieldName, state, fields, consumer);
for(int i=0;i ent : deletes.queries.entrySet()) {
+ queries[upto] = ent.getKey();
+ queryLimits[upto] = ent.getValue();
+ upto++;
+ }
+ bytesUsed = terms.length * BYTES_PER_DEL_TERM + queries.length * BYTES_PER_DEL_QUERY;
+ numTermDeletes = deletes.numTermDeletes.get();
+ this.gen = gen;
+ }
+
+ public Iterable<Term> termsIterable() {
+ return new Iterable<Term>() {
+ // @Override -- not until Java 1.6
+ public Iterator<Term> iterator() {
+ return new Iterator<Term>() {
+ private int upto;
+
+ // @Override -- not until Java 1.6
+ public boolean hasNext() {
+ return upto < terms.length;
+ }
+
+ // @Override -- not until Java 1.6
+ public Term next() {
+ return terms[upto++];
+ }
+
+ // @Override -- not until Java 1.6
+ public void remove() {
+ throw new UnsupportedOperationException();
+ }
+ };
+ }
+ };
+ }
+
+ public Iterable<QueryAndLimit> queriesIterable() {
+ return new Iterable<QueryAndLimit>() {
+ // @Override -- not until Java 1.6
+ public Iterator<QueryAndLimit> iterator() {
+ return new Iterator<QueryAndLimit>() {
+ private int upto;
+
+ // @Override -- not until Java 1.6
+ public boolean hasNext() {
+ return upto < queries.length;
+ }
+
+ // @Override -- not until Java 1.6
+ public QueryAndLimit next() {
+ QueryAndLimit ret = new QueryAndLimit(queries[upto], queryLimits[upto]);
+ upto++;
+ return ret;
+ }
+
+ // @Override -- not until Java 1.6
+ public void remove() {
+ throw new UnsupportedOperationException();
+ }
+ };
+ }
+ };
+ }
+
+ @Override
+ public String toString() {
+ String s = "";
+ if (numTermDeletes != 0) {
+ s += " " + numTermDeletes + " deleted terms (unique count=" + terms.length + ")";
+ }
+ if (queries.length != 0) {
+ s += " " + queries.length + " deleted queries";
+ }
+ if (bytesUsed != 0) {
+ s += " bytesUsed=" + bytesUsed;
+ }
+
+ return s;
+ }
+
+ boolean any() {
+ return terms.length > 0 || queries.length > 0;
+ }
+}
diff --git a/lucene/src/java/org/apache/lucene/index/IndexFileNames.java b/lucene/src/java/org/apache/lucene/index/IndexFileNames.java
index ef9c4b419c6..4f14170bdfc 100644
--- a/lucene/src/java/org/apache/lucene/index/IndexFileNames.java
+++ b/lucene/src/java/org/apache/lucene/index/IndexFileNames.java
@@ -204,7 +204,7 @@ public final class IndexFileNames {
/**
* Returns true if the given filename ends with the given extension. One
- * should provide a pure extension, withouth '.'.
+ * should provide a pure extension, without '.'.
*/
public static boolean matchesExtension(String filename, String ext) {
// It doesn't make a difference whether we allocate a StringBuilder ourself
diff --git a/lucene/src/java/org/apache/lucene/index/IndexFormatTooOldException.java b/lucene/src/java/org/apache/lucene/index/IndexFormatTooOldException.java
index 9be38a91e2a..b8f9356cfd4 100644
--- a/lucene/src/java/org/apache/lucene/index/IndexFormatTooOldException.java
+++ b/lucene/src/java/org/apache/lucene/index/IndexFormatTooOldException.java
@@ -23,10 +23,15 @@ package org.apache.lucene.index;
*/
public class IndexFormatTooOldException extends CorruptIndexException {
+ public IndexFormatTooOldException(String filename, String version) {
+ super("Format version is not supported" + (filename!=null ? (" in file '" + filename + "'") : "") +
+ ": " + version + ". This version of Lucene only supports indexes created with release 3.0 and later.");
+ }
+
public IndexFormatTooOldException(String filename, int version, int minVersion, int maxVersion) {
super("Format version is not supported" + (filename!=null ? (" in file '" + filename + "'") : "") +
- ": " + version + " (needs to be between " + minVersion + " and " + maxVersion +
- "). This version of Lucene only supports indexes created with release 3.0 and later.");
+ ": " + version + " (needs to be between " + minVersion + " and " + maxVersion +
+ "). This version of Lucene only supports indexes created with release 3.0 and later.");
}
}
diff --git a/lucene/src/java/org/apache/lucene/index/IndexNotFoundException.java b/lucene/src/java/org/apache/lucene/index/IndexNotFoundException.java
index 5e7107448b8..dc0a6fa0d1e 100644
--- a/lucene/src/java/org/apache/lucene/index/IndexNotFoundException.java
+++ b/lucene/src/java/org/apache/lucene/index/IndexNotFoundException.java
@@ -21,7 +21,7 @@ import java.io.FileNotFoundException;
/**
* Signals that no index was found in the Directory. Possibly because the
- * directory is empty, however can slso indicate an index corruption.
+ * directory is empty, however can also indicate an index corruption.
*/
public final class IndexNotFoundException extends FileNotFoundException {
diff --git a/lucene/src/java/org/apache/lucene/index/IndexReader.java b/lucene/src/java/org/apache/lucene/index/IndexReader.java
index 7f2aa6f8945..0a014543eca 100644
--- a/lucene/src/java/org/apache/lucene/index/IndexReader.java
+++ b/lucene/src/java/org/apache/lucene/index/IndexReader.java
@@ -19,6 +19,7 @@ package org.apache.lucene.index;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.FieldSelector;
+import org.apache.lucene.search.FieldCache; // javadocs
import org.apache.lucene.search.Similarity;
import org.apache.lucene.index.codecs.Codec;
import org.apache.lucene.index.codecs.CodecProvider;
@@ -82,6 +83,62 @@ import java.util.concurrent.atomic.AtomicInteger;
*/
public abstract class IndexReader implements Cloneable,Closeable {
+ /**
+ * A custom listener that's invoked when the IndexReader
+ * is finished.
+ *
+ * For a SegmentReader, this listener is called only
+ * once all SegmentReaders sharing the same core are
+ * closed. At this point it is safe for apps to evict
+ * this reader from any caches keyed on {@link
+ * #getCoreCacheKey}. This is the same interface that
+ * {@link FieldCache} uses, internally, to evict
+ * entries.
+ *
+ * For other readers, this listener is called when they
+ * are closed.
+ *
+ * @lucene.experimental
+ */
+ public static interface ReaderFinishedListener {
+ public void finished(IndexReader reader);
+ }
+
+ // Impls must set this if they may call add/removeReaderFinishedListener:
+ protected volatile Collection<ReaderFinishedListener> readerFinishedListeners;
+
+ /** Expert: adds a {@link ReaderFinishedListener}. The
+ * provided listener is also added to any sub-readers, if
+ * this is a composite reader. Also, any reader reopened
+ * or cloned from this one will also copy the listeners at
+ * the time of reopen.
+ *
+ * @lucene.experimental */
+ public void addReaderFinishedListener(ReaderFinishedListener listener) {
+ readerFinishedListeners.add(listener);
+ }
+
+ /** Expert: remove a previously added {@link ReaderFinishedListener}.
+ *
+ * @lucene.experimental */
+ public void removeReaderFinishedListener(ReaderFinishedListener listener) {
+ readerFinishedListeners.remove(listener);
+ }
+
+ protected void notifyReaderFinishedListeners() {
+ // Defensive (should never be null -- all impls must set
+ // this):
+ if (readerFinishedListeners != null) {
+ for(ReaderFinishedListener listener : readerFinishedListeners) {
+ listener.finished(this);
+ }
+ }
+ }
+
+ protected void readerFinished() {
+ notifyReaderFinishedListeners();
+ }
+
/**
* Constants describing field properties, for example used for
* {@link IndexReader#getFieldNames(FieldOption)}.
@@ -199,6 +256,7 @@ public abstract class IndexReader implements Cloneable,Closeable {
refCount.incrementAndGet();
}
}
+ readerFinished();
}
}
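A minimal sketch of the listener API added above, assuming reader is an open IndexReader; the cache map is illustrative only.

final java.util.Map<Object,Object> perCoreCache =
    new java.util.concurrent.ConcurrentHashMap<Object,Object>();

reader.addReaderFinishedListener(new IndexReader.ReaderFinishedListener() {
  public void finished(IndexReader r) {       // no @Override: not allowed on interface impls before Java 6
    perCoreCache.remove(r.getCoreCacheKey()); // safe eviction point per the javadoc above
  }
});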
@@ -242,24 +300,26 @@ public abstract class IndexReader implements Cloneable,Closeable {
/**
* Open a near real time IndexReader from the {@link org.apache.lucene.index.IndexWriter}.
*
- *
* @param writer The IndexWriter to open from
+ * @param applyAllDeletes If true, all buffered deletes will
+ * be applied (made visible) in the returned reader. If
+ * false, the deletes are not applied but remain buffered
+ * (in IndexWriter) so that they will be applied in the
+ * future. Applying deletes can be costly, so if your app
+ * can tolerate deleted documents being returned you might
+ * gain some performance by passing false.
* @return The new IndexReader
* @throws CorruptIndexException
* @throws IOException if there is a low-level IO error
*
- * @see #reopen(IndexWriter)
+ * @see #reopen(IndexWriter,boolean)
*
* @lucene.experimental
*/
- public static IndexReader open(final IndexWriter writer) throws CorruptIndexException, IOException {
- return writer.getReader();
+ public static IndexReader open(final IndexWriter writer, boolean applyAllDeletes) throws CorruptIndexException, IOException {
+ return writer.getReader(applyAllDeletes);
}
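A usage sketch for the two-argument open added above (and the matching reopen further down), assuming writer is an open IndexWriter; the refresh pattern is illustrative.

IndexReader nrtReader = IndexReader.open(writer, true);   // buffered deletes applied
// ... index more documents through writer ...
IndexReader newReader = nrtReader.reopen(writer, false);  // cheaper refresh, may still return deleted docs
if (newReader != nrtReader) {
  nrtReader.close();
  nrtReader = newReader;
}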
-
-
-
-
/** Expert: returns an IndexReader reading the index in the given
* {@link IndexCommit}. You should pass readOnly=true, since it
* gives much better concurrent performance, unless you
@@ -305,7 +365,7 @@ public abstract class IndexReader implements Cloneable,Closeable {
* @param readOnly true if no changes (deletions, norms) will be made with this IndexReader
* @param termInfosIndexDivisor Subsamples which indexed
* terms are loaded into RAM. This has the same effect as {@link
- * IndexWriter#setTermIndexInterval} except that setting
+ * IndexWriterConfig#setTermIndexInterval} except that setting
* must be done at indexing time while this setting can be
* set per reader. When set to N, then one in every
* N*termIndexInterval terms in the index is loaded into
@@ -355,14 +415,17 @@ public abstract class IndexReader implements Cloneable,Closeable {
* @param readOnly true if no changes (deletions, norms) will be made with this IndexReader
* @param termInfosIndexDivisor Subsamples which indexed
* terms are loaded into RAM. This has the same effect as {@link
- * IndexWriter#setTermIndexInterval} except that setting
+ * IndexWriterConfig#setTermIndexInterval} except that setting
* must be done at indexing time while this setting can be
* set per reader. When set to N, then one in every
* N*termIndexInterval terms in the index is loaded into
* memory. By setting this to a value > 1 you can reduce
* memory usage, at the expense of higher latency when
* loading a TermInfo. The default value is 1. Set this
- * to -1 to skip loading the terms index entirely.
+ * to -1 to skip loading the terms index entirely. This is only useful in
+ * advanced situations when you will only .next() through all terms;
+ * attempts to seek will hit an exception.
+ *
* @throws CorruptIndexException if the index is corrupt
* @throws IOException if there is a low-level IO error
*/
@@ -384,7 +447,7 @@ public abstract class IndexReader implements Cloneable,Closeable {
* @param readOnly true if no changes (deletions, norms) will be made with this IndexReader
* @param termInfosIndexDivisor Subsamples which indexed
* terms are loaded into RAM. This has the same effect as {@link
- * IndexWriter#setTermIndexInterval} except that setting
+ * IndexWriterConfig#setTermIndexInterval} except that setting
* must be done at indexing time while this setting can be
* set per reader. When set to N, then one in every
* N*termIndexInterval terms in the index is loaded into
@@ -417,7 +480,7 @@ public abstract class IndexReader implements Cloneable,Closeable {
* @param readOnly true if no changes (deletions, norms) will be made with this IndexReader
* @param termInfosIndexDivisor Subsamples which indexed
* terms are loaded into RAM. This has the same effect as {@link
- * IndexWriter#setTermIndexInterval} except that setting
+ * IndexWriterConfig#setTermIndexInterval} except that setting
* must be done at indexing time while this setting can be
* set per reader. When set to N, then one in every
* N*termIndexInterval terms in the index is loaded into
@@ -546,7 +609,7 @@ public abstract class IndexReader implements Cloneable,Closeable {
* file descriptors, CPU time) will be consumed.
*
* For lower latency on reopening a reader, you should
- * call {@link #setMergedSegmentWarmer} to
+ * call {@link IndexWriterConfig#setMergedSegmentWarmer} to
* pre-warm a newly merged segment before it's committed
* to the index. This is important for minimizing
* index-to-search delay after a large merge.
@@ -561,18 +624,26 @@ public abstract class IndexReader implements Cloneable,Closeable {
* if you attempt to reopen any of those readers, you'll
* hit an {@link AlreadyClosedException}.
*
- * @lucene.experimental
- *
* @return IndexReader that covers entire index plus all
* changes made so far by this IndexWriter instance
*
+ * @param writer The IndexWriter to open from
+ * @param applyAllDeletes If true, all buffered deletes will
+ * be applied (made visible) in the returned reader. If
+ * false, the deletes are not applied but remain buffered
+ * (in IndexWriter) so that they will be applied in the
+ * future. Applying deletes can be costly, so if your app
+ * can tolerate deleted documents being returned you might
+ * gain some performance by passing false.
+ *
* @throws IOException
+ *
+ * @lucene.experimental
*/
- public IndexReader reopen(IndexWriter writer) throws CorruptIndexException, IOException {
- return writer.getReader();
+ public IndexReader reopen(IndexWriter writer, boolean applyAllDeletes) throws CorruptIndexException, IOException {
+ return writer.getReader(applyAllDeletes);
}
-
/**
* Efficiently clones the IndexReader (sharing most
* internal state).
@@ -935,14 +1006,6 @@ public abstract class IndexReader implements Cloneable,Closeable {
*/
public abstract byte[] norms(String field) throws IOException;
- /** Reads the byte-encoded normalization factor for the named field of every
- * document. This is used by the search code to score documents.
- *
- * @see org.apache.lucene.document.Field#setBoost(float)
- */
- public abstract void norms(String field, byte[] bytes, int offset)
- throws IOException;
-
/** Expert: Resets the normalization factor for the named field of the named
* document. The norm represents the product of the field's {@link
* org.apache.lucene.document.Fieldable#setBoost(float) boost} and its {@link Similarity#lengthNorm(String,
@@ -974,26 +1037,6 @@ public abstract class IndexReader implements Cloneable,Closeable {
protected abstract void doSetNorm(int doc, String field, byte value)
throws CorruptIndexException, IOException;
- /** Expert: Resets the normalization factor for the named field of the named
- * document.
- *
- * @see #norms(String)
- * @see Similarity#decodeNormValue(byte)
- *
- * @throws StaleReaderException if the index has changed
- * since this reader was opened
- * @throws CorruptIndexException if the index is corrupt
- * @throws LockObtainFailedException if another writer
- * has this index open (write.lock could not
- * be obtained)
- * @throws IOException if there is a low-level IO error
- */
- public void setNorm(int doc, String field, float value)
- throws StaleReaderException, CorruptIndexException, LockObtainFailedException, IOException {
- ensureOpen();
- setNorm(doc, field, Similarity.getDefault().encodeNormValue(value));
- }
-
/** Flex API: returns {@link Fields} for this reader.
* This method may return null if the reader has no
* postings.
@@ -1029,6 +1072,23 @@ public abstract class IndexReader implements Cloneable,Closeable {
return terms.docFreq(term);
}
+ /** Returns the total number of occurrences of the term
+ * t across all documents. This method returns 0 if the term or
+ * field does not exist. This method does not take into
+ * account deleted documents that have not yet been merged
+ * away. */
+ public long totalTermFreq(String field, BytesRef term) throws IOException {
+ final Fields fields = fields();
+ if (fields == null) {
+ return 0;
+ }
+ final Terms terms = fields.terms(field);
+ if (terms == null) {
+ return 0;
+ }
+ return terms.totalTermFreq(term);
+ }
+
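A small sketch contrasting the new totalTermFreq with docFreq, assuming the existing docFreq(String, BytesRef) overload and an open reader; the field and term are illustrative.

BytesRef t = new BytesRef("lucene");
int docsWithTerm = reader.docFreq("body", t);        // number of documents containing the term
long occurrences = reader.totalTermFreq("body", t);  // total occurrences summed over those documents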
/** This may return null if the field does not exist.*/
public Terms terms(String field) throws IOException {
final Fields fields = fields();
@@ -1074,6 +1134,47 @@ public abstract class IndexReader implements Cloneable,Closeable {
return null;
}
}
+
+ /**
+ * Returns {@link DocsEnum} for the specified field and
+ * {@link TermState}. This may return null if either the field or the term
+ * does not exist or the {@link TermState} is invalid for the underlying
+ * implementation. */
+ public DocsEnum termDocsEnum(Bits skipDocs, String field, BytesRef term, TermState state) throws IOException {
+ assert state != null;
+ assert field != null;
+ final Fields fields = fields();
+ if (fields == null) {
+ return null;
+ }
+ final Terms terms = fields.terms(field);
+ if (terms != null) {
+ return terms.docs(skipDocs, term, state, null);
+ } else {
+ return null;
+ }
+ }
+
+ /**
+ * Returns {@link DocsAndPositionsEnum} for the specified field and
+ * {@link TermState}. This may return null if either the field or the term
+ * does not exist, the {@link TermState} is invalid for the underlying
+ * implementation, or positions were not stored for this term. */
+ public DocsAndPositionsEnum termPositionsEnum(Bits skipDocs, String field, BytesRef term, TermState state) throws IOException {
+ assert state != null;
+ assert field != null;
+ final Fields fields = fields();
+ if (fields == null) {
+ return null;
+ }
+ final Terms terms = fields.terms(field);
+ if (terms != null) {
+ return terms.docsAndPositions(skipDocs, term, state, null);
+ } else {
+ return null;
+ }
+ }
+
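A hedged sketch of the TermState-based lookups added above, assuming a flex TermsEnum obtained from the same reader; the field and term are illustrative and error handling is omitted.

Terms terms = reader.terms("body");
if (terms != null) {
  TermsEnum termsEnum = terms.iterator();
  if (termsEnum.seek(new BytesRef("lucene")) == TermsEnum.SeekStatus.FOUND) {
    TermState cached = termsEnum.termState();  // capture the term's state once
    DocsEnum docs = reader.termDocsEnum(reader.getDeletedDocs(), "body",
                                        new BytesRef("lucene"), cached);
    while (docs != null && docs.nextDoc() != DocsEnum.NO_MORE_DOCS) {
      // process docs.docID()
    }
  }
}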
/** Deletes the document numbered docNum. Once a document is
* deleted it will not appear in TermDocs or TermPositions enumerations.
@@ -1137,7 +1238,16 @@ public abstract class IndexReader implements Cloneable,Closeable {
return n;
}
- /** Undeletes all documents currently marked as deleted in this index.
+ /** Undeletes all documents currently marked as deleted in
+ * this index.
+ *
+ * NOTE: this method can only recover documents marked
+ * for deletion but not yet removed from the index; when
+ * and how Lucene removes deleted documents is an
+ * implementation detail, subject to change from release
+ * to release. However, you can use {@link
+ * #numDeletedDocs} on the current IndexReader instance to
+ * see how many documents will be un-deleted.
*
* @throws StaleReaderException if the index has changed
* since this reader was opened
@@ -1360,9 +1470,7 @@ public abstract class IndexReader implements Cloneable,Closeable {
}
/** Expert: returns the sequential sub readers that this
- * reader is logically composed of. For example,
- * IndexSearcher uses this API to drive searching by one
- * sub reader at a time. If this reader is not composed
+ * reader is logically composed of. If this reader is not composed
* of sequential child readers, it should return null.
* If this method returns an empty array, that means this
* reader is a null reader (for example a MultiReader
@@ -1377,12 +1485,33 @@ public abstract class IndexReader implements Cloneable,Closeable {
public IndexReader[] getSequentialSubReaders() {
return null;
}
-
-
- /** Expert: returns the docID base for this subReader. */
- public int getSubReaderDocBase(IndexReader subReader) {
- throw new UnsupportedOperationException();
- }
+
+ /**
+ * Expert: Returns the root {@link ReaderContext} for this
+ * {@link IndexReader}'s sub-reader tree. Iff this reader is composed of sub-readers,
+ * i.e. this reader is a composite reader, this method returns a
+ * {@link CompositeReaderContext} holding the reader's direct children as well as a
+ * view of the reader tree's atomic leaf contexts. All sub-
+ * {@link ReaderContext} instances referenced from this reader's top-level
+ * context are private to this reader and are not shared with another context
+ * tree. For example, IndexSearcher uses this API to drive searching by one
+ * atomic leaf reader at a time. If this reader is not composed of child
+ * readers, this method returns an {@link AtomicReaderContext}.
+ *
+ * Note: Any of the sub-{@link CompositeReaderContext} instances referenced from this
+ * top-level context hold a null {@link CompositeReaderContext#leaves}
+ * reference. Only the top-level context maintains the convenience leaf-view
+ * for performance reasons.
+ *
+ * NOTE: You should not try using sub-readers returned by this method to make
+ * any changes (setNorm, deleteDocument, etc.). While this might succeed for
+ * one composite reader (like MultiReader), it will most likely lead to index
+ * corruption for other readers (like DirectoryReader obtained through
+ * {@link #open}). Use the top-level context's reader directly.
+ *
+ * @lucene.experimental
+ */
+ public abstract ReaderContext getTopReaderContext();
/** Expert */
public Object getCoreCacheKey() {
@@ -1442,4 +1571,132 @@ public abstract class IndexReader implements Cloneable,Closeable {
Fields retrieveFields() {
return fields;
}
+
+ /**
+ * A struct-like class that represents a hierarchical relationship between
+ * {@link IndexReader} instances.
+ * @lucene.experimental
+ */
+ public static abstract class ReaderContext {
+ /** The reader context for this reader's immediate parent, or null if none */
+ public final ReaderContext parent;
+ /** The actual reader */
+ public final IndexReader reader;
+ /** true iff the reader is an atomic reader */
+ public final boolean isAtomic;
+ /** true if this context struct represents the top level reader within the hierarchical context */
+ public final boolean isTopLevel;
+ /** the doc base for this reader in the parent, 0 if parent is null */
+ public final int docBaseInParent;
+ /** the ord for this reader in the parent, 0 if parent is null */
+ public final int ordInParent;
+
+ ReaderContext(ReaderContext parent, IndexReader reader,
+ boolean isAtomic, int ordInParent, int docBaseInParent) {
+ this.parent = parent;
+ this.reader = reader;
+ this.isAtomic = isAtomic;
+ this.docBaseInParent = docBaseInParent;
+ this.ordInParent = ordInParent;
+ this.isTopLevel = parent==null;
+ }
+
+ /**
+ * Returns the context's leaves if this context is a top-level context
+ * otherwise null.
+ *
+ * Note: this is a convenience method since leaves can always be obtained by
+ * walking the context tree.
+ */
+ public AtomicReaderContext[] leaves() {
+ return null;
+ }
+
+ /**
+ * Returns the context's children iff this context is a composite context
+ * otherwise null.
+ *
+ * Note: this method is a convenience method to prevent
+ * instanceof checks and type-casts to
+ * {@link CompositeReaderContext}.
+ */
+ public ReaderContext[] children() {
+ return null;
+ }
+ }
+
+ /**
+ * {@link ReaderContext} for composite {@link IndexReader} instance.
+ * @lucene.experimental
+ */
+ public static final class CompositeReaderContext extends ReaderContext {
+ /** the composite reader's immediate children */
+ public final ReaderContext[] children;
+ /** the composite reader's leaf reader contexts if this is the top-level reader in this context */
+ public final AtomicReaderContext[] leaves;
+
+ /**
+ * Creates a {@link CompositeReaderContext} for intermediate readers that are
+ * not top-level readers in the current context
+ */
+ public CompositeReaderContext(ReaderContext parent, IndexReader reader,
+ int ordInParent, int docbaseInParent, ReaderContext[] children) {
+ this(parent, reader, ordInParent, docbaseInParent, children, null);
+ }
+
+ /**
+ * Creates a {@link CompositeReaderContext} for top-level readers with parent set to null
+ */
+ public CompositeReaderContext(IndexReader reader, ReaderContext[] children, AtomicReaderContext[] leaves) {
+ this(null, reader, 0, 0, children, leaves);
+ }
+
+ private CompositeReaderContext(ReaderContext parent, IndexReader reader,
+ int ordInParent, int docbaseInParent, ReaderContext[] children,
+ AtomicReaderContext[] leaves) {
+ super(parent, reader, false, ordInParent, docbaseInParent);
+ this.children = children;
+ this.leaves = leaves;
+ }
+
+ @Override
+ public AtomicReaderContext[] leaves() {
+ return leaves;
+ }
+
+
+ @Override
+ public ReaderContext[] children() {
+ return children;
+ }
+ }
+
+ /**
+ * {@link ReaderContext} for atomic {@link IndexReader} instances
+ * @lucene.experimental
+ */
+ public static final class AtomicReaderContext extends ReaderContext {
+ /** The reader's ord in the top-level's leaves array */
+ public final int ord;
+ /** The reader's absolute doc base */
+ public final int docBase;
+ /**
+ * Creates a new {@link AtomicReaderContext}
+ */
+ public AtomicReaderContext(ReaderContext parent, IndexReader reader,
+ int ord, int docBase, int leafOrd, int leafDocBase) {
+ super(parent, reader, true, ord, docBase);
+ assert reader.getSequentialSubReaders() == null : "Atomic readers must not have subreaders";
+ this.ord = leafOrd;
+ this.docBase = leafDocBase;
+ }
+
+ /**
+ * Creates a new {@link AtomicReaderContext} for an atomic reader without an immediate
+ * parent.
+ */
+ public AtomicReaderContext(IndexReader atomicReader) {
+ this(null, atomicReader, 0, 0, 0, 0);
+ }
+ }
}
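A sketch of walking the context tree introduced above, assuming an open reader; per the javadoc only the top-level context maintains the leaves view, and an atomic top-level reader returns an AtomicReaderContext directly.

IndexReader.ReaderContext top = reader.getTopReaderContext();
IndexReader.AtomicReaderContext[] leaves = top.leaves();
if (leaves == null && top.isAtomic) {
  // a single atomic reader is its own leaf
  leaves = new IndexReader.AtomicReaderContext[] { (IndexReader.AtomicReaderContext) top };
}
for (IndexReader.AtomicReaderContext leaf : leaves) {
  System.out.println("ord=" + leaf.ord + " docBase=" + leaf.docBase
      + " maxDoc=" + leaf.reader.maxDoc());
}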
diff --git a/lucene/src/java/org/apache/lucene/index/IndexWriter.java b/lucene/src/java/org/apache/lucene/index/IndexWriter.java
index 43b0281f088..44d909265b3 100644
--- a/lucene/src/java/org/apache/lucene/index/IndexWriter.java
+++ b/lucene/src/java/org/apache/lucene/index/IndexWriter.java
@@ -31,6 +31,7 @@ import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.concurrent.atomic.AtomicInteger;
+import java.util.concurrent.ConcurrentHashMap;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.document.Document;
@@ -47,6 +48,7 @@ import org.apache.lucene.store.LockObtainFailedException;
import org.apache.lucene.util.Bits;
import org.apache.lucene.util.Constants;
import org.apache.lucene.util.ThreadInterruptedException;
+import org.apache.lucene.util.MapBackedSet;
/**
An IndexWriter creates and maintains an index.
@@ -214,7 +216,6 @@ public class IndexWriter implements Closeable {
private long lastCommitChangeCount; // last changeCount that was committed
private SegmentInfos rollbackSegmentInfos; // segmentInfos we will fallback to if the commit fails
- private HashMap rollbackSegments;
volatile SegmentInfos pendingCommit; // set when a commit is pending (after prepareCommit() & before commit())
volatile long pendingCommitChangeCount;
@@ -250,7 +251,7 @@ public class IndexWriter implements Closeable {
private final AtomicInteger flushDeletesCount = new AtomicInteger();
final ReaderPool readerPool = new ReaderPool();
- final BufferedDeletes bufferedDeletes;
+ final BufferedDeletesStream bufferedDeletesStream;
// This is a "write once" variable (like the organic dye
// on a DVD-R that may or may not be heated by a laser and
@@ -270,6 +271,13 @@ public class IndexWriter implements Closeable {
// The PayloadProcessorProvider to use when segments are merged
private PayloadProcessorProvider payloadProcessorProvider;
+ // for testing
+ boolean anyNonBulkMerges;
+
+ IndexReader getReader() throws IOException {
+ return getReader(true);
+ }
+
/**
* Expert: returns a readonly reader, covering all
* committed as well as un-committed changes to the index.
@@ -329,9 +337,10 @@ public class IndexWriter implements Closeable {
*
* @throws IOException
*/
- IndexReader getReader() throws IOException {
-
+ IndexReader getReader(boolean applyAllDeletes) throws IOException {
ensureOpen();
+
+ final long tStart = System.currentTimeMillis();
if (infoStream != null) {
message("flush at getReader");
@@ -347,17 +356,27 @@ public class IndexWriter implements Closeable {
// just like we do when loading segments_N
IndexReader r;
synchronized(this) {
- flush(false, true);
- r = new DirectoryReader(this, segmentInfos, config.getReaderTermsIndexDivisor(), codecs);
+ flush(false, applyAllDeletes);
+ r = new DirectoryReader(this, segmentInfos, config.getReaderTermsIndexDivisor(), codecs, applyAllDeletes);
if (infoStream != null) {
message("return reader version=" + r.getVersion() + " reader=" + r);
}
}
maybeMerge();
+ if (infoStream != null) {
+ message("getReader took " + (System.currentTimeMillis() - tStart) + " msec");
+ }
return r;
}
+ // Used for all SegmentReaders we open
+ private final Collection<IndexReader.ReaderFinishedListener> readerFinishedListeners = new MapBackedSet<IndexReader.ReaderFinishedListener>(new ConcurrentHashMap<IndexReader.ReaderFinishedListener,Boolean>());
+
+ Collection<IndexReader.ReaderFinishedListener> getReaderFinishedListeners() throws IOException {
+ return readerFinishedListeners;
+ }
+
/** Holds shared SegmentReader instances. IndexWriter uses
* SegmentReaders for 1) applying deletes, 2) doing
* merges, 3) handing out a real-time reader. This pool
@@ -567,6 +586,7 @@ public class IndexWriter implements Closeable {
// synchronized
// Returns a ref, which we xfer to readerMap:
sr = SegmentReader.get(false, info.dir, info, readBufferSize, doOpenStores, termsIndexDivisor);
+ sr.readerFinishedListeners = readerFinishedListeners;
if (info.dir == directory) {
// Only pool if reader is not external
@@ -605,8 +625,6 @@ public class IndexWriter implements Closeable {
}
}
-
-
/**
* Obtain the number of deleted docs for a pooled reader.
* If the reader isn't being pooled, the segmentInfo's
@@ -662,16 +680,13 @@ public class IndexWriter implements Closeable {
* IndexWriter. Additionally, calling {@link #getConfig()} and changing the
* parameters does not affect that IndexWriter instance.
*
- * NOTE: by default, {@link IndexWriterConfig#getMaxFieldLength()}
- * returns {@link IndexWriterConfig#UNLIMITED_FIELD_LENGTH}. Pay attention to
- * whether this setting fits your application.
*
* @param d
* the index directory. The index is either created or appended
* according conf.getOpenMode().
* @param conf
* the configuration settings according to which IndexWriter should
- * be initalized.
+ * be initialized.
* @throws CorruptIndexException
* if the index is corrupt
* @throws LockObtainFailedException
@@ -689,7 +704,6 @@ public class IndexWriter implements Closeable {
directory = d;
analyzer = conf.getAnalyzer();
infoStream = defaultInfoStream;
- maxFieldLength = conf.getMaxFieldLength();
termIndexInterval = conf.getTermIndexInterval();
mergePolicy = conf.getMergePolicy();
mergePolicy.setIndexWriter(this);
@@ -697,8 +711,8 @@ public class IndexWriter implements Closeable {
mergedSegmentWarmer = conf.getMergedSegmentWarmer();
codecs = conf.getCodecProvider();
- bufferedDeletes = new BufferedDeletes(messageID);
- bufferedDeletes.setInfoStream(infoStream);
+ bufferedDeletesStream = new BufferedDeletesStream(messageID);
+ bufferedDeletesStream.setInfoStream(infoStream);
poolReaders = conf.getReaderPooling();
OpenMode mode = conf.getOpenMode();
@@ -719,11 +733,8 @@ public class IndexWriter implements Closeable {
boolean success = false;
- // TODO: we should check whether this index is too old,
- // and throw an IndexFormatTooOldExc up front, here,
- // instead of later when merge, applyDeletes, getReader
- // is attempted. I think to do this we should store the
- // oldest segment's version in segments_N.
+ // If index is too old, reading the segments will throw
+ // IndexFormatTooOldException.
segmentInfos = new SegmentInfos(codecs);
try {
if (create) {
@@ -766,9 +777,8 @@ public class IndexWriter implements Closeable {
setRollbackSegmentInfos(segmentInfos);
- docWriter = new DocumentsWriter(directory, this, conf.getIndexingChain(), conf.getMaxThreadStates(), getCurrentFieldInfos(), bufferedDeletes);
+ docWriter = new DocumentsWriter(directory, this, conf.getIndexingChain(), conf.getMaxThreadStates(), getCurrentFieldInfos(), bufferedDeletesStream);
docWriter.setInfoStream(infoStream);
- docWriter.setMaxFieldLength(maxFieldLength);
// Default deleter (for backwards compatibility) is
// KeepOnlyLastCommitDeleter:
@@ -854,10 +864,6 @@ public class IndexWriter implements Closeable {
private synchronized void setRollbackSegmentInfos(SegmentInfos infos) {
rollbackSegmentInfos = (SegmentInfos) infos.clone();
- rollbackSegments = new HashMap();
- final int size = rollbackSegmentInfos.size();
- for(int i=0;iabove for details.
*
+ * NOTE: if you call {@link #close(boolean)}
+ * with false, which aborts all running merges,
+ * then any thread still running this method might hit a
+ * {@link MergePolicy.MergeAbortedException}.
+ *
* @throws CorruptIndexException if the index is corrupt
* @throws IOException if there is a low-level IO error
* @see MergePolicy#findMergesForOptimize
@@ -1682,6 +1674,11 @@ public class IndexWriter implements Closeable {
*
NOTE: if this method hits an OutOfMemoryError
* you should immediately close the writer. See above for details.
+ *
+ * NOTE: if you call {@link #close(boolean)}
+ * with false, which aborts all running merges,
+ * then any thread still running this method might hit a
+ * {@link MergePolicy.MergeAbortedException}.
*/
public void expungeDeletes(boolean doWait)
throws CorruptIndexException, IOException {
@@ -1832,6 +1829,18 @@ public class IndexWriter implements Closeable {
}
}
+ /** Expert: to be used by a {@link MergePolicy} to avoid
+ * selecting merges for segments already being merged.
+ * The returned collection is not cloned, and thus is
+ * only safe to access if you hold IndexWriter's lock
+ * (which you do when IndexWriter invokes the
+ * MergePolicy).
+ *
+ *
+ * Do not alter the returned collection! */
+ public synchronized Collection<SegmentInfo> getMergingSegments() {
+ return mergingSegments;
+ }
+
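A hedged fragment, assumed to sit inside a custom MergePolicy's findMerges implementation (where writer.get() and the SegmentInfos argument infos are available), showing how the accessor above lets a policy skip segments that are already being merged.

Collection<SegmentInfo> merging = writer.get().getMergingSegments();
for (int i = 0; i < infos.size(); i++) {
  SegmentInfo info = infos.info(i);
  if (merging.contains(info)) {
    continue; // already claimed by an in-flight merge
  }
  // consider "info" as a candidate for a new merge
}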
/** Expert: the {@link MergeScheduler} calls this method
* to retrieve the next merge requested by the
* MergePolicy */
@@ -1889,7 +1898,7 @@ public class IndexWriter implements Closeable {
mergePolicy.close();
mergeScheduler.close();
- bufferedDeletes.clear();
+ bufferedDeletesStream.clear();
synchronized(this) {
@@ -1952,8 +1961,9 @@ public class IndexWriter implements Closeable {
*
* NOTE: this method will forcefully abort all merges
* in progress. If other threads are running {@link
- * #optimize()} or any of the addIndexes methods, they
- * will receive {@link MergePolicy.MergeAbortedException}s.
+ * #optimize()}, {@link #addIndexes(IndexReader[])} or
+ * {@link #expungeDeletes} methods, they may receive
+ * {@link MergePolicy.MergeAbortedException}s.
*/
public synchronized void deleteAll() throws IOException {
try {
@@ -2042,12 +2052,19 @@ public class IndexWriter implements Closeable {
* will have completed once this method completes.
*/
public synchronized void waitForMerges() {
+ if (infoStream != null) {
+ message("waitForMerges");
+ }
while(pendingMerges.size() > 0 || runningMerges.size() > 0) {
doWait();
}
// sanity check
assert 0 == mergingSegments.size();
+
+ if (infoStream != null) {
+ message("waitForMerges done");
+ }
}
/**
@@ -2226,6 +2243,11 @@ public class IndexWriter implements Closeable {
* you should immediately close the writer. See above for details.
*
+ * NOTE: if you call {@link #close(boolean)}
+ * with false, which aborts all running merges,
+ * then any thread still running this method might hit a
+ * {@link MergePolicy.MergeAbortedException}.
+ *
* @throws CorruptIndexException if the index is corrupt
* @throws IOException if there is a low-level IO error
*/
@@ -2453,13 +2475,13 @@ public class IndexWriter implements Closeable {
}
/**
- * Flush all in-memory buffered udpates (adds and deletes)
+ * Flush all in-memory buffered updates (adds and deletes)
* to the Directory.
* @param triggerMerge if true, we may merge segments (if
* deletes or docs were flushed) if necessary
- * @param flushDeletes whether pending deletes should also
+ * @param applyAllDeletes whether pending deletes should also be applied
*/
- protected final void flush(boolean triggerMerge, boolean flushDeletes) throws CorruptIndexException, IOException {
+ protected final void flush(boolean triggerMerge, boolean applyAllDeletes) throws CorruptIndexException, IOException {
// NOTE: this method cannot be sync'd because
// maybeMerge() in turn calls mergeScheduler.merge which
@@ -2470,7 +2492,7 @@ public class IndexWriter implements Closeable {
// We can be called during close, when closing==true, so we must pass false to ensureOpen:
ensureOpen(false);
- if (doFlush(flushDeletes) && triggerMerge) {
+ if (doFlush(applyAllDeletes) && triggerMerge) {
maybeMerge();
}
}
@@ -2519,10 +2541,10 @@ public class IndexWriter implements Closeable {
// tiny segments:
if (flushControl.getFlushDeletes() ||
(config.getRAMBufferSizeMB() != IndexWriterConfig.DISABLE_AUTO_FLUSH &&
- bufferedDeletes.bytesUsed() > (1024*1024*config.getRAMBufferSizeMB()/2))) {
+ bufferedDeletesStream.bytesUsed() > (1024*1024*config.getRAMBufferSizeMB()/2))) {
applyAllDeletes = true;
if (infoStream != null) {
- message("force apply deletes bytesUsed=" + bufferedDeletes.bytesUsed() + " vs ramBuffer=" + (1024*1024*config.getRAMBufferSizeMB()));
+ message("force apply deletes bytesUsed=" + bufferedDeletesStream.bytesUsed() + " vs ramBuffer=" + (1024*1024*config.getRAMBufferSizeMB()));
}
}
}
@@ -2532,12 +2554,15 @@ public class IndexWriter implements Closeable {
message("apply all deletes during flush");
}
flushDeletesCount.incrementAndGet();
- if (bufferedDeletes.applyDeletes(readerPool, segmentInfos, segmentInfos)) {
+ final BufferedDeletesStream.ApplyDeletesResult result = bufferedDeletesStream.applyDeletes(readerPool, segmentInfos);
+ if (result.anyDeletes) {
checkpoint();
}
+ bufferedDeletesStream.prune(segmentInfos);
+ assert !bufferedDeletesStream.any();
flushControl.clearDeletes();
} else if (infoStream != null) {
- message("don't apply deletes now delTermCount=" + bufferedDeletes.numTerms() + " bytesUsed=" + bufferedDeletes.bytesUsed());
+ message("don't apply deletes now delTermCount=" + bufferedDeletesStream.numTerms() + " bytesUsed=" + bufferedDeletesStream.bytesUsed());
}
doAfterFlush();
@@ -2563,7 +2588,7 @@ public class IndexWriter implements Closeable {
*/
public final long ramSizeInBytes() {
ensureOpen();
- return docWriter.bytesUsed() + bufferedDeletes.bytesUsed();
+ return docWriter.bytesUsed() + bufferedDeletesStream.bytesUsed();
}
/** Expert: Return the number of documents currently
@@ -2573,28 +2598,12 @@ public class IndexWriter implements Closeable {
return docWriter.getNumDocs();
}
- private int ensureContiguousMerge(MergePolicy.OneMerge merge) {
-
- int first = segmentInfos.indexOf(merge.segments.info(0));
- if (first == -1)
- throw new MergePolicy.MergeException("could not find segment " + merge.segments.info(0).name + " in current index " + segString(), directory);
-
- final int numSegments = segmentInfos.size();
-
- final int numSegmentsToMerge = merge.segments.size();
- for(int i=0;i= numSegments || !segmentInfos.info(first+i).equals(info)) {
- if (segmentInfos.indexOf(info) == -1)
- throw new MergePolicy.MergeException("MergePolicy selected a segment (" + info.name + ") that is not in the current index " + segString(), directory);
- else
- throw new MergePolicy.MergeException("MergePolicy selected non-contiguous segments to merge (" + merge.segString(directory) + " vs " + segString() + "), which IndexWriter (currently) cannot handle",
- directory);
+ private void ensureValidMerge(MergePolicy.OneMerge merge) {
+ for(SegmentInfo info : merge.segments) {
+ if (segmentInfos.indexOf(info) == -1) {
+ throw new MergePolicy.MergeException("MergePolicy selected a segment (" + info.name + ") that is not in the current index " + segString(), directory);
}
}
-
- return first;
}
/** Carefully merges deletes for the segments we just
@@ -2619,9 +2628,11 @@ public class IndexWriter implements Closeable {
// started merging:
int docUpto = 0;
int delCount = 0;
+ long minGen = Long.MAX_VALUE;
for(int i=0; i < sourceSegments.size(); i++) {
SegmentInfo info = sourceSegments.info(i);
+ minGen = Math.min(info.getBufferedDeletesGen(), minGen);
int docCount = info.docCount;
SegmentReader previousReader = merge.readersClone[i];
final Bits prevDelDocs = previousReader.getDeletedDocs();
@@ -2672,9 +2683,17 @@ public class IndexWriter implements Closeable {
assert mergedReader.numDeletedDocs() == delCount;
mergedReader.hasChanges = delCount > 0;
+
+ // If new deletes were applied while we were merging
+ // (which happens if eg commit() or getReader() is
+ // called during our merge), then it better be the case
+ // that the delGen has increased for all our merged
+ // segments:
+ assert !mergedReader.hasChanges || minGen > mergedReader.getSegmentInfo().getBufferedDeletesGen();
+
+ mergedReader.getSegmentInfo().setBufferedDeletesGen(minGen);
}
- /* FIXME if we want to support non-contiguous segment merges */
synchronized private boolean commitMerge(MergePolicy.OneMerge merge, SegmentReader mergedReader) throws IOException {
assert testPoint("startCommitMerge");
@@ -2700,7 +2719,7 @@ public class IndexWriter implements Closeable {
return false;
}
- final int start = ensureContiguousMerge(merge);
+ ensureValidMerge(merge);
commitMergedDeletes(merge, mergedReader);
@@ -2710,10 +2729,32 @@ public class IndexWriter implements Closeable {
// format as well:
setMergeDocStoreIsCompoundFile(merge);
- segmentInfos.subList(start, start + merge.segments.size()).clear();
assert !segmentInfos.contains(merge.info);
- segmentInfos.add(start, merge.info);
-
+
+ final Set<SegmentInfo> mergedAway = new HashSet<SegmentInfo>(merge.segments);
+ int segIdx = 0;
+ int newSegIdx = 0;
+ boolean inserted = false;
+ final int curSegCount = segmentInfos.size();
+ while(segIdx < curSegCount) {
+ final SegmentInfo info = segmentInfos.info(segIdx++);
+ if (mergedAway.contains(info)) {
+ if (!inserted) {
+ segmentInfos.set(segIdx-1, merge.info);
+ inserted = true;
+ newSegIdx++;
+ }
+ } else {
+ segmentInfos.set(newSegIdx++, info);
+ }
+ }
+ assert newSegIdx == curSegCount - merge.segments.size() + 1;
+ segmentInfos.subList(newSegIdx, segmentInfos.size()).clear();
+
+ if (infoStream != null) {
+ message("after commit: " + segString());
+ }
+
closeMergeReaders(merge, false);
// Must note the change to segmentInfos so any commits
@@ -2725,16 +2766,12 @@ public class IndexWriter implements Closeable {
// disk, updating SegmentInfo, etc.:
readerPool.clear(merge.segments);
- // remove pending deletes of the segments
- // that were merged, moving them onto the segment just
- // before the merged segment
- // Lock order: IW -> BD
- bufferedDeletes.commitMerge(merge);
-
if (merge.optimize) {
// cascade the optimize:
segmentsToOptimize.add(merge.info);
}
+
+
return true;
}
@@ -2862,7 +2899,7 @@ public class IndexWriter implements Closeable {
}
}
- ensureContiguousMerge(merge);
+ ensureValidMerge(merge);
pendingMerges.add(merge);
@@ -2889,10 +2926,6 @@ public class IndexWriter implements Closeable {
final synchronized void mergeInit(MergePolicy.OneMerge merge) throws IOException {
boolean success = false;
try {
- // Lock order: IW -> BD
- if (bufferedDeletes.applyDeletes(readerPool, segmentInfos, merge.segments)) {
- checkpoint();
- }
_mergeInit(merge);
success = true;
} finally {
@@ -2916,6 +2949,9 @@ public class IndexWriter implements Closeable {
throw new IllegalStateException("this writer hit an OutOfMemoryError; cannot merge");
}
+ // TODO: is there any perf benefit to sorting
+ // merged segments? eg biggest to smallest?
+
if (merge.info != null)
// mergeInit already done
return;
@@ -2928,6 +2964,17 @@ public class IndexWriter implements Closeable {
// names.
merge.info = new SegmentInfo(newSegmentName(), 0, directory, false, false, null, false);
+ // Lock order: IW -> BD
+ final BufferedDeletesStream.ApplyDeletesResult result = bufferedDeletesStream.applyDeletes(readerPool, merge.segments);
+ if (result.anyDeletes) {
+ checkpoint();
+ }
+
+ merge.info.setBufferedDeletesGen(result.gen);
+
+ // Lock order: IW -> BD
+ bufferedDeletesStream.prune(segmentInfos);
+
Map<String,String> details = new HashMap<String,String>();
details.put("optimize", Boolean.toString(merge.optimize));
details.put("mergeFactor", Integer.toString(merge.segments.size()));
@@ -3115,6 +3162,7 @@ public class IndexWriter implements Closeable {
message("merge segmentCodecs=" + merger.getSegmentCodecs());
message("merge store matchedCount=" + merger.getMatchedSubReaderCount() + " vs " + numSegments);
}
+ anyNonBulkMerges |= merger.getMatchedSubReaderCount() != numSegments;
assert mergedDocCount == totDocCount;
@@ -3280,7 +3328,7 @@ public class IndexWriter implements Closeable {
// NOTE: the callers of this method should in theory
// be able to do simply wait(), but, as a defense
// against thread timing hazards where notifyAll()
- // falls to be called, we wait for at most 1 second
+ // fails to be called, we wait for at most 1 second
// and then return so caller can check if wait
// conditions are satisfied:
try {
@@ -3290,6 +3338,15 @@ public class IndexWriter implements Closeable {
}
}
+ private boolean keepFullyDeletedSegments;
+
+ /** Only for testing.
+ *
+ * @lucene.internal */
+ void keepFullyDeletedSegments() {
+ keepFullyDeletedSegments = true;
+ }
+
// called only from assert
private boolean filesExist(SegmentInfos toSync) throws IOException {
Collection files = toSync.files(directory, false);
@@ -3348,6 +3405,10 @@ public class IndexWriter implements Closeable {
readerPool.commit();
toSync = (SegmentInfos) segmentInfos.clone();
+ if (!keepFullyDeletedSegments) {
+ toSync.pruneDeletedSegments();
+ }
+
assert filesExist(toSync);
if (commitUserData != null)
@@ -3477,7 +3538,7 @@ public class IndexWriter implements Closeable {
}
synchronized boolean nrtIsCurrent(SegmentInfos infos) {
- return infos.version == segmentInfos.version && !docWriter.anyChanges() && !bufferedDeletes.any();
+ return infos.version == segmentInfos.version && !docWriter.anyChanges() && !bufferedDeletesStream.any();
}
synchronized boolean isClosed() {
@@ -3644,7 +3705,7 @@ public class IndexWriter implements Closeable {
final double ramBufferSizeMB = config.getRAMBufferSizeMB();
if (ramBufferSizeMB != IndexWriterConfig.DISABLE_AUTO_FLUSH) {
final long limit = (long) (ramBufferSizeMB*1024*1024);
- long used = bufferedDeletes.bytesUsed() + docWriter.bytesUsed();
+ long used = bufferedDeletesStream.bytesUsed() + docWriter.bytesUsed();
if (used >= limit) {
// DocumentsWriter may be able to free up some
@@ -3652,7 +3713,7 @@ public class IndexWriter implements Closeable {
// Lock order: FC -> DW
docWriter.balanceRAM();
- used = bufferedDeletes.bytesUsed() + docWriter.bytesUsed();
+ used = bufferedDeletesStream.bytesUsed() + docWriter.bytesUsed();
if (used >= limit) {
return setFlushPending("ram full: " + reason, false);
}
diff --git a/lucene/src/java/org/apache/lucene/index/IndexWriterConfig.java b/lucene/src/java/org/apache/lucene/index/IndexWriterConfig.java
index 34240ea5f2d..812306cf4e8 100644
--- a/lucene/src/java/org/apache/lucene/index/IndexWriterConfig.java
+++ b/lucene/src/java/org/apache/lucene/index/IndexWriterConfig.java
@@ -21,7 +21,8 @@ import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.index.DocumentsWriter.IndexingChain;
import org.apache.lucene.index.IndexWriter.IndexReaderWarmer;
import org.apache.lucene.index.codecs.CodecProvider;
-import org.apache.lucene.search.Similarity;
+import org.apache.lucene.search.IndexSearcher;
+import org.apache.lucene.search.SimilarityProvider;
import org.apache.lucene.util.Version;
/**
@@ -41,8 +42,6 @@ import org.apache.lucene.util.Version;
*/
public final class IndexWriterConfig implements Cloneable {
- public static final int UNLIMITED_FIELD_LENGTH = Integer.MAX_VALUE;
-
/**
* Specifies the open mode for {@link IndexWriter}:
*
@@ -55,7 +54,7 @@ public final class IndexWriterConfig implements Cloneable {
public static enum OpenMode { CREATE, APPEND, CREATE_OR_APPEND }
/** Default value is 32. Change using {@link #setTermIndexInterval(int)}. */
- public static final int DEFAULT_TERM_INDEX_INTERVAL = 32; // TODO: this should be private to the codec, not settable here
+ public static final int DEFAULT_TERM_INDEX_INTERVAL = 32; // TODO: this should be private to the codec, not settable here
/** Denotes a flush trigger is disabled. */
public final static int DISABLE_AUTO_FLUSH = -1;
@@ -113,8 +112,7 @@ public final class IndexWriterConfig implements Cloneable {
private IndexDeletionPolicy delPolicy;
private IndexCommit commit;
private OpenMode openMode;
- private int maxFieldLength;
- private Similarity similarity;
+ private SimilarityProvider similarityProvider;
private int termIndexInterval; // TODO: this should be private to the codec, not settable here
private MergeScheduler mergeScheduler;
private long writeLockTimeout;
@@ -145,8 +143,7 @@ public final class IndexWriterConfig implements Cloneable {
delPolicy = new KeepOnlyLastCommitDeletionPolicy();
commit = null;
openMode = OpenMode.CREATE_OR_APPEND;
- maxFieldLength = UNLIMITED_FIELD_LENGTH;
- similarity = Similarity.getDefault();
+ similarityProvider = IndexSearcher.getDefaultSimilarityProvider();
termIndexInterval = DEFAULT_TERM_INDEX_INTERVAL; // TODO: this should be private to the codec, not settable here
mergeScheduler = new ConcurrentMergeScheduler();
writeLockTimeout = WRITE_LOCK_TIMEOUT;
@@ -219,37 +216,6 @@ public final class IndexWriterConfig implements Cloneable {
return delPolicy;
}
- /**
- * The maximum number of terms that will be indexed for a single field in a
- * document. This limits the amount of memory required for indexing, so that
- * collections with very large files will not crash the indexing process by
- * running out of memory. This setting refers to the number of running terms,
- * not to the number of different terms.
- *
- * NOTE: this silently truncates large documents, excluding from the
- * index all terms that occur further in the document. If you know your source
- * documents are large, be sure to set this value high enough to accomodate
- * the expected size. If you set it to {@link #UNLIMITED_FIELD_LENGTH}, then
- * the only limit is your memory, but you should anticipate an
- * OutOfMemoryError.
- *
- * By default it is set to {@link #UNLIMITED_FIELD_LENGTH}.
- */
- public IndexWriterConfig setMaxFieldLength(int maxFieldLength) {
- this.maxFieldLength = maxFieldLength;
- return this;
- }
-
- /**
- * Returns the maximum number of terms that will be indexed for a single field
- * in a document.
- *
- * @see #setMaxFieldLength(int)
- */
- public int getMaxFieldLength() {
- return maxFieldLength;
- }
-
/**
* Expert: allows to open a certain commit point. The default is null which
* opens the latest commit point.
@@ -269,25 +235,22 @@ public final class IndexWriterConfig implements Cloneable {
}
/**
- * Expert: set the {@link Similarity} implementation used by this IndexWriter.
+ * Expert: set the {@link SimilarityProvider} implementation used by this IndexWriter.
*
- * NOTE: the similarity cannot be null. If null is passed,
- * the similarity will be set to the default.
- *
- * @see Similarity#setDefault(Similarity)
+ * NOTE: the similarity provider cannot be null. If null is passed,
+ * the similarity provider will be set to the default implementation (unspecified).
+ * the similarity provider will be set to the default implementation (unspecified).
*/
- public IndexWriterConfig setSimilarity(Similarity similarity) {
- this.similarity = similarity == null ? Similarity.getDefault() : similarity;
+ public IndexWriterConfig setSimilarityProvider(SimilarityProvider similarityProvider) {
+ this.similarityProvider = similarityProvider == null ? IndexSearcher.getDefaultSimilarityProvider() : similarityProvider;
return this;
}
/**
- * Expert: returns the {@link Similarity} implementation used by this
- * IndexWriter. This defaults to the current value of
- * {@link Similarity#getDefault()}.
+ * Expert: returns the {@link SimilarityProvider} implementation used by this
+ * IndexWriter.
*/
- public Similarity getSimilarity() {
- return similarity;
+ public SimilarityProvider getSimilarityProvider() {
+ return similarityProvider;
}
/**
@@ -589,10 +552,13 @@ public final class IndexWriterConfig implements Cloneable {
/** Sets the termsIndexDivisor passed to any readers that
* IndexWriter opens, for example when applying deletes
* or creating a near-real-time reader in {@link
- * IndexWriter#getReader}. */
+ * IndexWriter#getReader}. If you pass -1, the terms index
+ * won't be loaded by the readers. This is only useful in
+ * advanced situations when you will only .next() through
+ * all terms; attempts to seek will hit an exception. */
public IndexWriterConfig setReaderTermsIndexDivisor(int divisor) {
- if (divisor <= 0) {
- throw new IllegalArgumentException("divisor must be >= 1 (got " + divisor + ")");
+ if (divisor <= 0 && divisor != -1) {
+ throw new IllegalArgumentException("divisor must be >= 1, or -1 (got " + divisor + ")");
}
readerTermsIndexDivisor = divisor;
return this;
@@ -611,8 +577,7 @@ public final class IndexWriterConfig implements Cloneable {
sb.append("delPolicy=").append(delPolicy.getClass().getName()).append("\n");
sb.append("commit=").append(commit == null ? "null" : commit).append("\n");
sb.append("openMode=").append(openMode).append("\n");
- sb.append("maxFieldLength=").append(maxFieldLength).append("\n");
- sb.append("similarity=").append(similarity.getClass().getName()).append("\n");
+ sb.append("similarityProvider=").append(similarityProvider.getClass().getName()).append("\n");
sb.append("termIndexInterval=").append(termIndexInterval).append("\n"); // TODO: this should be private to the codec, not settable here
sb.append("mergeScheduler=").append(mergeScheduler.getClass().getName()).append("\n");
sb.append("default WRITE_LOCK_TIMEOUT=").append(WRITE_LOCK_TIMEOUT).append("\n");
diff --git a/lucene/src/java/org/apache/lucene/index/LogByteSizeMergePolicy.java b/lucene/src/java/org/apache/lucene/index/LogByteSizeMergePolicy.java
index 520cb4a8f16..7ef2902099f 100644
--- a/lucene/src/java/org/apache/lucene/index/LogByteSizeMergePolicy.java
+++ b/lucene/src/java/org/apache/lucene/index/LogByteSizeMergePolicy.java
@@ -30,9 +30,14 @@ public class LogByteSizeMergePolicy extends LogMergePolicy {
* or larger will never be merged. @see setMaxMergeMB */
public static final double DEFAULT_MAX_MERGE_MB = 2048;
+ /** Default maximum segment size. A segment of this size
+ * or larger will never be merged during optimize. @see setMaxMergeMBForOptimize */
+ public static final double DEFAULT_MAX_MERGE_MB_FOR_OPTIMIZE = Long.MAX_VALUE;
+
public LogByteSizeMergePolicy() {
minMergeSize = (long) (DEFAULT_MIN_MERGE_MB*1024*1024);
maxMergeSize = (long) (DEFAULT_MAX_MERGE_MB*1024*1024);
+ maxMergeSizeForOptimize = (long) (DEFAULT_MAX_MERGE_MB_FOR_OPTIMIZE*1024*1024);
}
@Override
@@ -63,6 +68,23 @@ public class LogByteSizeMergePolicy extends LogMergePolicy {
return ((double) maxMergeSize)/1024/1024;
}
+ /** Determines the largest segment (measured by total
+ * byte size of the segment's files, in MB) that may be
+ * merged with other segments during optimize. Setting
+ * it low will leave the index with more than 1 segment,
+ * even if {@link IndexWriter#optimize()} is called.*/
+ public void setMaxMergeMBForOptimize(double mb) {
+ maxMergeSizeForOptimize = (long) (mb*1024*1024);
+ }
+
+ /** Returns the largest segment (measured by total byte
+ * size of the segment's files, in MB) that may be merged
+ * with other segments during optimize.
+ * @see #setMaxMergeMBForOptimize */
+ public double getMaxMergeMBForOptimize() {
+ return ((double) maxMergeSizeForOptimize)/1024/1024;
+ }
+
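A short sketch of the optimize-time size cap added above; the MB values are illustrative.

LogByteSizeMergePolicy mp = new LogByteSizeMergePolicy();
mp.setMaxMergeMB(1024.0);            // cap for segments considered during normal merging
mp.setMaxMergeMBForOptimize(8192.0); // larger cap allowed when optimize() cascades merges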
/** Sets the minimum size for the lowest level segments.
* Any segments below this size are considered to be on
* the same level (even if they vary drastically in size)
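Usage note: a small sketch of the new optimize-time size cap next to the existing regular-merge cap; the 2 GB / 10 GB values and the conf variable (an IndexWriterConfig) are illustrative assumptions.

// Sketch: regular merges skip segments over 2 GB, but optimize may still merge segments up to 10 GB.
LogByteSizeMergePolicy mp = new LogByteSizeMergePolicy();
mp.setMaxMergeMB(2048);
mp.setMaxMergeMBForOptimize(10 * 1024);
conf.setMergePolicy(mp);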
diff --git a/lucene/src/java/org/apache/lucene/index/LogDocMergePolicy.java b/lucene/src/java/org/apache/lucene/index/LogDocMergePolicy.java
index a86111c3f46..42ec5136145 100644
--- a/lucene/src/java/org/apache/lucene/index/LogDocMergePolicy.java
+++ b/lucene/src/java/org/apache/lucene/index/LogDocMergePolicy.java
@@ -31,9 +31,10 @@ public class LogDocMergePolicy extends LogMergePolicy {
public LogDocMergePolicy() {
minMergeSize = DEFAULT_MIN_MERGE_DOCS;
- // maxMergeSize is never used by LogDocMergePolicy; set
+ // maxMergeSize(ForOptimize) are never used by LogDocMergePolicy; set
// it to Long.MAX_VALUE to disable it
maxMergeSize = Long.MAX_VALUE;
+ maxMergeSizeForOptimize = Long.MAX_VALUE;
}
@Override
diff --git a/lucene/src/java/org/apache/lucene/index/LogMergePolicy.java b/lucene/src/java/org/apache/lucene/index/LogMergePolicy.java
index 357460c1d9d..1925a78d74d 100644
--- a/lucene/src/java/org/apache/lucene/index/LogMergePolicy.java
+++ b/lucene/src/java/org/apache/lucene/index/LogMergePolicy.java
@@ -18,6 +18,11 @@ package org.apache.lucene.index;
*/
import java.io.IOException;
+import java.util.ArrayList;
+import java.util.Collection;
+import java.util.Collections;
+import java.util.Comparator;
+import java.util.List;
import java.util.Set;
/** <p>This class implements a {@link MergePolicy} that tries
@@ -63,7 +68,11 @@ public abstract class LogMergePolicy extends MergePolicy {
protected long minMergeSize;
protected long maxMergeSize;
+ // Although the core MPs set it explicitly, we must default in case someone
+ // out there wrote his own LMP ...
+ protected long maxMergeSizeForOptimize = Long.MAX_VALUE;
protected int maxMergeDocs = DEFAULT_MAX_MERGE_DOCS;
+ protected boolean requireContiguousMerge = false;
protected double noCFSRatio = DEFAULT_NO_CFS_RATIO;
@@ -102,6 +111,21 @@ public abstract class LogMergePolicy extends MergePolicy {
writer.get().message("LMP: " + message);
}
+ /** If true, merges must be an in-order slice of the
+ * segments. If false, then the merge policy is free to
+ * pick any segments. The default is false, which is
+ * in general more efficient than true since it gives the
+ * merge policy more freedom to pick closely sized
+ * segments. */
+ public void setRequireContiguousMerge(boolean v) {
+ requireContiguousMerge = v;
+ }
+
+ /** See {@link #setRequireContiguousMerge}. */
+ public boolean getRequireContiguousMerge() {
+ return requireContiguousMerge;
+ }
+
/** <p>Returns the number of segments that are merged at
* once and also controls the total number of segments
* allowed to accumulate in the index.
*/
@@ -240,9 +264,9 @@ public abstract class LogMergePolicy extends MergePolicy {
int start = last - 1;
while (start >= 0) {
SegmentInfo info = infos.info(start);
- if (size(info) > maxMergeSize || sizeDocs(info) > maxMergeDocs) {
+ if (size(info) > maxMergeSizeForOptimize || sizeDocs(info) > maxMergeDocs) {
if (verbose()) {
- message("optimize: skip segment=" + info + ": size is > maxMergeSize (" + maxMergeSize + ") or sizeDocs is > maxMergeDocs (" + maxMergeDocs + ")");
+ message("optimize: skip segment=" + info + ": size is > maxMergeSize (" + maxMergeSizeForOptimize + ") or sizeDocs is > maxMergeDocs (" + maxMergeDocs + ")");
}
// need to skip that segment + add a merge for the 'right' segments,
// unless there is only 1 which is optimized.
@@ -326,9 +350,12 @@ public abstract class LogMergePolicy extends MergePolicy {
}
/** Returns the merges necessary to optimize the index.
- * This merge policy defines "optimized" to mean only one
- * segment in the index, where that segment has no
- * deletions pending nor separate norms, and it is in
+ * This merge policy defines "optimized" to mean only the
+ * requested number of segments is left in the index, and
+ * respects the {@link #maxMergeSizeForOptimize} setting.
+ * By default, and assuming {@code maxNumSegments=1}, only
+ * one segment will be left in the index, where that segment
+ * has no deletions pending nor separate norms, and it is in
* compound file format if the current useCompoundFile
* setting is true. This method returns multiple merges
* (mergeFactor at a time) so the {@link MergeScheduler}
@@ -350,6 +377,8 @@ public abstract class LogMergePolicy extends MergePolicy {
}
return null;
}
+
+ // TODO: handle non-contiguous merge case differently?
// Find the newest (rightmost) segment that needs to
// be optimized (other segments may have been flushed
@@ -382,7 +411,7 @@ public abstract class LogMergePolicy extends MergePolicy {
boolean anyTooLarge = false;
for (int i = 0; i < last; i++) {
SegmentInfo info = infos.info(i);
- if (size(info) > maxMergeSize || sizeDocs(info) > maxMergeDocs) {
+ if (size(info) > maxMergeSizeForOptimize || sizeDocs(info) > maxMergeDocs) {
anyTooLarge = true;
break;
}
@@ -448,6 +477,36 @@ public abstract class LogMergePolicy extends MergePolicy {
return spec;
}
+ private static class SegmentInfoAndLevel implements Comparable<SegmentInfoAndLevel> {
+ SegmentInfo info;
+ float level;
+ int index;
+
+ public SegmentInfoAndLevel(SegmentInfo info, float level, int index) {
+ this.info = info;
+ this.level = level;
+ this.index = index;
+ }
+
+ // Sorts largest to smallest
+ public int compareTo(SegmentInfoAndLevel other) {
+ if (level < other.level)
+ return 1;
+ else if (level > other.level)
+ return -1;
+ else
+ return 0;
+ }
+ }
+
+ private static class SortByIndex implements Comparator<SegmentInfoAndLevel> {
+ public int compare(SegmentInfoAndLevel o1, SegmentInfoAndLevel o2) {
+ return o1.index - o2.index;
+ }
+ }
+
+ private static final SortByIndex sortByIndex = new SortByIndex();
+
/** Checks if any merges are now necessary and returns a
* {@link MergePolicy.MergeSpecification} if so. A merge
* is necessary when there are more than {@link
@@ -464,17 +523,37 @@ public abstract class LogMergePolicy extends MergePolicy {
// Compute levels, which is just log (base mergeFactor)
// of the size of each segment
- float[] levels = new float[numSegments];
+ final List<SegmentInfoAndLevel> levels = new ArrayList<SegmentInfoAndLevel>();
final float norm = (float) Math.log(mergeFactor);
+ final Collection<SegmentInfo> mergingSegments = writer.get().getMergingSegments();
+
for(int i=0;i subReaderToSlice = new HashMap();
private boolean[] decrefOnClose; // remember which subreaders to decRef on close
private int maxDoc = 0;
private int numDocs = -1;
@@ -48,7 +48,7 @@ public class MultiReader extends IndexReader implements Cloneable {
* @param subReaders set of (sub)readers
*/
public MultiReader(IndexReader... subReaders) throws IOException {
- initialize(subReaders, true);
+ topLevelContext = initialize(subReaders, true);
}
/**
@@ -60,14 +60,13 @@ public class MultiReader extends IndexReader implements Cloneable {
* @param subReaders set of (sub)readers
*/
public MultiReader(IndexReader[] subReaders, boolean closeSubReaders) throws IOException {
- initialize(subReaders, closeSubReaders);
+ topLevelContext = initialize(subReaders, closeSubReaders);
}
- private void initialize(IndexReader[] subReaders, boolean closeSubReaders) throws IOException {
+ private ReaderContext initialize(IndexReader[] subReaders, boolean closeSubReaders) throws IOException {
this.subReaders = subReaders.clone();
starts = new int[subReaders.length + 1]; // build starts array
decrefOnClose = new boolean[subReaders.length];
-
for (int i = 0; i < subReaders.length; i++) {
starts[i] = maxDoc;
maxDoc += subReaders[i].maxDoc(); // compute maxDocs
@@ -82,14 +81,10 @@ public class MultiReader extends IndexReader implements Cloneable {
if (subReaders[i].hasDeletions()) {
hasDeletions = true;
}
-
- final ReaderUtil.Slice slice = new ReaderUtil.Slice(starts[i],
- subReaders[i].maxDoc(),
- i);
- subReaderToSlice.put(subReaders[i], slice);
}
-
starts[subReaders.length] = maxDoc;
+ readerFinishedListeners = new MapBackedSet<ReaderFinishedListener>(new ConcurrentHashMap<ReaderFinishedListener,Boolean>());
+ return ReaderUtil.buildReaderContext(this);
}
@Override
@@ -97,11 +92,6 @@ public class MultiReader extends IndexReader implements Cloneable {
throw new UnsupportedOperationException("");
}
- @Override
- public int getSubReaderDocBase(IndexReader subReader) {
- return subReaderToSlice.get(subReader).start;
- }
-
@Override
public Fields fields() throws IOException {
throw new UnsupportedOperationException("please use MultiFields.getFields, or wrap your IndexReader with SlowMultiReaderWrapper, if you really need a top level Fields");
@@ -316,12 +306,6 @@ public class MultiReader extends IndexReader implements Cloneable {
throw new UnsupportedOperationException("please use MultiNorms.norms, or wrap your IndexReader with SlowMultiReaderWrapper, if you really need a top level norms");
}
- @Override
- public synchronized void norms(String field, byte[] result, int offset)
- throws IOException {
- throw new UnsupportedOperationException("please use MultiNorms.norms, or wrap your IndexReader with SlowMultiReaderWrapper, if you really need a top level norms");
- }
-
@Override
protected void doSetNorm(int n, String field, byte value)
throws CorruptIndexException, IOException {
@@ -363,11 +347,6 @@ public class MultiReader extends IndexReader implements Cloneable {
subReaders[i].close();
}
}
-
- // NOTE: only needed in case someone had asked for
- // FieldCache for top-level reader (which is generally
- // not a good idea):
- FieldCache.DEFAULT.purge(this);
}
@Override
@@ -403,4 +382,25 @@ public class MultiReader extends IndexReader implements Cloneable {
public IndexReader[] getSequentialSubReaders() {
return subReaders;
}
+
+ @Override
+ public ReaderContext getTopReaderContext() {
+ return topLevelContext;
+ }
+
+ @Override
+ public void addReaderFinishedListener(ReaderFinishedListener listener) {
+ super.addReaderFinishedListener(listener);
+ for(IndexReader sub : subReaders) {
+ sub.addReaderFinishedListener(listener);
+ }
+ }
+
+ @Override
+ public void removeReaderFinishedListener(ReaderFinishedListener listener) {
+ super.removeReaderFinishedListener(listener);
+ for(IndexReader sub : subReaders) {
+ sub.removeReaderFinishedListener(listener);
+ }
+ }
}
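Usage note: a rough sketch of the listener propagation and the new top-level context added above. ReaderFinishedListener is assumed to expose a single finished(IndexReader) callback, and reader1/reader2/appCache are placeholders.

// Sketch: purge an application-level cache whenever a (sub)reader is finished.
MultiReader multi = new MultiReader(reader1, reader2);
multi.addReaderFinishedListener(new IndexReader.ReaderFinishedListener() {
  public void finished(IndexReader r) {
    appCache.purge(r);            // hypothetical per-reader cache
  }
});
IndexReader.ReaderContext top = multi.getTopReaderContext();   // composite context, built once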
diff --git a/lucene/src/java/org/apache/lucene/index/MultiTerms.java b/lucene/src/java/org/apache/lucene/index/MultiTerms.java
index 4e265c056e6..2da5db54df8 100644
--- a/lucene/src/java/org/apache/lucene/index/MultiTerms.java
+++ b/lucene/src/java/org/apache/lucene/index/MultiTerms.java
@@ -76,6 +76,19 @@ public final class MultiTerms extends Terms {
}
}
+ @Override
+ public long getSumTotalTermFreq() throws IOException {
+ long sum = 0;
+ for(Terms terms : subs) {
+ final long v = terms.getSumTotalTermFreq();
+ if (v == -1) {
+ return -1;
+ }
+ sum += v;
+ }
+ return sum;
+ }
+
@Override
public Comparator<BytesRef> getComparator() {
return termComp;
diff --git a/lucene/src/java/org/apache/lucene/index/MultiTermsEnum.java b/lucene/src/java/org/apache/lucene/index/MultiTermsEnum.java
index 02e21b17ffc..f3283939e04 100644
--- a/lucene/src/java/org/apache/lucene/index/MultiTermsEnum.java
+++ b/lucene/src/java/org/apache/lucene/index/MultiTermsEnum.java
@@ -90,13 +90,6 @@ public final class MultiTermsEnum extends TermsEnum {
return current;
}
- @Override
- public void cacheCurrentTerm() throws IOException {
- for(int i=0;i getComparator() {
return termComp;
@@ -264,7 +257,7 @@ public final class MultiTermsEnum extends TermsEnum {
}
@Override
- public int docFreq() {
+ public int docFreq() throws IOException {
int sum = 0;
for(int i=0;i> byField = new HashMap>();
+ if (!fieldInfos.hasNorms()) {
+ return;
+ }
+
// Typically, each thread will have encountered the same
// field. So first we collate by field, ie, all
// per-thread field instances that correspond to the
@@ -137,7 +139,7 @@ final class NormsWriter extends InvertedDocEndConsumer {
// Fill hole
for(;upto> readerToFields = new HashMap>();
private List<IndexReader> storedFieldReaders = new ArrayList<IndexReader>();
private Map<String,byte[]> normsCache = new HashMap<String,byte[]>();
-
+ private final ReaderContext topLevelReaderContext = new AtomicReaderContext(this);
private int maxDoc;
private int numDocs;
private boolean hasDeletions;
@@ -76,6 +76,7 @@ public class ParallelReader extends IndexReader {
public ParallelReader(boolean closeSubReaders) throws IOException {
super();
this.incRefReaders = !closeSubReaders;
+ readerFinishedListeners = new MapBackedSet<ReaderFinishedListener>(new ConcurrentHashMap<ReaderFinishedListener,Boolean>());
}
/** {@inheritDoc} */
@@ -92,7 +93,7 @@ public class ParallelReader extends IndexReader {
buffer.append(')');
return buffer.toString();
}
-
+
/** Add an IndexReader.
* @throws IOException if there is a low-level IO error
*/
@@ -452,29 +453,14 @@ public class ParallelReader extends IndexReader {
return bytes;
if (!hasNorms(field))
return null;
+ if (normsCache.containsKey(field)) // cached omitNorms, not missing key
+ return null;
bytes = MultiNorms.norms(reader, field);
normsCache.put(field, bytes);
return bytes;
}
- @Override
- public synchronized void norms(String field, byte[] result, int offset)
- throws IOException {
- // TODO: maybe optimize
- ensureOpen();
- IndexReader reader = fieldToReader.get(field);
- if (reader==null)
- return;
-
- byte[] norms = norms(field);
- if (norms == null) {
- Arrays.fill(result, offset, result.length, Similarity.getDefault().encodeNormValue(1.0f));
- } else {
- System.arraycopy(norms, 0, result, offset, maxDoc());
- }
- }
-
@Override
protected void doSetNorm(int n, String field, byte value)
throws CorruptIndexException, IOException {
@@ -560,8 +546,6 @@ public class ParallelReader extends IndexReader {
readers.get(i).close();
}
}
-
- FieldCache.DEFAULT.purge(this);
}
@Override
@@ -574,6 +558,26 @@ public class ParallelReader extends IndexReader {
}
return fieldSet;
}
+ @Override
+ public ReaderContext getTopReaderContext() {
+ return topLevelReaderContext;
+ }
+
+ @Override
+ public void addReaderFinishedListener(ReaderFinishedListener listener) {
+ super.addReaderFinishedListener(listener);
+ for (IndexReader reader : readers) {
+ reader.addReaderFinishedListener(listener);
+ }
+ }
+
+ @Override
+ public void removeReaderFinishedListener(ReaderFinishedListener listener) {
+ super.removeReaderFinishedListener(listener);
+ for (IndexReader reader : readers) {
+ reader.removeReaderFinishedListener(listener);
+ }
+ }
}
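Note: the containsKey check added above distinguishes a deliberately cached null (field present but norms omitted) from a field that has not been looked up yet. A generic sketch of that negative-caching idiom (not Lucene API; loadNorms and the field names are invented):

// Negative caching: remember "no norms" so the expensive lookup runs at most once per field.
private final Map<String, byte[]> cache = new HashMap<String, byte[]>();

byte[] cachedNorms(String field) throws IOException {
  byte[] norms = cache.get(field);
  if (norms != null) return norms;              // positive hit
  if (cache.containsKey(field)) return null;    // negative hit: null was cached on purpose
  norms = loadNorms(field);                     // hypothetical loader; may return null (omitNorms)
  cache.put(field, norms);                      // caches null too, so we never recompute
  return norms;
}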
diff --git a/lucene/src/java/org/apache/lucene/index/PayloadProcessorProvider.java b/lucene/src/java/org/apache/lucene/index/PayloadProcessorProvider.java
index e9fe11adfb8..bf825c1dacd 100644
--- a/lucene/src/java/org/apache/lucene/index/PayloadProcessorProvider.java
+++ b/lucene/src/java/org/apache/lucene/index/PayloadProcessorProvider.java
@@ -24,7 +24,7 @@ import org.apache.lucene.util.BytesRef;
/**
* Provides a {@link DirPayloadProcessor} to be used for a {@link Directory}.
- * This allows using differnt {@link DirPayloadProcessor}s for different
+ * This allows using different {@link DirPayloadProcessor}s for different
* directories, for e.g. to perform different processing of payloads of
* different directories.
*
diff --git a/lucene/src/java/org/apache/lucene/index/PerFieldCodecWrapper.java b/lucene/src/java/org/apache/lucene/index/PerFieldCodecWrapper.java
index 2decf76b178..9df1c1acc20 100644
--- a/lucene/src/java/org/apache/lucene/index/PerFieldCodecWrapper.java
+++ b/lucene/src/java/org/apache/lucene/index/PerFieldCodecWrapper.java
@@ -224,6 +224,7 @@ final class PerFieldCodecWrapper extends Codec {
}
}
+ @Override
public FieldsProducer fieldsProducer(SegmentReadState state)
throws IOException {
return new FieldsReader(state.dir, state.fieldInfos, state.segmentInfo,
@@ -233,7 +234,7 @@ final class PerFieldCodecWrapper extends Codec {
@Override
public void files(Directory dir, SegmentInfo info, String codecId, Set files)
throws IOException {
- // ignore codecid sicne segmentCodec will assign it per codec
+ // ignore codecid since segmentCodec will assign it per codec
segmentCodecs.files(dir, info, files);
}
diff --git a/lucene/src/java/org/apache/lucene/index/PersistentSnapshotDeletionPolicy.java b/lucene/src/java/org/apache/lucene/index/PersistentSnapshotDeletionPolicy.java
index fc09266c377..f4869ea926a 100644
--- a/lucene/src/java/org/apache/lucene/index/PersistentSnapshotDeletionPolicy.java
+++ b/lucene/src/java/org/apache/lucene/index/PersistentSnapshotDeletionPolicy.java
@@ -103,7 +103,7 @@ public class PersistentSnapshotDeletionPolicy extends SnapshotDeletionPolicy {
* @param mode
* specifies whether a new index should be created, deleting all
* existing snapshots information (immediately), or open an existing
- * index, initializing the class with the snapsthots information.
+ * index, initializing the class with the snapshots information.
* @param matchVersion
* specifies the {@link Version} that should be used when opening the
* IndexWriter.
diff --git a/lucene/src/java/org/apache/lucene/index/SegmentDeletes.java b/lucene/src/java/org/apache/lucene/index/SegmentDeletes.java
deleted file mode 100644
index 1bb7f028c44..00000000000
--- a/lucene/src/java/org/apache/lucene/index/SegmentDeletes.java
+++ /dev/null
@@ -1,188 +0,0 @@
-package org.apache.lucene.index;
-
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-import java.util.ArrayList;
-import java.util.HashMap;
-import java.util.List;
-import java.util.Map;
-import java.util.SortedMap;
-import java.util.TreeMap;
-import java.util.concurrent.atomic.AtomicLong;
-import java.util.concurrent.atomic.AtomicInteger;
-
-import org.apache.lucene.search.Query;
-import org.apache.lucene.util.RamUsageEstimator;
-
-/** Holds buffered deletes, by docID, term or query for a
- * single segment. This is used to hold buffered pending
- * deletes against the to-be-flushed segment as well as
- * per-segment deletes for each segment in the index. */
-
-// NOTE: we are sync'd by BufferedDeletes, ie, all access to
-// instances of this class is via sync'd methods on
-// BufferedDeletes
-class SegmentDeletes {
-
- /* Rough logic: HashMap has an array[Entry] w/ varying
- load factor (say 2 * POINTER). Entry is object w/ Term
- key, Integer val, int hash, Entry next
- (OBJ_HEADER + 3*POINTER + INT). Term is object w/
- String field and String text (OBJ_HEADER + 2*POINTER).
- We don't count Term's field since it's interned.
- Term's text is String (OBJ_HEADER + 4*INT + POINTER +
- OBJ_HEADER + string.length*CHAR). Integer is
- OBJ_HEADER + INT. */
- final static int BYTES_PER_DEL_TERM = 8*RamUsageEstimator.NUM_BYTES_OBJECT_REF + 5*RamUsageEstimator.NUM_BYTES_OBJECT_HEADER + 6*RamUsageEstimator.NUM_BYTES_INT;
-
- /* Rough logic: del docIDs are List<Integer>. Say list
- allocates ~2X size (2*POINTER). Integer is OBJ_HEADER
- + int */
- final static int BYTES_PER_DEL_DOCID = 2*RamUsageEstimator.NUM_BYTES_OBJECT_REF + RamUsageEstimator.NUM_BYTES_OBJECT_HEADER + RamUsageEstimator.NUM_BYTES_INT;
-
- /* Rough logic: HashMap has an array[Entry] w/ varying
- load factor (say 2 * POINTER). Entry is object w/
- Query key, Integer val, int hash, Entry next
- (OBJ_HEADER + 3*POINTER + INT). Query we often
- undercount (say 24 bytes). Integer is OBJ_HEADER + INT. */
- final static int BYTES_PER_DEL_QUERY = 5*RamUsageEstimator.NUM_BYTES_OBJECT_REF + 2*RamUsageEstimator.NUM_BYTES_OBJECT_HEADER + 2*RamUsageEstimator.NUM_BYTES_INT + 24;
-
- // TODO: many of the deletes stored here will map to
- // Integer.MAX_VALUE; we could be more efficient for this
- // case ie use a SortedSet not a SortedMap. But: Java's
- // SortedSet impls are simply backed by a Map so we won't
- // save anything unless we do something custom...
- final AtomicInteger numTermDeletes = new AtomicInteger();
- final SortedMap<Term,Integer> terms = new TreeMap<Term,Integer>();
- final Map<Query,Integer> queries = new HashMap<Query,Integer>();
- final List<Integer> docIDs = new ArrayList<Integer>();
-
- public static final Integer MAX_INT = Integer.valueOf(Integer.MAX_VALUE);
-
- final AtomicLong bytesUsed = new AtomicLong();
-
- private final static boolean VERBOSE_DELETES = false;
-
- @Override
- public String toString() {
- if (VERBOSE_DELETES) {
- return "SegmentDeletes [numTerms=" + numTermDeletes + ", terms=" + terms
- + ", queries=" + queries + ", docIDs=" + docIDs + ", bytesUsed="
- + bytesUsed + "]";
- } else {
- String s = "";
- if (numTermDeletes.get() != 0) {
- s += " " + numTermDeletes.get() + " deleted terms (unique count=" + terms.size() + ")";
- }
- if (queries.size() != 0) {
- s += " " + queries.size() + " deleted queries";
- }
- if (docIDs.size() != 0) {
- s += " " + docIDs.size() + " deleted docIDs";
- }
- if (bytesUsed.get() != 0) {
- s += " bytesUsed=" + bytesUsed.get();
- }
-
- return s;
- }
- }
-
- void update(SegmentDeletes in, boolean noLimit) {
- numTermDeletes.addAndGet(in.numTermDeletes.get());
- for (Map.Entry<Term,Integer> ent : in.terms.entrySet()) {
- final Term term = ent.getKey();
- if (!terms.containsKey(term)) {
- // only incr bytesUsed if this term wasn't already buffered:
- bytesUsed.addAndGet(BYTES_PER_DEL_TERM);
- }
- final Integer limit;
- if (noLimit) {
- limit = MAX_INT;
- } else {
- limit = ent.getValue();
- }
- terms.put(term, limit);
- }
-
- for (Map.Entry<Query,Integer> ent : in.queries.entrySet()) {
- final Query query = ent.getKey();
- if (!queries.containsKey(query)) {
- // only incr bytesUsed if this query wasn't already buffered:
- bytesUsed.addAndGet(BYTES_PER_DEL_QUERY);
- }
- final Integer limit;
- if (noLimit) {
- limit = MAX_INT;
- } else {
- limit = ent.getValue();
- }
- queries.put(query, limit);
- }
-
- // docIDs never move across segments and the docIDs
- // should already be cleared
- }
-
- public void addQuery(Query query, int docIDUpto) {
- queries.put(query, docIDUpto);
- bytesUsed.addAndGet(BYTES_PER_DEL_QUERY);
- }
-
- public void addDocID(int docID) {
- docIDs.add(Integer.valueOf(docID));
- bytesUsed.addAndGet(BYTES_PER_DEL_DOCID);
- }
-
- public void addTerm(Term term, int docIDUpto) {
- Integer current = terms.get(term);
- if (current != null && docIDUpto < current) {
- // Only record the new number if it's greater than the
- // current one. This is important because if multiple
- // threads are replacing the same doc at nearly the
- // same time, it's possible that one thread that got a
- // higher docID is scheduled before the other
- // threads. If we blindly replace then we can get
- // double-doc in the segment.
- return;
- }
-
- terms.put(term, Integer.valueOf(docIDUpto));
- numTermDeletes.incrementAndGet();
- if (current == null) {
- bytesUsed.addAndGet(BYTES_PER_DEL_TERM + term.bytes.length);
- }
- }
-
- void clear() {
- terms.clear();
- queries.clear();
- docIDs.clear();
- numTermDeletes.set(0);
- bytesUsed.set(0);
- }
-
- void clearDocIDs() {
- bytesUsed.addAndGet(-docIDs.size()*BYTES_PER_DEL_DOCID);
- docIDs.clear();
- }
-
- boolean any() {
- return terms.size() > 0 || docIDs.size() > 0 || queries.size() > 0;
- }
-}
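Note on the RAM accounting in the deleted class above (the equivalent bookkeeping lives on in BufferedDeletes): the BYTES_PER_DEL_TERM constant follows directly from the per-object breakdown in the comments. References = 2 (map array slot) + 3 (Entry) + 2 (Term) + 1 (String's char[]) = 8; object headers = 1 (Entry) + 1 (Term) + 2 (String plus its char[]) + 1 (Integer) = 5; ints = 1 (Entry.hash) + 4 (String) + 1 (Integer) = 6, which is exactly 8*NUM_BYTES_OBJECT_REF + 5*NUM_BYTES_OBJECT_HEADER + 6*NUM_BYTES_INT. The per-character/byte cost of the term text itself is added separately when each term is buffered.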
diff --git a/lucene/src/java/org/apache/lucene/index/SegmentInfo.java b/lucene/src/java/org/apache/lucene/index/SegmentInfo.java
index 1c414934ac9..31838b1fd96 100644
--- a/lucene/src/java/org/apache/lucene/index/SegmentInfo.java
+++ b/lucene/src/java/org/apache/lucene/index/SegmentInfo.java
@@ -20,6 +20,7 @@ package org.apache.lucene.index;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.IndexOutput;
import org.apache.lucene.store.IndexInput;
+import org.apache.lucene.util.Constants;
import org.apache.lucene.index.codecs.Codec;
import org.apache.lucene.index.codecs.CodecProvider;
import org.apache.lucene.index.codecs.DefaultSegmentInfosWriter;
@@ -67,10 +68,11 @@ public final class SegmentInfo {
private boolean isCompoundFile;
- private List<String> files; // cached list of files that this segment uses
+ private volatile List<String> files; // cached list of files that this segment uses
// in the Directory
- long sizeInBytes = -1; // total byte size of all of our files (computed on demand)
+ private volatile long sizeInBytesNoStore = -1; // total byte size of all but the store files (computed on demand)
+ private volatile long sizeInBytesWithStore = -1; // total byte size of all of our files (computed on demand)
private int docStoreOffset; // if this segment shares stored fields & vectors, this
// offset is where in that file this segment's docs begin
@@ -88,6 +90,17 @@ public final class SegmentInfo {
private Map<String,String> diagnostics;
+ // Tracks the Lucene version this segment was created with, since 3.1. Null
+ // indicates an older than 3.0 index, and it's used to detect a too old index.
+ // The format expected is "x.y" - "2.x" for pre-3.0 indexes (or null), and
+ // specific versions afterwards ("3.0", "3.1" etc.).
+ // see Constants.LUCENE_MAIN_VERSION.
+ private String version;
+
+ // NOTE: only used in-RAM by IW to track buffered deletes;
+ // this is never written to/read from the Directory
+ private long bufferedDeletesGen;
+
public SegmentInfo(String name, int docCount, Directory dir, boolean isCompoundFile,
boolean hasProx, SegmentCodecs segmentCodecs, boolean hasVectors) {
this.name = name;
@@ -96,10 +109,12 @@ public final class SegmentInfo {
delGen = NO;
this.isCompoundFile = isCompoundFile;
this.docStoreOffset = -1;
+ this.docStoreSegment = name;
this.hasProx = hasProx;
this.segmentCodecs = segmentCodecs;
this.hasVectors = hasVectors;
delCount = 0;
+ version = Constants.LUCENE_MAIN_VERSION;
}
/**
@@ -107,11 +122,13 @@ public final class SegmentInfo {
*/
void reset(SegmentInfo src) {
clearFiles();
+ version = src.version;
name = src.name;
docCount = src.docCount;
dir = src.dir;
delGen = src.delGen;
docStoreOffset = src.docStoreOffset;
+ docStoreSegment = src.docStoreSegment;
docStoreIsCompoundFile = src.docStoreIsCompoundFile;
hasVectors = src.hasVectors;
hasProx = src.hasProx;
@@ -146,6 +163,9 @@ public final class SegmentInfo {
*/
public SegmentInfo(Directory dir, int format, IndexInput input, CodecProvider codecs) throws IOException {
this.dir = dir;
+ if (format <= DefaultSegmentInfosWriter.FORMAT_3_1) {
+ version = input.readString();
+ }
name = input.readString();
docCount = input.readInt();
delGen = input.readLong();
@@ -219,26 +239,41 @@ public final class SegmentInfo {
}
}
}
-
- /** Returns total size in bytes of all of files used by
- * this segment. */
+
+ /**
+ * Returns total size in bytes of all of files used by this segment (if
+ * {@code includeDocStores} is true), or the size of all files except the
+ * store files otherwise.
+ */
public long sizeInBytes(boolean includeDocStores) throws IOException {
- if (sizeInBytes == -1) {
- List<String> files = files();
- final int size = files.size();
- sizeInBytes = 0;
- for(int i=0;i
+ * NOTE: this method is used for internal purposes only - you should
+ * not modify the version of a SegmentInfo, or it may result in unexpected
+ * exceptions thrown when you attempt to open the index.
+ *
+ * @lucene.internal
+ */
+ public void setVersion(String version) {
+ this.version = version;
+ }
+
+ /** Returns the version of the code which wrote the segment. */
+ public String getVersion() {
+ return version;
+ }
+
+ long getBufferedDeletesGen() {
+ return bufferedDeletesGen;
+ }
+
+ void setBufferedDeletesGen(long v) {
+ bufferedDeletesGen = v;
+ }
}
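Usage note: a tiny sketch of inspecting the per-segment version string recorded above; segmentInfos is an assumed SegmentInfos instance, and per the comment the value may be null (or "2.x") for segments written before 3.1.

// Sketch: find segments written by a pre-3.1 release (no version recorded).
for (int i = 0; i < segmentInfos.size(); i++) {
  SegmentInfo si = segmentInfos.info(i);
  String v = si.getVersion();          // e.g. "3.0", "3.1"; null for very old segments
  if (v == null || v.startsWith("2.")) {
    // segment predates the version-tracking format
  }
}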
diff --git a/lucene/src/java/org/apache/lucene/index/SegmentInfos.java b/lucene/src/java/org/apache/lucene/index/SegmentInfos.java
index 896e6222266..493279ee17b 100644
--- a/lucene/src/java/org/apache/lucene/index/SegmentInfos.java
+++ b/lucene/src/java/org/apache/lucene/index/SegmentInfos.java
@@ -308,6 +308,19 @@ public final class SegmentInfos extends Vector {
}
}
+ /** Prunes any segment whose docs are all deleted. */
+ public void pruneDeletedSegments() {
+ int segIdx = 0;
+ while(segIdx < size()) {
+ final SegmentInfo info = info(segIdx);
+ if (info.getDelCount() == info.docCount) {
+ remove(segIdx);
+ } else {
+ segIdx++;
+ }
+ }
+ }
+
/**
* Returns a copy of this instance, also copying each
* SegmentInfo.
diff --git a/lucene/src/java/org/apache/lucene/index/SegmentMerger.java b/lucene/src/java/org/apache/lucene/index/SegmentMerger.java
index 5aec216579b..da76904f011 100644
--- a/lucene/src/java/org/apache/lucene/index/SegmentMerger.java
+++ b/lucene/src/java/org/apache/lucene/index/SegmentMerger.java
@@ -19,6 +19,7 @@ package org.apache.lucene.index;
import java.io.IOException;
import java.util.ArrayList;
+import java.util.Arrays;
import java.util.Collection;
import java.util.List;
import java.util.concurrent.atomic.AtomicLong;
@@ -59,7 +60,7 @@ final class SegmentMerger {
private int mergedDocs;
- private final CheckAbort checkAbort;
+ private final MergeState.CheckAbort checkAbort;
/** Maximum number of contiguous documents to bulk-copy
when merging stored fields */
@@ -78,9 +79,9 @@ final class SegmentMerger {
this.fieldInfos = fieldInfos;
segment = name;
if (merge != null) {
- checkAbort = new CheckAbort(merge, directory);
+ checkAbort = new MergeState.CheckAbort(merge, directory);
} else {
- checkAbort = new CheckAbort(null, null) {
+ checkAbort = new MergeState.CheckAbort(null, null) {
@Override
public void work(double units) throws MergeAbortedException {
// do nothing
@@ -266,7 +267,7 @@ final class SegmentMerger {
// details.
throw new RuntimeException("mergeFields produced an invalid result: docCount is " + docCount + " but fdx file size is " + fdxFileLength + " file=" + fileName + " file exists?=" + directory.fileExists(fileName) + "; now aborting this merge to prevent index corruption");
- segmentWriteState = new SegmentWriteState(null, directory, segment, fieldInfos, docCount, termIndexInterval, codecInfo, new AtomicLong(0));
+ segmentWriteState = new SegmentWriteState(null, directory, segment, fieldInfos, docCount, termIndexInterval, codecInfo, null, new AtomicLong(0));
return docCount;
}
@@ -508,6 +509,7 @@ final class SegmentMerger {
mergeState.hasPayloadProcessorProvider = payloadProcessorProvider != null;
mergeState.dirPayloadProcessor = new PayloadProcessorProvider.DirPayloadProcessor[mergeState.readerCount];
mergeState.currentPayloadProcessor = new PayloadProcessorProvider.PayloadProcessor[mergeState.readerCount];
+ mergeState.checkAbort = checkAbort;
docBase = 0;
int inputDocBase = 0;
@@ -571,13 +573,6 @@ final class SegmentMerger {
}
private void mergeNorms() throws IOException {
- // get needed buffer size by finding the largest segment
- int bufferSize = 0;
- for (IndexReader reader : readers) {
- bufferSize = Math.max(bufferSize, reader.maxDoc());
- }
-
- byte[] normBuffer = null;
IndexOutput output = null;
try {
for (int i = 0, numFieldInfos = fieldInfos.size(); i < numFieldInfos; i++) {
@@ -587,12 +582,15 @@ final class SegmentMerger {
output = directory.createOutput(IndexFileNames.segmentFileName(segment, "", IndexFileNames.NORMS_EXTENSION));
output.writeBytes(NORMS_HEADER,NORMS_HEADER.length);
}
- if (normBuffer == null) {
- normBuffer = new byte[bufferSize];
- }
for (IndexReader reader : readers) {
final int maxDoc = reader.maxDoc();
- reader.norms(fi.name, normBuffer, 0);
+ byte normBuffer[] = reader.norms(fi.name);
+ if (normBuffer == null) {
+ // Can be null if this segment doesn't have
+ // any docs with this field
+ normBuffer = new byte[maxDoc];
+ Arrays.fill(normBuffer, (byte)0);
+ }
if (!reader.hasDeletions()) {
//optimized case for segments without deleted docs
output.writeBytes(normBuffer, maxDoc);
@@ -616,31 +614,4 @@ final class SegmentMerger {
}
}
}
-
- static class CheckAbort {
- private double workCount;
- private MergePolicy.OneMerge merge;
- private Directory dir;
- public CheckAbort(MergePolicy.OneMerge merge, Directory dir) {
- this.merge = merge;
- this.dir = dir;
- }
-
- /**
- * Records the fact that roughly units amount of work
- * have been done since this method was last called.
- * When adding time-consuming code into SegmentMerger,
- * you should test different values for units to ensure
- * that the time in between calls to merge.checkAborted
- * is up to ~ 1 second.
- */
- public void work(double units) throws MergePolicy.MergeAbortedException {
- workCount += units;
- if (workCount >= 10000.0) {
- merge.checkAborted(dir);
- workCount = 0;
- }
- }
- }
-
}
diff --git a/lucene/src/java/org/apache/lucene/index/SegmentReader.java b/lucene/src/java/org/apache/lucene/index/SegmentReader.java
index fe4a7bfee4e..f8a0598f72d 100644
--- a/lucene/src/java/org/apache/lucene/index/SegmentReader.java
+++ b/lucene/src/java/org/apache/lucene/index/SegmentReader.java
@@ -19,7 +19,6 @@ package org.apache.lucene.index;
import java.io.IOException;
import java.util.ArrayList;
-import java.util.Arrays;
import java.util.Collection;
import java.util.HashMap;
import java.util.HashSet;
@@ -32,7 +31,6 @@ import java.util.concurrent.atomic.AtomicInteger;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.FieldSelector;
-import org.apache.lucene.search.Similarity;
import org.apache.lucene.store.BufferedIndexInput;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.IndexInput;
@@ -46,7 +44,6 @@ import org.apache.lucene.index.values.Ints;
import org.apache.lucene.index.values.DocValues;
import org.apache.lucene.index.values.Floats;
import org.apache.lucene.index.values.Type;
-import org.apache.lucene.search.FieldCache; // not great (circular); used only to purge FieldCache entry on close
import org.apache.lucene.util.BytesRef;
/**
@@ -57,7 +54,7 @@ public class SegmentReader extends IndexReader implements Cloneable {
private SegmentInfo si;
private int readBufferSize;
-
+ private final ReaderContext readerContext = new AtomicReaderContext(this);
CloseableThreadLocal<FieldsReader> fieldsReaderLocal = new FieldsReaderLocal();
CloseableThreadLocal<TermVectorsReader> termVectorsLocal = new CloseableThreadLocal<TermVectorsReader>();
@@ -190,13 +187,9 @@ public class SegmentReader extends IndexReader implements Cloneable {
storeCFSReader.close();
}
- // Force FieldCache to evict our entries at this
- // point. If the exception occurred while
- // initializing the core readers, then
- // origInstance will be null, and we don't want
- // to call FieldCache.purge (it leads to NPE):
+ // Now, notify any ReaderFinished listeners:
if (origInstance != null) {
- FieldCache.DEFAULT.purge(origInstance);
+ origInstance.notifyReaderFinishedListeners();
}
}
}
@@ -233,13 +226,7 @@ public class SegmentReader extends IndexReader implements Cloneable {
assert storeDir != null;
}
- final String storesSegment;
- if (si.getDocStoreOffset() != -1) {
- storesSegment = si.getDocStoreSegment();
- } else {
- storesSegment = segment;
- }
-
+ final String storesSegment = si.getDocStoreSegment();
fieldsReaderOrig = new FieldsReader(storeDir, storesSegment, fieldInfos, readBufferSize,
si.getDocStoreOffset(), si.docCount);
@@ -342,29 +329,6 @@ public class SegmentReader extends IndexReader implements Cloneable {
}
}
- // Load bytes but do not cache them if they were not
- // already cached
- public synchronized void bytes(byte[] bytesOut, int offset, int len) throws IOException {
- assert refCount > 0 && (origNorm == null || origNorm.refCount > 0);
- if (bytes != null) {
- // Already cached -- copy from cache:
- assert len <= maxDoc();
- System.arraycopy(bytes, 0, bytesOut, offset, len);
- } else {
- // Not cached
- if (origNorm != null) {
- // Ask origNorm to load
- origNorm.bytes(bytesOut, offset, len);
- } else {
- // We are orig -- read ourselves from disk:
- synchronized(in) {
- in.seek(normSeek);
- in.readBytes(bytesOut, offset, len, false);
- }
- }
- }
- }
-
// Load & cache full bytes array. Returns bytes.
public synchronized byte[] bytes() throws IOException {
assert refCount > 0 && (origNorm == null || origNorm.refCount > 0);
@@ -669,6 +633,7 @@ public class SegmentReader extends IndexReader implements Cloneable {
clone.si = si;
clone.readBufferSize = readBufferSize;
clone.pendingDeleteCount = pendingDeleteCount;
+ clone.readerFinishedListeners = readerFinishedListeners;
if (!openReadOnly && hasChanges) {
// My pending changes transfer to the new reader
@@ -999,22 +964,6 @@ public class SegmentReader extends IndexReader implements Cloneable {
norm.copyOnWrite()[doc] = value; // set the value
}
- /** Read norms into a pre-allocated array. */
- @Override
- public synchronized void norms(String field, byte[] bytes, int offset)
- throws IOException {
-
- ensureOpen();
- Norm norm = norms.get(field);
- if (norm == null) {
- Arrays.fill(bytes, offset, bytes.length, Similarity.getDefault().encodeNormValue(1.0f));
- return;
- }
-
- norm.bytes(bytes, offset, maxDoc());
- }
-
-
private void openNorms(Directory cfsDir, int readBufferSize) throws IOException {
long nextNormSeek = SegmentMerger.NORMS_HEADER.length; //skip header (header unused for now)
int maxDoc = maxDoc();
@@ -1191,6 +1140,11 @@ public class SegmentReader extends IndexReader implements Cloneable {
buffer.append(si.toString(core.dir, pendingDeleteCount));
return buffer.toString();
}
+
+ @Override
+ public ReaderContext getTopReaderContext() {
+ return readerContext;
+ }
/**
* Return the name of the segment this reader is reading.
@@ -1254,6 +1208,16 @@ public class SegmentReader extends IndexReader implements Cloneable {
return core.termsIndexDivisor;
}
+ @Override
+ protected void readerFinished() {
+ // Do nothing here -- we have more careful control on
+ // when to notify that a SegmentReader has finished,
+ // because a given core is shared across many cloned
+ // SegmentReaders. We only notify once that core is no
+ // longer used (all SegmentReaders sharing it have been
+ // closed).
+ }
+
@Override
public DocValues docValues(String field) throws IOException {
return core.fields.docValues(field);
diff --git a/lucene/src/java/org/apache/lucene/index/SegmentWriteState.java b/lucene/src/java/org/apache/lucene/index/SegmentWriteState.java
index 3ef036f4aba..98cfdb4edf3 100644
--- a/lucene/src/java/org/apache/lucene/index/SegmentWriteState.java
+++ b/lucene/src/java/org/apache/lucene/index/SegmentWriteState.java
@@ -23,6 +23,7 @@ import java.util.HashSet;
import java.util.concurrent.atomic.AtomicLong;
import org.apache.lucene.store.Directory;
+import org.apache.lucene.util.BitVector;
/**
* @lucene.experimental
@@ -37,6 +38,16 @@ public class SegmentWriteState {
public final Collection<String> flushedFiles;
public final AtomicLong bytesUsed;
+ // Deletes to apply while we are flushing the segment. A
+ // Term is enrolled in here if it was deleted at one
+ // point, and it's mapped to the docIDUpto, meaning any
+ // docID < docIDUpto containing this term should be
+ // deleted.
+ public final BufferedDeletes segDeletes;
+
+ // Lazily created:
+ public BitVector deletedDocs;
+
final SegmentCodecs segmentCodecs;
public final String codecId;
@@ -62,8 +73,9 @@ public class SegmentWriteState {
public SegmentWriteState(PrintStream infoStream, Directory directory, String segmentName, FieldInfos fieldInfos,
- int numDocs, int termIndexInterval, SegmentCodecs segmentCodecs, AtomicLong bytesUsed) {
+ int numDocs, int termIndexInterval, SegmentCodecs segmentCodecs, BufferedDeletes segDeletes, AtomicLong bytesUsed) {
this.infoStream = infoStream;
+ this.segDeletes = segDeletes;
this.directory = directory;
this.segmentName = segmentName;
this.fieldInfos = fieldInfos;
@@ -88,6 +100,7 @@ public class SegmentWriteState {
segmentCodecs = state.segmentCodecs;
flushedFiles = state.flushedFiles;
this.codecId = codecId;
+ segDeletes = state.segDeletes;
bytesUsed = state.bytesUsed;
}
}
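Note: a sketch of the docIDUpto rule described in the segDeletes comment above; only documents added to the segment before the delete arrived (docID < docIDUpto) may be marked deleted. The helper name and the surrounding flush plumbing are assumptions; DocsEnum and BitVector are the types used in this patch.

// Sketch: apply one buffered delete-by-term while flushing a segment.
void applyTermDelete(DocsEnum docs, int docIDUpto, BitVector deletedDocs) throws IOException {
  int doc;
  while ((doc = docs.nextDoc()) != DocsEnum.NO_MORE_DOCS) {
    if (doc >= docIDUpto) {
      break;                  // this doc (and all later ones) was added after the delete
    }
    deletedDocs.set(doc);     // the lazily created BitVector on SegmentWriteState
  }
}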
diff --git a/lucene/src/java/org/apache/lucene/index/SlowMultiReaderWrapper.java b/lucene/src/java/org/apache/lucene/index/SlowMultiReaderWrapper.java
index 7a29870586f..78c834f8008 100644
--- a/lucene/src/java/org/apache/lucene/index/SlowMultiReaderWrapper.java
+++ b/lucene/src/java/org/apache/lucene/index/SlowMultiReaderWrapper.java
@@ -18,13 +18,9 @@ package org.apache.lucene.index;
*/
import java.io.IOException;
-import java.util.Arrays;
import java.util.HashMap;
-import java.util.List;
-import java.util.ArrayList;
import java.util.Map;
-import org.apache.lucene.search.Similarity;
import org.apache.lucene.util.Bits;
import org.apache.lucene.util.ReaderUtil; // javadoc
@@ -55,10 +51,12 @@ import org.apache.lucene.index.MultiReader; // javadoc
public final class SlowMultiReaderWrapper extends FilterIndexReader {
+ private final ReaderContext readerContext;
private final Map<String,byte[]> normsCache = new HashMap<String,byte[]>();
public SlowMultiReaderWrapper(IndexReader other) {
super(other);
+ readerContext = new AtomicReaderContext(this); // emulate atomic reader!
}
@Override
@@ -85,22 +83,17 @@ public final class SlowMultiReaderWrapper extends FilterIndexReader {
return bytes;
if (!hasNorms(field))
return null;
-
+ if (normsCache.containsKey(field)) // cached omitNorms, not missing key
+ return null;
+
bytes = MultiNorms.norms(in, field);
normsCache.put(field, bytes);
return bytes;
}
-
+
@Override
- public synchronized void norms(String field, byte[] bytes, int offset) throws IOException {
- // TODO: maybe optimize
- ensureOpen();
- byte[] norms = norms(field);
- if (norms == null) {
- Arrays.fill(bytes, offset, bytes.length, Similarity.getDefault().encodeNormValue(1.0f));
- } else {
- System.arraycopy(norms, 0, bytes, offset, maxDoc());
- }
+ public ReaderContext getTopReaderContext() {
+ return readerContext;
}
@Override
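Usage note: a brief sketch of the wrapper after this change; dir and the "body" field are assumptions, and IndexReader.open is the stock directory-reader factory.

// Sketch: expose a composite reader as a single "atomic" reader for legacy top-level access.
IndexReader composite = IndexReader.open(dir);
IndexReader atomic = new SlowMultiReaderWrapper(composite);
byte[] norms = atomic.norms("body");                           // null if the field omits norms
IndexReader.ReaderContext ctx = atomic.getTopReaderContext();  // an AtomicReaderContext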
diff --git a/lucene/src/java/org/apache/lucene/index/codecs/TermState.java b/lucene/src/java/org/apache/lucene/index/TermState.java
similarity index 53%
rename from lucene/src/java/org/apache/lucene/index/codecs/TermState.java
rename to lucene/src/java/org/apache/lucene/index/TermState.java
index df437f54dd8..3279366b589 100644
--- a/lucene/src/java/org/apache/lucene/index/codecs/TermState.java
+++ b/lucene/src/java/org/apache/lucene/index/TermState.java
@@ -1,4 +1,4 @@
-package org.apache.lucene.index.codecs;
+package org.apache.lucene.index;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
@@ -17,27 +17,23 @@ package org.apache.lucene.index.codecs;
* limitations under the License.
*/
-import org.apache.lucene.index.DocsEnum; // for javadocs
-
-import org.apache.lucene.index.codecs.standard.StandardPostingsReader; // javadocs
-
/**
- * Holds all state required for {@link StandardPostingsReader}
- * to produce a {@link DocsEnum} without re-seeking the
- * terms dict.
+ * Encapsulates all required internal state to position the associated
+ * {@link TermsEnum} without re-seeking.
+ *
+ * @see TermsEnum#seek(org.apache.lucene.util.BytesRef, TermState)
+ * @see TermsEnum#termState()
* @lucene.experimental
*/
-public class TermState implements Cloneable {
+public abstract class TermState implements Cloneable {
- public long ord; // ord for this term
- public long filePointer; // fp into the terms dict primary file (_X.tis)
- public int docFreq; // how many docs have this term
-
- public void copy(TermState other) {
- ord = other.ord;
- filePointer = other.filePointer;
- docFreq = other.docFreq;
- }
+ /**
+ * Copies the content of the given {@link TermState} to this instance
+ *
+ * @param other
+ * the TermState to copy
+ */
+ public abstract void copyFrom(TermState other);
@Override
public Object clone() {
@@ -47,10 +43,5 @@ public class TermState implements Cloneable {
// should not happen
throw new RuntimeException(cnse);
}
- }
-
- @Override
- public String toString() {
- return "tis.fp=" + filePointer + " docFreq=" + docFreq + " ord=" + ord;
- }
-}
+ }
+}
\ No newline at end of file
diff --git a/lucene/src/java/org/apache/lucene/index/TermVectorsTermsWriterPerField.java b/lucene/src/java/org/apache/lucene/index/TermVectorsTermsWriterPerField.java
index 4938538d054..2b4e35e09cd 100644
--- a/lucene/src/java/org/apache/lucene/index/TermVectorsTermsWriterPerField.java
+++ b/lucene/src/java/org/apache/lucene/index/TermVectorsTermsWriterPerField.java
@@ -281,6 +281,7 @@ final class TermVectorsTermsWriterPerField extends TermsHashConsumerPerField {
int[] lastOffsets; // Last offset we saw
int[] lastPositions; // Last position where this term occurred
+ @Override
ParallelPostingsArray newInstance(int size) {
return new TermVectorsPostingsArray(size);
}
diff --git a/lucene/src/java/org/apache/lucene/index/Terms.java b/lucene/src/java/org/apache/lucene/index/Terms.java
index 362476754f6..e68293097cf 100644
--- a/lucene/src/java/org/apache/lucene/index/Terms.java
+++ b/lucene/src/java/org/apache/lucene/index/Terms.java
@@ -57,6 +57,18 @@ public abstract class Terms {
}
}
+ /** Returns the total number of occurrences of the
+ * specified term text across all documents (the sum of
+ * the freq() for each doc that has the term), or -1 if
+ * the codec does not store this measure. Returns 0 if
+ * the term does not exist. */
+ public long totalTermFreq(BytesRef text) throws IOException {
+ final TermsEnum termsEnum = getThreadTermsEnum();
+ if (termsEnum.seek(text) == TermsEnum.SeekStatus.FOUND) {
+ return termsEnum.totalTermFreq();
+ } else {
+ return 0;
+ }
+ }
+
/** Get {@link DocsEnum} for the specified term. This
* method may return null if the term does not exist. */
public DocsEnum docs(Bits skipDocs, BytesRef text, DocsEnum reuse) throws IOException {
@@ -80,11 +92,59 @@ public abstract class Terms {
}
}
+ /**
+ * Expert: Get {@link DocsEnum} for the specified {@link TermState}.
+ * This method may return <code>null</code> if the term does not exist.
+ *
+ * @see TermsEnum#termState()
+ * @see TermsEnum#seek(BytesRef, TermState) */
+ public DocsEnum docs(Bits skipDocs, BytesRef term, TermState termState, DocsEnum reuse) throws IOException {
+ final TermsEnum termsEnum = getThreadTermsEnum();
+ termsEnum.seek(term, termState);
+ return termsEnum.docs(skipDocs, reuse);
+ }
+
+ /**
+ * Get {@link DocsAndPositionsEnum} for the specified {@link TermState}.
+ * This method may return <code>null</code> if the term does not exist,
+ * or positions were not indexed.
+ *
+ * @see TermsEnum#termState()
+ * @see TermsEnum#seek(BytesRef, TermState) */
+ public DocsAndPositionsEnum docsAndPositions(Bits skipDocs, BytesRef term, TermState termState, DocsAndPositionsEnum reuse) throws IOException {
+ final TermsEnum termsEnum = getThreadTermsEnum();
+ termsEnum.seek(term, termState);
+ return termsEnum.docsAndPositions(skipDocs, reuse);
+ }
+
public long getUniqueTermCount() throws IOException {
throw new UnsupportedOperationException("this reader does not implement getUniqueTermCount()");
}
- protected TermsEnum getThreadTermsEnum() throws IOException {
+ /** Returns the sum of {@link TermsEnum#totalTermFreq} for
+ * all terms in this field, or -1 if this measure isn't
+ * stored by the codec (or if this fields omits term freq
+ * and positions). Note that, just like other term
+ * measures, this measure does not take deleted documents
+ * into account. */
+ public abstract long getSumTotalTermFreq() throws IOException;
+
+ /**
+ * Returns a thread-private {@link TermsEnum} instance. Obtaining
+ * {@link TermsEnum} from this method might be more efficient than using
+ * {@link #iterator()} directly since this method doesn't necessarily create a
+ * new {@link TermsEnum} instance.
+ *
+ * NOTE: {@link TermsEnum} instances obtained from this method must not be
+ * shared across threads. The enum should only be used within a local context
+ * where other threads can't access it.
+ *
+ * @return a thread-private {@link TermsEnum} instance
+ * @throws IOException
+ * if an IOException occurs
+ * @lucene.internal
+ */
+ public TermsEnum getThreadTermsEnum() throws IOException {
TermsEnum termsEnum = threadEnums.get();
if (termsEnum == null) {
termsEnum = iterator();
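Usage note: a sketch of the new per-field/per-term statistics, using only methods added in this patch plus MultiFields-style access; reader and the "body"/"lucene" literals are assumptions, and every statistic may be unavailable (-1, 0, or an UnsupportedOperationException) depending on the codec.

// Sketch: rough average number of occurrences per unique term in one field.
Terms terms = MultiFields.getTerms(reader, "body");
if (terms != null) {
  long sumTTF = terms.getSumTotalTermFreq();                    // -1 if the codec doesn't store it
  long oneTerm = terms.totalTermFreq(new BytesRef("lucene"));   // 0 if the term does not exist
  long unique = terms.getUniqueTermCount();                     // may be unsupported
  if (sumTTF != -1 && unique > 0) {
    double avgOccurrencesPerTerm = (double) sumTTF / unique;
  }
}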
diff --git a/lucene/src/java/org/apache/lucene/index/TermsEnum.java b/lucene/src/java/org/apache/lucene/index/TermsEnum.java
index 9901a966d2b..2d6b6c1133e 100644
--- a/lucene/src/java/org/apache/lucene/index/TermsEnum.java
+++ b/lucene/src/java/org/apache/lucene/index/TermsEnum.java
@@ -73,7 +73,34 @@ public abstract class TermsEnum {
* may be before or after the current ord. See {@link
* #seek(BytesRef)}. */
public abstract SeekStatus seek(long ord) throws IOException;
-
+
+ /**
+ * Expert: Seeks a specific position by {@link TermState} previously obtained
+ * from {@link #termState()}. Callers should maintain the {@link TermState} to
+ * use this method. Low-level implementations may position the TermsEnum
+ * without re-seeking the term dictionary.
+ *
+ * Seeking by {@link TermState} should only be used iff the enum the state was
+ * obtained from and the enum the state is used for seeking are obtained from
+ * the same {@link IndexReader}, otherwise a {@link #seek(BytesRef, TermState)} call can
+ * leave the enum in undefined state.
+ *
+ * NOTE: Using this method with an incompatible {@link TermState} might leave
+ * this {@link TermsEnum} in undefined state. On a segment level
+ * {@link TermState} instances are compatible only iff the source and the
+ * target {@link TermsEnum} operate on the same field. If operating on segment
+ * level, TermState instances must not be used across segments.
+ *
+ * NOTE: A seek by {@link TermState} might not restore the
+ * {@link AttributeSource}'s state. {@link AttributeSource} states must be
+ * maintained separately if this method is used.
+ * @param term the term the TermState corresponds to
+ * @param state the {@link TermState}
+ * */
+ public void seek(BytesRef term, TermState state) throws IOException {
+ seek(term);
+ }
+
/** Increments the enumeration to the next element.
* Returns the resulting term, or null if the end was
* hit. The returned BytesRef may be re-used across calls
@@ -97,7 +124,15 @@ public abstract class TermsEnum {
* term. Do not call this before calling next() for the
* first time, after next() returns null or seek returns
* {@link SeekStatus#END}.*/
- public abstract int docFreq();
+ public abstract int docFreq() throws IOException;
+
+ /** Returns the total number of occurrences of this term
+ * across all documents (the sum of the freq() for each
+ * doc that has this term). This will be -1 if the
+ * codec doesn't support this measure. Note that, like
+ * other term measures, this measure does not take
+ * deleted documents into account. */
+ public abstract long totalTermFreq() throws IOException;
/** Get {@link DocsEnum} for the current term. Do not
* call this before calling {@link #next} or {@link
@@ -116,6 +151,25 @@ public abstract class TermsEnum {
* the postings by this codec. */
public abstract DocsAndPositionsEnum docsAndPositions(Bits skipDocs, DocsAndPositionsEnum reuse) throws IOException;
+ /**
+ * Expert: Returns the TermsEnum's internal state to position the TermsEnum
+ * without re-seeking the term dictionary.
+ *
+ * NOTE: A seek by {@link TermState} might not capture the
+ * {@link AttributeSource}'s state. Callers must maintain the
+ * {@link AttributeSource} states separately.
+ *
+ * @see TermState
+ * @see #seek(BytesRef, TermState)
+ */
+ public TermState termState() throws IOException {
+ return new TermState() {
+ @Override
+ public void copyFrom(TermState other) {
+ }
+ };
+ }
+
/** Return the {@link BytesRef} Comparator used to sort
* terms provided by the iterator. This may return
* null if there are no terms. Callers may invoke this
@@ -123,10 +177,6 @@ public abstract class TermsEnum {
* instance & reuse it. */
public abstract Comparator<BytesRef> getComparator() throws IOException;
- /** Optional optimization hint: informs the codec that the
- * current term is likely to be re-seek'd-to soon. */
- public abstract void cacheCurrentTerm() throws IOException;
-
/** An empty TermsEnum for quickly returning an empty instance e.g.
* in {@link org.apache.lucene.search.MultiTermQuery}
* Please note: This enum should be unmodifiable,
@@ -141,9 +191,6 @@ public abstract class TermsEnum {
@Override
public SeekStatus seek(long ord) { return SeekStatus.END; }
- @Override
- public void cacheCurrentTerm() {}
-
@Override
public BytesRef term() {
throw new IllegalStateException("this method should never be called");
@@ -158,6 +205,11 @@ public abstract class TermsEnum {
public int docFreq() {
throw new IllegalStateException("this method should never be called");
}
+
+ @Override
+ public long totalTermFreq() {
+ throw new IllegalStateException("this method should never be called");
+ }
@Override
public long ord() {
@@ -183,5 +235,15 @@ public abstract class TermsEnum {
public synchronized AttributeSource attributes() {
return super.attributes();
}
+
+ @Override
+ public TermState termState() throws IOException {
+ throw new IllegalStateException("this method should never be called");
+ }
+
+ @Override
+ public void seek(BytesRef term, TermState state) throws IOException {
+ throw new IllegalStateException("this method should never be called");
+ }
};
}
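Usage note: a sketch of the expert TermState round trip documented above. Per those caveats the state is only reused with an enum over the same reader and field; reader and the field/term literals are assumptions.

// Sketch: capture a term's position once, then come back without re-seeking the terms dict.
Terms terms = MultiFields.getTerms(reader, "body");   // assumed non-null here
TermsEnum te = terms.iterator();
if (te.seek(new BytesRef("lucene")) == TermsEnum.SeekStatus.FOUND) {
  TermState state = te.termState();                   // remember where the term lives
  // ... later, on an enum from the same reader and field:
  te.seek(new BytesRef("lucene"), state);             // positions without a dictionary lookup
  DocsEnum docs = te.docs(null, null);                // null skipDocs / no reuse
}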
diff --git a/lucene/src/java/org/apache/lucene/index/codecs/BlockTermState.java b/lucene/src/java/org/apache/lucene/index/codecs/BlockTermState.java
new file mode 100644
index 00000000000..40bf8e95e11
--- /dev/null
+++ b/lucene/src/java/org/apache/lucene/index/codecs/BlockTermState.java
@@ -0,0 +1,56 @@
+package org.apache.lucene.index.codecs;
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import org.apache.lucene.index.DocsEnum; // javadocs
+import org.apache.lucene.index.OrdTermState;
+import org.apache.lucene.index.TermState;
+
+/**
+ * Holds all state required for {@link PostingsReaderBase}
+ * to produce a {@link DocsEnum} without re-seeking the
+ * terms dict.
+ */
+public class BlockTermState extends OrdTermState {
+ public int docFreq; // how many docs have this term
+ public long totalTermFreq; // total number of occurrences of this term
+
+ public int termCount; // this term's ord within the current block
+ public long blockFilePointer; // fp into the terms dict primary file (_X.tib) that holds this term
+
+ public int blockTermCount; // how many terms in current block
+
+ @Override
+ public void copyFrom(TermState _other) {
+ assert _other instanceof BlockTermState : "can not copy from " + _other.getClass().getName();
+ BlockTermState other = (BlockTermState) _other;
+ super.copyFrom(_other);
+ docFreq = other.docFreq;
+ totalTermFreq = other.totalTermFreq;
+ termCount = other.termCount;
+ blockFilePointer = other.blockFilePointer;
+
+ // NOTE: don't copy blockTermCount;
+ // it's "transient": used only by the "primary"
+ // termState, and regenerated on seek by TermState
+ }
+
+ @Override
+ public String toString() {
+ return super.toString() + "ord=" + ord + " docFreq=" + docFreq + " totalTermFreq=" + totalTermFreq + " termCount=" + termCount + " blockFP=" + blockFilePointer;
+ }
+}
diff --git a/lucene/src/java/org/apache/lucene/index/codecs/BlockTermsReader.java b/lucene/src/java/org/apache/lucene/index/codecs/BlockTermsReader.java
new file mode 100644
index 00000000000..93882869c26
--- /dev/null
+++ b/lucene/src/java/org/apache/lucene/index/codecs/BlockTermsReader.java
@@ -0,0 +1,748 @@
+package org.apache.lucene.index.codecs;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.Closeable;
+import java.io.IOException;
+import java.util.Collection;
+import java.util.Comparator;
+import java.util.Iterator;
+import java.util.TreeMap;
+
+import org.apache.lucene.index.DocsAndPositionsEnum;
+import org.apache.lucene.index.DocsEnum;
+import org.apache.lucene.index.FieldInfo;
+import org.apache.lucene.index.FieldInfos;
+import org.apache.lucene.index.FieldsEnum;
+import org.apache.lucene.index.IndexFileNames;
+import org.apache.lucene.index.SegmentInfo;
+import org.apache.lucene.index.TermState;
+import org.apache.lucene.index.Terms;
+import org.apache.lucene.index.TermsEnum;
+import org.apache.lucene.index.codecs.standard.StandardPostingsReader; // javadocs
+import org.apache.lucene.index.values.DocValues;
+import org.apache.lucene.store.ByteArrayDataInput;
+import org.apache.lucene.store.Directory;
+import org.apache.lucene.store.IndexInput;
+import org.apache.lucene.util.ArrayUtil;
+import org.apache.lucene.util.Bits;
+import org.apache.lucene.util.BytesRef;
+import org.apache.lucene.util.CodecUtil;
+import org.apache.lucene.util.DoubleBarrelLRUCache;
+
+/** Handles a terms dict, but decouples all details of
+ * doc/freqs/positions reading to an instance of {@link
+ * PostingsReaderBase}. This class is reusable for
+ * codecs that use a different format for
+ * docs/freqs/positions (though codecs are also free to
+ * make their own terms dict impl).
+ *
+ * <p>This class also interacts with an instance of {@link
+ * TermsIndexReaderBase}, to abstract away the specific
+ * implementation of the terms dict index.
+ * @lucene.experimental */
+
+public class BlockTermsReader extends FieldsProducer {
+ // Open input to the main terms dict file (_X.tis)
+ private final IndexInput in;
+
+ // Reads the terms dict entries, to gather state to
+ // produce DocsEnum on demand
+ private final PostingsReaderBase postingsReader;
+
+ private final TreeMap<String,FieldReader> fields = new TreeMap<String,FieldReader>();
+
+ // Comparator that orders our terms
+ private final Comparator<BytesRef> termComp;
+
+ // Caches the most recently looked-up field + terms:
+ private final DoubleBarrelLRUCache<FieldAndTerm,BlockTermState> termsCache;
+
+ // Reads the terms index
+ private TermsIndexReaderBase indexReader;
+
+ // keeps the dirStart offset
+ protected long dirOffset;
+
+ // Used as key for the terms cache
+ private static class FieldAndTerm extends DoubleBarrelLRUCache.CloneableKey {
+ String field;
+ BytesRef term;
+
+ public FieldAndTerm() {
+ }
+
+ public FieldAndTerm(FieldAndTerm other) {
+ field = other.field;
+ term = new BytesRef(other.term);
+ }
+
+ @Override
+ public boolean equals(Object _other) {
+ FieldAndTerm other = (FieldAndTerm) _other;
+ return other.field == field && term.bytesEquals(other.term);
+ }
+
+ @Override
+ public Object clone() {
+ return new FieldAndTerm(this);
+ }
+
+ @Override
+ public int hashCode() {
+ return field.hashCode() * 31 + term.hashCode();
+ }
+ }
+
+ //private String segment;
+
+ public BlockTermsReader(TermsIndexReaderBase indexReader, Directory dir, FieldInfos fieldInfos, String segment, PostingsReaderBase postingsReader, int readBufferSize,
+ Comparator<BytesRef> termComp, int termsCacheSize, String codecId)
+ throws IOException {
+
+ this.postingsReader = postingsReader;
+ termsCache = new DoubleBarrelLRUCache<FieldAndTerm,BlockTermState>(termsCacheSize);
+
+ this.termComp = termComp;
+ //this.segment = segment;
+ in = dir.openInput(IndexFileNames.segmentFileName(segment, codecId, BlockTermsWriter.TERMS_EXTENSION),
+ readBufferSize);
+
+ boolean success = false;
+ try {
+ readHeader(in);
+
+ // Have PostingsReader init itself
+ postingsReader.init(in);
+
+ // Read per-field details
+ seekDir(in, dirOffset);
+
+ final int numFields = in.readVInt();
+
+ for(int i=0;i<numFields;i++) {
+ final int field = in.readVInt();
+ final long numTerms = in.readVLong();
+ assert numTerms >= 0;
+ final long termsStartPointer = in.readVLong();
+ final FieldInfo fieldInfo = fieldInfos.fieldInfo(field);
+ final long sumTotalTermFreq = fieldInfo.omitTermFreqAndPositions ? -1 : in.readVLong();
+ assert !fields.containsKey(fieldInfo.name);
+ fields.put(fieldInfo.name, new FieldReader(fieldInfo, numTerms, termsStartPointer, sumTotalTermFreq));
+ }
+ success = true;
+ } finally {
+ if (!success) {
+ in.close();
+ }
+ }
+
+ this.indexReader = indexReader;
+ }
+
+ protected void readHeader(IndexInput input) throws IOException {
+ CodecUtil.checkHeader(in, BlockTermsWriter.CODEC_NAME,
+ BlockTermsWriter.VERSION_START,
+ BlockTermsWriter.VERSION_CURRENT);
+ dirOffset = in.readLong();
+ }
+
+ protected void seekDir(IndexInput input, long dirOffset)
+ throws IOException {
+ input.seek(dirOffset);
+ }
+
+ @Override
+ public void loadTermsIndex(int indexDivisor) throws IOException {
+ indexReader.loadTermsIndex(indexDivisor);
+ }
+
+ @Override
+ public void close() throws IOException {
+ try {
+ try {
+ if (indexReader != null) {
+ indexReader.close();
+ }
+ } finally {
+ // null so if an app hangs on to us (ie, we are not
+ // GCable, despite being closed) we still free most
+ // ram
+ indexReader = null;
+ if (in != null) {
+ in.close();
+ }
+ }
+ } finally {
+ try {
+ if (postingsReader != null) {
+ postingsReader.close();
+ }
+ } finally {
+ for(FieldReader field : fields.values()) {
+ field.close();
+ }
+ }
+ }
+ }
+
+ public static void files(Directory dir, SegmentInfo segmentInfo, String id, Collection<String> files) {
+ files.add(IndexFileNames.segmentFileName(segmentInfo.name, id, BlockTermsWriter.TERMS_EXTENSION));
+ }
+
+ public static void getExtensions(Collection<String> extensions) {
+ extensions.add(BlockTermsWriter.TERMS_EXTENSION);
+ }
+
+ @Override
+ public FieldsEnum iterator() {
+ return new TermFieldsEnum();
+ }
+
+ @Override
+ public Terms terms(String field) throws IOException {
+ return fields.get(field);
+ }
+
+ // Iterates through all fields
+ private class TermFieldsEnum extends FieldsEnum {
+ final Iterator<FieldReader> it;
+ FieldReader current;
+
+ TermFieldsEnum() {
+ it = fields.values().iterator();
+ }
+
+ @Override
+ public String next() {
+ if (it.hasNext()) {
+ current = it.next();
+ return current.fieldInfo.name;
+ } else {
+ current = null;
+ return null;
+ }
+ }
+
+ @Override
+ public TermsEnum terms() throws IOException {
+ return current.iterator();
+ }
+
+ @Override
+ public DocValues docValues() throws IOException {
+ return null;
+ }
+ }
+
+ private class FieldReader extends Terms implements Closeable {
+ final long numTerms;
+ final FieldInfo fieldInfo;
+ final long termsStartPointer;
+ final long sumTotalTermFreq;
+
+ FieldReader(FieldInfo fieldInfo, long numTerms, long termsStartPointer, long sumTotalTermFreq) {
+ assert numTerms > 0;
+ this.fieldInfo = fieldInfo;
+ this.numTerms = numTerms;
+ this.termsStartPointer = termsStartPointer;
+ this.sumTotalTermFreq = sumTotalTermFreq;
+ }
+
+ @Override
+ public Comparator<BytesRef> getComparator() {
+ return termComp;
+ }
+
+ @Override
+ public void close() {
+ super.close();
+ }
+
+ @Override
+ public TermsEnum iterator() throws IOException {
+ return new SegmentTermsEnum();
+ }
+
+ @Override
+ public long getUniqueTermCount() {
+ return numTerms;
+ }
+
+ @Override
+ public long getSumTotalTermFreq() {
+ return sumTotalTermFreq;
+ }
+
+ // Iterates through terms in this field
+ private final class SegmentTermsEnum extends TermsEnum {
+ private final IndexInput in;
+ private final BlockTermState state;
+ private final boolean doOrd;
+ private final FieldAndTerm fieldTerm = new FieldAndTerm();
+ private final TermsIndexReaderBase.FieldIndexEnum indexEnum;
+ private final BytesRef term = new BytesRef();
+
+ /* This is true if indexEnum is "still" seek'd to the index term
+ for the current term. We set it to true on seeking, and then it
+ remains valid until next() is called enough times to load another
+ terms block: */
+ private boolean indexIsCurrent;
+
+ /* True if we've already called .next() on the indexEnum, to "bracket"
+ the current block of terms: */
+ private boolean didIndexNext;
+
+ /* Next index term, bracketing the current block of terms; this is
+ only valid if didIndexNext is true: */
+ private BytesRef nextIndexTerm;
+
+ /* True after seek(TermState), to defer seeking. If the app then
+ calls next() (which is not "typical"), then we'll do the real seek */
+ private boolean seekPending;
+
+ /* How many blocks we've read since last seek. Once this
+ is >= indexEnum.getDivisor() we set indexIsCurrent to false (since
+ the index can no longer bracket seek-within-block). */
+ private int blocksSinceSeek;
+
+ private byte[] termSuffixes;
+ private ByteArrayDataInput termSuffixesReader = new ByteArrayDataInput(null);
+
+ /* Common prefix used for all terms in this block. */
+ private int termBlockPrefix;
+
+ private byte[] docFreqBytes;
+ private final ByteArrayDataInput freqReader = new ByteArrayDataInput(null);
+ private int metaDataUpto;
+
+ public SegmentTermsEnum() throws IOException {
+ in = (IndexInput) BlockTermsReader.this.in.clone();
+ in.seek(termsStartPointer);
+ indexEnum = indexReader.getFieldEnum(fieldInfo);
+ doOrd = indexReader.supportsOrd();
+ fieldTerm.field = fieldInfo.name;
+ state = postingsReader.newTermState();
+ state.totalTermFreq = -1;
+ state.ord = -1;
+
+ termSuffixes = new byte[128];
+ docFreqBytes = new byte[64];
+ //System.out.println("BTR.enum init this=" + this + " postingsReader=" + postingsReader);
+ }
+
+ @Override
+ public Comparator<BytesRef> getComparator() {
+ return termComp;
+ }
+
+ @Override
+ public SeekStatus seek(final BytesRef target, final boolean useCache) throws IOException {
+
+ if (indexEnum == null) {
+ throw new IllegalStateException("terms index was not loaded");
+ }
+
+ //System.out.println("BTR.seek seg=" + segment + " target=" + fieldInfo.name + ":" + target.utf8ToString() + " " + target + " current=" + term().utf8ToString() + " " + term() + " useCache=" + useCache + " indexIsCurrent=" + indexIsCurrent + " didIndexNext=" + didIndexNext + " seekPending=" + seekPending + " divisor=" + indexReader.getDivisor() + " this=" + this);
+ /*
+ if (didIndexNext) {
+ if (nextIndexTerm == null) {
+ //System.out.println(" nextIndexTerm=null");
+ } else {
+ //System.out.println(" nextIndexTerm=" + nextIndexTerm.utf8ToString());
+ }
+ }
+ */
+
+ // Check cache
+ if (useCache) {
+ fieldTerm.term = target;
+ // TODO: should we differentiate "frozen"
+ // TermState (ie one that was cloned and
+ // cached/returned by termState()) from the
+ // malleable (primary) one?
+ final TermState cachedState = termsCache.get(fieldTerm);
+ if (cachedState != null) {
+ seekPending = true;
+ //System.out.println(" cached!");
+ seek(target, cachedState);
+ //System.out.println(" term=" + term.utf8ToString());
+ return SeekStatus.FOUND;
+ }
+ }
+
+ boolean doSeek = true;
+
+ // See if we can avoid seeking, because target term
+ // is after current term but before next index term:
+ if (indexIsCurrent) {
+
+ final int cmp = termComp.compare(term, target);
+
+ if (cmp == 0) {
+ // Already at the requested term
+ return SeekStatus.FOUND;
+ } else if (cmp < 0) {
+
+ // Target term is after current term
+ if (!didIndexNext) {
+ if (indexEnum.next() == -1) {
+ nextIndexTerm = null;
+ } else {
+ nextIndexTerm = indexEnum.term();
+ }
+ //System.out.println(" now do index next() nextIndexTerm=" + (nextIndexTerm == null ? "null" : nextIndexTerm.utf8ToString()));
+ didIndexNext = true;
+ }
+
+ if (nextIndexTerm == null || termComp.compare(target, nextIndexTerm) < 0) {
+ // Optimization: requested term is within the
+ // same term block we are now in; skip seeking
+ // (but do scanning):
+ doSeek = false;
+ //System.out.println(" skip seek: nextIndexTerm=" + (nextIndexTerm == null ? "null" : nextIndexTerm.utf8ToString()));
+ }
+ }
+ }
+
+ if (doSeek) {
+ //System.out.println(" seek");
+
+ // Ask terms index to find biggest indexed term (=
+ // first term in a block) that's <= our text:
+ in.seek(indexEnum.seek(target));
+ boolean result = nextBlock();
+
+ // Block must exist since, at least, the indexed term
+ // is in the block:
+ assert result;
+
+ indexIsCurrent = true;
+ didIndexNext = false;
+ blocksSinceSeek = 0;
+
+ if (doOrd) {
+ state.ord = indexEnum.ord()-1;
+ }
+
+ // NOTE: the first _next() after an index seek is
+ // a bit wasteful, since it redundantly reads some
+ // suffix bytes into the buffer. We could avoid storing
+ // those bytes in the primary file, but then when
+ // next()ing over an index term we'd have to
+ // special case it:
+ term.copy(indexEnum.term());
+ //System.out.println(" seek: term=" + term.utf8ToString());
+ } else {
+ ////System.out.println(" skip seek");
+ }
+
+ seekPending = false;
+
+ // Now scan:
+ while (_next() != null) {
+ final int cmp = termComp.compare(term, target);
+ if (cmp == 0) {
+ // Match!
+ if (useCache) {
+ // Store in cache
+ decodeMetaData();
+ termsCache.put(new FieldAndTerm(fieldTerm), (BlockTermState) state.clone());
+ }
+ //System.out.println(" FOUND");
+ return SeekStatus.FOUND;
+ } else if (cmp > 0) {
+ //System.out.println(" NOT_FOUND term=" + term.utf8ToString());
+ return SeekStatus.NOT_FOUND;
+ }
+
+ // The purpose of the terms dict index is to seek
+ // the enum to the closest index term before the
+ // term we are looking for. So, we should never
+ // cross another index term (besides the first
+ // one) while we are scanning:
+ assert indexIsCurrent;
+ }
+
+ indexIsCurrent = false;
+ //System.out.println(" END");
+ return SeekStatus.END;
+ }
+
+ @Override
+ public BytesRef next() throws IOException {
+ //System.out.println("BTR.next() seekPending=" + seekPending + " pendingSeekCount=" + state.termCount);
+
+ // If seek was previously called and the term was cached,
+ // usually caller is just going to pull a D/&PEnum or get
+ // docFreq, etc. But, if they then call next(),
+ // this method catches up all internal state so next()
+ // works properly:
+ if (seekPending) {
+ assert !indexIsCurrent;
+ in.seek(state.blockFilePointer);
+ final int pendingSeekCount = state.termCount;
+ boolean result = nextBlock();
+
+ final long savOrd = state.ord;
+
+ // Block must exist since seek(TermState) was called w/ a
+ // TermState previously returned by this enum when positioned
+ // on a real term:
+ assert result;
+
+ while(state.termCount < pendingSeekCount) {
+ BytesRef nextResult = _next();
+ assert nextResult != null;
+ }
+ seekPending = false;
+ state.ord = savOrd;
+ }
+ return _next();
+ }
+
+ /* Decodes only the term bytes of the next term. If caller then asks for
+ metadata, ie docFreq, totalTermFreq or pulls a D/&PEnum, we then (lazily)
+ decode all metadata up to the current term. */
+ private BytesRef _next() throws IOException {
+ //System.out.println("BTR._next seg=" + segment + " this=" + this + " termCount=" + state.termCount + " (vs " + state.blockTermCount + ")");
+ if (state.termCount == state.blockTermCount) {
+ if (!nextBlock()) {
+ //System.out.println(" eof");
+ indexIsCurrent = false;
+ return null;
+ }
+ }
+
+ // TODO: cutover to something better for these ints! simple64?
+ final int suffix = termSuffixesReader.readVInt();
+ //System.out.println(" suffix=" + suffix);
+
+ term.length = termBlockPrefix + suffix;
+ if (term.bytes.length < term.length) {
+ term.grow(term.length);
+ }
+ termSuffixesReader.readBytes(term.bytes, termBlockPrefix, suffix);
+ state.termCount++;
+
+ // NOTE: meaningless in the non-ord case
+ state.ord++;
+
+ //System.out.println(" return term=" + fieldInfo.name + ":" + term.utf8ToString() + " " + term);
+ return term;
+ }
+
+ @Override
+ public BytesRef term() {
+ return term;
+ }
+
+ @Override
+ public int docFreq() throws IOException {
+ //System.out.println("BTR.docFreq");
+ decodeMetaData();
+ //System.out.println(" return " + state.docFreq);
+ return state.docFreq;
+ }
+
+ @Override
+ public long totalTermFreq() throws IOException {
+ decodeMetaData();
+ return state.totalTermFreq;
+ }
+
+ @Override
+ public DocsEnum docs(Bits skipDocs, DocsEnum reuse) throws IOException {
+ //System.out.println("BTR.docs this=" + this);
+ decodeMetaData();
+ //System.out.println(" state.docFreq=" + state.docFreq);
+ final DocsEnum docsEnum = postingsReader.docs(fieldInfo, state, skipDocs, reuse);
+ assert docsEnum != null;
+ return docsEnum;
+ }
+
+ @Override
+ public DocsAndPositionsEnum docsAndPositions(Bits skipDocs, DocsAndPositionsEnum reuse) throws IOException {
+ //System.out.println("BTR.d&p this=" + this);
+ decodeMetaData();
+ if (fieldInfo.omitTermFreqAndPositions) {
+ return null;
+ } else {
+ DocsAndPositionsEnum dpe = postingsReader.docsAndPositions(fieldInfo, state, skipDocs, reuse);
+ //System.out.println(" return d&pe=" + dpe);
+ return dpe;
+ }
+ }
+
+ @Override
+ public void seek(BytesRef target, TermState otherState) throws IOException {
+ //System.out.println("BTR.seek termState target=" + target.utf8ToString() + " " + target + " this=" + this);
+ assert otherState != null && otherState instanceof BlockTermState;
+ assert !doOrd || ((BlockTermState) otherState).ord < numTerms;
+ state.copyFrom(otherState);
+ seekPending = true;
+ indexIsCurrent = false;
+ term.copy(target);
+ }
+
+ @Override
+ public TermState termState() throws IOException {
+ //System.out.println("BTR.termState this=" + this);
+ decodeMetaData();
+ TermState ts = (TermState) state.clone();
+ //System.out.println(" return ts=" + ts);
+ return ts;
+ }
+
+ @Override
+ public SeekStatus seek(long ord) throws IOException {
+ //System.out.println("BTR.seek by ord ord=" + ord);
+ if (indexEnum == null) {
+ throw new IllegalStateException("terms index was not loaded");
+ }
+
+ if (ord >= numTerms) {
+ state.ord = numTerms-1;
+ return SeekStatus.END;
+ }
+
+ // TODO: if ord is in same terms block and
+ // after current ord, we should avoid this seek just
+ // like we do in the seek(BytesRef) case
+ in.seek(indexEnum.seek(ord));
+ boolean result = nextBlock();
+
+ // Block must exist since ord < numTerms:
+ assert result;
+
+ indexIsCurrent = true;
+ didIndexNext = false;
+ blocksSinceSeek = 0;
+ seekPending = false;
+
+ state.ord = indexEnum.ord()-1;
+ assert state.ord >= -1: "ord=" + state.ord;
+ term.copy(indexEnum.term());
+
+ // Now, scan:
+ int left = (int) (ord - state.ord);
+ while(left > 0) {
+ final BytesRef term = _next();
+ assert term != null;
+ left--;
+ assert indexIsCurrent;
+ }
+
+ // always found
+ return SeekStatus.FOUND;
+ }
+
+ @Override
+ public long ord() {
+ if (!doOrd) {
+ throw new UnsupportedOperationException();
+ }
+ return state.ord;
+ }
+
+ private void doPendingSeek() {
+ }
+
+ /* Does initial decode of next block of terms; this
+ doesn't actually decode the docFreq, totalTermFreq,
+ postings details (frq/prx offset, etc.) metadata;
+ it just loads them as byte[] blobs which are then
+ decoded on-demand if the metadata is ever requested
+ for any term in this block. This enables terms-only
+ intensive consumes (eg certain MTQs, respelling) to
+ not pay the price of decoding metadata they won't
+ use. */
+ private boolean nextBlock() throws IOException {
+
+ // TODO: we still lazy-decode the byte[] for each
+ // term (the suffix), but, if we decoded
+ // all N terms up front then seeking could do a fast
+ // bsearch w/in the block...
+
+ //System.out.println("BTR.nextBlock() fp=" + in.getFilePointer() + " this=" + this);
+ state.blockFilePointer = in.getFilePointer();
+ state.blockTermCount = in.readVInt();
+ //System.out.println(" blockTermCount=" + state.blockTermCount);
+ if (state.blockTermCount == 0) {
+ return false;
+ }
+ termBlockPrefix = in.readVInt();
+
+ // term suffixes:
+ int len = in.readVInt();
+ if (termSuffixes.length < len) {
+ termSuffixes = new byte[ArrayUtil.oversize(len, 1)];
+ }
+ //System.out.println(" termSuffixes len=" + len);
+ in.readBytes(termSuffixes, 0, len);
+ termSuffixesReader.reset(termSuffixes);
+
+ // docFreq, totalTermFreq
+ len = in.readVInt();
+ if (docFreqBytes.length < len) {
+ docFreqBytes = new byte[ArrayUtil.oversize(len, 1)];
+ }
+ //System.out.println(" freq bytes len=" + len);
+ in.readBytes(docFreqBytes, 0, len);
+ freqReader.reset(docFreqBytes);
+ metaDataUpto = 0;
+
+ state.termCount = 0;
+
+ postingsReader.readTermsBlock(in, fieldInfo, state);
+
+ blocksSinceSeek++;
+ indexIsCurrent &= (blocksSinceSeek < indexReader.getDivisor());
+ //System.out.println(" indexIsCurrent=" + indexIsCurrent);
+
+ return true;
+ }
+
+ private void decodeMetaData() throws IOException {
+ //System.out.println("BTR.decodeMetadata mdUpto=" + metaDataUpto + " vs termCount=" + state.termCount + " state=" + state);
+ if (!seekPending) {
+ // lazily catch up on metadata decode:
+ final int limit = state.termCount;
+ state.termCount = metaDataUpto;
+ while (metaDataUpto < limit) {
+ //System.out.println(" decode");
+ // TODO: we could make "tiers" of metadata, ie,
+ // decode docFreq/totalTF but don't decode postings
+ // metadata; this way caller could get
+ // docFreq/totalTF w/o paying decode cost for
+ // postings
+ state.docFreq = freqReader.readVInt();
+ if (!fieldInfo.omitTermFreqAndPositions) {
+ state.totalTermFreq = state.docFreq + freqReader.readVLong();
+ }
+ postingsReader.nextTerm(fieldInfo, state);
+ metaDataUpto++;
+ state.termCount++;
+ }
+ } else {
+ //System.out.println(" skip! seekPending");
+ }
+ }
+ }
+ }
+}
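Aside (sketch, not part of the patch): nextBlock() above reads each block as a VInt term count, a VInt common-prefix length, then two length-prefixed byte[] blobs (term suffixes, then docFreq/totalTermFreq deltas), leaving postings metadata to the PostingsReaderBase. Assuming the suffix blob and the shared prefix bytes have already been pulled out as in nextBlock()/_next(), reassembling the block's terms looks roughly like this (names are illustrative):

    import org.apache.lucene.store.ByteArrayDataInput;
    import org.apache.lucene.util.BytesRef;

    // Hypothetical helper, for illustration only.
    class BlockTermsDecodeSketch {
      static BytesRef[] decodeTerms(byte[] prefixBytes, int prefixLen,
                                    byte[] suffixBlob, int blockTermCount) {
        final ByteArrayDataInput suffixes = new ByteArrayDataInput(suffixBlob);
        final BytesRef[] terms = new BytesRef[blockTermCount];
        for (int i = 0; i < blockTermCount; i++) {
          final int suffixLen = suffixes.readVInt();              // VInt suffix length ...
          final BytesRef term = new BytesRef(prefixLen + suffixLen);
          System.arraycopy(prefixBytes, 0, term.bytes, 0, prefixLen);
          suffixes.readBytes(term.bytes, prefixLen, suffixLen);   // ... then the suffix bytes
          term.length = prefixLen + suffixLen;
          terms[i] = term;                                        // full term = shared prefix + suffix
        }
        return terms;
      }
    }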
diff --git a/lucene/src/java/org/apache/lucene/index/codecs/BlockTermsWriter.java b/lucene/src/java/org/apache/lucene/index/codecs/BlockTermsWriter.java
new file mode 100644
index 00000000000..c60b42506ed
--- /dev/null
+++ b/lucene/src/java/org/apache/lucene/index/codecs/BlockTermsWriter.java
@@ -0,0 +1,316 @@
+package org.apache.lucene.index.codecs;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.Comparator;
+import java.util.List;
+
+import org.apache.lucene.index.FieldInfo;
+import org.apache.lucene.index.FieldInfos;
+import org.apache.lucene.index.IndexFileNames;
+import org.apache.lucene.index.SegmentWriteState;
+import org.apache.lucene.store.IndexOutput;
+import org.apache.lucene.store.RAMOutputStream;
+import org.apache.lucene.util.ArrayUtil;
+import org.apache.lucene.util.BytesRef;
+import org.apache.lucene.util.CodecUtil;
+import org.apache.lucene.util.RamUsageEstimator;
+
+// TODO: currently we encode all terms between two indexed
+// terms as a block; but, we could decouple the two, ie
+// allow several blocks in between two indexed terms
+
+/**
+ * Writes terms dict, block-encoding (column stride) each
+ * term's metadata for each set of terms between two
+ * index terms.
+ *
+ * @lucene.experimental
+ */
+
+public class BlockTermsWriter extends FieldsConsumer {
+
+ final static String CODEC_NAME = "BLOCK_TERMS_DICT";
+
+ // Initial format
+ public static final int VERSION_START = 0;
+
+ public static final int VERSION_CURRENT = VERSION_START;
+
+ /** Extension of terms file */
+ static final String TERMS_EXTENSION = "tib";
+
+ protected final IndexOutput out;
+ final PostingsWriterBase postingsWriter;
+ final FieldInfos fieldInfos;
+ FieldInfo currentField;
+ private final TermsIndexWriterBase termsIndexWriter;
+ private final List<TermsWriter> fields = new ArrayList<TermsWriter>();
+ private final Comparator<BytesRef> termComp;
+ private final String segment;
+
+ public BlockTermsWriter(
+ TermsIndexWriterBase termsIndexWriter,
+ SegmentWriteState state,
+ PostingsWriterBase postingsWriter,
+ Comparator<BytesRef> termComp) throws IOException
+ {
+ final String termsFileName = IndexFileNames.segmentFileName(state.segmentName, state.codecId, TERMS_EXTENSION);
+ this.termsIndexWriter = termsIndexWriter;
+ this.termComp = termComp;
+ out = state.directory.createOutput(termsFileName);
+ fieldInfos = state.fieldInfos;
+ writeHeader(out);
+ currentField = null;
+ this.postingsWriter = postingsWriter;
+ segment = state.segmentName;
+
+ //System.out.println("BTW.init seg=" + state.segmentName);
+
+ postingsWriter.start(out); // have consumer write its format/header
+ }
+
+ protected void writeHeader(IndexOutput out) throws IOException {
+ CodecUtil.writeHeader(out, CODEC_NAME, VERSION_CURRENT);
+
+ out.writeLong(0); // leave space for end index pointer
+ }
+
+ @Override
+ public TermsConsumer addField(FieldInfo field) throws IOException {
+ //System.out.println("\nBTW.addField seg=" + segment + " field=" + field.name);
+ assert currentField == null || currentField.name.compareTo(field.name) < 0;
+ currentField = field;
+ TermsIndexWriterBase.FieldWriter fieldIndexWriter = termsIndexWriter.addField(field, out.getFilePointer());
+ final TermsWriter terms = new TermsWriter(fieldIndexWriter, field, postingsWriter);
+ fields.add(terms);
+ return terms;
+ }
+
+ @Override
+ public void close() throws IOException {
+
+ try {
+
+ int nonZeroCount = 0;
+ for(TermsWriter field : fields) {
+ if (field.numTerms > 0) {
+ nonZeroCount++;
+ }
+ }
+
+ final long dirStart = out.getFilePointer();
+
+ out.writeVInt(nonZeroCount);
+ for(TermsWriter field : fields) {
+ if (field.numTerms > 0) {
+ out.writeVInt(field.fieldInfo.number);
+ out.writeVLong(field.numTerms);
+ out.writeVLong(field.termsStartPointer);
+ if (!field.fieldInfo.omitTermFreqAndPositions) {
+ out.writeVLong(field.sumTotalTermFreq);
+ }
+ }
+ }
+ writeTrailer(dirStart);
+ } finally {
+ try {
+ out.close();
+ } finally {
+ try {
+ postingsWriter.close();
+ } finally {
+ termsIndexWriter.close();
+ }
+ }
+ }
+ }
+
+ protected void writeTrailer(long dirStart) throws IOException {
+ // TODO Auto-generated method stub
+ out.seek(CodecUtil.headerLength(CODEC_NAME));
+ out.writeLong(dirStart);
+ }
+
+ private static class TermEntry {
+ public final BytesRef term = new BytesRef();
+ public TermStats stats;
+ }
+
+ class TermsWriter extends TermsConsumer {
+ private final FieldInfo fieldInfo;
+ private final PostingsWriterBase postingsWriter;
+ private final long termsStartPointer;
+ private long numTerms;
+ private final TermsIndexWriterBase.FieldWriter fieldIndexWriter;
+ long sumTotalTermFreq;
+ private final BytesRef lastTerm = new BytesRef();
+
+ private TermEntry[] pendingTerms;
+
+ private int pendingCount;
+
+ TermsWriter(
+ TermsIndexWriterBase.FieldWriter fieldIndexWriter,
+ FieldInfo fieldInfo,
+ PostingsWriterBase postingsWriter)
+ {
+ this.fieldInfo = fieldInfo;
+ this.fieldIndexWriter = fieldIndexWriter;
+ pendingTerms = new TermEntry[32];
+ for(int i=0;i<pendingTerms.length;i++) {
+ pendingTerms[i] = new TermEntry();
+ }
+ termsStartPointer = out.getFilePointer();
+ this.postingsWriter = postingsWriter;
+ }
+
+ @Override
+ public Comparator<BytesRef> getComparator() {
+ return termComp;
+ }
+
+ @Override
+ public PostingsConsumer startTerm(BytesRef text) throws IOException {
+ //System.out.println("BTW.startTerm seg=" + segment + " term=" + fieldInfo.name + ":" + text.utf8ToString() + " " + text);
+ postingsWriter.startTerm();
+ return postingsWriter;
+ }
+
+ private final BytesRef lastPrevTerm = new BytesRef();
+
+ @Override
+ public void finishTerm(BytesRef text, TermStats stats) throws IOException {
+
+ assert stats.docFreq > 0;
+ //System.out.println("BTW.finishTerm seg=" + segment + " term=" + fieldInfo.name + ":" + text.utf8ToString() + " " + text + " df=" + stats.docFreq);
+
+ final boolean isIndexTerm = fieldIndexWriter.checkIndexTerm(text, stats);
+
+ if (isIndexTerm) {
+ if (pendingCount > 0) {
+ // Instead of writing each term, live, we gather terms
+ // in RAM in a pending buffer, and then write the
+ // entire block in between index terms:
+ flushBlock();
+ }
+ fieldIndexWriter.add(text, stats, out.getFilePointer());
+ }
+
+ if (pendingTerms.length == pendingCount) {
+ final TermEntry[] newArray = new TermEntry[ArrayUtil.oversize(pendingCount+1, RamUsageEstimator.NUM_BYTES_OBJECT_REF)];
+ System.arraycopy(pendingTerms, 0, newArray, 0, pendingCount);
+ for(int i=pendingCount;i<newArray.length;i++) {
+ newArray[i] = new TermEntry();
+ }
+ pendingTerms = newArray;
+ }
+
+ final TermEntry te = pendingTerms[pendingCount];
+ te.term.copy(text);
+ te.stats = stats;
+ pendingCount++;
+ numTerms++;
+ }
+
+ // Finishes all terms in this field
+ @Override
+ public void finish(long sumTotalTermFreq) throws IOException {
+
+ if (pendingCount > 0) {
+ flushBlock();
+ }
+ // EOF marker:
+ out.writeVInt(0);
+
+ this.sumTotalTermFreq = sumTotalTermFreq;
+ fieldIndexWriter.finish(out.getFilePointer());
+ }
+
+ private int sharedPrefix(BytesRef term1, BytesRef term2) {
+ assert term1.offset == 0;
+ assert term2.offset == 0;
+ int pos1 = 0;
+ int pos1End = pos1 + Math.min(term1.length, term2.length);
+ int pos2 = 0;
+ while(pos1 < pos1End) {
+ if (term1.bytes[pos1] != term2.bytes[pos2]) {
+ return pos1;
+ }
+ pos1++;
+ pos2++;
+ }
+ return pos1;
+ }
+
+ private final RAMOutputStream bytesWriter = new RAMOutputStream();
+
+ private void flushBlock() throws IOException {
+ //System.out.println("BTW.flushBlock pendingCount=" + pendingCount);
+
+ // First pass: compute common prefix for all terms
+ // in the block, against term before first term in
+ // this block:
+ int commonPrefix = sharedPrefix(lastPrevTerm, pendingTerms[0].term);
+ for(int termCount=1;termCount<pendingCount;termCount++) {
+ commonPrefix = Math.min(commonPrefix,
+ sharedPrefix(lastPrevTerm, pendingTerms[termCount].term));
+ }
diff --git a/lucene/src/java/org/apache/lucene/index/codecs/DefaultSegmentInfosReader.java b/lucene/src/java/org/apache/lucene/index/codecs/DefaultSegmentInfosReader.java
--- a/lucene/src/java/org/apache/lucene/index/codecs/DefaultSegmentInfosReader.java
+++ b/lucene/src/java/org/apache/lucene/index/codecs/DefaultSegmentInfosReader.java
 for (int i = input.readInt(); i > 0; i--) { // read segmentInfos
- infos.add(new SegmentInfo(directory, format, input, codecs));
+ SegmentInfo si = new SegmentInfo(directory, format, input, codecs);
+ if (si.getVersion() == null) {
+ // Could be a 3.0 - try to open the doc stores - if it fails, it's a
+ // 2.x segment, and an IndexFormatTooOldException will be thrown,
+ // which is what we want.
+ Directory dir = directory;
+ if (si.getDocStoreOffset() != -1) {
+ if (si.getDocStoreIsCompoundFile()) {
+ dir = new CompoundFileReader(dir, IndexFileNames.segmentFileName(
+ si.getDocStoreSegment(), "",
+ IndexFileNames.COMPOUND_FILE_STORE_EXTENSION), 1024);
+ }
+ } else if (si.getUseCompoundFile()) {
+ dir = new CompoundFileReader(dir, IndexFileNames.segmentFileName(
+ si.name, "", IndexFileNames.COMPOUND_FILE_EXTENSION), 1024);
+ }
+
+ try {
+ FieldsReader.checkCodeVersion(dir, si.getDocStoreSegment());
+ } finally {
+ // If we opened the directory, close it
+ if (dir != directory) dir.close();
+ }
+
+ // Above call succeeded, so it's a 3.0 segment. Upgrade it so the next
+ // time the segment is read, its version won't be null and we won't
+ // need to open FieldsReader every time for each such segment.
+ si.setVersion("3.0");
+ } else if (si.getVersion().equals("2.x")) {
+ // If it's a 3x index touched by 3.1+ code, then segments record their
+ // version, whether they are 2.x ones or not. We detect that and throw
+ // appropriate exception.
+ throw new IndexFormatTooOldException(si.name, si.getVersion());
+ }
+ infos.add(si);
}
infos.userData = input.readStringStringMap();
diff --git a/lucene/src/java/org/apache/lucene/index/codecs/DefaultSegmentInfosWriter.java b/lucene/src/java/org/apache/lucene/index/codecs/DefaultSegmentInfosWriter.java
index c89fe948072..f034a412f52 100644
--- a/lucene/src/java/org/apache/lucene/index/codecs/DefaultSegmentInfosWriter.java
+++ b/lucene/src/java/org/apache/lucene/index/codecs/DefaultSegmentInfosWriter.java
@@ -38,9 +38,12 @@ public class DefaultSegmentInfosWriter extends SegmentInfosWriter {
/** Each segment records whether it has term vectors */
public static final int FORMAT_HAS_VECTORS = -10;
+ /** Each segment records the Lucene version that created it. */
+ public static final int FORMAT_3_1 = -11;
+
/** Each segment records whether its postings are written
* in the new flex format */
- public static final int FORMAT_4_0 = -11;
+ public static final int FORMAT_4_0 = -12;
/** This must always point to the most recent file format.
* whenever you add a new format, make it 1 smaller (negative version logic)! */
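Aside (sketch, not part of the patch): with the "negative version logic" noted above, a newer format constant is always one smaller, so FORMAT_3_1 = -11 and FORMAT_4_0 moves to -12. A support check over such constants is typically a simple range test; the method below is illustrative only, not the actual Lucene check:

    // Hypothetical helper, for illustration only: newer formats are more
    // negative, so "current <= format <= oldest-supported" means supported.
    class FormatCheckSketch {
      static boolean isSupported(int format, int formatCurrent, int formatOldestSupported) {
        // e.g. formatCurrent = -12 (FORMAT_4_0), formatOldestSupported = some older, less negative value
        return format >= formatCurrent && format <= formatOldestSupported;
      }
    }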
diff --git a/lucene/src/java/org/apache/lucene/index/codecs/DeltaBytesWriter.java b/lucene/src/java/org/apache/lucene/index/codecs/DeltaBytesWriter.java
deleted file mode 100644
index 3785c40948b..00000000000
--- a/lucene/src/java/org/apache/lucene/index/codecs/DeltaBytesWriter.java
+++ /dev/null
@@ -1,75 +0,0 @@
-package org.apache.lucene.index.codecs;
-
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-import org.apache.lucene.util.ArrayUtil;
-import org.apache.lucene.store.IndexOutput;
-import org.apache.lucene.util.BytesRef;
-import static org.apache.lucene.util.ByteBlockPool.BYTE_BLOCK_SIZE;
-
-import java.io.IOException;
-
-final class DeltaBytesWriter {
-
- // Must be bigger than
- // DocumentsWriter.MAX_TERM_LENGTH_UTF8. If you change
- // this it's an index format change, so that change must be
- // versioned:
- final static int TERM_EOF = BYTE_BLOCK_SIZE;
-
- private byte[] lastBytes = new byte[10];
- private int lastLength;
- final IndexOutput out;
-
- DeltaBytesWriter(IndexOutput out) {
- this.out = out;
- }
-
- void reset() {
- lastLength = 0;
- }
-
- void write(BytesRef text) throws IOException {
- int start = 0;
- int upto = text.offset;
- final int length = text.length;
- final byte[] bytes = text.bytes;
-
- final int limit = length < lastLength ? length : lastLength;
- while(start < limit) {
- if (bytes[upto] != lastBytes[start]) {
- break;
- }
- start++;
- upto++;
- }
-
- final int suffix = length - start;
- out.writeVInt(start); // prefix
- out.writeVInt(suffix); // suffix
- out.writeBytes(bytes, upto, suffix);
- if (lastBytes.length < length) {
- lastBytes = ArrayUtil.grow(lastBytes, length);
- }
- // TODO: is this copy really necessary? I don't think
- // caller actually modifies these bytes, so we can save
- // by reference?
- System.arraycopy(bytes, upto, lastBytes, start, suffix);
- lastLength = length;
- }
-}
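Aside (sketch, not part of the patch): the removed DeltaBytesWriter prefix-coded each term against the previous one (VInt shared-prefix length, VInt suffix length, suffix bytes), whereas the new BlockTermsWriter shares a single prefix per block. The old per-term encoding, restated as a standalone sketch with illustrative names:

    import java.io.IOException;
    import org.apache.lucene.store.IndexOutput;
    import org.apache.lucene.util.BytesRef;

    // Hypothetical helper, for illustration only.
    class DeltaTermEncodeSketch {
      static void writeDelta(IndexOutput out, BytesRef prev, BytesRef current) throws IOException {
        int prefix = 0;
        final int limit = Math.min(prev.length, current.length);
        while (prefix < limit
               && prev.bytes[prev.offset + prefix] == current.bytes[current.offset + prefix]) {
          prefix++;                                     // count bytes shared with the previous term
        }
        final int suffix = current.length - prefix;
        out.writeVInt(prefix);                          // shared prefix length
        out.writeVInt(suffix);                          // suffix length
        out.writeBytes(current.bytes, current.offset + prefix, suffix);
      }
    }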
diff --git a/lucene/src/java/org/apache/lucene/index/codecs/FixedGapTermsIndexReader.java b/lucene/src/java/org/apache/lucene/index/codecs/FixedGapTermsIndexReader.java
index c335dc6fcff..c4350694cb0 100644
--- a/lucene/src/java/org/apache/lucene/index/codecs/FixedGapTermsIndexReader.java
+++ b/lucene/src/java/org/apache/lucene/index/codecs/FixedGapTermsIndexReader.java
@@ -44,7 +44,7 @@ public class FixedGapTermsIndexReader extends TermsIndexReaderBase {
// number of places to multiply out the actual ord, and we
// will overflow int during those multiplies. So to avoid
// having to upgrade each multiple to long in multiple
- // places (error proned), we use long here:
+ // places (error prone), we use long here:
private long totalIndexInterval;
private int indexDivisor;
@@ -94,6 +94,7 @@ public class FixedGapTermsIndexReader extends TermsIndexReaderBase {
// Read directory
final int numFields = in.readVInt();
+ //System.out.println("FGR: init seg=" + segment + " div=" + indexDivisor + " nF=" + numFields);
for(int i=0;i<numFields;i++) {
if (indexDivisor > 0) {
loadTermsIndex();
}
diff --git a/lucene/src/java/org/apache/lucene/index/codecs/FixedGapTermsIndexWriter.java b/lucene/src/java/org/apache/lucene/index/codecs/FixedGapTermsIndexWriter.java
index 152181557ee..1331ebf7879 100644
--- a/lucene/src/java/org/apache/lucene/index/codecs/FixedGapTermsIndexWriter.java
+++ b/lucene/src/java/org/apache/lucene/index/codecs/FixedGapTermsIndexWriter.java
@@ -53,7 +53,6 @@ public class FixedGapTermsIndexWriter extends TermsIndexWriterBase {
private final List<SimpleFieldWriter> fields = new ArrayList<SimpleFieldWriter>();
private final FieldInfos fieldInfos; // unread
- private IndexOutput termsOut;
public FixedGapTermsIndexWriter(SegmentWriteState state) throws IOException {
final String indexFileName = IndexFileNames.segmentFileName(state.segmentName, state.codecId, TERMS_INDEX_EXTENSION);
@@ -71,13 +70,9 @@ public class FixedGapTermsIndexWriter extends TermsIndexWriterBase {
}
@Override
- public void setTermsOutput(IndexOutput termsOut) {
- this.termsOut = termsOut;
- }
-
- @Override
- public FieldWriter addField(FieldInfo field) {
- SimpleFieldWriter writer = new SimpleFieldWriter(field);
+ public FieldWriter addField(FieldInfo field, long termsFilePointer) {
+ //System.out.println("FGW: addFfield=" + field.name);
+ SimpleFieldWriter writer = new SimpleFieldWriter(field, termsFilePointer);
fields.add(writer);
return writer;
}
@@ -119,44 +114,19 @@ public class FixedGapTermsIndexWriter extends TermsIndexWriterBase {
private final BytesRef lastTerm = new BytesRef();
- SimpleFieldWriter(FieldInfo fieldInfo) {
+ SimpleFieldWriter(FieldInfo fieldInfo, long termsFilePointer) {
this.fieldInfo = fieldInfo;
indexStart = out.getFilePointer();
- termsStart = lastTermsPointer = termsOut.getFilePointer();
+ termsStart = lastTermsPointer = termsFilePointer;
termLengths = new short[0];
termsPointerDeltas = new int[0];
}
@Override
- public boolean checkIndexTerm(BytesRef text, int docFreq) throws IOException {
+ public boolean checkIndexTerm(BytesRef text, TermStats stats) throws IOException {
// First term is first indexed term:
+ //System.out.println("FGW: checkIndexTerm text=" + text.utf8ToString());
if (0 == (numTerms++ % termIndexInterval)) {
-
- final int indexedTermLength = indexedTermPrefixLength(lastTerm, text);
-
- // write only the min prefix that shows the diff
- // against prior term
- out.writeBytes(text.bytes, text.offset, indexedTermLength);
-
- if (termLengths.length == numIndexTerms) {
- termLengths = ArrayUtil.grow(termLengths);
- }
- if (termsPointerDeltas.length == numIndexTerms) {
- termsPointerDeltas = ArrayUtil.grow(termsPointerDeltas);
- }
-
- // save delta terms pointer
- final long fp = termsOut.getFilePointer();
- termsPointerDeltas[numIndexTerms] = (int) (fp - lastTermsPointer);
- lastTermsPointer = fp;
-
- // save term length (in bytes)
- assert indexedTermLength <= Short.MAX_VALUE;
- termLengths[numIndexTerms] = (short) indexedTermLength;
- totTermLength += indexedTermLength;
-
- lastTerm.copy(text);
- numIndexTerms++;
return true;
} else {
if (0 == numTerms % termIndexInterval) {
@@ -169,13 +139,41 @@ public class FixedGapTermsIndexWriter extends TermsIndexWriterBase {
}
@Override
- public void finish() throws IOException {
+ public void add(BytesRef text, TermStats stats, long termsFilePointer) throws IOException {
+ final int indexedTermLength = indexedTermPrefixLength(lastTerm, text);
+ //System.out.println("FGW: add text=" + text.utf8ToString() + " " + text + " fp=" + termsFilePointer);
+
+ // write only the min prefix that shows the diff
+ // against prior term
+ out.writeBytes(text.bytes, text.offset, indexedTermLength);
+
+ if (termLengths.length == numIndexTerms) {
+ termLengths = ArrayUtil.grow(termLengths);
+ }
+ if (termsPointerDeltas.length == numIndexTerms) {
+ termsPointerDeltas = ArrayUtil.grow(termsPointerDeltas);
+ }
+
+ // save delta terms pointer
+ termsPointerDeltas[numIndexTerms] = (int) (termsFilePointer - lastTermsPointer);
+ lastTermsPointer = termsFilePointer;
+
+ // save term length (in bytes)
+ assert indexedTermLength <= Short.MAX_VALUE;
+ termLengths[numIndexTerms] = (short) indexedTermLength;
+ totTermLength += indexedTermLength;
+
+ lastTerm.copy(text);
+ numIndexTerms++;
+ }
+
+ @Override
+ public void finish(long termsFilePointer) throws IOException {
// write primary terms dict offsets
packedIndexStart = out.getFilePointer();
- final long maxValue = termsOut.getFilePointer();
- PackedInts.Writer w = PackedInts.getWriter(out, numIndexTerms, PackedInts.bitsRequired(maxValue));
+ PackedInts.Writer w = PackedInts.getWriter(out, numIndexTerms, PackedInts.bitsRequired(termsFilePointer));
// relative to our indexStart
long upto = 0;
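Aside (sketch, not part of the patch): finish(termsFilePointer) above sizes the packed offsets with PackedInts.bitsRequired(termsFilePointer), so each entry takes only the minimum bit width. The general PackedInts write pattern, under the assumption that every value is <= maxValue (names illustrative):

    import java.io.IOException;
    import org.apache.lucene.store.IndexOutput;
    import org.apache.lucene.util.packed.PackedInts;

    // Hypothetical helper, for illustration only.
    class PackedOffsetsSketch {
      static void writeOffsets(IndexOutput out, long[] values, long maxValue) throws IOException {
        final PackedInts.Writer w = PackedInts.getWriter(out, values.length, PackedInts.bitsRequired(maxValue));
        for (long v : values) {
          w.add(v);        // every value must be <= maxValue
        }
        w.finish();        // flush any partially filled final word
      }
    }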
diff --git a/lucene/src/java/org/apache/lucene/index/codecs/MergeState.java b/lucene/src/java/org/apache/lucene/index/codecs/MergeState.java
index cfc8c749a3f..ad29d1c9b1d 100644
--- a/lucene/src/java/org/apache/lucene/index/codecs/MergeState.java
+++ b/lucene/src/java/org/apache/lucene/index/codecs/MergeState.java
@@ -17,13 +17,16 @@ package org.apache.lucene.index.codecs;
* limitations under the License.
*/
+import java.util.List;
+
import org.apache.lucene.index.FieldInfo;
import org.apache.lucene.index.FieldInfos;
import org.apache.lucene.index.IndexReader;
+import org.apache.lucene.index.MergePolicy;
import org.apache.lucene.index.PayloadProcessorProvider.DirPayloadProcessor;
import org.apache.lucene.index.PayloadProcessorProvider.PayloadProcessor;
+import org.apache.lucene.store.Directory;
import org.apache.lucene.util.Bits;
-import java.util.List;
/** Holds common state used during segment merging
*
@@ -37,6 +40,7 @@ public class MergeState {
public int[] docBase; // New docID base per reader
public int mergedDocCount; // Total # merged docs
public Bits multiDeletedDocs;
+ public CheckAbort checkAbort;
// Updated per field;
public FieldInfo fieldInfo;
@@ -45,5 +49,30 @@ public class MergeState {
public boolean hasPayloadProcessorProvider;
public DirPayloadProcessor[] dirPayloadProcessor;
public PayloadProcessor[] currentPayloadProcessor;
-
+
+ public static class CheckAbort {
+ private double workCount;
+ private MergePolicy.OneMerge merge;
+ private Directory dir;
+ public CheckAbort(MergePolicy.OneMerge merge, Directory dir) {
+ this.merge = merge;
+ this.dir = dir;
+ }
+
+ /**
+ * Records the fact that roughly units amount of work
+ * have been done since this method was last called.
+ * When adding time-consuming code into SegmentMerger,
+ * you should test different values for units to ensure
+ * that the time in between calls to merge.checkAborted
+ * is up to ~ 1 second.
+ */
+ public void work(double units) throws MergePolicy.MergeAbortedException {
+ workCount += units;
+ if (workCount >= 10000.0) {
+ merge.checkAborted(dir);
+ workCount = 0;
+ }
+ }
+ }
}
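Aside (sketch, not part of the patch): the javadoc on work() spells out the contract: a merging consumer reports roughly sized units of work so an aborted merge is noticed within about a second (the class only calls checkAborted once 10000 units accumulate). Roughly how a codec's merge loop is expected to use it; the per-document unit count is illustrative:

    import org.apache.lucene.index.MergePolicy;
    import org.apache.lucene.index.codecs.MergeState;

    // Hypothetical helper, for illustration only.
    class CheckAbortUsageSketch {
      static void copyDocs(MergeState mergeState, int docCount) throws MergePolicy.MergeAbortedException {
        for (int docID = 0; docID < docCount; docID++) {
          // ... copy stored fields / vectors / postings for docID ...
          mergeState.checkAbort.work(300);   // illustrative unit count per document
        }
      }
    }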
diff --git a/lucene/src/java/org/apache/lucene/index/codecs/MultiLevelSkipListReader.java b/lucene/src/java/org/apache/lucene/index/codecs/MultiLevelSkipListReader.java
index 0f65c818b16..b75aa478a5a 100644
--- a/lucene/src/java/org/apache/lucene/index/codecs/MultiLevelSkipListReader.java
+++ b/lucene/src/java/org/apache/lucene/index/codecs/MultiLevelSkipListReader.java
@@ -172,6 +172,8 @@ public abstract class MultiLevelSkipListReader {
public void init(long skipPointer, int df) {
this.skipPointer[0] = skipPointer;
this.docCount = df;
+ assert skipPointer >= 0 && skipPointer <= skipStream[0].length()
+ : "invalid skip pointer: " + skipPointer + ", length=" + skipStream[0].length();
Arrays.fill(skipDoc, 0);
Arrays.fill(numSkipped, 0);
Arrays.fill(childPointer, 0);
diff --git a/lucene/src/java/org/apache/lucene/index/codecs/PostingsConsumer.java b/lucene/src/java/org/apache/lucene/index/codecs/PostingsConsumer.java
index a6bd46fe82a..b5c2c8bfa81 100644
--- a/lucene/src/java/org/apache/lucene/index/codecs/PostingsConsumer.java
+++ b/lucene/src/java/org/apache/lucene/index/codecs/PostingsConsumer.java
@@ -30,9 +30,9 @@ import org.apache.lucene.util.BytesRef;
public abstract class PostingsConsumer {
- /** Adds a new doc in this term. Return null if this
- * consumer doesn't need to see the positions for this
- * doc. */
+ /** Adds a new doc in this term. If this field omits term
+ * freqs & positions then termDocFreq should be ignored,
+ * and, finishDoc will not be called. */
public abstract void startDoc(int docID, int termDocFreq) throws IOException;
public static class PostingsMergeState {
@@ -49,14 +49,16 @@ public abstract class PostingsConsumer {
public abstract void addPosition(int position, BytesRef payload) throws IOException;
/** Called when we are done adding positions & payloads
- * for each doc */
+ * for each doc. Not called when the field omits term
+ * freq and positions. */
public abstract void finishDoc() throws IOException;
/** Default merge impl: append documents, mapping around
* deletes */
- public int merge(final MergeState mergeState, final DocsEnum postings) throws IOException {
+ public TermStats merge(final MergeState mergeState, final DocsEnum postings) throws IOException {
int df = 0;
+ long totTF = 0;
if (mergeState.fieldInfo.omitTermFreqAndPositions) {
while(true) {
@@ -67,6 +69,7 @@ public abstract class PostingsConsumer {
this.startDoc(doc, postings.freq());
this.finishDoc();
df++;
+ totTF++;
}
} else {
final DocsAndPositionsEnum postingsEnum = (DocsAndPositionsEnum) postings;
@@ -77,6 +80,7 @@ public abstract class PostingsConsumer {
}
final int freq = postingsEnum.freq();
this.startDoc(doc, freq);
+ totTF += freq;
+ for(int i=0;i<freq;i++) {
diff --git a/lucene/src/java/org/apache/lucene/index/codecs/PrefixCodedTermsReader.java b/lucene/src/java/org/apache/lucene/index/codecs/PrefixCodedTermsReader.java
deleted file mode 100644
--- a/lucene/src/java/org/apache/lucene/index/codecs/PrefixCodedTermsReader.java
+++ /dev/null
- * <p>This class also interacts with an instance of {@link
- * TermsIndexReaderBase}, to abstract away the specific
- * implementation of the terms dict index.
- * @lucene.experimental */
-
-public class PrefixCodedTermsReader extends FieldsProducer {
- // Open input to the main terms dict file (_X.tis)
- private final IndexInput in;
-
- // Reads the terms dict entries, to gather state to
- // produce DocsEnum on demand
- private final PostingsReaderBase postingsReader;
-
- private final TreeMap<String,FieldReader> fields = new TreeMap<String,FieldReader>();
-
- // Comparator that orders our terms
- private final Comparator<BytesRef> termComp;
-
- // Caches the most recently looked-up field + terms:
- private final DoubleBarrelLRUCache<FieldAndTerm,TermState> termsCache;
-
- // Reads the terms index
- private TermsIndexReaderBase indexReader;
-
- // keeps the dirStart offset
- protected long dirOffset;
-
- // Used as key for the terms cache
- private static class FieldAndTerm extends DoubleBarrelLRUCache.CloneableKey {
- String field;
- BytesRef term;
-
- public FieldAndTerm() {
- }
-
- public FieldAndTerm(String field, BytesRef term) {
- this.field = field;
- this.term = new BytesRef(term);
- }
-
- public FieldAndTerm(FieldAndTerm other) {
- field = other.field;
- term = new BytesRef(other.term);
- }
-
- @Override
- public boolean equals(Object _other) {
- FieldAndTerm other = (FieldAndTerm) _other;
- return other.field == field && term.bytesEquals(other.term);
- }
-
- @Override
- public Object clone() {
- return new FieldAndTerm(this);
- }
-
- @Override
- public int hashCode() {
- return field.hashCode() * 31 + term.hashCode();
- }
- }
-
- public PrefixCodedTermsReader(TermsIndexReaderBase indexReader, Directory dir, FieldInfos fieldInfos, String segment, PostingsReaderBase postingsReader, int readBufferSize,
- Comparator<BytesRef> termComp, int termsCacheSize, String codecId)
- throws IOException {
-
- this.postingsReader = postingsReader;
- termsCache = new DoubleBarrelLRUCache<FieldAndTerm,TermState>(termsCacheSize);
-
- this.termComp = termComp;
-
- in = dir.openInput(IndexFileNames.segmentFileName(segment, codecId, PrefixCodedTermsWriter.TERMS_EXTENSION),
- readBufferSize);
-
- boolean success = false;
- try {
- readHeader(in);
-
- // Have PostingsReader init itself
- postingsReader.init(in);
-
- // Read per-field details
- seekDir(in, dirOffset);
-
- final int numFields = in.readInt();
-
- for(int i=0;i<numFields;i++) {
- final int field = in.readInt();
- final long numTerms = in.readLong();
- assert numTerms >= 0;
- final long termsStartPointer = in.readLong();
- final FieldInfo fieldInfo = fieldInfos.fieldInfo(field);
- if (numTerms > 0) {
- assert !fields.containsKey(fieldInfo.name);
- fields.put(fieldInfo.name, new FieldReader(fieldInfo, numTerms, termsStartPointer));
- }
- }
- success = true;
- } finally {
- if (!success) {
- in.close();
- }
- }
-
- this.indexReader = indexReader;
- }
-
- protected void readHeader(IndexInput input) throws IOException {
- CodecUtil.checkHeader(in, PrefixCodedTermsWriter.CODEC_NAME,
- PrefixCodedTermsWriter.VERSION_START, PrefixCodedTermsWriter.VERSION_CURRENT);
- dirOffset = in.readLong();
- }
-
- protected void seekDir(IndexInput input, long dirOffset)
- throws IOException {
- input.seek(dirOffset);
- }
-
- @Override
- public void loadTermsIndex(int indexDivisor) throws IOException {
- indexReader.loadTermsIndex(indexDivisor);
- }
-
- @Override
- public void close() throws IOException {
- try {
- try {
- if (indexReader != null) {
- indexReader.close();
- }
- } finally {
- // null so if an app hangs on to us (ie, we are not
- // GCable, despite being closed) we still free most
- // ram
- indexReader = null;
- if (in != null) {
- in.close();
- }
- }
- } finally {
- try {
- if (postingsReader != null) {
- postingsReader.close();
- }
- } finally {
- for(FieldReader field : fields.values()) {
- field.close();
- }
- }
- }
- }
-
- public static void files(Directory dir, SegmentInfo segmentInfo, String id, Collection<String> files) {
- files.add(IndexFileNames.segmentFileName(segmentInfo.name, id, PrefixCodedTermsWriter.TERMS_EXTENSION));
- }
-
- public static void getExtensions(Collection<String> extensions) {
- extensions.add(PrefixCodedTermsWriter.TERMS_EXTENSION);
- }
-
- @Override
- public FieldsEnum iterator() {
- return new TermFieldsEnum();
- }
-
- @Override
- public Terms terms(String field) throws IOException {
- return fields.get(field);
- }
-
- // Iterates through all fields
- private class TermFieldsEnum extends FieldsEnum {
- final Iterator<FieldReader> it;
- FieldReader current;
-
- TermFieldsEnum() {
- it = fields.values().iterator();
- }
-
- @Override
- public String next() {
- if (it.hasNext()) {
- current = it.next();
- return current.fieldInfo.name;
- } else {
- current = null;
- return null;
- }
- }
-
- @Override
- public TermsEnum terms() throws IOException {
- return current.iterator();
- }
-
- @Override
- public DocValues docValues() throws IOException {
- // TODO Auto-generated method stub
- return null;
- }
- }
-
- private class FieldReader extends Terms implements Closeable {
- final long numTerms;
- final FieldInfo fieldInfo;
- final long termsStartPointer;
-
- FieldReader(FieldInfo fieldInfo, long numTerms, long termsStartPointer) {
- assert numTerms > 0;
- this.fieldInfo = fieldInfo;
- this.numTerms = numTerms;
- this.termsStartPointer = termsStartPointer;
- }
-
- @Override
- public Comparator<BytesRef> getComparator() {
- return termComp;
- }
-
- @Override
- public void close() {
- super.close();
- }
-
- @Override
- public TermsEnum iterator() throws IOException {
- return new SegmentTermsEnum();
- }
-
- @Override
- public long getUniqueTermCount() {
- return numTerms;
- }
-
- // Iterates through terms in this field, not supporting ord()
- private class SegmentTermsEnum extends TermsEnum {
- private final IndexInput in;
- private final DeltaBytesReader bytesReader;
- private final TermState state;
- private boolean seekPending;
- private final FieldAndTerm fieldTerm = new FieldAndTerm();
- private final TermsIndexReaderBase.FieldIndexEnum indexEnum;
- private boolean positioned;
- private boolean didIndexNext;
- private BytesRef nextIndexTerm;
- private boolean isIndexTerm;
- private final boolean doOrd;
-
- SegmentTermsEnum() throws IOException {
- in = (IndexInput) PrefixCodedTermsReader.this.in.clone();
- in.seek(termsStartPointer);
- indexEnum = indexReader.getFieldEnum(fieldInfo);
- doOrd = indexReader.supportsOrd();
- bytesReader = new DeltaBytesReader(in);
- fieldTerm.field = fieldInfo.name;
- state = postingsReader.newTermState();
- state.ord = -1;
- }
-
- @Override
- public Comparator<BytesRef> getComparator() {
- return termComp;
- }
-
- @Override
- public void cacheCurrentTerm() {
- TermState stateCopy = (TermState) state.clone();
- stateCopy.filePointer = in.getFilePointer();
- termsCache.put(new FieldAndTerm(fieldInfo.name, bytesReader.term),
- stateCopy);
- }
-
- // called only from assert
- private boolean first;
- private int indexTermCount;
-
- private boolean startSeek() {
- first = true;
- indexTermCount = 0;
- return true;
- }
-
- private boolean checkSeekScan() {
- if (!first && isIndexTerm) {
- indexTermCount++;
- if (indexTermCount >= indexReader.getDivisor()) {
- //System.out.println("now fail count=" + indexTermCount);
- return false;
- }
- }
- first = false;
- return true;
- }
-
- /** Seeks until the first term that's >= the provided
- * text; returns SeekStatus.FOUND if the exact term
- * is found, SeekStatus.NOT_FOUND if a different term
- * was found, SeekStatus.END if we hit EOF */
- @Override
- public SeekStatus seek(BytesRef term, boolean useCache) throws IOException {
-
- if (indexEnum == null) {
- throw new IllegalStateException("terms index was not loaded");
- }
-
- //System.out.println("te.seek term=" + fieldInfo.name + ":" + term.utf8ToString() + " current=" + term().utf8ToString() + " useCache=" + useCache + " this=" + this);
-
- // Check cache
- fieldTerm.term = term;
- TermState cachedState;
- if (useCache) {
- cachedState = termsCache.get(fieldTerm);
- if (cachedState != null) {
- state.copy(cachedState);
- seekPending = true;
- positioned = false;
- bytesReader.term.copy(term);
- //System.out.println(" cached!");
- return SeekStatus.FOUND;
- }
- } else {
- cachedState = null;
- }
-
- boolean doSeek = true;
-
- if (positioned) {
-
- final int cmp = termComp.compare(bytesReader.term, term);
-
- if (cmp == 0) {
- // already at the requested term
- return SeekStatus.FOUND;
- } else if (cmp < 0) {
-
- if (seekPending) {
- seekPending = false;
- in.seek(state.filePointer);
- indexEnum.seek(bytesReader.term);
- didIndexNext = false;
- }
-
- // Target term is after current term
- if (!didIndexNext) {
- if (indexEnum.next() == -1) {
- nextIndexTerm = null;
- } else {
- nextIndexTerm = indexEnum.term();
- }
- //System.out.println(" now do index next() nextIndexTerm=" + (nextIndexTerm == null ? "null" : nextIndexTerm.utf8ToString()));
- didIndexNext = true;
- }
-
- if (nextIndexTerm == null || termComp.compare(term, nextIndexTerm) < 0) {
- // Optimization: requested term is within the
- // same index block we are now in; skip seeking
- // (but do scanning):
- doSeek = false;
- //System.out.println(" skip seek: nextIndexTerm=" + nextIndexTerm);
- }
- }
- }
-
- if (doSeek) {
-
- positioned = true;
-
- // Ask terms index to find biggest index term that's <=
- // our text:
- in.seek(indexEnum.seek(term));
- didIndexNext = false;
- if (doOrd) {
- state.ord = indexEnum.ord()-1;
- }
- seekPending = false;
-
- // NOTE: the first next() after an index seek is
- // wasteful, since it redundantly reads the same
- // bytes into the buffer. We could avoid storing
- // those bytes in the primary file, but then when
- // scanning over an index term we'd have to
- // special case it:
- bytesReader.reset(indexEnum.term());
- //System.out.println(" doSeek term=" + indexEnum.term().utf8ToString() + " vs target=" + term.utf8ToString());
- } else {
- //System.out.println(" skip seek");
- }
-
- assert startSeek();
-
- // Now scan:
- while (next() != null) {
- final int cmp = termComp.compare(bytesReader.term, term);
- if (cmp == 0) {
- // Done!
- if (useCache) {
- // Store in cache
- FieldAndTerm entryKey = new FieldAndTerm(fieldTerm);
- cachedState = (TermState) state.clone();
- // this is fp after current term
- cachedState.filePointer = in.getFilePointer();
- termsCache.put(entryKey, cachedState);
- }
-
- return SeekStatus.FOUND;
- } else if (cmp > 0) {
- return SeekStatus.NOT_FOUND;
- }
-
- // The purpose of the terms dict index is to seek
- // the enum to the closest index term before the
- // term we are looking for. So, we should never
- // cross another index term (besides the first
- // one) while we are scanning:
- assert checkSeekScan();
- }
-
- positioned = false;
- return SeekStatus.END;
- }
-
- @Override
- public BytesRef term() {
- return bytesReader.term;
- }
-
- @Override
- public BytesRef next() throws IOException {
-
- if (seekPending) {
- seekPending = false;
- in.seek(state.filePointer);
- indexEnum.seek(bytesReader.term);
- didIndexNext = false;
- }
-
- if (!bytesReader.read()) {
- //System.out.println("te.next end!");
- positioned = false;
- return null;
- }
-
- final byte b = in.readByte();
- isIndexTerm = (b & 0x80) != 0;
-
- if ((b & 0x40) == 0) {
- // Fast case -- docFreq fits in 6 bits
- state.docFreq = b & 0x3F;
- } else {
- state.docFreq = (in.readVInt() << 6) | (b & 0x3F);
- }
-
- postingsReader.readTerm(in,
- fieldInfo, state,
- isIndexTerm);
- state.ord++;
- positioned = true;
-
- //System.out.println("te.next term=" + bytesReader.term.utf8ToString());
- return bytesReader.term;
- }
-
- @Override
- public int docFreq() {
- return state.docFreq;
- }
-
- @Override
- public DocsEnum docs(Bits skipDocs, DocsEnum reuse) throws IOException {
- DocsEnum docsEnum = postingsReader.docs(fieldInfo, state, skipDocs, reuse);
- assert docsEnum != null;
- return docsEnum;
- }
-
- @Override
- public DocsAndPositionsEnum docsAndPositions(Bits skipDocs, DocsAndPositionsEnum reuse) throws IOException {
- if (fieldInfo.omitTermFreqAndPositions) {
- return null;
- } else {
- return postingsReader.docsAndPositions(fieldInfo, state, skipDocs, reuse);
- }
- }
-
- @Override
- public SeekStatus seek(long ord) throws IOException {
-
- if (indexEnum == null) {
- throw new IllegalStateException("terms index was not loaded");
- }
-
- if (ord >= numTerms) {
- state.ord = numTerms-1;
- return SeekStatus.END;
- }
-
- in.seek(indexEnum.seek(ord));
- seekPending = false;
- positioned = true;
-
- // NOTE: the first next() after an index seek is
- // wasteful, since it redundantly reads the same
- // bytes into the buffer
- bytesReader.reset(indexEnum.term());
-
- state.ord = indexEnum.ord()-1;
- assert state.ord >= -1: "ord=" + state.ord;
-
- // Now, scan:
- int left = (int) (ord - state.ord);
- while(left > 0) {
- final BytesRef term = next();
- assert term != null;
- left--;
- }
-
- // always found
- return SeekStatus.FOUND;
- }
-
- @Override
- public long ord() {
- if (!doOrd) {
- throw new UnsupportedOperationException();
- }
- return state.ord;
- }
- }
- }
-}
diff --git a/lucene/src/java/org/apache/lucene/index/codecs/PrefixCodedTermsWriter.java b/lucene/src/java/org/apache/lucene/index/codecs/PrefixCodedTermsWriter.java
deleted file mode 100644
index 377e3e55647..00000000000
--- a/lucene/src/java/org/apache/lucene/index/codecs/PrefixCodedTermsWriter.java
+++ /dev/null
@@ -1,206 +0,0 @@
-package org.apache.lucene.index.codecs;
-
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-import java.io.IOException;
-import java.util.ArrayList;
-import java.util.List;
-import java.util.Comparator;
-
-import org.apache.lucene.index.FieldInfo;
-import org.apache.lucene.index.FieldInfos;
-import org.apache.lucene.index.IndexFileNames;
-import org.apache.lucene.index.SegmentWriteState;
-import org.apache.lucene.util.BytesRef;
-import org.apache.lucene.store.IndexOutput;
-import org.apache.lucene.util.CodecUtil;
-
-/**
- * Writes terms dict and interacts with docs/positions
- * consumers to write the postings files.
- *
- * The [new] terms dict format is field-centric: each field
- * has its own section in the file. Fields are written in
- * UTF16 string comparison order. Within each field, each
- * term's text is written in UTF16 string comparison order.
- * @lucene.experimental
- */
-
-public class PrefixCodedTermsWriter extends FieldsConsumer {
-
- final static String CODEC_NAME = "STANDARD_TERMS_DICT";
-
- // Initial format
- public static final int VERSION_START = 0;
-
- public static final int VERSION_CURRENT = VERSION_START;
-
- /** Extension of terms file */
- static final String TERMS_EXTENSION = "tis";
-
- private final DeltaBytesWriter termWriter;
-
- protected final IndexOutput out;
- final PostingsWriterBase postingsWriter;
- final FieldInfos fieldInfos;
- FieldInfo currentField;
- private final TermsIndexWriterBase termsIndexWriter;
- private final List fields = new ArrayList();
- private final Comparator termComp;
-
- public PrefixCodedTermsWriter(
- TermsIndexWriterBase termsIndexWriter,
- SegmentWriteState state,
- PostingsWriterBase postingsWriter,
- Comparator termComp) throws IOException
- {
- final String termsFileName = IndexFileNames.segmentFileName(state.segmentName, state.codecId, TERMS_EXTENSION);
- this.termsIndexWriter = termsIndexWriter;
- this.termComp = termComp;
- out = state.directory.createOutput(termsFileName);
- termsIndexWriter.setTermsOutput(out);
-
- fieldInfos = state.fieldInfos;
- writeHeader(out);
- termWriter = new DeltaBytesWriter(out);
- currentField = null;
- this.postingsWriter = postingsWriter;
-
- postingsWriter.start(out); // have consumer write its format/header
- }
-
- protected void writeHeader(IndexOutput out) throws IOException {
- // Count indexed fields up front
- CodecUtil.writeHeader(out, CODEC_NAME, VERSION_CURRENT);
-
- out.writeLong(0); // leave space for end index pointer
- }
-
- @Override
- public TermsConsumer addField(FieldInfo field) throws IOException {
- assert currentField == null || currentField.name.compareTo(field.name) < 0 : "current field name " + (currentField == null? null: currentField.name) + " given: " +field.name;
- currentField = field;
- TermsIndexWriterBase.FieldWriter fieldIndexWriter = termsIndexWriter.addField(field);
- TermsConsumer terms = new TermsWriter(fieldIndexWriter, field, postingsWriter);
- fields.add(terms);
- return terms;
- }
-
- @Override
- public void close() throws IOException {
-
- try {
- final int fieldCount = fields.size();
-
- final long dirStart = out.getFilePointer();
-
- out.writeInt(fieldCount);
-      for(int i=0;i<fieldCount;i++) {
-    public Comparator<BytesRef> getComparator() {
- return termComp;
- }
-
- @Override
- public PostingsConsumer startTerm(BytesRef text) throws IOException {
- postingsWriter.startTerm();
- return postingsWriter;
- }
-
- @Override
- public void finishTerm(BytesRef text, int numDocs) throws IOException {
-
- assert numDocs > 0;
- //System.out.println("finishTerm term=" + fieldInfo.name + ":" + text.utf8ToString() + " fp=" + out.getFilePointer());
-
- final boolean isIndexTerm = fieldIndexWriter.checkIndexTerm(text, numDocs);
-
- termWriter.write(text);
- final int highBit = isIndexTerm ? 0x80 : 0;
- //System.out.println(" isIndex=" + isIndexTerm);
-
- // This is a vInt, except, we steal top bit to record
- // whether this was an indexed term:
- if ((numDocs & ~0x3F) == 0) {
- // Fast case -- docFreq fits in 6 bits
- out.writeByte((byte) (highBit | numDocs));
- } else {
- // Write bottom 6 bits of docFreq, then write the
- // remainder as vInt:
- out.writeByte((byte) (highBit | 0x40 | (numDocs & 0x3F)));
- out.writeVInt(numDocs >>> 6);
- }
- postingsWriter.finishTerm(numDocs, isIndexTerm);
- numTerms++;
- }
-
- // Finishes all terms in this field
- @Override
- public void finish() throws IOException {
- // EOF marker:
- out.writeVInt(DeltaBytesWriter.TERM_EOF);
- fieldIndexWriter.finish();
- }
- }
-}
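The deleted writer above packs each term's docFreq and its is-index-term flag into a single leading byte (bit 0x80 = index term, bit 0x40 = more docFreq bits follow as a vInt, low 6 bits = docFreq), mirroring the decode path in the reader removed earlier in this patch. A minimal standalone sketch of that byte layout, using a hypothetical class name and plain java.io streams instead of Lucene's IndexOutput/IndexInput:

import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.DataInputStream;
import java.io.DataOutputStream;
import java.io.IOException;

// Illustration of the old terms-dict docFreq byte: the high bit marks an
// indexed term, bit 0x40 marks that the remaining docFreq bits follow as a
// vInt, and the low 6 bits hold docFreq (or its low 6 bits).
public class DocFreqByteSketch {

  static void writeVInt(DataOutputStream out, int i) throws IOException {
    while ((i & ~0x7F) != 0) {
      out.writeByte((i & 0x7F) | 0x80);
      i >>>= 7;
    }
    out.writeByte(i);
  }

  static int readVInt(DataInputStream in) throws IOException {
    byte b = in.readByte();
    int i = b & 0x7F;
    for (int shift = 7; (b & 0x80) != 0; shift += 7) {
      b = in.readByte();
      i |= (b & 0x7F) << shift;
    }
    return i;
  }

  static void write(DataOutputStream out, int docFreq, boolean isIndexTerm) throws IOException {
    final int highBit = isIndexTerm ? 0x80 : 0;
    if ((docFreq & ~0x3F) == 0) {
      out.writeByte((byte) (highBit | docFreq));           // fast case: fits in 6 bits
    } else {
      out.writeByte((byte) (highBit | 0x40 | (docFreq & 0x3F)));
      writeVInt(out, docFreq >>> 6);                       // remainder as vInt
    }
  }

  // Returns {docFreq, isIndexTerm ? 1 : 0}
  static int[] read(DataInputStream in) throws IOException {
    final byte b = in.readByte();
    final int isIndexTerm = (b & 0x80) != 0 ? 1 : 0;
    final int docFreq = (b & 0x40) == 0 ? (b & 0x3F) : ((readVInt(in) << 6) | (b & 0x3F));
    return new int[] { docFreq, isIndexTerm };
  }

  public static void main(String[] args) throws IOException {
    ByteArrayOutputStream bytes = new ByteArrayOutputStream();
    DataOutputStream out = new DataOutputStream(bytes);
    write(out, 17, true);      // small docFreq, index term
    write(out, 100000, false); // large docFreq, non-index term
    DataInputStream in = new DataInputStream(new ByteArrayInputStream(bytes.toByteArray()));
    System.out.println(java.util.Arrays.toString(read(in))); // [17, 1]
    System.out.println(java.util.Arrays.toString(read(in))); // [100000, 0]
  }
}

The 6-bit fast path means a term whose docFreq is 63 or less spends exactly one byte on both the frequency and the index-term flag, which covers the long tail of rare terms in a typical index.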
diff --git a/lucene/src/java/org/apache/lucene/index/codecs/DeltaBytesReader.java b/lucene/src/java/org/apache/lucene/index/codecs/TermStats.java
similarity index 50%
rename from lucene/src/java/org/apache/lucene/index/codecs/DeltaBytesReader.java
rename to lucene/src/java/org/apache/lucene/index/codecs/TermStats.java
index 0514dad96a7..bb2b6f34d27 100644
--- a/lucene/src/java/org/apache/lucene/index/codecs/DeltaBytesReader.java
+++ b/lucene/src/java/org/apache/lucene/index/codecs/TermStats.java
@@ -17,36 +17,12 @@ package org.apache.lucene.index.codecs;
* limitations under the License.
*/
-import org.apache.lucene.store.IndexInput;
-import org.apache.lucene.util.BytesRef;
+public class TermStats {
+ public final int docFreq;
+ public final long totalTermFreq;
-import java.io.IOException;
-
-// Handles reading incremental UTF8 encoded terms
-final class DeltaBytesReader {
- final BytesRef term = new BytesRef();
- final IndexInput in;
-
- DeltaBytesReader(IndexInput in) {
- this.in = in;
- term.bytes = new byte[10];
- }
-
- void reset(BytesRef text) {
- term.copy(text);
- }
-
- boolean read() throws IOException {
- final int start = in.readVInt();
- if (start == DeltaBytesWriter.TERM_EOF) {
- return false;
- }
- final int suffix = in.readVInt();
- assert start <= term.length: "start=" + start + " length=" + term.length;
- final int newLength = start+suffix;
- term.grow(newLength);
- in.readBytes(term.bytes, start, suffix);
- term.length = newLength;
- return true;
+ public TermStats(int docFreq, long totalTermFreq) {
+ this.docFreq = docFreq;
+ this.totalTermFreq = totalTermFreq;
}
}
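DeltaBytesReader, deleted above in favor of the new TermStats value class, decoded terms incrementally: each on-disk entry carries the length of the prefix shared with the previous term, then the suffix length and suffix bytes, patched into a reused buffer. A rough sketch of that prefix-delta scheme, using plain Java arrays and assumed names rather than the Lucene BytesRef/IndexInput types:

import java.util.ArrayList;
import java.util.List;

// Rough sketch of prefix-delta term coding: each term is represented as
// (sharedPrefixLen, suffixLen, suffixBytes) relative to the previous term.
public class DeltaTermsSketch {

  // Encode a sorted list of terms into (prefixLen, suffixLen) records.
  static List<int[]> encodeLengths(List<byte[]> sortedTerms) {
    List<int[]> records = new ArrayList<>();
    byte[] prev = new byte[0];
    for (byte[] term : sortedTerms) {
      int prefix = 0;
      int max = Math.min(prev.length, term.length);
      while (prefix < max && prev[prefix] == term[prefix]) {
        prefix++;
      }
      records.add(new int[] { prefix, term.length - prefix });
      prev = term;
    }
    return records;
  }

  // Decode: overwrite the tail of the previous term with the new suffix,
  // much like DeltaBytesReader.read() grows and patches its reused buffer.
  static byte[] apply(byte[] prev, int prefix, byte[] suffix) {
    byte[] term = new byte[prefix + suffix.length];
    System.arraycopy(prev, 0, term, 0, prefix);
    System.arraycopy(suffix, 0, term, prefix, suffix.length);
    return term;
  }

  public static void main(String[] args) {
    byte[] a = "apple".getBytes();
    byte[] b = "applet".getBytes();
    byte[] c = "apply".getBytes();
    for (int[] r : encodeLengths(List.of(a, b, c))) {
      System.out.print(r[0] + "+" + r[1] + " ");   // 0+5 5+1 4+1
    }
    System.out.println();
    System.out.println(new String(apply(b, 4, "y".getBytes()))); // apply
  }
}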
diff --git a/lucene/src/java/org/apache/lucene/index/codecs/TermsConsumer.java b/lucene/src/java/org/apache/lucene/index/codecs/TermsConsumer.java
index 48fc7e01660..93b578ce17c 100644
--- a/lucene/src/java/org/apache/lucene/index/codecs/TermsConsumer.java
+++ b/lucene/src/java/org/apache/lucene/index/codecs/TermsConsumer.java
@@ -38,10 +38,10 @@ public abstract class TermsConsumer {
public abstract PostingsConsumer startTerm(BytesRef text) throws IOException;
/** Finishes the current term; numDocs must be > 0. */
- public abstract void finishTerm(BytesRef text, int numDocs) throws IOException;
+ public abstract void finishTerm(BytesRef text, TermStats stats) throws IOException;
/** Called when we are done adding terms to this field */
- public abstract void finish() throws IOException;
+ public abstract void finish(long sumTotalTermFreq) throws IOException;
/** Return the BytesRef Comparator used to sort terms
* before feeding to this API. */
@@ -55,6 +55,8 @@ public abstract class TermsConsumer {
BytesRef term;
assert termsEnum != null;
+ long sumTotalTermFreq = 0;
+ long sumDF = 0;
if (mergeState.fieldInfo.omitTermFreqAndPositions) {
if (docsEnum == null) {
@@ -69,9 +71,14 @@ public abstract class TermsConsumer {
if (docsEnumIn != null) {
docsEnum.reset(docsEnumIn);
final PostingsConsumer postingsConsumer = startTerm(term);
- final int numDocs = postingsConsumer.merge(mergeState, docsEnum);
- if (numDocs > 0) {
- finishTerm(term, numDocs);
+ final TermStats stats = postingsConsumer.merge(mergeState, docsEnum);
+ if (stats.docFreq > 0) {
+ finishTerm(term, stats);
+ sumDF += stats.docFreq;
+ if (sumDF > 60000) {
+ mergeState.checkAbort.work(sumDF/5.0);
+ sumDF = 0;
+ }
}
}
}
@@ -94,14 +101,20 @@ public abstract class TermsConsumer {
}
}
final PostingsConsumer postingsConsumer = startTerm(term);
- final int numDocs = postingsConsumer.merge(mergeState, postingsEnum);
- if (numDocs > 0) {
- finishTerm(term, numDocs);
+ final TermStats stats = postingsConsumer.merge(mergeState, postingsEnum);
+ if (stats.docFreq > 0) {
+ finishTerm(term, stats);
+ sumTotalTermFreq += stats.totalTermFreq;
+ sumDF += stats.docFreq;
+ if (sumDF > 60000) {
+ mergeState.checkAbort.work(sumDF/5.0);
+ sumDF = 0;
+ }
}
}
}
}
- finish();
+ finish(sumTotalTermFreq);
}
}
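The TermsConsumer changes above thread per-term statistics (docFreq, totalTermFreq) through finishTerm and hand the field-level sum to finish. A toy consumer showing the intended call pattern; the class is hypothetical, and the real merge loop additionally reports progress through checkAbort.work roughly every 60k docs, as the hunk shows:

// Toy illustration of the reworked contract: finishTerm receives per-term
// stats, finish uses the summed totalTermFreq for the whole field.
public class StatsSummingConsumerSketch {

  static final class TermStats {            // mirror of the new value class
    final int docFreq;
    final long totalTermFreq;
    TermStats(int docFreq, long totalTermFreq) {
      this.docFreq = docFreq;
      this.totalTermFreq = totalTermFreq;
    }
  }

  private long sumTotalTermFreq;
  private long sumDocFreq;

  void finishTerm(String term, TermStats stats) {
    assert stats.docFreq > 0;
    sumTotalTermFreq += stats.totalTermFreq;
    sumDocFreq += stats.docFreq;
    // A real merge would also periodically report progress (checkAbort.work).
  }

  void finish() {
    System.out.println("field totals: docFreq=" + sumDocFreq
        + " totalTermFreq=" + sumTotalTermFreq);
  }

  public static void main(String[] args) {
    StatsSummingConsumerSketch c = new StatsSummingConsumerSketch();
    c.finishTerm("apple", new TermStats(3, 7));
    c.finishTerm("banana", new TermStats(1, 1));
    c.finish(); // field totals: docFreq=4 totalTermFreq=8
  }
}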
diff --git a/lucene/src/java/org/apache/lucene/index/codecs/TermsIndexWriterBase.java b/lucene/src/java/org/apache/lucene/index/codecs/TermsIndexWriterBase.java
index e74cd1a52d0..53f1a7e7d81 100644
--- a/lucene/src/java/org/apache/lucene/index/codecs/TermsIndexWriterBase.java
+++ b/lucene/src/java/org/apache/lucene/index/codecs/TermsIndexWriterBase.java
@@ -17,7 +17,6 @@ package org.apache.lucene.index.codecs;
* limitations under the License.
*/
-import org.apache.lucene.store.IndexOutput;
import org.apache.lucene.index.FieldInfo;
import org.apache.lucene.util.BytesRef;
import java.io.IOException;
@@ -25,14 +24,13 @@ import java.io.IOException;
/** @lucene.experimental */
public abstract class TermsIndexWriterBase {
- public abstract void setTermsOutput(IndexOutput out);
-
public abstract class FieldWriter {
- public abstract boolean checkIndexTerm(BytesRef text, int docFreq) throws IOException;
- public abstract void finish() throws IOException;
+ public abstract boolean checkIndexTerm(BytesRef text, TermStats stats) throws IOException;
+ public abstract void add(BytesRef text, TermStats stats, long termsFilePointer) throws IOException;
+ public abstract void finish(long termsFilePointer) throws IOException;
}
- public abstract FieldWriter addField(FieldInfo fieldInfo) throws IOException;
+ public abstract FieldWriter addField(FieldInfo fieldInfo, long termsFilePointer) throws IOException;
public abstract void close() throws IOException;
}
diff --git a/lucene/src/java/org/apache/lucene/index/codecs/VariableGapTermsIndexReader.java b/lucene/src/java/org/apache/lucene/index/codecs/VariableGapTermsIndexReader.java
index 60ca441c51f..68ec78ab023 100644
--- a/lucene/src/java/org/apache/lucene/index/codecs/VariableGapTermsIndexReader.java
+++ b/lucene/src/java/org/apache/lucene/index/codecs/VariableGapTermsIndexReader.java
@@ -164,9 +164,6 @@ public class VariableGapTermsIndexReader extends TermsIndexReaderBase {
this.fieldInfo = fieldInfo;
this.indexStart = indexStart;
- // We still create the indexReader when indexDivisor
- // is -1, so that PrefixCodedTermsReader can call
- // isIndexTerm for each field:
if (indexDivisor > 0) {
loadTermsIndex();
}
diff --git a/lucene/src/java/org/apache/lucene/index/codecs/VariableGapTermsIndexWriter.java b/lucene/src/java/org/apache/lucene/index/codecs/VariableGapTermsIndexWriter.java
index 12195e813ae..e4cba764738 100644
--- a/lucene/src/java/org/apache/lucene/index/codecs/VariableGapTermsIndexWriter.java
+++ b/lucene/src/java/org/apache/lucene/index/codecs/VariableGapTermsIndexWriter.java
@@ -52,14 +52,14 @@ public class VariableGapTermsIndexWriter extends TermsIndexWriterBase {
private final List fields = new ArrayList();
private final FieldInfos fieldInfos; // unread
- private IndexOutput termsOut;
private final IndexTermSelector policy;
/** @lucene.experimental */
public static abstract class IndexTermSelector {
// Called sequentially on every term being written,
// returning true if this term should be indexed
- public abstract boolean isIndexTerm(BytesRef term, int docFreq);
+ public abstract boolean isIndexTerm(BytesRef term, TermStats stats);
+ public abstract void newField(FieldInfo fieldInfo);
}
/** Same policy as {@link FixedGapTermsIndexWriter} */
@@ -74,15 +74,20 @@ public class VariableGapTermsIndexWriter extends TermsIndexWriterBase {
}
@Override
- public boolean isIndexTerm(BytesRef term, int docFreq) {
+ public boolean isIndexTerm(BytesRef term, TermStats stats) {
if (count >= interval) {
- count = 0;
+ count = 1;
return true;
} else {
count++;
return false;
}
}
+
+ @Override
+ public void newField(FieldInfo fieldInfo) {
+ count = interval;
+ }
}
/** Sets an index term when docFreq >= docFreqThresh, or
@@ -96,18 +101,26 @@ public class VariableGapTermsIndexWriter extends TermsIndexWriterBase {
public EveryNOrDocFreqTermSelector(int docFreqThresh, int interval) {
this.interval = interval;
this.docFreqThresh = docFreqThresh;
+
+ // First term is first indexed term:
+ count = interval;
}
@Override
- public boolean isIndexTerm(BytesRef term, int docFreq) {
- if (docFreq >= docFreqThresh || count >= interval) {
- count = 0;
+ public boolean isIndexTerm(BytesRef term, TermStats stats) {
+ if (stats.docFreq >= docFreqThresh || count >= interval) {
+ count = 1;
return true;
} else {
count++;
return false;
}
}
+
+ @Override
+ public void newField(FieldInfo fieldInfo) {
+ count = interval;
+ }
}
// TODO: it'd be nice to let the FST builder prune based
@@ -158,14 +171,10 @@ public class VariableGapTermsIndexWriter extends TermsIndexWriterBase {
}
@Override
- public void setTermsOutput(IndexOutput termsOut) {
- this.termsOut = termsOut;
- }
-
- @Override
- public FieldWriter addField(FieldInfo field) throws IOException {
- //System.out.println("VGW: field=" + field.name);
- FSTFieldWriter writer = new FSTFieldWriter(field);
+ public FieldWriter addField(FieldInfo field, long termsFilePointer) throws IOException {
+ ////System.out.println("VGW: field=" + field.name);
+ policy.newField(field);
+ FSTFieldWriter writer = new FSTFieldWriter(field, termsFilePointer);
fields.add(writer);
return writer;
}
@@ -200,42 +209,48 @@ public class VariableGapTermsIndexWriter extends TermsIndexWriterBase {
private final BytesRef lastTerm = new BytesRef();
private boolean first = true;
- public FSTFieldWriter(FieldInfo fieldInfo) throws IOException {
+ public FSTFieldWriter(FieldInfo fieldInfo, long termsFilePointer) throws IOException {
this.fieldInfo = fieldInfo;
fstOutputs = PositiveIntOutputs.getSingleton(true);
fstBuilder = new Builder(FST.INPUT_TYPE.BYTE1,
0, 0, true,
fstOutputs);
indexStart = out.getFilePointer();
- //System.out.println("VGW: field=" + fieldInfo.name);
+ ////System.out.println("VGW: field=" + fieldInfo.name);
// Always put empty string in
- fstBuilder.add(new BytesRef(), fstOutputs.get(termsOut.getFilePointer()));
+ fstBuilder.add(new BytesRef(), fstOutputs.get(termsFilePointer));
}
@Override
- public boolean checkIndexTerm(BytesRef text, int docFreq) throws IOException {
- if (policy.isIndexTerm(text, docFreq) || first) {
+ public boolean checkIndexTerm(BytesRef text, TermStats stats) throws IOException {
+ //System.out.println("VGW: index term=" + text.utf8ToString());
+ // NOTE: we must force the first term per field to be
+ // indexed, in case policy doesn't:
+ if (policy.isIndexTerm(text, stats) || first) {
first = false;
- //System.out.println("VGW: index term=" + text.utf8ToString() + " fp=" + termsOut.getFilePointer());
- final int lengthSave = text.length;
- text.length = indexedTermPrefixLength(lastTerm, text);
- try {
- fstBuilder.add(text, fstOutputs.get(termsOut.getFilePointer()));
- } finally {
- text.length = lengthSave;
- }
- lastTerm.copy(text);
+ //System.out.println(" YES");
return true;
} else {
- //System.out.println("VGW: not index term=" + text.utf8ToString() + " fp=" + termsOut.getFilePointer());
lastTerm.copy(text);
return false;
}
}
@Override
- public void finish() throws IOException {
+ public void add(BytesRef text, TermStats stats, long termsFilePointer) throws IOException {
+ final int lengthSave = text.length;
+ text.length = indexedTermPrefixLength(lastTerm, text);
+ try {
+ fstBuilder.add(text, fstOutputs.get(termsFilePointer));
+ } finally {
+ text.length = lengthSave;
+ }
+ lastTerm.copy(text);
+ }
+
+ @Override
+ public void finish(long termsFilePointer) throws IOException {
fst = fstBuilder.finish();
if (fst != null) {
fst.save(out);
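The selector changes above reset count to interval in newField so the first term of every field becomes an index term, and restart count at 1 rather than 0 after an indexed term so the gap between index terms stays exactly interval. A compact standalone version of that policy, illustrative only; the docFreq-threshold variant adds one extra OR condition:

// Standalone version of the every-N index-term policy: the first term after
// newField() is always indexed, then every interval-th term after that.
public class EveryNSelectorSketch {
  private final int interval;
  private int count;

  public EveryNSelectorSketch(int interval) {
    this.interval = interval;
    count = interval;              // force the first term to be indexed
  }

  public void newField() {
    count = interval;              // same trick at every field boundary
  }

  public boolean isIndexTerm() {
    if (count >= interval) {
      count = 1;                   // this term becomes "term 1" of the next gap
      return true;
    } else {
      count++;
      return false;
    }
  }

  public static void main(String[] args) {
    EveryNSelectorSketch sel = new EveryNSelectorSketch(3);
    StringBuilder picks = new StringBuilder();
    for (int i = 0; i < 8; i++) {
      picks.append(sel.isIndexTerm() ? 'I' : '.');
    }
    System.out.println(picks); // I..I..I. -> terms 0, 3, 6 are indexed
  }
}

With the old count = 0 reset, the first gap after an indexed term was interval + 1 terms wide; restarting at 1 keeps the spacing uniform.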
diff --git a/lucene/src/java/org/apache/lucene/index/codecs/intblock/FixedIntBlockIndexInput.java b/lucene/src/java/org/apache/lucene/index/codecs/intblock/FixedIntBlockIndexInput.java
index 3fb9adcb3b6..1b6829dc28d 100644
--- a/lucene/src/java/org/apache/lucene/index/codecs/intblock/FixedIntBlockIndexInput.java
+++ b/lucene/src/java/org/apache/lucene/index/codecs/intblock/FixedIntBlockIndexInput.java
@@ -24,6 +24,7 @@ package org.apache.lucene.index.codecs.intblock;
import java.io.IOException;
import org.apache.lucene.index.codecs.sep.IntIndexInput;
+import org.apache.lucene.store.DataInput;
import org.apache.lucene.store.IndexInput;
import org.apache.lucene.util.IntsRef;
@@ -149,7 +150,7 @@ public abstract class FixedIntBlockIndexInput extends IntIndexInput {
private int upto;
@Override
- public void read(final IndexInput indexIn, final boolean absolute) throws IOException {
+ public void read(final DataInput indexIn, final boolean absolute) throws IOException {
if (absolute) {
fp = indexIn.readVLong();
upto = indexIn.readVInt();
@@ -205,5 +206,10 @@ public abstract class FixedIntBlockIndexInput extends IntIndexInput {
other.upto = upto;
return other;
}
+
+ @Override
+ public String toString() {
+ return "fp=" + fp + " upto=" + upto;
+ }
}
}
diff --git a/lucene/src/java/org/apache/lucene/index/codecs/intblock/FixedIntBlockIndexOutput.java b/lucene/src/java/org/apache/lucene/index/codecs/intblock/FixedIntBlockIndexOutput.java
index 00658b0b5f1..8b5e4988fcd 100644
--- a/lucene/src/java/org/apache/lucene/index/codecs/intblock/FixedIntBlockIndexOutput.java
+++ b/lucene/src/java/org/apache/lucene/index/codecs/intblock/FixedIntBlockIndexOutput.java
@@ -111,6 +111,11 @@ public abstract class FixedIntBlockIndexOutput extends IntIndexOutput {
lastUpto = upto;
lastFP = fp;
}
+
+ @Override
+ public String toString() {
+ return "fp=" + fp + " upto=" + upto;
+ }
}
@Override
diff --git a/lucene/src/java/org/apache/lucene/index/codecs/intblock/VariableIntBlockIndexInput.java b/lucene/src/java/org/apache/lucene/index/codecs/intblock/VariableIntBlockIndexInput.java
index 6084df41ca4..0881587d041 100644
--- a/lucene/src/java/org/apache/lucene/index/codecs/intblock/VariableIntBlockIndexInput.java
+++ b/lucene/src/java/org/apache/lucene/index/codecs/intblock/VariableIntBlockIndexInput.java
@@ -24,6 +24,7 @@ package org.apache.lucene.index.codecs.intblock;
import java.io.IOException;
import org.apache.lucene.index.codecs.sep.IntIndexInput;
+import org.apache.lucene.store.DataInput;
import org.apache.lucene.store.IndexInput;
import org.apache.lucene.util.IntsRef;
@@ -168,7 +169,7 @@ public abstract class VariableIntBlockIndexInput extends IntIndexInput {
private int upto;
@Override
- public void read(final IndexInput indexIn, final boolean absolute) throws IOException {
+ public void read(final DataInput indexIn, final boolean absolute) throws IOException {
if (absolute) {
fp = indexIn.readVLong();
upto = indexIn.readByte()&0xFF;
diff --git a/lucene/src/java/org/apache/lucene/index/codecs/intblock/package.html b/lucene/src/java/org/apache/lucene/index/codecs/intblock/package.html
new file mode 100644
index 00000000000..403ea1b55f6
--- /dev/null
+++ b/lucene/src/java/org/apache/lucene/index/codecs/intblock/package.html
@@ -0,0 +1,25 @@
+
+
+
+
+
+
+
+Intblock: base support for fixed or variable length block integer encoders
+
+
diff --git a/lucene/src/java/org/apache/lucene/index/codecs/package.html b/lucene/src/java/org/apache/lucene/index/codecs/package.html
new file mode 100644
index 00000000000..78dcb95de64
--- /dev/null
+++ b/lucene/src/java/org/apache/lucene/index/codecs/package.html
@@ -0,0 +1,25 @@
+
+
+
+
+
+
+
+Codecs API: API for customization of the encoding and structure of the index.
+
+
diff --git a/lucene/src/java/org/apache/lucene/index/codecs/preflex/PreFlexFields.java b/lucene/src/java/org/apache/lucene/index/codecs/preflex/PreFlexFields.java
index ec65dcf47d8..31cb23a4e58 100644
--- a/lucene/src/java/org/apache/lucene/index/codecs/preflex/PreFlexFields.java
+++ b/lucene/src/java/org/apache/lucene/index/codecs/preflex/PreFlexFields.java
@@ -269,6 +269,11 @@ public class PreFlexFields extends FieldsProducer {
return BytesRef.getUTF8SortedAsUTF16Comparator();
}
}
+
+ @Override
+ public long getSumTotalTermFreq() {
+ return -1;
+ }
}
private class PreTermsEnum extends TermsEnum {
@@ -540,7 +545,7 @@ public class PreFlexFields extends FieldsProducer {
// We can easily detect S in UTF8: if a byte has
// prefix 11110 (0xf0), then that byte and the
// following 3 bytes encode a single unicode codepoint
- // in S. Similary,we can detect E: if a byte has
+ // in S. Similarly, we can detect E: if a byte has
// prefix 1110111 (0xee), then that byte and the
// following 2 bytes encode a single unicode codepoint
// in E.
@@ -748,11 +753,6 @@ public class PreFlexFields extends FieldsProducer {
}
}
- @Override
- public void cacheCurrentTerm() throws IOException {
- getTermsDict().cacheCurrentTerm(termEnum);
- }
-
@Override
public SeekStatus seek(long ord) throws IOException {
throw new UnsupportedOperationException();
@@ -949,6 +949,11 @@ public class PreFlexFields extends FieldsProducer {
return termEnum.docFreq();
}
+ @Override
+ public long totalTermFreq() {
+ return -1;
+ }
+
@Override
public DocsEnum docs(Bits skipDocs, DocsEnum reuse) throws IOException {
PreDocsEnum docsEnum;
@@ -982,7 +987,7 @@ public class PreFlexFields extends FieldsProducer {
private final class PreDocsEnum extends DocsEnum {
final private SegmentTermDocs docs;
-
+ private int docID = -1;
PreDocsEnum() throws IOException {
docs = new SegmentTermDocs(freqStream, getTermsDict(), fieldInfos);
}
@@ -1000,18 +1005,18 @@ public class PreFlexFields extends FieldsProducer {
@Override
public int nextDoc() throws IOException {
if (docs.next()) {
- return docs.doc();
+ return docID = docs.doc();
} else {
- return NO_MORE_DOCS;
+ return docID = NO_MORE_DOCS;
}
}
@Override
public int advance(int target) throws IOException {
if (docs.skipTo(target)) {
- return docs.doc();
+ return docID = docs.doc();
} else {
- return NO_MORE_DOCS;
+ return docID = NO_MORE_DOCS;
}
}
@@ -1022,7 +1027,7 @@ public class PreFlexFields extends FieldsProducer {
@Override
public int docID() {
- return docs.doc();
+ return docID;
}
@Override
@@ -1038,7 +1043,7 @@ public class PreFlexFields extends FieldsProducer {
private final class PreDocsAndPositionsEnum extends DocsAndPositionsEnum {
final private SegmentTermPositions pos;
-
+ private int docID = -1;
PreDocsAndPositionsEnum() throws IOException {
pos = new SegmentTermPositions(freqStream, proxStream, getTermsDict(), fieldInfos);
}
@@ -1056,18 +1061,18 @@ public class PreFlexFields extends FieldsProducer {
@Override
public int nextDoc() throws IOException {
if (pos.next()) {
- return pos.doc();
+ return docID = pos.doc();
} else {
- return NO_MORE_DOCS;
+ return docID = NO_MORE_DOCS;
}
}
@Override
public int advance(int target) throws IOException {
if (pos.skipTo(target)) {
- return pos.doc();
+ return docID = pos.doc();
} else {
- return NO_MORE_DOCS;
+ return docID = NO_MORE_DOCS;
}
}
@@ -1078,16 +1083,18 @@ public class PreFlexFields extends FieldsProducer {
@Override
public int docID() {
- return pos.doc();
+ return docID;
}
@Override
public int nextPosition() throws IOException {
+ assert docID != NO_MORE_DOCS;
return pos.nextPosition();
}
@Override
public boolean hasPayload() {
+ assert docID != NO_MORE_DOCS;
return pos.isPayloadAvailable();
}
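The PreFlexFields change above stops delegating docID() to the underlying SegmentTermDocs/SegmentTermPositions and instead caches whatever nextDoc/advance returned, so docID() keeps reporting NO_MORE_DOCS once the enum is exhausted. A minimal illustration of that caching pattern over a plain int array; the enum class here is hypothetical, not the Lucene one:

// Minimal enum that caches the current doc so docID() is well defined
// before iteration (-1) and after exhaustion (NO_MORE_DOCS).
public class CachedDocIdEnumSketch {
  static final int NO_MORE_DOCS = Integer.MAX_VALUE;

  private final int[] docs;
  private int upto = -1;
  private int docID = -1;          // cached; never read from the source after EOF

  public CachedDocIdEnumSketch(int[] sortedDocs) {
    this.docs = sortedDocs;
  }

  public int nextDoc() {
    upto++;
    return docID = (upto < docs.length) ? docs[upto] : NO_MORE_DOCS;
  }

  public int docID() {
    return docID;
  }

  public static void main(String[] args) {
    CachedDocIdEnumSketch e = new CachedDocIdEnumSketch(new int[] {2, 5});
    System.out.println(e.docID());   // -1
    while (e.nextDoc() != NO_MORE_DOCS) {
      System.out.println(e.docID()); // 2 then 5
    }
    System.out.println(e.docID());   // 2147483647 (NO_MORE_DOCS), not the last real doc
  }
}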
diff --git a/lucene/src/java/org/apache/lucene/index/codecs/preflex/SegmentTermEnum.java b/lucene/src/java/org/apache/lucene/index/codecs/preflex/SegmentTermEnum.java
index a8703ae83f1..fb7c8ceec46 100644
--- a/lucene/src/java/org/apache/lucene/index/codecs/preflex/SegmentTermEnum.java
+++ b/lucene/src/java/org/apache/lucene/index/codecs/preflex/SegmentTermEnum.java
@@ -45,7 +45,7 @@ public final class SegmentTermEnum implements Cloneable {
// whenever you add a new format, make it 1 smaller (negative version logic)!
public static final int FORMAT_CURRENT = FORMAT_VERSION_UTF8_LENGTH_IN_BYTES;
- // when removing support for old versions, levae the last supported version here
+ // when removing support for old versions, leave the last supported version here
public static final int FORMAT_MINIMUM = FORMAT_VERSION_UTF8_LENGTH_IN_BYTES;
private TermBuffer termBuffer = new TermBuffer();
diff --git a/lucene/src/java/org/apache/lucene/index/codecs/preflex/SegmentTermPositions.java b/lucene/src/java/org/apache/lucene/index/codecs/preflex/SegmentTermPositions.java
index f50d226741c..c642f6b1aaa 100644
--- a/lucene/src/java/org/apache/lucene/index/codecs/preflex/SegmentTermPositions.java
+++ b/lucene/src/java/org/apache/lucene/index/codecs/preflex/SegmentTermPositions.java
@@ -58,6 +58,7 @@ extends SegmentTermDocs {
this.proxStreamOrig = proxStream; // the proxStream will be cloned lazily when nextPosition() is called for the first time
}
+ @Override
final void seek(TermInfo ti, Term term) throws IOException {
super.seek(ti, term);
if (ti != null)
@@ -69,6 +70,7 @@ extends SegmentTermDocs {
needToLoadPayload = false;
}
+ @Override
public final void close() throws IOException {
super.close();
if (proxStream != null) proxStream.close();
@@ -100,11 +102,13 @@ extends SegmentTermDocs {
return delta;
}
+ @Override
protected final void skippingDoc() throws IOException {
// we remember to skip a document lazily
lazySkipProxCount += freq;
}
+ @Override
public final boolean next() throws IOException {
// we remember to skip the remaining positions of the current
// document lazily
@@ -118,12 +122,14 @@ extends SegmentTermDocs {
return false;
}
+ @Override
public final int read(final int[] docs, final int[] freqs) {
throw new UnsupportedOperationException("TermPositions does not support processing multiple documents in one call. Use TermDocs instead.");
}
/** Called by super.skipTo(). */
+ @Override
protected void skipProx(long proxPointer, int payloadLength) throws IOException {
// we save the pointer, we might have to skip there lazily
lazySkipPointer = proxPointer;
diff --git a/lucene/src/java/org/apache/lucene/index/codecs/preflex/TermInfosReader.java b/lucene/src/java/org/apache/lucene/index/codecs/preflex/TermInfosReader.java
index adf0535390d..8205e73b972 100644
--- a/lucene/src/java/org/apache/lucene/index/codecs/preflex/TermInfosReader.java
+++ b/lucene/src/java/org/apache/lucene/index/codecs/preflex/TermInfosReader.java
@@ -67,15 +67,18 @@ public final class TermInfosReader {
this.term = t;
}
+ @Override
public boolean equals(Object other) {
CloneableTerm t = (CloneableTerm) other;
return this.term.equals(t.term);
}
+ @Override
public int hashCode() {
return term.hashCode();
}
+ @Override
public Object clone() {
return new CloneableTerm(term);
}
diff --git a/lucene/src/java/org/apache/lucene/index/codecs/preflex/package.html b/lucene/src/java/org/apache/lucene/index/codecs/preflex/package.html
new file mode 100644
index 00000000000..c6c96c978c2
--- /dev/null
+++ b/lucene/src/java/org/apache/lucene/index/codecs/preflex/package.html
@@ -0,0 +1,25 @@
+
+
+
+
+
+
+
+Preflex codec: supports Lucene 3.x indexes (readonly)
+
+
diff --git a/lucene/src/java/org/apache/lucene/index/codecs/pulsing/PulsingCodec.java b/lucene/src/java/org/apache/lucene/index/codecs/pulsing/PulsingCodec.java
index 19cf99a733c..0867425baa5 100644
--- a/lucene/src/java/org/apache/lucene/index/codecs/pulsing/PulsingCodec.java
+++ b/lucene/src/java/org/apache/lucene/index/codecs/pulsing/PulsingCodec.java
@@ -32,8 +32,8 @@ import org.apache.lucene.index.codecs.FieldsConsumer;
import org.apache.lucene.index.codecs.FieldsProducer;
import org.apache.lucene.index.codecs.FixedGapTermsIndexReader;
import org.apache.lucene.index.codecs.FixedGapTermsIndexWriter;
-import org.apache.lucene.index.codecs.PrefixCodedTermsReader;
-import org.apache.lucene.index.codecs.PrefixCodedTermsWriter;
+import org.apache.lucene.index.codecs.BlockTermsReader;
+import org.apache.lucene.index.codecs.BlockTermsWriter;
import org.apache.lucene.index.codecs.TermsIndexReaderBase;
import org.apache.lucene.index.codecs.TermsIndexWriterBase;
import org.apache.lucene.index.codecs.standard.StandardCodec;
@@ -89,7 +89,7 @@ public class PulsingCodec extends Codec {
// Terms dict
success = false;
try {
- FieldsConsumer ret = new PrefixCodedTermsWriter(indexWriter, state, pulsingWriter, BytesRef.getUTF8SortedAsUnicodeComparator());
+ FieldsConsumer ret = new BlockTermsWriter(indexWriter, state, pulsingWriter, BytesRef.getUTF8SortedAsUnicodeComparator());
success = true;
return ret;
} finally {
@@ -132,13 +132,13 @@ public class PulsingCodec extends Codec {
// Terms dict reader
success = false;
try {
- FieldsProducer ret = new PrefixCodedTermsReader(indexReader,
- state.dir, state.fieldInfos, state.segmentInfo.name,
- pulsingReader,
- state.readBufferSize,
- BytesRef.getUTF8SortedAsUnicodeComparator(),
- StandardCodec.TERMS_CACHE_SIZE,
- state.codecId);
+ FieldsProducer ret = new BlockTermsReader(indexReader,
+ state.dir, state.fieldInfos, state.segmentInfo.name,
+ pulsingReader,
+ state.readBufferSize,
+ BytesRef.getUTF8SortedAsUnicodeComparator(),
+ StandardCodec.TERMS_CACHE_SIZE,
+ state.codecId);
success = true;
return ret;
} finally {
@@ -155,7 +155,7 @@ public class PulsingCodec extends Codec {
@Override
public void files(Directory dir, SegmentInfo segmentInfo, String id, Set files) throws IOException {
StandardPostingsReader.files(dir, segmentInfo, id, files);
- PrefixCodedTermsReader.files(dir, segmentInfo, id, files);
+ BlockTermsReader.files(dir, segmentInfo, id, files);
FixedGapTermsIndexReader.files(dir, segmentInfo, id, files);
}
diff --git a/lucene/src/java/org/apache/lucene/index/codecs/pulsing/PulsingPostingsReaderImpl.java b/lucene/src/java/org/apache/lucene/index/codecs/pulsing/PulsingPostingsReaderImpl.java
index 4914b36059a..6adab4d9f19 100644
--- a/lucene/src/java/org/apache/lucene/index/codecs/pulsing/PulsingPostingsReaderImpl.java
+++ b/lucene/src/java/org/apache/lucene/index/codecs/pulsing/PulsingPostingsReaderImpl.java
@@ -19,14 +19,15 @@ package org.apache.lucene.index.codecs.pulsing;
import java.io.IOException;
+import org.apache.lucene.index.DocsAndPositionsEnum;
import org.apache.lucene.index.DocsEnum;
import org.apache.lucene.index.FieldInfo;
-import org.apache.lucene.index.DocsAndPositionsEnum;
-import org.apache.lucene.index.codecs.TermState;
+import org.apache.lucene.index.TermState;
import org.apache.lucene.index.codecs.PostingsReaderBase;
-import org.apache.lucene.index.codecs.pulsing.PulsingPostingsWriterImpl.Document;
-import org.apache.lucene.index.codecs.pulsing.PulsingPostingsWriterImpl.Position;
+import org.apache.lucene.index.codecs.BlockTermState;
+import org.apache.lucene.store.ByteArrayDataInput;
import org.apache.lucene.store.IndexInput;
+import org.apache.lucene.util.ArrayUtil;
import org.apache.lucene.util.Bits;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.CodecUtil;
@@ -43,7 +44,7 @@ public class PulsingPostingsReaderImpl extends PostingsReaderBase {
// Fallback reader for non-pulsed terms:
final PostingsReaderBase wrappedPostingsReader;
- int maxPulsingDocFreq;
+ int maxPositions;
public PulsingPostingsReaderImpl(PostingsReaderBase wrappedPostingsReader) throws IOException {
this.wrappedPostingsReader = wrappedPostingsReader;
@@ -53,145 +54,139 @@ public class PulsingPostingsReaderImpl extends PostingsReaderBase {
public void init(IndexInput termsIn) throws IOException {
CodecUtil.checkHeader(termsIn, PulsingPostingsWriterImpl.CODEC,
PulsingPostingsWriterImpl.VERSION_START, PulsingPostingsWriterImpl.VERSION_START);
- maxPulsingDocFreq = termsIn.readVInt();
+ maxPositions = termsIn.readVInt();
wrappedPostingsReader.init(termsIn);
}
- private static class PulsingTermState extends TermState {
- private Document docs[];
- private TermState wrappedTermState;
- private boolean pendingIndexTerm;
+ private static class PulsingTermState extends BlockTermState {
+ private byte[] postings;
+ private int postingsSize; // -1 if this term was not inlined
+ private BlockTermState wrappedTermState;
+ ByteArrayDataInput inlinedBytesReader;
+ private byte[] inlinedBytes;
+
+ @Override
public Object clone() {
PulsingTermState clone;
clone = (PulsingTermState) super.clone();
- clone.docs = docs.clone();
- for(int i=0;i>>1;
- if ((code & 1) != 0) {
- doc.numPositions = 1;
- } else {
- doc.numPositions = termsIn.readVInt();
- }
-
- if (doc.numPositions > doc.positions.length) {
- doc.reallocPositions(doc.numPositions);
- }
-
- int position = 0;
- int payloadLength = -1;
-
- for(int j=0;j>> 1;
- if ((code2 & 1) != 0) {
- payloadLength = termsIn.readVInt();
- }
-
- if (payloadLength > 0) {
- if (pos.payload == null) {
- pos.payload = new BytesRef();
- pos.payload.bytes = new byte[payloadLength];
- } else if (payloadLength > pos.payload.bytes.length) {
- pos.payload.grow(payloadLength);
- }
- pos.payload.length = payloadLength;
- termsIn.readBytes(pos.payload.bytes, 0, payloadLength);
- } else if (pos.payload != null) {
- pos.payload.length = 0;
- }
- } else {
- position += code2;
- }
- pos.pos = position;
- }
- }
- doc.docID = docID;
+ // Inlined into terms dict -- just read the byte[] blob in,
+ // but don't decode it now (we only decode when a DocsEnum
+ // or D&PEnum is pulled):
+ termState.postingsSize = termState.inlinedBytesReader.readVInt();
+ if (termState.postings == null || termState.postings.length < termState.postingsSize) {
+ termState.postings = new byte[ArrayUtil.oversize(termState.postingsSize, 1)];
}
+ // TODO: sort of silly to copy from one big byte[]
+ // (the blob holding all inlined terms' blobs for
+ // current term block) into another byte[] (just the
+ // blob for this term)...
+ termState.inlinedBytesReader.readBytes(termState.postings, 0, termState.postingsSize);
} else {
+ //System.out.println(" not inlined");
+ termState.postingsSize = -1;
+ // TODO: should we do full copyFrom? much heavier...?
termState.wrappedTermState.docFreq = termState.docFreq;
- wrappedPostingsReader.readTerm(termsIn, fieldInfo, termState.wrappedTermState, termState.pendingIndexTerm);
- termState.pendingIndexTerm = false;
+ termState.wrappedTermState.totalTermFreq = termState.totalTermFreq;
+ wrappedPostingsReader.nextTerm(fieldInfo, termState.wrappedTermState);
+ termState.wrappedTermState.termCount++;
}
}
// TODO: we could actually reuse, by having TL that
// holds the last wrapped reuse, and vice-versa
@Override
- public DocsEnum docs(FieldInfo field, TermState _termState, Bits skipDocs, DocsEnum reuse) throws IOException {
+ public DocsEnum docs(FieldInfo field, BlockTermState _termState, Bits skipDocs, DocsEnum reuse) throws IOException {
PulsingTermState termState = (PulsingTermState) _termState;
- if (termState.docFreq <= maxPulsingDocFreq) {
+ if (termState.postingsSize != -1) {
+ PulsingDocsEnum postings;
if (reuse instanceof PulsingDocsEnum) {
- return ((PulsingDocsEnum) reuse).reset(skipDocs, termState);
+ postings = (PulsingDocsEnum) reuse;
+ if (!postings.canReuse(field)) {
+ postings = new PulsingDocsEnum(field);
+ }
} else {
- PulsingDocsEnum docsEnum = new PulsingDocsEnum();
- return docsEnum.reset(skipDocs, termState);
+ postings = new PulsingDocsEnum(field);
}
+ return postings.reset(skipDocs, termState);
} else {
+ // TODO: not great that we lose reuse of PulsingDocsEnum in this case:
if (reuse instanceof PulsingDocsEnum) {
return wrappedPostingsReader.docs(field, termState.wrappedTermState, skipDocs, null);
} else {
@@ -202,15 +197,26 @@ public class PulsingPostingsReaderImpl extends PostingsReaderBase {
// TODO: -- not great that we can't always reuse
@Override
- public DocsAndPositionsEnum docsAndPositions(FieldInfo field, TermState _termState, Bits skipDocs, DocsAndPositionsEnum reuse) throws IOException {
- PulsingTermState termState = (PulsingTermState) _termState;
- if (termState.docFreq <= maxPulsingDocFreq) {
+ public DocsAndPositionsEnum docsAndPositions(FieldInfo field, BlockTermState _termState, Bits skipDocs, DocsAndPositionsEnum reuse) throws IOException {
+ if (field.omitTermFreqAndPositions) {
+ return null;
+ }
+ //System.out.println("D&P: field=" + field.name);
+
+ final PulsingTermState termState = (PulsingTermState) _termState;
+
+ if (termState.postingsSize != -1) {
+ PulsingDocsAndPositionsEnum postings;
if (reuse instanceof PulsingDocsAndPositionsEnum) {
- return ((PulsingDocsAndPositionsEnum) reuse).reset(skipDocs, termState);
+ postings = (PulsingDocsAndPositionsEnum) reuse;
+ if (!postings.canReuse(field)) {
+ postings = new PulsingDocsAndPositionsEnum(field);
+ }
} else {
- PulsingDocsAndPositionsEnum postingsEnum = new PulsingDocsAndPositionsEnum();
- return postingsEnum.reset(skipDocs, termState);
+ postings = new PulsingDocsAndPositionsEnum(field);
}
+
+ return postings.reset(skipDocs, termState);
} else {
if (reuse instanceof PulsingDocsAndPositionsEnum) {
return wrappedPostingsReader.docsAndPositions(field, termState.wrappedTermState, skipDocs, null);
@@ -220,63 +226,90 @@ public class PulsingPostingsReaderImpl extends PostingsReaderBase {
}
}
- static class PulsingDocsEnum extends DocsEnum {
- private int nextRead;
+ private static class PulsingDocsEnum extends DocsEnum {
+ private final ByteArrayDataInput postings = new ByteArrayDataInput(null);
+ private final boolean omitTF;
+ private final boolean storePayloads;
private Bits skipDocs;
- private Document doc;
- private PulsingTermState state;
+ private int docID;
+ private int freq;
- public void close() {}
+ public PulsingDocsEnum(FieldInfo fieldInfo) {
+ omitTF = fieldInfo.omitTermFreqAndPositions;
+ storePayloads = fieldInfo.storePayloads;
+ }
- PulsingDocsEnum reset(Bits skipDocs, PulsingTermState termState) {
- // TODO: -- not great we have to clone here --
- // merging is wasteful; TermRangeQuery too
- state = (PulsingTermState) termState.clone();
+ public PulsingDocsEnum reset(Bits skipDocs, PulsingTermState termState) {
+ //System.out.println("PR docsEnum termState=" + termState + " docFreq=" + termState.docFreq);
+ assert termState.postingsSize != -1;
+ final byte[] bytes = new byte[termState.postingsSize];
+ System.arraycopy(termState.postings, 0, bytes, 0, termState.postingsSize);
+ postings.reset(bytes);
+ docID = 0;
+ freq = 1;
this.skipDocs = skipDocs;
- nextRead = 0;
return this;
}
- @Override
- public int nextDoc() {
- while(true) {
- if (nextRead >= state.docFreq) {
- return NO_MORE_DOCS;
- } else {
- doc = state.docs[nextRead++];
- if (skipDocs == null || !skipDocs.get(doc.docID)) {
- return doc.docID;
- }
- }
- }
+ boolean canReuse(FieldInfo fieldInfo) {
+ return omitTF == fieldInfo.omitTermFreqAndPositions && storePayloads == fieldInfo.storePayloads;
}
@Override
- public int read() {
- int i=0;
- // TODO: -- ob1?
- initBulkResult();
- final int[] docs = bulkResult.docs.ints;
- final int[] freqs = bulkResult.freqs.ints;
- while(nextRead < state.docFreq) {
- doc = state.docs[nextRead++];
- if (skipDocs == null || !skipDocs.get(doc.docID)) {
- docs[i] = doc.docID;
- freqs[i] = doc.numPositions;
- i++;
+ public int nextDoc() throws IOException {
+ //System.out.println("PR nextDoc this= "+ this);
+ while(true) {
+ if (postings.eof()) {
+ //System.out.println("PR END");
+ return docID = NO_MORE_DOCS;
+ }
+
+ final int code = postings.readVInt();
+ if (omitTF) {
+ docID += code;
+ } else {
+ docID += code >>> 1; // shift off low bit
+ if ((code & 1) != 0) { // if low bit is set
+ freq = 1; // freq is one
+ } else {
+ freq = postings.readVInt(); // else read freq
+ }
+
+ // Skip positions
+ if (storePayloads) {
+ int payloadLength = -1;
+ for(int pos=0;pos>> 1; // shift off low bit
+ if ((code & 1) != 0) { // if low bit is set
+ freq = 1; // freq is one
} else {
- doc = state.docs[nextRead++];
- if (skipDocs == null || !skipDocs.get(doc.docID)) {
- nextPosRead = 0;
- return doc.docID;
- }
+ freq = postings.readVInt(); // else read freq
+ }
+ posPending = freq;
+
+ if (skipDocs == null || !skipDocs.get(docID)) {
+ //System.out.println(" return docID=" + docID + " freq=" + freq);
+ position = 0;
+ return docID;
}
}
}
@Override
public int freq() {
- return doc.numPositions;
+ return freq;
}
@Override
public int docID() {
- return doc.docID;
+ return docID;
}
@Override
@@ -347,26 +406,72 @@ public class PulsingPostingsReaderImpl extends PostingsReaderBase {
return doc;
}
}
- return NO_MORE_DOCS;
+ return docID = NO_MORE_DOCS;
}
@Override
- public int nextPosition() {
- assert nextPosRead < doc.numPositions;
- pos = doc.positions[nextPosRead++];
- payloadRetrieved = false;
- return pos.pos;
+ public int nextPosition() throws IOException {
+ //System.out.println("PR d&p nextPosition posPending=" + posPending + " vs freq=" + freq);
+
+ assert posPending > 0;
+ posPending--;
+
+ if (storePayloads) {
+ if (!payloadRetrieved) {
+ //System.out.println("PR skip payload=" + payloadLength);
+ postings.skipBytes(payloadLength);
+ }
+ final int code = postings.readVInt();
+ //System.out.println("PR code=" + code);
+ if ((code & 1) != 0) {
+ payloadLength = postings.readVInt();
+ //System.out.println("PR new payload len=" + payloadLength);
+ }
+ position += code >> 1;
+ payloadRetrieved = false;
+ } else {
+ position += postings.readVInt();
+ }
+
+ //System.out.println("PR d&p nextPos return pos=" + position + " this=" + this);
+ return position;
+ }
+
+ private void skipPositions() throws IOException {
+ while(posPending != 0) {
+ nextPosition();
+ }
+ if (storePayloads && !payloadRetrieved) {
+ //System.out.println(" skip payload len=" + payloadLength);
+ postings.skipBytes(payloadLength);
+ payloadRetrieved = true;
+ }
}
@Override
public boolean hasPayload() {
- return !payloadRetrieved && pos.payload != null && pos.payload.length > 0;
+ return storePayloads && !payloadRetrieved && payloadLength > 0;
}
@Override
- public BytesRef getPayload() {
+ public BytesRef getPayload() throws IOException {
+ //System.out.println("PR getPayload payloadLength=" + payloadLength + " this=" + this);
+ if (payloadRetrieved) {
+ throw new IOException("Either no payload exists at this term position or an attempt was made to load it more than once.");
+ }
payloadRetrieved = true;
- return pos.payload;
+ if (payloadLength > 0) {
+ if (payload == null) {
+ payload = new BytesRef(payloadLength);
+ } else {
+ payload.grow(payloadLength);
+ }
+ postings.readBytes(payload.bytes, 0, payloadLength);
+ payload.length = payloadLength;
+ return payload;
+ } else {
+ return null;
+ }
}
}
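The new PulsingDocsEnum above replays an inlined byte[] blob: each doc is a vInt code whose low bit, when set, means freq == 1, otherwise a vInt freq follows, and docIDs are delta-coded in the upper bits. A small decoder for just that doc/freq framing; positions and payloads, which the real blob interleaves after each doc, are left out of this sketch, and the vInt logic is written inline:

import java.util.ArrayList;
import java.util.List;

// Decodes the doc/freq framing used for inlined ("pulsed") postings:
// vInt code = (docDelta << 1) | (freq == 1 ? 1 : 0), followed by a vInt freq
// when the low bit is clear. Positions/payloads are omitted here.
public class PulsedDocFreqDecoderSketch {

  private final byte[] postings;
  private int pos;

  public PulsedDocFreqDecoderSketch(byte[] postings) {
    this.postings = postings;
  }

  private int readVInt() {
    byte b = postings[pos++];
    int i = b & 0x7F;
    for (int shift = 7; (b & 0x80) != 0; shift += 7) {
      b = postings[pos++];
      i |= (b & 0x7F) << shift;
    }
    return i;
  }

  // Returns a list of {docID, freq} pairs.
  public List<int[]> decode() {
    List<int[]> docFreqs = new ArrayList<>();
    int docID = 0;
    while (pos < postings.length) {
      final int code = readVInt();
      docID += code >>> 1;                       // shift off the freq==1 bit
      final int freq = ((code & 1) != 0) ? 1 : readVInt();
      docFreqs.add(new int[] { docID, freq });
    }
    return docFreqs;
  }

  public static void main(String[] args) {
    // docs 3 (freq 1) and 7 (freq 2): codes are (3<<1)|1 = 7, then (4<<1) = 8 followed by freq 2
    byte[] blob = new byte[] { 7, 8, 2 };
    for (int[] df : new PulsedDocFreqDecoderSketch(blob).decode()) {
      System.out.println("doc=" + df[0] + " freq=" + df[1]);
    }
    // doc=3 freq=1
    // doc=7 freq=2
  }
}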
diff --git a/lucene/src/java/org/apache/lucene/index/codecs/pulsing/PulsingPostingsWriterImpl.java b/lucene/src/java/org/apache/lucene/index/codecs/pulsing/PulsingPostingsWriterImpl.java
index f18637d29ad..35b2a3d0278 100644
--- a/lucene/src/java/org/apache/lucene/index/codecs/pulsing/PulsingPostingsWriterImpl.java
+++ b/lucene/src/java/org/apache/lucene/index/codecs/pulsing/PulsingPostingsWriterImpl.java
@@ -20,17 +20,17 @@ package org.apache.lucene.index.codecs.pulsing;
import java.io.IOException;
import org.apache.lucene.index.FieldInfo;
-import org.apache.lucene.util.CodecUtil;
import org.apache.lucene.index.codecs.PostingsWriterBase;
+import org.apache.lucene.index.codecs.TermStats;
import org.apache.lucene.store.IndexOutput;
-import org.apache.lucene.util.ArrayUtil;
+import org.apache.lucene.store.RAMOutputStream;
import org.apache.lucene.util.BytesRef;
-import org.apache.lucene.util.RamUsageEstimator;
+import org.apache.lucene.util.CodecUtil;
-// TODO: we now pulse entirely according to docFreq of the
-// term; it might be better to eg pulse by "net bytes used"
-// so that a term that has only 1 doc but zillions of
-// positions would not be inlined. Though this is
+// TODO: we now inline based on total TF of the term,
+// but it might be better to inline by "net bytes used"
+// so that a term that has only 1 posting but a huge
+// payload would not be inlined. Though this is
// presumably rare in practice...
/** @lucene.experimental */
@@ -44,86 +44,42 @@ public final class PulsingPostingsWriterImpl extends PostingsWriterBase {
final static int VERSION_CURRENT = VERSION_START;
- IndexOutput termsOut;
+ private IndexOutput termsOut;
- boolean omitTF;
- boolean storePayloads;
+ private boolean omitTF;
+ private boolean storePayloads;
- // Starts a new term
- FieldInfo fieldInfo;
+ // one entry per position
+ private final Position[] pending;
+ private int pendingCount = 0; // -1 once we've hit too many positions
+ private Position currentDoc; // first Position entry of current doc
- /** @lucene.experimental */
- public static class Document {
- int docID;
- int termDocFreq;
- int numPositions;
- Position[] positions;
- Document() {
- positions = new Position[1];
- positions[0] = new Position();
- }
-
- @Override
- public Object clone() {
- Document doc = new Document();
- doc.docID = docID;
- doc.termDocFreq = termDocFreq;
- doc.numPositions = numPositions;
- doc.positions = new Position[positions.length];
- for(int i = 0; i < positions.length; i++) {
- doc.positions[i] = (Position) positions[i].clone();
- }
-
- return doc;
- }
-
- void reallocPositions(int minSize) {
- final Position[] newArray = new Position[ArrayUtil.oversize(minSize, RamUsageEstimator.NUM_BYTES_OBJECT_REF)];
- System.arraycopy(positions, 0, newArray, 0, positions.length);
- for(int i=positions.length;i maxPulsingDocFreq docs
-
- static class Position {
+ private static final class Position {
BytesRef payload;
+ int termFreq; // only incremented on first position for a given doc
int pos;
-
- @Override
- public Object clone() {
- Position position = new Position();
- position.pos = pos;
- if (payload != null) {
- position.payload = new BytesRef(payload);
- }
- return position;
- }
+ int docID;
}
// TODO: -- lazy init this? ie, if every single term
- // was pulsed then we never need to use this fallback?
- // Fallback writer for non-pulsed terms:
+ // was inlined (eg for a "primary key" field) then we
+ // never need to use this fallback? Fallback writer for
+ // non-inlined terms:
final PostingsWriterBase wrappedPostingsWriter;
- /** If docFreq <= maxPulsingDocFreq, its postings are
+ /** If the total number of positions (summed across all docs
+ * for this term) is <= maxPositions, then the postings are
* inlined into terms dict */
- public PulsingPostingsWriterImpl(int maxPulsingDocFreq, PostingsWriterBase wrappedPostingsWriter) throws IOException {
+ public PulsingPostingsWriterImpl(int maxPositions, PostingsWriterBase wrappedPostingsWriter) throws IOException {
super();
- pendingDocs = new Document[maxPulsingDocFreq];
-    for(int i=0;i<maxPulsingDocFreq;i++) {
-      pendingDocs[i] = new Document();
-    }
this.wrappedPostingsWriter = wrappedPostingsWriter;
}
@@ -131,14 +87,14 @@ public final class PulsingPostingsWriterImpl extends PostingsWriterBase {
public void start(IndexOutput termsOut) throws IOException {
this.termsOut = termsOut;
CodecUtil.writeHeader(termsOut, CODEC, VERSION_CURRENT);
- termsOut.writeVInt(pendingDocs.length);
+ termsOut.writeVInt(pending.length); // encode maxPositions in header
wrappedPostingsWriter.start(termsOut);
}
@Override
public void startTerm() {
- assert pendingDocCount == 0;
- pulsed = false;
+ //System.out.println("PW startTerm");
+ assert pendingCount == 0;
}
// TODO: -- should we NOT reuse across fields? would
@@ -148,73 +104,56 @@ public final class PulsingPostingsWriterImpl extends PostingsWriterBase {
// our parent calls setField whenever the field changes
@Override
public void setField(FieldInfo fieldInfo) {
- this.fieldInfo = fieldInfo;
omitTF = fieldInfo.omitTermFreqAndPositions;
+ //System.out.println("PW field=" + fieldInfo.name + " omitTF=" + omitTF);
storePayloads = fieldInfo.storePayloads;
wrappedPostingsWriter.setField(fieldInfo);
}
@Override
public void startDoc(int docID, int termDocFreq) throws IOException {
-
assert docID >= 0: "got docID=" + docID;
-
- if (!pulsed && pendingDocCount == pendingDocs.length) {
-
- // OK we just crossed the threshold, this term should
- // now be written with our wrapped codec:
- wrappedPostingsWriter.startTerm();
-
- // Flush all buffered docs
- for(int i=0;i currentDoc.positions.length) {
- currentDoc.reallocPositions(termDocFreq);
- }
- currentDoc.numPositions = 0;
}
}
@Override
public void addPosition(int position, BytesRef payload) throws IOException {
- if (pulsed) {
+
+ //System.out.println("PW pos=" + position + " payload=" + (payload == null ? "null" : payload.length + " bytes"));
+ if (pendingCount == pending.length) {
+ push();
+ }
+
+ if (pendingCount == -1) {
+ // We've already seen too many docs for this term --
+ // just forward to our fallback writer
wrappedPostingsWriter.addPosition(position, payload);
} else {
- // just buffer up
- Position pos = currentDoc.positions[currentDoc.numPositions++];
+ // buffer up
+ final Position pos = pending[pendingCount++];
pos.pos = position;
+ pos.docID = currentDoc.docID;
if (payload != null && payload.length > 0) {
if (pos.payload == null) {
pos.payload = new BytesRef(payload);
@@ -229,86 +168,146 @@ public final class PulsingPostingsWriterImpl extends PostingsWriterBase {
@Override
public void finishDoc() throws IOException {
- assert omitTF || currentDoc.numPositions == currentDoc.termDocFreq;
- if (pulsed) {
+ //System.out.println("PW finishDoc");
+ if (pendingCount == -1) {
wrappedPostingsWriter.finishDoc();
}
}
- boolean pendingIsIndexTerm;
-
- int pulsedCount;
- int nonPulsedCount;
+ private final RAMOutputStream buffer = new RAMOutputStream();
+ private final RAMOutputStream buffer2 = new RAMOutputStream();
/** Called when we are done adding docs to this term */
@Override
- public void finishTerm(int docCount, boolean isIndexTerm) throws IOException {
+ public void finishTerm(TermStats stats) throws IOException {
+ //System.out.println("PW finishTerm docCount=" + stats.docFreq);
- assert docCount > 0;
+ assert pendingCount > 0 || pendingCount == -1;
- pendingIsIndexTerm |= isIndexTerm;
-
- if (pulsed) {
- wrappedPostingsWriter.finishTerm(docCount, pendingIsIndexTerm);
- pendingIsIndexTerm = false;
- pulsedCount++;
+ if (pendingCount == -1) {
+ wrappedPostingsWriter.finishTerm(stats);
} else {
- nonPulsedCount++;
- // OK, there were few enough occurrences for this
+
+ // There were few enough total occurrences for this
// term, so we fully inline our postings data into
// terms dict, now:
- int lastDocID = 0;
- for(int i=0;i
+
+
+
+
+
+
+Pulsing Codec: inlines low frequency terms' postings into terms dictionary.
+
+
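On the write side, the rewritten pulsing writer buffers positions into a fixed pending[] array; once a term accumulates more than maxPositions entries it pushes them to the wrapped postings writer and stops inlining (pendingCount == -1), otherwise finishTerm serializes the buffered postings straight into the terms dictionary. A schematic of that cutoff decision, with strings standing in for the real encoders (an assumed simplification, not the actual byte format):

import java.util.ArrayList;
import java.util.List;

// Schematic of the pulsing cutoff: buffer up to maxPositions postings per
// term; spill to a fallback writer once the buffer overflows, otherwise
// inline the buffered postings when the term is finished.
public class PulsingCutoffSketch {
  private final int maxPositions;
  private final List<String> pending = new ArrayList<>();
  private boolean spilled;                                       // plays the role of pendingCount == -1
  private final List<String> wrappedWriter = new ArrayList<>();  // stand-in fallback writer
  private final List<String> termsDictInline = new ArrayList<>();// stand-in terms dict

  public PulsingCutoffSketch(int maxPositions) {
    this.maxPositions = maxPositions;
  }

  public void addPosition(String posting) {
    if (spilled) {
      wrappedWriter.add(posting);               // already over the cutoff: forward directly
    } else if (pending.size() == maxPositions) {
      wrappedWriter.addAll(pending);            // push(): replay the buffer, then forward
      wrappedWriter.add(posting);
      pending.clear();
      spilled = true;
    } else {
      pending.add(posting);                     // still small enough to inline
    }
  }

  public String finishTerm() {
    if (spilled) {
      String where = "fallback(" + wrappedWriter.size() + " postings)";
      wrappedWriter.clear();
      spilled = false;
      return where;
    }
    String where = "inlined(" + pending.size() + " postings)";
    termsDictInline.addAll(pending);
    pending.clear();
    return where;
  }

  public static void main(String[] args) {
    PulsingCutoffSketch w = new PulsingCutoffSketch(2);
    w.addPosition("d1:p0");
    System.out.println(w.finishTerm());  // inlined(1 postings)
    w.addPosition("d1:p0");
    w.addPosition("d1:p1");
    w.addPosition("d2:p0");
    System.out.println(w.finishTerm());  // fallback(3 postings)
  }
}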
diff --git a/lucene/src/java/org/apache/lucene/index/codecs/sep/IntIndexInput.java b/lucene/src/java/org/apache/lucene/index/codecs/sep/IntIndexInput.java
index 2ab0f46a391..631476df0ba 100644
--- a/lucene/src/java/org/apache/lucene/index/codecs/sep/IntIndexInput.java
+++ b/lucene/src/java/org/apache/lucene/index/codecs/sep/IntIndexInput.java
@@ -17,11 +17,11 @@ package org.apache.lucene.index.codecs.sep;
* limitations under the License.
*/
-import org.apache.lucene.store.IndexInput;
-import org.apache.lucene.util.IntsRef;
-
-import java.io.IOException;
import java.io.Closeable;
+import java.io.IOException;
+
+import org.apache.lucene.store.DataInput;
+import org.apache.lucene.util.IntsRef;
/** Defines basic API for writing ints to an IndexOutput.
* IntBlockCodec interacts with this API. @see
@@ -39,7 +39,7 @@ public abstract class IntIndexInput implements Closeable {
// TODO: -- can we simplify this?
public abstract static class Index {
- public abstract void read(IndexInput indexIn, boolean absolute) throws IOException;
+ public abstract void read(DataInput indexIn, boolean absolute) throws IOException;
public abstract void read(IntIndexInput.Reader indexIn, boolean absolute) throws IOException;
@@ -48,6 +48,7 @@ public abstract class IntIndexInput implements Closeable {
public abstract void set(Index other);
+ @Override
public abstract Object clone();
}
diff --git a/lucene/src/java/org/apache/lucene/index/codecs/sep/SepPostingsReaderImpl.java b/lucene/src/java/org/apache/lucene/index/codecs/sep/SepPostingsReaderImpl.java
index 342d2fa8bcd..b693db361c9 100644
--- a/lucene/src/java/org/apache/lucene/index/codecs/sep/SepPostingsReaderImpl.java
+++ b/lucene/src/java/org/apache/lucene/index/codecs/sep/SepPostingsReaderImpl.java
@@ -20,15 +20,18 @@ package org.apache.lucene.index.codecs.sep;
import java.io.IOException;
import java.util.Collection;
-import org.apache.lucene.index.DocsEnum;
import org.apache.lucene.index.DocsAndPositionsEnum;
+import org.apache.lucene.index.DocsEnum;
import org.apache.lucene.index.FieldInfo;
import org.apache.lucene.index.IndexFileNames;
import org.apache.lucene.index.SegmentInfo;
+import org.apache.lucene.index.TermState;
+import org.apache.lucene.index.codecs.BlockTermState;
import org.apache.lucene.index.codecs.PostingsReaderBase;
-import org.apache.lucene.index.codecs.TermState;
+import org.apache.lucene.store.ByteArrayDataInput;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.IndexInput;
+import org.apache.lucene.util.ArrayUtil;
import org.apache.lucene.util.Bits;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.CodecUtil;
@@ -129,44 +132,120 @@ public class SepPostingsReaderImpl extends PostingsReaderBase {
}
}
- private static class SepTermState extends TermState {
+ private static final class SepTermState extends BlockTermState {
// We store only the seek point to the docs file because
// the rest of the info (freqIndex, posIndex, etc.) is
// stored in the docs file:
IntIndexInput.Index docIndex;
+ IntIndexInput.Index posIndex;
+ IntIndexInput.Index freqIndex;
+ long payloadFP;
+ long skipFP;
+ // Only used for "primary" term state; these are never
+ // copied on clone:
+ byte[] bytes;
+ ByteArrayDataInput bytesReader;
+
+ @Override
public Object clone() {
SepTermState other = (SepTermState) super.clone();
other.docIndex = (IntIndexInput.Index) docIndex.clone();
+ if (freqIndex != null) {
+ other.freqIndex = (IntIndexInput.Index) freqIndex.clone();
+ }
+ if (posIndex != null) {
+ other.posIndex = (IntIndexInput.Index) posIndex.clone();
+ }
return other;
}
- public void copy(TermState _other) {
- super.copy(_other);
+ @Override
+ public void copyFrom(TermState _other) {
+ super.copyFrom(_other);
SepTermState other = (SepTermState) _other;
docIndex.set(other.docIndex);
+ if (freqIndex != null && other.freqIndex != null) {
+ freqIndex.set(other.freqIndex);
+ }
+ if (posIndex != null && other.posIndex != null) {
+ posIndex.set(other.posIndex);
+ }
+ payloadFP = other.payloadFP;
+ skipFP = other.skipFP;
}
@Override
public String toString() {
- return "tis.fp=" + filePointer + " docFreq=" + docFreq + " ord=" + ord + " docIndex=" + docIndex;
+ return super.toString() + " docIndex=" + docIndex + " freqIndex=" + freqIndex + " posIndex=" + posIndex + " payloadFP=" + payloadFP + " skipFP=" + skipFP;
}
}
@Override
- public TermState newTermState() throws IOException {
- final SepTermState state = new SepTermState();
+ public BlockTermState newTermState() throws IOException {
+ final SepTermState state = new SepTermState();
state.docIndex = docIn.index();
+ if (freqIn != null) {
+ state.freqIndex = freqIn.index();
+ }
+ if (posIn != null) {
+ state.posIndex = posIn.index();
+ }
return state;
}
@Override
- public void readTerm(IndexInput termsIn, FieldInfo fieldInfo, TermState termState, boolean isIndexTerm) throws IOException {
- ((SepTermState) termState).docIndex.read(termsIn, isIndexTerm);
+ public void readTermsBlock(IndexInput termsIn, FieldInfo fieldInfo, BlockTermState _termState) throws IOException {
+ final SepTermState termState = (SepTermState) _termState;
+ final int len = termsIn.readVInt();
+ //System.out.println("SepR.readTermsBlock len=" + len);
+ if (termState.bytes == null) {
+ termState.bytes = new byte[ArrayUtil.oversize(len, 1)];
+ termState.bytesReader = new ByteArrayDataInput(termState.bytes);
+ } else if (termState.bytes.length < len) {
+ termState.bytes = new byte[ArrayUtil.oversize(len, 1)];
+ }
+ termState.bytesReader.reset(termState.bytes, 0, len);
+ termsIn.readBytes(termState.bytes, 0, len);
}
@Override
- public DocsEnum docs(FieldInfo fieldInfo, TermState _termState, Bits skipDocs, DocsEnum reuse) throws IOException {
+ public void nextTerm(FieldInfo fieldInfo, BlockTermState _termState) throws IOException {
+ final SepTermState termState = (SepTermState) _termState;
+ //System.out.println("SepR.nextTerm termCount=" + termState.termCount);
+ //System.out.println(" docFreq=" + termState.docFreq);
+ final boolean isFirstTerm = termState.termCount == 0;
+ termState.docIndex.read(termState.bytesReader, isFirstTerm);
+ //System.out.println(" docIndex=" + termState.docIndex);
+ if (!fieldInfo.omitTermFreqAndPositions) {
+ termState.freqIndex.read(termState.bytesReader, isFirstTerm);
+ //System.out.println(" freqIndex=" + termState.freqIndex);
+ termState.posIndex.read(termState.bytesReader, isFirstTerm);
+ //System.out.println(" posIndex=" + termState.posIndex);
+ if (fieldInfo.storePayloads) {
+ if (isFirstTerm) {
+ termState.payloadFP = termState.bytesReader.readVLong();
+ } else {
+ termState.payloadFP += termState.bytesReader.readVLong();
+ }
+ //System.out.println(" payloadFP=" + termState.payloadFP);
+ }
+ }
+ if (termState.docFreq >= skipInterval) {
+ //System.out.println(" readSkip @ " + termState.bytesReader.pos);
+ if (isFirstTerm) {
+ termState.skipFP = termState.bytesReader.readVLong();
+ } else {
+ termState.skipFP += termState.bytesReader.readVLong();
+ }
+ //System.out.println(" skipFP=" + termState.skipFP);
+ } else if (isFirstTerm) {
+ termState.skipFP = termState.bytesReader.readVLong();
+ }
+ }
+
+ @Override
+ public DocsEnum docs(FieldInfo fieldInfo, BlockTermState _termState, Bits skipDocs, DocsEnum reuse) throws IOException {
final SepTermState termState = (SepTermState) _termState;
SepDocsEnum docsEnum;
if (reuse == null || !(reuse instanceof SepDocsEnum)) {
@@ -185,7 +264,7 @@ public class SepPostingsReaderImpl extends PostingsReaderBase {
}
@Override
- public DocsAndPositionsEnum docsAndPositions(FieldInfo fieldInfo, TermState _termState, Bits skipDocs, DocsAndPositionsEnum reuse) throws IOException {
+ public DocsAndPositionsEnum docsAndPositions(FieldInfo fieldInfo, BlockTermState _termState, Bits skipDocs, DocsAndPositionsEnum reuse) throws IOException {
assert !fieldInfo.omitTermFreqAndPositions;
final SepTermState termState = (SepTermState) _termState;
SepDocsAndPositionsEnum postingsEnum;
@@ -217,7 +296,7 @@ public class SepPostingsReaderImpl extends PostingsReaderBase {
private Bits skipDocs;
private final IntIndexInput.Reader docReader;
private final IntIndexInput.Reader freqReader;
- private long skipOffset;
+ private long skipFP;
private final IntIndexInput.Index docIndex;
private final IntIndexInput.Index freqIndex;
@@ -258,18 +337,15 @@ public class SepPostingsReaderImpl extends PostingsReaderBase {
docIndex.seek(docReader);
if (!omitTF) {
- freqIndex.read(docReader, true);
+ freqIndex.set(termState.freqIndex);
freqIndex.seek(freqReader);
-
- posIndex.read(docReader, true);
- // skip payload offset
- docReader.readVLong();
} else {
freq = 1;
}
- skipOffset = docReader.readVLong();
docFreq = termState.docFreq;
+ // NOTE: unused if docFreq < skipInterval:
+ skipFP = termState.skipFP;
count = 0;
doc = 0;
skipped = false;
@@ -288,9 +364,11 @@ public class SepPostingsReaderImpl extends PostingsReaderBase {
count++;
// Decode next doc
+ //System.out.println("decode docDelta:");
doc += docReader.next();
if (!omitTF) {
+ //System.out.println("decode freq:");
freq = freqReader.next();
}
@@ -298,13 +376,13 @@ public class SepPostingsReaderImpl extends PostingsReaderBase {
break;
}
}
-
return doc;
}
@Override
public int read() throws IOException {
// TODO: -- switch to bulk read api in IntIndexInput
+ //System.out.println("sepdocs read");
final int[] docs = bulkResult.docs.ints;
final int[] freqs = bulkResult.freqs.ints;
int i = 0;
@@ -312,14 +390,17 @@ public class SepPostingsReaderImpl extends PostingsReaderBase {
while (i < length && count < docFreq) {
count++;
// manually inlined call to next() for speed
+ //System.out.println("decode doc");
doc += docReader.next();
if (!omitTF) {
+ //System.out.println("decode freq");
freq = freqReader.next();
}
if (skipDocs == null || !skipDocs.get(doc)) {
docs[i] = doc;
freqs[i] = freq;
+ //System.out.println(" docs[" + i + "]=" + doc + " count=" + count + " dF=" + docFreq);
i++;
}
}
@@ -359,7 +440,7 @@ public class SepPostingsReaderImpl extends PostingsReaderBase {
if (!skipped) {
// We haven't yet skipped for this posting
- skipper.init(skipOffset,
+ skipper.init(skipFP,
docIndex,
freqIndex,
posIndex,
@@ -409,14 +490,14 @@ public class SepPostingsReaderImpl extends PostingsReaderBase {
private final IntIndexInput.Reader freqReader;
private final IntIndexInput.Reader posReader;
private final IndexInput payloadIn;
- private long skipOffset;
+ private long skipFP;
private final IntIndexInput.Index docIndex;
private final IntIndexInput.Index freqIndex;
private final IntIndexInput.Index posIndex;
private final IntIndexInput startDocIn;
- private long payloadOffset;
+ private long payloadFP;
private int pendingPosCount;
private int position;
@@ -442,21 +523,26 @@ public class SepPostingsReaderImpl extends PostingsReaderBase {
SepDocsAndPositionsEnum init(FieldInfo fieldInfo, SepTermState termState, Bits skipDocs) throws IOException {
this.skipDocs = skipDocs;
storePayloads = fieldInfo.storePayloads;
+ //System.out.println("Sep D&P init");
// TODO: can't we only do this if consumer
// skipped consuming the previous docs?
docIndex.set(termState.docIndex);
docIndex.seek(docReader);
+ //System.out.println(" docIndex=" + docIndex);
- freqIndex.read(docReader, true);
+ freqIndex.set(termState.freqIndex);
freqIndex.seek(freqReader);
+ //System.out.println(" freqIndex=" + freqIndex);
- posIndex.read(docReader, true);
+ posIndex.set(termState.posIndex);
+ //System.out.println(" posIndex=" + posIndex);
posSeekPending = true;
payloadPending = false;
- payloadOffset = docReader.readVLong();
- skipOffset = docReader.readVLong();
+ payloadFP = termState.payloadFP;
+ skipFP = termState.skipFP;
+ //System.out.println(" skipFP=" + skipFP);
docFreq = termState.docFreq;
count = 0;
@@ -482,8 +568,10 @@ public class SepPostingsReaderImpl extends PostingsReaderBase {
// freq=1 case?
// Decode next doc
+ //System.out.println(" sep d&p read doc");
doc += docReader.next();
-
+
+ //System.out.println(" sep d&p read freq");
freq = freqReader.next();
pendingPosCount += freq;
@@ -509,6 +597,7 @@ public class SepPostingsReaderImpl extends PostingsReaderBase {
@Override
public int advance(int target) throws IOException {
+ //System.out.println("SepD&P advance target=" + target + " vs current=" + doc + " this=" + this);
// TODO: jump right to next() if target is < X away
// from where we are now?
@@ -519,6 +608,7 @@ public class SepPostingsReaderImpl extends PostingsReaderBase {
// skip data
if (skipper == null) {
+ //System.out.println(" create skipper");
// This DocsEnum has never done any skipping
skipper = new SepSkipListReader((IndexInput) skipIn.clone(),
freqIn,
@@ -528,46 +618,54 @@ public class SepPostingsReaderImpl extends PostingsReaderBase {
}
if (!skipped) {
+ //System.out.println(" init skip data skipFP=" + skipFP);
// We haven't yet skipped for this posting
- skipper.init(skipOffset,
+ skipper.init(skipFP,
docIndex,
freqIndex,
posIndex,
- payloadOffset,
+ payloadFP,
docFreq,
storePayloads);
skipped = true;
}
-
final int newCount = skipper.skipTo(target);
+ //System.out.println(" skip newCount=" + newCount + " vs " + count);
if (newCount > count) {
// Skipper did move
skipper.getFreqIndex().seek(freqReader);
skipper.getDocIndex().seek(docReader);
- //skipper.getPosIndex().seek(posReader);
+ // NOTE: don't seek pos here; do it lazily
+ // instead. Eg a PhraseQuery may skip to many
+ // docs before finally asking for positions...
posIndex.set(skipper.getPosIndex());
posSeekPending = true;
count = newCount;
doc = skipper.getDoc();
+ //System.out.println(" moved to doc=" + doc);
//payloadIn.seek(skipper.getPayloadPointer());
- payloadOffset = skipper.getPayloadPointer();
+ payloadFP = skipper.getPayloadPointer();
pendingPosCount = 0;
pendingPayloadBytes = 0;
payloadPending = false;
payloadLength = skipper.getPayloadLength();
+ //System.out.println(" move payloadLen=" + payloadLength);
}
}
// Now, linear scan for the rest:
do {
if (nextDoc() == NO_MORE_DOCS) {
+ //System.out.println(" advance nextDoc=END");
return NO_MORE_DOCS;
}
+ //System.out.println(" advance nextDoc=" + doc);
} while (target > doc);
+ //System.out.println(" return doc=" + doc);
return doc;
}
@@ -575,7 +673,7 @@ public class SepPostingsReaderImpl extends PostingsReaderBase {
public int nextPosition() throws IOException {
if (posSeekPending) {
posIndex.seek(posReader);
- payloadIn.seek(payloadOffset);
+ payloadIn.seek(payloadFP);
posSeekPending = false;
}
diff --git a/lucene/src/java/org/apache/lucene/index/codecs/sep/SepPostingsWriterImpl.java b/lucene/src/java/org/apache/lucene/index/codecs/sep/SepPostingsWriterImpl.java
index 6be97d22f9d..9e9b9966808 100644
--- a/lucene/src/java/org/apache/lucene/index/codecs/sep/SepPostingsWriterImpl.java
+++ b/lucene/src/java/org/apache/lucene/index/codecs/sep/SepPostingsWriterImpl.java
@@ -25,7 +25,9 @@ import org.apache.lucene.index.FieldInfo;
import org.apache.lucene.index.IndexFileNames;
import org.apache.lucene.index.SegmentWriteState;
import org.apache.lucene.index.codecs.PostingsWriterBase;
+import org.apache.lucene.index.codecs.TermStats;
import org.apache.lucene.store.IndexOutput;
+import org.apache.lucene.store.RAMOutputStream;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.CodecUtil;
@@ -68,8 +70,7 @@ public final class SepPostingsWriterImpl extends PostingsWriterBase {
boolean storePayloads;
boolean omitTF;
- // Starts a new term
- long lastSkipStart;
+ long lastSkipFP;
FieldInfo fieldInfo;
@@ -79,7 +80,10 @@ public final class SepPostingsWriterImpl extends PostingsWriterBase {
long lastPayloadStart;
int lastDocID;
int df;
- private boolean firstDoc;
+ private int pendingTermCount;
+
+ // Holds pending byte[] blob for the current terms block
+ private final RAMOutputStream indexBytesWriter = new RAMOutputStream();
public SepPostingsWriterImpl(SegmentWriteState state, IntStreamFactory factory) throws IOException {
super();
@@ -143,13 +147,9 @@ public final class SepPostingsWriterImpl extends PostingsWriterBase {
payloadStart = payloadOut.getFilePointer();
lastPayloadLength = -1;
}
- firstDoc = true;
skipListWriter.resetSkip(docIndex, freqIndex, posIndex);
}
- // TODO: -- should we NOT reuse across fields? would
- // be cleaner
-
// Currently, this instance is re-used across fields, so
// our parent calls setField whenever the field changes
@Override
@@ -160,27 +160,13 @@ public final class SepPostingsWriterImpl extends PostingsWriterBase {
storePayloads = !omitTF && fieldInfo.storePayloads;
}
-
/** Adds a new doc in this term. If this returns null
* then we just skip consuming positions/payloads. */
@Override
public void startDoc(int docID, int termDocFreq) throws IOException {
- if (firstDoc) {
- // TODO: we are writing absolute file pointers below,
- // which is wasteful. It'd be better compression to
- // write the "baseline" into each indexed term, then
- // write only the delta here.
- if (!omitTF) {
- freqIndex.write(docOut, true);
- posIndex.write(docOut, true);
- docOut.writeVLong(payloadStart);
- }
- docOut.writeVLong(skipOut.getFilePointer());
- firstDoc = false;
- }
-
final int delta = docID - lastDocID;
+ //System.out.println("SepW startDoc: write doc=" + docID + " delta=" + delta);
if (docID < 0 || (df > 0 && delta <= 0)) {
throw new CorruptIndexException("docs out of order (" + docID + " <= " + lastDocID + " )");
@@ -189,6 +175,7 @@ public final class SepPostingsWriterImpl extends PostingsWriterBase {
if ((++df % skipInterval) == 0) {
// TODO: -- awkward we have to make these two
// separate calls to skipper
+ //System.out.println(" buffer skip lastDocID=" + lastDocID);
skipListWriter.setSkipData(lastDocID, storePayloads, lastPayloadLength);
skipListWriter.bufferSkip(df);
}
@@ -196,10 +183,20 @@ public final class SepPostingsWriterImpl extends PostingsWriterBase {
lastDocID = docID;
docOut.write(delta);
if (!omitTF) {
+ //System.out.println(" sepw startDoc: write freq=" + termDocFreq);
freqOut.write(termDocFreq);
}
}
+ @Override
+ public void flushTermsBlock() throws IOException {
+ //System.out.println("SepW.flushTermsBlock: pendingTermCount=" + pendingTermCount + " bytesUsed=" + indexBytesWriter.getFilePointer());
+ termsOut.writeVLong((int) indexBytesWriter.getFilePointer());
+ indexBytesWriter.writeTo(termsOut);
+ indexBytesWriter.reset();
+ pendingTermCount = 0;
+ }
+
/** Add a new position & payload */
@Override
public void addPosition(int position, BytesRef payload) throws IOException {
@@ -239,20 +236,57 @@ public final class SepPostingsWriterImpl extends PostingsWriterBase {
/** Called when we are done adding docs to this term */
@Override
- public void finishTerm(int docCount, boolean isIndexTerm) throws IOException {
-
+ public void finishTerm(TermStats stats) throws IOException {
// TODO: -- wasteful we are counting this in two places?
- assert docCount > 0;
- assert docCount == df;
+ assert stats.docFreq > 0;
+ assert stats.docFreq == df;
- docIndex.write(termsOut, isIndexTerm);
+ final boolean isFirstTerm = pendingTermCount == 0;
+ //System.out.println("SepW.finishTerm: isFirstTerm=" + isFirstTerm);
+
+ docIndex.write(indexBytesWriter, isFirstTerm);
+ //System.out.println(" docIndex=" + docIndex);
+
+ if (!omitTF) {
+ freqIndex.write(indexBytesWriter, isFirstTerm);
+ //System.out.println(" freqIndex=" + freqIndex);
+
+ posIndex.write(indexBytesWriter, isFirstTerm);
+ //System.out.println(" posIndex=" + posIndex);
+ if (storePayloads) {
+ if (isFirstTerm) {
+ indexBytesWriter.writeVLong(payloadStart);
+ } else {
+ indexBytesWriter.writeVLong(payloadStart - lastPayloadStart);
+ }
+ lastPayloadStart = payloadStart;
+ //System.out.println(" payloadFP=" + payloadStart);
+ }
+ }
if (df >= skipInterval) {
+ //System.out.println(" skipFP=" + skipStart);
+ final long skipFP = skipOut.getFilePointer();
skipListWriter.writeSkip(skipOut);
+ //System.out.println(" writeSkip @ " + indexBytesWriter.getFilePointer());
+ if (isFirstTerm) {
+ indexBytesWriter.writeVLong(skipFP);
+ } else {
+ indexBytesWriter.writeVLong(skipFP - lastSkipFP);
+ }
+ lastSkipFP = skipFP;
+ } else if (isFirstTerm) {
+ // TODO: this is somewhat wasteful; eg if no terms in
+ // this block will use skip data, we don't need to
+ // write this:
+ final long skipFP = skipOut.getFilePointer();
+ indexBytesWriter.writeVLong(skipFP);
+ lastSkipFP = skipFP;
}
lastDocID = 0;
df = 0;
+ pendingTermCount++;
}
@Override
diff --git a/lucene/src/java/org/apache/lucene/index/codecs/sep/package.html b/lucene/src/java/org/apache/lucene/index/codecs/sep/package.html
new file mode 100644
index 00000000000..b51d9102715
--- /dev/null
+++ b/lucene/src/java/org/apache/lucene/index/codecs/sep/package.html
@@ -0,0 +1,25 @@
+<html>
+<body>
+Sep: base support for separate files (doc,frq,pos,skp,pyl)
+</body>
+</html>
diff --git a/lucene/src/java/org/apache/lucene/index/codecs/simpletext/SimpleTextFieldsReader.java b/lucene/src/java/org/apache/lucene/index/codecs/simpletext/SimpleTextFieldsReader.java
index 890b1de029e..8e3427704b4 100644
--- a/lucene/src/java/org/apache/lucene/index/codecs/simpletext/SimpleTextFieldsReader.java
+++ b/lucene/src/java/org/apache/lucene/index/codecs/simpletext/SimpleTextFieldsReader.java
@@ -125,28 +125,32 @@ class SimpleTextFieldsReader extends FieldsProducer {
private final IndexInput in;
private final boolean omitTF;
private int docFreq;
+ private long totalTermFreq;
private long docsStart;
private boolean ended;
- private final BytesRefFSTEnum<PairOutputs.Pair<Long,Long>> fstEnum;
+ private final BytesRefFSTEnum<PairOutputs.Pair<Long,PairOutputs.Pair<Long,Long>>> fstEnum;
- public SimpleTextTermsEnum(FST<PairOutputs.Pair<Long,Long>> fst, boolean omitTF) throws IOException {
+ public SimpleTextTermsEnum(FST<PairOutputs.Pair<Long,PairOutputs.Pair<Long,Long>>> fst, boolean omitTF) throws IOException {
this.in = (IndexInput) SimpleTextFieldsReader.this.in.clone();
this.omitTF = omitTF;
- fstEnum = new BytesRefFSTEnum<PairOutputs.Pair<Long,Long>>(fst);
+ fstEnum = new BytesRefFSTEnum<PairOutputs.Pair<Long,PairOutputs.Pair<Long,Long>>>(fst);
}
+ @Override
public SeekStatus seek(BytesRef text, boolean useCache /* ignored */) throws IOException {
//System.out.println("seek to text=" + text.utf8ToString());
- final BytesRefFSTEnum.InputOutput<PairOutputs.Pair<Long,Long>> result = fstEnum.seekCeil(text);
+ final BytesRefFSTEnum.InputOutput<PairOutputs.Pair<Long,PairOutputs.Pair<Long,Long>>> result = fstEnum.seekCeil(text);
if (result == null) {
//System.out.println(" end");
return SeekStatus.END;
} else {
//System.out.println(" got text=" + term.utf8ToString());
- PairOutputs.Pair<Long,Long> pair = result.output;
- docsStart = pair.output1;
- docFreq = pair.output2.intValue();
+ PairOutputs.Pair<Long,PairOutputs.Pair<Long,Long>> pair1 = result.output;
+ PairOutputs.Pair<Long,Long> pair2 = pair1.output2;
+ docsStart = pair1.output1;
+ docFreq = pair2.output1.intValue();
+ totalTermFreq = pair2.output2;
if (result.input.equals(text)) {
//System.out.println(" match docsStart=" + docsStart);
@@ -158,18 +162,16 @@ class SimpleTextFieldsReader extends FieldsProducer {
}
}
- @Override
- public void cacheCurrentTerm() {
- }
-
@Override
public BytesRef next() throws IOException {
assert !ended;
- final BytesRefFSTEnum.InputOutput<PairOutputs.Pair<Long,Long>> result = fstEnum.next();
+ final BytesRefFSTEnum.InputOutput<PairOutputs.Pair<Long,PairOutputs.Pair<Long,Long>>> result = fstEnum.next();
if (result != null) {
- final PairOutputs.Pair<Long,Long> pair = result.output;
- docsStart = pair.output1;
- docFreq = pair.output2.intValue();
+ PairOutputs.Pair<Long,PairOutputs.Pair<Long,Long>> pair1 = result.output;
+ PairOutputs.Pair<Long,Long> pair2 = pair1.output2;
+ docsStart = pair1.output1;
+ docFreq = pair2.output1.intValue();
+ totalTermFreq = pair2.output2;
return result.input;
} else {
return null;
@@ -196,6 +198,11 @@ class SimpleTextFieldsReader extends FieldsProducer {
return docFreq;
}
+ @Override
+ public long totalTermFreq() {
+ return totalTermFreq;
+ }
+
@Override
public DocsEnum docs(Bits skipDocs, DocsEnum reuse) throws IOException {
SimpleTextDocsEnum docsEnum;
@@ -221,7 +228,7 @@ class SimpleTextFieldsReader extends FieldsProducer {
}
return docsAndPositionsEnum.reset(docsStart, skipDocs);
}
-
+
@Override
public Comparator<BytesRef> getComparator() {
return BytesRef.getUTF8SortedAsUnicodeComparator();
@@ -446,15 +453,14 @@ class SimpleTextFieldsReader extends FieldsProducer {
}
private class SimpleTextTerms extends Terms {
- private final String field;
private final long termsStart;
private final boolean omitTF;
- private FST<PairOutputs.Pair<Long,Long>> fst;
-
+ private long sumTotalTermFreq;
+ private FST<PairOutputs.Pair<Long,PairOutputs.Pair<Long,Long>>> fst;
+ private int termCount;
private final BytesRef scratch = new BytesRef(10);
public SimpleTextTerms(String field, long termsStart) throws IOException {
- this.field = StringHelper.intern(field);
this.termsStart = termsStart;
omitTF = fieldInfos.fieldInfo(field).omitTermFreqAndPositions;
loadTerms();
@@ -462,24 +468,38 @@ class SimpleTextFieldsReader extends FieldsProducer {
private void loadTerms() throws IOException {
PositiveIntOutputs posIntOutputs = PositiveIntOutputs.getSingleton(false);
- Builder<PairOutputs.Pair<Long,Long>> b = new Builder<PairOutputs.Pair<Long,Long>>(FST.INPUT_TYPE.BYTE1, 0, 0, true, new PairOutputs<Long,Long>(posIntOutputs, posIntOutputs));
+ final Builder<PairOutputs.Pair<Long,PairOutputs.Pair<Long,Long>>> b;
+ b = new Builder<PairOutputs.Pair<Long,PairOutputs.Pair<Long,Long>>>(FST.INPUT_TYPE.BYTE1,
+ 0,
+ 0,
+ true,
+ new PairOutputs<Long,PairOutputs.Pair<Long,Long>>(posIntOutputs,
+ new PairOutputs<Long,Long>(posIntOutputs, posIntOutputs)));
IndexInput in = (IndexInput) SimpleTextFieldsReader.this.in.clone();
in.seek(termsStart);
final BytesRef lastTerm = new BytesRef(10);
long lastDocsStart = -1;
int docFreq = 0;
+ long totalTermFreq = 0;
while(true) {
readLine(in, scratch);
if (scratch.equals(END) || scratch.startsWith(FIELD)) {
if (lastDocsStart != -1) {
- b.add(lastTerm, new PairOutputs.Pair<Long,Long>(lastDocsStart, Long.valueOf(docFreq)));
+ b.add(lastTerm, new PairOutputs.Pair<Long,PairOutputs.Pair<Long,Long>>(lastDocsStart,
+ new PairOutputs.Pair<Long,Long>((long) docFreq,
+ posIntOutputs.get(totalTermFreq))));
+ sumTotalTermFreq += totalTermFreq;
}
break;
} else if (scratch.startsWith(DOC)) {
docFreq++;
+ } else if (scratch.startsWith(POS)) {
+ totalTermFreq++;
} else if (scratch.startsWith(TERM)) {
if (lastDocsStart != -1) {
- b.add(lastTerm, new PairOutputs.Pair<Long,Long>(lastDocsStart, Long.valueOf(docFreq)));
+ b.add(lastTerm, new PairOutputs.Pair<Long,PairOutputs.Pair<Long,Long>>(lastDocsStart,
+ new PairOutputs.Pair<Long,Long>((long) docFreq,
+ posIntOutputs.get(totalTermFreq))));
}
lastDocsStart = in.getFilePointer();
final int len = scratch.length - TERM.length;
@@ -489,6 +509,9 @@ class SimpleTextFieldsReader extends FieldsProducer {
System.arraycopy(scratch.bytes, TERM.length, lastTerm.bytes, 0, len);
lastTerm.length = len;
docFreq = 0;
+ sumTotalTermFreq += totalTermFreq;
+ totalTermFreq = 0;
+ termCount++;
}
}
fst = b.finish();
@@ -514,6 +537,16 @@ class SimpleTextFieldsReader extends FieldsProducer {
public Comparator<BytesRef> getComparator() {
return BytesRef.getUTF8SortedAsUnicodeComparator();
}
+
+ @Override
+ public long getUniqueTermCount() {
+ return (long) termCount;
+ }
+
+ @Override
+ public long getSumTotalTermFreq() {
+ return sumTotalTermFreq;
+ }
}
@Override
diff --git a/lucene/src/java/org/apache/lucene/index/codecs/simpletext/SimpleTextFieldsWriter.java b/lucene/src/java/org/apache/lucene/index/codecs/simpletext/SimpleTextFieldsWriter.java
index ae6338943e0..128da45c9b7 100644
--- a/lucene/src/java/org/apache/lucene/index/codecs/simpletext/SimpleTextFieldsWriter.java
+++ b/lucene/src/java/org/apache/lucene/index/codecs/simpletext/SimpleTextFieldsWriter.java
@@ -22,6 +22,7 @@ import org.apache.lucene.util.UnicodeUtil;
import org.apache.lucene.index.codecs.FieldsConsumer;
import org.apache.lucene.index.codecs.TermsConsumer;
import org.apache.lucene.index.codecs.PostingsConsumer;
+import org.apache.lucene.index.codecs.TermStats;
import org.apache.lucene.index.FieldInfo;
import org.apache.lucene.index.SegmentWriteState;
import org.apache.lucene.store.IndexOutput;
@@ -84,11 +85,11 @@ class SimpleTextFieldsWriter extends FieldsConsumer {
}
@Override
- public void finishTerm(BytesRef term, int numDocs) throws IOException {
+ public void finishTerm(BytesRef term, TermStats stats) throws IOException {
}
@Override
- public void finish() throws IOException {
+ public void finish(long sumTotalTermFreq) throws IOException {
}
@Override
diff --git a/lucene/src/java/org/apache/lucene/index/codecs/simpletext/package.html b/lucene/src/java/org/apache/lucene/index/codecs/simpletext/package.html
new file mode 100644
index 00000000000..88aad683412
--- /dev/null
+++ b/lucene/src/java/org/apache/lucene/index/codecs/simpletext/package.html
@@ -0,0 +1,25 @@
+<html>
+<body>
+Simpletext Codec: writes human readable postings.
+</body>
+</html>
diff --git a/lucene/src/java/org/apache/lucene/index/codecs/standard/StandardCodec.java b/lucene/src/java/org/apache/lucene/index/codecs/standard/StandardCodec.java
index 384fe2a3bc5..f0af9ca2507 100644
--- a/lucene/src/java/org/apache/lucene/index/codecs/standard/StandardCodec.java
+++ b/lucene/src/java/org/apache/lucene/index/codecs/standard/StandardCodec.java
@@ -33,8 +33,8 @@ import org.apache.lucene.index.codecs.TermsIndexWriterBase;
import org.apache.lucene.index.codecs.TermsIndexReaderBase;
import org.apache.lucene.index.codecs.VariableGapTermsIndexWriter;
import org.apache.lucene.index.codecs.VariableGapTermsIndexReader;
-import org.apache.lucene.index.codecs.PrefixCodedTermsWriter;
-import org.apache.lucene.index.codecs.PrefixCodedTermsReader;
+import org.apache.lucene.index.codecs.BlockTermsWriter;
+import org.apache.lucene.index.codecs.BlockTermsReader;
import org.apache.lucene.store.Directory;
/** Default codec.
@@ -66,7 +66,7 @@ public class StandardCodec extends Codec {
success = false;
try {
- FieldsConsumer ret = new PrefixCodedTermsWriter(indexWriter, state, docs, BytesRef.getUTF8SortedAsUnicodeComparator());
+ FieldsConsumer ret = new BlockTermsWriter(indexWriter, state, docs, BytesRef.getUTF8SortedAsUnicodeComparator());
success = true;
return ret;
} finally {
@@ -103,15 +103,15 @@ public class StandardCodec extends Codec {
success = false;
try {
- FieldsProducer ret = new PrefixCodedTermsReader(indexReader,
- state.dir,
- state.fieldInfos,
- state.segmentInfo.name,
- postings,
- state.readBufferSize,
- BytesRef.getUTF8SortedAsUnicodeComparator(),
- TERMS_CACHE_SIZE,
- state.codecId);
+ FieldsProducer ret = new BlockTermsReader(indexReader,
+ state.dir,
+ state.fieldInfos,
+ state.segmentInfo.name,
+ postings,
+ state.readBufferSize,
+ BytesRef.getUTF8SortedAsUnicodeComparator(),
+ TERMS_CACHE_SIZE,
+ state.codecId);
success = true;
return ret;
} finally {
@@ -134,7 +134,7 @@ public class StandardCodec extends Codec {
@Override
public void files(Directory dir, SegmentInfo segmentInfo, String id, Set<String> files) throws IOException {
StandardPostingsReader.files(dir, segmentInfo, id, files);
- PrefixCodedTermsReader.files(dir, segmentInfo, id, files);
+ BlockTermsReader.files(dir, segmentInfo, id, files);
VariableGapTermsIndexReader.files(dir, segmentInfo, id, files);
}
@@ -146,7 +146,7 @@ public class StandardCodec extends Codec {
public static void getStandardExtensions(Set<String> extensions) {
extensions.add(FREQ_EXTENSION);
extensions.add(PROX_EXTENSION);
- PrefixCodedTermsReader.getExtensions(extensions);
+ BlockTermsReader.getExtensions(extensions);
VariableGapTermsIndexReader.getIndexExtensions(extensions);
}
}
diff --git a/lucene/src/java/org/apache/lucene/index/codecs/standard/StandardPostingsReader.java b/lucene/src/java/org/apache/lucene/index/codecs/standard/StandardPostingsReader.java
index eed6b0e6735..0c9dd4f5c86 100644
--- a/lucene/src/java/org/apache/lucene/index/codecs/standard/StandardPostingsReader.java
+++ b/lucene/src/java/org/apache/lucene/index/codecs/standard/StandardPostingsReader.java
@@ -20,15 +20,18 @@ package org.apache.lucene.index.codecs.standard;
import java.io.IOException;
import java.util.Collection;
-import org.apache.lucene.store.Directory;
-import org.apache.lucene.index.SegmentInfo;
-import org.apache.lucene.index.FieldInfo;
-import org.apache.lucene.index.DocsEnum;
import org.apache.lucene.index.DocsAndPositionsEnum;
+import org.apache.lucene.index.DocsEnum;
+import org.apache.lucene.index.FieldInfo;
import org.apache.lucene.index.IndexFileNames;
+import org.apache.lucene.index.SegmentInfo;
+import org.apache.lucene.index.TermState;
+import org.apache.lucene.index.codecs.BlockTermState;
import org.apache.lucene.index.codecs.PostingsReaderBase;
-import org.apache.lucene.index.codecs.TermState;
+import org.apache.lucene.store.ByteArrayDataInput;
+import org.apache.lucene.store.Directory;
import org.apache.lucene.store.IndexInput;
+import org.apache.lucene.util.ArrayUtil;
import org.apache.lucene.util.Bits;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.CodecUtil;
@@ -45,9 +48,12 @@ public class StandardPostingsReader extends PostingsReaderBase {
int skipInterval;
int maxSkipLevels;
+ //private String segment;
+
public StandardPostingsReader(Directory dir, SegmentInfo segmentInfo, int readBufferSize, String codecId) throws IOException {
freqIn = dir.openInput(IndexFileNames.segmentFileName(segmentInfo.name, codecId, StandardCodec.FREQ_EXTENSION),
readBufferSize);
+ //this.segment = segmentInfo.name;
if (segmentInfo.getHasProx()) {
boolean success = false;
try {
@@ -83,33 +89,46 @@ public class StandardPostingsReader extends PostingsReaderBase {
}
// Must keep final because we do non-standard clone
- private final static class DocTermState extends TermState {
+ private final static class StandardTermState extends BlockTermState {
long freqOffset;
long proxOffset;
int skipOffset;
+ // Only used by the "primary" TermState -- clones don't
+ // copy this (basically they are "transient"):
+ ByteArrayDataInput bytesReader;
+ byte[] bytes;
+
+ @Override
public Object clone() {
- DocTermState other = new DocTermState();
- other.copy(this);
+ StandardTermState other = new StandardTermState();
+ other.copyFrom(this);
return other;
}
- public void copy(TermState _other) {
- super.copy(_other);
- DocTermState other = (DocTermState) _other;
+ @Override
+ public void copyFrom(TermState _other) {
+ super.copyFrom(_other);
+ StandardTermState other = (StandardTermState) _other;
freqOffset = other.freqOffset;
proxOffset = other.proxOffset;
skipOffset = other.skipOffset;
+
+ // Do not copy bytes, bytesReader (else TermState is
+ // very heavy, ie drags around the entire block's
+ // byte[]). On seek back, if next() is in fact used
+ // (rare!), they will be re-read from disk.
}
+ @Override
public String toString() {
return super.toString() + " freqFP=" + freqOffset + " proxFP=" + proxOffset + " skipOffset=" + skipOffset;
}
}
@Override
- public TermState newTermState() {
- return new DocTermState();
+ public BlockTermState newTermState() {
+ return new StandardTermState();
}
@Override
@@ -125,35 +144,61 @@ public class StandardPostingsReader extends PostingsReaderBase {
}
}
+ /* Reads but does not decode the byte[] blob holding
+ metadata for the current terms block */
@Override
- public void readTerm(IndexInput termsIn, FieldInfo fieldInfo, TermState termState, boolean isIndexTerm)
- throws IOException {
+ public void readTermsBlock(IndexInput termsIn, FieldInfo fieldInfo, BlockTermState _termState) throws IOException {
+ final StandardTermState termState = (StandardTermState) _termState;
- final DocTermState docTermState = (DocTermState) termState;
-
- if (isIndexTerm) {
- docTermState.freqOffset = termsIn.readVLong();
- } else {
- docTermState.freqOffset += termsIn.readVLong();
+ final int len = termsIn.readVInt();
+ //System.out.println("SPR.readTermsBlock termsIn.fp=" + termsIn.getFilePointer());
+ if (termState.bytes == null) {
+ termState.bytes = new byte[ArrayUtil.oversize(len, 1)];
+ termState.bytesReader = new ByteArrayDataInput(null);
+ } else if (termState.bytes.length < len) {
+ termState.bytes = new byte[ArrayUtil.oversize(len, 1)];
}
- if (docTermState.docFreq >= skipInterval) {
- docTermState.skipOffset = termsIn.readVInt();
+ termsIn.readBytes(termState.bytes, 0, len);
+ termState.bytesReader.reset(termState.bytes, 0, len);
+ }
+
+ @Override
+ public void nextTerm(FieldInfo fieldInfo, BlockTermState _termState)
+ throws IOException {
+ final StandardTermState termState = (StandardTermState) _termState;
+ //System.out.println("StandardR.nextTerm seg=" + segment);
+ final boolean isFirstTerm = termState.termCount == 0;
+
+ if (isFirstTerm) {
+ termState.freqOffset = termState.bytesReader.readVLong();
} else {
- docTermState.skipOffset = 0;
+ termState.freqOffset += termState.bytesReader.readVLong();
+ }
+ //System.out.println(" dF=" + termState.docFreq);
+ //System.out.println(" freqFP=" + termState.freqOffset);
+ assert termState.freqOffset < freqIn.length();
+
+ if (termState.docFreq >= skipInterval) {
+ termState.skipOffset = termState.bytesReader.readVInt();
+ //System.out.println(" skipOffset=" + termState.skipOffset + " vs freqIn.length=" + freqIn.length());
+ assert termState.freqOffset + termState.skipOffset < freqIn.length();
+ } else {
+ // undefined
}
if (!fieldInfo.omitTermFreqAndPositions) {
- if (isIndexTerm) {
- docTermState.proxOffset = termsIn.readVLong();
+ if (isFirstTerm) {
+ termState.proxOffset = termState.bytesReader.readVLong();
} else {
- docTermState.proxOffset += termsIn.readVLong();
+ termState.proxOffset += termState.bytesReader.readVLong();
}
+ //System.out.println(" proxFP=" + termState.proxOffset);
}
}
@Override
- public DocsEnum docs(FieldInfo fieldInfo, TermState termState, Bits skipDocs, DocsEnum reuse) throws IOException {
+ public DocsEnum docs(FieldInfo fieldInfo, BlockTermState termState, Bits skipDocs, DocsEnum reuse) throws IOException {
SegmentDocsEnum docsEnum;
if (reuse == null || !(reuse instanceof SegmentDocsEnum)) {
docsEnum = new SegmentDocsEnum(freqIn);
@@ -166,11 +211,11 @@ public class StandardPostingsReader extends PostingsReaderBase {
docsEnum = new SegmentDocsEnum(freqIn);
}
}
- return docsEnum.reset(fieldInfo, (DocTermState) termState, skipDocs);
+ return docsEnum.reset(fieldInfo, (StandardTermState) termState, skipDocs);
}
@Override
- public DocsAndPositionsEnum docsAndPositions(FieldInfo fieldInfo, TermState termState, Bits skipDocs, DocsAndPositionsEnum reuse) throws IOException {
+ public DocsAndPositionsEnum docsAndPositions(FieldInfo fieldInfo, BlockTermState termState, Bits skipDocs, DocsAndPositionsEnum reuse) throws IOException {
if (fieldInfo.omitTermFreqAndPositions) {
return null;
}
@@ -189,7 +234,7 @@ public class StandardPostingsReader extends PostingsReaderBase {
docsEnum = new SegmentDocsAndPositionsAndPayloadsEnum(freqIn, proxIn);
}
}
- return docsEnum.reset(fieldInfo, (DocTermState) termState, skipDocs);
+ return docsEnum.reset(fieldInfo, (StandardTermState) termState, skipDocs);
} else {
SegmentDocsAndPositionsEnum docsEnum;
if (reuse == null || !(reuse instanceof SegmentDocsAndPositionsEnum)) {
@@ -203,7 +248,7 @@ public class StandardPostingsReader extends PostingsReaderBase {
docsEnum = new SegmentDocsAndPositionsEnum(freqIn, proxIn);
}
}
- return docsEnum.reset(fieldInfo, (DocTermState) termState, skipDocs);
+ return docsEnum.reset(fieldInfo, (StandardTermState) termState, skipDocs);
}
}
@@ -233,7 +278,7 @@ public class StandardPostingsReader extends PostingsReaderBase {
this.freqIn = (IndexInput) freqIn.clone();
}
- public SegmentDocsEnum reset(FieldInfo fieldInfo, DocTermState termState, Bits skipDocs) throws IOException {
+ public SegmentDocsEnum reset(FieldInfo fieldInfo, StandardTermState termState, Bits skipDocs) throws IOException {
omitTF = fieldInfo.omitTermFreqAndPositions;
if (omitTF) {
freq = 1;
@@ -248,8 +293,10 @@ public class StandardPostingsReader extends PostingsReaderBase {
// cases
freqIn.seek(termState.freqOffset);
limit = termState.docFreq;
+ assert limit > 0;
ord = 0;
doc = 0;
+ //System.out.println(" sde limit=" + limit + " freqFP=" + freqOffset);
skipped = false;
@@ -331,13 +378,10 @@ public class StandardPostingsReader extends PostingsReaderBase {
@Override
public int advance(int target) throws IOException {
- // TODO: jump right to next() if target is < X away
- // from where we are now?
-
- if (skipOffset > 0) {
+ if ((target - skipInterval) >= doc && limit >= skipInterval) {
// There are enough docs in the posting to have
- // skip data
+ // skip data, and it isn't too close.
if (skipper == null) {
// This is the first time this enum has ever been used for skipping -- do lazy init
@@ -407,7 +451,7 @@ public class StandardPostingsReader extends PostingsReaderBase {
this.proxIn = (IndexInput) proxIn.clone();
}
- public SegmentDocsAndPositionsEnum reset(FieldInfo fieldInfo, DocTermState termState, Bits skipDocs) throws IOException {
+ public SegmentDocsAndPositionsEnum reset(FieldInfo fieldInfo, StandardTermState termState, Bits skipDocs) throws IOException {
assert !fieldInfo.omitTermFreqAndPositions;
assert !fieldInfo.storePayloads;
@@ -420,6 +464,8 @@ public class StandardPostingsReader extends PostingsReaderBase {
lazyProxPointer = termState.proxOffset;
limit = termState.docFreq;
+ assert limit > 0;
+
ord = 0;
doc = 0;
position = 0;
@@ -430,6 +476,7 @@ public class StandardPostingsReader extends PostingsReaderBase {
freqOffset = termState.freqOffset;
proxOffset = termState.proxOffset;
skipOffset = termState.skipOffset;
+ //System.out.println("StandardR.D&PE reset seg=" + segment + " limit=" + limit + " freqFP=" + freqOffset + " proxFP=" + proxOffset);
return this;
}
@@ -438,6 +485,7 @@ public class StandardPostingsReader extends PostingsReaderBase {
public int nextDoc() throws IOException {
while(true) {
if (ord == limit) {
+ //System.out.println("StandardR.D&PE seg=" + segment + " nextDoc return doc=END");
return doc = NO_MORE_DOCS;
}
@@ -461,6 +509,7 @@ public class StandardPostingsReader extends PostingsReaderBase {
position = 0;
+ //System.out.println("StandardR.D&PE nextDoc seg=" + segment + " return doc=" + doc);
return doc;
}
@@ -477,13 +526,12 @@ public class StandardPostingsReader extends PostingsReaderBase {
@Override
public int advance(int target) throws IOException {
- // TODO: jump right to next() if target is < X away
- // from where we are now?
+ //System.out.println("StandardR.D&PE advance target=" + target);
- if (skipOffset > 0) {
+ if ((target - skipInterval) >= doc && limit >= skipInterval) {
// There are enough docs in the posting to have
- // skip data
+ // skip data, and it isn't too close
if (skipper == null) {
// This is the first time this enum has ever been used for skipping -- do lazy init
@@ -524,6 +572,7 @@ public class StandardPostingsReader extends PostingsReaderBase {
return doc;
}
+ @Override
public int nextPosition() throws IOException {
if (lazyProxPointer != -1) {
@@ -552,10 +601,12 @@ public class StandardPostingsReader extends PostingsReaderBase {
/** Returns the payload at this position, or null if no
* payload was indexed. */
+ @Override
public BytesRef getPayload() throws IOException {
throw new IOException("No payloads exist for this field!");
}
+ @Override
public boolean hasPayload() {
return false;
}
@@ -594,7 +645,7 @@ public class StandardPostingsReader extends PostingsReaderBase {
this.proxIn = (IndexInput) proxIn.clone();
}
- public SegmentDocsAndPositionsAndPayloadsEnum reset(FieldInfo fieldInfo, DocTermState termState, Bits skipDocs) throws IOException {
+ public SegmentDocsAndPositionsAndPayloadsEnum reset(FieldInfo fieldInfo, StandardTermState termState, Bits skipDocs) throws IOException {
assert !fieldInfo.omitTermFreqAndPositions;
assert fieldInfo.storePayloads;
if (payload == null) {
@@ -622,6 +673,7 @@ public class StandardPostingsReader extends PostingsReaderBase {
freqOffset = termState.freqOffset;
proxOffset = termState.proxOffset;
skipOffset = termState.skipOffset;
+ //System.out.println("StandardR.D&PE reset seg=" + segment + " limit=" + limit + " freqFP=" + freqOffset + " proxFP=" + proxOffset + " this=" + this);
return this;
}
@@ -630,6 +682,7 @@ public class StandardPostingsReader extends PostingsReaderBase {
public int nextDoc() throws IOException {
while(true) {
if (ord == limit) {
+ //System.out.println("StandardR.D&PE seg=" + segment + " nextDoc return doc=END");
return doc = NO_MORE_DOCS;
}
@@ -653,6 +706,7 @@ public class StandardPostingsReader extends PostingsReaderBase {
position = 0;
+ //System.out.println("StandardR.D&PE nextDoc seg=" + segment + " return doc=" + doc);
return doc;
}
@@ -669,13 +723,12 @@ public class StandardPostingsReader extends PostingsReaderBase {
@Override
public int advance(int target) throws IOException {
- // TODO: jump right to next() if target is < X away
- // from where we are now?
+ //System.out.println("StandardR.D&PE advance seg=" + segment + " target=" + target + " this=" + this);
- if (skipOffset > 0) {
+ if ((target - skipInterval) >= doc && limit >= skipInterval) {
// There are enough docs in the posting to have
- // skip data
+ // skip data, and it isn't too close
if (skipper == null) {
// This is the first time this enum has ever been used for skipping -- do lazy init
@@ -687,7 +740,7 @@ public class StandardPostingsReader extends PostingsReaderBase {
// This is the first time this posting has
// skipped, since reset() was called, so now we
// load the skip data for this posting
-
+ //System.out.println(" init skipper freqOffset=" + freqOffset + " skipOffset=" + skipOffset + " vs len=" + freqIn.length());
skipper.init(freqOffset+skipOffset,
freqOffset, proxOffset,
limit, true);
@@ -718,6 +771,7 @@ public class StandardPostingsReader extends PostingsReaderBase {
return doc;
}
+ @Override
public int nextPosition() throws IOException {
if (lazyProxPointer != -1) {
@@ -748,6 +802,7 @@ public class StandardPostingsReader extends PostingsReaderBase {
posPendingCount--;
position = 0;
payloadPending = false;
+ //System.out.println("StandardR.D&PE skipPos");
}
// read next position
@@ -771,11 +826,13 @@ public class StandardPostingsReader extends PostingsReaderBase {
assert posPendingCount >= 0: "nextPosition() was called too many times (more than freq() times) posPendingCount=" + posPendingCount;
+ //System.out.println("StandardR.D&PE nextPos return pos=" + position);
return position;
}
/** Returns the payload at this position, or null if no
* payload was indexed. */
+ @Override
public BytesRef getPayload() throws IOException {
assert lazyProxPointer == -1;
assert posPendingCount < freq;
@@ -785,6 +842,7 @@ public class StandardPostingsReader extends PostingsReaderBase {
if (payloadLength > payload.bytes.length) {
payload.grow(payloadLength);
}
+
proxIn.readBytes(payload.bytes, 0, payloadLength);
payload.length = payloadLength;
payloadPending = false;
@@ -792,6 +850,7 @@ public class StandardPostingsReader extends PostingsReaderBase {
return payload;
}
+ @Override
public boolean hasPayload() {
return payloadPending && payloadLength > 0;
}
diff --git a/lucene/src/java/org/apache/lucene/index/codecs/standard/StandardPostingsWriter.java b/lucene/src/java/org/apache/lucene/index/codecs/standard/StandardPostingsWriter.java
index 6dafdcda728..22e923f2273 100644
--- a/lucene/src/java/org/apache/lucene/index/codecs/standard/StandardPostingsWriter.java
+++ b/lucene/src/java/org/apache/lucene/index/codecs/standard/StandardPostingsWriter.java
@@ -22,12 +22,14 @@ package org.apache.lucene.index.codecs.standard;
import java.io.IOException;
-import org.apache.lucene.store.IndexOutput;
-import org.apache.lucene.index.FieldInfo;
-import org.apache.lucene.index.SegmentWriteState;
-import org.apache.lucene.index.IndexFileNames;
import org.apache.lucene.index.CorruptIndexException;
+import org.apache.lucene.index.FieldInfo;
+import org.apache.lucene.index.IndexFileNames;
+import org.apache.lucene.index.SegmentWriteState;
import org.apache.lucene.index.codecs.PostingsWriterBase;
+import org.apache.lucene.index.codecs.TermStats;
+import org.apache.lucene.store.IndexOutput;
+import org.apache.lucene.store.RAMOutputStream;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.CodecUtil;
@@ -58,8 +60,15 @@ public final class StandardPostingsWriter extends PostingsWriterBase {
int lastPayloadLength;
int lastPosition;
+ private int pendingCount;
+
+ //private String segment;
+
+ private RAMOutputStream bytesWriter = new RAMOutputStream();
+
public StandardPostingsWriter(SegmentWriteState state) throws IOException {
super();
+ //this.segment = state.segmentName;
String fileName = IndexFileNames.segmentFileName(state.segmentName, state.codecId, StandardCodec.FREQ_EXTENSION);
freqOut = state.directory.createOutput(fileName);
@@ -95,6 +104,7 @@ public final class StandardPostingsWriter extends PostingsWriterBase {
@Override
public void startTerm() {
+ //System.out.println("StandardW: startTerm seg=" + segment + " pendingCount=" + pendingCount);
freqStart = freqOut.getFilePointer();
if (proxOut != null) {
proxStart = proxOut.getFilePointer();
@@ -108,9 +118,12 @@ public final class StandardPostingsWriter extends PostingsWriterBase {
// our parent calls setField whenever the field changes
@Override
public void setField(FieldInfo fieldInfo) {
+ //System.out.println("SPW: setField");
this.fieldInfo = fieldInfo;
omitTermFreqAndPositions = fieldInfo.omitTermFreqAndPositions;
storePayloads = fieldInfo.storePayloads;
+ //System.out.println(" set init blockFreqStart=" + freqStart);
+ //System.out.println(" set init blockProxStart=" + proxStart);
}
int lastDocID;
@@ -120,6 +133,7 @@ public final class StandardPostingsWriter extends PostingsWriterBase {
* then we just skip consuming positions/payloads. */
@Override
public void startDoc(int docID, int termDocFreq) throws IOException {
+ //System.out.println("StandardW: startDoc seg=" + segment + " docID=" + docID + " tf=" + termDocFreq);
final int delta = docID - lastDocID;
@@ -150,6 +164,7 @@ public final class StandardPostingsWriter extends PostingsWriterBase {
/** Add a new position & payload */
@Override
public void addPosition(int position, BytesRef payload) throws IOException {
+ //System.out.println("StandardW: addPos pos=" + position + " payload=" + (payload == null ? "null" : (payload.length + " bytes")) + " proxFP=" + proxOut.getFilePointer());
assert !omitTermFreqAndPositions: "omitTermFreqAndPositions is true";
assert proxOut != null;
@@ -184,40 +199,51 @@ public final class StandardPostingsWriter extends PostingsWriterBase {
/** Called when we are done adding docs to this term */
@Override
- public void finishTerm(int docCount, boolean isIndexTerm) throws IOException {
- assert docCount > 0;
+ public void finishTerm(TermStats stats) throws IOException {
+ //System.out.println("StandardW.finishTerm seg=" + segment);
+ assert stats.docFreq > 0;
// TODO: wasteful we are counting this (counting # docs
// for this term) in two places?
- assert docCount == df;
+ assert stats.docFreq == df;
- if (isIndexTerm) {
- // Write absolute at seek points
- termsOut.writeVLong(freqStart);
+ final boolean isFirstTerm = pendingCount == 0;
+ //System.out.println(" isFirstTerm=" + isFirstTerm);
+
+ //System.out.println(" freqFP=" + freqStart);
+ if (isFirstTerm) {
+ bytesWriter.writeVLong(freqStart);
} else {
- // Write delta between seek points
- termsOut.writeVLong(freqStart - lastFreqStart);
+ bytesWriter.writeVLong(freqStart-lastFreqStart);
}
-
lastFreqStart = freqStart;
if (df >= skipInterval) {
- termsOut.writeVInt((int) (skipListWriter.writeSkip(freqOut)-freqStart));
+ bytesWriter.writeVInt((int) (skipListWriter.writeSkip(freqOut)-freqStart));
}
-
+
if (!omitTermFreqAndPositions) {
- if (isIndexTerm) {
- // Write absolute at seek points
- termsOut.writeVLong(proxStart);
+ //System.out.println(" proxFP=" + proxStart);
+ if (isFirstTerm) {
+ bytesWriter.writeVLong(proxStart);
} else {
- // Write delta between seek points
- termsOut.writeVLong(proxStart - lastProxStart);
+ bytesWriter.writeVLong(proxStart - lastProxStart);
}
lastProxStart = proxStart;
}
-
+
lastDocID = 0;
df = 0;
+ pendingCount++;
+ }
+
+ @Override
+ public void flushTermsBlock() throws IOException {
+ //System.out.println("SPW.flushBlock pendingCount=" + pendingCount);
+ termsOut.writeVInt((int) bytesWriter.getFilePointer());
+ bytesWriter.writeTo(termsOut);
+ bytesWriter.reset();
+ pendingCount = 0;
}
@Override
diff --git a/lucene/src/java/org/apache/lucene/index/codecs/standard/package.html b/lucene/src/java/org/apache/lucene/index/codecs/standard/package.html
new file mode 100644
index 00000000000..aca1dc4b665
--- /dev/null
+++ b/lucene/src/java/org/apache/lucene/index/codecs/standard/package.html
@@ -0,0 +1,25 @@
+<html>
+<body>
+Standard Codec
+</body>
+</html>
diff --git a/lucene/src/java/org/apache/lucene/queryParser/QueryParserBase.java b/lucene/src/java/org/apache/lucene/queryParser/QueryParserBase.java
index 078e2adfb63..41ad00987ed 100644
--- a/lucene/src/java/org/apache/lucene/queryParser/QueryParserBase.java
+++ b/lucene/src/java/org/apache/lucene/queryParser/QueryParserBase.java
@@ -1,1150 +1,1150 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.lucene.queryParser;
-
-import java.io.IOException;
-import java.io.StringReader;
-import java.text.Collator;
-import java.text.DateFormat;
-import java.util.*;
-
-import org.apache.lucene.analysis.Analyzer;
-import org.apache.lucene.analysis.CachingTokenFilter;
-import org.apache.lucene.analysis.TokenStream;
-import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
-import org.apache.lucene.analysis.tokenattributes.TermToBytesRefAttribute;
-import org.apache.lucene.document.DateTools;
-import org.apache.lucene.index.Term;
-import org.apache.lucene.queryParser.QueryParser.Operator;
-import org.apache.lucene.search.*;
-import org.apache.lucene.util.BytesRef;
-import org.apache.lucene.util.Version;
-
-/** This class is overridden by QueryParser in QueryParser.jj
- * and acts to separate the majority of the Java code from the .jj grammar file.
- */
-public abstract class QueryParserBase {
-
- /** Do not catch this exception in your code, it means you are using methods that you should no longer use. */
- public static class MethodRemovedUseAnother extends Throwable {}
-
- static final int CONJ_NONE = 0;
- static final int CONJ_AND = 1;
- static final int CONJ_OR = 2;
-
- static final int MOD_NONE = 0;
- static final int MOD_NOT = 10;
- static final int MOD_REQ = 11;
-
- // make it possible to call setDefaultOperator() without accessing
- // the nested class:
- /** Alternative form of QueryParser.Operator.AND */
- public static final Operator AND_OPERATOR = Operator.AND;
- /** Alternative form of QueryParser.Operator.OR */
- public static final Operator OR_OPERATOR = Operator.OR;
-
- /** The actual operator that parser uses to combine query terms */
- Operator operator = OR_OPERATOR;
-
- boolean lowercaseExpandedTerms = true;
- MultiTermQuery.RewriteMethod multiTermRewriteMethod = MultiTermQuery.CONSTANT_SCORE_AUTO_REWRITE_DEFAULT;
- boolean allowLeadingWildcard = false;
- boolean enablePositionIncrements = true;
-
- Analyzer analyzer;
- String field;
- int phraseSlop = 0;
- float fuzzyMinSim = FuzzyQuery.defaultMinSimilarity;
- int fuzzyPrefixLength = FuzzyQuery.defaultPrefixLength;
- Locale locale = Locale.getDefault();
-
- // the default date resolution
- DateTools.Resolution dateResolution = null;
- // maps field names to date resolutions
- Map<String,DateTools.Resolution> fieldToDateResolution = null;
-
- // The collator to use when determining range inclusion,
- // for use when constructing RangeQuerys.
- Collator rangeCollator = null;
-
- boolean autoGeneratePhraseQueries;
-
- // So the generated QueryParser(CharStream) won't error out
- protected QueryParserBase() {
- }
-
- /** Initializes a query parser. Called by the QueryParser constructor
- * @param matchVersion Lucene version to match. See above.
- * @param f the default field for query terms.
- * @param a used to find terms in the query text.
- */
- public void init(Version matchVersion, String f, Analyzer a) {
- analyzer = a;
- field = f;
- if (matchVersion.onOrAfter(Version.LUCENE_31)) {
- setAutoGeneratePhraseQueries(false);
- } else {
- setAutoGeneratePhraseQueries(true);
- }
- }
-
- // the generated parser will create these in QueryParser
- public abstract void ReInit(CharStream stream);
- public abstract Query TopLevelQuery(String field) throws ParseException;
-
-
- /** Parses a query string, returning a {@link org.apache.lucene.search.Query}.
- * @param query the query string to be parsed.
- * @throws ParseException if the parsing fails
- */
- public Query parse(String query) throws ParseException {
- ReInit(new FastCharStream(new StringReader(query)));
- try {
- // TopLevelQuery is a Query followed by the end-of-input (EOF)
- Query res = TopLevelQuery(field);
- return res!=null ? res : newBooleanQuery(false);
- }
- catch (ParseException tme) {
- // rethrow to include the original query:
- ParseException e = new ParseException("Cannot parse '" +query+ "': " + tme.getMessage());
- e.initCause(tme);
- throw e;
- }
- catch (TokenMgrError tme) {
- ParseException e = new ParseException("Cannot parse '" +query+ "': " + tme.getMessage());
- e.initCause(tme);
- throw e;
- }
- catch (BooleanQuery.TooManyClauses tmc) {
- ParseException e = new ParseException("Cannot parse '" +query+ "': too many boolean clauses");
- e.initCause(tmc);
- throw e;
- }
- }
-
-
- /**
- * @return Returns the analyzer.
- */
- public Analyzer getAnalyzer() {
- return analyzer;
- }
-
- /**
- * @return Returns the default field.
- */
- public String getField() {
- return field;
- }
-
- /**
- * @see #setAutoGeneratePhraseQueries(boolean)
- */
- public final boolean getAutoGeneratePhraseQueries() {
- return autoGeneratePhraseQueries;
- }
-
- /**
- * Set to true if phrase queries will be automatically generated
- * when the analyzer returns more than one term from whitespace
- * delimited text.
- * NOTE: this behavior may not be suitable for all languages.
- *
- * Set to false if phrase queries should only be generated when
- * surrounded by double quotes.
- */
- public final void setAutoGeneratePhraseQueries(boolean value) {
- this.autoGeneratePhraseQueries = value;
- }
-
- /**
- * Get the minimal similarity for fuzzy queries.
- */
- public float getFuzzyMinSim() {
- return fuzzyMinSim;
- }
-
- /**
- * Set the minimum similarity for fuzzy queries.
- * Default is 2f.
- */
- public void setFuzzyMinSim(float fuzzyMinSim) {
- this.fuzzyMinSim = fuzzyMinSim;
- }
-
- /**
- * Get the prefix length for fuzzy queries.
- * @return Returns the fuzzyPrefixLength.
- */
- public int getFuzzyPrefixLength() {
- return fuzzyPrefixLength;
- }
-
- /**
- * Set the prefix length for fuzzy queries. Default is 0.
- * @param fuzzyPrefixLength The fuzzyPrefixLength to set.
- */
- public void setFuzzyPrefixLength(int fuzzyPrefixLength) {
- this.fuzzyPrefixLength = fuzzyPrefixLength;
- }
-
- /**
- * Sets the default slop for phrases. If zero, then exact phrase matches
- * are required. Default value is zero.
- */
- public void setPhraseSlop(int phraseSlop) {
- this.phraseSlop = phraseSlop;
- }
-
- /**
- * Gets the default slop for phrases.
- */
- public int getPhraseSlop() {
- return phraseSlop;
- }
-
-
- /**
- * Set to <code>true</code> to allow leading wildcard characters.
- *
- * When set, <code>*</code> or <code>?</code> are allowed as
- * the first character of a PrefixQuery and WildcardQuery.
- * Note that this can produce very slow
- * queries on big indexes.
- *
- * Default: false.
- */
- public void setAllowLeadingWildcard(boolean allowLeadingWildcard) {
- this.allowLeadingWildcard = allowLeadingWildcard;
- }
-
- /**
- * @see #setAllowLeadingWildcard(boolean)
- */
- public boolean getAllowLeadingWildcard() {
- return allowLeadingWildcard;
- }
-
- /**
- * Set to <code>true</code> to enable position increments in result query.
- *
- * When set, result phrase and multi-phrase queries will
- * be aware of position increments.
- * Useful when e.g. a StopFilter increases the position increment of
- * the token that follows an omitted token.
- *
- * Default: true.
- */
- public void setEnablePositionIncrements(boolean enable) {
- this.enablePositionIncrements = enable;
- }
-
- /**
- * @see #setEnablePositionIncrements(boolean)
- */
- public boolean getEnablePositionIncrements() {
- return enablePositionIncrements;
- }
-
- /**
- * Sets the boolean operator of the QueryParser.
- * In default mode (<code>OR_OPERATOR</code>) terms without any modifiers
- * are considered optional: for example <code>capital of Hungary</code> is equal to
- * <code>capital OR of OR Hungary</code>.
- * In <code>AND_OPERATOR</code> mode terms are considered to be in conjunction: the
- * above mentioned query is parsed as <code>capital AND of AND Hungary</code>
- */
- public void setDefaultOperator(Operator op) {
- this.operator = op;
- }
-
-
- /**
- * Gets implicit operator setting, which will be either AND_OPERATOR
- * or OR_OPERATOR.
- */
- public Operator getDefaultOperator() {
- return operator;
- }
-
-
- /**
- * Whether terms of wildcard, prefix, fuzzy and range queries are to be automatically
- * lower-cased or not. Default is <code>true</code>.
- */
- public void setLowercaseExpandedTerms(boolean lowercaseExpandedTerms) {
- this.lowercaseExpandedTerms = lowercaseExpandedTerms;
- }
-
-
- /**
- * @see #setLowercaseExpandedTerms(boolean)
- */
- public boolean getLowercaseExpandedTerms() {
- return lowercaseExpandedTerms;
- }
-
- /**
- * By default QueryParser uses {@link org.apache.lucene.search.MultiTermQuery#CONSTANT_SCORE_AUTO_REWRITE_DEFAULT}
- * when creating a PrefixQuery, WildcardQuery or RangeQuery. This implementation is generally preferable because it
- * a) Runs faster b) Does not have the scarcity of terms unduly influence score
- * c) avoids any "TooManyBooleanClauses" exception.
- * However, if your application really needs to use the
- * old-fashioned BooleanQuery expansion rewriting and the above
- * points are not relevant then use this to change
- * the rewrite method.
- */
- public void setMultiTermRewriteMethod(MultiTermQuery.RewriteMethod method) {
- multiTermRewriteMethod = method;
- }
-
-
- /**
- * @see #setMultiTermRewriteMethod
- */
- public MultiTermQuery.RewriteMethod getMultiTermRewriteMethod() {
- return multiTermRewriteMethod;
- }
-
- /**
- * Set locale used by date range parsing.
- */
- public void setLocale(Locale locale) {
- this.locale = locale;
- }
-
- /**
- * Returns current locale, allowing access by subclasses.
- */
- public Locale getLocale() {
- return locale;
- }
-
- /**
- * Sets the default date resolution used by RangeQueries for fields for which no
- * specific date resolutions has been set. Field specific resolutions can be set
- * with {@link #setDateResolution(String, org.apache.lucene.document.DateTools.Resolution)}.
- *
- * @param dateResolution the default date resolution to set
- */
- public void setDateResolution(DateTools.Resolution dateResolution) {
- this.dateResolution = dateResolution;
- }
-
- /**
- * Sets the date resolution used by RangeQueries for a specific field.
- *
- * @param fieldName field for which the date resolution is to be set
- * @param dateResolution date resolution to set
- */
- public void setDateResolution(String fieldName, DateTools.Resolution dateResolution) {
- if (fieldName == null) {
- throw new IllegalArgumentException("Field cannot be null.");
- }
-
- if (fieldToDateResolution == null) {
- // lazily initialize HashMap
- fieldToDateResolution = new HashMap();
- }
-
- fieldToDateResolution.put(fieldName, dateResolution);
- }
-
- /**
- * Returns the date resolution that is used by RangeQueries for the given field.
- * Returns null, if no default or field specific date resolution has been set
- * for the given field.
- *
- */
- public DateTools.Resolution getDateResolution(String fieldName) {
- if (fieldName == null) {
- throw new IllegalArgumentException("Field cannot be null.");
- }
-
- if (fieldToDateResolution == null) {
- // no field specific date resolutions set; return default date resolution instead
- return this.dateResolution;
- }
-
- DateTools.Resolution resolution = fieldToDateResolution.get(fieldName);
- if (resolution == null) {
- // no date resolutions set for the given field; return default date resolution instead
- resolution = this.dateResolution;
- }
-
- return resolution;
- }
-
- /**
- * Sets the collator used to determine index term inclusion in ranges
- * for RangeQuerys.
- *
- * WARNING: Setting the rangeCollator to a non-null
- * collator using this method will cause every single index Term in the
- * Field referenced by lowerTerm and/or upperTerm to be examined.
- * Depending on the number of index Terms in this Field, the operation could
- * be very slow.
- *
- * @param rc the collator to use when constructing RangeQuerys
- */
- public void setRangeCollator(Collator rc) {
- rangeCollator = rc;
- }
-
- /**
- * @return the collator used to determine index term inclusion in ranges
- * for RangeQuerys.
- */
- public Collator getRangeCollator() {
- return rangeCollator;
- }
-
- protected void addClause(List clauses, int conj, int mods, Query q) {
- boolean required, prohibited;
-
- // If this term is introduced by AND, make the preceding term required,
- // unless it's already prohibited
- if (clauses.size() > 0 && conj == CONJ_AND) {
- BooleanClause c = clauses.get(clauses.size()-1);
- if (!c.isProhibited())
- c.setOccur(BooleanClause.Occur.MUST);
- }
-
- if (clauses.size() > 0 && operator == AND_OPERATOR && conj == CONJ_OR) {
- // If this term is introduced by OR, make the preceding term optional,
- // unless it's prohibited (that means we leave -a OR b but +a OR b-->a OR b)
- // notice if the input is a OR b, first term is parsed as required; without
- // this modification a OR b would parsed as +a OR b
- BooleanClause c = clauses.get(clauses.size()-1);
- if (!c.isProhibited())
- c.setOccur(BooleanClause.Occur.SHOULD);
- }
-
- // We might have been passed a null query; the term might have been
- // filtered away by the analyzer.
- if (q == null)
- return;
-
- if (operator == OR_OPERATOR) {
- // We set REQUIRED if we're introduced by AND or +; PROHIBITED if
- // introduced by NOT or -; make sure not to set both.
- prohibited = (mods == MOD_NOT);
- required = (mods == MOD_REQ);
- if (conj == CONJ_AND && !prohibited) {
- required = true;
- }
- } else {
- // We set PROHIBITED if we're introduced by NOT or -; We set REQUIRED
- // if not PROHIBITED and not introduced by OR
- prohibited = (mods == MOD_NOT);
- required = (!prohibited && conj != CONJ_OR);
- }
- if (required && !prohibited)
- clauses.add(newBooleanClause(q, BooleanClause.Occur.MUST));
- else if (!required && !prohibited)
- clauses.add(newBooleanClause(q, BooleanClause.Occur.SHOULD));
- else if (!required && prohibited)
- clauses.add(newBooleanClause(q, BooleanClause.Occur.MUST_NOT));
- else
- throw new RuntimeException("Clause cannot be both required and prohibited");
- }
-
- /**
- * @exception org.apache.lucene.queryParser.ParseException throw in overridden method to disallow
- */
- protected Query getFieldQuery(String field, String queryText, boolean quoted) throws ParseException {
- // Use the analyzer to get all the tokens, and then build a TermQuery,
- // PhraseQuery, or nothing based on the term count
-
- TokenStream source;
- try {
- source = analyzer.reusableTokenStream(field, new StringReader(queryText));
- source.reset();
- } catch (IOException e) {
- source = analyzer.tokenStream(field, new StringReader(queryText));
- }
- CachingTokenFilter buffer = new CachingTokenFilter(source);
- TermToBytesRefAttribute termAtt = null;
- PositionIncrementAttribute posIncrAtt = null;
- int numTokens = 0;
-
- boolean success = false;
- try {
- buffer.reset();
- success = true;
- } catch (IOException e) {
- // success==false if we hit an exception
- }
- if (success) {
- if (buffer.hasAttribute(TermToBytesRefAttribute.class)) {
- termAtt = buffer.getAttribute(TermToBytesRefAttribute.class);
- }
- if (buffer.hasAttribute(PositionIncrementAttribute.class)) {
- posIncrAtt = buffer.getAttribute(PositionIncrementAttribute.class);
- }
- }
-
- int positionCount = 0;
- boolean severalTokensAtSamePosition = false;
-
- boolean hasMoreTokens = false;
- if (termAtt != null) {
- try {
- hasMoreTokens = buffer.incrementToken();
- while (hasMoreTokens) {
- numTokens++;
- int positionIncrement = (posIncrAtt != null) ? posIncrAtt.getPositionIncrement() : 1;
- if (positionIncrement != 0) {
- positionCount += positionIncrement;
- } else {
- severalTokensAtSamePosition = true;
- }
- hasMoreTokens = buffer.incrementToken();
- }
- } catch (IOException e) {
- // ignore
- }
- }
- try {
- // rewind the buffer stream
- buffer.reset();
-
- // close original stream - all tokens buffered
- source.close();
- }
- catch (IOException e) {
- // ignore
- }
-
- if (numTokens == 0)
- return null;
- else if (numTokens == 1) {
- BytesRef term = new BytesRef();
- try {
- boolean hasNext = buffer.incrementToken();
- assert hasNext == true;
- termAtt.toBytesRef(term);
- } catch (IOException e) {
- // safe to ignore, because we know the number of tokens
- }
- return newTermQuery(new Term(field, term));
- } else {
- if (severalTokensAtSamePosition || (!quoted && !autoGeneratePhraseQueries)) {
- if (positionCount == 1 || (!quoted && !autoGeneratePhraseQueries)) {
- // no phrase query:
- BooleanQuery q = newBooleanQuery(positionCount == 1);
-
- BooleanClause.Occur occur = positionCount > 1 && operator == AND_OPERATOR ?
- BooleanClause.Occur.MUST : BooleanClause.Occur.SHOULD;
-
- for (int i = 0; i < numTokens; i++) {
- BytesRef term = new BytesRef();
- try {
- boolean hasNext = buffer.incrementToken();
- assert hasNext == true;
- termAtt.toBytesRef(term);
- } catch (IOException e) {
- // safe to ignore, because we know the number of tokens
- }
-
- Query currentQuery = newTermQuery(
- new Term(field, term));
- q.add(currentQuery, occur);
- }
- return q;
- }
- else {
- // phrase query:
- MultiPhraseQuery mpq = newMultiPhraseQuery();
- mpq.setSlop(phraseSlop);
- List multiTerms = new ArrayList();
- int position = -1;
- for (int i = 0; i < numTokens; i++) {
- BytesRef term = new BytesRef();
- int positionIncrement = 1;
- try {
- boolean hasNext = buffer.incrementToken();
- assert hasNext == true;
- termAtt.toBytesRef(term);
- if (posIncrAtt != null) {
- positionIncrement = posIncrAtt.getPositionIncrement();
- }
- } catch (IOException e) {
- // safe to ignore, because we know the number of tokens
- }
-
- if (positionIncrement > 0 && multiTerms.size() > 0) {
- if (enablePositionIncrements) {
- mpq.add(multiTerms.toArray(new Term[0]),position);
- } else {
- mpq.add(multiTerms.toArray(new Term[0]));
- }
- multiTerms.clear();
- }
- position += positionIncrement;
- multiTerms.add(new Term(field, term));
- }
- if (enablePositionIncrements) {
- mpq.add(multiTerms.toArray(new Term[0]),position);
- } else {
- mpq.add(multiTerms.toArray(new Term[0]));
- }
- return mpq;
- }
- }
- else {
- PhraseQuery pq = newPhraseQuery();
- pq.setSlop(phraseSlop);
- int position = -1;
-
-
- for (int i = 0; i < numTokens; i++) {
- BytesRef term = new BytesRef();
- int positionIncrement = 1;
-
- try {
- boolean hasNext = buffer.incrementToken();
- assert hasNext == true;
- termAtt.toBytesRef(term);
- if (posIncrAtt != null) {
- positionIncrement = posIncrAtt.getPositionIncrement();
- }
- } catch (IOException e) {
- // safe to ignore, because we know the number of tokens
- }
-
- if (enablePositionIncrements) {
- position += positionIncrement;
- pq.add(new Term(field, term),position);
- } else {
- pq.add(new Term(field, term));
- }
- }
- return pq;
- }
- }
- }
-
-
-
- /**
- * Base implementation delegates to {@link #getFieldQuery(String,String,boolean)}.
- * This method may be overridden, for example, to return
- * a SpanNearQuery instead of a PhraseQuery.
- *
- * @exception org.apache.lucene.queryParser.ParseException throw in overridden method to disallow
- */
- protected Query getFieldQuery(String field, String queryText, int slop)
- throws ParseException {
- Query query = getFieldQuery(field, queryText, true);
-
- if (query instanceof PhraseQuery) {
- ((PhraseQuery) query).setSlop(slop);
- }
- if (query instanceof MultiPhraseQuery) {
- ((MultiPhraseQuery) query).setSlop(slop);
- }
-
- return query;
- }
-
- /**
- *
- * @exception org.apache.lucene.queryParser.ParseException
- */
- protected Query getRangeQuery(String field,
- String part1,
- String part2,
- boolean startInclusive,
- boolean endInclusive) throws ParseException
- {
- if (lowercaseExpandedTerms) {
- part1 = part1==null ? null : part1.toLowerCase();
- part2 = part2==null ? null : part2.toLowerCase();
- }
-
-
- DateFormat df = DateFormat.getDateInstance(DateFormat.SHORT, locale);
- df.setLenient(true);
- DateTools.Resolution resolution = getDateResolution(field);
-
- try {
- part1 = DateTools.dateToString(df.parse(part1), resolution);
- } catch (Exception e) { }
-
- try {
- Date d2 = df.parse(part2);
- if (endInclusive) {
- // The user can only specify the date, not the time, so make sure
- // the time is set to the latest possible time of that date to really
- // include all documents:
- Calendar cal = Calendar.getInstance(locale);
- cal.setTime(d2);
- cal.set(Calendar.HOUR_OF_DAY, 23);
- cal.set(Calendar.MINUTE, 59);
- cal.set(Calendar.SECOND, 59);
- cal.set(Calendar.MILLISECOND, 999);
- d2 = cal.getTime();
- }
- part2 = DateTools.dateToString(d2, resolution);
- } catch (Exception e) { }
-
- return newRangeQuery(field, part1, part2, startInclusive, endInclusive);
- }
-
- /**
- * Builds a new BooleanQuery instance
- * @param disableCoord disable coord
- * @return new BooleanQuery instance
- */
- protected BooleanQuery newBooleanQuery(boolean disableCoord) {
- return new BooleanQuery(disableCoord);
- }
-
- /**
- * Builds a new BooleanClause instance
- * @param q sub query
- * @param occur how this clause should occur when matching documents
- * @return new BooleanClause instance
- */
- protected BooleanClause newBooleanClause(Query q, BooleanClause.Occur occur) {
- return new BooleanClause(q, occur);
- }
-
- /**
- * Builds a new TermQuery instance
- * @param term term
- * @return new TermQuery instance
- */
- protected Query newTermQuery(Term term){
- return new TermQuery(term);
- }
-
- /**
- * Builds a new PhraseQuery instance
- * @return new PhraseQuery instance
- */
- protected PhraseQuery newPhraseQuery(){
- return new PhraseQuery();
- }
-
- /**
- * Builds a new MultiPhraseQuery instance
- * @return new MultiPhraseQuery instance
- */
- protected MultiPhraseQuery newMultiPhraseQuery(){
- return new MultiPhraseQuery();
- }
-
- /**
- * Builds a new PrefixQuery instance
- * @param prefix Prefix term
- * @return new PrefixQuery instance
- */
- protected Query newPrefixQuery(Term prefix){
- PrefixQuery query = new PrefixQuery(prefix);
- query.setRewriteMethod(multiTermRewriteMethod);
- return query;
- }
-
- /**
- * Builds a new RegexpQuery instance
- * @param regexp Regexp term
- * @return new RegexpQuery instance
- */
- protected Query newRegexpQuery(Term regexp) {
- RegexpQuery query = new RegexpQuery(regexp);
- query.setRewriteMethod(multiTermRewriteMethod);
- return query;
- }
-
- /**
- * Builds a new FuzzyQuery instance
- * @param term Term
- * @param minimumSimilarity minimum similarity
- * @param prefixLength prefix length
- * @return new FuzzyQuery Instance
- */
- protected Query newFuzzyQuery(Term term, float minimumSimilarity, int prefixLength) {
- // FuzzyQuery doesn't yet allow constant score rewrite
- return new FuzzyQuery(term,minimumSimilarity,prefixLength);
- }
-
- /**
- * Builds a new TermRangeQuery instance
- * @param field Field
- * @param part1 min
- * @param part2 max
- * @param startInclusive true if the start of the range is inclusive
- * @param endInclusive true if the end of the range is inclusive
- * @return new TermRangeQuery instance
- */
- protected Query newRangeQuery(String field, String part1, String part2, boolean startInclusive, boolean endInclusive) {
- final TermRangeQuery query = new TermRangeQuery(field, part1, part2, startInclusive, endInclusive, rangeCollator);
- query.setRewriteMethod(multiTermRewriteMethod);
- return query;
- }
-
- /**
- * Builds a new MatchAllDocsQuery instance
- * @return new MatchAllDocsQuery instance
- */
- protected Query newMatchAllDocsQuery() {
- return new MatchAllDocsQuery();
- }
-
- /**
- * Builds a new WildcardQuery instance
- * @param t wildcard term
- * @return new WildcardQuery instance
- */
- protected Query newWildcardQuery(Term t) {
- WildcardQuery query = new WildcardQuery(t);
- query.setRewriteMethod(multiTermRewriteMethod);
- return query;
- }
-
- /**
- * Factory method for generating query, given a set of clauses.
- * By default creates a boolean query composed of clauses passed in.
- *
- * Can be overridden by extending classes, to modify query being
- * returned.
- *
- * @param clauses List that contains {@link org.apache.lucene.search.BooleanClause} instances
- * to join.
- *
- * @return Resulting {@link org.apache.lucene.search.Query} object.
- * @exception org.apache.lucene.queryParser.ParseException throw in overridden method to disallow
- */
- protected Query getBooleanQuery(List clauses) throws ParseException {
- return getBooleanQuery(clauses, false);
- }
-
- /**
- * Factory method for generating query, given a set of clauses.
- * By default creates a boolean query composed of clauses passed in.
- *
- * Can be overridden by extending classes, to modify query being
- * returned.
- *
- * @param clauses List that contains {@link org.apache.lucene.search.BooleanClause} instances
- * to join.
- * @param disableCoord true if coord scoring should be disabled.
- *
- * @return Resulting {@link org.apache.lucene.search.Query} object.
- * @exception org.apache.lucene.queryParser.ParseException throw in overridden method to disallow
- */
- protected Query getBooleanQuery(List clauses, boolean disableCoord)
- throws ParseException
- {
- if (clauses.size()==0) {
- return null; // all clause words were filtered away by the analyzer.
- }
- BooleanQuery query = newBooleanQuery(disableCoord);
- for(final BooleanClause clause: clauses) {
- query.add(clause);
- }
- return query;
- }
-
- /**
- * Factory method for generating a query. Called when parser
- * parses an input term token that contains one or more wildcard
- * characters (? and *), but is not a prefix term token (one
- * that has just a single * character at the end)
- *
- * Depending on settings, prefix term may be lower-cased
- * automatically. It will not go through the default Analyzer,
- * however, since normal Analyzers are unlikely to work properly
- * with wildcard templates.
- *
- * Can be overridden by extending classes, to provide custom handling for
- * wildcard queries, which may be necessary due to missing analyzer calls.
- *
- * @param field Name of the field query will use.
- * @param termStr Term token that contains one or more wild card
- * characters (? or *), but is not simple prefix term
- *
- * @return Resulting {@link org.apache.lucene.search.Query} built for the term
- * @exception org.apache.lucene.queryParser.ParseException throw in overridden method to disallow
- */
- protected Query getWildcardQuery(String field, String termStr) throws ParseException
- {
- if ("*".equals(field)) {
- if ("*".equals(termStr)) return newMatchAllDocsQuery();
- }
- if (!allowLeadingWildcard && (termStr.startsWith("*") || termStr.startsWith("?")))
- throw new ParseException("'*' or '?' not allowed as first character in WildcardQuery");
- if (lowercaseExpandedTerms) {
- termStr = termStr.toLowerCase();
- }
- Term t = new Term(field, termStr);
- return newWildcardQuery(t);
- }
-
- /**
- * Factory method for generating a query. Called when parser
- * parses an input term token that contains a regular expression
- * query.
- *
- * Depending on settings, pattern term may be lower-cased
- * automatically. It will not go through the default Analyzer,
- * however, since normal Analyzers are unlikely to work properly
- * with regular expression templates.
- *
- * Can be overridden by extending classes, to provide custom handling for
- * regular expression queries, which may be necessary due to missing analyzer
- * calls.
- *
- * @param field Name of the field query will use.
- * @param termStr Term token that contains a regular expression
- *
- * @return Resulting {@link org.apache.lucene.search.Query} built for the term
- * @exception org.apache.lucene.queryParser.ParseException throw in overridden method to disallow
- */
- protected Query getRegexpQuery(String field, String termStr) throws ParseException
- {
- if (lowercaseExpandedTerms) {
- termStr = termStr.toLowerCase();
- }
- Term t = new Term(field, termStr);
- return newRegexpQuery(t);
- }
-
- /**
- * Factory method for generating a query (similar to
- * {@link #getWildcardQuery}). Called when parser parses an input term
- * token that uses prefix notation; that is, contains a single '*' wildcard
- * character as its last character. Since this is a special case
- * of generic wildcard term, and such a query can be optimized easily,
- * this usually results in a different query object.
- *
- * Depending on settings, a prefix term may be lower-cased
- * automatically. It will not go through the default Analyzer,
- * however, since normal Analyzers are unlikely to work properly
- * with wildcard templates.
- *
- * Can be overridden by extending classes, to provide custom handling for
- * wild card queries, which may be necessary due to missing analyzer calls.
- *
- * @param field Name of the field query will use.
- * @param termStr Term token to use for building term for the query
- * (without trailing '*' character!)
- *
- * @return Resulting {@link org.apache.lucene.search.Query} built for the term
- * @exception org.apache.lucene.queryParser.ParseException throw in overridden method to disallow
- */
- protected Query getPrefixQuery(String field, String termStr) throws ParseException
- {
- if (!allowLeadingWildcard && termStr.startsWith("*"))
- throw new ParseException("'*' not allowed as first character in PrefixQuery");
- if (lowercaseExpandedTerms) {
- termStr = termStr.toLowerCase();
- }
- Term t = new Term(field, termStr);
- return newPrefixQuery(t);
- }
-
- /**
- * Factory method for generating a query (similar to
- * {@link #getWildcardQuery}). Called when parser parses
- * an input term token that has the fuzzy suffix (~) appended.
- *
- * @param field Name of the field query will use.
- * @param termStr Term token to use for building term for the query
- *
- * @return Resulting {@link org.apache.lucene.search.Query} built for the term
- * @exception org.apache.lucene.queryParser.ParseException throw in overridden method to disallow
- */
- protected Query getFuzzyQuery(String field, String termStr, float minSimilarity) throws ParseException
- {
- if (lowercaseExpandedTerms) {
- termStr = termStr.toLowerCase();
- }
- Term t = new Term(field, termStr);
- return newFuzzyQuery(t, minSimilarity, fuzzyPrefixLength);
- }
-
-
- // extracted from the .jj grammar
- Query handleBareTokenQuery(String qfield, Token term, Token fuzzySlop, boolean prefix, boolean wildcard, boolean fuzzy, boolean regexp) throws ParseException {
- Query q;
-
- String termImage=discardEscapeChar(term.image);
- if (wildcard) {
- q = getWildcardQuery(qfield, term.image);
- } else if (prefix) {
- q = getPrefixQuery(qfield,
- discardEscapeChar(term.image.substring
- (0, term.image.length()-1)));
- } else if (regexp) {
- q = getRegexpQuery(qfield, term.image.substring(1, term.image.length()-1));
- } else if (fuzzy) {
- float fms = fuzzyMinSim;
- try {
- fms = Float.valueOf(fuzzySlop.image.substring(1)).floatValue();
- } catch (Exception ignored) { }
- if(fms < 0.0f){
- throw new ParseException("Minimum similarity for a FuzzyQuery has to be between 0.0f and 1.0f !");
- } else if (fms >= 1.0f && fms != (int) fms) {
- throw new ParseException("Fractional edit distances are not allowed!");
- }
- q = getFuzzyQuery(qfield, termImage, fms);
- } else {
- q = getFieldQuery(qfield, termImage, false);
- }
- return q;
- }
-
- // extracted from the .jj grammar
- Query handleQuotedTerm(String qfield, Token term, Token fuzzySlop) throws ParseException {
- int s = phraseSlop; // default
- if (fuzzySlop != null) {
- try {
- s = Float.valueOf(fuzzySlop.image.substring(1)).intValue();
- }
- catch (Exception ignored) { }
- }
- return getFieldQuery(qfield, discardEscapeChar(term.image.substring(1, term.image.length()-1)), s);
- }
-
- // extracted from the .jj grammar
- Query handleBoost(Query q, Token boost) throws ParseException {
- if (boost != null) {
- float f = (float) 1.0;
- try {
- f = Float.valueOf(boost.image).floatValue();
- }
- catch (Exception ignored) {
- /* Should this be handled somehow? (defaults to "no boost", if
- * boost number is invalid)
- */
- }
-
- // avoid boosting null queries, such as those caused by stop words
- if (q != null) {
- q.setBoost(f);
- }
- }
- return q;
- }
-
-
-
- /**
- * Returns a String where the escape char has been
- * removed, or kept only once if there was a double escape.
- *
- * Supports escaped unicode characters, e. g. translates
- * \\u0041 to A.
- *
- */
- String discardEscapeChar(String input) throws ParseException {
- // Create char array to hold unescaped char sequence
- char[] output = new char[input.length()];
-
- // The length of the output can be less than the input
- // due to discarded escape chars. This variable holds
- // the actual length of the output
- int length = 0;
-
- // We remember whether the last processed character was
- // an escape character
- boolean lastCharWasEscapeChar = false;
-
- // The multiplier the current unicode digit must be multiplied with.
- // E. g. the first digit must be multiplied with 16^3, the second with 16^2...
- int codePointMultiplier = 0;
-
- // Used to calculate the codepoint of the escaped unicode character
- int codePoint = 0;
-
- for (int i = 0; i < input.length(); i++) {
- char curChar = input.charAt(i);
- if (codePointMultiplier > 0) {
- codePoint += hexToInt(curChar) * codePointMultiplier;
- codePointMultiplier >>>= 4;
- if (codePointMultiplier == 0) {
- output[length++] = (char)codePoint;
- codePoint = 0;
- }
- } else if (lastCharWasEscapeChar) {
- if (curChar == 'u') {
- // found an escaped unicode character
- codePointMultiplier = 16 * 16 * 16;
- } else {
- // this character was escaped
- output[length] = curChar;
- length++;
- }
- lastCharWasEscapeChar = false;
- } else {
- if (curChar == '\\') {
- lastCharWasEscapeChar = true;
- } else {
- output[length] = curChar;
- length++;
- }
- }
- }
-
- if (codePointMultiplier > 0) {
- throw new ParseException("Truncated unicode escape sequence.");
- }
-
- if (lastCharWasEscapeChar) {
- throw new ParseException("Term can not end with escape character.");
- }
-
- return new String(output, 0, length);
- }
-
- /** Returns the numeric value of the hexadecimal character */
- static final int hexToInt(char c) throws ParseException {
- if ('0' <= c && c <= '9') {
- return c - '0';
- } else if ('a' <= c && c <= 'f'){
- return c - 'a' + 10;
- } else if ('A' <= c && c <= 'F') {
- return c - 'A' + 10;
- } else {
- throw new ParseException("None-hex character in unicode escape sequence: " + c);
- }
- }
-
- /**
- * Returns a String where those characters that QueryParser
- * expects to be escaped are escaped by a preceding \.
- */
- public static String escape(String s) {
- StringBuilder sb = new StringBuilder();
- for (int i = 0; i < s.length(); i++) {
- char c = s.charAt(i);
- // These characters are part of the query syntax and must be escaped
- if (c == '\\' || c == '+' || c == '-' || c == '!' || c == '(' || c == ')' || c == ':'
- || c == '^' || c == '[' || c == ']' || c == '\"' || c == '{' || c == '}' || c == '~'
- || c == '*' || c == '?' || c == '|' || c == '&') {
- sb.append('\\');
- }
- sb.append(c);
- }
- return sb.toString();
- }
-
-}
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.lucene.queryParser;
+
+import java.io.IOException;
+import java.io.StringReader;
+import java.text.Collator;
+import java.text.DateFormat;
+import java.util.*;
+
+import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.analysis.CachingTokenFilter;
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
+import org.apache.lucene.analysis.tokenattributes.TermToBytesRefAttribute;
+import org.apache.lucene.document.DateTools;
+import org.apache.lucene.index.Term;
+import org.apache.lucene.queryParser.QueryParser.Operator;
+import org.apache.lucene.search.*;
+import org.apache.lucene.util.BytesRef;
+import org.apache.lucene.util.Version;
+
+/** This class is overridden by QueryParser in QueryParser.jj
+ * and acts to separate the majority of the Java code from the .jj grammar file.
+ */
+public abstract class QueryParserBase {
+
+ /** Do not catch this exception in your code, it means you are using methods that you should no longer use. */
+ public static class MethodRemovedUseAnother extends Throwable {}
+
+ static final int CONJ_NONE = 0;
+ static final int CONJ_AND = 1;
+ static final int CONJ_OR = 2;
+
+ static final int MOD_NONE = 0;
+ static final int MOD_NOT = 10;
+ static final int MOD_REQ = 11;
+
+ // make it possible to call setDefaultOperator() without accessing
+ // the nested class:
+ /** Alternative form of QueryParser.Operator.AND */
+ public static final Operator AND_OPERATOR = Operator.AND;
+ /** Alternative form of QueryParser.Operator.OR */
+ public static final Operator OR_OPERATOR = Operator.OR;
+
+ /** The actual operator that parser uses to combine query terms */
+ Operator operator = OR_OPERATOR;
+
+ boolean lowercaseExpandedTerms = true;
+ MultiTermQuery.RewriteMethod multiTermRewriteMethod = MultiTermQuery.CONSTANT_SCORE_AUTO_REWRITE_DEFAULT;
+ boolean allowLeadingWildcard = false;
+ boolean enablePositionIncrements = true;
+
+ Analyzer analyzer;
+ String field;
+ int phraseSlop = 0;
+ float fuzzyMinSim = FuzzyQuery.defaultMinSimilarity;
+ int fuzzyPrefixLength = FuzzyQuery.defaultPrefixLength;
+ Locale locale = Locale.getDefault();
+
+ // the default date resolution
+ DateTools.Resolution dateResolution = null;
+ // maps field names to date resolutions
+ Map fieldToDateResolution = null;
+
+ // The collator to use when determining range inclusion,
+ // for use when constructing RangeQuerys.
+ Collator rangeCollator = null;
+
+ boolean autoGeneratePhraseQueries;
+
+ // So the generated QueryParser(CharStream) won't error out
+ protected QueryParserBase() {
+ }
+
+ /** Initializes a query parser. Called by the QueryParser constructor
+ * @param matchVersion Lucene version to match. See above.
+ * @param f the default field for query terms.
+ * @param a used to find terms in the query text.
+ */
+ public void init(Version matchVersion, String f, Analyzer a) {
+ analyzer = a;
+ field = f;
+ if (matchVersion.onOrAfter(Version.LUCENE_31)) {
+ setAutoGeneratePhraseQueries(false);
+ } else {
+ setAutoGeneratePhraseQueries(true);
+ }
+ }
+
+ // the generated parser will create these in QueryParser
+ public abstract void ReInit(CharStream stream);
+ public abstract Query TopLevelQuery(String field) throws ParseException;
+
+
+ /** Parses a query string, returning a {@link org.apache.lucene.search.Query}.
+ * @param query the query string to be parsed.
+ * @throws ParseException if the parsing fails
+ */
+ public Query parse(String query) throws ParseException {
+ ReInit(new FastCharStream(new StringReader(query)));
+ try {
+ // TopLevelQuery is a Query followed by the end-of-input (EOF)
+ Query res = TopLevelQuery(field);
+ return res!=null ? res : newBooleanQuery(false);
+ }
+ catch (ParseException tme) {
+ // rethrow to include the original query:
+ ParseException e = new ParseException("Cannot parse '" +query+ "': " + tme.getMessage());
+ e.initCause(tme);
+ throw e;
+ }
+ catch (TokenMgrError tme) {
+ ParseException e = new ParseException("Cannot parse '" +query+ "': " + tme.getMessage());
+ e.initCause(tme);
+ throw e;
+ }
+ catch (BooleanQuery.TooManyClauses tmc) {
+ ParseException e = new ParseException("Cannot parse '" +query+ "': too many boolean clauses");
+ e.initCause(tmc);
+ throw e;
+ }
+ }
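+
+ /*
+ * A minimal usage sketch for parse(), assuming the concrete QueryParser
+ * subclass exposes a (Version, String, Analyzer) constructor that delegates
+ * to init(), and that an analyzer such as StandardAnalyzer is available:
+ *
+ * QueryParser qp = new QueryParser(Version.LUCENE_31, "contents",
+ * new StandardAnalyzer(Version.LUCENE_31));
+ * Query q = qp.parse("+lucene +(query OR parser)");
+ */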
+
+
+ /**
+ * @return Returns the analyzer.
+ */
+ public Analyzer getAnalyzer() {
+ return analyzer;
+ }
+
+ /**
+ * @return Returns the default field.
+ */
+ public String getField() {
+ return field;
+ }
+
+ /**
+ * @see #setAutoGeneratePhraseQueries(boolean)
+ */
+ public final boolean getAutoGeneratePhraseQueries() {
+ return autoGeneratePhraseQueries;
+ }
+
+ /**
+ * Set to true if phrase queries will be automatically generated
+ * when the analyzer returns more than one term from whitespace
+ * delimited text.
+ * NOTE: this behavior may not be suitable for all languages.
+ *
+ * Set to false if phrase queries should only be generated when
+ * surrounded by double quotes.
+ */
+ public final void setAutoGeneratePhraseQueries(boolean value) {
+ this.autoGeneratePhraseQueries = value;
+ }
+
+ /**
+ * Get the minimal similarity for fuzzy queries.
+ */
+ public float getFuzzyMinSim() {
+ return fuzzyMinSim;
+ }
+
+ /**
+ * Set the minimum similarity for fuzzy queries.
+ * Default is 2f.
+ */
+ public void setFuzzyMinSim(float fuzzyMinSim) {
+ this.fuzzyMinSim = fuzzyMinSim;
+ }
+
+ /**
+ * Get the prefix length for fuzzy queries.
+ * @return Returns the fuzzyPrefixLength.
+ */
+ public int getFuzzyPrefixLength() {
+ return fuzzyPrefixLength;
+ }
+
+ /**
+ * Set the prefix length for fuzzy queries. Default is 0.
+ * @param fuzzyPrefixLength The fuzzyPrefixLength to set.
+ */
+ public void setFuzzyPrefixLength(int fuzzyPrefixLength) {
+ this.fuzzyPrefixLength = fuzzyPrefixLength;
+ }
+
+ /**
+ * Sets the default slop for phrases. If zero, then exact phrase matches
+ * are required. Default value is zero.
+ */
+ public void setPhraseSlop(int phraseSlop) {
+ this.phraseSlop = phraseSlop;
+ }
+
+ /**
+ * Gets the default slop for phrases.
+ */
+ public int getPhraseSlop() {
+ return phraseSlop;
+ }
+
+
+ /**
+ * Set to true to allow leading wildcard characters.
+ *
+ * When set, * or ? are allowed as
+ * the first character of a PrefixQuery and WildcardQuery.
+ * Note that this can produce very slow
+ * queries on big indexes.
+ *
+ * Default: false.
+ */
+ public void setAllowLeadingWildcard(boolean allowLeadingWildcard) {
+ this.allowLeadingWildcard = allowLeadingWildcard;
+ }
+
+ /**
+ * @see #setAllowLeadingWildcard(boolean)
+ */
+ public boolean getAllowLeadingWildcard() {
+ return allowLeadingWildcard;
+ }
+
+ /**
+ * Set to true to enable position increments in result query.
+ *
+ * When set, result phrase and multi-phrase queries will
+ * be aware of position increments.
+ * Useful when e.g. a StopFilter increases the position increment of
+ * the token that follows an omitted token.
+ *
+ * Default: true.
+ */
+ public void setEnablePositionIncrements(boolean enable) {
+ this.enablePositionIncrements = enable;
+ }
+
+ /**
+ * @see #setEnablePositionIncrements(boolean)
+ */
+ public boolean getEnablePositionIncrements() {
+ return enablePositionIncrements;
+ }
+
+ /**
+ * Sets the boolean operator of the QueryParser.
+ * In default mode (OR_OPERATOR) terms without any modifiers
+ * are considered optional: for example capital of Hungary is equal to
+ * capital OR of OR Hungary.
+ * In AND_OPERATOR mode terms are considered to be in conjunction: the
+ * above mentioned query is parsed as capital AND of AND Hungary
+ */
+ public void setDefaultOperator(Operator op) {
+ this.operator = op;
+ }
+
+
+ /**
+ * Gets implicit operator setting, which will be either AND_OPERATOR
+ * or OR_OPERATOR.
+ */
+ public Operator getDefaultOperator() {
+ return operator;
+ }
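+
+ /*
+ * Illustration of the operator setting described above (a sketch; "parser"
+ * is assumed to be an initialized QueryParser subclass):
+ *
+ * parser.setDefaultOperator(QueryParserBase.AND_OPERATOR);
+ * // "capital of Hungary" now parses like "capital AND of AND Hungary"
+ * parser.setDefaultOperator(QueryParserBase.OR_OPERATOR);
+ * // back to the default: "capital OR of OR Hungary"
+ */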
+
+
+ /**
+ * Whether terms of wildcard, prefix, fuzzy and range queries are to be automatically
+ * lower-cased or not. Default is true.
+ */
+ public void setLowercaseExpandedTerms(boolean lowercaseExpandedTerms) {
+ this.lowercaseExpandedTerms = lowercaseExpandedTerms;
+ }
+
+
+ /**
+ * @see #setLowercaseExpandedTerms(boolean)
+ */
+ public boolean getLowercaseExpandedTerms() {
+ return lowercaseExpandedTerms;
+ }
+
+ /**
+ * By default QueryParser uses {@link org.apache.lucene.search.MultiTermQuery#CONSTANT_SCORE_AUTO_REWRITE_DEFAULT}
+ * when creating a PrefixQuery, WildcardQuery or RangeQuery. This implementation is generally preferable because it
+ * a) Runs faster b) Does not have the scarcity of terms unduly influence score
+ * c) avoids any "TooManyBooleanClauses" exception.
+ * However, if your application really needs to use the
+ * old-fashioned BooleanQuery expansion rewriting and the above
+ * points are not relevant then use this to change
+ * the rewrite method.
+ */
+ public void setMultiTermRewriteMethod(MultiTermQuery.RewriteMethod method) {
+ multiTermRewriteMethod = method;
+ }
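+
+ /*
+ * Sketch of opting back into BooleanQuery expansion, as discussed above
+ * (assumes MultiTermQuery.SCORING_BOOLEAN_QUERY_REWRITE names the scoring
+ * rewrite constant; "parser" is an initialized QueryParser subclass):
+ *
+ * parser.setMultiTermRewriteMethod(MultiTermQuery.SCORING_BOOLEAN_QUERY_REWRITE);
+ */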
+
+
+ /**
+ * @see #setMultiTermRewriteMethod
+ */
+ public MultiTermQuery.RewriteMethod getMultiTermRewriteMethod() {
+ return multiTermRewriteMethod;
+ }
+
+ /**
+ * Set locale used by date range parsing.
+ */
+ public void setLocale(Locale locale) {
+ this.locale = locale;
+ }
+
+ /**
+ * Returns current locale, allowing access by subclasses.
+ */
+ public Locale getLocale() {
+ return locale;
+ }
+
+ /**
+ * Sets the default date resolution used by RangeQueries for fields for which no
+ * specific date resolutions has been set. Field specific resolutions can be set
+ * with {@link #setDateResolution(String, org.apache.lucene.document.DateTools.Resolution)}.
+ *
+ * @param dateResolution the default date resolution to set
+ */
+ public void setDateResolution(DateTools.Resolution dateResolution) {
+ this.dateResolution = dateResolution;
+ }
+
+ /**
+ * Sets the date resolution used by RangeQueries for a specific field.
+ *
+ * @param fieldName field for which the date resolution is to be set
+ * @param dateResolution date resolution to set
+ */
+ public void setDateResolution(String fieldName, DateTools.Resolution dateResolution) {
+ if (fieldName == null) {
+ throw new IllegalArgumentException("Field cannot be null.");
+ }
+
+ if (fieldToDateResolution == null) {
+ // lazily initialize HashMap
+ fieldToDateResolution = new HashMap();
+ }
+
+ fieldToDateResolution.put(fieldName, dateResolution);
+ }
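+
+ /*
+ * Example of combining the two setters above (a sketch; "parser" is an
+ * initialized QueryParser subclass and "modified" is only an illustrative
+ * field name):
+ *
+ * parser.setDateResolution(DateTools.Resolution.DAY); // default for all fields
+ * parser.setDateResolution("modified", DateTools.Resolution.HOUR); // field-specific override
+ */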
+
+ /**
+ * Returns the date resolution that is used by RangeQueries for the given field.
+ * Returns null, if no default or field specific date resolution has been set
+ * for the given field.
+ *
+ */
+ public DateTools.Resolution getDateResolution(String fieldName) {
+ if (fieldName == null) {
+ throw new IllegalArgumentException("Field cannot be null.");
+ }
+
+ if (fieldToDateResolution == null) {
+ // no field specific date resolutions set; return default date resolution instead
+ return this.dateResolution;
+ }
+
+ DateTools.Resolution resolution = fieldToDateResolution.get(fieldName);
+ if (resolution == null) {
+ // no date resolutions set for the given field; return default date resolution instead
+ resolution = this.dateResolution;
+ }
+
+ return resolution;
+ }
+
+ /**
+ * Sets the collator used to determine index term inclusion in ranges
+ * for RangeQuerys.
+ *
+ * WARNING: Setting the rangeCollator to a non-null
+ * collator using this method will cause every single index Term in the
+ * Field referenced by lowerTerm and/or upperTerm to be examined.
+ * Depending on the number of index Terms in this Field, the operation could
+ * be very slow.
+ *
+ * @param rc the collator to use when constructing RangeQuerys
+ */
+ public void setRangeCollator(Collator rc) {
+ rangeCollator = rc;
+ }
+
+ /**
+ * @return the collator used to determine index term inclusion in ranges
+ * for RangeQuerys.
+ */
+ public Collator getRangeCollator() {
+ return rangeCollator;
+ }
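+
+ /*
+ * Sketch of enabling collated range comparisons (see the performance warning
+ * on setRangeCollator above); Collator here is java.text.Collator and "parser"
+ * is an initialized QueryParser subclass:
+ *
+ * parser.setRangeCollator(Collator.getInstance(new Locale("tr", "TR")));
+ */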
+
+ protected void addClause(List clauses, int conj, int mods, Query q) {
+ boolean required, prohibited;
+
+ // If this term is introduced by AND, make the preceding term required,
+ // unless it's already prohibited
+ if (clauses.size() > 0 && conj == CONJ_AND) {
+ BooleanClause c = clauses.get(clauses.size()-1);
+ if (!c.isProhibited())
+ c.setOccur(BooleanClause.Occur.MUST);
+ }
+
+ if (clauses.size() > 0 && operator == AND_OPERATOR && conj == CONJ_OR) {
+ // If this term is introduced by OR, make the preceding term optional,
+ // unless it's prohibited (that means we leave -a OR b but +a OR b-->a OR b)
+ // notice if the input is a OR b, first term is parsed as required; without
+ // this modification a OR b would parsed as +a OR b
+ BooleanClause c = clauses.get(clauses.size()-1);
+ if (!c.isProhibited())
+ c.setOccur(BooleanClause.Occur.SHOULD);
+ }
+
+ // We might have been passed a null query; the term might have been
+ // filtered away by the analyzer.
+ if (q == null)
+ return;
+
+ if (operator == OR_OPERATOR) {
+ // We set REQUIRED if we're introduced by AND or +; PROHIBITED if
+ // introduced by NOT or -; make sure not to set both.
+ prohibited = (mods == MOD_NOT);
+ required = (mods == MOD_REQ);
+ if (conj == CONJ_AND && !prohibited) {
+ required = true;
+ }
+ } else {
+ // We set PROHIBITED if we're introduced by NOT or -; We set REQUIRED
+ // if not PROHIBITED and not introduced by OR
+ prohibited = (mods == MOD_NOT);
+ required = (!prohibited && conj != CONJ_OR);
+ }
+ if (required && !prohibited)
+ clauses.add(newBooleanClause(q, BooleanClause.Occur.MUST));
+ else if (!required && !prohibited)
+ clauses.add(newBooleanClause(q, BooleanClause.Occur.SHOULD));
+ else if (!required && prohibited)
+ clauses.add(newBooleanClause(q, BooleanClause.Occur.MUST_NOT));
+ else
+ throw new RuntimeException("Clause cannot be both required and prohibited");
+ }
+
+ /**
+ * @exception org.apache.lucene.queryParser.ParseException throw in overridden method to disallow
+ */
+ protected Query getFieldQuery(String field, String queryText, boolean quoted) throws ParseException {
+ // Use the analyzer to get all the tokens, and then build a TermQuery,
+ // PhraseQuery, or nothing based on the term count
+
+ TokenStream source;
+ try {
+ source = analyzer.reusableTokenStream(field, new StringReader(queryText));
+ source.reset();
+ } catch (IOException e) {
+ source = analyzer.tokenStream(field, new StringReader(queryText));
+ }
+ CachingTokenFilter buffer = new CachingTokenFilter(source);
+ TermToBytesRefAttribute termAtt = null;
+ PositionIncrementAttribute posIncrAtt = null;
+ int numTokens = 0;
+
+ boolean success = false;
+ try {
+ buffer.reset();
+ success = true;
+ } catch (IOException e) {
+ // success==false if we hit an exception
+ }
+ if (success) {
+ if (buffer.hasAttribute(TermToBytesRefAttribute.class)) {
+ termAtt = buffer.getAttribute(TermToBytesRefAttribute.class);
+ }
+ if (buffer.hasAttribute(PositionIncrementAttribute.class)) {
+ posIncrAtt = buffer.getAttribute(PositionIncrementAttribute.class);
+ }
+ }
+
+ int positionCount = 0;
+ boolean severalTokensAtSamePosition = false;
+
+ boolean hasMoreTokens = false;
+ if (termAtt != null) {
+ try {
+ hasMoreTokens = buffer.incrementToken();
+ while (hasMoreTokens) {
+ numTokens++;
+ int positionIncrement = (posIncrAtt != null) ? posIncrAtt.getPositionIncrement() : 1;
+ if (positionIncrement != 0) {
+ positionCount += positionIncrement;
+ } else {
+ severalTokensAtSamePosition = true;
+ }
+ hasMoreTokens = buffer.incrementToken();
+ }
+ } catch (IOException e) {
+ // ignore
+ }
+ }
+ try {
+ // rewind the buffer stream
+ buffer.reset();
+
+ // close original stream - all tokens buffered
+ source.close();
+ }
+ catch (IOException e) {
+ // ignore
+ }
+
+ if (numTokens == 0)
+ return null;
+ else if (numTokens == 1) {
+ BytesRef term = new BytesRef();
+ try {
+ boolean hasNext = buffer.incrementToken();
+ assert hasNext == true;
+ termAtt.toBytesRef(term);
+ } catch (IOException e) {
+ // safe to ignore, because we know the number of tokens
+ }
+ return newTermQuery(new Term(field, term));
+ } else {
+ if (severalTokensAtSamePosition || (!quoted && !autoGeneratePhraseQueries)) {
+ if (positionCount == 1 || (!quoted && !autoGeneratePhraseQueries)) {
+ // no phrase query:
+ BooleanQuery q = newBooleanQuery(positionCount == 1);
+
+ BooleanClause.Occur occur = positionCount > 1 && operator == AND_OPERATOR ?
+ BooleanClause.Occur.MUST : BooleanClause.Occur.SHOULD;
+
+ for (int i = 0; i < numTokens; i++) {
+ BytesRef term = new BytesRef();
+ try {
+ boolean hasNext = buffer.incrementToken();
+ assert hasNext == true;
+ termAtt.toBytesRef(term);
+ } catch (IOException e) {
+ // safe to ignore, because we know the number of tokens
+ }
+
+ Query currentQuery = newTermQuery(
+ new Term(field, term));
+ q.add(currentQuery, occur);
+ }
+ return q;
+ }
+ else {
+ // phrase query:
+ MultiPhraseQuery mpq = newMultiPhraseQuery();
+ mpq.setSlop(phraseSlop);
+ List multiTerms = new ArrayList();
+ int position = -1;
+ for (int i = 0; i < numTokens; i++) {
+ BytesRef term = new BytesRef();
+ int positionIncrement = 1;
+ try {
+ boolean hasNext = buffer.incrementToken();
+ assert hasNext == true;
+ termAtt.toBytesRef(term);
+ if (posIncrAtt != null) {
+ positionIncrement = posIncrAtt.getPositionIncrement();
+ }
+ } catch (IOException e) {
+ // safe to ignore, because we know the number of tokens
+ }
+
+ if (positionIncrement > 0 && multiTerms.size() > 0) {
+ if (enablePositionIncrements) {
+ mpq.add(multiTerms.toArray(new Term[0]),position);
+ } else {
+ mpq.add(multiTerms.toArray(new Term[0]));
+ }
+ multiTerms.clear();
+ }
+ position += positionIncrement;
+ multiTerms.add(new Term(field, term));
+ }
+ if (enablePositionIncrements) {
+ mpq.add(multiTerms.toArray(new Term[0]),position);
+ } else {
+ mpq.add(multiTerms.toArray(new Term[0]));
+ }
+ return mpq;
+ }
+ }
+ else {
+ PhraseQuery pq = newPhraseQuery();
+ pq.setSlop(phraseSlop);
+ int position = -1;
+
+
+ for (int i = 0; i < numTokens; i++) {
+ BytesRef term = new BytesRef();
+ int positionIncrement = 1;
+
+ try {
+ boolean hasNext = buffer.incrementToken();
+ assert hasNext == true;
+ termAtt.toBytesRef(term);
+ if (posIncrAtt != null) {
+ positionIncrement = posIncrAtt.getPositionIncrement();
+ }
+ } catch (IOException e) {
+ // safe to ignore, because we know the number of tokens
+ }
+
+ if (enablePositionIncrements) {
+ position += positionIncrement;
+ pq.add(new Term(field, term),position);
+ } else {
+ pq.add(new Term(field, term));
+ }
+ }
+ return pq;
+ }
+ }
+ }
+
+
+
+ /**
+ * Base implementation delegates to {@link #getFieldQuery(String,String,boolean)}.
+ * This method may be overridden, for example, to return
+ * a SpanNearQuery instead of a PhraseQuery.
+ *
+ * @exception org.apache.lucene.queryParser.ParseException throw in overridden method to disallow
+ */
+ protected Query getFieldQuery(String field, String queryText, int slop)
+ throws ParseException {
+ Query query = getFieldQuery(field, queryText, true);
+
+ if (query instanceof PhraseQuery) {
+ ((PhraseQuery) query).setSlop(slop);
+ }
+ if (query instanceof MultiPhraseQuery) {
+ ((MultiPhraseQuery) query).setSlop(slop);
+ }
+
+ return query;
+ }
+
+ /**
+ *
+ * @exception org.apache.lucene.queryParser.ParseException
+ */
+ protected Query getRangeQuery(String field,
+ String part1,
+ String part2,
+ boolean startInclusive,
+ boolean endInclusive) throws ParseException
+ {
+ if (lowercaseExpandedTerms) {
+ part1 = part1==null ? null : part1.toLowerCase();
+ part2 = part2==null ? null : part2.toLowerCase();
+ }
+
+
+ DateFormat df = DateFormat.getDateInstance(DateFormat.SHORT, locale);
+ df.setLenient(true);
+ DateTools.Resolution resolution = getDateResolution(field);
+
+ try {
+ part1 = DateTools.dateToString(df.parse(part1), resolution);
+ } catch (Exception e) { }
+
+ try {
+ Date d2 = df.parse(part2);
+ if (endInclusive) {
+ // The user can only specify the date, not the time, so make sure
+ // the time is set to the latest possible time of that date to really
+ // include all documents:
+ Calendar cal = Calendar.getInstance(locale);
+ cal.setTime(d2);
+ cal.set(Calendar.HOUR_OF_DAY, 23);
+ cal.set(Calendar.MINUTE, 59);
+ cal.set(Calendar.SECOND, 59);
+ cal.set(Calendar.MILLISECOND, 999);
+ d2 = cal.getTime();
+ }
+ part2 = DateTools.dateToString(d2, resolution);
+ } catch (Exception e) { }
+
+ return newRangeQuery(field, part1, part2, startInclusive, endInclusive);
+ }
+
+ /**
+ * Builds a new BooleanQuery instance
+ * @param disableCoord disable coord
+ * @return new BooleanQuery instance
+ */
+ protected BooleanQuery newBooleanQuery(boolean disableCoord) {
+ return new BooleanQuery(disableCoord);
+ }
+
+ /**
+ * Builds a new BooleanClause instance
+ * @param q sub query
+ * @param occur how this clause should occur when matching documents
+ * @return new BooleanClause instance
+ */
+ protected BooleanClause newBooleanClause(Query q, BooleanClause.Occur occur) {
+ return new BooleanClause(q, occur);
+ }
+
+ /**
+ * Builds a new TermQuery instance
+ * @param term term
+ * @return new TermQuery instance
+ */
+ protected Query newTermQuery(Term term){
+ return new TermQuery(term);
+ }
+
+ /**
+ * Builds a new PhraseQuery instance
+ * @return new PhraseQuery instance
+ */
+ protected PhraseQuery newPhraseQuery(){
+ return new PhraseQuery();
+ }
+
+ /**
+ * Builds a new MultiPhraseQuery instance
+ * @return new MultiPhraseQuery instance
+ */
+ protected MultiPhraseQuery newMultiPhraseQuery(){
+ return new MultiPhraseQuery();
+ }
+
+ /**
+ * Builds a new PrefixQuery instance
+ * @param prefix Prefix term
+ * @return new PrefixQuery instance
+ */
+ protected Query newPrefixQuery(Term prefix){
+ PrefixQuery query = new PrefixQuery(prefix);
+ query.setRewriteMethod(multiTermRewriteMethod);
+ return query;
+ }
+
+ /**
+ * Builds a new RegexpQuery instance
+ * @param regexp Regexp term
+ * @return new RegexpQuery instance
+ */
+ protected Query newRegexpQuery(Term regexp) {
+ RegexpQuery query = new RegexpQuery(regexp);
+ query.setRewriteMethod(multiTermRewriteMethod);
+ return query;
+ }
+
+ /**
+ * Builds a new FuzzyQuery instance
+ * @param term Term
+ * @param minimumSimilarity minimum similarity
+ * @param prefixLength prefix length
+ * @return new FuzzyQuery Instance
+ */
+ protected Query newFuzzyQuery(Term term, float minimumSimilarity, int prefixLength) {
+ // FuzzyQuery doesn't yet allow constant score rewrite
+ return new FuzzyQuery(term,minimumSimilarity,prefixLength);
+ }
+
+ /**
+ * Builds a new TermRangeQuery instance
+ * @param field Field
+ * @param part1 min
+ * @param part2 max
+ * @param startInclusive true if the start of the range is inclusive
+ * @param endInclusive true if the end of the range is inclusive
+ * @return new TermRangeQuery instance
+ */
+ protected Query newRangeQuery(String field, String part1, String part2, boolean startInclusive, boolean endInclusive) {
+ final TermRangeQuery query = new TermRangeQuery(field, part1, part2, startInclusive, endInclusive, rangeCollator);
+ query.setRewriteMethod(multiTermRewriteMethod);
+ return query;
+ }
+
+ /**
+ * Builds a new MatchAllDocsQuery instance
+ * @return new MatchAllDocsQuery instance
+ */
+ protected Query newMatchAllDocsQuery() {
+ return new MatchAllDocsQuery();
+ }
+
+ /**
+ * Builds a new WildcardQuery instance
+ * @param t wildcard term
+ * @return new WildcardQuery instance
+ */
+ protected Query newWildcardQuery(Term t) {
+ WildcardQuery query = new WildcardQuery(t);
+ query.setRewriteMethod(multiTermRewriteMethod);
+ return query;
+ }
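+
+ /*
+ * The new*Query methods above are the factory extension points. A sketch of a
+ * subclass customizing term queries (the 2.0f boost and the constructor shape
+ * are only illustrative):
+ *
+ * QueryParser qp = new QueryParser(Version.LUCENE_31, "contents", analyzer) {
+ *   @Override
+ *   protected Query newTermQuery(Term term) {
+ *     Query q = super.newTermQuery(term);
+ *     q.setBoost(2.0f);
+ *     return q;
+ *   }
+ * };
+ */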
+
+ /**
+ * Factory method for generating query, given a set of clauses.
+ * By default creates a boolean query composed of clauses passed in.
+ *
+ * Can be overridden by extending classes, to modify query being
+ * returned.
+ *
+ * @param clauses List that contains {@link org.apache.lucene.search.BooleanClause} instances
+ * to join.
+ *
+ * @return Resulting {@link org.apache.lucene.search.Query} object.
+ * @exception org.apache.lucene.queryParser.ParseException throw in overridden method to disallow
+ */
+ protected Query getBooleanQuery(List clauses) throws ParseException {
+ return getBooleanQuery(clauses, false);
+ }
+
+ /**
+ * Factory method for generating query, given a set of clauses.
+ * By default creates a boolean query composed of clauses passed in.
+ *
+ * Can be overridden by extending classes, to modify query being
+ * returned.
+ *
+ * @param clauses List that contains {@link org.apache.lucene.search.BooleanClause} instances
+ * to join.
+ * @param disableCoord true if coord scoring should be disabled.
+ *
+ * @return Resulting {@link org.apache.lucene.search.Query} object.
+ * @exception org.apache.lucene.queryParser.ParseException throw in overridden method to disallow
+ */
+ protected Query getBooleanQuery(List clauses, boolean disableCoord)
+ throws ParseException
+ {
+ if (clauses.size()==0) {
+ return null; // all clause words were filtered away by the analyzer.
+ }
+ BooleanQuery query = newBooleanQuery(disableCoord);
+ for(final BooleanClause clause: clauses) {
+ query.add(clause);
+ }
+ return query;
+ }
+
+ /**
+ * Factory method for generating a query. Called when parser
+ * parses an input term token that contains one or more wildcard
+ * characters (? and *), but is not a prefix term token (one
+ * that has just a single * character at the end)
+ *
+ * Depending on settings, prefix term may be lower-cased
+ * automatically. It will not go through the default Analyzer,
+ * however, since normal Analyzers are unlikely to work properly
+ * with wildcard templates.
+ *
+ * Can be overridden by extending classes, to provide custom handling for
+ * wildcard queries, which may be necessary due to missing analyzer calls.
+ *
+ * @param field Name of the field query will use.
+ * @param termStr Term token that contains one or more wild card
+ * characters (? or *), but is not simple prefix term
+ *
+ * @return Resulting {@link org.apache.lucene.search.Query} built for the term
+ * @exception org.apache.lucene.queryParser.ParseException throw in overridden method to disallow
+ */
+ protected Query getWildcardQuery(String field, String termStr) throws ParseException
+ {
+ if ("*".equals(field)) {
+ if ("*".equals(termStr)) return newMatchAllDocsQuery();
+ }
+ if (!allowLeadingWildcard && (termStr.startsWith("*") || termStr.startsWith("?")))
+ throw new ParseException("'*' or '?' not allowed as first character in WildcardQuery");
+ if (lowercaseExpandedTerms) {
+ termStr = termStr.toLowerCase();
+ }
+ Term t = new Term(field, termStr);
+ return newWildcardQuery(t);
+ }
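+
+ /*
+ * Behavior sketch for the wildcard handling above ("parser" is an initialized
+ * QueryParser subclass using this class's defaults):
+ *
+ * parser.parse("te?t*"); // WildcardQuery on the default field
+ * parser.parse("*:*"); // MatchAllDocsQuery
+ * parser.parse("*ring"); // ParseException unless setAllowLeadingWildcard(true)
+ */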
+
+ /**
+ * Factory method for generating a query. Called when parser
+ * parses an input term token that contains a regular expression
+ * query.
+ *
+ * Depending on settings, pattern term may be lower-cased
+ * automatically. It will not go through the default Analyzer,
+ * however, since normal Analyzers are unlikely to work properly
+ * with regular expression templates.
+ *
+ * Can be overridden by extending classes, to provide custom handling for
+ * regular expression queries, which may be necessary due to missing analyzer
+ * calls.
+ *
+ * @param field Name of the field query will use.
+ * @param termStr Term token that contains a regular expression
+ *
+ * @return Resulting {@link org.apache.lucene.search.Query} built for the term
+ * @exception org.apache.lucene.queryParser.ParseException throw in overridden method to disallow
+ */
+ protected Query getRegexpQuery(String field, String termStr) throws ParseException
+ {
+ if (lowercaseExpandedTerms) {
+ termStr = termStr.toLowerCase();
+ }
+ Term t = new Term(field, termStr);
+ return newRegexpQuery(t);
+ }
+
+ /**
+ * Factory method for generating a query (similar to
+ * {@link #getWildcardQuery}). Called when parser parses an input term
+ * token that uses prefix notation; that is, contains a single '*' wildcard
+ * character as its last character. Since this is a special case
+ * of generic wildcard term, and such a query can be optimized easily,
+ * this usually results in a different query object.
+ *
+ * Depending on settings, a prefix term may be lower-cased
+ * automatically. It will not go through the default Analyzer,
+ * however, since normal Analyzers are unlikely to work properly
+ * with wildcard templates.
+ *
+ * Can be overridden by extending classes, to provide custom handling for
+ * wild card queries, which may be necessary due to missing analyzer calls.
+ *
+ * @param field Name of the field query will use.
+ * @param termStr Term token to use for building term for the query
+ * (without trailing '*' character!)
+ *
+ * @return Resulting {@link org.apache.lucene.search.Query} built for the term
+ * @exception org.apache.lucene.queryParser.ParseException throw in overridden method to disallow
+ */
+ protected Query getPrefixQuery(String field, String termStr) throws ParseException
+ {
+ if (!allowLeadingWildcard && termStr.startsWith("*"))
+ throw new ParseException("'*' not allowed as first character in PrefixQuery");
+ if (lowercaseExpandedTerms) {
+ termStr = termStr.toLowerCase();
+ }
+ Term t = new Term(field, termStr);
+ return newPrefixQuery(t);
+ }
+
+ /**
+ * Factory method for generating a query (similar to
+ * {@link #getWildcardQuery}). Called when parser parses
+ * an input term token that has the fuzzy suffix (~) appended.
+ *
+ * @param field Name of the field query will use.
+ * @param termStr Term token to use for building term for the query
+ *
+ * @return Resulting {@link org.apache.lucene.search.Query} built for the term
+ * @exception org.apache.lucene.queryParser.ParseException throw in overridden method to disallow
+ */
+ protected Query getFuzzyQuery(String field, String termStr, float minSimilarity) throws ParseException
+ {
+ if (lowercaseExpandedTerms) {
+ termStr = termStr.toLowerCase();
+ }
+ Term t = new Term(field, termStr);
+ return newFuzzyQuery(t, minSimilarity, fuzzyPrefixLength);
+ }
+
+
+ // extracted from the .jj grammar
+ Query handleBareTokenQuery(String qfield, Token term, Token fuzzySlop, boolean prefix, boolean wildcard, boolean fuzzy, boolean regexp) throws ParseException {
+ Query q;
+
+ String termImage=discardEscapeChar(term.image);
+ if (wildcard) {
+ q = getWildcardQuery(qfield, term.image);
+ } else if (prefix) {
+ q = getPrefixQuery(qfield,
+ discardEscapeChar(term.image.substring
+ (0, term.image.length()-1)));
+ } else if (regexp) {
+ q = getRegexpQuery(qfield, term.image.substring(1, term.image.length()-1));
+ } else if (fuzzy) {
+ float fms = fuzzyMinSim;
+ try {
+ fms = Float.valueOf(fuzzySlop.image.substring(1)).floatValue();
+ } catch (Exception ignored) { }
+ if(fms < 0.0f){
+ throw new ParseException("Minimum similarity for a FuzzyQuery has to be between 0.0f and 1.0f !");
+ } else if (fms >= 1.0f && fms != (int) fms) {
+ throw new ParseException("Fractional edit distances are not allowed!");
+ }
+ q = getFuzzyQuery(qfield, termImage, fms);
+ } else {
+ q = getFieldQuery(qfield, termImage, false);
+ }
+ return q;
+ }
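
To make the fuzzy-slop validation above concrete, a hedged sketch of inputs that pass and fail the checks in handleBareTokenQuery. Field name, analyzer, and version constant are illustrative:

```java
import org.apache.lucene.analysis.WhitespaceAnalyzer;
import org.apache.lucene.queryParser.ParseException;
import org.apache.lucene.queryParser.QueryParser;
import org.apache.lucene.util.Version;

class FuzzySlopDemo {
  static void demo() throws ParseException {
    QueryParser parser = new QueryParser(Version.LUCENE_40, "body",
        new WhitespaceAnalyzer(Version.LUCENE_40));
    parser.parse("roam~0.8");   // similarity in [0.0, 1.0) -> accepted
    parser.parse("roam~2");     // whole-number edit distance -> accepted
    try {
      parser.parse("roam~1.5"); // fractional edit distance -> rejected
    } catch (ParseException expected) {
      // "Fractional edit distances are not allowed!"
    }
  }
}
```
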
+
+ // extracted from the .jj grammar
+ Query handleQuotedTerm(String qfield, Token term, Token fuzzySlop) throws ParseException {
+ int s = phraseSlop; // default
+ if (fuzzySlop != null) {
+ try {
+ s = Float.valueOf(fuzzySlop.image.substring(1)).intValue();
+ }
+ catch (Exception ignored) { }
+ }
+ return getFieldQuery(qfield, discardEscapeChar(term.image.substring(1, term.image.length()-1)), s);
+ }
+
+ // extracted from the .jj grammar
+ Query handleBoost(Query q, Token boost) throws ParseException {
+ if (boost != null) {
+ float f = (float) 1.0;
+ try {
+ f = Float.valueOf(boost.image).floatValue();
+ }
+ catch (Exception ignored) {
+ /* Should this be handled somehow? (defaults to "no boost", if
+ * boost number is invalid)
+ */
+ }
+
+ // avoid boosting null queries, such as those caused by stop words
+ if (q != null) {
+ q.setBoost(f);
+ }
+ }
+ return q;
+ }
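
A short usage sketch of the boost handling above: a trailing ^<number> is applied to the resulting query via setBoost, and an unparsable boost silently falls back to no boost, per the comment in the code. Setup names are illustrative:

```java
import org.apache.lucene.analysis.WhitespaceAnalyzer;
import org.apache.lucene.queryParser.QueryParser;
import org.apache.lucene.search.Query;
import org.apache.lucene.util.Version;

class BoostDemo {
  static void demo() throws Exception {
    QueryParser parser = new QueryParser(Version.LUCENE_40, "body",
        new WhitespaceAnalyzer(Version.LUCENE_40));
    Query q = parser.parse("apache^2.5");
    assert q.getBoost() == 2.5f;   // handleBoost applied the trailing boost factor
  }
}
```
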
+
+
+
+ /**
+ * Returns a String where the escape char has been
+ * removed, or kept only once if there was a double escape.
+ *
+ * Supports escaped unicode characters, e. g. translates
+ * <code>\\u0041</code> to <code>A</code>.
+ *
+ */
+ String discardEscapeChar(String input) throws ParseException {
+ // Create char array to hold unescaped char sequence
+ char[] output = new char[input.length()];
+
+ // The length of the output can be less than the input
+ // due to discarded escape chars. This variable holds
+ // the actual length of the output
+ int length = 0;
+
+ // We remember whether the last processed character was
+ // an escape character
+ boolean lastCharWasEscapeChar = false;
+
+ // The multiplier the current unicode digit must be multiplied with.
+ // E. g. the first digit must be multiplied with 16^3, the second with 16^2...
+ int codePointMultiplier = 0;
+
+ // Used to calculate the codepoint of the escaped unicode character
+ int codePoint = 0;
+
+ for (int i = 0; i < input.length(); i++) {
+ char curChar = input.charAt(i);
+ if (codePointMultiplier > 0) {
+ codePoint += hexToInt(curChar) * codePointMultiplier;
+ codePointMultiplier >>>= 4;
+ if (codePointMultiplier == 0) {
+ output[length++] = (char)codePoint;
+ codePoint = 0;
+ }
+ } else if (lastCharWasEscapeChar) {
+ if (curChar == 'u') {
+ // found an escaped unicode character
+ codePointMultiplier = 16 * 16 * 16;
+ } else {
+ // this character was escaped
+ output[length] = curChar;
+ length++;
+ }
+ lastCharWasEscapeChar = false;
+ } else {
+ if (curChar == '\\') {
+ lastCharWasEscapeChar = true;
+ } else {
+ output[length] = curChar;
+ length++;
+ }
+ }
+ }
+
+ if (codePointMultiplier > 0) {
+ throw new ParseException("Truncated unicode escape sequence.");
+ }
+
+ if (lastCharWasEscapeChar) {
+ throw new ParseException("Term can not end with escape character.");
+ }
+
+ return new String(output, 0, length);
+ }
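
Since discardEscapeChar is package-private, its effect is easiest to see through the parser itself. A hedged sketch (setup names illustrative) of the two behaviours described above, dropping the escape char and decoding \uXXXX sequences:

```java
import org.apache.lucene.analysis.WhitespaceAnalyzer;
import org.apache.lucene.queryParser.ParseException;
import org.apache.lucene.queryParser.QueryParser;
import org.apache.lucene.search.Query;
import org.apache.lucene.util.Version;

class EscapeHandlingDemo {
  static void demo() throws ParseException {
    QueryParser parser = new QueryParser(Version.LUCENE_40, "body",
        new WhitespaceAnalyzer(Version.LUCENE_40));
    // The backslash is discarded, so '-' is literal term text, not the NOT operator:
    Query q1 = parser.parse("foo\\-bar");
    // "\u0041" is decoded to 'A', so the term text becomes "Apache":
    Query q2 = parser.parse("\\u0041pache");
  }
}
```
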
+
+ /** Returns the numeric value of the hexadecimal character */
+ static final int hexToInt(char c) throws ParseException {
+ if ('0' <= c && c <= '9') {
+ return c - '0';
+ } else if ('a' <= c && c <= 'f'){
+ return c - 'a' + 10;
+ } else if ('A' <= c && c <= 'F') {
+ return c - 'A' + 10;
+ } else {
+ throw new ParseException("None-hex character in unicode escape sequence: " + c);
+ }
+ }
+
+ /**
+ * Returns a String where those characters that QueryParser
+ * expects to be escaped are escaped by a preceding <code>\</code>.
+ */
+ public static String escape(String s) {
+ StringBuilder sb = new StringBuilder();
+ for (int i = 0; i < s.length(); i++) {
+ char c = s.charAt(i);
+ // These characters are part of the query syntax and must be escaped
+ if (c == '\\' || c == '+' || c == '-' || c == '!' || c == '(' || c == ')' || c == ':'
+ || c == '^' || c == '[' || c == ']' || c == '\"' || c == '{' || c == '}' || c == '~'
+ || c == '*' || c == '?' || c == '|' || c == '&') {
+ sb.append('\\');
+ }
+ sb.append(c);
+ }
+ return sb.toString();
+ }
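
A usage sketch of the static escape helper above, accessed here through QueryParser (which is assumed to expose it by inheritance); setup names are illustrative:

```java
import org.apache.lucene.analysis.WhitespaceAnalyzer;
import org.apache.lucene.queryParser.QueryParser;
import org.apache.lucene.search.Query;
import org.apache.lucene.util.Version;

class EscapeDemo {
  static void demo() throws Exception {
    String safe = QueryParser.escape("(1+1):2");   // -> \(1\+1\)\:2
    QueryParser parser = new QueryParser(Version.LUCENE_40, "body",
        new WhitespaceAnalyzer(Version.LUCENE_40));
    Query q = parser.parse(safe);                  // operators are now literal term text
  }
}
```
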
+
+}
diff --git a/lucene/src/java/org/apache/lucene/search/AutomatonQuery.java b/lucene/src/java/org/apache/lucene/search/AutomatonQuery.java
index c70db936287..4df94c4486e 100644
--- a/lucene/src/java/org/apache/lucene/search/AutomatonQuery.java
+++ b/lucene/src/java/org/apache/lucene/search/AutomatonQuery.java
@@ -18,15 +18,15 @@ package org.apache.lucene.search;
*/
import java.io.IOException;
+import java.io.Serializable;
import org.apache.lucene.index.Term;
import org.apache.lucene.index.Terms;
import org.apache.lucene.index.TermsEnum;
+import org.apache.lucene.search.AutomatonTermsEnum.CompiledAutomaton;
import org.apache.lucene.util.ToStringUtils;
import org.apache.lucene.util.AttributeSource;
-import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.automaton.Automaton;
-import org.apache.lucene.util.automaton.ByteRunAutomaton;
import org.apache.lucene.util.automaton.BasicAutomata;
import org.apache.lucene.util.automaton.BasicOperations;
import org.apache.lucene.util.automaton.MinimizationOperations;
@@ -56,9 +56,16 @@ public class AutomatonQuery extends MultiTermQuery {
/** term containing the field, and possibly some pattern structure */
protected final Term term;
- transient ByteRunAutomaton runAutomaton;
- transient boolean isFinite;
- transient BytesRef commonSuffixRef;
+ /**
+ * Abstraction for returning a TermsEnum: the constructor computes one of
+ * these, and the actual implementation depends upon the automaton's structure.
+ */
+ private abstract class TermsEnumFactory implements Serializable {
+ protected abstract TermsEnum getTermsEnum(Terms terms, AttributeSource atts) throws IOException;
+ }
+
+ private final TermsEnumFactory factory;
/**
* Create a new AutomatonQuery from an {@link Automaton}.
@@ -68,60 +75,77 @@ public class AutomatonQuery extends MultiTermQuery {
* @param automaton Automaton to run, terms that are accepted are considered a
* match.
*/
- public AutomatonQuery(Term term, Automaton automaton) {
+ public AutomatonQuery(final Term term, Automaton automaton) {
super(term.field());
this.term = term;
this.automaton = automaton;
MinimizationOperations.minimize(automaton);
- }
-
- private synchronized void compileAutomaton() {
- // this method must be synchronized, as setting the three transient fields is not atomic:
- if (runAutomaton == null) {
- runAutomaton = new ByteRunAutomaton(automaton);
- isFinite = SpecialOperations.isFinite(automaton);
- commonSuffixRef = isFinite ? null : SpecialOperations.getCommonSuffixBytesRef(runAutomaton.getAutomaton());
+
+ if (BasicOperations.isEmpty(automaton)) {
+ // matches nothing
+ factory = new TermsEnumFactory() {
+ @Override
+ protected TermsEnum getTermsEnum(Terms terms, AttributeSource atts) throws IOException {
+ return TermsEnum.EMPTY;
+ }
+ };
+ } else if (BasicOperations.isTotal(automaton)) {
+ // matches all possible strings
+ factory = new TermsEnumFactory() {
+ @Override
+ protected TermsEnum getTermsEnum(Terms terms, AttributeSource atts) throws IOException {
+ return terms.iterator();
+ }
+ };
+ } else {
+ final String singleton;
+ final String commonPrefix;
+
+ if (automaton.getSingleton() == null) {
+ commonPrefix = SpecialOperations.getCommonPrefix(automaton);
+ if (commonPrefix.length() > 0 && BasicOperations.sameLanguage(automaton, BasicAutomata.makeString(commonPrefix))) {
+ singleton = commonPrefix;
+ } else {
+ singleton = null;
+ }
+ } else {
+ commonPrefix = null;
+ singleton = automaton.getSingleton();
+ }
+
+ if (singleton != null) {
+ // matches a fixed string in singleton or expanded representation
+ factory = new TermsEnumFactory() {
+ @Override
+ protected TermsEnum getTermsEnum(Terms terms, AttributeSource atts) throws IOException {
+ return new SingleTermsEnum(terms.iterator(), term.createTerm(singleton));
+ }
+ };
+ } else if (BasicOperations.sameLanguage(automaton, BasicOperations.concatenate(
+ BasicAutomata.makeString(commonPrefix), BasicAutomata.makeAnyString()))) {
+ // matches a constant prefix
+ factory = new TermsEnumFactory() {
+ @Override
+ protected TermsEnum getTermsEnum(Terms terms, AttributeSource atts) throws IOException {
+ return new PrefixTermsEnum(terms.iterator(), term.createTerm(commonPrefix));
+ }
+ };
+ } else {
+ final AutomatonTermsEnum.CompiledAutomaton compiled =
+ new CompiledAutomaton(automaton, SpecialOperations.isFinite(automaton));
+ factory = new TermsEnumFactory() {
+ @Override
+ protected TermsEnum getTermsEnum(Terms terms, AttributeSource atts) throws IOException {
+ return new AutomatonTermsEnum(terms.iterator(), compiled);
+ }
+ };
+ }
}
}
@Override
protected TermsEnum getTermsEnum(Terms terms, AttributeSource atts) throws IOException {
- // matches nothing
- if (BasicOperations.isEmpty(automaton)) {
- return TermsEnum.EMPTY;
- }
-
- TermsEnum tenum = terms.iterator();
-
- // matches all possible strings
- if (BasicOperations.isTotal(automaton)) {
- return tenum;
- }
-
- // matches a fixed string in singleton representation
- String singleton = automaton.getSingleton();
- if (singleton != null)
- return new SingleTermsEnum(tenum, term.createTerm(singleton));
-
- // matches a fixed string in expanded representation
- final String commonPrefix = SpecialOperations.getCommonPrefix(automaton);
-
- if (commonPrefix.length() > 0) {
- if (BasicOperations.sameLanguage(automaton, BasicAutomata.makeString(commonPrefix))) {
- return new SingleTermsEnum(tenum, term.createTerm(commonPrefix));
- }
-
- // matches a constant prefix
- Automaton prefixAutomaton = BasicOperations.concatenate(BasicAutomata
- .makeString(commonPrefix), BasicAutomata.makeAnyString());
- if (BasicOperations.sameLanguage(automaton, prefixAutomaton)) {
- return new PrefixTermsEnum(tenum, term.createTerm(commonPrefix));
- }
- }
-
- compileAutomaton();
-
- return new AutomatonTermsEnum(runAutomaton, tenum, isFinite, commonSuffixRef);
+ return factory.getTermsEnum(terms, atts);
}
@Override
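
To illustrate the specialisation the rewritten constructor performs, a sketch of the automaton shapes and the TermsEnumFactory each resolves to. The automaton helpers match the imports shown in the diff; field and term values are illustrative:

```java
import org.apache.lucene.index.Term;
import org.apache.lucene.search.AutomatonQuery;
import org.apache.lucene.util.automaton.Automaton;
import org.apache.lucene.util.automaton.BasicAutomata;
import org.apache.lucene.util.automaton.BasicOperations;

class AutomatonQueryShapes {
  static void demo() {
    // Each shape is resolved once, in the ctor, to a different TermsEnumFactory:
    Automaton none   = BasicAutomata.makeEmpty();          // matches nothing   -> TermsEnum.EMPTY
    Automaton all    = BasicAutomata.makeAnyString();      // matches all terms -> terms.iterator()
    Automaton single = BasicAutomata.makeString("lucene"); // one fixed string  -> SingleTermsEnum
    Automaton prefix = BasicOperations.concatenate(        // constant prefix   -> PrefixTermsEnum
        BasicAutomata.makeString("luc"), BasicAutomata.makeAnyString());

    AutomatonQuery q = new AutomatonQuery(new Term("body", "luc*"), prefix);
  }
}
```
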
diff --git a/lucene/src/java/org/apache/lucene/search/AutomatonTermsEnum.java b/lucene/src/java/org/apache/lucene/search/AutomatonTermsEnum.java
index 401c6133601..58cb5dd851b 100644
--- a/lucene/src/java/org/apache/lucene/search/AutomatonTermsEnum.java
+++ b/lucene/src/java/org/apache/lucene/search/AutomatonTermsEnum.java
@@ -27,6 +27,7 @@ import org.apache.lucene.util.automaton.Automaton;
import org.apache.lucene.util.automaton.ByteRunAutomaton;
import org.apache.lucene.util.automaton.SpecialOperations;
import org.apache.lucene.util.automaton.Transition;
+import org.apache.lucene.util.automaton.UTF32ToUTF8;
/**
* A FilteredTermsEnum that enumerates terms based upon what is accepted by a
@@ -46,8 +47,6 @@ import org.apache.lucene.util.automaton.Transition;
* @lucene.experimental
*/
public class AutomatonTermsEnum extends FilteredTermsEnum {
- // the object-oriented form of the DFA
- private final Automaton automaton;
// a tableized array-based form of the DFA
private final ByteRunAutomaton runAutomaton;
// common suffix of the automaton
@@ -71,54 +70,26 @@ public class AutomatonTermsEnum extends FilteredTermsEnum {
private final Comparator termComp;
/**
- * Expert ctor:
* Construct an enumerator based upon an automaton, enumerating the specified
* field, working on a supplied TermsEnum
*
* @lucene.experimental
*
- * @param runAutomaton pre-compiled ByteRunAutomaton
- * @param finite true if the automaton accepts a finite language
+ * @param compiled CompiledAutomaton
*/
- public AutomatonTermsEnum(ByteRunAutomaton runAutomaton,
- TermsEnum tenum,
- boolean finite, BytesRef commonSuffixRef)
- throws IOException {
+ public AutomatonTermsEnum(TermsEnum tenum, CompiledAutomaton compiled) throws IOException {
super(tenum);
- this.automaton = runAutomaton.getAutomaton();
- this.finite = finite;
+ this.finite = compiled.finite;
+ this.runAutomaton = compiled.runAutomaton;
+ this.commonSuffixRef = compiled.commonSuffixRef;
+ this.allTransitions = compiled.sortedTransitions;
- this.runAutomaton = runAutomaton;
- if (finite) {
- // don't use suffix w/ finite DFAs
- this.commonSuffixRef = null;
- } else if (commonSuffixRef == null) {
- // compute now
- this.commonSuffixRef = SpecialOperations.getCommonSuffixBytesRef(automaton);
- } else {
- // precomputed
- this.commonSuffixRef = commonSuffixRef;
- }
-
- // build a cache of sorted transitions for every state
- allTransitions = this.automaton.getSortedTransitions();
// used for path tracking, where each bit is a numbered state.
visited = new long[runAutomaton.getSize()];
termComp = getComparator();
}
- /**
- * Construct an enumerator based upon an automaton, enumerating the specified
- * field, working on a supplied TermsEnum
- *
- * It will automatically calculate whether or not the automaton is finite
- */
- public AutomatonTermsEnum(Automaton automaton, TermsEnum tenum)
- throws IOException {
- this(new ByteRunAutomaton(automaton), tenum, SpecialOperations.isFinite(automaton), null);
- }
-
/**
* Returns true if the term matches the automaton. Also stashes away the term
* to assist with smart enumeration.
@@ -140,9 +111,9 @@ public class AutomatonTermsEnum extends FilteredTermsEnum {
@Override
protected BytesRef nextSeekTerm(final BytesRef term) throws IOException {
if (term == null) {
- seekBytesRef.copy("");
+ assert seekBytesRef.length == 0;
// return the empty term, as it's valid
- if (runAutomaton.run(seekBytesRef.bytes, seekBytesRef.offset, seekBytesRef.length)) {
+ if (runAutomaton.isAccept(runAutomaton.getInitialState())) {
return seekBytesRef;
}
} else {
@@ -151,25 +122,20 @@ public class AutomatonTermsEnum extends FilteredTermsEnum {
// seek to the next possible string;
if (nextString()) {
- // reposition
-
- if (linear)
- setLinear(infinitePosition);
- return seekBytesRef;
+ return seekBytesRef; // reposition
+ } else {
+ return null; // no more possible strings can match
}
- // no more possible strings can match
- return null;
}
- // this instance prevents unicode conversion during backtracking,
- // we can just call setLinear once at the end.
- int infinitePosition;
-
/**
* Sets the enum to operate in linear fashion, as we have found
- * a looping transition at position
+ * a looping transition at position: we set an upper bound and
+ * act like a TermRangeQuery for this portion of the term space.
*/
private void setLinear(int position) {
+ assert linear == false;
+
int state = runAutomaton.getInitialState();
int maxInterval = 0xff;
for (int i = 0; i < position; i++) {
@@ -193,6 +159,8 @@ public class AutomatonTermsEnum extends FilteredTermsEnum {
System.arraycopy(seekBytesRef.bytes, 0, linearUpperBound.bytes, 0, position);
linearUpperBound.bytes[position] = (byte) maxInterval;
linearUpperBound.length = length;
+
+ linear = true;
}
private final IntsRef savedStates = new IntsRef(10);
@@ -226,8 +194,7 @@ public class AutomatonTermsEnum extends FilteredTermsEnum {
states[pos+1] = nextState;
// we found a loop, record it for faster enumeration
if (!finite && !linear && visited[nextState] == curGen) {
- linear = true;
- infinitePosition = pos;
+ setLinear(pos);
}
state = nextState;
}
@@ -313,15 +280,16 @@ public class AutomatonTermsEnum extends FilteredTermsEnum {
*/
transition = allTransitions[state][0];
state = transition.getDest().getNumber();
- // we found a loop, record it for faster enumeration
- if (!finite && !linear && visited[state] == curGen) {
- linear = true;
- infinitePosition = seekBytesRef.length;
- }
+
// append the minimum transition
seekBytesRef.grow(seekBytesRef.length + 1);
seekBytesRef.length++;
seekBytesRef.bytes[seekBytesRef.length - 1] = (byte) transition.getMin();
+
+ // we found a loop, record it for faster enumeration
+ if (!finite && !linear && visited[state] == curGen) {
+ setLinear(seekBytesRef.length-1);
+ }
}
return true;
}
@@ -350,4 +318,26 @@ public class AutomatonTermsEnum extends FilteredTermsEnum {
}
return -1; /* all solutions exhausted */
}
+
+ /**
+ * Immutable class with everything this enum needs.
+ */
+ public static class CompiledAutomaton {
+ public final ByteRunAutomaton runAutomaton;
+ public final Transition[][] sortedTransitions;
+ public final BytesRef commonSuffixRef;
+ public final boolean finite;
+
+ public CompiledAutomaton(Automaton automaton, boolean finite) {
+ Automaton utf8 = new UTF32ToUTF8().convert(automaton);
+ runAutomaton = new ByteRunAutomaton(utf8, true);
+ sortedTransitions = utf8.getSortedTransitions();
+ this.finite = finite;
+ if (finite) {
+ commonSuffixRef = null;
+ } else {
+ commonSuffixRef = SpecialOperations.getCommonSuffixBytesRef(utf8);
+ }
+ }
+ }
}
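
A minimal sketch of using the new nested class directly, assuming a Terms instance for the target field is already available (for example from a per-segment reader). The regular expression is illustrative:

```java
import java.io.IOException;

import org.apache.lucene.index.Terms;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.search.AutomatonTermsEnum;
import org.apache.lucene.search.AutomatonTermsEnum.CompiledAutomaton;
import org.apache.lucene.util.automaton.Automaton;
import org.apache.lucene.util.automaton.RegExp;
import org.apache.lucene.util.automaton.SpecialOperations;

class CompiledAutomatonDemo {
  static TermsEnum enumerate(Terms terms) throws IOException {
    Automaton a = new RegExp("fo.*").toAutomaton();
    // Compile once (UTF-32 -> UTF-8 conversion, sorted transitions, common suffix)
    // and reuse the result when creating enumerators:
    CompiledAutomaton compiled = new CompiledAutomaton(a, SpecialOperations.isFinite(a));
    return new AutomatonTermsEnum(terms.iterator(), compiled);
  }
}
```
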
diff --git a/lucene/src/java/org/apache/lucene/search/BooleanQuery.java b/lucene/src/java/org/apache/lucene/search/BooleanQuery.java
index 56f7d098114..8e2240cdea9 100644
--- a/lucene/src/java/org/apache/lucene/search/BooleanQuery.java
+++ b/lucene/src/java/org/apache/lucene/search/BooleanQuery.java
@@ -18,6 +18,7 @@ package org.apache.lucene.search;
*/
import org.apache.lucene.index.IndexReader;
+import org.apache.lucene.index.IndexReader.AtomicReaderContext;
import org.apache.lucene.index.Term;
import org.apache.lucene.util.ToStringUtils;
import org.apache.lucene.search.BooleanClause.Occur;
@@ -62,46 +63,32 @@ public class BooleanQuery extends Query implements Iterable {
}
private ArrayList clauses = new ArrayList();
- private boolean disableCoord;
+ private final boolean disableCoord;
/** Constructs an empty boolean query. */
- public BooleanQuery() {}
+ public BooleanQuery() {
+ disableCoord = false;
+ }
/** Constructs an empty boolean query.
*
- * {@link Similarity#coord(int,int)} may be disabled in scoring, as
+ * {@link SimilarityProvider#coord(int,int)} may be disabled in scoring, as
* appropriate. For example, this score factor does not make sense for most
* automatically generated queries, like {@link WildcardQuery} and {@link
* FuzzyQuery}.
*
- * @param disableCoord disables {@link Similarity#coord(int,int)} in scoring.
+ * @param disableCoord disables {@link SimilarityProvider#coord(int,int)} in scoring.
*/
public BooleanQuery(boolean disableCoord) {
this.disableCoord = disableCoord;
}
- /** Returns true iff {@link Similarity#coord(int,int)} is disabled in
+ /** Returns true iff {@link SimilarityProvider#coord(int,int)} is disabled in
* scoring for this query instance.
* @see #BooleanQuery(boolean)
*/
public boolean isCoordDisabled() { return disableCoord; }
- // Implement coord disabling.
- // Inherit javadoc.
- @Override
- public Similarity getSimilarity(IndexSearcher searcher) {
- Similarity result = super.getSimilarity(searcher);
- if (disableCoord) { // disable coord as requested
- result = new SimilarityDelegator(result) {
- @Override
- public float coord(int overlap, int maxOverlap) {
- return 1.0f;
- }
- };
- }
- return result;
- }
-
/**
* Specifies a minimum number of the optional BooleanClauses
* which must be satisfied.
@@ -175,13 +162,15 @@ public class BooleanQuery extends Query implements Iterable {
*/
protected class BooleanWeight extends Weight {
/** The Similarity implementation. */
- protected Similarity similarity;
+ protected SimilarityProvider similarityProvider;
protected ArrayList weights;
protected int maxCoord; // num optional + num required
+ private final boolean disableCoord;
- public BooleanWeight(IndexSearcher searcher)
+ public BooleanWeight(IndexSearcher searcher, boolean disableCoord)
throws IOException {
- this.similarity = getSimilarity(searcher);
+ this.similarityProvider = searcher.getSimilarityProvider();
+ this.disableCoord = disableCoord;
weights = new ArrayList(clauses.size());
for (int i = 0 ; i < clauses.size(); i++) {
BooleanClause c = clauses.get(i);
@@ -212,6 +201,9 @@ public class BooleanQuery extends Query implements Iterable {
return sum ;
}
+ public float coord(int overlap, int maxOverlap) {
+ return similarityProvider.coord(overlap, maxOverlap);
+ }
@Override
public void normalize(float norm) {
@@ -223,7 +215,7 @@ public class BooleanQuery extends Query implements Iterable {
}
@Override
- public Explanation explain(IndexReader reader, int doc)
+ public Explanation explain(AtomicReaderContext context, int doc)
throws IOException {
final int minShouldMatch =
BooleanQuery.this.getMinimumNumberShouldMatch();
@@ -237,7 +229,7 @@ public class BooleanQuery extends Query implements Iterable {
for (Iterator wIter = weights.iterator(); wIter.hasNext();) {
Weight w = wIter.next();
BooleanClause c = cIter.next();
- if (w.scorer(reader, true, true) == null) {
+ if (w.scorer(context, ScorerContext.def().scoreDocsInOrder(true).topScorer(true)) == null) {
if (c.isRequired()) {
fail = true;
Explanation r = new Explanation(0.0f, "no match on required clause (" + c.getQuery().toString() + ")");
@@ -245,7 +237,7 @@ public class BooleanQuery extends Query implements Iterable {
}
continue;
}
- Explanation e = w.explain(reader, doc);
+ Explanation e = w.explain(context, doc);
if (e.isMatch()) {
if (!c.isProhibited()) {
sumExpl.addDetail(e);
@@ -284,10 +276,10 @@ public class BooleanQuery extends Query implements Iterable {
sumExpl.setMatch(0 < coord ? Boolean.TRUE : Boolean.FALSE);
sumExpl.setValue(sum);
- float coordFactor = similarity.coord(coord, maxCoord);
- if (coordFactor == 1.0f) // coord is no-op
+ final float coordFactor = disableCoord ? 1.0f : coord(coord, maxCoord);
+ if (coordFactor == 1.0f) {
return sumExpl; // eliminate wrapper
- else {
+ } else {
ComplexExplanation result = new ComplexExplanation(sumExpl.isMatch(),
sum*coordFactor,
"product of:");
@@ -299,7 +291,7 @@ public class BooleanQuery extends Query implements Iterable {
}
@Override
- public Scorer scorer(IndexReader reader, boolean scoreDocsInOrder, boolean topScorer)
+ public Scorer scorer(AtomicReaderContext context, ScorerContext scorerContext)
throws IOException {
List required = new ArrayList();
List prohibited = new ArrayList();
@@ -307,7 +299,7 @@ public class BooleanQuery extends Query implements Iterable {
Iterator cIter = clauses.iterator();
for (Weight w : weights) {
BooleanClause c = cIter.next();
- Scorer subScorer = w.scorer(reader, true, false);
+ Scorer subScorer = w.scorer(context, ScorerContext.def());
if (subScorer == null) {
if (c.isRequired()) {
return null;
@@ -322,8 +314,8 @@ public class BooleanQuery extends Query implements Iterable {
}
// Check if we can return a BooleanScorer
- if (!scoreDocsInOrder && topScorer && required.size() == 0 && prohibited.size() < 32) {
- return new BooleanScorer(this, similarity, minNrShouldMatch, optional, prohibited, maxCoord);
+ if (!scorerContext.scoreDocsInOrder && scorerContext.topScorer && required.size() == 0 && prohibited.size() < 32) {
+ return new BooleanScorer(this, disableCoord, minNrShouldMatch, optional, prohibited, maxCoord);
}
if (required.size() == 0 && optional.size() == 0) {
@@ -337,7 +329,7 @@ public class BooleanQuery extends Query implements Iterable {
}
// Return a BooleanScorer2
- return new BooleanScorer2(this, similarity, minNrShouldMatch, required, prohibited, optional, maxCoord);
+ return new BooleanScorer2(this, disableCoord, minNrShouldMatch, required, prohibited, optional, maxCoord);
}
@Override
@@ -363,7 +355,7 @@ public class BooleanQuery extends Query implements Iterable {
@Override
public Weight createWeight(IndexSearcher searcher) throws IOException {
- return new BooleanWeight(searcher);
+ return new BooleanWeight(searcher, disableCoord);
}
@Override
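
A usage sketch of the coord-disabling constructor whose handling moved into BooleanWeight above; field and term values are illustrative:

```java
import org.apache.lucene.index.Term;
import org.apache.lucene.search.BooleanClause;
import org.apache.lucene.search.BooleanQuery;
import org.apache.lucene.search.TermQuery;

class DisableCoordDemo {
  static BooleanQuery build() {
    BooleanQuery bq = new BooleanQuery(true);   // true = disable the coord factor
    bq.add(new TermQuery(new Term("body", "apache")), BooleanClause.Occur.SHOULD);
    bq.add(new TermQuery(new Term("body", "lucene")), BooleanClause.Occur.SHOULD);
    return bq;                                  // BooleanWeight will use coordFactor = 1.0f
  }
}
```
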
diff --git a/lucene/src/java/org/apache/lucene/search/BooleanScorer.java b/lucene/src/java/org/apache/lucene/search/BooleanScorer.java
index 3a90fe023ce..18978c36006 100644
--- a/lucene/src/java/org/apache/lucene/search/BooleanScorer.java
+++ b/lucene/src/java/org/apache/lucene/search/BooleanScorer.java
@@ -20,8 +20,9 @@ package org.apache.lucene.search;
import java.io.IOException;
import java.util.List;
-import org.apache.lucene.index.IndexReader;
+import org.apache.lucene.index.IndexReader.AtomicReaderContext;
import org.apache.lucene.search.BooleanClause.Occur;
+import org.apache.lucene.search.BooleanQuery.BooleanWeight;
/* Description from Doug Cutting (excerpted from
* LUCENE-1483):
@@ -92,7 +93,7 @@ final class BooleanScorer extends Scorer {
}
@Override
- public void setNextReader(IndexReader reader, int docBase) {
+ public void setNextReader(AtomicReaderContext context) {
// not needed by this implementation
}
@@ -118,7 +119,7 @@ final class BooleanScorer extends Scorer {
int doc = NO_MORE_DOCS;
int freq;
- public BucketScorer() { super(null); }
+ public BucketScorer(Weight weight) { super(weight); }
@Override
public int advance(int target) throws IOException { return NO_MORE_DOCS; }
@@ -197,9 +198,9 @@ final class BooleanScorer extends Scorer {
private Bucket current;
private int doc = -1;
- BooleanScorer(Weight weight, Similarity similarity, int minNrShouldMatch,
+ BooleanScorer(BooleanWeight weight, boolean disableCoord, int minNrShouldMatch,
List optionalScorers, List prohibitedScorers, int maxCoord) throws IOException {
- super(similarity, weight);
+ super(weight);
this.minNrShouldMatch = minNrShouldMatch;
if (optionalScorers != null && optionalScorers.size() > 0) {
@@ -222,18 +223,17 @@ final class BooleanScorer extends Scorer {
}
coordFactors = new float[optionalScorers.size() + 1];
- Similarity sim = getSimilarity();
for (int i = 0; i < coordFactors.length; i++) {
- coordFactors[i] = sim.coord(i, maxCoord);
+ coordFactors[i] = disableCoord ? 1.0f : weight.coord(i, maxCoord);
}
}
// firstDocID is ignored since nextDoc() initializes 'current'
@Override
- protected boolean score(Collector collector, int max, int firstDocID) throws IOException {
+ public boolean score(Collector collector, int max, int firstDocID) throws IOException {
boolean more;
Bucket tmp;
- BucketScorer bs = new BucketScorer();
+ BucketScorer bs = new BucketScorer(weight);
// The internal loop will set the score and doc before calling collect.
collector.setScorer(bs);
do {
diff --git a/lucene/src/java/org/apache/lucene/search/BooleanScorer2.java b/lucene/src/java/org/apache/lucene/search/BooleanScorer2.java
index c8dcf2eba20..9c8ac60cbf7 100644
--- a/lucene/src/java/org/apache/lucene/search/BooleanScorer2.java
+++ b/lucene/src/java/org/apache/lucene/search/BooleanScorer2.java
@@ -22,6 +22,7 @@ import java.util.ArrayList;
import java.util.List;
import org.apache.lucene.search.BooleanClause.Occur;
+import org.apache.lucene.search.BooleanQuery.BooleanWeight;
/* See the description in BooleanScorer.java, comparing
* BooleanScorer & BooleanScorer2 */
@@ -42,14 +43,12 @@ class BooleanScorer2 extends Scorer {
int maxCoord = 0; // to be increased for each non prohibited scorer
int nrMatchers; // to be increased by score() of match counting scorers.
- void init() { // use after all scorers have been added.
+ void init(boolean disableCoord) { // use after all scorers have been added.
coordFactors = new float[optionalScorers.size() + requiredScorers.size() + 1];
- Similarity sim = getSimilarity();
for (int i = 0; i < coordFactors.length; i++) {
- coordFactors[i] = sim.coord(i, maxCoord);
+ coordFactors[i] = disableCoord ? 1.0f : ((BooleanWeight)weight).coord(i, maxCoord);
}
}
-
}
private final Coordinator coordinator;
@@ -69,8 +68,11 @@ class BooleanScorer2 extends Scorer {
* prohibited and optional scorers. If no required scorers are added, at least
* one of the optional scorers will have to match during the search.
*
- * @param similarity
- * The similarity to be used.
+ * @param weight
+ * The BooleanWeight to be used.
+ * @param disableCoord
+ * If this parameter is true, coordination level matching
+ * ({@link Similarity#coord(int, int)}) is not used.
* @param minNrShouldMatch
* The minimum number of optional added scorers that should match
* during the search. In case no required scorers are added, at least
@@ -82,9 +84,9 @@ class BooleanScorer2 extends Scorer {
* @param optional
* the list of optional scorers.
*/
- public BooleanScorer2(Weight weight, Similarity similarity, int minNrShouldMatch,
+ public BooleanScorer2(BooleanWeight weight, boolean disableCoord, int minNrShouldMatch,
List required, List prohibited, List optional, int maxCoord) throws IOException {
- super(similarity, weight);
+ super(weight);
if (minNrShouldMatch < 0) {
throw new IllegalArgumentException("Minimum number of optional scorers should not be negative");
}
@@ -96,8 +98,8 @@ class BooleanScorer2 extends Scorer {
requiredScorers = required;
prohibitedScorers = prohibited;
- coordinator.init();
- countingSumScorer = makeCountingSumScorer();
+ coordinator.init(disableCoord);
+ countingSumScorer = makeCountingSumScorer(disableCoord);
}
/** Count a scorer as a single match. */
@@ -109,7 +111,7 @@ class BooleanScorer2 extends Scorer {
private float lastDocScore = Float.NaN;
SingleMatchScorer(Scorer scorer) {
- super(scorer.getSimilarity());
+ super(scorer.weight);
this.scorer = scorer;
}
@@ -145,7 +147,7 @@ class BooleanScorer2 extends Scorer {
private Scorer countingDisjunctionSumScorer(final List scorers,
int minNrShouldMatch) throws IOException {
// each scorer from the list counted as a single matcher
- return new DisjunctionSumScorer(scorers, minNrShouldMatch) {
+ return new DisjunctionSumScorer(weight, scorers, minNrShouldMatch) {
private int lastScoredDoc = -1;
// Save the score of lastScoredDoc, so that we don't compute it more than
// once in score().
@@ -164,12 +166,11 @@ class BooleanScorer2 extends Scorer {
};
}
- private static final Similarity defaultSimilarity = Similarity.getDefault();
-
- private Scorer countingConjunctionSumScorer(List requiredScorers) throws IOException {
+ private Scorer countingConjunctionSumScorer(boolean disableCoord,
+ List requiredScorers) throws IOException {
// each scorer from the list counted as a single matcher
final int requiredNrMatchers = requiredScorers.size();
- return new ConjunctionScorer(defaultSimilarity, requiredScorers) {
+ return new ConjunctionScorer(weight, disableCoord ? 1.0f : ((BooleanWeight)weight).coord(requiredScorers.size(), requiredScorers.size()), requiredScorers) {
private int lastScoredDoc = -1;
// Save the score of lastScoredDoc, so that we don't compute it more than
// once in score().
@@ -192,8 +193,9 @@ class BooleanScorer2 extends Scorer {
};
}
- private Scorer dualConjunctionSumScorer(Scorer req1, Scorer req2) throws IOException { // non counting.
- return new ConjunctionScorer(defaultSimilarity, req1, req2);
+ private Scorer dualConjunctionSumScorer(boolean disableCoord,
+ Scorer req1, Scorer req2) throws IOException { // non counting.
+ return new ConjunctionScorer(weight, disableCoord ? 1.0f : ((BooleanWeight)weight).coord(2, 2), req1, req2);
// All scorers match, so defaultSimilarity always has 1 as
// the coordination factor.
// Therefore the sum of the scores of two scorers
@@ -203,13 +205,13 @@ class BooleanScorer2 extends Scorer {
/** Returns the scorer to be used for match counting and score summing.
* Uses requiredScorers, optionalScorers and prohibitedScorers.
*/
- private Scorer makeCountingSumScorer() throws IOException { // each scorer counted as a single matcher
+ private Scorer makeCountingSumScorer(boolean disableCoord) throws IOException { // each scorer counted as a single matcher
return (requiredScorers.size() == 0)
- ? makeCountingSumScorerNoReq()
- : makeCountingSumScorerSomeReq();
+ ? makeCountingSumScorerNoReq(disableCoord)
+ : makeCountingSumScorerSomeReq(disableCoord);
}
- private Scorer makeCountingSumScorerNoReq() throws IOException { // No required scorers
+ private Scorer makeCountingSumScorerNoReq(boolean disableCoord) throws IOException { // No required scorers
// minNrShouldMatch optional scorers are required, but at least 1
int nrOptRequired = (minNrShouldMatch < 1) ? 1 : minNrShouldMatch;
Scorer requiredCountingSumScorer;
@@ -217,24 +219,26 @@ class BooleanScorer2 extends Scorer {
requiredCountingSumScorer = countingDisjunctionSumScorer(optionalScorers, nrOptRequired);
else if (optionalScorers.size() == 1)
requiredCountingSumScorer = new SingleMatchScorer(optionalScorers.get(0));
- else
- requiredCountingSumScorer = countingConjunctionSumScorer(optionalScorers);
+ else {
+ requiredCountingSumScorer = countingConjunctionSumScorer(disableCoord, optionalScorers);
+ }
return addProhibitedScorers(requiredCountingSumScorer);
}
- private Scorer makeCountingSumScorerSomeReq() throws IOException { // At least one required scorer.
+ private Scorer makeCountingSumScorerSomeReq(boolean disableCoord) throws IOException { // At least one required scorer.
if (optionalScorers.size() == minNrShouldMatch) { // all optional scorers also required.
ArrayList allReq = new ArrayList(requiredScorers);
allReq.addAll(optionalScorers);
- return addProhibitedScorers(countingConjunctionSumScorer(allReq));
+ return addProhibitedScorers(countingConjunctionSumScorer(disableCoord, allReq));
} else { // optionalScorers.size() > minNrShouldMatch, and at least one required scorer
Scorer requiredCountingSumScorer =
requiredScorers.size() == 1
? new SingleMatchScorer(requiredScorers.get(0))
- : countingConjunctionSumScorer(requiredScorers);
+ : countingConjunctionSumScorer(disableCoord, requiredScorers);
if (minNrShouldMatch > 0) { // use a required disjunction scorer over the optional scorers
return addProhibitedScorers(
dualConjunctionSumScorer( // non counting
+ disableCoord,
requiredCountingSumScorer,
countingDisjunctionSumScorer(
optionalScorers,
@@ -261,7 +265,7 @@ class BooleanScorer2 extends Scorer {
: new ReqExclScorer(requiredCountingSumScorer,
((prohibitedScorers.size() == 1)
? prohibitedScorers.get(0)
- : new DisjunctionSumScorer(prohibitedScorers)));
+ : new DisjunctionSumScorer(weight, prohibitedScorers)));
}
/** Scores and collects all matching documents.
@@ -276,7 +280,7 @@ class BooleanScorer2 extends Scorer {
}
@Override
- protected boolean score(Collector collector, int max, int firstDocID) throws IOException {
+ public boolean score(Collector collector, int max, int firstDocID) throws IOException {
doc = firstDocID;
collector.setScorer(this);
while (doc < max) {
diff --git a/lucene/src/java/org/apache/lucene/search/BoostAttribute.java b/lucene/src/java/org/apache/lucene/search/BoostAttribute.java
index 93a2d3916bf..58f44633b8d 100644
--- a/lucene/src/java/org/apache/lucene/search/BoostAttribute.java
+++ b/lucene/src/java/org/apache/lucene/search/BoostAttribute.java
@@ -21,13 +21,13 @@ import org.apache.lucene.util.Attribute;
import org.apache.lucene.util.AttributeSource; // javadocs only
import org.apache.lucene.index.TermsEnum; // javadocs only
-/** Add this {@link Attribute} to a {@link TermsEnum} returned by {@link MultiTermQuery#getTermsEnum(IndexReader,AttributeSource)}
+/** Add this {@link Attribute} to a {@link TermsEnum} returned by {@link MultiTermQuery#getTermsEnum(Terms,AttributeSource)}
* and update the boost on each returned term. This makes it possible to control the boost factor
* for each matching term in {@link MultiTermQuery#SCORING_BOOLEAN_QUERY_REWRITE} or
* {@link TopTermsRewrite} mode.
* {@link FuzzyQuery} is using this to take the edit distance into account.
* Please note: This attribute is intended to be added only by the TermsEnum
- * to itsself in its constructor and consumed by the {@link MultiTermQuery.RewriteMethod}.
+ * to itself in its constructor and consumed by the {@link MultiTermQuery.RewriteMethod}.
* @lucene.internal
*/
public interface BoostAttribute extends Attribute {
diff --git a/lucene/src/java/org/apache/lucene/search/BoostAttributeImpl.java b/lucene/src/java/org/apache/lucene/search/BoostAttributeImpl.java
index 28ce30ee023..f07909021e5 100644
--- a/lucene/src/java/org/apache/lucene/search/BoostAttributeImpl.java
+++ b/lucene/src/java/org/apache/lucene/search/BoostAttributeImpl.java
@@ -37,20 +37,6 @@ public final class BoostAttributeImpl extends AttributeImpl implements BoostAttr
public void clear() {
boost = 1.0f;
}
-
- @Override
- public boolean equals(Object other) {
- if (this == other)
- return true;
- if (other instanceof BoostAttributeImpl)
- return ((BoostAttributeImpl) other).boost == boost;
- return false;
- }
-
- @Override
- public int hashCode() {
- return Float.floatToIntBits(boost);
- }
@Override
public void copyTo(AttributeImpl target) {
diff --git a/lucene/src/java/org/apache/lucene/search/CachingSpanFilter.java b/lucene/src/java/org/apache/lucene/search/CachingSpanFilter.java
index d19c872ee58..e1341fe493a 100644
--- a/lucene/src/java/org/apache/lucene/search/CachingSpanFilter.java
+++ b/lucene/src/java/org/apache/lucene/search/CachingSpanFilter.java
@@ -17,6 +17,7 @@ package org.apache.lucene.search;
import org.apache.lucene.index.IndexReader;
+import org.apache.lucene.index.IndexReader.AtomicReaderContext;
import org.apache.lucene.util.Bits;
import java.io.IOException;
@@ -60,15 +61,16 @@ public class CachingSpanFilter extends SpanFilter {
}
@Override
- public DocIdSet getDocIdSet(IndexReader reader) throws IOException {
- SpanFilterResult result = getCachedResult(reader);
+ public DocIdSet getDocIdSet(AtomicReaderContext context) throws IOException {
+ SpanFilterResult result = getCachedResult(context);
return result != null ? result.getDocIdSet() : null;
}
// for testing
int hitCount, missCount;
- private SpanFilterResult getCachedResult(IndexReader reader) throws IOException {
+ private SpanFilterResult getCachedResult(AtomicReaderContext context) throws IOException {
+ final IndexReader reader = context.reader;
final Object coreKey = reader.getCoreCacheKey();
final Object delCoreKey = reader.hasDeletions() ? reader.getDeletedDocs() : coreKey;
@@ -80,7 +82,7 @@ public class CachingSpanFilter extends SpanFilter {
}
missCount++;
- result = filter.bitSpans(reader);
+ result = filter.bitSpans(context);
cache.put(coreKey, delCoreKey, result);
return result;
@@ -88,8 +90,8 @@ public class CachingSpanFilter extends SpanFilter {
@Override
- public SpanFilterResult bitSpans(IndexReader reader) throws IOException {
- return getCachedResult(reader);
+ public SpanFilterResult bitSpans(AtomicReaderContext context) throws IOException {
+ return getCachedResult(context);
}
@Override
diff --git a/lucene/src/java/org/apache/lucene/search/CachingWrapperFilter.java b/lucene/src/java/org/apache/lucene/search/CachingWrapperFilter.java
index d51eed25172..1f865670b56 100644
--- a/lucene/src/java/org/apache/lucene/search/CachingWrapperFilter.java
+++ b/lucene/src/java/org/apache/lucene/search/CachingWrapperFilter.java
@@ -23,6 +23,7 @@ import java.util.Map;
import java.util.WeakHashMap;
import org.apache.lucene.index.IndexReader;
+import org.apache.lucene.index.IndexReader.AtomicReaderContext;
import org.apache.lucene.util.OpenBitSetDISI;
import org.apache.lucene.util.Bits;
@@ -37,6 +38,9 @@ import org.apache.lucene.util.Bits;
* {@link DeletesMode#DYNAMIC}).
*/
public class CachingWrapperFilter extends Filter {
+ // TODO: make this filter aware of ReaderContext. a cached filter could
+ // specify the actual reader's key or something similar to indicate on which
+ // level of the readers hierarchy it should be cached.
Filter filter;
/**
@@ -191,8 +195,8 @@ public class CachingWrapperFilter extends Filter {
int hitCount, missCount;
@Override
- public DocIdSet getDocIdSet(IndexReader reader) throws IOException {
-
+ public DocIdSet getDocIdSet(AtomicReaderContext context) throws IOException {
+ final IndexReader reader = context.reader;
final Object coreKey = reader.getCoreCacheKey();
final Object delCoreKey = reader.hasDeletions() ? reader.getDeletedDocs() : coreKey;
@@ -205,7 +209,7 @@ public class CachingWrapperFilter extends Filter {
missCount++;
// cache miss
- docIdSet = docIdSetToCache(filter.getDocIdSet(reader), reader);
+ docIdSet = docIdSetToCache(filter.getDocIdSet(context), reader);
if (docIdSet != null) {
cache.put(coreKey, delCoreKey, docIdSet);
diff --git a/lucene/src/java/org/apache/lucene/search/Collector.java b/lucene/src/java/org/apache/lucene/search/Collector.java
index e2514887930..b64abce0f4b 100644
--- a/lucene/src/java/org/apache/lucene/search/Collector.java
+++ b/lucene/src/java/org/apache/lucene/search/Collector.java
@@ -19,7 +19,8 @@ package org.apache.lucene.search;
import java.io.IOException;
-import org.apache.lucene.index.IndexReader;
+import org.apache.lucene.index.IndexReader.AtomicReaderContext;
+import org.apache.lucene.index.IndexReader.ReaderContext;
/**
* <p>Expert: Collectors are primarily meant to be used to
@@ -98,8 +99,8 @@ import org.apache.lucene.index.IndexReader;
* bits.set(doc + docBase);
* }
*
- * public void setNextReader(IndexReader reader, int docBase) {
- * this.docBase = docBase;
+ * public void setNextReader(AtomicReaderContext context) {
+ * this.docBase = context.docBase;
* }
* });
*
@@ -136,24 +137,23 @@ public abstract class Collector {
*
*
* Note: This is called in an inner search loop. For good search performance,
- * implementations of this method should not call {@link Searcher#doc(int)} or
+ * implementations of this method should not call {@link IndexSearcher#doc(int)} or
* {@link org.apache.lucene.index.IndexReader#document(int)} on every hit.
* Doing so can slow searches by an order of magnitude or more.
*/
public abstract void collect(int doc) throws IOException;
/**
- * Called before collecting from each IndexReader. All doc ids in
- * {@link #collect(int)} will correspond to reader.
+ * Called before collecting from each {@link AtomicReaderContext}. All doc ids in
+ * {@link #collect(int)} will correspond to {@link ReaderContext#reader}.
*
- * Add docBase to the current IndexReaders internal document id to re-base ids
- * in {@link #collect(int)}.
+ * Add {@link AtomicReaderContext#docBase} to the current {@link ReaderContext#reader}'s
+ * internal document id to re-base ids in {@link #collect(int)}.
*
- * @param reader
- * next IndexReader
- * @param docBase
+ * @param context
+ * next atomic reader context
*/
- public abstract void setNextReader(IndexReader reader, int docBase) throws IOException;
+ public abstract void setNextReader(AtomicReaderContext context) throws IOException;
/**
* Return <code>true</code> if this collector does not
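
Spelling out the updated javadoc snippet above as a complete, illustrative collector under the new API (class name is hypothetical):

```java
import java.io.IOException;
import java.util.BitSet;

import org.apache.lucene.index.IndexReader.AtomicReaderContext;
import org.apache.lucene.search.Collector;
import org.apache.lucene.search.Scorer;

// Illustrative collector: gathers matching doc ids into a BitSet, ignoring scores.
class BitSetCollector extends Collector {
  final BitSet bits = new BitSet();
  private int docBase;

  @Override
  public void setScorer(Scorer scorer) throws IOException {
    // scores are not needed by this collector
  }

  @Override
  public void collect(int doc) throws IOException {
    bits.set(doc + docBase);               // re-base the per-segment doc id
  }

  @Override
  public void setNextReader(AtomicReaderContext context) throws IOException {
    this.docBase = context.docBase;        // docBase now comes from the context
  }

  @Override
  public boolean acceptsDocsOutOfOrder() {
    return true;                           // order does not matter when just setting bits
  }
}
```
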
diff --git a/lucene/src/java/org/apache/lucene/search/ConjunctionScorer.java b/lucene/src/java/org/apache/lucene/search/ConjunctionScorer.java
index dd254755a31..b8dea2565b9 100644
--- a/lucene/src/java/org/apache/lucene/search/ConjunctionScorer.java
+++ b/lucene/src/java/org/apache/lucene/search/ConjunctionScorer.java
@@ -29,14 +29,14 @@ class ConjunctionScorer extends Scorer {
private final float coord;
private int lastDoc = -1;
- public ConjunctionScorer(Similarity similarity, Collection scorers) throws IOException {
- this(similarity, scorers.toArray(new Scorer[scorers.size()]));
+ public ConjunctionScorer(Weight weight, float coord, Collection scorers) throws IOException {
+ this(weight, coord, scorers.toArray(new Scorer[scorers.size()]));
}
- public ConjunctionScorer(Similarity similarity, Scorer... scorers) throws IOException {
- super(similarity);
+ public ConjunctionScorer(Weight weight, float coord, Scorer... scorers) throws IOException {
+ super(weight);
this.scorers = scorers;
- coord = similarity.coord(scorers.length, scorers.length);
+ this.coord = coord;
for (int i = 0; i < scorers.length; i++) {
if (scorers[i].nextDoc() == NO_MORE_DOCS) {
diff --git a/lucene/src/java/org/apache/lucene/search/ConstantScoreAutoRewrite.java b/lucene/src/java/org/apache/lucene/search/ConstantScoreAutoRewrite.java
index 6dd17bf3645..64aef2b3cf5 100644
--- a/lucene/src/java/org/apache/lucene/search/ConstantScoreAutoRewrite.java
+++ b/lucene/src/java/org/apache/lucene/search/ConstantScoreAutoRewrite.java
@@ -21,9 +21,15 @@ import java.io.IOException;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.Term;
+import org.apache.lucene.index.TermState;
import org.apache.lucene.index.TermsEnum;
+import org.apache.lucene.util.ArrayUtil;
+import org.apache.lucene.util.ByteBlockPool;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.BytesRefHash;
+import org.apache.lucene.util.PerReaderTermState;
+import org.apache.lucene.util.RamUsageEstimator;
+import org.apache.lucene.util.BytesRefHash.DirectBytesStartArray;
class ConstantScoreAutoRewrite extends TermCollectingRewrite {
@@ -71,8 +77,8 @@ class ConstantScoreAutoRewrite extends TermCollectingRewrite {
}
@Override
- protected void addClause(BooleanQuery topLevel, Term term, int docFreq, float boost /*ignored*/) {
- topLevel.add(new TermQuery(term, docFreq), BooleanClause.Occur.SHOULD);
+ protected void addClause(BooleanQuery topLevel, Term term, int docFreq, float boost /*ignored*/, PerReaderTermState states) {
+ topLevel.add(new TermQuery(term, states), BooleanClause.Occur.SHOULD);
}
@Override
@@ -98,9 +104,10 @@ class ConstantScoreAutoRewrite extends TermCollectingRewrite {
final BytesRefHash pendingTerms = col.pendingTerms;
final int sort[] = pendingTerms.sort(col.termsEnum.getComparator());
for(int i = 0; i < size; i++) {
+ final int pos = sort[i];
// docFreq is not used for constant score here, we pass 1
// to explicitly set a fake value, so it's not calculated
- addClause(bq, placeholderTerm.createTerm(pendingTerms.get(sort[i], new BytesRef())), 1, 1.0f);
+ addClause(bq, placeholderTerm.createTerm(pendingTerms.get(pos, new BytesRef())), 1, 1.0f, col.array.termState[pos]);
}
// Strip scores
final Query result = new ConstantScoreQuery(bq);
@@ -123,12 +130,21 @@ class ConstantScoreAutoRewrite extends TermCollectingRewrite {
@Override
public boolean collect(BytesRef bytes) throws IOException {
- pendingTerms.add(bytes);
+ int pos = pendingTerms.add(bytes);
docVisitCount += termsEnum.docFreq();
if (pendingTerms.size() >= termCountLimit || docVisitCount >= docCountCutoff) {
hasCutOff = true;
return false;
}
+
+ final TermState termState = termsEnum.termState();
+ assert termState != null;
+ if (pos < 0) {
+ pos = (-pos)-1;
+ array.termState[pos].register(termState, readerContext.ord, termsEnum.docFreq());
+ } else {
+ array.termState[pos] = new PerReaderTermState(topReaderContext, termState, readerContext.ord, termsEnum.docFreq());
+ }
return true;
}
@@ -137,7 +153,8 @@ class ConstantScoreAutoRewrite extends TermCollectingRewrite {
TermsEnum termsEnum;
final int docCountCutoff, termCountLimit;
- final BytesRefHash pendingTerms = new BytesRefHash();
+ final TermStateByteStart array = new TermStateByteStart(16);
+ final BytesRefHash pendingTerms = new BytesRefHash(new ByteBlockPool(new ByteBlockPool.DirectAllocator()), 16, array);
}
@Override
@@ -166,4 +183,40 @@ class ConstantScoreAutoRewrite extends TermCollectingRewrite {
return true;
}
+
+ /** Special implementation of BytesStartArray that keeps parallel arrays for {@link PerReaderTermState} */
+ static final class TermStateByteStart extends DirectBytesStartArray {
+ PerReaderTermState[] termState;
+
+ public TermStateByteStart(int initSize) {
+ super(initSize);
+ }
+
+ @Override
+ public int[] init() {
+ final int[] ord = super.init();
+ termState = new PerReaderTermState[ArrayUtil.oversize(ord.length, RamUsageEstimator.NUM_BYTES_OBJECT_REF)];
+ assert termState.length >= ord.length;
+ return ord;
+ }
+
+ @Override
+ public int[] grow() {
+ final int[] ord = super.grow();
+ if (termState.length < ord.length) {
+ PerReaderTermState[] tmpTermState = new PerReaderTermState[ArrayUtil.oversize(ord.length, RamUsageEstimator.NUM_BYTES_OBJECT_REF)];
+ System.arraycopy(termState, 0, tmpTermState, 0, termState.length);
+ termState = tmpTermState;
+ }
+ assert termState.length >= ord.length;
+ return ord;
+ }
+
+ @Override
+ public int[] clear() {
+ termState = null;
+ return super.clear();
+ }
+
+ }
}
diff --git a/lucene/src/java/org/apache/lucene/search/ConstantScoreQuery.java b/lucene/src/java/org/apache/lucene/search/ConstantScoreQuery.java
index fe76121d3c2..d5f5f50389b 100644
--- a/lucene/src/java/org/apache/lucene/search/ConstantScoreQuery.java
+++ b/lucene/src/java/org/apache/lucene/search/ConstantScoreQuery.java
@@ -18,6 +18,7 @@ package org.apache.lucene.search;
*/
import org.apache.lucene.index.IndexReader;
+import org.apache.lucene.index.IndexReader.AtomicReaderContext;
import org.apache.lucene.index.Term;
import org.apache.lucene.util.ToStringUtils;
@@ -96,12 +97,10 @@ public class ConstantScoreQuery extends Query {
protected class ConstantWeight extends Weight {
private final Weight innerWeight;
- private final Similarity similarity;
private float queryNorm;
private float queryWeight;
public ConstantWeight(IndexSearcher searcher) throws IOException {
- this.similarity = getSimilarity(searcher);
this.innerWeight = (query == null) ? null : query.createWeight(searcher);
}
@@ -132,22 +131,22 @@ public class ConstantScoreQuery extends Query {
}
@Override
- public Scorer scorer(IndexReader reader, boolean scoreDocsInOrder, boolean topScorer) throws IOException {
+ public Scorer scorer(AtomicReaderContext context, ScorerContext scorerContext) throws IOException {
final DocIdSetIterator disi;
if (filter != null) {
assert query == null;
- final DocIdSet dis = filter.getDocIdSet(reader);
+ final DocIdSet dis = filter.getDocIdSet(context);
if (dis == null)
return null;
disi = dis.iterator();
} else {
assert query != null && innerWeight != null;
disi =
- innerWeight.scorer(reader, scoreDocsInOrder, topScorer);
+ innerWeight.scorer(context, scorerContext);
}
if (disi == null)
return null;
- return new ConstantScorer(similarity, disi, this);
+ return new ConstantScorer(disi, this);
}
@Override
@@ -156,8 +155,8 @@ public class ConstantScoreQuery extends Query {
}
@Override
- public Explanation explain(IndexReader reader, int doc) throws IOException {
- final Scorer cs = scorer(reader, true, false);
+ public Explanation explain(AtomicReaderContext context, int doc) throws IOException {
+ final Scorer cs = scorer(context, ScorerContext.def());
final boolean exists = (cs != null && cs.advance(doc) == doc);
final ComplexExplanation result = new ComplexExplanation();
@@ -180,8 +179,8 @@ public class ConstantScoreQuery extends Query {
final DocIdSetIterator docIdSetIterator;
final float theScore;
- public ConstantScorer(Similarity similarity, DocIdSetIterator docIdSetIterator, Weight w) throws IOException {
- super(similarity,w);
+ public ConstantScorer(DocIdSetIterator docIdSetIterator, Weight w) throws IOException {
+ super(w);
theScore = w.getValue();
this.docIdSetIterator = docIdSetIterator;
}
@@ -211,8 +210,7 @@ public class ConstantScoreQuery extends Query {
@Override
public void setScorer(Scorer scorer) throws IOException {
// we must wrap again here, but using the scorer passed in as parameter:
- collector.setScorer(new ConstantScorer(ConstantScorer.this.getSimilarity(),
- scorer, ConstantScorer.this.weight));
+ collector.setScorer(new ConstantScorer(scorer, ConstantScorer.this.weight));
}
@Override
@@ -221,8 +219,8 @@ public class ConstantScoreQuery extends Query {
}
@Override
- public void setNextReader(IndexReader reader, int docBase) throws IOException {
- collector.setNextReader(reader, docBase);
+ public void setNextReader(AtomicReaderContext context) throws IOException {
+ collector.setNextReader(context);
}
@Override
@@ -243,10 +241,8 @@ public class ConstantScoreQuery extends Query {
}
// this optimization allows out of order scoring as top scorer,
- // TODO: theoretically this method should not be called because its protected and
- // this class does not use it, it should be public in Scorer!
@Override
- protected boolean score(Collector collector, int max, int firstDocID) throws IOException {
+ public boolean score(Collector collector, int max, int firstDocID) throws IOException {
if (docIdSetIterator instanceof Scorer) {
return ((Scorer) docIdSetIterator).score(wrapCollector(collector), max, firstDocID);
} else {
diff --git a/lucene/src/java/org/apache/lucene/search/DefaultSimilarity.java b/lucene/src/java/org/apache/lucene/search/DefaultSimilarity.java
index 0ab551ad8b2..71c8a229089 100644
--- a/lucene/src/java/org/apache/lucene/search/DefaultSimilarity.java
+++ b/lucene/src/java/org/apache/lucene/search/DefaultSimilarity.java
@@ -20,7 +20,7 @@ import org.apache.lucene.index.FieldInvertState;
*/
/** Expert: Default scoring implementation. */
-public class DefaultSimilarity extends Similarity {
+public class DefaultSimilarity extends Similarity implements SimilarityProvider {
/** Implemented as
* <code>state.getBoost()*lengthNorm(numTerms)</code>, where
@@ -37,17 +37,10 @@ public class DefaultSimilarity extends Similarity {
numTerms = state.getLength() - state.getNumOverlap();
else
numTerms = state.getLength();
- return (state.getBoost() * lengthNorm(field, numTerms));
- }
-
- /** Implemented as <code>1/sqrt(numTerms)</code>. */
- @Override
- public float lengthNorm(String fieldName, int numTerms) {
- return (float)(1.0 / Math.sqrt(numTerms));
+ return state.getBoost() * ((float) (1.0 / Math.sqrt(numTerms)));
}
/** Implemented as <code>1/sqrt(sumOfSquaredWeights)</code>. */
- @Override
public float queryNorm(float sumOfSquaredWeights) {
return (float)(1.0 / Math.sqrt(sumOfSquaredWeights));
}
@@ -71,7 +64,6 @@ public class DefaultSimilarity extends Similarity {
}
/** Implemented as <code>overlap / maxOverlap</code>. */
- @Override
public float coord(int overlap, int maxOverlap) {
return overlap / (float)maxOverlap;
}
@@ -96,4 +88,12 @@ public class DefaultSimilarity extends Similarity {
public boolean getDiscountOverlaps() {
return discountOverlaps;
}
+
+ /**
+ * Returns this default implementation for all fields.
+ * Override this method to customize scoring on a per-field basis.
+ */
+ public Similarity get(String field) {
+ return this;
+ }
}
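
A hedged sketch of the per-field hook introduced above: the field name and the overlap setting are illustrative, and the subclass is hypothetical.

```java
import org.apache.lucene.search.DefaultSimilarity;
import org.apache.lucene.search.Similarity;

// Illustrative provider: use a differently-configured Similarity for the "title" field.
class PerFieldDefaultSimilarity extends DefaultSimilarity {
  private final DefaultSimilarity titleSim = new DefaultSimilarity();
  {
    titleSim.setDiscountOverlaps(false);   // e.g. count overlap tokens for titles
  }

  @Override
  public Similarity get(String field) {
    return "title".equals(field) ? titleSim : this;
  }
}
```
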
diff --git a/lucene/src/java/org/apache/lucene/search/DisjunctionMaxQuery.java b/lucene/src/java/org/apache/lucene/search/DisjunctionMaxQuery.java
index b6cd0295247..0434232035e 100644
--- a/lucene/src/java/org/apache/lucene/search/DisjunctionMaxQuery.java
+++ b/lucene/src/java/org/apache/lucene/search/DisjunctionMaxQuery.java
@@ -23,6 +23,7 @@ import java.util.Iterator;
import java.util.Set;
import org.apache.lucene.index.IndexReader;
+import org.apache.lucene.index.IndexReader.AtomicReaderContext;
import org.apache.lucene.index.Term;
/**
@@ -94,29 +95,26 @@ public class DisjunctionMaxQuery extends Query implements Iterable {
* change suddenly in the next release.
*/
protected class DisjunctionMaxWeight extends Weight {
- /** The Similarity implementation. */
- protected Similarity similarity;
/** The Weights for our subqueries, in 1-1 correspondence with disjuncts */
protected ArrayList weights = new ArrayList(); // The Weight's for our subqueries, in 1-1 correspondence with disjuncts
- /* Construct the Weight for this Query searched by searcher. Recursively construct subquery weights. */
+ /** Construct the Weight for this Query searched by searcher. Recursively construct subquery weights. */
public DisjunctionMaxWeight(IndexSearcher searcher) throws IOException {
- this.similarity = searcher.getSimilarity();
for (Query disjunctQuery : disjuncts) {
weights.add(disjunctQuery.createWeight(searcher));
}
}
- /* Return our associated DisjunctionMaxQuery */
+ /** Return our associated DisjunctionMaxQuery */
@Override
public Query getQuery() { return DisjunctionMaxQuery.this; }
- /* Return our boost */
+ /** Return our boost */
@Override
public float getValue() { return getBoost(); }
- /* Compute the sub of squared weights of us applied to our subqueries. Used for normalization. */
+ /** Compute the sum of squared weights of us applied to our subqueries. Used for normalization. */
@Override
public float sumOfSquaredWeights() throws IOException {
float max = 0.0f, sum = 0.0f;
@@ -130,7 +128,7 @@ public class DisjunctionMaxQuery extends Query implements Iterable {
return (((sum - max) * tieBreakerMultiplier * tieBreakerMultiplier) + max) * boost * boost;
}
- /* Apply the computed normalization factor to our subqueries */
+ /** Apply the computed normalization factor to our subqueries */
@Override
public void normalize(float norm) {
norm *= getBoost(); // Incorporate our boost
@@ -139,32 +137,31 @@ public class DisjunctionMaxQuery extends Query implements Iterable {
}
}
- /* Create the scorer used to score our associated DisjunctionMaxQuery */
+ /** Create the scorer used to score our associated DisjunctionMaxQuery */
@Override
- public Scorer scorer(IndexReader reader, boolean scoreDocsInOrder,
- boolean topScorer) throws IOException {
+ public Scorer scorer(AtomicReaderContext context, ScorerContext scorerContext) throws IOException {
Scorer[] scorers = new Scorer[weights.size()];
int idx = 0;
for (Weight w : weights) {
- Scorer subScorer = w.scorer(reader, true, false);
+ Scorer subScorer = w.scorer(context, ScorerContext.def());
if (subScorer != null && subScorer.nextDoc() != DocIdSetIterator.NO_MORE_DOCS) {
scorers[idx++] = subScorer;
}
}
if (idx == 0) return null; // all scorers did not have documents
- DisjunctionMaxScorer result = new DisjunctionMaxScorer(tieBreakerMultiplier, similarity, scorers, idx);
+ DisjunctionMaxScorer result = new DisjunctionMaxScorer(this, tieBreakerMultiplier, scorers, idx);
return result;
}
- /* Explain the score we computed for doc */
+ /** Explain the score we computed for doc */
@Override
- public Explanation explain(IndexReader reader, int doc) throws IOException {
- if (disjuncts.size() == 1) return weights.get(0).explain(reader,doc);
+ public Explanation explain(AtomicReaderContext context, int doc) throws IOException {
+ if (disjuncts.size() == 1) return weights.get(0).explain(context,doc);
ComplexExplanation result = new ComplexExplanation();
float max = 0.0f, sum = 0.0f;
result.setDescription(tieBreakerMultiplier == 0.0f ? "max of:" : "max plus " + tieBreakerMultiplier + " times others of:");
for (Weight wt : weights) {
- Explanation e = wt.explain(reader, doc);
+ Explanation e = wt.explain(context, doc);
if (e.isMatch()) {
result.setMatch(Boolean.TRUE);
result.addDetail(e);
@@ -178,7 +175,7 @@ public class DisjunctionMaxQuery extends Query implements Iterable<Query> {
} // end of DisjunctionMaxWeight inner class
- /* Create the Weight used to score us */
+ /** Create the Weight used to score us */
@Override
public Weight createWeight(IndexSearcher searcher) throws IOException {
return new DisjunctionMaxWeight(searcher);
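The hunks above show the reworked Weight surface: scorer() now receives an AtomicReaderContext plus a ScorerContext, and explain() receives an AtomicReaderContext, so both run once per segment with segment-relative doc ids. A minimal pass-through sketch of that surface follows; DelegatingWeight is an illustrative name, and the class sits in org.apache.lucene.search only to keep the sketch short — neither is part of this patch.

    package org.apache.lucene.search;

    import java.io.IOException;

    import org.apache.lucene.index.IndexReader.AtomicReaderContext;

    // Sketch only: forwards every Weight method to a wrapped Weight, using the
    // same per-segment signatures that DisjunctionMaxWeight implements above.
    class DelegatingWeight extends Weight {
      private final Weight in;

      DelegatingWeight(Weight in) { this.in = in; }

      @Override public Query getQuery() { return in.getQuery(); }
      @Override public float getValue() { return in.getValue(); }
      @Override public float sumOfSquaredWeights() throws IOException { return in.sumOfSquaredWeights(); }
      @Override public void normalize(float norm) { in.normalize(norm); }

      @Override
      public Scorer scorer(AtomicReaderContext context, ScorerContext scorerContext) throws IOException {
        // One atomic (segment) reader per call.
        return in.scorer(context, scorerContext);
      }

      @Override
      public Explanation explain(AtomicReaderContext context, int doc) throws IOException {
        return in.explain(context, doc);
      }
    }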
diff --git a/lucene/src/java/org/apache/lucene/search/DisjunctionMaxScorer.java b/lucene/src/java/org/apache/lucene/search/DisjunctionMaxScorer.java
index d6f5d2a8a5c..9995062c2fe 100644
--- a/lucene/src/java/org/apache/lucene/search/DisjunctionMaxScorer.java
+++ b/lucene/src/java/org/apache/lucene/search/DisjunctionMaxScorer.java
@@ -40,22 +40,20 @@ class DisjunctionMaxScorer extends Scorer {
/**
* Creates a new instance of DisjunctionMaxScorer
*
+ * @param weight
+ * The Weight to be used.
* @param tieBreakerMultiplier
* Multiplier applied to non-maximum-scoring subqueries for a
* document as they are summed into the result.
- * @param similarity
- * -- not used since our definition involves neither coord nor terms
- * directly
* @param subScorers
* The sub scorers this Scorer should iterate on
* @param numScorers
* The actual number of scorers to iterate on. Note that the array's
* length may be larger than the actual number of scorers.
*/
- public DisjunctionMaxScorer(float tieBreakerMultiplier,
- Similarity similarity, Scorer[] subScorers, int numScorers) throws IOException {
- super(similarity);
-
+ public DisjunctionMaxScorer(Weight weight, float tieBreakerMultiplier,
+ Scorer[] subScorers, int numScorers) throws IOException {
+ super(weight);
this.tieBreakerMultiplier = tieBreakerMultiplier;
// The passed subScorers array includes only scorers which have documents
// (DisjunctionMaxQuery takes care of that), and their nextDoc() was already
diff --git a/lucene/src/java/org/apache/lucene/search/DisjunctionSumScorer.java b/lucene/src/java/org/apache/lucene/search/DisjunctionSumScorer.java
index 7e5016d902b..2f7fa5daf33 100644
--- a/lucene/src/java/org/apache/lucene/search/DisjunctionSumScorer.java
+++ b/lucene/src/java/org/apache/lucene/search/DisjunctionSumScorer.java
@@ -58,6 +58,7 @@ class DisjunctionSumScorer extends Scorer {
private float currentScore = Float.NaN;
/** Construct a DisjunctionScorer.
+ * @param weight The weight to be used.
* @param subScorers A collection of at least two subscorers.
* @param minimumNrMatchers The positive minimum number of subscorers that should
* match to match this query.
@@ -67,8 +68,8 @@ class DisjunctionSumScorer extends Scorer {
* When minimumNrMatchers equals the number of subScorers,
* it is more efficient to use ConjunctionScorer.
*/
- public DisjunctionSumScorer(List<Scorer> subScorers, int minimumNrMatchers) throws IOException {
- super(null);
+ public DisjunctionSumScorer(Weight weight, List<Scorer> subScorers, int minimumNrMatchers) throws IOException {
+ super(weight);
nrScorers = subScorers.size();
@@ -88,8 +89,8 @@ class DisjunctionSumScorer extends Scorer {
/** Construct a DisjunctionScorer, using one as the minimum number
* of matching subscorers.
*/
- public DisjunctionSumScorer(List<Scorer> subScorers) throws IOException {
- this(subScorers, 1);
+ public DisjunctionSumScorer(Weight weight, List<Scorer> subScorers) throws IOException {
+ this(weight, subScorers, 1);
}
/** Called the first time nextDoc() or advance() is called to
@@ -123,7 +124,7 @@ class DisjunctionSumScorer extends Scorer {
* @return true if more matching documents may remain.
*/
@Override
- protected boolean score(Collector collector, int max, int firstDocID) throws IOException {
+ public boolean score(Collector collector, int max, int firstDocID) throws IOException {
// firstDocID is ignored since nextDoc() sets 'currentDoc'
collector.setScorer(this);
while (currentDoc < max) {
diff --git a/lucene/src/java/org/apache/lucene/search/DocIdSetIterator.java b/lucene/src/java/org/apache/lucene/search/DocIdSetIterator.java
index f10d04c0d48..39a73345f9b 100644
--- a/lucene/src/java/org/apache/lucene/search/DocIdSetIterator.java
+++ b/lucene/src/java/org/apache/lucene/search/DocIdSetIterator.java
@@ -78,10 +78,10 @@ public abstract class DocIdSetIterator {
*
* Some implementations are considerably more efficient than that.
*
- * NOTE: certain implementations may return a different value (each
- * time) if called several times in a row with the same target.
+ * NOTE: when target ≤ current, implementations may opt
+ * not to advance beyond their current {@link #docID()}.
*
- * NOTE: this method may be called with {@value #NO_MORE_DOCS} for
+ * NOTE: this method may be called with {@link #NO_MORE_DOCS} for
* efficiency by some Scorers. If your implementation cannot efficiently
* determine that it should exhaust, it is recommended that you check for that
* value in each call to this method.
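The reworded note above adjusts the advance() contract: when target is not beyond the current docID(), an implementation may legally stay where it is. A small sketch of an iterator honouring that contract follows; IntArrayDocIdSetIterator and its sorted int[] input are assumptions for illustration, not part of this patch.

    import org.apache.lucene.search.DocIdSetIterator;

    // Sketch only: iterates a sorted array of doc ids.
    public class IntArrayDocIdSetIterator extends DocIdSetIterator {
      private final int[] docs; // sorted ascending, no duplicates (assumption)
      private int idx = -1;
      private int doc = -1;

      public IntArrayDocIdSetIterator(int[] docs) {
        this.docs = docs;
      }

      @Override
      public int docID() {
        return doc;
      }

      @Override
      public int nextDoc() {
        return doc = (++idx < docs.length) ? docs[idx] : NO_MORE_DOCS;
      }

      @Override
      public int advance(int target) {
        if (doc != -1 && target <= doc) {
          return doc; // allowed by the NOTE above: don't move when target <= current
        }
        while (nextDoc() < target) {
          // nextDoc() returns NO_MORE_DOCS (Integer.MAX_VALUE) when exhausted,
          // which terminates this loop for any target.
        }
        return doc;
      }
    }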
diff --git a/lucene/src/java/org/apache/lucene/search/ExactPhraseScorer.java b/lucene/src/java/org/apache/lucene/search/ExactPhraseScorer.java
index f2c94a7ae6f..153821d92d0 100644
--- a/lucene/src/java/org/apache/lucene/search/ExactPhraseScorer.java
+++ b/lucene/src/java/org/apache/lucene/search/ExactPhraseScorer.java
@@ -60,9 +60,12 @@ final class ExactPhraseScorer extends Scorer {
private int docID = -1;
private int freq;
+ private final Similarity similarity;
+
ExactPhraseScorer(Weight weight, PhraseQuery.PostingsAndFreq[] postings,
Similarity similarity, byte[] norms) throws IOException {
- super(similarity, weight);
+ super(weight);
+ this.similarity = similarity;
this.norms = norms;
this.value = weight.getValue();
@@ -87,7 +90,7 @@ final class ExactPhraseScorer extends Scorer {
}
for (int i = 0; i < SCORE_CACHE_SIZE; i++) {
- scoreCache[i] = getSimilarity().tf((float) i) * value;
+ scoreCache[i] = similarity.tf((float) i) * value;
}
}
@@ -207,9 +210,9 @@ final class ExactPhraseScorer extends Scorer {
if (freq < SCORE_CACHE_SIZE) {
raw = scoreCache[freq];
} else {
- raw = getSimilarity().tf((float) freq) * value;
+ raw = similarity.tf((float) freq) * value;
}
- return norms == null ? raw : raw * getSimilarity().decodeNormValue(norms[docID]); // normalize
+ return norms == null ? raw : raw * similarity.decodeNormValue(norms[docID]); // normalize
}
private int phraseFreq() throws IOException {
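For reference, the score() path above computes raw = similarity.tf(freq) * weight.getValue(), then multiplies by the decoded field norm when norms are present. A toy arithmetic check follows; the weight value 1.5 and phrase frequency 4 are made-up inputs, not values from the patch.

    import org.apache.lucene.search.DefaultSimilarity;

    public class PhraseScoreArithmetic {
      public static void main(String[] args) {
        DefaultSimilarity sim = new DefaultSimilarity();
        float weightValue = 1.5f; // stands in for weight.getValue()
        int freq = 4;             // stands in for one document's phrase frequency
        float raw = sim.tf((float) freq) * weightValue; // sqrt(4) * 1.5 = 3.0
        // With norms present, ExactPhraseScorer additionally multiplies raw by
        // similarity.decodeNormValue(norms[docID]).
        System.out.println("raw phrase score = " + raw);
      }
    }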
diff --git a/lucene/src/java/org/apache/lucene/search/FieldCacheImpl.java b/lucene/src/java/org/apache/lucene/search/FieldCacheImpl.java
index b583dc6fe78..971d7459840 100644
--- a/lucene/src/java/org/apache/lucene/search/FieldCacheImpl.java
+++ b/lucene/src/java/org/apache/lucene/search/FieldCacheImpl.java
@@ -137,6 +137,13 @@ public class FieldCacheImpl implements FieldCache { // Made Public so that
public Object getValue() { return value; }
}
+ final static IndexReader.ReaderFinishedListener purgeReader = new IndexReader.ReaderFinishedListener() {
+ // @Override -- not until Java 1.6
+ public void finished(IndexReader reader) {
+ FieldCache.DEFAULT.purge(reader);
+ }
+ };
+
/** Expert: Internal cache. */
final static class Cache {
Cache() {
@@ -171,8 +178,10 @@ public class FieldCacheImpl implements FieldCache { // Made Public so that
synchronized (readerCache) {
innerCache = readerCache.get(readerKey);
if (innerCache == null) {
+ // First time this reader is using FieldCache
innerCache = new HashMap<Entry,Object>();
readerCache.put(readerKey, innerCache);
+ reader.addReaderFinishedListener(purgeReader);
value = null;
} else {
value = innerCache.get(key);
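The purgeReader listener added above ties FieldCache entries to the reader's lifecycle: the cache registers the listener the first time a reader is seen and is purged when the reader is finished. The sketch below applies the same pattern to an application-level per-reader cache; ExternalReaderCache is an illustrative name, not part of Lucene.

    import java.util.Collections;
    import java.util.Map;
    import java.util.WeakHashMap;

    import org.apache.lucene.index.IndexReader;

    // Sketch only: keys application data by reader core and evicts it via the
    // same ReaderFinishedListener hook that FieldCacheImpl uses above.
    public class ExternalReaderCache {
      private final Map<Object, Object> cache =
          Collections.synchronizedMap(new WeakHashMap<Object, Object>());

      private final IndexReader.ReaderFinishedListener evict =
          new IndexReader.ReaderFinishedListener() {
            // @Override -- not until Java 1.6
            public void finished(IndexReader reader) {
              cache.remove(reader.getCoreCacheKey());
            }
          };

      public void put(IndexReader reader, Object value) {
        if (cache.put(reader.getCoreCacheKey(), value) == null) {
          // First entry for this reader: hook the purge callback.
          reader.addReaderFinishedListener(evict);
        }
      }

      public Object get(IndexReader reader) {
        return cache.get(reader.getCoreCacheKey());
      }
    }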
diff --git a/lucene/src/java/org/apache/lucene/search/FieldCacheRangeFilter.java b/lucene/src/java/org/apache/lucene/search/FieldCacheRangeFilter.java
index 6c4245a5d70..9293e509608 100644
--- a/lucene/src/java/org/apache/lucene/search/FieldCacheRangeFilter.java
+++ b/lucene/src/java/org/apache/lucene/search/FieldCacheRangeFilter.java
@@ -19,6 +19,7 @@ package org.apache.lucene.search;
import java.io.IOException;
import org.apache.lucene.index.IndexReader;
+import org.apache.lucene.index.IndexReader.AtomicReaderContext;
import org.apache.lucene.index.MultiFields;
import org.apache.lucene.util.NumericUtils;
import org.apache.lucene.util.Bits;
@@ -73,7 +74,7 @@ public abstract class FieldCacheRangeFilter<T> extends Filter {
/** This method is implemented for each data type */
@Override
- public abstract DocIdSet getDocIdSet(IndexReader reader) throws IOException;
+ public abstract DocIdSet getDocIdSet(AtomicReaderContext context) throws IOException;
/**
* Creates a string range filter using {@link FieldCache#getTermsIndex}. This works with all
@@ -83,8 +84,8 @@ public abstract class FieldCacheRangeFilter<T> extends Filter {
public static FieldCacheRangeFilter<String> newStringRange(String field, String lowerVal, String upperVal, boolean includeLower, boolean includeUpper) {
return new FieldCacheRangeFilter<String>(field, null, lowerVal, upperVal, includeLower, includeUpper) {
@Override
- public DocIdSet getDocIdSet(IndexReader reader) throws IOException {
- final FieldCache.DocTermsIndex fcsi = FieldCache.DEFAULT.getTermsIndex(reader, field);
+ public DocIdSet getDocIdSet(AtomicReaderContext context) throws IOException {
+ final FieldCache.DocTermsIndex fcsi = FieldCache.DEFAULT.getTermsIndex(context.reader, field);
final BytesRef spare = new BytesRef();
final int lowerPoint = fcsi.binarySearchLookup(lowerVal == null ? null : new BytesRef(lowerVal), spare);
final int upperPoint = fcsi.binarySearchLookup(upperVal == null ? null : new BytesRef(upperVal), spare);
@@ -124,7 +125,7 @@ public abstract class FieldCacheRangeFilter<T> extends Filter {
// for this DocIdSet, we can ignore deleted docs
// because deleted docs have an order of 0 (null entry in StringIndex)
- return new FieldCacheDocIdSet(reader, true) {
+ return new FieldCacheDocIdSet(context.reader, true) {
@Override
final boolean matchDoc(int doc) {
final int docOrd = fcsi.getOrd(doc);
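The string-range factory above now builds filters whose getDocIdSet() is invoked once per segment with an AtomicReaderContext, while caller-side usage is unchanged. The sketch below assumes an already-open IndexSearcher and a string field named "category" (both illustrative, not from the patch).

    import java.io.IOException;

    import org.apache.lucene.search.FieldCacheRangeFilter;
    import org.apache.lucene.search.Filter;
    import org.apache.lucene.search.IndexSearcher;
    import org.apache.lucene.search.MatchAllDocsQuery;
    import org.apache.lucene.search.TopDocs;

    public class StringRangeFilterDemo {
      // Sketch only: match docs whose "category" term is in ["a", "m").
      public static TopDocs categoriesAtoL(IndexSearcher searcher) throws IOException {
        Filter filter = FieldCacheRangeFilter.newStringRange("category", "a", "m", true, false);
        // The searcher calls filter.getDocIdSet(AtomicReaderContext) per segment.
        return searcher.search(new MatchAllDocsQuery(), filter, 10);
      }
    }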
@@ -152,7 +153,7 @@ public abstract class FieldCacheRangeFilter<T> extends Filter {
public static FieldCacheRangeFilter<Byte> newByteRange(String field, FieldCache.ByteParser parser, Byte lowerVal, Byte upperVal, boolean includeLower, boolean includeUpper) {
return new FieldCacheRangeFilter<Byte>(field, parser, lowerVal, upperVal, includeLower, includeUpper) {
@Override
- public DocIdSet getDocIdSet(IndexReader reader) throws IOException {
+ public DocIdSet getDocIdSet(AtomicReaderContext context) throws IOException {
final byte inclusiveLowerPoint, inclusiveUpperPoint;
if (lowerVal != null) {
final byte i = lowerVal.byteValue();
@@ -174,9 +175,9 @@ public abstract class FieldCacheRangeFilter<T> extends Filter {
if (inclusiveLowerPoint > inclusiveUpperPoint)
return DocIdSet.EMPTY_DOCIDSET;
- final byte[] values = FieldCache.DEFAULT.getBytes(reader, field, (FieldCache.ByteParser) parser);
+ final byte[] values = FieldCache.DEFAULT.getBytes(context.reader, field, (FieldCache.ByteParser) parser);
// we only respect deleted docs if the range contains 0
- return new FieldCacheDocIdSet(reader, !(inclusiveLowerPoint <= 0 && inclusiveUpperPoint >= 0)) {
+ return new FieldCacheDocIdSet(context.reader, !(inclusiveLowerPoint <= 0 && inclusiveUpperPoint >= 0)) {
@Override
boolean matchDoc(int doc) {
return values[doc] >= inclusiveLowerPoint && values[doc] <= inclusiveUpperPoint;
@@ -203,7 +204,7 @@ public abstract class FieldCacheRangeFilter<T> extends Filter {
public static FieldCacheRangeFilter<Short> newShortRange(String field, FieldCache.ShortParser parser, Short lowerVal, Short upperVal, boolean includeLower, boolean includeUpper) {
return new FieldCacheRangeFilter<Short>(field, parser, lowerVal, upperVal, includeLower, includeUpper) {
@Override
- public DocIdSet getDocIdSet(IndexReader reader) throws IOException {
+ public DocIdSet getDocIdSet(AtomicReaderContext context) throws IOException {
final short inclusiveLowerPoint, inclusiveUpperPoint;
if (lowerVal != null) {
short i = lowerVal.shortValue();
@@ -225,9 +226,9 @@ public abstract class FieldCacheRangeFilter<T> extends Filter {
if (inclusiveLowerPoint > inclusiveUpperPoint)
return DocIdSet.EMPTY_DOCIDSET;
- final short[] values = FieldCache.DEFAULT.getShorts(reader, field, (FieldCache.ShortParser) parser);
+ final short[] values = FieldCache.DEFAULT.getShorts(context.reader, field, (FieldCache.ShortParser) parser);
// ignore deleted docs if range doesn't contain 0
- return new FieldCacheDocIdSet(reader, !(inclusiveLowerPoint <= 0 && inclusiveUpperPoint >= 0)) {
+ return new FieldCacheDocIdSet(context.reader, !(inclusiveLowerPoint <= 0 && inclusiveUpperPoint >= 0)) {
@Override
boolean matchDoc(int doc) {
return values[doc] >= inclusiveLowerPoint && values[doc] <= inclusiveUpperPoint;
@@ -254,7 +255,7 @@ public abstract class FieldCacheRangeFilter<T> extends Filter {
public static FieldCacheRangeFilter<Integer> newIntRange(String field, FieldCache.IntParser parser, Integer lowerVal, Integer upperVal, boolean includeLower, boolean includeUpper) {
return new FieldCacheRangeFilter<Integer>(field, parser, lowerVal, upperVal, includeLower, includeUpper) {
@Override
- public DocIdSet getDocIdSet(IndexReader reader) throws IOException {
+ public DocIdSet getDocIdSet(AtomicReaderContext context) throws IOException {
final int inclusiveLowerPoint, inclusiveUpperPoint;
if (lowerVal != null) {
int i = lowerVal.intValue();
@@ -276,9 +277,9 @@ public abstract class FieldCacheRangeFilter<T> extends Filter {
if (inclusiveLowerPoint > inclusiveUpperPoint)
return DocIdSet.EMPTY_DOCIDSET;
- final int[] values = FieldCache.DEFAULT.getInts(reader, field, (FieldCache.IntParser) parser);
+ final int[] values = FieldCache.DEFAULT.getInts(context.reader, field, (FieldCache.IntParser) parser);
// ignore deleted docs if range doesn't contain 0
- return new FieldCacheDocIdSet(reader, !(inclusiveLowerPoint <= 0 && inclusiveUpperPoint >= 0)) {
+ return new FieldCacheDocIdSet(context.reader, !(inclusiveLowerPoint <= 0 && inclusiveUpperPoint >= 0)) {
@Override
boolean matchDoc(int doc) {
return values[doc] >= inclusiveLowerPoint && values[doc] <= inclusiveUpperPoint;
@@ -305,7 +306,7 @@ public abstract class FieldCacheRangeFilter