LUCENE-7854: enable indexing custom term frequencies

This commit is contained in:
Mike McCandless 2017-06-06 13:37:31 -04:00
parent 09a9fdab6d
commit d276acfbbc
14 changed files with 803 additions and 20 deletions

View File

@ -14,6 +14,10 @@ New Features
well as the oldest Lucene version that contributed to the segment.
(Adrien Grand)
* LUCENE-7854: The new TermFrequencyAttribute used during analysis
with a custom token stream allows indexing custom term frequencies
(Mike McCandless)
API Changes
* LUCENE-2605: Classic QueryParser no longer splits on whitespace by default.

View File

@ -26,15 +26,18 @@ import org.apache.lucene.util.AttributeReflector;
* <li>{@link PositionIncrementAttribute}
* <li>{@link PositionLengthAttribute}
* <li>{@link OffsetAttribute}
* <li>{@link TermFrequencyAttribute}
* </ul>*/
public class PackedTokenAttributeImpl extends CharTermAttributeImpl
implements TypeAttribute, PositionIncrementAttribute,
PositionLengthAttribute, OffsetAttribute {
PositionLengthAttribute, OffsetAttribute,
TermFrequencyAttribute {
private int startOffset,endOffset;
private String type = DEFAULT_TYPE;
private int positionIncrement = 1;
private int positionLength = 1;
private int termFrequency = 1;
/** Constructs the attribute implementation. */
public PackedTokenAttributeImpl() {
@ -132,12 +135,26 @@ public class PackedTokenAttributeImpl extends CharTermAttributeImpl
this.type = type;
}
/** Sets the custom term frequency for the current token.
* @param termFrequency the frequency; must be 1 or greater */
@Override
public final void setTermFrequency(int termFrequency) {
if (termFrequency < 1) {
throw new IllegalArgumentException("Term frequency must be 1 or greater; got " + termFrequency);
}
this.termFrequency = termFrequency;
}
/** Returns the custom term frequency; 1 unless {@link #setTermFrequency} was called. */
@Override
public final int getTermFrequency() {
return termFrequency;
}
/** Resets all attributes held by this instance to their defaults:
* superclass state via {@code super.clear()}, position increment/length 1,
* term frequency 1, offsets 0, and the default type. */
@Override
public void clear() {
super.clear();
positionIncrement = positionLength = 1;
termFrequency = 1;
startOffset = endOffset = 0;
type = DEFAULT_TYPE;
}
@ -147,10 +164,8 @@ public class PackedTokenAttributeImpl extends CharTermAttributeImpl
@Override
public void end() {
super.end();
// super.end already calls this.clear, so we only set values that are different from clear:
positionIncrement = 0;
positionLength = 1;
startOffset = endOffset = 0;
type = DEFAULT_TYPE;
}
@Override
@ -170,6 +185,7 @@ public class PackedTokenAttributeImpl extends CharTermAttributeImpl
positionIncrement == other.positionIncrement &&
positionLength == other.positionLength &&
(type == null ? other.type == null : type.equals(other.type)) &&
termFrequency == other.termFrequency &&
super.equals(obj)
);
} else
@ -185,6 +201,7 @@ public class PackedTokenAttributeImpl extends CharTermAttributeImpl
code = code * 31 + positionLength;
if (type != null)
code = code * 31 + type.hashCode();
code = code * 31 + termFrequency;;
return code;
}
@ -198,12 +215,14 @@ public class PackedTokenAttributeImpl extends CharTermAttributeImpl
to.startOffset = startOffset;
to.endOffset = endOffset;
to.type = type;
to.termFrequency = termFrequency;
} else {
super.copyTo(target);
((OffsetAttribute) target).setOffset(startOffset, endOffset);
((PositionIncrementAttribute) target).setPositionIncrement(positionIncrement);
((PositionLengthAttribute) target).setPositionLength(positionLength);
((TypeAttribute) target).setType(type);
((TermFrequencyAttribute) target).setTermFrequency(termFrequency);
}
}
@ -215,6 +234,6 @@ public class PackedTokenAttributeImpl extends CharTermAttributeImpl
reflector.reflect(PositionIncrementAttribute.class, "positionIncrement", positionIncrement);
reflector.reflect(PositionLengthAttribute.class, "positionLength", positionLength);
reflector.reflect(TypeAttribute.class, "type", type);
reflector.reflect(TermFrequencyAttribute.class, "termFrequency", termFrequency);
}
}

View File

@ -0,0 +1,33 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.analysis.tokenattributes;
import org.apache.lucene.index.IndexOptions;
import org.apache.lucene.util.Attribute;
/** Sets the custom term frequency of a term within one document. If this attribute
 * is present in your analysis chain for a given field, that field must be indexed with
 * {@link IndexOptions#DOCS_AND_FREQS}. */
public interface TermFrequencyAttribute extends Attribute {
/** Set the custom term frequency of the current term within one document.
 * @param termFrequency the frequency; must be 1 or greater (implementations in this
 *        package throw {@link IllegalArgumentException} for smaller values) */
public void setTermFrequency(int termFrequency);
/** Returns the custom term frequency; defaults to 1 when never set. */
public int getTermFrequency();
}

View File

@ -0,0 +1,82 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.analysis.tokenattributes;
import org.apache.lucene.util.AttributeImpl;
import org.apache.lucene.util.AttributeReflector;
/** Default implementation of {@link TermFrequencyAttribute}. */
public class TermFrequencyAttributeImpl extends AttributeImpl implements TermFrequencyAttribute, Cloneable {

  // Custom per-document frequency of the current token; invariant: always >= 1,
  // enforced by setTermFrequency.
  private int termFrequency = 1;

  /** Initialize this attribute with a term frequency of 1. */
  public TermFrequencyAttributeImpl() {}

  @Override
  public void setTermFrequency(int termFrequency) {
    if (termFrequency >= 1) {
      this.termFrequency = termFrequency;
    } else {
      throw new IllegalArgumentException("Term frequency must be 1 or greater; got " + termFrequency);
    }
  }

  @Override
  public int getTermFrequency() {
    return termFrequency;
  }

  /** Restores the default frequency of 1 between tokens. */
  @Override
  public void clear() {
    termFrequency = 1;
  }

  /** Restores the default frequency of 1 at end-of-stream. */
  @Override
  public void end() {
    termFrequency = 1;
  }

  @Override
  public boolean equals(Object other) {
    if (other == this) {
      return true;
    }
    if (!(other instanceof TermFrequencyAttributeImpl)) {
      return false;
    }
    return ((TermFrequencyAttributeImpl) other).termFrequency == termFrequency;
  }

  @Override
  public int hashCode() {
    return Integer.hashCode(termFrequency);
  }

  @Override
  public void copyTo(AttributeImpl target) {
    // Copy through the interface so any TermFrequencyAttribute implementation
    // may serve as the target.
    ((TermFrequencyAttribute) target).setTermFrequency(termFrequency);
  }

  @Override
  public void reflectWith(AttributeReflector reflector) {
    reflector.reflect(TermFrequencyAttribute.class, "termFrequency", termFrequency);
  }
}

View File

@ -770,10 +770,8 @@ final class DefaultIndexingChain extends DocConsumer {
}
invertState.lastStartOffset = startOffset;
invertState.length++;
if (invertState.length < 0) {
throw new IllegalArgumentException("too many tokens in field '" + field.name() + "'");
}
invertState.length = Math.addExact(invertState.length, invertState.termFreqAttribute.getTermFrequency());
//System.out.println(" term=" + invertState.termAttribute);
// If we hit an exception in here, we abort

View File

@ -20,6 +20,7 @@ import org.apache.lucene.analysis.TokenStream; // javadocs
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.tokenattributes.PayloadAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.analysis.tokenattributes.TermFrequencyAttribute;
import org.apache.lucene.analysis.tokenattributes.TermToBytesRefAttribute;
import org.apache.lucene.util.AttributeSource;
@ -48,6 +49,7 @@ public final class FieldInvertState {
PositionIncrementAttribute posIncrAttribute;
PayloadAttribute payloadAttribute;
TermToBytesRefAttribute termAttribute;
TermFrequencyAttribute termFreqAttribute;
/** Creates {code FieldInvertState} for the specified
* field name. */
@ -88,6 +90,7 @@ public final class FieldInvertState {
if (this.attributeSource != attributeSource) {
this.attributeSource = attributeSource;
termAttribute = attributeSource.getAttribute(TermToBytesRefAttribute.class);
termFreqAttribute = attributeSource.addAttribute(TermFrequencyAttribute.class);
posIncrAttribute = attributeSource.addAttribute(PositionIncrementAttribute.class);
offsetAttribute = attributeSource.addAttribute(OffsetAttribute.class);
payloadAttribute = attributeSource.getAttribute(PayloadAttribute.class);

View File

@ -113,9 +113,10 @@ final class FreqProxTermsWriterPerField extends TermsHashPerField {
if (!hasFreq) {
assert postings.termFreqs == null;
postings.lastDocCodes[termID] = docState.docID;
fieldState.maxTermFrequency = Math.max(1, fieldState.maxTermFrequency);
} else {
postings.lastDocCodes[termID] = docState.docID << 1;
postings.termFreqs[termID] = 1;
postings.termFreqs[termID] = getTermFreq();
if (hasProx) {
writeProx(termID, fieldState.position);
if (hasOffsets) {
@ -124,19 +125,21 @@ final class FreqProxTermsWriterPerField extends TermsHashPerField {
} else {
assert !hasOffsets;
}
fieldState.maxTermFrequency = Math.max(postings.termFreqs[termID], fieldState.maxTermFrequency);
}
fieldState.maxTermFrequency = Math.max(1, fieldState.maxTermFrequency);
fieldState.uniqueTermCount++;
}
@Override
void addTerm(final int termID) {
final FreqProxPostingsArray postings = freqProxPostingsArray;
assert !hasFreq || postings.termFreqs[termID] > 0;
if (!hasFreq) {
assert postings.termFreqs == null;
if (termFreqAtt.getTermFrequency() != 1) {
throw new IllegalStateException("field \"" + fieldInfo.name + "\": must index term freq while using custom TermFrequencyAttribute");
}
if (docState.docID != postings.lastDocIDs[termID]) {
// New document; now encode docCode for previous doc:
assert docState.docID > postings.lastDocIDs[termID];
@ -160,8 +163,8 @@ final class FreqProxTermsWriterPerField extends TermsHashPerField {
}
// Init freq for the current document
postings.termFreqs[termID] = 1;
fieldState.maxTermFrequency = Math.max(1, fieldState.maxTermFrequency);
postings.termFreqs[termID] = getTermFreq();
fieldState.maxTermFrequency = Math.max(postings.termFreqs[termID], fieldState.maxTermFrequency);
postings.lastDocCodes[termID] = (docState.docID - postings.lastDocIDs[termID]) << 1;
postings.lastDocIDs[termID] = docState.docID;
if (hasProx) {
@ -175,7 +178,8 @@ final class FreqProxTermsWriterPerField extends TermsHashPerField {
}
fieldState.uniqueTermCount++;
} else {
fieldState.maxTermFrequency = Math.max(fieldState.maxTermFrequency, ++postings.termFreqs[termID]);
postings.termFreqs[termID] = Math.addExact(postings.termFreqs[termID], getTermFreq());
fieldState.maxTermFrequency = Math.max(fieldState.maxTermFrequency, postings.termFreqs[termID]);
if (hasProx) {
writeProx(termID, fieldState.position-postings.lastPositions[termID]);
if (hasOffsets) {
@ -185,6 +189,17 @@ final class FreqProxTermsWriterPerField extends TermsHashPerField {
}
}
/** Reads the token's custom term frequency from the attribute.
* A frequency other than 1 is rejected when this field also indexes positions,
* since custom frequencies are not supported together with proximity data. */
private int getTermFreq() {
int freq = termFreqAtt.getTermFrequency();
if (freq != 1) {
if (hasProx) {
throw new IllegalStateException("field \"" + fieldInfo.name + "\": cannot index positions while using custom TermFrequencyAttribute");
}
}
return freq;
}
@Override
public void newPostingsArray() {
freqProxPostingsArray = (FreqProxPostingsArray) postingsArray;

View File

@ -109,6 +109,7 @@ final class TermVectorsConsumerPerField extends TermsHashPerField {
@Override
boolean start(IndexableField field, boolean first) {
super.start(field, first);
assert field.fieldType().indexOptions() != IndexOptions.NONE;
if (first) {
@ -224,7 +225,7 @@ final class TermVectorsConsumerPerField extends TermsHashPerField {
void newTerm(final int termID) {
TermVectorsPostingsArray postings = termVectorsPostingsArray;
postings.freqs[termID] = 1;
postings.freqs[termID] = getTermFreq();
postings.lastOffsets[termID] = 0;
postings.lastPositions[termID] = 0;
@ -235,11 +236,25 @@ final class TermVectorsConsumerPerField extends TermsHashPerField {
void addTerm(final int termID) {
TermVectorsPostingsArray postings = termVectorsPostingsArray;
postings.freqs[termID]++;
postings.freqs[termID] += getTermFreq();
writeProx(postings, termID);
}
/** Reads the token's custom term frequency from the attribute.
* A frequency other than 1 is rejected when this field stores term vector
* positions or offsets, which are not supported with custom frequencies. */
private int getTermFreq() {
int freq = termFreqAtt.getTermFrequency();
if (freq != 1) {
if (doVectorPositions) {
throw new IllegalArgumentException("field \"" + fieldInfo.name + "\": cannot index term vector positions while using custom TermFrequencyAttribute");
}
if (doVectorOffsets) {
throw new IllegalArgumentException("field \"" + fieldInfo.name + "\": cannot index term vector offsets while using custom TermFrequencyAttribute");
}
}
return freq;
}
@Override
public void newPostingsArray() {
termVectorsPostingsArray = (TermVectorsPostingsArray) postingsArray;

View File

@ -19,12 +19,13 @@ package org.apache.lucene.index;
import java.io.IOException;
import org.apache.lucene.analysis.tokenattributes.TermFrequencyAttribute;
import org.apache.lucene.analysis.tokenattributes.TermToBytesRefAttribute;
import org.apache.lucene.util.ByteBlockPool;
import org.apache.lucene.util.BytesRefHash.BytesStartArray;
import org.apache.lucene.util.BytesRefHash;
import org.apache.lucene.util.Counter;
import org.apache.lucene.util.IntBlockPool;
import org.apache.lucene.util.BytesRefHash.BytesStartArray;
abstract class TermsHashPerField implements Comparable<TermsHashPerField> {
private static final int HASH_INIT_SIZE = 4;
@ -35,6 +36,7 @@ abstract class TermsHashPerField implements Comparable<TermsHashPerField> {
protected final DocumentsWriterPerThread.DocState docState;
protected final FieldInvertState fieldState;
TermToBytesRefAttribute termAtt;
protected TermFrequencyAttribute termFreqAtt;
// Copied from our perThread
final IntBlockPool intPool;
@ -287,6 +289,7 @@ abstract class TermsHashPerField implements Comparable<TermsHashPerField> {
* document. */
boolean start(IndexableField field, boolean first) {
termAtt = fieldState.termAttribute;
termFreqAtt = fieldState.termFreqAttribute;
if (nextPerField != null) {
doNextCall = nextPerField.start(field, first);
}

View File

@ -125,6 +125,7 @@ public class TestToken extends LuceneTestCase {
t.setFlags(8);
t.setPositionIncrement(3);
t.setPositionLength(11);
t.setTermFrequency(42);
TestUtil.assertAttributeReflection(t,
new HashMap<String, Object>() {{
put(CharTermAttribute.class.getName() + "#term", "foobar");
@ -136,6 +137,7 @@ public class TestToken extends LuceneTestCase {
put(PayloadAttribute.class.getName() + "#payload", null);
put(TypeAttribute.class.getName() + "#type", TypeAttribute.DEFAULT_TYPE);
put(FlagsAttribute.class.getName() + "#flags", 8);
put(TermFrequencyAttribute.class.getName() + "#termFrequency", 42);
}});
}
}

View File

@ -82,6 +82,7 @@ public class TestPackedTokenAttributeImpl extends LuceneTestCase {
t.setPositionIncrement(3);
t.setPositionLength(11);
t.setType("foobar");
t.setTermFrequency(42);
TestUtil.assertAttributeReflection(t,
new HashMap<String, Object>() {{
put(CharTermAttribute.class.getName() + "#term", "foobar");
@ -91,6 +92,7 @@ public class TestPackedTokenAttributeImpl extends LuceneTestCase {
put(PositionIncrementAttribute.class.getName() + "#positionIncrement", 3);
put(PositionLengthAttribute.class.getName() + "#positionLength", 11);
put(TypeAttribute.class.getName() + "#type", "foobar");
put(TermFrequencyAttribute.class.getName() + "#termFrequency", 42);
}});
}
}

View File

@ -0,0 +1,468 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.index;
import java.io.IOException;
import org.apache.lucene.analysis.MockAnalyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.TermFrequencyAttribute;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.FieldType;
import org.apache.lucene.document.TextField;
import org.apache.lucene.search.CollectionStatistics;
import org.apache.lucene.search.TermStatistics;
import org.apache.lucene.search.similarities.Similarity;
import org.apache.lucene.store.Directory;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.IOUtils;
import org.apache.lucene.util.LuceneTestCase;
import static org.apache.lucene.index.PostingsEnum.NO_MORE_DOCS;
/** Tests for LUCENE-7854: indexing custom term frequencies supplied via a
 * {@link TermFrequencyAttribute} on the token stream. */
public class TestCustomTermFreq extends LuceneTestCase {
/** Token stream that emits a fixed sequence of terms, attaching a
 * caller-supplied custom term frequency to each token. */
private static final class CannedTermFreqs extends TokenStream {
private final String[] terms;
private final int[] termFreqs;
private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
private final TermFrequencyAttribute termFreqAtt = addAttribute(TermFrequencyAttribute.class);
// index of the next term to emit
private int upto;
public CannedTermFreqs(String[] terms, int[] termFreqs) {
this.terms = terms;
this.termFreqs = termFreqs;
assert terms.length == termFreqs.length;
}
@Override
public boolean incrementToken() {
if (upto == terms.length) {
return false;
}
clearAttributes();
termAtt.append(terms[upto]);
termFreqAtt.setTermFrequency(termFreqs[upto]);
upto++;
return true;
}
// NOTE(review): does not chain to super.reset(); TokenStream's workflow
// documentation asks subclasses to do so -- confirm this omission is intentional.
@Override
public void reset() {
upto = 0;
}
}
/** One doc, each term appearing once: postings report the custom freqs verbatim. */
public void testSingletonTermsOneDoc() throws Exception {
Directory dir = newDirectory();
IndexWriter w = new IndexWriter(dir, new IndexWriterConfig(new MockAnalyzer(random())));
Document doc = new Document();
FieldType fieldType = new FieldType(TextField.TYPE_NOT_STORED);
fieldType.setIndexOptions(IndexOptions.DOCS_AND_FREQS);
Field field = new Field("field",
new CannedTermFreqs(new String[] {"foo", "bar"},
new int[] {42, 128}),
fieldType);
doc.add(field);
w.addDocument(doc);
IndexReader r = DirectoryReader.open(w);
PostingsEnum postings = MultiFields.getTermDocsEnum(r, "field", new BytesRef("bar"));
assertNotNull(postings);
assertEquals(0, postings.nextDoc());
assertEquals(128, postings.freq());
assertEquals(NO_MORE_DOCS, postings.nextDoc());
postings = MultiFields.getTermDocsEnum(r, "field", new BytesRef("foo"));
assertNotNull(postings);
assertEquals(0, postings.nextDoc());
assertEquals(42, postings.freq());
assertEquals(NO_MORE_DOCS, postings.nextDoc());
IOUtils.close(r, w, dir);
}
/** Two docs, each term once per doc: per-document custom freqs are preserved. */
public void testSingletonTermsTwoDocs() throws Exception {
Directory dir = newDirectory();
IndexWriter w = new IndexWriter(dir, new IndexWriterConfig(new MockAnalyzer(random())));
Document doc = new Document();
FieldType fieldType = new FieldType(TextField.TYPE_NOT_STORED);
fieldType.setIndexOptions(IndexOptions.DOCS_AND_FREQS);
Field field = new Field("field",
new CannedTermFreqs(new String[] {"foo", "bar"},
new int[] {42, 128}),
fieldType);
doc.add(field);
w.addDocument(doc);
doc = new Document();
field = new Field("field",
new CannedTermFreqs(new String[] {"foo", "bar"},
new int[] {50, 50}),
fieldType);
doc.add(field);
w.addDocument(doc);
IndexReader r = DirectoryReader.open(w);
PostingsEnum postings = MultiFields.getTermDocsEnum(r, "field", new BytesRef("bar"));
assertNotNull(postings);
assertEquals(0, postings.nextDoc());
assertEquals(128, postings.freq());
assertEquals(1, postings.nextDoc());
assertEquals(50, postings.freq());
assertEquals(NO_MORE_DOCS, postings.nextDoc());
postings = MultiFields.getTermDocsEnum(r, "field", new BytesRef("foo"));
assertNotNull(postings);
assertEquals(0, postings.nextDoc());
assertEquals(42, postings.freq());
assertEquals(1, postings.nextDoc());
assertEquals(50, postings.freq());
assertEquals(NO_MORE_DOCS, postings.nextDoc());
IOUtils.close(r, w, dir);
}
/** Repeated terms in one doc: custom freqs are summed (foo: 42+17, bar: 128+100). */
public void testRepeatTermsOneDoc() throws Exception {
Directory dir = newDirectory();
IndexWriter w = new IndexWriter(dir, new IndexWriterConfig(new MockAnalyzer(random())));
Document doc = new Document();
FieldType fieldType = new FieldType(TextField.TYPE_NOT_STORED);
fieldType.setIndexOptions(IndexOptions.DOCS_AND_FREQS);
Field field = new Field("field",
new CannedTermFreqs(new String[] {"foo", "bar", "foo", "bar"},
new int[] {42, 128, 17, 100}),
fieldType);
doc.add(field);
w.addDocument(doc);
IndexReader r = DirectoryReader.open(w);
PostingsEnum postings = MultiFields.getTermDocsEnum(r, "field", new BytesRef("bar"));
assertNotNull(postings);
assertEquals(0, postings.nextDoc());
assertEquals(228, postings.freq());
assertEquals(NO_MORE_DOCS, postings.nextDoc());
postings = MultiFields.getTermDocsEnum(r, "field", new BytesRef("foo"));
assertNotNull(postings);
assertEquals(0, postings.nextDoc());
assertEquals(59, postings.freq());
assertEquals(NO_MORE_DOCS, postings.nextDoc());
IOUtils.close(r, w, dir);
}
/** Repeated terms across two docs: per-document sums stay independent. */
public void testRepeatTermsTwoDocs() throws Exception {
Directory dir = newDirectory();
IndexWriter w = new IndexWriter(dir, new IndexWriterConfig(new MockAnalyzer(random())));
Document doc = new Document();
FieldType fieldType = new FieldType(TextField.TYPE_NOT_STORED);
fieldType.setIndexOptions(IndexOptions.DOCS_AND_FREQS);
Field field = new Field("field",
new CannedTermFreqs(new String[] {"foo", "bar", "foo", "bar"},
new int[] {42, 128, 17, 100}),
fieldType);
doc.add(field);
w.addDocument(doc);
doc = new Document();
fieldType.setIndexOptions(IndexOptions.DOCS_AND_FREQS);
field = new Field("field",
new CannedTermFreqs(new String[] {"foo", "bar", "foo", "bar"},
new int[] {50, 60, 70, 80}),
fieldType);
doc.add(field);
w.addDocument(doc);
IndexReader r = DirectoryReader.open(w);
PostingsEnum postings = MultiFields.getTermDocsEnum(r, "field", new BytesRef("bar"));
assertNotNull(postings);
assertEquals(0, postings.nextDoc());
assertEquals(228, postings.freq());
assertEquals(1, postings.nextDoc());
assertEquals(140, postings.freq());
assertEquals(NO_MORE_DOCS, postings.nextDoc());
postings = MultiFields.getTermDocsEnum(r, "field", new BytesRef("foo"));
assertNotNull(postings);
assertEquals(0, postings.nextDoc());
assertEquals(59, postings.freq());
assertEquals(1, postings.nextDoc());
assertEquals(120, postings.freq());
assertEquals(NO_MORE_DOCS, postings.nextDoc());
IOUtils.close(r, w, dir);
}
/** totalTermFreq must equal the sum of custom freqs over all documents. */
public void testTotalTermFreq() throws Exception {
Directory dir = newDirectory();
IndexWriter w = new IndexWriter(dir, new IndexWriterConfig(new MockAnalyzer(random())));
Document doc = new Document();
FieldType fieldType = new FieldType(TextField.TYPE_NOT_STORED);
fieldType.setIndexOptions(IndexOptions.DOCS_AND_FREQS);
Field field = new Field("field",
new CannedTermFreqs(new String[] {"foo", "bar", "foo", "bar"},
new int[] {42, 128, 17, 100}),
fieldType);
doc.add(field);
w.addDocument(doc);
doc = new Document();
fieldType.setIndexOptions(IndexOptions.DOCS_AND_FREQS);
field = new Field("field",
new CannedTermFreqs(new String[] {"foo", "bar", "foo", "bar"},
new int[] {50, 60, 70, 80}),
fieldType);
doc.add(field);
w.addDocument(doc);
IndexReader r = DirectoryReader.open(w);
TermsEnum termsEnum = MultiFields.getTerms(r, "field").iterator();
assertTrue(termsEnum.seekExact(new BytesRef("foo")));
assertEquals(179, termsEnum.totalTermFreq());
assertTrue(termsEnum.seekExact(new BytesRef("bar")));
assertEquals(368, termsEnum.totalTermFreq());
IOUtils.close(r, w, dir);
}
// you can't index proximity with custom term freqs:
public void testInvalidProx() throws Exception {
Directory dir = newDirectory();
IndexWriter w = new IndexWriter(dir, new IndexWriterConfig(new MockAnalyzer(random())));
Document doc = new Document();
// no explicit index options: the field type's default (which the expected
// message shows includes positions) triggers the rejection
FieldType fieldType = new FieldType(TextField.TYPE_NOT_STORED);
Field field = new Field("field",
new CannedTermFreqs(new String[] {"foo", "bar", "foo", "bar"},
new int[] {42, 128, 17, 100}),
fieldType);
doc.add(field);
Exception e = expectThrows(IllegalStateException.class, () -> {w.addDocument(doc);});
assertEquals("field \"field\": cannot index positions while using custom TermFrequencyAttribute", e.getMessage());
IOUtils.close(w, dir);
}
// you can't index DOCS_ONLY with custom term freq
public void testInvalidDocsOnly() throws Exception {
Directory dir = newDirectory();
IndexWriter w = new IndexWriter(dir, new IndexWriterConfig(new MockAnalyzer(random())));
Document doc = new Document();
FieldType fieldType = new FieldType(TextField.TYPE_NOT_STORED);
fieldType.setIndexOptions(IndexOptions.DOCS);
Field field = new Field("field",
new CannedTermFreqs(new String[] {"foo", "bar", "foo", "bar"},
new int[] {42, 128, 17, 100}),
fieldType);
doc.add(field);
Exception e = expectThrows(IllegalStateException.class, () -> {w.addDocument(doc);});
assertEquals("field \"field\": must index term freq while using custom TermFrequencyAttribute", e.getMessage());
IOUtils.close(w, dir);
}
// sum of term freqs must fit in an int
public void testOverflowInt() throws Exception {
Directory dir = newDirectory();
IndexWriter w = new IndexWriter(dir, new IndexWriterConfig(new MockAnalyzer(random())));
FieldType fieldType = new FieldType(TextField.TYPE_NOT_STORED);
fieldType.setIndexOptions(IndexOptions.DOCS);
Document doc = new Document();
doc.add(new Field("field", "this field should be indexed", fieldType));
w.addDocument(doc);
Document doc2 = new Document();
Field field = new Field("field",
new CannedTermFreqs(new String[] {"foo", "bar"},
new int[] {3, Integer.MAX_VALUE}),
fieldType);
doc2.add(field);
// 3 + Integer.MAX_VALUE overflows int; the indexing chain's Math.addExact throws,
// the failing document is dropped, and the earlier document survives
expectThrows(ArithmeticException.class, () -> {w.addDocument(doc2);});
IndexReader r = DirectoryReader.open(w);
assertEquals(1, r.numDocs());
IOUtils.close(r, w, dir);
}
/** Term vector positions are rejected together with custom term frequencies. */
public void testInvalidTermVectorPositions() throws Exception {
Directory dir = newDirectory();
IndexWriter w = new IndexWriter(dir, new IndexWriterConfig(new MockAnalyzer(random())));
Document doc = new Document();
FieldType fieldType = new FieldType(TextField.TYPE_NOT_STORED);
fieldType.setIndexOptions(IndexOptions.DOCS_AND_FREQS);
fieldType.setStoreTermVectors(true);
fieldType.setStoreTermVectorPositions(true);
Field field = new Field("field",
new CannedTermFreqs(new String[] {"foo", "bar", "foo", "bar"},
new int[] {42, 128, 17, 100}),
fieldType);
doc.add(field);
Exception e = expectThrows(IllegalArgumentException.class, () -> {w.addDocument(doc);});
assertEquals("field \"field\": cannot index term vector positions while using custom TermFrequencyAttribute", e.getMessage());
IOUtils.close(w, dir);
}
/** Term vector offsets are rejected together with custom term frequencies. */
public void testInvalidTermVectorOffsets() throws Exception {
Directory dir = newDirectory();
IndexWriter w = new IndexWriter(dir, new IndexWriterConfig(new MockAnalyzer(random())));
Document doc = new Document();
FieldType fieldType = new FieldType(TextField.TYPE_NOT_STORED);
fieldType.setIndexOptions(IndexOptions.DOCS_AND_FREQS);
fieldType.setStoreTermVectors(true);
fieldType.setStoreTermVectorOffsets(true);
Field field = new Field("field",
new CannedTermFreqs(new String[] {"foo", "bar", "foo", "bar"},
new int[] {42, 128, 17, 100}),
fieldType);
doc.add(field);
Exception e = expectThrows(IllegalArgumentException.class, () -> {w.addDocument(doc);});
assertEquals("field \"field\": cannot index term vector offsets while using custom TermFrequencyAttribute", e.getMessage());
IOUtils.close(w, dir);
}
/** Plain term vectors (no positions/offsets) must record the summed custom freqs. */
public void testTermVectors() throws Exception {
Directory dir = newDirectory();
IndexWriter w = new IndexWriter(dir, new IndexWriterConfig(new MockAnalyzer(random())));
Document doc = new Document();
FieldType fieldType = new FieldType(TextField.TYPE_NOT_STORED);
fieldType.setIndexOptions(IndexOptions.DOCS_AND_FREQS);
fieldType.setStoreTermVectors(true);
Field field = new Field("field",
new CannedTermFreqs(new String[] {"foo", "bar", "foo", "bar"},
new int[] {42, 128, 17, 100}),
fieldType);
doc.add(field);
w.addDocument(doc);
doc = new Document();
fieldType.setIndexOptions(IndexOptions.DOCS_AND_FREQS);
field = new Field("field",
new CannedTermFreqs(new String[] {"foo", "bar", "foo", "bar"},
new int[] {50, 60, 70, 80}),
fieldType);
doc.add(field);
w.addDocument(doc);
IndexReader r = DirectoryReader.open(w);
Fields fields = r.getTermVectors(0);
TermsEnum termsEnum = fields.terms("field").iterator();
assertTrue(termsEnum.seekExact(new BytesRef("bar")));
assertEquals(228, termsEnum.totalTermFreq());
PostingsEnum postings = termsEnum.postings(null);
assertNotNull(postings);
assertEquals(0, postings.nextDoc());
assertEquals(228, postings.freq());
assertEquals(NO_MORE_DOCS, postings.nextDoc());
assertTrue(termsEnum.seekExact(new BytesRef("foo")));
assertEquals(59, termsEnum.totalTermFreq());
postings = termsEnum.postings(null);
assertNotNull(postings);
assertEquals(0, postings.nextDoc());
assertEquals(59, postings.freq());
assertEquals(NO_MORE_DOCS, postings.nextDoc());
fields = r.getTermVectors(1);
termsEnum = fields.terms("field").iterator();
assertTrue(termsEnum.seekExact(new BytesRef("bar")));
assertEquals(140, termsEnum.totalTermFreq());
postings = termsEnum.postings(null);
assertNotNull(postings);
assertEquals(0, postings.nextDoc());
assertEquals(140, postings.freq());
assertEquals(NO_MORE_DOCS, postings.nextDoc());
assertTrue(termsEnum.seekExact(new BytesRef("foo")));
assertEquals(120, termsEnum.totalTermFreq());
postings = termsEnum.postings(null);
assertNotNull(postings);
assertEquals(0, postings.nextDoc());
assertEquals(120, postings.freq());
assertEquals(NO_MORE_DOCS, postings.nextDoc());
IOUtils.close(r, w, dir);
}
/**
* Similarity holds onto the FieldInvertState for subsequent verification.
*/
private static class NeverForgetsSimilarity extends Similarity {
// last FieldInvertState passed to computeNorm; inspected by the test after indexing
public FieldInvertState lastState;
private final static NeverForgetsSimilarity INSTANCE = new NeverForgetsSimilarity();
private NeverForgetsSimilarity() {
// no
}
@Override
public long computeNorm(FieldInvertState state) {
// capture the state so the test can assert on field statistics
this.lastState = state;
return 1;
}
@Override
public SimWeight computeWeight(float boost, CollectionStatistics collectionStats, TermStatistics... termStats) {
throw new UnsupportedOperationException();
}
@Override
public SimScorer simScorer(SimWeight weight, LeafReaderContext context) throws IOException {
throw new UnsupportedOperationException();
}
}
/** FieldInvertState must reflect custom freqs: maxTermFrequency is the largest
* per-term sum (128+100=228) and length is the total of all freqs (287). */
public void testFieldInvertState() throws Exception {
Directory dir = newDirectory();
IndexWriterConfig iwc = new IndexWriterConfig(new MockAnalyzer(random()));
iwc.setSimilarity(NeverForgetsSimilarity.INSTANCE);
IndexWriter w = new IndexWriter(dir, iwc);
Document doc = new Document();
FieldType fieldType = new FieldType(TextField.TYPE_NOT_STORED);
fieldType.setIndexOptions(IndexOptions.DOCS_AND_FREQS);
Field field = new Field("field",
new CannedTermFreqs(new String[] {"foo", "bar", "foo", "bar"},
new int[] {42, 128, 17, 100}),
fieldType);
doc.add(field);
w.addDocument(doc);
FieldInvertState fis = NeverForgetsSimilarity.INSTANCE.lastState;
assertEquals(228, fis.getMaxTermFrequency());
assertEquals(2, fis.getUniqueTermCount());
assertEquals(0, fis.getNumOverlap());
assertEquals(287, fis.getLength());
IOUtils.close(w, dir);
}
}

View File

@ -0,0 +1,139 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.index;
import java.io.IOException;
import java.util.HashMap;
import java.util.Map;
import org.apache.lucene.analysis.CannedTokenStream;
import org.apache.lucene.analysis.MockAnalyzer;
import org.apache.lucene.analysis.Token;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.TextField;
import org.apache.lucene.search.CollectionStatistics;
import org.apache.lucene.search.TermStatistics;
import org.apache.lucene.search.similarities.Similarity;
import org.apache.lucene.store.Directory;
import org.apache.lucene.util.IOUtils;
import org.apache.lucene.util.LuceneTestCase;
import org.apache.lucene.util.TestUtil;
/**
 * Verifies that {@link FieldInvertState} accurately reflects the token stream
 * that was indexed: max term frequency, unique term count, overlaps (stacked
 * tokens), length and final position.
 */
public class TestFieldInvertState extends LuceneTestCase {
  /**
   * Similarity holds onto the FieldInvertState for subsequent verification.
   */
  private static class NeverForgetsSimilarity extends Similarity {
    // Most recent state passed to computeNorm; the tests read it after indexing.
    public FieldInvertState lastState;
    private final static NeverForgetsSimilarity INSTANCE = new NeverForgetsSimilarity();

    private NeverForgetsSimilarity() {
      // no
    }

    // Captures the state; the constant norm value is irrelevant to the tests.
    @Override
    public long computeNorm(FieldInvertState state) {
      this.lastState = state;
      return 1;
    }

    // Scoring is never exercised here, so these fail fast if called.
    @Override
    public SimWeight computeWeight(float boost, CollectionStatistics collectionStats, TermStatistics... termStats) {
      throw new UnsupportedOperationException();
    }

    @Override
    public SimScorer simScorer(SimWeight weight, LeafReaderContext context) throws IOException {
      throw new UnsupportedOperationException();
    }
  }

  /** Three distinct single-occurrence tokens: freq 1 each, no overlap, length 3. */
  public void testBasic() throws Exception {
    Directory dir = newDirectory();
    IndexWriterConfig iwc = new IndexWriterConfig(new MockAnalyzer(random()));
    iwc.setSimilarity(NeverForgetsSimilarity.INSTANCE);
    IndexWriter w = new IndexWriter(dir, iwc);
    Document doc = new Document();
    Field field = new Field("field",
                            new CannedTokenStream(new Token("a", 0, 1),
                                                  new Token("b", 2, 3),
                                                  new Token("c", 4, 5)),
                            TextField.TYPE_NOT_STORED);
    doc.add(field);
    w.addDocument(doc);
    FieldInvertState fis = NeverForgetsSimilarity.INSTANCE.lastState;
    assertEquals(1, fis.getMaxTermFrequency());
    assertEquals(3, fis.getUniqueTermCount());
    assertEquals(0, fis.getNumOverlap());
    assertEquals(3, fis.getLength());
    IOUtils.close(w, dir);
  }

  /**
   * Builds a random token stream, occasionally stacking tokens at the same
   * position (posInc=0), and checks every statistic FieldInvertState exposes
   * against independently tracked expected values.
   */
  public void testRandom() throws Exception {
    int numUniqueTokens = TestUtil.nextInt(random(), 1, 25);
    Directory dir = newDirectory();
    IndexWriterConfig iwc = new IndexWriterConfig(new MockAnalyzer(random()));
    iwc.setSimilarity(NeverForgetsSimilarity.INSTANCE);
    IndexWriter w = new IndexWriter(dir, iwc);
    Document doc = new Document();

    int numTokens = atLeast(10000);
    Token[] tokens = new Token[numTokens];
    Map<Character,Integer> counts = new HashMap<>();
    int numStacked = 0;   // expected number of overlap (posInc=0) tokens
    int maxTermFreq = 0;  // expected max per-term frequency
    int pos = -1;         // expected final position
    for (int i=0;i<numTokens;i++) {
      char tokenChar = (char) ('a' + random().nextInt(numUniqueTokens));
      // Increment this term's expected frequency (replaces manual get/null-check/put).
      int newCount = counts.merge(tokenChar, 1, Integer::sum);
      maxTermFreq = Math.max(maxTermFreq, newCount);
      Token token = new Token(Character.toString(tokenChar), 2*i, 2*i+1);
      if (i > 0 && random().nextInt(7) == 3) {
        // Stack this token on the previous one's position.
        token.setPositionIncrement(0);
        numStacked++;
      } else {
        pos++;
      }
      tokens[i] = token;
    }

    Field field = new Field("field",
                            new CannedTokenStream(tokens),
                            TextField.TYPE_NOT_STORED);
    doc.add(field);
    w.addDocument(doc);
    FieldInvertState fis = NeverForgetsSimilarity.INSTANCE.lastState;
    assertEquals(maxTermFreq, fis.getMaxTermFrequency());
    assertEquals(counts.size(), fis.getUniqueTermCount());
    assertEquals(numStacked, fis.getNumOverlap());
    assertEquals(numTokens, fis.getLength());
    assertEquals(pos, fis.getPosition());
    IOUtils.close(w, dir);
  }
}

View File

@ -2676,11 +2676,11 @@ public abstract class LuceneTestCase extends Assert {
if (expectedType.isInstance(e)) {
return expectedType.cast(e);
}
AssertionFailedError assertion = new AssertionFailedError("Unexpected exception type, expected " + expectedType.getSimpleName());
AssertionFailedError assertion = new AssertionFailedError("Unexpected exception type, expected " + expectedType.getSimpleName() + " but got " + e);
assertion.initCause(e);
throw assertion;
}
throw new AssertionFailedError("Expected exception " + expectedType.getSimpleName());
throw new AssertionFailedError("Expected exception " + expectedType.getSimpleName() + " but no exception was thrown");
}
/**