From ec0eff6dcf7b38b5aa212da00c0b02caf1c7dd98 Mon Sep 17 00:00:00 2001
From: Robert Muir
Date: Mon, 23 Jul 2012 15:52:39 +0000
Subject: [PATCH 1/3] LUCENE-3892: add TestPostingsFormat from pforcodec_3892
 branch to get it in jenkins

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1364688 13f79535-47bb-0310-9956-ffa450edef68
---
 .../lucene/index/TestPostingsFormat.java      | 887 ++++++++++++++++++
 .../analysis/BaseTokenStreamTestCase.java     |   8 -
 .../apache/lucene/util/LuceneTestCase.java    |   9 +
 3 files changed, 896 insertions(+), 8 deletions(-)
 create mode 100644 lucene/core/src/test/org/apache/lucene/index/TestPostingsFormat.java

diff --git a/lucene/core/src/test/org/apache/lucene/index/TestPostingsFormat.java b/lucene/core/src/test/org/apache/lucene/index/TestPostingsFormat.java
new file mode 100644
index 00000000000..7f59c961e5d
--- /dev/null
+++ b/lucene/core/src/test/org/apache/lucene/index/TestPostingsFormat.java
@@ -0,0 +1,887 @@
+package org.apache.lucene.index;
+
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.Collections;
+import java.util.EnumSet;
+import java.util.HashSet;
+import java.util.List;
+import java.util.Map;
+import java.util.Set;
+import java.util.TreeMap;
+
+import org.apache.lucene.codecs.Codec;
+import org.apache.lucene.codecs.FieldsConsumer;
+import org.apache.lucene.codecs.FieldsProducer;
+import org.apache.lucene.codecs.PostingsConsumer;
+import org.apache.lucene.codecs.TermStats;
+import org.apache.lucene.codecs.TermsConsumer;
+import org.apache.lucene.index.FieldInfo.IndexOptions;
+import org.apache.lucene.store.Directory;
+import org.apache.lucene.store.IOContext;
+import org.apache.lucene.util.Bits;
+import org.apache.lucene.util.BytesRef;
+import org.apache.lucene.util.Constants;
+import org.apache.lucene.util.FixedBitSet;
+import org.apache.lucene.util.LuceneTestCase;
+import org.apache.lucene.util._TestUtil;
+import org.junit.BeforeClass;
+
+/* NOTE: This test focuses on the postings
+ * (docs/freqs/positions/payloads/offsets) impl, not the
+ * terms dict.  The [stretch] goal is for this test to be
+ * so thorough in testing a new PostingsFormat that if this
+ * test passes, then all Lucene/Solr tests should also pass.  Ie,
+ * if there is some bug in a given PostingsFormat that this
+ * test fails to catch then this test needs to be improved! */
+
+// TODO can we make it easy for testing to pair up a "random terms dict impl" with your postings base format...
+
+// TODO test when you reuse after skipping a term or two, eg the block reuse case
+
+// TODO hmm contract says .doc() can return NO_MORE_DOCS
+// before nextDoc too...?
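+
+// How this test is structured: createPostings() builds a random
+// in-memory model of fields -> terms -> postings (docIDs, freqs,
+// positions, payloads, offsets); buildIndex() writes that model
+// through the default Codec's PostingsFormat; the tests then
+// re-open the postings via a FieldsProducer and verify every
+// enum against the model under the randomized Options below.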
+
+/* TODO
+  - threads
+  - assert doc=-1 before any nextDoc
+  - if a PF passes this test but fails other tests then this
+    test has a bug!!
+  - test tricky reuse cases, eg across fields
+  - verify you get null if you pass needFreq/needOffset but
+    they weren't indexed
+*/
+
+public class TestPostingsFormat extends LuceneTestCase {
+
+  private enum Option {
+    // Sometimes use .advance():
+    SKIPPING,
+
+    // Sometimes reuse the Docs/AndPositionsEnum across terms:
+    REUSE_ENUMS,
+
+    // Sometimes pass non-null live docs:
+    LIVE_DOCS,
+
+    // Sometimes seek to term using previously saved TermState:
+    TERM_STATE,
+
+    // Sometimes don't fully consume docs from the enum
+    PARTIAL_DOC_CONSUME,
+
+    // Sometimes don't fully consume positions at each doc
+    PARTIAL_POS_CONSUME,
+
+    // Sometimes check payloads
+    PAYLOADS,
+
+    // Test w/ multiple threads
+    THREADS};
+
+  private static class FieldAndTerm {
+    String field;
+    BytesRef term;
+
+    public FieldAndTerm(String field, BytesRef term) {
+      this.field = field;
+      this.term = BytesRef.deepCopyOf(term);
+    }
+  }
+
+  private static class Position {
+    int position;
+    byte[] payload;
+    int startOffset;
+    int endOffset;
+  }
+
+  private static class Posting implements Comparable<Posting> {
+    int docID;
+    List<Position> positions;
+
+    public int compareTo(Posting other) {
+      return docID - other.docID;
+    }
+  }
+
+  // Holds all postings:
+  private static Map<String,Map<BytesRef,List<Posting>>> fields = new TreeMap<String,Map<BytesRef,List<Posting>>>();
+
+  // Holds only live doc postings:
+  private static Map<String,Map<BytesRef,List<Posting>>> fieldsLive = new TreeMap<String,Map<BytesRef,List<Posting>>>();
+
+  private static FieldInfos fieldInfos;
+
+  private static int maxDocID;
+
+  private static FixedBitSet globalLiveDocs;
+
+  private static List<FieldAndTerm> allTerms;
+
+  @BeforeClass
+  public static void createPostings() throws IOException {
+
+    final int numFields = _TestUtil.nextInt(random(), 1, 5);
+    if (VERBOSE) {
+      System.out.println("TEST: " + numFields + " fields");
+    }
+
+    FieldInfo[] fieldInfoArray = new FieldInfo[numFields];
+    int fieldUpto = 0;
+    int numMediumTerms = 0;
+    int numBigTerms = 0;
+    int numManyPositions = 0;
+    while (fieldUpto < numFields) {
+      String field = _TestUtil.randomSimpleString(random());
+      if (fields.containsKey(field)) {
+        continue;
+      }
+
+      boolean fieldHasPayloads = random().nextBoolean();
+
+      fieldInfoArray[fieldUpto] = new FieldInfo(field, true, fieldUpto, false, false, fieldHasPayloads,
+                                                IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS,
+                                                null, DocValues.Type.FIXED_INTS_8, null);
+      fieldUpto++;
+
+      Map<BytesRef,List<Posting>> postings = new TreeMap<BytesRef,List<Posting>>();
+      fields.put(field, postings);
+      Set<String> seenTerms = new HashSet<String>();
+
+      // TODO
+      //final int numTerms = atLeast(10);
+      final int numTerms = 4;
+      for(int termUpto=0;termUpto<numTerms;termUpto++) {
+        String term = _TestUtil.randomSimpleString(random());
+        if (seenTerms.contains(term)) {
+          continue;
+        }
+        seenTerms.add(term);
+
+        int numDocs;
+        if (random().nextInt(10) == 3 && numBigTerms < 2) {
+          // Occasionally make a very high docFreq term:
+          numDocs = _TestUtil.nextInt(random(), 50000, 70000);
+          numBigTerms++;
+          term = "big_" + term;
+        } else if (random().nextInt(10) == 3 && numMediumTerms < 5) {
+          // Occasionally make a medium docFreq term:
+          numDocs = _TestUtil.nextInt(random(), 3000, 6000);
+          numMediumTerms++;
+          term = "medium_" + term;
+        } else {
+          // Low docFreq term:
+          numDocs = _TestUtil.nextInt(random(), 1, 40);
+        }
+
+        List<Posting> termPostings = new ArrayList<Posting>();
+        postings.put(new BytesRef(term), termPostings);
+
+        int docID = 0;
+
+        // TODO: more realistic to inversely tie this to numDocs:
+        int maxDocSpacing = _TestUtil.nextInt(random(), 1, 100);
+
+        // 10% of the time create big payloads:
+        int payloadSize;
+        if (!fieldHasPayloads) {
+          payloadSize = 0;
+        } else if (random().nextInt(10) == 7) {
+          payloadSize = random().nextInt(50);
+        } else {
+          payloadSize = random().nextInt(10);
+        }
+
+        boolean fixedPayloads = random().nextBoolean();
+
+        for(int docUpto=0;docUpto<numDocs;docUpto++) {
+          if (docUpto == 0 && random().nextBoolean()) {
+            // Sometimes index docID = 0
+          } else if (maxDocSpacing == 1) {
+            docID++;
+          } else {
+            docID += _TestUtil.nextInt(random(), 1, maxDocSpacing);
+          }
+
+          Posting posting = new Posting();
+          posting.docID = docID;
+          maxDocID = Math.max(maxDocID, docID);
+          posting.positions = new ArrayList<Position>();
+          termPostings.add(posting);
+
+          int freq;
+          if (random().nextInt(30) == 17 && numManyPositions < 10) {
+            freq = _TestUtil.nextInt(random(), 1, 1000);
+            numManyPositions++;
+          } else {
+            freq = _TestUtil.nextInt(random(), 1, 20);
+          }
+          int pos = 0;
+          int offset = 0;
+          int posSpacing = _TestUtil.nextInt(random(), 1, 100);
+          for(int posUpto=0;posUpto<freq;posUpto++) {
+            if (posUpto == 0 && random().nextBoolean()) {
+              // Sometimes index pos = 0
+            } else if (posSpacing == 1) {
+              pos++;
+            } else {
+              pos += _TestUtil.nextInt(random(), 1, posSpacing);
+            }
+
+            Position position = new Position();
+            posting.positions.add(position);
+            position.position = pos;
+            if (payloadSize != 0) {
+              int thisPayloadSize;
+              if (fixedPayloads) {
+                thisPayloadSize = payloadSize;
+              } else {
+                thisPayloadSize = random().nextInt(payloadSize);
+              }
+              if (thisPayloadSize != 0) {
+                position.payload = new byte[thisPayloadSize];
+                random().nextBytes(position.payload);
+              }
+            }
+            position.startOffset = offset + random().nextInt(5);
+            position.endOffset = position.startOffset + random().nextInt(10);
+            offset = position.endOffset;
+          }
+        }
+      }
+    }
+
+    fieldInfos = new FieldInfos(fieldInfoArray);
+
+    globalLiveDocs = new FixedBitSet(1+maxDocID);
+    double liveRatio = random().nextDouble();
+    for(int i=0;i<1+maxDocID;i++) {
+      if (random().nextDouble() <= liveRatio) {
+        globalLiveDocs.set(i);
+      }
+    }
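+
+    // Derive fieldsLive from fields: same fields and terms, but
+    // each postings list keeps only docs that are set in
+    // globalLiveDocs, giving the expected results when the
+    // LIVE_DOCS option is tested: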
+    for(Map.Entry<String,Map<BytesRef,List<Posting>>> fieldEnt : fields.entrySet()) {
+      Map<BytesRef,List<Posting>> postingsLive = new TreeMap<BytesRef,List<Posting>>();
+      fieldsLive.put(fieldEnt.getKey(), postingsLive);
+      for(Map.Entry<BytesRef,List<Posting>> termEnt : fieldEnt.getValue().entrySet()) {
+        List<Posting> termPostingsLive = new ArrayList<Posting>();
+        postingsLive.put(termEnt.getKey(), termPostingsLive);
+        for(Posting posting : termEnt.getValue()) {
+          if (globalLiveDocs.get(posting.docID)) {
+            termPostingsLive.add(posting);
+          }
+        }
+      }
+    }
+
+    allTerms = new ArrayList<FieldAndTerm>();
+    for(Map.Entry<String,Map<BytesRef,List<Posting>>> fieldEnt : fields.entrySet()) {
+      String field = fieldEnt.getKey();
+      for(Map.Entry<BytesRef,List<Posting>> termEnt : fieldEnt.getValue().entrySet()) {
+        allTerms.add(new FieldAndTerm(field, termEnt.getKey()));
+      }
+    }
+
+    if (VERBOSE) {
+      System.out.println("TEST: done init postings; maxDocID=" + maxDocID + "; " + allTerms.size() + " total terms, across " + fieldInfos.size() + " fields");
+    }
+  }
+
+  // TODO maybe instead of @BeforeClass just make a single test run: build postings & index & test it?
+
+  private FieldInfos currentFieldInfos;
+
+  // maxAllowed = the "highest" we can index, but we will still
+  // randomly index at lower IndexOption
+  private FieldsProducer buildIndex(Directory dir, IndexOptions maxAllowed, boolean allowPayloads) throws IOException {
+    SegmentInfo segmentInfo = new SegmentInfo(dir, Constants.LUCENE_MAIN_VERSION, "_0", 1+maxDocID, false, Codec.getDefault(), null, null);
+
+    int maxIndexOption = Arrays.asList(IndexOptions.values()).indexOf(maxAllowed);
+    if (VERBOSE) {
+      System.out.println("\nTEST: now build index");
+    }
+
+    int maxIndexOptionNoOffsets = Arrays.asList(IndexOptions.values()).indexOf(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS);
+
+    // TODO use allowPayloads
+
+    FieldInfo[] newFieldInfoArray = new FieldInfo[fields.size()];
+    for(int fieldUpto=0;fieldUpto<fields.size();fieldUpto++) {
+      FieldInfo oldFieldInfo = fieldInfos.fieldInfo(fieldUpto);
+
+      // Cap the IndexOptions if the postings format under test
+      // cannot index offsets:
+      int fieldMaxIndexOption;
+      if (doesntSupportOffsets.contains(_TestUtil.getPostingsFormat(oldFieldInfo.name))) {
+        fieldMaxIndexOption = Math.min(maxIndexOptionNoOffsets, maxIndexOption);
+      } else {
+        fieldMaxIndexOption = maxIndexOption;
+      }
+
+      // Randomly pick the IndexOptions to index this field with:
+      IndexOptions indexOptions = IndexOptions.values()[random().nextInt(1+fieldMaxIndexOption)];
+      boolean doPayloads = indexOptions.compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS) >= 0 && allowPayloads;
+
+      newFieldInfoArray[fieldUpto] = new FieldInfo(oldFieldInfo.name,
+                                                   true,
+                                                   fieldUpto,
+                                                   false,
+                                                   false,
+                                                   doPayloads,
+                                                   indexOptions,
+                                                   null,
+                                                   DocValues.Type.FIXED_INTS_8,
+                                                   null);
+    }
+
+    FieldInfos newFieldInfos = new FieldInfos(newFieldInfoArray);
+
+    SegmentWriteState writeState = new SegmentWriteState(null, dir,
+                                                         segmentInfo, newFieldInfos,
+                                                         32, null, IOContext.DEFAULT);
+
+    FieldsConsumer fieldsConsumer = Codec.getDefault().postingsFormat().fieldsConsumer(writeState);
+
+    for(Map.Entry<String,Map<BytesRef,List<Posting>>> fieldEnt : fields.entrySet()) {
+      String field = fieldEnt.getKey();
+      Map<BytesRef,List<Posting>> terms = fieldEnt.getValue();
+
+      FieldInfo fieldInfo = newFieldInfos.fieldInfo(field);
+
+      IndexOptions indexOptions = fieldInfo.getIndexOptions();
+
+      if (VERBOSE) {
+        System.out.println("field=" + field + " indexOptions=" + indexOptions);
+      }
+
+      boolean doFreq = indexOptions.compareTo(IndexOptions.DOCS_AND_FREQS) >= 0;
+      boolean doPos = indexOptions.compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS) >= 0;
+      boolean doPayloads = indexOptions.compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS) >= 0 && allowPayloads;
+
+      TermsConsumer termsConsumer = fieldsConsumer.addField(fieldInfo);
+      long sumTotalTF = 0;
+      long sumDF = 0;
+      FixedBitSet seenDocs = new FixedBitSet(maxDocID+1);
+      for(Map.Entry<BytesRef,List<Posting>> termEnt : terms.entrySet()) {
+        BytesRef term = termEnt.getKey();
+        List<Posting> postings = termEnt.getValue();
+        if (VERBOSE) {
+          System.out.println("  term=" + field + ":" + term.utf8ToString() + " docFreq=" + postings.size());
+        }
+
+        PostingsConsumer postingsConsumer = termsConsumer.startTerm(term);
+        long totalTF = 0;
+        int docCount = 0;
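+        // Push each posting through the consumer API: startDoc,
+        // then (if positions are indexed) addPosition per
+        // position, then finishDoc: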
+        for(Posting posting : postings) {
+          if (VERBOSE) {
+            System.out.println("    " + docCount + ": docID=" + posting.docID + " freq=" + posting.positions.size());
+          }
+          postingsConsumer.startDoc(posting.docID, posting.positions.size());
+          seenDocs.set(posting.docID);
+          if (doPos) {
+            totalTF += posting.positions.size();
+            for(Position pos : posting.positions) {
+              if (VERBOSE) {
+                if (doPayloads) {
+                  System.out.println("      pos=" + pos.position + " payload=" + (pos.payload == null ? "null" : pos.payload.length + " bytes"));
+                } else {
+                  System.out.println("      pos=" + pos.position);
+                }
+              }
+              postingsConsumer.addPosition(pos.position, (doPayloads && pos.payload != null) ? new BytesRef(pos.payload) : null, pos.startOffset, pos.endOffset);
+            }
+          } else if (doFreq) {
+            totalTF += posting.positions.size();
+          } else {
+            totalTF++;
+          }
+          postingsConsumer.finishDoc();
+          docCount++;
+        }
+        termsConsumer.finishTerm(term, new TermStats(postings.size(), totalTF));
+        sumTotalTF += totalTF;
+        sumDF += postings.size();
+      }
+
+      termsConsumer.finish(sumTotalTF, sumDF, seenDocs.cardinality());
+    }
+
+    fieldsConsumer.close();
+
+    if (VERBOSE) {
+      System.out.println("TEST: after indexing: files=");
+      for(String file : dir.listAll()) {
+        System.out.println("  " + file + ": " + dir.fileLength(file) + " bytes");
+      }
+    }
+
+    currentFieldInfos = newFieldInfos;
+
+    SegmentReadState readState = new SegmentReadState(dir, segmentInfo, newFieldInfos, IOContext.DEFAULT, 1);
+
+    return Codec.getDefault().postingsFormat().fieldsProducer(readState);
+  }
+
+  private static class ThreadState {
+    // Only used with REUSE option:
+    public DocsEnum reuseDocsEnum;
+    public DocsAndPositionsEnum reuseDocsAndPositionsEnum;
+  }
+
+  private void verifyEnum(ThreadState threadState,
+                          String field,
+                          BytesRef term,
+                          TermsEnum termsEnum,
+
+                          // Maximum options (docs/freqs/positions/offsets) to test:
+                          IndexOptions maxIndexOptions,
+
+                          EnumSet<Option>