From a936d09a723b365adfd98f56a0e58740a0ae57d5 Mon Sep 17 00:00:00 2001
From: Robert Muir
Date: Sat, 29 Sep 2012 15:49:37 +0000
Subject: [PATCH] LUCENE-4449: factor TestPostingsFormat into BasePostingsFormatTestCase

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1391826 13f79535-47bb-0310-9956-ffa450edef68
---
 .../lucene40/TestLucene40PostingsFormat.java |  34 +
 .../lucene/index/TestPostingsFormat.java     | 910 +----------------
 .../index/BasePostingsFormatTestCase.java    | 963 ++++++++++++++++++
 3 files changed, 1005 insertions(+), 902 deletions(-)
 create mode 100644 lucene/core/src/test/org/apache/lucene/codecs/lucene40/TestLucene40PostingsFormat.java
 create mode 100644 lucene/test-framework/src/java/org/apache/lucene/index/BasePostingsFormatTestCase.java

diff --git a/lucene/core/src/test/org/apache/lucene/codecs/lucene40/TestLucene40PostingsFormat.java b/lucene/core/src/test/org/apache/lucene/codecs/lucene40/TestLucene40PostingsFormat.java
new file mode 100644
index 00000000000..39950491a6e
--- /dev/null
+++ b/lucene/core/src/test/org/apache/lucene/codecs/lucene40/TestLucene40PostingsFormat.java
@@ -0,0 +1,34 @@
+package org.apache.lucene.codecs.lucene40;
+
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import org.apache.lucene.codecs.Codec;
+import org.apache.lucene.index.BasePostingsFormatTestCase;
+
+/**
+ * Tests Lucene40PostingsFormat
+ */
+public class TestLucene40PostingsFormat extends BasePostingsFormatTestCase {
+  private final Codec codec = new Lucene40Codec();
+
+  @Override
+  protected Codec getCodec() {
+    return codec;
+  }
+
+}
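This new test is the payoff of the refactor: a postings format now gets the full postings test suite by subclassing BasePostingsFormatTestCase and supplying a Codec. As a minimal sketch of how a custom format could plug in the same way -- TestMyPostingsFormat and MyPostingsFormat are hypothetical stand-ins, and overriding Lucene40Codec.getPostingsFormatForField is one way to route every field to the format under test:

import org.apache.lucene.codecs.Codec;
import org.apache.lucene.codecs.PostingsFormat;
import org.apache.lucene.codecs.lucene40.Lucene40Codec;
import org.apache.lucene.index.BasePostingsFormatTestCase;

public class TestMyPostingsFormat extends BasePostingsFormatTestCase {
  // Wrap the default 4.0 codec, but send every field to the
  // (hypothetical) postings format under test:
  private final Codec codec = new Lucene40Codec() {
    @Override
    public PostingsFormat getPostingsFormatForField(String field) {
      return new MyPostingsFormat();
    }
  };

  @Override
  protected Codec getCodec() {
    return codec;
  }
}
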
diff --git a/lucene/core/src/test/org/apache/lucene/index/TestPostingsFormat.java b/lucene/core/src/test/org/apache/lucene/index/TestPostingsFormat.java
index 94ba59e8a56..3b9ee178386 100644
--- a/lucene/core/src/test/org/apache/lucene/index/TestPostingsFormat.java
+++ b/lucene/core/src/test/org/apache/lucene/index/TestPostingsFormat.java
@@ -49,907 +49,13 @@ import org.apache.lucene.util._TestUtil;
 import org.junit.AfterClass;
 import org.junit.BeforeClass;
 
-/* NOTE: This test focuses on the postings
- * (docs/freqs/positions/payloads/offsets) impl, not the
- * terms dict. The [stretch] goal is for this test to be
- * so thorough in testing a new PostingsFormat that if this
- * test passes, then all Lucene/Solr tests should also pass. Ie,
- * if there is some bug in a given PostingsFormat that this
- * test fails to catch then this test needs to be improved! */
-
-// TODO can we make it easy for testing to pair up a "random terms dict impl" with your postings base format...
-
-// TODO test when you reuse after skipping a term or two, eg the block reuse case
-
-// TODO hmm contract says .doc() can return NO_MORE_DOCS
-// before nextDoc too...?
-
-/* TODO
-  - threads
-  - assert doc=-1 before any nextDoc
-  - if a PF passes this test but fails other tests then this
-    test has a bug!!
-  - test tricky reuse cases, eg across fields
-  - verify you get null if you pass needFreq/needOffset but
-    they weren't indexed
-*/
-
-public class TestPostingsFormat extends LuceneTestCase {
-
-  private enum Option {
-    // Sometimes use .advance():
-    SKIPPING,
-
-    // Sometimes reuse the Docs/AndPositionsEnum across terms:
-    REUSE_ENUMS,
-
-    // Sometimes pass non-null live docs:
-    LIVE_DOCS,
-
-    // Sometimes seek to term using previously saved TermState:
-    TERM_STATE,
-
-    // Sometimes don't fully consume docs from the enum
-    PARTIAL_DOC_CONSUME,
-
-    // Sometimes don't fully consume positions at each doc
-    PARTIAL_POS_CONSUME,
-
-    // Sometimes check payloads
-    PAYLOADS,
-
-    // Test w/ multiple threads
-    THREADS};
-
-  private static class FieldAndTerm {
-    String field;
-    BytesRef term;
-
-    public FieldAndTerm(String field, BytesRef term) {
-      this.field = field;
-      this.term = BytesRef.deepCopyOf(term);
-    }
-  }
-
-  private static class Position {
-    int position;
-    byte[] payload;
-    int startOffset;
-    int endOffset;
-  }
-
-  private static class Posting implements Comparable<Posting> {
-    int docID;
-    List<Position> positions;
-
-    public int compareTo(Posting other) {
-      return docID - other.docID;
-    }
-  }
-
-  // Holds all postings:
-  private static Map<String,Map<BytesRef,List<Posting>>> fields = new TreeMap<String,Map<BytesRef,List<Posting>>>();
-
-  // Holds only live doc postings:
-  private static Map<String,Map<BytesRef,List<Posting>>> fieldsLive = new TreeMap<String,Map<BytesRef,List<Posting>>>();
-
-  private static FieldInfos fieldInfos;
-
-  private static int maxDocID;
-
-  private static FixedBitSet globalLiveDocs;
-
-  private static List<FieldAndTerm> allTerms;
-
-  private static long totalPostings;
-  private static long totalPayloadBytes;
-
-  @BeforeClass
-  public static void createPostings() throws IOException {
-
-    final int numFields = _TestUtil.nextInt(random(), 1, 5);
-    if (VERBOSE) {
-      System.out.println("TEST: " + numFields + " fields");
-    }
-
-    FieldInfo[] fieldInfoArray = new FieldInfo[numFields];
-    int fieldUpto = 0;
-    int numMediumTerms = 0;
-    int numBigTerms = 0;
-    int numManyPositions = 0;
-    totalPostings = 0;
-    totalPayloadBytes = 0;
-    while (fieldUpto < numFields) {
-      String field = _TestUtil.randomSimpleString(random());
-      if (fields.containsKey(field)) {
-        continue;
-      }
-
-      boolean fieldHasPayloads = random().nextBoolean();
-
-      fieldInfoArray[fieldUpto] = new FieldInfo(field, true, fieldUpto, false, false, fieldHasPayloads,
-                                                IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS,
-                                                null, DocValues.Type.FIXED_INTS_8, null);
-      fieldUpto++;
-
-      Map<BytesRef,List<Posting>> postings = new TreeMap<BytesRef,List<Posting>>();
-      fields.put(field, postings);
-      Set<String> seenTerms = new HashSet<String>();
-
-      // TODO
-      //final int numTerms = atLeast(10);
-      final int numTerms = 4;
-      for(int termUpto=0;termUpto termPostings = new ArrayList();
-        postings.put(new BytesRef(term), termPostings);
-
-        int docID = 0;
-
-        // TODO: more realistic to inversely tie this to numDocs:
-        int maxDocSpacing = _TestUtil.nextInt(random(), 1, 100);
-
-        // 10% of the time create big payloads:
-        int payloadSize;
-        if (!fieldHasPayloads) {
-          payloadSize = 0;
-        } else if (random().nextInt(10) == 7) {
-          payloadSize = random().nextInt(50);
-        } else {
-          payloadSize = random().nextInt(10);
-        }
-
-        boolean fixedPayloads = random().nextBoolean();
-
-        for(int docUpto=0;docUpto();
-          termPostings.add(posting);
-
-          int freq;
-          if (random().nextInt(30) == 17 && numManyPositions < 10) {
-            freq = _TestUtil.nextInt(random(), 1, 1000);
-            numManyPositions++;
-          } else {
-            freq = _TestUtil.nextInt(random(), 1, 20);
-          }
-          int pos = 0;
-          int offset = 0;
-          int posSpacing = _TestUtil.nextInt(random(), 1, 100);
-          totalPostings += freq;
-          for(int posUpto=0;posUpto>> fieldEnt : fields.entrySet()) {
-      Map<BytesRef,List<Posting>> postingsLive = new TreeMap<BytesRef,List<Posting>>();
-      fieldsLive.put(fieldEnt.getKey(), postingsLive);
-      for(Map.Entry<BytesRef,List<Posting>> termEnt : fieldEnt.getValue().entrySet()) {
-        List<Posting> termPostingsLive = new ArrayList<Posting>();
-        postingsLive.put(termEnt.getKey(), termPostingsLive);
-        for(Posting posting : termEnt.getValue()) {
-          if (globalLiveDocs.get(posting.docID)) {
-            termPostingsLive.add(posting);
-          }
-        }
-      }
-    }
-
-    allTerms = new ArrayList<FieldAndTerm>();
-    for(Map.Entry<String,Map<BytesRef,List<Posting>>> fieldEnt : fields.entrySet()) {
-      String field = fieldEnt.getKey();
-      for(Map.Entry<BytesRef,List<Posting>> termEnt : fieldEnt.getValue().entrySet()) {
-        allTerms.add(new FieldAndTerm(field, termEnt.getKey()));
-      }
-    }
-
-    if (VERBOSE) {
-      System.out.println("TEST: done init postings; maxDocID=" + maxDocID + "; " + allTerms.size() + " total terms, across " + fieldInfos.size() + " fields");
-    }
-  }
-
-  @AfterClass
-  public static void afterClass() throws Exception {
-    allTerms = null;
-    fieldInfos = null;
-    fields = null;
-    fieldsLive = null;
-    globalLiveDocs = null;
-  }
-
-  // TODO maybe instead of @BeforeClass just make a single test run: build postings & index & test it?
-
-  private FieldInfos currentFieldInfos;
-
-  // maxAllowed = the "highest" we can index, but we will still
-  // randomly index at lower IndexOption
-  private FieldsProducer buildIndex(Directory dir, IndexOptions maxAllowed, boolean allowPayloads) throws IOException {
-    SegmentInfo segmentInfo = new SegmentInfo(dir, Constants.LUCENE_MAIN_VERSION, "_0", 1+maxDocID, false, Codec.getDefault(), null, null);
-
-    int maxIndexOption = Arrays.asList(IndexOptions.values()).indexOf(maxAllowed);
-    if (VERBOSE) {
-      System.out.println("\nTEST: now build index");
-    }
-
-    int maxIndexOptionNoOffsets = Arrays.asList(IndexOptions.values()).indexOf(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS);
-
-    // TODO use allowPayloads
-
-    FieldInfo[] newFieldInfoArray = new FieldInfo[fields.size()];
-    for(int fieldUpto=0;fieldUpto= 0 && allowPayloads;
-
-      newFieldInfoArray[fieldUpto] = new FieldInfo(oldFieldInfo.name,
-                                                   true,
-                                                   fieldUpto,
-                                                   false,
-                                                   false,
-                                                   doPayloads,
-                                                   indexOptions,
-                                                   null,
-                                                   DocValues.Type.FIXED_INTS_8,
-                                                   null);
-    }
-
-    FieldInfos newFieldInfos = new FieldInfos(newFieldInfoArray);
-
-    // Estimate that flushed segment size will be 25% of
-    // what we use in RAM:
-    long bytes = totalPostings * 8 + totalPayloadBytes;
-
-    SegmentWriteState writeState = new SegmentWriteState(null, dir,
-                                                         segmentInfo, newFieldInfos,
-                                                         32, null, new IOContext(new FlushInfo(maxDocID, bytes)));
-    FieldsConsumer fieldsConsumer = Codec.getDefault().postingsFormat().fieldsConsumer(writeState);
-
-    for(Map.Entry<String,Map<BytesRef,List<Posting>>> fieldEnt : fields.entrySet()) {
-      String field = fieldEnt.getKey();
-      Map<BytesRef,List<Posting>> terms = fieldEnt.getValue();
-
-      FieldInfo fieldInfo = newFieldInfos.fieldInfo(field);
-
-      IndexOptions indexOptions = fieldInfo.getIndexOptions();
-
-      if (VERBOSE) {
-        System.out.println("field=" + field + " indexOtions=" + indexOptions);
-      }
-
-      boolean doFreq = indexOptions.compareTo(IndexOptions.DOCS_AND_FREQS) >= 0;
-      boolean doPos = indexOptions.compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS) >= 0;
-      boolean doPayloads = indexOptions.compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS) >= 0 && allowPayloads;
-      boolean doOffsets = indexOptions.compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS) >= 0;
-
-      TermsConsumer termsConsumer = fieldsConsumer.addField(fieldInfo);
-      long sumTotalTF = 0;
-      long sumDF = 0;
-      FixedBitSet seenDocs = new FixedBitSet(maxDocID+1);
-      for(Map.Entry<BytesRef,List<Posting>> termEnt : terms.entrySet()) {
-        BytesRef term = termEnt.getKey();
-        List<Posting> postings = termEnt.getValue();
-        if (VERBOSE) {
-          System.out.println(" term=" + field + ":" + term.utf8ToString() + " docFreq=" + postings.size());
-        }
-
-        PostingsConsumer postingsConsumer = termsConsumer.startTerm(term);
-        long totalTF = 0;
-        int docCount = 0;
-        for(Posting posting : postings) {
-          if (VERBOSE) {
-            System.out.println(" " + docCount + ": docID=" + posting.docID + " freq=" + posting.positions.size());
-          }
-          postingsConsumer.startDoc(posting.docID, doFreq ? posting.positions.size() : -1);
-          seenDocs.set(posting.docID);
-          if (doPos) {
-            totalTF += posting.positions.size();
-            for(Position pos : posting.positions) {
-              if (VERBOSE) {
-                if (doPayloads) {
-                  System.out.println(" pos=" + pos.position + " payload=" + (pos.payload == null ? "null" : pos.payload.length + " bytes"));
-                } else {
-                  System.out.println(" pos=" + pos.position);
-                }
-              }
-              postingsConsumer.addPosition(pos.position, (doPayloads && pos.payload != null) ? new BytesRef(pos.payload) : null, doOffsets ? pos.startOffset : -1, doOffsets ? pos.endOffset : -1);
-            }
-          } else if (doFreq) {
-            totalTF += posting.positions.size();
-          } else {
-            totalTF++;
-          }
-          postingsConsumer.finishDoc();
-          docCount++;
-        }
-        termsConsumer.finishTerm(term, new TermStats(postings.size(), doFreq ? totalTF : -1));
-        sumTotalTF += totalTF;
-        sumDF += postings.size();
-      }
-
-      termsConsumer.finish(doFreq ? sumTotalTF : -1, sumDF, seenDocs.cardinality());
-    }
-
-    fieldsConsumer.close();
-
-    if (VERBOSE) {
-      System.out.println("TEST: after indexing: files=");
-      for(String file : dir.listAll()) {
-        System.out.println(" " + file + ": " + dir.fileLength(file) + " bytes");
-      }
-    }
-
-    currentFieldInfos = newFieldInfos;
-
-    SegmentReadState readState = new SegmentReadState(dir, segmentInfo, newFieldInfos, IOContext.DEFAULT, 1);
-
-    return Codec.getDefault().postingsFormat().fieldsProducer(readState);
-  }
-
-  private static class ThreadState {
-    // Only used with REUSE option:
-    public DocsEnum reuseDocsEnum;
-    public DocsAndPositionsEnum reuseDocsAndPositionsEnum;
-  }
-
-  private void verifyEnum(ThreadState threadState,
-                          String field,
-                          BytesRef term,
-                          TermsEnum termsEnum,
-
-                          // Maximum options (docs/freqs/positions/offsets) to test:
-                          IndexOptions maxIndexOptions,
-
-                          EnumSet
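verifyEnum pulls a DocsEnum or DocsAndPositionsEnum through the TermsEnum and checks everything it returns against the in-memory postings built in createPostings. A condensed sketch of that style of check -- docsEnum and expectedPostings are hypothetical stand-ins for the state verifyEnum actually threads through, and freq() is only meaningful when freqs were indexed:

// Walk the enum and compare each returned doc against the next
// expected in-memory posting for this term:
int upto = 0;
int docID;
while ((docID = docsEnum.nextDoc()) != DocIdSetIterator.NO_MORE_DOCS) {
  Posting expected = expectedPostings.get(upto++);
  assertEquals("wrong docID", expected.docID, docID);
  assertEquals("wrong freq", expected.positions.size(), docsEnum.freq());
}
// The enum must be exhausted exactly when the expected postings are:
assertEquals(expectedPostings.size(), upto);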