From b8fc54e72aa18176763648f6021e7f1d184defa1 Mon Sep 17 00:00:00 2001 From: Karl-Johan Wettin Date: Sat, 28 Jun 2008 17:23:35 +0000 Subject: [PATCH] LUCENE-1312: Added full support for InstantiatedIndexReader#getFieldNames() and extended the test case to assert deleted documents behaves as they should (they did). git-svn-id: https://svn.apache.org/repos/asf/lucene/java/trunk@672556 13f79535-47bb-0310-9956-ffa450edef68 --- contrib/instantiated/CHANGES.txt | 33 ++++ .../store/instantiated/FieldSetting.java | 61 ++++++ .../store/instantiated/FieldSettings.java | 95 ++++++++++ .../store/instantiated/InstantiatedIndex.java | 90 ++++++++- .../instantiated/InstantiatedIndexReader.java | 176 ++++++++++++------ .../instantiated/InstantiatedIndexWriter.java | 165 ++++++++-------- .../instantiated/InstantiatedTermDocs.java | 5 - .../instantiated/InstantiatedTermEnum.java | 2 +- .../lucene/store/instantiated/package.html | 5 +- .../store/instantiated/TestIndicesEquals.java | 37 +++- 10 files changed, 512 insertions(+), 157 deletions(-) create mode 100644 contrib/instantiated/CHANGES.txt create mode 100644 contrib/instantiated/src/java/org/apache/lucene/store/instantiated/FieldSetting.java create mode 100644 contrib/instantiated/src/java/org/apache/lucene/store/instantiated/FieldSettings.java diff --git a/contrib/instantiated/CHANGES.txt b/contrib/instantiated/CHANGES.txt new file mode 100644 index 00000000000..3d777fe843a --- /dev/null +++ b/contrib/instantiated/CHANGES.txt @@ -0,0 +1,33 @@ +Lucene InstantiatedIndex contrib module change Log + +======================= Trunk (not yet released) ======================= + +Changes in runtime behavior + + (None) + +API Changes + + (None) + +Bug fixes + + 1. LUCENE-1312: Added full support for InstantiatedIndexReader#getFieldNames() + and tests that assert that deleted documents behaves as they should (they did). 
+ (Jason Rutherglen, Karl Wettin) + +New features + + (None) + +Documentation + + (None) + +Build + + (None) + +Test Cases + + (None) \ No newline at end of file diff --git a/contrib/instantiated/src/java/org/apache/lucene/store/instantiated/FieldSetting.java b/contrib/instantiated/src/java/org/apache/lucene/store/instantiated/FieldSetting.java new file mode 100644 index 00000000000..34d96bf710c --- /dev/null +++ b/contrib/instantiated/src/java/org/apache/lucene/store/instantiated/FieldSetting.java @@ -0,0 +1,61 @@ +package org.apache.lucene.store.instantiated; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +/** + * For non package access see {@link org.apache.lucene.index.IndexReader#getFieldNames(org.apache.lucene.index.IndexReader.FieldOption)} + */ +class FieldSetting { + String fieldName; + + boolean storeTermVector = false; + boolean storeOffsetWithTermVector = false; + boolean storePositionWithTermVector = false; + boolean storePayloads = false; + + boolean stored = false; + boolean indexed = false; + boolean tokenized = false; + boolean compressed = false; + + FieldSetting() { + } + + + FieldSetting(String fieldName) { + this.fieldName = fieldName; + } + + public boolean equals(Object o) { + if (this == o) + return true; + if (o == null || getClass() != o.getClass()) + return false; + + final FieldSetting that = (FieldSetting) o; + + return fieldName.equals(that.fieldName); + + } + + public int hashCode() { + return fieldName.hashCode(); + } + + +} diff --git a/contrib/instantiated/src/java/org/apache/lucene/store/instantiated/FieldSettings.java b/contrib/instantiated/src/java/org/apache/lucene/store/instantiated/FieldSettings.java new file mode 100644 index 00000000000..99b4ace7ad6 --- /dev/null +++ b/contrib/instantiated/src/java/org/apache/lucene/store/instantiated/FieldSettings.java @@ -0,0 +1,95 @@ +package org.apache.lucene.store.instantiated; + +import java.util.HashMap; +import java.util.Map; +import java.util.Collection; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** + * Essetially a Map + */ +class FieldSettings { + + + FieldSettings() { + } + + private Map fieldSettings = new HashMap(); + + synchronized FieldSetting merge(FieldSetting fieldSetting) { + FieldSetting setting = fieldSettings.get(fieldSetting.fieldName); + + if (setting == null) { + setting = new FieldSetting(fieldSetting.fieldName); + fieldSettings.put(fieldSetting.fieldName, setting); + } + + if (fieldSetting.stored) { + setting.stored = true; + } + if (fieldSetting.compressed) { + setting.compressed = true; + } + + if ("b3".equals(fieldSetting.fieldName)) { + System.currentTimeMillis(); + } + if (fieldSetting.indexed) { + setting.indexed = true; + } + if (fieldSetting.tokenized) { + setting.tokenized = true; + } + + if (fieldSetting.storeTermVector) { + setting.storeTermVector = true; + } + if (fieldSetting.storeOffsetWithTermVector) { + setting.storeOffsetWithTermVector = true; + } + if (fieldSetting.storePositionWithTermVector) { + setting.storePositionWithTermVector = true; + } + + if (fieldSetting.storePayloads) { + setting.storePayloads = true; + } + + return setting; + + } + + FieldSetting get(String name) { + return fieldSettings.get(name); + } + + FieldSetting get(String name, boolean create) { + FieldSetting fieldSetting = fieldSettings.get(name); + if (create && fieldSetting == null) { + fieldSetting = new FieldSetting(name); + fieldSettings.put(name, fieldSetting); + } + return fieldSetting; + } + + Collection values() { + return fieldSettings.values(); + } + +} diff --git 
a/contrib/instantiated/src/java/org/apache/lucene/store/instantiated/InstantiatedIndex.java b/contrib/instantiated/src/java/org/apache/lucene/store/instantiated/InstantiatedIndex.java index 4a89dc0adca..dae534bfaf3 100644 --- a/contrib/instantiated/src/java/org/apache/lucene/store/instantiated/InstantiatedIndex.java +++ b/contrib/instantiated/src/java/org/apache/lucene/store/instantiated/InstantiatedIndex.java @@ -16,14 +16,24 @@ package org.apache.lucene.store.instantiated; * limitations under the License. */ +import java.io.IOException; +import java.io.Serializable; +import java.util.ArrayList; +import java.util.Collection; +import java.util.HashMap; +import java.util.HashSet; +import java.util.List; +import java.util.Map; +import java.util.Set; + import org.apache.lucene.analysis.Analyzer; import org.apache.lucene.document.Document; import org.apache.lucene.document.Field; -import org.apache.lucene.index.*; - -import java.io.IOException; -import java.io.Serializable; -import java.util.*; +import org.apache.lucene.index.IndexReader; +import org.apache.lucene.index.Term; +import org.apache.lucene.index.TermEnum; +import org.apache.lucene.index.TermPositionVector; +import org.apache.lucene.index.TermPositions; /** * Represented as a coupled graph of class instances, this @@ -49,7 +59,8 @@ public class InstantiatedIndex private long version = System.currentTimeMillis(); private InstantiatedDocument[] documentsByNumber; - /** todo: this should be a BitSet */ + + /** todo: should this be a BitSet? */ private Set deletedDocuments; private Map> termsByFieldAndText; @@ -57,6 +68,7 @@ public class InstantiatedIndex private Map normsByFieldNameAndDocumentNumber; + private FieldSettings fieldSettings; /** * Creates an empty instantiated index for you to fill with data using an {@link org.apache.lucene.store.instantiated.InstantiatedIndexWriter}. 
@@ -68,12 +80,14 @@ public class InstantiatedIndex void initialize() { // todo: clear index without loosing memory (uncouple stuff) termsByFieldAndText = new HashMap>(); + fieldSettings = new FieldSettings(); orderedTerms = new InstantiatedTerm[0]; documentsByNumber = new InstantiatedDocument[0]; normsByFieldNameAndDocumentNumber = new HashMap(); deletedDocuments = new HashSet(); } + /** * Creates a new instantiated index that looks just like the index in a specific state as represented by a reader. * @@ -83,7 +97,9 @@ public class InstantiatedIndex public InstantiatedIndex(IndexReader sourceIndexReader) throws IOException { this(sourceIndexReader, null); } + + /** * Creates a new instantiated index that looks just like the index in a specific state as represented by a reader. * @@ -97,10 +113,63 @@ public class InstantiatedIndex throw new IOException("Source index is not optimized."); } - Collection allFieldNames = sourceIndexReader.getFieldNames(IndexReader.FieldOption.ALL); initialize(); + Collection allFieldNames = sourceIndexReader.getFieldNames(IndexReader.FieldOption.ALL); + + // load field options + + Collection indexedNames = sourceIndexReader.getFieldNames(IndexReader.FieldOption.INDEXED); + for (String name : indexedNames) { + FieldSetting setting = fieldSettings.get(name, true); + setting.indexed = true; + } + Collection indexedNoVecNames = sourceIndexReader.getFieldNames(IndexReader.FieldOption.INDEXED_NO_TERMVECTOR); + for (String name : indexedNoVecNames) { + FieldSetting setting = fieldSettings.get(name, true); + setting.storeTermVector = false; + setting.indexed = true; + } + Collection indexedVecNames = sourceIndexReader.getFieldNames(IndexReader.FieldOption.INDEXED_WITH_TERMVECTOR); + for (String name : indexedVecNames) { + FieldSetting setting = fieldSettings.get(name, true); + setting.storeTermVector = true; + setting.indexed = true; + } + Collection payloadNames = sourceIndexReader.getFieldNames(IndexReader.FieldOption.STORES_PAYLOADS); + for 
(String name : payloadNames) { + FieldSetting setting = fieldSettings.get(name, true); + setting.storePayloads = true; + } + Collection termVecNames = sourceIndexReader.getFieldNames(IndexReader.FieldOption.TERMVECTOR); + for (String name : termVecNames) { + FieldSetting setting = fieldSettings.get(name, true); + setting.storeTermVector = true; + } + Collection termVecOffsetNames = sourceIndexReader.getFieldNames(IndexReader.FieldOption.TERMVECTOR_WITH_OFFSET); + for (String name : termVecOffsetNames) { + FieldSetting setting = fieldSettings.get(name, true); + setting.storeOffsetWithTermVector = true; + } + Collection termVecPosNames = sourceIndexReader.getFieldNames(IndexReader.FieldOption.TERMVECTOR_WITH_POSITION); + for (String name : termVecPosNames) { + FieldSetting setting = fieldSettings.get(name, true); + setting.storePositionWithTermVector = true; + } + Collection termVecPosOffNames = sourceIndexReader.getFieldNames(IndexReader.FieldOption.TERMVECTOR_WITH_POSITION_OFFSET); + for (String name : termVecPosOffNames) { + FieldSetting setting = fieldSettings.get(name, true); + setting.storeOffsetWithTermVector = true; + setting.storePositionWithTermVector = true; + } + Collection unindexedNames = sourceIndexReader.getFieldNames(IndexReader.FieldOption.UNINDEXED); + for (String name : unindexedNames) { + FieldSetting setting = fieldSettings.get(name, true); + setting.indexed = false; + } + + documentsByNumber = new InstantiatedDocument[sourceIndexReader.numDocs()]; // create documents @@ -129,6 +198,8 @@ public class InstantiatedIndex } } + + // create norms for (String fieldName : allFieldNames) { if (fields == null || fields.contains(fieldName)) { @@ -271,4 +342,9 @@ public class InstantiatedIndex void setVersion(long version) { this.version = version; } + + + FieldSettings getFieldSettings() { + return fieldSettings; + } } diff --git a/contrib/instantiated/src/java/org/apache/lucene/store/instantiated/InstantiatedIndexReader.java 
b/contrib/instantiated/src/java/org/apache/lucene/store/instantiated/InstantiatedIndexReader.java index 50f7924c0fa..ddeb9f43843 100644 --- a/contrib/instantiated/src/java/org/apache/lucene/store/instantiated/InstantiatedIndexReader.java +++ b/contrib/instantiated/src/java/org/apache/lucene/store/instantiated/InstantiatedIndexReader.java @@ -16,22 +16,37 @@ package org.apache.lucene.store.instantiated; * limitations under the License. */ +import java.io.IOException; +import java.util.Arrays; +import java.util.Collection; +import java.util.HashMap; +import java.util.HashSet; +import java.util.Iterator; +import java.util.LinkedList; +import java.util.List; +import java.util.Map; +import java.util.Set; + import org.apache.lucene.document.Document; import org.apache.lucene.document.FieldSelector; -import org.apache.lucene.index.*; +import org.apache.lucene.index.CorruptIndexException; +import org.apache.lucene.index.IndexReader; +import org.apache.lucene.index.IndexWriter; +import org.apache.lucene.index.Term; +import org.apache.lucene.index.TermDocs; +import org.apache.lucene.index.TermEnum; +import org.apache.lucene.index.TermFreqVector; +import org.apache.lucene.index.TermPositions; +import org.apache.lucene.index.TermVectorMapper; import org.apache.lucene.store.Directory; -import java.io.IOException; -import java.util.*; - /** - * An InstantiatedIndexReader is not a snapshot in time, - * it is completely in sync with the latest commit to the store! - * + * An InstantiatedIndexReader is not a snapshot in time, it is completely in + * sync with the latest commit to the store! + * * Consider using InstantiatedIndex as if it was immutable. 
*/ -public class InstantiatedIndexReader - extends IndexReader { +public class InstantiatedIndexReader extends IndexReader { private final InstantiatedIndex index; @@ -47,36 +62,32 @@ public class InstantiatedIndexReader return true; } - /** - * An InstantiatedIndexReader is not a snapshot in time, - * it is completely in sync with the latest commit to the store! - * + * An InstantiatedIndexReader is not a snapshot in time, it is completely in + * sync with the latest commit to the store! + * * @return output from {@link InstantiatedIndex#getVersion()} in associated instantiated index. */ public long getVersion() { return index.getVersion(); } - public Directory directory() { throw new UnsupportedOperationException(); } - /** * An InstantiatedIndexReader is always current! - * - * Check whether this IndexReader is still using the - * current (i.e., most recently committed) version of the - * index. If a writer has committed any changes to the - * index since this reader was opened, this will return - * false, in which case you must open a new - * IndexReader in order to see the changes. See the - * description of the autoCommit - * flag which controls when the {@link IndexWriter} - * actually commits changes to the index. - * + * + * Check whether this IndexReader is still using the current (i.e., most + * recently committed) version of the index. If a writer has committed any + * changes to the index since this reader was opened, this will return + * false, in which case you must open a new IndexReader in + * order to see the changes. See the description of the autoCommit flag + * which controls when the {@link IndexWriter} actually commits changes to the + * index. 
+ * * @return always true * @throws CorruptIndexException if the index is corrupt * @throws IOException if there is a low-level IO error @@ -92,7 +103,7 @@ public class InstantiatedIndexReader private Set deletedDocuments = new HashSet(); private Set deletedDocumentNumbers = new HashSet(); - private Map> updatedNormsByFieldNameAndDocumentNumber = null; + private Map> updatedNormsByFieldNameAndDocumentNumber = null; private class NormUpdate { private int doc; @@ -140,7 +151,7 @@ public class InstantiatedIndexReader // 1. update norms if (updatedNormsByFieldNameAndDocumentNumber != null) { - for (Map.Entry> e : updatedNormsByFieldNameAndDocumentNumber.entrySet()) { + for (Map.Entry> e : updatedNormsByFieldNameAndDocumentNumber.entrySet()) { byte[] norms = getIndex().getNormsByFieldNameAndDocumentNumber().get(e.getKey()); for (NormUpdate normUpdate : e.getValue()) { norms[normUpdate.doc] = normUpdate.value; @@ -168,27 +179,67 @@ public class InstantiatedIndexReader protected void doClose() throws IOException { // ignored + // todo perhaps release all associated instances? 
} - public Collection getFieldNames(FieldOption fldOption) { - if (fldOption != FieldOption.ALL) { - throw new IllegalArgumentException("Only FieldOption.ALL implemented."); // todo + public Collection getFieldNames(FieldOption fieldOption) { + Set fieldSet = new HashSet(); + for (FieldSetting fi : index.getFieldSettings().values()) { + if (fieldOption == IndexReader.FieldOption.ALL) { + fieldSet.add(fi.fieldName); + } else if (!fi.indexed && fieldOption == IndexReader.FieldOption.UNINDEXED) { + fieldSet.add(fi.fieldName); + } else if (fi.storePayloads && fieldOption == IndexReader.FieldOption.STORES_PAYLOADS) { + fieldSet.add(fi.fieldName); + } else if (fi.indexed && fieldOption == IndexReader.FieldOption.INDEXED) { + fieldSet.add(fi.fieldName); + } else if (fi.indexed && fi.storeTermVector == false && fieldOption == IndexReader.FieldOption.INDEXED_NO_TERMVECTOR) { + fieldSet.add(fi.fieldName); + } else if (fi.storeTermVector == true && fi.storePositionWithTermVector == false && fi.storeOffsetWithTermVector == false + && fieldOption == IndexReader.FieldOption.TERMVECTOR) { + fieldSet.add(fi.fieldName); + } else if (fi.indexed && fi.storeTermVector && fieldOption == IndexReader.FieldOption.INDEXED_WITH_TERMVECTOR) { + fieldSet.add(fi.fieldName); + } else if (fi.storePositionWithTermVector && fi.storeOffsetWithTermVector == false + && fieldOption == IndexReader.FieldOption.TERMVECTOR_WITH_POSITION) { + fieldSet.add(fi.fieldName); + } else if (fi.storeOffsetWithTermVector && fi.storePositionWithTermVector == false + && fieldOption == IndexReader.FieldOption.TERMVECTOR_WITH_OFFSET) { + fieldSet.add(fi.fieldName); + } else if ((fi.storeOffsetWithTermVector && fi.storePositionWithTermVector) + && fieldOption == IndexReader.FieldOption.TERMVECTOR_WITH_POSITION_OFFSET) { + fieldSet.add(fi.fieldName); + } } - return new ArrayList(getIndex().getTermsByFieldAndText().keySet()); + return fieldSet; } - /** - * This implementation ignores the field selector! 
All fields are always returned + * Return the {@link org.apache.lucene.document.Document} at the nth + * position. +

+ * Warning! + * The resulting document is the actual stored document instance + * and not a deserialized clone as returned by an IndexReader + * over a {@link org.apache.lucene.store.Directory}. + * I.e., if you need to touch the document, clone it first! + *

+ * This can also be seen as a feature for live changes of stored values, + * but be careful! Adding a field with a name unknown to the index + * or to a field with previously no stored values will make + * {@link org.apache.lucene.store.instantiated.InstantiatedIndexReader#getFieldNames(org.apache.lucene.index.IndexReader.FieldOption)} + * out of sync, causing problems for instance when merging the + * instantiated index to another index. +

+ * This implementation ignores the field selector! All stored fields are always returned! + *

* - * Get the {@link org.apache.lucene.document.Document} at the nth position. - * - * @param n Get the document at the nth position + * @param n document number * @param fieldSelector ignored * @return The stored fields of the {@link org.apache.lucene.document.Document} at the nth position * @throws CorruptIndexException if the index is corrupt * @throws IOException if there is a low-level IO error - * + * * @see org.apache.lucene.document.Fieldable * @see org.apache.lucene.document.FieldSelector * @see org.apache.lucene.document.SetBasedFieldSelector @@ -198,19 +249,34 @@ public class InstantiatedIndexReader return document(n); } + /** + * Returns the stored fields of the nth + * Document in this index. + *

+ * Warning! + * The resulting document is the actual stored document instance + * and not a deserialized clone as returned by an IndexReader + * over a {@link org.apache.lucene.store.Directory}. + * I.e., if you need to touch the document, clone it first! + *

+ * This can also be seen as a feature for live canges of stored values, + * but be carful! Adding a field with an name unknown to the index + * or to a field with previously no stored values will make + * {@link org.apache.lucene.store.instantiated.InstantiatedIndexReader#getFieldNames(org.apache.lucene.index.IndexReader.FieldOption)} + * out of sync, causing problems for instance when merging the + * instantiated index to another index. + * + * @throws CorruptIndexException if the index is corrupt + * @throws IOException if there is a low-level IO error + */ + public Document document(int n) throws IOException { - if ((deletedDocumentNumbers != null - && deletedDocumentNumbers.contains(n)) - || - (getIndex().getDeletedDocuments() != null - && getIndex().getDeletedDocuments().contains(n))) { - return null; - } - return getIndex().getDocumentsByNumber()[n].getDocument(); + return isDeleted(n) ? null : getIndex().getDocumentsByNumber()[n].getDocument(); } /** - * never ever touch these values. it is the true values, unless norms have been touched. + * never ever touch these values. it is the true values, unless norms have + * been touched. 
*/ public byte[] norms(String field) throws IOException { byte[] norms = getIndex().getNormsByFieldNameAndDocumentNumber().get(field); @@ -233,7 +299,7 @@ public class InstantiatedIndexReader protected void doSetNorm(int doc, String field, byte value) throws IOException { if (updatedNormsByFieldNameAndDocumentNumber == null) { - updatedNormsByFieldNameAndDocumentNumber = new HashMap>(getIndex().getNormsByFieldNameAndDocumentNumber().size()); + updatedNormsByFieldNameAndDocumentNumber = new HashMap>(getIndex().getNormsByFieldNameAndDocumentNumber().size()); } List list = updatedNormsByFieldNameAndDocumentNumber.get(field); if (list == null) { @@ -252,7 +318,6 @@ public class InstantiatedIndexReader } } - public TermEnum terms() throws IOException { return new InstantiatedTermEnum(this); } @@ -260,11 +325,11 @@ public class InstantiatedIndexReader public TermEnum terms(Term t) throws IOException { InstantiatedTerm it = getIndex().findTerm(t); if (it != null) { - return new InstantiatedTermEnum(this, it.getTermIndex()); + return new InstantiatedTermEnum(this, it.getTermIndex()); } else { int startPos = Arrays.binarySearch(index.getOrderedTerms(), t, InstantiatedTerm.termComparator); if (startPos < 0) { - startPos = -1 -startPos; + startPos = -1 - startPos; } return new InstantiatedTermEnum(this, startPos); } @@ -293,19 +358,16 @@ public class InstantiatedIndexReader public TermFreqVector getTermFreqVector(int docNumber, String field) throws IOException { InstantiatedDocument doc = getIndex().getDocumentsByNumber()[docNumber]; - if (doc.getVectorSpace() == null - || doc.getVectorSpace().get(field) == null) { + if (doc.getVectorSpace() == null || doc.getVectorSpace().get(field) == null) { return null; } else { return new InstantiatedTermPositionVector(doc, field); } } - public void getTermFreqVector(int docNumber, String field, TermVectorMapper mapper) throws IOException { InstantiatedDocument doc = getIndex().getDocumentsByNumber()[docNumber]; - if 
(doc.getVectorSpace() != null - && doc.getVectorSpace().get(field) == null) { + if (doc.getVectorSpace() != null && doc.getVectorSpace().get(field) == null) { List tv = doc.getVectorSpace().get(field); mapper.setExpectations(field, tv.size(), true, true); for (InstantiatedTermDocumentInformation tdi : tv) { @@ -316,7 +378,7 @@ public class InstantiatedIndexReader public void getTermFreqVector(int docNumber, TermVectorMapper mapper) throws IOException { InstantiatedDocument doc = getIndex().getDocumentsByNumber()[docNumber]; - for (Map.Entry> e : doc.getVectorSpace().entrySet()) { + for (Map.Entry> e : doc.getVectorSpace().entrySet()) { mapper.setExpectations(e.getKey(), e.getValue().size(), true, true); for (InstantiatedTermDocumentInformation tdi : e.getValue()) { mapper.map(tdi.getTerm().text(), tdi.getTermPositions().length, tdi.getTermOffsets(), tdi.getTermPositions()); diff --git a/contrib/instantiated/src/java/org/apache/lucene/store/instantiated/InstantiatedIndexWriter.java b/contrib/instantiated/src/java/org/apache/lucene/store/instantiated/InstantiatedIndexWriter.java index 51088b4390b..d11b8a4c1c3 100644 --- a/contrib/instantiated/src/java/org/apache/lucene/store/instantiated/InstantiatedIndexWriter.java +++ b/contrib/instantiated/src/java/org/apache/lucene/store/instantiated/InstantiatedIndexWriter.java @@ -16,6 +16,22 @@ package org.apache.lucene.store.instantiated; * limitations under the License. 
*/ +import java.io.IOException; +import java.io.PrintStream; +import java.io.StringReader; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.Collections; +import java.util.Comparator; +import java.util.HashMap; +import java.util.HashSet; +import java.util.Iterator; +import java.util.LinkedHashMap; +import java.util.LinkedList; +import java.util.List; +import java.util.Map; +import java.util.Set; + import org.apache.lucene.analysis.Analyzer; import org.apache.lucene.analysis.Token; import org.apache.lucene.analysis.TokenStream; @@ -28,11 +44,6 @@ import org.apache.lucene.index.TermVectorOffsetInfo; import org.apache.lucene.search.DefaultSimilarity; import org.apache.lucene.search.Similarity; -import java.io.IOException; -import java.io.PrintStream; -import java.io.StringReader; -import java.util.*; - /** * This class, similar to {@link org.apache.lucene.index.IndexWriter}, has no locking mechanism. * @@ -161,6 +172,11 @@ public class InstantiatedIndexWriter { boolean orderedTermsDirty = false; Set dirtyTerms = new HashSet(1000); + + Map fieldSettingsByFieldName = new HashMap(); + for (String fieldName : fieldNameBuffer) { + fieldSettingsByFieldName.put(fieldName, new FieldSetting(fieldName)); + } InstantiatedDocument[] documentsByNumber = new InstantiatedDocument[index.getDocumentsByNumber().length + termDocumentInformationFactoryByDocument.size()]; System.arraycopy(index.getDocumentsByNumber(), 0, documentsByNumber, 0, index.getDocumentsByNumber().length); @@ -215,7 +231,7 @@ public class InstantiatedIndexWriter { } termsInDocument += eFieldTermDocInfoFactoriesByTermText.getValue().size(); - if (eFieldTermDocInfoFactoriesByTermText.getKey().isIndexed && !eFieldTermDocInfoFactoriesByTermText.getKey().omitNorms) { + if (eFieldTermDocInfoFactoriesByTermText.getKey().indexed && !eFieldTermDocInfoFactoriesByTermText.getKey().omitNorms) { float norm = eFieldTermDocInfoFactoriesByTermText.getKey().boost; norm *= document.getDocument().getBoost(); 
norm *= similarity.lengthNorm(eFieldTermDocInfoFactoriesByTermText.getKey().fieldName, eFieldTermDocInfoFactoriesByTermText.getKey().fieldLength); @@ -340,6 +356,7 @@ public class InstantiatedIndexWriter { } } + fieldSettingsByFieldName.putAll(documentFieldSettingsByFieldName); } // order document informations in dirty terms @@ -358,6 +375,9 @@ public class InstantiatedIndexWriter { index.setDocumentsByNumber(documentsByNumber); index.setOrderedTerms(orderedTerms.toArray(new InstantiatedTerm[orderedTerms.size()])); + for (FieldSetting fieldSetting : fieldSettingsByFieldName.values()) { + index.getFieldSettings().merge(fieldSetting); + } // set term index if (orderedTermsDirty) { // todo optimize, only update from start position @@ -434,45 +454,46 @@ public class InstantiatedIndexWriter { Map fieldSettingsByFieldName = new HashMap(); for (Field field : (List) document.getDocument().getFields()) { - FieldSetting fieldSettings = fieldSettingsByFieldName.get(field.name()); - if (fieldSettings == null) { - fieldSettings = new FieldSetting(); - fieldSettings.fieldName = field.name().intern(); - fieldSettingsByFieldName.put(fieldSettings.fieldName, fieldSettings); - fieldNameBuffer.add(fieldSettings.fieldName); + FieldSetting fieldSetting = fieldSettingsByFieldName.get(field.name()); + if (fieldSetting == null) { + fieldSetting = new FieldSetting(); + fieldSetting.fieldName = field.name().intern(); + fieldSettingsByFieldName.put(fieldSetting.fieldName, fieldSetting); + fieldNameBuffer.add(fieldSetting.fieldName); } // todo: fixme: multiple fields with the same name does not mean field boost += more boost. - fieldSettings.boost *= field.getBoost(); + fieldSetting.boost *= field.getBoost(); //fieldSettings.dimensions++; + // once fieldSettings, always fieldSettings. 
- if (field.getOmitNorms() != fieldSettings.omitNorms) { - fieldSettings.omitNorms = true; + if (field.getOmitNorms()) { + fieldSetting.omitNorms = true; } - if (field.isIndexed() != fieldSettings.isIndexed) { - fieldSettings.isIndexed = true; + if (field.isIndexed() ) { + fieldSetting.indexed = true; } - if (field.isTokenized() != fieldSettings.isTokenized) { - fieldSettings.isTokenized = true; + if (field.isTokenized()) { + fieldSetting.tokenized = true; } - if (field.isCompressed() != fieldSettings.isCompressed) { - fieldSettings.isCompressed = true; + if (field.isCompressed()) { + fieldSetting.compressed = true; } - if (field.isStored() != fieldSettings.isStored) { - fieldSettings.isStored = true; + if (field.isStored()) { + fieldSetting.stored = true; } - if (field.isBinary() != fieldSettings.isBinary) { - fieldSettings.isBinary = true; + if (field.isBinary()) { + fieldSetting.isBinary = true; } - if (field.isTermVectorStored() != fieldSettings.storeTermVector) { - fieldSettings.storeTermVector = true; + if (field.isTermVectorStored()) { + fieldSetting.storeTermVector = true; } - if (field.isStorePositionWithTermVector() != fieldSettings.storePositionWithTermVector) { - fieldSettings.storePositionWithTermVector = true; + if (field.isStorePositionWithTermVector()) { + fieldSetting.storePositionWithTermVector = true; } - if (field.isStoreOffsetWithTermVector() != fieldSettings.storeOffsetWithTermVector) { - fieldSettings.storeOffsetWithTermVector = true; + if (field.isStoreOffsetWithTermVector()) { + fieldSetting.storeOffsetWithTermVector = true; } } @@ -483,7 +504,7 @@ public class InstantiatedIndexWriter { Field field = it.next(); - FieldSetting fieldSettings = fieldSettingsByFieldName.get(field.name()); + FieldSetting fieldSetting = fieldSettingsByFieldName.get(field.name()); if (field.isIndexed()) { @@ -505,15 +526,15 @@ public class InstantiatedIndexWriter { next.setTermText(next.termText().intern()); // todo: not sure this needs to be interned? 
tokens.add(next); // the vector will be built on commit. next = tokenStream.next(); - fieldSettings.fieldLength++; - if (fieldSettings.fieldLength > maxFieldLength) { + fieldSetting.fieldLength++; + if (fieldSetting.fieldLength > maxFieldLength) { break; } } } else { // untokenized tokens.add(new Token(field.stringValue().intern(), 0, field.stringValue().length(), "untokenized")); - fieldSettings.fieldLength++; + fieldSetting.fieldLength++; } } @@ -528,7 +549,7 @@ public class InstantiatedIndexWriter { // build term vector, term positions and term offsets for (Map.Entry> eField_Tokens : tokensByField.entrySet()) { - FieldSetting fieldSettings = fieldSettingsByFieldName.get(eField_Tokens.getKey().name()); + FieldSetting fieldSetting = fieldSettingsByFieldName.get(eField_Tokens.getKey().name()); Map termDocumentInformationFactoryByTermText = termDocumentInformationFactoryByTermTextAndFieldSetting.get(fieldSettingsByFieldName.get(eField_Tokens.getKey().name())); if (termDocumentInformationFactoryByTermText == null) { @@ -539,9 +560,9 @@ public class InstantiatedIndexWriter { int lastOffset = 0; // for each new field, move positions a bunch. - if (fieldSettings.position > 0) { + if (fieldSetting.position > 0) { // todo what if no analyzer set, multiple fields with same name and index without tokenization? 
- fieldSettings.position += analyzer.getPositionIncrementGap(fieldSettings.fieldName); + fieldSetting.position += analyzer.getPositionIncrementGap(fieldSetting.fieldName); } for (Token token : eField_Tokens.getValue()) { @@ -553,26 +574,27 @@ public class InstantiatedIndexWriter { } //termDocumentInformationFactory.termFrequency++; - fieldSettings.position += (token.getPositionIncrement() - 1); - termDocumentInformationFactory.termPositions.add(fieldSettings.position++); + fieldSetting.position += (token.getPositionIncrement() - 1); + termDocumentInformationFactory.termPositions.add(fieldSetting.position++); if (token.getPayload() != null && token.getPayload().length() > 0) { termDocumentInformationFactory.payloads.add(token.getPayload().toByteArray()); + fieldSetting.storePayloads = true; } else { termDocumentInformationFactory.payloads.add(null); } if (eField_Tokens.getKey().isStoreOffsetWithTermVector()) { - termDocumentInformationFactory.termOffsets.add(new TermVectorOffsetInfo(fieldSettings.offset + token.startOffset(), fieldSettings.offset + token.endOffset())); - lastOffset = fieldSettings.offset + token.endOffset(); + termDocumentInformationFactory.termOffsets.add(new TermVectorOffsetInfo(fieldSetting.offset + token.startOffset(), fieldSetting.offset + token.endOffset())); + lastOffset = fieldSetting.offset + token.endOffset(); } } if (eField_Tokens.getKey().isStoreOffsetWithTermVector()) { - fieldSettings.offset = lastOffset + 1; + fieldSetting.offset = lastOffset + 1; } } @@ -631,45 +653,6 @@ public class InstantiatedIndexWriter { return analyzer; } - - private class FieldSetting { - private String fieldName; - - private float boost = 1; - //private int dimensions = 0; // this is futuristic - private int position = 0; - private int offset; - private int fieldLength = 0; - - private boolean storeTermVector = false; - private boolean storeOffsetWithTermVector = false; - private boolean storePositionWithTermVector = false; - private boolean omitNorms = 
false; - private boolean isTokenized = false; - - private boolean isStored = false; - private boolean isIndexed = false; - private boolean isBinary = false; - private boolean isCompressed = false; - - //private float norm; - //private byte encodedNorm; - - public boolean equals(Object o) { - if (this == o) return true; - if (o == null || getClass() != o.getClass()) return false; - - final FieldSetting that = (FieldSetting) o; - - return fieldName.equals(that.fieldName); - - } - - public int hashCode() { - return fieldName.hashCode(); - } - } - private class TermDocumentInformationFactory { private LinkedList payloads = new LinkedList(); private LinkedList termPositions = new LinkedList(); @@ -677,5 +660,23 @@ public class InstantiatedIndexWriter { } + static class FieldSetting extends org.apache.lucene.store.instantiated.FieldSetting { + + float boost = 1; + int position = 0; + int offset; + int fieldLength = 0; + + boolean omitNorms = false; + boolean isBinary = false; + + private FieldSetting() { + } + + private FieldSetting(String fieldName) { + super(fieldName); + } + } + } diff --git a/contrib/instantiated/src/java/org/apache/lucene/store/instantiated/InstantiatedTermDocs.java b/contrib/instantiated/src/java/org/apache/lucene/store/instantiated/InstantiatedTermDocs.java index d89a4fe5fa3..22212470ce1 100644 --- a/contrib/instantiated/src/java/org/apache/lucene/store/instantiated/InstantiatedTermDocs.java +++ b/contrib/instantiated/src/java/org/apache/lucene/store/instantiated/InstantiatedTermDocs.java @@ -121,16 +121,11 @@ public class InstantiatedTermDocs } else { return true; } - - } /** * Does nothing */ public void close() { - } - - } diff --git a/contrib/instantiated/src/java/org/apache/lucene/store/instantiated/InstantiatedTermEnum.java b/contrib/instantiated/src/java/org/apache/lucene/store/instantiated/InstantiatedTermEnum.java index 742bd3b9e4e..4306466a363 100644 --- 
a/contrib/instantiated/src/java/org/apache/lucene/store/instantiated/InstantiatedTermEnum.java +++ b/contrib/instantiated/src/java/org/apache/lucene/store/instantiated/InstantiatedTermEnum.java @@ -61,7 +61,7 @@ public class InstantiatedTermEnum * Returns the current Term in the enumeration. */ public Term term() { - return /*term == null ? null :*/ term.getTerm(); + return term == null ? null : term.getTerm(); } /** diff --git a/contrib/instantiated/src/java/org/apache/lucene/store/instantiated/package.html b/contrib/instantiated/src/java/org/apache/lucene/store/instantiated/package.html index 1785ee97069..4cecd146c41 100644 --- a/contrib/instantiated/src/java/org/apache/lucene/store/instantiated/package.html +++ b/contrib/instantiated/src/java/org/apache/lucene/store/instantiated/package.html @@ -70,9 +70,10 @@

Caveats

  • No locks! Consider using InstantiatedIndex as if it was immutable.
  • -
  • No documents with fields containing readers!
  • -
  • Only FieldOption.All allowed by IndexReader#getFieldNames(FieldOption).
  • +
  • No documents with fields containing readers.
  • No field selection when retrieving documents, as all stored field are available in memory.
  • +
  • Any document returned must be cloned if it is to be touched.
  • +
  • Norms array returned must not be touched.

Use cases

diff --git a/contrib/instantiated/src/test/org/apache/lucene/store/instantiated/TestIndicesEquals.java b/contrib/instantiated/src/test/org/apache/lucene/store/instantiated/TestIndicesEquals.java index f331abb9ad6..a0ec2485424 100644 --- a/contrib/instantiated/src/test/org/apache/lucene/store/instantiated/TestIndicesEquals.java +++ b/contrib/instantiated/src/test/org/apache/lucene/store/instantiated/TestIndicesEquals.java @@ -47,7 +47,7 @@ public class TestIndicesEquals extends TestCase { // create dir data IndexWriter indexWriter = new IndexWriter(dir, new StandardAnalyzer(), true); - for (int i = 0; i < 5; i++) { + for (int i = 0; i < 20; i++) { Document document = new Document(); assembleDocument(document, i); indexWriter.addDocument(document); @@ -59,9 +59,10 @@ public class TestIndicesEquals extends TestCase { InstantiatedIndex ii = new InstantiatedIndex(ir); ir.close(); - testEquals(dir, ii); + testEqualBehaviour(dir, ii); } + public void testInstantiatedIndexWriter() throws Exception { @@ -86,7 +87,7 @@ public class TestIndicesEquals extends TestCase { } instantiatedIndexWriter.close(); - testEquals(dir, ii); + testEqualBehaviour(dir, ii); testTermDocs(dir, ii); @@ -186,6 +187,25 @@ public class TestIndicesEquals extends TestCase { * @param testIndex the index that is supposed to equals the apriori index. 
* @throws Exception */ + protected void testEqualBehaviour(Directory aprioriIndex, InstantiatedIndex testIndex) throws Exception { + + testEquals(aprioriIndex, testIndex); + + // delete a few documents + IndexReader ir = IndexReader.open(aprioriIndex); + ir.deleteDocument(3); + ir.deleteDocument(8); + ir.close(); + + ir = testIndex.indexReaderFactory(); + ir.deleteDocument(3); + ir.deleteDocument(8); + ir.close(); + + // make sure they still equal + testEquals(aprioriIndex, testIndex); + } + protected void testEquals(Directory aprioriIndex, InstantiatedIndex testIndex) throws Exception { IndexReader aprioriReader = IndexReader.open(aprioriIndex); @@ -193,6 +213,17 @@ public class TestIndicesEquals extends TestCase { assertEquals(aprioriReader.numDocs(), testReader.numDocs()); + // assert field options + assertEquals(aprioriReader.getFieldNames(IndexReader.FieldOption.INDEXED), testReader.getFieldNames(IndexReader.FieldOption.INDEXED)); + assertEquals(aprioriReader.getFieldNames(IndexReader.FieldOption.INDEXED_NO_TERMVECTOR), testReader.getFieldNames(IndexReader.FieldOption.INDEXED_NO_TERMVECTOR)); + assertEquals(aprioriReader.getFieldNames(IndexReader.FieldOption.INDEXED_WITH_TERMVECTOR), testReader.getFieldNames(IndexReader.FieldOption.INDEXED_WITH_TERMVECTOR)); + assertEquals(aprioriReader.getFieldNames(IndexReader.FieldOption.STORES_PAYLOADS), testReader.getFieldNames(IndexReader.FieldOption.STORES_PAYLOADS)); + assertEquals(aprioriReader.getFieldNames(IndexReader.FieldOption.TERMVECTOR), testReader.getFieldNames(IndexReader.FieldOption.TERMVECTOR)); + assertEquals(aprioriReader.getFieldNames(IndexReader.FieldOption.TERMVECTOR_WITH_OFFSET), testReader.getFieldNames(IndexReader.FieldOption.TERMVECTOR_WITH_OFFSET)); + assertEquals(aprioriReader.getFieldNames(IndexReader.FieldOption.TERMVECTOR_WITH_POSITION), testReader.getFieldNames(IndexReader.FieldOption.TERMVECTOR_WITH_POSITION)); + 
assertEquals(aprioriReader.getFieldNames(IndexReader.FieldOption.TERMVECTOR_WITH_POSITION_OFFSET), testReader.getFieldNames(IndexReader.FieldOption.TERMVECTOR_WITH_POSITION_OFFSET)); + assertEquals(aprioriReader.getFieldNames(IndexReader.FieldOption.UNINDEXED), testReader.getFieldNames(IndexReader.FieldOption.UNINDEXED)); + for (Object field : aprioriReader.getFieldNames(IndexReader.FieldOption.ALL)) { // test norms as used by normal use