diff --git a/CHANGES.txt b/CHANGES.txt index 5344e62c6d8..e0e5129c788 100644 --- a/CHANGES.txt +++ b/CHANGES.txt @@ -458,7 +458,10 @@ New features to consider token positions when creating PhraseQuery and MultiPhraseQuery. Disabled by default (so by default the query parser ignores position increments). - (Doron Cohen) + (Doron Cohen) + +13. LUCENE-550: Added InstantiatedIndex implementation. Experimental Index store similar to MemoryIndex but allows +for multiple documents in memory. (Karl Wettin via Grant Ingersoll) Optimizations diff --git a/contrib/instantiated/build.xml b/contrib/instantiated/build.xml new file mode 100644 index 00000000000..0886a401736 --- /dev/null +++ b/contrib/instantiated/build.xml @@ -0,0 +1,32 @@ + + + + + + + + InstantiatedIndex, an alternative RAM store. + + + + + + + + + diff --git a/contrib/instantiated/docs/classdiagram.png b/contrib/instantiated/docs/classdiagram.png new file mode 100644 index 00000000000..00156eab791 Binary files /dev/null and b/contrib/instantiated/docs/classdiagram.png differ diff --git a/contrib/instantiated/docs/classdiagram.uxf b/contrib/instantiated/docs/classdiagram.uxf new file mode 100644 index 00000000000..3464fbd9fd6 --- /dev/null +++ b/contrib/instantiated/docs/classdiagram.uxf @@ -0,0 +1,50 @@ +//Uncomment the following line to change the fontsize: +//fontsize=14 + +//Welcome to UMLet! + +// *Double-click on UML elements to add them to the diagram. +// *Edit element properties by modifying the text in this panel. +// *Edit the files in the 'palettes' directory to store your own element palettes. +// *Press Del or Backspace to remove elements from the diagram. +// *Hold down Ctrl key to select multiple elements. +// *Press c to copy the UML diagram to the system clipboard. +// * This text will be stored with each diagram. Feel free to use the area for notes. 
+com.umlet.element.base.Class310540310110bg=#eeeeee +fg=#000000 +InstantiatedTermDocumentInformation +-- ++payloads:byte[][] ++termPositions:int[] ++termOffsets:TermVectorOffsetInfo[] ++indexFromTerm:int +--com.umlet.element.base.Relation46038040180lt=.20;20;20;160com.umlet.element.base.Relation4604061160lt=<- +q2=field +m1=0..130;20;30;140com.umlet.element.base.Class4303012030bg=#eeeeee +fg=#000099 +_norm: byte[][]_com.umlet.element.base.Class8039010030bg=#eeeeee +fg=#000099 +Termcom.umlet.element.base.Relation77038012040lt=-> +m2=120;20;100;20com.umlet.element.base.Class87039010030bg=#eeeeee +fg=#000099 +Documentcom.umlet.element.base.Class59037020060bg=#eeeeee +fg=#000000 +InstantiatedDocument +-- ++documentNumber:int +--com.umlet.element.base.Relation520190170200lt=<- +m1=0..* +<<ordered>>150;180;20;20com.umlet.element.base.Relation290190140220lt=<- +m1=0..* +<<ordered>>20;200;120;20com.umlet.element.base.Class38018020030bg=#eeeeee +fg=#000000 +InstantiatedIndexcom.umlet.element.base.Relation16038011040lt=-> +m2=190;20;20;20com.umlet.element.base.Class25039016030bg=#eeeeee +fg=#000000 +InstantiatedTerm +com.umlet.element.base.Relation380190146220lt=<- +q2=field, term +m1=0..120;200;100;20com.umlet.element.base.Relation39038022040lt=- +q2=field +m2=0..* +m1=0..*20;20;200;20 \ No newline at end of file diff --git a/contrib/instantiated/pom.xml.template b/contrib/instantiated/pom.xml.template new file mode 100644 index 00000000000..40bbd5965d4 --- /dev/null +++ b/contrib/instantiated/pom.xml.template @@ -0,0 +1,50 @@ + + + + + 4.0.0 + + org.apache.lucene + lucene-contrib + @version@ + + org.apache.lucene + lucene-instantiated + Lucene InstantiatedIndex + @version@ + InstantiatedIndex, alternative RAM store for small corpora. 
+ jar + + + + + org.apache.maven.plugins + maven-compiler-plugin + + 1.5 + 1.5 + + + + + +
diff --git a/contrib/instantiated/src/java/org/apache/lucene/store/instantiated/InstantiatedDocument.java b/contrib/instantiated/src/java/org/apache/lucene/store/instantiated/InstantiatedDocument.java
new file mode 100644
index 00000000000..3f2999ef21b
--- /dev/null
+++ b/contrib/instantiated/src/java/org/apache/lucene/store/instantiated/InstantiatedDocument.java
@@ -0,0 +1,79 @@
+package org.apache.lucene.store.instantiated;
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import org.apache.lucene.document.Document;
+
+import java.io.Serializable;
+import java.util.List;
+import java.util.Map;
+
+/**
+ * A document in the instantiated index object graph, optionally coupled to the vector space view.
+ *
+ * @see org.apache.lucene.document.Document
+ */
+public class InstantiatedDocument
+    implements Serializable {
+
+  /** Must be <code>static final</code>, otherwise Java serialization ignores it. */
+  private static final long serialVersionUID = 1L;
+
+  /** The wrapped Lucene document; held by reference, never copied. */
+  private Document document;
+
+  /** Creates an instantiated document wrapping a new, empty {@link Document}. */
+  public InstantiatedDocument() {
+    this.document = new Document();
+  }
+
+
+  /**
+   * Creates an instantiated document wrapping the given document.
+   *
+   * @param document the document to wrap; stored as is, not copied
+   */
+  public InstantiatedDocument(Document document) {
+    this.document = document;
+  }
+
+  /** this is the unsafe index order document number. */
+  private Integer documentNumber;
+
+  /** this is the term vector space view: term-document informations keyed by field name */
+  private Map<String, List<InstantiatedTermDocumentInformation>> vectorSpace;
+
+  /**
+   * @return position of document in the index, or null if no number has been assigned yet.
+   */
+  public Integer getDocumentNumber() {
+    return documentNumber;
+  }
+
+  /**
+   * @param documentNumber position of this document in the index
+   */
+  void setDocumentNumber(Integer documentNumber) {
+    this.documentNumber = documentNumber;
+  }
+
+  /**
+   * @return the term vector space view keyed by field name, or null if none has been set.
+   */
+  public Map<String, List<InstantiatedTermDocumentInformation>> getVectorSpace() {
+    return vectorSpace;
+  }
+
+  /**
+   * @param vectorSpace the term vector space view keyed by field name; stored by reference
+   */
+  public void setVectorSpace(Map<String, List<InstantiatedTermDocumentInformation>> vectorSpace) {
+    this.vectorSpace = vectorSpace;
+  }
+
+  /**
+   * @return the wrapped Lucene document.
+   */
+  public Document getDocument() {
+    return document;
+  }
+
+
+  /**
+   * @return the string representation of the wrapped document.
+   */
+  public String toString() {
+    return document.toString();
+  }
+}
diff --git a/contrib/instantiated/src/java/org/apache/lucene/store/instantiated/InstantiatedIndex.java b/contrib/instantiated/src/java/org/apache/lucene/store/instantiated/InstantiatedIndex.java
new file mode 100644
index 00000000000..4a89dc0adca
--- /dev/null
+++ b/contrib/instantiated/src/java/org/apache/lucene/store/instantiated/InstantiatedIndex.java
@@ -0,0 +1,274 @@
+package org.apache.lucene.store.instantiated;
+
+/**
+ * Copyright 2006 The Apache Software Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.document.Document;
+import org.apache.lucene.document.Field;
+import org.apache.lucene.index.*;
+
+import java.io.IOException;
+import java.io.Serializable;
+import java.util.*;
+
+/**
+ * Represented as a coupled graph of class instances, this
+ * all-in-memory index store implementation delivers search
+ * results up to a 100 times faster than the file-centric RAMDirectory
+ * at the cost of greater RAM consumption.
+ *
+ * WARNING: This contrib is experimental and the APIs may change without warning.
+ *
+ * There are no read and write locks in this store.
+ * {@link InstantiatedIndexReader} {@link InstantiatedIndexReader#isCurrent()} all the time
+ * and {@link org.apache.lucene.store.instantiated.InstantiatedIndexWriter}
+ * will attempt to update instances of the object graph in memory
+ * at the same time as a searcher is reading from it.
+ *
+ * Consider using InstantiatedIndex as if it was immutable.
+ */
+public class InstantiatedIndex
+    implements Serializable {
+
+  private static final long serialVersionUID = 1l;
+
+  private long version = System.currentTimeMillis();
+
+  /** Documents by document number; a slot is null where the source document was deleted. */
+  private InstantiatedDocument[] documentsByNumber;
+  /** todo: this should be a BitSet */
+  private Set<Integer> deletedDocuments;
+
+  /** Terms keyed by field name, then by term text. */
+  private Map<String, Map<String, InstantiatedTerm>> termsByFieldAndText;
+  private InstantiatedTerm[] orderedTerms;
+
+  /** Norms keyed by field name; each array is indexed by document number. */
+  private Map<String, byte[]> normsByFieldNameAndDocumentNumber;
+
+
+  /**
+   * Creates an empty instantiated index for you to fill with data using an {@link org.apache.lucene.store.instantiated.InstantiatedIndexWriter}.
+   */
+  public InstantiatedIndex() {
+    initialize();
+  }
+
+  /** Resets this index to the empty state by replacing every internal collection. */
+  void initialize() {
+    // todo: clear index without losing memory (uncouple stuff)
+    termsByFieldAndText = new HashMap<String, Map<String, InstantiatedTerm>>();
+    orderedTerms = new InstantiatedTerm[0];
+    documentsByNumber = new InstantiatedDocument[0];
+    normsByFieldNameAndDocumentNumber = new HashMap<String, byte[]>();
+    deletedDocuments = new HashSet<Integer>();
+  }
+
+  /**
+   * Creates a new instantiated index that looks just like the index in a specific state as represented by a reader.
+   *
+   * @param sourceIndexReader the source index this new instantiated index will be copied from.
+   * @throws IOException if the source index is not optimized, or when accessing the source.
+   */
+  public InstantiatedIndex(IndexReader sourceIndexReader) throws IOException {
+    this(sourceIndexReader, null);
+  }
+
+  /**
+   * Creates a new instantiated index that looks just like the index in a specific state as represented by a reader.
+   *
+   * @param sourceIndexReader the source index this new instantiated index will be copied from.
+   * @param fields fields to be added, or null for all
+   * @throws IOException if the source index is not optimized, or when accessing the source.
+   */
+  public InstantiatedIndex(IndexReader sourceIndexReader, Set<String> fields) throws IOException {
+
+    if (!sourceIndexReader.isOptimized()) {
+      throw new IOException("Source index is not optimized.");
+    }
+
+    Collection<String> allFieldNames = sourceIndexReader.getFieldNames(IndexReader.FieldOption.ALL);
+
+    initialize();
+
+    // Document numbers range over [0, maxDoc), not [0, numDocs): a reader that reports
+    // itself optimized may still carry deletions, so size and iterate by maxDoc().
+    final int maxDoc = sourceIndexReader.maxDoc();
+    documentsByNumber = new InstantiatedDocument[maxDoc];
+
+    // create documents
+    for (int i = 0; i < maxDoc; i++) {
+      if (sourceIndexReader.isDeleted(i)) {
+        // leave the slot empty and remember the deletion so that readers skip it
+        deletedDocuments.add(i);
+        continue;
+      }
+      InstantiatedDocument document = new InstantiatedDocument();
+      // copy stored fields from source reader
+      Document sourceDocument = sourceIndexReader.document(i);
+      for (Field field : (List<Field>) sourceDocument.getFields()) {
+        if (fields == null || fields.contains(field.name())) {
+          document.getDocument().add(field);
+        }
+      }
+      document.setDocumentNumber(i);
+      documentsByNumber[i] = document;
+      // prepare an empty term vector list for each copied field that stores term vectors
+      for (Field field : (List<Field>) document.getDocument().getFields()) {
+        if (fields == null || fields.contains(field.name())) {
+          if (field.isTermVectorStored()) {
+            if (document.getVectorSpace() == null) {
+              document.setVectorSpace(new HashMap<String, List<InstantiatedTermDocumentInformation>>());
+            }
+            document.getVectorSpace().put(field.name(), new ArrayList<InstantiatedTermDocumentInformation>());
+          }
+        }
+      }
+    }
+
+    // create norms
+    for (String fieldName : allFieldNames) {
+      if (fields == null || fields.contains(fieldName)) {
+        getNormsByFieldNameAndDocumentNumber().put(fieldName, sourceIndexReader.norms(fieldName));
+      }
+    }
+
+    // create terms
+    for (String fieldName : allFieldNames) {
+      if (fields == null || fields.contains(fieldName)) {
+        getTermsByFieldAndText().put(fieldName, new HashMap<String, InstantiatedTerm>(5000));
+      }
+    }
+    List<InstantiatedTerm> terms = new ArrayList<InstantiatedTerm>(5000 * getTermsByFieldAndText().size());
+    TermEnum termEnum = sourceIndexReader.terms();
+    try {
+      while (termEnum.next()) {
+        if (fields == null || fields.contains(termEnum.term().field())) { // todo skipto if not using field
+          InstantiatedTerm instantiatedTerm = new InstantiatedTerm(termEnum.term().field(), termEnum.term().text());
+          getTermsByFieldAndText().get(termEnum.term().field()).put(termEnum.term().text(), instantiatedTerm);
+          instantiatedTerm.setTermIndex(terms.size());
+          terms.add(instantiatedTerm);
+          instantiatedTerm.setAssociatedDocuments(new InstantiatedTermDocumentInformation[termEnum.docFreq()]);
+        }
+      }
+    } finally {
+      // release the enumeration even if the source reader fails half way
+      termEnum.close();
+    }
+    orderedTerms = terms.toArray(new InstantiatedTerm[terms.size()]);
+
+    // create term-document informations
+    for (InstantiatedTerm term : orderedTerms) {
+      TermPositions termPositions = sourceIndexReader.termPositions(term.getTerm());
+      try {
+        int position = 0;
+        while (termPositions.next()) {
+          InstantiatedDocument document = documentsByNumber[termPositions.doc()];
+
+          byte[][] payloads = new byte[termPositions.freq()][];
+          int[] positions = new int[termPositions.freq()];
+          for (int i = 0; i < termPositions.freq(); i++) {
+            positions[i] = termPositions.nextPosition();
+
+            if (termPositions.isPayloadAvailable()) {
+              payloads[i] = new byte[termPositions.getPayloadLength()];
+              termPositions.getPayload(payloads[i], 0);
+            }
+          }
+
+          InstantiatedTermDocumentInformation termDocumentInformation = new InstantiatedTermDocumentInformation(term, document, positions, payloads);
+          term.getAssociatedDocuments()[position++] = termDocumentInformation;
+
+          if (document.getVectorSpace() != null
+              && document.getVectorSpace().containsKey(term.field())) {
+            document.getVectorSpace().get(term.field()).add(termDocumentInformation);
+          }
+        }
+        // The array was sized from docFreq(); if fewer postings were enumerated
+        // (e.g. deleted documents were skipped) drop the unused null tail.
+        if (position < term.getAssociatedDocuments().length) {
+          InstantiatedTermDocumentInformation[] trimmed = new InstantiatedTermDocumentInformation[position];
+          System.arraycopy(term.getAssociatedDocuments(), 0, trimmed, 0, position);
+          term.setAssociatedDocuments(trimmed);
+        }
+      } finally {
+        termPositions.close();
+      }
+    }
+
+    // load offsets to term-document informations
+    for (InstantiatedDocument document : getDocumentsByNumber()) {
+      if (document == null) {
+        continue; // deleted in the source index
+      }
+      for (Field field : (List<Field>) document.getDocument().getFields()) {
+        if (field.isTermVectorStored() && field.isStoreOffsetWithTermVector()) {
+          TermPositionVector termPositionVector = (TermPositionVector) sourceIndexReader.getTermFreqVector(document.getDocumentNumber(), field.name());
+          if (termPositionVector != null) {
+            for (int i = 0; i < termPositionVector.getTerms().length; i++) {
+              String token = termPositionVector.getTerms()[i];
+              InstantiatedTerm term = findTerm(field.name(), token);
+              InstantiatedTermDocumentInformation termDocumentInformation = term.getAssociatedDocument(document.getDocumentNumber());
+              termDocumentInformation.setTermOffsets(termPositionVector.getOffsets(i));
+            }
+          }
+        }
+      }
+    }
+  }
+
+  /**
+   * @param analyzer passed on to the new writer
+   * @param create passed on to the new writer
+   * @return a new writer on this index.
+   */
+  public InstantiatedIndexWriter indexWriterFactory(Analyzer analyzer, boolean create) throws IOException {
+    return new InstantiatedIndexWriter(this, analyzer, create);
+  }
+
+  /**
+   * @return a new reader on this index.
+   */
+  public InstantiatedIndexReader indexReaderFactory() throws IOException {
+    return new InstantiatedIndexReader(this);
+  }
+
+  /** Currently does nothing. */
+  public void close() throws IOException {
+    // todo: decouple everything
+  }
+
+  /**
+   * @return the instantiated term matching field and text of the given term, or null if there is none.
+   */
+  InstantiatedTerm findTerm(Term term) {
+    return findTerm(term.field(), term.text());
+  }
+
+  /**
+   * @return the instantiated term for the given field and text, or null if the field or the text is unknown.
+   */
+  InstantiatedTerm findTerm(String field, String text) {
+    Map<String, InstantiatedTerm> termsByField = termsByFieldAndText.get(field);
+    if (termsByField == null) {
+      return null;
+    } else {
+      return termsByField.get(text);
+    }
+  }
+
+  /**
+   * @return the internal, mutable term map keyed by field name and then term text.
+   */
+  public Map<String, Map<String, InstantiatedTerm>> getTermsByFieldAndText() {
+    return termsByFieldAndText;
+  }
+
+
+  /**
+   * @return the internal term array.
+   */
+  public InstantiatedTerm[] getOrderedTerms() {
+    return orderedTerms;
+  }
+
+  /**
+   * @return the internal document array, indexed by document number.
+   */
+  public InstantiatedDocument[] getDocumentsByNumber() {
+    return documentsByNumber;
+  }
+
+  /**
+   * @return the internal, mutable norms map keyed by field name.
+   */
+  public Map<String, byte[]> getNormsByFieldNameAndDocumentNumber() {
+    return normsByFieldNameAndDocumentNumber;
+  }
+
+  void setNormsByFieldNameAndDocumentNumber(Map<String, byte[]> normsByFieldNameAndDocumentNumber) {
+    this.normsByFieldNameAndDocumentNumber = normsByFieldNameAndDocumentNumber;
+  }
+
+  /**
+   * @return the internal, mutable set of deleted document numbers.
+   */
+  public Set<Integer> getDeletedDocuments() {
+    return deletedDocuments;
+  }
+
+
+  void setOrderedTerms(InstantiatedTerm[] orderedTerms) {
+    this.orderedTerms = orderedTerms;
+  }
+
+  void setDocumentsByNumber(InstantiatedDocument[] documentsByNumber) {
+    this.documentsByNumber = documentsByNumber;
+  }
+
+
+  /**
+   * @return the version of this index, initially the creation time in milliseconds.
+   */
+  public long getVersion() {
+    return version;
+  }
+
+  void setVersion(long version) {
+    this.version = version;
+  }
+}
diff --git a/contrib/instantiated/src/java/org/apache/lucene/store/instantiated/InstantiatedIndexReader.java b/contrib/instantiated/src/java/org/apache/lucene/store/instantiated/InstantiatedIndexReader.java
new file mode 100644
index 00000000000..50f7924c0fa
--- /dev/null
+++ b/contrib/instantiated/src/java/org/apache/lucene/store/instantiated/InstantiatedIndexReader.java
@@ -0,0 +1,326 @@
+package org.apache.lucene.store.instantiated;
+
+/**
+ * Copyright 2006 The Apache Software Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import org.apache.lucene.document.Document;
+import org.apache.lucene.document.FieldSelector;
+import org.apache.lucene.index.*;
+import org.apache.lucene.store.Directory;
+
+import java.io.IOException;
+import java.util.*;
+
+/**
+ * An InstantiatedIndexReader is not a snapshot in time,
+ * it is completely in sync with the latest commit to the store!
+ *
+ * Consider using InstantiatedIndex as if it was immutable.
+ */
+public class InstantiatedIndexReader
+    extends IndexReader {
+
+  private final InstantiatedIndex index;
+
+  /**
+   * @param index the instantiated index this reader operates on; shared, not copied
+   */
+  public InstantiatedIndexReader(InstantiatedIndex index) {
+    super();
+    this.index = index;
+  }
+
+  /**
+   * @return always true.
+   */
+  public boolean isOptimized() {
+    return true;
+  }
+
+
+  /**
+   * An InstantiatedIndexReader is not a snapshot in time,
+   * it is completely in sync with the latest commit to the store!
+   *
+   * @return output from {@link InstantiatedIndex#getVersion()} in associated instantiated index.
+   */
+  public long getVersion() {
+    return index.getVersion();
+  }
+
+
+  /**
+   * Not supported, always throws.
+   *
+   * @throws UnsupportedOperationException always
+   */
+  public Directory directory() {
+    throw new UnsupportedOperationException();
+  }
+
+
+  /**
+   * An InstantiatedIndexReader is always current!
+   *
+   * Check whether this IndexReader is still using the
+   * current (i.e., most recently committed) version of the
+   * index. If a writer has committed any changes to the
+   * index since this reader was opened, this will return
+   * false, in which case you must open a new
+   * IndexReader in order to see the changes. See the
+   * description of the autoCommit
+   * flag which controls when the {@link IndexWriter}
+   * actually commits changes to the index.
+   *
+   * @return always true
+   * @throws CorruptIndexException if the index is corrupt
+   * @throws IOException if there is a low-level IO error
+   * @throws UnsupportedOperationException unless overridden in subclass
+   */
+  public boolean isCurrent() throws IOException {
+    return true;
+  }
+
+  /**
+   * @return the instantiated index this reader operates on.
+   */
+  public InstantiatedIndex getIndex() {
+    return index;
+  }
+
+  /** Documents deleted through this reader but not yet committed to the index. */
+  private Set<InstantiatedDocument> deletedDocuments = new HashSet<InstantiatedDocument>();
+  /** Document numbers deleted through this reader but not yet committed to the index. */
+  private Set<Integer> deletedDocumentNumbers = new HashSet<Integer>();
+  /** Norm updates by field name, buffered until commit; null while there are none. */
+  private Map<String, List<NormUpdate>> updatedNormsByFieldNameAndDocumentNumber = null;
+
+  /** A buffered norm change for a single document. */
+  private static class NormUpdate {
+    private int doc;
+    private byte value;
+
+    public NormUpdate(int doc, byte value) {
+      this.doc = doc;
+      this.value = value;
+    }
+  }
+
+  /**
+   * @return number of document slots minus committed and uncommitted deletions.
+   */
+  public int numDocs() {
+    return getIndex().getDocumentsByNumber().length - index.getDeletedDocuments().size() - deletedDocumentNumbers.size();
+  }
+
+  /**
+   * @return number of document slots in the index, deleted ones included.
+   */
+  public int maxDoc() {
+    return getIndex().getDocumentsByNumber().length;
+  }
+
+  /**
+   * @return true if the document is deleted in the index or by an uncommitted delete of this reader.
+   */
+  public boolean isDeleted(int n) {
+    return getIndex().getDeletedDocuments().contains(n) || deletedDocumentNumbers.contains(n);
+  }
+
+  /**
+   * @return true if the index or this reader holds at least one deletion.
+   */
+  public boolean hasDeletions() {
+    return getIndex().getDeletedDocuments().size() > 0 || deletedDocumentNumbers.size() > 0;
+  }
+
+  /** Buffers the deletion in this reader; the index itself is untouched until commit. */
+  protected void doDelete(int docNum) throws IOException {
+    if (!getIndex().getDeletedDocuments().contains(docNum)) {
+      if (deletedDocumentNumbers.add(docNum)) {
+        deletedDocuments.add(getIndex().getDocumentsByNumber()[docNum]);
+      }
+    }
+  }
+
+  /** Discards the uncommitted deletions of this reader; committed deletions remain. */
+  protected void doUndeleteAll() throws IOException {
+    deletedDocumentNumbers.clear();
+    deletedDocuments.clear();
+  }
+
+  /** Writes buffered norm updates and deletions of this reader to the shared index. */
+  protected void doCommit() throws IOException {
+    // todo: read/write lock
+
+    // 1. update norms
+    if (updatedNormsByFieldNameAndDocumentNumber != null) {
+      for (Map.Entry<String, List<NormUpdate>> e : updatedNormsByFieldNameAndDocumentNumber.entrySet()) {
+        byte[] norms = getIndex().getNormsByFieldNameAndDocumentNumber().get(e.getKey());
+        for (NormUpdate normUpdate : e.getValue()) {
+          norms[normUpdate.doc] = normUpdate.value;
+        }
+      }
+      updatedNormsByFieldNameAndDocumentNumber = null;
+    }
+
+    // 2. remove deleted documents
+    if (deletedDocumentNumbers.size() > 0) {
+      for (Integer doc : deletedDocumentNumbers) {
+        getIndex().getDeletedDocuments().add(doc);
+      }
+      deletedDocumentNumbers.clear();
+      deletedDocuments.clear();
+    }
+
+    // todo unlock read/writelock
+  }
+
+  protected void doClose() throws IOException {
+    // ignored
+  }
+
+  /**
+   * @param fldOption only {@link IndexReader.FieldOption#ALL} is implemented
+   * @return a new list holding the names of all fields that have a term map in the index
+   * @throws IllegalArgumentException for any other field option
+   */
+  public Collection getFieldNames(FieldOption fldOption) {
+    if (fldOption != FieldOption.ALL) {
+      throw new IllegalArgumentException("Only FieldOption.ALL implemented."); // todo
+    }
+    return new ArrayList<String>(getIndex().getTermsByFieldAndText().keySet());
+  }
+
+
+  /**
+   * This implementation ignores the field selector! All fields are always returned
+   *
+   * Get the {@link org.apache.lucene.document.Document} at the nth position.
+   *
+   * @param n Get the document at the nth position
+   * @param fieldSelector ignored
+   * @return The stored fields of the {@link org.apache.lucene.document.Document} at the nth position
+   * @throws CorruptIndexException if the index is corrupt
+   * @throws IOException if there is a low-level IO error
+   *
+   * @see org.apache.lucene.document.Fieldable
+   * @see org.apache.lucene.document.FieldSelector
+   * @see org.apache.lucene.document.SetBasedFieldSelector
+   * @see org.apache.lucene.document.LoadFirstFieldSelector
+   */
+  public Document document(int n, FieldSelector fieldSelector) throws CorruptIndexException, IOException {
+    return document(n);
+  }
+
+  /**
+   * @param n document number
+   * @return the document held by the index at that number, or null if it is deleted
+   */
+  public Document document(int n) throws IOException {
+    if ((deletedDocumentNumbers != null
+        && deletedDocumentNumbers.contains(n))
+        ||
+        (getIndex().getDeletedDocuments() != null
+            && getIndex().getDeletedDocuments().contains(n))) {
+      return null;
+    }
+    return getIndex().getDocumentsByNumber()[n].getDocument();
+  }
+
+  /**
+   * never ever touch these values. it is the true values, unless norms have been touched.
+   *
+   * While this reader holds uncommitted norm updates a patched copy is returned instead
+   * of the array stored in the index.
+   *
+   * @return the norms of the field, or null if the index holds no norms for it
+   */
+  public byte[] norms(String field) throws IOException {
+    byte[] norms = getIndex().getNormsByFieldNameAndDocumentNumber().get(field);
+    // a field without norms has nothing to clone or patch
+    if (norms != null && updatedNormsByFieldNameAndDocumentNumber != null) {
+      norms = norms.clone();
+      List<NormUpdate> updated = updatedNormsByFieldNameAndDocumentNumber.get(field);
+      if (updated != null) {
+        for (NormUpdate normUpdate : updated) {
+          norms[normUpdate.doc] = normUpdate.value;
+        }
+      }
+    }
+    return norms;
+  }
+
+  /**
+   * Copies the committed norms of the field into the given array.
+   *
+   * @param field name of the field
+   * @param bytes destination array
+   * @param offset position in <code>bytes</code> where the first norm is written
+   */
+  public void norms(String field, byte[] bytes, int offset) throws IOException {
+    byte[] norms = getIndex().getNormsByFieldNameAndDocumentNumber().get(field);
+    // source is read from its start; the offset applies to the destination
+    System.arraycopy(norms, 0, bytes, offset, norms.length);
+  }
+
+  /** Buffers the norm change in this reader; the index itself is untouched until commit. */
+  protected void doSetNorm(int doc, String field, byte value) throws IOException {
+    if (updatedNormsByFieldNameAndDocumentNumber == null) {
+      updatedNormsByFieldNameAndDocumentNumber = new HashMap<String, List<NormUpdate>>(getIndex().getNormsByFieldNameAndDocumentNumber().size());
+    }
+    List<NormUpdate> list = updatedNormsByFieldNameAndDocumentNumber.get(field);
+    if (list == null) {
+      list = new LinkedList<NormUpdate>();
+      updatedNormsByFieldNameAndDocumentNumber.put(field, list);
+    }
+    list.add(new NormUpdate(doc, value));
+  }
+
+  /**
+   * @return number of documents associated with the term, 0 if the term is not in the index.
+   */
+  public int docFreq(Term t) throws IOException {
+    InstantiatedTerm term = getIndex().findTerm(t);
+    if (term == null) {
+      return 0;
+    } else {
+      return term.getAssociatedDocuments().length;
+    }
+  }
+
+
+  public TermEnum terms() throws IOException {
+    return new InstantiatedTermEnum(this);
+  }
+
+  /**
+   * @return an enumeration starting at the given term, or at its insertion point
+   *         in the ordered terms if the term is not in the index.
+   */
+  public TermEnum terms(Term t) throws IOException {
+    InstantiatedTerm it = getIndex().findTerm(t);
+    if (it != null) {
+      return new InstantiatedTermEnum(this, it.getTermIndex());
+    } else {
+      int startPos = Arrays.binarySearch(index.getOrderedTerms(), t, InstantiatedTerm.termComparator);
+      if (startPos < 0) {
+        // binarySearch returns (-(insertion point) - 1) when the key is absent
+        startPos = -1 - startPos;
+      }
+      return new InstantiatedTermEnum(this, startPos);
+    }
+  }
+
+  public TermDocs termDocs() throws IOException {
+    return new InstantiatedTermDocs(this);
+  }
+
+  public TermPositions termPositions() throws IOException {
+    return new InstantiatedTermPositions(this);
+  }
+
+  /**
+   * @return one term vector per field in the vector space of the document,
+   *         or null if the document has no vector space.
+   */
+  public TermFreqVector[] getTermFreqVectors(int docNumber) throws IOException {
+    InstantiatedDocument doc = getIndex().getDocumentsByNumber()[docNumber];
+    if (doc.getVectorSpace() == null) {
+      return null;
+    }
+    TermFreqVector[] ret = new TermFreqVector[doc.getVectorSpace().size()];
+    Iterator<String> it = doc.getVectorSpace().keySet().iterator();
+    for (int i = 0; i < ret.length; i++) {
+      ret[i] = new InstantiatedTermPositionVector(getIndex().getDocumentsByNumber()[docNumber], it.next());
+    }
+    return ret;
+  }
+
+  /**
+   * @return the term vector of the field, or null if the document has no vector space
+   *         or no entry for the field.
+   */
+  public TermFreqVector getTermFreqVector(int docNumber, String field) throws IOException {
+    InstantiatedDocument doc = getIndex().getDocumentsByNumber()[docNumber];
+    if (doc.getVectorSpace() == null
+        || doc.getVectorSpace().get(field) == null) {
+      return null;
+    } else {
+      return new InstantiatedTermPositionVector(doc, field);
+    }
+  }
+
+
+  /**
+   * Feeds the term vector of one field to the mapper. Nothing is mapped if the
+   * document has no vector space or no entry for the field.
+   */
+  public void getTermFreqVector(int docNumber, String field, TermVectorMapper mapper) throws IOException {
+    InstantiatedDocument doc = getIndex().getDocumentsByNumber()[docNumber];
+    if (doc.getVectorSpace() != null
+        && doc.getVectorSpace().get(field) != null) {
+      List<InstantiatedTermDocumentInformation> tv = doc.getVectorSpace().get(field);
+      mapper.setExpectations(field, tv.size(), true, true);
+      for (InstantiatedTermDocumentInformation tdi : tv) {
+        mapper.map(tdi.getTerm().text(), tdi.getTermPositions().length, tdi.getTermOffsets(), tdi.getTermPositions());
+      }
+    }
+  }
+
+  /**
+   * Feeds the term vectors of all fields of the document to the mapper. Nothing is
+   * mapped if the document has no vector space.
+   */
+  public void getTermFreqVector(int docNumber, TermVectorMapper mapper) throws IOException {
+    InstantiatedDocument doc = getIndex().getDocumentsByNumber()[docNumber];
+    if (doc.getVectorSpace() == null) {
+      return;
+    }
+    for (Map.Entry<String, List<InstantiatedTermDocumentInformation>> e : doc.getVectorSpace().entrySet()) {
+      mapper.setExpectations(e.getKey(), e.getValue().size(), true, true);
+      for (InstantiatedTermDocumentInformation tdi : e.getValue()) {
+        mapper.map(tdi.getTerm().text(), tdi.getTermPositions().length, tdi.getTermOffsets(), tdi.getTermPositions());
+      }
+    }
+  }
+}
diff --git
a/contrib/instantiated/src/java/org/apache/lucene/store/instantiated/InstantiatedIndexWriter.java b/contrib/instantiated/src/java/org/apache/lucene/store/instantiated/InstantiatedIndexWriter.java new file mode 100644 index 00000000000..51088b4390b --- /dev/null +++ b/contrib/instantiated/src/java/org/apache/lucene/store/instantiated/InstantiatedIndexWriter.java @@ -0,0 +1,681 @@ +package org.apache.lucene.store.instantiated; + +/** + * Copyright 2006 The Apache Software Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import org.apache.lucene.analysis.Analyzer; +import org.apache.lucene.analysis.Token; +import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.document.Document; +import org.apache.lucene.document.Field; +import org.apache.lucene.index.IndexReader; +import org.apache.lucene.index.IndexWriter; +import org.apache.lucene.index.Term; +import org.apache.lucene.index.TermVectorOffsetInfo; +import org.apache.lucene.search.DefaultSimilarity; +import org.apache.lucene.search.Similarity; + +import java.io.IOException; +import java.io.PrintStream; +import java.io.StringReader; +import java.util.*; + +/** + * This class, similar to {@link org.apache.lucene.index.IndexWriter}, has no locking mechanism. 
+ * + * {@link org.apache.lucene.store.instantiated.InstantiatedIndexReader} is navigating + * the same instances in memory as this writer is updating so searchers actice while + * you are committing are bound to throw exceptions. + * + * Consider using InstantiatedIndex as if it was immutable. + * + * @see org.apache.lucene.index.IndexWriter + */ +public class InstantiatedIndexWriter { + + private PrintStream infoStream = null; + + private int maxFieldLength = IndexWriter.DEFAULT_MAX_FIELD_LENGTH; + + private final InstantiatedIndex index; + private final Analyzer analyzer; + + private Similarity similarity = Similarity.getDefault(); // how to normalize; + + private transient Set fieldNameBuffer; + /** + * linked to ensure chronological order + */ + private Map>> termDocumentInformationFactoryByDocument = new LinkedHashMap>>(2000); + + private Set unflushedDocuments = new HashSet(); + + public InstantiatedIndexWriter(InstantiatedIndex index) throws IOException { + this(index, null); + } + + public InstantiatedIndexWriter(InstantiatedIndex index, Analyzer analyzer) throws IOException { + this(index, analyzer, false); + } + + public InstantiatedIndexWriter(InstantiatedIndex index, Analyzer analyzer, boolean create) throws IOException { + this.index = index; + this.analyzer = analyzer; + fieldNameBuffer = new HashSet(); + if (create) { + this.index.initialize(); + } + } + + private int mergeFactor = 2500; + + /** + * The sweetspot for this implementation is somewhere around 2500 at 2K text large documents. + *

+ * Benchmark output: + *

+   *  ------------> Report sum by Prefix (MAddDocs) and Round (8 about 8 out of 160153)
+   *  Operation      round  mrg buf cmpnd   runCnt   recsPerRun        rec/s  elapsedSec    avgUsedMem    avgTotalMem
+   *  MAddDocs_20000     0   10  10  true        1        20000         81,4      245,68   200 325 152    268 156 928
+   *  MAddDocs_20000 -   1 1000  10  true -  -   1 -  -   20000 -  -   494,1 -  -  40,47 - 247 119 072 -  347 025 408
+   *  MAddDocs_20000     2   10 100  true        1        20000        104,8      190,81   233 895 552    363 720 704
+   *  MAddDocs_20000 -   3 2000 100  true -  -   1 -  -   20000 -  -   527,2 -  -  37,94 - 266 136 448 -  378 273 792
+   *  MAddDocs_20000     4   10  10 false        1        20000        103,2      193,75   222 089 792    378 273 792
+   *  MAddDocs_20000 -   5 3000  10 false -  -   1 -  -   20000 -  -   545,2 -  -  36,69 - 237 917 152 -  378 273 792
+   *  MAddDocs_20000     6   10 100 false        1        20000        102,7      194,67   237 018 976    378 273 792
+   *  MAddDocs_20000 -   7 4000 100 false -  -   1 -  -   20000 -  -   535,8 -  -  37,33 - 309 680 640 -  501 968 896
+   * 
+ * + * @see org.apache.lucene.index.IndexWriter#setMergeFactor(int) + */ + public void setMergeFactor(int mergeFactor) { + this.mergeFactor = mergeFactor; + } + + /** + * @see org.apache.lucene.index.IndexWriter#getMergeFactor() + */ + public int getMergeFactor() { + return mergeFactor; + } + + + /** + * If non-null, information about merges and a message when + * maxFieldLength is reached will be printed to this. + */ + public void setInfoStream(PrintStream infoStream) { + this.infoStream = infoStream; + } + + + public void abort() throws IOException { + // what not + } + + + public void addIndexes(IndexReader[] readers) { + throw new RuntimeException("Not implemented"); + } + + + public PrintStream getInfoStream() { + return infoStream; + } + + + /** + * Flushes all changes to an index and closes all associated files. + */ + public void close() throws IOException { + commit(); + } + + /** + * Returns the number of documents currently in this index. + */ + public int docCount() { + // todo: not certain. see http://www.nabble.com/IndexWriter.docCount-tf3128882.html#a8669483 + return index.getDocumentsByNumber().length /* - index.getDeletedDocuments().size() */ + unflushedDocuments.size(); + } + + /** + * Locks the index and commits the buffered documents. 
+   * <p>
+   * In order: grows the per-field norm arrays, assigns document numbers to the
+   * buffered documents, looks up or creates an {@link InstantiatedTerm} for every
+   * buffered term and links it to the document, builds the term vectors, re-sorts
+   * the postings of every touched term, publishes the new arrays to the index,
+   * applies the buffered deletions and finally clears all buffers.
+   * <p>
+   * NOTE(review): no lock is actually taken yet (see the todo markers below).
+   * NOTE(review): the type arguments of the collections and Map.Entry declarations
+   * in this method appear to have been lost from this copy of the patch.
+   */
+  public void commit() throws IOException {
+
+    // todo write lock, unless held by caller
+
+    // true as soon as a new term is inserted into orderedTerms
+    boolean orderedTermsDirty = false;
+    // terms whose postings array was extended during this commit
+    Set dirtyTerms = new HashSet(1000);
+
+    // new document array: committed documents first, room for the buffered ones after them
+    InstantiatedDocument[] documentsByNumber = new InstantiatedDocument[index.getDocumentsByNumber().length + termDocumentInformationFactoryByDocument.size()];
+    System.arraycopy(index.getDocumentsByNumber(), 0, documentsByNumber, 0, index.getDocumentsByNumber().length);
+    int documentNumber = index.getDocumentsByNumber().length;
+
+    // mutable copy of the sorted term array so new terms can be inserted in order
+    List orderedTerms = new ArrayList(index.getOrderedTerms().length + 5000);
+    for (InstantiatedTerm instantiatedTerm : index.getOrderedTerms()) {
+      orderedTerms.add(instantiatedTerm);
+    }
+
+    // update norm array with fake values for new documents
+    Map normsByFieldNameAndDocumentNumber = new HashMap(index.getTermsByFieldAndText().size());
+    Set fieldNames = new HashSet(20);
+    fieldNames.addAll(index.getNormsByFieldNameAndDocumentNumber().keySet());
+    fieldNames.addAll(fieldNameBuffer);
+    // fields that already have terms in the index: keep old norms, pad the tail with norm 1.0
+    for (String field : index.getTermsByFieldAndText().keySet()) {
+      byte[] norms = new byte[index.getDocumentsByNumber().length + termDocumentInformationFactoryByDocument.size()];
+      byte[] oldNorms = index.getNormsByFieldNameAndDocumentNumber().get(field);
+      if (oldNorms != null) {
+        System.arraycopy(oldNorms, 0, norms, 0, oldNorms.length);
+        Arrays.fill(norms, oldNorms.length, norms.length, DefaultSimilarity.encodeNorm(1.0f));
+      } else {
+        Arrays.fill(norms, 0, norms.length, DefaultSimilarity.encodeNorm(1.0f));
+      }
+      normsByFieldNameAndDocumentNumber.put(field, norms);
+      fieldNames.remove(field);
+    }
+    // remaining names were not handled by the loop above: every slot gets norm 1.0
+    for (String field : fieldNames) {
+      //System.out.println(field);
+      byte[] norms = new byte[index.getDocumentsByNumber().length + termDocumentInformationFactoryByDocument.size()];
+      Arrays.fill(norms, 0, norms.length, DefaultSimilarity.encodeNorm(1.0f));
+      normsByFieldNameAndDocumentNumber.put(field, norms);
+    }
+    fieldNames.clear();
+    index.setNormsByFieldNameAndDocumentNumber(normsByFieldNameAndDocumentNumber);
+
+    // one iteration per buffered document
+    for (Map.Entry>> eDocumentTermDocInfoByTermTextAndField : termDocumentInformationFactoryByDocument.entrySet()) {
+
+      InstantiatedDocument document = eDocumentTermDocInfoByTermTextAndField.getKey();
+
+      // assign document number
+      document.setDocumentNumber(documentNumber++);
+      documentsByNumber[document.getDocumentNumber()] = document;
+
+      // set norms, prepare document and create optimized size collections.
+
+      // NOTE(review): numFieldsWithTermVectorsInDocument is computed but never read in this method.
+      int numFieldsWithTermVectorsInDocument = 0;
+      int termsInDocument = 0;
+      for (Map.Entry> eFieldTermDocInfoFactoriesByTermText : eDocumentTermDocInfoByTermTextAndField.getValue().entrySet()) {
+        if (eFieldTermDocInfoFactoriesByTermText.getKey().storeTermVector) {
+          numFieldsWithTermVectorsInDocument += eFieldTermDocInfoFactoriesByTermText.getValue().size();
+        }
+        termsInDocument += eFieldTermDocInfoFactoriesByTermText.getValue().size();
+
+        if (eFieldTermDocInfoFactoriesByTermText.getKey().isIndexed && !eFieldTermDocInfoFactoriesByTermText.getKey().omitNorms) {
+          // norm = field boost * document boost * length norm, stored encoded at the document's slot
+          float norm = eFieldTermDocInfoFactoriesByTermText.getKey().boost;
+          norm *= document.getDocument().getBoost();
+          norm *= similarity.lengthNorm(eFieldTermDocInfoFactoriesByTermText.getKey().fieldName, eFieldTermDocInfoFactoriesByTermText.getKey().fieldLength);
+          normsByFieldNameAndDocumentNumber.get(eFieldTermDocInfoFactoriesByTermText.getKey().fieldName)[document.getDocumentNumber()] = Similarity.encodeNorm(norm);
+        } else {
+          // NOTE(review): result is discarded; looks like a leftover debugging anchor.
+          System.currentTimeMillis();
+        }
+
+      }
+
+      /** used for term vectors only, i think.. */
+      Map informationByTermOfCurrentDocument = new HashMap(termsInDocument);
+
+
+      Map documentFieldSettingsByFieldName = new HashMap(eDocumentTermDocInfoByTermTextAndField.getValue().size());
+
+      // terms...
+      for (Map.Entry> eFieldSetting_TermDocInfoFactoriesByTermText : eDocumentTermDocInfoByTermTextAndField.getValue().entrySet()) {
+        documentFieldSettingsByFieldName.put(eFieldSetting_TermDocInfoFactoriesByTermText.getKey().fieldName, eFieldSetting_TermDocInfoFactoriesByTermText.getKey());
+
+        // find or create term
+        for (Map.Entry eTermText_TermDocInfoFactory : eFieldSetting_TermDocInfoFactoriesByTermText.getValue().entrySet()) {
+
+          // get term..
+          InstantiatedTerm term;
+          Map termsByText = index.getTermsByFieldAndText().get(eFieldSetting_TermDocInfoFactoriesByTermText.getKey().fieldName);
+          if (termsByText == null) {
+            // first term ever seen for this field: create the per-field map and the term
+            termsByText = new HashMap(1000);
+            index.getTermsByFieldAndText().put(eFieldSetting_TermDocInfoFactoriesByTermText.getKey().fieldName, termsByText);
+            term = new InstantiatedTerm(eFieldSetting_TermDocInfoFactoriesByTermText.getKey().fieldName, eTermText_TermDocInfoFactory.getKey());
+            termsByText.put(eTermText_TermDocInfoFactory.getKey(), term);
+            // the term is new, so binarySearch returns -(insertionPoint) - 1
+            int pos = Collections.binarySearch(orderedTerms, term, InstantiatedTerm.comparator);
+            pos = -1 - pos;
+            orderedTerms.add(pos, term);
+            orderedTermsDirty = true;
+          } else {
+            term = termsByText.get(eTermText_TermDocInfoFactory.getKey());
+            if (term == null) {
+              // known field, new term text: same insertion as in the branch above
+              term = new InstantiatedTerm(eFieldSetting_TermDocInfoFactoriesByTermText.getKey().fieldName, eTermText_TermDocInfoFactory.getKey());
+              termsByText.put(eTermText_TermDocInfoFactory.getKey(), term);
+              int pos = Collections.binarySearch(orderedTerms, term, InstantiatedTerm.comparator);
+              pos = -1 - pos;
+              orderedTerms.add(pos, term);
+              orderedTermsDirty = true;
+            }
+          }
+
+          // create association term document information
+          //
+          // [Term]-- {0..*} | {0..* ordered} --(field)[Document]
+          //
+          //                 |
+          //        [TermDocumentInformation]
+
+          // copy the buffered positions and payloads into arrays
+          int[] positions = new int[eTermText_TermDocInfoFactory.getValue().termPositions.size()];
+          for (int i = 0; i < positions.length; i++) {
+            positions[i] = eTermText_TermDocInfoFactory.getValue().termPositions.get(i);
+          }
+
+          byte[][] payloads = new byte[eTermText_TermDocInfoFactory.getValue().payloads.size()][];
+          for (int i = 0; i < payloads.length; i++) {
+            payloads[i] = eTermText_TermDocInfoFactory.getValue().payloads.get(i);
+          }
+
+          // couple
+
+          InstantiatedTermDocumentInformation info = new InstantiatedTermDocumentInformation(term, document, /*eTermText_TermDocInfoFactory.getValue().termFrequency,*/ positions, payloads);
+
+          // todo optimize, this should be cached and updated to array in batches rather than appending the array once for every position!
+          // grow the term's postings array by one and put the new association last
+          InstantiatedTermDocumentInformation[] associatedDocuments;
+          if (term.getAssociatedDocuments() != null) {
+            associatedDocuments = new InstantiatedTermDocumentInformation[term.getAssociatedDocuments().length + 1];
+            System.arraycopy(term.getAssociatedDocuments(), 0, associatedDocuments, 0, term.getAssociatedDocuments().length);
+          } else {
+            associatedDocuments = new InstantiatedTermDocumentInformation[1];
+          }
+          associatedDocuments[associatedDocuments.length - 1] = info;
+          term.setAssociatedDocuments(associatedDocuments);
+
+          // todo optimize, only if term vector?
+          informationByTermOfCurrentDocument.put(term, info);
+
+
+          dirtyTerms.add(term);
+        }
+
+        // term vector offsets
+        if (eFieldSetting_TermDocInfoFactoriesByTermText.getKey().storeOffsetWithTermVector) {
+          // scans every term collected so far for this document and keeps those of the current field
+          for (Map.Entry e : informationByTermOfCurrentDocument.entrySet()) {
+            if (eFieldSetting_TermDocInfoFactoriesByTermText.getKey().fieldName.equals(e.getKey().field())) {
+              TermDocumentInformationFactory factory = eFieldSetting_TermDocInfoFactoriesByTermText.getValue().get(e.getKey().text());
+              e.getValue().setTermOffsets(factory.termOffsets.toArray(new TermVectorOffsetInfo[factory.termOffsets.size()]));
+            }
+          }
+        }
+      }
+
+      // group this document's term associations by field name
+      Map> termDocumentInformationsByField = new HashMap>();
+      for (Map.Entry eTerm_TermDocumentInformation : informationByTermOfCurrentDocument.entrySet()) {
+        List termDocumentInformations = termDocumentInformationsByField.get(eTerm_TermDocumentInformation.getKey().field());
+        if (termDocumentInformations == null) {
+          termDocumentInformations = new ArrayList();
+          termDocumentInformationsByField.put(eTerm_TermDocumentInformation.getKey().field(), termDocumentInformations);
+        }
+        termDocumentInformations.add(eTerm_TermDocumentInformation.getValue());
+      }
+
+      for (Map.Entry> eField_TermDocInfos : termDocumentInformationsByField.entrySet()) {
+
+        // sort each field's associations by term
+        Collections.sort(eField_TermDocInfos.getValue(), new Comparator() {
+          public int compare(InstantiatedTermDocumentInformation instantiatedTermDocumentInformation, InstantiatedTermDocumentInformation instantiatedTermDocumentInformation1) {
+            return instantiatedTermDocumentInformation.getTerm().getTerm().compareTo(instantiatedTermDocumentInformation1.getTerm().getTerm());
+          }
+        });
+
+        // add term vector
+        if (documentFieldSettingsByFieldName.get(eField_TermDocInfos.getKey()).storeTermVector) {
+          if (document.getVectorSpace() == null) {
+            document.setVectorSpace(new HashMap>(documentFieldSettingsByFieldName.size()));
+          }
+          document.getVectorSpace().put(eField_TermDocInfos.getKey(), eField_TermDocInfos.getValue());
+        }
+
+      }
+    }
+
+    // order document informations in dirty terms
+    for (InstantiatedTerm term : dirtyTerms) {
+      // todo optimize, i believe this is useless, that the natural order is document number?
+      Arrays.sort(term.getAssociatedDocuments(), InstantiatedTermDocumentInformation.documentNumberComparator);
+
+//      // update association class reference for speedy skipTo()
+//      for (int i = 0; i < term.getAssociatedDocuments().length; i++) {
+//        term.getAssociatedDocuments()[i].setIndexFromTerm(i);
+//      }
+    }
+
+
+    // flush to writer
+    index.setDocumentsByNumber(documentsByNumber);
+    index.setOrderedTerms(orderedTerms.toArray(new InstantiatedTerm[orderedTerms.size()]));
+
+    // set term index
+    if (orderedTermsDirty) {
+      // todo optimize, only update from start position
+      for (int i = 0; i < index.getOrderedTerms().length; i++) {
+        index.getOrderedTerms()[i].setTermIndex(i);
+      }
+
+    }
+
+    // remove deleted documents
+    // NOTE(review): deletions run after the buffered documents were added above, so a
+    // document buffered in this batch that matches a delete term (e.g. one added through
+    // updateDocument) looks like it would be deleted too - confirm against the reader.
+    IndexReader indexDeleter = index.indexReaderFactory();
+    if (unflushedDeletions.size() > 0) {
+      for (Term term : unflushedDeletions) {
+        indexDeleter.deleteDocuments(term);
+      }
+      unflushedDeletions.clear();
+    }
+
+
+    // all done, clear buffers
+    unflushedDocuments.clear();
+    termDocumentInformationFactoryByDocument.clear();
+    fieldNameBuffer.clear();
+
+    // the wall-clock time of the commit is used as the index version
+    index.setVersion(System.currentTimeMillis());
+
+    // todo unlock
+
+    indexDeleter.close();
+
+  }
+
+  /**
+   * Adds a document to this index. If the document contains more than
+   * {@link #setMaxFieldLength(int)} terms for a given field, the remainder are
+   * discarded. Delegates to {@link #addDocument(Document, Analyzer)} with the
+   * analyzer returned by {@link #getAnalyzer()}.
+   */
+  public void addDocument(Document doc) throws IOException {
+    addDocument(doc, getAnalyzer());
+  }
+
+  /**
+   * Adds a document to this index, using the provided analyzer instead of the
+   * value of {@link #getAnalyzer()}. If the document contains more than
+   * {@link #setMaxFieldLength(int)} terms for a given field, the remainder are
+   * discarded.
+   *
+   * @param doc the document to wrap in an {@link InstantiatedDocument} and buffer
+   * @param analyzer analyzer used to tokenize the indexed, tokenized fields
+   * @throws IOException declared for the tokenizing and commit calls made further down
+   */
+  public void addDocument(Document doc, Analyzer analyzer) throws IOException {
+    addDocument(new InstantiatedDocument(doc), analyzer);
+  }
+
+  /**
+   * Tokenizes a document and adds it to the buffer.
+   * Try to do all calculations in this method rather than in commit, as this is a non locking method.
+   * Remember, this index implementation expects unlimited memory for maximum speed.
+   * <p>
+   * At most {@link #getMaxFieldLength()} terms are kept per tokenized field name;
+   * the count is shared by all fields of the same name in the document. Fields that
+   * are not stored are removed from the wrapped document. When the number of
+   * buffered documents reaches {@link #getMergeFactor()}, {@link #commit()} is called.
+   *
+   * @param document the document to buffer; it must not have a document number yet
+   * @param analyzer analyzer used for fields that carry no token stream of their own
+   * @throws IOException declared for the token stream and commit calls
+   * @throws RuntimeException if the document already has a document number
+   */
+  protected void addDocument(InstantiatedDocument document, Analyzer analyzer) throws IOException {
+
+    if (document.getDocumentNumber() != null) {
+      throw new RuntimeException("Document number already set! Are you trying to add a document that already is bound to this or another index?");
+    }
+
+    // todo: write lock
+
+    // normalize settings per field name in document
+
+    // NOTE(review): type arguments of the collections in this method appear to have been
+    // lost from this copy of the patch.
+    Map fieldSettingsByFieldName = new HashMap();
+    for (Field field : (List) document.getDocument().getFields()) {
+      FieldSetting fieldSettings = fieldSettingsByFieldName.get(field.name());
+      if (fieldSettings == null) {
+        fieldSettings = new FieldSetting();
+        fieldSettings.fieldName = field.name().intern();
+        fieldSettingsByFieldName.put(fieldSettings.fieldName, fieldSettings);
+        fieldNameBuffer.add(fieldSettings.fieldName);
+      }
+
+      // todo: fixme: multiple fields with the same name does not mean field boost += more boost.
+      fieldSettings.boost *= field.getBoost();
+      //fieldSettings.dimensions++;
+
+      // once fieldSettings, always fieldSettings.
+      // Each flag starts false and becomes true as soon as one field of this name has it set.
+      if (field.getOmitNorms() != fieldSettings.omitNorms) {
+        fieldSettings.omitNorms = true;
+      }
+      if (field.isIndexed() != fieldSettings.isIndexed) {
+        fieldSettings.isIndexed = true;
+      }
+      if (field.isTokenized() != fieldSettings.isTokenized) {
+        fieldSettings.isTokenized = true;
+      }
+      if (field.isCompressed() != fieldSettings.isCompressed) {
+        fieldSettings.isCompressed = true;
+      }
+      if (field.isStored() != fieldSettings.isStored) {
+        fieldSettings.isStored = true;
+      }
+      if (field.isBinary() != fieldSettings.isBinary) {
+        fieldSettings.isBinary = true;
+      }
+      if (field.isTermVectorStored() != fieldSettings.storeTermVector) {
+        fieldSettings.storeTermVector = true;
+      }
+      if (field.isStorePositionWithTermVector() != fieldSettings.storePositionWithTermVector) {
+        fieldSettings.storePositionWithTermVector = true;
+      }
+      if (field.isStoreOffsetWithTermVector() != fieldSettings.storeOffsetWithTermVector) {
+        fieldSettings.storeOffsetWithTermVector = true;
+      }
+    }
+
+    // insertion ordered, so fields are processed below in document order
+    Map> tokensByField = new LinkedHashMap>(20);
+
+    // tokenize indexed fields.
+    for (Iterator it = (Iterator) document.getDocument().getFields().iterator(); it.hasNext();) {
+
+      Field field = it.next();
+
+      FieldSetting fieldSettings = fieldSettingsByFieldName.get(field.name());
+
+      if (field.isIndexed()) {
+
+        LinkedList tokens = new LinkedList();
+        tokensByField.put(field, tokens);
+
+        if (field.isTokenized()) {
+          final TokenStream tokenStream;
+          // todo readerValue(), binaryValue()
+          if (field.tokenStreamValue() != null) {
+            tokenStream = field.tokenStreamValue();
+          } else {
+            tokenStream = analyzer.tokenStream(field.name(), new StringReader(field.stringValue()));
+          }
+          Token next = tokenStream.next();
+
+          // Checking the limit before adding keeps exactly maxFieldLength terms per field
+          // name; checking "> maxFieldLength" after the add let one extra term through.
+          while (next != null && fieldSettings.fieldLength < maxFieldLength) {
+            next.setTermText(next.termText().intern()); // todo: not sure this needs to be interned?
+            tokens.add(next); // the vector will be built on commit.
+            next = tokenStream.next();
+            fieldSettings.fieldLength++;
+          }
+        } else {
+          // untokenized: the whole string value becomes a single token
+          tokens.add(new Token(field.stringValue().intern(), 0, field.stringValue().length(), "untokenized"));
+          fieldSettings.fieldLength++;
+        }
+      }
+
+      if (!field.isStored()) {
+        it.remove();
+      }
+    }
+
+
+    Map> termDocumentInformationFactoryByTermTextAndFieldSetting = new HashMap>();
+    termDocumentInformationFactoryByDocument.put(document, termDocumentInformationFactoryByTermTextAndFieldSetting);
+
+    // build term vector, term positions and term offsets
+    for (Map.Entry> eField_Tokens : tokensByField.entrySet()) {
+      FieldSetting fieldSettings = fieldSettingsByFieldName.get(eField_Tokens.getKey().name());
+
+      Map termDocumentInformationFactoryByTermText = termDocumentInformationFactoryByTermTextAndFieldSetting.get(fieldSettingsByFieldName.get(eField_Tokens.getKey().name()));
+      if (termDocumentInformationFactoryByTermText == null) {
+        termDocumentInformationFactoryByTermText = new HashMap();
+        termDocumentInformationFactoryByTermTextAndFieldSetting.put(fieldSettingsByFieldName.get(eField_Tokens.getKey().name()), termDocumentInformationFactoryByTermText);
+      }
+
+      int lastOffset = 0;
+
+      // for each new field, move positions a bunch.
+      if (fieldSettings.position > 0) {
+        // todo what if no analyzer set, multiple fields with same name and index without tokenization?
+        fieldSettings.position += analyzer.getPositionIncrementGap(fieldSettings.fieldName);
+      }
+
+      for (Token token : eField_Tokens.getValue()) {
+
+        TermDocumentInformationFactory termDocumentInformationFactory = termDocumentInformationFactoryByTermText.get(token.termText());
+        if (termDocumentInformationFactory == null) {
+          termDocumentInformationFactory = new TermDocumentInformationFactory();
+          termDocumentInformationFactoryByTermText.put(token.termText(), termDocumentInformationFactory);
+        }
+        //termDocumentInformationFactory.termFrequency++;
+
+        // apply the token's increment beyond the default step of one, record the position, then step
+        fieldSettings.position += (token.getPositionIncrement() - 1);
+        termDocumentInformationFactory.termPositions.add(fieldSettings.position++);
+
+        // payloads stay index-aligned with positions: null when the token has none
+        if (token.getPayload() != null && token.getPayload().length() > 0) {
+          termDocumentInformationFactory.payloads.add(token.getPayload().toByteArray());
+        } else {
+          termDocumentInformationFactory.payloads.add(null);
+        }
+
+        if (eField_Tokens.getKey().isStoreOffsetWithTermVector()) {
+
+          termDocumentInformationFactory.termOffsets.add(new TermVectorOffsetInfo(fieldSettings.offset + token.startOffset(), fieldSettings.offset + token.endOffset()));
+          lastOffset = fieldSettings.offset + token.endOffset();
+        }
+
+
+      }
+
+      if (eField_Tokens.getKey().isStoreOffsetWithTermVector()) {
+        // the next field of the same name starts one past the last end offset
+        fieldSettings.offset = lastOffset + 1;
+      }
+
+    }
+
+
+    unflushedDocuments.add(document);
+
+    // if too many documents in buffer, commit.
+    if (unflushedDocuments.size() >= getMergeFactor()) {
+      commit(/*lock*/);
+    }
+
+    // todo: unlock write lock
+
+  }
+
+
+  /** Delete terms buffered until the next {@link #commit()}. */
+  private Set unflushedDeletions = new HashSet();
+
+  /** Buffers a delete term; nothing is deleted here, only the term is recorded. */
+  public void deleteDocuments(Term term) throws IOException {
+    unflushedDeletions.add(term);
+  }
+
+  /** Buffers every term of the array, see {@link #deleteDocuments(Term)}. */
+  public void deleteDocuments(Term[] terms) throws IOException {
+    for (Term term : terms) {
+      deleteDocuments(term);
+    }
+  }
+
+  /** Same as {@link #updateDocument(Term, Document, Analyzer)} with {@link #getAnalyzer()}. */
+  public void updateDocument(Term term, Document doc) throws IOException {
+    updateDocument(term, doc, getAnalyzer());
+  }
+
+  /** Buffers a deletion for the term and then buffers the document. */
+  public void updateDocument(Term term, Document doc, Analyzer analyzer) throws IOException {
+    deleteDocuments(term);
+    addDocument(doc, analyzer);
+  }
+
+  public int getMaxFieldLength() {
+    return maxFieldLength;
+  }
+
+  public void setMaxFieldLength(int maxFieldLength) {
+    this.maxFieldLength = maxFieldLength;
+  }
+
+  public Similarity getSimilarity() {
+    return similarity;
+  }
+
+  public void setSimilarity(Similarity similarity) {
+    this.similarity = similarity;
+  }
+
+  public Analyzer getAnalyzer() {
+    return analyzer;
+  }
+
+
+  /**
+   * Settings accumulated over all fields sharing one name within a single document.
+   * Two instances are equal when their field names are equal.
+   */
+  private class FieldSetting {
+    private String fieldName;
+
+    private float boost = 1;
+    //private int dimensions = 0; // this is futuristic
+    private int position = 0;
+    private int offset;
+    private int fieldLength = 0;
+
+    private boolean storeTermVector = false;
+    private boolean storeOffsetWithTermVector = false;
+    private boolean storePositionWithTermVector = false;
+    private boolean omitNorms = false;
+    private boolean isTokenized = false;
+
+    private boolean isStored = false;
+    private boolean isIndexed = false;
+    private boolean isBinary = false;
+    private boolean isCompressed = false;
+
+    //private float norm;
+    //private byte encodedNorm;
+
+    public boolean equals(Object o) {
+      if (this == o) return true;
+      if (o == null || getClass() != o.getClass()) return false;
+
+      final FieldSetting that = (FieldSetting) o;
+
+      return fieldName.equals(that.fieldName);
+
+    }
+
+    public int hashCode() {
+      return fieldName.hashCode();
+    }
+  }
+
+  /** Buffers payloads, positions and offsets of one term in one field of one document. */
+  private class TermDocumentInformationFactory {
+    private LinkedList payloads = new LinkedList();
+    private LinkedList termPositions = new LinkedList();
+    private LinkedList termOffsets = new LinkedList();
+  }
+
+
+
+}
diff --git a/contrib/instantiated/src/java/org/apache/lucene/store/instantiated/InstantiatedTerm.java b/contrib/instantiated/src/java/org/apache/lucene/store/instantiated/InstantiatedTerm.java
new file mode 100644
index 00000000000..b982ed840bb
--- /dev/null
+++ b/contrib/instantiated/src/java/org/apache/lucene/store/instantiated/InstantiatedTerm.java
@@ -0,0 +1,250 @@
+package org.apache.lucene.store.instantiated;
+
+/**
+ * Copyright 2006 The Apache Software Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import org.apache.lucene.index.Term;
+
+import java.io.Serializable;
+import java.util.Comparator;
+import java.util.Collections;
+import java.util.Arrays;
+
+/**
+ * A term in the inverted index, coupled to the documents it occurs in.
+ *
+ * @see org.apache.lucene.index.Term
+ */
+public class InstantiatedTerm
+    implements Serializable {
+
+  private static final long serialVersionUID = 1L;
+
+  /** Orders instantiated terms by comparing their wrapped {@link Term}s. */
+  public static final Comparator<InstantiatedTerm> comparator = new Comparator<InstantiatedTerm>() {
+    public int compare(InstantiatedTerm instantiatedTerm, InstantiatedTerm instantiatedTerm1) {
+      return instantiatedTerm.getTerm().compareTo(instantiatedTerm1.getTerm());
+    }
+  };
+
+  /**
+   * Compares an {@link InstantiatedTerm} (first argument) with a plain {@link Term}
+   * (second argument); any other argument types fail with a ClassCastException.
+   */
+  public static final Comparator termComparator = new Comparator() {
+    public int compare(Object o, Object o1) {
+      return ((InstantiatedTerm)o).getTerm().compareTo((Term)o1);
+    }
+  };
+
+  private Term term;
+
+  /**
+   * index of term in InstantiatedIndex
+   * @see org.apache.lucene.store.instantiated.InstantiatedIndex#getOrderedTerms() */
+  private int termIndex;
+
+  /**
+   * @return Term associated with this entry of the index object graph
+   */
+  public Term getTerm() {
+    return term;
+  }
+
+  InstantiatedTerm(String field, String text) {
+    this.term = new Term(field, text);
+  }
+
+//  this could speed up TermDocs.skipTo even more
+//  private Map associatedDocumentIndexByDocumentNumber = new HashMap();
+//
+//  public Map getAssociatedDocumentIndexByDocumentNumber() {
+//    return associatedDocumentIndexByDocumentNumber;
+//  }
+
+  /** Ordered by document number */
+  private InstantiatedTermDocumentInformation[] associatedDocuments;
+
+  /**
+   * Meta data per document in which this term is occurring.
+   * Ordered by document number.
+   *
+   * @return Meta data per document in which this term is occurring, null until first set.
+   */
+  public InstantiatedTermDocumentInformation[] getAssociatedDocuments() {
+    return associatedDocuments;
+  }
+
+
+  /**
+   * Meta data per document in which this term is occurring.
+   * Ordered by document number.
+   *
+   * @param associatedDocuments meta data per document in which this term is occurring, ordered by document number
+   */
+  void setAssociatedDocuments(InstantiatedTermDocumentInformation[] associatedDocuments) {
+    this.associatedDocuments = associatedDocuments;
+  }
+
+  /**
+   * Finds the index of the first associated document whose document number is
+   * greater than or equal to target, -1 if there is no such element.
+   *
+   * @param target the document number to match
+   * @return -1 if there is no such element
+   */
+  public int seekCeilingDocumentInformationIndex(int target) {
+    return seekCeilingDocumentInformationIndex(target, 0, getAssociatedDocuments().length);
+  }
+
+  /**
+   * Finds the index of the first associated document, at or after startOffset, whose
+   * document number is greater than or equal to target, -1 if there is no such element.
+   *
+   * @param target the document number to match
+   * @param startOffset associated documents index start offset
+   * @return -1 if there is no such element
+   */
+  public int seekCeilingDocumentInformationIndex(int target, int startOffset) {
+    return seekCeilingDocumentInformationIndex(target, startOffset, getAssociatedDocuments().length);
+  }
+
+  /**
+   * Finds the index of the first associated document within
+   * [startOffset, endPosition) whose document number is greater than or equal
+   * to target, -1 if there is no such element in that window.
+   *
+   * @param target the document number to match
+   * @param startOffset associated documents index start offset (inclusive)
+   * @param endPosition associated documents index end position (exclusive)
+   * @return -1 if there is no such element
+   */
+  public int seekCeilingDocumentInformationIndex(int target, int startOffset, int endPosition) {
+
+    int pos = binarySearchAssociatedDocuments(target, startOffset, endPosition - startOffset);
+
+//    int pos = Arrays.binarySearch(getAssociatedDocuments(), target, InstantiatedTermDocumentInformation.doumentNumberIntegerComparator);
+
+    if (pos < 0) {
+      // not found: convert -(insertionIndex + 1) to the insertion index
+      pos = -1 - pos;
+    }
+    // An insertion index at the end of the searched window means every entry in it is
+    // smaller than target. Bound by endPosition, not the array length, so a caller
+    // searching a sub-range never gets an index outside that range.
+    if (endPosition <= pos) {
+      return -1;
+    } else {
+      return pos;
+    }
+  }
+
+  /** Searches all associated documents, see {@link #binarySearchAssociatedDocuments(int, int, int)}. */
+  public int binarySearchAssociatedDocuments(int target) {
+    return binarySearchAssociatedDocuments(target, 0);
+  }
+
+  /** Searches from offset to the end of the array, see {@link #binarySearchAssociatedDocuments(int, int, int)}. */
+  public int binarySearchAssociatedDocuments(int target, int offset) {
+    return binarySearchAssociatedDocuments(target, offset, associatedDocuments.length - offset);
+  }
+
+  /**
+   * Searches the associated documents, which are ordered by document number, for a
+   * document number. The first probes interpolate the pivot from the values at the
+   * window edges; after that the search falls back to halving.
+   *
+   * @param target value to search for in the array
+   * @param offset index of the first valid value in the array
+   * @param length number of valid values in the array
+   * @return index of an occurrence of key in array, or -(insertionIndex + 1) if key is not contained in array (insertionIndex is then the index at which key could be inserted).
+   */
+  public int binarySearchAssociatedDocuments(int target, int offset, int length) {
+
+    // implementation originally from http://ochafik.free.fr/blog/?p=106
+
+    if (length == 0) {
+      return -1 - offset;
+    }
+    int min = offset, max = offset + length - 1;
+    int minVal = getAssociatedDocuments()[min].getDocument().getDocumentNumber();
+    int maxVal = getAssociatedDocuments()[max].getDocument().getDocumentNumber();
+
+
+    int nPreviousSteps = 0;
+
+    for (; ;) {
+
+      // be careful not to compute key - minVal, for there might be an integer overflow.
+      if (target <= minVal) return target == minVal ? min : -1 - min;
+      if (target >= maxVal) return target == maxVal ? max : -2 - max;
+
+      assert min != max;
+
+      int pivot;
+      // A typical binarySearch algorithm uses pivot = (min + max) / 2.
+      // The pivot we use here tries to be smarter and to choose a pivot close to the expectable location of the key.
+      // This reduces dramatically the number of steps needed to get to the key.
+      // However, it does not work well with a logarithmic distribution of values, for instance.
+      // When the key is not found quickly the smart way, we switch to the standard pivot.
+      if (nPreviousSteps > 2) {
+        // unsigned shift keeps the midpoint correct even if min + max overflows
+        pivot = (min + max) >>> 1;
+        // stop increasing nPreviousSteps from now on
+      } else {
+        // NOTE: We cannot do the following operations in int precision, because there might be overflows.
+        //       long operations are slower than float operations with the hardware this was tested on (intel core duo 2, JVM 1.6.0).
+        //       Overall, using float proved to be the safest and fastest approach.
+        pivot = min + (int) ((target - (float) minVal) / (maxVal - (float) minVal) * (max - min));
+        nPreviousSteps++;
+      }
+
+      int pivotVal = getAssociatedDocuments()[pivot].getDocument().getDocumentNumber();
+
+      // NOTE: do not store key - pivotVal because of overflows
+      if (target > pivotVal) {
+        min = pivot + 1;
+        max--;
+      } else if (target == pivotVal) {
+        return pivot;
+      } else {
+        min++;
+        max = pivot - 1;
+      }
+      // min may now be one past max; the edge checks at the top of the loop then end the search
+      maxVal = getAssociatedDocuments()[max].getDocument().getDocumentNumber();
+      minVal = getAssociatedDocuments()[min].getDocument().getDocumentNumber();
+    }
+  }
+
+
+  /**
+   * Navigates to the view of the occurrences of this term in a specific document.
+   *
+   * This method is only used by InstantiatedIndex(IndexReader) and
+   * should not be optimized for less CPU at the cost of more RAM.
+   *
+   * @param documentNumber the n:th document in the index
+   * @return view of this term from specified document, null if the term does not occur in it
+   */
+  public InstantiatedTermDocumentInformation getAssociatedDocument(int documentNumber) {
+    int pos = binarySearchAssociatedDocuments(documentNumber);
+    return pos < 0 ? null : getAssociatedDocuments()[pos];
+  }
+
+  public final String field() {
+    return term.field();
+  }
+
+  public final String text() {
+    return term.text();
+  }
+
+  public String toString() {
+    return term.toString();
+  }
+
+
+  public int getTermIndex() {
+    return termIndex;
+  }
+
+  public void setTermIndex(int termIndex) {
+    this.termIndex = termIndex;
+  }
+}
diff --git a/contrib/instantiated/src/java/org/apache/lucene/store/instantiated/InstantiatedTermDocs.java b/contrib/instantiated/src/java/org/apache/lucene/store/instantiated/InstantiatedTermDocs.java
new file mode 100644
index 00000000000..d89a4fe5fa3
--- /dev/null
+++ b/contrib/instantiated/src/java/org/apache/lucene/store/instantiated/InstantiatedTermDocs.java
@@ -0,0 +1,136 @@
+package org.apache.lucene.store.instantiated;
+
+/**
+ * Copyright 2006 The Apache Software Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import org.apache.lucene.index.Term;
+import org.apache.lucene.index.TermDocs;
+
+/**
+ * A {@link org.apache.lucene.index.TermDocs} navigating an {@link InstantiatedIndexReader}.
+ */
+public class InstantiatedTermDocs
+    implements TermDocs {
+
+  private final InstantiatedIndexReader reader;
+
+  public InstantiatedTermDocs(InstantiatedIndexReader reader) {
+    this.reader = reader;
+  }
+
+  /** Index into the current term's associated documents; -1 right after a seek. */
+  private int currentDocumentIndex;
+  protected InstantiatedTermDocumentInformation currentDocumentInformation;
+  protected InstantiatedTerm currentTerm;
+
+
+  /**
+   * Positions this enumeration before the first document of the term.
+   * currentTerm is whatever the index's findTerm returns for the term; the other
+   * methods treat a null currentTerm as "no documents".
+   */
+  public void seek(Term term) {
+    currentTerm = reader.getIndex().findTerm(term);
+    currentDocumentIndex = -1;
+  }
+
+  public void seek(org.apache.lucene.index.TermEnum termEnum) {
+    seek(termEnum.term());
+  }
+
+
+  /** Document number of the current entry; only valid after next() or skipTo() returned true. */
+  public int doc() {
+    return currentDocumentInformation.getDocument().getDocumentNumber();
+  }
+
+  /** Number of positions recorded for the term in the current document. */
+  public int freq() {
+    return currentDocumentInformation.getTermPositions().length;
+  }
+
+
+  /**
+   * Moves to the next document of the current term that is not deleted.
+   *
+   * @return true if there is such a document
+   */
+  public boolean next() {
+    if (currentTerm == null) {
+      return false;
+    }
+    // Loop instead of recursing per deleted document, so a long run of deleted
+    // documents cannot exhaust the call stack.
+    while (++currentDocumentIndex < currentTerm.getAssociatedDocuments().length) {
+      currentDocumentInformation = currentTerm.getAssociatedDocuments()[currentDocumentIndex];
+      if (!isCurrentDocumentDeleted()) {
+        return true;
+      }
+    }
+    return false;
+  }
+
+
+  /**
+   * Fills the arrays with up to docs.length document numbers and frequencies.
+   *
+   * @return number of entries written
+   */
+  public int read(int[] docs, int[] freqs) {
+    int i;
+    for (i = 0; i < docs.length; i++) {
+      if (!next()) {
+        break;
+      }
+      docs[i] = doc();
+      freqs[i] = freq();
+    }
+    return i;
+  }
+
+  /**
+   * Skips entries to the first beyond the current whose document number is
+   * greater than or equal to <i>target</i>. <p>Returns true if there is such
+   * an entry.  <p>Behaves as if written: <pre>
+   *   boolean skipTo(int target) {
+   *     do {
+   *       if (!next())
+   * 	     return false;
+   *     } while (target > doc());
+   *     return true;
+   *   }
+   * </pre>
+   * This implementation is considerably more efficient than that.
+   *
+   */
+  public boolean skipTo(int target) {
+    if (currentTerm == null) {
+      return false;
+    }
+
+    // Start strictly after the current entry (index 0 right after a seek), so the call
+    // always advances by at least one entry, as the loop in the javadoc does.
+    int startOffset = currentDocumentIndex + 1;
+    if (startOffset >= currentTerm.getAssociatedDocuments().length) {
+      // already at or past the last entry
+      return false;
+    }
+
+    int pos = currentTerm.seekCeilingDocumentInformationIndex(target, startOffset);
+
+    if (pos == -1) {
+      return false;
+    }
+
+    currentDocumentInformation = currentTerm.getAssociatedDocuments()[pos];
+    currentDocumentIndex = pos;
+    // a deleted hit is skipped by moving on to the next live document
+    return isCurrentDocumentDeleted() ? next() : true;
+  }
+
+  /** True if the reader reports the current document as deleted. */
+  private boolean isCurrentDocumentDeleted() {
+    return reader.hasDeletions() && reader.isDeleted(currentDocumentInformation.getDocument().getDocumentNumber());
+  }
+
+  /**
+   * Does nothing
+   */
+  public void close() {
+
+  }
+
+
+}
diff --git a/contrib/instantiated/src/java/org/apache/lucene/store/instantiated/InstantiatedTermDocumentInformation.java b/contrib/instantiated/src/java/org/apache/lucene/store/instantiated/InstantiatedTermDocumentInformation.java
new file mode 100644
index 00000000000..409d33d40f1
--- /dev/null
+++ b/contrib/instantiated/src/java/org/apache/lucene/store/instantiated/InstantiatedTermDocumentInformation.java
@@ -0,0 +1,139 @@
+package org.apache.lucene.store.instantiated;
+
+import org.apache.lucene.index.TermVectorOffsetInfo;
+
+import java.io.Serializable;
+import java.util.Comparator;
+
+/**
+ * Copyright 2006 The Apache Software Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * There is one instance of this class per indexed term in a document
+ * and it contains the meta data about each occurrence of a term in a document.
+ *
+ * It is the inner glue of the inverted index.
+ *
+ * <pre>
+ * [Term]-- {0..*} | {0..*} --(field)[Document]
+ *            &lt;&lt;ordered&gt;&gt;
+ *                 |
+ *    [TermDocumentInformation]
+ *       +payloads
+ *       +termPositions
+ *       +termOffsets
+ * </pre>
+ *
+ */
+public class InstantiatedTermDocumentInformation
+    implements Serializable {
+
+  private static final long serialVersionUID = 1L;
+
+  /**
+   * Orders instances by the {@link org.apache.lucene.index.Term} of their associated term.
+   */
+  public static final Comparator<InstantiatedTermDocumentInformation> termComparator = new Comparator<InstantiatedTermDocumentInformation>() {
+    public int compare(InstantiatedTermDocumentInformation left, InstantiatedTermDocumentInformation right) {
+      // Both sides must be unwrapped to the underlying Term; comparing a Term
+      // against the wrapping InstantiatedTerm is a type mismatch.
+      return left.getTerm().getTerm().compareTo(right.getTerm().getTerm());
+    }
+  };
+
+  /**
+   * Orders instances by the document number of their associated document.
+   */
+  public static final Comparator<InstantiatedTermDocumentInformation> documentNumberComparator = new Comparator<InstantiatedTermDocumentInformation>() {
+    public int compare(InstantiatedTermDocumentInformation left, InstantiatedTermDocumentInformation right) {
+      return left.getDocument().getDocumentNumber().compareTo(right.getDocument().getDocumentNumber());
+    }
+  };
+
+  /**
+   * Asymmetric comparator: the first argument must be an
+   * {@link InstantiatedTermDocumentInformation}, the second an {@link Integer}
+   * document number. Deliberately left as a raw type because the two operands
+   * have different types. The misspelled name is part of the public API and is
+   * kept for compatibility.
+   */
+  public static final Comparator doumentNumberIntegerComparator = new Comparator() {
+    public int compare(Object o1, Object o2) {
+      InstantiatedTermDocumentInformation di = (InstantiatedTermDocumentInformation) o1;
+      Integer i = (Integer) o2;
+      return di.getDocument().getDocumentNumber().compareTo(i);
+    }
+  };
+
+
+  private byte[][] payloads;
+  private int[] termPositions;
+  private InstantiatedTerm term;
+  private InstantiatedDocument document;
+  private TermVectorOffsetInfo[] termOffsets;
+
+
+
+  /**
+   * @param term the term this information is about
+   * @param document the document the term occurs in
+   * @param termPositions the positions of the term in the document
+   * @param payloads one payload per entry in termPositions; entries may be null
+   */
+  public InstantiatedTermDocumentInformation(InstantiatedTerm term, InstantiatedDocument document, int[] termPositions, byte[][] payloads) {
+    this.term = term;
+    this.document = document;
+    this.termPositions = termPositions;
+    this.payloads = payloads;
+  }
+
+
+// not quite sure why I wanted this.
+//  /**
+//   * [Term]--- {0..* ordered} ->[Info]
+//   */
+//  private int indexFromTerm;
+
+
+//  public int getIndexFromTerm() {
+//    return indexFromTerm;
+//  }
+//
+//  void setIndexFromTerm(int indexFromTerm) {
+//    this.indexFromTerm = indexFromTerm;
+//  }
+
+
+  /** @return the internal positions array (not a copy) */
+  public int[] getTermPositions() {
+    return termPositions;
+  }
+
+
+  /** @return the internal payloads array (not a copy) */
+  public byte[][] getPayloads() {
+    return payloads;
+  }
+
+  public InstantiatedDocument getDocument() {
+    return document;
+  }
+
+
+
+  public InstantiatedTerm getTerm() {
+    return term;
+  }
+
+
+  void setTermPositions(int[] termPositions) {
+    this.termPositions = termPositions;
+  }
+
+
+  void setTerm(InstantiatedTerm term) {
+    this.term = term;
+  }
+
+  void setDocument(InstantiatedDocument document) {
+    this.document = document;
+  }
+
+  /** @return the term vector offsets, or null if they were never set */
+  public TermVectorOffsetInfo[] getTermOffsets() {
+    return termOffsets;
+  }
+
+  void setTermOffsets(TermVectorOffsetInfo[] termOffsets) {
+    this.termOffsets = termOffsets;
+  }
+}
diff --git a/contrib/instantiated/src/java/org/apache/lucene/store/instantiated/InstantiatedTermEnum.java b/contrib/instantiated/src/java/org/apache/lucene/store/instantiated/InstantiatedTermEnum.java
new file mode 100644
index 00000000000..742bd3b9e4e
--- /dev/null
+++ b/contrib/instantiated/src/java/org/apache/lucene/store/instantiated/InstantiatedTermEnum.java
@@ -0,0 +1,109 @@
+package org.apache.lucene.store.instantiated;
+
+/**
+ * Copyright 2006 The Apache Software Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import org.apache.lucene.index.Term;
+import org.apache.lucene.index.TermEnum;
+
+import java.io.IOException;
+import java.util.Arrays;
+
+/**
+ * A {@link org.apache.lucene.index.TermEnum} navigating an {@link org.apache.lucene.store.instantiated.InstantiatedIndexReader}.
+ */
+public class InstantiatedTermEnum
+    extends TermEnum {
+
+  private final InstantiatedIndexReader reader;
+
+  /**
+   * Creates an enumeration that starts at the first ordered term;
+   * no term is current until {@link #next()} or {@link #skipTo(Term)} succeeds.
+   */
+  public InstantiatedTermEnum(InstantiatedIndexReader reader) {
+    this.nextTermIndex = 0;
+    this.reader = reader;
+  }
+
+  /**
+   * Creates an enumeration and immediately advances it, so that the term at
+   * index startPosition of the ordered terms (if there is one) is current.
+   */
+  public InstantiatedTermEnum(InstantiatedIndexReader reader, int startPosition) {
+    this.reader = reader;
+    this.nextTermIndex = startPosition;
+    next();
+  }
+
+  /** Index into the ordered terms of the term that {@link #next()} will make current. */
+  private int nextTermIndex;
+  /** The current term; null until the enumeration has been advanced. */
+  private InstantiatedTerm term;
+
+  /**
+   * Increments the enumeration to the next element. True if one exists.
+   */
+  public boolean next() {
+    if (reader.getIndex().getOrderedTerms().length <= nextTermIndex) {
+      return false;
+    } else {
+      term = reader.getIndex().getOrderedTerms()[nextTermIndex];
+      nextTermIndex++;
+      return true;
+    }
+  }
+
+  /**
+   * Returns the current Term in the enumeration.
+   */
+  public Term term() {
+    return /*term == null ? null :*/ term.getTerm();
+  }
+
+  /**
+   * Returns the docFreq of the current Term in the enumeration.
+   */
+  public int docFreq() {
+    return term.getAssociatedDocuments().length;
+  }
+
+  /**
+   * Closes the enumeration to further activity, freeing resources.
+   */
+  public void close() {
+  }
+
+
+  /**
+   * Positions the enumeration at the given term or, if it does not exist,
+   * at the first term that sorts after it.
+   *
+   * @param target the term to seek
+   * @return true if a term was made current, false if the target sorts
+   *         after every term in the index (the enumeration is left unchanged)
+   */
+  public boolean skipTo(Term target) throws IOException {
+
+    // this method is not known to be used by anything
+    // in lucene for many years now, so there is
+    // very little to gain by optimizing this method more,
+
+    InstantiatedTerm exactMatch = reader.getIndex().findTerm(target);
+    if (exactMatch != null) {
+      this.term = exactMatch;
+      nextTermIndex = exactMatch.getTermIndex() + 1;
+      return true;
+    }
+
+    InstantiatedTerm[] orderedTerms = reader.getIndex().getOrderedTerms();
+    int pos = Arrays.binarySearch(orderedTerms, target, InstantiatedTerm.termComparator);
+    if (pos < 0) {
+      // not found: binarySearch returned (-(insertion point) - 1)
+      pos = -1 - pos;
+    }
+
+    // The insertion point equals the array length when the target sorts after
+    // every term, so the bound check must be inclusive to avoid reading past
+    // the end of the array.
+    if (pos >= orderedTerms.length) {
+      return false;
+    }
+    this.term = orderedTerms[pos];
+    nextTermIndex = pos + 1;
+    return true;
+  }
+}
+
+
+
diff --git a/contrib/instantiated/src/java/org/apache/lucene/store/instantiated/InstantiatedTermFreqVector.java b/contrib/instantiated/src/java/org/apache/lucene/store/instantiated/InstantiatedTermFreqVector.java
new file mode 100644
index 00000000000..9128b4b76af
--- /dev/null
+++ b/contrib/instantiated/src/java/org/apache/lucene/store/instantiated/InstantiatedTermFreqVector.java
@@ -0,0 +1,112 @@
+package org.apache.lucene.store.instantiated;
+
+import org.apache.lucene.index.TermFreqVector;
+
+import java.io.Serializable;
+import java.util.Arrays;
+import java.util.List;
+
+/**
+ * Copyright 2006 The Apache Software Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * Vector space view of a document in an {@link InstantiatedIndexReader}.
+ *
+ * @see org.apache.lucene.index.TermFreqVector
+ */
+public class InstantiatedTermFreqVector
+    implements TermFreqVector, Serializable {
+
+  private static final long serialVersionUID = 1L;
+
+  private final List<InstantiatedTermDocumentInformation> termDocumentInformations;
+  private final String field;
+  private final String terms[];
+  private final int termFrequencies[];
+
+  /**
+   * Builds the vector from the entries the document's vector space holds for the field.
+   *
+   * @param document the document to view
+   * @param field name of the field whose terms make up this vector
+   */
+  public InstantiatedTermFreqVector(InstantiatedDocument document, String field) {
+    this.field = field;
+    termDocumentInformations = document.getVectorSpace().get(field);
+    terms = new String[termDocumentInformations.size()];
+    termFrequencies = new int[termDocumentInformations.size()];
+
+    for (int i = 0; i < termDocumentInformations.size(); i++) {
+      InstantiatedTermDocumentInformation termDocumentInformation = termDocumentInformations.get(i);
+      terms[i] = termDocumentInformation.getTerm().text();
+      // the frequency of a term is the number of positions it occurs at
+      termFrequencies[i] = termDocumentInformation.getTermPositions().length;
+    }
+  }
+
+  /**
+   * @return The name of the field this vector is associated with
+   */
+  public String getField() {
+    return field;
+  }
+
+  public String toString() {
+    StringBuilder sb = new StringBuilder();
+    sb.append('{');
+    sb.append(field).append(": ");
+    if (terms != null) {
+      for (int i = 0; i < terms.length; i++) {
+        if (i > 0) sb.append(", ");
+        sb.append(terms[i]).append('/').append(termFrequencies[i]);
+      }
+    }
+    sb.append('}');
+
+    return sb.toString();
+  }
+
+  /** @return the number of terms in this vector */
+  public int size() {
+    return terms == null ? 0 : terms.length;
+  }
+
+  /** @return the internal terms array (not a copy) */
+  public String[] getTerms() {
+    return terms;
+  }
+
+  /** @return the internal frequencies array (not a copy), parallel to {@link #getTerms()} */
+  public int[] getTermFrequencies() {
+    return termFrequencies;
+  }
+
+  /**
+   * @param termText the text of the term to look up
+   * @return the index of the term in {@link #getTerms()}, or -1 if it is not present
+   */
+  public int indexOf(String termText) {
+    if (terms == null)
+      return -1;
+    // NOTE(review): binary search is only correct if the vector space hands the
+    // terms over in natural String order -- confirm against InstantiatedDocument.
+    int res = Arrays.binarySearch(terms, termText);
+    return res >= 0 ? res : -1;
+  }
+
+  /**
+   * Looks up len terms of termNumbers, beginning at start.
+   *
+   * @return an array of length len holding the {@link #indexOf(String)} result of each term
+   */
+  public int[] indexesOf(String[] termNumbers, int start, int len) {
+    // TODO: there must be a more efficient way of doing this.
+    //       At least, we could advance the lower bound of the terms array
+    //       as we find valid indices. Also, it might be possible to leverage
+    //       this even more by starting in the middle of the termNumbers array
+    //       and thus dividing the terms array maybe in half with each found index.
+    int res[] = new int[len];
+
+    for (int i = 0; i < len; i++) {
+      res[i] = indexOf(termNumbers[start + i]);
+    }
+    return res;
+  }
+
+  /** @return the internal list backing this vector, in the same order as {@link #getTerms()} */
+  public List<InstantiatedTermDocumentInformation> getTermDocumentInformations() {
+    return termDocumentInformations;
+  }
+
+}
diff --git a/contrib/instantiated/src/java/org/apache/lucene/store/instantiated/InstantiatedTermPositionVector.java b/contrib/instantiated/src/java/org/apache/lucene/store/instantiated/InstantiatedTermPositionVector.java
new file mode 100644
index 00000000000..56d8e02a2ed
--- /dev/null
+++ b/contrib/instantiated/src/java/org/apache/lucene/store/instantiated/InstantiatedTermPositionVector.java
@@ -0,0 +1,47 @@
+package org.apache.lucene.store.instantiated;
+
+/**
+ * Copyright 2006 The Apache Software Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import org.apache.lucene.index.TermPositionVector;
+import org.apache.lucene.index.TermVectorOffsetInfo;
+
+import java.io.Serializable;
+
+/**
+ * Extended vector space view of a document in an {@link InstantiatedIndexReader}.
+ *
+ * @see org.apache.lucene.index.TermPositionVector
+ */
+public class InstantiatedTermPositionVector
+    extends InstantiatedTermFreqVector
+    implements TermPositionVector, Serializable {
+
+  private static final long serialVersionUID = 1l;
+
+  /**
+   * @param document the document to view
+   * @param field name of the field whose terms make up this vector
+   */
+  public InstantiatedTermPositionVector(InstantiatedDocument document, String field) {
+    super(document, field);
+  }
+
+  /**
+   * @param index index of the term within this vector
+   * @return the positions stored for that term
+   */
+  public int[] getTermPositions(int index) {
+    InstantiatedTermDocumentInformation information = getTermDocumentInformations().get(index);
+    return information.getTermPositions();
+  }
+
+  /**
+   * @param index index of the term within this vector
+   * @return the offsets stored for that term
+   */
+  public TermVectorOffsetInfo[] getOffsets(int index) {
+    InstantiatedTermDocumentInformation information = getTermDocumentInformations().get(index);
+    return information.getTermOffsets();
+  }
+
+}
diff --git a/contrib/instantiated/src/java/org/apache/lucene/store/instantiated/InstantiatedTermPositions.java b/contrib/instantiated/src/java/org/apache/lucene/store/instantiated/InstantiatedTermPositions.java
new file mode 100644
index 00000000000..3af9283dec0
--- /dev/null
+++ b/contrib/instantiated/src/java/org/apache/lucene/store/instantiated/InstantiatedTermPositions.java
@@ -0,0 +1,100 @@
+package org.apache.lucene.store.instantiated;
+
+/**
+ * Copyright 2006 The Apache Software Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import org.apache.lucene.index.TermPositions;
+
+import java.io.IOException;
+
+/**
+ * A {@link org.apache.lucene.index.TermPositions} navigating an {@link InstantiatedIndexReader}.
+ */
+public class InstantiatedTermPositions
+    extends InstantiatedTermDocs
+    implements TermPositions {
+
+  /**
+   * @return the length of the payload stored at the current position
+   */
+  public int getPayloadLength() {
+    byte[][] payloads = currentDocumentInformation.getPayloads();
+    return payloads[currentTermPositionIndex].length;
+  }
+
+  /**
+   * Returns the payload stored at the current position.
+   *
+   * @param data   buffer to copy the payload into; may be null
+   * @param offset index in data at which the payload is written
+   * @return data if the payload fitted into it from offset on, otherwise the
+   *         internally stored payload array itself
+   */
+  public byte[] getPayload(byte[] data, int offset) throws IOException {
+    byte[] payload = currentDocumentInformation.getPayloads()[currentTermPositionIndex];
+
+    // a missing or too small buffer means the stored array is handed out as is
+    boolean fits = data != null && data.length - offset >= getPayloadLength();
+    if (!fits) {
+      return payload;
+    }
+
+    System.arraycopy(payload, 0, data, offset, payload.length);
+    return data;
+  }
+
+  /**
+   * @return true if a payload is stored at the current position
+   */
+  public boolean isPayloadAvailable() {
+    byte[] payload = currentDocumentInformation.getPayloads()[currentTermPositionIndex];
+    return payload != null;
+  }
+
+  public InstantiatedTermPositions(InstantiatedIndexReader reader) {
+    super(reader);
+  }
+
+  /**
+   * Returns next position in the current document. It is an error to call
+   * this more than {@link #freq()} times without calling {@link #next()};
+   * doing so runs past the end of the positions array.
+   * <p>
+   * This is invalid until {@link #next()} is called for the first time.
+   */
+  public int nextPosition() {
+    currentTermPositionIndex++;
+    int[] positions = currentDocumentInformation.getTermPositions();
+    return positions[currentTermPositionIndex];
+  }
+
+  /** Index of the current position within the current document; -1 before the first call to nextPosition(). */
+  private int currentTermPositionIndex;
+
+  /**
+   * Moves to the next pair in the enumeration and rewinds the position cursor.
+   * <p>
+   * Returns true if there is such a next pair in the enumeration.
+   */
+  @Override
+  public boolean next() {
+    currentTermPositionIndex = -1;
+    return super.next();
+  }
+
+  /**
+   * Skips entries to the first beyond the current whose document number is
+   * greater than or equal to <i>target</i>, and rewinds the position cursor.
+   * <p>
+   * Returns true iff there is such an entry.
+   * <p>
+   * Behaves as if written:
+   * <pre>
+   *   boolean skipTo(int target) {
+   *     do {
+   *       if (!next())
+   *         return false;
+   *     } while (target > doc());
+   *     return true;
+   *   }
+   * </pre>
+   * The actual work is delegated to the superclass.
+   */
+  @Override
+  public boolean skipTo(int target) {
+    currentTermPositionIndex = -1;
+    return super.skipTo(target);
+  }
+}
diff --git a/contrib/instantiated/src/java/org/apache/lucene/store/instantiated/package.html b/contrib/instantiated/src/java/org/apache/lucene/store/instantiated/package.html
new file mode 100644
index 00000000000..81dc96a5460
--- /dev/null
+++ b/contrib/instantiated/src/java/org/apache/lucene/store/instantiated/package.html
@@ -0,0 +1,90 @@
+
+
+
+
+  InstantiatedIndex
+
+
+

WARNING: This contrib is experimental and the APIs may change without warning.

+

Abstract

+ +

+ Represented as a coupled graph of class instances, this + all-in-memory index store implementation delivers search + results up to 100 times faster than the file-centric RAMDirectory + at the cost of greater RAM consumption. +

+ +

API

+ +

+ Just like the default store implementation, InstantiatedIndex + comes with an IndexReader and an IndexWriter. The latter shares + many method signatures with the file-centric IndexWriter. +

+ +

+ It is also possible to load the content of another index + by passing an IndexReader to the InstantiatedIndex constructor. +

+ +

Performance

+ +

+ At a few thousand documents of ~160 characters length + InstantiatedIndex outperforms RAMDirectory some 50x, + 15x at 100 documents of 2000 characters length, + and is linear to RAMDirectory at 10,000 documents of 2000 characters length. +

+ +

Mileage may vary depending on term saturation.

+ +

+ Populated with a single document InstantiatedIndex is almost, but not quite, as fast as MemoryIndex. +

+ +

+ It takes more or less the same time to populate an InstantiatedIndex + as it takes to populate a RAMDirectory. Hardly any effort has been put + into optimizing the InstantiatedIndexWriter; only minimizing the amount + of time needed to write-lock the index has been considered. +

+ +

Caveats

+
    +
  • No locks! Consider using InstantiatedIndex as if it was immutable.
  • +
  • No documents with fields containing readers!
  • +
  • Only FieldOption.All allowed by IndexReader#getFieldNames(FieldOption).
  • +
  • No field selection when retrieving documents, as all stored fields are available in memory.
  • +
+ +

Use cases

+ +

+ Could replace any small index that could do with a better response time: + a spell check a priori index, + the index of new documents exposed to user search agent queries, + an index used to compile classifiers in machine learning environments, etc. +

+ +

Class diagram

+class diagram +
+Diagram rendered using UMLet 7.1. + + diff --git a/contrib/instantiated/src/test/org/apache/lucene/store/instantiated/TestIndicesEquals.java b/contrib/instantiated/src/test/org/apache/lucene/store/instantiated/TestIndicesEquals.java new file mode 100644 index 00000000000..f331abb9ad6 --- /dev/null +++ b/contrib/instantiated/src/test/org/apache/lucene/store/instantiated/TestIndicesEquals.java @@ -0,0 +1,424 @@ +package org.apache.lucene.store.instantiated; +/** + * Copyright 2006 The Apache Software Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import junit.framework.TestCase; +import org.apache.lucene.analysis.Token; +import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.standard.StandardAnalyzer; +import org.apache.lucene.document.Document; +import org.apache.lucene.document.Field; +import org.apache.lucene.index.*; +import org.apache.lucene.store.Directory; +import org.apache.lucene.store.RAMDirectory; + +import java.io.IOException; +import java.util.*; + +/** + * Asserts equality of content and behaviour of two index readers. 
+ */
+public class TestIndicesEquals extends TestCase {
+
+//  public void test2() throws Exception {
+//    FSDirectory fsdir = FSDirectory.getDirectory("/tmp/fatcorpus");
+//    IndexReader ir = IndexReader.open(fsdir);
+//    InstantiatedIndex ii = new InstantiatedIndex(ir);
+//    ir.close();
+//    testEquals(fsdir, ii);
+//  }
+
+
+  /**
+   * Builds a small RAMDirectory index, loads it into an InstantiatedIndex
+   * through an IndexReader and asserts that both are equal.
+   */
+  public void testLoadIndexReader() throws Exception {
+    RAMDirectory dir = new RAMDirectory();
+
+    // create dir data
+    IndexWriter indexWriter = new IndexWriter(dir, new StandardAnalyzer(), true);
+    for (int i = 0; i < 5; i++) {
+      Document document = new Document();
+      assembleDocument(document, i);
+      indexWriter.addDocument(document);
+    }
+    indexWriter.close();
+
+    // test load ii from index reader
+    IndexReader ir = IndexReader.open(dir);
+    InstantiatedIndex ii = new InstantiatedIndex(ir);
+    ir.close();
+
+    testEquals(dir, ii);
+  }
+
+  /**
+   * Writes the same 500 documents through an IndexWriter and an
+   * InstantiatedIndexWriter and asserts that the resulting indices are equal.
+   */
+  public void testInstantiatedIndexWriter() throws Exception {
+
+
+    RAMDirectory dir = new RAMDirectory();
+    InstantiatedIndex ii = new InstantiatedIndex();
+
+    // create dir data
+    IndexWriter indexWriter = new IndexWriter(dir, new StandardAnalyzer(), true);
+    for (int i = 0; i < 500; i++) {
+      Document document = new Document();
+      assembleDocument(document, i);
+      indexWriter.addDocument(document);
+    }
+    indexWriter.close();
+
+    // test ii writer
+    InstantiatedIndexWriter instantiatedIndexWriter = ii.indexWriterFactory(new StandardAnalyzer(), true);
+    for (int i = 0; i < 500; i++) {
+      Document document = new Document();
+      assembleDocument(document, i);
+      instantiatedIndexWriter.addDocument(document);
+    }
+    instantiatedIndexWriter.close();
+
+    testEquals(dir, ii);
+
+    testTermDocs(dir, ii);
+
+
+  }
+
+
+  /**
+   * Runs the same mix of next() and skipTo() calls against both indices
+   * and asserts that they report the same results and documents.
+   */
+  private void testTermDocs(Directory aprioriIndex, InstantiatedIndex testIndex) throws Exception {
+
+    IndexReader aprioriReader = IndexReader.open(aprioriIndex);
+    IndexReader testReader = testIndex.indexReaderFactory();
+
+    TermEnum aprioriTermEnum = aprioriReader.terms(new Term("c", "danny"));
+
+    TermDocs aprioriTermDocs = aprioriReader.termDocs(aprioriTermEnum.term());
+    TermDocs testTermDocs = testReader.termDocs(aprioriTermEnum.term());
+
+    assertEquals(aprioriTermDocs.next(), testTermDocs.next());
+    assertEquals(aprioriTermDocs.doc(), testTermDocs.doc());
+
+    assertEquals(aprioriTermDocs.skipTo(100), testTermDocs.skipTo(100));
+    assertEquals(aprioriTermDocs.doc(), testTermDocs.doc());
+
+    assertEquals(aprioriTermDocs.next(), testTermDocs.next());
+    assertEquals(aprioriTermDocs.doc(), testTermDocs.doc());
+
+    assertEquals(aprioriTermDocs.next(), testTermDocs.next());
+    assertEquals(aprioriTermDocs.doc(), testTermDocs.doc());
+
+    assertEquals(aprioriTermDocs.skipTo(110), testTermDocs.skipTo(110));
+    assertEquals(aprioriTermDocs.doc(), testTermDocs.doc());
+
+    assertEquals(aprioriTermDocs.skipTo(10), testTermDocs.skipTo(10));
+    assertEquals(aprioriTermDocs.doc(), testTermDocs.doc());
+
+    assertEquals(aprioriTermDocs.skipTo(210), testTermDocs.skipTo(210));
+    assertEquals(aprioriTermDocs.doc(), testTermDocs.doc());
+
+    aprioriTermDocs.close();
+    // the term enumeration was previously never closed
+    aprioriTermEnum.close();
+    aprioriReader.close();
+
+    testTermDocs.close();
+    testReader.close();
+
+  }
+
+  /**
+   * Populates the document with fields; the larger i is, the more fields
+   * (with differing store, index and term vector settings) are added.
+   */
+  private void assembleDocument(Document document, int i) {
+    document.add(new Field("a", i + " Do you really want to go and live in that house all winter?", Field.Store.YES, Field.Index.TOKENIZED, Field.TermVector.WITH_POSITIONS_OFFSETS));
+    if (i > 0) {
+      document.add(new Field("b0", i + " All work and no play makes Jack a dull boy", Field.Store.YES, Field.Index.TOKENIZED, Field.TermVector.WITH_POSITIONS_OFFSETS));
+      document.add(new Field("b1", i + " All work and no play makes Jack a dull boy", Field.Store.YES, Field.Index.NO_NORMS, Field.TermVector.NO));
+      document.add(new Field("b2", i + " All work and no play makes Jack a dull boy", Field.Store.NO, Field.Index.UN_TOKENIZED, Field.TermVector.NO));
+      document.add(new Field("b3", i + " All work and no play makes Jack a dull boy", Field.Store.YES, Field.Index.NO, Field.TermVector.NO));
+      if (i > 1) {
+        document.add(new Field("c", i + " Redrum redrum", Field.Store.YES, Field.Index.TOKENIZED, Field.TermVector.WITH_POSITIONS_OFFSETS));
+        if (i > 2) {
+          document.add(new Field("d", i + " Hello Danny, come and play with us... forever and ever. and ever.", Field.Store.YES, Field.Index.TOKENIZED, Field.TermVector.WITH_POSITIONS_OFFSETS));
+          if (i > 3) {
+            Field f = new Field("e", i + " Heres Johnny!", Field.Store.YES, Field.Index.TOKENIZED, Field.TermVector.WITH_POSITIONS_OFFSETS);
+            f.setOmitNorms(true);
+            document.add(f);
+            if (i > 4) {
+              // a field fed from a hand made token stream, two of the tokens carry payloads
+              final List<Token> tokens = new ArrayList<Token>(2);
+              Token t = new Token("the", 0, 2, "text");
+              t.setPayload(new Payload(new byte[]{1, 2, 3}));
+              tokens.add(t);
+              t = new Token("end", 3, 5, "text");
+              t.setPayload(new Payload(new byte[]{2}));
+              tokens.add(t);
+              tokens.add(new Token("fin", 7, 9));
+              document.add(new Field("f", new TokenStream() {
+                Iterator<Token> it = tokens.iterator();
+
+                public Token next() throws IOException {
+                  if (!it.hasNext()) {
+                    return null;
+                  }
+                  return it.next();
+                }
+
+                public void reset() throws IOException {
+                  it = tokens.iterator();
+                }
+              }));
+            }
+          }
+        }
+      }
+    }
+  }
+
+
+  /**
+   * Asserts that the content of two index readers equal each other.
+   *
+   * @param aprioriIndex the index that is known to be correct
+   * @param testIndex    the index that is supposed to equal the apriori index.
+   * @throws Exception
+   */
+  protected void testEquals(Directory aprioriIndex, InstantiatedIndex testIndex) throws Exception {
+
+    IndexReader aprioriReader = IndexReader.open(aprioriIndex);
+    IndexReader testReader = testIndex.indexReaderFactory();
+
+    assertEquals(aprioriReader.numDocs(), testReader.numDocs());
+
+    for (Object field : aprioriReader.getFieldNames(IndexReader.FieldOption.ALL)) {
+
+      // test norms as used by normal use
+
+      byte[] aprioriNorms = aprioriReader.norms((String) field);
+      byte[] testNorms = testReader.norms((String) field);
+
+      assertEquals(aprioriNorms.length, testNorms.length);
+
+      for (int i = 0; i < aprioriNorms.length; i++) {
+        assertEquals("norms does not equals for field " + field + " in document " + i, aprioriNorms[i], testNorms[i]);
+      }
+
+      // test norms as used by multireader
+
+      aprioriNorms = new byte[aprioriReader.maxDoc()];
+      aprioriReader.norms((String) field, aprioriNorms, 0);
+
+      testNorms = new byte[testReader.maxDoc()];
+      testReader.norms((String) field, testNorms, 0);
+
+      assertEquals(aprioriNorms.length, testNorms.length);
+
+      for (int i = 0; i < aprioriNorms.length; i++) {
+        assertEquals("norms does not equals for field " + field + " in document " + i, aprioriNorms[i], testNorms[i]);
+      }
+
+
+    }
+
+    for (int docIndex = 0; docIndex < aprioriReader.numDocs(); docIndex++) {
+      assertEquals(aprioriReader.isDeleted(docIndex), testReader.isDeleted(docIndex));
+    }
+
+    // compare term enumeration stepping
+
+    TermEnum aprioriTermEnum = aprioriReader.terms();
+    TermEnum testTermEnum = testReader.terms();
+
+
+    while (true) {
+
+      if (!aprioriTermEnum.next()) {
+        assertFalse(testTermEnum.next());
+        break;
+      }
+      assertTrue(testTermEnum.next());
+
+      assertEquals(aprioriTermEnum.term(), testTermEnum.term());
+      assertTrue(aprioriTermEnum.docFreq() == testTermEnum.docFreq());
+
+      // compare termDocs seeking
+
+      TermDocs aprioriTermDocsSeeker = aprioriReader.termDocs(aprioriTermEnum.term());
+      TermDocs testTermDocsSeeker = testReader.termDocs(testTermEnum.term());
+
+      while (aprioriTermDocsSeeker.next()) {
+        assertTrue(testTermDocsSeeker.skipTo(aprioriTermDocsSeeker.doc()));
+        assertEquals(aprioriTermDocsSeeker.doc(), testTermDocsSeeker.doc());
+      }
+
+      aprioriTermDocsSeeker.close();
+      testTermDocsSeeker.close();
+
+      // compare documents per term
+
+      assertEquals(aprioriReader.docFreq(aprioriTermEnum.term()), testReader.docFreq(testTermEnum.term()));
+
+      TermDocs aprioriTermDocs = aprioriReader.termDocs(aprioriTermEnum.term());
+      TermDocs testTermDocs = testReader.termDocs(testTermEnum.term());
+
+      while (true) {
+        if (!aprioriTermDocs.next()) {
+          assertFalse(testTermDocs.next());
+          break;
+        }
+        assertTrue(testTermDocs.next());
+
+        assertEquals(aprioriTermDocs.doc(), testTermDocs.doc());
+        assertEquals(aprioriTermDocs.freq(), testTermDocs.freq());
+      }
+
+      aprioriTermDocs.close();
+      testTermDocs.close();
+
+      // compare term positions
+
+      TermPositions testTermPositions = testReader.termPositions(testTermEnum.term());
+      TermPositions aprioriTermPositions = aprioriReader.termPositions(aprioriTermEnum.term());
+
+      if (aprioriTermPositions != null) {
+
+        for (int docIndex = 0; docIndex < aprioriReader.maxDoc(); docIndex++) {
+          if (!aprioriTermPositions.next()) {
+            // the index under test must be exhausted at the same point
+            assertFalse(testTermPositions.next());
+            break;
+          }
+          assertTrue(testTermPositions.next());
+
+          assertEquals(aprioriTermPositions.freq(), testTermPositions.freq());
+
+
+          for (int termPositionIndex = 0; termPositionIndex < aprioriTermPositions.freq(); termPositionIndex++) {
+            int aprioriPos = aprioriTermPositions.nextPosition();
+            int testPos = testTermPositions.nextPosition();
+
+            assertEquals(aprioriPos, testPos);
+
+
+            assertEquals(aprioriTermPositions.isPayloadAvailable(), testTermPositions.isPayloadAvailable());
+            if (aprioriTermPositions.isPayloadAvailable()) {
+              assertEquals(aprioriTermPositions.getPayloadLength(), testTermPositions.getPayloadLength());
+              byte[] aprioriPayloads = aprioriTermPositions.getPayload(new byte[aprioriTermPositions.getPayloadLength()], 0);
+              byte[] testPayloads = testTermPositions.getPayload(new byte[testTermPositions.getPayloadLength()], 0);
+              for (int i = 0; i < aprioriPayloads.length; i++) {
+                assertEquals(aprioriPayloads[i], testPayloads[i]);
+              }
+            }
+
+          }
+        }
+
+        aprioriTermPositions.close();
+        testTermPositions.close();
+
+      }
+    }
+
+    // the stepping enumerations are not used below; close them before the
+    // variable is reused so that no enumeration is leaked or closed twice
+    aprioriTermEnum.close();
+    testTermEnum.close();
+
+    // compare term enumeration seeking
+
+    aprioriTermEnum = aprioriReader.terms();
+
+    TermEnum aprioriTermEnumSeeker = aprioriReader.terms();
+    TermEnum testTermEnumSeeker = testReader.terms();
+
+    while (aprioriTermEnum.next()) {
+      if (aprioriTermEnumSeeker.skipTo(aprioriTermEnum.term())) {
+        assertTrue(testTermEnumSeeker.skipTo(aprioriTermEnum.term()));
+        assertEquals(aprioriTermEnumSeeker.term(), testTermEnumSeeker.term());
+      } else {
+        assertFalse(testTermEnumSeeker.skipTo(aprioriTermEnum.term()));
+      }
+    }
+
+    aprioriTermEnum.close();
+    aprioriTermEnumSeeker.close();
+    testTermEnumSeeker.close();
+
+    // skip to non existing terms
+
+    aprioriTermEnumSeeker = aprioriReader.terms();
+    testTermEnumSeeker = testReader.terms();
+
+    aprioriTermEnum = aprioriReader.terms();
+    aprioriTermEnum.next();
+    Term nonExistingTerm = new Term(aprioriTermEnum.term().field(), "bzzzzoo993djdj380sdf");
+    aprioriTermEnum.close();
+
+    assertEquals(aprioriTermEnumSeeker.skipTo(nonExistingTerm), testTermEnumSeeker.skipTo(nonExistingTerm));
+    assertEquals(aprioriTermEnumSeeker.term(), testTermEnumSeeker.term());
+
+    aprioriTermEnumSeeker.close();
+    testTermEnumSeeker.close();
+
+    // compare term vectors and position vectors
+
+    // orders the vectors by field name so that both arrays line up
+    Comparator<TermFreqVector> byField = new Comparator<TermFreqVector>() {
+      public int compare(TermFreqVector termFreqVector, TermFreqVector termFreqVector1) {
+        return termFreqVector.getField().compareTo(termFreqVector1.getField());
+      }
+    };
+
+    for (int documentNumber = 0; documentNumber < aprioriReader.numDocs(); documentNumber++) {
+
+      if (documentNumber > 0) {
+        assertNotNull(aprioriReader.getTermFreqVector(documentNumber, "b0"));
+        assertNull(aprioriReader.getTermFreqVector(documentNumber, "b1"));
+
+        assertNotNull(testReader.getTermFreqVector(documentNumber, "b0"));
+        assertNull(testReader.getTermFreqVector(documentNumber, "b1"));
+
+      }
+
+      TermFreqVector[] aprioriFreqVectors = aprioriReader.getTermFreqVectors(documentNumber);
+      TermFreqVector[] testFreqVectors = testReader.getTermFreqVectors(documentNumber);
+
+      if (aprioriFreqVectors != null && testFreqVectors != null) {
+
+        Arrays.sort(aprioriFreqVectors, byField);
+        Arrays.sort(testFreqVectors, byField);
+
+        assertEquals("document " + documentNumber + " vectors does not match", aprioriFreqVectors.length, testFreqVectors.length);
+
+        for (int freqVectorIndex = 0; freqVectorIndex < aprioriFreqVectors.length; freqVectorIndex++) {
+          assertTrue(Arrays.equals(aprioriFreqVectors[freqVectorIndex].getTermFrequencies(), testFreqVectors[freqVectorIndex].getTermFrequencies()));
+          assertTrue(Arrays.equals(aprioriFreqVectors[freqVectorIndex].getTerms(), testFreqVectors[freqVectorIndex].getTerms()));
+
+          if (aprioriFreqVectors[freqVectorIndex] instanceof TermPositionVector) {
+            TermPositionVector aprioriTermPositionVector = (TermPositionVector) aprioriFreqVectors[freqVectorIndex];
+            TermPositionVector testTermPositionVector = (TermPositionVector) testFreqVectors[freqVectorIndex];
+
+            for (int positionVectorIndex = 0; positionVectorIndex < aprioriFreqVectors[freqVectorIndex].getTerms().length; positionVectorIndex++)
+            {
+              if (aprioriTermPositionVector.getOffsets(positionVectorIndex) != null) {
+                assertTrue(Arrays.equals(aprioriTermPositionVector.getOffsets(positionVectorIndex), testTermPositionVector.getOffsets(positionVectorIndex)));
+              }
+
+              if (aprioriTermPositionVector.getTermPositions(positionVectorIndex) != null) {
+                assertTrue(Arrays.equals(aprioriTermPositionVector.getTermPositions(positionVectorIndex), testTermPositionVector.getTermPositions(positionVectorIndex)));
+              }
+            }
+          }
+
+        }
+      }
+
+    }
+
+    aprioriReader.close();
+    testReader.close();
+  }
+
+}