LUCENE-550: Added RAMDirectory alternative as a contrib. Similar to MemoryIndex, but can hold more than one document

git-svn-id: https://svn.apache.org/repos/asf/lucene/java/trunk@636745 13f79535-47bb-0310-9956-ffa450edef68
Grant Ingersoll 2008-03-13 12:34:30 +00:00
parent b36d9f9be3
commit f4cc6e2269
18 changed files with 2903 additions and 1 deletion
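For orientation, here is a minimal usage sketch of the new contrib (illustrative only, not one of the committed files). It assumes the 2.3-era core API (StandardAnalyzer, Field.Index.TOKENIZED, the Hits search API) plus the indexWriterFactory/indexReaderFactory methods defined on InstantiatedIndex below:

import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.Hits;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.store.instantiated.InstantiatedIndex;
import org.apache.lucene.store.instantiated.InstantiatedIndexWriter;

public class InstantiatedIndexExample {
  public static void main(String[] args) throws Exception {
    // an empty all-in-memory index, populated through its own writer
    InstantiatedIndex index = new InstantiatedIndex();
    InstantiatedIndexWriter writer = index.indexWriterFactory(new StandardAnalyzer(), true);
    Document doc = new Document();
    doc.add(new Field("body", "hello instantiated world", Field.Store.YES, Field.Index.TOKENIZED));
    writer.addDocument(doc);
    writer.close(); // close() commits the buffered document

    // searched through the ordinary IndexSearcher/IndexReader API
    IndexSearcher searcher = new IndexSearcher(index.indexReaderFactory());
    Hits hits = searcher.search(new TermQuery(new Term("body", "hello")));
    System.out.println(hits.length() + " hit(s)");
    searcher.close();
  }
}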

View File

@@ -458,7 +458,10 @@ New features
    to consider token positions when creating PhraseQuery
    and MultiPhraseQuery. Disabled by default (so by default
    the query parser ignores position increments).
    (Doron Cohen)

13. LUCENE-550: Added InstantiatedIndex implementation. Experimental
    Index store similar to MemoryIndex but allows for multiple documents
    in memory. (Karl Wettin via Grant Ingersoll)

Optimizations

View File

@@ -0,0 +1,32 @@
<?xml version="1.0"?>
<!--
Licensed to the Apache Software Foundation (ASF) under one or more
contributor license agreements. See the NOTICE file distributed with
this work for additional information regarding copyright ownership.
The ASF licenses this file to You under the Apache License, Version 2.0
(the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
-->
<project name="instantiated" default="default">
<description>
InstantiatedIndex, an alternative RAM store.
</description>
<property name="javac.source" value="1.5" />
<property name="javac.target" value="1.5" />
<import file="../contrib-build.xml"/>
</project>

Binary file not shown.


View File

@@ -0,0 +1,50 @@
<?xml version="1.0" encoding="UTF-8"?><umlet_diagram><help_text>//Uncomment the following line to change the fontsize:
//fontsize=14
//Welcome to UMLet!
// *Double-click on UML elements to add them to the diagram.
// *Edit element properties by modifying the text in this panel.
// *Edit the files in the 'palettes' directory to store your own element palettes.
// *Press Del or Backspace to remove elements from the diagram.
// *Hold down Ctrl key to select multiple elements.
// *Press c to copy the UML diagram to the system clipboard.
// * This text will be stored with each diagram. Feel free to use the area for notes.
</help_text><element><type>com.umlet.element.base.Class</type><coordinates><x>310</x><y>540</y><w>310</w><h>110</h></coordinates><panel_attributes>bg=#eeeeee
fg=#000000
InstantiatedTermDocumentInformation
--
+payloads:byte[][]
+termPositions:int[]
+termOffsets:TermVectorOffsetInfo[]
+indexFromTerm:int
--</panel_attributes><additional_attributes></additional_attributes></element><element><type>com.umlet.element.base.Relation</type><coordinates><x>460</x><y>380</y><w>40</w><h>180</h></coordinates><panel_attributes>lt=.</panel_attributes><additional_attributes>20;20;20;160</additional_attributes></element><element><type>com.umlet.element.base.Relation</type><coordinates><x>460</x><y>40</y><w>61</w><h>160</h></coordinates><panel_attributes>lt=&lt;-
q2=field
m1=0..1</panel_attributes><additional_attributes>30;20;30;140</additional_attributes></element><element><type>com.umlet.element.base.Class</type><coordinates><x>430</x><y>30</y><w>120</w><h>30</h></coordinates><panel_attributes>bg=#eeeeee
fg=#000099
_norm: byte[][]_</panel_attributes><additional_attributes></additional_attributes></element><element><type>com.umlet.element.base.Class</type><coordinates><x>80</x><y>390</y><w>100</w><h>30</h></coordinates><panel_attributes>bg=#eeeeee
fg=#000099
Term</panel_attributes><additional_attributes></additional_attributes></element><element><type>com.umlet.element.base.Relation</type><coordinates><x>770</x><y>380</y><w>120</w><h>40</h></coordinates><panel_attributes>lt=-&gt;
m2=1</panel_attributes><additional_attributes>20;20;100;20</additional_attributes></element><element><type>com.umlet.element.base.Class</type><coordinates><x>870</x><y>390</y><w>100</w><h>30</h></coordinates><panel_attributes>bg=#eeeeee
fg=#000099
Document</panel_attributes><additional_attributes></additional_attributes></element><element><type>com.umlet.element.base.Class</type><coordinates><x>590</x><y>370</y><w>200</w><h>60</h></coordinates><panel_attributes>bg=#eeeeee
fg=#000000
InstantiatedDocument
--
+documentNumber:int
--</panel_attributes><additional_attributes></additional_attributes></element><element><type>com.umlet.element.base.Relation</type><coordinates><x>520</x><y>190</y><w>170</w><h>200</h></coordinates><panel_attributes>lt=&lt;-
m1=0..*
&lt;&lt;ordered&gt;&gt;</panel_attributes><additional_attributes>150;180;20;20</additional_attributes></element><element><type>com.umlet.element.base.Relation</type><coordinates><x>290</x><y>190</y><w>140</w><h>220</h></coordinates><panel_attributes>lt=&lt;-
m1=0..*
&lt;&lt;ordered&gt;&gt;</panel_attributes><additional_attributes>20;200;120;20</additional_attributes></element><element><type>com.umlet.element.base.Class</type><coordinates><x>380</x><y>180</y><w>200</w><h>30</h></coordinates><panel_attributes>bg=#eeeeee
fg=#000000
InstantiatedIndex</panel_attributes><additional_attributes></additional_attributes></element><element><type>com.umlet.element.base.Relation</type><coordinates><x>160</x><y>380</y><w>110</w><h>40</h></coordinates><panel_attributes>lt=-&gt;
m2=1</panel_attributes><additional_attributes>90;20;20;20</additional_attributes></element><element><type>com.umlet.element.base.Class</type><coordinates><x>250</x><y>390</y><w>160</w><h>30</h></coordinates><panel_attributes>bg=#eeeeee
fg=#000000
InstantiatedTerm
</panel_attributes><additional_attributes></additional_attributes></element><element><type>com.umlet.element.base.Relation</type><coordinates><x>380</x><y>190</y><w>146</w><h>220</h></coordinates><panel_attributes>lt=&lt;-
q2=field, term
m1=0..1</panel_attributes><additional_attributes>20;200;100;20</additional_attributes></element><element><type>com.umlet.element.base.Relation</type><coordinates><x>390</x><y>380</y><w>220</w><h>40</h></coordinates><panel_attributes>lt=-
q2=field
m2=0..*
m1=0..*</panel_attributes><additional_attributes>20;20;200;20</additional_attributes></element></umlet_diagram>

View File

@@ -0,0 +1,50 @@
<project xmlns="http://maven.apache.org/POM/4.0.0"
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/maven-v4_0_0.xsd">
<!--
Licensed to the Apache Software Foundation (ASF) under one
or more contributor license agreements. See the NOTICE file
distributed with this work for additional information
regarding copyright ownership. The ASF licenses this file
to you under the Apache License, Version 2.0 (the
"License"); you may not use this file except in compliance
with the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing,
software distributed under the License is distributed on an
"AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
KIND, either express or implied. See the License for the
specific language governing permissions and limitations
under the License.
-->
<modelVersion>4.0.0</modelVersion>
<parent>
<groupId>org.apache.lucene</groupId>
<artifactId>lucene-contrib</artifactId>
<version>@version@</version>
</parent>
<groupId>org.apache.lucene</groupId>
<artifactId>lucene-instantiated</artifactId>
<name>Lucene InstantiatedIndex</name>
<version>@version@</version>
<description>InstantiatedIndex, alternative RAM store for small corpora.</description>
<packaging>jar</packaging>
<build>
<plugins>
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-compiler-plugin</artifactId>
<configuration>
<source>1.5</source>
<target>1.5</target>
</configuration>
</plugin>
</plugins>
</build>
</project>

View File

@@ -0,0 +1,79 @@
package org.apache.lucene.store.instantiated;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import org.apache.lucene.document.Document;
import java.io.Serializable;
import java.util.List;
import java.util.Map;
/**
* A document in the instantiated index object graph, optionally coupled to the vector space view.
*
* @see org.apache.lucene.document.Document
*/
public class InstantiatedDocument
implements Serializable {
private static final long serialVersionUID = 1L;
private Document document;
public InstantiatedDocument() {
this.document = new Document();
}
public InstantiatedDocument(Document document) {
this.document = document;
}
/** this is the unsafe index order document number. */
private Integer documentNumber;
/** this is the term vector space view */
private Map<String /*field name*/, List<InstantiatedTermDocumentInformation>> vectorSpace;
/**
* @return position of document in the index.
*/
public Integer getDocumentNumber() {
return documentNumber;
}
void setDocumentNumber(Integer documentNumber) {
this.documentNumber = documentNumber;
}
public Map</*field name*/ String, List<InstantiatedTermDocumentInformation>> getVectorSpace() {
return vectorSpace;
}
public void setVectorSpace(Map</*field name*/ String, List<InstantiatedTermDocumentInformation>> vectorSpace) {
this.vectorSpace = vectorSpace;
}
public Document getDocument() {
return document;
}
public String toString() {
return document.toString();
}
}
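A short illustrative fragment (not from the commit; assumes org.apache.lucene.document.Field is also imported) showing why getDocumentNumber() returns an Integer rather than an int: the number stays unassigned until an InstantiatedIndexWriter commits the document.

Document doc = new Document();
doc.add(new Field("title", "hello", Field.Store.YES, Field.Index.TOKENIZED));
InstantiatedDocument wrapped = new InstantiatedDocument(doc);
// not yet bound to an index; a writer assigns the number at commit time
assert wrapped.getDocumentNumber() == null;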

View File

@@ -0,0 +1,274 @@
package org.apache.lucene.store.instantiated;
/**
* Copyright 2006 The Apache Software Foundation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.index.*;
import java.io.IOException;
import java.io.Serializable;
import java.util.*;
/**
* Represented as a coupled graph of class instances, this
* all-in-memory index store implementation delivers search
* results up to 100 times faster than the file-centric RAMDirectory
* at the cost of greater RAM consumption.
*
* WARNING: This contrib is experimental and the APIs may change without warning.
*
* There are no read or write locks in this store.
* {@link InstantiatedIndexReader#isCurrent()} always returns true,
* and an {@link org.apache.lucene.store.instantiated.InstantiatedIndexWriter}
* will update instances of the object graph in memory
* at the same time as a searcher is reading from it.
*
* Consider using InstantiatedIndex as if it were immutable.
*/
public class InstantiatedIndex
implements Serializable {
private static final long serialVersionUID = 1L;
private long version = System.currentTimeMillis();
private InstantiatedDocument[] documentsByNumber;
/** todo: this should be a BitSet */
private Set<Integer> deletedDocuments;
private Map<String, Map<String, InstantiatedTerm>> termsByFieldAndText;
private InstantiatedTerm[] orderedTerms;
private Map<String, byte[]> normsByFieldNameAndDocumentNumber;
/**
* Creates an empty instantiated index for you to fill with data using an {@link org.apache.lucene.store.instantiated.InstantiatedIndexWriter}.
*/
public InstantiatedIndex() {
initialize();
}
void initialize() {
// todo: clear index without losing memory (uncouple stuff)
termsByFieldAndText = new HashMap<String, Map<String, InstantiatedTerm>>();
orderedTerms = new InstantiatedTerm[0];
documentsByNumber = new InstantiatedDocument[0];
normsByFieldNameAndDocumentNumber = new HashMap<String, byte[]>();
deletedDocuments = new HashSet<Integer>();
}
/**
* Creates a new instantiated index that looks just like the index in a specific state as represented by a reader.
*
* @param sourceIndexReader the source index this new instantiated index will be copied from.
* @throws IOException if the source index is not optimized, or when accessing the source.
*/
public InstantiatedIndex(IndexReader sourceIndexReader) throws IOException {
this(sourceIndexReader, null);
}
/**
* Creates a new instantiated index that looks just like the index in a specific state as represented by a reader.
*
* @param sourceIndexReader the source index this new instantiated index will be copied from.
* @param fields fields to be added, or null for all
* @throws IOException if the source index is not optimized, or when accessing the source.
*/
public InstantiatedIndex(IndexReader sourceIndexReader, Set<String> fields) throws IOException {
if (!sourceIndexReader.isOptimized()) {
throw new IOException("Source index is not optimized.");
}
Collection<String> allFieldNames = sourceIndexReader.getFieldNames(IndexReader.FieldOption.ALL);
initialize();
documentsByNumber = new InstantiatedDocument[sourceIndexReader.numDocs()];
// create documents
for (int i = 0; i < sourceIndexReader.numDocs(); i++) {
if (!sourceIndexReader.isDeleted(i)) {
InstantiatedDocument document = new InstantiatedDocument();
// copy stored fields from source reader
Document sourceDocument = sourceIndexReader.document(i);
for (Field field : (List<Field>) sourceDocument.getFields()) {
if (fields == null || fields.contains(field.name())) {
document.getDocument().add(field);
}
}
document.setDocumentNumber(i);
documentsByNumber[i] = document;
for (Field field : (List<Field>) document.getDocument().getFields()) {
if (fields == null || fields.contains(field.name())) {
if (field.isTermVectorStored()) {
if (document.getVectorSpace() == null) {
document.setVectorSpace(new HashMap<String, List<InstantiatedTermDocumentInformation>>());
}
document.getVectorSpace().put(field.name(), new ArrayList<InstantiatedTermDocumentInformation>());
}
}
}
}
}
// create norms
for (String fieldName : allFieldNames) {
if (fields == null || fields.contains(fieldName)) {
getNormsByFieldNameAndDocumentNumber().put(fieldName, sourceIndexReader.norms(fieldName));
}
}
// create terms
for (String fieldName : allFieldNames) {
if (fields == null || fields.contains(fieldName)) {
getTermsByFieldAndText().put(fieldName, new HashMap<String, InstantiatedTerm>(5000));
}
}
List<InstantiatedTerm> terms = new ArrayList<InstantiatedTerm>(5000 * getTermsByFieldAndText().size());
TermEnum termEnum = sourceIndexReader.terms();
while (termEnum.next()) {
if (fields == null || fields.contains(termEnum.term().field())) { // todo skipto if not using field
InstantiatedTerm instantiatedTerm = new InstantiatedTerm(termEnum.term().field(), termEnum.term().text());
getTermsByFieldAndText().get(termEnum.term().field()).put(termEnum.term().text(), instantiatedTerm);
instantiatedTerm.setTermIndex(terms.size());
terms.add(instantiatedTerm);
instantiatedTerm.setAssociatedDocuments(new InstantiatedTermDocumentInformation[termEnum.docFreq()]);
}
}
termEnum.close();
orderedTerms = terms.toArray(new InstantiatedTerm[terms.size()]);
// create term-document informations
for (InstantiatedTerm term : orderedTerms) {
TermPositions termPositions = sourceIndexReader.termPositions(term.getTerm());
int position = 0;
while (termPositions.next()) {
InstantiatedDocument document = documentsByNumber[termPositions.doc()];
byte[][] payloads = new byte[termPositions.freq()][];
int[] positions = new int[termPositions.freq()];
for (int i = 0; i < termPositions.freq(); i++) {
positions[i] = termPositions.nextPosition();
if (termPositions.isPayloadAvailable()) {
payloads[i] = new byte[termPositions.getPayloadLength()];
termPositions.getPayload(payloads[i], 0);
}
}
InstantiatedTermDocumentInformation termDocumentInformation = new InstantiatedTermDocumentInformation(term, document, positions, payloads);
term.getAssociatedDocuments()[position++] = termDocumentInformation;
if (document.getVectorSpace() != null
&& document.getVectorSpace().containsKey(term.field())) {
document.getVectorSpace().get(term.field()).add(termDocumentInformation);
}
// termDocumentInformation.setIndexFromTerm(indexFromTerm++);
}
}
// load offsets to term-document informations
for (InstantiatedDocument document : getDocumentsByNumber()) {
for (Field field : (List<Field>) document.getDocument().getFields()) {
if (field.isTermVectorStored() && field.isStoreOffsetWithTermVector()) {
TermPositionVector termPositionVector = (TermPositionVector) sourceIndexReader.getTermFreqVector(document.getDocumentNumber(), field.name());
if (termPositionVector != null) {
for (int i = 0; i < termPositionVector.getTerms().length; i++) {
String token = termPositionVector.getTerms()[i];
InstantiatedTerm term = findTerm(field.name(), token);
InstantiatedTermDocumentInformation termDocumentInformation = term.getAssociatedDocument(document.getDocumentNumber());
termDocumentInformation.setTermOffsets(termPositionVector.getOffsets(i));
}
}
}
}
}
}
public InstantiatedIndexWriter indexWriterFactory(Analyzer analyzer, boolean create) throws IOException {
return new InstantiatedIndexWriter(this, analyzer, create);
}
public InstantiatedIndexReader indexReaderFactory() throws IOException {
return new InstantiatedIndexReader(this);
}
public void close() throws IOException {
// todo: decouple everything
}
InstantiatedTerm findTerm(Term term) {
return findTerm(term.field(), term.text());
}
InstantiatedTerm findTerm(String field, String text) {
Map<String, InstantiatedTerm> termsByField = termsByFieldAndText.get(field);
if (termsByField == null) {
return null;
} else {
return termsByField.get(text);
}
}
public Map<String, Map<String, InstantiatedTerm>> getTermsByFieldAndText() {
return termsByFieldAndText;
}
public InstantiatedTerm[] getOrderedTerms() {
return orderedTerms;
}
public InstantiatedDocument[] getDocumentsByNumber() {
return documentsByNumber;
}
public Map<String, byte[]> getNormsByFieldNameAndDocumentNumber() {
return normsByFieldNameAndDocumentNumber;
}
void setNormsByFieldNameAndDocumentNumber(Map<String, byte[]> normsByFieldNameAndDocumentNumber) {
this.normsByFieldNameAndDocumentNumber = normsByFieldNameAndDocumentNumber;
}
public Set<Integer> getDeletedDocuments() {
return deletedDocuments;
}
void setOrderedTerms(InstantiatedTerm[] orderedTerms) {
this.orderedTerms = orderedTerms;
}
void setDocumentsByNumber(InstantiatedDocument[] documentsByNumber) {
this.documentsByNumber = documentsByNumber;
}
public long getVersion() {
return version;
}
void setVersion(long version) {
this.version = version;
}
}
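A sketch of the copy constructor in use (illustrative, not part of the commit; the filesystem path is hypothetical). As shown above, the source reader must be optimized, or the constructor throws an IOException:

import org.apache.lucene.index.IndexReader;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.store.instantiated.InstantiatedIndex;

public class LoadIntoMemoryExample {
  public static void main(String[] args) throws Exception {
    IndexReader source = IndexReader.open("/path/to/optimized/index"); // hypothetical location
    InstantiatedIndex index = new InstantiatedIndex(source); // pass a field set to copy only a subset
    source.close();
    // the in-memory copy is searched like any other index
    IndexSearcher searcher = new IndexSearcher(index.indexReaderFactory());
    searcher.close();
  }
}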

View File

@@ -0,0 +1,326 @@
package org.apache.lucene.store.instantiated;
/**
* Copyright 2006 The Apache Software Foundation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import org.apache.lucene.document.Document;
import org.apache.lucene.document.FieldSelector;
import org.apache.lucene.index.*;
import org.apache.lucene.store.Directory;
import java.io.IOException;
import java.util.*;
/**
* An InstantiatedIndexReader is not a snapshot in time;
* it is completely in sync with the latest commit to the store!
*
* Consider using InstantiatedIndex as if it were immutable.
*/
public class InstantiatedIndexReader
extends IndexReader {
private final InstantiatedIndex index;
public InstantiatedIndexReader(InstantiatedIndex index) {
super();
this.index = index;
}
/**
* @return always true.
*/
public boolean isOptimized() {
return true;
}
/**
* An InstantiatedIndexReader is not a snapshot in time;
* it is completely in sync with the latest commit to the store!
*
* @return output from {@link InstantiatedIndex#getVersion()} of the associated instantiated index.
*/
public long getVersion() {
return index.getVersion();
}
public Directory directory() {
throw new UnsupportedOperationException();
}
/**
* An InstantiatedIndexReader is always current!
*
* Check whether this IndexReader is still using the
* current (i.e., most recently committed) version of the
* index. If a writer has committed any changes to the
* index since this reader was opened, this will return
* <code>false</code>, in which case you must open a new
* IndexReader in order to see the changes. See the
* description of the <a href="IndexWriter.html#autoCommit"><code>autoCommit</code></a>
* flag which controls when the {@link IndexWriter}
* actually commits changes to the index.
*
* @return always true
* @throws CorruptIndexException if the index is corrupt
* @throws IOException if there is a low-level IO error
*/
public boolean isCurrent() throws IOException {
return true;
}
public InstantiatedIndex getIndex() {
return index;
}
private Set<InstantiatedDocument> deletedDocuments = new HashSet<InstantiatedDocument>();
private Set<Integer> deletedDocumentNumbers = new HashSet<Integer>();
private Map<String, List<NormUpdate>> updatedNormsByFieldNameAndDocumentNumber = null;
private class NormUpdate {
private int doc;
private byte value;
public NormUpdate(int doc, byte value) {
this.doc = doc;
this.value = value;
}
}
public int numDocs() {
return getIndex().getDocumentsByNumber().length - index.getDeletedDocuments().size() - deletedDocuments.size();
}
public int maxDoc() {
return getIndex().getDocumentsByNumber().length;
}
public boolean isDeleted(int n) {
return getIndex().getDeletedDocuments().contains(n) || deletedDocumentNumbers.contains(n);
}
public boolean hasDeletions() {
return getIndex().getDeletedDocuments().size() > 0 || deletedDocumentNumbers.size() > 0;
}
protected void doDelete(int docNum) throws IOException {
if (!getIndex().getDeletedDocuments().contains(docNum)) {
if (deletedDocumentNumbers.add(docNum)) {
deletedDocuments.add(getIndex().getDocumentsByNumber()[docNum]);
}
}
}
protected void doUndeleteAll() throws IOException {
deletedDocumentNumbers.clear();
deletedDocuments.clear();
}
protected void doCommit() throws IOException {
// todo: read/write lock
boolean updated = false;
// 1. update norms
if (updatedNormsByFieldNameAndDocumentNumber != null) {
for (Map.Entry<String, List<NormUpdate>> e : updatedNormsByFieldNameAndDocumentNumber.entrySet()) {
byte[] norms = getIndex().getNormsByFieldNameAndDocumentNumber().get(e.getKey());
for (NormUpdate normUpdate : e.getValue()) {
norms[normUpdate.doc] = normUpdate.value;
}
}
updatedNormsByFieldNameAndDocumentNumber = null;
updated = true;
}
// 2. remove deleted documents
if (deletedDocumentNumbers.size() > 0) {
for (Integer doc : deletedDocumentNumbers) {
getIndex().getDeletedDocuments().add(doc);
}
deletedDocumentNumbers.clear();
deletedDocuments.clear();
updated = true;
}
// todo unlock read/writelock
}
protected void doClose() throws IOException {
// ignored
}
public Collection getFieldNames(FieldOption fldOption) {
if (fldOption != FieldOption.ALL) {
throw new IllegalArgumentException("Only FieldOption.ALL implemented."); // todo
}
return new ArrayList<String>(getIndex().getTermsByFieldAndText().keySet());
}
/**
* This implementation ignores the field selector! All fields are always returned
*
* Get the {@link org.apache.lucene.document.Document} at the <code>n</code><sup>th</sup> position.
*
* @param n Get the document at the <code>n</code><sup>th</sup> position
* @param fieldSelector ignored
* @return The stored fields of the {@link org.apache.lucene.document.Document} at the nth position
* @throws CorruptIndexException if the index is corrupt
* @throws IOException if there is a low-level IO error
*
* @see org.apache.lucene.document.Fieldable
* @see org.apache.lucene.document.FieldSelector
* @see org.apache.lucene.document.SetBasedFieldSelector
* @see org.apache.lucene.document.LoadFirstFieldSelector
*/
public Document document(int n, FieldSelector fieldSelector) throws CorruptIndexException, IOException {
return document(n);
}
public Document document(int n) throws IOException {
if ((deletedDocumentNumbers != null
&& deletedDocumentNumbers.contains(n))
||
(getIndex().getDeletedDocuments() != null
&& getIndex().getDeletedDocuments().contains(n))) {
return null;
}
return getIndex().getDocumentsByNumber()[n].getDocument();
}
/**
* Never modify the returned array directly; it holds the true values, unless norms have been updated.
*/
public byte[] norms(String field) throws IOException {
byte[] norms = getIndex().getNormsByFieldNameAndDocumentNumber().get(field);
if (updatedNormsByFieldNameAndDocumentNumber != null) {
norms = norms.clone();
List<NormUpdate> updated = updatedNormsByFieldNameAndDocumentNumber.get(field);
if (updated != null) {
for (NormUpdate normUpdate : updated) {
norms[normUpdate.doc] = normUpdate.value;
}
}
}
return norms;
}
public void norms(String field, byte[] bytes, int offset) throws IOException {
byte[] norms = getIndex().getNormsByFieldNameAndDocumentNumber().get(field);
System.arraycopy(norms, 0, bytes, offset, norms.length);
}
protected void doSetNorm(int doc, String field, byte value) throws IOException {
if (updatedNormsByFieldNameAndDocumentNumber == null) {
updatedNormsByFieldNameAndDocumentNumber = new HashMap<String, List<NormUpdate>>(getIndex().getNormsByFieldNameAndDocumentNumber().size());
}
List<NormUpdate> list = updatedNormsByFieldNameAndDocumentNumber.get(field);
if (list == null) {
list = new LinkedList<NormUpdate>();
updatedNormsByFieldNameAndDocumentNumber.put(field, list);
}
list.add(new NormUpdate(doc, value));
}
public int docFreq(Term t) throws IOException {
InstantiatedTerm term = getIndex().findTerm(t);
if (term == null) {
return 0;
} else {
return term.getAssociatedDocuments().length;
}
}
public TermEnum terms() throws IOException {
return new InstantiatedTermEnum(this);
}
public TermEnum terms(Term t) throws IOException {
InstantiatedTerm it = getIndex().findTerm(t);
if (it != null) {
return new InstantiatedTermEnum(this, it.getTermIndex());
} else {
int startPos = Arrays.binarySearch(index.getOrderedTerms(), t, InstantiatedTerm.termComparator);
if (startPos < 0) {
startPos = -1 - startPos;
}
return new InstantiatedTermEnum(this, startPos);
}
}
public TermDocs termDocs() throws IOException {
return new InstantiatedTermDocs(this);
}
public TermPositions termPositions() throws IOException {
return new InstantiatedTermPositions(this);
}
public TermFreqVector[] getTermFreqVectors(int docNumber) throws IOException {
InstantiatedDocument doc = getIndex().getDocumentsByNumber()[docNumber];
if (doc.getVectorSpace() == null) {
return null;
}
TermFreqVector[] ret = new TermFreqVector[doc.getVectorSpace().size()];
Iterator<String> it = doc.getVectorSpace().keySet().iterator();
for (int i = 0; i < ret.length; i++) {
ret[i] = new InstantiatedTermPositionVector(getIndex().getDocumentsByNumber()[docNumber], it.next());
}
return ret;
}
public TermFreqVector getTermFreqVector(int docNumber, String field) throws IOException {
InstantiatedDocument doc = getIndex().getDocumentsByNumber()[docNumber];
if (doc.getVectorSpace() == null
|| doc.getVectorSpace().get(field) == null) {
return null;
} else {
return new InstantiatedTermPositionVector(doc, field);
}
}
public void getTermFreqVector(int docNumber, String field, TermVectorMapper mapper) throws IOException {
InstantiatedDocument doc = getIndex().getDocumentsByNumber()[docNumber];
if (doc.getVectorSpace() != null
    && doc.getVectorSpace().get(field) != null) {
List<InstantiatedTermDocumentInformation> tv = doc.getVectorSpace().get(field);
mapper.setExpectations(field, tv.size(), true, true);
for (InstantiatedTermDocumentInformation tdi : tv) {
mapper.map(tdi.getTerm().text(), tdi.getTermPositions().length, tdi.getTermOffsets(), tdi.getTermPositions());
}
}
}
public void getTermFreqVector(int docNumber, TermVectorMapper mapper) throws IOException {
InstantiatedDocument doc = getIndex().getDocumentsByNumber()[docNumber];
for (Map.Entry<String, List<InstantiatedTermDocumentInformation>> e : doc.getVectorSpace().entrySet()) {
mapper.setExpectations(e.getKey(), e.getValue().size(), true, true);
for (InstantiatedTermDocumentInformation tdi : e.getValue()) {
mapper.map(tdi.getTerm().text(), tdi.getTermPositions().length, tdi.getTermOffsets(), tdi.getTermPositions());
}
}
}
}
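An illustrative walk over this reader's term dictionary (not part of the commit), using only the TermEnum contract implemented above:

import org.apache.lucene.index.Term;
import org.apache.lucene.index.TermEnum;
import org.apache.lucene.store.instantiated.InstantiatedIndex;
import org.apache.lucene.store.instantiated.InstantiatedIndexReader;

public class DumpTermsExample {
  public static void main(String[] args) throws Exception {
    InstantiatedIndex index = new InstantiatedIndex(); // assumed populated elsewhere
    InstantiatedIndexReader reader = index.indexReaderFactory();
    TermEnum terms = reader.terms();
    while (terms.next()) {
      Term term = terms.term();
      System.out.println(term.field() + ":" + term.text() + " docFreq=" + terms.docFreq());
    }
    terms.close();
    reader.close();
  }
}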

View File

@@ -0,0 +1,681 @@
package org.apache.lucene.store.instantiated;
/**
* Copyright 2006 The Apache Software Foundation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.Term;
import org.apache.lucene.index.TermVectorOffsetInfo;
import org.apache.lucene.search.DefaultSimilarity;
import org.apache.lucene.search.Similarity;
import java.io.IOException;
import java.io.PrintStream;
import java.io.StringReader;
import java.util.*;
/**
* This class, similar to {@link org.apache.lucene.index.IndexWriter}, has no locking mechanism.
*
* An {@link org.apache.lucene.store.instantiated.InstantiatedIndexReader} navigates
* the same instances in memory as this writer is updating, so searchers active while
* you are committing are bound to throw exceptions.
*
* Consider using InstantiatedIndex as if it were immutable.
*
* @see org.apache.lucene.index.IndexWriter
*/
public class InstantiatedIndexWriter {
private PrintStream infoStream = null;
private int maxFieldLength = IndexWriter.DEFAULT_MAX_FIELD_LENGTH;
private final InstantiatedIndex index;
private final Analyzer analyzer;
private Similarity similarity = Similarity.getDefault(); // how to normalize;
private transient Set<String> fieldNameBuffer;
/**
* linked to ensure chronological order
*/
private Map<InstantiatedDocument, Map<FieldSetting, Map<String /*text*/, TermDocumentInformationFactory>>> termDocumentInformationFactoryByDocument = new LinkedHashMap<InstantiatedDocument, Map<FieldSetting, Map<String /*text*/, TermDocumentInformationFactory>>>(2000);
private Set<InstantiatedDocument> unflushedDocuments = new HashSet<InstantiatedDocument>();
public InstantiatedIndexWriter(InstantiatedIndex index) throws IOException {
this(index, null);
}
public InstantiatedIndexWriter(InstantiatedIndex index, Analyzer analyzer) throws IOException {
this(index, analyzer, false);
}
public InstantiatedIndexWriter(InstantiatedIndex index, Analyzer analyzer, boolean create) throws IOException {
this.index = index;
this.analyzer = analyzer;
fieldNameBuffer = new HashSet<String>();
if (create) {
this.index.initialize();
}
}
private int mergeFactor = 2500;
/**
* The sweet spot for this implementation is somewhere around 2,500 buffered documents of about 2K characters of text each.
* <p/>
* Benchmark output:
* <pre>
* ------------> Report sum by Prefix (MAddDocs) and Round (8 about 8 out of 160153)
* Operation round mrg buf cmpnd runCnt recsPerRun rec/s elapsedSec avgUsedMem avgTotalMem
* MAddDocs_20000 0 10 10 true 1 20000 81,4 245,68 200 325 152 268 156 928
* MAddDocs_20000 - 1 1000 10 true - - 1 - - 20000 - - 494,1 - - 40,47 - 247 119 072 - 347 025 408
* MAddDocs_20000 2 10 100 true 1 20000 104,8 190,81 233 895 552 363 720 704
* MAddDocs_20000 - 3 2000 100 true - - 1 - - 20000 - - 527,2 - - 37,94 - 266 136 448 - 378 273 792
* MAddDocs_20000 4 10 10 false 1 20000 103,2 193,75 222 089 792 378 273 792
* MAddDocs_20000 - 5 3000 10 false - - 1 - - 20000 - - 545,2 - - 36,69 - 237 917 152 - 378 273 792
* MAddDocs_20000 6 10 100 false 1 20000 102,7 194,67 237 018 976 378 273 792
* MAddDocs_20000 - 7 4000 100 false - - 1 - - 20000 - - 535,8 - - 37,33 - 309 680 640 - 501 968 896
* </pre>
*
* @see org.apache.lucene.index.IndexWriter#setMergeFactor(int)
*/
public void setMergeFactor(int mergeFactor) {
this.mergeFactor = mergeFactor;
}
/**
* @see org.apache.lucene.index.IndexWriter#getMergeFactor()
*/
public int getMergeFactor() {
return mergeFactor;
}
/**
* If non-null, information about merges and a message when
* maxFieldLength is reached will be printed to this.
*/
public void setInfoStream(PrintStream infoStream) {
this.infoStream = infoStream;
}
public void abort() throws IOException {
// what not
}
public void addIndexes(IndexReader[] readers) {
throw new RuntimeException("Not implemented");
}
public PrintStream getInfoStream() {
return infoStream;
}
/**
* Flushes all changes to an index and closes all associated files.
*/
public void close() throws IOException {
commit();
}
/**
* Returns the number of documents currently in this index.
*/
public int docCount() {
// todo: not certain. see http://www.nabble.com/IndexWriter.docCount-tf3128882.html#a8669483
return index.getDocumentsByNumber().length /* - index.getDeletedDocuments().size() */ + unflushedDocuments.size();
}
/**
* Locks the index and commits the buffered documents.
*/
public void commit() throws IOException {
// todo write lock, unless held by caller
boolean orderedTermsDirty = false;
Set<InstantiatedTerm> dirtyTerms = new HashSet<InstantiatedTerm>(1000);
InstantiatedDocument[] documentsByNumber = new InstantiatedDocument[index.getDocumentsByNumber().length + termDocumentInformationFactoryByDocument.size()];
System.arraycopy(index.getDocumentsByNumber(), 0, documentsByNumber, 0, index.getDocumentsByNumber().length);
int documentNumber = index.getDocumentsByNumber().length;
List<InstantiatedTerm> orderedTerms = new ArrayList<InstantiatedTerm>(index.getOrderedTerms().length + 5000);
for (InstantiatedTerm instantiatedTerm : index.getOrderedTerms()) {
orderedTerms.add(instantiatedTerm);
}
// update norm array with fake values for new documents
Map<String, byte[]> normsByFieldNameAndDocumentNumber = new HashMap<String, byte[]>(index.getTermsByFieldAndText().size());
Set<String> fieldNames = new HashSet<String>(20);
fieldNames.addAll(index.getNormsByFieldNameAndDocumentNumber().keySet());
fieldNames.addAll(fieldNameBuffer);
for (String field : index.getTermsByFieldAndText().keySet()) {
byte[] norms = new byte[index.getDocumentsByNumber().length + termDocumentInformationFactoryByDocument.size()];
byte[] oldNorms = index.getNormsByFieldNameAndDocumentNumber().get(field);
if (oldNorms != null) {
System.arraycopy(oldNorms, 0, norms, 0, oldNorms.length);
Arrays.fill(norms, oldNorms.length, norms.length, DefaultSimilarity.encodeNorm(1.0f));
} else {
Arrays.fill(norms, 0, norms.length, DefaultSimilarity.encodeNorm(1.0f));
}
normsByFieldNameAndDocumentNumber.put(field, norms);
fieldNames.remove(field);
}
for (String field : fieldNames) {
//System.out.println(field);
byte[] norms = new byte[index.getDocumentsByNumber().length + termDocumentInformationFactoryByDocument.size()];
Arrays.fill(norms, 0, norms.length, DefaultSimilarity.encodeNorm(1.0f));
normsByFieldNameAndDocumentNumber.put(field, norms);
}
fieldNames.clear();
index.setNormsByFieldNameAndDocumentNumber(normsByFieldNameAndDocumentNumber);
for (Map.Entry<InstantiatedDocument, Map<FieldSetting, Map<String /*text*/, TermDocumentInformationFactory>>> eDocumentTermDocInfoByTermTextAndField : termDocumentInformationFactoryByDocument.entrySet()) {
InstantiatedDocument document = eDocumentTermDocInfoByTermTextAndField.getKey();
// assign document number
document.setDocumentNumber(documentNumber++);
documentsByNumber[document.getDocumentNumber()] = document;
// set norms, prepare document and create optimized size collections.
int numFieldsWithTermVectorsInDocument = 0;
int termsInDocument = 0;
for (Map.Entry<FieldSetting, Map<String /*text*/, TermDocumentInformationFactory>> eFieldTermDocInfoFactoriesByTermText : eDocumentTermDocInfoByTermTextAndField.getValue().entrySet()) {
if (eFieldTermDocInfoFactoriesByTermText.getKey().storeTermVector) {
numFieldsWithTermVectorsInDocument += eFieldTermDocInfoFactoriesByTermText.getValue().size();
}
termsInDocument += eFieldTermDocInfoFactoriesByTermText.getValue().size();
if (eFieldTermDocInfoFactoriesByTermText.getKey().isIndexed && !eFieldTermDocInfoFactoriesByTermText.getKey().omitNorms) {
float norm = eFieldTermDocInfoFactoriesByTermText.getKey().boost;
norm *= document.getDocument().getBoost();
norm *= similarity.lengthNorm(eFieldTermDocInfoFactoriesByTermText.getKey().fieldName, eFieldTermDocInfoFactoriesByTermText.getKey().fieldLength);
normsByFieldNameAndDocumentNumber.get(eFieldTermDocInfoFactoriesByTermText.getKey().fieldName)[document.getDocumentNumber()] = Similarity.encodeNorm(norm);
}
}
/** used for term vectors only, I think. */
Map<InstantiatedTerm, InstantiatedTermDocumentInformation> informationByTermOfCurrentDocument = new HashMap<InstantiatedTerm, InstantiatedTermDocumentInformation>(termsInDocument);
Map<String, FieldSetting> documentFieldSettingsByFieldName = new HashMap<String, FieldSetting>(eDocumentTermDocInfoByTermTextAndField.getValue().size());
// terms...
for (Map.Entry<FieldSetting, Map<String /*text*/, TermDocumentInformationFactory>> eFieldSetting_TermDocInfoFactoriesByTermText : eDocumentTermDocInfoByTermTextAndField.getValue().entrySet()) {
documentFieldSettingsByFieldName.put(eFieldSetting_TermDocInfoFactoriesByTermText.getKey().fieldName, eFieldSetting_TermDocInfoFactoriesByTermText.getKey());
// find or create term
for (Map.Entry<String /*text*/, TermDocumentInformationFactory> eTermText_TermDocInfoFactory : eFieldSetting_TermDocInfoFactoriesByTermText.getValue().entrySet()) {
// get term..
InstantiatedTerm term;
Map<String, InstantiatedTerm> termsByText = index.getTermsByFieldAndText().get(eFieldSetting_TermDocInfoFactoriesByTermText.getKey().fieldName);
if (termsByText == null) {
termsByText = new HashMap<String, InstantiatedTerm>(1000);
index.getTermsByFieldAndText().put(eFieldSetting_TermDocInfoFactoriesByTermText.getKey().fieldName, termsByText);
term = new InstantiatedTerm(eFieldSetting_TermDocInfoFactoriesByTermText.getKey().fieldName, eTermText_TermDocInfoFactory.getKey());
termsByText.put(eTermText_TermDocInfoFactory.getKey(), term);
int pos = Collections.binarySearch(orderedTerms, term, InstantiatedTerm.comparator);
pos = -1 - pos;
orderedTerms.add(pos, term);
orderedTermsDirty = true;
} else {
term = termsByText.get(eTermText_TermDocInfoFactory.getKey());
if (term == null) {
term = new InstantiatedTerm(eFieldSetting_TermDocInfoFactoriesByTermText.getKey().fieldName, eTermText_TermDocInfoFactory.getKey());
termsByText.put(eTermText_TermDocInfoFactory.getKey(), term);
int pos = Collections.binarySearch(orderedTerms, term, InstantiatedTerm.comparator);
pos = -1 - pos;
orderedTerms.add(pos, term);
orderedTermsDirty = true;
}
}
// create associated term document information
//
// [Term]-- {0..*} | {0..* ordered} --(field)[Document]
//
// |
// [TermDocumentInformation]
int[] positions = new int[eTermText_TermDocInfoFactory.getValue().termPositions.size()];
for (int i = 0; i < positions.length; i++) {
positions[i] = eTermText_TermDocInfoFactory.getValue().termPositions.get(i);
}
byte[][] payloads = new byte[eTermText_TermDocInfoFactory.getValue().payloads.size()][];
for (int i = 0; i < payloads.length; i++) {
payloads[i] = eTermText_TermDocInfoFactory.getValue().payloads.get(i);
}
// couple
InstantiatedTermDocumentInformation info = new InstantiatedTermDocumentInformation(term, document, /*eTermText_TermDocInfoFactory.getValue().termFrequency,*/ positions, payloads);
// todo optimize, this should be cached and updated to array in batches rather than appending the array once for every position!
InstantiatedTermDocumentInformation[] associatedDocuments;
if (term.getAssociatedDocuments() != null) {
associatedDocuments = new InstantiatedTermDocumentInformation[term.getAssociatedDocuments().length + 1];
System.arraycopy(term.getAssociatedDocuments(), 0, associatedDocuments, 0, term.getAssociatedDocuments().length);
} else {
associatedDocuments = new InstantiatedTermDocumentInformation[1];
}
associatedDocuments[associatedDocuments.length - 1] = info;
term.setAssociatedDocuments(associatedDocuments);
// todo optimize, only if term vector?
informationByTermOfCurrentDocument.put(term, info);
dirtyTerms.add(term);
}
// term vector offsets
if (eFieldSetting_TermDocInfoFactoriesByTermText.getKey().storeOffsetWithTermVector) {
for (Map.Entry<InstantiatedTerm, InstantiatedTermDocumentInformation> e : informationByTermOfCurrentDocument.entrySet()) {
if (eFieldSetting_TermDocInfoFactoriesByTermText.getKey().fieldName.equals(e.getKey().field())) {
TermDocumentInformationFactory factory = eFieldSetting_TermDocInfoFactoriesByTermText.getValue().get(e.getKey().text());
e.getValue().setTermOffsets(factory.termOffsets.toArray(new TermVectorOffsetInfo[factory.termOffsets.size()]));
}
}
}
}
Map<String, List<InstantiatedTermDocumentInformation>> termDocumentInformationsByField = new HashMap<String, List<InstantiatedTermDocumentInformation>>();
for (Map.Entry<InstantiatedTerm, InstantiatedTermDocumentInformation> eTerm_TermDocumentInformation : informationByTermOfCurrentDocument.entrySet()) {
List<InstantiatedTermDocumentInformation> termDocumentInformations = termDocumentInformationsByField.get(eTerm_TermDocumentInformation.getKey().field());
if (termDocumentInformations == null) {
termDocumentInformations = new ArrayList<InstantiatedTermDocumentInformation>();
termDocumentInformationsByField.put(eTerm_TermDocumentInformation.getKey().field(), termDocumentInformations);
}
termDocumentInformations.add(eTerm_TermDocumentInformation.getValue());
}
for (Map.Entry<String, List<InstantiatedTermDocumentInformation>> eField_TermDocInfos : termDocumentInformationsByField.entrySet()) {
Collections.sort(eField_TermDocInfos.getValue(), new Comparator<InstantiatedTermDocumentInformation>() {
public int compare(InstantiatedTermDocumentInformation instantiatedTermDocumentInformation, InstantiatedTermDocumentInformation instantiatedTermDocumentInformation1) {
return instantiatedTermDocumentInformation.getTerm().getTerm().compareTo(instantiatedTermDocumentInformation1.getTerm().getTerm());
}
});
// add term vector
if (documentFieldSettingsByFieldName.get(eField_TermDocInfos.getKey()).storeTermVector) {
if (document.getVectorSpace() == null) {
document.setVectorSpace(new HashMap<String, List<InstantiatedTermDocumentInformation>>(documentFieldSettingsByFieldName.size()));
}
document.getVectorSpace().put(eField_TermDocInfos.getKey(), eField_TermDocInfos.getValue());
}
}
}
// order document informations in dirty terms
for (InstantiatedTerm term : dirtyTerms) {
// todo optimize, I believe this is useless, as the natural order is document number?
Arrays.sort(term.getAssociatedDocuments(), InstantiatedTermDocumentInformation.documentNumberComparator);
// // update association class reference for speedy skipTo()
// for (int i = 0; i < term.getAssociatedDocuments().length; i++) {
// term.getAssociatedDocuments()[i].setIndexFromTerm(i);
// }
}
// flush to writer
index.setDocumentsByNumber(documentsByNumber);
index.setOrderedTerms(orderedTerms.toArray(new InstantiatedTerm[orderedTerms.size()]));
// set term index
if (orderedTermsDirty) {
// todo optimize, only update from start position
for (int i = 0; i < index.getOrderedTerms().length; i++) {
index.getOrderedTerms()[i].setTermIndex(i);
}
}
// remove deleted documents
IndexReader indexDeleter = index.indexReaderFactory();
if (unflushedDeletions.size() > 0) {
for (Term term : unflushedDeletions) {
indexDeleter.deleteDocuments(term);
}
unflushedDeletions.clear();
}
// all done, clear buffers
unflushedDocuments.clear();
termDocumentInformationFactoryByDocument.clear();
fieldNameBuffer.clear();
index.setVersion(System.currentTimeMillis());
// todo unlock
indexDeleter.close();
}
/**
* Adds a document to this index. If the document contains more than
* {@link #setMaxFieldLength(int)} terms for a given field, the remainder are
* discarded.
*/
public void addDocument(Document doc) throws IOException {
addDocument(doc, getAnalyzer());
}
/**
* Adds a document to this index, using the provided analyzer instead of the
* value of {@link #getAnalyzer()}. If the document contains more than
* {@link #setMaxFieldLength(int)} terms for a given field, the remainder are
* discarded.
*
* @param doc
* @param analyzer
* @throws IOException
*/
public void addDocument(Document doc, Analyzer analyzer) throws IOException {
addDocument(new InstantiatedDocument(doc), analyzer);
}
/**
* Tokenizes a document and adds it to the buffer.
* Try to do all calculations in this method rather than in commit, as this is a non-locking method.
* Remember, this index implementation expects unlimited memory for maximum speed.
*
* @param document
* @param analyzer
* @throws IOException
*/
protected void addDocument(InstantiatedDocument document, Analyzer analyzer) throws IOException {
if (document.getDocumentNumber() != null) {
throw new RuntimeException("Document number already set! Are you trying to add a document that is already bound to this or another index?");
}
// todo: write lock
// normalize settings per field name in document
Map<String /* field name */, FieldSetting> fieldSettingsByFieldName = new HashMap<String, FieldSetting>();
for (Field field : (List<Field>) document.getDocument().getFields()) {
FieldSetting fieldSettings = fieldSettingsByFieldName.get(field.name());
if (fieldSettings == null) {
fieldSettings = new FieldSetting();
fieldSettings.fieldName = field.name().intern();
fieldSettingsByFieldName.put(fieldSettings.fieldName, fieldSettings);
fieldNameBuffer.add(fieldSettings.fieldName);
}
// todo: fixme: multiple fields with the same name do not mean field boost += more boost.
fieldSettings.boost *= field.getBoost();
//fieldSettings.dimensions++;
// once fieldSettings, always fieldSettings.
if (field.getOmitNorms() != fieldSettings.omitNorms) {
fieldSettings.omitNorms = true;
}
if (field.isIndexed() != fieldSettings.isIndexed) {
fieldSettings.isIndexed = true;
}
if (field.isTokenized() != fieldSettings.isTokenized) {
fieldSettings.isTokenized = true;
}
if (field.isCompressed() != fieldSettings.isCompressed) {
fieldSettings.isCompressed = true;
}
if (field.isStored() != fieldSettings.isStored) {
fieldSettings.isStored = true;
}
if (field.isBinary() != fieldSettings.isBinary) {
fieldSettings.isBinary = true;
}
if (field.isTermVectorStored() != fieldSettings.storeTermVector) {
fieldSettings.storeTermVector = true;
}
if (field.isStorePositionWithTermVector() != fieldSettings.storePositionWithTermVector) {
fieldSettings.storePositionWithTermVector = true;
}
if (field.isStoreOffsetWithTermVector() != fieldSettings.storeOffsetWithTermVector) {
fieldSettings.storeOffsetWithTermVector = true;
}
}
Map<Field, LinkedList<Token>> tokensByField = new LinkedHashMap<Field, LinkedList<Token>>(20);
// tokenize indexed fields.
for (Iterator<Field> it = (Iterator<Field>) document.getDocument().getFields().iterator(); it.hasNext();) {
Field field = it.next();
FieldSetting fieldSettings = fieldSettingsByFieldName.get(field.name());
if (field.isIndexed()) {
LinkedList<Token> tokens = new LinkedList<Token>();
tokensByField.put(field, tokens);
if (field.isTokenized()) {
int termCounter = 0;
final TokenStream tokenStream;
// todo readerValue(), binaryValue()
if (field.tokenStreamValue() != null) {
tokenStream = field.tokenStreamValue();
} else {
tokenStream = analyzer.tokenStream(field.name(), new StringReader(field.stringValue()));
}
Token next = tokenStream.next();
while (next != null) {
next.setTermText(next.termText().intern()); // todo: not sure this needs to be interned?
tokens.add(next); // the vector will be built on commit.
next = tokenStream.next();
fieldSettings.fieldLength++;
if (fieldSettings.fieldLength > maxFieldLength) {
break;
}
}
} else {
// untokenized
tokens.add(new Token(field.stringValue().intern(), 0, field.stringValue().length(), "untokenized"));
fieldSettings.fieldLength++;
}
}
if (!field.isStored()) {
it.remove();
}
}
Map<FieldSetting, Map<String /*text*/, TermDocumentInformationFactory>> termDocumentInformationFactoryByTermTextAndFieldSetting = new HashMap<FieldSetting, Map<String /*text*/, TermDocumentInformationFactory>>();
termDocumentInformationFactoryByDocument.put(document, termDocumentInformationFactoryByTermTextAndFieldSetting);
// build term vector, term positions and term offsets
for (Map.Entry<Field, LinkedList<Token>> eField_Tokens : tokensByField.entrySet()) {
FieldSetting fieldSettings = fieldSettingsByFieldName.get(eField_Tokens.getKey().name());
Map<String, TermDocumentInformationFactory> termDocumentInformationFactoryByTermText = termDocumentInformationFactoryByTermTextAndFieldSetting.get(fieldSettingsByFieldName.get(eField_Tokens.getKey().name()));
if (termDocumentInformationFactoryByTermText == null) {
termDocumentInformationFactoryByTermText = new HashMap<String /*text*/, TermDocumentInformationFactory>();
termDocumentInformationFactoryByTermTextAndFieldSetting.put(fieldSettingsByFieldName.get(eField_Tokens.getKey().name()), termDocumentInformationFactoryByTermText);
}
int lastOffset = 0;
// for each new field, move positions a bunch.
if (fieldSettings.position > 0) {
// todo what if no analyzer set, multiple fields with same name and index without tokenization?
fieldSettings.position += analyzer.getPositionIncrementGap(fieldSettings.fieldName);
}
for (Token token : eField_Tokens.getValue()) {
TermDocumentInformationFactory termDocumentInformationFactory = termDocumentInformationFactoryByTermText.get(token.termText());
if (termDocumentInformationFactory == null) {
termDocumentInformationFactory = new TermDocumentInformationFactory();
termDocumentInformationFactoryByTermText.put(token.termText(), termDocumentInformationFactory);
}
//termDocumentInformationFactory.termFrequency++;
fieldSettings.position += (token.getPositionIncrement() - 1);
termDocumentInformationFactory.termPositions.add(fieldSettings.position++);
if (token.getPayload() != null && token.getPayload().length() > 0) {
termDocumentInformationFactory.payloads.add(token.getPayload().toByteArray());
} else {
termDocumentInformationFactory.payloads.add(null);
}
if (eField_Tokens.getKey().isStoreOffsetWithTermVector()) {
termDocumentInformationFactory.termOffsets.add(new TermVectorOffsetInfo(fieldSettings.offset + token.startOffset(), fieldSettings.offset + token.endOffset()));
lastOffset = fieldSettings.offset + token.endOffset();
}
}
if (eField_Tokens.getKey().isStoreOffsetWithTermVector()) {
fieldSettings.offset = lastOffset + 1;
}
}
unflushedDocuments.add(document);
// if too many documents in buffer, commit.
if (unflushedDocuments.size() >= getMergeFactor()) {
commit(/*lock*/);
}
// todo: unlock write lock
}
private Set<Term> unflushedDeletions = new HashSet<Term>();
public void deleteDocuments(Term term) throws IOException {
unflushedDeletions.add(term);
}
public void deleteDocuments(Term[] terms) throws IOException {
for (Term term : terms) {
deleteDocuments(term);
}
}
public void updateDocument(Term term, Document doc) throws IOException {
updateDocument(term, doc, getAnalyzer());
}
public void updateDocument(Term term, Document doc, Analyzer analyzer) throws IOException {
deleteDocuments(term);
addDocument(doc, analyzer);
}
public int getMaxFieldLength() {
return maxFieldLength;
}
public void setMaxFieldLength(int maxFieldLength) {
this.maxFieldLength = maxFieldLength;
}
public Similarity getSimilarity() {
return similarity;
}
public void setSimilarity(Similarity similarity) {
this.similarity = similarity;
}
public Analyzer getAnalyzer() {
return analyzer;
}
private class FieldSetting {
private String fieldName;
private float boost = 1;
//private int dimensions = 0; // this is futuristic
private int position = 0;
private int offset;
private int fieldLength = 0;
private boolean storeTermVector = false;
private boolean storeOffsetWithTermVector = false;
private boolean storePositionWithTermVector = false;
private boolean omitNorms = false;
private boolean isTokenized = false;
private boolean isStored = false;
private boolean isIndexed = false;
private boolean isBinary = false;
private boolean isCompressed = false;
//private float norm;
//private byte encodedNorm;
public boolean equals(Object o) {
if (this == o) return true;
if (o == null || getClass() != o.getClass()) return false;
final FieldSetting that = (FieldSetting) o;
return fieldName.equals(that.fieldName);
}
public int hashCode() {
return fieldName.hashCode();
}
}
private class TermDocumentInformationFactory {
private LinkedList<byte[]> payloads = new LinkedList<byte[]>();
private LinkedList<Integer> termPositions = new LinkedList<Integer>();
private LinkedList<TermVectorOffsetInfo> termOffsets = new LinkedList<TermVectorOffsetInfo>();
}
}
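An illustrative sketch of the writer's buffer-and-commit cycle (not part of the commit): addDocument tokenizes and buffers, commit() makes the changes visible to readers, close() commits, and reaching mergeFactor buffered documents triggers an automatic commit:

import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.index.Term;
import org.apache.lucene.store.instantiated.InstantiatedIndex;
import org.apache.lucene.store.instantiated.InstantiatedIndexWriter;

public class WriterExample {
  public static void main(String[] args) throws Exception {
    InstantiatedIndex index = new InstantiatedIndex();
    InstantiatedIndexWriter writer = index.indexWriterFactory(new StandardAnalyzer(), true);
    writer.setMergeFactor(2500); // buffered documents before an automatic commit

    Document doc = new Document();
    doc.add(new Field("id", "1", Field.Store.YES, Field.Index.UN_TOKENIZED));
    doc.add(new Field("body", "hello instantiated world", Field.Store.NO, Field.Index.TOKENIZED));
    writer.addDocument(doc); // tokenized and buffered, not yet searchable
    writer.commit();         // now visible to readers

    writer.updateDocument(new Term("id", "1"), doc); // buffered delete-by-term, then re-add
    writer.close();          // commits any remaining buffered changes
  }
}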

View File

@@ -0,0 +1,250 @@
package org.apache.lucene.store.instantiated;
/**
* Copyright 2006 The Apache Software Foundation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import org.apache.lucene.index.Term;
import java.io.Serializable;
import java.util.Comparator;
import java.util.Collections;
import java.util.Arrays;
/**
* A term in the inverted index, coupled to the documents it occurs in.
*
* @see org.apache.lucene.index.Term
*/
public class InstantiatedTerm
implements Serializable {
private static final long serialVersionUID = 1l;
public static final Comparator<InstantiatedTerm> comparator = new Comparator<InstantiatedTerm>() {
public int compare(InstantiatedTerm instantiatedTerm, InstantiatedTerm instantiatedTerm1) {
return instantiatedTerm.getTerm().compareTo(instantiatedTerm1.getTerm());
}
};
public static final Comparator termComparator = new Comparator() {
public int compare(Object o, Object o1) {
return ((InstantiatedTerm)o).getTerm().compareTo((Term)o1);
}
};
private Term term;
/**
* index of term in InstantiatedIndex
* @see org.apache.lucene.store.instantiated.InstantiatedIndex#getOrderedTerms() */
private int termIndex;
/**
* @return Term associated with this entry of the index object graph
*/
public Term getTerm() {
return term;
}
InstantiatedTerm(String field, String text) {
this.term = new Term(field, text);
}
// this could speed up TermDocs.skipTo even more
// private Map</** document number*/Integer, /** index in associatedDocuments */Integer> associatedDocumentIndexByDocumentNumber = new HashMap<Integer, Integer>();
//
// public Map</** document number*/Integer, /** index in associatedDocuments */Integer> getAssociatedDocumentIndexByDocumentNumber() {
// return associatedDocumentIndexByDocumentNumber;
// }
/** Ordered by document number */
private InstantiatedTermDocumentInformation[] associatedDocuments;
/**
* Meta data per document in which this term occurs.
* Ordered by document number.
*
* @return Meta data per document in which this term occurs.
*/
public InstantiatedTermDocumentInformation[] getAssociatedDocuments() {
return associatedDocuments;
}
/**
* Meta data per document in which this term occurs.
* Ordered by document number.
*
* @param associatedDocuments meta data per document in which this term occurs, ordered by document number
*/
void setAssociatedDocuments(InstantiatedTermDocumentInformation[] associatedDocuments) {
this.associatedDocuments = associatedDocuments;
}
/**
* Finds the index of the first entry whose document number is
* greater than or equal to <i>target</i>, or -1 if there is no such element.
*
* @param target the document number to match
* @return -1 if there is no such element
*/
public int seekCeilingDocumentInformationIndex(int target) {
return seekCeilingDocumentInformationIndex(target, 0, getAssociatedDocuments().length);
}
/**
* Finds the index of the first entry whose document number is
* greater than or equal to <i>target</i>, or -1 if there is no such element.
*
* @param target the document number to match
* @param startOffset associated documents index start offset
* @return -1 if there is no such element
*/
public int seekCeilingDocumentInformationIndex(int target, int startOffset) {
return seekCeilingDocumentInformationIndex(target, startOffset, getAssociatedDocuments().length);
}
/**
* Finds the index of the first entry whose document number is
* greater than or equal to <i>target</i>, or -1 if there is no such element.
*
* @param target the document number to match
* @param startOffset associated documents index start offset
* @param endPosition associated documents index end position
* @return -1 if there is no such element
*/
public int seekCeilingDocumentInformationIndex(int target, int startOffset, int endPosition) {
int pos = binarySearchAssociatedDocuments(target, startOffset, endPosition - startOffset);
// int pos = Arrays.binarySearch(getAssociatedDocuments(), target, InstantiatedTermDocumentInformation.documentNumberIntegerComparator);
if (pos < 0) {
pos = -1 - pos;
}
if (getAssociatedDocuments().length <= pos) {
return -1;
} else {
return pos;
}
}
public int binarySearchAssociatedDocuments(int target) {
return binarySearchAssociatedDocuments(target, 0);
}
public int binarySearchAssociatedDocuments(int target, int offset) {
return binarySearchAssociatedDocuments(target, offset, associatedDocuments.length - offset);
}
/**
* @param target value to search for in the array
* @param offset index of the first valid value in the array
* @param length number of valid values in the array
* @return index of an occurrence of <i>target</i> in the array, or -(insertionIndex + 1) if it is not contained in the array (<i>insertionIndex</i> is then the index at which <i>target</i> could be inserted).
*/
public int binarySearchAssociatedDocuments(int target, int offset, int length) {
// implementation originally from http://ochafik.free.fr/blog/?p=106
if (length == 0) {
return -1 - offset;
}
int min = offset, max = offset + length - 1;
int minVal = getAssociatedDocuments()[min].getDocument().getDocumentNumber();
int maxVal = getAssociatedDocuments()[max].getDocument().getDocumentNumber();
int nPreviousSteps = 0;
for (; ;) {
// be careful not to compute target - minVal, for there might be an integer overflow.
if (target <= minVal) return target == minVal ? min : -1 - min;
if (target >= maxVal) return target == maxVal ? max : -2 - max;
assert min != max;
int pivot;
// A typical binarySearch algorithm uses pivot = (min + max) / 2.
// The pivot we use here tries to be smarter, choosing a pivot close to the expected location of the target.
// This dramatically reduces the number of steps needed to reach the target.
// However, it does not work well with a logarithmic distribution of values, for instance.
// When the target is not found quickly the smart way, we switch to the standard pivot.
if (nPreviousSteps > 2) {
pivot = (min + max) >> 1;
// stop increasing nPreviousSteps from now on
} else {
// NOTE: We cannot do the following operations in int precision, because there might be overflows.
// long operations are slower than float operations with the hardware this was tested on (intel core duo 2, JVM 1.6.0).
// Overall, using float proved to be the safest and fastest approach.
pivot = min + (int) ((target - (float) minVal) / (maxVal - (float) minVal) * (max - min));
nPreviousSteps++;
}
int pivotVal = getAssociatedDocuments()[pivot].getDocument().getDocumentNumber();
// NOTE: do not compute target - pivotVal because of overflows
if (target > pivotVal) {
min = pivot + 1;
max--;
} else if (target == pivotVal) {
return pivot;
} else {
min++;
max = pivot - 1;
}
maxVal = getAssociatedDocuments()[max].getDocument().getDocumentNumber();
minVal = getAssociatedDocuments()[min].getDocument().getDocumentNumber();
}
}
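// An illustrative example of the return convention above (a sketch, not exercised by this class):
// given associated document numbers [2, 5, 9], searching for 5 returns index 1,
// while searching for 6 returns -(2 + 1) = -3; seekCeilingDocumentInformationIndex
// then recovers the insertion index as -1 - (-3) = 2.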
/**
* Navigates to the view of the occurrences of this term in a specific document.
*
* This method is only used by InstantiatedIndex(IndexReader) and
* should not be optimized for less CPU at the cost of more RAM.
*
* @param documentNumber the document number in the index
* @return view of this term from specified document
*/
public InstantiatedTermDocumentInformation getAssociatedDocument(int documentNumber) {
int pos = binarySearchAssociatedDocuments(documentNumber);
return pos < 0 ? null : getAssociatedDocuments()[pos];
}
public final String field() {
return term.field();
}
public final String text() {
return term.text();
}
public String toString() {
return term.toString();
}
public int getTermIndex() {
return termIndex;
}
public void setTermIndex(int termIndex) {
this.termIndex = termIndex;
}
}

View File

@ -0,0 +1,136 @@
package org.apache.lucene.store.instantiated;
/**
* Copyright 2006 The Apache Software Foundation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import org.apache.lucene.index.Term;
import org.apache.lucene.index.TermDocs;
/**
* A {@link org.apache.lucene.index.TermDocs} navigating an {@link InstantiatedIndexReader}.
*/
public class InstantiatedTermDocs
implements TermDocs {
private final InstantiatedIndexReader reader;
public InstantiatedTermDocs(InstantiatedIndexReader reader) {
this.reader = reader;
}
private int currentDocumentIndex;
protected InstantiatedTermDocumentInformation currentDocumentInformation;
protected InstantiatedTerm currentTerm;
public void seek(Term term) {
currentTerm = reader.getIndex().findTerm(term);
currentDocumentIndex = -1;
}
public void seek(org.apache.lucene.index.TermEnum termEnum) {
seek(termEnum.term());
}
public int doc() {
return currentDocumentInformation.getDocument().getDocumentNumber();
}
public int freq() {
return currentDocumentInformation.getTermPositions().length;
}
public boolean next() {
if (currentTerm != null) {
currentDocumentIndex++;
if (currentDocumentIndex < currentTerm.getAssociatedDocuments().length) {
currentDocumentInformation = currentTerm.getAssociatedDocuments()[currentDocumentIndex];
if (reader.hasDeletions() && reader.isDeleted(currentDocumentInformation.getDocument().getDocumentNumber())) {
return next();
} else {
return true;
}
}
}
return false;
}
public int read(int[] docs, int[] freqs) {
int i;
for (i = 0; i < docs.length; i++) {
if (!next()) {
break;
}
docs[i] = doc();
freqs[i] = freq();
}
return i;
}
/**
* Skips entries to the first beyond the current whose document number is
* greater than or equal to <i>target</i>. <p>Returns true if there is such
* an entry. <p>Behaves as if written: <pre>
* boolean skipTo(int target) {
* do {
* if (!next())
* return false;
* } while (target > doc());
* return true;
* }
* </pre>
* This implementation is considerably more efficient than that.
*
*/
public boolean skipTo(int target) {
if (currentTerm == null) {
return false;
}
if (currentDocumentIndex >= target) {
return next();
}
int startOffset = currentDocumentIndex >= 0 ? currentDocumentIndex : 0;
int pos = currentTerm.seekCeilingDocumentInformationIndex(target, startOffset);
if (pos == -1) {
return false;
}
currentDocumentInformation = currentTerm.getAssociatedDocuments()[pos];
currentDocumentIndex = pos;
if (reader.hasDeletions() && reader.isDeleted(currentDocumentInformation.getDocument().getDocumentNumber())) {
return next();
} else {
return true;
}
}
/**
* Does nothing
*/
public void close() {
}
}

View File

@ -0,0 +1,139 @@
package org.apache.lucene.store.instantiated;
import org.apache.lucene.index.TermVectorOffsetInfo;
import java.io.Serializable;
import java.util.Comparator;
/**
* Copyright 2006 The Apache Software Foundation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/**
* There is one instance of this class per indexed term in a document
* and it contains the meta data about each occurrence of a term in a document.
*
* It is the inner glue of the inverted index.
*
* <pre>
* [Term]-- {0..*} | {0..*} --(field)[Document]
* &lt;&lt;ordered>>
* |
* [TermDocumentInformation]
* +payloads
* +termPositions
* +termOffsets
* </pre>
*
*/
public class InstantiatedTermDocumentInformation
implements Serializable {
private static final long serialVersionUID = 1L;
public static final Comparator<InstantiatedTermDocumentInformation> termComparator = new Comparator<InstantiatedTermDocumentInformation>() {
public int compare(InstantiatedTermDocumentInformation instantiatedTermDocumentInformation, InstantiatedTermDocumentInformation instantiatedTermDocumentInformation1) {
return instantiatedTermDocumentInformation.getTerm().getTerm().compareTo(instantiatedTermDocumentInformation1.getTerm().getTerm());
}
};
public static final Comparator<InstantiatedTermDocumentInformation> documentNumberComparator = new Comparator<InstantiatedTermDocumentInformation>() {
public int compare(InstantiatedTermDocumentInformation instantiatedTermDocumentInformation, InstantiatedTermDocumentInformation instantiatedTermDocumentInformation1) {
return instantiatedTermDocumentInformation.getDocument().getDocumentNumber().compareTo(instantiatedTermDocumentInformation1.getDocument().getDocumentNumber());
}
};
public static final Comparator documentNumberIntegerComparator = new Comparator() {
public int compare(Object o1, Object o2) {
InstantiatedTermDocumentInformation di = (InstantiatedTermDocumentInformation) o1;
Integer i = (Integer) o2;
return di.getDocument().getDocumentNumber().compareTo(i);
}
};
private byte[][] payloads;
private int[] termPositions;
private InstantiatedTerm term;
private InstantiatedDocument document;
private TermVectorOffsetInfo[] termOffsets;
public InstantiatedTermDocumentInformation(InstantiatedTerm term, InstantiatedDocument document, int[] termPositions, byte[][] payloads) {
this.term = term;
this.document = document;
this.termPositions = termPositions;
this.payloads = payloads;
}
// not quite sure why I wanted this.
// /**
// * [Term]--- {0..* ordered} ->[Info]
// */
// private int indexFromTerm;
// public int getIndexFromTerm() {
// return indexFromTerm;
// }
//
// void setIndexFromTerm(int indexFromTerm) {
// this.indexFromTerm = indexFromTerm;
// }
public int[] getTermPositions() {
return termPositions;
}
public byte[][] getPayloads() {
return payloads;
}
public InstantiatedDocument getDocument() {
return document;
}
public InstantiatedTerm getTerm() {
return term;
}
void setTermPositions(int[] termPositions) {
this.termPositions = termPositions;
}
void setTerm(InstantiatedTerm term) {
this.term = term;
}
void setDocument(InstantiatedDocument document) {
this.document = document;
}
public TermVectorOffsetInfo[] getTermOffsets() {
return termOffsets;
}
void setTermOffsets(TermVectorOffsetInfo[] termOffsets) {
this.termOffsets = termOffsets;
}
}

View File

@ -0,0 +1,109 @@
package org.apache.lucene.store.instantiated;
/**
* Copyright 2006 The Apache Software Foundation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import org.apache.lucene.index.Term;
import org.apache.lucene.index.TermEnum;
import java.io.IOException;
import java.util.Arrays;
/**
* A {@link org.apache.lucene.index.TermEnum} navigating an {@link org.apache.lucene.store.instantiated.InstantiatedIndexReader}.
*/
public class InstantiatedTermEnum
extends TermEnum {
private final InstantiatedIndexReader reader;
public InstantiatedTermEnum(InstantiatedIndexReader reader) {
this.nextTermIndex = 0;
this.reader = reader;
}
public InstantiatedTermEnum(InstantiatedIndexReader reader, int startPosition) {
this.reader = reader;
this.nextTermIndex = startPosition;
next();
}
private int nextTermIndex;
private InstantiatedTerm term;
/**
* Increments the enumeration to the next element. True if one exists.
*/
public boolean next() {
if (reader.getIndex().getOrderedTerms().length <= nextTermIndex) {
return false;
} else {
term = reader.getIndex().getOrderedTerms()[nextTermIndex];
nextTermIndex++;
return true;
}
}
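// An illustrative enumeration loop (a sketch, assuming an open InstantiatedIndexReader):
//
//   InstantiatedTermEnum termEnum = new InstantiatedTermEnum(reader);
//   while (termEnum.next()) {
//     Term term = termEnum.term();
//     int docFreq = termEnum.docFreq();
//   }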
/**
* Returns the current Term in the enumeration.
*/
public Term term() {
return term == null ? null : term.getTerm();
}
/**
* Returns the docFreq of the current Term in the enumeration.
*/
public int docFreq() {
return term.getAssociatedDocuments().length;
}
/**
* Closes the enumeration to further activity, freeing resources.
*/
public void close() {
}
public boolean skipTo(Term target) throws IOException {
// this method is not known to be used by anything
// in lucene for many years now, so there is
// very little to gain by optimizing it further.
InstantiatedTerm term = reader.getIndex().findTerm(target);
if (term != null) {
this.term = term;
nextTermIndex = term.getTermIndex() + 1;
return true;
} else {
int pos = Arrays.binarySearch(reader.getIndex().getOrderedTerms(), target, InstantiatedTerm.termComparator);
if (pos < 0) {
pos = -1 - pos;
}
if (pos >= reader.getIndex().getOrderedTerms().length) {
return false;
}
this.term = reader.getIndex().getOrderedTerms()[pos];
nextTermIndex = pos + 1;
return true;
}
}
}

View File

@ -0,0 +1,112 @@
package org.apache.lucene.store.instantiated;
import org.apache.lucene.index.TermFreqVector;
import java.io.Serializable;
import java.util.Arrays;
import java.util.List;
/**
* Copyright 2006 The Apache Software Foundation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/**
* Vector space view of a document in an {@link InstantiatedIndexReader}.
*
* @see org.apache.lucene.index.TermFreqVector
*/
public class InstantiatedTermFreqVector
implements TermFreqVector, Serializable {
private static final long serialVersionUID = 1L;
private final List<InstantiatedTermDocumentInformation> termDocumentInformations;
private final String field;
private final String terms[];
private final int termFrequencies[];
public InstantiatedTermFreqVector(InstantiatedDocument document, String field) {
this.field = field;
termDocumentInformations = document.getVectorSpace().get(field);
terms = new String[termDocumentInformations.size()];
termFrequencies = new int[termDocumentInformations.size()];
for (int i = 0; i < termDocumentInformations.size(); i++) {
InstantiatedTermDocumentInformation termDocumentInformation = termDocumentInformations.get(i);
terms[i] = termDocumentInformation.getTerm().text();
termFrequencies[i] = termDocumentInformation.getTermPositions().length;
}
}
/**
* @return The name of the field this vector is associated with
*/
public String getField() {
return field;
}
public String toString() {
StringBuffer sb = new StringBuffer();
sb.append('{');
sb.append(field).append(": ");
if (terms != null) {
for (int i = 0; i < terms.length; i++) {
if (i > 0) sb.append(", ");
sb.append(terms[i]).append('/').append(termFrequencies[i]);
}
}
sb.append('}');
return sb.toString();
}
public int size() {
return terms == null ? 0 : terms.length;
}
public String[] getTerms() {
return terms;
}
public int[] getTermFrequencies() {
return termFrequencies;
}
public int indexOf(String termText) {
if (terms == null)
return -1;
int res = Arrays.binarySearch(terms, termText);
return res >= 0 ? res : -1;
}
public int[] indexesOf(String[] termNumbers, int start, int len) {
// TODO: there must be a more efficient way of doing this.
// At least, we could advance the lower bound of the terms array
// as we find valid indices. Also, it might be possible to leverage
// this even more by starting in the middle of the termNumbers array
// and thus dividing the terms array maybe in half with each found index.
int res[] = new int[len];
for (int i = 0; i < len; i++) {
res[i] = indexOf(termNumbers[start + i]);
}
return res;
}
public List<InstantiatedTermDocumentInformation> getTermDocumentInformations() {
return termDocumentInformations;
}
}

View File

@ -0,0 +1,47 @@
package org.apache.lucene.store.instantiated;
/**
* Copyright 2006 The Apache Software Foundation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import org.apache.lucene.index.TermPositionVector;
import org.apache.lucene.index.TermVectorOffsetInfo;
import java.io.Serializable;
/**
* Extended vector space view of a document in an {@link InstantiatedIndexReader}.
*
* @see org.apache.lucene.index.TermPositionVector
*/
public class InstantiatedTermPositionVector
extends InstantiatedTermFreqVector
implements TermPositionVector, Serializable {
private static final long serialVersionUID = 1L;
public InstantiatedTermPositionVector(InstantiatedDocument document, String field) {
super(document, field);
}
public int[] getTermPositions(int index) {
return getTermDocumentInformations().get(index).getTermPositions();
}
public TermVectorOffsetInfo[] getOffsets(int index) {
return getTermDocumentInformations().get(index).getTermOffsets();
}
}

View File

@ -0,0 +1,100 @@
package org.apache.lucene.store.instantiated;
/**
* Copyright 2006 The Apache Software Foundation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import org.apache.lucene.index.TermPositions;
import java.io.IOException;
/**
* A {@link org.apache.lucene.index.TermPositions} navigating an {@link InstantiatedIndexReader}.
*/
public class InstantiatedTermPositions
extends InstantiatedTermDocs
implements TermPositions {
public int getPayloadLength() {
return currentDocumentInformation.getPayloads()[currentTermPositionIndex].length;
}
public byte[] getPayload(byte[] data, int offset) throws IOException {
byte[] payloads = currentDocumentInformation.getPayloads()[currentTermPositionIndex];
if (data == null || data.length - offset < getPayloadLength()) {
// the given array is null or too small to hold the payload data,
// so return the internal array instead of copying
return payloads;
} else {
System.arraycopy(payloads, 0, data, offset, payloads.length);
return data;
}
}
public boolean isPayloadAvailable() {
return currentDocumentInformation.getPayloads()[currentTermPositionIndex] != null;
}
public InstantiatedTermPositions(InstantiatedIndexReader reader) {
super(reader);
}
/**
* Returns next position in the current document. It is an error to call
* this more than {@link #freq()} times
* without calling {@link #next()}<p> This is
* invalid until {@link #next()} is called for
* the first time.
*/
public int nextPosition() {
currentTermPositionIndex++;
// if you get an ArrayIndexOutOfBoundsException here,
// it might be because currentDocumentInformation.getIndexFromTerm has not been set!
return currentDocumentInformation.getTermPositions()[currentTermPositionIndex];
}
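// An illustrative caller honoring the contract above (a sketch, not part of this class):
//
//   TermPositions termPositions = reader.termPositions(term);
//   while (termPositions.next()) {
//     for (int i = 0; i < termPositions.freq(); i++) {
//       int position = termPositions.nextPosition();
//     }
//   }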
private int currentTermPositionIndex;
/**
* Moves to the next pair in the enumeration.
* <p> Returns true if there is such a next pair in the enumeration.
*/
@Override
public boolean next() {
currentTermPositionIndex = -1;
return super.next();
}
/**
* Skips entries to the first beyond the current whose document number is
* greater than or equal to <i>target</i>. <p>Returns true iff there is such
* an entry. <p>Behaves as if written: <pre>
* boolean skipTo(int target) {
* do {
* if (!next())
* return false;
* } while (target > doc());
* return true;
* }
* </pre>
* Some implementations are considerably more efficient than that.
*/
@Override
public boolean skipTo(int target) {
currentTermPositionIndex = -1;
return super.skipTo(target);
}
}

View File

@ -0,0 +1,90 @@
<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01//EN" "http://www.w3.org/TR/html4/strict.dtd">
<html>
<!--**
* Copyright 2005 The Apache Software Foundation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*
-->
<head>
<title>InstantiatedIndex</title>
</head>
<body>
<p>WARNING: This contrib is experimental and the APIs may change without warning.</p>
<h2>Abstract</h2>
<p>
Represented as a coupled graph of class instances, this
all-in-memory index store implementation delivers search
results up to 100 times faster than the file-centric RAMDirectory,
at the cost of greater RAM consumption.
</p>
<h2>API</h2>
<p>
Like the default store implementation, InstantiatedIndex
comes with an IndexReader and an IndexWriter. The latter shares
many method signatures with the file-centric IndexWriter.
</p>
<p>
It is also possible to load the content of another index
by passing an IndexReader to the InstantiatedIndex constructor.
</p>
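<p>
A minimal usage sketch, mirroring the factory methods exercised by this contrib's
tests; the analyzer, field settings, and the <code>directory</code> variable are illustrative:
</p>
<pre>
InstantiatedIndex index = new InstantiatedIndex();

// write documents using the in-memory writer
InstantiatedIndexWriter writer = index.indexWriterFactory(new StandardAnalyzer(), true);
Document document = new Document();
document.add(new Field("title", "All work and no play", Field.Store.YES, Field.Index.TOKENIZED));
writer.addDocument(document);
writer.close();

// navigate the index using the in-memory reader
IndexReader reader = index.indexReaderFactory();

// or load the contents of an existing index
InstantiatedIndex loaded = new InstantiatedIndex(IndexReader.open(directory));
</pre>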
<h2>Performance</h2>
<p>
At a few thousand documents of ~160 characters each,
InstantiatedIndex outperforms RAMDirectory by some 50x;
by 15x at 100 documents of 2,000 characters each;
and it performs on a par with RAMDirectory at 10,000 documents of 2,000 characters each.
</p>
<p>Mileage may vary depending on term saturation.</p>
<p>
Populated with a single document InstantiatedIndex is almost, but not quite, as fast as MemoryIndex.
</p>
<p>
It takes more or less the same time to populate an InstantiatedIndex
as it takes to populate a RAMDirectory. Hardly any effort has been put
into optimizing the InstantiatedIndexWriter; only minimizing the amount
of time the index needs to be write-locked has been considered.
</p>
<h2>Caveats</h2>
<ul>
<li>No locks! Consider using InstantiatedIndex as if it was immutable.</li>
<li>No documents with fields containing readers!</li>
<li>Only FieldOption.ALL is allowed by IndexReader#getFieldNames(FieldOption).</li>
<li>No field selection when retrieving documents, as all stored fields are available in memory.</li>
</ul>
<h2>Use cases</h2>
<p>
Could replace any small index that would benefit from faster response times:
an a priori spell check index,
an index of new documents exposed to user search agent queries,
compiling classifiers in machine learning environments, etc.
<h2>Class diagram</h2>
<a href="../../../../../../../docs/classdiagram.png"><img width="640px" height="480px" src="../../../../../../../docs/classdiagram.png" alt="class diagram"></a>
<br/>
<a href="../../../../../../../docs/classdiagram.uxf">Diagram</a> rendered using <a href="http://umlet.com">UMLet</a> 7.1.
</body>
</html>

View File

@ -0,0 +1,424 @@
package org.apache.lucene.store.instantiated;
/**
* Copyright 2006 The Apache Software Foundation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import junit.framework.TestCase;
import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.index.*;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.RAMDirectory;
import java.io.IOException;
import java.util.*;
/**
* Asserts equality of content and behaviour of two index readers.
*/
public class TestIndicesEquals extends TestCase {
// public void test2() throws Exception {
// FSDirectory fsdir = FSDirectory.getDirectory("/tmp/fatcorpus");
// IndexReader ir = IndexReader.open(fsdir);
// InstantiatedIndex ii = new InstantiatedIndex(ir);
// ir.close();
// testEquals(fsdir, ii);
// }
public void testLoadIndexReader() throws Exception {
RAMDirectory dir = new RAMDirectory();
// create dir data
IndexWriter indexWriter = new IndexWriter(dir, new StandardAnalyzer(), true);
for (int i = 0; i < 5; i++) {
Document document = new Document();
assembleDocument(document, i);
indexWriter.addDocument(document);
}
indexWriter.close();
// test load ii from index reader
IndexReader ir = IndexReader.open(dir);
InstantiatedIndex ii = new InstantiatedIndex(ir);
ir.close();
testEquals(dir, ii);
}
public void testInstantiatedIndexWriter() throws Exception {
RAMDirectory dir = new RAMDirectory();
InstantiatedIndex ii = new InstantiatedIndex();
// create dir data
IndexWriter indexWriter = new IndexWriter(dir, new StandardAnalyzer(), true);
for (int i = 0; i < 500; i++) {
Document document = new Document();
assembleDocument(document, i);
indexWriter.addDocument(document);
}
indexWriter.close();
// test ii writer
InstantiatedIndexWriter instantiatedIndexWriter = ii.indexWriterFactory(new StandardAnalyzer(), true);
for (int i = 0; i < 500; i++) {
Document document = new Document();
assembleDocument(document, i);
instantiatedIndexWriter.addDocument(document);
}
instantiatedIndexWriter.close();
testEquals(dir, ii);
testTermDocs(dir, ii);
}
private void testTermDocs(Directory aprioriIndex, InstantiatedIndex testIndex) throws Exception {
IndexReader aprioriReader = IndexReader.open(aprioriIndex);
IndexReader testReader = testIndex.indexReaderFactory();
TermEnum aprioriTermEnum = aprioriReader.terms(new Term("c", "danny"));
TermDocs aprioriTermDocs = aprioriReader.termDocs(aprioriTermEnum.term());
TermDocs testTermDocs = testReader.termDocs(aprioriTermEnum.term());
assertEquals(aprioriTermDocs.next(), testTermDocs.next());
assertEquals(aprioriTermDocs.doc(), testTermDocs.doc());
assertEquals(aprioriTermDocs.skipTo(100), testTermDocs.skipTo(100));
assertEquals(aprioriTermDocs.doc(), testTermDocs.doc());
assertEquals(aprioriTermDocs.next(), testTermDocs.next());
assertEquals(aprioriTermDocs.doc(), testTermDocs.doc());
assertEquals(aprioriTermDocs.next(), testTermDocs.next());
assertEquals(aprioriTermDocs.doc(), testTermDocs.doc());
assertEquals(aprioriTermDocs.skipTo(110), testTermDocs.skipTo(110));
assertEquals(aprioriTermDocs.doc(), testTermDocs.doc());
assertEquals(aprioriTermDocs.skipTo(10), testTermDocs.skipTo(10));
assertEquals(aprioriTermDocs.doc(), testTermDocs.doc());
assertEquals(aprioriTermDocs.skipTo(210), testTermDocs.skipTo(210));
assertEquals(aprioriTermDocs.doc(), testTermDocs.doc());
aprioriTermDocs.close();
aprioriReader.close();
testTermDocs.close();
testReader.close();
}
private void assembleDocument(Document document, int i) {
document.add(new Field("a", i + " Do you really want to go and live in that house all winter?", Field.Store.YES, Field.Index.TOKENIZED, Field.TermVector.WITH_POSITIONS_OFFSETS));
if (i > 0) {
document.add(new Field("b0", i + " All work and no play makes Jack a dull boy", Field.Store.YES, Field.Index.TOKENIZED, Field.TermVector.WITH_POSITIONS_OFFSETS));
document.add(new Field("b1", i + " All work and no play makes Jack a dull boy", Field.Store.YES, Field.Index.NO_NORMS, Field.TermVector.NO));
document.add(new Field("b2", i + " All work and no play makes Jack a dull boy", Field.Store.NO, Field.Index.UN_TOKENIZED, Field.TermVector.NO));
document.add(new Field("b3", i + " All work and no play makes Jack a dull boy", Field.Store.YES, Field.Index.NO, Field.TermVector.NO));
if (i > 1) {
document.add(new Field("c", i + " Redrum redrum", Field.Store.YES, Field.Index.TOKENIZED, Field.TermVector.WITH_POSITIONS_OFFSETS));
if (i > 2) {
document.add(new Field("d", i + " Hello Danny, come and play with us... forever and ever. and ever.", Field.Store.YES, Field.Index.TOKENIZED, Field.TermVector.WITH_POSITIONS_OFFSETS));
if (i > 3) {
Field f = new Field("e", i + " Heres Johnny!", Field.Store.YES, Field.Index.TOKENIZED, Field.TermVector.WITH_POSITIONS_OFFSETS);
f.setOmitNorms(true);
document.add(f);
if (i > 4) {
final List<Token> tokens = new ArrayList<Token>(2);
Token t = new Token("the", 0, 2, "text");
t.setPayload(new Payload(new byte[]{1, 2, 3}));
tokens.add(t);
t = new Token("end", 3, 5, "text");
t.setPayload(new Payload(new byte[]{2}));
tokens.add(t);
tokens.add(new Token("fin", 7, 9));
document.add(new Field("f", new TokenStream() {
Iterator<Token> it = tokens.iterator();
public Token next() throws IOException {
if (!it.hasNext()) {
return null;
}
return it.next();
}
public void reset() throws IOException {
it = tokens.iterator();
}
}));
}
}
}
}
}
}
/**
* Asserts that the content of two index readers equal each other.
*
* @param aprioriIndex the index that is known to be correct
* @param testIndex the index that is supposed to equals the apriori index.
* @throws Exception
*/
protected void testEquals(Directory aprioriIndex, InstantiatedIndex testIndex) throws Exception {
IndexReader aprioriReader = IndexReader.open(aprioriIndex);
IndexReader testReader = testIndex.indexReaderFactory();
assertEquals(aprioriReader.numDocs(), testReader.numDocs());
for (Object field : aprioriReader.getFieldNames(IndexReader.FieldOption.ALL)) {
// test norms as used by normal use
byte[] aprioriNorms = aprioriReader.norms((String) field);
byte[] testNorms = testReader.norms((String) field);
assertEquals(aprioriNorms.length, testNorms.length);
for (int i = 0; i < aprioriNorms.length; i++) {
assertEquals("norms does not equals for field " + field + " in document " + i, aprioriNorms[i], testNorms[i]);
}
// test norms as used by multireader
aprioriNorms = new byte[aprioriReader.maxDoc()];
aprioriReader.norms((String) field, aprioriNorms, 0);
testNorms = new byte[testReader.maxDoc()];
testReader.norms((String) field, testNorms, 0);
assertEquals(aprioriNorms.length, testNorms.length);
for (int i = 0; i < aprioriNorms.length; i++) {
assertEquals("norms does not equals for field " + field + " in document " + i, aprioriNorms[i], testNorms[i]);
}
}
for (int docIndex = 0; docIndex < aprioriReader.numDocs(); docIndex++) {
assertEquals(aprioriReader.isDeleted(docIndex), testReader.isDeleted(docIndex));
}
// compare term enumeration stepping
TermEnum aprioriTermEnum = aprioriReader.terms();
TermEnum testTermEnum = testReader.terms();
while (true) {
if (!aprioriTermEnum.next()) {
assertFalse(testTermEnum.next());
break;
}
assertTrue(testTermEnum.next());
assertEquals(aprioriTermEnum.term(), testTermEnum.term());
assertTrue(aprioriTermEnum.docFreq() == testTermEnum.docFreq());
// compare termDocs seeking
TermDocs aprioriTermDocsSeeker = aprioriReader.termDocs(aprioriTermEnum.term());
TermDocs testTermDocsSeeker = testReader.termDocs(testTermEnum.term());
while (aprioriTermDocsSeeker.next()) {
assertTrue(testTermDocsSeeker.skipTo(aprioriTermDocsSeeker.doc()));
assertEquals(aprioriTermDocsSeeker.doc(), testTermDocsSeeker.doc());
}
aprioriTermDocsSeeker.close();
testTermDocsSeeker.close();
// compare documents per term
assertEquals(aprioriReader.docFreq(aprioriTermEnum.term()), testReader.docFreq(testTermEnum.term()));
TermDocs aprioriTermDocs = aprioriReader.termDocs(aprioriTermEnum.term());
TermDocs testTermDocs = testReader.termDocs(testTermEnum.term());
while (true) {
if (!aprioriTermDocs.next()) {
assertFalse(testTermDocs.next());
break;
}
assertTrue(testTermDocs.next());
assertEquals(aprioriTermDocs.doc(), testTermDocs.doc());
assertEquals(aprioriTermDocs.freq(), testTermDocs.freq());
}
aprioriTermDocs.close();
testTermDocs.close();
// compare term positions
TermPositions testTermPositions = testReader.termPositions(testTermEnum.term());
TermPositions aprioriTermPositions = aprioriReader.termPositions(aprioriTermEnum.term());
if (aprioriTermPositions != null) {
for (int docIndex = 0; docIndex < aprioriReader.maxDoc(); docIndex++) {
boolean hasNext = aprioriTermPositions.next();
if (hasNext) {
assertTrue(testTermPositions.next());
assertEquals(aprioriTermPositions.freq(), testTermPositions.freq());
for (int termPositionIndex = 0; termPositionIndex < aprioriTermPositions.freq(); termPositionIndex++) {
int aprioriPos = aprioriTermPositions.nextPosition();
int testPos = testTermPositions.nextPosition();
assertEquals(aprioriPos, testPos);
assertEquals(aprioriTermPositions.isPayloadAvailable(), testTermPositions.isPayloadAvailable());
if (aprioriTermPositions.isPayloadAvailable()) {
assertEquals(aprioriTermPositions.getPayloadLength(), testTermPositions.getPayloadLength());
byte[] aprioriPayloads = aprioriTermPositions.getPayload(new byte[aprioriTermPositions.getPayloadLength()], 0);
byte[] testPayloads = testTermPositions.getPayload(new byte[testTermPositions.getPayloadLength()], 0);
for (int i = 0; i < aprioriPayloads.length; i++) {
assertEquals(aprioriPayloads[i], testPayloads[i]);
}
}
}
}
}
aprioriTermPositions.close();
testTermPositions.close();
}
}
// compare term enumeration seeking
aprioriTermEnum = aprioriReader.terms();
TermEnum aprioriTermEnumSeeker = aprioriReader.terms();
TermEnum testTermEnumSeeker = testReader.terms();
while (aprioriTermEnum.next()) {
if (aprioriTermEnumSeeker.skipTo(aprioriTermEnum.term())) {
assertTrue(testTermEnumSeeker.skipTo(aprioriTermEnum.term()));
assertEquals(aprioriTermEnumSeeker.term(), testTermEnumSeeker.term());
} else {
assertFalse(testTermEnumSeeker.skipTo(aprioriTermEnum.term()));
}
}
aprioriTermEnum.close();
aprioriTermEnumSeeker.close();
testTermEnumSeeker.close();
// skip to non existing terms
aprioriTermEnumSeeker = aprioriReader.terms();
testTermEnumSeeker = testReader.terms();
aprioriTermEnum = aprioriReader.terms();
aprioriTermEnum.next();
Term nonExistingTerm = new Term(aprioriTermEnum.term().field(), "bzzzzoo993djdj380sdf");
aprioriTermEnum.close();
assertEquals(aprioriTermEnumSeeker.skipTo(nonExistingTerm), testTermEnumSeeker.skipTo(nonExistingTerm));
assertEquals(aprioriTermEnumSeeker.term(), testTermEnumSeeker.term());
aprioriTermEnumSeeker.close();
testTermEnumSeeker.close();
// compare term vectors and position vectors
for (int documentNumber = 0; documentNumber < aprioriReader.numDocs(); documentNumber++) {
if (documentNumber > 0) {
assertNotNull(aprioriReader.getTermFreqVector(documentNumber, "b0"));
assertNull(aprioriReader.getTermFreqVector(documentNumber, "b1"));
assertNotNull(testReader.getTermFreqVector(documentNumber, "b0"));
assertNull(testReader.getTermFreqVector(documentNumber, "b1"));
}
TermFreqVector[] aprioriFreqVectors = aprioriReader.getTermFreqVectors(documentNumber);
TermFreqVector[] testFreqVectors = testReader.getTermFreqVectors(documentNumber);
if (aprioriFreqVectors != null && testFreqVectors != null) {
Arrays.sort(aprioriFreqVectors, new Comparator<TermFreqVector>() {
public int compare(TermFreqVector termFreqVector, TermFreqVector termFreqVector1) {
return termFreqVector.getField().compareTo(termFreqVector1.getField());
}
});
Arrays.sort(testFreqVectors, new Comparator<TermFreqVector>() {
public int compare(TermFreqVector termFreqVector, TermFreqVector termFreqVector1) {
return termFreqVector.getField().compareTo(termFreqVector1.getField());
}
});
assertEquals("document " + documentNumber + " vectors does not match", aprioriFreqVectors.length, testFreqVectors.length);
for (int freqVectorIndex = 0; freqVectorIndex < aprioriFreqVectors.length; freqVectorIndex++) {
assertTrue(Arrays.equals(aprioriFreqVectors[freqVectorIndex].getTermFrequencies(), testFreqVectors[freqVectorIndex].getTermFrequencies()));
assertTrue(Arrays.equals(aprioriFreqVectors[freqVectorIndex].getTerms(), testFreqVectors[freqVectorIndex].getTerms()));
if (aprioriFreqVectors[freqVectorIndex] instanceof TermPositionVector) {
TermPositionVector aprioriTermPositionVector = (TermPositionVector) aprioriFreqVectors[freqVectorIndex];
TermPositionVector testTermPositionVector = (TermPositionVector) testFreqVectors[freqVectorIndex];
for (int positionVectorIndex = 0; positionVectorIndex < aprioriFreqVectors[freqVectorIndex].getTerms().length; positionVectorIndex++)
{
if (aprioriTermPositionVector.getOffsets(positionVectorIndex) != null) {
assertTrue(Arrays.equals(aprioriTermPositionVector.getOffsets(positionVectorIndex), testTermPositionVector.getOffsets(positionVectorIndex)));
}
if (aprioriTermPositionVector.getTermPositions(positionVectorIndex) != null) {
assertTrue(Arrays.equals(aprioriTermPositionVector.getTermPositions(positionVectorIndex), testTermPositionVector.getTermPositions(positionVectorIndex)));
}
}
}
}
}
}
aprioriTermEnum.close();
testTermEnum.close();
aprioriReader.close();
testReader.close();
}
}