From 4eb869258836297f6b252ae5559ce8c386fa3c7a Mon Sep 17 00:00:00 2001
From: Michael Busch
Date: Tue, 20 May 2008 07:40:54 +0000
Subject: [PATCH] Set eol-style to native for all files in src/java and src/test that did not have this property set before.

git-svn-id: https://svn.apache.org/repos/asf/lucene/java/trunk@658136 13f79535-47bb-0310-9956-ffa450edef68
---
 .../lucene/analysis/CachingTokenFilter.java   |  144 +-
 .../apache/lucene/document/AbstractField.java |  548 ++++----
 .../apache/lucene/document/FieldSelector.java |   68 +-
 .../lucene/document/FieldSelectorResult.java  |  192 +--
 .../org/apache/lucene/document/Fieldable.java |  288 ++--
 .../document/LoadFirstFieldSelector.java      |   56 +-
 .../document/SetBasedFieldSelector.java       |  118 +-
 .../lucene/index/DefaultSkipListReader.java   |  228 ++--
 .../lucene/index/DefaultSkipListWriter.java   |  248 ++--
 .../lucene/index/FieldReaderException.java    |  158 +--
 .../index/MultiLevelSkipListReader.java       |  546 ++++----
 .../index/MultiLevelSkipListWriter.java       |  302 ++---
 src/java/org/apache/lucene/index/Payload.java |  326 ++---
 .../lucene/search/DisjunctionMaxQuery.java    |  514 ++++----
 .../lucene/search/DisjunctionMaxScorer.java   |  390 +++---
 .../org/apache/lucene/util/SmallFloat.java    |  250 ++--
 .../analysis/TestCachingTokenFilter.java      |  206 +--
 .../lucene/analysis/TestStopFilter.java       |  256 ++--
 .../lucene/index/TestIndexWriterMerging.java  |  216 +--
 .../lucene/index/TestMultiLevelSkipList.java  |  316 ++---
 .../org/apache/lucene/index/TestPayloads.java | 1172 ++++++++---------
 .../apache/lucene/index/TestTermdocPerf.java  |  230 ++--
 .../lucene/search/TestCustomSearcherSort.java |  568 ++++----
 23 files changed, 3670 insertions(+), 3670 deletions(-)

diff --git a/src/java/org/apache/lucene/analysis/CachingTokenFilter.java b/src/java/org/apache/lucene/analysis/CachingTokenFilter.java
index c35011d09eb..d2e236c1d4c 100644
--- a/src/java/org/apache/lucene/analysis/CachingTokenFilter.java
+++ b/src/java/org/apache/lucene/analysis/CachingTokenFilter.java
@@ -1,72 +1,72 @@
-package org.apache.lucene.analysis;
-
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-import java.io.IOException;
-import java.util.Iterator;
-import java.util.LinkedList;
-import java.util.List;
-
-/**
- * This class can be used if the Tokens of a TokenStream
- * are intended to be consumed more than once. It caches
- * all Tokens locally in a List.
- *
- * CachingTokenFilter implements the optional method
- * {@link TokenStream#reset()}, which repositions the
- * stream to the first Token.
- * - */ -public class CachingTokenFilter extends TokenFilter { - private List cache; - private Iterator iterator; - - public CachingTokenFilter(TokenStream input) { - super(input); - } - - public Token next() throws IOException { - if (cache == null) { - // fill cache lazily - cache = new LinkedList(); - fillCache(); - iterator = cache.iterator(); - } - - if (!iterator.hasNext()) { - // the cache is exhausted, return null - return null; - } - - return (Token) iterator.next(); - } - - public void reset() throws IOException { - if(cache != null) { - iterator = cache.iterator(); - } - } - - private void fillCache() throws IOException { - Token token; - while ( (token = input.next()) != null) { - cache.add(token); - } - } - -} +package org.apache.lucene.analysis; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import java.io.IOException; +import java.util.Iterator; +import java.util.LinkedList; +import java.util.List; + +/** + * This class can be used if the Tokens of a TokenStream + * are intended to be consumed more than once. It caches + * all Tokens locally in a List. + * + * CachingTokenFilter implements the optional method + * {@link TokenStream#reset()}, which repositions the + * stream to the first Token. + * + */ +public class CachingTokenFilter extends TokenFilter { + private List cache; + private Iterator iterator; + + public CachingTokenFilter(TokenStream input) { + super(input); + } + + public Token next() throws IOException { + if (cache == null) { + // fill cache lazily + cache = new LinkedList(); + fillCache(); + iterator = cache.iterator(); + } + + if (!iterator.hasNext()) { + // the cache is exhausted, return null + return null; + } + + return (Token) iterator.next(); + } + + public void reset() throws IOException { + if(cache != null) { + iterator = cache.iterator(); + } + } + + private void fillCache() throws IOException { + Token token; + while ( (token = input.next()) != null) { + cache.add(token); + } + } + +} diff --git a/src/java/org/apache/lucene/document/AbstractField.java b/src/java/org/apache/lucene/document/AbstractField.java index c639349dae6..77e158a967a 100755 --- a/src/java/org/apache/lucene/document/AbstractField.java +++ b/src/java/org/apache/lucene/document/AbstractField.java @@ -1,274 +1,274 @@ -package org.apache.lucene.document; -/** - * Copyright 2006 The Apache Software Foundation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. 
- * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - - -/** - * - * - **/ -public abstract class AbstractField implements Fieldable { - - protected String name = "body"; - protected boolean storeTermVector = false; - protected boolean storeOffsetWithTermVector = false; - protected boolean storePositionWithTermVector = false; - protected boolean omitNorms = false; - protected boolean isStored = false; - protected boolean isIndexed = true; - protected boolean isTokenized = true; - protected boolean isBinary = false; - protected boolean isCompressed = false; - protected boolean lazy = false; - protected float boost = 1.0f; - // the one and only data object for all different kind of field values - protected Object fieldsData = null; - - protected AbstractField() - { - - } - - protected AbstractField(String name, Field.Store store, Field.Index index, Field.TermVector termVector) { - if (name == null) - throw new NullPointerException("name cannot be null"); - this.name = name.intern(); // field names are interned - - if (store == Field.Store.YES){ - this.isStored = true; - this.isCompressed = false; - } - else if (store == Field.Store.COMPRESS) { - this.isStored = true; - this.isCompressed = true; - } - else if (store == Field.Store.NO){ - this.isStored = false; - this.isCompressed = false; - } - else - throw new IllegalArgumentException("unknown store parameter " + store); - - if (index == Field.Index.NO) { - this.isIndexed = false; - this.isTokenized = false; - } else if (index == Field.Index.TOKENIZED) { - this.isIndexed = true; - this.isTokenized = true; - } else if (index == Field.Index.UN_TOKENIZED) { - this.isIndexed = true; - this.isTokenized = false; - } else if (index == Field.Index.NO_NORMS) { - this.isIndexed = true; - this.isTokenized = false; - this.omitNorms = true; - } else { - throw new IllegalArgumentException("unknown index parameter " + index); - } - - this.isBinary = false; - - setStoreTermVector(termVector); - } - - /** Sets the boost factor hits on this field. This value will be - * multiplied into the score of all hits on this this field of this - * document. - * - *

The boost is multiplied by {@link org.apache.lucene.document.Document#getBoost()} of the document - * containing this field. If a document has multiple fields with the same - * name, all such values are multiplied together. This product is then - * multipled by the value {@link org.apache.lucene.search.Similarity#lengthNorm(String,int)}, and - * rounded by {@link org.apache.lucene.search.Similarity#encodeNorm(float)} before it is stored in the - * index. One should attempt to ensure that this product does not overflow - * the range of that encoding. - * - * @see org.apache.lucene.document.Document#setBoost(float) - * @see org.apache.lucene.search.Similarity#lengthNorm(String, int) - * @see org.apache.lucene.search.Similarity#encodeNorm(float) - */ - public void setBoost(float boost) { - this.boost = boost; - } - - /** Returns the boost factor for hits for this field. - * - *

The default value is 1.0. - * - *

Note: this value is not stored directly with the document in the index. - * Documents returned from {@link org.apache.lucene.index.IndexReader#document(int)} and - * {@link org.apache.lucene.search.Hits#doc(int)} may thus not have the same value present as when - * this field was indexed. - * - * @see #setBoost(float) - */ - public float getBoost() { - return boost; - } - - /** Returns the name of the field as an interned string. - * For example "date", "title", "body", ... - */ - public String name() { return name; } - - protected void setStoreTermVector(Field.TermVector termVector) { - if (termVector == Field.TermVector.NO) { - this.storeTermVector = false; - this.storePositionWithTermVector = false; - this.storeOffsetWithTermVector = false; - } - else if (termVector == Field.TermVector.YES) { - this.storeTermVector = true; - this.storePositionWithTermVector = false; - this.storeOffsetWithTermVector = false; - } - else if (termVector == Field.TermVector.WITH_POSITIONS) { - this.storeTermVector = true; - this.storePositionWithTermVector = true; - this.storeOffsetWithTermVector = false; - } - else if (termVector == Field.TermVector.WITH_OFFSETS) { - this.storeTermVector = true; - this.storePositionWithTermVector = false; - this.storeOffsetWithTermVector = true; - } - else if (termVector == Field.TermVector.WITH_POSITIONS_OFFSETS) { - this.storeTermVector = true; - this.storePositionWithTermVector = true; - this.storeOffsetWithTermVector = true; - } - else { - throw new IllegalArgumentException("unknown termVector parameter " + termVector); - } - } - - /** True iff the value of the field is to be stored in the index for return - with search hits. It is an error for this to be true if a field is - Reader-valued. */ - public final boolean isStored() { return isStored; } - - /** True iff the value of the field is to be indexed, so that it may be - searched on. */ - public final boolean isIndexed() { return isIndexed; } - - /** True iff the value of the field should be tokenized as text prior to - indexing. Un-tokenized fields are indexed as a single word and may not be - Reader-valued. */ - public final boolean isTokenized() { return isTokenized; } - - /** True if the value of the field is stored and compressed within the index */ - public final boolean isCompressed() { return isCompressed; } - - /** True iff the term or terms used to index this field are stored as a term - * vector, available from {@link org.apache.lucene.index.IndexReader#getTermFreqVector(int,String)}. - * These methods do not provide access to the original content of the field, - * only to terms used to index it. If the original content must be - * preserved, use the stored attribute instead. - * - * @see org.apache.lucene.index.IndexReader#getTermFreqVector(int, String) - */ - public final boolean isTermVectorStored() { return storeTermVector; } - - /** - * True iff terms are stored as term vector together with their offsets - * (start and end positon in source text). - */ - public boolean isStoreOffsetWithTermVector(){ - return storeOffsetWithTermVector; - } - - /** - * True iff terms are stored as term vector together with their token positions. 
- */ - public boolean isStorePositionWithTermVector(){ - return storePositionWithTermVector; - } - - /** True iff the value of the filed is stored as binary */ - public final boolean isBinary() { return isBinary; } - - /** True if norms are omitted for this indexed field */ - public boolean getOmitNorms() { return omitNorms; } - - /** Expert: - * - * If set, omit normalization factors associated with this indexed field. - * This effectively disables indexing boosts and length normalization for this field. - */ - public void setOmitNorms(boolean omitNorms) { this.omitNorms=omitNorms; } - - public boolean isLazy() { - return lazy; - } - - /** Prints a Field for human consumption. */ - public final String toString() { - StringBuffer result = new StringBuffer(); - if (isStored) { - result.append("stored"); - if (isCompressed) - result.append("/compressed"); - else - result.append("/uncompressed"); - } - if (isIndexed) { - if (result.length() > 0) - result.append(","); - result.append("indexed"); - } - if (isTokenized) { - if (result.length() > 0) - result.append(","); - result.append("tokenized"); - } - if (storeTermVector) { - if (result.length() > 0) - result.append(","); - result.append("termVector"); - } - if (storeOffsetWithTermVector) { - if (result.length() > 0) - result.append(","); - result.append("termVectorOffsets"); - } - if (storePositionWithTermVector) { - if (result.length() > 0) - result.append(","); - result.append("termVectorPosition"); - } - if (isBinary) { - if (result.length() > 0) - result.append(","); - result.append("binary"); - } - if (omitNorms) { - result.append(",omitNorms"); - } - if (lazy){ - result.append(",lazy"); - } - result.append('<'); - result.append(name); - result.append(':'); - - if (fieldsData != null && lazy == false) { - result.append(fieldsData); - } - - result.append('>'); - return result.toString(); - } -} +package org.apache.lucene.document; +/** + * Copyright 2006 The Apache Software Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + + +/** + * + * + **/ +public abstract class AbstractField implements Fieldable { + + protected String name = "body"; + protected boolean storeTermVector = false; + protected boolean storeOffsetWithTermVector = false; + protected boolean storePositionWithTermVector = false; + protected boolean omitNorms = false; + protected boolean isStored = false; + protected boolean isIndexed = true; + protected boolean isTokenized = true; + protected boolean isBinary = false; + protected boolean isCompressed = false; + protected boolean lazy = false; + protected float boost = 1.0f; + // the one and only data object for all different kind of field values + protected Object fieldsData = null; + + protected AbstractField() + { + + } + + protected AbstractField(String name, Field.Store store, Field.Index index, Field.TermVector termVector) { + if (name == null) + throw new NullPointerException("name cannot be null"); + this.name = name.intern(); // field names are interned + + if (store == Field.Store.YES){ + this.isStored = true; + this.isCompressed = false; + } + else if (store == Field.Store.COMPRESS) { + this.isStored = true; + this.isCompressed = true; + } + else if (store == Field.Store.NO){ + this.isStored = false; + this.isCompressed = false; + } + else + throw new IllegalArgumentException("unknown store parameter " + store); + + if (index == Field.Index.NO) { + this.isIndexed = false; + this.isTokenized = false; + } else if (index == Field.Index.TOKENIZED) { + this.isIndexed = true; + this.isTokenized = true; + } else if (index == Field.Index.UN_TOKENIZED) { + this.isIndexed = true; + this.isTokenized = false; + } else if (index == Field.Index.NO_NORMS) { + this.isIndexed = true; + this.isTokenized = false; + this.omitNorms = true; + } else { + throw new IllegalArgumentException("unknown index parameter " + index); + } + + this.isBinary = false; + + setStoreTermVector(termVector); + } + + /** Sets the boost factor hits on this field. This value will be + * multiplied into the score of all hits on this this field of this + * document. + * + *

The boost is multiplied by {@link org.apache.lucene.document.Document#getBoost()} of the document + * containing this field. If a document has multiple fields with the same + * name, all such values are multiplied together. This product is then + * multipled by the value {@link org.apache.lucene.search.Similarity#lengthNorm(String,int)}, and + * rounded by {@link org.apache.lucene.search.Similarity#encodeNorm(float)} before it is stored in the + * index. One should attempt to ensure that this product does not overflow + * the range of that encoding. + * + * @see org.apache.lucene.document.Document#setBoost(float) + * @see org.apache.lucene.search.Similarity#lengthNorm(String, int) + * @see org.apache.lucene.search.Similarity#encodeNorm(float) + */ + public void setBoost(float boost) { + this.boost = boost; + } + + /** Returns the boost factor for hits for this field. + * + *

The default value is 1.0. + * + *

Note: this value is not stored directly with the document in the index. + * Documents returned from {@link org.apache.lucene.index.IndexReader#document(int)} and + * {@link org.apache.lucene.search.Hits#doc(int)} may thus not have the same value present as when + * this field was indexed. + * + * @see #setBoost(float) + */ + public float getBoost() { + return boost; + } + + /** Returns the name of the field as an interned string. + * For example "date", "title", "body", ... + */ + public String name() { return name; } + + protected void setStoreTermVector(Field.TermVector termVector) { + if (termVector == Field.TermVector.NO) { + this.storeTermVector = false; + this.storePositionWithTermVector = false; + this.storeOffsetWithTermVector = false; + } + else if (termVector == Field.TermVector.YES) { + this.storeTermVector = true; + this.storePositionWithTermVector = false; + this.storeOffsetWithTermVector = false; + } + else if (termVector == Field.TermVector.WITH_POSITIONS) { + this.storeTermVector = true; + this.storePositionWithTermVector = true; + this.storeOffsetWithTermVector = false; + } + else if (termVector == Field.TermVector.WITH_OFFSETS) { + this.storeTermVector = true; + this.storePositionWithTermVector = false; + this.storeOffsetWithTermVector = true; + } + else if (termVector == Field.TermVector.WITH_POSITIONS_OFFSETS) { + this.storeTermVector = true; + this.storePositionWithTermVector = true; + this.storeOffsetWithTermVector = true; + } + else { + throw new IllegalArgumentException("unknown termVector parameter " + termVector); + } + } + + /** True iff the value of the field is to be stored in the index for return + with search hits. It is an error for this to be true if a field is + Reader-valued. */ + public final boolean isStored() { return isStored; } + + /** True iff the value of the field is to be indexed, so that it may be + searched on. */ + public final boolean isIndexed() { return isIndexed; } + + /** True iff the value of the field should be tokenized as text prior to + indexing. Un-tokenized fields are indexed as a single word and may not be + Reader-valued. */ + public final boolean isTokenized() { return isTokenized; } + + /** True if the value of the field is stored and compressed within the index */ + public final boolean isCompressed() { return isCompressed; } + + /** True iff the term or terms used to index this field are stored as a term + * vector, available from {@link org.apache.lucene.index.IndexReader#getTermFreqVector(int,String)}. + * These methods do not provide access to the original content of the field, + * only to terms used to index it. If the original content must be + * preserved, use the stored attribute instead. + * + * @see org.apache.lucene.index.IndexReader#getTermFreqVector(int, String) + */ + public final boolean isTermVectorStored() { return storeTermVector; } + + /** + * True iff terms are stored as term vector together with their offsets + * (start and end positon in source text). + */ + public boolean isStoreOffsetWithTermVector(){ + return storeOffsetWithTermVector; + } + + /** + * True iff terms are stored as term vector together with their token positions. 
+ */ + public boolean isStorePositionWithTermVector(){ + return storePositionWithTermVector; + } + + /** True iff the value of the filed is stored as binary */ + public final boolean isBinary() { return isBinary; } + + /** True if norms are omitted for this indexed field */ + public boolean getOmitNorms() { return omitNorms; } + + /** Expert: + * + * If set, omit normalization factors associated with this indexed field. + * This effectively disables indexing boosts and length normalization for this field. + */ + public void setOmitNorms(boolean omitNorms) { this.omitNorms=omitNorms; } + + public boolean isLazy() { + return lazy; + } + + /** Prints a Field for human consumption. */ + public final String toString() { + StringBuffer result = new StringBuffer(); + if (isStored) { + result.append("stored"); + if (isCompressed) + result.append("/compressed"); + else + result.append("/uncompressed"); + } + if (isIndexed) { + if (result.length() > 0) + result.append(","); + result.append("indexed"); + } + if (isTokenized) { + if (result.length() > 0) + result.append(","); + result.append("tokenized"); + } + if (storeTermVector) { + if (result.length() > 0) + result.append(","); + result.append("termVector"); + } + if (storeOffsetWithTermVector) { + if (result.length() > 0) + result.append(","); + result.append("termVectorOffsets"); + } + if (storePositionWithTermVector) { + if (result.length() > 0) + result.append(","); + result.append("termVectorPosition"); + } + if (isBinary) { + if (result.length() > 0) + result.append(","); + result.append("binary"); + } + if (omitNorms) { + result.append(",omitNorms"); + } + if (lazy){ + result.append(",lazy"); + } + result.append('<'); + result.append(name); + result.append(':'); + + if (fieldsData != null && lazy == false) { + result.append(fieldsData); + } + + result.append('>'); + return result.toString(); + } +} diff --git a/src/java/org/apache/lucene/document/FieldSelector.java b/src/java/org/apache/lucene/document/FieldSelector.java index 869c846a5db..079ade6ffb5 100755 --- a/src/java/org/apache/lucene/document/FieldSelector.java +++ b/src/java/org/apache/lucene/document/FieldSelector.java @@ -1,34 +1,34 @@ -package org.apache.lucene.document; - -import java.io.Serializable; -/** - * Copyright 2004 The Apache Software Foundation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -/** - * Similar to a {@link java.io.FileFilter}, the FieldSelector allows one to make decisions about - * what Fields get loaded on a {@link Document} by {@link org.apache.lucene.index.IndexReader#document(int,org.apache.lucene.document.FieldSelector)} - * - **/ -public interface FieldSelector extends Serializable { - - /** - * - * @param fieldName the field to accept or reject - * @return an instance of {@link FieldSelectorResult} - * if the {@link Field} named fieldName should be loaded. 
- */ - FieldSelectorResult accept(String fieldName); -} +package org.apache.lucene.document; + +import java.io.Serializable; +/** + * Copyright 2004 The Apache Software Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** + * Similar to a {@link java.io.FileFilter}, the FieldSelector allows one to make decisions about + * what Fields get loaded on a {@link Document} by {@link org.apache.lucene.index.IndexReader#document(int,org.apache.lucene.document.FieldSelector)} + * + **/ +public interface FieldSelector extends Serializable { + + /** + * + * @param fieldName the field to accept or reject + * @return an instance of {@link FieldSelectorResult} + * if the {@link Field} named fieldName should be loaded. + */ + FieldSelectorResult accept(String fieldName); +} diff --git a/src/java/org/apache/lucene/document/FieldSelectorResult.java b/src/java/org/apache/lucene/document/FieldSelectorResult.java index e92bec2d05a..b9c76a4d5b4 100755 --- a/src/java/org/apache/lucene/document/FieldSelectorResult.java +++ b/src/java/org/apache/lucene/document/FieldSelectorResult.java @@ -1,96 +1,96 @@ -package org.apache.lucene.document; - -import java.io.Serializable; -/** - * Copyright 2004 The Apache Software Foundation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -/** - * Provides information about what should be done with this Field - * - **/ -//Replace with an enumerated type in 1.5 -public final class FieldSelectorResult implements Serializable { - - /** - * Load this {@link Field} every time the {@link Document} is loaded, reading in the data as it is encounterd. - * {@link Document#getField(String)} and {@link Document#getFieldable(String)} should not return null. - *

- * {@link Document#add(Fieldable)} should be called by the Reader. - */ - public transient static final FieldSelectorResult LOAD = new FieldSelectorResult(0); - /** - * Lazily load this {@link Field}. This means the {@link Field} is valid, but it may not actually contain its data until - * invoked. {@link Document#getField(String)} SHOULD NOT BE USED. {@link Document#getFieldable(String)} is safe to use and should - * return a valid instance of a {@link Fieldable}. - *

- * {@link Document#add(Fieldable)} should be called by the Reader. - */ - public transient static final FieldSelectorResult LAZY_LOAD = new FieldSelectorResult(1); - /** - * Do not load the {@link Field}. {@link Document#getField(String)} and {@link Document#getFieldable(String)} should return null. - * {@link Document#add(Fieldable)} is not called. - *

- * {@link Document#add(Fieldable)} should not be called by the Reader. - */ - public transient static final FieldSelectorResult NO_LOAD = new FieldSelectorResult(2); - /** - * Load this field as in the {@link #LOAD} case, but immediately return from {@link Field} loading for the {@link Document}. Thus, the - * Document may not have its complete set of Fields. {@link Document#getField(String)} and {@link Document#getFieldable(String)} should - * both be valid for this {@link Field} - *

- * {@link Document#add(Fieldable)} should be called by the Reader. - */ - public transient static final FieldSelectorResult LOAD_AND_BREAK = new FieldSelectorResult(3); - /** - * Behaves much like {@link #LOAD} but does not uncompress any compressed data. This is used for internal purposes. - * {@link Document#getField(String)} and {@link Document#getFieldable(String)} should not return null. - *

- * {@link Document#add(Fieldable)} should be called by the Reader. - */ - public transient static final FieldSelectorResult LOAD_FOR_MERGE = new FieldSelectorResult(4); - - /** Expert: Load the size of this {@link Field} rather than its value. - * Size is measured as number of bytes required to store the field == bytes for a binary or any compressed value, and 2*chars for a String value. - * The size is stored as a binary value, represented as an int in a byte[], with the higher order byte first in [0] - */ - public transient static final FieldSelectorResult SIZE = new FieldSelectorResult(5); - - /** Expert: Like {@link #SIZE} but immediately break from the field loading loop, i.e., stop loading further fields, after the size is loaded */ - public transient static final FieldSelectorResult SIZE_AND_BREAK = new FieldSelectorResult(6); - - - - private int id; - - private FieldSelectorResult(int id) { - this.id = id; - } - - public boolean equals(Object o) { - if (this == o) return true; - if (o == null || getClass() != o.getClass()) return false; - - final FieldSelectorResult that = (FieldSelectorResult) o; - - if (id != that.id) return false; - - return true; - } - - public int hashCode() { - return id; - } -} +package org.apache.lucene.document; + +import java.io.Serializable; +/** + * Copyright 2004 The Apache Software Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** + * Provides information about what should be done with this Field + * + **/ +//Replace with an enumerated type in 1.5 +public final class FieldSelectorResult implements Serializable { + + /** + * Load this {@link Field} every time the {@link Document} is loaded, reading in the data as it is encounterd. + * {@link Document#getField(String)} and {@link Document#getFieldable(String)} should not return null. + *

+ * {@link Document#add(Fieldable)} should be called by the Reader. + */ + public transient static final FieldSelectorResult LOAD = new FieldSelectorResult(0); + /** + * Lazily load this {@link Field}. This means the {@link Field} is valid, but it may not actually contain its data until + * invoked. {@link Document#getField(String)} SHOULD NOT BE USED. {@link Document#getFieldable(String)} is safe to use and should + * return a valid instance of a {@link Fieldable}. + *

+ * {@link Document#add(Fieldable)} should be called by the Reader. + */ + public transient static final FieldSelectorResult LAZY_LOAD = new FieldSelectorResult(1); + /** + * Do not load the {@link Field}. {@link Document#getField(String)} and {@link Document#getFieldable(String)} should return null. + * {@link Document#add(Fieldable)} is not called. + *

+ * {@link Document#add(Fieldable)} should not be called by the Reader. + */ + public transient static final FieldSelectorResult NO_LOAD = new FieldSelectorResult(2); + /** + * Load this field as in the {@link #LOAD} case, but immediately return from {@link Field} loading for the {@link Document}. Thus, the + * Document may not have its complete set of Fields. {@link Document#getField(String)} and {@link Document#getFieldable(String)} should + * both be valid for this {@link Field} + *

+ * {@link Document#add(Fieldable)} should be called by the Reader. + */ + public transient static final FieldSelectorResult LOAD_AND_BREAK = new FieldSelectorResult(3); + /** + * Behaves much like {@link #LOAD} but does not uncompress any compressed data. This is used for internal purposes. + * {@link Document#getField(String)} and {@link Document#getFieldable(String)} should not return null. + *

+ * {@link Document#add(Fieldable)} should be called by the Reader. + */ + public transient static final FieldSelectorResult LOAD_FOR_MERGE = new FieldSelectorResult(4); + + /** Expert: Load the size of this {@link Field} rather than its value. + * Size is measured as number of bytes required to store the field == bytes for a binary or any compressed value, and 2*chars for a String value. + * The size is stored as a binary value, represented as an int in a byte[], with the higher order byte first in [0] + */ + public transient static final FieldSelectorResult SIZE = new FieldSelectorResult(5); + + /** Expert: Like {@link #SIZE} but immediately break from the field loading loop, i.e., stop loading further fields, after the size is loaded */ + public transient static final FieldSelectorResult SIZE_AND_BREAK = new FieldSelectorResult(6); + + + + private int id; + + private FieldSelectorResult(int id) { + this.id = id; + } + + public boolean equals(Object o) { + if (this == o) return true; + if (o == null || getClass() != o.getClass()) return false; + + final FieldSelectorResult that = (FieldSelectorResult) o; + + if (id != that.id) return false; + + return true; + } + + public int hashCode() { + return id; + } +} diff --git a/src/java/org/apache/lucene/document/Fieldable.java b/src/java/org/apache/lucene/document/Fieldable.java index 7b2fd4f7eb3..f6ea323a068 100755 --- a/src/java/org/apache/lucene/document/Fieldable.java +++ b/src/java/org/apache/lucene/document/Fieldable.java @@ -1,144 +1,144 @@ -package org.apache.lucene.document; - -/** - * Copyright 2004 The Apache Software Foundation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -import java.io.Reader; -import java.io.Serializable; - -import org.apache.lucene.analysis.TokenStream; - -/** - * Synonymous with {@link Field}. - * - **/ -public interface Fieldable extends Serializable { - /** Sets the boost factor hits on this field. This value will be - * multiplied into the score of all hits on this this field of this - * document. - * - *

The boost is multiplied by {@link org.apache.lucene.document.Document#getBoost()} of the document - * containing this field. If a document has multiple fields with the same - * name, all such values are multiplied together. This product is then - * multipled by the value {@link org.apache.lucene.search.Similarity#lengthNorm(String,int)}, and - * rounded by {@link org.apache.lucene.search.Similarity#encodeNorm(float)} before it is stored in the - * index. One should attempt to ensure that this product does not overflow - * the range of that encoding. - * - * @see org.apache.lucene.document.Document#setBoost(float) - * @see org.apache.lucene.search.Similarity#lengthNorm(String, int) - * @see org.apache.lucene.search.Similarity#encodeNorm(float) - */ - void setBoost(float boost); - - /** Returns the boost factor for hits for this field. - * - *

The default value is 1.0. - * - *

Note: this value is not stored directly with the document in the index. - * Documents returned from {@link org.apache.lucene.index.IndexReader#document(int)} and - * {@link org.apache.lucene.search.Hits#doc(int)} may thus not have the same value present as when - * this field was indexed. - * - * @see #setBoost(float) - */ - float getBoost(); - - /** Returns the name of the field as an interned string. - * For example "date", "title", "body", ... - */ - String name(); - - /** The value of the field as a String, or null. If null, the Reader value, - * binary value, or TokenStream value is used. Exactly one of stringValue(), - * readerValue(), binaryValue(), and tokenStreamValue() must be set. */ - public String stringValue(); - - /** The value of the field as a Reader, or null. If null, the String value, - * binary value, or TokenStream value is used. Exactly one of stringValue(), - * readerValue(), binaryValue(), and tokenStreamValue() must be set. */ - public Reader readerValue(); - - /** The value of the field in Binary, or null. If null, the Reader value, - * String value, or TokenStream value is used. Exactly one of stringValue(), - * readerValue(), binaryValue(), and tokenStreamValue() must be set. */ - public byte[] binaryValue(); - - /** The value of the field as a TokenStream, or null. If null, the Reader value, - * String value, or binary value is used. Exactly one of stringValue(), - * readerValue(), binaryValue(), and tokenStreamValue() must be set. */ - public TokenStream tokenStreamValue(); - - /** True iff the value of the field is to be stored in the index for return - with search hits. It is an error for this to be true if a field is - Reader-valued. */ - boolean isStored(); - - /** True iff the value of the field is to be indexed, so that it may be - searched on. */ - boolean isIndexed(); - - /** True iff the value of the field should be tokenized as text prior to - indexing. Un-tokenized fields are indexed as a single word and may not be - Reader-valued. */ - boolean isTokenized(); - - /** True if the value of the field is stored and compressed within the index */ - boolean isCompressed(); - - /** True iff the term or terms used to index this field are stored as a term - * vector, available from {@link org.apache.lucene.index.IndexReader#getTermFreqVector(int,String)}. - * These methods do not provide access to the original content of the field, - * only to terms used to index it. If the original content must be - * preserved, use the stored attribute instead. - * - * @see org.apache.lucene.index.IndexReader#getTermFreqVector(int, String) - */ - boolean isTermVectorStored(); - - /** - * True iff terms are stored as term vector together with their offsets - * (start and end positon in source text). - */ - boolean isStoreOffsetWithTermVector(); - - /** - * True iff terms are stored as term vector together with their token positions. - */ - boolean isStorePositionWithTermVector(); - - /** True iff the value of the filed is stored as binary */ - boolean isBinary(); - - /** True if norms are omitted for this indexed field */ - boolean getOmitNorms(); - - /** Expert: - * - * If set, omit normalization factors associated with this indexed field. - * This effectively disables indexing boosts and length normalization for this field. - */ - void setOmitNorms(boolean omitNorms); - - /** - * Indicates whether a Field is Lazy or not. 
The semantics of Lazy loading are such that if a Field is lazily loaded, retrieving - * it's values via {@link #stringValue()} or {@link #binaryValue()} is only valid as long as the {@link org.apache.lucene.index.IndexReader} that - * retrieved the {@link Document} is still open. - * - * @return true if this field can be loaded lazily - */ - boolean isLazy(); -} +package org.apache.lucene.document; + +/** + * Copyright 2004 The Apache Software Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import java.io.Reader; +import java.io.Serializable; + +import org.apache.lucene.analysis.TokenStream; + +/** + * Synonymous with {@link Field}. + * + **/ +public interface Fieldable extends Serializable { + /** Sets the boost factor hits on this field. This value will be + * multiplied into the score of all hits on this this field of this + * document. + * + *

The boost is multiplied by {@link org.apache.lucene.document.Document#getBoost()} of the document + * containing this field. If a document has multiple fields with the same + * name, all such values are multiplied together. This product is then + * multipled by the value {@link org.apache.lucene.search.Similarity#lengthNorm(String,int)}, and + * rounded by {@link org.apache.lucene.search.Similarity#encodeNorm(float)} before it is stored in the + * index. One should attempt to ensure that this product does not overflow + * the range of that encoding. + * + * @see org.apache.lucene.document.Document#setBoost(float) + * @see org.apache.lucene.search.Similarity#lengthNorm(String, int) + * @see org.apache.lucene.search.Similarity#encodeNorm(float) + */ + void setBoost(float boost); + + /** Returns the boost factor for hits for this field. + * + *

The default value is 1.0. + * + *

Note: this value is not stored directly with the document in the index. + * Documents returned from {@link org.apache.lucene.index.IndexReader#document(int)} and + * {@link org.apache.lucene.search.Hits#doc(int)} may thus not have the same value present as when + * this field was indexed. + * + * @see #setBoost(float) + */ + float getBoost(); + + /** Returns the name of the field as an interned string. + * For example "date", "title", "body", ... + */ + String name(); + + /** The value of the field as a String, or null. If null, the Reader value, + * binary value, or TokenStream value is used. Exactly one of stringValue(), + * readerValue(), binaryValue(), and tokenStreamValue() must be set. */ + public String stringValue(); + + /** The value of the field as a Reader, or null. If null, the String value, + * binary value, or TokenStream value is used. Exactly one of stringValue(), + * readerValue(), binaryValue(), and tokenStreamValue() must be set. */ + public Reader readerValue(); + + /** The value of the field in Binary, or null. If null, the Reader value, + * String value, or TokenStream value is used. Exactly one of stringValue(), + * readerValue(), binaryValue(), and tokenStreamValue() must be set. */ + public byte[] binaryValue(); + + /** The value of the field as a TokenStream, or null. If null, the Reader value, + * String value, or binary value is used. Exactly one of stringValue(), + * readerValue(), binaryValue(), and tokenStreamValue() must be set. */ + public TokenStream tokenStreamValue(); + + /** True iff the value of the field is to be stored in the index for return + with search hits. It is an error for this to be true if a field is + Reader-valued. */ + boolean isStored(); + + /** True iff the value of the field is to be indexed, so that it may be + searched on. */ + boolean isIndexed(); + + /** True iff the value of the field should be tokenized as text prior to + indexing. Un-tokenized fields are indexed as a single word and may not be + Reader-valued. */ + boolean isTokenized(); + + /** True if the value of the field is stored and compressed within the index */ + boolean isCompressed(); + + /** True iff the term or terms used to index this field are stored as a term + * vector, available from {@link org.apache.lucene.index.IndexReader#getTermFreqVector(int,String)}. + * These methods do not provide access to the original content of the field, + * only to terms used to index it. If the original content must be + * preserved, use the stored attribute instead. + * + * @see org.apache.lucene.index.IndexReader#getTermFreqVector(int, String) + */ + boolean isTermVectorStored(); + + /** + * True iff terms are stored as term vector together with their offsets + * (start and end positon in source text). + */ + boolean isStoreOffsetWithTermVector(); + + /** + * True iff terms are stored as term vector together with their token positions. + */ + boolean isStorePositionWithTermVector(); + + /** True iff the value of the filed is stored as binary */ + boolean isBinary(); + + /** True if norms are omitted for this indexed field */ + boolean getOmitNorms(); + + /** Expert: + * + * If set, omit normalization factors associated with this indexed field. + * This effectively disables indexing boosts and length normalization for this field. + */ + void setOmitNorms(boolean omitNorms); + + /** + * Indicates whether a Field is Lazy or not. 
The semantics of Lazy loading are such that if a Field is lazily loaded, retrieving + * it's values via {@link #stringValue()} or {@link #binaryValue()} is only valid as long as the {@link org.apache.lucene.index.IndexReader} that + * retrieved the {@link Document} is still open. + * + * @return true if this field can be loaded lazily + */ + boolean isLazy(); +} diff --git a/src/java/org/apache/lucene/document/LoadFirstFieldSelector.java b/src/java/org/apache/lucene/document/LoadFirstFieldSelector.java index 4c67ab98c58..9928dd41a15 100755 --- a/src/java/org/apache/lucene/document/LoadFirstFieldSelector.java +++ b/src/java/org/apache/lucene/document/LoadFirstFieldSelector.java @@ -1,29 +1,29 @@ -package org.apache.lucene.document; -/** - * Copyright 2004 The Apache Software Foundation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - - -/** - * Load the First field and break. - *

- * See {@link FieldSelectorResult#LOAD_AND_BREAK} - */ -public class LoadFirstFieldSelector implements FieldSelector { - - public FieldSelectorResult accept(String fieldName) { - return FieldSelectorResult.LOAD_AND_BREAK; - } +package org.apache.lucene.document; +/** + * Copyright 2004 The Apache Software Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + + +/** + * Load the First field and break. + *

+ * See {@link FieldSelectorResult#LOAD_AND_BREAK} + */ +public class LoadFirstFieldSelector implements FieldSelector { + + public FieldSelectorResult accept(String fieldName) { + return FieldSelectorResult.LOAD_AND_BREAK; + } } \ No newline at end of file diff --git a/src/java/org/apache/lucene/document/SetBasedFieldSelector.java b/src/java/org/apache/lucene/document/SetBasedFieldSelector.java index 5ceb7967d42..f753a104a82 100755 --- a/src/java/org/apache/lucene/document/SetBasedFieldSelector.java +++ b/src/java/org/apache/lucene/document/SetBasedFieldSelector.java @@ -1,60 +1,60 @@ -package org.apache.lucene.document; - -import java.util.Set; -/** - * Copyright 2004 The Apache Software Foundation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -/** - * Declare what fields to load normally and what fields to load lazily - * - **/ -public class SetBasedFieldSelector implements FieldSelector { - - private Set fieldsToLoad; - private Set lazyFieldsToLoad; - - - - /** - * Pass in the Set of {@link Field} names to load and the Set of {@link Field} names to load lazily. If both are null, the - * Document will not have any {@link Field} on it. - * @param fieldsToLoad A Set of {@link String} field names to load. May be empty, but not null - * @param lazyFieldsToLoad A Set of {@link String} field names to load lazily. May be empty, but not null - */ - public SetBasedFieldSelector(Set fieldsToLoad, Set lazyFieldsToLoad) { - this.fieldsToLoad = fieldsToLoad; - this.lazyFieldsToLoad = lazyFieldsToLoad; - } - - /** - * Indicate whether to load the field with the given name or not. If the {@link Field#name()} is not in either of the - * initializing Sets, then {@link org.apache.lucene.document.FieldSelectorResult#NO_LOAD} is returned. If a Field name - * is in both fieldsToLoad and lazyFieldsToLoad, lazy has precedence. - * - * @param fieldName The {@link Field} name to check - * @return The {@link FieldSelectorResult} - */ - public FieldSelectorResult accept(String fieldName) { - FieldSelectorResult result = FieldSelectorResult.NO_LOAD; - if (fieldsToLoad.contains(fieldName) == true){ - result = FieldSelectorResult.LOAD; - } - if (lazyFieldsToLoad.contains(fieldName) == true){ - result = FieldSelectorResult.LAZY_LOAD; - } - return result; - } +package org.apache.lucene.document; + +import java.util.Set; +/** + * Copyright 2004 The Apache Software Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +/** + * Declare what fields to load normally and what fields to load lazily + * + **/ +public class SetBasedFieldSelector implements FieldSelector { + + private Set fieldsToLoad; + private Set lazyFieldsToLoad; + + + + /** + * Pass in the Set of {@link Field} names to load and the Set of {@link Field} names to load lazily. If both are null, the + * Document will not have any {@link Field} on it. + * @param fieldsToLoad A Set of {@link String} field names to load. May be empty, but not null + * @param lazyFieldsToLoad A Set of {@link String} field names to load lazily. May be empty, but not null + */ + public SetBasedFieldSelector(Set fieldsToLoad, Set lazyFieldsToLoad) { + this.fieldsToLoad = fieldsToLoad; + this.lazyFieldsToLoad = lazyFieldsToLoad; + } + + /** + * Indicate whether to load the field with the given name or not. If the {@link Field#name()} is not in either of the + * initializing Sets, then {@link org.apache.lucene.document.FieldSelectorResult#NO_LOAD} is returned. If a Field name + * is in both fieldsToLoad and lazyFieldsToLoad, lazy has precedence. + * + * @param fieldName The {@link Field} name to check + * @return The {@link FieldSelectorResult} + */ + public FieldSelectorResult accept(String fieldName) { + FieldSelectorResult result = FieldSelectorResult.NO_LOAD; + if (fieldsToLoad.contains(fieldName) == true){ + result = FieldSelectorResult.LOAD; + } + if (lazyFieldsToLoad.contains(fieldName) == true){ + result = FieldSelectorResult.LAZY_LOAD; + } + return result; + } } \ No newline at end of file diff --git a/src/java/org/apache/lucene/index/DefaultSkipListReader.java b/src/java/org/apache/lucene/index/DefaultSkipListReader.java index 9c2b92a26cf..e679ecf3b6a 100644 --- a/src/java/org/apache/lucene/index/DefaultSkipListReader.java +++ b/src/java/org/apache/lucene/index/DefaultSkipListReader.java @@ -1,114 +1,114 @@ -package org.apache.lucene.index; - -/** - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -import java.io.IOException; -import java.util.Arrays; - -import org.apache.lucene.store.IndexInput; - -/** - * Implements the skip list reader for the default posting list format - * that stores positions and payloads. 
- * - */ -class DefaultSkipListReader extends MultiLevelSkipListReader { - private boolean currentFieldStoresPayloads; - private long freqPointer[]; - private long proxPointer[]; - private int payloadLength[]; - - private long lastFreqPointer; - private long lastProxPointer; - private int lastPayloadLength; - - - DefaultSkipListReader(IndexInput skipStream, int maxSkipLevels, int skipInterval) { - super(skipStream, maxSkipLevels, skipInterval); - freqPointer = new long[maxSkipLevels]; - proxPointer = new long[maxSkipLevels]; - payloadLength = new int[maxSkipLevels]; - } - - void init(long skipPointer, long freqBasePointer, long proxBasePointer, int df, boolean storesPayloads) { - super.init(skipPointer, df); - this.currentFieldStoresPayloads = storesPayloads; - lastFreqPointer = freqBasePointer; - lastProxPointer = proxBasePointer; - - Arrays.fill(freqPointer, freqBasePointer); - Arrays.fill(proxPointer, proxBasePointer); - Arrays.fill(payloadLength, 0); - } - - /** Returns the freq pointer of the doc to which the last call of - * {@link MultiLevelSkipListReader#skipTo(int)} has skipped. */ - long getFreqPointer() { - return lastFreqPointer; - } - - /** Returns the prox pointer of the doc to which the last call of - * {@link MultiLevelSkipListReader#skipTo(int)} has skipped. */ - long getProxPointer() { - return lastProxPointer; - } - - /** Returns the payload length of the payload stored just before - * the doc to which the last call of {@link MultiLevelSkipListReader#skipTo(int)} - * has skipped. */ - int getPayloadLength() { - return lastPayloadLength; - } - - protected void seekChild(int level) throws IOException { - super.seekChild(level); - freqPointer[level] = lastFreqPointer; - proxPointer[level] = lastProxPointer; - payloadLength[level] = lastPayloadLength; - } - - protected void setLastSkipData(int level) { - super.setLastSkipData(level); - lastFreqPointer = freqPointer[level]; - lastProxPointer = proxPointer[level]; - lastPayloadLength = payloadLength[level]; - } - - - protected int readSkipData(int level, IndexInput skipStream) throws IOException { - int delta; - if (currentFieldStoresPayloads) { - // the current field stores payloads. - // if the doc delta is odd then we have - // to read the current payload length - // because it differs from the length of the - // previous payload - delta = skipStream.readVInt(); - if ((delta & 1) != 0) { - payloadLength[level] = skipStream.readVInt(); - } - delta >>>= 1; - } else { - delta = skipStream.readVInt(); - } - freqPointer[level] += skipStream.readVInt(); - proxPointer[level] += skipStream.readVInt(); - - return delta; - } -} +package org.apache.lucene.index; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +import java.io.IOException; +import java.util.Arrays; + +import org.apache.lucene.store.IndexInput; + +/** + * Implements the skip list reader for the default posting list format + * that stores positions and payloads. + * + */ +class DefaultSkipListReader extends MultiLevelSkipListReader { + private boolean currentFieldStoresPayloads; + private long freqPointer[]; + private long proxPointer[]; + private int payloadLength[]; + + private long lastFreqPointer; + private long lastProxPointer; + private int lastPayloadLength; + + + DefaultSkipListReader(IndexInput skipStream, int maxSkipLevels, int skipInterval) { + super(skipStream, maxSkipLevels, skipInterval); + freqPointer = new long[maxSkipLevels]; + proxPointer = new long[maxSkipLevels]; + payloadLength = new int[maxSkipLevels]; + } + + void init(long skipPointer, long freqBasePointer, long proxBasePointer, int df, boolean storesPayloads) { + super.init(skipPointer, df); + this.currentFieldStoresPayloads = storesPayloads; + lastFreqPointer = freqBasePointer; + lastProxPointer = proxBasePointer; + + Arrays.fill(freqPointer, freqBasePointer); + Arrays.fill(proxPointer, proxBasePointer); + Arrays.fill(payloadLength, 0); + } + + /** Returns the freq pointer of the doc to which the last call of + * {@link MultiLevelSkipListReader#skipTo(int)} has skipped. */ + long getFreqPointer() { + return lastFreqPointer; + } + + /** Returns the prox pointer of the doc to which the last call of + * {@link MultiLevelSkipListReader#skipTo(int)} has skipped. */ + long getProxPointer() { + return lastProxPointer; + } + + /** Returns the payload length of the payload stored just before + * the doc to which the last call of {@link MultiLevelSkipListReader#skipTo(int)} + * has skipped. */ + int getPayloadLength() { + return lastPayloadLength; + } + + protected void seekChild(int level) throws IOException { + super.seekChild(level); + freqPointer[level] = lastFreqPointer; + proxPointer[level] = lastProxPointer; + payloadLength[level] = lastPayloadLength; + } + + protected void setLastSkipData(int level) { + super.setLastSkipData(level); + lastFreqPointer = freqPointer[level]; + lastProxPointer = proxPointer[level]; + lastPayloadLength = payloadLength[level]; + } + + + protected int readSkipData(int level, IndexInput skipStream) throws IOException { + int delta; + if (currentFieldStoresPayloads) { + // the current field stores payloads. + // if the doc delta is odd then we have + // to read the current payload length + // because it differs from the length of the + // previous payload + delta = skipStream.readVInt(); + if ((delta & 1) != 0) { + payloadLength[level] = skipStream.readVInt(); + } + delta >>>= 1; + } else { + delta = skipStream.readVInt(); + } + freqPointer[level] += skipStream.readVInt(); + proxPointer[level] += skipStream.readVInt(); + + return delta; + } +} diff --git a/src/java/org/apache/lucene/index/DefaultSkipListWriter.java b/src/java/org/apache/lucene/index/DefaultSkipListWriter.java index 124dd94c20e..73799ad83a3 100644 --- a/src/java/org/apache/lucene/index/DefaultSkipListWriter.java +++ b/src/java/org/apache/lucene/index/DefaultSkipListWriter.java @@ -1,124 +1,124 @@ -package org.apache.lucene.index; - -/** - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. 
- * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -import java.io.IOException; -import java.util.Arrays; - -import org.apache.lucene.store.IndexOutput; - - -/** - * Implements the skip list writer for the default posting list format - * that stores positions and payloads. - * - */ -class DefaultSkipListWriter extends MultiLevelSkipListWriter { - private int[] lastSkipDoc; - private int[] lastSkipPayloadLength; - private long[] lastSkipFreqPointer; - private long[] lastSkipProxPointer; - - private IndexOutput freqOutput; - private IndexOutput proxOutput; - - private int curDoc; - private boolean curStorePayloads; - private int curPayloadLength; - private long curFreqPointer; - private long curProxPointer; - - DefaultSkipListWriter(int skipInterval, int numberOfSkipLevels, int docCount, IndexOutput freqOutput, IndexOutput proxOutput) { - super(skipInterval, numberOfSkipLevels, docCount); - this.freqOutput = freqOutput; - this.proxOutput = proxOutput; - - lastSkipDoc = new int[numberOfSkipLevels]; - lastSkipPayloadLength = new int[numberOfSkipLevels]; - lastSkipFreqPointer = new long[numberOfSkipLevels]; - lastSkipProxPointer = new long[numberOfSkipLevels]; - } - - /** - * Sets the values for the current skip data. - */ - void setSkipData(int doc, boolean storePayloads, int payloadLength) { - this.curDoc = doc; - this.curStorePayloads = storePayloads; - this.curPayloadLength = payloadLength; - this.curFreqPointer = freqOutput.getFilePointer(); - this.curProxPointer = proxOutput.getFilePointer(); - } - - protected void resetSkip() { - super.resetSkip(); - Arrays.fill(lastSkipDoc, 0); - Arrays.fill(lastSkipPayloadLength, -1); // we don't have to write the first length in the skip list - Arrays.fill(lastSkipFreqPointer, freqOutput.getFilePointer()); - Arrays.fill(lastSkipProxPointer, proxOutput.getFilePointer()); - } - - protected void writeSkipData(int level, IndexOutput skipBuffer) throws IOException { - // To efficiently store payloads in the posting lists we do not store the length of - // every payload. Instead we omit the length for a payload if the previous payload had - // the same length. - // However, in order to support skipping the payload length at every skip point must be known. - // So we use the same length encoding that we use for the posting lists for the skip data as well: - // Case 1: current field does not store payloads - // SkipDatum --> DocSkip, FreqSkip, ProxSkip - // DocSkip,FreqSkip,ProxSkip --> VInt - // DocSkip records the document number before every SkipInterval th document in TermFreqs. - // Document numbers are represented as differences from the previous value in the sequence. - // Case 2: current field stores payloads - // SkipDatum --> DocSkip, PayloadLength?, FreqSkip,ProxSkip - // DocSkip,FreqSkip,ProxSkip --> VInt - // PayloadLength --> VInt - // In this case DocSkip/2 is the difference between - // the current and the previous value. 
If DocSkip - // is odd, then a PayloadLength encoded as VInt follows, - // if DocSkip is even, then it is assumed that the - // current payload length equals the length at the previous - // skip point - if (curStorePayloads) { - int delta = curDoc - lastSkipDoc[level]; - if (curPayloadLength == lastSkipPayloadLength[level]) { - // the current payload length equals the length at the previous skip point, - // so we don't store the length again - skipBuffer.writeVInt(delta * 2); - } else { - // the payload length is different from the previous one. We shift the DocSkip, - // set the lowest bit and store the current payload length as VInt. - skipBuffer.writeVInt(delta * 2 + 1); - skipBuffer.writeVInt(curPayloadLength); - lastSkipPayloadLength[level] = curPayloadLength; - } - } else { - // current field does not store payloads - skipBuffer.writeVInt(curDoc - lastSkipDoc[level]); - } - skipBuffer.writeVInt((int) (curFreqPointer - lastSkipFreqPointer[level])); - skipBuffer.writeVInt((int) (curProxPointer - lastSkipProxPointer[level])); - - lastSkipDoc[level] = curDoc; - //System.out.println("write doc at level " + level + ": " + curDoc); - - lastSkipFreqPointer[level] = curFreqPointer; - lastSkipProxPointer[level] = curProxPointer; - } - -} +package org.apache.lucene.index; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import java.io.IOException; +import java.util.Arrays; + +import org.apache.lucene.store.IndexOutput; + + +/** + * Implements the skip list writer for the default posting list format + * that stores positions and payloads. + * + */ +class DefaultSkipListWriter extends MultiLevelSkipListWriter { + private int[] lastSkipDoc; + private int[] lastSkipPayloadLength; + private long[] lastSkipFreqPointer; + private long[] lastSkipProxPointer; + + private IndexOutput freqOutput; + private IndexOutput proxOutput; + + private int curDoc; + private boolean curStorePayloads; + private int curPayloadLength; + private long curFreqPointer; + private long curProxPointer; + + DefaultSkipListWriter(int skipInterval, int numberOfSkipLevels, int docCount, IndexOutput freqOutput, IndexOutput proxOutput) { + super(skipInterval, numberOfSkipLevels, docCount); + this.freqOutput = freqOutput; + this.proxOutput = proxOutput; + + lastSkipDoc = new int[numberOfSkipLevels]; + lastSkipPayloadLength = new int[numberOfSkipLevels]; + lastSkipFreqPointer = new long[numberOfSkipLevels]; + lastSkipProxPointer = new long[numberOfSkipLevels]; + } + + /** + * Sets the values for the current skip data. 
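The DocSkip encoding described in the writeSkipData comment above (the doc delta is shifted left by one, and the low bit flags that a new payload length follows) can be sketched with plain ints, independent of IndexInput/IndexOutput; the concrete numbers are made up for illustration.

public class SkipDeltaEncodingExample {
  public static void main(String[] args) {
    int docDelta = 7;
    int payloadLength = 12;      // illustrative values
    int lastPayloadLength = 9;   // length stored at the previous skip point

    // writer side: shift the delta, use the low bit to flag a changed length
    int encoded = (payloadLength == lastPayloadLength)
        ? docDelta * 2        // even: reuse the previous payload length
        : docDelta * 2 + 1;   // odd: a PayloadLength VInt would follow

    // reader side: the low bit says whether a length follows, then unshift
    boolean lengthFollows = (encoded & 1) != 0;
    int decodedDelta = encoded >>> 1;

    // prints: encoded=15 lengthFollows=true decodedDelta=7
    System.out.println("encoded=" + encoded
        + " lengthFollows=" + lengthFollows
        + " decodedDelta=" + decodedDelta);
  }
}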
+ */ + void setSkipData(int doc, boolean storePayloads, int payloadLength) { + this.curDoc = doc; + this.curStorePayloads = storePayloads; + this.curPayloadLength = payloadLength; + this.curFreqPointer = freqOutput.getFilePointer(); + this.curProxPointer = proxOutput.getFilePointer(); + } + + protected void resetSkip() { + super.resetSkip(); + Arrays.fill(lastSkipDoc, 0); + Arrays.fill(lastSkipPayloadLength, -1); // we don't have to write the first length in the skip list + Arrays.fill(lastSkipFreqPointer, freqOutput.getFilePointer()); + Arrays.fill(lastSkipProxPointer, proxOutput.getFilePointer()); + } + + protected void writeSkipData(int level, IndexOutput skipBuffer) throws IOException { + // To efficiently store payloads in the posting lists we do not store the length of + // every payload. Instead we omit the length for a payload if the previous payload had + // the same length. + // However, in order to support skipping the payload length at every skip point must be known. + // So we use the same length encoding that we use for the posting lists for the skip data as well: + // Case 1: current field does not store payloads + // SkipDatum --> DocSkip, FreqSkip, ProxSkip + // DocSkip,FreqSkip,ProxSkip --> VInt + // DocSkip records the document number before every SkipInterval th document in TermFreqs. + // Document numbers are represented as differences from the previous value in the sequence. + // Case 2: current field stores payloads + // SkipDatum --> DocSkip, PayloadLength?, FreqSkip,ProxSkip + // DocSkip,FreqSkip,ProxSkip --> VInt + // PayloadLength --> VInt + // In this case DocSkip/2 is the difference between + // the current and the previous value. If DocSkip + // is odd, then a PayloadLength encoded as VInt follows, + // if DocSkip is even, then it is assumed that the + // current payload length equals the length at the previous + // skip point + if (curStorePayloads) { + int delta = curDoc - lastSkipDoc[level]; + if (curPayloadLength == lastSkipPayloadLength[level]) { + // the current payload length equals the length at the previous skip point, + // so we don't store the length again + skipBuffer.writeVInt(delta * 2); + } else { + // the payload length is different from the previous one. We shift the DocSkip, + // set the lowest bit and store the current payload length as VInt. + skipBuffer.writeVInt(delta * 2 + 1); + skipBuffer.writeVInt(curPayloadLength); + lastSkipPayloadLength[level] = curPayloadLength; + } + } else { + // current field does not store payloads + skipBuffer.writeVInt(curDoc - lastSkipDoc[level]); + } + skipBuffer.writeVInt((int) (curFreqPointer - lastSkipFreqPointer[level])); + skipBuffer.writeVInt((int) (curProxPointer - lastSkipProxPointer[level])); + + lastSkipDoc[level] = curDoc; + //System.out.println("write doc at level " + level + ": " + curDoc); + + lastSkipFreqPointer[level] = curFreqPointer; + lastSkipProxPointer[level] = curProxPointer; + } + +} diff --git a/src/java/org/apache/lucene/index/FieldReaderException.java b/src/java/org/apache/lucene/index/FieldReaderException.java index 1243731aa69..07736e97562 100755 --- a/src/java/org/apache/lucene/index/FieldReaderException.java +++ b/src/java/org/apache/lucene/index/FieldReaderException.java @@ -1,79 +1,79 @@ -package org.apache.lucene.index; -/** - * Copyright 2004 The Apache Software Foundation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. 
- * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -/** - * - * - **/ -public class FieldReaderException extends RuntimeException{ - /** - * Constructs a new runtime exception with null as its - * detail message. The cause is not initialized, and may subsequently be - * initialized by a call to {@link #initCause}. - */ - public FieldReaderException() { - } - - /** - * Constructs a new runtime exception with the specified cause and a - * detail message of (cause==null ? null : cause.toString()) - * (which typically contains the class and detail message of - * cause). - *
- * This constructor is useful for runtime exceptions - * that are little more than wrappers for other throwables. - * - * @param cause the cause (which is saved for later retrieval by the - * {@link #getCause()} method). (A null value is - * permitted, and indicates that the cause is nonexistent or - * unknown.) - * @since 1.4 - */ - public FieldReaderException(Throwable cause) { - super(cause); - } - - /** - * Constructs a new runtime exception with the specified detail message. - * The cause is not initialized, and may subsequently be initialized by a - * call to {@link #initCause}. - * - * @param message the detail message. The detail message is saved for - * later retrieval by the {@link #getMessage()} method. - */ - public FieldReaderException(String message) { - super(message); - } - - /** - * Constructs a new runtime exception with the specified detail message and - * cause.
Note that the detail message associated with - * cause is not automatically incorporated in - * this runtime exception's detail message. - * - * @param message the detail message (which is saved for later retrieval - * by the {@link #getMessage()} method). - * @param cause the cause (which is saved for later retrieval by the - * {@link #getCause()} method). (A null value is - * permitted, and indicates that the cause is nonexistent or - * unknown.) - * @since 1.4 - */ - public FieldReaderException(String message, Throwable cause) { - super(message, cause); - } -} +package org.apache.lucene.index; +/** + * Copyright 2004 The Apache Software Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** + * + * + **/ +public class FieldReaderException extends RuntimeException{ + /** + * Constructs a new runtime exception with null as its + * detail message. The cause is not initialized, and may subsequently be + * initialized by a call to {@link #initCause}. + */ + public FieldReaderException() { + } + + /** + * Constructs a new runtime exception with the specified cause and a + * detail message of (cause==null ? null : cause.toString()) + * (which typically contains the class and detail message of + * cause). + *
+ * This constructor is useful for runtime exceptions + * that are little more than wrappers for other throwables. + * + * @param cause the cause (which is saved for later retrieval by the + * {@link #getCause()} method). (A null value is + * permitted, and indicates that the cause is nonexistent or + * unknown.) + * @since 1.4 + */ + public FieldReaderException(Throwable cause) { + super(cause); + } + + /** + * Constructs a new runtime exception with the specified detail message. + * The cause is not initialized, and may subsequently be initialized by a + * call to {@link #initCause}. + * + * @param message the detail message. The detail message is saved for + * later retrieval by the {@link #getMessage()} method. + */ + public FieldReaderException(String message) { + super(message); + } + + /** + * Constructs a new runtime exception with the specified detail message and + * cause.
Note that the detail message associated with + * cause is not automatically incorporated in + * this runtime exception's detail message. + * + * @param message the detail message (which is saved for later retrieval + * by the {@link #getMessage()} method). + * @param cause the cause (which is saved for later retrieval by the + * {@link #getCause()} method). (A null value is + * permitted, and indicates that the cause is nonexistent or + * unknown.) + * @since 1.4 + */ + public FieldReaderException(String message, Throwable cause) { + super(message, cause); + } +} diff --git a/src/java/org/apache/lucene/index/MultiLevelSkipListReader.java b/src/java/org/apache/lucene/index/MultiLevelSkipListReader.java index ede8e8f6231..156a67a076e 100644 --- a/src/java/org/apache/lucene/index/MultiLevelSkipListReader.java +++ b/src/java/org/apache/lucene/index/MultiLevelSkipListReader.java @@ -1,273 +1,273 @@ -package org.apache.lucene.index; - -/** - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -import java.io.IOException; -import java.util.Arrays; - -import org.apache.lucene.store.BufferedIndexInput; -import org.apache.lucene.store.IndexInput; - -/** - * This abstract class reads skip lists with multiple levels. - * - * See {@link MultiLevelSkipListWriter} for the information about the encoding - * of the multi level skip lists. - * - * Subclasses must implement the abstract method {@link #readSkipData(int, IndexInput)} - * which defines the actual format of the skip data. - */ -abstract class MultiLevelSkipListReader { - // the maximum number of skip levels possible for this index - private int maxNumberOfSkipLevels; - - // number of levels in this skip list - private int numberOfSkipLevels; - - // Expert: defines the number of top skip levels to buffer in memory. - // Reducing this number results in less memory usage, but possibly - // slower performance due to more random I/Os. - // Please notice that the space each level occupies is limited by - // the skipInterval. The top level can not contain more than - // skipLevel entries, the second top level can not contain more - // than skipLevel^2 entries and so forth. 
- private int numberOfLevelsToBuffer = 1; - - private int docCount; - private boolean haveSkipped; - - private IndexInput[] skipStream; // skipStream for each level - private long skipPointer[]; // the start pointer of each skip level - private int skipInterval[]; // skipInterval of each level - private int[] numSkipped; // number of docs skipped per level - - private int[] skipDoc; // doc id of current skip entry per level - private int lastDoc; // doc id of last read skip entry with docId <= target - private long[] childPointer; // child pointer of current skip entry per level - private long lastChildPointer; // childPointer of last read skip entry with docId <= target - - private boolean inputIsBuffered; - - public MultiLevelSkipListReader(IndexInput skipStream, int maxSkipLevels, int skipInterval) { - this.skipStream = new IndexInput[maxSkipLevels]; - this.skipPointer = new long[maxSkipLevels]; - this.childPointer = new long[maxSkipLevels]; - this.numSkipped = new int[maxSkipLevels]; - this.maxNumberOfSkipLevels = maxSkipLevels; - this.skipInterval = new int[maxSkipLevels]; - this.skipStream [0]= skipStream; - this.inputIsBuffered = (skipStream instanceof BufferedIndexInput); - this.skipInterval[0] = skipInterval; - for (int i = 1; i < maxSkipLevels; i++) { - // cache skip intervals - this.skipInterval[i] = this.skipInterval[i - 1] * skipInterval; - } - skipDoc = new int[maxSkipLevels]; - } - - - /** Returns the id of the doc to which the last call of {@link #skipTo(int)} - * has skipped. */ - int getDoc() { - return lastDoc; - } - - - /** Skips entries to the first beyond the current whose document number is - * greater than or equal to target. Returns the current doc count. - */ - int skipTo(int target) throws IOException { - if (!haveSkipped) { - // first time, load skip levels - loadSkipLevels(); - haveSkipped = true; - } - - // walk up the levels until highest level is found that has a skip - // for this target - int level = 0; - while (level < numberOfSkipLevels - 1 && target > skipDoc[level + 1]) { - level++; - } - - while (level >= 0) { - if (target > skipDoc[level]) { - if (!loadNextSkip(level)) { - continue; - } - } else { - // no more skips on this level, go down one level - if (level > 0 && lastChildPointer > skipStream[level - 1].getFilePointer()) { - seekChild(level - 1); - } - level--; - } - } - - return numSkipped[0] - skipInterval[0] - 1; - } - - private boolean loadNextSkip(int level) throws IOException { - // we have to skip, the target document is greater than the current - // skip list entry - setLastSkipData(level); - - numSkipped[level] += skipInterval[level]; - - if (numSkipped[level] > docCount) { - // this skip list is exhausted - skipDoc[level] = Integer.MAX_VALUE; - if (numberOfSkipLevels > level) numberOfSkipLevels = level; - return false; - } - - // read next skip entry - skipDoc[level] += readSkipData(level, skipStream[level]); - - if (level != 0) { - // read the child pointer if we are not on the leaf level - childPointer[level] = skipStream[level].readVLong() + skipPointer[level - 1]; - } - - return true; - - } - - /** Seeks the skip entry on the given level */ - protected void seekChild(int level) throws IOException { - skipStream[level].seek(lastChildPointer); - numSkipped[level] = numSkipped[level + 1] - skipInterval[level + 1]; - skipDoc[level] = lastDoc; - if (level > 0) { - childPointer[level] = skipStream[level].readVLong() + skipPointer[level - 1]; - } - } - - void close() throws IOException { - for (int i = 1; i < skipStream.length; i++) { - if 
(skipStream[i] != null) { - skipStream[i].close(); - } - } - } - - /** initializes the reader */ - void init(long skipPointer, int df) { - this.skipPointer[0] = skipPointer; - this.docCount = df; - Arrays.fill(skipDoc, 0); - Arrays.fill(numSkipped, 0); - Arrays.fill(childPointer, 0); - - haveSkipped = false; - for (int i = 1; i < numberOfSkipLevels; i++) { - skipStream[i] = null; - } - } - - /** Loads the skip levels */ - private void loadSkipLevels() throws IOException { - numberOfSkipLevels = docCount == 0 ? 0 : (int) Math.floor(Math.log(docCount) / Math.log(skipInterval[0])); - if (numberOfSkipLevels > maxNumberOfSkipLevels) { - numberOfSkipLevels = maxNumberOfSkipLevels; - } - - skipStream[0].seek(skipPointer[0]); - - int toBuffer = numberOfLevelsToBuffer; - - for (int i = numberOfSkipLevels - 1; i > 0; i--) { - // the length of the current level - long length = skipStream[0].readVLong(); - - // the start pointer of the current level - skipPointer[i] = skipStream[0].getFilePointer(); - if (toBuffer > 0) { - // buffer this level - skipStream[i] = new SkipBuffer(skipStream[0], (int) length); - toBuffer--; - } else { - // clone this stream, it is already at the start of the current level - skipStream[i] = (IndexInput) skipStream[0].clone(); - if (inputIsBuffered && length < BufferedIndexInput.BUFFER_SIZE) { - ((BufferedIndexInput) skipStream[i]).setBufferSize((int) length); - } - - // move base stream beyond the current level - skipStream[0].seek(skipStream[0].getFilePointer() + length); - } - } - - // use base stream for the lowest level - skipPointer[0] = skipStream[0].getFilePointer(); - } - - /** - * Subclasses must implement the actual skip data encoding in this method. - * - * @param level the level skip data shall be read from - * @param skipStream the skip stream to read from - */ - protected abstract int readSkipData(int level, IndexInput skipStream) throws IOException; - - /** Copies the values of the last read skip entry on this level */ - protected void setLastSkipData(int level) { - lastDoc = skipDoc[level]; - lastChildPointer = childPointer[level]; - } - - - /** used to buffer the top skip levels */ - private final static class SkipBuffer extends IndexInput { - private byte[] data; - private long pointer; - private int pos; - - SkipBuffer(IndexInput input, int length) throws IOException { - data = new byte[length]; - pointer = input.getFilePointer(); - input.readBytes(data, 0, length); - } - - public void close() throws IOException { - data = null; - } - - public long getFilePointer() { - return pointer + pos; - } - - public long length() { - return data.length; - } - - public byte readByte() throws IOException { - return data[pos++]; - } - - public void readBytes(byte[] b, int offset, int len) throws IOException { - System.arraycopy(data, pos, b, offset, len); - pos += len; - } - - public void seek(long pos) throws IOException { - this.pos = (int) (pos - pointer); - } - - } -} +package org.apache.lucene.index; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import java.io.IOException; +import java.util.Arrays; + +import org.apache.lucene.store.BufferedIndexInput; +import org.apache.lucene.store.IndexInput; + +/** + * This abstract class reads skip lists with multiple levels. + * + * See {@link MultiLevelSkipListWriter} for the information about the encoding + * of the multi level skip lists. + * + * Subclasses must implement the abstract method {@link #readSkipData(int, IndexInput)} + * which defines the actual format of the skip data. + */ +abstract class MultiLevelSkipListReader { + // the maximum number of skip levels possible for this index + private int maxNumberOfSkipLevels; + + // number of levels in this skip list + private int numberOfSkipLevels; + + // Expert: defines the number of top skip levels to buffer in memory. + // Reducing this number results in less memory usage, but possibly + // slower performance due to more random I/Os. + // Please notice that the space each level occupies is limited by + // the skipInterval. The top level can not contain more than + // skipLevel entries, the second top level can not contain more + // than skipLevel^2 entries and so forth. + private int numberOfLevelsToBuffer = 1; + + private int docCount; + private boolean haveSkipped; + + private IndexInput[] skipStream; // skipStream for each level + private long skipPointer[]; // the start pointer of each skip level + private int skipInterval[]; // skipInterval of each level + private int[] numSkipped; // number of docs skipped per level + + private int[] skipDoc; // doc id of current skip entry per level + private int lastDoc; // doc id of last read skip entry with docId <= target + private long[] childPointer; // child pointer of current skip entry per level + private long lastChildPointer; // childPointer of last read skip entry with docId <= target + + private boolean inputIsBuffered; + + public MultiLevelSkipListReader(IndexInput skipStream, int maxSkipLevels, int skipInterval) { + this.skipStream = new IndexInput[maxSkipLevels]; + this.skipPointer = new long[maxSkipLevels]; + this.childPointer = new long[maxSkipLevels]; + this.numSkipped = new int[maxSkipLevels]; + this.maxNumberOfSkipLevels = maxSkipLevels; + this.skipInterval = new int[maxSkipLevels]; + this.skipStream [0]= skipStream; + this.inputIsBuffered = (skipStream instanceof BufferedIndexInput); + this.skipInterval[0] = skipInterval; + for (int i = 1; i < maxSkipLevels; i++) { + // cache skip intervals + this.skipInterval[i] = this.skipInterval[i - 1] * skipInterval; + } + skipDoc = new int[maxSkipLevels]; + } + + + /** Returns the id of the doc to which the last call of {@link #skipTo(int)} + * has skipped. */ + int getDoc() { + return lastDoc; + } + + + /** Skips entries to the first beyond the current whose document number is + * greater than or equal to target. Returns the current doc count. 
+ */ + int skipTo(int target) throws IOException { + if (!haveSkipped) { + // first time, load skip levels + loadSkipLevels(); + haveSkipped = true; + } + + // walk up the levels until highest level is found that has a skip + // for this target + int level = 0; + while (level < numberOfSkipLevels - 1 && target > skipDoc[level + 1]) { + level++; + } + + while (level >= 0) { + if (target > skipDoc[level]) { + if (!loadNextSkip(level)) { + continue; + } + } else { + // no more skips on this level, go down one level + if (level > 0 && lastChildPointer > skipStream[level - 1].getFilePointer()) { + seekChild(level - 1); + } + level--; + } + } + + return numSkipped[0] - skipInterval[0] - 1; + } + + private boolean loadNextSkip(int level) throws IOException { + // we have to skip, the target document is greater than the current + // skip list entry + setLastSkipData(level); + + numSkipped[level] += skipInterval[level]; + + if (numSkipped[level] > docCount) { + // this skip list is exhausted + skipDoc[level] = Integer.MAX_VALUE; + if (numberOfSkipLevels > level) numberOfSkipLevels = level; + return false; + } + + // read next skip entry + skipDoc[level] += readSkipData(level, skipStream[level]); + + if (level != 0) { + // read the child pointer if we are not on the leaf level + childPointer[level] = skipStream[level].readVLong() + skipPointer[level - 1]; + } + + return true; + + } + + /** Seeks the skip entry on the given level */ + protected void seekChild(int level) throws IOException { + skipStream[level].seek(lastChildPointer); + numSkipped[level] = numSkipped[level + 1] - skipInterval[level + 1]; + skipDoc[level] = lastDoc; + if (level > 0) { + childPointer[level] = skipStream[level].readVLong() + skipPointer[level - 1]; + } + } + + void close() throws IOException { + for (int i = 1; i < skipStream.length; i++) { + if (skipStream[i] != null) { + skipStream[i].close(); + } + } + } + + /** initializes the reader */ + void init(long skipPointer, int df) { + this.skipPointer[0] = skipPointer; + this.docCount = df; + Arrays.fill(skipDoc, 0); + Arrays.fill(numSkipped, 0); + Arrays.fill(childPointer, 0); + + haveSkipped = false; + for (int i = 1; i < numberOfSkipLevels; i++) { + skipStream[i] = null; + } + } + + /** Loads the skip levels */ + private void loadSkipLevels() throws IOException { + numberOfSkipLevels = docCount == 0 ? 0 : (int) Math.floor(Math.log(docCount) / Math.log(skipInterval[0])); + if (numberOfSkipLevels > maxNumberOfSkipLevels) { + numberOfSkipLevels = maxNumberOfSkipLevels; + } + + skipStream[0].seek(skipPointer[0]); + + int toBuffer = numberOfLevelsToBuffer; + + for (int i = numberOfSkipLevels - 1; i > 0; i--) { + // the length of the current level + long length = skipStream[0].readVLong(); + + // the start pointer of the current level + skipPointer[i] = skipStream[0].getFilePointer(); + if (toBuffer > 0) { + // buffer this level + skipStream[i] = new SkipBuffer(skipStream[0], (int) length); + toBuffer--; + } else { + // clone this stream, it is already at the start of the current level + skipStream[i] = (IndexInput) skipStream[0].clone(); + if (inputIsBuffered && length < BufferedIndexInput.BUFFER_SIZE) { + ((BufferedIndexInput) skipStream[i]).setBufferSize((int) length); + } + + // move base stream beyond the current level + skipStream[0].seek(skipStream[0].getFilePointer() + length); + } + } + + // use base stream for the lowest level + skipPointer[0] = skipStream[0].getFilePointer(); + } + + /** + * Subclasses must implement the actual skip data encoding in this method. 
+ * + * @param level the level skip data shall be read from + * @param skipStream the skip stream to read from + */ + protected abstract int readSkipData(int level, IndexInput skipStream) throws IOException; + + /** Copies the values of the last read skip entry on this level */ + protected void setLastSkipData(int level) { + lastDoc = skipDoc[level]; + lastChildPointer = childPointer[level]; + } + + + /** used to buffer the top skip levels */ + private final static class SkipBuffer extends IndexInput { + private byte[] data; + private long pointer; + private int pos; + + SkipBuffer(IndexInput input, int length) throws IOException { + data = new byte[length]; + pointer = input.getFilePointer(); + input.readBytes(data, 0, length); + } + + public void close() throws IOException { + data = null; + } + + public long getFilePointer() { + return pointer + pos; + } + + public long length() { + return data.length; + } + + public byte readByte() throws IOException { + return data[pos++]; + } + + public void readBytes(byte[] b, int offset, int len) throws IOException { + System.arraycopy(data, pos, b, offset, len); + pos += len; + } + + public void seek(long pos) throws IOException { + this.pos = (int) (pos - pointer); + } + + } +} diff --git a/src/java/org/apache/lucene/index/MultiLevelSkipListWriter.java b/src/java/org/apache/lucene/index/MultiLevelSkipListWriter.java index 3aa9b5efe0f..bded80ebee6 100644 --- a/src/java/org/apache/lucene/index/MultiLevelSkipListWriter.java +++ b/src/java/org/apache/lucene/index/MultiLevelSkipListWriter.java @@ -1,151 +1,151 @@ -package org.apache.lucene.index; - -/** - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -import java.io.IOException; - -import org.apache.lucene.store.IndexOutput; -import org.apache.lucene.store.RAMOutputStream; - -/** - * This abstract class writes skip lists with multiple levels. - * - * Example for skipInterval = 3: - * c (skip level 2) - * c c c (skip level 1) - * x x x x x x x x x x (skip level 0) - * d d d d d d d d d d d d d d d d d d d d d d d d d d d d d d d d (posting list) - * 3 6 9 12 15 18 21 24 27 30 (df) - * - * d - document - * x - skip data - * c - skip data with child pointer - * - * Skip level i contains every skipInterval-th entry from skip level i-1. - * Therefore the number of entries on level i is: floor(df / ((skipInterval ^ (i + 1))). - * - * Each skip entry on a level i>0 contains a pointer to the corresponding skip entry in list i-1. - * This guarantess a logarithmic amount of skips to find the target document. - * - * While this class takes care of writing the different skip levels, - * subclasses must define the actual format of the skip data. 
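A short worked sketch of the level-count formula in the class description above: for a given document frequency and skip interval it prints how many skip levels are used and roughly how many entries land on each level. The df and skipInterval values are arbitrary examples.

public class SkipLevelMathExample {
  public static void main(String[] args) {
    int df = 35;            // example document frequency
    int skipInterval = 3;   // matches the example in the description above

    // floor(log(df) / log(skipInterval)), as in the writer's constructor
    int numberOfSkipLevels = df == 0
        ? 0
        : (int) Math.floor(Math.log(df) / Math.log(skipInterval));
    System.out.println("levels: " + numberOfSkipLevels);   // 3 for df=35

    // level i holds floor(df / skipInterval^(i+1)) entries
    for (int i = 0; i < numberOfSkipLevels; i++) {
      int entries = (int) (df / Math.pow(skipInterval, i + 1));
      System.out.println("level " + i + ": " + entries + " entries");   // 11, 3, 1
    }
  }
}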
- * - */ -abstract class MultiLevelSkipListWriter { - // number of levels in this skip list - private int numberOfSkipLevels; - - // the skip interval in the list with level = 0 - private int skipInterval; - - // for every skip level a different buffer is used - private RAMOutputStream[] skipBuffer; - - protected MultiLevelSkipListWriter(int skipInterval, int maxSkipLevels, int df) { - this.skipInterval = skipInterval; - - // calculate the maximum number of skip levels for this document frequency - numberOfSkipLevels = df == 0 ? 0 : (int) Math.floor(Math.log(df) / Math.log(skipInterval)); - - // make sure it does not exceed maxSkipLevels - if (numberOfSkipLevels > maxSkipLevels) { - numberOfSkipLevels = maxSkipLevels; - } - } - - protected void init() { - skipBuffer = new RAMOutputStream[numberOfSkipLevels]; - for (int i = 0; i < numberOfSkipLevels; i++) { - skipBuffer[i] = new RAMOutputStream(); - } - } - - protected void resetSkip() { - // creates new buffers or empties the existing ones - if (skipBuffer == null) { - init(); - } else { - for (int i = 0; i < skipBuffer.length; i++) { - skipBuffer[i].reset(); - } - } - } - - /** - * Subclasses must implement the actual skip data encoding in this method. - * - * @param level the level skip data shall be writting for - * @param skipBuffer the skip buffer to write to - */ - protected abstract void writeSkipData(int level, IndexOutput skipBuffer) throws IOException; - - /** - * Writes the current skip data to the buffers. The current document frequency determines - * the max level is skip data is to be written to. - * - * @param df the current document frequency - * @throws IOException - */ - void bufferSkip(int df) throws IOException { - int numLevels; - - // determine max level - for (numLevels = 0; (df % skipInterval) == 0 && numLevels < numberOfSkipLevels; df /= skipInterval) { - numLevels++; - } - - long childPointer = 0; - - for (int level = 0; level < numLevels; level++) { - writeSkipData(level, skipBuffer[level]); - - long newChildPointer = skipBuffer[level].getFilePointer(); - - if (level != 0) { - // store child pointers for all levels except the lowest - skipBuffer[level].writeVLong(childPointer); - } - - //remember the childPointer for the next level - childPointer = newChildPointer; - } - } - - /** - * Writes the buffered skip lists to the given output. - * - * @param output the IndexOutput the skip lists shall be written to - * @return the pointer the skip list starts - */ - long writeSkip(IndexOutput output) throws IOException { - long skipPointer = output.getFilePointer(); - if (skipBuffer == null || skipBuffer.length == 0) return skipPointer; - - for (int level = numberOfSkipLevels - 1; level > 0; level--) { - long length = skipBuffer[level].getFilePointer(); - if (length > 0) { - output.writeVLong(length); - skipBuffer[level].writeTo(output); - } - } - skipBuffer[0].writeTo(output); - - return skipPointer; - } - -} +package org.apache.lucene.index; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import java.io.IOException; + +import org.apache.lucene.store.IndexOutput; +import org.apache.lucene.store.RAMOutputStream; + +/** + * This abstract class writes skip lists with multiple levels. + * + * Example for skipInterval = 3: + * c (skip level 2) + * c c c (skip level 1) + * x x x x x x x x x x (skip level 0) + * d d d d d d d d d d d d d d d d d d d d d d d d d d d d d d d d (posting list) + * 3 6 9 12 15 18 21 24 27 30 (df) + * + * d - document + * x - skip data + * c - skip data with child pointer + * + * Skip level i contains every skipInterval-th entry from skip level i-1. + * Therefore the number of entries on level i is: floor(df / ((skipInterval ^ (i + 1))). + * + * Each skip entry on a level i>0 contains a pointer to the corresponding skip entry in list i-1. + * This guarantess a logarithmic amount of skips to find the target document. + * + * While this class takes care of writing the different skip levels, + * subclasses must define the actual format of the skip data. + * + */ +abstract class MultiLevelSkipListWriter { + // number of levels in this skip list + private int numberOfSkipLevels; + + // the skip interval in the list with level = 0 + private int skipInterval; + + // for every skip level a different buffer is used + private RAMOutputStream[] skipBuffer; + + protected MultiLevelSkipListWriter(int skipInterval, int maxSkipLevels, int df) { + this.skipInterval = skipInterval; + + // calculate the maximum number of skip levels for this document frequency + numberOfSkipLevels = df == 0 ? 0 : (int) Math.floor(Math.log(df) / Math.log(skipInterval)); + + // make sure it does not exceed maxSkipLevels + if (numberOfSkipLevels > maxSkipLevels) { + numberOfSkipLevels = maxSkipLevels; + } + } + + protected void init() { + skipBuffer = new RAMOutputStream[numberOfSkipLevels]; + for (int i = 0; i < numberOfSkipLevels; i++) { + skipBuffer[i] = new RAMOutputStream(); + } + } + + protected void resetSkip() { + // creates new buffers or empties the existing ones + if (skipBuffer == null) { + init(); + } else { + for (int i = 0; i < skipBuffer.length; i++) { + skipBuffer[i].reset(); + } + } + } + + /** + * Subclasses must implement the actual skip data encoding in this method. + * + * @param level the level skip data shall be writting for + * @param skipBuffer the skip buffer to write to + */ + protected abstract void writeSkipData(int level, IndexOutput skipBuffer) throws IOException; + + /** + * Writes the current skip data to the buffers. The current document frequency determines + * the max level is skip data is to be written to. 
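The loop in bufferSkip derives that maximum level from how many times the document count is evenly divisible by the skip interval; a self-contained sketch with made-up numbers:

public class BufferSkipLevelExample {
  public static void main(String[] args) {
    int skipInterval = 3;         // example value
    int numberOfSkipLevels = 4;   // example value

    // same loop shape as bufferSkip: count how often df divides by skipInterval
    int[] dfs = { 3, 9, 27, 12 };
    for (int n = 0; n < dfs.length; n++) {
      int numLevels = 0;
      for (int d = dfs[n]; (d % skipInterval) == 0 && numLevels < numberOfSkipLevels; d /= skipInterval) {
        numLevels++;
      }
      // df=3 -> 1 level, df=9 -> 2, df=27 -> 3, df=12 -> 1
      System.out.println("df=" + dfs[n] + " buffers skip data on " + numLevels + " level(s)");
    }
  }
}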
+ * + * @param df the current document frequency + * @throws IOException + */ + void bufferSkip(int df) throws IOException { + int numLevels; + + // determine max level + for (numLevels = 0; (df % skipInterval) == 0 && numLevels < numberOfSkipLevels; df /= skipInterval) { + numLevels++; + } + + long childPointer = 0; + + for (int level = 0; level < numLevels; level++) { + writeSkipData(level, skipBuffer[level]); + + long newChildPointer = skipBuffer[level].getFilePointer(); + + if (level != 0) { + // store child pointers for all levels except the lowest + skipBuffer[level].writeVLong(childPointer); + } + + //remember the childPointer for the next level + childPointer = newChildPointer; + } + } + + /** + * Writes the buffered skip lists to the given output. + * + * @param output the IndexOutput the skip lists shall be written to + * @return the pointer the skip list starts + */ + long writeSkip(IndexOutput output) throws IOException { + long skipPointer = output.getFilePointer(); + if (skipBuffer == null || skipBuffer.length == 0) return skipPointer; + + for (int level = numberOfSkipLevels - 1; level > 0; level--) { + long length = skipBuffer[level].getFilePointer(); + if (length > 0) { + output.writeVLong(length); + skipBuffer[level].writeTo(output); + } + } + skipBuffer[0].writeTo(output); + + return skipPointer; + } + +} diff --git a/src/java/org/apache/lucene/index/Payload.java b/src/java/org/apache/lucene/index/Payload.java index f68ae1fd60e..83e0f903842 100644 --- a/src/java/org/apache/lucene/index/Payload.java +++ b/src/java/org/apache/lucene/index/Payload.java @@ -1,163 +1,163 @@ -package org.apache.lucene.index; - -/** - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -import java.io.Serializable; - -import org.apache.lucene.analysis.Token; -import org.apache.lucene.analysis.TokenStream; - - /** - * A Payload is metadata that can be stored together with each occurrence - * of a term. This metadata is stored inline in the posting list of the - * specific term. - *
- * To store payloads in the index a {@link TokenStream} has to be used that - * produces {@link Token}s containing payload data. - *
- * Use {@link TermPositions#getPayloadLength()} and {@link TermPositions#getPayload(byte[], int)} - * to retrieve the payloads from the index.
- * - */ - public class Payload implements Serializable, Cloneable { - /** the byte array containing the payload data */ - protected byte[] data; - - /** the offset within the byte array */ - protected int offset; - - /** the length of the payload data */ - protected int length; - - /** Creates an empty payload and does not allocate a byte array. */ - public Payload() { - // nothing to do - } - - /** - * Creates a new payload with the the given array as data. - * A reference to the passed-in array is held, i. e. no - * copy is made. - * - * @param data the data of this payload - */ - public Payload(byte[] data) { - this(data, 0, data.length); - } - - /** - * Creates a new payload with the the given array as data. - * A reference to the passed-in array is held, i. e. no - * copy is made. - * - * @param data the data of this payload - * @param offset the offset in the data byte array - * @param length the length of the data - */ - public Payload(byte[] data, int offset, int length) { - if (offset < 0 || offset + length > data.length) { - throw new IllegalArgumentException(); - } - this.data = data; - this.offset = offset; - this.length = length; - } - - /** - * Sets this payloads data. - * A reference to the passed-in array is held, i. e. no - * copy is made. - */ - public void setData(byte[] data) { - setData(data, 0, data.length); - } - - /** - * Sets this payloads data. - * A reference to the passed-in array is held, i. e. no - * copy is made. - */ - public void setData(byte[] data, int offset, int length) { - this.data = data; - this.offset = offset; - this.length = length; - } - - /** - * Returns a reference to the underlying byte array - * that holds this payloads data. - */ - public byte[] getData() { - return this.data; - } - - /** - * Returns the offset in the underlying byte array - */ - public int getOffset() { - return this.offset; - } - - /** - * Returns the length of the payload data. - */ - public int length() { - return this.length; - } - - /** - * Returns the byte at the given index. - */ - public byte byteAt(int index) { - if (0 <= index && index < this.length) { - return this.data[this.offset + index]; - } - throw new ArrayIndexOutOfBoundsException(index); - } - - /** - * Allocates a new byte array, copies the payload data into it and returns it. - */ - public byte[] toByteArray() { - byte[] retArray = new byte[this.length]; - System.arraycopy(this.data, this.offset, retArray, 0, this.length); - return retArray; - } - - /** - * Copies the payload data to a byte array. - * - * @param target the target byte array - * @param targetOffset the offset in the target byte array - */ - public void copyTo(byte[] target, int targetOffset) { - if (this.length > target.length + targetOffset) { - throw new ArrayIndexOutOfBoundsException(); - } - System.arraycopy(this.data, this.offset, target, targetOffset, this.length); - } - - /** - * Clones this payload by creating a copy of the underlying - * byte array. - */ - public Object clone() { - Payload clone = new Payload(this.toByteArray()); - return clone; - } -} +package org.apache.lucene.index; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import java.io.Serializable; + +import org.apache.lucene.analysis.Token; +import org.apache.lucene.analysis.TokenStream; + + /** + * A Payload is metadata that can be stored together with each occurrence + * of a term. This metadata is stored inline in the posting list of the + * specific term. + *
+ * To store payloads in the index a {@link TokenStream} has to be used that + * produces {@link Token}s containing payload data. + *
+ * Use {@link TermPositions#getPayloadLength()} and {@link TermPositions#getPayload(byte[], int)} + * to retrieve the payloads from the index.
+ * + */ + public class Payload implements Serializable, Cloneable { + /** the byte array containing the payload data */ + protected byte[] data; + + /** the offset within the byte array */ + protected int offset; + + /** the length of the payload data */ + protected int length; + + /** Creates an empty payload and does not allocate a byte array. */ + public Payload() { + // nothing to do + } + + /** + * Creates a new payload with the the given array as data. + * A reference to the passed-in array is held, i. e. no + * copy is made. + * + * @param data the data of this payload + */ + public Payload(byte[] data) { + this(data, 0, data.length); + } + + /** + * Creates a new payload with the the given array as data. + * A reference to the passed-in array is held, i. e. no + * copy is made. + * + * @param data the data of this payload + * @param offset the offset in the data byte array + * @param length the length of the data + */ + public Payload(byte[] data, int offset, int length) { + if (offset < 0 || offset + length > data.length) { + throw new IllegalArgumentException(); + } + this.data = data; + this.offset = offset; + this.length = length; + } + + /** + * Sets this payloads data. + * A reference to the passed-in array is held, i. e. no + * copy is made. + */ + public void setData(byte[] data) { + setData(data, 0, data.length); + } + + /** + * Sets this payloads data. + * A reference to the passed-in array is held, i. e. no + * copy is made. + */ + public void setData(byte[] data, int offset, int length) { + this.data = data; + this.offset = offset; + this.length = length; + } + + /** + * Returns a reference to the underlying byte array + * that holds this payloads data. + */ + public byte[] getData() { + return this.data; + } + + /** + * Returns the offset in the underlying byte array + */ + public int getOffset() { + return this.offset; + } + + /** + * Returns the length of the payload data. + */ + public int length() { + return this.length; + } + + /** + * Returns the byte at the given index. + */ + public byte byteAt(int index) { + if (0 <= index && index < this.length) { + return this.data[this.offset + index]; + } + throw new ArrayIndexOutOfBoundsException(index); + } + + /** + * Allocates a new byte array, copies the payload data into it and returns it. + */ + public byte[] toByteArray() { + byte[] retArray = new byte[this.length]; + System.arraycopy(this.data, this.offset, retArray, 0, this.length); + return retArray; + } + + /** + * Copies the payload data to a byte array. + * + * @param target the target byte array + * @param targetOffset the offset in the target byte array + */ + public void copyTo(byte[] target, int targetOffset) { + if (this.length > target.length + targetOffset) { + throw new ArrayIndexOutOfBoundsException(); + } + System.arraycopy(this.data, this.offset, target, targetOffset, this.length); + } + + /** + * Clones this payload by creating a copy of the underlying + * byte array. 
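A small usage sketch of the Payload class above. The array-taking constructors and setData keep a reference to the caller's array (no copy), while toByteArray and clone copy the bytes; the byte values here are arbitrary.

import org.apache.lucene.index.Payload;

public class PayloadExample {
  public static void main(String[] args) {
    byte[] raw = new byte[] { 10, 20, 30, 40 };

    // a view onto raw[1..2]; no copy is made
    Payload payload = new Payload(raw, 1, 2);
    System.out.println(payload.length());    // 2
    System.out.println(payload.byteAt(0));   // 20

    // toByteArray() copies the two payload bytes into a fresh array
    byte[] copy = payload.toByteArray();
    raw[1] = 99;                             // mutating the shared array...
    System.out.println(payload.byteAt(0));   // ...is visible through the payload: 99
    System.out.println(copy[0]);             // the earlier copy still holds 20
  }
}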
+ */ + public Object clone() { + Payload clone = new Payload(this.toByteArray()); + return clone; + } +} diff --git a/src/java/org/apache/lucene/search/DisjunctionMaxQuery.java b/src/java/org/apache/lucene/search/DisjunctionMaxQuery.java index 1577ffce317..57392136ee9 100644 --- a/src/java/org/apache/lucene/search/DisjunctionMaxQuery.java +++ b/src/java/org/apache/lucene/search/DisjunctionMaxQuery.java @@ -1,257 +1,257 @@ -package org.apache.lucene.search; - -/** - * Copyright 2004 The Apache Software Foundation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -import org.apache.lucene.index.IndexReader; - -import java.io.IOException; -import java.util.ArrayList; -import java.util.Iterator; -import java.util.Collection; -import java.util.Set; - -/** - * A query that generates the union of documents produced by its subqueries, and that scores each document with the maximum - * score for that document as produced by any subquery, plus a tie breaking increment for any additional matching subqueries. - * This is useful when searching for a word in multiple fields with different boost factors (so that the fields cannot be - * combined equivalently into a single search field). We want the primary score to be the one associated with the highest boost, - * not the sum of the field scores (as BooleanQuery would give). - * If the query is "albino elephant" this ensures that "albino" matching one field and "elephant" matching - * another gets a higher score than "albino" matching both fields. - * To get this result, use both BooleanQuery and DisjunctionMaxQuery: for each term a DisjunctionMaxQuery searches for it in - * each field, while the set of these DisjunctionMaxQuery's is combined into a BooleanQuery. - * The tie breaker capability allows results that include the same term in multiple fields to be judged better than results that - * include this term in only the best of those multiple fields, without confusing this with the better case of two different terms - * in the multiple fields. - * @author Chuck Williams - */ -public class DisjunctionMaxQuery extends Query { - - /* The subqueries */ - private ArrayList disjuncts = new ArrayList(); - - /* Multiple of the non-max disjunct scores added into our final score. Non-zero values support tie-breaking. */ - private float tieBreakerMultiplier = 0.0f; - - /** Creates a new empty DisjunctionMaxQuery. Use add() to add the subqueries. - * @param tieBreakerMultiplier this score of each non-maximum disjunct for a document is multiplied by this weight - * and added into the final score. If non-zero, the value should be small, on the order of 0.1, which says that - * 10 occurrences of word in a lower-scored field that is also in a higher scored field is just as good as a unique - * word in the lower scored field (i.e., one that is not in any higher scored field. 
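To ground the "albino elephant" discussion above: a hedged sketch of the per-term DisjunctionMaxQuery-inside-BooleanQuery arrangement. The field names and the 0.1 tie breaker are illustrative only, and TermQuery, Term, BooleanQuery and BooleanClause are assumed from the same Lucene codebase rather than defined in this patch.

import org.apache.lucene.index.Term;
import org.apache.lucene.search.BooleanClause;
import org.apache.lucene.search.BooleanQuery;
import org.apache.lucene.search.DisjunctionMaxQuery;
import org.apache.lucene.search.TermQuery;

public class DisjunctionMaxExample {
  public static void main(String[] args) {
    String[] words = { "albino", "elephant" };
    BooleanQuery query = new BooleanQuery();

    for (int i = 0; i < words.length; i++) {
      // one DisjunctionMaxQuery per word, searching both fields: the best
      // field score wins, other matching fields add 0.1 of their score
      DisjunctionMaxQuery perWord = new DisjunctionMaxQuery(0.1f);
      perWord.add(new TermQuery(new Term("title", words[i])));   // hypothetical field
      perWord.add(new TermQuery(new Term("body", words[i])));    // hypothetical field
      query.add(perWord, BooleanClause.Occur.SHOULD);
    }

    System.out.println(query.toString("body"));
  }
}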
- */ - public DisjunctionMaxQuery(float tieBreakerMultiplier) { - this.tieBreakerMultiplier = tieBreakerMultiplier; - } - - /** - * Creates a new DisjunctionMaxQuery - * @param disjuncts a Collection of all the disjuncts to add - * @param tieBreakerMultiplier the weight to give to each matching non-maximum disjunct - */ - public DisjunctionMaxQuery(Collection disjuncts, float tieBreakerMultiplier) { - this.tieBreakerMultiplier = tieBreakerMultiplier; - add(disjuncts); - } - - /** Add a subquery to this disjunction - * @param query the disjunct added - */ - public void add(Query query) { - disjuncts.add(query); - } - - /** Add a collection of disjuncts to this disjunction - * via Iterable - */ - public void add(Collection disjuncts) { - this.disjuncts.addAll(disjuncts); - } - - /** An Iterator over the disjuncts */ - public Iterator iterator() { - return disjuncts.iterator(); - } - - /* The Weight for DisjunctionMaxQuery's, used to normalize, score and explain these queries */ - private class DisjunctionMaxWeight implements Weight { - - private Similarity similarity; // The similarity which we are associated. - private ArrayList weights = new ArrayList(); // The Weight's for our subqueries, in 1-1 correspondence with disjuncts - - /* Construct the Weight for this Query searched by searcher. Recursively construct subquery weights. */ - public DisjunctionMaxWeight(Searcher searcher) throws IOException { - this.similarity = searcher.getSimilarity(); - for (int i = 0; i < disjuncts.size(); i++) - weights.add(((Query) disjuncts.get(i)).createWeight(searcher)); - } - - /* Return our associated DisjunctionMaxQuery */ - public Query getQuery() { return DisjunctionMaxQuery.this; } - - /* Return our boost */ - public float getValue() { return getBoost(); } - - /* Compute the sub of squared weights of us applied to our subqueries. Used for normalization. */ - public float sumOfSquaredWeights() throws IOException { - float max = 0.0f, sum = 0.0f; - for (int i = 0; i < weights.size(); i++) { - float sub = ((Weight) weights.get(i)).sumOfSquaredWeights(); - sum += sub; - max = Math.max(max, sub); - } - return (((sum - max) * tieBreakerMultiplier * tieBreakerMultiplier) + max) * getBoost() * getBoost(); - } - - /* Apply the computed normalization factor to our subqueries */ - public void normalize(float norm) { - norm *= getBoost(); // Incorporate our boost - for (int i = 0 ; i < weights.size(); i++) - ((Weight) weights.get(i)).normalize(norm); - } - - /* Create the scorer used to score our associated DisjunctionMaxQuery */ - public Scorer scorer(IndexReader reader) throws IOException { - DisjunctionMaxScorer result = new DisjunctionMaxScorer(tieBreakerMultiplier, similarity); - for (int i = 0 ; i < weights.size(); i++) { - Weight w = (Weight) weights.get(i); - Scorer subScorer = w.scorer(reader); - if (subScorer == null) return null; - result.add(subScorer); - } - return result; - } - - /* Explain the score we computed for doc */ - public Explanation explain(IndexReader reader, int doc) throws IOException { - if ( disjuncts.size() == 1) return ((Weight) weights.get(0)).explain(reader,doc); - ComplexExplanation result = new ComplexExplanation(); - float max = 0.0f, sum = 0.0f; - result.setDescription(tieBreakerMultiplier == 0.0f ? 
"max of:" : "max plus " + tieBreakerMultiplier + " times others of:"); - for (int i = 0 ; i < weights.size(); i++) { - Explanation e = ((Weight) weights.get(i)).explain(reader, doc); - if (e.isMatch()) { - result.setMatch(Boolean.TRUE); - result.addDetail(e); - sum += e.getValue(); - max = Math.max(max, e.getValue()); - } - } - result.setValue(max + (sum - max)*tieBreakerMultiplier); - return result; - } - - } // end of DisjunctionMaxWeight inner class - - /* Create the Weight used to score us */ - protected Weight createWeight(Searcher searcher) throws IOException { - return new DisjunctionMaxWeight(searcher); - } - - /** Optimize our representation and our subqueries representations - * @param reader the IndexReader we query - * @return an optimized copy of us (which may not be a copy if there is nothing to optimize) */ - public Query rewrite(IndexReader reader) throws IOException { - if (disjuncts.size() == 1) { - Query singleton = (Query) disjuncts.get(0); - Query result = singleton.rewrite(reader); - if (getBoost() != 1.0f) { - if (result == singleton) result = (Query)result.clone(); - result.setBoost(getBoost() * result.getBoost()); - } - return result; - } - DisjunctionMaxQuery clone = null; - for (int i = 0 ; i < disjuncts.size(); i++) { - Query clause = (Query) disjuncts.get(i); - Query rewrite = clause.rewrite(reader); - if (rewrite != clause) { - if (clone == null) clone = (DisjunctionMaxQuery)this.clone(); - clone.disjuncts.set(i, rewrite); - } - } - if (clone != null) return clone; - else return this; - } - - /** Create a shallow copy of us -- used in rewriting if necessary - * @return a copy of us (but reuse, don't copy, our subqueries) */ - public Object clone() { - DisjunctionMaxQuery clone = (DisjunctionMaxQuery)super.clone(); - clone.disjuncts = (ArrayList)this.disjuncts.clone(); - return clone; - } - - - // inherit javadoc - public void extractTerms(Set terms) { - for (int i = 0; i < disjuncts.size(); i++) { - ((Query)disjuncts.get(i)).extractTerms(terms); - } - } - - - /** Prettyprint us. - * @param field the field to which we are applied - * @return a string that shows what we do, of the form "(disjunct1 | disjunct2 | ... | disjunctn)^boost" - */ - public String toString(String field) { - StringBuffer buffer = new StringBuffer(); - buffer.append("("); - for (int i = 0 ; i < disjuncts.size(); i++) { - Query subquery = (Query) disjuncts.get(i); - if (subquery instanceof BooleanQuery) { // wrap sub-bools in parens - buffer.append("("); - buffer.append(subquery.toString(field)); - buffer.append(")"); - } - else buffer.append(subquery.toString(field)); - if (i != disjuncts.size()-1) buffer.append(" | "); - } - buffer.append(")"); - if (tieBreakerMultiplier != 0.0f) { - buffer.append("~"); - buffer.append(tieBreakerMultiplier); - } - if (getBoost() != 1.0) { - buffer.append("^"); - buffer.append(getBoost()); - } - return buffer.toString(); - } - - /** Return true iff we represent the same query as o - * @param o another object - * @return true iff o is a DisjunctionMaxQuery with the same boost and the same subqueries, in the same order, as us - */ - public boolean equals(Object o) { - if (! 
(o instanceof DisjunctionMaxQuery) ) return false; - DisjunctionMaxQuery other = (DisjunctionMaxQuery)o; - return this.getBoost() == other.getBoost() - && this.tieBreakerMultiplier == other.tieBreakerMultiplier - && this.disjuncts.equals(other.disjuncts); - } - - /** Compute a hash code for hashing us - * @return the hash code - */ - public int hashCode() { - return Float.floatToIntBits(getBoost()) - + Float.floatToIntBits(tieBreakerMultiplier) - + disjuncts.hashCode(); - } - -} +package org.apache.lucene.search; + +/** + * Copyright 2004 The Apache Software Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import org.apache.lucene.index.IndexReader; + +import java.io.IOException; +import java.util.ArrayList; +import java.util.Iterator; +import java.util.Collection; +import java.util.Set; + +/** + * A query that generates the union of documents produced by its subqueries, and that scores each document with the maximum + * score for that document as produced by any subquery, plus a tie breaking increment for any additional matching subqueries. + * This is useful when searching for a word in multiple fields with different boost factors (so that the fields cannot be + * combined equivalently into a single search field). We want the primary score to be the one associated with the highest boost, + * not the sum of the field scores (as BooleanQuery would give). + * If the query is "albino elephant" this ensures that "albino" matching one field and "elephant" matching + * another gets a higher score than "albino" matching both fields. + * To get this result, use both BooleanQuery and DisjunctionMaxQuery: for each term a DisjunctionMaxQuery searches for it in + * each field, while the set of these DisjunctionMaxQuery's is combined into a BooleanQuery. + * The tie breaker capability allows results that include the same term in multiple fields to be judged better than results that + * include this term in only the best of those multiple fields, without confusing this with the better case of two different terms + * in the multiple fields. + * @author Chuck Williams + */ +public class DisjunctionMaxQuery extends Query { + + /* The subqueries */ + private ArrayList disjuncts = new ArrayList(); + + /* Multiple of the non-max disjunct scores added into our final score. Non-zero values support tie-breaking. */ + private float tieBreakerMultiplier = 0.0f; + + /** Creates a new empty DisjunctionMaxQuery. Use add() to add the subqueries. + * @param tieBreakerMultiplier this score of each non-maximum disjunct for a document is multiplied by this weight + * and added into the final score. If non-zero, the value should be small, on the order of 0.1, which says that + * 10 occurrences of word in a lower-scored field that is also in a higher scored field is just as good as a unique + * word in the lower scored field (i.e., one that is not in any higher scored field. 
+ */ + public DisjunctionMaxQuery(float tieBreakerMultiplier) { + this.tieBreakerMultiplier = tieBreakerMultiplier; + } + + /** + * Creates a new DisjunctionMaxQuery + * @param disjuncts a Collection of all the disjuncts to add + * @param tieBreakerMultiplier the weight to give to each matching non-maximum disjunct + */ + public DisjunctionMaxQuery(Collection disjuncts, float tieBreakerMultiplier) { + this.tieBreakerMultiplier = tieBreakerMultiplier; + add(disjuncts); + } + + /** Add a subquery to this disjunction + * @param query the disjunct added + */ + public void add(Query query) { + disjuncts.add(query); + } + + /** Add a collection of disjuncts to this disjunction + * via Iterable + */ + public void add(Collection disjuncts) { + this.disjuncts.addAll(disjuncts); + } + + /** An Iterator over the disjuncts */ + public Iterator iterator() { + return disjuncts.iterator(); + } + + /* The Weight for DisjunctionMaxQuery's, used to normalize, score and explain these queries */ + private class DisjunctionMaxWeight implements Weight { + + private Similarity similarity; // The similarity which we are associated. + private ArrayList weights = new ArrayList(); // The Weight's for our subqueries, in 1-1 correspondence with disjuncts + + /* Construct the Weight for this Query searched by searcher. Recursively construct subquery weights. */ + public DisjunctionMaxWeight(Searcher searcher) throws IOException { + this.similarity = searcher.getSimilarity(); + for (int i = 0; i < disjuncts.size(); i++) + weights.add(((Query) disjuncts.get(i)).createWeight(searcher)); + } + + /* Return our associated DisjunctionMaxQuery */ + public Query getQuery() { return DisjunctionMaxQuery.this; } + + /* Return our boost */ + public float getValue() { return getBoost(); } + + /* Compute the sub of squared weights of us applied to our subqueries. Used for normalization. */ + public float sumOfSquaredWeights() throws IOException { + float max = 0.0f, sum = 0.0f; + for (int i = 0; i < weights.size(); i++) { + float sub = ((Weight) weights.get(i)).sumOfSquaredWeights(); + sum += sub; + max = Math.max(max, sub); + } + return (((sum - max) * tieBreakerMultiplier * tieBreakerMultiplier) + max) * getBoost() * getBoost(); + } + + /* Apply the computed normalization factor to our subqueries */ + public void normalize(float norm) { + norm *= getBoost(); // Incorporate our boost + for (int i = 0 ; i < weights.size(); i++) + ((Weight) weights.get(i)).normalize(norm); + } + + /* Create the scorer used to score our associated DisjunctionMaxQuery */ + public Scorer scorer(IndexReader reader) throws IOException { + DisjunctionMaxScorer result = new DisjunctionMaxScorer(tieBreakerMultiplier, similarity); + for (int i = 0 ; i < weights.size(); i++) { + Weight w = (Weight) weights.get(i); + Scorer subScorer = w.scorer(reader); + if (subScorer == null) return null; + result.add(subScorer); + } + return result; + } + + /* Explain the score we computed for doc */ + public Explanation explain(IndexReader reader, int doc) throws IOException { + if ( disjuncts.size() == 1) return ((Weight) weights.get(0)).explain(reader,doc); + ComplexExplanation result = new ComplexExplanation(); + float max = 0.0f, sum = 0.0f; + result.setDescription(tieBreakerMultiplier == 0.0f ? 
"max of:" : "max plus " + tieBreakerMultiplier + " times others of:"); + for (int i = 0 ; i < weights.size(); i++) { + Explanation e = ((Weight) weights.get(i)).explain(reader, doc); + if (e.isMatch()) { + result.setMatch(Boolean.TRUE); + result.addDetail(e); + sum += e.getValue(); + max = Math.max(max, e.getValue()); + } + } + result.setValue(max + (sum - max)*tieBreakerMultiplier); + return result; + } + + } // end of DisjunctionMaxWeight inner class + + /* Create the Weight used to score us */ + protected Weight createWeight(Searcher searcher) throws IOException { + return new DisjunctionMaxWeight(searcher); + } + + /** Optimize our representation and our subqueries representations + * @param reader the IndexReader we query + * @return an optimized copy of us (which may not be a copy if there is nothing to optimize) */ + public Query rewrite(IndexReader reader) throws IOException { + if (disjuncts.size() == 1) { + Query singleton = (Query) disjuncts.get(0); + Query result = singleton.rewrite(reader); + if (getBoost() != 1.0f) { + if (result == singleton) result = (Query)result.clone(); + result.setBoost(getBoost() * result.getBoost()); + } + return result; + } + DisjunctionMaxQuery clone = null; + for (int i = 0 ; i < disjuncts.size(); i++) { + Query clause = (Query) disjuncts.get(i); + Query rewrite = clause.rewrite(reader); + if (rewrite != clause) { + if (clone == null) clone = (DisjunctionMaxQuery)this.clone(); + clone.disjuncts.set(i, rewrite); + } + } + if (clone != null) return clone; + else return this; + } + + /** Create a shallow copy of us -- used in rewriting if necessary + * @return a copy of us (but reuse, don't copy, our subqueries) */ + public Object clone() { + DisjunctionMaxQuery clone = (DisjunctionMaxQuery)super.clone(); + clone.disjuncts = (ArrayList)this.disjuncts.clone(); + return clone; + } + + + // inherit javadoc + public void extractTerms(Set terms) { + for (int i = 0; i < disjuncts.size(); i++) { + ((Query)disjuncts.get(i)).extractTerms(terms); + } + } + + + /** Prettyprint us. + * @param field the field to which we are applied + * @return a string that shows what we do, of the form "(disjunct1 | disjunct2 | ... | disjunctn)^boost" + */ + public String toString(String field) { + StringBuffer buffer = new StringBuffer(); + buffer.append("("); + for (int i = 0 ; i < disjuncts.size(); i++) { + Query subquery = (Query) disjuncts.get(i); + if (subquery instanceof BooleanQuery) { // wrap sub-bools in parens + buffer.append("("); + buffer.append(subquery.toString(field)); + buffer.append(")"); + } + else buffer.append(subquery.toString(field)); + if (i != disjuncts.size()-1) buffer.append(" | "); + } + buffer.append(")"); + if (tieBreakerMultiplier != 0.0f) { + buffer.append("~"); + buffer.append(tieBreakerMultiplier); + } + if (getBoost() != 1.0) { + buffer.append("^"); + buffer.append(getBoost()); + } + return buffer.toString(); + } + + /** Return true iff we represent the same query as o + * @param o another object + * @return true iff o is a DisjunctionMaxQuery with the same boost and the same subqueries, in the same order, as us + */ + public boolean equals(Object o) { + if (! 
(o instanceof DisjunctionMaxQuery) ) return false; + DisjunctionMaxQuery other = (DisjunctionMaxQuery)o; + return this.getBoost() == other.getBoost() + && this.tieBreakerMultiplier == other.tieBreakerMultiplier + && this.disjuncts.equals(other.disjuncts); + } + + /** Compute a hash code for hashing us + * @return the hash code + */ + public int hashCode() { + return Float.floatToIntBits(getBoost()) + + Float.floatToIntBits(tieBreakerMultiplier) + + disjuncts.hashCode(); + } + +} diff --git a/src/java/org/apache/lucene/search/DisjunctionMaxScorer.java b/src/java/org/apache/lucene/search/DisjunctionMaxScorer.java index bed2815603d..8569257a5af 100644 --- a/src/java/org/apache/lucene/search/DisjunctionMaxScorer.java +++ b/src/java/org/apache/lucene/search/DisjunctionMaxScorer.java @@ -1,195 +1,195 @@ -package org.apache.lucene.search; - -/** - * Copyright 2004 The Apache Software Foundation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -import java.io.IOException; -import java.util.ArrayList; - -/** - * The Scorer for DisjunctionMaxQuery's. The union of all documents generated by the the subquery scorers - * is generated in document number order. The score for each document is the maximum of the scores computed - * by the subquery scorers that generate that document, plus tieBreakerMultiplier times the sum of the scores - * for the other subqueries that generate the document. - * @author Chuck Williams - */ -class DisjunctionMaxScorer extends Scorer { - - /* The scorers for subqueries that have remaining docs, kept as a min heap by number of next doc. */ - private ArrayList subScorers = new ArrayList(); - - /* Multiplier applied to non-maximum-scoring subqueries for a document as they are summed into the result. */ - private float tieBreakerMultiplier; - - private boolean more = false; // True iff there is a next document - private boolean firstTime = true; // True iff next() has not yet been called - - /** Creates a new instance of DisjunctionMaxScorer - * @param tieBreakerMultiplier Multiplier applied to non-maximum-scoring subqueries for a document as they are summed into the result. - * @param similarity -- not used since our definition involves neither coord nor terms directly */ - public DisjunctionMaxScorer(float tieBreakerMultiplier, Similarity similarity) { - super(similarity); - this.tieBreakerMultiplier = tieBreakerMultiplier; - } - - /** Add the scorer for a subquery - * @param scorer the scorer of a subquery of our associated DisjunctionMaxQuery - */ - public void add(Scorer scorer) throws IOException { - if (scorer.next()) { // Initialize and retain only if it produces docs - subScorers.add(scorer); - more = true; - } - } - - /** Generate the next document matching our associated DisjunctionMaxQuery. 
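(Aside, not part of the patch: a sketch of the "albino elephant" pattern described in the DisjunctionMaxQuery javadoc above, using one DisjunctionMaxQuery per term across several fields, combined under a BooleanQuery. The field names and the 0.1 tie-breaker are illustrative assumptions.)

    import org.apache.lucene.index.Term;
    import org.apache.lucene.search.BooleanClause;
    import org.apache.lucene.search.BooleanQuery;
    import org.apache.lucene.search.DisjunctionMaxQuery;
    import org.apache.lucene.search.TermQuery;

    public class DisMaxSketch {
      // Build the query "albino elephant" against hypothetical "title" and "body" fields.
      public static BooleanQuery buildQuery() {
        String[] terms = { "albino", "elephant" };
        String[] fields = { "title", "body" };
        BooleanQuery bq = new BooleanQuery();
        for (int i = 0; i < terms.length; i++) {
          // Per term, take the maximum score over the fields, plus a small tie-breaker.
          DisjunctionMaxQuery dmq = new DisjunctionMaxQuery(0.1f);
          for (int j = 0; j < fields.length; j++) {
            dmq.add(new TermQuery(new Term(fields[j], terms[i])));
          }
          bq.add(dmq, BooleanClause.Occur.SHOULD);
        }
        return bq;
      }
    }

Per the explain() code above, each matching document is then scored as max + (sum - max) * tieBreakerMultiplier over the per-field scores, so the best field dominates and additional matching fields only nudge the score upward.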
- * @return true iff there is a next document - */ - public boolean next() throws IOException { - if (!more) return false; - if (firstTime) { - heapify(); - firstTime = false; - return true; // more would have been false if no subScorers had any docs - } - // Increment all generators that generated the last doc and adjust the heap. - int lastdoc = ((Scorer) subScorers.get(0)).doc(); - do { - if (((Scorer) subScorers.get(0)).next()) - heapAdjust(0); - else { - heapRemoveRoot(); - if (subScorers.isEmpty()) return (more = false); - } - } while ( ((Scorer) subScorers.get(0)).doc()==lastdoc ); - return true; - } - - /** Determine the current document number. Initially invalid, until {@link #next()} is called the first time. - * @return the document number of the currently generated document - */ - public int doc() { - return ((Scorer) subScorers.get(0)).doc(); - } - - /** Determine the current document score. Initially invalid, until {@link #next()} is called the first time. - * @return the score of the current generated document - */ - public float score() throws IOException { - int doc = ((Scorer) subScorers.get(0)).doc(); - float[] sum = {((Scorer) subScorers.get(0)).score()}, max = {sum[0]}; - int size = subScorers.size(); - scoreAll(1, size, doc, sum, max); - scoreAll(2, size, doc, sum, max); - return max[0] + (sum[0] - max[0])*tieBreakerMultiplier; - } - - // Recursively iterate all subScorers that generated last doc computing sum and max - private void scoreAll(int root, int size, int doc, float[] sum, float[] max) throws IOException { - if (root0 && ((Scorer)subScorers.get(0)).doc()>1)-1; i>=0; i--) - heapAdjust(i); - } - - /* The subtree of subScorers at root is a min heap except possibly for its root element. - * Bubble the root down as required to make the subtree a heap. - */ - private void heapAdjust(int root) { - Scorer scorer=(Scorer)subScorers.get(root); - int doc=scorer.doc(); - int i=root, size=subScorers.size(); - while (i<=(size>>1)-1) { - int lchild=(i<<1)+1; - Scorer lscorer=(Scorer)subScorers.get(lchild); - int ldoc=lscorer.doc(); - int rdoc=Integer.MAX_VALUE, rchild=(i<<1)+2; - Scorer rscorer=null; - if (rchild0 && ((Scorer)subScorers.get(0)).doc()>1)-1; i>=0; i--) + heapAdjust(i); + } + + /* The subtree of subScorers at root is a min heap except possibly for its root element. + * Bubble the root down as required to make the subtree a heap. + */ + private void heapAdjust(int root) { + Scorer scorer=(Scorer)subScorers.get(root); + int doc=scorer.doc(); + int i=root, size=subScorers.size(); + while (i<=(size>>1)-1) { + int lchild=(i<<1)+1; + Scorer lscorer=(Scorer)subScorers.get(lchild); + int ldoc=lscorer.doc(); + int rdoc=Integer.MAX_VALUE, rchild=(i<<1)+2; + Scorer rscorer=null; + if (rchildValues less than zero are all mapped to zero. - *
Values are truncated (rounded down) to the nearest 8 bit value. - *
Values between zero and the smallest representable value - * are rounded up. - * - * @param f the 32 bit float to be converted to an 8 bit float (byte) - * @param numMantissaBits the number of mantissa bits to use in the byte, with the remainder to be used in the exponent - * @param zeroExp the zero-point in the range of exponent values - * @return the 8 bit float representation - */ - public static byte floatToByte(float f, int numMantissaBits, int zeroExp) { - // Adjustment from a float zero exponent to our zero exponent, - // shifted over to our exponent position. - int fzero = (63-zeroExp)<> (24-numMantissaBits); - if (smallfloat < fzero) { - return (bits<=0) ? - (byte)0 // negative numbers and zero both map to 0 byte - :(byte)1; // underflow is mapped to smallest non-zero number. - } else if (smallfloat >= fzero + 0x100) { - return -1; // overflow maps to largest number - } else { - return (byte)(smallfloat - fzero); - } - } - - /** Converts an 8 bit float to a 32 bit float. */ - public static float byteToFloat(byte b, int numMantissaBits, int zeroExp) { - // on Java1.5 & 1.6 JVMs, prebuilding a decoding array and doing a lookup - // is only a little bit faster (anywhere from 0% to 7%) - if (b == 0) return 0.0f; - int bits = (b&0xff) << (24-numMantissaBits); - bits += (63-zeroExp) << 24; - return Float.intBitsToFloat(bits); - } - - - // - // Some specializations of the generic functions follow. - // The generic functions are just as fast with current (1.5) - // -server JVMs, but still slower with client JVMs. - // - - /** floatToByte(b, mantissaBits=3, zeroExponent=15) - *
smallest non-zero value = 5.820766E-10 - *
largest value = 7.5161928E9 - *
epsilon = 0.125 - */ - public static byte floatToByte315(float f) { - int bits = Float.floatToRawIntBits(f); - int smallfloat = bits >> (24-3); - if (smallfloat < (63-15)<<3) { - return (bits<=0) ? (byte)0 : (byte)1; - } - if (smallfloat >= ((63-15)<<3) + 0x100) { - return -1; - } - return (byte)(smallfloat - ((63-15)<<3)); - } - - /** byteToFloat(b, mantissaBits=3, zeroExponent=15) */ - public static float byte315ToFloat(byte b) { - // on Java1.5 & 1.6 JVMs, prebuilding a decoding array and doing a lookup - // is only a little bit faster (anywhere from 0% to 7%) - if (b == 0) return 0.0f; - int bits = (b&0xff) << (24-3); - bits += (63-15) << 24; - return Float.intBitsToFloat(bits); - } - - - /** floatToByte(b, mantissaBits=5, zeroExponent=2) - *
smallest nonzero value = 0.033203125 - *
largest value = 1984.0 - *
epsilon = 0.03125 - */ - public static byte floatToByte52(float f) { - int bits = Float.floatToRawIntBits(f); - int smallfloat = bits >> (24-5); - if (smallfloat < (63-2)<<5) { - return (bits<=0) ? (byte)0 : (byte)1; - } - if (smallfloat >= ((63-2)<<5) + 0x100) { - return -1; - } - return (byte)(smallfloat - ((63-2)<<5)); - } - - /** byteToFloat(b, mantissaBits=5, zeroExponent=2) */ - public static float byte52ToFloat(byte b) { - // on Java1.5 & 1.6 JVMs, prebuilding a decoding array and doing a lookup - // is only a little bit faster (anywhere from 0% to 7%) - if (b == 0) return 0.0f; - int bits = (b&0xff) << (24-5); - bits += (63-2) << 24; - return Float.intBitsToFloat(bits); - } -} +package org.apache.lucene.util; +/** + * Copyright 2005 The Apache Software Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + + +/** Floating point numbers smaller than 32 bits. + * + * @author yonik + * @version $Id$ + */ +public class SmallFloat { + + /** Converts a 32 bit float to an 8 bit float. + *
Values less than zero are all mapped to zero. + *
Values are truncated (rounded down) to the nearest 8 bit value. + *
Values between zero and the smallest representable value + * are rounded up. + * + * @param f the 32 bit float to be converted to an 8 bit float (byte) + * @param numMantissaBits the number of mantissa bits to use in the byte, with the remainder to be used in the exponent + * @param zeroExp the zero-point in the range of exponent values + * @return the 8 bit float representation + */ + public static byte floatToByte(float f, int numMantissaBits, int zeroExp) { + // Adjustment from a float zero exponent to our zero exponent, + // shifted over to our exponent position. + int fzero = (63-zeroExp)<> (24-numMantissaBits); + if (smallfloat < fzero) { + return (bits<=0) ? + (byte)0 // negative numbers and zero both map to 0 byte + :(byte)1; // underflow is mapped to smallest non-zero number. + } else if (smallfloat >= fzero + 0x100) { + return -1; // overflow maps to largest number + } else { + return (byte)(smallfloat - fzero); + } + } + + /** Converts an 8 bit float to a 32 bit float. */ + public static float byteToFloat(byte b, int numMantissaBits, int zeroExp) { + // on Java1.5 & 1.6 JVMs, prebuilding a decoding array and doing a lookup + // is only a little bit faster (anywhere from 0% to 7%) + if (b == 0) return 0.0f; + int bits = (b&0xff) << (24-numMantissaBits); + bits += (63-zeroExp) << 24; + return Float.intBitsToFloat(bits); + } + + + // + // Some specializations of the generic functions follow. + // The generic functions are just as fast with current (1.5) + // -server JVMs, but still slower with client JVMs. + // + + /** floatToByte(b, mantissaBits=3, zeroExponent=15) + *
smallest non-zero value = 5.820766E-10 + *
largest value = 7.5161928E9 + *
epsilon = 0.125 + */ + public static byte floatToByte315(float f) { + int bits = Float.floatToRawIntBits(f); + int smallfloat = bits >> (24-3); + if (smallfloat < (63-15)<<3) { + return (bits<=0) ? (byte)0 : (byte)1; + } + if (smallfloat >= ((63-15)<<3) + 0x100) { + return -1; + } + return (byte)(smallfloat - ((63-15)<<3)); + } + + /** byteToFloat(b, mantissaBits=3, zeroExponent=15) */ + public static float byte315ToFloat(byte b) { + // on Java1.5 & 1.6 JVMs, prebuilding a decoding array and doing a lookup + // is only a little bit faster (anywhere from 0% to 7%) + if (b == 0) return 0.0f; + int bits = (b&0xff) << (24-3); + bits += (63-15) << 24; + return Float.intBitsToFloat(bits); + } + + + /** floatToByte(b, mantissaBits=5, zeroExponent=2) + *
smallest nonzero value = 0.033203125 + *
largest value = 1984.0 + *
epsilon = 0.03125 + */ + public static byte floatToByte52(float f) { + int bits = Float.floatToRawIntBits(f); + int smallfloat = bits >> (24-5); + if (smallfloat < (63-2)<<5) { + return (bits<=0) ? (byte)0 : (byte)1; + } + if (smallfloat >= ((63-2)<<5) + 0x100) { + return -1; + } + return (byte)(smallfloat - ((63-2)<<5)); + } + + /** byteToFloat(b, mantissaBits=5, zeroExponent=2) */ + public static float byte52ToFloat(byte b) { + // on Java1.5 & 1.6 JVMs, prebuilding a decoding array and doing a lookup + // is only a little bit faster (anywhere from 0% to 7%) + if (b == 0) return 0.0f; + int bits = (b&0xff) << (24-5); + bits += (63-2) << 24; + return Float.intBitsToFloat(bits); + } +} diff --git a/src/test/org/apache/lucene/analysis/TestCachingTokenFilter.java b/src/test/org/apache/lucene/analysis/TestCachingTokenFilter.java index 60203e86fa1..485053e3e08 100644 --- a/src/test/org/apache/lucene/analysis/TestCachingTokenFilter.java +++ b/src/test/org/apache/lucene/analysis/TestCachingTokenFilter.java @@ -1,103 +1,103 @@ -package org.apache.lucene.analysis; - -/** - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
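(Aside, not part of the patch: a round-trip sketch for the SmallFloat helpers in the hunk above. The input value is arbitrary; the point is the lossy 8-bit encoding, with epsilon 0.125 for the 3-mantissa-bit variant per its javadoc.)

    import org.apache.lucene.util.SmallFloat;

    public class SmallFloatSketch {
      public static void main(String[] args) {
        float original = 0.7f;                               // arbitrary example value
        byte encoded = SmallFloat.floatToByte315(original);  // 8-bit form (3 mantissa bits, zeroExp=15)
        float decoded = SmallFloat.byte315ToFloat(encoded);  // lossy round-trip
        System.out.println(original + " -> " + encoded + " -> " + decoded);

        // Generic form with explicit mantissa bits and zero exponent.
        byte b = SmallFloat.floatToByte(original, 5, 2);
        System.out.println(SmallFloat.byteToFloat(b, 5, 2));
      }
    }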
- */ - - -import java.io.IOException; - -import org.apache.lucene.util.LuceneTestCase; - -import org.apache.lucene.document.Document; -import org.apache.lucene.document.Field; -import org.apache.lucene.document.Field.TermVector; -import org.apache.lucene.index.IndexReader; -import org.apache.lucene.index.IndexWriter; -import org.apache.lucene.index.Term; -import org.apache.lucene.index.TermPositions; -import org.apache.lucene.store.Directory; -import org.apache.lucene.store.RAMDirectory; - -public class TestCachingTokenFilter extends LuceneTestCase { - private String[] tokens = new String[] {"term1", "term2", "term3", "term2"}; - - public void testCaching() throws IOException { - Directory dir = new RAMDirectory(); - IndexWriter writer = new IndexWriter(dir, new SimpleAnalyzer(), IndexWriter.MaxFieldLength.LIMITED); - Document doc = new Document(); - TokenStream stream = new TokenStream() { - private int index = 0; - - public Token next() throws IOException { - if (index == tokens.length) { - return null; - } else { - return new Token(tokens[index++], 0, 0); - } - } - - }; - - stream = new CachingTokenFilter(stream); - - doc.add(new Field("preanalyzed", stream, TermVector.NO)); - - // 1) we consume all tokens twice before we add the doc to the index - checkTokens(stream); - stream.reset(); - checkTokens(stream); - - // 2) now add the document to the index and verify if all tokens are indexed - // don't reset the stream here, the DocumentWriter should do that implicitly - writer.addDocument(doc); - writer.close(); - - IndexReader reader = IndexReader.open(dir); - TermPositions termPositions = reader.termPositions(new Term("preanalyzed", "term1")); - assertTrue(termPositions.next()); - assertEquals(1, termPositions.freq()); - assertEquals(0, termPositions.nextPosition()); - - termPositions.seek(new Term("preanalyzed", "term2")); - assertTrue(termPositions.next()); - assertEquals(2, termPositions.freq()); - assertEquals(1, termPositions.nextPosition()); - assertEquals(3, termPositions.nextPosition()); - - termPositions.seek(new Term("preanalyzed", "term3")); - assertTrue(termPositions.next()); - assertEquals(1, termPositions.freq()); - assertEquals(2, termPositions.nextPosition()); - reader.close(); - - // 3) reset stream and consume tokens again - stream.reset(); - checkTokens(stream); - } - - private void checkTokens(TokenStream stream) throws IOException { - int count = 0; - Token token; - while ((token = stream.next()) != null) { - assertTrue(count < tokens.length); - assertEquals(tokens[count], token.termText()); - count++; - } - - assertEquals(tokens.length, count); - } -} +package org.apache.lucene.analysis; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + + +import java.io.IOException; + +import org.apache.lucene.util.LuceneTestCase; + +import org.apache.lucene.document.Document; +import org.apache.lucene.document.Field; +import org.apache.lucene.document.Field.TermVector; +import org.apache.lucene.index.IndexReader; +import org.apache.lucene.index.IndexWriter; +import org.apache.lucene.index.Term; +import org.apache.lucene.index.TermPositions; +import org.apache.lucene.store.Directory; +import org.apache.lucene.store.RAMDirectory; + +public class TestCachingTokenFilter extends LuceneTestCase { + private String[] tokens = new String[] {"term1", "term2", "term3", "term2"}; + + public void testCaching() throws IOException { + Directory dir = new RAMDirectory(); + IndexWriter writer = new IndexWriter(dir, new SimpleAnalyzer(), IndexWriter.MaxFieldLength.LIMITED); + Document doc = new Document(); + TokenStream stream = new TokenStream() { + private int index = 0; + + public Token next() throws IOException { + if (index == tokens.length) { + return null; + } else { + return new Token(tokens[index++], 0, 0); + } + } + + }; + + stream = new CachingTokenFilter(stream); + + doc.add(new Field("preanalyzed", stream, TermVector.NO)); + + // 1) we consume all tokens twice before we add the doc to the index + checkTokens(stream); + stream.reset(); + checkTokens(stream); + + // 2) now add the document to the index and verify if all tokens are indexed + // don't reset the stream here, the DocumentWriter should do that implicitly + writer.addDocument(doc); + writer.close(); + + IndexReader reader = IndexReader.open(dir); + TermPositions termPositions = reader.termPositions(new Term("preanalyzed", "term1")); + assertTrue(termPositions.next()); + assertEquals(1, termPositions.freq()); + assertEquals(0, termPositions.nextPosition()); + + termPositions.seek(new Term("preanalyzed", "term2")); + assertTrue(termPositions.next()); + assertEquals(2, termPositions.freq()); + assertEquals(1, termPositions.nextPosition()); + assertEquals(3, termPositions.nextPosition()); + + termPositions.seek(new Term("preanalyzed", "term3")); + assertTrue(termPositions.next()); + assertEquals(1, termPositions.freq()); + assertEquals(2, termPositions.nextPosition()); + reader.close(); + + // 3) reset stream and consume tokens again + stream.reset(); + checkTokens(stream); + } + + private void checkTokens(TokenStream stream) throws IOException { + int count = 0; + Token token; + while ((token = stream.next()) != null) { + assertTrue(count < tokens.length); + assertEquals(tokens[count], token.termText()); + count++; + } + + assertEquals(tokens.length, count); + } +} diff --git a/src/test/org/apache/lucene/analysis/TestStopFilter.java b/src/test/org/apache/lucene/analysis/TestStopFilter.java index f26824f44f2..9e9d300022c 100644 --- a/src/test/org/apache/lucene/analysis/TestStopFilter.java +++ b/src/test/org/apache/lucene/analysis/TestStopFilter.java @@ -1,128 +1,128 @@ -package org.apache.lucene.analysis; - -/** - * Copyright 2005 The Apache Software Foundation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
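(Aside, not part of the patch: the test above distilled into a usage sketch, wrapping an arbitrary TokenStream in CachingTokenFilter so it can be consumed twice. The tokenizer and input text are illustrative assumptions.)

    import java.io.IOException;
    import java.io.StringReader;
    import org.apache.lucene.analysis.CachingTokenFilter;
    import org.apache.lucene.analysis.Token;
    import org.apache.lucene.analysis.TokenStream;
    import org.apache.lucene.analysis.WhitespaceTokenizer;

    public class CachingSketch {
      public static void main(String[] args) throws IOException {
        TokenStream stream = new CachingTokenFilter(
            new WhitespaceTokenizer(new StringReader("term1 term2 term3")));
        for (Token t = stream.next(); t != null; t = stream.next()) {
          System.out.println(t.termText());  // first pass fills the cache
        }
        stream.reset();                      // reposition to the first cached token
        for (Token t = stream.next(); t != null; t = stream.next()) {
          System.out.println(t.termText());  // second pass replays from the cache
        }
      }
    }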
- * See the License for the specific language governing permissions and - * limitations under the License. - */ - -import org.apache.lucene.util.English; -import org.apache.lucene.util.LuceneTestCase; - -import java.io.IOException; -import java.io.StringReader; -import java.util.ArrayList; -import java.util.Set; - -/** - * @author yonik - */ -public class TestStopFilter extends LuceneTestCase { - - private final static boolean VERBOSE = false; - - // other StopFilter functionality is already tested by TestStopAnalyzer - - public void testExactCase() throws IOException { - StringReader reader = new StringReader("Now is The Time"); - String[] stopWords = new String[] { "is", "the", "Time" }; - TokenStream stream = new StopFilter(new WhitespaceTokenizer(reader), stopWords); - assertEquals("Now", stream.next().termText()); - assertEquals("The", stream.next().termText()); - assertEquals(null, stream.next()); - } - - public void testIgnoreCase() throws IOException { - StringReader reader = new StringReader("Now is The Time"); - String[] stopWords = new String[] { "is", "the", "Time" }; - TokenStream stream = new StopFilter(new WhitespaceTokenizer(reader), stopWords, true); - assertEquals("Now", stream.next().termText()); - assertEquals(null,stream.next()); - } - - public void testStopFilt() throws IOException { - StringReader reader = new StringReader("Now is The Time"); - String[] stopWords = new String[] { "is", "the", "Time" }; - Set stopSet = StopFilter.makeStopSet(stopWords); - TokenStream stream = new StopFilter(new WhitespaceTokenizer(reader), stopSet); - assertEquals("Now", stream.next().termText()); - assertEquals("The", stream.next().termText()); - assertEquals(null, stream.next()); - } - - /** - * Test Position increments applied by StopFilter with and without enabling this option. - */ - public void testStopPositons() throws IOException { - StringBuffer sb = new StringBuffer(); - ArrayList a = new ArrayList(); - for (int i=0; i<20; i++) { - String w = English.intToEnglish(i).trim(); - sb.append(w).append(" "); - if (i%3 != 0) a.add(w); - } - log(sb.toString()); - String stopWords[] = (String[]) a.toArray(new String[0]); - for (int i=0; i test with enable-increments-"+(enableIcrements?"enabled":"disabled")); - stpf.setEnablePositionIncrements(enableIcrements); - for (int i=0; i<20; i+=3) { - Token t = stpf.next(); - log("Token "+i+": "+t); - String w = English.intToEnglish(i).trim(); - assertEquals("expecting token "+i+" to be "+w,w,t.termText()); - assertEquals("all but first token must have position increment of 3",enableIcrements?(i==0?1:3):1,t.getPositionIncrement()); - } - assertNull(stpf.next()); - } - - // print debug info depending on VERBOSE - private static void log(String s) { - if (VERBOSE) { - System.out.println(s); - } - } -} +package org.apache.lucene.analysis; + +/** + * Copyright 2005 The Apache Software Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +import org.apache.lucene.util.English; +import org.apache.lucene.util.LuceneTestCase; + +import java.io.IOException; +import java.io.StringReader; +import java.util.ArrayList; +import java.util.Set; + +/** + * @author yonik + */ +public class TestStopFilter extends LuceneTestCase { + + private final static boolean VERBOSE = false; + + // other StopFilter functionality is already tested by TestStopAnalyzer + + public void testExactCase() throws IOException { + StringReader reader = new StringReader("Now is The Time"); + String[] stopWords = new String[] { "is", "the", "Time" }; + TokenStream stream = new StopFilter(new WhitespaceTokenizer(reader), stopWords); + assertEquals("Now", stream.next().termText()); + assertEquals("The", stream.next().termText()); + assertEquals(null, stream.next()); + } + + public void testIgnoreCase() throws IOException { + StringReader reader = new StringReader("Now is The Time"); + String[] stopWords = new String[] { "is", "the", "Time" }; + TokenStream stream = new StopFilter(new WhitespaceTokenizer(reader), stopWords, true); + assertEquals("Now", stream.next().termText()); + assertEquals(null,stream.next()); + } + + public void testStopFilt() throws IOException { + StringReader reader = new StringReader("Now is The Time"); + String[] stopWords = new String[] { "is", "the", "Time" }; + Set stopSet = StopFilter.makeStopSet(stopWords); + TokenStream stream = new StopFilter(new WhitespaceTokenizer(reader), stopSet); + assertEquals("Now", stream.next().termText()); + assertEquals("The", stream.next().termText()); + assertEquals(null, stream.next()); + } + + /** + * Test Position increments applied by StopFilter with and without enabling this option. + */ + public void testStopPositons() throws IOException { + StringBuffer sb = new StringBuffer(); + ArrayList a = new ArrayList(); + for (int i=0; i<20; i++) { + String w = English.intToEnglish(i).trim(); + sb.append(w).append(" "); + if (i%3 != 0) a.add(w); + } + log(sb.toString()); + String stopWords[] = (String[]) a.toArray(new String[0]); + for (int i=0; i test with enable-increments-"+(enableIcrements?"enabled":"disabled")); + stpf.setEnablePositionIncrements(enableIcrements); + for (int i=0; i<20; i+=3) { + Token t = stpf.next(); + log("Token "+i+": "+t); + String w = English.intToEnglish(i).trim(); + assertEquals("expecting token "+i+" to be "+w,w,t.termText()); + assertEquals("all but first token must have position increment of 3",enableIcrements?(i==0?1:3):1,t.getPositionIncrement()); + } + assertNull(stpf.next()); + } + + // print debug info depending on VERBOSE + private static void log(String s) { + if (VERBOSE) { + System.out.println(s); + } + } +} diff --git a/src/test/org/apache/lucene/index/TestIndexWriterMerging.java b/src/test/org/apache/lucene/index/TestIndexWriterMerging.java index 5cf54020c78..02035cbeb79 100644 --- a/src/test/org/apache/lucene/index/TestIndexWriterMerging.java +++ b/src/test/org/apache/lucene/index/TestIndexWriterMerging.java @@ -1,108 +1,108 @@ -package org.apache.lucene.index; -/** - * Copyright 2006 The Apache Software Foundation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. 
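(Aside, not part of the patch: a sketch of the behavior exercised by testStopPositons above. With position increments enabled, the gaps left by removed stop words are carried on the surviving tokens. Sample text and stop set are illustrative.)

    import java.io.IOException;
    import java.io.StringReader;
    import org.apache.lucene.analysis.StopFilter;
    import org.apache.lucene.analysis.Token;
    import org.apache.lucene.analysis.WhitespaceTokenizer;

    public class StopFilterSketch {
      public static void main(String[] args) throws IOException {
        String[] stopWords = { "is", "the" };      // illustrative stop set
        StopFilter stream = new StopFilter(
            new WhitespaceTokenizer(new StringReader("now is the time")),
            StopFilter.makeStopSet(stopWords));
        stream.setEnablePositionIncrements(true);  // keep the gaps left by removed stop words
        for (Token t = stream.next(); t != null; t = stream.next()) {
          // expected (assumed): "now" with increment 1, then "time" with increment 3
          System.out.println(t.termText() + " +" + t.getPositionIncrement());
        }
      }
    }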
- * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -import org.apache.lucene.store.Directory; -import org.apache.lucene.store.MockRAMDirectory; -import org.apache.lucene.analysis.standard.StandardAnalyzer; -import org.apache.lucene.document.Document; -import org.apache.lucene.document.Field; -import org.apache.lucene.util.LuceneTestCase; - -import java.io.IOException; - - -public class TestIndexWriterMerging extends LuceneTestCase -{ - - /** - * Tests that index merging (specifically addIndexes()) doesn't - * change the index order of documents. - */ - public void testLucene() throws IOException - { - - int num=100; - - Directory indexA = new MockRAMDirectory(); - Directory indexB = new MockRAMDirectory(); - - fillIndex(indexA, 0, num); - boolean fail = verifyIndex(indexA, 0); - if (fail) - { - fail("Index a is invalid"); - } - - fillIndex(indexB, num, num); - fail = verifyIndex(indexB, num); - if (fail) - { - fail("Index b is invalid"); - } - - Directory merged = new MockRAMDirectory(); - - IndexWriter writer = new IndexWriter(merged, new StandardAnalyzer(), true, IndexWriter.MaxFieldLength.LIMITED); - writer.setMergeFactor(2); - - writer.addIndexes(new Directory[]{indexA, indexB}); - writer.close(); - - fail = verifyIndex(merged, 0); - merged.close(); - - assertFalse("The merged index is invalid", fail); - } - - private boolean verifyIndex(Directory directory, int startAt) throws IOException - { - boolean fail = false; - IndexReader reader = IndexReader.open(directory); - - int max = reader.maxDoc(); - for (int i = 0; i < max; i++) - { - Document temp = reader.document(i); - //System.out.println("doc "+i+"="+temp.getField("count").stringValue()); - //compare the index doc number to the value that it should be - if (!temp.getField("count").stringValue().equals((i + startAt) + "")) - { - fail = true; - System.out.println("Document " + (i + startAt) + " is returning document " + temp.getField("count").stringValue()); - } - } - reader.close(); - return fail; - } - - private void fillIndex(Directory dir, int start, int numDocs) throws IOException - { - - IndexWriter writer = new IndexWriter(dir, new StandardAnalyzer(), true, IndexWriter.MaxFieldLength.LIMITED); - writer.setMergeFactor(2); - writer.setMaxBufferedDocs(2); - - for (int i = start; i < (start + numDocs); i++) - { - Document temp = new Document(); - temp.add(new Field("count", (""+i), Field.Store.YES, Field.Index.UN_TOKENIZED)); - - writer.addDocument(temp); - } - writer.close(); - } -} +package org.apache.lucene.index; +/** + * Copyright 2006 The Apache Software Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +import org.apache.lucene.store.Directory; +import org.apache.lucene.store.MockRAMDirectory; +import org.apache.lucene.analysis.standard.StandardAnalyzer; +import org.apache.lucene.document.Document; +import org.apache.lucene.document.Field; +import org.apache.lucene.util.LuceneTestCase; + +import java.io.IOException; + + +public class TestIndexWriterMerging extends LuceneTestCase +{ + + /** + * Tests that index merging (specifically addIndexes()) doesn't + * change the index order of documents. + */ + public void testLucene() throws IOException + { + + int num=100; + + Directory indexA = new MockRAMDirectory(); + Directory indexB = new MockRAMDirectory(); + + fillIndex(indexA, 0, num); + boolean fail = verifyIndex(indexA, 0); + if (fail) + { + fail("Index a is invalid"); + } + + fillIndex(indexB, num, num); + fail = verifyIndex(indexB, num); + if (fail) + { + fail("Index b is invalid"); + } + + Directory merged = new MockRAMDirectory(); + + IndexWriter writer = new IndexWriter(merged, new StandardAnalyzer(), true, IndexWriter.MaxFieldLength.LIMITED); + writer.setMergeFactor(2); + + writer.addIndexes(new Directory[]{indexA, indexB}); + writer.close(); + + fail = verifyIndex(merged, 0); + merged.close(); + + assertFalse("The merged index is invalid", fail); + } + + private boolean verifyIndex(Directory directory, int startAt) throws IOException + { + boolean fail = false; + IndexReader reader = IndexReader.open(directory); + + int max = reader.maxDoc(); + for (int i = 0; i < max; i++) + { + Document temp = reader.document(i); + //System.out.println("doc "+i+"="+temp.getField("count").stringValue()); + //compare the index doc number to the value that it should be + if (!temp.getField("count").stringValue().equals((i + startAt) + "")) + { + fail = true; + System.out.println("Document " + (i + startAt) + " is returning document " + temp.getField("count").stringValue()); + } + } + reader.close(); + return fail; + } + + private void fillIndex(Directory dir, int start, int numDocs) throws IOException + { + + IndexWriter writer = new IndexWriter(dir, new StandardAnalyzer(), true, IndexWriter.MaxFieldLength.LIMITED); + writer.setMergeFactor(2); + writer.setMaxBufferedDocs(2); + + for (int i = start; i < (start + numDocs); i++) + { + Document temp = new Document(); + temp.add(new Field("count", (""+i), Field.Store.YES, Field.Index.UN_TOKENIZED)); + + writer.addDocument(temp); + } + writer.close(); + } +} diff --git a/src/test/org/apache/lucene/index/TestMultiLevelSkipList.java b/src/test/org/apache/lucene/index/TestMultiLevelSkipList.java index aa1f5218f31..505c7635c29 100644 --- a/src/test/org/apache/lucene/index/TestMultiLevelSkipList.java +++ b/src/test/org/apache/lucene/index/TestMultiLevelSkipList.java @@ -1,158 +1,158 @@ -package org.apache.lucene.index; - -/** - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
- * See the License for the specific language governing permissions and - * limitations under the License. - */ - -import java.io.IOException; -import java.io.Reader; - -import org.apache.lucene.util.LuceneTestCase; - -import org.apache.lucene.analysis.Analyzer; -import org.apache.lucene.analysis.LowerCaseTokenizer; -import org.apache.lucene.analysis.Token; -import org.apache.lucene.analysis.TokenFilter; -import org.apache.lucene.analysis.TokenStream; -import org.apache.lucene.document.Document; -import org.apache.lucene.document.Field; -import org.apache.lucene.document.Field.Index; -import org.apache.lucene.document.Field.Store; -import org.apache.lucene.store.IndexInput; -import org.apache.lucene.store.RAMDirectory; - -/** - * This testcase tests whether multi-level skipping is being used - * to reduce I/O while skipping through posting lists. - * - * Skipping in general is already covered by several other - * testcases. - * - */ -public class TestMultiLevelSkipList extends LuceneTestCase { - public void testSimpleSkip() throws IOException { - RAMDirectory dir = new RAMDirectory(); - IndexWriter writer = new IndexWriter(dir, new PayloadAnalyzer(), true, - IndexWriter.MaxFieldLength.LIMITED); - Term term = new Term("test", "a"); - for (int i = 0; i < 5000; i++) { - Document d1 = new Document(); - d1.add(new Field(term.field(), term.text(), Store.NO, Index.TOKENIZED)); - writer.addDocument(d1); - } - writer.flush(); - writer.optimize(); - writer.close(); - - IndexReader reader = IndexReader.open(dir); - SegmentTermPositions tp = (SegmentTermPositions) reader.termPositions(); - tp.freqStream = new CountingStream(tp.freqStream); - - for (int i = 0; i < 2; i++) { - counter = 0; - tp.seek(term); - - checkSkipTo(tp, 14, 185); // no skips - checkSkipTo(tp, 17, 190); // one skip on level 0 - checkSkipTo(tp, 287, 200); // one skip on level 1, two on level 0 - - // this test would fail if we had only one skip level, - // because than more bytes would be read from the freqStream - checkSkipTo(tp, 4800, 250);// one skip on level 2 - } - } - - public void checkSkipTo(TermPositions tp, int target, int maxCounter) throws IOException { - tp.skipTo(target); - if (maxCounter < counter) { - fail("Too many bytes read: " + counter); - } - - assertEquals("Wrong document " + tp.doc() + " after skipTo target " + target, target, tp.doc()); - assertEquals("Frequency is not 1: " + tp.freq(), 1,tp.freq()); - tp.nextPosition(); - byte[] b = new byte[1]; - tp.getPayload(b, 0); - assertEquals("Wrong payload for the target " + target + ": " + b[0], (byte) target, b[0]); - } - - private static class PayloadAnalyzer extends Analyzer { - public TokenStream tokenStream(String fieldName, Reader reader) { - return new PayloadFilter(new LowerCaseTokenizer(reader)); - } - - } - - private static class PayloadFilter extends TokenFilter { - static int count = 0; - - protected PayloadFilter(TokenStream input) { - super(input); - } - - public Token next() throws IOException { - Token t = input.next(); - if (t != null) { - t.setPayload(new Payload(new byte[] { (byte) count++ })); - } - return t; - } - - } - - private int counter = 0; - - // Simply extends IndexInput in a way that we are able to count the number - // of bytes read - class CountingStream extends IndexInput { - private IndexInput input; - - CountingStream(IndexInput input) { - this.input = input; - } - - public byte readByte() throws IOException { - TestMultiLevelSkipList.this.counter++; - return this.input.readByte(); - } - - public void readBytes(byte[] b, int 
offset, int len) throws IOException { - TestMultiLevelSkipList.this.counter += len; - this.input.readBytes(b, offset, len); - } - - public void close() throws IOException { - this.input.close(); - } - - public long getFilePointer() { - return this.input.getFilePointer(); - } - - public void seek(long pos) throws IOException { - this.input.seek(pos); - } - - public long length() { - return this.input.length(); - } - - public Object clone() { - return new CountingStream((IndexInput) this.input.clone()); - } - - } -} +package org.apache.lucene.index; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import java.io.IOException; +import java.io.Reader; + +import org.apache.lucene.util.LuceneTestCase; + +import org.apache.lucene.analysis.Analyzer; +import org.apache.lucene.analysis.LowerCaseTokenizer; +import org.apache.lucene.analysis.Token; +import org.apache.lucene.analysis.TokenFilter; +import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.document.Document; +import org.apache.lucene.document.Field; +import org.apache.lucene.document.Field.Index; +import org.apache.lucene.document.Field.Store; +import org.apache.lucene.store.IndexInput; +import org.apache.lucene.store.RAMDirectory; + +/** + * This testcase tests whether multi-level skipping is being used + * to reduce I/O while skipping through posting lists. + * + * Skipping in general is already covered by several other + * testcases. 
+ * + */ +public class TestMultiLevelSkipList extends LuceneTestCase { + public void testSimpleSkip() throws IOException { + RAMDirectory dir = new RAMDirectory(); + IndexWriter writer = new IndexWriter(dir, new PayloadAnalyzer(), true, + IndexWriter.MaxFieldLength.LIMITED); + Term term = new Term("test", "a"); + for (int i = 0; i < 5000; i++) { + Document d1 = new Document(); + d1.add(new Field(term.field(), term.text(), Store.NO, Index.TOKENIZED)); + writer.addDocument(d1); + } + writer.flush(); + writer.optimize(); + writer.close(); + + IndexReader reader = IndexReader.open(dir); + SegmentTermPositions tp = (SegmentTermPositions) reader.termPositions(); + tp.freqStream = new CountingStream(tp.freqStream); + + for (int i = 0; i < 2; i++) { + counter = 0; + tp.seek(term); + + checkSkipTo(tp, 14, 185); // no skips + checkSkipTo(tp, 17, 190); // one skip on level 0 + checkSkipTo(tp, 287, 200); // one skip on level 1, two on level 0 + + // this test would fail if we had only one skip level, + // because than more bytes would be read from the freqStream + checkSkipTo(tp, 4800, 250);// one skip on level 2 + } + } + + public void checkSkipTo(TermPositions tp, int target, int maxCounter) throws IOException { + tp.skipTo(target); + if (maxCounter < counter) { + fail("Too many bytes read: " + counter); + } + + assertEquals("Wrong document " + tp.doc() + " after skipTo target " + target, target, tp.doc()); + assertEquals("Frequency is not 1: " + tp.freq(), 1,tp.freq()); + tp.nextPosition(); + byte[] b = new byte[1]; + tp.getPayload(b, 0); + assertEquals("Wrong payload for the target " + target + ": " + b[0], (byte) target, b[0]); + } + + private static class PayloadAnalyzer extends Analyzer { + public TokenStream tokenStream(String fieldName, Reader reader) { + return new PayloadFilter(new LowerCaseTokenizer(reader)); + } + + } + + private static class PayloadFilter extends TokenFilter { + static int count = 0; + + protected PayloadFilter(TokenStream input) { + super(input); + } + + public Token next() throws IOException { + Token t = input.next(); + if (t != null) { + t.setPayload(new Payload(new byte[] { (byte) count++ })); + } + return t; + } + + } + + private int counter = 0; + + // Simply extends IndexInput in a way that we are able to count the number + // of bytes read + class CountingStream extends IndexInput { + private IndexInput input; + + CountingStream(IndexInput input) { + this.input = input; + } + + public byte readByte() throws IOException { + TestMultiLevelSkipList.this.counter++; + return this.input.readByte(); + } + + public void readBytes(byte[] b, int offset, int len) throws IOException { + TestMultiLevelSkipList.this.counter += len; + this.input.readBytes(b, offset, len); + } + + public void close() throws IOException { + this.input.close(); + } + + public long getFilePointer() { + return this.input.getFilePointer(); + } + + public void seek(long pos) throws IOException { + this.input.seek(pos); + } + + public long length() { + return this.input.length(); + } + + public Object clone() { + return new CountingStream((IndexInput) this.input.clone()); + } + + } +} diff --git a/src/test/org/apache/lucene/index/TestPayloads.java b/src/test/org/apache/lucene/index/TestPayloads.java index 369f3518f0f..bc46678f006 100644 --- a/src/test/org/apache/lucene/index/TestPayloads.java +++ b/src/test/org/apache/lucene/index/TestPayloads.java @@ -1,586 +1,586 @@ -package org.apache.lucene.index; - -/** - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license 
agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -import java.io.File; -import java.io.IOException; -import java.io.Reader; -import java.io.UnsupportedEncodingException; -import java.util.ArrayList; -import java.util.HashMap; -import java.util.List; -import java.util.Map; -import java.util.Random; - -import org.apache.lucene.util.LuceneTestCase; -import org.apache.lucene.util.UnicodeUtil; - -import org.apache.lucene.analysis.Analyzer; -import org.apache.lucene.analysis.Token; -import org.apache.lucene.analysis.TokenFilter; -import org.apache.lucene.analysis.TokenStream; -import org.apache.lucene.analysis.WhitespaceAnalyzer; -import org.apache.lucene.analysis.WhitespaceTokenizer; -import org.apache.lucene.document.Document; -import org.apache.lucene.document.Field; -import org.apache.lucene.store.Directory; -import org.apache.lucene.store.FSDirectory; -import org.apache.lucene.store.RAMDirectory; - - -public class TestPayloads extends LuceneTestCase { - - // Simple tests to test the Payload class - public void testPayload() throws Exception { - byte[] testData = "This is a test!".getBytes(); - Payload payload = new Payload(testData); - assertEquals("Wrong payload length.", testData.length, payload.length()); - - // test copyTo() - byte[] target = new byte[testData.length - 1]; - try { - payload.copyTo(target, 0); - fail("Expected exception not thrown"); - } catch (Exception expected) { - // expected exception - } - - target = new byte[testData.length + 3]; - payload.copyTo(target, 3); - - for (int i = 0; i < testData.length; i++) { - assertEquals(testData[i], target[i + 3]); - } - - - // test toByteArray() - target = payload.toByteArray(); - assertByteArrayEquals(testData, target); - - // test byteAt() - for (int i = 0; i < testData.length; i++) { - assertEquals(payload.byteAt(i), testData[i]); - } - - try { - payload.byteAt(testData.length + 1); - fail("Expected exception not thrown"); - } catch (Exception expected) { - // expected exception - } - - Payload clone = (Payload) payload.clone(); - assertEquals(payload.length(), clone.length()); - for (int i = 0; i < payload.length(); i++) { - assertEquals(payload.byteAt(i), clone.byteAt(i)); - } - - } - - // Tests whether the DocumentWriter and SegmentMerger correctly enable the - // payload bit in the FieldInfo - public void testPayloadFieldBit() throws Exception { - Directory ram = new RAMDirectory(); - PayloadAnalyzer analyzer = new PayloadAnalyzer(); - IndexWriter writer = new IndexWriter(ram, analyzer, true, IndexWriter.MaxFieldLength.LIMITED); - Document d = new Document(); - // this field won't have any payloads - d.add(new Field("f1", "This field has no payloads", Field.Store.NO, Field.Index.TOKENIZED)); - // this field will have payloads in all docs, however not for all term positions, - // so this field is used to check if the DocumentWriter correctly enables the payloads bit - // 
even if only some term positions have payloads - d.add(new Field("f2", "This field has payloads in all docs", Field.Store.NO, Field.Index.TOKENIZED)); - d.add(new Field("f2", "This field has payloads in all docs", Field.Store.NO, Field.Index.TOKENIZED)); - // this field is used to verify if the SegmentMerger enables payloads for a field if it has payloads - // enabled in only some documents - d.add(new Field("f3", "This field has payloads in some docs", Field.Store.NO, Field.Index.TOKENIZED)); - // only add payload data for field f2 - analyzer.setPayloadData("f2", 1, "somedata".getBytes(), 0, 1); - writer.addDocument(d); - // flush - writer.close(); - - // only one segment in the index, so we can cast to SegmentReader - SegmentReader reader = (SegmentReader) IndexReader.open(ram); - FieldInfos fi = reader.fieldInfos(); - assertFalse("Payload field bit should not be set.", fi.fieldInfo("f1").storePayloads); - assertTrue("Payload field bit should be set.", fi.fieldInfo("f2").storePayloads); - assertFalse("Payload field bit should not be set.", fi.fieldInfo("f3").storePayloads); - reader.close(); - - // now we add another document which has payloads for field f3 and verify if the SegmentMerger - // enabled payloads for that field - writer = new IndexWriter(ram, analyzer, true, IndexWriter.MaxFieldLength.LIMITED); - d = new Document(); - d.add(new Field("f1", "This field has no payloads", Field.Store.NO, Field.Index.TOKENIZED)); - d.add(new Field("f2", "This field has payloads in all docs", Field.Store.NO, Field.Index.TOKENIZED)); - d.add(new Field("f2", "This field has payloads in all docs", Field.Store.NO, Field.Index.TOKENIZED)); - d.add(new Field("f3", "This field has payloads in some docs", Field.Store.NO, Field.Index.TOKENIZED)); - // add payload data for field f2 and f3 - analyzer.setPayloadData("f2", "somedata".getBytes(), 0, 1); - analyzer.setPayloadData("f3", "somedata".getBytes(), 0, 3); - writer.addDocument(d); - // force merge - writer.optimize(); - // flush - writer.close(); - - // only one segment in the index, so we can cast to SegmentReader - reader = (SegmentReader) IndexReader.open(ram); - fi = reader.fieldInfos(); - assertFalse("Payload field bit should not be set.", fi.fieldInfo("f1").storePayloads); - assertTrue("Payload field bit should be set.", fi.fieldInfo("f2").storePayloads); - assertTrue("Payload field bit should be set.", fi.fieldInfo("f3").storePayloads); - reader.close(); - } - - // Tests if payloads are correctly stored and loaded using both RamDirectory and FSDirectory - public void testPayloadsEncoding() throws Exception { - // first perform the test using a RAMDirectory - Directory dir = new RAMDirectory(); - performTest(dir); - - // now use a FSDirectory and repeat same test - String dirName = "test_payloads"; - dir = FSDirectory.getDirectory(dirName); - performTest(dir); - rmDir(dirName); - } - - // builds an index with payloads in the given Directory and performs - // different tests to verify the payload encoding - private void performTest(Directory dir) throws Exception { - PayloadAnalyzer analyzer = new PayloadAnalyzer(); - IndexWriter writer = new IndexWriter(dir, analyzer, true, IndexWriter.MaxFieldLength.LIMITED); - - // should be in sync with value in TermInfosWriter - final int skipInterval = 16; - - final int numTerms = 5; - final String fieldName = "f1"; - - int numDocs = skipInterval + 1; - // create content for the test documents with just a few terms - Term[] terms = generateTerms(fieldName, numTerms); - StringBuffer sb = new StringBuffer(); 
- for (int i = 0; i < terms.length; i++) { - sb.append(terms[i].text); - sb.append(" "); - } - String content = sb.toString(); - - - int payloadDataLength = numTerms * numDocs * 2 + numTerms * numDocs * (numDocs - 1) / 2; - byte[] payloadData = generateRandomData(payloadDataLength); - - Document d = new Document(); - d.add(new Field(fieldName, content, Field.Store.NO, Field.Index.TOKENIZED)); - // add the same document multiple times to have the same payload lengths for all - // occurrences within two consecutive skip intervals - int offset = 0; - for (int i = 0; i < 2 * numDocs; i++) { - analyzer.setPayloadData(fieldName, payloadData, offset, 1); - offset += numTerms; - writer.addDocument(d); - } - - // make sure we create more than one segment to test merging - writer.flush(); - - // now we make sure to have different payload lengths next at the next skip point - for (int i = 0; i < numDocs; i++) { - analyzer.setPayloadData(fieldName, payloadData, offset, i); - offset += i * numTerms; - writer.addDocument(d); - } - - writer.optimize(); - // flush - writer.close(); - - - /* - * Verify the index - * first we test if all payloads are stored correctly - */ - IndexReader reader = IndexReader.open(dir); - - byte[] verifyPayloadData = new byte[payloadDataLength]; - offset = 0; - TermPositions[] tps = new TermPositions[numTerms]; - for (int i = 0; i < numTerms; i++) { - tps[i] = reader.termPositions(terms[i]); - } - - while (tps[0].next()) { - for (int i = 1; i < numTerms; i++) { - tps[i].next(); - } - int freq = tps[0].freq(); - - for (int i = 0; i < freq; i++) { - for (int j = 0; j < numTerms; j++) { - tps[j].nextPosition(); - tps[j].getPayload(verifyPayloadData, offset); - offset += tps[j].getPayloadLength(); - } - } - } - - for (int i = 0; i < numTerms; i++) { - tps[i].close(); - } - - assertByteArrayEquals(payloadData, verifyPayloadData); - - /* - * test lazy skipping - */ - TermPositions tp = reader.termPositions(terms[0]); - tp.next(); - tp.nextPosition(); - // now we don't read this payload - tp.nextPosition(); - assertEquals("Wrong payload length.", 1, tp.getPayloadLength()); - byte[] payload = tp.getPayload(null, 0); - assertEquals(payload[0], payloadData[numTerms]); - tp.nextPosition(); - - // we don't read this payload and skip to a different document - tp.skipTo(5); - tp.nextPosition(); - assertEquals("Wrong payload length.", 1, tp.getPayloadLength()); - payload = tp.getPayload(null, 0); - assertEquals(payload[0], payloadData[5 * numTerms]); - - - /* - * Test different lengths at skip points - */ - tp.seek(terms[1]); - tp.next(); - tp.nextPosition(); - assertEquals("Wrong payload length.", 1, tp.getPayloadLength()); - tp.skipTo(skipInterval - 1); - tp.nextPosition(); - assertEquals("Wrong payload length.", 1, tp.getPayloadLength()); - tp.skipTo(2 * skipInterval - 1); - tp.nextPosition(); - assertEquals("Wrong payload length.", 1, tp.getPayloadLength()); - tp.skipTo(3 * skipInterval - 1); - tp.nextPosition(); - assertEquals("Wrong payload length.", 3 * skipInterval - 2 * numDocs - 1, tp.getPayloadLength()); - - /* - * Test multiple call of getPayload() - */ - tp.getPayload(null, 0); - try { - // it is forbidden to call getPayload() more than once - // without calling nextPosition() - tp.getPayload(null, 0); - fail("Expected exception not thrown"); - } catch (Exception expected) { - // expected exception - } - - reader.close(); - - // test long payload - analyzer = new PayloadAnalyzer(); - writer = new IndexWriter(dir, analyzer, true, IndexWriter.MaxFieldLength.LIMITED); - String 
singleTerm = "lucene"; - - d = new Document(); - d.add(new Field(fieldName, singleTerm, Field.Store.NO, Field.Index.TOKENIZED)); - // add a payload whose length is greater than the buffer size of BufferedIndexOutput - payloadData = generateRandomData(2000); - analyzer.setPayloadData(fieldName, payloadData, 100, 1500); - writer.addDocument(d); - - - writer.optimize(); - // flush - writer.close(); - - reader = IndexReader.open(dir); - tp = reader.termPositions(new Term(fieldName, singleTerm)); - tp.next(); - tp.nextPosition(); - - verifyPayloadData = new byte[tp.getPayloadLength()]; - tp.getPayload(verifyPayloadData, 0); - byte[] portion = new byte[1500]; - System.arraycopy(payloadData, 100, portion, 0, 1500); - - assertByteArrayEquals(portion, verifyPayloadData); - reader.close(); - - } - - private static Random rnd = new Random(); - - private static void generateRandomData(byte[] data) { - rnd.nextBytes(data); - } - - private static byte[] generateRandomData(int n) { - byte[] data = new byte[n]; - generateRandomData(data); - return data; - } - - private Term[] generateTerms(String fieldName, int n) { - int maxDigits = (int) (Math.log(n) / Math.log(10)); - Term[] terms = new Term[n]; - StringBuffer sb = new StringBuffer(); - for (int i = 0; i < n; i++) { - sb.setLength(0); - sb.append("t"); - int zeros = maxDigits - (int) (Math.log(i) / Math.log(10)); - for (int j = 0; j < zeros; j++) { - sb.append("0"); - } - sb.append(i); - terms[i] = new Term(fieldName, sb.toString()); - } - return terms; - } - - - private void rmDir(String dir) { - File fileDir = new File(dir); - if (fileDir.exists()) { - File[] files = fileDir.listFiles(); - if (files != null) { - for (int i = 0; i < files.length; i++) { - files[i].delete(); - } - } - fileDir.delete(); - } - } - - - - void assertByteArrayEquals(byte[] b1, byte[] b2) { - if (b1.length != b2.length) { - fail("Byte arrays have different lengths: " + b1.length + ", " + b2.length); - } - - for (int i = 0; i < b1.length; i++) { - if (b1[i] != b2[i]) { - fail("Byte arrays different at index " + i + ": " + b1[i] + ", " + b2[i]); - } - } - } - - - /** - * This Analyzer uses an WhitespaceTokenizer and PayloadFilter. - */ - private static class PayloadAnalyzer extends Analyzer { - Map fieldToData = new HashMap(); - - void setPayloadData(String field, byte[] data, int offset, int length) { - fieldToData.put(field, new PayloadData(0, data, offset, length)); - } - - void setPayloadData(String field, int numFieldInstancesToSkip, byte[] data, int offset, int length) { - fieldToData.put(field, new PayloadData(numFieldInstancesToSkip, data, offset, length)); - } - - public TokenStream tokenStream(String fieldName, Reader reader) { - PayloadData payload = (PayloadData) fieldToData.get(fieldName); - TokenStream ts = new WhitespaceTokenizer(reader); - if (payload != null) { - if (payload.numFieldInstancesToSkip == 0) { - ts = new PayloadFilter(ts, payload.data, payload.offset, payload.length); - } else { - payload.numFieldInstancesToSkip--; - } - } - return ts; - } - - private static class PayloadData { - byte[] data; - int offset; - int length; - int numFieldInstancesToSkip; - - PayloadData(int skip, byte[] data, int offset, int length) { - numFieldInstancesToSkip = skip; - this.data = data; - this.offset = offset; - this.length = length; - } - } - } - - - /** - * This Filter adds payloads to the tokens. 
- */ - private static class PayloadFilter extends TokenFilter { - private byte[] data; - private int length; - private int offset; - Payload payload = new Payload(); - - public PayloadFilter(TokenStream in, byte[] data, int offset, int length) { - super(in); - this.data = data; - this.length = length; - this.offset = offset; - } - - public Token next(Token token) throws IOException { - token = input.next(token); - if (token != null) { - if (offset + length <= data.length) { - Payload p = null; - if (p == null) { - p = new Payload(); - token.setPayload(p); - } - p.setData(data, offset, length); - offset += length; - } else { - token.setPayload(null); - } - } - - return token; - } - } - - public void testThreadSafety() throws IOException { - final int numThreads = 5; - final int numDocs = 50; - final ByteArrayPool pool = new ByteArrayPool(numThreads, 5); - - Directory dir = new RAMDirectory(); - final IndexWriter writer = new IndexWriter(dir, new WhitespaceAnalyzer(), IndexWriter.MaxFieldLength.LIMITED); - final String field = "test"; - - Thread[] ingesters = new Thread[numThreads]; - for (int i = 0; i < numThreads; i++) { - ingesters[i] = new Thread() { - public void run() { - try { - for (int j = 0; j < numDocs; j++) { - Document d = new Document(); - d.add(new Field(field, new PoolingPayloadTokenStream(pool))); - writer.addDocument(d); - } - } catch (Exception e) { - e.printStackTrace(); - fail(e.toString()); - } - } - }; - ingesters[i].start(); - } - - for (int i = 0; i < numThreads; i++) { - try { - ingesters[i].join(); - } catch (InterruptedException e) {} - } - writer.close(); - IndexReader reader = IndexReader.open(dir); - TermEnum terms = reader.terms(); - while (terms.next()) { - TermPositions tp = reader.termPositions(terms.term()); - while(tp.next()) { - int freq = tp.freq(); - for (int i = 0; i < freq; i++) { - tp.nextPosition(); - assertEquals(pool.bytesToString(tp.getPayload(new byte[5], 0)), terms.term().text); - } - } - tp.close(); - } - terms.close(); - reader.close(); - - assertEquals(pool.size(), numThreads); - } - - private static class PoolingPayloadTokenStream extends TokenStream { - private byte[] payload; - private boolean first; - private ByteArrayPool pool; - private String term; - PoolingPayloadTokenStream(ByteArrayPool pool) { - this.pool = pool; - payload = pool.get(); - generateRandomData(payload); - term = pool.bytesToString(payload); - first = true; - } - - public Token next() throws IOException { - if (!first) return null; - Token t = new Token(term, 0, 0); - t.setPayload(new Payload(payload)); - return t; - } - - public void close() throws IOException { - pool.release(payload); - } - - } - - private static class ByteArrayPool { - private List pool; - - ByteArrayPool(int capacity, int size) { - pool = new ArrayList(); - for (int i = 0; i < capacity; i++) { - pool.add(new byte[size]); - } - } - - private UnicodeUtil.UTF8Result utf8Result = new UnicodeUtil.UTF8Result(); - - synchronized String bytesToString(byte[] bytes) { - String s = new String(bytes); - UnicodeUtil.UTF16toUTF8(s, 0, s.length(), utf8Result); - try { - return new String(utf8Result.result, 0, utf8Result.length, "UTF-8"); - } catch (UnsupportedEncodingException uee) { - return null; - } - } - - synchronized byte[] get() { - return (byte[]) pool.remove(0); - } - - synchronized void release(byte[] b) { - pool.add(b); - } - - synchronized int size() { - return pool.size(); - } - } -} +package org.apache.lucene.index; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * 
contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import java.io.File; +import java.io.IOException; +import java.io.Reader; +import java.io.UnsupportedEncodingException; +import java.util.ArrayList; +import java.util.HashMap; +import java.util.List; +import java.util.Map; +import java.util.Random; + +import org.apache.lucene.util.LuceneTestCase; +import org.apache.lucene.util.UnicodeUtil; + +import org.apache.lucene.analysis.Analyzer; +import org.apache.lucene.analysis.Token; +import org.apache.lucene.analysis.TokenFilter; +import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.WhitespaceAnalyzer; +import org.apache.lucene.analysis.WhitespaceTokenizer; +import org.apache.lucene.document.Document; +import org.apache.lucene.document.Field; +import org.apache.lucene.store.Directory; +import org.apache.lucene.store.FSDirectory; +import org.apache.lucene.store.RAMDirectory; + + +public class TestPayloads extends LuceneTestCase { + + // Simple tests to test the Payload class + public void testPayload() throws Exception { + byte[] testData = "This is a test!".getBytes(); + Payload payload = new Payload(testData); + assertEquals("Wrong payload length.", testData.length, payload.length()); + + // test copyTo() + byte[] target = new byte[testData.length - 1]; + try { + payload.copyTo(target, 0); + fail("Expected exception not thrown"); + } catch (Exception expected) { + // expected exception + } + + target = new byte[testData.length + 3]; + payload.copyTo(target, 3); + + for (int i = 0; i < testData.length; i++) { + assertEquals(testData[i], target[i + 3]); + } + + + // test toByteArray() + target = payload.toByteArray(); + assertByteArrayEquals(testData, target); + + // test byteAt() + for (int i = 0; i < testData.length; i++) { + assertEquals(payload.byteAt(i), testData[i]); + } + + try { + payload.byteAt(testData.length + 1); + fail("Expected exception not thrown"); + } catch (Exception expected) { + // expected exception + } + + Payload clone = (Payload) payload.clone(); + assertEquals(payload.length(), clone.length()); + for (int i = 0; i < payload.length(); i++) { + assertEquals(payload.byteAt(i), clone.byteAt(i)); + } + + } + + // Tests whether the DocumentWriter and SegmentMerger correctly enable the + // payload bit in the FieldInfo + public void testPayloadFieldBit() throws Exception { + Directory ram = new RAMDirectory(); + PayloadAnalyzer analyzer = new PayloadAnalyzer(); + IndexWriter writer = new IndexWriter(ram, analyzer, true, IndexWriter.MaxFieldLength.LIMITED); + Document d = new Document(); + // this field won't have any payloads + d.add(new Field("f1", "This field has no payloads", Field.Store.NO, Field.Index.TOKENIZED)); + // this field will have payloads in all docs, however not for all term positions, + // so this field is used to check if the DocumentWriter correctly enables 
the payloads bit + // even if only some term positions have payloads + d.add(new Field("f2", "This field has payloads in all docs", Field.Store.NO, Field.Index.TOKENIZED)); + d.add(new Field("f2", "This field has payloads in all docs", Field.Store.NO, Field.Index.TOKENIZED)); + // this field is used to verify if the SegmentMerger enables payloads for a field if it has payloads + // enabled in only some documents + d.add(new Field("f3", "This field has payloads in some docs", Field.Store.NO, Field.Index.TOKENIZED)); + // only add payload data for field f2 + analyzer.setPayloadData("f2", 1, "somedata".getBytes(), 0, 1); + writer.addDocument(d); + // flush + writer.close(); + + // only one segment in the index, so we can cast to SegmentReader + SegmentReader reader = (SegmentReader) IndexReader.open(ram); + FieldInfos fi = reader.fieldInfos(); + assertFalse("Payload field bit should not be set.", fi.fieldInfo("f1").storePayloads); + assertTrue("Payload field bit should be set.", fi.fieldInfo("f2").storePayloads); + assertFalse("Payload field bit should not be set.", fi.fieldInfo("f3").storePayloads); + reader.close(); + + // now we add another document which has payloads for field f3 and verify if the SegmentMerger + // enabled payloads for that field + writer = new IndexWriter(ram, analyzer, true, IndexWriter.MaxFieldLength.LIMITED); + d = new Document(); + d.add(new Field("f1", "This field has no payloads", Field.Store.NO, Field.Index.TOKENIZED)); + d.add(new Field("f2", "This field has payloads in all docs", Field.Store.NO, Field.Index.TOKENIZED)); + d.add(new Field("f2", "This field has payloads in all docs", Field.Store.NO, Field.Index.TOKENIZED)); + d.add(new Field("f3", "This field has payloads in some docs", Field.Store.NO, Field.Index.TOKENIZED)); + // add payload data for field f2 and f3 + analyzer.setPayloadData("f2", "somedata".getBytes(), 0, 1); + analyzer.setPayloadData("f3", "somedata".getBytes(), 0, 3); + writer.addDocument(d); + // force merge + writer.optimize(); + // flush + writer.close(); + + // only one segment in the index, so we can cast to SegmentReader + reader = (SegmentReader) IndexReader.open(ram); + fi = reader.fieldInfos(); + assertFalse("Payload field bit should not be set.", fi.fieldInfo("f1").storePayloads); + assertTrue("Payload field bit should be set.", fi.fieldInfo("f2").storePayloads); + assertTrue("Payload field bit should be set.", fi.fieldInfo("f3").storePayloads); + reader.close(); + } + + // Tests if payloads are correctly stored and loaded using both RamDirectory and FSDirectory + public void testPayloadsEncoding() throws Exception { + // first perform the test using a RAMDirectory + Directory dir = new RAMDirectory(); + performTest(dir); + + // now use a FSDirectory and repeat same test + String dirName = "test_payloads"; + dir = FSDirectory.getDirectory(dirName); + performTest(dir); + rmDir(dirName); + } + + // builds an index with payloads in the given Directory and performs + // different tests to verify the payload encoding + private void performTest(Directory dir) throws Exception { + PayloadAnalyzer analyzer = new PayloadAnalyzer(); + IndexWriter writer = new IndexWriter(dir, analyzer, true, IndexWriter.MaxFieldLength.LIMITED); + + // should be in sync with value in TermInfosWriter + final int skipInterval = 16; + + final int numTerms = 5; + final String fieldName = "f1"; + + int numDocs = skipInterval + 1; + // create content for the test documents with just a few terms + Term[] terms = generateTerms(fieldName, numTerms); + StringBuffer sb 
= new StringBuffer(); + for (int i = 0; i < terms.length; i++) { + sb.append(terms[i].text); + sb.append(" "); + } + String content = sb.toString(); + + + int payloadDataLength = numTerms * numDocs * 2 + numTerms * numDocs * (numDocs - 1) / 2; + byte[] payloadData = generateRandomData(payloadDataLength); + + Document d = new Document(); + d.add(new Field(fieldName, content, Field.Store.NO, Field.Index.TOKENIZED)); + // add the same document multiple times to have the same payload lengths for all + // occurrences within two consecutive skip intervals + int offset = 0; + for (int i = 0; i < 2 * numDocs; i++) { + analyzer.setPayloadData(fieldName, payloadData, offset, 1); + offset += numTerms; + writer.addDocument(d); + } + + // make sure we create more than one segment to test merging + writer.flush(); + + // now we make sure to have different payload lengths next at the next skip point + for (int i = 0; i < numDocs; i++) { + analyzer.setPayloadData(fieldName, payloadData, offset, i); + offset += i * numTerms; + writer.addDocument(d); + } + + writer.optimize(); + // flush + writer.close(); + + + /* + * Verify the index + * first we test if all payloads are stored correctly + */ + IndexReader reader = IndexReader.open(dir); + + byte[] verifyPayloadData = new byte[payloadDataLength]; + offset = 0; + TermPositions[] tps = new TermPositions[numTerms]; + for (int i = 0; i < numTerms; i++) { + tps[i] = reader.termPositions(terms[i]); + } + + while (tps[0].next()) { + for (int i = 1; i < numTerms; i++) { + tps[i].next(); + } + int freq = tps[0].freq(); + + for (int i = 0; i < freq; i++) { + for (int j = 0; j < numTerms; j++) { + tps[j].nextPosition(); + tps[j].getPayload(verifyPayloadData, offset); + offset += tps[j].getPayloadLength(); + } + } + } + + for (int i = 0; i < numTerms; i++) { + tps[i].close(); + } + + assertByteArrayEquals(payloadData, verifyPayloadData); + + /* + * test lazy skipping + */ + TermPositions tp = reader.termPositions(terms[0]); + tp.next(); + tp.nextPosition(); + // now we don't read this payload + tp.nextPosition(); + assertEquals("Wrong payload length.", 1, tp.getPayloadLength()); + byte[] payload = tp.getPayload(null, 0); + assertEquals(payload[0], payloadData[numTerms]); + tp.nextPosition(); + + // we don't read this payload and skip to a different document + tp.skipTo(5); + tp.nextPosition(); + assertEquals("Wrong payload length.", 1, tp.getPayloadLength()); + payload = tp.getPayload(null, 0); + assertEquals(payload[0], payloadData[5 * numTerms]); + + + /* + * Test different lengths at skip points + */ + tp.seek(terms[1]); + tp.next(); + tp.nextPosition(); + assertEquals("Wrong payload length.", 1, tp.getPayloadLength()); + tp.skipTo(skipInterval - 1); + tp.nextPosition(); + assertEquals("Wrong payload length.", 1, tp.getPayloadLength()); + tp.skipTo(2 * skipInterval - 1); + tp.nextPosition(); + assertEquals("Wrong payload length.", 1, tp.getPayloadLength()); + tp.skipTo(3 * skipInterval - 1); + tp.nextPosition(); + assertEquals("Wrong payload length.", 3 * skipInterval - 2 * numDocs - 1, tp.getPayloadLength()); + + /* + * Test multiple call of getPayload() + */ + tp.getPayload(null, 0); + try { + // it is forbidden to call getPayload() more than once + // without calling nextPosition() + tp.getPayload(null, 0); + fail("Expected exception not thrown"); + } catch (Exception expected) { + // expected exception + } + + reader.close(); + + // test long payload + analyzer = new PayloadAnalyzer(); + writer = new IndexWriter(dir, analyzer, true, 
IndexWriter.MaxFieldLength.LIMITED); + String singleTerm = "lucene"; + + d = new Document(); + d.add(new Field(fieldName, singleTerm, Field.Store.NO, Field.Index.TOKENIZED)); + // add a payload whose length is greater than the buffer size of BufferedIndexOutput + payloadData = generateRandomData(2000); + analyzer.setPayloadData(fieldName, payloadData, 100, 1500); + writer.addDocument(d); + + + writer.optimize(); + // flush + writer.close(); + + reader = IndexReader.open(dir); + tp = reader.termPositions(new Term(fieldName, singleTerm)); + tp.next(); + tp.nextPosition(); + + verifyPayloadData = new byte[tp.getPayloadLength()]; + tp.getPayload(verifyPayloadData, 0); + byte[] portion = new byte[1500]; + System.arraycopy(payloadData, 100, portion, 0, 1500); + + assertByteArrayEquals(portion, verifyPayloadData); + reader.close(); + + } + + private static Random rnd = new Random(); + + private static void generateRandomData(byte[] data) { + rnd.nextBytes(data); + } + + private static byte[] generateRandomData(int n) { + byte[] data = new byte[n]; + generateRandomData(data); + return data; + } + + private Term[] generateTerms(String fieldName, int n) { + int maxDigits = (int) (Math.log(n) / Math.log(10)); + Term[] terms = new Term[n]; + StringBuffer sb = new StringBuffer(); + for (int i = 0; i < n; i++) { + sb.setLength(0); + sb.append("t"); + int zeros = maxDigits - (int) (Math.log(i) / Math.log(10)); + for (int j = 0; j < zeros; j++) { + sb.append("0"); + } + sb.append(i); + terms[i] = new Term(fieldName, sb.toString()); + } + return terms; + } + + + private void rmDir(String dir) { + File fileDir = new File(dir); + if (fileDir.exists()) { + File[] files = fileDir.listFiles(); + if (files != null) { + for (int i = 0; i < files.length; i++) { + files[i].delete(); + } + } + fileDir.delete(); + } + } + + + + void assertByteArrayEquals(byte[] b1, byte[] b2) { + if (b1.length != b2.length) { + fail("Byte arrays have different lengths: " + b1.length + ", " + b2.length); + } + + for (int i = 0; i < b1.length; i++) { + if (b1[i] != b2[i]) { + fail("Byte arrays different at index " + i + ": " + b1[i] + ", " + b2[i]); + } + } + } + + + /** + * This Analyzer uses an WhitespaceTokenizer and PayloadFilter. + */ + private static class PayloadAnalyzer extends Analyzer { + Map fieldToData = new HashMap(); + + void setPayloadData(String field, byte[] data, int offset, int length) { + fieldToData.put(field, new PayloadData(0, data, offset, length)); + } + + void setPayloadData(String field, int numFieldInstancesToSkip, byte[] data, int offset, int length) { + fieldToData.put(field, new PayloadData(numFieldInstancesToSkip, data, offset, length)); + } + + public TokenStream tokenStream(String fieldName, Reader reader) { + PayloadData payload = (PayloadData) fieldToData.get(fieldName); + TokenStream ts = new WhitespaceTokenizer(reader); + if (payload != null) { + if (payload.numFieldInstancesToSkip == 0) { + ts = new PayloadFilter(ts, payload.data, payload.offset, payload.length); + } else { + payload.numFieldInstancesToSkip--; + } + } + return ts; + } + + private static class PayloadData { + byte[] data; + int offset; + int length; + int numFieldInstancesToSkip; + + PayloadData(int skip, byte[] data, int offset, int length) { + numFieldInstancesToSkip = skip; + this.data = data; + this.offset = offset; + this.length = length; + } + } + } + + + /** + * This Filter adds payloads to the tokens. 
+ */ + private static class PayloadFilter extends TokenFilter { + private byte[] data; + private int length; + private int offset; + Payload payload = new Payload(); + + public PayloadFilter(TokenStream in, byte[] data, int offset, int length) { + super(in); + this.data = data; + this.length = length; + this.offset = offset; + } + + public Token next(Token token) throws IOException { + token = input.next(token); + if (token != null) { + if (offset + length <= data.length) { + Payload p = null; + if (p == null) { + p = new Payload(); + token.setPayload(p); + } + p.setData(data, offset, length); + offset += length; + } else { + token.setPayload(null); + } + } + + return token; + } + } + + public void testThreadSafety() throws IOException { + final int numThreads = 5; + final int numDocs = 50; + final ByteArrayPool pool = new ByteArrayPool(numThreads, 5); + + Directory dir = new RAMDirectory(); + final IndexWriter writer = new IndexWriter(dir, new WhitespaceAnalyzer(), IndexWriter.MaxFieldLength.LIMITED); + final String field = "test"; + + Thread[] ingesters = new Thread[numThreads]; + for (int i = 0; i < numThreads; i++) { + ingesters[i] = new Thread() { + public void run() { + try { + for (int j = 0; j < numDocs; j++) { + Document d = new Document(); + d.add(new Field(field, new PoolingPayloadTokenStream(pool))); + writer.addDocument(d); + } + } catch (Exception e) { + e.printStackTrace(); + fail(e.toString()); + } + } + }; + ingesters[i].start(); + } + + for (int i = 0; i < numThreads; i++) { + try { + ingesters[i].join(); + } catch (InterruptedException e) {} + } + writer.close(); + IndexReader reader = IndexReader.open(dir); + TermEnum terms = reader.terms(); + while (terms.next()) { + TermPositions tp = reader.termPositions(terms.term()); + while(tp.next()) { + int freq = tp.freq(); + for (int i = 0; i < freq; i++) { + tp.nextPosition(); + assertEquals(pool.bytesToString(tp.getPayload(new byte[5], 0)), terms.term().text); + } + } + tp.close(); + } + terms.close(); + reader.close(); + + assertEquals(pool.size(), numThreads); + } + + private static class PoolingPayloadTokenStream extends TokenStream { + private byte[] payload; + private boolean first; + private ByteArrayPool pool; + private String term; + PoolingPayloadTokenStream(ByteArrayPool pool) { + this.pool = pool; + payload = pool.get(); + generateRandomData(payload); + term = pool.bytesToString(payload); + first = true; + } + + public Token next() throws IOException { + if (!first) return null; + Token t = new Token(term, 0, 0); + t.setPayload(new Payload(payload)); + return t; + } + + public void close() throws IOException { + pool.release(payload); + } + + } + + private static class ByteArrayPool { + private List pool; + + ByteArrayPool(int capacity, int size) { + pool = new ArrayList(); + for (int i = 0; i < capacity; i++) { + pool.add(new byte[size]); + } + } + + private UnicodeUtil.UTF8Result utf8Result = new UnicodeUtil.UTF8Result(); + + synchronized String bytesToString(byte[] bytes) { + String s = new String(bytes); + UnicodeUtil.UTF16toUTF8(s, 0, s.length(), utf8Result); + try { + return new String(utf8Result.result, 0, utf8Result.length, "UTF-8"); + } catch (UnsupportedEncodingException uee) { + return null; + } + } + + synchronized byte[] get() { + return (byte[]) pool.remove(0); + } + + synchronized void release(byte[] b) { + pool.add(b); + } + + synchronized int size() { + return pool.size(); + } + } +} diff --git a/src/test/org/apache/lucene/index/TestTermdocPerf.java 
b/src/test/org/apache/lucene/index/TestTermdocPerf.java index 1f29db66fc1..f7605052423 100644 --- a/src/test/org/apache/lucene/index/TestTermdocPerf.java +++ b/src/test/org/apache/lucene/index/TestTermdocPerf.java @@ -1,115 +1,115 @@ -package org.apache.lucene.index; - -/** - * Copyright 2006 The Apache Software Foundation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - - -import org.apache.lucene.util.LuceneTestCase; -import org.apache.lucene.store.Directory; -import org.apache.lucene.store.RAMDirectory; -import org.apache.lucene.analysis.Analyzer; -import org.apache.lucene.analysis.TokenStream; -import org.apache.lucene.analysis.Token; -import org.apache.lucene.document.Document; -import org.apache.lucene.document.Field; - -import java.io.Reader; -import java.io.IOException; -import java.util.Random; - -/** - * @author yonik - * @version $Id$ - */ - -class RepeatingTokenStream extends TokenStream { - public int num; - Token t; - - public RepeatingTokenStream(String val) { - t = new Token(val,0,val.length()); - } - - public Token next() throws IOException { - return --num<0 ? null : t; - } -} - - -public class TestTermdocPerf extends LuceneTestCase { - - void addDocs(Directory dir, final int ndocs, String field, final String val, final int maxTF, final float percentDocs) throws IOException { - final Random random = new Random(0); - final RepeatingTokenStream ts = new RepeatingTokenStream(val); - - Analyzer analyzer = new Analyzer() { - public TokenStream tokenStream(String fieldName, Reader reader) { - if (random.nextFloat() < percentDocs) ts.num = random.nextInt(maxTF)+1; - else ts.num=0; - return ts; - } - }; - - Document doc = new Document(); - doc.add(new Field(field,val, Field.Store.NO, Field.Index.NO_NORMS)); - IndexWriter writer = new IndexWriter(dir, analyzer, true, IndexWriter.MaxFieldLength.LIMITED); - writer.setMaxBufferedDocs(100); - writer.setMergeFactor(100); - - for (int i=0; i remove it from the Map. - // At the end the Map must be empty! - resultMap.remove(idHitDate); - } - if(resultMap.size()==0) { - // log("All hits matched"); - } else { - log("Couldn't match "+resultMap.size()+" hits."); - } - assertEquals(resultMap.size(), 0); - } - - /** - * Check the hits for duplicates. - * @param hits - */ - private void checkHits(Hits hits, String prefix) { - if(hits!=null) { - Map idMap = new TreeMap(); - for(int docnum=0;docnum remove it from the Map. + // At the end the Map must be empty! + resultMap.remove(idHitDate); + } + if(resultMap.size()==0) { + // log("All hits matched"); + } else { + log("Couldn't match "+resultMap.size()+" hits."); + } + assertEquals(resultMap.size(), 0); + } + + /** + * Check the hits for duplicates. + * @param hits + */ + private void checkHits(Hits hits, String prefix) { + if(hits!=null) { + Map idMap = new TreeMap(); + for(int docnum=0;docnum
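A quick sanity check on the payload-length assertion in performTest above, under the values the test itself sets (skipInterval = 16, numDocs = skipInterval + 1 = 17): the first 2 * numDocs = 34 documents are written with payload length 1, and the following documents get payload length i for i = 0, 1, 2, ... So the document reached by skipTo(3 * skipInterval - 1) = skipTo(47) is the one written with i = 47 - 34 = 13, and the expected length 3 * skipInterval - 2 * numDocs - 1 = 48 - 34 - 1 = 13 agrees with it, while the earlier targets 15 and 31 still fall in the length-1 range. This is only a worked restatement of the arithmetic already encoded in the test, not an additional claim about the skip-list format.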
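The payload tests above all rely on the same building block: a TokenFilter that attaches a few bytes to each token it passes through, wired in via an Analyzer. A minimal sketch against the same pre-2.9 TokenStream API used throughout this patch (Token next(), the index-package Payload class) is shown below; the class names FixedPayloadFilter and FixedPayloadAnalyzer and the marker byte 42 are made up for illustration and are not part of the patch.

    import java.io.IOException;
    import java.io.Reader;

    import org.apache.lucene.analysis.Analyzer;
    import org.apache.lucene.analysis.Token;
    import org.apache.lucene.analysis.TokenFilter;
    import org.apache.lucene.analysis.TokenStream;
    import org.apache.lucene.analysis.WhitespaceTokenizer;
    import org.apache.lucene.index.Payload;

    /** Hypothetical filter: stamps every token with a single fixed payload byte. */
    public class FixedPayloadFilter extends TokenFilter {
      private final byte marker;

      public FixedPayloadFilter(TokenStream input, byte marker) {
        super(input);
        this.marker = marker;
      }

      public Token next() throws IOException {
        Token t = input.next();
        if (t != null) {
          // attach a one-byte payload at this token's position
          t.setPayload(new Payload(new byte[] { marker }));
        }
        return t;
      }
    }

    /** Analyzer wiring, mirroring the PayloadAnalyzer classes used by the tests above. */
    class FixedPayloadAnalyzer extends Analyzer {
      public TokenStream tokenStream(String fieldName, Reader reader) {
        return new FixedPayloadFilter(new WhitespaceTokenizer(reader), (byte) 42);
      }
    }

Indexing a tokenized field with such an analyzer is the same mechanism that makes the per-field storePayloads bit get set, which is what testPayloadFieldBit checks through FieldInfos.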
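Similarly, the CountingStream inner class in TestMultiLevelSkipList is a plain decorator: it forwards every call to the wrapped IndexInput and only adds byte accounting, which is what lets checkSkipTo assert an upper bound (maxCounter) on the bytes read per skipTo. The same idea expressed with java.io instead of Lucene's IndexInput, kept independent of any particular Lucene version, could look like this sketch (CountingInputStream is a made-up name for illustration):

    import java.io.FilterInputStream;
    import java.io.IOException;
    import java.io.InputStream;

    /** Counts how many bytes have been read through the wrapped stream. */
    public class CountingInputStream extends FilterInputStream {
      private long count;

      public CountingInputStream(InputStream in) {
        super(in);
      }

      public int read() throws IOException {
        int b = in.read();
        if (b != -1) {
          count++;          // one byte delivered
        }
        return b;
      }

      public int read(byte[] buf, int off, int len) throws IOException {
        int n = in.read(buf, off, len);
        if (n > 0) {
          count += n;       // n bytes delivered
        }
        return n;
      }

      public long getCount() {
        return count;
      }
    }

A test wraps whatever stream feeds the code under test, runs the operation, and then asserts that getCount() stays below an expected ceiling, exactly the pattern checkSkipTo follows with its counter field and maxCounter argument.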