From a6fad246dd8feb71df4e32130c43dd5ff6e6697d Mon Sep 17 00:00:00 2001 From: Christoph Goller Date: Wed, 15 Sep 2004 12:50:23 +0000 Subject: [PATCH] Applied patch #29370 supplied by Drew Farris and Bernhard Messer. git-svn-id: https://svn.apache.org/repos/asf/lucene/java/trunk@150510 13f79535-47bb-0310-9956-ffa450edef68 --- .../org/apache/lucene/document/Document.java | 71 +++++++++-- .../org/apache/lucene/document/Field.java | 52 ++++++-- .../org/apache/lucene/index/FieldsReader.java | 36 ++++-- .../org/apache/lucene/index/FieldsWriter.java | 120 ++++++++++-------- .../apache/lucene/document/TestDocument.java | 50 ++++++++ 5 files changed, 239 insertions(+), 90 deletions(-) diff --git a/src/java/org/apache/lucene/document/Document.java b/src/java/org/apache/lucene/document/Document.java index ea0a42686bb..5fb0ce5e693 100644 --- a/src/java/org/apache/lucene/document/Document.java +++ b/src/java/org/apache/lucene/document/Document.java @@ -144,14 +144,16 @@ public final class Document implements java.io.Serializable { /** Returns the string value of the field with the given name if any exist in * this document, or null. If multiple fields exist with this name, this - * method returns the first value added. + * method returns the first value added. If only binary fields with this name + * exist, returns null. */ public final String get(String name) { - Field field = getField(name); - if (field != null) - return field.stringValue(); - else - return null; + for (int i = 0; i < fields.size(); i++) { + Field field = (Field)fields.get(i); + if (field.name().equals(name) && (!field.isBinary())) + return field.stringValue(); + } + return null; } /** Returns an Enumeration of all the fields in a document. */ @@ -189,16 +191,59 @@ public final class Document implements java.io.Serializable { * @return a String[] of field values */ public final String[] getValues(String name) { - Field[] namedFields = getFields(name); - if (namedFields == null) - return null; - String[] values = new String[namedFields.length]; - for (int i = 0; i < namedFields.length; i++) { - values[i] = namedFields[i].stringValue(); + List result = new ArrayList(); + for (int i = 0; i < fields.size(); i++) { + Field field = (Field)fields.get(i); + if (field.name().equals(name) && (!field.isBinary())) + result.add(field.stringValue()); } - return values; + + if (result.size() == 0) + return null; + + return (String[])result.toArray(new String[result.size()]); } + /** + * Returns an array of byte arrays for of the fields that have the name specified + * as the method parameter. This method will return null if no + * binary fields with the specified name are available. + * + * @param name the name of the field + * @return a byte[][] of binary field values. + */ + public final byte[][] getBinaryValues(String name) { + List result = new ArrayList(); + for (int i = 0; i < fields.size(); i++) { + Field field = (Field)fields.get(i); + if (field.name().equals(name) && (field.isBinary())) + result.add(field.binaryValue()); + } + + if (result.size() == 0) + return null; + + return (byte[][])result.toArray(new byte[result.size()][]); + } + + /** + * Returns an array of bytes for the first (or only) field that has the name + * specified as the method parameter. This method will return null + * if no binary fields with the specified name are available. + * There may be non-binary fields with the same name. + * + * @param name the name of the field. + * @return a byte[] containing the binary field value. + */ + public final byte[] getBinaryValue(String name) { + for (int i=0; i < fields.size(); i++) { + Field field = (Field)fields.get(i); + if (field.name().equals(name) && (field.isBinary())) + return field.binaryValue(); + } + return null; + } + /** Prints the fields of a document for human consumption. */ public final String toString() { StringBuffer buffer = new StringBuffer(); diff --git a/src/java/org/apache/lucene/document/Field.java b/src/java/org/apache/lucene/document/Field.java index 5890df6a25c..90fe3fb822a 100644 --- a/src/java/org/apache/lucene/document/Field.java +++ b/src/java/org/apache/lucene/document/Field.java @@ -18,9 +18,10 @@ package org.apache.lucene.document; import java.io.Reader; import java.util.Date; -import org.apache.lucene.index.IndexReader; // for javadoc -import org.apache.lucene.search.Similarity; // for javadoc -import org.apache.lucene.search.Hits; // for javadoc + +import org.apache.lucene.index.IndexReader; +import org.apache.lucene.search.Hits; +import org.apache.lucene.search.Similarity; /** A field is a section of a Document. Each field has two parts, a name and a @@ -33,12 +34,15 @@ import org.apache.lucene.search.Hits; // for javadoc public final class Field implements java.io.Serializable { private String name = "body"; private String stringValue = null; - private boolean storeTermVector = false; private Reader readerValue = null; + private byte[] binaryValue = null; + + private boolean storeTermVector = false; private boolean isStored = false; private boolean isIndexed = true; private boolean isTokenized = true; - + private boolean isBinary = false; + private float boost = 1.0f; public static final class Store { @@ -101,7 +105,7 @@ public final class Field implements java.io.Serializable { * of the document's terms and their number of occurences in that document. */ public static final TermVector YES = new TermVector("YES"); } - + /** Sets the boost factor hits on this field. This value will be * multiplied into the score of all hits on this this field of this * document. @@ -213,7 +217,7 @@ public final class Field implements java.io.Serializable { f.storeTermVector = storeTermVector; return f; } - + /** The name of the field (e.g., "date", "title", "body", ...) as an interned string. */ public String name() { return name; } @@ -224,7 +228,11 @@ public final class Field implements java.io.Serializable { /** The value of the field as a Reader, or null. If null, the String value is used. Exactly one of stringValue() and readerValue() must be set. */ public Reader readerValue() { return readerValue; } - + /** The value of the field in Binary, or null. If null, the Reader or + String value is used. Exactly one of stringValue(), readerValue() and + binaryValue() must be set. */ + public byte[] binaryValue() { return binaryValue; } + /** * Create a field by specifying its name, value and how it will * be saved in the index. Term vectors will not be stored in the index. @@ -240,7 +248,7 @@ public final class Field implements java.io.Serializable { public Field(String name, String value, Store store, Index index) { this(name, value, store, index, TermVector.NO); } - + /** * Create a field by specifying its name, value and how it will * be saved in the index. @@ -340,6 +348,23 @@ public final class Field implements java.io.Serializable { this(name, string, store, index, token, false); } + public Field(String name, byte[] value) { + if (name == null) + throw new IllegalArgumentException("name cannot be null"); + if (value == null) + throw new IllegalArgumentException("value cannot be null"); + + this.name = name.intern(); + this.binaryValue = value; + + this.isBinary = true; + this.isStored = true; + + this.isIndexed = false; + this.isTokenized = false; + this.storeTermVector = false; + } + /** * * @param name The name of the field @@ -402,6 +427,9 @@ public final class Field implements java.io.Serializable { */ public final boolean isTermVectorStored() { return storeTermVector; } + /** True iff the value of the filed is stored as binary */ + public final boolean isBinary() { return isBinary; } + /** Prints a Field for human consumption. */ public final String toString() { StringBuffer result = new StringBuffer(); @@ -422,6 +450,12 @@ public final class Field implements java.io.Serializable { result.append(","); result.append("termVector"); } + if (isBinary) { + if (result.length() > 0) + result.append(","); + result.append("binary"); + } + result.append('<'); result.append(name); result.append(':'); diff --git a/src/java/org/apache/lucene/index/FieldsReader.java b/src/java/org/apache/lucene/index/FieldsReader.java index 145c4c96b0e..08cfa712d84 100644 --- a/src/java/org/apache/lucene/index/FieldsReader.java +++ b/src/java/org/apache/lucene/index/FieldsReader.java @@ -18,10 +18,10 @@ package org.apache.lucene.index; import java.io.IOException; -import org.apache.lucene.store.Directory; -import org.apache.lucene.store.InputStream; import org.apache.lucene.document.Document; import org.apache.lucene.document.Field; +import org.apache.lucene.store.Directory; +import org.apache.lucene.store.InputStream; /** * Class responsible for access to stored document fields. @@ -67,20 +67,28 @@ final class FieldsReader { byte bits = fieldsStream.readByte(); - Field.Index index; - boolean tokenize = (bits & 1) != 0; - if (fi.isIndexed && tokenize) - index = Field.Index.TOKENIZED; - else if (fi.isIndexed && !tokenize) - index = Field.Index.UN_TOKENIZED; - else - index = Field.Index.NO; - doc.add(new Field(fi.name, // name - fieldsStream.readString(), // read value - Field.Store.YES, index, - fi.storeTermVector ? Field.TermVector.YES : Field.TermVector.NO)); + if ((bits & 2) != 0) { + final byte[] b = new byte[fieldsStream.readVInt()]; + fieldsStream.readBytes(b, 0, b.length); + doc.add(new Field(fi.name, b)); + } + else { + Field.Index index; + boolean tokenize = (bits & 1) != 0; + if (fi.isIndexed && tokenize) + index = Field.Index.TOKENIZED; + else if (fi.isIndexed && !tokenize) + index = Field.Index.UN_TOKENIZED; + else + index = Field.Index.NO; + doc.add(new Field(fi.name, // name + fieldsStream.readString(), // read value + Field.Store.YES, index, + fi.storeTermVector ? Field.TermVector.YES : Field.TermVector.NO)); + } } return doc; } + } diff --git a/src/java/org/apache/lucene/index/FieldsWriter.java b/src/java/org/apache/lucene/index/FieldsWriter.java index 9cef98eba02..dfb17827db6 100644 --- a/src/java/org/apache/lucene/index/FieldsWriter.java +++ b/src/java/org/apache/lucene/index/FieldsWriter.java @@ -2,70 +2,82 @@ package org.apache.lucene.index; /** * Copyright 2004 The Apache Software Foundation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * + * + * Licensed under the Apache License, Version 2.0 (the "License"); you may not + * use this file except in compliance with the License. You may obtain a copy of + * the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. + * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the + * License for the specific language governing permissions and limitations under + * the License. */ -import java.util.Enumeration; import java.io.IOException; +import java.util.Enumeration; -import org.apache.lucene.store.Directory; -import org.apache.lucene.store.OutputStream; import org.apache.lucene.document.Document; import org.apache.lucene.document.Field; +import org.apache.lucene.store.Directory; +import org.apache.lucene.store.OutputStream; -final class FieldsWriter { - private FieldInfos fieldInfos; - private OutputStream fieldsStream; - private OutputStream indexStream; - - FieldsWriter(Directory d, String segment, FieldInfos fn) - throws IOException { - fieldInfos = fn; - fieldsStream = d.createFile(segment + ".fdt"); - indexStream = d.createFile(segment + ".fdx"); - } +final class FieldsWriter +{ + private FieldInfos fieldInfos; - final void close() throws IOException { - fieldsStream.close(); - indexStream.close(); - } + private OutputStream fieldsStream; - final void addDocument(Document doc) throws IOException { - indexStream.writeLong(fieldsStream.getFilePointer()); - - int storedCount = 0; - Enumeration fields = doc.fields(); - while (fields.hasMoreElements()) { - Field field = (Field)fields.nextElement(); - if (field.isStored()) - storedCount++; + private OutputStream indexStream; + + FieldsWriter(Directory d, String segment, FieldInfos fn) throws IOException { + fieldInfos = fn; + fieldsStream = d.createFile(segment + ".fdt"); + indexStream = d.createFile(segment + ".fdx"); } - fieldsStream.writeVInt(storedCount); - - fields = doc.fields(); - while (fields.hasMoreElements()) { - Field field = (Field)fields.nextElement(); - if (field.isStored()) { - fieldsStream.writeVInt(fieldInfos.fieldNumber(field.name())); - byte bits = 0; - if (field.isTokenized()) - bits |= 1; - fieldsStream.writeByte(bits); - - fieldsStream.writeString(field.stringValue()); - } + final void close() throws IOException { + fieldsStream.close(); + indexStream.close(); } - } -} + + final void addDocument(Document doc) throws IOException { + indexStream.writeLong(fieldsStream.getFilePointer()); + + int storedCount = 0; + Enumeration fields = doc.fields(); + while (fields.hasMoreElements()) { + Field field = (Field) fields.nextElement(); + if (field.isStored()) + storedCount++; + } + fieldsStream.writeVInt(storedCount); + + fields = doc.fields(); + while (fields.hasMoreElements()) { + Field field = (Field) fields.nextElement(); + if (field.isStored()) { + fieldsStream.writeVInt(fieldInfos.fieldNumber(field.name())); + + byte bits = 0; + if (field.isTokenized()) + bits |= 1; + if (field.isBinary()) + bits |= 2; + fieldsStream.writeByte(bits); + + if (field.isBinary()) { + byte[] data = field.binaryValue(); + final int len = data.length; + fieldsStream.writeVInt(len); + fieldsStream.writeBytes(data, len); + } else { + fieldsStream.writeString(field.stringValue()); + } + } + } + } + +} \ No newline at end of file diff --git a/src/test/org/apache/lucene/document/TestDocument.java b/src/test/org/apache/lucene/document/TestDocument.java index 0f1c96e663a..ff0c1aaa8a5 100644 --- a/src/test/org/apache/lucene/document/TestDocument.java +++ b/src/test/org/apache/lucene/document/TestDocument.java @@ -39,6 +39,56 @@ import org.apache.lucene.search.Hits; public class TestDocument extends TestCase { + String binaryVal = "this text will be stored as a byte array in the index"; + String binaryVal2 = "this text will be also stored as a byte array in the index"; + + public void testBinaryField() + throws Exception + { + Document doc = new Document(); + Field stringFld = new Field("string", binaryVal, Field.Store.YES, Field.Index.NO); + Field binaryFld = new Field("binary", binaryVal.getBytes()); + Field binaryFld2 = new Field("binary", binaryVal2.getBytes()); + + doc.add(stringFld); + doc.add(binaryFld); + + assertEquals(2, doc.fields.size()); + + assertTrue(binaryFld.isBinary()); + assertTrue(binaryFld.isStored()); + assertFalse(binaryFld.isIndexed()); + assertFalse(binaryFld.isTokenized()); + + String binaryTest = new String(doc.getBinaryValue("binary")); + assertTrue(binaryTest.equals(binaryVal)); + + String stringTest = doc.get("string"); + assertTrue(binaryTest.equals(stringTest)); + + doc.add(binaryFld2); + + assertEquals(3, doc.fields.size()); + + byte[][] binaryTests = doc.getBinaryValues("binary"); + + assertEquals(2, binaryTests.length); + + binaryTest = new String(binaryTests[0]); + String binaryTest2 = new String(binaryTests[1]); + + assertFalse(binaryTest.equals(binaryTest2)); + + assertTrue(binaryTest.equals(binaryVal)); + assertTrue(binaryTest2.equals(binaryVal2)); + + doc.removeField("string"); + assertEquals(2, doc.fields.size()); + + doc.removeFields("binary"); + assertEquals(0, doc.fields.size()); + } + /** * Tests {@link Document#removeField(String)} method for a brand new Document * that has not been indexed yet.