From 5a694ea26ff862ecc874ca798135073d300c2234 Mon Sep 17 00:00:00 2001 From: Namgyu Kim Date: Tue, 21 May 2019 22:18:14 +0900 Subject: [PATCH] LUCENE-8805: Parameter changes for stringField() in StoredFieldVisitor Signed-off-by: Namgyu Kim Signed-off-by: Adrien Grand --- lucene/CHANGES.txt | 4 +++ .../quality/utils/DocNameExtractor.java | 7 +++-- .../SimpleTextStoredFieldsReader.java | 3 ++- .../lucene/codecs/StoredFieldsWriter.java | 7 +++-- .../CompressingStoredFieldsReader.java | 5 +--- .../document/DocumentStoredFieldVisitor.java | 6 ++--- .../index/SortingStoredFieldsConsumer.java | 7 +++-- .../lucene/index/StoredFieldVisitor.java | 4 +-- .../search/uhighlight/UnifiedHighlighter.java | 5 ++-- .../vectorhighlight/BaseFragmentsBuilder.java | 6 ++--- .../lucene/document/TestLazyDocument.java | 6 ++--- .../lucene/index/FieldFilterLeafReader.java | 5 ++-- .../lucene/index/MismatchedLeafReader.java | 5 ++-- .../solr/search/SolrDocumentFetcher.java | 26 +++++++++++++------ 14 files changed, 53 insertions(+), 43 deletions(-) diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt index a33eb0c0a62..5407e6ba6ec 100644 --- a/lucene/CHANGES.txt +++ b/lucene/CHANGES.txt @@ -18,6 +18,10 @@ API Changes * LUCENE-3041: The deprecated Weight#extractTerms() method has been removed (Alan Woodward, Simon Willnauer, David Smiley, Luca Cavanna) +* LUCENE-8805: StoredFieldVisitor#stringField now takes a String rather than a + byte[] that stores the UTF-8 bytes of the stored string. + (Namgyu Kim via Adrien Grand) + Bug fixes * LUCENE-8663: NRTCachingDirectory.slowFileExists may open a file while diff --git a/lucene/benchmark/src/java/org/apache/lucene/benchmark/quality/utils/DocNameExtractor.java b/lucene/benchmark/src/java/org/apache/lucene/benchmark/quality/utils/DocNameExtractor.java index f489c0bc04e..fd48f1f30a8 100644 --- a/lucene/benchmark/src/java/org/apache/lucene/benchmark/quality/utils/DocNameExtractor.java +++ b/lucene/benchmark/src/java/org/apache/lucene/benchmark/quality/utils/DocNameExtractor.java @@ -17,9 +17,9 @@ package org.apache.lucene.benchmark.quality.utils; import java.io.IOException; -import java.nio.charset.StandardCharsets; import java.util.ArrayList; import java.util.List; +import java.util.Objects; import org.apache.lucene.index.FieldInfo; import org.apache.lucene.index.StoredFieldVisitor; @@ -51,9 +51,8 @@ public class DocNameExtractor { final List name = new ArrayList<>(); searcher.getIndexReader().document(docid, new StoredFieldVisitor() { @Override - public void stringField(FieldInfo fieldInfo, byte[] bytes) { - String value = new String(bytes, StandardCharsets.UTF_8); - name.add(value); + public void stringField(FieldInfo fieldInfo, String value) { + name.add(Objects.requireNonNull(value, "String value should not be null")); } @Override diff --git a/lucene/codecs/src/java/org/apache/lucene/codecs/simpletext/SimpleTextStoredFieldsReader.java b/lucene/codecs/src/java/org/apache/lucene/codecs/simpletext/SimpleTextStoredFieldsReader.java index 2078378331d..79f8784c168 100644 --- a/lucene/codecs/src/java/org/apache/lucene/codecs/simpletext/SimpleTextStoredFieldsReader.java +++ b/lucene/codecs/src/java/org/apache/lucene/codecs/simpletext/SimpleTextStoredFieldsReader.java @@ -18,6 +18,7 @@ package org.apache.lucene.codecs.simpletext; import java.io.IOException; +import java.nio.charset.StandardCharsets; import java.util.Arrays; import org.apache.lucene.codecs.StoredFieldsReader; @@ -155,7 +156,7 @@ public class SimpleTextStoredFieldsReader extends StoredFieldsReader { if (type == TYPE_STRING) { byte[] bytes = new byte[scratch.length() - VALUE.length]; System.arraycopy(scratch.bytes(), VALUE.length, bytes, 0, bytes.length); - visitor.stringField(fieldInfo, bytes); + visitor.stringField(fieldInfo, new String(bytes, 0, bytes.length, StandardCharsets.UTF_8)); } else if (type == TYPE_BINARY) { byte[] copy = new byte[scratch.length()-VALUE.length]; System.arraycopy(scratch.bytes(), VALUE.length, copy, 0, copy.length); diff --git a/lucene/core/src/java/org/apache/lucene/codecs/StoredFieldsWriter.java b/lucene/core/src/java/org/apache/lucene/codecs/StoredFieldsWriter.java index 39ade4242b6..6494a55a9e5 100644 --- a/lucene/core/src/java/org/apache/lucene/codecs/StoredFieldsWriter.java +++ b/lucene/core/src/java/org/apache/lucene/codecs/StoredFieldsWriter.java @@ -19,9 +19,9 @@ package org.apache.lucene.codecs; import java.io.Closeable; import java.io.IOException; import java.io.Reader; -import java.nio.charset.StandardCharsets; import java.util.ArrayList; import java.util.List; +import java.util.Objects; import org.apache.lucene.analysis.Analyzer; import org.apache.lucene.analysis.TokenStream; @@ -180,10 +180,9 @@ public abstract class StoredFieldsWriter implements Closeable { } @Override - public void stringField(FieldInfo fieldInfo, byte[] value) throws IOException { + public void stringField(FieldInfo fieldInfo, String value) throws IOException { reset(fieldInfo); - // TODO: can we avoid new String here? - stringValue = new String(value, StandardCharsets.UTF_8); + stringValue = Objects.requireNonNull(value, "String value should not be null"); write(); } diff --git a/lucene/core/src/java/org/apache/lucene/codecs/compressing/CompressingStoredFieldsReader.java b/lucene/core/src/java/org/apache/lucene/codecs/compressing/CompressingStoredFieldsReader.java index 62508f88d4d..b4b09106687 100644 --- a/lucene/core/src/java/org/apache/lucene/codecs/compressing/CompressingStoredFieldsReader.java +++ b/lucene/core/src/java/org/apache/lucene/codecs/compressing/CompressingStoredFieldsReader.java @@ -211,10 +211,7 @@ public final class CompressingStoredFieldsReader extends StoredFieldsReader { visitor.binaryField(info, data); break; case STRING: - length = in.readVInt(); - data = new byte[length]; - in.readBytes(data, 0, length); - visitor.stringField(info, data); + visitor.stringField(info, in.readString()); break; case NUMERIC_INT: visitor.intField(info, in.readZInt()); diff --git a/lucene/core/src/java/org/apache/lucene/document/DocumentStoredFieldVisitor.java b/lucene/core/src/java/org/apache/lucene/document/DocumentStoredFieldVisitor.java index ed4d0aada56..1a9b9e734aa 100644 --- a/lucene/core/src/java/org/apache/lucene/document/DocumentStoredFieldVisitor.java +++ b/lucene/core/src/java/org/apache/lucene/document/DocumentStoredFieldVisitor.java @@ -17,8 +17,8 @@ package org.apache.lucene.document; import java.io.IOException; -import java.nio.charset.StandardCharsets; import java.util.HashSet; +import java.util.Objects; import java.util.Set; import org.apache.lucene.index.FieldInfo; @@ -67,12 +67,12 @@ public class DocumentStoredFieldVisitor extends StoredFieldVisitor { } @Override - public void stringField(FieldInfo fieldInfo, byte[] value) throws IOException { + public void stringField(FieldInfo fieldInfo, String value) throws IOException { final FieldType ft = new FieldType(TextField.TYPE_STORED); ft.setStoreTermVectors(fieldInfo.hasVectors()); ft.setOmitNorms(fieldInfo.omitsNorms()); ft.setIndexOptions(fieldInfo.getIndexOptions()); - doc.add(new StoredField(fieldInfo.name, new String(value, StandardCharsets.UTF_8), ft)); + doc.add(new StoredField(fieldInfo.name, Objects.requireNonNull(value, "String value should not be null"), ft)); } @Override diff --git a/lucene/core/src/java/org/apache/lucene/index/SortingStoredFieldsConsumer.java b/lucene/core/src/java/org/apache/lucene/index/SortingStoredFieldsConsumer.java index 97253a5ee35..ba3351425ee 100644 --- a/lucene/core/src/java/org/apache/lucene/index/SortingStoredFieldsConsumer.java +++ b/lucene/core/src/java/org/apache/lucene/index/SortingStoredFieldsConsumer.java @@ -19,8 +19,8 @@ package org.apache.lucene.index; import java.io.IOException; import java.io.Reader; -import java.nio.charset.StandardCharsets; import java.util.Map; +import java.util.Objects; import org.apache.lucene.analysis.Analyzer; import org.apache.lucene.analysis.TokenStream; @@ -114,10 +114,9 @@ final class SortingStoredFieldsConsumer extends StoredFieldsConsumer { } @Override - public void stringField(FieldInfo fieldInfo, byte[] value) throws IOException { + public void stringField(FieldInfo fieldInfo, String value) throws IOException { reset(fieldInfo); - // TODO: can we avoid new String here? - stringValue = new String(value, StandardCharsets.UTF_8); + stringValue = Objects.requireNonNull(value, "String value should not be null"); write(); } diff --git a/lucene/core/src/java/org/apache/lucene/index/StoredFieldVisitor.java b/lucene/core/src/java/org/apache/lucene/index/StoredFieldVisitor.java index 911a1575b71..38c5d6d94e8 100644 --- a/lucene/core/src/java/org/apache/lucene/index/StoredFieldVisitor.java +++ b/lucene/core/src/java/org/apache/lucene/index/StoredFieldVisitor.java @@ -53,8 +53,8 @@ public abstract class StoredFieldVisitor { public void binaryField(FieldInfo fieldInfo, byte[] value) throws IOException { } - /** Process a string field; the provided byte[] value is a UTF-8 encoded string value. */ - public void stringField(FieldInfo fieldInfo, byte[] value) throws IOException { + /** Process a string field. */ + public void stringField(FieldInfo fieldInfo, String value) throws IOException { } /** Process a int numeric field. */ diff --git a/lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/UnifiedHighlighter.java b/lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/UnifiedHighlighter.java index f4b1ab1b7b3..a29176a7cf5 100644 --- a/lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/UnifiedHighlighter.java +++ b/lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/UnifiedHighlighter.java @@ -17,7 +17,6 @@ package org.apache.lucene.search.uhighlight; import java.io.IOException; -import java.nio.charset.StandardCharsets; import java.text.BreakIterator; import java.util.ArrayList; import java.util.Arrays; @@ -995,9 +994,9 @@ public class UnifiedHighlighter { } @Override - public void stringField(FieldInfo fieldInfo, byte[] byteValue) throws IOException { - String value = new String(byteValue, StandardCharsets.UTF_8); + public void stringField(FieldInfo fieldInfo, String value) throws IOException { assert currentField >= 0; + Objects.requireNonNull(value, "String value should not be null"); CharSequence curValue = values[currentField]; if (curValue == null) { //question: if truncate due to maxLength, should we try and avoid keeping the other chars in-memory on diff --git a/lucene/highlighter/src/java/org/apache/lucene/search/vectorhighlight/BaseFragmentsBuilder.java b/lucene/highlighter/src/java/org/apache/lucene/search/vectorhighlight/BaseFragmentsBuilder.java index 15bcfcd63b8..f81d718fe95 100644 --- a/lucene/highlighter/src/java/org/apache/lucene/search/vectorhighlight/BaseFragmentsBuilder.java +++ b/lucene/highlighter/src/java/org/apache/lucene/search/vectorhighlight/BaseFragmentsBuilder.java @@ -17,7 +17,6 @@ package org.apache.lucene.search.vectorhighlight; import java.io.IOException; -import java.nio.charset.StandardCharsets; import java.util.ArrayList; import java.util.Collections; import java.util.Comparator; @@ -25,6 +24,7 @@ import java.util.HashMap; import java.util.Iterator; import java.util.List; import java.util.Map; +import java.util.Objects; import org.apache.lucene.document.Field; import org.apache.lucene.document.FieldType; @@ -152,8 +152,8 @@ public abstract class BaseFragmentsBuilder implements FragmentsBuilder { reader.document(docId, new StoredFieldVisitor() { @Override - public void stringField(FieldInfo fieldInfo, byte[] bytes) { - String value = new String(bytes, StandardCharsets.UTF_8); + public void stringField(FieldInfo fieldInfo, String value) { + Objects.requireNonNull(value, "String value should not be null"); FieldType ft = new FieldType(TextField.TYPE_STORED); ft.setStoreTermVectors(fieldInfo.hasVectors()); fields.add(new Field(fieldInfo.name, value, ft)); diff --git a/lucene/misc/src/test/org/apache/lucene/document/TestLazyDocument.java b/lucene/misc/src/test/org/apache/lucene/document/TestLazyDocument.java index ba7a48372d5..5a2f047eca3 100644 --- a/lucene/misc/src/test/org/apache/lucene/document/TestLazyDocument.java +++ b/lucene/misc/src/test/org/apache/lucene/document/TestLazyDocument.java @@ -17,11 +17,11 @@ package org.apache.lucene.document; import java.io.IOException; -import java.nio.charset.StandardCharsets; import java.util.Arrays; import java.util.HashMap; import java.util.HashSet; import java.util.Map; +import java.util.Objects; import java.util.Set; import org.apache.lucene.analysis.*; @@ -209,12 +209,12 @@ public class TestLazyDocument extends LuceneTestCase { } @Override - public void stringField(FieldInfo fieldInfo, byte[] bytes) throws IOException { - String value = new String(bytes, StandardCharsets.UTF_8); + public void stringField(FieldInfo fieldInfo, String value) throws IOException { final FieldType ft = new FieldType(TextField.TYPE_STORED); ft.setStoreTermVectors(fieldInfo.hasVectors()); ft.setOmitNorms(fieldInfo.omitsNorms()); ft.setIndexOptions(fieldInfo.getIndexOptions()); + Objects.requireNonNull(value, "String value should not be null"); doc.add(new Field(fieldInfo.name, value, ft)); } diff --git a/lucene/test-framework/src/java/org/apache/lucene/index/FieldFilterLeafReader.java b/lucene/test-framework/src/java/org/apache/lucene/index/FieldFilterLeafReader.java index 38a3b7a6749..84d06e0e657 100644 --- a/lucene/test-framework/src/java/org/apache/lucene/index/FieldFilterLeafReader.java +++ b/lucene/test-framework/src/java/org/apache/lucene/index/FieldFilterLeafReader.java @@ -19,6 +19,7 @@ package org.apache.lucene.index; import java.io.IOException; import java.util.ArrayList; import java.util.Iterator; +import java.util.Objects; import java.util.Set; import org.apache.lucene.util.FilterIterator; @@ -76,8 +77,8 @@ public final class FieldFilterLeafReader extends FilterLeafReader { } @Override - public void stringField(FieldInfo fieldInfo, byte[] value) throws IOException { - visitor.stringField(fieldInfo, value); + public void stringField(FieldInfo fieldInfo, String value) throws IOException { + visitor.stringField(fieldInfo, Objects.requireNonNull(value, "String value should not be null")); } @Override diff --git a/lucene/test-framework/src/java/org/apache/lucene/index/MismatchedLeafReader.java b/lucene/test-framework/src/java/org/apache/lucene/index/MismatchedLeafReader.java index 691f19ee7f3..e16ce4b1dbf 100644 --- a/lucene/test-framework/src/java/org/apache/lucene/index/MismatchedLeafReader.java +++ b/lucene/test-framework/src/java/org/apache/lucene/index/MismatchedLeafReader.java @@ -20,6 +20,7 @@ import java.io.IOException; import java.util.ArrayList; import java.util.Collections; import java.util.List; +import java.util.Objects; import java.util.Random; /** @@ -105,8 +106,8 @@ public class MismatchedLeafReader extends FilterLeafReader { } @Override - public void stringField(FieldInfo fieldInfo, byte[] value) throws IOException { - in.stringField(renumber(fieldInfo), value); + public void stringField(FieldInfo fieldInfo, String value) throws IOException { + in.stringField(renumber(fieldInfo), Objects.requireNonNull(value, "String value should not be null")); } @Override diff --git a/solr/core/src/java/org/apache/solr/search/SolrDocumentFetcher.java b/solr/core/src/java/org/apache/solr/search/SolrDocumentFetcher.java index 4d5018d5732..f6b4760b19d 100644 --- a/solr/core/src/java/org/apache/solr/search/SolrDocumentFetcher.java +++ b/solr/core/src/java/org/apache/solr/search/SolrDocumentFetcher.java @@ -29,6 +29,7 @@ import java.util.Date; import java.util.HashSet; import java.util.LinkedList; import java.util.List; +import java.util.Objects; import java.util.Set; import java.util.function.Predicate; import java.util.function.Supplier; @@ -62,7 +63,6 @@ import org.apache.lucene.util.NumericUtils; import org.apache.solr.common.SolrDocument; import org.apache.solr.common.SolrDocumentBase; import org.apache.solr.common.SolrException; -import org.apache.solr.common.util.ByteArrayUtf8CharSequence; import org.apache.solr.core.SolrConfig; import org.apache.solr.response.DocsStreamer; import org.apache.solr.response.ResultContext; @@ -299,14 +299,15 @@ public class SolrDocumentFetcher { @Override - public void stringField(FieldInfo fieldInfo, byte[] value) throws IOException { + public void stringField(FieldInfo fieldInfo, String value) throws IOException { Predicate readAsBytes = ResultContext.READASBYTES.get(); if (readAsBytes != null && readAsBytes.test(fieldInfo.name)) { final FieldType ft = new FieldType(TextField.TYPE_STORED); ft.setStoreTermVectors(fieldInfo.hasVectors()); ft.setOmitNorms(fieldInfo.omitsNorms()); ft.setIndexOptions(fieldInfo.getIndexOptions()); - doc.add(new StoredField(fieldInfo.name, new ByteArrayUtf8CharSequence(value, 0, value.length), ft)); + Objects.requireNonNull(value, "String value should not be null"); + doc.add(new StoredField(fieldInfo.name, value, ft)); } else { super.stringField(fieldInfo, value); } @@ -371,9 +372,9 @@ public class SolrDocumentFetcher { } // must be String if (f instanceof LargeLazyField) { // optimization to avoid premature string conversion - visitor.stringField(info, toByteArrayUnwrapIfPossible(((LargeLazyField) f).readBytes())); + visitor.stringField(info, toStringUnwrapIfPossible(((LargeLazyField) f).readBytes())); } else { - visitor.stringField(info, f.stringValue().getBytes(StandardCharsets.UTF_8)); + visitor.stringField(info, f.stringValue()); } } } @@ -386,6 +387,14 @@ public class SolrDocumentFetcher { } } + private String toStringUnwrapIfPossible(BytesRef bytesRef) { + if (bytesRef.offset == 0 && bytesRef.bytes.length == bytesRef.length) { + return new String(bytesRef.bytes, StandardCharsets.UTF_8); + } else { + return new String(bytesRef.bytes, bytesRef.offset, bytesRef.offset + bytesRef.length, StandardCharsets.UTF_8); + } + } + /** Unlike LazyDocument.LazyField, we (a) don't cache large values, and (b) provide access to the byte[]. */ class LargeLazyField implements IndexableField { @@ -450,9 +459,10 @@ public class SolrDocumentFetcher { } @Override - public void stringField(FieldInfo fieldInfo, byte[] value) throws IOException { - bytesRef.bytes = value; - bytesRef.length = value.length; + public void stringField(FieldInfo fieldInfo, String value) throws IOException { + Objects.requireNonNull(value, "String value should not be null"); + bytesRef.bytes = value.getBytes(StandardCharsets.UTF_8); + bytesRef.length = value.length(); done = true; }