diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt
index 1dc2c8f0b0e..7de2cde2ceb 100644
--- a/lucene/CHANGES.txt
+++ b/lucene/CHANGES.txt
@@ -156,6 +156,9 @@ Improvements
 * LUCENE-9531: Consolidated CharStream and FastCharStream classes: these have been
   moved from each query parser package to org.apache.lucene.queryparser.charstream (Dawid Weiss).
 
+* LUCENE-9450: Use BinaryDocValues for the taxonomy index instead of StoredFields.
+  Add backwards compatibility tests for the taxonomy index. (Gautam Worah, Michael McCandless)
+
 Bug fixes
 
 * LUCENE-8663: NRTCachingDirectory.slowFileExists may open a file while
diff --git a/lucene/facet/build.gradle b/lucene/facet/build.gradle
index 6b6a6ef137e..e94d8b0815a 100644
--- a/lucene/facet/build.gradle
+++ b/lucene/facet/build.gradle
@@ -27,4 +27,6 @@ dependencies {
   testImplementation project(':lucene:test-framework')
   testImplementation project(':lucene:queries')
 
+  // Required for opening older indexes in the backward-compatibility tests
+  testImplementation group: 'org.apache.lucene', name: 'lucene-codecs', version: '8.6.3'
 }
diff --git a/lucene/facet/src/java/org/apache/lucene/facet/taxonomy/directory/DirectoryTaxonomyReader.java b/lucene/facet/src/java/org/apache/lucene/facet/taxonomy/directory/DirectoryTaxonomyReader.java
index bcc4c6d5189..e0632304b96 100644
--- a/lucene/facet/src/java/org/apache/lucene/facet/taxonomy/directory/DirectoryTaxonomyReader.java
+++ b/lucene/facet/src/java/org/apache/lucene/facet/taxonomy/directory/DirectoryTaxonomyReader.java
@@ -31,12 +31,15 @@ import org.apache.lucene.facet.taxonomy.FacetLabel;
 import org.apache.lucene.facet.taxonomy.LRUHashMap;
 import org.apache.lucene.facet.taxonomy.ParallelTaxonomyArrays;
 import org.apache.lucene.facet.taxonomy.TaxonomyReader;
+import org.apache.lucene.index.BinaryDocValues;
 import org.apache.lucene.index.CorruptIndexException; // javadocs
 import org.apache.lucene.index.DirectoryReader;
 import org.apache.lucene.index.IndexWriter;
+import org.apache.lucene.index.LeafReader;
 import org.apache.lucene.index.LeafReaderContext;
 import org.apache.lucene.index.MultiTerms;
 import org.apache.lucene.index.PostingsEnum;
+import org.apache.lucene.index.ReaderUtil;
 import org.apache.lucene.index.SegmentReader;
 import org.apache.lucene.search.DocIdSetIterator;
 import org.apache.lucene.store.Directory;
@@ -322,9 +325,24 @@ public class DirectoryTaxonomyReader extends TaxonomyReader implements Accountab
         return res;
       }
     }
-
-    Document doc = indexReader.document(ordinal);
-    FacetLabel ret = new FacetLabel(FacetsConfig.stringToPath(doc.get(Consts.FULL)));
+
+    int readerIndex = ReaderUtil.subIndex(ordinal, indexReader.leaves());
+    LeafReader leafReader = indexReader.leaves().get(readerIndex).reader();
+    // TODO: Use LUCENE-9476 to get the bulk lookup API for extracting BinaryDocValues
+    BinaryDocValues values = leafReader.getBinaryDocValues(Consts.FULL);
+
+    FacetLabel ret;
+
+    if (values == null || values.advanceExact(ordinal - indexReader.leaves().get(readerIndex).docBase) == false) {
+      // The index uses the older StoredField format to store the mapping.
+      // On recreating the index, the values will be stored using the BinaryDocValuesField format.
+      Document doc = indexReader.document(ordinal);
+      ret = new FacetLabel(FacetsConfig.stringToPath(doc.get(Consts.FULL)));
+    } else {
+      // The index uses the BinaryDocValuesField format to store the mapping.
+      ret = new FacetLabel(FacetsConfig.stringToPath(values.binaryValue().utf8ToString()));
+    }
+
     synchronized (categoryCache) {
       categoryCache.put(catIDInteger, ret);
     }
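The getPath() change above uses the standard per-segment doc values access pattern: map the global ordinal (a docID in the taxonomy index) to a segment with ReaderUtil.subIndex, then subtract that segment's docBase before calling advanceExact. A minimal standalone sketch of that pattern follows; it is not part of the patch, and the class and method names are invented for illustration:

import java.io.IOException;
import java.util.List;

import org.apache.lucene.index.BinaryDocValues;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.LeafReaderContext;
import org.apache.lucene.index.ReaderUtil;

// Hypothetical helper, for illustration only.
class PerLeafBinaryLookup {
  /**
   * Returns the binary doc value of {@code field} for a composite-reader docID, or null if the
   * segment holding the doc has no such doc values (e.g. a segment written before this change)
   * or the doc has no value for the field.
   */
  static String lookup(IndexReader reader, String field, int docId) throws IOException {
    List<LeafReaderContext> leaves = reader.leaves();
    int i = ReaderUtil.subIndex(docId, leaves);       // which segment contains docId
    LeafReaderContext leaf = leaves.get(i);
    BinaryDocValues dv = leaf.reader().getBinaryDocValues(field);
    if (dv == null || dv.advanceExact(docId - leaf.docBase) == false) {
      return null;                                    // caller falls back to stored fields
    }
    return dv.binaryValue().utf8ToString();
  }
}

Because doc values are per-segment, the fallback to indexReader.document(ordinal) in the patch keeps older taxonomy segments (written with a stored Consts.FULL field) readable until the index is recreated.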
diff --git a/lucene/facet/src/java/org/apache/lucene/facet/taxonomy/directory/DirectoryTaxonomyWriter.java b/lucene/facet/src/java/org/apache/lucene/facet/taxonomy/directory/DirectoryTaxonomyWriter.java
index f8374a38e16..d03d32c3a56 100644
--- a/lucene/facet/src/java/org/apache/lucene/facet/taxonomy/directory/DirectoryTaxonomyWriter.java
+++ b/lucene/facet/src/java/org/apache/lucene/facet/taxonomy/directory/DirectoryTaxonomyWriter.java
@@ -31,6 +31,7 @@ import java.util.concurrent.atomic.AtomicInteger;
 import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
 import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
+import org.apache.lucene.document.BinaryDocValuesField;
 import org.apache.lucene.document.Document;
 import org.apache.lucene.document.Field;
 import org.apache.lucene.document.FieldType;
@@ -193,7 +194,7 @@ public class DirectoryTaxonomyWriter implements TaxonomyWriter {
     FieldType ft = new FieldType(TextField.TYPE_NOT_STORED);
     ft.setOmitNorms(true);
     parentStreamField = new Field(Consts.FIELD_PAYLOADS, parentStream, ft);
-    fullPathField = new StringField(Consts.FULL, "", Field.Store.YES);
+    fullPathField = new StringField(Consts.FULL, "", Field.Store.NO);
 
     nextID = indexWriter.getDocStats().maxDoc;
 
@@ -492,8 +493,10 @@
     Document d = new Document();
     d.add(parentStreamField);
 
-    fullPathField.setStringValue(FacetsConfig.pathToString(categoryPath.components, categoryPath.length));
+    String fieldPath = FacetsConfig.pathToString(categoryPath.components, categoryPath.length);
+    fullPathField.setStringValue(fieldPath);
     d.add(fullPathField);
+    d.add(new BinaryDocValuesField(Consts.FULL, new BytesRef(fieldPath)));
 
     // Note that we do no pass an Analyzer here because the fields that are
     // added to the Document are untokenized or contains their own TokenStream.
diff --git a/lucene/facet/src/test/org/apache/lucene/facet/taxonomy/directory/TestBackwardsCompatibility.java b/lucene/facet/src/test/org/apache/lucene/facet/taxonomy/directory/TestBackwardsCompatibility.java
new file mode 100644
index 00000000000..69d975d3a70
--- /dev/null
+++ b/lucene/facet/src/test/org/apache/lucene/facet/taxonomy/directory/TestBackwardsCompatibility.java
@@ -0,0 +1,110 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.lucene.facet.taxonomy.directory;
+
+import java.io.IOException;
+import java.nio.file.Files;
+import java.nio.file.Path;
+import java.nio.file.Paths;
+
+import org.apache.lucene.facet.taxonomy.FacetLabel;
+import org.apache.lucene.facet.taxonomy.TaxonomyReader;
+import org.apache.lucene.facet.taxonomy.TaxonomyWriter;
+import org.apache.lucene.store.Directory;
+import org.apache.lucene.util.LuceneTestCase;
+import org.apache.lucene.util.TestUtil;
+import org.junit.Ignore;
+
+/*
+  Verify we can read previous versions' taxonomy indexes, do searches
+  against them, and add documents to them.
+*/
+// See: https://issues.apache.org/jira/browse/SOLR-12028 -- tests cannot remove files on Windows machines occasionally
+public class TestBackwardsCompatibility extends LuceneTestCase {
+
+  // To generate backcompat indexes with the current default codec, run the following gradle command:
+  //   gradlew test -Dtestcase=TestBackwardsCompatibility -Dtests.bwcdir=/path/to/store/indexes
+  //                -Dtests.codec=default -Dtests.useSecurityManager=false
+  // Also add a testmethod with one of the index creation methods below, for example:
+  //   -Dtestmethod=testCreateOldTaxonomy
+  //
+  // Zip up the generated indexes:
+  //
+  //   cd /path/to/store/indexes/index.cfs ; zip index.-cfs.zip *
+  //
+  // Then move the zip file to your trunk checkout and use it in your test cases
+
+  public static final String oldTaxonomyIndexName = "taxonomy.8.6.3-cfs";
+
+  public void testCreateNewTaxonomy() throws IOException {
+    createNewTaxonomyIndex(oldTaxonomyIndexName);
+  }
+
+  // Opens a pre-existing old taxonomy index and adds new BinaryDocValues-based fields
+  private void createNewTaxonomyIndex(String dirName) throws IOException {
+    Path indexDir = createTempDir(oldTaxonomyIndexName);
+    TestUtil.unzip(getDataInputStream(dirName + ".zip"), indexDir);
+    Directory dir = newFSDirectory(indexDir);
+
+    DirectoryTaxonomyWriter writer = new DirectoryTaxonomyWriter(dir);
+
+    FacetLabel cp_b = new FacetLabel("b");
+    writer.addCategory(cp_b);
+    writer.getInternalIndexWriter().forceMerge(1);
+    writer.commit();
+
+    TaxonomyReader reader = new DirectoryTaxonomyReader(writer);
+
+    int ord1 = reader.getOrdinal(new FacetLabel("a"));
+    assertNotEquals(TaxonomyReader.INVALID_ORDINAL, ord1);
+    // Just checking ord1 != TaxonomyReader.INVALID_ORDINAL is not enough to verify compatibility
+    assertNotNull(reader.getPath(ord1));
+
+    int ord2 = reader.getOrdinal(cp_b);
+    assertNotEquals(TaxonomyReader.INVALID_ORDINAL, ord2);
+    assertNotNull(reader.getPath(ord2));
+
+    reader.close();
+    writer.close();
+    dir.close();
+  }
+
+  // Used to create a fresh taxonomy index with StoredFields
+  @Ignore
+  public void testCreateOldTaxonomy() throws IOException {
+    createOldTaxonomyIndex(oldTaxonomyIndexName);
+  }
+
+  private void createOldTaxonomyIndex(String dirName) throws IOException {
+    Path indexDir = getIndexDir().resolve(dirName);
+    Files.deleteIfExists(indexDir);
+    Directory dir = newFSDirectory(indexDir);
+
+    TaxonomyWriter writer = new DirectoryTaxonomyWriter(dir);
+
+    writer.addCategory(new FacetLabel("a"));
+    writer.commit();
+    writer.close();
+    dir.close();
+  }
+
+  private Path getIndexDir() {
+    String path = System.getProperty("tests.bwcdir");
+    assumeTrue("backcompat creation tests must be run with -Dtests.bwcdir=/path/to/write/indexes", path != null);
+    return Paths.get(path);
+  }
+}
\ No newline at end of file
diff --git a/lucene/facet/src/test/org/apache/lucene/facet/taxonomy/directory/taxonomy.8.6.3-cfs.zip b/lucene/facet/src/test/org/apache/lucene/facet/taxonomy/directory/taxonomy.8.6.3-cfs.zip
new file mode 100644
index 00000000000..d04c7064802
Binary files /dev/null and b/lucene/facet/src/test/org/apache/lucene/facet/taxonomy/directory/taxonomy.8.6.3-cfs.zip differ
diff --git a/versions.lock b/versions.lock
index b738b791bba..572771857e8 100644
--- a/versions.lock
+++ b/versions.lock
@@ -204,6 +204,8 @@ org.apache.kerby:kerb-server:1.0.1 (1 constraints: 0405f135)
 org.apache.kerby:kerb-simplekdc:1.0.1 (1 constraints: 0405f135)
 org.apache.kerby:kerby-kdc:1.0.1 (1 constraints: 0405f135)
 org.apache.logging.log4j:log4j-1.2-api:2.13.2 (1 constraints: 3a053a3b)
+org.apache.lucene:lucene-codecs:8.6.3 (1 constraints: 13052836)
+org.apache.lucene:lucene-core:8.6.3 (1 constraints: 7f0d022f)
 org.asciidoctor:asciidoctorj:1.6.2 (1 constraints: 0b050436)
 org.asciidoctor:asciidoctorj-api:1.6.2 (1 constraints: e30cfb0d)
 org.hsqldb:hsqldb:2.4.0 (1 constraints: 08050136)
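On the write side, the DirectoryTaxonomyWriter change above indexes each category path twice under the same field name: as an indexed-but-no-longer-stored StringField (used for path-to-ordinal lookups through the terms dictionary) and as a BinaryDocValuesField (used by getPath for ordinal-to-path lookups). The following sketch only illustrates that document layout; the helper class and the FULL_FIELD constant are hypothetical stand-ins for the patch's Consts.FULL:

import org.apache.lucene.document.BinaryDocValuesField;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.StringField;
import org.apache.lucene.util.BytesRef;

// Hypothetical helper, for illustration only; FULL_FIELD stands in for Consts.FULL.
class CategoryDocSketch {
  static final String FULL_FIELD = "full_path_example";

  static Document newCategoryDoc(String fullPath) {
    Document d = new Document();
    // Indexed term: supports looking up a category's ordinal by its path.
    d.add(new StringField(FULL_FIELD, fullPath, Field.Store.NO));
    // Doc value: supports resolving an ordinal back to its path without stored fields.
    d.add(new BinaryDocValuesField(FULL_FIELD, new BytesRef(fullPath)));
    return d;
  }
}

Reusing one field name for both representations matches how DirectoryTaxonomyReader reads the value back under Consts.FULL; only the ordinal-to-path direction changes its storage, while the existing term-based getOrdinal lookup is untouched.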