From d8e0288109f05ffc828e7f41a0f00b2c6f707151 Mon Sep 17 00:00:00 2001
From: Shai Erera
Date: Wed, 16 May 2012 08:01:40 +0000
Subject: [PATCH] LUCENE-4060: port to trunk

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1339047 13f79535-47bb-0310-9956-ffa450edef68
---
 lucene/CHANGES.txt                             |   5 +
 .../example/merge/TaxonomyMergeUtils.java      |   2 +-
 .../facet/taxonomy/directory/Consts.java       |   3 +-
 .../directory/DirectoryTaxonomyWriter.java     | 208 ++++----
 .../taxonomy/directory/TestAddTaxonomies.java  | 254 ------
 .../taxonomy/directory/TestAddTaxonomy.java    | 228 ++++++++
 6 files changed, 280 insertions(+), 420 deletions(-)
 delete mode 100644 lucene/facet/src/test/org/apache/lucene/facet/taxonomy/directory/TestAddTaxonomies.java
 create mode 100644 lucene/facet/src/test/org/apache/lucene/facet/taxonomy/directory/TestAddTaxonomy.java

diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt
index 17ea33945f8..5d86c224963 100644
--- a/lucene/CHANGES.txt
+++ b/lucene/CHANGES.txt
@@ -953,6 +953,11 @@ Bug fixes
   offset calculation in PathHierarchyTokenizer. (Mike McCandless, Uwe
   Schindler, Robert Muir)
 
+* LUCENE-4060: Fix a synchronization bug in
+  DirectoryTaxonomyWriter.addTaxonomies(). Also, the method has been renamed to
+  addTaxonomy and now takes only one Directory and one OrdinalMap.
+  (Shai Erera, Gilad Barkai)
+
 Documentation
 
 * LUCENE-3958: Javadocs corrections for IndexWriter.
diff --git a/lucene/facet/src/examples/org/apache/lucene/facet/example/merge/TaxonomyMergeUtils.java b/lucene/facet/src/examples/org/apache/lucene/facet/example/merge/TaxonomyMergeUtils.java
index 40dfac53dfd..835c8e846f5 100644
--- a/lucene/facet/src/examples/org/apache/lucene/facet/example/merge/TaxonomyMergeUtils.java
+++ b/lucene/facet/src/examples/org/apache/lucene/facet/example/merge/TaxonomyMergeUtils.java
@@ -81,7 +81,7 @@ public class TaxonomyMergeUtils {
       OrdinalMap map, IndexWriter destIndexWriter,
       DirectoryTaxonomyWriter destTaxWriter) throws IOException {
     // merge the taxonomies
-    destTaxWriter.addTaxonomies(new Directory[] { srcTaxDir }, new OrdinalMap[] { map });
+    destTaxWriter.addTaxonomy(srcTaxDir, map);
     PayloadProcessorProvider payloadProcessor = new FacetsPayloadProcessorProvider(
         srcIndexDir, map.getMap(), new DefaultFacetIndexingParams());
diff --git a/lucene/facet/src/java/org/apache/lucene/facet/taxonomy/directory/Consts.java b/lucene/facet/src/java/org/apache/lucene/facet/taxonomy/directory/Consts.java
index ec6e842c067..2d7988ad152 100644
--- a/lucene/facet/src/java/org/apache/lucene/facet/taxonomy/directory/Consts.java
+++ b/lucene/facet/src/java/org/apache/lucene/facet/taxonomy/directory/Consts.java
@@ -4,8 +4,6 @@ import java.io.IOException;
 
 import org.apache.lucene.index.FieldInfo;
 import org.apache.lucene.index.StoredFieldVisitor;
-import org.apache.lucene.index.StoredFieldVisitor.Status;
-import org.apache.lucene.store.IndexInput;
 
 /**
  * Licensed to the Apache Software Foundation (ASF) under one or more
@@ -42,6 +40,7 @@ abstract class Consts {
   public static final class LoadFullPathOnly extends StoredFieldVisitor {
     private String fullPath;
+
     @Override
     public void stringField(FieldInfo fieldInfo, String value) throws IOException {
       fullPath = value;
     }
diff --git a/lucene/facet/src/java/org/apache/lucene/facet/taxonomy/directory/DirectoryTaxonomyWriter.java b/lucene/facet/src/java/org/apache/lucene/facet/taxonomy/directory/DirectoryTaxonomyWriter.java
index e1dfbea534d..b77e0f1bcfe 100644
--- a/lucene/facet/src/java/org/apache/lucene/facet/taxonomy/directory/DirectoryTaxonomyWriter.java
+++ b/lucene/facet/src/java/org/apache/lucene/facet/taxonomy/directory/DirectoryTaxonomyWriter.java
@@ -12,15 +12,21 @@ import java.io.IOException;
 import java.util.HashMap;
 import java.util.Map;
 
-import org.apache.lucene.analysis.core.KeywordAnalyzer;
 import org.apache.lucene.analysis.TokenStream;
-import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
+import org.apache.lucene.analysis.core.KeywordAnalyzer;
 import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
+import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
 import org.apache.lucene.document.Document;
 import org.apache.lucene.document.Field;
 import org.apache.lucene.document.FieldType;
 import org.apache.lucene.document.StringField;
 import org.apache.lucene.document.TextField;
+import org.apache.lucene.facet.taxonomy.CategoryPath;
+import org.apache.lucene.facet.taxonomy.TaxonomyReader;
+import org.apache.lucene.facet.taxonomy.TaxonomyWriter;
+import org.apache.lucene.facet.taxonomy.writercache.TaxonomyWriterCache;
+import org.apache.lucene.facet.taxonomy.writercache.cl2o.Cl2oTaxonomyWriterCache;
+import org.apache.lucene.facet.taxonomy.writercache.lru.LruTaxonomyWriterCache;
 import org.apache.lucene.index.CorruptIndexException;
 import org.apache.lucene.index.DirectoryReader;
 import org.apache.lucene.index.DocsEnum;
@@ -30,9 +36,9 @@ import org.apache.lucene.index.IndexWriterConfig;
 import org.apache.lucene.index.IndexWriterConfig.OpenMode;
 import org.apache.lucene.index.LogByteSizeMergePolicy;
 import org.apache.lucene.index.MultiFields;
+import org.apache.lucene.index.SegmentInfos;
 import org.apache.lucene.index.Terms;
 import org.apache.lucene.index.TermsEnum;
-import org.apache.lucene.index.SegmentInfos;
 import org.apache.lucene.index.TieredMergePolicy;
 import org.apache.lucene.search.DocIdSetIterator;
 import org.apache.lucene.store.AlreadyClosedException;
@@ -44,13 +50,6 @@ import org.apache.lucene.util.Bits;
 import org.apache.lucene.util.BytesRef;
 import org.apache.lucene.util.Version;
 
-import org.apache.lucene.facet.taxonomy.CategoryPath;
-import org.apache.lucene.facet.taxonomy.TaxonomyReader;
-import org.apache.lucene.facet.taxonomy.TaxonomyWriter;
-import org.apache.lucene.facet.taxonomy.writercache.TaxonomyWriterCache;
-import org.apache.lucene.facet.taxonomy.writercache.cl2o.Cl2oTaxonomyWriterCache;
-import org.apache.lucene.facet.taxonomy.writercache.lru.LruTaxonomyWriterCache;
-
 /**
  * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
@@ -812,6 +811,7 @@
     }
     return parentArray;
   }
+
   @Override
   public int getParent(int ordinal) throws IOException {
     ensureOpen();
@@ -823,158 +823,47 @@
     }
     return getParentArray().getArray()[ordinal];
   }
-  
+
   /**
-   * Take all the categories of one or more given taxonomies, and add them to
-   * the main taxonomy (this), if they are not already there.
-   * <P>
-   * Additionally, fill a mapping for each of the added taxonomies,
-   * mapping its ordinals to the ordinals in the enlarged main taxonomy.
-   * These mapping are saved into an array of OrdinalMap objects given by the
-   * user, one for each of the given taxonomies (not including "this", the main
-   * taxonomy). Often the first of these will be a MemoryOrdinalMap and the
-   * others will be a DiskOrdinalMap - see discussion in {OrdinalMap}.
-   * <P>
-   * Note that the taxonomies to be added are given as Directory objects,
-   * not opened TaxonomyReader/TaxonomyWriter objects, so if any of them are
-   * currently managed by an open TaxonomyWriter, make sure to commit() (or
-   * close()) it first. The main taxonomy (this) is an open TaxonomyWriter,
-   * and does not need to be commit()ed before this call.
+   * Takes the categories from the given taxonomy directory, and adds the
+   * missing ones to this taxonomy. Additionally, it fills the given
+   * {@link OrdinalMap} with a mapping from the original ordinal to the new
+   * ordinal.
    */
-  public void addTaxonomies(Directory[] taxonomies, OrdinalMap[] ordinalMaps) throws IOException {
+  public void addTaxonomy(Directory taxoDir, OrdinalMap map) throws IOException {
     ensureOpen();
-    // To prevent us stepping on the rest of this class's decisions on when
-    // to open a reader, and when not, we'll be opening a new reader instead
-    // of using the existing "reader" object:
-    IndexReader mainreader = openReader();
-    // TODO (Facet): can this then go segment-by-segment and avoid MultiDocsEnum etc?
-    Terms terms = MultiFields.getTerms(mainreader, Consts.FULL);
-    assert terms != null; // TODO (Facet): explicit check / throw exception?
-    TermsEnum mainte = terms.iterator(null);
-    DocsEnum mainde = null;
-
-    IndexReader[] otherreaders = new IndexReader[taxonomies.length];
-    TermsEnum[] othertes = new TermsEnum[taxonomies.length];
-    DocsEnum[] otherdocsEnum = new DocsEnum[taxonomies.length]; // just for reuse
-    for (int i=0; i<taxonomies.length; i++) {
-      otherreaders[i] = DirectoryReader.open(taxonomies[i]);
-      terms = MultiFields.getTerms(otherreaders[i], Consts.FULL);
-      assert terms != null; // TODO (Facet): explicit check / throw exception?
-      othertes[i] = terms.iterator(null);
-      // Also tell the ordinal maps their expected sizes:
-      ordinalMaps[i].setSize(otherreaders[i].numDocs());
-    }
-
-    CategoryPath cp = new CategoryPath();
-
-    // We keep a "current" cursor over the alphabetically-ordered list of
-    // categories in each taxonomy, starting at the first (alphabetically)
-    // category of each:
-    String currentMain = nextTE(mainte);
-    String[] currentOthers = new String[taxonomies.length];
-    int numOtherTaxonomies = taxonomies.length;
-    for (int i=0; i<taxonomies.length; i++) {
-      currentOthers[i] = nextTE(othertes[i]);
-      if (currentOthers[i]==null) {
-        numOtherTaxonomies--;
-      }
-    }
-
-    while (numOtherTaxonomies>0) {
-      // TODO: use a pq here
-      String first=null;
-      for (int i=0; i<taxonomies.length; i++) {
-        if (currentOthers[i]==null) {
-          continue;
-        }
-        if (first==null || first.compareTo(currentOthers[i])>0) {
-          first = currentOthers[i];
-        }
-      }
-      int comp = 0;
-      if (currentMain==null || (comp = currentMain.compareTo(first))>0) {
-        // If 'first' is before currentMain, or currentMain is null,
-        // then 'first' is a new category and we need to add it to the
-        // main taxonomy. Then for all taxonomies with this 'first'
-        // category, we need to add the new category number to their
-        // map, and move to the next category in all of them.
+    IndexReader r = DirectoryReader.open(taxoDir);
+    try {
+      final int size = r.numDocs();
+      final OrdinalMap ordinalMap = map;
+      ordinalMap.setSize(size);
+      CategoryPath cp = new CategoryPath();
+      Terms terms = MultiFields.getTerms(r, Consts.FULL);
+      TermsEnum te = terms.iterator(null);
+      Bits liveDocs = MultiFields.getLiveDocs(r);
+      DocsEnum docs = null;
+      // we call next() first, to skip the root category which always exists.
+      while (te.next() != null) {
+        String value = te.term().utf8ToString();
         cp.clear();
-        cp.add(first, delimiter);
-        // We can call internalAddCategory() instead of addCategory()
-        // because we know the category hasn't been seen yet.
-        int newordinal = internalAddCategory(cp, cp.length());
-        // TODO (Facet): we already had this term in our hands before, in nextTE...
-        // // TODO (Facet): no need to make this term?
-        for (int i=0; i<taxonomies.length; i++) {
-          if (first.equals(currentOthers[i])) {
-            // remember the remapping of this category's ordinal in the
-            // relevant taxonomy's ordinal map:
-            otherdocsEnum[i] = othertes[i].docs(MultiFields.getLiveDocs(otherreaders[i]), otherdocsEnum[i], false);
-            ordinalMaps[i].addMapping(otherdocsEnum[i].nextDoc(), newordinal);
-            currentOthers[i] = nextTE(othertes[i]);
-            if (currentOthers[i]==null) {
-              numOtherTaxonomies--;
-            }
-          }
-        }
-      } else if (comp==0) {
-        // 'first' and currentMain are the same category: update the ordinal
-        // maps of the other taxonomies with currentMain's existing ordinal,
-        // and advance all the cursors positioned on this category.
-        mainde = mainte.docs(MultiFields.getLiveDocs(mainreader), mainde, false);
-        int ordinal = mainde.nextDoc();
-        for (int i=0; i<taxonomies.length; i++) {
-          if (first.equals(currentOthers[i])) {
-            otherdocsEnum[i] = othertes[i].docs(MultiFields.getLiveDocs(otherreaders[i]), otherdocsEnum[i], false);
-            ordinalMaps[i].addMapping(otherdocsEnum[i].nextDoc(), ordinal);
-            currentOthers[i] = nextTE(othertes[i]);
-            if (currentOthers[i]==null) {
-              numOtherTaxonomies--;
-            }
-          }
-        }
-        currentMain = nextTE(mainte);
-      } else /* comp > 0 */ {
-        // The currentMain doesn't appear in any of the other taxonomies -
-        // we don't need to do anything, just continue to the next one
-        currentMain = nextTE(mainte);
+        cp.add(value, delimiter);
+        int ordinal = addCategory(cp);
+        docs = te.docs(liveDocs, docs, false);
+        ordinalMap.addMapping(docs.nextDoc(), ordinal);
       }
-    }
-
-    // Close all the readers we've opened, and also tell the ordinal maps
-    // we're done adding to them
-    mainreader.close();
-    for (int i=0; i<taxonomies.length; i++) {
-      otherreaders[i].close();
-      ordinalMaps[i].addDone();
-    }
+      ordinalMap.addDone();
+    } finally {
+      r.close();
+    }
   }
diff --git a/lucene/facet/src/test/org/apache/lucene/facet/taxonomy/directory/TestAddTaxonomies.java b/lucene/facet/src/test/org/apache/lucene/facet/taxonomy/directory/TestAddTaxonomies.java
deleted file mode 100644
--- a/lucene/facet/src/test/org/apache/lucene/facet/taxonomy/directory/TestAddTaxonomies.java
+++ /dev/null
@@ -1,254 +0,0 @@
-    DirectoryTaxonomyReader tr = new DirectoryTaxonomyReader(copydir);
-    if (ntaxonomies>1) {
-      for (int i=0; i<ntaxonomies; i++) {
-        DirectoryTaxonomyReader copytr = new DirectoryTaxonomyReader(dirs[i]);
-        if (i<ntaxonomies-1) {
-          assertTrue(tr.getSize() >= copytr.getSize());
-        } else {
-          assertEquals(copytr.getSize(), tr.getSize());
-        }
-        for (int j=0; j<copytr.getSize(); j++) {
-          assertEquals(copytr.getPath(j), tr.getPath(j));
-        }
-        if (tr.getSize() > copytr.getSize()) {
-          String prev = tr.getPath(copytr.getSize()).toString();
-          for (int j=copytr.getSize()+1; j<tr.getSize(); j++) {
-            String n = tr.getPath(j).toString();
-            assertTrue(prev.compareTo(n)<0);
-            prev = n;
-          }
-        }
-        copytr.close();
-      }
-    }
diff --git a/lucene/facet/src/test/org/apache/lucene/facet/taxonomy/directory/TestAddTaxonomy.java b/lucene/facet/src/test/org/apache/lucene/facet/taxonomy/directory/TestAddTaxonomy.java
new file mode 100644
--- /dev/null
+++ b/lucene/facet/src/test/org/apache/lucene/facet/taxonomy/directory/TestAddTaxonomy.java
@@ -0,0 +1,228 @@
+package org.apache.lucene.facet.taxonomy.directory;
+
+import java.io.IOException;
+import java.util.HashSet;
+import java.util.Random;
+
+import org.apache.lucene.facet.taxonomy.CategoryPath;
+import org.apache.lucene.facet.taxonomy.directory.DirectoryTaxonomyWriter.DiskOrdinalMap;
+import org.apache.lucene.facet.taxonomy.directory.DirectoryTaxonomyWriter.MemoryOrdinalMap;
+import org.apache.lucene.facet.taxonomy.directory.DirectoryTaxonomyWriter.OrdinalMap;
+import org.apache.lucene.store.Directory;
+import org.apache.lucene.util.IOUtils;
+import org.apache.lucene.util.LuceneTestCase;
+import org.apache.lucene.util._TestUtil;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+public class TestAddTaxonomy extends LuceneTestCase {
+
+  private void dotest(int ncats, int range) throws Exception {
+    // build two random taxonomies, then add the second to the first.
+    Directory[] dirs = new Directory[2];
+    for (int i = 0; i < dirs.length; i++) {
+      dirs[i] = newDirectory();
+      DirectoryTaxonomyWriter tw = new DirectoryTaxonomyWriter(dirs[i]);
+      Random random = random();
+      for (int j = 0; j < ncats; j++) {
+        tw.addCategory(new CategoryPath("a", Integer.toString(random.nextInt(range))));
+      }
+      tw.close();
+    }
+
+    DirectoryTaxonomyWriter destTW = new DirectoryTaxonomyWriter(dirs[0]);
+    OrdinalMap map = randomOrdinalMap();
+    destTW.addTaxonomy(dirs[1], map);
+    destTW.close();
+
+    validate(dirs[0], dirs[1], map);
+
+    IOUtils.close(dirs);
+  }
+
+  private OrdinalMap randomOrdinalMap() throws IOException {
+    if (random().nextBoolean()) {
+      return new DiskOrdinalMap(_TestUtil.createTempFile("taxoMap", "", TEMP_DIR));
+    } else {
+      return new MemoryOrdinalMap();
+    }
+  }
+
+  private void validate(Directory dest, Directory src, OrdinalMap ordMap) throws Exception {
+    DirectoryTaxonomyReader destTR = new DirectoryTaxonomyReader(dest);
+    try {
+      DirectoryTaxonomyReader srcTR = new DirectoryTaxonomyReader(src);
+      try {
+        int[] map = ordMap.getMap();
+        // validate that every category in the source taxonomy exists in the
+        // destination, at the ordinal the OrdinalMap records. Ordinal 0 is
+        // the root category, which always exists, hence start from 1.
+        for (int j = 1; j < srcTR.getSize(); j++) {
+          CategoryPath cp = srcTR.getPath(j);
+          int destOrdinal = destTR.getOrdinal(cp);
+          assertTrue(destOrdinal > 0);
+          assertEquals(destOrdinal, map[j]);
+        }
+      } finally {
+        srcTR.close();
+      }
+    } finally {
+      destTR.close();
+    }
+  }
+
+  public void testAddEmpty() throws Exception {
+    Directory dest = newDirectory();
+    DirectoryTaxonomyWriter destTW = new DirectoryTaxonomyWriter(dest);
+    destTW.addCategory(new CategoryPath("Author", "Rob Pike"));
+    destTW.addCategory(new CategoryPath("Aardvarks", "Bob"));
+    destTW.commit();
+
+    Directory src = newDirectory();
+    new DirectoryTaxonomyWriter(src).close(); // create an empty taxonomy
+
+    OrdinalMap map = randomOrdinalMap();
+    destTW.addTaxonomy(src, map);
+    destTW.close();
+
+    validate(dest, src, map);
+
+    IOUtils.close(dest, src);
+  }
+
+  public void testAddToEmpty() throws Exception {
+    Directory dest = newDirectory();
+
+    Directory src = newDirectory();
+    DirectoryTaxonomyWriter srcTW = new DirectoryTaxonomyWriter(src);
+    srcTW.addCategory(new CategoryPath("Author", "Rob Pike"));
+    srcTW.addCategory(new CategoryPath("Aardvarks", "Bob"));
+    srcTW.close();
+
+    DirectoryTaxonomyWriter destTW = new DirectoryTaxonomyWriter(dest);
+    OrdinalMap map = randomOrdinalMap();
+    destTW.addTaxonomy(src, map);
+    destTW.close();
+
+    validate(dest, src, map);
+
+    IOUtils.close(dest, src);
+  }
+
+  // A more comprehensive and big random test.
+  @Nightly
+  public void testBig() throws Exception {
+    dotest(200, 10000);
+    dotest(1000, 20000);
+    // really big
+    dotest(400000, 1000000);
+  }
+
+  // a reasonable random test
+  public void testMedium() throws Exception {
+    Random random = random();
+    int numTests = atLeast(3);
+    for (int i = 0; i < numTests; i++) {
+      dotest(_TestUtil.nextInt(random, 2, 100),
+          _TestUtil.nextInt(random, 100, 1000));
+    }
+  }
+
+  public void testSimple() throws Exception {
+    Directory dest = newDirectory();
+    DirectoryTaxonomyWriter tw1 = new DirectoryTaxonomyWriter(dest);
+    tw1.addCategory(new CategoryPath("Author", "Mark Twain"));
+    tw1.addCategory(new CategoryPath("Animals", "Dog"));
+    tw1.addCategory(new CategoryPath("Author", "Rob Pike"));
+
+    Directory src = newDirectory();
+    DirectoryTaxonomyWriter tw2 = new DirectoryTaxonomyWriter(src);
+    tw2.addCategory(new CategoryPath("Author", "Rob Pike"));
+    tw2.addCategory(new CategoryPath("Aardvarks", "Bob"));
+    tw2.close();
+
+    OrdinalMap map = randomOrdinalMap();
+
+    tw1.addTaxonomy(src, map);
+    tw1.close();
+
+    validate(dest, src, map);
+
+    IOUtils.close(dest, src);
+  }
+
+  public void testConcurrency() throws Exception {
+    // tests that addTaxonomy and addCategory work in parallel
+    final int numCategories = atLeast(5000);
+
+    // build an input taxonomy index
+    Directory src = newDirectory();
+    DirectoryTaxonomyWriter tw = new DirectoryTaxonomyWriter(src);
+    for (int i = 0; i < numCategories; i++) {
+      tw.addCategory(new CategoryPath("a", Integer.toString(i)));
+    }
+    tw.close();
+
+    // now add the taxonomy to an empty taxonomy, while adding the categories
+    // again, in parallel -- in the end, no duplicate categories should exist.
+    Directory dest = newDirectory();
+    final DirectoryTaxonomyWriter destTW = new DirectoryTaxonomyWriter(dest);
+    Thread t = new Thread() {
+      @Override
+      public void run() {
+        for (int i = 0; i < numCategories; i++) {
+          try {
+            destTW.addCategory(new CategoryPath("a", Integer.toString(i)));
+          } catch (IOException e) {
+            // shouldn't happen - if it does, let the test fail on uncaught exception.
+            throw new RuntimeException(e);
+          }
+        }
+      }
+    };
+    t.start();
+
+    OrdinalMap map = new MemoryOrdinalMap();
+    destTW.addTaxonomy(src, map);
+    t.join();
+    destTW.close();
+
+    // now validate
+
+    DirectoryTaxonomyReader dtr = new DirectoryTaxonomyReader(dest);
+    // +2 to account for the root category + "a"
+    assertEquals(numCategories + 2, dtr.getSize());
+    HashSet<CategoryPath> categories = new HashSet<CategoryPath>();
+    for (int i = 1; i < dtr.getSize(); i++) {
+      CategoryPath cat = dtr.getPath(i);
+      assertTrue("category " + cat + " already existed", categories.add(cat));
+    }
+    dtr.close();
+
+    IOUtils.close(src, dest);
+  }
+
+}
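
Usage sketch. The snippet below is not part of the commit; it is a minimal,
self-contained illustration of how the renamed method is meant to be called,
following the same sequence as the TaxonomyMergeUtils change above. The class
name and directory paths are invented, and it assumes a Lucene 4.0-era
(mid-2012 trunk) classpath.

  import java.io.File;

  import org.apache.lucene.facet.taxonomy.directory.DirectoryTaxonomyWriter;
  import org.apache.lucene.facet.taxonomy.directory.DirectoryTaxonomyWriter.MemoryOrdinalMap;
  import org.apache.lucene.facet.taxonomy.directory.DirectoryTaxonomyWriter.OrdinalMap;
  import org.apache.lucene.store.Directory;
  import org.apache.lucene.store.FSDirectory;

  public class AddTaxonomyUsage {
    public static void main(String[] args) throws Exception {
      // A source taxonomy to fold into a destination taxonomy (paths illustrative).
      Directory srcTaxoDir = FSDirectory.open(new File("/path/to/src-taxo"));
      Directory destTaxoDir = FSDirectory.open(new File("/path/to/dest-taxo"));
      DirectoryTaxonomyWriter destTaxoWriter = new DirectoryTaxonomyWriter(destTaxoDir);

      // One Directory and one OrdinalMap per call now, instead of the old
      // addTaxonomies(Directory[], OrdinalMap[]) parallel arrays.
      OrdinalMap map = new MemoryOrdinalMap();
      destTaxoWriter.addTaxonomy(srcTaxoDir, map);

      // After the call, map.getMap()[srcOrdinal] is the ordinal of the same
      // category in the destination taxonomy, which is the input, for example,
      // for rewriting facet payloads of documents indexed against the source.
      int[] srcToDest = map.getMap();
      System.out.println("mapped " + srcToDest.length + " source ordinals");

      destTaxoWriter.close();
      srcTaxoDir.close();
      destTaxoDir.close();
    }
  }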
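
The OrdinalMap implementation is the caller's choice and is independent of
addTaxonomy() itself: MemoryOrdinalMap keeps the whole source-to-destination
mapping in RAM, while DiskOrdinalMap spools it to a file, which suits very
large source taxonomies. A one-line sketch, assuming DiskOrdinalMap's
file-based constructor (the temp path is illustrative):

  // Disk-backed mapping for a very large source taxonomy (illustrative path).
  OrdinalMap map = new DirectoryTaxonomyWriter.DiskOrdinalMap(new File("/tmp/src-to-dest.map"));
  destTaxoWriter.addTaxonomy(srcTaxoDir, map);
  int[] srcToDest = map.getMap(); // reads the mapping back after the merge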