LUCENE-4060: port to trunk
git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1339047 13f79535-47bb-0310-9956-ffa450edef68
parent e9f32e7a72
commit d8e0288109
@@ -953,6 +953,11 @@ Bug fixes
   offset calculation in PathHierarchyTokenizer.
   (Mike McCandless, Uwe Schindler, Robert Muir)
 
+* LUCENE-4060: Fix a synchronization bug in
+  DirectoryTaxonomyWriter.addTaxonomies(). Also, the method has been renamed to
+  addTaxonomy and now takes only one Directory and one OrdinalMap.
+  (Shai Erera, Gilad Barkai)
+
 Documentation
 
 * LUCENE-3958: Javadocs corrections for IndexWriter.

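For callers migrating to the renamed API, here is a minimal sketch, assuming two small source taxonomies built in RAMDirectory (the class name AddTaxonomyMigration and the directory setup are illustrative, not part of this commit); the old array-based call becomes one addTaxonomy call, with its own OrdinalMap, per source taxonomy:

    import org.apache.lucene.facet.taxonomy.CategoryPath;
    import org.apache.lucene.facet.taxonomy.directory.DirectoryTaxonomyWriter;
    import org.apache.lucene.facet.taxonomy.directory.DirectoryTaxonomyWriter.MemoryOrdinalMap;
    import org.apache.lucene.facet.taxonomy.directory.DirectoryTaxonomyWriter.OrdinalMap;
    import org.apache.lucene.store.Directory;
    import org.apache.lucene.store.RAMDirectory;

    public class AddTaxonomyMigration {
      public static void main(String[] args) throws Exception {
        // Build two small source taxonomies.
        Directory src1 = new RAMDirectory();
        DirectoryTaxonomyWriter stw1 = new DirectoryTaxonomyWriter(src1);
        stw1.addCategory(new CategoryPath("Author", "Rob Pike"));
        stw1.close();

        Directory src2 = new RAMDirectory();
        DirectoryTaxonomyWriter stw2 = new DirectoryTaxonomyWriter(src2);
        stw2.addCategory(new CategoryPath("Aardvarks", "Bob"));
        stw2.close();

        // Merge both into a destination taxonomy.
        Directory dest = new RAMDirectory();
        DirectoryTaxonomyWriter tw = new DirectoryTaxonomyWriter(dest);
        // Before this commit:
        //   tw.addTaxonomies(new Directory[] { src1, src2 },
        //                    new OrdinalMap[] { map1, map2 });
        // After it: one call, and one OrdinalMap, per source taxonomy.
        OrdinalMap map1 = new MemoryOrdinalMap();
        tw.addTaxonomy(src1, map1);
        OrdinalMap map2 = new MemoryOrdinalMap();
        tw.addTaxonomy(src2, map2);
        tw.close();
      }
    }
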
@@ -81,7 +81,7 @@ public class TaxonomyMergeUtils {
       OrdinalMap map, IndexWriter destIndexWriter,
       DirectoryTaxonomyWriter destTaxWriter) throws IOException {
     // merge the taxonomies
-    destTaxWriter.addTaxonomies(new Directory[] { srcTaxDir }, new OrdinalMap[] { map });
+    destTaxWriter.addTaxonomy(srcTaxDir, map);
 
     PayloadProcessorProvider payloadProcessor = new FacetsPayloadProcessorProvider(
         srcIndexDir, map.getMap(), new DefaultFacetIndexingParams());

@@ -4,8 +4,6 @@ import java.io.IOException;
 
 import org.apache.lucene.index.FieldInfo;
 import org.apache.lucene.index.StoredFieldVisitor;
-import org.apache.lucene.index.StoredFieldVisitor.Status;
-import org.apache.lucene.store.IndexInput;
 
 /**
  * Licensed to the Apache Software Foundation (ASF) under one or more

@@ -42,6 +40,7 @@ abstract class Consts {
   public static final class LoadFullPathOnly extends StoredFieldVisitor {
     private String fullPath;
 
+    @Override
     public void stringField(FieldInfo fieldInfo, String value) throws IOException {
       fullPath = value;
     }

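The hunk above adapts Consts.LoadFullPathOnly to the trunk StoredFieldVisitor API, in which the reader pushes stored-field values into a visitor instead of materializing a whole Document. As a hedged illustration (the visitor class below is an assumption for the sketch, not code from this commit), a visitor of the same shape looks like:

    import java.io.IOException;

    import org.apache.lucene.index.FieldInfo;
    import org.apache.lucene.index.StoredFieldVisitor;

    // Captures the first string field of a visited document, then asks the
    // reader to stop loading further fields.
    class FirstStringFieldVisitor extends StoredFieldVisitor {
      private String value;

      @Override
      public Status needsField(FieldInfo fieldInfo) throws IOException {
        // YES until a value arrives, then STOP: no other fields are decoded.
        return value == null ? Status.YES : Status.STOP;
      }

      @Override
      public void stringField(FieldInfo fieldInfo, String v) throws IOException {
        value = v;
      }

      String getValue() {
        return value;
      }
    }

It is driven with reader.document(docID, visitor), after which getValue() returns the stored string, or null if the document had none.
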
@@ -12,15 +12,21 @@ import java.io.IOException;
 import java.util.HashMap;
 import java.util.Map;
 
-import org.apache.lucene.analysis.core.KeywordAnalyzer;
 import org.apache.lucene.analysis.TokenStream;
-import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
+import org.apache.lucene.analysis.core.KeywordAnalyzer;
 import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
+import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
 import org.apache.lucene.document.Document;
 import org.apache.lucene.document.Field;
 import org.apache.lucene.document.FieldType;
 import org.apache.lucene.document.StringField;
 import org.apache.lucene.document.TextField;
+import org.apache.lucene.facet.taxonomy.CategoryPath;
+import org.apache.lucene.facet.taxonomy.TaxonomyReader;
+import org.apache.lucene.facet.taxonomy.TaxonomyWriter;
+import org.apache.lucene.facet.taxonomy.writercache.TaxonomyWriterCache;
+import org.apache.lucene.facet.taxonomy.writercache.cl2o.Cl2oTaxonomyWriterCache;
+import org.apache.lucene.facet.taxonomy.writercache.lru.LruTaxonomyWriterCache;
 import org.apache.lucene.index.CorruptIndexException;
 import org.apache.lucene.index.DirectoryReader;
 import org.apache.lucene.index.DocsEnum;

@@ -30,9 +36,9 @@ import org.apache.lucene.index.IndexWriterConfig;
 import org.apache.lucene.index.IndexWriterConfig.OpenMode;
 import org.apache.lucene.index.LogByteSizeMergePolicy;
 import org.apache.lucene.index.MultiFields;
+import org.apache.lucene.index.SegmentInfos;
 import org.apache.lucene.index.Terms;
 import org.apache.lucene.index.TermsEnum;
-import org.apache.lucene.index.SegmentInfos;
 import org.apache.lucene.index.TieredMergePolicy;
 import org.apache.lucene.search.DocIdSetIterator;
 import org.apache.lucene.store.AlreadyClosedException;

@@ -44,13 +50,6 @@ import org.apache.lucene.util.Bits;
 import org.apache.lucene.util.BytesRef;
 import org.apache.lucene.util.Version;
 
-import org.apache.lucene.facet.taxonomy.CategoryPath;
-import org.apache.lucene.facet.taxonomy.TaxonomyReader;
-import org.apache.lucene.facet.taxonomy.TaxonomyWriter;
-import org.apache.lucene.facet.taxonomy.writercache.TaxonomyWriterCache;
-import org.apache.lucene.facet.taxonomy.writercache.cl2o.Cl2oTaxonomyWriterCache;
-import org.apache.lucene.facet.taxonomy.writercache.lru.LruTaxonomyWriterCache;
-
 /**
  * Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements. See the NOTICE file distributed with

@@ -812,6 +811,7 @@ public class DirectoryTaxonomyWriter implements TaxonomyWriter {
     }
     return parentArray;
   }
 
+  @Override
   public int getParent(int ordinal) throws IOException {
     ensureOpen();

@@ -823,158 +823,47 @@ public class DirectoryTaxonomyWriter implements TaxonomyWriter {
     }
     return getParentArray().getArray()[ordinal];
   }
 
 
   /**
-   * Take all the categories of one or more given taxonomies, and add them to
-   * the main taxonomy (this), if they are not already there.
-   * <P>
-   * Additionally, fill a <I>mapping</I> for each of the added taxonomies,
-   * mapping its ordinals to the ordinals in the enlarged main taxonomy.
-   * These mapping are saved into an array of OrdinalMap objects given by the
-   * user, one for each of the given taxonomies (not including "this", the main
-   * taxonomy). Often the first of these will be a MemoryOrdinalMap and the
-   * others will be a DiskOrdinalMap - see discussion in {@link OrdinalMap}.
-   * <P>
-   * Note that the taxonomies to be added are given as Directory objects,
-   * not opened TaxonomyReader/TaxonomyWriter objects, so if any of them are
-   * currently managed by an open TaxonomyWriter, make sure to commit() (or
-   * close()) it first. The main taxonomy (this) is an open TaxonomyWriter,
-   * and does not need to be commit()ed before this call.
+   * Takes the categories from the given taxonomy directory, and adds the
+   * missing ones to this taxonomy. Additionally, it fills the given
+   * {@link OrdinalMap} with a mapping from the original ordinal to the new
+   * ordinal.
    */
-  public void addTaxonomies(Directory[] taxonomies, OrdinalMap[] ordinalMaps) throws IOException {
+  public void addTaxonomy(Directory taxoDir, OrdinalMap map) throws IOException {
     ensureOpen();
-    // To prevent us stepping on the rest of this class's decisions on when
-    // to open a reader, and when not, we'll be opening a new reader instead
-    // of using the existing "reader" object:
-    IndexReader mainreader = openReader();
-    // TODO (Facet): can this then go segment-by-segment and avoid MultiDocsEnum etc?
-    Terms terms = MultiFields.getTerms(mainreader, Consts.FULL);
-    assert terms != null; // TODO (Facet): explicit check / throw exception?
-    TermsEnum mainte = terms.iterator(null);
-    DocsEnum mainde = null;
-
-    IndexReader[] otherreaders = new IndexReader[taxonomies.length];
-    TermsEnum[] othertes = new TermsEnum[taxonomies.length];
-    DocsEnum[] otherdocsEnum = new DocsEnum[taxonomies.length]; // just for reuse
-    for (int i=0; i<taxonomies.length; i++) {
-      otherreaders[i] = DirectoryReader.open(taxonomies[i]);
-      terms = MultiFields.getTerms(otherreaders[i], Consts.FULL);
-      assert terms != null; // TODO (Facet): explicit check / throw exception?
-      othertes[i] = terms.iterator(null);
-      // Also tell the ordinal maps their expected sizes:
-      ordinalMaps[i].setSize(otherreaders[i].numDocs());
-    }
-
-    CategoryPath cp = new CategoryPath();
-
-    // We keep a "current" cursor over the alphabetically-ordered list of
-    // categories in each taxonomy. We start the cursor on the first
-    // (alphabetically) category of each taxonomy:
-
-    String currentMain;
-    String[] currentOthers = new String[taxonomies.length];
-    currentMain = nextTE(mainte);
-    int otherTaxonomiesLeft = 0;
-    for (int i=0; i<taxonomies.length; i++) {
-      currentOthers[i] = nextTE(othertes[i]);
-      if (currentOthers[i]!=null) {
-        otherTaxonomiesLeft++;
-      }
-    }
-
-    // And then, at each step look at the first (alphabetically) of the
-    // current taxonomies.
-    // NOTE: The most efficient way we could have done this is using a
-    // PriorityQueue. But for simplicity, and assuming that usually we'll
-    // have a very small number of other taxonomies (often just 1), we use
-    // a more naive algorithm (o(ntaxonomies) instead of o(ln ntaxonomies)
-    // per step)
-
-    while (otherTaxonomiesLeft>0) {
-      // TODO: use a pq here
-      String first=null;
-      for (int i=0; i<taxonomies.length; i++) {
-        if (currentOthers[i]==null) continue;
-        if (first==null || first.compareTo(currentOthers[i])>0) {
-          first = currentOthers[i];
-        }
-      }
-      int comp = 0;
-      if (currentMain==null || (comp = currentMain.compareTo(first))>0) {
-        // If 'first' is before currentMain, or currentMain is null,
-        // then 'first' is a new category and we need to add it to the
-        // main taxonomy. Then for all taxonomies with this 'first'
-        // category, we need to add the new category number to their
-        // map, and move to the next category in all of them.
+    IndexReader r = DirectoryReader.open(taxoDir);
+    try {
+      final int size = r.numDocs();
+      final OrdinalMap ordinalMap = map;
+      ordinalMap.setSize(size);
+      CategoryPath cp = new CategoryPath();
+      Terms terms = MultiFields.getTerms(r, Consts.FULL);
+      TermsEnum te = terms.iterator(null);
+      Bits liveDocs = MultiFields.getLiveDocs(r);
+      DocsEnum docs = null;
+      // we call next() first, to skip the root category which always exists.
+      while (te.next() != null) {
+        String value = te.term().utf8ToString();
         cp.clear();
-        cp.add(first, delimiter);
-        // We can call internalAddCategory() instead of addCategory()
-        // because we know the category hasn't been seen yet.
-        int newordinal = internalAddCategory(cp, cp.length());
-        // TODO (Facet): we already had this term in our hands before, in nextTE...
-        // // TODO (Facet): no need to make this term?
-        for (int i=0; i<taxonomies.length; i++) {
-          if (first.equals(currentOthers[i])) {
-            // remember the remapping of this ordinal. Note how
-            // this requires reading a posting list from the index -
-            // but since we do this in lexical order of terms, just
-            // like Lucene's merge works, we hope there are few seeks.
-            // TODO (Facet): is there a quicker way? E.g., not specifying the
-            // next term by name every time?
-            otherdocsEnum[i] = othertes[i].docs(MultiFields.getLiveDocs(otherreaders[i]), otherdocsEnum[i], false);
-            otherdocsEnum[i].nextDoc(); // TODO (Facet): check?
-            int origordinal = otherdocsEnum[i].docID();
-            ordinalMaps[i].addMapping(origordinal, newordinal);
-            // and move to the next category in the i'th taxonomy
-            currentOthers[i] = nextTE(othertes[i]);
-            if (currentOthers[i]==null) {
-              otherTaxonomiesLeft--;
-            }
-          }
+        cp.add(value, Consts.DEFAULT_DELIMITER);
+        int ordinal = findCategory(cp);
+        if (ordinal < 0) {
+          // NOTE: call addCategory so that it works well in a multi-threaded
+          // environment, in case e.g. a thread just added the category, after
+          // the findCategory() call above failed to find it.
+          ordinal = addCategory(cp);
         }
-      } else if (comp==0) {
-        // 'first' and currentMain are the same, so both the main and some
-        // other taxonomies need to be moved, but a category doesn't need
-        // to be added because it already existed in the main taxonomy.
-
-        // TODO (Facet): Again, is there a quicker way?
-        mainde = mainte.docs(MultiFields.getLiveDocs(mainreader), mainde, false);
-        mainde.nextDoc(); // TODO (Facet): check?
-        int newordinal = mainde.docID();
-
-        currentMain = nextTE(mainte);
-        for (int i=0; i<taxonomies.length; i++) {
-          if (first.equals(currentOthers[i])) {
-            // TODO (Facet): again, is there a quicker way?
-            otherdocsEnum[i] = othertes[i].docs(MultiFields.getLiveDocs(otherreaders[i]), otherdocsEnum[i], false);
-            otherdocsEnum[i].nextDoc(); // TODO (Facet): check?
-            int origordinal = otherdocsEnum[i].docID();
-            ordinalMaps[i].addMapping(origordinal, newordinal);
-
-            // and move to the next category
-            currentOthers[i] = nextTE(othertes[i]);
-            if (currentOthers[i]==null) {
-              otherTaxonomiesLeft--;
-            }
-          }
-        }
-      } else /* comp > 0 */ {
-        // The currentMain doesn't appear in any of the other taxonomies -
-        // we don't need to do anything, just continue to the next one
-        currentMain = nextTE(mainte);
+        docs = te.docs(liveDocs, docs, false);
+        ordinalMap.addMapping(docs.nextDoc(), ordinal);
       }
-    }
 
-    // Close all the readers we've opened, and also tell the ordinal maps
-    // we're done adding to them
-    mainreader.close();
-    for (int i=0; i<taxonomies.length; i++) {
-      otherreaders[i].close();
-      // We never actually added a mapping for the root ordinal - let's do
-      // it now, just so that the map is complete (every ordinal between 0
-      // and size-1 is remapped)
-      ordinalMaps[i].addMapping(0, 0);
-      ordinalMaps[i].addDone();
+      // we must add the root ordinal map, so that the map will be complete
+      // (otherwise e.g. DiskOrdinalMap may fail because it expects more
+      // categories to exist in the file).
+      ordinalMap.addMapping(0, 0);
+      ordinalMap.addDone();
+    } finally {
+      r.close();
     }
   }

@@ -1113,13 +1002,6 @@ public class DirectoryTaxonomyWriter implements TaxonomyWriter {
     }
   }
 
-  private static final String nextTE(TermsEnum te) throws IOException {
-    if (te.next() != null) {
-      return te.term().utf8ToString(); // TODO (Facet): avoid String creation/use Bytes?
-    }
-    return null;
-  }
-
   /**
    * Rollback changes to the taxonomy writer and closes the instance. Following
    * this method the instance becomes unusable (calling any of its API methods

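As a usage sketch for the new method (the OrdinalRemapper wrapper below is illustrative; addTaxonomy, MemoryOrdinalMap and getMap() are from this change): once addTaxonomy returns, the filled map translates source-taxonomy ordinals into ordinals of the merged taxonomy, which is how the map.getMap() call in TaxonomyMergeUtils above rewrites facet data.

    import java.io.IOException;

    import org.apache.lucene.facet.taxonomy.directory.DirectoryTaxonomyWriter;
    import org.apache.lucene.facet.taxonomy.directory.DirectoryTaxonomyWriter.MemoryOrdinalMap;
    import org.apache.lucene.facet.taxonomy.directory.DirectoryTaxonomyWriter.OrdinalMap;
    import org.apache.lucene.store.Directory;

    class OrdinalRemapper {
      /**
       * Merges src into destTW and returns the source-to-destination ordinal
       * mapping: entry j is the merged-taxonomy ordinal of the category whose
       * ordinal was j in the source taxonomy (0, the root, maps to 0).
       */
      static int[] merge(DirectoryTaxonomyWriter destTW, Directory src) throws IOException {
        OrdinalMap map = new MemoryOrdinalMap();
        destTW.addTaxonomy(src, map);
        return map.getMap();
      }
    }

For taxonomies too large to map in RAM, the DiskOrdinalMap used by the tests below is the drop-in alternative.
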
@@ -1,254 +0,0 @@
-package org.apache.lucene.facet.taxonomy.directory;
-
-import java.io.File;
-
-import org.apache.lucene.store.Directory;
-import org.junit.Test;
-
-import org.apache.lucene.util.IOUtils;
-import org.apache.lucene.util.LuceneTestCase;
-import org.apache.lucene.util._TestUtil;
-import org.apache.lucene.facet.taxonomy.CategoryPath;
-import org.apache.lucene.facet.taxonomy.TaxonomyReader;
-import org.apache.lucene.facet.taxonomy.directory.DirectoryTaxonomyReader;
-import org.apache.lucene.facet.taxonomy.directory.DirectoryTaxonomyWriter;
-import org.apache.lucene.facet.taxonomy.directory.DirectoryTaxonomyWriter.DiskOrdinalMap;
-import org.apache.lucene.facet.taxonomy.directory.DirectoryTaxonomyWriter.MemoryOrdinalMap;
-import org.apache.lucene.facet.taxonomy.directory.DirectoryTaxonomyWriter.OrdinalMap;
-
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-public class TestAddTaxonomies extends LuceneTestCase {
-
-  @Test
-  public void test1() throws Exception {
-    Directory dir1 = newDirectory();
-    DirectoryTaxonomyWriter tw1 = new DirectoryTaxonomyWriter(dir1);
-    tw1.addCategory(new CategoryPath("Author", "Mark Twain"));
-    tw1.addCategory(new CategoryPath("Animals", "Dog"));
-    Directory dir2 = newDirectory();
-    DirectoryTaxonomyWriter tw2 = new DirectoryTaxonomyWriter(dir2);
-    tw2.addCategory(new CategoryPath("Author", "Rob Pike"));
-    tw2.addCategory(new CategoryPath("Aardvarks", "Bob"));
-    tw2.close();
-    Directory dir3 = newDirectory();
-    DirectoryTaxonomyWriter tw3 = new DirectoryTaxonomyWriter(dir3);
-    tw3.addCategory(new CategoryPath("Author", "Zebra Smith"));
-    tw3.addCategory(new CategoryPath("Aardvarks", "Bob"));
-    tw3.addCategory(new CategoryPath("Aardvarks", "Aaron"));
-    tw3.close();
-
-    MemoryOrdinalMap[] maps = new MemoryOrdinalMap[2];
-    maps[0] = new MemoryOrdinalMap();
-    maps[1] = new MemoryOrdinalMap();
-
-    tw1.addTaxonomies(new Directory[] { dir2, dir3 }, maps);
-    tw1.close();
-
-    TaxonomyReader tr = new DirectoryTaxonomyReader(dir1);
-
-    // Test that the merged taxonomy now contains what we expect:
-    // First all the categories of the original taxonomy, in their original order:
-    assertEquals(tr.getPath(0).toString(), "");
-    assertEquals(tr.getPath(1).toString(), "Author");
-    assertEquals(tr.getPath(2).toString(), "Author/Mark Twain");
-    assertEquals(tr.getPath(3).toString(), "Animals");
-    assertEquals(tr.getPath(4).toString(), "Animals/Dog");
-    // Then the categories new in the new taxonomy, in alphabetical order:
-    assertEquals(tr.getPath(5).toString(), "Aardvarks");
-    assertEquals(tr.getPath(6).toString(), "Aardvarks/Aaron");
-    assertEquals(tr.getPath(7).toString(), "Aardvarks/Bob");
-    assertEquals(tr.getPath(8).toString(), "Author/Rob Pike");
-    assertEquals(tr.getPath(9).toString(), "Author/Zebra Smith");
-    assertEquals(tr.getSize(), 10);
-
-    // Test that the maps contain what we expect
-    int[] map0 = maps[0].getMap();
-    assertEquals(5, map0.length);
-    assertEquals(0, map0[0]);
-    assertEquals(1, map0[1]);
-    assertEquals(8, map0[2]);
-    assertEquals(5, map0[3]);
-    assertEquals(7, map0[4]);
-
-    int[] map1 = maps[1].getMap();
-    assertEquals(6, map1.length);
-    assertEquals(0, map1[0]);
-    assertEquals(1, map1[1]);
-    assertEquals(9, map1[2]);
-    assertEquals(5, map1[3]);
-    assertEquals(7, map1[4]);
-    assertEquals(6, map1[5]);
-
-    tr.close();
-    dir1.close();
-    dir2.close();
-    dir3.close();
-  }
-
-  // a reasonable random test
-  public void testmedium() throws Exception {
-    int numTests = atLeast(3);
-    for (int i = 0; i < numTests; i++) {
-      dotest(_TestUtil.nextInt(random(), 1, 10),
-          _TestUtil.nextInt(random(), 1, 100),
-          _TestUtil.nextInt(random(), 100, 1000),
-          random().nextBoolean());
-    }
-  }
-
-  // A more comprehensive and big random test.
-  @Test @Nightly
-  public void testbig() throws Exception {
-    dotest(2, 1000, 5000, false);
-    dotest(10, 10000, 100, false);
-    dotest(50, 20, 100, false);
-    dotest(10, 1000, 10000, false);
-    dotest(50, 20, 10000, false);
-    dotest(1, 20, 10000, false);
-    dotest(10, 1, 10000, false);
-    dotest(10, 1000, 20000, true);
-  }
-
-  private void dotest(int ntaxonomies, int ncats, int range, boolean disk) throws Exception {
-    Directory dirs[] = new Directory[ntaxonomies];
-    Directory copydirs[] = new Directory[ntaxonomies];
-
-    for (int i=0; i<ntaxonomies; i++) {
-      dirs[i] = newDirectory();
-      copydirs[i] = newDirectory();
-      DirectoryTaxonomyWriter tw = new DirectoryTaxonomyWriter(dirs[i]);
-      DirectoryTaxonomyWriter copytw = new DirectoryTaxonomyWriter(copydirs[i]);
-      for (int j=0; j<ncats; j++) {
-        String cat = Integer.toString(random().nextInt(range));
-        tw.addCategory(new CategoryPath("a",cat));
-        copytw.addCategory(new CategoryPath("a",cat));
-      }
-      // System.err.println("Taxonomy "+i+": "+tw.getSize());
-      tw.close();
-      copytw.close();
-    }
-
-    DirectoryTaxonomyWriter tw = new DirectoryTaxonomyWriter(dirs[0]);
-    Directory otherdirs[] = new Directory[ntaxonomies-1];
-    System.arraycopy(dirs, 1, otherdirs, 0, ntaxonomies-1);
-
-    OrdinalMap[] maps = new OrdinalMap[ntaxonomies-1];
-    if (ntaxonomies>1) {
-      for (int i=0; i<ntaxonomies-1; i++) {
-        if (disk) {
-          // TODO: use a LTC tempfile
-          maps[i] = new DiskOrdinalMap(new File(System.getProperty("java.io.tmpdir"),
-              "tmpmap"+i));
-        } else {
-          maps[i] = new MemoryOrdinalMap();
-        }
-      }
-    }
-
-    tw.addTaxonomies(otherdirs, maps);
-    // System.err.println("Merged axonomy: "+tw.getSize());
-    tw.close();
-
-    // Check that all original categories in the main taxonomy remain in
-    // unchanged, and the rest of the taxonomies are completely unchanged.
-    for (int i=0; i<ntaxonomies; i++) {
-      TaxonomyReader tr = new DirectoryTaxonomyReader(dirs[i]);
-      TaxonomyReader copytr = new DirectoryTaxonomyReader(copydirs[i]);
-      if (i==0) {
-        assertTrue(tr.getSize() >= copytr.getSize());
-      } else {
-        assertEquals(copytr.getSize(), tr.getSize());
-      }
-      for (int j=0; j<copytr.getSize(); j++) {
-        String expected = copytr.getPath(j).toString();
-        String got = tr.getPath(j).toString();
-        assertTrue("Comparing category "+j+" of taxonomy "+i+": expected "+expected+", got "+got,
-            expected.equals(got));
-      }
-      tr.close();
-      copytr.close();
-    }
-
-    // Check that all the new categories in the main taxonomy are in
-    // lexicographic order. This isn't a requirement of our API, but happens
-    // this way in our current implementation.
-    TaxonomyReader tr = new DirectoryTaxonomyReader(dirs[0]);
-    TaxonomyReader copytr = new DirectoryTaxonomyReader(copydirs[0]);
-    if (tr.getSize() > copytr.getSize()) {
-      String prev = tr.getPath(copytr.getSize()).toString();
-      for (int j=copytr.getSize()+1; j<tr.getSize(); j++) {
-        String n = tr.getPath(j).toString();
-        assertTrue(prev.compareTo(n)<0);
-        prev=n;
-      }
-    }
-    int oldsize = copytr.getSize(); // remember for later
-    tr.close();
-    copytr.close();
-
-    // Check that all the categories from other taxonomies exist in the new
-    // taxonomy.
-    TaxonomyReader main = new DirectoryTaxonomyReader(dirs[0]);
-    for (int i=1; i<ntaxonomies; i++) {
-      TaxonomyReader other = new DirectoryTaxonomyReader(dirs[i]);
-      for (int j=0; j<other.getSize(); j++) {
-        int otherord = main.getOrdinal(other.getPath(j));
-        assertTrue(otherord != TaxonomyReader.INVALID_ORDINAL);
-      }
-      other.close();
-    }
-
-    // Check that all the new categories in the merged taxonomy exist in
-    // one of the added taxonomies.
-    TaxonomyReader[] others = new TaxonomyReader[ntaxonomies-1];
-    for (int i=1; i<ntaxonomies; i++) {
-      others[i-1] = new DirectoryTaxonomyReader(dirs[i]);
-    }
-    for (int j=oldsize; j<main.getSize(); j++) {
-      boolean found=false;
-      CategoryPath path = main.getPath(j);
-      for (int i=1; i<ntaxonomies; i++) {
-        if (others[i-1].getOrdinal(path) != TaxonomyReader.INVALID_ORDINAL) {
-          found=true;
-          break;
-        }
-      }
-      if (!found) {
-        fail("Found category "+j+" ("+path+") in merged taxonomy not in any of the separate ones");
-      }
-    }
-
-    // Check that all the maps are correct
-    for (int i=0; i<ntaxonomies-1; i++) {
-      int[] map = maps[i].getMap();
-      for (int j=0; j<map.length; j++) {
-        assertEquals(map[j], main.getOrdinal(others[i].getPath(j)));
-      }
-    }
-
-    for (int i=1; i<ntaxonomies; i++) {
-      others[i-1].close();
-    }
-
-    main.close();
-    IOUtils.close(dirs);
-    IOUtils.close(copydirs);
-  }
-
-}

@@ -0,0 +1,228 @@
+package org.apache.lucene.facet.taxonomy.directory;
+
+import java.io.IOException;
+import java.util.HashSet;
+import java.util.Random;
+
+import org.apache.lucene.facet.taxonomy.CategoryPath;
+import org.apache.lucene.facet.taxonomy.directory.DirectoryTaxonomyWriter.DiskOrdinalMap;
+import org.apache.lucene.facet.taxonomy.directory.DirectoryTaxonomyWriter.MemoryOrdinalMap;
+import org.apache.lucene.facet.taxonomy.directory.DirectoryTaxonomyWriter.OrdinalMap;
+import org.apache.lucene.store.Directory;
+import org.apache.lucene.util.IOUtils;
+import org.apache.lucene.util.LuceneTestCase;
+import org.apache.lucene.util._TestUtil;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+public class TestAddTaxonomy extends LuceneTestCase {
+
+  private void dotest(int ncats, int range) throws Exception {
+    Directory dirs[] = new Directory[2];
+    Random random = random();
+    for (int i = 0; i < dirs.length; i++) {
+      dirs[i] = newDirectory();
+      DirectoryTaxonomyWriter tw = new DirectoryTaxonomyWriter(dirs[i]);
+      for (int j = 0; j < ncats; j++) {
+        String cat = Integer.toString(random.nextInt(range));
+        tw.addCategory(new CategoryPath("a", cat));
+      }
+      tw.close();
+    }
+
+    DirectoryTaxonomyWriter tw = new DirectoryTaxonomyWriter(dirs[0]);
+    OrdinalMap map = randomOrdinalMap();
+    tw.addTaxonomy(dirs[1], map);
+    tw.close();
+
+    validate(dirs[0], dirs[1], map);
+
+    IOUtils.close(dirs);
+  }
+
+  private OrdinalMap randomOrdinalMap() throws IOException {
+    if (random().nextBoolean()) {
+      return new DiskOrdinalMap(_TestUtil.createTempFile("taxoMap", "", TEMP_DIR));
+    } else {
+      return new MemoryOrdinalMap();
+    }
+  }
+
+  private void validate(Directory dest, Directory src, OrdinalMap ordMap) throws Exception {
+    CategoryPath cp = new CategoryPath();
+    DirectoryTaxonomyReader destTR = new DirectoryTaxonomyReader(dest);
+    try {
+      final int destSize = destTR.getSize();
+      DirectoryTaxonomyReader srcTR = new DirectoryTaxonomyReader(src);
+      try {
+        int[] map = ordMap.getMap();
+
+        // validate taxo sizes
+        int srcSize = srcTR.getSize();
+        assertTrue("destination taxonomy expected to be larger than source; dest="
+            + destSize + " src=" + srcSize,
+            destSize >= srcSize);
+
+        // validate that all source categories exist in destination, and their
+        // ordinals are as expected.
+        for (int j = 1; j < srcSize; j++) {
+          srcTR.getPath(j, cp);
+          int destOrdinal = destTR.getOrdinal(cp);
+          assertTrue(cp + " not found in destination", destOrdinal > 0);
+          assertEquals(destOrdinal, map[j]);
+        }
+      } finally {
+        srcTR.close();
+      }
+    } finally {
+      destTR.close();
+    }
+  }
+
+  public void testAddEmpty() throws Exception {
+    Directory dest = newDirectory();
+    DirectoryTaxonomyWriter destTW = new DirectoryTaxonomyWriter(dest);
+    destTW.addCategory(new CategoryPath("Author", "Rob Pike"));
+    destTW.addCategory(new CategoryPath("Aardvarks", "Bob"));
+    destTW.commit();
+
+    Directory src = newDirectory();
+    new DirectoryTaxonomyWriter(src).close(); // create an empty taxonomy
+
+    OrdinalMap map = randomOrdinalMap();
+    destTW.addTaxonomy(src, map);
+    destTW.close();
+
+    validate(dest, src, map);
+
+    IOUtils.close(dest, src);
+  }
+
+  public void testAddToEmpty() throws Exception {
+    Directory dest = newDirectory();
+
+    Directory src = newDirectory();
+    DirectoryTaxonomyWriter srcTW = new DirectoryTaxonomyWriter(src);
+    srcTW.addCategory(new CategoryPath("Author", "Rob Pike"));
+    srcTW.addCategory(new CategoryPath("Aardvarks", "Bob"));
+    srcTW.close();
+
+    DirectoryTaxonomyWriter destTW = new DirectoryTaxonomyWriter(dest);
+    OrdinalMap map = randomOrdinalMap();
+    destTW.addTaxonomy(src, map);
+    destTW.close();
+
+    validate(dest, src, map);
+
+    IOUtils.close(dest, src);
+  }
+
+  // A more comprehensive and big random test.
+  @Nightly
+  public void testBig() throws Exception {
+    dotest(200, 10000);
+    dotest(1000, 20000);
+    // really big
+    dotest(400000, 1000000);
+  }
+
+  // a reasonable random test
+  public void testMedium() throws Exception {
+    Random random = random();
+    int numTests = atLeast(3);
+    for (int i = 0; i < numTests; i++) {
+      dotest(_TestUtil.nextInt(random, 2, 100),
+          _TestUtil.nextInt(random, 100, 1000));
+    }
+  }
+
+  public void testSimple() throws Exception {
+    Directory dest = newDirectory();
+    DirectoryTaxonomyWriter tw1 = new DirectoryTaxonomyWriter(dest);
+    tw1.addCategory(new CategoryPath("Author", "Mark Twain"));
+    tw1.addCategory(new CategoryPath("Animals", "Dog"));
+    tw1.addCategory(new CategoryPath("Author", "Rob Pike"));
+
+    Directory src = newDirectory();
+    DirectoryTaxonomyWriter tw2 = new DirectoryTaxonomyWriter(src);
+    tw2.addCategory(new CategoryPath("Author", "Rob Pike"));
+    tw2.addCategory(new CategoryPath("Aardvarks", "Bob"));
+    tw2.close();
+
+    OrdinalMap map = randomOrdinalMap();
+
+    tw1.addTaxonomy(src, map);
+    tw1.close();
+
+    validate(dest, src, map);
+
+    IOUtils.close(dest, src);
+  }
+
+  public void testConcurrency() throws Exception {
+    // tests that addTaxonomy and addCategory work in parallel
+    final int numCategories = atLeast(5000);
+
+    // build an input taxonomy index
+    Directory src = newDirectory();
+    DirectoryTaxonomyWriter tw = new DirectoryTaxonomyWriter(src);
+    for (int i = 0; i < numCategories; i++) {
+      tw.addCategory(new CategoryPath("a", Integer.toString(i)));
+    }
+    tw.close();
+
+    // now add the taxonomy to an empty taxonomy, while adding the categories
+    // again, in parallel -- in the end, no duplicate categories should exist.
+    Directory dest = newDirectory();
+    final DirectoryTaxonomyWriter destTW = new DirectoryTaxonomyWriter(dest);
+    Thread t = new Thread() {
+      @Override
+      public void run() {
+        for (int i = 0; i < numCategories; i++) {
+          try {
+            destTW.addCategory(new CategoryPath("a", Integer.toString(i)));
+          } catch (IOException e) {
+            // shouldn't happen - if it does, let the test fail on uncaught exception.
+            throw new RuntimeException(e);
+          }
+        }
+      }
+    };
+    t.start();
+
+    OrdinalMap map = new MemoryOrdinalMap();
+    destTW.addTaxonomy(src, map);
+    t.join();
+    destTW.close();
+
+    // now validate
+
+    DirectoryTaxonomyReader dtr = new DirectoryTaxonomyReader(dest);
+    // +2 to account for the root category + "a"
+    assertEquals(numCategories + 2, dtr.getSize());
+    HashSet<CategoryPath> categories = new HashSet<CategoryPath>();
+    for (int i = 1; i < dtr.getSize(); i++) {
+      CategoryPath cat = dtr.getPath(i);
+      assertTrue("category " + cat + " already existed", categories.add(cat));
+    }
+    dtr.close();
+
+    IOUtils.close(src, dest);
+  }
+
+}