LUCENE-4060: port to trunk

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1339047 13f79535-47bb-0310-9956-ffa450edef68
Shai Erera 2012-05-16 08:01:40 +00:00
parent e9f32e7a72
commit d8e0288109
6 changed files with 280 additions and 420 deletions


@@ -953,6 +953,11 @@ Bug fixes
offset calculation in PathHierarchyTokenizer.
(Mike McCandless, Uwe Schindler, Robert Muir)
* LUCENE-4060: Fix a synchronization bug in
DirectoryTaxonomyWriter.addTaxonomies(). Also, the method has been renamed to
addTaxonomy and now takes only one Directory and one OrdinalMap.
(Shai Erera, Gilad Barkai)
Documentation
* LUCENE-3958: Javadocs corrections for IndexWriter.
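The LUCENE-4060 entry above is a one-for-one API migration for callers. A minimal sketch, with hypothetical names (destTW, srcDir, map; none of these appear in this commit):

// Before this commit: arrays of source directories and ordinal maps.
destTW.addTaxonomies(new Directory[] { srcDir }, new OrdinalMap[] { map });

// After this commit: one source directory and one map per call.
destTW.addTaxonomy(srcDir, map);

Callers that merged several taxonomies in a single call now invoke addTaxonomy once per source directory, each call with its own OrdinalMap.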


@@ -81,7 +81,7 @@ public class TaxonomyMergeUtils {
OrdinalMap map, IndexWriter destIndexWriter,
DirectoryTaxonomyWriter destTaxWriter) throws IOException {
// merge the taxonomies
destTaxWriter.addTaxonomies(new Directory[] { srcTaxDir }, new OrdinalMap[] { map });
destTaxWriter.addTaxonomy(srcTaxDir, map);
PayloadProcessorProvider payloadProcessor = new FacetsPayloadProcessorProvider(
srcIndexDir, map.getMap(), new DefaultFacetIndexingParams());


@@ -4,8 +4,6 @@ import java.io.IOException;
import org.apache.lucene.index.FieldInfo;
import org.apache.lucene.index.StoredFieldVisitor;
import org.apache.lucene.index.StoredFieldVisitor.Status;
import org.apache.lucene.store.IndexInput;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
@@ -42,6 +40,7 @@ abstract class Consts {
public static final class LoadFullPathOnly extends StoredFieldVisitor {
private String fullPath;
@Override
public void stringField(FieldInfo fieldInfo, String value) throws IOException {
fullPath = value;
}
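For orientation: LoadFullPathOnly is a StoredFieldVisitor, so instead of materializing a whole Document it is handed to IndexReader.document(docID, visitor), which calls stringField(...) for each stored field the visitor accepts. A minimal usage sketch, assuming a reader and docID are in scope and that getFullPath() is the class's accessor for the captured value:

Consts.LoadFullPathOnly visitor = new Consts.LoadFullPathOnly();
reader.document(docID, visitor);         // invokes visitor.stringField(...) as fields are read
String fullPath = visitor.getFullPath(); // the single stored value the visitor retained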


@@ -12,15 +12,21 @@ import java.io.IOException;
import java.util.HashMap;
import java.util.Map;
import org.apache.lucene.analysis.core.KeywordAnalyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.analysis.core.KeywordAnalyzer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.FieldType;
import org.apache.lucene.document.StringField;
import org.apache.lucene.document.TextField;
import org.apache.lucene.facet.taxonomy.CategoryPath;
import org.apache.lucene.facet.taxonomy.TaxonomyReader;
import org.apache.lucene.facet.taxonomy.TaxonomyWriter;
import org.apache.lucene.facet.taxonomy.writercache.TaxonomyWriterCache;
import org.apache.lucene.facet.taxonomy.writercache.cl2o.Cl2oTaxonomyWriterCache;
import org.apache.lucene.facet.taxonomy.writercache.lru.LruTaxonomyWriterCache;
import org.apache.lucene.index.CorruptIndexException;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.DocsEnum;
@@ -30,9 +36,9 @@ import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.IndexWriterConfig.OpenMode;
import org.apache.lucene.index.LogByteSizeMergePolicy;
import org.apache.lucene.index.MultiFields;
import org.apache.lucene.index.SegmentInfos;
import org.apache.lucene.index.Terms;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.index.SegmentInfos;
import org.apache.lucene.index.TieredMergePolicy;
import org.apache.lucene.search.DocIdSetIterator;
import org.apache.lucene.store.AlreadyClosedException;
@@ -44,13 +50,6 @@ import org.apache.lucene.util.Bits;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.Version;
import org.apache.lucene.facet.taxonomy.CategoryPath;
import org.apache.lucene.facet.taxonomy.TaxonomyReader;
import org.apache.lucene.facet.taxonomy.TaxonomyWriter;
import org.apache.lucene.facet.taxonomy.writercache.TaxonomyWriterCache;
import org.apache.lucene.facet.taxonomy.writercache.cl2o.Cl2oTaxonomyWriterCache;
import org.apache.lucene.facet.taxonomy.writercache.lru.LruTaxonomyWriterCache;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
@@ -812,6 +811,7 @@ public class DirectoryTaxonomyWriter implements TaxonomyWriter {
}
return parentArray;
}
@Override
public int getParent(int ordinal) throws IOException {
ensureOpen();
@@ -823,158 +823,47 @@ public class DirectoryTaxonomyWriter implements TaxonomyWriter {
}
return getParentArray().getArray()[ordinal];
}
/**
* Take all the categories of one or more given taxonomies, and add them to
* the main taxonomy (this), if they are not already there.
* <P>
* Additionally, fill a <I>mapping</I> for each of the added taxonomies,
* mapping its ordinals to the ordinals in the enlarged main taxonomy.
* These mappings are saved into an array of OrdinalMap objects given by the
* user, one for each of the given taxonomies (not including "this", the main
* taxonomy). Often the first of these will be a MemoryOrdinalMap and the
* others will be a DiskOrdinalMap - see discussion in {@link OrdinalMap}.
* <P>
* Note that the taxonomies to be added are given as Directory objects,
* not opened TaxonomyReader/TaxonomyWriter objects, so if any of them are
* currently managed by an open TaxonomyWriter, make sure to commit() (or
* close()) it first. The main taxonomy (this) is an open TaxonomyWriter,
* and does not need to be commit()ed before this call.
* Takes the categories from the given taxonomy directory, and adds the
* missing ones to this taxonomy. Additionally, it fills the given
* {@link OrdinalMap} with a mapping from the original ordinal to the new
* ordinal.
*/
public void addTaxonomies(Directory[] taxonomies, OrdinalMap[] ordinalMaps) throws IOException {
public void addTaxonomy(Directory taxoDir, OrdinalMap map) throws IOException {
ensureOpen();
// To prevent us stepping on the rest of this class's decisions on when
// to open a reader, and when not, we'll be opening a new reader instead
// of using the existing "reader" object:
IndexReader mainreader = openReader();
// TODO (Facet): can this then go segment-by-segment and avoid MultiDocsEnum etc?
Terms terms = MultiFields.getTerms(mainreader, Consts.FULL);
assert terms != null; // TODO (Facet): explicit check / throw exception?
TermsEnum mainte = terms.iterator(null);
DocsEnum mainde = null;
IndexReader[] otherreaders = new IndexReader[taxonomies.length];
TermsEnum[] othertes = new TermsEnum[taxonomies.length];
DocsEnum[] otherdocsEnum = new DocsEnum[taxonomies.length]; // just for reuse
for (int i=0; i<taxonomies.length; i++) {
otherreaders[i] = DirectoryReader.open(taxonomies[i]);
terms = MultiFields.getTerms(otherreaders[i], Consts.FULL);
assert terms != null; // TODO (Facet): explicit check / throw exception?
othertes[i] = terms.iterator(null);
// Also tell the ordinal maps their expected sizes:
ordinalMaps[i].setSize(otherreaders[i].numDocs());
}
CategoryPath cp = new CategoryPath();
// We keep a "current" cursor over the alphabetically-ordered list of
// categories in each taxonomy. We start the cursor on the first
// (alphabetically) category of each taxonomy:
String currentMain;
String[] currentOthers = new String[taxonomies.length];
currentMain = nextTE(mainte);
int otherTaxonomiesLeft = 0;
for (int i=0; i<taxonomies.length; i++) {
currentOthers[i] = nextTE(othertes[i]);
if (currentOthers[i]!=null) {
otherTaxonomiesLeft++;
}
}
// And then, at each step look at the first (alphabetically) of the
// current taxonomies.
// NOTE: The most efficient way we could have done this is using a
// PriorityQueue. But for simplicity, and assuming that usually we'll
// have a very small number of other taxonomies (often just 1), we use
// a more naive algorithm (o(ntaxonomies) instead of o(ln ntaxonomies)
// per step)
while (otherTaxonomiesLeft>0) {
// TODO: use a pq here
String first=null;
for (int i=0; i<taxonomies.length; i++) {
if (currentOthers[i]==null) continue;
if (first==null || first.compareTo(currentOthers[i])>0) {
first = currentOthers[i];
}
}
int comp = 0;
if (currentMain==null || (comp = currentMain.compareTo(first))>0) {
// If 'first' is before currentMain, or currentMain is null,
// then 'first' is a new category and we need to add it to the
// main taxonomy. Then for all taxonomies with this 'first'
// category, we need to add the new category number to their
// map, and move to the next category in all of them.
IndexReader r = DirectoryReader.open(taxoDir);
try {
final int size = r.numDocs();
final OrdinalMap ordinalMap = map;
ordinalMap.setSize(size);
CategoryPath cp = new CategoryPath();
Terms terms = MultiFields.getTerms(r, Consts.FULL);
TermsEnum te = terms.iterator(null);
Bits liveDocs = MultiFields.getLiveDocs(r);
DocsEnum docs = null;
// we call next() first, to skip the root category which always exists.
while (te.next() != null) {
String value = te.term().utf8ToString();
cp.clear();
cp.add(first, delimiter);
// We can call internalAddCategory() instead of addCategory()
// because we know the category hasn't been seen yet.
int newordinal = internalAddCategory(cp, cp.length());
// TODO (Facet): we already had this term in our hands before, in nextTE...
// // TODO (Facet): no need to make this term?
for (int i=0; i<taxonomies.length; i++) {
if (first.equals(currentOthers[i])) {
// remember the remapping of this ordinal. Note how
// this requires reading a posting list from the index -
// but since we do this in lexical order of terms, just
// like Lucene's merge works, we hope there are few seeks.
// TODO (Facet): is there a quicker way? E.g., not specifying the
// next term by name every time?
otherdocsEnum[i] = othertes[i].docs(MultiFields.getLiveDocs(otherreaders[i]), otherdocsEnum[i], false);
otherdocsEnum[i].nextDoc(); // TODO (Facet): check?
int origordinal = otherdocsEnum[i].docID();
ordinalMaps[i].addMapping(origordinal, newordinal);
// and move to the next category in the i'th taxonomy
currentOthers[i] = nextTE(othertes[i]);
if (currentOthers[i]==null) {
otherTaxonomiesLeft--;
}
}
cp.add(value, Consts.DEFAULT_DELIMITER);
int ordinal = findCategory(cp);
if (ordinal < 0) {
// NOTE: call addCategory so that it works well in a multi-threaded
// environment, in case e.g. a thread just added the category, after
// the findCategory() call above failed to find it.
ordinal = addCategory(cp);
}
} else if (comp==0) {
// 'first' and currentMain are the same, so both the main and some
// other taxonomies need to be moved, but a category doesn't need
// to be added because it already existed in the main taxonomy.
// TODO (Facet): Again, is there a quicker way?
mainde = mainte.docs(MultiFields.getLiveDocs(mainreader), mainde, false);
mainde.nextDoc(); // TODO (Facet): check?
int newordinal = mainde.docID();
currentMain = nextTE(mainte);
for (int i=0; i<taxonomies.length; i++) {
if (first.equals(currentOthers[i])) {
// TODO (Facet): again, is there a quicker way?
otherdocsEnum[i] = othertes[i].docs(MultiFields.getLiveDocs(otherreaders[i]), otherdocsEnum[i], false);
otherdocsEnum[i].nextDoc(); // TODO (Facet): check?
int origordinal = otherdocsEnum[i].docID();
ordinalMaps[i].addMapping(origordinal, newordinal);
// and move to the next category
currentOthers[i] = nextTE(othertes[i]);
if (currentOthers[i]==null) {
otherTaxonomiesLeft--;
}
}
}
} else /* comp > 0 */ {
// The currentMain doesn't appear in any of the other taxonomies -
// we don't need to do anything, just continue to the next one
currentMain = nextTE(mainte);
docs = te.docs(liveDocs, docs, false);
ordinalMap.addMapping(docs.nextDoc(), ordinal);
}
}
// Close all the readers we've opened, and also tell the ordinal maps
// we're done adding to them
mainreader.close();
for (int i=0; i<taxonomies.length; i++) {
otherreaders[i].close();
// We never actually added a mapping for the root ordinal - let's do
// it now, just so that the map is complete (every ordinal between 0
// and size-1 is remapped)
ordinalMaps[i].addMapping(0, 0);
ordinalMaps[i].addDone();
// we must add the root ordinal map, so that the map will be complete
// (otherwise e.g. DiskOrdinalMap may fail because it expects more
// categories to exist in the file).
ordinalMap.addMapping(0, 0);
ordinalMap.addDone();
} finally {
r.close();
}
}
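Taken together, the rewritten method is driven as in the following sketch, which mirrors the new test added below (destDir and srcDir are hypothetical directories):

DirectoryTaxonomyWriter destTW = new DirectoryTaxonomyWriter(destDir);
OrdinalMap map = new DirectoryTaxonomyWriter.MemoryOrdinalMap(); // or a DiskOrdinalMap for very large taxonomies
destTW.addTaxonomy(srcDir, map);   // adds missing categories; fills map with source -> dest ordinals
int[] remap = map.getMap();        // remap[srcOrdinal] == ordinal in the merged taxonomy
destTW.close();

The int[] returned by getMap() is what FacetsPayloadProcessorProvider consumes (see the TaxonomyMergeUtils hunk above) when the matching search indexes are merged.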
@@ -1113,13 +1002,6 @@ public class DirectoryTaxonomyWriter implements TaxonomyWriter {
}
}
private static final String nextTE(TermsEnum te) throws IOException {
if (te.next() != null) {
return te.term().utf8ToString(); // TODO (Facet): avoid String creation/use Bytes?
}
return null;
}
/**
* Rollback changes to the taxonomy writer and closes the instance. Following
* this method the instance becomes unusable (calling any of its API methods


@@ -1,254 +0,0 @@
package org.apache.lucene.facet.taxonomy.directory;
import java.io.File;
import org.apache.lucene.store.Directory;
import org.junit.Test;
import org.apache.lucene.util.IOUtils;
import org.apache.lucene.util.LuceneTestCase;
import org.apache.lucene.util._TestUtil;
import org.apache.lucene.facet.taxonomy.CategoryPath;
import org.apache.lucene.facet.taxonomy.TaxonomyReader;
import org.apache.lucene.facet.taxonomy.directory.DirectoryTaxonomyReader;
import org.apache.lucene.facet.taxonomy.directory.DirectoryTaxonomyWriter;
import org.apache.lucene.facet.taxonomy.directory.DirectoryTaxonomyWriter.DiskOrdinalMap;
import org.apache.lucene.facet.taxonomy.directory.DirectoryTaxonomyWriter.MemoryOrdinalMap;
import org.apache.lucene.facet.taxonomy.directory.DirectoryTaxonomyWriter.OrdinalMap;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
public class TestAddTaxonomies extends LuceneTestCase {
@Test
public void test1() throws Exception {
Directory dir1 = newDirectory();
DirectoryTaxonomyWriter tw1 = new DirectoryTaxonomyWriter(dir1);
tw1.addCategory(new CategoryPath("Author", "Mark Twain"));
tw1.addCategory(new CategoryPath("Animals", "Dog"));
Directory dir2 = newDirectory();
DirectoryTaxonomyWriter tw2 = new DirectoryTaxonomyWriter(dir2);
tw2.addCategory(new CategoryPath("Author", "Rob Pike"));
tw2.addCategory(new CategoryPath("Aardvarks", "Bob"));
tw2.close();
Directory dir3 = newDirectory();
DirectoryTaxonomyWriter tw3 = new DirectoryTaxonomyWriter(dir3);
tw3.addCategory(new CategoryPath("Author", "Zebra Smith"));
tw3.addCategory(new CategoryPath("Aardvarks", "Bob"));
tw3.addCategory(new CategoryPath("Aardvarks", "Aaron"));
tw3.close();
MemoryOrdinalMap[] maps = new MemoryOrdinalMap[2];
maps[0] = new MemoryOrdinalMap();
maps[1] = new MemoryOrdinalMap();
tw1.addTaxonomies(new Directory[] { dir2, dir3 }, maps);
tw1.close();
TaxonomyReader tr = new DirectoryTaxonomyReader(dir1);
// Test that the merged taxonomy now contains what we expect:
// First all the categories of the original taxonomy, in their original order:
assertEquals(tr.getPath(0).toString(), "");
assertEquals(tr.getPath(1).toString(), "Author");
assertEquals(tr.getPath(2).toString(), "Author/Mark Twain");
assertEquals(tr.getPath(3).toString(), "Animals");
assertEquals(tr.getPath(4).toString(), "Animals/Dog");
// Then the categories new in the new taxonomy, in alphabetical order:
assertEquals(tr.getPath(5).toString(), "Aardvarks");
assertEquals(tr.getPath(6).toString(), "Aardvarks/Aaron");
assertEquals(tr.getPath(7).toString(), "Aardvarks/Bob");
assertEquals(tr.getPath(8).toString(), "Author/Rob Pike");
assertEquals(tr.getPath(9).toString(), "Author/Zebra Smith");
assertEquals(tr.getSize(), 10);
// Test that the maps contain what we expect
int[] map0 = maps[0].getMap();
assertEquals(5, map0.length);
assertEquals(0, map0[0]);
assertEquals(1, map0[1]);
assertEquals(8, map0[2]);
assertEquals(5, map0[3]);
assertEquals(7, map0[4]);
int[] map1 = maps[1].getMap();
assertEquals(6, map1.length);
assertEquals(0, map1[0]);
assertEquals(1, map1[1]);
assertEquals(9, map1[2]);
assertEquals(5, map1[3]);
assertEquals(7, map1[4]);
assertEquals(6, map1[5]);
tr.close();
dir1.close();
dir2.close();
dir3.close();
}
// a reasonable random test
public void testmedium() throws Exception {
int numTests = atLeast(3);
for (int i = 0; i < numTests; i++) {
dotest(_TestUtil.nextInt(random(), 1, 10),
_TestUtil.nextInt(random(), 1, 100),
_TestUtil.nextInt(random(), 100, 1000),
random().nextBoolean());
}
}
// A more comprehensive and big random test.
@Test @Nightly
public void testbig() throws Exception {
dotest(2, 1000, 5000, false);
dotest(10, 10000, 100, false);
dotest(50, 20, 100, false);
dotest(10, 1000, 10000, false);
dotest(50, 20, 10000, false);
dotest(1, 20, 10000, false);
dotest(10, 1, 10000, false);
dotest(10, 1000, 20000, true);
}
private void dotest(int ntaxonomies, int ncats, int range, boolean disk) throws Exception {
Directory dirs[] = new Directory[ntaxonomies];
Directory copydirs[] = new Directory[ntaxonomies];
for (int i=0; i<ntaxonomies; i++) {
dirs[i] = newDirectory();
copydirs[i] = newDirectory();
DirectoryTaxonomyWriter tw = new DirectoryTaxonomyWriter(dirs[i]);
DirectoryTaxonomyWriter copytw = new DirectoryTaxonomyWriter(copydirs[i]);
for (int j=0; j<ncats; j++) {
String cat = Integer.toString(random().nextInt(range));
tw.addCategory(new CategoryPath("a",cat));
copytw.addCategory(new CategoryPath("a",cat));
}
// System.err.println("Taxonomy "+i+": "+tw.getSize());
tw.close();
copytw.close();
}
DirectoryTaxonomyWriter tw = new DirectoryTaxonomyWriter(dirs[0]);
Directory otherdirs[] = new Directory[ntaxonomies-1];
System.arraycopy(dirs, 1, otherdirs, 0, ntaxonomies-1);
OrdinalMap[] maps = new OrdinalMap[ntaxonomies-1];
if (ntaxonomies>1) {
for (int i=0; i<ntaxonomies-1; i++) {
if (disk) {
// TODO: use a LTC tempfile
maps[i] = new DiskOrdinalMap(new File(System.getProperty("java.io.tmpdir"),
"tmpmap"+i));
} else {
maps[i] = new MemoryOrdinalMap();
}
}
}
tw.addTaxonomies(otherdirs, maps);
// System.err.println("Merged taxonomy: "+tw.getSize());
tw.close();
// Check that all original categories in the main taxonomy remain
// unchanged, and the rest of the taxonomies are completely unchanged.
for (int i=0; i<ntaxonomies; i++) {
TaxonomyReader tr = new DirectoryTaxonomyReader(dirs[i]);
TaxonomyReader copytr = new DirectoryTaxonomyReader(copydirs[i]);
if (i==0) {
assertTrue(tr.getSize() >= copytr.getSize());
} else {
assertEquals(copytr.getSize(), tr.getSize());
}
for (int j=0; j<copytr.getSize(); j++) {
String expected = copytr.getPath(j).toString();
String got = tr.getPath(j).toString();
assertTrue("Comparing category "+j+" of taxonomy "+i+": expected "+expected+", got "+got,
expected.equals(got));
}
tr.close();
copytr.close();
}
// Check that all the new categories in the main taxonomy are in
// lexicographic order. This isn't a requirement of our API, but happens
// this way in our current implementation.
TaxonomyReader tr = new DirectoryTaxonomyReader(dirs[0]);
TaxonomyReader copytr = new DirectoryTaxonomyReader(copydirs[0]);
if (tr.getSize() > copytr.getSize()) {
String prev = tr.getPath(copytr.getSize()).toString();
for (int j=copytr.getSize()+1; j<tr.getSize(); j++) {
String n = tr.getPath(j).toString();
assertTrue(prev.compareTo(n)<0);
prev=n;
}
}
int oldsize = copytr.getSize(); // remember for later
tr.close();
copytr.close();
// Check that all the categories from other taxonomies exist in the new
// taxonomy.
TaxonomyReader main = new DirectoryTaxonomyReader(dirs[0]);
for (int i=1; i<ntaxonomies; i++) {
TaxonomyReader other = new DirectoryTaxonomyReader(dirs[i]);
for (int j=0; j<other.getSize(); j++) {
int otherord = main.getOrdinal(other.getPath(j));
assertTrue(otherord != TaxonomyReader.INVALID_ORDINAL);
}
other.close();
}
// Check that all the new categories in the merged taxonomy exist in
// one of the added taxonomies.
TaxonomyReader[] others = new TaxonomyReader[ntaxonomies-1];
for (int i=1; i<ntaxonomies; i++) {
others[i-1] = new DirectoryTaxonomyReader(dirs[i]);
}
for (int j=oldsize; j<main.getSize(); j++) {
boolean found=false;
CategoryPath path = main.getPath(j);
for (int i=1; i<ntaxonomies; i++) {
if (others[i-1].getOrdinal(path) != TaxonomyReader.INVALID_ORDINAL) {
found=true;
break;
}
}
if (!found) {
fail("Found category "+j+" ("+path+") in merged taxonomy not in any of the separate ones");
}
}
// Check that all the maps are correct
for (int i=0; i<ntaxonomies-1; i++) {
int[] map = maps[i].getMap();
for (int j=0; j<map.length; j++) {
assertEquals(map[j], main.getOrdinal(others[i].getPath(j)));
}
}
for (int i=1; i<ntaxonomies; i++) {
others[i-1].close();
}
main.close();
IOUtils.close(dirs);
IOUtils.close(copydirs);
}
}


@@ -0,0 +1,228 @@
package org.apache.lucene.facet.taxonomy.directory;
import java.io.IOException;
import java.util.HashSet;
import java.util.Random;
import org.apache.lucene.facet.taxonomy.CategoryPath;
import org.apache.lucene.facet.taxonomy.directory.DirectoryTaxonomyWriter.DiskOrdinalMap;
import org.apache.lucene.facet.taxonomy.directory.DirectoryTaxonomyWriter.MemoryOrdinalMap;
import org.apache.lucene.facet.taxonomy.directory.DirectoryTaxonomyWriter.OrdinalMap;
import org.apache.lucene.store.Directory;
import org.apache.lucene.util.IOUtils;
import org.apache.lucene.util.LuceneTestCase;
import org.apache.lucene.util._TestUtil;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
public class TestAddTaxonomy extends LuceneTestCase {
private void dotest(int ncats, int range) throws Exception {
Directory dirs[] = new Directory[2];
Random random = random();
for (int i = 0; i < dirs.length; i++) {
dirs[i] = newDirectory();
DirectoryTaxonomyWriter tw = new DirectoryTaxonomyWriter(dirs[i]);
for (int j = 0; j < ncats; j++) {
String cat = Integer.toString(random.nextInt(range));
tw.addCategory(new CategoryPath("a", cat));
}
tw.close();
}
DirectoryTaxonomyWriter tw = new DirectoryTaxonomyWriter(dirs[0]);
OrdinalMap map = randomOrdinalMap();
tw.addTaxonomy(dirs[1], map);
tw.close();
validate(dirs[0], dirs[1], map);
IOUtils.close(dirs);
}
private OrdinalMap randomOrdinalMap() throws IOException {
if (random().nextBoolean()) {
return new DiskOrdinalMap(_TestUtil.createTempFile("taxoMap", "", TEMP_DIR));
} else {
return new MemoryOrdinalMap();
}
}
private void validate(Directory dest, Directory src, OrdinalMap ordMap) throws Exception {
CategoryPath cp = new CategoryPath();
DirectoryTaxonomyReader destTR = new DirectoryTaxonomyReader(dest);
try {
final int destSize = destTR.getSize();
DirectoryTaxonomyReader srcTR = new DirectoryTaxonomyReader(src);
try {
int[] map = ordMap.getMap();
// validate taxo sizes
int srcSize = srcTR.getSize();
assertTrue("destination taxonomy expected to be larger than source; dest="
+ destSize + " src=" + srcSize,
destSize >= srcSize);
// validate that all source categories exist in destination, and their
// ordinals are as expected.
for (int j = 1; j < srcSize; j++) {
srcTR.getPath(j, cp);
int destOrdinal = destTR.getOrdinal(cp);
assertTrue(cp + " not found in destination", destOrdinal > 0);
assertEquals(destOrdinal, map[j]);
}
} finally {
srcTR.close();
}
} finally {
destTR.close();
}
}
public void testAddEmpty() throws Exception {
Directory dest = newDirectory();
DirectoryTaxonomyWriter destTW = new DirectoryTaxonomyWriter(dest);
destTW.addCategory(new CategoryPath("Author", "Rob Pike"));
destTW.addCategory(new CategoryPath("Aardvarks", "Bob"));
destTW.commit();
Directory src = newDirectory();
new DirectoryTaxonomyWriter(src).close(); // create an empty taxonomy
OrdinalMap map = randomOrdinalMap();
destTW.addTaxonomy(src, map);
destTW.close();
validate(dest, src, map);
IOUtils.close(dest, src);
}
public void testAddToEmpty() throws Exception {
Directory dest = newDirectory();
Directory src = newDirectory();
DirectoryTaxonomyWriter srcTW = new DirectoryTaxonomyWriter(src);
srcTW.addCategory(new CategoryPath("Author", "Rob Pike"));
srcTW.addCategory(new CategoryPath("Aardvarks", "Bob"));
srcTW.close();
DirectoryTaxonomyWriter destTW = new DirectoryTaxonomyWriter(dest);
OrdinalMap map = randomOrdinalMap();
destTW.addTaxonomy(src, map);
destTW.close();
validate(dest, src, map);
IOUtils.close(dest, src);
}
// A more comprehensive and big random test.
@Nightly
public void testBig() throws Exception {
dotest(200, 10000);
dotest(1000, 20000);
// really big
dotest(400000, 1000000);
}
// a reasonable random test
public void testMedium() throws Exception {
Random random = random();
int numTests = atLeast(3);
for (int i = 0; i < numTests; i++) {
dotest(_TestUtil.nextInt(random, 2, 100),
_TestUtil.nextInt(random, 100, 1000));
}
}
public void testSimple() throws Exception {
Directory dest = newDirectory();
DirectoryTaxonomyWriter tw1 = new DirectoryTaxonomyWriter(dest);
tw1.addCategory(new CategoryPath("Author", "Mark Twain"));
tw1.addCategory(new CategoryPath("Animals", "Dog"));
tw1.addCategory(new CategoryPath("Author", "Rob Pike"));
Directory src = newDirectory();
DirectoryTaxonomyWriter tw2 = new DirectoryTaxonomyWriter(src);
tw2.addCategory(new CategoryPath("Author", "Rob Pike"));
tw2.addCategory(new CategoryPath("Aardvarks", "Bob"));
tw2.close();
OrdinalMap map = randomOrdinalMap();
tw1.addTaxonomy(src, map);
tw1.close();
validate(dest, src, map);
IOUtils.close(dest, src);
}
public void testConcurrency() throws Exception {
// tests that addTaxonomy and addCategory work in parallel
final int numCategories = atLeast(5000);
// build an input taxonomy index
Directory src = newDirectory();
DirectoryTaxonomyWriter tw = new DirectoryTaxonomyWriter(src);
for (int i = 0; i < numCategories; i++) {
tw.addCategory(new CategoryPath("a", Integer.toString(i)));
}
tw.close();
// now add the taxonomy to an empty taxonomy, while adding the categories
// again, in parallel -- in the end, no duplicate categories should exist.
Directory dest = newDirectory();
final DirectoryTaxonomyWriter destTW = new DirectoryTaxonomyWriter(dest);
Thread t = new Thread() {
@Override
public void run() {
for (int i = 0; i < numCategories; i++) {
try {
destTW.addCategory(new CategoryPath("a", Integer.toString(i)));
} catch (IOException e) {
// shouldn't happen - if it does, let the test fail on uncaught exception.
throw new RuntimeException(e);
}
}
}
};
t.start();
OrdinalMap map = new MemoryOrdinalMap();
destTW.addTaxonomy(src, map);
t.join();
destTW.close();
// now validate
DirectoryTaxonomyReader dtr = new DirectoryTaxonomyReader(dest);
// +2 to account for the root category + "a"
assertEquals(numCategories + 2, dtr.getSize());
HashSet<CategoryPath> categories = new HashSet<CategoryPath>();
for (int i = 1; i < dtr.getSize(); i++) {
CategoryPath cat = dtr.getPath(i);
assertTrue("category " + cat + " already existed", categories.add(cat));
}
dtr.close();
IOUtils.close(src, dest);
}
}