LUCENE-5985: add id for each segment and commit to aid replication

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1619620 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Michael McCandless 2014-08-21 22:58:30 +00:00
parent 271af418c5
commit 1d5d73eefc
14 changed files with 237 additions and 28 deletions

View File

@ -98,6 +98,11 @@ Other
======================= Lucene 4.11.0 ======================
(No Changes)
New Features
* LUCENE-5895: Lucene now stores a unique id per-segment and per-commit to aid
in accurate replication of index files (Robert Muir, Mike McCandless)
======================= Lucene 4.10.0 ======================
New Features

View File

@ -17,15 +17,6 @@ package org.apache.lucene.codecs.simpletext;
* limitations under the License.
*/
import static org.apache.lucene.codecs.simpletext.SimpleTextSegmentInfoWriter.SI_DIAG_KEY;
import static org.apache.lucene.codecs.simpletext.SimpleTextSegmentInfoWriter.SI_DIAG_VALUE;
import static org.apache.lucene.codecs.simpletext.SimpleTextSegmentInfoWriter.SI_DOCCOUNT;
import static org.apache.lucene.codecs.simpletext.SimpleTextSegmentInfoWriter.SI_FILE;
import static org.apache.lucene.codecs.simpletext.SimpleTextSegmentInfoWriter.SI_NUM_DIAG;
import static org.apache.lucene.codecs.simpletext.SimpleTextSegmentInfoWriter.SI_NUM_FILES;
import static org.apache.lucene.codecs.simpletext.SimpleTextSegmentInfoWriter.SI_USECOMPOUND;
import static org.apache.lucene.codecs.simpletext.SimpleTextSegmentInfoWriter.SI_VERSION;
import java.io.IOException;
import java.nio.charset.StandardCharsets;
import java.util.HashMap;
@ -44,6 +35,16 @@ import org.apache.lucene.util.IOUtils;
import org.apache.lucene.util.StringHelper;
import org.apache.lucene.util.Version;
import static org.apache.lucene.codecs.simpletext.SimpleTextSegmentInfoWriter.SI_DIAG_KEY;
import static org.apache.lucene.codecs.simpletext.SimpleTextSegmentInfoWriter.SI_DIAG_VALUE;
import static org.apache.lucene.codecs.simpletext.SimpleTextSegmentInfoWriter.SI_DOCCOUNT;
import static org.apache.lucene.codecs.simpletext.SimpleTextSegmentInfoWriter.SI_FILE;
import static org.apache.lucene.codecs.simpletext.SimpleTextSegmentInfoWriter.SI_NUM_DIAG;
import static org.apache.lucene.codecs.simpletext.SimpleTextSegmentInfoWriter.SI_NUM_FILES;
import static org.apache.lucene.codecs.simpletext.SimpleTextSegmentInfoWriter.SI_USECOMPOUND;
import static org.apache.lucene.codecs.simpletext.SimpleTextSegmentInfoWriter.SI_ID;
import static org.apache.lucene.codecs.simpletext.SimpleTextSegmentInfoWriter.SI_VERSION;
/**
* reads plaintext segments files
* <p>
@ -99,10 +100,14 @@ public class SimpleTextSegmentInfoReader extends SegmentInfoReader {
files.add(fileName);
}
SimpleTextUtil.readLine(input, scratch);
assert StringHelper.startsWith(scratch.get(), SI_ID);
final String id = readString(SI_ID.length, scratch);
SimpleTextUtil.checkFooter(input);
SegmentInfo info = new SegmentInfo(directory, version, segmentName, docCount,
isCompoundFile, null, diagnostics);
SegmentInfo info = new SegmentInfo(directory, version, segmentName, docCount,
isCompoundFile, null, diagnostics, id);
info.setFiles(files);
success = true;
return info;

View File

@ -48,6 +48,7 @@ public class SimpleTextSegmentInfoWriter extends SegmentInfoWriter {
final static BytesRef SI_DIAG_VALUE = new BytesRef(" value ");
final static BytesRef SI_NUM_FILES = new BytesRef(" files ");
final static BytesRef SI_FILE = new BytesRef(" file ");
final static BytesRef SI_ID = new BytesRef(" id ");
@Override
public void write(Directory dir, SegmentInfo si, FieldInfos fis, IOContext ioContext) throws IOException {
@ -104,6 +105,10 @@ public class SimpleTextSegmentInfoWriter extends SegmentInfoWriter {
SimpleTextUtil.writeNewline(output);
}
}
SimpleTextUtil.write(output, SI_ID);
SimpleTextUtil.write(output, si.getId(), scratch);
SimpleTextUtil.writeNewline(output);
SimpleTextUtil.writeChecksum(output, scratch);
success = true;

View File

@ -31,7 +31,7 @@ import org.apache.lucene.store.DataOutput; // javadocs
* <p>
* Files:
* <ul>
* <li><tt>.si</tt>: Header, SegVersion, SegSize, IsCompoundFile, Diagnostics, Files, Footer
* <li><tt>.si</tt>: Header, SegVersion, SegSize, IsCompoundFile, Diagnostics, Files, Id, Footer
* </ul>
* </p>
* Data types:
@ -44,6 +44,7 @@ import org.apache.lucene.store.DataOutput; // javadocs
* <li>Diagnostics --&gt; {@link DataOutput#writeStringStringMap Map&lt;String,String&gt;}</li>
* <li>IsCompoundFile --&gt; {@link DataOutput#writeByte Int8}</li>
* <li>Footer --&gt; {@link CodecUtil#writeFooter CodecFooter}</li>
* <li>Id --&gt; {@link DataOutput#writeString String}</li>
* </ul>
* </p>
* Field Descriptions:
@ -88,5 +89,6 @@ public class Lucene46SegmentInfoFormat extends SegmentInfoFormat {
static final String CODEC_NAME = "Lucene46SegmentInfo";
static final int VERSION_START = 0;
static final int VERSION_CHECKSUM = 1;
static final int VERSION_CURRENT = VERSION_CHECKSUM;
static final int VERSION_ID = 2;
static final int VERSION_CURRENT = VERSION_ID;
}

View File

@ -62,13 +62,20 @@ public class Lucene46SegmentInfoReader extends SegmentInfoReader {
final Map<String,String> diagnostics = input.readStringStringMap();
final Set<String> files = input.readStringSet();
String id;
if (codecVersion >= Lucene46SegmentInfoFormat.VERSION_ID) {
id = input.readString();
} else {
id = null;
}
if (codecVersion >= Lucene46SegmentInfoFormat.VERSION_CHECKSUM) {
CodecUtil.checkFooter(input);
} else {
CodecUtil.checkEOF(input);
}
final SegmentInfo si = new SegmentInfo(dir, version, segment, docCount, isCompoundFile, null, diagnostics);
final SegmentInfo si = new SegmentInfo(dir, version, segment, docCount, isCompoundFile, null, diagnostics, id);
si.setFiles(files);
success = true;

View File

@ -59,6 +59,7 @@ public class Lucene46SegmentInfoWriter extends SegmentInfoWriter {
output.writeByte((byte) (si.getUseCompoundFile() ? SegmentInfo.YES : SegmentInfo.NO));
output.writeStringStringMap(si.getDiagnostics());
output.writeStringSet(si.files());
output.writeString(si.getId());
CodecUtil.writeFooter(output);
success = true;
} finally {

View File

@ -484,7 +484,7 @@ public class CheckIndex {
}
msg(infoStream, "Segments file=" + segmentsFileName + " numSegments=" + numSegments
+ " " + versionString + " format=" + sFormat + userDataString);
+ " " + versionString + " id=" + sis.getId() + " format=" + sFormat + userDataString);
if (onlySegments != null) {
result.partial = true;
@ -535,6 +535,7 @@ public class CheckIndex {
try {
msg(infoStream, " version=" + (version == null ? "3.0" : version));
msg(infoStream, " id=" + info.info.getId());
final Codec codec = info.info.getCodec();
msg(infoStream, " codec=" + codec);
segInfoStat.codec = codec;

View File

@ -39,6 +39,7 @@ import org.apache.lucene.util.InfoStream;
import org.apache.lucene.util.IntBlockPool;
import org.apache.lucene.util.MutableBits;
import org.apache.lucene.util.RamUsageEstimator;
import org.apache.lucene.util.StringHelper;
import org.apache.lucene.util.Version;
import static org.apache.lucene.util.ByteBlockPool.BYTE_BLOCK_MASK;
@ -178,7 +179,7 @@ class DocumentsWriterPerThread {
pendingUpdates.clear();
deleteSlice = deleteQueue.newSlice();
segmentInfo = new SegmentInfo(directoryOrig, Version.LATEST, segmentName, -1, false, codec, null);
segmentInfo = new SegmentInfo(directoryOrig, Version.LATEST, segmentName, -1, false, codec, null, StringHelper.randomId());
assert numDocsInRAM == 0;
if (INFO_VERBOSE && infoStream.isEnabled("DWPT")) {
infoStream.message("DWPT", Thread.currentThread().getName() + " init seg=" + segmentName + " delQueue=" + deleteQueue);

View File

@ -63,6 +63,7 @@ import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.Constants;
import org.apache.lucene.util.IOUtils;
import org.apache.lucene.util.InfoStream;
import org.apache.lucene.util.StringHelper;
import org.apache.lucene.util.ThreadInterruptedException;
import org.apache.lucene.util.Version;
@ -2566,7 +2567,7 @@ public class IndexWriter implements Closeable, TwoPhaseCommit, Accountable {
TrackingDirectoryWrapper trackingDir = new TrackingDirectoryWrapper(directory);
SegmentInfo info = new SegmentInfo(directory, Version.LATEST, mergedName, -1,
false, codec, null);
false, codec, null, StringHelper.randomId());
SegmentMerger merger = new SegmentMerger(mergeReaders, info, infoStream, trackingDir,
MergeState.CheckAbort.NONE, globalFieldNumberMap,
@ -2667,7 +2668,7 @@ public class IndexWriter implements Closeable, TwoPhaseCommit, Accountable {
// Same SI as before but we change directory and name
SegmentInfo newInfo = new SegmentInfo(directory, info.info.getVersion(), segName, info.info.getDocCount(),
info.info.getUseCompoundFile(), info.info.getCodec(),
info.info.getDiagnostics());
info.info.getDiagnostics(), StringHelper.randomId());
SegmentCommitInfo newInfoPerCommit = new SegmentCommitInfo(newInfo,
info.getDelCount(), info.getDelGen(), info.getFieldInfosGen(),
info.getDocValuesGen());
@ -3789,7 +3790,7 @@ public class IndexWriter implements Closeable, TwoPhaseCommit, Accountable {
// ConcurrentMergePolicy we keep deterministic segment
// names.
final String mergeSegmentName = newSegmentName();
SegmentInfo si = new SegmentInfo(directory, Version.LATEST, mergeSegmentName, -1, false, codec, null);
SegmentInfo si = new SegmentInfo(directory, Version.LATEST, mergeSegmentName, -1, false, codec, null, StringHelper.randomId());
Map<String,String> details = new HashMap<>();
details.put("mergeMaxNumSegments", "" + merge.maxNumSegments);
details.put("mergeFactor", Integer.toString(merge.segments.size()));

View File

@ -27,7 +27,6 @@ import java.util.regex.Matcher;
import org.apache.lucene.codecs.Codec;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.TrackingDirectoryWrapper;
import org.apache.lucene.util.Constants;
import org.apache.lucene.util.StringHelper;
import org.apache.lucene.util.Version;
@ -58,10 +57,13 @@ public final class SegmentInfo {
private boolean isCompoundFile;
/** Id that uniquely identifies this segment. */
private final String id;
private Codec codec;
private Map<String,String> diagnostics;
// Tracks the Lucene version this segment was created with, since 3.1. Null
// indicates an older than 3.0 index, and it's used to detect a too old index.
// The format expected is "x.y" - "2.x" for pre-3.0 indexes (or null), and
@ -79,13 +81,23 @@ public final class SegmentInfo {
return diagnostics;
}
/**
* Construct a new complete SegmentInfo instance from
* input, with a newly generated random id.
*/
public SegmentInfo(Directory dir, Version version, String name, int docCount,
boolean isCompoundFile, Codec codec, Map<String,String> diagnostics) {
this(dir, version, name, docCount, isCompoundFile, codec, diagnostics, null);
}
/**
* Construct a new complete SegmentInfo instance from input.
* <p>Note: this is public only to allow access from
* the codecs package.</p>
*/
public SegmentInfo(Directory dir, Version version, String name, int docCount,
boolean isCompoundFile, Codec codec, Map<String,String> diagnostics) {
boolean isCompoundFile, Codec codec, Map<String,String> diagnostics,
String id) {
assert !(dir instanceof TrackingDirectoryWrapper);
this.dir = dir;
this.version = version;
@ -94,6 +106,7 @@ public final class SegmentInfo {
this.isCompoundFile = isCompoundFile;
this.codec = codec;
this.diagnostics = diagnostics;
this.id = id;
}
/**
@ -212,6 +225,11 @@ public final class SegmentInfo {
return version;
}
/** Return the id that uniquely identifies this segment. */
public String getId() {
return id;
}
private Set<String> setFiles;
/** Sets the files written for this segment. */

View File

@ -27,8 +27,8 @@ import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Map.Entry;
import java.util.Map;
import java.util.Set;
import org.apache.lucene.codecs.Codec;
@ -43,6 +43,7 @@ import org.apache.lucene.store.IOContext;
import org.apache.lucene.store.IndexOutput;
import org.apache.lucene.store.NoSuchDirectoryException;
import org.apache.lucene.util.IOUtils;
import org.apache.lucene.util.StringHelper;
/**
* A collection of segmentInfo objects with methods for operating on those
@ -137,6 +138,9 @@ public final class SegmentInfos implements Cloneable, Iterable<SegmentCommitInfo
/** The file format version for the segments_N codec header, since 4.9+ */
public static final int VERSION_49 = 3;
/** The file format version for the segments_N codec header, since 4.10+ */
public static final int VERSION_410 = 4;
// Used for the segments.gen file only!
// Whenever you add a new format, make it 1 smaller (negative version logic)!
private static final int FORMAT_SEGMENTS_GEN_47 = -2;
@ -167,6 +171,9 @@ public final class SegmentInfos implements Cloneable, Iterable<SegmentCommitInfo
*/
private static PrintStream infoStream = null;
/** Id for this commit; only written starting with Lucene 4.10 */
private String id;
/** Sole constructor. Typically you call this and then
* use {@link #read(Directory) or
* #read(Directory,String)} to populate each {@link
@ -317,6 +324,12 @@ public final class SegmentInfos implements Cloneable, Iterable<SegmentCommitInfo
nextGeneration);
}
/** Since Lucene 4.10, every commit (segments_N) writes a unique id. This will
* return that id, or null if this commit was pre-4.10. */
public String getId() {
return id;
}
/**
* Read a particular segmentFileName. Note that this may
* throw an IOException if a commit is in process.
@ -345,7 +358,7 @@ public final class SegmentInfos implements Cloneable, Iterable<SegmentCommitInfo
throw new IndexFormatTooOldException(input, magic, CodecUtil.CODEC_MAGIC, CodecUtil.CODEC_MAGIC);
}
// 4.0+
int format = CodecUtil.checkHeaderNoMagic(input, "segments", VERSION_40, VERSION_49);
int format = CodecUtil.checkHeaderNoMagic(input, "segments", VERSION_40, VERSION_410);
version = input.readLong();
counter = input.readInt();
int numSegments = input.readInt();
@ -410,6 +423,9 @@ public final class SegmentInfos implements Cloneable, Iterable<SegmentCommitInfo
add(siPerCommit);
}
userData = input.readStringStringMap();
if (format >= VERSION_410) {
id = input.readString();
}
if (format >= VERSION_48) {
CodecUtil.checkFooter(input);
@ -470,7 +486,7 @@ public final class SegmentInfos implements Cloneable, Iterable<SegmentCommitInfo
try {
segnOutput = directory.createOutput(segmentFileName, IOContext.DEFAULT);
CodecUtil.writeHeader(segnOutput, "segments", VERSION_49);
CodecUtil.writeHeader(segnOutput, "segments", VERSION_410);
segnOutput.writeLong(version);
segnOutput.writeInt(counter); // write counter
segnOutput.writeInt(size()); // write infos
@ -496,6 +512,7 @@ public final class SegmentInfos implements Cloneable, Iterable<SegmentCommitInfo
assert si.dir == directory;
}
segnOutput.writeStringStringMap(userData);
segnOutput.writeString(StringHelper.randomId());
pendingSegnOutput = segnOutput;
success = true;
} finally {

View File

@ -17,8 +17,9 @@ package org.apache.lucene.util;
* limitations under the License.
*/
import java.util.Comparator;
import java.util.StringTokenizer;
import java.math.BigInteger;
import java.util.Arrays;
import java.util.Properties;
/**
* Methods for manipulating strings.
@ -228,4 +229,88 @@ public abstract class StringHelper {
public static int murmurhash3_x86_32(BytesRef bytes, int seed) {
return murmurhash3_x86_32(bytes.bytes, bytes.offset, bytes.length, seed);
}
// Holds 128 bit unsigned value:
private static BigInteger nextId;
private static final BigInteger idMask;
private static final Object idLock = new Object();
private static final String idPad = "00000000000000000000000000000000";
static {
byte[] maskBytes = new byte[16];
Arrays.fill(maskBytes, (byte) 0xff);
idMask = new BigInteger(maskBytes);
String prop = System.getProperty("tests.seed");
// State for xorshift128:
long x0;
long x1;
long seed;
if (prop != null) {
// So if there is a test failure that somehow relied on this id,
// we remain reproducible based on the test seed:
if (prop.length() > 8) {
prop = prop.substring(prop.length()-8);
}
x0 = Long.parseLong(prop, 16);
x1 = x0;
} else {
// "Ghetto randomess" from 3 different sources:
x0 = System.nanoTime();
x1 = StringHelper.class.hashCode() << 32;
StringBuilder sb = new StringBuilder();
// Properties can vary across JVM instances:
Properties p = System.getProperties();
for (String s: p.stringPropertyNames()) {
sb.append(s);
sb.append(p.getProperty(s));
}
x1 |= sb.toString().hashCode();
// TODO: maybe read from /dev/urandom when it's available?
}
// Use a few iterations of xorshift128 to scatter the seed
// in case multiple Lucene instances starting up "near" the same
// nanoTime, since we use ++ (mod 2^128) for full period cycle:
for(int i=0;i<10;i++) {
long s1 = x0;
long s0 = x1;
x0 = s0;
s1 ^= s1 << 23; // a
x1 = s1 ^ s0 ^ (s1 >>> 17) ^ (s0 >>> 26); // b, c
}
// Concatentate bits of x0 and x1, as unsigned 128 bit integer:
nextId = new BigInteger(1, BigInteger.valueOf(x0).shiftLeft(64).or(BigInteger.valueOf(x1)).toByteArray());
}
/** Generates a non-cryptographic globally unique id. */
public static String randomId() {
// NOTE: we don't use Java's UUID.randomUUID() implementation here because:
//
// * It's overkill for our usage: it tries to be cryptographically
// secure, whereas for this use we don't care if someone can
// guess the IDs.
//
// * It uses SecureRandom, which on Linux can easily take a long time
// (I saw ~ 10 seconds just running a Lucene test) when entropy
// harvesting is falling behind.
//
// * It loses a few (6) bits to version and variant and it's not clear
// what impact that has on the period, whereas the simple ++ (mod 2^128)
// we use here is guaranteed to have the full period.
String id;
synchronized(idLock) {
id = nextId.toString(16);
nextId = nextId.add(BigInteger.ONE).and(idMask);
}
assert id.length() <= 32: "id=" + id;
id = idPad.substring(id.length()) + id;
return id;
}
}

View File

@ -81,11 +81,12 @@ import org.apache.lucene.util.IOUtils;
import org.apache.lucene.util.InfoStream;
import org.apache.lucene.util.LuceneTestCase;
import org.apache.lucene.util.SetOnce;
import org.apache.lucene.util.StringHelper;
import org.apache.lucene.util.TestUtil;
import org.apache.lucene.util.ThreadInterruptedException;
import org.apache.lucene.util.automaton.Automata;
import org.apache.lucene.util.automaton.CharacterRunAutomaton;
import org.apache.lucene.util.automaton.Automaton;
import org.apache.lucene.util.automaton.CharacterRunAutomaton;
import org.junit.Test;
public class TestIndexWriter extends LuceneTestCase {
@ -2819,4 +2820,55 @@ public class TestIndexWriter extends LuceneTestCase {
iw.close();
dir.close();
}
// LUCENE-5895:
/** Make sure we see ids per segment and per commit. */
public void testIds() throws Exception {
Directory d = newDirectory();
IndexWriter w = new IndexWriter(d, newIndexWriterConfig(new MockAnalyzer(random())));
w.addDocument(new Document());
w.close();
SegmentInfos sis = new SegmentInfos();
sis.read(d);
String id1 = sis.getId();
assertNotNull(id1);
String id2 = sis.info(0).info.getId();
if (defaultCodecSupportsSegmentIds()) {
assertNotNull(id2);
} else {
assertNull(id2);
}
// Make sure CheckIndex includes id output:
ByteArrayOutputStream bos = new ByteArrayOutputStream(1024);
CheckIndex checker = new CheckIndex(d);
checker.setCrossCheckTermVectors(false);
checker.setInfoStream(new PrintStream(bos, false, IOUtils.UTF_8), false);
CheckIndex.Status indexStatus = checker.checkIndex(null);
String s = bos.toString(IOUtils.UTF_8);
// Make sure CheckIndex didn't fail
assertTrue(s, indexStatus != null && indexStatus.clean);
// Commit id is always stored:
assertTrue("missing id=" + id1 + " in:\n" + s, s.contains("id=" + id1));
// Per-segment id may or may not be stored depending on the codec:
if (defaultCodecSupportsSegmentIds()) {
assertTrue("missing id=" + id2 + " in:\n" + s, s.contains("id=" + id2));
} else {
assertTrue("missing id=null in:\n" + s, s.contains("id=null"));
}
d.close();
Set<String> ids = new HashSet<>();
for(int i=0;i<100000;i++) {
String id = StringHelper.randomId();
assertFalse("id=" + id + " i=" + i, ids.contains(id));
ids.add(id);
}
}
}

View File

@ -55,6 +55,9 @@ import java.util.logging.Logger;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.codecs.Codec;
import org.apache.lucene.codecs.SegmentInfoFormat;
import org.apache.lucene.codecs.lucene46.Lucene46SegmentInfoFormat;
import org.apache.lucene.codecs.simpletext.SimpleTextSegmentInfoFormat;
import org.apache.lucene.document.Field.Store;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.FieldType;
@ -1736,6 +1739,12 @@ public abstract class LuceneTestCase extends Assert {
return true;
}
/** Returns true if the codec "supports" writing segment and commit ids. */
public static boolean defaultCodecSupportsSegmentIds() {
SegmentInfoFormat siFormat = Codec.getDefault().segmentInfoFormat();
return siFormat instanceof SimpleTextSegmentInfoFormat || siFormat instanceof Lucene46SegmentInfoFormat;
}
public void assertReaderEquals(String info, IndexReader leftReader, IndexReader rightReader) throws IOException {
assertReaderStatisticsEquals(info, leftReader, rightReader);
assertFieldsEquals(info, leftReader, MultiFields.getFields(leftReader), MultiFields.getFields(rightReader), true);