Maintain norms in a single file .nrm: LUCENE-756
git-svn-id: https://svn.apache.org/repos/asf/lucene/java/trunk@493641 13f79535-47bb-0310-9956-ffa450edef68

commit c9795dd56b
parent f0b51f5e2b

CHANGES.txt

@@ -163,6 +163,13 @@ API Changes
     small. This changes the index file format and cannot be
     read by previous versions of Lucene. (Doron Cohen via Yonik Seeley)
 
+13. LUCENE-756: Maintain all norms in a single .nrm file to reduce the
+    number of open files and file descriptors for the non-compound index
+    format. This changes the index file format, but maintains the
+    ability to read and update older indices. The first segment merge
+    on an older format index will create a single .nrm file for the new
+    segment. (Doron Cohen via Yonik Seeley)
+
 Bug fixes
 
  1. Fixed the web application demo (built with "ant war-demo") which
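To make the descriptor saving concrete, here is an illustrative (hypothetical) file listing for a non-compound segment _1 with three norm-bearing fields:

    before LUCENE-756:   _1.fnm  _1.fdx  _1.fdt  ...  _1.f0  _1.f1  _1.f2
    after a 2.1 merge:   _1.fnm  _1.fdx  _1.fdt  ...  _1.nrm

One .nrm file replaces the per-field .fN files, so the number of open files no longer grows with the number of indexed fields.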

src/java/org/apache/lucene/index/IndexFileNames.java

@@ -35,6 +35,9 @@ final class IndexFileNames {
    * pre-lockless indices) */
   static final String DELETABLE = "deletable";
 
+  /** Extension of norms file */
+  static final String NORMS_EXTENSION = "nrm";
+
   /**
    * This array contains all filename extensions used by
    * Lucene's index files, with two exceptions, namely the
@@ -45,7 +48,8 @@ final class IndexFileNames {
    */
   static final String INDEX_EXTENSIONS[] = new String[] {
     "cfs", "fnm", "fdx", "fdt", "tii", "tis", "frq", "prx", "del",
-    "tvx", "tvd", "tvf", "tvp", "gen"};
+    "tvx", "tvd", "tvf", "tvp", "gen", "nrm"
+  };
 
   /** File extensions of old-style index files */
   static final String COMPOUND_EXTENSIONS[] = new String[] {

src/java/org/apache/lucene/index/IndexWriter.java

@@ -639,7 +639,7 @@ public class IndexWriter {
     String segmentName = newRAMSegmentName();
     dw.addDocument(segmentName, doc);
     synchronized (this) {
-      ramSegmentInfos.addElement(new SegmentInfo(segmentName, 1, ramDirectory, false));
+      ramSegmentInfos.addElement(new SegmentInfo(segmentName, 1, ramDirectory, false, false));
       maybeFlushRamSegments();
     }
   }
@@ -772,10 +772,10 @@ public class IndexWriter {
     while (segmentInfos.size() > 1 ||
            (segmentInfos.size() == 1 &&
             (SegmentReader.hasDeletions(segmentInfos.info(0)) ||
+             SegmentReader.hasSeparateNorms(segmentInfos.info(0)) ||
              segmentInfos.info(0).dir != directory ||
              (useCompoundFile &&
-              (!SegmentReader.usesCompoundFile(segmentInfos.info(0)) ||
-               SegmentReader.hasSeparateNorms(segmentInfos.info(0))))))) {
+              (!SegmentReader.usesCompoundFile(segmentInfos.info(0))))))) {
       int minSegment = segmentInfos.size() - mergeFactor;
       mergeSegments(segmentInfos, minSegment < 0 ? 0 : minSegment, segmentInfos.size());
     }
@@ -1127,7 +1127,7 @@ public class IndexWriter {
     int docCount = merger.merge();                // merge 'em
 
     segmentInfos.setSize(0);                      // pop old infos & add new
-    info = new SegmentInfo(mergedName, docCount, directory, false);
+    info = new SegmentInfo(mergedName, docCount, directory, false, true);
     segmentInfos.addElement(info);
     commitPending = true;
@@ -1347,7 +1347,7 @@ public class IndexWriter {
     }
 
     newSegment = new SegmentInfo(mergedName, mergedDocCount,
-                                 directory, false);
+                                 directory, false, true);
 
 
     if (sourceSegments == ramSegmentInfos) {

src/java/org/apache/lucene/index/SegmentInfo.java

@@ -42,8 +42,13 @@ final class SegmentInfo {
 
   private byte isCompoundFile;   // -1 if it is not; 1 if it is; 0 if it's
                                  // pre-2.1 (ie, must check file system to see
-                                 // if <name>.cfs exists)
+                                 // if <name>.cfs and <name>.nrm exist)
+
+  private byte withNrm;          // 1 if this segment maintains norms in a single file;
+                                 // -1 if not; 0 if check file is required to tell.
+                                 // would be -1 for segments populated by DocumentWriter.
+                                 // would be 1 for (newly created) merge resulted segments (both compound and non compound).
 
   public SegmentInfo(String name, int docCount, Directory dir) {
     this.name = name;
     this.docCount = docCount;
@@ -51,14 +56,13 @@ final class SegmentInfo {
     delGen = -1;
     isCompoundFile = 0;
     preLockless = true;
+    withNrm = 0;
   }
-  public SegmentInfo(String name, int docCount, Directory dir, boolean isCompoundFile) {
+
+  public SegmentInfo(String name, int docCount, Directory dir, boolean isCompoundFile, boolean withNrm) {
     this(name, docCount, dir);
-    if (isCompoundFile) {
-      this.isCompoundFile = 1;
-    } else {
-      this.isCompoundFile = -1;
-    }
+    this.isCompoundFile = (byte) (isCompoundFile ? 1 : -1);
+    this.withNrm = (byte) (withNrm ? 1 : -1);
     preLockless = false;
   }
 
@@ -78,6 +82,7 @@ final class SegmentInfo {
       System.arraycopy(src.normGen, 0, normGen, 0, src.normGen.length);
     }
     isCompoundFile = src.isCompoundFile;
+    withNrm = src.withNrm;
   }
 
   /**
@@ -111,19 +116,20 @@ final class SegmentInfo {
       isCompoundFile = 0;
       preLockless = true;
     }
+    withNrm = 0;
   }
 
-  void setNumField(int numField) {
+  void setNumFields(int numFields) {
     if (normGen == null) {
       // normGen is null if we loaded a pre-2.1 segment
       // file, or, if this segments file hasn't had any
       // norms set against it yet:
-      normGen = new long[numField];
+      normGen = new long[numFields];
 
       if (!preLockless) {
         // This is a FORMAT_LOCKLESS segment, which means
         // there are no norms:
-        for(int i=0;i<numField;i++) {
+        for(int i=0;i<numFields;i++) {
           normGen[i] = -1;
         }
       }
@@ -173,6 +179,7 @@ final class SegmentInfo {
     si.isCompoundFile = isCompoundFile;
     si.delGen = delGen;
     si.preLockless = preLockless;
+    si.withNrm = withNrm;
     if (normGen != null) {
       si.normGen = (long[]) normGen.clone();
     }
@@ -245,7 +252,7 @@ final class SegmentInfo {
       // pre-LOCKLESS and must be checked in directory:
       for(int i=0;i<normGen.length;i++) {
         if (normGen[i] == 0) {
-          if (dir.fileExists(getNormFileName(i))) {
+          if (hasSeparateNorms(i)) {
             return true;
           }
         }
@@ -285,12 +292,21 @@ final class SegmentInfo {
     }
 
     if (hasSeparateNorms(number)) {
+      // case 1: separate norm
       prefix = ".s";
       return IndexFileNames.fileNameFromGeneration(name, prefix + number, gen);
-    } else {
-      prefix = ".f";
-      return IndexFileNames.fileNameFromGeneration(name, prefix + number, 0);
     }
+
+    if (withNrm()) {
+      // case 2: lockless (or nrm file exists) - single file for all norms
+      prefix = "." + IndexFileNames.NORMS_EXTENSION;
+      return IndexFileNames.fileNameFromGeneration(name, prefix, 0);
+    }
+
+    // case 3: norm file for each field
+    prefix = ".f";
+    return IndexFileNames.fileNameFromGeneration(name, prefix + number, 0);
   }
 
   /**
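For concreteness, a sketch of the three cases for a hypothetical segment _3 and field number 2 (names here are illustrative; the exact generation encoding is whatever IndexFileNames.fileNameFromGeneration produces):

    case 1: separate norms, generation 1  ->  _3_1.s2  (one file per modified field)
    case 2: withNrm() is true             ->  _3.nrm   (all fields' norms in one file)
    case 3: per-field norms (pre-2.1)     ->  _3.f2    (one file per field)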
@@ -310,11 +326,6 @@ final class SegmentInfo {
   /**
    * Returns true if this segment is stored as a compound
    * file; else, false.
-   *
-   * @param directory directory to check.  This parameter is
-   * only used when the segment was written before version
-   * 2.1 (at which point compound file or not became stored
-   * in the segments info file).
    */
   boolean getUseCompoundFile() throws IOException {
     if (isCompoundFile == -1) {
@@ -325,6 +336,32 @@ final class SegmentInfo {
       return dir.fileExists(name + ".cfs");
     }
   }
+
+  /**
+   * Returns true iff this segment stores field norms in a single .nrm file.
+   */
+  private boolean withNrm() throws IOException {
+    if (withNrm == -1) {
+      return false;
+    }
+    if (withNrm == 1) {
+      return true;
+    }
+    Directory d = dir;
+    try {
+      if (getUseCompoundFile()) {
+        d = new CompoundFileReader(dir, name + ".cfs");
+      }
+      boolean res = d.fileExists(name + "." + IndexFileNames.NORMS_EXTENSION);
+      withNrm = (byte) (res ? 1 : -1); // avoid more file tests like this
+      return res;
+    } finally {
+      if (d != dir && d != null) {
+        d.close();
+      }
+    }
+  }
 
   /**
    * Save this segment's info.
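As a quick reference, the withNrm lifecycle implied by the hunks above:

    withNrm   meaning                          set where
    -------   ------------------------------   -----------------------------------------
       1      norms live in a single .nrm      merge-produced segments (IndexWriter)
      -1      per-field norm files             segments populated by DocumentWriter
       0      unknown; probe the file system   segments read from an existing segments file

withNrm() resolves the 0 state once, by checking for <name>.nrm (inside <name>.cfs for compound segments), and caches the answer back into the byte so the file test is not repeated.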

src/java/org/apache/lucene/index/SegmentMerger.java

@@ -40,6 +40,10 @@ import org.apache.lucene.store.RAMOutputStream;
  * @see #add
  */
 final class SegmentMerger {
+
+  /** norms header placeholder */
+  static final byte[] NORMS_HEADER = new byte[]{'N','R','M',-1};
+
   private Directory directory;
   private String segment;
   private int termIndexInterval = IndexWriter.DEFAULT_TERM_INDEX_INTERVAL;
@@ -116,7 +120,7 @@ final class SegmentMerger {
       new CompoundFileWriter(directory, fileName);
 
     Vector files =
-      new Vector(IndexFileNames.COMPOUND_EXTENSIONS.length + fieldInfos.size());
+      new Vector(IndexFileNames.COMPOUND_EXTENSIONS.length + 1);
 
     // Basic files
     for (int i = 0; i < IndexFileNames.COMPOUND_EXTENSIONS.length; i++) {
@@ -127,7 +131,8 @@ final class SegmentMerger {
     for (int i = 0; i < fieldInfos.size(); i++) {
       FieldInfo fi = fieldInfos.fieldInfo(i);
       if (fi.isIndexed && !fi.omitNorms) {
-        files.add(segment + ".f" + i);
+        files.add(segment + "." + IndexFileNames.NORMS_EXTENSION);
+        break;
       }
     }
 
@@ -408,11 +413,15 @@ final class SegmentMerger {
 
   private void mergeNorms() throws IOException {
     byte[] normBuffer = null;
-    for (int i = 0; i < fieldInfos.size(); i++) {
-      FieldInfo fi = fieldInfos.fieldInfo(i);
-      if (fi.isIndexed && !fi.omitNorms) {
-        IndexOutput output = directory.createOutput(segment + ".f" + i);
-        try {
+    IndexOutput output = null;
+    try {
+      for (int i = 0; i < fieldInfos.size(); i++) {
+        FieldInfo fi = fieldInfos.fieldInfo(i);
+        if (fi.isIndexed && !fi.omitNorms) {
+          if (output == null) {
+            output = directory.createOutput(segment + "." + IndexFileNames.NORMS_EXTENSION);
+            output.writeBytes(NORMS_HEADER, NORMS_HEADER.length);
+          }
           for (int j = 0; j < readers.size(); j++) {
             IndexReader reader = (IndexReader) readers.elementAt(j);
             int maxDoc = reader.maxDoc();
@@ -434,10 +443,12 @@ final class SegmentMerger {
             }
           }
         }
-      } finally {
-        output.close();
       }
+    } finally {
+      if (output != null) {
+        output.close();
+      }
+    }
   }
 
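The resulting on-disk layout for a newly merged segment is then (a sketch; maxDoc is the merged document count, and fields appear in fieldInfos order, skipping unindexed fields and fields that omit norms):

    <segment>.nrm:
        bytes 0..3           NORMS_HEADER: 'N', 'R', 'M', -1  (last byte is the format version)
        next maxDoc bytes    norms for the first norm-bearing field
        next maxDoc bytes    norms for the second norm-bearing field
        ...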

src/java/org/apache/lucene/index/SegmentReader.java

@@ -58,23 +58,25 @@ class SegmentReader extends IndexReader {
   CompoundFileReader cfsReader = null;
 
   private class Norm {
-    public Norm(IndexInput in, int number)
+    public Norm(IndexInput in, int number, long normSeek)
     {
       this.in = in;
       this.number = number;
+      this.normSeek = normSeek;
     }
 
     private IndexInput in;
     private byte[] bytes;
     private boolean dirty;
     private int number;
+    private long normSeek;
     private boolean rollbackDirty;
 
     private void reWrite(SegmentInfo si) throws IOException {
       // NOTE: norms are re-written in regular directory, not cfs
 
       String oldFileName = si.getNormFileName(this.number);
-      if (oldFileName != null) {
+      if (oldFileName != null && !oldFileName.endsWith("." + IndexFileNames.NORMS_EXTENSION)) {
         // Mark this file for deletion.  Note that we don't
         // actually try to delete it until the new segments files is
         // successfully written:
@@ -215,7 +217,7 @@ class SegmentReader extends IndexReader {
       si.clearDelGen();
     }
     if (normsDirty) {               // re-write norms
-      si.setNumField(fieldInfos.size());
+      si.setNumFields(fieldInfos.size());
       Enumeration values = norms.elements();
       while (values.hasMoreElements()) {
         Norm norm = (Norm) values.nextElement();
@@ -301,10 +303,16 @@ class SegmentReader extends IndexReader {
       files.addElement(si.getDelFileName());
     }
 
+    boolean addedNrm = false;
     for (int i = 0; i < fieldInfos.size(); i++) {
       String name = si.getNormFileName(i);
-      if (name != null && directory().fileExists(name))
+      if (name != null && directory().fileExists(name)) {
+        if (name.endsWith("." + IndexFileNames.NORMS_EXTENSION)) {
+          if (addedNrm) continue; // add .nrm just once
+          addedNrm = true;
+        }
         files.addElement(name);
+      }
     }
     return files;
   }
@@ -462,7 +470,7 @@ class SegmentReader extends IndexReader {
 
     IndexInput normStream = (IndexInput) norm.in.clone();
     try { // read from disk
-      normStream.seek(0);
+      normStream.seek(norm.normSeek);
       normStream.readBytes(bytes, offset, maxDoc());
     } finally {
       normStream.close();
@@ -471,6 +479,8 @@ class SegmentReader extends IndexReader {
 
 
   private void openNorms(Directory cfsDir) throws IOException {
+    long nextNormSeek = SegmentMerger.NORMS_HEADER.length; // skip header (header unused for now)
+    int maxDoc = maxDoc();
     for (int i = 0; i < fieldInfos.size(); i++) {
       FieldInfo fi = fieldInfos.fieldInfo(i);
       if (fi.isIndexed && !fi.omitNorms) {
@@ -479,7 +489,9 @@ class SegmentReader extends IndexReader {
         if (!si.hasSeparateNorms(fi.number)) {
           d = cfsDir;
         }
-        norms.put(fi.name, new Norm(d.openInput(fileName), fi.number));
+        long normSeek = (fileName.endsWith("." + IndexFileNames.NORMS_EXTENSION) ? nextNormSeek : 0);
+        norms.put(fi.name, new Norm(d.openInput(fileName), fi.number, normSeek));
+        nextNormSeek += maxDoc; // increment also if some norms are separate
       }
     }
   }
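The seek arithmetic this sets up, as a worked example: the norms of the k-th norm-bearing field (k = 0, 1, 2, ... over indexed fields that don't omit norms, in fieldInfos order) start at

    normSeek(k) = NORMS_HEADER.length + k * maxDoc = 4 + k * maxDoc

so with maxDoc = 1000, slot 0 is read from offset 4 and slot 1 from offset 1004. A field whose norms were modified later is read from offset 0 of its separate .sN file instead, but it still occupies a maxDoc-sized slot in the .nrm accounting, which is why nextNormSeek is incremented unconditionally.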

xdocs/fileformats.xml

@@ -1397,7 +1397,9 @@
                 </p>
             </section>
             <section id="Normalization Factors"><title>Normalization Factors</title>
-                <p>There's a norm file for each indexed field with a byte for
+                <p>
+                    <b>Pre-2.1:</b>
+                    There's a norm file for each indexed field with a byte for
                     each document. The .f[0-9]* file contains,
                     for each document, a byte that encodes a value that is multiplied
                     into the score for hits on that field:
@@ -1405,6 +1407,27 @@
                 <p>Norms
                     (.f[0-9]*) --> <Byte>
                     <sup>SegSize</sup>
                 </p>
+                <p>
+                    <b>2.1 and above:</b>
+                    There's a single .nrm file containing all norms:
+                </p>
+                <p>AllNorms
+                    (.nrm) --> NormsHeader,<Norms>
+                    <sup>NumFieldsWithNorms</sup>
+                </p>
+                <p>Norms
+                    --> <Byte>
+                    <sup>SegSize</sup>
+                </p>
+                <p>NormsHeader
+                    --> 'N','R','M',Version
+                </p>
+                <p>Version
+                    --> Byte
+                </p>
+                <p>NormsHeader
+                    has 4 bytes, last of which is the format version for this file, currently -1.
+                </p>
                 <p>Each
                     byte encodes a floating point value. Bits 0-2 contain the 3-bit
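A minimal reader for this layout, as a sketch in plain java.io rather than Lucene's IndexInput (the class NrmSketch and both count parameters are hypothetical; in a real index, SegSize and NumFieldsWithNorms come from other index files):

    import java.io.DataInputStream;
    import java.io.FileInputStream;
    import java.io.IOException;

    public class NrmSketch {
      /** Reads all norms from a 2.1-style .nrm file: a 4-byte header,
          then one byte per document for each field with norms. */
      public static byte[][] readNorms(String path, int numFieldsWithNorms, int segSize)
          throws IOException {
        DataInputStream in = new DataInputStream(new FileInputStream(path));
        try {
          byte[] header = new byte[4];
          in.readFully(header);                 // 'N', 'R', 'M', version (currently -1)
          if (header[0] != 'N' || header[1] != 'R' || header[2] != 'M')
            throw new IOException("not a .nrm file: " + path);
          byte[][] norms = new byte[numFieldsWithNorms][segSize];
          for (int f = 0; f < numFieldsWithNorms; f++)
            in.readFully(norms[f]);             // SegSize bytes per field
          return norms;
        } finally {
          in.close();
        }
      }
    }

Note this reads the file sequentially; SegmentReader instead keeps one open IndexInput per field and seeks to 4 + slot * SegSize on demand.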
@@ -1441,6 +1464,18 @@
                 </p>
                 </li>
             </ol>
+            <p>A separate norm file is created when the norm values of an existing segment are modified.
+                When field <em>N</em> is modified, a separate norm file <em>.sN</em>
+                is created, to maintain the norm values for that field.
+            </p>
+            <p>
+                <b>Pre-2.1:</b>
+                Separate norm files are created only for compound segments.
+            </p>
+            <p>
+                <b>2.1 and above:</b>
+                Separate norm files are created (when adequate) for both compound and non compound segments.
+            </p>
 
             </section>
             <section id="Term Vectors"><title>Term Vectors</title>