Maintain norms in a single file .nrm: LUCENE-756

git-svn-id: https://svn.apache.org/repos/asf/lucene/java/trunk@493641 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Yonik Seeley 2007-01-07 04:19:21 +00:00
parent f0b51f5e2b
commit c9795dd56b
7 changed files with 147 additions and 41 deletions

View File

@ -163,6 +163,13 @@ API Changes
small. This changes the index file format and cannot be
read by previous versions of Lucene. (Doron Cohen via Yonik Seeley)
13. LUCENE-756: Maintain all norms in a single .nrm file to reduce the
number of open files and file descriptors for the non-compound index
format. This changes the index file format, but maintains the
ability to read and update older indices. The first segment merge
on an older format index will create a single .nrm file for the new
segment. (Doron Cohen via Yonik Seeley)
Bug fixes
1. Fixed the web application demo (built with "ant war-demo") which

View File

@ -35,6 +35,9 @@ final class IndexFileNames {
* pre-lockless indices) */
static final String DELETABLE = "deletable";
/** Extension of norms file */
static final String NORMS_EXTENSION = "nrm";
/**
* This array contains all filename extensions used by
* Lucene's index files, with two exceptions, namely the
@ -45,7 +48,8 @@ final class IndexFileNames {
*/
static final String INDEX_EXTENSIONS[] = new String[] {
"cfs", "fnm", "fdx", "fdt", "tii", "tis", "frq", "prx", "del",
"tvx", "tvd", "tvf", "tvp", "gen"};
"tvx", "tvd", "tvf", "tvp", "gen", "nrm"
};
/** File extensions of old-style index files */
static final String COMPOUND_EXTENSIONS[] = new String[] {

View File

@ -639,7 +639,7 @@ public class IndexWriter {
String segmentName = newRAMSegmentName();
dw.addDocument(segmentName, doc);
synchronized (this) {
ramSegmentInfos.addElement(new SegmentInfo(segmentName, 1, ramDirectory, false));
ramSegmentInfos.addElement(new SegmentInfo(segmentName, 1, ramDirectory, false, false));
maybeFlushRamSegments();
}
}
@ -772,10 +772,10 @@ public class IndexWriter {
while (segmentInfos.size() > 1 ||
(segmentInfos.size() == 1 &&
(SegmentReader.hasDeletions(segmentInfos.info(0)) ||
SegmentReader.hasSeparateNorms(segmentInfos.info(0)) ||
segmentInfos.info(0).dir != directory ||
(useCompoundFile &&
(!SegmentReader.usesCompoundFile(segmentInfos.info(0)) ||
SegmentReader.hasSeparateNorms(segmentInfos.info(0))))))) {
(!SegmentReader.usesCompoundFile(segmentInfos.info(0))))))) {
int minSegment = segmentInfos.size() - mergeFactor;
mergeSegments(segmentInfos, minSegment < 0 ? 0 : minSegment, segmentInfos.size());
}
@ -1127,7 +1127,7 @@ public class IndexWriter {
int docCount = merger.merge(); // merge 'em
segmentInfos.setSize(0); // pop old infos & add new
info = new SegmentInfo(mergedName, docCount, directory, false);
info = new SegmentInfo(mergedName, docCount, directory, false, true);
segmentInfos.addElement(info);
commitPending = true;
@ -1347,7 +1347,7 @@ public class IndexWriter {
}
newSegment = new SegmentInfo(mergedName, mergedDocCount,
directory, false);
directory, false, true);
if (sourceSegments == ramSegmentInfos) {

View File

@ -42,8 +42,13 @@ final class SegmentInfo {
private byte isCompoundFile; // -1 if it is not; 1 if it is; 0 if it's
// pre-2.1 (ie, must check file system to see
// if <name>.cfs exists)
// if <name>.cfs and <name>.nrm exist)
private byte withNrm; // 1 if this segment maintains norms in a single file;
// -1 if not; 0 if a file-existence check is required to tell.
// It is -1 for segments populated by DocumentWriter.
// It is 1 for newly created segments produced by a merge (both compound and non-compound).
public SegmentInfo(String name, int docCount, Directory dir) {
this.name = name;
this.docCount = docCount;
@ -51,14 +56,13 @@ final class SegmentInfo {
delGen = -1;
isCompoundFile = 0;
preLockless = true;
withNrm = 0;
}
public SegmentInfo(String name, int docCount, Directory dir, boolean isCompoundFile) {
public SegmentInfo(String name, int docCount, Directory dir, boolean isCompoundFile, boolean withNrm) {
this(name, docCount, dir);
if (isCompoundFile) {
this.isCompoundFile = 1;
} else {
this.isCompoundFile = -1;
}
this.isCompoundFile = (byte) (isCompoundFile ? 1 : -1);
this.withNrm = (byte) (withNrm ? 1 : -1);
preLockless = false;
}
@ -78,6 +82,7 @@ final class SegmentInfo {
System.arraycopy(src.normGen, 0, normGen, 0, src.normGen.length);
}
isCompoundFile = src.isCompoundFile;
withNrm = src.withNrm;
}
/**
@ -111,19 +116,20 @@ final class SegmentInfo {
isCompoundFile = 0;
preLockless = true;
}
withNrm = 0;
}
void setNumField(int numField) {
void setNumFields(int numFields) {
if (normGen == null) {
// normGen is null if we loaded a pre-2.1 segment
// file, or, if this segments file hasn't had any
// norms set against it yet:
normGen = new long[numField];
normGen = new long[numFields];
if (!preLockless) {
// This is a FORMAT_LOCKLESS segment, which means
// there are no norms:
for(int i=0;i<numField;i++) {
for(int i=0;i<numFields;i++) {
normGen[i] = -1;
}
}
@ -173,6 +179,7 @@ final class SegmentInfo {
si.isCompoundFile = isCompoundFile;
si.delGen = delGen;
si.preLockless = preLockless;
si.withNrm = withNrm;
if (normGen != null) {
si.normGen = (long[]) normGen.clone();
}
@ -245,7 +252,7 @@ final class SegmentInfo {
// pre-LOCKLESS and must be checked in directory:
for(int i=0;i<normGen.length;i++) {
if (normGen[i] == 0) {
if (dir.fileExists(getNormFileName(i))) {
if (hasSeparateNorms(i)) {
return true;
}
}
@ -285,12 +292,21 @@ final class SegmentInfo {
}
if (hasSeparateNorms(number)) {
// case 1: separate norm
prefix = ".s";
return IndexFileNames.fileNameFromGeneration(name, prefix + number, gen);
} else {
prefix = ".f";
return IndexFileNames.fileNameFromGeneration(name, prefix + number, 0);
}
if (withNrm()) {
// case 2: lockless (or nrm file exists) - single file for all norms
prefix = "." + IndexFileNames.NORMS_EXTENSION;
return IndexFileNames.fileNameFromGeneration(name, prefix, 0);
}
// case 3: norm file for each field
prefix = ".f";
return IndexFileNames.fileNameFromGeneration(name, prefix + number, 0);
}
/**
@ -310,11 +326,6 @@ final class SegmentInfo {
/**
* Returns true if this segment is stored as a compound
* file; else, false.
*
* @param directory directory to check. This parameter is
* only used when the segment was written before version
* 2.1 (at which point compound file or not became stored
* in the segments info file).
*/
boolean getUseCompoundFile() throws IOException {
if (isCompoundFile == -1) {
@ -325,6 +336,32 @@ final class SegmentInfo {
return dir.fileExists(name + ".cfs");
}
}
/**
 * Tells whether this segment keeps the norms of all its fields in one
 * single .nrm file.
 *
 * When the answer is already recorded (flag is 1 or -1) it is returned
 * directly. Otherwise the answer is found by testing whether
 * <name>.nrm exists; for a segment stored as a compound file the test
 * is made inside <name>.cfs, else in the segment's directory. The
 * outcome is stored back into the flag so the file test is not repeated.
 *
 * @return true iff norms are stored in a single .nrm file
 * @throws IOException if the compound-file check, the existence test or
 *         closing the temporary compound file reader fails
 */
private boolean withNrm () throws IOException {
  // Fast path: the flag already holds a definite answer.
  switch (withNrm) {
    case -1:
      return false;
    case 1:
      return true;
    default:
      break; // undecided: must look for the file
  }

  Directory lookupDir = dir;
  try {
    if (getUseCompoundFile()) {
      // Norms of a compound segment live inside the .cfs file.
      lookupDir = new CompoundFileReader(dir, name + ".cfs");
    }
    boolean nrmExists = lookupDir.fileExists(name + "." + IndexFileNames.NORMS_EXTENSION);
    // Remember the outcome to avoid more file tests like this.
    if (nrmExists) {
      withNrm = (byte) 1;
    } else {
      withNrm = (byte) -1;
    }
    return nrmExists;
  } finally {
    // Only close a reader opened here, never the segment's own directory.
    if (lookupDir != dir && lookupDir != null) {
      lookupDir.close();
    }
  }
}
/**
* Save this segment's info.

View File

@ -40,6 +40,10 @@ import org.apache.lucene.store.RAMOutputStream;
* @see #add
*/
final class SegmentMerger {
/** norms header placeholder */
static final byte[] NORMS_HEADER = new byte[]{'N','R','M',-1};
private Directory directory;
private String segment;
private int termIndexInterval = IndexWriter.DEFAULT_TERM_INDEX_INTERVAL;
@ -116,7 +120,7 @@ final class SegmentMerger {
new CompoundFileWriter(directory, fileName);
Vector files =
new Vector(IndexFileNames.COMPOUND_EXTENSIONS.length + fieldInfos.size());
new Vector(IndexFileNames.COMPOUND_EXTENSIONS.length + 1);
// Basic files
for (int i = 0; i < IndexFileNames.COMPOUND_EXTENSIONS.length; i++) {
@ -127,7 +131,8 @@ final class SegmentMerger {
for (int i = 0; i < fieldInfos.size(); i++) {
FieldInfo fi = fieldInfos.fieldInfo(i);
if (fi.isIndexed && !fi.omitNorms) {
files.add(segment + ".f" + i);
files.add(segment + "." + IndexFileNames.NORMS_EXTENSION);
break;
}
}
@ -408,11 +413,15 @@ final class SegmentMerger {
private void mergeNorms() throws IOException {
byte[] normBuffer = null;
for (int i = 0; i < fieldInfos.size(); i++) {
FieldInfo fi = fieldInfos.fieldInfo(i);
if (fi.isIndexed && !fi.omitNorms) {
IndexOutput output = directory.createOutput(segment + ".f" + i);
try {
IndexOutput output = null;
try {
for (int i = 0; i < fieldInfos.size(); i++) {
FieldInfo fi = fieldInfos.fieldInfo(i);
if (fi.isIndexed && !fi.omitNorms) {
if (output == null) {
output = directory.createOutput(segment + "." + IndexFileNames.NORMS_EXTENSION);
output.writeBytes(NORMS_HEADER,NORMS_HEADER.length);
}
for (int j = 0; j < readers.size(); j++) {
IndexReader reader = (IndexReader) readers.elementAt(j);
int maxDoc = reader.maxDoc();
@ -434,10 +443,12 @@ final class SegmentMerger {
}
}
}
} finally {
output.close();
}
}
} finally {
if (output != null) {
output.close();
}
}
}

View File

@ -58,23 +58,25 @@ class SegmentReader extends IndexReader {
CompoundFileReader cfsReader = null;
private class Norm {
public Norm(IndexInput in, int number)
public Norm(IndexInput in, int number, long normSeek)
{
this.in = in;
this.number = number;
this.normSeek = normSeek;
}
private IndexInput in;
private byte[] bytes;
private boolean dirty;
private int number;
private long normSeek;
private boolean rollbackDirty;
private void reWrite(SegmentInfo si) throws IOException {
// NOTE: norms are re-written in regular directory, not cfs
String oldFileName = si.getNormFileName(this.number);
if (oldFileName != null) {
if (oldFileName != null && !oldFileName.endsWith("." + IndexFileNames.NORMS_EXTENSION)) {
// Mark this file for deletion. Note that we don't
// actually try to delete it until the new segments files is
// successfully written:
@ -215,7 +217,7 @@ class SegmentReader extends IndexReader {
si.clearDelGen();
}
if (normsDirty) { // re-write norms
si.setNumField(fieldInfos.size());
si.setNumFields(fieldInfos.size());
Enumeration values = norms.elements();
while (values.hasMoreElements()) {
Norm norm = (Norm) values.nextElement();
@ -301,10 +303,16 @@ class SegmentReader extends IndexReader {
files.addElement(si.getDelFileName());
}
boolean addedNrm = false;
for (int i = 0; i < fieldInfos.size(); i++) {
String name = si.getNormFileName(i);
if (name != null && directory().fileExists(name))
if (name != null && directory().fileExists(name)) {
if (name.endsWith("." + IndexFileNames.NORMS_EXTENSION)) {
if (addedNrm) continue; // add .nrm just once
addedNrm = true;
}
files.addElement(name);
}
}
return files;
}
@ -462,7 +470,7 @@ class SegmentReader extends IndexReader {
IndexInput normStream = (IndexInput) norm.in.clone();
try { // read from disk
normStream.seek(0);
normStream.seek(norm.normSeek);
normStream.readBytes(bytes, offset, maxDoc());
} finally {
normStream.close();
@ -471,6 +479,8 @@ class SegmentReader extends IndexReader {
private void openNorms(Directory cfsDir) throws IOException {
long nextNormSeek = SegmentMerger.NORMS_HEADER.length; //skip header (header unused for now)
int maxDoc = maxDoc();
for (int i = 0; i < fieldInfos.size(); i++) {
FieldInfo fi = fieldInfos.fieldInfo(i);
if (fi.isIndexed && !fi.omitNorms) {
@ -479,7 +489,9 @@ class SegmentReader extends IndexReader {
if (!si.hasSeparateNorms(fi.number)) {
d = cfsDir;
}
norms.put(fi.name, new Norm(d.openInput(fileName), fi.number));
long normSeek = (fileName.endsWith("." + IndexFileNames.NORMS_EXTENSION) ? nextNormSeek : 0);
norms.put(fi.name, new Norm(d.openInput(fileName), fi.number, normSeek));
nextNormSeek += maxDoc; // increment also if some norms are separate
}
}
}

View File

@ -1397,7 +1397,9 @@
</p>
</section>
<section id="Normalization Factors"><title>Normalization Factors</title>
<p>There's a norm file for each indexed field with a byte for
<p>
<b>Pre-2.1:</b>
There's a norm file for each indexed field with a byte for
each document. The .f[0-9]* file contains,
for each document, a byte that encodes a value that is multiplied
into the score for hits on that field:
@ -1405,6 +1407,27 @@
<p>Norms
(.f[0-9]*) --&gt; &lt;Byte&gt;
<sup>SegSize</sup>
</p>
<p>
<b>2.1 and above:</b>
There's a single .nrm file containing all norms:
</p>
<p>AllNorms
(.nrm) --&gt; NormsHeader,&lt;Norms&gt;
<sup>NumFieldsWithNorms</sup>
</p>
<p>Norms
--&gt; &lt;Byte&gt;
<sup>SegSize</sup>
</p>
<p>NormsHeader
--&gt; 'N','R','M',Version
</p>
<p>Version
--&gt; Byte
</p>
<p>NormsHeader
has 4 bytes, last of which is the format version for this file, currently -1.
</p>
<p>Each
byte encodes a floating point value. Bits 0-2 contain the 3-bit
@ -1441,6 +1464,18 @@
</p>
</li>
</ol>
<p>A separate norm file is created when the norm values of an existing segment are modified.
When field <em>N</em> is modified, a separate norm file <em>.sN</em>
is created, to maintain the norm values for that field.
</p>
<p>
<b>Pre-2.1:</b>
Separate norm files are created only for compound segments.
</p>
<p>
<b>2.1 and above:</b>
Separate norm files are created (when needed) for both compound and non-compound segments.
</p>
</section>
<section id="Term Vectors"><title>Term Vectors</title>