diff --git a/CHANGES.txt b/CHANGES.txt index 6e3b65692d8..1bcf39e53fe 100644 --- a/CHANGES.txt +++ b/CHANGES.txt @@ -163,6 +163,13 @@ API Changes small. This changes the index file format and cannot be read by previous versions of Lucene. (Doron Cohen via Yonik Seeley) +13. LUCENE-756: Maintain all norms in a single .nrm file to reduce the + number of open files and file descriptors for the non-compound index + format. This changes the index file format, but maintains the + ability to read and update older indicies. The first segment merge + on an older format index will create a single .nrm file for the new + segment. (Doron Cohen via Yonik Seeley) + Bug fixes 1. Fixed the web application demo (built with "ant war-demo") which diff --git a/src/java/org/apache/lucene/index/IndexFileNames.java b/src/java/org/apache/lucene/index/IndexFileNames.java index 1de30016f7a..880fcfd3c2b 100644 --- a/src/java/org/apache/lucene/index/IndexFileNames.java +++ b/src/java/org/apache/lucene/index/IndexFileNames.java @@ -35,6 +35,9 @@ final class IndexFileNames { * pre-lockless indices) */ static final String DELETABLE = "deletable"; + /** Extension of norms file */ + static final String NORMS_EXTENSION = "nrm"; + /** * This array contains all filename extensions used by * Lucene's index files, with two exceptions, namely the @@ -45,7 +48,8 @@ final class IndexFileNames { */ static final String INDEX_EXTENSIONS[] = new String[] { "cfs", "fnm", "fdx", "fdt", "tii", "tis", "frq", "prx", "del", - "tvx", "tvd", "tvf", "tvp", "gen"}; + "tvx", "tvd", "tvf", "tvp", "gen", "nrm" + }; /** File extensions of old-style index files */ static final String COMPOUND_EXTENSIONS[] = new String[] { diff --git a/src/java/org/apache/lucene/index/IndexWriter.java b/src/java/org/apache/lucene/index/IndexWriter.java index 7f85af5b6d6..a9793301a28 100644 --- a/src/java/org/apache/lucene/index/IndexWriter.java +++ b/src/java/org/apache/lucene/index/IndexWriter.java @@ -639,7 +639,7 @@ public class 
IndexWriter { String segmentName = newRAMSegmentName(); dw.addDocument(segmentName, doc); synchronized (this) { - ramSegmentInfos.addElement(new SegmentInfo(segmentName, 1, ramDirectory, false)); + ramSegmentInfos.addElement(new SegmentInfo(segmentName, 1, ramDirectory, false, false)); maybeFlushRamSegments(); } } @@ -772,10 +772,10 @@ public class IndexWriter { while (segmentInfos.size() > 1 || (segmentInfos.size() == 1 && (SegmentReader.hasDeletions(segmentInfos.info(0)) || + SegmentReader.hasSeparateNorms(segmentInfos.info(0)) || segmentInfos.info(0).dir != directory || (useCompoundFile && - (!SegmentReader.usesCompoundFile(segmentInfos.info(0)) || - SegmentReader.hasSeparateNorms(segmentInfos.info(0))))))) { + (!SegmentReader.usesCompoundFile(segmentInfos.info(0))))))) { int minSegment = segmentInfos.size() - mergeFactor; mergeSegments(segmentInfos, minSegment < 0 ? 0 : minSegment, segmentInfos.size()); } @@ -1127,7 +1127,7 @@ public class IndexWriter { int docCount = merger.merge(); // merge 'em segmentInfos.setSize(0); // pop old infos & add new - info = new SegmentInfo(mergedName, docCount, directory, false); + info = new SegmentInfo(mergedName, docCount, directory, false, true); segmentInfos.addElement(info); commitPending = true; @@ -1347,7 +1347,7 @@ public class IndexWriter { } newSegment = new SegmentInfo(mergedName, mergedDocCount, - directory, false); + directory, false, true); if (sourceSegments == ramSegmentInfos) { diff --git a/src/java/org/apache/lucene/index/SegmentInfo.java b/src/java/org/apache/lucene/index/SegmentInfo.java index e1860c0eb63..9e06834ca7f 100644 --- a/src/java/org/apache/lucene/index/SegmentInfo.java +++ b/src/java/org/apache/lucene/index/SegmentInfo.java @@ -42,8 +42,13 @@ final class SegmentInfo { private byte isCompoundFile; // -1 if it is not; 1 if it is; 0 if it's // pre-2.1 (ie, must check file system to see - // if .cfs exists) + // if .cfs and .nrm exist) + private byte withNrm; // 1 if this segment maintains norms in a 
single file; + // -1 if not; 0 if check file is required to tell. + // would be -1 for segments populated by DocumentWriter. + // would be 1 for (newly created) merge resulted segments (both compound and non compound). + public SegmentInfo(String name, int docCount, Directory dir) { this.name = name; this.docCount = docCount; @@ -51,14 +56,13 @@ final class SegmentInfo { delGen = -1; isCompoundFile = 0; preLockless = true; + withNrm = 0; } - public SegmentInfo(String name, int docCount, Directory dir, boolean isCompoundFile) { + + public SegmentInfo(String name, int docCount, Directory dir, boolean isCompoundFile, boolean withNrm) { this(name, docCount, dir); - if (isCompoundFile) { - this.isCompoundFile = 1; - } else { - this.isCompoundFile = -1; - } + this.isCompoundFile = (byte) (isCompoundFile ? 1 : -1); + this.withNrm = (byte) (withNrm ? 1 : -1); preLockless = false; } @@ -78,6 +82,7 @@ final class SegmentInfo { System.arraycopy(src.normGen, 0, normGen, 0, src.normGen.length); } isCompoundFile = src.isCompoundFile; + withNrm = src.withNrm; } /** @@ -111,19 +116,20 @@ final class SegmentInfo { isCompoundFile = 0; preLockless = true; } + withNrm = 0; } - void setNumField(int numField) { + void setNumFields(int numFields) { if (normGen == null) { // normGen is null if we loaded a pre-2.1 segment // file, or, if this segments file hasn't had any // norms set against it yet: - normGen = new long[numField]; + normGen = new long[numFields]; if (!preLockless) { // This is a FORMAT_LOCKLESS segment, which means // there are no norms: - for(int i=0;i
Normalization Factors -

There's a norm file for each indexed field with a byte for +

+ Pre-2.1: + There's a norm file for each indexed field with a byte for each document. The .f[0-9]* file contains, for each document, a byte that encodes a value that is multiplied into the score for hits on that field: @@ -1405,6 +1407,27 @@

Norms (.f[0-9]*) --> <Byte>^SegSize +

+

+ 2.1 and above: + There's a single .nrm file containing all norms: +

+

AllNorms + (.nrm) --> NormsHeader,<Norms> + ^NumFieldsWithNorms +

+

Norms + --> <Byte> + ^SegSize +

+

NormsHeader + --> 'N','R','M',Version +

+

Version + --> Byte +

+

NormsHeader + has 4 bytes, the last of which is the format version for this file, currently -1.

Each byte encodes a floating point value. Bits 0-2 contain the 3-bit @@ -1441,6 +1464,18 @@

+

A separate norm file is created when the norm values of an existing segment are modified. + When field N is modified, a separate norm file .sN + is created, to maintain the norm values for that field. +

+

+ Pre-2.1: + Separate norm files are created only for compound segments. +

+

+ 2.1 and above: + Separate norm files are created (when needed) for both compound and non-compound segments. +

Term Vectors