sorted bytes file format

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/branches/lucene4547@1407612 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Robert Muir 2012-11-09 20:04:40 +00:00
parent 5910327656
commit 75b5ba898a
1 changed files with 25 additions and 0 deletions

View File

@ -62,6 +62,9 @@ public class SimpleTextSimpleDocValuesFormat extends SimpleDocValuesFormat {
// used for bytes
final static BytesRef MAXLENGTH = new BytesRef(" maxlength ");
final static BytesRef LENGTH = new BytesRef("length ");
// used for sorted bytes; like the keys above, each one ends with a space so
// that the key is separated from the value that follows it on the same line
// (e.g. "numvalues 10", "ordpattern 00")
final static BytesRef NUMVALUES = new BytesRef(" numvalues ");
final static BytesRef ORDPATTERN = new BytesRef(" ordpattern ");
@Override
public SimpleDVConsumer fieldsConsumer(SegmentWriteState state) throws IOException {
@ -100,6 +103,28 @@ public class SimpleTextSimpleDocValuesFormat extends SimpleDocValuesFormat {
* </pre>
* so a document's value can be retrieved by seeking to startOffset + (9+pattern.length+maxlength)*docid
* the extra 9 is 2 newlines, plus "length " itself.
*
* for sorted bytes this is a fixed-width file, for example:
* <pre>
* field myField
* numvalues 10
* maxlength 8
* pattern 0
* ordpattern 00
* length 6
* foobar[space][space]
* length 3
* baz[space][space][space][space][space]
* ...
* 03
* 06
* 01
* 09
* ...
* </pre>
* so the "ord section" begins at startOffset + (9+pattern.length+maxlength)*numvalues.
* a document's ord can be retrieved by seeking to "ord section" + (1+ordpattern.length)*docid
* (the extra 1 is the newline that ends each ord line).
* an ord's value can be retrieved by seeking to startOffset + (9+pattern.length+maxlength)*ord
*
* the reader can just scan this file when it opens, skipping over the data blocks
* and saving the offset/etc for each field.