From 75b5ba898a8993c6154b0b92701862b8193f35c0 Mon Sep 17 00:00:00 2001 From: Robert Muir Date: Fri, 9 Nov 2012 20:04:40 +0000 Subject: [PATCH] sorted bytes file format git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/branches/lucene4547@1407612 13f79535-47bb-0310-9956-ffa450edef68 --- .../SimpleTextSimpleDocValuesFormat.java | 25 +++++++++++++++++++ 1 file changed, 25 insertions(+) diff --git a/lucene/codecs/src/java/org/apache/lucene/codecs/simpletext/SimpleTextSimpleDocValuesFormat.java b/lucene/codecs/src/java/org/apache/lucene/codecs/simpletext/SimpleTextSimpleDocValuesFormat.java index 16e35bc88b1..3d9981e33aa 100644 --- a/lucene/codecs/src/java/org/apache/lucene/codecs/simpletext/SimpleTextSimpleDocValuesFormat.java +++ b/lucene/codecs/src/java/org/apache/lucene/codecs/simpletext/SimpleTextSimpleDocValuesFormat.java @@ -62,6 +62,9 @@ public class SimpleTextSimpleDocValuesFormat extends SimpleDocValuesFormat { // used for bytes final static BytesRef MAXLENGTH = new BytesRef(" maxlength "); final static BytesRef LENGTH = new BytesRef("length "); + // used for sorted bytes + final static BytesRef NUMVALUES = new BytesRef(" numvalues"); + final static BytesRef ORDPATTERN = new BytesRef(" ordpattern"); @Override public SimpleDVConsumer fieldsConsumer(SegmentWriteState state) throws IOException { @@ -100,6 +103,28 @@ public class SimpleTextSimpleDocValuesFormat extends SimpleDocValuesFormat { * * so a document's value can be retrieved by seeking to startOffset + (9+pattern.length+maxlength)*docid * the extra 9 is 2 newlines, plus "length " itself. + * + * for sorted bytes this is a fixed-width file, for example: + *
+   *  field myField
+   *    numvalues 10
+   *    maxlength 8
+   *    pattern 0
+   *    ordpattern 00
+   *  length 6
+   *  foobar[space][space]
+   *  length 3
+   *  baz[space][space][space][space][space]
+   *  ...
+   *  03
+   *  06
+   *  01
+   *  09
+   *  ...
+   *  
+ * so the "ord section" begins at startOffset + (9+pattern.length+maxlength)*numValues. + * a document's ord can be retrieved by seeking to "ord section" + (1+ordpattern.length())*docid + * an ord's value can be retrieved by seeking to startOffset + (9+pattern.length+maxlength)*ord * * the reader can just scan this file when it opens, skipping over the data blocks * and saving the offset/etc for each field.