From a30c2632c8c7800287d3c9b319b703d34d55f7d8 Mon Sep 17 00:00:00 2001 From: Mike McCandless Date: Sat, 7 May 2016 18:36:13 -0400 Subject: [PATCH] LUCENE-6766: add float, double --- .../SimpleTextSegmentInfoFormat.java | 36 ++++- .../lucene62/Lucene62SegmentInfoFormat.java | 32 ++++ .../org/apache/lucene/index/CheckIndex.java | 13 +- .../org/apache/lucene/index/MultiSorter.java | 111 ++++++++++++-- .../lucene/index/SortingLeafReader.java | 1 - .../apache/lucene/index/TestIndexSorting.java | 139 ++++++++++++++++++ 6 files changed, 313 insertions(+), 19 deletions(-) diff --git a/lucene/codecs/src/java/org/apache/lucene/codecs/simpletext/SimpleTextSegmentInfoFormat.java b/lucene/codecs/src/java/org/apache/lucene/codecs/simpletext/SimpleTextSegmentInfoFormat.java index bf9d3ded573..8ab45be2ede 100644 --- a/lucene/codecs/src/java/org/apache/lucene/codecs/simpletext/SimpleTextSegmentInfoFormat.java +++ b/lucene/codecs/src/java/org/apache/lucene/codecs/simpletext/SimpleTextSegmentInfoFormat.java @@ -168,6 +168,12 @@ public class SimpleTextSegmentInfoFormat extends SegmentInfoFormat { case "int": type = SortField.Type.INT; break; + case "double": + type = SortField.Type.DOUBLE; + break; + case "float": + type = SortField.Type.FLOAT; + break; default: throw new CorruptIndexException("unable to parse sort type string: " + typeAsString, input); } @@ -216,6 +222,26 @@ public class SimpleTextSegmentInfoFormat extends SegmentInfoFormat { break; } break; + case DOUBLE: + switch (missingLastAsString) { + case "null": + missingValue = null; + break; + default: + missingValue = Double.parseDouble(missingLastAsString); + break; + } + break; + case FLOAT: + switch (missingLastAsString) { + case "null": + missingValue = null; + break; + default: + missingValue = Float.parseFloat(missingLastAsString); + break; + } + break; // nocommit need the rest default: throw new AssertionError(); @@ -338,6 +364,12 @@ public class SimpleTextSegmentInfoFormat extends SegmentInfoFormat { case INT: sortType = "int"; break; + case DOUBLE: + sortType = "double"; + break; + case FLOAT: + sortType = "float"; + break; // nocommit the rest: default: throw new IllegalStateException("Unexpected sort type: " + sortField.getType()); @@ -358,10 +390,8 @@ public class SimpleTextSegmentInfoFormat extends SegmentInfoFormat { missing = "first"; } else if (missingValue == SortField.STRING_LAST) { missing = "last"; - } else if (missingValue instanceof Long) { - missing = Long.toString((Long) missingValue); } else { - throw new IllegalStateException("Unexpected missing sort value: " + missingValue); + missing = missingValue.toString(); } SimpleTextUtil.write(output, missing, scratch); SimpleTextUtil.writeNewline(output); diff --git a/lucene/core/src/java/org/apache/lucene/codecs/lucene62/Lucene62SegmentInfoFormat.java b/lucene/core/src/java/org/apache/lucene/codecs/lucene62/Lucene62SegmentInfoFormat.java index 53d273474a8..bb52eebc06e 100644 --- a/lucene/core/src/java/org/apache/lucene/codecs/lucene62/Lucene62SegmentInfoFormat.java +++ b/lucene/core/src/java/org/apache/lucene/codecs/lucene62/Lucene62SegmentInfoFormat.java @@ -121,6 +121,12 @@ public class Lucene62SegmentInfoFormat extends SegmentInfoFormat { case 2: sortType = SortField.Type.INT; break; + case 3: + sortType = SortField.Type.DOUBLE; + break; + case 4: + sortType = SortField.Type.FLOAT; + break; default: throw new CorruptIndexException("invalid index sort field type ID: " + sortTypeID, input); } @@ -163,6 +169,18 @@ public class Lucene62SegmentInfoFormat extends SegmentInfoFormat { } missingValue = input.readInt(); break; + case DOUBLE: + if (b != 1) { + throw new CorruptIndexException("invalid missing value flag: " + b, input); + } + missingValue = Double.longBitsToDouble(input.readLong()); + break; + case FLOAT: + if (b != 1) { + throw new CorruptIndexException("invalid missing value flag: " + b, input); + } + missingValue = Float.intBitsToFloat(input.readInt()); + break; default: throw new AssertionError("unhandled sortType=" + sortType); } @@ -240,6 +258,12 @@ public class Lucene62SegmentInfoFormat extends SegmentInfoFormat { case INT: sortTypeID = 2; break; + case DOUBLE: + sortTypeID = 3; + break; + case FLOAT: + sortTypeID = 4; + break; // nocommit the rest: default: throw new IllegalStateException("Unexpected sort type: " + sortField.getType()); @@ -270,6 +294,14 @@ public class Lucene62SegmentInfoFormat extends SegmentInfoFormat { output.writeByte((byte) 1); output.writeInt(((Integer) missingValue).intValue()); break; + case DOUBLE: + output.writeByte((byte) 1); + output.writeLong(Double.doubleToLongBits(((Double) missingValue).doubleValue())); + break; + case FLOAT: + output.writeByte((byte) 1); + output.writeLong(Float.floatToIntBits(((Float) missingValue).floatValue())); + break; // nocommit the rest: default: throw new IllegalStateException("Unexpected sort type: " + sortField.getType()); diff --git a/lucene/core/src/java/org/apache/lucene/index/CheckIndex.java b/lucene/core/src/java/org/apache/lucene/index/CheckIndex.java index fb2dc80ce3f..1031d22e22a 100644 --- a/lucene/core/src/java/org/apache/lucene/index/CheckIndex.java +++ b/lucene/core/src/java/org/apache/lucene/index/CheckIndex.java @@ -648,6 +648,7 @@ public final class CheckIndex implements Closeable { int toLoseDocCount = info.info.maxDoc(); SegmentReader reader = null; + Sort previousIndexSort = null; try { msg(infoStream, " version=" + (version == null ? "3.0" : version)); @@ -661,6 +662,13 @@ public final class CheckIndex implements Closeable { Sort indexSort = info.info.getIndexSort(); if (indexSort != null) { msg(infoStream, " sort=" + indexSort); + if (previousIndexSort != null) { + if (previousIndexSort.equals(indexSort) == false) { + throw new RuntimeException("index sort changed from " + previousIndexSort + " to " + indexSort); + } + } else { + previousIndexSort = indexSort; + } } segInfoStat.numFiles = info.files().size(); segInfoStat.sizeMB = info.sizeInBytes()/(1024.*1024.); @@ -835,8 +843,6 @@ public final class CheckIndex implements Closeable { for (int i = 0; i < fields.length; i++) { reverseMul[i] = fields[i].getReverse() ? -1 : 1; comparators[i] = fields[i].getComparator(1, i).getLeafComparator(readerContext); - // nocommit we prevent SCORE? - //comparators[i].setScorer(FAKESCORER); } int maxDoc = reader.maxDoc(); @@ -2585,9 +2591,6 @@ public final class CheckIndex implements Closeable { } } - // nocommit must check index is sorted, if it claims to be - // nocommit must check that all segments have the same sort, if any segment is sorted - /** * Parse command line args into fields * @param args The command line arguments diff --git a/lucene/core/src/java/org/apache/lucene/index/MultiSorter.java b/lucene/core/src/java/org/apache/lucene/index/MultiSorter.java index ca1ebe57780..4c78aa1233f 100644 --- a/lucene/core/src/java/org/apache/lucene/index/MultiSorter.java +++ b/lucene/core/src/java/org/apache/lucene/index/MultiSorter.java @@ -164,6 +164,51 @@ final class MultiSorter { }; } + case LONG: + { + List values = new ArrayList<>(); + List docsWithFields = new ArrayList<>(); + for(CodecReader reader : readers) { + values.add(DocValues.getNumeric(reader, sortField.getField())); + docsWithFields.add(DocValues.getDocsWithField(reader, sortField.getField())); + } + + final int reverseMul; + if (sortField.getReverse()) { + reverseMul = -1; + } else { + reverseMul = 1; + } + + final int missingValue; + + if (sortField.getMissingValue() != null) { + missingValue = (Integer) sortField.getMissingValue(); + } else { + missingValue = 0; + } + + return new CrossReaderComparator() { + @Override + public int compare(int readerIndexA, int docIDA, int readerIndexB, int docIDB) { + long valueA; + if (docsWithFields.get(readerIndexA).get(docIDA)) { + valueA = (int) values.get(readerIndexA).get(docIDA); + } else { + valueA = missingValue; + } + + long valueB; + if (docsWithFields.get(readerIndexB).get(docIDB)) { + valueB = (int) values.get(readerIndexB).get(docIDB); + } else { + valueB = missingValue; + } + return reverseMul * Long.compare(valueA, valueB); + } + }; + } + case INT: { List values = new ArrayList<>(); @@ -208,8 +253,8 @@ final class MultiSorter { } }; } - case LONG: - // nocommit refactor/share at least numerics here: + + case DOUBLE: { List values = new ArrayList<>(); List docsWithFields = new ArrayList<>(); @@ -225,34 +270,80 @@ final class MultiSorter { reverseMul = 1; } - final int missingValue; + final double missingValue; if (sortField.getMissingValue() != null) { - missingValue = (Integer) sortField.getMissingValue(); + missingValue = (Double) sortField.getMissingValue(); } else { - missingValue = 0; + missingValue = 0.0; } return new CrossReaderComparator() { @Override public int compare(int readerIndexA, int docIDA, int readerIndexB, int docIDB) { - long valueA; + double valueA; if (docsWithFields.get(readerIndexA).get(docIDA)) { - valueA = (int) values.get(readerIndexA).get(docIDA); + valueA = Double.longBitsToDouble(values.get(readerIndexA).get(docIDA)); } else { valueA = missingValue; } - long valueB; + double valueB; if (docsWithFields.get(readerIndexB).get(docIDB)) { - valueB = (int) values.get(readerIndexB).get(docIDB); + valueB = Double.longBitsToDouble(values.get(readerIndexB).get(docIDB)); } else { valueB = missingValue; } - return reverseMul * Long.compare(valueA, valueB); + return reverseMul * Double.compare(valueA, valueB); } }; } + + case FLOAT: + { + List values = new ArrayList<>(); + List docsWithFields = new ArrayList<>(); + for(CodecReader reader : readers) { + values.add(DocValues.getNumeric(reader, sortField.getField())); + docsWithFields.add(DocValues.getDocsWithField(reader, sortField.getField())); + } + + final int reverseMul; + if (sortField.getReverse()) { + reverseMul = -1; + } else { + reverseMul = 1; + } + + final float missingValue; + + if (sortField.getMissingValue() != null) { + missingValue = (Float) sortField.getMissingValue(); + } else { + missingValue = 0.0f; + } + + return new CrossReaderComparator() { + @Override + public int compare(int readerIndexA, int docIDA, int readerIndexB, int docIDB) { + float valueA; + if (docsWithFields.get(readerIndexA).get(docIDA)) { + valueA = Float.intBitsToFloat((int) values.get(readerIndexA).get(docIDA)); + } else { + valueA = missingValue; + } + + float valueB; + if (docsWithFields.get(readerIndexB).get(docIDB)) { + valueB = Float.intBitsToFloat((int) values.get(readerIndexB).get(docIDB)); + } else { + valueB = missingValue; + } + return reverseMul * Float.compare(valueA, valueB); + } + }; + } + // nocommit do the rest: default: throw new IllegalArgumentException("unhandled SortField.getType()=" + sortField.getType()); diff --git a/lucene/core/src/java/org/apache/lucene/index/SortingLeafReader.java b/lucene/core/src/java/org/apache/lucene/index/SortingLeafReader.java index b6558f7fd15..70d5d204439 100644 --- a/lucene/core/src/java/org/apache/lucene/index/SortingLeafReader.java +++ b/lucene/core/src/java/org/apache/lucene/index/SortingLeafReader.java @@ -840,7 +840,6 @@ class SortingLeafReader extends FilterLeafReader { if (inPointValues == null) { return null; } else { - // nocommit make sure this is tested return new SortingPointValues(inPointValues, docMap); } } diff --git a/lucene/core/src/test/org/apache/lucene/index/TestIndexSorting.java b/lucene/core/src/test/org/apache/lucene/index/TestIndexSorting.java index 1da6c82cc0e..4e2606374f0 100644 --- a/lucene/core/src/test/org/apache/lucene/index/TestIndexSorting.java +++ b/lucene/core/src/test/org/apache/lucene/index/TestIndexSorting.java @@ -36,9 +36,11 @@ import org.apache.lucene.analysis.tokenattributes.PayloadAttribute; import org.apache.lucene.document.BinaryDocValuesField; import org.apache.lucene.document.BinaryPoint; import org.apache.lucene.document.Document; +import org.apache.lucene.document.DoubleDocValuesField; import org.apache.lucene.document.Field.Store; import org.apache.lucene.document.Field; import org.apache.lucene.document.FieldType; +import org.apache.lucene.document.FloatDocValuesField; import org.apache.lucene.document.NumericDocValuesField; import org.apache.lucene.document.SortedDocValuesField; import org.apache.lucene.document.SortedNumericDocValuesField; @@ -72,6 +74,7 @@ import org.junit.BeforeClass; // nocommit test tie break // nocommit test multiple sorts // nocommit test update dvs +// nocommit test missing value // nocommit test EarlyTerminatingCollector @@ -113,6 +116,142 @@ public class TestIndexSorting extends LuceneTestCase { dir.close(); } + public void testBasicLong() throws Exception { + Directory dir = newDirectory(); + IndexWriterConfig iwc = new IndexWriterConfig(new MockAnalyzer(random())); + Sort indexSort = new Sort(new SortField("foo", SortField.Type.LONG)); + iwc.setIndexSort(indexSort); + IndexWriter w = new IndexWriter(dir, iwc); + Document doc = new Document(); + doc.add(new NumericDocValuesField("foo", 18)); + w.addDocument(doc); + // so we get more than one segment, so that forceMerge actually does merge, since we only get a sorted segment by merging: + w.commit(); + + doc = new Document(); + doc.add(new NumericDocValuesField("foo", -1)); + w.addDocument(doc); + w.commit(); + + doc = new Document(); + doc.add(new NumericDocValuesField("foo", 7)); + w.addDocument(doc); + w.forceMerge(1); + + DirectoryReader r = DirectoryReader.open(w); + LeafReader leaf = getOnlyLeafReader(r); + assertEquals(3, leaf.maxDoc()); + NumericDocValues values = leaf.getNumericDocValues("foo"); + assertEquals(-1, values.get(0)); + assertEquals(7, values.get(1)); + assertEquals(18, values.get(2)); + r.close(); + w.close(); + dir.close(); + } + + public void testBasicInt() throws Exception { + Directory dir = newDirectory(); + IndexWriterConfig iwc = new IndexWriterConfig(new MockAnalyzer(random())); + Sort indexSort = new Sort(new SortField("foo", SortField.Type.INT)); + iwc.setIndexSort(indexSort); + IndexWriter w = new IndexWriter(dir, iwc); + Document doc = new Document(); + doc.add(new NumericDocValuesField("foo", 18)); + w.addDocument(doc); + // so we get more than one segment, so that forceMerge actually does merge, since we only get a sorted segment by merging: + w.commit(); + + doc = new Document(); + doc.add(new NumericDocValuesField("foo", -1)); + w.addDocument(doc); + w.commit(); + + doc = new Document(); + doc.add(new NumericDocValuesField("foo", 7)); + w.addDocument(doc); + w.forceMerge(1); + + DirectoryReader r = DirectoryReader.open(w); + LeafReader leaf = getOnlyLeafReader(r); + assertEquals(3, leaf.maxDoc()); + NumericDocValues values = leaf.getNumericDocValues("foo"); + assertEquals(-1, values.get(0)); + assertEquals(7, values.get(1)); + assertEquals(18, values.get(2)); + r.close(); + w.close(); + dir.close(); + } + + public void testBasicDouble() throws Exception { + Directory dir = newDirectory(); + IndexWriterConfig iwc = new IndexWriterConfig(new MockAnalyzer(random())); + Sort indexSort = new Sort(new SortField("foo", SortField.Type.DOUBLE)); + iwc.setIndexSort(indexSort); + IndexWriter w = new IndexWriter(dir, iwc); + Document doc = new Document(); + doc.add(new DoubleDocValuesField("foo", 18.0)); + w.addDocument(doc); + // so we get more than one segment, so that forceMerge actually does merge, since we only get a sorted segment by merging: + w.commit(); + + doc = new Document(); + doc.add(new DoubleDocValuesField("foo", -1.0)); + w.addDocument(doc); + w.commit(); + + doc = new Document(); + doc.add(new DoubleDocValuesField("foo", 7.0)); + w.addDocument(doc); + w.forceMerge(1); + + DirectoryReader r = DirectoryReader.open(w); + LeafReader leaf = getOnlyLeafReader(r); + assertEquals(3, leaf.maxDoc()); + NumericDocValues values = leaf.getNumericDocValues("foo"); + assertEquals(-1.0, Double.longBitsToDouble(values.get(0)), 0.0); + assertEquals(7.0, Double.longBitsToDouble(values.get(1)), 0.0); + assertEquals(18.0, Double.longBitsToDouble(values.get(2)), 0.0); + r.close(); + w.close(); + dir.close(); + } + + public void testBasicFloat() throws Exception { + Directory dir = newDirectory(); + IndexWriterConfig iwc = new IndexWriterConfig(new MockAnalyzer(random())); + Sort indexSort = new Sort(new SortField("foo", SortField.Type.FLOAT)); + iwc.setIndexSort(indexSort); + IndexWriter w = new IndexWriter(dir, iwc); + Document doc = new Document(); + doc.add(new FloatDocValuesField("foo", 18.0f)); + w.addDocument(doc); + // so we get more than one segment, so that forceMerge actually does merge, since we only get a sorted segment by merging: + w.commit(); + + doc = new Document(); + doc.add(new FloatDocValuesField("foo", -1.0f)); + w.addDocument(doc); + w.commit(); + + doc = new Document(); + doc.add(new FloatDocValuesField("foo", 7.0f)); + w.addDocument(doc); + w.forceMerge(1); + + DirectoryReader r = DirectoryReader.open(w); + LeafReader leaf = getOnlyLeafReader(r); + assertEquals(3, leaf.maxDoc()); + NumericDocValues values = leaf.getNumericDocValues("foo"); + assertEquals(-1.0f, Float.intBitsToFloat((int) values.get(0)), 0.0f); + assertEquals(7.0f, Float.intBitsToFloat((int) values.get(1)), 0.0f); + assertEquals(18.0f, Float.intBitsToFloat((int) values.get(2)), 0.0f); + r.close(); + w.close(); + dir.close(); + } + public void testSortOnMerge(boolean withDeletes) throws IOException { Directory dir = newDirectory(); IndexWriterConfig iwc = new IndexWriterConfig(new MockAnalyzer(random()));