From c83fac628893a691a57f72872109f51cec78c26a Mon Sep 17 00:00:00 2001 From: Tommaso Teofili Date: Mon, 10 Dec 2012 08:20:03 +0000 Subject: [PATCH] [LUCENE-4345] - improved DS performance by doing commits only once git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1419258 13f79535-47bb-0310-9956-ffa450edef68 --- .../classification/utils/DatasetSplitter.java | 31 +++++++++---------- .../utils/DataSplitterTest.java | 2 +- 2 files changed, 16 insertions(+), 17 deletions(-) diff --git a/lucene/classification/src/java/org/apache/lucene/classification/utils/DatasetSplitter.java b/lucene/classification/src/java/org/apache/lucene/classification/utils/DatasetSplitter.java index 3d87a078e69..faea4dcc03d 100644 --- a/lucene/classification/src/java/org/apache/lucene/classification/utils/DatasetSplitter.java +++ b/lucene/classification/src/java/org/apache/lucene/classification/utils/DatasetSplitter.java @@ -45,7 +45,8 @@ public class DatasetSplitter { /** * Create a {@link DatasetSplitter} by giving test and cross validation IDXs sizes - * @param testRatio the ratio of the original index to be used for the test IDX as a double between 0.0 and 1.0 + * + * @param testRatio the ratio of the original index to be used for the test IDX as a double between 0.0 and 1.0 * @param crossValidationRatio the ratio of the original index to be used for the c.v. IDX as a double between 0.0 and 1.0 */ public DatasetSplitter(double testRatio, double crossValidationRatio) { @@ -55,12 +56,13 @@ public class DatasetSplitter { /** * Split a given index into 3 indexes for training, test and cross validation tasks respectively - * @param originalIndex an {@link AtomicReader} on the source index - * @param trainingIndex a {@link Directory} used to write the training index - * @param testIndex a {@link Directory} used to write the test index + * + * @param originalIndex an {@link AtomicReader} on the source index + * @param trainingIndex a {@link Directory} used to write the training index + * @param testIndex a {@link Directory} used to write the test index * @param crossValidationIndex a {@link Directory} used to write the cross validation index - * @param analyzer {@link Analyzer} used to create the new docs - * @param fieldNames names of fields that need to be put in the new indexes or null if all should be used + * @param analyzer {@link Analyzer} used to create the new docs + * @param fieldNames names of fields that need to be put in the new indexes or null if all should be used * @throws IOException if any writing operation fails on any of the indexes */ public void split(AtomicReader originalIndex, Directory trainingIndex, Directory testIndex, Directory crossValidationIndex, @@ -98,16 +100,13 @@ public class DatasetSplitter { } } else { for (StorableField storableField : originalIndex.document(scoreDoc.doc).getFields()) { - if (storableField.readerValue()!= null){ + if (storableField.readerValue() != null) { doc.add(new Field(storableField.name(), storableField.readerValue(), ft)); - } - else if (storableField.binaryValue()!= null){ + } else if (storableField.binaryValue() != null) { doc.add(new Field(storableField.name(), storableField.binaryValue(), ft)); - } - else if (storableField.stringValue()!= null){ + } else if (storableField.stringValue() != null) { doc.add(new Field(storableField.name(), storableField.stringValue(), ft)); - } - else if (storableField.numericValue()!= null){ + } else if (storableField.numericValue() != null) { doc.add(new Field(storableField.name(), storableField.numericValue().toString(), ft)); } } @@ -116,19 +115,19 @@ public class DatasetSplitter { // add it to one of the IDXs if (b % 2 == 0 && testWriter.maxDoc() < size * testRatio) { testWriter.addDocument(doc); - testWriter.commit(); } else if (cvWriter.maxDoc() < size * crossValidationRatio) { cvWriter.addDocument(doc); - cvWriter.commit(); } else { trainingWriter.addDocument(doc); - trainingWriter.commit(); } b++; } } catch (Exception e) { throw new IOException(e); } finally { + testWriter.commit(); + cvWriter.commit(); + trainingWriter.commit(); // close IWs testWriter.close(); cvWriter.close(); diff --git a/lucene/classification/src/test/org/apache/lucene/classification/utils/DataSplitterTest.java b/lucene/classification/src/test/org/apache/lucene/classification/utils/DataSplitterTest.java index 1f6d79af911..31a8704dd5d 100644 --- a/lucene/classification/src/test/org/apache/lucene/classification/utils/DataSplitterTest.java +++ b/lucene/classification/src/test/org/apache/lucene/classification/utils/DataSplitterTest.java @@ -92,7 +92,7 @@ public class DataSplitterTest extends LuceneTestCase { @Test public void testSplitOnAllFields() throws Exception { - assertSplit(originalIndex, 0.1, 0.1, null); + assertSplit(originalIndex, 0.1, 0.1); }