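[LUCENE-4345] - improved DatasetSplitter (DS) performance by committing each index writer only once, after all documents have been added, instead of after every document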

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1419258 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Tommaso Teofili 2012-12-10 08:20:03 +00:00
parent 8561fc877c
commit c83fac6288
2 changed files with 16 additions and 17 deletions

View File

@ -45,6 +45,7 @@ public class DatasetSplitter {
/** /**
* Create a {@link DatasetSplitter} by giving test and cross validation IDXs sizes * Create a {@link DatasetSplitter} by giving test and cross validation IDXs sizes
*
* @param testRatio the ratio of the original index to be used for the test IDX as a <code>double</code> between 0.0 and 1.0 * @param testRatio the ratio of the original index to be used for the test IDX as a <code>double</code> between 0.0 and 1.0
* @param crossValidationRatio the ratio of the original index to be used for the c.v. IDX as a <code>double</code> between 0.0 and 1.0 * @param crossValidationRatio the ratio of the original index to be used for the c.v. IDX as a <code>double</code> between 0.0 and 1.0
*/ */
@ -55,6 +56,7 @@ public class DatasetSplitter {
/** /**
* Split a given index into 3 indexes for training, test and cross validation tasks respectively * Split a given index into 3 indexes for training, test and cross validation tasks respectively
*
* @param originalIndex an {@link AtomicReader} on the source index * @param originalIndex an {@link AtomicReader} on the source index
* @param trainingIndex a {@link Directory} used to write the training index * @param trainingIndex a {@link Directory} used to write the training index
* @param testIndex a {@link Directory} used to write the test index * @param testIndex a {@link Directory} used to write the test index
@ -100,14 +102,11 @@ public class DatasetSplitter {
for (StorableField storableField : originalIndex.document(scoreDoc.doc).getFields()) { for (StorableField storableField : originalIndex.document(scoreDoc.doc).getFields()) {
if (storableField.readerValue() != null) { if (storableField.readerValue() != null) {
doc.add(new Field(storableField.name(), storableField.readerValue(), ft)); doc.add(new Field(storableField.name(), storableField.readerValue(), ft));
} } else if (storableField.binaryValue() != null) {
else if (storableField.binaryValue()!= null){
doc.add(new Field(storableField.name(), storableField.binaryValue(), ft)); doc.add(new Field(storableField.name(), storableField.binaryValue(), ft));
} } else if (storableField.stringValue() != null) {
else if (storableField.stringValue()!= null){
doc.add(new Field(storableField.name(), storableField.stringValue(), ft)); doc.add(new Field(storableField.name(), storableField.stringValue(), ft));
} } else if (storableField.numericValue() != null) {
else if (storableField.numericValue()!= null){
doc.add(new Field(storableField.name(), storableField.numericValue().toString(), ft)); doc.add(new Field(storableField.name(), storableField.numericValue().toString(), ft));
} }
} }
@ -116,19 +115,19 @@ public class DatasetSplitter {
// add it to one of the IDXs // add it to one of the IDXs
if (b % 2 == 0 && testWriter.maxDoc() < size * testRatio) { if (b % 2 == 0 && testWriter.maxDoc() < size * testRatio) {
testWriter.addDocument(doc); testWriter.addDocument(doc);
testWriter.commit();
} else if (cvWriter.maxDoc() < size * crossValidationRatio) { } else if (cvWriter.maxDoc() < size * crossValidationRatio) {
cvWriter.addDocument(doc); cvWriter.addDocument(doc);
cvWriter.commit();
} else { } else {
trainingWriter.addDocument(doc); trainingWriter.addDocument(doc);
trainingWriter.commit();
} }
b++; b++;
} }
} catch (Exception e) { } catch (Exception e) {
throw new IOException(e); throw new IOException(e);
} finally { } finally {
testWriter.commit();
cvWriter.commit();
trainingWriter.commit();
// close IWs // close IWs
testWriter.close(); testWriter.close();
cvWriter.close(); cvWriter.close();

View File

@ -92,7 +92,7 @@ public class DataSplitterTest extends LuceneTestCase {
@Test @Test
public void testSplitOnAllFields() throws Exception { public void testSplitOnAllFields() throws Exception {
assertSplit(originalIndex, 0.1, 0.1, null); assertSplit(originalIndex, 0.1, 0.1);
} }