LUCENE-4590: Added WriteEnwikiLineDocTask.

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1418852 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Doron Cohen 2012-12-09 09:26:09 +00:00
parent 2ae20cf77c
commit d3da4138d6
6 changed files with 212 additions and 14 deletions

View File

@ -128,6 +128,11 @@ New Features
rule files in the ICU RuleBasedBreakIterator format. rule files in the ICU RuleBasedBreakIterator format.
(Shawn Heisey, Robert Muir, Steve Rowe) (Shawn Heisey, Robert Muir, Steve Rowe)
* LUCENE-4590: Added WriteEnwikiLineDocTask - a benchmark task for writing
Wikipedia category pages and non-category pages into separate line files.
extractWikipedia.alg was changed to use this task, so now it creates two
files. (Doron Cohen)
API Changes API Changes
* LUCENE-4399: Deprecated AppendingCodec. Lucene's term dictionaries * LUCENE-4399: Deprecated AppendingCodec. Lucene's term dictionaries

View File

@ -41,4 +41,4 @@ content.source.forever=false
# ------------------------------------------------------------------------------------- # -------------------------------------------------------------------------------------
# Process all documents, appending each one to the line file: # Process all documents, appending each one to the line file:
{WriteLineDoc() > : * {WriteEnwikiLineDoc() > : *

View File

@ -0,0 +1,70 @@
package org.apache.lucene.benchmark.byTask.tasks;
import java.io.BufferedWriter;
import java.io.File;
import java.io.OutputStream;
import java.io.OutputStreamWriter;
import java.io.PrintWriter;
import org.apache.lucene.benchmark.byTask.PerfRunData;
import org.apache.lucene.benchmark.byTask.feeds.DocMaker;
import org.apache.lucene.benchmark.byTask.utils.StreamUtils;
import org.apache.lucene.document.Document;
import org.apache.lucene.index.StorableField;
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/**
 * A {@link WriteLineDocTask} specialized for Wikipedia input: documents whose title
 * starts with "Category:" are written to a separate categories line file, while all
 * other documents are written to the original line file.
 * <p>
 * The categories file sits in the same directory as the original line file and has
 * the same name, prefixed with "categories-".
 */
public class WriteEnwikiLineDocTask extends WriteLineDocTask {

  /** Receives the lines of category pages; gets the same header as the original line file. */
  private final PrintWriter categoryLineFileOut;

  /**
   * Creates the task, opens the categories line file derived from the original
   * line file name, and writes the header line to it.
   */
  public WriteEnwikiLineDocTask(PerfRunData runData) throws Exception {
    super(runData);
    File categoriesFile = categoriesLineFile(new File(fname));
    OutputStream categoriesStream = StreamUtils.outputStream(categoriesFile);
    OutputStreamWriter utf8Writer = new OutputStreamWriter(categoriesStream, "UTF-8");
    categoryLineFileOut = new PrintWriter(new BufferedWriter(utf8Writer, StreamUtils.BUFFER_SIZE));
    writeHeader(categoryLineFileOut);
  }

  /**
   * Composes the categories line file out of the original line file: same parent
   * directory (if any), file name prefixed with "categories-".
   */
  static File categoriesLineFile(File f) {
    File parent = f.getParentFile();
    String prefixedName = "categories-" + f.getName();
    if (parent != null) {
      return new File(parent, prefixedName);
    }
    // no parent component in the given path - keep the result relative as well
    return new File(prefixedName);
  }

  /** Closes the categories line file first, then whatever the super class closes. */
  @Override
  public void close() throws Exception {
    categoryLineFileOut.close();
    super.close();
  }

  /**
   * Routes a document to the categories line file when it has a title field whose
   * value starts with "Category:"; every other document goes where the super class
   * would send it.
   */
  @Override
  protected PrintWriter lineFileOut(Document doc) {
    StorableField title = doc.getField(DocMaker.TITLE_FIELD);
    boolean isCategoryPage = title != null && title.stringValue().startsWith("Category:");
    return isCategoryPage ? categoryLineFileOut : super.lineFileOut(doc);
  }
}

View File

@ -32,7 +32,6 @@ import org.apache.lucene.benchmark.byTask.feeds.DocMaker;
import org.apache.lucene.benchmark.byTask.utils.Config; import org.apache.lucene.benchmark.byTask.utils.Config;
import org.apache.lucene.benchmark.byTask.utils.StreamUtils; import org.apache.lucene.benchmark.byTask.utils.StreamUtils;
import org.apache.lucene.document.Document; import org.apache.lucene.document.Document;
import org.apache.lucene.index.IndexableField;
import org.apache.lucene.index.StorableField; import org.apache.lucene.index.StorableField;
/** /**
@ -84,18 +83,20 @@ public class WriteLineDocTask extends PerfTask {
public static final String DEFAULT_SUFFICIENT_FIELDS = DocMaker.TITLE_FIELD +',' + DocMaker.BODY_FIELD; public static final String DEFAULT_SUFFICIENT_FIELDS = DocMaker.TITLE_FIELD +',' + DocMaker.BODY_FIELD;
private int docSize = 0; private int docSize = 0;
private PrintWriter lineFileOut = null; protected final String fname;
private DocMaker docMaker; private final PrintWriter lineFileOut;
private ThreadLocal<StringBuilder> threadBuffer = new ThreadLocal<StringBuilder>(); private final DocMaker docMaker;
private ThreadLocal<Matcher> threadNormalizer = new ThreadLocal<Matcher>(); private final ThreadLocal<StringBuilder> threadBuffer = new ThreadLocal<StringBuilder>();
private final String[] fieldsToWrite;; private final ThreadLocal<Matcher> threadNormalizer = new ThreadLocal<Matcher>();
private final String[] fieldsToWrite;
private final boolean[] sufficientFields; private final boolean[] sufficientFields;
private final boolean checkSufficientFields; private final boolean checkSufficientFields;
public WriteLineDocTask(PerfRunData runData) throws Exception { public WriteLineDocTask(PerfRunData runData) throws Exception {
super(runData); super(runData);
Config config = runData.getConfig(); Config config = runData.getConfig();
String fname = config.get("line.file.out", null); fname = config.get("line.file.out", null);
if (fname == null) { if (fname == null) {
throw new IllegalArgumentException("line.file.out must be set"); throw new IllegalArgumentException("line.file.out must be set");
} }
@ -129,13 +130,13 @@ public class WriteLineDocTask extends PerfTask {
} }
} }
writeHeader(); writeHeader(lineFileOut);
} }
/** /**
* Write a header to the lines file - indicating how to read the file later * Write header to the lines file - indicating how to read the file later.
*/ */
private void writeHeader() { protected void writeHeader(PrintWriter out) {
StringBuilder sb = threadBuffer.get(); StringBuilder sb = threadBuffer.get();
if (sb == null) { if (sb == null) {
sb = new StringBuilder(); sb = new StringBuilder();
@ -146,7 +147,7 @@ public class WriteLineDocTask extends PerfTask {
for (String f : fieldsToWrite) { for (String f : fieldsToWrite) {
sb.append(SEP).append(f); sb.append(SEP).append(f);
} }
lineFileOut.println(sb.toString()); out.println(sb.toString());
} }
@Override @Override
@ -181,12 +182,20 @@ public class WriteLineDocTask extends PerfTask {
if (sufficient) { if (sufficient) {
sb.setLength(sb.length()-1); // remove redundant last separator sb.setLength(sb.length()-1); // remove redundant last separator
// lineFileOut is a PrintWriter, which synchronizes internally in println. // lineFileOut is a PrintWriter, which synchronizes internally in println.
lineFileOut.println(sb.toString()); lineFileOut(doc).println(sb.toString());
} }
return 1; return 1;
} }
/**
* Selects the output line file for the document being written.
* <p>
* Default: the original output line file, regardless of the document.
* Subclasses may override this to route some documents to a different file.
*
* @param doc the document whose line is about to be written
* @return the writer that should receive the line for <code>doc</code>
*/
protected PrintWriter lineFileOut(Document doc) {
return lineFileOut;
}
@Override @Override
public void close() throws Exception { public void close() throws Exception {
lineFileOut.close(); lineFileOut.close();

View File

@ -0,0 +1,114 @@
package org.apache.lucene.benchmark.byTask.tasks;
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.util.Properties;
import java.util.concurrent.atomic.AtomicInteger;
import org.apache.lucene.benchmark.BenchmarkTestCase;
import org.apache.lucene.benchmark.byTask.PerfRunData;
import org.apache.lucene.benchmark.byTask.feeds.DocMaker;
import org.apache.lucene.benchmark.byTask.utils.Config;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.StringField;
/** Tests the functionality of {@link WriteEnwikiLineDocTask}. */
public class WriteEnwikiLineDocTaskTest extends BenchmarkTestCase {

  // class has to be public so that Class.forName.newInstance() will work
  /**
   * Interleaves category docs with regular docs: odd calls produce a regular
   * title, even calls produce a "Category:" title.
   */
  public static final class WriteLineCategoryDocMaker extends DocMaker {

    /** Counts calls to {@link #makeDocument()}; atomic so concurrent callers still alternate. */
    private final AtomicInteger flip = new AtomicInteger(0);

    @Override
    public Document makeDocument() throws Exception {
      boolean isCategory = (flip.incrementAndGet() % 2 == 0);
      Document doc = new Document();
      doc.add(new StringField(BODY_FIELD, "body text", Field.Store.NO));
      doc.add(new StringField(TITLE_FIELD, isCategory ? "Category:title text" : "title text", Field.Store.NO));
      doc.add(new StringField(DATE_FIELD, "date text", Field.Store.NO));
      return doc;
    }
  }

  /** Builds run data that uses the given doc maker and writes its line file to <code>file</code>. */
  private PerfRunData createPerfRunData(File file, String docMakerName) throws Exception {
    Properties props = new Properties();
    props.setProperty("doc.maker", docMakerName);
    props.setProperty("line.file.out", file.getAbsolutePath());
    props.setProperty("directory", "RAMDirectory"); // no accidental FS dir.
    Config config = new Config(props);
    return new PerfRunData(config);
  }

  /**
   * Verifies both output files: the original line file must hold two lines with
   * the plain title, and the derived categories file must hold two lines with
   * the same title prefixed by "Category:".
   */
  private void doReadTest(File file, String expTitle,
                          String expDate, String expBody) throws Exception {
    doReadTest(2, file, expTitle, expDate, expBody);
    File categoriesFile = WriteEnwikiLineDocTask.categoriesLineFile(file);
    doReadTest(2, categoriesFile, "Category:"+expTitle, expDate, expBody);
  }

  /**
   * Verifies that <code>file</code> consists of a header line followed by exactly
   * <code>n</code> lines, each carrying the expected title, date and (when
   * <code>expBody</code> is not null) body.
   */
  private void doReadTest(int n, File file, String expTitle, String expDate, String expBody) throws Exception {
    InputStream in = new FileInputStream(file);
    BufferedReader br = new BufferedReader(new InputStreamReader(in, "utf-8"));
    try {
      String line = br.readLine();
      WriteLineDocTaskTest.assertHeaderLine(line);
      for (int i=0; i<n; i++) {
        line = br.readLine();
        assertNotNull(line);
        String[] parts = line.split(Character.toString(WriteLineDocTask.SEP));
        int numExpParts = expBody == null ? 2 : 3;
        assertEquals(numExpParts, parts.length);
        assertEquals(expTitle, parts[0]);
        assertEquals(expDate, parts[1]);
        if (expBody != null) {
          assertEquals(expBody, parts[2]);
        }
      }
      // nothing may follow the n expected lines
      assertNull(br.readLine());
    } finally {
      br.close();
    }
  }

  public void testCategoryLines() throws Exception {
    // WriteEnwikiLineDocTask should send docs whose title starts with "Category:"
    // to the categories line file and all other docs to the original line file.
    File file = new File(getWorkDir(), "two-lines-each.txt");
    PerfRunData runData = createPerfRunData(file, WriteLineCategoryDocMaker.class.getName());
    WriteLineDocTask wldt = new WriteEnwikiLineDocTask(runData);
    try {
      for (int i=0; i<4; i++) { // four times so that each file should have 2 lines.
        wldt.doLogic();
      }
    } finally {
      // always release both line files, also when doLogic() fails
      wldt.close();
    }

    doReadTest(file, "title text", "date text", "body text");
  }
}

View File

@ -188,7 +188,7 @@ public class WriteLineDocTaskTest extends BenchmarkTestCase {
} }
} }
private void assertHeaderLine(String line) { static void assertHeaderLine(String line) {
assertTrue("First line should be a header line",line.startsWith(WriteLineDocTask.FIELDS_HEADER_INDICATOR)); assertTrue("First line should be a header line",line.startsWith(WriteLineDocTask.FIELDS_HEADER_INDICATOR));
} }