mirror of https://github.com/apache/lucene.git
LUCENE-4590: Added WriteEnwikiLineDocTask.
git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1418852 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent
2ae20cf77c
commit
d3da4138d6
|
@ -128,6 +128,11 @@ New Features
|
|||
rule files in the ICU RuleBasedBreakIterator format.
|
||||
(Shawn Heisey, Robert Muir, Steve Rowe)
|
||||
|
||||
* LUCENE-4590: Added WriteEnwikiLineDocTask - a benchmark task for writing
|
||||
Wikipedia category pages and non-category pages into separate line files.
|
||||
extractWikipedia.alg was changed to use this task, so now it creates two
|
||||
files. (Doron Cohen)
|
||||
|
||||
API Changes
|
||||
|
||||
* LUCENE-4399: Deprecated AppendingCodec. Lucene's term dictionaries
|
||||
|
|
|
@ -41,4 +41,4 @@ content.source.forever=false
|
|||
# -------------------------------------------------------------------------------------
|
||||
|
||||
# Process all documents, appending each one to the line file:
|
||||
{WriteLineDoc() > : *
|
||||
{WriteEnwikiLineDoc() > : *
|
||||
|
|
|
@ -0,0 +1,70 @@
|
|||
package org.apache.lucene.benchmark.byTask.tasks;
|
||||
|
||||
import java.io.BufferedWriter;
|
||||
import java.io.File;
|
||||
import java.io.OutputStream;
|
||||
import java.io.OutputStreamWriter;
|
||||
import java.io.PrintWriter;
|
||||
|
||||
import org.apache.lucene.benchmark.byTask.PerfRunData;
|
||||
import org.apache.lucene.benchmark.byTask.feeds.DocMaker;
|
||||
import org.apache.lucene.benchmark.byTask.utils.StreamUtils;
|
||||
import org.apache.lucene.document.Document;
|
||||
import org.apache.lucene.index.StorableField;
|
||||
|
||||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
/**
|
||||
* A {@link WriteLineDocTask} which for Wikipedia input, will write category pages
|
||||
* to another file, while remaining pages will be written to the original file.
|
||||
* The categories file is derived from the original file, by adding a prefix "categories-".
|
||||
*/
|
||||
public class WriteEnwikiLineDocTask extends WriteLineDocTask {
|
||||
|
||||
private final PrintWriter categoryLineFileOut;
|
||||
|
||||
public WriteEnwikiLineDocTask(PerfRunData runData) throws Exception {
|
||||
super(runData);
|
||||
OutputStream out = StreamUtils.outputStream(categoriesLineFile(new File(fname)));
|
||||
categoryLineFileOut = new PrintWriter(new BufferedWriter(new OutputStreamWriter(out, "UTF-8"), StreamUtils.BUFFER_SIZE));
|
||||
writeHeader(categoryLineFileOut);
|
||||
}
|
||||
|
||||
/** Compose categories line file out of original line file */
|
||||
static File categoriesLineFile(File f) {
|
||||
File dir = f.getParentFile();
|
||||
String categoriesName = "categories-"+f.getName();
|
||||
return dir==null ? new File(categoriesName) : new File(dir,categoriesName);
|
||||
}
|
||||
|
||||
@Override
|
||||
public void close() throws Exception {
|
||||
categoryLineFileOut.close();
|
||||
super.close();
|
||||
}
|
||||
|
||||
@Override
|
||||
protected PrintWriter lineFileOut(Document doc) {
|
||||
StorableField titleField = doc.getField(DocMaker.TITLE_FIELD);
|
||||
if (titleField!=null && titleField.stringValue().startsWith("Category:")) {
|
||||
return categoryLineFileOut;
|
||||
}
|
||||
return super.lineFileOut(doc);
|
||||
}
|
||||
|
||||
}
|
|
@ -32,7 +32,6 @@ import org.apache.lucene.benchmark.byTask.feeds.DocMaker;
|
|||
import org.apache.lucene.benchmark.byTask.utils.Config;
|
||||
import org.apache.lucene.benchmark.byTask.utils.StreamUtils;
|
||||
import org.apache.lucene.document.Document;
|
||||
import org.apache.lucene.index.IndexableField;
|
||||
import org.apache.lucene.index.StorableField;
|
||||
|
||||
/**
|
||||
|
@ -84,18 +83,20 @@ public class WriteLineDocTask extends PerfTask {
|
|||
public static final String DEFAULT_SUFFICIENT_FIELDS = DocMaker.TITLE_FIELD +',' + DocMaker.BODY_FIELD;
|
||||
|
||||
private int docSize = 0;
|
||||
private PrintWriter lineFileOut = null;
|
||||
private DocMaker docMaker;
|
||||
private ThreadLocal<StringBuilder> threadBuffer = new ThreadLocal<StringBuilder>();
|
||||
private ThreadLocal<Matcher> threadNormalizer = new ThreadLocal<Matcher>();
|
||||
private final String[] fieldsToWrite;;
|
||||
protected final String fname;
|
||||
private final PrintWriter lineFileOut;
|
||||
private final DocMaker docMaker;
|
||||
private final ThreadLocal<StringBuilder> threadBuffer = new ThreadLocal<StringBuilder>();
|
||||
private final ThreadLocal<Matcher> threadNormalizer = new ThreadLocal<Matcher>();
|
||||
private final String[] fieldsToWrite;
|
||||
private final boolean[] sufficientFields;
|
||||
private final boolean checkSufficientFields;
|
||||
|
||||
|
||||
public WriteLineDocTask(PerfRunData runData) throws Exception {
|
||||
super(runData);
|
||||
Config config = runData.getConfig();
|
||||
String fname = config.get("line.file.out", null);
|
||||
fname = config.get("line.file.out", null);
|
||||
if (fname == null) {
|
||||
throw new IllegalArgumentException("line.file.out must be set");
|
||||
}
|
||||
|
@ -129,13 +130,13 @@ public class WriteLineDocTask extends PerfTask {
|
|||
}
|
||||
}
|
||||
|
||||
writeHeader();
|
||||
writeHeader(lineFileOut);
|
||||
}
|
||||
|
||||
/**
|
||||
* Write a header to the lines file - indicating how to read the file later
|
||||
* Write header to the lines file - indicating how to read the file later.
|
||||
*/
|
||||
private void writeHeader() {
|
||||
protected void writeHeader(PrintWriter out) {
|
||||
StringBuilder sb = threadBuffer.get();
|
||||
if (sb == null) {
|
||||
sb = new StringBuilder();
|
||||
|
@ -146,7 +147,7 @@ public class WriteLineDocTask extends PerfTask {
|
|||
for (String f : fieldsToWrite) {
|
||||
sb.append(SEP).append(f);
|
||||
}
|
||||
lineFileOut.println(sb.toString());
|
||||
out.println(sb.toString());
|
||||
}
|
||||
|
||||
@Override
|
||||
|
@ -181,12 +182,20 @@ public class WriteLineDocTask extends PerfTask {
|
|||
if (sufficient) {
|
||||
sb.setLength(sb.length()-1); // remove redundant last separator
|
||||
// lineFileOut is a PrintWriter, which synchronizes internally in println.
|
||||
lineFileOut.println(sb.toString());
|
||||
lineFileOut(doc).println(sb.toString());
|
||||
}
|
||||
|
||||
return 1;
|
||||
}
|
||||
|
||||
/**
|
||||
* Selects output line file by written doc.
|
||||
* Default: original output line file.
|
||||
*/
|
||||
protected PrintWriter lineFileOut(Document doc) {
|
||||
return lineFileOut;
|
||||
}
|
||||
|
||||
@Override
|
||||
public void close() throws Exception {
|
||||
lineFileOut.close();
|
||||
|
|
|
@ -0,0 +1,114 @@
|
|||
package org.apache.lucene.benchmark.byTask.tasks;
|
||||
|
||||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
import java.io.BufferedReader;
|
||||
import java.io.File;
|
||||
import java.io.FileInputStream;
|
||||
import java.io.InputStream;
|
||||
import java.io.InputStreamReader;
|
||||
import java.util.Properties;
|
||||
import java.util.concurrent.atomic.AtomicInteger;
|
||||
|
||||
import org.apache.lucene.benchmark.BenchmarkTestCase;
|
||||
import org.apache.lucene.benchmark.byTask.PerfRunData;
|
||||
import org.apache.lucene.benchmark.byTask.feeds.DocMaker;
|
||||
import org.apache.lucene.benchmark.byTask.utils.Config;
|
||||
import org.apache.lucene.document.Document;
|
||||
import org.apache.lucene.document.Field;
|
||||
import org.apache.lucene.document.StringField;
|
||||
|
||||
/** Tests the functionality of {@link WriteEnwikiLineDocTask}. */
|
||||
public class WriteEnwikiLineDocTaskTest extends BenchmarkTestCase {
|
||||
|
||||
|
||||
// class has to be public so that Class.forName.newInstance() will work
|
||||
/** Interleaves category docs with regular docs */
|
||||
public static final class WriteLineCategoryDocMaker extends DocMaker {
|
||||
|
||||
AtomicInteger flip = new AtomicInteger(0);
|
||||
|
||||
@Override
|
||||
public Document makeDocument() throws Exception {
|
||||
boolean isCategory = (flip.incrementAndGet() % 2 == 0);
|
||||
Document doc = new Document();
|
||||
doc.add(new StringField(BODY_FIELD, "body text", Field.Store.NO));
|
||||
doc.add(new StringField(TITLE_FIELD, isCategory ? "Category:title text" : "title text", Field.Store.NO));
|
||||
doc.add(new StringField(DATE_FIELD, "date text", Field.Store.NO));
|
||||
return doc;
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
private PerfRunData createPerfRunData(File file, String docMakerName) throws Exception {
|
||||
Properties props = new Properties();
|
||||
props.setProperty("doc.maker", docMakerName);
|
||||
props.setProperty("line.file.out", file.getAbsolutePath());
|
||||
props.setProperty("directory", "RAMDirectory"); // no accidental FS dir.
|
||||
Config config = new Config(props);
|
||||
return new PerfRunData(config);
|
||||
}
|
||||
|
||||
private void doReadTest(File file, String expTitle,
|
||||
String expDate, String expBody) throws Exception {
|
||||
doReadTest(2, file, expTitle, expDate, expBody);
|
||||
File categoriesFile = WriteEnwikiLineDocTask.categoriesLineFile(file);
|
||||
doReadTest(2, categoriesFile, "Category:"+expTitle, expDate, expBody);
|
||||
}
|
||||
|
||||
private void doReadTest(int n, File file, String expTitle, String expDate, String expBody) throws Exception {
|
||||
InputStream in = new FileInputStream(file);
|
||||
BufferedReader br = new BufferedReader(new InputStreamReader(in, "utf-8"));
|
||||
try {
|
||||
String line = br.readLine();
|
||||
WriteLineDocTaskTest.assertHeaderLine(line);
|
||||
for (int i=0; i<n; i++) {
|
||||
line = br.readLine();
|
||||
assertNotNull(line);
|
||||
String[] parts = line.split(Character.toString(WriteLineDocTask.SEP));
|
||||
int numExpParts = expBody == null ? 2 : 3;
|
||||
assertEquals(numExpParts, parts.length);
|
||||
assertEquals(expTitle, parts[0]);
|
||||
assertEquals(expDate, parts[1]);
|
||||
if (expBody != null) {
|
||||
assertEquals(expBody, parts[2]);
|
||||
}
|
||||
}
|
||||
assertNull(br.readLine());
|
||||
} finally {
|
||||
br.close();
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
public void testCategoryLines() throws Exception {
|
||||
// WriteLineDocTask replaced only \t characters w/ a space, since that's its
|
||||
// separator char. However, it didn't replace newline characters, which
|
||||
// resulted in errors in LineDocSource.
|
||||
File file = new File(getWorkDir(), "two-lines-each.txt");
|
||||
PerfRunData runData = createPerfRunData(file, WriteLineCategoryDocMaker.class.getName());
|
||||
WriteLineDocTask wldt = new WriteEnwikiLineDocTask(runData);
|
||||
for (int i=0; i<4; i++) { // four times so that each file should have 2 lines.
|
||||
wldt.doLogic();
|
||||
}
|
||||
wldt.close();
|
||||
|
||||
doReadTest(file, "title text", "date text", "body text");
|
||||
}
|
||||
|
||||
}
|
|
@ -188,7 +188,7 @@ public class WriteLineDocTaskTest extends BenchmarkTestCase {
|
|||
}
|
||||
}
|
||||
|
||||
private void assertHeaderLine(String line) {
|
||||
static void assertHeaderLine(String line) {
|
||||
assertTrue("First line should be a header line",line.startsWith(WriteLineDocTask.FIELDS_HEADER_INDICATOR));
|
||||
}
|
||||
|
||||
|
|
Loading…
Reference in New Issue