mirror of https://github.com/apache/lucene.git
LUCENE-4590: Added WriteEnwikiLineDocTask.
git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1418852 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent
2ae20cf77c
commit
d3da4138d6
|
@ -128,6 +128,11 @@ New Features
|
||||||
rule files in the ICU RuleBasedBreakIterator format.
|
rule files in the ICU RuleBasedBreakIterator format.
|
||||||
(Shawn Heisey, Robert Muir, Steve Rowe)
|
(Shawn Heisey, Robert Muir, Steve Rowe)
|
||||||
|
|
||||||
|
* LUCENE-4590: Added WriteEnwikiLineDocTask - a benchmark task for writing
|
||||||
|
Wikipedia category pages and non-category pages into separate line files.
|
||||||
|
extractWikipedia.alg was changed to use this task, so now it creates two
|
||||||
|
files. (Doron Cohen)
|
||||||
|
|
||||||
API Changes
|
API Changes
|
||||||
|
|
||||||
* LUCENE-4399: Deprecated AppendingCodec. Lucene's term dictionaries
|
* LUCENE-4399: Deprecated AppendingCodec. Lucene's term dictionaries
|
||||||
|
|
|
@ -41,4 +41,4 @@ content.source.forever=false
|
||||||
# -------------------------------------------------------------------------------------
|
# -------------------------------------------------------------------------------------
|
||||||
|
|
||||||
# Process all documents, appending each one to the line file:
|
# Process all documents, appending each one to the line file:
|
||||||
{WriteLineDoc() > : *
|
{WriteEnwikiLineDoc() > : *
|
||||||
|
|
|
@ -0,0 +1,70 @@
|
||||||
|
package org.apache.lucene.benchmark.byTask.tasks;
|
||||||
|
|
||||||
|
import java.io.BufferedWriter;
|
||||||
|
import java.io.File;
|
||||||
|
import java.io.OutputStream;
|
||||||
|
import java.io.OutputStreamWriter;
|
||||||
|
import java.io.PrintWriter;
|
||||||
|
|
||||||
|
import org.apache.lucene.benchmark.byTask.PerfRunData;
|
||||||
|
import org.apache.lucene.benchmark.byTask.feeds.DocMaker;
|
||||||
|
import org.apache.lucene.benchmark.byTask.utils.StreamUtils;
|
||||||
|
import org.apache.lucene.document.Document;
|
||||||
|
import org.apache.lucene.index.StorableField;
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||||
|
* contributor license agreements. See the NOTICE file distributed with
|
||||||
|
* this work for additional information regarding copyright ownership.
|
||||||
|
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||||
|
* (the "License"); you may not use this file except in compliance with
|
||||||
|
* the License. You may obtain a copy of the License at
|
||||||
|
*
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
* See the License for the specific language governing permissions and
|
||||||
|
* limitations under the License.
|
||||||
|
*/
|
||||||
|
|
||||||
|
/**
|
||||||
|
* A {@link WriteLineDocTask} which for Wikipedia input, will write category pages
|
||||||
|
* to another file, while remaining pages will be written to the original file.
|
||||||
|
* The categories file is derived from the original file, by adding a prefix "categories-".
|
||||||
|
*/
|
||||||
|
public class WriteEnwikiLineDocTask extends WriteLineDocTask {
|
||||||
|
|
||||||
|
private final PrintWriter categoryLineFileOut;
|
||||||
|
|
||||||
|
public WriteEnwikiLineDocTask(PerfRunData runData) throws Exception {
|
||||||
|
super(runData);
|
||||||
|
OutputStream out = StreamUtils.outputStream(categoriesLineFile(new File(fname)));
|
||||||
|
categoryLineFileOut = new PrintWriter(new BufferedWriter(new OutputStreamWriter(out, "UTF-8"), StreamUtils.BUFFER_SIZE));
|
||||||
|
writeHeader(categoryLineFileOut);
|
||||||
|
}
|
||||||
|
|
||||||
|
/** Compose categories line file out of original line file */
|
||||||
|
static File categoriesLineFile(File f) {
|
||||||
|
File dir = f.getParentFile();
|
||||||
|
String categoriesName = "categories-"+f.getName();
|
||||||
|
return dir==null ? new File(categoriesName) : new File(dir,categoriesName);
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public void close() throws Exception {
|
||||||
|
categoryLineFileOut.close();
|
||||||
|
super.close();
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
protected PrintWriter lineFileOut(Document doc) {
|
||||||
|
StorableField titleField = doc.getField(DocMaker.TITLE_FIELD);
|
||||||
|
if (titleField!=null && titleField.stringValue().startsWith("Category:")) {
|
||||||
|
return categoryLineFileOut;
|
||||||
|
}
|
||||||
|
return super.lineFileOut(doc);
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
|
@ -32,7 +32,6 @@ import org.apache.lucene.benchmark.byTask.feeds.DocMaker;
|
||||||
import org.apache.lucene.benchmark.byTask.utils.Config;
|
import org.apache.lucene.benchmark.byTask.utils.Config;
|
||||||
import org.apache.lucene.benchmark.byTask.utils.StreamUtils;
|
import org.apache.lucene.benchmark.byTask.utils.StreamUtils;
|
||||||
import org.apache.lucene.document.Document;
|
import org.apache.lucene.document.Document;
|
||||||
import org.apache.lucene.index.IndexableField;
|
|
||||||
import org.apache.lucene.index.StorableField;
|
import org.apache.lucene.index.StorableField;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
@ -84,18 +83,20 @@ public class WriteLineDocTask extends PerfTask {
|
||||||
public static final String DEFAULT_SUFFICIENT_FIELDS = DocMaker.TITLE_FIELD +',' + DocMaker.BODY_FIELD;
|
public static final String DEFAULT_SUFFICIENT_FIELDS = DocMaker.TITLE_FIELD +',' + DocMaker.BODY_FIELD;
|
||||||
|
|
||||||
private int docSize = 0;
|
private int docSize = 0;
|
||||||
private PrintWriter lineFileOut = null;
|
protected final String fname;
|
||||||
private DocMaker docMaker;
|
private final PrintWriter lineFileOut;
|
||||||
private ThreadLocal<StringBuilder> threadBuffer = new ThreadLocal<StringBuilder>();
|
private final DocMaker docMaker;
|
||||||
private ThreadLocal<Matcher> threadNormalizer = new ThreadLocal<Matcher>();
|
private final ThreadLocal<StringBuilder> threadBuffer = new ThreadLocal<StringBuilder>();
|
||||||
private final String[] fieldsToWrite;;
|
private final ThreadLocal<Matcher> threadNormalizer = new ThreadLocal<Matcher>();
|
||||||
|
private final String[] fieldsToWrite;
|
||||||
private final boolean[] sufficientFields;
|
private final boolean[] sufficientFields;
|
||||||
private final boolean checkSufficientFields;
|
private final boolean checkSufficientFields;
|
||||||
|
|
||||||
|
|
||||||
public WriteLineDocTask(PerfRunData runData) throws Exception {
|
public WriteLineDocTask(PerfRunData runData) throws Exception {
|
||||||
super(runData);
|
super(runData);
|
||||||
Config config = runData.getConfig();
|
Config config = runData.getConfig();
|
||||||
String fname = config.get("line.file.out", null);
|
fname = config.get("line.file.out", null);
|
||||||
if (fname == null) {
|
if (fname == null) {
|
||||||
throw new IllegalArgumentException("line.file.out must be set");
|
throw new IllegalArgumentException("line.file.out must be set");
|
||||||
}
|
}
|
||||||
|
@ -129,13 +130,13 @@ public class WriteLineDocTask extends PerfTask {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
writeHeader();
|
writeHeader(lineFileOut);
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Write a header to the lines file - indicating how to read the file later
|
* Write header to the lines file - indicating how to read the file later.
|
||||||
*/
|
*/
|
||||||
private void writeHeader() {
|
protected void writeHeader(PrintWriter out) {
|
||||||
StringBuilder sb = threadBuffer.get();
|
StringBuilder sb = threadBuffer.get();
|
||||||
if (sb == null) {
|
if (sb == null) {
|
||||||
sb = new StringBuilder();
|
sb = new StringBuilder();
|
||||||
|
@ -146,7 +147,7 @@ public class WriteLineDocTask extends PerfTask {
|
||||||
for (String f : fieldsToWrite) {
|
for (String f : fieldsToWrite) {
|
||||||
sb.append(SEP).append(f);
|
sb.append(SEP).append(f);
|
||||||
}
|
}
|
||||||
lineFileOut.println(sb.toString());
|
out.println(sb.toString());
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
|
@ -181,12 +182,20 @@ public class WriteLineDocTask extends PerfTask {
|
||||||
if (sufficient) {
|
if (sufficient) {
|
||||||
sb.setLength(sb.length()-1); // remove redundant last separator
|
sb.setLength(sb.length()-1); // remove redundant last separator
|
||||||
// lineFileOut is a PrintWriter, which synchronizes internally in println.
|
// lineFileOut is a PrintWriter, which synchronizes internally in println.
|
||||||
lineFileOut.println(sb.toString());
|
lineFileOut(doc).println(sb.toString());
|
||||||
}
|
}
|
||||||
|
|
||||||
return 1;
|
return 1;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Selects output line file by written doc.
|
||||||
|
* Default: original output line file.
|
||||||
|
*/
|
||||||
|
protected PrintWriter lineFileOut(Document doc) {
|
||||||
|
return lineFileOut;
|
||||||
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public void close() throws Exception {
|
public void close() throws Exception {
|
||||||
lineFileOut.close();
|
lineFileOut.close();
|
||||||
|
|
|
@ -0,0 +1,114 @@
|
||||||
|
package org.apache.lucene.benchmark.byTask.tasks;
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||||
|
* contributor license agreements. See the NOTICE file distributed with
|
||||||
|
* this work for additional information regarding copyright ownership.
|
||||||
|
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||||
|
* (the "License"); you may not use this file except in compliance with
|
||||||
|
* the License. You may obtain a copy of the License at
|
||||||
|
*
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
* See the License for the specific language governing permissions and
|
||||||
|
* limitations under the License.
|
||||||
|
*/
|
||||||
|
|
||||||
|
import java.io.BufferedReader;
|
||||||
|
import java.io.File;
|
||||||
|
import java.io.FileInputStream;
|
||||||
|
import java.io.InputStream;
|
||||||
|
import java.io.InputStreamReader;
|
||||||
|
import java.util.Properties;
|
||||||
|
import java.util.concurrent.atomic.AtomicInteger;
|
||||||
|
|
||||||
|
import org.apache.lucene.benchmark.BenchmarkTestCase;
|
||||||
|
import org.apache.lucene.benchmark.byTask.PerfRunData;
|
||||||
|
import org.apache.lucene.benchmark.byTask.feeds.DocMaker;
|
||||||
|
import org.apache.lucene.benchmark.byTask.utils.Config;
|
||||||
|
import org.apache.lucene.document.Document;
|
||||||
|
import org.apache.lucene.document.Field;
|
||||||
|
import org.apache.lucene.document.StringField;
|
||||||
|
|
||||||
|
/** Tests the functionality of {@link WriteEnwikiLineDocTask}. */
|
||||||
|
public class WriteEnwikiLineDocTaskTest extends BenchmarkTestCase {
|
||||||
|
|
||||||
|
|
||||||
|
// class has to be public so that Class.forName.newInstance() will work
|
||||||
|
/** Interleaves category docs with regular docs */
|
||||||
|
public static final class WriteLineCategoryDocMaker extends DocMaker {
|
||||||
|
|
||||||
|
AtomicInteger flip = new AtomicInteger(0);
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public Document makeDocument() throws Exception {
|
||||||
|
boolean isCategory = (flip.incrementAndGet() % 2 == 0);
|
||||||
|
Document doc = new Document();
|
||||||
|
doc.add(new StringField(BODY_FIELD, "body text", Field.Store.NO));
|
||||||
|
doc.add(new StringField(TITLE_FIELD, isCategory ? "Category:title text" : "title text", Field.Store.NO));
|
||||||
|
doc.add(new StringField(DATE_FIELD, "date text", Field.Store.NO));
|
||||||
|
return doc;
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
private PerfRunData createPerfRunData(File file, String docMakerName) throws Exception {
|
||||||
|
Properties props = new Properties();
|
||||||
|
props.setProperty("doc.maker", docMakerName);
|
||||||
|
props.setProperty("line.file.out", file.getAbsolutePath());
|
||||||
|
props.setProperty("directory", "RAMDirectory"); // no accidental FS dir.
|
||||||
|
Config config = new Config(props);
|
||||||
|
return new PerfRunData(config);
|
||||||
|
}
|
||||||
|
|
||||||
|
private void doReadTest(File file, String expTitle,
|
||||||
|
String expDate, String expBody) throws Exception {
|
||||||
|
doReadTest(2, file, expTitle, expDate, expBody);
|
||||||
|
File categoriesFile = WriteEnwikiLineDocTask.categoriesLineFile(file);
|
||||||
|
doReadTest(2, categoriesFile, "Category:"+expTitle, expDate, expBody);
|
||||||
|
}
|
||||||
|
|
||||||
|
private void doReadTest(int n, File file, String expTitle, String expDate, String expBody) throws Exception {
|
||||||
|
InputStream in = new FileInputStream(file);
|
||||||
|
BufferedReader br = new BufferedReader(new InputStreamReader(in, "utf-8"));
|
||||||
|
try {
|
||||||
|
String line = br.readLine();
|
||||||
|
WriteLineDocTaskTest.assertHeaderLine(line);
|
||||||
|
for (int i=0; i<n; i++) {
|
||||||
|
line = br.readLine();
|
||||||
|
assertNotNull(line);
|
||||||
|
String[] parts = line.split(Character.toString(WriteLineDocTask.SEP));
|
||||||
|
int numExpParts = expBody == null ? 2 : 3;
|
||||||
|
assertEquals(numExpParts, parts.length);
|
||||||
|
assertEquals(expTitle, parts[0]);
|
||||||
|
assertEquals(expDate, parts[1]);
|
||||||
|
if (expBody != null) {
|
||||||
|
assertEquals(expBody, parts[2]);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
assertNull(br.readLine());
|
||||||
|
} finally {
|
||||||
|
br.close();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
public void testCategoryLines() throws Exception {
|
||||||
|
// WriteLineDocTask replaced only \t characters w/ a space, since that's its
|
||||||
|
// separator char. However, it didn't replace newline characters, which
|
||||||
|
// resulted in errors in LineDocSource.
|
||||||
|
File file = new File(getWorkDir(), "two-lines-each.txt");
|
||||||
|
PerfRunData runData = createPerfRunData(file, WriteLineCategoryDocMaker.class.getName());
|
||||||
|
WriteLineDocTask wldt = new WriteEnwikiLineDocTask(runData);
|
||||||
|
for (int i=0; i<4; i++) { // four times so that each file should have 2 lines.
|
||||||
|
wldt.doLogic();
|
||||||
|
}
|
||||||
|
wldt.close();
|
||||||
|
|
||||||
|
doReadTest(file, "title text", "date text", "body text");
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
|
@ -188,7 +188,7 @@ public class WriteLineDocTaskTest extends BenchmarkTestCase {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
private void assertHeaderLine(String line) {
|
static void assertHeaderLine(String line) {
|
||||||
assertTrue("First line should be a header line",line.startsWith(WriteLineDocTask.FIELDS_HEADER_INDICATOR));
|
assertTrue("First line should be a header line",line.startsWith(WriteLineDocTask.FIELDS_HEADER_INDICATOR));
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
Loading…
Reference in New Issue