missing svn:eol-style

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1097216 13f79535-47bb-0310-9956-ffa450edef68
Robert Muir 2011-04-27 19:40:18 +00:00
parent 9accd70058
commit 1f67321074
11 changed files with 687 additions and 496 deletions

View File

@@ -1 +1,57 @@
package org.apache.lucene.search.highlight;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.IOException;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
/**
* This TokenFilter limits the number of tokens emitted while indexing by accumulating
* each token's offset length (end offset minus start offset) and stopping once the
* configured offset limit is reached.
*/
public final class OffsetLimitTokenFilter extends TokenFilter {
private int offsetCount;
private OffsetAttribute offsetAttrib = getAttribute(OffsetAttribute.class);
private int offsetLimit;
public OffsetLimitTokenFilter(TokenStream input, int offsetLimit) {
super(input);
this.offsetLimit = offsetLimit;
}
@Override
public boolean incrementToken() throws IOException {
if (offsetCount < offsetLimit && input.incrementToken()) {
int offsetLength = offsetAttrib.endOffset() - offsetAttrib.startOffset();
offsetCount += offsetLength;
return true;
}
return false;
}
@Override
public void reset() throws IOException {
super.reset();
offsetCount = 0;
}
}
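A minimal usage sketch of the new filter, assuming a 3.x/4.0-era analysis API; the analyzer, the field name "body", and the offset limit passed in are placeholders, not part of this commit:
import java.io.StringReader;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.search.highlight.OffsetLimitTokenFilter;
public class OffsetLimitExample {
  // Emit tokens until roughly 'limit' characters' worth of offsets have been consumed.
  static void printLimitedTokens(Analyzer analyzer, String text, int limit) throws Exception {
    // "body" is a hypothetical field name used only for this sketch.
    TokenStream ts = analyzer.tokenStream("body", new StringReader(text));
    TokenStream limited = new OffsetLimitTokenFilter(ts, limit);
    CharTermAttribute term = limited.addAttribute(CharTermAttribute.class);
    limited.reset();
    while (limited.incrementToken()) {
      System.out.println(term.toString()); // stops once the accumulated offset length reaches 'limit'
    }
    limited.end();
    limited.close();
  }
}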

View File

@@ -1 +1,60 @@
package org.apache.lucene.search.highlight;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.Reader;
import java.io.StringReader;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.analysis.MockTokenizer;
import org.apache.lucene.analysis.TokenStream;
public class OffsetLimitTokenFilterTest extends BaseTokenStreamTestCase {
public void testFilter() throws Exception {
TokenStream stream = new MockTokenizer(new StringReader(
"short toolong evenmuchlongertext a ab toolong foo"),
MockTokenizer.WHITESPACE, false);
OffsetLimitTokenFilter filter = new OffsetLimitTokenFilter(stream, 10);
assertTokenStreamContents(filter, new String[] {"short", "toolong"});
stream = new MockTokenizer(new StringReader(
"short toolong evenmuchlongertext a ab toolong foo"),
MockTokenizer.WHITESPACE, false);
filter = new OffsetLimitTokenFilter(stream, 12);
assertTokenStreamContents(filter, new String[] {"short", "toolong"});
stream = new MockTokenizer(new StringReader(
"short toolong evenmuchlongertext a ab toolong foo"),
MockTokenizer.WHITESPACE, false);
filter = new OffsetLimitTokenFilter(stream, 30);
assertTokenStreamContents(filter, new String[] {"short", "toolong",
"evenmuchlongertext"});
checkOneTermReuse(new Analyzer() {
@Override
public TokenStream tokenStream(String fieldName, Reader reader) {
return new OffsetLimitTokenFilter(new MockTokenizer(reader,
MockTokenizer.WHITESPACE, false), 10);
}
}, "llenges", "llenges");
}
}

View File

@@ -1,28 +1,28 @@
<html>
<!--
Licensed to the Apache Software Foundation (ASF) under one or more
contributor license agreements. See the NOTICE file distributed with
this work for additional information regarding copyright ownership.
The ASF licenses this file to You under the Apache License, Version 2.0
(the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
-->
<head>
<title>Apache Lucene Test Framework API</title>
</head>
<body>
<p>
The Lucene Test Framework is used by Lucene as the basis for its tests.
The framework can also be used for testing third-party code that uses
the Lucene API.
</p>
</body>
</html>

View File

@@ -1,136 +1,136 @@
package org.apache.lucene.benchmark.byTask.feeds;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.File;
import java.io.IOException;
import java.util.HashMap;
import java.util.Locale;
import java.util.Map;
/**
* Parser for trec doc content, invoked on doc text excluding <DOC> and <DOCNO>
* which are handled in TrecContentSource. Required to be stateless and hence thread safe.
*/
public abstract class TrecDocParser {
/** Types of trec parse paths. */
public enum ParsePathType { GOV2, FBIS, FT, FR94, LATIMES }
/** trec parser type used for unknown extensions */
public static final ParsePathType DEFAULT_PATH_TYPE = ParsePathType.GOV2;
static final Map<ParsePathType,TrecDocParser> pathType2parser = new HashMap<ParsePathType,TrecDocParser>();
static {
pathType2parser.put(ParsePathType.GOV2, new TrecGov2Parser());
pathType2parser.put(ParsePathType.FBIS, new TrecFBISParser());
pathType2parser.put(ParsePathType.FR94, new TrecFR94Parser());
pathType2parser.put(ParsePathType.FT, new TrecFTParser());
pathType2parser.put(ParsePathType.LATIMES, new TrecLATimesParser());
}
static final Map<String,ParsePathType> pathName2Type = new HashMap<String,ParsePathType>();
static {
for (ParsePathType ppt : ParsePathType.values()) {
pathName2Type.put(ppt.name().toUpperCase(Locale.ENGLISH),ppt);
}
}
/** Maximum number of path elements (the file and its ancestors) examined when looking for a known path type. */
private static final int MAX_PATH_LENGTH = 10;
/**
* Compute the path type of a file by inspecting the names of the file and its parents.
*/
public static ParsePathType pathType(File f) {
int pathLength = 0;
while (f != null && ++pathLength < MAX_PATH_LENGTH) {
ParsePathType ppt = pathName2Type.get(f.getName().toUpperCase(Locale.ENGLISH));
if (ppt!=null) {
return ppt;
}
f = f.getParentFile();
}
return DEFAULT_PATH_TYPE;
}
/**
* Parse the text prepared in docBuf into a result DocData;
* no synchronization is required.
* @param docData reusable result
* @param name name that should be set to the result
* @param trecSrc calling trec content source
* @param docBuf text to parse
* @param pathType type of parsed file, or null if unknown - may be used by
* parsers to alter their behavior according to the file path type.
*/
public abstract DocData parse(DocData docData, String name, TrecContentSource trecSrc,
StringBuilder docBuf, ParsePathType pathType) throws IOException, InterruptedException;
/**
* strip tags from <code>buf</code>: each tag is replaced by a single blank.
* @return text obtained when stripping all tags from <code>buf</code> (Input StringBuilder is unmodified).
*/
public static String stripTags(StringBuilder buf, int start) {
return stripTags(buf.substring(start),0);
}
/**
* strip tags from input.
* @see #stripTags(StringBuilder, int)
*/
public static String stripTags(String buf, int start) {
if (start>0) {
buf = buf.substring(start); // drop the leading 'start' characters before stripping
}
return buf.replaceAll("<[^>]*>", " ");
}
/**
* Extract from <code>buf</code> the text of interest within specified tags
* @param buf entire input text
* @param startTag tag marking start of text of interest
* @param endTag tag marking end of text of interest
* @param maxPos if &ge; 0 sets a limit on start of text of interest
* @return text of interest or null if not found
*/
public static String extract(StringBuilder buf, String startTag, String endTag, int maxPos, String noisePrefixes[]) {
int k1 = buf.indexOf(startTag);
if (k1>=0 && (maxPos<0 || k1<maxPos)) {
k1 += startTag.length();
int k2 = buf.indexOf(endTag,k1);
if (k2>=0 && (maxPos<0 || k2<maxPos)) { // found end tag with allowed range
if (noisePrefixes != null) {
for (String noise : noisePrefixes) {
int k1a = buf.indexOf(noise,k1);
if (k1a>=0 && k1a<k2) {
k1 = k1a + noise.length();
}
}
}
return buf.substring(k1,k2).trim();
}
}
return null;
}
//public static void main(String[] args) {
// System.out.println(stripTags("is it true that<space>2<<second space>><almost last space>1<one more space>?",0));
//}
}
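For context, a small sketch of what the extract and stripTags helpers return; the TREC-like markup below is a toy example invented for illustration:
import org.apache.lucene.benchmark.byTask.feeds.TrecDocParser;
public class TrecHelpersExample {
  public static void main(String[] args) {
    StringBuilder docBuf = new StringBuilder(
        "<HEADER><DATE1> 12 May 1996 </DATE1></HEADER><TEXT>some body text</TEXT>");
    // Pull the trimmed text between the DATE1 tags; -1 means no position limit,
    // null means no noise prefixes to skip.
    String date = TrecDocParser.extract(docBuf, "<DATE1>", "</DATE1>", -1, null); // "12 May 1996"
    // Replace every tag with a single blank, leaving only the raw text.
    String plain = TrecDocParser.stripTags(docBuf, 0);
    System.out.println(date);
    System.out.println(plain.trim());
  }
}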

View File

@@ -1,65 +1,65 @@
package org.apache.lucene.benchmark.byTask.feeds;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.IOException;
import java.util.Date;
/**
* Parser for the FBIS docs in trec disks 4+5 collection format
*/
public class TrecFBISParser extends TrecDocParser {
private static final String HEADER = "<HEADER>";
private static final String HEADER_END = "</HEADER>";
private static final int HEADER_END_LENGTH = HEADER_END.length();
private static final String DATE1 = "<DATE1>";
private static final String DATE1_END = "</DATE1>";
private static final String TI = "<TI>";
private static final String TI_END = "</TI>";
@Override
public DocData parse(DocData docData, String name, TrecContentSource trecSrc,
StringBuilder docBuf, ParsePathType pathType) throws IOException, InterruptedException {
int mark = 0; // that much is skipped
// optionally skip some of the text, set date, title
Date date = null;
String title = null;
int h1 = docBuf.indexOf(HEADER);
if (h1>=0) {
int h2 = docBuf.indexOf(HEADER_END,h1);
mark = h2+HEADER_END_LENGTH;
// date...
String dateStr = extract(docBuf, DATE1, DATE1_END, h2, null);
if (dateStr != null) {
date = trecSrc.parseDate(dateStr);
}
// title...
title = extract(docBuf, TI, TI_END, h2, null);
}
docData.clear();
docData.setName(name);
docData.setDate(date);
docData.setTitle(title);
docData.setBody(stripTags(docBuf, mark).toString());
return docData;
}
}

View File

@@ -1,66 +1,66 @@
package org.apache.lucene.benchmark.byTask.feeds;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.IOException;
import java.util.Date;
/**
* Parser for the FR94 docs in trec disks 4+5 collection format
*/
public class TrecFR94Parser extends TrecDocParser {
private static final String TEXT = "<TEXT>";
private static final int TEXT_LENGTH = TEXT.length();
private static final String TEXT_END = "</TEXT>";
private static final String DATE = "<DATE>";
private static final String[] DATE_NOISE_PREFIXES = {
"DATE:",
"date:", //TODO improve date extraction for this format
"t.c.",
};
private static final String DATE_END = "</DATE>";
//TODO can we also extract title for this format?
@Override
public DocData parse(DocData docData, String name, TrecContentSource trecSrc,
StringBuilder docBuf, ParsePathType pathType) throws IOException, InterruptedException {
int mark = 0; // that much is skipped
// optionally skip some of the text, set date (no title?)
Date date = null;
int h1 = docBuf.indexOf(TEXT);
if (h1>=0) {
int h2 = docBuf.indexOf(TEXT_END,h1);
mark = h1+TEXT_LENGTH;
// date...
String dateStr = extract(docBuf, DATE, DATE_END, h2, DATE_NOISE_PREFIXES);
if (dateStr != null) {
dateStr = stripTags(dateStr,0).toString();
date = trecSrc.parseDate(dateStr.trim());
}
}
docData.clear();
docData.setName(name);
docData.setDate(date);
docData.setBody(stripTags(docBuf, mark).toString());
return docData;
}
}

View File

@@ -1,57 +1,57 @@
package org.apache.lucene.benchmark.byTask.feeds;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.IOException;
import java.util.Date;
/**
* Parser for the FT docs in trec disks 4+5 collection format
*/
public class TrecFTParser extends TrecDocParser {
private static final String DATE = "<DATE>";
private static final String DATE_END = "</DATE>";
private static final String HEADLINE = "<HEADLINE>";
private static final String HEADLINE_END = "</HEADLINE>";
@Override
public DocData parse(DocData docData, String name, TrecContentSource trecSrc,
StringBuilder docBuf, ParsePathType pathType) throws IOException, InterruptedException {
int mark = 0; // that much is skipped
// date...
Date date = null;
String dateStr = extract(docBuf, DATE, DATE_END, -1, null);
if (dateStr != null) {
date = trecSrc.parseDate(dateStr);
}
// title...
String title = extract(docBuf, HEADLINE, HEADLINE_END, -1, null);
docData.clear();
docData.setName(name);
docData.setDate(date);
docData.setTitle(title);
docData.setBody(stripTags(docBuf, mark).toString());
return docData;
}
}

View File

@@ -1,71 +1,71 @@
package org.apache.lucene.benchmark.byTask.feeds;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.IOException;
import java.util.Date;
/**
* Parser for the LA Times docs in trec disks 4+5 collection format
*/
public class TrecLATimesParser extends TrecDocParser {
private static final String DATE = "<DATE>";
private static final String DATE_END = "</DATE>";
private static final String DATE_NOISE = "day,"; // anything after the ','
private static final String SUBJECT = "<SUBJECT>";
private static final String SUBJECT_END = "</SUBJECT>";
private static final String HEADLINE = "<HEADLINE>";
private static final String HEADLINE_END = "</HEADLINE>";
@Override
public DocData parse(DocData docData, String name, TrecContentSource trecSrc,
StringBuilder docBuf, ParsePathType pathType) throws IOException, InterruptedException {
int mark = 0; // that much is skipped
// date...
Date date = null;
String dateStr = extract(docBuf, DATE, DATE_END, -1, null);
if (dateStr != null) {
int d2a = dateStr.indexOf(DATE_NOISE);
if (d2a > 0) {
dateStr = dateStr.substring(0,d2a+3); // we need the "day" part
}
dateStr = stripTags(dateStr,0).toString();
date = trecSrc.parseDate(dateStr.trim());
}
// title... first try with SUBJECT, then with HEADLINE
String title = extract(docBuf, SUBJECT, SUBJECT_END, -1, null);
if (title==null) {
title = extract(docBuf, HEADLINE, HEADLINE_END, -1, null);
}
if (title!=null) {
title = stripTags(title,0).toString().trim();
}
docData.clear();
docData.setName(name);
docData.setDate(date);
docData.setTitle(title);
docData.setBody(stripTags(docBuf, mark).toString());
return docData;
}
}

View File

@@ -1,33 +1,33 @@
package org.apache.lucene.benchmark.byTask.feeds;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.IOException;
/**
* Parser for trec docs which selects the parser to apply according
* to the source file's path, defaulting to {@link TrecGov2Parser}.
*/
public class TrecParserByPath extends TrecDocParser {
@Override
public DocData parse(DocData docData, String name, TrecContentSource trecSrc,
StringBuilder docBuf, ParsePathType pathType) throws IOException, InterruptedException {
return pathType2parser.get(pathType).parse(docData, name, trecSrc, docBuf, pathType);
}
}
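A hedged illustration of the path-based selection these parsers implement; the directory layout below is hypothetical and only meant to show how pathType() resolves a ParsePathType:
import java.io.File;
import org.apache.lucene.benchmark.byTask.feeds.TrecDocParser;
import org.apache.lucene.benchmark.byTask.feeds.TrecDocParser.ParsePathType;
public class TrecPathTypeExample {
  public static void main(String[] args) {
    // pathType() walks up a bounded number of ancestors looking for a known
    // directory name (GOV2, FBIS, FT, FR94, LATIMES); unknown paths fall back to GOV2.
    File f = new File("/data/trec/disk5/LATIMES/la010189"); // hypothetical path
    ParsePathType type = TrecDocParser.pathType(f);          // -> ParsePathType.LATIMES
    System.out.println(type);
    // TrecParserByPath.parse(...) then looks this type up in the package-private
    // pathType2parser map and delegates to the matching per-collection parser.
  }
}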

View File

@@ -1,37 +1,37 @@
package org.apache.lucene.benchmark.byTask.tasks.alt;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import org.apache.lucene.benchmark.byTask.PerfRunData;
import org.apache.lucene.benchmark.byTask.tasks.PerfTask;
/**
* {@link PerfTask} which does nothing, but is in a different package
*/
public class AltTestTask extends PerfTask {
public AltTestTask(PerfRunData runData) {
super(runData);
}
@Override
public int doLogic() throws Exception {
return 0;
}
}

View File

@@ -1 +1,77 @@
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.solr.update;
import java.util.HashMap;
import org.apache.solr.common.params.MapSolrParams;
import org.apache.solr.common.params.UpdateParams;
import org.apache.solr.core.*;
import org.apache.solr.handler.XmlUpdateRequestHandler;
import org.apache.solr.request.SolrQueryRequestBase;
import org.apache.solr.response.SolrQueryResponse;
import org.apache.solr.util.AbstractSolrTestCase;
public class UpdateParamsTest extends AbstractSolrTestCase {
@Override
public String getSchemaFile() { return "schema.xml"; }
@Override
public String getSolrConfigFile() { return "solrconfig.xml"; }
/**
* Tests that both update.chain and update.processor work
* NOTE: This test will fail when support for update.processor is removed and should then be removed
*/
public void testUpdateProcessorParamDeprecation() throws Exception {
SolrCore core = h.getCore();
XmlUpdateRequestHandler handler = new XmlUpdateRequestHandler();
handler.init( null );
MapSolrParams params = new MapSolrParams( new HashMap<String, String>() );
params.getMap().put(UpdateParams.UPDATE_CHAIN_DEPRECATED, "nonexistant");
// Add a single document
SolrQueryResponse rsp = new SolrQueryResponse();
SolrQueryRequestBase req = new SolrQueryRequestBase( core, params ) {};
// First check that the old param behaves as it should
try {
handler.handleRequestBody(req, rsp);
assertFalse("Faulty update.processor parameter (deprecated but should work) not causing an error - i.e. it is not detected", true);
} catch (Exception e) {
assertEquals("Got wrong exception while testing update.chain", e.getMessage(), "unknown UpdateRequestProcessorChain: nonexistant");
}
// Then check that the new param behaves correctly
params.getMap().remove(UpdateParams.UPDATE_CHAIN_DEPRECATED);
params.getMap().put(UpdateParams.UPDATE_CHAIN, "nonexistant");
req.setParams(params);
try {
handler.handleRequestBody(req, rsp);
assertFalse("Faulty update.chain parameter not causing an error - i.e. it is not detected", true);
} catch (Exception e) {
assertEquals("Got wrong exception while testing update.chain", e.getMessage(), "unknown UpdateRequestProcessorChain: nonexistant");
}
}
}
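For reference, a minimal sketch of how a caller would select a processor chain with the non-deprecated parameter; the chain name "mychain" is hypothetical and would have to be defined in solrconfig.xml:
import java.util.HashMap;
import org.apache.solr.common.params.MapSolrParams;
import org.apache.solr.common.params.UpdateParams;
public class UpdateChainParamExample {
  public static MapSolrParams chainParams() {
    // Build request parameters that route the update through the hypothetical
    // "mychain" UpdateRequestProcessorChain via the update.chain parameter.
    MapSolrParams params = new MapSolrParams(new HashMap<String, String>());
    params.getMap().put(UpdateParams.UPDATE_CHAIN, "mychain");
    return params;
  }
}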