mirror of https://github.com/apache/lucene.git
missing svn:eol-style
git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/branches/realtime_search@1097225 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent
4d73a6f931
commit
9ba804541d
|
@ -1 +1,57 @@
|
|||
package org.apache.lucene.search.highlight;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.IOException;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
/**
 * A TokenFilter that stops returning tokens once the accumulated token length
 * (end offset minus start offset, summed over the tokens already returned)
 * has reached the configured limit.
 */
public final class OffsetLimitTokenFilter extends TokenFilter {

  /** Limit for the accumulated token length, as passed to the constructor. */
  private int offsetLimit;

  /** Offset attribute obtained from this stream when the filter is created. */
  private OffsetAttribute offsetAttrib = getAttribute(OffsetAttribute.class);

  /** Sum of (endOffset - startOffset) over the tokens returned since the last reset. */
  private int offsetCount;

  /**
   * @param input stream whose tokens are passed through
   * @param offsetLimit accumulated token length at which the filter stops
   */
  public OffsetLimitTokenFilter(TokenStream input, int offsetLimit) {
    super(input);
    this.offsetLimit = offsetLimit;
  }

  /**
   * Advances the wrapped stream unless the limit was already reached.
   *
   * @return true if a token was obtained from the wrapped stream, false once
   *         the limit is reached or the wrapped stream is exhausted
   */
  @Override
  public boolean incrementToken() throws IOException {
    // Limit reached: the wrapped stream is not consulted any more.
    if (offsetCount >= offsetLimit) {
      return false;
    }
    if (!input.incrementToken()) {
      return false;
    }
    // The check above runs before the token is counted, so the token that
    // crosses the limit is still returned.
    offsetCount += offsetAttrib.endOffset() - offsetAttrib.startOffset();
    return true;
  }

  /** Resets the wrapped stream state and starts counting from zero again. */
  @Override
  public void reset() throws IOException {
    super.reset();
    offsetCount = 0;
  }
}
|
||||
package org.apache.lucene.search.highlight;
|
||||
|
||||
/**
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
import java.io.IOException;
|
||||
|
||||
import org.apache.lucene.analysis.TokenFilter;
|
||||
import org.apache.lucene.analysis.TokenStream;
|
||||
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
|
||||
|
||||
/**
|
||||
* This TokenFilter limits the number of tokens while indexing by adding up the
|
||||
* current offset.
|
||||
*/
|
||||
public final class OffsetLimitTokenFilter extends TokenFilter {
|
||||
|
||||
private int offsetCount;
|
||||
private OffsetAttribute offsetAttrib = getAttribute(OffsetAttribute.class);
|
||||
private int offsetLimit;
|
||||
|
||||
public OffsetLimitTokenFilter(TokenStream input, int offsetLimit) {
|
||||
super(input);
|
||||
this.offsetLimit = offsetLimit;
|
||||
}
|
||||
|
||||
@Override
|
||||
public boolean incrementToken() throws IOException {
|
||||
if (offsetCount < offsetLimit && input.incrementToken()) {
|
||||
int offsetLength = offsetAttrib.endOffset() - offsetAttrib.startOffset();
|
||||
offsetCount += offsetLength;
|
||||
return true;
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
@Override
|
||||
public void reset() throws IOException {
|
||||
super.reset();
|
||||
offsetCount = 0;
|
||||
}
|
||||
|
||||
}
|
||||
|
|
|
@ -1 +1,60 @@
|
|||
package org.apache.lucene.search.highlight;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.Reader;
import java.io.StringReader;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.analysis.MockTokenizer;
import org.apache.lucene.analysis.TokenStream;
/** Checks OffsetLimitTokenFilter against a fixed whitespace-tokenized text for several limits. */
public class OffsetLimitTokenFilterTest extends BaseTokenStreamTestCase {

  /** Builds a fresh whitespace tokenizer over the sample text, wrapped with the given limit. */
  private OffsetLimitTokenFilter limitedTo(int offsetLimit) {
    TokenStream tokens = new MockTokenizer(new StringReader(
        "short toolong evenmuchlongertext a ab toolong foo"),
        MockTokenizer.WHITESPACE, false);
    return new OffsetLimitTokenFilter(tokens, offsetLimit);
  }

  public void testFilter() throws Exception {
    // "short" (5) + "toolong" (7) = 12: both limits 10 and 12 are reached after two tokens.
    assertTokenStreamContents(limitedTo(10), new String[] {"short", "toolong"});
    assertTokenStreamContents(limitedTo(12), new String[] {"short", "toolong"});

    // 12 < 30, so the third token is still returned; afterwards the sum is 30.
    assertTokenStreamContents(limitedTo(30), new String[] {"short", "toolong",
        "evenmuchlongertext"});

    Analyzer analyzer = new Analyzer() {
      @Override
      public TokenStream tokenStream(String fieldName, Reader reader) {
        TokenStream tokens = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false);
        return new OffsetLimitTokenFilter(tokens, 10);
      }
    };
    checkOneTermReuse(analyzer, "llenges", "llenges");
  }
}
|
||||
package org.apache.lucene.search.highlight;
|
||||
|
||||
/**
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
import java.io.Reader;
|
||||
import java.io.StringReader;
|
||||
|
||||
import org.apache.lucene.analysis.Analyzer;
|
||||
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
|
||||
import org.apache.lucene.analysis.MockTokenizer;
|
||||
import org.apache.lucene.analysis.TokenStream;
|
||||
|
||||
public class OffsetLimitTokenFilterTest extends BaseTokenStreamTestCase {
|
||||
|
||||
public void testFilter() throws Exception {
|
||||
TokenStream stream = new MockTokenizer(new StringReader(
|
||||
"short toolong evenmuchlongertext a ab toolong foo"),
|
||||
MockTokenizer.WHITESPACE, false);
|
||||
OffsetLimitTokenFilter filter = new OffsetLimitTokenFilter(stream, 10);
|
||||
assertTokenStreamContents(filter, new String[] {"short", "toolong"});
|
||||
|
||||
stream = new MockTokenizer(new StringReader(
|
||||
"short toolong evenmuchlongertext a ab toolong foo"),
|
||||
MockTokenizer.WHITESPACE, false);
|
||||
filter = new OffsetLimitTokenFilter(stream, 12);
|
||||
assertTokenStreamContents(filter, new String[] {"short", "toolong"});
|
||||
|
||||
stream = new MockTokenizer(new StringReader(
|
||||
"short toolong evenmuchlongertext a ab toolong foo"),
|
||||
MockTokenizer.WHITESPACE, false);
|
||||
filter = new OffsetLimitTokenFilter(stream, 30);
|
||||
assertTokenStreamContents(filter, new String[] {"short", "toolong",
|
||||
"evenmuchlongertext"});
|
||||
|
||||
|
||||
checkOneTermReuse(new Analyzer() {
|
||||
|
||||
@Override
|
||||
public TokenStream tokenStream(String fieldName, Reader reader) {
|
||||
return new OffsetLimitTokenFilter(new MockTokenizer(reader,
|
||||
MockTokenizer.WHITESPACE, false), 10);
|
||||
}
|
||||
}, "llenges", "llenges");
|
||||
}
|
||||
}
|
|
@ -1,28 +1,28 @@
|
|||
<html>
|
||||
<!--
|
||||
Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
contributor license agreements. See the NOTICE file distributed with
|
||||
this work for additional information regarding copyright ownership.
|
||||
The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
(the "License"); you may not use this file except in compliance with
|
||||
the License. You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software
|
||||
distributed under the License is distributed on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
See the License for the specific language governing permissions and
|
||||
limitations under the License.
|
||||
-->
|
||||
<head>
|
||||
<title>Apache Lucene Test Framework API</title>
|
||||
</head>
|
||||
<body>
|
||||
<p>
|
||||
The Lucene Test Framework is used by Lucene as the basis for its tests.
|
||||
The framework can also be used for testing third-party code that uses
|
||||
the Lucene API.
|
||||
</p>
|
||||
</body>
|
||||
</html>
|
||||
<html>
|
||||
<!--
|
||||
Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
contributor license agreements. See the NOTICE file distributed with
|
||||
this work for additional information regarding copyright ownership.
|
||||
The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
(the "License"); you may not use this file except in compliance with
|
||||
the License. You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software
|
||||
distributed under the License is distributed on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
See the License for the specific language governing permissions and
|
||||
limitations under the License.
|
||||
-->
|
||||
<head>
|
||||
<title>Apache Lucene Test Framework API</title>
|
||||
</head>
|
||||
<body>
|
||||
<p>
|
||||
The Lucene Test Framework is used by Lucene as the basis for its tests.
|
||||
The framework can also be used for testing third-party code that uses
|
||||
the Lucene API.
|
||||
</p>
|
||||
</body>
|
||||
</html>
|
||||
|
|
|
@ -1,136 +1,136 @@
|
|||
package org.apache.lucene.benchmark.byTask.feeds;
|
||||
|
||||
/**
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
import java.io.File;
|
||||
import java.io.IOException;
|
||||
import java.util.HashMap;
|
||||
import java.util.Locale;
|
||||
import java.util.Map;
|
||||
|
||||
/**
|
||||
* Parser for trec doc content, invoked on doc text excluding <DOC> and <DOCNO>
|
||||
* which are handled in TrecContentSource. Required to be stateless and hence thread safe.
|
||||
*/
|
||||
public abstract class TrecDocParser {
|
||||
|
||||
/** Types of trec parse paths, */
|
||||
public enum ParsePathType { GOV2, FBIS, FT, FR94, LATIMES }
|
||||
|
||||
/** trec parser type used for unknown extensions */
|
||||
public static final ParsePathType DEFAULT_PATH_TYPE = ParsePathType.GOV2;
|
||||
|
||||
static final Map<ParsePathType,TrecDocParser> pathType2parser = new HashMap<ParsePathType,TrecDocParser>();
|
||||
static {
|
||||
pathType2parser.put(ParsePathType.GOV2, new TrecGov2Parser());
|
||||
pathType2parser.put(ParsePathType.FBIS, new TrecFBISParser());
|
||||
pathType2parser.put(ParsePathType.FR94, new TrecFR94Parser());
|
||||
pathType2parser.put(ParsePathType.FT, new TrecFTParser());
|
||||
pathType2parser.put(ParsePathType.LATIMES, new TrecLATimesParser());
|
||||
}
|
||||
|
||||
static final Map<String,ParsePathType> pathName2Type = new HashMap<String,ParsePathType>();
|
||||
static {
|
||||
for (ParsePathType ppt : ParsePathType.values()) {
|
||||
pathName2Type.put(ppt.name().toUpperCase(Locale.ENGLISH),ppt);
|
||||
}
|
||||
}
|
||||
|
||||
/** max length of walk up from file to its ancestors when looking for a known path type */
|
||||
private static final int MAX_PATH_LENGTH = 10;
|
||||
|
||||
/**
|
||||
* Compute the path type of a file by inspecting name of file and its parents
|
||||
*/
|
||||
public static ParsePathType pathType(File f) {
|
||||
int pathLength = 0;
|
||||
while (f != null && ++pathLength < MAX_PATH_LENGTH) {
|
||||
ParsePathType ppt = pathName2Type.get(f.getName().toUpperCase(Locale.ENGLISH));
|
||||
if (ppt!=null) {
|
||||
return ppt;
|
||||
}
|
||||
f = f.getParentFile();
|
||||
}
|
||||
return DEFAULT_PATH_TYPE;
|
||||
}
|
||||
|
||||
/**
|
||||
* parse the text prepared in docBuf into a result DocData,
|
||||
* no synchronization is required.
|
||||
* @param docData reusable result
|
||||
* @param name name that should be set to the result
|
||||
* @param trecSrc calling trec content source
|
||||
* @param docBuf text to parse
|
||||
* @param pathType type of parsed file, or null if unknown - may be used by
|
||||
* parsers to alter their behavior according to the file path type.
|
||||
*/
|
||||
public abstract DocData parse(DocData docData, String name, TrecContentSource trecSrc,
|
||||
StringBuilder docBuf, ParsePathType pathType) throws IOException, InterruptedException;
|
||||
|
||||
/**
|
||||
* strip tags from <code>buf</code>: each tag is replaced by a single blank.
|
||||
* @return text obtained when stripping all tags from <code>buf</code> (Input StringBuilder is unmodified).
|
||||
*/
|
||||
public static String stripTags(StringBuilder buf, int start) {
|
||||
return stripTags(buf.substring(start),0);
|
||||
}
|
||||
|
||||
/**
|
||||
* strip tags from input.
|
||||
* @see #stripTags(StringBuilder, int)
|
||||
*/
|
||||
public static String stripTags(String buf, int start) {
|
||||
if (start>0) {
|
||||
buf = buf.substring(0);
|
||||
}
|
||||
return buf.replaceAll("<[^>]*>", " ");
|
||||
}
|
||||
|
||||
/**
|
||||
* Extract from <code>buf</code> the text of interest within specified tags
|
||||
* @param buf entire input text
|
||||
* @param startTag tag marking start of text of interest
|
||||
* @param endTag tag marking end of text of interest
|
||||
* @param maxPos if ≥ 0 sets a limit on start of text of interest
|
||||
* @return text of interest or null if not found
|
||||
*/
|
||||
public static String extract(StringBuilder buf, String startTag, String endTag, int maxPos, String noisePrefixes[]) {
|
||||
int k1 = buf.indexOf(startTag);
|
||||
if (k1>=0 && (maxPos<0 || k1<maxPos)) {
|
||||
k1 += startTag.length();
|
||||
int k2 = buf.indexOf(endTag,k1);
|
||||
if (k2>=0 && (maxPos<0 || k2<maxPos)) { // found end tag with allowed range
|
||||
if (noisePrefixes != null) {
|
||||
for (String noise : noisePrefixes) {
|
||||
int k1a = buf.indexOf(noise,k1);
|
||||
if (k1a>=0 && k1a<k2) {
|
||||
k1 = k1a + noise.length();
|
||||
}
|
||||
}
|
||||
}
|
||||
return buf.substring(k1,k2).trim();
|
||||
}
|
||||
}
|
||||
return null;
|
||||
}
|
||||
|
||||
//public static void main(String[] args) {
|
||||
// System.out.println(stripTags("is it true that<space>2<<second space>><almost last space>1<one more space>?",0));
|
||||
//}
|
||||
|
||||
}
|
||||
package org.apache.lucene.benchmark.byTask.feeds;
|
||||
|
||||
/**
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
import java.io.File;
|
||||
import java.io.IOException;
|
||||
import java.util.HashMap;
|
||||
import java.util.Locale;
|
||||
import java.util.Map;
|
||||
|
||||
/**
|
||||
* Parser for trec doc content, invoked on doc text excluding <DOC> and <DOCNO>
|
||||
* which are handled in TrecContentSource. Required to be stateless and hence thread safe.
|
||||
*/
|
||||
public abstract class TrecDocParser {
|
||||
|
||||
/** Types of trec parse paths, */
|
||||
public enum ParsePathType { GOV2, FBIS, FT, FR94, LATIMES }
|
||||
|
||||
/** trec parser type used for unknown extensions */
|
||||
public static final ParsePathType DEFAULT_PATH_TYPE = ParsePathType.GOV2;
|
||||
|
||||
static final Map<ParsePathType,TrecDocParser> pathType2parser = new HashMap<ParsePathType,TrecDocParser>();
|
||||
static {
|
||||
pathType2parser.put(ParsePathType.GOV2, new TrecGov2Parser());
|
||||
pathType2parser.put(ParsePathType.FBIS, new TrecFBISParser());
|
||||
pathType2parser.put(ParsePathType.FR94, new TrecFR94Parser());
|
||||
pathType2parser.put(ParsePathType.FT, new TrecFTParser());
|
||||
pathType2parser.put(ParsePathType.LATIMES, new TrecLATimesParser());
|
||||
}
|
||||
|
||||
static final Map<String,ParsePathType> pathName2Type = new HashMap<String,ParsePathType>();
|
||||
static {
|
||||
for (ParsePathType ppt : ParsePathType.values()) {
|
||||
pathName2Type.put(ppt.name().toUpperCase(Locale.ENGLISH),ppt);
|
||||
}
|
||||
}
|
||||
|
||||
/** max length of walk up from file to its ancestors when looking for a known path type */
|
||||
private static final int MAX_PATH_LENGTH = 10;
|
||||
|
||||
/**
|
||||
* Compute the path type of a file by inspecting name of file and its parents
|
||||
*/
|
||||
public static ParsePathType pathType(File f) {
|
||||
int pathLength = 0;
|
||||
while (f != null && ++pathLength < MAX_PATH_LENGTH) {
|
||||
ParsePathType ppt = pathName2Type.get(f.getName().toUpperCase(Locale.ENGLISH));
|
||||
if (ppt!=null) {
|
||||
return ppt;
|
||||
}
|
||||
f = f.getParentFile();
|
||||
}
|
||||
return DEFAULT_PATH_TYPE;
|
||||
}
|
||||
|
||||
/**
|
||||
* parse the text prepared in docBuf into a result DocData,
|
||||
* no synchronization is required.
|
||||
* @param docData reusable result
|
||||
* @param name name that should be set to the result
|
||||
* @param trecSrc calling trec content source
|
||||
* @param docBuf text to parse
|
||||
* @param pathType type of parsed file, or null if unknown - may be used by
|
||||
* parsers to alter their behavior according to the file path type.
|
||||
*/
|
||||
public abstract DocData parse(DocData docData, String name, TrecContentSource trecSrc,
|
||||
StringBuilder docBuf, ParsePathType pathType) throws IOException, InterruptedException;
|
||||
|
||||
/**
|
||||
* strip tags from <code>buf</code>: each tag is replaced by a single blank.
|
||||
* @return text obtained when stripping all tags from <code>buf</code> (Input StringBuilder is unmodified).
|
||||
*/
|
||||
public static String stripTags(StringBuilder buf, int start) {
|
||||
return stripTags(buf.substring(start),0);
|
||||
}
|
||||
|
||||
/**
|
||||
* strip tags from input.
|
||||
* @see #stripTags(StringBuilder, int)
|
||||
*/
|
||||
public static String stripTags(String buf, int start) {
|
||||
if (start>0) {
|
||||
buf = buf.substring(0);
|
||||
}
|
||||
return buf.replaceAll("<[^>]*>", " ");
|
||||
}
|
||||
|
||||
/**
|
||||
* Extract from <code>buf</code> the text of interest within specified tags
|
||||
* @param buf entire input text
|
||||
* @param startTag tag marking start of text of interest
|
||||
* @param endTag tag marking end of text of interest
|
||||
* @param maxPos if ≥ 0 sets a limit on start of text of interest
|
||||
* @return text of interest or null if not found
|
||||
*/
|
||||
public static String extract(StringBuilder buf, String startTag, String endTag, int maxPos, String noisePrefixes[]) {
|
||||
int k1 = buf.indexOf(startTag);
|
||||
if (k1>=0 && (maxPos<0 || k1<maxPos)) {
|
||||
k1 += startTag.length();
|
||||
int k2 = buf.indexOf(endTag,k1);
|
||||
if (k2>=0 && (maxPos<0 || k2<maxPos)) { // found end tag with allowed range
|
||||
if (noisePrefixes != null) {
|
||||
for (String noise : noisePrefixes) {
|
||||
int k1a = buf.indexOf(noise,k1);
|
||||
if (k1a>=0 && k1a<k2) {
|
||||
k1 = k1a + noise.length();
|
||||
}
|
||||
}
|
||||
}
|
||||
return buf.substring(k1,k2).trim();
|
||||
}
|
||||
}
|
||||
return null;
|
||||
}
|
||||
|
||||
//public static void main(String[] args) {
|
||||
// System.out.println(stripTags("is it true that<space>2<<second space>><almost last space>1<one more space>?",0));
|
||||
//}
|
||||
|
||||
}
|
||||
|
|
|
@ -1,65 +1,65 @@
|
|||
package org.apache.lucene.benchmark.byTask.feeds;
|
||||
|
||||
/**
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
import java.io.IOException;
|
||||
import java.util.Date;
|
||||
|
||||
/**
|
||||
* Parser for the FBIS docs in trec disks 4+5 collection format
|
||||
*/
|
||||
public class TrecFBISParser extends TrecDocParser {
|
||||
|
||||
private static final String HEADER = "<HEADER>";
|
||||
private static final String HEADER_END = "</HEADER>";
|
||||
private static final int HEADER_END_LENGTH = HEADER_END.length();
|
||||
|
||||
private static final String DATE1 = "<DATE1>";
|
||||
private static final String DATE1_END = "</DATE1>";
|
||||
|
||||
private static final String TI = "<TI>";
|
||||
private static final String TI_END = "</TI>";
|
||||
|
||||
@Override
|
||||
public DocData parse(DocData docData, String name, TrecContentSource trecSrc,
|
||||
StringBuilder docBuf, ParsePathType pathType) throws IOException, InterruptedException {
|
||||
int mark = 0; // that much is skipped
|
||||
// optionally skip some of the text, set date, title
|
||||
Date date = null;
|
||||
String title = null;
|
||||
int h1 = docBuf.indexOf(HEADER);
|
||||
if (h1>=0) {
|
||||
int h2 = docBuf.indexOf(HEADER_END,h1);
|
||||
mark = h2+HEADER_END_LENGTH;
|
||||
// date...
|
||||
String dateStr = extract(docBuf, DATE1, DATE1_END, h2, null);
|
||||
if (dateStr != null) {
|
||||
date = trecSrc.parseDate(dateStr);
|
||||
}
|
||||
// title...
|
||||
title = extract(docBuf, TI, TI_END, h2, null);
|
||||
}
|
||||
docData.clear();
|
||||
docData.setName(name);
|
||||
docData.setDate(date);
|
||||
docData.setTitle(title);
|
||||
docData.setBody(stripTags(docBuf, mark).toString());
|
||||
return docData;
|
||||
}
|
||||
|
||||
}
|
||||
package org.apache.lucene.benchmark.byTask.feeds;
|
||||
|
||||
/**
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
import java.io.IOException;
|
||||
import java.util.Date;
|
||||
|
||||
/**
|
||||
* Parser for the FBIS docs in trec disks 4+5 collection format
|
||||
*/
|
||||
public class TrecFBISParser extends TrecDocParser {
|
||||
|
||||
private static final String HEADER = "<HEADER>";
|
||||
private static final String HEADER_END = "</HEADER>";
|
||||
private static final int HEADER_END_LENGTH = HEADER_END.length();
|
||||
|
||||
private static final String DATE1 = "<DATE1>";
|
||||
private static final String DATE1_END = "</DATE1>";
|
||||
|
||||
private static final String TI = "<TI>";
|
||||
private static final String TI_END = "</TI>";
|
||||
|
||||
@Override
|
||||
public DocData parse(DocData docData, String name, TrecContentSource trecSrc,
|
||||
StringBuilder docBuf, ParsePathType pathType) throws IOException, InterruptedException {
|
||||
int mark = 0; // that much is skipped
|
||||
// optionally skip some of the text, set date, title
|
||||
Date date = null;
|
||||
String title = null;
|
||||
int h1 = docBuf.indexOf(HEADER);
|
||||
if (h1>=0) {
|
||||
int h2 = docBuf.indexOf(HEADER_END,h1);
|
||||
mark = h2+HEADER_END_LENGTH;
|
||||
// date...
|
||||
String dateStr = extract(docBuf, DATE1, DATE1_END, h2, null);
|
||||
if (dateStr != null) {
|
||||
date = trecSrc.parseDate(dateStr);
|
||||
}
|
||||
// title...
|
||||
title = extract(docBuf, TI, TI_END, h2, null);
|
||||
}
|
||||
docData.clear();
|
||||
docData.setName(name);
|
||||
docData.setDate(date);
|
||||
docData.setTitle(title);
|
||||
docData.setBody(stripTags(docBuf, mark).toString());
|
||||
return docData;
|
||||
}
|
||||
|
||||
}
|
||||
|
|
|
@ -1,66 +1,66 @@
|
|||
package org.apache.lucene.benchmark.byTask.feeds;
|
||||
|
||||
/**
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
import java.io.IOException;
|
||||
import java.util.Date;
|
||||
|
||||
/**
|
||||
* Parser for the FR94 docs in trec disks 4+5 collection format
|
||||
*/
|
||||
public class TrecFR94Parser extends TrecDocParser {
|
||||
|
||||
private static final String TEXT = "<TEXT>";
|
||||
private static final int TEXT_LENGTH = TEXT.length();
|
||||
private static final String TEXT_END = "</TEXT>";
|
||||
|
||||
private static final String DATE = "<DATE>";
|
||||
private static final String[] DATE_NOISE_PREFIXES = {
|
||||
"DATE:",
|
||||
"date:", //TODO improve date extraction for this format
|
||||
"t.c.",
|
||||
};
|
||||
private static final String DATE_END = "</DATE>";
|
||||
|
||||
//TODO can we also extract title for this format?
|
||||
|
||||
@Override
|
||||
public DocData parse(DocData docData, String name, TrecContentSource trecSrc,
|
||||
StringBuilder docBuf, ParsePathType pathType) throws IOException, InterruptedException {
|
||||
int mark = 0; // that much is skipped
|
||||
// optionally skip some of the text, set date (no title?)
|
||||
Date date = null;
|
||||
int h1 = docBuf.indexOf(TEXT);
|
||||
if (h1>=0) {
|
||||
int h2 = docBuf.indexOf(TEXT_END,h1);
|
||||
mark = h1+TEXT_LENGTH;
|
||||
// date...
|
||||
String dateStr = extract(docBuf, DATE, DATE_END, h2, DATE_NOISE_PREFIXES);
|
||||
if (dateStr != null) {
|
||||
dateStr = stripTags(dateStr,0).toString();
|
||||
date = trecSrc.parseDate(dateStr.trim());
|
||||
}
|
||||
}
|
||||
docData.clear();
|
||||
docData.setName(name);
|
||||
docData.setDate(date);
|
||||
docData.setBody(stripTags(docBuf, mark).toString());
|
||||
return docData;
|
||||
}
|
||||
|
||||
}
|
||||
package org.apache.lucene.benchmark.byTask.feeds;
|
||||
|
||||
/**
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
import java.io.IOException;
|
||||
import java.util.Date;
|
||||
|
||||
/**
|
||||
* Parser for the FR94 docs in trec disks 4+5 collection format
|
||||
*/
|
||||
public class TrecFR94Parser extends TrecDocParser {
|
||||
|
||||
private static final String TEXT = "<TEXT>";
|
||||
private static final int TEXT_LENGTH = TEXT.length();
|
||||
private static final String TEXT_END = "</TEXT>";
|
||||
|
||||
private static final String DATE = "<DATE>";
|
||||
private static final String[] DATE_NOISE_PREFIXES = {
|
||||
"DATE:",
|
||||
"date:", //TODO improve date extraction for this format
|
||||
"t.c.",
|
||||
};
|
||||
private static final String DATE_END = "</DATE>";
|
||||
|
||||
//TODO can we also extract title for this format?
|
||||
|
||||
@Override
|
||||
public DocData parse(DocData docData, String name, TrecContentSource trecSrc,
|
||||
StringBuilder docBuf, ParsePathType pathType) throws IOException, InterruptedException {
|
||||
int mark = 0; // that much is skipped
|
||||
// optionally skip some of the text, set date (no title?)
|
||||
Date date = null;
|
||||
int h1 = docBuf.indexOf(TEXT);
|
||||
if (h1>=0) {
|
||||
int h2 = docBuf.indexOf(TEXT_END,h1);
|
||||
mark = h1+TEXT_LENGTH;
|
||||
// date...
|
||||
String dateStr = extract(docBuf, DATE, DATE_END, h2, DATE_NOISE_PREFIXES);
|
||||
if (dateStr != null) {
|
||||
dateStr = stripTags(dateStr,0).toString();
|
||||
date = trecSrc.parseDate(dateStr.trim());
|
||||
}
|
||||
}
|
||||
docData.clear();
|
||||
docData.setName(name);
|
||||
docData.setDate(date);
|
||||
docData.setBody(stripTags(docBuf, mark).toString());
|
||||
return docData;
|
||||
}
|
||||
|
||||
}
|
||||
|
|
|
@ -1,57 +1,57 @@
|
|||
package org.apache.lucene.benchmark.byTask.feeds;
|
||||
|
||||
/**
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
import java.io.IOException;
|
||||
import java.util.Date;
|
||||
|
||||
/**
|
||||
* Parser for the FT docs in trec disks 4+5 collection format
|
||||
*/
|
||||
public class TrecFTParser extends TrecDocParser {
|
||||
|
||||
private static final String DATE = "<DATE>";
|
||||
private static final String DATE_END = "</DATE>";
|
||||
|
||||
private static final String HEADLINE = "<HEADLINE>";
|
||||
private static final String HEADLINE_END = "</HEADLINE>";
|
||||
|
||||
@Override
|
||||
public DocData parse(DocData docData, String name, TrecContentSource trecSrc,
|
||||
StringBuilder docBuf, ParsePathType pathType) throws IOException, InterruptedException {
|
||||
int mark = 0; // that much is skipped
|
||||
|
||||
// date...
|
||||
Date date = null;
|
||||
String dateStr = extract(docBuf, DATE, DATE_END, -1, null);
|
||||
if (dateStr != null) {
|
||||
date = trecSrc.parseDate(dateStr);
|
||||
}
|
||||
|
||||
// title...
|
||||
String title = extract(docBuf, HEADLINE, HEADLINE_END, -1, null);
|
||||
|
||||
docData.clear();
|
||||
docData.setName(name);
|
||||
docData.setDate(date);
|
||||
docData.setTitle(title);
|
||||
docData.setBody(stripTags(docBuf, mark).toString());
|
||||
return docData;
|
||||
}
|
||||
|
||||
}
|
||||
package org.apache.lucene.benchmark.byTask.feeds;
|
||||
|
||||
/**
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
import java.io.IOException;
|
||||
import java.util.Date;
|
||||
|
||||
/**
|
||||
* Parser for the FT docs in trec disks 4+5 collection format
|
||||
*/
|
||||
public class TrecFTParser extends TrecDocParser {
|
||||
|
||||
private static final String DATE = "<DATE>";
|
||||
private static final String DATE_END = "</DATE>";
|
||||
|
||||
private static final String HEADLINE = "<HEADLINE>";
|
||||
private static final String HEADLINE_END = "</HEADLINE>";
|
||||
|
||||
@Override
|
||||
public DocData parse(DocData docData, String name, TrecContentSource trecSrc,
|
||||
StringBuilder docBuf, ParsePathType pathType) throws IOException, InterruptedException {
|
||||
int mark = 0; // that much is skipped
|
||||
|
||||
// date...
|
||||
Date date = null;
|
||||
String dateStr = extract(docBuf, DATE, DATE_END, -1, null);
|
||||
if (dateStr != null) {
|
||||
date = trecSrc.parseDate(dateStr);
|
||||
}
|
||||
|
||||
// title...
|
||||
String title = extract(docBuf, HEADLINE, HEADLINE_END, -1, null);
|
||||
|
||||
docData.clear();
|
||||
docData.setName(name);
|
||||
docData.setDate(date);
|
||||
docData.setTitle(title);
|
||||
docData.setBody(stripTags(docBuf, mark).toString());
|
||||
return docData;
|
||||
}
|
||||
|
||||
}
|
||||
|
|
|
@ -1,71 +1,71 @@
|
|||
package org.apache.lucene.benchmark.byTask.feeds;
|
||||
|
||||
/**
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
import java.io.IOException;
|
||||
import java.util.Date;
|
||||
|
||||
/**
|
||||
* Parser for the LA Times docs in trec disks 4+5 collection format
|
||||
*/
|
||||
public class TrecLATimesParser extends TrecDocParser {
|
||||
|
||||
private static final String DATE = "<DATE>";
|
||||
private static final String DATE_END = "</DATE>";
|
||||
private static final String DATE_NOISE = "day,"; // anything aftre the ','
|
||||
|
||||
private static final String SUBJECT = "<SUBJECT>";
|
||||
private static final String SUBJECT_END = "</SUBJECT>";
|
||||
private static final String HEADLINE = "<HEADLINE>";
|
||||
private static final String HEADLINE_END = "</HEADLINE>";
|
||||
|
||||
@Override
|
||||
public DocData parse(DocData docData, String name, TrecContentSource trecSrc,
|
||||
StringBuilder docBuf, ParsePathType pathType) throws IOException, InterruptedException {
|
||||
int mark = 0; // that much is skipped
|
||||
|
||||
// date...
|
||||
Date date = null;
|
||||
String dateStr = extract(docBuf, DATE, DATE_END, -1, null);
|
||||
if (dateStr != null) {
|
||||
int d2a = dateStr.indexOf(DATE_NOISE);
|
||||
if (d2a > 0) {
|
||||
dateStr = dateStr.substring(0,d2a+3); // we need the "day" part
|
||||
}
|
||||
dateStr = stripTags(dateStr,0).toString();
|
||||
date = trecSrc.parseDate(dateStr.trim());
|
||||
}
|
||||
|
||||
// title... first try with SUBJECT, them with HEADLINE
|
||||
String title = extract(docBuf, SUBJECT, SUBJECT_END, -1, null);
|
||||
if (title==null) {
|
||||
title = extract(docBuf, HEADLINE, HEADLINE_END, -1, null);
|
||||
}
|
||||
if (title!=null) {
|
||||
title = stripTags(title,0).toString().trim();
|
||||
}
|
||||
|
||||
docData.clear();
|
||||
docData.setName(name);
|
||||
docData.setDate(date);
|
||||
docData.setTitle(title);
|
||||
docData.setBody(stripTags(docBuf, mark).toString());
|
||||
return docData;
|
||||
}
|
||||
|
||||
}
|
||||
package org.apache.lucene.benchmark.byTask.feeds;
|
||||
|
||||
/**
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
import java.io.IOException;
|
||||
import java.util.Date;
|
||||
|
||||
/**
|
||||
* Parser for the LA Times docs in trec disks 4+5 collection format
|
||||
*/
|
||||
public class TrecLATimesParser extends TrecDocParser {
|
||||
|
||||
private static final String DATE = "<DATE>";
|
||||
private static final String DATE_END = "</DATE>";
|
||||
private static final String DATE_NOISE = "day,"; // anything aftre the ','
|
||||
|
||||
private static final String SUBJECT = "<SUBJECT>";
|
||||
private static final String SUBJECT_END = "</SUBJECT>";
|
||||
private static final String HEADLINE = "<HEADLINE>";
|
||||
private static final String HEADLINE_END = "</HEADLINE>";
|
||||
|
||||
@Override
|
||||
public DocData parse(DocData docData, String name, TrecContentSource trecSrc,
|
||||
StringBuilder docBuf, ParsePathType pathType) throws IOException, InterruptedException {
|
||||
int mark = 0; // that much is skipped
|
||||
|
||||
// date...
|
||||
Date date = null;
|
||||
String dateStr = extract(docBuf, DATE, DATE_END, -1, null);
|
||||
if (dateStr != null) {
|
||||
int d2a = dateStr.indexOf(DATE_NOISE);
|
||||
if (d2a > 0) {
|
||||
dateStr = dateStr.substring(0,d2a+3); // we need the "day" part
|
||||
}
|
||||
dateStr = stripTags(dateStr,0).toString();
|
||||
date = trecSrc.parseDate(dateStr.trim());
|
||||
}
|
||||
|
||||
// title... first try with SUBJECT, them with HEADLINE
|
||||
String title = extract(docBuf, SUBJECT, SUBJECT_END, -1, null);
|
||||
if (title==null) {
|
||||
title = extract(docBuf, HEADLINE, HEADLINE_END, -1, null);
|
||||
}
|
||||
if (title!=null) {
|
||||
title = stripTags(title,0).toString().trim();
|
||||
}
|
||||
|
||||
docData.clear();
|
||||
docData.setName(name);
|
||||
docData.setDate(date);
|
||||
docData.setTitle(title);
|
||||
docData.setBody(stripTags(docBuf, mark).toString());
|
||||
return docData;
|
||||
}
|
||||
|
||||
}
|
||||
|
|
|
@ -1,33 +1,33 @@
|
|||
package org.apache.lucene.benchmark.byTask.feeds;
|
||||
|
||||
/**
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
import java.io.IOException;
|
||||
/**
|
||||
* Parser for trec docs which selects the parser to apply according
|
||||
* to the source files path, defaulting to {@link TrecGov2Parser}.
|
||||
*/
|
||||
public class TrecParserByPath extends TrecDocParser {
|
||||
|
||||
@Override
|
||||
public DocData parse(DocData docData, String name, TrecContentSource trecSrc,
|
||||
StringBuilder docBuf, ParsePathType pathType) throws IOException, InterruptedException {
|
||||
return pathType2parser.get(pathType).parse(docData, name, trecSrc, docBuf, pathType);
|
||||
}
|
||||
|
||||
}
|
||||
package org.apache.lucene.benchmark.byTask.feeds;
|
||||
|
||||
/**
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
import java.io.IOException;
|
||||
/**
|
||||
* Parser for trec docs which selects the parser to apply according
|
||||
* to the source files path, defaulting to {@link TrecGov2Parser}.
|
||||
*/
|
||||
public class TrecParserByPath extends TrecDocParser {
|
||||
|
||||
@Override
|
||||
public DocData parse(DocData docData, String name, TrecContentSource trecSrc,
|
||||
StringBuilder docBuf, ParsePathType pathType) throws IOException, InterruptedException {
|
||||
return pathType2parser.get(pathType).parse(docData, name, trecSrc, docBuf, pathType);
|
||||
}
|
||||
|
||||
}
|
||||
|
|
|
@ -1,37 +1,37 @@
|
|||
package org.apache.lucene.benchmark.byTask.tasks.alt;
|
||||
|
||||
/**
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
import org.apache.lucene.benchmark.byTask.PerfRunData;
|
||||
import org.apache.lucene.benchmark.byTask.tasks.PerfTask;
|
||||
|
||||
/**
|
||||
* {@link PerfTask} which does nothing, but is in a different package
|
||||
*/
|
||||
public class AltTestTask extends PerfTask {
|
||||
|
||||
public AltTestTask(PerfRunData runData) {
|
||||
super(runData);
|
||||
}
|
||||
|
||||
@Override
|
||||
public int doLogic() throws Exception {
|
||||
return 0;
|
||||
}
|
||||
|
||||
}
|
||||
package org.apache.lucene.benchmark.byTask.tasks.alt;
|
||||
|
||||
/**
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
import org.apache.lucene.benchmark.byTask.PerfRunData;
|
||||
import org.apache.lucene.benchmark.byTask.tasks.PerfTask;
|
||||
|
||||
/**
|
||||
* {@link PerfTask} which does nothing, but is in a different package
|
||||
*/
|
||||
public class AltTestTask extends PerfTask {
|
||||
|
||||
public AltTestTask(PerfRunData runData) {
|
||||
super(runData);
|
||||
}
|
||||
|
||||
@Override
|
||||
public int doLogic() throws Exception {
|
||||
return 0;
|
||||
}
|
||||
|
||||
}
|
||||
|
|
|
@ -1 +1,77 @@
|
|||
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.solr.update;
import java.util.HashMap;
import org.apache.solr.common.params.MapSolrParams;
import org.apache.solr.common.params.UpdateParams;
import org.apache.solr.core.*;
import org.apache.solr.handler.XmlUpdateRequestHandler;
import org.apache.solr.request.SolrQueryRequestBase;
import org.apache.solr.response.SolrQueryResponse;
import org.apache.solr.util.AbstractSolrTestCase;
public class UpdateParamsTest extends AbstractSolrTestCase {
@Override
public String getSchemaFile() { return "schema.xml"; }
@Override
public String getSolrConfigFile() { return "solrconfig.xml"; }
/**
* Tests that both update.chain and update.processor works
* NOTE: This test will fail when support for update.processor is removed and should then be removed
*/
public void testUpdateProcessorParamDeprecation() throws Exception {
SolrCore core = h.getCore();
XmlUpdateRequestHandler handler = new XmlUpdateRequestHandler();
handler.init( null );
MapSolrParams params = new MapSolrParams( new HashMap<String, String>() );
params.getMap().put(UpdateParams.UPDATE_CHAIN_DEPRECATED, "nonexistant");
// Add a single document
SolrQueryResponse rsp = new SolrQueryResponse();
SolrQueryRequestBase req = new SolrQueryRequestBase( core, params ) {};
// First check that the old param behaves as it should
try {
handler.handleRequestBody(req, rsp);
assertFalse("Faulty update.processor parameter (deprecated but should work) not causing an error - i.e. it is not detected", true);
} catch (Exception e) {
assertEquals("Got wrong exception while testing update.chain", e.getMessage(), "unknown UpdateRequestProcessorChain: nonexistant");
}
// Then check that the new param behaves correctly
params.getMap().remove(UpdateParams.UPDATE_CHAIN_DEPRECATED);
params.getMap().put(UpdateParams.UPDATE_CHAIN, "nonexistant");
req.setParams(params);
try {
handler.handleRequestBody(req, rsp);
assertFalse("Faulty update.chain parameter not causing an error - i.e. it is not detected", true);
} catch (Exception e) {
assertEquals("Got wrong exception while testing update.chain", e.getMessage(), "unknown UpdateRequestProcessorChain: nonexistant");
}
}
}
|
||||
/**
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
package org.apache.solr.update;
|
||||
|
||||
import java.util.HashMap;
|
||||
|
||||
import org.apache.solr.common.params.MapSolrParams;
|
||||
import org.apache.solr.common.params.UpdateParams;
|
||||
import org.apache.solr.core.*;
|
||||
import org.apache.solr.handler.XmlUpdateRequestHandler;
|
||||
import org.apache.solr.request.SolrQueryRequestBase;
|
||||
import org.apache.solr.response.SolrQueryResponse;
|
||||
import org.apache.solr.util.AbstractSolrTestCase;
|
||||
|
||||
|
||||
|
||||
public class UpdateParamsTest extends AbstractSolrTestCase {
|
||||
|
||||
@Override
|
||||
public String getSchemaFile() { return "schema.xml"; }
|
||||
@Override
|
||||
public String getSolrConfigFile() { return "solrconfig.xml"; }
|
||||
|
||||
/**
|
||||
* Tests that both update.chain and update.processor works
|
||||
* NOTE: This test will fail when support for update.processor is removed and should then be removed
|
||||
*/
|
||||
public void testUpdateProcessorParamDeprecation() throws Exception {
|
||||
SolrCore core = h.getCore();
|
||||
|
||||
XmlUpdateRequestHandler handler = new XmlUpdateRequestHandler();
|
||||
handler.init( null );
|
||||
|
||||
MapSolrParams params = new MapSolrParams( new HashMap<String, String>() );
|
||||
params.getMap().put(UpdateParams.UPDATE_CHAIN_DEPRECATED, "nonexistant");
|
||||
|
||||
// Add a single document
|
||||
SolrQueryResponse rsp = new SolrQueryResponse();
|
||||
SolrQueryRequestBase req = new SolrQueryRequestBase( core, params ) {};
|
||||
|
||||
// First check that the old param behaves as it should
|
||||
try {
|
||||
handler.handleRequestBody(req, rsp);
|
||||
assertFalse("Faulty update.processor parameter (deprecated but should work) not causing an error - i.e. it is not detected", true);
|
||||
} catch (Exception e) {
|
||||
assertEquals("Got wrong exception while testing update.chain", e.getMessage(), "unknown UpdateRequestProcessorChain: nonexistant");
|
||||
}
|
||||
|
||||
// Then check that the new param behaves correctly
|
||||
params.getMap().remove(UpdateParams.UPDATE_CHAIN_DEPRECATED);
|
||||
params.getMap().put(UpdateParams.UPDATE_CHAIN, "nonexistant");
|
||||
req.setParams(params);
|
||||
try {
|
||||
handler.handleRequestBody(req, rsp);
|
||||
assertFalse("Faulty update.chain parameter not causing an error - i.e. it is not detected", true);
|
||||
} catch (Exception e) {
|
||||
assertEquals("Got wrong exception while testing update.chain", e.getMessage(), "unknown UpdateRequestProcessorChain: nonexistant");
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
}
|
||||
|
|
Loading…
Reference in New Issue