Merged /lucene/dev/trunk:r1440838-1441216

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/branches/lucene4547@1441220 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Robert Muir 2013-01-31 22:09:07 +00:00
commit 8dd6108432
16 changed files with 698 additions and 118 deletions

View File

@ -123,6 +123,9 @@ Bug Fixes
* LUCENE-4732: Fixed TermsEnum.seekCeil/seekExact on term vectors.
(Adrien Grand, Robert Muir)
* LUCENE-4739: Fixed bugs that prevented FSTs more than ~1.1GB from
being saved and loaded (Adrien Grand, Mike McCandless)
======================= Lucene 4.1.0 =======================
Changes in backwards compatibility policy

View File

@ -46,7 +46,7 @@ class BytesStore extends DataOutput {
}
/** Pulls bytes from the provided IndexInput. */
public BytesStore(DataInput in, int numBytes, int maxBlockSize) throws IOException {
public BytesStore(DataInput in, long numBytes, int maxBlockSize) throws IOException {
int blockSize = 2;
int blockBits = 1;
while(blockSize < numBytes && blockSize < maxBlockSize) {
@ -56,9 +56,9 @@ class BytesStore extends DataOutput {
this.blockBits = blockBits;
this.blockSize = blockSize;
this.blockMask = blockSize-1;
int left = numBytes;
long left = numBytes;
while(left > 0) {
final int chunk = Math.min(blockSize, left);
final int chunk = (int) Math.min(blockSize, left);
byte[] block = new byte[chunk];
in.readBytes(block, 0, block.length);
blocks.add(block);

View File

@ -27,11 +27,6 @@ import java.io.InputStream;
import java.io.OutputStream;
import java.util.HashMap;
import java.util.Map;
/*
import java.io.Writer;
import java.io.OutputStreamWriter;
import java.io.FileOutputStream;
*/
import org.apache.lucene.codecs.CodecUtil;
import org.apache.lucene.store.ByteArrayDataOutput;
@ -41,12 +36,15 @@ import org.apache.lucene.store.InputStreamDataInput;
import org.apache.lucene.store.OutputStreamDataOutput;
import org.apache.lucene.store.RAMOutputStream;
import org.apache.lucene.util.ArrayUtil;
import org.apache.lucene.util.Constants;
import org.apache.lucene.util.IOUtils;
import org.apache.lucene.util.IntsRef;
import org.apache.lucene.util.PriorityQueue;
import org.apache.lucene.util.fst.Builder.UnCompiledNode;
import org.apache.lucene.util.packed.GrowableWriter;
import org.apache.lucene.util.packed.PackedInts;
//import java.io.Writer;
//import java.io.OutputStreamWriter;
// TODO: break this into WritableFST and ReadOnlyFST.. then
// we can have subclasses of ReadOnlyFST to handle the
@ -276,7 +274,6 @@ public final class FST<T> {
this.outputs = outputs;
this.allowArrayArcs = allowArrayArcs;
version = VERSION_CURRENT;
// 32 KB blocks:
bytes = new BytesStore(bytesPageBits);
// pad: ensure no node gets address 0 which is reserved to mean
// the stop state w/ no arcs
@ -295,9 +292,22 @@ public final class FST<T> {
nodeRefToAddress = null;
}
public static final int DEFAULT_MAX_BLOCK_BITS = Constants.JRE_IS_64BIT ? 30 : 28;
/** Load a previously saved FST. */
public FST(DataInput in, Outputs<T> outputs) throws IOException {
this(in, outputs, DEFAULT_MAX_BLOCK_BITS);
}
/** Load a previously saved FST; maxBlockBits allows you to
* control the size of the byte[] pages used to hold the FST bytes. */
public FST(DataInput in, Outputs<T> outputs, int maxBlockBits) throws IOException {
this.outputs = outputs;
if (maxBlockBits < 1 || maxBlockBits > 30) {
throw new IllegalArgumentException("maxBlockBits should be 1 .. 30; got " + maxBlockBits);
}
// NOTE: only reads most recent format; we don't have
// back-compat promise for FSTs (they are experimental):
version = CodecUtil.checkHeader(in, FILE_FORMAT_NAME, VERSION_PACKED, VERSION_VINT_TARGET);
@ -345,13 +355,13 @@ public final class FST<T> {
} else {
nodeRefToAddress = null;
}
startNode = in.readVInt();
startNode = in.readVLong();
nodeCount = in.readVLong();
arcCount = in.readVLong();
arcWithOutputCount = in.readVLong();
int numBytes = in.readVInt();
bytes = new BytesStore(in, numBytes, Integer.MAX_VALUE);
long numBytes = in.readVLong();
bytes = new BytesStore(in, numBytes, 1<<maxBlockBits);
NO_OUTPUT = outputs.getNoOutput();

View File

@ -153,7 +153,7 @@ class Packed64 extends PackedInts.MutableImpl {
// bulk get
assert index % decoder.longValueCount() == 0;
int blockIndex = (int) ((long) index * bitsPerValue) >>> BLOCK_BITS;
int blockIndex = (int) (((long) index * bitsPerValue) >>> BLOCK_BITS);
assert (((long)index * bitsPerValue) & MOD_MASK) == 0;
final int iterations = len / decoder.longValueCount();
decoder.decode(blocks, blockIndex, arr, off, iterations);
@ -217,7 +217,7 @@ class Packed64 extends PackedInts.MutableImpl {
// bulk set
assert index % encoder.longValueCount() == 0;
int blockIndex = (int) ((long) index * bitsPerValue) >>> BLOCK_BITS;
int blockIndex = (int) (((long) index * bitsPerValue) >>> BLOCK_BITS);
assert (((long)index * bitsPerValue) & MOD_MASK) == 0;
final int iterations = len / encoder.longValueCount();
encoder.encode(arr, off, blocks, blockIndex, iterations);

View File

@ -1218,7 +1218,9 @@ public class PackedInts {
}
while (remaining > 0) {
final int written = dest.set(destPos, buf, 0, remaining);
destPos += written;
remaining -= written;
System.arraycopy(buf, written, buf, 0, remaining);
}
}
}

View File

@ -20,10 +20,16 @@ package org.apache.lucene.util.fst;
import java.util.Arrays;
import java.util.Random;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.IOContext;
import org.apache.lucene.store.IndexInput;
import org.apache.lucene.store.IndexOutput;
import org.apache.lucene.store.MMapDirectory;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.IntsRef;
import org.apache.lucene.util.LuceneTestCase;
import org.apache.lucene.util.TimeUnits;
import org.apache.lucene.util._TestUtil;
import org.apache.lucene.util.packed.PackedInts;
import org.junit.Ignore;
import com.carrotsearch.randomizedtesting.annotations.TimeoutSuite;
@ -39,6 +45,8 @@ public class Test2BFST extends LuceneTestCase {
IntsRef input = new IntsRef(ints, 0, ints.length);
long seed = random().nextLong();
Directory dir = new MMapDirectory(_TestUtil.getTempDir("2BFST"));
for(int doPackIter=0;doPackIter<2;doPackIter++) {
boolean doPack = doPackIter == 1;
@ -72,42 +80,56 @@ public class Test2BFST extends LuceneTestCase {
FST<Object> fst = b.finish();
System.out.println("\nTEST: now verify [fst size=" + fst.sizeInBytes() + "; nodeCount=" + fst.getNodeCount() + "; arcCount=" + fst.getArcCount() + "]");
for(int verify=0;verify<2;verify++) {
System.out.println("\nTEST: now verify [fst size=" + fst.sizeInBytes() + "; nodeCount=" + fst.getNodeCount() + "; arcCount=" + fst.getArcCount() + "]");
Arrays.fill(ints2, 0);
r = new Random(seed);
Arrays.fill(ints2, 0);
r = new Random(seed);
for(int i=0;i<count;i++) {
if (i % 1000000 == 0) {
System.out.println(i + "...: ");
for(int i=0;i<count;i++) {
if (i % 1000000 == 0) {
System.out.println(i + "...: ");
}
for(int j=10;j<ints2.length;j++) {
ints2[j] = r.nextInt(256);
}
assertEquals(NO_OUTPUT, Util.get(fst, input2));
nextInput(r, ints2);
}
for(int j=10;j<ints2.length;j++) {
ints2[j] = r.nextInt(256);
System.out.println("\nTEST: enum all input/outputs");
IntsRefFSTEnum<Object> fstEnum = new IntsRefFSTEnum<Object>(fst);
Arrays.fill(ints2, 0);
r = new Random(seed);
int upto = 0;
while(true) {
IntsRefFSTEnum.InputOutput<Object> pair = fstEnum.next();
if (pair == null) {
break;
}
for(int j=10;j<ints2.length;j++) {
ints2[j] = r.nextInt(256);
}
assertEquals(input2, pair.input);
assertEquals(NO_OUTPUT, pair.output);
upto++;
nextInput(r, ints2);
}
assertEquals(count, upto);
if (verify == 0) {
System.out.println("\nTEST: save/load FST and re-verify");
IndexOutput out = dir.createOutput("fst", IOContext.DEFAULT);
fst.save(out);
out.close();
IndexInput in = dir.openInput("fst", IOContext.DEFAULT);
fst = new FST<Object>(in, outputs);
in.close();
} else {
dir.deleteFile("fst");
}
assertEquals(NO_OUTPUT, Util.get(fst, input2));
nextInput(r, ints2);
}
System.out.println("\nTEST: enum all input/outputs");
IntsRefFSTEnum<Object> fstEnum = new IntsRefFSTEnum<Object>(fst);
Arrays.fill(ints2, 0);
r = new Random(seed);
int upto = 0;
while(true) {
IntsRefFSTEnum.InputOutput<Object> pair = fstEnum.next();
if (pair == null) {
break;
}
for(int j=10;j<ints2.length;j++) {
ints2[j] = r.nextInt(256);
}
assertEquals(input2, pair.input);
assertEquals(NO_OUTPUT, pair.output);
upto++;
nextInput(r, ints2);
}
assertEquals(count, upto);
}
// Build FST w/ ByteSequenceOutputs and stop when FST
@ -138,39 +160,53 @@ public class Test2BFST extends LuceneTestCase {
}
FST<BytesRef> fst = b.finish();
for(int verify=0;verify<2;verify++) {
System.out.println("\nTEST: now verify [fst size=" + fst.sizeInBytes() + "; nodeCount=" + fst.getNodeCount() + "; arcCount=" + fst.getArcCount() + "]");
System.out.println("\nTEST: now verify [fst size=" + fst.sizeInBytes() + "; nodeCount=" + fst.getNodeCount() + "; arcCount=" + fst.getArcCount() + "]");
r = new Random(seed);
Arrays.fill(ints, 0);
r = new Random(seed);
Arrays.fill(ints, 0);
for(int i=0;i<count;i++) {
if (i % 1000000 == 0) {
System.out.println(i + "...: ");
for(int i=0;i<count;i++) {
if (i % 1000000 == 0) {
System.out.println(i + "...: ");
}
r.nextBytes(outputBytes);
assertEquals(output, Util.get(fst, input));
nextInput(r, ints);
}
r.nextBytes(outputBytes);
assertEquals(output, Util.get(fst, input));
nextInput(r, ints);
}
System.out.println("\nTEST: enum all input/outputs");
IntsRefFSTEnum<BytesRef> fstEnum = new IntsRefFSTEnum<BytesRef>(fst);
System.out.println("\nTEST: enum all input/outputs");
IntsRefFSTEnum<BytesRef> fstEnum = new IntsRefFSTEnum<BytesRef>(fst);
Arrays.fill(ints, 0);
r = new Random(seed);
int upto = 0;
while(true) {
IntsRefFSTEnum.InputOutput<BytesRef> pair = fstEnum.next();
if (pair == null) {
break;
Arrays.fill(ints, 0);
r = new Random(seed);
int upto = 0;
while(true) {
IntsRefFSTEnum.InputOutput<BytesRef> pair = fstEnum.next();
if (pair == null) {
break;
}
assertEquals(input, pair.input);
r.nextBytes(outputBytes);
assertEquals(output, pair.output);
upto++;
nextInput(r, ints);
}
assertEquals(count, upto);
if (verify == 0) {
System.out.println("\nTEST: save/load FST and re-verify");
IndexOutput out = dir.createOutput("fst", IOContext.DEFAULT);
fst.save(out);
out.close();
IndexInput in = dir.openInput("fst", IOContext.DEFAULT);
fst = new FST<BytesRef>(in, outputs);
in.close();
} else {
dir.deleteFile("fst");
}
assertEquals(input, pair.input);
r.nextBytes(outputBytes);
assertEquals(output, pair.output);
upto++;
nextInput(r, ints);
}
assertEquals(count, upto);
}
// Build FST w/ PositiveIntOutputs and stop when FST
@ -202,46 +238,62 @@ public class Test2BFST extends LuceneTestCase {
FST<Long> fst = b.finish();
System.out.println("\nTEST: now verify [fst size=" + fst.sizeInBytes() + "; nodeCount=" + fst.getNodeCount() + "; arcCount=" + fst.getArcCount() + "]");
for(int verify=0;verify<2;verify++) {
Arrays.fill(ints, 0);
System.out.println("\nTEST: now verify [fst size=" + fst.sizeInBytes() + "; nodeCount=" + fst.getNodeCount() + "; arcCount=" + fst.getArcCount() + "]");
output = 1;
r = new Random(seed);
for(int i=0;i<count;i++) {
if (i % 1000000 == 0) {
System.out.println(i + "...: ");
Arrays.fill(ints, 0);
output = 1;
r = new Random(seed);
for(int i=0;i<count;i++) {
if (i % 1000000 == 0) {
System.out.println(i + "...: ");
}
// forward lookup:
assertEquals(output, Util.get(fst, input).longValue());
// reverse lookup:
assertEquals(input, Util.getByOutput(fst, output));
output += 1 + r.nextInt(10);
nextInput(r, ints);
}
// forward lookup:
assertEquals(output, Util.get(fst, input).longValue());
// reverse lookup:
assertEquals(input, Util.getByOutput(fst, output));
output += 1 + r.nextInt(10);
nextInput(r, ints);
}
System.out.println("\nTEST: enum all input/outputs");
IntsRefFSTEnum<Long> fstEnum = new IntsRefFSTEnum<Long>(fst);
System.out.println("\nTEST: enum all input/outputs");
IntsRefFSTEnum<Long> fstEnum = new IntsRefFSTEnum<Long>(fst);
Arrays.fill(ints, 0);
r = new Random(seed);
int upto = 0;
output = 1;
while(true) {
IntsRefFSTEnum.InputOutput<Long> pair = fstEnum.next();
if (pair == null) {
break;
Arrays.fill(ints, 0);
r = new Random(seed);
int upto = 0;
output = 1;
while(true) {
IntsRefFSTEnum.InputOutput<Long> pair = fstEnum.next();
if (pair == null) {
break;
}
assertEquals(input, pair.input);
assertEquals(output, pair.output.longValue());
output += 1 + r.nextInt(10);
upto++;
nextInput(r, ints);
}
assertEquals(count, upto);
if (verify == 0) {
System.out.println("\nTEST: save/load FST and re-verify");
IndexOutput out = dir.createOutput("fst", IOContext.DEFAULT);
fst.save(out);
out.close();
IndexInput in = dir.openInput("fst", IOContext.DEFAULT);
fst = new FST<Long>(in, outputs);
in.close();
} else {
dir.deleteFile("fst");
}
assertEquals(input, pair.input);
assertEquals(output, pair.output.longValue());
output += 1 + r.nextInt(10);
upto++;
nextInput(r, ints);
}
assertEquals(count, upto);
}
}
dir.close();
}
private void nextInput(Random r, int[] ints) {

View File

@ -22,6 +22,7 @@ import org.junit.Assert;
import org.junit.Test;
import org.junit.runner.JUnitCore;
import org.junit.runner.Result;
import org.junit.runner.notification.Failure;
public class TestFailIfDirectoryNotClosed extends WithNestedTests {
public TestFailIfDirectoryNotClosed() {
@ -38,6 +39,10 @@ public class TestFailIfDirectoryNotClosed extends WithNestedTests {
// Runs the nested suite and expects exactly one failure: the suite-scope
// directory-leak check ("Resource in scope SUITE failed to close").
@Test
public void testFailIfDirectoryNotClosed() {
Result r = JUnitCore.runClasses(Nested1.class);
// Echo every failure to sysout so a mismatch in the assertions below is debuggable.
for (Failure f : r.getFailures()) {
System.out.println("Failure: " + f);
}
Assert.assertEquals(1, r.getFailureCount());
Assert.assertTrue(r.getFailures().get(0).toString().contains("Resource in scope SUITE failed to close"));
}
}

View File

@ -20,15 +20,20 @@ package org.apache.lucene.util.junitcompat;
import java.io.ByteArrayOutputStream;
import java.io.PrintStream;
import java.io.UnsupportedEncodingException;
import java.util.List;
import org.apache.lucene.util.LuceneTestCase;
import org.apache.lucene.util.TestRuleIgnoreTestSuites;
import org.apache.lucene.util.TestRuleMarkFailure;
import org.junit.After;
import org.junit.Assert;
import org.junit.Before;
import org.junit.Rule;
import org.junit.rules.RuleChain;
import org.junit.rules.TestRule;
import com.carrotsearch.randomizedtesting.rules.SystemPropertiesRestoreRule;
import com.carrotsearch.randomizedtesting.rules.TestRuleAdapter;
/**
* An abstract test class that prepares nested test classes to run.
@ -43,7 +48,6 @@ import com.carrotsearch.randomizedtesting.rules.SystemPropertiesRestoreRule;
* cause havoc (static fields).
*/
public abstract class WithNestedTests {
public static abstract class AbstractNestedTest extends LuceneTestCase
implements TestRuleIgnoreTestSuites.NestedTestSuite {
protected static boolean isRunningNested() {
@ -66,8 +70,23 @@ public abstract class WithNestedTests {
* Restore properties after test.
*/
@Rule
public SystemPropertiesRestoreRule restoreProperties = new SystemPropertiesRestoreRule();
public final TestRule rules;
{
final TestRuleMarkFailure marker = new TestRuleMarkFailure();
rules = RuleChain
.outerRule(new SystemPropertiesRestoreRule())
.around(new TestRuleAdapter() {
@Override
protected void afterAlways(List<Throwable> errors) throws Throwable {
if (marker.hadFailures() && suppressOutputStreams) {
System.out.println("sysout from nested test: " + getSysOut() + "\n");
System.out.println("syserr from nested test: " + getSysErr());
}
}
})
.around(marker);
}
@Before
public final void before() {
if (suppressOutputStreams) {

View File

@ -23,8 +23,20 @@ import com.carrotsearch.randomizedtesting.ThreadFilter;
* Last minute patches.
*/
public class QuickPatchThreadsFilter implements ThreadFilter {

  // True when running on the IBM J9 VM, detected from the java.vm.info property.
  static final boolean isJ9 = System.getProperty("java.vm.info", "<?>").contains("IBM J9");

  /**
   * Rejects (ignores) J9's internal Timer thread, which that VM leaves
   * running and would otherwise trip the leaked-thread checks (LUCENE-4736).
   */
  @Override
  public boolean reject(Thread t) {
    if (!isJ9) {
      return false;
    }
    final StackTraceElement[] frames = t.getStackTrace();
    if (frames.length == 0) {
      return false;
    }
    final StackTraceElement bottomFrame = frames[frames.length - 1];
    return bottomFrame.getClassName().equals("java.util.Timer$TimerImpl");
  }
}

View File

@ -66,6 +66,8 @@ New Features
* SOLR-4043: Add ability to get success/failure responses from Collections API.
(Raintung Li, Mark Miller)
* SOLR-2827: RegexpBoost Update Processor (janhoy)
Bug Fixes
----------------------

View File

@ -0,0 +1,211 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.solr.update.processor;
import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.nio.charset.Charset;
import java.util.ArrayList;
import java.util.List;
import java.util.Map;
import java.util.regex.Pattern;
import org.apache.commons.io.IOUtils;
import org.apache.solr.common.SolrInputDocument;
import org.apache.solr.common.params.SolrParams;
import org.apache.solr.request.SolrQueryRequest;
import org.apache.solr.response.SolrQueryResponse;
import org.apache.solr.update.AddUpdateCommand;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
/**
* A processor which will match content of "inputField" against regular expressions
* found in "boostFilename", and if it matches will return the corresponding boost
* value from the file and output this to "boostField" as a double value.
* If more than one pattern matches, the boosts from each are multiplied.
* <p>
* A typical use case may be to match a URL against patterns to boost or deboost
* web documents based on the URL itself:
* <pre>
* # Format of each line: &lt;pattern&gt;&lt;TAB&gt;&lt;boost&gt;
* # Example:
* https?://my.domain.com/temp.* 0.2
* </pre>
* <p>
* Both inputField, boostField and boostFilename are mandatory parameters.
*/
public class RegexpBoostProcessor extends UpdateRequestProcessor {

  protected static final String INPUT_FIELD_PARAM = "inputField";
  protected static final String BOOST_FIELD_PARAM = "boostField";
  protected static final String BOOST_FILENAME_PARAM = "boostFilename";
  private static final String DEFAULT_INPUT_FIELDNAME = "url";
  private static final String DEFAULT_BOOST_FIELDNAME = "urlboost";
  private static final String BOOST_ENTRIES_CACHE_KEY = "boost-entries";
  private static final Logger log = LoggerFactory.getLogger(RegexpBoostProcessor.class);

  private boolean enabled = true;
  private String inputFieldname = DEFAULT_INPUT_FIELDNAME;
  private String boostFieldname = DEFAULT_BOOST_FIELDNAME;
  private String boostFilename;
  private List<BoostEntry> boostEntries = new ArrayList<BoostEntry>();

  /**
   * Creates a processor reading its rules from {@code boostFilename}.
   * The parsed rule list is stored in {@code sharedObjectCache} (shared by all
   * processors created by the same factory), so the file is read and the
   * patterns compiled only once per update chain.
   */
  RegexpBoostProcessor(SolrParams parameters,
                       SolrQueryRequest request,
                       SolrQueryResponse response,
                       UpdateRequestProcessor nextProcessor,
                       final Map<Object, Object> sharedObjectCache) {
    super(nextProcessor);
    this.initParameters(parameters);

    if (this.boostFilename == null) {
      log.warn("Null boost filename. Disabling processor.");
      setEnabled(false);
    }

    if (!isEnabled()) {
      return;
    }

    try {
      synchronized (sharedObjectCache) {
        // Cache stores untyped values; the only writer of this key is this
        // constructor, so the cast is safe.
        @SuppressWarnings("unchecked")
        List<BoostEntry> cachedBoostEntries =
            (List<BoostEntry>) sharedObjectCache.get(BOOST_ENTRIES_CACHE_KEY);

        if (cachedBoostEntries == null) {
          log.debug("No pre-cached boost entry list found, initializing new");
          InputStream is = request.getCore().getResourceLoader().openResource(boostFilename);
          cachedBoostEntries = initBoostEntries(is);
          sharedObjectCache.put(BOOST_ENTRIES_CACHE_KEY, cachedBoostEntries);
        } else {
          if (log.isDebugEnabled()) {
            log.debug("Using cached boost entry list with " + cachedBoostEntries.size() + " elements.");
          }
        }

        this.boostEntries = cachedBoostEntries;
      }
    } catch (IOException ioe) {
      // Best effort: an unreadable rules file leaves the processor with an
      // empty rule list rather than failing the whole update chain.
      log.warn("IOException while initializing boost entries from file " + this.boostFilename, ioe);
    }
  }

  /** Reads configuration; missing params fall back to the documented defaults. */
  private void initParameters(SolrParams parameters) {
    if (parameters != null) {
      this.setEnabled(parameters.getBool("enabled", true));
      this.inputFieldname = parameters.get(INPUT_FIELD_PARAM, DEFAULT_INPUT_FIELDNAME);
      this.boostFieldname = parameters.get(BOOST_FIELD_PARAM, DEFAULT_BOOST_FIELDNAME);
      this.boostFilename = parameters.get(BOOST_FILENAME_PARAM);
    }
  }

  /**
   * Parses the rules file: one {@code <pattern><whitespace><boost>} pair per
   * line, {@code #} comments and blank lines skipped, malformed lines logged
   * and ignored. Returns the compiled entries.
   */
  private List<BoostEntry> initBoostEntries(InputStream is) throws IOException {
    List<BoostEntry> newBoostEntries = new ArrayList<BoostEntry>();

    BufferedReader reader = new BufferedReader(new InputStreamReader(is, Charset.forName("UTF-8")));
    try {
      String line = null;
      while ((line = reader.readLine()) != null) {
        // Remove comments (trailing and whole-line)
        line = line.replaceAll("\\s+#.*$", "");
        line = line.replaceAll("^#.*$", "");

        // Skip empty lines or comment lines
        if (line.trim().length() == 0) {
          continue;
        }

        String[] fields = line.split("\\s+");
        if (fields.length == 2) {
          String regexp = fields[0];
          String boost = fields[1];
          newBoostEntries.add(new BoostEntry(Pattern.compile(regexp), Double.parseDouble(boost)));
          log.debug("Read regexp " + regexp + " with boost " + boost);
        } else {
          log.warn("Malformed config input line: " + line + " (expected 2 fields, got " + fields.length + " fields). Skipping entry.");
          continue;
        }
      }
    } finally {
      IOUtils.closeQuietly(reader);
    }

    return newBoostEntries;
  }

  @Override
  public void processAdd(AddUpdateCommand command) throws IOException {
    if (isEnabled()) {
      processBoost(command);
    }
    super.processAdd(command);
  }

  /**
   * Matches the document's input field against every rule; boosts of all
   * matching rules are multiplied together and written to the boost field.
   * Documents without the input field are left untouched.
   */
  public void processBoost(AddUpdateCommand command) {
    SolrInputDocument document = command.getSolrInputDocument();
    if (document.containsKey(inputFieldname)) {
      String value = (String) document.getFieldValue(inputFieldname);
      double boost = 1.0; // was 1.0f: the field is a double, use a double literal
      for (BoostEntry boostEntry : boostEntries) {
        if (boostEntry.getPattern().matcher(value).matches()) {
          if (log.isDebugEnabled()) {
            log.debug("Pattern match " + boostEntry.getPattern().pattern() + " for " + value);
          }
          // Scale by 1000 before multiplying and divide back out so that e.g.
          // 0.1 * 1.5 yields exactly 0.15 instead of 0.15000000000000002.
          boost = (boostEntry.getBoost() * 1000) * (boost * 1000) / 1000000;
        }
      }
      document.setField(boostFieldname, boost);

      if (log.isDebugEnabled()) {
        log.debug("Value " + boost + ", applied to field " + boostFieldname);
      }
    }
  }

  public boolean isEnabled() {
    return enabled;
  }

  public void setEnabled(boolean enabled) {
    this.enabled = enabled;
  }

  /** Immutable (compiled pattern, boost) pair read from the rules file. */
  private static class BoostEntry {

    private final Pattern pattern;
    private final double boost;

    public BoostEntry(Pattern pattern, double d) {
      this.pattern = pattern;
      this.boost = d;
    }

    public Pattern getPattern() {
      return pattern;
    }

    public double getBoost() {
      return boost;
    }
  }
}

View File

@ -0,0 +1,52 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.solr.update.processor;
import java.util.HashMap;
import java.util.Map;
import org.apache.solr.common.params.SolrParams;
import org.apache.solr.common.util.NamedList;
import org.apache.solr.request.SolrQueryRequest;
import org.apache.solr.response.SolrQueryResponse;
/**
* Factory which creates RegexBoostProcessors
* <p>
* The factory initializes a shared object cache which is passed to the processor
* and this way reduces rules file parsing to the first time the UpdateChain
* is initialized.
*/
public class RegexpBoostProcessorFactory extends UpdateRequestProcessorFactory {

  // Handed to every processor instance so the rules file is parsed only once.
  private final Map<Object, Object> sharedObjectCache = new HashMap<Object, Object>();
  private SolrParams params;

  /** Captures the configured parameters; a null arg list leaves params unset. */
  @Override
  public void init(@SuppressWarnings("rawtypes") final NamedList args) {
    if (args == null) {
      return;
    }
    this.params = SolrParams.toSolrParams(args);
  }

  /** Creates a processor wired to this factory's shared rule cache. */
  @Override
  public UpdateRequestProcessor getInstance(SolrQueryRequest request,
      SolrQueryResponse response,
      UpdateRequestProcessor nextProcessor) {
    final Map<Object, Object> cache = this.sharedObjectCache;
    return new RegexpBoostProcessor(this.params, request, response, nextProcessor, cache);
  }
}

View File

@ -28,7 +28,9 @@ import java.io.ByteArrayInputStream;
import java.io.InputStreamReader;
import java.io.OutputStream;
import java.io.UnsupportedEncodingException;
import java.text.SimpleDateFormat;
import java.util.ArrayList;
import java.util.Date;
import java.util.HashMap;
import java.util.LinkedHashSet;
import java.util.List;
@ -36,6 +38,7 @@ import java.util.Locale;
import java.util.Map;
import java.util.Set;
import java.util.HashSet;
import java.util.TimeZone;
import java.util.regex.Pattern;
import java.util.regex.PatternSyntaxException;
import java.util.zip.GZIPInputStream;
@ -190,20 +193,10 @@ public class SimplePostTool {
* @param millis the time in milliseconds
*/
private void displayTiming(long millis) {
// TODO: if the intent is user-display: this should use SimpleDateFormat
// or similar instead of homemade formatting.
long hours = millis / 3600000;
long minutes = (millis / 60000) % 60;
long seconds = (millis / 1000) % 60;
long milliseconds = millis % 1000;
if (hours>0) {
System.out.println(String.format(Locale.getDefault(), "Time taken: %02d:%02d:%02d.%03d", hours, minutes, seconds, milliseconds));
} else if (minutes>0) {
System.out.println(String.format(Locale.getDefault(), "Time taken: %02d:%02d.%03d", minutes, seconds, milliseconds));
} else {
System.out.println(String.format(Locale.getDefault(), "Time taken: %d.%03ds", seconds, milliseconds));
}
}
SimpleDateFormat df = new SimpleDateFormat("H:mm:ss.SSS", Locale.getDefault());
df.setTimeZone(TimeZone.getTimeZone("UTC"));
System.out.println("Time spent: "+df.format(new Date(millis)));
}
/**
* Parses incoming arguments and system params and initializes the tool

View File

@ -0,0 +1,10 @@
# Sample config file for RegexBoostProcessor
# This example applies boost on the "url" field to boost or deboost certain urls
# All rules are evaluated, and if several of them match, the boosts are multiplied.
# If for example one rule with boost 2.0 and one rule with boost 0.1 match, the resulting urlboost=0.2
https?://[^/]+/old/.* 0.1 #Comments are removed
https?://[^/]+/.*index\([0-9]\).html$ 0.5
# Prioritize certain sites over others
https?://www.mydomain.no/.* 1.5

View File

@ -252,6 +252,100 @@ public class BasicFunctionalityTest extends SolrTestCaseJ4 {
);
}
/**
* verify that delete by query works with the QParser framework and
* pure negative queries
*/
public void testNonTrivialDeleteByQuery() throws Exception {
clearIndex();
// setup: six docs, ids 101..106, with overlapping text terms
assertU( add(doc("id","101", "text", "red apple" )) );
assertU( add(doc("id","102", "text", "purple grape" )) );
assertU( add(doc("id","103", "text", "green grape" )) );
assertU( add(doc("id","104", "text", "green pear" )) );
assertU( add(doc("id","105", "text", "yellow banana" )) );
assertU( add(doc("id","106", "text", "red cherry" )) );
// sanity checks: verify each query (including the pure-negative forms used
// as delete-by-query below) matches the expected docs before deleting
assertU(commit());
assertQ(req("id:[100 TO 110]")
,"//*[@numFound='6']"
);
assertQ(req("*:*")
,"//*[@numFound='6']"
);
assertQ(req("text:red")
,"//*[@numFound='2']"
);
assertQ(req("-text:red")
,"//*[@numFound='4']"
);
assertQ(req("text:grape")
,"//*[@numFound='2']"
);
assertQ(req("-text:grape")
,"//*[@numFound='4']"
);
assertQ(req("-text:red -text:grape")
,"//*[@numFound='2']"
);
assertQ(req("{!lucene q.op=AND df=text}grape green")
,"//*[@numFound='1']"
,"//int[@name='id'][.='103']"
);
assertQ(req("-_val_:\"{!lucene q.op=AND df=text}grape green\"")
,"//*[@numFound='5']"
,"//int[@name='id'][.='101']"
,"//int[@name='id'][.='102']"
,"//int[@name='id'][.='104']"
,"//int[@name='id'][.='105']"
,"//int[@name='id'][.='106']"
);
// tests: each delete-by-query below removes the docs the matching sanity
// query found, leaving the complement
assertU(delQ("-*:*")); // NOOP: negation of match-all matches nothing
assertU(commit());
assertQ(req("*:*")
,"//*[@numFound='6']"
);
assertU(delQ("-text:grape -text:red"));
assertU(commit());
assertQ(req("*:*")
,"//*[@numFound='4']"
,"//int[@name='id'][.='101']"
,"//int[@name='id'][.='102']"
,"//int[@name='id'][.='103']"
,"//int[@name='id'][.='106']"
);
assertU(delQ("{!term f=id}106"));
assertU(commit());
assertQ(req("*:*")
,"//*[@numFound='3']"
,"//int[@name='id'][.='101']"
,"//int[@name='id'][.='102']"
,"//int[@name='id'][.='103']"
);
assertU(delQ("-_val_:\"{!lucene q.op=AND df=text}grape green\""));
assertU(commit());
assertQ(req("*:*")
,"//*[@numFound='1']"
,"//int[@name='id'][.='103']"
);
// negation of a term matching nothing => matches (and deletes) everything left
assertU(delQ("-text:doesnotexist"));
assertU(commit());
assertQ(req("*:*")
,"//*[@numFound='0']"
);
}
@Test
public void testHTMLStrip() {
assertU(add(doc("id","200", "HTMLwhitetok","&#65;&#66;&#67;")));

View File

@ -0,0 +1,115 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.solr.update.processor;
import org.apache.solr.SolrTestCaseJ4;
import org.apache.solr.common.SolrInputDocument;
import org.apache.solr.common.params.ModifiableSolrParams;
import org.apache.solr.core.SolrCore;
import org.apache.solr.request.SolrQueryRequest;
import org.apache.solr.response.SolrQueryResponse;
import org.apache.solr.servlet.SolrRequestParsers;
import org.apache.solr.update.AddUpdateCommand;
import org.junit.Before;
import org.junit.BeforeClass;
import org.junit.Test;
public class RegexBoostProcessorTest extends SolrTestCaseJ4 {
private static RegexpBoostProcessor reProcessor;
protected static SolrRequestParsers _parser;
protected static ModifiableSolrParams parameters;
private static RegexpBoostProcessorFactory factory;
private SolrInputDocument document;
@BeforeClass
public static void setUpBeforeClass() throws Exception {
initCore("solrconfig.xml", "schema12.xml");
SolrCore core = h.getCore();
_parser = new SolrRequestParsers( null );
SolrQueryResponse resp = null;
parameters = new ModifiableSolrParams();
parameters.set(RegexpBoostProcessor.BOOST_FILENAME_PARAM, "regex-boost-processor-test.txt");
parameters.set(RegexpBoostProcessor.INPUT_FIELD_PARAM, "url");
parameters.set(RegexpBoostProcessor.BOOST_FIELD_PARAM, "urlboost");
SolrQueryRequest req = _parser.buildRequestFrom(core, new ModifiableSolrParams(), null);
factory = new RegexpBoostProcessorFactory();
factory.init(parameters.toNamedList());
reProcessor = (RegexpBoostProcessor) factory.getInstance(req, resp, null);
}
@Before
public void setUp() throws Exception {
document = new SolrInputDocument();
super.setUp();
}
@Test
public void testNoBoost() throws Exception {
document.addField("id", "doc1");
document.addField("url", "http://www.nomatch.no");
processAdd(document);
assertEquals(1.0d, document.getFieldValue("urlboost"));
}
@Test
public void testDeboostOld() throws Exception {
document.addField("id", "doc1");
document.addField("url", "http://www.somedomain.no/old/test.html");
processAdd(document);
assertEquals(0.1d, document.getFieldValue("urlboost"));
// Test the other deboost rule
document = new SolrInputDocument();
document.addField("id", "doc1");
document.addField("url", "http://www.somedomain.no/foo/index(1).html");
processAdd(document);
assertEquals(0.5d, document.getFieldValue("urlboost"));
}
@Test
public void testBoostGood() throws Exception {
document.addField("id", "doc1");
document.addField("url", "http://www.mydomain.no/fifty-percent-boost");
processAdd(document);
assertEquals(1.5d, document.getFieldValue("urlboost"));
}
@Test
public void testTwoRules() throws Exception {
document.addField("id", "doc1");
document.addField("url", "http://www.mydomain.no/old/test.html");
processAdd(document);
assertEquals(0.15d, document.getFieldValue("urlboost"));
}
private void processAdd(SolrInputDocument doc) throws Exception {
AddUpdateCommand addCommand = new AddUpdateCommand(null);
addCommand.solrDoc = doc;
reProcessor.processAdd(addCommand);
}
}