diff --git a/dev-tools/eclipse/dot.classpath b/dev-tools/eclipse/dot.classpath
index c24f5f0b416..1d2abc15758 100644
--- a/dev-tools/eclipse/dot.classpath
+++ b/dev-tools/eclipse/dot.classpath
@@ -1,126 +1,126 @@
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
@@ -133,47 +133,57 @@
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
-
-
-
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
diff --git a/dev-tools/idea/solr/contrib/extraction/extraction.iml b/dev-tools/idea/solr/contrib/extraction/extraction.iml
index 6410dc3c3c5..877d72fd6cd 100644
--- a/dev-tools/idea/solr/contrib/extraction/extraction.iml
+++ b/dev-tools/idea/solr/contrib/extraction/extraction.iml
@@ -15,6 +15,6 @@
-
+
diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt
index 45d5be021ec..2d8d20e329e 100644
--- a/lucene/CHANGES.txt
+++ b/lucene/CHANGES.txt
@@ -75,6 +75,14 @@ Bug Fixes
encoders / stemmers via the ResourceLoader now instead of Class.forName().
Solr users should now no longer have to embed these in its war. (David Smiley)
+* SOLR-3737: StempelPolishStemFilterFactory loaded its stemmer table incorrectly.
+ Also, ensure immutability and use only one instance of this table in RAM (lazy
+ loaded) since it's quite large. (sausarkar, Steven Rowe, Robert Muir)
+
+* LUCENE-4310: MappingCharFilter was failing to match input strings
+ containing non-BMP Unicode characters. (Dawid Weiss, Robert Muir,
+ Mike McCandless)
+
Build
* LUCENE-3985: Upgrade to randomizedtesting 2.0.0. Added support for
diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/charfilter/NormalizeCharMap.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/charfilter/NormalizeCharMap.java
index c6470611d2c..c22203a76a4 100644
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/charfilter/NormalizeCharMap.java
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/charfilter/NormalizeCharMap.java
@@ -111,9 +111,8 @@ public class NormalizeCharMap {
final org.apache.lucene.util.fst.Builder builder = new org.apache.lucene.util.fst.Builder(FST.INPUT_TYPE.BYTE2, outputs);
final IntsRef scratch = new IntsRef();
for(Map.Entry ent : pendingPairs.entrySet()) {
- builder.add(Util.toUTF32(ent.getKey(), scratch),
+ builder.add(Util.toUTF16(ent.getKey(), scratch),
new CharsRef(ent.getValue()));
-
}
map = builder.finish();
pendingPairs.clear();
diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/charfilter/TestMappingCharFilter.java b/lucene/analysis/common/src/test/org/apache/lucene/analysis/charfilter/TestMappingCharFilter.java
index c4fab5519c0..358ab2d7b74 100644
--- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/charfilter/TestMappingCharFilter.java
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/charfilter/TestMappingCharFilter.java
@@ -33,6 +33,7 @@ import org.apache.lucene.analysis.CharFilter;
import org.apache.lucene.analysis.MockTokenizer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
+import org.apache.lucene.util.UnicodeUtil;
import org.apache.lucene.util._TestUtil;
public class TestMappingCharFilter extends BaseTokenStreamTestCase {
@@ -55,6 +56,11 @@ public class TestMappingCharFilter extends BaseTokenStreamTestCase {
builder.add( "empty", "" );
+ // non-BMP (surrogate pair):
+ builder.add(UnicodeUtil.newString(new int[] {0x1D122}, 0, 1), "fclef");
+
+ builder.add("\uff01", "full-width-exclamation");
+
normMap = builder.build();
}
@@ -128,6 +134,18 @@ public class TestMappingCharFilter extends BaseTokenStreamTestCase {
assertTokenStreamContents(ts, new String[0], new int[]{}, new int[]{}, 5);
}
+ public void testNonBMPChar() throws Exception {
+ CharFilter cs = new MappingCharFilter( normMap, new StringReader( UnicodeUtil.newString(new int[] {0x1D122}, 0, 1) ) );
+ TokenStream ts = new MockTokenizer(cs, MockTokenizer.WHITESPACE, false);
+ assertTokenStreamContents(ts, new String[]{"fclef"}, new int[]{0}, new int[]{2}, 2);
+ }
+
+ public void testFullWidthChar() throws Exception {
+ CharFilter cs = new MappingCharFilter( normMap, new StringReader( "\uff01") );
+ TokenStream ts = new MockTokenizer(cs, MockTokenizer.WHITESPACE, false);
+ assertTokenStreamContents(ts, new String[]{"full-width-exclamation"}, new int[]{0}, new int[]{1}, 1);
+ }
+
//
// 1111111111222
// 01234567890123456789012
diff --git a/lucene/analysis/stempel/src/java/org/apache/lucene/analysis/pl/PolishAnalyzer.java b/lucene/analysis/stempel/src/java/org/apache/lucene/analysis/pl/PolishAnalyzer.java
index 32d0665c5ed..f1a8364c0de 100644
--- a/lucene/analysis/stempel/src/java/org/apache/lucene/analysis/pl/PolishAnalyzer.java
+++ b/lucene/analysis/stempel/src/java/org/apache/lucene/analysis/pl/PolishAnalyzer.java
@@ -58,6 +58,13 @@ public final class PolishAnalyzer extends StopwordAnalyzerBase {
return DefaultsHolder.DEFAULT_STOP_SET;
}
+ /**
+ * Returns an unmodifiable instance of the default stemmer table.
+ */
+ public static Trie getDefaultTable() {
+ return DefaultsHolder.DEFAULT_TABLE;
+ }
+
/**
* Atomically loads the DEFAULT_STOP_SET in a lazy fashion once the outer class
* accesses the static final set the first time.;
diff --git a/lucene/analysis/stempel/src/java/org/apache/lucene/analysis/stempel/StempelPolishStemFilterFactory.java b/lucene/analysis/stempel/src/java/org/apache/lucene/analysis/stempel/StempelPolishStemFilterFactory.java
index 9dce52feb3e..64abe3c84c9 100644
--- a/lucene/analysis/stempel/src/java/org/apache/lucene/analysis/stempel/StempelPolishStemFilterFactory.java
+++ b/lucene/analysis/stempel/src/java/org/apache/lucene/analysis/stempel/StempelPolishStemFilterFactory.java
@@ -17,28 +17,17 @@ package org.apache.lucene.analysis.stempel;
* limitations under the License.
*/
-import java.io.IOException;
-
import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.pl.PolishAnalyzer;
import org.apache.lucene.analysis.stempel.StempelFilter;
import org.apache.lucene.analysis.stempel.StempelStemmer;
-import org.apache.lucene.analysis.util.ResourceLoader;
-import org.apache.lucene.analysis.util.ResourceLoaderAware;
import org.apache.lucene.analysis.util.TokenFilterFactory;
-import org.egothor.stemmer.Trie;
/**
* Factory for {@link StempelFilter} using a Polish stemming table.
*/
-public class StempelPolishStemFilterFactory extends TokenFilterFactory implements ResourceLoaderAware {
- private Trie stemmer = null;
- private static final String STEMTABLE = "/org/apache/lucene/analysis/pl/stemmer_20000.tbl";
-
+public class StempelPolishStemFilterFactory extends TokenFilterFactory {
public TokenStream create(TokenStream input) {
- return new StempelFilter(input, new StempelStemmer(stemmer));
- }
-
- public void inform(ResourceLoader loader) throws IOException {
- stemmer = StempelStemmer.load(loader.openResource(STEMTABLE));
+ return new StempelFilter(input, new StempelStemmer(PolishAnalyzer.getDefaultTable()));
}
}
diff --git a/lucene/analysis/stempel/src/java/org/egothor/stemmer/Trie.java b/lucene/analysis/stempel/src/java/org/egothor/stemmer/Trie.java
index b32011fcf4f..20ea467efad 100644
--- a/lucene/analysis/stempel/src/java/org/egothor/stemmer/Trie.java
+++ b/lucene/analysis/stempel/src/java/org/egothor/stemmer/Trie.java
@@ -332,7 +332,7 @@ public class Trie {
* @param key the key
* @param cmd the patch command
*/
- public void add(CharSequence key, CharSequence cmd) {
+ void add(CharSequence key, CharSequence cmd) {
if (key == null || cmd == null) {
return;
}
diff --git a/lucene/analysis/stempel/src/test/org/apache/lucene/analysis/stempel/TestStempelPolishStemFilterFactory.java b/lucene/analysis/stempel/src/test/org/apache/lucene/analysis/stempel/TestStempelPolishStemFilterFactory.java
index 8c57d4ab1cc..e633c72b470 100644
--- a/lucene/analysis/stempel/src/test/org/apache/lucene/analysis/stempel/TestStempelPolishStemFilterFactory.java
+++ b/lucene/analysis/stempel/src/test/org/apache/lucene/analysis/stempel/TestStempelPolishStemFilterFactory.java
@@ -22,7 +22,6 @@ import java.io.StringReader;
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.core.WhitespaceTokenizer;
-import org.apache.lucene.analysis.util.ClasspathResourceLoader;
/**
* Tests for {@link StempelPolishStemFilterFactory}
@@ -31,7 +30,6 @@ public class TestStempelPolishStemFilterFactory extends BaseTokenStreamTestCase
public void testBasics() throws Exception {
StringReader document = new StringReader("studenta studenci");
StempelPolishStemFilterFactory factory = new StempelPolishStemFilterFactory();
- factory.inform(new ClasspathResourceLoader(getClass()));
TokenStream ts = factory.create(new WhitespaceTokenizer(TEST_VERSION_CURRENT, document));
assertTokenStreamContents(ts,
new String[] { "student", "student" });
diff --git a/lucene/build.xml b/lucene/build.xml
index 82a65544788..784faf73889 100644
--- a/lucene/build.xml
+++ b/lucene/build.xml
@@ -234,10 +234,10 @@
-
+
-
+
@@ -247,7 +247,7 @@
-
+
diff --git a/lucene/core/src/java/org/apache/lucene/codecs/BlockTermState.java b/lucene/core/src/java/org/apache/lucene/codecs/BlockTermState.java
index 15c77beacc3..131b33973c6 100644
--- a/lucene/core/src/java/org/apache/lucene/codecs/BlockTermState.java
+++ b/lucene/core/src/java/org/apache/lucene/codecs/BlockTermState.java
@@ -26,11 +26,15 @@ import org.apache.lucene.index.TermState;
* terms dict.
*/
public class BlockTermState extends OrdTermState {
- public int docFreq; // how many docs have this term
- public long totalTermFreq; // total number of occurrences of this term
+ /** how many docs have this term */
+ public int docFreq;
+ /** total number of occurrences of this term */
+ public long totalTermFreq;
- public int termBlockOrd; // the term's ord in the current block
- public long blockFilePointer; // fp into the terms dict primary file (_X.tim) that holds this term
+ /** the term's ord in the current block */
+ public int termBlockOrd;
+ /** fp into the terms dict primary file (_X.tim) that holds this term */
+ public long blockFilePointer;
@Override
public void copyFrom(TermState _other) {
diff --git a/lucene/core/src/java/org/apache/lucene/codecs/MultiLevelSkipListReader.java b/lucene/core/src/java/org/apache/lucene/codecs/MultiLevelSkipListReader.java
index 107e8656a26..1e70b5abde0 100644
--- a/lucene/core/src/java/org/apache/lucene/codecs/MultiLevelSkipListReader.java
+++ b/lucene/core/src/java/org/apache/lucene/codecs/MultiLevelSkipListReader.java
@@ -36,7 +36,7 @@ import org.apache.lucene.util.MathUtil;
*/
public abstract class MultiLevelSkipListReader {
- // the maximum number of skip levels possible for this index
+ /** the maximum number of skip levels possible for this index */
protected int maxNumberOfSkipLevels;
// number of levels in this skip list
diff --git a/lucene/core/src/java/org/apache/lucene/codecs/MultiLevelSkipListWriter.java b/lucene/core/src/java/org/apache/lucene/codecs/MultiLevelSkipListWriter.java
index facef8c1986..6f9309861dd 100644
--- a/lucene/core/src/java/org/apache/lucene/codecs/MultiLevelSkipListWriter.java
+++ b/lucene/core/src/java/org/apache/lucene/codecs/MultiLevelSkipListWriter.java
@@ -49,7 +49,7 @@ import org.apache.lucene.util.MathUtil;
*/
public abstract class MultiLevelSkipListWriter {
- // number of levels in this skip list
+ /** number of levels in this skip list */
protected int numberOfSkipLevels;
// the skip interval in the list with level = 0
@@ -77,8 +77,8 @@ public abstract class MultiLevelSkipListWriter {
}
}
+ /** creates new buffers or empties the existing ones */
protected void resetSkip() {
- // creates new buffers or empties the existing ones
if (skipBuffer == null) {
init();
} else {
diff --git a/lucene/core/src/java/org/apache/lucene/codecs/memory/DirectPostingsFormat.java b/lucene/core/src/java/org/apache/lucene/codecs/memory/DirectPostingsFormat.java
index 03ebc6caf44..b8dc734d197 100644
--- a/lucene/core/src/java/org/apache/lucene/codecs/memory/DirectPostingsFormat.java
+++ b/lucene/core/src/java/org/apache/lucene/codecs/memory/DirectPostingsFormat.java
@@ -1796,7 +1796,7 @@ public class DirectPostingsFormat extends PostingsFormat {
}
// Docs + freqs:
- public final static class HighFreqDocsEnum extends DocsEnum {
+ private final static class HighFreqDocsEnum extends DocsEnum {
private int[] docIDs;
private int[] freqs;
private final Bits liveDocs;
@@ -1969,7 +1969,7 @@ public class DirectPostingsFormat extends PostingsFormat {
}
// TODO: specialize offsets and not
- public final static class HighFreqDocsAndPositionsEnum extends DocsAndPositionsEnum {
+ private final static class HighFreqDocsAndPositionsEnum extends DocsAndPositionsEnum {
private int[] docIDs;
private int[] freqs;
private int[][] positions;
diff --git a/lucene/core/src/java/org/apache/lucene/codecs/sep/IntIndexInput.java b/lucene/core/src/java/org/apache/lucene/codecs/sep/IntIndexInput.java
index 8a25fd77cf4..e473936755a 100644
--- a/lucene/core/src/java/org/apache/lucene/codecs/sep/IntIndexInput.java
+++ b/lucene/core/src/java/org/apache/lucene/codecs/sep/IntIndexInput.java
@@ -36,7 +36,7 @@ public abstract class IntIndexInput implements Closeable {
public abstract Index index() throws IOException;
- // TODO: -- can we simplify this?
+ /** Records a single skip-point in the {@link IntIndexInput.Reader}. */
public abstract static class Index {
public abstract void read(DataInput indexIn, boolean absolute) throws IOException;
@@ -50,6 +50,7 @@ public abstract class IntIndexInput implements Closeable {
public abstract Index clone();
}
+ /** Reads int values. */
public abstract static class Reader {
/** Reads next single int */
diff --git a/lucene/core/src/java/org/apache/lucene/codecs/sep/IntIndexOutput.java b/lucene/core/src/java/org/apache/lucene/codecs/sep/IntIndexOutput.java
index 14723d2574b..fd1eb49b276 100644
--- a/lucene/core/src/java/org/apache/lucene/codecs/sep/IntIndexOutput.java
+++ b/lucene/core/src/java/org/apache/lucene/codecs/sep/IntIndexOutput.java
@@ -38,6 +38,7 @@ public abstract class IntIndexOutput implements Closeable {
* >= 0. */
public abstract void write(int v) throws IOException;
+ /** Records a single skip-point in the IndexOutput. */
public abstract static class Index {
/** Internally records the current location */
diff --git a/lucene/core/src/java/org/apache/lucene/codecs/sep/IntStreamFactory.java b/lucene/core/src/java/org/apache/lucene/codecs/sep/IntStreamFactory.java
index 091d1a72d61..eace0335a8e 100644
--- a/lucene/core/src/java/org/apache/lucene/codecs/sep/IntStreamFactory.java
+++ b/lucene/core/src/java/org/apache/lucene/codecs/sep/IntStreamFactory.java
@@ -22,8 +22,15 @@ import org.apache.lucene.store.IOContext;
import java.io.IOException;
-/** @lucene.experimental */
+/** Provides int reader and writer to specified files.
+ *
+ * @lucene.experimental */
public abstract class IntStreamFactory {
+ /** Create an {@link IntIndexInput} on the provided
+ * fileName. */
public abstract IntIndexInput openInput(Directory dir, String fileName, IOContext context) throws IOException;
+
+ /** Create an {@link IntIndexOutput} on the provided
+ * fileName. */
public abstract IntIndexOutput createOutput(Directory dir, String fileName, IOContext context) throws IOException;
}
diff --git a/lucene/core/src/java/org/apache/lucene/index/DocTermOrds.java b/lucene/core/src/java/org/apache/lucene/index/DocTermOrds.java
index 500dc3eb8f5..93ac2bb4c81 100644
--- a/lucene/core/src/java/org/apache/lucene/index/DocTermOrds.java
+++ b/lucene/core/src/java/org/apache/lucene/index/DocTermOrds.java
@@ -119,10 +119,13 @@ public class DocTermOrds {
protected final String field;
protected int numTermsInField;
- protected long termInstances; // total number of references to term numbers
+ /** total number of references to term numbers */
+ protected long termInstances;
private long memsz;
- protected int total_time; // total time to uninvert the field
- protected int phase1_time; // time for phase1 of the uninvert process
+ /** total time to uninvert the field */
+ protected int total_time;
+ /** time for phase1 of the uninvert process */
+ protected int phase1_time;
protected int[] index;
protected byte[][] tnums = new byte[256][];
@@ -234,7 +237,7 @@ public class DocTermOrds {
protected void setActualDocFreq(int termNum, int df) throws IOException {
}
- // Call this only once (if you subclass!)
+ /** Call this only once (if you subclass!) */
protected void uninvert(final AtomicReader reader, final BytesRef termPrefix) throws IOException {
//System.out.println("DTO uninvert field=" + field + " prefix=" + termPrefix);
final long startTime = System.currentTimeMillis();
diff --git a/lucene/core/src/java/org/apache/lucene/index/FieldInfos.java b/lucene/core/src/java/org/apache/lucene/index/FieldInfos.java
index f0ff871b64f..9b251153127 100644
--- a/lucene/core/src/java/org/apache/lucene/index/FieldInfos.java
+++ b/lucene/core/src/java/org/apache/lucene/index/FieldInfos.java
@@ -249,11 +249,11 @@ public class FieldInfos implements Iterable {
return addOrUpdateInternal(name, -1, isIndexed, storeTermVector, omitNorms, storePayloads, indexOptions, docValues, normType);
}
- // NOTE: this method does not carry over termVector
- // booleans nor docValuesType; the indexer chain
- // (TermVectorsConsumerPerField, DocFieldProcessor) must
- // set these fields when they succeed in consuming
- // the document:
+ /** NOTE: this method does not carry over termVector
+ * booleans nor docValuesType; the indexer chain
+ * (TermVectorsConsumerPerField, DocFieldProcessor) must
+ * set these fields when they succeed in consuming
+ * the document */
public FieldInfo addOrUpdate(String name, IndexableFieldType fieldType) {
// TODO: really, indexer shouldn't even call this
// method (it's only called from DocFieldProcessor);
diff --git a/lucene/core/src/java/org/apache/lucene/index/Fields.java b/lucene/core/src/java/org/apache/lucene/index/Fields.java
index 4ea80152919..0e2e726691d 100644
--- a/lucene/core/src/java/org/apache/lucene/index/Fields.java
+++ b/lucene/core/src/java/org/apache/lucene/index/Fields.java
@@ -33,10 +33,9 @@ public abstract class Fields implements Iterable {
* null if the field does not exist. */
public abstract Terms terms(String field) throws IOException;
- /** Returns the number of terms for all fields, or -1 if this
- * measure isn't stored by the codec. Note that, just like
- * other term measures, this measure does not take deleted
- * documents into account. */
+ /** Returns the number of fields or -1 if the number of
+ * distinct field names is unknown. If >= 0,
+ * {@link #iterator} will return as many field names. */
public abstract int size() throws IOException;
/** Returns the number of terms for all fields, or -1 if this
diff --git a/lucene/core/src/java/org/apache/lucene/index/MergePolicy.java b/lucene/core/src/java/org/apache/lucene/index/MergePolicy.java
index be5420c103a..dc0263eb4ae 100644
--- a/lucene/core/src/java/org/apache/lucene/index/MergePolicy.java
+++ b/lucene/core/src/java/org/apache/lucene/index/MergePolicy.java
@@ -243,6 +243,10 @@ public abstract class MergePolicy implements java.io.Closeable, Cloneable {
}
}
+ /** Thrown when a merge was explicitly aborted because
+ * {@link IndexWriter#close(boolean)} was called with
+ * {@code false}. Normally this exception is
+ * privately caught and suppressed by {@link IndexWriter}. */
public static class MergeAbortedException extends IOException {
public MergeAbortedException() {
super("merge is aborted");
diff --git a/lucene/core/src/java/org/apache/lucene/index/MergeState.java b/lucene/core/src/java/org/apache/lucene/index/MergeState.java
index fbfe063a5a7..a14da90899f 100644
--- a/lucene/core/src/java/org/apache/lucene/index/MergeState.java
+++ b/lucene/core/src/java/org/apache/lucene/index/MergeState.java
@@ -29,6 +29,9 @@ import org.apache.lucene.util.packed.PackedInts;
* @lucene.experimental */
public class MergeState {
+ /**
+ * Remaps docids around deletes during merge
+ */
public static abstract class DocMap {
private final Bits liveDocs;
@@ -197,6 +200,9 @@ public class MergeState {
public SegmentReader[] matchingSegmentReaders;
public int matchedCount;
+ /**
+ * Class for recording units of work when merging segments.
+ */
public static class CheckAbort {
private double workCount;
private final MergePolicy.OneMerge merge;
diff --git a/lucene/core/src/java/org/apache/lucene/index/MultiDocValues.java b/lucene/core/src/java/org/apache/lucene/index/MultiDocValues.java
index abfc979c0dd..a079cd07d0c 100644
--- a/lucene/core/src/java/org/apache/lucene/index/MultiDocValues.java
+++ b/lucene/core/src/java/org/apache/lucene/index/MultiDocValues.java
@@ -43,7 +43,7 @@ import org.apache.lucene.util.packed.PackedInts.Reader;
* @lucene.experimental
* @lucene.internal
*/
-public class MultiDocValues extends DocValues {
+class MultiDocValues extends DocValues {
private static DocValuesPuller DEFAULT_PULLER = new DocValuesPuller();
private static final DocValuesPuller NORMS_PULLER = new DocValuesPuller() {
diff --git a/lucene/core/src/java/org/apache/lucene/index/MultiDocsAndPositionsEnum.java b/lucene/core/src/java/org/apache/lucene/index/MultiDocsAndPositionsEnum.java
index 062890a844d..9bf4559b37e 100644
--- a/lucene/core/src/java/org/apache/lucene/index/MultiDocsAndPositionsEnum.java
+++ b/lucene/core/src/java/org/apache/lucene/index/MultiDocsAndPositionsEnum.java
@@ -143,6 +143,8 @@ public final class MultiDocsAndPositionsEnum extends DocsAndPositionsEnum {
}
// TODO: implement bulk read more efficiently than super
+ /** Holds a {@link DocsAndPositionsEnum} along with the
+ * corresponding {@link ReaderSlice}. */
public final static class EnumWithSlice {
public DocsAndPositionsEnum docsAndPositionsEnum;
public ReaderSlice slice;
diff --git a/lucene/core/src/java/org/apache/lucene/index/MultiDocsEnum.java b/lucene/core/src/java/org/apache/lucene/index/MultiDocsEnum.java
index 2d0fd252d06..af58ac427b8 100644
--- a/lucene/core/src/java/org/apache/lucene/index/MultiDocsEnum.java
+++ b/lucene/core/src/java/org/apache/lucene/index/MultiDocsEnum.java
@@ -122,6 +122,8 @@ public final class MultiDocsEnum extends DocsEnum {
}
// TODO: implement bulk read more efficiently than super
+ /** Holds a {@link DocsEnum} along with the
+ * corresponding {@link ReaderSlice}. */
public final static class EnumWithSlice {
public DocsEnum docsEnum;
public ReaderSlice slice;
diff --git a/lucene/core/src/java/org/apache/lucene/index/SegmentInfos.java b/lucene/core/src/java/org/apache/lucene/index/SegmentInfos.java
index b92d31a1b65..f1b938e3dab 100644
--- a/lucene/core/src/java/org/apache/lucene/index/SegmentInfos.java
+++ b/lucene/core/src/java/org/apache/lucene/index/SegmentInfos.java
@@ -133,7 +133,8 @@ public final class SegmentInfos implements Cloneable, Iterable userData = Collections.emptyMap(); // Opaque Map that user can specify during IndexWriter.commit
+ /** Opaque Map<String, String> that user can specify during IndexWriter.commit */
+ public Map userData = Collections.emptyMap();
private List segments = new ArrayList();
diff --git a/lucene/core/src/java/org/apache/lucene/index/SegmentReadState.java b/lucene/core/src/java/org/apache/lucene/index/SegmentReadState.java
index 71b03463968..79be20cc9b6 100644
--- a/lucene/core/src/java/org/apache/lucene/index/SegmentReadState.java
+++ b/lucene/core/src/java/org/apache/lucene/index/SegmentReadState.java
@@ -30,11 +30,11 @@ public class SegmentReadState {
public final FieldInfos fieldInfos;
public final IOContext context;
- // NOTE: if this is < 0, that means "defer terms index
- // load until needed". But if the codec must load the
- // terms index on init (preflex is the only once currently
- // that must do so), then it should negate this value to
- // get the app's terms divisor:
+ /** NOTE: if this is < 0, that means "defer terms index
+ * load until needed". But if the codec must load the
+ * terms index on init (preflex is the only one currently
+ * that must do so), then it should negate this value to
+ * get the app's terms divisor */
public int termsIndexDivisor;
public final String segmentSuffix;
diff --git a/lucene/core/src/java/org/apache/lucene/index/SegmentWriteState.java b/lucene/core/src/java/org/apache/lucene/index/SegmentWriteState.java
index 6b161df91d6..0c3dd5cf2eb 100644
--- a/lucene/core/src/java/org/apache/lucene/index/SegmentWriteState.java
+++ b/lucene/core/src/java/org/apache/lucene/index/SegmentWriteState.java
@@ -33,11 +33,11 @@ public class SegmentWriteState {
public final FieldInfos fieldInfos;
public int delCountOnFlush;
- // Deletes to apply while we are flushing the segment. A
- // Term is enrolled in here if it was deleted at one
- // point, and it's mapped to the docIDUpto, meaning any
- // docID < docIDUpto containing this term should be
- // deleted.
+ /** Deletes to apply while we are flushing the segment. A
+ * Term is enrolled in here if it was deleted at one
+ * point, and it's mapped to the docIDUpto, meaning any
+ * docID < docIDUpto containing this term should be
+ * deleted. */
public final BufferedDeletes segDeletes;
// Lazily created:
diff --git a/lucene/core/src/java/org/apache/lucene/index/SortedBytesMergeUtils.java b/lucene/core/src/java/org/apache/lucene/index/SortedBytesMergeUtils.java
index 023af1343f2..c4b4998bf4b 100644
--- a/lucene/core/src/java/org/apache/lucene/index/SortedBytesMergeUtils.java
+++ b/lucene/core/src/java/org/apache/lucene/index/SortedBytesMergeUtils.java
@@ -32,6 +32,9 @@ import org.apache.lucene.util.PriorityQueue;
import org.apache.lucene.util.packed.PackedInts;
/**
+ * Utility class for merging SortedBytes DocValues
+ * instances.
+ *
* @lucene.internal
*/
public final class SortedBytesMergeUtils {
@@ -54,7 +57,14 @@ public final class SortedBytesMergeUtils {
}
return new MergeContext(comp, mergeDocCount, size, type);
}
-
+ /**
+ * Encapsulates contextual information about the merge.
+ * This class holds document id to ordinal mappings, offsets for
+ * variable length values and the comparator to sort the merged
+ * bytes.
+ *
+ * @lucene.internal
+ */
public static final class MergeContext {
private final Comparator comp;
private final BytesRef missingValue = new BytesRef();
@@ -169,10 +179,36 @@ public final class SortedBytesMergeUtils {
return merger.currentOrd;
}
+ /**
+ * Implementations of this interface consume the merged bytes with their
+ * corresponding ordinal and byte offset. The offset is the byte offset in
+ * the target sorted source where the currently merged {@link BytesRef} instance
+ * should be stored at.
+ */
public static interface BytesRefConsumer {
+
+ /**
+ * Consumes a single {@link BytesRef}. The provided {@link BytesRef}
+ * instances are strictly increasing with respect to the
+ * {@link Comparator} used for merging.
+ *
+ * @param ref
+ * the {@link BytesRef} to consume
+ * @param ord
+ * the ordinal of the given {@link BytesRef} in the merge target
+ * @param offset
+ * the byte offset of the given {@link BytesRef} in the merge
+ * target
+ * @throws IOException
+ * if an {@link IOException} occurs
+ */
public void consume(BytesRef ref, int ord, long offset) throws IOException;
}
+ /**
+ * A simple {@link BytesRefConsumer} that writes the merged {@link BytesRef}
+ * instances sequentially to an {@link IndexOutput}.
+ */
public static final class IndexOutputBytesRefConsumer implements BytesRefConsumer {
private final IndexOutput datOut;
@@ -186,7 +222,15 @@ public final class SortedBytesMergeUtils {
currentMergedBytes.length);
}
}
-
+
+ /**
+ * {@link RecordMerger} merges a list of {@link SortedSourceSlice} lazily by
+ * consuming the sorted source records one by one and de-duplicates records
+ * that are shared across slices. The algorithm is based on a lazy priority queue
+ * that prevents reading merge sources into heap memory.
+ *
+ * @lucene.internal
+ */
private static final class RecordMerger {
private final MergeQueue queue;
private final SortedSourceSlice[] top;
@@ -231,6 +275,12 @@ public final class SortedBytesMergeUtils {
}
}
+ /**
+ * {@link SortedSourceSlice} represents a single {@link SortedSource} merge candidate.
+ * It encapsulates ordinal and pre-calculated target doc id to ordinal mappings.
+ * This class also holds state private to the merge process.
+ * @lucene.internal
+ */
public static class SortedSourceSlice {
final SortedSource source;
final int readerIdx;
diff --git a/lucene/core/src/java/org/apache/lucene/search/FieldCache.java b/lucene/core/src/java/org/apache/lucene/search/FieldCache.java
index 590d784d77f..ff5994016cb 100644
--- a/lucene/core/src/java/org/apache/lucene/search/FieldCache.java
+++ b/lucene/core/src/java/org/apache/lucene/search/FieldCache.java
@@ -45,6 +45,9 @@ import org.apache.lucene.util.packed.PackedInts;
*/
public interface FieldCache {
+ /**
+ * Placeholder indicating creation of this cache is currently in-progress.
+ */
public static final class CreationPlaceholder {
Object value;
}
diff --git a/lucene/core/src/java/org/apache/lucene/search/FieldComparator.java b/lucene/core/src/java/org/apache/lucene/search/FieldComparator.java
index 83b44bf9b56..9d1fbafd7bf 100644
--- a/lucene/core/src/java/org/apache/lucene/search/FieldComparator.java
+++ b/lucene/core/src/java/org/apache/lucene/search/FieldComparator.java
@@ -194,6 +194,9 @@ public abstract class FieldComparator {
* than the provided value. */
public abstract int compareDocToValue(int doc, T value) throws IOException;
+ /**
+ * Base FieldComparator class for numeric types
+ */
public static abstract class NumericComparator extends FieldComparator {
protected final T missingValue;
protected final String field;
diff --git a/lucene/core/src/java/org/apache/lucene/search/FieldValueHitQueue.java b/lucene/core/src/java/org/apache/lucene/search/FieldValueHitQueue.java
index 5a4d44425f8..97a9974c6f1 100644
--- a/lucene/core/src/java/org/apache/lucene/search/FieldValueHitQueue.java
+++ b/lucene/core/src/java/org/apache/lucene/search/FieldValueHitQueue.java
@@ -33,6 +33,10 @@ import org.apache.lucene.util.PriorityQueue;
*/
public abstract class FieldValueHitQueue extends PriorityQueue {
+ /**
+ * Extension of ScoreDoc to also store the
+ * {@link FieldComparator} slot.
+ */
public static class Entry extends ScoreDoc {
public int slot;
diff --git a/lucene/core/src/java/org/apache/lucene/search/FuzzyTermsEnum.java b/lucene/core/src/java/org/apache/lucene/search/FuzzyTermsEnum.java
index 05e9b34f792..304b4a36acb 100644
--- a/lucene/core/src/java/org/apache/lucene/search/FuzzyTermsEnum.java
+++ b/lucene/core/src/java/org/apache/lucene/search/FuzzyTermsEnum.java
@@ -398,12 +398,17 @@ public class FuzzyTermsEnum extends TermsEnum {
return scale_factor;
}
- /** @lucene.internal */
+ /**
+ * reuses compiled automata across different segments,
+ * because they are independent of the index
+ * @lucene.internal */
public static interface LevenshteinAutomataAttribute extends Attribute {
public List automata();
}
- /** @lucene.internal */
+ /**
+ * Stores compiled automata as a list (indexed by edit distance)
+ * @lucene.internal */
public static final class LevenshteinAutomataAttributeImpl extends AttributeImpl implements LevenshteinAutomataAttribute {
private final List automata = new ArrayList();
diff --git a/lucene/core/src/java/org/apache/lucene/search/IndexSearcher.java b/lucene/core/src/java/org/apache/lucene/search/IndexSearcher.java
index e6359a8c7e1..503b22f075c 100644
--- a/lucene/core/src/java/org/apache/lucene/search/IndexSearcher.java
+++ b/lucene/core/src/java/org/apache/lucene/search/IndexSearcher.java
@@ -83,7 +83,7 @@ public class IndexSearcher {
// in the next release
protected final IndexReaderContext readerContext;
protected final List leafContexts;
- // used with executor - each slice holds a set of leafs executed within one thread
+ /** used with executor - each slice holds a set of leafs executed within one thread */
protected final LeafSlice[] leafSlices;
// These are only used for multi-threaded search
diff --git a/lucene/core/src/java/org/apache/lucene/search/ScoringRewrite.java b/lucene/core/src/java/org/apache/lucene/search/ScoringRewrite.java
index 53b5dc27f75..662e00f3542 100644
--- a/lucene/core/src/java/org/apache/lucene/search/ScoringRewrite.java
+++ b/lucene/core/src/java/org/apache/lucene/search/ScoringRewrite.java
@@ -32,7 +32,11 @@ import org.apache.lucene.util.BytesRefHash;
import org.apache.lucene.util.RamUsageEstimator;
import org.apache.lucene.util.BytesRefHash.DirectBytesStartArray;
-/** @lucene.internal Only public to be accessible by spans package. */
+/**
+ * Base rewrite method that translates each term into a query, and keeps
+ * the scores as computed by the query.
+ *
+ * @lucene.internal Only public to be accessible by spans package. */
public abstract class ScoringRewrite extends TermCollectingRewrite {
/** A rewrite method that first translates each term into
diff --git a/lucene/core/src/java/org/apache/lucene/search/SortField.java b/lucene/core/src/java/org/apache/lucene/search/SortField.java
index 35e07c4ab5b..b0e1b0db1f7 100644
--- a/lucene/core/src/java/org/apache/lucene/search/SortField.java
+++ b/lucene/core/src/java/org/apache/lucene/search/SortField.java
@@ -38,6 +38,9 @@ import org.apache.lucene.util.StringHelper;
*/
public class SortField {
+ /**
+ * Specifies the type of the terms to be sorted, or special types such as CUSTOM
+ */
public static enum Type {
/** Sort by document score (relevance). Sort values are Float and higher
diff --git a/lucene/core/src/java/org/apache/lucene/search/TimeLimitingCollector.java b/lucene/core/src/java/org/apache/lucene/search/TimeLimitingCollector.java
index 1f8c472b40e..1694789d492 100644
--- a/lucene/core/src/java/org/apache/lucene/search/TimeLimitingCollector.java
+++ b/lucene/core/src/java/org/apache/lucene/search/TimeLimitingCollector.java
@@ -225,6 +225,8 @@ public class TimeLimitingCollector extends Collector {
}
/**
+ * Thread used to time out search requests.
+ * Can be stopped completely with {@link TimerThread#stopTimer()}.
* @lucene.experimental
*/
public static final class TimerThread extends Thread {
diff --git a/lucene/core/src/java/org/apache/lucene/search/TopDocsCollector.java b/lucene/core/src/java/org/apache/lucene/search/TopDocsCollector.java
index afbae972645..5e7dd50406f 100644
--- a/lucene/core/src/java/org/apache/lucene/search/TopDocsCollector.java
+++ b/lucene/core/src/java/org/apache/lucene/search/TopDocsCollector.java
@@ -33,8 +33,8 @@ import org.apache.lucene.util.PriorityQueue;
*/
public abstract class TopDocsCollector extends Collector {
- // This is used in case topDocs() is called with illegal parameters, or there
- // simply aren't (enough) results.
+ /** This is used in case topDocs() is called with illegal parameters, or there
+ * simply aren't (enough) results. */
protected static final TopDocs EMPTY_TOPDOCS = new TopDocs(0, new ScoreDoc[0], Float.NaN);
/**
diff --git a/lucene/core/src/java/org/apache/lucene/store/FSDirectory.java b/lucene/core/src/java/org/apache/lucene/store/FSDirectory.java
index adf3f7099ae..d5151718430 100644
--- a/lucene/core/src/java/org/apache/lucene/store/FSDirectory.java
+++ b/lucene/core/src/java/org/apache/lucene/store/FSDirectory.java
@@ -436,6 +436,9 @@ public abstract class FSDirectory extends Directory {
return chunkSize;
}
+ /**
+ * Writes output with {@link RandomAccessFile#write(byte[], int, int)}
+ */
protected static class FSIndexOutput extends BufferedIndexOutput {
private final FSDirectory parent;
private final String name;
diff --git a/lucene/core/src/java/org/apache/lucene/store/NIOFSDirectory.java b/lucene/core/src/java/org/apache/lucene/store/NIOFSDirectory.java
index 5f483fc6ba6..5098542a375 100644
--- a/lucene/core/src/java/org/apache/lucene/store/NIOFSDirectory.java
+++ b/lucene/core/src/java/org/apache/lucene/store/NIOFSDirectory.java
@@ -106,6 +106,9 @@ public class NIOFSDirectory extends FSDirectory {
};
}
+ /**
+ * Reads bytes with {@link FileChannel#read(ByteBuffer, long)}
+ */
protected static class NIOFSIndexInput extends SimpleFSDirectory.SimpleFSIndexInput {
private ByteBuffer byteBuf; // wraps the buffer for NIO
diff --git a/lucene/core/src/java/org/apache/lucene/store/RAMFile.java b/lucene/core/src/java/org/apache/lucene/store/RAMFile.java
index e34610779f1..b89d308f41a 100644
--- a/lucene/core/src/java/org/apache/lucene/store/RAMFile.java
+++ b/lucene/core/src/java/org/apache/lucene/store/RAMFile.java
@@ -19,7 +19,9 @@ package org.apache.lucene.store;
import java.util.ArrayList;
-/** @lucene.internal */
+/**
+ * Represents a file in RAM as a list of byte[] buffers.
+ * @lucene.internal */
public class RAMFile {
protected ArrayList buffers = new ArrayList();
long length;
diff --git a/lucene/core/src/java/org/apache/lucene/store/SimpleFSDirectory.java b/lucene/core/src/java/org/apache/lucene/store/SimpleFSDirectory.java
index e2deb92892a..e74e642ece9 100644
--- a/lucene/core/src/java/org/apache/lucene/store/SimpleFSDirectory.java
+++ b/lucene/core/src/java/org/apache/lucene/store/SimpleFSDirectory.java
@@ -85,8 +85,16 @@ public class SimpleFSDirectory extends FSDirectory {
};
}
+ /**
+ * Reads bytes with {@link RandomAccessFile#seek(long)} followed by
+ * {@link RandomAccessFile#read(byte[], int, int)}.
+ */
protected static class SimpleFSIndexInput extends BufferedIndexInput {
+ /**
+ * Extension of RandomAccessFile that tracks if the file is
+ * open.
+ */
protected static class Descriptor extends RandomAccessFile {
// remember if the file is open, so that we don't try to close it
// more than once
diff --git a/lucene/core/src/java/org/apache/lucene/util/ByteBlockPool.java b/lucene/core/src/java/org/apache/lucene/util/ByteBlockPool.java
index 3378a3e2141..9a863ffdd5b 100644
--- a/lucene/core/src/java/org/apache/lucene/util/ByteBlockPool.java
+++ b/lucene/core/src/java/org/apache/lucene/util/ByteBlockPool.java
@@ -117,10 +117,13 @@ public final class ByteBlockPool {
public byte[][] buffers = new byte[10][];
int bufferUpto = -1; // Which buffer we are upto
- public int byteUpto = BYTE_BLOCK_SIZE; // Where we are in head buffer
+ /** Where we are in head buffer */
+ public int byteUpto = BYTE_BLOCK_SIZE;
- public byte[] buffer; // Current head buffer
- public int byteOffset = -BYTE_BLOCK_SIZE; // Current head offset
+ /** Current head buffer */
+ public byte[] buffer;
+ /** Current head offset */
+ public int byteOffset = -BYTE_BLOCK_SIZE;
private final Allocator allocator;
diff --git a/lucene/core/src/java/org/apache/lucene/util/FilterIterator.java b/lucene/core/src/java/org/apache/lucene/util/FilterIterator.java
index d67a2ff7888..44de79a4ad0 100644
--- a/lucene/core/src/java/org/apache/lucene/util/FilterIterator.java
+++ b/lucene/core/src/java/org/apache/lucene/util/FilterIterator.java
@@ -20,12 +20,17 @@ import java.util.NoSuchElementException;
* the License.
*/
+/**
+ * An {@link Iterator} implementation that filters elements with a boolean predicate.
+ * @see #predicateFunction
+ */
public abstract class FilterIterator implements Iterator {
private final Iterator iterator;
private T next = null;
private boolean nextIsSet = false;
+ /** Returns true if this element should be returned by {@link #next()}. */
protected abstract boolean predicateFunction(T object);
public FilterIterator(Iterator baseIterator) {
diff --git a/lucene/core/src/java/org/apache/lucene/util/FuzzySet.java b/lucene/core/src/java/org/apache/lucene/util/FuzzySet.java
index dec9a6e67b2..a7593767652 100644
--- a/lucene/core/src/java/org/apache/lucene/util/FuzzySet.java
+++ b/lucene/core/src/java/org/apache/lucene/util/FuzzySet.java
@@ -48,6 +48,11 @@ public class FuzzySet {
public static final int FUZZY_SERIALIZATION_VERSION=1;
+ /**
+ * Result from {@link FuzzySet#contains(BytesRef)}:
+ * can never definitively return YES (always MAYBE),
+ * but can sometimes definitively return NO.
+ */
public enum ContainsResult {
MAYBE, NO
};
diff --git a/lucene/core/src/java/org/apache/lucene/util/fst/FST.java b/lucene/core/src/java/org/apache/lucene/util/fst/FST.java
index dc099696b60..cd97d7086e4 100644
--- a/lucene/core/src/java/org/apache/lucene/util/fst/FST.java
+++ b/lucene/core/src/java/org/apache/lucene/util/fst/FST.java
@@ -158,7 +158,7 @@ public final class FST {
private final boolean packed;
private PackedInts.Reader nodeRefToAddress;
- // If arc has this label then that arc is final/accepted
+ /** If arc has this label then that arc is final/accepted */
public static final int END_LABEL = -1;
private boolean allowArrayArcs = true;
@@ -174,7 +174,7 @@ public final class FST {
// building an FST w/ willPackFST=true:
int node;
- // To node (ord or address):
+ /** To node (ord or address) */
public int target;
byte flags;
@@ -542,8 +542,8 @@ public final class FST {
return v;
}
- // returns true if the node at this address has any
- // outgoing arcs
+ /** returns true if the node at this address has any
+ * outgoing arcs */
public static boolean targetHasArcs(Arc arc) {
return arc.target > 0;
}
diff --git a/lucene/core/src/java/org/apache/lucene/util/fst/Util.java b/lucene/core/src/java/org/apache/lucene/util/fst/Util.java
index 55823a3f829..0dbc7966eb5 100644
--- a/lucene/core/src/java/org/apache/lucene/util/fst/Util.java
+++ b/lucene/core/src/java/org/apache/lucene/util/fst/Util.java
@@ -767,6 +767,19 @@ public final class Util {
}
}
+ /** Just maps each UTF-16 unit (char) to an int in the
+ * provided scratch IntsRef, returning it. */
+ public static IntsRef toUTF16(CharSequence s, IntsRef scratch) {
+ final int charLimit = s.length();
+ scratch.offset = 0;
+ scratch.length = charLimit;
+ scratch.grow(charLimit);
+ for (int idx = 0; idx < charLimit; idx++) {
+ scratch.ints[idx] = (int) s.charAt(idx);
+ }
+ return scratch;
+ }
+
/** Decodes the Unicode codepoints from the provided
* CharSequence and places them in the provided scratch
* IntsRef, which must not be null, returning it. */
diff --git a/lucene/core/src/java/org/apache/lucene/util/packed/PackedInts.java b/lucene/core/src/java/org/apache/lucene/util/packed/PackedInts.java
index 0fa828566ee..7318e167aa4 100644
--- a/lucene/core/src/java/org/apache/lucene/util/packed/PackedInts.java
+++ b/lucene/core/src/java/org/apache/lucene/util/packed/PackedInts.java
@@ -572,7 +572,7 @@ public class PackedInts {
}
return new Packed64(in, valueCount, bitsPerValue);
default:
- throw new AssertionError("Unknwown Writer format: " + format);
+ throw new AssertionError("Unknown Writer format: " + format);
}
}
diff --git a/lucene/core/src/test/org/apache/lucene/search/TestSimilarityProvider.java b/lucene/core/src/test/org/apache/lucene/search/TestSimilarityProvider.java
index 9df5afbcdbc..4efee8654ee 100644
--- a/lucene/core/src/test/org/apache/lucene/search/TestSimilarityProvider.java
+++ b/lucene/core/src/test/org/apache/lucene/search/TestSimilarityProvider.java
@@ -20,12 +20,13 @@ package org.apache.lucene.search;
import org.apache.lucene.analysis.MockAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
+import org.apache.lucene.index.AtomicReader;
+import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.FieldInvertState;
-import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriterConfig;
-import org.apache.lucene.index.MultiDocValues;
import org.apache.lucene.index.Norm;
import org.apache.lucene.index.RandomIndexWriter;
+import org.apache.lucene.index.SlowCompositeReaderWrapper;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.similarities.PerFieldSimilarityWrapper;
import org.apache.lucene.search.similarities.Similarity;
@@ -36,7 +37,7 @@ import org.apache.lucene.util.LuceneTestCase;
public class TestSimilarityProvider extends LuceneTestCase {
private Directory directory;
- private IndexReader reader;
+ private DirectoryReader reader;
private IndexSearcher searcher;
@Override
@@ -75,8 +76,9 @@ public class TestSimilarityProvider extends LuceneTestCase {
public void testBasics() throws Exception {
// sanity check of norms writer
// TODO: generalize
- byte fooNorms[] = (byte[]) MultiDocValues.getNormDocValues(reader, "foo").getSource().getArray();
- byte barNorms[] = (byte[]) MultiDocValues.getNormDocValues(reader, "bar").getSource().getArray();
+ AtomicReader slow = new SlowCompositeReaderWrapper(reader);
+ byte fooNorms[] = (byte[]) slow.normValues("foo").getSource().getArray();
+ byte barNorms[] = (byte[]) slow.normValues("bar").getSource().getArray();
for (int i = 0; i < fooNorms.length; i++) {
assertFalse(fooNorms[i] == barNorms[i]);
}
diff --git a/lucene/core/src/test/org/apache/lucene/store/TestMockDirectoryWrapper.java b/lucene/core/src/test/org/apache/lucene/store/TestMockDirectoryWrapper.java
new file mode 100644
index 00000000000..07bf50f7e48
--- /dev/null
+++ b/lucene/core/src/test/org/apache/lucene/store/TestMockDirectoryWrapper.java
@@ -0,0 +1,54 @@
+package org.apache.lucene.store;
+
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.IOException;
+
+import org.apache.lucene.index.IndexWriter;
+import org.apache.lucene.index.IndexWriterConfig;
+import org.apache.lucene.util.LuceneTestCase;
+
+public class TestMockDirectoryWrapper extends LuceneTestCase {
+
+ public void testFailIfIndexWriterNotClosed() throws IOException {
+ MockDirectoryWrapper dir = newMockDirectory();
+ IndexWriter iw = new IndexWriter(dir, new IndexWriterConfig(TEST_VERSION_CURRENT, null));
+ try {
+ dir.close();
+ fail();
+ } catch (Exception expected) {
+ assertTrue(expected.getMessage().contains("there are still open locks"));
+ }
+ iw.close();
+ dir.close();
+ }
+
+ public void testFailIfIndexWriterNotClosedChangeLockFactory() throws IOException {
+ MockDirectoryWrapper dir = newMockDirectory();
+ dir.setLockFactory(new SingleInstanceLockFactory());
+ IndexWriter iw = new IndexWriter(dir, new IndexWriterConfig(TEST_VERSION_CURRENT, null));
+ try {
+ dir.close();
+ fail();
+ } catch (Exception expected) {
+ assertTrue(expected.getMessage().contains("there are still open locks"));
+ }
+ iw.close();
+ dir.close();
+ }
+}
diff --git a/lucene/core/src/test/org/apache/lucene/util/junitcompat/TestFailIfDirectoryNotClosed.java b/lucene/core/src/test/org/apache/lucene/util/junitcompat/TestFailIfDirectoryNotClosed.java
index 1079d291e41..6ccb95f41f1 100644
--- a/lucene/core/src/test/org/apache/lucene/util/junitcompat/TestFailIfDirectoryNotClosed.java
+++ b/lucene/core/src/test/org/apache/lucene/util/junitcompat/TestFailIfDirectoryNotClosed.java
@@ -17,13 +17,7 @@ package org.apache.lucene.util.junitcompat;
* limitations under the License.
*/
-import java.io.IOException;
-
-import org.apache.lucene.index.IndexWriter;
-import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.store.Directory;
-import org.apache.lucene.store.MockDirectoryWrapper;
-import org.apache.lucene.store.SingleInstanceLockFactory;
import org.junit.Assert;
import org.junit.Test;
import org.junit.runner.JUnitCore;
@@ -40,39 +34,10 @@ public class TestFailIfDirectoryNotClosed extends WithNestedTests {
System.out.println(dir.toString());
}
}
-
- public static class Nested2 extends WithNestedTests.AbstractNestedTest {
- public void testDummy() throws IOException {
- MockDirectoryWrapper dir = newMockDirectory();
- IndexWriter iw = new IndexWriter(dir, new IndexWriterConfig(TEST_VERSION_CURRENT, null));
- dir.close();
- }
- }
-
- public static class Nested3 extends WithNestedTests.AbstractNestedTest {
- public void testDummy() throws IOException {
- MockDirectoryWrapper dir = newMockDirectory();
- dir.setLockFactory(new SingleInstanceLockFactory());
- IndexWriter iw = new IndexWriter(dir, new IndexWriterConfig(TEST_VERSION_CURRENT, null));
- dir.close();
- }
- }
@Test
public void testFailIfDirectoryNotClosed() {
Result r = JUnitCore.runClasses(Nested1.class);
Assert.assertEquals(1, r.getFailureCount());
}
-
- @Test
- public void testFailIfIndexWriterNotClosed() {
- Result r = JUnitCore.runClasses(Nested2.class);
- Assert.assertEquals(1, r.getFailureCount());
- }
-
- @Test
- public void testFailIfIndexWriterNotClosedChangeLockFactory() {
- Result r = JUnitCore.runClasses(Nested3.class);
- Assert.assertEquals(1, r.getFailureCount());
- }
}
diff --git a/lucene/core/src/test/org/apache/lucene/util/junitcompat/TestLeaveFilesIfTestFails.java b/lucene/core/src/test/org/apache/lucene/util/junitcompat/TestLeaveFilesIfTestFails.java
new file mode 100644
index 00000000000..e749b290a02
--- /dev/null
+++ b/lucene/core/src/test/org/apache/lucene/util/junitcompat/TestLeaveFilesIfTestFails.java
@@ -0,0 +1,49 @@
+package org.apache.lucene.util.junitcompat;
+
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.File;
+
+import org.apache.lucene.util._TestUtil;
+import org.junit.Assert;
+import org.junit.Test;
+import org.junit.runner.JUnitCore;
+import org.junit.runner.Result;
+
+public class TestLeaveFilesIfTestFails extends WithNestedTests {
+ public TestLeaveFilesIfTestFails() {
+ super(true);
+ }
+
+ public static class Nested1 extends WithNestedTests.AbstractNestedTest {
+ static File file;
+ public void testDummy() {
+ file = _TestUtil.getTempDir("leftover");
+ file.mkdirs();
+ fail();
+ }
+ }
+
+ @Test
+ public void testLeaveFilesIfTestFails() {
+ Result r = JUnitCore.runClasses(Nested1.class);
+ Assert.assertEquals(1, r.getFailureCount());
+ Assert.assertTrue(Nested1.file.exists());
+ Nested1.file.delete();
+ }
+}
diff --git a/lucene/grouping/src/java/org/apache/lucene/search/grouping/AbstractDistinctValuesCollector.java b/lucene/grouping/src/java/org/apache/lucene/search/grouping/AbstractDistinctValuesCollector.java
index a3138a22761..419189b1b5f 100644
--- a/lucene/grouping/src/java/org/apache/lucene/search/grouping/AbstractDistinctValuesCollector.java
+++ b/lucene/grouping/src/java/org/apache/lucene/search/grouping/AbstractDistinctValuesCollector.java
@@ -44,6 +44,10 @@ public abstract class AbstractDistinctValuesCollector {
public final GROUP_VALUE_TYPE groupValue;
diff --git a/lucene/grouping/src/java/org/apache/lucene/search/grouping/CollectedSearchGroup.java b/lucene/grouping/src/java/org/apache/lucene/search/grouping/CollectedSearchGroup.java
index c793f27224d..63c8871ed30 100644
--- a/lucene/grouping/src/java/org/apache/lucene/search/grouping/CollectedSearchGroup.java
+++ b/lucene/grouping/src/java/org/apache/lucene/search/grouping/CollectedSearchGroup.java
@@ -17,7 +17,12 @@
package org.apache.lucene.search.grouping;
-/** @lucene.internal */
+import org.apache.lucene.search.FieldComparator; // javadocs
+
+/**
+ * Expert: representation of a group in {@link AbstractFirstPassGroupingCollector},
+ * tracking the top doc and {@link FieldComparator} slot.
+ * @lucene.internal */
public class CollectedSearchGroup extends SearchGroup {
int topDoc;
int comparatorSlot;
diff --git a/lucene/module-build.xml b/lucene/module-build.xml
index 87baa3d3180..233a0ef53a0 100644
--- a/lucene/module-build.xml
+++ b/lucene/module-build.xml
@@ -90,6 +90,28 @@
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
diff --git a/lucene/queries/src/java/org/apache/lucene/queries/function/valuesource/DoubleConstValueSource.java b/lucene/queries/src/java/org/apache/lucene/queries/function/valuesource/DoubleConstValueSource.java
index 5fad61e0bb4..a3cea886254 100755
--- a/lucene/queries/src/java/org/apache/lucene/queries/function/valuesource/DoubleConstValueSource.java
+++ b/lucene/queries/src/java/org/apache/lucene/queries/function/valuesource/DoubleConstValueSource.java
@@ -24,6 +24,9 @@ import org.apache.lucene.queries.function.docvalues.DoubleDocValues;
import java.io.IOException;
import java.util.Map;
+/**
+ * Function that returns a constant double value for every document.
+ */
public class DoubleConstValueSource extends ConstNumberSource {
final double constant;
private final float fv;
diff --git a/lucene/queries/src/java/org/apache/lucene/queries/function/valuesource/IDFValueSource.java b/lucene/queries/src/java/org/apache/lucene/queries/function/valuesource/IDFValueSource.java
index 319d755eaa4..0c620203462 100755
--- a/lucene/queries/src/java/org/apache/lucene/queries/function/valuesource/IDFValueSource.java
+++ b/lucene/queries/src/java/org/apache/lucene/queries/function/valuesource/IDFValueSource.java
@@ -28,7 +28,13 @@ import org.apache.lucene.util.BytesRef;
import java.io.IOException;
import java.util.Map;
-/** @lucene.internal */
+/**
+ * Function that returns {@link TFIDFSimilarity#idf(long, long)}
+ * for every document.
+ *
+ * Note that the configured Similarity for the field must be
+ * a subclass of {@link TFIDFSimilarity}
+ * @lucene.internal */
public class IDFValueSource extends DocFreqValueSource {
public IDFValueSource(String field, String val, String indexedField, BytesRef indexedBytes) {
super(field, val, indexedField, indexedBytes);
diff --git a/lucene/queries/src/java/org/apache/lucene/queries/function/valuesource/IfFunction.java b/lucene/queries/src/java/org/apache/lucene/queries/function/valuesource/IfFunction.java
index ff693b5c9ab..fde5c5c528a 100644
--- a/lucene/queries/src/java/org/apache/lucene/queries/function/valuesource/IfFunction.java
+++ b/lucene/queries/src/java/org/apache/lucene/queries/function/valuesource/IfFunction.java
@@ -30,6 +30,10 @@ import java.util.List;
import java.util.Map;
+/**
+ * Depending on the boolean value of the ifSource function,
+ * returns the value of the trueSource or falseSource function.
+ */
public class IfFunction extends BoolFunction {
private final ValueSource ifSource;
private final ValueSource trueSource;
diff --git a/lucene/queries/src/java/org/apache/lucene/queries/function/valuesource/MaxDocValueSource.java b/lucene/queries/src/java/org/apache/lucene/queries/function/valuesource/MaxDocValueSource.java
index 789f047253f..66c58e578a0 100755
--- a/lucene/queries/src/java/org/apache/lucene/queries/function/valuesource/MaxDocValueSource.java
+++ b/lucene/queries/src/java/org/apache/lucene/queries/function/valuesource/MaxDocValueSource.java
@@ -17,6 +17,7 @@
package org.apache.lucene.queries.function.valuesource;
import org.apache.lucene.index.AtomicReaderContext;
+import org.apache.lucene.index.IndexReader; // javadocs
import org.apache.lucene.queries.function.FunctionValues;
import org.apache.lucene.queries.function.ValueSource;
import org.apache.lucene.search.IndexSearcher;
@@ -24,6 +25,11 @@ import org.apache.lucene.search.IndexSearcher;
import java.io.IOException;
import java.util.Map;
+/**
+ * Returns the value of {@link IndexReader#maxDoc()}
+ * for every document. This is the number of documents
+ * including deletions.
+ */
public class MaxDocValueSource extends ValueSource {
public String name() {
return "maxdoc";
diff --git a/lucene/queries/src/java/org/apache/lucene/queries/function/valuesource/NormValueSource.java b/lucene/queries/src/java/org/apache/lucene/queries/function/valuesource/NormValueSource.java
index 88b357c5b4d..acf454d1e5b 100755
--- a/lucene/queries/src/java/org/apache/lucene/queries/function/valuesource/NormValueSource.java
+++ b/lucene/queries/src/java/org/apache/lucene/queries/function/valuesource/NormValueSource.java
@@ -28,6 +28,13 @@ import org.apache.lucene.search.similarities.TFIDFSimilarity;
import java.io.IOException;
import java.util.Map;
+/**
+ * Function that returns {@link TFIDFSimilarity#decodeNormValue(byte)}
+ * for every document.
+ *
+ * Note that the configured Similarity for the field must be
+ * a subclass of {@link TFIDFSimilarity}
+ * @lucene.internal */
public class NormValueSource extends ValueSource {
protected final String field;
public NormValueSource(String field) {
diff --git a/lucene/queries/src/java/org/apache/lucene/queries/function/valuesource/SumTotalTermFreqValueSource.java b/lucene/queries/src/java/org/apache/lucene/queries/function/valuesource/SumTotalTermFreqValueSource.java
index 54e9dac08fb..e9ab075e3b9 100644
--- a/lucene/queries/src/java/org/apache/lucene/queries/function/valuesource/SumTotalTermFreqValueSource.java
+++ b/lucene/queries/src/java/org/apache/lucene/queries/function/valuesource/SumTotalTermFreqValueSource.java
@@ -30,7 +30,10 @@ import java.io.IOException;
import java.util.Map;
/**
- * TotalTermFreqValueSource returns the total term freq (sum of term freqs across all docuyments).
+ * SumTotalTermFreqValueSource returns the number of tokens
+ * (sum of term freqs across all documents, across all terms).
+ * Returns -1 if frequencies were omitted for the field, or if
+ * the codec doesn't support this statistic.
* @lucene.internal
*/
public class SumTotalTermFreqValueSource extends ValueSource {
diff --git a/lucene/queries/src/java/org/apache/lucene/queries/function/valuesource/TFValueSource.java b/lucene/queries/src/java/org/apache/lucene/queries/function/valuesource/TFValueSource.java
index d8803ace2a7..f0e4a9c8fbb 100755
--- a/lucene/queries/src/java/org/apache/lucene/queries/function/valuesource/TFValueSource.java
+++ b/lucene/queries/src/java/org/apache/lucene/queries/function/valuesource/TFValueSource.java
@@ -28,6 +28,13 @@ import org.apache.lucene.util.BytesRef;
import java.io.IOException;
import java.util.Map;
+/**
+ * Function that returns {@link TFIDFSimilarity#tf(int)}
+ * for every document.
+ *
+ * Note that the configured Similarity for the field must be
+ * a subclass of {@link TFIDFSimilarity}
+ * @lucene.internal */
public class TFValueSource extends TermFreqValueSource {
public TFValueSource(String field, String val, String indexedField, BytesRef indexedBytes) {
super(field, val, indexedField, indexedBytes);
diff --git a/lucene/queries/src/java/org/apache/lucene/queries/function/valuesource/TermFreqValueSource.java b/lucene/queries/src/java/org/apache/lucene/queries/function/valuesource/TermFreqValueSource.java
index eab10bcf835..c2b06542f01 100755
--- a/lucene/queries/src/java/org/apache/lucene/queries/function/valuesource/TermFreqValueSource.java
+++ b/lucene/queries/src/java/org/apache/lucene/queries/function/valuesource/TermFreqValueSource.java
@@ -26,6 +26,13 @@ import org.apache.lucene.util.BytesRef;
import java.io.IOException;
import java.util.Map;
+/**
+ * Function that returns {@link DocsEnum#freq()} for the
+ * supplied term in every document.
+ *
+ * If the term does not exist in the document, returns 0.
+ * If frequencies are omitted, returns 1.
+ */
public class TermFreqValueSource extends DocFreqValueSource {
public TermFreqValueSource(String field, String val, String indexedField, BytesRef indexedBytes) {
super(field, val, indexedField, indexedBytes);
diff --git a/lucene/queries/src/java/org/apache/lucene/queries/function/valuesource/TotalTermFreqValueSource.java b/lucene/queries/src/java/org/apache/lucene/queries/function/valuesource/TotalTermFreqValueSource.java
index 52cede17cc0..65b2abd1af5 100644
--- a/lucene/queries/src/java/org/apache/lucene/queries/function/valuesource/TotalTermFreqValueSource.java
+++ b/lucene/queries/src/java/org/apache/lucene/queries/function/valuesource/TotalTermFreqValueSource.java
@@ -28,7 +28,10 @@ import java.io.IOException;
import java.util.Map;
/**
- * TotalTermFreqValueSource returns the total term freq (sum of term freqs across all docuyments).
+ * TotalTermFreqValueSource returns the total term freq
+ * (sum of term freqs across all documents).
+ * Returns -1 if frequencies were omitted for the field, or if
+ * the codec doesn't support this statistic.
* @lucene.internal
*/
public class TotalTermFreqValueSource extends ValueSource {
diff --git a/lucene/test-framework/src/java/org/apache/lucene/analysis/MockHoleInjectingTokenFilter.java b/lucene/test-framework/src/java/org/apache/lucene/analysis/MockHoleInjectingTokenFilter.java
index 7685da23405..1718c1ce213 100644
--- a/lucene/test-framework/src/java/org/apache/lucene/analysis/MockHoleInjectingTokenFilter.java
+++ b/lucene/test-framework/src/java/org/apache/lucene/analysis/MockHoleInjectingTokenFilter.java
@@ -29,7 +29,9 @@ import org.apache.lucene.util._TestUtil;
// a MockRemovesTokensTF, ideally subclassing FilteringTF
// (in modules/analysis)
-// Randomly injects holes:
+/**
+ * Randomly injects holes (similar to what a stopfilter would do)
+ */
public final class MockHoleInjectingTokenFilter extends TokenFilter {
private final long randomSeed;
diff --git a/lucene/test-framework/src/java/org/apache/lucene/analysis/MockPayloadAnalyzer.java b/lucene/test-framework/src/java/org/apache/lucene/analysis/MockPayloadAnalyzer.java
index 2c17c78b3cf..ebc32408ec6 100644
--- a/lucene/test-framework/src/java/org/apache/lucene/analysis/MockPayloadAnalyzer.java
+++ b/lucene/test-framework/src/java/org/apache/lucene/analysis/MockPayloadAnalyzer.java
@@ -27,8 +27,9 @@ import java.io.Reader;
/**
- *
- *
+ * Wraps a whitespace tokenizer with a filter that sets
+ * the first token, and odd tokens to posinc=1, and all others
+ * to 0, encoding the position as pos: XXX in the payload.
**/
public final class MockPayloadAnalyzer extends Analyzer {
diff --git a/lucene/test-framework/src/java/org/apache/lucene/codecs/lucene40ords/Lucene40WithOrds.java b/lucene/test-framework/src/java/org/apache/lucene/codecs/lucene40ords/Lucene40WithOrds.java
index 5f34483d32b..38c0dcc3246 100644
--- a/lucene/test-framework/src/java/org/apache/lucene/codecs/lucene40ords/Lucene40WithOrds.java
+++ b/lucene/test-framework/src/java/org/apache/lucene/codecs/lucene40ords/Lucene40WithOrds.java
@@ -30,6 +30,7 @@ import org.apache.lucene.codecs.PostingsReaderBase;
import org.apache.lucene.codecs.PostingsWriterBase;
import org.apache.lucene.codecs.TermsIndexReaderBase;
import org.apache.lucene.codecs.TermsIndexWriterBase;
+import org.apache.lucene.codecs.lucene40.Lucene40Codec; // javadocs
import org.apache.lucene.codecs.lucene40.Lucene40PostingsReader;
import org.apache.lucene.codecs.lucene40.Lucene40PostingsWriter;
import org.apache.lucene.index.SegmentReadState;
@@ -39,6 +40,10 @@ import org.apache.lucene.util.BytesRef;
// TODO: we could make separate base class that can wrapp
// any PostingsBaseFormat and make it ord-able...
+/**
+ * Customized version of {@link Lucene40Codec} that uses
+ * {@link FixedGapTermsIndexWriter}.
+ */
public class Lucene40WithOrds extends PostingsFormat {
public Lucene40WithOrds() {
diff --git a/lucene/test-framework/src/java/org/apache/lucene/codecs/mockintblock/MockFixedIntBlockPostingsFormat.java b/lucene/test-framework/src/java/org/apache/lucene/codecs/mockintblock/MockFixedIntBlockPostingsFormat.java
index 8a0e9e157d8..468e1b51b73 100644
--- a/lucene/test-framework/src/java/org/apache/lucene/codecs/mockintblock/MockFixedIntBlockPostingsFormat.java
+++ b/lucene/test-framework/src/java/org/apache/lucene/codecs/mockintblock/MockFixedIntBlockPostingsFormat.java
@@ -72,6 +72,9 @@ public class MockFixedIntBlockPostingsFormat extends PostingsFormat {
return new MockIntFactory(blockSize);
}
+ /**
+ * Encodes blocks as vInts of a fixed block size.
+ */
public static class MockIntFactory extends IntStreamFactory {
private final int blockSize;
diff --git a/lucene/test-framework/src/java/org/apache/lucene/codecs/mockintblock/MockVariableIntBlockPostingsFormat.java b/lucene/test-framework/src/java/org/apache/lucene/codecs/mockintblock/MockVariableIntBlockPostingsFormat.java
index 8f569561d0e..1a38cb34e0d 100644
--- a/lucene/test-framework/src/java/org/apache/lucene/codecs/mockintblock/MockVariableIntBlockPostingsFormat.java
+++ b/lucene/test-framework/src/java/org/apache/lucene/codecs/mockintblock/MockVariableIntBlockPostingsFormat.java
@@ -70,6 +70,10 @@ public class MockVariableIntBlockPostingsFormat extends PostingsFormat {
return getName() + "(baseBlockSize="+ baseBlockSize + ")";
}
+ /**
+ * If the first value is &lt;= 3, writes baseBlockSize vInts at once,
+ * otherwise writes 2*baseBlockSize vInts.
+ */
public static class MockIntFactory extends IntStreamFactory {
private final int baseBlockSize;
diff --git a/lucene/test-framework/src/java/org/apache/lucene/codecs/mocksep/MockSingleIntFactory.java b/lucene/test-framework/src/java/org/apache/lucene/codecs/mocksep/MockSingleIntFactory.java
index 8c48f1f1541..ca42debdca2 100644
--- a/lucene/test-framework/src/java/org/apache/lucene/codecs/mocksep/MockSingleIntFactory.java
+++ b/lucene/test-framework/src/java/org/apache/lucene/codecs/mocksep/MockSingleIntFactory.java
@@ -25,7 +25,10 @@ import org.apache.lucene.codecs.sep.IntStreamFactory;
import java.io.IOException;
-/** @lucene.experimental */
+/**
+ * Encodes ints directly as vInts with {@link MockSingleIntIndexOutput}
+ * @lucene.experimental
+ */
public class MockSingleIntFactory extends IntStreamFactory {
@Override
public IntIndexInput openInput(Directory dir, String fileName, IOContext context) throws IOException {
diff --git a/lucene/test-framework/src/java/org/apache/lucene/codecs/mocksep/MockSingleIntIndexInput.java b/lucene/test-framework/src/java/org/apache/lucene/codecs/mocksep/MockSingleIntIndexInput.java
index 924ba176748..328b1144438 100644
--- a/lucene/test-framework/src/java/org/apache/lucene/codecs/mocksep/MockSingleIntIndexInput.java
+++ b/lucene/test-framework/src/java/org/apache/lucene/codecs/mocksep/MockSingleIntIndexInput.java
@@ -28,7 +28,7 @@ import org.apache.lucene.store.IndexInput;
/** Reads IndexInputs written with {@link
* MockSingleIntIndexOutput}. NOTE: this class is just for
- * demonstration puprposes (it is a very slow way to read a
+ * demonstration purposes (it is a very slow way to read a
* block of ints).
*
* @lucene.experimental
@@ -54,6 +54,9 @@ public class MockSingleIntIndexInput extends IntIndexInput {
in.close();
}
+ /**
+ * Just reads a vInt directly from the file.
+ */
public static class Reader extends IntIndexInput.Reader {
// clone:
private final IndexInput in;
diff --git a/lucene/test-framework/src/java/org/apache/lucene/index/AlcoholicMergePolicy.java b/lucene/test-framework/src/java/org/apache/lucene/index/AlcoholicMergePolicy.java
index 64d54ecb80a..1ea95e12cca 100644
--- a/lucene/test-framework/src/java/org/apache/lucene/index/AlcoholicMergePolicy.java
+++ b/lucene/test-framework/src/java/org/apache/lucene/index/AlcoholicMergePolicy.java
@@ -68,7 +68,7 @@ public class AlcoholicMergePolicy extends LogMergePolicy {
return info.sizeInBytes();
}
- public static enum Drink {
+ private static enum Drink {
Beer(15), Wine(17), Champagne(21), WhiteRussian(22), SingleMalt(30);
@@ -77,11 +77,6 @@ public class AlcoholicMergePolicy extends LogMergePolicy {
Drink(long drunkFactor) {
this.drunkFactor = drunkFactor;
}
-
- public long drunk() {
- return drunkFactor;
- }
-
}
}
diff --git a/lucene/test-framework/src/java/org/apache/lucene/index/FieldFilterAtomicReader.java b/lucene/test-framework/src/java/org/apache/lucene/index/FieldFilterAtomicReader.java
index 8e364b338d4..6744e2bda9a 100644
--- a/lucene/test-framework/src/java/org/apache/lucene/index/FieldFilterAtomicReader.java
+++ b/lucene/test-framework/src/java/org/apache/lucene/index/FieldFilterAtomicReader.java
@@ -20,9 +20,14 @@ package org.apache.lucene.index;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Iterator;
-import java.util.NoSuchElementException;
import java.util.Set;
+import org.apache.lucene.util.FilterIterator;
+
+/**
+ * A {@link FilterAtomicReader} that exposes only a subset
+ * of fields from the underlying wrapped reader.
+ */
public final class FieldFilterAtomicReader extends FilterAtomicReader {
private final Set fields;
@@ -58,8 +63,9 @@ public final class FieldFilterAtomicReader extends FilterAtomicReader {
return null;
}
f = new FieldFilterFields(f);
- // we need to check for emptyness, so we can return null:
- return (f.iterator().next() == null) ? null : f;
+ // we need to check for emptiness, so we can return
+ // null:
+ return f.iterator().hasNext() ? f : null;
}
@Override
@@ -134,55 +140,16 @@ public final class FieldFilterAtomicReader extends FilterAtomicReader {
@Override
public int size() {
- // TODO: add faster implementation!
- int c = 0;
- final Iterator it = iterator();
- while (it.next() != null) {
- c++;
- }
- return c;
+ // this information is not cheap, return -1 like MultiFields does:
+ return -1;
}
@Override
public Iterator iterator() {
- final Iterator in = super.iterator();
- return new Iterator() {
- String cached = null;
-
+ return new FilterIterator(super.iterator()) {
@Override
- public String next() {
- if (cached != null) {
- String next = cached;
- cached = null;
- return next;
- } else {
- String next = doNext();
- if (next == null) {
- throw new NoSuchElementException();
- } else {
- return next;
- }
- }
- }
-
- @Override
- public boolean hasNext() {
- return cached != null || (cached = doNext()) != null;
- }
-
- private String doNext() {
- while (in.hasNext()) {
- String field = in.next();
- if (hasField(field)) {
- return field;
- }
- }
- return null;
- }
-
- @Override
- public void remove() {
- throw new UnsupportedOperationException();
+ protected boolean predicateFunction(String field) {
+ return hasField(field);
}
};
}
diff --git a/lucene/test-framework/src/java/org/apache/lucene/search/ShardSearchingTestBase.java b/lucene/test-framework/src/java/org/apache/lucene/search/ShardSearchingTestBase.java
index fc98095728f..e99f02d9cca 100644
--- a/lucene/test-framework/src/java/org/apache/lucene/search/ShardSearchingTestBase.java
+++ b/lucene/test-framework/src/java/org/apache/lucene/search/ShardSearchingTestBase.java
@@ -42,9 +42,15 @@ import org.apache.lucene.util._TestUtil;
// - doc blocks? so we can test joins/grouping...
// - controlled consistency (NRTMgr)
+/**
+ * Base test class for simulating distributed search across multiple shards.
+ */
public abstract class ShardSearchingTestBase extends LuceneTestCase {
// TODO: maybe SLM should throw this instead of returning null...
+ /**
+ * Thrown when the lease for a searcher has expired.
+ */
public static class SearcherExpiredException extends RuntimeException {
public SearcherExpiredException(String message) {
super(message);
@@ -604,6 +610,9 @@ public abstract class ShardSearchingTestBase extends LuceneTestCase {
}
}
+ /**
+ * An IndexSearcher and associated version (lease)
+ */
protected static class SearcherAndVersion {
public final IndexSearcher searcher;
public final long version;
diff --git a/lucene/test-framework/src/java/org/apache/lucene/store/MockDirectoryWrapper.java b/lucene/test-framework/src/java/org/apache/lucene/store/MockDirectoryWrapper.java
index 5aff1dd43ce..c956f1f6e75 100644
--- a/lucene/test-framework/src/java/org/apache/lucene/store/MockDirectoryWrapper.java
+++ b/lucene/test-framework/src/java/org/apache/lucene/store/MockDirectoryWrapper.java
@@ -146,6 +146,12 @@ public class MockDirectoryWrapper extends BaseDirectoryWrapper {
preventDoubleWrite = value;
}
+ /**
+ * Enum for controlling hard disk throttling.
+ * Set via {@link MockDirectoryWrapper#setThrottling(Throttling)}
+ *
+ * WARNING: can make tests very slow.
+ */
public static enum Throttling {
/** always emulate a slow hard disk. could be very slow! */
ALWAYS,
diff --git a/lucene/test-framework/src/java/org/apache/lucene/util/CloseableFile.java b/lucene/test-framework/src/java/org/apache/lucene/util/CloseableFile.java
index aa44cfbdf41..7aedb3fec35 100644
--- a/lucene/test-framework/src/java/org/apache/lucene/util/CloseableFile.java
+++ b/lucene/test-framework/src/java/org/apache/lucene/util/CloseableFile.java
@@ -24,25 +24,30 @@ import java.io.*;
*/
final class CloseableFile implements Closeable {
private final File file;
+ private final TestRuleMarkFailure failureMarker;
- public CloseableFile(File file) {
+ public CloseableFile(File file, TestRuleMarkFailure failureMarker) {
this.file = file;
+ this.failureMarker = failureMarker;
}
@Override
public void close() throws IOException {
- if (file.exists()) {
- try {
- _TestUtil.rmDir(file);
- } catch (IOException e) {
- // Ignore the exception from rmDir.
- }
-
- // Re-check.
+ // only if there were no other test failures.
+ if (failureMarker.wasSuccessful()) {
if (file.exists()) {
- throw new IOException(
+ try {
+ _TestUtil.rmDir(file);
+ } catch (IOException e) {
+ // Ignore the exception from rmDir.
+ }
+
+ // Re-check.
+ if (file.exists()) {
+ throw new IOException(
"Could not remove: " + file.getAbsolutePath());
- }
+ }
+ }
}
}
}
\ No newline at end of file
diff --git a/lucene/test-framework/src/java/org/apache/lucene/util/English.java b/lucene/test-framework/src/java/org/apache/lucene/util/English.java
index 1f1766f738c..5ff95a4ae2f 100644
--- a/lucene/test-framework/src/java/org/apache/lucene/util/English.java
+++ b/lucene/test-framework/src/java/org/apache/lucene/util/English.java
@@ -18,6 +18,7 @@ package org.apache.lucene.util;
*/
/**
+ * Converts numbers to English strings for testing.
* @lucene.internal
*/
public final class English {
diff --git a/lucene/test-framework/src/java/org/apache/lucene/util/RollingBuffer.java b/lucene/test-framework/src/java/org/apache/lucene/util/RollingBuffer.java
index c1ded254eb1..4450d3673a8 100644
--- a/lucene/test-framework/src/java/org/apache/lucene/util/RollingBuffer.java
+++ b/lucene/test-framework/src/java/org/apache/lucene/util/RollingBuffer.java
@@ -26,6 +26,9 @@ package org.apache.lucene.util;
* @lucene.internal */
public abstract class RollingBuffer {
+ /**
+ * Implement to reset an instance
+ */
public static interface Resettable {
public void reset();
}
diff --git a/lucene/test-framework/src/java/org/apache/lucene/util/TestRuleFieldCacheSanity.java b/lucene/test-framework/src/java/org/apache/lucene/util/TestRuleFieldCacheSanity.java
index ea5d632c690..7ad81a581aa 100644
--- a/lucene/test-framework/src/java/org/apache/lucene/util/TestRuleFieldCacheSanity.java
+++ b/lucene/test-framework/src/java/org/apache/lucene/util/TestRuleFieldCacheSanity.java
@@ -1,10 +1,5 @@
package org.apache.lucene.util;
-import org.apache.lucene.search.FieldCache;
-import org.junit.rules.TestRule;
-import org.junit.runner.Description;
-import org.junit.runners.model.Statement;
-
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
@@ -22,6 +17,30 @@ import org.junit.runners.model.Statement;
* limitations under the License.
*/
+import org.apache.lucene.search.FieldCache;
+import org.apache.lucene.util.FieldCacheSanityChecker; // javadocs
+import org.junit.rules.TestRule;
+import org.junit.runner.Description;
+import org.junit.runners.model.Statement;
+
+/**
+ * This rule will fail the test if it has insane field caches.
+ *
+ * calling assertSaneFieldCaches here isn't as useful as having test
+ * classes call it directly from the scope where the index readers
+ * are used, because they could be gc'ed just before this tearDown
+ * method is called.
+ *
+ * But it's better than nothing.
+ *
+ * If you are testing functionality that you know for a fact
+ * "violates" FieldCache sanity, then you should either explicitly
+ * call purgeFieldCache at the end of your test method, or refactor
+ * your Test class so that the inconsistent FieldCache usages are
+ * isolated in distinct test methods
+ *
+ * @see FieldCacheSanityChecker
+ */
public class TestRuleFieldCacheSanity implements TestRule {
@Override
@@ -33,18 +52,6 @@ public class TestRuleFieldCacheSanity implements TestRule {
Throwable problem = null;
try {
- // calling assertSaneFieldCaches here isn't as useful as having test
- // classes call it directly from the scope where the index readers
- // are used, because they could be gc'ed just before this tearDown
- // method is called.
- //
- // But it's better then nothing.
- //
- // If you are testing functionality that you know for a fact
- // "violates" FieldCache sanity, then you should either explicitly
- // call purgeFieldCache at the end of your test method, or refactor
- // your Test class so that the inconsistent FieldCache usages are
- // isolated in distinct test methods
LuceneTestCase.assertSaneFieldCaches(d.getDisplayName());
} catch (Throwable t) {
problem = t;
diff --git a/lucene/test-framework/src/java/org/apache/lucene/util/TestRuleStoreClassName.java b/lucene/test-framework/src/java/org/apache/lucene/util/TestRuleStoreClassName.java
index 093ed1dec88..1f17fbb5fff 100644
--- a/lucene/test-framework/src/java/org/apache/lucene/util/TestRuleStoreClassName.java
+++ b/lucene/test-framework/src/java/org/apache/lucene/util/TestRuleStoreClassName.java
@@ -21,6 +21,10 @@ import org.junit.rules.TestRule;
import org.junit.runner.Description;
import org.junit.runners.model.Statement;
+/**
+ * Stores the suite name so you can retrieve it
+ * from {@link #getTestClass()}
+ */
public class TestRuleStoreClassName implements TestRule {
private volatile Description description;
diff --git a/lucene/test-framework/src/java/org/apache/lucene/util/_TestUtil.java b/lucene/test-framework/src/java/org/apache/lucene/util/_TestUtil.java
index 0d584e5d874..cefdb0f1112 100644
--- a/lucene/test-framework/src/java/org/apache/lucene/util/_TestUtil.java
+++ b/lucene/test-framework/src/java/org/apache/lucene/util/_TestUtil.java
@@ -94,7 +94,7 @@ public class _TestUtil {
try {
File f = createTempFile(desc, "tmp", LuceneTestCase.TEMP_DIR);
f.delete();
- LuceneTestCase.closeAfterSuite(new CloseableFile(f));
+ LuceneTestCase.closeAfterSuite(new CloseableFile(f, LuceneTestCase.suiteFailureMarker));
return f;
} catch (IOException e) {
throw new RuntimeException(e);
@@ -136,7 +136,7 @@ public class _TestUtil {
rmDir(destDir);
destDir.mkdir();
- LuceneTestCase.closeAfterSuite(new CloseableFile(destDir));
+ LuceneTestCase.closeAfterSuite(new CloseableFile(destDir, LuceneTestCase.suiteFailureMarker));
while (entries.hasMoreElements()) {
ZipEntry entry = entries.nextElement();
diff --git a/lucene/test-framework/src/java/org/apache/lucene/util/automaton/AutomatonTestUtil.java b/lucene/test-framework/src/java/org/apache/lucene/util/automaton/AutomatonTestUtil.java
index c0cf4ca963a..e770a69ee80 100644
--- a/lucene/test-framework/src/java/org/apache/lucene/util/automaton/AutomatonTestUtil.java
+++ b/lucene/test-framework/src/java/org/apache/lucene/util/automaton/AutomatonTestUtil.java
@@ -127,6 +127,13 @@ public class AutomatonTestUtil {
return code;
}
+ /**
+ * Lets you retrieve random strings accepted
+ * by an Automaton.
+ *
+ * Once created, call {@link #getRandomAcceptedString(Random)}
+ * to get a new string (in UTF-32 codepoints).
+ */
public static class RandomAcceptedStrings {
private final Map leadsToAccept;
diff --git a/solr/CHANGES.txt b/solr/CHANGES.txt
index 71a59093e23..3046fc4a642 100644
--- a/solr/CHANGES.txt
+++ b/solr/CHANGES.txt
@@ -26,6 +26,14 @@ $Id$
================== 4.0.0 ==================
+Versions of Major Components
+---------------------
+Apache Tika 1.2
+Carrot2 3.5.0
+Velocity 1.6.4 and Velocity Tools 2.0
+Apache UIMA 2.3.1
+Apache ZooKeeper 3.3.6
+
Upgrading from Solr 4.0.0-BETA
----------------------
@@ -63,6 +71,27 @@ Bug Fixes
* SOLR-3649: Fixed bug in JavabinLoader that caused deleteById(List ids)
to not work in SolrJ (siren)
+* SOLR-3730: Rollback is not implemented quite right and can cause corner case fails in
+ SolrCloud tests. (rmuir, Mark Miller)
+
+* SOLR-2981: Fixed StatsComponent to no longer return duplicated information
+ when requesting multiple stats.facet fields.
+ (Roman Kliewer via hossman)
+
+Other Changes
+----------------------
+
+* SOLR-3690: Fixed binary release packages to include dependencies needed for
+ the solr-test-framework (hossman)
+
+* SOLR-2857: The /update/json and /update/csv URLs were restored to aid
+ in the migration of existing clients. (yonik)
+
+* SOLR-3691: SimplePostTool: Mode for crawling/posting web pages
+ See http://wiki.apache.org/solr/ExtractingRequestHandler for examples (janhoy)
+
+* SOLR-3707: Upgrade Solr to Tika 1.2 (janhoy)
+
================== 4.0.0-BETA ===================
@@ -271,7 +300,6 @@ Other Changes
Also, the configuration itself can be passed using the "dataConfig" parameter rather than
using a file (this previously worked in debug mode only). When configuration errors are
encountered, the error message is returned in XML format. (James Dyer)
-
* SOLR-3439: Make SolrCell easier to use out of the box. Also improves "/browse" to display
rich-text documents correctly, along with facets for author and content_type.
With the new "content" field, highlighting of body is supported. See also SOLR-3672 for
diff --git a/solr/NOTICE.txt b/solr/NOTICE.txt
index 4b237d98ebe..b22247b12b0 100644
--- a/solr/NOTICE.txt
+++ b/solr/NOTICE.txt
@@ -310,12 +310,11 @@ Copyright 2004 Sun Microsystems, Inc. (Rome JAR)
Copyright 2002-2008 by John Cowan (TagSoup -- http://ccil.org/~cowan/XML/tagsoup/)
-Copyright (C) 1999-2007 Shigeru Chiba. All Rights Reserved.
-(Javassist, MPL licensed: http://www.csg.ci.i.u-tokyo.ac.jp/~chiba/javassist/)
-
Copyright (C) 1994-2007 by the Xiph.org Foundation, http://www.xiph.org/ (OggVorbis)
-Scannotation (C) Bill Burke
+Copyright 2012 Kohei Taketa juniversalchardet (http://code.google.com/p/juniversalchardet/)
+
+Lasse Collin and others, XZ for Java (http://tukaani.org/xz/java.html)
=========================================================================
== Language Detection Notices ==
diff --git a/solr/build.xml b/solr/build.xml
index 1bef7607cbf..5764f7b90eb 100644
--- a/solr/build.xml
+++ b/solr/build.xml
@@ -386,8 +386,9 @@
diff --git a/solr/common-build.xml b/solr/common-build.xml
index 4bda26913f5..3493cdd5626 100644
--- a/solr/common-build.xml
+++ b/solr/common-build.xml
@@ -193,7 +193,7 @@
-
+
diff --git a/solr/contrib/extraction/ivy.xml b/solr/contrib/extraction/ivy.xml
index 71d0d860f4f..335f7f7be34 100644
--- a/solr/contrib/extraction/ivy.xml
+++ b/solr/contrib/extraction/ivy.xml
@@ -20,36 +20,36 @@
-
-
-
+
+
+
-
-
-
-
-
-
+
+
+
+
+
+
-
-
-
-
+
+
+
+
-
-
-
+
+
+
diff --git a/solr/contrib/extraction/src/test/org/apache/solr/handler/extraction/ExtractingRequestHandlerTest.java b/solr/contrib/extraction/src/test/org/apache/solr/handler/extraction/ExtractingRequestHandlerTest.java
index b1995a718fc..cde7793315c 100644
--- a/solr/contrib/extraction/src/test/org/apache/solr/handler/extraction/ExtractingRequestHandlerTest.java
+++ b/solr/contrib/extraction/src/test/org/apache/solr/handler/extraction/ExtractingRequestHandlerTest.java
@@ -64,8 +64,7 @@ public class ExtractingRequestHandlerTest extends SolrTestCaseJ4 {
"fmap.producer", "extractedProducer",
"fmap.creator", "extractedCreator", "fmap.Keywords", "extractedKeywords",
"fmap.Creation-Date", "extractedDate",
- "fmap.AAPL:Keywords", "ignored_a",
- "fmap.xmpTPg:NPages", "ignored_a",
+ "uprefix", "ignored_",
"fmap.Author", "extractedAuthor",
"fmap.content", "extractedContent",
"literal.id", "one",
@@ -81,6 +80,7 @@ public class ExtractingRequestHandlerTest extends SolrTestCaseJ4 {
"fmap.Author", "extractedAuthor",
"fmap.language", "extractedLanguage",
"literal.id", "two",
+ "uprefix", "ignored_",
"fmap.content", "extractedContent",
"fmap.Last-Modified", "extractedDate"
);
@@ -136,6 +136,7 @@ public class ExtractingRequestHandlerTest extends SolrTestCaseJ4 {
"fmap.creator", "extractedCreator", "fmap.Keywords", "extractedKeywords",
"fmap.Author", "extractedAuthor",
"literal.id", "three",
+ "uprefix", "ignored_",
"fmap.content", "extractedContent",
"fmap.language", "extractedLanguage",
"fmap.Last-Modified", "extractedDate"
@@ -206,6 +207,7 @@ public class ExtractingRequestHandlerTest extends SolrTestCaseJ4 {
"fmap.Author", "extractedAuthor",
"fmap.content", "extractedContent",
"literal.id", "one",
+ "uprefix", "ignored_",
"fmap.language", "extractedLanguage",
"literal.extractionLiteralMV", "one",
"literal.extractionLiteralMV", "two",
@@ -374,9 +376,8 @@ public class ExtractingRequestHandlerTest extends SolrTestCaseJ4 {
loadLocal("extraction/arabic.pdf", "fmap.created", "extractedDate", "fmap.producer", "extractedProducer",
"fmap.creator", "extractedCreator", "fmap.Keywords", "extractedKeywords",
"fmap.Creation-Date", "extractedDate",
- "fmap.AAPL:Keywords", "ignored_a",
- "fmap.xmpTPg:NPages", "ignored_a",
"fmap.Author", "extractedAuthor",
+ "uprefix", "ignored_",
"fmap.content", "wdf_nocase",
"literal.id", "one",
"fmap.Last-Modified", "extractedDate");
@@ -404,8 +405,7 @@ public class ExtractingRequestHandlerTest extends SolrTestCaseJ4 {
loadLocal("extraction/password-is-solrcell.docx", "fmap.created", "extractedDate", "fmap.producer", "extractedProducer",
"fmap.creator", "extractedCreator", "fmap.Keywords", "extractedKeywords",
"fmap.Creation-Date", "extractedDate",
- "fmap.AAPL:Keywords", "ignored_a",
- "fmap.xmpTPg:NPages", "ignored_a",
+ "uprefix", "ignored_",
"fmap.Author", "extractedAuthor",
"fmap.content", "wdf_nocase",
"literal.id", "one",
@@ -462,8 +462,7 @@ public class ExtractingRequestHandlerTest extends SolrTestCaseJ4 {
"fmap.content", "extractedContent",
"fmap.language", "extractedLanguage",
"fmap.Creation-Date", "extractedDate",
- "fmap.AAPL:Keywords", "ignored_a",
- "fmap.xmpTPg:NPages", "ignored_a",
+ "uprefix", "ignored_",
"fmap.Last-Modified", "extractedDate");
// Here the literal value should override the Tika-parsed title:
@@ -478,8 +477,7 @@ public class ExtractingRequestHandlerTest extends SolrTestCaseJ4 {
"fmap.content", "extractedContent",
"fmap.language", "extractedLanguage",
"fmap.Creation-Date", "extractedDate",
- "fmap.AAPL:Keywords", "ignored_a",
- "fmap.xmpTPg:NPages", "ignored_a",
+ "uprefix", "ignored_",
"fmap.Last-Modified", "extractedDate");
// Here we mimic the old behaviour where literals are added, not overridden
@@ -498,8 +496,7 @@ public class ExtractingRequestHandlerTest extends SolrTestCaseJ4 {
"fmap.content", "extractedContent",
"fmap.language", "extractedLanguage",
"fmap.Creation-Date", "extractedDate",
- "fmap.AAPL:Keywords", "ignored_a",
- "fmap.xmpTPg:NPages", "ignored_a",
+ "uprefix", "ignored_",
"fmap.Last-Modified", "extractedDate");
assertU(commit());
diff --git a/solr/core/src/java/org/apache/solr/core/CachingDirectoryFactory.java b/solr/core/src/java/org/apache/solr/core/CachingDirectoryFactory.java
index e98fd5fcfb2..c5c7c4e1b36 100644
--- a/solr/core/src/java/org/apache/solr/core/CachingDirectoryFactory.java
+++ b/solr/core/src/java/org/apache/solr/core/CachingDirectoryFactory.java
@@ -125,6 +125,7 @@ public abstract class CachingDirectoryFactory extends DirectoryFactory {
}
cacheValue.refCnt--;
if (cacheValue.refCnt == 0 && cacheValue.doneWithDir) {
+ log.info("Closing directory:" + cacheValue.path);
directory.close();
byDirectoryCache.remove(directory);
byPathCache.remove(cacheValue.path);
@@ -194,6 +195,7 @@ public abstract class CachingDirectoryFactory extends DirectoryFactory {
byDirectoryCache.put(directory, newCacheValue);
byPathCache.put(fullPath, newCacheValue);
+ log.info("return new directory for " + fullPath + " forceNew:" + forceNew);
} else {
cacheValue.refCnt++;
}
diff --git a/solr/core/src/java/org/apache/solr/core/SolrCore.java b/solr/core/src/java/org/apache/solr/core/SolrCore.java
index b58546fd6c3..670972f8879 100644
--- a/solr/core/src/java/org/apache/solr/core/SolrCore.java
+++ b/solr/core/src/java/org/apache/solr/core/SolrCore.java
@@ -1554,7 +1554,7 @@ public final class SolrCore implements SolrInfoMBean {
} catch (Throwable e) {
// do not allow decref() operations to fail since they are typically called in finally blocks
// and throwing another exception would be very unexpected.
- SolrException.log(log, "Error closing searcher:", e);
+ SolrException.log(log, "Error closing searcher:" + this, e);
}
}
};
diff --git a/solr/core/src/java/org/apache/solr/handler/CSVRequestHandler.java b/solr/core/src/java/org/apache/solr/handler/CSVRequestHandler.java
index 5adce486c30..335edf9781d 100755
--- a/solr/core/src/java/org/apache/solr/handler/CSVRequestHandler.java
+++ b/solr/core/src/java/org/apache/solr/handler/CSVRequestHandler.java
@@ -29,7 +29,7 @@ public class CSVRequestHandler extends UpdateRequestHandler {
public void init(NamedList args) {
super.init(args);
setAssumeContentType("application/csv");
- log.warn("Using deprecated class: "+this.getClass().getSimpleName()+" -- replace with UpdateRequestHandler");
+ // log.warn("Using deprecated class: "+this.getClass().getSimpleName()+" -- replace with UpdateRequestHandler");
}
//////////////////////// SolrInfoMBeans methods //////////////////////
diff --git a/solr/core/src/java/org/apache/solr/handler/JsonUpdateRequestHandler.java b/solr/core/src/java/org/apache/solr/handler/JsonUpdateRequestHandler.java
index f4254a20e31..f8869f0276b 100644
--- a/solr/core/src/java/org/apache/solr/handler/JsonUpdateRequestHandler.java
+++ b/solr/core/src/java/org/apache/solr/handler/JsonUpdateRequestHandler.java
@@ -29,7 +29,7 @@ public class JsonUpdateRequestHandler extends UpdateRequestHandler {
public void init(NamedList args) {
super.init(args);
setAssumeContentType("application/json");
- log.warn("Using deprecated class: "+this.getClass().getSimpleName()+" -- replace with UpdateRequestHandler");
+ // log.warn("Using deprecated class: "+this.getClass().getSimpleName()+" -- replace with UpdateRequestHandler");
}
//////////////////////// SolrInfoMBeans methods //////////////////////
diff --git a/solr/core/src/java/org/apache/solr/handler/SnapPuller.java b/solr/core/src/java/org/apache/solr/handler/SnapPuller.java
index 72d52263f0a..6b9291fcecd 100644
--- a/solr/core/src/java/org/apache/solr/handler/SnapPuller.java
+++ b/solr/core/src/java/org/apache/solr/handler/SnapPuller.java
@@ -384,7 +384,7 @@ public class SnapPuller {
// may be closed
core.getDirectoryFactory().doneWithDirectory(oldDirectory);
}
- doCommit();
+ doCommit(isFullCopyNeeded);
}
replicationStartTime = 0;
@@ -533,11 +533,11 @@ public class SnapPuller {
return sb;
}
- private void doCommit() throws IOException {
+ private void doCommit(boolean isFullCopyNeeded) throws IOException {
SolrQueryRequest req = new LocalSolrQueryRequest(solrCore,
new ModifiableSolrParams());
// reboot the writer on the new index and get a new searcher
- solrCore.getUpdateHandler().newIndexWriter(true);
+ solrCore.getUpdateHandler().newIndexWriter(isFullCopyNeeded);
try {
// first try to open an NRT searcher so that the new
diff --git a/solr/core/src/java/org/apache/solr/handler/component/StatsValuesFactory.java b/solr/core/src/java/org/apache/solr/handler/component/StatsValuesFactory.java
index 1796da7cb1a..1505ce4680f 100644
--- a/solr/core/src/java/org/apache/solr/handler/component/StatsValuesFactory.java
+++ b/solr/core/src/java/org/apache/solr/handler/component/StatsValuesFactory.java
@@ -182,8 +182,8 @@ abstract class AbstractStatsValues implements StatsValues {
for (Map.Entry e2 : entry.getValue().entrySet()) {
nl2.add(e2.getKey(), e2.getValue().getStatsValues());
}
- res.add(FACETS, nl);
}
+ res.add(FACETS, nl);
return res;
}
diff --git a/solr/core/src/java/org/apache/solr/update/DefaultSolrCoreState.java b/solr/core/src/java/org/apache/solr/update/DefaultSolrCoreState.java
index 67b43448ed3..6a53ed5e763 100644
--- a/solr/core/src/java/org/apache/solr/update/DefaultSolrCoreState.java
+++ b/solr/core/src/java/org/apache/solr/update/DefaultSolrCoreState.java
@@ -74,8 +74,7 @@ public final class DefaultSolrCoreState extends SolrCoreState {
}
if (indexWriter == null) {
- indexWriter = createMainIndexWriter(core, "DirectUpdateHandler2",
- false, false);
+ indexWriter = createMainIndexWriter(core, "DirectUpdateHandler2", false);
}
if (refCntWriter == null) {
refCntWriter = new RefCounted(indexWriter) {
@@ -110,18 +109,28 @@ public final class DefaultSolrCoreState extends SolrCoreState {
writerPauseLock.wait();
} catch (InterruptedException e) {}
}
-
+
try {
if (indexWriter != null) {
- try {
- log.info("Closing old IndexWriter... core=" + coreName);
- indexWriter.close();
- } catch (Throwable t) {
- SolrException.log(log, "Error closing old IndexWriter. core=" + coreName, t);
+ if (!rollback) {
+ try {
+ log.info("Closing old IndexWriter... core=" + coreName);
+ indexWriter.close();
+ } catch (Throwable t) {
+ SolrException.log(log, "Error closing old IndexWriter. core="
+ + coreName, t);
+ }
+ } else {
+ try {
+ log.info("Rollback old IndexWriter... core=" + coreName);
+ indexWriter.rollback();
+ } catch (Throwable t) {
+ SolrException.log(log, "Error rolling back old IndexWriter. core="
+ + coreName, t);
+ }
}
}
- indexWriter = createMainIndexWriter(core, "DirectUpdateHandler2",
- false, true);
+ indexWriter = createMainIndexWriter(core, "DirectUpdateHandler2", true);
log.info("New IndexWriter is ready to be used.");
// we need to null this so it picks up the new writer next get call
refCntWriter = null;
@@ -174,14 +183,12 @@ public final class DefaultSolrCoreState extends SolrCoreState {
@Override
public synchronized void rollbackIndexWriter(SolrCore core) throws IOException {
- indexWriter.rollback();
newIndexWriter(core, true);
}
- protected SolrIndexWriter createMainIndexWriter(SolrCore core, String name,
- boolean removeAllExisting, boolean forceNewDirectory) throws IOException {
+ protected SolrIndexWriter createMainIndexWriter(SolrCore core, String name, boolean forceNewDirectory) throws IOException {
return new SolrIndexWriter(name, core.getNewIndexDir(),
- core.getDirectoryFactory(), removeAllExisting, core.getSchema(),
+ core.getDirectoryFactory(), false, core.getSchema(),
core.getSolrConfig().indexConfig, core.getDeletionPolicy(), core.getCodec(), forceNewDirectory);
}
diff --git a/solr/core/src/java/org/apache/solr/update/MemOutputStream.java b/solr/core/src/java/org/apache/solr/update/MemOutputStream.java
new file mode 100644
index 00000000000..32b459e3cfe
--- /dev/null
+++ b/solr/core/src/java/org/apache/solr/update/MemOutputStream.java
@@ -0,0 +1,53 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.solr.update;
+
+import org.apache.solr.common.util.FastOutputStream;
+
+import java.io.IOException;
+import java.util.LinkedList;
+import java.util.List;
+
+/** @lucene.internal */
+public class MemOutputStream extends FastOutputStream {
+ public List<byte[]> buffers = new LinkedList<byte[]>();
+ public MemOutputStream(byte[] tempBuffer) {
+ super(null, tempBuffer, 0);
+ }
+
+ @Override
+ public void flush(byte[] arr, int offset, int len) throws IOException {
+ if (arr == buf && offset==0 && len==buf.length) {
+ buffers.add(buf); // steal the buffer
+ buf = new byte[8192];
+ } else if (len > 0) {
+ byte[] newBuf = new byte[len];
+ System.arraycopy(arr, offset, newBuf, 0, len);
+ buffers.add(newBuf);
+ }
+ }
+
+ public void writeAll(FastOutputStream fos) throws IOException {
+ for (byte[] buffer : buffers) {
+ fos.write(buffer);
+ }
+ if (pos > 0) {
+ fos.write(buf, 0, pos);
+ }
+ }
+}
diff --git a/solr/core/src/java/org/apache/solr/update/SolrIndexWriter.java b/solr/core/src/java/org/apache/solr/update/SolrIndexWriter.java
index d59164a9344..56dbca1992a 100644
--- a/solr/core/src/java/org/apache/solr/update/SolrIndexWriter.java
+++ b/solr/core/src/java/org/apache/solr/update/SolrIndexWriter.java
@@ -141,6 +141,8 @@ public class SolrIndexWriter extends IndexWriter {
super.rollback();
} finally {
isClosed = true;
+ directoryFactory.release(getDirectory());
+ numCloses.incrementAndGet();
}
}
diff --git a/solr/core/src/java/org/apache/solr/update/TransactionLog.java b/solr/core/src/java/org/apache/solr/update/TransactionLog.java
index e2f4a5882f7..c609a59861f 100644
--- a/solr/core/src/java/org/apache/solr/update/TransactionLog.java
+++ b/solr/core/src/java/org/apache/solr/update/TransactionLog.java
@@ -775,31 +775,3 @@ class ChannelFastInputStream extends FastInputStream {
}
-class MemOutputStream extends FastOutputStream {
- public List<byte[]> buffers = new LinkedList<byte[]>();
- public MemOutputStream(byte[] tempBuffer) {
- super(null, tempBuffer, 0);
- }
-
- @Override
- public void flush(byte[] arr, int offset, int len) throws IOException {
- if (arr == buf && offset==0 && len==buf.length) {
- buffers.add(buf); // steal the buffer
- buf = new byte[8192];
- } else if (len > 0) {
- byte[] newBuf = new byte[len];
- System.arraycopy(arr, offset, newBuf, 0, len);
- buffers.add(newBuf);
- }
- }
-
- public void writeAll(FastOutputStream fos) throws IOException {
- for (byte[] buffer : buffers) {
- fos.write(buffer);
- }
- if (pos > 0) {
- fos.write(buf, 0, pos);
- }
- }
-}
-
diff --git a/solr/core/src/java/org/apache/solr/util/FastWriter.java b/solr/core/src/java/org/apache/solr/util/FastWriter.java
index 672eb06a4af..363cf223221 100755
--- a/solr/core/src/java/org/apache/solr/util/FastWriter.java
+++ b/solr/core/src/java/org/apache/solr/util/FastWriter.java
@@ -28,7 +28,7 @@ public class FastWriter extends Writer {
// it won't cause double buffering.
private static final int BUFSIZE = 8192;
protected final Writer sink;
- protected final char[] buf;
+ protected char[] buf;
protected int pos;
public FastWriter(Writer w) {
@@ -69,42 +69,64 @@ public class FastWriter extends Writer {
}
@Override
- public void write(char cbuf[], int off, int len) throws IOException {
- int space = buf.length - pos;
- if (len < space) {
- System.arraycopy(cbuf, off, buf, pos, len);
- pos += len;
- } else if (len buf.length) {
+ if (pos>0) {
+ flush(buf,0,pos); // flush
+ pos=0;
+ }
+ // don't buffer, just write to sink
+ flush(arr, off, len);
+ return;
+ }
+
+ // buffer is too big to fit in the free space, but
+ // not big enough to warrant writing on its own.
+ // write whatever we can fit, then flush and iterate.
+
+ System.arraycopy(arr, off, buf, pos, space);
flush(buf, 0, buf.length);
- pos = len-space;
- System.arraycopy(cbuf, off+space, buf, 0, pos);
- } else {
- flush(buf,0,pos); // flush
- pos=0;
- // don't buffer, just write to sink
- flush(cbuf, off, len);
+ pos = 0;
+ off += space;
+ len -= space;
}
}
@Override
public void write(String str, int off, int len) throws IOException {
- int space = buf.length - pos;
- if (len < space) {
- str.getChars(off, off+len, buf, pos);
- pos += len;
- } else if (len buf.length) {
+ if (pos>0) {
+ flush(buf,0,pos); // flush
+ pos=0;
+ }
+ // don't buffer, just write to sink
+ flush(str, off, len);
+ return;
+ }
+
+ // buffer is too big to fit in the free space, but
+ // not big enough to warrant writing on its own.
+ // write whatever we can fit, then flush and iterate.
+
str.getChars(off, off+space, buf, pos);
flush(buf, 0, buf.length);
- str.getChars(off+space, off+len, buf, 0);
- pos = len-space;
- } else {
- flush(buf,0,pos); // flush
- pos=0;
- // don't buffer, just write to sink
- flush(str, off, len);
+ pos = 0;
+ off += space;
+ len -= space;
}
}
diff --git a/solr/core/src/java/org/apache/solr/util/SimplePostTool.java b/solr/core/src/java/org/apache/solr/util/SimplePostTool.java
index efbd2fb85fa..edf8f67e904 100644
--- a/solr/core/src/java/org/apache/solr/util/SimplePostTool.java
+++ b/solr/core/src/java/org/apache/solr/util/SimplePostTool.java
@@ -17,65 +17,110 @@ package org.apache.solr.util;
* limitations under the License.
*/
+import java.io.BufferedReader;
+import java.io.ByteArrayOutputStream;
import java.io.File;
import java.io.FileFilter;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.ByteArrayInputStream;
+import java.io.InputStreamReader;
import java.io.OutputStream;
import java.io.UnsupportedEncodingException;
-import java.util.Locale;
+import java.util.ArrayList;
import java.util.HashMap;
+import java.util.LinkedHashSet;
+import java.util.List;
+import java.util.Locale;
+import java.util.Map;
import java.util.Set;
import java.util.HashSet;
import java.util.regex.Pattern;
import java.util.regex.PatternSyntaxException;
+import java.util.zip.GZIPInputStream;
+import java.util.zip.Inflater;
+import java.util.zip.InflaterInputStream;
import java.net.HttpURLConnection;
import java.net.MalformedURLException;
import java.net.ProtocolException;
import java.net.URL;
import java.net.URLEncoder;
+import javax.xml.parsers.DocumentBuilderFactory;
+import javax.xml.parsers.ParserConfigurationException;
+import javax.xml.xpath.XPath;
+import javax.xml.xpath.XPathConstants;
+import javax.xml.xpath.XPathExpression;
+import javax.xml.xpath.XPathExpressionException;
+import javax.xml.xpath.XPathFactory;
+
+import org.w3c.dom.Document;
+import org.w3c.dom.Node;
+import org.w3c.dom.NodeList;
+import org.xml.sax.SAXException;
+
/**
* A simple utility class for posting raw updates to a Solr server,
* has a main method so it can be run on the command line.
+ * View this not as a best-practice code example, but as a standalone
+ * example built with an explicit purpose of not having external
+ * jar dependencies.
*/
public class SimplePostTool {
- public static final String DEFAULT_POST_URL = "http://localhost:8983/solr/update";
- public static final String VERSION_OF_THIS_TOOL = "1.5";
+ private static final String DEFAULT_POST_URL = "http://localhost:8983/solr/update";
+ private static final String VERSION_OF_THIS_TOOL = "1.5";
private static final String DEFAULT_COMMIT = "yes";
private static final String DEFAULT_OPTIMIZE = "no";
private static final String DEFAULT_OUT = "no";
private static final String DEFAULT_AUTO = "no";
- private static final String DEFAULT_RECURSIVE = "no";
-
+ private static final String DEFAULT_RECURSIVE = "0";
+ private static final int DEFAULT_WEB_DELAY = 10;
+ private static final int MAX_WEB_DEPTH = 10;
private static final String DEFAULT_CONTENT_TYPE = "application/xml";
private static final String DEFAULT_FILE_TYPES = "xml,json,csv,pdf,doc,docx,ppt,pptx,xls,xlsx,odt,odp,ods,ott,otp,ots,rtf,htm,html,txt,log";
- private static final String DATA_MODE_FILES = "files";
- private static final String DATA_MODE_ARGS = "args";
- private static final String DATA_MODE_STDIN = "stdin";
- private static final String DEFAULT_DATA_MODE = DATA_MODE_FILES;
+ static final String DATA_MODE_FILES = "files";
+ static final String DATA_MODE_ARGS = "args";
+ static final String DATA_MODE_STDIN = "stdin";
+ static final String DATA_MODE_WEB = "web";
+ static final String DEFAULT_DATA_MODE = DATA_MODE_FILES;
- private static final String TRUE_STRINGS = "true,on,yes,1";
+ // Input args
+ boolean auto = false;
+ int recursive = 0;
+ int delay = 0;
+ String fileTypes;
+ URL solrUrl;
+ OutputStream out = null;
+ String type;
+ String mode;
+ boolean commit;
+ boolean optimize;
+ String[] args;
- private boolean auto = false;
- private boolean recursive = false;
- private String fileTypes;
+ private int currentDepth;
+
+ static HashMap<String,String> mimeMap;
+ GlobFileFilter globFileFilter;
+ // Backlog for crawling
+ List<LinkedHashSet<URL>> backlog = new ArrayList<LinkedHashSet<URL>>();
+ Set<URL> visited = new HashSet<URL>();
- private static HashMap<String,String> mimeMap;
- private GlobFileFilter globFileFilter;
-
- private static final Set<String> DATA_MODES = new HashSet<String>();
- private static final String USAGE_STRING_SHORT =
- "Usage: java [SystemProperties] -jar post.jar [-h|-] [ [...]]";
+ static final Set<String> DATA_MODES = new HashSet<String>();
+ static final String USAGE_STRING_SHORT =
+ "Usage: java [SystemProperties] -jar post.jar [-h|-] [ [...]]";
+
+ // Used in tests to avoid doing actual network traffic
+ static boolean mockMode = false;
+ static PageFetcher pageFetcher;
static {
DATA_MODES.add(DATA_MODE_FILES);
DATA_MODES.add(DATA_MODE_ARGS);
DATA_MODES.add(DATA_MODE_STDIN);
+ DATA_MODES.add(DATA_MODE_WEB);
mimeMap = new HashMap<String,String>();
mimeMap.put("xml", "text/xml");
@@ -100,97 +145,196 @@ public class SimplePostTool {
mimeMap.put("txt", "text/plain");
mimeMap.put("log", "text/plain");
}
-
- protected URL solrUrl;
+ /**
+ * See usage() for valid command line usage
+ * @param args the params on the command line
+ */
public static void main(String[] args) {
info("SimplePostTool version " + VERSION_OF_THIS_TOOL);
-
if (0 < args.length && ("-help".equals(args[0]) || "--help".equals(args[0]) || "-h".equals(args[0]))) {
usage();
- return;
- }
-
- OutputStream out = null;
- final String type = System.getProperty("type");
-
- final String params = System.getProperty("params", "");
-
- URL u = null;
- try {
- u = new URL(System.getProperty("url", SimplePostTool.appendParam(DEFAULT_POST_URL, params)));
- } catch (MalformedURLException e) {
- fatal("System Property 'url' is not a valid URL: " + u);
- }
- final SimplePostTool t = new SimplePostTool(u);
-
- if (isOn(System.getProperty("auto", DEFAULT_AUTO))) {
- t.setAuto(true);
- }
-
- if (isOn(System.getProperty("recursive", DEFAULT_RECURSIVE))) {
- t.setRecursive(true);
- }
-
- final String mode = System.getProperty("data", DEFAULT_DATA_MODE);
- if (! DATA_MODES.contains(mode)) {
- fatal("System Property 'data' is not valid for this tool: " + mode);
- }
-
- if (isOn(System.getProperty("out", DEFAULT_OUT))) {
- out = System.out;
- }
-
- t.setFileTypes(System.getProperty("filetypes", DEFAULT_FILE_TYPES));
-
- int numFilesPosted = 0;
-
- try {
- if (DATA_MODE_FILES.equals(mode)) {
- if (0 < args.length) {
- // Skip posting files if special param "-" given
- if (!args[0].equals("-")) {
- info("Posting files to base url " + u + (!t.auto?" using content-type "+(type==null?DEFAULT_CONTENT_TYPE:type):"")+"..");
- if(t.auto)
- info("Entering auto mode. File endings considered are "+t.getFileTypes());
- if(t.recursive)
- info("Entering recursive mode");
- numFilesPosted = t.postFiles(args, 0, out, type);
- info(numFilesPosted + " files indexed.");
- }
- } else {
- usageShort();
- return;
- }
- } else if (DATA_MODE_ARGS.equals(mode)) {
- if (0 < args.length) {
- info("POSTing args to " + u + "..");
- for (String a : args) {
- t.postData(SimplePostTool.stringToStream(a), null, out, type);
- }
- } else {
- usageShort();
- return;
- }
- } else if (DATA_MODE_STDIN.equals(mode)) {
- info("POSTing stdin to " + u + "..");
- t.postData(System.in, null, out, type);
- }
- if (isOn(System.getProperty("commit",DEFAULT_COMMIT))) {
- info("COMMITting Solr index changes to " + u + "..");
- t.commit();
- }
- if (isOn(System.getProperty("optimize",DEFAULT_OPTIMIZE))) {
- info("Performing an OPTIMIZE to " + u + "..");
- t.optimize();
- }
-
- } catch(RuntimeException e) {
- e.printStackTrace();
- fatal("RuntimeException " + e);
+ } else {
+ final SimplePostTool t = parseArgsAndInit(args);
+ t.execute();
}
}
+ /**
+ * After initialization, call execute to start the post job.
+ * This method delegates to the correct mode method.
+ */
+ public void execute() {
+ if (DATA_MODE_FILES.equals(mode) && args.length > 0) {
+ doFilesMode();
+ } else if(DATA_MODE_ARGS.equals(mode) && args.length > 0) {
+ doArgsMode();
+ } else if(DATA_MODE_WEB.equals(mode) && args.length > 0) {
+ doWebMode();
+ } else if(DATA_MODE_STDIN.equals(mode)) {
+ doStdinMode();
+ } else {
+ usageShort();
+ return;
+ }
+
+ if (commit) commit();
+ if (optimize) optimize();
+ }
+
+ /**
+ * Parses incoming arguments and system params and initializes the tool
+ * @param args the incoming cmd line args
+ * @return an instance of SimplePostTool
+ */
+ protected static SimplePostTool parseArgsAndInit(String[] args) {
+ String urlStr = null;
+ try {
+ // Parse args
+ final String mode = System.getProperty("data", DEFAULT_DATA_MODE);
+ if (! DATA_MODES.contains(mode)) {
+ fatal("System Property 'data' is not valid for this tool: " + mode);
+ }
+ String params = System.getProperty("params", "");
+ urlStr = System.getProperty("url", SimplePostTool.appendParam(DEFAULT_POST_URL, params));
+ URL url = new URL(urlStr);
+ boolean auto = isOn(System.getProperty("auto", DEFAULT_AUTO));
+ String type = System.getProperty("type");
+ // Recursive
+ int recursive = 0;
+ String r = System.getProperty("recursive", DEFAULT_RECURSIVE);
+ try {
+ recursive = Integer.parseInt(r);
+ } catch(Exception e) {
+ if (isOn(r))
+ recursive = DATA_MODE_WEB.equals(mode)?1:999;
+ }
+ // Delay
+ int delay = DATA_MODE_WEB.equals(mode) ? DEFAULT_WEB_DELAY : 0;
+ try {
+ delay = Integer.parseInt(System.getProperty("delay", ""+delay));
+ } catch(Exception e) { }
+ OutputStream out = isOn(System.getProperty("out", DEFAULT_OUT)) ? System.out : null;
+ String fileTypes = System.getProperty("filetypes", DEFAULT_FILE_TYPES);
+ boolean commit = isOn(System.getProperty("commit",DEFAULT_COMMIT));
+ boolean optimize = isOn(System.getProperty("optimize",DEFAULT_OPTIMIZE));
+
+ return new SimplePostTool(mode, url, auto, type, recursive, delay, fileTypes, out, commit, optimize, args);
+ } catch (MalformedURLException e) {
+ fatal("System Property 'url' is not a valid URL: " + urlStr);
+ return null;
+ }
+ }
+
+ /**
+ * Constructor which takes in all mandatory input for the tool to work.
+ * Also see usage() for further explanation of the params.
+ * @param mode whether to post files, web pages, params or stdin
+ * @param url the Solr base Url to post to, should end with /update
+ * @param auto if true, we'll guess type and add resourcename/url
+ * @param type content-type of the data you are posting
+ * @param recursive number of levels for file/web mode, or 0 if one file only
+ * @param delay if recursive then delay will be the wait time between posts
+ * @param fileTypes a comma separated list of file-name endings to accept for file/web
+ * @param out an OutputStream to write output to, e.g. stdout to print to console
+ * @param commit if true, will commit at end of posting
+ * @param optimize if true, will optimize at end of posting
+ * @param args a String[] of arguments, varies between modes
+ */
+ public SimplePostTool(String mode, URL url, boolean auto, String type,
+ int recursive, int delay, String fileTypes, OutputStream out,
+ boolean commit, boolean optimize, String[] args) {
+ this.mode = mode;
+ this.solrUrl = url;
+ this.auto = auto;
+ this.type = type;
+ this.recursive = recursive;
+ this.delay = delay;
+ this.fileTypes = fileTypes;
+ this.globFileFilter = getFileFilterFromFileTypes(fileTypes);
+ this.out = out;
+ this.commit = commit;
+ this.optimize = optimize;
+ this.args = args;
+ pageFetcher = new PageFetcher();
+ }
+
+ public SimplePostTool() {}
+
+ //
+ // Do some action depending on which mode we have
+ //
+ private void doFilesMode() {
+ currentDepth = 0;
+ // Skip posting files if special param "-" given
+ if (!args[0].equals("-")) {
+ info("Posting files to base url " + solrUrl + (!auto?" using content-type "+(type==null?DEFAULT_CONTENT_TYPE:type):"")+"..");
+ if(auto)
+ info("Entering auto mode. File endings considered are "+fileTypes);
+ if(recursive > 0)
+ info("Entering recursive mode, max depth="+recursive+", delay="+delay+"s");
+ int numFilesPosted = postFiles(args, 0, out, type);
+ info(numFilesPosted + " files indexed.");
+ }
+ }
+
+ private void doArgsMode() {
+ info("POSTing args to " + solrUrl + "..");
+ for (String a : args) {
+ postData(stringToStream(a), null, out, type, solrUrl);
+ }
+ }
+
+ private int doWebMode() {
+ reset();
+ int numPagesPosted = 0;
+ try {
+ if(type != null) {
+ fatal("Specifying content-type with \"-Ddata=web\" is not supported");
+ }
+ if (args[0].equals("-")) {
+ // Skip posting url if special param "-" given
+ return 0;
+ }
+ // Set Extracting handler as default
+ solrUrl = appendUrlPath(solrUrl, "/extract");
+
+ info("Posting web pages to Solr url "+solrUrl);
+ auto=true;
+ info("Entering auto mode. Indexing pages with content-types corresponding to file endings "+fileTypes);
+ if(recursive > 0) {
+ if(recursive > MAX_WEB_DEPTH) {
+ recursive = MAX_WEB_DEPTH;
+ warn("Too large recursion depth for web mode, limiting to "+MAX_WEB_DEPTH+"...");
+ }
+ if(delay < DEFAULT_WEB_DELAY)
+ warn("Never crawl an external web site faster than every 10 seconds, your IP will probably be blocked");
+ info("Entering recursive mode, depth="+recursive+", delay="+delay+"s");
+ }
+ numPagesPosted = postWebPages(args, 0, out);
+ info(numPagesPosted + " web pages indexed.");
+ } catch(MalformedURLException e) {
+ fatal("Wrong URL trying to append /extract to "+solrUrl);
+ }
+ return numPagesPosted;
+ }
+
+ private void doStdinMode() {
+ info("POSTing stdin to " + solrUrl + "..");
+ postData(System.in, null, out, type, solrUrl);
+ }
+
+ private void reset() {
+ fileTypes = DEFAULT_FILE_TYPES;
+ globFileFilter = this.getFileFilterFromFileTypes(fileTypes);
+ backlog = new ArrayList<LinkedHashSet<URL>>();
+ visited = new HashSet<URL>();
+ }
+
+
+ //
+ // USAGE
+ //
private static void usageShort() {
System.out.println(USAGE_STRING_SHORT+"\n"+
" Please invoke with -h option for extended usage help.");
@@ -200,11 +344,12 @@ public class SimplePostTool {
System.out.println
(USAGE_STRING_SHORT+"\n\n" +
"Supported System Properties and their defaults:\n"+
- " -Ddata=files|args|stdin (default=" + DEFAULT_DATA_MODE + ")\n"+
+ " -Ddata=files|web|args|stdin (default=" + DEFAULT_DATA_MODE + ")\n"+
" -Dtype= (default=" + DEFAULT_CONTENT_TYPE + ")\n"+
" -Durl= (default=" + DEFAULT_POST_URL + ")\n"+
" -Dauto=yes|no (default=" + DEFAULT_AUTO + ")\n"+
- " -Drecursive=yes|no (default=" + DEFAULT_RECURSIVE + ")\n"+
+ " -Drecursive=yes|no| (default=" + DEFAULT_RECURSIVE + ")\n"+
+ " -Ddelay= (default=0 for files, 10 for web)\n"+
" -Dfiletypes=[,,...] (default=" + DEFAULT_FILE_TYPES + ")\n"+
" -Dparams=\"=[&=...]\" (values must be URL-encoded)\n"+
" -Dcommit=yes|no (default=" + DEFAULT_COMMIT + ")\n"+
@@ -212,11 +357,12 @@ public class SimplePostTool {
" -Dout=yes|no (default=" + DEFAULT_OUT + ")\n\n"+
"This is a simple command line tool for POSTing raw data to a Solr\n"+
"port. Data can be read from files specified as commandline args,\n"+
- "as raw commandline arg strings, or via STDIN.\n"+
+ "URLs specified as args, as raw commandline arg strings or via STDIN.\n"+
"Examples:\n"+
" java -jar post.jar *.xml\n"+
" java -Ddata=args -jar post.jar '42'\n"+
" java -Ddata=stdin -jar post.jar < hd.xml\n"+
+ " java -Ddata=web -jar post.jar http://example.com/\n"+
" java -Dtype=text/csv -jar post.jar *.csv\n"+
" java -Dtype=application/json -jar post.jar *.json\n"+
" java -Durl=http://localhost:8983/solr/update/extract -Dparams=literal.id=a -Dtype=application/pdf -jar post.jar a.pdf\n"+
@@ -228,13 +374,10 @@ public class SimplePostTool {
"or optimize should be executed, and whether the response should\n"+
"be written to STDOUT. If auto=yes the tool will try to set type\n"+
"and url automatically from file name. When posting rich documents\n"+
- "the file name will be propagated as \"resource.name\" and also used as \"literal.id\".\n" +
- "You may override these or any other request parameter through the -Dparams property.\n"+
- "If you want to do a commit only, use \"-\" as argument.");
- }
-
- private static boolean isOn(String property) {
- return(TRUE_STRINGS.indexOf(property) >= 0);
+ "the file name will be propagated as \"resource.name\" and also used\n"+
+ "as \"literal.id\". You may override these or any other request parameter\n"+
+ "through the -Dparams property. To do a commit only, use \"-\" as argument.\n"+
+ "The web mode is a simple crawler following links within domain, default delay=10s.");
}
/** Post all filenames provided in args
@@ -244,7 +387,8 @@ public class SimplePostTool {
* @param type default content-type to use when posting (may be overridden in auto mode)
* @return number of files posted
* */
- int postFiles(String [] args,int startIndexInArgs, OutputStream out, String type) {
+ public int postFiles(String [] args,int startIndexInArgs, OutputStream out, String type) {
+ reset();
int filesPosted = 0;
for (int j = startIndexInArgs; j < args.length; j++) {
File srcFile = new File(args[j]);
@@ -258,7 +402,7 @@ public class SimplePostTool {
String fileGlob = srcFile.getName();
GlobFileFilter ff = new GlobFileFilter(fileGlob, false);
File[] files = parent.listFiles(ff);
- if(files.length == 0) {
+ if(files == null || files.length == 0) {
warn("No files or directories matching "+srcFile);
continue;
}
@@ -268,32 +412,255 @@ public class SimplePostTool {
return filesPosted;
}
+ /** Post all filenames provided in args
+ * @param files array of Files
+ * @param startIndexInArgs offset to start
+ * @param out output stream to post data to
+ * @param type default content-type to use when posting (may be overridden in auto mode)
+ * @return number of files posted
+ * */
+ public int postFiles(File[] files, int startIndexInArgs, OutputStream out, String type) {
+ reset();
+ int filesPosted = 0;
+ for (File srcFile : files) {
+ if(srcFile.isDirectory() && srcFile.canRead()) {
+ filesPosted += postDirectory(srcFile, out, type);
+ } else if (srcFile.isFile() && srcFile.canRead()) {
+ filesPosted += postFiles(new File[] {srcFile}, out, type);
+ } else {
+ File parent = srcFile.getParentFile();
+ if(parent == null) parent = new File(".");
+ String fileGlob = srcFile.getName();
+ GlobFileFilter ff = new GlobFileFilter(fileGlob, false);
+ File[] fileList = parent.listFiles(ff);
+ if(fileList == null || fileList.length == 0) {
+ warn("No files or directories matching "+srcFile);
+ continue;
+ }
+ filesPosted += postFiles(fileList, out, type);
+ }
+ }
+ return filesPosted;
+ }
+
+ /**
+ * Posts a whole directory
+ * @return number of files posted total
+ */
private int postDirectory(File dir, OutputStream out, String type) {
if(dir.isHidden() && !dir.getName().equals("."))
return(0);
- info("Indexing directory "+dir.getPath());
+ info("Indexing directory "+dir.getPath()+" ("+dir.listFiles(globFileFilter).length+" files, depth="+currentDepth+")");
int posted = 0;
posted += postFiles(dir.listFiles(globFileFilter), out, type);
- if(recursive) {
+ if(recursive > currentDepth) {
for(File d : dir.listFiles()) {
- if(d.isDirectory())
+ if(d.isDirectory()) {
+ currentDepth++;
posted += postDirectory(d, out, type);
+ currentDepth--;
+ }
}
}
return posted;
}
+ /**
+ * Posts a list of file names
+ * @return number of files posted
+ */
int postFiles(File[] files, OutputStream out, String type) {
int filesPosted = 0;
for(File srcFile : files) {
- if(!srcFile.isFile() || srcFile.isHidden())
- continue;
- postFile(srcFile, out, type);
- filesPosted++;
+ try {
+ if(!srcFile.isFile() || srcFile.isHidden())
+ continue;
+ postFile(srcFile, out, type);
+ Thread.sleep(delay * 1000);
+ filesPosted++;
+ } catch (InterruptedException e) {
+ throw new RuntimeException();
+ }
}
return filesPosted;
}
+ /**
+ * This method takes as input a list of start URL strings for crawling,
+ * adds each one to a backlog and then starts crawling
+ * @param args the raw input args from main()
+ * @param startIndexInArgs offset for where to start
+ * @param out outputStream to write results to
+ * @return the number of web pages posted
+ */
+ public int postWebPages(String[] args, int startIndexInArgs, OutputStream out) {
+ reset();
+ LinkedHashSet<URL> s = new LinkedHashSet<URL>();
+ for (int j = startIndexInArgs; j < args.length; j++) {
+ try {
+ URL u = new URL(normalizeUrlEnding(args[j]));
+ s.add(u);
+ } catch(MalformedURLException e) {
+ warn("Skipping malformed input URL: "+args[j]);
+ }
+ }
+ // Add URLs to level 0 of the backlog and start recursive crawling
+ backlog.add(s);
+ return webCrawl(0, out);
+ }
+
+ /**
+ * Normalizes a URL string by removing anchor part and trailing slash
+ * @return the normalized URL string
+ */
+ protected static String normalizeUrlEnding(String link) {
+ if(link.indexOf("#") > -1)
+ link = link.substring(0,link.indexOf("#"));
+ if(link.endsWith("?"))
+ link = link.substring(0,link.length()-1);
+ if(link.endsWith("/"))
+ link = link.substring(0,link.length()-1);
+ return link;
+ }
+
+ /**
+ * A very simple crawler, pulling URLs to fetch from a backlog and then
+ * recurses N levels deep if recursive>0. Links are parsed from HTML
+ * through first getting an XHTML version using SolrCell with extractOnly,
+ * and followed if they are local. The crawler pauses for a default delay
+ * of 10 seconds between each fetch, this can be configured in the delay
+ * variable. This is only meant for test purposes, as it does not respect
+ * robots or anything else fancy :)
+ * @param level which level to crawl
+ * @param out output stream to write to
+ * @return number of pages crawled on this level and below
+ */
+ protected int webCrawl(int level, OutputStream out) {
+ int numPages = 0;
+ LinkedHashSet<URL> stack = backlog.get(level);
+ int rawStackSize = stack.size();
+ stack.removeAll(visited);
+ int stackSize = stack.size();
+ LinkedHashSet<URL> subStack = new LinkedHashSet<URL>();
+ info("Entering crawl at level "+level+" ("+rawStackSize+" links total, "+stackSize+" new)");
+ for(URL u : stack) {
+ try {
+ visited.add(u);
+ PageFetcherResult result = pageFetcher.readPageFromUrl(u);
+ if(result.httpStatus == 200) {
+ u = (result.redirectUrl != null) ? result.redirectUrl : u;
+ URL postUrl = new URL(appendParam(solrUrl.toString(),
+ "literal.id="+URLEncoder.encode(u.toString(),"UTF-8") +
+ "&literal.url="+URLEncoder.encode(u.toString(),"UTF-8")));
+ boolean success = postData(new ByteArrayInputStream(result.content), null, out, result.contentType, postUrl);
+ if (success) {
+ info("POSTed web resource "+u+" (depth: "+level+")");
+ Thread.sleep(delay * 1000);
+ numPages++;
+ // Pull links from HTML pages only
+ if(recursive > level && result.contentType.equals("text/html")) {
+ Set children = pageFetcher.getLinksFromWebPage(u, new ByteArrayInputStream(result.content), result.contentType, postUrl);
+ subStack.addAll(children);
+ }
+ } else {
+ warn("An error occurred while posting "+u);
+ }
+ } else {
+ warn("The URL "+u+" returned a HTTP result status of "+result.httpStatus);
+ }
+ } catch (IOException e) {
+ warn("Caught exception when trying to open connection to "+u+": "+e.getMessage());
+ } catch (InterruptedException e) {
+ throw new RuntimeException();
+ }
+ }
+ if(!subStack.isEmpty()) {
+ backlog.add(subStack);
+ numPages += webCrawl(level+1, out);
+ }
+ return numPages;
+ }
+
+ /**
+ * Reads an input stream into a byte array
+ * @param is the input stream
+ * @return the byte array
+ * @throws IOException
+ */
+ protected byte[] inputStreamToByteArray(InputStream is) throws IOException {
+ ByteArrayOutputStream bos = new ByteArrayOutputStream();
+ int next = is.read();
+ while (next > -1) {
+ bos.write(next);
+ next = is.read();
+ }
+ bos.flush();
+ is.close();
+ return bos.toByteArray();
+ }
+
+ /**
+ * Computes the full URL based on a base url and a possibly relative link found
+ * in the href param of an HTML anchor.
+ * @param baseUrl the base url from where the link was found
+ * @param link the absolute or relative link
+ * @return the string version of the full URL
+ */
+ protected String computeFullUrl(URL baseUrl, String link) {
+ if(link == null || link.length() == 0) {
+ return null;
+ }
+ if(!link.startsWith("http")) {
+ if(link.startsWith("/")) {
+ link = baseUrl.getProtocol() + "://" + baseUrl.getAuthority() + link;
+ } else {
+ if(link.contains(":")) {
+ return null; // Skip non-relative URLs
+ }
+ String path = baseUrl.getPath();
+ if(!path.endsWith("/")) {
+ int sep = path.lastIndexOf("/");
+ String file = path.substring(sep+1);
+ if(file.contains(".") || file.contains("?"))
+ path = path.substring(0,sep);
+ }
+ link = baseUrl.getProtocol() + "://" + baseUrl.getAuthority() + path + "/" + link;
+ }
+ }
+ link = normalizeUrlEnding(link);
+ String l = link.toLowerCase(Locale.ROOT);
+ // Simple brute force skip images
+ if(l.endsWith(".jpg") || l.endsWith(".jpeg") || l.endsWith(".png") || l.endsWith(".gif")) {
+ return null; // Skip images
+ }
+ return link;
+ }
+
+ /**
+ * Uses the mime-type map to reverse lookup whether the file ending for our type
+ * is supported by the fileTypes option
+ * @param type what content-type to lookup
+ * @return true if this is a supported content type
+ */
+ protected boolean typeSupported(String type) {
+ for(String key : mimeMap.keySet()) {
+ if(mimeMap.get(key).equals(type)) {
+ if(fileTypes.contains(key))
+ return true;
+ }
+ }
+ return false;
+ }
+
+ /**
+ * Tests if a string is either "true", "on", "yes" or "1"
+ * @param property the string to test
+ * @return true if "on"
+ */
+ protected static boolean isOn(String property) {
+ return("true,on,yes,1".indexOf(property) > -1);
+ }
+
static void warn(String msg) {
System.err.println("SimplePostTool: WARNING: " + msg);
}
@@ -304,21 +671,14 @@ public class SimplePostTool {
static void fatal(String msg) {
System.err.println("SimplePostTool: FATAL: " + msg);
- System.exit(1);
- }
-
- /**
- * Constructs an instance for posting data to the specified Solr URL
- * (ie: "http://localhost:8983/solr/update")
- */
- public SimplePostTool(URL solrUrl) {
- this.solrUrl = solrUrl;
+ System.exit(2);
}
/**
* Does a simple commit operation
*/
public void commit() {
+ info("COMMITting Solr index changes to " + solrUrl + "..");
doGet(appendParam(solrUrl.toString(), "commit=true"));
}
@@ -326,9 +686,16 @@ public class SimplePostTool {
* Does a simple optimize operation
*/
public void optimize() {
+ info("Performing an OPTIMIZE to " + solrUrl + "..");
doGet(appendParam(solrUrl.toString(), "optimize=true"));
}
+ /**
+ * Appends a URL query parameter to a URL
+ * @param url the original URL
+ * @param param the parameter(s) to append, separated by "&"
+ * @return the string version of the resulting URL
+ */
public static String appendParam(String url, String param) {
String[] pa = param.split("&");
for(String p : pa) {
@@ -360,13 +727,12 @@ public class SimplePostTool {
// Default handler
} else {
// SolrCell
- String urlStr = url.getProtocol() + "://" + url.getAuthority() + url.getPath() + "/extract" + (url.getQuery() != null ? "?"+url.getQuery() : "");
+ String urlStr = appendUrlPath(solrUrl, "/extract").toString();
if(urlStr.indexOf("resource.name")==-1)
urlStr = appendParam(urlStr, "resource.name=" + URLEncoder.encode(file.getAbsolutePath(), "UTF-8"));
if(urlStr.indexOf("literal.id")==-1)
urlStr = appendParam(urlStr, "literal.id=" + URLEncoder.encode(file.getAbsolutePath(), "UTF-8"));
url = new URL(urlStr);
-// info("Indexing to ExtractingRequestHandler with URL "+url);
}
} else {
warn("Skipping "+file.getName()+". Unsupported file type for auto mode.");
@@ -390,7 +756,23 @@ public class SimplePostTool {
}
}
- private String guessType(File file) {
+ /**
+  * Extends the path of a URL, leaving protocol, authority and query string as they are
+  * @param url the URL whose path is extended
+  * @param append the text to add at the end of the path, e.g. "/extract"
+  * @return a new URL with the longer path
+  * @throws MalformedURLException if the assembled string is not a valid URL
+  */
+ protected static URL appendUrlPath(URL url, String append) throws MalformedURLException {
+   return new URL(url.getProtocol() + "://" + url.getAuthority() + url.getPath() + append + (url.getQuery() == null ? "" : "?"+url.getQuery()));
+ }
+
+ /**
+ * Guesses the type of a file, based on file name suffix
+ * @param file the file
+ * @return the content-type guessed
+ */
+ protected static String guessType(File file) {
String name = file.getName();
String suffix = name.substring(name.lastIndexOf(".")+1);
return mimeMap.get(suffix.toLowerCase(Locale.ROOT));
@@ -412,6 +794,7 @@ public class SimplePostTool {
*/
public static void doGet(URL url) {
try {
+ if(mockMode) return;
HttpURLConnection urlc = (HttpURLConnection) url.openConnection();
if (HttpURLConnection.HTTP_OK != urlc.getResponseCode()) {
warn("Solr returned an error #" + urlc.getResponseCode() +
@@ -422,15 +805,14 @@ public class SimplePostTool {
}
}
- public void postData(InputStream data, Integer length, OutputStream output, String type) {
- postData(data, length, output, type, solrUrl);
- }
-
/**
* Reads data from the data stream and posts it to solr,
* writes to the response to output
+ * @return true if success
*/
- public void postData(InputStream data, Integer length, OutputStream output, String type, URL url) {
+ public boolean postData(InputStream data, Integer length, OutputStream output, String type, URL url) {
+ if(mockMode) return true;
+ boolean success = true;
if(type == null)
type = DEFAULT_CONTENT_TYPE;
HttpURLConnection urlc = null;
@@ -441,7 +823,6 @@ public class SimplePostTool {
urlc.setRequestMethod("POST");
} catch (ProtocolException e) {
fatal("Shouldn't happen: HttpURLConnection doesn't support POST??"+e);
-
}
urlc.setDoOutput(true);
urlc.setDoInput(true);
@@ -453,6 +834,7 @@ public class SimplePostTool {
} catch (IOException e) {
fatal("Connection error (is Solr running at " + solrUrl + " ?): " + e);
+ success = false;
}
OutputStream out = null;
@@ -461,6 +843,7 @@ public class SimplePostTool {
pipe(data, out);
} catch (IOException e) {
fatal("IOException while posting data: " + e);
+ success = false;
} finally {
try { if(out!=null) out.close(); } catch (IOException x) { /*NOOP*/ }
}
@@ -470,12 +853,14 @@ public class SimplePostTool {
if (HttpURLConnection.HTTP_OK != urlc.getResponseCode()) {
warn("Solr returned an error #" + urlc.getResponseCode() +
" " + urlc.getResponseMessage());
+ success = false;
}
in = urlc.getInputStream();
pipe(in, output);
} catch (IOException e) {
warn("IOException while reading response: " + e);
+ success = false;
} finally {
try { if(in!=null) in.close(); } catch (IOException x) { /*NOOP*/ }
}
@@ -483,8 +868,14 @@ public class SimplePostTool {
} finally {
if(urlc!=null) urlc.disconnect();
}
+ return success;
}
+ /**
+ * Converts a string to an input stream
+ * @param s the string
+ * @return the input stream
+ */
public static InputStream stringToStream(String s) {
InputStream is = null;
try {
@@ -508,36 +899,64 @@ public class SimplePostTool {
if (null != dest) dest.flush();
}
- public boolean isAuto() {
- return auto;
- }
-
- public void setAuto(boolean auto) {
- this.auto = auto;
- }
-
- public boolean isRecursive() {
- return recursive;
- }
-
- public void setRecursive(boolean recursive) {
- this.recursive = recursive;
- }
-
- public String getFileTypes() {
- return fileTypes;
- }
-
- public void setFileTypes(String fileTypes) {
- this.fileTypes = fileTypes;
+ public GlobFileFilter getFileFilterFromFileTypes(String fileTypes) {
String glob;
if(fileTypes.equals("*"))
glob = ".*";
else
glob = "^.*\\.(" + fileTypes.replace(",", "|") + ")$";
- this.globFileFilter = new GlobFileFilter(glob, true);
+ return new GlobFileFilter(glob, true);
}
+ //
+ // Utility methods for XPath handling
+ //
+
+ /**
+ * Gets all nodes matching an XPath
+ */
+ public static NodeList getNodesFromXP(Node n, String xpath) throws XPathExpressionException {
+   XPath evaluator = XPathFactory.newInstance().newXPath();
+   // Asking for NODESET is what makes the cast to NodeList below valid
+   Object matches = evaluator.compile(xpath).evaluate(n, XPathConstants.NODESET);
+   return (NodeList) matches;
+ }
+
+ /**
+  * Gets the string content of the nodes matching an XPath
+  * @param n the node (or doc)
+  * @param xpath the xpath string
+  * @param concatAll if true, text from all matching nodes will be concatenated, else only the first returned
+  * @return the node values separated by a space and trimmed, or an empty string if nothing matches
+  */
+ public static String getXP(Node n, String xpath, boolean concatAll)
+     throws XPathExpressionException {
+   NodeList matches = getNodesFromXP(n, xpath);
+   // Without concatAll at most the first match contributes
+   int wanted = concatAll ? matches.getLength() : Math.min(1, matches.getLength());
+   StringBuilder text = new StringBuilder();
+   for(int i = 0; i < wanted; i++) {
+     text.append(matches.item(i).getNodeValue()).append(" ");
+   }
+   // trim() removes the separator left after the last value
+   return text.toString().trim();
+ }
+
+ /**
+  * Parses a string holding an XML document into a DOM
+  * @param in the XML text
+  * @param inputEncoding name of the charset used to turn the string into bytes for the parser
+  * @return the parsed document
+  */
+ public static Document makeDom(String in, String inputEncoding) throws SAXException, IOException, ParserConfigurationException {
+   // NOTE(review): the parser runs with factory defaults; consider disallowing DTDs if the XML can be untrusted
+   byte[] raw = in.getBytes(inputEncoding);
+   return DocumentBuilderFactory.newInstance().newDocumentBuilder().parse(new ByteArrayInputStream(raw));
+ }
+
+ /**
+ * Inner class to filter files based on glob wildcards
+ */
class GlobFileFilter implements FileFilter
{
private String _pattern;
@@ -571,4 +990,170 @@ public class SimplePostTool {
return p.matcher(file.getName()).find();
}
}
+
+ /**
+  * Simple crawler class which can fetch a page and check for robots.txt
+  */
+ class PageFetcher {
+   Map<String, List<String>> robotsCache = new HashMap<String, List<String>>(); // Disallow paths, keyed by host
+   final String DISALLOW = "Disallow:";
+
+   public PageFetcher() {}
+
+   /**
+    * Fetches a page into memory, unless robots.txt or the supported file types rule it out
+    * @param u the URL to fetch
+    * @return the result; httpStatus is 403 when disallowed by robots.txt and 415 for unsupported types
+    */
+   public PageFetcherResult readPageFromUrl(URL u) {
+     PageFetcherResult res = new PageFetcherResult();
+     try {
+       if (isDisallowedByRobots(u)) {
+         warn("The URL "+u+" is disallowed by robots.txt and will not be crawled.");
+         res.httpStatus = 403;
+         visited.add(u);
+         return res;
+       }
+       res.httpStatus = 404;
+       HttpURLConnection conn = (HttpURLConnection) u.openConnection();
+       conn.setRequestProperty("User-Agent", "SimplePostTool-crawler/"+VERSION_OF_THIS_TOOL+" (http://lucene.apache.org/solr/)");
+       conn.setRequestProperty("Accept-Encoding", "gzip, deflate");
+       conn.connect();
+       res.httpStatus = conn.getResponseCode();
+       if(!normalizeUrlEnding(conn.getURL().toString()).equals(normalizeUrlEnding(u.toString()))) {
+         info("The URL "+u+" caused a redirect to "+conn.getURL());
+         u = conn.getURL();
+         res.redirectUrl = u;
+         visited.add(u);
+       }
+       if(res.httpStatus == 200) {
+         // Raw content type of form "text/html; encoding=utf-8"; null when the server sends none
+         String rawContentType = conn.getContentType();
+         String type = rawContentType == null ? "" : rawContentType.split(";")[0].trim();
+         if(typeSupported(type)) {
+           String encoding = conn.getContentEncoding();
+           InputStream is = conn.getInputStream();
+           try {
+             if ("gzip".equalsIgnoreCase(encoding)) {
+               is = new GZIPInputStream(is);
+             } else if ("deflate".equalsIgnoreCase(encoding)) {
+               is = new InflaterInputStream(is, new Inflater(true));
+             }
+             // Read into memory, so that we later can pull links from the page without re-fetching
+             res.content = inputStreamToByteArray(is);
+           } finally {
+             is.close(); // also runs when reading fails, so the stream is never left open
+           }
+         } else {
+           warn("Skipping URL with unsupported type "+type);
+           res.httpStatus = 415;
+         }
+       }
+     } catch(IOException e) {
+       warn("IOException when reading page from url "+u+": "+e.getMessage());
+     }
+     return res;
+   }
+
+   /**
+    * Checks a URL against the robots.txt of its host, which is fetched and cached on first use
+    * @param url the URL about to be crawled
+    * @return true if a Disallow path covers the URL, or if the robots.txt URL is malformed
+    */
+   public boolean isDisallowedByRobots(URL url) {
+     String host = url.getHost();
+     String strRobot = url.getProtocol() + "://" + host + "/robots.txt"; // NOTE(review): getHost() drops any port - confirm intended
+     List<String> disallows = robotsCache.get(host);
+     if(disallows == null) {
+       disallows = new ArrayList<String>();
+       try {
+         disallows = parseRobotsTxt(new URL(strRobot).openStream());
+       } catch (MalformedURLException e) {
+         return true; // We cannot trust this robots URL, should not happen
+       } catch (IOException e) {
+         // There is no robots.txt, will cache an empty disallow list
+       }
+       robotsCache.put(host, disallows);
+     }
+
+     String strURL = url.getFile();
+     for (String path : disallows) {
+       if (path.equals("/") || strURL.startsWith(path))
+         return true;
+     }
+     return false;
+   }
+
+   /**
+    * Very simple robots.txt parser which obeys all Disallow lines regardless
+    * of user agent or whether there are valid Allow: lines.
+    * @param is Input stream of the robots.txt file, closed by this method
+    * @return a list of disallow paths
+    * @throws IOException if problems reading the stream
+    */
+   protected List<String> parseRobotsTxt(InputStream is) throws IOException {
+     List<String> disallows = new ArrayList<String>();
+     BufferedReader r = new BufferedReader(new InputStreamReader(is, "UTF-8"));
+     try {
+       String l;
+       while((l = r.readLine()) != null) {
+         l = l.split("#", 2)[0].trim(); // keep only the part in front of a comment
+         if(l.startsWith(DISALLOW)) {
+           l = l.substring(DISALLOW.length()).trim();
+           if(l.length() > 0) disallows.add(l); // a Disallow: without a path adds nothing
+         }
+       }
+     } finally {
+       r.close(); // closes the wrapped stream as well, also when reading fails
+     }
+     return disallows;
+   }
+
+   /**
+    * Finds links on a web page, using /extract?extractOnly=true
+    * @param u the URL of the web page
+    * @param is the input stream of the page
+    * @param type the content-type
+    * @param postUrl the URL (typically /solr/extract) in order to pull out links
+    * @return the links on the page that share its authority; empty if the extraction request fails
+    */
+   protected Set<URL> getLinksFromWebPage(URL u, InputStream is, String type, URL postUrl) {
+     Set<URL> l = new HashSet<URL>();
+     URL url = null;
+     try {
+       ByteArrayOutputStream os = new ByteArrayOutputStream();
+       URL extractUrl = new URL(appendParam(postUrl.toString(), "extractOnly=true"));
+       if(!postData(is, null, os, type, extractUrl)) return l;
+       String rawXml = os.toString("UTF-8");
+       Document d = makeDom(rawXml, "UTF-8");
+       String innerXml = getXP(d, "/response/str/text()[1]", false);
+       d = makeDom(innerXml, "UTF-8");
+       NodeList links = getNodesFromXP(d, "/html/body//a/@href");
+       for(int i = 0; i < links.getLength(); i++) {
+         String link = computeFullUrl(u, links.item(i).getTextContent());
+         if(link == null) continue;
+         url = new URL(link);
+         if(url.getAuthority() == null || !url.getAuthority().equals(u.getAuthority())) continue; // other site
+         l.add(url);
+       }
+     } catch (MalformedURLException e) {
+       warn("Malformed URL "+url);
+     } catch (IOException e) {
+       warn("IOException opening URL "+url+": "+e.getMessage());
+     } catch (Exception e) {
+       throw new RuntimeException(e); // keep the cause, e.g. XML parse or XPath errors
+     }
+     return l;
+   }
+ }
+
+ /**
+ * Utility class to hold the result from a page fetch
+ */
+ public class PageFetcherResult {
+ int httpStatus = 200; // HTTP status of the fetch; readPageFromUrl overwrites it with 403, 404, 415 or the server's code
+ String contentType = "text/html"; // NOTE(review): readPageFromUrl never assigns this - confirm who is expected to set it
+ URL redirectUrl = null; // the final URL when the fetch was redirected, else null
+ byte[] content; // page body read into memory; only assigned for a 200 response of a supported type
+ }
}
diff --git a/solr/core/src/test-files/exampledocs/example.html b/solr/core/src/test-files/exampledocs/example.html
new file mode 100644
index 00000000000..5732f6214bc
--- /dev/null
+++ b/solr/core/src/test-files/exampledocs/example.html
@@ -0,0 +1,49 @@
+
+
+ Welcome to Solr
+
+
+