LUCENE-3918: port IndexSorter to trunk API

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1454801 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Shai Erera 2013-03-10 07:55:38 +00:00
parent ff13a11930
commit 389bed49f8
8 changed files with 1386 additions and 0 deletions

View File

@ -31,6 +31,11 @@ New Features
* LUCENE-4815: DrillSideways now allows more than one FacetRequest per
dimension (Mike McCandless)
* LUCENE-3918: IndexSorter has been ported to 4.3 API and now supports
sorting documents by a numeric DocValues field, or reverse the order of
the documents in the index. Additionally, apps can implement their own
sort criteria. (Anat Hashavit, Shai Erera)
* LUCENE-4817: Added KeywordRepeatFilter that allows to emit a token twice
once as a keyword and once as an ordinary token, to allow stemmers to emit

View File

@ -0,0 +1,65 @@
package org.apache.lucene.index.sorter;
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.IOException;
import java.util.AbstractList;
import java.util.List;
import org.apache.lucene.index.AtomicReader;
import org.apache.lucene.index.NumericDocValues;
/**
* A {@link Sorter} which sorts documents according to their
* {@link NumericDocValues}.
*
* @lucene.experimental
*/
public class NumericDocValuesSorter extends Sorter {

  private final String fieldName;

  /**
   * Creates a sorter which orders documents by the {@link NumericDocValues} of
   * the given field, in ascending value order.
   */
  public NumericDocValuesSorter(final String fieldName) {
    this.fieldName = fieldName;
  }

  @Override
  public int[] oldToNew(final AtomicReader reader) throws IOException {
    final NumericDocValues ndv = reader.getNumericDocValues(fieldName);
    if (ndv == null) {
      // fail fast with a descriptive message instead of an opaque NPE from the
      // List view below, which would only surface deep inside compute()
      throw new IllegalArgumentException("no NumericDocValues for field '" + fieldName + "' in the given reader");
    }
    final int maxDoc = reader.maxDoc();
    final int[] docs = new int[maxDoc];
    // a read-only List view over the doc values; compute() sorts by these
    final List<Long> values = new AbstractList<Long>() {
      @Override
      public Long get(int doc) {
        return ndv.get(doc);
      }
      @Override
      public int size() {
        return maxDoc; // one value per document, including deleted ones
      }
    };
    // identity permutation as the starting point; compute() sorts it by value
    for (int i = 0; i < maxDoc; i++) {
      docs[i] = i;
    }
    return compute(docs, values);
  }

}

View File

@ -0,0 +1,106 @@
package org.apache.lucene.index.sorter;
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.IOException;
import java.util.List;
import org.apache.lucene.index.AtomicReader;
import org.apache.lucene.util.SorterTemplate;
/**
* Sorts documents in a given index by returning a permutation on the docs.
* Implementations can call {@link #compute(int[], List)} to compute the
* old-to-new permutation over the given documents and values.
*
* @lucene.experimental
*/
public abstract class Sorter {

  /** Sorts documents in reverse order. */
  public static final Sorter REVERSE_DOCS = new Sorter() {
    @Override
    public int[] oldToNew(final AtomicReader reader) throws IOException {
      final int numDocs = reader.maxDoc();
      final int[] mapping = new int[numDocs];
      // document i moves to slot numDocs - 1 - i
      for (int docID = 0; docID < numDocs; docID++) {
        mapping[docID] = numDocs - 1 - docID;
      }
      return mapping;
    }
  };

  /**
   * A {@link SorterTemplate} which reorders a doc-ID array according to the
   * {@link Comparable} value associated with each document.
   */
  private static final class DocValueSorterTemplate<T extends Comparable<? super T>> extends SorterTemplate {

    private final int[] docs;
    private final List<T> values;
    private T pivot;

    DocValueSorterTemplate(int[] docs, List<T> values) {
      this.docs = docs;
      this.values = values;
    }

    @Override
    protected int compare(int i, int j) {
      return values.get(docs[i]).compareTo(values.get(docs[j]));
    }

    @Override
    protected int comparePivot(int j) {
      return pivot.compareTo(values.get(docs[j]));
    }

    @Override
    protected void setPivot(int i) {
      pivot = values.get(docs[i]);
    }

    @Override
    protected void swap(int i, int j) {
      final int tmp = docs[i];
      docs[i] = docs[j];
      docs[j] = tmp;
    }
  }

  /** Computes the old-to-new permutation over the given documents and values. */
  protected static <T extends Comparable<? super T>> int[] compute(int[] docs, List<T> values) {
    new DocValueSorterTemplate<T>(docs, values).quickSort(0, docs.length - 1);
    // invert the sorted array: after the sort, docs[newID] holds the old ID
    final int[] oldToNew = new int[docs.length];
    for (int newID = 0; newID < docs.length; newID++) {
      oldToNew[docs[newID]] = newID;
    }
    return oldToNew;
  }

  /**
   * Returns a mapping from the old document ID to its new location in the
   * sorted index. Implementations can use the auxiliary
   * {@link #compute(int[], List)} to compute the old-to-new permutation
   * given an array of documents and their corresponding values.
   * <p>
   * <b>NOTE:</b> deleted documents are expected to appear in the mapping as
   * well, they will however be dropped when the index is actually sorted.
   */
  public abstract int[] oldToNew(AtomicReader reader) throws IOException;

}

View File

@ -0,0 +1,652 @@
package org.apache.lucene.index.sorter;
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.IOException;
import java.util.Arrays;
import org.apache.lucene.index.AtomicReader;
import org.apache.lucene.index.BinaryDocValues;
import org.apache.lucene.index.DocsAndPositionsEnum;
import org.apache.lucene.index.DocsEnum;
import org.apache.lucene.index.FieldInfo.IndexOptions;
import org.apache.lucene.index.FieldInfos;
import org.apache.lucene.index.Fields;
import org.apache.lucene.index.FilterAtomicReader;
import org.apache.lucene.index.NumericDocValues;
import org.apache.lucene.index.SortedDocValues;
import org.apache.lucene.index.SortedSetDocValues;
import org.apache.lucene.index.StoredFieldVisitor;
import org.apache.lucene.index.Terms;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.search.DocIdSetIterator;
import org.apache.lucene.store.IndexInput;
import org.apache.lucene.store.IndexOutput;
import org.apache.lucene.store.RAMFile;
import org.apache.lucene.store.RAMInputStream;
import org.apache.lucene.store.RAMOutputStream;
import org.apache.lucene.util.ArrayUtil;
import org.apache.lucene.util.Bits;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.FixedBitSet;
import org.apache.lucene.util.SorterTemplate;
/**
* An {@link AtomicReader} which supports sorting documents by a given
* {@link Sorter}. You can use this class to sort an index as follows:
*
* <pre class="prettyprint">
* IndexWriter writer; // writer to which the sorted index will be added
* DirectoryReader reader; // reader on the input index
* Sorter sorter; // determines how the documents are sorted
* AtomicReader sortingReader = new SortingAtomicReader(reader, sorter);
 * writer.addIndexes(sortingReader); // add the sorting reader, not the original one
* writer.close();
* reader.close();
* </pre>
*
* @lucene.experimental
*/
public class SortingAtomicReader extends FilterAtomicReader {
private static class SortingFields extends FilterFields {
private final int[] old2new;
private final Bits inLiveDocs;
private final FieldInfos infos;
public SortingFields(final Fields in, final Bits inLiveDocs, FieldInfos infos, final int[] old2new) {
super(in);
this.old2new = old2new;
this.inLiveDocs = inLiveDocs;
this.infos = infos;
}
@Override
public Terms terms(final String field) throws IOException {
Terms terms = in.terms(field);
if (terms == null) {
return null;
} else {
return new SortingTerms(terms, inLiveDocs, infos.fieldInfo(field).getIndexOptions(), old2new);
}
}
}
private static class SortingTerms extends FilterTerms {
private final int[] old2new;
private final Bits inLiveDocs;
private final IndexOptions indexOptions;
public SortingTerms(final Terms in, final Bits inLiveDocs, IndexOptions indexOptions, final int[] old2new) {
super(in);
this.old2new = old2new;
this.inLiveDocs = inLiveDocs;
this.indexOptions = indexOptions;
}
@Override
public TermsEnum iterator(final TermsEnum reuse) throws IOException {
return new SortingTermsEnum(in.iterator(reuse), inLiveDocs, old2new, indexOptions);
}
}
  /**
   * A {@link TermsEnum} whose docs/positions enums present documents in the
   * sorted order. The heavy lifting (buffering and re-sorting postings) is
   * done by {@link SortingDocsEnum} / {@link SortingDocsAndPositionsEnum}.
   */
  private static class SortingTermsEnum extends FilterTermsEnum {
    private final int[] old2new;       // old2new permutation
    private final Bits inLiveDocs;     // live docs of the unsorted reader
    private final IndexOptions indexOptions;
    public SortingTermsEnum(final TermsEnum in, final Bits inLiveDocs, final int[] old2new, IndexOptions indexOptions) {
      super(in);
      this.old2new = old2new;
      this.inLiveDocs = inLiveDocs;
      this.indexOptions = indexOptions;
    }
    @Override
    public DocsEnum docs(Bits liveDocs, DocsEnum reuse, final int flags) throws IOException {
      // callers pass liveDocs in the SORTED doc space; the wrapped enum
      // iterates the unsorted space, so substitute the reader's own liveDocs
      if (liveDocs != null) {
        liveDocs = inLiveDocs;
      }
      // if we're asked to reuse the given DocsEnum and it is Sorting, return
      // the wrapped one, since some Codecs expect it.
      if (reuse != null && reuse instanceof SortingDocsEnum) {
        reuse = ((SortingDocsEnum) reuse).getWrapped();
      }
      // only honor FLAG_FREQS if the field actually indexed freqs
      boolean withFreqs = indexOptions.compareTo(IndexOptions.DOCS_AND_FREQS) >=0 && (flags & DocsEnum.FLAG_FREQS) != 0;
      return new SortingDocsEnum(in.docs(liveDocs, reuse, flags), withFreqs, old2new);
    }
    @Override
    public DocsAndPositionsEnum docsAndPositions(Bits liveDocs, DocsAndPositionsEnum reuse, final int flags) throws IOException {
      // see docs(): remap the deleted-docs view to the unsorted doc space
      if (liveDocs != null) {
        liveDocs = inLiveDocs;
      }
      // if we're asked to reuse the given DocsAndPositionsEnum and it is
      // Sorting, return the wrapped one, since some Codecs expect it.
      if (reuse != null && reuse instanceof SortingDocsAndPositionsEnum) {
        reuse = ((SortingDocsAndPositionsEnum) reuse).getWrapped();
      }
      final DocsAndPositionsEnum positions = in.docsAndPositions(liveDocs, reuse, flags);
      if (positions == null) {
        return null;
      } else {
        // we ignore the fact that offsets may be stored but not asked for,
        // since this code is expected to be used during addIndexes which will
        // ask for everything. if that assumption changes in the future, we can
        // factor in whether 'flags' says offsets are not required.
        boolean storeOffsets = indexOptions.compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS) >= 0;
        return new SortingDocsAndPositionsEnum(positions, old2new, storeOffsets);
      }
    }
  }
private static class SortingBinaryDocValues extends BinaryDocValues {
private final BinaryDocValues in;
private final int[] new2old;
SortingBinaryDocValues(BinaryDocValues in, int[] new2old) {
this.in = in;
this.new2old = new2old;
}
@Override
public void get(int docID, BytesRef result) {
in.get(new2old[docID], result);
}
}
private static class SortingNumericDocValues extends NumericDocValues {
private final NumericDocValues in;
private final int[] new2old;
public SortingNumericDocValues(final NumericDocValues in, final int[] new2old) {
this.in = in;
this.new2old = new2old;
}
@Override
public long get(int docID) {
return in.get(new2old[docID]);
}
}
private static class SortingSortedDocValues extends SortedDocValues {
private final SortedDocValues in;
private final int[] new2old;
SortingSortedDocValues(SortedDocValues in, int[] new2old) {
this.in = in;
this.new2old = new2old;
}
@Override
public int getOrd(int docID) {
return in.getOrd(new2old[docID]);
}
@Override
public void lookupOrd(int ord, BytesRef result) {
in.lookupOrd(ord, result);
}
@Override
public int getValueCount() {
return in.getValueCount();
}
@Override
public void get(int docID, BytesRef result) {
in.get(new2old[docID], result);
}
@Override
public int lookupTerm(BytesRef key) {
return in.lookupTerm(key);
}
}
private static class SortingSortedSetDocValues extends SortedSetDocValues {
private final SortedSetDocValues in;
private final int[] new2old;
SortingSortedSetDocValues(SortedSetDocValues in, int[] new2old) {
this.in = in;
this.new2old = new2old;
}
@Override
public long nextOrd() {
return in.nextOrd();
}
@Override
public void setDocument(int docID) {
in.setDocument(new2old[docID]);
}
@Override
public void lookupOrd(long ord, BytesRef result) {
in.lookupOrd(ord, result);
}
@Override
public long getValueCount() {
return in.getValueCount();
}
@Override
public long lookupTerm(BytesRef key) {
return in.lookupTerm(key);
}
}
private static class SortingDocsEnum extends FilterDocsEnum {
private static final class DocFreqSorterTemplate extends SorterTemplate {
private final int[] docs;
private final int[] freqs;
private int pivot;
public DocFreqSorterTemplate(int[] docs, int[] freqs) {
this.docs = docs;
this.freqs = freqs;
}
@Override
protected int compare(int i, int j) {
return docs[i] - docs[j];
}
@Override
protected int comparePivot(int j) {
return pivot - docs[j];
}
@Override
protected void setPivot(int i) {
pivot = docs[i];
}
@Override
protected void swap(int i, int j) {
int tmpDoc = docs[i];
docs[i] = docs[j];
docs[j] = tmpDoc;
int tmpFreq = freqs[i];
freqs[i] = freqs[j];
freqs[j] = tmpFreq;
}
}
private int[] docs = new int[64];
private int[] freqs;
private int docIt = -1;
private final int upto;
private final boolean withFreqs;
public SortingDocsEnum(final DocsEnum in, boolean withFreqs, final int[] old2new) throws IOException {
super(in);
this.withFreqs = withFreqs;
int i = 0;
int doc;
if (withFreqs) {
freqs = new int[docs.length];
while ((doc = in.nextDoc()) != DocIdSetIterator.NO_MORE_DOCS){
if (i >= docs.length) {
docs = ArrayUtil.grow(docs, docs.length + 1);
freqs = ArrayUtil.grow(freqs, freqs.length + 1);
}
docs[i] = old2new[doc];
freqs[i] = in.freq();
++i;
}
SorterTemplate sorter = new DocFreqSorterTemplate(docs, freqs);
sorter.quickSort(0, i - 1);
} else {
freqs = null;
while ((doc = in.nextDoc()) != DocIdSetIterator.NO_MORE_DOCS){
if (i >= docs.length) {
docs = ArrayUtil.grow(docs, docs.length + 1);
}
docs[i++] = old2new[doc];
}
Arrays.sort(docs, 0, i);
}
upto = i;
}
@Override
public int advance(final int target) throws IOException {
// need to support it for checkIndex, but in practice it won't be called, so
// don't bother to implement efficiently for now.
while (nextDoc() < target) {}
return docID();
}
@Override
public int docID() {
return docIt >= upto ? NO_MORE_DOCS : docs[docIt];
}
@Override
public int freq() throws IOException {
return withFreqs && docIt < upto ? freqs[docIt] : 1;
}
@Override
public int nextDoc() throws IOException {
if (++docIt >= upto) return NO_MORE_DOCS;
return docs[docIt];
}
/** Returns the wrapped {@link DocsEnum}. */
DocsEnum getWrapped() {
return in;
}
}
private static class SortingDocsAndPositionsEnum extends FilterDocsAndPositionsEnum {
/**
* A {@link SorterTemplate} which sorts two parallel arrays of doc IDs and
* offsets in one go. Everytime a doc ID is 'swapped', its correponding offset
* is swapped too.
*/
private static final class DocOffsetSorterTemplate extends SorterTemplate {
private final int[] docs;
private final long[] offsets;
private int pivot;
public DocOffsetSorterTemplate(int[] docs, long[] offsets) {
this.docs = docs;
this.offsets = offsets;
}
@Override
protected int compare(int i, int j) {
return docs[i] - docs[j];
}
@Override
protected int comparePivot(int j) {
return pivot - docs[j];
}
@Override
protected void setPivot(int i) {
pivot = docs[i];
}
@Override
protected void swap(int i, int j) {
int tmpDoc = docs[i];
docs[i] = docs[j];
docs[j] = tmpDoc;
long tmpOffset = offsets[i];
offsets[i] = offsets[j];
offsets[j] = tmpOffset;
}
}
private int[] docs;
private long[] offsets;
private final int upto;
private final IndexInput postingInput;
private final boolean storeOffsets;
private int docIt = -1;
private int pos;
private int startOffset = -1;
private int endOffset = -1;
private final BytesRef payload = new BytesRef(32);
private int currFreq;
public SortingDocsAndPositionsEnum(final DocsAndPositionsEnum in, final int[] old2new, boolean storeOffsets) throws IOException {
super(in);
this.storeOffsets = storeOffsets;
final RAMFile file = new RAMFile();
final IndexOutput out = new RAMOutputStream(file);
docs = new int[32];
offsets = new long[32];
int doc;
int i = 0;
while ((doc = in.nextDoc()) != DocIdSetIterator.NO_MORE_DOCS) {
if (i == docs.length) {
docs = ArrayUtil.grow(docs, i + 1);
offsets = ArrayUtil.grow(offsets, i + 1);
}
docs[i] = old2new[doc];
offsets[i] = out.getFilePointer();
addPositions(in, out);
i++;
}
upto = i;
SorterTemplate sorter = new DocOffsetSorterTemplate(docs, offsets);
sorter.quickSort(0, upto - 1);
out.close();
this.postingInput = new RAMInputStream("", file);
}
private void addPositions(final DocsAndPositionsEnum in, final IndexOutput out) throws IOException {
int freq = in.freq();
out.writeVInt(freq);
for (int i = 0; i < freq; i++) {
final int pos = in.nextPosition();
out.writeVInt(pos);
if (storeOffsets) { // don't encode offsets if they are not stored
out.writeVInt(in.startOffset());
out.writeVInt(in.endOffset());
}
BytesRef payload = in.getPayload();
if (payload != null) {
out.writeVInt(payload.length);
out.writeBytes(payload.bytes, payload.offset, payload.length);
} else {
out.writeVInt(0);
}
}
}
@Override
public int advance(final int target) throws IOException {
// need to support it for checkIndex, but in practice it won't be called, so
// don't bother to implement efficiently for now.
while (nextDoc() < target) {}
return docID();
}
@Override
public int docID() {
return docIt >= upto ? NO_MORE_DOCS : docs[docIt];
}
@Override
public int endOffset() throws IOException {
return endOffset;
}
@Override
public int freq() throws IOException {
return currFreq;
}
@Override
public BytesRef getPayload() throws IOException {
return payload.length == 0 ? null : payload;
}
@Override
public int nextDoc() throws IOException {
if (++docIt >= upto) return DocIdSetIterator.NO_MORE_DOCS;
postingInput.seek(offsets[docIt]);
currFreq = postingInput.readVInt();
return docs[docIt];
}
@Override
public int nextPosition() throws IOException {
pos = postingInput.readVInt();
if (storeOffsets) {
startOffset = postingInput.readVInt();
endOffset = postingInput.readVInt();
}
int length = postingInput.readVInt();
if (length > 0) {
if (length >= payload.bytes.length) {
payload.grow(length + 1);
}
postingInput.readBytes(payload.bytes, 0, length);
}
payload.length = length;
return pos;
}
@Override
public int startOffset() throws IOException {
return startOffset;
}
/** Returns the wrapped {@link DocsAndPositionsEnum}. */
DocsAndPositionsEnum getWrapped() {
return in;
}
}
  // old2new maps an unsorted doc ID to its position in the sorted view;
  // new2old is its inverse (sorted position -> unsorted doc ID)
  private final int[] old2new, new2old;
  // live docs remapped into the sorted doc space; null when the input has no deletions
  private final FixedBitSet mappedLiveDocs;

  /**
   * Wraps the given reader so documents appear in the order defined by
   * {@code sorter}.
   *
   * @throws IllegalArgumentException if the sorter's permutation does not
   *         cover every document (including deleted ones) of {@code in}
   */
  public SortingAtomicReader(final AtomicReader in, final Sorter sorter) throws IOException {
    super(in);
    old2new = sorter.oldToNew(in);
    if (old2new.length != in.maxDoc()) {
      throw new IllegalArgumentException("sorter should provide mapping for every document in the index, including deleted ones");
    }
    // invert the permutation: new2old[old2new[i]] == i
    new2old = new int[old2new.length];
    for (int i = 0; i < new2old.length; i++) {
      new2old[old2new[i]] = i;
    }
    if (!in.hasDeletions()) {
      mappedLiveDocs = null;
    } else {
      // start with all docs live, then clear the sorted position of each
      // document that is deleted in the input reader
      mappedLiveDocs = new FixedBitSet(in.maxDoc());
      mappedLiveDocs.set(0, in.maxDoc());
      Bits liveDocs = in.getLiveDocs();
      int len = liveDocs.length();
      for (int i = 0; i < len; i++) {
        if (!liveDocs.get(i)) {
          mappedLiveDocs.clear(old2new[i]);
        }
      }
    }
  }
  // Stored fields are fetched from the original (unsorted) doc ID.
  @Override
  public void document(final int docID, final StoredFieldVisitor visitor) throws IOException {
    in.document(new2old[docID], visitor);
  }

  // Postings are re-sorted lazily per term by SortingFields and its enums.
  @Override
  public Fields fields() throws IOException {
    Fields fields = in.fields();
    if (fields == null) {
      return null;
    } else {
      return new SortingFields(fields, in.getLiveDocs(), in.getFieldInfos(), old2new);
    }
  }

  // Doc-values views remap sorted doc IDs back through new2old.
  @Override
  public BinaryDocValues getBinaryDocValues(String field) throws IOException {
    BinaryDocValues oldDocValues = in.getBinaryDocValues(field);
    if (oldDocValues == null) {
      return null;
    } else {
      return new SortingBinaryDocValues(oldDocValues, new2old);
    }
  }

  // Returns the live docs remapped into the sorted doc space (null if no deletions).
  @Override
  public Bits getLiveDocs() {
    ensureOpen();
    return mappedLiveDocs;
  }

  @Override
  public NumericDocValues getNormValues(String field) throws IOException {
    final NumericDocValues norm = in.getNormValues(field);
    if (norm == null) {
      return null;
    } else {
      return new SortingNumericDocValues(norm, new2old);
    }
  }

  @Override
  public NumericDocValues getNumericDocValues(String field) throws IOException {
    final NumericDocValues oldDocValues = in.getNumericDocValues(field);
    if (oldDocValues == null) return null;
    return new SortingNumericDocValues(oldDocValues, new2old);
  }

  @Override
  public SortedDocValues getSortedDocValues(String field) throws IOException {
    SortedDocValues sortedDV = in.getSortedDocValues(field);
    if (sortedDV == null) {
      return null;
    } else {
      return new SortingSortedDocValues(sortedDV, new2old);
    }
  }

  @Override
  public SortedSetDocValues getSortedSetDocValues(String field) throws IOException {
    SortedSetDocValues sortedSetDV = in.getSortedSetDocValues(field);
    if (sortedSetDV == null) {
      return null;
    } else {
      return new SortingSortedSetDocValues(sortedSetDV, new2old);
    }
  }

  // Term vectors are per-document, so fetch them from the original doc ID.
  @Override
  public Fields getTermVectors(final int docID) throws IOException {
    return in.getTermVectors(new2old[docID]);
  }

}

View File

@ -0,0 +1,27 @@
<!doctype html public "-//w3c//dtd html 4.0 transitional//en">
<!--
Licensed to the Apache Software Foundation (ASF) under one or more
contributor license agreements. See the NOTICE file distributed with
this work for additional information regarding copyright ownership.
The ASF licenses this file to You under the Apache License, Version 2.0
(the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
-->
<html>
<body>
Provides index sorting capabilities. The application can use one of the
pre-existing Sorter implementations, e.g. to sort by a numeric
DocValues or reverse the order of the documents. Additionally, the
application can implement a Sorter which returns a permutation on
a source Directory's document IDs, to sort the input documents by additional
values.
</body>
</html>

View File

@ -0,0 +1,79 @@
package org.apache.lucene.index.sorter;
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.util.ArrayList;
import java.util.Collections;
import java.util.List;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.SlowCompositeReaderWrapper;
import org.apache.lucene.store.Directory;
import org.apache.lucene.util.Bits;
import org.apache.lucene.util._TestUtil;
import org.junit.BeforeClass;
public class IndexSortingTest extends SorterTestBase {

  /** Sorters under test: numeric-DV ordering and reverse document order. */
  private static final Sorter[] SORTERS = new Sorter[] {
    new NumericDocValuesSorter(NUMERIC_DV_FIELD),
    Sorter.REVERSE_DOCS,
  };

  @BeforeClass
  public static void beforeClassSorterUtilTest() throws Exception {
    // only read the values of the undeleted documents, since after addIndexes,
    // the deleted ones will be dropped from the index.
    Bits liveDocs = reader.getLiveDocs();
    List<Integer> values = new ArrayList<Integer>();
    for (int i = 0; i < reader.maxDoc(); i++) {
      if (liveDocs == null || liveDocs.get(i)) {
        values.add(Integer.valueOf(reader.document(i).get(ID_FIELD)));
      }
    }
    // pick a random sorter and compute the expected value order accordingly
    Sorter sorter = SORTERS[random().nextInt(SORTERS.length)];
    if (sorter == Sorter.REVERSE_DOCS) {
      Collections.reverse(values);
    } else {
      Collections.sort(values);
    }
    sortedValues = values.toArray(new Integer[values.size()]);
    if (VERBOSE) {
      // print the List, not the array: an Integer[] would print as
      // "[Ljava.lang.Integer;@..." instead of its contents
      System.out.println("sortedValues: " + values);
      System.out.println("Sorter: " + sorter);
    }

    // sort the index by adding the sorting reader to a fresh directory
    Directory target = newDirectory();
    IndexWriter writer = new IndexWriter(target, newIndexWriterConfig(TEST_VERSION_CURRENT, null));
    reader = new SortingAtomicReader(reader, sorter);
    writer.addIndexes(reader);
    writer.close();
    reader.close();
    dir.close();

    // CheckIndex the target directory
    dir = target;
    _TestUtil.checkIndex(dir);

    // set reader for tests
    reader = SlowCompositeReaderWrapper.wrap(DirectoryReader.open(dir));
    assertFalse("index should not have deletions", reader.hasDeletions());
  }

}

View File

@ -0,0 +1,377 @@
package org.apache.lucene.index.sorter;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.List;
import java.util.Random;
import org.apache.lucene.analysis.MockAnalyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.tokenattributes.PayloadAttribute;
import org.apache.lucene.document.BinaryDocValuesField;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.Field.Store;
import org.apache.lucene.document.FieldType;
import org.apache.lucene.document.NumericDocValuesField;
import org.apache.lucene.document.SortedDocValuesField;
import org.apache.lucene.document.SortedSetDocValuesField;
import org.apache.lucene.document.StringField;
import org.apache.lucene.document.TextField;
import org.apache.lucene.index.AtomicReader;
import org.apache.lucene.index.AtomicReaderContext;
import org.apache.lucene.index.BinaryDocValues;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.DocsAndPositionsEnum;
import org.apache.lucene.index.DocsEnum;
import org.apache.lucene.index.FieldInfo.IndexOptions;
import org.apache.lucene.index.FieldInvertState;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.NumericDocValues;
import org.apache.lucene.index.RandomIndexWriter;
import org.apache.lucene.index.SlowCompositeReaderWrapper;
import org.apache.lucene.index.SortedDocValues;
import org.apache.lucene.index.SortedSetDocValues;
import org.apache.lucene.index.Term;
import org.apache.lucene.index.Terms;
import org.apache.lucene.search.CollectionStatistics;
import org.apache.lucene.search.DocIdSetIterator;
import org.apache.lucene.search.TermStatistics;
import org.apache.lucene.search.similarities.Similarity;
import org.apache.lucene.store.Directory;
import org.apache.lucene.util.Bits;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.LuceneTestCase;
import org.apache.lucene.util._TestUtil;
import org.junit.AfterClass;
import org.junit.BeforeClass;
import org.junit.Test;
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
public abstract class SorterTestBase extends LuceneTestCase {
static final class NormsSimilarity extends Similarity {
private final Similarity in;
public NormsSimilarity(Similarity in) {
this.in = in;
}
@Override
public long computeNorm(FieldInvertState state) {
if (state.getName().equals(NORMS_FIELD)) {
return Float.floatToIntBits(state.getBoost());
} else {
return in.computeNorm(state);
}
}
@Override
public SimWeight computeWeight(float queryBoost, CollectionStatistics collectionStats, TermStatistics... termStats) {
return in.computeWeight(queryBoost, collectionStats, termStats);
}
@Override
public ExactSimScorer exactSimScorer(SimWeight weight, AtomicReaderContext context) throws IOException {
return in.exactSimScorer(weight, context);
}
@Override
public SloppySimScorer sloppySimScorer(SimWeight weight, AtomicReaderContext context) throws IOException {
return in.sloppySimScorer(weight, context);
}
}
  /**
   * A {@link TokenStream} which emits the same term ({@code DOC_POSITIONS_TERM})
   * {@code id/10 + 1} times per document, with the remaining count encoded in
   * each token's payload and the token index used as both start and end offset.
   */
  static final class PositionsTokenStream extends TokenStream {
    private final CharTermAttribute term;
    private final PayloadAttribute payload;
    private final OffsetAttribute offset;
    // pos: tokens left to emit for the current document; off: current token index
    private int pos, off;
    public PositionsTokenStream() {
      term = addAttribute(CharTermAttribute.class);
      // the term text is set once and reused for every token
      term.append(DOC_POSITIONS_TERM);
      payload = addAttribute(PayloadAttribute.class);
      offset = addAttribute(OffsetAttribute.class);
    }
    @Override
    public boolean incrementToken() throws IOException {
      if (pos == 0) {
        return false; // exhausted for this document
      }
      // payload carries the countdown value as a string
      payload.setPayload(new BytesRef(Integer.toString(pos)));
      // start == end offset by design; the test only checks positions/payloads
      offset.setOffset(off, off);
      --pos;
      ++off;
      return true;
    }
    void setId(int id) {
      // prepare the stream for a new document: id/10 + 1 tokens, offsets from 0
      pos = id / 10 + 1;
      off = 0;
    }
  }
  // Field names used throughout the sorter tests; each exercises a different
  // index feature (postings, positions, doc values, norms, term vectors).
  protected static final String ID_FIELD = "id";
  protected static final String DOCS_ENUM_FIELD = "docs";
  protected static final String DOCS_ENUM_TERM = "$all$";
  protected static final String DOC_POSITIONS_FIELD = "positions";
  protected static final String DOC_POSITIONS_TERM = "$all$";
  protected static final String NUMERIC_DV_FIELD = "numeric";
  protected static final String NORMS_FIELD = "norm";
  protected static final String BINARY_DV_FIELD = "binary";
  protected static final String SORTED_DV_FIELD = "sorted";
  protected static final String SORTED_SET_DV_FIELD = "sorted_set";
  protected static final String TERM_VECTORS_FIELD = "term_vectors";

  // Indexed, not stored, with term vectors enabled.
  private static final FieldType TERM_VECTORS_TYPE = new FieldType(TextField.TYPE_NOT_STORED);
  static {
    TERM_VECTORS_TYPE.setStoreTermVectors(true);
    TERM_VECTORS_TYPE.freeze();
  }

  // Indexed with full positions and offsets (used when the codec supports offsets).
  private static final FieldType POSITIONS_TYPE = new FieldType(TextField.TYPE_NOT_STORED);
  static {
    POSITIONS_TYPE.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS);
    POSITIONS_TYPE.freeze();
  }

  // Shared test state, initialized by subclasses' @BeforeClass methods.
  protected static Directory dir;
  protected static AtomicReader reader;
  protected static Integer[] sortedValues;
/**
 * Builds a document for the given id, populating every field the tests verify:
 * stored id, docs-enum term, positions stream, numeric/binary/sorted/sorted-set
 * DocValues, a norms field whose boost encodes the id bits, and term vectors.
 */
private static Document doc(final int id, PositionsTokenStream positions) {
  final String idString = Integer.toString(id);
  final Document document = new Document();
  document.add(new StringField(ID_FIELD, idString, Store.YES));
  document.add(new StringField(DOCS_ENUM_FIELD, DOCS_ENUM_TERM, Store.NO));
  positions.setId(id);
  if (doesntSupportOffsets.contains(_TestUtil.getPostingsFormat(DOC_POSITIONS_FIELD))) {
    // the codec cannot index offsets, so only positions are indexed for this field
    document.add(new Field(DOC_POSITIONS_FIELD, positions, TextField.TYPE_NOT_STORED));
  } else {
    document.add(new Field(DOC_POSITIONS_FIELD, positions, POSITIONS_TYPE));
  }
  document.add(new NumericDocValuesField(NUMERIC_DV_FIELD, id));
  TextField norms = new TextField(NORMS_FIELD, idString, Store.NO);
  // the boost smuggles the raw id bits into the norm; testNormValues expects
  // the norm to decode back to the id (presumably via NormsSimilarity — not visible here)
  norms.setBoost(Float.intBitsToFloat(id));
  document.add(norms);
  document.add(new BinaryDocValuesField(BINARY_DV_FIELD, new BytesRef(idString)));
  document.add(new SortedDocValuesField(SORTED_DV_FIELD, new BytesRef(idString)));
  if (defaultCodecSupportsSortedSet()) {
    // two values per doc: id and id + 1 (verified in testSortedSetDocValuesField)
    document.add(new SortedSetDocValuesField(SORTED_SET_DV_FIELD, new BytesRef(idString)));
    document.add(new SortedSetDocValuesField(SORTED_SET_DV_FIELD, new BytesRef(Integer.toString(id + 1))));
  }
  document.add(new Field(TERM_VECTORS_FIELD, idString, TERM_VECTORS_TYPE));
  return document;
}
/**
 * Creates an index for sorting: {@code numDocs} documents with ids
 * {@code 0, 10, 20, ...} are indexed in shuffled order across several small
 * segments, then roughly 20% of them are deleted after a commit.
 */
public static void createIndex(Directory dir, int numDocs, Random random) throws IOException {
  List<Integer> docIds = new ArrayList<Integer>();
  for (int d = 0; d < numDocs; d++) {
    docIds.add(Integer.valueOf(d * 10));
  }
  // index the ids in random order
  Collections.shuffle(docIds, random);
  if (VERBOSE) {
    System.out.println("Shuffled IDs for indexing: " + Arrays.toString(docIds.toArray()));
  }
  PositionsTokenStream positions = new PositionsTokenStream();
  IndexWriterConfig iwc = newIndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(random));
  iwc.setMaxBufferedDocs(4); // create some segments
  iwc.setSimilarity(new NormsSimilarity(iwc.getSimilarity())); // for testing norms field
  RandomIndexWriter writer = new RandomIndexWriter(random, dir, iwc);
  writer.setDoRandomForceMerge(false);
  for (int id : docIds) {
    writer.addDocument(doc(id, positions));
  }
  // commit, then delete each doc with probability 0.2 so tests see gaps/liveDocs
  writer.commit();
  for (Integer id : docIds) {
    if (random.nextDouble() < 0.2) {
      if (VERBOSE) {
        System.out.println("delete doc_id " + id);
      }
      writer.deleteDocuments(new Term(ID_FIELD, id.toString()));
    }
  }
  writer.close();
}
/** Builds the shared test index once per class; released in afterClassSorterTestBase(). */
@BeforeClass
public static void beforeClassSorterTestBase() throws Exception {
dir = newDirectory();
int numDocs = atLeast(20);
createIndex(dir, numDocs, random());
// wrap the (multi-segment) index so tests can treat it as a single AtomicReader
reader = new SlowCompositeReaderWrapper(DirectoryReader.open(dir));
}
/** Closes the reader before the directory (order matters: the reader uses the directory). */
@AfterClass
public static void afterClassSorterTestBase() throws Exception {
reader.close();
dir.close();
}
/** Verifies that binary DocValues follow the documents to their sorted positions. */
@Test
public void testBinaryDocValuesField() throws Exception {
  BinaryDocValues dv = reader.getBinaryDocValues(BINARY_DV_FIELD);
  BytesRef bytes = new BytesRef();
  // hoist the loop-invariant maxDoc() call, consistent with the sibling DV tests
  int maxDoc = reader.maxDoc();
  for (int i = 0; i < maxDoc; i++) {
    dv.get(i, bytes);
    assertEquals("incorrect binary DocValues for doc " + i, sortedValues[i].toString(), bytes.utf8ToString());
  }
}
/**
 * Verifies positions, offsets and payloads after sorting, iterating once with
 * {@code nextDoc()} and once with {@code advance()}. The per-document checks
 * were duplicated verbatim in both loops; they are now shared in
 * {@link #assertPositionsForDoc}.
 */
@Test
public void testDocsAndPositionsEnum() throws Exception {
  Term term = new Term(DOC_POSITIONS_FIELD, DOC_POSITIONS_TERM);
  // test nextDoc()
  DocsAndPositionsEnum sortedPositions = reader.termPositionsEnum(term);
  int doc;
  while ((doc = sortedPositions.nextDoc()) != DocIdSetIterator.NO_MORE_DOCS) {
    assertPositionsForDoc(doc, sortedPositions);
  }
  // test advance()
  sortedPositions = reader.termPositionsEnum(term);
  doc = 0;
  while ((doc = sortedPositions.advance(doc)) != DocIdSetIterator.NO_MORE_DOCS) {
    assertPositionsForDoc(doc, sortedPositions);
  }
}

/**
 * Checks freq, positions, offsets (when the codec supports them) and payloads
 * of the enum's current document against the document's sorted id.
 */
private void assertPositionsForDoc(int doc, DocsAndPositionsEnum sortedPositions) throws IOException {
  int freq = sortedPositions.freq();
  // PositionsTokenStream emits id / 10 + 1 tokens per document
  assertEquals("incorrect freq for doc=" + doc, sortedValues[doc].intValue() / 10 + 1, freq);
  for (int i = 0; i < freq; i++) {
    assertEquals("incorrect position for doc=" + doc, i, sortedPositions.nextPosition());
    if (!doesntSupportOffsets.contains(_TestUtil.getPostingsFormat(DOC_POSITIONS_FIELD))) {
      assertEquals("incorrect startOffset for doc=" + doc, i, sortedPositions.startOffset());
      assertEquals("incorrect endOffset for doc=" + doc, i, sortedPositions.endOffset());
    }
    // the payload carries the countdown value freq, freq-1, ..., 1
    assertEquals("incorrect payload for doc=" + doc, freq - i, Integer.parseInt(sortedPositions.getPayload().utf8ToString()));
  }
}
/**
 * Verifies that a DocsEnum over the all-docs term visits exactly the live
 * documents in sorted order, both via {@code nextDoc()} and {@code advance()}.
 * The original duplicated the verification loop and, in the gap check, called
 * {@code mappedLiveDocs.get(prev)} without the null guard used elsewhere —
 * a doc-id gap with {@code liveDocs == null} would NPE instead of failing with
 * a message. Both issues are fixed via the shared {@link #assertLiveDoc} helper.
 */
@Test
public void testDocsEnum() throws Exception {
  Term term = new Term(DOCS_ENUM_FIELD, DOCS_ENUM_TERM);
  Bits mappedLiveDocs = reader.getLiveDocs();
  // test nextDoc()
  DocsEnum docs = reader.termDocsEnum(term);
  int doc;
  int prev = -1;
  while ((doc = docs.nextDoc()) != DocIdSetIterator.NO_MORE_DOCS) {
    prev = assertLiveDoc(doc, prev, mappedLiveDocs);
  }
  // test advance()
  docs = reader.termDocsEnum(term);
  doc = 0;
  prev = -1;
  while ((doc = docs.advance(doc)) != DocIdSetIterator.NO_MORE_DOCS) {
    prev = assertLiveDoc(doc, prev, mappedLiveDocs);
  }
}

/**
 * Asserts that {@code doc} is live and carries the expected id, and that every
 * doc skipped since {@code prev} is marked deleted. Returns {@code doc} as the
 * new "previous" position.
 */
private int assertLiveDoc(int doc, int prev, Bits mappedLiveDocs) throws IOException {
  if (mappedLiveDocs != null) {
    assertTrue("document " + doc + " marked as deleted", mappedLiveDocs.get(doc));
  }
  assertEquals("incorrect value; doc " + doc, sortedValues[doc].intValue(), Integer.parseInt(reader.document(doc).get(ID_FIELD)));
  while (++prev < doc) {
    // a gap in the enum means prev was deleted; if the reader reports no
    // deletions at all, a gap is itself a failure (not an NPE)
    assertNotNull("document " + prev + " is missing but the reader reports no deletions", mappedLiveDocs);
    assertFalse("document " + prev + " not marked as deleted", mappedLiveDocs.get(prev));
  }
  return doc;
}
/** Verifies that norms (which encode the id, see doc()) follow the sorted documents. */
@Test
public void testNormValues() throws Exception {
  final NumericDocValues norms = reader.getNormValues(NORMS_FIELD);
  final int numDocs = reader.maxDoc();
  for (int docId = 0; docId < numDocs; docId++) {
    assertEquals("incorrect norm value for doc " + docId, sortedValues[docId].intValue(), norms.get(docId));
  }
}
/** Verifies that numeric DocValues follow the documents to their sorted positions. */
@Test
public void testNumericDocValuesField() throws Exception {
  final NumericDocValues values = reader.getNumericDocValues(NUMERIC_DV_FIELD);
  final int numDocs = reader.maxDoc();
  for (int docId = 0; docId < numDocs; docId++) {
    assertEquals("incorrect numeric DocValues for doc " + docId, sortedValues[docId].intValue(), values.get(docId));
  }
}
/** Verifies that sorted DocValues follow the documents to their sorted positions. */
@Test
public void testSortedDocValuesField() throws Exception {
  final SortedDocValues sorted = reader.getSortedDocValues(SORTED_DV_FIELD);
  final int numDocs = reader.maxDoc();
  final BytesRef scratch = new BytesRef();
  for (int docId = 0; docId < numDocs; docId++) {
    sorted.get(docId, scratch);
    assertEquals("incorrect sorted DocValues for doc " + docId, sortedValues[docId].toString(), scratch.utf8ToString());
  }
}
/**
 * Verifies that sorted-set DocValues follow the documents: each doc carries
 * exactly two values, its id and id + 1, in ord order (see doc()).
 */
@Test
public void testSortedSetDocValuesField() throws Exception {
  assumeTrue("default codec does not support SORTED_SET", defaultCodecSupportsSortedSet());
  final SortedSetDocValues sortedSet = reader.getSortedSetDocValues(SORTED_SET_DV_FIELD);
  final int numDocs = reader.maxDoc();
  final BytesRef scratch = new BytesRef();
  for (int docId = 0; docId < numDocs; docId++) {
    sortedSet.setDocument(docId);
    final int expected = sortedValues[docId].intValue();
    // first ord: the doc's id
    sortedSet.lookupOrd(sortedSet.nextOrd(), scratch);
    assertEquals("incorrect sorted-set DocValues for doc " + docId, Integer.valueOf(expected).toString(), scratch.utf8ToString());
    // second ord: id + 1
    sortedSet.lookupOrd(sortedSet.nextOrd(), scratch);
    assertEquals("incorrect sorted-set DocValues for doc " + docId, Integer.valueOf(expected + 1).toString(), scratch.utf8ToString());
    // and nothing more
    assertEquals(SortedSetDocValues.NO_MORE_ORDS, sortedSet.nextOrd());
  }
}
/** Verifies that term vectors follow the documents to their sorted positions. */
@Test
public void testTermVectors() throws Exception {
  final int numDocs = reader.maxDoc();
  for (int docId = 0; docId < numDocs; docId++) {
    final Terms vector = reader.getTermVector(docId, TERM_VECTORS_FIELD);
    assertNotNull("term vectors not found for doc " + docId + " field [" + TERM_VECTORS_FIELD + "]", vector);
    // the vector holds a single term: the document's id
    assertEquals("incorrect term vector for doc " + docId, sortedValues[docId].toString(), vector.iterator(null).next().utf8ToString());
  }
}
}

View File

@ -0,0 +1,75 @@
package org.apache.lucene.index.sorter;
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.IOException;
import java.util.Arrays;
import java.util.Collections;
import org.apache.lucene.index.AtomicReader;
import org.apache.lucene.util.Bits;
import org.apache.lucene.util._TestUtil;
import org.junit.BeforeClass;
/**
 * Runs all {@code SorterTestBase} tests against a {@code SortingAtomicReader}
 * that sorts the base index by the numeric value of its {@code ID_FIELD}.
 */
public class SortingAtomicReaderTest extends SorterTestBase {
/**
 * Wraps the base reader with a SortingAtomicReader and records the expected
 * id per sorted docID in {@code sortedValues} for the inherited tests.
 */
@BeforeClass
public static void beforeClassSortingAtomicReaderTest() throws Exception {
// build the mapping from the reader, since we deleted documents, some of
// them might have disappeared from the index (e.g. if an entire segment is
// dropped b/c all its docs are deleted)
Integer[] values = new Integer[reader.maxDoc()];
int[] docs = new int[reader.maxDoc()];
for (int i = 0; i < reader.maxDoc(); i++) {
docs[i] = i;
values[i] = Integer.valueOf(reader.document(i).get(ID_FIELD));
}
final int[] oldToNew = Sorter.compute(docs, Collections.unmodifiableList(Arrays.asList(values)));
// Sorter.compute also sorts the values
// NOTE: because 'values' is now sorted, sortedValues[oldToNew[i]] = values[i]
// effectively re-scatters the sorted ids so sortedValues is indexed by NEW docID
sortedValues = new Integer[reader.maxDoc()];
for (int i = 0; i < reader.maxDoc(); ++i) {
sortedValues[oldToNew[i]] = values[i];
}
if (VERBOSE) {
System.out.println("oldToNew: " + Arrays.toString(oldToNew));
System.out.println("sortedValues: " + Arrays.toString(sortedValues));
}
// wrap with a Sorter that simply returns the precomputed mapping
reader = new SortingAtomicReader(reader, new Sorter() {
@Override
public int[] oldToNew(AtomicReader reader) throws IOException {
return oldToNew;
}
});
if (VERBOSE) {
System.out.print("mapped-deleted-docs: ");
Bits mappedLiveDocs = reader.getLiveDocs();
for (int i = 0; i < mappedLiveDocs.length(); i++) {
if (!mappedLiveDocs.get(i)) {
System.out.print(i + " ");
}
}
System.out.println();
}
// sanity-check the wrapped reader (postings, norms, docvalues consistency)
_TestUtil.checkReader(reader);
}
}