mirror of
https://github.com/apache/lucene.git
synced 2025-02-17 15:35:20 +00:00
LUCENE-3918: port IndexSorter to trunk API
git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1454801 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent
ff13a11930
commit
389bed49f8
@ -31,6 +31,11 @@ New Features
|
||||
|
||||
* LUCENE-4815: DrillSideways now allows more than one FacetRequest per
|
||||
dimension (Mike McCandless)
|
||||
|
||||
* LUCENE-3918: IndexSorter has been ported to 4.3 API and now supports
|
||||
sorting documents by a numeric DocValues field, or reverse the order of
|
||||
the documents in the index. Additionally, apps can implement their own
|
||||
sort criteria. (Anat Hashavit, Shai Erera)
|
||||
|
||||
* LUCENE-4817: Added KeywordRepeatFilter that allows to emit a token twice
|
||||
once as a keyword and once as an ordinary token allow stemmers to emit
|
||||
|
@ -0,0 +1,65 @@
|
||||
package org.apache.lucene.index.sorter;
|
||||
|
||||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
import java.io.IOException;
|
||||
import java.util.AbstractList;
|
||||
import java.util.List;
|
||||
|
||||
import org.apache.lucene.index.AtomicReader;
|
||||
import org.apache.lucene.index.NumericDocValues;
|
||||
|
||||
/**
|
||||
* A {@link Sorter} which sorts documents according to their
|
||||
* {@link NumericDocValues}.
|
||||
*
|
||||
* @lucene.experimental
|
||||
*/
|
||||
public class NumericDocValuesSorter extends Sorter {
|
||||
|
||||
private final String fieldName;
|
||||
|
||||
public NumericDocValuesSorter(final String fieldName) {
|
||||
this.fieldName = fieldName;
|
||||
}
|
||||
|
||||
@Override
|
||||
public int[] oldToNew(final AtomicReader reader) throws IOException {
|
||||
final NumericDocValues ndv = reader.getNumericDocValues(fieldName);
|
||||
final int maxDoc = reader.maxDoc();
|
||||
final int[] docs = new int[maxDoc];
|
||||
final List<Long> values = new AbstractList<Long>() {
|
||||
|
||||
@Override
|
||||
public Long get(int doc) {
|
||||
return ndv.get(doc);
|
||||
}
|
||||
|
||||
@Override
|
||||
public int size() {
|
||||
return reader.maxDoc();
|
||||
}
|
||||
|
||||
};
|
||||
for (int i = 0; i < maxDoc; i++) {
|
||||
docs[i] = i;
|
||||
}
|
||||
return compute(docs, values);
|
||||
}
|
||||
|
||||
}
|
106
lucene/misc/src/java/org/apache/lucene/index/sorter/Sorter.java
Normal file
106
lucene/misc/src/java/org/apache/lucene/index/sorter/Sorter.java
Normal file
@ -0,0 +1,106 @@
|
||||
package org.apache.lucene.index.sorter;
|
||||
|
||||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
import java.io.IOException;
|
||||
import java.util.List;
|
||||
|
||||
import org.apache.lucene.index.AtomicReader;
|
||||
import org.apache.lucene.util.SorterTemplate;
|
||||
|
||||
/**
|
||||
* Sorts documents in a given index by returning a permutation on the docs.
|
||||
* Implementations can call {@link #compute(int[], List)} to compute the
|
||||
* old-to-new permutation over the given documents and values.
|
||||
*
|
||||
* @lucene.experimental
|
||||
*/
|
||||
public abstract class Sorter {
  
  /** Sorts documents in reverse order. */
  public static final Sorter REVERSE_DOCS = new Sorter() {
    @Override
    public int[] oldToNew(final AtomicReader reader) throws IOException {
      final int maxDoc = reader.maxDoc();
      int[] reverseDocs = new int[maxDoc];
      for (int i = 0; i < maxDoc; i++) {
        reverseDocs[i] = maxDoc - (i + 1); // old doc i maps to slot maxDoc-1-i
      }
      return reverseDocs;
    }
  };
  
  /**
   * A {@link SorterTemplate} which sorts the {@code docs} array by comparing
   * the value associated with each entry. Note that values are addressed by
   * the doc ID stored in {@code docs[i]}, not by the slot index {@code i}.
   */
  private static final class DocValueSorterTemplate<T extends Comparable<? super T>> extends SorterTemplate {
    
    private final int[] docs;
    private final List<T> values;
    
    // current pivot value for quicksort partitioning
    private T pivot;
    
    public DocValueSorterTemplate(int[] docs, List<T> values) {
      this.docs = docs;
      this.values = values;
    }
    
    @Override
    protected int compare(int i, int j) {
      return values.get(docs[i]).compareTo(values.get(docs[j]));
    }
    
    @Override
    protected int comparePivot(int j) {
      return pivot.compareTo(values.get(docs[j]));
    }
    
    @Override
    protected void setPivot(int i) {
      pivot = values.get(docs[i]);
    }
    
    @Override
    protected void swap(int i, int j) {
      int tmpDoc = docs[i];
      docs[i] = docs[j];
      docs[j] = tmpDoc;
    }
  }
  
  /**
   * Computes the old-to-new permutation over the given documents and values.
   * The {@code docs} array is sorted in place by value, then inverted so that
   * {@code result[oldDoc] == newDoc}.
   * <p>
   * NOTE: quickSort is not stable, so documents with equal values may end up
   * in any relative order.
   */
  protected static <T extends Comparable<? super T>> int[] compute(int[] docs, List<T> values) {
    SorterTemplate sorter = new DocValueSorterTemplate<T>(docs, values);
    sorter.quickSort(0, docs.length - 1);
    
    // docs[] now holds old doc IDs in sorted order; invert into old-to-new.
    final int[] oldToNew = new int[docs.length];
    for (int i = 0; i < docs.length; i++) {
      oldToNew[docs[i]] = i;
    }
    return oldToNew;
  }
  
  /**
   * Returns a mapping from the old document ID to its new location in the
   * sorted index. Implementations can use the auxiliary
   * {@link #compute(int[], List)} to compute the old-to-new permutation
   * given an array of documents and their corresponding values.
   * <p>
   * <b>NOTE:</b> deleted documents are expected to appear in the mapping as
   * well, they will however be dropped when the index is actually sorted.
   */
  public abstract int[] oldToNew(AtomicReader reader) throws IOException;
  
}
|
@ -0,0 +1,652 @@
|
||||
package org.apache.lucene.index.sorter;
|
||||
|
||||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
import java.io.IOException;
|
||||
import java.util.Arrays;
|
||||
|
||||
import org.apache.lucene.index.AtomicReader;
|
||||
import org.apache.lucene.index.BinaryDocValues;
|
||||
import org.apache.lucene.index.DocsAndPositionsEnum;
|
||||
import org.apache.lucene.index.DocsEnum;
|
||||
import org.apache.lucene.index.FieldInfo.IndexOptions;
|
||||
import org.apache.lucene.index.FieldInfos;
|
||||
import org.apache.lucene.index.Fields;
|
||||
import org.apache.lucene.index.FilterAtomicReader;
|
||||
import org.apache.lucene.index.NumericDocValues;
|
||||
import org.apache.lucene.index.SortedDocValues;
|
||||
import org.apache.lucene.index.SortedSetDocValues;
|
||||
import org.apache.lucene.index.StoredFieldVisitor;
|
||||
import org.apache.lucene.index.Terms;
|
||||
import org.apache.lucene.index.TermsEnum;
|
||||
import org.apache.lucene.search.DocIdSetIterator;
|
||||
import org.apache.lucene.store.IndexInput;
|
||||
import org.apache.lucene.store.IndexOutput;
|
||||
import org.apache.lucene.store.RAMFile;
|
||||
import org.apache.lucene.store.RAMInputStream;
|
||||
import org.apache.lucene.store.RAMOutputStream;
|
||||
import org.apache.lucene.util.ArrayUtil;
|
||||
import org.apache.lucene.util.Bits;
|
||||
import org.apache.lucene.util.BytesRef;
|
||||
import org.apache.lucene.util.FixedBitSet;
|
||||
import org.apache.lucene.util.SorterTemplate;
|
||||
|
||||
/**
|
||||
* An {@link AtomicReader} which supports sorting documents by a given
|
||||
* {@link Sorter}. You can use this class to sort an index as follows:
|
||||
*
|
||||
* <pre class="prettyprint">
|
||||
* IndexWriter writer; // writer to which the sorted index will be added
|
||||
* DirectoryReader reader; // reader on the input index
|
||||
* Sorter sorter; // determines how the documents are sorted
|
||||
* AtomicReader sortingReader = new SortingAtomicReader(reader, sorter);
|
||||
 * writer.addIndexes(sortingReader);
|
||||
* writer.close();
|
||||
* reader.close();
|
||||
* </pre>
|
||||
*
|
||||
* @lucene.experimental
|
||||
*/
|
||||
public class SortingAtomicReader extends FilterAtomicReader {
|
||||
|
||||
  /**
   * A {@link Fields} view whose per-field {@link Terms} remap doc IDs to the
   * sorted order.
   */
  private static class SortingFields extends FilterFields {
    
    private final int[] old2new;   // old docID -> new docID permutation
    private final Bits inLiveDocs; // live docs of the wrapped (unsorted) reader
    private final FieldInfos infos;
    
    public SortingFields(final Fields in, final Bits inLiveDocs, FieldInfos infos, final int[] old2new) {
      super(in);
      this.old2new = old2new;
      this.inLiveDocs = inLiveDocs;
      this.infos = infos;
    }
    
    @Override
    public Terms terms(final String field) throws IOException {
      Terms terms = in.terms(field);
      if (terms == null) {
        return null;
      } else {
        // the field's index options determine whether freqs/offsets are
        // decoded downstream
        return new SortingTerms(terms, inLiveDocs, infos.fieldInfo(field).getIndexOptions(), old2new);
      }
    }
    
  }
|
||||
|
||||
  /** {@link Terms} whose {@link TermsEnum} iterates documents in the sorted order. */
  private static class SortingTerms extends FilterTerms {
    
    private final int[] old2new;   // old docID -> new docID permutation
    private final Bits inLiveDocs; // live docs of the wrapped (unsorted) reader
    private final IndexOptions indexOptions;
    
    public SortingTerms(final Terms in, final Bits inLiveDocs, IndexOptions indexOptions, final int[] old2new) {
      super(in);
      this.old2new = old2new;
      this.inLiveDocs = inLiveDocs;
      this.indexOptions = indexOptions;
    }
    
    @Override
    public TermsEnum iterator(final TermsEnum reuse) throws IOException {
      return new SortingTermsEnum(in.iterator(reuse), inLiveDocs, old2new, indexOptions);
    }
    
  }
|
||||
|
||||
  /**
   * A {@link TermsEnum} whose returned {@link DocsEnum} /
   * {@link DocsAndPositionsEnum} iterate documents in the sorted order.
   */
  private static class SortingTermsEnum extends FilterTermsEnum {
    
    private final int[] old2new;   // old docID -> new docID permutation
    private final Bits inLiveDocs; // live docs of the wrapped (unsorted) reader
    private final IndexOptions indexOptions;
    
    public SortingTermsEnum(final TermsEnum in, final Bits inLiveDocs, final int[] old2new, IndexOptions indexOptions) {
      super(in);
      this.old2new = old2new;
      this.inLiveDocs = inLiveDocs;
      this.indexOptions = indexOptions;
    }
    
    @Override
    public DocsEnum docs(Bits liveDocs, DocsEnum reuse, final int flags) throws IOException {
      // The caller's bits are in sorted order, but the wrapped enum walks
      // documents in the original order — substitute the wrapped reader's own
      // live docs so deleted docs are filtered in the correct coordinate space.
      if (liveDocs != null) {
        liveDocs = inLiveDocs;
      }
      
      // if we're asked to reuse the given DocsEnum and it is Sorting, return
      // the wrapped one, since some Codecs expect it.
      if (reuse != null && reuse instanceof SortingDocsEnum) {
        reuse = ((SortingDocsEnum) reuse).getWrapped();
      }
      // only buffer freqs when the field indexed them AND the caller asked
      boolean withFreqs = indexOptions.compareTo(IndexOptions.DOCS_AND_FREQS) >=0 && (flags & DocsEnum.FLAG_FREQS) != 0;
      return new SortingDocsEnum(in.docs(liveDocs, reuse, flags), withFreqs, old2new);
    }
    
    @Override
    public DocsAndPositionsEnum docsAndPositions(Bits liveDocs, DocsAndPositionsEnum reuse, final int flags) throws IOException {
      // same coordinate-space substitution as in docs() above
      if (liveDocs != null) {
        liveDocs = inLiveDocs;
      }
      
      // if we're asked to reuse the given DocsAndPositionsEnum and it is
      // Sorting, return the wrapped one, since some Codecs expect it.
      if (reuse != null && reuse instanceof SortingDocsAndPositionsEnum) {
        reuse = ((SortingDocsAndPositionsEnum) reuse).getWrapped();
      }
      
      final DocsAndPositionsEnum positions = in.docsAndPositions(liveDocs, reuse, flags);
      if (positions == null) {
        return null;
      } else {
        // we ignore the fact that offsets may be stored but not asked for,
        // since this code is expected to be used during addIndexes which will
        // ask for everything. if that assumption changes in the future, we can
        // factor in whether 'flags' says offsets are not required.
        boolean storeOffsets = indexOptions.compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS) >= 0;
        return new SortingDocsAndPositionsEnum(positions, old2new, storeOffsets);
      }
    }
    
  }
|
||||
|
||||
private static class SortingBinaryDocValues extends BinaryDocValues {
|
||||
|
||||
private final BinaryDocValues in;
|
||||
private final int[] new2old;
|
||||
|
||||
SortingBinaryDocValues(BinaryDocValues in, int[] new2old) {
|
||||
this.in = in;
|
||||
this.new2old = new2old;
|
||||
}
|
||||
|
||||
@Override
|
||||
public void get(int docID, BytesRef result) {
|
||||
in.get(new2old[docID], result);
|
||||
}
|
||||
}
|
||||
|
||||
  /**
   * {@link NumericDocValues} view addressed by sorted doc IDs; lookups are
   * translated back to the wrapped reader's original doc IDs.
   */
  private static class SortingNumericDocValues extends NumericDocValues {
    
    private final NumericDocValues in;
    private final int[] new2old; // sorted docID -> original docID
    
    public SortingNumericDocValues(final NumericDocValues in, final int[] new2old) {
      this.in = in;
      this.new2old = new2old;
    }
    
    @Override
    public long get(int docID) {
      return in.get(new2old[docID]);
    }
  }
|
||||
|
||||
  /**
   * {@link SortedDocValues} view addressed by sorted doc IDs. Only
   * per-document lookups are remapped; ord-space methods delegate unchanged
   * since the term dictionary is unaffected by document reordering.
   */
  private static class SortingSortedDocValues extends SortedDocValues {
    
    private final SortedDocValues in;
    private final int[] new2old; // sorted docID -> original docID
    
    SortingSortedDocValues(SortedDocValues in, int[] new2old) {
      this.in = in;
      this.new2old = new2old;
    }
    
    @Override
    public int getOrd(int docID) {
      return in.getOrd(new2old[docID]);
    }
    
    @Override
    public void lookupOrd(int ord, BytesRef result) {
      // ord space is unchanged by sorting
      in.lookupOrd(ord, result);
    }
    
    @Override
    public int getValueCount() {
      return in.getValueCount();
    }
    
    @Override
    public void get(int docID, BytesRef result) {
      in.get(new2old[docID], result);
    }
    
    @Override
    public int lookupTerm(BytesRef key) {
      return in.lookupTerm(key);
    }
  }
|
||||
|
||||
  /**
   * {@link SortedSetDocValues} view addressed by sorted doc IDs. Only
   * {@link #setDocument(int)} is remapped; ord-space methods delegate
   * unchanged since the term dictionary is unaffected by reordering.
   */
  private static class SortingSortedSetDocValues extends SortedSetDocValues {
    
    private final SortedSetDocValues in;
    private final int[] new2old; // sorted docID -> original docID
    
    SortingSortedSetDocValues(SortedSetDocValues in, int[] new2old) {
      this.in = in;
      this.new2old = new2old;
    }
    
    @Override
    public long nextOrd() {
      return in.nextOrd();
    }
    
    @Override
    public void setDocument(int docID) {
      in.setDocument(new2old[docID]);
    }
    
    @Override
    public void lookupOrd(long ord, BytesRef result) {
      in.lookupOrd(ord, result);
    }
    
    @Override
    public long getValueCount() {
      return in.getValueCount();
    }
    
    @Override
    public long lookupTerm(BytesRef key) {
      return in.lookupTerm(key);
    }
  }
|
||||
|
||||
private static class SortingDocsEnum extends FilterDocsEnum {
|
||||
|
||||
private static final class DocFreqSorterTemplate extends SorterTemplate {
|
||||
|
||||
private final int[] docs;
|
||||
private final int[] freqs;
|
||||
|
||||
private int pivot;
|
||||
|
||||
public DocFreqSorterTemplate(int[] docs, int[] freqs) {
|
||||
this.docs = docs;
|
||||
this.freqs = freqs;
|
||||
}
|
||||
|
||||
@Override
|
||||
protected int compare(int i, int j) {
|
||||
return docs[i] - docs[j];
|
||||
}
|
||||
|
||||
@Override
|
||||
protected int comparePivot(int j) {
|
||||
return pivot - docs[j];
|
||||
}
|
||||
|
||||
@Override
|
||||
protected void setPivot(int i) {
|
||||
pivot = docs[i];
|
||||
}
|
||||
|
||||
@Override
|
||||
protected void swap(int i, int j) {
|
||||
int tmpDoc = docs[i];
|
||||
docs[i] = docs[j];
|
||||
docs[j] = tmpDoc;
|
||||
|
||||
int tmpFreq = freqs[i];
|
||||
freqs[i] = freqs[j];
|
||||
freqs[j] = tmpFreq;
|
||||
}
|
||||
}
|
||||
|
||||
private int[] docs = new int[64];
|
||||
private int[] freqs;
|
||||
private int docIt = -1;
|
||||
private final int upto;
|
||||
private final boolean withFreqs;
|
||||
|
||||
public SortingDocsEnum(final DocsEnum in, boolean withFreqs, final int[] old2new) throws IOException {
|
||||
super(in);
|
||||
this.withFreqs = withFreqs;
|
||||
int i = 0;
|
||||
int doc;
|
||||
if (withFreqs) {
|
||||
freqs = new int[docs.length];
|
||||
while ((doc = in.nextDoc()) != DocIdSetIterator.NO_MORE_DOCS){
|
||||
if (i >= docs.length) {
|
||||
docs = ArrayUtil.grow(docs, docs.length + 1);
|
||||
freqs = ArrayUtil.grow(freqs, freqs.length + 1);
|
||||
}
|
||||
docs[i] = old2new[doc];
|
||||
freqs[i] = in.freq();
|
||||
++i;
|
||||
}
|
||||
SorterTemplate sorter = new DocFreqSorterTemplate(docs, freqs);
|
||||
sorter.quickSort(0, i - 1);
|
||||
} else {
|
||||
freqs = null;
|
||||
while ((doc = in.nextDoc()) != DocIdSetIterator.NO_MORE_DOCS){
|
||||
if (i >= docs.length) {
|
||||
docs = ArrayUtil.grow(docs, docs.length + 1);
|
||||
}
|
||||
docs[i++] = old2new[doc];
|
||||
}
|
||||
Arrays.sort(docs, 0, i);
|
||||
}
|
||||
upto = i;
|
||||
}
|
||||
|
||||
@Override
|
||||
public int advance(final int target) throws IOException {
|
||||
// need to support it for checkIndex, but in practice it won't be called, so
|
||||
// don't bother to implement efficiently for now.
|
||||
while (nextDoc() < target) {}
|
||||
return docID();
|
||||
}
|
||||
|
||||
@Override
|
||||
public int docID() {
|
||||
return docIt >= upto ? NO_MORE_DOCS : docs[docIt];
|
||||
}
|
||||
|
||||
@Override
|
||||
public int freq() throws IOException {
|
||||
return withFreqs && docIt < upto ? freqs[docIt] : 1;
|
||||
}
|
||||
|
||||
@Override
|
||||
public int nextDoc() throws IOException {
|
||||
if (++docIt >= upto) return NO_MORE_DOCS;
|
||||
return docs[docIt];
|
||||
}
|
||||
|
||||
/** Returns the wrapped {@link DocsEnum}. */
|
||||
DocsEnum getWrapped() {
|
||||
return in;
|
||||
}
|
||||
}
|
||||
|
||||
private static class SortingDocsAndPositionsEnum extends FilterDocsAndPositionsEnum {
|
||||
|
||||
/**
|
||||
* A {@link SorterTemplate} which sorts two parallel arrays of doc IDs and
|
||||
* offsets in one go. Everytime a doc ID is 'swapped', its correponding offset
|
||||
* is swapped too.
|
||||
*/
|
||||
private static final class DocOffsetSorterTemplate extends SorterTemplate {
|
||||
|
||||
private final int[] docs;
|
||||
private final long[] offsets;
|
||||
|
||||
private int pivot;
|
||||
|
||||
public DocOffsetSorterTemplate(int[] docs, long[] offsets) {
|
||||
this.docs = docs;
|
||||
this.offsets = offsets;
|
||||
}
|
||||
|
||||
@Override
|
||||
protected int compare(int i, int j) {
|
||||
return docs[i] - docs[j];
|
||||
}
|
||||
|
||||
@Override
|
||||
protected int comparePivot(int j) {
|
||||
return pivot - docs[j];
|
||||
}
|
||||
|
||||
@Override
|
||||
protected void setPivot(int i) {
|
||||
pivot = docs[i];
|
||||
}
|
||||
|
||||
@Override
|
||||
protected void swap(int i, int j) {
|
||||
int tmpDoc = docs[i];
|
||||
docs[i] = docs[j];
|
||||
docs[j] = tmpDoc;
|
||||
|
||||
long tmpOffset = offsets[i];
|
||||
offsets[i] = offsets[j];
|
||||
offsets[j] = tmpOffset;
|
||||
}
|
||||
}
|
||||
|
||||
private int[] docs;
|
||||
private long[] offsets;
|
||||
private final int upto;
|
||||
|
||||
private final IndexInput postingInput;
|
||||
private final boolean storeOffsets;
|
||||
|
||||
private int docIt = -1;
|
||||
private int pos;
|
||||
private int startOffset = -1;
|
||||
private int endOffset = -1;
|
||||
private final BytesRef payload = new BytesRef(32);
|
||||
private int currFreq;
|
||||
|
||||
public SortingDocsAndPositionsEnum(final DocsAndPositionsEnum in, final int[] old2new, boolean storeOffsets) throws IOException {
|
||||
super(in);
|
||||
this.storeOffsets = storeOffsets;
|
||||
final RAMFile file = new RAMFile();
|
||||
final IndexOutput out = new RAMOutputStream(file);
|
||||
docs = new int[32];
|
||||
offsets = new long[32];
|
||||
int doc;
|
||||
int i = 0;
|
||||
while ((doc = in.nextDoc()) != DocIdSetIterator.NO_MORE_DOCS) {
|
||||
if (i == docs.length) {
|
||||
docs = ArrayUtil.grow(docs, i + 1);
|
||||
offsets = ArrayUtil.grow(offsets, i + 1);
|
||||
}
|
||||
docs[i] = old2new[doc];
|
||||
offsets[i] = out.getFilePointer();
|
||||
addPositions(in, out);
|
||||
i++;
|
||||
}
|
||||
upto = i;
|
||||
SorterTemplate sorter = new DocOffsetSorterTemplate(docs, offsets);
|
||||
sorter.quickSort(0, upto - 1);
|
||||
out.close();
|
||||
this.postingInput = new RAMInputStream("", file);
|
||||
}
|
||||
|
||||
private void addPositions(final DocsAndPositionsEnum in, final IndexOutput out) throws IOException {
|
||||
int freq = in.freq();
|
||||
out.writeVInt(freq);
|
||||
for (int i = 0; i < freq; i++) {
|
||||
final int pos = in.nextPosition();
|
||||
out.writeVInt(pos);
|
||||
if (storeOffsets) { // don't encode offsets if they are not stored
|
||||
out.writeVInt(in.startOffset());
|
||||
out.writeVInt(in.endOffset());
|
||||
}
|
||||
BytesRef payload = in.getPayload();
|
||||
if (payload != null) {
|
||||
out.writeVInt(payload.length);
|
||||
out.writeBytes(payload.bytes, payload.offset, payload.length);
|
||||
} else {
|
||||
out.writeVInt(0);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
public int advance(final int target) throws IOException {
|
||||
// need to support it for checkIndex, but in practice it won't be called, so
|
||||
// don't bother to implement efficiently for now.
|
||||
while (nextDoc() < target) {}
|
||||
return docID();
|
||||
}
|
||||
|
||||
@Override
|
||||
public int docID() {
|
||||
return docIt >= upto ? NO_MORE_DOCS : docs[docIt];
|
||||
}
|
||||
|
||||
@Override
|
||||
public int endOffset() throws IOException {
|
||||
return endOffset;
|
||||
}
|
||||
|
||||
@Override
|
||||
public int freq() throws IOException {
|
||||
return currFreq;
|
||||
}
|
||||
|
||||
@Override
|
||||
public BytesRef getPayload() throws IOException {
|
||||
return payload.length == 0 ? null : payload;
|
||||
}
|
||||
|
||||
@Override
|
||||
public int nextDoc() throws IOException {
|
||||
if (++docIt >= upto) return DocIdSetIterator.NO_MORE_DOCS;
|
||||
postingInput.seek(offsets[docIt]);
|
||||
currFreq = postingInput.readVInt();
|
||||
return docs[docIt];
|
||||
}
|
||||
|
||||
@Override
|
||||
public int nextPosition() throws IOException {
|
||||
pos = postingInput.readVInt();
|
||||
if (storeOffsets) {
|
||||
startOffset = postingInput.readVInt();
|
||||
endOffset = postingInput.readVInt();
|
||||
}
|
||||
int length = postingInput.readVInt();
|
||||
if (length > 0) {
|
||||
if (length >= payload.bytes.length) {
|
||||
payload.grow(length + 1);
|
||||
}
|
||||
postingInput.readBytes(payload.bytes, 0, length);
|
||||
}
|
||||
payload.length = length;
|
||||
return pos;
|
||||
}
|
||||
|
||||
@Override
|
||||
public int startOffset() throws IOException {
|
||||
return startOffset;
|
||||
}
|
||||
|
||||
/** Returns the wrapped {@link DocsAndPositionsEnum}. */
|
||||
DocsAndPositionsEnum getWrapped() {
|
||||
return in;
|
||||
}
|
||||
}
|
||||
|
||||
  private final int[] old2new, new2old;     // mutually inverse permutations
  private final FixedBitSet mappedLiveDocs; // live docs in *sorted* order, or null
  
  /**
   * Wraps {@code in} so that its documents appear in the order defined by
   * {@code sorter}.
   *
   * @throws IllegalArgumentException if the sorter's permutation does not
   *         cover every document of the reader (including deleted ones)
   */
  public SortingAtomicReader(final AtomicReader in, final Sorter sorter) throws IOException {
    super(in);
    old2new = sorter.oldToNew(in);
    if (old2new.length != in.maxDoc()) {
      throw new IllegalArgumentException("sorter should provide mapping for every document in the index, including deleted ones");
    }
    // invert old2new so doc-values / stored-fields lookups can map back
    new2old = new int[old2new.length];
    for (int i = 0; i < new2old.length; i++) {
      new2old[old2new[i]] = i;
    }
    
    if (!in.hasDeletions()) {
      mappedLiveDocs = null; // no deletions: all docs live
    } else {
      // start with all docs live, then clear the mapped position of every
      // document that is deleted in the wrapped reader
      mappedLiveDocs = new FixedBitSet(in.maxDoc());
      mappedLiveDocs.set(0, in.maxDoc());
      Bits liveDocs = in.getLiveDocs();
      int len = liveDocs.length();
      for (int i = 0; i < len; i++) {
        if (!liveDocs.get(i)) {
          mappedLiveDocs.clear(old2new[i]);
        }
      }
    }
  }
|
||||
|
||||
  @Override
  public void document(final int docID, final StoredFieldVisitor visitor) throws IOException {
    // stored fields live in the wrapped reader's order; translate first
    in.document(new2old[docID], visitor);
  }
|
||||
|
||||
@Override
|
||||
public Fields fields() throws IOException {
|
||||
Fields fields = in.fields();
|
||||
if (fields == null) {
|
||||
return null;
|
||||
} else {
|
||||
return new SortingFields(fields, in.getLiveDocs(), in.getFieldInfos(), old2new);
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
public BinaryDocValues getBinaryDocValues(String field) throws IOException {
|
||||
BinaryDocValues oldDocValues = in.getBinaryDocValues(field);
|
||||
if (oldDocValues == null) {
|
||||
return null;
|
||||
} else {
|
||||
return new SortingBinaryDocValues(oldDocValues, new2old);
|
||||
}
|
||||
}
|
||||
|
||||
  @Override
  public Bits getLiveDocs() {
    ensureOpen();
    // null when the wrapped reader has no deletions (all docs live)
    return mappedLiveDocs;
  }
|
||||
|
||||
@Override
|
||||
public NumericDocValues getNormValues(String field) throws IOException {
|
||||
final NumericDocValues norm = in.getNormValues(field);
|
||||
if (norm == null) {
|
||||
return null;
|
||||
} else {
|
||||
return new SortingNumericDocValues(norm, new2old);
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
public NumericDocValues getNumericDocValues(String field) throws IOException {
|
||||
final NumericDocValues oldDocValues = in.getNumericDocValues(field);
|
||||
if (oldDocValues == null) return null;
|
||||
return new SortingNumericDocValues(oldDocValues, new2old);
|
||||
}
|
||||
|
||||
@Override
|
||||
public SortedDocValues getSortedDocValues(String field) throws IOException {
|
||||
SortedDocValues sortedDV = in.getSortedDocValues(field);
|
||||
if (sortedDV == null) {
|
||||
return null;
|
||||
} else {
|
||||
return new SortingSortedDocValues(sortedDV, new2old);
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
public SortedSetDocValues getSortedSetDocValues(String field) throws IOException {
|
||||
SortedSetDocValues sortedSetDV = in.getSortedSetDocValues(field);
|
||||
if (sortedSetDV == null) {
|
||||
return null;
|
||||
} else {
|
||||
return new SortingSortedSetDocValues(sortedSetDV, new2old);
|
||||
}
|
||||
}
|
||||
|
||||
  @Override
  public Fields getTermVectors(final int docID) throws IOException {
    // term vectors are fetched by the wrapped reader's original doc ID
    return in.getTermVectors(new2old[docID]);
  }
|
||||
|
||||
}
|
@ -0,0 +1,27 @@
|
||||
<!doctype html public "-//w3c//dtd html 4.0 transitional//en">
|
||||
<!--
|
||||
Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
contributor license agreements. See the NOTICE file distributed with
|
||||
this work for additional information regarding copyright ownership.
|
||||
The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
(the "License"); you may not use this file except in compliance with
|
||||
the License. You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software
|
||||
distributed under the License is distributed on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
See the License for the specific language governing permissions and
|
||||
limitations under the License.
|
||||
-->
|
||||
<html>
|
||||
<body>
|
||||
Provides index sorting capabilities. The application can use one of the
|
||||
pre-existing Sorter implementations, e.g. to sort by a numeric
|
||||
DocValues or reverse the order of the documents. Additionally, the
|
||||
application can implement a Sorter which returns a permutation on
|
||||
a source Directory's document IDs, to sort the input documents by additional
|
||||
values.
|
||||
</body>
|
||||
</html>
|
@ -0,0 +1,79 @@
|
||||
package org.apache.lucene.index.sorter;
|
||||
|
||||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.Collections;
|
||||
import java.util.List;
|
||||
|
||||
import org.apache.lucene.index.DirectoryReader;
|
||||
import org.apache.lucene.index.IndexWriter;
|
||||
import org.apache.lucene.index.SlowCompositeReaderWrapper;
|
||||
import org.apache.lucene.store.Directory;
|
||||
import org.apache.lucene.util.Bits;
|
||||
import org.apache.lucene.util._TestUtil;
|
||||
import org.junit.BeforeClass;
|
||||
|
||||
public class IndexSortingTest extends SorterTestBase {
  
  // candidate sorters; one is picked at random per test run
  private static final Sorter[] SORTERS = new Sorter[] {
    new NumericDocValuesSorter(NUMERIC_DV_FIELD),
    Sorter.REVERSE_DOCS,
  };
  
  /**
   * Sorts the base index with a randomly chosen Sorter, writes it to a fresh
   * directory via addIndexes (which drops deleted docs), and re-points the
   * shared reader/dir at the sorted result for the inherited tests.
   */
  @BeforeClass
  public static void beforeClassSorterUtilTest() throws Exception {
    // only read the values of the undeleted documents, since after addIndexes,
    // the deleted ones will be dropped from the index.
    Bits liveDocs = reader.getLiveDocs();
    List<Integer> values = new ArrayList<Integer>();
    for (int i = 0; i < reader.maxDoc(); i++) {
      if (liveDocs == null || liveDocs.get(i)) {
        values.add(Integer.valueOf(reader.document(i).get(ID_FIELD)));
      }
    }
    Sorter sorter = SORTERS[random().nextInt(SORTERS.length)];
    if (sorter == Sorter.REVERSE_DOCS) {
      Collections.reverse(values);
    } else {
      // NOTE(review): assumes the numeric DV field orders docs the same way
      // as the ID values — confirm against SorterTestBase's document setup.
      Collections.sort(values);
    }
    sortedValues = values.toArray(new Integer[values.size()]);
    if (VERBOSE) {
      // NOTE(review): concatenating an array prints its identity string, not
      // its contents; Arrays.toString(sortedValues) would be more useful.
      System.out.println("sortedValues: " + sortedValues);
      System.out.println("Sorter: " + sorter);
    }
    
    Directory target = newDirectory();
    IndexWriter writer = new IndexWriter(target, newIndexWriterConfig(TEST_VERSION_CURRENT, null));
    reader = new SortingAtomicReader(reader, sorter);
    writer.addIndexes(reader);
    writer.close();
    reader.close();
    dir.close();
    
    // CheckIndex the target directory
    dir = target;
    _TestUtil.checkIndex(dir);
    
    // set reader for tests
    reader = SlowCompositeReaderWrapper.wrap(DirectoryReader.open(dir));
    assertFalse("index should not have deletions", reader.hasDeletions());
  }
  
}
|
@ -0,0 +1,377 @@
|
||||
package org.apache.lucene.index.sorter;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.util.ArrayList;
|
||||
import java.util.Arrays;
|
||||
import java.util.Collections;
|
||||
import java.util.List;
|
||||
import java.util.Random;
|
||||
|
||||
import org.apache.lucene.analysis.MockAnalyzer;
|
||||
import org.apache.lucene.analysis.TokenStream;
|
||||
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
|
||||
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
|
||||
import org.apache.lucene.analysis.tokenattributes.PayloadAttribute;
|
||||
import org.apache.lucene.document.BinaryDocValuesField;
|
||||
import org.apache.lucene.document.Document;
|
||||
import org.apache.lucene.document.Field;
|
||||
import org.apache.lucene.document.Field.Store;
|
||||
import org.apache.lucene.document.FieldType;
|
||||
import org.apache.lucene.document.NumericDocValuesField;
|
||||
import org.apache.lucene.document.SortedDocValuesField;
|
||||
import org.apache.lucene.document.SortedSetDocValuesField;
|
||||
import org.apache.lucene.document.StringField;
|
||||
import org.apache.lucene.document.TextField;
|
||||
import org.apache.lucene.index.AtomicReader;
|
||||
import org.apache.lucene.index.AtomicReaderContext;
|
||||
import org.apache.lucene.index.BinaryDocValues;
|
||||
import org.apache.lucene.index.DirectoryReader;
|
||||
import org.apache.lucene.index.DocsAndPositionsEnum;
|
||||
import org.apache.lucene.index.DocsEnum;
|
||||
import org.apache.lucene.index.FieldInfo.IndexOptions;
|
||||
import org.apache.lucene.index.FieldInvertState;
|
||||
import org.apache.lucene.index.IndexWriterConfig;
|
||||
import org.apache.lucene.index.NumericDocValues;
|
||||
import org.apache.lucene.index.RandomIndexWriter;
|
||||
import org.apache.lucene.index.SlowCompositeReaderWrapper;
|
||||
import org.apache.lucene.index.SortedDocValues;
|
||||
import org.apache.lucene.index.SortedSetDocValues;
|
||||
import org.apache.lucene.index.Term;
|
||||
import org.apache.lucene.index.Terms;
|
||||
import org.apache.lucene.search.CollectionStatistics;
|
||||
import org.apache.lucene.search.DocIdSetIterator;
|
||||
import org.apache.lucene.search.TermStatistics;
|
||||
import org.apache.lucene.search.similarities.Similarity;
|
||||
import org.apache.lucene.store.Directory;
|
||||
import org.apache.lucene.util.Bits;
|
||||
import org.apache.lucene.util.BytesRef;
|
||||
import org.apache.lucene.util.LuceneTestCase;
|
||||
import org.apache.lucene.util._TestUtil;
|
||||
import org.junit.AfterClass;
|
||||
import org.junit.BeforeClass;
|
||||
import org.junit.Test;
|
||||
|
||||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
/**
 * Base class for sorting tests: builds a small randomized index (with random
 * deletions), expects subclasses to install a sorted view of it into
 * {@link #reader} and the expected post-sort ID order into {@link #sortedValues},
 * then verifies every index data structure (postings, positions, payloads,
 * offsets, norms, all DocValues kinds, term vectors) against that order.
 */
public abstract class SorterTestBase extends LuceneTestCase {

  /**
   * Similarity which encodes a document's ID (set as the field boost) directly
   * into the norm of {@link #NORMS_FIELD}, so tests can verify that norms were
   * permuted along with the documents; delegates every other field to the
   * wrapped Similarity.
   */
  static final class NormsSimilarity extends Similarity {
    
    private final Similarity in;
    
    public NormsSimilarity(Similarity in) {
      this.in = in;
    }
    
    @Override
    public long computeNorm(FieldInvertState state) {
      if (state.getName().equals(NORMS_FIELD)) {
        // boost was set to Float.intBitsToFloat(id); round-trip it back to the raw bits
        return Float.floatToIntBits(state.getBoost());
      } else {
        return in.computeNorm(state);
      }
    }
    
    @Override
    public SimWeight computeWeight(float queryBoost, CollectionStatistics collectionStats, TermStatistics... termStats) {
      return in.computeWeight(queryBoost, collectionStats, termStats);
    }
    
    @Override
    public ExactSimScorer exactSimScorer(SimWeight weight, AtomicReaderContext context) throws IOException {
      return in.exactSimScorer(weight, context);
    }
    
    @Override
    public SloppySimScorer sloppySimScorer(SimWeight weight, AtomicReaderContext context) throws IOException {
      return in.sloppySimScorer(weight, context);
    }
    
  }
  
  /**
   * TokenStream which emits the single term {@link #DOC_POSITIONS_TERM}
   * {@code id/10 + 1} times for the document whose ID was set via
   * {@link #setId(int)}. Each token carries the (decreasing) position counter
   * as its payload and the (increasing) emission index as both start and end
   * offset, so positions/payloads/offsets can all be checked after sorting.
   */
  static final class PositionsTokenStream extends TokenStream {
    
    private final CharTermAttribute term;
    private final PayloadAttribute payload;
    private final OffsetAttribute offset;
    
    // pos counts down the remaining tokens (also used as payload); off counts up
    private int pos, off;
    
    public PositionsTokenStream() {
      term = addAttribute(CharTermAttribute.class);
      term.append(DOC_POSITIONS_TERM);
      payload = addAttribute(PayloadAttribute.class);
      offset = addAttribute(OffsetAttribute.class);
    }
    
    @Override
    public boolean incrementToken() throws IOException {
      if (pos == 0) {
        return false;
      }
      
      payload.setPayload(new BytesRef(Integer.toString(pos)));
      offset.setOffset(off, off);
      --pos;
      ++off;
      return true;
    }
    
    // re-arm the stream for the given document ID: emit id/10 + 1 tokens
    void setId(int id) {
      pos = id / 10 + 1;
      off = 0;
    }
  }
  
  // field names used throughout the tests
  protected static final String ID_FIELD = "id";
  protected static final String DOCS_ENUM_FIELD = "docs";
  protected static final String DOCS_ENUM_TERM = "$all$";
  protected static final String DOC_POSITIONS_FIELD = "positions";
  protected static final String DOC_POSITIONS_TERM = "$all$";
  protected static final String NUMERIC_DV_FIELD = "numeric";
  protected static final String NORMS_FIELD = "norm";
  protected static final String BINARY_DV_FIELD = "binary";
  protected static final String SORTED_DV_FIELD = "sorted";
  protected static final String SORTED_SET_DV_FIELD = "sorted_set";
  protected static final String TERM_VECTORS_FIELD = "term_vectors";
  
  // indexed + term vectors, not stored
  private static final FieldType TERM_VECTORS_TYPE = new FieldType(TextField.TYPE_NOT_STORED);
  static {
    TERM_VECTORS_TYPE.setStoreTermVectors(true);
    TERM_VECTORS_TYPE.freeze();
  }
  
  // positions + offsets, used when the codec supports offsets
  private static final FieldType POSITIONS_TYPE = new FieldType(TextField.TYPE_NOT_STORED);
  static {
    POSITIONS_TYPE.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS);
    POSITIONS_TYPE.freeze();
  }
  
  // shared static state: set up here, re-assigned (sorted view) by subclasses' @BeforeClass
  protected static Directory dir;
  protected static AtomicReader reader;
  // expected ID per (sorted) doc ID; populated by the concrete subclass
  protected static Integer[] sortedValues;
  
  /**
   * Builds one document for the given ID, populating every field kind the
   * tests verify. IDs are multiples of 10, so each ID maps to a distinct
   * number of position tokens (id/10 + 1).
   */
  private static Document doc(final int id, PositionsTokenStream positions) {
    final Document doc = new Document();
    doc.add(new StringField(ID_FIELD, Integer.toString(id), Store.YES));
    doc.add(new StringField(DOCS_ENUM_FIELD, DOCS_ENUM_TERM, Store.NO));
    positions.setId(id);
    if (doesntSupportOffsets.contains(_TestUtil.getPostingsFormat(DOC_POSITIONS_FIELD))) {
      // codec doesnt support offsets: just index positions for the field
      doc.add(new Field(DOC_POSITIONS_FIELD, positions, TextField.TYPE_NOT_STORED));
    } else {
      doc.add(new Field(DOC_POSITIONS_FIELD, positions, POSITIONS_TYPE));
    }
    doc.add(new NumericDocValuesField(NUMERIC_DV_FIELD, id));
    TextField norms = new TextField(NORMS_FIELD, Integer.toString(id), Store.NO);
    // encode the ID into the norm via NormsSimilarity (boost bits == id)
    norms.setBoost(Float.intBitsToFloat(id));
    doc.add(norms);
    doc.add(new BinaryDocValuesField(BINARY_DV_FIELD, new BytesRef(Integer.toString(id))));
    doc.add(new SortedDocValuesField(SORTED_DV_FIELD, new BytesRef(Integer.toString(id))));
    if (defaultCodecSupportsSortedSet()) {
      // two ordered values per doc: "id" and "id+1"
      doc.add(new SortedSetDocValuesField(SORTED_SET_DV_FIELD, new BytesRef(Integer.toString(id))));
      doc.add(new SortedSetDocValuesField(SORTED_SET_DV_FIELD, new BytesRef(Integer.toString(id + 1))));
    }
    doc.add(new Field(TERM_VECTORS_FIELD, Integer.toString(id), TERM_VECTORS_TYPE));
    return doc;
  }
  
  /** Creates an index for sorting. */
  public static void createIndex(Directory dir, int numDocs, Random random) throws IOException {
    List<Integer> ids = new ArrayList<Integer>();
    for (int i = 0; i < numDocs; i++) {
      ids.add(Integer.valueOf(i * 10));
    }
    // shuffle them for indexing
    Collections.shuffle(ids, random);
    if (VERBOSE) {
      System.out.println("Shuffled IDs for indexing: " + Arrays.toString(ids.toArray()));
    }
    
    PositionsTokenStream positions = new PositionsTokenStream();
    IndexWriterConfig conf = newIndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(random));
    conf.setMaxBufferedDocs(4); // create some segments
    conf.setSimilarity(new NormsSimilarity(conf.getSimilarity())); // for testing norms field
    RandomIndexWriter writer = new RandomIndexWriter(random, dir, conf);
    // forced merges would collapse the segments we deliberately created above
    writer.setDoRandomForceMerge(false);
    for (int id : ids) {
      writer.addDocument(doc(id, positions));
    }
    // delete some documents
    writer.commit();
    for (Integer id : ids) {
      if (random.nextDouble() < 0.2) {
        if (VERBOSE) {
          System.out.println("delete doc_id " + id);
        }
        writer.deleteDocuments(new Term(ID_FIELD, id.toString()));
      }
    }
    writer.close();
  }
  
  @BeforeClass
  public static void beforeClassSorterTestBase() throws Exception {
    dir = newDirectory();
    int numDocs = atLeast(20);
    createIndex(dir, numDocs, random());
    
    // an atomic view of the whole (multi-segment) index
    reader = new SlowCompositeReaderWrapper(DirectoryReader.open(dir));
  }
  
  @AfterClass
  public static void afterClassSorterTestBase() throws Exception {
    reader.close();
    dir.close();
  }
  
  @Test
  public void testBinaryDocValuesField() throws Exception {
    BinaryDocValues dv = reader.getBinaryDocValues(BINARY_DV_FIELD);
    BytesRef bytes = new BytesRef();
    for (int i = 0; i < reader.maxDoc(); i++) {
      dv.get(i, bytes);
      assertEquals("incorrect binary DocValues for doc " + i, sortedValues[i].toString(), bytes.utf8ToString());
    }
  }
  
  @Test
  public void testDocsAndPositionsEnum() throws Exception {
    Term term = new Term(DOC_POSITIONS_FIELD, DOC_POSITIONS_TERM);
    DocsAndPositionsEnum sortedPositions = reader.termPositionsEnum(term);
    int doc;
    
    // test nextDoc()
    while ((doc = sortedPositions.nextDoc()) != DocIdSetIterator.NO_MORE_DOCS) {
      int freq = sortedPositions.freq();
      // PositionsTokenStream emitted id/10 + 1 tokens for this doc
      assertEquals("incorrect freq for doc=" + doc, sortedValues[doc].intValue() / 10 + 1, freq);
      for (int i = 0; i < freq; i++) {
        assertEquals("incorrect position for doc=" + doc, i, sortedPositions.nextPosition());
        if (!doesntSupportOffsets.contains(_TestUtil.getPostingsFormat(DOC_POSITIONS_FIELD))) {
          assertEquals("incorrect startOffset for doc=" + doc, i, sortedPositions.startOffset());
          assertEquals("incorrect endOffset for doc=" + doc, i, sortedPositions.endOffset());
        }
        // payload carried the countdown value (freq .. 1)
        assertEquals("incorrect payload for doc=" + doc, freq - i, Integer.parseInt(sortedPositions.getPayload().utf8ToString()));
      }
    }
    
    // test advance()
    // NOTE(review): advance(doc) is called with target == the current doc; the
    // DocIdSetIterator contract leaves that case undefined — relies on the enum
    // advancing past the current doc. Consider advance(doc + 1); verify intent.
    sortedPositions = reader.termPositionsEnum(term);
    doc = 0;
    while ((doc = sortedPositions.advance(doc)) != DocIdSetIterator.NO_MORE_DOCS) {
      int freq = sortedPositions.freq();
      assertEquals("incorrect freq for doc=" + doc, sortedValues[doc].intValue() / 10 + 1, freq);
      for (int i = 0; i < freq; i++) {
        assertEquals("incorrect position for doc=" + doc, i, sortedPositions.nextPosition());
        if (!doesntSupportOffsets.contains(_TestUtil.getPostingsFormat(DOC_POSITIONS_FIELD))) {
          assertEquals("incorrect startOffset for doc=" + doc, i, sortedPositions.startOffset());
          assertEquals("incorrect endOffset for doc=" + doc, i, sortedPositions.endOffset());
        }
        assertEquals("incorrect payload for doc=" + doc, freq - i, Integer.parseInt(sortedPositions.getPayload().utf8ToString()));
      }
    }
  }
  
  @Test
  public void testDocsEnum() throws Exception {
    Term term = new Term(DOCS_ENUM_FIELD, DOCS_ENUM_TERM);
    DocsEnum docs = reader.termDocsEnum(term);
    Bits mappedLiveDocs = reader.getLiveDocs();
    int doc;
    int prev = -1;
    while ((doc = docs.nextDoc()) != DocIdSetIterator.NO_MORE_DOCS) {
      if (mappedLiveDocs != null) {
        assertTrue("document " + doc + " marked as deleted", mappedLiveDocs.get(doc));
      }
      assertEquals("incorrect value; doc " + doc, sortedValues[doc].intValue(), Integer.parseInt(reader.document(doc).get(ID_FIELD)));
      // any gap in the enum must correspond to deleted docs
      // (if mappedLiveDocs == null there are no deletions, so the enum has no gaps
      // and this loop body is never entered)
      while (++prev < doc) {
        assertFalse("document " + prev + " not marked as deleted", mappedLiveDocs.get(prev));
      }
    }
    
    // same checks via advance() — see NOTE(review) in testDocsAndPositionsEnum
    // about advance(target == current doc)
    docs = reader.termDocsEnum(term);
    doc = 0;
    prev = -1;
    while ((doc = docs.advance(doc)) != DocIdSetIterator.NO_MORE_DOCS) {
      if (mappedLiveDocs != null) {
        assertTrue("document " + doc + " marked as deleted", mappedLiveDocs.get(doc));
      }
      assertEquals("incorrect value; doc " + doc, sortedValues[doc].intValue(), Integer.parseInt(reader.document(doc).get(ID_FIELD)));
      while (++prev < doc) {
        assertFalse("document " + prev + " not marked as deleted", mappedLiveDocs.get(prev));
      }
    }
  }
  
  @Test
  public void testNormValues() throws Exception {
    NumericDocValues dv = reader.getNormValues(NORMS_FIELD);
    int maxDoc = reader.maxDoc();
    for (int i = 0; i < maxDoc; i++) {
      // NormsSimilarity stored the doc's ID as the norm
      assertEquals("incorrect norm value for doc " + i, sortedValues[i].intValue(), dv.get(i));
    }
  }
  
  @Test
  public void testNumericDocValuesField() throws Exception {
    NumericDocValues dv = reader.getNumericDocValues(NUMERIC_DV_FIELD);
    int maxDoc = reader.maxDoc();
    for (int i = 0; i < maxDoc; i++) {
      assertEquals("incorrect numeric DocValues for doc " + i, sortedValues[i].intValue(), dv.get(i));
    }
  }
  
  @Test
  public void testSortedDocValuesField() throws Exception {
    SortedDocValues dv = reader.getSortedDocValues(SORTED_DV_FIELD);
    int maxDoc = reader.maxDoc();
    BytesRef bytes = new BytesRef();
    for (int i = 0; i < maxDoc; i++) {
      dv.get(i, bytes);
      assertEquals("incorrect sorted DocValues for doc " + i, sortedValues[i].toString(), bytes.utf8ToString());
    }
  }
  
  @Test
  public void testSortedSetDocValuesField() throws Exception {
    assumeTrue("default codec does not support SORTED_SET", defaultCodecSupportsSortedSet());
    SortedSetDocValues dv = reader.getSortedSetDocValues(SORTED_SET_DV_FIELD);
    int maxDoc = reader.maxDoc();
    BytesRef bytes = new BytesRef();
    for (int i = 0; i < maxDoc; i++) {
      dv.setDocument(i);
      // each doc was indexed with exactly two values: "id" and "id+1"
      dv.lookupOrd(dv.nextOrd(), bytes);
      int value = sortedValues[i].intValue();
      assertEquals("incorrect sorted-set DocValues for doc " + i, Integer.valueOf(value).toString(), bytes.utf8ToString());
      dv.lookupOrd(dv.nextOrd(), bytes);
      assertEquals("incorrect sorted-set DocValues for doc " + i, Integer.valueOf(value + 1).toString(), bytes.utf8ToString());
      assertEquals(SortedSetDocValues.NO_MORE_ORDS, dv.nextOrd());
    }
  }
  
  @Test
  public void testTermVectors() throws Exception {
    int maxDoc = reader.maxDoc();
    for (int i = 0; i < maxDoc; i++) {
      Terms terms = reader.getTermVector(i, TERM_VECTORS_FIELD);
      assertNotNull("term vectors not found for doc " + i + " field [" + TERM_VECTORS_FIELD + "]", terms);
      // the single term-vector term is the doc's ID
      assertEquals("incorrect term vector for doc " + i, sortedValues[i].toString(), terms.iterator(null).next().utf8ToString());
    }
  }
  
}
|
@ -0,0 +1,75 @@
|
||||
package org.apache.lucene.index.sorter;
|
||||
|
||||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
import java.io.IOException;
|
||||
import java.util.Arrays;
|
||||
import java.util.Collections;
|
||||
|
||||
import org.apache.lucene.index.AtomicReader;
|
||||
import org.apache.lucene.util.Bits;
|
||||
import org.apache.lucene.util._TestUtil;
|
||||
import org.junit.BeforeClass;
|
||||
|
||||
public class SortingAtomicReaderTest extends SorterTestBase {
|
||||
|
||||
@BeforeClass
|
||||
public static void beforeClassSortingAtomicReaderTest() throws Exception {
|
||||
// build the mapping from the reader, since we deleted documents, some of
|
||||
// them might have disappeared from the index (e.g. if an entire segment is
|
||||
// dropped b/c all its docs are deleted)
|
||||
Integer[] values = new Integer[reader.maxDoc()];
|
||||
int[] docs = new int[reader.maxDoc()];
|
||||
for (int i = 0; i < reader.maxDoc(); i++) {
|
||||
docs[i] = i;
|
||||
values[i] = Integer.valueOf(reader.document(i).get(ID_FIELD));
|
||||
}
|
||||
|
||||
final int[] oldToNew = Sorter.compute(docs, Collections.unmodifiableList(Arrays.asList(values)));
|
||||
// Sorter.compute also sorts the values
|
||||
sortedValues = new Integer[reader.maxDoc()];
|
||||
for (int i = 0; i < reader.maxDoc(); ++i) {
|
||||
sortedValues[oldToNew[i]] = values[i];
|
||||
}
|
||||
if (VERBOSE) {
|
||||
System.out.println("oldToNew: " + Arrays.toString(oldToNew));
|
||||
System.out.println("sortedValues: " + Arrays.toString(sortedValues));
|
||||
}
|
||||
|
||||
reader = new SortingAtomicReader(reader, new Sorter() {
|
||||
@Override
|
||||
public int[] oldToNew(AtomicReader reader) throws IOException {
|
||||
return oldToNew;
|
||||
}
|
||||
});
|
||||
|
||||
if (VERBOSE) {
|
||||
System.out.print("mapped-deleted-docs: ");
|
||||
Bits mappedLiveDocs = reader.getLiveDocs();
|
||||
for (int i = 0; i < mappedLiveDocs.length(); i++) {
|
||||
if (!mappedLiveDocs.get(i)) {
|
||||
System.out.print(i + " ");
|
||||
}
|
||||
}
|
||||
System.out.println();
|
||||
}
|
||||
|
||||
_TestUtil.checkReader(reader);
|
||||
}
|
||||
|
||||
}
|
Loading…
x
Reference in New Issue
Block a user