From 0c333c60dd03aedde4fc75aca29713ca9b87b6ff Mon Sep 17 00:00:00 2001 From: Uwe Schindler Date: Mon, 5 Jul 2010 08:33:25 +0000 Subject: [PATCH] LUCENE-2514: Term is no longer character based git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@960484 13f79535-47bb-0310-9956-ffa450edef68 --- lucene/CHANGES.txt | 18 +-- .../lucene/search/highlight/TokenSources.java | 7 +- .../TokenStreamFromTermPositionVector.java | 7 +- .../vectorhighlight/FieldTermStack.java | 7 +- .../store/instantiated/InstantiatedIndex.java | 2 +- .../instantiated/InstantiatedIndexReader.java | 4 +- .../InstantiatedTermFreqVector.java | 13 ++- .../instantiated/InstantiatedTermsEnum.java | 4 +- .../lucene/index/memory/MemoryIndex.java | 58 +++++----- .../lucene/index/TermVectorAccessor.java | 8 +- .../lucene/index/TestTermVectorAccessor.java | 6 +- .../lucene/search/FuzzyLikeThisQuery.java | 2 +- .../org/apache/lucene/search/TermsFilter.java | 2 +- .../lucene/search/similar/MoreLikeThis.java | 5 +- .../apache/lucene/index/DocumentsWriter.java | 7 +- .../index/FieldSortedTermVectorMapper.java | 4 +- .../org/apache/lucene/index/IndexReader.java | 4 +- .../index/PositionBasedTermVectorMapper.java | 14 ++- .../index/SegmentTermPositionVector.java | 4 +- .../lucene/index/SegmentTermVector.java | 14 ++- .../lucene/index/SortedTermVectorMapper.java | 6 +- .../java/org/apache/lucene/index/Term.java | 109 +++++++++++++++--- .../apache/lucene/index/TermFreqVector.java | 8 +- .../apache/lucene/index/TermVectorEntry.java | 8 +- .../apache/lucene/index/TermVectorMapper.java | 5 +- .../lucene/index/TermVectorsReader.java | 12 +- .../lucene/index/TermVectorsWriter.java | 22 ++-- .../index/codecs/preflex/PreFlexFields.java | 52 ++++----- .../index/codecs/preflex/TermBuffer.java | 9 +- .../index/codecs/preflex/TermInfosReader.java | 16 +-- .../lucene/search/FieldCacheTermsFilter.java | 13 ++- .../lucene/search/MultiPhraseQuery.java | 5 +- .../apache/lucene/search/MultiTermQuery.java | 102 
++++++++-------- .../search/MultiTermQueryWrapperFilter.java | 4 - .../org/apache/lucene/search/PhraseQuery.java | 7 +- .../org/apache/lucene/search/PrefixQuery.java | 2 +- .../apache/lucene/search/PrefixTermsEnum.java | 2 +- .../apache/lucene/search/QueryTermVector.java | 32 ++--- .../apache/lucene/search/SingleTermsEnum.java | 2 +- .../org/apache/lucene/search/TermQuery.java | 4 +- .../lucene/search/spans/SpanTermQuery.java | 5 +- .../java/org/apache/lucene/util/BytesRef.java | 75 ++++++++++++ .../org/apache/lucene/util/PagedBytes.java | 22 +++- .../apache/lucene/index/TestAddIndexes.java | 2 +- .../org/apache/lucene/index/TestPayloads.java | 2 +- .../TestPositionBasedTermVectorMapper.java | 3 +- .../lucene/index/TestSegmentMerger.java | 4 +- .../lucene/index/TestSegmentReader.java | 4 +- .../lucene/index/TestStressIndexing2.java | 4 +- .../lucene/index/TestTermVectorsReader.java | 17 +-- .../index/codecs/preflex/TermInfosWriter.java | 4 +- .../index/codecs/preflex/TestSurrogates.java | 45 +++----- .../search/TestMultiThreadTermVectors.java | 5 +- .../lucene/search/TestQueryTermVector.java | 11 +- .../apache/lucene/search/TestTermVectors.java | 29 ++--- .../handler/admin/LukeRequestHandler.java | 2 +- .../component/TermVectorComponent.java | 8 +- .../apache/solr/request/UnInvertedField.java | 6 +- .../apache/solr/search/SolrIndexSearcher.java | 4 +- .../solr/update/DirectUpdateHandler.java | 2 +- 60 files changed, 515 insertions(+), 349 deletions(-) diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt index e3352ce6d15..3cf30e2c38f 100644 --- a/lucene/CHANGES.txt +++ b/lucene/CHANGES.txt @@ -102,6 +102,15 @@ Changes in backwards compatibility policy API Changes +* LUCENE-2302, LUCENE-1458, LUCENE-2111, LUCENE-2514: Terms are no longer + required to be character based. 
Lucene views a term as an arbitrary byte[]: + during analysis, character-based terms are converted to UTF8 byte[], + but analyzers are free to directly create terms as byte[] + (NumericField does this, for example). The term data is buffered as + byte[] during indexing, written as byte[] into the terms dictionary, + and iterated as byte[] (wrapped in a BytesRef) by IndexReader for + searching. + * LUCENE-1458, LUCENE-2111: IndexReader now directly exposes its deleted docs (getDeletedDocs), providing a new Bits interface to directly query by doc ID. @@ -147,15 +156,6 @@ New features standard codec), and int block (really a "base" for using block-based compressors like PForDelta for storing postings data). -* LUCENE-2302, LUCENE-1458, LUCENE-2111: Terms are no longer required - to be character based. Lucene views a term as an arbitrary byte[]: - during analysis, character-based terms are converted to UTF8 byte[], - but analyzers are free to directly create terms as byte[] - (NumericField does this, for example). The term data is buffered as - byte[] during indexing, written as byte[] into the terms dictionary, - and iterated as byte[] (wrapped in a BytesRef) by IndexReader for - searching. - * LUCENE-2385: Moved NoDeletionPolicy from benchmark to core. NoDeletionPolicy can be used to prevent commits from ever getting deleted from the index. 
(Shai Erera) diff --git a/lucene/contrib/highlighter/src/java/org/apache/lucene/search/highlight/TokenSources.java b/lucene/contrib/highlighter/src/java/org/apache/lucene/search/highlight/TokenSources.java index e5ecc8bd92a..197f1444d1c 100644 --- a/lucene/contrib/highlighter/src/java/org/apache/lucene/search/highlight/TokenSources.java +++ b/lucene/contrib/highlighter/src/java/org/apache/lucene/search/highlight/TokenSources.java @@ -36,6 +36,7 @@ import org.apache.lucene.index.IndexReader; import org.apache.lucene.index.TermFreqVector; import org.apache.lucene.index.TermPositionVector; import org.apache.lucene.index.TermVectorOffsetInfo; +import org.apache.lucene.util.BytesRef; /** * Hides implementation issues associated with obtaining a TokenStream for use @@ -176,7 +177,7 @@ public class TokenSources { } } // code to reconstruct the original sequence of Tokens - String[] terms = tpv.getTerms(); + BytesRef[] terms = tpv.getTerms(); int[] freq = tpv.getTermFrequencies(); int totalTokens = 0; @@ -204,7 +205,7 @@ public class TokenSources { unsortedTokens = new ArrayList(); } for (int tp = 0; tp < offsets.length; tp++) { - Token token = new Token(terms[t], offsets[tp].getStartOffset(), offsets[tp] + Token token = new Token(terms[t].utf8ToString(), offsets[tp].getStartOffset(), offsets[tp] .getEndOffset()); unsortedTokens.add(token); } @@ -220,7 +221,7 @@ public class TokenSources { // tokens stored with positions - can use this to index straight into // sorted array for (int tp = 0; tp < pos.length; tp++) { - Token token = new Token(terms[t], offsets[tp].getStartOffset(), + Token token = new Token(terms[t].utf8ToString(), offsets[tp].getStartOffset(), offsets[tp].getEndOffset()); tokensInOriginalOrder[pos[tp]] = token; } diff --git a/lucene/contrib/highlighter/src/java/org/apache/lucene/search/highlight/TokenStreamFromTermPositionVector.java b/lucene/contrib/highlighter/src/java/org/apache/lucene/search/highlight/TokenStreamFromTermPositionVector.java index 
810441677c5..410db99fcd4 100644 --- a/lucene/contrib/highlighter/src/java/org/apache/lucene/search/highlight/TokenStreamFromTermPositionVector.java +++ b/lucene/contrib/highlighter/src/java/org/apache/lucene/search/highlight/TokenStreamFromTermPositionVector.java @@ -30,6 +30,7 @@ import org.apache.lucene.analysis.tokenattributes.OffsetAttribute; import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute; import org.apache.lucene.index.TermPositionVector; import org.apache.lucene.index.TermVectorOffsetInfo; +import org.apache.lucene.util.BytesRef; public final class TokenStreamFromTermPositionVector extends TokenStream { @@ -54,18 +55,18 @@ public final class TokenStreamFromTermPositionVector extends TokenStream { termAttribute = addAttribute(CharTermAttribute.class); positionIncrementAttribute = addAttribute(PositionIncrementAttribute.class); offsetAttribute = addAttribute(OffsetAttribute.class); - final String[] terms = termPositionVector.getTerms(); + final BytesRef[] terms = termPositionVector.getTerms(); for (int i = 0; i < terms.length; i++) { final TermVectorOffsetInfo[] offsets = termPositionVector.getOffsets(i); final int[] termPositions = termPositionVector.getTermPositions(i); for (int j = 0; j < termPositions.length; j++) { Token token; if (offsets != null) { - token = new Token(terms[i].toCharArray(), 0, terms[i].length(), + token = new Token(terms[i].utf8ToString(), offsets[j].getStartOffset(), offsets[j].getEndOffset()); } else { token = new Token(); - token.setEmpty().append(terms[i]); + token.setEmpty().append(terms[i].utf8ToString()); } // Yes - this is the position, not the increment! This is for // sorting. 
This value diff --git a/lucene/contrib/highlighter/src/java/org/apache/lucene/search/vectorhighlight/FieldTermStack.java b/lucene/contrib/highlighter/src/java/org/apache/lucene/search/vectorhighlight/FieldTermStack.java index 1282d669e1e..9ff5b4d86ad 100644 --- a/lucene/contrib/highlighter/src/java/org/apache/lucene/search/vectorhighlight/FieldTermStack.java +++ b/lucene/contrib/highlighter/src/java/org/apache/lucene/search/vectorhighlight/FieldTermStack.java @@ -25,6 +25,7 @@ import org.apache.lucene.index.IndexReader; import org.apache.lucene.index.TermFreqVector; import org.apache.lucene.index.TermPositionVector; import org.apache.lucene.index.TermVectorOffsetInfo; +import org.apache.lucene.util.BytesRef; /** * FieldTermStack is a stack that keeps query terms in the specified field @@ -80,15 +81,15 @@ public class FieldTermStack { // just return to make null snippet if un-matched fieldName specified when fieldMatch == true if( termSet == null ) return; - for( String term : tpv.getTerms() ){ - if( !termSet.contains( term ) ) continue; + for( BytesRef term : tpv.getTerms() ){ + if( !termSet.contains( term.utf8ToString() ) ) continue; int index = tpv.indexOf( term ); TermVectorOffsetInfo[] tvois = tpv.getOffsets( index ); if( tvois == null ) return; // just return to make null snippets int[] poss = tpv.getTermPositions( index ); if( poss == null ) return; // just return to make null snippets for( int i = 0; i < tvois.length; i++ ) - termList.add( new TermInfo( term, tvois[i].getStartOffset(), tvois[i].getEndOffset(), poss[i] ) ); + termList.add( new TermInfo( term.utf8ToString(), tvois[i].getStartOffset(), tvois[i].getEndOffset(), poss[i] ) ); } // sort by position diff --git a/lucene/contrib/instantiated/src/java/org/apache/lucene/store/instantiated/InstantiatedIndex.java b/lucene/contrib/instantiated/src/java/org/apache/lucene/store/instantiated/InstantiatedIndex.java index 8f2a3046ae8..2ae92135da5 100644 --- 
a/lucene/contrib/instantiated/src/java/org/apache/lucene/store/instantiated/InstantiatedIndex.java +++ b/lucene/contrib/instantiated/src/java/org/apache/lucene/store/instantiated/InstantiatedIndex.java @@ -290,7 +290,7 @@ public class InstantiatedIndex TermPositionVector termPositionVector = (TermPositionVector) sourceIndexReader.getTermFreqVector(document.getDocumentNumber(), field.name()); if (termPositionVector != null) { for (int i = 0; i < termPositionVector.getTerms().length; i++) { - String token = termPositionVector.getTerms()[i]; + String token = termPositionVector.getTerms()[i].utf8ToString(); InstantiatedTerm term = findTerm(field.name(), token); InstantiatedTermDocumentInformation termDocumentInformation = term.getAssociatedDocument(document.getDocumentNumber()); termDocumentInformation.setTermOffsets(termPositionVector.getOffsets(i)); diff --git a/lucene/contrib/instantiated/src/java/org/apache/lucene/store/instantiated/InstantiatedIndexReader.java b/lucene/contrib/instantiated/src/java/org/apache/lucene/store/instantiated/InstantiatedIndexReader.java index 611bfb5f674..81670dcbb3c 100644 --- a/lucene/contrib/instantiated/src/java/org/apache/lucene/store/instantiated/InstantiatedIndexReader.java +++ b/lucene/contrib/instantiated/src/java/org/apache/lucene/store/instantiated/InstantiatedIndexReader.java @@ -464,7 +464,7 @@ public class InstantiatedIndexReader extends IndexReader { List tv = doc.getVectorSpace().get(field); mapper.setExpectations(field, tv.size(), true, true); for (InstantiatedTermDocumentInformation tdi : tv) { - mapper.map(tdi.getTerm().text(), tdi.getTermPositions().length, tdi.getTermOffsets(), tdi.getTermPositions()); + mapper.map(tdi.getTerm().getTerm().bytes(), tdi.getTermPositions().length, tdi.getTermOffsets(), tdi.getTermPositions()); } } } @@ -475,7 +475,7 @@ public class InstantiatedIndexReader extends IndexReader { for (Map.Entry> e : doc.getVectorSpace().entrySet()) { mapper.setExpectations(e.getKey(), e.getValue().size(), 
true, true); for (InstantiatedTermDocumentInformation tdi : e.getValue()) { - mapper.map(tdi.getTerm().text(), tdi.getTermPositions().length, tdi.getTermOffsets(), tdi.getTermPositions()); + mapper.map(tdi.getTerm().getTerm().bytes(), tdi.getTermPositions().length, tdi.getTermOffsets(), tdi.getTermPositions()); } } } diff --git a/lucene/contrib/instantiated/src/java/org/apache/lucene/store/instantiated/InstantiatedTermFreqVector.java b/lucene/contrib/instantiated/src/java/org/apache/lucene/store/instantiated/InstantiatedTermFreqVector.java index b31d6c02fe9..e688b6fe6e7 100644 --- a/lucene/contrib/instantiated/src/java/org/apache/lucene/store/instantiated/InstantiatedTermFreqVector.java +++ b/lucene/contrib/instantiated/src/java/org/apache/lucene/store/instantiated/InstantiatedTermFreqVector.java @@ -1,6 +1,7 @@ package org.apache.lucene.store.instantiated; import org.apache.lucene.index.TermFreqVector; +import org.apache.lucene.util.BytesRef; import java.io.Serializable; import java.util.Arrays; @@ -34,18 +35,18 @@ public class InstantiatedTermFreqVector private final List termDocumentInformations; private final String field; - private final String terms[]; + private final BytesRef terms[]; private final int termFrequencies[]; public InstantiatedTermFreqVector(InstantiatedDocument document, String field) { this.field = field; termDocumentInformations = document.getVectorSpace().get(field); - terms = new String[termDocumentInformations.size()]; + terms = new BytesRef[termDocumentInformations.size()]; termFrequencies = new int[termDocumentInformations.size()]; for (int i = 0; i < termDocumentInformations.size(); i++) { InstantiatedTermDocumentInformation termDocumentInformation = termDocumentInformations.get(i); - terms[i] = termDocumentInformation.getTerm().text(); + terms[i] = termDocumentInformation.getTerm().getTerm().bytes(); termFrequencies[i] = termDocumentInformation.getTermPositions().length; } } @@ -77,7 +78,7 @@ public class InstantiatedTermFreqVector 
return terms == null ? 0 : terms.length; } - public String[] getTerms() { + public BytesRef[] getTerms() { return terms; } @@ -85,14 +86,14 @@ public class InstantiatedTermFreqVector return termFrequencies; } - public int indexOf(String termText) { + public int indexOf(BytesRef termText) { if (terms == null) return -1; int res = Arrays.binarySearch(terms, termText); return res >= 0 ? res : -1; } - public int[] indexesOf(String[] termNumbers, int start, int len) { + public int[] indexesOf(BytesRef[] termNumbers, int start, int len) { // TODO: there must be a more efficient way of doing this. // At least, we could advance the lower bound of the terms array // as we find valid indices. Also, it might be possible to leverage diff --git a/lucene/contrib/instantiated/src/java/org/apache/lucene/store/instantiated/InstantiatedTermsEnum.java b/lucene/contrib/instantiated/src/java/org/apache/lucene/store/instantiated/InstantiatedTermsEnum.java index 24360283160..580485c1781 100644 --- a/lucene/contrib/instantiated/src/java/org/apache/lucene/store/instantiated/InstantiatedTermsEnum.java +++ b/lucene/contrib/instantiated/src/java/org/apache/lucene/store/instantiated/InstantiatedTermsEnum.java @@ -41,14 +41,14 @@ public class InstantiatedTermsEnum extends TermsEnum { @Override public SeekStatus seek(BytesRef text, boolean useCache) { - final Term t = new Term(field, text.utf8ToString()); + final Term t = new Term(field, text); int loc = Arrays.binarySearch(terms, t, InstantiatedTerm.termComparator); if (loc < 0) { upto = -loc - 1; if (upto >= terms.length) { return SeekStatus.END; } else { - br.copy(terms[upto].getTerm().text()); + br.copy(terms[upto].getTerm().bytes()); return SeekStatus.NOT_FOUND; } } else { diff --git a/lucene/contrib/memory/src/java/org/apache/lucene/index/memory/MemoryIndex.java b/lucene/contrib/memory/src/java/org/apache/lucene/index/memory/MemoryIndex.java index a4d1f5caefe..efa6e8f8cf9 100644 --- 
a/lucene/contrib/memory/src/java/org/apache/lucene/index/memory/MemoryIndex.java +++ b/lucene/contrib/memory/src/java/org/apache/lucene/index/memory/MemoryIndex.java @@ -207,7 +207,7 @@ public class MemoryIndex implements Serializable { if (o1 instanceof Map.Entry) o1 = ((Map.Entry) o1).getKey(); if (o2 instanceof Map.Entry) o2 = ((Map.Entry) o2).getKey(); if (o1 == o2) return 0; - return ((String) o1).compareTo((String) o2); + return ((Comparable) o1).compareTo((Comparable) o2); } }; @@ -341,21 +341,19 @@ public class MemoryIndex implements Serializable { if (fields.get(fieldName) != null) throw new IllegalArgumentException("field must not be added more than once"); - HashMap terms = new HashMap(); + HashMap terms = new HashMap(); int numTokens = 0; int numOverlapTokens = 0; int pos = -1; - TermToBytesRefAttribute termAtt = stream.addAttribute(TermToBytesRefAttribute.class); + TermToBytesRefAttribute termAtt = stream.getAttribute(TermToBytesRefAttribute.class); PositionIncrementAttribute posIncrAttribute = stream.addAttribute(PositionIncrementAttribute.class); OffsetAttribute offsetAtt = stream.addAttribute(OffsetAttribute.class); BytesRef ref = new BytesRef(10); stream.reset(); while (stream.incrementToken()) { termAtt.toBytesRef(ref); - // TODO: support non-UTF8 strings (like numerics) here - String term = ref.utf8ToString(); - if (term.length() == 0) continue; // nothing to do + if (ref.length == 0) continue; // nothing to do // if (DEBUG) System.err.println("token='" + term + "'"); numTokens++; final int posIncr = posIncrAttribute.getPositionIncrement(); @@ -363,10 +361,10 @@ public class MemoryIndex implements Serializable { numOverlapTokens++; pos += posIncr; - ArrayIntList positions = terms.get(term); + ArrayIntList positions = terms.get(ref); if (positions == null) { // term not seen before positions = new ArrayIntList(stride); - terms.put(term, positions); + terms.put(new BytesRef(ref), positions); } if (stride == 1) { positions.add(pos); @@ -490,9 
+488,10 @@ public class MemoryIndex implements Serializable { int len = info.terms.size(); size += VM.sizeOfHashMap(len); - Iterator> iter2 = info.terms.entrySet().iterator(); + Iterator> iter2 = info.terms.entrySet().iterator(); while (--len >= 0) { // for each term - Map.Entry e = iter2.next(); + Map.Entry e = iter2.next(); + // FIXME: this calculation is probably not correct since we use bytes now. size += VM.sizeOfObject(PTR + 3*INT); // assumes substring() memory overlay // size += STR + 2 * ((String) e.getKey()).length(); ArrayIntList positions = e.getValue(); @@ -534,7 +533,7 @@ public class MemoryIndex implements Serializable { public String toString() { StringBuilder result = new StringBuilder(256); sortFields(); - int sumChars = 0; + int sumBytes = 0; int sumPositions = 0; int sumTerms = 0; @@ -545,32 +544,32 @@ public class MemoryIndex implements Serializable { info.sortTerms(); result.append(fieldName + ":\n"); - int numChars = 0; + int numBytes = 0; int numPositions = 0; for (int j=0; j < info.sortedTerms.length; j++) { - Map.Entry e = info.sortedTerms[j]; - String term = e.getKey(); + Map.Entry e = info.sortedTerms[j]; + BytesRef term = e.getKey(); ArrayIntList positions = e.getValue(); result.append("\t'" + term + "':" + numPositions(positions) + ":"); result.append(positions.toString(stride)); // ignore offsets result.append("\n"); numPositions += numPositions(positions); - numChars += term.length(); + numBytes += term.length; } result.append("\tterms=" + info.sortedTerms.length); result.append(", positions=" + numPositions); - result.append(", Kchars=" + (numChars/1000.0f)); + result.append(", Kbytes=" + (numBytes/1000.0f)); result.append("\n"); sumPositions += numPositions; - sumChars += numChars; + sumBytes += numBytes; sumTerms += info.sortedTerms.length; } result.append("\nfields=" + sortedFields.length); result.append(", terms=" + sumTerms); result.append(", positions=" + sumPositions); - result.append(", Kchars=" + (sumChars/1000.0f)); + 
result.append(", Kbytes=" + (sumBytes/1000.0f)); return result.toString(); } @@ -588,10 +587,10 @@ public class MemoryIndex implements Serializable { * Term strings and their positions for this field: Map */ - private final HashMap terms; + private final HashMap terms; /** Terms sorted ascending by term text; computed on demand */ - private transient Map.Entry[] sortedTerms; + private transient Map.Entry[] sortedTerms; /** Number of added tokens for this field */ private final int numTokens; @@ -607,7 +606,7 @@ public class MemoryIndex implements Serializable { private static final long serialVersionUID = 2882195016849084649L; - public Info(HashMap terms, int numTokens, int numOverlapTokens, float boost) { + public Info(HashMap terms, int numTokens, int numOverlapTokens, float boost) { this.terms = terms; this.numTokens = numTokens; this.numOverlapTokens = numOverlapTokens; @@ -627,7 +626,7 @@ public class MemoryIndex implements Serializable { } /** note that the frequency can be calculated as numPosition(getPositions(x)) */ - public ArrayIntList getPositions(String term) { + public ArrayIntList getPositions(BytesRef term) { return terms.get(term); } @@ -759,7 +758,7 @@ public class MemoryIndex implements Serializable { public int docFreq(Term term) { Info info = getInfo(term.field()); int freq = 0; - if (info != null) freq = info.getPositions(term.text()) != null ? 1 : 0; + if (info != null) freq = info.getPositions(term.bytes()) != null ? 
1 : 0; if (DEBUG) System.err.println("MemoryIndexReader.docFreq: " + term + ", freq:" + freq); return freq; } @@ -833,8 +832,7 @@ public class MemoryIndex implements Serializable { @Override public SeekStatus seek(BytesRef text, boolean useCache) { - final String s = text.utf8ToString(); - termUpto = Arrays.binarySearch(info.sortedTerms, s, termComparator); + termUpto = Arrays.binarySearch(info.sortedTerms, text, termComparator); if (termUpto < 0) { // not found; choose successor termUpto = -termUpto -1; if (termUpto >= info.sortedTerms.length) { @@ -1061,7 +1059,7 @@ public class MemoryIndex implements Serializable { return new TermPositionVector() { - private final Map.Entry[] sortedTerms = info.sortedTerms; + private final Map.Entry[] sortedTerms = info.sortedTerms; public String getField() { return fieldName; @@ -1071,8 +1069,8 @@ public class MemoryIndex implements Serializable { return sortedTerms.length; } - public String[] getTerms() { - String[] terms = new String[sortedTerms.length]; + public BytesRef[] getTerms() { + BytesRef[] terms = new BytesRef[sortedTerms.length]; for (int i=sortedTerms.length; --i >= 0; ) { terms[i] = sortedTerms[i].getKey(); } @@ -1087,12 +1085,12 @@ public class MemoryIndex implements Serializable { return freqs; } - public int indexOf(String term) { + public int indexOf(BytesRef term) { int i = Arrays.binarySearch(sortedTerms, term, termComparator); return i >= 0 ? 
i : -1; } - public int[] indexesOf(String[] terms, int start, int len) { + public int[] indexesOf(BytesRef[] terms, int start, int len) { int[] indexes = new int[len]; for (int i=0; i < len; i++) { indexes[i] = indexOf(terms[start++]); diff --git a/lucene/contrib/misc/src/java/org/apache/lucene/index/TermVectorAccessor.java b/lucene/contrib/misc/src/java/org/apache/lucene/index/TermVectorAccessor.java index 2c1400c4766..4d42bb33110 100644 --- a/lucene/contrib/misc/src/java/org/apache/lucene/index/TermVectorAccessor.java +++ b/lucene/contrib/misc/src/java/org/apache/lucene/index/TermVectorAccessor.java @@ -69,7 +69,7 @@ public class TermVectorAccessor { } /** Instance reused to save garbage collector some time */ - private List tokens; + private List tokens; /** Instance reused to save garbage collector some time */ private List positions; @@ -91,7 +91,7 @@ public class TermVectorAccessor { private void build(IndexReader indexReader, String field, TermVectorMapper mapper, int documentNumber) throws IOException { if (tokens == null) { - tokens = new ArrayList(500); + tokens = new ArrayList(500); positions = new ArrayList(500); frequencies = new ArrayList(500); } else { @@ -122,7 +122,7 @@ public class TermVectorAccessor { if (docID == documentNumber) { frequencies.add(Integer.valueOf(docs.freq())); - tokens.add(text.utf8ToString()); + tokens.add(new BytesRef(text)); if (!mapper.isIgnoringPositions()) { int[] positions = new int[docs.freq()]; @@ -173,7 +173,7 @@ public class TermVectorAccessor { } @Override - public void map(String term, int frequency, TermVectorOffsetInfo[] offsets, int[] positions) { + public void map(BytesRef term, int frequency, TermVectorOffsetInfo[] offsets, int[] positions) { decorated.map(term, frequency, offsets, positions); } diff --git a/lucene/contrib/misc/src/test/org/apache/lucene/index/TestTermVectorAccessor.java b/lucene/contrib/misc/src/test/org/apache/lucene/index/TestTermVectorAccessor.java index fa03d5f77aa..452af1f6004 100644 --- 
a/lucene/contrib/misc/src/test/org/apache/lucene/index/TestTermVectorAccessor.java +++ b/lucene/contrib/misc/src/test/org/apache/lucene/index/TestTermVectorAccessor.java @@ -76,21 +76,21 @@ public class TestTermVectorAccessor extends LuceneTestCase { mapper = new ParallelArrayTermVectorMapper(); accessor.accept(ir, i, "a", mapper); tfv = mapper.materializeVector(); - assertEquals("doc " + i, "a", tfv.getTerms()[0]); + assertEquals("doc " + i, "a", tfv.getTerms()[0].utf8ToString()); assertEquals("doc " + i, 8, tfv.getTermFrequencies()[0]); mapper = new ParallelArrayTermVectorMapper(); accessor.accept(ir, i, "b", mapper); tfv = mapper.materializeVector(); assertEquals("doc " + i, 8, tfv.getTermFrequencies().length); - assertEquals("doc " + i, "b", tfv.getTerms()[1]); + assertEquals("doc " + i, "b", tfv.getTerms()[1].utf8ToString()); assertEquals("doc " + i, 7, tfv.getTermFrequencies()[1]); mapper = new ParallelArrayTermVectorMapper(); accessor.accept(ir, i, "c", mapper); tfv = mapper.materializeVector(); assertEquals("doc " + i, 8, tfv.getTermFrequencies().length); - assertEquals("doc " + i, "c", tfv.getTerms()[2]); + assertEquals("doc " + i, "c", tfv.getTerms()[2].utf8ToString()); assertEquals("doc " + i, 7, tfv.getTermFrequencies()[2]); mapper = new ParallelArrayTermVectorMapper(); diff --git a/lucene/contrib/queries/src/java/org/apache/lucene/search/FuzzyLikeThisQuery.java b/lucene/contrib/queries/src/java/org/apache/lucene/search/FuzzyLikeThisQuery.java index cfef2072376..989a4c58b9e 100644 --- a/lucene/contrib/queries/src/java/org/apache/lucene/search/FuzzyLikeThisQuery.java +++ b/lucene/contrib/queries/src/java/org/apache/lucene/search/FuzzyLikeThisQuery.java @@ -213,7 +213,7 @@ public class FuzzyLikeThisQuery extends Query totalVariantDocFreqs+=fe.docFreq(); float score=boostAtt.getBoost(); if (variantsQ.size() < MAX_VARIANTS_PER_TERM || score > minScore){ - ScoreTerm st=new ScoreTerm(new Term(startTerm.field(), possibleMatch.utf8ToString()),score,startTerm); 
+ ScoreTerm st=new ScoreTerm(new Term(startTerm.field(), new BytesRef(possibleMatch)),score,startTerm); variantsQ.insertWithOverflow(st); minScore = variantsQ.top().score; // maintain minScore } diff --git a/lucene/contrib/queries/src/java/org/apache/lucene/search/TermsFilter.java b/lucene/contrib/queries/src/java/org/apache/lucene/search/TermsFilter.java index 642de6df390..94df30d73e2 100644 --- a/lucene/contrib/queries/src/java/org/apache/lucene/search/TermsFilter.java +++ b/lucene/contrib/queries/src/java/org/apache/lucene/search/TermsFilter.java @@ -77,7 +77,7 @@ public class TermsFilter extends Filter } if (terms != null) { - br.copy(term.text()); + br.copy(term.bytes()); if (termsEnum.seek(br) == TermsEnum.SeekStatus.FOUND) { docs = termsEnum.docs(delDocs, docs); while(docs.nextDoc() != DocsEnum.NO_MORE_DOCS) { diff --git a/lucene/contrib/queries/src/java/org/apache/lucene/search/similar/MoreLikeThis.java b/lucene/contrib/queries/src/java/org/apache/lucene/search/similar/MoreLikeThis.java index d54e237dd4b..53df3c30bc6 100644 --- a/lucene/contrib/queries/src/java/org/apache/lucene/search/similar/MoreLikeThis.java +++ b/lucene/contrib/queries/src/java/org/apache/lucene/search/similar/MoreLikeThis.java @@ -47,6 +47,7 @@ import org.apache.lucene.search.Similarity; import org.apache.lucene.search.TermQuery; import org.apache.lucene.search.TopDocs; import org.apache.lucene.store.FSDirectory; +import org.apache.lucene.util.BytesRef; import org.apache.lucene.util.PriorityQueue; @@ -848,10 +849,10 @@ public final class MoreLikeThis { */ private void addTermFrequencies(Map termFreqMap, TermFreqVector vector) { - String[] terms = vector.getTerms(); + BytesRef[] terms = vector.getTerms(); int freqs[]=vector.getTermFrequencies(); for (int j = 0; j < terms.length; j++) { - String term = terms[j]; + String term = terms[j].utf8ToString(); if(isNoiseWord(term)){ continue; diff --git a/lucene/src/java/org/apache/lucene/index/DocumentsWriter.java 
b/lucene/src/java/org/apache/lucene/index/DocumentsWriter.java index 090d66a9fe5..523f932a837 100644 --- a/lucene/src/java/org/apache/lucene/index/DocumentsWriter.java +++ b/lucene/src/java/org/apache/lucene/index/DocumentsWriter.java @@ -1073,7 +1073,6 @@ final class DocumentsWriter { TermsEnum termsEnum = null; String currentField = null; - BytesRef termRef = new BytesRef(); DocsEnum docs = null; for (Entry entry: deletesFlushed.terms.entrySet()) { @@ -1097,9 +1096,7 @@ final class DocumentsWriter { } assert checkDeleteTerm(term); - termRef.copy(term.text()); - - if (termsEnum.seek(termRef, false) == TermsEnum.SeekStatus.FOUND) { + if (termsEnum.seek(term.bytes(), false) == TermsEnum.SeekStatus.FOUND) { DocsEnum docsEnum = termsEnum.docs(reader.getDeletedDocs(), docs); if (docsEnum != null) { @@ -1166,7 +1163,7 @@ final class DocumentsWriter { num.setNum(docIDUpto); deletesInRAM.numTerms++; - deletesInRAM.addBytesUsed(BYTES_PER_DEL_TERM + term.text.length()*CHAR_NUM_BYTE); + deletesInRAM.addBytesUsed(BYTES_PER_DEL_TERM + term.bytes.length); } // Buffer a specific docID for deletion. Currently only diff --git a/lucene/src/java/org/apache/lucene/index/FieldSortedTermVectorMapper.java b/lucene/src/java/org/apache/lucene/index/FieldSortedTermVectorMapper.java index e3290617174..cf11b9f6466 100644 --- a/lucene/src/java/org/apache/lucene/index/FieldSortedTermVectorMapper.java +++ b/lucene/src/java/org/apache/lucene/index/FieldSortedTermVectorMapper.java @@ -2,6 +2,8 @@ package org.apache.lucene.index; import java.util.*; +import org.apache.lucene.util.BytesRef; + /** * Copyright 2007 The Apache Software Foundation *

@@ -44,7 +46,7 @@ public class FieldSortedTermVectorMapper extends TermVectorMapper{ } @Override - public void map(String term, int frequency, TermVectorOffsetInfo[] offsets, int[] positions) { + public void map(BytesRef term, int frequency, TermVectorOffsetInfo[] offsets, int[] positions) { TermVectorEntry entry = new TermVectorEntry(currentField, term, frequency, offsets, positions); currentSet.add(entry); } diff --git a/lucene/src/java/org/apache/lucene/index/IndexReader.java b/lucene/src/java/org/apache/lucene/index/IndexReader.java index 5922b3df982..000f0313908 100644 --- a/lucene/src/java/org/apache/lucene/index/IndexReader.java +++ b/lucene/src/java/org/apache/lucene/index/IndexReader.java @@ -883,7 +883,7 @@ public abstract class IndexReader implements Cloneable,Closeable { public abstract Fields fields() throws IOException; public int docFreq(Term term) throws IOException { - return docFreq(term.field(), new BytesRef(term.text())); + return docFreq(term.field(), term.bytes()); } /** Returns the number of documents containing the term @@ -1000,7 +1000,7 @@ public abstract class IndexReader implements Cloneable,Closeable { DocsEnum docs = MultiFields.getTermDocsEnum(this, MultiFields.getDeletedDocs(this), term.field(), - new BytesRef(term.text())); + term.bytes()); if (docs == null) return 0; int n = 0; int doc; diff --git a/lucene/src/java/org/apache/lucene/index/PositionBasedTermVectorMapper.java b/lucene/src/java/org/apache/lucene/index/PositionBasedTermVectorMapper.java index 59b570cd972..3dd0a036dd2 100644 --- a/lucene/src/java/org/apache/lucene/index/PositionBasedTermVectorMapper.java +++ b/lucene/src/java/org/apache/lucene/index/PositionBasedTermVectorMapper.java @@ -21,6 +21,8 @@ import java.util.HashMap; import java.util.List; import java.util.Map; +import org.apache.lucene.util.BytesRef; + /** * For each Field, store position by position information. It ignores frequency information *

@@ -69,7 +71,7 @@ public class PositionBasedTermVectorMapper extends TermVectorMapper{ * @param positions */ @Override - public void map(String term, int frequency, TermVectorOffsetInfo[] offsets, int[] positions) { + public void map(BytesRef term, int frequency, TermVectorOffsetInfo[] offsets, int[] positions) { for (int i = 0; i < positions.length; i++) { Integer posVal = Integer.valueOf(positions[i]); TVPositionInfo pos = currentPositions.get(posVal); @@ -120,20 +122,20 @@ public class PositionBasedTermVectorMapper extends TermVectorMapper{ public static class TVPositionInfo{ private int position; - private List terms; + private List terms; private List offsets; public TVPositionInfo(int position, boolean storeOffsets) { this.position = position; - terms = new ArrayList(); + terms = new ArrayList(); if (storeOffsets) { offsets = new ArrayList(); } } - void addTerm(String term, TermVectorOffsetInfo info) + void addTerm(BytesRef term, TermVectorOffsetInfo info) { terms.add(term); if (offsets != null) { @@ -151,9 +153,9 @@ public class PositionBasedTermVectorMapper extends TermVectorMapper{ /** * Note, there may be multiple terms at the same position - * @return A List of Strings + * @return A List of BytesRefs */ - public List getTerms() { + public List getTerms() { return terms; } diff --git a/lucene/src/java/org/apache/lucene/index/SegmentTermPositionVector.java b/lucene/src/java/org/apache/lucene/index/SegmentTermPositionVector.java index 9d9e3b4177a..bba528d85f0 100644 --- a/lucene/src/java/org/apache/lucene/index/SegmentTermPositionVector.java +++ b/lucene/src/java/org/apache/lucene/index/SegmentTermPositionVector.java @@ -1,5 +1,7 @@ package org.apache.lucene.index; +import org.apache.lucene.util.BytesRef; + /** * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. 
See the NOTICE file distributed with @@ -22,7 +24,7 @@ class SegmentTermPositionVector extends SegmentTermVector implements TermPositio protected TermVectorOffsetInfo[][] offsets; public static final int[] EMPTY_TERM_POS = new int[0]; - public SegmentTermPositionVector(String field, String terms[], int termFreqs[], int[][] positions, TermVectorOffsetInfo[][] offsets) { + public SegmentTermPositionVector(String field, BytesRef terms[], int termFreqs[], int[][] positions, TermVectorOffsetInfo[][] offsets) { super(field, terms, termFreqs); this.offsets = offsets; this.positions = positions; diff --git a/lucene/src/java/org/apache/lucene/index/SegmentTermVector.java b/lucene/src/java/org/apache/lucene/index/SegmentTermVector.java index 22a0ffb3157..145608efa0f 100644 --- a/lucene/src/java/org/apache/lucene/index/SegmentTermVector.java +++ b/lucene/src/java/org/apache/lucene/index/SegmentTermVector.java @@ -19,13 +19,15 @@ package org.apache.lucene.index; import java.util.*; +import org.apache.lucene.util.BytesRef; + class SegmentTermVector implements TermFreqVector { private String field; - private String terms[]; + private BytesRef terms[]; private int termFreqs[]; - SegmentTermVector(String field, String terms[], int termFreqs[]) { + SegmentTermVector(String field, BytesRef terms[], int termFreqs[]) { this.field = field; this.terms = terms; this.termFreqs = termFreqs; @@ -59,7 +61,7 @@ class SegmentTermVector implements TermFreqVector { return terms == null ? 0 : terms.length; } - public String [] getTerms() { + public BytesRef [] getTerms() { return terms; } @@ -67,14 +69,14 @@ class SegmentTermVector implements TermFreqVector { return termFreqs; } - public int indexOf(String termText) { + public int indexOf(BytesRef termBytes) { if(terms == null) return -1; - int res = Arrays.binarySearch(terms, termText); + int res = Arrays.binarySearch(terms, termBytes); return res >= 0 ? 
res : -1; } - public int[] indexesOf(String [] termNumbers, int start, int len) { + public int[] indexesOf(BytesRef [] termNumbers, int start, int len) { // TODO: there must be a more efficient way of doing this. // At least, we could advance the lower bound of the terms array // as we find valid indexes. Also, it might be possible to leverage diff --git a/lucene/src/java/org/apache/lucene/index/SortedTermVectorMapper.java b/lucene/src/java/org/apache/lucene/index/SortedTermVectorMapper.java index 6fccec87935..b5ea1366e06 100644 --- a/lucene/src/java/org/apache/lucene/index/SortedTermVectorMapper.java +++ b/lucene/src/java/org/apache/lucene/index/SortedTermVectorMapper.java @@ -17,6 +17,8 @@ package org.apache.lucene.index; import java.util.*; +import org.apache.lucene.util.BytesRef; + /** * Store a sorted collection of {@link org.apache.lucene.index.TermVectorEntry}s. Collects all term information * into a single, SortedSet. @@ -30,7 +32,7 @@ public class SortedTermVectorMapper extends TermVectorMapper{ private SortedSet currentSet; - private Map termToTVE = new HashMap(); + private Map termToTVE = new HashMap(); private boolean storeOffsets; private boolean storePositions; /** @@ -61,7 +63,7 @@ public class SortedTermVectorMapper extends TermVectorMapper{ */ //We need to combine any previous mentions of the term @Override - public void map(String term, int frequency, TermVectorOffsetInfo[] offsets, int[] positions) { + public void map(BytesRef term, int frequency, TermVectorOffsetInfo[] offsets, int[] positions) { TermVectorEntry entry = termToTVE.get(term); if (entry == null) { entry = new TermVectorEntry(ALL, term, frequency, diff --git a/lucene/src/java/org/apache/lucene/index/Term.java b/lucene/src/java/org/apache/lucene/index/Term.java index 03ec2fb8bef..d69c4811db5 100644 --- a/lucene/src/java/org/apache/lucene/index/Term.java +++ b/lucene/src/java/org/apache/lucene/index/Term.java @@ -17,6 +17,9 @@ package org.apache.lucene.index; * limitations under the 
License. */ +import java.util.Comparator; + +import org.apache.lucene.util.BytesRef; import org.apache.lucene.util.StringHelper; /** @@ -29,14 +32,26 @@ import org.apache.lucene.util.StringHelper; public final class Term implements Comparable, java.io.Serializable { String field; - String text; + BytesRef bytes; + /** Constructs a Term with the given field and bytes. + *

Note that a null field or null bytes value results in undefined + * behavior for most Lucene APIs that accept a Term parameter. + *

WARNING: the provided BytesRef is not copied, but used directly. + * Therefore the bytes should not be modified after construction, for + * example, you should clone a copy rather than pass reused bytes from + * a TermsEnum. + */ + public Term(String fld, BytesRef bytes) { + field = fld == null ? null : StringHelper.intern(fld); + this.bytes = bytes; + } + /** Constructs a Term with the given field and text. *

Note that a null field or null text value results in undefined * behavior for most Lucene APIs that accept a Term parameter. */ - public Term(String fld, String txt) { - field = fld == null ? null : StringHelper.intern(fld); - text = txt; + public Term(String fld, String text) { + this(fld, new BytesRef(text)); } /** Constructs a Term with the given field and empty text. @@ -46,15 +61,27 @@ public final class Term implements Comparable, java.io.Serializable { * @param fld */ public Term(String fld) { - this(fld, "", true); + this(fld, new BytesRef(), true); + } + + /** + * WARNING: the provided BytesRef is not copied, but used directly. + * Therefore the bytes should not be modified after construction, for + * example, you should clone a copy rather than pass reused bytes from + * a TermsEnum. + * + * @lucene.experimental + */ + public Term(String fld, BytesRef bytes, boolean intern) { + field = intern ? StringHelper.intern(fld) : fld; // field names are interned + this.bytes = bytes; // unless already known to be } /** @lucene.experimental */ - public Term(String fld, String txt, boolean intern) { - field = intern ? StringHelper.intern(fld) : fld; // field names are interned - text = txt; // unless already known to be + public Term(String fld, String text, boolean intern) { + this(fld, new BytesRef(text), intern); } - + /** Returns the field of this term, an interned string. The field indicates the part of a document which this term came from. */ public final String field() { return field; } @@ -62,8 +89,26 @@ public final class Term implements Comparable, java.io.Serializable { /** Returns the text of this term. In the case of words, this is simply the text of the word. In the case of dates and other types, this is an encoding of the object as a string. */ - public final String text() { return text; } - + public final String text() { return bytes.utf8ToString(); } + + /** Returns the bytes of this term. 
*/ + public final BytesRef bytes() { return bytes; } + + /** + * Optimized construction of new Terms by reusing same field as this Term + * - avoids field.intern() overhead + *

 WARNING: the provided BytesRef is not copied, but used directly. + * Therefore the bytes should not be modified after construction, for + * example, you should clone a copy rather than pass reused bytes from + * a TermsEnum. + * @param bytes The bytes of the new term (field is implicitly same as this Term instance) + * @return A new Term + */ + public Term createTerm(BytesRef bytes) + { + return new Term(field,bytes,false); + } + /** * Optimized construction of new Terms by reusing same field as this Term * - avoids field.intern() overhead @@ -89,10 +134,10 @@ public final class Term implements Comparable, java.io.Serializable { return false; } else if (!field.equals(other.field)) return false; - if (text == null) { - if (other.text != null) + if (bytes == null) { + if (other.bytes != null) return false; - } else if (!text.equals(other.text)) + } else if (!bytes.equals(other.bytes)) return false; return true; } @@ -102,7 +147,7 @@ public final class Term implements Comparable, java.io.Serializable { final int prime = 31; int result = 1; result = prime * result + ((field == null) ? 0 : field.hashCode()); - result = prime * result + ((text == null) ? 0 : text.hashCode()); + result = prime * result + ((bytes == null) ? 
0 : bytes.hashCode()); return result; } @@ -113,19 +158,47 @@ public final class Term implements Comparable, java.io.Serializable { The ordering of terms is first by field, then by text.*/ public final int compareTo(Term other) { if (field == other.field) // fields are interned - return text.compareTo(other.text); + return bytes.compareTo(other.bytes); else return field.compareTo(other.field); } + + @Deprecated + private static final Comparator legacyComparator = + BytesRef.getUTF8SortedAsUTF16Comparator(); + + /** + * @deprecated For internal backwards compatibility use only + * @lucene.internal + */ + @Deprecated + public final int compareToUTF16(Term other) { + if (field == other.field) // fields are interned + return legacyComparator.compare(this.bytes, other.bytes); + else + return field.compareTo(other.field); + } + + /** + * Resets the field and text of a Term. + *

WARNING: the provided BytesRef is not copied, but used directly. + * Therefore the bytes should not be modified after construction, for + * example, you should clone a copy rather than pass reused bytes from + * a TermsEnum. + */ + final void set(String fld, BytesRef bytes) { + field = fld; + this.bytes = bytes; + } /** Resets the field and text of a Term. */ final void set(String fld, String txt) { field = fld; - text = txt; + this.bytes = new BytesRef(txt); } @Override - public final String toString() { return field + ":" + text; } + public final String toString() { return field + ":" + bytes.utf8ToString(); } private void readObject(java.io.ObjectInputStream in) throws java.io.IOException, ClassNotFoundException diff --git a/lucene/src/java/org/apache/lucene/index/TermFreqVector.java b/lucene/src/java/org/apache/lucene/index/TermFreqVector.java index 518766524ee..29c695a0933 100644 --- a/lucene/src/java/org/apache/lucene/index/TermFreqVector.java +++ b/lucene/src/java/org/apache/lucene/index/TermFreqVector.java @@ -1,5 +1,7 @@ package org.apache.lucene.index; +import org.apache.lucene.util.BytesRef; + /** * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with @@ -38,7 +40,7 @@ public interface TermFreqVector { /** * @return An Array of term texts in ascending order. */ - public String[] getTerms(); + public BytesRef[] getTerms(); /** Array of term frequencies. Locations of the array correspond one to one @@ -54,7 +56,7 @@ public interface TermFreqVector { * term appears. If this term does not appear in the array, * return -1. 
*/ - public int indexOf(String term); + public int indexOf(BytesRef term); /** Just like indexOf(int) but searches for a number of terms @@ -66,6 +68,6 @@ public interface TermFreqVector { * @param start index in the array where the list of terms starts * @param len the number of terms in the list */ - public int[] indexesOf(String[] terms, int start, int len); + public int[] indexesOf(BytesRef[] terms, int start, int len); } diff --git a/lucene/src/java/org/apache/lucene/index/TermVectorEntry.java b/lucene/src/java/org/apache/lucene/index/TermVectorEntry.java index ce54a02f2e2..85c73405c07 100644 --- a/lucene/src/java/org/apache/lucene/index/TermVectorEntry.java +++ b/lucene/src/java/org/apache/lucene/index/TermVectorEntry.java @@ -1,5 +1,7 @@ package org.apache.lucene.index; +import org.apache.lucene.util.BytesRef; + /** * Copyright 2007 The Apache Software Foundation *

@@ -21,7 +23,7 @@ package org.apache.lucene.index; */ public class TermVectorEntry { private String field; - private String term; + private BytesRef term; private int frequency; private TermVectorOffsetInfo [] offsets; int [] positions; @@ -30,7 +32,7 @@ public class TermVectorEntry { public TermVectorEntry() { } - public TermVectorEntry(String field, String term, int frequency, TermVectorOffsetInfo[] offsets, int[] positions) { + public TermVectorEntry(String field, BytesRef term, int frequency, TermVectorOffsetInfo[] offsets, int[] positions) { this.field = field; this.term = term; this.frequency = frequency; @@ -55,7 +57,7 @@ public class TermVectorEntry { return positions; } - public String getTerm() { + public BytesRef getTerm() { return term; } diff --git a/lucene/src/java/org/apache/lucene/index/TermVectorMapper.java b/lucene/src/java/org/apache/lucene/index/TermVectorMapper.java index cc5d079cf78..c0da5bb25ec 100644 --- a/lucene/src/java/org/apache/lucene/index/TermVectorMapper.java +++ b/lucene/src/java/org/apache/lucene/index/TermVectorMapper.java @@ -1,4 +1,7 @@ package org.apache.lucene.index; + +import org.apache.lucene.util.BytesRef; + /** * Copyright 2007 The Apache Software Foundation * @@ -62,7 +65,7 @@ public abstract class TermVectorMapper { * @param offsets null if the offset is not specified, otherwise the offset into the field of the term * @param positions null if the position is not specified, otherwise the position in the field of the term */ - public abstract void map(String term, int frequency, TermVectorOffsetInfo [] offsets, int [] positions); + public abstract void map(BytesRef term, int frequency, TermVectorOffsetInfo [] offsets, int [] positions); /** * Indicate to Lucene that even if there are positions stored, this mapper is not interested in them and they diff --git a/lucene/src/java/org/apache/lucene/index/TermVectorsReader.java b/lucene/src/java/org/apache/lucene/index/TermVectorsReader.java index 17147299b3f..c0c18d95e71 100644 
--- a/lucene/src/java/org/apache/lucene/index/TermVectorsReader.java +++ b/lucene/src/java/org/apache/lucene/index/TermVectorsReader.java @@ -21,6 +21,7 @@ import org.apache.lucene.store.BufferedIndexInput; import org.apache.lucene.store.Directory; import org.apache.lucene.store.IndexInput; import org.apache.lucene.util.ArrayUtil; +import org.apache.lucene.util.BytesRef; import java.io.IOException; import java.util.Arrays; @@ -415,14 +416,15 @@ class TermVectorsReader implements Cloneable { deltaLength = tvf.readVInt(); totalLength = start + deltaLength; - final String term; + final BytesRef term = new BytesRef(totalLength); // Term stored as utf8 bytes if (byteBuffer.length < totalLength) { byteBuffer = ArrayUtil.grow(byteBuffer, totalLength); } tvf.readBytes(byteBuffer, start, deltaLength); - term = new String(byteBuffer, 0, totalLength, "UTF-8"); + System.arraycopy(byteBuffer, 0, term.bytes, 0, totalLength); + term.length = totalLength; int freq = tvf.readVInt(); int [] positions = null; if (storePositions) { //read in the positions @@ -491,7 +493,7 @@ class TermVectorsReader implements Cloneable { class ParallelArrayTermVectorMapper extends TermVectorMapper { - private String[] terms; + private BytesRef[] terms; private int[] termFreqs; private int positions[][]; private TermVectorOffsetInfo offsets[][]; @@ -503,7 +505,7 @@ class ParallelArrayTermVectorMapper extends TermVectorMapper @Override public void setExpectations(String field, int numTerms, boolean storeOffsets, boolean storePositions) { this.field = field; - terms = new String[numTerms]; + terms = new BytesRef[numTerms]; termFreqs = new int[numTerms]; this.storingOffsets = storeOffsets; this.storingPositions = storePositions; @@ -514,7 +516,7 @@ class ParallelArrayTermVectorMapper extends TermVectorMapper } @Override - public void map(String term, int frequency, TermVectorOffsetInfo[] offsets, int[] positions) { + public void map(BytesRef term, int frequency, TermVectorOffsetInfo[] offsets, int[] 
positions) { terms[currentPosition] = term; termFreqs[currentPosition] = frequency; if (storingOffsets) diff --git a/lucene/src/java/org/apache/lucene/index/TermVectorsWriter.java b/lucene/src/java/org/apache/lucene/index/TermVectorsWriter.java index 28a14269d7c..5789f00dcd2 100644 --- a/lucene/src/java/org/apache/lucene/index/TermVectorsWriter.java +++ b/lucene/src/java/org/apache/lucene/index/TermVectorsWriter.java @@ -21,7 +21,6 @@ import org.apache.lucene.store.Directory; import org.apache.lucene.store.IndexOutput; import org.apache.lucene.util.BytesRef; import org.apache.lucene.util.StringHelper; -import org.apache.lucene.util.UnicodeUtil; import java.io.IOException; @@ -29,7 +28,6 @@ final class TermVectorsWriter { private IndexOutput tvx = null, tvd = null, tvf = null; private FieldInfos fieldInfos; - final BytesRef[] utf8Results = new BytesRef[] {new BytesRef(10), new BytesRef(10)}; public TermVectorsWriter(Directory directory, String segment, FieldInfos fieldInfos) @@ -97,25 +95,19 @@ final class TermVectorsWriter { tvf.writeVInt(bits); - final String[] terms = vectors[i].getTerms(); + final BytesRef[] terms = vectors[i].getTerms(); final int[] freqs = vectors[i].getTermFrequencies(); - int utf8Upto = 0; - utf8Results[1].length = 0; - for (int j=0; j seekPrefix; pendingPrefix[seekPrefix] = UnicodeUtil.UNI_SUR_HIGH_START; - Term t2 = protoTerm.createTerm(new String(pendingPrefix, 0, 1+seekPrefix)); + pendingPrefix[1+seekPrefix] = UnicodeUtil.UNI_SUR_LOW_START; + Term t2 = protoTerm.createTerm(new BytesRef(pendingPrefix, 0, 2+seekPrefix)); if (DEBUG_SURROGATES) { System.out.println(" do pop; seek back to " + UnicodeUtil.toHexString(t2.text())); } @@ -334,7 +334,7 @@ public class PreFlexFields extends FieldsProducer { assert pendingPrefix != null; assert pendingPrefix.length > seekPrefix; pendingPrefix[seekPrefix] = 0xffff; - Term t2 = protoTerm.createTerm(new String(pendingPrefix, 0, 1+seekPrefix)); + Term t2 = protoTerm.createTerm(new 
BytesRef(pendingPrefix, 0, 1+seekPrefix)); if (DEBUG_SURROGATES) { System.out.println(" finish pop; seek fwd to " + UnicodeUtil.toHexString(t2.text())); } @@ -358,6 +358,9 @@ public class PreFlexFields extends FieldsProducer { return false; } + private UnicodeUtil.UTF16Result termBuffer = new UnicodeUtil.UTF16Result(); + private UnicodeUtil.UTF16Result seekBuffer = new UnicodeUtil.UTF16Result(); + private boolean pushNewSurrogate() throws IOException { if (DEBUG_SURROGATES) { System.out.println(" check push newSuffix=" + newSuffixStart + " stack=" + getStack()); @@ -366,11 +369,12 @@ public class PreFlexFields extends FieldsProducer { if (t == null || t.field() != fieldInfo.name) { return false; } - final String text = t.text(); - final int textLen = text.length(); - for(int i=Math.max(0,newSuffixStart);i= UnicodeUtil.UNI_SUR_HIGH_START && ch <= UnicodeUtil.UNI_SUR_HIGH_END && (surrogateSeekUpto == 0 || i > surrogateSeekPending[surrogateSeekUpto-1])) { if (DEBUG_SURROGATES) { @@ -385,24 +389,27 @@ public class PreFlexFields extends FieldsProducer { // surrogate range; if so, we must first iterate // them, then seek back to the surrogates - char[] testPrefix = new char[i+1]; + char[] testPrefix = new char[i+2]; for(int j=0;j= lo) { int mid = (lo + hi) >>> 1; - int delta = term.compareTo(indexTerms[mid]); + int delta = term.compareToUTF16(indexTerms[mid]); if (delta < 0) hi = mid - 1; else if (delta > 0) @@ -234,17 +234,17 @@ public final class TermInfosReader { // optimize sequential access: first try scanning cached enum w/o seeking if (enumerator.term() != null // term is at or past current - && ((enumerator.prev() != null && term.compareTo(enumerator.prev())> 0) - || term.compareTo(enumerator.term()) >= 0)) { + && ((enumerator.prev() != null && term.compareToUTF16(enumerator.prev())> 0) + || term.compareToUTF16(enumerator.term()) >= 0)) { int enumOffset = (int)(enumerator.position/totalIndexInterval)+1; if (indexTerms.length == enumOffset // but before end of 
block - || term.compareTo(indexTerms[enumOffset]) < 0) { + || term.compareToUTF16(indexTerms[enumOffset]) < 0) { // no need to seek final TermInfo ti; int numScans = enumerator.scanTo(term); - if (enumerator.term() != null && term.compareTo(enumerator.term()) == 0) { + if (enumerator.term() != null && term.compareToUTF16(enumerator.term()) == 0) { ti = enumerator.termInfo(); if (numScans > 1) { // we only want to put this TermInfo into the cache if @@ -279,7 +279,7 @@ public final class TermInfosReader { seekEnum(enumerator, indexPos); enumerator.scanTo(term); final TermInfo ti; - if (enumerator.term() != null && term.compareTo(enumerator.term()) == 0) { + if (enumerator.term() != null && term.compareToUTF16(enumerator.term()) == 0) { ti = enumerator.termInfo(); if (tiOrd == null) { termsCache.put(new CloneableTerm(term), new TermInfoAndOrd(ti, (int) enumerator.position)); @@ -328,9 +328,9 @@ public final class TermInfosReader { SegmentTermEnum enumerator = getThreadResources().termEnum; seekEnum(enumerator, indexOffset); - while(term.compareTo(enumerator.term()) > 0 && enumerator.next()) {} + while(term.compareToUTF16(enumerator.term()) > 0 && enumerator.next()) {} - if (term.compareTo(enumerator.term()) == 0) + if (term.compareToUTF16(enumerator.term()) == 0) return enumerator.position; else return -1; diff --git a/lucene/src/java/org/apache/lucene/search/FieldCacheTermsFilter.java b/lucene/src/java/org/apache/lucene/search/FieldCacheTermsFilter.java index 5112fcefe40..f95a51775e9 100644 --- a/lucene/src/java/org/apache/lucene/search/FieldCacheTermsFilter.java +++ b/lucene/src/java/org/apache/lucene/search/FieldCacheTermsFilter.java @@ -95,11 +95,18 @@ import org.apache.lucene.util.BytesRef; public class FieldCacheTermsFilter extends Filter { private String field; - private String[] terms; + private BytesRef[] terms; + + public FieldCacheTermsFilter(String field, BytesRef... 
terms) { + this.field = field; + this.terms = terms; + } public FieldCacheTermsFilter(String field, String... terms) { this.field = field; - this.terms = terms; + this.terms = new BytesRef[terms.length]; + for (int i = 0; i < terms.length; i++) + this.terms[i] = new BytesRef(terms[i]); } public FieldCache getFieldCache() { @@ -121,7 +128,7 @@ public class FieldCacheTermsFilter extends Filter { openBitSet = new OpenBitSet(this.fcsi.size()); final BytesRef spare = new BytesRef(); for (int i=0;i 0) { openBitSet.fastSet(termNumber); } diff --git a/lucene/src/java/org/apache/lucene/search/MultiPhraseQuery.java b/lucene/src/java/org/apache/lucene/search/MultiPhraseQuery.java index 330089c8052..b25c3b61b42 100644 --- a/lucene/src/java/org/apache/lucene/search/MultiPhraseQuery.java +++ b/lucene/src/java/org/apache/lucene/search/MultiPhraseQuery.java @@ -499,14 +499,13 @@ class UnionDocsAndPositionsEnum extends DocsAndPositionsEnum { List docsEnums = new LinkedList(); final Bits delDocs = MultiFields.getDeletedDocs(indexReader); for (int i = 0; i < terms.length; i++) { - final BytesRef text = new BytesRef(terms[i].text()); DocsAndPositionsEnum postings = indexReader.termPositionsEnum(delDocs, terms[i].field(), - text); + terms[i].bytes()); if (postings != null) { docsEnums.add(postings); } else { - if (MultiFields.getTermDocsEnum(indexReader, delDocs, terms[i].field(), text) != null) { + if (MultiFields.getTermDocsEnum(indexReader, delDocs, terms[i].field(), terms[i].bytes()) != null) { // term does exist, but has no positions throw new IllegalStateException("field \"" + terms[i].field() + "\" was indexed with Field.omitTermFreqAndPositions=true; cannot run PhraseQuery (term=" + terms[i].text() + ")"); } diff --git a/lucene/src/java/org/apache/lucene/search/MultiTermQuery.java b/lucene/src/java/org/apache/lucene/search/MultiTermQuery.java index 47c73fdff4f..89c90907fa8 100644 --- a/lucene/src/java/org/apache/lucene/search/MultiTermQuery.java +++ 
b/lucene/src/java/org/apache/lucene/search/MultiTermQuery.java @@ -32,6 +32,7 @@ import org.apache.lucene.index.Terms; import org.apache.lucene.queryParser.QueryParser; // for javadoc import org.apache.lucene.util.Attribute; import org.apache.lucene.util.AttributeImpl; +import org.apache.lucene.util.PagedBytes; /** * An abstract {@link Query} that matches documents @@ -177,11 +178,6 @@ public abstract class MultiTermQuery extends Query { private abstract static class BooleanQueryRewrite extends RewriteMethod { protected final int collectTerms(IndexReader reader, MultiTermQuery query, TermCollector collector) throws IOException { - - if (query.field == null) { - throw new NullPointerException("If you implement getTermsEnum(), you must specify a non-null field in the constructor of MultiTermQuery."); - } - final Fields fields = MultiFields.getFields(reader); if (fields == null) { // reader has no fields @@ -203,10 +199,9 @@ public abstract class MultiTermQuery extends Query { termsEnum.attributes().addAttribute(BoostAttribute.class); collector.boostAtt = boostAtt; int count = 0; - BytesRef term; - final Term placeholderTerm = new Term(query.field); - while ((term = termsEnum.next()) != null) { - if (collector.collect(placeholderTerm.createTerm(term.utf8ToString()), boostAtt.getBoost())) { + BytesRef bytes; + while ((bytes = termsEnum.next()) != null) { + if (collector.collect(bytes, boostAtt.getBoost())) { count++; } else { break; @@ -217,15 +212,15 @@ public abstract class MultiTermQuery extends Query { } protected static abstract class TermCollector { - /** this field is only set if a boostAttribute is used (e.g. 
{@link FuzzyTermsEnum}) */ private BoostAttribute boostAtt = null; /** return false to stop collecting */ - public abstract boolean collect(Term t, float boost) throws IOException; + public abstract boolean collect(BytesRef bytes, float boost) throws IOException; /** set the minimum boost as a hint for the term producer */ protected final void setMaxNonCompetitiveBoost(float maxNonCompetitiveBoost) { - if (boostAtt != null) boostAtt.setMaxNonCompetitiveBoost(maxNonCompetitiveBoost); + assert boostAtt != null; + boostAtt.setMaxNonCompetitiveBoost(maxNonCompetitiveBoost); } } } @@ -234,9 +229,11 @@ public abstract class MultiTermQuery extends Query { @Override public Query rewrite(final IndexReader reader, final MultiTermQuery query) throws IOException { final BooleanQuery result = new BooleanQuery(true); + final Term placeholderTerm = new Term(query.field); query.incTotalNumberOfTerms(collectTerms(reader, query, new TermCollector() { - public boolean collect(Term t, float boost) { - TermQuery tq = new TermQuery(t); // found a match + public boolean collect(BytesRef bytes, float boost) { + // add new TQ, we must clone the term, else it may get overwritten! 
+ TermQuery tq = new TermQuery(placeholderTerm.createTerm(new BytesRef(bytes))); tq.setBoost(query.getBoost() * boost); // set the boost result.add(tq, BooleanClause.Occur.SHOULD); // add to query return true; @@ -297,16 +294,16 @@ public abstract class MultiTermQuery extends Query { protected abstract Query getQuery(Term term); @Override - public Query rewrite(IndexReader reader, MultiTermQuery query) throws IOException { + public Query rewrite(final IndexReader reader, final MultiTermQuery query) throws IOException { final int maxSize = Math.min(size, BooleanQuery.getMaxClauseCount()); final PriorityQueue stQueue = new PriorityQueue(); collectTerms(reader, query, new TermCollector() { - public boolean collect(Term t, float boost) { + public boolean collect(BytesRef bytes, float boost) { // ignore uncompetetive hits if (stQueue.size() >= maxSize && boost <= stQueue.peek().boost) return true; - // add new entry in PQ - st.term = t; + // add new entry in PQ, we must clone the term, else it may get overwritten! + st.bytes.copy(bytes); st.boost = boost; stQueue.offer(st); // possibly drop entries from queue @@ -319,9 +316,11 @@ public abstract class MultiTermQuery extends Query { private ScoreTerm st = new ScoreTerm(); }); + final Term placeholderTerm = new Term(query.field); final BooleanQuery bq = new BooleanQuery(true); for (final ScoreTerm st : stQueue) { - Query tq = getQuery(st.term); // found a match + // add new query, we must clone the term, else it may get overwritten! 
+ Query tq = getQuery(placeholderTerm.createTerm(st.bytes)); tq.setBoost(query.getBoost() * st.boost); // set the boost bq.add(tq, BooleanClause.Occur.SHOULD); // add to query } @@ -348,12 +347,13 @@ public abstract class MultiTermQuery extends Query { } private static class ScoreTerm implements Comparable { - public Term term; + public final BytesRef bytes = new BytesRef(); public float boost; public int compareTo(ScoreTerm other) { if (this.boost == other.boost) - return other.term.compareTo(this.term); + // TODO: is it OK to use default compare here? + return other.bytes.compareTo(this.bytes); else return Float.compare(this.boost, other.boost); } @@ -530,58 +530,67 @@ public abstract class MultiTermQuery extends Query { final int docCountCutoff = (int) ((docCountPercent / 100.) * reader.maxDoc()); final int termCountLimit = Math.min(BooleanQuery.getMaxClauseCount(), termCountCutoff); - final CutOffTermCollector col = new CutOffTermCollector(reader, docCountCutoff, termCountLimit); + final CutOffTermCollector col = new CutOffTermCollector(reader, query.field, docCountCutoff, termCountLimit); collectTerms(reader, query, col); if (col.hasCutOff) { return CONSTANT_SCORE_FILTER_REWRITE.rewrite(reader, query); + } else if (col.termCount == 0) { + return new BooleanQuery(true); } else { - final Query result; - if (col.pendingTerms.isEmpty()) { - result = new BooleanQuery(true); - } else { - BooleanQuery bq = new BooleanQuery(true); - for(Term term : col.pendingTerms) { - TermQuery tq = new TermQuery(term); - bq.add(tq, BooleanClause.Occur.SHOULD); + final PagedBytes.Reader bytesReader = col.pendingTerms.freeze(false); + try { + final BooleanQuery bq = new BooleanQuery(true); + final Term placeholderTerm = new Term(query.field); + long start = col.startOffset; + for(int i = 0; i < col.termCount; i++) { + final BytesRef bytes = new BytesRef(); + start = bytesReader.fillUsingLengthPrefix3(bytes, start); + bq.add(new TermQuery(placeholderTerm.createTerm(bytes)), 
BooleanClause.Occur.SHOULD); } // Strip scores - result = new ConstantScoreQuery(new QueryWrapperFilter(bq)); + final Query result = new ConstantScoreQuery(new QueryWrapperFilter(bq)); result.setBoost(query.getBoost()); + query.incTotalNumberOfTerms(col.termCount); + return result; + } finally { + bytesReader.close(); } - query.incTotalNumberOfTerms(col.pendingTerms.size()); - return result; } } private static final class CutOffTermCollector extends TermCollector { - CutOffTermCollector(IndexReader reader, int docCountCutoff, int termCountLimit) { + CutOffTermCollector(IndexReader reader, String field, int docCountCutoff, int termCountLimit) { this.reader = reader; + this.field = field; this.docCountCutoff = docCountCutoff; this.termCountLimit = termCountLimit; } - public boolean collect(Term t, float boost) throws IOException { - pendingTerms.add(t); - if (pendingTerms.size() >= termCountLimit || docVisitCount >= docCountCutoff) { + public boolean collect(BytesRef bytes, float boost) throws IOException { + termCount++; + if (termCount >= termCountLimit || docVisitCount >= docCountCutoff) { hasCutOff = true; return false; } + pendingTerms.copyUsingLengthPrefix(bytes); // Loading the TermInfo from the terms dict here // should not be costly, because 1) the // query/filter will load the TermInfo when it // runs, and 2) the terms dict has a cache: - // @deprecated: in 4.0 use BytesRef for collectTerms() - docVisitCount += reader.docFreq(t); + docVisitCount += reader.docFreq(field, bytes); return true; } int docVisitCount = 0; boolean hasCutOff = false; + int termCount = 0; final IndexReader reader; + final String field; final int docCountCutoff, termCountLimit; - final ArrayList pendingTerms = new ArrayList(); + final PagedBytes pendingTerms = new PagedBytes(15); // max term size is 32 KiB + final long startOffset = pendingTerms.getPointer(); } @Override @@ -647,18 +656,7 @@ public abstract class MultiTermQuery extends Query { */ public MultiTermQuery(final String 
field) { this.field = field; - } - - /** - * Constructs a query matching terms that cannot be represented with a single - * Term. - * @deprecated Use {@link #MultiTermQuery(String)}, as the flex branch can - * only work on one field per terms enum. If you override - * {@link #getTermsEnum(IndexReader)}, you cannot use this ctor. - */ - @Deprecated - public MultiTermQuery() { - this(null); + assert field != null; } /** Returns the field name for this query */ diff --git a/lucene/src/java/org/apache/lucene/search/MultiTermQueryWrapperFilter.java b/lucene/src/java/org/apache/lucene/search/MultiTermQueryWrapperFilter.java index a6b5c87d59f..52c6840fc7a 100644 --- a/lucene/src/java/org/apache/lucene/search/MultiTermQueryWrapperFilter.java +++ b/lucene/src/java/org/apache/lucene/search/MultiTermQueryWrapperFilter.java @@ -106,10 +106,6 @@ public class MultiTermQueryWrapperFilter extends Filte */ @Override public DocIdSet getDocIdSet(IndexReader reader) throws IOException { - if (query.field == null) { - throw new NullPointerException("If you implement getTermsEnum(), you must specify a non-null field in the constructor of MultiTermQuery."); - } - final Fields fields = MultiFields.getFields(reader); if (fields == null) { // reader has no fields diff --git a/lucene/src/java/org/apache/lucene/search/PhraseQuery.java b/lucene/src/java/org/apache/lucene/search/PhraseQuery.java index f583174ba0a..1c498cdbe6e 100644 --- a/lucene/src/java/org/apache/lucene/search/PhraseQuery.java +++ b/lucene/src/java/org/apache/lucene/search/PhraseQuery.java @@ -184,15 +184,14 @@ public class PhraseQuery extends Query { final Bits delDocs = MultiFields.getDeletedDocs(reader); for (int i = 0; i < terms.size(); i++) { final Term t = terms.get(i); - final BytesRef text = new BytesRef(t.text()); DocsAndPositionsEnum postingsEnum = MultiFields.getTermPositionsEnum(reader, delDocs, t.field(), - text); + t.bytes()); // PhraseQuery on a field that did not index // positions. 
if (postingsEnum == null) { - if (MultiFields.getTermDocsEnum(reader, delDocs, t.field(), text) != null) { + if (MultiFields.getTermDocsEnum(reader, delDocs, t.field(), t.bytes()) != null) { // term does exist, but has no positions throw new IllegalStateException("field \"" + t.field() + "\" was indexed with Field.omitTermFreqAndPositions=true; cannot run PhraseQuery (term=" + t.text() + ")"); } else { @@ -200,7 +199,7 @@ public class PhraseQuery extends Query { return null; } } - postingsFreqs[i] = new PostingsAndFreq(postingsEnum, reader.docFreq(t.field(), text), positions.get(i).intValue()); + postingsFreqs[i] = new PostingsAndFreq(postingsEnum, reader.docFreq(t.field(), t.bytes()), positions.get(i).intValue()); } // sort by increasing docFreq order diff --git a/lucene/src/java/org/apache/lucene/search/PrefixQuery.java b/lucene/src/java/org/apache/lucene/search/PrefixQuery.java index 5215a12155e..02ea9638903 100644 --- a/lucene/src/java/org/apache/lucene/search/PrefixQuery.java +++ b/lucene/src/java/org/apache/lucene/search/PrefixQuery.java @@ -46,7 +46,7 @@ public class PrefixQuery extends MultiTermQuery { @Override protected TermsEnum getTermsEnum(IndexReader reader) throws IOException { - if (prefix.text().length() == 0) { + if (prefix.bytes().length == 0) { // no prefix -- match all terms for this field: final Terms terms = MultiFields.getTerms(reader, getField()); return (terms != null) ? 
terms.iterator() : TermsEnum.EMPTY; diff --git a/lucene/src/java/org/apache/lucene/search/PrefixTermsEnum.java b/lucene/src/java/org/apache/lucene/search/PrefixTermsEnum.java index 650dbd03edd..bf450e985a1 100644 --- a/lucene/src/java/org/apache/lucene/search/PrefixTermsEnum.java +++ b/lucene/src/java/org/apache/lucene/search/PrefixTermsEnum.java @@ -36,7 +36,7 @@ public class PrefixTermsEnum extends FilteredTermsEnum { public PrefixTermsEnum(IndexReader reader, Term prefix) throws IOException { super(reader, prefix.field()); - setInitialSeekTerm(prefixRef = new BytesRef(prefix.text())); + setInitialSeekTerm(prefixRef = prefix.bytes()); } @Override diff --git a/lucene/src/java/org/apache/lucene/search/QueryTermVector.java b/lucene/src/java/org/apache/lucene/search/QueryTermVector.java index fcb9f37f048..bbf64b977d5 100644 --- a/lucene/src/java/org/apache/lucene/search/QueryTermVector.java +++ b/lucene/src/java/org/apache/lucene/search/QueryTermVector.java @@ -29,14 +29,16 @@ import java.util.Map; import org.apache.lucene.analysis.Analyzer; import org.apache.lucene.analysis.TokenStream; import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; +import org.apache.lucene.analysis.tokenattributes.TermToBytesRefAttribute; import org.apache.lucene.index.TermFreqVector; +import org.apache.lucene.util.BytesRef; /** * * **/ public class QueryTermVector implements TermFreqVector { - private String [] terms = new String[0]; + private BytesRef [] terms = new BytesRef[0]; private int [] termFreqs = new int[0]; public String getField() { return null; } @@ -45,7 +47,7 @@ public class QueryTermVector implements TermFreqVector { * * @param queryTerms The original list of terms from the query, can contain duplicates */ - public QueryTermVector(String [] queryTerms) { + public QueryTermVector(BytesRef [] queryTerms) { processTerms(queryTerms); } @@ -56,35 +58,37 @@ public class QueryTermVector implements TermFreqVector { TokenStream stream = analyzer.tokenStream("", new 
StringReader(queryString)); if (stream != null) { - List terms = new ArrayList(); + List terms = new ArrayList(); try { boolean hasMoreTokens = false; stream.reset(); - final CharTermAttribute termAtt = stream.addAttribute(CharTermAttribute.class); + final TermToBytesRefAttribute termAtt = stream.getAttribute(TermToBytesRefAttribute.class); hasMoreTokens = stream.incrementToken(); while (hasMoreTokens) { - terms.add(termAtt.toString()); + BytesRef bytes = new BytesRef(); + termAtt.toBytesRef(bytes); + terms.add(bytes); hasMoreTokens = stream.incrementToken(); } - processTerms(terms.toArray(new String[terms.size()])); + processTerms(terms.toArray(new BytesRef[terms.size()])); } catch (IOException e) { } } } } - private void processTerms(String[] queryTerms) { + private void processTerms(BytesRef[] queryTerms) { if (queryTerms != null) { Arrays.sort(queryTerms); - Map tmpSet = new HashMap(queryTerms.length); + Map tmpSet = new HashMap(queryTerms.length); //filter out duplicates - List tmpList = new ArrayList(queryTerms.length); + List tmpList = new ArrayList(queryTerms.length); List tmpFreqs = new ArrayList(queryTerms.length); int j = 0; for (int i = 0; i < queryTerms.length; i++) { - String term = queryTerms[i]; + BytesRef term = queryTerms[i]; Integer position = tmpSet.get(term); if (position == null) { tmpSet.put(term, Integer.valueOf(j++)); @@ -112,7 +116,7 @@ public class QueryTermVector implements TermFreqVector { sb.append('{'); for (int i=0; i0) sb.append(", "); - sb.append(terms[i]).append('/').append(termFreqs[i]); + sb.append(terms[i].utf8ToString()).append('/').append(termFreqs[i]); } sb.append('}'); return sb.toString(); @@ -123,7 +127,7 @@ public class QueryTermVector implements TermFreqVector { return terms.length; } - public String[] getTerms() { + public BytesRef[] getTerms() { return terms; } @@ -131,12 +135,12 @@ public class QueryTermVector implements TermFreqVector { return termFreqs; } - public int indexOf(String term) { + public int 
indexOf(BytesRef term) { int res = Arrays.binarySearch(terms, term); return res >= 0 ? res : -1; } - public int[] indexesOf(String[] terms, int start, int len) { + public int[] indexesOf(BytesRef[] terms, int start, int len) { int res[] = new int[len]; for (int i=0; i < len; i++) { diff --git a/lucene/src/java/org/apache/lucene/search/SingleTermsEnum.java b/lucene/src/java/org/apache/lucene/search/SingleTermsEnum.java index 1e5acdae119..65318913270 100644 --- a/lucene/src/java/org/apache/lucene/search/SingleTermsEnum.java +++ b/lucene/src/java/org/apache/lucene/search/SingleTermsEnum.java @@ -41,7 +41,7 @@ public final class SingleTermsEnum extends FilteredTermsEnum { */ public SingleTermsEnum(IndexReader reader, Term singleTerm) throws IOException { super(reader, singleTerm.field()); - singleRef = new BytesRef(singleTerm.text()); + singleRef = singleTerm.bytes(); setInitialSeekTerm(singleRef); } diff --git a/lucene/src/java/org/apache/lucene/search/TermQuery.java b/lucene/src/java/org/apache/lucene/search/TermQuery.java index 7fdf3e82395..c55d0f163ba 100644 --- a/lucene/src/java/org/apache/lucene/search/TermQuery.java +++ b/lucene/src/java/org/apache/lucene/search/TermQuery.java @@ -75,7 +75,7 @@ public class TermQuery extends Query { public Scorer scorer(IndexReader reader, boolean scoreDocsInOrder, boolean topScorer) throws IOException { // NOTE: debateably, the caller should never pass in a // multi reader... 
- DocsEnum docs = MultiFields.getTermDocsEnum(reader, MultiFields.getDeletedDocs(reader), term.field(), new BytesRef(term.text())); + DocsEnum docs = MultiFields.getTermDocsEnum(reader, MultiFields.getDeletedDocs(reader), term.field(), term.bytes()); if (docs == null) { return null; } @@ -118,7 +118,7 @@ public class TermQuery extends Query { Explanation tfExplanation = new Explanation(); int tf = 0; - DocsEnum docs = reader.termDocsEnum(MultiFields.getDeletedDocs(reader), term.field(), new BytesRef(term.text())); + DocsEnum docs = reader.termDocsEnum(MultiFields.getDeletedDocs(reader), term.field(), term.bytes()); if (docs != null) { int newDoc = docs.advance(doc); if (newDoc == doc) { diff --git a/lucene/src/java/org/apache/lucene/search/spans/SpanTermQuery.java b/lucene/src/java/org/apache/lucene/search/spans/SpanTermQuery.java index 40d5a885639..9d69561b6f8 100644 --- a/lucene/src/java/org/apache/lucene/search/spans/SpanTermQuery.java +++ b/lucene/src/java/org/apache/lucene/search/spans/SpanTermQuery.java @@ -85,16 +85,15 @@ public class SpanTermQuery extends SpanQuery { public Spans getSpans(final IndexReader reader) throws IOException { // NOTE: debateably, the caller should never pass in a // multi reader... 
- final BytesRef textBytes = new BytesRef(term.text()); final DocsAndPositionsEnum postings = MultiFields.getTermPositionsEnum(reader, MultiFields.getDeletedDocs(reader), term.field(), - textBytes); + term.bytes()); if (postings != null) { return new TermSpans(postings, term); } else { - if (MultiFields.getTermDocsEnum(reader, MultiFields.getDeletedDocs(reader), term.field(), textBytes) != null) { + if (MultiFields.getTermDocsEnum(reader, MultiFields.getDeletedDocs(reader), term.field(), term.bytes()) != null) { // term does exist, but has no positions throw new IllegalStateException("field \"" + term.field() + "\" was indexed with Field.omitTermFreqAndPositions=true; cannot run SpanTermQuery (term=" + term.text() + ")"); } else { diff --git a/lucene/src/java/org/apache/lucene/util/BytesRef.java b/lucene/src/java/org/apache/lucene/util/BytesRef.java index d3e79be3073..ab0ef4e14ca 100644 --- a/lucene/src/java/org/apache/lucene/util/BytesRef.java +++ b/lucene/src/java/org/apache/lucene/util/BytesRef.java @@ -77,6 +77,16 @@ public final class BytesRef implements Comparable, Externalizable { this(); copy(text); } + + /** + * @param text Initialize the byte[] from the UTF8 bytes + * for the provided array. This must be well-formed + * unicode text, with no unpaired surrogates or U+FFFF. + */ + public BytesRef(char text[], int offset, int length) { + this(length * 4); + copy(text, offset, length); + } public BytesRef(BytesRef other) { this(); @@ -106,6 +116,15 @@ public final class BytesRef implements Comparable, Externalizable { UnicodeUtil.UTF16toUTF8(text, 0, text.length(), this); } + /** + * Copies the UTF8 bytes for this string. + * + * @param text Must be well-formed unicode text, with no + * unpaired surrogates or invalid UTF16 code units. 
+ */ + public void copy(char text[], int offset, int length) { + UnicodeUtil.UTF16toUTF8(text, offset, length, this); + } public boolean bytesEquals(BytesRef other) { if (length == other.length) { int otherUpto = other.offset; @@ -277,6 +296,62 @@ public final class BytesRef implements Comparable, Externalizable { } } + private final static Comparator utf8SortedAsUTF16SortOrder = new UTF8SortedAsUTF16Comparator(); + + public static Comparator getUTF8SortedAsUTF16Comparator() { + return utf8SortedAsUTF16SortOrder; + } + + private static class UTF8SortedAsUTF16Comparator implements Comparator { + // Only singleton + private UTF8SortedAsUTF16Comparator() {}; + + public int compare(BytesRef a, BytesRef b) { + + final byte[] aBytes = a.bytes; + int aUpto = a.offset; + final byte[] bBytes = b.bytes; + int bUpto = b.offset; + + final int aStop; + if (a.length < b.length) { + aStop = aUpto + a.length; + } else { + aStop = aUpto + b.length; + } + + while(aUpto < aStop) { + int aByte = aBytes[aUpto++] & 0xff; + int bByte = bBytes[bUpto++] & 0xff; + + if (aByte != bByte) { + + // See http://icu-project.org/docs/papers/utf16_code_point_order.html#utf-8-in-utf-16-order + + // We know the terms are not equal, but, we may + // have to carefully fixup the bytes at the + // difference to match UTF16's sort order: + if (aByte >= 0xee && bByte >= 0xee) { + if ((aByte & 0xfe) == 0xee) { + aByte += 0x10; + } + if ((bByte&0xfe) == 0xee) { + bByte += 0x10; + } + } + return aByte - bByte; + } + } + + // One is a prefix of the other, or, they are equal: + return a.length - b.length; + } + + public boolean equals(Object other) { + return this == other; + } + } + public void writeExternal(ObjectOutput out) throws IOException { diff --git a/lucene/src/java/org/apache/lucene/util/PagedBytes.java b/lucene/src/java/org/apache/lucene/util/PagedBytes.java index 1d830fde141..ce920d4b455 100644 --- a/lucene/src/java/org/apache/lucene/util/PagedBytes.java +++ 
b/lucene/src/java/org/apache/lucene/util/PagedBytes.java @@ -125,6 +125,26 @@ public final class PagedBytes { return index; } + /** @lucene.internal Reads length as 1 or 2 byte vInt prefix, starting @ start. + * Returns the start offset of the next part, suitable as start parameter on next call + * to sequentially read all BytesRefs. */ + public long fillUsingLengthPrefix3(BytesRef b, long start) { + final int index = (int) (start >> blockBits); + final int offset = (int) (start & blockMask); + final byte[] block = b.bytes = blocks[index]; + + if ((block[offset] & 128) == 0) { + b.length = block[offset]; + b.offset = offset+1; + start += 1L + b.length; + } else { + b.length = (((int) (block[offset] & 0x7f)) << 8) | (block[1+offset] & 0xff); + b.offset = offset+2; + start += 2L + b.length; + assert b.length > 0; + } + return start; + } /** @lucene.internal */ public byte[][] getBlocks() { @@ -230,7 +250,7 @@ public final class PagedBytes { /** Commits final byte[], trimming it if necessary and if trim=true */ public Reader freeze(boolean trim) { - if (upto < blockSize) { + if (trim && upto < blockSize) { final byte[] newBlock = new byte[upto]; System.arraycopy(currentBlock, 0, newBlock, 0, upto); currentBlock = newBlock; diff --git a/lucene/src/test/org/apache/lucene/index/TestAddIndexes.java b/lucene/src/test/org/apache/lucene/index/TestAddIndexes.java index cd1d329b517..bcf0e56c2eb 100755 --- a/lucene/src/test/org/apache/lucene/index/TestAddIndexes.java +++ b/lucene/src/test/org/apache/lucene/index/TestAddIndexes.java @@ -464,7 +464,7 @@ public class TestAddIndexes extends LuceneTestCase { private void verifyTermDocs(Directory dir, Term term, int numDocs) throws IOException { IndexReader reader = IndexReader.open(dir, true); - DocsEnum docsEnum = MultiFields.getTermDocsEnum(reader, null, term.field, new BytesRef(term.text)); + DocsEnum docsEnum = MultiFields.getTermDocsEnum(reader, null, term.field, term.bytes); int count = 0; while (docsEnum.nextDoc() != 
DocIdSetIterator.NO_MORE_DOCS) count++; diff --git a/lucene/src/test/org/apache/lucene/index/TestPayloads.java b/lucene/src/test/org/apache/lucene/index/TestPayloads.java index 4de508a1c61..a8882e11635 100644 --- a/lucene/src/test/org/apache/lucene/index/TestPayloads.java +++ b/lucene/src/test/org/apache/lucene/index/TestPayloads.java @@ -188,7 +188,7 @@ public class TestPayloads extends LuceneTestCase { Term[] terms = generateTerms(fieldName, numTerms); StringBuilder sb = new StringBuilder(); for (int i = 0; i < terms.length; i++) { - sb.append(terms[i].text); + sb.append(terms[i].text()); sb.append(" "); } String content = sb.toString(); diff --git a/lucene/src/test/org/apache/lucene/index/TestPositionBasedTermVectorMapper.java b/lucene/src/test/org/apache/lucene/index/TestPositionBasedTermVectorMapper.java index 9dc45ac197c..62cf45a488d 100644 --- a/lucene/src/test/org/apache/lucene/index/TestPositionBasedTermVectorMapper.java +++ b/lucene/src/test/org/apache/lucene/index/TestPositionBasedTermVectorMapper.java @@ -15,6 +15,7 @@ package org.apache.lucene.index; * limitations under the License. 
*/ +import org.apache.lucene.util.BytesRef; import org.apache.lucene.util.LuceneTestCase; import java.io.IOException; @@ -65,7 +66,7 @@ public class TestPositionBasedTermVectorMapper extends LuceneTestCase { //Test single position for (int i = 0; i < tokens.length; i++) { String token = tokens[i]; - mapper.map(token, 1, null, thePositions[i]); + mapper.map(new BytesRef(token), 1, null, thePositions[i]); } Map> map = mapper.getFieldToTerms(); diff --git a/lucene/src/test/org/apache/lucene/index/TestSegmentMerger.java b/lucene/src/test/org/apache/lucene/index/TestSegmentMerger.java index 4a53073d3fc..29c9d611c5b 100644 --- a/lucene/src/test/org/apache/lucene/index/TestSegmentMerger.java +++ b/lucene/src/test/org/apache/lucene/index/TestSegmentMerger.java @@ -100,7 +100,7 @@ public class TestSegmentMerger extends LuceneTestCase { TermFreqVector vector = mergedReader.getTermFreqVector(0, DocHelper.TEXT_FIELD_2_KEY); assertTrue(vector != null); - String [] terms = vector.getTerms(); + BytesRef [] terms = vector.getTerms(); assertTrue(terms != null); //System.out.println("Terms size: " + terms.length); assertTrue(terms.length == 3); @@ -110,7 +110,7 @@ public class TestSegmentMerger extends LuceneTestCase { assertTrue(vector instanceof TermPositionVector == true); for (int i = 0; i < terms.length; i++) { - String term = terms[i]; + String term = terms[i].utf8ToString(); int freq = freqs[i]; //System.out.println("Term: " + term + " Freq: " + freq); assertTrue(DocHelper.FIELD_2_TEXT.indexOf(term) != -1); diff --git a/lucene/src/test/org/apache/lucene/index/TestSegmentReader.java b/lucene/src/test/org/apache/lucene/index/TestSegmentReader.java index 05398171945..21cbff36820 100644 --- a/lucene/src/test/org/apache/lucene/index/TestSegmentReader.java +++ b/lucene/src/test/org/apache/lucene/index/TestSegmentReader.java @@ -192,11 +192,11 @@ public class TestSegmentReader extends LuceneTestCase { public void testTermVectors() throws IOException { TermFreqVector result = 
reader.getTermFreqVector(0, DocHelper.TEXT_FIELD_2_KEY); assertTrue(result != null); - String [] terms = result.getTerms(); + BytesRef [] terms = result.getTerms(); int [] freqs = result.getTermFrequencies(); assertTrue(terms != null && terms.length == 3 && freqs != null && freqs.length == 3); for (int i = 0; i < terms.length; i++) { - String term = terms[i]; + String term = terms[i].utf8ToString(); int freq = freqs[i]; assertTrue(DocHelper.FIELD_2_TEXT.indexOf(term) != -1); assertTrue(freq > 0); diff --git a/lucene/src/test/org/apache/lucene/index/TestStressIndexing2.java b/lucene/src/test/org/apache/lucene/index/TestStressIndexing2.java index 217abe53df3..d42e792053f 100644 --- a/lucene/src/test/org/apache/lucene/index/TestStressIndexing2.java +++ b/lucene/src/test/org/apache/lucene/index/TestStressIndexing2.java @@ -516,8 +516,8 @@ public class TestStressIndexing2 extends MultiCodecTestCase { System.out.println("v1=" + v1 + " v2=" + v2 + " i=" + i + " of " + d1.length); assertEquals(v1.size(), v2.size()); int numTerms = v1.size(); - String[] terms1 = v1.getTerms(); - String[] terms2 = v2.getTerms(); + BytesRef[] terms1 = v1.getTerms(); + BytesRef[] terms2 = v2.getTerms(); int[] freq1 = v1.getTermFrequencies(); int[] freq2 = v2.getTermFrequencies(); for(int j=0;j { - String field; - BytesRef text; - - public FieldAndText(Term t) { - field = t.field(); - text = new BytesRef(t.text()); - } - - public int compareTo(FieldAndText other) { - if (other.field == field) { - return text.compareTo(other.text); - } else { - return field.compareTo(other.field); - } - } - } - // chooses from a very limited alphabet to exacerbate the // surrogate seeking required private static String makeDifficultRandomUnicodeString(Random r) { @@ -76,7 +57,7 @@ public class TestSurrogates extends LuceneTestCaseJ4 { return new String(buffer, 0, end); } - private SegmentInfo makePreFlexSegment(Random r, String segName, Directory dir, FieldInfos fieldInfos, Codec codec, List fieldTerms) throws 
IOException { + private SegmentInfo makePreFlexSegment(Random r, String segName, Directory dir, FieldInfos fieldInfos, Codec codec, List fieldTerms) throws IOException { final int numField = _TestUtil.nextInt(r, 2, 5); @@ -110,11 +91,14 @@ public class TestSurrogates extends LuceneTestCaseJ4 { fieldInfos.write(dir, segName); // sorts in UTF16 order, just like preflex: - Collections.sort(terms); + Collections.sort(terms, new Comparator() { + public int compare(Term o1, Term o2) { + return o1.compareToUTF16(o2); + } + }); TermInfosWriter w = new TermInfosWriter(dir, segName, fieldInfos, 128); TermInfo ti = new TermInfo(); - BytesRef utf8 = new BytesRef(10); String lastText = null; int uniqueTermCount = 0; if (VERBOSE) { @@ -127,23 +111,22 @@ public class TestSurrogates extends LuceneTestCaseJ4 { if (lastText != null && lastText.equals(text)) { continue; } - fieldTerms.add(new FieldAndText(t)); + fieldTerms.add(t); uniqueTermCount++; lastText = text; - UnicodeUtil.UTF16toUTF8(text, 0, text.length(), utf8); if (VERBOSE) { System.out.println(" " + toHexString(t)); } - w.add(fi.number, utf8.bytes, utf8.length, ti); + w.add(fi.number, t.bytes().bytes, t.bytes().length, ti); } w.close(); Collections.sort(fieldTerms); if (VERBOSE) { System.out.println("\nTEST: codepoint order"); - for(FieldAndText t: fieldTerms) { - System.out.println(" " + t.field + ":" + UnicodeUtil.toHexString(t.text.utf8ToString())); + for(Term t: fieldTerms) { + System.out.println(" " + t.field() + ":" + toHexString(t)); } } @@ -166,7 +149,7 @@ public class TestSurrogates extends LuceneTestCaseJ4 { Random r = newRandom(); FieldInfos fieldInfos = new FieldInfos(); - List fieldTerms = new ArrayList(); + List fieldTerms = new ArrayList(); SegmentInfo si = makePreFlexSegment(r, "_0", dir, fieldInfos, codec, fieldTerms); // hack alert!! 
@@ -188,8 +171,8 @@ public class TestSurrogates extends LuceneTestCaseJ4 { BytesRef text; BytesRef lastText = null; while((text = termsEnum.next()) != null) { - UnicodeUtil.UTF8toUTF16(text.bytes, text.offset, text.length, utf16); if (VERBOSE) { + UnicodeUtil.UTF8toUTF16(text.bytes, text.offset, text.length, utf16); System.out.println("got term=" + field + ":" + UnicodeUtil.toHexString(new String(utf16.result, 0, utf16.length))); System.out.println(); } @@ -199,8 +182,8 @@ public class TestSurrogates extends LuceneTestCaseJ4 { assertTrue(lastText.compareTo(text) < 0); lastText.copy(text); } - assertEquals(fieldTerms.get(termCount).field, field); - assertEquals(fieldTerms.get(termCount).text, text); + assertEquals(fieldTerms.get(termCount).field(), field); + assertEquals(fieldTerms.get(termCount).bytes(), text); termCount++; } if (VERBOSE) { diff --git a/lucene/src/test/org/apache/lucene/search/TestMultiThreadTermVectors.java b/lucene/src/test/org/apache/lucene/search/TestMultiThreadTermVectors.java index 73f54aaaaac..4afccc02c4f 100644 --- a/lucene/src/test/org/apache/lucene/search/TestMultiThreadTermVectors.java +++ b/lucene/src/test/org/apache/lucene/search/TestMultiThreadTermVectors.java @@ -17,6 +17,7 @@ package org.apache.lucene.search; * limitations under the License. 
*/ +import org.apache.lucene.util.BytesRef; import org.apache.lucene.util.LuceneTestCase; import org.apache.lucene.analysis.MockAnalyzer; import org.apache.lucene.document.*; @@ -175,11 +176,11 @@ class MultiThreadTermVectorsReader implements Runnable { private void verifyVectors(TermFreqVector[] vectors, int num) { StringBuilder temp = new StringBuilder(); - String[] terms = null; + BytesRef[] terms = null; for (int i = 0; i < vectors.length; i++) { terms = vectors[i].getTerms(); for (int z = 0; z < terms.length; z++) { - temp.append(terms[z]); + temp.append(terms[z].utf8ToString()); } } diff --git a/lucene/src/test/org/apache/lucene/search/TestQueryTermVector.java b/lucene/src/test/org/apache/lucene/search/TestQueryTermVector.java index 3ff352751bd..18e9464bbbe 100644 --- a/lucene/src/test/org/apache/lucene/search/TestQueryTermVector.java +++ b/lucene/src/test/org/apache/lucene/search/TestQueryTermVector.java @@ -17,6 +17,7 @@ package org.apache.lucene.search; * limitations under the License. 
*/ +import org.apache.lucene.util.BytesRef; import org.apache.lucene.util.LuceneTestCase; import org.apache.lucene.analysis.MockAnalyzer; @@ -28,12 +29,14 @@ public class TestQueryTermVector extends LuceneTestCase { } public void testConstructor() { - String [] queryTerm = {"foo", "bar", "foo", "again", "foo", "bar", "go", "go", "go"}; + BytesRef [] queryTerm = {new BytesRef("foo"), new BytesRef("bar"), new BytesRef("foo"), + new BytesRef("again"), new BytesRef("foo"), new BytesRef("bar"), new BytesRef("go"), + new BytesRef("go"), new BytesRef("go")}; //Items are sorted lexicographically - String [] gold = {"again", "bar", "foo", "go"}; + BytesRef [] gold = {new BytesRef("again"), new BytesRef("bar"), new BytesRef("foo"), new BytesRef("go")}; int [] goldFreqs = {1, 2, 3, 3}; QueryTermVector result = new QueryTermVector(queryTerm); - String [] terms = result.getTerms(); + BytesRef [] terms = result.getTerms(); assertTrue(terms.length == 4); int [] freq = result.getTermFrequencies(); assertTrue(freq.length == 4); @@ -49,7 +52,7 @@ public class TestQueryTermVector extends LuceneTestCase { checkGold(terms, gold, freq, goldFreqs); } - private void checkGold(String[] terms, String[] gold, int[] freq, int[] goldFreqs) { + private void checkGold(BytesRef[] terms, BytesRef[] gold, int[] freq, int[] goldFreqs) { for (int i = 0; i < terms.length; i++) { assertTrue(terms[i].equals(gold[i])); assertTrue(freq[i] == goldFreqs[i]); diff --git a/lucene/src/test/org/apache/lucene/search/TestTermVectors.java b/lucene/src/test/org/apache/lucene/search/TestTermVectors.java index 3d5249efb53..6c11a280bc1 100644 --- a/lucene/src/test/org/apache/lucene/search/TestTermVectors.java +++ b/lucene/src/test/org/apache/lucene/search/TestTermVectors.java @@ -17,6 +17,7 @@ package org.apache.lucene.search; * limitations under the License. 
*/ +import org.apache.lucene.util.BytesRef; import org.apache.lucene.util.LuceneTestCase; import org.apache.lucene.analysis.MockAnalyzer; import org.apache.lucene.analysis.MockTokenizer; @@ -123,11 +124,11 @@ public class TestTermVectors extends LuceneTestCase { for(int i=0;i 0); for (int j = 0; j < terms.length; j++) { @@ -184,7 +185,7 @@ public class TestTermVectors extends LuceneTestCase { } catch(ClassCastException ignore){ TermFreqVector freqVec = vector[0]; - String [] terms = freqVec.getTerms(); + BytesRef [] terms = freqVec.getTerms(); assertTrue(terms != null && terms.length > 0); } @@ -277,11 +278,11 @@ public class TestTermVectors extends LuceneTestCase { //float coord = sim.coord() //System.out.println("TF: " + tf + " IDF: " + idf + " LenNorm: " + lNorm); assertTrue(vector != null); - String[] vTerms = vector.getTerms(); + BytesRef[] vTerms = vector.getTerms(); int [] freqs = vector.getTermFrequencies(); for (int i = 0; i < vTerms.length; i++) { - if (text.equals(vTerms[i])) + if (text.equals(vTerms[i].utf8ToString())) { assertTrue(freqs[i] == freq); } @@ -306,11 +307,11 @@ public class TestTermVectors extends LuceneTestCase { TermFreqVector vector = knownSearcher.reader.getTermFreqVector(hits[1].doc, "field"); assertTrue(vector != null); //System.out.println("Vector: " + vector); - String[] terms = vector.getTerms(); + BytesRef[] terms = vector.getTerms(); int [] freqs = vector.getTermFrequencies(); assertTrue(terms != null && terms.length == 10); for (int i = 0; i < terms.length; i++) { - String term = terms[i]; + String term = terms[i].utf8ToString(); //System.out.println("Term: " + term); int freq = freqs[i]; assertTrue(test4.indexOf(term) != -1); @@ -327,7 +328,7 @@ public class TestTermVectors extends LuceneTestCase { if (tve != null && last != null) { assertTrue("terms are not properly sorted", last.getFrequency() >= tve.getFrequency()); - Integer expectedFreq = test4Map.get(tve.getTerm()); + Integer expectedFreq = 
test4Map.get(tve.getTerm().utf8ToString()); //we expect double the expectedFreq, since there are two fields with the exact same text and we are collapsing all fields assertTrue("Frequency is not correct:", tve.getFrequency() == 2*expectedFreq.intValue()); } @@ -421,9 +422,9 @@ public class TestTermVectors extends LuceneTestCase { assertTrue(vector.length == 1); TermPositionVector tfv = (TermPositionVector) vector[0]; assertTrue(tfv.getField().equals("field")); - String[] terms = tfv.getTerms(); + BytesRef[] terms = tfv.getTerms(); assertEquals(1, terms.length); - assertEquals(terms[0], "one"); + assertEquals(terms[0].utf8ToString(), "one"); assertEquals(5, tfv.getTermFrequencies()[0]); int[] positions = tfv.getTermPositions(0); @@ -447,7 +448,7 @@ public class TestTermVectors extends LuceneTestCase { } @Override - public void map(String term, int frequency, TermVectorOffsetInfo[] offsets, int[] positions) { + public void map(BytesRef term, int frequency, TermVectorOffsetInfo[] offsets, int[] positions) { } } diff --git a/solr/src/java/org/apache/solr/handler/admin/LukeRequestHandler.java b/solr/src/java/org/apache/solr/handler/admin/LukeRequestHandler.java index cf7169b4c27..0ccd0ae757a 100644 --- a/solr/src/java/org/apache/solr/handler/admin/LukeRequestHandler.java +++ b/solr/src/java/org/apache/solr/handler/admin/LukeRequestHandler.java @@ -265,7 +265,7 @@ public class LukeRequestHandler extends RequestHandlerBase if( v != null ) { SimpleOrderedMap tfv = new SimpleOrderedMap(); for( int i=0; i= startTerm && tt.termNum < endTerm) { - counts[tt.termNum] = searcher.numDocs(new TermQuery(new Term(ti.field, tt.term.utf8ToString())), docs); + counts[tt.termNum] = searcher.numDocs(new TermQuery(new Term(ti.field, tt.term)), docs); } } @@ -712,7 +712,7 @@ public class UnInvertedField { for (TopTerm tt : bigTerms.values()) { // TODO: counts could be deferred if sorted==false if (tt.termNum >= 0 && tt.termNum < numTermsInField) { - final Term t = new Term(ti.field, 
tt.term.utf8ToString()); + final Term t = new Term(ti.field, tt.term); if (finfo.length == 0) { counts[tt.termNum] = searcher.numDocs(new TermQuery(t), docs); } else { diff --git a/solr/src/java/org/apache/solr/search/SolrIndexSearcher.java b/solr/src/java/org/apache/solr/search/SolrIndexSearcher.java index cda252ed242..75ef22f6fd1 100644 --- a/solr/src/java/org/apache/solr/search/SolrIndexSearcher.java +++ b/solr/src/java/org/apache/solr/search/SolrIndexSearcher.java @@ -480,7 +480,7 @@ public class SolrIndexSearcher extends IndexSearcher implements SolrInfoMBean { if (fields == null) return -1; Terms terms = fields.terms(t.field()); if (terms == null) return -1; - BytesRef termBytes = new BytesRef(t.text()); + BytesRef termBytes = t.bytes(); DocsEnum docs = terms.docs(MultiFields.getDeletedDocs(reader), termBytes, null); if (docs == null) return -1; int id = docs.nextDoc(); @@ -754,7 +754,7 @@ public class SolrIndexSearcher extends IndexSearcher implements SolrInfoMBean { Fields fields = sir.fields(); Terms terms = fields.terms(t.field()); - BytesRef termBytes = new BytesRef(t.text()); + BytesRef termBytes = t.bytes(); Bits skipDocs = sir.getDeletedDocs(); DocsEnum docsEnum = terms==null ? null : terms.docs(skipDocs, termBytes, null); diff --git a/solr/src/java/org/apache/solr/update/DirectUpdateHandler.java b/solr/src/java/org/apache/solr/update/DirectUpdateHandler.java index 9a3f8e0d995..69b60d322cf 100644 --- a/solr/src/java/org/apache/solr/update/DirectUpdateHandler.java +++ b/solr/src/java/org/apache/solr/update/DirectUpdateHandler.java @@ -118,7 +118,7 @@ public class DirectUpdateHandler extends UpdateHandler { DocsEnum tdocs = MultiFields.getTermDocsEnum(ir, MultiFields.getDeletedDocs(ir), idTerm.field(), - new BytesRef(idTerm.text())); + idTerm.bytes()); if (tdocs != null) { return tdocs.nextDoc() != DocsEnum.NO_MORE_DOCS; } else {