mirror of https://github.com/apache/lucene.git
added getPositionIncrementGap(String fieldName) to Analyzer
git-svn-id: https://svn.apache.org/repos/asf/lucene/java/trunk@348060 13f79535-47bb-0310-9956-ffa450edef68
parent a4c714d9d5
commit c958e10770

src/java/org/apache/lucene/analysis/Analyzer.java
@@ -49,5 +49,23 @@ public abstract class Analyzer {
   {
     return tokenStream(null, reader);
   }
 
+  /**
+   * Invoked by DocumentWriter, before indexing a Field instance, if
+   * terms have already been added to that field. This allows custom
+   * analyzers to place an automatic position increment gap between
+   * Field instances using the same field name. The default
+   * position increment gap is 0. With a 0 position increment gap and
+   * the typical default token position increment of 1, all terms in a field,
+   * including across Field instances, are in successive positions, allowing
+   * exact PhraseQuery matches, for instance, across Field instance boundaries.
+   *
+   * @param fieldName Field name being indexed.
+   * @return position increment gap, added to the next token emitted from {@link #tokenStream(String,Reader)}
+   */
+  public int getPositionIncrementGap(String fieldName)
+  {
+    return 0;
+  }
+
 }
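
The intended use is to override the hook in a custom Analyzer, as the tests in this commit do. A minimal sketch (the 100-position gap is an arbitrary illustrative value):

    // Sketch only: a whitespace analyzer that leaves a 100-position gap
    // between Field instances sharing a field name, so that PhraseQuery
    // cannot silently match across instance boundaries.
    Analyzer analyzer = new Analyzer() {
      public TokenStream tokenStream(String fieldName, Reader reader) {
        return new WhitespaceTokenizer(reader);
      }

      public int getPositionIncrementGap(String fieldName) {
        return 100;   // default is 0, i.e. successive positions
      }
    };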

src/java/org/apache/lucene/index/DocumentWriter.java
@@ -135,6 +135,7 @@ final class DocumentWriter {
 
       int length = fieldLengths[fieldNumber];     // length of field
       int position = fieldPositions[fieldNumber]; // position in field
+      if (length>0) position+=analyzer.getPositionIncrementGap(fieldName);
       int offset = fieldOffsets[fieldNumber];     // offset field
 
       if (field.isIndexed()) {
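
Note the length>0 guard: the gap is applied only once the field already contains terms, i.e. before the second and later Field instances with the same name, never before the first. With the default token position increment of 1, the first token of the next instance lands at lastPosition + gap + 1:

    // Worked example with getPositionIncrementGap() == 500 and two
    // instances "repeated one" / "repeated two" of the same field:
    //   first instance:   "repeated" -> 0, "one" -> 1
    //   gap applied:      position = 1 + 500 = 501
    //   second instance:  "repeated" -> 501 + 1 = 502, "two" -> 503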

src/test/org/apache/lucene/index/DocHelper.java
@@ -1,12 +1,19 @@
 package org.apache.lucene.index;
 
 /**
- * Created by IntelliJ IDEA.
- * User: Grant Ingersoll
- * Date: Feb 2, 2004
- * Time: 6:16:12 PM
- * $Id$
- * Copyright 2004. Center For Natural Language Processing
+ * Copyright 2005 The Apache Software Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
 */
 
 import org.apache.lucene.analysis.Analyzer;

@@ -21,10 +28,6 @@ import java.util.HashMap;
 import java.util.Map;
 import java.util.Enumeration;
 
-/**
- *
- *
- **/
 class DocHelper {
   public static final String FIELD_1_TEXT = "field one text";
   public static final String TEXT_FIELD_1_KEY = "textField1";

@@ -68,6 +71,14 @@ class DocHelper {
   public static Field unStoredField2 = new Field(UNSTORED_FIELD_2_KEY, UNSTORED_2_FIELD_TEXT,
       Field.Store.NO, Field.Index.TOKENIZED, Field.TermVector.YES);
 
+  public static final String REPEATED_1_TEXT = "repeated one";
+  public static final String REPEATED_KEY = "repeated";
+  public static Field repeatedField1 = new Field(REPEATED_KEY, REPEATED_1_TEXT,
+      Field.Store.YES, Field.Index.TOKENIZED, Field.TermVector.NO);
+  public static final String REPEATED_2_TEXT = "repeated two";
+  public static Field repeatedField2 = new Field(REPEATED_KEY, REPEATED_2_TEXT,
+      Field.Store.YES, Field.Index.TOKENIZED, Field.TermVector.NO);
+
   public static Map nameValues = null;
 
   // ordered list of all the fields...

@@ -81,6 +92,8 @@ class DocHelper {
     unIndField,
     unStoredField1,
     unStoredField2,
+    repeatedField1,
+    repeatedField2
   };
 
   // Map<String fieldName, Field field>

@@ -156,9 +169,8 @@ class DocHelper {
    */
   public static void writeDoc(Directory dir, String segment, Document doc) throws IOException
   {
-    Analyzer analyzer = new WhitespaceAnalyzer();
     Similarity similarity = Similarity.getDefault();
-    writeDoc(dir, analyzer, similarity, segment, doc);
+    writeDoc(dir, new WhitespaceAnalyzer(), similarity, segment, doc);
   }
 
   /**

src/test/org/apache/lucene/index/TestDocumentWriter.java
@@ -17,14 +17,16 @@ package org.apache.lucene.index;
  */
 
 import junit.framework.TestCase;
-import org.apache.lucene.store.RAMDirectory;
 import org.apache.lucene.analysis.Analyzer;
 import org.apache.lucene.analysis.WhitespaceAnalyzer;
-import org.apache.lucene.search.Similarity;
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.WhitespaceTokenizer;
 import org.apache.lucene.document.Document;
 import org.apache.lucene.document.Field;
+import org.apache.lucene.search.Similarity;
+import org.apache.lucene.store.RAMDirectory;
 
-import java.io.IOException;
+import java.io.Reader;
 
 public class TestDocumentWriter extends TestCase {
   private RAMDirectory dir = new RAMDirectory();

@@ -48,54 +50,62 @@ public class TestDocumentWriter extends TestCase {
 
   }
 
-  public void testAddDocument() {
-    Analyzer analyzer = new WhitespaceAnalyzer();
-    Similarity similarity = Similarity.getDefault();
-    DocumentWriter writer = new DocumentWriter(dir, analyzer, similarity, 50);
-    assertTrue(writer != null);
-    try {
-      String segName="test";
-      writer.addDocument(segName, testDoc);
-      //After adding the document, we should be able to read it back in
-      SegmentReader reader = SegmentReader.get(new SegmentInfo(segName, 1, dir));
-      assertTrue(reader != null);
-      Document doc = reader.document(0);
-      assertTrue(doc != null);
-
-      //System.out.println("Document: " + doc);
-      Field [] fields = doc.getFields("textField2");
-      assertTrue(fields != null && fields.length == 1);
-      assertTrue(fields[0].stringValue().equals(DocHelper.FIELD_2_TEXT));
-      assertTrue(fields[0].isTermVectorStored() == true);
-
-      fields = doc.getFields("textField1");
-      assertTrue(fields != null && fields.length == 1);
-      assertTrue(fields[0].stringValue().equals(DocHelper.FIELD_1_TEXT));
-      assertTrue(fields[0].isTermVectorStored() == false);
-
-      fields = doc.getFields("keyField");
-      assertTrue(fields != null && fields.length == 1);
-      assertTrue(fields[0].stringValue().equals(DocHelper.KEYWORD_TEXT));
-
-      fields = doc.getFields(DocHelper.NO_NORMS_KEY);
-      assertTrue(fields != null && fields.length == 1);
-      assertTrue(fields[0].stringValue().equals(DocHelper.NO_NORMS_TEXT));
-
-      fields = doc.getFields(DocHelper.TEXT_FIELD_3_KEY);
-      assertTrue(fields != null && fields.length == 1);
-      assertTrue(fields[0].stringValue().equals(DocHelper.FIELD_3_TEXT));
-
-      // test that the norm file is not present if omitNorms is true
-      for (int i=0; i<reader.fieldInfos.size(); i++) {
-        FieldInfo fi = reader.fieldInfos.fieldInfo(i);
-        if (fi.isIndexed) {
-          assertTrue(fi.omitNorms == !dir.fileExists(segName + ".f" + i));
-        }
-      }
-
-    } catch (IOException e) {
-      e.printStackTrace();
-      assertTrue(false);
-    }
+  public void testAddDocument() throws Exception {
+    Analyzer analyzer = new Analyzer() {
+      public TokenStream tokenStream(String fieldName, Reader reader) {
+        return new WhitespaceTokenizer(reader);
+      }
+
+      public int getPositionIncrementGap(String fieldName) {
+        return 500;
+      }
+    };
+    Similarity similarity = Similarity.getDefault();
+    DocumentWriter writer = new DocumentWriter(dir, analyzer, similarity, 50);
+    String segName = "test";
+    writer.addDocument(segName, testDoc);
+    //After adding the document, we should be able to read it back in
+    SegmentReader reader = SegmentReader.get(new SegmentInfo(segName, 1, dir));
+    assertTrue(reader != null);
+    Document doc = reader.document(0);
+    assertTrue(doc != null);
+
+    //System.out.println("Document: " + doc);
+    Field [] fields = doc.getFields("textField2");
+    assertTrue(fields != null && fields.length == 1);
+    assertTrue(fields[0].stringValue().equals(DocHelper.FIELD_2_TEXT));
+    assertTrue(fields[0].isTermVectorStored());
+
+    fields = doc.getFields("textField1");
+    assertTrue(fields != null && fields.length == 1);
+    assertTrue(fields[0].stringValue().equals(DocHelper.FIELD_1_TEXT));
+    assertFalse(fields[0].isTermVectorStored());
+
+    fields = doc.getFields("keyField");
+    assertTrue(fields != null && fields.length == 1);
+    assertTrue(fields[0].stringValue().equals(DocHelper.KEYWORD_TEXT));
+
+    fields = doc.getFields(DocHelper.NO_NORMS_KEY);
+    assertTrue(fields != null && fields.length == 1);
+    assertTrue(fields[0].stringValue().equals(DocHelper.NO_NORMS_TEXT));
+
+    fields = doc.getFields(DocHelper.TEXT_FIELD_3_KEY);
+    assertTrue(fields != null && fields.length == 1);
+    assertTrue(fields[0].stringValue().equals(DocHelper.FIELD_3_TEXT));
+
+    // test that the norm file is not present if omitNorms is true
+    for (int i = 0; i < reader.fieldInfos.size(); i++) {
+      FieldInfo fi = reader.fieldInfos.fieldInfo(i);
+      if (fi.isIndexed) {
+        assertTrue(fi.omitNorms == !dir.fileExists(segName + ".f" + i));
+      }
+    }
+
+    TermPositions termPositions = reader.termPositions(new Term(DocHelper.REPEATED_KEY, "repeated"));
+    assertTrue(termPositions.next());
+    int freq = termPositions.freq();
+    assertEquals(2, freq);
+    assertEquals(0, termPositions.nextPosition());
+    assertEquals(502, termPositions.nextPosition());
   }
 }
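
The expected positions follow from the gap arithmetic: with the test analyzer's 500-position gap, the second "repeated" token lands at 1 + 500 + 1 = 502, which is exactly what the new TermPositions assertions verify.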

src/test/org/apache/lucene/search/TestPhraseQuery.java
@@ -17,10 +17,14 @@ package org.apache.lucene.search;
  */
 
 import java.io.IOException;
+import java.io.Reader;
 
 import junit.framework.TestCase;
 import org.apache.lucene.analysis.WhitespaceAnalyzer;
 import org.apache.lucene.analysis.StopAnalyzer;
+import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.WhitespaceTokenizer;
 import org.apache.lucene.document.Document;
 import org.apache.lucene.document.Field;
 import org.apache.lucene.index.IndexWriter;

@@ -41,10 +45,22 @@ public class TestPhraseQuery extends TestCase {
 
   public void setUp() throws Exception {
     directory = new RAMDirectory();
-    IndexWriter writer = new IndexWriter(directory, new WhitespaceAnalyzer(), true);
+    Analyzer analyzer = new Analyzer() {
+      public TokenStream tokenStream(String fieldName, Reader reader) {
+        return new WhitespaceTokenizer(reader);
+      }
+
+      public int getPositionIncrementGap(String fieldName) {
+        return 100;
+      }
+    };
+    IndexWriter writer = new IndexWriter(directory, analyzer, true);
 
     Document doc = new Document();
     doc.add(new Field("field", "one two three four five", Field.Store.YES, Field.Index.TOKENIZED));
+    doc.add(new Field("repeated", "this is a repeated field - first part", Field.Store.YES, Field.Index.TOKENIZED));
+    Field repeatedField = new Field("repeated", "second part of a repeated field", Field.Store.YES, Field.Index.TOKENIZED);
+    doc.add(repeatedField);
     writer.addDocument(doc);
 
     writer.optimize();

@@ -294,4 +310,15 @@ public class TestPhraseQuery extends TestCase {
     assertEquals(2, hits.id(2));
   }
 
+  public void testWrappedPhrase() throws IOException {
+    query.add(new Term("repeated", "first"));
+    query.add(new Term("repeated", "part"));
+    query.add(new Term("repeated", "second"));
+    query.add(new Term("repeated", "part"));
+    query.setSlop(99);
+
+    Hits hits = searcher.search(query);
+    assertEquals(0, hits.length());
+  }
+
 }
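
Since the setUp analyzer reports a 100-position gap, the tokens of the second "repeated" instance start over 100 positions after those of the first; a slop of 99 is deliberately one short of spanning that boundary, so the wrapped phrase "first part second part" is expected to produce no hits.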