diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/custom/TestCustomAnalyzer.java b/lucene/analysis/common/src/test/org/apache/lucene/analysis/custom/TestCustomAnalyzer.java index d929bfd099e..d9ea43c2345 100644 --- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/custom/TestCustomAnalyzer.java +++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/custom/TestCustomAnalyzer.java @@ -107,7 +107,7 @@ public class TestCustomAnalyzer extends BaseTokenStreamTestCase { public void testFactoryHtmlStripClassicFolding() throws Exception { CustomAnalyzer a = CustomAnalyzer.builder() - .withDefaultMatchVersion(Version.LUCENE_6_0_0) + .withDefaultMatchVersion(Version.LUCENE_7_0_0) .addCharFilter(HTMLStripCharFilterFactory.class) .withTokenizer(ClassicTokenizerFactory.class) .addTokenFilter(ASCIIFoldingFilterFactory.class, "preserveOriginal", "true") @@ -126,7 +126,7 @@ public class TestCustomAnalyzer extends BaseTokenStreamTestCase { assertSame(LowerCaseFilterFactory.class, tokenFilters.get(1).getClass()); assertEquals(100, a.getPositionIncrementGap("dummy")); assertEquals(1000, a.getOffsetGap("dummy")); - assertSame(Version.LUCENE_6_0_0, a.getVersion()); + assertSame(Version.LUCENE_7_0_0, a.getVersion()); assertAnalyzesTo(a, "
foo bar
FOO BAR", new String[] { "foo", "bar", "foo", "bar" }, @@ -139,7 +139,7 @@ public class TestCustomAnalyzer extends BaseTokenStreamTestCase { public void testHtmlStripClassicFolding() throws Exception { CustomAnalyzer a = CustomAnalyzer.builder() - .withDefaultMatchVersion(Version.LUCENE_6_0_0) + .withDefaultMatchVersion(Version.LUCENE_7_0_0) .addCharFilter("htmlstrip") .withTokenizer("classic") .addTokenFilter("asciifolding", "preserveOriginal", "true") @@ -158,7 +158,7 @@ public class TestCustomAnalyzer extends BaseTokenStreamTestCase { assertSame(LowerCaseFilterFactory.class, tokenFilters.get(1).getClass()); assertEquals(100, a.getPositionIncrementGap("dummy")); assertEquals(1000, a.getOffsetGap("dummy")); - assertSame(Version.LUCENE_6_0_0, a.getVersion()); + assertSame(Version.LUCENE_7_0_0, a.getVersion()); assertAnalyzesTo(a, "foo bar
FOO BAR", new String[] { "foo", "bar", "foo", "bar" }, diff --git a/lucene/backward-codecs/src/java/org/apache/lucene/codecs/lucene50/Lucene50SegmentInfoFormat.java b/lucene/backward-codecs/src/java/org/apache/lucene/codecs/lucene50/Lucene50SegmentInfoFormat.java deleted file mode 100644 index d2a384e975f..00000000000 --- a/lucene/backward-codecs/src/java/org/apache/lucene/codecs/lucene50/Lucene50SegmentInfoFormat.java +++ /dev/null @@ -1,90 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.apache.lucene.codecs.lucene50; - - -import java.io.IOException; -import java.util.Map; -import java.util.Set; - -import org.apache.lucene.codecs.CodecUtil; -import org.apache.lucene.codecs.SegmentInfoFormat; -import org.apache.lucene.index.CorruptIndexException; -import org.apache.lucene.index.IndexFileNames; -import org.apache.lucene.index.SegmentInfo; // javadocs -import org.apache.lucene.store.ChecksumIndexInput; -import org.apache.lucene.store.Directory; -import org.apache.lucene.store.IOContext; -import org.apache.lucene.util.Version; - -/** - * Lucene 5.0 Segment info format. 
- * @deprecated Only for reading old 5.0-6.0 segments - */ -@Deprecated -public class Lucene50SegmentInfoFormat extends SegmentInfoFormat { - - /** Sole constructor. */ - public Lucene50SegmentInfoFormat() { - } - - @Override - public SegmentInfo read(Directory dir, String segment, byte[] segmentID, IOContext context) throws IOException { - final String fileName = IndexFileNames.segmentFileName(segment, "", Lucene50SegmentInfoFormat.SI_EXTENSION); - try (ChecksumIndexInput input = dir.openChecksumInput(fileName, context)) { - Throwable priorE = null; - SegmentInfo si = null; - try { - CodecUtil.checkIndexHeader(input, Lucene50SegmentInfoFormat.CODEC_NAME, - Lucene50SegmentInfoFormat.VERSION_START, - Lucene50SegmentInfoFormat.VERSION_CURRENT, - segmentID, ""); - final Version version = Version.fromBits(input.readInt(), input.readInt(), input.readInt()); - - final int docCount = input.readInt(); - if (docCount < 0) { - throw new CorruptIndexException("invalid docCount: " + docCount, input); - } - final boolean isCompoundFile = input.readByte() == SegmentInfo.YES; - - final Map- * Encodes normalization values by encoding each value with the minimum - * number of bytes needed to represent the range (which can be zero). - *
- * Files: - *
The Norms metadata or .nvm file.
- *For each norms field, this stores metadata, such as the offset into the - * Norms data (.nvd)
- *Norms metadata (.nvm) --> Header,<Entry>NumFields,Footer
- *FieldNumber of -1 indicates the end of metadata.
- *Offset is the pointer to the start of the data in the norms data (.nvd), or the singleton value - * when BytesPerValue = 0
- *The Norms data or .nvd file.
- *For each Norms field, this stores the actual per-document data (the heavy-lifting)
- *Norms data (.nvd) --> Header,< Data >NumFields,Footer
- *- * Encodes the five per-document value types (Numeric,Binary,Sorted,SortedSet,SortedNumeric) with these strategies: - *
- * {@link DocValuesType#NUMERIC NUMERIC}: - *
- * {@link DocValuesType#BINARY BINARY}: - *
- * {@link DocValuesType#SORTED SORTED}: - *
- * {@link DocValuesType#SORTED_SET SORTED_SET}: - *
- * {@link DocValuesType#SORTED_NUMERIC SORTED_NUMERIC}: - *
- * Files: - *
This document defines the index file formats used in this version of Lucene.
- * If you are using a different version of Lucene, please consult the copy of
- * docs/
that was distributed with
- * the version you are using.
Apache Lucene is written in Java, but several efforts are underway to write - * versions of - * Lucene in other programming languages. If these versions are to remain - * compatible with Apache Lucene, then a language-independent definition of the - * Lucene index format is required. This document thus attempts to provide a - * complete and independent definition of the Apache Lucene file formats.
- *As Lucene evolves, this document should evolve. Versions of Lucene in - * different programming languages should endeavor to agree on file formats, and - * generate new versions of this document.
- *The fundamental concepts in Lucene are index, document, field and term.
- *An index contains a sequence of documents.
- *The same sequence of bytes in two different fields is considered a different - * term. Thus terms are represented as a pair: the string naming the field, and the - * bytes within the field.
- * - *The index stores statistics about terms in order to make term-based search - * more efficient. Lucene's index falls into the family of indexes known as an - * inverted index. This is because it can list, for a term, the documents - * that contain it. This is the inverse of the natural relationship, in which - * documents list terms.
- * - *In Lucene, fields may be stored, in which case their text is stored - * in the index literally, in a non-inverted manner. Fields that are inverted are - * called indexed. A field may be both stored and indexed.
- *The text of a field may be tokenized into terms to be indexed, or the - * text of a field may be used literally as a term to be indexed. Most fields are - * tokenized, but sometimes it is useful for certain identifier fields to be - * indexed literally.
- *See the {@link org.apache.lucene.document.Field Field} - * Javadoc for more information on Fields.
- * - *Lucene indexes may be composed of multiple sub-indexes, or segments. - * Each segment is a fully independent index, which could be searched separately. - * Indexes evolve by:
- *Searches may involve multiple segments and/or multiple indexes, each index - * potentially composed of a set of segments.
- * - *Internally, Lucene refers to documents by an integer document number. - * The first document added to an index is numbered zero, and each subsequent - * document added gets a number one greater than the previous.
- *Note that a document's number may change, so caution should be taken when - * storing these numbers outside of Lucene. In particular, numbers may change in - * the following situations:
- *The numbers stored in each segment are unique only within the segment, and - * must be converted before they can be used in a larger context. The standard - * technique is to allocate each segment a range of values, based on the range of - * numbers used in that segment. To convert a document number from a segment to an - * external value, the segment's base document number is added. To convert - * an external value back to a segment-specific value, the segment is identified - * by the range that the external value is in, and the segment's base value is - * subtracted. For example two five document segments might be combined, so that - * the first segment has a base value of zero, and the second of five. Document - * three from the second segment would have an external value of eight.
- *When documents are deleted, gaps are created in the numbering. These are - * eventually removed as the index evolves through merging. Deleted documents are - * dropped when segments are merged. A freshly-merged segment thus has no gaps in - * its numbering.
- *Each segment index maintains the following:
- *Details on each of these are provided in their linked pages.
- *All files belonging to a segment have the same name with varying extensions. - * The extensions correspond to the different file formats described below. When - * using the Compound File format (default in 1.4 and greater) these files (except - * for the Segment info file, the Lock file, and Deleted documents file) are collapsed - * into a single .cfs file (see below for details).
- *Typically, all segments in an index are stored in a single directory, - * although this is not required.
- *As of version 2.1 (lock-less commits), file names are never re-used. - * That is, when any file is saved - * to the Directory it is given a never before used filename. This is achieved - * using a simple generations approach. For example, the first segments file is - * segments_1, then segments_2, etc. The generation is a sequential long integer - * represented in alpha-numeric (base 36) form.
- *The following table summarizes the names and extensions of the files in - * Lucene:
- *Name | - *Extension | - *Brief Description | - *
---|---|---|
{@link org.apache.lucene.index.SegmentInfos Segments File} | - *segments_N | - *Stores information about a commit point | - *
Lock File | - *write.lock | - *The Write lock prevents multiple IndexWriters from writing to the same - * file. | - *
{@link org.apache.lucene.codecs.lucene62.Lucene62SegmentInfoFormat Segment Info} | - *.si | - *Stores metadata about a segment | - *
{@link org.apache.lucene.codecs.lucene50.Lucene50CompoundFormat Compound File} | - *.cfs, .cfe | - *An optional "virtual" file consisting of all the other index files for - * systems that frequently run out of file handles. | - *
{@link org.apache.lucene.codecs.lucene50.Lucene50FieldInfosFormat Fields} | - *.fnm | - *Stores information about the fields | - *
{@link org.apache.lucene.codecs.lucene50.Lucene50StoredFieldsFormat Field Index} | - *.fdx | - *Contains pointers to field data | - *
{@link org.apache.lucene.codecs.lucene50.Lucene50StoredFieldsFormat Field Data} | - *.fdt | - *The stored fields for documents | - *
{@link org.apache.lucene.codecs.lucene50.Lucene50PostingsFormat Term Dictionary} | - *.tim | - *The term dictionary, stores term info | - *
{@link org.apache.lucene.codecs.lucene50.Lucene50PostingsFormat Term Index} | - *.tip | - *The index into the Term Dictionary | - *
{@link org.apache.lucene.codecs.lucene50.Lucene50PostingsFormat Frequencies} | - *.doc | - *Contains the list of docs which contain each term along with frequency | - *
{@link org.apache.lucene.codecs.lucene50.Lucene50PostingsFormat Positions} | - *.pos | - *Stores position information about where a term occurs in the index | - *
{@link org.apache.lucene.codecs.lucene50.Lucene50PostingsFormat Payloads} | - *.pay | - *Stores additional per-position metadata information such as character offsets and user payloads | - *
{@link org.apache.lucene.codecs.lucene53.Lucene53NormsFormat Norms} | - *.nvd, .nvm | - *Encodes length and boost factors for docs and fields | - *
{@link org.apache.lucene.codecs.lucene54.Lucene54DocValuesFormat Per-Document Values} | - *.dvd, .dvm | - *Encodes additional scoring factors or other per-document information. | - *
{@link org.apache.lucene.codecs.lucene50.Lucene50TermVectorsFormat Term Vector Index} | - *.tvx | - *Stores offset into the document data file | - *
{@link org.apache.lucene.codecs.lucene50.Lucene50TermVectorsFormat Term Vector Documents} | - *.tvd | - *Contains information about each document that has term vectors | - *
{@link org.apache.lucene.codecs.lucene50.Lucene50TermVectorsFormat Term Vector Fields} | - *.tvf | - *The field level info about term vectors | - *
{@link org.apache.lucene.codecs.lucene50.Lucene50LiveDocsFormat Live Documents} | - *.liv | - *Info about what documents are live | - *
Compatibility notes are provided in this document, describing how file - * formats have changed from prior versions:
- *Lucene uses a Java int
to refer to
- * document numbers, and the index file format uses an Int32
- * on-disk to store document numbers. This is a limitation
- * of both the index file format and the current implementation. Eventually these
- * should be replaced with either UInt64
values, or
- * better yet, {@link org.apache.lucene.store.DataOutput#writeVInt VInt} values which have no limit.