mirror of https://github.com/apache/lucene.git

commit 58e5ec6979

    merge trunk (1233476:1235908)

    git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/branches/lucene3661@1235919 13f79535-47bb-0310-9956-ffa450edef68
@@ -100,7 +100,7 @@
   <classpathentry kind="lib" path="modules/benchmark/lib/commons-digester-1.7.jar"/>
   <classpathentry kind="lib" path="modules/benchmark/lib/commons-logging-1.0.4.jar"/>
   <classpathentry kind="lib" path="modules/benchmark/lib/xercesImpl-2.9.1-patched-XERCESJ-1257.jar"/>
-  <classpathentry kind="lib" path="solr/lib/apache-solr-noggit-r1209632.jar"/>
+  <classpathentry kind="lib" path="solr/lib/apache-solr-noggit-r1211150.jar"/>
   <classpathentry kind="lib" path="solr/lib/commons-csv-1.0-SNAPSHOT-r966014.jar"/>
   <classpathentry kind="lib" path="solr/lib/commons-fileupload-1.2.1.jar"/>
   <classpathentry kind="lib" path="solr/lib/commons-httpclient-3.1.jar"/>
@@ -115,7 +115,7 @@
   <classpathentry kind="lib" path="solr/lib/slf4j-api-1.6.1.jar"/>
   <classpathentry kind="lib" path="solr/lib/slf4j-jdk14-1.6.1.jar"/>
   <classpathentry kind="lib" path="solr/lib/wstx-asl-3.2.7.jar"/>
-  <classpathentry kind="lib" path="solr/lib/zookeeper-3.3.3.jar"/>
+  <classpathentry kind="lib" path="solr/lib/zookeeper-3.3.4.jar"/>
   <classpathentry kind="lib" path="solr/example/lib/jetty-6.1.26-patched-JETTY-1340.jar"/>
   <classpathentry kind="lib" path="solr/example/lib/jetty-util-6.1.26-patched-JETTY-1340.jar"/>
   <classpathentry kind="lib" path="solr/example/lib/servlet-api-2.5-20081211.jar"/>
@@ -136,7 +136,7 @@
   <classpathentry kind="lib" path="solr/contrib/extraction/lib/bcmail-jdk15-1.45.jar"/>
   <classpathentry kind="lib" path="solr/contrib/extraction/lib/bcprov-jdk15-1.45.jar"/>
   <classpathentry kind="lib" path="solr/contrib/extraction/lib/boilerpipe-1.1.0.jar"/>
-  <classpathentry kind="lib" path="solr/contrib/extraction/lib/commons-compress-1.2.jar"/>
+  <classpathentry kind="lib" path="solr/contrib/extraction/lib/commons-compress-1.3.jar"/>
   <classpathentry kind="lib" path="solr/contrib/extraction/lib/dom4j-1.6.1.jar"/>
   <classpathentry kind="lib" path="solr/contrib/extraction/lib/fontbox-1.6.0.jar"/>
   <classpathentry kind="lib" path="solr/contrib/extraction/lib/jempbox-1.6.0.jar"/>
@@ -149,8 +149,8 @@
   <classpathentry kind="lib" path="solr/contrib/extraction/lib/poi-scratchpad-3.8-beta4.jar"/>
   <classpathentry kind="lib" path="solr/contrib/extraction/lib/rome-0.9.jar"/>
   <classpathentry kind="lib" path="solr/contrib/extraction/lib/tagsoup-1.2.1.jar"/>
-  <classpathentry kind="lib" path="solr/contrib/extraction/lib/tika-core-0.10.jar"/>
-  <classpathentry kind="lib" path="solr/contrib/extraction/lib/tika-parsers-0.10.jar"/>
+  <classpathentry kind="lib" path="solr/contrib/extraction/lib/tika-core-1.0.jar"/>
+  <classpathentry kind="lib" path="solr/contrib/extraction/lib/tika-parsers-1.0.jar"/>
   <classpathentry kind="lib" path="solr/contrib/extraction/lib/xmlbeans-2.3.0.jar"/>
   <classpathentry kind="lib" path="solr/contrib/langid/lib/langdetect-r111.jar"/>
   <classpathentry kind="lib" path="solr/contrib/langid/lib/jsonic-1.2.0.jar"/>
@@ -45,7 +45,7 @@
     <jetty.version>6.1.26</jetty.version>
     <patched.jetty.version>6.1.26-patched-JETTY-1340</patched.jetty.version>
     <slf4j.version>1.6.1</slf4j.version>
-    <tika.version>0.10</tika.version>
+    <tika.version>1.0</tika.version>
   </properties>
   <issueManagement>
     <system>JIRA</system>
@@ -283,7 +283,7 @@
       <dependency>
         <groupId>org.apache.zookeeper</groupId>
         <artifactId>zookeeper</artifactId>
-        <version>3.3.3</version>
+        <version>3.3.4</version>
       </dependency>
       <dependency>
         <groupId>org.carrot2</groupId>
@@ -362,6 +362,19 @@
       </dependency>
     </dependencies>
   </dependencyManagement>
+  <dependencies>
+    <dependency>
+      <!-- Maven 2.2.X has a bug that omits as duplicate all JUnit -->
+      <!-- dependencies from the classpath when compiling solr-clustering, -->
+      <!-- causing test compilation to fail. Maven 3.0.4 test compilation -->
+      <!-- succeeds with the exact same dependencies, so apparently the -->
+      <!-- bug has been fixed. This dependency can be removed when the -->
+      <!-- minimum Maven version is upgraded to 3.0.4+. -->
+      <groupId>junit</groupId>
+      <artifactId>junit</artifactId>
+      <scope>test</scope>
+    </dependency>
+  </dependencies>
   <build>
     <directory>lucene/build/lucene-parent</directory>
     <pluginManagement>
@@ -385,6 +398,11 @@
             <target>${java.compat.version}</target>
           </configuration>
         </plugin>
+        <plugin>
+          <groupId>org.apache.maven.plugins</groupId>
+          <artifactId>maven-dependency-plugin</artifactId>
+          <version>2.4</version>
+        </plugin>
         <plugin>
           <groupId>org.apache.maven.plugins</groupId>
           <artifactId>maven-deploy-plugin</artifactId>
@@ -652,7 +670,7 @@
                 <artifactId>solr-noggit</artifactId>
                 <version>${project.version}</version>
                 <packaging>jar</packaging>
-                <file>solr/lib/apache-solr-noggit-r1209632.jar</file>
+                <file>solr/lib/apache-solr-noggit-r1211150.jar</file>
               </configuration>
             </execution>
             <execution>
@@ -202,6 +202,12 @@
       <testResource>
         <directory>src/test-files</directory>
       </testResource>
+      <testResource>
+        <directory>${project.build.testSourceDirectory}</directory>
+        <excludes>
+          <exclude>**/*.java</exclude>
+        </excludes>
+      </testResource>
       <testResource>
         <directory>../solrj/src/test-files</directory>
       </testResource>
@@ -113,6 +113,13 @@
           <skip>true</skip> <!-- Tests are run from solr-core module -->
         </configuration>
       </plugin>
+      <plugin>
+        <groupId>org.apache.maven.plugins</groupId>
+        <artifactId>maven-compiler-plugin</artifactId>
+        <configuration>
+          <skip>true</skip> <!-- This skips test compilation - tests are run from solr-core module -->
+        </configuration>
+      </plugin>
     </plugins>
   </build>
 </project>
@@ -742,6 +742,9 @@ Changes in backwards compatibility policy
   behavior.  Added seekExact() to FSTEnum, and added FST.save/read
   from a File. (Mike McCandless, Dawid Weiss, Robert Muir)
 
+* LUCENE-3712: Removed unused and untested ReaderUtil#subReader methods.
+  (Uwe Schindler)
+
 Security fixes
 
 * LUCENE-3588: Try harder to prevent SIGSEGV on cloned MMapIndexInputs:
@@ -790,6 +793,12 @@ New Features
   input mapping to it) for FSTs that have strictly monotonic long
   outputs (such as an ord). (Mike McCandless)
 
+* LUCENE-3671: Add TypeTokenFilter that filters tokens based on
+  their TypeAttribute. (Tommaso Teofili via Uwe Schindler)
+
+* LUCENE-3690: Added HTMLStripCharFilter, a CharFilter that strips HTML
+  markup. (Steve Rowe)
+
 Bug fixes
 
 * LUCENE-3595: Fixed FieldCacheRangeFilter and FieldCacheTermsFilter
@@ -808,9 +817,11 @@ Bug fixes
 * LUCENE-3641: Fixed MultiReader to correctly propagate readerFinishedListeners
   to clones/reopened readers. (Uwe Schindler)
 
-* LUCENE-3642: Fixed bugs in CharTokenizer, n-gram filters, and smart chinese
-  where they would create invalid offsets in some situations, leading to problems
-  in highlighting. (Max Beutel via Robert Muir)
+* LUCENE-3642, SOLR-2891, LUCENE-3717: Fixed bugs in CharTokenizer, n-gram tokenizers/filters,
+  compound token filters, thai word filter, icutokenizer, pattern analyzer,
+  wikipediatokenizer, and smart chinese where they would create invalid offsets in
+  some situations, leading to problems in highlighting.
+  (Max Beutel, Edwin Steiner via Robert Muir)
 
 * LUCENE-3639: TopDocs.merge was incorrectly setting TopDocs.maxScore to
   Float.MIN_VALUE when it should be Float.NaN, when there were 0
@@ -825,6 +836,12 @@ Bug fixes
 * LUCENE-3605: don't sleep in a retry loop when trying to locate the
   segments_N file (Robert Muir, Mike McCandless)
 
+* LUCENE-3711: SentinelIntSet with a small initial size can go into
+  an infinite loop when expanded. This can affect grouping using
+  TermAllGroupsCollector or TermAllGroupHeadsCollector if instantiated with a
+  non default small size. (Martijn van Groningen, yonik)
+
+
 Optimizations
 
 * LUCENE-3653: Improve concurrency in VirtualMethod and AttributeSource by
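To make the LUCENE-3671 entry above concrete, here is a minimal sketch of a filter that drops tokens by their TypeAttribute. It is written only against the stable TokenFilter/TypeAttribute API; the class name and the hard-coded stop-type set are illustrative, not the actual TypeTokenFilter added by that issue.

import java.io.IOException;
import java.util.Set;

import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.TypeAttribute;

// Simplified sketch in the spirit of LUCENE-3671, not the class added by the issue:
// skips every token whose TypeAttribute value is in the given set.
final class SimpleTypeFilter extends TokenFilter {
  private final TypeAttribute typeAtt = addAttribute(TypeAttribute.class);
  private final Set<String> stopTypes;

  SimpleTypeFilter(TokenStream input, Set<String> stopTypes) {
    super(input);
    this.stopTypes = stopTypes;
  }

  @Override
  public boolean incrementToken() throws IOException {
    while (input.incrementToken()) {
      if (!stopTypes.contains(typeAtt.type())) {
        return true; // keep this token
      }
      // otherwise drop it and pull the next one
    }
    return false;
  }
}

The filter added by the issue also takes care of position increments across removed tokens; this sketch simply drops them.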
@@ -52,12 +52,12 @@ public abstract class Analyzer {
    * @param fieldName
    *          the name of the fields content passed to the
    *          {@link TokenStreamComponents} sink as a reader
-   * @param aReader
+   * @param reader
    *          the reader passed to the {@link Tokenizer} constructor
    * @return the {@link TokenStreamComponents} for this analyzer.
    */
   protected abstract TokenStreamComponents createComponents(String fieldName,
-      Reader aReader);
+      Reader reader);
 
   /**
    * Creates a TokenStream that is allowed to be re-use from the previous time
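As a concrete illustration of the renamed parameter, here is a minimal Analyzer subclass written against the createComponents(String, Reader) signature shown above. It reuses the MockTokenizer/TokenStreamComponents pattern that appears in this commit's TestMockCharFilter; only the class name is invented.

import java.io.Reader;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.MockTokenizer;
import org.apache.lucene.analysis.Tokenizer;

// Illustrative subclass, not part of the commit: the renamed "reader" argument is
// handed straight to the Tokenizer that backs the reusable TokenStreamComponents.
public final class ExampleWhitespaceAnalyzer extends Analyzer {
  @Override
  protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
    Tokenizer tokenizer = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false);
    return new TokenStreamComponents(tokenizer, tokenizer);
  }
}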
@@ -206,7 +206,7 @@ public class SegmentTermDocs {
         skipListReader = new Lucene40SkipListReader((IndexInput) freqStream.clone(), maxSkipLevels, skipInterval); // lazily clone
 
       if (!haveSkipped) {                          // lazily initialize skip stream
-        skipListReader.init(skipPointer, freqBasePointer, proxBasePointer, df, currentFieldStoresPayloads);
+        skipListReader.init(skipPointer, freqBasePointer, proxBasePointer, df, currentFieldStoresPayloads, false);
         haveSkipped = true;
       }
 
@@ -85,11 +85,11 @@ public class Lucene40FieldInfosReader extends FieldInfosReader {
       // LUCENE-3027: past indices were able to write
       // storePayloads=true when omitTFAP is also true,
       // which is invalid. We correct that, here:
-      if (indexOptions != IndexOptions.DOCS_AND_FREQS_AND_POSITIONS) {
+      if (indexOptions.compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS) < 0) {
         storePayloads = false;
       }
       hasVectors |= storeTermVector;
-      hasProx |= isIndexed && indexOptions == IndexOptions.DOCS_AND_FREQS_AND_POSITIONS;
+      hasProx |= isIndexed && indexOptions.compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS) >= 0;
       hasFreq |= isIndexed && indexOptions != IndexOptions.DOCS_ONLY;
       // DV Types are packed in one byte
       byte val = input.readByte();
@@ -58,7 +58,7 @@ public class Lucene40FieldInfosWriter extends FieldInfosWriter {
     output.writeVInt(FORMAT_CURRENT);
     output.writeVInt(infos.size());
     for (FieldInfo fi : infos) {
-      assert fi.indexOptions == IndexOptions.DOCS_AND_FREQS_AND_POSITIONS || !fi.storePayloads;
+      assert fi.indexOptions.compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS) >= 0 || !fi.storePayloads;
       byte bits = 0x0;
       if (fi.isIndexed) bits |= IS_INDEXED;
       if (fi.storeTermVector) bits |= STORE_TERMVECTOR;
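The pattern repeated throughout these hunks is that equality checks against DOCS_AND_FREQS_AND_POSITIONS become compareTo checks, so that the new "positions plus offsets" level also qualifies. The stand-alone sketch below (names are illustrative, it is not the real IndexOptions enum) shows why that works: Java enum compareTo compares declaration order, and the constants are declared in increasing order of indexed detail.

// Illustrative only: why "compareTo(DOCS_AND_FREQS_AND_POSITIONS) >= 0" replaces "==".
enum DemoIndexOptions {
  DOCS_ONLY,
  DOCS_AND_FREQS,
  DOCS_AND_FREQS_AND_POSITIONS,
  DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS;

  boolean atLeastPositions() {
    // compareTo compares ordinals, so this reads "at least positions are indexed"
    return compareTo(DOCS_AND_FREQS_AND_POSITIONS) >= 0;
  }
}

class DemoIndexOptionsCheck {
  public static void main(String[] args) {
    System.out.println(DemoIndexOptions.DOCS_AND_FREQS_AND_POSITIONS.atLeastPositions());             // true
    System.out.println(DemoIndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS.atLeastPositions()); // true: offsets included automatically
    System.out.println(DemoIndexOptions.DOCS_AND_FREQS.atLeastPositions());                           // false
  }
}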
@@ -197,7 +197,7 @@ public class Lucene40PostingsReader extends PostingsReaderBase {
       // undefined
     }
 
-    if (fieldInfo.indexOptions == IndexOptions.DOCS_AND_FREQS_AND_POSITIONS) {
+    if (fieldInfo.indexOptions.compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS) >= 0) {
       if (isFirstTerm) {
         termState.proxOffset = termState.bytesReader.readVLong();
       } else {
@@ -245,23 +245,23 @@ public class Lucene40PostingsReader extends PostingsReaderBase {
                                                DocsAndPositionsEnum reuse, boolean needsOffsets)
     throws IOException {
 
-    if (needsOffsets) {
-      // TODO: once we index offsets into postings fix this!
-      return null;
+    boolean hasOffsets = fieldInfo.indexOptions.compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS) >= 0;
+    if (needsOffsets && !hasOffsets) {
+      return null; // not available
     }
 
     // TODO: refactor
-    if (fieldInfo.storePayloads) {
-      SegmentDocsAndPositionsAndPayloadsEnum docsEnum;
-      if (reuse == null || !(reuse instanceof SegmentDocsAndPositionsAndPayloadsEnum)) {
-        docsEnum = new SegmentDocsAndPositionsAndPayloadsEnum(freqIn, proxIn);
+    if (fieldInfo.storePayloads || hasOffsets) {
+      SegmentFullPositionsEnum docsEnum;
+      if (reuse == null || !(reuse instanceof SegmentFullPositionsEnum)) {
+        docsEnum = new SegmentFullPositionsEnum(freqIn, proxIn);
       } else {
-        docsEnum = (SegmentDocsAndPositionsAndPayloadsEnum) reuse;
+        docsEnum = (SegmentFullPositionsEnum) reuse;
         if (docsEnum.startFreqIn != freqIn) {
           // If you are using ParellelReader, and pass in a
           // reused DocsEnum, it could have come from another
           // reader also using standard codec
-          docsEnum = new SegmentDocsAndPositionsAndPayloadsEnum(freqIn, proxIn);
+          docsEnum = new SegmentFullPositionsEnum(freqIn, proxIn);
         }
       }
       return docsEnum.reset(fieldInfo, (StandardTermState) termState, liveDocs);
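The hunks that follow extend the Lucene40 prox stream so each position entry can carry a payload length and, when offsets are indexed, a delta-coded start offset plus an offset length. As a self-contained illustration of that VInt scheme (it is not the codec classes; the names and the ByteArrayOutputStream standing in for IndexOutput are assumptions), the sketch below encodes one position the same way the writer-side addPosition change later in this commit does: the low bit of a delta flags "a new length follows".

import java.io.ByteArrayOutputStream;

// Stand-alone sketch of the per-position encoding used after this change.
class ProxEntrySketch {
  private int lastPayloadLength = -1; // -1 forces the first payload length to be written
  private int lastOffsetLength = -1;  // same trick for the first offset length
  private int lastPosition = 0;
  private int lastOffset = 0;

  // Plain VInt, the same wire format IndexOutput.writeVInt produces.
  static void writeVInt(ByteArrayOutputStream out, int i) {
    while ((i & ~0x7F) != 0) {
      out.write((i & 0x7F) | 0x80);
      i >>>= 7;
    }
    out.write(i);
  }

  void addPosition(ByteArrayOutputStream out, boolean storePayloads, boolean storeOffsets,
                   int position, byte[] payload, int startOffset, int endOffset) {
    int delta = position - lastPosition;
    lastPosition = position;

    int payloadLength = 0;
    if (storePayloads) {
      payloadLength = payload == null ? 0 : payload.length;
      if (payloadLength != lastPayloadLength) {
        lastPayloadLength = payloadLength;
        writeVInt(out, delta << 1 | 1);   // low bit set: a new payload length follows
        writeVInt(out, payloadLength);
      } else {
        writeVInt(out, delta << 1);
      }
    } else {
      writeVInt(out, delta);              // no payloads: the position delta is written unshifted
    }

    if (storeOffsets) {
      // delta against the previous startOffset, not the previous endOffset, to avoid
      // negative deltas for stacked tokens such as synonyms (see the writer change below)
      int offsetDelta = startOffset - lastOffset;
      int offsetLength = endOffset - startOffset;
      if (offsetLength != lastOffsetLength) {
        writeVInt(out, offsetDelta << 1 | 1);
        writeVInt(out, offsetLength);
      } else {
        writeVInt(out, offsetDelta << 1);
      }
      lastOffset = startOffset;
      lastOffsetLength = offsetLength;
    }

    if (payloadLength > 0) {
      out.write(payload, 0, payloadLength);
    }
  }
}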
@ -295,6 +295,7 @@ public class Lucene40PostingsReader extends PostingsReaderBase {
|
|||
|
||||
protected boolean indexOmitsTF; // does current field omit term freq?
|
||||
protected boolean storePayloads; // does current field store payloads?
|
||||
protected boolean storeOffsets; // does current field store offsets?
|
||||
|
||||
protected int limit; // number of docs in this posting
|
||||
protected int ord; // how many docs we've read
|
||||
|
@ -324,6 +325,7 @@ public class Lucene40PostingsReader extends PostingsReaderBase {
|
|||
DocsEnum reset(FieldInfo fieldInfo, StandardTermState termState) throws IOException {
|
||||
indexOmitsTF = fieldInfo.indexOptions == IndexOptions.DOCS_ONLY;
|
||||
storePayloads = fieldInfo.storePayloads;
|
||||
storeOffsets = fieldInfo.indexOptions.compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS) >= 0;
|
||||
freqOffset = termState.freqOffset;
|
||||
skipOffset = termState.skipOffset;
|
||||
|
||||
|
@ -471,7 +473,7 @@ public class Lucene40PostingsReader extends PostingsReaderBase {
|
|||
|
||||
skipper.init(freqOffset + skipOffset,
|
||||
freqOffset, 0,
|
||||
limit, storePayloads);
|
||||
limit, storePayloads, storeOffsets);
|
||||
|
||||
skipped = true;
|
||||
}
|
||||
|
@ -519,7 +521,7 @@ public class Lucene40PostingsReader extends PostingsReaderBase {
|
|||
return doc = docs[i];
|
||||
}
|
||||
}
|
||||
return refill();
|
||||
return doc = refill();
|
||||
}
|
||||
|
||||
@Override
|
||||
|
@ -602,7 +604,7 @@ public class Lucene40PostingsReader extends PostingsReaderBase {
|
|||
return doc = docs[i];
|
||||
}
|
||||
}
|
||||
return refill();
|
||||
return doc = refill();
|
||||
}
|
||||
|
||||
@Override
|
||||
|
@ -665,7 +667,7 @@ public class Lucene40PostingsReader extends PostingsReaderBase {
|
|||
|
||||
// TODO specialize DocsAndPosEnum too
|
||||
|
||||
// Decodes docs & positions. payloads are not present.
|
||||
// Decodes docs & positions. payloads nor offsets are present.
|
||||
private final class SegmentDocsAndPositionsEnum extends DocsAndPositionsEnum {
|
||||
final IndexInput startFreqIn;
|
||||
private final IndexInput freqIn;
|
||||
|
@ -792,7 +794,7 @@ public class Lucene40PostingsReader extends PostingsReaderBase {
|
|||
|
||||
skipper.init(freqOffset+skipOffset,
|
||||
freqOffset, proxOffset,
|
||||
limit, false);
|
||||
limit, false, false);
|
||||
|
||||
skipped = true;
|
||||
}
|
||||
|
@ -868,8 +870,8 @@ public class Lucene40PostingsReader extends PostingsReaderBase {
|
|||
}
|
||||
}
|
||||
|
||||
// Decodes docs & positions & payloads
|
||||
private class SegmentDocsAndPositionsAndPayloadsEnum extends DocsAndPositionsEnum {
|
||||
// Decodes docs & positions & (payloads and/or offsets)
|
||||
private class SegmentFullPositionsEnum extends DocsAndPositionsEnum {
|
||||
final IndexInput startFreqIn;
|
||||
private final IndexInput freqIn;
|
||||
private final IndexInput proxIn;
|
||||
|
@ -895,16 +897,24 @@ public class Lucene40PostingsReader extends PostingsReaderBase {
|
|||
Lucene40SkipListReader skipper;
|
||||
private BytesRef payload;
|
||||
private long lazyProxPointer;
|
||||
|
||||
boolean storePayloads;
|
||||
boolean storeOffsets;
|
||||
|
||||
int offsetLength;
|
||||
int startOffset;
|
||||
|
||||
public SegmentDocsAndPositionsAndPayloadsEnum(IndexInput freqIn, IndexInput proxIn) throws IOException {
|
||||
public SegmentFullPositionsEnum(IndexInput freqIn, IndexInput proxIn) throws IOException {
|
||||
startFreqIn = freqIn;
|
||||
this.freqIn = (IndexInput) freqIn.clone();
|
||||
this.proxIn = (IndexInput) proxIn.clone();
|
||||
}
|
||||
|
||||
public SegmentDocsAndPositionsAndPayloadsEnum reset(FieldInfo fieldInfo, StandardTermState termState, Bits liveDocs) throws IOException {
|
||||
assert fieldInfo.indexOptions == IndexOptions.DOCS_AND_FREQS_AND_POSITIONS;
|
||||
assert fieldInfo.storePayloads;
|
||||
public SegmentFullPositionsEnum reset(FieldInfo fieldInfo, StandardTermState termState, Bits liveDocs) throws IOException {
|
||||
storeOffsets = fieldInfo.indexOptions.compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS) >= 0;
|
||||
storePayloads = fieldInfo.storePayloads;
|
||||
assert fieldInfo.indexOptions.compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS) >= 0;
|
||||
assert storePayloads || storeOffsets;
|
||||
if (payload == null) {
|
||||
payload = new BytesRef();
|
||||
payload.bytes = new byte[1];
|
||||
|
@ -923,6 +933,7 @@ public class Lucene40PostingsReader extends PostingsReaderBase {
|
|||
doc = -1;
|
||||
accum = 0;
|
||||
position = 0;
|
||||
startOffset = 0;
|
||||
|
||||
skipped = false;
|
||||
posPendingCount = 0;
|
||||
|
@ -963,6 +974,7 @@ public class Lucene40PostingsReader extends PostingsReaderBase {
|
|||
}
|
||||
|
||||
position = 0;
|
||||
startOffset = 0;
|
||||
|
||||
//System.out.println("StandardR.D&PE nextDoc seg=" + segment + " return doc=" + doc);
|
||||
return (doc = accum);
|
||||
|
@ -1001,7 +1013,7 @@ public class Lucene40PostingsReader extends PostingsReaderBase {
|
|||
//System.out.println(" init skipper freqOffset=" + freqOffset + " skipOffset=" + skipOffset + " vs len=" + freqIn.length());
|
||||
skipper.init(freqOffset+skipOffset,
|
||||
freqOffset, proxOffset,
|
||||
limit, true);
|
||||
limit, storePayloads, storeOffsets);
|
||||
|
||||
skipped = true;
|
||||
}
|
||||
|
@ -1016,8 +1028,10 @@ public class Lucene40PostingsReader extends PostingsReaderBase {
|
|||
lazyProxPointer = skipper.getProxPointer();
|
||||
posPendingCount = 0;
|
||||
position = 0;
|
||||
startOffset = 0;
|
||||
payloadPending = false;
|
||||
payloadLength = skipper.getPayloadLength();
|
||||
offsetLength = skipper.getOffsetLength();
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -1038,27 +1052,38 @@ public class Lucene40PostingsReader extends PostingsReaderBase {
|
|||
}
|
||||
|
||||
if (payloadPending && payloadLength > 0) {
|
||||
// payload of last position as never retrieved -- skip it
|
||||
// payload of last position was never retrieved -- skip it
|
||||
proxIn.seek(proxIn.getFilePointer() + payloadLength);
|
||||
payloadPending = false;
|
||||
}
|
||||
|
||||
// scan over any docs that were iterated without their positions
|
||||
while(posPendingCount > freq) {
|
||||
|
||||
final int code = proxIn.readVInt();
|
||||
|
||||
if ((code & 1) != 0) {
|
||||
// new payload length
|
||||
payloadLength = proxIn.readVInt();
|
||||
assert payloadLength >= 0;
|
||||
if (storePayloads) {
|
||||
if ((code & 1) != 0) {
|
||||
// new payload length
|
||||
payloadLength = proxIn.readVInt();
|
||||
assert payloadLength >= 0;
|
||||
}
|
||||
assert payloadLength != -1;
|
||||
}
|
||||
|
||||
assert payloadLength != -1;
|
||||
proxIn.seek(proxIn.getFilePointer() + payloadLength);
|
||||
if (storeOffsets) {
|
||||
if ((proxIn.readVInt() & 1) != 0) {
|
||||
// new offset length
|
||||
offsetLength = proxIn.readVInt();
|
||||
}
|
||||
}
|
||||
|
||||
if (storePayloads) {
|
||||
proxIn.seek(proxIn.getFilePointer() + payloadLength);
|
||||
}
|
||||
|
||||
posPendingCount--;
|
||||
position = 0;
|
||||
startOffset = 0;
|
||||
payloadPending = false;
|
||||
//System.out.println("StandardR.D&PE skipPos");
|
||||
}
|
||||
|
@ -1069,16 +1094,28 @@ public class Lucene40PostingsReader extends PostingsReaderBase {
|
|||
proxIn.seek(proxIn.getFilePointer()+payloadLength);
|
||||
}
|
||||
|
||||
final int code = proxIn.readVInt();
|
||||
if ((code & 1) != 0) {
|
||||
// new payload length
|
||||
payloadLength = proxIn.readVInt();
|
||||
assert payloadLength >= 0;
|
||||
}
|
||||
assert payloadLength != -1;
|
||||
int code = proxIn.readVInt();
|
||||
if (storePayloads) {
|
||||
if ((code & 1) != 0) {
|
||||
// new payload length
|
||||
payloadLength = proxIn.readVInt();
|
||||
assert payloadLength >= 0;
|
||||
}
|
||||
assert payloadLength != -1;
|
||||
|
||||
payloadPending = true;
|
||||
position += code >>> 1;
|
||||
payloadPending = true;
|
||||
code >>>= 1;
|
||||
}
|
||||
position += code;
|
||||
|
||||
if (storeOffsets) {
|
||||
int offsetCode = proxIn.readVInt();
|
||||
if ((offsetCode & 1) != 0) {
|
||||
// new offset length
|
||||
offsetLength = proxIn.readVInt();
|
||||
}
|
||||
startOffset += offsetCode >>> 1;
|
||||
}
|
||||
|
||||
posPendingCount--;
|
||||
|
||||
|
@ -1090,32 +1127,36 @@ public class Lucene40PostingsReader extends PostingsReaderBase {
|
|||
|
||||
@Override
|
||||
public int startOffset() throws IOException {
|
||||
return -1;
|
||||
return storeOffsets ? startOffset : -1;
|
||||
}
|
||||
|
||||
@Override
|
||||
public int endOffset() throws IOException {
|
||||
return -1;
|
||||
return storeOffsets ? startOffset + offsetLength : -1;
|
||||
}
|
||||
|
||||
/** Returns the payload at this position, or null if no
|
||||
* payload was indexed. */
|
||||
@Override
|
||||
public BytesRef getPayload() throws IOException {
|
||||
assert lazyProxPointer == -1;
|
||||
assert posPendingCount < freq;
|
||||
if (!payloadPending) {
|
||||
throw new IOException("Either no payload exists at this term position or an attempt was made to load it more than once.");
|
||||
}
|
||||
if (payloadLength > payload.bytes.length) {
|
||||
payload.grow(payloadLength);
|
||||
}
|
||||
if (storePayloads) {
|
||||
assert lazyProxPointer == -1;
|
||||
assert posPendingCount < freq;
|
||||
if (!payloadPending) {
|
||||
throw new IOException("Either no payload exists at this term position or an attempt was made to load it more than once.");
|
||||
}
|
||||
if (payloadLength > payload.bytes.length) {
|
||||
payload.grow(payloadLength);
|
||||
}
|
||||
|
||||
proxIn.readBytes(payload.bytes, 0, payloadLength);
|
||||
payload.length = payloadLength;
|
||||
payloadPending = false;
|
||||
proxIn.readBytes(payload.bytes, 0, payloadLength);
|
||||
payload.length = payloadLength;
|
||||
payloadPending = false;
|
||||
|
||||
return payload;
|
||||
return payload;
|
||||
} else {
|
||||
throw new IOException("No payloads exist for this field!");
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
|
|
|
@ -73,12 +73,15 @@ public final class Lucene40PostingsWriter extends PostingsWriterBase {
|
|||
|
||||
IndexOptions indexOptions;
|
||||
boolean storePayloads;
|
||||
boolean storeOffsets;
|
||||
// Starts a new term
|
||||
long freqStart;
|
||||
long proxStart;
|
||||
FieldInfo fieldInfo;
|
||||
int lastPayloadLength;
|
||||
int lastOffsetLength;
|
||||
int lastPosition;
|
||||
int lastOffset;
|
||||
|
||||
// private String segment;
|
||||
|
||||
|
@ -137,6 +140,8 @@ public final class Lucene40PostingsWriter extends PostingsWriterBase {
|
|||
proxStart = proxOut.getFilePointer();
|
||||
// force first payload to write its length
|
||||
lastPayloadLength = -1;
|
||||
// force first offset to write its length
|
||||
lastOffsetLength = -1;
|
||||
}
|
||||
skipListWriter.resetSkip();
|
||||
}
|
||||
|
@ -155,10 +160,8 @@ public final class Lucene40PostingsWriter extends PostingsWriterBase {
|
|||
*/
|
||||
this.fieldInfo = fieldInfo;
|
||||
indexOptions = fieldInfo.indexOptions;
|
||||
if (indexOptions.compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS) >= 0) {
|
||||
throw new UnsupportedOperationException("this codec cannot index offsets");
|
||||
}
|
||||
|
||||
|
||||
storeOffsets = indexOptions.compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS) >= 0;
|
||||
storePayloads = fieldInfo.storePayloads;
|
||||
//System.out.println(" set init blockFreqStart=" + freqStart);
|
||||
//System.out.println(" set init blockProxStart=" + proxStart);
|
||||
|
@ -180,7 +183,7 @@ public final class Lucene40PostingsWriter extends PostingsWriterBase {
|
|||
}
|
||||
|
||||
if ((++df % skipInterval) == 0) {
|
||||
skipListWriter.setSkipData(lastDocID, storePayloads, lastPayloadLength);
|
||||
skipListWriter.setSkipData(lastDocID, storePayloads, lastPayloadLength, storeOffsets, lastOffsetLength);
|
||||
skipListWriter.bufferSkip(df);
|
||||
}
|
||||
|
||||
|
@ -197,31 +200,26 @@ public final class Lucene40PostingsWriter extends PostingsWriterBase {
|
|||
}
|
||||
|
||||
lastPosition = 0;
|
||||
lastOffset = 0;
|
||||
}
|
||||
|
||||
/** Add a new position & payload */
|
||||
@Override
|
||||
public void addPosition(int position, BytesRef payload, int startOffset, int endOffset) throws IOException {
|
||||
//if (DEBUG) System.out.println("SPW: addPos pos=" + position + " payload=" + (payload == null ? "null" : (payload.length + " bytes")) + " proxFP=" + proxOut.getFilePointer());
|
||||
assert indexOptions == IndexOptions.DOCS_AND_FREQS_AND_POSITIONS: "invalid indexOptions: " + indexOptions;
|
||||
assert indexOptions.compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS) >= 0 : "invalid indexOptions: " + indexOptions;
|
||||
assert proxOut != null;
|
||||
|
||||
// TODO: when we add offsets... often
|
||||
// endOffset-startOffset will be constant or near
|
||||
// constant for all docs (eg if the term wasn't stemmed
|
||||
// then this will usually be the utf16 length of the
|
||||
// term); would be nice to write that length once up
|
||||
// front and then not encode endOffset for each
|
||||
// position..
|
||||
|
||||
final int delta = position - lastPosition;
|
||||
|
||||
assert delta >= 0: "position=" + position + " lastPosition=" + lastPosition; // not quite right (if pos=0 is repeated twice we don't catch it)
|
||||
|
||||
lastPosition = position;
|
||||
|
||||
int payloadLength = 0;
|
||||
|
||||
if (storePayloads) {
|
||||
final int payloadLength = payload == null ? 0 : payload.length;
|
||||
payloadLength = payload == null ? 0 : payload.length;
|
||||
|
||||
if (payloadLength != lastPayloadLength) {
|
||||
lastPayloadLength = payloadLength;
|
||||
|
@ -230,13 +228,28 @@ public final class Lucene40PostingsWriter extends PostingsWriterBase {
|
|||
} else {
|
||||
proxOut.writeVInt(delta << 1);
|
||||
}
|
||||
|
||||
if (payloadLength > 0) {
|
||||
proxOut.writeBytes(payload.bytes, payload.offset, payloadLength);
|
||||
}
|
||||
} else {
|
||||
proxOut.writeVInt(delta);
|
||||
}
|
||||
|
||||
if (storeOffsets) {
|
||||
// don't use startOffset - lastEndOffset, because this creates lots of negative vints for synonyms,
|
||||
// and the numbers aren't that much smaller anyways.
|
||||
int offsetDelta = startOffset - lastOffset;
|
||||
int offsetLength = endOffset - startOffset;
|
||||
if (offsetLength != lastOffsetLength) {
|
||||
proxOut.writeVInt(offsetDelta << 1 | 1);
|
||||
proxOut.writeVInt(offsetLength);
|
||||
} else {
|
||||
proxOut.writeVInt(offsetDelta << 1);
|
||||
}
|
||||
lastOffset = startOffset;
|
||||
lastOffsetLength = offsetLength;
|
||||
}
|
||||
|
||||
if (payloadLength > 0) {
|
||||
proxOut.writeBytes(payload.bytes, payload.offset, payloadLength);
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
|
@ -304,7 +317,7 @@ public final class Lucene40PostingsWriter extends PostingsWriterBase {
|
|||
assert firstTerm.skipOffset > 0;
|
||||
bytesWriter.writeVInt(firstTerm.skipOffset);
|
||||
}
|
||||
if (indexOptions == IndexOptions.DOCS_AND_FREQS_AND_POSITIONS) {
|
||||
if (indexOptions.compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS) >= 0) {
|
||||
bytesWriter.writeVLong(firstTerm.proxStart);
|
||||
}
|
||||
long lastFreqStart = firstTerm.freqStart;
|
||||
|
@ -319,7 +332,7 @@ public final class Lucene40PostingsWriter extends PostingsWriterBase {
|
|||
assert term.skipOffset > 0;
|
||||
bytesWriter.writeVInt(term.skipOffset);
|
||||
}
|
||||
if (indexOptions == IndexOptions.DOCS_AND_FREQS_AND_POSITIONS) {
|
||||
if (indexOptions.compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS) >= 0) {
|
||||
bytesWriter.writeVLong(term.proxStart - lastProxStart);
|
||||
lastProxStart = term.proxStart;
|
||||
}
|
||||
|
|
|
@@ -30,13 +30,16 @@ import org.apache.lucene.store.IndexInput;
  */
 public class Lucene40SkipListReader extends MultiLevelSkipListReader {
   private boolean currentFieldStoresPayloads;
+  private boolean currentFieldStoresOffsets;
   private long freqPointer[];
   private long proxPointer[];
   private int payloadLength[];
+  private int offsetLength[];
 
   private long lastFreqPointer;
   private long lastProxPointer;
   private int lastPayloadLength;
+  private int lastOffsetLength;
 
 
   public Lucene40SkipListReader(IndexInput skipStream, int maxSkipLevels, int skipInterval) {
@@ -44,17 +47,20 @@ public class Lucene40SkipListReader extends MultiLevelSkipListReader {
     freqPointer = new long[maxSkipLevels];
     proxPointer = new long[maxSkipLevels];
     payloadLength = new int[maxSkipLevels];
+    offsetLength = new int[maxSkipLevels];
   }
 
-  public void init(long skipPointer, long freqBasePointer, long proxBasePointer, int df, boolean storesPayloads) {
+  public void init(long skipPointer, long freqBasePointer, long proxBasePointer, int df, boolean storesPayloads, boolean storesOffsets) {
     super.init(skipPointer, df);
     this.currentFieldStoresPayloads = storesPayloads;
+    this.currentFieldStoresOffsets = storesOffsets;
     lastFreqPointer = freqBasePointer;
     lastProxPointer = proxBasePointer;
 
     Arrays.fill(freqPointer, freqBasePointer);
     Arrays.fill(proxPointer, proxBasePointer);
     Arrays.fill(payloadLength, 0);
+    Arrays.fill(offsetLength, 0);
   }
 
   /** Returns the freq pointer of the doc to which the last call of
@@ -76,12 +82,20 @@ public class Lucene40SkipListReader extends MultiLevelSkipListReader {
     return lastPayloadLength;
   }
 
+  /** Returns the offset length (endOffset-startOffset) of the position stored just before
+   *  the doc to which the last call of {@link MultiLevelSkipListReader#skipTo(int)}
+   *  has skipped. */
+  public int getOffsetLength() {
+    return lastOffsetLength;
+  }
+
   @Override
   protected void seekChild(int level) throws IOException {
     super.seekChild(level);
     freqPointer[level] = lastFreqPointer;
     proxPointer[level] = lastProxPointer;
     payloadLength[level] = lastPayloadLength;
+    offsetLength[level] = lastOffsetLength;
   }
 
   @Override
@@ -90,6 +104,7 @@ public class Lucene40SkipListReader extends MultiLevelSkipListReader {
     lastFreqPointer = freqPointer[level];
     lastProxPointer = proxPointer[level];
     lastPayloadLength = payloadLength[level];
+    lastOffsetLength = offsetLength[level];
   }
 
 
@@ -110,6 +125,11 @@ public class Lucene40SkipListReader extends MultiLevelSkipListReader {
     } else {
       delta = skipStream.readVInt();
     }
+
+    if (currentFieldStoresOffsets) {
+      offsetLength[level] = skipStream.readVInt();
+    }
+
     freqPointer[level] += skipStream.readVInt();
     proxPointer[level] += skipStream.readVInt();
 
@@ -40,7 +40,9 @@ public class Lucene40SkipListWriter extends MultiLevelSkipListWriter {
 
   private int curDoc;
   private boolean curStorePayloads;
+  private boolean curStoreOffsets;
   private int curPayloadLength;
+  private int curOffsetLength;
   private long curFreqPointer;
   private long curProxPointer;
 
@@ -58,10 +60,12 @@ public class Lucene40SkipListWriter extends MultiLevelSkipListWriter {
   /**
    * Sets the values for the current skip data.
    */
-  public void setSkipData(int doc, boolean storePayloads, int payloadLength) {
+  public void setSkipData(int doc, boolean storePayloads, int payloadLength, boolean storeOffsets, int offsetLength) {
     this.curDoc = doc;
     this.curStorePayloads = storePayloads;
     this.curPayloadLength = payloadLength;
+    this.curStoreOffsets = storeOffsets;
+    this.curOffsetLength = offsetLength;
     this.curFreqPointer = freqOutput.getFilePointer();
     if (proxOutput != null)
       this.curProxPointer = proxOutput.getFilePointer();
@@ -116,6 +120,12 @@ public class Lucene40SkipListWriter extends MultiLevelSkipListWriter {
       // current field does not store payloads
       skipBuffer.writeVInt(curDoc - lastSkipDoc[level]);
     }
+
+    // TODO: not sure it really helps to shove this somewhere else if its the same as the last skip
+    if (curStoreOffsets) {
+      skipBuffer.writeVInt(curOffsetLength);
+    }
+
     skipBuffer.writeVInt((int) (curFreqPointer - lastSkipFreqPointer[level]));
     skipBuffer.writeVInt((int) (curProxPointer - lastSkipProxPointer[level]));
 
@ -548,8 +548,9 @@ class SimpleTextFieldsReader extends FieldsProducer {
|
|||
UnicodeUtil.UTF8toUTF16(scratch.bytes, scratch.offset+DOC.length, scratch.length-DOC.length, scratchUTF16);
|
||||
int docID = ArrayUtil.parseInt(scratchUTF16.chars, 0, scratchUTF16.length);
|
||||
visitedDocs.set(docID);
|
||||
} else if (StringHelper.startsWith(scratch, POS)) {
|
||||
totalTermFreq++;
|
||||
} else if (StringHelper.startsWith(scratch, FREQ)) {
|
||||
UnicodeUtil.UTF8toUTF16(scratch.bytes, scratch.offset+FREQ.length, scratch.length-FREQ.length, scratchUTF16);
|
||||
totalTermFreq += ArrayUtil.parseInt(scratchUTF16.chars, 0, scratchUTF16.length);
|
||||
} else if (StringHelper.startsWith(scratch, TERM)) {
|
||||
if (lastDocsStart != -1) {
|
||||
b.add(Util.toIntsRef(lastTerm, scratchIntsRef), new PairOutputs.Pair<Long,PairOutputs.Pair<Long,Long>>(lastDocsStart,
|
||||
|
|
|
@@ -404,7 +404,7 @@ public final class FieldInfos implements Iterable<FieldInfo> {
 
   public boolean hasNorms() {
     for (FieldInfo fi : this) {
-      if (fi.isIndexed && !fi.omitNorms) {
+      if (fi.normsPresent()) {
         return true;
       }
     }
@ -921,13 +921,7 @@ public abstract class IndexReader implements Closeable {
|
|||
* If this method returns an empty array, that means this
|
||||
* reader is a null reader (for example a MultiReader
|
||||
* that has no sub readers).
|
||||
* <p>
|
||||
* NOTE: You should not try using sub-readers returned by
|
||||
* this method to make any changes (deleteDocument,
|
||||
* etc.). While this might succeed for one composite reader
|
||||
* (like MultiReader), it will most likely lead to index
|
||||
* corruption for other readers (like DirectoryReader obtained
|
||||
* through {@link #open}. Use the parent reader directly. */
|
||||
*/
|
||||
public IndexReader[] getSequentialSubReaders() {
|
||||
ensureOpen();
|
||||
return null;
|
||||
|
|
|
@@ -32,6 +32,10 @@ public class CollectionStatistics {
   private final long sumDocFreq;
 
   public CollectionStatistics(String field, int maxDoc, int docCount, long sumTotalTermFreq, long sumDocFreq) {
+    assert maxDoc >= 0;
+    assert docCount >= -1 && docCount <= maxDoc; // #docs with field must be <= #docs
+    assert sumDocFreq >= -1;
+    assert sumTotalTermFreq == -1 || sumTotalTermFreq >= sumDocFreq; // #positions must be >= #postings
     this.field = field;
     this.maxDoc = maxDoc;
     this.docCount = docCount;
@@ -29,6 +29,8 @@ public class TermStatistics {
   private final long totalTermFreq;
 
   public TermStatistics(BytesRef term, int docFreq, long totalTermFreq) {
+    assert docFreq >= 0;
+    assert totalTermFreq == -1 || totalTermFreq >= docFreq; // #positions must be >= #postings
     this.term = term;
     this.docFreq = docFreq;
     this.totalTermFreq = totalTermFreq;
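The asserts above establish -1 as the only legal "statistic not available" value, which is the convention the distributed-search test changes later in this commit (ShardSearchingTestBase) rely on when summing per-shard values: one unknown shard makes the aggregate unknown. A tiny sketch of that rule; the helper and class names here are invented for illustration.

final class StatSums {
  private StatSums() {}

  // Sum two shard statistics where -1 means "not available"; any unknown side
  // poisons the combined value, matching the aggregation in ShardSearchingTestBase.
  static long add(long a, long b) {
    return (a >= 0 && b >= 0) ? a + b : -1;
  }

  public static void main(String[] args) {
    System.out.println(add(10, 32));  // 42
    System.out.println(add(10, -1));  // -1: one shard could not report the statistic
  }
}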
@@ -87,6 +87,8 @@ public abstract class SimilarityBase extends Similarity {
   /** Fills all member fields defined in {@code BasicStats} in {@code stats}.
    *  Subclasses can override this method to fill additional stats. */
   protected void fillBasicStats(BasicStats stats, CollectionStatistics collectionStats, TermStatistics termStats) {
+    // #positions(field) must be >= #positions(term)
+    assert collectionStats.sumTotalTermFreq() == -1 || collectionStats.sumTotalTermFreq() >= termStats.totalTermFreq();
     int numberOfDocuments = collectionStats.maxDoc();
 
     int docFreq = termStats.docFreq();
@@ -70,8 +70,9 @@ import org.apache.lucene.util.ToStringUtils;
  * and 'jones' in position 1). </p>
  *
  * <p>Note: as {@link #getField()} returns the masked field, scoring will be
- * done using the norms of the field name supplied. This may lead to unexpected
- * scoring behaviour.</p>
+ * done using the Similarity and collection statistics of the field name supplied,
+ * but with the term statistics of the real field. This may lead to exceptions,
+ * poor performance, and unexpected scoring behaviour.</p>
  */
 public class FieldMaskingSpanQuery extends SpanQuery {
   private SpanQuery maskedQuery;
@ -255,10 +255,8 @@ final class CompoundFileWriter implements Closeable{
|
|||
assert !seenIDs.contains(id): "file=\"" + name + "\" maps to id=\"" + id + "\", which was already written";
|
||||
seenIDs.add(id);
|
||||
final DirectCFSIndexOutput out;
|
||||
if (outputTaken.compareAndSet(false, true)) {
|
||||
if ((outputLocked = outputTaken.compareAndSet(false, true))) {
|
||||
out = new DirectCFSIndexOutput(getOutput(), entry, false);
|
||||
outputLocked = true;
|
||||
success = true;
|
||||
} else {
|
||||
entry.dir = this.directory;
|
||||
if (directory.fileExists(name)) {
|
||||
|
|
|
@ -120,42 +120,6 @@ public final class ReaderUtil {
|
|||
|
||||
protected abstract void add(int base, IndexReader r) throws IOException;
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns sub IndexReader that contains the given document id.
|
||||
*
|
||||
* @param doc id of document
|
||||
* @param reader parent reader
|
||||
* @return sub reader of parent which contains the specified doc id
|
||||
*/
|
||||
public static IndexReader subReader(int doc, IndexReader reader) {
|
||||
List<IndexReader> subReadersList = new ArrayList<IndexReader>();
|
||||
ReaderUtil.gatherSubReaders(subReadersList, reader);
|
||||
IndexReader[] subReaders = subReadersList
|
||||
.toArray(new IndexReader[subReadersList.size()]);
|
||||
int[] docStarts = new int[subReaders.length];
|
||||
int maxDoc = 0;
|
||||
for (int i = 0; i < subReaders.length; i++) {
|
||||
docStarts[i] = maxDoc;
|
||||
maxDoc += subReaders[i].maxDoc();
|
||||
}
|
||||
return subReaders[subIndex(doc, docStarts)];
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns sub-reader subIndex from reader.
|
||||
*
|
||||
* @param reader parent reader
|
||||
* @param subIndex index of desired sub reader
|
||||
* @return the subreader at subIndex
|
||||
*/
|
||||
public static IndexReader subReader(IndexReader reader, int subIndex) {
|
||||
List<IndexReader> subReadersList = new ArrayList<IndexReader>();
|
||||
ReaderUtil.gatherSubReaders(subReadersList, reader);
|
||||
IndexReader[] subReaders = subReadersList
|
||||
.toArray(new IndexReader[subReadersList.size()]);
|
||||
return subReaders[subIndex];
|
||||
}
|
||||
|
||||
public static ReaderContext buildReaderContext(IndexReader reader) {
|
||||
return new ReaderContextBuilder(reader).build();
|
||||
|
|
|
@@ -96,13 +96,13 @@ public class SentinelIntSet {
   public int put(int key) {
     int s = find(key);
     if (s < 0) {
+      count++;
       if (count >= rehashCount) {
         rehash();
         s = getSlot(key);
       } else {
        s = -s-1;
       }
-      count++;
       keys[s] = key;
     }
     return s;
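This one-line move is the LUCENE-3711 fix noted in CHANGES.txt: by bumping count before the rehash check, the table is guaranteed to grow before it can ever be completely full, so the probe loop in getSlot always finds an empty slot. The toy open-addressing set below (purely illustrative, not SentinelIntSet) shows the same principle in isolation.

// Minimal open-addressing set demonstrating why "count first, then check the
// rehash threshold" matters: the linear probe in insert() terminates only if at
// least one EMPTY slot remains, so the table must grow before it fills up.
class TinyIntSet {
  static final int EMPTY = Integer.MIN_VALUE;
  int[] slots = new int[4];
  int count = 0;
  int rehashThreshold = 3; // grow when count reaches this

  { java.util.Arrays.fill(slots, EMPTY); }

  void put(int key) {
    count++;                       // count first, as in the fix above ...
    if (count >= rehashThreshold) {
      grow();                      // ... so a full table can never be probed
    }
    insert(slots, key);
  }

  void grow() {
    int[] bigger = new int[slots.length * 2];
    java.util.Arrays.fill(bigger, EMPTY);
    for (int v : slots) {
      if (v != EMPTY) insert(bigger, v);
    }
    slots = bigger;
    rehashThreshold = bigger.length * 3 / 4;
  }

  static void insert(int[] table, int key) {
    int i = (key & 0x7fffffff) % table.length;
    while (table[i] != EMPTY && table[i] != key) {
      i = (i + 1) % table.length;  // linear probing: loops forever if no EMPTY slot exists
    }
    table[i] = key;
  }
}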
@@ -32,12 +32,13 @@ import org.apache.lucene.util.BytesRef;
 public final class ByteSequenceOutputs extends Outputs<BytesRef> {
 
   private final static BytesRef NO_OUTPUT = new BytesRef();
+  private final static ByteSequenceOutputs singleton = new ByteSequenceOutputs();
 
   private ByteSequenceOutputs() {
   }
 
   public static ByteSequenceOutputs getSingleton() {
-    return new ByteSequenceOutputs();
+    return singleton;
   }
 
   @Override
@@ -32,12 +32,13 @@ import org.apache.lucene.util.IntsRef;
 public final class IntSequenceOutputs extends Outputs<IntsRef> {
 
   private final static IntsRef NO_OUTPUT = new IntsRef();
+  private final static IntSequenceOutputs singleton = new IntSequenceOutputs();
 
   private IntSequenceOutputs() {
   }
 
   public static IntSequenceOutputs getSingleton() {
-    return new IntSequenceOutputs();
+    return singleton;
   }
 
   @Override
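Before these two fixes, getSingleton() on both FST Outputs classes allocated a fresh object on every call despite its name; now it returns the cached static instance. A trivial identity check of what callers can rely on after the change (only the wrapper class name is invented):

import org.apache.lucene.util.fst.ByteSequenceOutputs;
import org.apache.lucene.util.fst.IntSequenceOutputs;

class OutputsSingletonCheck {
  public static void main(String[] args) {
    // After this change both factory methods behave as real singleton accessors.
    System.out.println(ByteSequenceOutputs.getSingleton() == ByteSequenceOutputs.getSingleton()); // true
    System.out.println(IntSequenceOutputs.getSingleton() == IntSequenceOutputs.getSingleton());   // true
  }
}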
@ -17,6 +17,7 @@ package org.apache.lucene.analysis;
|
|||
* limitations under the License.
|
||||
*/
|
||||
|
||||
import java.io.Reader;
|
||||
import java.io.StringReader;
|
||||
import java.io.IOException;
|
||||
import java.util.ArrayList;
|
||||
|
@ -289,8 +290,12 @@ public abstract class BaseTokenStreamTestCase extends LuceneTestCase {
|
|||
}
|
||||
}
|
||||
};
|
||||
|
||||
|
||||
public static void checkRandomData(Random random, Analyzer a, int iterations, int maxWordLength) throws IOException {
|
||||
checkRandomData(random, a, iterations, maxWordLength, random.nextBoolean());
|
||||
}
|
||||
|
||||
public static void checkRandomData(Random random, Analyzer a, int iterations, int maxWordLength, boolean useCharFilter) throws IOException {
|
||||
for (int i = 0; i < iterations; i++) {
|
||||
String text;
|
||||
switch(_TestUtil.nextInt(random, 0, 4)) {
|
||||
|
@ -311,7 +316,9 @@ public abstract class BaseTokenStreamTestCase extends LuceneTestCase {
|
|||
System.out.println("NOTE: BaseTokenStreamTestCase: get first token stream now text=" + text);
|
||||
}
|
||||
|
||||
TokenStream ts = a.tokenStream("dummy", new StringReader(text));
|
||||
int remainder = random.nextInt(10);
|
||||
Reader reader = new StringReader(text);
|
||||
TokenStream ts = a.tokenStream("dummy", useCharFilter ? new MockCharFilter(reader, remainder) : reader);
|
||||
assertTrue("has no CharTermAttribute", ts.hasAttribute(CharTermAttribute.class));
|
||||
CharTermAttribute termAtt = ts.getAttribute(CharTermAttribute.class);
|
||||
OffsetAttribute offsetAtt = ts.hasAttribute(OffsetAttribute.class) ? ts.getAttribute(OffsetAttribute.class) : null;
|
||||
|
@ -339,30 +346,38 @@ public abstract class BaseTokenStreamTestCase extends LuceneTestCase {
|
|||
if (VERBOSE) {
|
||||
System.out.println("NOTE: BaseTokenStreamTestCase: re-run analysis");
|
||||
}
|
||||
reader = new StringReader(text);
|
||||
ts = a.tokenStream("dummy", useCharFilter ? new MockCharFilter(reader, remainder) : reader);
|
||||
if (typeAtt != null && posIncAtt != null && offsetAtt != null) {
|
||||
// offset + pos + type
|
||||
assertAnalyzesToReuse(a, text,
|
||||
assertTokenStreamContents(ts,
|
||||
tokens.toArray(new String[tokens.size()]),
|
||||
toIntArray(startOffsets),
|
||||
toIntArray(endOffsets),
|
||||
types.toArray(new String[types.size()]),
|
||||
toIntArray(positions));
|
||||
toIntArray(positions),
|
||||
text.length());
|
||||
} else if (posIncAtt != null && offsetAtt != null) {
|
||||
// offset + pos
|
||||
assertAnalyzesToReuse(a, text,
|
||||
assertTokenStreamContents(ts,
|
||||
tokens.toArray(new String[tokens.size()]),
|
||||
toIntArray(startOffsets),
|
||||
toIntArray(endOffsets),
|
||||
toIntArray(positions));
|
||||
null,
|
||||
toIntArray(positions),
|
||||
text.length());
|
||||
} else if (offsetAtt != null) {
|
||||
// offset
|
||||
assertAnalyzesToReuse(a, text,
|
||||
assertTokenStreamContents(ts,
|
||||
tokens.toArray(new String[tokens.size()]),
|
||||
toIntArray(startOffsets),
|
||||
toIntArray(endOffsets));
|
||||
toIntArray(endOffsets),
|
||||
null,
|
||||
null,
|
||||
text.length());
|
||||
} else {
|
||||
// terms only
|
||||
assertAnalyzesToReuse(a, text,
|
||||
assertTokenStreamContents(ts,
|
||||
tokens.toArray(new String[tokens.size()]));
|
||||
}
|
||||
}
|
||||
|
|
|
@ -0,0 +1,100 @@
|
|||
package org.apache.lucene.analysis;
|
||||
|
||||
/**
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
import java.io.IOException;
|
||||
import java.io.Reader;
|
||||
import java.util.SortedMap;
|
||||
import java.util.TreeMap;
|
||||
|
||||
// the purpose of this charfilter is to send offsets out of bounds
|
||||
// if the analyzer doesn't use correctOffset or does incorrect offset math.
|
||||
class MockCharFilter extends CharStream {
|
||||
final Reader in;
|
||||
final int remainder;
|
||||
|
||||
// for testing only
|
||||
public MockCharFilter(Reader in, int remainder) {
|
||||
this.in = in;
|
||||
this.remainder = remainder;
|
||||
assert remainder >= 0 && remainder < 10 : "invalid parameter";
|
||||
}
|
||||
|
||||
@Override
|
||||
public void close() throws IOException {
|
||||
in.close();
|
||||
}
|
||||
|
||||
int currentOffset = -1;
|
||||
int delta = 0;
|
||||
int bufferedCh = -1;
|
||||
|
||||
@Override
|
||||
public int read() throws IOException {
|
||||
// we have a buffered character, add an offset correction and return it
|
||||
if (bufferedCh >= 0) {
|
||||
int ch = bufferedCh;
|
||||
bufferedCh = -1;
|
||||
currentOffset++;
|
||||
|
||||
addOffCorrectMap(currentOffset+delta, delta-1);
|
||||
delta--;
|
||||
return ch;
|
||||
}
|
||||
|
||||
// otherwise actually read one
|
||||
int ch = in.read();
|
||||
if (ch < 0)
|
||||
return ch;
|
||||
|
||||
currentOffset++;
|
||||
if ((ch % 10) != remainder || Character.isHighSurrogate((char)ch) || Character.isLowSurrogate((char)ch)) {
|
||||
return ch;
|
||||
}
|
||||
|
||||
// we will double this character, so buffer it.
|
||||
bufferedCh = ch;
|
||||
return ch;
|
||||
}
|
||||
|
||||
@Override
|
||||
public int read(char[] cbuf, int off, int len) throws IOException {
|
||||
int numRead = 0;
|
||||
for (int i = off; i < off + len; i++) {
|
||||
int c = read();
|
||||
if (c == -1) break;
|
||||
cbuf[i] = (char) c;
|
||||
numRead++;
|
||||
}
|
||||
return numRead == 0 ? -1 : numRead;
|
||||
}
|
||||
|
||||
@Override
|
||||
public int correctOffset(int currentOff) {
|
||||
SortedMap<Integer,Integer> subMap = corrections.subMap(0, currentOff+1);
|
||||
int ret = subMap.isEmpty() ? currentOff : currentOff + subMap.get(subMap.lastKey());
|
||||
assert ret >= 0 : "currentOff=" + currentOff + ",diff=" + (ret-currentOff);
|
||||
return ret;
|
||||
}
|
||||
|
||||
protected void addOffCorrectMap(int off, int cumulativeDiff) {
|
||||
corrections.put(off, cumulativeDiff);
|
||||
}
|
||||
|
||||
TreeMap<Integer,Integer> corrections = new TreeMap<Integer,Integer>();
|
||||
}
|
|
@@ -137,7 +137,7 @@ class PreFlexRWFieldsWriter extends FieldsConsumer {
       }
 
       if ((++df % termsOut.skipInterval) == 0) {
-        skipListWriter.setSkipData(lastDocID, storePayloads, lastPayloadLength);
+        skipListWriter.setSkipData(lastDocID, storePayloads, lastPayloadLength, false, 0);
         skipListWriter.bufferSkip(df);
       }
 
@ -268,8 +268,19 @@ public abstract class ShardSearchingTestBase extends LuceneTestCase {
|
|||
assert subStats != null;
|
||||
}
|
||||
|
||||
docFreq += subStats.docFreq();
|
||||
totalTermFreq += subStats.totalTermFreq();
|
||||
int nodeDocFreq = subStats.docFreq();
|
||||
if (docFreq >= 0 && nodeDocFreq >= 0) {
|
||||
docFreq += nodeDocFreq;
|
||||
} else {
|
||||
docFreq = -1;
|
||||
}
|
||||
|
||||
long nodeTotalTermFreq = subStats.totalTermFreq();
|
||||
if (totalTermFreq >= 0 && nodeTotalTermFreq >= 0) {
|
||||
totalTermFreq += nodeTotalTermFreq;
|
||||
} else {
|
||||
totalTermFreq = -1;
|
||||
}
|
||||
}
|
||||
|
||||
return new TermStatistics(term.bytes(), docFreq, totalTermFreq);
|
||||
|
@ -299,9 +310,29 @@ public abstract class ShardSearchingTestBase extends LuceneTestCase {
|
|||
// Collection stats are pre-shared on reopen, so,
|
||||
// we better not have a cache miss:
|
||||
assert nodeStats != null: "myNodeID=" + myNodeID + " nodeID=" + nodeID + " version=" + nodeVersions[nodeID] + " field=" + field;
|
||||
docCount += nodeStats.docCount();
|
||||
sumTotalTermFreq += nodeStats.sumTotalTermFreq();
|
||||
sumDocFreq += nodeStats.sumDocFreq();
|
||||
|
||||
int nodeDocCount = nodeStats.docCount();
|
||||
if (docCount >= 0 && nodeDocCount >= 0) {
|
||||
docCount += nodeDocCount;
|
||||
} else {
|
||||
docCount = -1;
|
||||
}
|
||||
|
||||
long nodeSumTotalTermFreq = nodeStats.sumTotalTermFreq();
|
||||
if (sumTotalTermFreq >= 0 && nodeSumTotalTermFreq >= 0) {
|
||||
sumTotalTermFreq += nodeSumTotalTermFreq;
|
||||
} else {
|
||||
sumTotalTermFreq = -1;
|
||||
}
|
||||
|
||||
long nodeSumDocFreq = nodeStats.sumDocFreq();
|
||||
if (sumDocFreq >= 0 && nodeSumDocFreq >= 0) {
|
||||
sumDocFreq += nodeSumDocFreq;
|
||||
} else {
|
||||
sumDocFreq = -1;
|
||||
}
|
||||
|
||||
assert nodeStats.maxDoc() >= 0;
|
||||
maxDoc += nodeStats.maxDoc();
|
||||
}
|
||||
|
||||
|
|
|
@ -283,7 +283,8 @@ public abstract class LuceneTestCase extends Assert {
|
|||
int randomVal = random.nextInt(10);
|
||||
|
||||
if ("Lucene3x".equals(TEST_CODEC) || ("random".equals(TEST_CODEC) && randomVal < 2)) { // preflex-only setup
|
||||
codec = new PreFlexRWCodec();
|
||||
codec = Codec.forName("Lucene3x");
|
||||
assert (codec instanceof PreFlexRWCodec) : "fix your classpath to have tests-framework.jar before lucene-core.jar";
|
||||
PREFLEX_IMPERSONATION_IS_ACTIVE = true;
|
||||
} else if ("SimpleText".equals(TEST_CODEC) || ("random".equals(TEST_CODEC) && randomVal == 9)) {
|
||||
codec = new SimpleTextCodec();
|
||||
|
|
|
@ -249,7 +249,42 @@ public class _TestUtil {
|
|||
}
|
||||
}
|
||||
|
||||
// TODO: make this more evil
|
||||
private static final String[] HTML_CHAR_ENTITIES = {
|
||||
"AElig", "Aacute", "Acirc", "Agrave", "Alpha", "AMP", "Aring", "Atilde",
|
||||
"Auml", "Beta", "COPY", "Ccedil", "Chi", "Dagger", "Delta", "ETH",
|
||||
"Eacute", "Ecirc", "Egrave", "Epsilon", "Eta", "Euml", "Gamma", "GT",
|
||||
"Iacute", "Icirc", "Igrave", "Iota", "Iuml", "Kappa", "Lambda", "LT",
|
||||
"Mu", "Ntilde", "Nu", "OElig", "Oacute", "Ocirc", "Ograve", "Omega",
|
||||
"Omicron", "Oslash", "Otilde", "Ouml", "Phi", "Pi", "Prime", "Psi",
|
||||
"QUOT", "REG", "Rho", "Scaron", "Sigma", "THORN", "Tau", "Theta",
|
||||
"Uacute", "Ucirc", "Ugrave", "Upsilon", "Uuml", "Xi", "Yacute", "Yuml",
|
||||
"Zeta", "aacute", "acirc", "acute", "aelig", "agrave", "alefsym",
|
||||
"alpha", "amp", "and", "ang", "apos", "aring", "asymp", "atilde",
|
||||
"auml", "bdquo", "beta", "brvbar", "bull", "cap", "ccedil", "cedil",
|
||||
"cent", "chi", "circ", "clubs", "cong", "copy", "crarr", "cup",
|
||||
"curren", "dArr", "dagger", "darr", "deg", "delta", "diams", "divide",
|
||||
"eacute", "ecirc", "egrave", "empty", "emsp", "ensp", "epsilon",
|
||||
"equiv", "eta", "eth", "euml", "euro", "exist", "fnof", "forall",
|
||||
"frac12", "frac14", "frac34", "frasl", "gamma", "ge", "gt", "hArr",
|
||||
"harr", "hearts", "hellip", "iacute", "icirc", "iexcl", "igrave",
|
||||
"image", "infin", "int", "iota", "iquest", "isin", "iuml", "kappa",
|
||||
"lArr", "lambda", "lang", "laquo", "larr", "lceil", "ldquo", "le",
|
||||
"lfloor", "lowast", "loz", "lrm", "lsaquo", "lsquo", "lt", "macr",
|
||||
"mdash", "micro", "middot", "minus", "mu", "nabla", "nbsp", "ndash",
|
||||
"ne", "ni", "not", "notin", "nsub", "ntilde", "nu", "oacute", "ocirc",
|
||||
"oelig", "ograve", "oline", "omega", "omicron", "oplus", "or", "ordf",
|
||||
"ordm", "oslash", "otilde", "otimes", "ouml", "para", "part", "permil",
|
||||
"perp", "phi", "pi", "piv", "plusmn", "pound", "prime", "prod", "prop",
|
||||
"psi", "quot", "rArr", "radic", "rang", "raquo", "rarr", "rceil",
|
||||
"rdquo", "real", "reg", "rfloor", "rho", "rlm", "rsaquo", "rsquo",
|
||||
"sbquo", "scaron", "sdot", "sect", "shy", "sigma", "sigmaf", "sim",
|
||||
"spades", "sub", "sube", "sum", "sup", "sup1", "sup2", "sup3", "supe",
|
||||
"szlig", "tau", "there4", "theta", "thetasym", "thinsp", "thorn",
|
||||
"tilde", "times", "trade", "uArr", "uacute", "uarr", "ucirc", "ugrave",
|
||||
"uml", "upsih", "upsilon", "uuml", "weierp", "xi", "yacute", "yen",
|
||||
"yuml", "zeta", "zwj", "zwnj"
|
||||
};
|
||||
|
||||
public static String randomHtmlishString(Random random, int numElements) {
|
||||
final int end = random.nextInt(numElements);
|
||||
if (end == 0) {
|
||||
|
@ -258,17 +293,80 @@ public class _TestUtil {
|
|||
}
|
||||
StringBuilder sb = new StringBuilder();
|
||||
for (int i = 0; i < end; i++) {
|
||||
int val = random.nextInt(10);
|
||||
int val = random.nextInt(25);
|
||||
switch(val) {
|
||||
case 0: sb.append("<p>"); break;
|
||||
case 1: sb.append("</p>"); break;
|
||||
case 2: sb.append("<!--"); break;
|
||||
case 3: sb.append("-->"); break;
|
||||
case 4: sb.append("&#"); break;
|
||||
case 5: sb.append(";"); break;
|
||||
case 6: sb.append((char)_TestUtil.nextInt(random, '0', '9')); break;
|
||||
default:
|
||||
sb.append((char)_TestUtil.nextInt(random, 'a', 'z'));
|
||||
case 1: {
|
||||
sb.append("<");
|
||||
sb.append(" ".substring(nextInt(random, 0, 4)));
|
||||
sb.append(randomSimpleString(random));
|
||||
for (int j = 0 ; j < nextInt(random, 0, 10) ; ++j) {
|
||||
sb.append(' ');
|
||||
sb.append(randomSimpleString(random));
|
||||
sb.append(" ".substring(nextInt(random, 0, 1)));
|
||||
sb.append('=');
|
||||
sb.append(" ".substring(nextInt(random, 0, 1)));
|
||||
sb.append("\"".substring(nextInt(random, 0, 1)));
|
||||
sb.append(randomSimpleString(random));
|
||||
sb.append("\"".substring(nextInt(random, 0, 1)));
|
||||
}
|
||||
sb.append(" ".substring(nextInt(random, 0, 4)));
|
||||
sb.append("/".substring(nextInt(random, 0, 1)));
|
||||
sb.append(">".substring(nextInt(random, 0, 1)));
|
||||
break;
|
||||
}
|
||||
case 2: {
|
||||
sb.append("</");
|
||||
sb.append(" ".substring(nextInt(random, 0, 4)));
|
||||
sb.append(randomSimpleString(random));
|
||||
sb.append(" ".substring(nextInt(random, 0, 4)));
|
||||
sb.append(">".substring(nextInt(random, 0, 1)));
|
||||
break;
|
||||
}
|
||||
case 3: sb.append(">"); break;
|
||||
case 4: sb.append("</p>"); break;
|
||||
case 5: sb.append("<!--"); break;
|
||||
case 6: sb.append("<!--#"); break;
|
||||
case 7: sb.append("<script><!-- f('"); break;
|
||||
case 8: sb.append("</script>"); break;
|
||||
case 9: sb.append("<?"); break;
|
||||
case 10: sb.append("?>"); break;
|
||||
case 11: sb.append("\""); break;
|
||||
case 12: sb.append("\\\""); break;
|
||||
case 13: sb.append("'"); break;
|
||||
case 14: sb.append("\\'"); break;
|
||||
case 15: sb.append("-->"); break;
|
||||
case 16: {
|
||||
sb.append("&");
|
||||
switch(nextInt(random, 0, 2)) {
|
||||
case 0: sb.append(randomSimpleString(random)); break;
|
||||
case 1: sb.append(HTML_CHAR_ENTITIES[random.nextInt(HTML_CHAR_ENTITIES.length)]); break;
|
||||
}
|
||||
sb.append(";".substring(nextInt(random, 0, 1)));
|
||||
break;
|
||||
}
|
||||
case 17: {
|
||||
sb.append("&#");
|
||||
if (0 == nextInt(random, 0, 1)) {
|
||||
sb.append(nextInt(random, 0, Integer.MAX_VALUE - 1));
|
||||
sb.append(";".substring(nextInt(random, 0, 1)));
|
||||
}
|
||||
break;
|
||||
}
|
||||
case 18: {
|
||||
sb.append("&#x");
|
||||
if (0 == nextInt(random, 0, 1)) {
|
||||
sb.append(Integer.toString(nextInt(random, 0, Integer.MAX_VALUE - 1), 16));
|
||||
sb.append(";".substring(nextInt(random, 0, 1)));
|
||||
}
|
||||
break;
|
||||
}
|
||||
|
||||
case 19: sb.append(";"); break;
|
||||
case 20: sb.append(nextInt(random, 0, Integer.MAX_VALUE - 1)); break;
|
||||
case 21: sb.append("\n");
|
||||
case 22: sb.append(" ".substring(nextInt(random, 0, 10)));
|
||||
default: sb.append(randomSimpleString(random));
|
||||
}
|
||||
}
|
||||
return sb.toString();
|
||||
|
|
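A minimal sketch of how a test might feed this generator into the HTMLStripCharFilter added later in this commit; the class name and seed are illustrative, and the CharReader/HTMLStripCharFilter signatures are assumed to be the trunk-era ones shown elsewhere in this change.

import java.io.StringReader;
import java.util.Random;

import org.apache.lucene.analysis.CharReader;
import org.apache.lucene.analysis.charfilter.HTMLStripCharFilter;
import org.apache.lucene.util._TestUtil;

// Generates a blob of random tag soup and runs it through the HTML stripper.
public class RandomHtmlishSmokeTest {
  public static void main(String[] args) throws Exception {
    Random random = new Random(42);
    String html = _TestUtil.randomHtmlishString(random, 200);
    HTMLStripCharFilter filter =
        new HTMLStripCharFilter(CharReader.get(new StringReader(html)));
    StringBuilder stripped = new StringBuilder();
    int ch;
    while ((ch = filter.read()) != -1) {
      stripped.append((char) ch);
    }
    filter.close();
    System.out.println("input length=" + html.length()
        + ", stripped length=" + stripped.length());
  }
}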
|
@ -0,0 +1,58 @@
|
|||
package org.apache.lucene.analysis;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.io.Reader;
|
||||
|
||||
/**
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
public class TestMockCharFilter extends BaseTokenStreamTestCase {
|
||||
|
||||
public void test() throws IOException {
|
||||
Analyzer analyzer = new Analyzer() {
|
||||
|
||||
@Override
|
||||
protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
|
||||
Tokenizer tokenizer = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false);
|
||||
return new TokenStreamComponents(tokenizer, tokenizer);
|
||||
}
|
||||
|
||||
@Override
|
||||
protected Reader initReader(Reader reader) {
|
||||
return new MockCharFilter(CharReader.get(reader), 7);
|
||||
}
|
||||
};
|
||||
|
||||
assertAnalyzesTo(analyzer, "ab",
|
||||
new String[] { "aab" },
|
||||
new int[] { 0 },
|
||||
new int[] { 2 }
|
||||
);
|
||||
|
||||
assertAnalyzesTo(analyzer, "aba",
|
||||
new String[] { "aabaa" },
|
||||
new int[] { 0 },
|
||||
new int[] { 3 }
|
||||
);
|
||||
|
||||
assertAnalyzesTo(analyzer, "abcdefga",
|
||||
new String[] { "aabcdefgaa" },
|
||||
new int[] { 0 },
|
||||
new int[] { 8 }
|
||||
);
|
||||
}
|
||||
}
|
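The initReader() hook exercised above is also how the HTML stripper introduced in this commit can be wired into an analyzer; a hedged sketch, reusing the test-framework MockTokenizer for brevity (the analyzer class itself is illustrative).

import java.io.Reader;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.CharReader;
import org.apache.lucene.analysis.MockTokenizer;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.charfilter.HTMLStripCharFilter;

// Same wiring as the test above, but stripping HTML instead of inserting mock chars.
public final class HtmlStrippingAnalyzer extends Analyzer {
  @Override
  protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
    Tokenizer tokenizer = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false);
    return new TokenStreamComponents(tokenizer, tokenizer);
  }

  @Override
  protected Reader initReader(Reader reader) {
    // Token offsets are mapped back to the unstripped input by BaseCharFilter's
    // offset-correction table.
    return new HTMLStripCharFilter(CharReader.get(reader));
  }
}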
|
@ -22,29 +22,46 @@ import java.util.HashMap;
|
|||
import java.util.List;
|
||||
import java.util.Map;
|
||||
|
||||
import org.apache.lucene.analysis.Analyzer;
|
||||
import org.apache.lucene.analysis.CannedAnalyzer;
|
||||
import org.apache.lucene.analysis.MockAnalyzer;
|
||||
import org.apache.lucene.analysis.MockPayloadAnalyzer;
|
||||
import org.apache.lucene.analysis.Token;
|
||||
import org.apache.lucene.codecs.Codec;
|
||||
import org.apache.lucene.codecs.lucene40.Lucene40PostingsFormat;
|
||||
import org.apache.lucene.document.Document;
|
||||
import org.apache.lucene.document.Field;
|
||||
import org.apache.lucene.document.FieldType;
|
||||
import org.apache.lucene.document.NumericField;
|
||||
import org.apache.lucene.document.StringField;
|
||||
import org.apache.lucene.document.TextField;
|
||||
import org.apache.lucene.search.DocIdSetIterator;
|
||||
import org.apache.lucene.search.FieldCache;
|
||||
import org.apache.lucene.store.Directory;
|
||||
import org.apache.lucene.util.BytesRef;
|
||||
import org.apache.lucene.util.English;
|
||||
import org.apache.lucene.util.LuceneTestCase;
|
||||
import org.junit.Assume;
|
||||
import org.apache.lucene.util._TestUtil;
|
||||
|
||||
public class TestPostingsOffsets extends LuceneTestCase {
|
||||
IndexWriterConfig iwc;
|
||||
|
||||
public void setUp() throws Exception {
|
||||
super.setUp();
|
||||
// Currently only SimpleText and Lucene40 can index offsets into postings:
|
||||
assumeTrue("codec does not support offsets", Codec.getDefault().getName().equals("SimpleText") || Codec.getDefault().getName().equals("Lucene40"));
|
||||
iwc = newIndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(random));
|
||||
|
||||
if (Codec.getDefault().getName().equals("Lucene40")) {
|
||||
// pulsing etc are not implemented
|
||||
iwc.setCodec(_TestUtil.alwaysPostingsFormat(new Lucene40PostingsFormat()));
|
||||
}
|
||||
}
|
||||
|
||||
public void testBasic() throws Exception {
|
||||
|
||||
// Currently only SimpleText can index offsets into postings:
|
||||
Assume.assumeTrue(Codec.getDefault().getName().equals("SimpleText"));
|
||||
|
||||
Directory dir = newDirectory();
|
||||
RandomIndexWriter w = new RandomIndexWriter(random, dir);
|
||||
|
||||
RandomIndexWriter w = new RandomIndexWriter(random, dir, iwc);
|
||||
Document doc = new Document();
|
||||
|
||||
FieldType ft = new FieldType(TextField.TYPE_UNSTORED);
|
||||
|
@ -94,16 +111,117 @@ public class TestPostingsOffsets extends LuceneTestCase {
|
|||
r.close();
|
||||
dir.close();
|
||||
}
|
||||
|
||||
public void testSkipping() throws Exception {
|
||||
doTestNumbers(false);
|
||||
}
|
||||
|
||||
public void testPayloads() throws Exception {
|
||||
doTestNumbers(true);
|
||||
}
|
||||
|
||||
public void doTestNumbers(boolean withPayloads) throws Exception {
|
||||
Directory dir = newDirectory();
|
||||
Analyzer analyzer = withPayloads ? new MockPayloadAnalyzer() : new MockAnalyzer(random);
|
||||
iwc = newIndexWriterConfig(TEST_VERSION_CURRENT, analyzer);
|
||||
if (Codec.getDefault().getName().equals("Lucene40")) {
|
||||
// pulsing etc are not implemented
|
||||
iwc.setCodec(_TestUtil.alwaysPostingsFormat(new Lucene40PostingsFormat()));
|
||||
}
|
||||
iwc.setMergePolicy(newLogMergePolicy()); // will rely on docids a bit for skipping
|
||||
RandomIndexWriter w = new RandomIndexWriter(random, dir, iwc);
|
||||
|
||||
FieldType ft = new FieldType(TextField.TYPE_STORED);
|
||||
ft.setIndexOptions(FieldInfo.IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS);
|
||||
if (random.nextBoolean()) {
|
||||
ft.setStoreTermVectors(true);
|
||||
ft.setStoreTermVectorOffsets(random.nextBoolean());
|
||||
ft.setStoreTermVectorPositions(random.nextBoolean());
|
||||
}
|
||||
|
||||
int numDocs = atLeast(500);
|
||||
for (int i = 0; i < numDocs; i++) {
|
||||
Document doc = new Document();
|
||||
doc.add(new Field("numbers", English.intToEnglish(i), ft));
|
||||
doc.add(new Field("oddeven", (i % 2) == 0 ? "even" : "odd", ft));
|
||||
doc.add(new StringField("id", "" + i));
|
||||
w.addDocument(doc);
|
||||
}
|
||||
|
||||
IndexReader reader = w.getReader();
|
||||
w.close();
|
||||
|
||||
String terms[] = { "one", "two", "three", "four", "five", "six", "seven", "eight", "nine", "ten", "hundred" };
|
||||
|
||||
for (String term : terms) {
|
||||
DocsAndPositionsEnum dp = MultiFields.getTermPositionsEnum(reader, null, "numbers", new BytesRef(term), true);
|
||||
int doc;
|
||||
while((doc = dp.nextDoc()) != DocsEnum.NO_MORE_DOCS) {
|
||||
String storedNumbers = reader.document(doc).get("numbers");
|
||||
int freq = dp.freq();
|
||||
for (int i = 0; i < freq; i++) {
|
||||
dp.nextPosition();
|
||||
int start = dp.startOffset();
|
||||
assert start >= 0;
|
||||
int end = dp.endOffset();
|
||||
assert end >= 0 && end >= start;
|
||||
// check that the offsets correspond to the term in the src text
|
||||
assertTrue(storedNumbers.substring(start, end).equals(term));
|
||||
if (withPayloads) {
|
||||
// check that we have a payload and it starts with "pos"
|
||||
assertTrue(dp.hasPayload());
|
||||
BytesRef payload = dp.getPayload();
|
||||
assertTrue(payload.utf8ToString().startsWith("pos:"));
|
||||
} // note: withPayloads=false doesn't necessarily mean we don't have them from MockAnalyzer!
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// check we can skip correctly
|
||||
int numSkippingTests = atLeast(50);
|
||||
|
||||
for (int j = 0; j < numSkippingTests; j++) {
|
||||
int num = _TestUtil.nextInt(random, 100, Math.min(numDocs-1, 999));
|
||||
DocsAndPositionsEnum dp = MultiFields.getTermPositionsEnum(reader, null, "numbers", new BytesRef("hundred"), true);
|
||||
int doc = dp.advance(num);
|
||||
assertEquals(num, doc);
|
||||
int freq = dp.freq();
|
||||
for (int i = 0; i < freq; i++) {
|
||||
String storedNumbers = reader.document(doc).get("numbers");
|
||||
dp.nextPosition();
|
||||
int start = dp.startOffset();
|
||||
assert start >= 0;
|
||||
int end = dp.endOffset();
|
||||
assert end >= 0 && end >= start;
|
||||
// check that the offsets correspond to the term in the src text
|
||||
assertTrue(storedNumbers.substring(start, end).equals("hundred"));
|
||||
if (withPayloads) {
|
||||
// check that we have a payload and it starts with "pos"
|
||||
assertTrue(dp.hasPayload());
|
||||
BytesRef payload = dp.getPayload();
|
||||
assertTrue(payload.utf8ToString().startsWith("pos:"));
|
||||
} // note: withPayloads=false doesn't necessarily mean we don't have them from MockAnalyzer!
|
||||
}
|
||||
}
|
||||
|
||||
// check that other fields (without offsets) work correctly
|
||||
|
||||
for (int i = 0; i < numDocs; i++) {
|
||||
DocsEnum dp = MultiFields.getTermDocsEnum(reader, null, "id", new BytesRef("" + i), false);
|
||||
assertEquals(i, dp.nextDoc());
|
||||
assertEquals(DocIdSetIterator.NO_MORE_DOCS, dp.nextDoc());
|
||||
}
|
||||
|
||||
reader.close();
|
||||
dir.close();
|
||||
}
|
||||
|
||||
public void testRandom() throws Exception {
|
||||
// Currently only SimpleText can index offsets into postings:
|
||||
Assume.assumeTrue(Codec.getDefault().getName().equals("SimpleText"));
|
||||
|
||||
// token -> docID -> tokens
|
||||
final Map<String,Map<Integer,List<Token>>> actualTokens = new HashMap<String,Map<Integer,List<Token>>>();
|
||||
|
||||
Directory dir = newDirectory();
|
||||
RandomIndexWriter w = new RandomIndexWriter(random, dir);
|
||||
RandomIndexWriter w = new RandomIndexWriter(random, dir, iwc);
|
||||
|
||||
final int numDocs = atLeast(20);
|
||||
//final int numDocs = atLeast(5);
|
||||
|
|
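Reduced to its core, the API this test drives looks like the helper below; a sketch only, assuming the trunk-era MultiFields and DocsAndPositionsEnum signatures used above, with illustrative class and argument names.

import java.io.IOException;

import org.apache.lucene.index.DocsAndPositionsEnum;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.MultiFields;
import org.apache.lucene.search.DocIdSetIterator;
import org.apache.lucene.util.BytesRef;

// Prints every (doc, startOffset, endOffset) occurrence of a term, relying on
// offsets having been indexed into postings as the test configures above.
public class OffsetDumper {
  public static void dump(IndexReader reader, String field, String term) throws IOException {
    DocsAndPositionsEnum dp =
        MultiFields.getTermPositionsEnum(reader, null, field, new BytesRef(term), true);
    if (dp == null) {
      return; // term absent, or the field has no positions
    }
    int doc;
    while ((doc = dp.nextDoc()) != DocIdSetIterator.NO_MORE_DOCS) {
      int freq = dp.freq();
      for (int i = 0; i < freq; i++) {
        dp.nextPosition();
        System.out.println("doc=" + doc
            + " start=" + dp.startOffset() + " end=" + dp.endOffset());
      }
    }
  }
}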
|
@ -31,6 +31,7 @@ import org.apache.lucene.search.CheckHits;
|
|||
import org.apache.lucene.search.IndexSearcher;
|
||||
import org.apache.lucene.search.Query;
|
||||
import org.apache.lucene.search.QueryUtils;
|
||||
import org.apache.lucene.search.similarities.TFIDFSimilarity;
|
||||
import org.apache.lucene.store.Directory;
|
||||
import org.apache.lucene.util.LuceneTestCase;
|
||||
import org.junit.AfterClass;
|
||||
|
@ -240,6 +241,8 @@ public class TestFieldMaskingSpanQuery extends LuceneTestCase {
|
|||
}
|
||||
|
||||
public void testSimple2() throws Exception {
|
||||
assumeTrue("Broken scoring: LUCENE-3723",
|
||||
searcher.getSimilarityProvider().get("id") instanceof TFIDFSimilarity);
|
||||
SpanQuery q1 = new SpanTermQuery(new Term("gender", "female"));
|
||||
SpanQuery q2 = new SpanTermQuery(new Term("last", "smith"));
|
||||
SpanQuery q = new SpanNearQuery(new SpanQuery[]
|
||||
|
@ -310,6 +313,8 @@ public class TestFieldMaskingSpanQuery extends LuceneTestCase {
|
|||
}
|
||||
|
||||
public void testSpans2() throws Exception {
|
||||
assumeTrue("Broken scoring: LUCENE-3723",
|
||||
searcher.getSimilarityProvider().get("id") instanceof TFIDFSimilarity);
|
||||
SpanQuery qA1 = new SpanTermQuery(new Term("gender", "female"));
|
||||
SpanQuery qA2 = new SpanTermQuery(new Term("first", "james"));
|
||||
SpanQuery qA = new SpanOrQuery(qA1, new FieldMaskingSpanQuery(qA2, "gender"));
|
||||
|
|
|
@ -20,6 +20,8 @@ package org.apache.lucene.util;
|
|||
|
||||
import org.junit.Test;
|
||||
|
||||
import java.util.HashSet;
|
||||
|
||||
/**
|
||||
*
|
||||
*
|
||||
|
@ -45,4 +47,32 @@ public class TestSentinelIntSet extends LuceneTestCase {
|
|||
assertEquals(20, set.size());
|
||||
assertEquals(24, set.rehashCount);
|
||||
}
|
||||
|
||||
|
||||
@Test
|
||||
public void testRandom() throws Exception {
|
||||
for (int i=0; i<10000; i++) {
|
||||
int initSz = random.nextInt(20);
|
||||
int num = random.nextInt(30);
|
||||
int maxVal = (random.nextBoolean() ? random.nextInt(50) : random.nextInt(Integer.MAX_VALUE)) + 1;
|
||||
|
||||
HashSet<Integer> a = new HashSet<Integer>(initSz);
|
||||
SentinelIntSet b = new SentinelIntSet(initSz, -1);
|
||||
|
||||
for (int j=0; j<num; j++) {
|
||||
int val = random.nextInt(maxVal);
|
||||
boolean exists = !a.add(val);
|
||||
boolean existsB = b.exists(val);
|
||||
assertEquals(exists, existsB);
|
||||
int slot = b.find(val);
|
||||
assertEquals(exists, slot>=0);
|
||||
b.put(val);
|
||||
|
||||
assertEquals(a.size(), b.size());
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
}
|
||||
|
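For reference, a minimal standalone use of SentinelIntSet mirroring what the random test above asserts; the numbers and class name are illustrative, the methods are the ones exercised by the test.

import org.apache.lucene.util.SentinelIntSet;

// SentinelIntSet stores primitive ints directly and reserves one value (here -1)
// to mark empty slots, so -1 itself must never be put into the set.
public class SentinelIntSetExample {
  public static void main(String[] args) {
    SentinelIntSet set = new SentinelIntSet(16, -1); // initial size, sentinel value
    set.put(42);
    set.put(7);
    System.out.println(set.exists(42));   // true
    System.out.println(set.exists(99));   // false
    System.out.println(set.find(7) >= 0); // true: find() returns the slot, negative if absent
    System.out.println(set.size());       // 2
  }
}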
|
|
@ -1055,6 +1055,50 @@ public class TestFSTs extends LuceneTestCase {
|
|||
}
|
||||
}
|
||||
|
||||
// NOTE: this test shows a case where our current builder
|
||||
// fails to produce minimal FST:
|
||||
/*
|
||||
public void test3() throws Exception {
|
||||
final PositiveIntOutputs outputs = PositiveIntOutputs.getSingleton(true);
|
||||
Builder<Long> builder = new Builder<Long>(FST.INPUT_TYPE.BYTE1, outputs);
|
||||
IntsRef scratchIntsRef = new IntsRef();
|
||||
builder.add(Util.toIntsRef(new BytesRef("aa$"), scratchIntsRef), outputs.get(0));
|
||||
builder.add(Util.toIntsRef(new BytesRef("aab$"), scratchIntsRef), 1L);
|
||||
builder.add(Util.toIntsRef(new BytesRef("bbb$"), scratchIntsRef), 2L);
|
||||
final FST<Long> fst = builder.finish();
|
||||
//System.out.println("NODES " + fst.getNodeCount() + " ARCS " + fst.getArcCount());
|
||||
// NOTE: we produce 7 nodes today
|
||||
assertEquals(6, fst.getNodeCount());
|
||||
// NOTE: we produce 8 arcs today
|
||||
assertEquals(7, fst.getArcCount());
|
||||
//Writer w = new OutputStreamWriter(new FileOutputStream("out.dot"), "UTF-8");
|
||||
//Util.toDot(fst, w, false, false);
|
||||
//w.close();
|
||||
}
|
||||
*/
|
||||
|
||||
// NOTE: this test shows a case where our current builder
|
||||
// fails to produce minimal FST:
|
||||
/*
|
||||
public void test4() throws Exception {
|
||||
final ByteSequenceOutputs outputs = ByteSequenceOutputs.getSingleton();
|
||||
Builder<BytesRef> builder = new Builder<BytesRef>(FST.INPUT_TYPE.BYTE1, outputs);
|
||||
IntsRef scratchIntsRef = new IntsRef();
|
||||
builder.add(Util.toIntsRef(new BytesRef("aa$"), scratchIntsRef), outputs.getNoOutput());
|
||||
builder.add(Util.toIntsRef(new BytesRef("aab$"), scratchIntsRef), new BytesRef("1"));
|
||||
builder.add(Util.toIntsRef(new BytesRef("bbb$"), scratchIntsRef), new BytesRef("11"));
|
||||
final FST<BytesRef> fst = builder.finish();
|
||||
//System.out.println("NODES " + fst.getNodeCount() + " ARCS " + fst.getArcCount());
|
||||
// NOTE: we produce 7 nodes today
|
||||
assertEquals(6, fst.getNodeCount());
|
||||
// NOTE: we produce 8 arcs today
|
||||
assertEquals(7, fst.getArcCount());
|
||||
//Writer w = new OutputStreamWriter(new FileOutputStream("out.dot"), "UTF-8");
|
||||
//Util.toDot(fst, w, false, false);
|
||||
//w.close();
|
||||
}
|
||||
*/
|
||||
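A working counterpart to the commented-out tests above, reduced to the essentials. A sketch only, assuming the trunk-era Builder and Util signatures used in this file; the Util.get() lookup at the end and the class name are illustrative.

import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.IntsRef;
import org.apache.lucene.util.fst.Builder;
import org.apache.lucene.util.fst.FST;
import org.apache.lucene.util.fst.PositiveIntOutputs;
import org.apache.lucene.util.fst.Util;

// Builds a tiny FST mapping three sorted byte sequences to long ordinals.
public class MiniFstExample {
  public static void main(String[] args) throws Exception {
    PositiveIntOutputs outputs = PositiveIntOutputs.getSingleton(true);
    Builder<Long> builder = new Builder<Long>(FST.INPUT_TYPE.BYTE1, outputs);
    IntsRef scratch = new IntsRef();
    // Inputs must be added in sorted order.
    builder.add(Util.toIntsRef(new BytesRef("aa"), scratch), 10L);
    builder.add(Util.toIntsRef(new BytesRef("aab"), scratch), 11L);
    builder.add(Util.toIntsRef(new BytesRef("bbb"), scratch), 12L);
    FST<Long> fst = builder.finish();
    // Look one input back up; null would mean the input is not accepted.
    Long value = Util.get(fst, new BytesRef("aab"));
    System.out.println(value); // 11
  }
}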
|
||||
// Build FST for all unique terms in the test line docs
|
||||
// file, up until a time limit
|
||||
public void testRealTerms() throws Exception {
|
||||
|
|
|
@ -31,14 +31,38 @@
|
|||
<target name="compile-core" depends="jflex-notice, common.compile-core"/>
|
||||
|
||||
<target name="jflex" depends="jflex-check,clean-jflex,gen-uax29-supp-macros,
|
||||
jflex-StandardAnalyzer,jflex-UAX29URLEmailTokenizer,jflex-wiki-tokenizer"/>
|
||||
jflex-StandardAnalyzer,jflex-UAX29URLEmailTokenizer,
|
||||
jflex-wiki-tokenizer,jflex-HTMLStripCharFilter"/>
|
||||
|
||||
<target name="gen-uax29-supp-macros">
|
||||
<subant target="gen-uax29-supp-macros">
|
||||
<fileset dir="../icu" includes="build.xml"/>
|
||||
</subant>
|
||||
</target>
|
||||
|
||||
|
||||
<target name="jflex-HTMLStripCharFilter"
|
||||
depends="init,jflex-check,generate-jflex-html-char-entities"
|
||||
if="jflex.present">
|
||||
<taskdef classname="jflex.anttask.JFlexTask" name="jflex">
|
||||
<classpath refid="jflex.classpath"/>
|
||||
</taskdef>
|
||||
<jflex file="src/java/org/apache/lucene/analysis/charfilter/HTMLStripCharFilter.jflex"
|
||||
outdir="src/java/org/apache/lucene/analysis/charfilter"
|
||||
nobak="on"/>
|
||||
<!-- Remove the inappropriate JFlex-generated constructors -->
|
||||
<replaceregexp file="src/java/org/apache/lucene/analysis/charfilter/HTMLStripCharFilter.java"
|
||||
match="/\*\*\s*\*\s*Creates a new scanner.*this\(new java\.io\.InputStreamReader\(in\)\);\s*\}"
|
||||
replace="" flags="sg"/>
|
||||
</target>
|
||||
|
||||
<target name="generate-jflex-html-char-entities">
|
||||
<exec dir="src/java/org/apache/lucene/analysis/charfilter"
|
||||
output="src/java/org/apache/lucene/analysis/charfilter/HTMLCharacterEntities.jflex"
|
||||
executable="${python.exe}" failonerror="true" logerror="true">
|
||||
<arg value="htmlentity.py"/>
|
||||
</exec>
|
||||
</target>
|
||||
|
||||
<target name="jflex-wiki-tokenizer" depends="init,jflex-check" if="jflex.present">
|
||||
<taskdef classname="jflex.anttask.JFlexTask" name="jflex">
|
||||
<classpath refid="jflex.classpath"/>
|
||||
|
|
|
@ -20,6 +20,8 @@ package org.apache.lucene.analysis.charfilter;
|
|||
import org.apache.lucene.analysis.CharStream;
|
||||
import org.apache.lucene.util.ArrayUtil;
|
||||
|
||||
import java.util.Arrays;
|
||||
|
||||
/**
|
||||
* Base utility class for implementing a {@link CharFilter}.
|
||||
* You subclass this, and then record mappings by calling
|
||||
|
@ -71,6 +73,19 @@ public abstract class BaseCharFilter extends CharFilter {
|
|||
0 : diffs[size-1];
|
||||
}
|
||||
|
||||
/**
|
||||
* <p>
|
||||
* Adds an offset correction mapping at the given output stream offset.
|
||||
* </p>
|
||||
* <p>
|
||||
* Assumption: the offset given with each successive call to this method
|
||||
* will not be smaller than the offset given at the previous invocation.
|
||||
* </p>
|
||||
*
|
||||
* @param off The output stream offset at which to apply the correction
|
||||
* @param cumulativeDiff The input offset is given by adding this
|
||||
* to the output offset
|
||||
*/
|
||||
protected void addOffCorrectMap(int off, int cumulativeDiff) {
|
||||
if (offsets == null) {
|
||||
offsets = new int[64];
|
||||
|
@ -80,7 +95,15 @@ public abstract class BaseCharFilter extends CharFilter {
|
|||
diffs = ArrayUtil.grow(diffs);
|
||||
}
|
||||
|
||||
offsets[size] = off;
|
||||
diffs[size++] = cumulativeDiff;
|
||||
assert (size == 0 || off >= offsets[size - 1])
|
||||
: "Offset #" + size + "(" + off + ") is less than the last recorded offset "
|
||||
+ offsets[size] + "\n" + Arrays.toString(offsets) + "\n" + Arrays.toString(diffs);
|
||||
|
||||
if (size == 0 || off != offsets[size - 1]) {
|
||||
offsets[size] = off;
|
||||
diffs[size++] = cumulativeDiff;
|
||||
} else { // Overwrite the diff at the last recorded offset
|
||||
diffs[size - 1] = cumulativeDiff;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
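The map built by addOffCorrectMap() is what correctOffset() later consults to translate an offset in the filtered output back into an offset in the original input. The following standalone snippet illustrates that lookup over the parallel offsets/diffs arrays described above; it is a sketch of the idea, not the actual BaseCharFilter implementation.

// Illustrative only: given parallel arrays of output offsets and cumulative diffs
// (as maintained by addOffCorrectMap above), map an output offset to the input.
public class OffsetCorrectionSketch {
  static int correct(int currentOff, int[] offsets, int[] diffs, int size) {
    if (size == 0 || currentOff < offsets[0]) {
      return currentOff; // before the first correction point: no shift
    }
    int i = 0;
    // Find the last recorded correction point at or before currentOff.
    while (i + 1 < size && offsets[i + 1] <= currentOff) {
      i++;
    }
    return currentOff + diffs[i];
  }

  public static void main(String[] args) {
    // Stripping "<b>" (3 chars) before output offset 0 and "</b>" (4 more chars)
    // before output offset 5 gives cumulative diffs of 3 and 7 for
    // input "<b>hello</b>world" -> output "helloworld".
    int[] offsets = { 0, 5 };
    int[] diffs   = { 3, 7 };
    System.out.println(correct(2, offsets, diffs, 2)); // 5  ('l' in the input)
    System.out.println(correct(6, offsets, diffs, 2)); // 13 ('o' of "world")
  }
}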
|
@ -0,0 +1,153 @@
|
|||
/**
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
|
||||
CharacterEntities = ( "AElig" | "Aacute" | "Acirc" | "Agrave" | "Alpha"
|
||||
| "Aring" | "Atilde" | "Auml" | "Beta" | "Ccedil" | "Chi"
|
||||
| "Dagger" | "Delta" | "ETH" | "Eacute" | "Ecirc"
|
||||
| "Egrave" | "Epsilon" | "Eta" | "Euml" | "Gamma"
|
||||
| "Iacute" | "Icirc" | "Igrave" | "Iota" | "Iuml" | "Kappa"
|
||||
| "Lambda" | "Mu" | "Ntilde" | "Nu" | "OElig" | "Oacute"
|
||||
| "Ocirc" | "Ograve" | "Omega" | "Omicron" | "Oslash"
|
||||
| "Otilde" | "Ouml" | "Phi" | "Pi" | "Prime" | "Psi"
|
||||
| "Rho" | "Scaron" | "Sigma" | "THORN" | "Tau" | "Theta"
|
||||
| "Uacute" | "Ucirc" | "Ugrave" | "Upsilon" | "Uuml" | "Xi"
|
||||
| "Yacute" | "Yuml" | "Zeta" | "aacute" | "acirc" | "acute"
|
||||
| "aelig" | "agrave" | "alefsym" | "alpha" | "amp" | "AMP"
|
||||
| "and" | "ang" | "apos" | "aring" | "asymp" | "atilde"
|
||||
| "auml" | "bdquo" | "beta" | "brvbar" | "bull" | "cap"
|
||||
| "ccedil" | "cedil" | "cent" | "chi" | "circ" | "clubs"
|
||||
| "cong" | "copy" | "COPY" | "crarr" | "cup" | "curren"
|
||||
| "dArr" | "dagger" | "darr" | "deg" | "delta" | "diams"
|
||||
| "divide" | "eacute" | "ecirc" | "egrave" | "empty"
|
||||
| "emsp" | "ensp" | "epsilon" | "equiv" | "eta" | "eth"
|
||||
| "euml" | "euro" | "exist" | "fnof" | "forall" | "frac12"
|
||||
| "frac14" | "frac34" | "frasl" | "gamma" | "ge" | "gt"
|
||||
| "GT" | "hArr" | "harr" | "hearts" | "hellip" | "iacute"
|
||||
| "icirc" | "iexcl" | "igrave" | "image" | "infin" | "int"
|
||||
| "iota" | "iquest" | "isin" | "iuml" | "kappa" | "lArr"
|
||||
| "lambda" | "lang" | "laquo" | "larr" | "lceil" | "ldquo"
|
||||
| "le" | "lfloor" | "lowast" | "loz" | "lrm" | "lsaquo"
|
||||
| "lsquo" | "lt" | "LT" | "macr" | "mdash" | "micro"
|
||||
| "middot" | "minus" | "mu" | "nabla" | "nbsp" | "ndash"
|
||||
| "ne" | "ni" | "not" | "notin" | "nsub" | "ntilde" | "nu"
|
||||
| "oacute" | "ocirc" | "oelig" | "ograve" | "oline"
|
||||
| "omega" | "omicron" | "oplus" | "or" | "ordf" | "ordm"
|
||||
| "oslash" | "otilde" | "otimes" | "ouml" | "para" | "part"
|
||||
| "permil" | "perp" | "phi" | "pi" | "piv" | "plusmn"
|
||||
| "pound" | "prime" | "prod" | "prop" | "psi" | "quot"
|
||||
| "QUOT" | "rArr" | "radic" | "rang" | "raquo" | "rarr"
|
||||
| "rceil" | "rdquo" | "real" | "reg" | "REG" | "rfloor"
|
||||
| "rho" | "rlm" | "rsaquo" | "rsquo" | "sbquo" | "scaron"
|
||||
| "sdot" | "sect" | "shy" | "sigma" | "sigmaf" | "sim"
|
||||
| "spades" | "sub" | "sube" | "sum" | "sup" | "sup1"
|
||||
| "sup2" | "sup3" | "supe" | "szlig" | "tau" | "there4"
|
||||
| "theta" | "thetasym" | "thinsp" | "thorn" | "tilde"
|
||||
| "times" | "trade" | "uArr" | "uacute" | "uarr" | "ucirc"
|
||||
| "ugrave" | "uml" | "upsih" | "upsilon" | "uuml"
|
||||
| "weierp" | "xi" | "yacute" | "yen" | "yuml" | "zeta"
|
||||
| "zwj" | "zwnj" )
|
||||
%{
|
||||
private static final Set<String> upperCaseVariantsAccepted
|
||||
= new HashSet<String>(Arrays.asList("quot","copy","gt","lt","reg","amp"));
|
||||
private static final CharArrayMap<Character> entityValues
|
||||
= new CharArrayMap<Character>(Version.LUCENE_40, 253, false);
|
||||
static {
|
||||
String[] entities = {
|
||||
"AElig", "\u00C6", "Aacute", "\u00C1", "Acirc", "\u00C2",
|
||||
"Agrave", "\u00C0", "Alpha", "\u0391", "Aring", "\u00C5",
|
||||
"Atilde", "\u00C3", "Auml", "\u00C4", "Beta", "\u0392",
|
||||
"Ccedil", "\u00C7", "Chi", "\u03A7", "Dagger", "\u2021",
|
||||
"Delta", "\u0394", "ETH", "\u00D0", "Eacute", "\u00C9",
|
||||
"Ecirc", "\u00CA", "Egrave", "\u00C8", "Epsilon", "\u0395",
|
||||
"Eta", "\u0397", "Euml", "\u00CB", "Gamma", "\u0393", "Iacute", "\u00CD",
|
||||
"Icirc", "\u00CE", "Igrave", "\u00CC", "Iota", "\u0399",
|
||||
"Iuml", "\u00CF", "Kappa", "\u039A", "Lambda", "\u039B", "Mu", "\u039C",
|
||||
"Ntilde", "\u00D1", "Nu", "\u039D", "OElig", "\u0152",
|
||||
"Oacute", "\u00D3", "Ocirc", "\u00D4", "Ograve", "\u00D2",
|
||||
"Omega", "\u03A9", "Omicron", "\u039F", "Oslash", "\u00D8",
|
||||
"Otilde", "\u00D5", "Ouml", "\u00D6", "Phi", "\u03A6", "Pi", "\u03A0",
|
||||
"Prime", "\u2033", "Psi", "\u03A8", "Rho", "\u03A1", "Scaron", "\u0160",
|
||||
"Sigma", "\u03A3", "THORN", "\u00DE", "Tau", "\u03A4", "Theta", "\u0398",
|
||||
"Uacute", "\u00DA", "Ucirc", "\u00DB", "Ugrave", "\u00D9",
|
||||
"Upsilon", "\u03A5", "Uuml", "\u00DC", "Xi", "\u039E",
|
||||
"Yacute", "\u00DD", "Yuml", "\u0178", "Zeta", "\u0396",
|
||||
"aacute", "\u00E1", "acirc", "\u00E2", "acute", "\u00B4",
|
||||
"aelig", "\u00E6", "agrave", "\u00E0", "alefsym", "\u2135",
|
||||
"alpha", "\u03B1", "amp", "\u0026", "and", "\u2227", "ang", "\u2220",
|
||||
"apos", "\u0027", "aring", "\u00E5", "asymp", "\u2248",
|
||||
"atilde", "\u00E3", "auml", "\u00E4", "bdquo", "\u201E",
|
||||
"beta", "\u03B2", "brvbar", "\u00A6", "bull", "\u2022", "cap", "\u2229",
|
||||
"ccedil", "\u00E7", "cedil", "\u00B8", "cent", "\u00A2", "chi", "\u03C7",
|
||||
"circ", "\u02C6", "clubs", "\u2663", "cong", "\u2245", "copy", "\u00A9",
|
||||
"crarr", "\u21B5", "cup", "\u222A", "curren", "\u00A4", "dArr", "\u21D3",
|
||||
"dagger", "\u2020", "darr", "\u2193", "deg", "\u00B0", "delta", "\u03B4",
|
||||
"diams", "\u2666", "divide", "\u00F7", "eacute", "\u00E9",
|
||||
"ecirc", "\u00EA", "egrave", "\u00E8", "empty", "\u2205",
|
||||
"emsp", "\u2003", "ensp", "\u2002", "epsilon", "\u03B5",
|
||||
"equiv", "\u2261", "eta", "\u03B7", "eth", "\u00F0", "euml", "\u00EB",
|
||||
"euro", "\u20AC", "exist", "\u2203", "fnof", "\u0192",
|
||||
"forall", "\u2200", "frac12", "\u00BD", "frac14", "\u00BC",
|
||||
"frac34", "\u00BE", "frasl", "\u2044", "gamma", "\u03B3", "ge", "\u2265",
|
||||
"gt", "\u003E", "hArr", "\u21D4", "harr", "\u2194", "hearts", "\u2665",
|
||||
"hellip", "\u2026", "iacute", "\u00ED", "icirc", "\u00EE",
|
||||
"iexcl", "\u00A1", "igrave", "\u00EC", "image", "\u2111",
|
||||
"infin", "\u221E", "int", "\u222B", "iota", "\u03B9", "iquest", "\u00BF",
|
||||
"isin", "\u2208", "iuml", "\u00EF", "kappa", "\u03BA", "lArr", "\u21D0",
|
||||
"lambda", "\u03BB", "lang", "\u2329", "laquo", "\u00AB",
|
||||
"larr", "\u2190", "lceil", "\u2308", "ldquo", "\u201C", "le", "\u2264",
|
||||
"lfloor", "\u230A", "lowast", "\u2217", "loz", "\u25CA", "lrm", "\u200E",
|
||||
"lsaquo", "\u2039", "lsquo", "\u2018", "lt", "\u003C", "macr", "\u00AF",
|
||||
"mdash", "\u2014", "micro", "\u00B5", "middot", "\u00B7",
|
||||
"minus", "\u2212", "mu", "\u03BC", "nabla", "\u2207", "nbsp", " ",
|
||||
"ndash", "\u2013", "ne", "\u2260", "ni", "\u220B", "not", "\u00AC",
|
||||
"notin", "\u2209", "nsub", "\u2284", "ntilde", "\u00F1", "nu", "\u03BD",
|
||||
"oacute", "\u00F3", "ocirc", "\u00F4", "oelig", "\u0153",
|
||||
"ograve", "\u00F2", "oline", "\u203E", "omega", "\u03C9",
|
||||
"omicron", "\u03BF", "oplus", "\u2295", "or", "\u2228", "ordf", "\u00AA",
|
||||
"ordm", "\u00BA", "oslash", "\u00F8", "otilde", "\u00F5",
|
||||
"otimes", "\u2297", "ouml", "\u00F6", "para", "\u00B6", "part", "\u2202",
|
||||
"permil", "\u2030", "perp", "\u22A5", "phi", "\u03C6", "pi", "\u03C0",
|
||||
"piv", "\u03D6", "plusmn", "\u00B1", "pound", "\u00A3",
|
||||
"prime", "\u2032", "prod", "\u220F", "prop", "\u221D", "psi", "\u03C8",
|
||||
"quot", "\"", "rArr", "\u21D2", "radic", "\u221A", "rang", "\u232A",
|
||||
"raquo", "\u00BB", "rarr", "\u2192", "rceil", "\u2309",
|
||||
"rdquo", "\u201D", "real", "\u211C", "reg", "\u00AE", "rfloor", "\u230B",
|
||||
"rho", "\u03C1", "rlm", "\u200F", "rsaquo", "\u203A", "rsquo", "\u2019",
|
||||
"sbquo", "\u201A", "scaron", "\u0161", "sdot", "\u22C5",
|
||||
"sect", "\u00A7", "shy", "\u00AD", "sigma", "\u03C3", "sigmaf", "\u03C2",
|
||||
"sim", "\u223C", "spades", "\u2660", "sub", "\u2282", "sube", "\u2286",
|
||||
"sum", "\u2211", "sup", "\u2283", "sup1", "\u00B9", "sup2", "\u00B2",
|
||||
"sup3", "\u00B3", "supe", "\u2287", "szlig", "\u00DF", "tau", "\u03C4",
|
||||
"there4", "\u2234", "theta", "\u03B8", "thetasym", "\u03D1",
|
||||
"thinsp", "\u2009", "thorn", "\u00FE", "tilde", "\u02DC",
|
||||
"times", "\u00D7", "trade", "\u2122", "uArr", "\u21D1",
|
||||
"uacute", "\u00FA", "uarr", "\u2191", "ucirc", "\u00FB",
|
||||
"ugrave", "\u00F9", "uml", "\u00A8", "upsih", "\u03D2",
|
||||
"upsilon", "\u03C5", "uuml", "\u00FC", "weierp", "\u2118",
|
||||
"xi", "\u03BE", "yacute", "\u00FD", "yen", "\u00A5", "yuml", "\u00FF",
|
||||
"zeta", "\u03B6", "zwj", "\u200D", "zwnj", "\u200C"
|
||||
};
|
||||
for (int i = 0 ; i < entities.length ; i += 2) {
|
||||
Character value = entities[i + 1].charAt(0);
|
||||
entityValues.put(entities[i], value);
|
||||
if (upperCaseVariantsAccepted.contains(entities[i])) {
|
||||
entityValues.put(entities[i].toUpperCase(), value);
|
||||
}
|
||||
}
|
||||
}
|
||||
%}
|
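A standalone sketch of the lookup behaviour the table above is built for, assuming the CharArrayMap and Version classes already used in the static block; the entity choices and class name are illustrative.

import org.apache.lucene.analysis.util.CharArrayMap;
import org.apache.lucene.util.Version;

// Only a handful of entities (quot, copy, gt, lt, reg, amp) also accept an
// upper-case spelling, matching the upperCaseVariantsAccepted set above.
public class EntityLookupSketch {
  public static void main(String[] args) {
    CharArrayMap<Character> values =
        new CharArrayMap<Character>(Version.LUCENE_40, 16, false);
    values.put("lt", Character.valueOf('\u003C'));
    values.put("LT", Character.valueOf('\u003C')); // upper-case variant registered
    values.put("euro", Character.valueOf('\u20AC'));

    char[] buffer = "lt;".toCharArray();
    // The scanner resolves a matched entity name directly against its buffer.
    System.out.println(values.get(buffer, 0, 2)); // <
    System.out.println(values.get("EURO"));       // null: no upper-case variant
  }
}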
|
@ -0,0 +1,58 @@
|
|||
/*
|
||||
* Copyright 2010 The Apache Software Foundation.
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
// Generated using ICU4J 4.8.1.1 on Friday, January 13, 2012 6:20:39 PM UTC
|
||||
// by org.apache.lucene.analysis.icu.GenerateHTMLStripCharFilterSupplementaryMacros
|
||||
|
||||
|
||||
ID_Start_Supp = (
|
||||
[\uD81A][\uDC00-\uDE38]
|
||||
| [\uD869][\uDC00-\uDED6\uDF00-\uDFFF]
|
||||
| [\uD835][\uDC00-\uDC54\uDC56-\uDC9C\uDC9E\uDC9F\uDCA2\uDCA5\uDCA6\uDCA9-\uDCAC\uDCAE-\uDCB9\uDCBB\uDCBD-\uDCC3\uDCC5-\uDD05\uDD07-\uDD0A\uDD0D-\uDD14\uDD16-\uDD1C\uDD1E-\uDD39\uDD3B-\uDD3E\uDD40-\uDD44\uDD46\uDD4A-\uDD50\uDD52-\uDEA5\uDEA8-\uDEC0\uDEC2-\uDEDA\uDEDC-\uDEFA\uDEFC-\uDF14\uDF16-\uDF34\uDF36-\uDF4E\uDF50-\uDF6E\uDF70-\uDF88\uDF8A-\uDFA8\uDFAA-\uDFC2\uDFC4-\uDFCB]
|
||||
| [\uD80C\uD840-\uD868\uD86A-\uD86C][\uDC00-\uDFFF]
|
||||
| [\uD82C][\uDC00\uDC01]
|
||||
| [\uD804][\uDC03-\uDC37\uDC83-\uDCAF]
|
||||
| [\uD86D][\uDC00-\uDF34\uDF40-\uDFFF]
|
||||
| [\uD87E][\uDC00-\uDE1D]
|
||||
| [\uD809][\uDC00-\uDC62]
|
||||
| [\uD808][\uDC00-\uDF6E]
|
||||
| [\uD803][\uDC00-\uDC48]
|
||||
| [\uD800][\uDC00-\uDC0B\uDC0D-\uDC26\uDC28-\uDC3A\uDC3C\uDC3D\uDC3F-\uDC4D\uDC50-\uDC5D\uDC80-\uDCFA\uDD40-\uDD74\uDE80-\uDE9C\uDEA0-\uDED0\uDF00-\uDF1E\uDF30-\uDF4A\uDF80-\uDF9D\uDFA0-\uDFC3\uDFC8-\uDFCF\uDFD1-\uDFD5]
|
||||
| [\uD80D][\uDC00-\uDC2E]
|
||||
| [\uD86E][\uDC00-\uDC1D]
|
||||
| [\uD802][\uDC00-\uDC05\uDC08\uDC0A-\uDC35\uDC37\uDC38\uDC3C\uDC3F-\uDC55\uDD00-\uDD15\uDD20-\uDD39\uDE00\uDE10-\uDE13\uDE15-\uDE17\uDE19-\uDE33\uDE60-\uDE7C\uDF00-\uDF35\uDF40-\uDF55\uDF60-\uDF72]
|
||||
| [\uD801][\uDC00-\uDC9D]
|
||||
)
|
||||
ID_Continue_Supp = (
|
||||
[\uD81A][\uDC00-\uDE38]
|
||||
| [\uD869][\uDC00-\uDED6\uDF00-\uDFFF]
|
||||
| [\uD80C\uD840-\uD868\uD86A-\uD86C][\uDC00-\uDFFF]
|
||||
| [\uD804][\uDC00-\uDC46\uDC66-\uDC6F\uDC80-\uDCBA]
|
||||
| [\uD82C][\uDC00\uDC01]
|
||||
| [\uD802][\uDC00-\uDC05\uDC08\uDC0A-\uDC35\uDC37\uDC38\uDC3C\uDC3F-\uDC55\uDD00-\uDD15\uDD20-\uDD39\uDE00-\uDE03\uDE05\uDE06\uDE0C-\uDE13\uDE15-\uDE17\uDE19-\uDE33\uDE38-\uDE3A\uDE3F\uDE60-\uDE7C\uDF00-\uDF35\uDF40-\uDF55\uDF60-\uDF72]
|
||||
| [\uD801][\uDC00-\uDC9D\uDCA0-\uDCA9]
|
||||
| [\uD86D][\uDC00-\uDF34\uDF40-\uDFFF]
|
||||
| [\uD87E][\uDC00-\uDE1D]
|
||||
| [\uD809][\uDC00-\uDC62]
|
||||
| [\uD808][\uDC00-\uDF6E]
|
||||
| [\uD803][\uDC00-\uDC48]
|
||||
| [\uD80D][\uDC00-\uDC2E]
|
||||
| [\uD800][\uDC00-\uDC0B\uDC0D-\uDC26\uDC28-\uDC3A\uDC3C\uDC3D\uDC3F-\uDC4D\uDC50-\uDC5D\uDC80-\uDCFA\uDD40-\uDD74\uDDFD\uDE80-\uDE9C\uDEA0-\uDED0\uDF00-\uDF1E\uDF30-\uDF4A\uDF80-\uDF9D\uDFA0-\uDFC3\uDFC8-\uDFCF\uDFD1-\uDFD5]
|
||||
| [\uD86E][\uDC00-\uDC1D]
|
||||
| [\uDB40][\uDD00-\uDDEF]
|
||||
| [\uD834][\uDD65-\uDD69\uDD6D-\uDD72\uDD7B-\uDD82\uDD85-\uDD8B\uDDAA-\uDDAD\uDE42-\uDE44]
|
||||
| [\uD835][\uDC00-\uDC54\uDC56-\uDC9C\uDC9E\uDC9F\uDCA2\uDCA5\uDCA6\uDCA9-\uDCAC\uDCAE-\uDCB9\uDCBB\uDCBD-\uDCC3\uDCC5-\uDD05\uDD07-\uDD0A\uDD0D-\uDD14\uDD16-\uDD1C\uDD1E-\uDD39\uDD3B-\uDD3E\uDD40-\uDD44\uDD46\uDD4A-\uDD50\uDD52-\uDEA5\uDEA8-\uDEC0\uDEC2-\uDEDA\uDEDC-\uDEFA\uDEFC-\uDF14\uDF16-\uDF34\uDF36-\uDF4E\uDF50-\uDF6E\uDF70-\uDF88\uDF8A-\uDFA8\uDFAA-\uDFC2\uDFC4-\uDFCB\uDFCE-\uDFFF]
|
||||
)
|
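These macros spell out supplementary code points as surrogate ranges because the generated scanner operates on 16-bit chars rather than code points. A small plain-Java snippet (no Lucene dependencies) showing the correspondence for the first entry of the [\uD87E] range above:

// Shows why e.g. U+2F800 appears above as the pair \uD87E \uDC00.
public class SurrogatePairDemo {
  public static void main(String[] args) {
    int codePoint = 0x2F800;
    char[] units = Character.toChars(codePoint);
    System.out.printf("U+%X -> \\u%04X \\u%04X%n",
        codePoint, (int) units[0], (int) units[1]); // U+2F800 -> \uD87E \uDC00
  }
}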
File diff suppressed because it is too large
|
@ -0,0 +1,874 @@
|
|||
package org.apache.lucene.analysis.charfilter;
|
||||
|
||||
/**
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
import java.io.IOException;
|
||||
import java.util.Arrays;
|
||||
import java.util.HashSet;
|
||||
import java.util.Set;
|
||||
|
||||
import org.apache.lucene.util.Version;
|
||||
import org.apache.lucene.analysis.CharStream;
|
||||
import org.apache.lucene.analysis.util.CharArrayMap;
|
||||
import org.apache.lucene.analysis.util.CharArraySet;
|
||||
import org.apache.lucene.analysis.util.OpenStringBuilder;
|
||||
|
||||
|
||||
/**
|
||||
* A CharFilter that wraps another Reader and attempts to strip out HTML constructs.
|
||||
*/
|
||||
@SuppressWarnings("fallthrough")
|
||||
%%
|
||||
|
||||
%unicode 6.0
|
||||
%apiprivate
|
||||
%type int
|
||||
%final
|
||||
%public
|
||||
%char
|
||||
%function nextChar
|
||||
%class HTMLStripCharFilter
|
||||
%extends BaseCharFilter
|
||||
%xstate AMPERSAND, NUMERIC_CHARACTER, CHARACTER_REFERENCE_TAIL
|
||||
%xstate LEFT_ANGLE_BRACKET, BANG, COMMENT, SCRIPT, SCRIPT_COMMENT
|
||||
%xstate LEFT_ANGLE_BRACKET_SLASH, LEFT_ANGLE_BRACKET_SPACE, CDATA
|
||||
%xstate SERVER_SIDE_INCLUDE, SINGLE_QUOTED_STRING, DOUBLE_QUOTED_STRING
|
||||
%xstate END_TAG_TAIL_INCLUDE, END_TAG_TAIL_EXCLUDE, END_TAG_TAIL_SUBSTITUTE
|
||||
%xstate START_TAG_TAIL_INCLUDE, START_TAG_TAIL_EXCLUDE, START_TAG_TAIL_SUBSTITUTE
|
||||
%xstate STYLE, STYLE_COMMENT
|
||||
|
||||
// From XML 1.0 <http://www.w3.org/TR/xml/>:
|
||||
//
|
||||
// [4] NameStartChar ::= ":" | [A-Z] | "_" | [a-z] | [...]
|
||||
// [4a] NameChar ::= NameStartChar | "-" | "." | [0-9] | [...]
|
||||
// [5] Name ::= NameStartChar (NameChar)*
|
||||
//
|
||||
// From UAX #31: Unicode Identifier and Pattern Syntax
|
||||
// <http://unicode.org/reports/tr31/>:
|
||||
//
|
||||
// D1. Default Identifier Syntax
|
||||
//
|
||||
// <identifier> := <ID_Start> <ID_Continue>*
|
||||
//
|
||||
Name = ( ( [:_\p{ID_Start}] | {ID_Start_Supp} ) ( [-.:_\p{ID_Continue}] | {ID_Continue_Supp} )* )
|
||||
|
||||
// From Apache httpd mod_include documentation
|
||||
// <http://httpd.apache.org/docs/current/mod/mod_include.html>:
|
||||
//
|
||||
// Basic Elements
|
||||
//
|
||||
// The document is parsed as an HTML document, with special commands
|
||||
// embedded as SGML comments. A command has the syntax:
|
||||
//
|
||||
// <!--#element attribute=value attribute=value ... -->
|
||||
//
|
||||
// The value will often be enclosed in double quotes, but single quotes (')
|
||||
// and backticks (`) are also possible. Many commands only allow a single
|
||||
// attribute-value pair. Note that the comment terminator (-->) should be
|
||||
// preceded by whitespace to ensure that it isn't considered part of an SSI
|
||||
// token. Note that the leading <!--# is one token and may not contain any
|
||||
// whitespaces.
|
||||
//
|
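// For example, a directive like
//
//     <!--#include virtual="/footer.html" -->
//
// is matched by the {ServerSideInclude} macro below, so a quoted value containing
// '<' or '>' inside an open tag or comment does not confuse the tag parser.
// (The directive shown here is only an illustration of the syntax described above.)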
||||
|
||||
EventAttributeSuffixes = ( [aA][bB][oO][rR][tT] |
|
||||
[bB][lL][uU][rR] |
|
||||
[cC][hH][aA][nN][gG][eE] |
|
||||
[cC][lL][iI][cC][kK] |
|
||||
[dD][bB][lL][cC][lL][iI][cC][kK] |
|
||||
[eE][rR][rR][oO][rR] |
|
||||
[fF][oO][cC][uU][sS] |
|
||||
[kK][eE][yY][dD][oO][wW][nN] |
|
||||
[kK][eE][yY][pP][rR][eE][sS][sS] |
|
||||
[kK][eE][yY][uU][pP] |
|
||||
[lL][oO][aA][dD] |
|
||||
[mM][oO][uU][sS][eE][dD][oO][wW][nN] |
|
||||
[mM][oO][uU][sS][eE][mM][oO][vV][eE] |
|
||||
[mM][oO][uU][sS][eE][oO][uU][tT] |
|
||||
[mM][oO][uU][sS][eE][oO][vV][eE][rR] |
|
||||
[mM][oO][uU][sS][eE][uU][pP] |
|
||||
[rR][eE][sS][eE][tT] |
|
||||
[sS][eE][lL][eE][cC][tT] |
|
||||
[sS][uU][bB][mM][iI][tT] |
|
||||
[uU][nN][lL][oO][aA][dD] )
|
||||
|
||||
SingleQuoted = ( "'" ( "\\'" | [^']* )* "'" )
|
||||
DoubleQuoted = ( "\"" ( "\\\"" | [^\"]* )* "\"" )
|
||||
ServerSideInclude = ( "<!--#" ( [^'\"] | {SingleQuoted} | {DoubleQuoted} )* "-->" )
|
||||
EventAttribute = [oO][nN] {EventAttributeSuffixes} \s* "=" \s* ( {SingleQuoted} | {DoubleQuoted} )
|
||||
OpenTagContent = ( {EventAttribute} | [^<>] | {ServerSideInclude} )*
|
||||
|
||||
InlineElment = ( [aAbBiIqQsSuU] |
|
||||
[aA][bB][bB][rR] |
|
||||
[aA][cC][rR][oO][nN][yY][mM] |
|
||||
[bB][aA][sS][eE][fF][oO][nN][tT] |
|
||||
[bB][dD][oO] |
|
||||
[bB][iI][gG] |
|
||||
[cC][iI][tT][eE] |
|
||||
[cC][oO][dD][eE] |
|
||||
[dD][fF][nN] |
|
||||
[eE][mM] |
|
||||
[fF][oO][nN][tT] |
|
||||
[iI][mM][gG] |
|
||||
[iI][nN][pP][uU][tT] |
|
||||
[kK][bB][dD] |
|
||||
[lL][aA][bB][eE][lL] |
|
||||
[sS][aA][mM][pP] |
|
||||
[sS][eE][lL][eE][cC][tT] |
|
||||
[sS][mM][aA][lL][lL] |
|
||||
[sS][pP][aA][nN] |
|
||||
[sS][tT][rR][iI][kK][eE] |
|
||||
[sS][tT][rR][oO][nN][gG] |
|
||||
[sS][uU][bB] |
|
||||
[sS][uU][pP] |
|
||||
[tT][eE][xX][tT][aA][rR][eE][aA] |
|
||||
[tT][tT] |
|
||||
[vV][aA][rR] )
|
||||
|
||||
|
||||
%include src/java/org/apache/lucene/analysis/charfilter/HTMLCharacterEntities.jflex
|
||||
|
||||
%include src/java/org/apache/lucene/analysis/charfilter/HTMLStripCharFilter.SUPPLEMENTARY.jflex-macro
|
||||
|
||||
%{
|
||||
private static final int INITIAL_INPUT_SEGMENT_SIZE = 1024;
|
||||
private static final char BLOCK_LEVEL_START_TAG_REPLACEMENT = '\n';
|
||||
private static final char BLOCK_LEVEL_END_TAG_REPLACEMENT = '\n';
|
||||
private static final char BR_START_TAG_REPLACEMENT = '\n';
|
||||
private static final char BR_END_TAG_REPLACEMENT = '\n';
|
||||
private static final char SCRIPT_REPLACEMENT = '\n';
|
||||
private static final char STYLE_REPLACEMENT = '\n';
|
||||
private static final char REPLACEMENT_CHARACTER = '\uFFFD';
|
||||
|
||||
private CharArraySet escapedTags = null;
|
||||
private int inputStart;
|
||||
private int cumulativeDiff;
|
||||
private boolean escapeBR = false;
|
||||
private boolean escapeSCRIPT = false;
|
||||
private boolean escapeSTYLE = false;
|
||||
private int restoreState;
|
||||
private int previousRestoreState;
|
||||
private int outputCharCount;
|
||||
private int eofReturnValue;
|
||||
private TextSegment inputSegment
|
||||
= new TextSegment(INITIAL_INPUT_SEGMENT_SIZE);
|
||||
private TextSegment outputSegment = inputSegment;
|
||||
private TextSegment entitySegment = new TextSegment(2);
|
||||
|
||||
/**
|
||||
* @param source
|
||||
*/
|
||||
public HTMLStripCharFilter(CharStream source) {
|
||||
super(source);
|
||||
this.zzReader = source;
|
||||
}
|
||||
|
||||
/**
|
||||
* @param source
|
||||
* @param escapedTags Tags in this set (both start and end tags)
|
||||
* will not be filtered out.
|
||||
*/
|
||||
public HTMLStripCharFilter(CharStream source, Set<String> escapedTags) {
|
||||
super(source);
|
||||
this.zzReader = source;
|
||||
if (null != escapedTags) {
|
||||
for (String tag : escapedTags) {
|
||||
if (tag.equalsIgnoreCase("BR")) {
|
||||
escapeBR = true;
|
||||
} else if (tag.equalsIgnoreCase("SCRIPT")) {
|
||||
escapeSCRIPT = true;
|
||||
} else if (tag.equalsIgnoreCase("STYLE")) {
|
||||
escapeSTYLE = true;
|
||||
} else {
|
||||
if (null == this.escapedTags) {
|
||||
this.escapedTags = new CharArraySet(Version.LUCENE_40, 16, true);
|
||||
}
|
||||
this.escapedTags.add(tag);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
public int read() throws IOException {
|
||||
if (outputSegment.isRead()) {
|
||||
if (zzAtEOF) {
|
||||
return -1;
|
||||
}
|
||||
int ch = nextChar();
|
||||
++outputCharCount;
|
||||
return ch;
|
||||
}
|
||||
int ch = outputSegment.nextChar();
|
||||
++outputCharCount;
|
||||
return ch;
|
||||
}
|
||||
|
||||
@Override
|
||||
public int read(char cbuf[], int off, int len) throws IOException {
|
||||
int i = 0;
|
||||
for ( ; i < len ; ++i) {
|
||||
int ch = read();
|
||||
if (ch == -1) break;
|
||||
cbuf[off++] = (char)ch;
|
||||
}
|
||||
return i > 0 ? i : (len == 0 ? 0 : -1);
|
||||
}
|
||||
|
||||
@Override
|
||||
public void close() throws IOException {
|
||||
yyclose();
|
||||
}
|
||||
|
||||
static int getInitialBufferSize() { // Package private, for testing purposes
|
||||
return ZZ_BUFFERSIZE;
|
||||
}
|
||||
|
||||
private class TextSegment extends OpenStringBuilder {
|
||||
/** The position from which the next char will be read. */
|
||||
int pos = 0;
|
||||
|
||||
/** Wraps the given buffer and sets this.len to the given length. */
|
||||
TextSegment(char[] buffer, int length) {
|
||||
super(buffer, length);
|
||||
}
|
||||
|
||||
/** Allocates an internal buffer of the given size. */
|
||||
TextSegment(int size) {
|
||||
super(size);
|
||||
}
|
||||
|
||||
/** Sets len = 0 and pos = 0. */
|
||||
void clear() {
|
||||
reset();
|
||||
restart();
|
||||
}
|
||||
|
||||
/** Sets pos = 0 */
|
||||
void restart() {
|
||||
pos = 0;
|
||||
}
|
||||
|
||||
/** Returns the next char in the segment. */
|
||||
int nextChar() {
|
||||
assert (! isRead()): "Attempting to read past the end of a segment.";
|
||||
return buf[pos++];
|
||||
}
|
||||
|
||||
/** Returns true when all characters in the text segment have been read */
|
||||
boolean isRead() {
|
||||
return pos >= len;
|
||||
}
|
||||
}
|
||||
%}
|
||||
|
||||
%eofval{
|
||||
return eofReturnValue;
|
||||
%eofval}
|
||||
%eof{
|
||||
switch (zzLexicalState) {
|
||||
case SCRIPT:
|
||||
case COMMENT:
|
||||
case SCRIPT_COMMENT:
|
||||
case STYLE:
|
||||
case STYLE_COMMENT:
|
||||
case SINGLE_QUOTED_STRING:
|
||||
case DOUBLE_QUOTED_STRING:
|
||||
case END_TAG_TAIL_EXCLUDE:
|
||||
case END_TAG_TAIL_SUBSTITUTE:
|
||||
case START_TAG_TAIL_EXCLUDE:
|
||||
case SERVER_SIDE_INCLUDE:
|
||||
case START_TAG_TAIL_SUBSTITUTE: { // Exclude
|
||||
cumulativeDiff += yychar - inputStart;
|
||||
addOffCorrectMap(outputCharCount, cumulativeDiff);
|
||||
outputSegment.clear();
|
||||
eofReturnValue = -1;
|
||||
break;
|
||||
}
|
||||
case CHARACTER_REFERENCE_TAIL: { // Substitute
|
||||
// At end of file, allow char refs without semicolons
|
||||
cumulativeDiff += inputSegment.length() - outputSegment.length();
|
||||
addOffCorrectMap(outputCharCount + outputSegment.length(), cumulativeDiff);
|
||||
eofReturnValue = outputSegment.nextChar();
|
||||
break;
|
||||
}
|
||||
case BANG:
|
||||
case CDATA:
|
||||
case AMPERSAND:
|
||||
case NUMERIC_CHARACTER:
|
||||
case END_TAG_TAIL_INCLUDE:
|
||||
case START_TAG_TAIL_INCLUDE:
|
||||
case LEFT_ANGLE_BRACKET:
|
||||
case LEFT_ANGLE_BRACKET_SLASH:
|
||||
case LEFT_ANGLE_BRACKET_SPACE: { // Include
|
||||
outputSegment = inputSegment;
|
||||
eofReturnValue = outputSegment.nextChar();
|
||||
break;
|
||||
}
|
||||
default: {
|
||||
eofReturnValue = -1;
|
||||
}
|
||||
}
|
||||
%eof}
|
||||
|
||||
%%
|
||||
|
||||
"&" {
|
||||
inputStart = yychar;
|
||||
inputSegment.clear();
|
||||
inputSegment.append('&');
|
||||
yybegin(AMPERSAND);
|
||||
}
|
||||
|
||||
"<" {
|
||||
inputStart = yychar;
|
||||
inputSegment.clear();
|
||||
inputSegment.append('<');
|
||||
yybegin(LEFT_ANGLE_BRACKET);
|
||||
}
|
||||
|
||||
<AMPERSAND> {
|
||||
{CharacterEntities} {
|
||||
int length = yylength();
|
||||
inputSegment.write(zzBuffer, zzStartRead, length);
|
||||
entitySegment.clear();
|
||||
char ch = entityValues.get(zzBuffer, zzStartRead, length).charValue();
|
||||
entitySegment.append(ch);
|
||||
outputSegment = entitySegment;
|
||||
yybegin(CHARACTER_REFERENCE_TAIL);
|
||||
}
|
||||
"#" { inputSegment.append('#'); yybegin(NUMERIC_CHARACTER); }
|
||||
|
||||
// 1 1 11 11
|
||||
// 0 1 2 3 45 678 9 0 1 23 45
|
||||
"#" [xX][dD][89aAbB][0-9a-fA-F]{2} ";&#" [xX][dD][c-fC-F][0-9a-fA-F]{2} ";" {
|
||||
// Handle paired UTF-16 surrogates.
|
||||
outputSegment = entitySegment;
|
||||
outputSegment.clear();
|
||||
String surrogatePair = yytext();
|
||||
char highSurrogate = '\u0000';
|
||||
try {
|
||||
highSurrogate = (char)Integer.parseInt(surrogatePair.substring(2, 6), 16);
|
||||
} catch(Exception e) { // should never happen
|
||||
assert false: "Exception parsing high surrogate '"
|
||||
+ surrogatePair.substring(2, 6) + "'";
|
||||
}
|
||||
try {
|
||||
outputSegment.unsafeWrite
|
||||
((char)Integer.parseInt(surrogatePair.substring(10, 14), 16));
|
||||
} catch(Exception e) { // should never happen
|
||||
assert false: "Exception parsing low surrogate '"
|
||||
+ surrogatePair.substring(10, 14) + "'";
|
||||
}
|
||||
cumulativeDiff += inputSegment.length() + yylength() - 2;
|
||||
addOffCorrectMap(outputCharCount + 2, cumulativeDiff);
|
||||
inputSegment.clear();
|
||||
yybegin(YYINITIAL);
|
||||
return highSurrogate;
|
||||
}
|
||||
|
||||
// 1 1 11 11
|
||||
// 01 2 345 678 9 0 1 23 45
|
||||
"#5" [56] \d{3} ";&#" [xX][dD][c-fC-F][0-9a-fA-F]{2} ";" {
|
||||
// Handle paired UTF-16 surrogates.
|
||||
String surrogatePair = yytext();
|
||||
char highSurrogate = '\u0000';
|
||||
try { // High surrogates are in decimal range [55296, 56319]
|
||||
highSurrogate = (char)Integer.parseInt(surrogatePair.substring(1, 6));
|
||||
} catch(Exception e) { // should never happen
|
||||
assert false: "Exception parsing high surrogate '"
|
||||
+ surrogatePair.substring(1, 6) + "'";
|
||||
}
|
||||
if (Character.isHighSurrogate(highSurrogate)) {
|
||||
outputSegment = entitySegment;
|
||||
outputSegment.clear();
|
||||
try {
|
||||
outputSegment.unsafeWrite
|
||||
((char)Integer.parseInt(surrogatePair.substring(10, 14), 16));
|
||||
} catch(Exception e) { // should never happen
|
||||
assert false: "Exception parsing low surrogate '"
|
||||
+ surrogatePair.substring(10, 14) + "'";
|
||||
}
|
||||
cumulativeDiff += inputSegment.length() + yylength() - 2;
|
||||
addOffCorrectMap(outputCharCount + 2, cumulativeDiff);
|
||||
inputSegment.clear();
|
||||
yybegin(YYINITIAL);
|
||||
return highSurrogate;
|
||||
}
|
||||
yypushback(surrogatePair.length() - 1); // Consume only '#'
|
||||
inputSegment.append('#');
|
||||
yybegin(NUMERIC_CHARACTER);
|
||||
}
|
||||
|
||||
// 1 111 11
|
||||
// 0 1 2 3 45 6789 0 123 45
|
||||
"#" [xX][dD][89aAbB][0-9a-fA-F]{2} ";" [67] \d{3} ";" {
|
||||
// Handle paired UTF-16 surrogates.
|
||||
String surrogatePair = yytext();
|
||||
char highSurrogate = '\u0000';
|
||||
char lowSurrogate = '\u0000';
|
||||
try {
|
||||
highSurrogate = (char)Integer.parseInt(surrogatePair.substring(2, 6), 16);
|
||||
} catch(Exception e) { // should never happen
|
||||
assert false: "Exception parsing high surrogate '"
|
||||
+ surrogatePair.substring(2, 6) + "'";
|
||||
}
|
||||
try { // Low surrogates are in decimal range [56320, 57343]
|
||||
lowSurrogate = (char)Integer.parseInt(surrogatePair.substring(9, 14));
|
||||
} catch(Exception e) { // should never happen
|
||||
assert false: "Exception parsing low surrogate '"
|
||||
+ surrogatePair.substring(9, 14) + "'";
|
||||
}
|
||||
if (Character.isLowSurrogate(lowSurrogate)) {
|
||||
outputSegment = entitySegment;
|
||||
outputSegment.clear();
|
||||
outputSegment.unsafeWrite(lowSurrogate);
|
||||
cumulativeDiff += inputSegment.length() + yylength() - 2;
|
||||
addOffCorrectMap(outputCharCount + 2, cumulativeDiff);
|
||||
inputSegment.clear();
|
||||
yybegin(YYINITIAL);
|
||||
return highSurrogate;
|
||||
}
|
||||
yypushback(surrogatePair.length() - 1); // Consume only '#'
|
||||
inputSegment.append('#');
|
||||
yybegin(NUMERIC_CHARACTER);
|
||||
}
|
||||
|
||||
// 1 111 11
|
||||
// 01 2 345 6789 0 123 45
|
||||
"#5" [56] \d{3} ";" [67] \d{3} ";" {
|
||||
// Handle paired UTF-16 surrogates.
|
||||
String surrogatePair = yytext();
|
||||
char highSurrogate = '\u0000';
|
||||
try { // High surrogates are in decimal range [55296, 56319]
|
||||
highSurrogate = (char)Integer.parseInt(surrogatePair.substring(1, 6));
|
||||
} catch(Exception e) { // should never happen
|
||||
assert false: "Exception parsing high surrogate '"
|
||||
+ surrogatePair.substring(1, 6) + "'";
|
||||
}
|
||||
if (Character.isHighSurrogate(highSurrogate)) {
|
||||
char lowSurrogate = '\u0000';
|
||||
try { // Low surrogates are in decimal range [56320, 57343]
|
||||
lowSurrogate = (char)Integer.parseInt(surrogatePair.substring(9, 14));
|
||||
} catch(Exception e) { // should never happen
|
||||
assert false: "Exception parsing low surrogate '"
|
||||
+ surrogatePair.substring(9, 14) + "'";
|
||||
}
|
||||
if (Character.isLowSurrogate(lowSurrogate)) {
|
||||
outputSegment = entitySegment;
|
||||
outputSegment.clear();
|
||||
outputSegment.unsafeWrite(lowSurrogate);
|
||||
cumulativeDiff += inputSegment.length() + yylength() - 2;
|
||||
addOffCorrectMap(outputCharCount + 2, cumulativeDiff);
|
||||
inputSegment.clear();
|
||||
yybegin(YYINITIAL);
|
||||
return highSurrogate;
|
||||
}
|
||||
}
|
||||
yypushback(surrogatePair.length() - 1); // Consume only '#'
|
||||
inputSegment.append('#');
|
||||
yybegin(NUMERIC_CHARACTER);
|
||||
}
|
||||
}
|
||||
|
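// Worked example for the paired-surrogate rules above (the values are illustrative):
// for the input "&#xD835;&#xDC9C;" the first hex/hex rule fires, the low surrogate
// \uDC9C is queued in entitySegment and the high surrogate \uD835 is returned
// immediately, so the output carries the single code point U+1D49C. The decimal
// forms, e.g. "&#55349;&#56476;", are handled the same way by the "#5..." rules.
// An unpaired surrogate reference instead falls through to NUMERIC_CHARACTER below,
// where it is replaced with U+FFFD.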
||||
<NUMERIC_CHARACTER> {
|
||||
[xX] [0-9A-Fa-f]+ {
|
||||
int matchLength = yylength();
|
||||
inputSegment.write(zzBuffer, zzStartRead, matchLength);
|
||||
if (matchLength <= 6) { // 10FFFF: max 6 hex chars
|
||||
String hexCharRef
|
||||
= new String(zzBuffer, zzStartRead + 1, matchLength - 1);
|
||||
int codePoint = 0;
|
||||
try {
|
||||
codePoint = Integer.parseInt(hexCharRef, 16);
|
||||
} catch(Exception e) {
|
||||
assert false: "Exception parsing hex code point '" + hexCharRef + "'";
|
||||
}
|
||||
if (codePoint <= 0x10FFFF) {
|
||||
outputSegment = entitySegment;
|
||||
outputSegment.clear();
|
||||
if (codePoint >= Character.MIN_SURROGATE
|
||||
&& codePoint <= Character.MAX_SURROGATE) {
|
||||
outputSegment.unsafeWrite(REPLACEMENT_CHARACTER);
|
||||
} else {
|
||||
outputSegment.setLength
|
||||
(Character.toChars(codePoint, outputSegment.getArray(), 0));
|
||||
}
|
||||
yybegin(CHARACTER_REFERENCE_TAIL);
|
||||
} else {
|
||||
outputSegment = inputSegment;
|
||||
yybegin(YYINITIAL);
|
||||
return outputSegment.nextChar();
|
||||
}
|
||||
} else {
|
||||
outputSegment = inputSegment;
|
||||
yybegin(YYINITIAL);
|
||||
return outputSegment.nextChar();
|
||||
}
|
||||
}
|
||||
[0-9]+ {
|
||||
int matchLength = yylength();
|
||||
inputSegment.write(zzBuffer, zzStartRead, matchLength);
|
||||
if (matchLength <= 7) { // 0x10FFFF = 1114111: max 7 decimal chars
|
||||
String decimalCharRef = yytext();
|
||||
int codePoint = 0;
|
||||
try {
|
||||
codePoint = Integer.parseInt(decimalCharRef);
|
||||
} catch(Exception e) {
|
||||
assert false: "Exception parsing code point '" + decimalCharRef + "'";
|
||||
}
|
||||
if (codePoint <= 0x10FFFF) {
|
||||
outputSegment = entitySegment;
|
||||
outputSegment.clear();
|
||||
if (codePoint >= Character.MIN_SURROGATE
|
||||
&& codePoint <= Character.MAX_SURROGATE) {
|
||||
outputSegment.unsafeWrite(REPLACEMENT_CHARACTER);
|
||||
} else {
|
||||
outputSegment.setLength
|
||||
(Character.toChars(codePoint, outputSegment.getArray(), 0));
|
||||
}
|
||||
yybegin(CHARACTER_REFERENCE_TAIL);
|
||||
} else {
|
||||
outputSegment = inputSegment;
|
||||
yybegin(YYINITIAL);
|
||||
return outputSegment.nextChar();
|
||||
}
|
||||
} else {
|
||||
outputSegment = inputSegment;
|
||||
yybegin(YYINITIAL);
|
||||
return outputSegment.nextChar();
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
<CHARACTER_REFERENCE_TAIL> {
|
||||
";" {
|
||||
cumulativeDiff
|
||||
+= inputSegment.length() + yylength() - outputSegment.length();
|
||||
addOffCorrectMap(outputCharCount + outputSegment.length(), cumulativeDiff);
|
||||
yybegin(YYINITIAL);
|
||||
return outputSegment.nextChar();
|
||||
}
|
||||
}
|
||||
|
||||
<LEFT_ANGLE_BRACKET_SLASH> {
|
||||
\s+ { inputSegment.write(zzBuffer, zzStartRead, yylength()); }
|
||||
[bB][rR] \s* ">" {
|
||||
yybegin(YYINITIAL);
|
||||
if (escapeBR) {
|
||||
inputSegment.write(zzBuffer, zzStartRead, yylength());
|
||||
outputSegment = inputSegment;
|
||||
return outputSegment.nextChar();
|
||||
} else {
|
||||
cumulativeDiff
|
||||
+= inputSegment.length() + yylength() - outputSegment.length();
|
||||
addOffCorrectMap(outputCharCount + outputSegment.length(), cumulativeDiff);
|
||||
inputSegment.reset();
|
||||
return BR_END_TAG_REPLACEMENT;
|
||||
}
|
||||
}
|
||||
{InlineElment} {
|
||||
inputSegment.write(zzBuffer, zzStartRead, yylength());
|
||||
if (null != escapedTags
|
||||
&& escapedTags.contains(zzBuffer, zzStartRead, yylength())) {
|
||||
yybegin(END_TAG_TAIL_INCLUDE);
|
||||
} else {
|
||||
yybegin(END_TAG_TAIL_EXCLUDE);
|
||||
}
|
||||
}
|
||||
{Name} {
|
||||
inputSegment.write(zzBuffer, zzStartRead, yylength());
|
||||
if (null != escapedTags
|
||||
&& escapedTags.contains(zzBuffer, zzStartRead, yylength())) {
|
||||
yybegin(END_TAG_TAIL_INCLUDE);
|
||||
} else {
|
||||
yybegin(END_TAG_TAIL_SUBSTITUTE);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
<END_TAG_TAIL_INCLUDE> {
|
||||
\s* ">" {
|
||||
inputSegment.write(zzBuffer, zzStartRead, yylength());
|
||||
outputSegment = inputSegment;
|
||||
yybegin(YYINITIAL);
|
||||
return outputSegment.nextChar();
|
||||
}
|
||||
}
|
||||
|
||||
<END_TAG_TAIL_EXCLUDE> {
|
||||
\s* ">" {
|
||||
cumulativeDiff += inputSegment.length() + yylength();
|
||||
addOffCorrectMap(outputCharCount, cumulativeDiff);
|
||||
inputSegment.clear();
|
||||
yybegin(YYINITIAL);
|
||||
}
|
||||
}
|
||||
|
||||
<END_TAG_TAIL_SUBSTITUTE> {
|
||||
\s* ">" {
|
||||
cumulativeDiff += inputSegment.length() + yylength() - 1;
|
||||
addOffCorrectMap(outputCharCount + 1, cumulativeDiff);
|
||||
inputSegment.clear();
|
||||
yybegin(YYINITIAL);
|
||||
return BLOCK_LEVEL_END_TAG_REPLACEMENT;
|
||||
}
|
||||
}
|
||||
|
||||
<LEFT_ANGLE_BRACKET> {
|
||||
"!" { inputSegment.append('!'); yybegin(BANG); }
|
||||
"/" { inputSegment.append('/'); yybegin(LEFT_ANGLE_BRACKET_SLASH); }
|
||||
\s+ {
|
||||
inputSegment.write(zzBuffer, zzStartRead, yylength());
|
||||
yybegin(LEFT_ANGLE_BRACKET_SPACE);
|
||||
}
|
||||
"?" [^>]* [/?] ">" {
|
||||
cumulativeDiff += inputSegment.length() + yylength();
|
||||
addOffCorrectMap(outputCharCount, cumulativeDiff);
|
||||
inputSegment.clear();
|
||||
yybegin(YYINITIAL);
|
||||
}
|
||||
\s* [bB][rR] ( ( "="\s* | \s+ ) {OpenTagContent} )? \s* "/"? ">" {
|
||||
yybegin(YYINITIAL);
|
||||
if (escapeBR) {
|
||||
inputSegment.write(zzBuffer, zzStartRead, yylength());
|
||||
outputSegment = inputSegment;
|
||||
return outputSegment.nextChar();
|
||||
} else {
|
||||
cumulativeDiff
|
||||
+= inputSegment.length() + yylength() - outputSegment.length();
|
||||
addOffCorrectMap(outputCharCount + 1, cumulativeDiff);
|
||||
inputSegment.reset();
|
||||
return BR_START_TAG_REPLACEMENT;
|
||||
}
|
||||
}
|
||||
\s* [sS][cC][rR][iI][pP][tT] ( \s+ {OpenTagContent} )? \s* ">" {
|
||||
yybegin(SCRIPT);
|
||||
if (escapeSCRIPT) {
|
||||
inputSegment.write(zzBuffer, zzStartRead, yylength());
|
||||
outputSegment = inputSegment;
|
||||
inputStart += 1 + yylength();
|
||||
return outputSegment.nextChar();
|
||||
}
|
||||
}
|
||||
\s* [sS][tT][yY][lL][eE] ( \s+ {OpenTagContent} )? \s* ">" {
|
||||
yybegin(STYLE);
|
||||
if (escapeSTYLE) {
|
||||
inputSegment.write(zzBuffer, zzStartRead, yylength());
|
||||
outputSegment = inputSegment;
|
||||
inputStart += 1 + yylength();
|
||||
return outputSegment.nextChar();
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
<LEFT_ANGLE_BRACKET, LEFT_ANGLE_BRACKET_SPACE> {
|
||||
{InlineElment} {
|
||||
inputSegment.write(zzBuffer, zzStartRead, yylength());
|
||||
if (null != escapedTags
|
||||
&& escapedTags.contains(zzBuffer, zzStartRead, yylength())) {
|
||||
yybegin(START_TAG_TAIL_INCLUDE);
|
||||
} else {
|
||||
yybegin(START_TAG_TAIL_EXCLUDE);
|
||||
}
|
||||
}
|
||||
{Name} {
|
||||
inputSegment.write(zzBuffer, zzStartRead, yylength());
|
||||
if (null != escapedTags
|
||||
&& escapedTags.contains(zzBuffer, zzStartRead, yylength())) {
|
||||
yybegin(START_TAG_TAIL_INCLUDE);
|
||||
} else {
|
||||
yybegin(START_TAG_TAIL_SUBSTITUTE);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
<START_TAG_TAIL_INCLUDE> {
|
||||
( ( "="\s* | \s+ ) {OpenTagContent} )? \s* "/"? ">" {
|
||||
inputSegment.write(zzBuffer, zzStartRead, yylength());
|
||||
outputSegment = inputSegment;
|
||||
yybegin(YYINITIAL);
|
||||
return outputSegment.nextChar();
|
||||
}
|
||||
}
|
||||
|
||||
<START_TAG_TAIL_EXCLUDE> {
|
||||
( ( "="\s* | \s+ ) {OpenTagContent} )? \s* "/"? ">" {
|
||||
cumulativeDiff += inputSegment.length() + yylength();
|
||||
addOffCorrectMap(outputCharCount, cumulativeDiff);
|
||||
inputSegment.clear();
|
||||
outputSegment = inputSegment;
|
||||
yybegin(YYINITIAL);
|
||||
}
|
||||
}
|
||||
|
||||
<START_TAG_TAIL_SUBSTITUTE> {
|
||||
( ( "="\s* | \s+ ) {OpenTagContent} )? \s* "/"? ">" {
|
||||
cumulativeDiff += inputSegment.length() + yylength() - 1;
|
||||
addOffCorrectMap(outputCharCount + 1, cumulativeDiff);
|
||||
inputSegment.clear();
|
||||
yybegin(YYINITIAL);
|
||||
return BLOCK_LEVEL_START_TAG_REPLACEMENT;
|
||||
}
|
||||
}
|
||||
|
||||
<BANG> {
|
||||
"--" { yybegin(COMMENT); }
|
||||
">" {
|
||||
cumulativeDiff += inputSegment.length() + yylength();
|
||||
addOffCorrectMap(outputCharCount, cumulativeDiff);
|
||||
inputSegment.clear();
|
||||
yybegin(YYINITIAL);
|
||||
}
|
||||
// From XML 1.0 <http://www.w3.org/TR/xml/>:
|
||||
//
|
||||
// [18] CDSect ::= CDStart CData CDEnd
|
||||
// [19] CDStart ::= '<![CDATA['
|
||||
// [20] CData ::= (Char* - (Char* ']]>' Char*))
|
||||
// [21] CDEnd ::= ']]>'
|
||||
//
|
||||
"[CDATA[" {
|
||||
cumulativeDiff += inputSegment.length() + yylength();
|
||||
addOffCorrectMap(outputCharCount, cumulativeDiff);
|
||||
inputSegment.clear();
|
||||
yybegin(CDATA);
|
||||
}
|
||||
[^] {
|
||||
inputSegment.append(zzBuffer[zzStartRead]);
|
||||
}
|
||||
}
|
||||
|
||||
<CDATA> {
|
||||
"]]>" {
|
||||
cumulativeDiff += yylength();
|
||||
addOffCorrectMap(outputCharCount, cumulativeDiff);
|
||||
yybegin(YYINITIAL);
|
||||
}
|
||||
[^] { return zzBuffer[zzStartRead]; }
|
||||
}
|
||||
|
||||
<COMMENT> {
|
||||
"<!--#" { restoreState = COMMENT; yybegin(SERVER_SIDE_INCLUDE); }
|
||||
"-->" {
|
||||
cumulativeDiff += yychar - inputStart + yylength();
|
||||
addOffCorrectMap(outputCharCount, cumulativeDiff);
|
||||
inputSegment.clear();
|
||||
yybegin(YYINITIAL);
|
||||
}
|
||||
[^] { }
|
||||
}
|
||||
|
||||
<SERVER_SIDE_INCLUDE> {
|
||||
"-->" { yybegin(restoreState); }
|
||||
"'" {
|
||||
previousRestoreState = restoreState;
|
||||
restoreState = SERVER_SIDE_INCLUDE;
|
||||
yybegin(SINGLE_QUOTED_STRING);
|
||||
}
|
||||
"\"" {
|
||||
previousRestoreState = restoreState;
|
||||
restoreState = SERVER_SIDE_INCLUDE;
|
||||
yybegin(DOUBLE_QUOTED_STRING);
|
||||
}
|
||||
[^] { }
|
||||
}
|
||||
|
||||
<SCRIPT_COMMENT> {
|
||||
"<!--#" { restoreState = SCRIPT_COMMENT; yybegin(SERVER_SIDE_INCLUDE); }
|
||||
"'" { restoreState = SCRIPT_COMMENT; yybegin(SINGLE_QUOTED_STRING); }
|
||||
"\"" { restoreState = SCRIPT_COMMENT; yybegin(DOUBLE_QUOTED_STRING); }
|
||||
"-->" { yybegin(SCRIPT); }
|
||||
[^] { }
|
||||
}
|
||||
|
||||
<STYLE_COMMENT> {
|
||||
"<!--#" { restoreState = STYLE_COMMENT; yybegin(SERVER_SIDE_INCLUDE); }
|
||||
"'" { restoreState = STYLE_COMMENT; yybegin(SINGLE_QUOTED_STRING); }
|
||||
"\"" { restoreState = STYLE_COMMENT; yybegin(DOUBLE_QUOTED_STRING); }
|
||||
"-->" { yybegin(STYLE); }
|
||||
[^] { }
|
||||
}
|
||||
|
||||
<SINGLE_QUOTED_STRING> {
|
||||
"\\" [^] { }
|
||||
"'" { yybegin(restoreState); restoreState = previousRestoreState; }
|
||||
[^] { }
|
||||
}
|
||||
|
||||
<DOUBLE_QUOTED_STRING> {
|
||||
"\\" [^] { }
|
||||
"\"" { yybegin(restoreState); restoreState = previousRestoreState; }
|
||||
[^] { }
|
||||
}
|
||||
|
||||
<SCRIPT> {
|
||||
"<!--" { yybegin(SCRIPT_COMMENT); }
|
||||
"</" \s* [sS][cC][rR][iI][pP][tT] \s* ">" {
|
||||
inputSegment.clear();
|
||||
yybegin(YYINITIAL);
|
||||
cumulativeDiff += yychar - inputStart;
|
||||
int outputEnd = outputCharCount;
|
||||
int returnValue;
|
||||
if (escapeSCRIPT) {
|
||||
inputSegment.write(zzBuffer, zzStartRead, yylength());
|
||||
outputSegment = inputSegment;
|
||||
returnValue = outputSegment.nextChar();
|
||||
} else {
|
||||
cumulativeDiff += yylength() - 1;
|
||||
++outputEnd;
|
||||
returnValue = SCRIPT_REPLACEMENT;
|
||||
}
|
||||
addOffCorrectMap(outputEnd, cumulativeDiff);
|
||||
return returnValue;
|
||||
}
|
||||
[^] { }
|
||||
}
|
||||
|
||||
<STYLE> {
|
||||
"<!--" { yybegin(STYLE_COMMENT); }
|
||||
"</" \s* [sS][tT][yY][lL][eE] \s* ">" {
|
||||
inputSegment.clear();
|
||||
yybegin(YYINITIAL);
|
||||
cumulativeDiff += yychar - inputStart;
|
||||
int outputEnd = outputCharCount;
|
||||
int returnValue;
|
||||
if (escapeSTYLE) {
|
||||
inputSegment.write(zzBuffer, zzStartRead, yylength());
|
||||
outputSegment = inputSegment;
|
||||
returnValue = outputSegment.nextChar();
|
||||
} else {
|
||||
cumulativeDiff += yylength() - 1;
|
||||
++outputEnd;
|
||||
returnValue = STYLE_REPLACEMENT;
|
||||
}
|
||||
addOffCorrectMap(outputEnd, cumulativeDiff);
|
||||
return returnValue;
|
||||
}
|
||||
[^] { }
|
||||
}
|
||||
|
||||
<AMPERSAND,NUMERIC_CHARACTER,CHARACTER_REFERENCE_TAIL,LEFT_ANGLE_BRACKET_SLASH,END_TAG_TAIL_INCLUDE,END_TAG_TAIL_EXCLUDE,END_TAG_TAIL_SUBSTITUTE,LEFT_ANGLE_BRACKET,LEFT_ANGLE_BRACKET_SPACE,START_TAG_TAIL_INCLUDE,START_TAG_TAIL_EXCLUDE,START_TAG_TAIL_SUBSTITUTE,BANG> {
|
||||
[^] {
|
||||
yypushback(1);
|
||||
outputSegment = inputSegment;
|
||||
outputSegment.restart();
|
||||
yybegin(YYINITIAL);
|
||||
return outputSegment.nextChar();
|
||||
}
|
||||
}
|
||||
|
||||
[^] { return zzBuffer[zzStartRead]; }
|
|
@ -0,0 +1,530 @@
|
|||
# Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
# contributor license agreements. See the NOTICE file distributed with
|
||||
# this work for additional information regarding copyright ownership.
|
||||
# The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
# (the "License"); you may not use this file except in compliance with
|
||||
# the License. You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
import re
|
||||
|
||||
# A simple python script to generate an HTML entity map and a regex alternation
|
||||
# for inclusion in HTMLStripCharFilter.jflex.
|
||||
|
||||
def main():
|
||||
print get_apache_license()
|
||||
codes = {}
|
||||
regex = re.compile(r'\s*<!ENTITY\s+(\S+)\s+"&(?:#38;)?#(\d+);"')
|
||||
for line in get_entity_text().split('\n'):
|
||||
match = regex.match(line)
|
||||
if match:
|
||||
key = match.group(1)
|
||||
if key == 'quot': codes[key] = r'\"'
|
||||
elif key == 'nbsp': codes[key] = ' ';
|
||||
else : codes[key] = r'\u%04X' % int(match.group(2))
|
||||
|
||||
keys = sorted(codes)
|
||||
|
||||
first_entry = True
|
||||
output_line = 'CharacterEntities = ( '
|
||||
for key in keys:
|
||||
new_entry = ('"%s"' if first_entry else ' | "%s"') % key
|
||||
first_entry = False
|
||||
if len(output_line) + len(new_entry) >= 80:
|
||||
print output_line
|
||||
output_line = ' '
|
||||
output_line += new_entry
|
||||
if key in ('quot','copy','gt','lt','reg','amp'):
|
||||
new_entry = ' | "%s"' % key.upper()
|
||||
if len(output_line) + len(new_entry) >= 80:
|
||||
print output_line
|
||||
output_line = ' '
|
||||
output_line += new_entry
|
||||
print output_line, ')'
|
||||
|
||||
print '%{'
|
||||
print ' private static final Set<String> upperCaseVariantsAccepted'
|
||||
print ' = new HashSet<String>(Arrays.asList("quot","copy","gt","lt","reg","amp"));'
|
||||
print ' private static final CharArrayMap<Character> entityValues'
|
||||
print ' = new CharArrayMap<Character>(Version.LUCENE_40, %i, false);' % len(keys)
|
||||
print ' static {'
|
||||
print ' String[] entities = {'
|
||||
output_line = ' '
|
||||
for key in keys:
|
||||
new_entry = ' "%s", "%s",' % (key, codes[key])
|
||||
if len(output_line) + len(new_entry) >= 80:
|
||||
print output_line
|
||||
output_line = ' '
|
||||
output_line += new_entry
|
||||
print output_line[:-1]
|
||||
print ' };'
|
||||
print ' for (int i = 0 ; i < entities.length ; i += 2) {'
|
||||
print ' Character value = entities[i + 1].charAt(0);'
|
||||
print ' entityValues.put(entities[i], value);'
|
||||
print ' if (upperCaseVariantsAccepted.contains(entities[i])) {'
|
||||
print ' entityValues.put(entities[i].toUpperCase(), value);'
|
||||
print ' }'
|
||||
print ' }'
|
||||
print " }"
|
||||
print "%}"
|
||||
|
||||
def get_entity_text():
|
||||
# The text below is taken verbatim from
|
||||
# <http://www.w3.org/TR/REC-html40/sgml/entities.html>:
|
||||
text = r"""
|
||||
F.1. XHTML Character Entities
|
||||
|
||||
XHTML DTDs make available a standard collection of named character entities. Those entities are defined in this section.
|
||||
F.1.1. XHTML Latin 1 Character Entities
|
||||
|
||||
You can download this version of this file from http://www.w3.org/TR/2010/REC-xhtml-modularization/DTD/xhtml-lat1.ent. The latest version is available at http://www.w3.org/MarkUp/DTD/xhtml-lat1.ent.
|
||||
|
||||
<!-- ...................................................................... -->
|
||||
<!-- XML-compatible ISO Latin 1 Character Entity Set for XHTML ............ -->
|
||||
<!-- file: xhtml-lat1.ent
|
||||
|
||||
Typical invocation:
|
||||
|
||||
<!ENTITY % xhtml-lat1
|
||||
PUBLIC "-//W3C//ENTITIES Latin 1 for XHTML//EN"
|
||||
"xhtml-lat1.ent" >
|
||||
%xhtml-lat1;
|
||||
|
||||
This DTD module is identified by the PUBLIC and SYSTEM identifiers:
|
||||
|
||||
PUBLIC "-//W3C//ENTITIES Latin 1 for XHTML//EN"
|
||||
SYSTEM "http://www.w3.org/MarkUp/DTD/xhtml-lat1.ent"
|
||||
|
||||
Revision: $Id: xhtml-lat1.ent,v 4.1 2001/04/10 09:34:14 altheim Exp $ SMI
|
||||
|
||||
Portions (C) International Organization for Standardization 1986:
|
||||
Permission to copy in any form is granted for use with conforming
|
||||
SGML systems and applications as defined in ISO 8879, provided
|
||||
this notice is included in all copies.
|
||||
-->
|
||||
|
||||
<!ENTITY nbsp " " ><!-- no-break space = non-breaking space, U+00A0 ISOnum -->
|
||||
<!ENTITY iexcl "¡" ><!-- inverted exclamation mark, U+00A1 ISOnum -->
|
||||
<!ENTITY cent "¢" ><!-- cent sign, U+00A2 ISOnum -->
|
||||
<!ENTITY pound "£" ><!-- pound sign, U+00A3 ISOnum -->
|
||||
<!ENTITY curren "¤" ><!-- currency sign, U+00A4 ISOnum -->
|
||||
<!ENTITY yen "¥" ><!-- yen sign = yuan sign, U+00A5 ISOnum -->
|
||||
<!ENTITY brvbar "¦" ><!-- broken bar = broken vertical bar, U+00A6 ISOnum -->
|
||||
<!ENTITY sect "§" ><!-- section sign, U+00A7 ISOnum -->
|
||||
<!ENTITY uml "¨" ><!-- diaeresis = spacing diaeresis, U+00A8 ISOdia -->
|
||||
<!ENTITY copy "©" ><!-- copyright sign, U+00A9 ISOnum -->
|
||||
<!ENTITY ordf "ª" ><!-- feminine ordinal indicator, U+00AA ISOnum -->
|
||||
<!ENTITY laquo "«" ><!-- left-pointing double angle quotation mark = left pointing guillemet, U+00AB ISOnum -->
|
||||
<!ENTITY not "¬" ><!-- not sign, U+00AC ISOnum -->
|
||||
<!ENTITY shy "­" ><!-- soft hyphen = discretionary hyphen, U+00AD ISOnum -->
|
||||
<!ENTITY reg "®" ><!-- registered sign = registered trade mark sign, U+00AE ISOnum -->
|
||||
<!ENTITY macr "¯" ><!-- macron = spacing macron = overline = APL overbar, U+00AF ISOdia -->
|
||||
<!ENTITY deg "°" ><!-- degree sign, U+00B0 ISOnum -->
|
||||
<!ENTITY plusmn "±" ><!-- plus-minus sign = plus-or-minus sign, U+00B1 ISOnum -->
|
||||
<!ENTITY sup2 "²" ><!-- superscript two = superscript digit two = squared, U+00B2 ISOnum -->
|
||||
<!ENTITY sup3 "³" ><!-- superscript three = superscript digit three = cubed, U+00B3 ISOnum -->
|
||||
<!ENTITY acute "´" ><!-- acute accent = spacing acute, U+00B4 ISOdia -->
|
||||
<!ENTITY micro "µ" ><!-- micro sign, U+00B5 ISOnum -->
|
||||
<!ENTITY para "¶" ><!-- pilcrow sign = paragraph sign, U+00B6 ISOnum -->
|
||||
<!ENTITY middot "·" ><!-- middle dot = Georgian comma = Greek middle dot, U+00B7 ISOnum -->
|
||||
<!ENTITY cedil "¸" ><!-- cedilla = spacing cedilla, U+00B8 ISOdia -->
|
||||
<!ENTITY sup1 "¹" ><!-- superscript one = superscript digit one, U+00B9 ISOnum -->
|
||||
<!ENTITY ordm "º" ><!-- masculine ordinal indicator, U+00BA ISOnum -->
|
||||
<!ENTITY raquo "»" ><!-- right-pointing double angle quotation mark = right pointing guillemet, U+00BB ISOnum -->
|
||||
<!ENTITY frac14 "¼" ><!-- vulgar fraction one quarter = fraction one quarter, U+00BC ISOnum -->
|
||||
<!ENTITY frac12 "½" ><!-- vulgar fraction one half = fraction one half, U+00BD ISOnum -->
|
||||
<!ENTITY frac34 "¾" ><!-- vulgar fraction three quarters = fraction three quarters, U+00BE ISOnum -->
|
||||
<!ENTITY iquest "¿" ><!-- inverted question mark = turned question mark, U+00BF ISOnum -->
|
||||
<!ENTITY Agrave "À" ><!-- latin capital A with grave = latin capital A grave, U+00C0 ISOlat1 -->
|
||||
<!ENTITY Aacute "Á" ><!-- latin capital A with acute, U+00C1 ISOlat1 -->
|
||||
<!ENTITY Acirc "Â" ><!-- latin capital A with circumflex, U+00C2 ISOlat1 -->
|
||||
<!ENTITY Atilde "Ã" ><!-- latin capital A with tilde, U+00C3 ISOlat1 -->
|
||||
<!ENTITY Auml "Ä" ><!-- latin capital A with diaeresis, U+00C4 ISOlat1 -->
|
||||
<!ENTITY Aring "Å" ><!-- latin capital A with ring above = latin capital A ring, U+00C5 ISOlat1 -->
|
||||
<!ENTITY AElig "Æ" ><!-- latin capital AE = latin capital ligature AE, U+00C6 ISOlat1 -->
|
||||
<!ENTITY Ccedil "Ç" ><!-- latin capital C with cedilla, U+00C7 ISOlat1 -->
|
||||
<!ENTITY Egrave "È" ><!-- latin capital E with grave, U+00C8 ISOlat1 -->
|
||||
<!ENTITY Eacute "É" ><!-- latin capital E with acute, U+00C9 ISOlat1 -->
|
||||
<!ENTITY Ecirc "Ê" ><!-- latin capital E with circumflex, U+00CA ISOlat1 -->
|
||||
<!ENTITY Euml "Ë" ><!-- latin capital E with diaeresis, U+00CB ISOlat1 -->
|
||||
<!ENTITY Igrave "Ì" ><!-- latin capital I with grave, U+00CC ISOlat1 -->
|
||||
<!ENTITY Iacute "Í" ><!-- latin capital I with acute, U+00CD ISOlat1 -->
|
||||
<!ENTITY Icirc "Î" ><!-- latin capital I with circumflex, U+00CE ISOlat1 -->
|
||||
<!ENTITY Iuml "Ï" ><!-- latin capital I with diaeresis, U+00CF ISOlat1 -->
|
||||
<!ENTITY ETH "Ð" ><!-- latin capital ETH, U+00D0 ISOlat1 -->
|
||||
<!ENTITY Ntilde "Ñ" ><!-- latin capital N with tilde, U+00D1 ISOlat1 -->
|
||||
<!ENTITY Ograve "Ò" ><!-- latin capital O with grave, U+00D2 ISOlat1 -->
|
||||
<!ENTITY Oacute "Ó" ><!-- latin capital O with acute, U+00D3 ISOlat1 -->
|
||||
<!ENTITY Ocirc "Ô" ><!-- latin capital O with circumflex, U+00D4 ISOlat1 -->
|
||||
<!ENTITY Otilde "Õ" ><!-- latin capital O with tilde, U+00D5 ISOlat1 -->
|
||||
<!ENTITY Ouml "Ö" ><!-- latin capital O with diaeresis, U+00D6 ISOlat1 -->
|
||||
<!ENTITY times "×" ><!-- multiplication sign, U+00D7 ISOnum -->
|
||||
<!ENTITY Oslash "Ø" ><!-- latin capital O with stroke = latin capital O slash, U+00D8 ISOlat1 -->
|
||||
<!ENTITY Ugrave "Ù" ><!-- latin capital U with grave, U+00D9 ISOlat1 -->
|
||||
<!ENTITY Uacute "Ú" ><!-- latin capital U with acute, U+00DA ISOlat1 -->
|
||||
<!ENTITY Ucirc "Û" ><!-- latin capital U with circumflex, U+00DB ISOlat1 -->
|
||||
<!ENTITY Uuml "Ü" ><!-- latin capital U with diaeresis, U+00DC ISOlat1 -->
|
||||
<!ENTITY Yacute "Ý" ><!-- latin capital Y with acute, U+00DD ISOlat1 -->
|
||||
<!ENTITY THORN "Þ" ><!-- latin capital THORN, U+00DE ISOlat1 -->
|
||||
<!ENTITY szlig "ß" ><!-- latin small sharp s = ess-zed, U+00DF ISOlat1 -->
|
||||
<!ENTITY agrave "à" ><!-- latin small a with grave = latin small a grave, U+00E0 ISOlat1 -->
|
||||
<!ENTITY aacute "á" ><!-- latin small a with acute, U+00E1 ISOlat1 -->
|
||||
<!ENTITY acirc "â" ><!-- latin small a with circumflex, U+00E2 ISOlat1 -->
|
||||
<!ENTITY atilde "ã" ><!-- latin small a with tilde, U+00E3 ISOlat1 -->
|
||||
<!ENTITY auml "ä" ><!-- latin small a with diaeresis, U+00E4 ISOlat1 -->
|
||||
<!ENTITY aring "å" ><!-- latin small a with ring above = latin small a ring, U+00E5 ISOlat1 -->
|
||||
<!ENTITY aelig "æ" ><!-- latin small ae = latin small ligature ae, U+00E6 ISOlat1 -->
|
||||
<!ENTITY ccedil "ç" ><!-- latin small c with cedilla, U+00E7 ISOlat1 -->
|
||||
<!ENTITY egrave "è" ><!-- latin small e with grave, U+00E8 ISOlat1 -->
|
||||
<!ENTITY eacute "é" ><!-- latin small e with acute, U+00E9 ISOlat1 -->
|
||||
<!ENTITY ecirc "ê" ><!-- latin small e with circumflex, U+00EA ISOlat1 -->
|
||||
<!ENTITY euml "ë" ><!-- latin small e with diaeresis, U+00EB ISOlat1 -->
|
||||
<!ENTITY igrave "ì" ><!-- latin small i with grave, U+00EC ISOlat1 -->
|
||||
<!ENTITY iacute "í" ><!-- latin small i with acute, U+00ED ISOlat1 -->
|
||||
<!ENTITY icirc "î" ><!-- latin small i with circumflex, U+00EE ISOlat1 -->
|
||||
<!ENTITY iuml "ï" ><!-- latin small i with diaeresis, U+00EF ISOlat1 -->
|
||||
<!ENTITY eth "ð" ><!-- latin small eth, U+00F0 ISOlat1 -->
|
||||
<!ENTITY ntilde "ñ" ><!-- latin small n with tilde, U+00F1 ISOlat1 -->
|
||||
<!ENTITY ograve "ò" ><!-- latin small o with grave, U+00F2 ISOlat1 -->
|
||||
<!ENTITY oacute "ó" ><!-- latin small o with acute, U+00F3 ISOlat1 -->
|
||||
<!ENTITY ocirc "ô" ><!-- latin small o with circumflex, U+00F4 ISOlat1 -->
|
||||
<!ENTITY otilde "õ" ><!-- latin small o with tilde, U+00F5 ISOlat1 -->
|
||||
<!ENTITY ouml "ö" ><!-- latin small o with diaeresis, U+00F6 ISOlat1 -->
|
||||
<!ENTITY divide "÷" ><!-- division sign, U+00F7 ISOnum -->
|
||||
<!ENTITY oslash "ø" ><!-- latin small o with stroke, = latin small o slash, U+00F8 ISOlat1 -->
|
||||
<!ENTITY ugrave "ù" ><!-- latin small u with grave, U+00F9 ISOlat1 -->
|
||||
<!ENTITY uacute "ú" ><!-- latin small u with acute, U+00FA ISOlat1 -->
|
||||
<!ENTITY ucirc "û" ><!-- latin small u with circumflex, U+00FB ISOlat1 -->
|
||||
<!ENTITY uuml "ü" ><!-- latin small u with diaeresis, U+00FC ISOlat1 -->
|
||||
<!ENTITY yacute "ý" ><!-- latin small y with acute, U+00FD ISOlat1 -->
|
||||
<!ENTITY thorn "þ" ><!-- latin small thorn with, U+00FE ISOlat1 -->
|
||||
<!ENTITY yuml "ÿ" ><!-- latin small y with diaeresis, U+00FF ISOlat1 -->
|
||||
<!-- end of xhtml-lat1.ent -->
|
||||
|
||||
F.1.2. XHTML Special Characters
|
||||
|
||||
You can download this version of this file from http://www.w3.org/TR/2010/REC-xhtml-modularization/DTD/xhtml-special.ent. The latest version is available at http://www.w3.org/MarkUp/DTD/xhtml-special.ent.
|
||||
|
||||
<!-- ...................................................................... -->
|
||||
<!-- XML-compatible ISO Special Character Entity Set for XHTML ............ -->
|
||||
<!-- file: xhtml-special.ent
|
||||
|
||||
Typical invocation:
|
||||
|
||||
<!ENTITY % xhtml-special
|
||||
PUBLIC "-//W3C//ENTITIES Special for XHTML//EN"
|
||||
"xhtml-special.ent" >
|
||||
%xhtml-special;
|
||||
|
||||
This DTD module is identified by the PUBLIC and SYSTEM identifiers:
|
||||
|
||||
PUBLIC "-//W3C//ENTITIES Special for XHTML//EN"
|
||||
SYSTEM "http://www.w3.org/MarkUp/DTD/xhtml-special.ent"
|
||||
|
||||
Revision: $Id: xhtml-special.ent,v 4.1 2001/04/10 09:34:14 altheim Exp $ SMI
|
||||
|
||||
Portions (C) International Organization for Standardization 1986:
|
||||
Permission to copy in any form is granted for use with conforming
|
||||
SGML systems and applications as defined in ISO 8879, provided
|
||||
this notice is included in all copies.
|
||||
|
||||
Revisions:
|
||||
2000-10-28: added ' and altered XML Predefined Entities for compatibility
|
||||
-->
|
||||
|
||||
<!-- Relevant ISO entity set is given unless names are newly introduced.
|
||||
New names (i.e., not in ISO 8879 [SGML] list) do not clash with
|
||||
any existing ISO 8879 entity names. ISO 10646 [ISO10646] character
|
||||
numbers are given for each character, in hex. Entity values are
|
||||
decimal conversions of the ISO 10646 values and refer to the
|
||||
document character set. Names are Unicode [UNICODE] names.
|
||||
-->
|
||||
|
||||
<!-- C0 Controls and Basic Latin -->
|
||||
<!ENTITY lt "&#60;" ><!-- less-than sign, U+003C ISOnum -->
|
||||
<!ENTITY gt ">" ><!-- greater-than sign, U+003E ISOnum -->
|
||||
<!ENTITY amp "&#38;" ><!-- ampersand, U+0026 ISOnum -->
|
||||
<!ENTITY apos "'" ><!-- The Apostrophe (Apostrophe Quote, APL Quote), U+0027 ISOnum -->
|
||||
<!ENTITY quot """ ><!-- quotation mark (Quote Double), U+0022 ISOnum -->
|
||||
|
||||
<!-- Latin Extended-A -->
|
||||
<!ENTITY OElig "Œ" ><!-- latin capital ligature OE, U+0152 ISOlat2 -->
|
||||
<!ENTITY oelig "œ" ><!-- latin small ligature oe, U+0153 ISOlat2 -->
|
||||
|
||||
<!-- ligature is a misnomer, this is a separate character in some languages -->
|
||||
<!ENTITY Scaron "Š" ><!-- latin capital letter S with caron, U+0160 ISOlat2 -->
|
||||
<!ENTITY scaron "š" ><!-- latin small letter s with caron, U+0161 ISOlat2 -->
|
||||
<!ENTITY Yuml "Ÿ" ><!-- latin capital letter Y with diaeresis, U+0178 ISOlat2 -->
|
||||
|
||||
<!-- Spacing Modifier Letters -->
|
||||
<!ENTITY circ "ˆ" ><!-- modifier letter circumflex accent, U+02C6 ISOpub -->
|
||||
<!ENTITY tilde "˜" ><!-- small tilde, U+02DC ISOdia -->
|
||||
|
||||
<!-- General Punctuation -->
|
||||
<!ENTITY ensp " " ><!-- en space, U+2002 ISOpub -->
|
||||
<!ENTITY emsp " " ><!-- em space, U+2003 ISOpub -->
|
||||
<!ENTITY thinsp " " ><!-- thin space, U+2009 ISOpub -->
|
||||
<!ENTITY zwnj "‌" ><!-- zero width non-joiner, U+200C NEW RFC 2070 -->
|
||||
<!ENTITY zwj "‍" ><!-- zero width joiner, U+200D NEW RFC 2070 -->
|
||||
<!ENTITY lrm "‎" ><!-- left-to-right mark, U+200E NEW RFC 2070 -->
|
||||
<!ENTITY rlm "‏" ><!-- right-to-left mark, U+200F NEW RFC 2070 -->
|
||||
<!ENTITY ndash "–" ><!-- en dash, U+2013 ISOpub -->
|
||||
<!ENTITY mdash "—" ><!-- em dash, U+2014 ISOpub -->
|
||||
<!ENTITY lsquo "‘" ><!-- left single quotation mark, U+2018 ISOnum -->
|
||||
<!ENTITY rsquo "’" ><!-- right single quotation mark, U+2019 ISOnum -->
|
||||
<!ENTITY sbquo "‚" ><!-- single low-9 quotation mark, U+201A NEW -->
|
||||
<!ENTITY ldquo "“" ><!-- left double quotation mark, U+201C ISOnum -->
|
||||
<!ENTITY rdquo "”" ><!-- right double quotation mark, U+201D ISOnum -->
|
||||
<!ENTITY bdquo "„" ><!-- double low-9 quotation mark, U+201E NEW -->
|
||||
<!ENTITY dagger "†" ><!-- dagger, U+2020 ISOpub -->
|
||||
<!ENTITY Dagger "‡" ><!-- double dagger, U+2021 ISOpub -->
|
||||
<!ENTITY permil "‰" ><!-- per mille sign, U+2030 ISOtech -->
|
||||
|
||||
<!-- lsaquo is proposed but not yet ISO standardized -->
|
||||
<!ENTITY lsaquo "‹" ><!-- single left-pointing angle quotation mark, U+2039 ISO proposed -->
|
||||
<!-- rsaquo is proposed but not yet ISO standardized -->
|
||||
<!ENTITY rsaquo "›" ><!-- single right-pointing angle quotation mark, U+203A ISO proposed -->
|
||||
<!ENTITY euro "€" ><!-- euro sign, U+20AC NEW -->
|
||||
|
||||
<!-- end of xhtml-special.ent -->
|
||||
|
||||
F.1.3. XHTML Mathematical, Greek, and Symbolic Characters
|
||||
|
||||
You can download this version of this file from http://www.w3.org/TR/2010/REC-xhtml-modularization/DTD/xhtml-symbol.ent. The latest version is available at http://www.w3.org/MarkUp/DTD/xhtml-symbol.ent.
|
||||
|
||||
<!-- ...................................................................... -->
|
||||
<!-- ISO Math, Greek and Symbolic Character Entity Set for XHTML .......... -->
|
||||
<!-- file: xhtml-symbol.ent
|
||||
|
||||
Typical invocation:
|
||||
|
||||
<!ENTITY % xhtml-symbol
|
||||
PUBLIC "-//W3C//ENTITIES Symbols for XHTML//EN"
|
||||
"xhtml-symbol.ent" >
|
||||
%xhtml-symbol;
|
||||
|
||||
This DTD module is identified by the PUBLIC and SYSTEM identifiers:
|
||||
|
||||
PUBLIC "-//W3C//ENTITIES Symbols for XHTML//EN"
|
||||
SYSTEM "http://www.w3.org/MarkUp/DTD/xhtml-symbol.ent"
|
||||
|
||||
Revision: $Id: xhtml-symbol.ent,v 4.1 2001/04/10 09:34:14 altheim Exp $ SMI
|
||||
|
||||
Portions (C) International Organization for Standardization 1986:
|
||||
Permission to copy in any form is granted for use with conforming
|
||||
SGML systems and applications as defined in ISO 8879, provided
|
||||
this notice is included in all copies.
|
||||
-->
|
||||
|
||||
<!-- Relevant ISO entity set is given unless names are newly introduced.
|
||||
New names (i.e., not in ISO 8879 [SGML] list) do not clash with
|
||||
any existing ISO 8879 entity names. ISO 10646 [ISO10646] character
|
||||
numbers are given for each character, in hex. Entity values are
|
||||
decimal conversions of the ISO 10646 values and refer to the
|
||||
document character set. Names are Unicode [UNICODE] names.
|
||||
-->
|
||||
|
||||
<!-- Latin Extended-B -->
|
||||
<!ENTITY fnof "ƒ" ><!-- latin small f with hook = function
|
||||
= florin, U+0192 ISOtech -->
|
||||
|
||||
<!-- Greek -->
|
||||
<!ENTITY Alpha "Α" ><!-- greek capital letter alpha, U+0391 -->
|
||||
<!ENTITY Beta "Β" ><!-- greek capital letter beta, U+0392 -->
|
||||
<!ENTITY Gamma "Γ" ><!-- greek capital letter gamma, U+0393 ISOgrk3 -->
|
||||
<!ENTITY Delta "Δ" ><!-- greek capital letter delta, U+0394 ISOgrk3 -->
|
||||
<!ENTITY Epsilon "Ε" ><!-- greek capital letter epsilon, U+0395 -->
|
||||
<!ENTITY Zeta "Ζ" ><!-- greek capital letter zeta, U+0396 -->
|
||||
<!ENTITY Eta "Η" ><!-- greek capital letter eta, U+0397 -->
|
||||
<!ENTITY Theta "Θ" ><!-- greek capital letter theta, U+0398 ISOgrk3 -->
|
||||
<!ENTITY Iota "Ι" ><!-- greek capital letter iota, U+0399 -->
|
||||
<!ENTITY Kappa "Κ" ><!-- greek capital letter kappa, U+039A -->
|
||||
<!ENTITY Lambda "Λ" ><!-- greek capital letter lambda, U+039B ISOgrk3 -->
|
||||
<!ENTITY Mu "Μ" ><!-- greek capital letter mu, U+039C -->
|
||||
<!ENTITY Nu "Ν" ><!-- greek capital letter nu, U+039D -->
|
||||
<!ENTITY Xi "Ξ" ><!-- greek capital letter xi, U+039E ISOgrk3 -->
|
||||
<!ENTITY Omicron "Ο" ><!-- greek capital letter omicron, U+039F -->
|
||||
<!ENTITY Pi "Π" ><!-- greek capital letter pi, U+03A0 ISOgrk3 -->
|
||||
<!ENTITY Rho "Ρ" ><!-- greek capital letter rho, U+03A1 -->
|
||||
<!-- there is no Sigmaf, and no U+03A2 character either -->
|
||||
<!ENTITY Sigma "Σ" ><!-- greek capital letter sigma, U+03A3 ISOgrk3 -->
|
||||
<!ENTITY Tau "Τ" ><!-- greek capital letter tau, U+03A4 -->
|
||||
<!ENTITY Upsilon "Υ" ><!-- greek capital letter upsilon,
|
||||
U+03A5 ISOgrk3 -->
|
||||
<!ENTITY Phi "Φ" ><!-- greek capital letter phi, U+03A6 ISOgrk3 -->
|
||||
<!ENTITY Chi "Χ" ><!-- greek capital letter chi, U+03A7 -->
|
||||
<!ENTITY Psi "Ψ" ><!-- greek capital letter psi, U+03A8 ISOgrk3 -->
|
||||
<!ENTITY Omega "Ω" ><!-- greek capital letter omega, U+03A9 ISOgrk3 -->
|
||||
<!ENTITY alpha "α" ><!-- greek small letter alpha, U+03B1 ISOgrk3 -->
|
||||
<!ENTITY beta "β" ><!-- greek small letter beta, U+03B2 ISOgrk3 -->
|
||||
<!ENTITY gamma "γ" ><!-- greek small letter gamma, U+03B3 ISOgrk3 -->
|
||||
<!ENTITY delta "δ" ><!-- greek small letter delta, U+03B4 ISOgrk3 -->
|
||||
<!ENTITY epsilon "ε" ><!-- greek small letter epsilon, U+03B5 ISOgrk3 -->
|
||||
<!ENTITY zeta "ζ" ><!-- greek small letter zeta, U+03B6 ISOgrk3 -->
|
||||
<!ENTITY eta "η" ><!-- greek small letter eta, U+03B7 ISOgrk3 -->
|
||||
<!ENTITY theta "θ" ><!-- greek small letter theta, U+03B8 ISOgrk3 -->
|
||||
<!ENTITY iota "ι" ><!-- greek small letter iota, U+03B9 ISOgrk3 -->
|
||||
<!ENTITY kappa "κ" ><!-- greek small letter kappa, U+03BA ISOgrk3 -->
|
||||
<!ENTITY lambda "λ" ><!-- greek small letter lambda, U+03BB ISOgrk3 -->
|
||||
<!ENTITY mu "μ" ><!-- greek small letter mu, U+03BC ISOgrk3 -->
|
||||
<!ENTITY nu "ν" ><!-- greek small letter nu, U+03BD ISOgrk3 -->
|
||||
<!ENTITY xi "ξ" ><!-- greek small letter xi, U+03BE ISOgrk3 -->
|
||||
<!ENTITY omicron "ο" ><!-- greek small letter omicron, U+03BF NEW -->
|
||||
<!ENTITY pi "π" ><!-- greek small letter pi, U+03C0 ISOgrk3 -->
|
||||
<!ENTITY rho "ρ" ><!-- greek small letter rho, U+03C1 ISOgrk3 -->
|
||||
<!ENTITY sigmaf "ς" ><!-- greek small letter final sigma, U+03C2 ISOgrk3 -->
|
||||
<!ENTITY sigma "σ" ><!-- greek small letter sigma, U+03C3 ISOgrk3 -->
|
||||
<!ENTITY tau "τ" ><!-- greek small letter tau, U+03C4 ISOgrk3 -->
|
||||
<!ENTITY upsilon "υ" ><!-- greek small letter upsilon, U+03C5 ISOgrk3 -->
|
||||
<!ENTITY phi "φ" ><!-- greek small letter phi, U+03C6 ISOgrk3 -->
|
||||
<!ENTITY chi "χ" ><!-- greek small letter chi, U+03C7 ISOgrk3 -->
|
||||
<!ENTITY psi "ψ" ><!-- greek small letter psi, U+03C8 ISOgrk3 -->
|
||||
<!ENTITY omega "ω" ><!-- greek small letter omega, U+03C9 ISOgrk3 -->
|
||||
<!ENTITY thetasym "ϑ" ><!-- greek small letter theta symbol, U+03D1 NEW -->
|
||||
<!ENTITY upsih "ϒ" ><!-- greek upsilon with hook symbol, U+03D2 NEW -->
|
||||
<!ENTITY piv "ϖ" ><!-- greek pi symbol, U+03D6 ISOgrk3 -->
|
||||
|
||||
<!-- General Punctuation -->
|
||||
<!ENTITY bull "•" ><!-- bullet = black small circle, U+2022 ISOpub -->
|
||||
<!-- bullet is NOT the same as bullet operator, U+2219 -->
|
||||
<!ENTITY hellip "…" ><!-- horizontal ellipsis = three dot leader, U+2026 ISOpub -->
|
||||
<!ENTITY prime "′" ><!-- prime = minutes = feet, U+2032 ISOtech -->
|
||||
<!ENTITY Prime "″" ><!-- double prime = seconds = inches, U+2033 ISOtech -->
|
||||
<!ENTITY oline "‾" ><!-- overline = spacing overscore, U+203E NEW -->
|
||||
<!ENTITY frasl "⁄" ><!-- fraction slash, U+2044 NEW -->
|
||||
|
||||
<!-- Letterlike Symbols -->
|
||||
<!ENTITY weierp "℘" ><!-- script capital P = power set = Weierstrass p, U+2118 ISOamso -->
|
||||
<!ENTITY image "ℑ" ><!-- blackletter capital I = imaginary part, U+2111 ISOamso -->
|
||||
<!ENTITY real "ℜ" ><!-- blackletter capital R = real part symbol, U+211C ISOamso -->
|
||||
<!ENTITY trade "™" ><!-- trade mark sign, U+2122 ISOnum -->
|
||||
<!ENTITY alefsym "ℵ" ><!-- alef symbol = first transfinite cardinal, U+2135 NEW -->
|
||||
<!-- alef symbol is NOT the same as hebrew letter alef, U+05D0 although
|
||||
the same glyph could be used to depict both characters -->
|
||||
|
||||
<!-- Arrows -->
|
||||
<!ENTITY larr "←" ><!-- leftwards arrow, U+2190 ISOnum -->
|
||||
<!ENTITY uarr "↑" ><!-- upwards arrow, U+2191 ISOnum-->
|
||||
<!ENTITY rarr "→" ><!-- rightwards arrow, U+2192 ISOnum -->
|
||||
<!ENTITY darr "↓" ><!-- downwards arrow, U+2193 ISOnum -->
|
||||
<!ENTITY harr "↔" ><!-- left right arrow, U+2194 ISOamsa -->
|
||||
<!ENTITY crarr "↵" ><!-- downwards arrow with corner leftwards
|
||||
= carriage return, U+21B5 NEW -->
|
||||
<!ENTITY lArr "⇐" ><!-- leftwards double arrow, U+21D0 ISOtech -->
|
||||
<!-- Unicode does not say that lArr is the same as the 'is implied by' arrow
|
||||
but also does not have any other character for that function. So ? lArr can
|
||||
be used for 'is implied by' as ISOtech suggests -->
|
||||
<!ENTITY uArr "⇑" ><!-- upwards double arrow, U+21D1 ISOamsa -->
|
||||
<!ENTITY rArr "⇒" ><!-- rightwards double arrow, U+21D2 ISOtech -->
|
||||
<!-- Unicode does not say this is the 'implies' character but does not have
|
||||
another character with this function so ?
|
||||
rArr can be used for 'implies' as ISOtech suggests -->
|
||||
<!ENTITY dArr "⇓" ><!-- downwards double arrow, U+21D3 ISOamsa -->
|
||||
<!ENTITY hArr "⇔" ><!-- left right double arrow, U+21D4 ISOamsa -->
|
||||
|
||||
<!-- Mathematical Operators -->
|
||||
<!ENTITY forall "∀" ><!-- for all, U+2200 ISOtech -->
|
||||
<!ENTITY part "∂" ><!-- partial differential, U+2202 ISOtech -->
|
||||
<!ENTITY exist "∃" ><!-- there exists, U+2203 ISOtech -->
|
||||
<!ENTITY empty "∅" ><!-- empty set = null set, U+2205 ISOamso -->
|
||||
<!ENTITY nabla "∇" ><!-- nabla = backward difference, U+2207 ISOtech -->
|
||||
<!ENTITY isin "∈" ><!-- element of, U+2208 ISOtech -->
|
||||
<!ENTITY notin "∉" ><!-- not an element of, U+2209 ISOtech -->
|
||||
<!ENTITY ni "∋" ><!-- contains as member, U+220B ISOtech -->
|
||||
<!-- should there be a more memorable name than 'ni'? -->
|
||||
<!ENTITY prod "∏" ><!-- n-ary product = product sign, U+220F ISOamsb -->
|
||||
<!-- prod is NOT the same character as U+03A0 'greek capital letter pi' though
|
||||
the same glyph might be used for both -->
|
||||
<!ENTITY sum "∑" ><!-- n-ary sumation, U+2211 ISOamsb -->
|
||||
<!-- sum is NOT the same character as U+03A3 'greek capital letter sigma'
|
||||
though the same glyph might be used for both -->
|
||||
<!ENTITY minus "−" ><!-- minus sign, U+2212 ISOtech -->
|
||||
<!ENTITY lowast "∗" ><!-- asterisk operator, U+2217 ISOtech -->
|
||||
<!ENTITY radic "√" ><!-- square root = radical sign, U+221A ISOtech -->
|
||||
<!ENTITY prop "∝" ><!-- proportional to, U+221D ISOtech -->
|
||||
<!ENTITY infin "∞" ><!-- infinity, U+221E ISOtech -->
|
||||
<!ENTITY ang "∠" ><!-- angle, U+2220 ISOamso -->
|
||||
<!ENTITY and "∧" ><!-- logical and = wedge, U+2227 ISOtech -->
|
||||
<!ENTITY or "∨" ><!-- logical or = vee, U+2228 ISOtech -->
|
||||
<!ENTITY cap "∩" ><!-- intersection = cap, U+2229 ISOtech -->
|
||||
<!ENTITY cup "∪" ><!-- union = cup, U+222A ISOtech -->
|
||||
<!ENTITY int "∫" ><!-- integral, U+222B ISOtech -->
|
||||
<!ENTITY there4 "∴" ><!-- therefore, U+2234 ISOtech -->
|
||||
<!ENTITY sim "∼" ><!-- tilde operator = varies with = similar to, U+223C ISOtech -->
|
||||
<!-- tilde operator is NOT the same character as the tilde, U+007E,
|
||||
although the same glyph might be used to represent both -->
|
||||
<!ENTITY cong "≅" ><!-- approximately equal to, U+2245 ISOtech -->
|
||||
<!ENTITY asymp "≈" ><!-- almost equal to = asymptotic to, U+2248 ISOamsr -->
|
||||
<!ENTITY ne "≠" ><!-- not equal to, U+2260 ISOtech -->
|
||||
<!ENTITY equiv "≡" ><!-- identical to, U+2261 ISOtech -->
|
||||
<!ENTITY le "≤" ><!-- less-than or equal to, U+2264 ISOtech -->
|
||||
<!ENTITY ge "≥" ><!-- greater-than or equal to, U+2265 ISOtech -->
|
||||
<!ENTITY sub "⊂" ><!-- subset of, U+2282 ISOtech -->
|
||||
<!ENTITY sup "⊃" ><!-- superset of, U+2283 ISOtech -->
|
||||
<!-- note that nsup, 'not a superset of, U+2283' is not covered by the Symbol
|
||||
font encoding and is not included. Should it be, for symmetry?
|
||||
It is in ISOamsn -->
|
||||
<!ENTITY nsub "⊄" ><!-- not a subset of, U+2284 ISOamsn -->
|
||||
<!ENTITY sube "⊆" ><!-- subset of or equal to, U+2286 ISOtech -->
|
||||
<!ENTITY supe "⊇" ><!-- superset of or equal to, U+2287 ISOtech -->
|
||||
<!ENTITY oplus "⊕" ><!-- circled plus = direct sum, U+2295 ISOamsb -->
|
||||
<!ENTITY otimes "⊗" ><!-- circled times = vector product, U+2297 ISOamsb -->
|
||||
<!ENTITY perp "⊥" ><!-- up tack = orthogonal to = perpendicular, U+22A5 ISOtech -->
|
||||
<!ENTITY sdot "⋅" ><!-- dot operator, U+22C5 ISOamsb -->
|
||||
<!-- dot operator is NOT the same character as U+00B7 middle dot -->
|
||||
|
||||
<!-- Miscellaneous Technical -->
|
||||
<!ENTITY lceil "⌈" ><!-- left ceiling = apl upstile, U+2308 ISOamsc -->
|
||||
<!ENTITY rceil "⌉" ><!-- right ceiling, U+2309 ISOamsc -->
|
||||
<!ENTITY lfloor "⌊" ><!-- left floor = apl downstile, U+230A ISOamsc -->
|
||||
<!ENTITY rfloor "⌋" ><!-- right floor, U+230B ISOamsc -->
|
||||
<!ENTITY lang "〈" ><!-- left-pointing angle bracket = bra, U+2329 ISOtech -->
|
||||
<!-- lang is NOT the same character as U+003C 'less than'
|
||||
or U+2039 'single left-pointing angle quotation mark' -->
|
||||
<!ENTITY rang "〉" ><!-- right-pointing angle bracket = ket, U+232A ISOtech -->
|
||||
<!-- rang is NOT the same character as U+003E 'greater than'
|
||||
or U+203A 'single right-pointing angle quotation mark' -->
|
||||
|
||||
<!-- Geometric Shapes -->
|
||||
<!ENTITY loz "◊" ><!-- lozenge, U+25CA ISOpub -->
|
||||
|
||||
<!-- Miscellaneous Symbols -->
|
||||
<!ENTITY spades "♠" ><!-- black spade suit, U+2660 ISOpub -->
|
||||
<!-- black here seems to mean filled as opposed to hollow -->
|
||||
<!ENTITY clubs "♣" ><!-- black club suit = shamrock, U+2663 ISOpub -->
|
||||
<!ENTITY hearts "♥" ><!-- black heart suit = valentine, U+2665 ISOpub -->
|
||||
<!ENTITY diams "♦" ><!-- black diamond suit, U+2666 ISOpub -->
|
||||
|
||||
<!-- end of xhtml-symbol.ent -->
|
||||
"""
|
||||
return text
|
||||
|
||||
def get_apache_license():
|
||||
license = r"""/**
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
"""
|
||||
return license
|
||||
|
||||
main()
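For orientation, the text this script prints is meant to be pasted into HTMLStripCharFilter.jflex. A rough, abbreviated sketch of its shape follows (entity names are elided with "..." and the map size is left as the placeholder N, so none of the specifics below should be read as the real generated output):

    CharacterEntities = ( "AElig" | "Aacute" | "Acirc" | ... | "zwj" | "zwnj" )
    %{
      private static final Set<String> upperCaseVariantsAccepted
          = new HashSet<String>(Arrays.asList("quot","copy","gt","lt","reg","amp"));
      private static final CharArrayMap<Character> entityValues
          = new CharArrayMap<Character>(Version.LUCENE_40, N, false);
      static {
        String[] entities = {
          "AElig", "\u00C6", "Aacute", "\u00C1", ...
        };
        for (int i = 0 ; i < entities.length ; i += 2) {
          Character value = entities[i + 1].charAt(0);
          entityValues.put(entities[i], value);
          if (upperCaseVariantsAccepted.contains(entities[i])) {
            entityValues.put(entities[i].toUpperCase(), value);
          }
        }
      }
    %}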
|
|
@ -17,6 +17,42 @@
|
|||
-->
|
||||
<html><head></head>
|
||||
<body>
|
||||
Filters that normalize text before tokenization.
|
||||
<p>
|
||||
Chainable filters that normalize text before tokenization and provide
|
||||
mappings between normalized text offsets and the corresponding offset
|
||||
in the original text.
|
||||
</p>
|
||||
<H2>CharFilter offset mappings</H2>
|
||||
<p>
|
||||
CharFilters modify an input stream via a series of substring
|
||||
replacements (including deletions and insertions) to produce an output
|
||||
stream. There are three possible replacement cases: the replacement
|
||||
string has the same length as the original substring; the replacement
|
||||
is shorter; and the replacement is longer. In the latter two cases
|
||||
(when the replacement has a different length than the original),
|
||||
one or more offset correction mappings are required.
|
||||
</p>
|
||||
<p>
|
||||
When the replacement is shorter than the original (e.g. when the
|
||||
replacement is the empty string), a single offset correction mapping
|
||||
should be added at the replacement's end offset in the output stream.
|
||||
The <code>cumulativeDiff</code> parameter to the
|
||||
<code>addOffCorrectMapping()</code> method will be the sum of all
|
||||
previous replacement offset adjustments, with the addition of the
|
||||
difference between the lengths of the original substring and the
|
||||
replacement string (a positive value).
|
||||
</p>
|
||||
<p>
|
||||
When the replacement is longer than the original (e.g. when the
|
||||
original is the empty string), you should add as many offset
|
||||
correction mappings as the difference between the lengths of the
|
||||
replacement string and the original substring, starting at the
|
||||
end offset the original substring would have had in the output stream.
|
||||
The <code>cumulativeDiff</code> parameter to the
|
||||
<code>addOffCorrectMapping()</code> method will be the sum of all
|
||||
previous replacement offset adjustments, with the addition of the
|
||||
difference between the lengths of the original substring and the
|
||||
replacement string so far (a negative value).
|
||||
</p>
|
||||
</body>
|
||||
</html>
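The rules above are easier to see with numbers. Below is a self-contained sketch (not part of this patch; the class, its fields, and the sample text are invented for illustration) of the bookkeeping a BaseCharFilter-style correctOffset() performs with the (offset, cumulativeDiff) pairs recorded by addOffCorrectMap(), applied to the "shorter replacement" case of a stripped 3-character tag:

    import java.util.Map;
    import java.util.TreeMap;

    public class OffsetCorrectionDemo {
      // maps output offset -> cumulative (original - output) length difference from that point on
      private final TreeMap<Integer, Integer> corrections = new TreeMap<Integer, Integer>();

      void addOffCorrectMap(int off, int cumulativeDiff) {
        corrections.put(off, cumulativeDiff);
      }

      // corrected offset = output offset + cumulativeDiff of the last mapping at or before it
      int correct(int outputOffset) {
        Map.Entry<Integer, Integer> e = corrections.floorEntry(outputOffset);
        return outputOffset + (e == null ? 0 : e.getValue());
      }

      public static void main(String[] args) {
        // Original text "foo<b>bar" becomes "foobar" after the tag is stripped.
        // The 3-char tag is an empty (shorter) replacement ending at output offset 3,
        // so a single mapping is added there with cumulativeDiff = 3.
        OffsetCorrectionDemo demo = new OffsetCorrectionDemo();
        demo.addOffCorrectMap(3, 3);
        System.out.println(demo.correct(2)); // 2: "o" sits at the same position in both texts
        System.out.println(demo.correct(3)); // 6: "b" of "bar" sits after the stripped tag in the original
      }
    }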
|
||||
|
|
|
@ -154,13 +154,22 @@ public abstract class CompoundWordTokenFilterBase extends TokenFilter {
|
|||
|
||||
/** Construct the compound token based on a slice of the current {@link CompoundWordTokenFilterBase#termAtt}. */
|
||||
public CompoundToken(int offset, int length) {
|
||||
final int newStart = CompoundWordTokenFilterBase.this.offsetAtt.startOffset() + offset;
|
||||
this.txt = CompoundWordTokenFilterBase.this.termAtt.subSequence(offset, offset + length);
|
||||
// TODO: This ignores the original endOffset, if a CharFilter/Tokenizer/Filter removed
|
||||
// chars from the term, offsets may not match correctly (other filters producing tokens
|
||||
// may also have this problem):
|
||||
this.startOffset = newStart;
|
||||
this.endOffset = newStart + length;
|
||||
|
||||
// offsets of the original word
|
||||
int startOff = CompoundWordTokenFilterBase.this.offsetAtt.startOffset();
|
||||
int endOff = CompoundWordTokenFilterBase.this.offsetAtt.endOffset();
|
||||
|
||||
if (endOff - startOff != CompoundWordTokenFilterBase.this.termAtt.length()) {
|
||||
// if length by start + end offsets doesn't match the term text then assume
|
||||
// this is a synonym and don't adjust the offsets.
|
||||
this.startOffset = startOff;
|
||||
this.endOffset = endOff;
|
||||
} else {
|
||||
final int newStart = startOff + offset;
|
||||
this.startOffset = newStart;
|
||||
this.endOffset = newStart + length;
|
||||
}
|
||||
}
|
||||
|
||||
}
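A hedged worked example of the condition guarded above (the token text is invented, not taken from this patch):

    // Suppose a synonym filter earlier in the chain injected the single token
    // "laptop computer" (termAtt.length() == 15) over the original text "notebook",
    // whose offsets are startOff = 0, endOff = 8. Then endOff - startOff (8) does not
    // equal the term length (15), so offsets computed from character positions inside
    // "laptop computer" would point at characters that never existed in the input;
    // the whole original span 0..8 is reused for every decompounded part instead.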
|
||||
|
|
|
@ -0,0 +1,47 @@
|
|||
package org.apache.lucene.analysis.core;
|
||||
|
||||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
import org.apache.lucene.analysis.TokenStream;
|
||||
import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
|
||||
import org.apache.lucene.analysis.util.FilteringTokenFilter;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.util.Set;
|
||||
|
||||
/**
|
||||
* Removes tokens whose types appear in a set of blocked types from a token stream.
|
||||
*/
|
||||
public final class TypeTokenFilter extends FilteringTokenFilter {
|
||||
|
||||
private final Set<String> stopTypes;
|
||||
private final TypeAttribute typeAttribute = addAttribute(TypeAttribute.class);
|
||||
|
||||
public TypeTokenFilter(boolean enablePositionIncrements, TokenStream input, Set<String> stopTypes) {
|
||||
super(enablePositionIncrements, input);
|
||||
this.stopTypes = stopTypes;
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns true when the token's typeAttribute.type() is not a stop type; only those tokens are kept in the stream.
|
||||
*/
|
||||
@Override
|
||||
protected boolean accept() throws IOException {
|
||||
return !stopTypes.contains(typeAttribute.type());
|
||||
}
|
||||
}
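A hedged usage sketch (not part of this patch): wiring the new filter behind a StandardTokenizer to drop purely numeric tokens. The "<NUM>" type name, the LUCENE_40 constant, and the sample text are assumptions chosen for illustration.

    import java.io.StringReader;
    import java.util.Arrays;
    import java.util.HashSet;
    import java.util.Set;

    import org.apache.lucene.analysis.TokenStream;
    import org.apache.lucene.analysis.core.TypeTokenFilter;
    import org.apache.lucene.analysis.standard.StandardTokenizer;
    import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
    import org.apache.lucene.util.Version;

    public class TypeTokenFilterDemo {
      public static void main(String[] args) throws Exception {
        // "<NUM>" is assumed to be the type StandardTokenizer assigns to numeric runs
        Set<String> stopTypes = new HashSet<String>(Arrays.asList("<NUM>"));
        TokenStream ts = new StandardTokenizer(Version.LUCENE_40, new StringReader("fire 42 water"));
        ts = new TypeTokenFilter(true, ts, stopTypes); // true: keep position gaps where tokens are dropped
        CharTermAttribute term = ts.addAttribute(CharTermAttribute.class);
        ts.reset();
        while (ts.incrementToken()) {
          System.out.println(term.toString()); // prints "fire" then "water"; "42" is removed by type
        }
        ts.end();
        ts.close();
      }
    }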
|
|
@ -60,6 +60,7 @@ public final class HyphenatedWordsFilter extends TokenFilter {
|
|||
private final StringBuilder hyphenated = new StringBuilder();
|
||||
private State savedState;
|
||||
private boolean exhausted = false;
|
||||
private int lastEndOffset = 0;
|
||||
|
||||
/**
|
||||
* Creates a new HyphenatedWordsFilter
|
||||
|
@ -78,6 +79,7 @@ public final class HyphenatedWordsFilter extends TokenFilter {
|
|||
while (!exhausted && input.incrementToken()) {
|
||||
char[] term = termAttribute.buffer();
|
||||
int termLength = termAttribute.length();
|
||||
lastEndOffset = offsetAttribute.endOffset();
|
||||
|
||||
if (termLength > 0 && term[termLength - 1] == '-') {
|
||||
// a hyphenated word
|
||||
|
@ -119,6 +121,7 @@ public final class HyphenatedWordsFilter extends TokenFilter {
|
|||
hyphenated.setLength(0);
|
||||
savedState = null;
|
||||
exhausted = false;
|
||||
lastEndOffset = 0;
|
||||
}
|
||||
|
||||
// ================================================= Helper Methods ================================================
|
||||
|
@ -127,8 +130,6 @@ public final class HyphenatedWordsFilter extends TokenFilter {
|
|||
* Writes the joined unhyphenated term
|
||||
*/
|
||||
private void unhyphenate() {
|
||||
int endOffset = offsetAttribute.endOffset();
|
||||
|
||||
restoreState(savedState);
|
||||
savedState = null;
|
||||
|
||||
|
@ -140,7 +141,7 @@ public final class HyphenatedWordsFilter extends TokenFilter {
|
|||
|
||||
hyphenated.getChars(0, length, term, 0);
|
||||
termAttribute.setLength(length);
|
||||
offsetAttribute.setOffset(offsetAttribute.startOffset(), endOffset);
|
||||
offsetAttribute.setOffset(offsetAttribute.startOffset(), lastEndOffset);
|
||||
hyphenated.setLength(0);
|
||||
}
|
||||
}
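A hedged worked example (invented input, not from this patch) of why the end offset is remembered in lastEndOffset:

    // For the text "play-\nground", a whitespace tokenizer yields "play-" (offsets 0..5)
    // and "ground" (offsets 6..12). lastEndOffset records 12 while "ground" is consumed,
    // so the joined token "playground" is emitted with offsets 0..12 even when
    // unhyphenate() runs after the input is exhausted, at which point the live
    // offsetAttribute no longer describes a real token.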
|
||||
|
|
|
@ -183,31 +183,33 @@ public final class PatternAnalyzer extends Analyzer {
|
|||
*
|
||||
* @param fieldName
|
||||
* the name of the field to tokenize (currently ignored).
|
||||
* @param reader
|
||||
* reader (e.g. charfilter) of the original text. can be null.
|
||||
* @param text
|
||||
* the string to tokenize
|
||||
* @return a new token stream
|
||||
*/
|
||||
public TokenStreamComponents createComponents(String fieldName, String text) {
|
||||
public TokenStreamComponents createComponents(String fieldName, Reader reader, String text) {
|
||||
// Ideally the Analyzer superclass should have a method with the same signature,
|
||||
// with a default impl that simply delegates to the StringReader flavour.
|
||||
if (text == null)
|
||||
throw new IllegalArgumentException("text must not be null");
|
||||
|
||||
if (pattern == NON_WORD_PATTERN) { // fast path
|
||||
return new TokenStreamComponents(new FastStringTokenizer(text, true, toLowerCase, stopWords));
|
||||
return new TokenStreamComponents(new FastStringTokenizer(reader, text, true, toLowerCase, stopWords));
|
||||
} else if (pattern == WHITESPACE_PATTERN) { // fast path
|
||||
return new TokenStreamComponents(new FastStringTokenizer(text, false, toLowerCase, stopWords));
|
||||
return new TokenStreamComponents(new FastStringTokenizer(reader, text, false, toLowerCase, stopWords));
|
||||
}
|
||||
|
||||
Tokenizer tokenizer = new PatternTokenizer(text, pattern, toLowerCase);
|
||||
Tokenizer tokenizer = new PatternTokenizer(reader, text, pattern, toLowerCase);
|
||||
TokenStream result = (stopWords != null) ? new StopFilter(matchVersion, tokenizer, stopWords) : tokenizer;
|
||||
return new TokenStreamComponents(tokenizer, result);
|
||||
}
|
||||
|
||||
/**
|
||||
* Creates a token stream that tokenizes all the text in the given Reader;
|
||||
* This implementation forwards to <code>tokenStream(String, String)</code> and is
|
||||
* less efficient than <code>tokenStream(String, String)</code>.
|
||||
* This implementation forwards to <code>tokenStream(String, Reader, String)</code> and is
|
||||
* less efficient than <code>tokenStream(String, Reader, String)</code>.
|
||||
*
|
||||
* @param fieldName
|
||||
* the name of the field to tokenize (currently ignored).
|
||||
|
@ -219,7 +221,7 @@ public final class PatternAnalyzer extends Analyzer {
|
|||
public TokenStreamComponents createComponents(String fieldName, Reader reader) {
|
||||
try {
|
||||
String text = toString(reader);
|
||||
return createComponents(fieldName, text);
|
||||
return createComponents(fieldName, reader, text);
|
||||
} catch (IOException e) {
|
||||
throw new RuntimeException(e);
|
||||
}
|
||||
|
@ -332,7 +334,8 @@ public final class PatternAnalyzer extends Analyzer {
|
|||
private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
|
||||
private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);
|
||||
|
||||
public PatternTokenizer(String str, Pattern pattern, boolean toLowerCase) {
|
||||
public PatternTokenizer(Reader input, String str, Pattern pattern, boolean toLowerCase) {
|
||||
super(input);
|
||||
this.pattern = pattern;
|
||||
this.str = str;
|
||||
this.matcher = pattern.matcher(str);
|
||||
|
@ -359,7 +362,7 @@ public final class PatternAnalyzer extends Analyzer {
|
|||
String text = str.substring(start, end);
|
||||
if (toLowerCase) text = text.toLowerCase(locale);
|
||||
termAtt.setEmpty().append(text);
|
||||
offsetAtt.setOffset(start, end);
|
||||
offsetAtt.setOffset(correctOffset(start), correctOffset(end));
|
||||
return true;
|
||||
}
|
||||
if (!isMatch) return false;
|
||||
|
@ -369,7 +372,7 @@ public final class PatternAnalyzer extends Analyzer {
|
|||
@Override
|
||||
public final void end() {
|
||||
// set final offset
|
||||
final int finalOffset = str.length();
|
||||
final int finalOffset = correctOffset(str.length());
|
||||
this.offsetAtt.setOffset(finalOffset, finalOffset);
|
||||
}
|
||||
|
||||
|
@ -406,7 +409,8 @@ public final class PatternAnalyzer extends Analyzer {
|
|||
private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
|
||||
private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);
|
||||
|
||||
public FastStringTokenizer(String str, boolean isLetter, boolean toLowerCase, Set<?> stopWords) {
|
||||
public FastStringTokenizer(Reader input, String str, boolean isLetter, boolean toLowerCase, Set<?> stopWords) {
|
||||
super(input);
|
||||
this.str = str;
|
||||
this.isLetter = isLetter;
|
||||
this.toLowerCase = toLowerCase;
|
||||
|
@ -458,7 +462,7 @@ public final class PatternAnalyzer extends Analyzer {
|
|||
return false;
|
||||
}
|
||||
termAtt.setEmpty().append(text);
|
||||
offsetAtt.setOffset(start, i);
|
||||
offsetAtt.setOffset(correctOffset(start), correctOffset(i));
|
||||
return true;
|
||||
}
|
||||
|
||||
|
@ -466,7 +470,7 @@ public final class PatternAnalyzer extends Analyzer {
|
|||
public final void end() {
|
||||
// set final offset
|
||||
final int finalOffset = str.length();
|
||||
this.offsetAtt.setOffset(finalOffset, finalOffset);
|
||||
this.offsetAtt.setOffset(correctOffset(finalOffset), correctOffset(finalOffset));
|
||||
}
|
||||
|
||||
private boolean isTokenChar(char c, boolean isLetter) {
|
||||
|
@ -479,6 +483,7 @@ public final class PatternAnalyzer extends Analyzer {
|
|||
|
||||
@Override
|
||||
public void reset(Reader input) throws IOException {
|
||||
super.reset(input);
|
||||
this.str = PatternAnalyzer.toString(input);
|
||||
}
|
||||
|
||||
|
|
|
@ -68,7 +68,7 @@ public final class TrimFilter extends TokenFilter {
|
|||
} else {
|
||||
termAtt.setEmpty();
|
||||
}
|
||||
if (updateOffsets) {
|
||||
if (updateOffsets && len == offsetAtt.endOffset() - offsetAtt.startOffset()) {
|
||||
int newStart = offsetAtt.startOffset()+start;
|
||||
int newEnd = offsetAtt.endOffset() - (start<end ? endOff:0);
|
||||
offsetAtt.setOffset(newStart, newEnd);
|
||||
|
|
|
@ -405,10 +405,20 @@ public final class WordDelimiterFilter extends TokenFilter {
|
|||
clearAttributes();
|
||||
termAttribute.copyBuffer(savedBuffer, iterator.current, iterator.end - iterator.current);
|
||||
|
||||
int startOffSet = (isSingleWord || !hasIllegalOffsets) ? savedStartOffset + iterator.current : savedStartOffset;
|
||||
int endOffSet = (hasIllegalOffsets) ? savedEndOffset : savedStartOffset + iterator.end;
|
||||
|
||||
offsetAttribute.setOffset(startOffSet, endOffSet);
|
||||
int startOffset = savedStartOffset + iterator.current;
|
||||
int endOffset = savedStartOffset + iterator.end;
|
||||
|
||||
if (hasIllegalOffsets) {
|
||||
// historically this filter did this regardless for 'isSingleWord',
|
||||
// but we must do a sanity check:
|
||||
if (isSingleWord && startOffset <= savedEndOffset) {
|
||||
offsetAttribute.setOffset(startOffset, savedEndOffset);
|
||||
} else {
|
||||
offsetAttribute.setOffset(savedStartOffset, savedEndOffset);
|
||||
}
|
||||
} else {
|
||||
offsetAttribute.setOffset(startOffset, endOffset);
|
||||
}
|
||||
posIncAttribute.setPositionIncrement(position(false));
|
||||
typeAttribute.setType(savedType);
|
||||
}
|
||||
|
|
|
@ -74,7 +74,8 @@ public final class EdgeNGramTokenizer extends Tokenizer {
|
|||
private int gramSize;
|
||||
private Side side;
|
||||
private boolean started = false;
|
||||
private int inLen;
|
||||
private int inLen; // length of the input AFTER trim()
|
||||
private int charsRead; // length of the input
|
||||
private String inStr;
|
||||
|
||||
|
||||
|
@ -183,7 +184,11 @@ public final class EdgeNGramTokenizer extends Tokenizer {
|
|||
if (!started) {
|
||||
started = true;
|
||||
char[] chars = new char[1024];
|
||||
int charsRead = input.read(chars);
|
||||
charsRead = input.read(chars);
|
||||
if (charsRead < 0) {
|
||||
charsRead = inLen = 0;
|
||||
return false;
|
||||
}
|
||||
inStr = new String(chars, 0, charsRead).trim(); // remove any leading or trailing spaces
|
||||
inLen = inStr.length();
|
||||
gramSize = minGram;
|
||||
|
@ -211,7 +216,7 @@ public final class EdgeNGramTokenizer extends Tokenizer {
|
|||
@Override
|
||||
public final void end() {
|
||||
// set final offset
|
||||
final int finalOffset = inLen;
|
||||
final int finalOffset = correctOffset(charsRead);
|
||||
this.offsetAtt.setOffset(finalOffset, finalOffset);
|
||||
}
|
||||
|
||||
|
@ -225,5 +230,6 @@ public final class EdgeNGramTokenizer extends Tokenizer {
|
|||
public void reset() throws IOException {
|
||||
super.reset();
|
||||
started = false;
|
||||
charsRead = 0;
|
||||
}
|
||||
}
|
||||
|
|
|
@ -35,7 +35,8 @@ public final class NGramTokenizer extends Tokenizer {
|
|||
private int minGram, maxGram;
|
||||
private int gramSize;
|
||||
private int pos = 0;
|
||||
private int inLen;
|
||||
private int inLen; // length of the input AFTER trim()
|
||||
private int charsRead; // length of the input
|
||||
private String inStr;
|
||||
private boolean started = false;
|
||||
|
||||
|
@ -104,7 +105,11 @@ public final class NGramTokenizer extends Tokenizer {
|
|||
started = true;
|
||||
gramSize = minGram;
|
||||
char[] chars = new char[1024];
|
||||
input.read(chars);
|
||||
charsRead = input.read(chars);
|
||||
if (charsRead < 0) {
|
||||
charsRead = inLen = 0;
|
||||
return false;
|
||||
}
|
||||
inStr = new String(chars, 0, charsRead).trim(); // remove any leading or trailing spaces
|
||||
inLen = inStr.length();
|
||||
}
|
||||
|
@ -128,7 +133,7 @@ public final class NGramTokenizer extends Tokenizer {
|
|||
@Override
|
||||
public final void end() {
|
||||
// set final offset
|
||||
final int finalOffset = inLen;
|
||||
final int finalOffset = correctOffset(charsRead);
|
||||
this.offsetAtt.setOffset(finalOffset, finalOffset);
|
||||
}
|
||||
|
||||
|
@ -143,5 +148,6 @@ public final class NGramTokenizer extends Tokenizer {
|
|||
super.reset();
|
||||
started = false;
|
||||
pos = 0;
|
||||
charsRead = 0;
|
||||
}
|
||||
}
|
||||
|
|
|
@ -30,7 +30,7 @@ import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
|
|||
public final class PositionFilter extends TokenFilter {
|
||||
|
||||
/** Position increment to assign to all but the first token - default = 0 */
|
||||
private int positionIncrement = 0;
|
||||
private final int positionIncrement;
|
||||
|
||||
/** The first token must have non-zero positionIncrement **/
|
||||
private boolean firstTokenPositioned = false;
|
||||
|
@ -44,7 +44,7 @@ public final class PositionFilter extends TokenFilter {
|
|||
* @param input the input stream
|
||||
*/
|
||||
public PositionFilter(final TokenStream input) {
|
||||
super(input);
|
||||
this(input, 0);
|
||||
}
|
||||
|
||||
/**
|
||||
|
@ -56,7 +56,7 @@ public final class PositionFilter extends TokenFilter {
|
|||
* token from the input stream
|
||||
*/
|
||||
public PositionFilter(final TokenStream input, final int positionIncrement) {
|
||||
this(input);
|
||||
super(input);
|
||||
this.positionIncrement = positionIncrement;
|
||||
}
|
||||
|
||||
|
|
|
@ -68,6 +68,7 @@ public final class ThaiWordFilter extends TokenFilter {
|
|||
private CharTermAttribute clonedTermAtt = null;
|
||||
private OffsetAttribute clonedOffsetAtt = null;
|
||||
private boolean hasMoreTokensInClone = false;
|
||||
private boolean hasIllegalOffsets = false; // only if the length changed before this filter
|
||||
|
||||
/** Creates a new ThaiWordFilter with the specified match version. */
|
||||
public ThaiWordFilter(Version matchVersion, TokenStream input) {
|
||||
|
@ -86,7 +87,11 @@ public final class ThaiWordFilter extends TokenFilter {
|
|||
if (end != BreakIterator.DONE) {
|
||||
clonedToken.copyTo(this);
|
||||
termAtt.copyBuffer(clonedTermAtt.buffer(), start, end - start);
|
||||
offsetAtt.setOffset(clonedOffsetAtt.startOffset() + start, clonedOffsetAtt.startOffset() + end);
|
||||
if (hasIllegalOffsets) {
|
||||
offsetAtt.setOffset(clonedOffsetAtt.startOffset(), clonedOffsetAtt.endOffset());
|
||||
} else {
|
||||
offsetAtt.setOffset(clonedOffsetAtt.startOffset() + start, clonedOffsetAtt.startOffset() + end);
|
||||
}
|
||||
if (handlePosIncr) posAtt.setPositionIncrement(1);
|
||||
return true;
|
||||
}
|
||||
|
@@ -102,6 +107,10 @@ public final class ThaiWordFilter extends TokenFilter {
|
|||
}
|
||||
|
||||
hasMoreTokensInClone = true;
|
||||
|
||||
// if length by start + end offsets doesn't match the term text then assume
|
||||
// this is a synonym and don't adjust the offsets.
|
||||
hasIllegalOffsets = offsetAtt.endOffset() - offsetAtt.startOffset() != termAtt.length();
|
||||
|
||||
// we lazy init the cloned token, as in ctor not all attributes may be added
|
||||
if (clonedToken == null) {
|
||||
|
@@ -118,7 +127,11 @@ public final class ThaiWordFilter extends TokenFilter {
|
|||
int end = breaker.next();
|
||||
if (end != BreakIterator.DONE) {
|
||||
termAtt.setLength(end);
|
||||
offsetAtt.setOffset(clonedOffsetAtt.startOffset(), clonedOffsetAtt.startOffset() + end);
|
||||
if (hasIllegalOffsets) {
|
||||
offsetAtt.setOffset(clonedOffsetAtt.startOffset(), clonedOffsetAtt.endOffset());
|
||||
} else {
|
||||
offsetAtt.setOffset(clonedOffsetAtt.startOffset(), clonedOffsetAtt.startOffset() + end);
|
||||
}
|
||||
// position increment keeps as it is for first token
|
||||
return true;
|
||||
}
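A small worked example (hypothetical numbers, not taken from the patch) of the check the ThaiWordFilter change above introduces: when a token's offset span no longer matches its text length, for instance after a length-changing CharFilter or a synonym injection, per-break offsets would point at the wrong characters, so the whole original span is kept for every subword.

// Hypothetical token:
int startOffset = 5, endOffset = 8;   // the token covers 3 characters of the original input
int termLength  = 4;                  // but its text is 4 characters long
boolean hasIllegalOffsets = (endOffset - startOffset) != termLength;  // true
// true  -> every subword keeps the original (5, 8) span
// false -> subwords get startOffset + breakStart / startOffset + breakEnd, as in the patch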
|
||||
|
|
|
@@ -306,13 +306,14 @@ public final class WikipediaTokenizer extends Tokenizer {
|
|||
@Override
|
||||
public void reset() throws IOException {
|
||||
super.reset();
|
||||
scanner.yyreset(input);
|
||||
tokens = null;
|
||||
scanner.reset();
|
||||
}
|
||||
|
||||
@Override
|
||||
public void reset(Reader reader) throws IOException {
|
||||
super.reset(reader);
|
||||
reset();
|
||||
scanner.yyreset(input);
|
||||
}
|
||||
|
||||
@Override
|
||||
|
|
|
@@ -1,4 +1,4 @@
|
|||
/* The following code was generated by JFlex 1.5.0-SNAPSHOT on 9/30/11 12:11 PM */
|
||||
/* The following code was generated by JFlex 1.5.0-SNAPSHOT on 1/22/12 10:26 PM */
|
||||
|
||||
package org.apache.lucene.analysis.wikipedia;
|
||||
|
||||
|
@@ -25,8 +25,8 @@ import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
|
|||
/**
|
||||
* This class is a scanner generated by
|
||||
* <a href="http://www.jflex.de/">JFlex</a> 1.5.0-SNAPSHOT
|
||||
* on 9/30/11 12:11 PM from the specification file
|
||||
* <tt>/lucene/jflex/modules/analysis/common/src/java/org/apache/lucene/analysis/wikipedia/WikipediaTokenizerImpl.jflex</tt>
|
||||
* on 1/22/12 10:26 PM from the specification file
|
||||
* <tt>/home/rmuir/workspace/lucene-clean-trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/wikipedia/WikipediaTokenizerImpl.jflex</tt>
|
||||
*/
|
||||
class WikipediaTokenizerImpl {
|
||||
|
||||
|
@@ -498,6 +498,14 @@ final int setText(StringBuilder buffer){
|
|||
return length;
|
||||
}
|
||||
|
||||
final void reset() {
|
||||
currentTokType = 0;
|
||||
numBalanced = 0;
|
||||
positionInc = 1;
|
||||
numLinkToks = 0;
|
||||
numWikiTokensSeen = 0;
|
||||
}
|
||||
|
||||
|
||||
|
||||
|
||||
|
|
|
@@ -91,6 +91,14 @@ final int setText(StringBuilder buffer){
|
|||
return length;
|
||||
}
|
||||
|
||||
final void reset() {
|
||||
currentTokType = 0;
|
||||
numBalanced = 0;
|
||||
positionInc = 1;
|
||||
numLinkToks = 0;
|
||||
numWikiTokensSeen = 0;
|
||||
}
|
||||
|
||||
|
||||
%}
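A hedged usage sketch (not part of the patch) of the reuse scenario the WikipediaTokenizer and WikipediaTokenizerImpl changes address: reset(Reader) now also clears the generated scanner's counters (currentTokType, numBalanced, positionInc, numLinkToks, numWikiTokensSeen), so a reused tokenizer does not leak state from the previous document. The sample inputs are arbitrary.

import java.io.StringReader;
import org.apache.lucene.analysis.wikipedia.WikipediaTokenizer;

public class WikipediaTokenizerReuseSketch {
  public static void main(String[] args) throws Exception {
    WikipediaTokenizer tok = new WikipediaTokenizer(new StringReader("[[link|first]] doc"));
    while (tok.incrementToken()) {
      // drain the first document
    }
    tok.end();

    tok.reset(new StringReader("plain second doc"));
    while (tok.incrementToken()) {
      // with the patch, no wiki-link state is carried over from the first document
    }
    tok.end();
    tok.close();
  }
}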
|
||||
|
||||
|
|
|
@@ -23,6 +23,7 @@ import java.io.InputStream;
|
|||
import java.io.InputStreamReader;
|
||||
import java.io.Reader;
|
||||
import java.io.StringReader;
|
||||
import java.util.Arrays;
|
||||
import java.util.HashSet;
|
||||
import java.util.Set;
|
||||
|
||||
|
@@ -31,7 +32,7 @@ import org.apache.lucene.analysis.BaseTokenStreamTestCase;
|
|||
import org.apache.lucene.analysis.CharReader;
|
||||
import org.apache.lucene.analysis.MockTokenizer;
|
||||
import org.apache.lucene.analysis.Tokenizer;
|
||||
import org.junit.Ignore;
|
||||
import org.apache.lucene.util._TestUtil;
|
||||
|
||||
public class HTMLStripCharFilterTest extends BaseTokenStreamTestCase {
|
||||
|
||||
|
@@ -41,9 +42,9 @@ public class HTMLStripCharFilterTest extends BaseTokenStreamTestCase {
|
|||
String html = "<div class=\"foo\">this is some text</div> here is a <a href=\"#bar\">link</a> and " +
|
||||
"another <a href=\"http://lucene.apache.org/\">link</a>. " +
|
||||
"This is an entity: & plus a <. Here is an &. <!-- is a comment -->";
|
||||
String gold = " this is some text here is a link and " +
|
||||
"another link . " +
|
||||
"This is an entity: & plus a <. Here is an &. ";
|
||||
String gold = "\nthis is some text\n here is a link and " +
|
||||
"another link. " +
|
||||
"This is an entity: & plus a <. Here is an &. ";
|
||||
HTMLStripCharFilter reader = new HTMLStripCharFilter(CharReader.get(new StringReader(html)));
|
||||
StringBuilder builder = new StringBuilder();
|
||||
int ch = -1;
|
||||
|
@@ -56,7 +57,8 @@ public class HTMLStripCharFilterTest extends BaseTokenStreamTestCase {
|
|||
+ " Buffer so far: " + builder + "<EOB>", theChar == goldArray[position]);
|
||||
position++;
|
||||
}
|
||||
assertEquals(gold, builder.toString());
|
||||
assertEquals("'" + builder.toString() + "' is not equal to '" + gold + "'",
|
||||
gold, builder.toString());
|
||||
}
|
||||
|
||||
//Some sanity checks, but not a full-fledged check
|
||||
|
@@ -77,6 +79,24 @@ public class HTMLStripCharFilterTest extends BaseTokenStreamTestCase {
|
|||
|
||||
}
|
||||
|
||||
public void testMSWord14GeneratedHTML() throws Exception {
|
||||
InputStream stream = getClass().getResourceAsStream("MS-Word 14 generated.htm");
|
||||
HTMLStripCharFilter reader = new HTMLStripCharFilter(CharReader.get(new InputStreamReader(stream, "UTF-8")));
|
||||
String gold = "This is a test";
|
||||
StringBuilder builder = new StringBuilder();
|
||||
int ch = 0;
|
||||
try {
|
||||
while ((ch = reader.read()) != -1){
|
||||
builder.append((char)ch);
|
||||
}
|
||||
} finally {
|
||||
// System.out.println("String: " + builder.toString());
|
||||
}
|
||||
assertEquals("'" + builder.toString().trim() + "' is not equal to '" + gold + "'",
|
||||
gold, builder.toString().trim());
|
||||
}
|
||||
|
||||
|
||||
public void testGamma() throws Exception {
|
||||
String test = "&Gamma;";
|
||||
String gold = "\u0393";
|
||||
|
@@ -89,9 +109,7 @@ public class HTMLStripCharFilterTest extends BaseTokenStreamTestCase {
|
|||
builder.append((char)ch);
|
||||
}
|
||||
String result = builder.toString();
|
||||
// System.out.println("Resu: " + result + "<EOL>");
|
||||
// System.out.println("Gold: " + gold + "<EOL>");
|
||||
assertTrue(result + " is not equal to " + gold + "<EOS>", result.equals(gold) == true);
|
||||
assertEquals("'" + result + "' is not equal to '" + gold + "<EOS>'", gold, result);
|
||||
}
|
||||
|
||||
public void testEntities() throws Exception {
|
||||
|
@@ -106,9 +124,7 @@ public class HTMLStripCharFilterTest extends BaseTokenStreamTestCase {
|
|||
builder.append((char)ch);
|
||||
}
|
||||
String result = builder.toString();
|
||||
// System.out.println("Resu: " + result + "<EOL>");
|
||||
// System.out.println("Gold: " + gold + "<EOL>");
|
||||
assertTrue(result + " is not equal to " + gold + "<EOS>", result.equals(gold) == true);
|
||||
assertEquals("'" + result + "' is not equal to '" + gold + "<EOS>'", gold, result);
|
||||
}
|
||||
|
||||
public void testMoreEntities() throws Exception {
|
||||
|
@@ -123,9 +139,7 @@ public class HTMLStripCharFilterTest extends BaseTokenStreamTestCase {
|
|||
builder.append((char)ch);
|
||||
}
|
||||
String result = builder.toString();
|
||||
// System.out.println("Resu: " + result + "<EOL>");
|
||||
// System.out.println("Gold: " + gold + "<EOL>");
|
||||
assertTrue(result + " is not equal to " + gold, result.equals(gold) == true);
|
||||
assertEquals("'" + result + "' is not equal to '" + gold + "<EOS>'", gold, result);
|
||||
}
|
||||
|
||||
public void testReserved() throws Exception {
|
||||
|
@@ -147,45 +161,248 @@ public class HTMLStripCharFilterTest extends BaseTokenStreamTestCase {
|
|||
}
|
||||
|
||||
public void testMalformedHTML() throws Exception {
|
||||
String test = "a <a hr<ef=aa<a>> </close</a>";
|
||||
String gold = "a <a hr<ef=aa > </close ";
|
||||
Reader reader = new HTMLStripCharFilter(CharReader.get(new StringReader(test)));
|
||||
StringBuilder builder = new StringBuilder();
|
||||
int ch = 0;
|
||||
while ((ch = reader.read()) != -1){
|
||||
builder.append((char)ch);
|
||||
String[] testGold = {
|
||||
"a <a hr<ef=aa<a>> </close</a>",
|
||||
"a <a hr<ef=aa> </close",
|
||||
|
||||
"<a href=http://dmoz.org/cgi-bin/add.cgi?where=/arts/\" class=lu style=\"font-size: 9px\" target=dmoz>Submit a Site</a>",
|
||||
"Submit a Site",
|
||||
|
||||
"<a href=javascript:ioSwitch('p8','http://www.csmonitor.com/') title=expand id=e8 class=expanded rel=http://www.csmonitor.com/>Christian Science",
|
||||
"Christian Science",
|
||||
|
||||
"<link rel=\"alternate\" type=\"application/rss+xml\" title=\"San Francisco \" 2008 RSS Feed\" href=\"http://2008.sf.wordcamp.org/feed/\" />",
|
||||
"\n",
|
||||
|
||||
// "<" before ">" inhibits tag recognition
|
||||
"<a href=\" http://www.surgery4was.happyhost.org/video-of-arthroscopic-knee-surgery symptoms.html, heat congestive heart failure <a href=\" http://www.symptoms1bad.happyhost.org/canine",
|
||||
"<a href=\" http://www.surgery4was.happyhost.org/video-of-arthroscopic-knee-surgery symptoms.html, heat congestive heart failure <a href=\" http://www.symptoms1bad.happyhost.org/canine",
|
||||
|
||||
"<a href=\"http://ucblibraries.colorado.edu/how/index.htm\"class=\"pageNavAreaText\">",
|
||||
"",
|
||||
|
||||
"<link title=\"^\\\" 21Sta's Blog\" rel=\"search\" type=\"application/opensearchdescription+xml\" href=\"http://21sta.com/blog/inc/opensearch.php\" />",
|
||||
"\n",
|
||||
|
||||
"<a href=\"#postcomment\" title=\"\"Leave a comment\";\">?",
|
||||
"?",
|
||||
|
||||
"<a href='/modern-furniture' ' id='21txt' class='offtab' onMouseout=\"this.className='offtab'; return true;\" onMouseover=\"this.className='ontab'; return true;\">",
|
||||
"",
|
||||
|
||||
"<a href='http://alievi.wordpress.com/category/01-todos-posts/' style='font-size: 275%; padding: 1px; margin: 1px;' title='01 - Todos Post's (83)'>",
|
||||
"",
|
||||
|
||||
"The <a href=<a href=\"http://www.advancedmd.com>medical\">http://www.advancedmd.com>medical</a> practice software</a>",
|
||||
"The <a href=medical\">http://www.advancedmd.com>medical practice software",
|
||||
|
||||
"<a href=\"node/21426\" class=\"clipTitle2\" title=\"Levi.com/BMX 2008 Clip of the Week 29 \"Morgan Wade Leftover Clips\"\">Levi.com/BMX 2008 Clip of the Week 29...",
|
||||
"Levi.com/BMX 2008 Clip of the Week 29...",
|
||||
|
||||
"<a href=\"printer_friendly.php?branch=&year=&submit=go&screen=\";\">Printer Friendly",
|
||||
"Printer Friendly",
|
||||
|
||||
"<a href=#\" ondragstart=\"return false\" onclick=\"window.external.AddFavorite('http://www.amazingtextures.com', 'Amazing Textures');return false\" onmouseover=\"window.status='Add to Favorites';return true\">Add to Favorites",
|
||||
"Add to Favorites",
|
||||
|
||||
"<a href=\"../at_home/at_home_search.html\"../_home/at_home_search.html\">At",
|
||||
"At",
|
||||
|
||||
"E-mail: <a href=\"\"mailto:XXXXXX@example.com\" \">XXXXXX@example.com </a>",
|
||||
"E-mail: XXXXXX@example.com ",
|
||||
|
||||
"<li class=\"farsi\"><a title=\"A'13?\" alt=\"A'13?\" href=\"http://www.america.gov/persian\" alt=\"\" name=\"A'13?\"A'13? title=\"A'13?\">A'13?</a></li>",
|
||||
"\nA'13?\n",
|
||||
|
||||
"<li><a href=\"#28\" title=\"Hubert \"Geese\" Ausby\">Hubert \"Geese\" Ausby</a></li>",
|
||||
"\nHubert \"Geese\" Ausby\n",
|
||||
|
||||
"<href=\"http://anbportal.com/mms/login.asp\">",
|
||||
"\n",
|
||||
|
||||
"<a href=\"",
|
||||
"<a href=\"",
|
||||
|
||||
"<a href=\">",
|
||||
"",
|
||||
|
||||
"<a rel=\"nofollow\" href=\"http://anissanina31.skyrock.com/1895039493-Hi-tout-le-monde.html\" title=\" Hi, tout le monde !>#</a>",
|
||||
"#",
|
||||
|
||||
"<a href=\"http://annunciharleydavidsonusate.myblog.it/\" title=\"Annunci Moto e Accessori Harley Davidson\" target=\"_blank\"><img src=\"http://annunciharleydavidsonusate.myblog.it/images/Antipixel.gif\" /></a>",
|
||||
"",
|
||||
|
||||
"<a href=\"video/addvideo&v=120838887181\" onClick=\"return confirm('Are you sure you want add this video to your profile? If it exists some video in your profile will be overlapped by this video!!')\" \" onmouseover=\"this.className='border2'\" onmouseout=\"this.className=''\">",
|
||||
"",
|
||||
|
||||
"<a href=#Services & Support>",
|
||||
"",
|
||||
|
||||
// "<" and ">" chars are accepted in on[Event] attribute values
|
||||
"<input type=\"image\" src=\"http://apologyindex.com/ThemeFiles/83401-72905/images/btn_search.gif\"value=\"Search\" name=\"Search\" alt=\"Search\" class=\"searchimage\" onclick=\"incom ='&sc=' + document.getElementById('sel').value ; var dt ='&dt=' + document.getElementById('dt').value; var searchKeyword = document.getElementById('q').value ; searchKeyword = searchKeyword.replace(/\\s/g,''); if (searchKeyword.length < 3){alert('Nothing to search. Search keyword should contain atleast 3 chars.'); return false; } var al='&al=' + document.getElementById('advancedlink').style.display ; document.location.href='http://apologyindex.com/search.aspx?q=' + document.getElementById('q').value + incom + dt + al;\" />",
|
||||
"",
|
||||
|
||||
"<input type=\"image\" src=\"images/afbe.gif\" width=\"22\" height=\"22\" hspace=\"4\" title=\"Add to Favorite\" alt=\"Add to Favorite\"onClick=\" if(window.sidebar){ window.sidebar.addPanel(document.title,location.href,''); }else if(window.external){ window.external.AddFavorite(location.href,document.title); }else if(window.opera&&window.print) { return true; }\">",
|
||||
"",
|
||||
|
||||
"<area shape=\"rect\" coords=\"12,153,115,305\" href=\"http://statenislandtalk.com/v-web/gallery/Osmundsen-family\"Art's Norwegian Roots in Rogaland\">",
|
||||
"\n",
|
||||
|
||||
"<a rel=\"nofollow\" href=\"http://arth26.skyrock.com/660188240-bonzai.html\" title=\"bonza>#",
|
||||
"#",
|
||||
|
||||
"<a href= >",
|
||||
"",
|
||||
|
||||
"<ahref=http:..",
|
||||
"<ahref=http:..",
|
||||
|
||||
"<ahref=http:..>",
|
||||
"\n",
|
||||
|
||||
"<ahref=\"http://aseigo.bddf.ca/cms/1025\">A",
|
||||
"\nA",
|
||||
|
||||
"<a href=\"javascript:calendar_window=window.open('/calendar.aspx?formname=frmCalendar.txtDate','calendar_window','width=154,height=188');calendar_window.focus()\">",
|
||||
"",
|
||||
|
||||
"<a href=\"/applications/defenseaerospace/19+rackmounts\" title=\"19\" Rackmounts\">",
|
||||
"",
|
||||
|
||||
"<a href=http://www.azimprimerie.fr/flash/backup/lewes-zip-code/savage-model-110-manual.html title=savage model 110 manual rel=dofollow>",
|
||||
"",
|
||||
|
||||
"<a class=\"at\" name=\"Lamborghini href=\"http://lamborghini.coolbegin.com\">Lamborghini /a>",
|
||||
"Lamborghini /a>",
|
||||
|
||||
"<A href='newslink.php?news_link=http%3A%2F%2Fwww.worldnetdaily.com%2Findex.php%3Ffa%3DPAGE.view%26pageId%3D85729&news_title=Florida QB makes 'John 3:16' hottest Google search Tebow inscribed Bible reference on eye black for championship game' TARGET=_blank>",
|
||||
"",
|
||||
|
||||
"<a href=/myspace !style='color:#993333'>",
|
||||
"",
|
||||
|
||||
"<meta name=3DProgId content=3DExcel.Sheet>",
|
||||
"\n",
|
||||
|
||||
"<link id=3D\"shLink\" href=3D\"PSABrKelly-BADMINTONCupResults08FINAL2008_09_19=_files/sheet004.htm\">",
|
||||
"\n",
|
||||
|
||||
"<td bgcolor=3D\"#FFFFFF\" nowrap>",
|
||||
"\n",
|
||||
|
||||
"<a href=\"http://basnect.info/usersearch/\"predicciones-mundiales-2009\".html\">\"predicciones mundiales 2009\"</a>",
|
||||
"\"predicciones mundiales 2009\"",
|
||||
|
||||
"<a class=\"comment-link\" href=\"https://www.blogger.com/comment.g?blogID=19402125&postID=114070605958684588\"location.href=https://www.blogger.com/comment.g?blogID=19402125&postID=114070605958684588;>",
|
||||
"",
|
||||
|
||||
"<a href = \"/videos/Bishop\"/\" title = \"click to see more Bishop\" videos\">Bishop\"</a>",
|
||||
"Bishop\"",
|
||||
|
||||
"<a href=\"http://bhaa.ie/calendar/event.php?eid=20081203150127531\"\">BHAA Eircom 2 & 5 miles CC combined start</a>",
|
||||
"BHAA Eircom 2 & 5 miles CC combined start",
|
||||
|
||||
"<a href=\"http://people.tribe.net/wolfmana\" onClick='setClick(\"Application[tribe].Person[bb7df210-9dc0-478c-917f-436b896bcb79]\")'\" title=\"Mana\">",
|
||||
"",
|
||||
|
||||
"<a href=\"http://blog.edu-cyberpg.com/ct.ashx?id=6143c528-080c-4bb2-b765-5ec56c8256d3&url=http%3a%2f%2fwww.gsa.ac.uk%2fmackintoshsketchbook%2f\"\" eudora=\"autourl\">",
|
||||
"",
|
||||
|
||||
// "<" before ">" inhibits tag recognition
|
||||
"<input type=\"text\" value=\"<search here>\">",
|
||||
"<input type=\"text\" value=\"\n\">",
|
||||
|
||||
"<input type=\"text\" value=\"<search here\">",
|
||||
"<input type=\"text\" value=\"\n",
|
||||
|
||||
"<input type=\"text\" value=\"search here>\">",
|
||||
"\">",
|
||||
|
||||
// "<" and ">" chars are accepted in on[Event] attribute values
|
||||
"<input type=\"text\" value=\"<search here>\" onFocus=\"this.value='<search here>'\">",
|
||||
"",
|
||||
|
||||
"<![if ! IE]>\n<link href=\"http://i.deviantart.com/icons/favicon.png\" rel=\"shortcut icon\"/>\n<![endif]>",
|
||||
"\n\n\n",
|
||||
|
||||
"<![if supportMisalignedColumns]>\n<tr height=0 style='display:none'>\n<td width=64 style='width:48pt'></td>\n</tr>\n<![endif]>",
|
||||
"\n\n\n\n\n\n\n\n",
|
||||
};
|
||||
for (int i = 0 ; i < testGold.length ; i += 2) {
|
||||
String test = testGold[i];
|
||||
String gold = testGold[i + 1];
|
||||
Reader reader = new HTMLStripCharFilter(CharReader.get(new StringReader(test)));
|
||||
StringBuilder builder = new StringBuilder();
|
||||
int ch = 0;
|
||||
while ((ch = reader.read()) != -1){
|
||||
builder.append((char)ch);
|
||||
}
|
||||
String result = builder.toString();
|
||||
assertEquals("Test: '" + test + "'", gold, result);
|
||||
}
|
||||
String result = builder.toString();
|
||||
// System.out.println("Resu: " + result + "<EOL>");
|
||||
// System.out.println("Gold: " + gold + "<EOL>");
|
||||
assertTrue(result + " is not equal to " + gold + "<EOS>", result.equals(gold) == true);
|
||||
}
|
||||
|
||||
|
||||
public void testBufferOverflow() throws Exception {
|
||||
StringBuilder testBuilder = new StringBuilder(HTMLStripCharFilter.DEFAULT_READ_AHEAD + 50);
|
||||
StringBuilder testBuilder = new StringBuilder(HTMLStripCharFilter.getInitialBufferSize() + 50);
|
||||
testBuilder.append("ah<?> ??????");
|
||||
appendChars(testBuilder, HTMLStripCharFilter.DEFAULT_READ_AHEAD + 500);
|
||||
appendChars(testBuilder, HTMLStripCharFilter.getInitialBufferSize() + 500);
|
||||
processBuffer(testBuilder.toString(), "Failed on pseudo proc. instr.");//processing instructions
|
||||
|
||||
testBuilder.setLength(0);
|
||||
testBuilder.append("<!--");//comments
|
||||
appendChars(testBuilder, 3*HTMLStripCharFilter.DEFAULT_READ_AHEAD + 500);//comments have two lookaheads
|
||||
appendChars(testBuilder, 3 * HTMLStripCharFilter.getInitialBufferSize() + 500);//comments have two lookaheads
|
||||
|
||||
testBuilder.append("-->foo");
|
||||
processBuffer(testBuilder.toString(), "Failed w/ comment");
|
||||
String gold = "foo";
|
||||
Reader reader = new HTMLStripCharFilter(CharReader.get(new StringReader(testBuilder.toString())));
|
||||
int ch = 0;
|
||||
StringBuilder builder = new StringBuilder();
|
||||
try {
|
||||
while ((ch = reader.read()) != -1){
|
||||
builder.append((char)ch);
|
||||
}
|
||||
} finally {
|
||||
// System.out.println("String: " + builder.toString());
|
||||
}
|
||||
assertEquals("'" + builder.toString() + "' is not equal to '" + gold + "'",
|
||||
gold, builder.toString());
|
||||
|
||||
testBuilder.setLength(0);
|
||||
testBuilder.append("<?");
|
||||
appendChars(testBuilder, HTMLStripCharFilter.DEFAULT_READ_AHEAD + 500);
|
||||
appendChars(testBuilder, HTMLStripCharFilter.getInitialBufferSize() + 500);
|
||||
testBuilder.append("?>");
|
||||
processBuffer(testBuilder.toString(), "Failed with proc. instr.");
|
||||
gold = "";
|
||||
reader = new HTMLStripCharFilter(CharReader.get(new StringReader(testBuilder.toString())));
|
||||
ch = 0;
|
||||
builder = new StringBuilder();
|
||||
try {
|
||||
while ((ch = reader.read()) != -1){
|
||||
builder.append((char)ch);
|
||||
}
|
||||
} finally {
|
||||
// System.out.println("String: " + builder.toString());
|
||||
}
|
||||
assertEquals("'" + builder.toString() + "' is not equal to '" + gold + "'",
|
||||
gold, builder.toString());
|
||||
|
||||
testBuilder.setLength(0);
|
||||
testBuilder.append("<b ");
|
||||
appendChars(testBuilder, HTMLStripCharFilter.DEFAULT_READ_AHEAD + 500);
|
||||
appendChars(testBuilder, HTMLStripCharFilter.getInitialBufferSize() + 500);
|
||||
testBuilder.append("/>");
|
||||
processBuffer(testBuilder.toString(), "Failed on tag");
|
||||
|
||||
gold = "";
|
||||
reader = new HTMLStripCharFilter(CharReader.get(new StringReader(testBuilder.toString())));
|
||||
ch = 0;
|
||||
builder = new StringBuilder();
|
||||
try {
|
||||
while ((ch = reader.read()) != -1){
|
||||
builder.append((char)ch);
|
||||
}
|
||||
} finally {
|
||||
// System.out.println("String: " + builder.toString());
|
||||
}
|
||||
assertEquals("'" + builder.toString() + "' is not equal to '" + gold + "'",
|
||||
gold, builder.toString());
|
||||
}
|
||||
|
||||
private void appendChars(StringBuilder testBuilder, int numChars) {
|
||||
|
@@ -208,13 +425,14 @@ public class HTMLStripCharFilterTest extends BaseTokenStreamTestCase {
|
|||
} finally {
|
||||
// System.out.println("String (trimmed): " + builder.toString().trim() + "<EOS>");
|
||||
}
|
||||
assertTrue(assertMsg + "::: " + builder.toString() + " is not equal to " + test, builder.toString().equals(test) == true);
|
||||
assertEquals(assertMsg + "::: " + builder.toString() + " is not equal to " + test,
|
||||
test, builder.toString());
|
||||
}
|
||||
|
||||
public void testComment() throws Exception {
|
||||
|
||||
String test = "<!--- three dashes, still a valid comment ---> ";
|
||||
String gold = " ";
|
||||
String gold = " ";
|
||||
Reader reader = new HTMLStripCharFilter(CharReader.get(new BufferedReader(new StringReader(test))));//force the use of BufferedReader
|
||||
int ch = 0;
|
||||
StringBuilder builder = new StringBuilder();
|
||||
|
@@ -225,7 +443,8 @@ public class HTMLStripCharFilterTest extends BaseTokenStreamTestCase {
|
|||
} finally {
|
||||
// System.out.println("String: " + builder.toString());
|
||||
}
|
||||
assertTrue(builder.toString() + " is not equal to " + gold + "<EOS>", builder.toString().equals(gold) == true);
|
||||
assertEquals("'" + builder.toString() + "' is not equal to '" + gold + "'",
|
||||
gold, builder.toString());
|
||||
}
|
||||
|
||||
|
||||
|
@@ -247,15 +466,32 @@ public class HTMLStripCharFilterTest extends BaseTokenStreamTestCase {
|
|||
}
|
||||
|
||||
public void testOffsets() throws Exception {
|
||||
doTestOffsets("hello X how X are you");
|
||||
// doTestOffsets("hello X how X are you");
|
||||
doTestOffsets("hello <p> X<p> how <p>X are you");
|
||||
doTestOffsets("X & X ( X < > X");
|
||||
|
||||
// test backtracking
|
||||
doTestOffsets("X < &zz >X &# < X > < &l > &g < X");
|
||||
}
|
||||
|
||||
@Ignore("broken offsets: see LUCENE-2208")
|
||||
|
||||
static void assertLegalOffsets(String in) throws Exception {
|
||||
int length = in.length();
|
||||
HTMLStripCharFilter reader = new HTMLStripCharFilter(CharReader.get(new BufferedReader(new StringReader(in))));
|
||||
int ch = 0;
|
||||
int off = 0;
|
||||
while ((ch = reader.read()) != -1) {
|
||||
int correction = reader.correctOffset(off);
|
||||
assertTrue("invalid offset correction: " + off + "->" + correction + " for doc of length: " + length,
|
||||
correction <= length);
|
||||
off++;
|
||||
}
|
||||
}
|
||||
|
||||
public void testLegalOffsets() throws Exception {
|
||||
assertLegalOffsets("hello world");
|
||||
assertLegalOffsets("hello &#x world");
|
||||
}
|
||||
|
||||
public void testRandom() throws Exception {
|
||||
Analyzer analyzer = new Analyzer() {
|
||||
|
||||
|
@@ -267,11 +503,361 @@ public class HTMLStripCharFilterTest extends BaseTokenStreamTestCase {
|
|||
|
||||
@Override
|
||||
protected Reader initReader(Reader reader) {
|
||||
return new HTMLStripCharFilter(CharReader.get(new BufferedReader(reader)));
|
||||
return new HTMLStripCharFilter(CharReader.get(reader));
|
||||
}
|
||||
};
|
||||
|
||||
int numRounds = RANDOM_MULTIPLIER * 10000;
|
||||
checkRandomData(random, analyzer, numRounds);
|
||||
}
|
||||
|
||||
public void testServerSideIncludes() throws Exception {
|
||||
String test = "one<img src=\"image.png\"\n"
|
||||
+ " alt = \"Alt: <!--#echo var='${IMAGE_CAPTION:<!--comment-->\\'Comment\\'}' -->\"\n\n"
|
||||
+ " title=\"Title: <!--#echo var=\"IMAGE_CAPTION\"-->\">two";
|
||||
String gold = "onetwo";
|
||||
Reader reader = new HTMLStripCharFilter(CharReader.get(new StringReader(test)));
|
||||
int ch = 0;
|
||||
StringBuilder builder = new StringBuilder();
|
||||
try {
|
||||
while ((ch = reader.read()) != -1){
|
||||
builder.append((char)ch);
|
||||
}
|
||||
} finally {
|
||||
// System.out.println("String: " + builder.toString());
|
||||
}
|
||||
assertTrue(builder.toString() + " is not equal to " + gold, builder.toString().equals(gold));
|
||||
|
||||
test = "one<script><!-- <!--#config comment=\"<!-- \\\"comment\\\"-->\"--> --></script>two";
|
||||
gold = "one\ntwo";
|
||||
reader = new HTMLStripCharFilter(CharReader.get(new StringReader(test)));
|
||||
ch = 0;
|
||||
builder = new StringBuilder();
|
||||
try {
|
||||
while ((ch = reader.read()) != -1){
|
||||
builder.append((char)ch);
|
||||
}
|
||||
} finally {
|
||||
// System.out.println("String: " + builder.toString());
|
||||
}
|
||||
assertEquals("'" + builder.toString() + "' is not equal to '" + gold + "'",
|
||||
gold, builder.toString());
|
||||
}
|
||||
|
||||
public void testScriptQuotes() throws Exception {
|
||||
String test = "one<script attr= bare><!-- action('<!-- comment -->', \"\\\"-->\\\"\"); --></script>two";
|
||||
String gold = "one\ntwo";
|
||||
Reader reader = new HTMLStripCharFilter(CharReader.get(new StringReader(test)));
|
||||
int ch = 0;
|
||||
StringBuilder builder = new StringBuilder();
|
||||
try {
|
||||
while ((ch = reader.read()) != -1){
|
||||
builder.append((char)ch);
|
||||
}
|
||||
} finally {
|
||||
// System.out.println("String: " + builder.toString());
|
||||
}
|
||||
assertEquals("'" + builder.toString() + "' is not equal to '" + gold + "'",
|
||||
gold, builder.toString());
|
||||
|
||||
test = "hello<script><!-- f('<!--internal--></script>'); --></script>";
|
||||
gold = "hello\n";
|
||||
reader = new HTMLStripCharFilter(CharReader.get(new StringReader(test)));
|
||||
ch = 0;
|
||||
builder = new StringBuilder();
|
||||
try {
|
||||
while ((ch = reader.read()) != -1){
|
||||
builder.append((char)ch);
|
||||
}
|
||||
} finally {
|
||||
// System.out.println("String: " + builder.toString());
|
||||
}
|
||||
assertEquals("'" + builder.toString() + "' is not equal to '" + gold + "'",
|
||||
gold, builder.toString());
|
||||
}
|
||||
|
||||
public void testEscapeScript() throws Exception {
|
||||
String test = "one<script no-value-attr>callSomeMethod();</script>two";
|
||||
String gold = "one<script no-value-attr></script>two";
|
||||
Set<String> escapedTags = new HashSet<String>(Arrays.asList("SCRIPT"));
|
||||
Reader reader = new HTMLStripCharFilter
|
||||
(CharReader.get(new StringReader(test)), escapedTags);
|
||||
int ch = 0;
|
||||
StringBuilder builder = new StringBuilder();
|
||||
try {
|
||||
while ((ch = reader.read()) != -1){
|
||||
builder.append((char)ch);
|
||||
}
|
||||
} finally {
|
||||
// System.out.println("String: " + builder.toString());
|
||||
}
|
||||
assertEquals("'" + builder.toString() + "' is not equal to '" + gold + "'",
|
||||
gold, builder.toString());
|
||||
}
|
||||
|
||||
public void testStyle() throws Exception {
|
||||
String test = "one<style type=\"text/css\">\n"
|
||||
+ "<!--\n"
|
||||
+ "@import url('http://www.lasletrasdecanciones.com/css.css');\n"
|
||||
+ "-->\n"
|
||||
+ "</style>two";
|
||||
String gold = "one\ntwo";
|
||||
Reader reader = new HTMLStripCharFilter(CharReader.get(new StringReader(test)));
|
||||
int ch = 0;
|
||||
StringBuilder builder = new StringBuilder();
|
||||
try {
|
||||
while ((ch = reader.read()) != -1){
|
||||
builder.append((char)ch);
|
||||
}
|
||||
} finally {
|
||||
// System.out.println("String: " + builder.toString());
|
||||
}
|
||||
assertEquals("'" + builder.toString() + "' is not equal to '" + gold + "'",
|
||||
gold, builder.toString());
|
||||
}
|
||||
|
||||
public void testEscapeStyle() throws Exception {
|
||||
String test = "one<style type=\"text/css\"> body,font,a { font-family:arial; } </style>two";
|
||||
String gold = "one<style type=\"text/css\"></style>two";
|
||||
Set<String> escapedTags = new HashSet<String>(Arrays.asList("STYLE"));
|
||||
Reader reader = new HTMLStripCharFilter
|
||||
(CharReader.get(new StringReader(test)), escapedTags);
|
||||
int ch = 0;
|
||||
StringBuilder builder = new StringBuilder();
|
||||
try {
|
||||
while ((ch = reader.read()) != -1){
|
||||
builder.append((char)ch);
|
||||
}
|
||||
} finally {
|
||||
// System.out.println("String: " + builder.toString());
|
||||
}
|
||||
assertEquals("'" + builder.toString() + "' is not equal to '" + gold + "'",
|
||||
gold, builder.toString());
|
||||
}
|
||||
|
||||
public void testBR() throws Exception {
|
||||
String[] testGold = {
|
||||
"one<BR />two<br>three",
|
||||
"one\ntwo\nthree",
|
||||
|
||||
"one<BR some stuff here too>two</BR>",
|
||||
"one\ntwo\n",
|
||||
};
|
||||
for (int i = 0 ; i < testGold.length ; i += 2) {
|
||||
String test = testGold[i];
|
||||
String gold = testGold[i + 1];
|
||||
Reader reader = new HTMLStripCharFilter(CharReader.get(new StringReader(test)));
|
||||
StringBuilder builder = new StringBuilder();
|
||||
int ch = 0;
|
||||
while ((ch = reader.read()) != -1){
|
||||
builder.append((char)ch);
|
||||
}
|
||||
String result = builder.toString();
|
||||
assertEquals("Test: '" + test + "'", gold, result);
|
||||
}
|
||||
}
|
||||
public void testEscapeBR() throws Exception {
|
||||
String test = "one<BR class='whatever'>two</\nBR\n>";
|
||||
String gold = "one<BR class='whatever'>two</\nBR\n>";
|
||||
Set<String> escapedTags = new HashSet<String>(Arrays.asList("BR"));
|
||||
Reader reader = new HTMLStripCharFilter
|
||||
(CharReader.get(new StringReader(test)), escapedTags);
|
||||
int ch = 0;
|
||||
StringBuilder builder = new StringBuilder();
|
||||
try {
|
||||
while ((ch = reader.read()) != -1){
|
||||
builder.append((char)ch);
|
||||
}
|
||||
} finally {
|
||||
// System.out.println("String: " + builder.toString());
|
||||
}
|
||||
assertEquals("'" + builder.toString() + "' is not equal to '" + gold + "'",
|
||||
gold, builder.toString());
|
||||
}
|
||||
|
||||
public void testInlineTagsNoSpace() throws Exception {
|
||||
String test = "one<sPAn class=\"invisible\">two<sup>2<sup>e</sup></sup>.</SpaN>three";
|
||||
String gold = "onetwo2e.three";
|
||||
Reader reader = new HTMLStripCharFilter(CharReader.get(new StringReader(test)));
|
||||
int ch = 0;
|
||||
StringBuilder builder = new StringBuilder();
|
||||
try {
|
||||
while ((ch = reader.read()) != -1){
|
||||
builder.append((char)ch);
|
||||
}
|
||||
} finally {
|
||||
// System.out.println("String: " + builder.toString());
|
||||
}
|
||||
assertEquals("'" + builder.toString() + "' is not equal to '" + gold + "'",
|
||||
gold, builder.toString());
|
||||
}
|
||||
|
||||
public void testCDATA() throws Exception {
|
||||
String test = "one<![CDATA[<one><two>three<four></four></two></one>]]>two";
|
||||
String gold = "one<one><two>three<four></four></two></one>two";
|
||||
Reader reader = new HTMLStripCharFilter(CharReader.get(new StringReader(test)));
|
||||
int ch = 0;
|
||||
StringBuilder builder = new StringBuilder();
|
||||
try {
|
||||
while ((ch = reader.read()) != -1){
|
||||
builder.append((char)ch);
|
||||
}
|
||||
} finally {
|
||||
// System.out.println("String: " + builder.toString());
|
||||
}
|
||||
assertEquals("'" + builder.toString() + "' is not equal to '" + gold + "'",
|
||||
gold, builder.toString());
|
||||
|
||||
test = "one<![CDATA[two<![CDATA[three]]]]><![CDATA[>four]]>five";
|
||||
gold = "onetwo<![CDATA[three]]>fourfive";
|
||||
reader = new HTMLStripCharFilter(CharReader.get(new StringReader(test)));
|
||||
ch = 0;
|
||||
builder = new StringBuilder();
|
||||
try {
|
||||
while ((ch = reader.read()) != -1){
|
||||
builder.append((char)ch);
|
||||
}
|
||||
} finally {
|
||||
// System.out.println("String: " + builder.toString());
|
||||
}
|
||||
assertEquals("'" + builder.toString() + "' is not equal to '" + gold + "'",
|
||||
gold, builder.toString());
|
||||
}
|
||||
|
||||
public void testUppercaseCharacterEntityVariants() throws Exception {
|
||||
String test = " "-©>><<®&";
|
||||
String gold = " \"-\u00A9>><<\u00AE&";
|
||||
Reader reader = new HTMLStripCharFilter(CharReader.get(new StringReader(test)));
|
||||
int ch = 0;
|
||||
StringBuilder builder = new StringBuilder();
|
||||
try {
|
||||
while ((ch = reader.read()) != -1){
|
||||
builder.append((char)ch);
|
||||
}
|
||||
} finally {
|
||||
// System.out.println("String: " + builder.toString());
|
||||
}
|
||||
assertEquals("'" + builder.toString() + "' is not equal to '" + gold + "'",
|
||||
gold, builder.toString());
|
||||
}
|
||||
|
||||
public void testMSWordMalformedProcessingInstruction() throws Exception {
|
||||
String test = "one<?xml:namespace prefix = o ns = \"urn:schemas-microsoft-com:office:office\" />two";
|
||||
String gold = "onetwo";
|
||||
Reader reader = new HTMLStripCharFilter(CharReader.get(new StringReader(test)));
|
||||
int ch = 0;
|
||||
StringBuilder builder = new StringBuilder();
|
||||
try {
|
||||
while ((ch = reader.read()) != -1){
|
||||
builder.append((char)ch);
|
||||
}
|
||||
} finally {
|
||||
// System.out.println("String: " + builder.toString());
|
||||
}
|
||||
assertEquals("'" + builder.toString() + "' is not equal to '" + gold + "'",
|
||||
gold, builder.toString());
|
||||
}
|
||||
|
||||
public void testSupplementaryCharsInTags() throws Exception {
|
||||
String test = "one<𩬅艱鍟䇹愯瀛>two<瀛愯𩬅>three 瀛愯𩬅</瀛愯𩬅>four</𩬅艱鍟䇹愯瀛>five<𠀀𠀀>six<𠀀𠀀/>seven";
|
||||
String gold = "one\ntwo\nthree 瀛愯𩬅\nfour\nfive\nsix\nseven";
|
||||
Reader reader = new HTMLStripCharFilter(CharReader.get(new StringReader(test)));
|
||||
int ch = 0;
|
||||
StringBuilder builder = new StringBuilder();
|
||||
try {
|
||||
while ((ch = reader.read()) != -1){
|
||||
builder.append((char)ch);
|
||||
}
|
||||
} finally {
|
||||
// System.out.println("String: " + builder.toString());
|
||||
}
|
||||
assertEquals("'" + builder.toString() + "' is not equal to '" + gold + "'",
|
||||
gold, builder.toString());
|
||||
}
|
||||
|
||||
public void testRandomBrokenHTML() throws Exception {
|
||||
int maxNumElements = 10000;
|
||||
String text = _TestUtil.randomHtmlishString(random, maxNumElements);
|
||||
Reader reader = new HTMLStripCharFilter
|
||||
(CharReader.get(new StringReader(text)));
|
||||
while (reader.read() != -1);
|
||||
}
|
||||
|
||||
public void testRandomText() throws Exception {
|
||||
StringBuilder text = new StringBuilder();
|
||||
int minNumWords = 10;
|
||||
int maxNumWords = 10000;
|
||||
int minWordLength = 3;
|
||||
int maxWordLength = 20;
|
||||
int numWords = _TestUtil.nextInt(random, minNumWords, maxNumWords);
|
||||
switch (_TestUtil.nextInt(random, 0, 4)) {
|
||||
case 0: {
|
||||
for (int wordNum = 0 ; wordNum < numWords ; ++wordNum) {
|
||||
text.append(_TestUtil.randomUnicodeString(random, maxWordLength));
|
||||
text.append(' ');
|
||||
}
|
||||
break;
|
||||
}
|
||||
case 1: {
|
||||
for (int wordNum = 0 ; wordNum < numWords ; ++wordNum) {
|
||||
text.append(_TestUtil.randomRealisticUnicodeString
|
||||
(random, minWordLength, maxWordLength));
|
||||
text.append(' ');
|
||||
}
|
||||
break;
|
||||
}
|
||||
default: { // ASCII 50% of the time
|
||||
for (int wordNum = 0 ; wordNum < numWords ; ++wordNum) {
|
||||
text.append(_TestUtil.randomSimpleString(random));
|
||||
text.append(' ');
|
||||
}
|
||||
}
|
||||
}
|
||||
Reader reader = new HTMLStripCharFilter
|
||||
(CharReader.get(new StringReader(text.toString())));
|
||||
while (reader.read() != -1);
|
||||
}
|
||||
|
||||
public void testUTF16Surrogates() throws Exception {
|
||||
Analyzer analyzer = new Analyzer() {
|
||||
@Override
|
||||
protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
|
||||
Tokenizer tokenizer = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false);
|
||||
return new TokenStreamComponents(tokenizer, tokenizer);
|
||||
}
|
||||
|
||||
@Override
|
||||
protected Reader initReader(Reader reader) {
|
||||
return new HTMLStripCharFilter(CharReader.get(new BufferedReader(reader)));
|
||||
}
|
||||
};
|
||||
// Paired surrogates
|
||||
assertAnalyzesTo(analyzer, " one two ��three",
|
||||
new String[] { "one", "two", "\uD86C\uDC01three" } );
|
||||
assertAnalyzesTo(analyzer, " ��", new String[] { "\uD86C\uDC01" } );
|
||||
assertAnalyzesTo(analyzer, " ��", new String[] { "\uD86C\uDC01" } );
|
||||
assertAnalyzesTo(analyzer, " ��", new String[] { "\uD86C\uDC01" } );
|
||||
|
||||
// Improperly paired surrogates
|
||||
assertAnalyzesTo(analyzer, " �", new String[] { "\uFFFD\uE28F" } );
|
||||
assertAnalyzesTo(analyzer, " �", new String[] { "\uFFFD\uE28F" } );
|
||||
assertAnalyzesTo(analyzer, " 훚�", new String[] { "\uD6DA\uFFFD" } );
|
||||
assertAnalyzesTo(analyzer, " 훚�", new String[] { "\uD6DA\uFFFD" } );
|
||||
|
||||
// Unpaired high surrogates
|
||||
assertAnalyzesTo(analyzer, " �", new String[] { "\uFFFD" } );
|
||||
assertAnalyzesTo(analyzer, " �", new String[] { "\uFFFD" } );
|
||||
assertAnalyzesTo(analyzer, " �<br>", new String[] { "�" } );
|
||||
assertAnalyzesTo(analyzer, " �", new String[] { "\uFFFD" } );
|
||||
assertAnalyzesTo(analyzer, " �", new String[] { "\uFFFD" } );
|
||||
assertAnalyzesTo(analyzer, " �<br>", new String[] { "�" } );
|
||||
|
||||
// Unpaired low surrogates
|
||||
assertAnalyzesTo(analyzer, " �", new String[] { "\uFFFD" } );
|
||||
assertAnalyzesTo(analyzer, " �", new String[] { "\uFFFD" } );
|
||||
assertAnalyzesTo(analyzer, " �<br>", new String[] { "�" } );
|
||||
assertAnalyzesTo(analyzer, " �", new String[] { "\uFFFD" } );
|
||||
assertAnalyzesTo(analyzer, " �", new String[] { "\uFFFD" } );
|
||||
assertAnalyzesTo(analyzer, " �<br>", new String[] { "�" } );
|
||||
}
|
||||
}
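A consolidated usage sketch of the escaped-tags pattern the tests above (testEscapeScript, testEscapeStyle, testEscapeBR) exercise: tag names placed in the set are passed through instead of being stripped. The import for HTMLStripCharFilter is omitted because its package in this branch is assumed to match the test class; the input string and expected output are illustrative.

import java.io.Reader;
import java.io.StringReader;
import java.util.Arrays;
import java.util.HashSet;
import java.util.Set;
import org.apache.lucene.analysis.CharReader;

public class EscapedTagsSketch {
  public static void main(String[] args) throws Exception {
    // Tag names in this set survive stripping verbatim.
    Set<String> escapedTags = new HashSet<String>(Arrays.asList("BR"));
    Reader in = new HTMLStripCharFilter(   // package assumed to be the same as the test class
        CharReader.get(new StringReader("one<br>two<b>three</b>")), escapedTags);
    StringBuilder out = new StringBuilder();
    for (int ch = in.read(); ch != -1; ch = in.read()) {
      out.append((char) ch);
    }
    // Per the tests above, "<br>" is kept while "<b>"/"</b>" are stripped,
    // so the output should read roughly "one<br>twothree".
    System.out.println(out);
  }
}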
|
||||
|
|
|
@@ -0,0 +1,653 @@
|
|||
<html xmlns:v="urn:schemas-microsoft-com:vml"
|
||||
xmlns:o="urn:schemas-microsoft-com:office:office"
|
||||
xmlns:w="urn:schemas-microsoft-com:office:word"
|
||||
xmlns:m="http://schemas.microsoft.com/office/2004/12/omml"
|
||||
xmlns="http://www.w3.org/TR/REC-html40">
|
||||
|
||||
<head>
|
||||
<meta http-equiv=Content-Type content="text/html; charset=windows-1252">
|
||||
<meta name=ProgId content=Word.Document>
|
||||
<meta name=Generator content="Microsoft Word 14">
|
||||
<meta name=Originator content="Microsoft Word 14">
|
||||
<link rel=File-List href="This%20is%20a%20test_files/filelist.xml">
|
||||
<!--[if gte mso 9]><xml>
|
||||
<o:DocumentProperties>
|
||||
<o:Author>s</o:Author>
|
||||
<o:LastAuthor>s</o:LastAuthor>
|
||||
<o:Revision>1</o:Revision>
|
||||
<o:TotalTime>1</o:TotalTime>
|
||||
<o:Created>2012-01-13T03:36:00Z</o:Created>
|
||||
<o:LastSaved>2012-01-13T03:37:00Z</o:LastSaved>
|
||||
<o:Pages>1</o:Pages>
|
||||
<o:Words>8</o:Words>
|
||||
<o:Characters>48</o:Characters>
|
||||
<o:Lines>1</o:Lines>
|
||||
<o:Paragraphs>1</o:Paragraphs>
|
||||
<o:CharactersWithSpaces>55</o:CharactersWithSpaces>
|
||||
<o:Version>14.00</o:Version>
|
||||
</o:DocumentProperties>
|
||||
<o:OfficeDocumentSettings>
|
||||
<o:AllowPNG/>
|
||||
</o:OfficeDocumentSettings>
|
||||
</xml><![endif]-->
|
||||
<link rel=themeData href="This%20is%20a%20test_files/themedata.thmx">
|
||||
<link rel=colorSchemeMapping
|
||||
href="This%20is%20a%20test_files/colorschememapping.xml">
|
||||
<!--[if gte mso 9]><xml>
|
||||
<w:WordDocument>
|
||||
<w:SpellingState>Clean</w:SpellingState>
|
||||
<w:GrammarState>Clean</w:GrammarState>
|
||||
<w:TrackMoves>false</w:TrackMoves>
|
||||
<w:TrackFormatting/>
|
||||
<w:PunctuationKerning/>
|
||||
<w:ValidateAgainstSchemas/>
|
||||
<w:SaveIfXMLInvalid>false</w:SaveIfXMLInvalid>
|
||||
<w:IgnoreMixedContent>false</w:IgnoreMixedContent>
|
||||
<w:AlwaysShowPlaceholderText>false</w:AlwaysShowPlaceholderText>
|
||||
<w:DoNotPromoteQF/>
|
||||
<w:LidThemeOther>EN-US</w:LidThemeOther>
|
||||
<w:LidThemeAsian>X-NONE</w:LidThemeAsian>
|
||||
<w:LidThemeComplexScript>X-NONE</w:LidThemeComplexScript>
|
||||
<w:Compatibility>
|
||||
<w:BreakWrappedTables/>
|
||||
<w:SnapToGridInCell/>
|
||||
<w:WrapTextWithPunct/>
|
||||
<w:UseAsianBreakRules/>
|
||||
<w:DontGrowAutofit/>
|
||||
<w:SplitPgBreakAndParaMark/>
|
||||
<w:EnableOpenTypeKerning/>
|
||||
<w:DontFlipMirrorIndents/>
|
||||
<w:OverrideTableStyleHps/>
|
||||
</w:Compatibility>
|
||||
<m:mathPr>
|
||||
<m:mathFont m:val="Cambria Math"/>
|
||||
<m:brkBin m:val="before"/>
|
||||
<m:brkBinSub m:val="--"/>
|
||||
<m:smallFrac m:val="off"/>
|
||||
<m:dispDef/>
|
||||
<m:lMargin m:val="0"/>
|
||||
<m:rMargin m:val="0"/>
|
||||
<m:defJc m:val="centerGroup"/>
|
||||
<m:wrapIndent m:val="1440"/>
|
||||
<m:intLim m:val="subSup"/>
|
||||
<m:naryLim m:val="undOvr"/>
|
||||
</m:mathPr></w:WordDocument>
|
||||
</xml><![endif]--><!--[if gte mso 9]><xml>
|
||||
<w:LatentStyles DefLockedState="false" DefUnhideWhenUsed="true"
|
||||
DefSemiHidden="true" DefQFormat="false" DefPriority="99"
|
||||
LatentStyleCount="267">
|
||||
<w:LsdException Locked="false" Priority="0" SemiHidden="false"
|
||||
UnhideWhenUsed="false" QFormat="true" Name="Normal"/>
|
||||
<w:LsdException Locked="false" Priority="9" SemiHidden="false"
|
||||
UnhideWhenUsed="false" QFormat="true" Name="heading 1"/>
|
||||
<w:LsdException Locked="false" Priority="9" QFormat="true" Name="heading 2"/>
|
||||
<w:LsdException Locked="false" Priority="9" QFormat="true" Name="heading 3"/>
|
||||
<w:LsdException Locked="false" Priority="9" QFormat="true" Name="heading 4"/>
|
||||
<w:LsdException Locked="false" Priority="9" QFormat="true" Name="heading 5"/>
|
||||
<w:LsdException Locked="false" Priority="9" QFormat="true" Name="heading 6"/>
|
||||
<w:LsdException Locked="false" Priority="9" QFormat="true" Name="heading 7"/>
|
||||
<w:LsdException Locked="false" Priority="9" QFormat="true" Name="heading 8"/>
|
||||
<w:LsdException Locked="false" Priority="9" QFormat="true" Name="heading 9"/>
|
||||
<w:LsdException Locked="false" Priority="39" Name="toc 1"/>
|
||||
<w:LsdException Locked="false" Priority="39" Name="toc 2"/>
|
||||
<w:LsdException Locked="false" Priority="39" Name="toc 3"/>
|
||||
<w:LsdException Locked="false" Priority="39" Name="toc 4"/>
|
||||
<w:LsdException Locked="false" Priority="39" Name="toc 5"/>
|
||||
<w:LsdException Locked="false" Priority="39" Name="toc 6"/>
|
||||
<w:LsdException Locked="false" Priority="39" Name="toc 7"/>
|
||||
<w:LsdException Locked="false" Priority="39" Name="toc 8"/>
|
||||
<w:LsdException Locked="false" Priority="39" Name="toc 9"/>
|
||||
<w:LsdException Locked="false" Priority="35" QFormat="true" Name="caption"/>
|
||||
<w:LsdException Locked="false" Priority="10" SemiHidden="false"
|
||||
UnhideWhenUsed="false" QFormat="true" Name="Title"/>
|
||||
<w:LsdException Locked="false" Priority="1" Name="Default Paragraph Font"/>
|
||||
<w:LsdException Locked="false" Priority="11" SemiHidden="false"
|
||||
UnhideWhenUsed="false" QFormat="true" Name="Subtitle"/>
|
||||
<w:LsdException Locked="false" Priority="22" SemiHidden="false"
|
||||
UnhideWhenUsed="false" QFormat="true" Name="Strong"/>
|
||||
<w:LsdException Locked="false" Priority="20" SemiHidden="false"
|
||||
UnhideWhenUsed="false" QFormat="true" Name="Emphasis"/>
|
||||
<w:LsdException Locked="false" Priority="59" SemiHidden="false"
|
||||
UnhideWhenUsed="false" Name="Table Grid"/>
|
||||
<w:LsdException Locked="false" UnhideWhenUsed="false" Name="Placeholder Text"/>
|
||||
<w:LsdException Locked="false" Priority="1" SemiHidden="false"
|
||||
UnhideWhenUsed="false" QFormat="true" Name="No Spacing"/>
|
||||
<w:LsdException Locked="false" Priority="60" SemiHidden="false"
|
||||
UnhideWhenUsed="false" Name="Light Shading"/>
|
||||
<w:LsdException Locked="false" Priority="61" SemiHidden="false"
|
||||
UnhideWhenUsed="false" Name="Light List"/>
|
||||
<w:LsdException Locked="false" Priority="62" SemiHidden="false"
|
||||
UnhideWhenUsed="false" Name="Light Grid"/>
|
||||
<w:LsdException Locked="false" Priority="63" SemiHidden="false"
|
||||
UnhideWhenUsed="false" Name="Medium Shading 1"/>
|
||||
<w:LsdException Locked="false" Priority="64" SemiHidden="false"
|
||||
UnhideWhenUsed="false" Name="Medium Shading 2"/>
|
||||
<w:LsdException Locked="false" Priority="65" SemiHidden="false"
|
||||
UnhideWhenUsed="false" Name="Medium List 1"/>
|
||||
<w:LsdException Locked="false" Priority="66" SemiHidden="false"
|
||||
UnhideWhenUsed="false" Name="Medium List 2"/>
|
||||
<w:LsdException Locked="false" Priority="67" SemiHidden="false"
|
||||
UnhideWhenUsed="false" Name="Medium Grid 1"/>
|
||||
<w:LsdException Locked="false" Priority="68" SemiHidden="false"
|
||||
UnhideWhenUsed="false" Name="Medium Grid 2"/>
|
||||
<w:LsdException Locked="false" Priority="69" SemiHidden="false"
|
||||
UnhideWhenUsed="false" Name="Medium Grid 3"/>
|
||||
<w:LsdException Locked="false" Priority="70" SemiHidden="false"
|
||||
UnhideWhenUsed="false" Name="Dark List"/>
|
||||
<w:LsdException Locked="false" Priority="71" SemiHidden="false"
|
||||
UnhideWhenUsed="false" Name="Colorful Shading"/>
|
||||
<w:LsdException Locked="false" Priority="72" SemiHidden="false"
|
||||
UnhideWhenUsed="false" Name="Colorful List"/>
|
||||
<w:LsdException Locked="false" Priority="73" SemiHidden="false"
|
||||
UnhideWhenUsed="false" Name="Colorful Grid"/>
|
||||
<w:LsdException Locked="false" Priority="60" SemiHidden="false"
|
||||
UnhideWhenUsed="false" Name="Light Shading Accent 1"/>
|
||||
<w:LsdException Locked="false" Priority="61" SemiHidden="false"
|
||||
UnhideWhenUsed="false" Name="Light List Accent 1"/>
|
||||
<w:LsdException Locked="false" Priority="62" SemiHidden="false"
|
||||
UnhideWhenUsed="false" Name="Light Grid Accent 1"/>
|
||||
<w:LsdException Locked="false" Priority="63" SemiHidden="false"
|
||||
UnhideWhenUsed="false" Name="Medium Shading 1 Accent 1"/>
|
||||
<w:LsdException Locked="false" Priority="64" SemiHidden="false"
|
||||
UnhideWhenUsed="false" Name="Medium Shading 2 Accent 1"/>
|
||||
<w:LsdException Locked="false" Priority="65" SemiHidden="false"
|
||||
UnhideWhenUsed="false" Name="Medium List 1 Accent 1"/>
|
||||
<w:LsdException Locked="false" UnhideWhenUsed="false" Name="Revision"/>
|
||||
<w:LsdException Locked="false" Priority="34" SemiHidden="false"
|
||||
UnhideWhenUsed="false" QFormat="true" Name="List Paragraph"/>
|
||||
<w:LsdException Locked="false" Priority="29" SemiHidden="false"
|
||||
UnhideWhenUsed="false" QFormat="true" Name="Quote"/>
|
||||
<w:LsdException Locked="false" Priority="30" SemiHidden="false"
|
||||
UnhideWhenUsed="false" QFormat="true" Name="Intense Quote"/>
|
||||
<w:LsdException Locked="false" Priority="66" SemiHidden="false"
|
||||
UnhideWhenUsed="false" Name="Medium List 2 Accent 1"/>
|
||||
<w:LsdException Locked="false" Priority="67" SemiHidden="false"
|
||||
UnhideWhenUsed="false" Name="Medium Grid 1 Accent 1"/>
|
||||
<w:LsdException Locked="false" Priority="68" SemiHidden="false"
|
||||
UnhideWhenUsed="false" Name="Medium Grid 2 Accent 1"/>
|
||||
<w:LsdException Locked="false" Priority="69" SemiHidden="false"
|
||||
UnhideWhenUsed="false" Name="Medium Grid 3 Accent 1"/>
|
||||
<w:LsdException Locked="false" Priority="70" SemiHidden="false"
|
||||
UnhideWhenUsed="false" Name="Dark List Accent 1"/>
|
||||
<w:LsdException Locked="false" Priority="71" SemiHidden="false"
|
||||
UnhideWhenUsed="false" Name="Colorful Shading Accent 1"/>
|
||||
<w:LsdException Locked="false" Priority="72" SemiHidden="false"
|
||||
UnhideWhenUsed="false" Name="Colorful List Accent 1"/>
|
||||
<w:LsdException Locked="false" Priority="73" SemiHidden="false"
|
||||
UnhideWhenUsed="false" Name="Colorful Grid Accent 1"/>
|
||||
<w:LsdException Locked="false" Priority="60" SemiHidden="false"
|
||||
UnhideWhenUsed="false" Name="Light Shading Accent 2"/>
|
||||
<w:LsdException Locked="false" Priority="61" SemiHidden="false"
|
||||
UnhideWhenUsed="false" Name="Light List Accent 2"/>
|
||||
<w:LsdException Locked="false" Priority="62" SemiHidden="false"
|
||||
UnhideWhenUsed="false" Name="Light Grid Accent 2"/>
|
||||
<w:LsdException Locked="false" Priority="63" SemiHidden="false"
|
||||
UnhideWhenUsed="false" Name="Medium Shading 1 Accent 2"/>
|
||||
<w:LsdException Locked="false" Priority="64" SemiHidden="false"
|
||||
UnhideWhenUsed="false" Name="Medium Shading 2 Accent 2"/>
|
||||
<w:LsdException Locked="false" Priority="65" SemiHidden="false"
|
||||
UnhideWhenUsed="false" Name="Medium List 1 Accent 2"/>
|
||||
<w:LsdException Locked="false" Priority="66" SemiHidden="false"
|
||||
UnhideWhenUsed="false" Name="Medium List 2 Accent 2"/>
|
||||
<w:LsdException Locked="false" Priority="67" SemiHidden="false"
|
||||
UnhideWhenUsed="false" Name="Medium Grid 1 Accent 2"/>
|
||||
<w:LsdException Locked="false" Priority="68" SemiHidden="false"
|
||||
UnhideWhenUsed="false" Name="Medium Grid 2 Accent 2"/>
|
||||
<w:LsdException Locked="false" Priority="69" SemiHidden="false"
|
||||
UnhideWhenUsed="false" Name="Medium Grid 3 Accent 2"/>
|
||||
<w:LsdException Locked="false" Priority="70" SemiHidden="false"
|
||||
UnhideWhenUsed="false" Name="Dark List Accent 2"/>
|
||||
<w:LsdException Locked="false" Priority="71" SemiHidden="false"
|
||||
UnhideWhenUsed="false" Name="Colorful Shading Accent 2"/>
|
||||
<w:LsdException Locked="false" Priority="72" SemiHidden="false"
|
||||
UnhideWhenUsed="false" Name="Colorful List Accent 2"/>
|
||||
<w:LsdException Locked="false" Priority="73" SemiHidden="false"
|
||||
UnhideWhenUsed="false" Name="Colorful Grid Accent 2"/>
|
||||
<w:LsdException Locked="false" Priority="60" SemiHidden="false"
|
||||
UnhideWhenUsed="false" Name="Light Shading Accent 3"/>
|
||||
<w:LsdException Locked="false" Priority="61" SemiHidden="false"
|
||||
UnhideWhenUsed="false" Name="Light List Accent 3"/>
|
||||
<w:LsdException Locked="false" Priority="62" SemiHidden="false"
|
||||
UnhideWhenUsed="false" Name="Light Grid Accent 3"/>
|
||||
<w:LsdException Locked="false" Priority="63" SemiHidden="false"
|
||||
UnhideWhenUsed="false" Name="Medium Shading 1 Accent 3"/>
|
||||
<w:LsdException Locked="false" Priority="64" SemiHidden="false"
|
||||
UnhideWhenUsed="false" Name="Medium Shading 2 Accent 3"/>
|
||||
<w:LsdException Locked="false" Priority="65" SemiHidden="false"
|
||||
UnhideWhenUsed="false" Name="Medium List 1 Accent 3"/>
|
||||
<w:LsdException Locked="false" Priority="66" SemiHidden="false"
|
||||
UnhideWhenUsed="false" Name="Medium List 2 Accent 3"/>
|
||||
<w:LsdException Locked="false" Priority="67" SemiHidden="false"
|
||||
UnhideWhenUsed="false" Name="Medium Grid 1 Accent 3"/>
|
||||
<w:LsdException Locked="false" Priority="68" SemiHidden="false"
|
||||
UnhideWhenUsed="false" Name="Medium Grid 2 Accent 3"/>
|
||||
<w:LsdException Locked="false" Priority="69" SemiHidden="false"
|
||||
UnhideWhenUsed="false" Name="Medium Grid 3 Accent 3"/>
|
||||
<w:LsdException Locked="false" Priority="70" SemiHidden="false"
|
||||
UnhideWhenUsed="false" Name="Dark List Accent 3"/>
|
||||
<w:LsdException Locked="false" Priority="71" SemiHidden="false"
|
||||
UnhideWhenUsed="false" Name="Colorful Shading Accent 3"/>
|
||||
<w:LsdException Locked="false" Priority="72" SemiHidden="false"
|
||||
UnhideWhenUsed="false" Name="Colorful List Accent 3"/>
|
||||
<w:LsdException Locked="false" Priority="73" SemiHidden="false"
|
||||
UnhideWhenUsed="false" Name="Colorful Grid Accent 3"/>
|
||||
<w:LsdException Locked="false" Priority="60" SemiHidden="false"
|
||||
UnhideWhenUsed="false" Name="Light Shading Accent 4"/>
|
||||
<w:LsdException Locked="false" Priority="61" SemiHidden="false"
|
||||
UnhideWhenUsed="false" Name="Light List Accent 4"/>
|
||||
<w:LsdException Locked="false" Priority="62" SemiHidden="false"
|
||||
UnhideWhenUsed="false" Name="Light Grid Accent 4"/>
|
||||
<w:LsdException Locked="false" Priority="63" SemiHidden="false"
|
||||
UnhideWhenUsed="false" Name="Medium Shading 1 Accent 4"/>
|
||||
<w:LsdException Locked="false" Priority="64" SemiHidden="false"
|
||||
UnhideWhenUsed="false" Name="Medium Shading 2 Accent 4"/>
|
||||
<w:LsdException Locked="false" Priority="65" SemiHidden="false"
|
||||
UnhideWhenUsed="false" Name="Medium List 1 Accent 4"/>
|
||||
<w:LsdException Locked="false" Priority="66" SemiHidden="false"
|
||||
UnhideWhenUsed="false" Name="Medium List 2 Accent 4"/>
|
||||
<w:LsdException Locked="false" Priority="67" SemiHidden="false"
|
||||
UnhideWhenUsed="false" Name="Medium Grid 1 Accent 4"/>
|
||||
<w:LsdException Locked="false" Priority="68" SemiHidden="false"
|
||||
UnhideWhenUsed="false" Name="Medium Grid 2 Accent 4"/>
|
||||
<w:LsdException Locked="false" Priority="69" SemiHidden="false"
|
||||
UnhideWhenUsed="false" Name="Medium Grid 3 Accent 4"/>
|
||||
<!-- remaining Word latent-style declarations (Accent 4-6 shading/list/grid styles,
     Subtle/Intense Emphasis, Subtle/Intense Reference, Book Title, Bibliography,
     TOC Heading) omitted -->
</w:LatentStyles>
</xml><![endif]-->
<style>
<!--
/* Word-generated font and style definitions omitted: Cambria Math, Cambria and Calibri
   font faces; MsoNormal, Heading 1, Title (and its CxSp variants), Title Char,
   Heading 1 Char, MsoChpDefault, MsoPapDefault, MsoNormalTable; WordSection1 page setup;
   mso 10 table defaults and VML shape defaults. */
-->
</style>
</head>

<body lang=EN-US style='tab-interval:.5in'>

<div class=WordSection1>

<div style='mso-element:para-border-div;border:none;border-bottom:solid #4F81BD 1.0pt;
mso-border-bottom-themecolor:accent1;padding:0in 0in 4.0pt 0in'>

<p class=MsoTitle>This is a test</p>

</div>

</div>

</body>

</html>
@@ -117,5 +117,10 @@ public class TestChineseTokenizer extends BaseTokenStreamTestCase
    assertAnalyzesTo(justFilter, "This is a Test. b c d",
        new String[] { "This", "Test." });
  }

  /** blast some random strings through the analyzer */
  public void testRandomStrings() throws Exception {
    checkRandomData(random, new ChineseAnalyzer(), 10000*RANDOM_MULTIPLIER);
  }

}
@@ -306,4 +306,31 @@ public class CommonGramsFilterTest extends BaseTokenStreamTestCase {
    TokenFilter nsf = new CommonGramsQueryFilter(cgf);
    assertTokenStreamContents(nsf, new String[] { "the_of" });
  }

  /** blast some random strings through the analyzer */
  public void testRandomStrings() throws Exception {
    Analyzer a = new Analyzer() {

      @Override
      protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
        Tokenizer t = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false);
        CommonGramsFilter cgf = new CommonGramsFilter(TEST_VERSION_CURRENT, t, commonWords);
        return new TokenStreamComponents(t, cgf);
      }
    };

    checkRandomData(random, a, 10000*RANDOM_MULTIPLIER);

    Analyzer b = new Analyzer() {

      @Override
      protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
        Tokenizer t = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false);
        CommonGramsFilter cgf = new CommonGramsFilter(TEST_VERSION_CURRENT, t, commonWords);
        return new TokenStreamComponents(t, new CommonGramsQueryFilter(cgf));
      }
    };

    checkRandomData(random, b, 10000*RANDOM_MULTIPLIER);
  }
}
@@ -18,14 +18,19 @@ package org.apache.lucene.analysis.compound;
 */

import java.io.IOException;
import java.io.Reader;
import java.io.StringReader;
import java.util.Arrays;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.analysis.CharReader;
import org.apache.lucene.analysis.MockTokenizer;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.charfilter.MappingCharFilter;
import org.apache.lucene.analysis.charfilter.NormalizeCharMap;
import org.apache.lucene.analysis.compound.hyphenation.HyphenationTree;
import org.apache.lucene.analysis.core.WhitespaceTokenizer;
import org.apache.lucene.analysis.util.CharArraySet;

@@ -299,5 +304,61 @@ public class TestCompoundWordTokenFilter extends BaseTokenStreamTestCase {
      }
    }
  }

  // SOLR-2891
  // *CompoundWordTokenFilter blindly adds term length to offset, but this can take things out of bounds
  // wrt original text if a previous filter increases the length of the word (in this case ü -> ue)
  // so in this case we behave like WDF, and preserve any modified offsets
  public void testInvalidOffsets() throws Exception {
    final CharArraySet dict = makeDictionary("fall");
    final NormalizeCharMap normMap = new NormalizeCharMap();
    normMap.add("ü", "ue");

    Analyzer analyzer = new Analyzer() {

      @Override
      protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
        Tokenizer tokenizer = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false);
        TokenFilter filter = new DictionaryCompoundWordTokenFilter(TEST_VERSION_CURRENT, tokenizer, dict);
        return new TokenStreamComponents(tokenizer, filter);
      }

      @Override
      protected Reader initReader(Reader reader) {
        return new MappingCharFilter(normMap, CharReader.get(reader));
      }
    };

    assertAnalyzesTo(analyzer, "banküberfall",
        new String[] { "bankueberfall", "fall" },
        new int[] { 0, 0 },
        new int[] { 12, 12 });
  }

  /** blast some random strings through the analyzer */
  public void testRandomStrings() throws Exception {
    final CharArraySet dict = makeDictionary("a", "e", "i", "o", "u", "y", "bc", "def");
    Analyzer a = new Analyzer() {

      @Override
      protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
        Tokenizer tokenizer = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false);
        return new TokenStreamComponents(tokenizer, new DictionaryCompoundWordTokenFilter(TEST_VERSION_CURRENT, tokenizer, dict));
      }
    };
    checkRandomData(random, a, 10000*RANDOM_MULTIPLIER);

    InputSource is = new InputSource(getClass().getResource("da_UTF8.xml").toExternalForm());
    final HyphenationTree hyphenator = HyphenationCompoundWordTokenFilter.getHyphenationTree(is);
    Analyzer b = new Analyzer() {

      @Override
      protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
        Tokenizer tokenizer = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false);
        TokenFilter filter = new HyphenationCompoundWordTokenFilter(TEST_VERSION_CURRENT, tokenizer, hyphenator);
        return new TokenStreamComponents(tokenizer, filter);
      }
    };
    checkRandomData(random, b, 10000*RANDOM_MULTIPLIER);
  }
}
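A minimal sketch of the offset arithmetic behind the SOLR-2891 comment above; the variable names are illustrative only and nothing here is part of the patch:

    // The original text is 12 chars, but after the MappingCharFilter maps ü -> ue the
    // text the compound filter sees is 13 chars long.
    String original   = "banküberfall";   // what was indexed; length 12
    String normalized = "bankueberfall";  // what the compound filter tokenizes; length 13
    // Naively adding sub-token lengths measured on the normalized text would produce an
    // end offset of 13, one past the end of the original input. The fix keeps the
    // corrected start/end offsets of the whole token, which is why the test expects
    // end offset 12 for both "bankueberfall" and "fall".
    int expectedEnd = original.length();  // 12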
@ -0,0 +1,92 @@
|
|||
package org.apache.lucene.analysis.core;
|
||||
|
||||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
|
||||
import org.apache.lucene.analysis.TokenStream;
|
||||
import org.apache.lucene.analysis.standard.StandardTokenizer;
|
||||
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
|
||||
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
|
||||
import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
|
||||
import org.apache.lucene.util.English;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.io.StringReader;
|
||||
import java.util.Set;
|
||||
|
||||
|
||||
public class TestTypeTokenFilter extends BaseTokenStreamTestCase {
|
||||
|
||||
public void testTypeFilter() throws IOException {
|
||||
StringReader reader = new StringReader("121 is palindrome, while 123 is not");
|
||||
Set<String> stopTypes = asSet("<NUM>");
|
||||
TokenStream stream = new TypeTokenFilter(true, new StandardTokenizer(TEST_VERSION_CURRENT, reader), stopTypes);
|
||||
assertTokenStreamContents(stream, new String[]{"is", "palindrome", "while", "is", "not"});
|
||||
}
|
||||
|
||||
/**
|
||||
* Test Position increments applied by TypeTokenFilter with and without enabling this option.
|
||||
*/
|
||||
public void testStopPositons() throws IOException {
|
||||
StringBuilder sb = new StringBuilder();
|
||||
for (int i = 10; i < 20; i++) {
|
||||
if (i % 3 != 0) {
|
||||
sb.append(i).append(" ");
|
||||
} else {
|
||||
String w = English.intToEnglish(i).trim();
|
||||
sb.append(w).append(" ");
|
||||
}
|
||||
}
|
||||
log(sb.toString());
|
||||
String stopTypes[] = new String[]{"<NUM>"};
|
||||
Set<String> stopSet = asSet(stopTypes);
|
||||
|
||||
// with increments
|
||||
StringReader reader = new StringReader(sb.toString());
|
||||
TypeTokenFilter typeTokenFilter = new TypeTokenFilter(true, new StandardTokenizer(TEST_VERSION_CURRENT, reader), stopSet);
|
||||
testPositons(typeTokenFilter);
|
||||
|
||||
// without increments
|
||||
reader = new StringReader(sb.toString());
|
||||
typeTokenFilter = new TypeTokenFilter(false, new StandardTokenizer(TEST_VERSION_CURRENT, reader), stopSet);
|
||||
testPositons(typeTokenFilter);
|
||||
|
||||
}
|
||||
|
||||
private void testPositons(TypeTokenFilter stpf) throws IOException {
|
||||
TypeAttribute typeAtt = stpf.getAttribute(TypeAttribute.class);
|
||||
CharTermAttribute termAttribute = stpf.getAttribute(CharTermAttribute.class);
|
||||
PositionIncrementAttribute posIncrAtt = stpf.getAttribute(PositionIncrementAttribute.class);
|
||||
stpf.reset();
|
||||
boolean enablePositionIncrements = stpf.getEnablePositionIncrements();
|
||||
while (stpf.incrementToken()) {
|
||||
log("Token: " + termAttribute.toString() + ": " + typeAtt.type() + " - " + posIncrAtt.getPositionIncrement());
|
||||
assertEquals("if position increment is enabled the positionIncrementAttribute value should be 3, otherwise 1",
|
||||
posIncrAtt.getPositionIncrement(), enablePositionIncrements ? 3 : 1);
|
||||
}
|
||||
stpf.end();
|
||||
stpf.close();
|
||||
}
|
||||
|
||||
// print debug info depending on VERBOSE
|
||||
private static void log(String s) {
|
||||
if (VERBOSE) {
|
||||
System.out.println(s);
|
||||
}
|
||||
}
|
||||
}
|
|
@@ -18,12 +18,15 @@ package org.apache.lucene.analysis.hunspell;

import java.io.IOException;
import java.io.InputStream;
import java.io.Reader;
import java.io.StringReader;
import java.text.ParseException;
import java.util.Arrays;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.analysis.MockTokenizer;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.miscellaneous.KeywordMarkerFilter;
import org.apache.lucene.analysis.util.CharArraySet;
import org.junit.BeforeClass;

@@ -57,4 +60,17 @@ public class HunspellStemFilterTest extends BaseTokenStreamTestCase {
    filter = new HunspellStemFilter(new KeywordMarkerFilter(tokenizer, set), DICTIONARY);
    assertTokenStreamContents(filter, new String[]{"lucene", "is", "awesome"}, new int[] {1, 1, 1});
  }

  /** blast some random strings through the analyzer */
  public void testRandomStrings() throws Exception {
    Analyzer analyzer = new Analyzer() {

      @Override
      protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
        Tokenizer tokenizer = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false);
        return new TokenStreamComponents(tokenizer, new HunspellStemFilter(tokenizer, DICTIONARY));
      }
    };
    checkRandomData(random, analyzer, 10000*RANDOM_MULTIPLIER);
  }
}
|
@ -22,6 +22,7 @@ import java.io.StringReader;
|
|||
import java.util.Arrays;
|
||||
import java.util.regex.Pattern;
|
||||
|
||||
import org.apache.lucene.analysis.Analyzer;
|
||||
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
|
||||
import org.apache.lucene.analysis.TokenStream;
|
||||
import org.apache.lucene.analysis.core.StopAnalyzer;
|
||||
|
@ -132,4 +133,10 @@ public class PatternAnalyzerTest extends BaseTokenStreamTestCase {
|
|||
TokenStream ts2 = analyzer.tokenStream("dummy", new StringReader(document));
|
||||
assertTokenStreamContents(ts2, expected);
|
||||
}
|
||||
|
||||
/** blast some random strings through the analyzer */
|
||||
public void testRandomStrings() throws Exception {
|
||||
Analyzer a = new PatternAnalyzer(TEST_VERSION_CURRENT, Pattern.compile(","), true, StopAnalyzer.ENGLISH_STOP_WORDS_SET);
|
||||
checkRandomData(random, a, 10000*RANDOM_MULTIPLIER);
|
||||
}
|
||||
}
|
||||
|
|
|
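For context, an illustration of what the PatternAnalyzer configured above produces, assuming the compiled pattern marks token separators, lower-casing is enabled, and the English stop set is applied; this example is not part of the patch:

    // Input: "Foo,BAR,the,Baz"
    // Split on "," -> "Foo" | "BAR" | "the" | "Baz"
    // Lower-cased and stop-filtered -> "foo", "bar", "baz"   ("the" is an English stop word)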
@ -17,11 +17,14 @@ package org.apache.lucene.analysis.miscellaneous;
|
|||
* limitations under the License.
|
||||
*/
|
||||
|
||||
import org.apache.lucene.analysis.Analyzer;
|
||||
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
|
||||
import org.apache.lucene.analysis.MockTokenizer;
|
||||
import org.apache.lucene.analysis.TokenStream;
|
||||
import org.apache.lucene.analysis.core.WhitespaceTokenizer;
|
||||
import org.apache.lucene.analysis.Tokenizer;
|
||||
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
|
||||
|
||||
import java.io.Reader;
|
||||
import java.io.StringReader;
|
||||
import java.util.List;
|
||||
import java.util.ArrayList;
|
||||
|
@ -1907,4 +1910,17 @@ public class TestASCIIFoldingFilter extends BaseTokenStreamTestCase {
|
|||
assertTrue(stream.incrementToken());
|
||||
assertEquals(expected, termAtt.toString());
|
||||
}
|
||||
|
||||
/** blast some random strings through the analyzer */
|
||||
public void testRandomStrings() throws Exception {
|
||||
Analyzer a = new Analyzer() {
|
||||
|
||||
@Override
|
||||
protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
|
||||
Tokenizer tokenizer = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false);
|
||||
return new TokenStreamComponents(tokenizer, new ASCIIFoldingFilter(tokenizer));
|
||||
}
|
||||
};
|
||||
checkRandomData(random, a, 10000*RANDOM_MULTIPLIER);
|
||||
}
|
||||
}
|
||||
|
|
|
@ -18,12 +18,14 @@
|
|||
package org.apache.lucene.analysis.miscellaneous;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.io.Reader;
|
||||
import java.io.StringReader;
|
||||
import java.util.ArrayList;
|
||||
import java.util.Arrays;
|
||||
import java.util.Collection;
|
||||
import java.util.List;
|
||||
|
||||
import org.apache.lucene.analysis.Analyzer;
|
||||
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
|
||||
import org.apache.lucene.analysis.MockTokenizer;
|
||||
import org.apache.lucene.analysis.Tokenizer;
|
||||
|
@ -117,4 +119,18 @@ public class TestCapitalizationFilter extends BaseTokenStreamTestCase {
|
|||
new String[] { expected }, onlyFirstWord, keep, forceFirstLetter, okPrefix,
|
||||
minWordLength, maxWordCount, maxTokenLength);
|
||||
}
|
||||
|
||||
/** blast some random strings through the analyzer */
|
||||
public void testRandomString() throws Exception {
|
||||
Analyzer a = new Analyzer() {
|
||||
|
||||
@Override
|
||||
protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
|
||||
Tokenizer tokenizer = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false);
|
||||
return new TokenStreamComponents(tokenizer, new CapitalizationFilter(tokenizer));
|
||||
}
|
||||
};
|
||||
|
||||
checkRandomData(random, a, 10000*RANDOM_MULTIPLIER);
|
||||
}
|
||||
}
|
||||
|
|
|
@ -17,11 +17,14 @@
|
|||
|
||||
package org.apache.lucene.analysis.miscellaneous;
|
||||
|
||||
import java.io.Reader;
|
||||
import java.io.StringReader;
|
||||
|
||||
import org.apache.lucene.analysis.Analyzer;
|
||||
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
|
||||
import org.apache.lucene.analysis.MockTokenizer;
|
||||
import org.apache.lucene.analysis.TokenStream;
|
||||
import org.apache.lucene.analysis.Tokenizer;
|
||||
|
||||
/**
|
||||
* HyphenatedWordsFilter test
|
||||
|
@ -46,5 +49,29 @@ public class TestHyphenatedWordsFilter extends BaseTokenStreamTestCase {
|
|||
ts = new HyphenatedWordsFilter(ts);
|
||||
assertTokenStreamContents(ts,
|
||||
new String[] { "ecological", "develop", "comprehensive-hands-on", "and", "ecology-" });
|
||||
}
|
||||
}
|
||||
|
||||
public void testOffsets() throws Exception {
|
||||
String input = "abc- def geh 1234- 5678-";
|
||||
TokenStream ts = new MockTokenizer(new StringReader(input), MockTokenizer.WHITESPACE, false);
|
||||
ts = new HyphenatedWordsFilter(ts);
|
||||
assertTokenStreamContents(ts,
|
||||
new String[] { "abcdef", "geh", "12345678-" },
|
||||
new int[] { 0, 9, 13 },
|
||||
new int[] { 8, 12, 24 });
|
||||
}
|
||||
|
||||
/** blast some random strings through the analyzer */
|
||||
public void testRandomString() throws Exception {
|
||||
Analyzer a = new Analyzer() {
|
||||
|
||||
@Override
|
||||
protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
|
||||
Tokenizer tokenizer = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false);
|
||||
return new TokenStreamComponents(tokenizer, new HyphenatedWordsFilter(tokenizer));
|
||||
}
|
||||
};
|
||||
|
||||
checkRandomData(random, a, 10000*RANDOM_MULTIPLIER);
|
||||
}
|
||||
}
|
||||
|
|
|
@ -17,13 +17,16 @@
|
|||
|
||||
package org.apache.lucene.analysis.miscellaneous;
|
||||
|
||||
import java.io.Reader;
|
||||
import java.io.StringReader;
|
||||
import java.util.HashSet;
|
||||
import java.util.Set;
|
||||
|
||||
import org.apache.lucene.analysis.Analyzer;
|
||||
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
|
||||
import org.apache.lucene.analysis.MockTokenizer;
|
||||
import org.apache.lucene.analysis.TokenStream;
|
||||
import org.apache.lucene.analysis.Tokenizer;
|
||||
import org.apache.lucene.analysis.util.CharArraySet;
|
||||
|
||||
/** Test {@link KeepWordFilter} */
|
||||
|
@ -57,4 +60,23 @@ public class TestKeepWordFilter extends BaseTokenStreamTestCase {
|
|||
stream = new KeepWordFilter(false, stream, new CharArraySet(TEST_VERSION_CURRENT,words, false));
|
||||
assertTokenStreamContents(stream, new String[] { "aaa" }, new int[] { 1 });
|
||||
}
|
||||
|
||||
/** blast some random strings through the analyzer */
|
||||
public void testRandomStrings() throws Exception {
|
||||
final Set<String> words = new HashSet<String>();
|
||||
words.add( "a" );
|
||||
words.add( "b" );
|
||||
|
||||
Analyzer a = new Analyzer() {
|
||||
|
||||
@Override
|
||||
protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
|
||||
Tokenizer tokenizer = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false);
|
||||
TokenStream stream = new KeepWordFilter(true, tokenizer, new CharArraySet(TEST_VERSION_CURRENT, words, true));
|
||||
return new TokenStreamComponents(tokenizer, stream);
|
||||
}
|
||||
};
|
||||
|
||||
checkRandomData(random, a, 10000*RANDOM_MULTIPLIER);
|
||||
}
|
||||
}
|
||||
|
|
|
@ -17,13 +17,21 @@
|
|||
|
||||
package org.apache.lucene.analysis.miscellaneous;
|
||||
|
||||
import org.apache.lucene.analysis.Analyzer;
|
||||
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
|
||||
import org.apache.lucene.analysis.MockTokenizer;
|
||||
import org.apache.lucene.analysis.Token;
|
||||
import org.apache.lucene.analysis.TokenStream;
|
||||
import org.apache.lucene.analysis.Tokenizer;
|
||||
import org.apache.lucene.analysis.synonym.SynonymFilter;
|
||||
import org.apache.lucene.analysis.synonym.SynonymMap;
|
||||
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
|
||||
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
|
||||
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
|
||||
import org.apache.lucene.util.CharsRef;
|
||||
import org.apache.lucene.util._TestUtil;
|
||||
|
||||
import java.io.Reader;
|
||||
import java.util.Iterator;
|
||||
import java.util.Arrays;
|
||||
|
||||
|
@ -116,6 +124,45 @@ public class TestRemoveDuplicatesTokenFilter extends BaseTokenStreamTestCase {
|
|||
|
||||
}
|
||||
|
||||
// some helper methods for the below test with synonyms
|
||||
private String randomNonEmptyString() {
|
||||
while(true) {
|
||||
final String s = _TestUtil.randomUnicodeString(random).trim();
|
||||
if (s.length() != 0 && s.indexOf('\u0000') == -1) {
|
||||
return s;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
private void add(SynonymMap.Builder b, String input, String output, boolean keepOrig) {
|
||||
b.add(new CharsRef(input.replaceAll(" +", "\u0000")),
|
||||
new CharsRef(output.replaceAll(" +", "\u0000")),
|
||||
keepOrig);
|
||||
}
|
||||
|
||||
/** blast some random strings through the analyzer */
|
||||
public void testRandomStrings() throws Exception {
|
||||
final int numIters = atLeast(10);
|
||||
for (int i = 0; i < numIters; i++) {
|
||||
SynonymMap.Builder b = new SynonymMap.Builder(random.nextBoolean());
|
||||
final int numEntries = atLeast(10);
|
||||
for (int j = 0; j < numEntries; j++) {
|
||||
add(b, randomNonEmptyString(), randomNonEmptyString(), random.nextBoolean());
|
||||
}
|
||||
final SynonymMap map = b.build();
|
||||
final boolean ignoreCase = random.nextBoolean();
|
||||
|
||||
final Analyzer analyzer = new Analyzer() {
|
||||
@Override
|
||||
protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
|
||||
Tokenizer tokenizer = new MockTokenizer(reader, MockTokenizer.SIMPLE, true);
|
||||
TokenStream stream = new SynonymFilter(tokenizer, map, ignoreCase);
|
||||
return new TokenStreamComponents(tokenizer, new RemoveDuplicatesTokenFilter(stream));
|
||||
}
|
||||
};
|
||||
|
||||
checkRandomData(random, analyzer, 1000*RANDOM_MULTIPLIER);
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
|
|
|
@ -18,11 +18,15 @@
|
|||
package org.apache.lucene.analysis.miscellaneous;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.io.Reader;
|
||||
import java.util.Collection;
|
||||
|
||||
import org.apache.lucene.analysis.Analyzer;
|
||||
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
|
||||
import org.apache.lucene.analysis.MockTokenizer;
|
||||
import org.apache.lucene.analysis.Token;
|
||||
import org.apache.lucene.analysis.TokenStream;
|
||||
import org.apache.lucene.analysis.Tokenizer;
|
||||
import org.apache.lucene.analysis.tokenattributes.*;
|
||||
|
||||
/**
|
||||
|
@ -103,4 +107,27 @@ public class TestTrimFilter extends BaseTokenStreamTestCase {
|
|||
}
|
||||
}
|
||||
}
|
||||
|
||||
/** blast some random strings through the analyzer */
|
||||
public void testRandomStrings() throws Exception {
|
||||
Analyzer a = new Analyzer() {
|
||||
|
||||
@Override
|
||||
protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
|
||||
Tokenizer tokenizer = new MockTokenizer(reader, MockTokenizer.KEYWORD, false);
|
||||
return new TokenStreamComponents(tokenizer, new TrimFilter(tokenizer, false));
|
||||
}
|
||||
};
|
||||
checkRandomData(random, a, 10000*RANDOM_MULTIPLIER);
|
||||
|
||||
Analyzer b = new Analyzer() {
|
||||
|
||||
@Override
|
||||
protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
|
||||
Tokenizer tokenizer = new MockTokenizer(reader, MockTokenizer.KEYWORD, false);
|
||||
return new TokenStreamComponents(tokenizer, new TrimFilter(tokenizer, true));
|
||||
}
|
||||
};
|
||||
checkRandomData(random, b, 10000*RANDOM_MULTIPLIER);
|
||||
}
|
||||
}
|
||||
|
|
|
@@ -298,4 +298,28 @@ public class TestWordDelimiterFilter extends BaseTokenStreamTestCase {
      new int[] { 10, 15, 15 },
      new int[] { 2, 1, 0 });
  }

  /** blast some random strings through the analyzer */
  public void testRandomStrings() throws Exception {
    int numIterations = atLeast(5);
    for (int i = 0; i < numIterations; i++) {
      final int flags = random.nextInt(512);
      final CharArraySet protectedWords;
      if (random.nextBoolean()) {
        protectedWords = new CharArraySet(TEST_VERSION_CURRENT, new HashSet<String>(Arrays.asList("a", "b", "cd")), false);
      } else {
        protectedWords = null;
      }

      Analyzer a = new Analyzer() {

        @Override
        protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
          Tokenizer tokenizer = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false);
          return new TokenStreamComponents(tokenizer, new WordDelimiterFilter(tokenizer, flags, protectedWords));
        }
      };
      checkRandomData(random, a, 10000*RANDOM_MULTIPLIER);
    }
  }
}
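The random.nextInt(512) above draws one of the 2^9 possible flag combinations. A hedged sketch of how such a configuration would normally be composed, assuming the bit-flag constants defined on WordDelimiterFilter; this is not part of the patch:

    // One concrete value that random.nextInt(512) might represent, spelled out as flags:
    int flags = WordDelimiterFilter.GENERATE_WORD_PARTS
              | WordDelimiterFilter.SPLIT_ON_CASE_CHANGE
              | WordDelimiterFilter.PRESERVE_ORIGINAL;
    // Each flag is a distinct bit, so nine single-bit flags give exactly 512 combinations.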
@@ -129,4 +129,27 @@ public class EdgeNGramTokenFilterTest extends BaseTokenStreamTestCase {
      new int[] { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
      new int[] { 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11 });
  }

  /** blast some random strings through the analyzer */
  public void testRandomStrings() throws Exception {
    Analyzer a = new Analyzer() {
      @Override
      protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
        Tokenizer tokenizer = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false);
        return new TokenStreamComponents(tokenizer,
            new EdgeNGramTokenFilter(tokenizer, EdgeNGramTokenFilter.Side.FRONT, 2, 15));
      }
    };
    checkRandomData(random, a, 10000*RANDOM_MULTIPLIER);

    Analyzer b = new Analyzer() {
      @Override
      protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
        Tokenizer tokenizer = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false);
        return new TokenStreamComponents(tokenizer,
            new EdgeNGramTokenFilter(tokenizer, EdgeNGramTokenFilter.Side.BACK, 2, 15));
      }
    };
    checkRandomData(random, b, 10000*RANDOM_MULTIPLIER);
  }
}
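An illustration, not taken from the patch, of what the two analyzers above emit for a single whitespace token such as "lucene" with minGram=2 and maxGram=15:

    // Side.FRONT: "lu", "luc", "luce", "lucen", "lucene"
    // Side.BACK:  "ne", "ene", "cene", "ucene", "lucene"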
@ -18,9 +18,13 @@ package org.apache.lucene.analysis.ngram;
|
|||
*/
|
||||
|
||||
|
||||
import java.io.Reader;
|
||||
import java.io.StringReader;
|
||||
|
||||
import org.apache.lucene.analysis.Analyzer;
|
||||
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
|
||||
import org.apache.lucene.analysis.Tokenizer;
|
||||
import org.apache.lucene.analysis.Analyzer.TokenStreamComponents;
|
||||
|
||||
/**
|
||||
* Tests {@link EdgeNGramTokenizer} for correctness.
|
||||
|
@ -95,4 +99,25 @@ public class EdgeNGramTokenizerTest extends BaseTokenStreamTestCase {
|
|||
tokenizer.reset(new StringReader("abcde"));
|
||||
assertTokenStreamContents(tokenizer, new String[]{"a","ab","abc"}, new int[]{0,0,0}, new int[]{1,2,3}, 5 /* abcde */);
|
||||
}
|
||||
|
||||
/** blast some random strings through the analyzer */
|
||||
public void testRandomStrings() throws Exception {
|
||||
Analyzer a = new Analyzer() {
|
||||
@Override
|
||||
protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
|
||||
Tokenizer tokenizer = new EdgeNGramTokenizer(reader, EdgeNGramTokenizer.Side.FRONT, 2, 15);
|
||||
return new TokenStreamComponents(tokenizer, tokenizer);
|
||||
}
|
||||
};
|
||||
checkRandomData(random, a, 10000*RANDOM_MULTIPLIER);
|
||||
|
||||
Analyzer b = new Analyzer() {
|
||||
@Override
|
||||
protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
|
||||
Tokenizer tokenizer = new EdgeNGramTokenizer(reader, EdgeNGramTokenizer.Side.BACK, 2, 15);
|
||||
return new TokenStreamComponents(tokenizer, tokenizer);
|
||||
}
|
||||
};
|
||||
checkRandomData(random, b, 10000*RANDOM_MULTIPLIER);
|
||||
}
|
||||
}
|
||||
|
|
|
@ -23,6 +23,7 @@ import org.apache.lucene.analysis.TokenFilter;
|
|||
import org.apache.lucene.analysis.TokenStream;
|
||||
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
|
||||
import org.apache.lucene.analysis.Tokenizer;
|
||||
import org.apache.lucene.analysis.Analyzer.TokenStreamComponents;
|
||||
import org.apache.lucene.analysis.core.WhitespaceTokenizer;
|
||||
import org.apache.lucene.analysis.miscellaneous.ASCIIFoldingFilter;
|
||||
|
||||
|
@ -33,89 +34,102 @@ import java.io.StringReader;
|
|||
* Tests {@link NGramTokenFilter} for correctness.
|
||||
*/
|
||||
public class NGramTokenFilterTest extends BaseTokenStreamTestCase {
|
||||
private TokenStream input;
|
||||
|
||||
@Override
|
||||
public void setUp() throws Exception {
|
||||
super.setUp();
|
||||
input = new MockTokenizer(new StringReader("abcde"), MockTokenizer.WHITESPACE, false);
|
||||
private TokenStream input;
|
||||
|
||||
@Override
|
||||
public void setUp() throws Exception {
|
||||
super.setUp();
|
||||
input = new MockTokenizer(new StringReader("abcde"), MockTokenizer.WHITESPACE, false);
|
||||
}
|
||||
|
||||
public void testInvalidInput() throws Exception {
|
||||
boolean gotException = false;
|
||||
try {
|
||||
new NGramTokenFilter(input, 2, 1);
|
||||
} catch (IllegalArgumentException e) {
|
||||
gotException = true;
|
||||
}
|
||||
|
||||
public void testInvalidInput() throws Exception {
|
||||
boolean gotException = false;
|
||||
try {
|
||||
new NGramTokenFilter(input, 2, 1);
|
||||
} catch (IllegalArgumentException e) {
|
||||
gotException = true;
|
||||
}
|
||||
assertTrue(gotException);
|
||||
assertTrue(gotException);
|
||||
}
|
||||
|
||||
public void testInvalidInput2() throws Exception {
|
||||
boolean gotException = false;
|
||||
try {
|
||||
new NGramTokenFilter(input, 0, 1);
|
||||
} catch (IllegalArgumentException e) {
|
||||
gotException = true;
|
||||
}
|
||||
|
||||
public void testInvalidInput2() throws Exception {
|
||||
boolean gotException = false;
|
||||
try {
|
||||
new NGramTokenFilter(input, 0, 1);
|
||||
} catch (IllegalArgumentException e) {
|
||||
gotException = true;
|
||||
}
|
||||
assertTrue(gotException);
|
||||
}
|
||||
|
||||
public void testUnigrams() throws Exception {
|
||||
NGramTokenFilter filter = new NGramTokenFilter(input, 1, 1);
|
||||
assertTokenStreamContents(filter, new String[]{"a","b","c","d","e"}, new int[]{0,1,2,3,4}, new int[]{1,2,3,4,5});
|
||||
}
|
||||
|
||||
public void testBigrams() throws Exception {
|
||||
NGramTokenFilter filter = new NGramTokenFilter(input, 2, 2);
|
||||
assertTokenStreamContents(filter, new String[]{"ab","bc","cd","de"}, new int[]{0,1,2,3}, new int[]{2,3,4,5});
|
||||
}
|
||||
|
||||
public void testNgrams() throws Exception {
|
||||
NGramTokenFilter filter = new NGramTokenFilter(input, 1, 3);
|
||||
assertTokenStreamContents(filter,
|
||||
assertTrue(gotException);
|
||||
}
|
||||
|
||||
public void testUnigrams() throws Exception {
|
||||
NGramTokenFilter filter = new NGramTokenFilter(input, 1, 1);
|
||||
assertTokenStreamContents(filter, new String[]{"a","b","c","d","e"}, new int[]{0,1,2,3,4}, new int[]{1,2,3,4,5});
|
||||
}
|
||||
|
||||
public void testBigrams() throws Exception {
|
||||
NGramTokenFilter filter = new NGramTokenFilter(input, 2, 2);
|
||||
assertTokenStreamContents(filter, new String[]{"ab","bc","cd","de"}, new int[]{0,1,2,3}, new int[]{2,3,4,5});
|
||||
}
|
||||
|
||||
public void testNgrams() throws Exception {
|
||||
NGramTokenFilter filter = new NGramTokenFilter(input, 1, 3);
|
||||
assertTokenStreamContents(filter,
|
||||
new String[]{"a","b","c","d","e", "ab","bc","cd","de", "abc","bcd","cde"},
|
||||
new int[]{0,1,2,3,4, 0,1,2,3, 0,1,2},
|
||||
new int[]{1,2,3,4,5, 2,3,4,5, 3,4,5}
|
||||
);
|
||||
}
|
||||
|
||||
public void testOversizedNgrams() throws Exception {
|
||||
NGramTokenFilter filter = new NGramTokenFilter(input, 6, 7);
|
||||
assertTokenStreamContents(filter, new String[0], new int[0], new int[0]);
|
||||
}
|
||||
|
||||
public void testSmallTokenInStream() throws Exception {
|
||||
input = new MockTokenizer(new StringReader("abc de fgh"), MockTokenizer.WHITESPACE, false);
|
||||
NGramTokenFilter filter = new NGramTokenFilter(input, 3, 3);
|
||||
assertTokenStreamContents(filter, new String[]{"abc","fgh"}, new int[]{0,7}, new int[]{3,10});
|
||||
}
|
||||
|
||||
public void testReset() throws Exception {
|
||||
WhitespaceTokenizer tokenizer = new WhitespaceTokenizer(TEST_VERSION_CURRENT, new StringReader("abcde"));
|
||||
NGramTokenFilter filter = new NGramTokenFilter(tokenizer, 1, 1);
|
||||
assertTokenStreamContents(filter, new String[]{"a","b","c","d","e"}, new int[]{0,1,2,3,4}, new int[]{1,2,3,4,5});
|
||||
tokenizer.reset(new StringReader("abcde"));
|
||||
assertTokenStreamContents(filter, new String[]{"a","b","c","d","e"}, new int[]{0,1,2,3,4}, new int[]{1,2,3,4,5});
|
||||
}
|
||||
|
||||
// LUCENE-3642
|
||||
// EdgeNgram blindly adds term length to offset, but this can take things out of bounds
|
||||
// wrt original text if a previous filter increases the length of the word (in this case æ -> ae)
|
||||
// so in this case we behave like WDF, and preserve any modified offsets
|
||||
public void testInvalidOffsets() throws Exception {
|
||||
Analyzer analyzer = new Analyzer() {
|
||||
@Override
|
||||
protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
|
||||
Tokenizer tokenizer = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false);
|
||||
TokenFilter filters = new ASCIIFoldingFilter(tokenizer);
|
||||
filters = new NGramTokenFilter(filters, 2, 2);
|
||||
return new TokenStreamComponents(tokenizer, filters);
|
||||
}
|
||||
};
|
||||
assertAnalyzesTo(analyzer, "mosfellsbær",
|
||||
new String[] { "mo", "os", "sf", "fe", "el", "ll", "ls", "sb", "ba", "ae", "er" },
|
||||
new int[] { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
|
||||
new int[] { 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11 });
|
||||
}
|
||||
);
|
||||
}
|
||||
|
||||
public void testOversizedNgrams() throws Exception {
|
||||
NGramTokenFilter filter = new NGramTokenFilter(input, 6, 7);
|
||||
assertTokenStreamContents(filter, new String[0], new int[0], new int[0]);
|
||||
}
|
||||
|
||||
public void testSmallTokenInStream() throws Exception {
|
||||
input = new MockTokenizer(new StringReader("abc de fgh"), MockTokenizer.WHITESPACE, false);
|
||||
NGramTokenFilter filter = new NGramTokenFilter(input, 3, 3);
|
||||
assertTokenStreamContents(filter, new String[]{"abc","fgh"}, new int[]{0,7}, new int[]{3,10});
|
||||
}
|
||||
|
||||
public void testReset() throws Exception {
|
||||
WhitespaceTokenizer tokenizer = new WhitespaceTokenizer(TEST_VERSION_CURRENT, new StringReader("abcde"));
|
||||
NGramTokenFilter filter = new NGramTokenFilter(tokenizer, 1, 1);
|
||||
assertTokenStreamContents(filter, new String[]{"a","b","c","d","e"}, new int[]{0,1,2,3,4}, new int[]{1,2,3,4,5});
|
||||
tokenizer.reset(new StringReader("abcde"));
|
||||
assertTokenStreamContents(filter, new String[]{"a","b","c","d","e"}, new int[]{0,1,2,3,4}, new int[]{1,2,3,4,5});
|
||||
}
|
||||
|
||||
// LUCENE-3642
|
||||
// EdgeNgram blindly adds term length to offset, but this can take things out of bounds
|
||||
// wrt original text if a previous filter increases the length of the word (in this case æ -> ae)
|
||||
// so in this case we behave like WDF, and preserve any modified offsets
|
||||
public void testInvalidOffsets() throws Exception {
|
||||
Analyzer analyzer = new Analyzer() {
|
||||
@Override
|
||||
protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
|
||||
Tokenizer tokenizer = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false);
|
||||
TokenFilter filters = new ASCIIFoldingFilter(tokenizer);
|
||||
filters = new NGramTokenFilter(filters, 2, 2);
|
||||
return new TokenStreamComponents(tokenizer, filters);
|
||||
}
|
||||
};
|
||||
assertAnalyzesTo(analyzer, "mosfellsbær",
|
||||
new String[] { "mo", "os", "sf", "fe", "el", "ll", "ls", "sb", "ba", "ae", "er" },
|
||||
new int[] { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
|
||||
new int[] { 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11 });
|
||||
}
|
||||
|
||||
/** blast some random strings through the analyzer */
|
||||
public void testRandomStrings() throws Exception {
|
||||
Analyzer a = new Analyzer() {
|
||||
@Override
|
||||
protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
|
||||
Tokenizer tokenizer = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false);
|
||||
return new TokenStreamComponents(tokenizer,
|
||||
new NGramTokenFilter(tokenizer, 2, 15));
|
||||
}
|
||||
};
|
||||
checkRandomData(random, a, 10000*RANDOM_MULTIPLIER);
|
||||
}
|
||||
}
|
||||
|
|
|
@ -18,71 +18,86 @@ package org.apache.lucene.analysis.ngram;
|
|||
*/
|
||||
|
||||
|
||||
import java.io.Reader;
|
||||
import java.io.StringReader;
|
||||
|
||||
import org.apache.lucene.analysis.Analyzer;
|
||||
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
|
||||
import org.apache.lucene.analysis.Tokenizer;
|
||||
|
||||
/**
|
||||
* Tests {@link NGramTokenizer} for correctness.
|
||||
*/
|
||||
public class NGramTokenizerTest extends BaseTokenStreamTestCase {
|
||||
private StringReader input;
|
||||
|
||||
@Override
|
||||
public void setUp() throws Exception {
|
||||
super.setUp();
|
||||
input = new StringReader("abcde");
|
||||
private StringReader input;
|
||||
|
||||
@Override
|
||||
public void setUp() throws Exception {
|
||||
super.setUp();
|
||||
input = new StringReader("abcde");
|
||||
}
|
||||
|
||||
public void testInvalidInput() throws Exception {
|
||||
boolean gotException = false;
|
||||
try {
|
||||
new NGramTokenizer(input, 2, 1);
|
||||
} catch (IllegalArgumentException e) {
|
||||
gotException = true;
|
||||
}
|
||||
|
||||
public void testInvalidInput() throws Exception {
|
||||
boolean gotException = false;
|
||||
try {
|
||||
new NGramTokenizer(input, 2, 1);
|
||||
} catch (IllegalArgumentException e) {
|
||||
gotException = true;
|
||||
}
|
||||
assertTrue(gotException);
|
||||
assertTrue(gotException);
|
||||
}
|
||||
|
||||
public void testInvalidInput2() throws Exception {
|
||||
boolean gotException = false;
|
||||
try {
|
||||
new NGramTokenizer(input, 0, 1);
|
||||
} catch (IllegalArgumentException e) {
|
||||
gotException = true;
|
||||
}
|
||||
|
||||
public void testInvalidInput2() throws Exception {
|
||||
boolean gotException = false;
|
||||
try {
|
||||
new NGramTokenizer(input, 0, 1);
|
||||
} catch (IllegalArgumentException e) {
|
||||
gotException = true;
|
||||
}
|
||||
assertTrue(gotException);
|
||||
}
|
||||
|
||||
public void testUnigrams() throws Exception {
|
||||
NGramTokenizer tokenizer = new NGramTokenizer(input, 1, 1);
|
||||
assertTokenStreamContents(tokenizer, new String[]{"a","b","c","d","e"}, new int[]{0,1,2,3,4}, new int[]{1,2,3,4,5}, 5 /* abcde */);
|
||||
}
|
||||
|
||||
public void testBigrams() throws Exception {
|
||||
NGramTokenizer tokenizer = new NGramTokenizer(input, 2, 2);
|
||||
assertTokenStreamContents(tokenizer, new String[]{"ab","bc","cd","de"}, new int[]{0,1,2,3}, new int[]{2,3,4,5}, 5 /* abcde */);
|
||||
}
|
||||
|
||||
public void testNgrams() throws Exception {
|
||||
NGramTokenizer tokenizer = new NGramTokenizer(input, 1, 3);
|
||||
assertTokenStreamContents(tokenizer,
|
||||
new String[]{"a","b","c","d","e", "ab","bc","cd","de", "abc","bcd","cde"},
|
||||
new int[]{0,1,2,3,4, 0,1,2,3, 0,1,2},
|
||||
new int[]{1,2,3,4,5, 2,3,4,5, 3,4,5},
|
||||
5 /* abcde */
|
||||
assertTrue(gotException);
|
||||
}
|
||||
|
||||
public void testUnigrams() throws Exception {
|
||||
NGramTokenizer tokenizer = new NGramTokenizer(input, 1, 1);
|
||||
assertTokenStreamContents(tokenizer, new String[]{"a","b","c","d","e"}, new int[]{0,1,2,3,4}, new int[]{1,2,3,4,5}, 5 /* abcde */);
|
||||
}
|
||||
|
||||
public void testBigrams() throws Exception {
|
||||
NGramTokenizer tokenizer = new NGramTokenizer(input, 2, 2);
|
||||
assertTokenStreamContents(tokenizer, new String[]{"ab","bc","cd","de"}, new int[]{0,1,2,3}, new int[]{2,3,4,5}, 5 /* abcde */);
|
||||
}
|
||||
|
||||
public void testNgrams() throws Exception {
|
||||
NGramTokenizer tokenizer = new NGramTokenizer(input, 1, 3);
|
||||
assertTokenStreamContents(tokenizer,
|
||||
new String[]{"a","b","c","d","e", "ab","bc","cd","de", "abc","bcd","cde"},
|
||||
new int[]{0,1,2,3,4, 0,1,2,3, 0,1,2},
|
||||
new int[]{1,2,3,4,5, 2,3,4,5, 3,4,5},
|
||||
5 /* abcde */
|
||||
);
|
||||
}
|
||||
|
||||
public void testOversizedNgrams() throws Exception {
|
||||
NGramTokenizer tokenizer = new NGramTokenizer(input, 6, 7);
|
||||
assertTokenStreamContents(tokenizer, new String[0], new int[0], new int[0], 5 /* abcde */);
|
||||
}
|
||||
|
||||
public void testReset() throws Exception {
|
||||
NGramTokenizer tokenizer = new NGramTokenizer(input, 1, 1);
|
||||
assertTokenStreamContents(tokenizer, new String[]{"a","b","c","d","e"}, new int[]{0,1,2,3,4}, new int[]{1,2,3,4,5}, 5 /* abcde */);
|
||||
tokenizer.reset(new StringReader("abcde"));
|
||||
assertTokenStreamContents(tokenizer, new String[]{"a","b","c","d","e"}, new int[]{0,1,2,3,4}, new int[]{1,2,3,4,5}, 5 /* abcde */);
|
||||
}
|
||||
}
|
||||
|
||||
public void testOversizedNgrams() throws Exception {
|
||||
NGramTokenizer tokenizer = new NGramTokenizer(input, 6, 7);
|
||||
assertTokenStreamContents(tokenizer, new String[0], new int[0], new int[0], 5 /* abcde */);
|
||||
}
|
||||
|
||||
public void testReset() throws Exception {
|
||||
NGramTokenizer tokenizer = new NGramTokenizer(input, 1, 1);
|
||||
assertTokenStreamContents(tokenizer, new String[]{"a","b","c","d","e"}, new int[]{0,1,2,3,4}, new int[]{1,2,3,4,5}, 5 /* abcde */);
|
||||
tokenizer.reset(new StringReader("abcde"));
|
||||
assertTokenStreamContents(tokenizer, new String[]{"a","b","c","d","e"}, new int[]{0,1,2,3,4}, new int[]{1,2,3,4,5}, 5 /* abcde */);
|
||||
}
|
||||
|
||||
/** blast some random strings through the analyzer */
|
||||
public void testRandomStrings() throws Exception {
|
||||
Analyzer a = new Analyzer() {
|
||||
@Override
|
||||
protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
|
||||
Tokenizer tokenizer = new NGramTokenizer(reader, 2, 15);
|
||||
return new TokenStreamComponents(tokenizer, tokenizer);
|
||||
}
|
||||
};
|
||||
checkRandomData(random, a, 10000*RANDOM_MULTIPLIER);
|
||||
}
|
||||
}
|
||||
|
|
|
@ -17,10 +17,13 @@ package org.apache.lucene.analysis.path;
|
|||
* limitations under the License.
|
||||
*/
|
||||
|
||||
import java.io.Reader;
|
||||
import java.io.StringReader;
|
||||
|
||||
import org.apache.lucene.analysis.Analyzer;
|
||||
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
|
||||
import org.apache.lucene.analysis.CharStream;
|
||||
import org.apache.lucene.analysis.Tokenizer;
|
||||
import org.apache.lucene.analysis.charfilter.MappingCharFilter;
|
||||
import org.apache.lucene.analysis.charfilter.NormalizeCharMap;
|
||||
|
||||
|
@ -193,4 +196,16 @@ public class TestPathHierarchyTokenizer extends BaseTokenStreamTestCase {
|
|||
new int[]{1},
|
||||
path.length());
|
||||
}
|
||||
|
||||
/** blast some random strings through the analyzer */
|
||||
public void testRandomStrings() throws Exception {
|
||||
Analyzer a = new Analyzer() {
|
||||
@Override
|
||||
protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
|
||||
Tokenizer tokenizer = new PathHierarchyTokenizer(reader);
|
||||
return new TokenStreamComponents(tokenizer, tokenizer);
|
||||
}
|
||||
};
|
||||
checkRandomData(random, a, 10000*RANDOM_MULTIPLIER);
|
||||
}
|
||||
}
|
||||
|
|
|
@ -17,9 +17,13 @@ package org.apache.lucene.analysis.path;
|
|||
* limitations under the License.
|
||||
*/
|
||||
|
||||
import java.io.Reader;
|
||||
import java.io.StringReader;
|
||||
|
||||
import org.apache.lucene.analysis.Analyzer;
|
||||
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
|
||||
import org.apache.lucene.analysis.Tokenizer;
|
||||
import org.apache.lucene.analysis.Analyzer.TokenStreamComponents;
|
||||
|
||||
public class TestReversePathHierarchyTokenizer extends BaseTokenStreamTestCase {
|
||||
|
||||
|
@ -154,4 +158,16 @@ public class TestReversePathHierarchyTokenizer extends BaseTokenStreamTestCase {
|
|||
new int[]{1, 0},
|
||||
path.length());
|
||||
}
|
||||
|
||||
/** blast some random strings through the analyzer */
|
||||
public void testRandomStrings() throws Exception {
|
||||
Analyzer a = new Analyzer() {
|
||||
@Override
|
||||
protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
|
||||
Tokenizer tokenizer = new ReversePathHierarchyTokenizer(reader);
|
||||
return new TokenStreamComponents(tokenizer, tokenizer);
|
||||
}
|
||||
};
|
||||
checkRandomData(random, a, 10000*RANDOM_MULTIPLIER);
|
||||
}
|
||||
}
|
||||
|
|
|
@@ -18,14 +18,17 @@
package org.apache.lucene.analysis.pattern;

import java.io.IOException;
import java.io.Reader;
import java.io.StringReader;
import java.util.regex.Pattern;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.analysis.CharReader;
import org.apache.lucene.analysis.CharStream;
import org.apache.lucene.analysis.MockTokenizer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;

/**
 * Tests {@link PatternReplaceCharFilter}
@@ -172,4 +175,21 @@ public class TestPatternReplaceCharFilter extends BaseTokenStreamTestCase {
  private Pattern pattern( String p ){
    return Pattern.compile( p );
  }

  /** blast some random strings through the analyzer */
  public void testRandomStrings() throws Exception {
    Analyzer a = new Analyzer() {
      @Override
      protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
        Tokenizer tokenizer = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false);
        return new TokenStreamComponents(tokenizer, tokenizer);
      }

      @Override
      protected Reader initReader(Reader reader) {
        return new PatternReplaceCharFilter(Pattern.compile("a"), "b", CharReader.get(reader));
      }
    };
    checkRandomData(random, a, 10000*RANDOM_MULTIPLIER);
  }
}
@@ -17,10 +17,13 @@

package org.apache.lucene.analysis.pattern;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.analysis.MockTokenizer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;

import java.io.Reader;
import java.io.StringReader;
import java.util.regex.Pattern;

@@ -77,5 +80,28 @@ public class TestPatternReplaceFilter extends BaseTokenStreamTestCase {
    assertTokenStreamContents(ts,
        new String[] { "aa$fooaa$fooa$foo$", "a$", "caaaaaaaaa$" });
  }

  /** blast some random strings through the analyzer */
  public void testRandomStrings() throws Exception {
    Analyzer a = new Analyzer() {
      @Override
      protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
        Tokenizer tokenizer = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false);
        TokenStream filter = new PatternReplaceFilter(tokenizer, Pattern.compile("a"), "b", false);
        return new TokenStreamComponents(tokenizer, filter);
      }
    };
    checkRandomData(random, a, 10000*RANDOM_MULTIPLIER);

    Analyzer b = new Analyzer() {
      @Override
      protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
        Tokenizer tokenizer = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false);
        TokenStream filter = new PatternReplaceFilter(tokenizer, Pattern.compile("a"), "b", true);
        return new TokenStreamComponents(tokenizer, filter);
      }
    };
    checkRandomData(random, b, 10000*RANDOM_MULTIPLIER);
  }

}
@@ -18,17 +18,22 @@
package org.apache.lucene.analysis.pattern;

import java.io.IOException;
import java.io.Reader;
import java.io.StringReader;
import java.util.ArrayList;
import java.util.List;
import java.util.regex.Pattern;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.analysis.CharReader;
import org.apache.lucene.analysis.CharStream;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.Analyzer.TokenStreamComponents;
import org.apache.lucene.analysis.charfilter.MappingCharFilter;
import org.apache.lucene.analysis.charfilter.NormalizeCharMap;
import org.apache.lucene.analysis.path.PathHierarchyTokenizer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

public class TestPatternTokenizer extends BaseTokenStreamTestCase
@@ -117,4 +122,35 @@ public class TestPatternTokenizer extends BaseTokenStreamTestCase
    in.close();
    return out.toString();
  }

  /** blast some random strings through the analyzer */
  public void testRandomStrings() throws Exception {
    Analyzer a = new Analyzer() {
      @Override
      protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
        Tokenizer tokenizer = null;
        try {
          tokenizer = new PatternTokenizer(reader, Pattern.compile("a"), -1);
        } catch (IOException e) {
          throw new RuntimeException(e);
        }
        return new TokenStreamComponents(tokenizer, tokenizer);
      }
    };
    checkRandomData(random, a, 10000*RANDOM_MULTIPLIER);

    Analyzer b = new Analyzer() {
      @Override
      protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
        Tokenizer tokenizer = null;
        try {
          tokenizer = new PatternTokenizer(reader, Pattern.compile("a"), 0);
        } catch (IOException e) {
          throw new RuntimeException(e);
        }
        return new TokenStreamComponents(tokenizer, tokenizer);
      }
    };
    checkRandomData(random, b, 10000*RANDOM_MULTIPLIER);
  }
}
@@ -17,11 +17,14 @@

package org.apache.lucene.analysis.reverse;

import java.io.Reader;
import java.io.StringReader;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.MockTokenizer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.util.Version;

public class TestReverseStringFilter extends BaseTokenStreamTestCase {
@@ -96,4 +99,16 @@ public class TestReverseStringFilter extends BaseTokenStreamTestCase {
    ReverseStringFilter.reverse(TEST_VERSION_CURRENT, buffer, 3, 7);
    assertEquals("abcfed𩬅愯瀛", new String(buffer));
  }

  /** blast some random strings through the analyzer */
  public void testRandomStrings() throws Exception {
    Analyzer a = new Analyzer() {
      @Override
      protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
        Tokenizer tokenizer = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false);
        return new TokenStreamComponents(tokenizer, new ReverseStringFilter(TEST_VERSION_CURRENT, tokenizer));
      }
    };
    checkRandomData(random, a, 10000*RANDOM_MULTIPLIER);
  }
}
@@ -18,9 +18,12 @@ package org.apache.lucene.analysis.shingle;
 */

import java.io.IOException;
import java.io.Reader;
import java.io.StringReader;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.analysis.MockTokenizer;
import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;

@@ -1129,4 +1132,16 @@ public class ShingleFilterTest extends BaseTokenStreamTestCase {
    token.setPositionIncrement(positionIncrement);
    return token;
  }

  /** blast some random strings through the analyzer */
  public void testRandomStrings() throws Exception {
    Analyzer a = new Analyzer() {
      @Override
      protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
        Tokenizer tokenizer = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false);
        return new TokenStreamComponents(tokenizer, new ShingleFilter(tokenizer));
      }
    };
    checkRandomData(random, a, 10000*RANDOM_MULTIPLIER);
  }
}
@@ -160,7 +160,7 @@ public class TestSegmentingTokenizerBase extends BaseTokenStreamTestCase {
        hasSentence = false;
        clearAttributes();
        termAtt.copyBuffer(buffer, sentenceStart, sentenceEnd-sentenceStart);
        offsetAtt.setOffset(offset+sentenceStart, offset+sentenceEnd);
        offsetAtt.setOffset(correctOffset(offset+sentenceStart), correctOffset(offset+sentenceEnd));
        return true;
      } else {
        return false;
@@ -215,7 +215,7 @@ public class TestSegmentingTokenizerBase extends BaseTokenStreamTestCase {

      clearAttributes();
      termAtt.copyBuffer(buffer, wordStart, wordEnd-wordStart);
      offsetAtt.setOffset(offset+wordStart, offset+wordEnd);
      offsetAtt.setOffset(correctOffset(offset+wordStart), correctOffset(offset+wordEnd));
      posIncAtt.setPositionIncrement(posIncAtt.getPositionIncrement() + posBoost);
      posBoost = 0;
      return true;
@@ -18,12 +18,15 @@

package org.apache.lucene.analysis.wikipedia;

import java.io.Reader;
import java.io.StringReader;
import java.io.IOException;
import java.util.Set;
import java.util.HashSet;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.tokenattributes.FlagsAttribute;

import static org.apache.lucene.analysis.wikipedia.WikipediaTokenizer.*;

@@ -169,4 +172,17 @@ public class WikipediaTokenizerTest extends BaseTokenStreamTestCase {
    assertFalse(tf.incrementToken());
    tf.close();
  }

  /** blast some random strings through the analyzer */
  public void testRandomStrings() throws Exception {
    Analyzer a = new Analyzer() {

      @Override
      protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
        Tokenizer tokenizer = new WikipediaTokenizer(reader);
        return new TokenStreamComponents(tokenizer, tokenizer);
      }
    };
    checkRandomData(random, a, 10000*RANDOM_MULTIPLIER);
  }
}
@@ -112,7 +112,24 @@ are part of the ICU4C package. See http://site.icu-project.org/ </echo>
      </assertions>
    </java>
  </target>

  <property name="html.strip.charfilter.supp.macros.output.file"
            location="../common/src/java/org/apache/lucene/analysis/charfilter/HTMLStripCharFilter.SUPPLEMENTARY.jflex-macro"/>

  <target name="gen-html-strip-charfilter-supp-macros" depends="compile-tools">
    <java
      classname="org.apache.lucene.analysis.icu.GenerateHTMLStripCharFilterSupplementaryMacros"
      dir="."
      fork="true"
      failonerror="true"
      output="${html.strip.charfilter.supp.macros.output.file}">
      <classpath>
        <path refid="additional.dependencies"/>
        <pathelement location="${build.dir}/classes/tools"/>
      </classpath>
    </java>
  </target>

  <target name="compile-tools" depends="common.compile-tools">
    <compile
      srcdir="src/tools/java"
@@ -111,7 +111,7 @@ public final class ICUTokenizer extends Tokenizer {
  @Override
  public void end() throws IOException {
    final int finalOffset = (length < 0) ? offset : offset + length;
    offsetAtt.setOffset(finalOffset, finalOffset);
    offsetAtt.setOffset(correctOffset(finalOffset), correctOffset(finalOffset));
  }

  /*
@@ -0,0 +1,110 @@
package org.apache.lucene.analysis.icu;

/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import java.text.DateFormat;
import java.util.*;

import com.ibm.icu.text.UnicodeSet;
import com.ibm.icu.text.UnicodeSetIterator;
import com.ibm.icu.util.VersionInfo;

/** creates a macro to augment jflex's unicode support for > BMP */
public class GenerateHTMLStripCharFilterSupplementaryMacros {
  private static final UnicodeSet BMP = new UnicodeSet("[\u0000-\uFFFF]");
  private static final String NL = System.getProperty("line.separator");
  private static final DateFormat DATE_FORMAT = DateFormat.getDateTimeInstance
      (DateFormat.FULL, DateFormat.FULL, Locale.US);
  static {
    DATE_FORMAT.setTimeZone(TimeZone.getTimeZone("UTC"));
  }

  private static final String APACHE_LICENSE
    = "/*" + NL
    + " * Copyright 2010 The Apache Software Foundation." + NL
    + " *" + NL
    + " * Licensed under the Apache License, Version 2.0 (the \"License\");" + NL
    + " * you may not use this file except in compliance with the License." + NL
    + " * You may obtain a copy of the License at" + NL
    + " *" + NL
    + " *      http://www.apache.org/licenses/LICENSE-2.0" + NL
    + " *" + NL
    + " * Unless required by applicable law or agreed to in writing, software" + NL
    + " * distributed under the License is distributed on an \"AS IS\" BASIS," + NL
    + " * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied." + NL
    + " * See the License for the specific language governing permissions and" + NL
    + " * limitations under the License." + NL
    + " */" + NL + NL;

  public static void main(String args[]) throws Exception {
    outputHeader();
    outputMacro("ID_Start_Supp", "[:ID_Start:]");
    outputMacro("ID_Continue_Supp", "[:ID_Continue:]");
  }

  static void outputHeader() {
    System.out.print(APACHE_LICENSE);
    System.out.print("// Generated using ICU4J " + VersionInfo.ICU_VERSION.toString() + " on ");
    System.out.println(DATE_FORMAT.format(new Date()));
    System.out.println("// by " + GenerateHTMLStripCharFilterSupplementaryMacros.class.getName());
    System.out.print(NL + NL);
  }

  // we have to carefully output the possibilities as compact utf-16
  // range expressions, or jflex will OOM!
  static void outputMacro(String name, String pattern) {
    UnicodeSet set = new UnicodeSet(pattern);
    set.removeAll(BMP);
    System.out.println(name + " = (");
    // if the set is empty, we have to do this or jflex will barf
    if (set.isEmpty()) {
      System.out.println("\t []");
    }

    HashMap<Character,UnicodeSet> utf16ByLead = new HashMap<Character,UnicodeSet>();
    for (UnicodeSetIterator it = new UnicodeSetIterator(set); it.next();) {
      char utf16[] = Character.toChars(it.codepoint);
      UnicodeSet trails = utf16ByLead.get(utf16[0]);
      if (trails == null) {
        trails = new UnicodeSet();
        utf16ByLead.put(utf16[0], trails);
      }
      trails.add(utf16[1]);
    }

    Map<String,UnicodeSet> utf16ByTrail = new HashMap<String,UnicodeSet>();
    for (Map.Entry<Character,UnicodeSet> entry : utf16ByLead.entrySet()) {
      String trail = entry.getValue().getRegexEquivalent();
      UnicodeSet leads = utf16ByTrail.get(trail);
      if (leads == null) {
        leads = new UnicodeSet();
        utf16ByTrail.put(trail, leads);
      }
      leads.add(entry.getKey());
    }

    boolean isFirst = true;
    for (Map.Entry<String,UnicodeSet> entry : utf16ByTrail.entrySet()) {
      System.out.print( isFirst ? "\t " : "\t| ");
      isFirst = false;
      System.out.println(entry.getValue().getRegexEquivalent() + entry.getKey());
    }
    System.out.println(")");
  }
}
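For context on the "compact utf-16 range expressions" comment in the tool above, the macro it prints into HTMLStripCharFilter.SUPPLEMENTARY.jflex-macro has roughly the following shape; the concrete ranges here are illustrative only, not actual generator output. Each alternative pairs a character class of lead surrogates with a character class of trail surrogates, so jflex never has to expand individual supplementary code points.

ID_Start_Supp = (
	   [\uD840-\uD85F][\uDC00-\uDFFF]
	| \uD860[\uDC00-\uDEFF]
)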
@@ -102,4 +102,9 @@ public class TestMorfologikAnalyzer extends BaseTokenStreamTestCase {
    assertPOSToken(ts, "list", "subst:sg:loc.voc:m3");
    assertPOSToken(ts, "lista", "subst:sg:dat.loc:f");
  }

  /** blast some random strings through the analyzer */
  public void testRandom() throws Exception {
    checkRandomData(random, getTestAnalyzer(), 10000 * RANDOM_MULTIPLIER);
  }
}
@@ -16,11 +16,17 @@
 */
package org.apache.lucene.analysis.phonetic;

import java.io.Reader;
import java.io.StringReader;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.analysis.MockTokenizer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.Analyzer.TokenStreamComponents;
import org.apache.lucene.analysis.core.WhitespaceTokenizer;
import org.apache.lucene.util._TestUtil;

public class DoubleMetaphoneFilterTest extends BaseTokenStreamTestCase {

@@ -65,4 +71,28 @@ public class DoubleMetaphoneFilterTest extends BaseTokenStreamTestCase {
    assertTokenStreamContents(filter, new String[] { "12345", "#$%@#^%&", "HL" });
  }

  public void testRandom() throws Exception {
    final int codeLen = _TestUtil.nextInt(random, 1, 8);
    Analyzer a = new Analyzer() {

      @Override
      protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
        Tokenizer tokenizer = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false);
        return new TokenStreamComponents(tokenizer, new DoubleMetaphoneFilter(tokenizer, codeLen, false));
      }

    };
    checkRandomData(random, a, 1000 * RANDOM_MULTIPLIER);

    Analyzer b = new Analyzer() {

      @Override
      protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
        Tokenizer tokenizer = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false);
        return new TokenStreamComponents(tokenizer, new DoubleMetaphoneFilter(tokenizer, codeLen, true));
      }

    };
    checkRandomData(random, b, 1000 * RANDOM_MULTIPLIER);
  }
}
@@ -17,6 +17,8 @@

package org.apache.lucene.analysis.phonetic;

import java.io.IOException;
import java.io.Reader;
import java.io.StringReader;

import org.apache.commons.codec.Encoder;
@@ -25,7 +27,9 @@ import org.apache.commons.codec.language.DoubleMetaphone;
import org.apache.commons.codec.language.Metaphone;
import org.apache.commons.codec.language.RefinedSoundex;
import org.apache.commons.codec.language.Soundex;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.analysis.MockTokenizer;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.core.WhitespaceTokenizer;

@@ -70,4 +74,33 @@ public class TestPhoneticFilter extends BaseTokenStreamTestCase {
    PhoneticFilter filter = new PhoneticFilter(tokenizer, encoder, inject);
    assertTokenStreamContents(filter, expected);
  }

  /** blast some random strings through the analyzer */
  public void testRandomStrings() throws IOException {
    Encoder encoders[] = new Encoder[] {
      new Metaphone(), new DoubleMetaphone(), new Soundex(), new RefinedSoundex(), new Caverphone()
    };

    for (final Encoder e : encoders) {
      Analyzer a = new Analyzer() {
        @Override
        protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
          Tokenizer tokenizer = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false);
          return new TokenStreamComponents(tokenizer, new PhoneticFilter(tokenizer, e, false));
        }
      };

      checkRandomData(random, a, 1000*RANDOM_MULTIPLIER);

      Analyzer b = new Analyzer() {
        @Override
        protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
          Tokenizer tokenizer = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false);
          return new TokenStreamComponents(tokenizer, new PhoneticFilter(tokenizer, e, false));
        }
      };

      checkRandomData(random, b, 1000*RANDOM_MULTIPLIER);
    }
  }
}
@@ -199,9 +199,6 @@ public abstract class PerfTask implements Cloneable {
    return new String(c);
  }

  /* (non-Javadoc)
   * @see java.lang.Object#toString()
   */
  @Override
  public String toString() {
    String padd = getPadding();
@@ -248,22 +245,23 @@ public abstract class PerfTask implements Cloneable {
  }

  /**
   * Task setup work that should not be measured for that specific task.
   * By default it does nothing, but tasks can implement this, moving work from
   * doLogic() to this method. Only the work done in doLogicis measured for this task.
   * Notice that higher level (sequence) tasks containing this task would then
   * measure larger time than the sum of their contained tasks.
   * @throws Exception
   * Task setup work that should not be measured for that specific task. By
   * default it does nothing, but tasks can implement this, moving work from
   * {@link #doLogic()} to this method. Only the work done in {@link #doLogic()}
   * is measured for this task. Notice that higher level (sequence) tasks
   * containing this task would then measure larger time than the sum of their
   * contained tasks.
   */
  public void setup () throws Exception {
  }

  /**
   * Task tearDown work that should not be measured for that specific task.
   * By default it does nothing, but tasks can implement this, moving work from
   * doLogic() to this method. Only the work done in doLogicis measured for this task.
   * Notice that higher level (sequence) tasks containing this task would then
   * measure larger time than the sum of their contained tasks.
   * Task tearDown work that should not be measured for that specific task. By
   * default it does nothing, but tasks can implement this, moving work from
   * {@link #doLogic()} to this method. Only the work done in {@link #doLogic()}
   * is measured for this task. Notice that higher level (sequence) tasks
   * containing this task would then measure larger time than the sum of their
   * contained tasks.
   */
  public void tearDown() throws Exception {
    if (++logStepCount % logStep == 0) {
@@ -274,16 +272,20 @@ public abstract class PerfTask implements Cloneable {
  }

  /**
   * Sub classes that supports parameters must override this method to return true.
   * Sub classes that support parameters must override this method to return
   * true.
   *
   * @return true iff this task supports command line params.
   */
  public boolean supportsParams () {
    return false;
  }

  /**
   * Set the params of this task.
   * @exception UnsupportedOperationException for tasks supporting command line parameters.
   *
   * @exception UnsupportedOperationException
   *              for tasks supporting command line parameters.
   */
  public void setParams(String params) {
    if (!supportsParams()) {
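The reworded javadoc above describes the split between un-timed setup()/tearDown() and the measured doLogic(). A hedged sketch of a task that follows that contract (the class name and its body are made up for illustration and are not part of this commit; only the PerfTask hooks it overrides come from the code above):

import org.apache.lucene.benchmark.byTask.PerfRunData;
import org.apache.lucene.benchmark.byTask.tasks.PerfTask;

// Hypothetical task: preparation happens in setup(), only doLogic() is timed.
public class DemoTask extends PerfTask {

  private String[] lines; // prepared in setup(), consumed in doLogic()

  public DemoTask(PerfRunData runData) {
    super(runData);
  }

  @Override
  public void setup() throws Exception {
    // un-timed preparation; keeping it out of doLogic() keeps the measurement clean
    lines = new String[] { "a", "b", "c" };
  }

  @Override
  public int doLogic() throws Exception {
    // the measured work; return the number of work items performed
    int count = 0;
    for (String line : lines) {
      count += line.length() > 0 ? 1 : 0;
    }
    return count;
  }

  @Override
  public void tearDown() throws Exception {
    lines = null;     // un-timed cleanup
    super.tearDown(); // keep the base class log-step accounting shown above
  }
}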
@@ -4,6 +4,7 @@ import java.io.IOException;
import java.util.Iterator;
import java.util.Map;
import java.util.Map.Entry;
import java.util.concurrent.atomic.AtomicInteger;
import java.util.concurrent.locks.ReadWriteLock;
import java.util.concurrent.locks.ReentrantReadWriteLock;
import java.util.logging.Level;
@@ -100,6 +101,9 @@ public class DirectoryTaxonomyReader implements TaxonomyReader {

  private volatile boolean closed = false;

  // set refCount to 1 at start
  private final AtomicInteger refCount = new AtomicInteger(1);

  /**
   * Open for reading a taxonomy stored in a given {@link Directory}.
   * @param directory
@@ -130,7 +134,7 @@ public class DirectoryTaxonomyReader implements TaxonomyReader {
   * @throws AlreadyClosedException if this IndexReader is closed
   */
  protected final void ensureOpen() throws AlreadyClosedException {
    if (indexReader.getRefCount() <= 0) {
    if (getRefCount() <= 0) {
      throw new AlreadyClosedException("this TaxonomyReader is closed");
    }
  }
@@ -415,8 +419,12 @@ public class DirectoryTaxonomyReader implements TaxonomyReader {

  public void close() throws IOException {
    if (!closed) {
      decRef();
      closed = true;
      synchronized (this) {
        if (!closed) {
          decRef();
          closed = true;
        }
      }
    }
  }

@@ -555,27 +563,31 @@ public class DirectoryTaxonomyReader implements TaxonomyReader {
  }

  /**
   * Expert: decreases the refCount of this TaxonomyReader instance.
   * If the refCount drops to 0, then pending changes (if any) are
   * committed to the taxonomy index and this reader is closed.
   * @throws IOException
   * Expert: decreases the refCount of this TaxonomyReader instance. If the
   * refCount drops to 0, then this reader is closed.
   */
  public void decRef() throws IOException {
    ensureOpen();
    if (indexReader.getRefCount() == 1) {
      // Do not decRef the indexReader - doClose does it by calling reader.close()
      doClose();
    } else {
      indexReader.decRef();
    final int rc = refCount.decrementAndGet();
    if (rc == 0) {
      boolean success = false;
      try {
        doClose();
        success = true;
      } finally {
        if (!success) {
          // Put reference back on failure
          refCount.incrementAndGet();
        }
      }
    } else if (rc < 0) {
      throw new IllegalStateException("too many decRef calls: refCount is " + rc + " after decrement");
    }
  }

  /**
   * Expert: returns the current refCount for this taxonomy reader
   */
  /** Expert: returns the current refCount for this taxonomy reader */
  public int getRefCount() {
    ensureOpen();
    return this.indexReader.getRefCount();
    return refCount.get();
  }

  /**
@@ -587,6 +599,6 @@ public class DirectoryTaxonomyReader implements TaxonomyReader {
   */
  public void incRef() {
    ensureOpen();
    this.indexReader.incRef();
    refCount.incrementAndGet();
  }
}
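The change above moves DirectoryTaxonomyReader to its own AtomicInteger-based refCount instead of piggy-backing on the wrapped IndexReader. A hedged usage sketch of the pattern this is meant to support (not part of this commit; it mirrors the incRef/decRef/close contract documented above and assumes a Directory named dir that already holds a taxonomy):

import java.io.IOException;

import org.apache.lucene.facet.taxonomy.directory.DirectoryTaxonomyReader;
import org.apache.lucene.store.Directory;

public class TaxonomyReaderRefCountSketch {
  static void useAndRelease(Directory dir) throws IOException {
    DirectoryTaxonomyReader taxoReader = new DirectoryTaxonomyReader(dir); // refCount == 1
    taxoReader.incRef();        // a second consumer takes a reference (refCount == 2)
    try {
      // ... read categories on this thread ...
    } finally {
      taxoReader.decRef();      // release the extra reference
    }
    taxoReader.close();         // drops the initial reference; the reader closes at 0
  }
}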
@@ -11,6 +11,7 @@ import org.apache.lucene.facet.taxonomy.directory.DirectoryTaxonomyWriter;
import org.apache.lucene.index.IndexWriterConfig.OpenMode;
import org.apache.lucene.store.AlreadyClosedException;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.RAMDirectory;
import org.apache.lucene.util.IOUtils;
import org.apache.lucene.util.LuceneTestCase;
import org.junit.Test;
@@ -178,4 +179,28 @@ public class TestDirectoryTaxonomyReader extends LuceneTestCase {
    }
  }

  @Test
  public void testRefreshAndRefCount() throws Exception {
    Directory dir = new RAMDirectory(); // no need for random directories here

    DirectoryTaxonomyWriter taxoWriter = new DirectoryTaxonomyWriter(dir);
    taxoWriter.addCategory(new CategoryPath("a"));
    taxoWriter.commit();

    DirectoryTaxonomyReader taxoReader = new DirectoryTaxonomyReader(dir);
    assertEquals("wrong refCount", 1, taxoReader.getRefCount());

    taxoReader.incRef();
    assertEquals("wrong refCount", 2, taxoReader.getRefCount());

    taxoWriter.addCategory(new CategoryPath("a", "b"));
    taxoWriter.commit();
    taxoReader.refresh();
    assertEquals("wrong refCount", 2, taxoReader.getRefCount());

    taxoWriter.close();
    taxoReader.close();
    dir.close();
  }

}
@@ -42,7 +42,7 @@
  <h2>Search-time joins</h2>

  <p>
    The query time joining is terms based and implemented as two pass search. The first pass collects all the terms from a fromField
    The query time joining is index term based and implemented as two pass search. The first pass collects all the terms from a fromField
    that match the fromQuery. The second pass returns all documents that have matching terms in a toField to the terms
    collected in the first pass.
  </p>
@@ -62,7 +62,7 @@
  <pre class="prettyprint">
    String fromField = "from"; // Name of the from field
    boolean multipleValuesPerDocument = false; // Set only to true in the case when your fromField has multiple values per document in your index
    String fromField = "to"; // Name of the to field
    String toField = "to"; // Name of the to field
    Query fromQuery = new TermQuery(new Term("content", searchTerm)); // Query executed to collect from values to join to the to values

    MultiTermQuery joinQuery = JoinUtil.createJoinQuery(fromField, multipleValuesPerDocument, toField, fromQuery, fromSearcher);
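A hedged usage sketch to round out the snippet above (not part of the package documentation; it assumes an IndexSearcher named toSearcher over the index holding the toField, following the naming of the example): the query returned by JoinUtil is simply executed against the "to" side, and every hit is a document whose toField value was collected during the first pass.

    TopDocs topDocs = toSearcher.search(joinQuery, 10); // documents matched via the join
    for (ScoreDoc scoreDoc : topDocs.scoreDocs) {
      Document joined = toSearcher.doc(scoreDoc.doc);
      // ... use the joined document ...
    }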
@@ -24,11 +24,11 @@ $Id$
================== 4.0.0-dev ==================
Versions of Major Components
---------------------
Apache Tika 0.10
Apache Tika 1.0
Carrot2 3.5.0
Velocity 1.6.4 and Velocity Tools 2.0
Apache UIMA 2.3.1
Apache ZooKeeper 3.3.3
Apache ZooKeeper 3.3.4


Upgrading from Solr 3.6-dev
@@ -401,6 +401,14 @@ Upgrading from Solr 3.5
* As doGet() methods in SimplePostTool was changed to static, the client applications of this
  class need to be recompiled.

* In Solr version 3.5 and earlier, HTMLStripCharFilter had known bugs in the
  character offsets it provided, triggering e.g. exceptions in highlighting.
  HTMLStripCharFilter has been re-implemented, addressing this and other
  issues. See the entry for LUCENE-3690 in the Bug Fixes section below for a
  detailed list of changes. For people who depend on the behavior of
  HTMLStripCharFilter in Solr version 3.5 and earlier: the old implementation
  (bugs and all) is preserved as LegacyHTMLStripCharFilter.

New Features
----------------------
* SOLR-2904: BinaryUpdateRequestHandler should be able to accept multiple update requests from
@@ -442,6 +450,10 @@ New Features
* SOLR-1709: Distributed support for Date and Numeric Range Faceting
  (Peter Sturge, David Smiley, hossman, Simon Willnauer)

* SOLR-3054, LUCENE-3671: Add TypeTokenFilterFactory that creates TypeTokenFilter
  that filters tokens based on their TypeAttribute. (Tommaso Teofili via
  Uwe Schindler)

Optimizations
----------------------
* SOLR-1931: Speedup for LukeRequestHandler and admin/schema browser. New parameter
@@ -483,6 +495,52 @@ Bug Fixes

* SOLR-2970: CSV ResponseWriter returns fields defined as stored=false in schema (janhoy)

* LUCENE-3690, LUCENE-2208, SOLR-882, SOLR-42: Re-implemented
  HTMLStripCharFilter as a JFlex-generated scanner. See below for a list
  of bug fixes and other changes. To get the same behavior as
  HTMLStripCharFilter in Solr version 3.5 and earlier (including the bugs),
  use LegacyHTMLStripCharFilter, which is the previous implementation.

  Behavior changes from the previous version:

  - Known offset bugs are fixed.
  - The "Mark invalid" exceptions reported in SOLR-1283 are no longer
    triggered (the bug is still present in LegacyHTMLStripCharFilter).
  - The character entity "&apos;" is now always properly decoded.
  - More cases of <script> tags are now properly stripped.
  - CDATA sections are now handled properly.
  - Valid tag name characters now include the supplementary Unicode characters
    from Unicode character classes [:ID_Start:] and [:ID_Continue:].
  - Uppercase character entities "&QUOT;", "&COPY;", "&GT;", "&LT;", "&REG;",
    and "&AMP;" are now recognized and handled as if they were in lowercase.
  - The REPLACEMENT CHARACTER U+FFFD is now used to replace numeric character
    entities for unpaired UTF-16 low and high surrogates (in the range
    [U+D800-U+DFFF]).
  - Properly paired numeric character entities for UTF-16 surrogates are now
    converted to the corresponding code units.
  - Opening tags with unbalanced quotation marks are now properly stripped.
  - Literal "<" and ">" characters in opening tags, regardless of whether they
    appear inside quotation marks, now inhibit recognition (and stripping) of
    the tags. The only exception to this is for values of event-handler
    attributes, e.g. "onClick", "onLoad", "onSelect".
  - A newline '\n' is substituted instead of a space for stripped HTML markup.
  - Nothing is substituted for opening and closing inline tags - they are
    simply removed. The list of inline tags is (case insensitively): <a>,
    <abbr>, <acronym>, <b>, <basefont>, <bdo>, <big>, <cite>, <code>, <dfn>,
    <em>, <font>, <i>, <img>, <input>, <kbd>, <label>, <q>, <s>, <samp>,
    <select>, <small>, <span>, <strike>, <strong>, <sub>, <sup>, <textarea>,
    <tt>, <u>, and <var>.
  - HTMLStripCharFilterFactory now handles HTMLStripCharFilter's "escapedTags"
    feature: opening and closing tags with the given names, including any
    attributes and their values, are left intact in the output.
  (Steve Rowe)

* LUCENE-3717: Fixed offset bugs in TrimFilter, WordDelimiterFilter, and
  HyphenatedWordsFilter where they would create invalid offsets in
  some situations, leading to problems in highlighting. (Robert Muir)

* SOLR-2280: commitWithin ignored for a delete query (Juan Grande via janhoy)

Other Changes
----------------------
* SOLR-2922: Upgrade commons-io and commons-lang to 2.1 and 2.6, respectively. (koji)
@@ -498,6 +556,8 @@ Other Changes
* SOLR-2718: Add ability to lazy load response writers, defined with startup="lazy".
  (ehatcher)

* SOLR-2901: Upgrade Solr to Tika 1.0 (janhoy)

Build
----------------------
* SOLR-2487: Add build target to package war without slf4j jars (janhoy)
@@ -482,7 +482,7 @@
    <packageset dir="contrib/langid/src/java"/>
    <packageset dir="contrib/uima/src/java"/>
    <group title="Core" packages="org.apache.*" />
    <group title="SolrJ" packages="org.apache.solr.common.*,org.apache.solr.client.solrj*" />
    <group title="SolrJ" packages="org.apache.solr.common.*,org.apache.solr.client.solrj.*,org.apache.zookeeper.*" />
    <group title="contrib: Clustering" packages="org.apache.solr.handler.clustering*" />
    <group title="contrib: DataImportHandler" packages="org.apache.solr.handler.dataimport*" />
    <group title="contrib: Solr Cell" packages="org.apache.solr.handler.extraction*" />
@@ -0,0 +1,25 @@
#!/usr/bin/env bash

cd ..

rm -r -f example2

rm -r -f dist
rm -r -f build
rm -r -f example/solr/zoo_data
rm -f example/example.log

ant example dist

cp -r -f example example2


cd example
java -DzkRun -DnumShards=2 -DSTOP.PORT=7983 -DSTOP.KEY=key -Dbootstrap_confdir=solr/conf -jar start.jar 1>example.log 2>&1 &

sleep 10

cd ../example2
java -Djetty.port=9574 -DzkRun -DzkHost=localhost:9983 -DnumShards=2 -DSTOP.PORT=6574 -DSTOP.KEY=key -jar start.jar 1>example2.log 2>&1 &
@@ -0,0 +1,34 @@
#!/usr/bin/env bash

cd ..

rm -r -f example2
rm -r -f example3
rm -r -f example4

rm -r -f dist
rm -r -f build
rm -r -f example/solr/zoo_data
rm -f example/example.log

ant example dist

cp -r -f example example2
cp -r -f example example3
cp -r -f example example4


cd example
java -DzkRun -DnumShards=2 -DSTOP.PORT=7983 -DSTOP.KEY=key -Dbootstrap_confdir=solr/conf -jar start.jar 1>example.log 2>&1 &

sleep 10

cd ../example2
java -Djetty.port=9574 -DzkRun -DzkHost=localhost:9983 -DnumShards=2 -DSTOP.PORT=6574 -DSTOP.KEY=key -jar start.jar 1>example2.log 2>&1 &

cd ../example3
java -Djetty.port=9575 -DzkRun -DzkHost=localhost:9983 -DnumShards=2 -DSTOP.PORT=6575 -DSTOP.KEY=key -jar start.jar 1>example3.log 2>&1 &

cd ../example4
java -Djetty.port=9576 -DzkHost=localhost:9983 -DnumShards=2 -DSTOP.PORT=6576 -DSTOP.KEY=key -jar start.jar 1>example4.log 2>&1 &
@@ -0,0 +1,33 @@
#!/usr/bin/env bash

cd ..

rm -r -f example2
rm -r -f example3
rm -r -f example4

rm -r -f dist
rm -r -f build
rm -r -f example/solr/zoo_data
rm -f example/example.log

ant example dist

cp -r -f example example2
cp -r -f example example3
cp -r -f example example4


cd example
java -DzkRun -DnumShards=2 -DSTOP.PORT=7983 -DSTOP.KEY=key -Dbootstrap_confdir=solr/conf -DzkHost=localhost:9983,localhost:14574,localhost:14585 -jar start.jar 1>example.log 2>&1 &

sleep 10

cd ../example2
java -Djetty.port=13574 -DzkRun -DzkHost=localhost:9983,localhost:14574,localhost:14575 -DnumShards=2 -DSTOP.PORT=6574 -DSTOP.KEY=key -jar start.jar 1>example2.log 2>&1 &

cd ../example3
java -Djetty.port=13585 -DzkRun -DzkHost=localhost:9983,localhost:14574,localhost:14585 -DnumShards=2 -DSTOP.PORT=6575 -DSTOP.KEY=key -jar start.jar 1>example3.log 2>&1 &

cd ../example4
java -Djetty.port=13596 -DzkHost=localhost:9983,localhost:14574,localhost:14585 -DnumShards=2 -DSTOP.PORT=6576 -DSTOP.KEY=key -jar start.jar 1>example4.log 2>&1 &