merge trunk (1233476:1235908)

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/branches/lucene3661@1235919 13f79535-47bb-0310-9956-ffa450edef68
Robert Muir 2012-01-25 20:32:44 +00:00
commit 58e5ec6979
282 changed files with 55892 additions and 4375 deletions


@ -100,7 +100,7 @@
<classpathentry kind="lib" path="modules/benchmark/lib/commons-digester-1.7.jar"/>
<classpathentry kind="lib" path="modules/benchmark/lib/commons-logging-1.0.4.jar"/>
<classpathentry kind="lib" path="modules/benchmark/lib/xercesImpl-2.9.1-patched-XERCESJ-1257.jar"/>
<classpathentry kind="lib" path="solr/lib/apache-solr-noggit-r1209632.jar"/>
<classpathentry kind="lib" path="solr/lib/apache-solr-noggit-r1211150.jar"/>
<classpathentry kind="lib" path="solr/lib/commons-csv-1.0-SNAPSHOT-r966014.jar"/>
<classpathentry kind="lib" path="solr/lib/commons-fileupload-1.2.1.jar"/>
<classpathentry kind="lib" path="solr/lib/commons-httpclient-3.1.jar"/>
@ -115,7 +115,7 @@
<classpathentry kind="lib" path="solr/lib/slf4j-api-1.6.1.jar"/>
<classpathentry kind="lib" path="solr/lib/slf4j-jdk14-1.6.1.jar"/>
<classpathentry kind="lib" path="solr/lib/wstx-asl-3.2.7.jar"/>
<classpathentry kind="lib" path="solr/lib/zookeeper-3.3.3.jar"/>
<classpathentry kind="lib" path="solr/lib/zookeeper-3.3.4.jar"/>
<classpathentry kind="lib" path="solr/example/lib/jetty-6.1.26-patched-JETTY-1340.jar"/>
<classpathentry kind="lib" path="solr/example/lib/jetty-util-6.1.26-patched-JETTY-1340.jar"/>
<classpathentry kind="lib" path="solr/example/lib/servlet-api-2.5-20081211.jar"/>
@ -136,7 +136,7 @@
<classpathentry kind="lib" path="solr/contrib/extraction/lib/bcmail-jdk15-1.45.jar"/>
<classpathentry kind="lib" path="solr/contrib/extraction/lib/bcprov-jdk15-1.45.jar"/>
<classpathentry kind="lib" path="solr/contrib/extraction/lib/boilerpipe-1.1.0.jar"/>
<classpathentry kind="lib" path="solr/contrib/extraction/lib/commons-compress-1.2.jar"/>
<classpathentry kind="lib" path="solr/contrib/extraction/lib/commons-compress-1.3.jar"/>
<classpathentry kind="lib" path="solr/contrib/extraction/lib/dom4j-1.6.1.jar"/>
<classpathentry kind="lib" path="solr/contrib/extraction/lib/fontbox-1.6.0.jar"/>
<classpathentry kind="lib" path="solr/contrib/extraction/lib/jempbox-1.6.0.jar"/>
@ -149,8 +149,8 @@
<classpathentry kind="lib" path="solr/contrib/extraction/lib/poi-scratchpad-3.8-beta4.jar"/>
<classpathentry kind="lib" path="solr/contrib/extraction/lib/rome-0.9.jar"/>
<classpathentry kind="lib" path="solr/contrib/extraction/lib/tagsoup-1.2.1.jar"/>
<classpathentry kind="lib" path="solr/contrib/extraction/lib/tika-core-0.10.jar"/>
<classpathentry kind="lib" path="solr/contrib/extraction/lib/tika-parsers-0.10.jar"/>
<classpathentry kind="lib" path="solr/contrib/extraction/lib/tika-core-1.0.jar"/>
<classpathentry kind="lib" path="solr/contrib/extraction/lib/tika-parsers-1.0.jar"/>
<classpathentry kind="lib" path="solr/contrib/extraction/lib/xmlbeans-2.3.0.jar"/>
<classpathentry kind="lib" path="solr/contrib/langid/lib/langdetect-r111.jar"/>
<classpathentry kind="lib" path="solr/contrib/langid/lib/jsonic-1.2.0.jar"/>


@ -45,7 +45,7 @@
<jetty.version>6.1.26</jetty.version>
<patched.jetty.version>6.1.26-patched-JETTY-1340</patched.jetty.version>
<slf4j.version>1.6.1</slf4j.version>
<tika.version>0.10</tika.version>
<tika.version>1.0</tika.version>
</properties>
<issueManagement>
<system>JIRA</system>
@ -283,7 +283,7 @@
<dependency>
<groupId>org.apache.zookeeper</groupId>
<artifactId>zookeeper</artifactId>
<version>3.3.3</version>
<version>3.3.4</version>
</dependency>
<dependency>
<groupId>org.carrot2</groupId>
@ -362,6 +362,19 @@
</dependency>
</dependencies>
</dependencyManagement>
<dependencies>
<dependency>
<!-- Maven 2.2.X has a bug that omits as duplicate all JUnit -->
<!-- dependencies from the classpath when compiling solr-clustering, -->
<!-- causing test compilation to fail. Maven 3.0.4 test compilation -->
<!-- succeeds with the exact same dependencies, so apparently the -->
<!-- bug has been fixed. This dependency can be removed when the -->
<!-- minimum Maven version is upgraded to 3.0.4+. -->
<groupId>junit</groupId>
<artifactId>junit</artifactId>
<scope>test</scope>
</dependency>
</dependencies>
<build>
<directory>lucene/build/lucene-parent</directory>
<pluginManagement>
@ -385,6 +398,11 @@
<target>${java.compat.version}</target>
</configuration>
</plugin>
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-dependency-plugin</artifactId>
<version>2.4</version>
</plugin>
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-deploy-plugin</artifactId>
@ -652,7 +670,7 @@
<artifactId>solr-noggit</artifactId>
<version>${project.version}</version>
<packaging>jar</packaging>
<file>solr/lib/apache-solr-noggit-r1209632.jar</file>
<file>solr/lib/apache-solr-noggit-r1211150.jar</file>
</configuration>
</execution>
<execution>


@ -202,6 +202,12 @@
<testResource>
<directory>src/test-files</directory>
</testResource>
<testResource>
<directory>${project.build.testSourceDirectory}</directory>
<excludes>
<exclude>**/*.java</exclude>
</excludes>
</testResource>
<testResource>
<directory>../solrj/src/test-files</directory>
</testResource>


@ -113,6 +113,13 @@
<skip>true</skip> <!-- Tests are run from solr-core module -->
</configuration>
</plugin>
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-compiler-plugin</artifactId>
<configuration>
<skip>true</skip> <!-- This skips test compilation - tests are run from solr-core module -->
</configuration>
</plugin>
</plugins>
</build>
</project>


@ -742,6 +742,9 @@ Changes in backwards compatibility policy
behavior. Added seekExact() to FSTEnum, and added FST.save/read
from a File. (Mike McCandless, Dawid Weiss, Robert Muir)
* LUCENE-3712: Removed unused and untested ReaderUtil#subReader methods.
(Uwe Schindler)
Security fixes
* LUCENE-3588: Try harder to prevent SIGSEGV on cloned MMapIndexInputs:
@ -790,6 +793,12 @@ New Features
input mapping to it) for FSTs that have strictly monotonic long
outputs (such as an ord). (Mike McCandless)
* LUCENE-3671: Add TypeTokenFilter that filters tokens based on
their TypeAttribute. (Tommaso Teofili via Uwe Schindler)
* LUCENE-3690: Added HTMLStripCharFilter, a CharFilter that strips HTML
markup. (Steve Rowe)
Bug fixes
* LUCENE-3595: Fixed FieldCacheRangeFilter and FieldCacheTermsFilter
@ -808,9 +817,11 @@ Bug fixes
* LUCENE-3641: Fixed MultiReader to correctly propagate readerFinishedListeners
to clones/reopened readers. (Uwe Schindler)
* LUCENE-3642: Fixed bugs in CharTokenizer, n-gram filters, and smart chinese
where they would create invalid offsets in some situations, leading to problems
in highlighting. (Max Beutel via Robert Muir)
* LUCENE-3642, SOLR-2891, LUCENE-3717: Fixed bugs in CharTokenizer, n-gram tokenizers/filters,
compound token filters, ThaiWordFilter, ICUTokenizer, PatternAnalyzer,
WikipediaTokenizer, and the smart chinese analyzer where they would create invalid offsets in
some situations, leading to problems in highlighting.
(Max Beutel, Edwin Steiner via Robert Muir)
* LUCENE-3639: TopDocs.merge was incorrectly setting TopDocs.maxScore to
Float.MIN_VALUE when it should be Float.NaN, when there were 0
@ -825,6 +836,12 @@ Bug fixes
* LUCENE-3605: don't sleep in a retry loop when trying to locate the
segments_N file (Robert Muir, Mike McCandless)
* LUCENE-3711: SentinelIntSet with a small initial size can go into
an infinite loop when expanded. This can affect grouping using
TermAllGroupsCollector or TermAllGroupHeadsCollector if instantiated with a
non default small size. (Martijn van Groningen, yonik)
Optimizations
* LUCENE-3653: Improve concurrency in VirtualMethod and AttributeSource by


@ -52,12 +52,12 @@ public abstract class Analyzer {
* @param fieldName
* the name of the fields content passed to the
* {@link TokenStreamComponents} sink as a reader
* @param aReader
* @param reader
* the reader passed to the {@link Tokenizer} constructor
* @return the {@link TokenStreamComponents} for this analyzer.
*/
protected abstract TokenStreamComponents createComponents(String fieldName,
Reader aReader);
Reader reader);
/**
* Creates a TokenStream that is allowed to be re-use from the previous time


@ -206,7 +206,7 @@ public class SegmentTermDocs {
skipListReader = new Lucene40SkipListReader((IndexInput) freqStream.clone(), maxSkipLevels, skipInterval); // lazily clone
if (!haveSkipped) { // lazily initialize skip stream
skipListReader.init(skipPointer, freqBasePointer, proxBasePointer, df, currentFieldStoresPayloads);
skipListReader.init(skipPointer, freqBasePointer, proxBasePointer, df, currentFieldStoresPayloads, false);
haveSkipped = true;
}


@ -85,11 +85,11 @@ public class Lucene40FieldInfosReader extends FieldInfosReader {
// LUCENE-3027: past indices were able to write
// storePayloads=true when omitTFAP is also true,
// which is invalid. We correct that, here:
if (indexOptions != IndexOptions.DOCS_AND_FREQS_AND_POSITIONS) {
if (indexOptions.compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS) < 0) {
storePayloads = false;
}
hasVectors |= storeTermVector;
hasProx |= isIndexed && indexOptions == IndexOptions.DOCS_AND_FREQS_AND_POSITIONS;
hasProx |= isIndexed && indexOptions.compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS) >= 0;
hasFreq |= isIndexed && indexOptions != IndexOptions.DOCS_ONLY;
// DV Types are packed in one byte
byte val = input.readByte();
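The equality checks above become ordered compareTo checks because IndexOptions now forms an ordered progression ending in DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS, so "indexes at least positions" can no longer be tested with ==. A minimal standalone sketch of the idea, using a hypothetical stand-in enum rather than the real FieldInfo.IndexOptions:

enum IndexOptsSketch {
  DOCS_ONLY,
  DOCS_AND_FREQS,
  DOCS_AND_FREQS_AND_POSITIONS,
  DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS;

  // constants are declared in increasing order, so compareTo expresses "at least"
  boolean hasPositions() {
    return compareTo(DOCS_AND_FREQS_AND_POSITIONS) >= 0;
  }
}

class IndexOptsSketchDemo {
  public static void main(String[] args) {
    System.out.println(IndexOptsSketch.DOCS_AND_FREQS.hasPositions());                           // false
    System.out.println(IndexOptsSketch.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS.hasPositions()); // true
  }
}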


@ -58,7 +58,7 @@ public class Lucene40FieldInfosWriter extends FieldInfosWriter {
output.writeVInt(FORMAT_CURRENT);
output.writeVInt(infos.size());
for (FieldInfo fi : infos) {
assert fi.indexOptions == IndexOptions.DOCS_AND_FREQS_AND_POSITIONS || !fi.storePayloads;
assert fi.indexOptions.compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS) >= 0 || !fi.storePayloads;
byte bits = 0x0;
if (fi.isIndexed) bits |= IS_INDEXED;
if (fi.storeTermVector) bits |= STORE_TERMVECTOR;


@ -197,7 +197,7 @@ public class Lucene40PostingsReader extends PostingsReaderBase {
// undefined
}
if (fieldInfo.indexOptions == IndexOptions.DOCS_AND_FREQS_AND_POSITIONS) {
if (fieldInfo.indexOptions.compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS) >= 0) {
if (isFirstTerm) {
termState.proxOffset = termState.bytesReader.readVLong();
} else {
@ -245,23 +245,23 @@ public class Lucene40PostingsReader extends PostingsReaderBase {
DocsAndPositionsEnum reuse, boolean needsOffsets)
throws IOException {
if (needsOffsets) {
// TODO: once we index offsets into postings fix this!
return null;
boolean hasOffsets = fieldInfo.indexOptions.compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS) >= 0;
if (needsOffsets && !hasOffsets) {
return null; // not available
}
// TODO: refactor
if (fieldInfo.storePayloads) {
SegmentDocsAndPositionsAndPayloadsEnum docsEnum;
if (reuse == null || !(reuse instanceof SegmentDocsAndPositionsAndPayloadsEnum)) {
docsEnum = new SegmentDocsAndPositionsAndPayloadsEnum(freqIn, proxIn);
if (fieldInfo.storePayloads || hasOffsets) {
SegmentFullPositionsEnum docsEnum;
if (reuse == null || !(reuse instanceof SegmentFullPositionsEnum)) {
docsEnum = new SegmentFullPositionsEnum(freqIn, proxIn);
} else {
docsEnum = (SegmentDocsAndPositionsAndPayloadsEnum) reuse;
docsEnum = (SegmentFullPositionsEnum) reuse;
if (docsEnum.startFreqIn != freqIn) {
// If you are using ParallelReader, and pass in a
// reused DocsEnum, it could have come from another
// reader also using standard codec
docsEnum = new SegmentDocsAndPositionsAndPayloadsEnum(freqIn, proxIn);
docsEnum = new SegmentFullPositionsEnum(freqIn, proxIn);
}
}
return docsEnum.reset(fieldInfo, (StandardTermState) termState, liveDocs);
@ -295,6 +295,7 @@ public class Lucene40PostingsReader extends PostingsReaderBase {
protected boolean indexOmitsTF; // does current field omit term freq?
protected boolean storePayloads; // does current field store payloads?
protected boolean storeOffsets; // does current field store offsets?
protected int limit; // number of docs in this posting
protected int ord; // how many docs we've read
@ -324,6 +325,7 @@ public class Lucene40PostingsReader extends PostingsReaderBase {
DocsEnum reset(FieldInfo fieldInfo, StandardTermState termState) throws IOException {
indexOmitsTF = fieldInfo.indexOptions == IndexOptions.DOCS_ONLY;
storePayloads = fieldInfo.storePayloads;
storeOffsets = fieldInfo.indexOptions.compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS) >= 0;
freqOffset = termState.freqOffset;
skipOffset = termState.skipOffset;
@ -471,7 +473,7 @@ public class Lucene40PostingsReader extends PostingsReaderBase {
skipper.init(freqOffset + skipOffset,
freqOffset, 0,
limit, storePayloads);
limit, storePayloads, storeOffsets);
skipped = true;
}
@ -519,7 +521,7 @@ public class Lucene40PostingsReader extends PostingsReaderBase {
return doc = docs[i];
}
}
return refill();
return doc = refill();
}
@Override
@ -602,7 +604,7 @@ public class Lucene40PostingsReader extends PostingsReaderBase {
return doc = docs[i];
}
}
return refill();
return doc = refill();
}
@Override
@ -665,7 +667,7 @@ public class Lucene40PostingsReader extends PostingsReaderBase {
// TODO specialize DocsAndPosEnum too
// Decodes docs & positions. payloads are not present.
// Decodes docs & positions. Neither payloads nor offsets are present.
private final class SegmentDocsAndPositionsEnum extends DocsAndPositionsEnum {
final IndexInput startFreqIn;
private final IndexInput freqIn;
@ -792,7 +794,7 @@ public class Lucene40PostingsReader extends PostingsReaderBase {
skipper.init(freqOffset+skipOffset,
freqOffset, proxOffset,
limit, false);
limit, false, false);
skipped = true;
}
@ -868,8 +870,8 @@ public class Lucene40PostingsReader extends PostingsReaderBase {
}
}
// Decodes docs & positions & payloads
private class SegmentDocsAndPositionsAndPayloadsEnum extends DocsAndPositionsEnum {
// Decodes docs & positions & (payloads and/or offsets)
private class SegmentFullPositionsEnum extends DocsAndPositionsEnum {
final IndexInput startFreqIn;
private final IndexInput freqIn;
private final IndexInput proxIn;
@ -895,16 +897,24 @@ public class Lucene40PostingsReader extends PostingsReaderBase {
Lucene40SkipListReader skipper;
private BytesRef payload;
private long lazyProxPointer;
boolean storePayloads;
boolean storeOffsets;
int offsetLength;
int startOffset;
public SegmentDocsAndPositionsAndPayloadsEnum(IndexInput freqIn, IndexInput proxIn) throws IOException {
public SegmentFullPositionsEnum(IndexInput freqIn, IndexInput proxIn) throws IOException {
startFreqIn = freqIn;
this.freqIn = (IndexInput) freqIn.clone();
this.proxIn = (IndexInput) proxIn.clone();
}
public SegmentDocsAndPositionsAndPayloadsEnum reset(FieldInfo fieldInfo, StandardTermState termState, Bits liveDocs) throws IOException {
assert fieldInfo.indexOptions == IndexOptions.DOCS_AND_FREQS_AND_POSITIONS;
assert fieldInfo.storePayloads;
public SegmentFullPositionsEnum reset(FieldInfo fieldInfo, StandardTermState termState, Bits liveDocs) throws IOException {
storeOffsets = fieldInfo.indexOptions.compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS) >= 0;
storePayloads = fieldInfo.storePayloads;
assert fieldInfo.indexOptions.compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS) >= 0;
assert storePayloads || storeOffsets;
if (payload == null) {
payload = new BytesRef();
payload.bytes = new byte[1];
@ -923,6 +933,7 @@ public class Lucene40PostingsReader extends PostingsReaderBase {
doc = -1;
accum = 0;
position = 0;
startOffset = 0;
skipped = false;
posPendingCount = 0;
@ -963,6 +974,7 @@ public class Lucene40PostingsReader extends PostingsReaderBase {
}
position = 0;
startOffset = 0;
//System.out.println("StandardR.D&PE nextDoc seg=" + segment + " return doc=" + doc);
return (doc = accum);
@ -1001,7 +1013,7 @@ public class Lucene40PostingsReader extends PostingsReaderBase {
//System.out.println(" init skipper freqOffset=" + freqOffset + " skipOffset=" + skipOffset + " vs len=" + freqIn.length());
skipper.init(freqOffset+skipOffset,
freqOffset, proxOffset,
limit, true);
limit, storePayloads, storeOffsets);
skipped = true;
}
@ -1016,8 +1028,10 @@ public class Lucene40PostingsReader extends PostingsReaderBase {
lazyProxPointer = skipper.getProxPointer();
posPendingCount = 0;
position = 0;
startOffset = 0;
payloadPending = false;
payloadLength = skipper.getPayloadLength();
offsetLength = skipper.getOffsetLength();
}
}
@ -1038,27 +1052,38 @@ public class Lucene40PostingsReader extends PostingsReaderBase {
}
if (payloadPending && payloadLength > 0) {
// payload of last position as never retrieved -- skip it
// payload of last position was never retrieved -- skip it
proxIn.seek(proxIn.getFilePointer() + payloadLength);
payloadPending = false;
}
// scan over any docs that were iterated without their positions
while(posPendingCount > freq) {
final int code = proxIn.readVInt();
if ((code & 1) != 0) {
// new payload length
payloadLength = proxIn.readVInt();
assert payloadLength >= 0;
if (storePayloads) {
if ((code & 1) != 0) {
// new payload length
payloadLength = proxIn.readVInt();
assert payloadLength >= 0;
}
assert payloadLength != -1;
}
assert payloadLength != -1;
proxIn.seek(proxIn.getFilePointer() + payloadLength);
if (storeOffsets) {
if ((proxIn.readVInt() & 1) != 0) {
// new offset length
offsetLength = proxIn.readVInt();
}
}
if (storePayloads) {
proxIn.seek(proxIn.getFilePointer() + payloadLength);
}
posPendingCount--;
position = 0;
startOffset = 0;
payloadPending = false;
//System.out.println("StandardR.D&PE skipPos");
}
@ -1069,16 +1094,28 @@ public class Lucene40PostingsReader extends PostingsReaderBase {
proxIn.seek(proxIn.getFilePointer()+payloadLength);
}
final int code = proxIn.readVInt();
if ((code & 1) != 0) {
// new payload length
payloadLength = proxIn.readVInt();
assert payloadLength >= 0;
}
assert payloadLength != -1;
int code = proxIn.readVInt();
if (storePayloads) {
if ((code & 1) != 0) {
// new payload length
payloadLength = proxIn.readVInt();
assert payloadLength >= 0;
}
assert payloadLength != -1;
payloadPending = true;
position += code >>> 1;
payloadPending = true;
code >>>= 1;
}
position += code;
if (storeOffsets) {
int offsetCode = proxIn.readVInt();
if ((offsetCode & 1) != 0) {
// new offset length
offsetLength = proxIn.readVInt();
}
startOffset += offsetCode >>> 1;
}
posPendingCount--;
@ -1090,32 +1127,36 @@ public class Lucene40PostingsReader extends PostingsReaderBase {
@Override
public int startOffset() throws IOException {
return -1;
return storeOffsets ? startOffset : -1;
}
@Override
public int endOffset() throws IOException {
return -1;
return storeOffsets ? startOffset + offsetLength : -1;
}
/** Returns the payload at this position, or null if no
* payload was indexed. */
@Override
public BytesRef getPayload() throws IOException {
assert lazyProxPointer == -1;
assert posPendingCount < freq;
if (!payloadPending) {
throw new IOException("Either no payload exists at this term position or an attempt was made to load it more than once.");
}
if (payloadLength > payload.bytes.length) {
payload.grow(payloadLength);
}
if (storePayloads) {
assert lazyProxPointer == -1;
assert posPendingCount < freq;
if (!payloadPending) {
throw new IOException("Either no payload exists at this term position or an attempt was made to load it more than once.");
}
if (payloadLength > payload.bytes.length) {
payload.grow(payloadLength);
}
proxIn.readBytes(payload.bytes, 0, payloadLength);
payload.length = payloadLength;
payloadPending = false;
proxIn.readBytes(payload.bytes, 0, payloadLength);
payload.length = payloadLength;
payloadPending = false;
return payload;
return payload;
} else {
throw new IOException("No payloads exist for this field!");
}
}
@Override


@ -73,12 +73,15 @@ public final class Lucene40PostingsWriter extends PostingsWriterBase {
IndexOptions indexOptions;
boolean storePayloads;
boolean storeOffsets;
// Starts a new term
long freqStart;
long proxStart;
FieldInfo fieldInfo;
int lastPayloadLength;
int lastOffsetLength;
int lastPosition;
int lastOffset;
// private String segment;
@ -137,6 +140,8 @@ public final class Lucene40PostingsWriter extends PostingsWriterBase {
proxStart = proxOut.getFilePointer();
// force first payload to write its length
lastPayloadLength = -1;
// force first offset to write its length
lastOffsetLength = -1;
}
skipListWriter.resetSkip();
}
@ -155,10 +160,8 @@ public final class Lucene40PostingsWriter extends PostingsWriterBase {
*/
this.fieldInfo = fieldInfo;
indexOptions = fieldInfo.indexOptions;
if (indexOptions.compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS) >= 0) {
throw new UnsupportedOperationException("this codec cannot index offsets");
}
storeOffsets = indexOptions.compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS) >= 0;
storePayloads = fieldInfo.storePayloads;
//System.out.println(" set init blockFreqStart=" + freqStart);
//System.out.println(" set init blockProxStart=" + proxStart);
@ -180,7 +183,7 @@ public final class Lucene40PostingsWriter extends PostingsWriterBase {
}
if ((++df % skipInterval) == 0) {
skipListWriter.setSkipData(lastDocID, storePayloads, lastPayloadLength);
skipListWriter.setSkipData(lastDocID, storePayloads, lastPayloadLength, storeOffsets, lastOffsetLength);
skipListWriter.bufferSkip(df);
}
@ -197,31 +200,26 @@ public final class Lucene40PostingsWriter extends PostingsWriterBase {
}
lastPosition = 0;
lastOffset = 0;
}
/** Add a new position & payload */
@Override
public void addPosition(int position, BytesRef payload, int startOffset, int endOffset) throws IOException {
//if (DEBUG) System.out.println("SPW: addPos pos=" + position + " payload=" + (payload == null ? "null" : (payload.length + " bytes")) + " proxFP=" + proxOut.getFilePointer());
assert indexOptions == IndexOptions.DOCS_AND_FREQS_AND_POSITIONS: "invalid indexOptions: " + indexOptions;
assert indexOptions.compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS) >= 0 : "invalid indexOptions: " + indexOptions;
assert proxOut != null;
// TODO: when we add offsets... often
// endOffset-startOffset will be constant or near
// constant for all docs (eg if the term wasn't stemmed
// then this will usually be the utf16 length of the
// term); would be nice to write that length once up
// front and then not encode endOffset for each
// position..
final int delta = position - lastPosition;
assert delta >= 0: "position=" + position + " lastPosition=" + lastPosition; // not quite right (if pos=0 is repeated twice we don't catch it)
lastPosition = position;
int payloadLength = 0;
if (storePayloads) {
final int payloadLength = payload == null ? 0 : payload.length;
payloadLength = payload == null ? 0 : payload.length;
if (payloadLength != lastPayloadLength) {
lastPayloadLength = payloadLength;
@ -230,13 +228,28 @@ public final class Lucene40PostingsWriter extends PostingsWriterBase {
} else {
proxOut.writeVInt(delta << 1);
}
if (payloadLength > 0) {
proxOut.writeBytes(payload.bytes, payload.offset, payloadLength);
}
} else {
proxOut.writeVInt(delta);
}
if (storeOffsets) {
// don't use startOffset - lastEndOffset, because this creates lots of negative vints for synonyms,
// and the numbers aren't that much smaller anyways.
int offsetDelta = startOffset - lastOffset;
int offsetLength = endOffset - startOffset;
if (offsetLength != lastOffsetLength) {
proxOut.writeVInt(offsetDelta << 1 | 1);
proxOut.writeVInt(offsetLength);
} else {
proxOut.writeVInt(offsetDelta << 1);
}
lastOffset = startOffset;
lastOffsetLength = offsetLength;
}
if (payloadLength > 0) {
proxOut.writeBytes(payload.bytes, payload.offset, payloadLength);
}
}
@Override
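The offset encoding above reuses the delta-plus-flag scheme already used for positions and payloads: the start-offset delta is shifted left one bit, the low bit flags whether the offset length (endOffset - startOffset) changed, and the length is written only when it differs from the previous one. A small standalone sketch of that encode/decode arithmetic, with plain Java lists standing in for the VInt stream (illustrative only, not the actual Lucene40 writer or reader):

import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;

class OffsetDeltaSketch {
  // each input element is {startOffset, endOffset} for one position, in order
  static List<Integer> encode(int[][] offsets) {
    List<Integer> out = new ArrayList<Integer>();
    int lastOffset = 0;
    int lastLength = -1;                    // forces the first length to be written
    for (int[] o : offsets) {
      int delta = o[0] - lastOffset;
      int length = o[1] - o[0];
      if (length != lastLength) {
        out.add(delta << 1 | 1);            // low bit set: a new length follows
        out.add(length);
      } else {
        out.add(delta << 1);                // low bit clear: reuse the previous length
      }
      lastOffset = o[0];
      lastLength = length;
    }
    return out;
  }

  static int[][] decode(List<Integer> in, int count) {
    int[][] offsets = new int[count][2];
    Iterator<Integer> it = in.iterator();
    int start = 0;
    int length = 0;
    for (int i = 0; i < count; i++) {
      int code = it.next();
      if ((code & 1) != 0) {
        length = it.next();                 // length changed at this position
      }
      start += code >>> 1;
      offsets[i][0] = start;
      offsets[i][1] = start + length;
    }
    return offsets;
  }
}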
@ -304,7 +317,7 @@ public final class Lucene40PostingsWriter extends PostingsWriterBase {
assert firstTerm.skipOffset > 0;
bytesWriter.writeVInt(firstTerm.skipOffset);
}
if (indexOptions == IndexOptions.DOCS_AND_FREQS_AND_POSITIONS) {
if (indexOptions.compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS) >= 0) {
bytesWriter.writeVLong(firstTerm.proxStart);
}
long lastFreqStart = firstTerm.freqStart;
@ -319,7 +332,7 @@ public final class Lucene40PostingsWriter extends PostingsWriterBase {
assert term.skipOffset > 0;
bytesWriter.writeVInt(term.skipOffset);
}
if (indexOptions == IndexOptions.DOCS_AND_FREQS_AND_POSITIONS) {
if (indexOptions.compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS) >= 0) {
bytesWriter.writeVLong(term.proxStart - lastProxStart);
lastProxStart = term.proxStart;
}


@ -30,13 +30,16 @@ import org.apache.lucene.store.IndexInput;
*/
public class Lucene40SkipListReader extends MultiLevelSkipListReader {
private boolean currentFieldStoresPayloads;
private boolean currentFieldStoresOffsets;
private long freqPointer[];
private long proxPointer[];
private int payloadLength[];
private int offsetLength[];
private long lastFreqPointer;
private long lastProxPointer;
private int lastPayloadLength;
private int lastOffsetLength;
public Lucene40SkipListReader(IndexInput skipStream, int maxSkipLevels, int skipInterval) {
@ -44,17 +47,20 @@ public class Lucene40SkipListReader extends MultiLevelSkipListReader {
freqPointer = new long[maxSkipLevels];
proxPointer = new long[maxSkipLevels];
payloadLength = new int[maxSkipLevels];
offsetLength = new int[maxSkipLevels];
}
public void init(long skipPointer, long freqBasePointer, long proxBasePointer, int df, boolean storesPayloads) {
public void init(long skipPointer, long freqBasePointer, long proxBasePointer, int df, boolean storesPayloads, boolean storesOffsets) {
super.init(skipPointer, df);
this.currentFieldStoresPayloads = storesPayloads;
this.currentFieldStoresOffsets = storesOffsets;
lastFreqPointer = freqBasePointer;
lastProxPointer = proxBasePointer;
Arrays.fill(freqPointer, freqBasePointer);
Arrays.fill(proxPointer, proxBasePointer);
Arrays.fill(payloadLength, 0);
Arrays.fill(offsetLength, 0);
}
/** Returns the freq pointer of the doc to which the last call of
@ -76,12 +82,20 @@ public class Lucene40SkipListReader extends MultiLevelSkipListReader {
return lastPayloadLength;
}
/** Returns the offset length (endOffset-startOffset) of the position stored just before
* the doc to which the last call of {@link MultiLevelSkipListReader#skipTo(int)}
* has skipped. */
public int getOffsetLength() {
return lastOffsetLength;
}
@Override
protected void seekChild(int level) throws IOException {
super.seekChild(level);
freqPointer[level] = lastFreqPointer;
proxPointer[level] = lastProxPointer;
payloadLength[level] = lastPayloadLength;
offsetLength[level] = lastOffsetLength;
}
@Override
@ -90,6 +104,7 @@ public class Lucene40SkipListReader extends MultiLevelSkipListReader {
lastFreqPointer = freqPointer[level];
lastProxPointer = proxPointer[level];
lastPayloadLength = payloadLength[level];
lastOffsetLength = offsetLength[level];
}
@ -110,6 +125,11 @@ public class Lucene40SkipListReader extends MultiLevelSkipListReader {
} else {
delta = skipStream.readVInt();
}
if (currentFieldStoresOffsets) {
offsetLength[level] = skipStream.readVInt();
}
freqPointer[level] += skipStream.readVInt();
proxPointer[level] += skipStream.readVInt();


@ -40,7 +40,9 @@ public class Lucene40SkipListWriter extends MultiLevelSkipListWriter {
private int curDoc;
private boolean curStorePayloads;
private boolean curStoreOffsets;
private int curPayloadLength;
private int curOffsetLength;
private long curFreqPointer;
private long curProxPointer;
@ -58,10 +60,12 @@ public class Lucene40SkipListWriter extends MultiLevelSkipListWriter {
/**
* Sets the values for the current skip data.
*/
public void setSkipData(int doc, boolean storePayloads, int payloadLength) {
public void setSkipData(int doc, boolean storePayloads, int payloadLength, boolean storeOffsets, int offsetLength) {
this.curDoc = doc;
this.curStorePayloads = storePayloads;
this.curPayloadLength = payloadLength;
this.curStoreOffsets = storeOffsets;
this.curOffsetLength = offsetLength;
this.curFreqPointer = freqOutput.getFilePointer();
if (proxOutput != null)
this.curProxPointer = proxOutput.getFilePointer();
@ -116,6 +120,12 @@ public class Lucene40SkipListWriter extends MultiLevelSkipListWriter {
// current field does not store payloads
skipBuffer.writeVInt(curDoc - lastSkipDoc[level]);
}
// TODO: not sure it really helps to shove this somewhere else if its the same as the last skip
if (curStoreOffsets) {
skipBuffer.writeVInt(curOffsetLength);
}
skipBuffer.writeVInt((int) (curFreqPointer - lastSkipFreqPointer[level]));
skipBuffer.writeVInt((int) (curProxPointer - lastSkipProxPointer[level]));


@ -548,8 +548,9 @@ class SimpleTextFieldsReader extends FieldsProducer {
UnicodeUtil.UTF8toUTF16(scratch.bytes, scratch.offset+DOC.length, scratch.length-DOC.length, scratchUTF16);
int docID = ArrayUtil.parseInt(scratchUTF16.chars, 0, scratchUTF16.length);
visitedDocs.set(docID);
} else if (StringHelper.startsWith(scratch, POS)) {
totalTermFreq++;
} else if (StringHelper.startsWith(scratch, FREQ)) {
UnicodeUtil.UTF8toUTF16(scratch.bytes, scratch.offset+FREQ.length, scratch.length-FREQ.length, scratchUTF16);
totalTermFreq += ArrayUtil.parseInt(scratchUTF16.chars, 0, scratchUTF16.length);
} else if (StringHelper.startsWith(scratch, TERM)) {
if (lastDocsStart != -1) {
b.add(Util.toIntsRef(lastTerm, scratchIntsRef), new PairOutputs.Pair<Long,PairOutputs.Pair<Long,Long>>(lastDocsStart,


@ -404,7 +404,7 @@ public final class FieldInfos implements Iterable<FieldInfo> {
public boolean hasNorms() {
for (FieldInfo fi : this) {
if (fi.isIndexed && !fi.omitNorms) {
if (fi.normsPresent()) {
return true;
}
}


@ -921,13 +921,7 @@ public abstract class IndexReader implements Closeable {
* If this method returns an empty array, that means this
* reader is a null reader (for example a MultiReader
* that has no sub readers).
* <p>
* NOTE: You should not try using sub-readers returned by
* this method to make any changes (deleteDocument,
* etc.). While this might succeed for one composite reader
* (like MultiReader), it will most likely lead to index
* corruption for other readers (like DirectoryReader obtained
* through {@link #open}. Use the parent reader directly. */
*/
public IndexReader[] getSequentialSubReaders() {
ensureOpen();
return null;


@ -32,6 +32,10 @@ public class CollectionStatistics {
private final long sumDocFreq;
public CollectionStatistics(String field, int maxDoc, int docCount, long sumTotalTermFreq, long sumDocFreq) {
assert maxDoc >= 0;
assert docCount >= -1 && docCount <= maxDoc; // #docs with field must be <= #docs
assert sumDocFreq >= -1;
assert sumTotalTermFreq == -1 || sumTotalTermFreq >= sumDocFreq; // #positions must be >= #postings
this.field = field;
this.maxDoc = maxDoc;
this.docCount = docCount;


@ -29,6 +29,8 @@ public class TermStatistics {
private final long totalTermFreq;
public TermStatistics(BytesRef term, int docFreq, long totalTermFreq) {
assert docFreq >= 0;
assert totalTermFreq == -1 || totalTermFreq >= docFreq; // #positions must be >= #postings
this.term = term;
this.docFreq = docFreq;
this.totalTermFreq = totalTermFreq;
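Both constructors now assert the basic sanity invariants of these statistics: counts are non-negative (or -1 when unavailable), and a term's total occurrence count is at least its document frequency, just as a field's total position count is at least its total posting count. A tiny worked example under those definitions (illustrative values only):

class TermStatsInvariantDemo {
  public static void main(String[] args) {
    int docFreq = 3;          // documents containing the term (#postings)
    long totalTermFreq = 7;   // occurrences of the term across those documents (#positions)
    // mirrors the assertion above: either unknown (-1) or >= docFreq
    System.out.println(totalTermFreq == -1 || totalTermFreq >= docFreq);  // prints true
  }
}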


@ -87,6 +87,8 @@ public abstract class SimilarityBase extends Similarity {
/** Fills all member fields defined in {@code BasicStats} in {@code stats}.
* Subclasses can override this method to fill additional stats. */
protected void fillBasicStats(BasicStats stats, CollectionStatistics collectionStats, TermStatistics termStats) {
// #positions(field) must be >= #positions(term)
assert collectionStats.sumTotalTermFreq() == -1 || collectionStats.sumTotalTermFreq() >= termStats.totalTermFreq();
int numberOfDocuments = collectionStats.maxDoc();
int docFreq = termStats.docFreq();


@ -70,8 +70,9 @@ import org.apache.lucene.util.ToStringUtils;
* and 'jones' in position 1). </p>
*
* <p>Note: as {@link #getField()} returns the masked field, scoring will be
* done using the norms of the field name supplied. This may lead to unexpected
* scoring behaviour.</p>
* done using the Similarity and collection statistics of the field name supplied,
* but with the term statistics of the real field. This may lead to exceptions,
* poor performance, and unexpected scoring behaviour.</p>
*/
public class FieldMaskingSpanQuery extends SpanQuery {
private SpanQuery maskedQuery;


@ -255,10 +255,8 @@ final class CompoundFileWriter implements Closeable{
assert !seenIDs.contains(id): "file=\"" + name + "\" maps to id=\"" + id + "\", which was already written";
seenIDs.add(id);
final DirectCFSIndexOutput out;
if (outputTaken.compareAndSet(false, true)) {
if ((outputLocked = outputTaken.compareAndSet(false, true))) {
out = new DirectCFSIndexOutput(getOutput(), entry, false);
outputLocked = true;
success = true;
} else {
entry.dir = this.directory;
if (directory.fileExists(name)) {


@ -120,42 +120,6 @@ public final class ReaderUtil {
protected abstract void add(int base, IndexReader r) throws IOException;
}
/**
* Returns sub IndexReader that contains the given document id.
*
* @param doc id of document
* @param reader parent reader
* @return sub reader of parent which contains the specified doc id
*/
public static IndexReader subReader(int doc, IndexReader reader) {
List<IndexReader> subReadersList = new ArrayList<IndexReader>();
ReaderUtil.gatherSubReaders(subReadersList, reader);
IndexReader[] subReaders = subReadersList
.toArray(new IndexReader[subReadersList.size()]);
int[] docStarts = new int[subReaders.length];
int maxDoc = 0;
for (int i = 0; i < subReaders.length; i++) {
docStarts[i] = maxDoc;
maxDoc += subReaders[i].maxDoc();
}
return subReaders[subIndex(doc, docStarts)];
}
/**
* Returns sub-reader subIndex from reader.
*
* @param reader parent reader
* @param subIndex index of desired sub reader
* @return the subreader at subIndex
*/
public static IndexReader subReader(IndexReader reader, int subIndex) {
List<IndexReader> subReadersList = new ArrayList<IndexReader>();
ReaderUtil.gatherSubReaders(subReadersList, reader);
IndexReader[] subReaders = subReadersList
.toArray(new IndexReader[subReadersList.size()]);
return subReaders[subIndex];
}
public static ReaderContext buildReaderContext(IndexReader reader) {
return new ReaderContextBuilder(reader).build();


@ -96,13 +96,13 @@ public class SentinelIntSet {
public int put(int key) {
int s = find(key);
if (s < 0) {
count++;
if (count >= rehashCount) {
rehash();
s = getSlot(key);
} else {
s = -s-1;
}
count++;
keys[s] = key;
}
return s;


@ -32,12 +32,13 @@ import org.apache.lucene.util.BytesRef;
public final class ByteSequenceOutputs extends Outputs<BytesRef> {
private final static BytesRef NO_OUTPUT = new BytesRef();
private final static ByteSequenceOutputs singleton = new ByteSequenceOutputs();
private ByteSequenceOutputs() {
}
public static ByteSequenceOutputs getSingleton() {
return new ByteSequenceOutputs();
return singleton;
}
@Override


@ -32,12 +32,13 @@ import org.apache.lucene.util.IntsRef;
public final class IntSequenceOutputs extends Outputs<IntsRef> {
private final static IntsRef NO_OUTPUT = new IntsRef();
private final static IntSequenceOutputs singleton = new IntSequenceOutputs();
private IntSequenceOutputs() {
}
public static IntSequenceOutputs getSingleton() {
return new IntSequenceOutputs();
return singleton;
}
@Override


@ -17,6 +17,7 @@ package org.apache.lucene.analysis;
* limitations under the License.
*/
import java.io.Reader;
import java.io.StringReader;
import java.io.IOException;
import java.util.ArrayList;
@ -289,8 +290,12 @@ public abstract class BaseTokenStreamTestCase extends LuceneTestCase {
}
}
};
public static void checkRandomData(Random random, Analyzer a, int iterations, int maxWordLength) throws IOException {
checkRandomData(random, a, iterations, maxWordLength, random.nextBoolean());
}
public static void checkRandomData(Random random, Analyzer a, int iterations, int maxWordLength, boolean useCharFilter) throws IOException {
for (int i = 0; i < iterations; i++) {
String text;
switch(_TestUtil.nextInt(random, 0, 4)) {
@ -311,7 +316,9 @@ public abstract class BaseTokenStreamTestCase extends LuceneTestCase {
System.out.println("NOTE: BaseTokenStreamTestCase: get first token stream now text=" + text);
}
TokenStream ts = a.tokenStream("dummy", new StringReader(text));
int remainder = random.nextInt(10);
Reader reader = new StringReader(text);
TokenStream ts = a.tokenStream("dummy", useCharFilter ? new MockCharFilter(reader, remainder) : reader);
assertTrue("has no CharTermAttribute", ts.hasAttribute(CharTermAttribute.class));
CharTermAttribute termAtt = ts.getAttribute(CharTermAttribute.class);
OffsetAttribute offsetAtt = ts.hasAttribute(OffsetAttribute.class) ? ts.getAttribute(OffsetAttribute.class) : null;
@ -339,30 +346,38 @@ public abstract class BaseTokenStreamTestCase extends LuceneTestCase {
if (VERBOSE) {
System.out.println("NOTE: BaseTokenStreamTestCase: re-run analysis");
}
reader = new StringReader(text);
ts = a.tokenStream("dummy", useCharFilter ? new MockCharFilter(reader, remainder) : reader);
if (typeAtt != null && posIncAtt != null && offsetAtt != null) {
// offset + pos + type
assertAnalyzesToReuse(a, text,
assertTokenStreamContents(ts,
tokens.toArray(new String[tokens.size()]),
toIntArray(startOffsets),
toIntArray(endOffsets),
types.toArray(new String[types.size()]),
toIntArray(positions));
toIntArray(positions),
text.length());
} else if (posIncAtt != null && offsetAtt != null) {
// offset + pos
assertAnalyzesToReuse(a, text,
assertTokenStreamContents(ts,
tokens.toArray(new String[tokens.size()]),
toIntArray(startOffsets),
toIntArray(endOffsets),
toIntArray(positions));
null,
toIntArray(positions),
text.length());
} else if (offsetAtt != null) {
// offset
assertAnalyzesToReuse(a, text,
assertTokenStreamContents(ts,
tokens.toArray(new String[tokens.size()]),
toIntArray(startOffsets),
toIntArray(endOffsets));
toIntArray(endOffsets),
null,
null,
text.length());
} else {
// terms only
assertAnalyzesToReuse(a, text,
assertTokenStreamContents(ts,
tokens.toArray(new String[tokens.size()]));
}
}


@ -0,0 +1,100 @@
package org.apache.lucene.analysis;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.IOException;
import java.io.Reader;
import java.util.SortedMap;
import java.util.TreeMap;
// the purpose of this charfilter is to send offsets out of bounds
// if the analyzer doesn't use correctOffset or does incorrect offset math.
class MockCharFilter extends CharStream {
final Reader in;
final int remainder;
// for testing only
public MockCharFilter(Reader in, int remainder) {
this.in = in;
this.remainder = remainder;
assert remainder >= 0 && remainder < 10 : "invalid parameter";
}
@Override
public void close() throws IOException {
in.close();
}
int currentOffset = -1;
int delta = 0;
int bufferedCh = -1;
@Override
public int read() throws IOException {
// we have a buffered character, add an offset correction and return it
if (bufferedCh >= 0) {
int ch = bufferedCh;
bufferedCh = -1;
currentOffset++;
addOffCorrectMap(currentOffset+delta, delta-1);
delta--;
return ch;
}
// otherwise actually read one
int ch = in.read();
if (ch < 0)
return ch;
currentOffset++;
if ((ch % 10) != remainder || Character.isHighSurrogate((char)ch) || Character.isLowSurrogate((char)ch)) {
return ch;
}
// we will double this character, so buffer it.
bufferedCh = ch;
return ch;
}
@Override
public int read(char[] cbuf, int off, int len) throws IOException {
int numRead = 0;
for (int i = off; i < off + len; i++) {
int c = read();
if (c == -1) break;
cbuf[i] = (char) c;
numRead++;
}
return numRead == 0 ? -1 : numRead;
}
@Override
public int correctOffset(int currentOff) {
SortedMap<Integer,Integer> subMap = corrections.subMap(0, currentOff+1);
int ret = subMap.isEmpty() ? currentOff : currentOff + subMap.get(subMap.lastKey());
assert ret >= 0 : "currentOff=" + currentOff + ",diff=" + (ret-currentOff);
return ret;
}
protected void addOffCorrectMap(int off, int cumulativeDiff) {
corrections.put(off, cumulativeDiff);
}
TreeMap<Integer,Integer> corrections = new TreeMap<Integer,Integer>();
}


@ -137,7 +137,7 @@ class PreFlexRWFieldsWriter extends FieldsConsumer {
}
if ((++df % termsOut.skipInterval) == 0) {
skipListWriter.setSkipData(lastDocID, storePayloads, lastPayloadLength);
skipListWriter.setSkipData(lastDocID, storePayloads, lastPayloadLength, false, 0);
skipListWriter.bufferSkip(df);
}


@ -268,8 +268,19 @@ public abstract class ShardSearchingTestBase extends LuceneTestCase {
assert subStats != null;
}
docFreq += subStats.docFreq();
totalTermFreq += subStats.totalTermFreq();
int nodeDocFreq = subStats.docFreq();
if (docFreq >= 0 && nodeDocFreq >= 0) {
docFreq += nodeDocFreq;
} else {
docFreq = -1;
}
long nodeTotalTermFreq = subStats.totalTermFreq();
if (totalTermFreq >= 0 && nodeTotalTermFreq >= 0) {
totalTermFreq += nodeTotalTermFreq;
} else {
totalTermFreq = -1;
}
}
return new TermStatistics(term.bytes(), docFreq, totalTermFreq);
@ -299,9 +310,29 @@ public abstract class ShardSearchingTestBase extends LuceneTestCase {
// Collection stats are pre-shared on reopen, so,
// we better not have a cache miss:
assert nodeStats != null: "myNodeID=" + myNodeID + " nodeID=" + nodeID + " version=" + nodeVersions[nodeID] + " field=" + field;
docCount += nodeStats.docCount();
sumTotalTermFreq += nodeStats.sumTotalTermFreq();
sumDocFreq += nodeStats.sumDocFreq();
int nodeDocCount = nodeStats.docCount();
if (docCount >= 0 && nodeDocCount >= 0) {
docCount += nodeDocCount;
} else {
docCount = -1;
}
long nodeSumTotalTermFreq = nodeStats.sumTotalTermFreq();
if (sumTotalTermFreq >= 0 && nodeSumTotalTermFreq >= 0) {
sumTotalTermFreq += nodeSumTotalTermFreq;
} else {
sumTotalTermFreq = -1;
}
long nodeSumDocFreq = nodeStats.sumDocFreq();
if (sumDocFreq >= 0 && nodeSumDocFreq >= 0) {
sumDocFreq += nodeSumDocFreq;
} else {
sumDocFreq = -1;
}
assert nodeStats.maxDoc() >= 0;
maxDoc += nodeStats.maxDoc();
}
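The aggregation above follows a single rule: -1 means a statistic is unavailable on some node, and once any node reports -1 the combined value stays -1; otherwise the per-node values are summed. A minimal sketch of that rule as a hypothetical helper (not part of ShardSearchingTestBase):

final class StatMergeSketch {
  // -1 means "statistic not available"; one unknown node makes the total unknown
  static long add(long total, long nodeValue) {
    return (total >= 0 && nodeValue >= 0) ? total + nodeValue : -1;
  }

  public static void main(String[] args) {
    System.out.println(add(add(0, 10), 5));    // 15
    System.out.println(add(add(0, 10), -1));   // -1
    System.out.println(add(add(0, -1), 5));    // -1: stays unknown once any node is unknown
  }
}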


@ -283,7 +283,8 @@ public abstract class LuceneTestCase extends Assert {
int randomVal = random.nextInt(10);
if ("Lucene3x".equals(TEST_CODEC) || ("random".equals(TEST_CODEC) && randomVal < 2)) { // preflex-only setup
codec = new PreFlexRWCodec();
codec = Codec.forName("Lucene3x");
assert (codec instanceof PreFlexRWCodec) : "fix your classpath to have tests-framework.jar before lucene-core.jar";
PREFLEX_IMPERSONATION_IS_ACTIVE = true;
} else if ("SimpleText".equals(TEST_CODEC) || ("random".equals(TEST_CODEC) && randomVal == 9)) {
codec = new SimpleTextCodec();


@ -249,7 +249,42 @@ public class _TestUtil {
}
}
// TODO: make this more evil
private static final String[] HTML_CHAR_ENTITIES = {
"AElig", "Aacute", "Acirc", "Agrave", "Alpha", "AMP", "Aring", "Atilde",
"Auml", "Beta", "COPY", "Ccedil", "Chi", "Dagger", "Delta", "ETH",
"Eacute", "Ecirc", "Egrave", "Epsilon", "Eta", "Euml", "Gamma", "GT",
"Iacute", "Icirc", "Igrave", "Iota", "Iuml", "Kappa", "Lambda", "LT",
"Mu", "Ntilde", "Nu", "OElig", "Oacute", "Ocirc", "Ograve", "Omega",
"Omicron", "Oslash", "Otilde", "Ouml", "Phi", "Pi", "Prime", "Psi",
"QUOT", "REG", "Rho", "Scaron", "Sigma", "THORN", "Tau", "Theta",
"Uacute", "Ucirc", "Ugrave", "Upsilon", "Uuml", "Xi", "Yacute", "Yuml",
"Zeta", "aacute", "acirc", "acute", "aelig", "agrave", "alefsym",
"alpha", "amp", "and", "ang", "apos", "aring", "asymp", "atilde",
"auml", "bdquo", "beta", "brvbar", "bull", "cap", "ccedil", "cedil",
"cent", "chi", "circ", "clubs", "cong", "copy", "crarr", "cup",
"curren", "dArr", "dagger", "darr", "deg", "delta", "diams", "divide",
"eacute", "ecirc", "egrave", "empty", "emsp", "ensp", "epsilon",
"equiv", "eta", "eth", "euml", "euro", "exist", "fnof", "forall",
"frac12", "frac14", "frac34", "frasl", "gamma", "ge", "gt", "hArr",
"harr", "hearts", "hellip", "iacute", "icirc", "iexcl", "igrave",
"image", "infin", "int", "iota", "iquest", "isin", "iuml", "kappa",
"lArr", "lambda", "lang", "laquo", "larr", "lceil", "ldquo", "le",
"lfloor", "lowast", "loz", "lrm", "lsaquo", "lsquo", "lt", "macr",
"mdash", "micro", "middot", "minus", "mu", "nabla", "nbsp", "ndash",
"ne", "ni", "not", "notin", "nsub", "ntilde", "nu", "oacute", "ocirc",
"oelig", "ograve", "oline", "omega", "omicron", "oplus", "or", "ordf",
"ordm", "oslash", "otilde", "otimes", "ouml", "para", "part", "permil",
"perp", "phi", "pi", "piv", "plusmn", "pound", "prime", "prod", "prop",
"psi", "quot", "rArr", "radic", "rang", "raquo", "rarr", "rceil",
"rdquo", "real", "reg", "rfloor", "rho", "rlm", "rsaquo", "rsquo",
"sbquo", "scaron", "sdot", "sect", "shy", "sigma", "sigmaf", "sim",
"spades", "sub", "sube", "sum", "sup", "sup1", "sup2", "sup3", "supe",
"szlig", "tau", "there4", "theta", "thetasym", "thinsp", "thorn",
"tilde", "times", "trade", "uArr", "uacute", "uarr", "ucirc", "ugrave",
"uml", "upsih", "upsilon", "uuml", "weierp", "xi", "yacute", "yen",
"yuml", "zeta", "zwj", "zwnj"
};
public static String randomHtmlishString(Random random, int numElements) {
final int end = random.nextInt(numElements);
if (end == 0) {
@ -258,17 +293,80 @@ public class _TestUtil {
}
StringBuilder sb = new StringBuilder();
for (int i = 0; i < end; i++) {
int val = random.nextInt(10);
int val = random.nextInt(25);
switch(val) {
case 0: sb.append("<p>"); break;
case 1: sb.append("</p>"); break;
case 2: sb.append("<!--"); break;
case 3: sb.append("-->"); break;
case 4: sb.append("&#"); break;
case 5: sb.append(";"); break;
case 6: sb.append((char)_TestUtil.nextInt(random, '0', '9')); break;
default:
sb.append((char)_TestUtil.nextInt(random, 'a', 'z'));
case 1: {
sb.append("<");
sb.append(" ".substring(nextInt(random, 0, 4)));
sb.append(randomSimpleString(random));
for (int j = 0 ; j < nextInt(random, 0, 10) ; ++j) {
sb.append(' ');
sb.append(randomSimpleString(random));
sb.append(" ".substring(nextInt(random, 0, 1)));
sb.append('=');
sb.append(" ".substring(nextInt(random, 0, 1)));
sb.append("\"".substring(nextInt(random, 0, 1)));
sb.append(randomSimpleString(random));
sb.append("\"".substring(nextInt(random, 0, 1)));
}
sb.append(" ".substring(nextInt(random, 0, 4)));
sb.append("/".substring(nextInt(random, 0, 1)));
sb.append(">".substring(nextInt(random, 0, 1)));
break;
}
case 2: {
sb.append("</");
sb.append(" ".substring(nextInt(random, 0, 4)));
sb.append(randomSimpleString(random));
sb.append(" ".substring(nextInt(random, 0, 4)));
sb.append(">".substring(nextInt(random, 0, 1)));
break;
}
case 3: sb.append(">"); break;
case 4: sb.append("</p>"); break;
case 5: sb.append("<!--"); break;
case 6: sb.append("<!--#"); break;
case 7: sb.append("<script><!-- f('"); break;
case 8: sb.append("</script>"); break;
case 9: sb.append("<?"); break;
case 10: sb.append("?>"); break;
case 11: sb.append("\""); break;
case 12: sb.append("\\\""); break;
case 13: sb.append("'"); break;
case 14: sb.append("\\'"); break;
case 15: sb.append("-->"); break;
case 16: {
sb.append("&");
switch(nextInt(random, 0, 2)) {
case 0: sb.append(randomSimpleString(random)); break;
case 1: sb.append(HTML_CHAR_ENTITIES[random.nextInt(HTML_CHAR_ENTITIES.length)]); break;
}
sb.append(";".substring(nextInt(random, 0, 1)));
break;
}
case 17: {
sb.append("&#");
if (0 == nextInt(random, 0, 1)) {
sb.append(nextInt(random, 0, Integer.MAX_VALUE - 1));
sb.append(";".substring(nextInt(random, 0, 1)));
}
break;
}
case 18: {
sb.append("&#x");
if (0 == nextInt(random, 0, 1)) {
sb.append(Integer.toString(nextInt(random, 0, Integer.MAX_VALUE - 1), 16));
sb.append(";".substring(nextInt(random, 0, 1)));
}
break;
}
case 19: sb.append(";"); break;
case 20: sb.append(nextInt(random, 0, Integer.MAX_VALUE - 1)); break;
case 21: sb.append("\n");
case 22: sb.append(" ".substring(nextInt(random, 0, 10)));
default: sb.append(randomSimpleString(random));
}
}
return sb.toString();


@ -0,0 +1,58 @@
package org.apache.lucene.analysis;
import java.io.IOException;
import java.io.Reader;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
public class TestMockCharFilter extends BaseTokenStreamTestCase {
public void test() throws IOException {
Analyzer analyzer = new Analyzer() {
@Override
protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
Tokenizer tokenizer = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false);
return new TokenStreamComponents(tokenizer, tokenizer);
}
@Override
protected Reader initReader(Reader reader) {
return new MockCharFilter(CharReader.get(reader), 7);
}
};
assertAnalyzesTo(analyzer, "ab",
new String[] { "aab" },
new int[] { 0 },
new int[] { 2 }
);
assertAnalyzesTo(analyzer, "aba",
new String[] { "aabaa" },
new int[] { 0 },
new int[] { 3 }
);
assertAnalyzesTo(analyzer, "abcdefga",
new String[] { "aabcdefgaa" },
new int[] { 0 },
new int[] { 8 }
);
}
}


@ -22,29 +22,46 @@ import java.util.HashMap;
import java.util.List;
import java.util.Map;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.CannedAnalyzer;
import org.apache.lucene.analysis.MockAnalyzer;
import org.apache.lucene.analysis.MockPayloadAnalyzer;
import org.apache.lucene.analysis.Token;
import org.apache.lucene.codecs.Codec;
import org.apache.lucene.codecs.lucene40.Lucene40PostingsFormat;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.FieldType;
import org.apache.lucene.document.NumericField;
import org.apache.lucene.document.StringField;
import org.apache.lucene.document.TextField;
import org.apache.lucene.search.DocIdSetIterator;
import org.apache.lucene.search.FieldCache;
import org.apache.lucene.store.Directory;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.English;
import org.apache.lucene.util.LuceneTestCase;
import org.junit.Assume;
import org.apache.lucene.util._TestUtil;
public class TestPostingsOffsets extends LuceneTestCase {
IndexWriterConfig iwc;
public void setUp() throws Exception {
super.setUp();
// Currently only SimpleText and Lucene40 can index offsets into postings:
assumeTrue("codec does not support offsets", Codec.getDefault().getName().equals("SimpleText") || Codec.getDefault().getName().equals("Lucene40"));
iwc = newIndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(random));
if (Codec.getDefault().getName().equals("Lucene40")) {
// pulsing etc are not implemented
iwc.setCodec(_TestUtil.alwaysPostingsFormat(new Lucene40PostingsFormat()));
}
}
public void testBasic() throws Exception {
// Currently only SimpleText can index offsets into postings:
Assume.assumeTrue(Codec.getDefault().getName().equals("SimpleText"));
Directory dir = newDirectory();
RandomIndexWriter w = new RandomIndexWriter(random, dir);
RandomIndexWriter w = new RandomIndexWriter(random, dir, iwc);
Document doc = new Document();
FieldType ft = new FieldType(TextField.TYPE_UNSTORED);
@ -94,16 +111,117 @@ public class TestPostingsOffsets extends LuceneTestCase {
r.close();
dir.close();
}
public void testSkipping() throws Exception {
doTestNumbers(false);
}
public void testPayloads() throws Exception {
doTestNumbers(true);
}
public void doTestNumbers(boolean withPayloads) throws Exception {
Directory dir = newDirectory();
Analyzer analyzer = withPayloads ? new MockPayloadAnalyzer() : new MockAnalyzer(random);
iwc = newIndexWriterConfig(TEST_VERSION_CURRENT, analyzer);
if (Codec.getDefault().getName().equals("Lucene40")) {
// pulsing etc are not implemented
iwc.setCodec(_TestUtil.alwaysPostingsFormat(new Lucene40PostingsFormat()));
}
iwc.setMergePolicy(newLogMergePolicy()); // will rely on docids a bit for skipping
RandomIndexWriter w = new RandomIndexWriter(random, dir, iwc);
FieldType ft = new FieldType(TextField.TYPE_STORED);
ft.setIndexOptions(FieldInfo.IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS);
if (random.nextBoolean()) {
ft.setStoreTermVectors(true);
ft.setStoreTermVectorOffsets(random.nextBoolean());
ft.setStoreTermVectorPositions(random.nextBoolean());
}
int numDocs = atLeast(500);
for (int i = 0; i < numDocs; i++) {
Document doc = new Document();
doc.add(new Field("numbers", English.intToEnglish(i), ft));
doc.add(new Field("oddeven", (i % 2) == 0 ? "even" : "odd", ft));
doc.add(new StringField("id", "" + i));
w.addDocument(doc);
}
IndexReader reader = w.getReader();
w.close();
String terms[] = { "one", "two", "three", "four", "five", "six", "seven", "eight", "nine", "ten", "hundred" };
for (String term : terms) {
DocsAndPositionsEnum dp = MultiFields.getTermPositionsEnum(reader, null, "numbers", new BytesRef(term), true);
int doc;
while((doc = dp.nextDoc()) != DocsEnum.NO_MORE_DOCS) {
String storedNumbers = reader.document(doc).get("numbers");
int freq = dp.freq();
for (int i = 0; i < freq; i++) {
dp.nextPosition();
int start = dp.startOffset();
assert start >= 0;
int end = dp.endOffset();
assert end >= 0 && end >= start;
// check that the offsets correspond to the term in the src text
assertTrue(storedNumbers.substring(start, end).equals(term));
if (withPayloads) {
// check that we have a payload and it starts with "pos"
assertTrue(dp.hasPayload());
BytesRef payload = dp.getPayload();
assertTrue(payload.utf8ToString().startsWith("pos:"));
} // note: withPayloads=false doesn't necessarily mean we don't have them from MockAnalyzer!
}
}
}
// check we can skip correctly
int numSkippingTests = atLeast(50);
for (int j = 0; j < numSkippingTests; j++) {
int num = _TestUtil.nextInt(random, 100, Math.min(numDocs-1, 999));
DocsAndPositionsEnum dp = MultiFields.getTermPositionsEnum(reader, null, "numbers", new BytesRef("hundred"), true);
int doc = dp.advance(num);
assertEquals(num, doc);
int freq = dp.freq();
for (int i = 0; i < freq; i++) {
String storedNumbers = reader.document(doc).get("numbers");
dp.nextPosition();
int start = dp.startOffset();
assert start >= 0;
int end = dp.endOffset();
assert end >= 0 && end >= start;
// check that the offsets correspond to the term in the src text
assertTrue(storedNumbers.substring(start, end).equals("hundred"));
if (withPayloads) {
// check that we have a payload and it starts with "pos"
assertTrue(dp.hasPayload());
BytesRef payload = dp.getPayload();
assertTrue(payload.utf8ToString().startsWith("pos:"));
} // note: withPayloads=false doesn't necessarily mean we don't have them from MockAnalyzer!
}
}
// check that other fields (without offsets) work correctly
for (int i = 0; i < numDocs; i++) {
DocsEnum dp = MultiFields.getTermDocsEnum(reader, null, "id", new BytesRef("" + i), false);
assertEquals(i, dp.nextDoc());
assertEquals(DocIdSetIterator.NO_MORE_DOCS, dp.nextDoc());
}
reader.close();
dir.close();
}
public void testRandom() throws Exception {
// Currently only SimpleText can index offsets into postings:
Assume.assumeTrue(Codec.getDefault().getName().equals("SimpleText"));
// token -> docID -> tokens
final Map<String,Map<Integer,List<Token>>> actualTokens = new HashMap<String,Map<Integer,List<Token>>>();
Directory dir = newDirectory();
RandomIndexWriter w = new RandomIndexWriter(random, dir);
RandomIndexWriter w = new RandomIndexWriter(random, dir, iwc);
final int numDocs = atLeast(20);
//final int numDocs = atLeast(5);


@ -31,6 +31,7 @@ import org.apache.lucene.search.CheckHits;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.QueryUtils;
import org.apache.lucene.search.similarities.TFIDFSimilarity;
import org.apache.lucene.store.Directory;
import org.apache.lucene.util.LuceneTestCase;
import org.junit.AfterClass;
@@ -240,6 +241,8 @@ public class TestFieldMaskingSpanQuery extends LuceneTestCase {
}
public void testSimple2() throws Exception {
assumeTrue("Broken scoring: LUCENE-3723",
searcher.getSimilarityProvider().get("id") instanceof TFIDFSimilarity);
SpanQuery q1 = new SpanTermQuery(new Term("gender", "female"));
SpanQuery q2 = new SpanTermQuery(new Term("last", "smith"));
SpanQuery q = new SpanNearQuery(new SpanQuery[]
@@ -310,6 +313,8 @@ public class TestFieldMaskingSpanQuery extends LuceneTestCase {
}
public void testSpans2() throws Exception {
assumeTrue("Broken scoring: LUCENE-3723",
searcher.getSimilarityProvider().get("id") instanceof TFIDFSimilarity);
SpanQuery qA1 = new SpanTermQuery(new Term("gender", "female"));
SpanQuery qA2 = new SpanTermQuery(new Term("first", "james"));
SpanQuery qA = new SpanOrQuery(qA1, new FieldMaskingSpanQuery(qA2, "gender"));

View File

@@ -20,6 +20,8 @@ package org.apache.lucene.util;
import org.junit.Test;
import java.util.HashSet;
/**
*
*
@@ -45,4 +47,32 @@ public class TestSentinelIntSet extends LuceneTestCase {
assertEquals(20, set.size());
assertEquals(24, set.rehashCount);
}
@Test
public void testRandom() throws Exception {
for (int i=0; i<10000; i++) {
int initSz = random.nextInt(20);
int num = random.nextInt(30);
int maxVal = (random.nextBoolean() ? random.nextInt(50) : random.nextInt(Integer.MAX_VALUE)) + 1;
HashSet<Integer> a = new HashSet<Integer>(initSz);
SentinelIntSet b = new SentinelIntSet(initSz, -1);
for (int j=0; j<num; j++) {
int val = random.nextInt(maxVal);
boolean exists = !a.add(val);
boolean existsB = b.exists(val);
assertEquals(exists, existsB);
int slot = b.find(val);
assertEquals(exists, slot>=0);
b.put(val);
assertEquals(a.size(), b.size());
}
}
}
}

View File

@@ -1055,6 +1055,50 @@ public class TestFSTs extends LuceneTestCase {
}
}
// NOTE: this test shows a case where our current builder
// fails to produce minimal FST:
/*
public void test3() throws Exception {
final PositiveIntOutputs outputs = PositiveIntOutputs.getSingleton(true);
Builder<Long> builder = new Builder<Long>(FST.INPUT_TYPE.BYTE1, outputs);
IntsRef scratchIntsRef = new IntsRef();
builder.add(Util.toIntsRef(new BytesRef("aa$"), scratchIntsRef), outputs.get(0));
builder.add(Util.toIntsRef(new BytesRef("aab$"), scratchIntsRef), 1L);
builder.add(Util.toIntsRef(new BytesRef("bbb$"), scratchIntsRef), 2L);
final FST<Long> fst = builder.finish();
//System.out.println("NODES " + fst.getNodeCount() + " ARCS " + fst.getArcCount());
// NOTE: we produce 7 nodes today
assertEquals(6, fst.getNodeCount());
// NOTE: we produce 8 arcs today
assertEquals(7, fst.getArcCount());
//Writer w = new OutputStreamWriter(new FileOutputStream("out.dot"), "UTF-8");
//Util.toDot(fst, w, false, false);
//w.close();
}
*/
// NOTE: this test shows a case where our current builder
// fails to produce minimal FST:
/*
public void test4() throws Exception {
final ByteSequenceOutputs outputs = ByteSequenceOutputs.getSingleton();
Builder<BytesRef> builder = new Builder<BytesRef>(FST.INPUT_TYPE.BYTE1, outputs);
IntsRef scratchIntsRef = new IntsRef();
builder.add(Util.toIntsRef(new BytesRef("aa$"), scratchIntsRef), outputs.getNoOutput());
builder.add(Util.toIntsRef(new BytesRef("aab$"), scratchIntsRef), new BytesRef("1"));
builder.add(Util.toIntsRef(new BytesRef("bbb$"), scratchIntsRef), new BytesRef("11"));
final FST<BytesRef> fst = builder.finish();
//System.out.println("NODES " + fst.getNodeCount() + " ARCS " + fst.getArcCount());
// NOTE: we produce 7 nodes today
assertEquals(6, fst.getNodeCount());
// NOTE: we produce 8 arcs today
assertEquals(7, fst.getArcCount());
//Writer w = new OutputStreamWriter(new FileOutputStream("out.dot"), "UTF-8");
//Util.toDot(fst, w, false, false);
//w.close();
}
*/
// Build FST for all unique terms in the test line docs
// file, up until a time limit
public void testRealTerms() throws Exception {

View File

@@ -31,14 +31,38 @@
<target name="compile-core" depends="jflex-notice, common.compile-core"/>
<target name="jflex" depends="jflex-check,clean-jflex,gen-uax29-supp-macros,
jflex-StandardAnalyzer,jflex-UAX29URLEmailTokenizer,jflex-wiki-tokenizer"/>
jflex-StandardAnalyzer,jflex-UAX29URLEmailTokenizer,
jflex-wiki-tokenizer,jflex-HTMLStripCharFilter"/>
<target name="gen-uax29-supp-macros">
<subant target="gen-uax29-supp-macros">
<fileset dir="../icu" includes="build.xml"/>
</subant>
</target>
<target name="jflex-HTMLStripCharFilter"
depends="init,jflex-check,generate-jflex-html-char-entities"
if="jflex.present">
<taskdef classname="jflex.anttask.JFlexTask" name="jflex">
<classpath refid="jflex.classpath"/>
</taskdef>
<jflex file="src/java/org/apache/lucene/analysis/charfilter/HTMLStripCharFilter.jflex"
outdir="src/java/org/apache/lucene/analysis/charfilter"
nobak="on"/>
<!-- Remove the inappropriate JFlex-generated constructors -->
<replaceregexp file="src/java/org/apache/lucene/analysis/charfilter/HTMLStripCharFilter.java"
match="/\*\*\s*\*\s*Creates a new scanner.*this\(new java\.io\.InputStreamReader\(in\)\);\s*\}"
replace="" flags="sg"/>
</target>
<target name="generate-jflex-html-char-entities">
<exec dir="src/java/org/apache/lucene/analysis/charfilter"
output="src/java/org/apache/lucene/analysis/charfilter/HTMLCharacterEntities.jflex"
executable="${python.exe}" failonerror="true" logerror="true">
<arg value="htmlentity.py"/>
</exec>
</target>
<target name="jflex-wiki-tokenizer" depends="init,jflex-check" if="jflex.present">
<taskdef classname="jflex.anttask.JFlexTask" name="jflex">
<classpath refid="jflex.classpath"/>

View File

@@ -20,6 +20,8 @@ package org.apache.lucene.analysis.charfilter;
import org.apache.lucene.analysis.CharStream;
import org.apache.lucene.util.ArrayUtil;
import java.util.Arrays;
/**
* Base utility class for implementing a {@link CharFilter}.
* You subclass this, and then record mappings by calling
@@ -71,6 +73,19 @@ public abstract class BaseCharFilter extends CharFilter {
0 : diffs[size-1];
}
/**
* <p>
* Adds an offset correction mapping at the given output stream offset.
* </p>
* <p>
* Assumption: the offset given with each successive call to this method
* will not be smaller than the offset given at the previous invocation.
* </p>
*
* @param off The output stream offset at which to apply the correction
* @param cumulativeDiff The input offset is given by adding this
* to the output offset
*/
protected void addOffCorrectMap(int off, int cumulativeDiff) {
if (offsets == null) {
offsets = new int[64];
@@ -80,7 +95,15 @@ public abstract class BaseCharFilter extends CharFilter {
diffs = ArrayUtil.grow(diffs);
}
offsets[size] = off;
diffs[size++] = cumulativeDiff;
assert (size == 0 || off >= offsets[size - 1])
: "Offset #" + size + "(" + off + ") is less than the last recorded offset "
+ offsets[size - 1] + "\n" + Arrays.toString(offsets) + "\n" + Arrays.toString(diffs);
if (size == 0 || off != offsets[size - 1]) {
offsets[size] = off;
diffs[size++] = cumulativeDiff;
} else { // Overwrite the diff at the last recorded offset
diffs[size - 1] = cumulativeDiff;
}
}
}
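The assertion and de-duplication added above are part of a cumulative offset-correction scheme: each recorded entry says "by this output offset, this many input characters have been dropped so far". The sketch below shows how such a table is typically consumed by a correctOffset-style lookup; the class and method names are hypothetical and this is not the actual Lucene implementation, only an illustration of the bookkeeping described in the javadoc.

import java.util.Arrays;

/** Minimal sketch of cumulative offset correction, assuming the scheme described above. */
class OffsetCorrectionSketch {
  private int[] offsets = new int[4]; // output offsets where the cumulative diff changes
  private int[] diffs = new int[4];   // cumulative (input - output) difference at that offset
  private int size = 0;

  void addOffCorrectMap(int off, int cumulativeDiff) {
    if (size == offsets.length) {
      offsets = Arrays.copyOf(offsets, size * 2);
      diffs = Arrays.copyOf(diffs, size * 2);
    }
    if (size == 0 || off != offsets[size - 1]) {
      offsets[size] = off;
      diffs[size++] = cumulativeDiff;
    } else {
      diffs[size - 1] = cumulativeDiff; // overwrite the diff at the last recorded offset
    }
  }

  /** Maps an output-stream offset back to the corresponding input-stream offset. */
  int correctOffset(int outputOffset) {
    int i = Arrays.binarySearch(offsets, 0, size, outputOffset);
    if (i < 0) {
      i = -i - 2; // index of the largest recorded offset <= outputOffset
    }
    return i < 0 ? outputOffset : outputOffset + diffs[i];
  }

  public static void main(String[] args) {
    // Stripping "<b>hi</b>" down to "hi": 3 input chars ("<b>") vanish before
    // output offset 0, and 7 in total (adding "</b>") before output offset 2.
    OffsetCorrectionSketch c = new OffsetCorrectionSketch();
    c.addOffCorrectMap(0, 3);
    c.addOffCorrectMap(2, 7);
    System.out.println(c.correctOffset(0)); // 3 -> 'h' came from input position 3
    System.out.println(c.correctOffset(1)); // 4 -> 'i' came from input position 4
    System.out.println(c.correctOffset(2)); // 9 -> the end offset maps past "</b>"
  }
}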

View File

@@ -0,0 +1,153 @@
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
CharacterEntities = ( "AElig" | "Aacute" | "Acirc" | "Agrave" | "Alpha"
| "Aring" | "Atilde" | "Auml" | "Beta" | "Ccedil" | "Chi"
| "Dagger" | "Delta" | "ETH" | "Eacute" | "Ecirc"
| "Egrave" | "Epsilon" | "Eta" | "Euml" | "Gamma"
| "Iacute" | "Icirc" | "Igrave" | "Iota" | "Iuml" | "Kappa"
| "Lambda" | "Mu" | "Ntilde" | "Nu" | "OElig" | "Oacute"
| "Ocirc" | "Ograve" | "Omega" | "Omicron" | "Oslash"
| "Otilde" | "Ouml" | "Phi" | "Pi" | "Prime" | "Psi"
| "Rho" | "Scaron" | "Sigma" | "THORN" | "Tau" | "Theta"
| "Uacute" | "Ucirc" | "Ugrave" | "Upsilon" | "Uuml" | "Xi"
| "Yacute" | "Yuml" | "Zeta" | "aacute" | "acirc" | "acute"
| "aelig" | "agrave" | "alefsym" | "alpha" | "amp" | "AMP"
| "and" | "ang" | "apos" | "aring" | "asymp" | "atilde"
| "auml" | "bdquo" | "beta" | "brvbar" | "bull" | "cap"
| "ccedil" | "cedil" | "cent" | "chi" | "circ" | "clubs"
| "cong" | "copy" | "COPY" | "crarr" | "cup" | "curren"
| "dArr" | "dagger" | "darr" | "deg" | "delta" | "diams"
| "divide" | "eacute" | "ecirc" | "egrave" | "empty"
| "emsp" | "ensp" | "epsilon" | "equiv" | "eta" | "eth"
| "euml" | "euro" | "exist" | "fnof" | "forall" | "frac12"
| "frac14" | "frac34" | "frasl" | "gamma" | "ge" | "gt"
| "GT" | "hArr" | "harr" | "hearts" | "hellip" | "iacute"
| "icirc" | "iexcl" | "igrave" | "image" | "infin" | "int"
| "iota" | "iquest" | "isin" | "iuml" | "kappa" | "lArr"
| "lambda" | "lang" | "laquo" | "larr" | "lceil" | "ldquo"
| "le" | "lfloor" | "lowast" | "loz" | "lrm" | "lsaquo"
| "lsquo" | "lt" | "LT" | "macr" | "mdash" | "micro"
| "middot" | "minus" | "mu" | "nabla" | "nbsp" | "ndash"
| "ne" | "ni" | "not" | "notin" | "nsub" | "ntilde" | "nu"
| "oacute" | "ocirc" | "oelig" | "ograve" | "oline"
| "omega" | "omicron" | "oplus" | "or" | "ordf" | "ordm"
| "oslash" | "otilde" | "otimes" | "ouml" | "para" | "part"
| "permil" | "perp" | "phi" | "pi" | "piv" | "plusmn"
| "pound" | "prime" | "prod" | "prop" | "psi" | "quot"
| "QUOT" | "rArr" | "radic" | "rang" | "raquo" | "rarr"
| "rceil" | "rdquo" | "real" | "reg" | "REG" | "rfloor"
| "rho" | "rlm" | "rsaquo" | "rsquo" | "sbquo" | "scaron"
| "sdot" | "sect" | "shy" | "sigma" | "sigmaf" | "sim"
| "spades" | "sub" | "sube" | "sum" | "sup" | "sup1"
| "sup2" | "sup3" | "supe" | "szlig" | "tau" | "there4"
| "theta" | "thetasym" | "thinsp" | "thorn" | "tilde"
| "times" | "trade" | "uArr" | "uacute" | "uarr" | "ucirc"
| "ugrave" | "uml" | "upsih" | "upsilon" | "uuml"
| "weierp" | "xi" | "yacute" | "yen" | "yuml" | "zeta"
| "zwj" | "zwnj" )
%{
private static final Set<String> upperCaseVariantsAccepted
= new HashSet<String>(Arrays.asList("quot","copy","gt","lt","reg","amp"));
private static final CharArrayMap<Character> entityValues
= new CharArrayMap<Character>(Version.LUCENE_40, 253, false);
static {
String[] entities = {
"AElig", "\u00C6", "Aacute", "\u00C1", "Acirc", "\u00C2",
"Agrave", "\u00C0", "Alpha", "\u0391", "Aring", "\u00C5",
"Atilde", "\u00C3", "Auml", "\u00C4", "Beta", "\u0392",
"Ccedil", "\u00C7", "Chi", "\u03A7", "Dagger", "\u2021",
"Delta", "\u0394", "ETH", "\u00D0", "Eacute", "\u00C9",
"Ecirc", "\u00CA", "Egrave", "\u00C8", "Epsilon", "\u0395",
"Eta", "\u0397", "Euml", "\u00CB", "Gamma", "\u0393", "Iacute", "\u00CD",
"Icirc", "\u00CE", "Igrave", "\u00CC", "Iota", "\u0399",
"Iuml", "\u00CF", "Kappa", "\u039A", "Lambda", "\u039B", "Mu", "\u039C",
"Ntilde", "\u00D1", "Nu", "\u039D", "OElig", "\u0152",
"Oacute", "\u00D3", "Ocirc", "\u00D4", "Ograve", "\u00D2",
"Omega", "\u03A9", "Omicron", "\u039F", "Oslash", "\u00D8",
"Otilde", "\u00D5", "Ouml", "\u00D6", "Phi", "\u03A6", "Pi", "\u03A0",
"Prime", "\u2033", "Psi", "\u03A8", "Rho", "\u03A1", "Scaron", "\u0160",
"Sigma", "\u03A3", "THORN", "\u00DE", "Tau", "\u03A4", "Theta", "\u0398",
"Uacute", "\u00DA", "Ucirc", "\u00DB", "Ugrave", "\u00D9",
"Upsilon", "\u03A5", "Uuml", "\u00DC", "Xi", "\u039E",
"Yacute", "\u00DD", "Yuml", "\u0178", "Zeta", "\u0396",
"aacute", "\u00E1", "acirc", "\u00E2", "acute", "\u00B4",
"aelig", "\u00E6", "agrave", "\u00E0", "alefsym", "\u2135",
"alpha", "\u03B1", "amp", "\u0026", "and", "\u2227", "ang", "\u2220",
"apos", "\u0027", "aring", "\u00E5", "asymp", "\u2248",
"atilde", "\u00E3", "auml", "\u00E4", "bdquo", "\u201E",
"beta", "\u03B2", "brvbar", "\u00A6", "bull", "\u2022", "cap", "\u2229",
"ccedil", "\u00E7", "cedil", "\u00B8", "cent", "\u00A2", "chi", "\u03C7",
"circ", "\u02C6", "clubs", "\u2663", "cong", "\u2245", "copy", "\u00A9",
"crarr", "\u21B5", "cup", "\u222A", "curren", "\u00A4", "dArr", "\u21D3",
"dagger", "\u2020", "darr", "\u2193", "deg", "\u00B0", "delta", "\u03B4",
"diams", "\u2666", "divide", "\u00F7", "eacute", "\u00E9",
"ecirc", "\u00EA", "egrave", "\u00E8", "empty", "\u2205",
"emsp", "\u2003", "ensp", "\u2002", "epsilon", "\u03B5",
"equiv", "\u2261", "eta", "\u03B7", "eth", "\u00F0", "euml", "\u00EB",
"euro", "\u20AC", "exist", "\u2203", "fnof", "\u0192",
"forall", "\u2200", "frac12", "\u00BD", "frac14", "\u00BC",
"frac34", "\u00BE", "frasl", "\u2044", "gamma", "\u03B3", "ge", "\u2265",
"gt", "\u003E", "hArr", "\u21D4", "harr", "\u2194", "hearts", "\u2665",
"hellip", "\u2026", "iacute", "\u00ED", "icirc", "\u00EE",
"iexcl", "\u00A1", "igrave", "\u00EC", "image", "\u2111",
"infin", "\u221E", "int", "\u222B", "iota", "\u03B9", "iquest", "\u00BF",
"isin", "\u2208", "iuml", "\u00EF", "kappa", "\u03BA", "lArr", "\u21D0",
"lambda", "\u03BB", "lang", "\u2329", "laquo", "\u00AB",
"larr", "\u2190", "lceil", "\u2308", "ldquo", "\u201C", "le", "\u2264",
"lfloor", "\u230A", "lowast", "\u2217", "loz", "\u25CA", "lrm", "\u200E",
"lsaquo", "\u2039", "lsquo", "\u2018", "lt", "\u003C", "macr", "\u00AF",
"mdash", "\u2014", "micro", "\u00B5", "middot", "\u00B7",
"minus", "\u2212", "mu", "\u03BC", "nabla", "\u2207", "nbsp", " ",
"ndash", "\u2013", "ne", "\u2260", "ni", "\u220B", "not", "\u00AC",
"notin", "\u2209", "nsub", "\u2284", "ntilde", "\u00F1", "nu", "\u03BD",
"oacute", "\u00F3", "ocirc", "\u00F4", "oelig", "\u0153",
"ograve", "\u00F2", "oline", "\u203E", "omega", "\u03C9",
"omicron", "\u03BF", "oplus", "\u2295", "or", "\u2228", "ordf", "\u00AA",
"ordm", "\u00BA", "oslash", "\u00F8", "otilde", "\u00F5",
"otimes", "\u2297", "ouml", "\u00F6", "para", "\u00B6", "part", "\u2202",
"permil", "\u2030", "perp", "\u22A5", "phi", "\u03C6", "pi", "\u03C0",
"piv", "\u03D6", "plusmn", "\u00B1", "pound", "\u00A3",
"prime", "\u2032", "prod", "\u220F", "prop", "\u221D", "psi", "\u03C8",
"quot", "\"", "rArr", "\u21D2", "radic", "\u221A", "rang", "\u232A",
"raquo", "\u00BB", "rarr", "\u2192", "rceil", "\u2309",
"rdquo", "\u201D", "real", "\u211C", "reg", "\u00AE", "rfloor", "\u230B",
"rho", "\u03C1", "rlm", "\u200F", "rsaquo", "\u203A", "rsquo", "\u2019",
"sbquo", "\u201A", "scaron", "\u0161", "sdot", "\u22C5",
"sect", "\u00A7", "shy", "\u00AD", "sigma", "\u03C3", "sigmaf", "\u03C2",
"sim", "\u223C", "spades", "\u2660", "sub", "\u2282", "sube", "\u2286",
"sum", "\u2211", "sup", "\u2283", "sup1", "\u00B9", "sup2", "\u00B2",
"sup3", "\u00B3", "supe", "\u2287", "szlig", "\u00DF", "tau", "\u03C4",
"there4", "\u2234", "theta", "\u03B8", "thetasym", "\u03D1",
"thinsp", "\u2009", "thorn", "\u00FE", "tilde", "\u02DC",
"times", "\u00D7", "trade", "\u2122", "uArr", "\u21D1",
"uacute", "\u00FA", "uarr", "\u2191", "ucirc", "\u00FB",
"ugrave", "\u00F9", "uml", "\u00A8", "upsih", "\u03D2",
"upsilon", "\u03C5", "uuml", "\u00FC", "weierp", "\u2118",
"xi", "\u03BE", "yacute", "\u00FD", "yen", "\u00A5", "yuml", "\u00FF",
"zeta", "\u03B6", "zwj", "\u200D", "zwnj", "\u200C"
};
for (int i = 0 ; i < entities.length ; i += 2) {
Character value = entities[i + 1].charAt(0);
entityValues.put(entities[i], value);
if (upperCaseVariantsAccepted.contains(entities[i])) {
entityValues.put(entities[i].toUpperCase(), value);
}
}
}
%}

View File

@@ -0,0 +1,58 @@
/*
* Copyright 2010 The Apache Software Foundation.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
// Generated using ICU4J 4.8.1.1 on Friday, January 13, 2012 6:20:39 PM UTC
// by org.apache.lucene.analysis.icu.GenerateHTMLStripCharFilterSupplementaryMacros
ID_Start_Supp = (
[\uD81A][\uDC00-\uDE38]
| [\uD869][\uDC00-\uDED6\uDF00-\uDFFF]
| [\uD835][\uDC00-\uDC54\uDC56-\uDC9C\uDC9E\uDC9F\uDCA2\uDCA5\uDCA6\uDCA9-\uDCAC\uDCAE-\uDCB9\uDCBB\uDCBD-\uDCC3\uDCC5-\uDD05\uDD07-\uDD0A\uDD0D-\uDD14\uDD16-\uDD1C\uDD1E-\uDD39\uDD3B-\uDD3E\uDD40-\uDD44\uDD46\uDD4A-\uDD50\uDD52-\uDEA5\uDEA8-\uDEC0\uDEC2-\uDEDA\uDEDC-\uDEFA\uDEFC-\uDF14\uDF16-\uDF34\uDF36-\uDF4E\uDF50-\uDF6E\uDF70-\uDF88\uDF8A-\uDFA8\uDFAA-\uDFC2\uDFC4-\uDFCB]
| [\uD80C\uD840-\uD868\uD86A-\uD86C][\uDC00-\uDFFF]
| [\uD82C][\uDC00\uDC01]
| [\uD804][\uDC03-\uDC37\uDC83-\uDCAF]
| [\uD86D][\uDC00-\uDF34\uDF40-\uDFFF]
| [\uD87E][\uDC00-\uDE1D]
| [\uD809][\uDC00-\uDC62]
| [\uD808][\uDC00-\uDF6E]
| [\uD803][\uDC00-\uDC48]
| [\uD800][\uDC00-\uDC0B\uDC0D-\uDC26\uDC28-\uDC3A\uDC3C\uDC3D\uDC3F-\uDC4D\uDC50-\uDC5D\uDC80-\uDCFA\uDD40-\uDD74\uDE80-\uDE9C\uDEA0-\uDED0\uDF00-\uDF1E\uDF30-\uDF4A\uDF80-\uDF9D\uDFA0-\uDFC3\uDFC8-\uDFCF\uDFD1-\uDFD5]
| [\uD80D][\uDC00-\uDC2E]
| [\uD86E][\uDC00-\uDC1D]
| [\uD802][\uDC00-\uDC05\uDC08\uDC0A-\uDC35\uDC37\uDC38\uDC3C\uDC3F-\uDC55\uDD00-\uDD15\uDD20-\uDD39\uDE00\uDE10-\uDE13\uDE15-\uDE17\uDE19-\uDE33\uDE60-\uDE7C\uDF00-\uDF35\uDF40-\uDF55\uDF60-\uDF72]
| [\uD801][\uDC00-\uDC9D]
)
ID_Continue_Supp = (
[\uD81A][\uDC00-\uDE38]
| [\uD869][\uDC00-\uDED6\uDF00-\uDFFF]
| [\uD80C\uD840-\uD868\uD86A-\uD86C][\uDC00-\uDFFF]
| [\uD804][\uDC00-\uDC46\uDC66-\uDC6F\uDC80-\uDCBA]
| [\uD82C][\uDC00\uDC01]
| [\uD802][\uDC00-\uDC05\uDC08\uDC0A-\uDC35\uDC37\uDC38\uDC3C\uDC3F-\uDC55\uDD00-\uDD15\uDD20-\uDD39\uDE00-\uDE03\uDE05\uDE06\uDE0C-\uDE13\uDE15-\uDE17\uDE19-\uDE33\uDE38-\uDE3A\uDE3F\uDE60-\uDE7C\uDF00-\uDF35\uDF40-\uDF55\uDF60-\uDF72]
| [\uD801][\uDC00-\uDC9D\uDCA0-\uDCA9]
| [\uD86D][\uDC00-\uDF34\uDF40-\uDFFF]
| [\uD87E][\uDC00-\uDE1D]
| [\uD809][\uDC00-\uDC62]
| [\uD808][\uDC00-\uDF6E]
| [\uD803][\uDC00-\uDC48]
| [\uD80D][\uDC00-\uDC2E]
| [\uD800][\uDC00-\uDC0B\uDC0D-\uDC26\uDC28-\uDC3A\uDC3C\uDC3D\uDC3F-\uDC4D\uDC50-\uDC5D\uDC80-\uDCFA\uDD40-\uDD74\uDDFD\uDE80-\uDE9C\uDEA0-\uDED0\uDF00-\uDF1E\uDF30-\uDF4A\uDF80-\uDF9D\uDFA0-\uDFC3\uDFC8-\uDFCF\uDFD1-\uDFD5]
| [\uD86E][\uDC00-\uDC1D]
| [\uDB40][\uDD00-\uDDEF]
| [\uD834][\uDD65-\uDD69\uDD6D-\uDD72\uDD7B-\uDD82\uDD85-\uDD8B\uDDAA-\uDDAD\uDE42-\uDE44]
| [\uD835][\uDC00-\uDC54\uDC56-\uDC9C\uDC9E\uDC9F\uDCA2\uDCA5\uDCA6\uDCA9-\uDCAC\uDCAE-\uDCB9\uDCBB\uDCBD-\uDCC3\uDCC5-\uDD05\uDD07-\uDD0A\uDD0D-\uDD14\uDD16-\uDD1C\uDD1E-\uDD39\uDD3B-\uDD3E\uDD40-\uDD44\uDD46\uDD4A-\uDD50\uDD52-\uDEA5\uDEA8-\uDEC0\uDEC2-\uDEDA\uDEDC-\uDEFA\uDEFC-\uDF14\uDF16-\uDF34\uDF36-\uDF4E\uDF50-\uDF6E\uDF70-\uDF88\uDF8A-\uDFA8\uDFAA-\uDFC2\uDFC4-\uDFCB\uDFCE-\uDFFF]
)

View File

@@ -0,0 +1,874 @@
package org.apache.lucene.analysis.charfilter;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.IOException;
import java.util.Arrays;
import java.util.HashSet;
import java.util.Set;
import org.apache.lucene.util.Version;
import org.apache.lucene.analysis.CharStream;
import org.apache.lucene.analysis.util.CharArrayMap;
import org.apache.lucene.analysis.util.CharArraySet;
import org.apache.lucene.analysis.util.OpenStringBuilder;
/**
* A CharFilter that wraps another Reader and attempts to strip out HTML constructs.
*/
@SuppressWarnings("fallthrough")
%%
%unicode 6.0
%apiprivate
%type int
%final
%public
%char
%function nextChar
%class HTMLStripCharFilter
%extends BaseCharFilter
%xstate AMPERSAND, NUMERIC_CHARACTER, CHARACTER_REFERENCE_TAIL
%xstate LEFT_ANGLE_BRACKET, BANG, COMMENT, SCRIPT, SCRIPT_COMMENT
%xstate LEFT_ANGLE_BRACKET_SLASH, LEFT_ANGLE_BRACKET_SPACE, CDATA
%xstate SERVER_SIDE_INCLUDE, SINGLE_QUOTED_STRING, DOUBLE_QUOTED_STRING
%xstate END_TAG_TAIL_INCLUDE, END_TAG_TAIL_EXCLUDE, END_TAG_TAIL_SUBSTITUTE
%xstate START_TAG_TAIL_INCLUDE, START_TAG_TAIL_EXCLUDE, START_TAG_TAIL_SUBSTITUTE
%xstate STYLE, STYLE_COMMENT
// From XML 1.0 <http://www.w3.org/TR/xml/>:
//
// [4] NameStartChar ::= ":" | [A-Z] | "_" | [a-z] | [...]
// [4a] NameChar ::= NameStartChar | "-" | "." | [0-9] | [...]
// [5] Name ::= NameStartChar (NameChar)*
//
// From UAX #31: Unicode Identifier and Pattern Syntax
// <http://unicode.org/reports/tr31/>:
//
// D1. Default Identifier Syntax
//
// <identifier> := <ID_Start> <ID_Continue>*
//
Name = ( ( [:_\p{ID_Start}] | {ID_Start_Supp} ) ( [-.:_\p{ID_Continue}] | {ID_Continue_Supp} )* )
// From Apache httpd mod_include documentation
// <http://httpd.apache.org/docs/current/mod/mod_include.html>:
//
// Basic Elements
//
// The document is parsed as an HTML document, with special commands
// embedded as SGML comments. A command has the syntax:
//
// <!--#element attribute=value attribute=value ... -->
//
// The value will often be enclosed in double quotes, but single quotes (')
// and backticks (`) are also possible. Many commands only allow a single
// attribute-value pair. Note that the comment terminator (-->) should be
// preceded by whitespace to ensure that it isn't considered part of an SSI
// token. Note that the leading <!--# is one token and may not contain any
// whitespaces.
//
EventAttributeSuffixes = ( [aA][bB][oO][rR][tT] |
[bB][lL][uU][rR] |
[cC][hH][aA][nN][gG][eE] |
[cC][lL][iI][cC][kK] |
[dD][bB][lL][cC][lL][iI][cC][kK] |
[eE][rR][rR][oO][rR] |
[fF][oO][cC][uU][sS] |
[kK][eE][yY][dD][oO][wW][nN] |
[kK][eE][yY][pP][rR][eE][sS][sS] |
[kK][eE][yY][uU][pP] |
[lL][oO][aA][dD] |
[mM][oO][uU][sS][eE][dD][oO][wW][nN] |
[mM][oO][uU][sS][eE][mM][oO][vV][eE] |
[mM][oO][uU][sS][eE][oO][uU][tT] |
[mM][oO][uU][sS][eE][oO][vV][eE][rR] |
[mM][oO][uU][sS][eE][uU][pP] |
[rR][eE][sS][eE][tT] |
[sS][eE][lL][eE][cC][tT] |
[sS][uU][bB][mM][iI][tT] |
[uU][nN][lL][oO][aA][dD] )
SingleQuoted = ( "'" ( "\\'" | [^']* )* "'" )
DoubleQuoted = ( "\"" ( "\\\"" | [^\"]* )* "\"" )
ServerSideInclude = ( "<!--#" ( [^'\"] | {SingleQuoted} | {DoubleQuoted} )* "-->" )
EventAttribute = [oO][nN] {EventAttributeSuffixes} \s* "=" \s* ( {SingleQuoted} | {DoubleQuoted} )
OpenTagContent = ( {EventAttribute} | [^<>] | {ServerSideInclude} )*
InlineElment = ( [aAbBiIqQsSuU] |
[aA][bB][bB][rR] |
[aA][cC][rR][oO][nN][yY][mM] |
[bB][aA][sS][eE][fF][oO][nN][tT] |
[bB][dD][oO] |
[bB][iI][gG] |
[cC][iI][tT][eE] |
[cC][oO][dD][eE] |
[dD][fF][nN] |
[eE][mM] |
[fF][oO][nN][tT] |
[iI][mM][gG] |
[iI][nN][pP][uU][tT] |
[kK][bB][dD] |
[lL][aA][bB][eE][lL] |
[sS][aA][mM][pP] |
[sS][eE][lL][eE][cC][tT] |
[sS][mM][aA][lL][lL] |
[sS][pP][aA][nN] |
[sS][tT][rR][iI][kK][eE] |
[sS][tT][rR][oO][nN][gG] |
[sS][uU][bB] |
[sS][uU][pP] |
[tT][eE][xX][tT][aA][rR][eE][aA] |
[tT][tT] |
[vV][aA][rR] )
%include src/java/org/apache/lucene/analysis/charfilter/HTMLCharacterEntities.jflex
%include src/java/org/apache/lucene/analysis/charfilter/HTMLStripCharFilter.SUPPLEMENTARY.jflex-macro
%{
private static final int INITIAL_INPUT_SEGMENT_SIZE = 1024;
private static final char BLOCK_LEVEL_START_TAG_REPLACEMENT = '\n';
private static final char BLOCK_LEVEL_END_TAG_REPLACEMENT = '\n';
private static final char BR_START_TAG_REPLACEMENT = '\n';
private static final char BR_END_TAG_REPLACEMENT = '\n';
private static final char SCRIPT_REPLACEMENT = '\n';
private static final char STYLE_REPLACEMENT = '\n';
private static final char REPLACEMENT_CHARACTER = '\uFFFD';
private CharArraySet escapedTags = null;
private int inputStart;
private int cumulativeDiff;
private boolean escapeBR = false;
private boolean escapeSCRIPT = false;
private boolean escapeSTYLE = false;
private int restoreState;
private int previousRestoreState;
private int outputCharCount;
private int eofReturnValue;
private TextSegment inputSegment
= new TextSegment(INITIAL_INPUT_SEGMENT_SIZE);
private TextSegment outputSegment = inputSegment;
private TextSegment entitySegment = new TextSegment(2);
/**
* @param source the {@link CharStream} to strip HTML markup from
*/
public HTMLStripCharFilter(CharStream source) {
super(source);
this.zzReader = source;
}
/**
* @param source the {@link CharStream} to strip HTML markup from
* @param escapedTags Tags in this set (both start and end tags)
* will not be filtered out.
*/
public HTMLStripCharFilter(CharStream source, Set<String> escapedTags) {
super(source);
this.zzReader = source;
if (null != escapedTags) {
for (String tag : escapedTags) {
if (tag.equalsIgnoreCase("BR")) {
escapeBR = true;
} else if (tag.equalsIgnoreCase("SCRIPT")) {
escapeSCRIPT = true;
} else if (tag.equalsIgnoreCase("STYLE")) {
escapeSTYLE = true;
} else {
if (null == this.escapedTags) {
this.escapedTags = new CharArraySet(Version.LUCENE_40, 16, true);
}
this.escapedTags.add(tag);
}
}
}
}
@Override
public int read() throws IOException {
if (outputSegment.isRead()) {
if (zzAtEOF) {
return -1;
}
int ch = nextChar();
++outputCharCount;
return ch;
}
int ch = outputSegment.nextChar();
++outputCharCount;
return ch;
}
@Override
public int read(char cbuf[], int off, int len) throws IOException {
int i = 0;
for ( ; i < len ; ++i) {
int ch = read();
if (ch == -1) break;
cbuf[off++] = (char)ch;
}
return i > 0 ? i : (len == 0 ? 0 : -1);
}
@Override
public void close() throws IOException {
yyclose();
}
static int getInitialBufferSize() { // Package private, for testing purposes
return ZZ_BUFFERSIZE;
}
private class TextSegment extends OpenStringBuilder {
/** The position from which the next char will be read. */
int pos = 0;
/** Wraps the given buffer and sets this.len to the given length. */
TextSegment(char[] buffer, int length) {
super(buffer, length);
}
/** Allocates an internal buffer of the given size. */
TextSegment(int size) {
super(size);
}
/** Sets len = 0 and pos = 0. */
void clear() {
reset();
restart();
}
/** Sets pos = 0 */
void restart() {
pos = 0;
}
/** Returns the next char in the segment. */
int nextChar() {
assert (! isRead()): "Attempting to read past the end of a segment.";
return buf[pos++];
}
/** Returns true when all characters in the text segment have been read */
boolean isRead() {
return pos >= len;
}
}
%}
%eofval{
return eofReturnValue;
%eofval}
%eof{
switch (zzLexicalState) {
case SCRIPT:
case COMMENT:
case SCRIPT_COMMENT:
case STYLE:
case STYLE_COMMENT:
case SINGLE_QUOTED_STRING:
case DOUBLE_QUOTED_STRING:
case END_TAG_TAIL_EXCLUDE:
case END_TAG_TAIL_SUBSTITUTE:
case START_TAG_TAIL_EXCLUDE:
case SERVER_SIDE_INCLUDE:
case START_TAG_TAIL_SUBSTITUTE: { // Exclude
cumulativeDiff += yychar - inputStart;
addOffCorrectMap(outputCharCount, cumulativeDiff);
outputSegment.clear();
eofReturnValue = -1;
break;
}
case CHARACTER_REFERENCE_TAIL: { // Substitute
// At end of file, allow char refs without semicolons
cumulativeDiff += inputSegment.length() - outputSegment.length();
addOffCorrectMap(outputCharCount + outputSegment.length(), cumulativeDiff);
eofReturnValue = outputSegment.nextChar();
break;
}
case BANG:
case CDATA:
case AMPERSAND:
case NUMERIC_CHARACTER:
case END_TAG_TAIL_INCLUDE:
case START_TAG_TAIL_INCLUDE:
case LEFT_ANGLE_BRACKET:
case LEFT_ANGLE_BRACKET_SLASH:
case LEFT_ANGLE_BRACKET_SPACE: { // Include
outputSegment = inputSegment;
eofReturnValue = outputSegment.nextChar();
break;
}
default: {
eofReturnValue = -1;
}
}
%eof}
%%
"&" {
inputStart = yychar;
inputSegment.clear();
inputSegment.append('&');
yybegin(AMPERSAND);
}
"<" {
inputStart = yychar;
inputSegment.clear();
inputSegment.append('<');
yybegin(LEFT_ANGLE_BRACKET);
}
<AMPERSAND> {
{CharacterEntities} {
int length = yylength();
inputSegment.write(zzBuffer, zzStartRead, length);
entitySegment.clear();
char ch = entityValues.get(zzBuffer, zzStartRead, length).charValue();
entitySegment.append(ch);
outputSegment = entitySegment;
yybegin(CHARACTER_REFERENCE_TAIL);
}
"#" { inputSegment.append('#'); yybegin(NUMERIC_CHARACTER); }
// 1 1 11 11
// 0 1 2 3 45 678 9 0 1 23 45
"#" [xX][dD][89aAbB][0-9a-fA-F]{2} ";&#" [xX][dD][c-fC-F][0-9a-fA-F]{2} ";" {
// Handle paired UTF-16 surrogates.
outputSegment = entitySegment;
outputSegment.clear();
String surrogatePair = yytext();
char highSurrogate = '\u0000';
try {
highSurrogate = (char)Integer.parseInt(surrogatePair.substring(2, 6), 16);
} catch(Exception e) { // should never happen
assert false: "Exception parsing high surrogate '"
+ surrogatePair.substring(2, 6) + "'";
}
try {
outputSegment.unsafeWrite
((char)Integer.parseInt(surrogatePair.substring(10, 14), 16));
} catch(Exception e) { // should never happen
assert false: "Exception parsing low surrogate '"
+ surrogatePair.substring(10, 14) + "'";
}
cumulativeDiff += inputSegment.length() + yylength() - 2;
addOffCorrectMap(outputCharCount + 2, cumulativeDiff);
inputSegment.clear();
yybegin(YYINITIAL);
return highSurrogate;
}
// 1 1 11 11
// 01 2 345 678 9 0 1 23 45
"#5" [56] \d{3} ";&#" [xX][dD][c-fC-F][0-9a-fA-F]{2} ";" {
// Handle paired UTF-16 surrogates.
String surrogatePair = yytext();
char highSurrogate = '\u0000';
try { // High surrogates are in decimal range [55296, 56319]
highSurrogate = (char)Integer.parseInt(surrogatePair.substring(1, 6));
} catch(Exception e) { // should never happen
assert false: "Exception parsing high surrogate '"
+ surrogatePair.substring(1, 6) + "'";
}
if (Character.isHighSurrogate(highSurrogate)) {
outputSegment = entitySegment;
outputSegment.clear();
try {
outputSegment.unsafeWrite
((char)Integer.parseInt(surrogatePair.substring(10, 14), 16));
} catch(Exception e) { // should never happen
assert false: "Exception parsing low surrogate '"
+ surrogatePair.substring(10, 14) + "'";
}
cumulativeDiff += inputSegment.length() + yylength() - 2;
addOffCorrectMap(outputCharCount + 2, cumulativeDiff);
inputSegment.clear();
yybegin(YYINITIAL);
return highSurrogate;
}
yypushback(surrogatePair.length() - 1); // Consume only '#'
inputSegment.append('#');
yybegin(NUMERIC_CHARACTER);
}
// 1 111 11
// 0 1 2 3 45 6789 0 123 45
"#" [xX][dD][89aAbB][0-9a-fA-F]{2} ";&#5" [67] \d{3} ";" {
// Handle paired UTF-16 surrogates.
String surrogatePair = yytext();
char highSurrogate = '\u0000';
char lowSurrogate = '\u0000';
try {
highSurrogate = (char)Integer.parseInt(surrogatePair.substring(2, 6), 16);
} catch(Exception e) { // should never happen
assert false: "Exception parsing high surrogate '"
+ surrogatePair.substring(2, 6) + "'";
}
try { // Low surrogates are in decimal range [56320, 57343]
lowSurrogate = (char)Integer.parseInt(surrogatePair.substring(9, 14));
} catch(Exception e) { // should never happen
assert false: "Exception parsing low surrogate '"
+ surrogatePair.substring(9, 14) + "'";
}
if (Character.isLowSurrogate(lowSurrogate)) {
outputSegment = entitySegment;
outputSegment.clear();
outputSegment.unsafeWrite(lowSurrogate);
cumulativeDiff += inputSegment.length() + yylength() - 2;
addOffCorrectMap(outputCharCount + 2, cumulativeDiff);
inputSegment.clear();
yybegin(YYINITIAL);
return highSurrogate;
}
yypushback(surrogatePair.length() - 1); // Consume only '#'
inputSegment.append('#');
yybegin(NUMERIC_CHARACTER);
}
// 1 111 11
// 01 2 345 6789 0 123 45
"#5" [56] \d{3} ";&#5" [67] \d{3} ";" {
// Handle paired UTF-16 surrogates.
String surrogatePair = yytext();
char highSurrogate = '\u0000';
try { // High surrogates are in decimal range [55296, 56319]
highSurrogate = (char)Integer.parseInt(surrogatePair.substring(1, 6));
} catch(Exception e) { // should never happen
assert false: "Exception parsing high surrogate '"
+ surrogatePair.substring(1, 6) + "'";
}
if (Character.isHighSurrogate(highSurrogate)) {
char lowSurrogate = '\u0000';
try { // Low surrogates are in decimal range [56320, 57343]
lowSurrogate = (char)Integer.parseInt(surrogatePair.substring(9, 14));
} catch(Exception e) { // should never happen
assert false: "Exception parsing low surrogate '"
+ surrogatePair.substring(9, 14) + "'";
}
if (Character.isLowSurrogate(lowSurrogate)) {
outputSegment = entitySegment;
outputSegment.clear();
outputSegment.unsafeWrite(lowSurrogate);
cumulativeDiff += inputSegment.length() + yylength() - 2;
addOffCorrectMap(outputCharCount + 2, cumulativeDiff);
inputSegment.clear();
yybegin(YYINITIAL);
return highSurrogate;
}
}
yypushback(surrogatePair.length() - 1); // Consume only '#'
inputSegment.append('#');
yybegin(NUMERIC_CHARACTER);
}
}
<NUMERIC_CHARACTER> {
[xX] [0-9A-Fa-f]+ {
int matchLength = yylength();
inputSegment.write(zzBuffer, zzStartRead, matchLength);
if (matchLength <= 6) { // 10FFFF: max 6 hex chars
String hexCharRef
= new String(zzBuffer, zzStartRead + 1, matchLength - 1);
int codePoint = 0;
try {
codePoint = Integer.parseInt(hexCharRef, 16);
} catch(Exception e) {
assert false: "Exception parsing hex code point '" + hexCharRef + "'";
}
if (codePoint <= 0x10FFFF) {
outputSegment = entitySegment;
outputSegment.clear();
if (codePoint >= Character.MIN_SURROGATE
&& codePoint <= Character.MAX_SURROGATE) {
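// An unpaired surrogate code point is not a valid character by itself,
// so emit U+FFFD rather than writing a lone surrogate to the output.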
outputSegment.unsafeWrite(REPLACEMENT_CHARACTER);
} else {
outputSegment.setLength
(Character.toChars(codePoint, outputSegment.getArray(), 0));
}
yybegin(CHARACTER_REFERENCE_TAIL);
} else {
outputSegment = inputSegment;
yybegin(YYINITIAL);
return outputSegment.nextChar();
}
} else {
outputSegment = inputSegment;
yybegin(YYINITIAL);
return outputSegment.nextChar();
}
}
[0-9]+ {
int matchLength = yylength();
inputSegment.write(zzBuffer, zzStartRead, matchLength);
if (matchLength <= 7) { // 0x10FFFF = 1114111: max 7 decimal chars
String decimalCharRef = yytext();
int codePoint = 0;
try {
codePoint = Integer.parseInt(decimalCharRef);
} catch(Exception e) {
assert false: "Exception parsing code point '" + decimalCharRef + "'";
}
if (codePoint <= 0x10FFFF) {
outputSegment = entitySegment;
outputSegment.clear();
if (codePoint >= Character.MIN_SURROGATE
&& codePoint <= Character.MAX_SURROGATE) {
outputSegment.unsafeWrite(REPLACEMENT_CHARACTER);
} else {
outputSegment.setLength
(Character.toChars(codePoint, outputSegment.getArray(), 0));
}
yybegin(CHARACTER_REFERENCE_TAIL);
} else {
outputSegment = inputSegment;
yybegin(YYINITIAL);
return outputSegment.nextChar();
}
} else {
outputSegment = inputSegment;
yybegin(YYINITIAL);
return outputSegment.nextChar();
}
}
}
<CHARACTER_REFERENCE_TAIL> {
";" {
cumulativeDiff
+= inputSegment.length() + yylength() - outputSegment.length();
addOffCorrectMap(outputCharCount + outputSegment.length(), cumulativeDiff);
yybegin(YYINITIAL);
return outputSegment.nextChar();
}
}
<LEFT_ANGLE_BRACKET_SLASH> {
\s+ { inputSegment.write(zzBuffer, zzStartRead, yylength()); }
[bB][rR] \s* ">" {
yybegin(YYINITIAL);
if (escapeBR) {
inputSegment.write(zzBuffer, zzStartRead, yylength());
outputSegment = inputSegment;
return outputSegment.nextChar();
} else {
cumulativeDiff
+= inputSegment.length() + yylength() - outputSegment.length();
addOffCorrectMap(outputCharCount + outputSegment.length(), cumulativeDiff);
inputSegment.reset();
return BR_END_TAG_REPLACEMENT;
}
}
{InlineElment} {
inputSegment.write(zzBuffer, zzStartRead, yylength());
if (null != escapedTags
&& escapedTags.contains(zzBuffer, zzStartRead, yylength())) {
yybegin(END_TAG_TAIL_INCLUDE);
} else {
yybegin(END_TAG_TAIL_EXCLUDE);
}
}
{Name} {
inputSegment.write(zzBuffer, zzStartRead, yylength());
if (null != escapedTags
&& escapedTags.contains(zzBuffer, zzStartRead, yylength())) {
yybegin(END_TAG_TAIL_INCLUDE);
} else {
yybegin(END_TAG_TAIL_SUBSTITUTE);
}
}
}
<END_TAG_TAIL_INCLUDE> {
\s* ">" {
inputSegment.write(zzBuffer, zzStartRead, yylength());
outputSegment = inputSegment;
yybegin(YYINITIAL);
return outputSegment.nextChar();
}
}
<END_TAG_TAIL_EXCLUDE> {
\s* ">" {
cumulativeDiff += inputSegment.length() + yylength();
addOffCorrectMap(outputCharCount, cumulativeDiff);
inputSegment.clear();
yybegin(YYINITIAL);
}
}
<END_TAG_TAIL_SUBSTITUTE> {
\s* ">" {
cumulativeDiff += inputSegment.length() + yylength() - 1;
addOffCorrectMap(outputCharCount + 1, cumulativeDiff);
inputSegment.clear();
yybegin(YYINITIAL);
return BLOCK_LEVEL_END_TAG_REPLACEMENT;
}
}
<LEFT_ANGLE_BRACKET> {
"!" { inputSegment.append('!'); yybegin(BANG); }
"/" { inputSegment.append('/'); yybegin(LEFT_ANGLE_BRACKET_SLASH); }
\s+ {
inputSegment.write(zzBuffer, zzStartRead, yylength());
yybegin(LEFT_ANGLE_BRACKET_SPACE);
}
"?" [^>]* [/?] ">" {
cumulativeDiff += inputSegment.length() + yylength();
addOffCorrectMap(outputCharCount, cumulativeDiff);
inputSegment.clear();
yybegin(YYINITIAL);
}
\s* [bB][rR] ( ( "="\s* | \s+ ) {OpenTagContent} )? \s* "/"? ">" {
yybegin(YYINITIAL);
if (escapeBR) {
inputSegment.write(zzBuffer, zzStartRead, yylength());
outputSegment = inputSegment;
return outputSegment.nextChar();
} else {
cumulativeDiff
+= inputSegment.length() + yylength() - outputSegment.length();
addOffCorrectMap(outputCharCount + 1, cumulativeDiff);
inputSegment.reset();
return BR_START_TAG_REPLACEMENT;
}
}
\s* [sS][cC][rR][iI][pP][tT] ( \s+ {OpenTagContent} )? \s* ">" {
yybegin(SCRIPT);
if (escapeSCRIPT) {
inputSegment.write(zzBuffer, zzStartRead, yylength());
outputSegment = inputSegment;
inputStart += 1 + yylength();
return outputSegment.nextChar();
}
}
\s* [sS][tT][yY][lL][eE] ( \s+ {OpenTagContent} )? \s* ">" {
yybegin(STYLE);
if (escapeSTYLE) {
inputSegment.write(zzBuffer, zzStartRead, yylength());
outputSegment = inputSegment;
inputStart += 1 + yylength();
return outputSegment.nextChar();
}
}
}
<LEFT_ANGLE_BRACKET, LEFT_ANGLE_BRACKET_SPACE> {
{InlineElment} {
inputSegment.write(zzBuffer, zzStartRead, yylength());
if (null != escapedTags
&& escapedTags.contains(zzBuffer, zzStartRead, yylength())) {
yybegin(START_TAG_TAIL_INCLUDE);
} else {
yybegin(START_TAG_TAIL_EXCLUDE);
}
}
{Name} {
inputSegment.write(zzBuffer, zzStartRead, yylength());
if (null != escapedTags
&& escapedTags.contains(zzBuffer, zzStartRead, yylength())) {
yybegin(START_TAG_TAIL_INCLUDE);
} else {
yybegin(START_TAG_TAIL_SUBSTITUTE);
}
}
}
<START_TAG_TAIL_INCLUDE> {
( ( "="\s* | \s+ ) {OpenTagContent} )? \s* "/"? ">" {
inputSegment.write(zzBuffer, zzStartRead, yylength());
outputSegment = inputSegment;
yybegin(YYINITIAL);
return outputSegment.nextChar();
}
}
<START_TAG_TAIL_EXCLUDE> {
( ( "="\s* | \s+ ) {OpenTagContent} )? \s* "/"? ">" {
cumulativeDiff += inputSegment.length() + yylength();
addOffCorrectMap(outputCharCount, cumulativeDiff);
inputSegment.clear();
outputSegment = inputSegment;
yybegin(YYINITIAL);
}
}
<START_TAG_TAIL_SUBSTITUTE> {
( ( "="\s* | \s+ ) {OpenTagContent} )? \s* "/"? ">" {
cumulativeDiff += inputSegment.length() + yylength() - 1;
addOffCorrectMap(outputCharCount + 1, cumulativeDiff);
inputSegment.clear();
yybegin(YYINITIAL);
return BLOCK_LEVEL_START_TAG_REPLACEMENT;
}
}
<BANG> {
"--" { yybegin(COMMENT); }
">" {
cumulativeDiff += inputSegment.length() + yylength();
addOffCorrectMap(outputCharCount, cumulativeDiff);
inputSegment.clear();
yybegin(YYINITIAL);
}
// From XML 1.0 <http://www.w3.org/TR/xml/>:
//
// [18] CDSect ::= CDStart CData CDEnd
// [19] CDStart ::= '<![CDATA['
// [20] CData ::= (Char* - (Char* ']]>' Char*))
// [21] CDEnd ::= ']]>'
//
"[CDATA[" {
cumulativeDiff += inputSegment.length() + yylength();
addOffCorrectMap(outputCharCount, cumulativeDiff);
inputSegment.clear();
yybegin(CDATA);
}
[^] {
inputSegment.append(zzBuffer[zzStartRead]);
}
}
<CDATA> {
"]]>" {
cumulativeDiff += yylength();
addOffCorrectMap(outputCharCount, cumulativeDiff);
yybegin(YYINITIAL);
}
[^] { return zzBuffer[zzStartRead]; }
}
<COMMENT> {
"<!--#" { restoreState = COMMENT; yybegin(SERVER_SIDE_INCLUDE); }
"-->" {
cumulativeDiff += yychar - inputStart + yylength();
addOffCorrectMap(outputCharCount, cumulativeDiff);
inputSegment.clear();
yybegin(YYINITIAL);
}
[^] { }
}
<SERVER_SIDE_INCLUDE> {
"-->" { yybegin(restoreState); }
"'" {
previousRestoreState = restoreState;
restoreState = SERVER_SIDE_INCLUDE;
yybegin(SINGLE_QUOTED_STRING);
}
"\"" {
previousRestoreState = restoreState;
restoreState = SERVER_SIDE_INCLUDE;
yybegin(DOUBLE_QUOTED_STRING);
}
[^] { }
}
<SCRIPT_COMMENT> {
"<!--#" { restoreState = SCRIPT_COMMENT; yybegin(SERVER_SIDE_INCLUDE); }
"'" { restoreState = SCRIPT_COMMENT; yybegin(SINGLE_QUOTED_STRING); }
"\"" { restoreState = SCRIPT_COMMENT; yybegin(DOUBLE_QUOTED_STRING); }
"-->" { yybegin(SCRIPT); }
[^] { }
}
<STYLE_COMMENT> {
"<!--#" { restoreState = STYLE_COMMENT; yybegin(SERVER_SIDE_INCLUDE); }
"'" { restoreState = STYLE_COMMENT; yybegin(SINGLE_QUOTED_STRING); }
"\"" { restoreState = STYLE_COMMENT; yybegin(DOUBLE_QUOTED_STRING); }
"-->" { yybegin(STYLE); }
[^] { }
}
<SINGLE_QUOTED_STRING> {
"\\" [^] { }
"'" { yybegin(restoreState); restoreState = previousRestoreState; }
[^] { }
}
<DOUBLE_QUOTED_STRING> {
"\\" [^] { }
"\"" { yybegin(restoreState); restoreState = previousRestoreState; }
[^] { }
}
<SCRIPT> {
"<!--" { yybegin(SCRIPT_COMMENT); }
"</" \s* [sS][cC][rR][iI][pP][tT] \s* ">" {
inputSegment.clear();
yybegin(YYINITIAL);
cumulativeDiff += yychar - inputStart;
int outputEnd = outputCharCount;
int returnValue;
if (escapeSCRIPT) {
inputSegment.write(zzBuffer, zzStartRead, yylength());
outputSegment = inputSegment;
returnValue = outputSegment.nextChar();
} else {
cumulativeDiff += yylength() - 1;
++outputEnd;
returnValue = SCRIPT_REPLACEMENT;
}
addOffCorrectMap(outputEnd, cumulativeDiff);
return returnValue;
}
[^] { }
}
<STYLE> {
"<!--" { yybegin(STYLE_COMMENT); }
"</" \s* [sS][tT][yY][lL][eE] \s* ">" {
inputSegment.clear();
yybegin(YYINITIAL);
cumulativeDiff += yychar - inputStart;
int outputEnd = outputCharCount;
int returnValue;
if (escapeSTYLE) {
inputSegment.write(zzBuffer, zzStartRead, yylength());
outputSegment = inputSegment;
returnValue = outputSegment.nextChar();
} else {
cumulativeDiff += yylength() - 1;
++outputEnd;
returnValue = STYLE_REPLACEMENT;
}
addOffCorrectMap(outputEnd, cumulativeDiff);
return returnValue;
}
[^] { }
}
<AMPERSAND,NUMERIC_CHARACTER,CHARACTER_REFERENCE_TAIL,LEFT_ANGLE_BRACKET_SLASH,END_TAG_TAIL_INCLUDE,END_TAG_TAIL_EXCLUDE,END_TAG_TAIL_SUBSTITUTE,LEFT_ANGLE_BRACKET,LEFT_ANGLE_BRACKET_SPACE,START_TAG_TAIL_INCLUDE,START_TAG_TAIL_EXCLUDE,START_TAG_TAIL_SUBSTITUTE,BANG> {
[^] {
yypushback(1);
outputSegment = inputSegment;
outputSegment.restart();
yybegin(YYINITIAL);
return outputSegment.nextChar();
}
}
[^] { return zzBuffer[zzStartRead]; }
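For orientation, here is a minimal usage sketch of the filter produced by the grammar above, assuming the Lucene 4.0-era CharStream/CharReader API; the wrapper class and the sample output shown are illustrative only and not part of this patch.

import java.io.Reader;
import java.io.StringReader;
import org.apache.lucene.analysis.CharReader;
import org.apache.lucene.analysis.charfilter.HTMLStripCharFilter;

public class HTMLStripExample {
  public static void main(String[] args) throws Exception {
    Reader html = new StringReader("<p>hello <b>world</b> &amp; more</p>");
    HTMLStripCharFilter filter = new HTMLStripCharFilter(CharReader.get(html));
    StringBuilder sb = new StringBuilder();
    int ch;
    while ((ch = filter.read()) != -1) {
      sb.append((char) ch);
    }
    filter.close();
    System.out.println(sb); // e.g. "\nhello world & more\n"
  }
}

Block-level tags are replaced with newlines, inline tags are dropped, and character entities are decoded, while the addOffCorrectMap calls keep correctOffset pointing back into the original markup.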

View File

@@ -0,0 +1,530 @@
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import re
# A simple python script to generate an HTML entity map and a regex alternation
# for inclusion in HTMLStripCharFilter.jflex.
def main():
print get_apache_license()
codes = {}
regex = re.compile(r'\s*<!ENTITY\s+(\S+)\s+"&(?:#38;)?#(\d+);"')
for line in get_entity_text().split('\n'):
match = regex.match(line)
if match:
key = match.group(1)
if key == 'quot': codes[key] = r'\"'
elif key == 'nbsp': codes[key] = ' ';
else : codes[key] = r'\u%04X' % int(match.group(2))
keys = sorted(codes)
first_entry = True
output_line = 'CharacterEntities = ( '
for key in keys:
new_entry = ('"%s"' if first_entry else ' | "%s"') % key
first_entry = False
if len(output_line) + len(new_entry) >= 80:
print output_line
output_line = ' '
output_line += new_entry
if key in ('quot','copy','gt','lt','reg','amp'):
new_entry = ' | "%s"' % key.upper()
if len(output_line) + len(new_entry) >= 80:
print output_line
output_line = ' '
output_line += new_entry
print output_line, ')'
print '%{'
print ' private static final Set<String> upperCaseVariantsAccepted'
print ' = new HashSet<String>(Arrays.asList("quot","copy","gt","lt","reg","amp"));'
print ' private static final CharArrayMap<Character> entityValues'
print ' = new CharArrayMap<Character>(Version.LUCENE_40, %i, false);' % len(keys)
print ' static {'
print ' String[] entities = {'
output_line = ' '
for key in keys:
new_entry = ' "%s", "%s",' % (key, codes[key])
if len(output_line) + len(new_entry) >= 80:
print output_line
output_line = ' '
output_line += new_entry
print output_line[:-1]
print ' };'
print ' for (int i = 0 ; i < entities.length ; i += 2) {'
print ' Character value = entities[i + 1].charAt(0);'
print ' entityValues.put(entities[i], value);'
print ' if (upperCaseVariantsAccepted.contains(entities[i])) {'
print ' entityValues.put(entities[i].toUpperCase(), value);'
print ' }'
print ' }'
print " }"
print "%}"
def get_entity_text():
# The text below is taken verbatim from
# <http://www.w3.org/TR/REC-html40/sgml/entities.html>:
text = r"""
F.1. XHTML Character Entities
XHTML DTDs make available a standard collection of named character entities. Those entities are defined in this section.
F.1.1. XHTML Latin 1 Character Entities
You can download this version of this file from http://www.w3.org/TR/2010/REC-xhtml-modularization/DTD/xhtml-lat1.ent. The latest version is available at http://www.w3.org/MarkUp/DTD/xhtml-lat1.ent.
<!-- ...................................................................... -->
<!-- XML-compatible ISO Latin 1 Character Entity Set for XHTML ............ -->
<!-- file: xhtml-lat1.ent
Typical invocation:
<!ENTITY % xhtml-lat1
PUBLIC "-//W3C//ENTITIES Latin 1 for XHTML//EN"
"xhtml-lat1.ent" >
%xhtml-lat1;
This DTD module is identified by the PUBLIC and SYSTEM identifiers:
PUBLIC "-//W3C//ENTITIES Latin 1 for XHTML//EN"
SYSTEM "http://www.w3.org/MarkUp/DTD/xhtml-lat1.ent"
Revision: $Id: xhtml-lat1.ent,v 4.1 2001/04/10 09:34:14 altheim Exp $ SMI
Portions (C) International Organization for Standardization 1986:
Permission to copy in any form is granted for use with conforming
SGML systems and applications as defined in ISO 8879, provided
this notice is included in all copies.
-->
<!ENTITY nbsp "&#160;" ><!-- no-break space = non-breaking space, U+00A0 ISOnum -->
<!ENTITY iexcl "&#161;" ><!-- inverted exclamation mark, U+00A1 ISOnum -->
<!ENTITY cent "&#162;" ><!-- cent sign, U+00A2 ISOnum -->
<!ENTITY pound "&#163;" ><!-- pound sign, U+00A3 ISOnum -->
<!ENTITY curren "&#164;" ><!-- currency sign, U+00A4 ISOnum -->
<!ENTITY yen "&#165;" ><!-- yen sign = yuan sign, U+00A5 ISOnum -->
<!ENTITY brvbar "&#166;" ><!-- broken bar = broken vertical bar, U+00A6 ISOnum -->
<!ENTITY sect "&#167;" ><!-- section sign, U+00A7 ISOnum -->
<!ENTITY uml "&#168;" ><!-- diaeresis = spacing diaeresis, U+00A8 ISOdia -->
<!ENTITY copy "&#169;" ><!-- copyright sign, U+00A9 ISOnum -->
<!ENTITY ordf "&#170;" ><!-- feminine ordinal indicator, U+00AA ISOnum -->
<!ENTITY laquo "&#171;" ><!-- left-pointing double angle quotation mark = left pointing guillemet, U+00AB ISOnum -->
<!ENTITY not "&#172;" ><!-- not sign, U+00AC ISOnum -->
<!ENTITY shy "&#173;" ><!-- soft hyphen = discretionary hyphen, U+00AD ISOnum -->
<!ENTITY reg "&#174;" ><!-- registered sign = registered trade mark sign, U+00AE ISOnum -->
<!ENTITY macr "&#175;" ><!-- macron = spacing macron = overline = APL overbar, U+00AF ISOdia -->
<!ENTITY deg "&#176;" ><!-- degree sign, U+00B0 ISOnum -->
<!ENTITY plusmn "&#177;" ><!-- plus-minus sign = plus-or-minus sign, U+00B1 ISOnum -->
<!ENTITY sup2 "&#178;" ><!-- superscript two = superscript digit two = squared, U+00B2 ISOnum -->
<!ENTITY sup3 "&#179;" ><!-- superscript three = superscript digit three = cubed, U+00B3 ISOnum -->
<!ENTITY acute "&#180;" ><!-- acute accent = spacing acute, U+00B4 ISOdia -->
<!ENTITY micro "&#181;" ><!-- micro sign, U+00B5 ISOnum -->
<!ENTITY para "&#182;" ><!-- pilcrow sign = paragraph sign, U+00B6 ISOnum -->
<!ENTITY middot "&#183;" ><!-- middle dot = Georgian comma = Greek middle dot, U+00B7 ISOnum -->
<!ENTITY cedil "&#184;" ><!-- cedilla = spacing cedilla, U+00B8 ISOdia -->
<!ENTITY sup1 "&#185;" ><!-- superscript one = superscript digit one, U+00B9 ISOnum -->
<!ENTITY ordm "&#186;" ><!-- masculine ordinal indicator, U+00BA ISOnum -->
<!ENTITY raquo "&#187;" ><!-- right-pointing double angle quotation mark = right pointing guillemet, U+00BB ISOnum -->
<!ENTITY frac14 "&#188;" ><!-- vulgar fraction one quarter = fraction one quarter, U+00BC ISOnum -->
<!ENTITY frac12 "&#189;" ><!-- vulgar fraction one half = fraction one half, U+00BD ISOnum -->
<!ENTITY frac34 "&#190;" ><!-- vulgar fraction three quarters = fraction three quarters, U+00BE ISOnum -->
<!ENTITY iquest "&#191;" ><!-- inverted question mark = turned question mark, U+00BF ISOnum -->
<!ENTITY Agrave "&#192;" ><!-- latin capital A with grave = latin capital A grave, U+00C0 ISOlat1 -->
<!ENTITY Aacute "&#193;" ><!-- latin capital A with acute, U+00C1 ISOlat1 -->
<!ENTITY Acirc "&#194;" ><!-- latin capital A with circumflex, U+00C2 ISOlat1 -->
<!ENTITY Atilde "&#195;" ><!-- latin capital A with tilde, U+00C3 ISOlat1 -->
<!ENTITY Auml "&#196;" ><!-- latin capital A with diaeresis, U+00C4 ISOlat1 -->
<!ENTITY Aring "&#197;" ><!-- latin capital A with ring above = latin capital A ring, U+00C5 ISOlat1 -->
<!ENTITY AElig "&#198;" ><!-- latin capital AE = latin capital ligature AE, U+00C6 ISOlat1 -->
<!ENTITY Ccedil "&#199;" ><!-- latin capital C with cedilla, U+00C7 ISOlat1 -->
<!ENTITY Egrave "&#200;" ><!-- latin capital E with grave, U+00C8 ISOlat1 -->
<!ENTITY Eacute "&#201;" ><!-- latin capital E with acute, U+00C9 ISOlat1 -->
<!ENTITY Ecirc "&#202;" ><!-- latin capital E with circumflex, U+00CA ISOlat1 -->
<!ENTITY Euml "&#203;" ><!-- latin capital E with diaeresis, U+00CB ISOlat1 -->
<!ENTITY Igrave "&#204;" ><!-- latin capital I with grave, U+00CC ISOlat1 -->
<!ENTITY Iacute "&#205;" ><!-- latin capital I with acute, U+00CD ISOlat1 -->
<!ENTITY Icirc "&#206;" ><!-- latin capital I with circumflex, U+00CE ISOlat1 -->
<!ENTITY Iuml "&#207;" ><!-- latin capital I with diaeresis, U+00CF ISOlat1 -->
<!ENTITY ETH "&#208;" ><!-- latin capital ETH, U+00D0 ISOlat1 -->
<!ENTITY Ntilde "&#209;" ><!-- latin capital N with tilde, U+00D1 ISOlat1 -->
<!ENTITY Ograve "&#210;" ><!-- latin capital O with grave, U+00D2 ISOlat1 -->
<!ENTITY Oacute "&#211;" ><!-- latin capital O with acute, U+00D3 ISOlat1 -->
<!ENTITY Ocirc "&#212;" ><!-- latin capital O with circumflex, U+00D4 ISOlat1 -->
<!ENTITY Otilde "&#213;" ><!-- latin capital O with tilde, U+00D5 ISOlat1 -->
<!ENTITY Ouml "&#214;" ><!-- latin capital O with diaeresis, U+00D6 ISOlat1 -->
<!ENTITY times "&#215;" ><!-- multiplication sign, U+00D7 ISOnum -->
<!ENTITY Oslash "&#216;" ><!-- latin capital O with stroke = latin capital O slash, U+00D8 ISOlat1 -->
<!ENTITY Ugrave "&#217;" ><!-- latin capital U with grave, U+00D9 ISOlat1 -->
<!ENTITY Uacute "&#218;" ><!-- latin capital U with acute, U+00DA ISOlat1 -->
<!ENTITY Ucirc "&#219;" ><!-- latin capital U with circumflex, U+00DB ISOlat1 -->
<!ENTITY Uuml "&#220;" ><!-- latin capital U with diaeresis, U+00DC ISOlat1 -->
<!ENTITY Yacute "&#221;" ><!-- latin capital Y with acute, U+00DD ISOlat1 -->
<!ENTITY THORN "&#222;" ><!-- latin capital THORN, U+00DE ISOlat1 -->
<!ENTITY szlig "&#223;" ><!-- latin small sharp s = ess-zed, U+00DF ISOlat1 -->
<!ENTITY agrave "&#224;" ><!-- latin small a with grave = latin small a grave, U+00E0 ISOlat1 -->
<!ENTITY aacute "&#225;" ><!-- latin small a with acute, U+00E1 ISOlat1 -->
<!ENTITY acirc "&#226;" ><!-- latin small a with circumflex, U+00E2 ISOlat1 -->
<!ENTITY atilde "&#227;" ><!-- latin small a with tilde, U+00E3 ISOlat1 -->
<!ENTITY auml "&#228;" ><!-- latin small a with diaeresis, U+00E4 ISOlat1 -->
<!ENTITY aring "&#229;" ><!-- latin small a with ring above = latin small a ring, U+00E5 ISOlat1 -->
<!ENTITY aelig "&#230;" ><!-- latin small ae = latin small ligature ae, U+00E6 ISOlat1 -->
<!ENTITY ccedil "&#231;" ><!-- latin small c with cedilla, U+00E7 ISOlat1 -->
<!ENTITY egrave "&#232;" ><!-- latin small e with grave, U+00E8 ISOlat1 -->
<!ENTITY eacute "&#233;" ><!-- latin small e with acute, U+00E9 ISOlat1 -->
<!ENTITY ecirc "&#234;" ><!-- latin small e with circumflex, U+00EA ISOlat1 -->
<!ENTITY euml "&#235;" ><!-- latin small e with diaeresis, U+00EB ISOlat1 -->
<!ENTITY igrave "&#236;" ><!-- latin small i with grave, U+00EC ISOlat1 -->
<!ENTITY iacute "&#237;" ><!-- latin small i with acute, U+00ED ISOlat1 -->
<!ENTITY icirc "&#238;" ><!-- latin small i with circumflex, U+00EE ISOlat1 -->
<!ENTITY iuml "&#239;" ><!-- latin small i with diaeresis, U+00EF ISOlat1 -->
<!ENTITY eth "&#240;" ><!-- latin small eth, U+00F0 ISOlat1 -->
<!ENTITY ntilde "&#241;" ><!-- latin small n with tilde, U+00F1 ISOlat1 -->
<!ENTITY ograve "&#242;" ><!-- latin small o with grave, U+00F2 ISOlat1 -->
<!ENTITY oacute "&#243;" ><!-- latin small o with acute, U+00F3 ISOlat1 -->
<!ENTITY ocirc "&#244;" ><!-- latin small o with circumflex, U+00F4 ISOlat1 -->
<!ENTITY otilde "&#245;" ><!-- latin small o with tilde, U+00F5 ISOlat1 -->
<!ENTITY ouml "&#246;" ><!-- latin small o with diaeresis, U+00F6 ISOlat1 -->
<!ENTITY divide "&#247;" ><!-- division sign, U+00F7 ISOnum -->
<!ENTITY oslash "&#248;" ><!-- latin small o with stroke, = latin small o slash, U+00F8 ISOlat1 -->
<!ENTITY ugrave "&#249;" ><!-- latin small u with grave, U+00F9 ISOlat1 -->
<!ENTITY uacute "&#250;" ><!-- latin small u with acute, U+00FA ISOlat1 -->
<!ENTITY ucirc "&#251;" ><!-- latin small u with circumflex, U+00FB ISOlat1 -->
<!ENTITY uuml "&#252;" ><!-- latin small u with diaeresis, U+00FC ISOlat1 -->
<!ENTITY yacute "&#253;" ><!-- latin small y with acute, U+00FD ISOlat1 -->
<!ENTITY thorn "&#254;" ><!-- latin small thorn with, U+00FE ISOlat1 -->
<!ENTITY yuml "&#255;" ><!-- latin small y with diaeresis, U+00FF ISOlat1 -->
<!-- end of xhtml-lat1.ent -->
F.1.2. XHTML Special Characters
You can download this version of this file from http://www.w3.org/TR/2010/REC-xhtml-modularization/DTD/xhtml-special.ent. The latest version is available at http://www.w3.org/MarkUp/DTD/xhtml-special.ent.
<!-- ...................................................................... -->
<!-- XML-compatible ISO Special Character Entity Set for XHTML ............ -->
<!-- file: xhtml-special.ent
Typical invocation:
<!ENTITY % xhtml-special
PUBLIC "-//W3C//ENTITIES Special for XHTML//EN"
"xhtml-special.ent" >
%xhtml-special;
This DTD module is identified by the PUBLIC and SYSTEM identifiers:
PUBLIC "-//W3C//ENTITIES Special for XHTML//EN"
SYSTEM "http://www.w3.org/MarkUp/DTD/xhtml-special.ent"
Revision: $Id: xhtml-special.ent,v 4.1 2001/04/10 09:34:14 altheim Exp $ SMI
Portions (C) International Organization for Standardization 1986:
Permission to copy in any form is granted for use with conforming
SGML systems and applications as defined in ISO 8879, provided
this notice is included in all copies.
Revisions:
2000-10-28: added &apos; and altered XML Predefined Entities for compatibility
-->
<!-- Relevant ISO entity set is given unless names are newly introduced.
New names (i.e., not in ISO 8879 [SGML] list) do not clash with
any existing ISO 8879 entity names. ISO 10646 [ISO10646] character
numbers are given for each character, in hex. Entity values are
decimal conversions of the ISO 10646 values and refer to the
document character set. Names are Unicode [UNICODE] names.
-->
<!-- C0 Controls and Basic Latin -->
<!ENTITY lt "&#38;#60;" ><!-- less-than sign, U+003C ISOnum -->
<!ENTITY gt "&#62;" ><!-- greater-than sign, U+003E ISOnum -->
<!ENTITY amp "&#38;#38;" ><!-- ampersand, U+0026 ISOnum -->
<!ENTITY apos "&#39;" ><!-- The Apostrophe (Apostrophe Quote, APL Quote), U+0027 ISOnum -->
<!ENTITY quot "&#34;" ><!-- quotation mark (Quote Double), U+0022 ISOnum -->
<!-- Latin Extended-A -->
<!ENTITY OElig "&#338;" ><!-- latin capital ligature OE, U+0152 ISOlat2 -->
<!ENTITY oelig "&#339;" ><!-- latin small ligature oe, U+0153 ISOlat2 -->
<!-- ligature is a misnomer, this is a separate character in some languages -->
<!ENTITY Scaron "&#352;" ><!-- latin capital letter S with caron, U+0160 ISOlat2 -->
<!ENTITY scaron "&#353;" ><!-- latin small letter s with caron, U+0161 ISOlat2 -->
<!ENTITY Yuml "&#376;" ><!-- latin capital letter Y with diaeresis, U+0178 ISOlat2 -->
<!-- Spacing Modifier Letters -->
<!ENTITY circ "&#710;" ><!-- modifier letter circumflex accent, U+02C6 ISOpub -->
<!ENTITY tilde "&#732;" ><!-- small tilde, U+02DC ISOdia -->
<!-- General Punctuation -->
<!ENTITY ensp "&#8194;" ><!-- en space, U+2002 ISOpub -->
<!ENTITY emsp "&#8195;" ><!-- em space, U+2003 ISOpub -->
<!ENTITY thinsp "&#8201;" ><!-- thin space, U+2009 ISOpub -->
<!ENTITY zwnj "&#8204;" ><!-- zero width non-joiner, U+200C NEW RFC 2070 -->
<!ENTITY zwj "&#8205;" ><!-- zero width joiner, U+200D NEW RFC 2070 -->
<!ENTITY lrm "&#8206;" ><!-- left-to-right mark, U+200E NEW RFC 2070 -->
<!ENTITY rlm "&#8207;" ><!-- right-to-left mark, U+200F NEW RFC 2070 -->
<!ENTITY ndash "&#8211;" ><!-- en dash, U+2013 ISOpub -->
<!ENTITY mdash "&#8212;" ><!-- em dash, U+2014 ISOpub -->
<!ENTITY lsquo "&#8216;" ><!-- left single quotation mark, U+2018 ISOnum -->
<!ENTITY rsquo "&#8217;" ><!-- right single quotation mark, U+2019 ISOnum -->
<!ENTITY sbquo "&#8218;" ><!-- single low-9 quotation mark, U+201A NEW -->
<!ENTITY ldquo "&#8220;" ><!-- left double quotation mark, U+201C ISOnum -->
<!ENTITY rdquo "&#8221;" ><!-- right double quotation mark, U+201D ISOnum -->
<!ENTITY bdquo "&#8222;" ><!-- double low-9 quotation mark, U+201E NEW -->
<!ENTITY dagger "&#8224;" ><!-- dagger, U+2020 ISOpub -->
<!ENTITY Dagger "&#8225;" ><!-- double dagger, U+2021 ISOpub -->
<!ENTITY permil "&#8240;" ><!-- per mille sign, U+2030 ISOtech -->
<!-- lsaquo is proposed but not yet ISO standardized -->
<!ENTITY lsaquo "&#8249;" ><!-- single left-pointing angle quotation mark, U+2039 ISO proposed -->
<!-- rsaquo is proposed but not yet ISO standardized -->
<!ENTITY rsaquo "&#8250;" ><!-- single right-pointing angle quotation mark, U+203A ISO proposed -->
<!ENTITY euro "&#8364;" ><!-- euro sign, U+20AC NEW -->
<!-- end of xhtml-special.ent -->
F.1.3. XHTML Mathematical, Greek, and Symbolic Characters
You can download this version of this file from http://www.w3.org/TR/2010/REC-xhtml-modularization/DTD/xhtml-symbol.ent. The latest version is available at http://www.w3.org/MarkUp/DTD/xhtml-symbol.ent.
<!-- ...................................................................... -->
<!-- ISO Math, Greek and Symbolic Character Entity Set for XHTML .......... -->
<!-- file: xhtml-symbol.ent
Typical invocation:
<!ENTITY % xhtml-symbol
PUBLIC "-//W3C//ENTITIES Symbols for XHTML//EN"
"xhtml-symbol.ent" >
%xhtml-symbol;
This DTD module is identified by the PUBLIC and SYSTEM identifiers:
PUBLIC "-//W3C//ENTITIES Symbols for XHTML//EN"
SYSTEM "http://www.w3.org/MarkUp/DTD/xhtml-symbol.ent"
Revision: $Id: xhtml-symbol.ent,v 4.1 2001/04/10 09:34:14 altheim Exp $ SMI
Portions (C) International Organization for Standardization 1986:
Permission to copy in any form is granted for use with conforming
SGML systems and applications as defined in ISO 8879, provided
this notice is included in all copies.
-->
<!-- Relevant ISO entity set is given unless names are newly introduced.
New names (i.e., not in ISO 8879 [SGML] list) do not clash with
any existing ISO 8879 entity names. ISO 10646 [ISO10646] character
numbers are given for each character, in hex. Entity values are
decimal conversions of the ISO 10646 values and refer to the
document character set. Names are Unicode [UNICODE] names.
-->
<!-- Latin Extended-B -->
<!ENTITY fnof "&#402;" ><!-- latin small f with hook = function
= florin, U+0192 ISOtech -->
<!-- Greek -->
<!ENTITY Alpha "&#913;" ><!-- greek capital letter alpha, U+0391 -->
<!ENTITY Beta "&#914;" ><!-- greek capital letter beta, U+0392 -->
<!ENTITY Gamma "&#915;" ><!-- greek capital letter gamma, U+0393 ISOgrk3 -->
<!ENTITY Delta "&#916;" ><!-- greek capital letter delta, U+0394 ISOgrk3 -->
<!ENTITY Epsilon "&#917;" ><!-- greek capital letter epsilon, U+0395 -->
<!ENTITY Zeta "&#918;" ><!-- greek capital letter zeta, U+0396 -->
<!ENTITY Eta "&#919;" ><!-- greek capital letter eta, U+0397 -->
<!ENTITY Theta "&#920;" ><!-- greek capital letter theta, U+0398 ISOgrk3 -->
<!ENTITY Iota "&#921;" ><!-- greek capital letter iota, U+0399 -->
<!ENTITY Kappa "&#922;" ><!-- greek capital letter kappa, U+039A -->
<!ENTITY Lambda "&#923;" ><!-- greek capital letter lambda, U+039B ISOgrk3 -->
<!ENTITY Mu "&#924;" ><!-- greek capital letter mu, U+039C -->
<!ENTITY Nu "&#925;" ><!-- greek capital letter nu, U+039D -->
<!ENTITY Xi "&#926;" ><!-- greek capital letter xi, U+039E ISOgrk3 -->
<!ENTITY Omicron "&#927;" ><!-- greek capital letter omicron, U+039F -->
<!ENTITY Pi "&#928;" ><!-- greek capital letter pi, U+03A0 ISOgrk3 -->
<!ENTITY Rho "&#929;" ><!-- greek capital letter rho, U+03A1 -->
<!-- there is no Sigmaf, and no U+03A2 character either -->
<!ENTITY Sigma "&#931;" ><!-- greek capital letter sigma, U+03A3 ISOgrk3 -->
<!ENTITY Tau "&#932;" ><!-- greek capital letter tau, U+03A4 -->
<!ENTITY Upsilon "&#933;" ><!-- greek capital letter upsilon,
U+03A5 ISOgrk3 -->
<!ENTITY Phi "&#934;" ><!-- greek capital letter phi, U+03A6 ISOgrk3 -->
<!ENTITY Chi "&#935;" ><!-- greek capital letter chi, U+03A7 -->
<!ENTITY Psi "&#936;" ><!-- greek capital letter psi, U+03A8 ISOgrk3 -->
<!ENTITY Omega "&#937;" ><!-- greek capital letter omega, U+03A9 ISOgrk3 -->
<!ENTITY alpha "&#945;" ><!-- greek small letter alpha, U+03B1 ISOgrk3 -->
<!ENTITY beta "&#946;" ><!-- greek small letter beta, U+03B2 ISOgrk3 -->
<!ENTITY gamma "&#947;" ><!-- greek small letter gamma, U+03B3 ISOgrk3 -->
<!ENTITY delta "&#948;" ><!-- greek small letter delta, U+03B4 ISOgrk3 -->
<!ENTITY epsilon "&#949;" ><!-- greek small letter epsilon, U+03B5 ISOgrk3 -->
<!ENTITY zeta "&#950;" ><!-- greek small letter zeta, U+03B6 ISOgrk3 -->
<!ENTITY eta "&#951;" ><!-- greek small letter eta, U+03B7 ISOgrk3 -->
<!ENTITY theta "&#952;" ><!-- greek small letter theta, U+03B8 ISOgrk3 -->
<!ENTITY iota "&#953;" ><!-- greek small letter iota, U+03B9 ISOgrk3 -->
<!ENTITY kappa "&#954;" ><!-- greek small letter kappa, U+03BA ISOgrk3 -->
<!ENTITY lambda "&#955;" ><!-- greek small letter lambda, U+03BB ISOgrk3 -->
<!ENTITY mu "&#956;" ><!-- greek small letter mu, U+03BC ISOgrk3 -->
<!ENTITY nu "&#957;" ><!-- greek small letter nu, U+03BD ISOgrk3 -->
<!ENTITY xi "&#958;" ><!-- greek small letter xi, U+03BE ISOgrk3 -->
<!ENTITY omicron "&#959;" ><!-- greek small letter omicron, U+03BF NEW -->
<!ENTITY pi "&#960;" ><!-- greek small letter pi, U+03C0 ISOgrk3 -->
<!ENTITY rho "&#961;" ><!-- greek small letter rho, U+03C1 ISOgrk3 -->
<!ENTITY sigmaf "&#962;" ><!-- greek small letter final sigma, U+03C2 ISOgrk3 -->
<!ENTITY sigma "&#963;" ><!-- greek small letter sigma, U+03C3 ISOgrk3 -->
<!ENTITY tau "&#964;" ><!-- greek small letter tau, U+03C4 ISOgrk3 -->
<!ENTITY upsilon "&#965;" ><!-- greek small letter upsilon, U+03C5 ISOgrk3 -->
<!ENTITY phi "&#966;" ><!-- greek small letter phi, U+03C6 ISOgrk3 -->
<!ENTITY chi "&#967;" ><!-- greek small letter chi, U+03C7 ISOgrk3 -->
<!ENTITY psi "&#968;" ><!-- greek small letter psi, U+03C8 ISOgrk3 -->
<!ENTITY omega "&#969;" ><!-- greek small letter omega, U+03C9 ISOgrk3 -->
<!ENTITY thetasym "&#977;" ><!-- greek small letter theta symbol, U+03D1 NEW -->
<!ENTITY upsih "&#978;" ><!-- greek upsilon with hook symbol, U+03D2 NEW -->
<!ENTITY piv "&#982;" ><!-- greek pi symbol, U+03D6 ISOgrk3 -->
<!-- General Punctuation -->
<!ENTITY bull "&#8226;" ><!-- bullet = black small circle, U+2022 ISOpub -->
<!-- bullet is NOT the same as bullet operator, U+2219 -->
<!ENTITY hellip "&#8230;" ><!-- horizontal ellipsis = three dot leader, U+2026 ISOpub -->
<!ENTITY prime "&#8242;" ><!-- prime = minutes = feet, U+2032 ISOtech -->
<!ENTITY Prime "&#8243;" ><!-- double prime = seconds = inches, U+2033 ISOtech -->
<!ENTITY oline "&#8254;" ><!-- overline = spacing overscore, U+203E NEW -->
<!ENTITY frasl "&#8260;" ><!-- fraction slash, U+2044 NEW -->
<!-- Letterlike Symbols -->
<!ENTITY weierp "&#8472;" ><!-- script capital P = power set = Weierstrass p, U+2118 ISOamso -->
<!ENTITY image "&#8465;" ><!-- blackletter capital I = imaginary part, U+2111 ISOamso -->
<!ENTITY real "&#8476;" ><!-- blackletter capital R = real part symbol, U+211C ISOamso -->
<!ENTITY trade "&#8482;" ><!-- trade mark sign, U+2122 ISOnum -->
<!ENTITY alefsym "&#8501;" ><!-- alef symbol = first transfinite cardinal, U+2135 NEW -->
<!-- alef symbol is NOT the same as hebrew letter alef, U+05D0 although
the same glyph could be used to depict both characters -->
<!-- Arrows -->
<!ENTITY larr "&#8592;" ><!-- leftwards arrow, U+2190 ISOnum -->
<!ENTITY uarr "&#8593;" ><!-- upwards arrow, U+2191 ISOnum-->
<!ENTITY rarr "&#8594;" ><!-- rightwards arrow, U+2192 ISOnum -->
<!ENTITY darr "&#8595;" ><!-- downwards arrow, U+2193 ISOnum -->
<!ENTITY harr "&#8596;" ><!-- left right arrow, U+2194 ISOamsa -->
<!ENTITY crarr "&#8629;" ><!-- downwards arrow with corner leftwards
= carriage return, U+21B5 NEW -->
<!ENTITY lArr "&#8656;" ><!-- leftwards double arrow, U+21D0 ISOtech -->
<!-- Unicode does not say that lArr is the same as the 'is implied by' arrow
but also does not have any other character for that function. So ? lArr can
be used for 'is implied by' as ISOtech suggests -->
<!ENTITY uArr "&#8657;" ><!-- upwards double arrow, U+21D1 ISOamsa -->
<!ENTITY rArr "&#8658;" ><!-- rightwards double arrow, U+21D2 ISOtech -->
<!-- Unicode does not say this is the 'implies' character but does not have
another character with this function so ?
rArr can be used for 'implies' as ISOtech suggests -->
<!ENTITY dArr "&#8659;" ><!-- downwards double arrow, U+21D3 ISOamsa -->
<!ENTITY hArr "&#8660;" ><!-- left right double arrow, U+21D4 ISOamsa -->
<!-- Mathematical Operators -->
<!ENTITY forall "&#8704;" ><!-- for all, U+2200 ISOtech -->
<!ENTITY part "&#8706;" ><!-- partial differential, U+2202 ISOtech -->
<!ENTITY exist "&#8707;" ><!-- there exists, U+2203 ISOtech -->
<!ENTITY empty "&#8709;" ><!-- empty set = null set, U+2205 ISOamso -->
<!ENTITY nabla "&#8711;" ><!-- nabla = backward difference, U+2207 ISOtech -->
<!ENTITY isin "&#8712;" ><!-- element of, U+2208 ISOtech -->
<!ENTITY notin "&#8713;" ><!-- not an element of, U+2209 ISOtech -->
<!ENTITY ni "&#8715;" ><!-- contains as member, U+220B ISOtech -->
<!-- should there be a more memorable name than 'ni'? -->
<!ENTITY prod "&#8719;" ><!-- n-ary product = product sign, U+220F ISOamsb -->
<!-- prod is NOT the same character as U+03A0 'greek capital letter pi' though
the same glyph might be used for both -->
<!ENTITY sum "&#8721;" ><!-- n-ary sumation, U+2211 ISOamsb -->
<!-- sum is NOT the same character as U+03A3 'greek capital letter sigma'
though the same glyph might be used for both -->
<!ENTITY minus "&#8722;" ><!-- minus sign, U+2212 ISOtech -->
<!ENTITY lowast "&#8727;" ><!-- asterisk operator, U+2217 ISOtech -->
<!ENTITY radic "&#8730;" ><!-- square root = radical sign, U+221A ISOtech -->
<!ENTITY prop "&#8733;" ><!-- proportional to, U+221D ISOtech -->
<!ENTITY infin "&#8734;" ><!-- infinity, U+221E ISOtech -->
<!ENTITY ang "&#8736;" ><!-- angle, U+2220 ISOamso -->
<!ENTITY and "&#8743;" ><!-- logical and = wedge, U+2227 ISOtech -->
<!ENTITY or "&#8744;" ><!-- logical or = vee, U+2228 ISOtech -->
<!ENTITY cap "&#8745;" ><!-- intersection = cap, U+2229 ISOtech -->
<!ENTITY cup "&#8746;" ><!-- union = cup, U+222A ISOtech -->
<!ENTITY int "&#8747;" ><!-- integral, U+222B ISOtech -->
<!ENTITY there4 "&#8756;" ><!-- therefore, U+2234 ISOtech -->
<!ENTITY sim "&#8764;" ><!-- tilde operator = varies with = similar to, U+223C ISOtech -->
<!-- tilde operator is NOT the same character as the tilde, U+007E,
although the same glyph might be used to represent both -->
<!ENTITY cong "&#8773;" ><!-- approximately equal to, U+2245 ISOtech -->
<!ENTITY asymp "&#8776;" ><!-- almost equal to = asymptotic to, U+2248 ISOamsr -->
<!ENTITY ne "&#8800;" ><!-- not equal to, U+2260 ISOtech -->
<!ENTITY equiv "&#8801;" ><!-- identical to, U+2261 ISOtech -->
<!ENTITY le "&#8804;" ><!-- less-than or equal to, U+2264 ISOtech -->
<!ENTITY ge "&#8805;" ><!-- greater-than or equal to, U+2265 ISOtech -->
<!ENTITY sub "&#8834;" ><!-- subset of, U+2282 ISOtech -->
<!ENTITY sup "&#8835;" ><!-- superset of, U+2283 ISOtech -->
<!-- note that nsup, 'not a superset of, U+2283' is not covered by the Symbol
font encoding and is not included. Should it be, for symmetry?
It is in ISOamsn -->
<!ENTITY nsub "&#8836;" ><!-- not a subset of, U+2284 ISOamsn -->
<!ENTITY sube "&#8838;" ><!-- subset of or equal to, U+2286 ISOtech -->
<!ENTITY supe "&#8839;" ><!-- superset of or equal to, U+2287 ISOtech -->
<!ENTITY oplus "&#8853;" ><!-- circled plus = direct sum, U+2295 ISOamsb -->
<!ENTITY otimes "&#8855;" ><!-- circled times = vector product, U+2297 ISOamsb -->
<!ENTITY perp "&#8869;" ><!-- up tack = orthogonal to = perpendicular, U+22A5 ISOtech -->
<!ENTITY sdot "&#8901;" ><!-- dot operator, U+22C5 ISOamsb -->
<!-- dot operator is NOT the same character as U+00B7 middle dot -->
<!-- Miscellaneous Technical -->
<!ENTITY lceil "&#8968;" ><!-- left ceiling = apl upstile, U+2308 ISOamsc -->
<!ENTITY rceil "&#8969;" ><!-- right ceiling, U+2309 ISOamsc -->
<!ENTITY lfloor "&#8970;" ><!-- left floor = apl downstile, U+230A ISOamsc -->
<!ENTITY rfloor "&#8971;" ><!-- right floor, U+230B ISOamsc -->
<!ENTITY lang "&#9001;" ><!-- left-pointing angle bracket = bra, U+2329 ISOtech -->
<!-- lang is NOT the same character as U+003C 'less than'
or U+2039 'single left-pointing angle quotation mark' -->
<!ENTITY rang "&#9002;" ><!-- right-pointing angle bracket = ket, U+232A ISOtech -->
<!-- rang is NOT the same character as U+003E 'greater than'
or U+203A 'single right-pointing angle quotation mark' -->
<!-- Geometric Shapes -->
<!ENTITY loz "&#9674;" ><!-- lozenge, U+25CA ISOpub -->
<!-- Miscellaneous Symbols -->
<!ENTITY spades "&#9824;" ><!-- black spade suit, U+2660 ISOpub -->
<!-- black here seems to mean filled as opposed to hollow -->
<!ENTITY clubs "&#9827;" ><!-- black club suit = shamrock, U+2663 ISOpub -->
<!ENTITY hearts "&#9829;" ><!-- black heart suit = valentine, U+2665 ISOpub -->
<!ENTITY diams "&#9830;" ><!-- black diamond suit, U+2666 ISOpub -->
<!-- end of xhtml-symbol.ent -->
"""
return text
def get_apache_license():
license = r"""/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
"""
return license
main()

View File

@ -17,6 +17,42 @@
-->
<html><head></head>
<body>
Filters that normalize text before tokenization.
<p>
Chainable filters that normalize text before tokenization and provide
mappings between normalized text offsets and the corresponding offset
in the original text.
</p>
<H2>CharFilter offset mappings</H2>
<p>
CharFilters modify an input stream via a series of substring
replacements (including deletions and insertions) to produce an output
stream. There are three possible replacement cases: the replacement
string has the same length as the original substring; the replacement
is shorter; and the replacement is longer. In the latter two cases
(when the replacement has a different length than the original),
one or more offset correction mappings are required.
</p>
<p>
When the replacement is shorter than the original (e.g. when the
replacement is the empty string), a single offset correction mapping
should be added at the replacement's end offset in the output stream.
The <code>cumulativeDiff</code> parameter to the
<code>addOffCorrectMapping()</code> method will be the sum of all
previous replacement offset adjustments, with the addition of the
difference between the lengths of the original substring and the
replacement string (a positive value).
</p>
<p>
When the replacement is longer than the original (e.g. when the
original is the empty string), you should add as many offset
correction mappings as the difference between the lengths of the
replacement string and the original substring, starting at the
end offset the original substring would have had in the output stream.
The <code>cumulativeDiff</code> parameter to the
<code>addOffCorrectMapping()</code> method will be the sum of all
previous replacement offset adjustments, with the addition of the
difference between the lengths of the original substring and the
replacement string so far (a negative value).
</p>
</body>
</html>
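To make the two mapping rules above concrete, here is a minimal sketch (not part of this change) of a CharFilter that deletes every 'x' from its input and registers the corresponding offset corrections. It assumes the BaseCharFilter/CharStream API described above (addOffCorrectMapping(off, cumulativeDiff) plus the protected input field inherited from CharFilter); the class name and the exact import locations are illustrative, since the charfilter classes live in different packages in core vs. the analysis module.
import java.io.IOException;
// assumed locations; adjust to where BaseCharFilter/CharStream live in this tree
import org.apache.lucene.analysis.BaseCharFilter;
import org.apache.lucene.analysis.CharStream;
public final class StripXCharFilter extends BaseCharFilter {
  private int cumulativeDiff = 0; // total number of chars deleted so far
  private int outputPos = 0;      // number of chars emitted to the consumer so far
  public StripXCharFilter(CharStream in) {
    super(in);
  }
  @Override
  public int read(char[] cbuf, int off, int len) throws IOException {
    if (len == 0) return 0;
    int written = 0;
    while (written < len) {
      int c = input.read();
      if (c == -1) break;
      if (c == 'x') {
        // the replacement ("") is shorter than the original ("x"): add a single
        // mapping at the replacement's end offset in the output stream
        cumulativeDiff++;
        addOffCorrectMapping(outputPos, cumulativeDiff);
      } else {
        cbuf[off + written++] = (char) c;
        outputPos++;
      }
    }
    return written == 0 ? -1 : written;
  }
}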

View File

@ -154,13 +154,22 @@ public abstract class CompoundWordTokenFilterBase extends TokenFilter {
/** Construct the compound token based on a slice of the current {@link CompoundWordTokenFilterBase#termAtt}. */
public CompoundToken(int offset, int length) {
final int newStart = CompoundWordTokenFilterBase.this.offsetAtt.startOffset() + offset;
this.txt = CompoundWordTokenFilterBase.this.termAtt.subSequence(offset, offset + length);
// TODO: This ignores the original endOffset, if a CharFilter/Tokenizer/Filter removed
// chars from the term, offsets may not match correctly (other filters producing tokens
// may also have this problem):
this.startOffset = newStart;
this.endOffset = newStart + length;
// offsets of the original word
int startOff = CompoundWordTokenFilterBase.this.offsetAtt.startOffset();
int endOff = CompoundWordTokenFilterBase.this.offsetAtt.endOffset();
if (endOff - startOff != CompoundWordTokenFilterBase.this.termAtt.length()) {
// if the length implied by the start and end offsets doesn't match the term text
// length, assume this token is a synonym and don't adjust the offsets.
this.startOffset = startOff;
this.endOffset = endOff;
} else {
final int newStart = startOff + offset;
this.startOffset = newStart;
this.endOffset = newStart + length;
}
}
}
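For a concrete (hypothetical) illustration of the check above: if an earlier filter injected the synonym "fussball" (term length 8) over original text spanning offsets 0-7, the offset span no longer matches the term length, so decompounded parts such as "fuss" and "ball" keep the original 0-7 offsets; only when the span and the term length agree are sub-token offsets computed from the part's position within the term.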

View File

@ -0,0 +1,47 @@
package org.apache.lucene.analysis.core;
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
import org.apache.lucene.analysis.util.FilteringTokenFilter;
import java.io.IOException;
import java.util.Set;
/**
* Removes tokens whose types appear in a set of blocked types from a token stream.
*/
public final class TypeTokenFilter extends FilteringTokenFilter {
private final Set<String> stopTypes;
private final TypeAttribute typeAttribute = addAttribute(TypeAttribute.class);
public TypeTokenFilter(boolean enablePositionIncrements, TokenStream input, Set<String> stopTypes) {
super(enablePositionIncrements, input);
this.stopTypes = stopTypes;
}
/**
* Returns true for tokens whose typeAttribute.type() is not a stop type; all other tokens are dropped.
*/
@Override
protected boolean accept() throws IOException {
return !stopTypes.contains(typeAttribute.type());
}
}
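A minimal usage sketch (illustrative, not part of this file): drop numeric tokens by blocking the type that the standard tokenizer assigns to them. The "<NUM>" type string, the Version constant, and the sample text are assumptions; check the actual constants in the tree.
import java.io.StringReader;
import java.util.Arrays;
import java.util.HashSet;
import java.util.Set;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.core.TypeTokenFilter;
import org.apache.lucene.analysis.standard.StandardTokenizer;
import org.apache.lucene.util.Version;
// ...
Set<String> stopTypes = new HashSet<String>(Arrays.asList("<NUM>"));
Tokenizer tokenizer = new StandardTokenizer(Version.LUCENE_40, new StringReader("released in 2012"));
TokenStream stream = new TypeTokenFilter(true, tokenizer, stopTypes);
// tokens typed "<NUM>" (e.g. "2012") are dropped; with enablePositionIncrements=true
// the position gap where a number was removed is preserved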

View File

@ -60,6 +60,7 @@ public final class HyphenatedWordsFilter extends TokenFilter {
private final StringBuilder hyphenated = new StringBuilder();
private State savedState;
private boolean exhausted = false;
private int lastEndOffset = 0;
/**
* Creates a new HyphenatedWordsFilter
@ -78,6 +79,7 @@ public final class HyphenatedWordsFilter extends TokenFilter {
while (!exhausted && input.incrementToken()) {
char[] term = termAttribute.buffer();
int termLength = termAttribute.length();
lastEndOffset = offsetAttribute.endOffset();
if (termLength > 0 && term[termLength - 1] == '-') {
// a hyphenated word
@ -119,6 +121,7 @@ public final class HyphenatedWordsFilter extends TokenFilter {
hyphenated.setLength(0);
savedState = null;
exhausted = false;
lastEndOffset = 0;
}
// ================================================= Helper Methods ================================================
@ -127,8 +130,6 @@ public final class HyphenatedWordsFilter extends TokenFilter {
* Writes the joined unhyphenated term
*/
private void unhyphenate() {
int endOffset = offsetAttribute.endOffset();
restoreState(savedState);
savedState = null;
@ -140,7 +141,7 @@ public final class HyphenatedWordsFilter extends TokenFilter {
hyphenated.getChars(0, length, term, 0);
termAttribute.setLength(length);
offsetAttribute.setOffset(offsetAttribute.startOffset(), endOffset);
offsetAttribute.setOffset(offsetAttribute.startOffset(), lastEndOffset);
hyphenated.setLength(0);
}
}

View File

@ -183,31 +183,33 @@ public final class PatternAnalyzer extends Analyzer {
*
* @param fieldName
* the name of the field to tokenize (currently ignored).
* @param reader
* reader (e.g. charfilter) of the original text. can be null.
* @param text
* the string to tokenize
* @return a new token stream
*/
public TokenStreamComponents createComponents(String fieldName, String text) {
public TokenStreamComponents createComponents(String fieldName, Reader reader, String text) {
// Ideally the Analyzer superclass should have a method with the same signature,
// with a default impl that simply delegates to the StringReader flavour.
if (text == null)
throw new IllegalArgumentException("text must not be null");
if (pattern == NON_WORD_PATTERN) { // fast path
return new TokenStreamComponents(new FastStringTokenizer(text, true, toLowerCase, stopWords));
return new TokenStreamComponents(new FastStringTokenizer(reader, text, true, toLowerCase, stopWords));
} else if (pattern == WHITESPACE_PATTERN) { // fast path
return new TokenStreamComponents(new FastStringTokenizer(text, false, toLowerCase, stopWords));
return new TokenStreamComponents(new FastStringTokenizer(reader, text, false, toLowerCase, stopWords));
}
Tokenizer tokenizer = new PatternTokenizer(text, pattern, toLowerCase);
Tokenizer tokenizer = new PatternTokenizer(reader, text, pattern, toLowerCase);
TokenStream result = (stopWords != null) ? new StopFilter(matchVersion, tokenizer, stopWords) : tokenizer;
return new TokenStreamComponents(tokenizer, result);
}
/**
* Creates a token stream that tokenizes all the text in the given Reader;
* This implementation forwards to <code>tokenStream(String, String)</code> and is
* less efficient than <code>tokenStream(String, String)</code>.
* This implementation forwards to <code>tokenStream(String, Reader, String)</code> and is
* less efficient than <code>tokenStream(String, Reader, String)</code>.
*
* @param fieldName
* the name of the field to tokenize (currently ignored).
@ -219,7 +221,7 @@ public final class PatternAnalyzer extends Analyzer {
public TokenStreamComponents createComponents(String fieldName, Reader reader) {
try {
String text = toString(reader);
return createComponents(fieldName, text);
return createComponents(fieldName, reader, text);
} catch (IOException e) {
throw new RuntimeException(e);
}
@ -332,7 +334,8 @@ public final class PatternAnalyzer extends Analyzer {
private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);
public PatternTokenizer(String str, Pattern pattern, boolean toLowerCase) {
public PatternTokenizer(Reader input, String str, Pattern pattern, boolean toLowerCase) {
super(input);
this.pattern = pattern;
this.str = str;
this.matcher = pattern.matcher(str);
@ -359,7 +362,7 @@ public final class PatternAnalyzer extends Analyzer {
String text = str.substring(start, end);
if (toLowerCase) text = text.toLowerCase(locale);
termAtt.setEmpty().append(text);
offsetAtt.setOffset(start, end);
offsetAtt.setOffset(correctOffset(start), correctOffset(end));
return true;
}
if (!isMatch) return false;
@ -369,7 +372,7 @@ public final class PatternAnalyzer extends Analyzer {
@Override
public final void end() {
// set final offset
final int finalOffset = str.length();
final int finalOffset = correctOffset(str.length());
this.offsetAtt.setOffset(finalOffset, finalOffset);
}
@ -406,7 +409,8 @@ public final class PatternAnalyzer extends Analyzer {
private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);
public FastStringTokenizer(String str, boolean isLetter, boolean toLowerCase, Set<?> stopWords) {
public FastStringTokenizer(Reader input, String str, boolean isLetter, boolean toLowerCase, Set<?> stopWords) {
super(input);
this.str = str;
this.isLetter = isLetter;
this.toLowerCase = toLowerCase;
@ -458,7 +462,7 @@ public final class PatternAnalyzer extends Analyzer {
return false;
}
termAtt.setEmpty().append(text);
offsetAtt.setOffset(start, i);
offsetAtt.setOffset(correctOffset(start), correctOffset(i));
return true;
}
@ -466,7 +470,7 @@ public final class PatternAnalyzer extends Analyzer {
public final void end() {
// set final offset
final int finalOffset = str.length();
this.offsetAtt.setOffset(finalOffset, finalOffset);
this.offsetAtt.setOffset(correctOffset(finalOffset), correctOffset(finalOffset));
}
private boolean isTokenChar(char c, boolean isLetter) {
@ -479,6 +483,7 @@ public final class PatternAnalyzer extends Analyzer {
@Override
public void reset(Reader input) throws IOException {
super.reset(input);
this.str = PatternAnalyzer.toString(input);
}

View File

@ -68,7 +68,7 @@ public final class TrimFilter extends TokenFilter {
} else {
termAtt.setEmpty();
}
if (updateOffsets) {
if (updateOffsets && len == offsetAtt.endOffset() - offsetAtt.startOffset()) {
int newStart = offsetAtt.startOffset()+start;
int newEnd = offsetAtt.endOffset() - (start<end ? endOff:0);
offsetAtt.setOffset(newStart, newEnd);

View File

@ -405,10 +405,20 @@ public final class WordDelimiterFilter extends TokenFilter {
clearAttributes();
termAttribute.copyBuffer(savedBuffer, iterator.current, iterator.end - iterator.current);
int startOffSet = (isSingleWord || !hasIllegalOffsets) ? savedStartOffset + iterator.current : savedStartOffset;
int endOffSet = (hasIllegalOffsets) ? savedEndOffset : savedStartOffset + iterator.end;
offsetAttribute.setOffset(startOffSet, endOffSet);
int startOffset = savedStartOffset + iterator.current;
int endOffset = savedStartOffset + iterator.end;
if (hasIllegalOffsets) {
// historically this filter did this regardless of 'isSingleWord',
// but we must do a sanity check:
if (isSingleWord && startOffset <= savedEndOffset) {
offsetAttribute.setOffset(startOffset, savedEndOffset);
} else {
offsetAttribute.setOffset(savedStartOffset, savedEndOffset);
}
} else {
offsetAttribute.setOffset(startOffset, endOffset);
}
posIncAttribute.setPositionIncrement(position(false));
typeAttribute.setType(savedType);
}

View File

@ -74,7 +74,8 @@ public final class EdgeNGramTokenizer extends Tokenizer {
private int gramSize;
private Side side;
private boolean started = false;
private int inLen;
private int inLen; // length of the input AFTER trim()
private int charsRead; // length of the input
private String inStr;
@ -183,7 +184,11 @@ public final class EdgeNGramTokenizer extends Tokenizer {
if (!started) {
started = true;
char[] chars = new char[1024];
int charsRead = input.read(chars);
charsRead = input.read(chars);
if (charsRead < 0) {
charsRead = inLen = 0;
return false;
}
inStr = new String(chars, 0, charsRead).trim(); // remove any leading or trailing spaces
inLen = inStr.length();
gramSize = minGram;
@ -211,7 +216,7 @@ public final class EdgeNGramTokenizer extends Tokenizer {
@Override
public final void end() {
// set final offset
final int finalOffset = inLen;
final int finalOffset = correctOffset(charsRead);
this.offsetAtt.setOffset(finalOffset, finalOffset);
}
@ -225,5 +230,6 @@ public final class EdgeNGramTokenizer extends Tokenizer {
public void reset() throws IOException {
super.reset();
started = false;
charsRead = 0;
}
}
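To see why charsRead is now tracked separately from inLen (hypothetical input, not from this patch): for the input "  ab  " delivered by a StringReader, input.read(chars) returns charsRead = 6, while the trimmed inStr is "ab" and inLen = 2. Grams are still generated against the trimmed string, but end() now reports correctOffset(charsRead), so the final offset accounts for the whole consumed input instead of stopping at the trimmed length.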

View File

@ -35,7 +35,8 @@ public final class NGramTokenizer extends Tokenizer {
private int minGram, maxGram;
private int gramSize;
private int pos = 0;
private int inLen;
private int inLen; // length of the input AFTER trim()
private int charsRead; // length of the input
private String inStr;
private boolean started = false;
@ -104,7 +105,11 @@ public final class NGramTokenizer extends Tokenizer {
started = true;
gramSize = minGram;
char[] chars = new char[1024];
input.read(chars);
charsRead = input.read(chars);
if (charsRead < 0) {
charsRead = inLen = 0;
return false;
}
inStr = new String(chars).trim(); // remove any leading or trailing whitespace
inLen = inStr.length();
}
@ -128,7 +133,7 @@ public final class NGramTokenizer extends Tokenizer {
@Override
public final void end() {
// set final offset
final int finalOffset = inLen;
final int finalOffset = correctOffset(charsRead);
this.offsetAtt.setOffset(finalOffset, finalOffset);
}
@ -143,5 +148,6 @@ public final class NGramTokenizer extends Tokenizer {
super.reset();
started = false;
pos = 0;
charsRead = 0;
}
}

View File

@ -30,7 +30,7 @@ import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
public final class PositionFilter extends TokenFilter {
/** Position increment to assign to all but the first token - default = 0 */
private int positionIncrement = 0;
private final int positionIncrement;
/** The first token must have non-zero positionIncrement **/
private boolean firstTokenPositioned = false;
@ -44,7 +44,7 @@ public final class PositionFilter extends TokenFilter {
* @param input the input stream
*/
public PositionFilter(final TokenStream input) {
super(input);
this(input, 0);
}
/**
@ -56,7 +56,7 @@ public final class PositionFilter extends TokenFilter {
* token from the input stream
*/
public PositionFilter(final TokenStream input, final int positionIncrement) {
this(input);
super(input);
this.positionIncrement = positionIncrement;
}

View File

@ -68,6 +68,7 @@ public final class ThaiWordFilter extends TokenFilter {
private CharTermAttribute clonedTermAtt = null;
private OffsetAttribute clonedOffsetAtt = null;
private boolean hasMoreTokensInClone = false;
private boolean hasIllegalOffsets = false; // only if the length changed before this filter
/** Creates a new ThaiWordFilter with the specified match version. */
public ThaiWordFilter(Version matchVersion, TokenStream input) {
@ -86,7 +87,11 @@ public final class ThaiWordFilter extends TokenFilter {
if (end != BreakIterator.DONE) {
clonedToken.copyTo(this);
termAtt.copyBuffer(clonedTermAtt.buffer(), start, end - start);
offsetAtt.setOffset(clonedOffsetAtt.startOffset() + start, clonedOffsetAtt.startOffset() + end);
if (hasIllegalOffsets) {
offsetAtt.setOffset(clonedOffsetAtt.startOffset(), clonedOffsetAtt.endOffset());
} else {
offsetAtt.setOffset(clonedOffsetAtt.startOffset() + start, clonedOffsetAtt.startOffset() + end);
}
if (handlePosIncr) posAtt.setPositionIncrement(1);
return true;
}
@ -102,6 +107,10 @@ public final class ThaiWordFilter extends TokenFilter {
}
hasMoreTokensInClone = true;
// if the length implied by the start and end offsets doesn't match the term text
// length, assume this token is a synonym and don't adjust the offsets.
hasIllegalOffsets = offsetAtt.endOffset() - offsetAtt.startOffset() != termAtt.length();
// we lazy init the cloned token, as in ctor not all attributes may be added
if (clonedToken == null) {
@ -118,7 +127,11 @@ public final class ThaiWordFilter extends TokenFilter {
int end = breaker.next();
if (end != BreakIterator.DONE) {
termAtt.setLength(end);
offsetAtt.setOffset(clonedOffsetAtt.startOffset(), clonedOffsetAtt.startOffset() + end);
if (hasIllegalOffsets) {
offsetAtt.setOffset(clonedOffsetAtt.startOffset(), clonedOffsetAtt.endOffset());
} else {
offsetAtt.setOffset(clonedOffsetAtt.startOffset(), clonedOffsetAtt.startOffset() + end);
}
// position increment keeps as it is for first token
return true;
}

View File

@ -306,13 +306,14 @@ public final class WikipediaTokenizer extends Tokenizer {
@Override
public void reset() throws IOException {
super.reset();
scanner.yyreset(input);
tokens = null;
scanner.reset();
}
@Override
public void reset(Reader reader) throws IOException {
super.reset(reader);
reset();
scanner.yyreset(input);
}
@Override

View File

@ -1,4 +1,4 @@
/* The following code was generated by JFlex 1.5.0-SNAPSHOT on 9/30/11 12:11 PM */
/* The following code was generated by JFlex 1.5.0-SNAPSHOT on 1/22/12 10:26 PM */
package org.apache.lucene.analysis.wikipedia;
@ -25,8 +25,8 @@ import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
/**
* This class is a scanner generated by
* <a href="http://www.jflex.de/">JFlex</a> 1.5.0-SNAPSHOT
* on 9/30/11 12:11 PM from the specification file
* <tt>/lucene/jflex/modules/analysis/common/src/java/org/apache/lucene/analysis/wikipedia/WikipediaTokenizerImpl.jflex</tt>
* on 1/22/12 10:26 PM from the specification file
* <tt>/home/rmuir/workspace/lucene-clean-trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/wikipedia/WikipediaTokenizerImpl.jflex</tt>
*/
class WikipediaTokenizerImpl {
@ -498,6 +498,14 @@ final int setText(StringBuilder buffer){
return length;
}
final void reset() {
currentTokType = 0;
numBalanced = 0;
positionInc = 1;
numLinkToks = 0;
numWikiTokensSeen = 0;
}

View File

@ -91,6 +91,14 @@ final int setText(StringBuilder buffer){
return length;
}
final void reset() {
currentTokType = 0;
numBalanced = 0;
positionInc = 1;
numLinkToks = 0;
numWikiTokensSeen = 0;
}
%}

View File

@ -23,6 +23,7 @@ import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.Reader;
import java.io.StringReader;
import java.util.Arrays;
import java.util.HashSet;
import java.util.Set;
@ -31,7 +32,7 @@ import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.analysis.CharReader;
import org.apache.lucene.analysis.MockTokenizer;
import org.apache.lucene.analysis.Tokenizer;
import org.junit.Ignore;
import org.apache.lucene.util._TestUtil;
public class HTMLStripCharFilterTest extends BaseTokenStreamTestCase {
@ -41,9 +42,9 @@ public class HTMLStripCharFilterTest extends BaseTokenStreamTestCase {
String html = "<div class=\"foo\">this is some text</div> here is a <a href=\"#bar\">link</a> and " +
"another <a href=\"http://lucene.apache.org/\">link</a>. " +
"This is an entity: &amp; plus a &lt;. Here is an &. <!-- is a comment -->";
String gold = " this is some text here is a link and " +
"another link . " +
"This is an entity: & plus a <. Here is an &. ";
String gold = "\nthis is some text\n here is a link and " +
"another link. " +
"This is an entity: & plus a <. Here is an &. ";
HTMLStripCharFilter reader = new HTMLStripCharFilter(CharReader.get(new StringReader(html)));
StringBuilder builder = new StringBuilder();
int ch = -1;
@ -56,7 +57,8 @@ public class HTMLStripCharFilterTest extends BaseTokenStreamTestCase {
+ " Buffer so far: " + builder + "<EOB>", theChar == goldArray[position]);
position++;
}
assertEquals(gold, builder.toString());
assertEquals("'" + builder.toString() + "' is not equal to '" + gold + "'",
gold, builder.toString());
}
//Some sanity checks, but not a full-fledged check
@ -77,6 +79,24 @@ public class HTMLStripCharFilterTest extends BaseTokenStreamTestCase {
}
public void testMSWord14GeneratedHTML() throws Exception {
InputStream stream = getClass().getResourceAsStream("MS-Word 14 generated.htm");
HTMLStripCharFilter reader = new HTMLStripCharFilter(CharReader.get(new InputStreamReader(stream, "UTF-8")));
String gold = "This is a test";
StringBuilder builder = new StringBuilder();
int ch = 0;
try {
while ((ch = reader.read()) != -1){
builder.append((char)ch);
}
} finally {
// System.out.println("String: " + builder.toString());
}
assertEquals("'" + builder.toString().trim() + "' is not equal to '" + gold + "'",
gold, builder.toString().trim());
}
public void testGamma() throws Exception {
String test = "&Gamma;";
String gold = "\u0393";
@ -89,9 +109,7 @@ public class HTMLStripCharFilterTest extends BaseTokenStreamTestCase {
builder.append((char)ch);
}
String result = builder.toString();
// System.out.println("Resu: " + result + "<EOL>");
// System.out.println("Gold: " + gold + "<EOL>");
assertTrue(result + " is not equal to " + gold + "<EOS>", result.equals(gold) == true);
assertEquals("'" + result + "' is not equal to '" + gold + "<EOS>'", gold, result);
}
public void testEntities() throws Exception {
@ -106,9 +124,7 @@ public class HTMLStripCharFilterTest extends BaseTokenStreamTestCase {
builder.append((char)ch);
}
String result = builder.toString();
// System.out.println("Resu: " + result + "<EOL>");
// System.out.println("Gold: " + gold + "<EOL>");
assertTrue(result + " is not equal to " + gold + "<EOS>", result.equals(gold) == true);
assertEquals("'" + result + "' is not equal to '" + gold + "<EOS>'", gold, result);
}
public void testMoreEntities() throws Exception {
@ -123,9 +139,7 @@ public class HTMLStripCharFilterTest extends BaseTokenStreamTestCase {
builder.append((char)ch);
}
String result = builder.toString();
// System.out.println("Resu: " + result + "<EOL>");
// System.out.println("Gold: " + gold + "<EOL>");
assertTrue(result + " is not equal to " + gold, result.equals(gold) == true);
assertEquals("'" + result + "' is not equal to '" + gold + "<EOS>'", gold, result);
}
public void testReserved() throws Exception {
@ -147,45 +161,248 @@ public class HTMLStripCharFilterTest extends BaseTokenStreamTestCase {
}
public void testMalformedHTML() throws Exception {
String test = "a <a hr<ef=aa<a>> </close</a>";
String gold = "a <a hr<ef=aa > </close ";
Reader reader = new HTMLStripCharFilter(CharReader.get(new StringReader(test)));
StringBuilder builder = new StringBuilder();
int ch = 0;
while ((ch = reader.read()) != -1){
builder.append((char)ch);
String[] testGold = {
"a <a hr<ef=aa<a>> </close</a>",
"a <a hr<ef=aa> </close",
"<a href=http://dmoz.org/cgi-bin/add.cgi?where=/arts/\" class=lu style=\"font-size: 9px\" target=dmoz>Submit a Site</a>",
"Submit a Site",
"<a href=javascript:ioSwitch('p8','http://www.csmonitor.com/') title=expand id=e8 class=expanded rel=http://www.csmonitor.com/>Christian Science",
"Christian Science",
"<link rel=\"alternate\" type=\"application/rss+xml\" title=\"San Francisco \" 2008 RSS Feed\" href=\"http://2008.sf.wordcamp.org/feed/\" />",
"\n",
// "<" before ">" inhibits tag recognition
"<a href=\" http://www.surgery4was.happyhost.org/video-of-arthroscopic-knee-surgery symptoms.html, heat congestive heart failure <a href=\" http://www.symptoms1bad.happyhost.org/canine",
"<a href=\" http://www.surgery4was.happyhost.org/video-of-arthroscopic-knee-surgery symptoms.html, heat congestive heart failure <a href=\" http://www.symptoms1bad.happyhost.org/canine",
"<a href=\"http://ucblibraries.colorado.edu/how/index.htm\"class=\"pageNavAreaText\">",
"",
"<link title=\"^\\\" 21Sta's Blog\" rel=\"search\" type=\"application/opensearchdescription+xml\" href=\"http://21sta.com/blog/inc/opensearch.php\" />",
"\n",
"<a href=\"#postcomment\" title=\"\"Leave a comment\";\">?",
"?",
"<a href='/modern-furniture' ' id='21txt' class='offtab' onMouseout=\"this.className='offtab'; return true;\" onMouseover=\"this.className='ontab'; return true;\">",
"",
"<a href='http://alievi.wordpress.com/category/01-todos-posts/' style='font-size: 275%; padding: 1px; margin: 1px;' title='01 - Todos Post's (83)'>",
"",
"The <a href=<a href=\"http://www.advancedmd.com>medical\">http://www.advancedmd.com>medical</a> practice software</a>",
"The <a href=medical\">http://www.advancedmd.com>medical practice software",
"<a href=\"node/21426\" class=\"clipTitle2\" title=\"Levi.com/BMX 2008 Clip of the Week 29 \"Morgan Wade Leftover Clips\"\">Levi.com/BMX 2008 Clip of the Week 29...",
"Levi.com/BMX 2008 Clip of the Week 29...",
"<a href=\"printer_friendly.php?branch=&year=&submit=go&screen=\";\">Printer Friendly",
"Printer Friendly",
"<a href=#\" ondragstart=\"return false\" onclick=\"window.external.AddFavorite('http://www.amazingtextures.com', 'Amazing Textures');return false\" onmouseover=\"window.status='Add to Favorites';return true\">Add to Favorites",
"Add to Favorites",
"<a href=\"../at_home/at_home_search.html\"../_home/at_home_search.html\">At",
"At",
"E-mail: <a href=\"\"mailto:XXXXXX@example.com\" \">XXXXXX@example.com </a>",
"E-mail: XXXXXX@example.com ",
"<li class=\"farsi\"><a title=\"A'13?\" alt=\"A'13?\" href=\"http://www.america.gov/persian\" alt=\"\" name=\"A'13?\"A'13? title=\"A'13?\">A'13?</a></li>",
"\nA'13?\n",
"<li><a href=\"#28\" title=\"Hubert \"Geese\" Ausby\">Hubert \"Geese\" Ausby</a></li>",
"\nHubert \"Geese\" Ausby\n",
"<href=\"http://anbportal.com/mms/login.asp\">",
"\n",
"<a href=\"",
"<a href=\"",
"<a href=\">",
"",
"<a rel=\"nofollow\" href=\"http://anissanina31.skyrock.com/1895039493-Hi-tout-le-monde.html\" title=\" Hi, tout le monde !>#</a>",
"#",
"<a href=\"http://annunciharleydavidsonusate.myblog.it/\" title=\"Annunci Moto e Accessori Harley Davidson\" target=\"_blank\"><img src=\"http://annunciharleydavidsonusate.myblog.it/images/Antipixel.gif\" /></a>",
"",
"<a href=\"video/addvideo&v=120838887181\" onClick=\"return confirm('Are you sure you want add this video to your profile? If it exists some video in your profile will be overlapped by this video!!')\" \" onmouseover=\"this.className='border2'\" onmouseout=\"this.className=''\">",
"",
"<a href=#Services & Support>",
"",
// "<" and ">" chars are accepted in on[Event] attribute values
"<input type=\"image\" src=\"http://apologyindex.com/ThemeFiles/83401-72905/images/btn_search.gif\"value=\"Search\" name=\"Search\" alt=\"Search\" class=\"searchimage\" onclick=\"incom ='&sc=' + document.getElementById('sel').value ; var dt ='&dt=' + document.getElementById('dt').value; var searchKeyword = document.getElementById('q').value ; searchKeyword = searchKeyword.replace(/\\s/g,''); if (searchKeyword.length < 3){alert('Nothing to search. Search keyword should contain atleast 3 chars.'); return false; } var al='&al=' + document.getElementById('advancedlink').style.display ; document.location.href='http://apologyindex.com/search.aspx?q=' + document.getElementById('q').value + incom + dt + al;\" />",
"",
"<input type=\"image\" src=\"images/afbe.gif\" width=\"22\" height=\"22\" hspace=\"4\" title=\"Add to Favorite\" alt=\"Add to Favorite\"onClick=\" if(window.sidebar){ window.sidebar.addPanel(document.title,location.href,''); }else if(window.external){ window.external.AddFavorite(location.href,document.title); }else if(window.opera&&window.print) { return true; }\">",
"",
"<area shape=\"rect\" coords=\"12,153,115,305\" href=\"http://statenislandtalk.com/v-web/gallery/Osmundsen-family\"Art's Norwegian Roots in Rogaland\">",
"\n",
"<a rel=\"nofollow\" href=\"http://arth26.skyrock.com/660188240-bonzai.html\" title=\"bonza>#",
"#",
"<a href= >",
"",
"<ahref=http:..",
"<ahref=http:..",
"<ahref=http:..>",
"\n",
"<ahref=\"http://aseigo.bddf.ca/cms/1025\">A",
"\nA",
"<a href=\"javascript:calendar_window=window.open('/calendar.aspx?formname=frmCalendar.txtDate','calendar_window','width=154,height=188');calendar_window.focus()\">",
"",
"<a href=\"/applications/defenseaerospace/19+rackmounts\" title=\"19\" Rackmounts\">",
"",
"<a href=http://www.azimprimerie.fr/flash/backup/lewes-zip-code/savage-model-110-manual.html title=savage model 110 manual rel=dofollow>",
"",
"<a class=\"at\" name=\"Lamborghini href=\"http://lamborghini.coolbegin.com\">Lamborghini /a>",
"Lamborghini /a>",
"<A href='newslink.php?news_link=http%3A%2F%2Fwww.worldnetdaily.com%2Findex.php%3Ffa%3DPAGE.view%26pageId%3D85729&news_title=Florida QB makes 'John 3:16' hottest Google search Tebow inscribed Bible reference on eye black for championship game' TARGET=_blank>",
"",
"<a href=/myspace !style='color:#993333'>",
"",
"<meta name=3DProgId content=3DExcel.Sheet>",
"\n",
"<link id=3D\"shLink\" href=3D\"PSABrKelly-BADMINTONCupResults08FINAL2008_09_19=_files/sheet004.htm\">",
"\n",
"<td bgcolor=3D\"#FFFFFF\" nowrap>",
"\n",
"<a href=\"http://basnect.info/usersearch/\"predicciones-mundiales-2009\".html\">\"predicciones mundiales 2009\"</a>",
"\"predicciones mundiales 2009\"",
"<a class=\"comment-link\" href=\"https://www.blogger.com/comment.g?blogID=19402125&postID=114070605958684588\"location.href=https://www.blogger.com/comment.g?blogID=19402125&postID=114070605958684588;>",
"",
"<a href = \"/videos/Bishop\"/\" title = \"click to see more Bishop\" videos\">Bishop\"</a>",
"Bishop\"",
"<a href=\"http://bhaa.ie/calendar/event.php?eid=20081203150127531\"\">BHAA Eircom 2 &amp; 5 miles CC combined start</a>",
"BHAA Eircom 2 & 5 miles CC combined start",
"<a href=\"http://people.tribe.net/wolfmana\" onClick='setClick(\"Application[tribe].Person[bb7df210-9dc0-478c-917f-436b896bcb79]\")'\" title=\"Mana\">",
"",
"<a href=\"http://blog.edu-cyberpg.com/ct.ashx?id=6143c528-080c-4bb2-b765-5ec56c8256d3&url=http%3a%2f%2fwww.gsa.ac.uk%2fmackintoshsketchbook%2f\"\" eudora=\"autourl\">",
"",
// "<" before ">" inhibits tag recognition
"<input type=\"text\" value=\"<search here>\">",
"<input type=\"text\" value=\"\n\">",
"<input type=\"text\" value=\"<search here\">",
"<input type=\"text\" value=\"\n",
"<input type=\"text\" value=\"search here>\">",
"\">",
// "<" and ">" chars are accepted in on[Event] attribute values
"<input type=\"text\" value=\"&lt;search here&gt;\" onFocus=\"this.value='<search here>'\">",
"",
"<![if ! IE]>\n<link href=\"http://i.deviantart.com/icons/favicon.png\" rel=\"shortcut icon\"/>\n<![endif]>",
"\n\n\n",
"<![if supportMisalignedColumns]>\n<tr height=0 style='display:none'>\n<td width=64 style='width:48pt'></td>\n</tr>\n<![endif]>",
"\n\n\n\n\n\n\n\n",
};
for (int i = 0 ; i < testGold.length ; i += 2) {
String test = testGold[i];
String gold = testGold[i + 1];
Reader reader = new HTMLStripCharFilter(CharReader.get(new StringReader(test)));
StringBuilder builder = new StringBuilder();
int ch = 0;
while ((ch = reader.read()) != -1){
builder.append((char)ch);
}
String result = builder.toString();
assertEquals("Test: '" + test + "'", gold, result);
}
String result = builder.toString();
// System.out.println("Resu: " + result + "<EOL>");
// System.out.println("Gold: " + gold + "<EOL>");
assertTrue(result + " is not equal to " + gold + "<EOS>", result.equals(gold) == true);
}
public void testBufferOverflow() throws Exception {
StringBuilder testBuilder = new StringBuilder(HTMLStripCharFilter.DEFAULT_READ_AHEAD + 50);
StringBuilder testBuilder = new StringBuilder(HTMLStripCharFilter.getInitialBufferSize() + 50);
testBuilder.append("ah<?> ??????");
appendChars(testBuilder, HTMLStripCharFilter.DEFAULT_READ_AHEAD + 500);
appendChars(testBuilder, HTMLStripCharFilter.getInitialBufferSize() + 500);
processBuffer(testBuilder.toString(), "Failed on pseudo proc. instr.");//processing instructions
testBuilder.setLength(0);
testBuilder.append("<!--");//comments
appendChars(testBuilder, 3*HTMLStripCharFilter.DEFAULT_READ_AHEAD + 500);//comments have two lookaheads
appendChars(testBuilder, 3 * HTMLStripCharFilter.getInitialBufferSize() + 500);//comments have two lookaheads
testBuilder.append("-->foo");
processBuffer(testBuilder.toString(), "Failed w/ comment");
String gold = "foo";
Reader reader = new HTMLStripCharFilter(CharReader.get(new StringReader(testBuilder.toString())));
int ch = 0;
StringBuilder builder = new StringBuilder();
try {
while ((ch = reader.read()) != -1){
builder.append((char)ch);
}
} finally {
// System.out.println("String: " + builder.toString());
}
assertEquals("'" + builder.toString() + "' is not equal to '" + gold + "'",
gold, builder.toString());
testBuilder.setLength(0);
testBuilder.append("<?");
appendChars(testBuilder, HTMLStripCharFilter.DEFAULT_READ_AHEAD + 500);
appendChars(testBuilder, HTMLStripCharFilter.getInitialBufferSize() + 500);
testBuilder.append("?>");
processBuffer(testBuilder.toString(), "Failed with proc. instr.");
gold = "";
reader = new HTMLStripCharFilter(CharReader.get(new StringReader(testBuilder.toString())));
ch = 0;
builder = new StringBuilder();
try {
while ((ch = reader.read()) != -1){
builder.append((char)ch);
}
} finally {
// System.out.println("String: " + builder.toString());
}
assertEquals("'" + builder.toString() + "' is not equal to '" + gold + "'",
gold, builder.toString());
testBuilder.setLength(0);
testBuilder.append("<b ");
appendChars(testBuilder, HTMLStripCharFilter.DEFAULT_READ_AHEAD + 500);
appendChars(testBuilder, HTMLStripCharFilter.getInitialBufferSize() + 500);
testBuilder.append("/>");
processBuffer(testBuilder.toString(), "Failed on tag");
gold = "";
reader = new HTMLStripCharFilter(CharReader.get(new StringReader(testBuilder.toString())));
ch = 0;
builder = new StringBuilder();
try {
while ((ch = reader.read()) != -1){
builder.append((char)ch);
}
} finally {
// System.out.println("String: " + builder.toString());
}
assertEquals("'" + builder.toString() + "' is not equal to '" + gold + "'",
gold, builder.toString());
}
private void appendChars(StringBuilder testBuilder, int numChars) {
@ -208,13 +425,14 @@ public class HTMLStripCharFilterTest extends BaseTokenStreamTestCase {
} finally {
// System.out.println("String (trimmed): " + builder.toString().trim() + "<EOS>");
}
assertTrue(assertMsg + "::: " + builder.toString() + " is not equal to " + test, builder.toString().equals(test) == true);
assertEquals(assertMsg + "::: " + builder.toString() + " is not equal to " + test,
test, builder.toString());
}
public void testComment() throws Exception {
String test = "<!--- three dashes, still a valid comment ---> ";
String gold = " ";
String gold = " ";
Reader reader = new HTMLStripCharFilter(CharReader.get(new BufferedReader(new StringReader(test))));//force the use of BufferedReader
int ch = 0;
StringBuilder builder = new StringBuilder();
@ -225,7 +443,8 @@ public class HTMLStripCharFilterTest extends BaseTokenStreamTestCase {
} finally {
// System.out.println("String: " + builder.toString());
}
assertTrue(builder.toString() + " is not equal to " + gold + "<EOS>", builder.toString().equals(gold) == true);
assertEquals("'" + builder.toString() + "' is not equal to '" + gold + "'",
gold, builder.toString());
}
@ -247,15 +466,32 @@ public class HTMLStripCharFilterTest extends BaseTokenStreamTestCase {
}
public void testOffsets() throws Exception {
doTestOffsets("hello X how X are you");
// doTestOffsets("hello X how X are you");
doTestOffsets("hello <p> X<p> how <p>X are you");
doTestOffsets("X &amp; X &#40; X &lt; &gt; X");
// test backtracking
doTestOffsets("X < &zz >X &# < X > < &l > &g < X");
}
@Ignore("broken offsets: see LUCENE-2208")
static void assertLegalOffsets(String in) throws Exception {
int length = in.length();
HTMLStripCharFilter reader = new HTMLStripCharFilter(CharReader.get(new BufferedReader(new StringReader(in))));
int ch = 0;
int off = 0;
while ((ch = reader.read()) != -1) {
int correction = reader.correctOffset(off);
assertTrue("invalid offset correction: " + off + "->" + correction + " for doc of length: " + length,
correction <= length);
off++;
}
}
public void testLegalOffsets() throws Exception {
assertLegalOffsets("hello world");
assertLegalOffsets("hello &#x world");
}
public void testRandom() throws Exception {
Analyzer analyzer = new Analyzer() {
@ -267,11 +503,361 @@ public class HTMLStripCharFilterTest extends BaseTokenStreamTestCase {
@Override
protected Reader initReader(Reader reader) {
return new HTMLStripCharFilter(CharReader.get(new BufferedReader(reader)));
return new HTMLStripCharFilter(CharReader.get(reader));
}
};
int numRounds = RANDOM_MULTIPLIER * 10000;
checkRandomData(random, analyzer, numRounds);
}
public void testServerSideIncludes() throws Exception {
String test = "one<img src=\"image.png\"\n"
+ " alt = \"Alt: <!--#echo var='${IMAGE_CAPTION:<!--comment-->\\'Comment\\'}' -->\"\n\n"
+ " title=\"Title: <!--#echo var=\"IMAGE_CAPTION\"-->\">two";
String gold = "onetwo";
Reader reader = new HTMLStripCharFilter(CharReader.get(new StringReader(test)));
int ch = 0;
StringBuilder builder = new StringBuilder();
try {
while ((ch = reader.read()) != -1){
builder.append((char)ch);
}
} finally {
// System.out.println("String: " + builder.toString());
}
assertTrue(builder.toString() + " is not equal to " + gold, builder.toString().equals(gold));
test = "one<script><!-- <!--#config comment=\"<!-- \\\"comment\\\"-->\"--> --></script>two";
gold = "one\ntwo";
reader = new HTMLStripCharFilter(CharReader.get(new StringReader(test)));
ch = 0;
builder = new StringBuilder();
try {
while ((ch = reader.read()) != -1){
builder.append((char)ch);
}
} finally {
// System.out.println("String: " + builder.toString());
}
assertEquals("'" + builder.toString() + "' is not equal to '" + gold + "'",
gold, builder.toString());
}
public void testScriptQuotes() throws Exception {
String test = "one<script attr= bare><!-- action('<!-- comment -->', \"\\\"-->\\\"\"); --></script>two";
String gold = "one\ntwo";
Reader reader = new HTMLStripCharFilter(CharReader.get(new StringReader(test)));
int ch = 0;
StringBuilder builder = new StringBuilder();
try {
while ((ch = reader.read()) != -1){
builder.append((char)ch);
}
} finally {
// System.out.println("String: " + builder.toString());
}
assertEquals("'" + builder.toString() + "' is not equal to '" + gold + "'",
gold, builder.toString());
test = "hello<script><!-- f('<!--internal--></script>'); --></script>";
gold = "hello\n";
reader = new HTMLStripCharFilter(CharReader.get(new StringReader(test)));
ch = 0;
builder = new StringBuilder();
try {
while ((ch = reader.read()) != -1){
builder.append((char)ch);
}
} finally {
// System.out.println("String: " + builder.toString());
}
assertEquals("'" + builder.toString() + "' is not equal to '" + gold + "'",
gold, builder.toString());
}
public void testEscapeScript() throws Exception {
String test = "one<script no-value-attr>callSomeMethod();</script>two";
String gold = "one<script no-value-attr></script>two";
Set<String> escapedTags = new HashSet<String>(Arrays.asList("SCRIPT"));
Reader reader = new HTMLStripCharFilter
(CharReader.get(new StringReader(test)), escapedTags);
int ch = 0;
StringBuilder builder = new StringBuilder();
try {
while ((ch = reader.read()) != -1){
builder.append((char)ch);
}
} finally {
// System.out.println("String: " + builder.toString());
}
assertEquals("'" + builder.toString() + "' is not equal to '" + gold + "'",
gold, builder.toString());
}
public void testStyle() throws Exception {
String test = "one<style type=\"text/css\">\n"
+ "<!--\n"
+ "@import url('http://www.lasletrasdecanciones.com/css.css');\n"
+ "-->\n"
+ "</style>two";
String gold = "one\ntwo";
Reader reader = new HTMLStripCharFilter(CharReader.get(new StringReader(test)));
int ch = 0;
StringBuilder builder = new StringBuilder();
try {
while ((ch = reader.read()) != -1){
builder.append((char)ch);
}
} finally {
// System.out.println("String: " + builder.toString());
}
assertEquals("'" + builder.toString() + "' is not equal to '" + gold + "'",
gold, builder.toString());
}
public void testEscapeStyle() throws Exception {
String test = "one<style type=\"text/css\"> body,font,a { font-family:arial; } </style>two";
String gold = "one<style type=\"text/css\"></style>two";
Set<String> escapedTags = new HashSet<String>(Arrays.asList("STYLE"));
Reader reader = new HTMLStripCharFilter
(CharReader.get(new StringReader(test)), escapedTags);
int ch = 0;
StringBuilder builder = new StringBuilder();
try {
while ((ch = reader.read()) != -1){
builder.append((char)ch);
}
} finally {
// System.out.println("String: " + builder.toString());
}
assertEquals("'" + builder.toString() + "' is not equal to '" + gold + "'",
gold, builder.toString());
}
public void testBR() throws Exception {
String[] testGold = {
"one<BR />two<br>three",
"one\ntwo\nthree",
"one<BR some stuff here too>two</BR>",
"one\ntwo\n",
};
for (int i = 0 ; i < testGold.length ; i += 2) {
String test = testGold[i];
String gold = testGold[i + 1];
Reader reader = new HTMLStripCharFilter(CharReader.get(new StringReader(test)));
StringBuilder builder = new StringBuilder();
int ch = 0;
while ((ch = reader.read()) != -1){
builder.append((char)ch);
}
String result = builder.toString();
assertEquals("Test: '" + test + "'", gold, result);
}
}
public void testEscapeBR() throws Exception {
String test = "one<BR class='whatever'>two</\nBR\n>";
String gold = "one<BR class='whatever'>two</\nBR\n>";
Set<String> escapedTags = new HashSet<String>(Arrays.asList("BR"));
Reader reader = new HTMLStripCharFilter
(CharReader.get(new StringReader(test)), escapedTags);
int ch = 0;
StringBuilder builder = new StringBuilder();
try {
while ((ch = reader.read()) != -1){
builder.append((char)ch);
}
} finally {
// System.out.println("String: " + builder.toString());
}
assertEquals("'" + builder.toString() + "' is not equal to '" + gold + "'",
gold, builder.toString());
}
public void testInlineTagsNoSpace() throws Exception {
String test = "one<sPAn class=\"invisible\">two<sup>2<sup>e</sup></sup>.</SpaN>three";
String gold = "onetwo2e.three";
Reader reader = new HTMLStripCharFilter(CharReader.get(new StringReader(test)));
int ch = 0;
StringBuilder builder = new StringBuilder();
try {
while ((ch = reader.read()) != -1){
builder.append((char)ch);
}
} finally {
// System.out.println("String: " + builder.toString());
}
assertEquals("'" + builder.toString() + "' is not equal to '" + gold + "'",
gold, builder.toString());
}
public void testCDATA() throws Exception {
String test = "one<![CDATA[<one><two>three<four></four></two></one>]]>two";
String gold = "one<one><two>three<four></four></two></one>two";
Reader reader = new HTMLStripCharFilter(CharReader.get(new StringReader(test)));
int ch = 0;
StringBuilder builder = new StringBuilder();
try {
while ((ch = reader.read()) != -1){
builder.append((char)ch);
}
} finally {
// System.out.println("String: " + builder.toString());
}
assertEquals("'" + builder.toString() + "' is not equal to '" + gold + "'",
gold, builder.toString());
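// in the second case the inner "<![CDATA[" is ordinary CDATA content, and the
// "]]]]><![CDATA[>" idiom ends the first section after "]]" and reopens a new one at ">",
// which is why a literal "]]>" survives in the stripped output below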
test = "one<![CDATA[two<![CDATA[three]]]]><![CDATA[>four]]>five";
gold = "onetwo<![CDATA[three]]>fourfive";
reader = new HTMLStripCharFilter(CharReader.get(new StringReader(test)));
ch = 0;
builder = new StringBuilder();
try {
while ((ch = reader.read()) != -1){
builder.append((char)ch);
}
} finally {
// System.out.println("String: " + builder.toString());
}
assertEquals("'" + builder.toString() + "' is not equal to '" + gold + "'",
gold, builder.toString());
}
public void testUppercaseCharacterEntityVariants() throws Exception {
String test = " &QUOT;-&COPY;&GT;>&LT;<&REG;&AMP;";
String gold = " \"-\u00A9>><<\u00AE&";
Reader reader = new HTMLStripCharFilter(CharReader.get(new StringReader(test)));
int ch = 0;
StringBuilder builder = new StringBuilder();
try {
while ((ch = reader.read()) != -1){
builder.append((char)ch);
}
} finally {
// System.out.println("String: " + builder.toString());
}
assertEquals("'" + builder.toString() + "' is not equal to '" + gold + "'",
gold, builder.toString());
}
public void testMSWordMalformedProcessingInstruction() throws Exception {
String test = "one<?xml:namespace prefix = o ns = \"urn:schemas-microsoft-com:office:office\" />two";
String gold = "onetwo";
Reader reader = new HTMLStripCharFilter(CharReader.get(new StringReader(test)));
int ch = 0;
StringBuilder builder = new StringBuilder();
try {
while ((ch = reader.read()) != -1){
builder.append((char)ch);
}
} finally {
// System.out.println("String: " + builder.toString());
}
assertEquals("'" + builder.toString() + "' is not equal to '" + gold + "'",
gold, builder.toString());
}
public void testSupplementaryCharsInTags() throws Exception {
String test = "one<𩬅艱鍟䇹愯瀛>two<瀛愯𩬅>three 瀛愯𩬅</瀛愯𩬅>four</𩬅艱鍟䇹愯瀛>five<𠀀𠀀>six<𠀀𠀀/>seven";
String gold = "one\ntwo\nthree 瀛愯𩬅\nfour\nfive\nsix\nseven";
Reader reader = new HTMLStripCharFilter(CharReader.get(new StringReader(test)));
int ch = 0;
StringBuilder builder = new StringBuilder();
try {
while ((ch = reader.read()) != -1){
builder.append((char)ch);
}
} finally {
// System.out.println("String: " + builder.toString());
}
assertEquals("'" + builder.toString() + "' is not equal to '" + gold + "'",
gold, builder.toString());
}
public void testRandomBrokenHTML() throws Exception {
int maxNumElements = 10000;
String text = _TestUtil.randomHtmlishString(random, maxNumElements);
Reader reader = new HTMLStripCharFilter
(CharReader.get(new StringReader(text)));
while (reader.read() != -1);
}
public void testRandomText() throws Exception {
StringBuilder text = new StringBuilder();
int minNumWords = 10;
int maxNumWords = 10000;
int minWordLength = 3;
int maxWordLength = 20;
int numWords = _TestUtil.nextInt(random, minNumWords, maxNumWords);
switch (_TestUtil.nextInt(random, 0, 4)) {
case 0: {
for (int wordNum = 0 ; wordNum < numWords ; ++wordNum) {
text.append(_TestUtil.randomUnicodeString(random, maxWordLength));
text.append(' ');
}
break;
}
case 1: {
for (int wordNum = 0 ; wordNum < numWords ; ++wordNum) {
text.append(_TestUtil.randomRealisticUnicodeString
(random, minWordLength, maxWordLength));
text.append(' ');
}
break;
}
default: { // ASCII 50% of the time
for (int wordNum = 0 ; wordNum < numWords ; ++wordNum) {
text.append(_TestUtil.randomSimpleString(random));
text.append(' ');
}
}
}
Reader reader = new HTMLStripCharFilter
(CharReader.get(new StringReader(text.toString())));
while (reader.read() != -1);
}
public void testUTF16Surrogates() throws Exception {
Analyzer analyzer = new Analyzer() {
@Override
protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
Tokenizer tokenizer = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false);
return new TokenStreamComponents(tokenizer, tokenizer);
}
@Override
protected Reader initReader(Reader reader) {
return new HTMLStripCharFilter(CharReader.get(new BufferedReader(reader)));
}
};
// Paired surrogates
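// 55404 is the decimal form of the high surrogate U+D86C and 56321 the decimal form of the
// low surrogate U+DC01; each pair below therefore decodes to the single supplementary
// code point U+2B001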
assertAnalyzesTo(analyzer, " one two &#xD86C;&#XdC01;three",
new String[] { "one", "two", "\uD86C\uDC01three" } );
assertAnalyzesTo(analyzer, " &#55404;&#XdC01;", new String[] { "\uD86C\uDC01" } );
assertAnalyzesTo(analyzer, " &#xD86C;&#56321;", new String[] { "\uD86C\uDC01" } );
assertAnalyzesTo(analyzer, " &#55404;&#56321;", new String[] { "\uD86C\uDC01" } );
// Improperly paired surrogates
assertAnalyzesTo(analyzer, " &#55404;&#57999;", new String[] { "\uFFFD\uE28F" } );
assertAnalyzesTo(analyzer, " &#xD86C;&#57999;", new String[] { "\uFFFD\uE28F" } );
assertAnalyzesTo(analyzer, " &#55002;&#XdC01;", new String[] { "\uD6DA\uFFFD" } );
assertAnalyzesTo(analyzer, " &#55002;&#56321;", new String[] { "\uD6DA\uFFFD" } );
// Unpaired high surrogates
assertAnalyzesTo(analyzer, " &#Xd921;", new String[] { "\uFFFD" } );
assertAnalyzesTo(analyzer, " &#Xd921", new String[] { "\uFFFD" } );
assertAnalyzesTo(analyzer, " &#Xd921<br>", new String[] { "&#Xd921" } );
assertAnalyzesTo(analyzer, " &#55528;", new String[] { "\uFFFD" } );
assertAnalyzesTo(analyzer, " &#55528", new String[] { "\uFFFD" } );
assertAnalyzesTo(analyzer, " &#55528<br>", new String[] { "&#55528" } );
// Unpaired low surrogates
assertAnalyzesTo(analyzer, " &#xdfdb;", new String[] { "\uFFFD" } );
assertAnalyzesTo(analyzer, " &#xdfdb", new String[] { "\uFFFD" } );
assertAnalyzesTo(analyzer, " &#xdfdb<br>", new String[] { "&#xdfdb" } );
assertAnalyzesTo(analyzer, " &#57209;", new String[] { "\uFFFD" } );
assertAnalyzesTo(analyzer, " &#57209", new String[] { "\uFFFD" } );
assertAnalyzesTo(analyzer, " &#57209<br>", new String[] { "&#57209" } );
}
}

View File

@ -0,0 +1,653 @@
<html xmlns:v="urn:schemas-microsoft-com:vml"
xmlns:o="urn:schemas-microsoft-com:office:office"
xmlns:w="urn:schemas-microsoft-com:office:word"
xmlns:m="http://schemas.microsoft.com/office/2004/12/omml"
xmlns="http://www.w3.org/TR/REC-html40">
<head>
<meta http-equiv=Content-Type content="text/html; charset=windows-1252">
<meta name=ProgId content=Word.Document>
<meta name=Generator content="Microsoft Word 14">
<meta name=Originator content="Microsoft Word 14">
<link rel=File-List href="This%20is%20a%20test_files/filelist.xml">
<!--[if gte mso 9]><xml>
<o:DocumentProperties>
<o:Author>s</o:Author>
<o:LastAuthor>s</o:LastAuthor>
<o:Revision>1</o:Revision>
<o:TotalTime>1</o:TotalTime>
<o:Created>2012-01-13T03:36:00Z</o:Created>
<o:LastSaved>2012-01-13T03:37:00Z</o:LastSaved>
<o:Pages>1</o:Pages>
<o:Words>8</o:Words>
<o:Characters>48</o:Characters>
<o:Lines>1</o:Lines>
<o:Paragraphs>1</o:Paragraphs>
<o:CharactersWithSpaces>55</o:CharactersWithSpaces>
<o:Version>14.00</o:Version>
</o:DocumentProperties>
<o:OfficeDocumentSettings>
<o:AllowPNG/>
</o:OfficeDocumentSettings>
</xml><![endif]-->
<link rel=themeData href="This%20is%20a%20test_files/themedata.thmx">
<link rel=colorSchemeMapping
href="This%20is%20a%20test_files/colorschememapping.xml">
<!--[if gte mso 9]><xml>
<w:WordDocument>
<w:SpellingState>Clean</w:SpellingState>
<w:GrammarState>Clean</w:GrammarState>
<w:TrackMoves>false</w:TrackMoves>
<w:TrackFormatting/>
<w:PunctuationKerning/>
<w:ValidateAgainstSchemas/>
<w:SaveIfXMLInvalid>false</w:SaveIfXMLInvalid>
<w:IgnoreMixedContent>false</w:IgnoreMixedContent>
<w:AlwaysShowPlaceholderText>false</w:AlwaysShowPlaceholderText>
<w:DoNotPromoteQF/>
<w:LidThemeOther>EN-US</w:LidThemeOther>
<w:LidThemeAsian>X-NONE</w:LidThemeAsian>
<w:LidThemeComplexScript>X-NONE</w:LidThemeComplexScript>
<w:Compatibility>
<w:BreakWrappedTables/>
<w:SnapToGridInCell/>
<w:WrapTextWithPunct/>
<w:UseAsianBreakRules/>
<w:DontGrowAutofit/>
<w:SplitPgBreakAndParaMark/>
<w:EnableOpenTypeKerning/>
<w:DontFlipMirrorIndents/>
<w:OverrideTableStyleHps/>
</w:Compatibility>
<m:mathPr>
<m:mathFont m:val="Cambria Math"/>
<m:brkBin m:val="before"/>
<m:brkBinSub m:val="&#45;-"/>
<m:smallFrac m:val="off"/>
<m:dispDef/>
<m:lMargin m:val="0"/>
<m:rMargin m:val="0"/>
<m:defJc m:val="centerGroup"/>
<m:wrapIndent m:val="1440"/>
<m:intLim m:val="subSup"/>
<m:naryLim m:val="undOvr"/>
</m:mathPr></w:WordDocument>
</xml><![endif]--><!--[if gte mso 9]><xml>
<w:LatentStyles DefLockedState="false" DefUnhideWhenUsed="true"
DefSemiHidden="true" DefQFormat="false" DefPriority="99"
LatentStyleCount="267">
<w:LsdException Locked="false" Priority="0" SemiHidden="false"
UnhideWhenUsed="false" QFormat="true" Name="Normal"/>
<w:LsdException Locked="false" Priority="9" SemiHidden="false"
UnhideWhenUsed="false" QFormat="true" Name="heading 1"/>
<w:LsdException Locked="false" Priority="9" QFormat="true" Name="heading 2"/>
<w:LsdException Locked="false" Priority="9" QFormat="true" Name="heading 3"/>
<w:LsdException Locked="false" Priority="9" QFormat="true" Name="heading 4"/>
<w:LsdException Locked="false" Priority="9" QFormat="true" Name="heading 5"/>
<w:LsdException Locked="false" Priority="9" QFormat="true" Name="heading 6"/>
<w:LsdException Locked="false" Priority="9" QFormat="true" Name="heading 7"/>
<w:LsdException Locked="false" Priority="9" QFormat="true" Name="heading 8"/>
<w:LsdException Locked="false" Priority="9" QFormat="true" Name="heading 9"/>
<w:LsdException Locked="false" Priority="39" Name="toc 1"/>
<w:LsdException Locked="false" Priority="39" Name="toc 2"/>
<w:LsdException Locked="false" Priority="39" Name="toc 3"/>
<w:LsdException Locked="false" Priority="39" Name="toc 4"/>
<w:LsdException Locked="false" Priority="39" Name="toc 5"/>
<w:LsdException Locked="false" Priority="39" Name="toc 6"/>
<w:LsdException Locked="false" Priority="39" Name="toc 7"/>
<w:LsdException Locked="false" Priority="39" Name="toc 8"/>
<w:LsdException Locked="false" Priority="39" Name="toc 9"/>
<w:LsdException Locked="false" Priority="35" QFormat="true" Name="caption"/>
<w:LsdException Locked="false" Priority="10" SemiHidden="false"
UnhideWhenUsed="false" QFormat="true" Name="Title"/>
<w:LsdException Locked="false" Priority="1" Name="Default Paragraph Font"/>
<w:LsdException Locked="false" Priority="11" SemiHidden="false"
UnhideWhenUsed="false" QFormat="true" Name="Subtitle"/>
<w:LsdException Locked="false" Priority="22" SemiHidden="false"
UnhideWhenUsed="false" QFormat="true" Name="Strong"/>
<w:LsdException Locked="false" Priority="20" SemiHidden="false"
UnhideWhenUsed="false" QFormat="true" Name="Emphasis"/>
<w:LsdException Locked="false" Priority="59" SemiHidden="false"
UnhideWhenUsed="false" Name="Table Grid"/>
<w:LsdException Locked="false" UnhideWhenUsed="false" Name="Placeholder Text"/>
<w:LsdException Locked="false" Priority="1" SemiHidden="false"
UnhideWhenUsed="false" QFormat="true" Name="No Spacing"/>
<w:LsdException Locked="false" Priority="60" SemiHidden="false"
UnhideWhenUsed="false" Name="Light Shading"/>
<w:LsdException Locked="false" Priority="61" SemiHidden="false"
UnhideWhenUsed="false" Name="Light List"/>
<w:LsdException Locked="false" Priority="62" SemiHidden="false"
UnhideWhenUsed="false" Name="Light Grid"/>
<w:LsdException Locked="false" Priority="63" SemiHidden="false"
UnhideWhenUsed="false" Name="Medium Shading 1"/>
<w:LsdException Locked="false" Priority="64" SemiHidden="false"
UnhideWhenUsed="false" Name="Medium Shading 2"/>
<w:LsdException Locked="false" Priority="65" SemiHidden="false"
UnhideWhenUsed="false" Name="Medium List 1"/>
<w:LsdException Locked="false" Priority="66" SemiHidden="false"
UnhideWhenUsed="false" Name="Medium List 2"/>
<w:LsdException Locked="false" Priority="67" SemiHidden="false"
UnhideWhenUsed="false" Name="Medium Grid 1"/>
<w:LsdException Locked="false" Priority="68" SemiHidden="false"
UnhideWhenUsed="false" Name="Medium Grid 2"/>
<w:LsdException Locked="false" Priority="69" SemiHidden="false"
UnhideWhenUsed="false" Name="Medium Grid 3"/>
<w:LsdException Locked="false" Priority="70" SemiHidden="false"
UnhideWhenUsed="false" Name="Dark List"/>
<w:LsdException Locked="false" Priority="71" SemiHidden="false"
UnhideWhenUsed="false" Name="Colorful Shading"/>
<w:LsdException Locked="false" Priority="72" SemiHidden="false"
UnhideWhenUsed="false" Name="Colorful List"/>
<w:LsdException Locked="false" Priority="73" SemiHidden="false"
UnhideWhenUsed="false" Name="Colorful Grid"/>
<w:LsdException Locked="false" Priority="60" SemiHidden="false"
UnhideWhenUsed="false" Name="Light Shading Accent 1"/>
<w:LsdException Locked="false" Priority="61" SemiHidden="false"
UnhideWhenUsed="false" Name="Light List Accent 1"/>
<w:LsdException Locked="false" Priority="62" SemiHidden="false"
UnhideWhenUsed="false" Name="Light Grid Accent 1"/>
<w:LsdException Locked="false" Priority="63" SemiHidden="false"
UnhideWhenUsed="false" Name="Medium Shading 1 Accent 1"/>
<w:LsdException Locked="false" Priority="64" SemiHidden="false"
UnhideWhenUsed="false" Name="Medium Shading 2 Accent 1"/>
<w:LsdException Locked="false" Priority="65" SemiHidden="false"
UnhideWhenUsed="false" Name="Medium List 1 Accent 1"/>
<w:LsdException Locked="false" UnhideWhenUsed="false" Name="Revision"/>
<w:LsdException Locked="false" Priority="34" SemiHidden="false"
UnhideWhenUsed="false" QFormat="true" Name="List Paragraph"/>
<w:LsdException Locked="false" Priority="29" SemiHidden="false"
UnhideWhenUsed="false" QFormat="true" Name="Quote"/>
<w:LsdException Locked="false" Priority="30" SemiHidden="false"
UnhideWhenUsed="false" QFormat="true" Name="Intense Quote"/>
<w:LsdException Locked="false" Priority="66" SemiHidden="false"
UnhideWhenUsed="false" Name="Medium List 2 Accent 1"/>
<w:LsdException Locked="false" Priority="67" SemiHidden="false"
UnhideWhenUsed="false" Name="Medium Grid 1 Accent 1"/>
<w:LsdException Locked="false" Priority="68" SemiHidden="false"
UnhideWhenUsed="false" Name="Medium Grid 2 Accent 1"/>
<w:LsdException Locked="false" Priority="69" SemiHidden="false"
UnhideWhenUsed="false" Name="Medium Grid 3 Accent 1"/>
<w:LsdException Locked="false" Priority="70" SemiHidden="false"
UnhideWhenUsed="false" Name="Dark List Accent 1"/>
<w:LsdException Locked="false" Priority="71" SemiHidden="false"
UnhideWhenUsed="false" Name="Colorful Shading Accent 1"/>
<w:LsdException Locked="false" Priority="72" SemiHidden="false"
UnhideWhenUsed="false" Name="Colorful List Accent 1"/>
<w:LsdException Locked="false" Priority="73" SemiHidden="false"
UnhideWhenUsed="false" Name="Colorful Grid Accent 1"/>
<w:LsdException Locked="false" Priority="60" SemiHidden="false"
UnhideWhenUsed="false" Name="Light Shading Accent 2"/>
<w:LsdException Locked="false" Priority="61" SemiHidden="false"
UnhideWhenUsed="false" Name="Light List Accent 2"/>
<w:LsdException Locked="false" Priority="62" SemiHidden="false"
UnhideWhenUsed="false" Name="Light Grid Accent 2"/>
<w:LsdException Locked="false" Priority="63" SemiHidden="false"
UnhideWhenUsed="false" Name="Medium Shading 1 Accent 2"/>
<w:LsdException Locked="false" Priority="64" SemiHidden="false"
UnhideWhenUsed="false" Name="Medium Shading 2 Accent 2"/>
<w:LsdException Locked="false" Priority="65" SemiHidden="false"
UnhideWhenUsed="false" Name="Medium List 1 Accent 2"/>
<w:LsdException Locked="false" Priority="66" SemiHidden="false"
UnhideWhenUsed="false" Name="Medium List 2 Accent 2"/>
<w:LsdException Locked="false" Priority="67" SemiHidden="false"
UnhideWhenUsed="false" Name="Medium Grid 1 Accent 2"/>
<w:LsdException Locked="false" Priority="68" SemiHidden="false"
UnhideWhenUsed="false" Name="Medium Grid 2 Accent 2"/>
<w:LsdException Locked="false" Priority="69" SemiHidden="false"
UnhideWhenUsed="false" Name="Medium Grid 3 Accent 2"/>
<w:LsdException Locked="false" Priority="70" SemiHidden="false"
UnhideWhenUsed="false" Name="Dark List Accent 2"/>
<w:LsdException Locked="false" Priority="71" SemiHidden="false"
UnhideWhenUsed="false" Name="Colorful Shading Accent 2"/>
<w:LsdException Locked="false" Priority="72" SemiHidden="false"
UnhideWhenUsed="false" Name="Colorful List Accent 2"/>
<w:LsdException Locked="false" Priority="73" SemiHidden="false"
UnhideWhenUsed="false" Name="Colorful Grid Accent 2"/>
<w:LsdException Locked="false" Priority="60" SemiHidden="false"
UnhideWhenUsed="false" Name="Light Shading Accent 3"/>
<w:LsdException Locked="false" Priority="61" SemiHidden="false"
UnhideWhenUsed="false" Name="Light List Accent 3"/>
<w:LsdException Locked="false" Priority="62" SemiHidden="false"
UnhideWhenUsed="false" Name="Light Grid Accent 3"/>
<w:LsdException Locked="false" Priority="63" SemiHidden="false"
UnhideWhenUsed="false" Name="Medium Shading 1 Accent 3"/>
<w:LsdException Locked="false" Priority="64" SemiHidden="false"
UnhideWhenUsed="false" Name="Medium Shading 2 Accent 3"/>
<w:LsdException Locked="false" Priority="65" SemiHidden="false"
UnhideWhenUsed="false" Name="Medium List 1 Accent 3"/>
<w:LsdException Locked="false" Priority="66" SemiHidden="false"
UnhideWhenUsed="false" Name="Medium List 2 Accent 3"/>
<w:LsdException Locked="false" Priority="67" SemiHidden="false"
UnhideWhenUsed="false" Name="Medium Grid 1 Accent 3"/>
<w:LsdException Locked="false" Priority="68" SemiHidden="false"
UnhideWhenUsed="false" Name="Medium Grid 2 Accent 3"/>
<w:LsdException Locked="false" Priority="69" SemiHidden="false"
UnhideWhenUsed="false" Name="Medium Grid 3 Accent 3"/>
<w:LsdException Locked="false" Priority="70" SemiHidden="false"
UnhideWhenUsed="false" Name="Dark List Accent 3"/>
<w:LsdException Locked="false" Priority="71" SemiHidden="false"
UnhideWhenUsed="false" Name="Colorful Shading Accent 3"/>
<w:LsdException Locked="false" Priority="72" SemiHidden="false"
UnhideWhenUsed="false" Name="Colorful List Accent 3"/>
<w:LsdException Locked="false" Priority="73" SemiHidden="false"
UnhideWhenUsed="false" Name="Colorful Grid Accent 3"/>
<w:LsdException Locked="false" Priority="60" SemiHidden="false"
UnhideWhenUsed="false" Name="Light Shading Accent 4"/>
<w:LsdException Locked="false" Priority="61" SemiHidden="false"
UnhideWhenUsed="false" Name="Light List Accent 4"/>
<w:LsdException Locked="false" Priority="62" SemiHidden="false"
UnhideWhenUsed="false" Name="Light Grid Accent 4"/>
<w:LsdException Locked="false" Priority="63" SemiHidden="false"
UnhideWhenUsed="false" Name="Medium Shading 1 Accent 4"/>
<w:LsdException Locked="false" Priority="64" SemiHidden="false"
UnhideWhenUsed="false" Name="Medium Shading 2 Accent 4"/>
<w:LsdException Locked="false" Priority="65" SemiHidden="false"
UnhideWhenUsed="false" Name="Medium List 1 Accent 4"/>
<w:LsdException Locked="false" Priority="66" SemiHidden="false"
UnhideWhenUsed="false" Name="Medium List 2 Accent 4"/>
<w:LsdException Locked="false" Priority="67" SemiHidden="false"
UnhideWhenUsed="false" Name="Medium Grid 1 Accent 4"/>
<w:LsdException Locked="false" Priority="68" SemiHidden="false"
UnhideWhenUsed="false" Name="Medium Grid 2 Accent 4"/>
<w:LsdException Locked="false" Priority="69" SemiHidden="false"
UnhideWhenUsed="false" Name="Medium Grid 3 Accent 4"/>
<w:LsdException Locked="false" Priority="70" SemiHidden="false"
UnhideWhenUsed="false" Name="Dark List Accent 4"/>
<w:LsdException Locked="false" Priority="71" SemiHidden="false"
UnhideWhenUsed="false" Name="Colorful Shading Accent 4"/>
<w:LsdException Locked="false" Priority="72" SemiHidden="false"
UnhideWhenUsed="false" Name="Colorful List Accent 4"/>
<w:LsdException Locked="false" Priority="73" SemiHidden="false"
UnhideWhenUsed="false" Name="Colorful Grid Accent 4"/>
<w:LsdException Locked="false" Priority="60" SemiHidden="false"
UnhideWhenUsed="false" Name="Light Shading Accent 5"/>
<w:LsdException Locked="false" Priority="61" SemiHidden="false"
UnhideWhenUsed="false" Name="Light List Accent 5"/>
<w:LsdException Locked="false" Priority="62" SemiHidden="false"
UnhideWhenUsed="false" Name="Light Grid Accent 5"/>
<w:LsdException Locked="false" Priority="63" SemiHidden="false"
UnhideWhenUsed="false" Name="Medium Shading 1 Accent 5"/>
<w:LsdException Locked="false" Priority="64" SemiHidden="false"
UnhideWhenUsed="false" Name="Medium Shading 2 Accent 5"/>
<w:LsdException Locked="false" Priority="65" SemiHidden="false"
UnhideWhenUsed="false" Name="Medium List 1 Accent 5"/>
<w:LsdException Locked="false" Priority="66" SemiHidden="false"
UnhideWhenUsed="false" Name="Medium List 2 Accent 5"/>
<w:LsdException Locked="false" Priority="67" SemiHidden="false"
UnhideWhenUsed="false" Name="Medium Grid 1 Accent 5"/>
<w:LsdException Locked="false" Priority="68" SemiHidden="false"
UnhideWhenUsed="false" Name="Medium Grid 2 Accent 5"/>
<w:LsdException Locked="false" Priority="69" SemiHidden="false"
UnhideWhenUsed="false" Name="Medium Grid 3 Accent 5"/>
<w:LsdException Locked="false" Priority="70" SemiHidden="false"
UnhideWhenUsed="false" Name="Dark List Accent 5"/>
<w:LsdException Locked="false" Priority="71" SemiHidden="false"
UnhideWhenUsed="false" Name="Colorful Shading Accent 5"/>
<w:LsdException Locked="false" Priority="72" SemiHidden="false"
UnhideWhenUsed="false" Name="Colorful List Accent 5"/>
<w:LsdException Locked="false" Priority="73" SemiHidden="false"
UnhideWhenUsed="false" Name="Colorful Grid Accent 5"/>
<w:LsdException Locked="false" Priority="60" SemiHidden="false"
UnhideWhenUsed="false" Name="Light Shading Accent 6"/>
<w:LsdException Locked="false" Priority="61" SemiHidden="false"
UnhideWhenUsed="false" Name="Light List Accent 6"/>
<w:LsdException Locked="false" Priority="62" SemiHidden="false"
UnhideWhenUsed="false" Name="Light Grid Accent 6"/>
<w:LsdException Locked="false" Priority="63" SemiHidden="false"
UnhideWhenUsed="false" Name="Medium Shading 1 Accent 6"/>
<w:LsdException Locked="false" Priority="64" SemiHidden="false"
UnhideWhenUsed="false" Name="Medium Shading 2 Accent 6"/>
<w:LsdException Locked="false" Priority="65" SemiHidden="false"
UnhideWhenUsed="false" Name="Medium List 1 Accent 6"/>
<w:LsdException Locked="false" Priority="66" SemiHidden="false"
UnhideWhenUsed="false" Name="Medium List 2 Accent 6"/>
<w:LsdException Locked="false" Priority="67" SemiHidden="false"
UnhideWhenUsed="false" Name="Medium Grid 1 Accent 6"/>
<w:LsdException Locked="false" Priority="68" SemiHidden="false"
UnhideWhenUsed="false" Name="Medium Grid 2 Accent 6"/>
<w:LsdException Locked="false" Priority="69" SemiHidden="false"
UnhideWhenUsed="false" Name="Medium Grid 3 Accent 6"/>
<w:LsdException Locked="false" Priority="70" SemiHidden="false"
UnhideWhenUsed="false" Name="Dark List Accent 6"/>
<w:LsdException Locked="false" Priority="71" SemiHidden="false"
UnhideWhenUsed="false" Name="Colorful Shading Accent 6"/>
<w:LsdException Locked="false" Priority="72" SemiHidden="false"
UnhideWhenUsed="false" Name="Colorful List Accent 6"/>
<w:LsdException Locked="false" Priority="73" SemiHidden="false"
UnhideWhenUsed="false" Name="Colorful Grid Accent 6"/>
<w:LsdException Locked="false" Priority="19" SemiHidden="false"
UnhideWhenUsed="false" QFormat="true" Name="Subtle Emphasis"/>
<w:LsdException Locked="false" Priority="21" SemiHidden="false"
UnhideWhenUsed="false" QFormat="true" Name="Intense Emphasis"/>
<w:LsdException Locked="false" Priority="31" SemiHidden="false"
UnhideWhenUsed="false" QFormat="true" Name="Subtle Reference"/>
<w:LsdException Locked="false" Priority="32" SemiHidden="false"
UnhideWhenUsed="false" QFormat="true" Name="Intense Reference"/>
<w:LsdException Locked="false" Priority="33" SemiHidden="false"
UnhideWhenUsed="false" QFormat="true" Name="Book Title"/>
<w:LsdException Locked="false" Priority="37" Name="Bibliography"/>
<w:LsdException Locked="false" Priority="39" QFormat="true" Name="TOC Heading"/>
</w:LatentStyles>
</xml><![endif]-->
<style>
<!--
/* Font Definitions */
@font-face
{font-family:"Cambria Math";
panose-1:2 4 5 3 5 4 6 3 2 4;
mso-font-charset:1;
mso-generic-font-family:roman;
mso-font-format:other;
mso-font-pitch:variable;
mso-font-signature:0 0 0 0 0 0;}
@font-face
{font-family:Cambria;
panose-1:2 4 5 3 5 4 6 3 2 4;
mso-font-charset:0;
mso-generic-font-family:roman;
mso-font-pitch:variable;
mso-font-signature:-536870145 1073743103 0 0 415 0;}
@font-face
{font-family:Calibri;
panose-1:2 15 5 2 2 2 4 3 2 4;
mso-font-charset:0;
mso-generic-font-family:swiss;
mso-font-pitch:variable;
mso-font-signature:-520092929 1073786111 9 0 415 0;}
/* Style Definitions */
p.MsoNormal, li.MsoNormal, div.MsoNormal
{mso-style-unhide:no;
mso-style-qformat:yes;
mso-style-parent:"";
margin-top:0in;
margin-right:0in;
margin-bottom:10.0pt;
margin-left:0in;
line-height:115%;
mso-pagination:widow-orphan;
font-size:11.0pt;
font-family:"Calibri","sans-serif";
mso-ascii-font-family:Calibri;
mso-ascii-theme-font:minor-latin;
mso-fareast-font-family:Calibri;
mso-fareast-theme-font:minor-latin;
mso-hansi-font-family:Calibri;
mso-hansi-theme-font:minor-latin;
mso-bidi-font-family:"Times New Roman";
mso-bidi-theme-font:minor-bidi;}
h1
{mso-style-priority:9;
mso-style-unhide:no;
mso-style-qformat:yes;
mso-style-link:"Heading 1 Char";
mso-style-next:Normal;
margin-top:24.0pt;
margin-right:0in;
margin-bottom:0in;
margin-left:0in;
margin-bottom:.0001pt;
line-height:115%;
mso-pagination:widow-orphan lines-together;
page-break-after:avoid;
mso-outline-level:1;
font-size:14.0pt;
font-family:"Cambria","serif";
mso-ascii-font-family:Cambria;
mso-ascii-theme-font:major-latin;
mso-fareast-font-family:"Times New Roman";
mso-fareast-theme-font:major-fareast;
mso-hansi-font-family:Cambria;
mso-hansi-theme-font:major-latin;
mso-bidi-font-family:"Times New Roman";
mso-bidi-theme-font:major-bidi;
color:#365F91;
mso-themecolor:accent1;
mso-themeshade:191;
mso-font-kerning:0pt;}
p.MsoTitle, li.MsoTitle, div.MsoTitle
{mso-style-priority:10;
mso-style-unhide:no;
mso-style-qformat:yes;
mso-style-link:"Title Char";
mso-style-next:Normal;
margin-top:0in;
margin-right:0in;
margin-bottom:15.0pt;
margin-left:0in;
mso-add-space:auto;
mso-pagination:widow-orphan;
border:none;
mso-border-bottom-alt:solid #4F81BD 1.0pt;
mso-border-bottom-themecolor:accent1;
padding:0in;
mso-padding-alt:0in 0in 4.0pt 0in;
font-size:26.0pt;
font-family:"Cambria","serif";
mso-ascii-font-family:Cambria;
mso-ascii-theme-font:major-latin;
mso-fareast-font-family:"Times New Roman";
mso-fareast-theme-font:major-fareast;
mso-hansi-font-family:Cambria;
mso-hansi-theme-font:major-latin;
mso-bidi-font-family:"Times New Roman";
mso-bidi-theme-font:major-bidi;
color:#17365D;
mso-themecolor:text2;
mso-themeshade:191;
letter-spacing:.25pt;
mso-font-kerning:14.0pt;}
p.MsoTitleCxSpFirst, li.MsoTitleCxSpFirst, div.MsoTitleCxSpFirst
{mso-style-priority:10;
mso-style-unhide:no;
mso-style-qformat:yes;
mso-style-link:"Title Char";
mso-style-next:Normal;
mso-style-type:export-only;
margin:0in;
margin-bottom:.0001pt;
mso-add-space:auto;
mso-pagination:widow-orphan;
border:none;
mso-border-bottom-alt:solid #4F81BD 1.0pt;
mso-border-bottom-themecolor:accent1;
padding:0in;
mso-padding-alt:0in 0in 4.0pt 0in;
font-size:26.0pt;
font-family:"Cambria","serif";
mso-ascii-font-family:Cambria;
mso-ascii-theme-font:major-latin;
mso-fareast-font-family:"Times New Roman";
mso-fareast-theme-font:major-fareast;
mso-hansi-font-family:Cambria;
mso-hansi-theme-font:major-latin;
mso-bidi-font-family:"Times New Roman";
mso-bidi-theme-font:major-bidi;
color:#17365D;
mso-themecolor:text2;
mso-themeshade:191;
letter-spacing:.25pt;
mso-font-kerning:14.0pt;}
p.MsoTitleCxSpMiddle, li.MsoTitleCxSpMiddle, div.MsoTitleCxSpMiddle
{mso-style-priority:10;
mso-style-unhide:no;
mso-style-qformat:yes;
mso-style-link:"Title Char";
mso-style-next:Normal;
mso-style-type:export-only;
margin:0in;
margin-bottom:.0001pt;
mso-add-space:auto;
mso-pagination:widow-orphan;
border:none;
mso-border-bottom-alt:solid #4F81BD 1.0pt;
mso-border-bottom-themecolor:accent1;
padding:0in;
mso-padding-alt:0in 0in 4.0pt 0in;
font-size:26.0pt;
font-family:"Cambria","serif";
mso-ascii-font-family:Cambria;
mso-ascii-theme-font:major-latin;
mso-fareast-font-family:"Times New Roman";
mso-fareast-theme-font:major-fareast;
mso-hansi-font-family:Cambria;
mso-hansi-theme-font:major-latin;
mso-bidi-font-family:"Times New Roman";
mso-bidi-theme-font:major-bidi;
color:#17365D;
mso-themecolor:text2;
mso-themeshade:191;
letter-spacing:.25pt;
mso-font-kerning:14.0pt;}
p.MsoTitleCxSpLast, li.MsoTitleCxSpLast, div.MsoTitleCxSpLast
{mso-style-priority:10;
mso-style-unhide:no;
mso-style-qformat:yes;
mso-style-link:"Title Char";
mso-style-next:Normal;
mso-style-type:export-only;
margin-top:0in;
margin-right:0in;
margin-bottom:15.0pt;
margin-left:0in;
mso-add-space:auto;
mso-pagination:widow-orphan;
border:none;
mso-border-bottom-alt:solid #4F81BD 1.0pt;
mso-border-bottom-themecolor:accent1;
padding:0in;
mso-padding-alt:0in 0in 4.0pt 0in;
font-size:26.0pt;
font-family:"Cambria","serif";
mso-ascii-font-family:Cambria;
mso-ascii-theme-font:major-latin;
mso-fareast-font-family:"Times New Roman";
mso-fareast-theme-font:major-fareast;
mso-hansi-font-family:Cambria;
mso-hansi-theme-font:major-latin;
mso-bidi-font-family:"Times New Roman";
mso-bidi-theme-font:major-bidi;
color:#17365D;
mso-themecolor:text2;
mso-themeshade:191;
letter-spacing:.25pt;
mso-font-kerning:14.0pt;}
span.TitleChar
{mso-style-name:"Title Char";
mso-style-priority:10;
mso-style-unhide:no;
mso-style-locked:yes;
mso-style-link:Title;
mso-ansi-font-size:26.0pt;
mso-bidi-font-size:26.0pt;
font-family:"Cambria","serif";
mso-ascii-font-family:Cambria;
mso-ascii-theme-font:major-latin;
mso-fareast-font-family:"Times New Roman";
mso-fareast-theme-font:major-fareast;
mso-hansi-font-family:Cambria;
mso-hansi-theme-font:major-latin;
mso-bidi-font-family:"Times New Roman";
mso-bidi-theme-font:major-bidi;
color:#17365D;
mso-themecolor:text2;
mso-themeshade:191;
letter-spacing:.25pt;
mso-font-kerning:14.0pt;}
span.Heading1Char
{mso-style-name:"Heading 1 Char";
mso-style-priority:9;
mso-style-unhide:no;
mso-style-locked:yes;
mso-style-link:"Heading 1";
mso-ansi-font-size:14.0pt;
mso-bidi-font-size:14.0pt;
font-family:"Cambria","serif";
mso-ascii-font-family:Cambria;
mso-ascii-theme-font:major-latin;
mso-fareast-font-family:"Times New Roman";
mso-fareast-theme-font:major-fareast;
mso-hansi-font-family:Cambria;
mso-hansi-theme-font:major-latin;
mso-bidi-font-family:"Times New Roman";
mso-bidi-theme-font:major-bidi;
color:#365F91;
mso-themecolor:accent1;
mso-themeshade:191;
font-weight:bold;}
.MsoChpDefault
{mso-style-type:export-only;
mso-default-props:yes;
font-family:"Calibri","sans-serif";
mso-ascii-font-family:Calibri;
mso-ascii-theme-font:minor-latin;
mso-fareast-font-family:Calibri;
mso-fareast-theme-font:minor-latin;
mso-hansi-font-family:Calibri;
mso-hansi-theme-font:minor-latin;
mso-bidi-font-family:"Times New Roman";
mso-bidi-theme-font:minor-bidi;}
.MsoPapDefault
{mso-style-type:export-only;
margin-bottom:10.0pt;
line-height:115%;}
@page WordSection1
{size:8.5in 11.0in;
margin:1.0in 1.0in 1.0in 1.0in;
mso-header-margin:.5in;
mso-footer-margin:.5in;
mso-paper-source:0;}
div.WordSection1
{page:WordSection1;}
-->
</style>
<!--[if gte mso 10]>
<style>
/* Style Definitions */
table.MsoNormalTable
{mso-style-name:"Table Normal";
mso-tstyle-rowband-size:0;
mso-tstyle-colband-size:0;
mso-style-noshow:yes;
mso-style-priority:99;
mso-style-parent:"";
mso-padding-alt:0in 5.4pt 0in 5.4pt;
mso-para-margin-top:0in;
mso-para-margin-right:0in;
mso-para-margin-bottom:10.0pt;
mso-para-margin-left:0in;
line-height:115%;
mso-pagination:widow-orphan;
font-size:11.0pt;
font-family:"Calibri","sans-serif";
mso-ascii-font-family:Calibri;
mso-ascii-theme-font:minor-latin;
mso-hansi-font-family:Calibri;
mso-hansi-theme-font:minor-latin;
mso-bidi-font-family:"Times New Roman";
mso-bidi-theme-font:minor-bidi;}
</style>
<![endif]--><!--[if gte mso 9]><xml>
<o:shapedefaults v:ext="edit" spidmax="1026"/>
</xml><![endif]--><!--[if gte mso 9]><xml>
<o:shapelayout v:ext="edit">
<o:idmap v:ext="edit" data="1"/>
</o:shapelayout></xml><![endif]-->
</head>
<body lang=EN-US style='tab-interval:.5in'>
<div class=WordSection1>
<div style='mso-element:para-border-div;border:none;border-bottom:solid #4F81BD 1.0pt;
mso-border-bottom-themecolor:accent1;padding:0in 0in 4.0pt 0in'>
<p class=MsoTitle>This is a test</p>
</div>
</div>
</body>
</html>

View File

@ -117,5 +117,10 @@ public class TestChineseTokenizer extends BaseTokenStreamTestCase
assertAnalyzesTo(justFilter, "This is a Test. b c d",
new String[] { "This", "Test." });
}
/** blast some random strings through the analyzer */
public void testRandomStrings() throws Exception {
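// checkRandomData (BaseTokenStreamTestCase) runs random text through the analyzer and
// sanity-checks the produced tokens (offsets, increments, reusability) rather than
// asserting any specific output; the same idiom is used by the other testRandomStrings
// methods added in this commit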
checkRandomData(random, new ChineseAnalyzer(), 10000*RANDOM_MULTIPLIER);
}
}

View File

@ -306,4 +306,31 @@ public class CommonGramsFilterTest extends BaseTokenStreamTestCase {
TokenFilter nsf = new CommonGramsQueryFilter(cgf);
assertTokenStreamContents(nsf, new String[] { "the_of" });
}
/** blast some random strings through the analyzer */
public void testRandomStrings() throws Exception {
Analyzer a = new Analyzer() {
@Override
protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
Tokenizer t = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false);
CommonGramsFilter cgf = new CommonGramsFilter(TEST_VERSION_CURRENT, t, commonWords);
return new TokenStreamComponents(t, cgf);
}
};
checkRandomData(random, a, 10000*RANDOM_MULTIPLIER);
Analyzer b = new Analyzer() {
@Override
protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
Tokenizer t = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false);
CommonGramsFilter cgf = new CommonGramsFilter(TEST_VERSION_CURRENT, t, commonWords);
return new TokenStreamComponents(t, new CommonGramsQueryFilter(cgf));
}
};
checkRandomData(random, b, 10000*RANDOM_MULTIPLIER);
}
}

View File

@ -18,14 +18,19 @@ package org.apache.lucene.analysis.compound;
*/
import java.io.IOException;
import java.io.Reader;
import java.io.StringReader;
import java.util.Arrays;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.analysis.CharReader;
import org.apache.lucene.analysis.MockTokenizer;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.charfilter.MappingCharFilter;
import org.apache.lucene.analysis.charfilter.NormalizeCharMap;
import org.apache.lucene.analysis.compound.hyphenation.HyphenationTree;
import org.apache.lucene.analysis.core.WhitespaceTokenizer;
import org.apache.lucene.analysis.util.CharArraySet;
@ -299,5 +304,61 @@ public class TestCompoundWordTokenFilter extends BaseTokenStreamTestCase {
}
}
}
// SOLR-2891
// *CompoundWordTokenFilter blindly adds term length to offset, but this can take things out of bounds
// wrt original text if a previous filter increases the length of the word (in this case ü -> ue)
// so in this case we behave like WDF, and preserve any modified offsets
public void testInvalidOffsets() throws Exception {
final CharArraySet dict = makeDictionary("fall");
final NormalizeCharMap normMap = new NormalizeCharMap();
normMap.add("ü", "ue");
Analyzer analyzer = new Analyzer() {
@Override
protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
Tokenizer tokenizer = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false);
TokenFilter filter = new DictionaryCompoundWordTokenFilter(TEST_VERSION_CURRENT, tokenizer, dict);
return new TokenStreamComponents(tokenizer, filter);
}
@Override
protected Reader initReader(Reader reader) {
return new MappingCharFilter(normMap, CharReader.get(reader));
}
};
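// the original input "banküberfall" is 12 characters; the char filter expands it to the
// 13-character "bankueberfall", and the expected end offsets below stay at 12 (the length
// of the original text) instead of start offset + term length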
assertAnalyzesTo(analyzer, "banküberfall",
new String[] { "bankueberfall", "fall" },
new int[] { 0, 0 },
new int[] { 12, 12 });
}
/** blast some random strings through the analyzer */
public void testRandomStrings() throws Exception {
final CharArraySet dict = makeDictionary("a", "e", "i", "o", "u", "y", "bc", "def");
Analyzer a = new Analyzer() {
@Override
protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
Tokenizer tokenizer = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false);
return new TokenStreamComponents(tokenizer, new DictionaryCompoundWordTokenFilter(TEST_VERSION_CURRENT, tokenizer, dict));
}
};
checkRandomData(random, a, 10000*RANDOM_MULTIPLIER);
InputSource is = new InputSource(getClass().getResource("da_UTF8.xml").toExternalForm());
final HyphenationTree hyphenator = HyphenationCompoundWordTokenFilter.getHyphenationTree(is);
Analyzer b = new Analyzer() {
@Override
protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
Tokenizer tokenizer = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false);
TokenFilter filter = new HyphenationCompoundWordTokenFilter(TEST_VERSION_CURRENT, tokenizer, hyphenator);
return new TokenStreamComponents(tokenizer, filter);
}
};
checkRandomData(random, b, 10000*RANDOM_MULTIPLIER);
}
}

View File

@ -0,0 +1,92 @@
package org.apache.lucene.analysis.core;
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.standard.StandardTokenizer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
import org.apache.lucene.util.English;
import java.io.IOException;
import java.io.StringReader;
import java.util.Set;
public class TestTypeTokenFilter extends BaseTokenStreamTestCase {
public void testTypeFilter() throws IOException {
StringReader reader = new StringReader("121 is palindrome, while 123 is not");
Set<String> stopTypes = asSet("<NUM>");
TokenStream stream = new TypeTokenFilter(true, new StandardTokenizer(TEST_VERSION_CURRENT, reader), stopTypes);
assertTokenStreamContents(stream, new String[]{"is", "palindrome", "while", "is", "not"});
}
/**
* Test Position increments applied by TypeTokenFilter with and without enabling this option.
*/
public void testStopPositons() throws IOException {
StringBuilder sb = new StringBuilder();
for (int i = 10; i < 20; i++) {
if (i % 3 != 0) {
sb.append(i).append(" ");
} else {
String w = English.intToEnglish(i).trim();
sb.append(w).append(" ");
}
}
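// sb now holds "10 11 twelve 13 14 fifteen 16 17 eighteen 19 ": every word token that
// survives the <NUM> filter is preceded by two removed number tokens, hence the expected
// position increment of 3 when increments are enabled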
log(sb.toString());
String stopTypes[] = new String[]{"<NUM>"};
Set<String> stopSet = asSet(stopTypes);
// with increments
StringReader reader = new StringReader(sb.toString());
TypeTokenFilter typeTokenFilter = new TypeTokenFilter(true, new StandardTokenizer(TEST_VERSION_CURRENT, reader), stopSet);
testPositons(typeTokenFilter);
// without increments
reader = new StringReader(sb.toString());
typeTokenFilter = new TypeTokenFilter(false, new StandardTokenizer(TEST_VERSION_CURRENT, reader), stopSet);
testPositons(typeTokenFilter);
}
private void testPositons(TypeTokenFilter stpf) throws IOException {
TypeAttribute typeAtt = stpf.getAttribute(TypeAttribute.class);
CharTermAttribute termAttribute = stpf.getAttribute(CharTermAttribute.class);
PositionIncrementAttribute posIncrAtt = stpf.getAttribute(PositionIncrementAttribute.class);
stpf.reset();
boolean enablePositionIncrements = stpf.getEnablePositionIncrements();
while (stpf.incrementToken()) {
log("Token: " + termAttribute.toString() + ": " + typeAtt.type() + " - " + posIncrAtt.getPositionIncrement());
assertEquals("if position increment is enabled the positionIncrementAttribute value should be 3, otherwise 1",
posIncrAtt.getPositionIncrement(), enablePositionIncrements ? 3 : 1);
}
stpf.end();
stpf.close();
}
// print debug info depending on VERBOSE
private static void log(String s) {
if (VERBOSE) {
System.out.println(s);
}
}
}

View File

@ -18,12 +18,15 @@ package org.apache.lucene.analysis.hunspell;
import java.io.IOException;
import java.io.InputStream;
import java.io.Reader;
import java.io.StringReader;
import java.text.ParseException;
import java.util.Arrays;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.analysis.MockTokenizer;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.miscellaneous.KeywordMarkerFilter;
import org.apache.lucene.analysis.util.CharArraySet;
import org.junit.BeforeClass;
@ -57,4 +60,17 @@ public class HunspellStemFilterTest extends BaseTokenStreamTestCase {
filter = new HunspellStemFilter(new KeywordMarkerFilter(tokenizer, set), DICTIONARY);
assertTokenStreamContents(filter, new String[]{"lucene", "is", "awesome"}, new int[] {1, 1, 1});
}
/** blast some random strings through the analyzer */
public void testRandomStrings() throws Exception {
Analyzer analyzer = new Analyzer() {
@Override
protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
Tokenizer tokenizer = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false);
return new TokenStreamComponents(tokenizer, new HunspellStemFilter(tokenizer, DICTIONARY));
}
};
checkRandomData(random, analyzer, 10000*RANDOM_MULTIPLIER);
}
}

View File

@ -22,6 +22,7 @@ import java.io.StringReader;
import java.util.Arrays;
import java.util.regex.Pattern;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.core.StopAnalyzer;
@ -132,4 +133,10 @@ public class PatternAnalyzerTest extends BaseTokenStreamTestCase {
TokenStream ts2 = analyzer.tokenStream("dummy", new StringReader(document));
assertTokenStreamContents(ts2, expected);
}
/** blast some random strings through the analyzer */
public void testRandomStrings() throws Exception {
Analyzer a = new PatternAnalyzer(TEST_VERSION_CURRENT, Pattern.compile(","), true, StopAnalyzer.ENGLISH_STOP_WORDS_SET);
checkRandomData(random, a, 10000*RANDOM_MULTIPLIER);
}
}

View File

@ -17,11 +17,14 @@ package org.apache.lucene.analysis.miscellaneous;
* limitations under the License.
*/
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.analysis.MockTokenizer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.core.WhitespaceTokenizer;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import java.io.Reader;
import java.io.StringReader;
import java.util.List;
import java.util.ArrayList;
@ -1907,4 +1910,17 @@ public class TestASCIIFoldingFilter extends BaseTokenStreamTestCase {
assertTrue(stream.incrementToken());
assertEquals(expected, termAtt.toString());
}
/** blast some random strings through the analyzer */
public void testRandomStrings() throws Exception {
Analyzer a = new Analyzer() {
@Override
protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
Tokenizer tokenizer = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false);
return new TokenStreamComponents(tokenizer, new ASCIIFoldingFilter(tokenizer));
}
};
checkRandomData(random, a, 10000*RANDOM_MULTIPLIER);
}
}

View File

@ -18,12 +18,14 @@
package org.apache.lucene.analysis.miscellaneous;
import java.io.IOException;
import java.io.Reader;
import java.io.StringReader;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collection;
import java.util.List;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.analysis.MockTokenizer;
import org.apache.lucene.analysis.Tokenizer;
@ -117,4 +119,18 @@ public class TestCapitalizationFilter extends BaseTokenStreamTestCase {
new String[] { expected }, onlyFirstWord, keep, forceFirstLetter, okPrefix,
minWordLength, maxWordCount, maxTokenLength);
}
/** blast some random strings through the analyzer */
public void testRandomString() throws Exception {
Analyzer a = new Analyzer() {
@Override
protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
Tokenizer tokenizer = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false);
return new TokenStreamComponents(tokenizer, new CapitalizationFilter(tokenizer));
}
};
checkRandomData(random, a, 10000*RANDOM_MULTIPLIER);
}
}

View File

@ -17,11 +17,14 @@
package org.apache.lucene.analysis.miscellaneous;
import java.io.Reader;
import java.io.StringReader;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.analysis.MockTokenizer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
/**
* HyphenatedWordsFilter test
@ -46,5 +49,29 @@ public class TestHyphenatedWordsFilter extends BaseTokenStreamTestCase {
ts = new HyphenatedWordsFilter(ts);
assertTokenStreamContents(ts,
new String[] { "ecological", "develop", "comprehensive-hands-on", "and", "ecology-" });
}
}
public void testOffsets() throws Exception {
String input = "abc- def geh 1234- 5678-";
TokenStream ts = new MockTokenizer(new StringReader(input), MockTokenizer.WHITESPACE, false);
ts = new HyphenatedWordsFilter(ts);
assertTokenStreamContents(ts,
new String[] { "abcdef", "geh", "12345678-" },
new int[] { 0, 9, 13 },
new int[] { 8, 12, 24 });
}
/** blast some random strings through the analyzer */
public void testRandomString() throws Exception {
Analyzer a = new Analyzer() {
@Override
protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
Tokenizer tokenizer = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false);
return new TokenStreamComponents(tokenizer, new HyphenatedWordsFilter(tokenizer));
}
};
checkRandomData(random, a, 10000*RANDOM_MULTIPLIER);
}
}

View File

@ -17,13 +17,16 @@
package org.apache.lucene.analysis.miscellaneous;
import java.io.Reader;
import java.io.StringReader;
import java.util.HashSet;
import java.util.Set;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.analysis.MockTokenizer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.util.CharArraySet;
/** Test {@link KeepWordFilter} */
@ -57,4 +60,23 @@ public class TestKeepWordFilter extends BaseTokenStreamTestCase {
stream = new KeepWordFilter(false, stream, new CharArraySet(TEST_VERSION_CURRENT,words, false));
assertTokenStreamContents(stream, new String[] { "aaa" }, new int[] { 1 });
}
/** blast some random strings through the analyzer */
public void testRandomStrings() throws Exception {
final Set<String> words = new HashSet<String>();
words.add( "a" );
words.add( "b" );
Analyzer a = new Analyzer() {
@Override
protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
Tokenizer tokenizer = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false);
TokenStream stream = new KeepWordFilter(true, tokenizer, new CharArraySet(TEST_VERSION_CURRENT, words, true));
return new TokenStreamComponents(tokenizer, stream);
}
};
checkRandomData(random, a, 10000*RANDOM_MULTIPLIER);
}
}

View File

@ -17,13 +17,21 @@
package org.apache.lucene.analysis.miscellaneous;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.analysis.MockTokenizer;
import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.synonym.SynonymFilter;
import org.apache.lucene.analysis.synonym.SynonymMap;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.util.CharsRef;
import org.apache.lucene.util._TestUtil;
import java.io.Reader;
import java.util.Iterator;
import java.util.Arrays;
@ -116,6 +124,45 @@ public class TestRemoveDuplicatesTokenFilter extends BaseTokenStreamTestCase {
}
// some helper methods for the below test with synonyms
private String randomNonEmptyString() {
while(true) {
final String s = _TestUtil.randomUnicodeString(random).trim();
if (s.length() != 0 && s.indexOf('\u0000') == -1) {
return s;
}
}
}
private void add(SynonymMap.Builder b, String input, String output, boolean keepOrig) {
b.add(new CharsRef(input.replaceAll(" +", "\u0000")),
new CharsRef(output.replaceAll(" +", "\u0000")),
keepOrig);
}
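// SynonymMap encodes multi-word synonyms with \u0000 as the word separator, which is why
// add() maps runs of spaces to \u0000 and randomNonEmptyString() rejects strings that
// already contain that character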
/** blast some random strings through the analyzer */
public void testRandomStrings() throws Exception {
final int numIters = atLeast(10);
for (int i = 0; i < numIters; i++) {
SynonymMap.Builder b = new SynonymMap.Builder(random.nextBoolean());
final int numEntries = atLeast(10);
for (int j = 0; j < numEntries; j++) {
add(b, randomNonEmptyString(), randomNonEmptyString(), random.nextBoolean());
}
final SynonymMap map = b.build();
final boolean ignoreCase = random.nextBoolean();
final Analyzer analyzer = new Analyzer() {
@Override
protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
Tokenizer tokenizer = new MockTokenizer(reader, MockTokenizer.SIMPLE, true);
TokenStream stream = new SynonymFilter(tokenizer, map, ignoreCase);
return new TokenStreamComponents(tokenizer, new RemoveDuplicatesTokenFilter(stream));
}
};
checkRandomData(random, analyzer, 1000*RANDOM_MULTIPLIER);
}
}
}

View File

@ -18,11 +18,15 @@
package org.apache.lucene.analysis.miscellaneous;
import java.io.IOException;
import java.io.Reader;
import java.util.Collection;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.analysis.MockTokenizer;
import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.tokenattributes.*;
/**
@ -103,4 +107,27 @@ public class TestTrimFilter extends BaseTokenStreamTestCase {
}
}
}
/** blast some random strings through the analyzer */
public void testRandomStrings() throws Exception {
Analyzer a = new Analyzer() {
@Override
protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
Tokenizer tokenizer = new MockTokenizer(reader, MockTokenizer.KEYWORD, false);
return new TokenStreamComponents(tokenizer, new TrimFilter(tokenizer, false));
}
};
checkRandomData(random, a, 10000*RANDOM_MULTIPLIER);
Analyzer b = new Analyzer() {
@Override
protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
Tokenizer tokenizer = new MockTokenizer(reader, MockTokenizer.KEYWORD, false);
return new TokenStreamComponents(tokenizer, new TrimFilter(tokenizer, true));
}
};
checkRandomData(random, b, 10000*RANDOM_MULTIPLIER);
}
}

View File

@ -298,4 +298,28 @@ public class TestWordDelimiterFilter extends BaseTokenStreamTestCase {
new int[] { 10, 15, 15 },
new int[] { 2, 1, 0 });
}
/** blast some random strings through the analyzer */
public void testRandomStrings() throws Exception {
int numIterations = atLeast(5);
for (int i = 0; i < numIterations; i++) {
final int flags = random.nextInt(512);
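// 512 = 2^9, presumably chosen so that random.nextInt covers every combination of the
// filter's nine flag bits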
final CharArraySet protectedWords;
if (random.nextBoolean()) {
protectedWords = new CharArraySet(TEST_VERSION_CURRENT, new HashSet<String>(Arrays.asList("a", "b", "cd")), false);
} else {
protectedWords = null;
}
Analyzer a = new Analyzer() {
@Override
protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
Tokenizer tokenizer = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false);
return new TokenStreamComponents(tokenizer, new WordDelimiterFilter(tokenizer, flags, protectedWords));
}
};
checkRandomData(random, a, 10000*RANDOM_MULTIPLIER);
}
}
}

View File

@ -129,4 +129,27 @@ public class EdgeNGramTokenFilterTest extends BaseTokenStreamTestCase {
new int[] { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
new int[] { 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11 });
}
/** blast some random strings through the analyzer */
public void testRandomStrings() throws Exception {
Analyzer a = new Analyzer() {
@Override
protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
Tokenizer tokenizer = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false);
return new TokenStreamComponents(tokenizer,
new EdgeNGramTokenFilter(tokenizer, EdgeNGramTokenFilter.Side.FRONT, 2, 15));
}
};
checkRandomData(random, a, 10000*RANDOM_MULTIPLIER);
Analyzer b = new Analyzer() {
@Override
protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
Tokenizer tokenizer = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false);
return new TokenStreamComponents(tokenizer,
new EdgeNGramTokenFilter(tokenizer, EdgeNGramTokenFilter.Side.BACK, 2, 15));
}
};
checkRandomData(random, b, 10000*RANDOM_MULTIPLIER);
}
}

View File

@ -18,9 +18,13 @@ package org.apache.lucene.analysis.ngram;
*/
import java.io.Reader;
import java.io.StringReader;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.Analyzer.TokenStreamComponents;
/**
* Tests {@link EdgeNGramTokenizer} for correctness.
@ -95,4 +99,25 @@ public class EdgeNGramTokenizerTest extends BaseTokenStreamTestCase {
tokenizer.reset(new StringReader("abcde"));
assertTokenStreamContents(tokenizer, new String[]{"a","ab","abc"}, new int[]{0,0,0}, new int[]{1,2,3}, 5 /* abcde */);
}
/** blast some random strings through the analyzer */
public void testRandomStrings() throws Exception {
Analyzer a = new Analyzer() {
@Override
protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
Tokenizer tokenizer = new EdgeNGramTokenizer(reader, EdgeNGramTokenizer.Side.FRONT, 2, 15);
return new TokenStreamComponents(tokenizer, tokenizer);
}
};
checkRandomData(random, a, 10000*RANDOM_MULTIPLIER);
Analyzer b = new Analyzer() {
@Override
protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
Tokenizer tokenizer = new EdgeNGramTokenizer(reader, EdgeNGramTokenizer.Side.BACK, 2, 15);
return new TokenStreamComponents(tokenizer, tokenizer);
}
};
checkRandomData(random, b, 10000*RANDOM_MULTIPLIER);
}
}

View File

@ -23,6 +23,7 @@ import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.Analyzer.TokenStreamComponents;
import org.apache.lucene.analysis.core.WhitespaceTokenizer;
import org.apache.lucene.analysis.miscellaneous.ASCIIFoldingFilter;
@ -33,89 +34,102 @@ import java.io.StringReader;
* Tests {@link NGramTokenFilter} for correctness.
*/
public class NGramTokenFilterTest extends BaseTokenStreamTestCase {
private TokenStream input;
@Override
public void setUp() throws Exception {
super.setUp();
input = new MockTokenizer(new StringReader("abcde"), MockTokenizer.WHITESPACE, false);
private TokenStream input;
@Override
public void setUp() throws Exception {
super.setUp();
input = new MockTokenizer(new StringReader("abcde"), MockTokenizer.WHITESPACE, false);
}
public void testInvalidInput() throws Exception {
boolean gotException = false;
try {
new NGramTokenFilter(input, 2, 1);
} catch (IllegalArgumentException e) {
gotException = true;
}
public void testInvalidInput() throws Exception {
boolean gotException = false;
try {
new NGramTokenFilter(input, 2, 1);
} catch (IllegalArgumentException e) {
gotException = true;
}
assertTrue(gotException);
assertTrue(gotException);
}
public void testInvalidInput2() throws Exception {
boolean gotException = false;
try {
new NGramTokenFilter(input, 0, 1);
} catch (IllegalArgumentException e) {
gotException = true;
}
public void testInvalidInput2() throws Exception {
boolean gotException = false;
try {
new NGramTokenFilter(input, 0, 1);
} catch (IllegalArgumentException e) {
gotException = true;
}
assertTrue(gotException);
}
public void testUnigrams() throws Exception {
NGramTokenFilter filter = new NGramTokenFilter(input, 1, 1);
assertTokenStreamContents(filter, new String[]{"a","b","c","d","e"}, new int[]{0,1,2,3,4}, new int[]{1,2,3,4,5});
}
public void testBigrams() throws Exception {
NGramTokenFilter filter = new NGramTokenFilter(input, 2, 2);
assertTokenStreamContents(filter, new String[]{"ab","bc","cd","de"}, new int[]{0,1,2,3}, new int[]{2,3,4,5});
}
public void testNgrams() throws Exception {
NGramTokenFilter filter = new NGramTokenFilter(input, 1, 3);
assertTokenStreamContents(filter,
assertTrue(gotException);
}
public void testUnigrams() throws Exception {
NGramTokenFilter filter = new NGramTokenFilter(input, 1, 1);
assertTokenStreamContents(filter, new String[]{"a","b","c","d","e"}, new int[]{0,1,2,3,4}, new int[]{1,2,3,4,5});
}
public void testBigrams() throws Exception {
NGramTokenFilter filter = new NGramTokenFilter(input, 2, 2);
assertTokenStreamContents(filter, new String[]{"ab","bc","cd","de"}, new int[]{0,1,2,3}, new int[]{2,3,4,5});
}
public void testNgrams() throws Exception {
NGramTokenFilter filter = new NGramTokenFilter(input, 1, 3);
assertTokenStreamContents(filter,
new String[]{"a","b","c","d","e", "ab","bc","cd","de", "abc","bcd","cde"},
new int[]{0,1,2,3,4, 0,1,2,3, 0,1,2},
new int[]{1,2,3,4,5, 2,3,4,5, 3,4,5}
);
}
public void testOversizedNgrams() throws Exception {
NGramTokenFilter filter = new NGramTokenFilter(input, 6, 7);
assertTokenStreamContents(filter, new String[0], new int[0], new int[0]);
}
public void testSmallTokenInStream() throws Exception {
input = new MockTokenizer(new StringReader("abc de fgh"), MockTokenizer.WHITESPACE, false);
NGramTokenFilter filter = new NGramTokenFilter(input, 3, 3);
assertTokenStreamContents(filter, new String[]{"abc","fgh"}, new int[]{0,7}, new int[]{3,10});
}
public void testReset() throws Exception {
WhitespaceTokenizer tokenizer = new WhitespaceTokenizer(TEST_VERSION_CURRENT, new StringReader("abcde"));
NGramTokenFilter filter = new NGramTokenFilter(tokenizer, 1, 1);
assertTokenStreamContents(filter, new String[]{"a","b","c","d","e"}, new int[]{0,1,2,3,4}, new int[]{1,2,3,4,5});
tokenizer.reset(new StringReader("abcde"));
assertTokenStreamContents(filter, new String[]{"a","b","c","d","e"}, new int[]{0,1,2,3,4}, new int[]{1,2,3,4,5});
}
// LUCENE-3642
// EdgeNgram blindly adds term length to offset, but this can take things out of bounds
// wrt original text if a previous filter increases the length of the word (in this case æ -> ae)
// so in this case we behave like WDF, and preserve any modified offsets
public void testInvalidOffsets() throws Exception {
Analyzer analyzer = new Analyzer() {
@Override
protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
Tokenizer tokenizer = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false);
TokenFilter filters = new ASCIIFoldingFilter(tokenizer);
filters = new NGramTokenFilter(filters, 2, 2);
return new TokenStreamComponents(tokenizer, filters);
}
};
assertAnalyzesTo(analyzer, "mosfellsbær",
new String[] { "mo", "os", "sf", "fe", "el", "ll", "ls", "sb", "ba", "ae", "er" },
new int[] { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
new int[] { 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11 });
}
);
}
public void testOversizedNgrams() throws Exception {
NGramTokenFilter filter = new NGramTokenFilter(input, 6, 7);
assertTokenStreamContents(filter, new String[0], new int[0], new int[0]);
}
public void testSmallTokenInStream() throws Exception {
input = new MockTokenizer(new StringReader("abc de fgh"), MockTokenizer.WHITESPACE, false);
NGramTokenFilter filter = new NGramTokenFilter(input, 3, 3);
assertTokenStreamContents(filter, new String[]{"abc","fgh"}, new int[]{0,7}, new int[]{3,10});
}
public void testReset() throws Exception {
WhitespaceTokenizer tokenizer = new WhitespaceTokenizer(TEST_VERSION_CURRENT, new StringReader("abcde"));
NGramTokenFilter filter = new NGramTokenFilter(tokenizer, 1, 1);
assertTokenStreamContents(filter, new String[]{"a","b","c","d","e"}, new int[]{0,1,2,3,4}, new int[]{1,2,3,4,5});
tokenizer.reset(new StringReader("abcde"));
assertTokenStreamContents(filter, new String[]{"a","b","c","d","e"}, new int[]{0,1,2,3,4}, new int[]{1,2,3,4,5});
}
// LUCENE-3642
// EdgeNgram blindly adds term length to offset, but this can take things out of bounds
// wrt original text if a previous filter increases the length of the word (in this case æ -> ae)
// so in this case we behave like WDF, and preserve any modified offsets
public void testInvalidOffsets() throws Exception {
Analyzer analyzer = new Analyzer() {
@Override
protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
Tokenizer tokenizer = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false);
TokenFilter filters = new ASCIIFoldingFilter(tokenizer);
filters = new NGramTokenFilter(filters, 2, 2);
return new TokenStreamComponents(tokenizer, filters);
}
};
assertAnalyzesTo(analyzer, "mosfellsbær",
new String[] { "mo", "os", "sf", "fe", "el", "ll", "ls", "sb", "ba", "ae", "er" },
new int[] { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
new int[] { 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11 });
}
/** blast some random strings through the analyzer */
public void testRandomStrings() throws Exception {
Analyzer a = new Analyzer() {
@Override
protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
Tokenizer tokenizer = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false);
return new TokenStreamComponents(tokenizer,
new NGramTokenFilter(tokenizer, 2, 15));
}
};
checkRandomData(random, a, 10000*RANDOM_MULTIPLIER);
}
}

View File

@@ -18,71 +18,86 @@ package org.apache.lucene.analysis.ngram;
*/
import java.io.Reader;
import java.io.StringReader;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.analysis.Tokenizer;
/**
* Tests {@link NGramTokenizer} for correctness.
*/
public class NGramTokenizerTest extends BaseTokenStreamTestCase {
private StringReader input;
@Override
public void setUp() throws Exception {
super.setUp();
input = new StringReader("abcde");
private StringReader input;
@Override
public void setUp() throws Exception {
super.setUp();
input = new StringReader("abcde");
}
public void testInvalidInput() throws Exception {
boolean gotException = false;
try {
new NGramTokenizer(input, 2, 1);
} catch (IllegalArgumentException e) {
gotException = true;
}
public void testInvalidInput() throws Exception {
boolean gotException = false;
try {
new NGramTokenizer(input, 2, 1);
} catch (IllegalArgumentException e) {
gotException = true;
}
assertTrue(gotException);
assertTrue(gotException);
}
public void testInvalidInput2() throws Exception {
boolean gotException = false;
try {
new NGramTokenizer(input, 0, 1);
} catch (IllegalArgumentException e) {
gotException = true;
}
public void testInvalidInput2() throws Exception {
boolean gotException = false;
try {
new NGramTokenizer(input, 0, 1);
} catch (IllegalArgumentException e) {
gotException = true;
}
assertTrue(gotException);
}
public void testUnigrams() throws Exception {
NGramTokenizer tokenizer = new NGramTokenizer(input, 1, 1);
assertTokenStreamContents(tokenizer, new String[]{"a","b","c","d","e"}, new int[]{0,1,2,3,4}, new int[]{1,2,3,4,5}, 5 /* abcde */);
}
public void testBigrams() throws Exception {
NGramTokenizer tokenizer = new NGramTokenizer(input, 2, 2);
assertTokenStreamContents(tokenizer, new String[]{"ab","bc","cd","de"}, new int[]{0,1,2,3}, new int[]{2,3,4,5}, 5 /* abcde */);
}
public void testNgrams() throws Exception {
NGramTokenizer tokenizer = new NGramTokenizer(input, 1, 3);
assertTokenStreamContents(tokenizer,
new String[]{"a","b","c","d","e", "ab","bc","cd","de", "abc","bcd","cde"},
new int[]{0,1,2,3,4, 0,1,2,3, 0,1,2},
new int[]{1,2,3,4,5, 2,3,4,5, 3,4,5},
5 /* abcde */
assertTrue(gotException);
}
public void testUnigrams() throws Exception {
NGramTokenizer tokenizer = new NGramTokenizer(input, 1, 1);
assertTokenStreamContents(tokenizer, new String[]{"a","b","c","d","e"}, new int[]{0,1,2,3,4}, new int[]{1,2,3,4,5}, 5 /* abcde */);
}
public void testBigrams() throws Exception {
NGramTokenizer tokenizer = new NGramTokenizer(input, 2, 2);
assertTokenStreamContents(tokenizer, new String[]{"ab","bc","cd","de"}, new int[]{0,1,2,3}, new int[]{2,3,4,5}, 5 /* abcde */);
}
public void testNgrams() throws Exception {
NGramTokenizer tokenizer = new NGramTokenizer(input, 1, 3);
assertTokenStreamContents(tokenizer,
new String[]{"a","b","c","d","e", "ab","bc","cd","de", "abc","bcd","cde"},
new int[]{0,1,2,3,4, 0,1,2,3, 0,1,2},
new int[]{1,2,3,4,5, 2,3,4,5, 3,4,5},
5 /* abcde */
);
}
public void testOversizedNgrams() throws Exception {
NGramTokenizer tokenizer = new NGramTokenizer(input, 6, 7);
assertTokenStreamContents(tokenizer, new String[0], new int[0], new int[0], 5 /* abcde */);
}
public void testReset() throws Exception {
NGramTokenizer tokenizer = new NGramTokenizer(input, 1, 1);
assertTokenStreamContents(tokenizer, new String[]{"a","b","c","d","e"}, new int[]{0,1,2,3,4}, new int[]{1,2,3,4,5}, 5 /* abcde */);
tokenizer.reset(new StringReader("abcde"));
assertTokenStreamContents(tokenizer, new String[]{"a","b","c","d","e"}, new int[]{0,1,2,3,4}, new int[]{1,2,3,4,5}, 5 /* abcde */);
}
}
public void testOversizedNgrams() throws Exception {
NGramTokenizer tokenizer = new NGramTokenizer(input, 6, 7);
assertTokenStreamContents(tokenizer, new String[0], new int[0], new int[0], 5 /* abcde */);
}
public void testReset() throws Exception {
NGramTokenizer tokenizer = new NGramTokenizer(input, 1, 1);
assertTokenStreamContents(tokenizer, new String[]{"a","b","c","d","e"}, new int[]{0,1,2,3,4}, new int[]{1,2,3,4,5}, 5 /* abcde */);
tokenizer.reset(new StringReader("abcde"));
assertTokenStreamContents(tokenizer, new String[]{"a","b","c","d","e"}, new int[]{0,1,2,3,4}, new int[]{1,2,3,4,5}, 5 /* abcde */);
}
/** blast some random strings through the analyzer */
public void testRandomStrings() throws Exception {
Analyzer a = new Analyzer() {
@Override
protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
Tokenizer tokenizer = new NGramTokenizer(reader, 2, 15);
return new TokenStreamComponents(tokenizer, tokenizer);
}
};
checkRandomData(random, a, 10000*RANDOM_MULTIPLIER);
}
}

View File

@@ -17,10 +17,13 @@ package org.apache.lucene.analysis.path;
* limitations under the License.
*/
import java.io.Reader;
import java.io.StringReader;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.analysis.CharStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.charfilter.MappingCharFilter;
import org.apache.lucene.analysis.charfilter.NormalizeCharMap;
@@ -193,4 +196,16 @@ public class TestPathHierarchyTokenizer extends BaseTokenStreamTestCase {
new int[]{1},
path.length());
}
/** blast some random strings through the analyzer */
public void testRandomStrings() throws Exception {
Analyzer a = new Analyzer() {
@Override
protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
Tokenizer tokenizer = new PathHierarchyTokenizer(reader);
return new TokenStreamComponents(tokenizer, tokenizer);
}
};
checkRandomData(random, a, 10000*RANDOM_MULTIPLIER);
}
}

View File

@@ -17,9 +17,13 @@ package org.apache.lucene.analysis.path;
* limitations under the License.
*/
import java.io.Reader;
import java.io.StringReader;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.Analyzer.TokenStreamComponents;
public class TestReversePathHierarchyTokenizer extends BaseTokenStreamTestCase {
@@ -154,4 +158,16 @@ public class TestReversePathHierarchyTokenizer extends BaseTokenStreamTestCase {
new int[]{1, 0},
path.length());
}
/** blast some random strings through the analyzer */
public void testRandomStrings() throws Exception {
Analyzer a = new Analyzer() {
@Override
protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
Tokenizer tokenizer = new ReversePathHierarchyTokenizer(reader);
return new TokenStreamComponents(tokenizer, tokenizer);
}
};
checkRandomData(random, a, 10000*RANDOM_MULTIPLIER);
}
}

View File

@@ -18,14 +18,17 @@
package org.apache.lucene.analysis.pattern;
import java.io.IOException;
import java.io.Reader;
import java.io.StringReader;
import java.util.regex.Pattern;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.analysis.CharReader;
import org.apache.lucene.analysis.CharStream;
import org.apache.lucene.analysis.MockTokenizer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
/**
* Tests {@link PatternReplaceCharFilter}
@@ -172,4 +175,21 @@ public class TestPatternReplaceCharFilter extends BaseTokenStreamTestCase {
private Pattern pattern( String p ){
return Pattern.compile( p );
}
/** blast some random strings through the analyzer */
public void testRandomStrings() throws Exception {
Analyzer a = new Analyzer() {
@Override
protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
Tokenizer tokenizer = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false);
return new TokenStreamComponents(tokenizer, tokenizer);
}
@Override
protected Reader initReader(Reader reader) {
return new PatternReplaceCharFilter(Pattern.compile("a"), "b", CharReader.get(reader));
}
};
checkRandomData(random, a, 10000*RANDOM_MULTIPLIER);
}
}

View File

@@ -17,10 +17,13 @@
package org.apache.lucene.analysis.pattern;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.analysis.MockTokenizer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import java.io.Reader;
import java.io.StringReader;
import java.util.regex.Pattern;
@@ -77,5 +80,28 @@ public class TestPatternReplaceFilter extends BaseTokenStreamTestCase {
assertTokenStreamContents(ts,
new String[] { "aa$fooaa$fooa$foo$", "a$", "caaaaaaaaa$" });
}
/** blast some random strings through the analyzer */
public void testRandomStrings() throws Exception {
Analyzer a = new Analyzer() {
@Override
protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
Tokenizer tokenizer = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false);
TokenStream filter = new PatternReplaceFilter(tokenizer, Pattern.compile("a"), "b", false);
return new TokenStreamComponents(tokenizer, filter);
}
};
checkRandomData(random, a, 10000*RANDOM_MULTIPLIER);
Analyzer b = new Analyzer() {
@Override
protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
Tokenizer tokenizer = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false);
TokenStream filter = new PatternReplaceFilter(tokenizer, Pattern.compile("a"), "b", true);
return new TokenStreamComponents(tokenizer, filter);
}
};
checkRandomData(random, b, 10000*RANDOM_MULTIPLIER);
}
}

View File

@@ -18,17 +18,22 @@
package org.apache.lucene.analysis.pattern;
import java.io.IOException;
import java.io.Reader;
import java.io.StringReader;
import java.util.ArrayList;
import java.util.List;
import java.util.regex.Pattern;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.analysis.CharReader;
import org.apache.lucene.analysis.CharStream;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.Analyzer.TokenStreamComponents;
import org.apache.lucene.analysis.charfilter.MappingCharFilter;
import org.apache.lucene.analysis.charfilter.NormalizeCharMap;
import org.apache.lucene.analysis.path.PathHierarchyTokenizer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
public class TestPatternTokenizer extends BaseTokenStreamTestCase
@@ -117,4 +122,35 @@ public class TestPatternTokenizer extends BaseTokenStreamTestCase
in.close();
return out.toString();
}
/** blast some random strings through the analyzer */
public void testRandomStrings() throws Exception {
Analyzer a = new Analyzer() {
@Override
protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
Tokenizer tokenizer = null;
try {
tokenizer = new PatternTokenizer(reader, Pattern.compile("a"), -1);
} catch (IOException e) {
throw new RuntimeException(e);
}
return new TokenStreamComponents(tokenizer, tokenizer);
}
};
checkRandomData(random, a, 10000*RANDOM_MULTIPLIER);
Analyzer b = new Analyzer() {
@Override
protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
Tokenizer tokenizer = null;
try {
tokenizer = new PatternTokenizer(reader, Pattern.compile("a"), 0);
} catch (IOException e) {
throw new RuntimeException(e);
}
return new TokenStreamComponents(tokenizer, tokenizer);
}
};
checkRandomData(random, b, 10000*RANDOM_MULTIPLIER);
}
}

View File

@@ -17,11 +17,14 @@
package org.apache.lucene.analysis.reverse;
import java.io.Reader;
import java.io.StringReader;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.MockTokenizer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.util.Version;
public class TestReverseStringFilter extends BaseTokenStreamTestCase {
@@ -96,4 +99,16 @@ public class TestReverseStringFilter extends BaseTokenStreamTestCase {
ReverseStringFilter.reverse(TEST_VERSION_CURRENT, buffer, 3, 7);
assertEquals("abcfed𩬅愯瀛", new String(buffer));
}
/** blast some random strings through the analyzer */
public void testRandomStrings() throws Exception {
Analyzer a = new Analyzer() {
@Override
protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
Tokenizer tokenizer = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false);
return new TokenStreamComponents(tokenizer, new ReverseStringFilter(TEST_VERSION_CURRENT, tokenizer));
}
};
checkRandomData(random, a, 10000*RANDOM_MULTIPLIER);
}
}

View File

@@ -18,9 +18,12 @@ package org.apache.lucene.analysis.shingle;
*/
import java.io.IOException;
import java.io.Reader;
import java.io.StringReader;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.analysis.MockTokenizer;
import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
@@ -1129,4 +1132,16 @@ public class ShingleFilterTest extends BaseTokenStreamTestCase {
token.setPositionIncrement(positionIncrement);
return token;
}
/** blast some random strings through the analyzer */
public void testRandomStrings() throws Exception {
Analyzer a = new Analyzer() {
@Override
protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
Tokenizer tokenizer = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false);
return new TokenStreamComponents(tokenizer, new ShingleFilter(tokenizer));
}
};
checkRandomData(random, a, 10000*RANDOM_MULTIPLIER);
}
}

View File

@@ -160,7 +160,7 @@ public class TestSegmentingTokenizerBase extends BaseTokenStreamTestCase {
hasSentence = false;
clearAttributes();
termAtt.copyBuffer(buffer, sentenceStart, sentenceEnd-sentenceStart);
offsetAtt.setOffset(offset+sentenceStart, offset+sentenceEnd);
offsetAtt.setOffset(correctOffset(offset+sentenceStart), correctOffset(offset+sentenceEnd));
return true;
} else {
return false;
@@ -215,7 +215,7 @@ public class TestSegmentingTokenizerBase extends BaseTokenStreamTestCase {
clearAttributes();
termAtt.copyBuffer(buffer, wordStart, wordEnd-wordStart);
offsetAtt.setOffset(offset+wordStart, offset+wordEnd);
offsetAtt.setOffset(correctOffset(offset+wordStart), correctOffset(offset+wordEnd));
posIncAtt.setPositionIncrement(posIncAtt.getPositionIncrement() + posBoost);
posBoost = 0;
return true;

View File

@@ -18,12 +18,15 @@
package org.apache.lucene.analysis.wikipedia;
import java.io.Reader;
import java.io.StringReader;
import java.io.IOException;
import java.util.Set;
import java.util.HashSet;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.tokenattributes.FlagsAttribute;
import static org.apache.lucene.analysis.wikipedia.WikipediaTokenizer.*;
@@ -169,4 +172,17 @@ public class WikipediaTokenizerTest extends BaseTokenStreamTestCase {
assertFalse(tf.incrementToken());
tf.close();
}
/** blast some random strings through the analyzer */
public void testRandomStrings() throws Exception {
Analyzer a = new Analyzer() {
@Override
protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
Tokenizer tokenizer = new WikipediaTokenizer(reader);
return new TokenStreamComponents(tokenizer, tokenizer);
}
};
checkRandomData(random, a, 10000*RANDOM_MULTIPLIER);
}
}

View File

@@ -112,7 +112,24 @@ are part of the ICU4C package. See http://site.icu-project.org/ </echo>
</assertions>
</java>
</target>
<property name="html.strip.charfilter.supp.macros.output.file"
location="../common/src/java/org/apache/lucene/analysis/charfilter/HTMLStripCharFilter.SUPPLEMENTARY.jflex-macro"/>
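<!-- Regenerates the supplementary-codepoint jflex macros: runs GenerateHTMLStripCharFilterSupplementaryMacros and captures its standard output into the file named by the property above. -->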
<target name="gen-html-strip-charfilter-supp-macros" depends="compile-tools">
<java
classname="org.apache.lucene.analysis.icu.GenerateHTMLStripCharFilterSupplementaryMacros"
dir="."
fork="true"
failonerror="true"
output="${html.strip.charfilter.supp.macros.output.file}">
<classpath>
<path refid="additional.dependencies"/>
<pathelement location="${build.dir}/classes/tools"/>
</classpath>
</java>
</target>
<target name="compile-tools" depends="common.compile-tools">
<compile
srcdir="src/tools/java"

View File

@@ -111,7 +111,7 @@ public final class ICUTokenizer extends Tokenizer {
@Override
public void end() throws IOException {
final int finalOffset = (length < 0) ? offset : offset + length;
offsetAtt.setOffset(finalOffset, finalOffset);
offsetAtt.setOffset(correctOffset(finalOffset), correctOffset(finalOffset));
}
/*

View File

@@ -0,0 +1,110 @@
package org.apache.lucene.analysis.icu;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.text.DateFormat;
import java.util.*;
import com.ibm.icu.text.UnicodeSet;
import com.ibm.icu.text.UnicodeSetIterator;
import com.ibm.icu.util.VersionInfo;
/** creates a macro to augment jflex's unicode support for > BMP */
public class GenerateHTMLStripCharFilterSupplementaryMacros {
private static final UnicodeSet BMP = new UnicodeSet("[\u0000-\uFFFF]");
private static final String NL = System.getProperty("line.separator");
private static final DateFormat DATE_FORMAT = DateFormat.getDateTimeInstance
(DateFormat.FULL, DateFormat.FULL, Locale.US);
static {
DATE_FORMAT.setTimeZone(TimeZone.getTimeZone("UTC"));
}
private static final String APACHE_LICENSE
= "/*" + NL
+ " * Copyright 2010 The Apache Software Foundation." + NL
+ " *" + NL
+ " * Licensed under the Apache License, Version 2.0 (the \"License\");" + NL
+ " * you may not use this file except in compliance with the License." + NL
+ " * You may obtain a copy of the License at" + NL
+ " *" + NL
+ " * http://www.apache.org/licenses/LICENSE-2.0" + NL
+ " *" + NL
+ " * Unless required by applicable law or agreed to in writing, software" + NL
+ " * distributed under the License is distributed on an \"AS IS\" BASIS," + NL
+ " * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied." + NL
+ " * See the License for the specific language governing permissions and" + NL
+ " * limitations under the License." + NL
+ " */" + NL + NL;
public static void main(String args[]) throws Exception {
outputHeader();
outputMacro("ID_Start_Supp", "[:ID_Start:]");
outputMacro("ID_Continue_Supp", "[:ID_Continue:]");
}
static void outputHeader() {
System.out.print(APACHE_LICENSE);
System.out.print("// Generated using ICU4J " + VersionInfo.ICU_VERSION.toString() + " on ");
System.out.println(DATE_FORMAT.format(new Date()));
System.out.println("// by " + GenerateHTMLStripCharFilterSupplementaryMacros.class.getName());
System.out.print(NL + NL);
}
// we have to carefully output the possibilities as compact utf-16
// range expressions, or jflex will OOM!
static void outputMacro(String name, String pattern) {
UnicodeSet set = new UnicodeSet(pattern);
set.removeAll(BMP);
System.out.println(name + " = (");
// if the set is empty, we have to do this or jflex will barf
if (set.isEmpty()) {
System.out.println("\t []");
}
HashMap<Character,UnicodeSet> utf16ByLead = new HashMap<Character,UnicodeSet>();
for (UnicodeSetIterator it = new UnicodeSetIterator(set); it.next();) {
char utf16[] = Character.toChars(it.codepoint);
UnicodeSet trails = utf16ByLead.get(utf16[0]);
if (trails == null) {
trails = new UnicodeSet();
utf16ByLead.put(utf16[0], trails);
}
trails.add(utf16[1]);
}
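// second pass: group lead surrogates by identical trail sets, so leads that share a trail regex collapse into a single alternative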
Map<String,UnicodeSet> utf16ByTrail = new HashMap<String,UnicodeSet>();
for (Map.Entry<Character,UnicodeSet> entry : utf16ByLead.entrySet()) {
String trail = entry.getValue().getRegexEquivalent();
UnicodeSet leads = utf16ByTrail.get(trail);
if (leads == null) {
leads = new UnicodeSet();
utf16ByTrail.put(trail, leads);
}
leads.add(entry.getKey());
}
boolean isFirst = true;
for (Map.Entry<String,UnicodeSet> entry : utf16ByTrail.entrySet()) {
System.out.print( isFirst ? "\t " : "\t| ");
isFirst = false;
System.out.println(entry.getValue().getRegexEquivalent() + entry.getKey());
}
System.out.println(")");
}
}

View File

@@ -102,4 +102,9 @@ public class TestMorfologikAnalyzer extends BaseTokenStreamTestCase {
assertPOSToken(ts, "list", "subst:sg:loc.voc:m3");
assertPOSToken(ts, "lista", "subst:sg:dat.loc:f");
}
/** blast some random strings through the analyzer */
public void testRandom() throws Exception {
checkRandomData(random, getTestAnalyzer(), 10000 * RANDOM_MULTIPLIER);
}
}

View File

@@ -16,11 +16,17 @@
*/
package org.apache.lucene.analysis.phonetic;
import java.io.Reader;
import java.io.StringReader;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.analysis.MockTokenizer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.Analyzer.TokenStreamComponents;
import org.apache.lucene.analysis.core.WhitespaceTokenizer;
import org.apache.lucene.util._TestUtil;
public class DoubleMetaphoneFilterTest extends BaseTokenStreamTestCase {
@@ -65,4 +71,28 @@ public class DoubleMetaphoneFilterTest extends BaseTokenStreamTestCase {
assertTokenStreamContents(filter, new String[] { "12345", "#$%@#^%&", "HL" });
}
public void testRandom() throws Exception {
final int codeLen = _TestUtil.nextInt(random, 1, 8);
Analyzer a = new Analyzer() {
@Override
protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
Tokenizer tokenizer = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false);
return new TokenStreamComponents(tokenizer, new DoubleMetaphoneFilter(tokenizer, codeLen, false));
}
};
checkRandomData(random, a, 1000 * RANDOM_MULTIPLIER);
Analyzer b = new Analyzer() {
@Override
protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
Tokenizer tokenizer = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false);
return new TokenStreamComponents(tokenizer, new DoubleMetaphoneFilter(tokenizer, codeLen, true));
}
};
checkRandomData(random, b, 1000 * RANDOM_MULTIPLIER);
}
}

View File

@@ -17,6 +17,8 @@
package org.apache.lucene.analysis.phonetic;
import java.io.IOException;
import java.io.Reader;
import java.io.StringReader;
import org.apache.commons.codec.Encoder;
@@ -25,7 +27,9 @@ import org.apache.commons.codec.language.DoubleMetaphone;
import org.apache.commons.codec.language.Metaphone;
import org.apache.commons.codec.language.RefinedSoundex;
import org.apache.commons.codec.language.Soundex;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.analysis.MockTokenizer;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.core.WhitespaceTokenizer;
@@ -70,4 +74,33 @@ public class TestPhoneticFilter extends BaseTokenStreamTestCase {
PhoneticFilter filter = new PhoneticFilter(tokenizer, encoder, inject);
assertTokenStreamContents(filter, expected);
}
/** blast some random strings through the analyzer */
public void testRandomStrings() throws IOException {
Encoder encoders[] = new Encoder[] {
new Metaphone(), new DoubleMetaphone(), new Soundex(), new RefinedSoundex(), new Caverphone()
};
for (final Encoder e : encoders) {
Analyzer a = new Analyzer() {
@Override
protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
Tokenizer tokenizer = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false);
return new TokenStreamComponents(tokenizer, new PhoneticFilter(tokenizer, e, false));
}
};
checkRandomData(random, a, 1000*RANDOM_MULTIPLIER);
Analyzer b = new Analyzer() {
@Override
protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
Tokenizer tokenizer = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false);
return new TokenStreamComponents(tokenizer, new PhoneticFilter(tokenizer, e, false));
}
};
checkRandomData(random, b, 1000*RANDOM_MULTIPLIER);
}
}
}

View File

@@ -199,9 +199,6 @@ public abstract class PerfTask implements Cloneable {
return new String(c);
}
/* (non-Javadoc)
* @see java.lang.Object#toString()
*/
@Override
public String toString() {
String padd = getPadding();
@@ -248,22 +245,23 @@ public abstract class PerfTask implements Cloneable {
}
/**
* Task setup work that should not be measured for that specific task.
* By default it does nothing, but tasks can implement this, moving work from
* doLogic() to this method. Only the work done in doLogicis measured for this task.
* Notice that higher level (sequence) tasks containing this task would then
* measure larger time than the sum of their contained tasks.
* @throws Exception
* Task setup work that should not be measured for that specific task. By
* default it does nothing, but tasks can implement this, moving work from
* {@link #doLogic()} to this method. Only the work done in {@link #doLogic()}
* is measured for this task. Notice that higher level (sequence) tasks
* containing this task would then measure larger time than the sum of their
* contained tasks.
*/
public void setup () throws Exception {
}
/**
* Task tearDown work that should not be measured for that specific task.
* By default it does nothing, but tasks can implement this, moving work from
* doLogic() to this method. Only the work done in doLogicis measured for this task.
* Notice that higher level (sequence) tasks containing this task would then
* measure larger time than the sum of their contained tasks.
* Task tearDown work that should not be measured for that specific task. By
* default it does nothing, but tasks can implement this, moving work from
* {@link #doLogic()} to this method. Only the work done in {@link #doLogic()}
* is measured for this task. Notice that higher level (sequence) tasks
* containing this task would then measure larger time than the sum of their
* contained tasks.
*/
public void tearDown() throws Exception {
if (++logStepCount % logStep == 0) {
@@ -274,16 +272,20 @@ public abstract class PerfTask implements Cloneable {
}
/**
* Sub classes that supports parameters must override this method to return true.
* Sub classes that support parameters must override this method to return
* true.
*
* @return true iff this task supports command line params.
*/
public boolean supportsParams () {
return false;
}
/**
* Set the params of this task.
* @exception UnsupportedOperationException for tasks supporting command line parameters.
*
* @exception UnsupportedOperationException
* for tasks supporting command line parameters.
*/
public void setParams(String params) {
if (!supportsParams()) {
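The reworked javadoc above spells out the contract: preparation belongs in setup(), cleanup in tearDown(), and only doLogic() is timed. A minimal hypothetical task illustrating that split; the class name, file name, and counting logic are invented for illustration, and it assumes only PerfTask's usual members (a PerfRunData constructor, an int-returning doLogic(), and the setup()/tearDown() hooks shown above):

import java.io.BufferedReader;
import java.io.FileReader;

import org.apache.lucene.benchmark.byTask.PerfRunData;
import org.apache.lucene.benchmark.byTask.tasks.PerfTask;

/** Hypothetical task: opening and closing the file is excluded from timing; only doLogic() is measured. */
public class CountLinesTask extends PerfTask {
  private BufferedReader reader;

  public CountLinesTask(PerfRunData runData) {
    super(runData);
  }

  @Override
  public void setup() throws Exception {
    super.setup();
    reader = new BufferedReader(new FileReader("work/lines.txt")); // un-measured preparation
  }

  @Override
  public int doLogic() throws Exception {
    int lines = 0;
    while (reader.readLine() != null) {
      lines++; // only this loop contributes to the reported time
    }
    return lines; // number of work items done
  }

  @Override
  public void tearDown() throws Exception {
    reader.close(); // un-measured cleanup
    super.tearDown();
  }
}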

View File

@@ -4,6 +4,7 @@ import java.io.IOException;
import java.util.Iterator;
import java.util.Map;
import java.util.Map.Entry;
import java.util.concurrent.atomic.AtomicInteger;
import java.util.concurrent.locks.ReadWriteLock;
import java.util.concurrent.locks.ReentrantReadWriteLock;
import java.util.logging.Level;
@@ -100,6 +101,9 @@ public class DirectoryTaxonomyReader implements TaxonomyReader {
private volatile boolean closed = false;
// set refCount to 1 at start
private final AtomicInteger refCount = new AtomicInteger(1);
/**
* Open for reading a taxonomy stored in a given {@link Directory}.
* @param directory
@@ -130,7 +134,7 @@ public class DirectoryTaxonomyReader implements TaxonomyReader {
* @throws AlreadyClosedException if this IndexReader is closed
*/
protected final void ensureOpen() throws AlreadyClosedException {
if (indexReader.getRefCount() <= 0) {
if (getRefCount() <= 0) {
throw new AlreadyClosedException("this TaxonomyReader is closed");
}
}
@@ -415,8 +419,12 @@ public class DirectoryTaxonomyReader implements TaxonomyReader {
public void close() throws IOException {
if (!closed) {
decRef();
closed = true;
synchronized (this) {
if (!closed) {
decRef();
closed = true;
}
}
}
}
@@ -555,27 +563,31 @@ public class DirectoryTaxonomyReader implements TaxonomyReader {
}
/**
* Expert: decreases the refCount of this TaxonomyReader instance.
* If the refCount drops to 0, then pending changes (if any) are
* committed to the taxonomy index and this reader is closed.
* @throws IOException
* Expert: decreases the refCount of this TaxonomyReader instance. If the
* refCount drops to 0, then this reader is closed.
*/
public void decRef() throws IOException {
ensureOpen();
if (indexReader.getRefCount() == 1) {
// Do not decRef the indexReader - doClose does it by calling reader.close()
doClose();
} else {
indexReader.decRef();
final int rc = refCount.decrementAndGet();
if (rc == 0) {
boolean success = false;
try {
doClose();
success = true;
} finally {
if (!success) {
// Put reference back on failure
refCount.incrementAndGet();
}
}
} else if (rc < 0) {
throw new IllegalStateException("too many decRef calls: refCount is " + rc + " after decrement");
}
}
/**
* Expert: returns the current refCount for this taxonomy reader
*/
/** Expert: returns the current refCount for this taxonomy reader */
public int getRefCount() {
ensureOpen();
return this.indexReader.getRefCount();
return refCount.get();
}
/**
@@ -587,6 +599,6 @@ public class DirectoryTaxonomyReader implements TaxonomyReader {
*/
public void incRef() {
ensureOpen();
this.indexReader.incRef();
refCount.incrementAndGet();
}
}

View File

@@ -11,6 +11,7 @@ import org.apache.lucene.facet.taxonomy.directory.DirectoryTaxonomyWriter;
import org.apache.lucene.index.IndexWriterConfig.OpenMode;
import org.apache.lucene.store.AlreadyClosedException;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.RAMDirectory;
import org.apache.lucene.util.IOUtils;
import org.apache.lucene.util.LuceneTestCase;
import org.junit.Test;
@@ -178,4 +179,28 @@ public class TestDirectoryTaxonomyReader extends LuceneTestCase {
}
}
@Test
public void testRefreshAndRefCount() throws Exception {
Directory dir = new RAMDirectory(); // no need for random directories here
DirectoryTaxonomyWriter taxoWriter = new DirectoryTaxonomyWriter(dir);
taxoWriter.addCategory(new CategoryPath("a"));
taxoWriter.commit();
DirectoryTaxonomyReader taxoReader = new DirectoryTaxonomyReader(dir);
assertEquals("wrong refCount", 1, taxoReader.getRefCount());
taxoReader.incRef();
assertEquals("wrong refCount", 2, taxoReader.getRefCount());
taxoWriter.addCategory(new CategoryPath("a", "b"));
taxoWriter.commit();
taxoReader.refresh();
assertEquals("wrong refCount", 2, taxoReader.getRefCount());
taxoWriter.close();
taxoReader.close();
dir.close();
}
}

View File

@@ -42,7 +42,7 @@
<h2>Search-time joins</h2>
<p>
The query time joining is terms based and implemented as two pass search. The first pass collects all the terms from a fromField
The query time joining is index term based and implemented as two pass search. The first pass collects all the terms from a fromField
that match the fromQuery. The second pass returns all documents that have matching terms in a toField to the terms
collected in the first pass.
</p>
@@ -62,7 +62,7 @@
<pre class="prettyprint">
String fromField = "from"; // Name of the from field
boolean multipleValuesPerDocument = false; // Set only yo true in the case when your fromField has multiple values per document in your index
String fromField = "to"; // Name of the to field
String toField = "to"; // Name of the to field
Query fromQuery = new TermQuery(new Term("content", searchTerm)); // Query executed to collect from values to join to the to values
MultiTermQuery joinQuery = JoinUtil.createJoinQuery(fromField, multipleValuesPerDocument, toField, fromQuery, fromSearcher);
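To make the corrected snippet concrete, here is a self-contained sketch of the two-pass join described above; the searcher and variable names are illustrative, and the createJoinQuery signature is the one shown in the snippet:

import org.apache.lucene.index.Term;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.MultiTermQuery;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.search.join.JoinUtil;

public class JoinUsageSketch {
  /** First pass collects fromField terms matching fromQuery; second pass matches them against toField. */
  static TopDocs join(IndexSearcher fromSearcher, IndexSearcher toSearcher, String searchTerm) throws Exception {
    String fromField = "from";                 // name of the from field
    boolean multipleValuesPerDocument = false; // set to true only when fromField holds multiple values per document
    String toField = "to";                     // name of the to field
    Query fromQuery = new TermQuery(new Term("content", searchTerm));
    MultiTermQuery joinQuery =
        JoinUtil.createJoinQuery(fromField, multipleValuesPerDocument, toField, fromQuery, fromSearcher);
    return toSearcher.search(joinQuery, 10);   // documents whose toField contains the collected terms
  }
}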

View File

@@ -24,11 +24,11 @@ $Id$
================== 4.0.0-dev ==================
Versions of Major Components
---------------------
Apache Tika 0.10
Apache Tika 1.0
Carrot2 3.5.0
Velocity 1.6.4 and Velocity Tools 2.0
Apache UIMA 2.3.1
Apache ZooKeeper 3.3.3
Apache ZooKeeper 3.3.4
Upgrading from Solr 3.6-dev
@@ -401,6 +401,14 @@ Upgrading from Solr 3.5
* As doGet() methods in SimplePostTool was changed to static, the client applications of this
class need to be recompiled.
* In Solr version 3.5 and earlier, HTMLStripCharFilter had known bugs in the
character offsets it provided, triggering e.g. exceptions in highlighting.
HTMLStripCharFilter has been re-implemented, addressing this and other
issues. See the entry for LUCENE-3690 in the Bug Fixes section below for a
detailed list of changes. For people who depend on the behavior of
HTMLStripCharFilter in Solr version 3.5 and earlier: the old implementation
(bugs and all) is preserved as LegacyHTMLStripCharFilter.
New Features
----------------------
* SOLR-2904: BinaryUpdateRequestHandler should be able to accept multiple update requests from
@@ -442,6 +450,10 @@ New Features
* SOLR-1709: Distributed support for Date and Numeric Range Faceting
(Peter Sturge, David Smiley, hossman, Simon Willnauer)
* SOLR-3054, LUCENE-3671: Add TypeTokenFilterFactory that creates TypeTokenFilter
that filters tokens based on their TypeAttribute. (Tommaso Teofili via
Uwe Schindler)
Optimizations
----------------------
* SOLR-1931: Speedup for LukeRequestHandler and admin/schema browser. New parameter
@@ -483,6 +495,52 @@ Bug Fixes
* SOLR-2970: CSV ResponseWriter returns fields defined as stored=false in schema (janhoy)
* LUCENE-3690, LUCENE-2208, SOLR-882, SOLR-42: Re-implemented
HTMLStripCharFilter as a JFlex-generated scanner. See below for a list
of bug fixes and other changes. To get the same behavior as
HTMLStripCharFilter in Solr version 3.5 and earlier (including the bugs),
use LegacyHTMLStripCharFilter, which is the previous implementation.
Behavior changes from the previous version:
- Known offset bugs are fixed.
- The "Mark invalid" exceptions reported in SOLR-1283 are no longer
triggered (the bug is still present in LegacyHTMLStripCharFilter).
- The character entity "&apos;" is now always properly decoded.
- More cases of <script> tags are now properly stripped.
- CDATA sections are now handled properly.
- Valid tag name characters now include the supplementary Unicode characters
from Unicode character classes [:ID_Start:] and [:ID_Continue:].
- Uppercase character entities "&QUOT;", "&COPY;", "&GT;", "&LT;", "&REG;",
and "&AMP;" are now recognized and handled as if they were in lowercase.
- The REPLACEMENT CHARACTER U+FFFD is now used to replace numeric character
entities for unpaired UTF-16 low and high surrogates (in the range
[U+D800-U+DFFF]).
- Properly paired numeric character entities for UTF-16 surrogates are now
converted to the corresponding code units.
- Opening tags with unbalanced quotation marks are now properly stripped.
- Literal "<" and ">" characters in opening tags, regardless of whether they
appear inside quotation marks, now inhibit recognition (and stripping) of
the tags. The only exception to this is for values of event-handler
attributes, e.g. "onClick", "onLoad", "onSelect".
- A newline '\n' is substituted instead of a space for stripped HTML markup.
- Nothing is substituted for opening and closing inline tags - they are
simply removed. The list of inline tags is (case insensitively): <a>,
<abbr>, <acronym>, <b>, <basefont>, <bdo>, <big>, <cite>, <code>, <dfn>,
<em>, <font>, <i>, <img>, <input>, <kbd>, <label>, <q>, <s>, <samp>,
<select>, <small>, <span>, <strike>, <strong>, <sub>, <sup>, <textarea>,
<tt>, <u>, and <var>.
- HTMLStripCharFilterFactory now handles HTMLStripCharFilter's "escapedTags"
feature: opening and closing tags with the given names, including any
attributes and their values, are left intact in the output.
(Steve Rowe)
* LUCENE-3717: Fixed offset bugs in TrimFilter, WordDelimiterFilter, and
HyphenatedWordsFilter where they would create invalid offsets in
some situations, leading to problems in highlighting. (Robert Muir)
* SOLR-2280: commitWithin ignored for a delete query (Juan Grande via janhoy)
Other Changes
----------------------
* SOLR-2922: Upgrade commons-io and commons-lang to 2.1 and 2.6, respectively. (koji)
@@ -498,6 +556,8 @@ Other Changes
* SOLR-2718: Add ability to lazy load response writers, defined with startup="lazy".
(ehatcher)
* SOLR-2901: Upgrade Solr to Tika 1.0 (janhoy)
Build
----------------------
* SOLR-2487: Add build target to package war without slf4j jars (janhoy)

View File

@@ -482,7 +482,7 @@
<packageset dir="contrib/langid/src/java"/>
<packageset dir="contrib/uima/src/java"/>
<group title="Core" packages="org.apache.*" />
<group title="SolrJ" packages="org.apache.solr.common.*,org.apache.solr.client.solrj*" />
<group title="SolrJ" packages="org.apache.solr.common.*,org.apache.solr.client.solrj.*,org.apache.zookeeper.*" />
<group title="contrib: Clustering" packages="org.apache.solr.handler.clustering*" />
<group title="contrib: DataImportHandler" packages="org.apache.solr.handler.dataimport*" />
<group title="contrib: Solr Cell" packages="org.apache.solr.handler.extraction*" />

View File

@@ -0,0 +1,25 @@
#!/usr/bin/env bash
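# Builds the example, then starts a two-shard SolrCloud pair: the first node runs embedded ZooKeeper (-DzkRun) and bootstraps the config from solr/conf; the second node joins it via -DzkHost=localhost:9983.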
cd ..
rm -r -f example2
rm -r -f dist
rm -r -f build
rm -r -f example/solr/zoo_data
rm -f example/example.log
ant example dist
cp -r -f example example2
cd example
java -DzkRun -DnumShards=2 -DSTOP.PORT=7983 -DSTOP.KEY=key -Dbootstrap_confdir=solr/conf -jar start.jar 1>example.log 2>&1 &
sleep 10
cd ../example2
java -Djetty.port=9574 -DzkRun -DzkHost=localhost:9983 -DnumShards=2 -DSTOP.PORT=6574 -DSTOP.KEY=key -jar start.jar 1>example2.log 2>&1 &

View File

@@ -0,0 +1,34 @@
#!/usr/bin/env bash
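# Four-node variant of the script above: the example is copied three times and all four nodes are started with -DnumShards=2; the first node bootstraps the config and runs ZooKeeper on port 9983.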
cd ..
rm -r -f example2
rm -r -f example3
rm -r -f example4
rm -r -f dist
rm -r -f build
rm -r -f example/solr/zoo_data
rm -f example/example.log
ant example dist
cp -r -f example example2
cp -r -f example example3
cp -r -f example example4
cd example
java -DzkRun -DnumShards=2 -DSTOP.PORT=7983 -DSTOP.KEY=key -Dbootstrap_confdir=solr/conf -jar start.jar 1>example.log 2>&1 &
sleep 10
cd ../example2
java -Djetty.port=9574 -DzkRun -DzkHost=localhost:9983 -DnumShards=2 -DSTOP.PORT=6574 -DSTOP.KEY=key -jar start.jar 1>example2.log 2>&1 &
cd ../example3
java -Djetty.port=9575 -DzkRun -DzkHost=localhost:9983 -DnumShards=2 -DSTOP.PORT=6575 -DSTOP.KEY=key -jar start.jar 1>example3.log 2>&1 &
cd ../example4
java -Djetty.port=9576 -DzkHost=localhost:9983 -DnumShards=2 -DSTOP.PORT=6576 -DSTOP.KEY=key -jar start.jar 1>example4.log 2>&1 &

View File

@@ -0,0 +1,33 @@
#!/usr/bin/env bash
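# Like the previous script, but the first three nodes each pass -DzkRun with a shared -DzkHost list, forming a ZooKeeper ensemble; the fourth node only connects to that ensemble.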
cd ..
rm -r -f example2
rm -r -f example3
rm -r -f example4
rm -r -f dist
rm -r -f build
rm -r -f example/solr/zoo_data
rm -f example/example.log
ant example dist
cp -r -f example example2
cp -r -f example example3
cp -r -f example example4
cd example
java -DzkRun -DnumShards=2 -DSTOP.PORT=7983 -DSTOP.KEY=key -Dbootstrap_confdir=solr/conf -DzkHost=localhost:9983,localhost:14574,localhost:14585 -jar start.jar 1>example.log 2>&1 &
sleep 10
cd ../example2
java -Djetty.port=13574 -DzkRun -DzkHost=localhost:9983,localhost:14574,localhost:14575 -DnumShards=2 -DSTOP.PORT=6574 -DSTOP.KEY=key -jar start.jar 1>example2.log 2>&1 &
cd ../example3
java -Djetty.port=13585 -DzkRun -DzkHost=localhost:9983,localhost:14574,localhost:14585 -DnumShards=2 -DSTOP.PORT=6575 -DSTOP.KEY=key -jar start.jar 1>example3.log 2>&1 &
cd ../example4
java -Djetty.port=13596 -DzkHost=localhost:9983,localhost:14574,localhost:14585 -DnumShards=2 -DSTOP.PORT=6576 -DSTOP.KEY=key -jar start.jar 1>example4.log 2>&1 &

Some files were not shown because too many files have changed in this diff.