diff --git a/contrib/benchmark/README.enwiki b/contrib/benchmark/README.enwiki index f9d49300e87..62999495908 100644 --- a/contrib/benchmark/README.enwiki +++ b/contrib/benchmark/README.enwiki @@ -20,3 +20,50 @@ After that, ant enwiki should process the data set and run a load test. Ant targets get-enwiki, expand-enwiki, and extract-enwiki can also be used to download, decompress, and extract (to individual files in work/enwiki) the dataset, respectively. + +NOTE: This bug in Xerces: + + https://issues.apache.org/jira/browse/XERCESJ-1257 + +which is still present as of 2.9.1, causes an exception like this when +processing Wikipedia's XML: + +Caused by: org.apache.xerces.impl.io.MalformedByteSequenceException: Invalid byte 2 of 4-byte UTF-8 sequence. + at org.apache.xerces.util.ErrorHandlerWrapper.createSAXParseException(Unknown Source) + at org.apache.xerces.util.ErrorHandlerWrapper.fatalError(Unknown Source) + at org.apache.xerces.impl.XMLErrorReporter.reportError(Unknown Source) + at org.apache.xerces.impl.XMLErrorReporter.reportError(Unknown Source) + at org.apache.xerces.impl.XMLDocumentFragmentScannerImpl$FragmentContentDispatcher.dispatch(Unknown Source) + at org.apache.xerces.impl.XMLDocumentFragmentScannerImpl.scanDocument(Unknown Source) + at org.apache.xerces.parsers.XML11Configuration.parse(Unknown Source) + at org.apache.xerces.parsers.XML11Configuration.parse(Unknown Source) + at org.apache.xerces.parsers.XMLParser.parse(Unknown Source) + at org.apache.xerces.parsers.AbstractSAXParser.parse(Unknown Source) + at org.apache.lucene.benchmark.byTask.feeds.EnwikiDocMaker$Parser.run(EnwikiDocMaker.java:77) + ... 1 more + +The original poster in the Xerces bug provided this patch: + +--- UTF8Reader.java 2006-11-23 00:36:53.000000000 +0100 ++++ /home/rainman/lucene/xerces-2_9_0/src/org/apache/xerces/impl/io/UTF8Reader.java 2008-04-04 00:40:58.000000000 +0200 +@@ -534,6 +534,16 @@ + invalidByte(4, 4, b2); + } + ++ // check if output buffer is large enough to hold 2 surrogate chars ++ if( out + 1 >= offset + length ){ ++ fBuffer[0] = (byte)b0; ++ fBuffer[1] = (byte)b1; ++ fBuffer[2] = (byte)b2; ++ fBuffer[3] = (byte)b3; ++ fOffset = 4; ++ return out - offset; ++ } ++ + // decode bytes into surrogate characters + int uuuuu = ((b0 << 2) & 0x001C) | ((b1 >> 4) & 0x0003); + if (uuuuu > 0x10) { + +which I've applied to Xerces 2.9.1 sources, and committed under +lib/xerces-2.9.1-patched-XERCESJ-1257.jar. Once XERCESJ-1257 is fixed +we can upgrade to a standard Xerces release. diff --git a/contrib/benchmark/build.xml b/contrib/benchmark/build.xml index 05174e962d6..25209865db2 100644 --- a/contrib/benchmark/build.xml +++ b/contrib/benchmark/build.xml @@ -104,7 +104,7 @@ - + diff --git a/contrib/benchmark/lib/xerces-2.9.0.jar b/contrib/benchmark/lib/xerces-2.9.0.jar deleted file mode 100644 index b190fd95304..00000000000 --- a/contrib/benchmark/lib/xerces-2.9.0.jar +++ /dev/null @@ -1,2 +0,0 @@ -AnyObjectId[50df93072e50efffced25bc814f0f20426a6b69c] was removed in git history. -Apache SVN contains full history. \ No newline at end of file diff --git a/contrib/benchmark/lib/xerces-2.9.1-patched-XERCESJ-1257.jar b/contrib/benchmark/lib/xerces-2.9.1-patched-XERCESJ-1257.jar new file mode 100644 index 00000000000..e1665118f98 --- /dev/null +++ b/contrib/benchmark/lib/xerces-2.9.1-patched-XERCESJ-1257.jar @@ -0,0 +1,2 @@ +AnyObjectId[ea000b3b79ec201e637841bebce16cf004231096] was removed in git history. +Apache SVN contains full history. \ No newline at end of file