mirror of https://github.com/apache/poi.git
#59739 When testing for a valid NFIB in a FileInformationBlock, accept common "nearby" values as found from CommonCrawl
git-svn-id: https://svn.apache.org/repos/asf/poi/trunk@1750866 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent
e616b8cf12
commit
3680dc8992
|
@ -143,36 +143,45 @@ public final class FileInformationBlock implements Cloneable
|
||||||
|
|
||||||
private void assertCbRgFcLcb()
|
private void assertCbRgFcLcb()
|
||||||
{
|
{
|
||||||
switch ( getNFib() )
|
int nfib = getNFib();
|
||||||
|
String nfibHex = String.format("%04X", nfib);
|
||||||
|
|
||||||
|
// Note - CommonCrawl shows there's more variation in these than
|
||||||
|
// the documentation suggests, so accept common clusters around
|
||||||
|
// the "correct" value as well
|
||||||
|
switch ( nfib )
|
||||||
{
|
{
|
||||||
case 0x00C1:
|
case 0x0071:
|
||||||
assertCbRgFcLcb( "0x00C1", 0x005D, "0x005D", _cbRgFcLcb );
|
// Found in CommonCrawl corpus but not in the docs...
|
||||||
break;
|
break;
|
||||||
case 0x00D9:
|
case 0x00BE:
|
||||||
assertCbRgFcLcb( "0x00D9", 0x006C, "0x006C", _cbRgFcLcb );
|
case 0x00BF:
|
||||||
|
case 0x00C0:
|
||||||
|
case 0x00C1: // Docs "official"
|
||||||
|
case 0x00C2:
|
||||||
|
case 0x00C3:
|
||||||
|
assertCbRgFcLcb(nfibHex, 0x005D, "0x005D", _cbRgFcLcb );
|
||||||
|
break;
|
||||||
|
case 0x00D8:
|
||||||
|
case 0x00D9: // Docs "official"
|
||||||
|
assertCbRgFcLcb(nfibHex, 0x006C, "0x006C", _cbRgFcLcb );
|
||||||
break;
|
break;
|
||||||
case 0x0101:
|
case 0x0101:
|
||||||
assertCbRgFcLcb( "0x0101", 0x0088, "0x0088", _cbRgFcLcb );
|
assertCbRgFcLcb( "0x0101", 0x0088, "0x0088", _cbRgFcLcb );
|
||||||
break;
|
break;
|
||||||
case 0x010C:
|
// TODO Is CommonCrawl 265 = 0x109 the one above or below?
|
||||||
assertCbRgFcLcb( "0x010C", 0x00A4, "0x00A4", _cbRgFcLcb );
|
case 0x010B:
|
||||||
|
case 0x010C: // Docs "official"
|
||||||
|
assertCbRgFcLcb(nfibHex, 0x00A4, "0x00A4", _cbRgFcLcb );
|
||||||
break;
|
break;
|
||||||
case 0x0112:
|
case 0x0112:
|
||||||
assertCbRgFcLcb( "0x0112", 0x00B7, "0x00B7", _cbRgFcLcb );
|
assertCbRgFcLcb( "0x0112", 0x00B7, "0x00B7", _cbRgFcLcb );
|
||||||
break;
|
break;
|
||||||
default:
|
default:
|
||||||
/*
|
/* The Word spec has a much smaller list of "valid" values
|
||||||
When running with the large CommonCrawl corpus we found the following ids in documents that are processed fine:
|
* than what the large CommonCrawl corpus contains!
|
||||||
java.lang.IllegalStateException: Invalid file format version number: 113
|
|
||||||
java.lang.IllegalStateException: Invalid file format version number: 191
|
|
||||||
java.lang.IllegalStateException: Invalid file format version number: 192
|
|
||||||
java.lang.IllegalStateException: Invalid file format version number: 194
|
|
||||||
java.lang.IllegalStateException: Invalid file format version number: 195
|
|
||||||
java.lang.IllegalStateException: Invalid file format version number: 216
|
|
||||||
java.lang.IllegalStateException: Invalid file format version number: 265
|
|
||||||
java.lang.IllegalStateException: Invalid file format version number: 267
|
|
||||||
*/
|
*/
|
||||||
logger.log(POILogger.WARN, "Invalid file format version number: " + getNFib());
|
logger.log(POILogger.WARN, "Invalid file format version number: " + nfib + "("+nfibHex+")");
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
Loading…
Reference in New Issue