mirror of https://github.com/apache/poi.git
Bug 50955 - try the originally guessed codepage first; back off to Windows-1252 if that fails
git-svn-id: https://svn.apache.org/repos/asf/poi/trunk@1790904 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent
1f113bc64a
commit
97c5c54eb7
|
@ -96,25 +96,17 @@ public class HWPFOldDocument extends HWPFDocumentCore {
|
||||||
} else {
|
} else {
|
||||||
// TODO Discover if these older documents can ever hold Unicode Strings?
|
// TODO Discover if these older documents can ever hold Unicode Strings?
|
||||||
// (We think not, because they seem to lack a Piece table)
|
// (We think not, because they seem to lack a Piece table)
|
||||||
// TODO Build the Piece Descriptor properly
|
//
|
||||||
// (We have to fake it, as they don't seem to have a proper Piece table)
|
// What we have here is a wretched hack. We need to figure out
|
||||||
PieceDescriptor pd = new PieceDescriptor(new byte[] {0,0, 0,0,0,127, 0,0}, 0, guessedCharset);
|
// how to get the correct charset for the doc.
|
||||||
pd.setFilePosition(_fib.getFibBase().getFcMin());
|
TextPiece tp = null;
|
||||||
|
try {
|
||||||
// Generate a single Text Piece Table, with a single Text Piece
|
tp = buildTextPiece(guessedCharset);
|
||||||
// which covers all the (8 bit only) text in the file
|
} catch (IllegalStateException e) {
|
||||||
tpt = new OldTextPieceTable();
|
// If there was a problem with the guessed charset and the length of the
|
||||||
byte[] textData = new byte[_fib.getFibBase().getFcMac()-_fib.getFibBase().getFcMin()];
|
// text piece, back off to Windows-1252 (StringUtil.WIN_1252). This is effectively what we used to do.
|
||||||
System.arraycopy(_mainStream, _fib.getFibBase().getFcMin(), textData, 0, textData.length);
|
tp = buildTextPiece(StringUtil.WIN_1252);
|
||||||
|
|
||||||
int numChars = textData.length;
|
|
||||||
if (CodePageUtil.DOUBLE_BYTE_CHARSETS.contains(guessedCharset)) {
|
|
||||||
numChars /= 2;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
TextPiece tp = new TextPiece(
|
|
||||||
0, numChars, textData, pd
|
|
||||||
);
|
|
||||||
tpt.add(tp);
|
tpt.add(tp);
|
||||||
|
|
||||||
}
|
}
|
||||||
|
@ -156,6 +148,33 @@ public class HWPFOldDocument extends HWPFDocumentCore {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
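* Builds a single text piece covering the bytes from fcMin to fcMac of the main stream, with a piece descriptor that uses the given charset. Also resets {@code tpt} to a new {@code OldTextPieceTable}.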
|
||||||
|
* @param guessedCharset the charset we believe the document text is encoded in
|
||||||
|
* @return a new text piece
|
||||||
|
* @throws IllegalStateException if the length of the text isn't correct for the given charset
|
||||||
|
*/
|
||||||
|
private TextPiece buildTextPiece(Charset guessedCharset) throws IllegalStateException {
|
||||||
|
PieceDescriptor pd = new PieceDescriptor(new byte[] {0,0, 0,0,0,127, 0,0}, 0, guessedCharset);
|
||||||
|
pd.setFilePosition(_fib.getFibBase().getFcMin());
|
||||||
|
|
||||||
|
// Generate a single Text Piece Table, with a single Text Piece
|
||||||
|
// which covers all the (8 bit only) text in the file
|
||||||
|
tpt = new OldTextPieceTable();
|
||||||
|
byte[] textData = new byte[_fib.getFibBase().getFcMac()-_fib.getFibBase().getFcMin()];
|
||||||
|
System.arraycopy(_mainStream, _fib.getFibBase().getFcMin(), textData, 0, textData.length);
|
||||||
|
|
||||||
|
int numChars = textData.length;
|
||||||
|
if (CodePageUtil.DOUBLE_BYTE_CHARSETS.contains(guessedCharset)) {
|
||||||
|
numChars /= 2;
|
||||||
|
}
|
||||||
|
|
||||||
|
return new TextPiece(
|
||||||
|
0, numChars, textData, pd
|
||||||
|
);
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Take the first codepage that is not default, ansi or symbol.
|
* Take the first codepage that is not default, ansi or symbol.
|
||||||
|
|
Loading…
Reference in New Issue