Add HPBF hyperlinks support to the extractor

git-svn-id: https://svn.apache.org/repos/asf/poi/trunk@690729 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Nick Burch 2008-08-31 16:58:29 +00:00
parent ea9ed46b9a
commit 5fb6697aee
5 changed files with 77 additions and 0 deletions

View File

@ -37,6 +37,7 @@
<!-- Don't forget to update status.xml too! --> <!-- Don't forget to update status.xml too! -->
<release version="3.1.1-alpha1" date="2008-??-??"> <release version="3.1.1-alpha1" date="2008-??-??">
<action dev="POI-DEVELOPERS" type="add">Support for HPBF Publisher hyperlinks, including during text extraction</action>
<action dev="POI-DEVELOPERS" type="fix">26321 and 44958 - preserve position of ArrayRecords and TableRecords among cell value records</action> <action dev="POI-DEVELOPERS" type="fix">26321 and 44958 - preserve position of ArrayRecords and TableRecords among cell value records</action>
<action dev="POI-DEVELOPERS" type="fix">Improve empty header or footer handling in HWPF HeaderStories</action> <action dev="POI-DEVELOPERS" type="fix">Improve empty header or footer handling in HWPF HeaderStories</action>
<action dev="POI-DEVELOPERS" type="fix">Avoid NPE in hssf.usermodel.HeaderFooter when stripping fields out</action> <action dev="POI-DEVELOPERS" type="fix">Avoid NPE in hssf.usermodel.HeaderFooter when stripping fields out</action>

View File

@ -34,6 +34,7 @@
<!-- Don't forget to update changes.xml too! --> <!-- Don't forget to update changes.xml too! -->
<changes> <changes>
<release version="3.1.1-alpha1" date="2008-??-??"> <release version="3.1.1-alpha1" date="2008-??-??">
<action dev="POI-DEVELOPERS" type="add">Support for HPBF Publisher hyperlinks, including during text extraction</action>
<action dev="POI-DEVELOPERS" type="fix">26321 and 44958 - preserve position of ArrayRecords and TableRecords among cell value records</action> <action dev="POI-DEVELOPERS" type="fix">26321 and 44958 - preserve position of ArrayRecords and TableRecords among cell value records</action>
<action dev="POI-DEVELOPERS" type="fix">Improve empty header or footer handling in HWPF HeaderStories</action> <action dev="POI-DEVELOPERS" type="fix">Improve empty header or footer handling in HWPF HeaderStories</action>
<action dev="POI-DEVELOPERS" type="fix">Avoid NPE in hssf.usermodel.HeaderFooter when stripping fields out</action> <action dev="POI-DEVELOPERS" type="fix">Avoid NPE in hssf.usermodel.HeaderFooter when stripping fields out</action>

View File

@ -24,6 +24,7 @@ import org.apache.poi.POIOLE2TextExtractor;
import org.apache.poi.hpbf.HPBFDocument; import org.apache.poi.hpbf.HPBFDocument;
import org.apache.poi.hpbf.model.qcbits.QCBit; import org.apache.poi.hpbf.model.qcbits.QCBit;
import org.apache.poi.hpbf.model.qcbits.QCTextBit; import org.apache.poi.hpbf.model.qcbits.QCTextBit;
import org.apache.poi.hpbf.model.qcbits.QCPLCBit.Type12;
import org.apache.poi.poifs.filesystem.POIFSFileSystem; import org.apache.poi.poifs.filesystem.POIFSFileSystem;
/** /**
@ -31,6 +32,7 @@ import org.apache.poi.poifs.filesystem.POIFSFileSystem;
*/ */
public class PublisherTextExtractor extends POIOLE2TextExtractor { public class PublisherTextExtractor extends POIOLE2TextExtractor {
private HPBFDocument doc; private HPBFDocument doc;
private boolean hyperlinksByDefault = false;
public PublisherTextExtractor(HPBFDocument doc) { public PublisherTextExtractor(HPBFDocument doc) {
super(doc); super(doc);
@ -43,6 +45,16 @@ public class PublisherTextExtractor extends POIOLE2TextExtractor {
this(new POIFSFileSystem(is)); this(new POIFSFileSystem(is));
} }
/**
* Controls whether a call to {@link #getText()} also returns
* the document's hyperlinks.
* When enabled, each hyperlink is appended after the extracted
* text on its own line, wrapped in angle brackets. The links
* are not (yet) placed inline at the position they apply to.
* Default is no (false).
* @param hyperlinksByDefault true to have getText() include hyperlinks
*/
public void setHyperlinksByDefault(boolean hyperlinksByDefault) {
this.hyperlinksByDefault = hyperlinksByDefault;
}
public String getText() { public String getText() {
StringBuffer text = new StringBuffer(); StringBuffer text = new StringBuffer();
@ -55,6 +67,24 @@ public class PublisherTextExtractor extends POIOLE2TextExtractor {
} }
} }
// If requested, add in the hyperlinks
// Ideally, we'd do these inline, but the hyperlink
// positions are relative to the text area the
// hyperlink is in, and we have yet to figure out
// how to tie that together.
if(hyperlinksByDefault) {
for(int i=0; i<bits.length; i++) {
if(bits[i] != null && bits[i] instanceof Type12) {
Type12 hyperlinks = (Type12)bits[i];
for(int j=0; j<hyperlinks.getNumberOfHyperlinks(); j++) {
text.append("<");
text.append(hyperlinks.getHyperlink(j));
text.append(">\n");
}
}
}
}
// Get more text // Get more text
// TODO // TODO

View File

@ -167,6 +167,10 @@ public class QCPLCBit extends QCBit {
/** /**
* Type 12 holds hyperlinks, and is very complex. * Type 12 holds hyperlinks, and is very complex.
* There is normally one of these for each text
* area that contains at least one hyperlink.
* The character offsets are relative to the start
* of the text area that this applies to.
*/ */
public static class Type12 extends QCPLCBit { public static class Type12 extends QCPLCBit {
private String[] hyperlinks; private String[] hyperlinks;
@ -249,6 +253,8 @@ public class QCPLCBit extends QCBit {
* Returns where in the text (in characters) the * Returns where in the text (in characters) the
* hyperlink at the given index starts * hyperlink at the given index starts
* applying to. * applying to.
* This position is relative to the text area that this
* PLCBit applies to.
* @param number The hyperlink number, zero based * @param number The hyperlink number, zero based
*/ */
public int getTextStartAt(int number) { public int getTextStartAt(int number) {
@ -258,6 +264,8 @@ public class QCPLCBit extends QCBit {
* Returns where in the text that this block * Returns where in the text that this block
* of hyperlinks stops applying to. Normally, * of hyperlinks stops applying to. Normally,
* but not always, the end of the text. * but not always, the end of the text.
* This position is relative to the text area that this
* PLCBit applies to.
*/ */
public int getAllTextEndAt() { public int getAllTextEndAt() {
return preData[numberOfPLCs+1]; return preData[numberOfPLCs+1];

View File

@ -134,4 +134,41 @@ public class TextPublisherTextExtractor extends TestCase {
assertEquals(s2007, s2000); assertEquals(s2007, s2000);
assertEquals(s2007, s98); assertEquals(s2007, s98);
} }
/**
* Test that the hyperlink extraction stuff works as well
* as we can hope it to: getText() must leave hyperlinks
* out by default, and once setHyperlinksByDefault(true)
* has been called it must list them after the text, one
* per line, each wrapped in angle brackets.
* @throws Exception if a test file cannot be opened or read
*/
public void testWithHyperlinks() throws Exception {
File f = new File(dir, "LinkAt10.pub");
HPBFDocument doc = new HPBFDocument(
new FileInputStream(f)
);
PublisherTextExtractor ext =
new PublisherTextExtractor(doc);
// NOTE(review): the result of this call is discarded - presumably
// it checks that getText() can safely be called more than once; confirm
ext.getText();
// Default is no hyperlinks
assertEquals("1234567890LINK\n", ext.getText());
// Turn on
// The single link is expected on its own line after the text
ext.setHyperlinksByDefault(true);
assertEquals("1234567890LINK\n<http://poi.apache.org/>\n", ext.getText());
// Now a much more complex document
f = new File(dir, "Sample.pub");
ext = new PublisherTextExtractor(new FileInputStream(f));
ext.setHyperlinksByDefault(true);
String text = ext.getText();
// Only the tail of the output is checked: the hyperlinks are
// expected last, in this order, including one empty entry ("<>")
assertTrue(text.endsWith(
"<http://poi.apache.org/>\n" +
"<C:\\Documents and Settings\\Nick\\My Documents\\Booleans.xlsx>\n" +
"<>\n" +
"<mailto:dev@poi.apache.org?subject=HPBF>\n" +
"<mailto:dev@poi.apache.org?subject=HPBF>\n"
));
}
} }