mirror of https://github.com/apache/poi.git
Add HPBF hyperlinks support to the extractor
git-svn-id: https://svn.apache.org/repos/asf/poi/trunk@690729 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent
ea9ed46b9a
commit
5fb6697aee
|
@ -37,6 +37,7 @@
|
||||||
|
|
||||||
<!-- Don't forget to update status.xml too! -->
|
<!-- Don't forget to update status.xml too! -->
|
||||||
<release version="3.1.1-alpha1" date="2008-??-??">
|
<release version="3.1.1-alpha1" date="2008-??-??">
|
||||||
|
<action dev="POI-DEVELOPERS" type="add">Support for HPBF Publisher hyperlinks, including during text extraction</action>
|
||||||
<action dev="POI-DEVELOPERS" type="fix">26321 and 44958 - preserve position of ArrayRecords and TableRecords among cell value records</action>
|
<action dev="POI-DEVELOPERS" type="fix">26321 and 44958 - preserve position of ArrayRecords and TableRecords among cell value records</action>
|
||||||
<action dev="POI-DEVELOPERS" type="fix">Improve empty header or footer handling in HWPF HeaderStories</action>
|
<action dev="POI-DEVELOPERS" type="fix">Improve empty header or footer handling in HWPF HeaderStories</action>
|
||||||
<action dev="POI-DEVELOPERS" type="fix">Avoid NPE in hssf.usermodel.HeaderFooter when stripping fields out</action>
|
<action dev="POI-DEVELOPERS" type="fix">Avoid NPE in hssf.usermodel.HeaderFooter when stripping fields out</action>
|
||||||
|
|
|
@ -34,6 +34,7 @@
|
||||||
<!-- Don't forget to update changes.xml too! -->
|
<!-- Don't forget to update changes.xml too! -->
|
||||||
<changes>
|
<changes>
|
||||||
<release version="3.1.1-alpha1" date="2008-??-??">
|
<release version="3.1.1-alpha1" date="2008-??-??">
|
||||||
|
<action dev="POI-DEVELOPERS" type="add">Support for HPBF Publisher hyperlinks, including during text extraction</action>
|
||||||
<action dev="POI-DEVELOPERS" type="fix">26321 and 44958 - preserve position of ArrayRecords and TableRecords among cell value records</action>
|
<action dev="POI-DEVELOPERS" type="fix">26321 and 44958 - preserve position of ArrayRecords and TableRecords among cell value records</action>
|
||||||
<action dev="POI-DEVELOPERS" type="fix">Improve empty header or footer handling in HWPF HeaderStories</action>
|
<action dev="POI-DEVELOPERS" type="fix">Improve empty header or footer handling in HWPF HeaderStories</action>
|
||||||
<action dev="POI-DEVELOPERS" type="fix">Avoid NPE in hssf.usermodel.HeaderFooter when stripping fields out</action>
|
<action dev="POI-DEVELOPERS" type="fix">Avoid NPE in hssf.usermodel.HeaderFooter when stripping fields out</action>
|
||||||
|
|
|
@ -24,6 +24,7 @@ import org.apache.poi.POIOLE2TextExtractor;
|
||||||
import org.apache.poi.hpbf.HPBFDocument;
|
import org.apache.poi.hpbf.HPBFDocument;
|
||||||
import org.apache.poi.hpbf.model.qcbits.QCBit;
|
import org.apache.poi.hpbf.model.qcbits.QCBit;
|
||||||
import org.apache.poi.hpbf.model.qcbits.QCTextBit;
|
import org.apache.poi.hpbf.model.qcbits.QCTextBit;
|
||||||
|
import org.apache.poi.hpbf.model.qcbits.QCPLCBit.Type12;
|
||||||
import org.apache.poi.poifs.filesystem.POIFSFileSystem;
|
import org.apache.poi.poifs.filesystem.POIFSFileSystem;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
@ -31,6 +32,7 @@ import org.apache.poi.poifs.filesystem.POIFSFileSystem;
|
||||||
*/
|
*/
|
||||||
public class PublisherTextExtractor extends POIOLE2TextExtractor {
|
public class PublisherTextExtractor extends POIOLE2TextExtractor {
|
||||||
private HPBFDocument doc;
|
private HPBFDocument doc;
|
||||||
|
private boolean hyperlinksByDefault = false;
|
||||||
|
|
||||||
public PublisherTextExtractor(HPBFDocument doc) {
|
public PublisherTextExtractor(HPBFDocument doc) {
|
||||||
super(doc);
|
super(doc);
|
||||||
|
@ -43,6 +45,16 @@ public class PublisherTextExtractor extends POIOLE2TextExtractor {
|
||||||
this(new POIFSFileSystem(is));
|
this(new POIFSFileSystem(is));
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Should a call to getText() return hyperlinks inline
|
||||||
|
* with the text?
|
||||||
|
* Default is no
|
||||||
|
*/
|
||||||
|
public void setHyperlinksByDefault(boolean hyperlinksByDefault) {
|
||||||
|
this.hyperlinksByDefault = hyperlinksByDefault;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
public String getText() {
|
public String getText() {
|
||||||
StringBuffer text = new StringBuffer();
|
StringBuffer text = new StringBuffer();
|
||||||
|
|
||||||
|
@ -55,6 +67,24 @@ public class PublisherTextExtractor extends POIOLE2TextExtractor {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// If requested, add in the hyperlinks
|
||||||
|
// Ideally, we'd do these inline, but the hyperlink
|
||||||
|
// positions are relative to the text area the
|
||||||
|
// hyperlink is in, and we have yet to figure out
|
||||||
|
// how to tie that together.
|
||||||
|
if(hyperlinksByDefault) {
|
||||||
|
for(int i=0; i<bits.length; i++) {
|
||||||
|
if(bits[i] != null && bits[i] instanceof Type12) {
|
||||||
|
Type12 hyperlinks = (Type12)bits[i];
|
||||||
|
for(int j=0; j<hyperlinks.getNumberOfHyperlinks(); j++) {
|
||||||
|
text.append("<");
|
||||||
|
text.append(hyperlinks.getHyperlink(j));
|
||||||
|
text.append(">\n");
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
// Get more text
|
// Get more text
|
||||||
// TODO
|
// TODO
|
||||||
|
|
||||||
|
|
|
@ -167,6 +167,10 @@ public class QCPLCBit extends QCBit {
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Type 12 holds hyperlinks, and is very complex.
|
* Type 12 holds hyperlinks, and is very complex.
|
||||||
|
* There is normally one of these for each text
|
||||||
|
* area that contains at least one hyperlink.
|
||||||
|
* The character offsets are relative to the start
|
||||||
|
* of the text area that this applies to.
|
||||||
*/
|
*/
|
||||||
public static class Type12 extends QCPLCBit {
|
public static class Type12 extends QCPLCBit {
|
||||||
private String[] hyperlinks;
|
private String[] hyperlinks;
|
||||||
|
@ -249,6 +253,8 @@ public class QCPLCBit extends QCBit {
|
||||||
* Returns where in the text (in characters) the
|
* Returns where in the text (in characters) the
|
||||||
* hyperlink at the given index starts
|
* hyperlink at the given index starts
|
||||||
* applying to.
|
* applying to.
|
||||||
|
* This position is relative to the text area that this
|
||||||
|
* PLCBit applies to.
|
||||||
* @param number The hyperlink number, zero based
|
* @param number The hyperlink number, zero based
|
||||||
*/
|
*/
|
||||||
public int getTextStartAt(int number) {
|
public int getTextStartAt(int number) {
|
||||||
|
@ -258,6 +264,8 @@ public class QCPLCBit extends QCBit {
|
||||||
* Returns where in the text that this block
|
* Returns where in the text that this block
|
||||||
* of hyperlinks stops applying to. Normally,
|
* of hyperlinks stops applying to. Normally,
|
||||||
* but not always, the end of the text.
|
* but not always, the end of the text.
|
||||||
|
* This position is relative to the text area that this
|
||||||
|
* PLCBit applies to.
|
||||||
*/
|
*/
|
||||||
public int getAllTextEndAt() {
|
public int getAllTextEndAt() {
|
||||||
return preData[numberOfPLCs+1];
|
return preData[numberOfPLCs+1];
|
||||||
|
|
|
@ -134,4 +134,41 @@ public class TextPublisherTextExtractor extends TestCase {
|
||||||
assertEquals(s2007, s2000);
|
assertEquals(s2007, s2000);
|
||||||
assertEquals(s2007, s98);
|
assertEquals(s2007, s98);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Test that the hyperlink extraction stuff works as well
|
||||||
|
* as we can hope it to.
|
||||||
|
*/
|
||||||
|
public void testWithHyperlinks() throws Exception {
|
||||||
|
File f = new File(dir, "LinkAt10.pub");
|
||||||
|
HPBFDocument doc = new HPBFDocument(
|
||||||
|
new FileInputStream(f)
|
||||||
|
);
|
||||||
|
|
||||||
|
PublisherTextExtractor ext =
|
||||||
|
new PublisherTextExtractor(doc);
|
||||||
|
ext.getText();
|
||||||
|
|
||||||
|
// Default is no hyperlinks
|
||||||
|
assertEquals("1234567890LINK\n", ext.getText());
|
||||||
|
|
||||||
|
// Turn on
|
||||||
|
ext.setHyperlinksByDefault(true);
|
||||||
|
assertEquals("1234567890LINK\n<http://poi.apache.org/>\n", ext.getText());
|
||||||
|
|
||||||
|
|
||||||
|
// Now a much more complex document
|
||||||
|
f = new File(dir, "Sample.pub");
|
||||||
|
ext = new PublisherTextExtractor(new FileInputStream(f));
|
||||||
|
ext.setHyperlinksByDefault(true);
|
||||||
|
String text = ext.getText();
|
||||||
|
|
||||||
|
assertTrue(text.endsWith(
|
||||||
|
"<http://poi.apache.org/>\n" +
|
||||||
|
"<C:\\Documents and Settings\\Nick\\My Documents\\Booleans.xlsx>\n" +
|
||||||
|
"<>\n" +
|
||||||
|
"<mailto:dev@poi.apache.org?subject=HPBF>\n" +
|
||||||
|
"<mailto:dev@poi.apache.org?subject=HPBF>\n"
|
||||||
|
));
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
Loading…
Reference in New Issue