HPBF text extractor and unit tests

git-svn-id: https://svn.apache.org/repos/asf/poi/trunk@687443 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Nick Burch 2008-08-20 20:13:08 +00:00
parent 4262bdd181
commit f56ab22521
5 changed files with 238 additions and 1 deletions

View File

@ -0,0 +1,78 @@
/* ====================================================================
Licensed to the Apache Software Foundation (ASF) under one or more
contributor license agreements. See the NOTICE file distributed with
this work for additional information regarding copyright ownership.
The ASF licenses this file to You under the Apache License, Version 2.0
(the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==================================================================== */
package org.apache.poi.hpbf.extractor;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
import org.apache.poi.POIOLE2TextExtractor;
import org.apache.poi.hpbf.HPBFDocument;
import org.apache.poi.hpbf.model.qcbits.QCBit;
import org.apache.poi.hpbf.model.qcbits.QCTextBit;
import org.apache.poi.poifs.filesystem.POIFSFileSystem;
/**
 * Extracts text from HPBF Publisher files.
 *
 * <p>The text is taken from the {@link QCTextBit} entries of the
 * document's Quill Contents; other sources of text are not yet handled.</p>
 */
public class PublisherTextExtractor extends POIOLE2TextExtractor {
    /** The document the text is pulled from. */
    private final HPBFDocument doc;

    /**
     * Creates an extractor over an already opened document.
     *
     * @param doc the Publisher document to extract text from
     */
    public PublisherTextExtractor(HPBFDocument doc) {
        super(doc);
        this.doc = doc;
    }

    /**
     * Creates an extractor by building a {@link HPBFDocument} from the
     * given filesystem.
     *
     * @param fs the OLE2 filesystem holding the Publisher document
     * @throws IOException if the document cannot be created from the filesystem
     */
    public PublisherTextExtractor(POIFSFileSystem fs) throws IOException {
        this(new HPBFDocument(fs));
    }

    /**
     * Creates an extractor by building a {@link POIFSFileSystem} from the
     * given stream. The stream is not closed explicitly here; the caller
     * remains responsible for it.
     *
     * @param is the stream to build the filesystem from
     * @throws IOException if the filesystem or document cannot be created
     */
    public PublisherTextExtractor(InputStream is) throws IOException {
        this(new POIFSFileSystem(is));
    }

    /**
     * Returns the text of every {@link QCTextBit} in the Quill Contents,
     * concatenated in the order the bits are stored, with each '\r'
     * replaced by '\n'.
     *
     * @return the extracted text, empty if there are no text bits
     */
    public String getText() {
        StringBuffer text = new StringBuffer();

        // Get the text from the Quill Contents
        QCBit[] bits = doc.getQuillContents().getBits();
        for(int i=0; i<bits.length; i++) {
            // instanceof is false for null, so no separate null check is needed
            if(bits[i] instanceof QCTextBit) {
                QCTextBit t = (QCTextBit)bits[i];
                text.append( t.getText().replace('\r', '\n') );
            }
        }

        // Get more text
        // TODO

        return text.toString();
    }

    /**
     * Command line entry point: prints the text of each file named on the
     * command line to standard out, or a usage message to standard error
     * if no files were given.
     *
     * @param args paths of the Publisher files to extract text from
     * @throws Exception if a file cannot be opened or processed
     */
    public static void main(String[] args) throws Exception {
        if(args.length == 0) {
            System.err.println("Use:");
            System.err.println(" PublisherTextExtractor <file.pub>");
        }

        for(int i=0; i<args.length; i++) {
            FileInputStream in = new FileInputStream(args[i]);
            try {
                PublisherTextExtractor te = new PublisherTextExtractor(in);
                System.out.println(te.getText());
            } finally {
                // Release the file handle even if extraction fails, so that
                // processing many files does not leak descriptors
                in.close();
            }
        }
    }
}

View File

@ -26,6 +26,10 @@ public class QCTextBit extends QCBit {
super(thingType, bitType, data);
}
/**
* Returns the text. Note that line endings
* are \r and not \n
*/
public String getText() {
return StringUtil.getFromUnicodeLE(
data, 0, data.length/2

Binary file not shown.

View File

@ -0,0 +1,105 @@
/* ====================================================================
Licensed to the Apache Software Foundation (ASF) under one or more
contributor license agreements. See the NOTICE file distributed with
this work for additional information regarding copyright ownership.
The ASF licenses this file to You under the Apache License, Version 2.0
(the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==================================================================== */
package org.apache.poi.hpbf.extractor;
import java.io.File;
import java.io.FileInputStream;
import org.apache.poi.hpbf.HPBFDocument;
import junit.framework.TestCase;
/**
 * Exercises {@link PublisherTextExtractor} against the Sample.pub and
 * Simple.pub files located in the directory named by the
 * <code>HPBF.testdata.path</code> system property.
 *
 * NOTE(review): the class name looks like it was meant to be
 * TestPublisherTextExtractor - confirm before renaming.
 */
public class TextPublisherTextExtractor extends TestCase {
    /** Directory holding the test files, read from a system property. */
    private String dir;

    protected void setUp() throws Exception {
        dir = System.getProperty("HPBF.testdata.path");
    }

    /**
     * Opens the named file from the test data directory.
     */
    private FileInputStream open(String name) throws Exception {
        File testFile = new File(dir, name);
        return new FileInputStream(testFile);
    }

    /**
     * Checks that text extraction completes for both test files, once via
     * an explicit HPBFDocument and once straight from a stream.
     */
    public void testBasics() throws Exception {
        HPBFDocument sampleDoc = new HPBFDocument(open("Sample.pub"));
        PublisherTextExtractor extractor = new PublisherTextExtractor(sampleDoc);
        extractor.getText();

        extractor = new PublisherTextExtractor(open("Simple.pub"));
        extractor.getText();
    }

    /**
     * Checks the exact text extracted from both test files.
     */
    public void testContents() throws Exception {
        final String expectedSample =
            "This is some text on the first page\n" +
            "Its in times new roman, font size 10, all normal\n" +

            "This is in bold and italic\n" +
            "Its Arial, 20 point font\n" +
            "Its in the second textbox on the first page\n" +

            "This is the second page\n\n" +

            "It is also times new roman, 10 point\n" +

            "Table on page 2\nTop right\n" +
            "P2 table left\nP2 table right\n" +
            "Bottom Left\nBottom Right\n" +

            "This text is on page two\n" +
            "#This is a link to Apache POI\n" +
            "More normal text\n" +
            "Link to a file\n" +

            "More text, more hyperlinks\n" +
            "email link\n" +
            "Final hyperlink\n" +
            "Within doc to page 1\n";

        final String expectedSimple =
            "0123456789\n" +
            "0123456789abcdef\n" +
            "0123456789abcdef0123456789abcdef\n" +
            "0123456789\n" +
            "0123456789abcdef\n" +
            "0123456789abcdef0123456789abcdef\n" +
            "0123456789abcdef0123456789abcdef0123456789abcdef\n";

        HPBFDocument sampleDoc = new HPBFDocument(open("Sample.pub"));
        PublisherTextExtractor extractor = new PublisherTextExtractor(sampleDoc);
        String extracted = extractor.getText();
        assertEquals(expectedSample, extracted);

        // Now a simpler one
        extractor = new PublisherTextExtractor(open("Simple.pub"));
        extracted = extractor.getText();
        assertEquals(expectedSimple, extracted);
    }
}

View File

@ -0,0 +1,50 @@
/* ====================================================================
Licensed to the Apache Software Foundation (ASF) under one or more
contributor license agreements. See the NOTICE file distributed with
this work for additional information regarding copyright ownership.
The ASF licenses this file to You under the Apache License, Version 2.0
(the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==================================================================== */
package org.apache.poi.hpbf.model;
import java.io.File;
import java.io.FileInputStream;
import org.apache.poi.hpbf.HPBFDocument;
import junit.framework.TestCase;
/**
 * Checks that the Escher streams of the Sample.pub test file, located via
 * the <code>HPBF.testdata.path</code> system property, are present and
 * hold the expected number of records.
 */
public class TestEscherParts extends TestCase {
    /** Directory holding the test files, read from a system property. */
    private String dir;

    protected void setUp() throws Exception {
        dir = System.getProperty("HPBF.testdata.path");
    }

    /**
     * Both Escher streams must exist; the main one is expected to hold
     * 13 records and the delay one none.
     */
    public void testBasics() throws Exception {
        File sampleFile = new File(dir, "Sample.pub");
        FileInputStream sampleStream = new FileInputStream(sampleFile);
        HPBFDocument document = new HPBFDocument(sampleStream);

        EscherStm mainStream = document.getEscherStm();
        EscherDelayStm delayStream = document.getEscherDelayStm();

        assertNotNull(mainStream);
        assertNotNull(delayStream);

        int mainRecordCount = mainStream.getEscherRecords().length;
        assertEquals(13, mainRecordCount);

        int delayRecordCount = delayStream.getEscherRecords().length;
        assertEquals(0, delayRecordCount);

        // TODO - check the contents
    }
}