51519 -- follow on, make concatenation of rPh configurable

git-svn-id: https://svn.apache.org/repos/asf/poi/trunk@1786021 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Tim Allison 2017-03-08 16:44:40 +00:00
parent 396cd26693
commit 7ffc645a5f
4 changed files with 85 additions and 32 deletions

View File

@ -78,6 +78,8 @@ import org.xml.sax.helpers.DefaultHandler;
* *
*/ */
public class ReadOnlySharedStringsTable extends DefaultHandler { public class ReadOnlySharedStringsTable extends DefaultHandler {
private final boolean includePhoneticRuns;
/** /**
* An integer representing the total count of strings in the workbook. This count does not * An integer representing the total count of strings in the workbook. This count does not
* include any numbers, it counts only the total of text strings in the workbook. * include any numbers, it counts only the total of text strings in the workbook.
@ -103,12 +105,29 @@ public class ReadOnlySharedStringsTable extends DefaultHandler {
private Map<Integer, String> phoneticStrings; private Map<Integer, String> phoneticStrings;
/** /**
* Calls {@link #ReadOnlySharedStringsTable(OPCPackage, boolean)} with
* a value of <code>true</code> for including phonetic runs
*
* @param pkg The {@link OPCPackage} to use as basis for the shared-strings table. * @param pkg The {@link OPCPackage} to use as basis for the shared-strings table.
* @throws IOException If reading the data from the package fails. * @throws IOException If reading the data from the package fails.
* @throws SAXException if parsing the XML data fails. * @throws SAXException if parsing the XML data fails.
*/ */
public ReadOnlySharedStringsTable(OPCPackage pkg) public ReadOnlySharedStringsTable(OPCPackage pkg)
throws IOException, SAXException { throws IOException, SAXException {
this(pkg, true);
}
/**
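* Creates a shared-strings table from the given package, optionally
* concatenating phonetic runs onto each shared string.
*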
* @param pkg The {@link OPCPackage} to use as basis for the shared-strings table.
* @param includePhoneticRuns whether or not to concatenate phoneticRuns onto the shared string
* @since POI 3.14-Beta3
* @throws IOException If reading the data from the package fails.
* @throws SAXException if parsing the XML data fails.
*/
public ReadOnlySharedStringsTable(OPCPackage pkg, boolean includePhoneticRuns)
throws IOException, SAXException {
this.includePhoneticRuns = includePhoneticRuns;
ArrayList<PackagePart> parts = ArrayList<PackagePart> parts =
pkg.getPartsByContentType(XSSFRelation.SHARED_STRINGS.getContentType()); pkg.getPartsByContentType(XSSFRelation.SHARED_STRINGS.getContentType());
@ -121,10 +140,24 @@ public class ReadOnlySharedStringsTable extends DefaultHandler {
/** /**
* Like POIXMLDocumentPart constructor * Like POIXMLDocumentPart constructor
* *
* Calls {@link #ReadOnlySharedStringsTable(PackagePart, boolean)}, with a
* value of <code>true</code> to include phonetic runs.
*
* @since POI 3.14-Beta1 * @since POI 3.14-Beta1
*/ */
public ReadOnlySharedStringsTable(PackagePart part) throws IOException, SAXException { public ReadOnlySharedStringsTable(PackagePart part) throws IOException, SAXException {
this(part, true);
}
/**
* Like POIXMLDocumentPart constructor
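*
* @param part the {@link PackagePart} whose input stream the shared strings are read from
* @param includePhoneticRuns whether or not to concatenate phoneticRuns onto the shared string
*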
* @since POI 3.14-Beta3
*/
public ReadOnlySharedStringsTable(PackagePart part, boolean includePhoneticRuns)
throws IOException, SAXException {
this.includePhoneticRuns = includePhoneticRuns;
readFrom(part.getInputStream()); readFrom(part.getInputStream());
} }
@ -184,22 +217,6 @@ public class ReadOnlySharedStringsTable extends DefaultHandler {
return strings.get(idx); return strings.get(idx);
} }
/**
* Return the phonetic string at a given index.
* Returns <code>null</code> if no phonetic string
* exists at that index.
* @param idx
* @return
*/
public String getPhoneticStringAt(int idx) {
//avoid an NPE. If the parser hasn't
//yet hit <sst/> phoneticStrings could be null
if (phoneticStrings == null) {
return null;
}
return phoneticStrings.get(idx);
}
public List<String> getItems() { public List<String> getItems() {
return strings; return strings;
} }
@ -207,7 +224,6 @@ public class ReadOnlySharedStringsTable extends DefaultHandler {
//// ContentHandler methods //// //// ContentHandler methods ////
private StringBuffer characters; private StringBuffer characters;
private StringBuffer rphCharacters;
private boolean tIsOpen; private boolean tIsOpen;
private boolean inRPh; private boolean inRPh;
@ -226,13 +242,16 @@ public class ReadOnlySharedStringsTable extends DefaultHandler {
this.strings = new ArrayList<String>(this.uniqueCount); this.strings = new ArrayList<String>(this.uniqueCount);
this.phoneticStrings = new HashMap<Integer, String>(); this.phoneticStrings = new HashMap<Integer, String>();
characters = new StringBuffer(); characters = new StringBuffer();
rphCharacters = new StringBuffer();
} else if ("si".equals(localName)) { } else if ("si".equals(localName)) {
characters.setLength(0); characters.setLength(0);
} else if ("t".equals(localName)) { } else if ("t".equals(localName)) {
tIsOpen = true; tIsOpen = true;
} else if ("rPh".equals(localName)) { } else if ("rPh".equals(localName)) {
inRPh = true; inRPh = true;
//append space...this assumes that rPh always comes after regular <t>
if (includePhoneticRuns && characters.length() > 0) {
characters.append(" ");
}
} }
} }
@ -244,10 +263,6 @@ public class ReadOnlySharedStringsTable extends DefaultHandler {
if ("si".equals(localName)) { if ("si".equals(localName)) {
strings.add(characters.toString()); strings.add(characters.toString());
if (rphCharacters.length() > 0) {
phoneticStrings.put(strings.size()-1, rphCharacters.toString());
rphCharacters.setLength(0);
}
} else if ("t".equals(localName)) { } else if ("t".equals(localName)) {
tIsOpen = false; tIsOpen = false;
} else if ("rPh".equals(localName)) { } else if ("rPh".equals(localName)) {
@ -261,9 +276,9 @@ public class ReadOnlySharedStringsTable extends DefaultHandler {
public void characters(char[] ch, int start, int length) public void characters(char[] ch, int start, int length)
throws SAXException { throws SAXException {
if (tIsOpen) { if (tIsOpen) {
if (inRPh) { if (inRPh && includePhoneticRuns) {
rphCharacters.append(ch, start, length); characters.append(ch, start, length);
} else { } else if (! inRPh){
characters.append(ch, start, length); characters.append(ch, start, length);
} }
} }

View File

@ -16,6 +16,7 @@
==================================================================== */ ==================================================================== */
package org.apache.poi.xssf.extractor; package org.apache.poi.xssf.extractor;
import javax.xml.parsers.ParserConfigurationException;
import java.io.IOException; import java.io.IOException;
import java.io.InputStream; import java.io.InputStream;
import java.util.HashMap; import java.util.HashMap;
@ -23,8 +24,6 @@ import java.util.List;
import java.util.Locale; import java.util.Locale;
import java.util.Map; import java.util.Map;
import javax.xml.parsers.ParserConfigurationException;
import org.apache.poi.POIXMLProperties; import org.apache.poi.POIXMLProperties;
import org.apache.poi.POIXMLProperties.CoreProperties; import org.apache.poi.POIXMLProperties.CoreProperties;
import org.apache.poi.POIXMLProperties.CustomProperties; import org.apache.poi.POIXMLProperties.CustomProperties;
@ -64,6 +63,7 @@ public class XSSFEventBasedExcelExtractor extends POIXMLTextExtractor
private boolean includeCellComments = false; private boolean includeCellComments = false;
private boolean includeHeadersFooters = true; private boolean includeHeadersFooters = true;
private boolean formulasNotResults = false; private boolean formulasNotResults = false;
private boolean concatenatePhoneticRuns = true;
public XSSFEventBasedExcelExtractor(String path) throws XmlException, OpenXML4JException, IOException { public XSSFEventBasedExcelExtractor(String path) throws XmlException, OpenXML4JException, IOException {
this(OPCPackage.open(path)); this(OPCPackage.open(path));
@ -120,6 +120,14 @@ public class XSSFEventBasedExcelExtractor extends POIXMLTextExtractor
this.includeCellComments = includeCellComments; this.includeCellComments = includeCellComments;
} }
/**
* Sets whether text from &lt;rPh&gt; (phonetic run) elements in the
* shared strings table is concatenated onto the shared string text.
* The flag is handed to the {@link ReadOnlySharedStringsTable} built in
* {@link #getText()}. Default is <code>true</code>.
*
* @param concatenatePhoneticRuns <code>true</code> to include phonetic run text,
*        <code>false</code> to leave it out
*/
public void setConcatenatePhoneticRuns(boolean concatenatePhoneticRuns) {
this.concatenatePhoneticRuns = concatenatePhoneticRuns;
}
public void setLocale(Locale locale) { public void setLocale(Locale locale) {
this.locale = locale; this.locale = locale;
} }
@ -189,7 +197,7 @@ public class XSSFEventBasedExcelExtractor extends POIXMLTextExtractor
*/ */
public String getText() { public String getText() {
try { try {
ReadOnlySharedStringsTable strings = new ReadOnlySharedStringsTable(container); ReadOnlySharedStringsTable strings = new ReadOnlySharedStringsTable(container, concatenatePhoneticRuns);
XSSFReader xssfReader = new XSSFReader(container); XSSFReader xssfReader = new XSSFReader(container);
StylesTable styles = xssfReader.getStylesTable(); StylesTable styles = xssfReader.getStylesTable();
XSSFReader.SheetIterator iter = (XSSFReader.SheetIterator) xssfReader.getSheetsData(); XSSFReader.SheetIterator iter = (XSSFReader.SheetIterator) xssfReader.getSheetsData();

View File

@ -59,19 +59,27 @@ public final class TestReadOnlySharedStringsTable extends TestCase {
} }
//51519
public void testPhoneticRuns() throws Exception { public void testPhoneticRuns() throws Exception {
OPCPackage pkg = OPCPackage.open(_ssTests.openResourceAsStream("51519.xlsx")); OPCPackage pkg = OPCPackage.open(_ssTests.openResourceAsStream("51519.xlsx"));
List<PackagePart> parts = pkg.getPartsByName(Pattern.compile("/xl/sharedStrings.xml")); List<PackagePart> parts = pkg.getPartsByName(Pattern.compile("/xl/sharedStrings.xml"));
assertEquals(1, parts.size()); assertEquals(1, parts.size());
ReadOnlySharedStringsTable rtbl = new ReadOnlySharedStringsTable(parts.get(0)); ReadOnlySharedStringsTable rtbl = new ReadOnlySharedStringsTable(parts.get(0), true);
List<String> strings = rtbl.getItems(); List<String> strings = rtbl.getItems();
assertEquals(49, strings.size()); assertEquals(49, strings.size());
assertEquals("\u30B3\u30E1\u30F3\u30C8", rtbl.getEntryAt(0)); assertEquals("\u30B3\u30E1\u30F3\u30C8", rtbl.getEntryAt(0));
assertNull(rtbl.getPhoneticStringAt(0)); assertEquals("\u65E5\u672C\u30AA\u30E9\u30AF\u30EB \u30CB\u30DB\u30F3", rtbl.getEntryAt(3));
//now do not include phonetic runs
rtbl = new ReadOnlySharedStringsTable(parts.get(0), false);
strings = rtbl.getItems();
assertEquals(49, strings.size());
assertEquals("\u30B3\u30E1\u30F3\u30C8", rtbl.getEntryAt(0));
assertEquals("\u65E5\u672C\u30AA\u30E9\u30AF\u30EB", rtbl.getEntryAt(3)); assertEquals("\u65E5\u672C\u30AA\u30E9\u30AF\u30EB", rtbl.getEntryAt(3));
assertEquals("\u30CB\u30DB\u30F3", rtbl.getPhoneticStringAt(3));
} }
public void testEmptySSTOnPackageObtainedViaWorkbook() throws Exception { public void testEmptySSTOnPackageObtainedViaWorkbook() throws Exception {

View File

@ -18,6 +18,7 @@
package org.apache.poi.xssf.extractor; package org.apache.poi.xssf.extractor;
import static org.junit.Assert.assertEquals; import static org.junit.Assert.assertEquals;
import static org.junit.Assert.assertFalse;
import static org.junit.Assert.assertNotNull; import static org.junit.Assert.assertNotNull;
import static org.junit.Assert.assertTrue; import static org.junit.Assert.assertTrue;
@ -359,4 +360,25 @@ public class TestXSSFEventBasedExcelExtractor {
assertTrue("can't find 10/02/2016", text.contains("10/02/2016")); assertTrue("can't find 10/02/2016", text.contains("10/02/2016"));
ex.close(); ex.close();
} }
/**
 * Check for issue 51519: the event-based extractor appends phonetic runs
 * to the shared string by default and leaves them out once
 * {@code setConcatenatePhoneticRuns(false)} has been called.
 */
@Test
public void test51519() throws Exception {
    // base text followed by a space and its phonetic run
    final String baseWithPhonetic = "\u65E5\u672C\u30AA\u30E9\u30AF\u30EB \u30CB\u30DB\u30F3";

    // first pass: untouched extractor, so phonetic runs are expected in the output
    XSSFEventBasedExcelExtractor withRuns = new XSSFEventBasedExcelExtractor(
            XSSFTestDataSamples.openSamplePackage("51519.xlsx"));
    String extracted = withRuns.getText();
    assertTrue("can't find appended phonetic run", extracted.contains(baseWithPhonetic));
    withRuns.close();

    // second pass: fresh extractor on the same sample with concatenation disabled
    XSSFEventBasedExcelExtractor withoutRuns = new XSSFEventBasedExcelExtractor(
            XSSFTestDataSamples.openSamplePackage("51519.xlsx"));
    withoutRuns.setConcatenatePhoneticRuns(false);
    extracted = withoutRuns.getText();
    assertFalse("should not be able to find appended phonetic run", extracted.contains(baseWithPhonetic));
    withoutRuns.close();
}
} }