mirror of https://github.com/apache/poi.git
BUG 54771 extract text from SDTs at the cell level within a table row
git-svn-id: https://svn.apache.org/repos/asf/poi/trunk@1602955 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent
42bee0dcba
commit
1eebd090e7
|
@ -27,6 +27,7 @@ import org.apache.poi.openxml4j.opc.OPCPackage;
|
|||
import org.apache.poi.xwpf.model.XWPFCommentsDecorator;
|
||||
import org.apache.poi.xwpf.model.XWPFHeaderFooterPolicy;
|
||||
import org.apache.poi.xwpf.usermodel.IBodyElement;
|
||||
import org.apache.poi.xwpf.usermodel.ICell;
|
||||
import org.apache.poi.xwpf.usermodel.IRunElement;
|
||||
import org.apache.poi.xwpf.usermodel.XWPFDocument;
|
||||
import org.apache.poi.xwpf.usermodel.XWPFHyperlink;
|
||||
|
@ -34,6 +35,7 @@ import org.apache.poi.xwpf.usermodel.XWPFHyperlinkRun;
|
|||
import org.apache.poi.xwpf.usermodel.XWPFParagraph;
|
||||
import org.apache.poi.xwpf.usermodel.XWPFRelation;
|
||||
import org.apache.poi.xwpf.usermodel.XWPFSDT;
|
||||
import org.apache.poi.xwpf.usermodel.XWPFSDTCell;
|
||||
import org.apache.poi.xwpf.usermodel.XWPFTable;
|
||||
import org.apache.poi.xwpf.usermodel.XWPFTableCell;
|
||||
import org.apache.poi.xwpf.usermodel.XWPFTableRow;
|
||||
|
@ -161,14 +163,18 @@ public class XWPFWordExtractor extends POIXMLTextExtractor {
|
|||
|
||||
}
|
||||
|
||||
private void appendTableText(StringBuffer text, XWPFTable table){
|
||||
private void appendTableText(StringBuffer text, XWPFTable table) {
|
||||
//this works recursively to pull embedded tables from tables
|
||||
for (XWPFTableRow row : table.getRows()){
|
||||
List<XWPFTableCell> cells = row.getTableCells();
|
||||
for (int i = 0; i < cells.size(); i++){
|
||||
XWPFTableCell cell = cells.get(i);
|
||||
text.append(cell.getTextRecursively());
|
||||
if (i < cells.size()-1){
|
||||
for (XWPFTableRow row : table.getRows()) {
|
||||
List<ICell> cells = row.getTableICells();
|
||||
for (int i = 0; i < cells.size(); i++) {
|
||||
ICell cell = cells.get(i);
|
||||
if (cell instanceof XWPFTableCell) {
|
||||
text.append(((XWPFTableCell)cell).getTextRecursively());
|
||||
} else if (cell instanceof XWPFSDTCell) {
|
||||
text.append(((XWPFSDTCell)cell).getContent().getText());
|
||||
}
|
||||
if (i < cells.size()-1) {
|
||||
text.append("\t");
|
||||
}
|
||||
}
|
||||
|
|
|
@ -0,0 +1,113 @@
|
|||
/* ====================================================================
|
||||
Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
contributor license agreements. See the NOTICE file distributed with
|
||||
this work for additional information regarding copyright ownership.
|
||||
The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
(the "License"); you may not use this file except in compliance with
|
||||
the License. You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software
|
||||
distributed under the License is distributed on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
See the License for the specific language governing permissions and
|
||||
limitations under the License.
|
||||
==================================================================== */
|
||||
package org.apache.poi.xwpf.usermodel;
|
||||
|
||||
import java.util.List;
|
||||
|
||||
import org.apache.poi.POIXMLDocumentPart;
|
||||
import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTSdtPr;
|
||||
import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTString;
|
||||
|
||||
/**
|
||||
* Experimental abstract class that is a base for XWPFSDT and XWPFSDTCell
|
||||
*
|
||||
* WARNING - APIs expected to change rapidly.
|
||||
*
|
||||
* These classes have so far been built only for read-only processing.
|
||||
*
|
||||
*/
|
||||
public abstract class AbstractXWPFSDT implements ISDTContents {
|
||||
private final String title;
|
||||
private final String tag;
|
||||
private final IBody part;
|
||||
|
||||
public AbstractXWPFSDT(CTSdtPr pr, IBody part){
|
||||
|
||||
List<CTString> aliases = pr.getAliasList();
|
||||
if (aliases != null && aliases.size() > 0){
|
||||
title = aliases.get(0).getVal();
|
||||
} else {
|
||||
title = "";
|
||||
}
|
||||
List<CTString> tags = pr.getTagList();
|
||||
if (tags != null && tags.size() > 0){
|
||||
tag = tags.get(0).getVal();
|
||||
} else {
|
||||
tag = "";
|
||||
}
|
||||
this.part = part;
|
||||
|
||||
}
|
||||
|
||||
/**
|
||||
*
|
||||
* @return first SDT Title
|
||||
*/
|
||||
public String getTitle(){
|
||||
return title;
|
||||
}
|
||||
|
||||
/**
|
||||
*
|
||||
* @return first SDT Tag
|
||||
*/
|
||||
public String getTag(){
|
||||
return tag;
|
||||
}
|
||||
|
||||
/**
|
||||
*
|
||||
* @return the content object
|
||||
*/
|
||||
public abstract ISDTContent getContent();
|
||||
|
||||
/**
|
||||
*
|
||||
* @return null
|
||||
*/
|
||||
public IBody getBody() {
|
||||
return null;
|
||||
}
|
||||
|
||||
/**
|
||||
*
|
||||
* @return document part
|
||||
*/
|
||||
public POIXMLDocumentPart getPart() {
|
||||
return part.getPart();
|
||||
}
|
||||
|
||||
/**
|
||||
*
|
||||
* @return partType
|
||||
*/
|
||||
public BodyType getPartType() {
|
||||
return BodyType.CONTENTCONTROL;
|
||||
}
|
||||
|
||||
/**
|
||||
*
|
||||
* @return element type
|
||||
*/
|
||||
public BodyElementType getElementType() {
|
||||
return BodyElementType.CONTENTCONTROL;
|
||||
}
|
||||
|
||||
public XWPFDocument getDocument() {
|
||||
return part.getXWPFDocument();
|
||||
}
|
||||
}
|
|
@ -0,0 +1,27 @@
|
|||
/* ====================================================================
|
||||
Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
contributor license agreements. See the NOTICE file distributed with
|
||||
this work for additional information regarding copyright ownership.
|
||||
The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
(the "License"); you may not use this file except in compliance with
|
||||
the License. You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software
|
||||
distributed under the License is distributed on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
See the License for the specific language governing permissions and
|
||||
limitations under the License.
|
||||
==================================================================== */
|
||||
package org.apache.poi.xwpf.usermodel;
|
||||
|
||||
/**
 * Interface for anything that can be at a table cell level:
 * {@link XWPFTableCell}, {@link XWPFSDTCell}
 * <p>
 * Schematically something like this:
 * <pre>
 * &lt;tr&gt;&lt;tc/&gt;&lt;tc/&gt;&lt;sdt&gt;&lt;tc/&gt;&lt;/sdt&gt;&lt;/tr&gt;
 * </pre>
 */
public interface ICell {
}
|
|
@ -17,7 +17,7 @@
|
|||
package org.apache.poi.xwpf.usermodel;
|
||||
|
||||
/**
|
||||
* Interface for anything that can be within a STD:
|
||||
* Interface for anything that can be within an SDT:
|
||||
* {@link XWPFRun}, {@link XWPFTable}, {@link XWPFParagraph},
|
||||
* {@link XWPFSDT} etc
|
||||
*/
|
||||
|
|
|
@ -16,95 +16,32 @@
|
|||
==================================================================== */
|
||||
package org.apache.poi.xwpf.usermodel;
|
||||
|
||||
import java.util.List;
|
||||
|
||||
import org.apache.poi.POIXMLDocumentPart;
|
||||
import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTSdtBlock;
|
||||
import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTSdtPr;
|
||||
import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTSdtRun;
|
||||
import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTString;
|
||||
|
||||
/**
|
||||
* Experimental class to offer rudimentary read-only processing of
|
||||
* StructuredDocumentTags/ContentControl
|
||||
*
|
||||
*
|
||||
*
|
||||
* WARNING - APIs expected to change rapidly
|
||||
*
|
||||
*/
|
||||
public class XWPFSDT implements IBodyElement, IRunBody, ISDTContents, IRunElement {
|
||||
private final String title;
|
||||
private final String tag;
|
||||
private final XWPFSDTContent content;
|
||||
private final IBody part;
|
||||
public class XWPFSDT extends AbstractXWPFSDT
|
||||
implements IBodyElement, IRunBody, ISDTContents, IRunElement {
|
||||
private final ISDTContent content;
|
||||
|
||||
public XWPFSDT(CTSdtRun sdtRun, IBody part){
|
||||
this.part = part;
|
||||
super(sdtRun.getSdtPr(), part);
|
||||
this.content = new XWPFSDTContent(sdtRun.getSdtContent(), part, this);
|
||||
CTSdtPr pr = sdtRun.getSdtPr();
|
||||
List<CTString> aliases = pr.getAliasList();
|
||||
if (aliases != null && aliases.size() > 0){
|
||||
title = aliases.get(0).getVal();
|
||||
} else {
|
||||
title = "";
|
||||
}
|
||||
@SuppressWarnings("deprecation")
|
||||
CTString[] array = pr.getTagArray();
|
||||
if (array != null && array.length > 0){
|
||||
tag = array[0].getVal();
|
||||
} else {
|
||||
tag = "";
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
public XWPFSDT(CTSdtBlock block, IBody part){
|
||||
this.part = part;
|
||||
super(block.getSdtPr(), part);
|
||||
this.content = new XWPFSDTContent( block.getSdtContent(), part, this);
|
||||
CTSdtPr pr = block.getSdtPr();
|
||||
List<CTString> aliases = pr.getAliasList();
|
||||
if (aliases != null && aliases.size() > 0){
|
||||
title = aliases.get(0).getVal();
|
||||
} else {
|
||||
title = "";
|
||||
}
|
||||
@SuppressWarnings("deprecation")
|
||||
CTString[] array = pr.getTagArray();
|
||||
if (array != null && array.length > 0){
|
||||
tag = array[0].getVal();
|
||||
} else {
|
||||
tag = "";
|
||||
}
|
||||
|
||||
}
|
||||
public String getTitle(){
|
||||
return title;
|
||||
}
|
||||
public String getTag(){
|
||||
return tag;
|
||||
}
|
||||
public XWPFSDTContent getContent(){
|
||||
|
||||
public ISDTContent getContent(){
|
||||
return content;
|
||||
}
|
||||
|
||||
public IBody getBody() {
|
||||
// TODO Auto-generated method stub
|
||||
return null;
|
||||
}
|
||||
|
||||
public POIXMLDocumentPart getPart() {
|
||||
return part.getPart();
|
||||
}
|
||||
|
||||
public BodyType getPartType() {
|
||||
return BodyType.CONTENTCONTROL;
|
||||
}
|
||||
|
||||
public BodyElementType getElementType() {
|
||||
return BodyElementType.CONTENTCONTROL;
|
||||
}
|
||||
|
||||
public XWPFDocument getDocument() {
|
||||
return part.getXWPFDocument();
|
||||
}
|
||||
}
|
||||
|
|
|
@ -0,0 +1,44 @@
|
|||
/* ====================================================================
|
||||
Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
contributor license agreements. See the NOTICE file distributed with
|
||||
this work for additional information regarding copyright ownership.
|
||||
The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
(the "License"); you may not use this file except in compliance with
|
||||
the License. You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software
|
||||
distributed under the License is distributed on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
See the License for the specific language governing permissions and
|
||||
limitations under the License.
|
||||
==================================================================== */
|
||||
package org.apache.poi.xwpf.usermodel;
|
||||
|
||||
import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTSdtCell;
|
||||
|
||||
/**
|
||||
* Experimental class to offer rudimentary read-only processing of
|
||||
* of StructuredDocumentTags/ContentControl that can appear
|
||||
* in a table row as if a table cell.
|
||||
* <p>
|
||||
* These can contain one or more cells or other SDTs within them.
|
||||
*
|
||||
* WARNING - APIs expected to change rapidly
|
||||
*
|
||||
*/
|
||||
public class XWPFSDTCell extends AbstractXWPFSDT implements ICell {
|
||||
private final XWPFSDTContentCell cellContent;
|
||||
|
||||
public XWPFSDTCell(CTSdtCell sdtCell, XWPFTableRow xwpfTableRow, IBody part){
|
||||
super(sdtCell.getSdtPr(), part);
|
||||
cellContent = new XWPFSDTContentCell(sdtCell.getSdtContent(), xwpfTableRow, part);
|
||||
}
|
||||
|
||||
@Override
|
||||
public ISDTContent getContent(){
|
||||
return cellContent;
|
||||
}
|
||||
|
||||
}
|
|
@ -39,7 +39,7 @@ import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTTbl;
|
|||
* WARNING - APIs expected to change rapidly
|
||||
*
|
||||
*/
|
||||
public class XWPFSDTContent {
|
||||
public class XWPFSDTContent implements ISDTContent {
|
||||
|
||||
// private final IBody part;
|
||||
// private final XWPFDocument document;
|
||||
|
@ -87,10 +87,10 @@ public class XWPFSDTContent {
|
|||
for (int i = 0; i < bodyElements.size(); i++){
|
||||
Object o = bodyElements.get(i);
|
||||
if (o instanceof XWPFParagraph){
|
||||
text.append(((XWPFParagraph)o).getText());
|
||||
appendParagraph((XWPFParagraph)o, text);
|
||||
addNewLine = true;
|
||||
} else if (o instanceof XWPFTable){
|
||||
text.append(((XWPFTable)o).getText());
|
||||
appendTable((XWPFTable)o, text);
|
||||
addNewLine = true;
|
||||
} else if (o instanceof XWPFSDT){
|
||||
text.append(((XWPFSDT)o).getContent().getText());
|
||||
|
@ -106,6 +106,31 @@ public class XWPFSDTContent {
|
|||
return text.toString();
|
||||
}
|
||||
|
||||
private void appendTable(XWPFTable table, StringBuilder text) {
|
||||
//this works recursively to pull embedded tables from within cells
|
||||
for (XWPFTableRow row : table.getRows()) {
|
||||
List<ICell> cells = row.getTableICells();
|
||||
for (int i = 0; i < cells.size(); i++) {
|
||||
ICell cell = cells.get(i);
|
||||
if (cell instanceof XWPFTableCell) {
|
||||
text.append(((XWPFTableCell)cell).getTextRecursively());
|
||||
} else if (cell instanceof XWPFSDTCell) {
|
||||
text.append(((XWPFSDTCell)cell).getContent().getText());
|
||||
}
|
||||
if (i < cells.size()-1) {
|
||||
text.append("\t");
|
||||
}
|
||||
}
|
||||
text.append('\n');
|
||||
}
|
||||
}
|
||||
|
||||
private void appendParagraph(XWPFParagraph paragraph, StringBuilder text) {
|
||||
for(IRunElement run : paragraph.getRuns()) {
|
||||
text.append(run.toString());
|
||||
}
|
||||
}
|
||||
|
||||
public String toString(){
|
||||
return getText();
|
||||
}
|
||||
|
|
|
@ -0,0 +1,114 @@
|
|||
/* ====================================================================
|
||||
Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
contributor license agreements. See the NOTICE file distributed with
|
||||
this work for additional information regarding copyright ownership.
|
||||
The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
(the "License"); you may not use this file except in compliance with
|
||||
the License. You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software
|
||||
distributed under the License is distributed on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
See the License for the specific language governing permissions and
|
||||
limitations under the License.
|
||||
==================================================================== */
|
||||
package org.apache.poi.xwpf.usermodel;
|
||||
|
||||
|
||||
import javax.xml.namespace.QName;
|
||||
|
||||
import org.apache.xmlbeans.XmlCursor;
|
||||
import org.apache.xmlbeans.XmlCursor.TokenType;
|
||||
|
||||
import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTSdtContentCell;
|
||||
|
||||
|
||||
|
||||
/**
|
||||
* Experimental class to offer rudimentary read-only processing of
|
||||
* of the XWPFSDTCellContent.
|
||||
|
||||
* WARNING - APIs expected to change rapidly
|
||||
*
|
||||
*/
|
||||
public class XWPFSDTContentCell implements ISDTContent {
|
||||
|
||||
//A full implementation would grab the icells
|
||||
//that a content cell can contain. This would require
|
||||
//significant changes, including changing the notion that the
|
||||
//parent of a cell can be not just a row, but an sdt.
|
||||
//For now we are just grabbing the text out of the text tokentypes.
|
||||
|
||||
//private List<ICell> cells = new ArrayList<ICell>().
|
||||
|
||||
private String text = "";
|
||||
public XWPFSDTContentCell(CTSdtContentCell sdtContentCell,
|
||||
XWPFTableRow xwpfTableRow, IBody part){
|
||||
super();
|
||||
StringBuilder sb = new StringBuilder();
|
||||
XmlCursor cursor = sdtContentCell.newCursor();
|
||||
|
||||
//keep track of the following,
|
||||
//and add "\n" only before the start of a body
|
||||
//element if it is not the first body element.
|
||||
|
||||
//index of cell in row
|
||||
int tcCnt = 0;
|
||||
//count of body objects
|
||||
int iBodyCnt = 0;
|
||||
int depth = 1;
|
||||
|
||||
while (cursor.hasNextToken() && depth > 0) {
|
||||
TokenType t = cursor.toNextToken();
|
||||
if (t.isText()){
|
||||
sb.append(cursor.getTextValue());
|
||||
} else if (isStartToken(cursor, "tr")) {
|
||||
tcCnt = 0;
|
||||
iBodyCnt = 0;
|
||||
} else if (isStartToken(cursor, "tc")) {
|
||||
if (tcCnt++ > 0) {
|
||||
sb.append("\t");
|
||||
}
|
||||
iBodyCnt = 0;
|
||||
} else if (isStartToken(cursor, "p") ||
|
||||
isStartToken(cursor, "tbl") ||
|
||||
isStartToken(cursor, "sdt")) {
|
||||
if (iBodyCnt > 0) {
|
||||
sb.append("\n");
|
||||
}
|
||||
iBodyCnt++;
|
||||
}
|
||||
if (cursor.isStart()){
|
||||
depth++;
|
||||
} else if (cursor.isEnd()){
|
||||
depth--;
|
||||
}
|
||||
}
|
||||
text = sb.toString();
|
||||
}
|
||||
|
||||
|
||||
|
||||
private boolean isStartToken(XmlCursor cursor, String string) {
|
||||
if (! cursor.isStart()) {
|
||||
return false;
|
||||
}
|
||||
QName qName = cursor.getName();
|
||||
if (qName != null && qName.getLocalPart() != null &&
|
||||
qName.getLocalPart().equals(string)) {
|
||||
return true;
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
|
||||
public String getText(){
|
||||
return text;
|
||||
}
|
||||
|
||||
public String toString(){
|
||||
return getText();
|
||||
}
|
||||
}
|
|
@ -159,6 +159,13 @@ public class XWPFTable implements IBodyElement, ISDTContents {
|
|||
}
|
||||
|
||||
/**
|
||||
* Convenience method to extract text in cells. This
|
||||
* does not extract text recursively in cells, and it does not
|
||||
* currently include text in SDT (form) components.
|
||||
* <p>
|
||||
* To get all text within a table, see XWPFWordExtractor's appendTableText
|
||||
* as an example.
|
||||
*
|
||||
* @return text
|
||||
*/
|
||||
public String getText() {
|
||||
|
|
|
@ -42,7 +42,7 @@ import org.openxmlformats.schemas.wordprocessingml.x2006.main.STVerticalJc;
|
|||
* Represents a Cell within a {@link XWPFTable}. The
|
||||
* Cell is the thing that holds the actual content (paragraphs etc)
|
||||
*/
|
||||
public class XWPFTableCell implements IBody {
|
||||
public class XWPFTableCell implements IBody, ICell {
|
||||
private final CTTc ctTc;
|
||||
protected List<XWPFParagraph> paragraphs = null;
|
||||
protected List<XWPFTable> tables = null;
|
||||
|
|
|
@ -21,9 +21,12 @@ import java.util.ArrayList;
|
|||
import java.util.List;
|
||||
|
||||
import org.apache.poi.util.Internal;
|
||||
import org.apache.xmlbeans.XmlCursor;
|
||||
import org.apache.xmlbeans.XmlObject;
|
||||
import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTHeight;
|
||||
import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTOnOff;
|
||||
import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTRow;
|
||||
import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTSdtCell;
|
||||
import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTTc;
|
||||
import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTTrPr;
|
||||
import org.openxmlformats.schemas.wordprocessingml.x2006.main.STOnOff;
|
||||
|
@ -121,6 +124,29 @@ public class XWPFTableRow {
|
|||
return table;
|
||||
}
|
||||
|
||||
/**
|
||||
* create and return a list of all XWPFTableCell
|
||||
* who belongs to this row
|
||||
* @return a list of {@link XWPFTableCell}
|
||||
*/
|
||||
public List<ICell> getTableICells(){
|
||||
|
||||
List<ICell> cells = new ArrayList<ICell>();
|
||||
//Can't use ctRow.getTcList because that only gets table cells
|
||||
//Can't use ctRow.getSdtList because that only gets sdts that are at cell level
|
||||
XmlCursor cursor = ctRow.newCursor();
|
||||
cursor.selectPath("./*");
|
||||
while (cursor.toNextSelection()) {
|
||||
XmlObject o = cursor.getObject();
|
||||
if (o instanceof CTTc){
|
||||
cells.add(new XWPFTableCell((CTTc)o, this, table.getBody()));
|
||||
} else if (o instanceof CTSdtCell) {
|
||||
cells.add(new XWPFSDTCell((CTSdtCell)o, this, table.getBody()));
|
||||
}
|
||||
}
|
||||
return cells;
|
||||
}
|
||||
|
||||
/**
|
||||
* create and return a list of all XWPFTableCell
|
||||
* who belongs to this row
|
||||
|
|
|
@ -18,6 +18,8 @@
|
|||
package org.apache.poi.xwpf.extractor;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.util.regex.Matcher;
|
||||
import java.util.regex.Pattern;
|
||||
|
||||
import junit.framework.TestCase;
|
||||
|
||||
|
@ -327,12 +329,14 @@ public class TestXWPFWordExtractor extends TestCase {
|
|||
String[] targs = new String[]{
|
||||
"header_rich_text",
|
||||
"rich_text",
|
||||
"rich_text_pre_table\nrich_text_cell1\t\t\t\n\nrich_text_post_table",
|
||||
"rich_text_pre_table\nrich_text_cell1\t\t\t\n\t\t\t\n\t\t\t\n\nrich_text_post_table",
|
||||
"plain_text_no_newlines",
|
||||
"plain_text_with_newlines1\nplain_text_with_newlines2\n",
|
||||
"watermelon\n",
|
||||
"dirt\n",
|
||||
"4/16/2013\n",
|
||||
"rich_text_in_cell",
|
||||
"abc",
|
||||
"rich_text_in_paragraph_in_cell",
|
||||
"footer_rich_text",
|
||||
"footnote_sdt",
|
||||
|
@ -352,6 +356,36 @@ public class TestXWPFWordExtractor extends TestCase {
|
|||
}
|
||||
assertEquals("controlled content loading hit count", targs.length, hits);
|
||||
ex.close();
|
||||
|
||||
|
||||
doc = XWPFTestDataSamples.openSampleDocument("Bug54771a.docx");
|
||||
targs = new String[]{
|
||||
"bb",
|
||||
"test subtitle\n",
|
||||
"test user\n",
|
||||
};
|
||||
ex = new XWPFWordExtractor(doc);
|
||||
s = ex.getText().toLowerCase();
|
||||
|
||||
//At one point in development there were three copies of the text.
|
||||
//This ensures that there is only one copy.
|
||||
for (String targ : targs){
|
||||
Matcher m = Pattern.compile(targ).matcher(s);
|
||||
int hit = 0;
|
||||
while (m.find()) {
|
||||
hit++;
|
||||
}
|
||||
assertEquals("controlled content loading-"+targ, 1, hit);
|
||||
}
|
||||
//"test\n" appears twice: once as the "title" and once in the text.
|
||||
//This also happens when you save this document as text from MSWord.
|
||||
Matcher m = Pattern.compile("test\n").matcher(s);
|
||||
int hit = 0;
|
||||
while (m.find()){
|
||||
hit++;
|
||||
}
|
||||
assertEquals("test<N>", 2, hit);
|
||||
ex.close();
|
||||
}
|
||||
|
||||
/** No Header or Footer in document */
|
||||
|
|
|
@ -18,8 +18,10 @@
|
|||
package org.apache.poi.xwpf.usermodel;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.HashSet;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
import java.util.Set;
|
||||
|
||||
import junit.framework.TestCase;
|
||||
|
||||
|
@ -35,15 +37,16 @@ public final class TestXWPFSDT extends TestCase {
|
|||
XWPFDocument doc = XWPFTestDataSamples.openSampleDocument("Bug54849.docx");
|
||||
String tag = null;
|
||||
String title= null;
|
||||
List<XWPFSDT> sdts = extractAllSDTs(doc);
|
||||
for (XWPFSDT sdt :sdts){
|
||||
List<AbstractXWPFSDT> sdts = extractAllSDTs(doc);
|
||||
for (AbstractXWPFSDT sdt :sdts){
|
||||
if (sdt.getContent().toString().equals("Rich_text")){
|
||||
tag = "MyTag";
|
||||
title = "MyTitle";
|
||||
break;
|
||||
}
|
||||
|
||||
}
|
||||
assertEquals("controls size", 12, sdts.size());
|
||||
assertEquals("controls size", 13, sdts.size());
|
||||
|
||||
assertEquals("tag", "MyTag", tag);
|
||||
assertEquals("title", "MyTitle", title);
|
||||
|
@ -54,12 +57,13 @@ public final class TestXWPFSDT extends TestCase {
|
|||
String[] contents = new String[]{
|
||||
"header_rich_text",
|
||||
"Rich_text",
|
||||
"Rich_text_pre_table\nRich_text_cell1\t\t\t\n\nRich_text_post_table",
|
||||
"Rich_text_pre_table\nRich_text_cell1\t\t\t\n\t\t\t\n\t\t\t\n\nRich_text_post_table",
|
||||
"Plain_text_no_newlines",
|
||||
"Plain_text_with_newlines1\nplain_text_with_newlines2",
|
||||
"Watermelon",
|
||||
"Dirt",
|
||||
"4/16/2013",
|
||||
"Rich_text_in_cell",
|
||||
"rich_text_in_paragraph_in_cell",
|
||||
"Footer_rich_text",
|
||||
"Footnote_sdt",
|
||||
|
@ -67,31 +71,40 @@ public final class TestXWPFSDT extends TestCase {
|
|||
|
||||
};
|
||||
XWPFDocument doc = XWPFTestDataSamples.openSampleDocument("Bug54849.docx");
|
||||
List<XWPFSDT> sdts = extractAllSDTs(doc);
|
||||
List<AbstractXWPFSDT> sdts = extractAllSDTs(doc);
|
||||
|
||||
assertEquals("number of sdts", contents.length, sdts.size());
|
||||
|
||||
for (int i = 0; i < sdts.size(); i++){//contents.length; i++){
|
||||
XWPFSDT sdt = sdts.get(i);
|
||||
|
||||
for (int i = 0; i < contents.length; i++){
|
||||
AbstractXWPFSDT sdt = sdts.get(i);
|
||||
assertEquals(i+ ": " + contents[i], contents[i], sdt.getContent().toString());
|
||||
}
|
||||
}
|
||||
/**
|
||||
* POI-54771 and TIKA-1317
|
||||
*/
|
||||
public void testSDTAsCell() throws Exception {
|
||||
//Bug54771a.docx and Bug54771b.docx test slightly
|
||||
//different recursion patterns. Keep both!
|
||||
XWPFDocument doc = XWPFTestDataSamples.openSampleDocument("Bug54771a.docx");
|
||||
List<AbstractXWPFSDT> sdts = extractAllSDTs(doc);
|
||||
String text = sdts.get(0).getContent().getText();
|
||||
assertEquals(2, sdts.size());
|
||||
assertTrue(text.indexOf("Test") > -1);
|
||||
|
||||
text = sdts.get(1).getContent().getText();
|
||||
assertTrue(text.indexOf("Test Subtitle") > -1);
|
||||
assertTrue(text.indexOf("Test User") > -1);
|
||||
assertTrue(text.indexOf("Test") < text.indexOf("Test Subtitle"));
|
||||
|
||||
doc = XWPFTestDataSamples.openSampleDocument("Bug54771b.docx");
|
||||
sdts = extractAllSDTs(doc);
|
||||
assertEquals(3, sdts.size());
|
||||
assertTrue(sdts.get(0).getContent().getText().indexOf("Test") > -1);
|
||||
|
||||
assertTrue(sdts.get(1).getContent().getText().indexOf("Test Subtitle") > -1);
|
||||
assertTrue(sdts.get(2).getContent().getText().indexOf("Test User") > -1);
|
||||
|
||||
public void testFailureToGetSDTAsCell() throws Exception{
|
||||
/**
|
||||
* The current code fails to extract an sdt if it comprises/is the parent
|
||||
* of a cell in a table.
|
||||
*/
|
||||
XWPFDocument doc = XWPFTestDataSamples.openSampleDocument("Bug54849.docx");
|
||||
List<XWPFSDT> sdts = extractAllSDTs(doc);
|
||||
boolean found = false;
|
||||
for (XWPFSDT sdt : sdts){
|
||||
if (sdt.getContent().getText().toLowerCase().indexOf("rich_text_in_cell") > -1){
|
||||
found = true;
|
||||
}
|
||||
}
|
||||
assertEquals("SDT as cell known failure", false, found);
|
||||
}
|
||||
|
||||
/**
|
||||
|
@ -99,7 +112,7 @@ public final class TestXWPFSDT extends TestCase {
|
|||
*/
|
||||
public void testNewLinesBetweenRuns() throws Exception{
|
||||
XWPFDocument doc = XWPFTestDataSamples.openSampleDocument("Bug55142.docx");
|
||||
List<XWPFSDT> sdts = extractAllSDTs(doc);
|
||||
List<AbstractXWPFSDT> sdts = extractAllSDTs(doc);
|
||||
List<String> targs = new ArrayList<String>();
|
||||
//these test newlines and tabs in paragraphs/body elements
|
||||
targs.add("Rich-text1 abcdefghi");
|
||||
|
@ -114,14 +127,14 @@ public final class TestXWPFSDT extends TestCase {
|
|||
targs.add("sdt_incell2 abcdefg");
|
||||
|
||||
for (int i = 0; i < sdts.size(); i++){
|
||||
XWPFSDT sdt = sdts.get(i);
|
||||
AbstractXWPFSDT sdt = sdts.get(i);
|
||||
assertEquals(targs.get(i), targs.get(i), sdt.getContent().getText());
|
||||
}
|
||||
}
|
||||
|
||||
private List<XWPFSDT> extractAllSDTs(XWPFDocument doc){
|
||||
|
||||
List<XWPFSDT> sdts = new ArrayList<XWPFSDT>();
|
||||
private List<AbstractXWPFSDT> extractAllSDTs(XWPFDocument doc){
|
||||
|
||||
List<AbstractXWPFSDT> sdts = new ArrayList<AbstractXWPFSDT>();
|
||||
|
||||
List<XWPFHeader> headers = doc.getHeaderList();
|
||||
for (XWPFHeader header : headers){
|
||||
|
@ -135,7 +148,6 @@ public final class TestXWPFSDT extends TestCase {
|
|||
}
|
||||
|
||||
for (XWPFFootnote footnote : doc.getFootnotes()){
|
||||
|
||||
sdts.addAll(extractSDTsFromBodyElements(footnote.getBodyElements()));
|
||||
}
|
||||
for (Map.Entry<Integer, XWPFFootnote> e : doc.endnotes.entrySet()){
|
||||
|
@ -144,8 +156,8 @@ public final class TestXWPFSDT extends TestCase {
|
|||
return sdts;
|
||||
}
|
||||
|
||||
private List<XWPFSDT> extractSDTsFromBodyElements(List<IBodyElement> elements){
|
||||
List<XWPFSDT> sdts = new ArrayList<XWPFSDT>();
|
||||
private List<AbstractXWPFSDT> extractSDTsFromBodyElements(List<IBodyElement> elements){
|
||||
List<AbstractXWPFSDT> sdts = new ArrayList<AbstractXWPFSDT>();
|
||||
for (IBodyElement e : elements){
|
||||
if (e instanceof XWPFSDT){
|
||||
XWPFSDT sdt = (XWPFSDT)e;
|
||||
|
@ -167,11 +179,16 @@ public final class TestXWPFSDT extends TestCase {
|
|||
return sdts;
|
||||
}
|
||||
|
||||
private List<XWPFSDT> extractSDTsFromTable(XWPFTable table){
|
||||
List<XWPFSDT> sdts = new ArrayList<XWPFSDT>();
|
||||
for (XWPFTableRow r : table.getRows()){
|
||||
for (XWPFTableCell c : r.getTableCells()){
|
||||
sdts.addAll(extractSDTsFromBodyElements(c.getBodyElements()));
|
||||
private List<AbstractXWPFSDT> extractSDTsFromTable(XWPFTable table) {
|
||||
|
||||
List<AbstractXWPFSDT> sdts = new ArrayList<AbstractXWPFSDT>();
|
||||
for (XWPFTableRow r : table.getRows()) {
|
||||
for (ICell c : r.getTableICells()) {
|
||||
if (c instanceof XWPFSDTCell) {
|
||||
sdts.add((XWPFSDTCell)c);
|
||||
} else if (c instanceof XWPFTableCell) {
|
||||
sdts.addAll(extractSDTsFromBodyElements(((XWPFTableCell)c).getBodyElements()));
|
||||
}
|
||||
}
|
||||
}
|
||||
return sdts;
|
||||
|
|
Binary file not shown.
Binary file not shown.
Loading…
Reference in New Issue