PDF to X
This commit is contained in:
parent
e3b40e4556
commit
1150cbd406
|
@ -0,0 +1 @@
|
|||
/target/
|
|
@ -0,0 +1,100 @@
|
|||
<?xml version="1.0"?>
<!--
    Maven module "pdf": examples that convert a PDF file to HTML, images,
    plain text and a Word document.
-->
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
    xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
    <modelVersion>4.0.0</modelVersion>
    <parent>
        <groupId>com.baeldung</groupId>
        <artifactId>parent-modules</artifactId>
        <version>1.0.0-SNAPSHOT</version>
    </parent>
    <artifactId>pdf</artifactId>
    <name>pdf</name>
    <url>http://maven.apache.org</url>

    <properties>
        <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
        <maven-compiler-plugin.version>3.5.1</maven-compiler-plugin.version>
        <!-- One property per library so artifacts of the same project cannot drift apart. -->
        <pdfbox.version>2.0.3</pdfbox.version>
        <pdf2dom.version>1.6</pdf2dom.version>
        <itextpdf.version>5.5.10</itextpdf.version>
        <poi.version>3.15</poi.version>
        <batik-transcoder.version>1.8</batik-transcoder.version>
    </properties>

    <dependencies>
        <dependency>
            <groupId>junit</groupId>
            <artifactId>junit</artifactId>
            <version>3.8.1</version>
            <scope>test</scope>
        </dependency>
        <!-- https://mvnrepository.com/artifact/org.apache.pdfbox/pdfbox -->
        <dependency>
            <groupId>org.apache.pdfbox</groupId>
            <artifactId>pdfbox</artifactId>
            <version>${pdfbox.version}</version>
        </dependency>
        <!-- https://mvnrepository.com/artifact/org.apache.pdfbox/pdfbox-tools -->
        <dependency>
            <groupId>org.apache.pdfbox</groupId>
            <artifactId>pdfbox-tools</artifactId>
            <version>${pdfbox.version}</version>
        </dependency>
        <!-- https://mvnrepository.com/artifact/net.sf.cssbox/pdf2dom -->
        <dependency>
            <groupId>net.sf.cssbox</groupId>
            <artifactId>pdf2dom</artifactId>
            <version>${pdf2dom.version}</version>
        </dependency>
        <!-- https://mvnrepository.com/artifact/com.itextpdf/itextpdf -->
        <dependency>
            <groupId>com.itextpdf</groupId>
            <artifactId>itextpdf</artifactId>
            <version>${itextpdf.version}</version>
        </dependency>
        <!-- https://mvnrepository.com/artifact/org.apache.poi/poi -->
        <dependency>
            <groupId>org.apache.poi</groupId>
            <artifactId>poi</artifactId>
            <version>${poi.version}</version>
        </dependency>
        <!-- https://mvnrepository.com/artifact/org.apache.poi/poi-ooxml -->
        <dependency>
            <groupId>org.apache.poi</groupId>
            <artifactId>poi-ooxml</artifactId>
            <version>${poi.version}</version>
        </dependency>
        <!-- https://mvnrepository.com/artifact/org.apache.poi/poi-scratchpad -->
        <dependency>
            <groupId>org.apache.poi</groupId>
            <artifactId>poi-scratchpad</artifactId>
            <version>${poi.version}</version>
        </dependency>
        <!-- https://mvnrepository.com/artifact/org.apache.xmlgraphics/batik-transcoder -->
        <dependency>
            <groupId>org.apache.xmlgraphics</groupId>
            <artifactId>batik-transcoder</artifactId>
            <version>${batik-transcoder.version}</version>
        </dependency>
    </dependencies>

    <build>
        <finalName>pdf</finalName>
        <resources>
            <!-- Text resources are filtered; PDFs are excluded here because
                 property filtering corrupts binary content. -->
            <resource>
                <directory>src/main/resources</directory>
                <filtering>true</filtering>
                <excludes>
                    <exclude>**/*.pdf</exclude>
                </excludes>
            </resource>
            <!-- Binary PDFs are copied byte-for-byte, without filtering. -->
            <resource>
                <directory>src/main/resources</directory>
                <filtering>false</filtering>
                <includes>
                    <include>**/*.pdf</include>
                </includes>
            </resource>
        </resources>

        <plugins>

            <plugin>
                <groupId>org.apache.maven.plugins</groupId>
                <artifactId>maven-compiler-plugin</artifactId>
                <version>${maven-compiler-plugin.version}</version>
                <configuration>
                    <source>1.8</source>
                    <target>1.8</target>
                </configuration>
            </plugin>

        </plugins>
    </build>
</project>
|
|
@ -0,0 +1,49 @@
|
|||
package com.baeldung.pdf;
|
||||
|
||||
import java.io.File;
|
||||
import java.io.IOException;
|
||||
import java.io.PrintWriter;
|
||||
import java.io.Writer;
|
||||
|
||||
import javax.xml.parsers.ParserConfigurationException;
|
||||
|
||||
import org.apache.pdfbox.pdmodel.PDDocument;
|
||||
import org.fit.pdfdom.PDFDomTree;
|
||||
|
||||
public class PDF2HTMLExample {
|
||||
|
||||
private static final String FILENAME = "src/main/resources/pdf.pdf";
|
||||
|
||||
public static void main(String[] args) {
|
||||
try {
|
||||
generateHTMLFromPDF(FILENAME);
|
||||
} catch (IOException | ParserConfigurationException e) {
|
||||
e.printStackTrace();
|
||||
}
|
||||
}
|
||||
|
||||
private static void generateHTMLFromPDF(String filename) throws ParserConfigurationException, IOException {
|
||||
System.out.println("Creating HTML file from a PDF file: " + filename);
|
||||
PDDocument pdf = null;
|
||||
try {
|
||||
// load the PDF file using PDFBox
|
||||
pdf = PDDocument.load(new File(filename));
|
||||
// create the DOM parser
|
||||
PDFDomTree parser = new PDFDomTree();
|
||||
// parse the file and get the DOM Document
|
||||
Writer output = new PrintWriter("src/output/pdf.html", "utf-8");
|
||||
parser.writeText(pdf, output);
|
||||
output.close();
|
||||
} finally {
|
||||
if (pdf != null) {
|
||||
try {
|
||||
pdf.close();
|
||||
} catch (IOException e) {
|
||||
System.err.println("Error: " + e.getMessage());
|
||||
}
|
||||
}
|
||||
}
|
||||
System.out.println("Done.");
|
||||
}
|
||||
|
||||
}
|
|
@ -0,0 +1,38 @@
|
|||
package com.baeldung.pdf;
|
||||
|
||||
import java.awt.image.BufferedImage;
|
||||
import java.io.File;
|
||||
import java.io.IOException;
|
||||
|
||||
import org.apache.pdfbox.pdmodel.PDDocument;
|
||||
import org.apache.pdfbox.rendering.ImageType;
|
||||
import org.apache.pdfbox.rendering.PDFRenderer;
|
||||
import org.apache.pdfbox.tools.imageio.ImageIOUtil;
|
||||
|
||||
public class PDF2ImageExample {
|
||||
|
||||
private static final String FILENAME = "src/main/resources/pdf.pdf";
|
||||
|
||||
public static void main(String[] args) {
|
||||
try {
|
||||
generateImageFromPDF(FILENAME, "png");
|
||||
generateImageFromPDF(FILENAME, "jpeg");
|
||||
generateImageFromPDF(FILENAME, "gif");
|
||||
} catch (IOException e) {
|
||||
e.printStackTrace();
|
||||
}
|
||||
}
|
||||
|
||||
private static void generateImageFromPDF(String filename, String extension) throws IOException {
|
||||
System.out.println("Creating " + extension + " image from a PDF file: " + filename);
|
||||
PDDocument document = PDDocument.load(new File(filename));
|
||||
PDFRenderer pdfRenderer = new PDFRenderer(document);
|
||||
for (int page = 0; page < document.getNumberOfPages(); ++page) {
|
||||
System.out.println("Page number: " + (page + 1) + " is being rendered");
|
||||
BufferedImage bim = pdfRenderer.renderImageWithDPI(page, 300, ImageType.RGB);
|
||||
ImageIOUtil.writeImage(bim, "src/output/pdf" + "-" + (page + 1) + "." + extension, 300);
|
||||
}
|
||||
document.close();
|
||||
System.out.println("Done.");
|
||||
}
|
||||
}
|
|
@ -0,0 +1,65 @@
|
|||
package com.baeldung.pdf;
|
||||
|
||||
import java.io.File;
|
||||
import java.io.IOException;
|
||||
import java.io.PrintWriter;
|
||||
|
||||
import org.apache.pdfbox.cos.COSDocument;
|
||||
import org.apache.pdfbox.io.RandomAccessFile;
|
||||
import org.apache.pdfbox.pdfparser.PDFParser;
|
||||
import org.apache.pdfbox.pdmodel.PDDocument;
|
||||
import org.apache.pdfbox.text.PDFTextStripper;
|
||||
|
||||
public class PDF2TextExample {
|
||||
|
||||
private static final String FILENAME = "src/main/resources/pdf.pdf";
|
||||
|
||||
public static void main(String[] args) {
|
||||
try {
|
||||
generateTxtFromPDF(FILENAME);
|
||||
} catch (IOException e) {
|
||||
e.printStackTrace();
|
||||
}
|
||||
}
|
||||
|
||||
private static void generateTxtFromPDF(String filename) throws IOException {
|
||||
System.out.println("Parsing text from PDF file " + filename);
|
||||
String parsedText = null;
|
||||
PDFTextStripper pdfStripper;
|
||||
PDDocument pdDoc = null;
|
||||
COSDocument cosDoc = null;
|
||||
|
||||
File f = new File(filename);
|
||||
PDFParser parser = new PDFParser(new RandomAccessFile(f, "r"));
|
||||
|
||||
try {
|
||||
parser.parse();
|
||||
cosDoc = parser.getDocument();
|
||||
pdfStripper = new PDFTextStripper();
|
||||
pdDoc = new PDDocument(cosDoc);
|
||||
parsedText = pdfStripper.getText(pdDoc);
|
||||
} catch (Exception e) {
|
||||
System.err.println("An exception occured in parsing the PDF Document.");
|
||||
e.printStackTrace();
|
||||
try {
|
||||
if (cosDoc != null)
|
||||
cosDoc.close();
|
||||
if (pdDoc != null)
|
||||
pdDoc.close();
|
||||
} catch (Exception e1) {
|
||||
e.printStackTrace();
|
||||
}
|
||||
}
|
||||
System.out.println("Writing PDF text to output text file");
|
||||
try {
|
||||
PrintWriter pw = new PrintWriter("src/output/pdf.txt");
|
||||
pw.print(parsedText);
|
||||
pw.close();
|
||||
} catch (Exception e) {
|
||||
System.out.println("An exception occured in writing the pdf text to file.");
|
||||
e.printStackTrace();
|
||||
}
|
||||
System.out.println("Done.");
|
||||
}
|
||||
|
||||
}
|
|
@ -0,0 +1,61 @@
|
|||
package com.baeldung.pdf;
|
||||
|
||||
import java.io.FileOutputStream;
|
||||
import java.io.IOException;
|
||||
|
||||
import org.apache.poi.xwpf.usermodel.BreakType;
|
||||
import org.apache.poi.xwpf.usermodel.XWPFDocument;
|
||||
import org.apache.poi.xwpf.usermodel.XWPFParagraph;
|
||||
import org.apache.poi.xwpf.usermodel.XWPFRun;
|
||||
|
||||
import com.itextpdf.text.pdf.PdfReader;
|
||||
import com.itextpdf.text.pdf.parser.PdfReaderContentParser;
|
||||
import com.itextpdf.text.pdf.parser.SimpleTextExtractionStrategy;
|
||||
import com.itextpdf.text.pdf.parser.TextExtractionStrategy;
|
||||
|
||||
public class PDF2WordExample {
|
||||
|
||||
private static final String FILENAME = "src/main/resources/pdf.pdf";
|
||||
|
||||
public static void main(String[] args) {
|
||||
try {
|
||||
generateDocFromPDF(FILENAME);
|
||||
} catch (IOException e) {
|
||||
e.printStackTrace();
|
||||
}
|
||||
}
|
||||
|
||||
private static void generateDocFromPDF(String filename) throws IOException {
|
||||
System.out.println("Creating a docx file from a PDF file: " + filename);
|
||||
// Create the word document
|
||||
XWPFDocument doc = new XWPFDocument();
|
||||
|
||||
// Open the pdf file
|
||||
String pdf = filename;
|
||||
PdfReader reader = new PdfReader(pdf);
|
||||
PdfReaderContentParser parser = new PdfReaderContentParser(reader);
|
||||
|
||||
// Read the PDF page by page
|
||||
for (int i = 1; i <= reader.getNumberOfPages(); i++) {
|
||||
TextExtractionStrategy strategy = parser.processContent(i, new SimpleTextExtractionStrategy());
|
||||
// Extract the text
|
||||
String text = strategy.getResultantText();
|
||||
// Create a new paragraph in the word document, adding the extracted
|
||||
// text
|
||||
XWPFParagraph p = doc.createParagraph();
|
||||
XWPFRun run = p.createRun();
|
||||
run.setText(text);
|
||||
// Adding a page break
|
||||
run.addBreak(BreakType.PAGE);
|
||||
}
|
||||
// Write the word document
|
||||
FileOutputStream out = new FileOutputStream("src/output/pdf.docx");
|
||||
doc.write(out);
|
||||
// Close all open files
|
||||
out.close();
|
||||
reader.close();
|
||||
doc.close();
|
||||
System.out.println("Done.");
|
||||
}
|
||||
|
||||
}
|
Binary file not shown.
9
pom.xml
9
pom.xml
|
@ -1,5 +1,5 @@
|
|||
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
|
||||
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
|
||||
<?xml version="1.0" encoding="UTF-8"?>
|
||||
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
|
||||
<modelVersion>4.0.0</modelVersion>
|
||||
<groupId>com.baeldung</groupId>
|
||||
<artifactId>parent-modules</artifactId>
|
||||
|
@ -143,6 +143,7 @@
|
|||
<module>xml</module>
|
||||
<module>xmlunit2</module>
|
||||
<module>xstream</module>
|
||||
</modules>
|
||||
<module>pdf</module>
|
||||
</modules>
|
||||
|
||||
</project>
|
||||
</project>
|
Loading…
Reference in New Issue