commit
95a37ccc10
25
apache-tika/pom.xml
Normal file
25
apache-tika/pom.xml
Normal file
@ -0,0 +1,25 @@
|
|||||||
|
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
    xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
    <modelVersion>4.0.0</modelVersion>

    <!-- Parent POM this module inherits from. -->
    <parent>
        <groupId>com.baeldung</groupId>
        <artifactId>parent-modules</artifactId>
        <version>1.0.0-SNAPSHOT</version>
    </parent>

    <!-- Coordinates of this module. -->
    <groupId>com.baeldung</groupId>
    <artifactId>apache-tika</artifactId>
    <version>0.0.1-SNAPSHOT</version>

    <!-- Single place to bump the Tika version referenced below. -->
    <properties>
        <tika.version>1.17</tika.version>
    </properties>

    <dependencies>
        <!-- Tika parsers artifact, pinned through the tika.version property. -->
        <dependency>
            <groupId>org.apache.tika</groupId>
            <artifactId>tika-parsers</artifactId>
            <version>${tika.version}</version>
        </dependency>
    </dependencies>
</project>
|
@ -0,0 +1,67 @@
|
|||||||
|
package com.baeldung.tika;
|
||||||
|
|
||||||
|
import java.io.IOException;
|
||||||
|
import java.io.InputStream;
|
||||||
|
|
||||||
|
import org.apache.tika.Tika;
|
||||||
|
import org.apache.tika.detect.DefaultDetector;
|
||||||
|
import org.apache.tika.detect.Detector;
|
||||||
|
import org.apache.tika.exception.TikaException;
|
||||||
|
import org.apache.tika.metadata.Metadata;
|
||||||
|
import org.apache.tika.mime.MediaType;
|
||||||
|
import org.apache.tika.parser.AutoDetectParser;
|
||||||
|
import org.apache.tika.parser.ParseContext;
|
||||||
|
import org.apache.tika.parser.Parser;
|
||||||
|
import org.apache.tika.sax.BodyContentHandler;
|
||||||
|
import org.xml.sax.ContentHandler;
|
||||||
|
import org.xml.sax.SAXException;
|
||||||
|
|
||||||
|
public class TikaAnalysis {
|
||||||
|
public static String detectDocTypeUsingDetector(InputStream stream) throws IOException {
|
||||||
|
Detector detector = new DefaultDetector();
|
||||||
|
Metadata metadata = new Metadata();
|
||||||
|
|
||||||
|
MediaType mediaType = detector.detect(stream, metadata);
|
||||||
|
return mediaType.toString();
|
||||||
|
}
|
||||||
|
|
||||||
|
public static String detectDocTypeUsingFacade(InputStream stream) throws IOException {
|
||||||
|
Tika tika = new Tika();
|
||||||
|
String mediaType = tika.detect(stream);
|
||||||
|
return mediaType;
|
||||||
|
}
|
||||||
|
|
||||||
|
public static String extractContentUsingParser(InputStream stream) throws IOException, TikaException, SAXException {
|
||||||
|
Parser parser = new AutoDetectParser();
|
||||||
|
ContentHandler handler = new BodyContentHandler();
|
||||||
|
Metadata metadata = new Metadata();
|
||||||
|
ParseContext context = new ParseContext();
|
||||||
|
|
||||||
|
parser.parse(stream, handler, metadata, context);
|
||||||
|
return handler.toString();
|
||||||
|
}
|
||||||
|
|
||||||
|
public static String extractContentUsingFacade(InputStream stream) throws IOException, TikaException {
|
||||||
|
Tika tika = new Tika();
|
||||||
|
String content = tika.parseToString(stream);
|
||||||
|
return content;
|
||||||
|
}
|
||||||
|
|
||||||
|
public static Metadata extractMetadatatUsingParser(InputStream stream) throws IOException, SAXException, TikaException {
|
||||||
|
Parser parser = new AutoDetectParser();
|
||||||
|
ContentHandler handler = new BodyContentHandler();
|
||||||
|
Metadata metadata = new Metadata();
|
||||||
|
ParseContext context = new ParseContext();
|
||||||
|
|
||||||
|
parser.parse(stream, handler, metadata, context);
|
||||||
|
return metadata;
|
||||||
|
}
|
||||||
|
|
||||||
|
public static Metadata extractMetadatatUsingFacade(InputStream stream) throws IOException, TikaException {
|
||||||
|
Tika tika = new Tika();
|
||||||
|
Metadata metadata = new Metadata();
|
||||||
|
|
||||||
|
tika.parse(stream, metadata);
|
||||||
|
return metadata;
|
||||||
|
}
|
||||||
|
}
|
@ -0,0 +1,79 @@
|
|||||||
|
package com.baeldung.tika;
|
||||||
|
|
||||||
|
import static org.hamcrest.CoreMatchers.containsString;
|
||||||
|
import static org.junit.Assert.assertEquals;
|
||||||
|
import static org.junit.Assert.assertThat;
|
||||||
|
|
||||||
|
import java.io.IOException;
|
||||||
|
import java.io.InputStream;
|
||||||
|
|
||||||
|
import org.apache.tika.exception.TikaException;
|
||||||
|
import org.apache.tika.metadata.Metadata;
|
||||||
|
import org.junit.Test;
|
||||||
|
import org.xml.sax.SAXException;
|
||||||
|
|
||||||
|
public class TikaUnitTest {
|
||||||
|
@Test
|
||||||
|
public void whenUsingDetector_thenDocumentTypeIsReturned() throws IOException {
|
||||||
|
InputStream stream = this.getClass().getClassLoader().getResourceAsStream("tika.txt");
|
||||||
|
String mediaType = TikaAnalysis.detectDocTypeUsingDetector(stream);
|
||||||
|
|
||||||
|
assertEquals("application/pdf", mediaType);
|
||||||
|
|
||||||
|
stream.close();
|
||||||
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
public void whenUsingFacade_thenDocumentTypeIsReturned() throws IOException {
|
||||||
|
InputStream stream = this.getClass().getClassLoader().getResourceAsStream("tika.txt");
|
||||||
|
String mediaType = TikaAnalysis.detectDocTypeUsingFacade(stream);
|
||||||
|
|
||||||
|
assertEquals("application/pdf", mediaType);
|
||||||
|
|
||||||
|
stream.close();
|
||||||
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
public void whenUsingParser_thenContentIsReturned() throws IOException, TikaException, SAXException {
|
||||||
|
InputStream stream = this.getClass().getClassLoader().getResourceAsStream("tika.docx");
|
||||||
|
String content = TikaAnalysis.extractContentUsingParser(stream);
|
||||||
|
|
||||||
|
assertThat(content, containsString("Apache Tika - a content analysis toolkit"));
|
||||||
|
assertThat(content, containsString("detects and extracts metadata and text"));
|
||||||
|
|
||||||
|
stream.close();
|
||||||
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
public void whenUsingFacade_thenContentIsReturned() throws IOException, TikaException {
|
||||||
|
InputStream stream = this.getClass().getClassLoader().getResourceAsStream("tika.docx");
|
||||||
|
String content = TikaAnalysis.extractContentUsingFacade(stream);
|
||||||
|
|
||||||
|
assertThat(content, containsString("Apache Tika - a content analysis toolkit"));
|
||||||
|
assertThat(content, containsString("detects and extracts metadata and text"));
|
||||||
|
|
||||||
|
stream.close();
|
||||||
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
public void whenUsingParser_thenMetadataIsReturned() throws IOException, TikaException, SAXException {
|
||||||
|
InputStream stream = this.getClass().getClassLoader().getResourceAsStream("tika.xlsx");
|
||||||
|
Metadata metadata = TikaAnalysis.extractMetadatatUsingParser(stream);
|
||||||
|
|
||||||
|
assertEquals("org.apache.tika.parser.DefaultParser", metadata.get("X-Parsed-By"));
|
||||||
|
assertEquals("Microsoft Office User", metadata.get("Author"));
|
||||||
|
|
||||||
|
stream.close();
|
||||||
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
public void whenUsingFacade_thenMetadataIsReturned() throws IOException, TikaException {
|
||||||
|
InputStream stream = this.getClass().getClassLoader().getResourceAsStream("tika.xlsx");
|
||||||
|
Metadata metadata = TikaAnalysis.extractMetadatatUsingFacade(stream);
|
||||||
|
|
||||||
|
assertEquals("org.apache.tika.parser.DefaultParser", metadata.get("X-Parsed-By"));
|
||||||
|
assertEquals("Microsoft Office User", metadata.get("Author"));
|
||||||
|
|
||||||
|
stream.close();
|
||||||
|
}
|
||||||
|
}
|
BIN
apache-tika/src/test/resources/tika.docx
Normal file
BIN
apache-tika/src/test/resources/tika.docx
Normal file
Binary file not shown.
BIN
apache-tika/src/test/resources/tika.txt
Normal file
BIN
apache-tika/src/test/resources/tika.txt
Normal file
Binary file not shown.
BIN
apache-tika/src/test/resources/tika.xlsx
Normal file
BIN
apache-tika/src/test/resources/tika.xlsx
Normal file
Binary file not shown.
1
pom.xml
1
pom.xml
@ -38,6 +38,7 @@
|
|||||||
<module>apache-cxf</module>
|
<module>apache-cxf</module>
|
||||||
<module>apache-fop</module>
|
<module>apache-fop</module>
|
||||||
<module>apache-poi</module>
|
<module>apache-poi</module>
|
||||||
|
<module>apache-tika</module>
|
||||||
<module>apache-thrift</module>
|
<module>apache-thrift</module>
|
||||||
<module>apache-curator</module>
|
<module>apache-curator</module>
|
||||||
<module>apache-zookeeper</module>
|
<module>apache-zookeeper</module>
|
||||||
|
Loading…
x
Reference in New Issue
Block a user