From 43ff38c5dabcd862e260db28efd951d077cdf777 Mon Sep 17 00:00:00 2001 From: Tal Levy Date: Fri, 15 Dec 2017 13:47:26 -0800 Subject: [PATCH] update ingest-attachment to use Tika 1.17 and newer deps (#27824) - this pr updates tika and its dependencies - updates the SHAs - updates the class excludes --- plugins/ingest-attachment/build.gradle | 113 +++++++++------- .../apache-mime4j-core-0.7.2.jar.sha1 | 1 - .../apache-mime4j-core-0.8.1.jar.sha1 | 1 + .../licenses/apache-mime4j-dom-0.7.2.jar.sha1 | 1 - .../licenses/apache-mime4j-dom-0.8.1.jar.sha1 | 1 + .../licenses/commons-io-2.4.jar.sha1 | 1 - .../licenses/commons-io-2.5.jar.sha1 | 1 + .../licenses/fontbox-2.0.3.jar.sha1 | 1 - .../licenses/fontbox-2.0.8.jar.sha1 | 1 + .../licenses/jempbox-1.8.12.jar.sha1 | 1 - .../licenses/jempbox-1.8.13.jar.sha1 | 1 + .../licenses/pdfbox-2.0.3.jar.sha1 | 1 - .../licenses/pdfbox-2.0.8.jar.sha1 | 1 + .../licenses/poi-3.16.jar.sha1 | 1 - .../licenses/poi-3.17.jar.sha1 | 1 + .../licenses/poi-ooxml-3.16.jar.sha1 | 1 - .../licenses/poi-ooxml-3.17.jar.sha1 | 1 + .../licenses/poi-ooxml-schemas-3.16.jar.sha1 | 1 - .../licenses/poi-ooxml-schemas-3.17.jar.sha1 | 1 + .../licenses/poi-scratchpad-3.16.jar.sha1 | 1 - .../licenses/poi-scratchpad-3.17.jar.sha1 | 1 + .../licenses/tika-core-1.15.jar.sha1 | 1 - .../licenses/tika-core-1.17.jar.sha1 | 1 + .../licenses/tika-parsers-1.15.jar.sha1 | 1 - .../licenses/tika-parsers-1.17.jar.sha1 | 1 + .../attachment/AttachmentProcessor.java | 123 +++++++++--------- 26 files changed, 144 insertions(+), 116 deletions(-) delete mode 100644 plugins/ingest-attachment/licenses/apache-mime4j-core-0.7.2.jar.sha1 create mode 100644 plugins/ingest-attachment/licenses/apache-mime4j-core-0.8.1.jar.sha1 delete mode 100644 plugins/ingest-attachment/licenses/apache-mime4j-dom-0.7.2.jar.sha1 create mode 100644 plugins/ingest-attachment/licenses/apache-mime4j-dom-0.8.1.jar.sha1 delete mode 100644 plugins/ingest-attachment/licenses/commons-io-2.4.jar.sha1 create mode 100644 plugins/ingest-attachment/licenses/commons-io-2.5.jar.sha1 delete mode 100644 plugins/ingest-attachment/licenses/fontbox-2.0.3.jar.sha1 create mode 100644 plugins/ingest-attachment/licenses/fontbox-2.0.8.jar.sha1 delete mode 100644 plugins/ingest-attachment/licenses/jempbox-1.8.12.jar.sha1 create mode 100644 plugins/ingest-attachment/licenses/jempbox-1.8.13.jar.sha1 delete mode 100644 plugins/ingest-attachment/licenses/pdfbox-2.0.3.jar.sha1 create mode 100644 plugins/ingest-attachment/licenses/pdfbox-2.0.8.jar.sha1 delete mode 100644 plugins/ingest-attachment/licenses/poi-3.16.jar.sha1 create mode 100644 plugins/ingest-attachment/licenses/poi-3.17.jar.sha1 delete mode 100644 plugins/ingest-attachment/licenses/poi-ooxml-3.16.jar.sha1 create mode 100644 plugins/ingest-attachment/licenses/poi-ooxml-3.17.jar.sha1 delete mode 100644 plugins/ingest-attachment/licenses/poi-ooxml-schemas-3.16.jar.sha1 create mode 100644 plugins/ingest-attachment/licenses/poi-ooxml-schemas-3.17.jar.sha1 delete mode 100644 plugins/ingest-attachment/licenses/poi-scratchpad-3.16.jar.sha1 create mode 100644 plugins/ingest-attachment/licenses/poi-scratchpad-3.17.jar.sha1 delete mode 100644 plugins/ingest-attachment/licenses/tika-core-1.15.jar.sha1 create mode 100644 plugins/ingest-attachment/licenses/tika-core-1.17.jar.sha1 delete mode 100644 plugins/ingest-attachment/licenses/tika-parsers-1.15.jar.sha1 create mode 100644 plugins/ingest-attachment/licenses/tika-parsers-1.17.jar.sha1 diff --git a/plugins/ingest-attachment/build.gradle b/plugins/ingest-attachment/build.gradle index b79501966e3..a57d8f880bc 100644 --- a/plugins/ingest-attachment/build.gradle +++ b/plugins/ingest-attachment/build.gradle @@ -23,11 +23,11 @@ esplugin { } versions << [ - 'tika': '1.15', - 'pdfbox': '2.0.3', + 'tika': '1.17', + 'pdfbox': '2.0.8', 'bouncycastle': '1.55', - 'poi': '3.16', - 'mime4j': '0.7.2' + 'poi': '3.17', + 'mime4j': '0.8.1' ] dependencies { @@ -35,7 +35,7 @@ dependencies { compile "org.apache.tika:tika-core:${versions.tika}" compile "org.apache.tika:tika-parsers:${versions.tika}" compile 'org.tukaani:xz:1.6' - compile 'commons-io:commons-io:2.4' + compile 'commons-io:commons-io:2.5' compile "org.slf4j:slf4j-api:${versions.slf4j}" // character set detection @@ -47,7 +47,7 @@ dependencies { // Adobe PDF compile "org.apache.pdfbox:pdfbox:${versions.pdfbox}" compile "org.apache.pdfbox:fontbox:${versions.pdfbox}" - compile "org.apache.pdfbox:jempbox:1.8.12" + compile "org.apache.pdfbox:jempbox:1.8.13" compile "commons-logging:commons-logging:${versions.commonslogging}" compile "org.bouncycastle:bcmail-jdk15on:${versions.bouncycastle}" compile "org.bouncycastle:bcprov-jdk15on:${versions.bouncycastle}" @@ -546,6 +546,7 @@ thirdPartyAudit.excludes = [ 'org.apache.http.client.utils.URIBuilder', 'org.apache.http.entity.ByteArrayEntity', 'org.apache.http.impl.client.DefaultHttpClient', + 'org.apache.jcp.xml.dsig.internal.dom.ApacheNodeSetData', 'org.apache.jcp.xml.dsig.internal.dom.DOMDigestMethod', 'org.apache.jcp.xml.dsig.internal.dom.DOMKeyInfo', 'org.apache.jcp.xml.dsig.internal.dom.DOMReference', @@ -588,6 +589,7 @@ thirdPartyAudit.excludes = [ 'org.apache.uima.util.XmlCasSerializer', 'org.apache.xml.security.Init', 'org.apache.xml.security.c14n.Canonicalizer', + 'org.apache.xml.security.signature.XMLSignatureInput', 'org.apache.xml.security.utils.Base64', 'org.brotli.dec.BrotliInputStream', 'org.etsi.uri.x01903.v13.AnyType', @@ -635,11 +637,9 @@ thirdPartyAudit.excludes = [ 'org.etsi.uri.x01903.v14.ValidationDataType', 'org.json.JSONArray', 'org.json.JSONObject', - 'org.json.XML', 'org.json.simple.JSONArray', 'org.json.simple.JSONObject', 'org.json.simple.parser.JSONParser', - 'org.junit.Assert', 'org.junit.Test', 'org.junit.internal.TextListener', 'org.junit.runner.JUnitCore', @@ -690,7 +690,6 @@ thirdPartyAudit.excludes = [ 'org.openxmlformats.schemas.drawingml.x2006.chart.CTDLbls', 'org.openxmlformats.schemas.drawingml.x2006.chart.CTDPt', 'org.openxmlformats.schemas.drawingml.x2006.chart.CTDTable', - 'org.openxmlformats.schemas.drawingml.x2006.chart.CTDateAx', 'org.openxmlformats.schemas.drawingml.x2006.chart.CTDispBlanksAs', 'org.openxmlformats.schemas.drawingml.x2006.chart.CTDispUnits', 'org.openxmlformats.schemas.drawingml.x2006.chart.CTDoughnutChart', @@ -720,6 +719,7 @@ thirdPartyAudit.excludes = [ 'org.openxmlformats.schemas.drawingml.x2006.chart.CTSurface3DChart', 'org.openxmlformats.schemas.drawingml.x2006.chart.CTSurfaceChart', 'org.openxmlformats.schemas.drawingml.x2006.chart.CTTextLanguageID', + 'org.openxmlformats.schemas.drawingml.x2006.chart.CTTimeUnit', 'org.openxmlformats.schemas.drawingml.x2006.chart.CTTrendline', 'org.openxmlformats.schemas.drawingml.x2006.chart.CTUpDownBars', 'org.openxmlformats.schemas.drawingml.x2006.chart.CTView3D', @@ -828,8 +828,8 @@ thirdPartyAudit.excludes = [ 'org.openxmlformats.schemas.drawingml.x2006.main.STOnOffStyleType$Enum', 'org.openxmlformats.schemas.drawingml.x2006.main.STPanose', 'org.openxmlformats.schemas.drawingml.x2006.main.STPathFillMode', - 'org.openxmlformats.schemas.drawingml.x2006.main.STPresetPatternVal', 'org.openxmlformats.schemas.drawingml.x2006.main.STPresetPatternVal$Enum', + 'org.openxmlformats.schemas.drawingml.x2006.main.STPresetPatternVal', 'org.openxmlformats.schemas.drawingml.x2006.main.STRectAlignment', 'org.openxmlformats.schemas.drawingml.x2006.main.STTextColumnCount', 'org.openxmlformats.schemas.drawingml.x2006.main.STTextNonNegativePoint', @@ -937,34 +937,6 @@ thirdPartyAudit.excludes = [ 'org.openxmlformats.schemas.drawingml.x2006.main.impl.CTPresetColorImpl$1SatOffList', 'org.openxmlformats.schemas.drawingml.x2006.main.impl.CTPresetColorImpl$1ShadeList', 'org.openxmlformats.schemas.drawingml.x2006.main.impl.CTPresetColorImpl$1TintList', - 'org.openxmlformats.schemas.drawingml.x2006.main.impl.CTScRgbColorImpl$1AlphaList', - 'org.openxmlformats.schemas.drawingml.x2006.main.impl.CTScRgbColorImpl$1AlphaModList', - 'org.openxmlformats.schemas.drawingml.x2006.main.impl.CTScRgbColorImpl$1AlphaOffList', - 'org.openxmlformats.schemas.drawingml.x2006.main.impl.CTScRgbColorImpl$1BlueList', - 'org.openxmlformats.schemas.drawingml.x2006.main.impl.CTScRgbColorImpl$1BlueModList', - 'org.openxmlformats.schemas.drawingml.x2006.main.impl.CTScRgbColorImpl$1BlueOffList', - 'org.openxmlformats.schemas.drawingml.x2006.main.impl.CTScRgbColorImpl$1CompList', - 'org.openxmlformats.schemas.drawingml.x2006.main.impl.CTScRgbColorImpl$1GammaList', - 'org.openxmlformats.schemas.drawingml.x2006.main.impl.CTScRgbColorImpl$1GrayList', - 'org.openxmlformats.schemas.drawingml.x2006.main.impl.CTScRgbColorImpl$1GreenList', - 'org.openxmlformats.schemas.drawingml.x2006.main.impl.CTScRgbColorImpl$1GreenModList', - 'org.openxmlformats.schemas.drawingml.x2006.main.impl.CTScRgbColorImpl$1GreenOffList', - 'org.openxmlformats.schemas.drawingml.x2006.main.impl.CTScRgbColorImpl$1HueList', - 'org.openxmlformats.schemas.drawingml.x2006.main.impl.CTScRgbColorImpl$1HueModList', - 'org.openxmlformats.schemas.drawingml.x2006.main.impl.CTScRgbColorImpl$1HueOffList', - 'org.openxmlformats.schemas.drawingml.x2006.main.impl.CTScRgbColorImpl$1InvGammaList', - 'org.openxmlformats.schemas.drawingml.x2006.main.impl.CTScRgbColorImpl$1InvList', - 'org.openxmlformats.schemas.drawingml.x2006.main.impl.CTScRgbColorImpl$1LumList', - 'org.openxmlformats.schemas.drawingml.x2006.main.impl.CTScRgbColorImpl$1LumModList', - 'org.openxmlformats.schemas.drawingml.x2006.main.impl.CTScRgbColorImpl$1LumOffList', - 'org.openxmlformats.schemas.drawingml.x2006.main.impl.CTScRgbColorImpl$1RedList', - 'org.openxmlformats.schemas.drawingml.x2006.main.impl.CTScRgbColorImpl$1RedModList', - 'org.openxmlformats.schemas.drawingml.x2006.main.impl.CTScRgbColorImpl$1RedOffList', - 'org.openxmlformats.schemas.drawingml.x2006.main.impl.CTScRgbColorImpl$1SatList', - 'org.openxmlformats.schemas.drawingml.x2006.main.impl.CTScRgbColorImpl$1SatModList', - 'org.openxmlformats.schemas.drawingml.x2006.main.impl.CTScRgbColorImpl$1SatOffList', - 'org.openxmlformats.schemas.drawingml.x2006.main.impl.CTScRgbColorImpl$1ShadeList', - 'org.openxmlformats.schemas.drawingml.x2006.main.impl.CTScRgbColorImpl$1TintList', 'org.openxmlformats.schemas.drawingml.x2006.main.impl.CTSRgbColorImpl$1AlphaList', 'org.openxmlformats.schemas.drawingml.x2006.main.impl.CTSRgbColorImpl$1AlphaModList', 'org.openxmlformats.schemas.drawingml.x2006.main.impl.CTSRgbColorImpl$1AlphaOffList', @@ -993,6 +965,34 @@ thirdPartyAudit.excludes = [ 'org.openxmlformats.schemas.drawingml.x2006.main.impl.CTSRgbColorImpl$1SatOffList', 'org.openxmlformats.schemas.drawingml.x2006.main.impl.CTSRgbColorImpl$1ShadeList', 'org.openxmlformats.schemas.drawingml.x2006.main.impl.CTSRgbColorImpl$1TintList', + 'org.openxmlformats.schemas.drawingml.x2006.main.impl.CTScRgbColorImpl$1AlphaList', + 'org.openxmlformats.schemas.drawingml.x2006.main.impl.CTScRgbColorImpl$1AlphaModList', + 'org.openxmlformats.schemas.drawingml.x2006.main.impl.CTScRgbColorImpl$1AlphaOffList', + 'org.openxmlformats.schemas.drawingml.x2006.main.impl.CTScRgbColorImpl$1BlueList', + 'org.openxmlformats.schemas.drawingml.x2006.main.impl.CTScRgbColorImpl$1BlueModList', + 'org.openxmlformats.schemas.drawingml.x2006.main.impl.CTScRgbColorImpl$1BlueOffList', + 'org.openxmlformats.schemas.drawingml.x2006.main.impl.CTScRgbColorImpl$1CompList', + 'org.openxmlformats.schemas.drawingml.x2006.main.impl.CTScRgbColorImpl$1GammaList', + 'org.openxmlformats.schemas.drawingml.x2006.main.impl.CTScRgbColorImpl$1GrayList', + 'org.openxmlformats.schemas.drawingml.x2006.main.impl.CTScRgbColorImpl$1GreenList', + 'org.openxmlformats.schemas.drawingml.x2006.main.impl.CTScRgbColorImpl$1GreenModList', + 'org.openxmlformats.schemas.drawingml.x2006.main.impl.CTScRgbColorImpl$1GreenOffList', + 'org.openxmlformats.schemas.drawingml.x2006.main.impl.CTScRgbColorImpl$1HueList', + 'org.openxmlformats.schemas.drawingml.x2006.main.impl.CTScRgbColorImpl$1HueModList', + 'org.openxmlformats.schemas.drawingml.x2006.main.impl.CTScRgbColorImpl$1HueOffList', + 'org.openxmlformats.schemas.drawingml.x2006.main.impl.CTScRgbColorImpl$1InvGammaList', + 'org.openxmlformats.schemas.drawingml.x2006.main.impl.CTScRgbColorImpl$1InvList', + 'org.openxmlformats.schemas.drawingml.x2006.main.impl.CTScRgbColorImpl$1LumList', + 'org.openxmlformats.schemas.drawingml.x2006.main.impl.CTScRgbColorImpl$1LumModList', + 'org.openxmlformats.schemas.drawingml.x2006.main.impl.CTScRgbColorImpl$1LumOffList', + 'org.openxmlformats.schemas.drawingml.x2006.main.impl.CTScRgbColorImpl$1RedList', + 'org.openxmlformats.schemas.drawingml.x2006.main.impl.CTScRgbColorImpl$1RedModList', + 'org.openxmlformats.schemas.drawingml.x2006.main.impl.CTScRgbColorImpl$1RedOffList', + 'org.openxmlformats.schemas.drawingml.x2006.main.impl.CTScRgbColorImpl$1SatList', + 'org.openxmlformats.schemas.drawingml.x2006.main.impl.CTScRgbColorImpl$1SatModList', + 'org.openxmlformats.schemas.drawingml.x2006.main.impl.CTScRgbColorImpl$1SatOffList', + 'org.openxmlformats.schemas.drawingml.x2006.main.impl.CTScRgbColorImpl$1ShadeList', + 'org.openxmlformats.schemas.drawingml.x2006.main.impl.CTScRgbColorImpl$1TintList', 'org.openxmlformats.schemas.drawingml.x2006.main.impl.CTSchemeColorImpl$1AlphaList', 'org.openxmlformats.schemas.drawingml.x2006.main.impl.CTSchemeColorImpl$1AlphaModList', 'org.openxmlformats.schemas.drawingml.x2006.main.impl.CTSchemeColorImpl$1AlphaOffList', @@ -1058,7 +1058,6 @@ thirdPartyAudit.excludes = [ 'org.openxmlformats.schemas.drawingml.x2006.main.impl.CTTextParagraphImpl$1FldList', 'org.openxmlformats.schemas.drawingml.x2006.main.impl.CTTextParagraphImpl$1RList', 'org.openxmlformats.schemas.drawingml.x2006.main.impl.CTTextTabStopListImpl$1TabList', - 'org.openxmlformats.schemas.drawingml.x2006.spreadsheetDrawing.CTAbsoluteAnchor', 'org.openxmlformats.schemas.drawingml.x2006.spreadsheetDrawing.impl.CTDrawingImpl$1AbsoluteAnchorList', 'org.openxmlformats.schemas.drawingml.x2006.spreadsheetDrawing.impl.CTDrawingImpl$1OneCellAnchorList', 'org.openxmlformats.schemas.drawingml.x2006.spreadsheetDrawing.impl.CTDrawingImpl$1TwoCellAnchorList', @@ -1183,7 +1182,6 @@ thirdPartyAudit.excludes = [ 'org.openxmlformats.schemas.spreadsheetml.x2006.main.CTChartsheetViews', 'org.openxmlformats.schemas.spreadsheetml.x2006.main.CTColHierarchiesUsage', 'org.openxmlformats.schemas.spreadsheetml.x2006.main.CTColItems', - 'org.openxmlformats.schemas.spreadsheetml.x2006.main.CTColors', 'org.openxmlformats.schemas.spreadsheetml.x2006.main.CTConditionalFormats', 'org.openxmlformats.schemas.spreadsheetml.x2006.main.CTConsolidation', 'org.openxmlformats.schemas.spreadsheetml.x2006.main.CTControls', @@ -1208,6 +1206,7 @@ thirdPartyAudit.excludes = [ 'org.openxmlformats.schemas.spreadsheetml.x2006.main.CTFormats', 'org.openxmlformats.schemas.spreadsheetml.x2006.main.CTFunctionGroups', 'org.openxmlformats.schemas.spreadsheetml.x2006.main.CTGradientFill', + 'org.openxmlformats.schemas.spreadsheetml.x2006.main.CTMRUColors', 'org.openxmlformats.schemas.spreadsheetml.x2006.main.CTMeasureDimensionMaps', 'org.openxmlformats.schemas.spreadsheetml.x2006.main.CTMeasureGroups', 'org.openxmlformats.schemas.spreadsheetml.x2006.main.CTMissing', @@ -1231,7 +1230,6 @@ thirdPartyAudit.excludes = [ 'org.openxmlformats.schemas.spreadsheetml.x2006.main.CTSortState', 'org.openxmlformats.schemas.spreadsheetml.x2006.main.CTString', 'org.openxmlformats.schemas.spreadsheetml.x2006.main.CTTableFormula', - 'org.openxmlformats.schemas.spreadsheetml.x2006.main.CTTableStyles', 'org.openxmlformats.schemas.spreadsheetml.x2006.main.CTTupleCache', 'org.openxmlformats.schemas.spreadsheetml.x2006.main.CTWebPublishItems', 'org.openxmlformats.schemas.spreadsheetml.x2006.main.CTWebPublishObjects', @@ -1333,14 +1331,14 @@ thirdPartyAudit.excludes = [ 'org.openxmlformats.schemas.spreadsheetml.x2006.main.impl.CTSharedItemsImpl$1NList', 'org.openxmlformats.schemas.spreadsheetml.x2006.main.impl.CTSharedItemsImpl$1SList', 'org.openxmlformats.schemas.spreadsheetml.x2006.main.impl.CTSheetDataImpl$1RowList', - 'org.openxmlformats.schemas.spreadsheetml.x2006.main.impl.CTSheetsImpl$1SheetList', 'org.openxmlformats.schemas.spreadsheetml.x2006.main.impl.CTSheetViewImpl$1PivotSelectionList', 'org.openxmlformats.schemas.spreadsheetml.x2006.main.impl.CTSheetViewImpl$1SelectionList', 'org.openxmlformats.schemas.spreadsheetml.x2006.main.impl.CTSheetViewsImpl$1SheetViewList', + 'org.openxmlformats.schemas.spreadsheetml.x2006.main.impl.CTSheetsImpl$1SheetList', 'org.openxmlformats.schemas.spreadsheetml.x2006.main.impl.CTSingleXmlCellsImpl$1SingleXmlCellList', 'org.openxmlformats.schemas.spreadsheetml.x2006.main.impl.CTSstImpl$1SiList', - 'org.openxmlformats.schemas.spreadsheetml.x2006.main.impl.CTTableColumnsImpl$1TableColumnList', 'org.openxmlformats.schemas.spreadsheetml.x2006.main.impl.CTTablePartsImpl$1TablePartList', + 'org.openxmlformats.schemas.spreadsheetml.x2006.main.impl.CTTableStylesImpl$1TableStyleList', 'org.openxmlformats.schemas.spreadsheetml.x2006.main.impl.CTWorkbookImpl$1FileRecoveryPrList', 'org.openxmlformats.schemas.spreadsheetml.x2006.main.impl.CTWorksheetImpl$1ColsList', 'org.openxmlformats.schemas.spreadsheetml.x2006.main.impl.CTWorksheetImpl$1ConditionalFormattingList', @@ -1400,7 +1398,6 @@ thirdPartyAudit.excludes = [ 'org.openxmlformats.schemas.wordprocessingml.x2006.main.CTProof', 'org.openxmlformats.schemas.wordprocessingml.x2006.main.CTRPrChange', 'org.openxmlformats.schemas.wordprocessingml.x2006.main.CTReadingModeInkLockDown', - 'org.openxmlformats.schemas.wordprocessingml.x2006.main.CTRuby', 'org.openxmlformats.schemas.wordprocessingml.x2006.main.CTSaveThroughXslt', 'org.openxmlformats.schemas.wordprocessingml.x2006.main.CTSdtComboBox', 'org.openxmlformats.schemas.wordprocessingml.x2006.main.CTSdtDate', @@ -1441,6 +1438,7 @@ thirdPartyAudit.excludes = [ 'org.openxmlformats.schemas.wordprocessingml.x2006.main.STPTabLeader', 'org.openxmlformats.schemas.wordprocessingml.x2006.main.STPTabRelativeTo', 'org.openxmlformats.schemas.wordprocessingml.x2006.main.STProofErr', + 'org.openxmlformats.schemas.wordprocessingml.x2006.main.STRubyAlign', 'org.openxmlformats.schemas.wordprocessingml.x2006.main.STShortHexNumber', 'org.openxmlformats.schemas.wordprocessingml.x2006.main.STThemeColor', 'org.openxmlformats.schemas.wordprocessingml.x2006.main.STUcharHexNumber', @@ -1708,6 +1706,32 @@ thirdPartyAudit.excludes = [ 'org.openxmlformats.schemas.wordprocessingml.x2006.main.impl.CTRowImpl$1ProofErrList', 'org.openxmlformats.schemas.wordprocessingml.x2006.main.impl.CTRowImpl$1SdtList', 'org.openxmlformats.schemas.wordprocessingml.x2006.main.impl.CTRowImpl$1TcList', + 'org.openxmlformats.schemas.wordprocessingml.x2006.main.impl.CTRubyContentImpl$1BookmarkEndList', + 'org.openxmlformats.schemas.wordprocessingml.x2006.main.impl.CTRubyContentImpl$1BookmarkStartList', + 'org.openxmlformats.schemas.wordprocessingml.x2006.main.impl.CTRubyContentImpl$1CommentRangeEndList', + 'org.openxmlformats.schemas.wordprocessingml.x2006.main.impl.CTRubyContentImpl$1CommentRangeStartList', + 'org.openxmlformats.schemas.wordprocessingml.x2006.main.impl.CTRubyContentImpl$1CustomXmlDelRangeEndList', + 'org.openxmlformats.schemas.wordprocessingml.x2006.main.impl.CTRubyContentImpl$1CustomXmlDelRangeStartList', + 'org.openxmlformats.schemas.wordprocessingml.x2006.main.impl.CTRubyContentImpl$1CustomXmlInsRangeEndList', + 'org.openxmlformats.schemas.wordprocessingml.x2006.main.impl.CTRubyContentImpl$1CustomXmlInsRangeStartList', + 'org.openxmlformats.schemas.wordprocessingml.x2006.main.impl.CTRubyContentImpl$1CustomXmlMoveFromRangeEndList', + 'org.openxmlformats.schemas.wordprocessingml.x2006.main.impl.CTRubyContentImpl$1CustomXmlMoveFromRangeStartList', + 'org.openxmlformats.schemas.wordprocessingml.x2006.main.impl.CTRubyContentImpl$1CustomXmlMoveToRangeEndList', + 'org.openxmlformats.schemas.wordprocessingml.x2006.main.impl.CTRubyContentImpl$1CustomXmlMoveToRangeStartList', + 'org.openxmlformats.schemas.wordprocessingml.x2006.main.impl.CTRubyContentImpl$1DelList', + 'org.openxmlformats.schemas.wordprocessingml.x2006.main.impl.CTRubyContentImpl$1InsList', + 'org.openxmlformats.schemas.wordprocessingml.x2006.main.impl.CTRubyContentImpl$1MoveFromList', + 'org.openxmlformats.schemas.wordprocessingml.x2006.main.impl.CTRubyContentImpl$1MoveFromRangeEndList', + 'org.openxmlformats.schemas.wordprocessingml.x2006.main.impl.CTRubyContentImpl$1MoveFromRangeStartList', + 'org.openxmlformats.schemas.wordprocessingml.x2006.main.impl.CTRubyContentImpl$1MoveToList', + 'org.openxmlformats.schemas.wordprocessingml.x2006.main.impl.CTRubyContentImpl$1MoveToRangeEndList', + 'org.openxmlformats.schemas.wordprocessingml.x2006.main.impl.CTRubyContentImpl$1MoveToRangeStartList', + 'org.openxmlformats.schemas.wordprocessingml.x2006.main.impl.CTRubyContentImpl$1OMathList', + 'org.openxmlformats.schemas.wordprocessingml.x2006.main.impl.CTRubyContentImpl$1OMathParaList', + 'org.openxmlformats.schemas.wordprocessingml.x2006.main.impl.CTRubyContentImpl$1PermEndList', + 'org.openxmlformats.schemas.wordprocessingml.x2006.main.impl.CTRubyContentImpl$1PermStartList', + 'org.openxmlformats.schemas.wordprocessingml.x2006.main.impl.CTRubyContentImpl$1ProofErrList', + 'org.openxmlformats.schemas.wordprocessingml.x2006.main.impl.CTRubyContentImpl$1RList', 'org.openxmlformats.schemas.wordprocessingml.x2006.main.impl.CTRunTrackChangeImpl$1AccList', 'org.openxmlformats.schemas.wordprocessingml.x2006.main.impl.CTRunTrackChangeImpl$1BarList', 'org.openxmlformats.schemas.wordprocessingml.x2006.main.impl.CTRunTrackChangeImpl$1BookmarkEndList', @@ -2054,7 +2078,6 @@ thirdPartyAudit.excludes = [ 'org.sqlite.SQLiteConfig', 'org.w3.x2000.x09.xmldsig.KeyInfoType', 'org.w3.x2000.x09.xmldsig.SignatureMethodType', - 'org.w3.x2000.x09.xmldsig.SignatureValueType', 'org.w3.x2000.x09.xmldsig.TransformsType', 'org.w3.x2000.x09.xmldsig.impl.SignatureTypeImpl$1ObjectList', 'org.w3.x2000.x09.xmldsig.impl.SignedInfoTypeImpl$1ReferenceList', diff --git a/plugins/ingest-attachment/licenses/apache-mime4j-core-0.7.2.jar.sha1 b/plugins/ingest-attachment/licenses/apache-mime4j-core-0.7.2.jar.sha1 deleted file mode 100644 index 8210fc7fc16..00000000000 --- a/plugins/ingest-attachment/licenses/apache-mime4j-core-0.7.2.jar.sha1 +++ /dev/null @@ -1 +0,0 @@ -a81264fe0265ebe8fd1d8128aad06dc320de6eef \ No newline at end of file diff --git a/plugins/ingest-attachment/licenses/apache-mime4j-core-0.8.1.jar.sha1 b/plugins/ingest-attachment/licenses/apache-mime4j-core-0.8.1.jar.sha1 new file mode 100644 index 00000000000..6ae3e58d22b --- /dev/null +++ b/plugins/ingest-attachment/licenses/apache-mime4j-core-0.8.1.jar.sha1 @@ -0,0 +1 @@ +c62dfe18a3b827a2c626ade0ffba44562ddf3f61 \ No newline at end of file diff --git a/plugins/ingest-attachment/licenses/apache-mime4j-dom-0.7.2.jar.sha1 b/plugins/ingest-attachment/licenses/apache-mime4j-dom-0.7.2.jar.sha1 deleted file mode 100644 index 3ede85a0191..00000000000 --- a/plugins/ingest-attachment/licenses/apache-mime4j-dom-0.7.2.jar.sha1 +++ /dev/null @@ -1 +0,0 @@ -1c289aa264548a0a1f1b43685a9cb2ab23f67287 \ No newline at end of file diff --git a/plugins/ingest-attachment/licenses/apache-mime4j-dom-0.8.1.jar.sha1 b/plugins/ingest-attachment/licenses/apache-mime4j-dom-0.8.1.jar.sha1 new file mode 100644 index 00000000000..408dfe12ef2 --- /dev/null +++ b/plugins/ingest-attachment/licenses/apache-mime4j-dom-0.8.1.jar.sha1 @@ -0,0 +1 @@ +f2d653c617004193f3350330d907f77b60c88c56 \ No newline at end of file diff --git a/plugins/ingest-attachment/licenses/commons-io-2.4.jar.sha1 b/plugins/ingest-attachment/licenses/commons-io-2.4.jar.sha1 deleted file mode 100644 index 688318c938c..00000000000 --- a/plugins/ingest-attachment/licenses/commons-io-2.4.jar.sha1 +++ /dev/null @@ -1 +0,0 @@ -b1b6ea3b7e4aa4f492509a4952029cd8e48019ad diff --git a/plugins/ingest-attachment/licenses/commons-io-2.5.jar.sha1 b/plugins/ingest-attachment/licenses/commons-io-2.5.jar.sha1 new file mode 100644 index 00000000000..b7f1d93e897 --- /dev/null +++ b/plugins/ingest-attachment/licenses/commons-io-2.5.jar.sha1 @@ -0,0 +1 @@ +2852e6e05fbb95076fc091f6d1780f1f8fe35e0f \ No newline at end of file diff --git a/plugins/ingest-attachment/licenses/fontbox-2.0.3.jar.sha1 b/plugins/ingest-attachment/licenses/fontbox-2.0.3.jar.sha1 deleted file mode 100644 index e3ff3d39459..00000000000 --- a/plugins/ingest-attachment/licenses/fontbox-2.0.3.jar.sha1 +++ /dev/null @@ -1 +0,0 @@ -448ee588d0136121cf5c4dd397384cccb9db1ad7 \ No newline at end of file diff --git a/plugins/ingest-attachment/licenses/fontbox-2.0.8.jar.sha1 b/plugins/ingest-attachment/licenses/fontbox-2.0.8.jar.sha1 new file mode 100644 index 00000000000..f8abddbc755 --- /dev/null +++ b/plugins/ingest-attachment/licenses/fontbox-2.0.8.jar.sha1 @@ -0,0 +1 @@ +52f852fcfc7481d45efdffd224eb78b85981b17b \ No newline at end of file diff --git a/plugins/ingest-attachment/licenses/jempbox-1.8.12.jar.sha1 b/plugins/ingest-attachment/licenses/jempbox-1.8.12.jar.sha1 deleted file mode 100644 index 0e3dcf4573b..00000000000 --- a/plugins/ingest-attachment/licenses/jempbox-1.8.12.jar.sha1 +++ /dev/null @@ -1 +0,0 @@ -426450c573c19f6f2c751a7a52c11931b712c9f6 \ No newline at end of file diff --git a/plugins/ingest-attachment/licenses/jempbox-1.8.13.jar.sha1 b/plugins/ingest-attachment/licenses/jempbox-1.8.13.jar.sha1 new file mode 100644 index 00000000000..2593719dfb3 --- /dev/null +++ b/plugins/ingest-attachment/licenses/jempbox-1.8.13.jar.sha1 @@ -0,0 +1 @@ +a874cef0ed0e2a8c4cc5ed52c23ba3e6d78eca4e \ No newline at end of file diff --git a/plugins/ingest-attachment/licenses/pdfbox-2.0.3.jar.sha1 b/plugins/ingest-attachment/licenses/pdfbox-2.0.3.jar.sha1 deleted file mode 100644 index 807e2482ac2..00000000000 --- a/plugins/ingest-attachment/licenses/pdfbox-2.0.3.jar.sha1 +++ /dev/null @@ -1 +0,0 @@ -be7b09de93f7c7795c57f4fbf14db60ab93806b4 \ No newline at end of file diff --git a/plugins/ingest-attachment/licenses/pdfbox-2.0.8.jar.sha1 b/plugins/ingest-attachment/licenses/pdfbox-2.0.8.jar.sha1 new file mode 100644 index 00000000000..1c346871e21 --- /dev/null +++ b/plugins/ingest-attachment/licenses/pdfbox-2.0.8.jar.sha1 @@ -0,0 +1 @@ +17bdf273d66f3afe41eedb9d3ab6a7b819c44a0c \ No newline at end of file diff --git a/plugins/ingest-attachment/licenses/poi-3.16.jar.sha1 b/plugins/ingest-attachment/licenses/poi-3.16.jar.sha1 deleted file mode 100644 index 75cbf233847..00000000000 --- a/plugins/ingest-attachment/licenses/poi-3.16.jar.sha1 +++ /dev/null @@ -1 +0,0 @@ -ad21c123ee5d6b5b2a8f0d4ed23b3ffe6759a889 \ No newline at end of file diff --git a/plugins/ingest-attachment/licenses/poi-3.17.jar.sha1 b/plugins/ingest-attachment/licenses/poi-3.17.jar.sha1 new file mode 100644 index 00000000000..bd472c0bec7 --- /dev/null +++ b/plugins/ingest-attachment/licenses/poi-3.17.jar.sha1 @@ -0,0 +1 @@ +0ae92292a2043888b40d418da97dc0b669fde326 \ No newline at end of file diff --git a/plugins/ingest-attachment/licenses/poi-ooxml-3.16.jar.sha1 b/plugins/ingest-attachment/licenses/poi-ooxml-3.16.jar.sha1 deleted file mode 100644 index c2283c7fa1a..00000000000 --- a/plugins/ingest-attachment/licenses/poi-ooxml-3.16.jar.sha1 +++ /dev/null @@ -1 +0,0 @@ -76e20fe22404cc4da55ddfdaaaadee32bbfa3bdd \ No newline at end of file diff --git a/plugins/ingest-attachment/licenses/poi-ooxml-3.17.jar.sha1 b/plugins/ingest-attachment/licenses/poi-ooxml-3.17.jar.sha1 new file mode 100644 index 00000000000..37c5e068814 --- /dev/null +++ b/plugins/ingest-attachment/licenses/poi-ooxml-3.17.jar.sha1 @@ -0,0 +1 @@ +07d8c44407178b73246462842bf1e206e99c8e0a \ No newline at end of file diff --git a/plugins/ingest-attachment/licenses/poi-ooxml-schemas-3.16.jar.sha1 b/plugins/ingest-attachment/licenses/poi-ooxml-schemas-3.16.jar.sha1 deleted file mode 100644 index 8ddbb300169..00000000000 --- a/plugins/ingest-attachment/licenses/poi-ooxml-schemas-3.16.jar.sha1 +++ /dev/null @@ -1 +0,0 @@ -9828a49307fc6bebfd42185b677d88b6e4994c63 \ No newline at end of file diff --git a/plugins/ingest-attachment/licenses/poi-ooxml-schemas-3.17.jar.sha1 b/plugins/ingest-attachment/licenses/poi-ooxml-schemas-3.17.jar.sha1 new file mode 100644 index 00000000000..744e323e5d7 --- /dev/null +++ b/plugins/ingest-attachment/licenses/poi-ooxml-schemas-3.17.jar.sha1 @@ -0,0 +1 @@ +890114bfa82f5b6380ea0e9b0bf49b0af797b414 \ No newline at end of file diff --git a/plugins/ingest-attachment/licenses/poi-scratchpad-3.16.jar.sha1 b/plugins/ingest-attachment/licenses/poi-scratchpad-3.16.jar.sha1 deleted file mode 100644 index 8dc53c0bfbc..00000000000 --- a/plugins/ingest-attachment/licenses/poi-scratchpad-3.16.jar.sha1 +++ /dev/null @@ -1 +0,0 @@ -69d6dda524e38a491b362d0f94ef74a514faf70a \ No newline at end of file diff --git a/plugins/ingest-attachment/licenses/poi-scratchpad-3.17.jar.sha1 b/plugins/ingest-attachment/licenses/poi-scratchpad-3.17.jar.sha1 new file mode 100644 index 00000000000..16686b3e89b --- /dev/null +++ b/plugins/ingest-attachment/licenses/poi-scratchpad-3.17.jar.sha1 @@ -0,0 +1 @@ +85d86a0e26c7f5c0db4ee63e8c7728e51c5d64ce \ No newline at end of file diff --git a/plugins/ingest-attachment/licenses/tika-core-1.15.jar.sha1 b/plugins/ingest-attachment/licenses/tika-core-1.15.jar.sha1 deleted file mode 100644 index cc764669b5b..00000000000 --- a/plugins/ingest-attachment/licenses/tika-core-1.15.jar.sha1 +++ /dev/null @@ -1 +0,0 @@ -17850c2224e4e3867e588060dc8ce6ba3bcfab2a \ No newline at end of file diff --git a/plugins/ingest-attachment/licenses/tika-core-1.17.jar.sha1 b/plugins/ingest-attachment/licenses/tika-core-1.17.jar.sha1 new file mode 100644 index 00000000000..571314b3378 --- /dev/null +++ b/plugins/ingest-attachment/licenses/tika-core-1.17.jar.sha1 @@ -0,0 +1 @@ +b450102c2aee98107474d2f92661d947b9cef183 \ No newline at end of file diff --git a/plugins/ingest-attachment/licenses/tika-parsers-1.15.jar.sha1 b/plugins/ingest-attachment/licenses/tika-parsers-1.15.jar.sha1 deleted file mode 100644 index cada2d9f3ac..00000000000 --- a/plugins/ingest-attachment/licenses/tika-parsers-1.15.jar.sha1 +++ /dev/null @@ -1 +0,0 @@ -aa07c2cda051709e5fe70fd6e244386fc93b0a1e \ No newline at end of file diff --git a/plugins/ingest-attachment/licenses/tika-parsers-1.17.jar.sha1 b/plugins/ingest-attachment/licenses/tika-parsers-1.17.jar.sha1 new file mode 100644 index 00000000000..c4487e4970f --- /dev/null +++ b/plugins/ingest-attachment/licenses/tika-parsers-1.17.jar.sha1 @@ -0,0 +1 @@ +4277c54fcaed542fbc8a0001fdb4c23baccc0132 \ No newline at end of file diff --git a/plugins/ingest-attachment/src/main/java/org/elasticsearch/ingest/attachment/AttachmentProcessor.java b/plugins/ingest-attachment/src/main/java/org/elasticsearch/ingest/attachment/AttachmentProcessor.java index f7f474711be..b23c627290e 100644 --- a/plugins/ingest-attachment/src/main/java/org/elasticsearch/ingest/attachment/AttachmentProcessor.java +++ b/plugins/ingest-attachment/src/main/java/org/elasticsearch/ingest/attachment/AttachmentProcessor.java @@ -19,6 +19,7 @@ package org.elasticsearch.ingest.attachment; +import org.apache.tika.exception.ZeroByteFileException; import org.apache.tika.language.LanguageIdentifier; import org.apache.tika.metadata.Metadata; import org.apache.tika.metadata.TikaCoreProperties; @@ -81,70 +82,74 @@ public final class AttachmentProcessor extends AbstractProcessor { throw new IllegalArgumentException("field [" + field + "] is null, cannot parse."); } + Metadata metadata = new Metadata(); + String parsedContent = ""; try { - Metadata metadata = new Metadata(); - String parsedContent = TikaImpl.parse(input, metadata, indexedChars); - - if (properties.contains(Property.CONTENT) && Strings.hasLength(parsedContent)) { - // somehow tika seems to append a newline at the end automatically, lets remove that again - additionalFields.put(Property.CONTENT.toLowerCase(), parsedContent.trim()); - } - - if (properties.contains(Property.LANGUAGE) && Strings.hasLength(parsedContent)) { - LanguageIdentifier identifier = new LanguageIdentifier(parsedContent); - String language = identifier.getLanguage(); - additionalFields.put(Property.LANGUAGE.toLowerCase(), language); - } - - if (properties.contains(Property.DATE)) { - String createdDate = metadata.get(TikaCoreProperties.CREATED); - if (createdDate != null) { - additionalFields.put(Property.DATE.toLowerCase(), createdDate); - } - } - - if (properties.contains(Property.TITLE)) { - String title = metadata.get(TikaCoreProperties.TITLE); - if (Strings.hasLength(title)) { - additionalFields.put(Property.TITLE.toLowerCase(), title); - } - } - - if (properties.contains(Property.AUTHOR)) { - String author = metadata.get("Author"); - if (Strings.hasLength(author)) { - additionalFields.put(Property.AUTHOR.toLowerCase(), author); - } - } - - if (properties.contains(Property.KEYWORDS)) { - String keywords = metadata.get("Keywords"); - if (Strings.hasLength(keywords)) { - additionalFields.put(Property.KEYWORDS.toLowerCase(), keywords); - } - } - - if (properties.contains(Property.CONTENT_TYPE)) { - String contentType = metadata.get(Metadata.CONTENT_TYPE); - if (Strings.hasLength(contentType)) { - additionalFields.put(Property.CONTENT_TYPE.toLowerCase(), contentType); - } - } - - if (properties.contains(Property.CONTENT_LENGTH)) { - String contentLength = metadata.get(Metadata.CONTENT_LENGTH); - long length; - if (Strings.hasLength(contentLength)) { - length = Long.parseLong(contentLength); - } else { - length = parsedContent.length(); - } - additionalFields.put(Property.CONTENT_LENGTH.toLowerCase(), length); - } + parsedContent = TikaImpl.parse(input, metadata, indexedChars); + } catch (ZeroByteFileException e) { + // tika 1.17 throws an exception when the InputStream has 0 bytes. + // previously, it did not mind. This is here to preserve that behavior. } catch (Exception e) { throw new ElasticsearchParseException("Error parsing document in field [{}]", e, field); } + if (properties.contains(Property.CONTENT) && Strings.hasLength(parsedContent)) { + // somehow tika seems to append a newline at the end automatically, lets remove that again + additionalFields.put(Property.CONTENT.toLowerCase(), parsedContent.trim()); + } + + if (properties.contains(Property.LANGUAGE) && Strings.hasLength(parsedContent)) { + LanguageIdentifier identifier = new LanguageIdentifier(parsedContent); + String language = identifier.getLanguage(); + additionalFields.put(Property.LANGUAGE.toLowerCase(), language); + } + + if (properties.contains(Property.DATE)) { + String createdDate = metadata.get(TikaCoreProperties.CREATED); + if (createdDate != null) { + additionalFields.put(Property.DATE.toLowerCase(), createdDate); + } + } + + if (properties.contains(Property.TITLE)) { + String title = metadata.get(TikaCoreProperties.TITLE); + if (Strings.hasLength(title)) { + additionalFields.put(Property.TITLE.toLowerCase(), title); + } + } + + if (properties.contains(Property.AUTHOR)) { + String author = metadata.get("Author"); + if (Strings.hasLength(author)) { + additionalFields.put(Property.AUTHOR.toLowerCase(), author); + } + } + + if (properties.contains(Property.KEYWORDS)) { + String keywords = metadata.get("Keywords"); + if (Strings.hasLength(keywords)) { + additionalFields.put(Property.KEYWORDS.toLowerCase(), keywords); + } + } + + if (properties.contains(Property.CONTENT_TYPE)) { + String contentType = metadata.get(Metadata.CONTENT_TYPE); + if (Strings.hasLength(contentType)) { + additionalFields.put(Property.CONTENT_TYPE.toLowerCase(), contentType); + } + } + + if (properties.contains(Property.CONTENT_LENGTH)) { + String contentLength = metadata.get(Metadata.CONTENT_LENGTH); + long length; + if (Strings.hasLength(contentLength)) { + length = Long.parseLong(contentLength); + } else { + length = parsedContent.length(); + } + additionalFields.put(Property.CONTENT_LENGTH.toLowerCase(), length); + } + ingestDocument.setFieldValue(targetField, additionalFields); }