2021-04-09 15:28:18 -04:00
|
|
|
/*
|
|
|
|
* SPDX-License-Identifier: Apache-2.0
|
|
|
|
*
|
|
|
|
* The OpenSearch Contributors require contributions made to
|
|
|
|
* this file be licensed under the Apache-2.0 license or a
|
|
|
|
* compatible open source license.
|
|
|
|
*
|
|
|
|
* Modifications Copyright OpenSearch Contributors. See
|
|
|
|
* GitHub history for details.
|
|
|
|
*/
|
2019-11-01 14:33:11 -04:00
|
|
|
|
2016-02-09 08:57:05 -05:00
|
|
|
/*
|
|
|
|
* Licensed to Elasticsearch under one or more contributor
|
|
|
|
* license agreements. See the NOTICE file distributed with
|
|
|
|
* this work for additional information regarding copyright
|
|
|
|
* ownership. Elasticsearch licenses this file to you under
|
|
|
|
* the Apache License, Version 2.0 (the "License"); you may
|
|
|
|
* not use this file except in compliance with the License.
|
|
|
|
* You may obtain a copy of the License at
|
|
|
|
*
|
|
|
|
* http://www.apache.org/licenses/LICENSE-2.0
|
|
|
|
*
|
|
|
|
* Unless required by applicable law or agreed to in writing,
|
|
|
|
* software distributed under the License is distributed on an
|
|
|
|
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
|
|
|
* KIND, either express or implied. See the License for the
|
|
|
|
* specific language governing permissions and limitations
|
|
|
|
* under the License.
|
|
|
|
*/
|
2021-04-09 15:28:18 -04:00
|
|
|
|
|
|
|
import org.opensearch.gradle.info.BuildParams
|
|
|
|
|
2021-03-11 00:06:21 -05:00
|
|
|
// Apply the 'opensearch.yaml-rest-test' build plugin to this project.
apply(plugin: 'opensearch.yaml-rest-test')

// Descriptor values for this plugin: a human-readable description and the plugin class name.
opensearchplugin {
  description = 'Ingest processor that uses Apache Tika to extract contents'
  classname = 'org.opensearch.ingest.attachment.IngestAttachmentPlugin'
}
|
|
|
|
|
|
|
|
// Library versions specific to this plugin, merged into the existing `versions` map so the
// dependency declarations can refer to them as ${versions.tika}, ${versions.pdfbox}, etc.
versions << [
  tika  : '2.4.0',
  pdfbox: '2.0.25',
  poi   : '5.2.2',
  mime4j: '0.8.3'
]
|
|
|
|
|
|
|
|
dependencies {
  // --- Tika itself (mandatory) ---
  api("org.apache.tika:tika-core:${versions.tika}")
  api("org.apache.tika:tika-parsers:${versions.tika}")
  // Tika 2.x needs an explicit dependency on the standard package to get the parser implementations
  api("org.apache.tika:tika-parsers-standard-package:${versions.tika}")
  // Language detection
  api("org.apache.tika:tika-langdetect-optimaize:${versions.tika}")

  // --- Optimaize language detector and its dependencies ---
  runtimeOnly("com.optimaize.languagedetector:language-detector:0.6")
  // Guava 18.0 is the version that Optimaize 0.6 depends on
  runtimeOnly('com.google.guava:guava:18.0')

  // --- Other dependencies ---
  api('org.tukaani:xz:1.8')
  api('commons-io:commons-io:2.11.0')
  api("org.slf4j:slf4j-api:${versions.slf4j}")

  // --- Character set detection ---
  api('com.googlecode.juniversalchardet:juniversalchardet:1.0.3')

  // --- External parser libraries ---
  // HTML
  api('org.ccil.cowan.tagsoup:tagsoup:1.2.1')

  // Adobe PDF
  api("org.apache.pdfbox:pdfbox:${versions.pdfbox}")
  api("org.apache.pdfbox:fontbox:${versions.pdfbox}")
  api("org.apache.pdfbox:jempbox:1.8.16")
  api("commons-logging:commons-logging:${versions.commonslogging}")
  api("org.bouncycastle:bcmail-jdk15on:${versions.bouncycastle}")
  api("org.bouncycastle:bcprov-jdk15on:${versions.bouncycastle}")
  api("org.bouncycastle:bcpkix-jdk15on:${versions.bouncycastle}")

  // OpenOffice
  api("org.apache.poi:poi-ooxml:${versions.poi}")
  api("org.apache.poi:poi:${versions.poi}")
  api("org.apache.poi:poi-ooxml-lite:${versions.poi}")
  api("commons-codec:commons-codec:${versions.commonscodec}")
  api('org.apache.xmlbeans:xmlbeans:5.0.3')
  api('org.apache.commons:commons-collections4:4.1')

  // MS Office
  api("org.apache.poi:poi-scratchpad:${versions.poi}")

  // Apple iWork
  api('org.apache.commons:commons-compress:1.21')

  // Outlook documents
  api("org.apache.james:apache-mime4j-core:${versions.mime4j}")
  api("org.apache.james:apache-mime4j-dom:${versions.mime4j}")

  // EPUB books
  api('org.apache.commons:commons-lang3:3.12.0')

  // Microsoft Word files with Visio diagrams
  api('org.apache.commons:commons-math3:3.6.1')

  // Dependency of POI
  api('com.zaxxer:SparseBitSet:1.2')
}
|
|
|
|
|
2020-03-19 13:28:59 -04:00
|
|
|
// Core REST API groups included through `includeCore` for this project's REST resources.
restResources {
  restApi {
    includeCore('_common', 'cluster', 'nodes', 'ingest', 'index', 'get')
  }
}
|
|
|
|
|
2020-06-18 02:15:50 -04:00
|
|
|
// License check: map whole artifact families onto a single license name so that each family
// does not need its own per-artifact LICENSE/NOTICE files.
tasks.named("dependencyLicenses").configure {
  mapping(from: /apache-mime4j-.*/, to: 'apache-mime4j')
  // The tika-.* pattern also covers tika-core and tika-langdetect-optimaize
  mapping(from: /tika-.*/, to: 'tika-parsers')
  mapping(from: /poi-ooxml-.*/, to: 'poi-ooxml')
}
|
|
|
|
|
2016-02-09 08:57:05 -05:00
|
|
|
// Exclude document files of these types from the forbidden-patterns check.
forbiddenPatterns {
  ['**/*.doc', '**/*.docx', '**/*.pdf', '**/*.epub', '**/*.vsdx'].each { pattern ->
    exclude(pattern)
  }
}
|
|
|
|
|
2019-11-14 06:01:23 -05:00
|
|
|
thirdPartyAudit {
  ignoreMissingClasses()

  // Guava classes whose audit violations are ignored. Guava is pinned to 18.0 (the version
  // Optimaize 0.6 depends on), which keeps this list short.
  def ignoredGuavaClasses = [
    'com.google.common.cache.Striped64',
    'com.google.common.cache.Striped64$1',
    'com.google.common.cache.Striped64$Cell',
    'com.google.common.primitives.UnsignedBytes$LexicographicalComparatorHolder$UnsafeComparator',
    'com.google.common.primitives.UnsignedBytes$LexicographicalComparatorHolder$UnsafeComparator$1'
  ]
  ignoreViolations(*ignoredGuavaClasses)
}
|
2018-08-29 10:43:40 -04:00
|
|
|
|
2020-02-10 03:57:03 -05:00
|
|
|
if (BuildParams.inFipsJvm) {
  // FIPS JVM includes many classes from bouncycastle which count as jar hell for the third party audit.
  // Rather than provide a long list of exclusions, disable the jar hell check on FIPS, along with
  // this project's unit tests, YAML REST tests and testing-conventions check.
  jarHell.enabled = false
  test.enabled = false
  yamlRestTest.enabled = false
  testingConventions.enabled = false
}
|