NIFI-12619 Fixed Python dependencies in ParseDocument

This closes #8255

Signed-off-by: David Handermann <exceptionfactory@apache.org>
2024-01-16 23:03:39 +04:00 · 2024-01-16 23:03:39 +04:00 · 6e9da11be9
parent 787c45dd61
commit 6e9da11be9
1 changed files with 2 additions and 2 deletions
--- a/nifi-python-extensions/nifi-text-embeddings-module/src/main/python/ParseDocument.py
+++ b/nifi-python-extensions/nifi-text-embeddings-module/src/main/python/ParseDocument.py
@@ -51,8 +51,8 @@ class ParseDocument(FlowFileTransform):
            Note that use of this Processor may require significant storage space and RAM utilization due to third-party dependencies necessary for processing PDF and image files.
            Also note that in order to process PDF or Images, Tesseract and Poppler must be installed on the system."""
        tags = ["text", "embeddings", "vector", "machine learning", "ML", "artificial intelligence", "ai", "document", "langchain", "pdf", "html", "markdown", "word", "excel", "powerpoint"]
-        dependencies = ['langchain', 'unstructured', 'unstructured-inference', 'unstructured_pytesseract', 'numpy',
-                        'opencv-python', 'pdf2image', 'pdfminer.six[image]', 'python-docx', 'openpyxl', 'python-pptx']
+        dependencies = ['pikepdf', 'pypdf', 'langchain', 'unstructured', 'unstructured-inference', 'unstructured_pytesseract', 'numpy',
+                        'opencv-python', 'pdf2image', 'pdfminer.six', 'python-docx', 'openpyxl', 'python-pptx']


    INPUT_FORMAT = PropertyDescriptor(