From 2d82cdc0f5ccdd7ee6e98ef1e90b9ceee84c1085 Mon Sep 17 00:00:00 2001 From: krisztina-zsihovszki Date: Wed, 8 Nov 2023 17:31:59 +0100 Subject: [PATCH] NIFI-12366 Add HuggingFace support to Pinecone processors Signed-off-by: Pierre Villard This closes #8026. --- .../python/vectorstores/EmbeddingUtils.py | 29 ++++++++++-- .../main/python/vectorstores/PutPinecone.py | 45 +++++++++++++++---- .../main/python/vectorstores/QueryPinecone.py | 40 ++++++++++++++--- 3 files changed, 94 insertions(+), 20 deletions(-) diff --git a/nifi-python-extensions/nifi-text-embeddings-module/src/main/python/vectorstores/EmbeddingUtils.py b/nifi-python-extensions/nifi-text-embeddings-module/src/main/python/vectorstores/EmbeddingUtils.py index b305942da8..9b0218c9c0 100644 --- a/nifi-python-extensions/nifi-text-embeddings-module/src/main/python/vectorstores/EmbeddingUtils.py +++ b/nifi-python-extensions/nifi-text-embeddings-module/src/main/python/vectorstores/EmbeddingUtils.py @@ -14,6 +14,9 @@ # limitations under the License. from nifiapi.properties import PropertyDescriptor, StandardValidators, PropertyDependency, ExpressionLanguageScope +from langchain.embeddings.openai import OpenAIEmbeddings +from langchain.embeddings.huggingface import HuggingFaceInferenceAPIEmbeddings + # Embedding Functions ONNX_ALL_MINI_LM_L6_V2 = "ONNX all-MiniLM-L6-v2 Model" @@ -99,13 +102,19 @@ SENTENCE_TRANSFORMER_MODEL_NAME = PropertyDescriptor( ) SENTENCE_TRANSFORMER_DEVICE = PropertyDescriptor( name="Sentence Transformer Device Type", - description="The type of device to use for performing the embeddings using the Sentence Transformer, such as 'cpu', 'cuda', 'mps', 'cuda:0', etc. If not specified, a GPU will be used if " - + "possible, otherwise a CPU.", + description="""The type of device to use for performing the embeddings using the Sentence Transformer, such as 'cpu', 'cuda', 'mps', 'cuda:0', etc. 
+ If not specified, a GPU will be used if possible, otherwise a CPU.""", validators=[StandardValidators.NON_EMPTY_VALIDATOR], required=False, dependencies=[PropertyDependency(EMBEDDING_FUNCTION, SENTENCE_TRANSFORMERS)] ) - +EMBEDDING_MODEL = PropertyDescriptor( + name="Embedding Model", + description="Specifies which embedding model should be used in order to create embeddings from incoming Documents. Default model is OpenAI.", + allowable_values=[HUGGING_FACE, OPENAI], + default_value=OPENAI, + required=True +) PROPERTIES = [ EMBEDDING_FUNCTION, HUGGING_FACE_MODEL_NAME, @@ -117,7 +126,8 @@ PROPERTIES = [ OPENAI_API_TYPE, OPENAI_API_VERSION, SENTENCE_TRANSFORMER_MODEL_NAME, - SENTENCE_TRANSFORMER_DEVICE + SENTENCE_TRANSFORMER_DEVICE, + EMBEDDING_MODEL ] @@ -145,3 +155,14 @@ def create_embedding_function(context): model_name = context.getProperty(SENTENCE_TRANSFORMER_MODEL_NAME).getValue() device = context.getProperty(SENTENCE_TRANSFORMER_DEVICE).getValue() return SentenceTransformerEmbeddingFunction(model_name=model_name, device=device) + + +def create_embedding_service(context): + embedding_service = context.getProperty(EMBEDDING_MODEL).getValue() + + if embedding_service == OPENAI: + openai_api_key = context.getProperty(OPENAI_API_KEY).getValue() + return OpenAIEmbeddings(openai_api_key=openai_api_key) + else: + huggingface_api_key = context.getProperty(HUGGING_FACE_API_KEY).getValue() + return HuggingFaceInferenceAPIEmbeddings(api_key=huggingface_api_key) diff --git a/nifi-python-extensions/nifi-text-embeddings-module/src/main/python/vectorstores/PutPinecone.py b/nifi-python-extensions/nifi-text-embeddings-module/src/main/python/vectorstores/PutPinecone.py index 42f51e0102..409f2b9279 100644 --- a/nifi-python-extensions/nifi-text-embeddings-module/src/main/python/vectorstores/PutPinecone.py +++ b/nifi-python-extensions/nifi-text-embeddings-module/src/main/python/vectorstores/PutPinecone.py @@ -14,11 +14,11 @@ # limitations under the License. 
from langchain.vectorstores import Pinecone -from langchain.embeddings.openai import OpenAIEmbeddings from nifiapi.flowfiletransform import FlowFileTransform, FlowFileTransformResult -from nifiapi.properties import PropertyDescriptor, StandardValidators, ExpressionLanguageScope +from nifiapi.properties import PropertyDescriptor, StandardValidators, ExpressionLanguageScope, PropertyDependency import pinecone import json +from EmbeddingUtils import OPENAI, HUGGING_FACE, EMBEDDING_MODEL, create_embedding_service class PutPinecone(FlowFileTransform): @@ -31,7 +31,6 @@ class PutPinecone(FlowFileTransform): The text must be a string, while metadata must be a map with strings for values. Any additional fields will be ignored.""" tags = ["pinecone", "vector", "vectordb", "vectorstore", "embeddings", "ai", "artificial intelligence", "ml", "machine learning", "text", "LLM"] - PINECONE_API_KEY = PropertyDescriptor( name="Pinecone API Key", description="The API Key to use in order to authentication with Pinecone", @@ -39,12 +38,37 @@ class PutPinecone(FlowFileTransform): required=True, validators=[StandardValidators.NON_EMPTY_VALIDATOR] ) + HUGGING_FACE_API_KEY = PropertyDescriptor( + name="HuggingFace API Key", + description="The API Key for interacting with HuggingFace", + validators=[StandardValidators.NON_EMPTY_VALIDATOR], + required=True, + sensitive=True, + dependencies=[PropertyDependency(EMBEDDING_MODEL, HUGGING_FACE)] + ) + HUGGING_FACE_MODEL = PropertyDescriptor( + name="HuggingFace Model", + description="The name of the HuggingFace model to use", + validators=[StandardValidators.NON_EMPTY_VALIDATOR], + required=True, + default_value="sentence-transformers/all-MiniLM-L6-v2", + dependencies=[PropertyDependency(EMBEDDING_MODEL, HUGGING_FACE)] + ) OPENAI_API_KEY = PropertyDescriptor( name="OpenAI API Key", description="The API Key for OpenAI in order to create embeddings", sensitive=True, required=True, - validators=[StandardValidators.NON_EMPTY_VALIDATOR] + 
validators=[StandardValidators.NON_EMPTY_VALIDATOR], + dependencies=[PropertyDependency(EMBEDDING_MODEL, OPENAI)] + ) + OPENAI_API_MODEL = PropertyDescriptor( + name="OpenAI Model", + description="The name of the OpenAI model to use in order to create embeddings", + required=True, + validators=[StandardValidators.NON_EMPTY_VALIDATOR], + default_value="text-embedding-ada-002", + dependencies=[PropertyDependency(EMBEDDING_MODEL, OPENAI)] ) PINECONE_ENV = PropertyDescriptor( name="Pinecone Environment", @@ -78,15 +102,19 @@ class PutPinecone(FlowFileTransform): ) DOC_ID_FIELD_NAME = PropertyDescriptor( name="Document ID Field Name", - description="Specifies the name of the field in the 'metadata' element of each document where the document's ID can be found. " + - "If not specified, an ID will be generated based on the FlowFile's filename and a one-up number.", + description="""Specifies the name of the field in the 'metadata' element of each document where the document's ID can be found. + If not specified, an ID will be generated based on the FlowFile's filename and a one-up number.""", required=False, validators=[StandardValidators.NON_EMPTY_VALIDATOR], expression_language_scope=ExpressionLanguageScope.FLOWFILE_ATTRIBUTES ) properties = [PINECONE_API_KEY, + EMBEDDING_MODEL, OPENAI_API_KEY, + OPENAI_API_MODEL, + HUGGING_FACE_API_KEY, + HUGGING_FACE_MODEL, PINECONE_ENV, INDEX_NAME, TEXT_KEY, @@ -110,9 +138,8 @@ class PutPinecone(FlowFileTransform): api_key=api_key, environment=pinecone_env, ) - openai_api_key = context.getProperty(self.OPENAI_API_KEY).getValue() - self.embeddings = OpenAIEmbeddings(openai_api_key=openai_api_key) + self.embeddings = create_embedding_service(context) def transform(self, context, flowfile): # First, check if our index already exists. 
If it doesn't, we create it @@ -158,4 +185,4 @@ class PutPinecone(FlowFileTransform): text_key = context.getProperty(self.TEXT_KEY).evaluateAttributeExpressions().getValue() vectorstore = Pinecone(index, self.embeddings.embed_query, text_key) vectorstore.add_texts(texts=texts, metadatas=metadatas, ids=ids, namespace=namespace) - return FlowFileTransformResult(relationship = "success") + return FlowFileTransformResult(relationship="success") diff --git a/nifi-python-extensions/nifi-text-embeddings-module/src/main/python/vectorstores/QueryPinecone.py b/nifi-python-extensions/nifi-text-embeddings-module/src/main/python/vectorstores/QueryPinecone.py index c0521d1bc9..e12b7a6e77 100644 --- a/nifi-python-extensions/nifi-text-embeddings-module/src/main/python/vectorstores/QueryPinecone.py +++ b/nifi-python-extensions/nifi-text-embeddings-module/src/main/python/vectorstores/QueryPinecone.py @@ -14,11 +14,11 @@ # limitations under the License. from langchain.vectorstores import Pinecone -from langchain.embeddings.openai import OpenAIEmbeddings from nifiapi.flowfiletransform import FlowFileTransform, FlowFileTransformResult -from nifiapi.properties import PropertyDescriptor, StandardValidators, ExpressionLanguageScope +from nifiapi.properties import PropertyDescriptor, StandardValidators, ExpressionLanguageScope, PropertyDependency import QueryUtils import pinecone +from EmbeddingUtils import OPENAI, HUGGING_FACE, EMBEDDING_MODEL, create_embedding_service class QueryPinecone(FlowFileTransform): @@ -30,7 +30,6 @@ class QueryPinecone(FlowFileTransform): description = "Queries Pinecone in order to gather a specified number of documents that are most closely related to the given query." 
tags = ["pinecone", "vector", "vectordb", "vectorstore", "embeddings", "ai", "artificial intelligence", "ml", "machine learning", "text", "LLM"] - PINECONE_API_KEY = PropertyDescriptor( name="Pinecone API Key", description="The API Key to use in order to authentication with Pinecone", @@ -43,7 +42,32 @@ class QueryPinecone(FlowFileTransform): description="The API Key for OpenAI in order to create embeddings", sensitive=True, required=True, - validators=[StandardValidators.NON_EMPTY_VALIDATOR] + validators=[StandardValidators.NON_EMPTY_VALIDATOR], + dependencies=[PropertyDependency(EMBEDDING_MODEL, OPENAI)] + ) + HUGGING_FACE_API_KEY = PropertyDescriptor( + name="HuggingFace API Key", + description="The API Key for interacting with HuggingFace", + validators=[StandardValidators.NON_EMPTY_VALIDATOR], + required=True, + sensitive=True, + dependencies=[PropertyDependency(EMBEDDING_MODEL, HUGGING_FACE)] + ) + OPENAI_MODEL = PropertyDescriptor( + name="OpenAI Model", + description="The name of the OpenAI model to use in order to create embeddings", + required=True, + validators=[StandardValidators.NON_EMPTY_VALIDATOR], + default_value="text-embedding-ada-002", + dependencies=[PropertyDependency(EMBEDDING_MODEL, OPENAI)] + ) + HUGGING_FACE_MODEL = PropertyDescriptor( + name="HuggingFace Model", + description="The name of the HuggingFace model to use", + validators=[StandardValidators.NON_EMPTY_VALIDATOR], + required=True, + default_value="sentence-transformers/all-MiniLM-L6-v2", + dependencies=[PropertyDependency(EMBEDDING_MODEL, HUGGING_FACE)] ) PINECONE_ENV = PropertyDescriptor( name="Pinecone Environment", @@ -91,9 +115,12 @@ class QueryPinecone(FlowFileTransform): expression_language_scope=ExpressionLanguageScope.FLOWFILE_ATTRIBUTES ) - properties = [PINECONE_API_KEY, + EMBEDDING_MODEL, OPENAI_API_KEY, + OPENAI_MODEL, + HUGGING_FACE_API_KEY, + HUGGING_FACE_MODEL, PINECONE_ENV, INDEX_NAME, QUERY, @@ -123,8 +150,7 @@ class QueryPinecone(FlowFileTransform): api_key=api_key, 
environment=pinecone_env, ) - openai_api_key = context.getProperty(self.OPENAI_API_KEY).getValue() - self.embeddings = OpenAIEmbeddings(openai_api_key=openai_api_key) + self.embeddings = create_embedding_service(context) self.query_utils = QueryUtils.QueryUtils(context)