NIFI-12366 Add HuggingFace support to Pinecone processors

Signed-off-by: Pierre Villard <pierre.villard.fr@gmail.com>

This closes #8026.
This commit is contained in:
krisztina-zsihovszki 2023-11-08 17:31:59 +01:00 committed by Pierre Villard
parent 9154b708cc
commit 2d82cdc0f5
No known key found for this signature in database
GPG Key ID: F92A93B30C07C6D5
3 changed files with 94 additions and 20 deletions

View File

@ -14,6 +14,9 @@
# limitations under the License.
from nifiapi.properties import PropertyDescriptor, StandardValidators, PropertyDependency, ExpressionLanguageScope
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.embeddings.huggingface import HuggingFaceInferenceAPIEmbeddings
# Embedding Functions
ONNX_ALL_MINI_LM_L6_V2 = "ONNX all-MiniLM-L6-v2 Model"
@ -99,13 +102,19 @@ SENTENCE_TRANSFORMER_MODEL_NAME = PropertyDescriptor(
)
SENTENCE_TRANSFORMER_DEVICE = PropertyDescriptor(
name="Sentence Transformer Device Type",
description="The type of device to use for performing the embeddings using the Sentence Transformer, such as 'cpu', 'cuda', 'mps', 'cuda:0', etc. If not specified, a GPU will be used if "
+ "possible, otherwise a CPU.",
description="""The type of device to use for performing the embeddings using the Sentence Transformer, such as 'cpu', 'cuda', 'mps', 'cuda:0', etc.
If not specified, a GPU will be used if possible, otherwise a CPU.""",
validators=[StandardValidators.NON_EMPTY_VALIDATOR],
required=False,
dependencies=[PropertyDependency(EMBEDDING_FUNCTION, SENTENCE_TRANSFORMERS)]
)
# Required selector for the embedding backend. The permitted values are the
# HUGGING_FACE and OPENAI constants, and OPENAI is the default.
EMBEDDING_MODEL = PropertyDescriptor(
    name="Embedding Model",
    description=(
        "Specifies which embedding model should be used in order to create embeddings from incoming Documents. "
        "Default model is OpenAI."
    ),
    required=True,
    default_value=OPENAI,
    allowable_values=[HUGGING_FACE, OPENAI],
)
PROPERTIES = [
EMBEDDING_FUNCTION,
HUGGING_FACE_MODEL_NAME,
@ -117,7 +126,8 @@ PROPERTIES = [
OPENAI_API_TYPE,
OPENAI_API_VERSION,
SENTENCE_TRANSFORMER_MODEL_NAME,
SENTENCE_TRANSFORMER_DEVICE
SENTENCE_TRANSFORMER_DEVICE,
EMBEDDING_MODEL
]
@ -145,3 +155,14 @@ def create_embedding_function(context):
model_name = context.getProperty(SENTENCE_TRANSFORMER_MODEL_NAME).getValue()
device = context.getProperty(SENTENCE_TRANSFORMER_DEVICE).getValue()
return SentenceTransformerEmbeddingFunction(model_name=model_name, device=device)
def create_embedding_service(context):
    """Build the LangChain embeddings object chosen by the EMBEDDING_MODEL property.

    Args:
        context: Processor context; must expose ``getProperty(descriptor).getValue()``.

    Returns:
        An ``OpenAIEmbeddings`` built with the OPENAI_API_KEY value when the
        EMBEDDING_MODEL value equals OPENAI; for any other value, a
        ``HuggingFaceInferenceAPIEmbeddings`` built with the
        HUGGING_FACE_API_KEY value.
    """
    # NOTE(review): only the API keys are read here; no model name is handed to
    # either constructor, so presumably each class uses its own default model.
    # Confirm whether the processors' model-name properties should be wired in.
    selected_model = context.getProperty(EMBEDDING_MODEL).getValue()

    # Anything that is not OPENAI is treated as the HuggingFace selection.
    if selected_model != OPENAI:
        hf_api_key = context.getProperty(HUGGING_FACE_API_KEY).getValue()
        return HuggingFaceInferenceAPIEmbeddings(api_key=hf_api_key)

    api_key = context.getProperty(OPENAI_API_KEY).getValue()
    return OpenAIEmbeddings(openai_api_key=api_key)

View File

@ -14,11 +14,11 @@
# limitations under the License.
from langchain.vectorstores import Pinecone
from langchain.embeddings.openai import OpenAIEmbeddings
from nifiapi.flowfiletransform import FlowFileTransform, FlowFileTransformResult
from nifiapi.properties import PropertyDescriptor, StandardValidators, ExpressionLanguageScope
from nifiapi.properties import PropertyDescriptor, StandardValidators, ExpressionLanguageScope, PropertyDependency
import pinecone
import json
from EmbeddingUtils import OPENAI, HUGGING_FACE, EMBEDDING_MODEL, create_embedding_service
class PutPinecone(FlowFileTransform):
@ -31,7 +31,6 @@ class PutPinecone(FlowFileTransform):
The text must be a string, while metadata must be a map with strings for values. Any additional fields will be ignored."""
tags = ["pinecone", "vector", "vectordb", "vectorstore", "embeddings", "ai", "artificial intelligence", "ml", "machine learning", "text", "LLM"]
PINECONE_API_KEY = PropertyDescriptor(
name="Pinecone API Key",
description="The API Key to use in order to authentication with Pinecone",
@ -39,12 +38,37 @@ class PutPinecone(FlowFileTransform):
required=True,
validators=[StandardValidators.NON_EMPTY_VALIDATOR]
)
# Sensitive, required HuggingFace credential. It is declared with a
# PropertyDependency on EMBEDDING_MODEL having the HUGGING_FACE value.
HUGGING_FACE_API_KEY = PropertyDescriptor(
    name="HuggingFace API Key",
    description="The API Key for interacting with HuggingFace",
    sensitive=True,
    required=True,
    dependencies=[PropertyDependency(EMBEDDING_MODEL, HUGGING_FACE)],
    validators=[StandardValidators.NON_EMPTY_VALIDATOR],
)
# Required HuggingFace model name with a preset default. It is declared with a
# PropertyDependency on EMBEDDING_MODEL having the HUGGING_FACE value.
HUGGING_FACE_MODEL = PropertyDescriptor(
    name="HuggingFace Model",
    description="The name of the HuggingFace model to use",
    default_value="sentence-transformers/all-MiniLM-L6-v2",
    required=True,
    dependencies=[PropertyDependency(EMBEDDING_MODEL, HUGGING_FACE)],
    validators=[StandardValidators.NON_EMPTY_VALIDATOR],
)
OPENAI_API_KEY = PropertyDescriptor(
name="OpenAI API Key",
description="The API Key for OpenAI in order to create embeddings",
sensitive=True,
required=True,
validators=[StandardValidators.NON_EMPTY_VALIDATOR]
validators=[StandardValidators.NON_EMPTY_VALIDATOR],
dependencies=[PropertyDependency(EMBEDDING_MODEL, OPENAI)]
)
# Required OpenAI model name with a preset default. It is declared with a
# PropertyDependency on EMBEDDING_MODEL having the OPENAI value.
# The description previously repeated the API-key text; it now describes the
# model name this property actually holds.
OPENAI_API_MODEL = PropertyDescriptor(
    name="OpenAI Model",
    description="The name of the OpenAI model to use in order to create embeddings",
    required=True,
    validators=[StandardValidators.NON_EMPTY_VALIDATOR],
    default_value="text-embedding-ada-002",
    dependencies=[PropertyDependency(EMBEDDING_MODEL, OPENAI)]
)
PINECONE_ENV = PropertyDescriptor(
name="Pinecone Environment",
@ -78,15 +102,19 @@ class PutPinecone(FlowFileTransform):
)
DOC_ID_FIELD_NAME = PropertyDescriptor(
name="Document ID Field Name",
description="Specifies the name of the field in the 'metadata' element of each document where the document's ID can be found. " +
"If not specified, an ID will be generated based on the FlowFile's filename and a one-up number.",
description="""Specifies the name of the field in the 'metadata' element of each document where the document's ID can be found.
If not specified, an ID will be generated based on the FlowFile's filename and a one-up number.""",
required=False,
validators=[StandardValidators.NON_EMPTY_VALIDATOR],
expression_language_scope=ExpressionLanguageScope.FLOWFILE_ATTRIBUTES
)
properties = [PINECONE_API_KEY,
EMBEDDING_MODEL,
OPENAI_API_KEY,
OPENAI_API_MODEL,
HUGGING_FACE_API_KEY,
HUGGING_FACE_MODEL,
PINECONE_ENV,
INDEX_NAME,
TEXT_KEY,
@ -110,9 +138,8 @@ class PutPinecone(FlowFileTransform):
api_key=api_key,
environment=pinecone_env,
)
openai_api_key = context.getProperty(self.OPENAI_API_KEY).getValue()
self.embeddings = OpenAIEmbeddings(openai_api_key=openai_api_key)
self.embeddings = create_embedding_service(context)
def transform(self, context, flowfile):
# First, check if our index already exists. If it doesn't, we create it
@ -158,4 +185,4 @@ class PutPinecone(FlowFileTransform):
text_key = context.getProperty(self.TEXT_KEY).evaluateAttributeExpressions().getValue()
vectorstore = Pinecone(index, self.embeddings.embed_query, text_key)
vectorstore.add_texts(texts=texts, metadatas=metadatas, ids=ids, namespace=namespace)
return FlowFileTransformResult(relationship = "success")
return FlowFileTransformResult(relationship="success")

View File

@ -14,11 +14,11 @@
# limitations under the License.
from langchain.vectorstores import Pinecone
from langchain.embeddings.openai import OpenAIEmbeddings
from nifiapi.flowfiletransform import FlowFileTransform, FlowFileTransformResult
from nifiapi.properties import PropertyDescriptor, StandardValidators, ExpressionLanguageScope
from nifiapi.properties import PropertyDescriptor, StandardValidators, ExpressionLanguageScope, PropertyDependency
import QueryUtils
import pinecone
from EmbeddingUtils import OPENAI, HUGGING_FACE, EMBEDDING_MODEL, create_embedding_service
class QueryPinecone(FlowFileTransform):
@ -30,7 +30,6 @@ class QueryPinecone(FlowFileTransform):
description = "Queries Pinecone in order to gather a specified number of documents that are most closely related to the given query."
tags = ["pinecone", "vector", "vectordb", "vectorstore", "embeddings", "ai", "artificial intelligence", "ml", "machine learning", "text", "LLM"]
PINECONE_API_KEY = PropertyDescriptor(
name="Pinecone API Key",
description="The API Key to use in order to authentication with Pinecone",
@ -43,7 +42,32 @@ class QueryPinecone(FlowFileTransform):
description="The API Key for OpenAI in order to create embeddings",
sensitive=True,
required=True,
validators=[StandardValidators.NON_EMPTY_VALIDATOR]
validators=[StandardValidators.NON_EMPTY_VALIDATOR],
dependencies=[PropertyDependency(EMBEDDING_MODEL, OPENAI)]
)
# Sensitive, required HuggingFace credential. It is declared with a
# PropertyDependency on EMBEDDING_MODEL having the HUGGING_FACE value.
HUGGING_FACE_API_KEY = PropertyDescriptor(
    name="HuggingFace API Key",
    description="The API Key for interacting with HuggingFace",
    sensitive=True,
    required=True,
    dependencies=[PropertyDependency(EMBEDDING_MODEL, HUGGING_FACE)],
    validators=[StandardValidators.NON_EMPTY_VALIDATOR],
)
# Required OpenAI model name with a preset default. It is declared with a
# PropertyDependency on EMBEDDING_MODEL having the OPENAI value.
# The description previously repeated the API-key text; it now describes the
# model name this property actually holds.
OPENAI_MODEL = PropertyDescriptor(
    name="OpenAI Model",
    description="The name of the OpenAI model to use in order to create embeddings",
    required=True,
    validators=[StandardValidators.NON_EMPTY_VALIDATOR],
    default_value="text-embedding-ada-002",
    dependencies=[PropertyDependency(EMBEDDING_MODEL, OPENAI)]
)
# Required HuggingFace model name with a preset default. It is declared with a
# PropertyDependency on EMBEDDING_MODEL having the HUGGING_FACE value.
HUGGING_FACE_MODEL = PropertyDescriptor(
    name="HuggingFace Model",
    description="The name of the HuggingFace model to use",
    default_value="sentence-transformers/all-MiniLM-L6-v2",
    required=True,
    dependencies=[PropertyDependency(EMBEDDING_MODEL, HUGGING_FACE)],
    validators=[StandardValidators.NON_EMPTY_VALIDATOR],
)
PINECONE_ENV = PropertyDescriptor(
name="Pinecone Environment",
@ -91,9 +115,12 @@ class QueryPinecone(FlowFileTransform):
expression_language_scope=ExpressionLanguageScope.FLOWFILE_ATTRIBUTES
)
properties = [PINECONE_API_KEY,
EMBEDDING_MODEL,
OPENAI_API_KEY,
OPENAI_MODEL,
HUGGING_FACE_API_KEY,
HUGGING_FACE_MODEL,
PINECONE_ENV,
INDEX_NAME,
QUERY,
@ -123,8 +150,7 @@ class QueryPinecone(FlowFileTransform):
api_key=api_key,
environment=pinecone_env,
)
openai_api_key = context.getProperty(self.OPENAI_API_KEY).getValue()
self.embeddings = OpenAIEmbeddings(openai_api_key=openai_api_key)
self.embeddings = create_embedding_service(context)
self.query_utils = QueryUtils.QueryUtils(context)