mirror of https://github.com/apache/nifi.git
NIFI-12366 Add HuggingFace support to Pinecone processors
Signed-off-by: Pierre Villard <pierre.villard.fr@gmail.com> This closes #8026.
This commit is contained in:
parent
9154b708cc
commit
2d82cdc0f5
|
@ -14,6 +14,9 @@
|
|||
# limitations under the License.
|
||||
|
||||
from nifiapi.properties import PropertyDescriptor, StandardValidators, PropertyDependency, ExpressionLanguageScope
|
||||
from langchain.embeddings.openai import OpenAIEmbeddings
|
||||
from langchain.embeddings.huggingface import HuggingFaceInferenceAPIEmbeddings
|
||||
|
||||
|
||||
# Embedding Functions
|
||||
ONNX_ALL_MINI_LM_L6_V2 = "ONNX all-MiniLM-L6-v2 Model"
|
||||
|
@ -99,13 +102,19 @@ SENTENCE_TRANSFORMER_MODEL_NAME = PropertyDescriptor(
|
|||
)
|
||||
SENTENCE_TRANSFORMER_DEVICE = PropertyDescriptor(
|
||||
name="Sentence Transformer Device Type",
|
||||
description="The type of device to use for performing the embeddings using the Sentence Transformer, such as 'cpu', 'cuda', 'mps', 'cuda:0', etc. If not specified, a GPU will be used if "
|
||||
+ "possible, otherwise a CPU.",
|
||||
description="""The type of device to use for performing the embeddings using the Sentence Transformer, such as 'cpu', 'cuda', 'mps', 'cuda:0', etc.
|
||||
If not specified, a GPU will be used if possible, otherwise a CPU.""",
|
||||
validators=[StandardValidators.NON_EMPTY_VALIDATOR],
|
||||
required=False,
|
||||
dependencies=[PropertyDependency(EMBEDDING_FUNCTION, SENTENCE_TRANSFORMERS)]
|
||||
)
|
||||
|
||||
# Selector for the embedding backend; other descriptors reference it via
# PropertyDependency so that only the matching backend's settings apply.
EMBEDDING_MODEL = PropertyDescriptor(
    name="Embedding Model",
    description="Specifies which embedding model should be used in order to create embeddings from incoming Documents. Default model is OpenAI.",
    required=True,
    default_value=OPENAI,
    allowable_values=[HUGGING_FACE, OPENAI]
)
|
||||
PROPERTIES = [
|
||||
EMBEDDING_FUNCTION,
|
||||
HUGGING_FACE_MODEL_NAME,
|
||||
|
@ -117,7 +126,8 @@ PROPERTIES = [
|
|||
OPENAI_API_TYPE,
|
||||
OPENAI_API_VERSION,
|
||||
SENTENCE_TRANSFORMER_MODEL_NAME,
|
||||
SENTENCE_TRANSFORMER_DEVICE
|
||||
SENTENCE_TRANSFORMER_DEVICE,
|
||||
EMBEDDING_MODEL
|
||||
]
|
||||
|
||||
|
||||
|
@ -145,3 +155,14 @@ def create_embedding_function(context):
|
|||
model_name = context.getProperty(SENTENCE_TRANSFORMER_MODEL_NAME).getValue()
|
||||
device = context.getProperty(SENTENCE_TRANSFORMER_DEVICE).getValue()
|
||||
return SentenceTransformerEmbeddingFunction(model_name=model_name, device=device)
|
||||
|
||||
|
||||
def create_embedding_service(context):
    """Create the embeddings client chosen by the EMBEDDING_MODEL property.

    :param context: the context from which property values are read
    :return: an OpenAIEmbeddings instance when the selected value equals
        OPENAI; otherwise a HuggingFaceInferenceAPIEmbeddings instance
    """
    selected_model = context.getProperty(EMBEDDING_MODEL).getValue()

    # Every value other than OPENAI is routed to the HuggingFace client.
    if selected_model != OPENAI:
        hf_key = context.getProperty(HUGGING_FACE_API_KEY).getValue()
        return HuggingFaceInferenceAPIEmbeddings(api_key=hf_key)

    openai_key = context.getProperty(OPENAI_API_KEY).getValue()
    return OpenAIEmbeddings(openai_api_key=openai_key)
|
||||
|
|
|
@ -14,11 +14,11 @@
|
|||
# limitations under the License.
|
||||
|
||||
from langchain.vectorstores import Pinecone
|
||||
from langchain.embeddings.openai import OpenAIEmbeddings
|
||||
from nifiapi.flowfiletransform import FlowFileTransform, FlowFileTransformResult
|
||||
from nifiapi.properties import PropertyDescriptor, StandardValidators, ExpressionLanguageScope
|
||||
from nifiapi.properties import PropertyDescriptor, StandardValidators, ExpressionLanguageScope, PropertyDependency
|
||||
import pinecone
|
||||
import json
|
||||
from EmbeddingUtils import OPENAI, HUGGING_FACE, EMBEDDING_MODEL, create_embedding_service
|
||||
|
||||
|
||||
class PutPinecone(FlowFileTransform):
|
||||
|
@ -31,7 +31,6 @@ class PutPinecone(FlowFileTransform):
|
|||
The text must be a string, while metadata must be a map with strings for values. Any additional fields will be ignored."""
|
||||
tags = ["pinecone", "vector", "vectordb", "vectorstore", "embeddings", "ai", "artificial intelligence", "ml", "machine learning", "text", "LLM"]
|
||||
|
||||
|
||||
PINECONE_API_KEY = PropertyDescriptor(
|
||||
name="Pinecone API Key",
|
||||
description="The API Key to use in order to authentication with Pinecone",
|
||||
|
@ -39,12 +38,37 @@ class PutPinecone(FlowFileTransform):
|
|||
required=True,
|
||||
validators=[StandardValidators.NON_EMPTY_VALIDATOR]
|
||||
)
|
||||
# Credential for HuggingFace; flagged sensitive and tied to the
# HUGGING_FACE choice of EMBEDDING_MODEL through a property dependency.
HUGGING_FACE_API_KEY = PropertyDescriptor(
    name="HuggingFace API Key",
    description="The API Key for interacting with HuggingFace",
    sensitive=True,
    required=True,
    validators=[StandardValidators.NON_EMPTY_VALIDATOR],
    dependencies=[PropertyDependency(EMBEDDING_MODEL, HUGGING_FACE)]
)
|
||||
# HuggingFace model name; tied to the HUGGING_FACE choice of
# EMBEDDING_MODEL through a property dependency.
HUGGING_FACE_MODEL = PropertyDescriptor(
    name="HuggingFace Model",
    description="The name of the HuggingFace model to use",
    default_value="sentence-transformers/all-MiniLM-L6-v2",
    required=True,
    validators=[StandardValidators.NON_EMPTY_VALIDATOR],
    dependencies=[PropertyDependency(EMBEDDING_MODEL, HUGGING_FACE)]
)
|
||||
OPENAI_API_KEY = PropertyDescriptor(
|
||||
name="OpenAI API Key",
|
||||
description="The API Key for OpenAI in order to create embeddings",
|
||||
sensitive=True,
|
||||
required=True,
|
||||
validators=[StandardValidators.NON_EMPTY_VALIDATOR]
|
||||
validators=[StandardValidators.NON_EMPTY_VALIDATOR],
|
||||
dependencies=[PropertyDependency(EMBEDDING_MODEL, OPENAI)]
|
||||
)
|
||||
# OpenAI model name; tied to the OPENAI choice of EMBEDDING_MODEL through a
# property dependency. The description previously duplicated the API-key
# text, which mislabelled this property for users.
OPENAI_API_MODEL = PropertyDescriptor(
    name="OpenAI Model",
    description="The name of the OpenAI model to use in order to create embeddings",
    required=True,
    validators=[StandardValidators.NON_EMPTY_VALIDATOR],
    default_value="text-embedding-ada-002",
    dependencies=[PropertyDependency(EMBEDDING_MODEL, OPENAI)]
)
|
||||
PINECONE_ENV = PropertyDescriptor(
|
||||
name="Pinecone Environment",
|
||||
|
@ -78,15 +102,19 @@ class PutPinecone(FlowFileTransform):
|
|||
)
|
||||
DOC_ID_FIELD_NAME = PropertyDescriptor(
|
||||
name="Document ID Field Name",
|
||||
description="Specifies the name of the field in the 'metadata' element of each document where the document's ID can be found. " +
|
||||
"If not specified, an ID will be generated based on the FlowFile's filename and a one-up number.",
|
||||
description="""Specifies the name of the field in the 'metadata' element of each document where the document's ID can be found.
|
||||
If not specified, an ID will be generated based on the FlowFile's filename and a one-up number.""",
|
||||
required=False,
|
||||
validators=[StandardValidators.NON_EMPTY_VALIDATOR],
|
||||
expression_language_scope=ExpressionLanguageScope.FLOWFILE_ATTRIBUTES
|
||||
)
|
||||
|
||||
properties = [PINECONE_API_KEY,
|
||||
EMBEDDING_MODEL,
|
||||
OPENAI_API_KEY,
|
||||
OPENAI_API_MODEL,
|
||||
HUGGING_FACE_API_KEY,
|
||||
HUGGING_FACE_MODEL,
|
||||
PINECONE_ENV,
|
||||
INDEX_NAME,
|
||||
TEXT_KEY,
|
||||
|
@ -110,9 +138,8 @@ class PutPinecone(FlowFileTransform):
|
|||
api_key=api_key,
|
||||
environment=pinecone_env,
|
||||
)
|
||||
openai_api_key = context.getProperty(self.OPENAI_API_KEY).getValue()
|
||||
self.embeddings = OpenAIEmbeddings(openai_api_key=openai_api_key)
|
||||
|
||||
self.embeddings = create_embedding_service(context)
|
||||
|
||||
def transform(self, context, flowfile):
|
||||
# First, check if our index already exists. If it doesn't, we create it
|
||||
|
@ -158,4 +185,4 @@ class PutPinecone(FlowFileTransform):
|
|||
text_key = context.getProperty(self.TEXT_KEY).evaluateAttributeExpressions().getValue()
|
||||
vectorstore = Pinecone(index, self.embeddings.embed_query, text_key)
|
||||
vectorstore.add_texts(texts=texts, metadatas=metadatas, ids=ids, namespace=namespace)
|
||||
return FlowFileTransformResult(relationship = "success")
|
||||
return FlowFileTransformResult(relationship="success")
|
||||
|
|
|
@ -14,11 +14,11 @@
|
|||
# limitations under the License.
|
||||
|
||||
from langchain.vectorstores import Pinecone
|
||||
from langchain.embeddings.openai import OpenAIEmbeddings
|
||||
from nifiapi.flowfiletransform import FlowFileTransform, FlowFileTransformResult
|
||||
from nifiapi.properties import PropertyDescriptor, StandardValidators, ExpressionLanguageScope
|
||||
from nifiapi.properties import PropertyDescriptor, StandardValidators, ExpressionLanguageScope, PropertyDependency
|
||||
import QueryUtils
|
||||
import pinecone
|
||||
from EmbeddingUtils import OPENAI, HUGGING_FACE, EMBEDDING_MODEL, create_embedding_service
|
||||
|
||||
|
||||
class QueryPinecone(FlowFileTransform):
|
||||
|
@ -30,7 +30,6 @@ class QueryPinecone(FlowFileTransform):
|
|||
description = "Queries Pinecone in order to gather a specified number of documents that are most closely related to the given query."
|
||||
tags = ["pinecone", "vector", "vectordb", "vectorstore", "embeddings", "ai", "artificial intelligence", "ml", "machine learning", "text", "LLM"]
|
||||
|
||||
|
||||
PINECONE_API_KEY = PropertyDescriptor(
|
||||
name="Pinecone API Key",
|
||||
description="The API Key to use in order to authentication with Pinecone",
|
||||
|
@ -43,7 +42,32 @@ class QueryPinecone(FlowFileTransform):
|
|||
description="The API Key for OpenAI in order to create embeddings",
|
||||
sensitive=True,
|
||||
required=True,
|
||||
validators=[StandardValidators.NON_EMPTY_VALIDATOR]
|
||||
validators=[StandardValidators.NON_EMPTY_VALIDATOR],
|
||||
dependencies=[PropertyDependency(EMBEDDING_MODEL, OPENAI)]
|
||||
)
|
||||
# Credential for HuggingFace; flagged sensitive and tied to the
# HUGGING_FACE choice of EMBEDDING_MODEL through a property dependency.
HUGGING_FACE_API_KEY = PropertyDescriptor(
    name="HuggingFace API Key",
    description="The API Key for interacting with HuggingFace",
    sensitive=True,
    required=True,
    validators=[StandardValidators.NON_EMPTY_VALIDATOR],
    dependencies=[PropertyDependency(EMBEDDING_MODEL, HUGGING_FACE)]
)
|
||||
# OpenAI model name; tied to the OPENAI choice of EMBEDDING_MODEL through a
# property dependency. The description previously duplicated the API-key
# text, which mislabelled this property for users.
OPENAI_MODEL = PropertyDescriptor(
    name="OpenAI Model",
    description="The name of the OpenAI model to use in order to create embeddings",
    required=True,
    validators=[StandardValidators.NON_EMPTY_VALIDATOR],
    default_value="text-embedding-ada-002",
    dependencies=[PropertyDependency(EMBEDDING_MODEL, OPENAI)]
)
|
||||
# HuggingFace model name; tied to the HUGGING_FACE choice of
# EMBEDDING_MODEL through a property dependency.
HUGGING_FACE_MODEL = PropertyDescriptor(
    name="HuggingFace Model",
    description="The name of the HuggingFace model to use",
    default_value="sentence-transformers/all-MiniLM-L6-v2",
    required=True,
    validators=[StandardValidators.NON_EMPTY_VALIDATOR],
    dependencies=[PropertyDependency(EMBEDDING_MODEL, HUGGING_FACE)]
)
|
||||
PINECONE_ENV = PropertyDescriptor(
|
||||
name="Pinecone Environment",
|
||||
|
@ -91,9 +115,12 @@ class QueryPinecone(FlowFileTransform):
|
|||
expression_language_scope=ExpressionLanguageScope.FLOWFILE_ATTRIBUTES
|
||||
)
|
||||
|
||||
|
||||
properties = [PINECONE_API_KEY,
|
||||
EMBEDDING_MODEL,
|
||||
OPENAI_API_KEY,
|
||||
OPENAI_MODEL,
|
||||
HUGGING_FACE_API_KEY,
|
||||
HUGGING_FACE_MODEL,
|
||||
PINECONE_ENV,
|
||||
INDEX_NAME,
|
||||
QUERY,
|
||||
|
@ -123,8 +150,7 @@ class QueryPinecone(FlowFileTransform):
|
|||
api_key=api_key,
|
||||
environment=pinecone_env,
|
||||
)
|
||||
openai_api_key = context.getProperty(self.OPENAI_API_KEY).getValue()
|
||||
self.embeddings = OpenAIEmbeddings(openai_api_key=openai_api_key)
|
||||
self.embeddings = create_embedding_service(context)
|
||||
self.query_utils = QueryUtils.QueryUtils(context)
|
||||
|
||||
|
||||
|
|
Loading…
Reference in New Issue