mirror of https://github.com/apache/nifi.git
NIFI-12366 Add HuggingFace support to Pinecone processors
Signed-off-by: Pierre Villard <pierre.villard.fr@gmail.com> This closes #8026.
This commit is contained in:
parent
9154b708cc
commit
2d82cdc0f5
|
@ -14,6 +14,9 @@
|
||||||
# limitations under the License.
|
# limitations under the License.
|
||||||
|
|
||||||
from nifiapi.properties import PropertyDescriptor, StandardValidators, PropertyDependency, ExpressionLanguageScope
|
from nifiapi.properties import PropertyDescriptor, StandardValidators, PropertyDependency, ExpressionLanguageScope
|
||||||
|
from langchain.embeddings.openai import OpenAIEmbeddings
|
||||||
|
from langchain.embeddings.huggingface import HuggingFaceInferenceAPIEmbeddings
|
||||||
|
|
||||||
|
|
||||||
# Embedding Functions
|
# Embedding Functions
|
||||||
ONNX_ALL_MINI_LM_L6_V2 = "ONNX all-MiniLM-L6-v2 Model"
|
ONNX_ALL_MINI_LM_L6_V2 = "ONNX all-MiniLM-L6-v2 Model"
|
||||||
|
@ -99,13 +102,19 @@ SENTENCE_TRANSFORMER_MODEL_NAME = PropertyDescriptor(
|
||||||
)
|
)
|
||||||
SENTENCE_TRANSFORMER_DEVICE = PropertyDescriptor(
|
SENTENCE_TRANSFORMER_DEVICE = PropertyDescriptor(
|
||||||
name="Sentence Transformer Device Type",
|
name="Sentence Transformer Device Type",
|
||||||
description="The type of device to use for performing the embeddings using the Sentence Transformer, such as 'cpu', 'cuda', 'mps', 'cuda:0', etc. If not specified, a GPU will be used if "
|
description="""The type of device to use for performing the embeddings using the Sentence Transformer, such as 'cpu', 'cuda', 'mps', 'cuda:0', etc.
|
||||||
+ "possible, otherwise a CPU.",
|
If not specified, a GPU will be used if possible, otherwise a CPU.""",
|
||||||
validators=[StandardValidators.NON_EMPTY_VALIDATOR],
|
validators=[StandardValidators.NON_EMPTY_VALIDATOR],
|
||||||
required=False,
|
required=False,
|
||||||
dependencies=[PropertyDependency(EMBEDDING_FUNCTION, SENTENCE_TRANSFORMERS)]
|
dependencies=[PropertyDependency(EMBEDDING_FUNCTION, SENTENCE_TRANSFORMERS)]
|
||||||
)
|
)
|
||||||
|
# Selector for the embedding backend. The choice is restricted to the
# HUGGING_FACE and OPENAI constants and falls back to OPENAI when unset.
EMBEDDING_MODEL = PropertyDescriptor(
    name="Embedding Model",
    description="Specifies which embedding model should be used in order to create embeddings from incoming Documents. Default model is OpenAI.",
    required=True,
    default_value=OPENAI,
    allowable_values=[HUGGING_FACE, OPENAI],
)
|
||||||
PROPERTIES = [
|
PROPERTIES = [
|
||||||
EMBEDDING_FUNCTION,
|
EMBEDDING_FUNCTION,
|
||||||
HUGGING_FACE_MODEL_NAME,
|
HUGGING_FACE_MODEL_NAME,
|
||||||
|
@ -117,7 +126,8 @@ PROPERTIES = [
|
||||||
OPENAI_API_TYPE,
|
OPENAI_API_TYPE,
|
||||||
OPENAI_API_VERSION,
|
OPENAI_API_VERSION,
|
||||||
SENTENCE_TRANSFORMER_MODEL_NAME,
|
SENTENCE_TRANSFORMER_MODEL_NAME,
|
||||||
SENTENCE_TRANSFORMER_DEVICE
|
SENTENCE_TRANSFORMER_DEVICE,
|
||||||
|
EMBEDDING_MODEL
|
||||||
]
|
]
|
||||||
|
|
||||||
|
|
||||||
|
@ -145,3 +155,14 @@ def create_embedding_function(context):
|
||||||
model_name = context.getProperty(SENTENCE_TRANSFORMER_MODEL_NAME).getValue()
|
model_name = context.getProperty(SENTENCE_TRANSFORMER_MODEL_NAME).getValue()
|
||||||
device = context.getProperty(SENTENCE_TRANSFORMER_DEVICE).getValue()
|
device = context.getProperty(SENTENCE_TRANSFORMER_DEVICE).getValue()
|
||||||
return SentenceTransformerEmbeddingFunction(model_name=model_name, device=device)
|
return SentenceTransformerEmbeddingFunction(model_name=model_name, device=device)
|
||||||
|
|
||||||
|
|
||||||
|
def create_embedding_service(context):
    """Return the embeddings client chosen by the EMBEDDING_MODEL property.

    Reads EMBEDDING_MODEL from ``context``. When it equals OPENAI, an
    ``OpenAIEmbeddings`` instance is built from the OPENAI_API_KEY property;
    for any other value a ``HuggingFaceInferenceAPIEmbeddings`` instance is
    built from the HUGGING_FACE_API_KEY property.

    NOTE(review): only the API key is handed to either client; no model name
    is passed here, so each client uses whatever model it defaults to.
    Confirm whether the processors' model properties are meant to be applied.
    """
    selected_model = context.getProperty(EMBEDDING_MODEL).getValue()

    # EMBEDDING_MODEL only allows HUGGING_FACE and OPENAI, so "not OpenAI"
    # is handled as HuggingFace.
    if selected_model != OPENAI:
        return HuggingFaceInferenceAPIEmbeddings(
            api_key=context.getProperty(HUGGING_FACE_API_KEY).getValue()
        )

    return OpenAIEmbeddings(
        openai_api_key=context.getProperty(OPENAI_API_KEY).getValue()
    )
|
||||||
|
|
|
@ -14,11 +14,11 @@
|
||||||
# limitations under the License.
|
# limitations under the License.
|
||||||
|
|
||||||
from langchain.vectorstores import Pinecone
|
from langchain.vectorstores import Pinecone
|
||||||
from langchain.embeddings.openai import OpenAIEmbeddings
|
|
||||||
from nifiapi.flowfiletransform import FlowFileTransform, FlowFileTransformResult
|
from nifiapi.flowfiletransform import FlowFileTransform, FlowFileTransformResult
|
||||||
from nifiapi.properties import PropertyDescriptor, StandardValidators, ExpressionLanguageScope
|
from nifiapi.properties import PropertyDescriptor, StandardValidators, ExpressionLanguageScope, PropertyDependency
|
||||||
import pinecone
|
import pinecone
|
||||||
import json
|
import json
|
||||||
|
from EmbeddingUtils import OPENAI, HUGGING_FACE, EMBEDDING_MODEL, create_embedding_service
|
||||||
|
|
||||||
|
|
||||||
class PutPinecone(FlowFileTransform):
|
class PutPinecone(FlowFileTransform):
|
||||||
|
@ -31,7 +31,6 @@ class PutPinecone(FlowFileTransform):
|
||||||
The text must be a string, while metadata must be a map with strings for values. Any additional fields will be ignored."""
|
The text must be a string, while metadata must be a map with strings for values. Any additional fields will be ignored."""
|
||||||
tags = ["pinecone", "vector", "vectordb", "vectorstore", "embeddings", "ai", "artificial intelligence", "ml", "machine learning", "text", "LLM"]
|
tags = ["pinecone", "vector", "vectordb", "vectorstore", "embeddings", "ai", "artificial intelligence", "ml", "machine learning", "text", "LLM"]
|
||||||
|
|
||||||
|
|
||||||
PINECONE_API_KEY = PropertyDescriptor(
|
PINECONE_API_KEY = PropertyDescriptor(
|
||||||
name="Pinecone API Key",
|
name="Pinecone API Key",
|
||||||
description="The API Key to use in order to authentication with Pinecone",
|
description="The API Key to use in order to authentication with Pinecone",
|
||||||
|
@ -39,12 +38,37 @@ class PutPinecone(FlowFileTransform):
|
||||||
required=True,
|
required=True,
|
||||||
validators=[StandardValidators.NON_EMPTY_VALIDATOR]
|
validators=[StandardValidators.NON_EMPTY_VALIDATOR]
|
||||||
)
|
)
|
||||||
|
# Sensitive, required, non-empty credential for HuggingFace. Declared with a
# dependency on EMBEDDING_MODEL having the HUGGING_FACE value.
HUGGING_FACE_API_KEY = PropertyDescriptor(
    name="HuggingFace API Key",
    description="The API Key for interacting with HuggingFace",
    sensitive=True,
    required=True,
    validators=[StandardValidators.NON_EMPTY_VALIDATOR],
    dependencies=[PropertyDependency(EMBEDDING_MODEL, HUGGING_FACE)],
)
|
||||||
|
# Required, non-empty HuggingFace model name with a sentence-transformers
# default. Declared with a dependency on EMBEDDING_MODEL having the
# HUGGING_FACE value.
HUGGING_FACE_MODEL = PropertyDescriptor(
    name="HuggingFace Model",
    description="The name of the HuggingFace model to use",
    default_value="sentence-transformers/all-MiniLM-L6-v2",
    required=True,
    validators=[StandardValidators.NON_EMPTY_VALIDATOR],
    dependencies=[PropertyDependency(EMBEDDING_MODEL, HUGGING_FACE)],
)
|
||||||
OPENAI_API_KEY = PropertyDescriptor(
|
OPENAI_API_KEY = PropertyDescriptor(
|
||||||
name="OpenAI API Key",
|
name="OpenAI API Key",
|
||||||
description="The API Key for OpenAI in order to create embeddings",
|
description="The API Key for OpenAI in order to create embeddings",
|
||||||
sensitive=True,
|
sensitive=True,
|
||||||
required=True,
|
required=True,
|
||||||
validators=[StandardValidators.NON_EMPTY_VALIDATOR]
|
validators=[StandardValidators.NON_EMPTY_VALIDATOR],
|
||||||
|
dependencies=[PropertyDependency(EMBEDDING_MODEL, OPENAI)]
|
||||||
|
)
|
||||||
|
OPENAI_API_MODEL = PropertyDescriptor(
|
||||||
|
name="OpenAI Model",
|
||||||
|
description="The API Key for OpenAI in order to create embeddings",
|
||||||
|
required=True,
|
||||||
|
validators=[StandardValidators.NON_EMPTY_VALIDATOR],
|
||||||
|
default_value="text-embedding-ada-002",
|
||||||
|
dependencies=[PropertyDependency(EMBEDDING_MODEL, OPENAI)]
|
||||||
)
|
)
|
||||||
PINECONE_ENV = PropertyDescriptor(
|
PINECONE_ENV = PropertyDescriptor(
|
||||||
name="Pinecone Environment",
|
name="Pinecone Environment",
|
||||||
|
@ -78,15 +102,19 @@ class PutPinecone(FlowFileTransform):
|
||||||
)
|
)
|
||||||
DOC_ID_FIELD_NAME = PropertyDescriptor(
|
DOC_ID_FIELD_NAME = PropertyDescriptor(
|
||||||
name="Document ID Field Name",
|
name="Document ID Field Name",
|
||||||
description="Specifies the name of the field in the 'metadata' element of each document where the document's ID can be found. " +
|
description="""Specifies the name of the field in the 'metadata' element of each document where the document's ID can be found.
|
||||||
"If not specified, an ID will be generated based on the FlowFile's filename and a one-up number.",
|
If not specified, an ID will be generated based on the FlowFile's filename and a one-up number.""",
|
||||||
required=False,
|
required=False,
|
||||||
validators=[StandardValidators.NON_EMPTY_VALIDATOR],
|
validators=[StandardValidators.NON_EMPTY_VALIDATOR],
|
||||||
expression_language_scope=ExpressionLanguageScope.FLOWFILE_ATTRIBUTES
|
expression_language_scope=ExpressionLanguageScope.FLOWFILE_ATTRIBUTES
|
||||||
)
|
)
|
||||||
|
|
||||||
properties = [PINECONE_API_KEY,
|
properties = [PINECONE_API_KEY,
|
||||||
|
EMBEDDING_MODEL,
|
||||||
OPENAI_API_KEY,
|
OPENAI_API_KEY,
|
||||||
|
OPENAI_API_MODEL,
|
||||||
|
HUGGING_FACE_API_KEY,
|
||||||
|
HUGGING_FACE_MODEL,
|
||||||
PINECONE_ENV,
|
PINECONE_ENV,
|
||||||
INDEX_NAME,
|
INDEX_NAME,
|
||||||
TEXT_KEY,
|
TEXT_KEY,
|
||||||
|
@ -110,9 +138,8 @@ class PutPinecone(FlowFileTransform):
|
||||||
api_key=api_key,
|
api_key=api_key,
|
||||||
environment=pinecone_env,
|
environment=pinecone_env,
|
||||||
)
|
)
|
||||||
openai_api_key = context.getProperty(self.OPENAI_API_KEY).getValue()
|
|
||||||
self.embeddings = OpenAIEmbeddings(openai_api_key=openai_api_key)
|
|
||||||
|
|
||||||
|
self.embeddings = create_embedding_service(context)
|
||||||
|
|
||||||
def transform(self, context, flowfile):
|
def transform(self, context, flowfile):
|
||||||
# First, check if our index already exists. If it doesn't, we create it
|
# First, check if our index already exists. If it doesn't, we create it
|
||||||
|
@ -158,4 +185,4 @@ class PutPinecone(FlowFileTransform):
|
||||||
text_key = context.getProperty(self.TEXT_KEY).evaluateAttributeExpressions().getValue()
|
text_key = context.getProperty(self.TEXT_KEY).evaluateAttributeExpressions().getValue()
|
||||||
vectorstore = Pinecone(index, self.embeddings.embed_query, text_key)
|
vectorstore = Pinecone(index, self.embeddings.embed_query, text_key)
|
||||||
vectorstore.add_texts(texts=texts, metadatas=metadatas, ids=ids, namespace=namespace)
|
vectorstore.add_texts(texts=texts, metadatas=metadatas, ids=ids, namespace=namespace)
|
||||||
return FlowFileTransformResult(relationship = "success")
|
return FlowFileTransformResult(relationship="success")
|
||||||
|
|
|
@ -14,11 +14,11 @@
|
||||||
# limitations under the License.
|
# limitations under the License.
|
||||||
|
|
||||||
from langchain.vectorstores import Pinecone
|
from langchain.vectorstores import Pinecone
|
||||||
from langchain.embeddings.openai import OpenAIEmbeddings
|
|
||||||
from nifiapi.flowfiletransform import FlowFileTransform, FlowFileTransformResult
|
from nifiapi.flowfiletransform import FlowFileTransform, FlowFileTransformResult
|
||||||
from nifiapi.properties import PropertyDescriptor, StandardValidators, ExpressionLanguageScope
|
from nifiapi.properties import PropertyDescriptor, StandardValidators, ExpressionLanguageScope, PropertyDependency
|
||||||
import QueryUtils
|
import QueryUtils
|
||||||
import pinecone
|
import pinecone
|
||||||
|
from EmbeddingUtils import OPENAI, HUGGING_FACE, EMBEDDING_MODEL, create_embedding_service
|
||||||
|
|
||||||
|
|
||||||
class QueryPinecone(FlowFileTransform):
|
class QueryPinecone(FlowFileTransform):
|
||||||
|
@ -30,7 +30,6 @@ class QueryPinecone(FlowFileTransform):
|
||||||
description = "Queries Pinecone in order to gather a specified number of documents that are most closely related to the given query."
|
description = "Queries Pinecone in order to gather a specified number of documents that are most closely related to the given query."
|
||||||
tags = ["pinecone", "vector", "vectordb", "vectorstore", "embeddings", "ai", "artificial intelligence", "ml", "machine learning", "text", "LLM"]
|
tags = ["pinecone", "vector", "vectordb", "vectorstore", "embeddings", "ai", "artificial intelligence", "ml", "machine learning", "text", "LLM"]
|
||||||
|
|
||||||
|
|
||||||
PINECONE_API_KEY = PropertyDescriptor(
|
PINECONE_API_KEY = PropertyDescriptor(
|
||||||
name="Pinecone API Key",
|
name="Pinecone API Key",
|
||||||
description="The API Key to use in order to authentication with Pinecone",
|
description="The API Key to use in order to authentication with Pinecone",
|
||||||
|
@ -43,7 +42,32 @@ class QueryPinecone(FlowFileTransform):
|
||||||
description="The API Key for OpenAI in order to create embeddings",
|
description="The API Key for OpenAI in order to create embeddings",
|
||||||
sensitive=True,
|
sensitive=True,
|
||||||
required=True,
|
required=True,
|
||||||
validators=[StandardValidators.NON_EMPTY_VALIDATOR]
|
validators=[StandardValidators.NON_EMPTY_VALIDATOR],
|
||||||
|
dependencies=[PropertyDependency(EMBEDDING_MODEL, OPENAI)]
|
||||||
|
)
|
||||||
|
# Sensitive, required, non-empty credential for HuggingFace. Declared with a
# dependency on EMBEDDING_MODEL having the HUGGING_FACE value.
HUGGING_FACE_API_KEY = PropertyDescriptor(
    name="HuggingFace API Key",
    description="The API Key for interacting with HuggingFace",
    sensitive=True,
    required=True,
    validators=[StandardValidators.NON_EMPTY_VALIDATOR],
    dependencies=[PropertyDependency(EMBEDDING_MODEL, HUGGING_FACE)],
)
|
||||||
|
# Required, non-empty name of the OpenAI embedding model, defaulting to
# "text-embedding-ada-002". Declared with a dependency on EMBEDDING_MODEL
# having the OPENAI value.
# The description previously repeated the API-key text; it now describes the
# model name this property actually holds.
# NOTE(review): create_embedding_service, as visible in this change, passes
# only the API key to OpenAIEmbeddings - confirm where this value is consumed.
OPENAI_MODEL = PropertyDescriptor(
    name="OpenAI Model",
    description="The name of the OpenAI model to use in order to create embeddings",
    required=True,
    validators=[StandardValidators.NON_EMPTY_VALIDATOR],
    default_value="text-embedding-ada-002",
    dependencies=[PropertyDependency(EMBEDDING_MODEL, OPENAI)]
)
|
||||||
|
HUGGING_FACE_MODEL = PropertyDescriptor(
|
||||||
|
name="HuggingFace Model",
|
||||||
|
description="The name of the HuggingFace model to use",
|
||||||
|
validators=[StandardValidators.NON_EMPTY_VALIDATOR],
|
||||||
|
required=True,
|
||||||
|
default_value="sentence-transformers/all-MiniLM-L6-v2",
|
||||||
|
dependencies=[PropertyDependency(EMBEDDING_MODEL, HUGGING_FACE)]
|
||||||
)
|
)
|
||||||
PINECONE_ENV = PropertyDescriptor(
|
PINECONE_ENV = PropertyDescriptor(
|
||||||
name="Pinecone Environment",
|
name="Pinecone Environment",
|
||||||
|
@ -91,9 +115,12 @@ class QueryPinecone(FlowFileTransform):
|
||||||
expression_language_scope=ExpressionLanguageScope.FLOWFILE_ATTRIBUTES
|
expression_language_scope=ExpressionLanguageScope.FLOWFILE_ATTRIBUTES
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
properties = [PINECONE_API_KEY,
|
properties = [PINECONE_API_KEY,
|
||||||
|
EMBEDDING_MODEL,
|
||||||
OPENAI_API_KEY,
|
OPENAI_API_KEY,
|
||||||
|
OPENAI_MODEL,
|
||||||
|
HUGGING_FACE_API_KEY,
|
||||||
|
HUGGING_FACE_MODEL,
|
||||||
PINECONE_ENV,
|
PINECONE_ENV,
|
||||||
INDEX_NAME,
|
INDEX_NAME,
|
||||||
QUERY,
|
QUERY,
|
||||||
|
@ -123,8 +150,7 @@ class QueryPinecone(FlowFileTransform):
|
||||||
api_key=api_key,
|
api_key=api_key,
|
||||||
environment=pinecone_env,
|
environment=pinecone_env,
|
||||||
)
|
)
|
||||||
openai_api_key = context.getProperty(self.OPENAI_API_KEY).getValue()
|
self.embeddings = create_embedding_service(context)
|
||||||
self.embeddings = OpenAIEmbeddings(openai_api_key=openai_api_key)
|
|
||||||
self.query_utils = QueryUtils.QueryUtils(context)
|
self.query_utils = QueryUtils.QueryUtils(context)
|
||||||
|
|
||||||
|
|
||||||
|
|
Loading…
Reference in New Issue