NIFI-12636 - Upgrade dependencies for Pinecone, ChromaDB and OpenAI processors

Signed-off-by: Joe Gresock <jgresock@gmail.com>
This closes #8293.
This commit is contained in:
Pierre Villard 2024-01-23 21:02:40 +04:00 committed by Joe Gresock
parent 85dc637a96
commit 3afeac6341
No known key found for this signature in database
GPG Key ID: 37F5B9B6E258C8B7
7 changed files with 28 additions and 32 deletions

View File

@@ -33,7 +33,7 @@ class PromptChatGPT(FlowFileTransform):
version = '2.0.0-SNAPSHOT'
description = "Submits a prompt to ChatGPT, writing the results either to a FlowFile attribute or to the contents of the FlowFile"
tags = ["text", "chatgpt", "gpt", "machine learning", "ML", "artificial intelligence", "ai", "document", "langchain"]
dependencies = ['langchain==0.0.331', 'openai==0.28.1', 'jsonpath-ng']
dependencies = ['langchain==0.1.2', 'openai==1.9.0', 'jsonpath-ng']
MODEL = PropertyDescriptor(

View File

@@ -13,7 +13,7 @@
# See the License for the specific language governing permissions and
# limitations under the License.
from nifiapi.properties import PropertyDescriptor, StandardValidators, PropertyDependency, ExpressionLanguageScope
from nifiapi.properties import PropertyDescriptor, StandardValidators, PropertyDependency
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.embeddings.huggingface import HuggingFaceInferenceAPIEmbeddings

View File

@@ -13,13 +13,13 @@
# See the License for the specific language governing permissions and
# limitations under the License.
from langchain.vectorstores import Pinecone
import langchain.vectorstores
from nifiapi.flowfiletransform import FlowFileTransform, FlowFileTransformResult
from nifiapi.properties import PropertyDescriptor, StandardValidators, ExpressionLanguageScope, PropertyDependency
import pinecone
from pinecone import Pinecone
import json
from EmbeddingUtils import OPENAI, HUGGING_FACE, EMBEDDING_MODEL, create_embedding_service
from nifiapi.documentation import use_case, multi_processor_use_case, ProcessorConfiguration
from nifiapi.documentation import use_case
@use_case(description="Create vectors/embeddings that represent text content and send the vectors to Pinecone",
notes="This use case assumes that the data has already been formatted in JSONL format with the text to store in Pinecone provided in the 'text' field.",
@@ -149,6 +149,7 @@ class PutPinecone(FlowFileTransform):
DOC_ID_FIELD_NAME]
embeddings = None
pc = None
def __init__(self, **kwargs):
pass
@@ -157,15 +158,12 @@ class PutPinecone(FlowFileTransform):
return self.properties
def onScheduled(self, context):
api_key = context.getProperty(self.PINECONE_API_KEY).getValue()
pinecone_env = context.getProperty(self.PINECONE_ENV).getValue()
# initialize pinecone
pinecone.init(
api_key=api_key,
environment=pinecone_env,
self.pc = Pinecone(
api_key=context.getProperty(self.PINECONE_API_KEY).getValue(),
environment=context.getProperty(self.PINECONE_ENV).getValue()
)
# initialize embedding service
self.embeddings = create_embedding_service(context)
def transform(self, context, flowfile):
@@ -174,7 +172,7 @@ class PutPinecone(FlowFileTransform):
namespace = context.getProperty(self.NAMESPACE).evaluateAttributeExpressions(flowfile).getValue()
id_field_name = context.getProperty(self.DOC_ID_FIELD_NAME).evaluateAttributeExpressions(flowfile).getValue()
index = pinecone.Index(index_name)
index = self.pc.Index(index_name)
# Read the FlowFile content as "json-lines".
json_lines = flowfile.getContentsAsBytes().decode()
@@ -210,6 +208,6 @@ class PutPinecone(FlowFileTransform):
i += 1
text_key = context.getProperty(self.TEXT_KEY).evaluateAttributeExpressions().getValue()
vectorstore = Pinecone(index, self.embeddings.embed_query, text_key)
vectorstore = langchain.vectorstores.Pinecone(index, self.embeddings.embed_query, text_key)
vectorstore.add_texts(texts=texts, metadatas=metadatas, ids=ids, namespace=namespace)
return FlowFileTransformResult(relationship="success")

View File

@@ -16,7 +16,7 @@
import json
from nifiapi.flowfiletransform import FlowFileTransform, FlowFileTransformResult
from nifiapi.properties import PropertyDescriptor, StandardValidators, ExpressionLanguageScope, PropertyDependency
from nifiapi.properties import PropertyDescriptor, StandardValidators, ExpressionLanguageScope
import ChromaUtils
import EmbeddingUtils
import QueryUtils

View File

@@ -13,11 +13,11 @@
# See the License for the specific language governing permissions and
# limitations under the License.
from langchain.vectorstores import Pinecone
import langchain.vectorstores
from nifiapi.flowfiletransform import FlowFileTransform, FlowFileTransformResult
from nifiapi.properties import PropertyDescriptor, StandardValidators, ExpressionLanguageScope, PropertyDependency
import QueryUtils
import pinecone
from pinecone import Pinecone
import json
from EmbeddingUtils import OPENAI, HUGGING_FACE, EMBEDDING_MODEL, create_embedding_service
@@ -143,6 +143,7 @@ class QueryPinecone(FlowFileTransform):
embeddings = None
query_utils = None
pc = None
def __init__(self, **kwargs):
pass
@@ -151,18 +152,15 @@ class QueryPinecone(FlowFileTransform):
return self.properties
def onScheduled(self, context):
api_key = context.getProperty(self.PINECONE_API_KEY).getValue()
pinecone_env = context.getProperty(self.PINECONE_ENV).getValue()
# initialize pinecone
pinecone.init(
api_key=api_key,
environment=pinecone_env,
self.pc = Pinecone(
api_key=context.getProperty(self.PINECONE_API_KEY).getValue(),
environment=context.getProperty(self.PINECONE_ENV).getValue()
)
self.embeddings = create_embedding_service(context)
# initialize embedding service
self.embeddings = create_embedding_service(context)
self.query_utils = QueryUtils.QueryUtils(context)
def transform(self, context, flowfile):
# First, check if our index already exists. If it doesn't, we create it
index_name = context.getProperty(self.INDEX_NAME).evaluateAttributeExpressions(flowfile).getValue()
@@ -170,11 +168,11 @@ class QueryPinecone(FlowFileTransform):
namespace = context.getProperty(self.NAMESPACE).evaluateAttributeExpressions(flowfile).getValue()
num_results = context.getProperty(self.NUMBER_OF_RESULTS).evaluateAttributeExpressions(flowfile).asInteger()
index = pinecone.Index(index_name)
index = self.pc.Index(index_name)
text_key = context.getProperty(self.TEXT_KEY).evaluateAttributeExpressions().getValue()
filter = context.getProperty(self.FILTER).evaluateAttributeExpressions(flowfile).getValue()
vectorstore = Pinecone(index, self.embeddings.embed_query, text_key, namespace=namespace)
vectorstore = langchain.vectorstores.Pinecone(index, self.embeddings.embed_query, text_key, namespace=namespace)
results = vectorstore.similarity_search_with_score(query, num_results, filter=None if filter is None else json.loads(filter))
documents = []

View File

@@ -14,7 +14,7 @@
# limitations under the License.
from typing import Tuple
from nifiapi.properties import PropertyDescriptor, StandardValidators, ExpressionLanguageScope, PropertyDependency
from nifiapi.properties import PropertyDescriptor, StandardValidators, PropertyDependency
import json
ROW_ORIENTED = "Row-Oriented"

View File

@@ -14,16 +14,16 @@
# limitations under the License.
# Shared requirements
openai==0.28.1
openai==1.9.0
# Chroma requirements
chromadb==0.4.14
chromadb==0.4.22
onnxruntime
tokenizers
tqdm
requests
# Pinecone requirements
pinecone-client
pinecone-client==3.0.1
tiktoken
langchain==0.0.331
langchain==0.1.2