# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import json
from langchain.text_splitter import Language
from nifiapi.flowfiletransform import FlowFileTransform, FlowFileTransformResult
from nifiapi.properties import PropertyDescriptor, StandardValidators, PropertyDependency, ExpressionLanguageScope
from nifiapi.documentation import use_case, multi_processor_use_case, ProcessorConfiguration

SPLIT_BY_CHARACTER = 'Split by Character'
SPLIT_CODE = 'Split Code'
RECURSIVELY_SPLIT_BY_CHARACTER = 'Recursively Split by Character'
TEXT_KEY = "text"
METADATA_KEY = "metadata"

@use_case(
    description="Create chunks of text from a single larger chunk.",
    notes="The input for this use case is expected to be a FlowFile whose content is a JSON Lines document, with each line having a 'text' and a 'metadata' element.",
    keywords=["embedding", "vector", "text", "rag", "retrieval augmented generation"],
    configuration="""
        Set "Chunking Strategy" to "Recursively Split by Character"
        Set "Separator" to "\\n\\n,\\n, ,"
        Set "Separator Format" to "Plain Text"
        Set "Chunk Size" to "4000"
        Set "Chunk Overlap" to "200"
        Set "Keep Separator" to "false"
        """
)
@multi_processor_use_case(
    description="""
        Chunk plaintext data in order to prepare it for storage in a vector store. The output is in "json-lines" format,
        containing the chunked data as text, as well as metadata pertaining to the chunk.""",
    notes="The input for this use case is expected to be a FlowFile whose content is a plaintext document.",
    keywords=["embedding", "vector", "text", "rag", "retrieval augmented generation"],
    configurations=[
        ProcessorConfiguration(
            processor_type="ParseDocument",
            configuration="""
                Set "Input Format" to "Plain Text"
                Set "Element Strategy" to "Single Document"

                Connect the 'success' Relationship to ChunkDocument.
                """
        ),
        ProcessorConfiguration(
            processor_type="ChunkDocument",
            configuration="""
                Set the following properties:
                    "Chunking Strategy" = "Recursively Split by Character"
                    "Separator" = "\\n\\n,\\n, ,"
                    "Separator Format" = "Plain Text"
                    "Chunk Size" = "4000"
                    "Chunk Overlap" = "200"
                    "Keep Separator" = "false"

                Connect the 'success' Relationship to the appropriate destination to store data in the desired vector store.
                """
        )
    ])
@multi_processor_use_case(
    description="""
        Parse and chunk the textual contents of a PDF document in order to prepare it for storage in a vector store. The output is in "json-lines" format,
        containing the chunked data as text, as well as metadata pertaining to the chunk.""",
    notes="The input for this use case is expected to be a FlowFile whose content is a PDF document.",
    keywords=["pdf", "embedding", "vector", "text", "rag", "retrieval augmented generation"],
    configurations=[
        ProcessorConfiguration(
            processor_type="ParseDocument",
            configuration="""
                Set "Input Format" to "PDF"
                Set "Element Strategy" to "Single Document"
                Set "Include Extracted Metadata" to "false"

                Connect the 'success' Relationship to ChunkDocument.
                """
        ),
        ProcessorConfiguration(
            processor_type="ChunkDocument",
            configuration="""
                Set the following properties:
                    "Chunking Strategy" = "Recursively Split by Character"
                    "Separator" = "\\n\\n,\\n, ,"
                    "Separator Format" = "Plain Text"
                    "Chunk Size" = "4000"
                    "Chunk Overlap" = "200"
                    "Keep Separator" = "false"

                Connect the 'success' Relationship to the appropriate destination to store data in the desired vector store.
                """
        )
    ])
class ChunkDocument(FlowFileTransform):
    class Java:
        implements = ['org.apache.nifi.python.processor.FlowFileTransform']

    class ProcessorDetails:
        version = '2.0.0-SNAPSHOT'
description = """ Chunks incoming documents that are formatted as JSON Lines into chunks that are appropriately sized for creating Text Embeddings.
The input is expected to be in " json-lines " format , with each line having a ' text ' and a ' metadata ' element .
Each line will then be split into one or more lines in the output . """
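        # Illustrative contract (hypothetical values): an input line such as
        #   {"text": "<a long passage>", "metadata": {"source": "a.txt"}}
        # may yield several output lines, each repeating the original metadata
        # plus the 'chunk_index' and 'chunk_count' keys added by to_json().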
tags = [ " text " , " split " , " chunk " , " langchain " , " embeddings " , " vector " , " machine learning " , " ML " , " artificial intelligence " , " ai " , " document " ]
dependencies = [ ' langchain ' ]

    CHUNK_STRATEGY = PropertyDescriptor(
        name="Chunking Strategy",
        description="Specifies which splitter should be used to split the text",
        allowable_values=[RECURSIVELY_SPLIT_BY_CHARACTER, SPLIT_BY_CHARACTER, SPLIT_CODE],
        required=True,
        default_value=RECURSIVELY_SPLIT_BY_CHARACTER
    )
    SEPARATOR = PropertyDescriptor(
        name="Separator",
        description="Specifies the character sequence to use for splitting apart the text. If using a Chunking Strategy of Recursively Split by Character, " +
                    "it is a comma-separated list of character sequences. Meta-characters \\n, \\r and \\t are automatically un-escaped.",
        required=True,
        default_value="\\n\\n,\\n, ,",
        validators=[StandardValidators.NON_EMPTY_VALIDATOR],
        dependencies=[PropertyDependency(CHUNK_STRATEGY, SPLIT_BY_CHARACTER, RECURSIVELY_SPLIT_BY_CHARACTER)],
        expression_language_scope=ExpressionLanguageScope.FLOWFILE_ATTRIBUTES
    )
    SEPARATOR_FORMAT = PropertyDescriptor(
        name="Separator Format",
        description="Specifies how to interpret the value of the <Separator> property",
        required=True,
        default_value="Plain Text",
        allowable_values=["Plain Text", "Regular Expression"],
        dependencies=[PropertyDependency(CHUNK_STRATEGY, SPLIT_BY_CHARACTER, RECURSIVELY_SPLIT_BY_CHARACTER)]
    )
    CHUNK_SIZE = PropertyDescriptor(
        name="Chunk Size",
        description="The maximum size of a chunk that should be returned",
        required=True,
        default_value="4000",
        validators=[StandardValidators.POSITIVE_INTEGER_VALIDATOR]
    )
    CHUNK_OVERLAP = PropertyDescriptor(
        name="Chunk Overlap",
        description="The number of characters that should be overlapped between each chunk of text",
        required=True,
        default_value="200",
        validators=[StandardValidators.NON_NEGATIVE_INTEGER_VALIDATOR]
    )
    KEEP_SEPARATOR = PropertyDescriptor(
        name="Keep Separator",
        description="Whether or not to keep the text separator in each chunk of data",
        required=True,
        default_value="false",
        allowable_values=["true", "false"],
        dependencies=[PropertyDependency(CHUNK_STRATEGY, SPLIT_BY_CHARACTER, RECURSIVELY_SPLIT_BY_CHARACTER)]
    )
    STRIP_WHITESPACE = PropertyDescriptor(
        name="Strip Whitespace",
        description="Whether or not to strip the whitespace at the beginning and end of each chunk",
        required=True,
        default_value="true",
        allowable_values=["true", "false"],
        dependencies=[PropertyDependency(CHUNK_STRATEGY, SPLIT_BY_CHARACTER, RECURSIVELY_SPLIT_BY_CHARACTER)]
    )
    LANGUAGE = PropertyDescriptor(
        name="Language",
        description="The language to use for the Code's syntax",
        required=True,
        default_value="python",
        allowable_values=[e.value for e in Language],
        dependencies=[PropertyDependency(CHUNK_STRATEGY, SPLIT_CODE)]
    )

    # LANGUAGE must be registered here, or getPropertyDescriptors() would never
    # expose the Language property that the Split Code strategy depends on.
    property_descriptors = [CHUNK_STRATEGY,
                            SEPARATOR,
                            SEPARATOR_FORMAT,
                            CHUNK_SIZE,
                            CHUNK_OVERLAP,
                            KEEP_SEPARATOR,
                            STRIP_WHITESPACE,
                            LANGUAGE]

    def __init__(self, **kwargs):
        pass

    def getPropertyDescriptors(self):
        return self.property_descriptors

    def split_docs(self, context, flowfile, documents):
        from langchain.text_splitter import CharacterTextSplitter
        from langchain.text_splitter import RecursiveCharacterTextSplitter

        strategy = context.getProperty(self.CHUNK_STRATEGY).getValue()
        if strategy == SPLIT_BY_CHARACTER:
            text_splitter = CharacterTextSplitter(
                separator=context.getProperty(self.SEPARATOR).evaluateAttributeExpressions(flowfile).getValue(),
                keep_separator=context.getProperty(self.KEEP_SEPARATOR).asBoolean(),
                is_separator_regex=context.getProperty(self.SEPARATOR_FORMAT).getValue() == 'Regular Expression',
                chunk_size=context.getProperty(self.CHUNK_SIZE).asInteger(),
                chunk_overlap=context.getProperty(self.CHUNK_OVERLAP).asInteger(),
                length_function=len,
                strip_whitespace=context.getProperty(self.STRIP_WHITESPACE).asBoolean()
            )
        elif strategy == SPLIT_CODE:
            text_splitter = RecursiveCharacterTextSplitter.from_language(
                language=context.getProperty(self.LANGUAGE).getValue(),
                chunk_size=context.getProperty(self.CHUNK_SIZE).asInteger(),
                chunk_overlap=context.getProperty(self.CHUNK_OVERLAP).asInteger()
            )
        else:
            # The Separator property holds a comma-separated list of separators;
            # un-escape the \n, \r and \t meta-characters before use.
            separator_text = context.getProperty(self.SEPARATOR).evaluateAttributeExpressions(flowfile).getValue()
            splits = separator_text.split(",")
            unescaped = []
            for split in splits:
                unescaped.append(split.replace("\\n", "\n").replace("\\r", "\r").replace("\\t", "\t"))
            text_splitter = RecursiveCharacterTextSplitter(
                separators=unescaped,
                keep_separator=context.getProperty(self.KEEP_SEPARATOR).asBoolean(),
                is_separator_regex=context.getProperty(self.SEPARATOR_FORMAT).getValue() == 'Regular Expression',
                chunk_size=context.getProperty(self.CHUNK_SIZE).asInteger(),
                chunk_overlap=context.getProperty(self.CHUNK_OVERLAP).asInteger(),
                length_function=len,
                strip_whitespace=context.getProperty(self.STRIP_WHITESPACE).asBoolean()
            )

        return text_splitter.split_documents(documents)
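    # Illustrative note: with the default (un-escaped) separators ["\n\n", "\n", " ", ""],
    # RecursiveCharacterTextSplitter tries paragraph breaks first, then line breaks,
    # then spaces, recursing until each piece fits within Chunk Size. A runnable
    # sketch appears at the bottom of this file.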

    def to_json(self, docs) -> str:
        json_docs = []

        for i, doc in enumerate(docs):
            # Tag each chunk with its position and the total number of chunks
            doc.metadata['chunk_index'] = i
            doc.metadata['chunk_count'] = len(docs)

            json_doc = json.dumps({
                TEXT_KEY: doc.page_content,
                METADATA_KEY: doc.metadata
            })
            json_docs.append(json_doc)

        return "\n".join(json_docs)

    def load_docs(self, flowfile):
        from langchain.schema import Document

        flowfile_contents = flowfile.getContentsAsBytes().decode()
        docs = []
        for line in flowfile_contents.split("\n"):
            stripped = line.strip()
            if stripped == "":
                continue

            json_element = json.loads(stripped)
            page_content = json_element.get(TEXT_KEY)
            if page_content is None:
                continue

            metadata = json_element.get(METADATA_KEY)
            if metadata is None:
                metadata = {}

            docs.append(Document(page_content=page_content, metadata=metadata))

        return docs

    def transform(self, context, flowfile):
        documents = self.load_docs(flowfile)
        split_docs = self.split_docs(context, flowfile, documents)
        output_json = self.to_json(split_docs)

        attributes = {"document.count": str(len(split_docs))}
        return FlowFileTransformResult("success", contents=output_json, attributes=attributes)
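

# ---------------------------------------------------------------------------
# Standalone sketch, not part of the processor: NiFi drives ChunkDocument
# through the FlowFileTransform API, so this block only demonstrates the
# langchain splitter that the processor configures. It assumes langchain is
# installed locally; the sample text and the small chunk_size/chunk_overlap
# values are made up so the example actually splits.
# ---------------------------------------------------------------------------
if __name__ == '__main__':
    from langchain.schema import Document
    from langchain.text_splitter import RecursiveCharacterTextSplitter

    splitter = RecursiveCharacterTextSplitter(
        separators=["\n\n", "\n", " ", ""],  # un-escaped form of the default Separator "\\n\\n,\\n, ,"
        keep_separator=False,
        is_separator_regex=False,
        chunk_size=60,
        chunk_overlap=10,
        length_function=len,
        strip_whitespace=True
    )
    sample = Document(
        page_content="First paragraph of a sample document.\n\n"
                     "Second paragraph, which is long enough that the splitter will break it into more than one chunk.",
        metadata={"source": "sample.txt"}
    )
    for chunk in splitter.split_documents([sample]):
        print(repr(chunk.page_content), chunk.metadata)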