668 lines
19 KiB
Ruby
668 lines
19 KiB
Ruby
# frozen_string_literal: true
|
|
|
|
# This code is copied from the MIT licensed json-stream
|
|
# see: https://github.com/dgraham/json-stream
|
|
#
|
|
# It was copied to avoid the dependency and allow us to make some small changes
|
|
# particularly we need better access to internal state when parsing
|
|
|
|
module DiscourseAi
|
|
module Completions
|
|
# Raised on any invalid JSON text.
|
|
ParserError = Class.new(RuntimeError)
|
|
|
|
# A streaming JSON parser that generates SAX-like events for state changes.
|
|
# Use the json gem for small documents. Use this for huge documents that
|
|
# won't fit in memory.
|
|
#
|
|
# Examples
|
|
#
|
|
# parser = JSON::Stream::Parser.new
|
|
# parser.key { |key| puts key }
|
|
# parser.value { |value| puts value }
|
|
# parser << '{"answer":'
|
|
# parser << ' 42}'
|
|
class JsonStreamingParser
|
|
# our changes:
|
|
attr_reader :state, :buf, :pos
|
|
|
|
# A character buffer that expects a UTF-8 encoded stream of bytes.
|
|
# This handles truncated multi-byte characters properly so we can just
|
|
# feed it binary data and receive a properly formatted UTF-8 String as
|
|
# output.
|
|
#
|
|
# More UTF-8 parsing details are available at:
|
|
#
|
|
# http://en.wikipedia.org/wiki/UTF-8
|
|
# http://tools.ietf.org/html/rfc3629#section-3
|
|
class Buffer
|
|
def initialize
|
|
@state = :start
|
|
@buffer = []
|
|
@need = 0
|
|
end
|
|
|
|
# Fill the buffer with a String of binary UTF-8 encoded bytes. Returns
|
|
# as much of the data in a UTF-8 String as we have. Truncated multi-byte
|
|
# characters are saved in the buffer until the next call to this method
|
|
# where we expect to receive the rest of the multi-byte character.
|
|
#
|
|
# data - The partial binary encoded String data.
|
|
#
|
|
# Raises JSON::Stream::ParserError if the UTF-8 byte sequence is malformed.
|
|
#
|
|
# Returns a UTF-8 encoded String.
|
|
def <<(data)
|
|
# Avoid state machine for complete UTF-8.
|
|
if @buffer.empty?
|
|
data.force_encoding(Encoding::UTF_8)
|
|
return data if data.valid_encoding?
|
|
end
|
|
|
|
bytes = []
|
|
data.each_byte do |byte|
|
|
case @state
|
|
when :start
|
|
if byte < 128
|
|
bytes << byte
|
|
elsif byte >= 192
|
|
@state = :multi_byte
|
|
@buffer << byte
|
|
@need =
|
|
case
|
|
when byte >= 240
|
|
4
|
|
when byte >= 224
|
|
3
|
|
when byte >= 192
|
|
2
|
|
end
|
|
else
|
|
error("Expected start of multi-byte or single byte char")
|
|
end
|
|
when :multi_byte
|
|
if byte > 127 && byte < 192
|
|
@buffer << byte
|
|
if @buffer.size == @need
|
|
bytes += @buffer.slice!(0, @buffer.size)
|
|
@state = :start
|
|
end
|
|
else
|
|
error("Expected continuation byte")
|
|
end
|
|
end
|
|
end
|
|
|
|
# Build UTF-8 encoded string from completed codepoints.
|
|
bytes
|
|
.pack("C*")
|
|
.force_encoding(Encoding::UTF_8)
|
|
.tap { |text| error("Invalid UTF-8 byte sequence") unless text.valid_encoding? }
|
|
end
|
|
|
|
# Determine if the buffer contains partial UTF-8 continuation bytes that
|
|
# are waiting on subsequent completion bytes before a full codepoint is
|
|
# formed.
|
|
#
|
|
# Examples
|
|
#
|
|
# bytes = "é".bytes
|
|
#
|
|
# buffer << bytes[0]
|
|
# buffer.empty?
|
|
# # => false
|
|
#
|
|
# buffer << bytes[1]
|
|
# buffer.empty?
|
|
# # => true
|
|
#
|
|
# Returns true if the buffer is empty.
|
|
def empty?
|
|
@buffer.empty?
|
|
end
|
|
|
|
private
|
|
|
|
def error(message)
|
|
raise ParserError, message
|
|
end
|
|
end
|
|
|
|
BUF_SIZE = 4096
|
|
CONTROL = /[\x00-\x1F]/
|
|
WS = /[ \n\t\r]/
|
|
HEX = /[0-9a-fA-F]/
|
|
DIGIT = /[0-9]/
|
|
DIGIT_1_9 = /[1-9]/
|
|
DIGIT_END = /\d$/
|
|
TRUE_RE = /[rue]/
|
|
FALSE_RE = /[alse]/
|
|
NULL_RE = /[ul]/
|
|
TRUE_KEYWORD = "true"
|
|
FALSE_KEYWORD = "false"
|
|
NULL_KEYWORD = "null"
|
|
LEFT_BRACE = "{"
|
|
RIGHT_BRACE = "}"
|
|
LEFT_BRACKET = "["
|
|
RIGHT_BRACKET = "]"
|
|
BACKSLASH = '\\'
|
|
SLASH = "/"
|
|
QUOTE = '"'
|
|
COMMA = ","
|
|
COLON = ":"
|
|
ZERO = "0"
|
|
MINUS = "-"
|
|
PLUS = "+"
|
|
POINT = "."
|
|
EXPONENT = /[eE]/
|
|
B, F, N, R, T, U = %w[b f n r t u]
|
|
|
|
# Create a new parser with an optional initialization block where
|
|
# we can register event callbacks.
|
|
#
|
|
# Examples
|
|
#
|
|
# parser = JSON::Stream::Parser.new do
|
|
# start_document { puts "start document" }
|
|
# end_document { puts "end document" }
|
|
# start_object { puts "start object" }
|
|
# end_object { puts "end object" }
|
|
# start_array { puts "start array" }
|
|
# end_array { puts "end array" }
|
|
# key { |k| puts "key: #{k}" }
|
|
# value { |v| puts "value: #{v}" }
|
|
# end
|
|
def initialize(&block)
|
|
@state = :start_document
|
|
@utf8 = Buffer.new
|
|
@listeners = {
|
|
start_document: [],
|
|
end_document: [],
|
|
start_object: [],
|
|
end_object: [],
|
|
start_array: [],
|
|
end_array: [],
|
|
key: [],
|
|
value: [],
|
|
}
|
|
|
|
# Track parse stack.
|
|
@stack = []
|
|
@unicode = +""
|
|
@buf = +""
|
|
@pos = -1
|
|
|
|
# Register any observers in the block.
|
|
instance_eval(&block) if block_given?
|
|
end
|
|
|
|
def start_document(&block)
|
|
@listeners[:start_document] << block
|
|
end
|
|
|
|
def end_document(&block)
|
|
@listeners[:end_document] << block
|
|
end
|
|
|
|
def start_object(&block)
|
|
@listeners[:start_object] << block
|
|
end
|
|
|
|
def end_object(&block)
|
|
@listeners[:end_object] << block
|
|
end
|
|
|
|
def start_array(&block)
|
|
@listeners[:start_array] << block
|
|
end
|
|
|
|
def end_array(&block)
|
|
@listeners[:end_array] << block
|
|
end
|
|
|
|
def key(&block)
|
|
@listeners[:key] << block
|
|
end
|
|
|
|
def value(&block)
|
|
@listeners[:value] << block
|
|
end
|
|
|
|
# Pass data into the parser to advance the state machine and
|
|
# generate callback events. This is well suited for an EventMachine
|
|
# receive_data loop.
|
|
#
|
|
# data - The String of partial JSON data to parse.
|
|
#
|
|
# Raises a JSON::Stream::ParserError if the JSON data is malformed.
|
|
#
|
|
# Returns nothing.
|
|
def <<(data)
|
|
(@utf8 << data).each_char do |ch|
|
|
@pos += 1
|
|
case @state
|
|
when :start_document
|
|
start_value(ch)
|
|
when :start_object
|
|
case ch
|
|
when QUOTE
|
|
@state = :start_string
|
|
@stack.push(:key)
|
|
when RIGHT_BRACE
|
|
end_container(:object)
|
|
when WS
|
|
# ignore
|
|
else
|
|
error("Expected object key start")
|
|
end
|
|
when :start_string
|
|
case ch
|
|
when QUOTE
|
|
if @stack.pop == :string
|
|
end_value(@buf)
|
|
else # :key
|
|
@state = :end_key
|
|
notify(:key, @buf)
|
|
end
|
|
@buf = +""
|
|
when BACKSLASH
|
|
@state = :start_escape
|
|
when CONTROL
|
|
error("Control characters must be escaped")
|
|
else
|
|
@buf << ch
|
|
end
|
|
when :start_escape
|
|
case ch
|
|
when QUOTE, BACKSLASH, SLASH
|
|
@buf << ch
|
|
@state = :start_string
|
|
when B
|
|
@buf << "\b"
|
|
@state = :start_string
|
|
when F
|
|
@buf << "\f"
|
|
@state = :start_string
|
|
when N
|
|
@buf << "\n"
|
|
@state = :start_string
|
|
when R
|
|
@buf << "\r"
|
|
@state = :start_string
|
|
when T
|
|
@buf << "\t"
|
|
@state = :start_string
|
|
when U
|
|
@state = :unicode_escape
|
|
else
|
|
error("Expected escaped character")
|
|
end
|
|
when :unicode_escape
|
|
case ch
|
|
when HEX
|
|
@unicode << ch
|
|
if @unicode.size == 4
|
|
codepoint = @unicode.slice!(0, 4).hex
|
|
if codepoint >= 0xD800 && codepoint <= 0xDBFF
|
|
error("Expected low surrogate pair half") if @stack[-1].is_a?(Integer)
|
|
@state = :start_surrogate_pair
|
|
@stack.push(codepoint)
|
|
elsif codepoint >= 0xDC00 && codepoint <= 0xDFFF
|
|
high = @stack.pop
|
|
error("Expected high surrogate pair half") unless high.is_a?(Integer)
|
|
pair = ((high - 0xD800) * 0x400) + (codepoint - 0xDC00) + 0x10000
|
|
@buf << pair
|
|
@state = :start_string
|
|
else
|
|
@buf << codepoint
|
|
@state = :start_string
|
|
end
|
|
end
|
|
else
|
|
error("Expected unicode escape hex digit")
|
|
end
|
|
when :start_surrogate_pair
|
|
case ch
|
|
when BACKSLASH
|
|
@state = :start_surrogate_pair_u
|
|
else
|
|
error("Expected low surrogate pair half")
|
|
end
|
|
when :start_surrogate_pair_u
|
|
case ch
|
|
when U
|
|
@state = :unicode_escape
|
|
else
|
|
error("Expected low surrogate pair half")
|
|
end
|
|
when :start_negative_number
|
|
case ch
|
|
when ZERO
|
|
@state = :start_zero
|
|
@buf << ch
|
|
when DIGIT_1_9
|
|
@state = :start_int
|
|
@buf << ch
|
|
else
|
|
error("Expected 0-9 digit")
|
|
end
|
|
when :start_zero
|
|
case ch
|
|
when POINT
|
|
@state = :start_float
|
|
@buf << ch
|
|
when EXPONENT
|
|
@state = :start_exponent
|
|
@buf << ch
|
|
else
|
|
end_value(@buf.to_i)
|
|
@buf = +""
|
|
@pos -= 1
|
|
redo
|
|
end
|
|
when :start_float
|
|
case ch
|
|
when DIGIT
|
|
@state = :in_float
|
|
@buf << ch
|
|
else
|
|
error("Expected 0-9 digit")
|
|
end
|
|
when :in_float
|
|
case ch
|
|
when DIGIT
|
|
@buf << ch
|
|
when EXPONENT
|
|
@state = :start_exponent
|
|
@buf << ch
|
|
else
|
|
end_value(@buf.to_f)
|
|
@buf = +""
|
|
@pos -= 1
|
|
redo
|
|
end
|
|
when :start_exponent
|
|
case ch
|
|
when MINUS, PLUS, DIGIT
|
|
@state = :in_exponent
|
|
@buf << ch
|
|
else
|
|
error("Expected +, -, or 0-9 digit")
|
|
end
|
|
when :in_exponent
|
|
case ch
|
|
when DIGIT
|
|
@buf << ch
|
|
else
|
|
error("Expected 0-9 digit") unless @buf =~ DIGIT_END
|
|
end_value(@buf.to_f)
|
|
@buf = +""
|
|
@pos -= 1
|
|
redo
|
|
end
|
|
when :start_int
|
|
case ch
|
|
when DIGIT
|
|
@buf << ch
|
|
when POINT
|
|
@state = :start_float
|
|
@buf << ch
|
|
when EXPONENT
|
|
@state = :start_exponent
|
|
@buf << ch
|
|
else
|
|
end_value(@buf.to_i)
|
|
@buf = +""
|
|
@pos -= 1
|
|
redo
|
|
end
|
|
when :start_true
|
|
keyword(TRUE_KEYWORD, true, TRUE_RE, ch)
|
|
when :start_false
|
|
keyword(FALSE_KEYWORD, false, FALSE_RE, ch)
|
|
when :start_null
|
|
keyword(NULL_KEYWORD, nil, NULL_RE, ch)
|
|
when :end_key
|
|
case ch
|
|
when COLON
|
|
@state = :key_sep
|
|
when WS
|
|
# ignore
|
|
else
|
|
error("Expected colon key separator")
|
|
end
|
|
when :key_sep
|
|
start_value(ch)
|
|
when :start_array
|
|
case ch
|
|
when RIGHT_BRACKET
|
|
end_container(:array)
|
|
when WS
|
|
# ignore
|
|
else
|
|
start_value(ch)
|
|
end
|
|
when :end_value
|
|
case ch
|
|
when COMMA
|
|
@state = :value_sep
|
|
when RIGHT_BRACE
|
|
end_container(:object)
|
|
when RIGHT_BRACKET
|
|
end_container(:array)
|
|
when WS
|
|
# ignore
|
|
else
|
|
error("Expected comma or object or array close")
|
|
end
|
|
when :value_sep
|
|
if @stack[-1] == :object
|
|
case ch
|
|
when QUOTE
|
|
@state = :start_string
|
|
@stack.push(:key)
|
|
when WS
|
|
# ignore
|
|
else
|
|
error("Expected object key start")
|
|
end
|
|
else
|
|
start_value(ch)
|
|
end
|
|
when :end_document
|
|
error("Unexpected data") unless ch =~ WS
|
|
end
|
|
end
|
|
end
|
|
|
|
# Drain any remaining buffered characters into the parser to complete
|
|
# the parsing of the document.
|
|
#
|
|
# This is only required when parsing a document containing a single
|
|
# numeric value, integer or float. The parser has no other way to
|
|
# detect when it should no longer expect additional characters with
|
|
# which to complete the parse, so it must be signaled by a call to
|
|
# this method.
|
|
#
|
|
# If you're parsing more typical object or array documents, there's no
|
|
# need to call `finish` because the parse will complete when the final
|
|
# closing `]` or `}` character is scanned.
|
|
#
|
|
# Raises a JSON::Stream::ParserError if the JSON data is malformed.
|
|
#
|
|
# Returns nothing.
|
|
def finish
|
|
# Partial multi-byte character waiting for completion bytes.
|
|
error("Unexpected end-of-file") unless @utf8.empty?
|
|
|
|
# Partial array, object, or string.
|
|
error("Unexpected end-of-file") unless @stack.empty?
|
|
|
|
case @state
|
|
when :end_document
|
|
# done, do nothing
|
|
when :in_float
|
|
end_value(@buf.to_f)
|
|
when :in_exponent
|
|
error("Unexpected end-of-file") unless @buf =~ DIGIT_END
|
|
end_value(@buf.to_f)
|
|
when :start_zero
|
|
end_value(@buf.to_i)
|
|
when :start_int
|
|
end_value(@buf.to_i)
|
|
else
|
|
error("Unexpected end-of-file")
|
|
end
|
|
end
|
|
|
|
private
|
|
|
|
# Invoke all registered observer procs for the event type.
|
|
#
|
|
# type - The Symbol listener name.
|
|
# args - The argument list to pass into the observer procs.
|
|
#
|
|
# Examples
|
|
#
|
|
# # broadcast events for {"answer": 42}
|
|
# notify(:start_object)
|
|
# notify(:key, "answer")
|
|
# notify(:value, 42)
|
|
# notify(:end_object)
|
|
#
|
|
# Returns nothing.
|
|
def notify(type, *args)
|
|
@listeners[type].each { |block| block.call(*args) }
|
|
end
|
|
|
|
# Complete an object or array container value type.
|
|
#
|
|
# type - The Symbol, :object or :array, of the expected type.
|
|
#
|
|
# Raises a JSON::Stream::ParserError if the expected container type
|
|
# was not completed.
|
|
#
|
|
# Returns nothing.
|
|
def end_container(type)
|
|
@state = :end_value
|
|
if @stack.pop == type
|
|
case type
|
|
when :object
|
|
notify(:end_object)
|
|
when :array
|
|
notify(:end_array)
|
|
end
|
|
else
|
|
error("Expected end of #{type}")
|
|
end
|
|
notify_end_document if @stack.empty?
|
|
end
|
|
|
|
# Broadcast an `end_document` event to observers after a complete JSON
|
|
# value document (object, array, number, string, true, false, null) has
|
|
# been parsed from the text. This is the final event sent to observers
|
|
# and signals the parse has finished.
|
|
#
|
|
# Returns nothing.
|
|
def notify_end_document
|
|
@state = :end_document
|
|
notify(:end_document)
|
|
end
|
|
|
|
# Parse one of the three allowed keywords: true, false, null.
|
|
#
|
|
# word - The String keyword ('true', 'false', 'null').
|
|
# value - The Ruby value (true, false, nil).
|
|
# re - The Regexp of allowed keyword characters.
|
|
# ch - The current String character being parsed.
|
|
#
|
|
# Raises a JSON::Stream::ParserError if the character does not belong
|
|
# in the expected keyword.
|
|
#
|
|
# Returns nothing.
|
|
def keyword(word, value, re, ch)
|
|
if ch =~ re
|
|
@buf << ch
|
|
else
|
|
error("Expected #{word} keyword")
|
|
end
|
|
|
|
if @buf.size == word.size
|
|
if @buf == word
|
|
@buf = +""
|
|
end_value(value)
|
|
else
|
|
error("Expected #{word} keyword")
|
|
end
|
|
end
|
|
end
|
|
|
|
# Process the first character of one of the seven possible JSON
|
|
# values: object, array, string, true, false, null, number.
|
|
#
|
|
# ch - The current character String.
|
|
#
|
|
# Raises a JSON::Stream::ParserError if the character does not signal
|
|
# the start of a value.
|
|
#
|
|
# Returns nothing.
|
|
def start_value(ch)
|
|
case ch
|
|
when LEFT_BRACE
|
|
notify(:start_document) if @stack.empty?
|
|
@state = :start_object
|
|
@stack.push(:object)
|
|
notify(:start_object)
|
|
when LEFT_BRACKET
|
|
notify(:start_document) if @stack.empty?
|
|
@state = :start_array
|
|
@stack.push(:array)
|
|
notify(:start_array)
|
|
when QUOTE
|
|
@state = :start_string
|
|
@stack.push(:string)
|
|
when T
|
|
@state = :start_true
|
|
@buf << ch
|
|
when F
|
|
@state = :start_false
|
|
@buf << ch
|
|
when N
|
|
@state = :start_null
|
|
@buf << ch
|
|
when MINUS
|
|
@state = :start_negative_number
|
|
@buf << ch
|
|
when ZERO
|
|
@state = :start_zero
|
|
@buf << ch
|
|
when DIGIT_1_9
|
|
@state = :start_int
|
|
@buf << ch
|
|
when WS
|
|
# ignore
|
|
else
|
|
error("Expected value")
|
|
end
|
|
end
|
|
|
|
# Advance the state machine and notify `value` observers that a
|
|
# string, number or keyword (true, false, null) value was parsed.
|
|
#
|
|
# value - The object to broadcast to observers.
|
|
#
|
|
# Returns nothing.
|
|
def end_value(value)
|
|
@state = :end_value
|
|
notify(:start_document) if @stack.empty?
|
|
notify(:value, value)
|
|
notify_end_document if @stack.empty?
|
|
end
|
|
|
|
def error(message)
|
|
raise ParserError, "#{message}: char #{@pos}"
|
|
end
|
|
end
|
|
end
|
|
end
|