DEV: Extend truncation to all summarizable content (#884)
This commit is contained in:
parent
e8eed710e0
commit
e8f0633141
|
@ -25,8 +25,10 @@ module DiscourseAi
|
||||||
def summarize(user, &on_partial_blk)
|
def summarize(user, &on_partial_blk)
|
||||||
base_summary = ""
|
base_summary = ""
|
||||||
initial_pos = 0
|
initial_pos = 0
|
||||||
folded_summary =
|
|
||||||
fold(content_to_summarize, base_summary, initial_pos, user, &on_partial_blk)
|
truncated_content = content_to_summarize.map { |cts| truncate(cts) }
|
||||||
|
|
||||||
|
folded_summary = fold(truncated_content, base_summary, initial_pos, user, &on_partial_blk)
|
||||||
|
|
||||||
clean_summary =
|
clean_summary =
|
||||||
Nokogiri::HTML5.fragment(folded_summary).css("ai")&.first&.text || folded_summary
|
Nokogiri::HTML5.fragment(folded_summary).css("ai")&.first&.text || folded_summary
|
||||||
|
@ -37,7 +39,7 @@ module DiscourseAi
|
||||||
strategy.type,
|
strategy.type,
|
||||||
llm_model.name,
|
llm_model.name,
|
||||||
clean_summary,
|
clean_summary,
|
||||||
content_to_summarize.map { |c| c[:id] },
|
truncated_content.map { |c| c[:id] },
|
||||||
)
|
)
|
||||||
else
|
else
|
||||||
AiSummary.new(summarized_text: clean_summary)
|
AiSummary.new(summarized_text: clean_summary)
|
||||||
|
@ -121,9 +123,9 @@ module DiscourseAi
|
||||||
prompt =
|
prompt =
|
||||||
(
|
(
|
||||||
if summary.blank?
|
if summary.blank?
|
||||||
strategy.first_summary_prompt(iteration_content, tokenizer)
|
strategy.first_summary_prompt(iteration_content)
|
||||||
else
|
else
|
||||||
strategy.summary_extension_prompt(summary, iteration_content, tokenizer)
|
strategy.summary_extension_prompt(summary, iteration_content)
|
||||||
end
|
end
|
||||||
)
|
)
|
||||||
|
|
||||||
|
@ -143,6 +145,22 @@ module DiscourseAi
|
||||||
|
|
||||||
llm_model.max_prompt_tokens - reserved_tokens
|
llm_model.max_prompt_tokens - reserved_tokens
|
||||||
end
|
end
|
||||||
|
|
||||||
|
# Shortens the :text of a single summarizable item so that one very long
# entry cannot dominate the prompt. The text is split at its midpoint and
# each half is passed through the model tokenizer's `truncate`, keeping the
# start of the first half and the end of the second half.
#
# Text that needs no shortening is left untouched: previously the two halves
# were always re-joined with a space, which injected a stray space into the
# middle of short texts (potentially inside a word).
#
# @param item [Hash] content entry; only its :text value is read and rewritten.
# @return [Hash] the same +item+ object (mutated in place), with :text set
#   to a String.
def truncate(item)
  item_content = item[:text].to_s
  midpoint = item_content.size / 2
  first_half = item_content[0, midpoint]
  second_half = item_content[midpoint..-1]

  # Budget applied to EACH half; presumably measured in tokens by the
  # tokenizer - confirm against the tokenizer implementation.
  truncation_length = 500
  tokenizer = llm_model.tokenizer_class

  head = tokenizer.truncate(first_half, truncation_length)
  # Reverse before and after truncating so the END of the text is what survives.
  tail = tokenizer.truncate(second_half.reverse, truncation_length).reverse

  item[:text] =
    if head == first_half && tail == second_half
      # Nothing was cut: keep the original text verbatim.
      item_content
    else
      # The space marks the seam where the middle of the text was dropped.
      [head, tail].join(" ")
    end

  item
end
|
||||||
end
|
end
|
||||||
end
|
end
|
||||||
end
|
end
|
||||||
|
|
|
@ -34,12 +34,12 @@ module DiscourseAi
|
||||||
end
|
end
|
||||||
|
|
||||||
# @returns { DiscourseAi::Completions::Prompt } - Prompt passed to the LLM when extending an existing summary.
|
# @returns { DiscourseAi::Completions::Prompt } - Prompt passed to the LLM when extending an existing summary.
|
||||||
def summary_extension_prompt(_summary, _texts_to_summarize, _tokenizer)
|
def summary_extension_prompt(_summary, _texts_to_summarize)
|
||||||
raise NotImplementedError
|
raise NotImplementedError
|
||||||
end
|
end
|
||||||
|
|
||||||
# @returns { DiscourseAi::Completions::Prompt } - Prompt passed to the LLM for summarizing a single chunk of content.
|
# @returns { DiscourseAi::Completions::Prompt } - Prompt passed to the LLM for summarizing a single chunk of content.
|
||||||
def first_summary_prompt(_input, _tokenizer)
|
def first_summary_prompt(_input)
|
||||||
raise NotImplementedError
|
raise NotImplementedError
|
||||||
end
|
end
|
||||||
|
|
||||||
|
|
|
@ -23,7 +23,7 @@ module DiscourseAi
|
||||||
.map { { id: _1, poster: _2, text: _3 } }
|
.map { { id: _1, poster: _2, text: _3 } }
|
||||||
end
|
end
|
||||||
|
|
||||||
def summary_extension_prompt(summary, contents, _tokenizer)
|
def summary_extension_prompt(summary, contents)
|
||||||
input =
|
input =
|
||||||
contents
|
contents
|
||||||
.map { |item| "(#{item[:id]} #{item[:poster]} said: #{item[:text]} " }
|
.map { |item| "(#{item[:id]} #{item[:poster]} said: #{item[:text]} " }
|
||||||
|
@ -63,7 +63,7 @@ module DiscourseAi
|
||||||
prompt
|
prompt
|
||||||
end
|
end
|
||||||
|
|
||||||
def first_summary_prompt(contents, _tokenizer)
|
def first_summary_prompt(contents)
|
||||||
content_title = target.name
|
content_title = target.name
|
||||||
input =
|
input =
|
||||||
contents.map { |item| "(#{item[:id]} #{item[:poster]} said: #{item[:text]} " }.join
|
contents.map { |item| "(#{item[:id]} #{item[:poster]} said: #{item[:text]} " }.join
|
||||||
|
|
|
@ -57,7 +57,7 @@ module DiscourseAi
|
||||||
end
|
end
|
||||||
end
|
end
|
||||||
|
|
||||||
def summary_extension_prompt(summary, contents, _tokenizer)
|
def summary_extension_prompt(summary, contents)
|
||||||
statements =
|
statements =
|
||||||
contents
|
contents
|
||||||
.to_a
|
.to_a
|
||||||
|
@ -98,22 +98,11 @@ module DiscourseAi
|
||||||
prompt
|
prompt
|
||||||
end
|
end
|
||||||
|
|
||||||
def first_summary_prompt(contents, tokenizer)
|
def first_summary_prompt(contents)
|
||||||
content_title = target.title
|
content_title = target.title
|
||||||
statements =
|
statements =
|
||||||
contents.to_a.map { |item| "(#{item[:id]} #{item[:poster]} said: #{item[:text]} " }
|
contents.to_a.map { |item| "(#{item[:id]} #{item[:poster]} said: #{item[:text]} " }
|
||||||
|
|
||||||
op_statement = statements.shift.to_s
|
|
||||||
split_1, split_2 =
|
|
||||||
[op_statement[0, op_statement.size / 2], op_statement[(op_statement.size / 2)..-1]]
|
|
||||||
|
|
||||||
truncation_length = 500
|
|
||||||
|
|
||||||
op_statement = [
|
|
||||||
tokenizer.truncate(split_1, truncation_length),
|
|
||||||
tokenizer.truncate(split_2.reverse, truncation_length).reverse,
|
|
||||||
].join(" ")
|
|
||||||
|
|
||||||
prompt = DiscourseAi::Completions::Prompt.new(<<~TEXT.strip)
|
prompt = DiscourseAi::Completions::Prompt.new(<<~TEXT.strip)
|
||||||
You are an advanced summarization bot. Analyze a given conversation and produce a concise,
|
You are an advanced summarization bot. Analyze a given conversation and produce a concise,
|
||||||
single-sentence summary that conveys the main topic and current developments to someone with no prior context.
|
single-sentence summary that conveys the main topic and current developments to someone with no prior context.
|
||||||
|
@ -138,7 +127,7 @@ module DiscourseAi
|
||||||
|
|
||||||
The conversation began with the following statement:
|
The conversation began with the following statement:
|
||||||
|
|
||||||
#{op_statement}\n
|
#{statements.shift}\n
|
||||||
TEXT
|
TEXT
|
||||||
|
|
||||||
if statements.present?
|
if statements.present?
|
||||||
|
|
|
@ -27,7 +27,7 @@ module DiscourseAi
|
||||||
end
|
end
|
||||||
end
|
end
|
||||||
|
|
||||||
def summary_extension_prompt(summary, contents, _tokenizer)
|
def summary_extension_prompt(summary, contents)
|
||||||
resource_path = "#{Discourse.base_path}/t/-/#{target.id}"
|
resource_path = "#{Discourse.base_path}/t/-/#{target.id}"
|
||||||
content_title = target.title
|
content_title = target.title
|
||||||
input =
|
input =
|
||||||
|
@ -70,7 +70,7 @@ module DiscourseAi
|
||||||
prompt
|
prompt
|
||||||
end
|
end
|
||||||
|
|
||||||
def first_summary_prompt(contents, _tokenizer)
|
def first_summary_prompt(contents)
|
||||||
resource_path = "#{Discourse.base_path}/t/-/#{target.id}"
|
resource_path = "#{Discourse.base_path}/t/-/#{target.id}"
|
||||||
content_title = target.title
|
content_title = target.title
|
||||||
input =
|
input =
|
||||||
|
|
Loading…
Reference in New Issue