fix: send only final answer with reasoning

When reasoning is enabled and block replies are not being streamed (no `onBlockReply` handler, i.e. non-block providers), we now ignore interim streaming chunks and send only the final assistant answer at completion, so replies aren’t partial or duplicated.
This commit is contained in:
Anton Sotkov
2026-01-10 14:55:52 +02:00
committed by Peter Steinberger
parent 4c86da044e
commit 3b5149ca39
2 changed files with 99 additions and 3 deletions

View File

@@ -294,6 +294,7 @@ export function subscribeEmbeddedPiSession(params: {
let lastStreamedReasoning: string | undefined;
let lastBlockReplyText: string | undefined;
let assistantTextBaseline = 0;
let suppressBlockChunks = false; // Avoid late chunk inserts after final text merge.
let compactionInFlight = false;
let pendingCompactionRetry = 0;
let compactionRetryResolve: (() => void) | undefined;
@@ -419,6 +420,7 @@ export function subscribeEmbeddedPiSession(params: {
};
const emitBlockChunk = (text: string) => {
if (suppressBlockChunks) return;
// Strip <think> blocks across chunk boundaries to avoid leaking reasoning.
const strippedText = stripBlockThinkingSegments(text);
const chunk = strippedText.trimEnd();
@@ -476,6 +478,7 @@ export function subscribeEmbeddedPiSession(params: {
lastStreamedAssistant = undefined;
lastStreamedReasoning = undefined;
lastBlockReplyText = undefined;
suppressBlockChunks = false;
assistantTextBaseline = 0;
};
@@ -497,6 +500,7 @@ export function subscribeEmbeddedPiSession(params: {
lastBlockReplyText = undefined;
lastStreamedReasoning = undefined;
lastReasoningSent = undefined;
suppressBlockChunks = false;
assistantTextBaseline = assistantTexts.length;
}
}
@@ -818,9 +822,23 @@ export function subscribeEmbeddedPiSession(params: {
const addedDuringMessage =
assistantTexts.length > assistantTextBaseline;
const chunkerHasBuffered = blockChunker?.hasBuffered() ?? false;
// Non-streaming models (no text_delta): ensure assistantTexts gets the
// final text when the chunker has nothing buffered to drain.
if (!addedDuringMessage && !chunkerHasBuffered && text) {
// If we're not streaming block replies, ensure the final payload
// includes the final text even when deltas already populated assistantTexts.
if (includeReasoning && text && !params.onBlockReply) {
if (assistantTexts.length > assistantTextBaseline) {
assistantTexts.splice(
assistantTextBaseline,
assistantTexts.length - assistantTextBaseline,
text,
);
} else {
const last = assistantTexts.at(-1);
if (!last || last !== text) assistantTexts.push(text);
}
suppressBlockChunks = true;
} else if (!addedDuringMessage && !chunkerHasBuffered && text) {
// Non-streaming models (no text_delta): ensure assistantTexts gets the
// final text when the chunker has nothing buffered to drain.
const last = assistantTexts.at(-1);
if (!last || last !== text) assistantTexts.push(text);
}