Merge pull request #26 from rfc1437/feature/sse-streaming

feat: SSE streaming for chat providers
2026-03-01 12:14:51 +01:00
parent 5938aa9642 2ddaad422f
commit f21dc74113
5 changed files with 2466 additions and 120 deletions
--- a/.vscode/settings.json
+++ b/.vscode/settings.json
@@ -4,6 +4,10 @@
        "npx tsc": true,
        "git remote": true,
        "npx asar": true,
-        "npx tsx": true
+        "npx tsx": true,
        "gh": true,
        "git add": true,
        "git commit": true,
        "git push": true
    }
 }
--- a/AGENTS.md
+++ b/AGENTS.md
@@ -7,6 +7,11 @@ This document provides context and best practices for GitHub Copilot when workin
 - Make the plan extremely concise. Sacrifice grammar for the sake of concision.
 - At the end of each plan, give me a list of unresolved questions to answer, if any.
 ## Commits
 - commit messages are short - one sentence. do not write long articles.
 - pull requests are more verbose and especially give reasoning for changes
 ---
 ## ⚠️ MANDATORY: Test-First Development
--- a/src/main/engine/OpenCodeManager.ts
+++ b/src/main/engine/OpenCodeManager.ts
@@ -12,6 +12,14 @@ import https from 'https';
 import http from 'http';
 import { URL } from 'url';
 import { BrowserWindow } from 'electron';
 import {
  parseAnthropicStreamEvent,
  parseOpenAIStreamEvent,
  createAnthropicStreamAccumulator,
  createOpenAIStreamAccumulator,
  httpRequestStream,
  withRetry,
 } from './streaming';
 import { ChatEngine } from './ChatEngine';
 import { PostEngine, type PostData } from './PostEngine';
 import { MediaEngine, type MediaData } from './MediaEngine';
@@ -129,6 +137,8 @@ interface AnthropicContentBlock {
  input?: unknown;
  tool_use_id?: string;
  content?: string | AnthropicToolResultContent[];
  is_error?: boolean;
  signature?: string;
  source?: {
    type: 'base64';
    media_type: string;
@@ -463,6 +473,7 @@ export class OpenCodeManager {
    while (round < MAX_TOOL_ROUNDS) {
      round++;
      if (signal.aborted) break;
      const body: Record<string, unknown> = {
        model: modelId,
@@ -470,42 +481,79 @@ export class OpenCodeManager {
        system: systemPrompt,
        messages,
        tools,
        stream: true,
        cache_control: { type: 'ephemeral' },
      };
-      const response = await this.httpRequest(ZEN_ANTHROPIC_URL, {
+      // Retry only the HTTP connection (429/502/503 are caught before any events are emitted).
-        method: 'POST',
+      // Event processing is outside retry scope to prevent double-emission of onDelta on retry.
-        headers: {
+      const { events } = await withRetry(async () => {
-          'Content-Type': 'application/json',
+        return httpRequestStream(ZEN_ANTHROPIC_URL, {
-          'x-api-key': this.apiKey,
+          method: 'POST',
-          'Authorization': `Bearer ${this.apiKey}`,
+          headers: {
-          'anthropic-version': '2023-06-01',
+            'Content-Type': 'application/json',
-        },
+            'x-api-key': this.apiKey,
-        body: JSON.stringify(body),
+            'Authorization': `Bearer ${this.apiKey}`,
-        signal,
+            'anthropic-version': '2023-06-01',
          },
          body: JSON.stringify(body),
          signal,
        });
      });
-      if (response.statusCode >= 400) {
+      // Process stream events outside retry scope — onDelta is never called twice for the same text
-        const errorMsg = this.parseErrorResponse(response);
+      const streamAccumulator = createAnthropicStreamAccumulator();
-        throw new Error(errorMsg);
+      let stopReason = '';
      let inputTokens = 0;
      let outputTokens = 0;
      let cacheReadTokens = 0;
      let cacheWriteTokens = 0;
      let roundText = '';
      let receivedUsage = false;
      try {
        for await (const event of events) {
          const result = parseAnthropicStreamEvent(event, streamAccumulator);
          if (result.textDelta) {
            roundText += result.textDelta;
            if (callbacks.onDelta) {
              callbacks.onDelta(result.textDelta);
            }
          }
          if (result.usage) {
            receivedUsage = true;
            if (result.usage.inputTokens !== undefined) inputTokens = result.usage.inputTokens;
            if (result.usage.cacheReadTokens !== undefined) cacheReadTokens = result.usage.cacheReadTokens;
            if (result.usage.cacheWriteTokens !== undefined) cacheWriteTokens = result.usage.cacheWriteTokens;
            if (result.usage.outputTokens !== undefined) outputTokens = result.usage.outputTokens;
          }
          if (result.finishReason) {
            stopReason = result.finishReason;
          }
          if (result.done) break;
        }
      } finally {
        // Preserve text already emitted via onDelta even if the stream errors mid-round
        accumulatedText += roundText;
      }
-      const data = JSON.parse(response.body);
+      const streamToolCalls = streamAccumulator.toolCalls;
      const streamThinkingBlocks = streamAccumulator.thinkingBlocks;
-      // Extract and emit token usage
+      // Emit token usage after stream completes (only when usage data was received)
-      if (data.usage && callbacks.onTokenUsage) {
+      if (callbacks.onTokenUsage && receivedUsage) {
-        const usage = data.usage;
+        const adjustedInputTokens = inputTokens - cacheReadTokens - cacheWriteTokens;
-        const cacheReadTokens = usage.cache_read_input_tokens || 0;
+        const totalTokens = inputTokens + outputTokens;
        const cacheWriteTokens = usage.cache_creation_input_tokens || 0;
        const inputTokens = (usage.input_tokens || 0) - cacheReadTokens - cacheWriteTokens;
        const outputTokens = usage.output_tokens || 0;
        const totalTokens = (usage.input_tokens || 0) + outputTokens;
        const prev = this.conversationUsage.get(conversationId) || {
          inputTokens: 0, outputTokens: 0, cacheReadTokens: 0, cacheWriteTokens: 0,
        };
        const cumulative = {
-          inputTokens: prev.inputTokens + inputTokens,
+          inputTokens: prev.inputTokens + adjustedInputTokens,
          outputTokens: prev.outputTokens + outputTokens,
          cacheReadTokens: prev.cacheReadTokens + cacheReadTokens,
          cacheWriteTokens: prev.cacheWriteTokens + cacheWriteTokens,
@@ -513,7 +561,7 @@ export class OpenCodeManager {
        this.conversationUsage.set(conversationId, cumulative);
        callbacks.onTokenUsage({
-          inputTokens, outputTokens, cacheReadTokens, cacheWriteTokens, totalTokens,
+          inputTokens: adjustedInputTokens, outputTokens, cacheReadTokens, cacheWriteTokens, totalTokens,
          cumulativeInputTokens: cumulative.inputTokens,
          cumulativeOutputTokens: cumulative.outputTokens,
          cumulativeCacheReadTokens: cumulative.cacheReadTokens,
@@ -522,35 +570,20 @@ export class OpenCodeManager {
        });
      }
-      console.log('[OpenCodeManager] Round', round, 'stop_reason:', data.stop_reason, 'content blocks:', JSON.stringify(data.content?.map((b: AnthropicContentBlock) => ({ type: b.type, textLen: b.text?.length, name: b.name }))));
+      // Collect tool calls from stream accumulator
-
+      const toolUseBlocks: Array<{ id: string; name: string; input: unknown; parseError?: string }> = [];
-      if (!data.content) {
+      for (const [, tc] of streamToolCalls) {
-        throw new Error('API response missing content field');
+        try {
-      }
+          toolUseBlocks.push({ id: tc.id, name: tc.name, input: JSON.parse(tc.arguments) });
-
+        } catch (e) {
-      // Check if there are tool_use blocks
+          console.error(`[OpenCodeManager] Failed to parse tool arguments for ${tc.name}:`, tc.arguments);
-      const toolUseBlocks = (data.content as AnthropicContentBlock[]).filter(
+          toolUseBlocks.push({ id: tc.id, name: tc.name, input: {}, parseError: `Failed to parse tool arguments: ${(e as Error).message}` });
        (b: AnthropicContentBlock) => b.type === 'tool_use'
      );
      // Capture text from any block type that has a text field (text, thinking, etc.)
      const textBlocks = (data.content as AnthropicContentBlock[]).filter(
        (b: AnthropicContentBlock) => b.text
      );
      // Accumulate and stream text content to frontend
      for (const block of textBlocks) {
        if (block.text) {
          accumulatedText += block.text;
          if (callbacks.onDelta) {
            callbacks.onDelta(block.text);
          }
        }
      }
-      console.log('[OpenCodeManager] Round', round, 'accumulatedText length:', accumulatedText.length, 'toolUseBlocks:', toolUseBlocks.length);
+      console.log('[OpenCodeManager] Round', round, 'stopReason:', stopReason, 'accumulatedText length:', accumulatedText.length, 'toolCalls:', toolUseBlocks.length);
-      if (toolUseBlocks.length === 0 || data.stop_reason !== 'tool_use') {
+      if (toolUseBlocks.length === 0 || stopReason !== 'tool_use') {
        // No more tool calls - return all accumulated text
        console.log('[OpenCodeManager] Returning accumulated text length:', accumulatedText.length);
        return { content: accumulatedText, toolCalls: allToolCalls };
@@ -558,11 +591,37 @@ export class OpenCodeManager {
      // Execute tool calls
      const toolResults: AnthropicContentBlock[] = [];
      // Build assistant content blocks for the next message round
      const assistantContentBlocks: AnthropicContentBlock[] = [];
      // Add thinking blocks first (Anthropic requires thinking before text when extended thinking is enabled)
      for (const [, tb] of streamThinkingBlocks) {
        if (tb.text) {
          const thinkingBlock: AnthropicContentBlock = { type: 'thinking', text: tb.text };
          if (tb.signature) {
            thinkingBlock.signature = tb.signature;
          }
          assistantContentBlocks.push(thinkingBlock);
        }
      }
      // Add text block with text from this round
      if (roundText) {
        assistantContentBlocks.push({ type: 'text', text: roundText });
      }
      for (const toolBlock of toolUseBlocks) {
-        const toolName = toolBlock.name!;
+        const toolName = toolBlock.name;
        const toolArgs = toolBlock.input;
-        const toolUseId = toolBlock.id!;
+        const toolUseId = toolBlock.id;
        // Add tool_use block to assistant content
        assistantContentBlocks.push({
          type: 'tool_use',
          id: toolUseId,
          name: toolName,
          input: toolArgs,
        });
        allToolCalls.push({ name: toolName, args: toolArgs });
@@ -570,6 +629,21 @@ export class OpenCodeManager {
          callbacks.onToolCall({ name: toolName, args: toolArgs });
        }
        // If JSON parsing of tool arguments failed, report the error to the model
        if (toolBlock.parseError) {
          const errorResult = { error: true, message: toolBlock.parseError };
          if (callbacks.onToolResult) {
            callbacks.onToolResult({ name: toolName, result: errorResult });
          }
          toolResults.push({
            type: 'tool_result',
            tool_use_id: toolUseId,
            content: JSON.stringify(errorResult),
            is_error: true,
          });
          continue;
        }
        // Check if this is a render tool — generate A2UI messages instead of executing
        if (isRenderTool(toolName)) {
          const a2uiMessages = generateFromToolCall(
@@ -593,7 +667,8 @@ export class OpenCodeManager {
          continue;
        }
-        // Execute the tool
+        // Execute the tool (check abort before each tool execution)
        if (signal.aborted) break;
        const result = await this.executeTool(toolName, toolArgs as Record<string, unknown>);
        if (callbacks.onToolResult) {
@@ -640,10 +715,12 @@ export class OpenCodeManager {
        }
      }
      if (signal.aborted) break;
      // Add assistant response and tool results to messages for next round
      messages = [
        ...messages,
-        { role: 'assistant' as const, content: data.content },
+        { role: 'assistant' as const, content: assistantContentBlocks },
        { role: 'user' as const, content: toolResults },
      ];
    }
@@ -712,39 +789,77 @@ export class OpenCodeManager {
    while (round < MAX_TOOL_ROUNDS) {
      round++;
      if (signal.aborted) break;
      const body: Record<string, unknown> = {
        model: modelId,
        max_tokens: 4096,
        messages,
        tools: openaiTools,
        stream: true,
        stream_options: { include_usage: true },
      };
-      const response = await this.httpRequest(ZEN_OPENAI_URL, {
+      // Retry only the HTTP connection (429/502/503 are caught before any events are emitted).
-        method: 'POST',
+      // Event processing is outside retry scope to prevent double-emission of onDelta on retry.
-        headers: {
+      const { events } = await withRetry(async () => {
-          'Content-Type': 'application/json',
+        return httpRequestStream(ZEN_OPENAI_URL, {
-          'Authorization': `Bearer ${this.apiKey}`,
+          method: 'POST',
-        },
+          headers: {
-        body: JSON.stringify(body),
+            'Content-Type': 'application/json',
-        signal,
+            'Authorization': `Bearer ${this.apiKey}`,
          },
          body: JSON.stringify(body),
          signal,
        });
      });
-      if (response.statusCode >= 400) {
+      // Process stream events outside retry scope — onDelta is never called twice for the same text
-        const errorMsg = this.parseErrorResponse(response);
+      const streamAccumulator = createOpenAIStreamAccumulator();
-        throw new Error(errorMsg);
+      let finishReason = '';
      let promptTokens = 0;
      let completionTokens = 0;
      let totalTokens = 0;
      let cacheReadTokens = 0;
      let roundText = '';
      let receivedUsage = false;
      try {
        for await (const event of events) {
          const result = parseOpenAIStreamEvent(event, streamAccumulator);
          if (result.textDelta) {
            roundText += result.textDelta;
            if (callbacks.onDelta) {
              callbacks.onDelta(result.textDelta);
            }
          }
          if (result.usage) {
            receivedUsage = true;
            if (result.usage.promptTokens !== undefined) promptTokens = result.usage.promptTokens;
            if (result.usage.completionTokens !== undefined) completionTokens = result.usage.completionTokens;
            if (result.usage.totalTokens !== undefined) totalTokens = result.usage.totalTokens;
            if (result.usage.cacheReadTokens !== undefined) cacheReadTokens = result.usage.cacheReadTokens;
          }
          if (result.finishReason) {
            finishReason = result.finishReason;
          }
          if (result.done) break;
        }
      } finally {
        // Preserve text already emitted via onDelta even if the stream errors mid-round
        accumulatedText += roundText;
      }
-      const data = JSON.parse(response.body);
+      const streamToolCalls = streamAccumulator.toolCalls;
      const choice = data.choices?.[0];
-      // Extract and emit token usage (OpenAI format)
+      // Emit token usage after stream completes (only when usage data was received)
-      if (data.usage && callbacks.onTokenUsage) {
+      if (callbacks.onTokenUsage && receivedUsage) {
-        const usage = data.usage;
+        const inputTokens = promptTokens - cacheReadTokens;
-        const cacheReadTokens = usage.prompt_tokens_details?.cached_tokens || 0;
+        const outputTokens = completionTokens;
        const inputTokens = (usage.prompt_tokens || 0) - cacheReadTokens;
        const outputTokens = usage.completion_tokens || 0;
        const totalTokens = usage.total_tokens || (usage.prompt_tokens || 0) + outputTokens;
        const prev = this.conversationUsage.get(conversationId) || {
          inputTokens: 0, outputTokens: 0, cacheReadTokens: 0, cacheWriteTokens: 0,
@@ -758,7 +873,9 @@ export class OpenCodeManager {
        this.conversationUsage.set(conversationId, cumulative);
        callbacks.onTokenUsage({
-          inputTokens, outputTokens, cacheReadTokens, cacheWriteTokens: 0, totalTokens,
+          inputTokens, outputTokens, cacheReadTokens,
          cacheWriteTokens: 0, // OpenAI streaming does not report cache write tokens
          totalTokens: totalTokens || inputTokens + outputTokens,
          cumulativeInputTokens: cumulative.inputTokens,
          cumulativeOutputTokens: cumulative.outputTokens,
          cumulativeCacheReadTokens: cumulative.cacheReadTokens,
@@ -767,66 +884,64 @@ export class OpenCodeManager {
        });
      }
-      console.log('[OpenCodeManager:OpenAI] Round', round, 'status:', response.statusCode, 'content type:', typeof choice?.message?.content, 'content length:', choice?.message?.content?.length, 'tool_calls:', choice?.message?.tool_calls?.length);
+      // Collect tool calls from stream accumulator
-
+      const parsedToolCalls: Array<{ id: string; name: string; args: unknown; parseError?: string }> = [];
-      if (!choice?.message) {
+      for (const [, tc] of streamToolCalls) {
-        throw new Error('API response missing expected message content');
+        try {
-      }
+          parsedToolCalls.push({ id: tc.id, name: tc.name, args: JSON.parse(tc.arguments) });
-
+        } catch (e) {
-      // Handle content that might be a string or an array of content parts
+          console.error(`[OpenCodeManager:OpenAI] Failed to parse tool arguments for ${tc.name}:`, tc.arguments);
-      let textContent = '';
+          parsedToolCalls.push({ id: tc.id, name: tc.name, args: {}, parseError: `Failed to parse tool arguments: ${(e as Error).message}` });
      const content = choice.message.content;
      if (typeof content === 'string') {
        textContent = content;
      } else if (Array.isArray(content)) {
        // Handle array of content parts (some models return this format)
        // Accept any part that has a text field, regardless of type
        textContent = content
          .filter((part: { type?: string; text?: string }) => part.text)
          .map((part: { text: string }) => part.text)
          .join('');
        // Log what types we're seeing for debugging
        const types = content.map((p: { type?: string }) => p.type).filter(Boolean);
        if (types.length > 0) {
          console.log('[OpenCodeManager:OpenAI] Content block types:', types);
        }
      } else if (content && typeof content === 'object') {
        // Handle single object with text field
        if ('text' in content && typeof content.text === 'string') {
          textContent = content.text;
        }
      }
-      if (textContent) {
+      console.log('[OpenCodeManager:OpenAI] Round', round, 'finishReason:', finishReason, 'text length:', accumulatedText.length, 'toolCalls:', parsedToolCalls.length);
        accumulatedText += textContent;
        if (callbacks.onDelta) {
          callbacks.onDelta(textContent);
        }
      }
      // If no tool calls, we're done
-      if (!choice.message.tool_calls || choice.message.tool_calls.length === 0) {
+      if (parsedToolCalls.length === 0 || finishReason !== 'tool_calls') {
        console.log('[OpenCodeManager:OpenAI] Done. Accumulated text length:', accumulatedText.length);
        return { content: accumulatedText, toolCalls: allToolCalls };
      }
-      // Add assistant message (with tool_calls) to conversation
+      // Build the assistant message with tool_calls for conversation history
-      messages.push(choice.message);
+      const assistantMessage: Record<string, unknown> = {
        role: 'assistant',
        content: roundText || null,
        tool_calls: parsedToolCalls.map((tc) => ({
          id: tc.id,
          type: 'function',
          function: { name: tc.name, arguments: JSON.stringify(tc.args) },
        })),
      };
      messages.push(assistantMessage);
      // Execute tool calls and add results
-      for (const toolCall of choice.message.tool_calls) {
+      for (const toolCall of parsedToolCalls) {
-        const toolName = toolCall.function.name;
+        const toolName = toolCall.name;
-        const toolArgs = JSON.parse(toolCall.function.arguments || '{}');
+        const toolArgs = toolCall.args;
        allToolCalls.push({ name: toolName, args: toolArgs });
        if (callbacks.onToolCall) {
          callbacks.onToolCall({ name: toolName, args: toolArgs });
        }
        // If JSON parsing of tool arguments failed, report the error to the model
        if (toolCall.parseError) {
          const errorResult = { error: true, message: toolCall.parseError };
          if (callbacks.onToolResult) {
            callbacks.onToolResult({ name: toolName, result: errorResult });
          }
          messages.push({
            role: 'tool',
            content: JSON.stringify(errorResult),
            tool_call_id: toolCall.id,
          });
          continue;
        }
        // Check if this is a render tool
        if (isRenderTool(toolName)) {
-          const a2uiMessages = generateFromToolCall(conversationId, toolName, toolArgs);
+          const a2uiMessages = generateFromToolCall(conversationId, toolName, toolArgs as Record<string, unknown>);
          if (a2uiMessages) {
            emitA2UIMessages(a2uiMessages);
          }
@@ -843,7 +958,9 @@ export class OpenCodeManager {
          continue;
        }
-        const result = await this.executeTool(toolName, toolArgs);
+        // Check abort before each tool execution
        if (signal.aborted) break;
        const result = await this.executeTool(toolName, toolArgs as Record<string, unknown>);
        if (callbacks.onToolResult) {
          callbacks.onToolResult({ name: toolName, result });
@@ -855,6 +972,8 @@ export class OpenCodeManager {
          tool_call_id: toolCall.id,
        });
      }
      if (signal.aborted) break;
    }
    // Hit max rounds
@@ -2251,7 +2370,7 @@ Respond with JSON only: {"title": "...", "alt": "...", "caption": "..."}`;
        options.signal.addEventListener('abort', () => {
          req.destroy();
          reject(new Error('Request cancelled'));
-        });
+        }, { once: true });
      }
      if (options.body) {
--- a/src/main/engine/streaming.ts
+++ b/src/main/engine/streaming.ts
@@ -0,0 +1,620 @@
 /**
 * SSE Streaming Infrastructure
 *
 * Provides SSE line parsing, event parsers for OpenAI/Mistral and Anthropic
 * stream formats, tool-call accumulation, and retry-with-exponential-backoff.
 *
 * Used by OpenCodeManager to convert buffered HTTP calls to real-time
 * token-by-token streaming for all chat providers.
 */
 import https from 'https';
 import http from 'http';
 import { URL } from 'url';
 // ── Types ──
 export interface SSEEvent {
  event?: string;
  data: string;
 }
 export interface StreamEventResult {
  /** Text content delta to emit to UI */
  textDelta?: string;
  /** Whether the stream is complete */
  done: boolean;
  /** Finish reason from the model */
  finishReason?: string;
  /** Token usage information */
  usage?: {
    promptTokens?: number;
    completionTokens?: number;
    totalTokens?: number;
    inputTokens?: number;
    outputTokens?: number;
    cacheReadTokens?: number;
    cacheWriteTokens?: number;
  };
 }
 interface ToolCallAccumulator {
  id: string;
  name: string;
  arguments: string;
 }
 export interface OpenAIStreamAccumulator {
  toolCalls: Map<number, ToolCallAccumulator>;
 }
 export interface AnthropicStreamAccumulator {
  toolCalls: Map<number, ToolCallAccumulator>;
  thinkingBlocks: Map<number, { text: string; signature?: string }>;
 }
 export interface HttpStreamError extends Error {
  statusCode?: number;
  retryAfter?: number;
  isAbort?: boolean;
 }
 // ── SSE Line Parsing ──
 /**
 * Parse raw SSE text into structured events.
 *
 * SSE protocol: events are separated by double-newlines (\n\n).
 * Each event can have `event:` and `data:` lines.
 * Multiple `data:` lines within one event are concatenated with newlines.
 * Lines starting with `:` are comments (ignored).
 *
 * Returns parsed events and any remaining incomplete text (buffer).
 */
 export function parseSSELines(text: string): { events: SSEEvent[]; remaining: string } {
  const events: SSEEvent[] = [];
  // Normalize \r\n to \n
  const normalized = text.replace(/\r\n/g, '\n');
  // Split on double-newline (event boundary)
  const parts = normalized.split('\n\n');
  // Last part may be incomplete (no trailing \n\n)
  const remaining = normalized.endsWith('\n\n') ? '' : parts.pop() || '';
  for (const part of parts) {
    if (!part.trim()) continue;
    let eventType: string | undefined;
    const dataLines: string[] = [];
    for (const line of part.split('\n')) {
      // Comment lines start with ':'
      if (line.startsWith(':')) continue;
      if (line.startsWith('event: ') || line.startsWith('event:')) {
        const afterColon = line.slice(line.indexOf(':') + 1);
        eventType = afterColon.startsWith(' ') ? afterColon.slice(1) : afterColon;
      } else if (line.startsWith('data: ') || line.startsWith('data:')) {
        const afterColon = line.slice(line.indexOf(':') + 1);
        dataLines.push(afterColon.startsWith(' ') ? afterColon.slice(1) : afterColon);
      }
    }
    if (dataLines.length > 0) {
      events.push({
        event: eventType,
        data: dataLines.join('\n'),
      });
    }
  }
  return { events, remaining };
 }
 // ── Accumulator Factories ──
 export function createOpenAIStreamAccumulator(): OpenAIStreamAccumulator {
  return { toolCalls: new Map() };
 }
 export function createAnthropicStreamAccumulator(): AnthropicStreamAccumulator {
  return { toolCalls: new Map(), thinkingBlocks: new Map() };
 }
 // ── OpenAI/Mistral SSE Parser ──
 /**
 * Parse a single OpenAI/Mistral SSE event and update the accumulator.
 *
 * OpenAI streaming format:
 * - Text deltas: choices[0].delta.content
 * - Tool call start: delta.tool_calls[i] with id + function.name
 * - Tool call fragments: delta.tool_calls[i].function.arguments (append)
 * - Finish reason: choices[0].finish_reason
 * - Usage: usage object in final chunk (requires stream_options.include_usage)
 * - [DONE] sentinel: stop iteration
 */
 export function parseOpenAIStreamEvent(
  event: SSEEvent,
  accumulator: OpenAIStreamAccumulator,
 ): StreamEventResult {
  // Handle [DONE] sentinel
  if (event.data === '[DONE]') {
    return { done: true };
  }
  let data: Record<string, unknown>;
  try {
    // eslint-disable-next-line @typescript-eslint/no-explicit-any
    data = JSON.parse(event.data) as any;
  } catch {
    // Skip corrupted SSE events (e.g. partial JSON from TCP split)
    return { done: false };
  }
  const choice = (data as any).choices?.[0];
  const result: StreamEventResult = { done: false };
  if (choice) {
    const delta = choice.delta;
    // Text content delta
    if (delta?.content && delta.content.length > 0) {
      result.textDelta = delta.content;
    }
    // Tool calls
    if (delta?.tool_calls) {
      for (const tc of delta.tool_calls) {
        const idx = tc.index;
        const existing = accumulator.toolCalls.get(idx);
        if (tc.id || tc.function?.name) {
          // New tool call or update
          if (!existing) {
            accumulator.toolCalls.set(idx, {
              id: tc.id || '',
              name: tc.function?.name || '',
              arguments: tc.function?.arguments || '',
            });
          } else {
            if (tc.id) existing.id = tc.id;
            if (tc.function?.name) existing.name = tc.function.name;
            if (tc.function?.arguments) existing.arguments += tc.function.arguments;
          }
        } else if (existing && tc.function?.arguments) {
          // Append argument fragment
          existing.arguments += tc.function.arguments;
        }
      }
    }
    // Finish reason
    if (choice.finish_reason) {
      result.finishReason = choice.finish_reason;
    }
  }
  // Token usage (arrives in final chunk with stream_options.include_usage)
  if ((data as any).usage) {
    const usage = (data as any).usage;
    const promptDetails = usage.prompt_tokens_details;
    result.usage = {
      promptTokens: usage.prompt_tokens,
      completionTokens: usage.completion_tokens,
      totalTokens: usage.total_tokens,
      cacheReadTokens: promptDetails?.cached_tokens,
    };
  }
  return result;
 }
 // ── Anthropic SSE Parser ──
 /**
 * Parse a single Anthropic SSE event and update the accumulator.
 *
 * Anthropic streaming format uses named event types:
 * - message_start: input token usage
 * - content_block_start: text, tool_use, or thinking block begins
 * - content_block_delta: text_delta, input_json_delta, or thinking_delta
 * - content_block_stop: block ends
 * - message_delta: output tokens + stop_reason
 * - message_stop: stream complete
 * - ping: keep-alive (ignored)
 * - error: server error mid-stream
 */
 export function parseAnthropicStreamEvent(
  event: SSEEvent,
  accumulator: AnthropicStreamAccumulator,
 ): StreamEventResult {
  let data: Record<string, unknown>;
  try {
    // eslint-disable-next-line @typescript-eslint/no-explicit-any
    data = JSON.parse(event.data) as any;
  } catch {
    // Skip corrupted SSE events (e.g. partial JSON from TCP split)
    return { done: false };
  }
  const result: StreamEventResult = { done: false };
  switch (event.event) {
    case 'message_start': {
      const usage = (data as any).message?.usage;
      if (usage) {
        result.usage = {
          inputTokens: usage.input_tokens || 0,
          cacheReadTokens: usage.cache_read_input_tokens || 0,
          cacheWriteTokens: usage.cache_creation_input_tokens || 0,
        };
      }
      break;
    }
    case 'content_block_start': {
      const block = (data as any).content_block;
      if (block?.type === 'tool_use') {
        accumulator.toolCalls.set(data.index as number, {
          id: block.id,
          name: block.name,
          arguments: '',
        });
      } else if (block?.type === 'thinking') {
        accumulator.thinkingBlocks.set(data.index as number, { text: '' });
      }
      // text block start is a no-op (empty initial text)
      break;
    }
    case 'content_block_delta': {
      const delta = (data as any).delta;
      if (delta?.type === 'text_delta' && delta.text) {
        result.textDelta = delta.text;
      } else if (delta?.type === 'input_json_delta' && delta.partial_json) {
        const tc = accumulator.toolCalls.get(data.index as number);
        if (tc) {
          tc.arguments += delta.partial_json;
        }
      } else if (delta?.type === 'thinking_delta' && delta.thinking) {
        const tb = accumulator.thinkingBlocks.get(data.index as number);
        if (tb) {
          tb.text += delta.thinking;
        }
      }
      break;
    }
    case 'content_block_stop': {
      // Block is complete. Tool arguments can now be parsed by the caller.
      // For thinking blocks, capture the signature (required by Anthropic when replaying thinking blocks).
      const stopBlock = (data as any).content_block;
      if (stopBlock?.type === 'thinking' && stopBlock.signature) {
        const tb = accumulator.thinkingBlocks.get(data.index as number);
        if (tb) {
          tb.signature = stopBlock.signature;
        }
      }
      break;
    }
    case 'message_delta': {
      if ((data as any).usage) {
        result.usage = {
          outputTokens: (data as any).usage.output_tokens || 0,
        };
      }
      if ((data as any).delta?.stop_reason) {
        result.finishReason = (data as any).delta.stop_reason;
      }
      break;
    }
    case 'message_stop':
      result.done = true;
      break;
    case 'ping':
      // Keep-alive, ignore
      break;
    case 'error': {
      const errorMsg = (data as any).error?.message || 'Unknown streaming error';
      throw new Error(errorMsg);
    }
    default:
      // Unknown event type, ignore
      break;
  }
  return result;
 }
 // ── Retry with Exponential Backoff ──
 const RETRYABLE_STATUS_CODES = new Set([429, 502, 503]);
 /**
 * Retry a function with exponential backoff for transient HTTP errors.
 *
 * Retries on 429 (rate limit), 502 (bad gateway), 503 (service unavailable).
 * Also retries errors without a statusCode (e.g. ECONNRESET, EPIPE) since
 * these indicate transient network failures during connection.
 *
 * Does NOT retry on other 4xx errors (400, 401, 403 — client errors) or abort.
 * Respects Retry-After header for 429 responses.
 *
 * Best practice: wrap only the HTTP connection (httpRequestStream) in withRetry,
 * NOT the event processing loop. This ensures onDelta callbacks are never
 * called twice for the same text on retry.
 */
 export async function withRetry<T>(
  fn: () => Promise<T>,
  options: { maxRetries?: number; onRetry?: (attempt: number, error: Error) => void; signal?: AbortSignal } = {},
 ): Promise<T> {
  const maxRetries = options.maxRetries ?? 3;
  let lastError: Error | undefined;
  for (let attempt = 0; attempt <= maxRetries; attempt++) {
    try {
      return await fn();
    } catch (error) {
      lastError = error as Error;
      const httpError = error as HttpStreamError;
      // Don't retry on abort
      if (httpError.isAbort || httpError.message === 'Request cancelled') {
        throw error;
      }
      // Check signal before retrying
      if (options.signal?.aborted) {
        const abortError: HttpStreamError = new Error('Request cancelled') as HttpStreamError;
        abortError.isAbort = true;
        throw abortError;
      }
      // Don't retry on non-retryable status codes
      if (httpError.statusCode && !RETRYABLE_STATUS_CODES.has(httpError.statusCode)) {
        throw error;
      }
      // Don't retry if we've exhausted retries
      if (attempt >= maxRetries) {
        throw error;
      }
      // Calculate delay with exponential backoff and jitter
      const baseDelay = Math.pow(2, attempt) * 1000; // 1s, 2s, 4s
      const jitter = Math.random() * 500;
      let delay = baseDelay + jitter;
      // Respect Retry-After header for 429
      if (httpError.retryAfter && httpError.retryAfter > 0) {
        delay = Math.max(delay, httpError.retryAfter * 1000);
      }
      if (options.onRetry) {
        options.onRetry(attempt + 1, lastError);
      }
      // Abort-aware delay: reject immediately if signal fires during wait
      await new Promise<void>((resolve, reject) => {
        const timer = setTimeout(resolve, delay);
        if (options.signal) {
          const onAbort = () => {
            clearTimeout(timer);
            const abortError: HttpStreamError = new Error('Request cancelled') as HttpStreamError;
            abortError.isAbort = true;
            reject(abortError);
          };
          if (options.signal.aborted) {
            clearTimeout(timer);
            const abortError: HttpStreamError = new Error('Request cancelled') as HttpStreamError;
            abortError.isAbort = true;
            reject(abortError);
            return;
          }
          options.signal.addEventListener('abort', onAbort, { once: true });
        }
      });
    }
  }
  throw lastError;
 }
 // ── HTTP Streaming Request ──
 interface HttpStreamOptions {
  method?: string;
  headers?: Record<string, string>;
  body?: string;
  signal?: AbortSignal;
  timeout?: number;
 }
 /**
 * Make an HTTP request that returns an async iterable of SSE events.
 *
 * Uses Node.js http/https modules directly, reading the response
 * as a readable stream and parsing SSE events incrementally.
 *
 * On non-2xx status: collects the error body and throws.
 * Supports AbortSignal for cancellation.
 */
 export function httpRequestStream(
  urlStr: string,
  options: HttpStreamOptions,
 ): Promise<{
  statusCode: number;
  events: AsyncIterable<SSEEvent>;
 }> {
  return new Promise((resolve, reject) => {
    const url = new URL(urlStr);
    const protocol = url.protocol === 'https:' ? https : http;
    const timeout = options.timeout ?? 120000;
    const req = protocol.request(url, {
      method: options.method || 'POST',
      headers: options.headers || {},
      timeout,
    }, (res) => {
      const statusCode = res.statusCode || 0;
      // Non-2xx: collect error body and throw
      if (statusCode < 200 || statusCode >= 300) {
        let errorBody = '';
        res.on('data', (chunk: Buffer) => { errorBody += chunk; });
        res.on('end', () => {
          const error: HttpStreamError = new Error(`API error: ${statusCode}`) as HttpStreamError;
          error.statusCode = statusCode;
          // Parse Retry-After for 429
          if (statusCode === 429) {
            const retryAfter = res.headers['retry-after'];
            if (retryAfter) {
              const seconds = parseInt(retryAfter, 10);
              if (!isNaN(seconds)) {
                error.retryAfter = seconds;
              }
            }
          }
          // Try to extract a better error message
          try {
            const parsed = JSON.parse(errorBody);
            error.message = parsed.error?.message || parsed.message || error.message;
          } catch {
            if (errorBody.length > 0) {
              error.message = `${error.message}: ${errorBody.slice(0, 200)}`;
            }
          }
          reject(error);
        });
        return;
      }
      // 2xx: create async iterable of SSE events
      const events: AsyncIterable<SSEEvent> = {
        [Symbol.asyncIterator]() {
          let buffer = '';
          let done = false;
          let pendingError: Error | null = null;
          const eventQueue: SSEEvent[] = [];
          let resolveNext: ((value: IteratorResult<SSEEvent>) => void) | null = null;
          let rejectNext: ((error: Error) => void) | null = null;
          res.on('data', (chunk: Buffer) => {
            buffer += chunk.toString('utf-8');
            const { events: parsed, remaining } = parseSSELines(buffer);
            buffer = remaining;
            for (const event of parsed) {
              if (resolveNext) {
                const resolve = resolveNext;
                resolveNext = null;
                rejectNext = null;
                resolve({ value: event, done: false });
              } else {
                eventQueue.push(event);
              }
            }
          });
          res.on('end', () => {
            done = true;
            if (resolveNext) {
              const resolve = resolveNext;
              resolveNext = null;
              rejectNext = null;
              resolve({ value: undefined as unknown as SSEEvent, done: true });
            }
          });
          res.on('error', (err: Error) => {
            done = true;
            if (rejectNext) {
              const reject = rejectNext;
              resolveNext = null;
              rejectNext = null;
              reject(err);
            } else {
              // Store error for next .next() call so it's not silently swallowed
              pendingError = err;
            }
          });
          return {
            next(): Promise<IteratorResult<SSEEvent>> {
              // Return queued event immediately
              if (eventQueue.length > 0) {
                return Promise.resolve({ value: eventQueue.shift()!, done: false });
              }
              // Throw stored error from a previous event that fired with no consumer waiting
              if (pendingError) {
                const err = pendingError;
                pendingError = null;
                return Promise.reject(err);
              }
              // Stream already ended
              if (done) {
                return Promise.resolve({ value: undefined as unknown as SSEEvent, done: true });
              }
              // Wait for next event
              return new Promise<IteratorResult<SSEEvent>>((resolve, reject) => {
                resolveNext = resolve;
                rejectNext = reject;
              });
            },
            return(): Promise<IteratorResult<SSEEvent>> {
              // Called when for-await-of exits early (break, return, throw).
              // Destroy the response stream to free the socket immediately.
              done = true;
              res.destroy();
              return Promise.resolve({ value: undefined as unknown as SSEEvent, done: true });
            },
          };
        },
      };
      resolve({ statusCode, events });
    });
    req.on('error', (err: Error) => {
      const error: HttpStreamError = err as HttpStreamError;
      if (options.signal?.aborted) {
        error.isAbort = true;
      }
      reject(error);
    });
    req.on('timeout', () => {
      req.destroy();
      reject(new Error('Request timed out'));
    });
    if (options.signal) {
      if (options.signal.aborted) {
        req.destroy();
        const error: HttpStreamError = new Error('Request cancelled') as HttpStreamError;
        error.isAbort = true;
        reject(error);
        return;
      }
      options.signal.addEventListener('abort', () => {
        req.destroy();
      }, { once: true });
    }
    if (options.body) {
      req.write(options.body);
    }
    req.end();
  });
 }
--- a/tests/engine/streaming.test.ts
+++ b/tests/engine/streaming.test.ts