feat: SSE streaming for chat providers

2026-03-01 10:10:54 +01:00
parent 5938aa9642
commit 78c2cb7bb7
3 changed files with 1417 additions and 104 deletions
--- a/src/main/engine/OpenCodeManager.ts
+++ b/src/main/engine/OpenCodeManager.ts
@@ -12,6 +12,16 @@ import https from 'https';
 import http from 'http';
 import { URL } from 'url';
 import { BrowserWindow } from 'electron';
+import {
+  parseSSELines,
+  parseAnthropicStreamEvent,
+  parseOpenAIStreamEvent,
+  createAnthropicStreamAccumulator,
+  createOpenAIStreamAccumulator,
+  httpRequestStream,
+  withRetry,
+  type HttpStreamError,
+} from './streaming';
 import { ChatEngine } from './ChatEngine';
 import { PostEngine, type PostData } from './PostEngine';
 import { MediaEngine, type MediaData } from './MediaEngine';
@@ -470,10 +480,20 @@ export class OpenCodeManager {
        system: systemPrompt,
        messages,
        tools,
+        stream: true,
        cache_control: { type: 'ephemeral' },
      };

-      const response = await this.httpRequest(ZEN_ANTHROPIC_URL, {
+      // Stream the response with retry for transient errors
+      const streamAccumulator = createAnthropicStreamAccumulator();
+      let stopReason = '';
+      let inputTokens = 0;
+      let outputTokens = 0;
+      let cacheReadTokens = 0;
+      let cacheWriteTokens = 0;
+      let roundText = '';  // Text produced in this round only
+
+      const { events } = await withRetry(() => httpRequestStream(ZEN_ANTHROPIC_URL, {
        method: 'POST',
        headers: {
          'Content-Type': 'application/json',
@@ -483,29 +503,43 @@ export class OpenCodeManager {
        },
        body: JSON.stringify(body),
        signal,
-      });
+      }));

-      if (response.statusCode >= 400) {
-        const errorMsg = this.parseErrorResponse(response);
-        throw new Error(errorMsg);
+      for await (const event of events) {
+        const result = parseAnthropicStreamEvent(event, streamAccumulator);
+
+        // Emit text deltas immediately for real-time streaming
+        if (result.textDelta) {
+          accumulatedText += result.textDelta;
+          roundText += result.textDelta;
+          if (callbacks.onDelta) {
+            callbacks.onDelta(result.textDelta);
+          }
+        }
+
+        // Collect usage from message_start (input tokens) and message_delta (output tokens)
+        if (result.usage) {
+          if (result.usage.inputTokens !== undefined) inputTokens = result.usage.inputTokens;
+          if (result.usage.cacheReadTokens !== undefined) cacheReadTokens = result.usage.cacheReadTokens;
+          if (result.usage.cacheWriteTokens !== undefined) cacheWriteTokens = result.usage.cacheWriteTokens;
+          if (result.usage.outputTokens !== undefined) outputTokens = result.usage.outputTokens;
+        }
+
+        if (result.finishReason) {
+          stopReason = result.finishReason;
+        }
      }

-      const data = JSON.parse(response.body);
-
-      // Extract and emit token usage
-      if (data.usage && callbacks.onTokenUsage) {
-        const usage = data.usage;
-        const cacheReadTokens = usage.cache_read_input_tokens || 0;
-        const cacheWriteTokens = usage.cache_creation_input_tokens || 0;
-        const inputTokens = (usage.input_tokens || 0) - cacheReadTokens - cacheWriteTokens;
-        const outputTokens = usage.output_tokens || 0;
-        const totalTokens = (usage.input_tokens || 0) + outputTokens;
+      // Emit token usage after stream completes
+      if (callbacks.onTokenUsage) {
+        const adjustedInputTokens = inputTokens - cacheReadTokens - cacheWriteTokens;
+        const totalTokens = inputTokens + outputTokens;

        const prev = this.conversationUsage.get(conversationId) || {
          inputTokens: 0, outputTokens: 0, cacheReadTokens: 0, cacheWriteTokens: 0,
        };
        const cumulative = {
-          inputTokens: prev.inputTokens + inputTokens,
+          inputTokens: prev.inputTokens + adjustedInputTokens,
          outputTokens: prev.outputTokens + outputTokens,
          cacheReadTokens: prev.cacheReadTokens + cacheReadTokens,
          cacheWriteTokens: prev.cacheWriteTokens + cacheWriteTokens,
@@ -513,7 +547,7 @@ export class OpenCodeManager {
        this.conversationUsage.set(conversationId, cumulative);

        callbacks.onTokenUsage({
-          inputTokens, outputTokens, cacheReadTokens, cacheWriteTokens, totalTokens,
+          inputTokens: adjustedInputTokens, outputTokens, cacheReadTokens, cacheWriteTokens, totalTokens,
          cumulativeInputTokens: cumulative.inputTokens,
          cumulativeOutputTokens: cumulative.outputTokens,
          cumulativeCacheReadTokens: cumulative.cacheReadTokens,
@@ -522,35 +556,19 @@ export class OpenCodeManager {
        });
      }

-      console.log('[OpenCodeManager] Round', round, 'stop_reason:', data.stop_reason, 'content blocks:', JSON.stringify(data.content?.map((b: AnthropicContentBlock) => ({ type: b.type, textLen: b.text?.length, name: b.name }))));
-
-      if (!data.content) {
-        throw new Error('API response missing content field');
-      }
-
-      // Check if there are tool_use blocks
-      const toolUseBlocks = (data.content as AnthropicContentBlock[]).filter(
-        (b: AnthropicContentBlock) => b.type === 'tool_use'
-      );
-      
-      // Capture text from any block type that has a text field (text, thinking, etc.)
-      const textBlocks = (data.content as AnthropicContentBlock[]).filter(
-        (b: AnthropicContentBlock) => b.text
-      );
-
-      // Accumulate and stream text content to frontend
-      for (const block of textBlocks) {
-        if (block.text) {
-          accumulatedText += block.text;
-          if (callbacks.onDelta) {
-            callbacks.onDelta(block.text);
-          }
+      // Collect tool calls from stream accumulator
+      const toolUseBlocks: Array<{ id: string; name: string; input: unknown }> = [];
+      for (const [, tc] of streamAccumulator.toolCalls) {
+        try {
+          toolUseBlocks.push({ id: tc.id, name: tc.name, input: JSON.parse(tc.arguments) });
+        } catch {
+          toolUseBlocks.push({ id: tc.id, name: tc.name, input: {} });
        }
      }

-      console.log('[OpenCodeManager] Round', round, 'accumulatedText length:', accumulatedText.length, 'toolUseBlocks:', toolUseBlocks.length);
+      console.log('[OpenCodeManager] Round', round, 'stopReason:', stopReason, 'accumulatedText length:', accumulatedText.length, 'toolCalls:', toolUseBlocks.length);

-      if (toolUseBlocks.length === 0 || data.stop_reason !== 'tool_use') {
+      if (toolUseBlocks.length === 0 || stopReason !== 'tool_use') {
        // No more tool calls - return all accumulated text
        console.log('[OpenCodeManager] Returning accumulated text length:', accumulatedText.length);
        return { content: accumulatedText, toolCalls: allToolCalls };
@@ -558,11 +576,26 @@ export class OpenCodeManager {

      // Execute tool calls
      const toolResults: AnthropicContentBlock[] = [];
+      // Build assistant content blocks for the next message round
+      const assistantContentBlocks: AnthropicContentBlock[] = [];
+
+      // Add text block with text from this round
+      if (roundText) {
+        assistantContentBlocks.push({ type: 'text', text: roundText });
+      }

      for (const toolBlock of toolUseBlocks) {
-        const toolName = toolBlock.name!;
+        const toolName = toolBlock.name;
        const toolArgs = toolBlock.input;
-        const toolUseId = toolBlock.id!;
+        const toolUseId = toolBlock.id;
+
+        // Add tool_use block to assistant content
+        assistantContentBlocks.push({
+          type: 'tool_use',
+          id: toolUseId,
+          name: toolName,
+          input: toolArgs,
+        });

        allToolCalls.push({ name: toolName, args: toolArgs });

@@ -643,7 +676,7 @@ export class OpenCodeManager {
      // Add assistant response and tool results to messages for next round
      messages = [
        ...messages,
-        { role: 'assistant' as const, content: data.content },
+        { role: 'assistant' as const, content: assistantContentBlocks },
        { role: 'user' as const, content: toolResults },
      ];
    }
@@ -718,9 +751,18 @@ export class OpenCodeManager {
        max_tokens: 4096,
        messages,
        tools: openaiTools,
+        stream: true,
+        stream_options: { include_usage: true },
      };

-      const response = await this.httpRequest(ZEN_OPENAI_URL, {
+      // Stream the response with retry for transient errors
+      const streamAccumulator = createOpenAIStreamAccumulator();
+      let finishReason = '';
+      let promptTokens = 0;
+      let completionTokens = 0;
+      let totalTokens = 0;
+
+      const { events } = await withRetry(() => httpRequestStream(ZEN_OPENAI_URL, {
        method: 'POST',
        headers: {
          'Content-Type': 'application/json',
@@ -728,23 +770,38 @@ export class OpenCodeManager {
        },
        body: JSON.stringify(body),
        signal,
-      });
+      }));

-      if (response.statusCode >= 400) {
-        const errorMsg = this.parseErrorResponse(response);
-        throw new Error(errorMsg);
+      for await (const event of events) {
+        const result = parseOpenAIStreamEvent(event, streamAccumulator);
+
+        // Emit text deltas immediately for real-time streaming
+        if (result.textDelta) {
+          accumulatedText += result.textDelta;
+          if (callbacks.onDelta) {
+            callbacks.onDelta(result.textDelta);
+          }
+        }
+
+        // Collect usage from final chunk
+        if (result.usage) {
+          if (result.usage.promptTokens !== undefined) promptTokens = result.usage.promptTokens;
+          if (result.usage.completionTokens !== undefined) completionTokens = result.usage.completionTokens;
+          if (result.usage.totalTokens !== undefined) totalTokens = result.usage.totalTokens;
+        }
+
+        if (result.finishReason) {
+          finishReason = result.finishReason;
+        }
+
+        if (result.done) break;
      }

-      const data = JSON.parse(response.body);
-      const choice = data.choices?.[0];
-
-      // Extract and emit token usage (OpenAI format)
-      if (data.usage && callbacks.onTokenUsage) {
-        const usage = data.usage;
-        const cacheReadTokens = usage.prompt_tokens_details?.cached_tokens || 0;
-        const inputTokens = (usage.prompt_tokens || 0) - cacheReadTokens;
-        const outputTokens = usage.completion_tokens || 0;
-        const totalTokens = usage.total_tokens || (usage.prompt_tokens || 0) + outputTokens;
+      // Emit token usage after stream completes
+      if (callbacks.onTokenUsage) {
+        const cacheReadTokens = 0; // OpenAI doesn't provide cache info in streaming
+        const inputTokens = promptTokens;
+        const outputTokens = completionTokens;

        const prev = this.conversationUsage.get(conversationId) || {
          inputTokens: 0, outputTokens: 0, cacheReadTokens: 0, cacheWriteTokens: 0,
@@ -758,7 +815,8 @@ export class OpenCodeManager {
        this.conversationUsage.set(conversationId, cumulative);

        callbacks.onTokenUsage({
-          inputTokens, outputTokens, cacheReadTokens, cacheWriteTokens: 0, totalTokens,
+          inputTokens, outputTokens, cacheReadTokens, cacheWriteTokens: 0,
+          totalTokens: totalTokens || inputTokens + outputTokens,
          cumulativeInputTokens: cumulative.inputTokens,
          cumulativeOutputTokens: cumulative.outputTokens,
          cumulativeCacheReadTokens: cumulative.cacheReadTokens,
@@ -767,57 +825,40 @@ export class OpenCodeManager {
        });
      }

-      console.log('[OpenCodeManager:OpenAI] Round', round, 'status:', response.statusCode, 'content type:', typeof choice?.message?.content, 'content length:', choice?.message?.content?.length, 'tool_calls:', choice?.message?.tool_calls?.length);
-
-      if (!choice?.message) {
-        throw new Error('API response missing expected message content');
+      // Collect tool calls from stream accumulator
+      const parsedToolCalls: Array<{ id: string; name: string; args: unknown }> = [];
+      for (const [, tc] of streamAccumulator.toolCalls) {
+        try {
+          parsedToolCalls.push({ id: tc.id, name: tc.name, args: JSON.parse(tc.arguments) });
+        } catch {
+          parsedToolCalls.push({ id: tc.id, name: tc.name, args: {} });
+        }
      }

-      // Handle content that might be a string or an array of content parts
-      let textContent = '';
-      const content = choice.message.content;
-      if (typeof content === 'string') {
-        textContent = content;
-      } else if (Array.isArray(content)) {
-        // Handle array of content parts (some models return this format)
-        // Accept any part that has a text field, regardless of type
-        textContent = content
-          .filter((part: { type?: string; text?: string }) => part.text)
-          .map((part: { text: string }) => part.text)
-          .join('');
-        
-        // Log what types we're seeing for debugging
-        const types = content.map((p: { type?: string }) => p.type).filter(Boolean);
-        if (types.length > 0) {
-          console.log('[OpenCodeManager:OpenAI] Content block types:', types);
-        }
-      } else if (content && typeof content === 'object') {
-        // Handle single object with text field
-        if ('text' in content && typeof content.text === 'string') {
-          textContent = content.text;
-        }
-      }
-      
-      if (textContent) {
-        accumulatedText += textContent;
-        if (callbacks.onDelta) {
-          callbacks.onDelta(textContent);
-        }
-      }
+      console.log('[OpenCodeManager:OpenAI] Round', round, 'finishReason:', finishReason, 'text length:', accumulatedText.length, 'toolCalls:', parsedToolCalls.length);

      // If no tool calls, we're done
-      if (!choice.message.tool_calls || choice.message.tool_calls.length === 0) {
+      if (parsedToolCalls.length === 0 || finishReason !== 'tool_calls') {
        console.log('[OpenCodeManager:OpenAI] Done. Accumulated text length:', accumulatedText.length);
        return { content: accumulatedText, toolCalls: allToolCalls };
      }

-      // Add assistant message (with tool_calls) to conversation
-      messages.push(choice.message);
+      // Build the assistant message with tool_calls for conversation history
+      const assistantMessage: Record<string, unknown> = {
+        role: 'assistant',
+        content: accumulatedText || null,
+        tool_calls: parsedToolCalls.map((tc) => ({
+          id: tc.id,
+          type: 'function',
+          function: { name: tc.name, arguments: JSON.stringify(tc.args) },
+        })),
+      };
+      messages.push(assistantMessage);

      // Execute tool calls and add results
-      for (const toolCall of choice.message.tool_calls) {
-        const toolName = toolCall.function.name;
-        const toolArgs = JSON.parse(toolCall.function.arguments || '{}');
+      for (const toolCall of parsedToolCalls) {
+        const toolName = toolCall.name;
+        const toolArgs = toolCall.args;

        allToolCalls.push({ name: toolName, args: toolArgs });
        if (callbacks.onToolCall) {
@@ -826,7 +867,7 @@ export class OpenCodeManager {

        // Check if this is a render tool
        if (isRenderTool(toolName)) {
-          const a2uiMessages = generateFromToolCall(conversationId, toolName, toolArgs);
+          const a2uiMessages = generateFromToolCall(conversationId, toolName, toolArgs as Record<string, unknown>);
          if (a2uiMessages) {
            emitA2UIMessages(a2uiMessages);
          }
@@ -843,7 +884,7 @@ export class OpenCodeManager {
          continue;
        }

-        const result = await this.executeTool(toolName, toolArgs);
+        const result = await this.executeTool(toolName, toolArgs as Record<string, unknown>);

        if (callbacks.onToolResult) {
          callbacks.onToolResult({ name: toolName, result });
--- a/src/main/engine/streaming.ts
+++ b/src/main/engine/streaming.ts
@@ -0,0 +1,529 @@
+/**
+ * SSE Streaming Infrastructure
+ *
+ * Provides SSE line parsing, event parsers for OpenAI/Mistral and Anthropic
+ * stream formats, tool-call accumulation, and retry-with-exponential-backoff.
+ *
+ * Used by OpenCodeManager to convert buffered HTTP calls to real-time
+ * token-by-token streaming for all chat providers.
+ */
+
+import https from 'https';
+import http from 'http';
+import { URL } from 'url';
+
+// ── Types ──
+
+export interface SSEEvent {
+  event?: string;
+  data: string;
+}
+
+export interface StreamEventResult {
+  /** Text content delta to emit to UI */
+  textDelta?: string;
+  /** Whether the stream is complete */
+  done: boolean;
+  /** Finish reason from the model */
+  finishReason?: string;
+  /** Token usage information */
+  usage?: {
+    promptTokens?: number;
+    completionTokens?: number;
+    totalTokens?: number;
+    inputTokens?: number;
+    outputTokens?: number;
+    cacheReadTokens?: number;
+    cacheWriteTokens?: number;
+  };
+}
+
+interface ToolCallAccumulator {
+  id: string;
+  name: string;
+  arguments: string;
+}
+
+export interface OpenAIStreamAccumulator {
+  toolCalls: Map<number, ToolCallAccumulator>;
+}
+
+export interface AnthropicStreamAccumulator {
+  toolCalls: Map<number, ToolCallAccumulator>;
+}
+
+export interface HttpStreamError extends Error {
+  statusCode?: number;
+  retryAfter?: number;
+  isAbort?: boolean;
+}
+
+// ── SSE Line Parsing ──
+
+/**
+ * Parse raw SSE text into structured events.
+ *
+ * SSE protocol: events are separated by double-newlines (\n\n).
+ * Each event can have `event:` and `data:` lines.
+ * Multiple `data:` lines within one event are concatenated with newlines.
+ * Lines starting with `:` are comments (ignored).
+ *
+ * Returns parsed events and any remaining incomplete text (buffer).
+ */
+export function parseSSELines(text: string): { events: SSEEvent[]; remaining: string } {
+  const events: SSEEvent[] = [];
+
+  // Normalize \r\n to \n
+  const normalized = text.replace(/\r\n/g, '\n');
+
+  // Split on double-newline (event boundary)
+  const parts = normalized.split('\n\n');
+
+  // Last part may be incomplete (no trailing \n\n)
+  const remaining = normalized.endsWith('\n\n') ? '' : parts.pop() || '';
+
+  for (const part of parts) {
+    if (!part.trim()) continue;
+
+    let eventType: string | undefined;
+    const dataLines: string[] = [];
+
+    for (const line of part.split('\n')) {
+      // Comment lines start with ':'
+      if (line.startsWith(':')) continue;
+
+      if (line.startsWith('event: ') || line.startsWith('event:')) {
+        eventType = line.slice(line.indexOf(':') + 1).trim();
+      } else if (line.startsWith('data: ') || line.startsWith('data:')) {
+        dataLines.push(line.slice(line.indexOf(':') + 1).trimStart());
+      }
+    }
+
+    if (dataLines.length > 0) {
+      events.push({
+        event: eventType,
+        data: dataLines.join('\n'),
+      });
+    }
+  }
+
+  return { events, remaining };
+}
+
+// ── Accumulator Factories ──
+
+export function createOpenAIStreamAccumulator(): OpenAIStreamAccumulator {
+  return { toolCalls: new Map() };
+}
+
+export function createAnthropicStreamAccumulator(): AnthropicStreamAccumulator {
+  return { toolCalls: new Map() };
+}
+
+// ── OpenAI/Mistral SSE Parser ──
+
+/**
+ * Parse a single OpenAI/Mistral SSE event and update the accumulator.
+ *
+ * OpenAI streaming format:
+ * - Text deltas: choices[0].delta.content
+ * - Tool call start: delta.tool_calls[i] with id + function.name
+ * - Tool call fragments: delta.tool_calls[i].function.arguments (append)
+ * - Finish reason: choices[0].finish_reason
+ * - Usage: usage object in final chunk (requires stream_options.include_usage)
+ * - [DONE] sentinel: stop iteration
+ */
+export function parseOpenAIStreamEvent(
+  event: SSEEvent,
+  accumulator: OpenAIStreamAccumulator,
+): StreamEventResult {
+  // Handle [DONE] sentinel
+  if (event.data === '[DONE]') {
+    return { done: true };
+  }
+
+  const data = JSON.parse(event.data);
+  const choice = data.choices?.[0];
+  const result: StreamEventResult = { done: false };
+
+  if (choice) {
+    const delta = choice.delta;
+
+    // Text content delta
+    if (delta?.content && delta.content.length > 0) {
+      result.textDelta = delta.content;
+    }
+
+    // Tool calls
+    if (delta?.tool_calls) {
+      for (const tc of delta.tool_calls) {
+        const idx = tc.index;
+        const existing = accumulator.toolCalls.get(idx);
+
+        if (tc.id || tc.function?.name) {
+          // New tool call or update
+          if (!existing) {
+            accumulator.toolCalls.set(idx, {
+              id: tc.id || '',
+              name: tc.function?.name || '',
+              arguments: tc.function?.arguments || '',
+            });
+          } else {
+            if (tc.id) existing.id = tc.id;
+            if (tc.function?.name) existing.name = tc.function.name;
+            if (tc.function?.arguments) existing.arguments += tc.function.arguments;
+          }
+        } else if (existing && tc.function?.arguments) {
+          // Append argument fragment
+          existing.arguments += tc.function.arguments;
+        }
+      }
+    }
+
+    // Finish reason
+    if (choice.finish_reason) {
+      result.finishReason = choice.finish_reason;
+    }
+  }
+
+  // Token usage (arrives in final chunk with stream_options.include_usage)
+  if (data.usage) {
+    result.usage = {
+      promptTokens: data.usage.prompt_tokens,
+      completionTokens: data.usage.completion_tokens,
+      totalTokens: data.usage.total_tokens,
+    };
+  }
+
+  return result;
+}
+
+// ── Anthropic SSE Parser ──
+
+/**
+ * Parse a single Anthropic SSE event and update the accumulator.
+ *
+ * Anthropic streaming format uses named event types:
+ * - message_start: input token usage
+ * - content_block_start: text or tool_use block begins
+ * - content_block_delta: text_delta or input_json_delta
+ * - content_block_stop: block ends
+ * - message_delta: output tokens + stop_reason
+ * - message_stop: stream complete
+ * - ping: keep-alive (ignored)
+ * - error: server error mid-stream
+ */
+export function parseAnthropicStreamEvent(
+  event: SSEEvent,
+  accumulator: AnthropicStreamAccumulator,
+): StreamEventResult {
+  const data = JSON.parse(event.data);
+  const result: StreamEventResult = { done: false };
+
+  switch (event.event) {
+    case 'message_start': {
+      const usage = data.message?.usage;
+      if (usage) {
+        result.usage = {
+          inputTokens: usage.input_tokens || 0,
+          cacheReadTokens: usage.cache_read_input_tokens || 0,
+          cacheWriteTokens: usage.cache_creation_input_tokens || 0,
+        };
+      }
+      break;
+    }
+
+    case 'content_block_start': {
+      const block = data.content_block;
+      if (block?.type === 'tool_use') {
+        accumulator.toolCalls.set(data.index, {
+          id: block.id,
+          name: block.name,
+          arguments: '',
+        });
+      }
+      // text block start is a no-op (empty initial text)
+      break;
+    }
+
+    case 'content_block_delta': {
+      const delta = data.delta;
+      if (delta?.type === 'text_delta' && delta.text) {
+        result.textDelta = delta.text;
+      } else if (delta?.type === 'input_json_delta' && delta.partial_json) {
+        const tc = accumulator.toolCalls.get(data.index);
+        if (tc) {
+          tc.arguments += delta.partial_json;
+        }
+      }
+      break;
+    }
+
+    case 'content_block_stop':
+      // Block is complete. Tool arguments can now be parsed by the caller.
+      break;
+
+    case 'message_delta': {
+      if (data.usage) {
+        result.usage = {
+          outputTokens: data.usage.output_tokens || 0,
+        };
+      }
+      if (data.delta?.stop_reason) {
+        result.finishReason = data.delta.stop_reason;
+      }
+      break;
+    }
+
+    case 'message_stop':
+      result.done = true;
+      break;
+
+    case 'ping':
+      // Keep-alive, ignore
+      break;
+
+    case 'error': {
+      const errorMsg = data.error?.message || 'Unknown streaming error';
+      throw new Error(errorMsg);
+    }
+
+    default:
+      // Unknown event type, ignore
+      break;
+  }
+
+  return result;
+}
+
+// ── Retry with Exponential Backoff ──
+
+const RETRYABLE_STATUS_CODES = new Set([429, 502, 503]);
+
+/**
+ * Retry a function with exponential backoff for transient HTTP errors.
+ *
+ * Retries on 429 (rate limit), 502 (bad gateway), 503 (service unavailable).
+ * Does NOT retry on other 4xx errors or abort.
+ * Respects Retry-After header for 429 responses.
+ */
+export async function withRetry<T>(
+  fn: () => Promise<T>,
+  options: { maxRetries?: number } = {},
+): Promise<T> {
+  const maxRetries = options.maxRetries ?? 3;
+  let lastError: Error | undefined;
+
+  for (let attempt = 0; attempt <= maxRetries; attempt++) {
+    try {
+      return await fn();
+    } catch (error) {
+      lastError = error as Error;
+      const httpError = error as HttpStreamError;
+
+      // Don't retry on abort
+      if (httpError.isAbort || httpError.message === 'Request cancelled') {
+        throw error;
+      }
+
+      // Don't retry on non-retryable status codes
+      if (httpError.statusCode && !RETRYABLE_STATUS_CODES.has(httpError.statusCode)) {
+        throw error;
+      }
+
+      // Don't retry if we've exhausted retries
+      if (attempt >= maxRetries) {
+        throw error;
+      }
+
+      // Calculate delay with exponential backoff and jitter
+      const baseDelay = Math.pow(2, attempt) * 1000; // 1s, 2s, 4s
+      const jitter = Math.random() * 500;
+      let delay = baseDelay + jitter;
+
+      // Respect Retry-After header for 429
+      if (httpError.retryAfter && httpError.retryAfter > 0) {
+        delay = Math.max(delay, httpError.retryAfter * 1000);
+      }
+
+      await new Promise(resolve => setTimeout(resolve, delay));
+    }
+  }
+
+  throw lastError;
+}
+
+// ── HTTP Streaming Request ──
+
+interface HttpStreamOptions {
+  method?: string;
+  headers?: Record<string, string>;
+  body?: string;
+  signal?: AbortSignal;
+  timeout?: number;
+}
+
+/**
+ * Make an HTTP request that returns an async iterable of SSE events.
+ *
+ * Uses Node.js http/https modules directly, reading the response
+ * as a readable stream and parsing SSE events incrementally.
+ *
+ * On non-2xx status: collects the error body and throws.
+ * Supports AbortSignal for cancellation.
+ */
+export function httpRequestStream(
+  urlStr: string,
+  options: HttpStreamOptions,
+): Promise<{
+  statusCode: number;
+  events: AsyncIterable<SSEEvent>;
+}> {
+  return new Promise((resolve, reject) => {
+    const url = new URL(urlStr);
+    const protocol = url.protocol === 'https:' ? https : http;
+    const timeout = options.timeout ?? 120000;
+
+    const req = protocol.request(url, {
+      method: options.method || 'POST',
+      headers: options.headers || {},
+      timeout,
+    }, (res) => {
+      const statusCode = res.statusCode || 0;
+
+      // Non-2xx: collect error body and throw
+      if (statusCode < 200 || statusCode >= 300) {
+        let errorBody = '';
+        res.on('data', (chunk: Buffer) => { errorBody += chunk; });
+        res.on('end', () => {
+          const error: HttpStreamError = new Error(`API error: ${statusCode}`) as HttpStreamError;
+          error.statusCode = statusCode;
+
+          // Parse Retry-After for 429
+          if (statusCode === 429) {
+            const retryAfter = res.headers['retry-after'];
+            if (retryAfter) {
+              const seconds = parseInt(retryAfter, 10);
+              if (!isNaN(seconds)) {
+                error.retryAfter = seconds;
+              }
+            }
+          }
+
+          // Try to extract a better error message
+          try {
+            const parsed = JSON.parse(errorBody);
+            error.message = parsed.error?.message || parsed.message || error.message;
+          } catch {
+            if (errorBody.length > 0) {
+              error.message = `${error.message}: ${errorBody.slice(0, 200)}`;
+            }
+          }
+          reject(error);
+        });
+        return;
+      }
+
+      // 2xx: create async iterable of SSE events
+      const events: AsyncIterable<SSEEvent> = {
+        [Symbol.asyncIterator]() {
+          let buffer = '';
+          let done = false;
+          const eventQueue: SSEEvent[] = [];
+          let resolveNext: ((value: IteratorResult<SSEEvent>) => void) | null = null;
+          let rejectNext: ((error: Error) => void) | null = null;
+
+          res.on('data', (chunk: Buffer) => {
+            buffer += chunk.toString('utf-8');
+            const { events: parsed, remaining } = parseSSELines(buffer);
+            buffer = remaining;
+
+            for (const event of parsed) {
+              if (resolveNext) {
+                const resolve = resolveNext;
+                resolveNext = null;
+                rejectNext = null;
+                resolve({ value: event, done: false });
+              } else {
+                eventQueue.push(event);
+              }
+            }
+          });
+
+          res.on('end', () => {
+            done = true;
+            if (resolveNext) {
+              const resolve = resolveNext;
+              resolveNext = null;
+              rejectNext = null;
+              resolve({ value: undefined as unknown as SSEEvent, done: true });
+            }
+          });
+
+          res.on('error', (err: Error) => {
+            done = true;
+            if (rejectNext) {
+              const reject = rejectNext;
+              resolveNext = null;
+              rejectNext = null;
+              reject(err);
+            }
+          });
+
+          return {
+            next(): Promise<IteratorResult<SSEEvent>> {
+              // Return queued event immediately
+              if (eventQueue.length > 0) {
+                return Promise.resolve({ value: eventQueue.shift()!, done: false });
+              }
+
+              // Stream already ended
+              if (done) {
+                return Promise.resolve({ value: undefined as unknown as SSEEvent, done: true });
+              }
+
+              // Wait for next event
+              return new Promise<IteratorResult<SSEEvent>>((resolve, reject) => {
+                resolveNext = resolve;
+                rejectNext = reject;
+              });
+            },
+          };
+        },
+      };
+
+      resolve({ statusCode, events });
+    });
+
+    req.on('error', (err: Error) => {
+      const error: HttpStreamError = err as HttpStreamError;
+      if (options.signal?.aborted) {
+        error.isAbort = true;
+      }
+      reject(error);
+    });
+
+    req.on('timeout', () => {
+      req.destroy();
+      reject(new Error('Request timed out'));
+    });
+
+    if (options.signal) {
+      if (options.signal.aborted) {
+        req.destroy();
+        const error: HttpStreamError = new Error('Request cancelled') as HttpStreamError;
+        error.isAbort = true;
+        reject(error);
+        return;
+      }
+      options.signal.addEventListener('abort', () => {
+        req.destroy();
+      });
+    }
+
+    if (options.body) {
+      req.write(options.body);
+    }
+    req.end();
+  });
+}