fix: implemented TD-06 real SSE implementation

2026-06-11 16:37:08 +02:00
parent a5391e8e25
commit 9325de2db4
9 changed files with 991 additions and 15 deletions
--- a/test/bds/ai/chat_streaming_test.exs
+++ b/test/bds/ai/chat_streaming_test.exs
@@ -0,0 +1,164 @@
+defmodule BDS.AI.ChatStreamingTest do
+  use ExUnit.Case, async: false
+
+  defmodule StreamingChatPlug do
+    # Minimal chat-completions endpoint for the tests below. Requests carrying
+    # `"stream": true` are answered as a chunked `text/event-stream`; every
+    # other request receives a fixed JSON completion.
+    import Plug.Conn
+
+    def init(opts), do: opts
+
+    def call(conn, _opts) do
+      {:ok, raw_body, conn} = read_body(conn)
+      request = Jason.decode!(raw_body)
+
+      case request["stream"] do
+        true -> stream_chat(conn)
+        # Chat-title generation and other one-shot requests stay non-streaming.
+        _other -> send_one_shot(conn)
+      end
+    end
+
+    # Replies with a fixed, complete JSON chat completion.
+    defp send_one_shot(conn) do
+      reply =
+        Jason.encode!(%{
+          "choices" => [%{"message" => %{"content" => "Story Time"}}],
+          "usage" => %{"prompt_tokens" => 1, "completion_tokens" => 1}
+        })
+
+      conn
+      |> put_resp_content_type("application/json")
+      |> send_resp(200, reply)
+    end
+
+    # Opens the chunked SSE response, then plays the scenario selected through
+    # the `:chat_stream_scenario` application env (`:short` when unset).
+    defp stream_chat(conn) do
+      chunked_conn =
+        conn
+        |> put_resp_content_type("text/event-stream")
+        |> send_chunked(200)
+
+      case Application.get_env(:bds, :chat_stream_scenario, :short) do
+        :short -> stream_short(chunked_conn)
+        :endless -> stream_endless(chunked_conn)
+      end
+    end
+
+    # Sends three content deltas, a usage-only chunk, and the `[DONE]` sentinel.
+    defp stream_short(conn) do
+      usage_chunk = %{
+        "choices" => [],
+        "usage" => %{"prompt_tokens" => 9, "completion_tokens" => 4}
+      }
+
+      send_events(conn, [
+        delta_event(%{"content" => "Once"}),
+        delta_event(%{"content" => " upon"}),
+        delta_event(%{"content" => " a time"}),
+        data_event(usage_chunk),
+        "data: [DONE]\n\n"
+      ])
+    end
+
+    # Writes the events in order and stops at the first failed chunk write.
+    defp send_events(conn, []), do: conn
+
+    defp send_events(conn, [event | remaining]) do
+      case chunk(conn, event) do
+        {:ok, conn} -> send_events(conn, remaining)
+        {:error, _reason} -> conn
+      end
+    end
+
+    # Emits a "tick " delta every 50 ms until writing to the client fails.
+    defp stream_endless(conn) do
+      conn
+      |> chunk(delta_event(%{"content" => "tick "}))
+      |> continue_endless(conn)
+    end
+
+    defp continue_endless({:ok, conn}, _previous_conn) do
+      Process.sleep(50)
+      stream_endless(conn)
+    end
+
+    # A failed write is reported to the test process as a client disconnect.
+    defp continue_endless({:error, _reason}, previous_conn) do
+      send(test_pid(), :sse_client_disconnected)
+      previous_conn
+    end
+
+    defp delta_event(delta), do: data_event(%{"choices" => [%{"delta" => delta}]})
+
+    # Frames a JSON payload as one SSE `data:` event.
+    defp data_event(payload), do: "data: " <> Jason.encode!(payload) <> "\n\n"
+
+    defp test_pid, do: Application.get_env(:bds, :chat_stream_test_pid)
+  end
+
+  # Prepares one streaming-chat scenario per test: exposes the test pid and the
+  # scenario to the plug via the application env, serves the plug on an
+  # ephemeral port, registers it as the `:online` endpoint, and starts a
+  # conversation that is returned in the test context.
+  setup do
+    :ok = Ecto.Adapters.SQL.Sandbox.checkout(BDS.Repo)
+
+    # The plug reads both values from the application env while serving.
+    Application.put_env(:bds, :chat_stream_test_pid, self())
+    Application.put_env(:bds, :chat_stream_scenario, :short)
+
+    original_chat = Application.fetch_env(:bds, :chat)
+
+    # NOTE(review): an interval of 0 presumably disables emission throttling so
+    # every delta reaches the test — confirm against the runtime.
+    Application.put_env(
+      :bds,
+      :chat,
+      Keyword.merge(Application.get_env(:bds, :chat, []), stream_emit_interval_ms: 0)
+    )
+
+    on_exit(fn ->
+      # Remove every key this setup wrote so neither the scenario nor the pid
+      # of a finished test lingers in the global application env.
+      Application.delete_env(:bds, :chat_stream_scenario)
+      Application.delete_env(:bds, :chat_stream_test_pid)
+
+      # Restore `:chat` exactly: put the old value back, or drop the key if it
+      # was not set before.
+      case original_chat do
+        {:ok, value} -> Application.put_env(:bds, :chat, value)
+        :error -> Application.delete_env(:bds, :chat)
+      end
+    end)
+
+    # Port 0 lets the OS pick a free port; read the chosen one back.
+    server = start_supervised!({Bandit, plug: StreamingChatPlug, port: 0, startup_log: false})
+    {:ok, {_address, port}} = ThousandIsland.listener_info(server)
+
+    assert {:ok, _endpoint} =
+             BDS.AI.put_endpoint(:online, %{
+               url: "http://127.0.0.1:#{port}/v1",
+               api_key: "sk-stream",
+               model: "stream-model"
+             })
+
+    assert :ok = BDS.AI.set_airplane_mode(false)
+    assert {:ok, conversation} = BDS.AI.start_chat(%{model: "stream-model"})
+
+    {:ok, conversation: conversation}
+  end
+
+  test "incremental content events arrive before the final reply and persistence matches", %{
+    conversation: %{id: conversation_id}
+  } do
+    result =
+      BDS.AI.send_chat_message(conversation_id, "tell me a story", event_target: self())
+
+    assert {:ok, %{assistant_message: %{content: "Once upon a time"}}} = result
+
+    # assert_received does not wait, so each cumulative snapshot must already
+    # be in the mailbox by the time the send call has returned.
+    for snapshot <- ["Once", "Once upon", "Once upon a time"] do
+      assert_received {:chat_streaming_content, ^conversation_id, ^snapshot}
+    end
+
+    # The persisted assistant message mirrors the streamed content and usage.
+    stored = conversation_id |> BDS.AI.list_chat_messages() |> List.last()
+    assert stored.role == :assistant
+    assert stored.content == "Once upon a time"
+    assert stored.token_usage_input == 9
+    assert stored.token_usage_output == 4
+  end
+
+  test "cancel_chat mid-stream aborts the HTTP request", %{conversation: %{id: conversation_id}} do
+    Application.put_env(:bds, :chat_stream_scenario, :endless)
+    caller = self()
+
+    # Run the blocking send in a task so this process stays free to cancel it.
+    send_task =
+      Task.async(BDS.AI, :send_chat_message, [
+        conversation_id,
+        "stream forever",
+        [event_target: caller]
+      ])
+
+    # Cancel only once tokens are actually flowing.
+    assert_receive {:chat_streaming_content, ^conversation_id, _content}, 2_000
+
+    assert :ok = BDS.AI.cancel_chat(conversation_id)
+    assert {:error, :cancelled} = Task.await(send_task)
+
+    # The plug reports a failed chunk write, i.e. the connection really closed.
+    assert_receive :sse_client_disconnected, 2_000
+  end
+end
--- a/test/bds/ai/openai_compatible_runtime_streaming_test.exs
+++ b/test/bds/ai/openai_compatible_runtime_streaming_test.exs
@@ -0,0 +1,253 @@
+defmodule BDS.AI.OpenAICompatibleRuntimeStreamingTest do
+  use ExUnit.Case, async: false
+
+  alias BDS.AI.OpenAICompatibleRuntime
+
+  defmodule SSEPlug do
+    # Fake chat-completions endpoint. Every decoded request body is forwarded
+    # to the test process; the reply is chosen from the request's "model" and
+    # whether the request asked for streaming.
+    import Plug.Conn
+
+    def init(opts), do: opts
+
+    def call(conn, _opts) do
+      {:ok, raw_body, conn} = read_body(conn)
+      request = Jason.decode!(raw_body)
+      send(test_pid(), {:endpoint_request, request})
+
+      respond(conn, request["model"], request)
+    end
+
+    defp respond(conn, "stream-content", %{"stream" => true}) do
+      stream(conn, [
+        delta_event(%{"role" => "assistant", "content" => ""}),
+        delta_event(%{"content" => "Once"}),
+        delta_event(%{"content" => " upon"}),
+        delta_event(%{"content" => " a time"}),
+        data_event(%{
+          "choices" => [],
+          "usage" => %{"prompt_tokens" => 7, "completion_tokens" => 3}
+        }),
+        "data: [DONE]\n\n"
+      ])
+    end
+
+    # One tool call whose JSON arguments arrive split over several fragments.
+    defp respond(conn, "stream-tools", %{"stream" => true}) do
+      stream(conn, [
+        tool_call_event(%{
+          "index" => 0,
+          "id" => "call-1",
+          "function" => %{"name" => "search_posts", "arguments" => ""}
+        }),
+        tool_call_event(%{"index" => 0, "function" => %{"arguments" => "{\"query\":"}}),
+        tool_call_event(%{"index" => 0, "function" => %{"arguments" => "\"sun\"}"}}),
+        "data: [DONE]\n\n"
+      ])
+    end
+
+    defp respond(conn, "stream-error", %{"stream" => true}) do
+      send_resp(conn, 503, ~s({"error":"overloaded"}))
+    end
+
+    # Simulates a provider that ignores the "stream" flag and answers with a
+    # plain JSON completion.
+    defp respond(conn, "ignores-stream", %{"stream" => true}) do
+      json_completion(conn, "plain json despite stream", 5, 2)
+    end
+
+    # Fallback for every other request, streaming flag or not.
+    defp respond(conn, _model, _payload) do
+      json_completion(conn, "non-streaming reply", 1, 1)
+    end
+
+    # Sends a complete (non-streamed) JSON chat completion with usage counts.
+    defp json_completion(conn, content, prompt_tokens, completion_tokens) do
+      body =
+        Jason.encode!(%{
+          "choices" => [%{"message" => %{"content" => content}}],
+          "usage" => %{
+            "prompt_tokens" => prompt_tokens,
+            "completion_tokens" => completion_tokens
+          }
+        })
+
+      conn
+      |> put_resp_content_type("application/json")
+      |> send_resp(200, body)
+    end
+
+    defp tool_call_event(fragment), do: delta_event(%{"tool_calls" => [fragment]})
+
+    defp delta_event(delta), do: data_event(%{"choices" => [%{"delta" => delta}]})
+
+    # Frames a JSON payload as one SSE `data:` event.
+    defp data_event(payload), do: "data: " <> Jason.encode!(payload) <> "\n\n"
+
+    # Opens a chunked SSE response and writes the given events in order.
+    defp stream(conn, events) do
+      conn
+      |> put_resp_content_type("text/event-stream")
+      |> send_chunked(200)
+      |> send_events(events)
+    end
+
+    # Stops at the first failed chunk write.
+    defp send_events(conn, []), do: conn
+
+    defp send_events(conn, [event | remaining]) do
+      case chunk(conn, event) do
+        {:ok, conn} -> send_events(conn, remaining)
+        {:error, _reason} -> conn
+      end
+    end
+
+    defp test_pid, do: Application.get_env(:bds, :sse_plug_test_pid)
+  end
+
+  # Serves `SSEPlug` on an ephemeral port for each test and returns its base
+  # URL in the test context. The plug finds the test process through the
+  # `:sse_plug_test_pid` application env.
+  setup do
+    :ok = Ecto.Adapters.SQL.Sandbox.checkout(BDS.Repo)
+    Application.put_env(:bds, :sse_plug_test_pid, self())
+
+    original_chat = Application.fetch_env(:bds, :chat)
+
+    # NOTE(review): an interval of 0 presumably disables emission throttling so
+    # every delta reaches the test — confirm against the runtime.
+    Application.put_env(
+      :bds,
+      :chat,
+      Keyword.merge(Application.get_env(:bds, :chat, []), stream_emit_interval_ms: 0)
+    )
+
+    on_exit(fn ->
+      # Drop the pid of the finished test so it does not linger in the global
+      # application env.
+      Application.delete_env(:bds, :sse_plug_test_pid)
+
+      # Restore `:chat` exactly (tests may also have changed it): put the old
+      # value back, or drop the key if it was not set before.
+      case original_chat do
+        {:ok, value} -> Application.put_env(:bds, :chat, value)
+        :error -> Application.delete_env(:bds, :chat)
+      end
+    end)
+
+    # Port 0 lets the OS pick a free port; read the chosen one back.
+    server = start_supervised!({Bandit, plug: SSEPlug, port: 0, startup_log: false})
+    {:ok, {_address, port}} = ThousandIsland.listener_info(server)
+
+    {:ok, url: "http://127.0.0.1:#{port}/v1"}
+  end
+
+  # Builds a minimal `:chat` request for `model` containing one user message.
+  defp chat_request(model) do
+    user_message = %{"role" => "user", "content" => "hello"}
+
+    %{operation: :chat, model: model, max_output_tokens: 64, messages: [user_message]}
+  end
+
+  # Returns a callback that relays each stream event to the calling process as
+  # `{:stream_event, event}`.
+  defp stream_collector do
+    owner = self()
+    &send(owner, {:stream_event, &1})
+  end
+
+  test "generate streams cumulative content and returns the assembled response", %{url: url} do
+    endpoint = %{url: url, api_key: "sk-test"}
+    request = chat_request("stream-content")
+
+    assert {:ok, response} =
+             OpenAICompatibleRuntime.generate(endpoint, request, on_stream: stream_collector())
+
+    assert %{content: "Once upon a time", tool_calls: []} = response
+    assert response.usage.input_tokens == 7
+    assert response.usage.output_tokens == 3
+
+    # The outgoing request must have asked for streaming with usage reporting.
+    assert_received {:endpoint_request, payload}
+    assert payload["stream"] == true
+    assert payload["stream_options"] == %{"include_usage" => true}
+
+    # Each callback event carries the content accumulated so far.
+    for snapshot <- ["Once", "Once upon", "Once upon a time"] do
+      assert_received {:stream_event, %{content: ^snapshot}}
+    end
+  end
+
+  test "generate assembles tool calls streamed as fragments", %{url: url} do
+    endpoint = %{url: url, api_key: "sk-test"}
+    expected_call = %{id: "call-1", name: "search_posts", arguments: %{"query" => "sun"}}
+
+    assert {:ok, response} =
+             OpenAICompatibleRuntime.generate(
+               endpoint,
+               chat_request("stream-tools"),
+               on_stream: stream_collector()
+             )
+
+    # A tool-call-only stream yields no content; the argument fragments are
+    # joined and decoded into a map.
+    assert is_nil(response.content)
+    assert response.tool_calls == [expected_call]
+  end
+
+  test "an error status during streaming surfaces as a structured error", %{url: url} do
+    endpoint = %{url: url, api_key: "sk-test"}
+    request = chat_request("stream-error")
+
+    # The plug answers this model with a 503 instead of opening a stream.
+    result = OpenAICompatibleRuntime.generate(endpoint, request, on_stream: stream_collector())
+
+    assert {:error, %{kind: :http_error, status: 503}} = result
+  end
+
+  test "a provider that ignores the stream flag still produces a full response", %{url: url} do
+    endpoint = %{url: url, api_key: "sk-test"}
+    request = chat_request("ignores-stream")
+
+    # The plug replies with plain JSON even though streaming was requested.
+    result = OpenAICompatibleRuntime.generate(endpoint, request, on_stream: stream_collector())
+
+    assert {:ok, response} = result
+    assert response.content == "plain json despite stream"
+    assert response.usage.input_tokens == 5
+    assert response.usage.output_tokens == 2
+  end
+
+  test "streaming is skipped when disabled via config", %{url: url} do
+    # Switch streaming off for this test only; setup's on_exit restores `:chat`.
+    chat_config = Application.get_env(:bds, :chat, [])
+    Application.put_env(:bds, :chat, Keyword.merge(chat_config, streaming: false))
+
+    endpoint = %{url: url, api_key: "sk-test"}
+
+    assert {:ok, %{content: "non-streaming reply"}} =
+             OpenAICompatibleRuntime.generate(
+               endpoint,
+               chat_request("any-model"),
+               on_stream: stream_collector()
+             )
+
+    # No "stream" key in the request and no callback invocation.
+    assert_received {:endpoint_request, payload}
+    refute is_map_key(payload, "stream")
+    refute_received {:stream_event, _event}
+  end
+
+  test "streaming requires an on_stream callback", %{url: url} do
+    endpoint = %{url: url, api_key: "sk-test"}
+
+    # No :on_stream option is passed, so the request must not ask to stream.
+    result = OpenAICompatibleRuntime.generate(endpoint, chat_request("any-model"), [])
+
+    assert {:ok, %{content: "non-streaming reply"}} = result
+
+    assert_received {:endpoint_request, payload}
+    refute is_map_key(payload, "stream")
+  end
+
+  test "non-chat operations never stream", %{url: url} do
+    endpoint = %{url: url, api_key: "sk-test"}
+
+    # Same shape as a chat request, but with the :chat_title operation.
+    title_request = %{
+      operation: :chat_title,
+      model: "any-model",
+      max_output_tokens: 32,
+      messages: [%{"role" => "user", "content" => "Topic: hello"}]
+    }
+
+    result =
+      OpenAICompatibleRuntime.generate(endpoint, title_request, on_stream: stream_collector())
+
+    assert {:ok, %{content: "non-streaming reply"}} = result
+
+    assert_received {:endpoint_request, payload}
+    refute is_map_key(payload, "stream")
+  end
+end
--- a/test/bds/ai/sse_test.exs
+++ b/test/bds/ai/sse_test.exs
@@ -0,0 +1,200 @@
+defmodule BDS.AI.SSETest do
+  use ExUnit.Case, async: true
+
+  alias BDS.AI.SSE
+
+  # Frames a JSON-encoded payload as a single SSE `data:` event.
+  defp chunk_event(payload) do
+    "data: #{Jason.encode!(payload)}\n\n"
+  end
+
+  # Builds a streamed completion chunk whose only choice carries `text` as its
+  # content delta.
+  defp content_delta(text) do
+    delta = %{"content" => text}
+    %{"choices" => [%{"delta" => delta}]}
+  end
+
+  test "assembles content from deltas across separate chunks" do
+    delta_events = Enum.map(["Hel", "lo ", "world"], &chunk_event(content_delta(&1)))
+    events = delta_events ++ ["data: [DONE]\n\n"]
+
+    # Feed one complete event per call.
+    sse = Enum.reduce(events, SSE.new(nil), fn event, acc -> SSE.feed(acc, event) end)
+
+    assert %{content: "Hello world", tool_calls: [], usage: nil} = SSE.finish(sse)
+  end
+
+  test "handles events split across arbitrary chunk boundaries" do
+    raw =
+      Enum.join([
+        chunk_event(content_delta("alpha ")),
+        chunk_event(content_delta("beta")),
+        "data: [DONE]\n\n"
+      ])
+
+    # Cut the byte stream into 7-byte slices so events straddle feed calls.
+    slices =
+      raw
+      |> :binary.bin_to_list()
+      |> Enum.chunk_every(7)
+      |> Enum.map(&:binary.list_to_bin/1)
+
+    sse = Enum.reduce(slices, SSE.new(nil), fn slice, acc -> SSE.feed(acc, slice) end)
+
+    assert %{content: "alpha beta"} = SSE.finish(sse)
+  end
+
+  test "supports CRLF line endings and data lines without a space" do
+    # "data:" is followed directly by the JSON, and events end with CRLF CRLF.
+    content_event = "data:" <> Jason.encode!(content_delta("crlf")) <> "\r\n\r\n"
+    done_event = "data: [DONE]\r\n\r\n"
+
+    sse = SSE.new(nil) |> SSE.feed(content_event <> done_event)
+
+    assert %{content: "crlf"} = SSE.finish(sse)
+  end
+
+  test "ignores comments, other fields, and undecodable data" do
+    comment = ": keep-alive\n\n"
+    # `event:` and `id:` lines precede the only decodable data line.
+    event_with_extra_fields = "event: message\nid: 7\n" <> chunk_event(content_delta("ok"))
+    not_json = "data: not-json\n\n"
+
+    sse =
+      Enum.reduce(
+        [comment, event_with_extra_fields, not_json],
+        SSE.new(nil),
+        fn chunk, acc -> SSE.feed(acc, chunk) end
+      )
+
+    assert %{content: "ok"} = SSE.finish(sse)
+  end
+
+  test "stops processing after [DONE]" do
+    before_done = chunk_event(content_delta("kept"))
+    after_done = chunk_event(content_delta(" dropped"))
+
+    # Content fed after the sentinel must not reach the assembled result.
+    sse =
+      Enum.reduce(
+        [before_done, "data: [DONE]\n\n", after_done],
+        SSE.new(nil),
+        fn chunk, acc -> SSE.feed(acc, chunk) end
+      )
+
+    assert %{content: "kept"} = SSE.finish(sse)
+  end
+
+  test "finishes a trailing event that lacks the final blank line" do
+    # No terminating "\n\n": the event is only complete once finish/1 runs.
+    unterminated = "data: " <> Jason.encode!(content_delta("tail"))
+
+    result = nil |> SSE.new() |> SSE.feed(unterminated) |> SSE.finish()
+
+    assert %{content: "tail"} = result
+  end
+
+  test "content is nil when the stream carried no content" do
+    # Only the sentinel is fed, so there is nothing to assemble.
+    result = nil |> SSE.new() |> SSE.feed("data: [DONE]\n\n") |> SSE.finish()
+
+    assert %{content: nil} = result
+  end
+
+  test "assembles tool calls from fragments in OpenAI wire shape" do
+    # Wraps a list of tool-call fragments in one streamed completion chunk.
+    tool_delta = fn calls -> %{"choices" => [%{"delta" => %{"tool_calls" => calls}}]} end
+
+    # Call 0 opens with its id and name but empty arguments.
+    opening =
+      tool_delta.([
+        %{
+          "index" => 0,
+          "id" => "call-1",
+          "function" => %{"name" => "search_posts", "arguments" => ""}
+        }
+      ])
+
+    # Call 0 continues while call 1 arrives complete in the same chunk.
+    interleaved =
+      tool_delta.([
+        %{"index" => 0, "function" => %{"arguments" => "{\"query\":"}},
+        %{
+          "index" => 1,
+          "id" => "call-2",
+          "function" => %{"name" => "count_posts", "arguments" => "{}"}
+        }
+      ])
+
+    closing = tool_delta.([%{"index" => 0, "function" => %{"arguments" => "\"sun\"}"}}])
+
+    sse =
+      [opening, interleaved, closing]
+      |> Enum.map(&chunk_event/1)
+      |> Enum.reduce(SSE.new(nil), fn event, acc -> SSE.feed(acc, event) end)
+
+    assert %{tool_calls: tool_calls} = SSE.finish(sse)
+
+    # Fragments are merged per index; "index" itself is absent from the result.
+    assert tool_calls == [
+             %{
+               "id" => "call-1",
+               "function" => %{"name" => "search_posts", "arguments" => ~s({"query":"sun"})}
+             },
+             %{"id" => "call-2", "function" => %{"name" => "count_posts", "arguments" => "{}"}}
+           ]
+  end
+
+  test "captures usage from the final chunk" do
+    usage = %{"prompt_tokens" => 7, "completion_tokens" => 2}
+    content_event = chunk_event(content_delta("hi"))
+    # The usage chunk has an empty "choices" list.
+    usage_event = chunk_event(%{"choices" => [], "usage" => usage})
+
+    sse = SSE.new(nil) |> SSE.feed(content_event) |> SSE.feed(usage_event)
+
+    assert %{usage: %{"prompt_tokens" => 7, "completion_tokens" => 2}} = SSE.finish(sse)
+  end
+
+  test "emits cumulative content snapshots to the callback" do
+    owner = self()
+    notify = fn event -> send(owner, {:stream_event, event}) end
+
+    notify
+    |> SSE.new(emit_interval_ms: 0)
+    |> SSE.feed(chunk_event(content_delta("one")))
+    |> SSE.feed(chunk_event(content_delta(" two")))
+
+    # Each snapshot holds all content so far, not just the latest delta.
+    for snapshot <- ["one", "one two"] do
+      assert_received {:stream_event, %{content: ^snapshot}}
+    end
+  end
+
+  test "throttles intermediate emissions but always emits the first delta" do
+    owner = self()
+    notify = fn event -> send(owner, {:stream_event, event}) end
+
+    # With a 60-second interval, only the first delta may produce an event
+    # during this test.
+    throttled = SSE.new(notify, emit_interval_ms: 60_000)
+
+    Enum.reduce(["first", " second", " third"], throttled, fn text, acc ->
+      SSE.feed(acc, chunk_event(content_delta(text)))
+    end)
+
+    assert_received {:stream_event, %{content: "first"}}
+    refute_received {:stream_event, _event}
+  end
+
+  test "tool-call-only streams emit no content events" do
+    owner = self()
+    notify = fn event -> send(owner, {:stream_event, event}) end
+
+    tool_call = %{"index" => 0, "id" => "c", "function" => %{"name" => "n", "arguments" => "{}"}}
+    tool_only_chunk = %{"choices" => [%{"delta" => %{"tool_calls" => [tool_call]}}]}
+
+    notify
+    |> SSE.new(emit_interval_ms: 0)
+    |> SSE.feed(chunk_event(tool_only_chunk))
+
+    # Throttling is off (interval 0), so silence here is not due to throttling.
+    refute_received {:stream_event, _event}
+  end
+end